docs: add section-by-section comments — correlator.py

This commit is contained in:
2026-05-18 21:00:51 +00:00
parent 62aa1804dc
commit 9366027534
+166 -49
View File
@@ -1,19 +1,33 @@
#!/usr/bin/env python3
"""
Read NS8 cluster Redis state to determine the backup outcome.

For each backup plan/schedule, this module reads the per-module backup
status hashes from the cluster Redis and produces a classified outcome:

    SUCCESS       All modules finished without errors.
    PARTIAL       Some modules failed, others succeeded.
    REPO_FAILURE  All modules failed, total is zero, or no status was
                  found in Redis at all (possible repository-level issue).

NS8 Redis key patterns
----------------------
    cluster/backup/<backup_id>/status
        Overall plan status hash.  Fields: result, timestamp, errors.
    module/<module_id>/backup/<backup_id>/status
        Per-module status hash.  Fields: result, timestamp, error.

Redis hash fields
-----------------
    result    : "success" | "error"
    timestamp : ISO 8601 string
    error     : human-readable error message (empty on success)
    errors    : integer count of module errors (plan-level hash only)

Dependencies
------------
Only the standard library and ``redis-cli`` (installed with NS8) are required.
No Python Redis client library is needed.
"""
import logging
@@ -24,25 +38,73 @@ from typing import Optional
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Redis helpers
# ---------------------------------------------------------------------------
# These thin wrappers call redis-cli via subprocess instead of using a Python
# Redis client, keeping the dependency list to zero and staying consistent with
# how other NS8 scripts interact with the cluster Redis.
def _redis_cmd(config: dict, *args) -> str:
"""Run a redis-cli command against the NS8 cluster Redis socket."""
socket = config.get("redis", {}).get("socket", "/var/lib/nethserver/cluster/state/redis.sock")
"""Run a redis-cli command against the NS8 cluster Redis Unix socket.
Args:
config: Parsed configuration dictionary (reads ``redis.socket``).
*args: Redis command and arguments (e.g. "KEYS", "cluster/backup/*").
Returns:
Raw stdout string, stripped of leading/trailing whitespace.
"""
socket = config.get("redis", {}).get(
"socket", "/var/lib/nethserver/cluster/state/redis.sock"
)
cmd = ["redis-cli", "-s", socket] + list(args)
result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
return result.stdout.strip()
def _redis_hgetall(config: dict, key: str) -> dict:
"""Return all fields of a Redis hash as a dict in a single redis-cli call."""
socket = config.get("redis", {}).get("socket", "/var/lib/nethserver/cluster/state/redis.sock")
"""Return all fields of a Redis hash as a Python dict.
``redis-cli HGETALL`` outputs alternating field / value lines.
This function zips consecutive pairs into a dict.
Args:
config: Parsed configuration dictionary.
key: Full Redis key of the hash to read.
Returns:
Dict mapping field names to values, or an empty dict if the key
does not exist or the hash is empty.
"""
socket = config.get("redis", {}).get(
"socket", "/var/lib/nethserver/cluster/state/redis.sock"
)
cmd = ["redis-cli", "-s", socket, "HGETALL", key]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
lines = [l for l in result.stdout.strip().splitlines() if l]
# redis-cli HGETALL returns alternating key / value lines:
# line 0 → field name, line 1 → value, line 2 → field name, …
return dict(zip(lines[::2], lines[1::2]))
# ---------------------------------------------------------------------------
# Recent backup discovery
# ---------------------------------------------------------------------------
def _get_recent_backup_ids(config: dict, window: int) -> list:
"""Scan Redis for backup plan status keys updated within window seconds."""
"""Scan Redis for plan status keys updated within the last ``window`` seconds.
Used as a fallback when Alertmanager does not include a ``backup_id``
label on the alert (older NS8 versions or custom alert rules).
Args:
config: Parsed configuration dictionary.
window: Look-back window in seconds (from ``correlator.recent_window``).
Returns:
List of backup_id strings whose plan status was updated recently.
"""
raw = _redis_cmd(config, "KEYS", "cluster/backup/*/status")
keys = [k for k in raw.splitlines() if k]
now = datetime.now(timezone.utc).timestamp()
@@ -54,8 +116,11 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
if not ts_raw:
continue
try:
# Parse ISO 8601 timestamp; replace trailing 'Z' with '+00:00'
# for compatibility with Python < 3.11 fromisoformat().
ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp()
if (now - ts) <= window:
# Key format: cluster/backup/<backup_id>/status
parts = key.split("/")
if len(parts) >= 3:
recent.append(parts[2])
@@ -65,71 +130,119 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
return recent
# ---------------------------------------------------------------------------
# Per-module status collection
# ---------------------------------------------------------------------------
def _get_module_statuses(config: dict, backup_id: str) -> list:
    """Return all per-module status entries for a given backup_id.

    Scans Redis for keys matching ``module/*/backup/<backup_id>/status``
    and reads each hash with HGETALL.

    Args:
        config:    Parsed configuration dictionary.
        backup_id: The backup plan identifier (e.g. "1", "2").

    Returns:
        List of dicts, one per module whose status hash is non-empty:
            {
                "module_id" : str,
                "backup_id" : str,
                "result"    : value of the ``result`` field, or "unknown"
                              when the field is absent,
                "error"     : str ("" when the field is absent),
                "timestamp" : str ("" when the field is absent),
            }
    """
    pattern = f"module/*/backup/{backup_id}/status"
    raw = _redis_cmd(config, "KEYS", pattern)

    statuses = []
    for key in raw.splitlines():
        if not key:
            continue

        # Key format: module/<module_id>/backup/<backup_id>/status
        # Anything else on stdout (for instance an error message printed by
        # redis-cli) is not a key and must not be indexed blindly.
        parts = key.split("/")
        if len(parts) < 5 or parts[0] != "module":
            log.debug("Ignoring unexpected KEYS output line: %r", key)
            continue
        module_id = parts[1]

        fields = _redis_hgetall(config, key)
        if not fields:
            log.debug("Empty or missing status hash for %s", key)
            continue

        statuses.append({
            "module_id": module_id,
            "backup_id": backup_id,
            "result": fields.get("result", "unknown"),
            "error": fields.get("error", ""),
            "timestamp": fields.get("timestamp", ""),
        })
    return statuses
# ---------------------------------------------------------------------------
# Main correlator entry point
# ---------------------------------------------------------------------------
def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> dict:
    """Classify the overall backup outcome by reading per-module Redis state.

    Args:
        config:     Parsed configuration dictionary (reads
                    ``correlator.recent_window``, default 3600 seconds).
        backup_ids: List of backup plan IDs from Alertmanager alert labels.
                    When empty or None, the function falls back to scanning
                    Redis for recently updated plan status keys.  Duplicate
                    IDs are evaluated once.

    Returns:
        A dict with the following keys:

        outcome        : "SUCCESS" | "PARTIAL" | "REPO_FAILURE"
        backup_ids     : list of plan IDs that were evaluated
        modules        : list of per-module status dicts, each with the keys
                         module_id, backup_id, result, error, timestamp
        failed_modules : subset of ``modules`` where result != "success"
        total          : total number of module status entries found
        failed         : number of failed modules
        succeeded      : number of succeeded modules
        note           : human-readable explanation; present only when no
                         status data was found

    Outcome classification rules
    ----------------------------
    failed == 0 and total > 0      -> SUCCESS
    failed == total or total == 0  -> REPO_FAILURE  (all failed or nothing found)
    otherwise                      -> PARTIAL
    """
    window = config.get("correlator", {}).get("recent_window", 3600)

    # -----------------------------------------------------------------------
    # Resolve backup_ids
    # -----------------------------------------------------------------------
    if not backup_ids:
        log.info("No backup_ids from alert labels, scanning Redis for recent backups...")
        backup_ids = _get_recent_backup_ids(config, window)

    # If still empty, no relevant Redis state exists — treat as full failure.
    if not backup_ids:
        log.warning("No recent backup status keys found in Redis")
        return {
            "outcome": "REPO_FAILURE",
            "backup_ids": [],
            "modules": [],
            "failed_modules": [],
            "total": 0,
            "failed": 0,
            "succeeded": 0,
            "note": "No backup status found in Redis - possible repo or scheduling failure",
        }

    # Several alerts may carry the same plan ID; evaluate each plan once so
    # its modules are not counted multiple times.  Order is preserved.
    backup_ids = list(dict.fromkeys(backup_ids))

    # -----------------------------------------------------------------------
    # Collect per-module statuses across all plans
    # -----------------------------------------------------------------------
    all_modules = []
    for bid in backup_ids:
        modules = _get_module_statuses(config, bid)
        log.info("backup_id=%s: found %d module status entries", bid, len(modules))
        all_modules.extend(modules)

    # -----------------------------------------------------------------------
    # Outcome classification
    # -----------------------------------------------------------------------
    total = len(all_modules)
    failed_modules = [m for m in all_modules if m["result"] != "success"]
    failed = len(failed_modules)
    succeeded = total - failed

    if failed == 0 and total > 0:
        outcome = "SUCCESS"
    elif failed == total or total == 0:
        # All modules failed, or no modules were found at all.
        outcome = "REPO_FAILURE"
    else:
        outcome = "PARTIAL"

    log.info(
        "Correlation result: outcome=%s, total=%d, succeeded=%d, failed=%d",
        outcome, total, succeeded, failed,
    )

    result = {
        "outcome": outcome,
        "backup_ids": backup_ids,
        "modules": all_modules,
        "failed_modules": failed_modules,
        "total": total,
        "failed": failed,
        "succeeded": succeeded,
    }
    if total == 0:
        # Explain why a REPO_FAILURE is reported without any failed module.
        result["note"] = "No module status entries found in Redis for the evaluated backup_ids"
    return result