docs: add section-by-section comments — correlator.py

2026-05-18 21:00:51 +00:00
parent 62aa1804dc
commit 9366027534
1 changed files with 166 additions and 49 deletions
@@ -1,19 +1,33 @@
 #!/usr/bin/env python3
-"""
-correlator.py - Reads NS8 cluster Redis state to determine backup outcome.
+"""Read NS8 cluster Redis state to determine the backup outcome.

-For each backup plan/schedule, reads the per-module backup status and
-produces a classified outcome: SUCCESS, PARTIAL, or REPO_FAILURE.
+For each backup plan/schedule, this module reads the per-module backup
+status hashes from the cluster Redis and produces a classified outcome:

-NS8 Redis key patterns:
-  cluster/backup/<backup_id>/status            -> last overall plan status (hash)
-  module/<module_id>/backup/<backup_id>/status -> per-module status (hash)
+    SUCCESS      – All modules finished without errors.
+    PARTIAL      – Some modules failed, others succeeded.
+    REPO_FAILURE – All modules failed, total is zero, or no status was
+                   found in Redis at all (possible repository-level issue).

-Fields in status hash:
-  result   : success | error
-  timestamp: ISO8601
-  error    : error message if any
-  errors   : number of module errors (in plan status)
+NS8 Redis key patterns
+-----------------------
+    cluster/backup/<backup_id>/status
+        Overall plan status hash. Fields: result, timestamp, errors.
+
+    module/<module_id>/backup/<backup_id>/status
+        Per-module status hash. Fields: result, timestamp, error.
+
+Redis hash fields
+-----------------
+    result    : "success" | "error"
+    timestamp : ISO 8601 string
+    error     : human-readable error message (empty on success)
+    errors    : integer count of module errors (plan-level hash only)
+
+Dependencies
+------------
+Only the standard library and ``redis-cli`` (installed with NS8) are required.
+No Python Redis client library is needed.
 """

 import logging
@@ -24,25 +38,73 @@ from typing import Optional
 log = logging.getLogger(__name__)


+# ---------------------------------------------------------------------------
+# Redis helpers
+# ---------------------------------------------------------------------------
+# These thin wrappers call redis-cli via subprocess instead of using a Python
+# Redis client, keeping the dependency list to zero and staying consistent with
+# how other NS8 scripts interact with the cluster Redis.
+
 def _redis_cmd(config: dict, *args) -> str:
-    """Run a redis-cli command against the NS8 cluster Redis socket."""
-    socket = config.get("redis", {}).get("socket", "/var/lib/nethserver/cluster/state/redis.sock")
+    """Run a redis-cli command against the NS8 cluster Redis Unix socket.
+
+    Args:
+        config: Parsed configuration dictionary (reads ``redis.socket``).
+        *args:  Redis command and arguments (e.g. "KEYS", "cluster/backup/*").
+
+    Returns:
+        Raw stdout string, stripped of leading/trailing whitespace.
+    """
+    socket = config.get("redis", {}).get(
+        "socket", "/var/lib/nethserver/cluster/state/redis.sock"
+    )
    cmd = ["redis-cli", "-s", socket] + list(args)
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    return result.stdout.strip()


 def _redis_hgetall(config: dict, key: str) -> dict:
-    """Return all fields of a Redis hash as a dict in a single redis-cli call."""
-    socket = config.get("redis", {}).get("socket", "/var/lib/nethserver/cluster/state/redis.sock")
+    """Return all fields of a Redis hash as a Python dict.
+
+    ``redis-cli HGETALL`` outputs alternating field / value lines.
+    This function zips consecutive pairs into a dict.
+
+    Args:
+        config: Parsed configuration dictionary.
+        key:    Full Redis key of the hash to read.
+
+    Returns:
+        Dict mapping field names to values, or an empty dict if the key
+        does not exist or the hash is empty.
+    """
+    socket = config.get("redis", {}).get(
+        "socket", "/var/lib/nethserver/cluster/state/redis.sock"
+    )
    cmd = ["redis-cli", "-s", socket, "HGETALL", key]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    lines = [l for l in result.stdout.strip().splitlines() if l]
+    # redis-cli HGETALL returns alternating key / value lines:
+    #   line 0 → field name, line 1 → value, line 2 → field name, …
    return dict(zip(lines[::2], lines[1::2]))


+# ---------------------------------------------------------------------------
+# Recent backup discovery
+# ---------------------------------------------------------------------------
+
 def _get_recent_backup_ids(config: dict, window: int) -> list:
-    """Scan Redis for backup plan status keys updated within window seconds."""
+    """Scan Redis for plan status keys updated within the last ``window`` seconds.
+
+    Used as a fallback when Alertmanager does not include a ``backup_id``
+    label on the alert (older NS8 versions or custom alert rules).
+
+    Args:
+        config: Parsed configuration dictionary.
+        window: Look-back window in seconds (from ``correlator.recent_window``).
+
+    Returns:
+        List of backup_id strings whose plan status was updated recently.
+    """
    raw = _redis_cmd(config, "KEYS", "cluster/backup/*/status")
    keys = [k for k in raw.splitlines() if k]
    now = datetime.now(timezone.utc).timestamp()
@@ -54,8 +116,11 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
        if not ts_raw:
            continue
        try:
+            # Parse ISO 8601 timestamp; replace trailing 'Z' with '+00:00'
+            # for compatibility with Python < 3.11 fromisoformat().
            ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp()
            if (now - ts) <= window:
+                # Key format: cluster/backup/<backup_id>/status
                parts = key.split("/")
                if len(parts) >= 3:
                    recent.append(parts[2])
@@ -65,71 +130,119 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
    return recent


+# ---------------------------------------------------------------------------
+# Per-module status collection
+# ---------------------------------------------------------------------------
+
 def _get_module_statuses(config: dict, backup_id: str) -> list:
-    """Get all per-module status entries for a given backup_id via HGETALL."""
+    """Return all per-module status entries for a given backup_id.
+
+    Scans Redis for keys matching ``module/*/backup/<backup_id>/status``
+    and reads each hash with HGETALL.
+
+    Args:
+        config:    Parsed configuration dictionary.
+        backup_id: The backup plan identifier (e.g. "1", "2").
+
+    Returns:
+        List of dicts, one per module:
+            {
+                "module_id"  : str,
+                "backup_id"  : str,
+                "result"     : "success" | "error" | "unknown",
+                "error"      : str,
+                "timestamp"  : str (ISO 8601),
+            }
+    """
    pattern = f"module/*/backup/{backup_id}/status"
    raw = _redis_cmd(config, "KEYS", pattern)
    keys = [k for k in raw.splitlines() if k]
    statuses = []

    for key in keys:
+        # Key format: module/<module_id>/backup/<backup_id>/status
        module_id = key.split("/")[1]
        fields = _redis_hgetall(config, key)
        if not fields:
            log.debug(f"Empty or missing status hash for {key}")
            continue
        statuses.append({
-            "module_id": module_id,
-            "backup_id": backup_id,
-            "result": fields.get("result", "unknown"),
-            "error": fields.get("error", ""),
-            "timestamp": fields.get("timestamp", ""),
+            "module_id":  module_id,
+            "backup_id":  backup_id,
+            "result":     fields.get("result", "unknown"),
+            "error":      fields.get("error", ""),
+            "timestamp":  fields.get("timestamp", ""),
        })

    return statuses


+# ---------------------------------------------------------------------------
+# Main correlator entry point
+# ---------------------------------------------------------------------------
+
 def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> dict:
-    """
-    Main correlator entry point.
+    """Classify the overall backup outcome by reading per-module Redis state.
+
+    Args:
+        config:     Parsed configuration dictionary.
+        backup_ids: List of backup plan IDs from Alertmanager alert labels.
+                    When empty or None, the function falls back to scanning
+                    Redis for recently updated plan status keys.

    Returns:
-    {
-        "outcome"         : "SUCCESS" | "PARTIAL" | "REPO_FAILURE",
-        "backup_ids"      : [...],
-        "modules"         : [{module_id, backup_id, result, error, timestamp}, ...],
-        "failed_modules"  : [...],
-        "total"           : int,
-        "failed"          : int,
-        "succeeded"       : int,
-        "note"            : str  # optional
-    }
+        A dict with the following keys:
+
+        outcome         : "SUCCESS" | "PARTIAL" | "REPO_FAILURE"
+        backup_ids      : list of plan IDs that were evaluated
+        modules         : list of per-module status dicts (see _get_module_statuses)
+        failed_modules  : subset of ``modules`` where result != "success"
+        total           : total number of module status entries found
+        failed          : number of failed modules
+        succeeded       : number of succeeded modules
+        note            : optional human-readable explanation string
+
+    Outcome classification rules
+    ----------------------------
+        failed == 0 and total > 0   → SUCCESS
+        failed == total or total == 0 → REPO_FAILURE  (all failed or nothing found)
+        otherwise                   → PARTIAL
    """
    window = config.get("correlator", {}).get("recent_window", 3600)

+    # ---------------------------------------------------------------------------
+    # Resolve backup_ids
+    # ---------------------------------------------------------------------------
    if not backup_ids:
        log.info("No backup_ids from alert labels, scanning Redis for recent backups...")
        backup_ids = _get_recent_backup_ids(config, window)

+    # If still empty, no relevant Redis state exists — treat as full failure.
    if not backup_ids:
        log.warning("No recent backup status keys found in Redis")
        return {
-            "outcome": "REPO_FAILURE",
-            "backup_ids": [],
-            "modules": [],
+            "outcome":        "REPO_FAILURE",
+            "backup_ids":     [],
+            "modules":        [],
            "failed_modules": [],
-            "total": 0,
-            "failed": 0,
-            "succeeded": 0,
-            "note": "No backup status found in Redis - possible repo or scheduling failure",
+            "total":          0,
+            "failed":         0,
+            "succeeded":      0,
+            "note": "No backup status found in Redis — possible repo or scheduling failure",
        }

+    # ---------------------------------------------------------------------------
+    # Collect per-module statuses across all plans
+    # ---------------------------------------------------------------------------
    all_modules = []
    for bid in backup_ids:
        modules = _get_module_statuses(config, bid)
        log.info(f"backup_id={bid}: found {len(modules)} module status entries")
        all_modules.extend(modules)

+    # ---------------------------------------------------------------------------
+    # Outcome classification
+    # ---------------------------------------------------------------------------
    total = len(all_modules)
    failed_modules = [m for m in all_modules if m["result"] != "success"]
    succeeded = total - len(failed_modules)
@@ -137,18 +250,22 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
    if len(failed_modules) == 0 and total > 0:
        outcome = "SUCCESS"
    elif len(failed_modules) == total or total == 0:
+        # All modules failed, or no modules were found at all.
        outcome = "REPO_FAILURE"
    else:
        outcome = "PARTIAL"

-    log.info(f"Correlation result: outcome={outcome}, total={total}, succeeded={succeeded}, failed={len(failed_modules)}")
+    log.info(
+        f"Correlation result: outcome={outcome}, total={total}, "
+        f"succeeded={succeeded}, failed={len(failed_modules)}"
+    )

    return {
-        "outcome": outcome,
-        "backup_ids": backup_ids,
-        "modules": all_modules,
+        "outcome":        outcome,
+        "backup_ids":     backup_ids,
+        "modules":        all_modules,
        "failed_modules": failed_modules,
-        "total": total,
-        "failed": len(failed_modules),
-        "succeeded": succeeded,
+        "total":          total,
+        "failed":         len(failed_modules),
+        "succeeded":      succeeded,
    }