docs: add section-by-section comments — correlator.py

2026-05-18 21:00:51 +00:00
parent 62aa1804dc
commit 9366027534
1 changed files with 166 additions and 49 deletions
@@ -1,19 +1,33 @@
 #!/usr/bin/env python3
-"""
+"""Read NS8 cluster Redis state to determine the backup outcome.
 correlator.py - Reads NS8 cluster Redis state to determine backup outcome.
-For each backup plan/schedule, reads the per-module backup status and
+For each backup plan/schedule, this module reads the per-module backup
-produces a classified outcome: SUCCESS, PARTIAL, or REPO_FAILURE.
+status hashes from the cluster Redis and produces a classified outcome:
-NS8 Redis key patterns:
+    SUCCESS      – All modules finished without errors.
-  cluster/backup/<backup_id>/status            -> last overall plan status (hash)
+    PARTIAL      – Some modules failed, others succeeded.
-  module/<module_id>/backup/<backup_id>/status -> per-module status (hash)
+    REPO_FAILURE – All modules failed, total is zero, or no status was
                   found in Redis at all (possible repository-level issue).
-Fields in status hash:
+NS8 Redis key patterns
-  result   : success | error
+-----------------------
-  timestamp: ISO8601
+    cluster/backup/<backup_id>/status
-  error    : error message if any
+        Overall plan status hash. Fields: result, timestamp, errors.
-  errors   : number of module errors (in plan status)
+
    module/<module_id>/backup/<backup_id>/status
        Per-module status hash. Fields: result, timestamp, error.
 Redis hash fields
 -----------------
    result    : "success" | "error"
    timestamp : ISO 8601 string
    error     : human-readable error message (empty on success)
    errors    : integer count of module errors (plan-level hash only)
 Dependencies
 ------------
 Only the standard library and ``redis-cli`` (installed with NS8) are required.
 No Python Redis client library is needed.
 """
 import logging
@@ -24,25 +38,73 @@ from typing import Optional
 log = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Redis helpers
 # ---------------------------------------------------------------------------
 # These thin wrappers call redis-cli via subprocess instead of using a Python
 # Redis client, keeping the dependency list to zero and staying consistent with
 # how other NS8 scripts interact with the cluster Redis.
 def _redis_cmd(config: dict, *args) -> str:
-    """Run a redis-cli command against the NS8 cluster Redis socket."""
+    """Run a redis-cli command against the NS8 cluster Redis Unix socket.
-    socket = config.get("redis", {}).get("socket", "/var/lib/nethserver/cluster/state/redis.sock")
+
    Args:
        config: Parsed configuration dictionary (reads ``redis.socket``).
        *args:  Redis command and arguments (e.g. "KEYS", "cluster/backup/*").
    Returns:
        Raw stdout string, stripped of leading/trailing whitespace.
    """
    socket = config.get("redis", {}).get(
        "socket", "/var/lib/nethserver/cluster/state/redis.sock"
    )
    cmd = ["redis-cli", "-s", socket] + list(args)
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    return result.stdout.strip()
 def _redis_hgetall(config: dict, key: str) -> dict:
-    """Return all fields of a Redis hash as a dict in a single redis-cli call."""
+    """Return all fields of a Redis hash as a Python dict.
-    socket = config.get("redis", {}).get("socket", "/var/lib/nethserver/cluster/state/redis.sock")
+
    ``redis-cli HGETALL`` outputs alternating field / value lines.
    This function zips consecutive pairs into a dict.
    Args:
        config: Parsed configuration dictionary.
        key:    Full Redis key of the hash to read.
    Returns:
        Dict mapping field names to values, or an empty dict if the key
        does not exist or the hash is empty.
    """
    socket = config.get("redis", {}).get(
        "socket", "/var/lib/nethserver/cluster/state/redis.sock"
    )
    cmd = ["redis-cli", "-s", socket, "HGETALL", key]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    lines = [l for l in result.stdout.strip().splitlines() if l]
    # redis-cli HGETALL returns alternating key / value lines:
    #   line 0 → field name, line 1 → value, line 2 → field name, …
    return dict(zip(lines[::2], lines[1::2]))
 # ---------------------------------------------------------------------------
 # Recent backup discovery
 # ---------------------------------------------------------------------------
 def _get_recent_backup_ids(config: dict, window: int) -> list:
-    """Scan Redis for backup plan status keys updated within window seconds."""
+    """Scan Redis for plan status keys updated within the last ``window`` seconds.
    Used as a fallback when Alertmanager does not include a ``backup_id``
    label on the alert (older NS8 versions or custom alert rules).
    Args:
        config: Parsed configuration dictionary.
        window: Look-back window in seconds (from ``correlator.recent_window``).
    Returns:
        List of backup_id strings whose plan status was updated recently.
    """
    raw = _redis_cmd(config, "KEYS", "cluster/backup/*/status")
    keys = [k for k in raw.splitlines() if k]
    now = datetime.now(timezone.utc).timestamp()
@@ -54,8 +116,11 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
        if not ts_raw:
            continue
        try:
            # Parse ISO 8601 timestamp; replace trailing 'Z' with '+00:00'
            # for compatibility with Python < 3.11 fromisoformat().
            ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp()
            if (now - ts) <= window:
                # Key format: cluster/backup/<backup_id>/status
                parts = key.split("/")
                if len(parts) >= 3:
                    recent.append(parts[2])
@@ -65,14 +130,37 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
    return recent
 # ---------------------------------------------------------------------------
 # Per-module status collection
 # ---------------------------------------------------------------------------
 def _get_module_statuses(config: dict, backup_id: str) -> list:
-    """Get all per-module status entries for a given backup_id via HGETALL."""
+    """Return all per-module status entries for a given backup_id.
    Scans Redis for keys matching ``module/*/backup/<backup_id>/status``
    and reads each hash with HGETALL.
    Args:
        config:    Parsed configuration dictionary.
        backup_id: The backup plan identifier (e.g. "1", "2").
    Returns:
        List of dicts, one per module:
            {
                "module_id"  : str,
                "backup_id"  : str,
                "result"     : "success" | "error" | "unknown",
                "error"      : str,
                "timestamp"  : str (ISO 8601),
            }
    """
    pattern = f"module/*/backup/{backup_id}/status"
    raw = _redis_cmd(config, "KEYS", pattern)
    keys = [k for k in raw.splitlines() if k]
    statuses = []
    for key in keys:
        # Key format: module/<module_id>/backup/<backup_id>/status
        module_id = key.split("/")[1]
        fields = _redis_hgetall(config, key)
        if not fields:
@@ -89,28 +177,47 @@ def _get_module_statuses(config: dict, backup_id: str) -> list:
    return statuses
 # ---------------------------------------------------------------------------
 # Main correlator entry point
 # ---------------------------------------------------------------------------
 def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> dict:
-    """
+    """Classify the overall backup outcome by reading per-module Redis state.
-    Main correlator entry point.
+
    Args:
        config:     Parsed configuration dictionary.
        backup_ids: List of backup plan IDs from Alertmanager alert labels.
                    When empty or None, the function falls back to scanning
                    Redis for recently updated plan status keys.
    Returns:
-    {
+        A dict with the following keys:
-        "outcome"         : "SUCCESS" | "PARTIAL" | "REPO_FAILURE",
+
-        "backup_ids"      : [...],
+        outcome         : "SUCCESS" | "PARTIAL" | "REPO_FAILURE"
-        "modules"         : [{module_id, backup_id, result, error, timestamp}, ...],
+        backup_ids      : list of plan IDs that were evaluated
-        "failed_modules"  : [...],
+        modules         : list of per-module status dicts (see _get_module_statuses)
-        "total"           : int,
+        failed_modules  : subset of ``modules`` where result != "success"
-        "failed"          : int,
+        total           : total number of module status entries found
-        "succeeded"       : int,
+        failed          : number of failed modules
-        "note"            : str  # optional
+        succeeded       : number of succeeded modules
-    }
+        note            : optional human-readable explanation string
    Outcome classification rules
    ----------------------------
        failed == 0 and total > 0   → SUCCESS
        failed == total or total == 0 → REPO_FAILURE  (all failed or nothing found)
        otherwise                   → PARTIAL
    """
    window = config.get("correlator", {}).get("recent_window", 3600)
    # ---------------------------------------------------------------------------
    # Resolve backup_ids
    # ---------------------------------------------------------------------------
    if not backup_ids:
        log.info("No backup_ids from alert labels, scanning Redis for recent backups...")
        backup_ids = _get_recent_backup_ids(config, window)
    # If still empty, no relevant Redis state exists — treat as full failure.
    if not backup_ids:
        log.warning("No recent backup status keys found in Redis")
        return {
@@ -121,15 +228,21 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
            "total":          0,
            "failed":         0,
            "succeeded":      0,
-            "note": "No backup status found in Redis - possible repo or scheduling failure",
+            "note": "No backup status found in Redis — possible repo or scheduling failure",
        }
    # ---------------------------------------------------------------------------
    # Collect per-module statuses across all plans
    # ---------------------------------------------------------------------------
    all_modules = []
    for bid in backup_ids:
        modules = _get_module_statuses(config, bid)
        log.info(f"backup_id={bid}: found {len(modules)} module status entries")
        all_modules.extend(modules)
    # ---------------------------------------------------------------------------
    # Outcome classification
    # ---------------------------------------------------------------------------
    total = len(all_modules)
    failed_modules = [m for m in all_modules if m["result"] != "success"]
    succeeded = total - len(failed_modules)
@@ -137,11 +250,15 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
    if len(failed_modules) == 0 and total > 0:
        outcome = "SUCCESS"
    elif len(failed_modules) == total or total == 0:
        # All modules failed, or no modules were found at all.
        outcome = "REPO_FAILURE"
    else:
        outcome = "PARTIAL"
-    log.info(f"Correlation result: outcome={outcome}, total={total}, succeeded={succeeded}, failed={len(failed_modules)}")
+    log.info(
        f"Correlation result: outcome={outcome}, total={total}, "
        f"succeeded={succeeded}, failed={len(failed_modules)}"
    )
    return {
        "outcome":        outcome,