From f20a214cd84c2827a2fada9b7c38e7ea5f445616 Mon Sep 17 00:00:00 2001
From: admin <root@lelekaos.com>
Date: Mon, 18 May 2026 21:55:24 +0000
Subject: [PATCH] docs: add full docstrings, Redis key/field reference, outcome
 classification rules

---
 ns8_backup_monitor/correlator.py | 117 ++++++++++++++-----------------
 1 file changed, 52 insertions(+), 65 deletions(-)

diff --git a/ns8_backup_monitor/correlator.py b/ns8_backup_monitor/correlator.py
index e32a4fe..e59f3a7 100644
--- a/ns8_backup_monitor/correlator.py
+++ b/ns8_backup_monitor/correlator.py
@@ -1,33 +1,32 @@
 #!/usr/bin/env python3
-"""Read NS8 cluster Redis state to determine the backup outcome.
+"""Read NS8 cluster Redis state to determine the overall backup outcome.
 
 For each backup plan/schedule, this module reads the per-module backup
-status hashes from the cluster Redis and produces a classified outcome:
+status hashes from the cluster Redis and produces a classified outcome::
 
-    SUCCESS      – All modules finished without errors.
-    PARTIAL      – Some modules failed, others succeeded.
-    REPO_FAILURE – All modules failed, total is zero, or no status was
+    SUCCESS      - All modules finished without errors.
+    PARTIAL      - Some modules failed, others succeeded.
+    REPO_FAILURE - All modules failed, total is zero, or no status was
                    found in Redis at all (possible repository-level issue).
 
 NS8 Redis key patterns
 -----------------------
-    cluster/backup/<backup_id>/status
-        Overall plan status hash. Fields: result, timestamp, errors.
+cluster/backup/<backup_id>/status
+    Overall plan status hash.
+    Fields: result, timestamp, errors (integer count of failed modules).
 
-    module/<module_id>/backup/<backup_id>/status
-        Per-module status hash. Fields: result, timestamp, error.
+module/<module_id>/backup/<backup_id>/status
+    Per-module status hash written by each backup module after it runs.
+    Fields: result ("success"|"error"), timestamp (ISO 8601), error (message).
 
-Redis hash fields
------------------
-    result    : "success" | "error"
-    timestamp : ISO 8601 string
-    error     : human-readable error message (empty on success)
-    errors    : integer count of module errors (plan-level hash only)
+Redis is accessed via ``redis-cli`` over the cluster Unix socket. No Python
+Redis client library is required, keeping the dependency list to zero.
 
-Dependencies
-------------
-Only the standard library and ``redis-cli`` (installed with NS8) are required.
-No Python Redis client library is needed.
+Outcome classification rules
+-----------------------------
+    failed == 0 and total > 0     -> SUCCESS
+    failed == total or total == 0 -> REPO_FAILURE
+    otherwise                     -> PARTIAL
 """
 
 import logging
@@ -41,19 +40,17 @@ log = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Redis helpers
 # ---------------------------------------------------------------------------
-# These thin wrappers call redis-cli via subprocess instead of using a Python
-# Redis client, keeping the dependency list to zero and staying consistent with
-# how other NS8 scripts interact with the cluster Redis.
 
 def _redis_cmd(config: dict, *args) -> str:
     """Run a redis-cli command against the NS8 cluster Redis Unix socket.
 
     Args:
         config: Parsed configuration dictionary (reads ``redis.socket``).
-        *args:  Redis command and arguments (e.g. "KEYS", "cluster/backup/*").
+        *args:  Redis command and arguments, e.g. "KEYS", "cluster/backup/*".
 
     Returns:
         Raw stdout string, stripped of leading/trailing whitespace.
+        Returns an empty string on timeout or error.
     """
     socket = config.get("redis", {}).get(
         "socket", "/var/lib/nethserver/cluster/state/redis.sock"
@@ -74,8 +71,8 @@ def _redis_hgetall(config: dict, key: str) -> dict:
         key:    Full Redis key of the hash to read.
 
     Returns:
-        Dict mapping field names to values, or an empty dict if the key
-        does not exist or the hash is empty.
+        Dict mapping field names to string values, or an empty dict if
+        the key does not exist or the hash is empty.
     """
     socket = config.get("redis", {}).get(
         "socket", "/var/lib/nethserver/cluster/state/redis.sock"
@@ -83,20 +80,21 @@ def _redis_hgetall(config: dict, key: str) -> dict:
     cmd = ["redis-cli", "-s", socket, "HGETALL", key]
     result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
     lines = [l for l in result.stdout.strip().splitlines() if l]
-    # redis-cli HGETALL returns alternating key / value lines:
-    #   line 0 → field name, line 1 → value, line 2 → field name, …
+    # HGETALL returns alternating key / value lines:
+    #   line 0 -> field name, line 1 -> value, line 2 -> field name, ...
     return dict(zip(lines[::2], lines[1::2]))
 
 
 # ---------------------------------------------------------------------------
-# Recent backup discovery
+# Recent backup discovery (fallback when alert carries no backup_id / id label)
 # ---------------------------------------------------------------------------
 
 def _get_recent_backup_ids(config: dict, window: int) -> list:
     """Scan Redis for plan status keys updated within the last ``window`` seconds.
 
-    Used as a fallback when Alertmanager does not include a ``backup_id``
-    label on the alert (older NS8 versions or custom alert rules).
+    This is used as a fallback when Alertmanager does not include a backup plan
+    identifier in the alert labels (older NS8 versions or custom alert rules
+    that do not set the ``id`` or ``backup_id`` label).
 
     Args:
         config: Parsed configuration dictionary.
@@ -116,8 +114,7 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
         if not ts_raw:
             continue
         try:
-            # Parse ISO 8601 timestamp; replace trailing 'Z' with '+00:00'
-            # for compatibility with Python < 3.11 fromisoformat().
+            # Replace trailing 'Z' with '+00:00' for Python < 3.11 compat.
             ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp()
             if (now - ts) <= window:
                 # Key format: cluster/backup/<backup_id>/status
@@ -135,7 +132,7 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
 # ---------------------------------------------------------------------------
 
 def _get_module_statuses(config: dict, backup_id: str) -> list:
-    """Return all per-module status entries for a given backup_id.
+    """Return all per-module status entries for a given backup plan id.
 
     Scans Redis for keys matching ``module/*/backup/<backup_id>/status``
     and reads each hash with HGETALL.
@@ -145,7 +142,8 @@ def _get_module_statuses(config: dict, backup_id: str) -> list:
         backup_id: The backup plan identifier (e.g. "1", "2").
 
     Returns:
-        List of dicts, one per module:
+        List of dicts, one per module::
+
             {
                 "module_id"  : str,
                 "backup_id"  : str,
@@ -164,21 +162,21 @@ def _get_module_statuses(config: dict, backup_id: str) -> list:
         module_id = key.split("/")[1]
         fields = _redis_hgetall(config, key)
         if not fields:
-            log.debug(f"Empty or missing status hash for {key}")
+            log.debug("Empty or missing status hash for %s", key)
             continue
         statuses.append({
-            "module_id":  module_id,
-            "backup_id":  backup_id,
-            "result":     fields.get("result", "unknown"),
-            "error":      fields.get("error", ""),
-            "timestamp":  fields.get("timestamp", ""),
+            "module_id": module_id,
+            "backup_id": backup_id,
+            "result":    fields.get("result", "unknown"),
+            "error":     fields.get("error", ""),
+            "timestamp": fields.get("timestamp", ""),
         })
 
     return statuses
 
 
 # ---------------------------------------------------------------------------
-# Main correlator entry point
+# Main entry point
 # ---------------------------------------------------------------------------
 
 def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> dict:
@@ -186,38 +184,30 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
 
     Args:
         config:     Parsed configuration dictionary.
-        backup_ids: List of backup plan IDs from Alertmanager alert labels.
-                    When empty or None, the function falls back to scanning
-                    Redis for recently updated plan status keys.
+        backup_ids: List of backup plan IDs extracted from Alertmanager labels
+                    (see receiver.py label mapping). When empty or None, falls
+                    back to scanning Redis for recently updated plan status keys.
 
     Returns:
         A dict with the following keys:
 
         outcome         : "SUCCESS" | "PARTIAL" | "REPO_FAILURE"
         backup_ids      : list of plan IDs that were evaluated
-        modules         : list of per-module status dicts (see _get_module_statuses)
+        modules         : list of per-module status dicts
         failed_modules  : subset of ``modules`` where result != "success"
         total           : total number of module status entries found
         failed          : number of failed modules
         succeeded       : number of succeeded modules
         note            : optional human-readable explanation string
-
-    Outcome classification rules
-    ----------------------------
-        failed == 0 and total > 0   → SUCCESS
-        failed == total or total == 0 → REPO_FAILURE  (all failed or nothing found)
-        otherwise                   → PARTIAL
     """
     window = config.get("correlator", {}).get("recent_window", 3600)
 
-    # ---------------------------------------------------------------------------
-    # Resolve backup_ids
-    # ---------------------------------------------------------------------------
+    # Resolve backup_ids: use alert labels when available, scan Redis otherwise.
     if not backup_ids:
         log.info("No backup_ids from alert labels, scanning Redis for recent backups...")
         backup_ids = _get_recent_backup_ids(config, window)
 
-    # If still empty, no relevant Redis state exists — treat as full failure.
+    # If still empty, no relevant Redis state exists - treat as full failure.
     if not backup_ids:
         log.warning("No recent backup status keys found in Redis")
         return {
@@ -228,21 +218,17 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
             "total":          0,
             "failed":         0,
             "succeeded":      0,
-            "note": "No backup status found in Redis — possible repo or scheduling failure",
+            "note": "No backup status found in Redis - possible repo or scheduling failure",
         }
 
-    # ---------------------------------------------------------------------------
-    # Collect per-module statuses across all plans
-    # ---------------------------------------------------------------------------
+    # Collect per-module statuses across all resolved plan IDs.
     all_modules = []
     for bid in backup_ids:
         modules = _get_module_statuses(config, bid)
-        log.info(f"backup_id={bid}: found {len(modules)} module status entries")
+        log.info("backup_id=%s: found %d module status entries", bid, len(modules))
         all_modules.extend(modules)
 
-    # ---------------------------------------------------------------------------
-    # Outcome classification
-    # ---------------------------------------------------------------------------
+    # Classify outcome.
     total = len(all_modules)
     failed_modules = [m for m in all_modules if m["result"] != "success"]
     succeeded = total - len(failed_modules)
@@ -256,8 +242,8 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
         outcome = "PARTIAL"
 
     log.info(
-        f"Correlation result: outcome={outcome}, total={total}, "
-        f"succeeded={succeeded}, failed={len(failed_modules)}"
+        "Outcome: %s (total=%d succeeded=%d failed=%d)",
+        outcome, total, succeeded, len(failed_modules),
     )
 
     return {
@@ -268,4 +254,5 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
         "total":          total,
         "failed":         len(failed_modules),
         "succeeded":      succeeded,
+        "note":           "",
     }