docs: add full docstrings, Redis key/field reference, outcome classification rules

2026-05-18 21:55:24 +00:00
parent 8c6d085d46
commit f20a214cd8
1 changed files with 52 additions and 65 deletions
@@ -1,33 +1,32 @@
 #!/usr/bin/env python3
-"""Read NS8 cluster Redis state to determine the backup outcome.
+"""Read NS8 cluster Redis state to determine the overall backup outcome.

 For each backup plan/schedule, this module reads the per-module backup
-status hashes from the cluster Redis and produces a classified outcome:
+status hashes from the cluster Redis and produces a classified outcome::

-    SUCCESS      – All modules finished without errors.
-    PARTIAL      – Some modules failed, others succeeded.
-    REPO_FAILURE – All modules failed, total is zero, or no status was
+    SUCCESS      - All modules finished without errors.
+    PARTIAL      - Some modules failed, others succeeded.
+    REPO_FAILURE - All modules failed, total is zero, or no status was
                   found in Redis at all (possible repository-level issue).

 NS8 Redis key patterns
 -----------------------
-    cluster/backup/<backup_id>/status
-        Overall plan status hash. Fields: result, timestamp, errors.
+cluster/backup/<backup_id>/status
+    Overall plan status hash.
+    Fields: result, timestamp, errors (integer count of failed modules).

-    module/<module_id>/backup/<backup_id>/status
-        Per-module status hash. Fields: result, timestamp, error.
+module/<module_id>/backup/<backup_id>/status
+    Per-module status hash written by each backup module after it runs.
+    Fields: result ("success"|"error"), timestamp (ISO 8601), error (message).

-Redis hash fields
-----------------
-    result    : "success" | "error"
-    timestamp : ISO 8601 string
-    error     : human-readable error message (empty on success)
-    errors    : integer count of module errors (plan-level hash only)
+Redis is accessed via ``redis-cli`` over the cluster Unix socket, so only the
+standard library and ``redis-cli`` are required (no Python Redis client).

-Dependencies
------------
-Only the standard library and ``redis-cli`` (installed with NS8) are required.
-No Python Redis client library is needed.
+Outcome classification rules
+-----------------------------
+    failed == 0 and total > 0     -> SUCCESS
+    failed == total or total == 0 -> REPO_FAILURE  (all failed or nothing found)
+    otherwise                     -> PARTIAL
 """

 import logging
@@ -41,19 +40,17 @@ log = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Redis helpers
 # ---------------------------------------------------------------------------
-# These thin wrappers call redis-cli via subprocess instead of using a Python
-# Redis client, keeping the dependency list to zero and staying consistent with
-# how other NS8 scripts interact with the cluster Redis.

 def _redis_cmd(config: dict, *args) -> str:
    """Run a redis-cli command against the NS8 cluster Redis Unix socket.

    Args:
        config: Parsed configuration dictionary (reads ``redis.socket``).
-        *args:  Redis command and arguments (e.g. "KEYS", "cluster/backup/*").
+        *args:  Redis command and arguments, e.g. "KEYS", "cluster/backup/*".

    Returns:
        Raw stdout string, stripped of leading/trailing whitespace.
+        Returns an empty string on timeout or error.
    """
    socket = config.get("redis", {}).get(
        "socket", "/var/lib/nethserver/cluster/state/redis.sock"
@@ -74,8 +71,8 @@ def _redis_hgetall(config: dict, key: str) -> dict:
        key:    Full Redis key of the hash to read.

    Returns:
-        Dict mapping field names to values, or an empty dict if the key
-        does not exist or the hash is empty.
+        Dict mapping field names to string values, or an empty dict if
+        the key does not exist or the hash is empty.
    """
    socket = config.get("redis", {}).get(
        "socket", "/var/lib/nethserver/cluster/state/redis.sock"
@@ -83,20 +80,21 @@ def _redis_hgetall(config: dict, key: str) -> dict:
    cmd = ["redis-cli", "-s", socket, "HGETALL", key]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    lines = [l for l in result.stdout.strip().splitlines() if l]
-    # redis-cli HGETALL returns alternating key / value lines:
-    #   line 0 → field name, line 1 → value, line 2 → field name, …
+    # HGETALL returns alternating key / value lines:
+    #   line 0 -> field name, line 1 -> value, line 2 -> field name, ...
    return dict(zip(lines[::2], lines[1::2]))


 # ---------------------------------------------------------------------------
-# Recent backup discovery
+# Recent backup discovery (fallback when alert carries no backup_id / id label)
 # ---------------------------------------------------------------------------

 def _get_recent_backup_ids(config: dict, window: int) -> list:
    """Scan Redis for plan status keys updated within the last ``window`` seconds.

-    Used as a fallback when Alertmanager does not include a ``backup_id``
-    label on the alert (older NS8 versions or custom alert rules).
+    This is used as a fallback when Alertmanager does not include a backup plan
+    identifier in the alert labels (older NS8 versions or custom alert rules
+    that do not set the ``id`` or ``backup_id`` label).

    Args:
        config: Parsed configuration dictionary.
@@ -116,8 +114,7 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
        if not ts_raw:
            continue
        try:
-            # Parse ISO 8601 timestamp; replace trailing 'Z' with '+00:00'
-            # for compatibility with Python < 3.11 fromisoformat().
+            # Replace trailing 'Z' with '+00:00' for Python < 3.11 compat.
            ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp()
            if (now - ts) <= window:
                # Key format: cluster/backup/<backup_id>/status
@@ -135,7 +132,7 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
 # ---------------------------------------------------------------------------

 def _get_module_statuses(config: dict, backup_id: str) -> list:
-    """Return all per-module status entries for a given backup_id.
+    """Return all per-module status entries for a given backup plan id.

    Scans Redis for keys matching ``module/*/backup/<backup_id>/status``
    and reads each hash with HGETALL.
@@ -145,7 +142,8 @@ def _get_module_statuses(config: dict, backup_id: str) -> list:
        backup_id: The backup plan identifier (e.g. "1", "2").

    Returns:
-        List of dicts, one per module:
+        List of dicts, one per module::
+
            {
                "module_id"  : str,
                "backup_id"  : str,
@@ -164,21 +162,21 @@ def _get_module_statuses(config: dict, backup_id: str) -> list:
        module_id = key.split("/")[1]
        fields = _redis_hgetall(config, key)
        if not fields:
-            log.debug(f"Empty or missing status hash for {key}")
+            log.debug("Empty or missing status hash for %s", key)
            continue
        statuses.append({
-            "module_id":  module_id,
-            "backup_id":  backup_id,
-            "result":     fields.get("result", "unknown"),
-            "error":      fields.get("error", ""),
-            "timestamp":  fields.get("timestamp", ""),
+            "module_id": module_id,
+            "backup_id": backup_id,
+            "result":    fields.get("result", "unknown"),
+            "error":     fields.get("error", ""),
+            "timestamp": fields.get("timestamp", ""),
        })

    return statuses


 # ---------------------------------------------------------------------------
-# Main correlator entry point
+# Main entry point
 # ---------------------------------------------------------------------------

 def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> dict:
@@ -186,38 +184,30 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->

    Args:
        config:     Parsed configuration dictionary.
-        backup_ids: List of backup plan IDs from Alertmanager alert labels.
-                    When empty or None, the function falls back to scanning
-                    Redis for recently updated plan status keys.
+        backup_ids: List of backup plan IDs extracted from Alertmanager labels
+                    (see receiver.py label mapping). When empty or None, falls
+                    back to scanning Redis for recently updated plan status keys.

    Returns:
        A dict with the following keys:

        outcome         : "SUCCESS" | "PARTIAL" | "REPO_FAILURE"
        backup_ids      : list of plan IDs that were evaluated
-        modules         : list of per-module status dicts (see _get_module_statuses)
+        modules         : list of per-module status dicts
        failed_modules  : subset of ``modules`` where result != "success"
        total           : total number of module status entries found
        failed          : number of failed modules
        succeeded       : number of succeeded modules
        note            : optional human-readable explanation string
-
-    Outcome classification rules
-    ----------------------------
-        failed == 0 and total > 0   → SUCCESS
-        failed == total or total == 0 → REPO_FAILURE  (all failed or nothing found)
-        otherwise                   → PARTIAL
    """
    window = config.get("correlator", {}).get("recent_window", 3600)

-    # ---------------------------------------------------------------------------
-    # Resolve backup_ids
-    # ---------------------------------------------------------------------------
+    # Resolve backup_ids: use alert labels when available, scan Redis otherwise.
    if not backup_ids:
        log.info("No backup_ids from alert labels, scanning Redis for recent backups...")
        backup_ids = _get_recent_backup_ids(config, window)

-    # If still empty, no relevant Redis state exists — treat as full failure.
+    # If still empty, no relevant Redis state exists - treat as full failure.
    if not backup_ids:
        log.warning("No recent backup status keys found in Redis")
        return {
@@ -228,21 +218,17 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
            "total":          0,
            "failed":         0,
            "succeeded":      0,
-            "note": "No backup status found in Redis — possible repo or scheduling failure",
+            "note": "No backup status found in Redis - possible repo or scheduling failure",
        }

-    # ---------------------------------------------------------------------------
-    # Collect per-module statuses across all plans
-    # ---------------------------------------------------------------------------
+    # Collect per-module statuses across all resolved plan IDs.
    all_modules = []
    for bid in backup_ids:
        modules = _get_module_statuses(config, bid)
-        log.info(f"backup_id={bid}: found {len(modules)} module status entries")
+        log.info("backup_id=%s: found %d module status entries", bid, len(modules))
        all_modules.extend(modules)

-    # ---------------------------------------------------------------------------
-    # Outcome classification
-    # ---------------------------------------------------------------------------
+    # Classify outcome.
    total = len(all_modules)
    failed_modules = [m for m in all_modules if m["result"] != "success"]
    succeeded = total - len(failed_modules)
@@ -256,8 +242,8 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
        outcome = "PARTIAL"

    log.info(
-        f"Correlation result: outcome={outcome}, total={total}, "
-        f"succeeded={succeeded}, failed={len(failed_modules)}"
+        "Outcome: %s (total=%d succeeded=%d failed=%d)",
+        outcome, total, succeeded, len(failed_modules),
    )

    return {
@@ -268,4 +254,5 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
        "total":          total,
        "failed":         len(failed_modules),
        "succeeded":      succeeded,
+        "note":           "",
    }