From f20a214cd84c2827a2fada9b7c38e7ea5f445616 Mon Sep 17 00:00:00 2001 From: admin Date: Mon, 18 May 2026 21:55:24 +0000 Subject: [PATCH] docs: add full docstrings, Redis key/field reference, outcome classification rules --- ns8_backup_monitor/correlator.py | 117 ++++++++++++++----------------- 1 file changed, 52 insertions(+), 65 deletions(-) diff --git a/ns8_backup_monitor/correlator.py b/ns8_backup_monitor/correlator.py index e32a4fe..e59f3a7 100644 --- a/ns8_backup_monitor/correlator.py +++ b/ns8_backup_monitor/correlator.py @@ -1,33 +1,32 @@ #!/usr/bin/env python3 -"""Read NS8 cluster Redis state to determine the backup outcome. +"""Read NS8 cluster Redis state to determine the overall backup outcome. For each backup plan/schedule, this module reads the per-module backup -status hashes from the cluster Redis and produces a classified outcome: +status hashes from the cluster Redis and produces a classified outcome:: - SUCCESS – All modules finished without errors. - PARTIAL – Some modules failed, others succeeded. - REPO_FAILURE – All modules failed, total is zero, or no status was + SUCCESS - All modules finished without errors. + PARTIAL - Some modules failed, others succeeded. + REPO_FAILURE - All modules failed, total is zero, or no status was found in Redis at all (possible repository-level issue). NS8 Redis key patterns ----------------------- - cluster/backup//status - Overall plan status hash. Fields: result, timestamp, errors. +cluster/backup//status + Overall plan status hash. + Fields: result, timestamp, errors (integer count of failed modules). - module//backup//status - Per-module status hash. Fields: result, timestamp, error. +module//backup//status + Per-module status hash written by each backup module after it runs. + Fields: result ("success"|"error"), timestamp (ISO 8601), error (message). -Redis hash fields ------------------ - result : "success" | "error" - timestamp : ISO 8601 string - error : human-readable error message (empty on success) - errors : integer count of module errors (plan-level hash only) +Redis is accessed via ``redis-cli`` over the cluster Unix socket. No Python +Redis client library is required, keeping the dependency list to zero. -Dependencies ------------- -Only the standard library and ``redis-cli`` (installed with NS8) are required. -No Python Redis client library is needed. +Outcome classification rules +----------------------------- + failed == 0 and total > 0 -> SUCCESS + failed == total or total == 0 -> REPO_FAILURE + otherwise -> PARTIAL """ import logging @@ -41,19 +40,17 @@ log = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Redis helpers # --------------------------------------------------------------------------- -# These thin wrappers call redis-cli via subprocess instead of using a Python -# Redis client, keeping the dependency list to zero and staying consistent with -# how other NS8 scripts interact with the cluster Redis. def _redis_cmd(config: dict, *args) -> str: """Run a redis-cli command against the NS8 cluster Redis Unix socket. Args: config: Parsed configuration dictionary (reads ``redis.socket``). - *args: Redis command and arguments (e.g. "KEYS", "cluster/backup/*"). + *args: Redis command and arguments, e.g. "KEYS", "cluster/backup/*". Returns: Raw stdout string, stripped of leading/trailing whitespace. + Returns an empty string on timeout or error. """ socket = config.get("redis", {}).get( "socket", "/var/lib/nethserver/cluster/state/redis.sock" @@ -74,8 +71,8 @@ def _redis_hgetall(config: dict, key: str) -> dict: key: Full Redis key of the hash to read. Returns: - Dict mapping field names to values, or an empty dict if the key - does not exist or the hash is empty. + Dict mapping field names to string values, or an empty dict if + the key does not exist or the hash is empty. """ socket = config.get("redis", {}).get( "socket", "/var/lib/nethserver/cluster/state/redis.sock" @@ -83,20 +80,21 @@ def _redis_hgetall(config: dict, key: str) -> dict: cmd = ["redis-cli", "-s", socket, "HGETALL", key] result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) lines = [l for l in result.stdout.strip().splitlines() if l] - # redis-cli HGETALL returns alternating key / value lines: - # line 0 → field name, line 1 → value, line 2 → field name, … + # HGETALL returns alternating key / value lines: + # line 0 -> field name, line 1 -> value, line 2 -> field name, ... return dict(zip(lines[::2], lines[1::2])) # --------------------------------------------------------------------------- -# Recent backup discovery +# Recent backup discovery (fallback when alert carries no backup_id / id label) # --------------------------------------------------------------------------- def _get_recent_backup_ids(config: dict, window: int) -> list: """Scan Redis for plan status keys updated within the last ``window`` seconds. - Used as a fallback when Alertmanager does not include a ``backup_id`` - label on the alert (older NS8 versions or custom alert rules). + This is used as a fallback when Alertmanager does not include a backup plan + identifier in the alert labels (older NS8 versions or custom alert rules + that do not set the ``id`` or ``backup_id`` label). Args: config: Parsed configuration dictionary. @@ -116,8 +114,7 @@ def _get_recent_backup_ids(config: dict, window: int) -> list: if not ts_raw: continue try: - # Parse ISO 8601 timestamp; replace trailing 'Z' with '+00:00' - # for compatibility with Python < 3.11 fromisoformat(). + # Replace trailing 'Z' with '+00:00' for Python < 3.11 compat. ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp() if (now - ts) <= window: # Key format: cluster/backup//status @@ -135,7 +132,7 @@ def _get_recent_backup_ids(config: dict, window: int) -> list: # --------------------------------------------------------------------------- def _get_module_statuses(config: dict, backup_id: str) -> list: - """Return all per-module status entries for a given backup_id. + """Return all per-module status entries for a given backup plan id. Scans Redis for keys matching ``module/*/backup//status`` and reads each hash with HGETALL. @@ -145,7 +142,8 @@ def _get_module_statuses(config: dict, backup_id: str) -> list: backup_id: The backup plan identifier (e.g. "1", "2"). Returns: - List of dicts, one per module: + List of dicts, one per module:: + { "module_id" : str, "backup_id" : str, @@ -164,21 +162,21 @@ def _get_module_statuses(config: dict, backup_id: str) -> list: module_id = key.split("/")[1] fields = _redis_hgetall(config, key) if not fields: - log.debug(f"Empty or missing status hash for {key}") + log.debug("Empty or missing status hash for %s", key) continue statuses.append({ - "module_id": module_id, - "backup_id": backup_id, - "result": fields.get("result", "unknown"), - "error": fields.get("error", ""), - "timestamp": fields.get("timestamp", ""), + "module_id": module_id, + "backup_id": backup_id, + "result": fields.get("result", "unknown"), + "error": fields.get("error", ""), + "timestamp": fields.get("timestamp", ""), }) return statuses # --------------------------------------------------------------------------- -# Main correlator entry point +# Main entry point # --------------------------------------------------------------------------- def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> dict: @@ -186,38 +184,30 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> Args: config: Parsed configuration dictionary. - backup_ids: List of backup plan IDs from Alertmanager alert labels. - When empty or None, the function falls back to scanning - Redis for recently updated plan status keys. + backup_ids: List of backup plan IDs extracted from Alertmanager labels + (see receiver.py label mapping). When empty or None, falls + back to scanning Redis for recently updated plan status keys. Returns: A dict with the following keys: outcome : "SUCCESS" | "PARTIAL" | "REPO_FAILURE" backup_ids : list of plan IDs that were evaluated - modules : list of per-module status dicts (see _get_module_statuses) + modules : list of per-module status dicts failed_modules : subset of ``modules`` where result != "success" total : total number of module status entries found failed : number of failed modules succeeded : number of succeeded modules note : optional human-readable explanation string - - Outcome classification rules - ---------------------------- - failed == 0 and total > 0 → SUCCESS - failed == total or total == 0 → REPO_FAILURE (all failed or nothing found) - otherwise → PARTIAL """ window = config.get("correlator", {}).get("recent_window", 3600) - # --------------------------------------------------------------------------- - # Resolve backup_ids - # --------------------------------------------------------------------------- + # Resolve backup_ids: use alert labels when available, scan Redis otherwise. if not backup_ids: log.info("No backup_ids from alert labels, scanning Redis for recent backups...") backup_ids = _get_recent_backup_ids(config, window) - # If still empty, no relevant Redis state exists — treat as full failure. + # If still empty, no relevant Redis state exists - treat as full failure. if not backup_ids: log.warning("No recent backup status keys found in Redis") return { @@ -228,21 +218,17 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> "total": 0, "failed": 0, "succeeded": 0, - "note": "No backup status found in Redis — possible repo or scheduling failure", + "note": "No backup status found in Redis - possible repo or scheduling failure", } - # --------------------------------------------------------------------------- - # Collect per-module statuses across all plans - # --------------------------------------------------------------------------- + # Collect per-module statuses across all resolved plan IDs. all_modules = [] for bid in backup_ids: modules = _get_module_statuses(config, bid) - log.info(f"backup_id={bid}: found {len(modules)} module status entries") + log.info("backup_id=%s: found %d module status entries", bid, len(modules)) all_modules.extend(modules) - # --------------------------------------------------------------------------- - # Outcome classification - # --------------------------------------------------------------------------- + # Classify outcome. total = len(all_modules) failed_modules = [m for m in all_modules if m["result"] != "success"] succeeded = total - len(failed_modules) @@ -256,8 +242,8 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> outcome = "PARTIAL" log.info( - f"Correlation result: outcome={outcome}, total={total}, " - f"succeeded={succeeded}, failed={len(failed_modules)}" + "Outcome: %s (total=%d succeeded=%d failed=%d)", + outcome, total, succeeded, len(failed_modules), ) return { @@ -268,4 +254,5 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> "total": total, "failed": len(failed_modules), "succeeded": succeeded, + "note": "", }