From 93660275344633be663cd82c1d5c2c9fbf8faf18 Mon Sep 17 00:00:00 2001 From: admin Date: Mon, 18 May 2026 21:00:51 +0000 Subject: [PATCH] =?UTF-8?q?docs:=20add=20section-by-section=20comments=20?= =?UTF-8?q?=E2=80=94=20correlator.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ns8_backup_monitor/correlator.py | 215 ++++++++++++++++++++++++------- 1 file changed, 166 insertions(+), 49 deletions(-) diff --git a/ns8_backup_monitor/correlator.py b/ns8_backup_monitor/correlator.py index 2700ca0..e32a4fe 100644 --- a/ns8_backup_monitor/correlator.py +++ b/ns8_backup_monitor/correlator.py @@ -1,19 +1,33 @@ #!/usr/bin/env python3 -""" -correlator.py - Reads NS8 cluster Redis state to determine backup outcome. +"""Read NS8 cluster Redis state to determine the backup outcome. -For each backup plan/schedule, reads the per-module backup status and -produces a classified outcome: SUCCESS, PARTIAL, or REPO_FAILURE. +For each backup plan/schedule, this module reads the per-module backup +status hashes from the cluster Redis and produces a classified outcome: -NS8 Redis key patterns: - cluster/backup//status -> last overall plan status (hash) - module//backup//status -> per-module status (hash) + SUCCESS – All modules finished without errors. + PARTIAL – Some modules failed, others succeeded. + REPO_FAILURE – All modules failed, total is zero, or no status was + found in Redis at all (possible repository-level issue). -Fields in status hash: - result : success | error - timestamp: ISO8601 - error : error message if any - errors : number of module errors (in plan status) +NS8 Redis key patterns +----------------------- + cluster/backup//status + Overall plan status hash. Fields: result, timestamp, errors. + + module//backup//status + Per-module status hash. Fields: result, timestamp, error. + +Redis hash fields +----------------- + result : "success" | "error" + timestamp : ISO 8601 string + error : human-readable error message (empty on success) + errors : integer count of module errors (plan-level hash only) + +Dependencies +------------ +Only the standard library and ``redis-cli`` (installed with NS8) are required. +No Python Redis client library is needed. """ import logging @@ -24,25 +38,73 @@ from typing import Optional log = logging.getLogger(__name__) +# --------------------------------------------------------------------------- +# Redis helpers +# --------------------------------------------------------------------------- +# These thin wrappers call redis-cli via subprocess instead of using a Python +# Redis client, keeping the dependency list to zero and staying consistent with +# how other NS8 scripts interact with the cluster Redis. + def _redis_cmd(config: dict, *args) -> str: - """Run a redis-cli command against the NS8 cluster Redis socket.""" - socket = config.get("redis", {}).get("socket", "/var/lib/nethserver/cluster/state/redis.sock") + """Run a redis-cli command against the NS8 cluster Redis Unix socket. + + Args: + config: Parsed configuration dictionary (reads ``redis.socket``). + *args: Redis command and arguments (e.g. "KEYS", "cluster/backup/*"). + + Returns: + Raw stdout string, stripped of leading/trailing whitespace. + """ + socket = config.get("redis", {}).get( + "socket", "/var/lib/nethserver/cluster/state/redis.sock" + ) cmd = ["redis-cli", "-s", socket] + list(args) result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) return result.stdout.strip() def _redis_hgetall(config: dict, key: str) -> dict: - """Return all fields of a Redis hash as a dict in a single redis-cli call.""" - socket = config.get("redis", {}).get("socket", "/var/lib/nethserver/cluster/state/redis.sock") + """Return all fields of a Redis hash as a Python dict. + + ``redis-cli HGETALL`` outputs alternating field / value lines. + This function zips consecutive pairs into a dict. + + Args: + config: Parsed configuration dictionary. + key: Full Redis key of the hash to read. + + Returns: + Dict mapping field names to values, or an empty dict if the key + does not exist or the hash is empty. + """ + socket = config.get("redis", {}).get( + "socket", "/var/lib/nethserver/cluster/state/redis.sock" + ) cmd = ["redis-cli", "-s", socket, "HGETALL", key] result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) lines = [l for l in result.stdout.strip().splitlines() if l] + # redis-cli HGETALL returns alternating key / value lines: + # line 0 → field name, line 1 → value, line 2 → field name, … return dict(zip(lines[::2], lines[1::2])) +# --------------------------------------------------------------------------- +# Recent backup discovery +# --------------------------------------------------------------------------- + def _get_recent_backup_ids(config: dict, window: int) -> list: - """Scan Redis for backup plan status keys updated within window seconds.""" + """Scan Redis for plan status keys updated within the last ``window`` seconds. + + Used as a fallback when Alertmanager does not include a ``backup_id`` + label on the alert (older NS8 versions or custom alert rules). + + Args: + config: Parsed configuration dictionary. + window: Look-back window in seconds (from ``correlator.recent_window``). + + Returns: + List of backup_id strings whose plan status was updated recently. + """ raw = _redis_cmd(config, "KEYS", "cluster/backup/*/status") keys = [k for k in raw.splitlines() if k] now = datetime.now(timezone.utc).timestamp() @@ -54,8 +116,11 @@ def _get_recent_backup_ids(config: dict, window: int) -> list: if not ts_raw: continue try: + # Parse ISO 8601 timestamp; replace trailing 'Z' with '+00:00' + # for compatibility with Python < 3.11 fromisoformat(). ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp() if (now - ts) <= window: + # Key format: cluster/backup//status parts = key.split("/") if len(parts) >= 3: recent.append(parts[2]) @@ -65,71 +130,119 @@ def _get_recent_backup_ids(config: dict, window: int) -> list: return recent +# --------------------------------------------------------------------------- +# Per-module status collection +# --------------------------------------------------------------------------- + def _get_module_statuses(config: dict, backup_id: str) -> list: - """Get all per-module status entries for a given backup_id via HGETALL.""" + """Return all per-module status entries for a given backup_id. + + Scans Redis for keys matching ``module/*/backup//status`` + and reads each hash with HGETALL. + + Args: + config: Parsed configuration dictionary. + backup_id: The backup plan identifier (e.g. "1", "2"). + + Returns: + List of dicts, one per module: + { + "module_id" : str, + "backup_id" : str, + "result" : "success" | "error" | "unknown", + "error" : str, + "timestamp" : str (ISO 8601), + } + """ pattern = f"module/*/backup/{backup_id}/status" raw = _redis_cmd(config, "KEYS", pattern) keys = [k for k in raw.splitlines() if k] statuses = [] for key in keys: + # Key format: module//backup//status module_id = key.split("/")[1] fields = _redis_hgetall(config, key) if not fields: log.debug(f"Empty or missing status hash for {key}") continue statuses.append({ - "module_id": module_id, - "backup_id": backup_id, - "result": fields.get("result", "unknown"), - "error": fields.get("error", ""), - "timestamp": fields.get("timestamp", ""), + "module_id": module_id, + "backup_id": backup_id, + "result": fields.get("result", "unknown"), + "error": fields.get("error", ""), + "timestamp": fields.get("timestamp", ""), }) return statuses +# --------------------------------------------------------------------------- +# Main correlator entry point +# --------------------------------------------------------------------------- + def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> dict: - """ - Main correlator entry point. + """Classify the overall backup outcome by reading per-module Redis state. + + Args: + config: Parsed configuration dictionary. + backup_ids: List of backup plan IDs from Alertmanager alert labels. + When empty or None, the function falls back to scanning + Redis for recently updated plan status keys. Returns: - { - "outcome" : "SUCCESS" | "PARTIAL" | "REPO_FAILURE", - "backup_ids" : [...], - "modules" : [{module_id, backup_id, result, error, timestamp}, ...], - "failed_modules" : [...], - "total" : int, - "failed" : int, - "succeeded" : int, - "note" : str # optional - } + A dict with the following keys: + + outcome : "SUCCESS" | "PARTIAL" | "REPO_FAILURE" + backup_ids : list of plan IDs that were evaluated + modules : list of per-module status dicts (see _get_module_statuses) + failed_modules : subset of ``modules`` where result != "success" + total : total number of module status entries found + failed : number of failed modules + succeeded : number of succeeded modules + note : optional human-readable explanation string + + Outcome classification rules + ---------------------------- + failed == 0 and total > 0 → SUCCESS + failed == total or total == 0 → REPO_FAILURE (all failed or nothing found) + otherwise → PARTIAL """ window = config.get("correlator", {}).get("recent_window", 3600) + # --------------------------------------------------------------------------- + # Resolve backup_ids + # --------------------------------------------------------------------------- if not backup_ids: log.info("No backup_ids from alert labels, scanning Redis for recent backups...") backup_ids = _get_recent_backup_ids(config, window) + # If still empty, no relevant Redis state exists — treat as full failure. if not backup_ids: log.warning("No recent backup status keys found in Redis") return { - "outcome": "REPO_FAILURE", - "backup_ids": [], - "modules": [], + "outcome": "REPO_FAILURE", + "backup_ids": [], + "modules": [], "failed_modules": [], - "total": 0, - "failed": 0, - "succeeded": 0, - "note": "No backup status found in Redis - possible repo or scheduling failure", + "total": 0, + "failed": 0, + "succeeded": 0, + "note": "No backup status found in Redis — possible repo or scheduling failure", } + # --------------------------------------------------------------------------- + # Collect per-module statuses across all plans + # --------------------------------------------------------------------------- all_modules = [] for bid in backup_ids: modules = _get_module_statuses(config, bid) log.info(f"backup_id={bid}: found {len(modules)} module status entries") all_modules.extend(modules) + # --------------------------------------------------------------------------- + # Outcome classification + # --------------------------------------------------------------------------- total = len(all_modules) failed_modules = [m for m in all_modules if m["result"] != "success"] succeeded = total - len(failed_modules) @@ -137,18 +250,22 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> if len(failed_modules) == 0 and total > 0: outcome = "SUCCESS" elif len(failed_modules) == total or total == 0: + # All modules failed, or no modules were found at all. outcome = "REPO_FAILURE" else: outcome = "PARTIAL" - log.info(f"Correlation result: outcome={outcome}, total={total}, succeeded={succeeded}, failed={len(failed_modules)}") + log.info( + f"Correlation result: outcome={outcome}, total={total}, " + f"succeeded={succeeded}, failed={len(failed_modules)}" + ) return { - "outcome": outcome, - "backup_ids": backup_ids, - "modules": all_modules, + "outcome": outcome, + "backup_ids": backup_ids, + "modules": all_modules, "failed_modules": failed_modules, - "total": total, - "failed": len(failed_modules), - "succeeded": succeeded, + "total": total, + "failed": len(failed_modules), + "succeeded": succeeded, }