From 6fdd6408877d3fecbe207f8bab695ffa7f16e109 Mon Sep 17 00:00:00 2001 From: admin Date: Mon, 18 May 2026 15:11:35 +0000 Subject: [PATCH] feat: add backup status correlator via Redis cluster state --- ns8_backup_monitor/correlator.py | 144 +++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 ns8_backup_monitor/correlator.py diff --git a/ns8_backup_monitor/correlator.py b/ns8_backup_monitor/correlator.py new file mode 100644 index 0000000..a5ece2f --- /dev/null +++ b/ns8_backup_monitor/correlator.py @@ -0,0 +1,144 @@ +#!/usr/bin/env python3 +""" +correlator.py - Reads NS8 cluster Redis state to determine backup outcome. + +For each backup plan/schedule, reads the per-module backup status and +produces a classified outcome: SUCCESS, PARTIAL, or REPO_FAILURE. + +NS8 Redis key pattern for backup status (community/dev research): + cluster/backup//status -> last overall status + module//backup//status -> per-module status + +Fields in status hash: + result : success | error + timestamp: ISO8601 + error : error message if any + errors : number of module errors (in plan status) +""" + +import json +import logging +import subprocess +import time +from datetime import datetime, timezone +from typing import Optional + +log = logging.getLogger(__name__) + + +def _redis_cmd(config: dict, *args) -> str: + """Run a redis-cli command against the NS8 cluster Redis socket.""" + socket = config.get("redis", {}).get("socket", "/var/lib/nethserver/cluster/state/redis.sock") + cmd = ["redis-cli", "-s", socket] + list(args) + result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) + return result.stdout.strip() + + +def _get_recent_backup_ids(config: dict, window: int) -> list: + """Scan Redis for backup plan status keys updated within window seconds.""" + raw = _redis_cmd(config, "KEYS", "cluster/backup/*/status") + keys = [k for k in raw.splitlines() if k] + now = datetime.now(timezone.utc).timestamp() + recent = [] + + for key in keys: + ts_raw = _redis_cmd(config, "HGET", key, "timestamp") + if not ts_raw: + continue + try: + ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp() + if (now - ts) <= window: + # Extract backup_id from key: cluster/backup//status + parts = key.split("/") + if len(parts) >= 3: + recent.append(parts[2]) + except (ValueError, IndexError): + continue + + return recent + + +def _get_module_statuses(config: dict, backup_id: str) -> list: + """Get all per-module status entries for a given backup_id.""" + pattern = f"module/*/backup/{backup_id}/status" + raw = _redis_cmd(config, "KEYS", pattern) + keys = [k for k in raw.splitlines() if k] + statuses = [] + + for key in keys: + module_id = key.split("/")[1] + result = _redis_cmd(config, "HGET", key, "result") + error = _redis_cmd(config, "HGET", key, "error") + timestamp = _redis_cmd(config, "HGET", key, "timestamp") + statuses.append({ + "module_id": module_id, + "backup_id": backup_id, + "result": result, + "error": error, + "timestamp": timestamp, + }) + + return statuses + + +def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> dict: + """ + Main correlator entry point. + + Returns a dict: + { + "outcome": "SUCCESS" | "PARTIAL" | "REPO_FAILURE", + "backup_ids": [...], + "modules": [ + {"module_id": ..., "backup_id": ..., "result": ..., "error": ..., "timestamp": ...}, + ... + ], + "failed_modules": [...], + "total": int, + "failed": int, + "succeeded": int, + } + """ + window = config.get("correlator", {}).get("recent_window", 3600) + + if not backup_ids: + log.info("No backup_ids from alert labels, scanning Redis for recent backups...") + backup_ids = _get_recent_backup_ids(config, window) + + if not backup_ids: + log.warning("No recent backup status keys found in Redis") + return { + "outcome": "REPO_FAILURE", + "backup_ids": [], + "modules": [], + "failed_modules": [], + "total": 0, + "failed": 0, + "succeeded": 0, + "note": "No backup status found in Redis - possible repo or scheduling failure", + } + + all_modules = [] + for bid in backup_ids: + all_modules.extend(_get_module_statuses(config, bid)) + + total = len(all_modules) + failed_modules = [m for m in all_modules if m["result"] != "success"] + succeeded = total - len(failed_modules) + + if len(failed_modules) == 0 and total > 0: + outcome = "SUCCESS" + elif len(failed_modules) == total or total == 0: + outcome = "REPO_FAILURE" + else: + outcome = "PARTIAL" + + return { + "outcome": outcome, + "backup_ids": backup_ids, + "modules": all_modules, + "failed_modules": failed_modules, + "total": total, + "failed": len(failed_modules), + "succeeded": succeeded, + }