feat: add backup status correlator via Redis cluster state
This commit is contained in:
@@ -0,0 +1,144 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
correlator.py - Reads NS8 cluster Redis state to determine backup outcome.
|
||||
|
||||
For each backup plan/schedule, reads the per-module backup status and
|
||||
produces a classified outcome: SUCCESS, PARTIAL, or REPO_FAILURE.
|
||||
|
||||
NS8 Redis key pattern for backup status (community/dev research):
|
||||
cluster/backup/<backup_id>/status -> last overall status
|
||||
module/<module_id>/backup/<backup_id>/status -> per-module status
|
||||
|
||||
Fields in status hash:
|
||||
result : success | error
|
||||
timestamp: ISO8601
|
||||
error : error message if any
|
||||
errors : number of module errors (in plan status)
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _redis_cmd(config: dict, *args) -> str:
|
||||
"""Run a redis-cli command against the NS8 cluster Redis socket."""
|
||||
socket = config.get("redis", {}).get("socket", "/var/lib/nethserver/cluster/state/redis.sock")
|
||||
cmd = ["redis-cli", "-s", socket] + list(args)
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
|
||||
return result.stdout.strip()
|
||||
|
||||
|
||||
def _get_recent_backup_ids(config: dict, window: int) -> list:
|
||||
"""Scan Redis for backup plan status keys updated within window seconds."""
|
||||
raw = _redis_cmd(config, "KEYS", "cluster/backup/*/status")
|
||||
keys = [k for k in raw.splitlines() if k]
|
||||
now = datetime.now(timezone.utc).timestamp()
|
||||
recent = []
|
||||
|
||||
for key in keys:
|
||||
ts_raw = _redis_cmd(config, "HGET", key, "timestamp")
|
||||
if not ts_raw:
|
||||
continue
|
||||
try:
|
||||
ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp()
|
||||
if (now - ts) <= window:
|
||||
# Extract backup_id from key: cluster/backup/<id>/status
|
||||
parts = key.split("/")
|
||||
if len(parts) >= 3:
|
||||
recent.append(parts[2])
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
return recent
|
||||
|
||||
|
||||
def _get_module_statuses(config: dict, backup_id: str) -> list:
|
||||
"""Get all per-module status entries for a given backup_id."""
|
||||
pattern = f"module/*/backup/{backup_id}/status"
|
||||
raw = _redis_cmd(config, "KEYS", pattern)
|
||||
keys = [k for k in raw.splitlines() if k]
|
||||
statuses = []
|
||||
|
||||
for key in keys:
|
||||
module_id = key.split("/")[1]
|
||||
result = _redis_cmd(config, "HGET", key, "result")
|
||||
error = _redis_cmd(config, "HGET", key, "error")
|
||||
timestamp = _redis_cmd(config, "HGET", key, "timestamp")
|
||||
statuses.append({
|
||||
"module_id": module_id,
|
||||
"backup_id": backup_id,
|
||||
"result": result,
|
||||
"error": error,
|
||||
"timestamp": timestamp,
|
||||
})
|
||||
|
||||
return statuses
|
||||
|
||||
|
||||
def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> dict:
|
||||
"""
|
||||
Main correlator entry point.
|
||||
|
||||
Returns a dict:
|
||||
{
|
||||
"outcome": "SUCCESS" | "PARTIAL" | "REPO_FAILURE",
|
||||
"backup_ids": [...],
|
||||
"modules": [
|
||||
{"module_id": ..., "backup_id": ..., "result": ..., "error": ..., "timestamp": ...},
|
||||
...
|
||||
],
|
||||
"failed_modules": [...],
|
||||
"total": int,
|
||||
"failed": int,
|
||||
"succeeded": int,
|
||||
}
|
||||
"""
|
||||
window = config.get("correlator", {}).get("recent_window", 3600)
|
||||
|
||||
if not backup_ids:
|
||||
log.info("No backup_ids from alert labels, scanning Redis for recent backups...")
|
||||
backup_ids = _get_recent_backup_ids(config, window)
|
||||
|
||||
if not backup_ids:
|
||||
log.warning("No recent backup status keys found in Redis")
|
||||
return {
|
||||
"outcome": "REPO_FAILURE",
|
||||
"backup_ids": [],
|
||||
"modules": [],
|
||||
"failed_modules": [],
|
||||
"total": 0,
|
||||
"failed": 0,
|
||||
"succeeded": 0,
|
||||
"note": "No backup status found in Redis - possible repo or scheduling failure",
|
||||
}
|
||||
|
||||
all_modules = []
|
||||
for bid in backup_ids:
|
||||
all_modules.extend(_get_module_statuses(config, bid))
|
||||
|
||||
total = len(all_modules)
|
||||
failed_modules = [m for m in all_modules if m["result"] != "success"]
|
||||
succeeded = total - len(failed_modules)
|
||||
|
||||
if len(failed_modules) == 0 and total > 0:
|
||||
outcome = "SUCCESS"
|
||||
elif len(failed_modules) == total or total == 0:
|
||||
outcome = "REPO_FAILURE"
|
||||
else:
|
||||
outcome = "PARTIAL"
|
||||
|
||||
return {
|
||||
"outcome": outcome,
|
||||
"backup_ids": backup_ids,
|
||||
"modules": all_modules,
|
||||
"failed_modules": failed_modules,
|
||||
"total": total,
|
||||
"failed": len(failed_modules),
|
||||
"succeeded": succeeded,
|
||||
}
|
||||
Reference in New Issue
Block a user