docs: add full docstrings, Redis key/field reference, outcome classification rules

This commit is contained in:
2026-05-18 21:55:24 +00:00
parent 8c6d085d46
commit f20a214cd8
+45 -58
View File
@@ -1,33 +1,32 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""Read NS8 cluster Redis state to determine the backup outcome. """Read NS8 cluster Redis state to determine the overall backup outcome.
For each backup plan/schedule, this module reads the per-module backup For each backup plan/schedule, this module reads the per-module backup
status hashes from the cluster Redis and produces a classified outcome: status hashes from the cluster Redis and produces a classified outcome::
SUCCESS All modules finished without errors. SUCCESS - All modules finished without errors.
PARTIAL Some modules failed, others succeeded. PARTIAL - Some modules failed, others succeeded.
REPO_FAILURE All modules failed, total is zero, or no status was REPO_FAILURE - All modules failed, total is zero, or no status was
found in Redis at all (possible repository-level issue). found in Redis at all (possible repository-level issue).
NS8 Redis key patterns NS8 Redis key patterns
----------------------- -----------------------
cluster/backup/<backup_id>/status cluster/backup/<backup_id>/status
Overall plan status hash. Fields: result, timestamp, errors. Overall plan status hash.
Fields: result, timestamp, errors (integer count of failed modules).
module/<module_id>/backup/<backup_id>/status module/<module_id>/backup/<backup_id>/status
Per-module status hash. Fields: result, timestamp, error. Per-module status hash written by each backup module after it runs.
Fields: result ("success"|"error"), timestamp (ISO 8601), error (message).
Redis hash fields Redis is accessed via ``redis-cli`` over the cluster Unix socket. No Python
----------------- Redis client library is required, so the module has no Python dependencies beyond the standard library.
result : "success" | "error"
timestamp : ISO 8601 string
error : human-readable error message (empty on success)
errors : integer count of module errors (plan-level hash only)
Dependencies Outcome classification rules
------------ -----------------------------
Only the standard library and ``redis-cli`` (installed with NS8) are required. failed == 0 and total > 0 -> SUCCESS
No Python Redis client library is needed. failed == total or total == 0 -> REPO_FAILURE
otherwise -> PARTIAL
""" """
import logging import logging
@@ -41,19 +40,17 @@ log = logging.getLogger(__name__)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Redis helpers # Redis helpers
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# These thin wrappers call redis-cli via subprocess instead of using a Python
# Redis client, keeping the dependency list to zero and staying consistent with
# how other NS8 scripts interact with the cluster Redis.
def _redis_cmd(config: dict, *args) -> str: def _redis_cmd(config: dict, *args) -> str:
"""Run a redis-cli command against the NS8 cluster Redis Unix socket. """Run a redis-cli command against the NS8 cluster Redis Unix socket.
Args: Args:
config: Parsed configuration dictionary (reads ``redis.socket``). config: Parsed configuration dictionary (reads ``redis.socket``).
*args: Redis command and arguments (e.g. "KEYS", "cluster/backup/*"). *args: Redis command and arguments, e.g. "KEYS", "cluster/backup/*".
Returns: Returns:
Raw stdout string, stripped of leading/trailing whitespace. Raw stdout string, stripped of leading/trailing whitespace.
Returns an empty string on timeout or error.
""" """
socket = config.get("redis", {}).get( socket = config.get("redis", {}).get(
"socket", "/var/lib/nethserver/cluster/state/redis.sock" "socket", "/var/lib/nethserver/cluster/state/redis.sock"
@@ -74,8 +71,8 @@ def _redis_hgetall(config: dict, key: str) -> dict:
key: Full Redis key of the hash to read. key: Full Redis key of the hash to read.
Returns: Returns:
Dict mapping field names to values, or an empty dict if the key Dict mapping field names to string values, or an empty dict if
does not exist or the hash is empty. the key does not exist or the hash is empty.
""" """
socket = config.get("redis", {}).get( socket = config.get("redis", {}).get(
"socket", "/var/lib/nethserver/cluster/state/redis.sock" "socket", "/var/lib/nethserver/cluster/state/redis.sock"
@@ -83,20 +80,21 @@ def _redis_hgetall(config: dict, key: str) -> dict:
cmd = ["redis-cli", "-s", socket, "HGETALL", key] cmd = ["redis-cli", "-s", socket, "HGETALL", key]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
lines = [l for l in result.stdout.strip().splitlines() if l] lines = [l for l in result.stdout.strip().splitlines() if l]
# redis-cli HGETALL returns alternating key / value lines: # HGETALL returns alternating key / value lines:
# line 0 field name, line 1 value, line 2 field name, # line 0 -> field name, line 1 -> value, line 2 -> field name, ...
return dict(zip(lines[::2], lines[1::2])) return dict(zip(lines[::2], lines[1::2]))
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Recent backup discovery # Recent backup discovery (fallback when alert carries no backup_id / id label)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _get_recent_backup_ids(config: dict, window: int) -> list: def _get_recent_backup_ids(config: dict, window: int) -> list:
"""Scan Redis for plan status keys updated within the last ``window`` seconds. """Scan Redis for plan status keys updated within the last ``window`` seconds.
Used as a fallback when Alertmanager does not include a ``backup_id`` This is used as a fallback when Alertmanager does not include a backup plan
label on the alert (older NS8 versions or custom alert rules). identifier in the alert labels (older NS8 versions or custom alert rules
that do not set the ``id`` or ``backup_id`` label).
Args: Args:
config: Parsed configuration dictionary. config: Parsed configuration dictionary.
@@ -116,8 +114,7 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
if not ts_raw: if not ts_raw:
continue continue
try: try:
# Parse ISO 8601 timestamp; replace trailing 'Z' with '+00:00' # Replace trailing 'Z' with '+00:00' for Python < 3.11 compat.
# for compatibility with Python < 3.11 fromisoformat().
ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp() ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp()
if (now - ts) <= window: if (now - ts) <= window:
# Key format: cluster/backup/<backup_id>/status # Key format: cluster/backup/<backup_id>/status
@@ -135,7 +132,7 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _get_module_statuses(config: dict, backup_id: str) -> list: def _get_module_statuses(config: dict, backup_id: str) -> list:
"""Return all per-module status entries for a given backup_id. """Return all per-module status entries for a given backup plan id.
Scans Redis for keys matching ``module/*/backup/<backup_id>/status`` Scans Redis for keys matching ``module/*/backup/<backup_id>/status``
and reads each hash with HGETALL. and reads each hash with HGETALL.
@@ -145,7 +142,8 @@ def _get_module_statuses(config: dict, backup_id: str) -> list:
backup_id: The backup plan identifier (e.g. "1", "2"). backup_id: The backup plan identifier (e.g. "1", "2").
Returns: Returns:
List of dicts, one per module: List of dicts, one per module::
{ {
"module_id" : str, "module_id" : str,
"backup_id" : str, "backup_id" : str,
@@ -164,7 +162,7 @@ def _get_module_statuses(config: dict, backup_id: str) -> list:
module_id = key.split("/")[1] module_id = key.split("/")[1]
fields = _redis_hgetall(config, key) fields = _redis_hgetall(config, key)
if not fields: if not fields:
log.debug(f"Empty or missing status hash for {key}") log.debug("Empty or missing status hash for %s", key)
continue continue
statuses.append({ statuses.append({
"module_id": module_id, "module_id": module_id,
@@ -178,7 +176,7 @@ def _get_module_statuses(config: dict, backup_id: str) -> list:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Main correlator entry point # Main entry point
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> dict: def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> dict:
@@ -186,38 +184,30 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
Args: Args:
config: Parsed configuration dictionary. config: Parsed configuration dictionary.
backup_ids: List of backup plan IDs from Alertmanager alert labels. backup_ids: List of backup plan IDs extracted from Alertmanager labels
When empty or None, the function falls back to scanning (see receiver.py label mapping). When empty or None, falls
Redis for recently updated plan status keys. back to scanning Redis for recently updated plan status keys.
Returns: Returns:
A dict with the following keys: A dict with the following keys:
outcome : "SUCCESS" | "PARTIAL" | "REPO_FAILURE" outcome : "SUCCESS" | "PARTIAL" | "REPO_FAILURE"
backup_ids : list of plan IDs that were evaluated backup_ids : list of plan IDs that were evaluated
modules : list of per-module status dicts (see _get_module_statuses) modules : list of per-module status dicts
failed_modules : subset of ``modules`` where result != "success" failed_modules : subset of ``modules`` where result != "success"
total : total number of module status entries found total : total number of module status entries found
failed : number of failed modules failed : number of failed modules
succeeded : number of succeeded modules succeeded : number of succeeded modules
note : optional human-readable explanation string note : optional human-readable explanation string
Outcome classification rules
----------------------------
failed == 0 and total > 0 → SUCCESS
failed == total or total == 0 → REPO_FAILURE (all failed or nothing found)
otherwise → PARTIAL
""" """
window = config.get("correlator", {}).get("recent_window", 3600) window = config.get("correlator", {}).get("recent_window", 3600)
# --------------------------------------------------------------------------- # Resolve backup_ids: use alert labels when available, scan Redis otherwise.
# Resolve backup_ids
# ---------------------------------------------------------------------------
if not backup_ids: if not backup_ids:
log.info("No backup_ids from alert labels, scanning Redis for recent backups...") log.info("No backup_ids from alert labels, scanning Redis for recent backups...")
backup_ids = _get_recent_backup_ids(config, window) backup_ids = _get_recent_backup_ids(config, window)
# If still empty, no relevant Redis state exists treat as full failure. # If still empty, no relevant Redis state exists - treat as full failure.
if not backup_ids: if not backup_ids:
log.warning("No recent backup status keys found in Redis") log.warning("No recent backup status keys found in Redis")
return { return {
@@ -228,21 +218,17 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
"total": 0, "total": 0,
"failed": 0, "failed": 0,
"succeeded": 0, "succeeded": 0,
"note": "No backup status found in Redis possible repo or scheduling failure", "note": "No backup status found in Redis - possible repo or scheduling failure",
} }
# --------------------------------------------------------------------------- # Collect per-module statuses across all resolved plan IDs.
# Collect per-module statuses across all plans
# ---------------------------------------------------------------------------
all_modules = [] all_modules = []
for bid in backup_ids: for bid in backup_ids:
modules = _get_module_statuses(config, bid) modules = _get_module_statuses(config, bid)
log.info(f"backup_id={bid}: found {len(modules)} module status entries") log.info("backup_id=%s: found %d module status entries", bid, len(modules))
all_modules.extend(modules) all_modules.extend(modules)
# --------------------------------------------------------------------------- # Classify outcome.
# Outcome classification
# ---------------------------------------------------------------------------
total = len(all_modules) total = len(all_modules)
failed_modules = [m for m in all_modules if m["result"] != "success"] failed_modules = [m for m in all_modules if m["result"] != "success"]
succeeded = total - len(failed_modules) succeeded = total - len(failed_modules)
@@ -256,8 +242,8 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
outcome = "PARTIAL" outcome = "PARTIAL"
log.info( log.info(
f"Correlation result: outcome={outcome}, total={total}, " "Outcome: %s (total=%d succeeded=%d failed=%d)",
f"succeeded={succeeded}, failed={len(failed_modules)}" outcome, total, succeeded, len(failed_modules),
) )
return { return {
@@ -268,4 +254,5 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
"total": total, "total": total,
"failed": len(failed_modules), "failed": len(failed_modules),
"succeeded": succeeded, "succeeded": succeeded,
"note": "",
} }