docs: add full docstrings, Redis key/field reference, outcome classification rules

This commit is contained in:
2026-05-18 21:55:24 +00:00
parent 8c6d085d46
commit f20a214cd8
+52 -65
View File
@@ -1,33 +1,32 @@
#!/usr/bin/env python3
"""Read NS8 cluster Redis state to determine the backup outcome.
"""Read NS8 cluster Redis state to determine the overall backup outcome.
For each backup plan/schedule, this module reads the per-module backup
status hashes from the cluster Redis and produces a classified outcome:
status hashes from the cluster Redis and produces a classified outcome::
SUCCESS - All modules finished without errors.
PARTIAL - Some modules failed, others succeeded.
REPO_FAILURE - All modules failed, total is zero, or no status was
SUCCESS - All modules finished without errors.
PARTIAL - Some modules failed, others succeeded.
REPO_FAILURE - All modules failed, total is zero, or no status was
found in Redis at all (possible repository-level issue).
NS8 Redis key patterns
-----------------------
cluster/backup/<backup_id>/status
Overall plan status hash. Fields: result, timestamp, errors.
cluster/backup/<backup_id>/status
Overall plan status hash.
Fields: result, timestamp, errors (integer count of failed modules).
module/<module_id>/backup/<backup_id>/status
Per-module status hash. Fields: result, timestamp, error.
module/<module_id>/backup/<backup_id>/status
Per-module status hash written by each backup module after it runs.
Fields: result ("success"|"error"), timestamp (ISO 8601), error (message).
Redis hash fields
-----------------
result : "success" | "error"
timestamp : ISO 8601 string
error : human-readable error message (empty on success)
errors : integer count of module errors (plan-level hash only)
Redis is accessed via ``redis-cli`` over the cluster Unix socket. No Python
Redis client library is required, keeping the dependency list to zero.
Dependencies
------------
Only the standard library and ``redis-cli`` (installed with NS8) are required.
No Python Redis client library is needed.
Outcome classification rules
-----------------------------
failed == 0 and total > 0 -> SUCCESS
failed == total or total == 0 -> REPO_FAILURE
otherwise -> PARTIAL
"""
import logging
@@ -41,19 +40,17 @@ log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Redis helpers
# ---------------------------------------------------------------------------
# These thin wrappers call redis-cli via subprocess instead of using a Python
# Redis client, keeping the dependency list to zero and staying consistent with
# how other NS8 scripts interact with the cluster Redis.
def _redis_cmd(config: dict, *args) -> str:
"""Run a redis-cli command against the NS8 cluster Redis Unix socket.
Args:
config: Parsed configuration dictionary (reads ``redis.socket``).
*args: Redis command and arguments (e.g. "KEYS", "cluster/backup/*").
*args: Redis command and arguments, e.g. "KEYS", "cluster/backup/*".
Returns:
Raw stdout string, stripped of leading/trailing whitespace.
Returns an empty string on timeout or error.
"""
socket = config.get("redis", {}).get(
"socket", "/var/lib/nethserver/cluster/state/redis.sock"
@@ -74,8 +71,8 @@ def _redis_hgetall(config: dict, key: str) -> dict:
key: Full Redis key of the hash to read.
Returns:
Dict mapping field names to values, or an empty dict if the key
does not exist or the hash is empty.
Dict mapping field names to string values, or an empty dict if
the key does not exist or the hash is empty.
"""
socket = config.get("redis", {}).get(
"socket", "/var/lib/nethserver/cluster/state/redis.sock"
@@ -83,20 +80,21 @@ def _redis_hgetall(config: dict, key: str) -> dict:
cmd = ["redis-cli", "-s", socket, "HGETALL", key]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
lines = [l for l in result.stdout.strip().splitlines() if l]
# redis-cli HGETALL returns alternating key / value lines:
# line 0 -> field name, line 1 -> value, line 2 -> field name, ...
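# HGETALL returns alternating key / value lines:
# line 0 -> field name, line 1 -> value, line 2 -> field name, ...
# NOTE(review): blank lines are filtered out before pairing, so a field whose
# value is an empty string (e.g. ``error`` on success) would presumably shift
# every following field/value pair - confirm against real redis-cli output.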
return dict(zip(lines[::2], lines[1::2]))
# ---------------------------------------------------------------------------
# Recent backup discovery
# Recent backup discovery (fallback when alert carries no backup_id / id label)
# ---------------------------------------------------------------------------
def _get_recent_backup_ids(config: dict, window: int) -> list:
"""Scan Redis for plan status keys updated within the last ``window`` seconds.
Used as a fallback when Alertmanager does not include a ``backup_id``
label on the alert (older NS8 versions or custom alert rules).
This is used as a fallback when Alertmanager does not include a backup plan
identifier in the alert labels (older NS8 versions or custom alert rules
that do not set the ``id`` or ``backup_id`` label).
Args:
config: Parsed configuration dictionary.
@@ -116,8 +114,7 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
if not ts_raw:
continue
try:
# Parse ISO 8601 timestamp; replace trailing 'Z' with '+00:00'
# for compatibility with Python < 3.11 fromisoformat().
# Replace trailing 'Z' with '+00:00' for Python < 3.11 compat.
ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp()
if (now - ts) <= window:
# Key format: cluster/backup/<backup_id>/status
@@ -135,7 +132,7 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
# ---------------------------------------------------------------------------
def _get_module_statuses(config: dict, backup_id: str) -> list:
"""Return all per-module status entries for a given backup_id.
"""Return all per-module status entries for a given backup plan id.
Scans Redis for keys matching ``module/*/backup/<backup_id>/status``
and reads each hash with HGETALL.
@@ -145,7 +142,8 @@ def _get_module_statuses(config: dict, backup_id: str) -> list:
backup_id: The backup plan identifier (e.g. "1", "2").
Returns:
List of dicts, one per module:
List of dicts, one per module::
{
"module_id" : str,
"backup_id" : str,
@@ -164,21 +162,21 @@ def _get_module_statuses(config: dict, backup_id: str) -> list:
module_id = key.split("/")[1]
fields = _redis_hgetall(config, key)
if not fields:
log.debug(f"Empty or missing status hash for {key}")
log.debug("Empty or missing status hash for %s", key)
continue
statuses.append({
"module_id": module_id,
"backup_id": backup_id,
"result": fields.get("result", "unknown"),
"error": fields.get("error", ""),
"timestamp": fields.get("timestamp", ""),
"module_id": module_id,
"backup_id": backup_id,
"result": fields.get("result", "unknown"),
"error": fields.get("error", ""),
"timestamp": fields.get("timestamp", ""),
})
return statuses
# ---------------------------------------------------------------------------
# Main correlator entry point
# Main entry point
# ---------------------------------------------------------------------------
def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> dict:
@@ -186,38 +184,30 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
Args:
config: Parsed configuration dictionary.
backup_ids: List of backup plan IDs from Alertmanager alert labels.
When empty or None, the function falls back to scanning
Redis for recently updated plan status keys.
backup_ids: List of backup plan IDs extracted from Alertmanager labels
(see receiver.py label mapping). When empty or None, falls
back to scanning Redis for recently updated plan status keys.
Returns:
A dict with the following keys:
outcome : "SUCCESS" | "PARTIAL" | "REPO_FAILURE"
backup_ids : list of plan IDs that were evaluated
modules : list of per-module status dicts (see _get_module_statuses)
modules : list of per-module status dicts
failed_modules : subset of ``modules`` where result != "success"
total : total number of module status entries found
failed : number of failed modules
succeeded : number of succeeded modules
note : optional human-readable explanation string
Outcome classification rules
----------------------------
failed == 0 and total > 0 → SUCCESS
failed == total or total == 0 → REPO_FAILURE (all failed or nothing found)
otherwise → PARTIAL
"""
window = config.get("correlator", {}).get("recent_window", 3600)
# ---------------------------------------------------------------------------
# Resolve backup_ids
# ---------------------------------------------------------------------------
# Resolve backup_ids: use alert labels when available, scan Redis otherwise.
if not backup_ids:
log.info("No backup_ids from alert labels, scanning Redis for recent backups...")
backup_ids = _get_recent_backup_ids(config, window)
# If still empty, no relevant Redis state exists - treat as full failure.
# If still empty, no relevant Redis state exists - treat as full failure.
if not backup_ids:
log.warning("No recent backup status keys found in Redis")
return {
@@ -228,21 +218,17 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
"total": 0,
"failed": 0,
"succeeded": 0,
"note": "No backup status found in Redis possible repo or scheduling failure",
"note": "No backup status found in Redis - possible repo or scheduling failure",
}
# ---------------------------------------------------------------------------
# Collect per-module statuses across all plans
# ---------------------------------------------------------------------------
# Collect per-module statuses across all resolved plan IDs.
all_modules = []
for bid in backup_ids:
modules = _get_module_statuses(config, bid)
log.info(f"backup_id={bid}: found {len(modules)} module status entries")
log.info("backup_id=%s: found %d module status entries", bid, len(modules))
all_modules.extend(modules)
# ---------------------------------------------------------------------------
# Outcome classification
# ---------------------------------------------------------------------------
# Classify outcome.
total = len(all_modules)
failed_modules = [m for m in all_modules if m["result"] != "success"]
succeeded = total - len(failed_modules)
@@ -256,8 +242,8 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
outcome = "PARTIAL"
log.info(
f"Correlation result: outcome={outcome}, total={total}, "
f"succeeded={succeeded}, failed={len(failed_modules)}"
"Outcome: %s (total=%d succeeded=%d failed=%d)",
outcome, total, succeeded, len(failed_modules),
)
return {
@@ -268,4 +254,5 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
"total": total,
"failed": len(failed_modules),
"succeeded": succeeded,
"note": "",
}