docs: add full docstrings, Redis key/field reference, outcome classification rules
This commit is contained in:
@@ -1,33 +1,32 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""Read NS8 cluster Redis state to determine the backup outcome.
|
"""Read NS8 cluster Redis state to determine the overall backup outcome.
|
||||||
|
|
||||||
For each backup plan/schedule, this module reads the per-module backup
|
For each backup plan/schedule, this module reads the per-module backup
|
||||||
status hashes from the cluster Redis and produces a classified outcome:
|
status hashes from the cluster Redis and produces a classified outcome::
|
||||||
|
|
||||||
SUCCESS – All modules finished without errors.
|
SUCCESS - All modules finished without errors.
|
||||||
PARTIAL – Some modules failed, others succeeded.
|
PARTIAL - Some modules failed, others succeeded.
|
||||||
REPO_FAILURE – All modules failed, total is zero, or no status was
|
REPO_FAILURE - All modules failed, total is zero, or no status was
|
||||||
found in Redis at all (possible repository-level issue).
|
found in Redis at all (possible repository-level issue).
|
||||||
|
|
||||||
NS8 Redis key patterns
|
NS8 Redis key patterns
|
||||||
-----------------------
|
-----------------------
|
||||||
cluster/backup/<backup_id>/status
|
cluster/backup/<backup_id>/status
|
||||||
Overall plan status hash. Fields: result, timestamp, errors.
|
Overall plan status hash.
|
||||||
|
Fields: result, timestamp, errors (integer count of failed modules).
|
||||||
|
|
||||||
module/<module_id>/backup/<backup_id>/status
|
module/<module_id>/backup/<backup_id>/status
|
||||||
Per-module status hash. Fields: result, timestamp, error.
|
Per-module status hash written by each backup module after it runs.
|
||||||
|
Fields: result ("success"|"error"), timestamp (ISO 8601), error (message).
|
||||||
|
|
||||||
Redis hash fields
|
Redis is accessed via ``redis-cli`` over the cluster Unix socket. No Python
|
||||||
-----------------
|
Redis client library is required, so the module has no third-party Python dependencies.
|
||||||
result : "success" | "error"
|
|
||||||
timestamp : ISO 8601 string
|
|
||||||
error : human-readable error message (empty on success)
|
|
||||||
errors : integer count of module errors (plan-level hash only)
|
|
||||||
|
|
||||||
Dependencies
|
Outcome classification rules
|
||||||
------------
|
-----------------------------
|
||||||
Only the standard library and ``redis-cli`` (installed with NS8) are required.
|
failed == 0 and total > 0 -> SUCCESS
|
||||||
No Python Redis client library is needed.
|
failed == total or total == 0 -> REPO_FAILURE
|
||||||
|
otherwise -> PARTIAL
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
@@ -41,19 +40,17 @@ log = logging.getLogger(__name__)
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Redis helpers
|
# Redis helpers
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# These thin wrappers call redis-cli via subprocess instead of using a Python
|
|
||||||
# Redis client, keeping the dependency list to zero and staying consistent with
|
|
||||||
# how other NS8 scripts interact with the cluster Redis.
|
|
||||||
|
|
||||||
def _redis_cmd(config: dict, *args) -> str:
|
def _redis_cmd(config: dict, *args) -> str:
|
||||||
"""Run a redis-cli command against the NS8 cluster Redis Unix socket.
|
"""Run a redis-cli command against the NS8 cluster Redis Unix socket.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
config: Parsed configuration dictionary (reads ``redis.socket``).
|
config: Parsed configuration dictionary (reads ``redis.socket``).
|
||||||
*args: Redis command and arguments (e.g. "KEYS", "cluster/backup/*").
|
*args: Redis command and arguments, e.g. "KEYS", "cluster/backup/*".
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Raw stdout string, stripped of leading/trailing whitespace.
|
Raw stdout string, stripped of leading/trailing whitespace.
|
||||||
|
Returns an empty string on timeout or error.
|
||||||
"""
|
"""
|
||||||
socket = config.get("redis", {}).get(
|
socket = config.get("redis", {}).get(
|
||||||
"socket", "/var/lib/nethserver/cluster/state/redis.sock"
|
"socket", "/var/lib/nethserver/cluster/state/redis.sock"
|
||||||
@@ -74,8 +71,8 @@ def _redis_hgetall(config: dict, key: str) -> dict:
|
|||||||
key: Full Redis key of the hash to read.
|
key: Full Redis key of the hash to read.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict mapping field names to values, or an empty dict if the key
|
Dict mapping field names to string values, or an empty dict if
|
||||||
does not exist or the hash is empty.
|
the key does not exist or the hash is empty.
|
||||||
"""
|
"""
|
||||||
socket = config.get("redis", {}).get(
|
socket = config.get("redis", {}).get(
|
||||||
"socket", "/var/lib/nethserver/cluster/state/redis.sock"
|
"socket", "/var/lib/nethserver/cluster/state/redis.sock"
|
||||||
@@ -83,20 +80,21 @@ def _redis_hgetall(config: dict, key: str) -> dict:
|
|||||||
cmd = ["redis-cli", "-s", socket, "HGETALL", key]
|
cmd = ["redis-cli", "-s", socket, "HGETALL", key]
|
||||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
|
result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
|
||||||
lines = [l for l in result.stdout.strip().splitlines() if l]
|
lines = [l for l in result.stdout.strip().splitlines() if l]
|
||||||
# redis-cli HGETALL returns alternating key / value lines:
|
# HGETALL returns alternating key / value lines:
|
||||||
# line 0 → field name, line 1 → value, line 2 → field name, …
|
# line 0 -> field name, line 1 -> value, line 2 -> field name, ...
|
||||||
return dict(zip(lines[::2], lines[1::2]))
|
return dict(zip(lines[::2], lines[1::2]))
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Recent backup discovery
|
# Recent backup discovery (fallback when alert carries no backup_id / id label)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def _get_recent_backup_ids(config: dict, window: int) -> list:
|
def _get_recent_backup_ids(config: dict, window: int) -> list:
|
||||||
"""Scan Redis for plan status keys updated within the last ``window`` seconds.
|
"""Scan Redis for plan status keys updated within the last ``window`` seconds.
|
||||||
|
|
||||||
Used as a fallback when Alertmanager does not include a ``backup_id``
|
This is used as a fallback when Alertmanager does not include a backup plan
|
||||||
label on the alert (older NS8 versions or custom alert rules).
|
identifier in the alert labels (older NS8 versions or custom alert rules
|
||||||
|
that do not set the ``id`` or ``backup_id`` label).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
config: Parsed configuration dictionary.
|
config: Parsed configuration dictionary.
|
||||||
@@ -116,8 +114,7 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
|
|||||||
if not ts_raw:
|
if not ts_raw:
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
# Parse ISO 8601 timestamp; replace trailing 'Z' with '+00:00'
|
# Replace 'Z' with '+00:00': datetime.fromisoformat() rejects a 'Z' suffix before Python 3.11.
|
||||||
# for compatibility with Python < 3.11 fromisoformat().
|
|
||||||
ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp()
|
ts = datetime.fromisoformat(ts_raw.replace("Z", "+00:00")).timestamp()
|
||||||
if (now - ts) <= window:
|
if (now - ts) <= window:
|
||||||
# Key format: cluster/backup/<backup_id>/status
|
# Key format: cluster/backup/<backup_id>/status
|
||||||
@@ -135,7 +132,7 @@ def _get_recent_backup_ids(config: dict, window: int) -> list:
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def _get_module_statuses(config: dict, backup_id: str) -> list:
|
def _get_module_statuses(config: dict, backup_id: str) -> list:
|
||||||
"""Return all per-module status entries for a given backup_id.
|
"""Return all per-module status entries for a given backup plan id.
|
||||||
|
|
||||||
Scans Redis for keys matching ``module/*/backup/<backup_id>/status``
|
Scans Redis for keys matching ``module/*/backup/<backup_id>/status``
|
||||||
and reads each hash with HGETALL.
|
and reads each hash with HGETALL.
|
||||||
@@ -145,7 +142,8 @@ def _get_module_statuses(config: dict, backup_id: str) -> list:
|
|||||||
backup_id: The backup plan identifier (e.g. "1", "2").
|
backup_id: The backup plan identifier (e.g. "1", "2").
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of dicts, one per module:
|
List of dicts, one per module::
|
||||||
|
|
||||||
{
|
{
|
||||||
"module_id" : str,
|
"module_id" : str,
|
||||||
"backup_id" : str,
|
"backup_id" : str,
|
||||||
@@ -164,21 +162,21 @@ def _get_module_statuses(config: dict, backup_id: str) -> list:
|
|||||||
module_id = key.split("/")[1]
|
module_id = key.split("/")[1]
|
||||||
fields = _redis_hgetall(config, key)
|
fields = _redis_hgetall(config, key)
|
||||||
if not fields:
|
if not fields:
|
||||||
log.debug(f"Empty or missing status hash for {key}")
|
log.debug("Empty or missing status hash for %s", key)
|
||||||
continue
|
continue
|
||||||
statuses.append({
|
statuses.append({
|
||||||
"module_id": module_id,
|
"module_id": module_id,
|
||||||
"backup_id": backup_id,
|
"backup_id": backup_id,
|
||||||
"result": fields.get("result", "unknown"),
|
"result": fields.get("result", "unknown"),
|
||||||
"error": fields.get("error", ""),
|
"error": fields.get("error", ""),
|
||||||
"timestamp": fields.get("timestamp", ""),
|
"timestamp": fields.get("timestamp", ""),
|
||||||
})
|
})
|
||||||
|
|
||||||
return statuses
|
return statuses
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Main correlator entry point
|
# Main entry point
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> dict:
|
def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) -> dict:
|
||||||
@@ -186,38 +184,30 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
config: Parsed configuration dictionary.
|
config: Parsed configuration dictionary.
|
||||||
backup_ids: List of backup plan IDs from Alertmanager alert labels.
|
backup_ids: List of backup plan IDs extracted from Alertmanager labels
|
||||||
When empty or None, the function falls back to scanning
|
(see receiver.py label mapping). When empty or None, falls
|
||||||
Redis for recently updated plan status keys.
|
back to scanning Redis for recently updated plan status keys.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A dict with the following keys:
|
A dict with the following keys:
|
||||||
|
|
||||||
outcome : "SUCCESS" | "PARTIAL" | "REPO_FAILURE"
|
outcome : "SUCCESS" | "PARTIAL" | "REPO_FAILURE"
|
||||||
backup_ids : list of plan IDs that were evaluated
|
backup_ids : list of plan IDs that were evaluated
|
||||||
modules : list of per-module status dicts (see _get_module_statuses)
|
modules : list of per-module status dicts
|
||||||
failed_modules : subset of ``modules`` where result != "success"
|
failed_modules : subset of ``modules`` where result != "success"
|
||||||
total : total number of module status entries found
|
total : total number of module status entries found
|
||||||
failed : number of failed modules
|
failed : number of failed modules
|
||||||
succeeded : number of succeeded modules
|
succeeded : number of succeeded modules
|
||||||
note : optional human-readable explanation string
|
note : optional human-readable explanation string
|
||||||
|
|
||||||
Outcome classification rules
|
|
||||||
----------------------------
|
|
||||||
failed == 0 and total > 0 → SUCCESS
|
|
||||||
failed == total or total == 0 → REPO_FAILURE (all failed or nothing found)
|
|
||||||
otherwise → PARTIAL
|
|
||||||
"""
|
"""
|
||||||
window = config.get("correlator", {}).get("recent_window", 3600)
|
window = config.get("correlator", {}).get("recent_window", 3600)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# Resolve backup_ids: use alert labels when available, scan Redis otherwise.
|
||||||
# Resolve backup_ids
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
if not backup_ids:
|
if not backup_ids:
|
||||||
log.info("No backup_ids from alert labels, scanning Redis for recent backups...")
|
log.info("No backup_ids from alert labels, scanning Redis for recent backups...")
|
||||||
backup_ids = _get_recent_backup_ids(config, window)
|
backup_ids = _get_recent_backup_ids(config, window)
|
||||||
|
|
||||||
# If still empty, no relevant Redis state exists — treat as full failure.
|
# If still empty, no relevant Redis state exists - treat as full failure.
|
||||||
if not backup_ids:
|
if not backup_ids:
|
||||||
log.warning("No recent backup status keys found in Redis")
|
log.warning("No recent backup status keys found in Redis")
|
||||||
return {
|
return {
|
||||||
@@ -228,21 +218,17 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
|
|||||||
"total": 0,
|
"total": 0,
|
||||||
"failed": 0,
|
"failed": 0,
|
||||||
"succeeded": 0,
|
"succeeded": 0,
|
||||||
"note": "No backup status found in Redis — possible repo or scheduling failure",
|
"note": "No backup status found in Redis - possible repo or scheduling failure",
|
||||||
}
|
}
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# Collect per-module statuses across all resolved plan IDs.
|
||||||
# Collect per-module statuses across all plans
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
all_modules = []
|
all_modules = []
|
||||||
for bid in backup_ids:
|
for bid in backup_ids:
|
||||||
modules = _get_module_statuses(config, bid)
|
modules = _get_module_statuses(config, bid)
|
||||||
log.info(f"backup_id={bid}: found {len(modules)} module status entries")
|
log.info("backup_id=%s: found %d module status entries", bid, len(modules))
|
||||||
all_modules.extend(modules)
|
all_modules.extend(modules)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# Classify outcome.
|
||||||
# Outcome classification
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
total = len(all_modules)
|
total = len(all_modules)
|
||||||
failed_modules = [m for m in all_modules if m["result"] != "success"]
|
failed_modules = [m for m in all_modules if m["result"] != "success"]
|
||||||
succeeded = total - len(failed_modules)
|
succeeded = total - len(failed_modules)
|
||||||
@@ -256,8 +242,8 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
|
|||||||
outcome = "PARTIAL"
|
outcome = "PARTIAL"
|
||||||
|
|
||||||
log.info(
|
log.info(
|
||||||
f"Correlation result: outcome={outcome}, total={total}, "
|
"Outcome: %s (total=%d succeeded=%d failed=%d)",
|
||||||
f"succeeded={succeeded}, failed={len(failed_modules)}"
|
outcome, total, succeeded, len(failed_modules),
|
||||||
)
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -268,4 +254,5 @@ def correlate_backup_status(config: dict, backup_ids: Optional[list] = None) ->
|
|||||||
"total": total,
|
"total": total,
|
||||||
"failed": len(failed_modules),
|
"failed": len(failed_modules),
|
||||||
"succeeded": succeeded,
|
"succeeded": succeeded,
|
||||||
|
"note": "",
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user