docs: add per-status docs, backend mapping, runagent rationale; improve log messages

This commit is contained in:
2026-05-18 21:55:26 +00:00
parent f20a214cd8
commit b71e209076
+92 -88
View File
@@ -2,30 +2,40 @@
"""Verify reachability and health of NS8 backup repositories. """Verify reachability and health of NS8 backup repositories.
For each backup destination configured in the NS8 cluster, this module For each backup destination configured in the NS8 cluster, this module
attempts a ``restic snapshots --last --no-cache`` command to confirm that invokes ``restic snapshots --last --no-cache`` to confirm that the
the repository is accessible and readable. repository is accessible and readable.
Status values returned per destination Status values returned per destination
--------------------------------------- ---------------------------------------
OK Repository is reachable and returned a valid response. OK - Repository is reachable and returned a valid response.
UNREACHABLE Network/mount error — cannot connect at all. UNREACHABLE - Network or mount error; cannot connect at all.
LOCKED restic repo is locked (a previous backup crashed mid-run). LOCKED - restic repository is locked (a previous backup crashed mid-run).
CORRUPTED Repository exists but its pack integrity check fails. CORRUPTED - Repository exists but its pack integrity check fails.
ERROR restic reported an error not covered by the above categories. ERROR - restic reported an error not covered by the above categories.
UNCONFIGURED No URL or path found in the Redis hash for this destination. UNCONFIGURED - No URL or path found in Redis for this destination.
UNKNOWN Non-zero exit with unrecognised stderr output. UNKNOWN - Non-zero exit with unrecognised stderr output.
NS8 multi-backend credential mapping NS8 multi-backend credential mapping
-------------------------------------- --------------------------------------
local / fs path only, no credentials. local / fs - path only, no credentials.
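S3 / AWS aws_access_key_id + aws_secret_access_key from Redis hash. S3 / AWS - AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY env vars, set from the aws_access_key_id / aws_secret_access_key Redis hash fields.
B2 b2_account_id + b2_account_key from Redis hash. B2 - B2_ACCOUNT_ID + B2_ACCOUNT_KEY env vars, set from the b2_account_id / b2_account_key Redis hash fields.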
SFTP URL with sftp: prefix; relies on SSH keys already in place. SFTP - sftp: URL prefix; relies on SSH keys already deployed.
rclone rclone: prefix; RCLONE_CONFIG env var set from Redis hash. rclone - rclone: URL prefix; RCLONE_CONFIG env var from Redis hash.
Why ``runagent`` is NOT used here
-----------------------------------
restic is invoked directly rather than through ``runagent`` because repo_check
runs on the cluster leader and reads repository credentials from the cluster
Redis. The restic binary is available system-wide on NS8 nodes (not inside a
module container), so a direct subprocess call is both simpler and correct.
``runagent`` is used by other NS8 scripts to run commands inside rootless
Podman module containers - that indirection is not needed here.
Dependencies Dependencies
------------ ------------
Only the standard library. ``restic`` must be in PATH (installed on NS8 nodes). Only the standard library. ``restic`` must be present in PATH (installed with
NS8 or manually on the leader node).
""" """
import logging import logging
@@ -37,11 +47,11 @@ log = logging.getLogger(__name__)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Redis helpers (local copies) # Redis helpers
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Duplicated from correlator.py to keep repo_check.py self-contained and # These are local copies of the helpers defined in correlator.py, kept here
# avoid a circular import. If the Redis access layer is ever extracted into # to make repo_check.py self-contained and avoid a circular import. If the
# a shared helper, these can be removed. # Redis access layer is extracted into utils.py in the future, remove these.
def _redis_cmd(config: dict, *args) -> str: def _redis_cmd(config: dict, *args) -> str:
"""Run a redis-cli command against the NS8 cluster Redis Unix socket.""" """Run a redis-cli command against the NS8 cluster Redis Unix socket."""
@@ -65,7 +75,6 @@ def _redis_hgetall(config: dict, key: str) -> dict:
cmd = ["redis-cli", "-s", socket, "HGETALL", key] cmd = ["redis-cli", "-s", socket, "HGETALL", key]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
lines = [l for l in result.stdout.strip().splitlines() if l] lines = [l for l in result.stdout.strip().splitlines() if l]
# redis-cli HGETALL returns alternating key/value lines.
return dict(zip(lines[::2], lines[1::2])) return dict(zip(lines[::2], lines[1::2]))
@@ -85,17 +94,18 @@ def _get_backup_destinations(config: dict) -> list:
config: Parsed configuration dictionary. config: Parsed configuration dictionary.
Returns: Returns:
List of dicts, one per configured destination: List of dicts, one per configured destination::
{ {
"repo_id" : str, "repo_id" : str,
"url" : str (cloud URL or empty), "url" : str (cloud URL or empty for local),
"path" : str (local/SFTP path or empty), "path" : str (local/SFTP path or empty for cloud),
"password" : str (restic repo password), "password" : str (restic repository password),
"backend" : str ("s3", "b2", "sftp", "rclone", "local", ...), "backend" : str ("s3", "b2", "sftp", "rclone", "local", ...),
"aws_access_key": str (S3 key ID or B2 account ID), "aws_access_key": str (S3 key ID or B2 account ID, normalised),
"aws_secret_key": str (S3 secret or B2 account key), "aws_secret_key": str (S3 secret or B2 account key, normalised),
"rclone_config" : str (path or inline rclone config), "rclone_config" : str (path to rclone config or empty),
"extra_env" : str (optional additional environment variables), "extra_env" : str (optional extra environment variables),
} }
""" """
raw = _redis_cmd(config, "KEYS", "cluster/backup_repository/*/parameters") raw = _redis_cmd(config, "KEYS", "cluster/backup_repository/*/parameters")
@@ -113,13 +123,13 @@ def _get_backup_destinations(config: dict) -> list:
"path": fields.get("path", ""), "path": fields.get("path", ""),
"password": fields.get("password", ""), "password": fields.get("password", ""),
"backend": fields.get("backend", ""), "backend": fields.get("backend", ""),
# S3 and B2 use different field names in NS8 Redis; # S3 and B2 use different field names in NS8 Redis; normalise both
# normalise both to a single aws_access_key / aws_secret_key pair. # to a single aws_access_key / aws_secret_key pair so _build_env()
# can handle them uniformly.
"aws_access_key": fields.get("aws_access_key_id", "aws_access_key": fields.get("aws_access_key_id",
fields.get("b2_account_id", "")), fields.get("b2_account_id", "")),
"aws_secret_key": fields.get("aws_secret_access_key", "aws_secret_key": fields.get("aws_secret_access_key",
fields.get("b2_account_key", "")), fields.get("b2_account_key", "")),
# rclone and miscellaneous extras
"rclone_config": fields.get("rclone_config", ""), "rclone_config": fields.get("rclone_config", ""),
"extra_env": fields.get("extra_env", ""), "extra_env": fields.get("extra_env", ""),
}) })
@@ -132,30 +142,30 @@ def _get_backup_destinations(config: dict) -> list:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _build_env(dest: dict) -> dict: def _build_env(dest: dict) -> dict:
"""Build the environment dict that restic needs based on the backend type. """Build the environment dict that restic needs for a given backend.
Always starts from a copy of ``os.environ`` so that system-level settings Always starts from a copy of ``os.environ`` so system-level settings
(PATH, HOME, proxy variables, etc.) are preserved. (PATH, HOME, proxy variables, etc.) are inherited.
Args: Args:
dest: A destination dict as returned by ``_get_backup_destinations()``. dest: A destination dict as returned by ``_get_backup_destinations()``.
Returns: Returns:
A dict suitable for passing as the ``env`` argument to subprocess.run(). A dict suitable for the ``env`` argument of ``subprocess.run()``.
""" """
env = dict(os.environ) env = dict(os.environ)
backend = dest.get("backend", "").lower() backend = dest.get("backend", "").lower()
# RESTIC_PASSWORD is used by all backends to unlock the repository. # RESTIC_PASSWORD unlocks the repository for all backends.
if dest.get("password"): if dest.get("password"):
env["RESTIC_PASSWORD"] = dest["password"] env["RESTIC_PASSWORD"] = dest["password"]
# S3 / AWS backend credentials. # S3 / AWS backend.
if backend in ("s3", "aws") and dest.get("aws_access_key"): if backend in ("s3", "aws") and dest.get("aws_access_key"):
env["AWS_ACCESS_KEY_ID"] = dest["aws_access_key"] env["AWS_ACCESS_KEY_ID"] = dest["aws_access_key"]
env["AWS_SECRET_ACCESS_KEY"] = dest["aws_secret_key"] env["AWS_SECRET_ACCESS_KEY"] = dest["aws_secret_key"]
# Backblaze B2 backend credentials. # Backblaze B2 backend.
elif backend in ("b2", "backblaze") and dest.get("aws_access_key"): elif backend in ("b2", "backblaze") and dest.get("aws_access_key"):
env["B2_ACCOUNT_ID"] = dest["aws_access_key"] env["B2_ACCOUNT_ID"] = dest["aws_access_key"]
env["B2_ACCOUNT_KEY"] = dest["aws_secret_key"] env["B2_ACCOUNT_KEY"] = dest["aws_secret_key"]
@@ -168,26 +178,26 @@ def _build_env(dest: dict) -> dict:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Single-repository check # Single-repository health check
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def _check_restic_repo(dest: dict, config: dict) -> dict: def _check_restic_repo(dest: dict, config: dict) -> dict:
"""Run ``restic snapshots --last --no-cache`` to verify one repository. """Run ``restic snapshots --last --no-cache`` to verify one repository.
``--no-cache`` is intentional: the cache may be stale or missing on the ``--no-cache`` is intentional: the local cache may be stale or missing
host, and we want a live check against the actual backend. on the cluster leader, and we always want a live check against the backend.
Args: Args:
dest: Destination dict from ``_get_backup_destinations()``. dest: Destination dict from ``_get_backup_destinations()``.
config: Parsed configuration dictionary (reads ``repo_check.*``). config: Parsed configuration dictionary (reads ``repo_check.*``).
Returns: Returns:
Dict: {"repo_id": str, "status": str, "error": str} Dict with keys: ``repo_id``, ``status``, ``error``.
""" """
timeout = config.get("repo_check", {}).get("timeout", 60) timeout = config.get("repo_check", {}).get("timeout", 60)
extra_flags = config.get("repo_check", {}).get("restic_flags", "") extra_flags = config.get("repo_check", {}).get("restic_flags", "")
# Prefer ``url`` (cloud backends) over ``path`` (local/SFTP). # Prefer ``url`` (cloud backends) over ``path`` (local / SFTP).
repo_url = dest.get("url") or dest.get("path") or "" repo_url = dest.get("url") or dest.get("path") or ""
if not repo_url: if not repo_url:
return { return {
@@ -198,8 +208,8 @@ def _check_restic_repo(dest: dict, config: dict) -> dict:
cmd = ["restic", "-r", repo_url, "snapshots", "--last", "--no-cache"] cmd = ["restic", "-r", repo_url, "snapshots", "--last", "--no-cache"]
if extra_flags: if extra_flags:
# Allow the operator to append flags like --cacert or --option # Allow operators to append flags (e.g. --cacert, --option) via the
# via the config without modifying the code. # config file without modifying the source code.
cmd += extra_flags.split() cmd += extra_flags.split()
env = _build_env(dest) env = _build_env(dest)
@@ -210,53 +220,47 @@ def _check_restic_repo(dest: dict, config: dict) -> dict:
capture_output=True, capture_output=True,
text=True, text=True,
timeout=timeout, timeout=timeout,
env=env env=env,
) )
stderr = result.stderr.lower() stderr = result.stderr.lower()
# ---------------------------------------------------------------------------
# Classify the restic exit code and stderr content
# ---------------------------------------------------------------------------
if result.returncode == 0: if result.returncode == 0:
return {"repo_id": dest["repo_id"], "status": "OK", "error": ""} return {"repo_id": dest["repo_id"], "status": "OK", "error": ""}
# Network / connectivity errors. # Network / connectivity errors.
elif any(x in stderr for x in ( elif any(x in stderr for x in (
"unable to open config", "no such file", "does not exist", "unable to open config", "no such file", "does not exist",
"connection refused", "network", "timeout", "no route" "connection refused", "network", "timeout", "no route",
)): )):
return {"repo_id": dest["repo_id"], "status": "UNREACHABLE", return {"repo_id": dest["repo_id"], "status": "UNREACHABLE",
"error": result.stderr.strip()} "error": result.stderr.strip()}
# Repository locked by a previous crashed backup run. # Repository locked by a previous crashed backup run.
# Run ``restic unlock`` manually to recover.
elif "locked" in stderr or "lock" in stderr: elif "locked" in stderr or "lock" in stderr:
return {"repo_id": dest["repo_id"], "status": "LOCKED", return {"repo_id": dest["repo_id"], "status": "LOCKED",
"error": result.stderr.strip()} "error": result.stderr.strip()}
# Pack / data integrity error repository may be corrupted. # Pack / data integrity error - repository may be corrupted.
elif "pack" in stderr and "error" in stderr: elif "pack" in stderr and "error" in stderr:
return {"repo_id": dest["repo_id"], "status": "CORRUPTED", return {"repo_id": dest["repo_id"], "status": "CORRUPTED",
"error": result.stderr.strip()} "error": result.stderr.strip()}
# Generic restic error not covered by the specific cases above. # Generic restic error.
elif "error" in stderr or "fatal" in stderr: elif "error" in stderr or "fatal" in stderr:
return {"repo_id": dest["repo_id"], "status": "ERROR", return {"repo_id": dest["repo_id"], "status": "ERROR",
"error": result.stderr.strip()} "error": result.stderr.strip()}
# Non-zero exit with unrecognised output.
else: else:
return {"repo_id": dest["repo_id"], "status": "UNKNOWN", return {"repo_id": dest["repo_id"], "status": "UNKNOWN",
"error": result.stderr.strip()} "error": result.stderr.strip()}
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
return {"repo_id": dest["repo_id"], "status": "UNREACHABLE", return {"repo_id": dest["repo_id"], "status": "UNREACHABLE",
"error": f"Timeout after {timeout}s"} "error": f"restic timed out after {timeout}s"}
except FileNotFoundError: except FileNotFoundError:
# restic binary is not installed or not in PATH.
return {"repo_id": dest["repo_id"], "status": "ERROR", return {"repo_id": dest["repo_id"], "status": "ERROR",
"error": "restic binary not found in PATH"} "error": "restic not found in PATH - install restic on the cluster leader"}
except Exception as e:
return {"repo_id": dest["repo_id"], "status": "ERROR", "error": str(e)}
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -264,57 +268,57 @@ def _check_restic_repo(dest: dict, config: dict) -> dict:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def check_repositories(config: dict, correlation: dict) -> dict:
    """Check all configured NS8 backup repositories and return a status summary.

    Every destination returned by ``_get_backup_destinations()`` is probed
    with ``_check_restic_repo()`` and the per-destination results are
    aggregated into a few boolean flags plus a one-line summary.

    Intended to be called only when the correlator outcome is not SUCCESS,
    to avoid unnecessary restic network calls on healthy clusters.

    Args:
        config:      Parsed configuration dictionary.
        correlation: Correlation result dict.  Accepted for interface
                     compatibility; this function does not read it.

    Returns:
        Dict with keys:
            destinations    : list of per-destination result dicts
                              (each has "repo_id", "status", "error")
            any_ok          : bool - True if at least one destination is OK
            all_ok          : bool - True if every destination is OK
            any_unreachable : bool - True if at least one is UNREACHABLE
                              (also True when no destination is configured)
            any_locked      : bool - True if at least one is LOCKED
            summary         : human-readable one-line summary string
            note            : str - present only when no destinations were
                              found in Redis
    """
    destinations = _get_backup_destinations(config)

    if not destinations:
        log.warning("No backup_repository keys found in Redis")
        return {
            "destinations": [],
            "any_ok": False,
            "all_ok": False,
            # Legacy keys kept so existing consumers of the previous return
            # shape keep working.
            "any_unreachable": True,
            "any_locked": False,
            "summary": "No backup repositories configured in NS8",
            "note": "No backup destinations configured or readable from Redis",
        }

    results = []
    for dest in destinations:
        # Log the backend type rather than the repository URL: some restic
        # backends allow credentials to be embedded in the URL, and those
        # must not end up in log files.
        log.info("Checking repository repo_id=%s backend=%s",
                 dest["repo_id"], dest.get("backend") or "unknown")
        result = _check_restic_repo(dest, config)
        if result["status"] == "OK":
            log.info("  -> %s", result["status"])
        else:
            # Keep the error text next to the status so a failing repository
            # can be diagnosed from the log alone.
            log.warning("  -> %s: %s", result["status"], result.get("error", ""))
        results.append(result)

    statuses = [r["status"] for r in results]
    ok_count = statuses.count("OK")
    all_ok = ok_count == len(results)

    if all_ok:
        summary = "All repositories reachable"
    else:
        summary = f"{ok_count}/{len(results)} repositories OK"

    return {
        "destinations": results,
        "any_ok": ok_count > 0,
        "all_ok": all_ok,
        # Legacy aggregate flags, retained for backward compatibility.
        "any_unreachable": "UNREACHABLE" in statuses,
        "any_locked": "LOCKED" in statuses,
        "summary": summary,
    }