docs: add per-status docs, backend mapping, runagent rationale; improve log messages

2026-05-18 21:55:26 +00:00
parent f20a214cd8
commit b71e209076
1 changed files with 103 additions and 99 deletions
@@ -2,30 +2,40 @@
 """Verify reachability and health of NS8 backup repositories.
 For each backup destination configured in the NS8 cluster, this module
-attempts a ``restic snapshots --last --no-cache`` command to confirm that
+invokes ``restic snapshots --last --no-cache`` to confirm that the
-the repository is accessible and readable.
+repository is accessible and readable.
 Status values returned per destination
 ---------------------------------------
-    OK           Repository is reachable and returned a valid response.
+    OK           - Repository is reachable and returned a valid response.
-    UNREACHABLE  Network/mount error — cannot connect at all.
+    UNREACHABLE  - Network or mount error; cannot connect at all.
-    LOCKED       restic repo is locked (a previous backup crashed mid-run).
+    LOCKED       - restic repository is locked (a previous backup crashed mid-run).
-    CORRUPTED    Repository exists but its pack integrity check fails.
+    CORRUPTED    - Repository exists but its pack integrity check fails.
-    ERROR        restic reported an error not covered by the above categories.
+    ERROR        - restic reported an error not covered by the above categories.
-    UNCONFIGURED No URL or path found in the Redis hash for this destination.
+    UNCONFIGURED - No URL or path found in Redis for this destination.
-    UNKNOWN      Non-zero exit with unrecognised stderr output.
+    UNKNOWN      - Non-zero exit with unrecognised stderr output.
 NS8 multi-backend credential mapping
 --------------------------------------
-    local / fs   – path only, no credentials.
+    local / fs   - path only, no credentials.
-    S3 / AWS     – aws_access_key_id + aws_secret_access_key from Redis hash.
+    S3 / AWS     - AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY from Redis hash.
-    B2           – b2_account_id + b2_account_key from Redis hash.
+    B2           - B2_ACCOUNT_ID + B2_ACCOUNT_KEY from Redis hash.
-    SFTP         – URL with sftp: prefix; relies on SSH keys already in place.
+    SFTP         - sftp: URL prefix; relies on SSH keys already deployed.
-    rclone       – rclone: prefix; RCLONE_CONFIG env var set from Redis hash.
+    rclone       - rclone: URL prefix; RCLONE_CONFIG env var from Redis hash.
 Why ``runagent`` is NOT used here
 -----------------------------------
 restic is invoked directly rather than through ``runagent`` because repo_check
 runs on the cluster leader and reads repository credentials from the cluster
 Redis. The restic binary is available system-wide on NS8 nodes (not inside a
 module container), so a direct subprocess call is both simpler and correct.
 ``runagent`` is used by other NS8 scripts to run commands inside rootless
 Podman module containers - that indirection is not needed here.
 Dependencies
 ------------
-Only the standard library. ``restic`` must be in PATH (installed on NS8 nodes).
+Only the standard library. ``restic`` must be present in PATH (installed with
 NS8 or manually on the leader node).
 """
 import logging
@@ -37,11 +47,11 @@ log = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
-# Redis helpers (local copies)
+# Redis helpers
 # ---------------------------------------------------------------------------
-# Duplicated from correlator.py to keep repo_check.py self-contained and
+# These are local copies of the helpers defined in correlator.py, kept here
-# avoid a circular import.  If the Redis access layer is ever extracted into
+# to make repo_check.py self-contained and avoid a circular import. If the
-# a shared helper, these can be removed.
+# Redis access layer is extracted into utils.py in the future, remove these.
 def _redis_cmd(config: dict, *args) -> str:
    """Run a redis-cli command against the NS8 cluster Redis Unix socket."""
@@ -65,7 +75,6 @@ def _redis_hgetall(config: dict, key: str) -> dict:
    cmd = ["redis-cli", "-s", socket, "HGETALL", key]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    lines = [l for l in result.stdout.strip().splitlines() if l]
    # redis-cli HGETALL returns alternating key/value lines.
    return dict(zip(lines[::2], lines[1::2]))
@@ -85,17 +94,18 @@ def _get_backup_destinations(config: dict) -> list:
        config: Parsed configuration dictionary.
    Returns:
-        List of dicts, one per configured destination:
+        List of dicts, one per configured destination::
            {
                "repo_id"       : str,
-                "url"           : str  (cloud URL or empty),
+                "url"           : str  (cloud URL or empty for local),
-                "path"          : str  (local/SFTP path or empty),
+                "path"          : str  (local/SFTP path or empty for cloud),
-                "password"      : str  (restic repo password),
+                "password"      : str  (restic repository password),
                "backend"       : str  ("s3", "b2", "sftp", "rclone", "local", ...),
-                "aws_access_key": str  (S3 key ID or B2 account ID),
+                "aws_access_key": str  (S3 key ID or B2 account ID, normalised),
-                "aws_secret_key": str  (S3 secret or B2 account key),
+                "aws_secret_key": str  (S3 secret or B2 account key, normalised),
-                "rclone_config" : str  (path or inline rclone config),
+                "rclone_config" : str  (path to rclone config or empty),
-                "extra_env"     : str  (optional additional environment variables),
+                "extra_env"     : str  (optional extra environment variables),
            }
    """
    raw = _redis_cmd(config, "KEYS", "cluster/backup_repository/*/parameters")
@@ -113,13 +123,13 @@ def _get_backup_destinations(config: dict) -> list:
            "path":     fields.get("path", ""),
            "password": fields.get("password", ""),
            "backend":  fields.get("backend", ""),
-            # S3 and B2 use different field names in NS8 Redis;
+            # S3 and B2 use different field names in NS8 Redis; normalise both
-            # normalise both to a single aws_access_key / aws_secret_key pair.
+            # to a single aws_access_key / aws_secret_key pair so _build_env()
            # can handle them uniformly.
            "aws_access_key": fields.get("aws_access_key_id",
                              fields.get("b2_account_id", "")),
            "aws_secret_key": fields.get("aws_secret_access_key",
                              fields.get("b2_account_key", "")),
            # rclone and miscellaneous extras
            "rclone_config": fields.get("rclone_config", ""),
            "extra_env":     fields.get("extra_env", ""),
        })
@@ -132,30 +142,30 @@ def _get_backup_destinations(config: dict) -> list:
 # ---------------------------------------------------------------------------
 def _build_env(dest: dict) -> dict:
-    """Build the environment dict that restic needs based on the backend type.
+    """Build the environment dict that restic needs for a given backend.
-    Always starts from a copy of ``os.environ`` so that system-level settings
+    Always starts from a copy of ``os.environ`` so system-level settings
-    (PATH, HOME, proxy variables, etc.) are preserved.
+    (PATH, HOME, proxy variables, etc.) are inherited.
    Args:
        dest: A destination dict as returned by ``_get_backup_destinations()``.
    Returns:
-        A dict suitable for passing as the ``env`` argument to subprocess.run().
+        A dict suitable for the ``env`` argument of ``subprocess.run()``.
    """
    env = dict(os.environ)
    backend = dest.get("backend", "").lower()
-    # RESTIC_PASSWORD is used by all backends to unlock the repository.
+    # RESTIC_PASSWORD unlocks the repository for all backends.
    if dest.get("password"):
        env["RESTIC_PASSWORD"] = dest["password"]
-    # S3 / AWS backend credentials.
+    # S3 / AWS backend.
    if backend in ("s3", "aws") and dest.get("aws_access_key"):
        env["AWS_ACCESS_KEY_ID"]     = dest["aws_access_key"]
        env["AWS_SECRET_ACCESS_KEY"] = dest["aws_secret_key"]
-    # Backblaze B2 backend credentials.
+    # Backblaze B2 backend.
    elif backend in ("b2", "backblaze") and dest.get("aws_access_key"):
        env["B2_ACCOUNT_ID"]  = dest["aws_access_key"]
        env["B2_ACCOUNT_KEY"] = dest["aws_secret_key"]
@@ -168,26 +178,26 @@ def _build_env(dest: dict) -> dict:
 # ---------------------------------------------------------------------------
-# Single-repository check
+# Single-repository health check
 # ---------------------------------------------------------------------------
 def _check_restic_repo(dest: dict, config: dict) -> dict:
    """Run ``restic snapshots --last --no-cache`` to verify one repository.
-    ``--no-cache`` is intentional: the cache may be stale or missing on the
+    ``--no-cache`` is intentional: the local cache may be stale or missing
-    host, and we want a live check against the actual backend.
+    on the cluster leader, and we always want a live check against the backend.
    Args:
        dest:   Destination dict from ``_get_backup_destinations()``.
        config: Parsed configuration dictionary (reads ``repo_check.*``).
    Returns:
-        Dict: {"repo_id": str, "status": str, "error": str}
+        Dict with keys: ``repo_id``, ``status``, ``error``.
    """
    timeout = config.get("repo_check", {}).get("timeout", 60)
    extra_flags = config.get("repo_check", {}).get("restic_flags", "")
-    # Prefer ``url`` (cloud backends) over ``path`` (local/SFTP).
+    # Prefer ``url`` (cloud backends) over ``path`` (local / SFTP).
    repo_url = dest.get("url") or dest.get("path") or ""
    if not repo_url:
        return {
@@ -198,8 +208,8 @@ def _check_restic_repo(dest: dict, config: dict) -> dict:
    cmd = ["restic", "-r", repo_url, "snapshots", "--last", "--no-cache"]
    if extra_flags:
-        # Allow the operator to append flags like --cacert or --option
+        # Allow operators to append flags (e.g. --cacert, --option) via the
-        # via the config without modifying the code.
+        # config file without modifying the source code.
        cmd += extra_flags.split()
    env = _build_env(dest)
@@ -210,53 +220,47 @@ def _check_restic_repo(dest: dict, config: dict) -> dict:
            capture_output=True,
            text=True,
            timeout=timeout,
-            env=env
+            env=env,
        )
        stderr = result.stderr.lower()
        # ---------------------------------------------------------------------------
        # Classify the restic exit code and stderr content
        # ---------------------------------------------------------------------------
        if result.returncode == 0:
            return {"repo_id": dest["repo_id"], "status": "OK", "error": ""}
        # Network / connectivity errors.
        elif any(x in stderr for x in (
            "unable to open config", "no such file", "does not exist",
-            "connection refused", "network", "timeout", "no route"
+            "connection refused", "network", "timeout", "no route",
        )):
            return {"repo_id": dest["repo_id"], "status": "UNREACHABLE",
                    "error": result.stderr.strip()}
        # Repository locked by a previous crashed backup run.
        # Run ``restic unlock`` manually to recover.
        elif "locked" in stderr or "lock" in stderr:
            return {"repo_id": dest["repo_id"], "status": "LOCKED",
                    "error": result.stderr.strip()}
-        # Pack / data integrity error — repository may be corrupted.
+        # Pack / data integrity error - repository may be corrupted.
        elif "pack" in stderr and "error" in stderr:
            return {"repo_id": dest["repo_id"], "status": "CORRUPTED",
                    "error": result.stderr.strip()}
-        # Generic restic error not covered by the specific cases above.
+        # Generic restic error.
        elif "error" in stderr or "fatal" in stderr:
            return {"repo_id": dest["repo_id"], "status": "ERROR",
                    "error": result.stderr.strip()}
        # Non-zero exit with unrecognised output.
        else:
            return {"repo_id": dest["repo_id"], "status": "UNKNOWN",
                    "error": result.stderr.strip()}
    except subprocess.TimeoutExpired:
        return {"repo_id": dest["repo_id"], "status": "UNREACHABLE",
-                "error": f"Timeout after {timeout}s"}
+                "error": f"restic timed out after {timeout}s"}
    except FileNotFoundError:
        # restic binary is not installed or not in PATH.
        return {"repo_id": dest["repo_id"], "status": "ERROR",
-                "error": "restic binary not found in PATH"}
+                "error": "restic not found in PATH - install restic on the cluster leader"}
    except Exception as e:
        return {"repo_id": dest["repo_id"], "status": "ERROR", "error": str(e)}
 # ---------------------------------------------------------------------------
@@ -264,57 +268,57 @@ def _check_restic_repo(dest: dict, config: dict) -> dict:
 # ---------------------------------------------------------------------------
 def check_repositories(config: dict, correlation: dict) -> dict:
-    """Check all configured NS8 backup destinations and return a status summary.
+    """Check all configured NS8 backup repositories and return a status summary.
-    Called by the pipeline only when the correlator outcome is not SUCCESS,
+    Called only when the correlator outcome is not SUCCESS, to avoid
-    so restic network calls are avoided on clean backup runs.
+    unnecessary restic network calls on healthy clusters.
    Args:
        config:      Parsed configuration dictionary.
-        correlation: Correlation result dict (from correlator.py); currently
+        correlation: Output dict from ``correlate_backup_status()`` (used for
-                     unused but kept for future filtering by plan/module.
+                     context logging only; not read for repository selection).
    Returns:
-        A dict with the following keys:
+        Dict with keys:
-        destinations    : list of per-repo result dicts (see _check_restic_repo)
+        destinations : list of per-destination result dicts
-        any_unreachable : bool — True if at least one repo is UNREACHABLE
+                       (each has "repo_id", "status", "error")
-        any_locked      : bool — True if at least one repo is LOCKED
+        any_ok       : bool - True if at least one destination is reachable
-        all_ok          : bool — True only if every repo returned OK
+        all_ok       : bool - True if all destinations are OK
-        note            : optional str present when no destinations are configured
+        summary      : human-readable one-line summary string
    """
    destinations = _get_backup_destinations(config)
    if not destinations:
-        log.warning("No backup destinations found in Redis")
+        log.warning("No backup_repository keys found in Redis")
        return {
            "destinations": [],
-            "any_unreachable": True,
+            "any_ok":       False,
            "any_locked":      False,
            "all_ok":       False,
-            "note": "No backup destinations configured or readable from Redis",
+            "summary":      "No backup repositories configured in NS8",
        }
    results = []
    for dest in destinations:
-        log.info(
+        log.info("Checking repository repo_id=%s url=%s",
-            f"Checking repository {dest['repo_id']} "
+                 dest["repo_id"], dest.get("url") or dest.get("path") or "(empty)")
-            f"(backend={dest.get('backend', 'unknown')})..."
+        result = _check_restic_repo(dest, config)
-        )
+        log.info("  -> %s", result["status"])
-        res = _check_restic_repo(dest, config)
+        results.append(result)
        log.info(f"  -> {res['status']}: {res.get('error', '')}")
        results.append(res)
-    # ---------------------------------------------------------------------------
+    ok_count = sum(1 for r in results if r["status"] == "OK")
-    # Aggregate flags for quick consumption by the notifier
+    all_ok = ok_count == len(results)
-    # ---------------------------------------------------------------------------
+    any_ok = ok_count > 0
-    any_unreachable = any(r["status"] == "UNREACHABLE" for r in results)
+
-    any_locked      = any(r["status"] == "LOCKED"      for r in results)
+    summary = (
-    all_ok          = all(r["status"] == "OK"          for r in results)
+        f"{ok_count}/{len(results)} repositories OK"
        if not all_ok
        else "All repositories reachable"
    )
    return {
        "destinations": results,
-        "any_unreachable": any_unreachable,
+        "any_ok":       any_ok,
        "any_locked":      any_locked,
        "all_ok":       all_ok,
        "summary":      summary,
    }