docs: add per-status docs, backend mapping, runagent rationale; improve log messages

2026-05-18 21:55:26 +00:00
parent f20a214cd8
commit b71e209076
1 changed files with 103 additions and 99 deletions
@@ -2,30 +2,40 @@
 """Verify reachability and health of NS8 backup repositories.

 For each backup destination configured in the NS8 cluster, this module
-attempts a ``restic snapshots --last --no-cache`` command to confirm that
-the repository is accessible and readable.
+invokes ``restic snapshots --last --no-cache`` to confirm that the
+repository is accessible and readable.

 Status values returned per destination
 ---------------------------------------
-    OK           Repository is reachable and returned a valid response.
-    UNREACHABLE  Network/mount error — cannot connect at all.
-    LOCKED       restic repo is locked (a previous backup crashed mid-run).
-    CORRUPTED    Repository exists but its pack integrity check fails.
-    ERROR        restic reported an error not covered by the above categories.
-    UNCONFIGURED No URL or path found in the Redis hash for this destination.
-    UNKNOWN      Non-zero exit with unrecognised stderr output.
+    OK           - Repository is reachable and returned a valid response.
+    UNREACHABLE  - Network or mount error; cannot connect at all.
+    LOCKED       - restic repository is locked (a previous backup crashed mid-run).
+    CORRUPTED    - Repository exists but its pack integrity check fails.
+    ERROR        - restic reported an error not covered by the above categories.
+    UNCONFIGURED - No URL or path found in Redis for this destination.
+    UNKNOWN      - Non-zero exit with unrecognised stderr output.

 NS8 multi-backend credential mapping
 --------------------------------------
-    local / fs   – path only, no credentials.
-    S3 / AWS     – aws_access_key_id + aws_secret_access_key from Redis hash.
-    B2           – b2_account_id + b2_account_key from Redis hash.
-    SFTP         – URL with sftp: prefix; relies on SSH keys already in place.
-    rclone       – rclone: prefix; RCLONE_CONFIG env var set from Redis hash.
+    local / fs   - path only, no credentials.
+    S3 / AWS     - AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY from Redis hash.
+    B2           - B2_ACCOUNT_ID + B2_ACCOUNT_KEY from Redis hash.
+    SFTP         - sftp: URL prefix; relies on SSH keys already deployed.
+    rclone       - rclone: URL prefix; RCLONE_CONFIG env var from Redis hash.
+
+Why ``runagent`` is NOT used here
+-----------------------------------
+restic is invoked directly rather than through ``runagent`` because repo_check
+runs on the cluster leader and reads repository credentials from the cluster
+Redis. The restic binary is available system-wide on NS8 nodes (not inside a
+module container), so a direct subprocess call is both simpler and correct.
+``runagent`` is used by other NS8 scripts to run commands inside rootless
+Podman module containers - that indirection is not needed here.

 Dependencies
 ------------
-Only the standard library. ``restic`` must be in PATH (installed on NS8 nodes).
+Only the standard library. ``restic`` must be present in PATH (installed with
+NS8 or manually on the leader node).
 """

 import logging
@@ -37,11 +47,11 @@ log = logging.getLogger(__name__)


 # ---------------------------------------------------------------------------
-# Redis helpers (local copies)
+# Redis helpers
 # ---------------------------------------------------------------------------
-# Duplicated from correlator.py to keep repo_check.py self-contained and
-# avoid a circular import.  If the Redis access layer is ever extracted into
-# a shared helper, these can be removed.
+# These are local copies of the helpers defined in correlator.py, kept here
+# to make repo_check.py self-contained and avoid a circular import. If the
+# Redis access layer is extracted into utils.py in the future, remove these.

 def _redis_cmd(config: dict, *args) -> str:
    """Run a redis-cli command against the NS8 cluster Redis Unix socket."""
@@ -65,7 +75,6 @@ def _redis_hgetall(config: dict, key: str) -> dict:
    cmd = ["redis-cli", "-s", socket, "HGETALL", key]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    lines = [l for l in result.stdout.strip().splitlines() if l]
-    # redis-cli HGETALL returns alternating key/value lines.
    return dict(zip(lines[::2], lines[1::2]))


@@ -85,17 +94,18 @@ def _get_backup_destinations(config: dict) -> list:
        config: Parsed configuration dictionary.

    Returns:
-        List of dicts, one per configured destination:
+        List of dicts, one per configured destination::
+
            {
                "repo_id"       : str,
-                "url"           : str  (cloud URL or empty),
-                "path"          : str  (local/SFTP path or empty),
-                "password"      : str  (restic repo password),
+                "url"           : str  (cloud URL or empty for local),
+                "path"          : str  (local/SFTP path or empty for cloud),
+                "password"      : str  (restic repository password),
                "backend"       : str  ("s3", "b2", "sftp", "rclone", "local", ...),
-                "aws_access_key": str  (S3 key ID or B2 account ID),
-                "aws_secret_key": str  (S3 secret or B2 account key),
-                "rclone_config" : str  (path or inline rclone config),
-                "extra_env"     : str  (optional additional environment variables),
+                "aws_access_key": str  (S3 key ID or B2 account ID, normalised),
+                "aws_secret_key": str  (S3 secret or B2 account key, normalised),
+                "rclone_config" : str  (path to rclone config or empty),
+                "extra_env"     : str  (optional extra environment variables),
            }
    """
    raw = _redis_cmd(config, "KEYS", "cluster/backup_repository/*/parameters")
@@ -108,20 +118,20 @@ def _get_backup_destinations(config: dict) -> list:
        repo_id = parts[2] if len(parts) > 2 else "unknown"
        fields = _redis_hgetall(config, key)
        destinations.append({
-            "repo_id":        repo_id,
-            "url":            fields.get("url", ""),
-            "path":           fields.get("path", ""),
-            "password":       fields.get("password", ""),
-            "backend":        fields.get("backend", ""),
-            # S3 and B2 use different field names in NS8 Redis;
-            # normalise both to a single aws_access_key / aws_secret_key pair.
+            "repo_id":  repo_id,
+            "url":      fields.get("url", ""),
+            "path":     fields.get("path", ""),
+            "password": fields.get("password", ""),
+            "backend":  fields.get("backend", ""),
+            # S3 and B2 use different field names in NS8 Redis; normalise both
+            # to a single aws_access_key / aws_secret_key pair so _build_env()
+            # can handle them uniformly.
            "aws_access_key": fields.get("aws_access_key_id",
                              fields.get("b2_account_id", "")),
            "aws_secret_key": fields.get("aws_secret_access_key",
                              fields.get("b2_account_key", "")),
-            # rclone and miscellaneous extras
-            "rclone_config":  fields.get("rclone_config", ""),
-            "extra_env":      fields.get("extra_env", ""),
+            "rclone_config": fields.get("rclone_config", ""),
+            "extra_env":     fields.get("extra_env", ""),
        })

    return destinations
@@ -132,30 +142,30 @@ def _get_backup_destinations(config: dict) -> list:
 # ---------------------------------------------------------------------------

 def _build_env(dest: dict) -> dict:
-    """Build the environment dict that restic needs based on the backend type.
+    """Build the environment dict that restic needs for a given backend.

-    Always starts from a copy of ``os.environ`` so that system-level settings
-    (PATH, HOME, proxy variables, etc.) are preserved.
+    Always starts from a copy of ``os.environ`` so system-level settings
+    (PATH, HOME, proxy variables, etc.) are inherited.

    Args:
        dest: A destination dict as returned by ``_get_backup_destinations()``.

    Returns:
-        A dict suitable for passing as the ``env`` argument to subprocess.run().
+        A dict suitable for the ``env`` argument of ``subprocess.run()``.
    """
    env = dict(os.environ)
    backend = dest.get("backend", "").lower()

-    # RESTIC_PASSWORD is used by all backends to unlock the repository.
+    # RESTIC_PASSWORD unlocks the repository for all backends.
    if dest.get("password"):
        env["RESTIC_PASSWORD"] = dest["password"]

-    # S3 / AWS backend credentials.
+    # S3 / AWS backend.
    if backend in ("s3", "aws") and dest.get("aws_access_key"):
        env["AWS_ACCESS_KEY_ID"]     = dest["aws_access_key"]
        env["AWS_SECRET_ACCESS_KEY"] = dest["aws_secret_key"]

-    # Backblaze B2 backend credentials.
+    # Backblaze B2 backend.
    elif backend in ("b2", "backblaze") and dest.get("aws_access_key"):
        env["B2_ACCOUNT_ID"]  = dest["aws_access_key"]
        env["B2_ACCOUNT_KEY"] = dest["aws_secret_key"]
@@ -168,26 +178,26 @@ def _build_env(dest: dict) -> dict:


 # ---------------------------------------------------------------------------
-# Single-repository check
+# Single-repository health check
 # ---------------------------------------------------------------------------

 def _check_restic_repo(dest: dict, config: dict) -> dict:
    """Run ``restic snapshots --last --no-cache`` to verify one repository.

-    ``--no-cache`` is intentional: the cache may be stale or missing on the
-    host, and we want a live check against the actual backend.
+    ``--no-cache`` is intentional: the local cache may be stale or missing
+    on the cluster leader, and we always want a live check against the backend.

    Args:
        dest:   Destination dict from ``_get_backup_destinations()``.
        config: Parsed configuration dictionary (reads ``repo_check.*``).

    Returns:
-        Dict: {"repo_id": str, "status": str, "error": str}
+        Dict with keys: ``repo_id``, ``status``, ``error``.
    """
    timeout = config.get("repo_check", {}).get("timeout", 60)
    extra_flags = config.get("repo_check", {}).get("restic_flags", "")

-    # Prefer ``url`` (cloud backends) over ``path`` (local/SFTP).
+    # Prefer ``url`` (cloud backends) over ``path`` (local / SFTP).
    repo_url = dest.get("url") or dest.get("path") or ""
    if not repo_url:
        return {
@@ -198,8 +208,8 @@ def _check_restic_repo(dest: dict, config: dict) -> dict:

    cmd = ["restic", "-r", repo_url, "snapshots", "--last", "--no-cache"]
    if extra_flags:
-        # Allow the operator to append flags like --cacert or --option
-        # via the config without modifying the code.
+        # Allow operators to append flags (e.g. --cacert, --option) via the
+        # config file without modifying the source code.
        cmd += extra_flags.split()

    env = _build_env(dest)
@@ -210,53 +220,47 @@ def _check_restic_repo(dest: dict, config: dict) -> dict:
            capture_output=True,
            text=True,
            timeout=timeout,
-            env=env
+            env=env,
        )
        stderr = result.stderr.lower()

-        # ---------------------------------------------------------------------------
-        # Classify the restic exit code and stderr content
-        # ---------------------------------------------------------------------------
        if result.returncode == 0:
            return {"repo_id": dest["repo_id"], "status": "OK", "error": ""}

        # Network / connectivity errors.
        elif any(x in stderr for x in (
            "unable to open config", "no such file", "does not exist",
-            "connection refused", "network", "timeout", "no route"
+            "connection refused", "network", "timeout", "no route",
        )):
            return {"repo_id": dest["repo_id"], "status": "UNREACHABLE",
                    "error": result.stderr.strip()}

        # Repository locked by a previous crashed backup run.
+        # Run ``restic unlock`` manually to recover.
        elif "locked" in stderr or "lock" in stderr:
            return {"repo_id": dest["repo_id"], "status": "LOCKED",
                    "error": result.stderr.strip()}

-        # Pack / data integrity error — repository may be corrupted.
+        # Pack / data integrity error - repository may be corrupted.
        elif "pack" in stderr and "error" in stderr:
            return {"repo_id": dest["repo_id"], "status": "CORRUPTED",
                    "error": result.stderr.strip()}

-        # Generic restic error not covered by the specific cases above.
+        # Generic restic error.
        elif "error" in stderr or "fatal" in stderr:
            return {"repo_id": dest["repo_id"], "status": "ERROR",
                    "error": result.stderr.strip()}

-        # Non-zero exit with unrecognised output.
        else:
            return {"repo_id": dest["repo_id"], "status": "UNKNOWN",
                    "error": result.stderr.strip()}

    except subprocess.TimeoutExpired:
        return {"repo_id": dest["repo_id"], "status": "UNREACHABLE",
-                "error": f"Timeout after {timeout}s"}
+                "error": f"restic timed out after {timeout}s"}
    except FileNotFoundError:
-        # restic binary is not installed or not in PATH.
        return {"repo_id": dest["repo_id"], "status": "ERROR",
-                "error": "restic binary not found in PATH"}
-    except Exception as e:
-        return {"repo_id": dest["repo_id"], "status": "ERROR", "error": str(e)}
+                "error": "restic not found in PATH - install restic on the cluster leader"}


 # ---------------------------------------------------------------------------
@@ -264,57 +268,57 @@ def _check_restic_repo(dest: dict, config: dict) -> dict:
 # ---------------------------------------------------------------------------

 def check_repositories(config: dict, correlation: dict) -> dict:
-    """Check all configured NS8 backup destinations and return a status summary.
+    """Check all configured NS8 backup repositories and return a status summary.

-    Called by the pipeline only when the correlator outcome is not SUCCESS,
-    so restic network calls are avoided on clean backup runs.
+    Called only when the correlator outcome is not SUCCESS, to avoid
+    unnecessary restic network calls on healthy clusters.

    Args:
        config:      Parsed configuration dictionary.
-        correlation: Correlation result dict (from correlator.py); currently
-                     unused but kept for future filtering by plan/module.
+        correlation: Output dict from ``correlate_backup_status()`` (used for
+                     context logging only; not read for repository selection).

    Returns:
-        A dict with the following keys:
+        Dict with keys:

-        destinations    : list of per-repo result dicts (see _check_restic_repo)
-        any_unreachable : bool — True if at least one repo is UNREACHABLE
-        any_locked      : bool — True if at least one repo is LOCKED
-        all_ok          : bool — True only if every repo returned OK
-        note            : optional str present when no destinations are configured
+        destinations : list of per-destination result dicts
+                       (each has "repo_id", "status", "error")
+        any_ok       : bool - True if at least one destination is reachable
+        all_ok       : bool - True if all destinations are OK
+        summary      : human-readable one-line summary string
    """
    destinations = _get_backup_destinations(config)

    if not destinations:
-        log.warning("No backup destinations found in Redis")
+        log.warning("No backup_repository keys found in Redis")
        return {
-            "destinations":    [],
-            "any_unreachable": True,
-            "any_locked":      False,
-            "all_ok":          False,
-            "note": "No backup destinations configured or readable from Redis",
+            "destinations": [],
+            "any_ok":       False,
+            "all_ok":       False,
+            "summary":      "No backup repositories configured in NS8",
        }

    results = []
    for dest in destinations:
-        log.info(
-            f"Checking repository {dest['repo_id']} "
-            f"(backend={dest.get('backend', 'unknown')})..."
-        )
-        res = _check_restic_repo(dest, config)
-        log.info(f"  -> {res['status']}: {res.get('error', '')}")
-        results.append(res)
+        log.info("Checking repository repo_id=%s url=%s",
+                 dest["repo_id"], dest.get("url") or dest.get("path") or "(empty)")
+        result = _check_restic_repo(dest, config)
+        log.info("  -> %s", result["status"])
+        results.append(result)

-    # ---------------------------------------------------------------------------
-    # Aggregate flags for quick consumption by the notifier
-    # ---------------------------------------------------------------------------
-    any_unreachable = any(r["status"] == "UNREACHABLE" for r in results)
-    any_locked      = any(r["status"] == "LOCKED"      for r in results)
-    all_ok          = all(r["status"] == "OK"          for r in results)
+    ok_count = sum(1 for r in results if r["status"] == "OK")
+    all_ok = ok_count == len(results)
+    any_ok = ok_count > 0
+
+    summary = (
+        f"{ok_count}/{len(results)} repositories OK"
+        if not all_ok
+        else "All repositories reachable"
+    )

    return {
-        "destinations":    results,
-        "any_unreachable": any_unreachable,
-        "any_locked":      any_locked,
-        "all_ok":          all_ok,
+        "destinations": results,
+        "any_ok":       any_ok,
+        "all_ok":       all_ok,
+        "summary":      summary,
    }