docs: add section-by-section comments — repo_check.py

2026-05-18 21:02:14 +00:00
parent 9366027534
commit 20d7ecc8c4
1 changed files with 199 additions and 71 deletions
@@ -1,21 +1,31 @@
 #!/usr/bin/env python3
-"""
-repo_check.py - Verifies reachability and health of NS8 backup repositories.
+"""Verify reachability and health of NS8 backup repositories.

-For each backup destination configured in the cluster, attempts a
-`restic snapshots --last` command to verify the repo is accessible.
-Distinguishes between:
-  - UNREACHABLE: network/mount error, cannot connect at all
-  - LOCKED:      restic repo is locked (previous backup crashed)
-  - CORRUPTED:   repo exists but integrity check fails
-  - OK:          repo is accessible
+For each backup destination configured in the NS8 cluster, this module
+attempts a ``restic snapshots --last --no-cache`` command to confirm that
+the repository is accessible and readable.

-Handles NS8 multi-backend credentials:
-  - local / fs:  path only
-  - S3 / B2:     url + AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY or
-                 B2_ACCOUNT_ID / B2_ACCOUNT_KEY from Redis hash
-  - SFTP:        url with sftp: prefix
-  - rclone:      rclone: prefix
+Status values returned per destination
+---------------------------------------
+    OK           Repository is reachable and returned a valid response.
+    UNREACHABLE  Network/mount error — cannot connect at all.
+    LOCKED       restic repo is locked (a previous backup crashed mid-run).
+    CORRUPTED    restic stderr mentions both ``pack`` and ``error`` (likely damage).
+    ERROR        Other restic error, restic binary missing, or unexpected exception.
+    UNCONFIGURED No URL or path found in the Redis hash for this destination.
+    UNKNOWN      Non-zero exit with unrecognised stderr output.
+
+NS8 multi-backend credential mapping
+--------------------------------------
+    local / fs   – path only, no credentials.
+    S3 / AWS     – aws_access_key_id + aws_secret_access_key from Redis hash.
+    B2           – b2_account_id + b2_account_key from Redis hash.
+    SFTP         – URL with sftp: prefix; relies on SSH keys already in place.
+    rclone       – rclone: prefix; RCLONE_CONFIG env var set from Redis hash.
+
+Dependencies
+------------
+Only the standard library. ``restic`` and ``redis-cli`` must be in PATH on the node.
 """

 import logging
@@ -26,90 +36,170 @@ from typing import Optional
 log = logging.getLogger(__name__)


+# ---------------------------------------------------------------------------
+# Redis helpers (local copies)
+# ---------------------------------------------------------------------------
+# Duplicated from correlator.py to keep repo_check.py self-contained and
+# avoid a circular import.  If the Redis access layer is ever extracted into
+# a shared helper, these can be removed.
+
 def _redis_cmd(config: dict, *args) -> str:
-    socket = config.get("redis", {}).get("socket", "/var/lib/nethserver/cluster/state/redis.sock")
+    """Run a redis-cli command against the NS8 cluster Redis Unix socket."""
+    socket = config.get("redis", {}).get(
+        "socket", "/var/lib/nethserver/cluster/state/redis.sock"
+    )
    cmd = ["redis-cli", "-s", socket] + list(args)
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    return result.stdout.strip()


 def _redis_hgetall(config: dict, key: str) -> dict:
-    """Return all fields of a Redis hash as a dict."""
-    socket = config.get("redis", {}).get("socket", "/var/lib/nethserver/cluster/state/redis.sock")
+    """Return all fields of a Redis hash as a Python dict.
+
+    ``redis-cli HGETALL`` outputs alternating field / value lines;
+    this function zips consecutive pairs into a dict.
+    """
+    socket = config.get("redis", {}).get(
+        "socket", "/var/lib/nethserver/cluster/state/redis.sock"
+    )
    cmd = ["redis-cli", "-s", socket, "HGETALL", key]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    lines = [l for l in result.stdout.strip().splitlines() if l]
-    # redis-cli HGETALL returns alternating key/value lines
+    # redis-cli HGETALL returns alternating key/value lines.
    return dict(zip(lines[::2], lines[1::2]))


+# ---------------------------------------------------------------------------
+# Destination discovery
+# ---------------------------------------------------------------------------
+
 def _get_backup_destinations(config: dict) -> list:
-    """
-    Read all configured backup destinations from NS8 Redis.
-    Key pattern: cluster/backup_repository/<repo_id>/parameters
-    Returns list of dicts with full repo config.
+    """Read all configured backup repository destinations from NS8 Redis.
+
+    Key pattern: ``cluster/backup_repository/<repo_id>/parameters``
+
+    Each hash contains the URL/path, password, backend type, and any
+    cloud-provider credentials needed to invoke restic.
+
+    Args:
+        config: Parsed configuration dictionary.
+
+    Returns:
+        List of dicts, one per configured destination:
+            {
+                "repo_id"       : str,
+                "url"           : str  (cloud URL or empty),
+                "path"          : str  (local/SFTP path or empty),
+                "password"      : str  (restic repo password),
+                "backend"       : str  ("s3", "b2", "sftp", "rclone", "local", ...),
+                "aws_access_key": str  (S3 key ID or B2 account ID),
+                "aws_secret_key": str  (S3 secret or B2 account key),
+                "rclone_config" : str  (path or inline rclone config),
+                "extra_env"     : str  (optional additional environment variables),
+            }
    """
    raw = _redis_cmd(config, "KEYS", "cluster/backup_repository/*/parameters")
    keys = [k for k in raw.splitlines() if k]
    destinations = []

    for key in keys:
+        # Key format: cluster/backup_repository/<repo_id>/parameters
        parts = key.split("/")
        repo_id = parts[2] if len(parts) > 2 else "unknown"
        fields = _redis_hgetall(config, key)
        destinations.append({
-            "repo_id": repo_id,
-            "url": fields.get("url", ""),
-            "path": fields.get("path", ""),
-            "password": fields.get("password", ""),
-            "backend": fields.get("backend", ""),
-            # S3 / B2 credentials
-            "aws_access_key": fields.get("aws_access_key_id", fields.get("b2_account_id", "")),
-            "aws_secret_key": fields.get("aws_secret_access_key", fields.get("b2_account_key", "")),
-            # rclone / extra
-            "rclone_config": fields.get("rclone_config", ""),
-            "extra_env": fields.get("extra_env", ""),
+            "repo_id":        repo_id,
+            "url":            fields.get("url", ""),
+            "path":           fields.get("path", ""),
+            "password":       fields.get("password", ""),
+            "backend":        fields.get("backend", ""),
+            # S3 and B2 use different field names in NS8 Redis;
+            # normalise both to a single aws_access_key / aws_secret_key pair.
+            "aws_access_key": fields.get("aws_access_key_id",
+                              fields.get("b2_account_id", "")),
+            "aws_secret_key": fields.get("aws_secret_access_key",
+                              fields.get("b2_account_key", "")),
+            # rclone and miscellaneous extras
+            "rclone_config":  fields.get("rclone_config", ""),
+            "extra_env":      fields.get("extra_env", ""),
        })

    return destinations


+# ---------------------------------------------------------------------------
+# Environment builder
+# ---------------------------------------------------------------------------
+
 def _build_env(dest: dict) -> dict:
-    """
-    Build the environment dict for restic based on the backend type.
-    Always inherits from os.environ so system-level creds are preserved.
+    """Build the environment dict that restic needs based on the backend type.
+
+    Always starts from a copy of ``os.environ`` so that system-level settings
+    (PATH, HOME, proxy variables, etc.) are preserved.
+
+    Args:
+        dest: A destination dict as returned by ``_get_backup_destinations()``.
+
+    Returns:
+        A dict suitable for passing as the ``env`` argument to subprocess.run().
    """
    env = dict(os.environ)
    backend = dest.get("backend", "").lower()

+    # RESTIC_PASSWORD is used by all backends to unlock the repository.
    if dest.get("password"):
        env["RESTIC_PASSWORD"] = dest["password"]

+    # S3 / AWS backend credentials.
    if backend in ("s3", "aws") and dest.get("aws_access_key"):
-        env["AWS_ACCESS_KEY_ID"] = dest["aws_access_key"]
+        env["AWS_ACCESS_KEY_ID"]     = dest["aws_access_key"]
        env["AWS_SECRET_ACCESS_KEY"] = dest["aws_secret_key"]

+    # Backblaze B2 backend credentials.
    elif backend in ("b2", "backblaze") and dest.get("aws_access_key"):
-        env["B2_ACCOUNT_ID"] = dest["aws_access_key"]
+        env["B2_ACCOUNT_ID"]  = dest["aws_access_key"]
        env["B2_ACCOUNT_KEY"] = dest["aws_secret_key"]

+    # rclone backend: point restic to the rclone config file.
    elif backend == "rclone" and dest.get("rclone_config"):
        env["RCLONE_CONFIG"] = dest["rclone_config"]

    return env


+# ---------------------------------------------------------------------------
+# Single-repository check
+# ---------------------------------------------------------------------------
+
 def _check_restic_repo(dest: dict, config: dict) -> dict:
-    """Run restic snapshots --last to verify repo is accessible."""
+    """Run ``restic snapshots --last --no-cache`` to verify one repository.
+
+    ``--no-cache`` is intentional: the cache may be stale or missing on the
+    host, and we want a live check against the actual backend.
+
+    Args:
+        dest:   Destination dict from ``_get_backup_destinations()``.
+        config: Parsed configuration dictionary (reads ``repo_check.*``).
+
+    Returns:
+        Dict: {"repo_id": str, "status": str, "error": str}
+    """
    timeout = config.get("repo_check", {}).get("timeout", 60)
    extra_flags = config.get("repo_check", {}).get("restic_flags", "")

+    # Prefer ``url`` (cloud backends) over ``path`` (local/SFTP).
    repo_url = dest.get("url") or dest.get("path") or ""
    if not repo_url:
-        return {"repo_id": dest["repo_id"], "status": "UNCONFIGURED", "error": "No URL or path found"}
+        return {
+            "repo_id": dest["repo_id"],
+            "status":  "UNCONFIGURED",
+            "error":   "No URL or path found in Redis for this destination",
+        }

    cmd = ["restic", "-r", repo_url, "snapshots", "--last", "--no-cache"]
    if extra_flags:
+        # Allow the operator to append flags like --cacert or --option
+        # via the config without modifying the code.
        cmd += extra_flags.split()

    env = _build_env(dest)
@@ -124,69 +214,107 @@ def _check_restic_repo(dest: dict, config: dict) -> dict:
        )
        stderr = result.stderr.lower()

+        # ---------------------------------------------------------------------------
+        # Classify the restic exit code and stderr content
+        # ---------------------------------------------------------------------------
        if result.returncode == 0:
            return {"repo_id": dest["repo_id"], "status": "OK", "error": ""}
-        elif any(x in stderr for x in ("unable to open config", "no such file", "does not exist",
-                                        "connection refused", "network", "timeout", "no route")):
-            return {"repo_id": dest["repo_id"], "status": "UNREACHABLE", "error": result.stderr.strip()}
+
+        # Network / connectivity errors, or repository not found at the location.
+        elif any(x in stderr for x in (
+            "unable to open config", "no such file", "does not exist",
+            "connection refused", "network", "timeout", "no route"
+        )):
+            return {"repo_id": dest["repo_id"], "status": "UNREACHABLE",
+                    "error": result.stderr.strip()}
+
+        # Repository locked by a previous crashed backup run.
        elif "locked" in stderr or "lock" in stderr:
-            return {"repo_id": dest["repo_id"], "status": "LOCKED", "error": result.stderr.strip()}
+            return {"repo_id": dest["repo_id"], "status": "LOCKED",
+                    "error": result.stderr.strip()}
+
+        # Pack / data integrity error — repository may be corrupted.
        elif "pack" in stderr and "error" in stderr:
-            return {"repo_id": dest["repo_id"], "status": "CORRUPTED", "error": result.stderr.strip()}
+            return {"repo_id": dest["repo_id"], "status": "CORRUPTED",
+                    "error": result.stderr.strip()}
+
+        # Generic restic error not covered by the specific cases above.
        elif "error" in stderr or "fatal" in stderr:
-            return {"repo_id": dest["repo_id"], "status": "ERROR", "error": result.stderr.strip()}
+            return {"repo_id": dest["repo_id"], "status": "ERROR",
+                    "error": result.stderr.strip()}
+
+        # Non-zero exit with unrecognised output.
        else:
-            return {"repo_id": dest["repo_id"], "status": "UNKNOWN", "error": result.stderr.strip()}
+            return {"repo_id": dest["repo_id"], "status": "UNKNOWN",
+                    "error": result.stderr.strip()}

    except subprocess.TimeoutExpired:
-        return {"repo_id": dest["repo_id"], "status": "UNREACHABLE", "error": f"Timeout after {timeout}s"}
+        return {"repo_id": dest["repo_id"], "status": "UNREACHABLE",
+                "error": f"Timeout after {timeout}s"}
    except FileNotFoundError:
-        return {"repo_id": dest["repo_id"], "status": "ERROR", "error": "restic binary not found in PATH"}
+        # restic binary is not installed or not in PATH.
+        return {"repo_id": dest["repo_id"], "status": "ERROR",
+                "error": "restic binary not found in PATH"}
    except Exception as e:
        return {"repo_id": dest["repo_id"], "status": "ERROR", "error": str(e)}


+# ---------------------------------------------------------------------------
+# Main entry point
+# ---------------------------------------------------------------------------
+
 def check_repositories(config: dict, correlation: dict) -> dict:
-    """
-    Main entry point for repository check.
+    """Check all configured NS8 backup destinations and return a status summary.
+
+    Called by the pipeline only when the correlator outcome is not SUCCESS,
+    so restic network calls are avoided on clean backup runs.
+
+    Args:
+        config:      Parsed configuration dictionary.
+        correlation: Correlation result dict (from correlator.py); currently
+                     unused but kept for future filtering by plan/module.

    Returns:
-    {
-        "destinations": [
-            {"repo_id": ..., "status": OK|UNREACHABLE|LOCKED|CORRUPTED|ERROR, "error": ...},
-            ...
-        ],
-        "any_unreachable": bool,
-        "any_locked": bool,
-        "all_ok": bool,
-    }
+        A dict with the following keys:
+
+        destinations    : list of per-repo result dicts (see _check_restic_repo)
+        any_unreachable : bool — True if any repo is UNREACHABLE or none were found
+        any_locked      : bool — True if at least one repo is LOCKED
+        all_ok          : bool — True only if every repo returned OK
+        note            : optional str present when no destinations are configured
    """
    destinations = _get_backup_destinations(config)

    if not destinations:
        log.warning("No backup destinations found in Redis")
        return {
-            "destinations": [],
+            "destinations":    [],
            "any_unreachable": True,
-            "any_locked": False,
-            "all_ok": False,
-            "note": "No backup destinations configured or readable from Redis"
+            "any_locked":      False,
+            "all_ok":          False,
+            "note": "No backup destinations configured or readable from Redis",
        }

    results = []
    for dest in destinations:
-        log.info(f"Checking repository {dest['repo_id']} (backend={dest.get('backend','unknown')})...")
+        log.info(
+            f"Checking repository {dest['repo_id']} "
+            f"(backend={dest.get('backend', 'unknown')})..."
+        )
        res = _check_restic_repo(dest, config)
        log.info(f"  -> {res['status']}: {res.get('error', '')}")
        results.append(res)

+    # ---------------------------------------------------------------------------
+    # Aggregate flags for quick consumption by the notifier
+    # ---------------------------------------------------------------------------
    any_unreachable = any(r["status"] == "UNREACHABLE" for r in results)
-    any_locked = any(r["status"] == "LOCKED" for r in results)
-    all_ok = all(r["status"] == "OK" for r in results)
+    any_locked      = any(r["status"] == "LOCKED"      for r in results)
+    all_ok          = all(r["status"] == "OK"          for r in results)

    return {
-        "destinations": results,
+        "destinations":    results,
        "any_unreachable": any_unreachable,
-        "any_locked": any_locked,
-        "all_ok": all_ok,
+        "any_locked":      any_locked,
+        "all_ok":          all_ok,
    }