#!/usr/bin/env python3
"""Verify reachability and health of NS8 backup repositories.

For each backup destination configured in the NS8 cluster, this module invokes
``restic snapshots --last --no-cache`` to confirm that the repository is
accessible and readable.

Status values returned per destination
---------------------------------------
OK           - Repository is reachable and returned a valid response.
UNREACHABLE  - Network or mount error; cannot connect at all.
LOCKED       - restic repository is locked (a previous backup crashed mid-run).
CORRUPTED    - Repository exists but restic reported a pack error.
ERROR        - restic reported an error not covered by the above categories.
UNCONFIGURED - No URL or path found in Redis for this destination.
UNKNOWN      - Non-zero exit with unrecognised stderr output.

NS8 multi-backend credential mapping
--------------------------------------
local / fs - path only, no credentials.
S3 / AWS   - AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY from Redis hash.
B2         - B2_ACCOUNT_ID + B2_ACCOUNT_KEY from Redis hash.
SFTP       - sftp: URL prefix; relies on SSH keys already deployed.
rclone     - rclone: URL prefix; RCLONE_CONFIG env var from Redis hash.

Why ``runagent`` is NOT used here
-----------------------------------
restic is invoked directly rather than through ``runagent`` because repo_check
runs on the cluster leader and reads repository credentials from the cluster
Redis. The restic binary is available system-wide on NS8 nodes (not inside a
module container), so a direct subprocess call is both simpler and correct.
``runagent`` is used by other NS8 scripts to run commands inside rootless
Podman module containers - that indirection is not needed here.

Dependencies
------------
Only the standard library. ``restic`` must be present in PATH (installed with
NS8 or manually on the leader node).
"""

import logging
import os
import re
import shlex
import subprocess
from typing import Optional

log = logging.getLogger(__name__)

# Default location of the NS8 cluster Redis Unix socket, used when the
# configuration does not provide ``redis.socket``.
_DEFAULT_REDIS_SOCKET = "/var/lib/nethserver/cluster/state/redis.sock"

# Seconds to wait for a single redis-cli invocation.
_REDIS_TIMEOUT = 10

# Lower-case substrings of restic's stderr that indicate the repository could
# not be reached at all (network or mount problem).
_UNREACHABLE_MARKERS = (
    "unable to open config",
    "no such file",
    "does not exist",
    "connection refused",
    "network",
    "timeout",
    "no route",
)

# Matches "lock", "locked", "locks", ... but not "block" or "unlock".
_LOCK_RE = re.compile(r"\block")

# Matches the "user:password@" part of a URL so the password can be masked.
_URL_CREDENTIALS_RE = re.compile(r"(://[^/@:\s]+):[^/@\s]*@")


# ---------------------------------------------------------------------------
# Redis helpers
# ---------------------------------------------------------------------------
# These are local copies of the helpers defined in correlator.py, kept here
# to make repo_check.py self-contained and avoid a circular import. If the
# Redis access layer is extracted into utils.py in the future, remove these.

def _redis_raw(config: dict, *args) -> str:
    """Run ``redis-cli`` against the cluster socket and return raw stdout.

    Failures are logged and reported as an empty string rather than raised,
    so that a broken Redis connection does not crash the health check.

    Args:
        config: Parsed configuration dictionary (reads ``redis.socket``).
        *args: Redis command and its arguments, one string per token.

    Returns:
        The unmodified stdout of ``redis-cli``, or ``""`` when the command
        could not be started, timed out, or exited with a non-zero status.
    """
    socket_path = (config.get("redis") or {}).get("socket", _DEFAULT_REDIS_SOCKET)
    cmd = ["redis-cli", "-s", socket_path, *args]
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=_REDIS_TIMEOUT,
            stdin=subprocess.DEVNULL,
        )
    except subprocess.TimeoutExpired:
        log.error("redis-cli timed out after %ss", _REDIS_TIMEOUT)
        return ""
    except OSError as exc:
        # Covers a missing or non-executable redis-cli binary.
        log.error("Unable to run redis-cli: %s", exc)
        return ""
    if result.returncode != 0:
        # On failure stdout is not trustworthy as command output.
        log.warning(
            "redis-cli exited with status %s: %s",
            result.returncode,
            result.stderr.strip(),
        )
        return ""
    return result.stdout


def _redis_cmd(config: dict, *args) -> str:
    """Run a redis-cli command against the NS8 cluster Redis Unix socket.

    Args:
        config: Parsed configuration dictionary.
        *args: Redis command and its arguments.

    Returns:
        The command output with surrounding whitespace stripped, or ``""``
        if the command failed.
    """
    return _redis_raw(config, *args).strip()


def _parse_hgetall(output: str) -> dict:
    """Convert raw ``redis-cli HGETALL`` output into a dict.

    ``redis-cli HGETALL`` outputs alternating field / value lines. Empty
    lines are kept on purpose: an empty value is printed as an empty line,
    and dropping it would shift every following field/value pair.

    Args:
        output: Raw stdout of ``redis-cli HGETALL``.

    Returns:
        Mapping of field name to value. A trailing field without a value is
        ignored.
    """
    lines = output.splitlines()
    if len(lines) % 2:
        log.warning("HGETALL returned an odd number of lines; ignoring the last one")
    return dict(zip(lines[::2], lines[1::2]))


def _redis_hgetall(config: dict, key: str) -> dict:
    """Return all fields of a Redis hash as a Python dict.

    Args:
        config: Parsed configuration dictionary.
        key: Name of the Redis hash to read.

    Returns:
        Mapping of field name to value; empty if the hash does not exist or
        the Redis command failed.
    """
    return _parse_hgetall(_redis_raw(config, "HGETALL", key))


# ---------------------------------------------------------------------------
# Destination discovery
# ---------------------------------------------------------------------------

def _get_backup_destinations(config: dict) -> list:
    """Read all configured backup repository destinations from NS8 Redis.

    Key pattern: ``cluster/backup_repository/<repo_id>/parameters``

    Each hash contains the URL/path, password, backend type, and any
    cloud-provider credentials needed to invoke restic.

    Args:
        config: Parsed configuration dictionary.

    Returns:
        List of dicts, one per configured destination::

            {
                "repo_id"       : str,
                "url"           : str  (cloud URL or empty for local),
                "path"          : str  (local/SFTP path or empty for cloud),
                "password"      : str  (restic repository password),
                "backend"       : str  ("s3", "b2", "sftp", "rclone", "local", ...),
                "aws_access_key": str  (S3 key ID or B2 account ID, normalised),
                "aws_secret_key": str  (S3 secret or B2 account key, normalised),
                "rclone_config" : str  (path to rclone config or empty),
                "extra_env"     : str  (optional extra environment variables;
                                        collected but not applied by _build_env),
            }
    """
    raw = _redis_cmd(config, "KEYS", "cluster/backup_repository/*/parameters")
    destinations = []
    for key in raw.splitlines():
        if not key:
            continue
        # Key format: cluster/backup_repository/<repo_id>/parameters
        parts = key.split("/")
        repo_id = parts[2] if len(parts) > 2 else "unknown"
        fields = _redis_hgetall(config, key)
        destinations.append({
            "repo_id": repo_id,
            "url": fields.get("url", ""),
            "path": fields.get("path", ""),
            "password": fields.get("password", ""),
            "backend": fields.get("backend", ""),
            # S3 and B2 use different field names in NS8 Redis; normalise both
            # to a single aws_access_key / aws_secret_key pair so _build_env()
            # can handle them uniformly. ``or`` (rather than a nested default)
            # also falls back to the B2 field when the S3 one is present but
            # empty.
            "aws_access_key": (
                fields.get("aws_access_key_id") or fields.get("b2_account_id", "")
            ),
            "aws_secret_key": (
                fields.get("aws_secret_access_key") or fields.get("b2_account_key", "")
            ),
            "rclone_config": fields.get("rclone_config", ""),
            "extra_env": fields.get("extra_env", ""),
        })
    return destinations


# ---------------------------------------------------------------------------
# Environment builder
# ---------------------------------------------------------------------------

def _build_env(dest: dict) -> dict:
    """Build the environment dict that restic needs for a given backend.

    Always starts from a copy of ``os.environ`` so system-level settings
    (PATH, HOME, proxy variables, etc.) are inherited.

    Args:
        dest: A destination dict as returned by ``_get_backup_destinations()``.

    Returns:
        A dict suitable for the ``env`` argument of ``subprocess.run()``.
    """
    env = dict(os.environ)
    backend = dest.get("backend", "").lower()
    access_key = dest.get("aws_access_key")
    secret_key = dest.get("aws_secret_key", "")

    # RESTIC_PASSWORD unlocks the repository for all backends.
    if dest.get("password"):
        env["RESTIC_PASSWORD"] = dest["password"]

    if backend in ("s3", "aws") and access_key:
        # S3 / AWS backend.
        env["AWS_ACCESS_KEY_ID"] = access_key
        env["AWS_SECRET_ACCESS_KEY"] = secret_key
    elif backend in ("b2", "backblaze") and access_key:
        # Backblaze B2 backend.
        env["B2_ACCOUNT_ID"] = access_key
        env["B2_ACCOUNT_KEY"] = secret_key
    elif backend == "rclone" and dest.get("rclone_config"):
        # rclone backend: point restic to the rclone config file.
        env["RCLONE_CONFIG"] = dest["rclone_config"]

    return env


# ---------------------------------------------------------------------------
# Single-repository health check
# ---------------------------------------------------------------------------

def _result(repo_id: str, status: str, error: str = "") -> dict:
    """Build one per-destination result dict."""
    return {"repo_id": repo_id, "status": status, "error": error}


def _split_flags(flags) -> list:
    """Turn the ``repo_check.restic_flags`` setting into an argument list.

    Args:
        flags: Either a shell-style string (quoting is honoured, so values
            containing spaces can be passed) or an iterable of arguments.

    Returns:
        List of command-line arguments; empty if ``flags`` is empty.

    Raises:
        ValueError: If a string value contains unbalanced quotes.
    """
    if not flags:
        return []
    if isinstance(flags, str):
        return shlex.split(flags)
    return [str(flag) for flag in flags]


def _classify_failure(stderr: str) -> str:
    """Map the stderr of a failed restic run to a status value.

    The checks are ordered: connectivity problems win over lock messages,
    which win over pack errors, which win over generic errors.

    Args:
        stderr: Standard error output of restic (any letter case).

    Returns:
        One of ``UNREACHABLE``, ``LOCKED``, ``CORRUPTED``, ``ERROR`` or
        ``UNKNOWN``.
    """
    text = stderr.lower()
    # Network / connectivity errors.
    if any(marker in text for marker in _UNREACHABLE_MARKERS):
        return "UNREACHABLE"
    # Repository locked by a previous crashed backup run.
    # Run ``restic unlock`` manually to recover.
    if _LOCK_RE.search(text):
        return "LOCKED"
    # Pack / data integrity error - repository may be corrupted.
    if "pack" in text and "error" in text:
        return "CORRUPTED"
    # Generic restic error.
    if "error" in text or "fatal" in text:
        return "ERROR"
    return "UNKNOWN"


def _redact_url(url: str) -> str:
    """Mask a password embedded in a URL so the URL is safe to log."""
    return _URL_CREDENTIALS_RE.sub(r"\1:***@", url)


def _check_restic_repo(dest: dict, config: dict) -> dict:
    """Run ``restic snapshots --last --no-cache`` to verify one repository.

    ``--no-cache`` is intentional: the local cache may be stale or missing on
    the cluster leader, and we always want a live check against the backend.

    Args:
        dest: Destination dict from ``_get_backup_destinations()``.
        config: Parsed configuration dictionary (reads ``repo_check.timeout``
            and ``repo_check.restic_flags``).

    Returns:
        Dict with keys: ``repo_id``, ``status``, ``error``.
    """
    repo_id = dest["repo_id"]
    settings = config.get("repo_check") or {}
    timeout = settings.get("timeout", 60)

    # Prefer ``url`` (cloud backends) over ``path`` (local / SFTP).
    repo_url = dest.get("url") or dest.get("path") or ""
    if not repo_url:
        return _result(
            repo_id,
            "UNCONFIGURED",
            "No URL or path found in Redis for this destination",
        )

    cmd = ["restic", "-r", repo_url, "snapshots", "--last", "--no-cache"]
    try:
        # Allow operators to append flags (e.g. --cacert, --option) via the
        # config file without modifying the source code.
        cmd += _split_flags(settings.get("restic_flags", ""))
    except ValueError as exc:
        return _result(repo_id, "ERROR", f"invalid repo_check.restic_flags: {exc}")

    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
            env=_build_env(dest),
            # Never let restic wait on an inherited stdin for a password.
            stdin=subprocess.DEVNULL,
        )
    except subprocess.TimeoutExpired:
        return _result(repo_id, "UNREACHABLE", f"restic timed out after {timeout}s")
    except FileNotFoundError:
        return _result(
            repo_id,
            "ERROR",
            "restic not found in PATH - install restic on the cluster leader",
        )
    except OSError as exc:
        # E.g. the restic binary exists but is not executable.
        return _result(repo_id, "ERROR", f"unable to execute restic: {exc}")

    if proc.returncode == 0:
        return _result(repo_id, "OK")
    return _result(repo_id, _classify_failure(proc.stderr), proc.stderr.strip())


# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------

def check_repositories(config: dict, correlation: dict) -> dict:
    """Check all configured NS8 backup repositories and return a status summary.

    Called only when the correlator outcome is not SUCCESS, to avoid
    unnecessary restic network calls on healthy clusters.

    Args:
        config: Parsed configuration dictionary.
        correlation: Output dict from ``correlate_backup_status()``. It is
            accepted for interface compatibility and is not read here.

    Returns:
        Dict with keys:
            destinations : list of per-destination result dicts
                           (each has "repo_id", "status", "error")
            any_ok       : bool - True if at least one destination is OK
            all_ok       : bool - True if all destinations are OK
            summary      : human-readable one-line summary string
    """
    destinations = _get_backup_destinations(config)

    if not destinations:
        log.warning("No backup_repository keys found in Redis")
        return {
            "destinations": [],
            "any_ok": False,
            "all_ok": False,
            "summary": "No backup repositories configured in NS8",
        }

    results = []
    for dest in destinations:
        location = dest.get("url") or dest.get("path") or "(empty)"
        # The URL may embed credentials; mask them before logging.
        log.info(
            "Checking repository repo_id=%s url=%s",
            dest["repo_id"],
            _redact_url(location),
        )
        result = _check_restic_repo(dest, config)
        log.info(" -> %s", result["status"])
        results.append(result)

    ok_count = sum(1 for r in results if r["status"] == "OK")
    all_ok = ok_count == len(results)
    any_ok = ok_count > 0

    summary = (
        f"{ok_count}/{len(results)} repositories OK"
        if not all_ok
        else "All repositories reachable"
    )

    return {
        "destinations": results,
        "any_ok": any_ok,
        "all_ok": all_ok,
        "summary": summary,
    }