From 20d7ecc8c4b98be4ea76fbe5330a7f8145fe02f8 Mon Sep 17 00:00:00 2001 From: admin Date: Mon, 18 May 2026 21:02:14 +0000 Subject: [PATCH] =?UTF-8?q?docs:=20add=20section-by-section=20comments=20?= =?UTF-8?q?=E2=80=94=20repo=5Fcheck.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ns8_backup_monitor/repo_check.py | 270 +++++++++++++++++++++++-------- 1 file changed, 199 insertions(+), 71 deletions(-) diff --git a/ns8_backup_monitor/repo_check.py b/ns8_backup_monitor/repo_check.py index c07b16c..71533d6 100644 --- a/ns8_backup_monitor/repo_check.py +++ b/ns8_backup_monitor/repo_check.py @@ -1,21 +1,31 @@ #!/usr/bin/env python3 -""" -repo_check.py - Verifies reachability and health of NS8 backup repositories. +"""Verify reachability and health of NS8 backup repositories. -For each backup destination configured in the cluster, attempts a -`restic snapshots --last` command to verify the repo is accessible. -Distinguishes between: - - UNREACHABLE: network/mount error, cannot connect at all - - LOCKED: restic repo is locked (previous backup crashed) - - CORRUPTED: repo exists but integrity check fails - - OK: repo is accessible +For each backup destination configured in the NS8 cluster, this module +attempts a ``restic snapshots --last --no-cache`` command to confirm that +the repository is accessible and readable. -Handles NS8 multi-backend credentials: - - local / fs: path only - - S3 / B2: url + AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY or - B2_ACCOUNT_ID / B2_ACCOUNT_KEY from Redis hash - - SFTP: url with sftp: prefix - - rclone: rclone: prefix +Status values returned per destination +--------------------------------------- + OK Repository is reachable and returned a valid response. + UNREACHABLE Network/mount error — cannot connect at all. + LOCKED restic repo is locked (a previous backup crashed mid-run). + CORRUPTED Repository exists but its pack integrity check fails. + ERROR restic reported an error not covered by the above categories. + UNCONFIGURED No URL or path found in the Redis hash for this destination. + UNKNOWN Non-zero exit with unrecognised stderr output. + +NS8 multi-backend credential mapping +-------------------------------------- + local / fs – path only, no credentials. + S3 / AWS – aws_access_key_id + aws_secret_access_key from Redis hash. + B2 – b2_account_id + b2_account_key from Redis hash. + SFTP – URL with sftp: prefix; relies on SSH keys already in place. + rclone – rclone: prefix; RCLONE_CONFIG env var set from Redis hash. + +Dependencies +------------ +Only the standard library. ``restic`` must be in PATH (installed on NS8 nodes). """ import logging @@ -26,90 +36,170 @@ from typing import Optional log = logging.getLogger(__name__) +# --------------------------------------------------------------------------- +# Redis helpers (local copies) +# --------------------------------------------------------------------------- +# Duplicated from correlator.py to keep repo_check.py self-contained and +# avoid a circular import. If the Redis access layer is ever extracted into +# a shared helper, these can be removed. + def _redis_cmd(config: dict, *args) -> str: - socket = config.get("redis", {}).get("socket", "/var/lib/nethserver/cluster/state/redis.sock") + """Run a redis-cli command against the NS8 cluster Redis Unix socket.""" + socket = config.get("redis", {}).get( + "socket", "/var/lib/nethserver/cluster/state/redis.sock" + ) cmd = ["redis-cli", "-s", socket] + list(args) result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) return result.stdout.strip() def _redis_hgetall(config: dict, key: str) -> dict: - """Return all fields of a Redis hash as a dict.""" - socket = config.get("redis", {}).get("socket", "/var/lib/nethserver/cluster/state/redis.sock") + """Return all fields of a Redis hash as a Python dict. + + ``redis-cli HGETALL`` outputs alternating field / value lines; + this function zips consecutive pairs into a dict. + """ + socket = config.get("redis", {}).get( + "socket", "/var/lib/nethserver/cluster/state/redis.sock" + ) cmd = ["redis-cli", "-s", socket, "HGETALL", key] result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) lines = [l for l in result.stdout.strip().splitlines() if l] - # redis-cli HGETALL returns alternating key/value lines + # redis-cli HGETALL returns alternating key/value lines. return dict(zip(lines[::2], lines[1::2])) +# --------------------------------------------------------------------------- +# Destination discovery +# --------------------------------------------------------------------------- + def _get_backup_destinations(config: dict) -> list: - """ - Read all configured backup destinations from NS8 Redis. - Key pattern: cluster/backup_repository//parameters - Returns list of dicts with full repo config. + """Read all configured backup repository destinations from NS8 Redis. + + Key pattern: ``cluster/backup_repository//parameters`` + + Each hash contains the URL/path, password, backend type, and any + cloud-provider credentials needed to invoke restic. + + Args: + config: Parsed configuration dictionary. + + Returns: + List of dicts, one per configured destination: + { + "repo_id" : str, + "url" : str (cloud URL or empty), + "path" : str (local/SFTP path or empty), + "password" : str (restic repo password), + "backend" : str ("s3", "b2", "sftp", "rclone", "local", ...), + "aws_access_key": str (S3 key ID or B2 account ID), + "aws_secret_key": str (S3 secret or B2 account key), + "rclone_config" : str (path or inline rclone config), + "extra_env" : str (optional additional environment variables), + } """ raw = _redis_cmd(config, "KEYS", "cluster/backup_repository/*/parameters") keys = [k for k in raw.splitlines() if k] destinations = [] for key in keys: + # Key format: cluster/backup_repository//parameters parts = key.split("/") repo_id = parts[2] if len(parts) > 2 else "unknown" fields = _redis_hgetall(config, key) destinations.append({ - "repo_id": repo_id, - "url": fields.get("url", ""), - "path": fields.get("path", ""), - "password": fields.get("password", ""), - "backend": fields.get("backend", ""), - # S3 / B2 credentials - "aws_access_key": fields.get("aws_access_key_id", fields.get("b2_account_id", "")), - "aws_secret_key": fields.get("aws_secret_access_key", fields.get("b2_account_key", "")), - # rclone / extra - "rclone_config": fields.get("rclone_config", ""), - "extra_env": fields.get("extra_env", ""), + "repo_id": repo_id, + "url": fields.get("url", ""), + "path": fields.get("path", ""), + "password": fields.get("password", ""), + "backend": fields.get("backend", ""), + # S3 and B2 use different field names in NS8 Redis; + # normalise both to a single aws_access_key / aws_secret_key pair. + "aws_access_key": fields.get("aws_access_key_id", + fields.get("b2_account_id", "")), + "aws_secret_key": fields.get("aws_secret_access_key", + fields.get("b2_account_key", "")), + # rclone and miscellaneous extras + "rclone_config": fields.get("rclone_config", ""), + "extra_env": fields.get("extra_env", ""), }) return destinations +# --------------------------------------------------------------------------- +# Environment builder +# --------------------------------------------------------------------------- + def _build_env(dest: dict) -> dict: - """ - Build the environment dict for restic based on the backend type. - Always inherits from os.environ so system-level creds are preserved. + """Build the environment dict that restic needs based on the backend type. + + Always starts from a copy of ``os.environ`` so that system-level settings + (PATH, HOME, proxy variables, etc.) are preserved. + + Args: + dest: A destination dict as returned by ``_get_backup_destinations()``. + + Returns: + A dict suitable for passing as the ``env`` argument to subprocess.run(). """ env = dict(os.environ) backend = dest.get("backend", "").lower() + # RESTIC_PASSWORD is used by all backends to unlock the repository. if dest.get("password"): env["RESTIC_PASSWORD"] = dest["password"] + # S3 / AWS backend credentials. if backend in ("s3", "aws") and dest.get("aws_access_key"): - env["AWS_ACCESS_KEY_ID"] = dest["aws_access_key"] + env["AWS_ACCESS_KEY_ID"] = dest["aws_access_key"] env["AWS_SECRET_ACCESS_KEY"] = dest["aws_secret_key"] + # Backblaze B2 backend credentials. elif backend in ("b2", "backblaze") and dest.get("aws_access_key"): - env["B2_ACCOUNT_ID"] = dest["aws_access_key"] + env["B2_ACCOUNT_ID"] = dest["aws_access_key"] env["B2_ACCOUNT_KEY"] = dest["aws_secret_key"] + # rclone backend: point restic to the rclone config file. elif backend == "rclone" and dest.get("rclone_config"): env["RCLONE_CONFIG"] = dest["rclone_config"] return env +# --------------------------------------------------------------------------- +# Single-repository check +# --------------------------------------------------------------------------- + def _check_restic_repo(dest: dict, config: dict) -> dict: - """Run restic snapshots --last to verify repo is accessible.""" + """Run ``restic snapshots --last --no-cache`` to verify one repository. + + ``--no-cache`` is intentional: the cache may be stale or missing on the + host, and we want a live check against the actual backend. + + Args: + dest: Destination dict from ``_get_backup_destinations()``. + config: Parsed configuration dictionary (reads ``repo_check.*``). + + Returns: + Dict: {"repo_id": str, "status": str, "error": str} + """ timeout = config.get("repo_check", {}).get("timeout", 60) extra_flags = config.get("repo_check", {}).get("restic_flags", "") + # Prefer ``url`` (cloud backends) over ``path`` (local/SFTP). repo_url = dest.get("url") or dest.get("path") or "" if not repo_url: - return {"repo_id": dest["repo_id"], "status": "UNCONFIGURED", "error": "No URL or path found"} + return { + "repo_id": dest["repo_id"], + "status": "UNCONFIGURED", + "error": "No URL or path found in Redis for this destination", + } cmd = ["restic", "-r", repo_url, "snapshots", "--last", "--no-cache"] if extra_flags: + # Allow the operator to append flags like --cacert or --option + # via the config without modifying the code. cmd += extra_flags.split() env = _build_env(dest) @@ -124,69 +214,107 @@ def _check_restic_repo(dest: dict, config: dict) -> dict: ) stderr = result.stderr.lower() + # --------------------------------------------------------------------------- + # Classify the restic exit code and stderr content + # --------------------------------------------------------------------------- if result.returncode == 0: return {"repo_id": dest["repo_id"], "status": "OK", "error": ""} - elif any(x in stderr for x in ("unable to open config", "no such file", "does not exist", - "connection refused", "network", "timeout", "no route")): - return {"repo_id": dest["repo_id"], "status": "UNREACHABLE", "error": result.stderr.strip()} + + # Network / connectivity errors. + elif any(x in stderr for x in ( + "unable to open config", "no such file", "does not exist", + "connection refused", "network", "timeout", "no route" + )): + return {"repo_id": dest["repo_id"], "status": "UNREACHABLE", + "error": result.stderr.strip()} + + # Repository locked by a previous crashed backup run. elif "locked" in stderr or "lock" in stderr: - return {"repo_id": dest["repo_id"], "status": "LOCKED", "error": result.stderr.strip()} + return {"repo_id": dest["repo_id"], "status": "LOCKED", + "error": result.stderr.strip()} + + # Pack / data integrity error — repository may be corrupted. elif "pack" in stderr and "error" in stderr: - return {"repo_id": dest["repo_id"], "status": "CORRUPTED", "error": result.stderr.strip()} + return {"repo_id": dest["repo_id"], "status": "CORRUPTED", + "error": result.stderr.strip()} + + # Generic restic error not covered by the specific cases above. elif "error" in stderr or "fatal" in stderr: - return {"repo_id": dest["repo_id"], "status": "ERROR", "error": result.stderr.strip()} + return {"repo_id": dest["repo_id"], "status": "ERROR", + "error": result.stderr.strip()} + + # Non-zero exit with unrecognised output. else: - return {"repo_id": dest["repo_id"], "status": "UNKNOWN", "error": result.stderr.strip()} + return {"repo_id": dest["repo_id"], "status": "UNKNOWN", + "error": result.stderr.strip()} except subprocess.TimeoutExpired: - return {"repo_id": dest["repo_id"], "status": "UNREACHABLE", "error": f"Timeout after {timeout}s"} + return {"repo_id": dest["repo_id"], "status": "UNREACHABLE", + "error": f"Timeout after {timeout}s"} except FileNotFoundError: - return {"repo_id": dest["repo_id"], "status": "ERROR", "error": "restic binary not found in PATH"} + # restic binary is not installed or not in PATH. + return {"repo_id": dest["repo_id"], "status": "ERROR", + "error": "restic binary not found in PATH"} except Exception as e: return {"repo_id": dest["repo_id"], "status": "ERROR", "error": str(e)} +# --------------------------------------------------------------------------- +# Main entry point +# --------------------------------------------------------------------------- + def check_repositories(config: dict, correlation: dict) -> dict: - """ - Main entry point for repository check. + """Check all configured NS8 backup destinations and return a status summary. + + Called by the pipeline only when the correlator outcome is not SUCCESS, + so restic network calls are avoided on clean backup runs. + + Args: + config: Parsed configuration dictionary. + correlation: Correlation result dict (from correlator.py); currently + unused but kept for future filtering by plan/module. Returns: - { - "destinations": [ - {"repo_id": ..., "status": OK|UNREACHABLE|LOCKED|CORRUPTED|ERROR, "error": ...}, - ... - ], - "any_unreachable": bool, - "any_locked": bool, - "all_ok": bool, - } + A dict with the following keys: + + destinations : list of per-repo result dicts (see _check_restic_repo) + any_unreachable : bool — True if at least one repo is UNREACHABLE + any_locked : bool — True if at least one repo is LOCKED + all_ok : bool — True only if every repo returned OK + note : optional str present when no destinations are configured """ destinations = _get_backup_destinations(config) if not destinations: log.warning("No backup destinations found in Redis") return { - "destinations": [], + "destinations": [], "any_unreachable": True, - "any_locked": False, - "all_ok": False, - "note": "No backup destinations configured or readable from Redis" + "any_locked": False, + "all_ok": False, + "note": "No backup destinations configured or readable from Redis", } results = [] for dest in destinations: - log.info(f"Checking repository {dest['repo_id']} (backend={dest.get('backend','unknown')})...") + log.info( + f"Checking repository {dest['repo_id']} " + f"(backend={dest.get('backend', 'unknown')})..." + ) res = _check_restic_repo(dest, config) log.info(f" -> {res['status']}: {res.get('error', '')}") results.append(res) + # --------------------------------------------------------------------------- + # Aggregate flags for quick consumption by the notifier + # --------------------------------------------------------------------------- any_unreachable = any(r["status"] == "UNREACHABLE" for r in results) - any_locked = any(r["status"] == "LOCKED" for r in results) - all_ok = all(r["status"] == "OK" for r in results) + any_locked = any(r["status"] == "LOCKED" for r in results) + all_ok = all(r["status"] == "OK" for r in results) return { - "destinations": results, + "destinations": results, "any_unreachable": any_unreachable, - "any_locked": any_locked, - "all_ok": all_ok, + "any_locked": any_locked, + "all_ok": all_ok, }