From b71e2090763a0d3263adbaa45a33aad1a0464b17 Mon Sep 17 00:00:00 2001 From: admin Date: Mon, 18 May 2026 21:55:26 +0000 Subject: [PATCH] docs: add per-status docs, backend mapping, runagent rationale; improve log messages --- ns8_backup_monitor/repo_check.py | 202 ++++++++++++++++--------------- 1 file changed, 103 insertions(+), 99 deletions(-) diff --git a/ns8_backup_monitor/repo_check.py b/ns8_backup_monitor/repo_check.py index 71533d6..a40a5d5 100644 --- a/ns8_backup_monitor/repo_check.py +++ b/ns8_backup_monitor/repo_check.py @@ -2,30 +2,40 @@ """Verify reachability and health of NS8 backup repositories. For each backup destination configured in the NS8 cluster, this module -attempts a ``restic snapshots --last --no-cache`` command to confirm that -the repository is accessible and readable. +invokes ``restic snapshots --last --no-cache`` to confirm that the +repository is accessible and readable. Status values returned per destination --------------------------------------- - OK Repository is reachable and returned a valid response. - UNREACHABLE Network/mount error — cannot connect at all. - LOCKED restic repo is locked (a previous backup crashed mid-run). - CORRUPTED Repository exists but its pack integrity check fails. - ERROR restic reported an error not covered by the above categories. - UNCONFIGURED No URL or path found in the Redis hash for this destination. - UNKNOWN Non-zero exit with unrecognised stderr output. + OK - Repository is reachable and returned a valid response. + UNREACHABLE - Network or mount error; cannot connect at all. + LOCKED - restic repository is locked (a previous backup crashed mid-run). + CORRUPTED - Repository exists but its pack integrity check fails. + ERROR - restic reported an error not covered by the above categories. + UNCONFIGURED - No URL or path found in Redis for this destination. + UNKNOWN - Non-zero exit with unrecognised stderr output. NS8 multi-backend credential mapping -------------------------------------- - local / fs – path only, no credentials. - S3 / AWS – aws_access_key_id + aws_secret_access_key from Redis hash. - B2 – b2_account_id + b2_account_key from Redis hash. - SFTP – URL with sftp: prefix; relies on SSH keys already in place. - rclone – rclone: prefix; RCLONE_CONFIG env var set from Redis hash. + local / fs - path only, no credentials. + S3 / AWS - AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY from Redis hash. + B2 - B2_ACCOUNT_ID + B2_ACCOUNT_KEY from Redis hash. + SFTP - sftp: URL prefix; relies on SSH keys already deployed. + rclone - rclone: URL prefix; RCLONE_CONFIG env var from Redis hash. + +Why ``runagent`` is NOT used here +----------------------------------- +restic is invoked directly rather than through ``runagent`` because repo_check +runs on the cluster leader and reads repository credentials from the cluster +Redis. The restic binary is available system-wide on NS8 nodes (not inside a +module container), so a direct subprocess call is both simpler and correct. +``runagent`` is used by other NS8 scripts to run commands inside rootless +Podman module containers - that indirection is not needed here. Dependencies ------------ -Only the standard library. ``restic`` must be in PATH (installed on NS8 nodes). +Only the standard library. ``restic`` must be present in PATH (installed with +NS8 or manually on the leader node). """ import logging @@ -37,11 +47,11 @@ log = logging.getLogger(__name__) # --------------------------------------------------------------------------- -# Redis helpers (local copies) +# Redis helpers # --------------------------------------------------------------------------- -# Duplicated from correlator.py to keep repo_check.py self-contained and -# avoid a circular import. If the Redis access layer is ever extracted into -# a shared helper, these can be removed. +# These are local copies of the helpers defined in correlator.py, kept here +# to make repo_check.py self-contained and avoid a circular import. If the +# Redis access layer is extracted into utils.py in the future, remove these. def _redis_cmd(config: dict, *args) -> str: """Run a redis-cli command against the NS8 cluster Redis Unix socket.""" @@ -65,7 +75,6 @@ def _redis_hgetall(config: dict, key: str) -> dict: cmd = ["redis-cli", "-s", socket, "HGETALL", key] result = subprocess.run(cmd, capture_output=True, text=True, timeout=10) lines = [l for l in result.stdout.strip().splitlines() if l] - # redis-cli HGETALL returns alternating key/value lines. return dict(zip(lines[::2], lines[1::2])) @@ -85,17 +94,18 @@ def _get_backup_destinations(config: dict) -> list: config: Parsed configuration dictionary. Returns: - List of dicts, one per configured destination: + List of dicts, one per configured destination:: + { "repo_id" : str, - "url" : str (cloud URL or empty), - "path" : str (local/SFTP path or empty), - "password" : str (restic repo password), + "url" : str (cloud URL or empty for local), + "path" : str (local/SFTP path or empty for cloud), + "password" : str (restic repository password), "backend" : str ("s3", "b2", "sftp", "rclone", "local", ...), - "aws_access_key": str (S3 key ID or B2 account ID), - "aws_secret_key": str (S3 secret or B2 account key), - "rclone_config" : str (path or inline rclone config), - "extra_env" : str (optional additional environment variables), + "aws_access_key": str (S3 key ID or B2 account ID, normalised), + "aws_secret_key": str (S3 secret or B2 account key, normalised), + "rclone_config" : str (path to rclone config or empty), + "extra_env" : str (optional extra environment variables), } """ raw = _redis_cmd(config, "KEYS", "cluster/backup_repository/*/parameters") @@ -108,20 +118,20 @@ def _get_backup_destinations(config: dict) -> list: repo_id = parts[2] if len(parts) > 2 else "unknown" fields = _redis_hgetall(config, key) destinations.append({ - "repo_id": repo_id, - "url": fields.get("url", ""), - "path": fields.get("path", ""), - "password": fields.get("password", ""), - "backend": fields.get("backend", ""), - # S3 and B2 use different field names in NS8 Redis; - # normalise both to a single aws_access_key / aws_secret_key pair. + "repo_id": repo_id, + "url": fields.get("url", ""), + "path": fields.get("path", ""), + "password": fields.get("password", ""), + "backend": fields.get("backend", ""), + # S3 and B2 use different field names in NS8 Redis; normalise both + # to a single aws_access_key / aws_secret_key pair so _build_env() + # can handle them uniformly. "aws_access_key": fields.get("aws_access_key_id", fields.get("b2_account_id", "")), "aws_secret_key": fields.get("aws_secret_access_key", fields.get("b2_account_key", "")), - # rclone and miscellaneous extras - "rclone_config": fields.get("rclone_config", ""), - "extra_env": fields.get("extra_env", ""), + "rclone_config": fields.get("rclone_config", ""), + "extra_env": fields.get("extra_env", ""), }) return destinations @@ -132,30 +142,30 @@ def _get_backup_destinations(config: dict) -> list: # --------------------------------------------------------------------------- def _build_env(dest: dict) -> dict: - """Build the environment dict that restic needs based on the backend type. + """Build the environment dict that restic needs for a given backend. - Always starts from a copy of ``os.environ`` so that system-level settings - (PATH, HOME, proxy variables, etc.) are preserved. + Always starts from a copy of ``os.environ`` so system-level settings + (PATH, HOME, proxy variables, etc.) are inherited. Args: dest: A destination dict as returned by ``_get_backup_destinations()``. Returns: - A dict suitable for passing as the ``env`` argument to subprocess.run(). + A dict suitable for the ``env`` argument of ``subprocess.run()``. """ env = dict(os.environ) backend = dest.get("backend", "").lower() - # RESTIC_PASSWORD is used by all backends to unlock the repository. + # RESTIC_PASSWORD unlocks the repository for all backends. if dest.get("password"): env["RESTIC_PASSWORD"] = dest["password"] - # S3 / AWS backend credentials. + # S3 / AWS backend. if backend in ("s3", "aws") and dest.get("aws_access_key"): env["AWS_ACCESS_KEY_ID"] = dest["aws_access_key"] env["AWS_SECRET_ACCESS_KEY"] = dest["aws_secret_key"] - # Backblaze B2 backend credentials. + # Backblaze B2 backend. elif backend in ("b2", "backblaze") and dest.get("aws_access_key"): env["B2_ACCOUNT_ID"] = dest["aws_access_key"] env["B2_ACCOUNT_KEY"] = dest["aws_secret_key"] @@ -168,26 +178,26 @@ def _build_env(dest: dict) -> dict: # --------------------------------------------------------------------------- -# Single-repository check +# Single-repository health check # --------------------------------------------------------------------------- def _check_restic_repo(dest: dict, config: dict) -> dict: """Run ``restic snapshots --last --no-cache`` to verify one repository. - ``--no-cache`` is intentional: the cache may be stale or missing on the - host, and we want a live check against the actual backend. + ``--no-cache`` is intentional: the local cache may be stale or missing + on the cluster leader, and we always want a live check against the backend. Args: dest: Destination dict from ``_get_backup_destinations()``. config: Parsed configuration dictionary (reads ``repo_check.*``). Returns: - Dict: {"repo_id": str, "status": str, "error": str} + Dict with keys: ``repo_id``, ``status``, ``error``. """ timeout = config.get("repo_check", {}).get("timeout", 60) extra_flags = config.get("repo_check", {}).get("restic_flags", "") - # Prefer ``url`` (cloud backends) over ``path`` (local/SFTP). + # Prefer ``url`` (cloud backends) over ``path`` (local / SFTP). repo_url = dest.get("url") or dest.get("path") or "" if not repo_url: return { @@ -198,8 +208,8 @@ def _check_restic_repo(dest: dict, config: dict) -> dict: cmd = ["restic", "-r", repo_url, "snapshots", "--last", "--no-cache"] if extra_flags: - # Allow the operator to append flags like --cacert or --option - # via the config without modifying the code. + # Allow operators to append flags (e.g. --cacert, --option) via the + # config file without modifying the source code. cmd += extra_flags.split() env = _build_env(dest) @@ -210,53 +220,47 @@ def _check_restic_repo(dest: dict, config: dict) -> dict: capture_output=True, text=True, timeout=timeout, - env=env + env=env, ) stderr = result.stderr.lower() - # --------------------------------------------------------------------------- - # Classify the restic exit code and stderr content - # --------------------------------------------------------------------------- if result.returncode == 0: return {"repo_id": dest["repo_id"], "status": "OK", "error": ""} # Network / connectivity errors. elif any(x in stderr for x in ( "unable to open config", "no such file", "does not exist", - "connection refused", "network", "timeout", "no route" + "connection refused", "network", "timeout", "no route", )): return {"repo_id": dest["repo_id"], "status": "UNREACHABLE", "error": result.stderr.strip()} # Repository locked by a previous crashed backup run. + # Run ``restic unlock`` manually to recover. elif "locked" in stderr or "lock" in stderr: return {"repo_id": dest["repo_id"], "status": "LOCKED", "error": result.stderr.strip()} - # Pack / data integrity error — repository may be corrupted. + # Pack / data integrity error - repository may be corrupted. elif "pack" in stderr and "error" in stderr: return {"repo_id": dest["repo_id"], "status": "CORRUPTED", "error": result.stderr.strip()} - # Generic restic error not covered by the specific cases above. + # Generic restic error. elif "error" in stderr or "fatal" in stderr: return {"repo_id": dest["repo_id"], "status": "ERROR", "error": result.stderr.strip()} - # Non-zero exit with unrecognised output. else: return {"repo_id": dest["repo_id"], "status": "UNKNOWN", "error": result.stderr.strip()} except subprocess.TimeoutExpired: return {"repo_id": dest["repo_id"], "status": "UNREACHABLE", - "error": f"Timeout after {timeout}s"} + "error": f"restic timed out after {timeout}s"} except FileNotFoundError: - # restic binary is not installed or not in PATH. return {"repo_id": dest["repo_id"], "status": "ERROR", - "error": "restic binary not found in PATH"} - except Exception as e: - return {"repo_id": dest["repo_id"], "status": "ERROR", "error": str(e)} + "error": "restic not found in PATH - install restic on the cluster leader"} # --------------------------------------------------------------------------- @@ -264,57 +268,57 @@ def _check_restic_repo(dest: dict, config: dict) -> dict: # --------------------------------------------------------------------------- def check_repositories(config: dict, correlation: dict) -> dict: - """Check all configured NS8 backup destinations and return a status summary. + """Check all configured NS8 backup repositories and return a status summary. - Called by the pipeline only when the correlator outcome is not SUCCESS, - so restic network calls are avoided on clean backup runs. + Called only when the correlator outcome is not SUCCESS, to avoid + unnecessary restic network calls on healthy clusters. Args: config: Parsed configuration dictionary. - correlation: Correlation result dict (from correlator.py); currently - unused but kept for future filtering by plan/module. + correlation: Output dict from ``correlate_backup_status()`` (used for + context logging only; not read for repository selection). Returns: - A dict with the following keys: + Dict with keys: - destinations : list of per-repo result dicts (see _check_restic_repo) - any_unreachable : bool — True if at least one repo is UNREACHABLE - any_locked : bool — True if at least one repo is LOCKED - all_ok : bool — True only if every repo returned OK - note : optional str present when no destinations are configured + destinations : list of per-destination result dicts + (each has "repo_id", "status", "error") + any_ok : bool - True if at least one destination is reachable + all_ok : bool - True if all destinations are OK + summary : human-readable one-line summary string """ destinations = _get_backup_destinations(config) if not destinations: - log.warning("No backup destinations found in Redis") + log.warning("No backup_repository keys found in Redis") return { - "destinations": [], - "any_unreachable": True, - "any_locked": False, - "all_ok": False, - "note": "No backup destinations configured or readable from Redis", + "destinations": [], + "any_ok": False, + "all_ok": False, + "summary": "No backup repositories configured in NS8", } results = [] for dest in destinations: - log.info( - f"Checking repository {dest['repo_id']} " - f"(backend={dest.get('backend', 'unknown')})..." - ) - res = _check_restic_repo(dest, config) - log.info(f" -> {res['status']}: {res.get('error', '')}") - results.append(res) + log.info("Checking repository repo_id=%s url=%s", + dest["repo_id"], dest.get("url") or dest.get("path") or "(empty)") + result = _check_restic_repo(dest, config) + log.info(" -> %s", result["status"]) + results.append(result) - # --------------------------------------------------------------------------- - # Aggregate flags for quick consumption by the notifier - # --------------------------------------------------------------------------- - any_unreachable = any(r["status"] == "UNREACHABLE" for r in results) - any_locked = any(r["status"] == "LOCKED" for r in results) - all_ok = all(r["status"] == "OK" for r in results) + ok_count = sum(1 for r in results if r["status"] == "OK") + all_ok = ok_count == len(results) + any_ok = ok_count > 0 + + summary = ( + f"{ok_count}/{len(results)} repositories OK" + if not all_ok + else "All repositories reachable" + ) return { - "destinations": results, - "any_unreachable": any_unreachable, - "any_locked": any_locked, - "all_ok": all_ok, + "destinations": results, + "any_ok": any_ok, + "all_ok": all_ok, + "summary": summary, }