"""
repo_check.py - Verifies reachability and health of NS8 backup repositories.

For each backup destination configured in the cluster, attempts a
`restic snapshots --last` command to verify the repo is accessible.
Distinguishes between:
  - UNREACHABLE: network/mount error, cannot connect at all
  - LOCKED: restic repo is locked (previous backup crashed)
  - CORRUPTED: repo exists but integrity check fails
  - OK: repo is accessible

Handles NS8 multi-backend credentials:
  - local / fs: path only
  - S3 / B2: url + AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY or
    B2_ACCOUNT_ID / B2_ACCOUNT_KEY from Redis hash
  - SFTP: url with sftp: prefix
  - rclone: rclone: prefix
"""

import logging
import os
import subprocess
from typing import Optional

# NOTE(review): this text was recovered from a whitespace-collapsed unified
# diff. `_redis_cmd`, `_check_restic_repo` and `check_repositories` are only
# partially present in the diff hunks and are deliberately not reproduced
# here; take them from the real file.

# Socket used when config["redis"]["socket"] is not set.
_DEFAULT_REDIS_SOCKET = "/var/lib/nethserver/cluster/state/redis.sock"

# Backend name -> (key-id variable, secret variable) exported for restic.
_CREDENTIAL_VARS = {
    "s3": ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"),
    "aws": ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"),
    "b2": ("B2_ACCOUNT_ID", "B2_ACCOUNT_KEY"),
    "backblaze": ("B2_ACCOUNT_ID", "B2_ACCOUNT_KEY"),
}

# Repository URL scheme -> backend name, used only when "backend" is blank.
_URL_SCHEME_BACKENDS = (
    ("s3:", "s3"),
    ("b2:", "b2"),
    ("rclone:", "rclone"),
)


def _parse_hgetall_output(raw: str) -> dict:
    """
    Turn the line-oriented output of `redis-cli HGETALL` into a dict.

    The output alternates field-name and value lines. Blank lines are kept:
    a blank line in a value position is an empty value, and dropping it would
    shift every following name/value pair by one. A trailing field name with
    no value line is ignored.

    NOTE(review): a value that itself contains a newline cannot be told apart
    from two lines in this format.
    """
    lines = raw.splitlines()
    return dict(zip(lines[::2], lines[1::2]))


def _redis_hgetall(config: dict, key: str) -> dict:
    """
    Return all fields of a Redis hash as a dict.

    Runs `redis-cli -s <socket> HGETALL <key>` with a 10 second timeout. The
    socket path comes from config["redis"]["socket"] and falls back to the
    default cluster socket.

    Raises:
        subprocess.TimeoutExpired: redis-cli did not finish within 10 seconds.
        OSError: redis-cli could not be started (e.g. not installed).
    """
    socket = config.get("redis", {}).get("socket", _DEFAULT_REDIS_SOCKET)
    cmd = ["redis-cli", "-s", socket, "HGETALL", key]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    return _parse_hgetall_output(result.stdout)


def _destination_from_fields(repo_id: str, fields: dict) -> dict:
    """
    Build one destination record from the fields of a repository hash.

    Missing fields become "". The access key and secret fall back to the B2
    field names when the AWS ones are missing *or empty*, so a blank
    `aws_access_key_id` does not hide a populated `b2_account_id`.
    """
    return {
        "repo_id": repo_id,
        "url": fields.get("url", ""),
        "path": fields.get("path", ""),
        "password": fields.get("password", ""),
        "backend": fields.get("backend", ""),
        # S3 / B2 credentials share one pair of keys; _build_env decides
        # which environment variables they are exported as.
        "aws_access_key": (
            fields.get("aws_access_key_id") or fields.get("b2_account_id", "")
        ),
        "aws_secret_key": (
            fields.get("aws_secret_access_key") or fields.get("b2_account_key", "")
        ),
        # rclone / extra
        "rclone_config": fields.get("rclone_config", ""),
        # NOTE(review): extra_env is loaded but nothing visible consumes it;
        # its format is not shown, so it is passed through untouched.
        "extra_env": fields.get("extra_env", ""),
    }


def _get_backup_destinations(config: dict) -> list:
    """
    Read all configured backup destinations from NS8 Redis.

    Key pattern: cluster/backup_repository/<repo_id>/parameters
    Returns list of dicts with full repo config (see
    _destination_from_fields for the keys).

    NOTE(review): two lines of the original body fall in a diff hunk gap;
    they are assumed to be only the result-list initialisation.
    """
    raw = _redis_cmd(config, "KEYS", "cluster/backup_repository/*/parameters")

    destinations = []
    for key in raw.splitlines():
        if not key:
            continue
        parts = key.split("/")
        # The repository id is the third path component of the key.
        repo_id = parts[2] if len(parts) > 2 else "unknown"
        destinations.append(
            _destination_from_fields(repo_id, _redis_hgetall(config, key))
        )
    return destinations


def _build_env(dest: dict) -> dict:
    """
    Build the environment dict for restic based on the backend type.

    Always starts from a copy of os.environ so system-level credentials are
    preserved; a variable is only overridden when the destination supplies a
    non-empty value for it.

    The backend comes from dest["backend"] (case-insensitive). When that is
    missing or blank it is inferred from an `s3:`, `b2:` or `rclone:` prefix
    on dest["url"].

    Exports:
      - RESTIC_PASSWORD when a password is set
      - AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY for s3 / aws
      - B2_ACCOUNT_ID / B2_ACCOUNT_KEY for b2 / backblaze
      - RCLONE_CONFIG for rclone when rclone_config is set
    """
    env = dict(os.environ)

    backend = (dest.get("backend") or "").lower()
    if not backend:
        url = (dest.get("url") or "").lower()
        for prefix, inferred in _URL_SCHEME_BACKENDS:
            if url.startswith(prefix):
                backend = inferred
                break

    password = dest.get("password")
    if password:
        env["RESTIC_PASSWORD"] = password

    key_id = dest.get("aws_access_key")
    if backend in _CREDENTIAL_VARS and key_id:
        id_var, secret_var = _CREDENTIAL_VARS[backend]
        env[id_var] = key_id
        # An absent or empty secret must not clobber one inherited from the
        # process environment.
        secret = dest.get("aws_secret_key")
        if secret:
            env[secret_var] = secret
    elif backend == "rclone" and dest.get("rclone_config"):
        env["RCLONE_CONFIG"] = dest["rclone_config"]

    return env