Files
ns8-backup-monitor/ns8_backup_monitor/repo_check.py
T

193 lines
7.2 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
repo_check.py - Verifies reachability and health of NS8 backup repositories.
For each backup destination configured in the cluster, attempts a
`restic snapshots --last` command to verify the repo is accessible.
Distinguishes between:
- UNREACHABLE: network/mount error, cannot connect at all
- LOCKED: restic repo is locked (previous backup crashed)
- CORRUPTED: repo exists but restic reports pack errors while reading it
- OK: repo is accessible
Any other failure is reported as UNCONFIGURED (no URL or path), ERROR or UNKNOWN.
Handles NS8 multi-backend credentials:
- local / fs: path only
- S3 / B2: url + AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY or
B2_ACCOUNT_ID / B2_ACCOUNT_KEY from Redis hash
- SFTP: url with sftp: prefix
- rclone: rclone: prefix
"""
import logging
import os
import subprocess
from typing import Optional
log = logging.getLogger(__name__)
def _redis_cmd(config: dict, *args) -> str:
    """
    Run one redis-cli command over the configured Unix socket.

    The socket path comes from config["redis"]["socket"]; when that is not
    set, the built-in default path below is used. The positional *args are
    appended to the redis-cli command line unchanged.

    Returns the command's stdout with surrounding whitespace stripped.
    The exit status of redis-cli is not inspected. subprocess.run may raise
    (e.g. TimeoutExpired after 10 seconds) and that is not caught here.
    """
    redis_settings = config.get("redis", {})
    socket_path = redis_settings.get("socket", "/var/lib/nethserver/cluster/state/redis.sock")
    argv = ["redis-cli", "-s", socket_path, *args]
    completed = subprocess.run(argv, capture_output=True, text=True, timeout=10)
    return completed.stdout.strip()
def _redis_hgetall(config: dict, key: str) -> dict:
    """
    Return all fields of a Redis hash as a dict.

    Runs `redis-cli -s <socket> HGETALL <key>` and pairs up the output
    lines, which alternate between field name and field value.

    Fields whose value is the empty string are preserved as "". An empty
    hash, or any output with a dangling unpaired line, yields only the
    complete pairs (an empty dict when there are none).

    Known limitation: a value that itself contains a newline cannot be
    told apart from two separate lines and will misalign the pairing.

    subprocess.run may raise (e.g. TimeoutExpired after 10 seconds); that
    is not caught here.
    """
    socket = config.get("redis", {}).get("socket", "/var/lib/nethserver/cluster/state/redis.sock")
    cmd = ["redis-cli", "-s", socket, "HGETALL", key]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    # Do NOT strip the output or drop blank lines: a blank line is how an
    # empty field value is printed, and removing it would shift every
    # following field name into a value slot (and vice versa).
    # splitlines() already ignores the single trailing newline.
    lines = result.stdout.splitlines()
    # zip() stops at the shorter sequence, so an unpaired trailing line
    # (e.g. the lone blank line printed for an empty hash) is ignored.
    return dict(zip(lines[::2], lines[1::2]))
def _get_backup_destinations(config: dict) -> list:
    """
    Read all configured backup destinations from NS8 Redis.

    Key pattern: cluster/backup_repository/<repo_id>/parameters

    Each matching key is read with HGETALL and normalised into a dict with
    the keys repo_id, url, path, password, backend, aws_access_key,
    aws_secret_key, rclone_config and extra_env. Fields missing from the
    hash default to "".

    The aws_access_key / aws_secret_key entries carry either the S3
    credentials or, when those are absent or empty, the B2 account id/key.

    Returns a list of such dicts (empty when no key matches).
    """
    raw = _redis_cmd(config, "KEYS", "cluster/backup_repository/*/parameters")
    destinations = []
    for key in raw.splitlines():
        if not key:
            continue
        # The repo id is the third path component of the key.
        parts = key.split("/")
        repo_id = parts[2] if len(parts) > 2 else "unknown"
        fields = _redis_hgetall(config, key)
        destinations.append({
            "repo_id": repo_id,
            "url": fields.get("url", ""),
            "path": fields.get("path", ""),
            "password": fields.get("password", ""),
            "backend": fields.get("backend", ""),
            # S3 / B2 credentials. `or` (rather than a .get() default) so
            # that an S3 field that exists but is empty still falls back
            # to the B2 field.
            "aws_access_key": fields.get("aws_access_key_id") or fields.get("b2_account_id", ""),
            "aws_secret_key": fields.get("aws_secret_access_key") or fields.get("b2_account_key", ""),
            # rclone / extra
            "rclone_config": fields.get("rclone_config", ""),
            "extra_env": fields.get("extra_env", ""),
        })
    return destinations
def _build_env(dest: dict) -> dict:
    """
    Build the environment dict for restic based on the backend type.

    Always starts from a copy of os.environ so system-level creds are
    preserved; os.environ itself is never modified.

    dest keys read: backend, url, password, aws_access_key,
    aws_secret_key, rclone_config. All are optional.

    - password          -> RESTIC_PASSWORD
    - backend s3 / aws  -> AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
    - backend b2 / backblaze -> B2_ACCOUNT_ID / B2_ACCOUNT_KEY
    - backend rclone    -> RCLONE_CONFIG

    When the backend field is empty, the backend is inferred from the
    scheme prefix of the repository URL (s3:, b2:, rclone:). An explicit
    backend value always wins over the URL.

    Returns the new environment dict.
    """
    env = dict(os.environ)
    # `or ""` guards against an explicit None as well as a missing key.
    backend = (dest.get("backend") or "").lower()
    if not backend:
        # No backend recorded: fall back to the "<scheme>:" prefix of the
        # repository URL so credentials are still exported for it.
        scheme, sep, _rest = (dest.get("url") or "").partition(":")
        if sep and scheme.lower() in ("s3", "b2", "rclone"):
            backend = scheme.lower()

    if dest.get("password"):
        env["RESTIC_PASSWORD"] = dest["password"]

    # The same two dest entries hold either the S3 or the B2 key pair.
    access_key = dest.get("aws_access_key")
    # Environment values must be strings, so a missing/None secret
    # becomes "" instead of raising KeyError.
    secret_key = dest.get("aws_secret_key") or ""

    if backend in ("s3", "aws") and access_key:
        env["AWS_ACCESS_KEY_ID"] = access_key
        env["AWS_SECRET_ACCESS_KEY"] = secret_key
    elif backend in ("b2", "backblaze") and access_key:
        env["B2_ACCOUNT_ID"] = access_key
        env["B2_ACCOUNT_KEY"] = secret_key
    elif backend == "rclone" and dest.get("rclone_config"):
        env["RCLONE_CONFIG"] = dest["rclone_config"]
    return env
def _classify_restic_failure(returncode: int, stderr: str) -> str:
    """Map a failed restic run (non-zero exit) to a status string."""
    # Recent restic releases report these two conditions through dedicated
    # exit codes (10: repository does not exist, 11: failed to lock
    # repository), which is more reliable than matching message text.
    # Older releases exit with 1 and fall through to the text heuristics.
    if returncode == 10:
        return "UNREACHABLE"
    if returncode == 11:
        return "LOCKED"

    text = stderr.lower()
    unreachable_markers = (
        "unable to open config", "no such file", "does not exist",
        "connection refused", "network", "timeout", "no route",
    )
    if any(marker in text for marker in unreachable_markers):
        return "UNREACHABLE"
    if "lock" in text:  # also covers "locked"
        return "LOCKED"
    if "pack" in text and "error" in text:
        return "CORRUPTED"
    if "error" in text or "fatal" in text:
        return "ERROR"
    return "UNKNOWN"


def _check_restic_repo(dest: dict, config: dict) -> dict:
    """
    Run restic snapshots --last to verify repo is accessible.

    dest must contain "repo_id" and should contain "url" or "path" (url
    wins when both are set). config["repo_check"] may provide:
    - timeout: seconds to wait for restic (default 60)
    - restic_flags: extra command-line flags, either a shell-style string
      (quotes are honoured) or a list of arguments

    Returns a dict {"repo_id", "status", "error"} where status is one of
    OK, UNCONFIGURED, UNREACHABLE, LOCKED, CORRUPTED, ERROR or UNKNOWN and
    error is the stripped restic stderr or a short explanation ("" on OK).
    Failures to launch restic are reported through the status rather than
    raised.
    """
    # Local import: shlex is only needed here and the module's import
    # block is left untouched.
    import shlex

    repo_id = dest["repo_id"]

    def outcome(status: str, error: str = "") -> dict:
        return {"repo_id": repo_id, "status": status, "error": error}

    settings = config.get("repo_check", {})
    timeout = settings.get("timeout", 60)
    extra_flags = settings.get("restic_flags", "")

    repo_url = dest.get("url") or dest.get("path") or ""
    if not repo_url:
        return outcome("UNCONFIGURED", "No URL or path found")

    # NOTE(review): newer restic releases document `--latest N` in place of
    # `--last`; confirm against the restic version deployed on the cluster.
    cmd = ["restic", "-r", repo_url, "snapshots", "--last", "--no-cache"]
    if extra_flags:
        try:
            if isinstance(extra_flags, str):
                # shlex keeps quoted arguments together, unlike str.split().
                cmd += shlex.split(extra_flags)
            else:
                cmd += [str(flag) for flag in extra_flags]
        except (ValueError, TypeError) as e:
            return outcome("ERROR", f"Invalid restic_flags: {e}")

    env = _build_env(dest)
    try:
        result = subprocess.run(
            cmd,
            # Never let restic wait on an interactive password prompt: with
            # an inherited stdin a missing password would block until the
            # timeout and be misreported as UNREACHABLE.
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            timeout=timeout,
            env=env,
        )
    except subprocess.TimeoutExpired:
        return outcome("UNREACHABLE", f"Timeout after {timeout}s")
    except FileNotFoundError:
        return outcome("ERROR", "restic binary not found in PATH")
    except Exception as e:
        # Best-effort check: any other launch failure becomes a status.
        return outcome("ERROR", str(e))

    if result.returncode == 0:
        return outcome("OK")
    status = _classify_restic_failure(result.returncode, result.stderr)
    return outcome(status, result.stderr.strip())
def check_repositories(config: dict, correlation: dict) -> dict:
    """
    Main entry point for repository check.

    Reads the backup destinations from Redis and checks each one with
    restic, logging the outcome per repository.

    Args:
        config: monitor configuration; passed on to the Redis and restic
            helpers.
        correlation: accepted for interface compatibility; not used by
            this function.

    Returns:
    {
    "destinations": [
    {"repo_id": ..., "status": OK|UNCONFIGURED|UNREACHABLE|LOCKED|CORRUPTED|ERROR|UNKNOWN, "error": ...},
    ...
    ],
    "any_unreachable": bool,
    "any_locked": bool,
    "all_ok": bool,
    }
    When no destination could be found or read, "destinations" is empty,
    "any_unreachable" is True, "all_ok" is False and an explanatory "note"
    key is added.
    """
    try:
        destinations = _get_backup_destinations(config)
    except (OSError, subprocess.SubprocessError) as exc:
        # redis-cli missing, not executable or timed out: report it the
        # same way as "nothing readable from Redis" instead of letting the
        # whole check crash.
        log.error("Could not read backup destinations from Redis: %s", exc)
        destinations = []

    if not destinations:
        log.warning("No backup destinations found in Redis")
        return {
            "destinations": [],
            "any_unreachable": True,
            "any_locked": False,
            "all_ok": False,
            "note": "No backup destinations configured or readable from Redis"
        }

    results = []
    for dest in destinations:
        log.info(
            "Checking repository %s (backend=%s)...",
            dest["repo_id"], dest.get("backend", "unknown"),
        )
        res = _check_restic_repo(dest, config)
        log.info(" -> %s: %s", res["status"], res.get("error", ""))
        results.append(res)

    statuses = [r["status"] for r in results]
    return {
        "destinations": results,
        "any_unreachable": "UNREACHABLE" in statuses,
        "any_locked": "LOCKED" in statuses,
        "all_ok": all(status == "OK" for status in statuses),
    }