docs: add section-by-section comments — repo_check.py

This commit is contained in:
2026-05-18 21:02:14 +00:00
parent 9366027534
commit 20d7ecc8c4
+199 -71
View File
@@ -1,21 +1,31 @@
#!/usr/bin/env python3
"""Verify reachability and health of NS8 backup repositories.

For each backup destination configured in the NS8 cluster, this module
runs ``restic snapshots --last --no-cache`` to confirm that the
repository is accessible and readable.

Status values returned per destination
---------------------------------------
OK            restic exited with status 0.
UNREACHABLE   Network/mount error or timeout — cannot connect at all.
LOCKED        restic repo is locked (a previous backup crashed mid-run).
CORRUPTED     restic's stderr reports a pack error; the repository may be
              damaged.  (Inferred from the error text — no separate
              integrity check is run.)
ERROR         restic reported an error not covered by the above categories.
UNCONFIGURED  No URL or path found in the Redis hash for this destination.
UNKNOWN       Non-zero exit with unrecognised stderr output.

NS8 multi-backend credential mapping
--------------------------------------
local / fs    path only, no credentials.
S3 / AWS      aws_access_key_id + aws_secret_access_key from Redis hash,
              exported as AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY.
B2            b2_account_id + b2_account_key from Redis hash,
              exported as B2_ACCOUNT_ID / B2_ACCOUNT_KEY.
SFTP          URL with sftp: prefix; relies on SSH keys already in place.
rclone        rclone: prefix; RCLONE_CONFIG env var set from Redis hash.

Dependencies
------------
Only the Python standard library.  The ``restic`` and ``redis-cli``
binaries must be in PATH (installed on NS8 nodes).
"""
import logging
@@ -26,90 +36,170 @@ from typing import Optional
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Redis helpers (local copies)
# ---------------------------------------------------------------------------
# Duplicated from correlator.py to keep repo_check.py self-contained and
# avoid a circular import. If the Redis access layer is ever extracted into
# a shared helper, these can be removed.
def _redis_cmd(config: dict, *args) -> str:
    """Run a redis-cli command against the NS8 cluster Redis Unix socket.

    Args:
        config: Parsed configuration dictionary.  ``config["redis"]["socket"]``
            overrides the default socket path when present.
        *args: Redis command name and arguments, passed verbatim to
            ``redis-cli``.

    Returns:
        The command's standard output with surrounding whitespace stripped.
        A failing command still returns whatever was written to stdout
        (usually an empty string); the failure is only logged.

    Raises:
        subprocess.TimeoutExpired: ``redis-cli`` did not finish within 10 s.
        FileNotFoundError: the ``redis-cli`` binary is not in PATH.
    """
    # Named ``socket_path`` so the stdlib ``socket`` module is not shadowed.
    socket_path = config.get("redis", {}).get(
        "socket", "/var/lib/nethserver/cluster/state/redis.sock"
    )
    cmd = ["redis-cli", "-s", socket_path, *args]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
    if result.returncode != 0:
        # Previously a connection failure was indistinguishable from an
        # empty reply; surface it in the log without changing the return.
        log.warning(
            "redis-cli exited with status %s: %s",
            result.returncode,
            result.stderr.strip(),
        )
    return result.stdout.strip()
def _redis_hgetall(config: dict, key: str) -> dict:
    """Return all fields of a Redis hash as a Python dict.

    ``redis-cli HGETALL`` (non-interactive output) prints alternating
    field / value lines; consecutive pairs are zipped into a dict.

    Args:
        config: Parsed configuration dictionary.  ``config["redis"]["socket"]``
            overrides the default socket path when present.
        key: Name of the Redis hash to read.

    Returns:
        Mapping of field name to value.  Empty when the hash does not exist
        or the command produced no usable output.

    Raises:
        subprocess.TimeoutExpired: ``redis-cli`` did not finish within 10 s.
        FileNotFoundError: the ``redis-cli`` binary is not in PATH.
    """
    socket_path = config.get("redis", {}).get(
        "socket", "/var/lib/nethserver/cluster/state/redis.sock"
    )
    cmd = ["redis-cli", "-s", socket_path, "HGETALL", key]
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)

    # Keep blank lines: an empty hash value is printed as an empty line, and
    # dropping it (or stripping a trailing one) would shift every following
    # field/value pair by one position.
    # NOTE(review): values that themselves contain newlines still cannot be
    # parsed from this line-oriented output.
    lines = result.stdout.splitlines()

    # zip() stops at the shorter slice, so a dangling field name without a
    # value (e.g. the single blank line printed for an empty reply) is
    # ignored.
    return dict(zip(lines[::2], lines[1::2]))
# ---------------------------------------------------------------------------
# Destination discovery
# ---------------------------------------------------------------------------
def _get_backup_destinations(config: dict) -> list:
    """Read all configured backup repository destinations from NS8 Redis.

    Key pattern: ``cluster/backup_repository/<repo_id>/parameters``

    Each matching key is read with ``_redis_hgetall`` and normalised into a
    flat dict; fields missing from the hash default to an empty string.

    Args:
        config: Parsed configuration dictionary.

    Returns:
        List of dicts, one per configured destination:
            {
                "repo_id"       : str,
                "url"           : str  (hash field ``url`` or empty),
                "path"          : str  (hash field ``path`` or empty),
                "password"      : str  (hash field ``password`` or empty),
                "backend"       : str  (hash field ``backend`` or empty),
                "aws_access_key": str  (``aws_access_key_id``, else
                                        ``b2_account_id``),
                "aws_secret_key": str  (``aws_secret_access_key``, else
                                        ``b2_account_key``),
                "rclone_config" : str  (hash field ``rclone_config`` or empty),
                "extra_env"     : str  (hash field ``extra_env`` or empty),
            }
    """
    raw = _redis_cmd(config, "KEYS", "cluster/backup_repository/*/parameters")
    keys = [k for k in raw.splitlines() if k]

    destinations = []
    for key in keys:
        # Key format: cluster/backup_repository/<repo_id>/parameters
        parts = key.split("/")
        repo_id = parts[2] if len(parts) > 2 else "unknown"
        fields = _redis_hgetall(config, key)

        # S3 and B2 use different field names; normalise both to a single
        # aws_access_key / aws_secret_key pair.  ``or`` (rather than a
        # .get() default) makes the B2 fallback apply when the S3 field is
        # present but empty.
        access_key = (
            fields.get("aws_access_key_id") or fields.get("b2_account_id", "")
        )
        secret_key = (
            fields.get("aws_secret_access_key")
            or fields.get("b2_account_key", "")
        )

        destinations.append({
            "repo_id": repo_id,
            "url": fields.get("url", ""),
            "path": fields.get("path", ""),
            "password": fields.get("password", ""),
            "backend": fields.get("backend", ""),
            "aws_access_key": access_key,
            "aws_secret_key": secret_key,
            # rclone and miscellaneous extras
            "rclone_config": fields.get("rclone_config", ""),
            "extra_env": fields.get("extra_env", ""),
        })
    return destinations
# ---------------------------------------------------------------------------
# Environment builder
# ---------------------------------------------------------------------------
def _build_env(dest: dict) -> dict:
"""
Build the environment dict for restic based on the backend type.
Always inherits from os.environ so system-level creds are preserved.
"""Build the environment dict that restic needs based on the backend type.
Always starts from a copy of ``os.environ`` so that system-level settings
(PATH, HOME, proxy variables, etc.) are preserved.
Args:
dest: A destination dict as returned by ``_get_backup_destinations()``.
Returns:
A dict suitable for passing as the ``env`` argument to subprocess.run().
"""
env = dict(os.environ)
backend = dest.get("backend", "").lower()
# RESTIC_PASSWORD is used by all backends to unlock the repository.
if dest.get("password"):
env["RESTIC_PASSWORD"] = dest["password"]
# S3 / AWS backend credentials.
if backend in ("s3", "aws") and dest.get("aws_access_key"):
env["AWS_ACCESS_KEY_ID"] = dest["aws_access_key"]
env["AWS_ACCESS_KEY_ID"] = dest["aws_access_key"]
env["AWS_SECRET_ACCESS_KEY"] = dest["aws_secret_key"]
# Backblaze B2 backend credentials.
elif backend in ("b2", "backblaze") and dest.get("aws_access_key"):
env["B2_ACCOUNT_ID"] = dest["aws_access_key"]
env["B2_ACCOUNT_ID"] = dest["aws_access_key"]
env["B2_ACCOUNT_KEY"] = dest["aws_secret_key"]
# rclone backend: point restic to the rclone config file.
elif backend == "rclone" and dest.get("rclone_config"):
env["RCLONE_CONFIG"] = dest["rclone_config"]
return env
# ---------------------------------------------------------------------------
# Single-repository check
# ---------------------------------------------------------------------------
def _check_restic_repo(dest: dict, config: dict) -> dict:
"""Run restic snapshots --last to verify repo is accessible."""
"""Run ``restic snapshots --last --no-cache`` to verify one repository.
``--no-cache`` is intentional: the cache may be stale or missing on the
host, and we want a live check against the actual backend.
Args:
dest: Destination dict from ``_get_backup_destinations()``.
config: Parsed configuration dictionary (reads ``repo_check.*``).
Returns:
Dict: {"repo_id": str, "status": str, "error": str}
"""
timeout = config.get("repo_check", {}).get("timeout", 60)
extra_flags = config.get("repo_check", {}).get("restic_flags", "")
# Prefer ``url`` (cloud backends) over ``path`` (local/SFTP).
repo_url = dest.get("url") or dest.get("path") or ""
if not repo_url:
return {"repo_id": dest["repo_id"], "status": "UNCONFIGURED", "error": "No URL or path found"}
return {
"repo_id": dest["repo_id"],
"status": "UNCONFIGURED",
"error": "No URL or path found in Redis for this destination",
}
cmd = ["restic", "-r", repo_url, "snapshots", "--last", "--no-cache"]
if extra_flags:
# Allow the operator to append flags like --cacert or --option
# via the config without modifying the code.
cmd += extra_flags.split()
env = _build_env(dest)
@@ -124,69 +214,107 @@ def _check_restic_repo(dest: dict, config: dict) -> dict:
)
stderr = result.stderr.lower()
# ---------------------------------------------------------------------------
# Classify the restic exit code and stderr content
# ---------------------------------------------------------------------------
if result.returncode == 0:
return {"repo_id": dest["repo_id"], "status": "OK", "error": ""}
elif any(x in stderr for x in ("unable to open config", "no such file", "does not exist",
"connection refused", "network", "timeout", "no route")):
return {"repo_id": dest["repo_id"], "status": "UNREACHABLE", "error": result.stderr.strip()}
# Network / connectivity errors.
elif any(x in stderr for x in (
"unable to open config", "no such file", "does not exist",
"connection refused", "network", "timeout", "no route"
)):
return {"repo_id": dest["repo_id"], "status": "UNREACHABLE",
"error": result.stderr.strip()}
# Repository locked by a previous crashed backup run.
elif "locked" in stderr or "lock" in stderr:
return {"repo_id": dest["repo_id"], "status": "LOCKED", "error": result.stderr.strip()}
return {"repo_id": dest["repo_id"], "status": "LOCKED",
"error": result.stderr.strip()}
# Pack / data integrity error — repository may be corrupted.
elif "pack" in stderr and "error" in stderr:
return {"repo_id": dest["repo_id"], "status": "CORRUPTED", "error": result.stderr.strip()}
return {"repo_id": dest["repo_id"], "status": "CORRUPTED",
"error": result.stderr.strip()}
# Generic restic error not covered by the specific cases above.
elif "error" in stderr or "fatal" in stderr:
return {"repo_id": dest["repo_id"], "status": "ERROR", "error": result.stderr.strip()}
return {"repo_id": dest["repo_id"], "status": "ERROR",
"error": result.stderr.strip()}
# Non-zero exit with unrecognised output.
else:
return {"repo_id": dest["repo_id"], "status": "UNKNOWN", "error": result.stderr.strip()}
return {"repo_id": dest["repo_id"], "status": "UNKNOWN",
"error": result.stderr.strip()}
except subprocess.TimeoutExpired:
return {"repo_id": dest["repo_id"], "status": "UNREACHABLE", "error": f"Timeout after {timeout}s"}
return {"repo_id": dest["repo_id"], "status": "UNREACHABLE",
"error": f"Timeout after {timeout}s"}
except FileNotFoundError:
return {"repo_id": dest["repo_id"], "status": "ERROR", "error": "restic binary not found in PATH"}
# restic binary is not installed or not in PATH.
return {"repo_id": dest["repo_id"], "status": "ERROR",
"error": "restic binary not found in PATH"}
except Exception as e:
return {"repo_id": dest["repo_id"], "status": "ERROR", "error": str(e)}
# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------
def check_repositories(config: dict, correlation: dict) -> dict:
    """Check all configured NS8 backup destinations and return a status summary.

    Per the original module notes this is meant to be called only when the
    correlator outcome is not SUCCESS, so restic network calls are avoided
    on clean backup runs; that condition is not enforced here.

    Args:
        config:      Parsed configuration dictionary.
        correlation: Correlation result dict; not read by this function.

    Returns:
        A dict with the following keys:
            destinations    : list of per-repo result dicts, each
                              {"repo_id": str, "status": str, "error": str}
            any_unreachable : bool — True if at least one repo is UNREACHABLE
            any_locked      : bool — True if at least one repo is LOCKED
            all_ok          : bool — True only if every repo returned OK
            note            : str, present only when no destinations were
                              found
    """
    destinations = _get_backup_destinations(config)

    if not destinations:
        log.warning("No backup destinations found in Redis")
        # With nothing to check, the summary reports any_unreachable=True
        # and all_ok=False rather than a clean result.
        return {
            "destinations": [],
            "any_unreachable": True,
            "any_locked": False,
            "all_ok": False,
            "note": "No backup destinations configured or readable from Redis",
        }

    results = []
    for dest in destinations:
        # ``backend`` is always present but may be an empty string, so a
        # .get() default would never produce the "unknown" placeholder.
        log.info(
            "Checking repository %s (backend=%s)...",
            dest["repo_id"],
            dest.get("backend") or "unknown",
        )
        res = _check_restic_repo(dest, config)
        log.info("  -> %s: %s", res["status"], res.get("error", ""))
        results.append(res)

    # Aggregate flags summarising the per-repository statuses.
    statuses = [r["status"] for r in results]
    return {
        "destinations": results,
        "any_unreachable": "UNREACHABLE" in statuses,
        "any_locked": "LOCKED" in statuses,
        "all_ok": all(status == "OK" for status in statuses),
    }