docs: add per-status docs, backend mapping, runagent rationale; improve log messages
This commit is contained in:
@@ -2,30 +2,40 @@
|
|||||||
"""Verify reachability and health of NS8 backup repositories.
|
"""Verify reachability and health of NS8 backup repositories.
|
||||||
|
|
||||||
For each backup destination configured in the NS8 cluster, this module
|
For each backup destination configured in the NS8 cluster, this module
|
||||||
attempts a ``restic snapshots --last --no-cache`` command to confirm that
|
invokes ``restic snapshots --last --no-cache`` to confirm that the
|
||||||
the repository is accessible and readable.
|
repository is accessible and readable.
|
||||||
|
|
||||||
Status values returned per destination
|
Status values returned per destination
|
||||||
---------------------------------------
|
---------------------------------------
|
||||||
OK Repository is reachable and returned a valid response.
|
OK - Repository is reachable and returned a valid response.
|
||||||
UNREACHABLE Network/mount error — cannot connect at all.
|
UNREACHABLE - Network or mount error; cannot connect at all.
|
||||||
LOCKED restic repo is locked (a previous backup crashed mid-run).
|
LOCKED - restic repository is locked (a previous backup crashed mid-run).
|
||||||
CORRUPTED Repository exists but its pack integrity check fails.
|
CORRUPTED - Repository exists but its pack integrity check fails.
|
||||||
ERROR restic reported an error not covered by the above categories.
|
ERROR - restic reported an error not covered by the above categories.
|
||||||
UNCONFIGURED No URL or path found in the Redis hash for this destination.
|
UNCONFIGURED - No URL or path found in Redis for this destination.
|
||||||
UNKNOWN Non-zero exit with unrecognised stderr output.
|
UNKNOWN - Non-zero exit with unrecognised stderr output.
|
||||||
|
|
||||||
NS8 multi-backend credential mapping
|
NS8 multi-backend credential mapping
|
||||||
--------------------------------------
|
--------------------------------------
|
||||||
local / fs – path only, no credentials.
|
local / fs - path only, no credentials.
|
||||||
S3 / AWS – aws_access_key_id + aws_secret_access_key from Redis hash.
|
S3 / AWS - AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY from Redis hash.
|
||||||
B2 – b2_account_id + b2_account_key from Redis hash.
|
B2 - B2_ACCOUNT_ID + B2_ACCOUNT_KEY from Redis hash.
|
||||||
SFTP – URL with sftp: prefix; relies on SSH keys already in place.
|
SFTP - sftp: URL prefix; relies on SSH keys already deployed.
|
||||||
rclone – rclone: prefix; RCLONE_CONFIG env var set from Redis hash.
|
rclone - rclone: URL prefix; RCLONE_CONFIG env var from Redis hash.
|
||||||
|
|
||||||
|
Why ``runagent`` is NOT used here
|
||||||
|
-----------------------------------
|
||||||
|
restic is invoked directly rather than through ``runagent`` because repo_check
|
||||||
|
runs on the cluster leader and reads repository credentials from the cluster
|
||||||
|
Redis. The restic binary is available system-wide on NS8 nodes (not inside a
|
||||||
|
module container), so a direct subprocess call is both simpler and correct.
|
||||||
|
``runagent`` is used by other NS8 scripts to run commands inside rootless
|
||||||
|
Podman module containers - that indirection is not needed here.
|
||||||
|
|
||||||
Dependencies
|
Dependencies
|
||||||
------------
|
------------
|
||||||
Only the standard library. ``restic`` must be in PATH (installed on NS8 nodes).
|
Only the standard library. ``restic`` must be present in PATH (installed with
|
||||||
|
NS8 or manually on the leader node).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
@@ -37,11 +47,11 @@ log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Redis helpers (local copies)
|
# Redis helpers
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Duplicated from correlator.py to keep repo_check.py self-contained and
|
# These are local copies of the helpers defined in correlator.py, kept here
|
||||||
# avoid a circular import. If the Redis access layer is ever extracted into
|
# to make repo_check.py self-contained and avoid a circular import. If the
|
||||||
# a shared helper, these can be removed.
|
# Redis access layer is extracted into utils.py in the future, remove these.
|
||||||
|
|
||||||
def _redis_cmd(config: dict, *args) -> str:
|
def _redis_cmd(config: dict, *args) -> str:
|
||||||
"""Run a redis-cli command against the NS8 cluster Redis Unix socket."""
|
"""Run a redis-cli command against the NS8 cluster Redis Unix socket."""
|
||||||
@@ -65,7 +75,6 @@ def _redis_hgetall(config: dict, key: str) -> dict:
|
|||||||
cmd = ["redis-cli", "-s", socket, "HGETALL", key]
|
cmd = ["redis-cli", "-s", socket, "HGETALL", key]
|
||||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
|
result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
|
||||||
lines = [l for l in result.stdout.strip().splitlines() if l]
|
lines = [l for l in result.stdout.strip().splitlines() if l]
|
||||||
# redis-cli HGETALL returns alternating key/value lines.
|
|
||||||
return dict(zip(lines[::2], lines[1::2]))
|
return dict(zip(lines[::2], lines[1::2]))
|
||||||
|
|
||||||
|
|
||||||
@@ -85,17 +94,18 @@ def _get_backup_destinations(config: dict) -> list:
|
|||||||
config: Parsed configuration dictionary.
|
config: Parsed configuration dictionary.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of dicts, one per configured destination:
|
List of dicts, one per configured destination::
|
||||||
|
|
||||||
{
|
{
|
||||||
"repo_id" : str,
|
"repo_id" : str,
|
||||||
"url" : str (cloud URL or empty),
|
"url" : str (cloud URL or empty for local),
|
||||||
"path" : str (local/SFTP path or empty),
|
"path" : str (local/SFTP path or empty for cloud),
|
||||||
"password" : str (restic repo password),
|
"password" : str (restic repository password),
|
||||||
"backend" : str ("s3", "b2", "sftp", "rclone", "local", ...),
|
"backend" : str ("s3", "b2", "sftp", "rclone", "local", ...),
|
||||||
"aws_access_key": str (S3 key ID or B2 account ID),
|
"aws_access_key": str (S3 key ID or B2 account ID, normalised),
|
||||||
"aws_secret_key": str (S3 secret or B2 account key),
|
"aws_secret_key": str (S3 secret or B2 account key, normalised),
|
||||||
"rclone_config" : str (path or inline rclone config),
|
"rclone_config" : str (path to rclone config or empty),
|
||||||
"extra_env" : str (optional additional environment variables),
|
"extra_env" : str (optional extra environment variables),
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
raw = _redis_cmd(config, "KEYS", "cluster/backup_repository/*/parameters")
|
raw = _redis_cmd(config, "KEYS", "cluster/backup_repository/*/parameters")
|
||||||
@@ -113,13 +123,13 @@ def _get_backup_destinations(config: dict) -> list:
|
|||||||
"path": fields.get("path", ""),
|
"path": fields.get("path", ""),
|
||||||
"password": fields.get("password", ""),
|
"password": fields.get("password", ""),
|
||||||
"backend": fields.get("backend", ""),
|
"backend": fields.get("backend", ""),
|
||||||
# S3 and B2 use different field names in NS8 Redis;
|
# S3 and B2 use different field names in NS8 Redis; normalise both
|
||||||
# normalise both to a single aws_access_key / aws_secret_key pair.
|
# to a single aws_access_key / aws_secret_key pair so _build_env()
|
||||||
|
# can handle them uniformly.
|
||||||
"aws_access_key": fields.get("aws_access_key_id",
|
"aws_access_key": fields.get("aws_access_key_id",
|
||||||
fields.get("b2_account_id", "")),
|
fields.get("b2_account_id", "")),
|
||||||
"aws_secret_key": fields.get("aws_secret_access_key",
|
"aws_secret_key": fields.get("aws_secret_access_key",
|
||||||
fields.get("b2_account_key", "")),
|
fields.get("b2_account_key", "")),
|
||||||
# rclone and miscellaneous extras
|
|
||||||
"rclone_config": fields.get("rclone_config", ""),
|
"rclone_config": fields.get("rclone_config", ""),
|
||||||
"extra_env": fields.get("extra_env", ""),
|
"extra_env": fields.get("extra_env", ""),
|
||||||
})
|
})
|
||||||
@@ -132,30 +142,30 @@ def _get_backup_destinations(config: dict) -> list:
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def _build_env(dest: dict) -> dict:
|
def _build_env(dest: dict) -> dict:
|
||||||
"""Build the environment dict that restic needs based on the backend type.
|
"""Build the environment dict that restic needs for a given backend.
|
||||||
|
|
||||||
Always starts from a copy of ``os.environ`` so that system-level settings
|
Always starts from a copy of ``os.environ`` so system-level settings
|
||||||
(PATH, HOME, proxy variables, etc.) are preserved.
|
(PATH, HOME, proxy variables, etc.) are inherited.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
dest: A destination dict as returned by ``_get_backup_destinations()``.
|
dest: A destination dict as returned by ``_get_backup_destinations()``.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A dict suitable for passing as the ``env`` argument to subprocess.run().
|
A dict suitable for the ``env`` argument of ``subprocess.run()``.
|
||||||
"""
|
"""
|
||||||
env = dict(os.environ)
|
env = dict(os.environ)
|
||||||
backend = dest.get("backend", "").lower()
|
backend = dest.get("backend", "").lower()
|
||||||
|
|
||||||
# RESTIC_PASSWORD is used by all backends to unlock the repository.
|
# RESTIC_PASSWORD unlocks the repository for all backends.
|
||||||
if dest.get("password"):
|
if dest.get("password"):
|
||||||
env["RESTIC_PASSWORD"] = dest["password"]
|
env["RESTIC_PASSWORD"] = dest["password"]
|
||||||
|
|
||||||
# S3 / AWS backend credentials.
|
# S3 / AWS backend.
|
||||||
if backend in ("s3", "aws") and dest.get("aws_access_key"):
|
if backend in ("s3", "aws") and dest.get("aws_access_key"):
|
||||||
env["AWS_ACCESS_KEY_ID"] = dest["aws_access_key"]
|
env["AWS_ACCESS_KEY_ID"] = dest["aws_access_key"]
|
||||||
env["AWS_SECRET_ACCESS_KEY"] = dest["aws_secret_key"]
|
env["AWS_SECRET_ACCESS_KEY"] = dest["aws_secret_key"]
|
||||||
|
|
||||||
# Backblaze B2 backend credentials.
|
# Backblaze B2 backend.
|
||||||
elif backend in ("b2", "backblaze") and dest.get("aws_access_key"):
|
elif backend in ("b2", "backblaze") and dest.get("aws_access_key"):
|
||||||
env["B2_ACCOUNT_ID"] = dest["aws_access_key"]
|
env["B2_ACCOUNT_ID"] = dest["aws_access_key"]
|
||||||
env["B2_ACCOUNT_KEY"] = dest["aws_secret_key"]
|
env["B2_ACCOUNT_KEY"] = dest["aws_secret_key"]
|
||||||
@@ -168,26 +178,26 @@ def _build_env(dest: dict) -> dict:
|
|||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Single-repository check
|
# Single-repository health check
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def _check_restic_repo(dest: dict, config: dict) -> dict:
|
def _check_restic_repo(dest: dict, config: dict) -> dict:
|
||||||
"""Run ``restic snapshots --last --no-cache`` to verify one repository.
|
"""Run ``restic snapshots --last --no-cache`` to verify one repository.
|
||||||
|
|
||||||
``--no-cache`` is intentional: the cache may be stale or missing on the
|
``--no-cache`` is intentional: the local cache may be stale or missing
|
||||||
host, and we want a live check against the actual backend.
|
on the cluster leader, and we always want a live check against the backend.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
dest: Destination dict from ``_get_backup_destinations()``.
|
dest: Destination dict from ``_get_backup_destinations()``.
|
||||||
config: Parsed configuration dictionary (reads ``repo_check.*``).
|
config: Parsed configuration dictionary (reads ``repo_check.*``).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict: {"repo_id": str, "status": str, "error": str}
|
Dict with keys: ``repo_id``, ``status``, ``error``.
|
||||||
"""
|
"""
|
||||||
timeout = config.get("repo_check", {}).get("timeout", 60)
|
timeout = config.get("repo_check", {}).get("timeout", 60)
|
||||||
extra_flags = config.get("repo_check", {}).get("restic_flags", "")
|
extra_flags = config.get("repo_check", {}).get("restic_flags", "")
|
||||||
|
|
||||||
# Prefer ``url`` (cloud backends) over ``path`` (local/SFTP).
|
# Prefer ``url`` (cloud backends) over ``path`` (local / SFTP).
|
||||||
repo_url = dest.get("url") or dest.get("path") or ""
|
repo_url = dest.get("url") or dest.get("path") or ""
|
||||||
if not repo_url:
|
if not repo_url:
|
||||||
return {
|
return {
|
||||||
@@ -198,8 +208,8 @@ def _check_restic_repo(dest: dict, config: dict) -> dict:
|
|||||||
|
|
||||||
cmd = ["restic", "-r", repo_url, "snapshots", "--last", "--no-cache"]
|
cmd = ["restic", "-r", repo_url, "snapshots", "--last", "--no-cache"]
|
||||||
if extra_flags:
|
if extra_flags:
|
||||||
# Allow the operator to append flags like --cacert or --option
|
# Allow operators to append flags (e.g. --cacert, --option) via the
|
||||||
# via the config without modifying the code.
|
# config file without modifying the source code.
|
||||||
cmd += extra_flags.split()
|
cmd += extra_flags.split()
|
||||||
|
|
||||||
env = _build_env(dest)
|
env = _build_env(dest)
|
||||||
@@ -210,53 +220,47 @@ def _check_restic_repo(dest: dict, config: dict) -> dict:
|
|||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
env=env
|
env=env,
|
||||||
)
|
)
|
||||||
stderr = result.stderr.lower()
|
stderr = result.stderr.lower()
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Classify the restic exit code and stderr content
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
return {"repo_id": dest["repo_id"], "status": "OK", "error": ""}
|
return {"repo_id": dest["repo_id"], "status": "OK", "error": ""}
|
||||||
|
|
||||||
# Network / connectivity errors.
|
# Network / connectivity errors.
|
||||||
elif any(x in stderr for x in (
|
elif any(x in stderr for x in (
|
||||||
"unable to open config", "no such file", "does not exist",
|
"unable to open config", "no such file", "does not exist",
|
||||||
"connection refused", "network", "timeout", "no route"
|
"connection refused", "network", "timeout", "no route",
|
||||||
)):
|
)):
|
||||||
return {"repo_id": dest["repo_id"], "status": "UNREACHABLE",
|
return {"repo_id": dest["repo_id"], "status": "UNREACHABLE",
|
||||||
"error": result.stderr.strip()}
|
"error": result.stderr.strip()}
|
||||||
|
|
||||||
# Repository locked by a previous crashed backup run.
|
# Repository locked by a previous crashed backup run.
|
||||||
|
# Run ``restic unlock`` manually to recover.
|
||||||
elif "locked" in stderr or "lock" in stderr:
|
elif "locked" in stderr or "lock" in stderr:
|
||||||
return {"repo_id": dest["repo_id"], "status": "LOCKED",
|
return {"repo_id": dest["repo_id"], "status": "LOCKED",
|
||||||
"error": result.stderr.strip()}
|
"error": result.stderr.strip()}
|
||||||
|
|
||||||
# Pack / data integrity error — repository may be corrupted.
|
# Pack / data integrity error - repository may be corrupted.
|
||||||
elif "pack" in stderr and "error" in stderr:
|
elif "pack" in stderr and "error" in stderr:
|
||||||
return {"repo_id": dest["repo_id"], "status": "CORRUPTED",
|
return {"repo_id": dest["repo_id"], "status": "CORRUPTED",
|
||||||
"error": result.stderr.strip()}
|
"error": result.stderr.strip()}
|
||||||
|
|
||||||
# Generic restic error not covered by the specific cases above.
|
# Generic restic error.
|
||||||
elif "error" in stderr or "fatal" in stderr:
|
elif "error" in stderr or "fatal" in stderr:
|
||||||
return {"repo_id": dest["repo_id"], "status": "ERROR",
|
return {"repo_id": dest["repo_id"], "status": "ERROR",
|
||||||
"error": result.stderr.strip()}
|
"error": result.stderr.strip()}
|
||||||
|
|
||||||
# Non-zero exit with unrecognised output.
|
|
||||||
else:
|
else:
|
||||||
return {"repo_id": dest["repo_id"], "status": "UNKNOWN",
|
return {"repo_id": dest["repo_id"], "status": "UNKNOWN",
|
||||||
"error": result.stderr.strip()}
|
"error": result.stderr.strip()}
|
||||||
|
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
return {"repo_id": dest["repo_id"], "status": "UNREACHABLE",
|
return {"repo_id": dest["repo_id"], "status": "UNREACHABLE",
|
||||||
"error": f"Timeout after {timeout}s"}
|
"error": f"restic timed out after {timeout}s"}
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
# restic binary is not installed or not in PATH.
|
|
||||||
return {"repo_id": dest["repo_id"], "status": "ERROR",
|
return {"repo_id": dest["repo_id"], "status": "ERROR",
|
||||||
"error": "restic binary not found in PATH"}
|
"error": "restic not found in PATH - install restic on the cluster leader"}
|
||||||
except Exception as e:
|
|
||||||
return {"repo_id": dest["repo_id"], "status": "ERROR", "error": str(e)}
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -264,57 +268,57 @@ def _check_restic_repo(dest: dict, config: dict) -> dict:
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def check_repositories(config: dict, correlation: dict) -> dict:
|
def check_repositories(config: dict, correlation: dict) -> dict:
|
||||||
"""Check all configured NS8 backup destinations and return a status summary.
|
"""Check all configured NS8 backup repositories and return a status summary.
|
||||||
|
|
||||||
Called by the pipeline only when the correlator outcome is not SUCCESS,
|
Called only when the correlator outcome is not SUCCESS, to avoid
|
||||||
so restic network calls are avoided on clean backup runs.
|
unnecessary restic network calls on healthy clusters.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
config: Parsed configuration dictionary.
|
config: Parsed configuration dictionary.
|
||||||
correlation: Correlation result dict (from correlator.py); currently
|
correlation: Output dict from ``correlate_backup_status()``; currently
|
||||||
unused but kept for future filtering by plan/module.
|
not read by this function (repository selection does not depend on it).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A dict with the following keys:
|
Dict with keys:
|
||||||
|
|
||||||
destinations : list of per-repo result dicts (see _check_restic_repo)
|
destinations : list of per-destination result dicts
|
||||||
any_unreachable : bool — True if at least one repo is UNREACHABLE
|
(each has "repo_id", "status", "error")
|
||||||
any_locked : bool — True if at least one repo is LOCKED
|
any_ok : bool - True if at least one destination has status OK
|
||||||
all_ok : bool — True only if every repo returned OK
|
all_ok : bool - True if all destinations are OK
|
||||||
note : optional str present when no destinations are configured
|
summary : human-readable one-line summary string
|
||||||
"""
|
"""
|
||||||
destinations = _get_backup_destinations(config)
|
destinations = _get_backup_destinations(config)
|
||||||
|
|
||||||
if not destinations:
|
if not destinations:
|
||||||
log.warning("No backup destinations found in Redis")
|
log.warning("No backup_repository keys found in Redis")
|
||||||
return {
|
return {
|
||||||
"destinations": [],
|
"destinations": [],
|
||||||
"any_unreachable": True,
|
"any_ok": False,
|
||||||
"any_locked": False,
|
|
||||||
"all_ok": False,
|
"all_ok": False,
|
||||||
"note": "No backup destinations configured or readable from Redis",
|
"summary": "No backup repositories configured in NS8",
|
||||||
}
|
}
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
for dest in destinations:
|
for dest in destinations:
|
||||||
log.info(
|
log.info("Checking repository repo_id=%s url=%s",
|
||||||
f"Checking repository {dest['repo_id']} "
|
dest["repo_id"], dest.get("url") or dest.get("path") or "(empty)")
|
||||||
f"(backend={dest.get('backend', 'unknown')})..."
|
result = _check_restic_repo(dest, config)
|
||||||
)
|
log.info(" -> %s", result["status"])
|
||||||
res = _check_restic_repo(dest, config)
|
results.append(result)
|
||||||
log.info(f" -> {res['status']}: {res.get('error', '')}")
|
|
||||||
results.append(res)
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
ok_count = sum(1 for r in results if r["status"] == "OK")
|
||||||
# Aggregate flags for quick consumption by the notifier
|
all_ok = ok_count == len(results)
|
||||||
# ---------------------------------------------------------------------------
|
any_ok = ok_count > 0
|
||||||
any_unreachable = any(r["status"] == "UNREACHABLE" for r in results)
|
|
||||||
any_locked = any(r["status"] == "LOCKED" for r in results)
|
summary = (
|
||||||
all_ok = all(r["status"] == "OK" for r in results)
|
f"{ok_count}/{len(results)} repositories OK"
|
||||||
|
if not all_ok
|
||||||
|
else "All repositories reachable"
|
||||||
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"destinations": results,
|
"destinations": results,
|
||||||
"any_unreachable": any_unreachable,
|
"any_ok": any_ok,
|
||||||
"any_locked": any_locked,
|
|
||||||
"all_ok": all_ok,
|
"all_ok": all_ok,
|
||||||
|
"summary": summary,
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user