98 lines
3.6 KiB
Python
98 lines
3.6 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""Scheduled backup status check for ns8-backup-monitor.
|
||
|
|
|
||
|
|
This module is the entry point for the *scheduled* recap path, invoked
directly by a systemd timer (ns8-backup-monitor-check.timer) rather than
by an Alertmanager webhook.

Why a separate timer path?
---------------------------
The Alertmanager webhook path (receiver.py) only fires when Alertmanager
emits a 'backup_failed' or 'backup_missing' alert — i.e. only on failure.
Automatic scheduled backups that complete successfully never produce an
Alertmanager alert, so the webhook is never called and no recap email is sent.

This module solves that by running the full pipeline (correlator → repo_check
on failure → notifier) on a schedule that mirrors the backup plan schedule.
It reads Redis directly, classifies the outcome, and sends the recap email
regardless of success or failure.

Typical invocation (from systemd OnCalendar):
    python3 -m ns8_backup_monitor.scheduled_check

The module exits 0 on success, non-zero on unrecoverable errors.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import logging
|
||
|
|
import sys
|
||
|
|
|
||
|
|
from .correlator import correlate_backup_status
|
||
|
|
from .notifier import send_notification
|
||
|
|
from .repo_check import check_repositories
|
||
|
|
from .utils import load_config, setup_logging
|
||
|
|
|
||
|
|
log = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
|
||
|
|
def run_scheduled_check(config: dict):
    """Execute one pass of the scheduled pipeline and email the recap.

    The function reads the backup status immediately, without any initial
    delay: the systemd timer that triggers it is expected to fire only
    once the backup window is over (set OnCalendar accordingly).

    Pipeline
    --------
    1. Call the correlator with an empty ``backup_ids`` list, so that it
       scans every recent key inside the ``correlator.recent_window``
       time window instead of a specific set of backups.
    2. When the reported outcome is anything other than ``SUCCESS``, run
       the repository health check to gather extra diagnostics.
    3. Send the notification email, whatever the outcome was.

    Args:
        config: Parsed configuration dictionary (output of load_config).
    """
    log.info("Scheduled check starting — reading backup status from Redis...")

    # An empty backup_ids list asks the correlator to look at all plan
    # status keys updated within recent_window (default 3600 seconds).
    status_report = correlate_backup_status(config, backup_ids=[])
    result_label = status_report.get("outcome", "UNKNOWN")
    log.info("Correlator outcome: %s", result_label)

    # The repository health check only runs for non-SUCCESS outcomes; its
    # result adds diagnostics to the email body. It stays None otherwise.
    health_report = None
    needs_health_check = result_label != "SUCCESS"
    if needs_health_check:
        log.info("Non-success outcome — running repository health check...")
        health_report = check_repositories(config, status_report)

    log.info("Sending recap notification (outcome=%s)...", result_label)
    # No Alertmanager alert originated this run, so alerts is empty and the
    # notifier can leave out its 'Triggered by' section.
    send_notification(
        config,
        alerts=[],
        correlation=status_report,
        repo_status=health_report,
    )
    log.info("Scheduled check complete.")
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Load the configuration, set up logging and run one scheduled check.

    This is the function the systemd timer ends up calling. The process
    exits with status 1 when the configuration cannot be loaded
    (``FileNotFoundError`` or ``ImportError``) or when the check itself
    raises; otherwise it returns normally.
    """
    try:
        settings = load_config()
    except (FileNotFoundError, ImportError) as load_error:
        # Logging is not configured at this point, so write straight to
        # stderr, which systemd captures in the journal.
        print(f"[ERROR] Could not load config: {load_error}", file=sys.stderr)
        sys.exit(1)
    else:
        setup_logging(settings)

    try:
        run_scheduled_check(settings)
    except Exception as failure:  # pylint: disable=broad-except
        # Top-level boundary: record the traceback, then signal failure to
        # systemd through the exit status.
        log.exception("Scheduled check failed with unhandled exception: %s", failure)
        sys.exit(1)
|
||
|
|
|
||
|
|
|
||
|
|
# Run the check only when executed as a script or via ``python3 -m``;
# importing the module must not trigger it.
if __name__ == "__main__":
    main()
|