#!/usr/bin/env python3
"""Timer-driven backup status recap for ns8-backup-monitor.

This is the entry point of the *scheduled* recap path. It is started
directly by a systemd timer (ns8-backup-monitor-check.timer) and not by an
Alertmanager webhook.

Why have a timer path at all?
-----------------------------
The webhook path (receiver.py) is only triggered when Alertmanager emits a
'backup_failed' or 'backup_missing' alert, i.e. only when something went
wrong. A scheduled backup that finishes successfully raises no alert, so
the webhook never runs and no recap email goes out.

This module closes that gap: on a schedule mirroring the backup plan it
runs the whole pipeline (correlator → repo_check on failure → notifier),
reading Redis directly, classifying the outcome, and sending the recap
email whether the backup succeeded or failed.

Typical invocation (from systemd OnCalendar):
    python3 -m ns8_backup_monitor.scheduled_check

Exit status is 0 on success and non-zero on unrecoverable errors.
"""

import logging
import sys

from .correlator import correlate_backup_status
from .notifier import send_notification
from .repo_check import check_repositories
from .utils import load_config, setup_logging

log = logging.getLogger(__name__)


def run_scheduled_check(config: dict) -> None:
    """Execute one pass of the analysis pipeline and email the recap.

    In contrast to the webhook path there is no delay before Redis is
    read: the timer is expected to fire *after* the backup window has
    closed (set OnCalendar accordingly).

    Pipeline
    --------
    1. Call the correlator without specific backup_ids, so it scans every
       recent key inside the ``correlator.recent_window`` time window.
    2. For any outcome other than SUCCESS, run the repository health check.
    3. Send the notification email (for success and failure alike).

    Args:
        config: Parsed configuration dictionary (output of load_config).
    """
    log.info("Scheduled check starting — reading backup status from Redis...")

    # An empty backup_ids list makes the correlator scan all plan status
    # keys updated within recent_window (default 3600 seconds).
    result = correlate_backup_status(config, backup_ids=[])

    status = result.get("outcome", "UNKNOWN")
    log.info("Correlator outcome: %s", status)

    # Repository diagnostics are only gathered when the outcome is not
    # SUCCESS; they enrich the email body. On success this stays None.
    diagnostics = None
    if status != "SUCCESS":
        log.info("Non-success outcome — running repository health check...")
        diagnostics = check_repositories(config, result)

    log.info("Sending recap notification (outcome=%s)...", status)
    # No Alertmanager alert triggered this run, so alerts is empty; the
    # notifier then leaves out the 'Triggered by' section.
    send_notification(
        config,
        alerts=[],
        correlation=result,
        repo_status=diagnostics,
    )
    log.info("Scheduled check complete.")


def _load_config_or_exit() -> dict:
    """Return the loaded configuration, or exit with status 1 on failure."""
    try:
        return load_config()
    except (FileNotFoundError, ImportError) as err:
        # Logging is not configured yet at this point, so write to stderr
        # directly; systemd still captures it in the journal.
        print(f"[ERROR] Could not load config: {err}", file=sys.stderr)
        sys.exit(1)


def main() -> None:
    """Entry point for the scheduled check (called by systemd timer)."""
    config = _load_config_or_exit()
    setup_logging(config)

    try:
        run_scheduled_check(config)
    except Exception as err:  # pylint: disable=broad-except
        log.exception("Scheduled check failed with unhandled exception: %s", err)
        sys.exit(1)


if __name__ == "__main__":
    main()