Files
ns8-backup-monitor/ns8_backup_monitor/scheduled_check.py
T

98 lines
3.6 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""Scheduled backup status check for ns8-backup-monitor.
This module is the entry point for the *scheduled* recap path, invoked
directly by a systemd timer (ns8-backup-monitor-check.timer) rather than
by an Alertmanager webhook.
Why a separate timer path?
---------------------------
The Alertmanager webhook path (receiver.py) only fires when Alertmanager
emits a 'backup_failed' or 'backup_missing' alert — i.e. only on failure.
Automatic scheduled backups that complete successfully never produce an
Alertmanager alert, so the webhook is never called and no recap email is sent.
This module solves that by running the full pipeline (correlator → repo_check
on failure → notifier) on a schedule that mirrors the backup plan schedule.
It reads Redis directly, classifies the outcome, and sends the recap email
regardless of success or failure.
Typical invocation (from systemd OnCalendar):
python3 -m ns8_backup_monitor.scheduled_check
The module exits 0 on success, non-zero on unrecoverable errors.
"""
import logging
import sys
from .correlator import correlate_backup_status
from .notifier import send_notification
from .repo_check import check_repositories
from .utils import load_config, setup_logging
log = logging.getLogger(__name__)
def run_scheduled_check(config: dict) -> None:
    """Run the full analysis pipeline once and send the recap email.

    Unlike the webhook path, this function does NOT wait before reading
    Redis: it is meant to be invoked by a timer that fires *after* the
    backup window has closed (configure OnCalendar accordingly).

    Steps
    -----
    1. Run correlator with no specific backup_ids — scans all recent keys
       within the ``correlator.recent_window`` time window.
    2. If outcome is not SUCCESS, run the repository health check.  An
       exception raised by this step is logged and deferred so that it
       cannot prevent the recap email from being sent.
    3. Send the notification email (success or failure).
    4. Re-raise the deferred health-check exception, if any, so the caller
       still observes the failure.

    Args:
        config: Parsed configuration dictionary (output of load_config).

    Raises:
        Exception: Any exception raised by the correlator or the notifier
            propagates unchanged.  An exception raised by the repository
            health check is re-raised only after the notification has
            been sent.
    """
    log.info("Scheduled check starting — reading backup status from Redis...")

    # Pass empty backup_ids list: the correlator will scan all plan status
    # keys updated within the recent_window (default 3600 seconds).
    correlation = correlate_backup_status(config, backup_ids=[])
    outcome = correlation.get("outcome", "UNKNOWN")
    log.info("Correlator outcome: %s", outcome)

    # Run the repository health check on non-SUCCESS outcomes to provide
    # additional diagnostics in the email body.
    repo_status = None
    repo_check_error = None
    if outcome != "SUCCESS":
        log.info("Non-success outcome — running repository health check...")
        try:
            repo_status = check_repositories(config, correlation)
        except Exception as exc:  # pylint: disable=broad-except
            # The health check only adds diagnostics.  Losing it must not
            # cost us the failure email itself, so remember the error and
            # carry on with repo_status=None (the same value the notifier
            # already receives on the SUCCESS path).
            log.exception(
                "Repository health check failed; sending recap without "
                "repository diagnostics: %s",
                exc,
            )
            repo_check_error = exc

    log.info("Sending recap notification (outcome=%s)...", outcome)
    # Pass empty alerts list: the notifier will omit the 'Triggered by'
    # section cleanly when there is no originating Alertmanager alert.
    send_notification(config, alerts=[], correlation=correlation, repo_status=repo_status)

    if repo_check_error is not None:
        # Surface the deferred error so the caller (and thus the process
        # exit status) still reflects that the run was not fully successful.
        raise repo_check_error

    log.info("Scheduled check complete.")
def main():
    """Command-line entry point used by the systemd timer unit.

    Loads the configuration, initialises logging from it, and runs one
    scheduled check.  The process exits with status 1 when the
    configuration cannot be loaded (``FileNotFoundError`` or
    ``ImportError``) or when the check raises any exception; otherwise
    the function returns normally.
    """
    try:
        config = load_config()
    except (FileNotFoundError, ImportError) as exc:
        # Logging is only initialised after the config has been loaded, so
        # report this failure on stderr, where systemd captures it in the
        # journal.
        print(f"[ERROR] Could not load config: {exc}", file=sys.stderr)
        sys.exit(1)

    setup_logging(config)

    check_failed = False
    try:
        run_scheduled_check(config)
    except Exception as failure:  # pylint: disable=broad-except
        # Top-level boundary: record the traceback, then fail the unit.
        log.exception("Scheduled check failed with unhandled exception: %s", failure)
        check_failed = True

    if check_failed:
        sys.exit(1)
# Run the check only when executed as a script or via
# ``python3 -m ns8_backup_monitor.scheduled_check``, not on import.
if __name__ == "__main__":
    main()