feat: add scheduled_check module — timer-based daily recap independent of Alertmanager
This commit is contained in:
@@ -0,0 +1,97 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Scheduled backup status check for ns8-backup-monitor.
|
||||
|
||||
This module is the entry point for the *scheduled* recap path, invoked
|
||||
directly by a systemd timer (ns8-backup-monitor-check.timer) rather than
|
||||
by an Alertmanager webhook.
|
||||
|
||||
Why a separate timer path?
|
||||
---------------------------
|
||||
The Alertmanager webhook path (receiver.py) only fires when Alertmanager
|
||||
emits a 'backup_failed' or 'backup_missing' alert — i.e. only on failure.
|
||||
Automatic scheduled backups that complete successfully never produce an
|
||||
Alertmanager alert, so the webhook is never called and no recap email is sent.
|
||||
|
||||
This module solves that by running the full pipeline (correlator → repo_check
|
||||
on failure → notifier) on a schedule that mirrors the backup plan schedule.
|
||||
It reads Redis directly, classifies the outcome, and sends the recap email
|
||||
regardless of success or failure.
|
||||
|
||||
Typical invocation (from systemd OnCalendar):
|
||||
python3 -m ns8_backup_monitor.scheduled_check
|
||||
|
||||
The module exits 0 on success, non-zero on unrecoverable errors.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
|
||||
from .correlator import correlate_backup_status
|
||||
from .notifier import send_notification
|
||||
from .repo_check import check_repositories
|
||||
from .utils import load_config, setup_logging
|
||||
|
||||
# Module-level logger, named after this module via ``__name__``; used by
# every function below for progress and error reporting.
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def run_scheduled_check(config: dict):
    """Perform one correlate → diagnose → notify pass for the timer path.

    The function reads the backup status immediately, without any initial
    delay: the systemd timer that invokes it is expected to fire once the
    backup window is over (set ``OnCalendar`` accordingly).

    Pipeline
    --------
    1. Call the correlator with an empty ``backup_ids`` list so that it
       considers every recent plan status key (``correlator.recent_window``).
    2. When the reported outcome is anything other than ``"SUCCESS"``, run
       the repository health check to collect extra diagnostics.
    3. Send the recap email in all cases, success or failure.

    Args:
        config: Parsed configuration dictionary (output of load_config).
    """
    log.info("Scheduled check starting — reading backup status from Redis...")

    # No explicit backup ids: let the correlator pick up whatever was
    # updated recently instead of targeting specific plans.
    status_report = correlate_backup_status(config, backup_ids=[])

    # A missing "outcome" key is reported as "UNKNOWN", which is handled
    # like any other non-success result below.
    verdict = status_report.get("outcome", "UNKNOWN")
    log.info("Correlator outcome: %s", verdict)

    # Repository diagnostics are only gathered when something went wrong;
    # on success the notifier receives ``None`` for ``repo_status``.
    health_report = None
    needs_diagnostics = verdict != "SUCCESS"
    if needs_diagnostics:
        log.info("Non-success outcome — running repository health check...")
        health_report = check_repositories(config, status_report)

    log.info("Sending recap notification (outcome=%s)...", verdict)

    # There is no originating Alertmanager alert on this path, hence the
    # empty ``alerts`` list.
    send_notification(
        config,
        alerts=[],
        correlation=status_report,
        repo_status=health_report,
    )

    log.info("Scheduled check complete.")
|
||||
|
||||
|
||||
def main():
    """Entry point for the scheduled check (called by systemd timer).

    Loads the configuration, initialises logging and runs a single
    scheduled check. The process exits with status 1 when the
    configuration cannot be loaded (``FileNotFoundError`` / ``ImportError``)
    or when the check itself raises any ``Exception``.
    """
    try:
        settings = load_config()
    except (FileNotFoundError, ImportError) as load_error:
        # Logging is not configured yet at this point, so the message goes
        # straight to stderr where systemd captures it in the journal.
        print(f"[ERROR] Could not load config: {load_error}", file=sys.stderr)
        sys.exit(1)

    setup_logging(settings)

    # Top-level boundary: any failure in the pipeline is logged with its
    # traceback and converted into a non-zero exit status.
    try:
        run_scheduled_check(settings)
    except Exception as failure:  # pylint: disable=broad-except
        log.exception(
            "Scheduled check failed with unhandled exception: %s", failure
        )
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the scheduled check only when this file is executed as a script or
# via ``python3 -m ns8_backup_monitor.scheduled_check``; importing the
# module has no side effects.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user