docs: add section-by-section comments — receiver.py

2026-05-18 20:59:29 +00:00
parent 9a02c7c5ae
commit 62aa1804dc
1 changed files with 101 additions and 7 deletions
@@ -1,10 +1,24 @@
 #!/usr/bin/env python3
-"""
+"""HTTP webhook receiver for Alertmanager backup alerts.
 receiver.py - HTTP webhook receiver for Alertmanager alerts.
-Listens on configured host:port for POST /alert from Alertmanager.
+This module exposes a minimal HTTP server that Alertmanager POSTs
-On receiving NsBackupFailed or NsBackupMissing, triggers the pipeline:
+backup-failure notifications to. On receiving a relevant alert it
-  correlator -> repo_check -> notifier
+spawns a background daemon thread that runs the full analysis pipeline:
    correlator  →  repo_check (only on non-SUCCESS)  →  notifier
 Why a background thread?
 ------------------------
 Alertmanager expects a quick HTTP 200 response. The correlation step
 deliberately waits ``wait_seconds`` (default 30) so that slow backup
 modules have time to write their final status into Redis before we read
 it. Blocking the HTTP handler for 30+ seconds would cause Alertmanager
 to retry the webhook, so the work is offloaded to a daemon thread.
 Endpoints
 ---------
 POST /alert   Accepts an Alertmanager JSON payload.
              Any other POST path returns 404.
 """
 import json
@@ -21,18 +35,41 @@ from .utils import load_config
 # Module-level logger, named after this module via ``__name__``.
 log = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Alert filter
 # ---------------------------------------------------------------------------
 # Only these Alertmanager alert names trigger the pipeline.
 # NsBackupFailed  – one or more modules reported an error.
 # NsBackupMissing – expected backup did not run within the time window.
 # Stored as a set, so membership tests against it are constant-time.
 ALERT_NAMES = {"NsBackupFailed", "NsBackupMissing"}
 # ---------------------------------------------------------------------------
 # HTTP request handler
 # ---------------------------------------------------------------------------
 class AlertHandler(BaseHTTPRequestHandler):
    """Minimal HTTP handler that accepts Alertmanager webhook payloads.
    The ``config`` class attribute is populated by ``run_server()`` before
    the server starts so that every request handler instance shares the
    same configuration dictionary without using global state.
    """
    config: dict = {}
    def do_POST(self):
        """Handle POST /alert — the only supported endpoint."""
        # Reject any path other than /alert.
        if self.path != "/alert":
            self.send_response(404)
            self.end_headers()
            return
        # ---------------------------------------------------------------------------
        # Read and parse the request body
        # ---------------------------------------------------------------------------
        length = int(self.headers.get("Content-Length", 0))
        body = self.rfile.read(length)
@@ -44,9 +81,17 @@ class AlertHandler(BaseHTTPRequestHandler):
            self.end_headers()
            return
        # Respond immediately so Alertmanager does not retry the webhook.
        self.send_response(200)
        self.end_headers()
        # ---------------------------------------------------------------------------
        # Filter relevant alerts
        # ---------------------------------------------------------------------------
        # Only process alerts that are currently firing and match our alert names.
        # Resolved alerts (status == "resolved") are intentionally ignored because
        # the NS8 monitoring stack clears alerts once the condition is gone; we do
        # not need a separate "backup OK" notification path here.
        alerts = payload.get("alerts", [])
        relevant = [
            a for a in alerts
@@ -54,12 +99,17 @@ class AlertHandler(BaseHTTPRequestHandler):
            and a.get("status") == "firing"
        ]
        # ---------------------------------------------------------------------------
        # Schedule the pipeline in a background thread
        # ---------------------------------------------------------------------------
        if relevant:
            log.info(f"Received {len(relevant)} relevant alert(s), scheduling pipeline")
            wait = self.config.get("correlator", {}).get("wait_seconds", 30)
            t = threading.Thread(
                target=_run_pipeline,
                args=(relevant, self.config, wait),
                # Daemon thread: will not prevent the process from exiting if
                # systemd sends SIGTERM while a pipeline run is in progress.
                daemon=True
            )
            t.start()
@@ -67,38 +117,82 @@ class AlertHandler(BaseHTTPRequestHandler):
            log.debug("No relevant alerts in payload, ignoring")
    def log_message(self, fmt, *args):
        """Emit the base handler's per-request log line through ``log`` at DEBUG.

        ``fmt`` and ``args`` are the %-style format string and its arguments;
        they are combined here and prefixed with ``HTTP: ``.
        """
        rendered = fmt % args
        log.debug(f"HTTP: {rendered}")
 # ---------------------------------------------------------------------------
 # Pipeline runner (background thread target)
 # ---------------------------------------------------------------------------
 def _run_pipeline(alerts: list, config: dict, wait: int):
    """Wait for module states to settle, then run the full analysis pipeline.

    Used as the target of the background thread started by the HTTP
    handler. Returns nothing; progress and failures are reported via ``log``.

    Steps
    -----
    1. Sleep ``wait`` seconds so backup modules finish writing to Redis.
    2. Collect the distinct ``backup_id`` labels carried by the alerts.
       When none are present an empty list is handed to the correlator,
       which is expected to fall back to scanning for recent plan status
       keys.
    3. Run the correlator to classify the overall outcome.
    4. If the outcome is not SUCCESS, run the repository health check to
       gather additional diagnostic information.
    5. Send the email notification.

    Parameters
    ----------
    alerts:
        Alert dictionaries taken from the Alertmanager payload.
    config:
        Application configuration, passed unchanged to every stage.
    wait:
        Number of seconds to sleep before the first stage runs.

    Raises
    ------
    Exception
        Anything raised by a stage is logged with its traceback and then
        re-raised, so a failed run shows up in the module log as well as
        in the interpreter's thread exception hook.
    """
    log.info(f"Waiting {wait}s before status check...")
    time.sleep(wait)
    try:
        # -------------------------------------------------------------------
        # Extract backup_ids from alert labels
        # -------------------------------------------------------------------
        # Tolerate alerts whose ``labels`` entry is missing or null, and skip
        # empty ids. ``dict.fromkeys`` de-duplicates while keeping the order
        # in which the ids first appeared, so repeated runs are reproducible.
        labelled = ((a.get("labels") or {}).get("backup_id") for a in alerts)
        backup_ids = list(dict.fromkeys(bid for bid in labelled if bid))
        # -------------------------------------------------------------------
        # Correlation
        # -------------------------------------------------------------------
        log.info("Running correlator...")
        correlation = correlate_backup_status(config, backup_ids)
        # -------------------------------------------------------------------
        # Repository health check (non-SUCCESS outcomes only)
        # -------------------------------------------------------------------
        # Skipped on SUCCESS to avoid unnecessary restic network calls.
        repo_status = None
        if correlation["outcome"] != "SUCCESS":
            log.info("Non-success outcome, running repo check...")
            repo_status = check_repositories(config, correlation)
        # -------------------------------------------------------------------
        # Email notification
        # -------------------------------------------------------------------
        log.info("Sending notification...")
        send_notification(config, alerts, correlation, repo_status)
    except Exception:
        # This runs in a daemon thread with nobody joining it; without this
        # record a failed run would leave no trace in the module log.
        log.exception("Backup alert pipeline failed")
        raise
 # ---------------------------------------------------------------------------
 # Server bootstrap
 # ---------------------------------------------------------------------------
 def run_server(config: dict):
    """Bind the HTTP server to the configured host/port and serve forever.

    The host and port are read from the ``receiver`` section of the config
    and default to 127.0.0.1:9099, so an unconfigured receiver is not
    accidentally exposed on public interfaces.

    This call blocks in ``serve_forever()``. The listening socket is closed
    when that call exits for any reason.

    Parameters
    ----------
    config:
        Application configuration. It is also published to the request
        handler as ``AlertHandler.config``.

    Raises
    ------
    ValueError
        If the configured port cannot be converted to an integer.
    OSError
        If the address cannot be bound.
    """
    # ``or {}`` also covers a ``receiver`` key that is present but empty.
    receiver_cfg = config.get("receiver") or {}
    host = receiver_cfg.get("host", "127.0.0.1")
    # Accept the port either as a number or as a numeric string.
    port = int(receiver_cfg.get("port", 9099))
    # Share the config with the request handler class via a class attribute
    # rather than passing it through BaseHTTPRequestHandler's constructor,
    # which does not support custom arguments.
    AlertHandler.config = config
    # The server is a context manager: leaving the block calls
    # server_close(), releasing the socket even if serving is interrupted.
    with HTTPServer((host, port), AlertHandler) as server:
        log.info(f"ns8-backup-monitor receiver listening on {host}:{port}")
        server.serve_forever()