docs: add section-by-section comments — receiver.py

This commit is contained in:
2026-05-18 20:59:29 +00:00
parent 9a02c7c5ae
commit 62aa1804dc
+101 -7
View File
@@ -1,10 +1,24 @@
#!/usr/bin/env python3
"""
receiver.py - HTTP webhook receiver for Alertmanager alerts.
"""HTTP webhook receiver for Alertmanager backup alerts.
Listens on configured host:port for POST /alert from Alertmanager.
On receiving NsBackupFailed or NsBackupMissing, triggers the pipeline:
correlator -> repo_check -> notifier
This module exposes a minimal HTTP server that Alertmanager POSTs
backup-failure notifications to. On receiving a relevant alert it
spawns a background daemon thread that runs the full analysis pipeline:
correlator → repo_check (only on non-SUCCESS) → notifier
Why a background thread?
------------------------
Alertmanager expects a quick HTTP 200 response. The correlation step
deliberately waits ``wait_seconds`` (default 30) so that slow backup
modules have time to write their final status into Redis before we read
it. Blocking the HTTP handler for 30+ seconds would cause Alertmanager
to retry the webhook, so the work is offloaded to a daemon thread.
Endpoints
---------
POST /alert Accepts an Alertmanager JSON payload.
All other paths return 404.
"""
import json
@@ -21,18 +35,41 @@ from .utils import load_config
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Alert filter
# ---------------------------------------------------------------------------
# Only these Alertmanager alert names trigger the pipeline.
# NsBackupFailed one or more modules reported an error.
# NsBackupMissing expected backup did not run within the time window.
ALERT_NAMES = {"NsBackupFailed", "NsBackupMissing"}
# ---------------------------------------------------------------------------
# HTTP request handler
# ---------------------------------------------------------------------------
class AlertHandler(BaseHTTPRequestHandler):
"""Minimal HTTP handler that accepts Alertmanager webhook payloads.
The ``config`` class attribute is populated by ``run_server()`` before
the server starts so that every request handler instance shares the
same configuration dictionary without using global state.
"""
config: dict = {}
def do_POST(self):
"""Handle POST /alert — the only supported endpoint."""
# Reject any path other than /alert.
if self.path != "/alert":
self.send_response(404)
self.end_headers()
return
# ---------------------------------------------------------------------------
# Read and parse the request body
# ---------------------------------------------------------------------------
length = int(self.headers.get("Content-Length", 0))
body = self.rfile.read(length)
@@ -44,9 +81,17 @@ class AlertHandler(BaseHTTPRequestHandler):
self.end_headers()
return
# Respond immediately so Alertmanager does not retry the webhook.
self.send_response(200)
self.end_headers()
# ---------------------------------------------------------------------------
# Filter relevant alerts
# ---------------------------------------------------------------------------
# Only process alerts that are currently firing and match our alert names.
# Resolved alerts (status == "resolved") are intentionally ignored because
# the NS8 monitoring stack clears alerts once the condition is gone; we do
# not need a separate "backup OK" notification path here.
alerts = payload.get("alerts", [])
relevant = [
a for a in alerts
@@ -54,12 +99,17 @@ class AlertHandler(BaseHTTPRequestHandler):
and a.get("status") == "firing"
]
# ---------------------------------------------------------------------------
# Schedule the pipeline in a background thread
# ---------------------------------------------------------------------------
if relevant:
log.info(f"Received {len(relevant)} relevant alert(s), scheduling pipeline")
wait = self.config.get("correlator", {}).get("wait_seconds", 30)
t = threading.Thread(
target=_run_pipeline,
args=(relevant, self.config, wait),
# Daemon thread: will not prevent the process from exiting if
# systemd sends SIGTERM while a pipeline run is in progress.
daemon=True
)
t.start()
@@ -67,38 +117,82 @@ class AlertHandler(BaseHTTPRequestHandler):
log.debug("No relevant alerts in payload, ignoring")
def log_message(self, fmt, *args):
"""Redirect BaseHTTPRequestHandler access logs to the module logger."""
log.debug(f"HTTP: {fmt % args}")
# ---------------------------------------------------------------------------
# Pipeline runner (background thread target)
# ---------------------------------------------------------------------------
def _run_pipeline(alerts: list, config: dict, wait: int):
"""Wait for modules to finish, then run correlator -> repo_check -> notifier."""
"""Wait for module states to settle, then run the full analysis pipeline.
Steps
-----
1. Sleep ``wait`` seconds so backup modules finish writing to Redis.
2. Extract backup_ids from alert labels when available; fall back to
scanning Redis for recently updated plan status keys.
3. Run the correlator to classify the overall outcome.
4. If the outcome is not SUCCESS, run the repository health check to
gather additional diagnostic information.
5. Send the email notification.
"""
log.info(f"Waiting {wait}s before status check...")
time.sleep(wait)
# Collect unique backup_ids from alert labels if available
# ---------------------------------------------------------------------------
# Extract backup_ids from alert labels
# ---------------------------------------------------------------------------
# Alertmanager may include a ``backup_id`` label on the alert. When present
# it is used to read the exact Redis keys for that plan. When absent the
# correlator falls back to scanning for recent plan status keys.
backup_ids = list({
a["labels"].get("backup_id", "")
for a in alerts
if a["labels"].get("backup_id")
})
# ---------------------------------------------------------------------------
# Correlation
# ---------------------------------------------------------------------------
log.info("Running correlator...")
correlation = correlate_backup_status(config, backup_ids)
# ---------------------------------------------------------------------------
# Repository health check (non-SUCCESS outcomes only)
# ---------------------------------------------------------------------------
# Skipped on SUCCESS to avoid unnecessary restic network calls.
repo_status = None
if correlation["outcome"] != "SUCCESS":
log.info("Non-success outcome, running repo check...")
repo_status = check_repositories(config, correlation)
# ---------------------------------------------------------------------------
# Email notification
# ---------------------------------------------------------------------------
log.info("Sending notification...")
send_notification(config, alerts, correlation, repo_status)
# ---------------------------------------------------------------------------
# Server bootstrap
# ---------------------------------------------------------------------------
def run_server(config: dict):
"""Bind the HTTP server to the configured host/port and serve forever.
The host and port are read from the ``receiver`` section of the config.
Defaults to localhost:9099 to avoid accidental exposure on public interfaces.
"""
host = config.get("receiver", {}).get("host", "127.0.0.1")
port = config.get("receiver", {}).get("port", 9099)
# Share the config with the request handler class via a class attribute
# rather than passing it through BaseHTTPRequestHandler's constructor,
# which does not support custom arguments.
AlertHandler.config = config
server = HTTPServer((host, port), AlertHandler)
log.info(f"ns8-backup-monitor receiver listening on {host}:{port}")
server.serve_forever()