fix: add NS8 native alertnames, extract id label as backup_id fallback, expand comments

2026-05-18 21:55:22 +00:00
parent 39b78f4995
commit 8c6d085d46
1 changed file with 102 additions and 56 deletions
@@ -3,9 +3,9 @@
 This module exposes a minimal HTTP server that Alertmanager POSTs
 backup-failure notifications to. On receiving a relevant alert it
-spawns a background daemon thread that runs the full analysis pipeline:
+spawns a background daemon thread that runs the full analysis pipeline::
-    correlator  →  repo_check (only on non-SUCCESS)  →  notifier
+    correlator  ->  repo_check (only on non-SUCCESS)  ->  notifier
 Why a background thread?
 ------------------------
@@ -15,6 +15,28 @@ modules have time to write their final status into Redis before we read
 it. Blocking the HTTP handler for 30+ seconds would cause Alertmanager
 to retry the webhook, so the work is offloaded to a daemon thread.
 Alert name mapping
 ------------------
 Backup alerts on an NS8 cluster can come from two sets of Prometheus alert rules:
  Native stack (node_backup_status metric, present in all NS8 clusters):
    backup_failed   - one or more backup plans reported result != "success".
    backup_missing  - expected backup did not complete within the time window.
  Custom / legacy rules (added manually or present in older NS8 versions):
    NsBackupFailed  - same semantics as backup_failed.
    NsBackupMissing - same semantics as backup_missing.
 All four names are matched. Any other alertname received on this webhook
 is silently ignored so unrelated Alertmanager alerts do not generate noise.
 Label mapping
 -------------
 NS8 native alerts carry the backup plan identifier in the ``id`` label.
 Custom / legacy alerts may carry it in ``backup_id`` instead.
 Both labels are checked when extracting plan IDs (``backup_id`` takes
 precedence when an alert carries both) so the correlator can look up the
 correct Redis keys regardless of which rule set fired.
 Endpoints
 ---------
 POST /alert   Accepts an Alertmanager JSON payload.
@@ -26,7 +48,6 @@ import logging
 import threading
 import time
 from http.server import BaseHTTPRequestHandler, HTTPServer
 from typing import Any
 from .correlator import correlate_backup_status
 from .notifier import send_notification
@@ -38,10 +59,22 @@ log = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Alert filter
 # ---------------------------------------------------------------------------
-# Only these Alertmanager alert names trigger the pipeline.
+# Alert names that trigger the analysis pipeline.
-# NsBackupFailed  – one or more modules reported an error.
+#
-# NsBackupMissing – expected backup did not run within the time window.
+# NS8 native monitoring stack (Prometheus node_backup_status rules):
-ALERT_NAMES = {"NsBackupFailed", "NsBackupMissing"}
+#   backup_failed   - emitted when node_backup_status == 0 for one or more plans.
 #   backup_missing  - emitted when no backup completed within the expected window.
 #
 # Custom / legacy alert rule names (kept for backward compatibility with NS8
 # clusters that have manually configured or older rule sets):
 #   NsBackupFailed  - same semantics as backup_failed.
 #   NsBackupMissing - same semantics as backup_missing.
 ALERT_NAMES = {
    "backup_failed",    # NS8 native - canonical name from node_backup_status rules
    "backup_missing",   # NS8 native - missing / timed-out backup
    "NsBackupFailed",   # Legacy custom rule name (backward compatibility)
    "NsBackupMissing",  # Legacy custom rule name (backward compatibility)
 }
 # ---------------------------------------------------------------------------
@@ -59,7 +92,7 @@ class AlertHandler(BaseHTTPRequestHandler):
    config: dict = {}
    def do_POST(self):
-        """Handle POST /alert — the only supported endpoint."""
+        """Handle POST /alert - the only supported endpoint."""
        # Reject any path other than /alert.
        if self.path != "/alert":
@@ -67,9 +100,7 @@ class AlertHandler(BaseHTTPRequestHandler):
            self.end_headers()
            return
-        # ---------------------------------------------------------------------------
+        # Read and parse the request body.
        # Read and parse the request body
        # ---------------------------------------------------------------------------
        length = int(self.headers.get("Content-Length", 0))
        body = self.rfile.read(length)
@@ -85,32 +116,39 @@ class AlertHandler(BaseHTTPRequestHandler):
        self.send_response(200)
        self.end_headers()
-        # ---------------------------------------------------------------------------
+        # Filter relevant alerts.
-        # Filter relevant alerts
+        # Only process alerts that are currently firing and match one of the
-        # ---------------------------------------------------------------------------
+        # known backup alert names (see ALERT_NAMES above).
-        # Only process alerts that are currently firing and match our alert names.
+        # Resolved alerts (status == "resolved") are intentionally ignored:
-        # Resolved alerts (status == "resolved") are intentionally ignored because
+        # the NS8 monitoring stack clears alerts once the condition is gone
-        # the NS8 monitoring stack clears alerts once the condition is gone; we do
+        # and we do not need a "backup OK" notification path here.
        # not need a separate "backup OK" notification path here.
        alerts = payload.get("alerts", [])
-        relevant = [
+        relevant = []
-            a for a in alerts
+        for a in alerts:
-            if a.get("labels", {}).get("alertname") in ALERT_NAMES
+            labels = a.get("labels", {})
-            and a.get("status") == "firing"
+            alertname = labels.get("alertname", "")
-        ]
+            status = a.get("status", "")
            # Log every received alert at DEBUG level to aid troubleshooting
            # without requiring packet captures or full Alertmanager debug mode.
            log.debug(
                "Received alert: alertname=%r status=%r id=%r name=%r",
                alertname,
                status,
                labels.get("id"),
                labels.get("name"),
            )
            if alertname in ALERT_NAMES and status == "firing":
                relevant.append(a)
        # ---------------------------------------------------------------------------
        # Schedule the pipeline in a background thread
        # ---------------------------------------------------------------------------
        if relevant:
-            log.info(f"Received {len(relevant)} relevant alert(s), scheduling pipeline")
+            log.info("Received %d relevant alert(s), scheduling pipeline", len(relevant))
            wait = self.config.get("correlator", {}).get("wait_seconds", 30)
            t = threading.Thread(
                target=_run_pipeline,
                args=(relevant, self.config, wait),
                # Daemon thread: will not prevent the process from exiting if
                # systemd sends SIGTERM while a pipeline run is in progress.
-                daemon=True
+                daemon=True,
            )
            t.start()
        else:
@@ -118,7 +156,7 @@ class AlertHandler(BaseHTTPRequestHandler):
    def log_message(self, fmt, *args):
        """Redirect BaseHTTPRequestHandler access logs to the module logger."""
-        log.debug(f"HTTP: {fmt % args}")
+        log.debug("HTTP: " + fmt, *args)
 # ---------------------------------------------------------------------------
@@ -131,46 +169,52 @@ def _run_pipeline(alerts: list, config: dict, wait: int):
    Steps
    -----
    1. Sleep ``wait`` seconds so backup modules finish writing to Redis.
-    2. Extract backup_ids from alert labels when available; fall back to
+    2. Extract backup_ids from alert labels.
-       scanning Redis for recently updated plan status keys.
+       - Label ``backup_id``: used by custom / legacy rules.
       - Label ``id``:        used by NS8 native rules (backup_failed /
                              backup_missing carry the plan identifier here).
       Both are checked; duplicates are removed so each plan is queried once.
       When no IDs are found the correlator falls back to a broad Redis scan.
    3. Run the correlator to classify the overall outcome.
    4. If the outcome is not SUCCESS, run the repository health check to
       gather additional diagnostic information.
    5. Send the email notification.
    """
-    log.info(f"Waiting {wait}s before status check...")
+    log.info("Waiting %ds before status check...", wait)
    time.sleep(wait)
-    # ---------------------------------------------------------------------------
+    # Extract plan ids from alert labels.
-    # Extract backup_ids from alert labels
+    # Priority:
-    # ---------------------------------------------------------------------------
+    #   1. Label "backup_id" - custom / legacy alert rules.
-    # Alertmanager may include a ``backup_id`` label on the alert. When present
+    #   2. Label "id"        - NS8 native rules (backup_failed / backup_missing
-    # it is used to read the exact Redis keys for that plan. When absent the
+    #                          carry the plan identifier in this label, not in
-    # correlator falls back to scanning for recent plan status keys.
+    #                          "backup_id"). This was the root cause of the
-    backup_ids = list({
+    #                          pipeline not triggering on automatic scheduled
-        a["labels"].get("backup_id", "")
+    #                          backups when only "id" was present in the alert.
-        for a in alerts
+    # A set is used to deduplicate so each plan is only queried once.
-        if a["labels"].get("backup_id")
+    seen: set = set()
-    })
+    backup_ids: list = []
    for a in alerts:
        labels = a.get("labels", {})
        bid = labels.get("backup_id") or labels.get("id")
        if bid and bid not in seen:
            seen.add(bid)
            backup_ids.append(bid)
    if backup_ids:
        log.info("Resolved backup_ids from alert labels: %s", backup_ids)
    else:
        log.info("No backup_id in alert labels; correlator will scan all recent Redis keys")
    # ---------------------------------------------------------------------------
    # Correlation
    # ---------------------------------------------------------------------------
    log.info("Running correlator...")
    correlation = correlate_backup_status(config, backup_ids)
-    # ---------------------------------------------------------------------------
+    # Skip repo check on SUCCESS to avoid unnecessary restic network calls.
    # Repository health check (non-SUCCESS outcomes only)
    # ---------------------------------------------------------------------------
    # Skipped on SUCCESS to avoid unnecessary restic network calls.
    repo_status = None
    if correlation["outcome"] != "SUCCESS":
        log.info("Non-success outcome, running repo check...")
        repo_status = check_repositories(config, correlation)
    # ---------------------------------------------------------------------------
    # Email notification
    # ---------------------------------------------------------------------------
    log.info("Sending notification...")
    send_notification(config, alerts, correlation, repo_status)
@@ -182,8 +226,10 @@ def _run_pipeline(alerts: list, config: dict, wait: int):
 def run_server(config: dict):
    """Bind the HTTP server to the configured host/port and serve forever.
-    The host and port are read from the ``receiver`` section of the config.
+    The host and port are read from the ``receiver`` section of the config
-    Defaults to localhost:9099 to avoid accidental exposure on public interfaces.
+    file. Defaults to 127.0.0.1:9099 to avoid accidental exposure on public
    interfaces - change ``receiver.host`` to ``0.0.0.0`` only if the webhook
    endpoint needs to be reachable from a remote Alertmanager instance.
    """
    host = config.get("receiver", {}).get("host", "127.0.0.1")
    port = config.get("receiver", {}).get("port", 9099)
@@ -194,5 +240,5 @@ def run_server(config: dict):
    AlertHandler.config = config
    server = HTTPServer((host, port), AlertHandler)
-    log.info(f"ns8-backup-monitor receiver listening on {host}:{port}")
+    log.info("ns8-backup-monitor receiver listening on %s:%d", host, port)
    server.serve_forever()