fix: add NS8 native alertnames, extract id label as backup_id fallback, expand comments

2026-05-18 21:55:22 +00:00
parent 39b78f4995
commit 8c6d085d46
1 changed files with 102 additions and 56 deletions
@@ -3,9 +3,9 @@

 This module exposes a minimal HTTP server that Alertmanager POSTs
 backup-failure notifications to. On receiving a relevant alert it
-spawns a background daemon thread that runs the full analysis pipeline:
+spawns a background daemon thread that runs the full analysis pipeline::

-    correlator  →  repo_check (only on non-SUCCESS)  →  notifier
+    correlator  ->  repo_check (only on non-SUCCESS)  ->  notifier

 Why a background thread?
 ------------------------
@@ -15,6 +15,28 @@ modules have time to write their final status into Redis before we read
 it. Blocking the HTTP handler for 30+ seconds would cause Alertmanager
 to retry the webhook, so the work is offloaded to a daemon thread.

+Alert name mapping
+------------------
+NS8 ships two sets of Prometheus alert rules that can fire backup alerts:
+
+  Native stack (node_backup_status metric, present in all NS8 clusters):
+    backup_failed   - one or more backup plans reported result != "success".
+    backup_missing  - expected backup did not complete within the time window.
+
+  Custom / legacy rules (added manually or present in older NS8 versions):
+    NsBackupFailed  - same semantics as backup_failed.
+    NsBackupMissing - same semantics as backup_missing.
+
+All four names are matched. Any other alertname received on this webhook
+is silently ignored so unrelated Alertmanager alerts do not generate noise.
+
+Label mapping
+-------------
+NS8 native alerts carry the backup plan identifier in the ``id`` label.
+Custom / legacy alerts may carry it in ``backup_id`` instead.
+Both labels are checked when extracting plan IDs so the correlator can
+look up the correct Redis keys regardless of which rule set fired.
+
 Endpoints
 ---------
 POST /alert   Accepts an Alertmanager JSON payload.
@@ -26,7 +48,6 @@ import logging
 import threading
 import time
 from http.server import BaseHTTPRequestHandler, HTTPServer
-from typing import Any

 from .correlator import correlate_backup_status
 from .notifier import send_notification
@@ -38,10 +59,22 @@ log = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Alert filter
 # ---------------------------------------------------------------------------
-# Only these Alertmanager alert names trigger the pipeline.
-# NsBackupFailed  – one or more modules reported an error.
-# NsBackupMissing – expected backup did not run within the time window.
-ALERT_NAMES = {"NsBackupFailed", "NsBackupMissing"}
+# Alert names that trigger the analysis pipeline.
+#
+# NS8 native monitoring stack (Prometheus node_backup_status rules):
+#   backup_failed   - emitted when node_backup_status == 0 for one or more plans.
+#   backup_missing  - emitted when no backup completed within the expected window.
+#
+# Custom / legacy alert rule names (kept for backward compatibility with NS8
+# clusters that have manually configured or older rule sets):
+#   NsBackupFailed  - same semantic as backup_failed.
+#   NsBackupMissing - same semantic as backup_missing.
+ALERT_NAMES = {
+    "backup_failed",    # NS8 native - canonical name from node_backup_status rules
+    "backup_missing",   # NS8 native - missing / timed-out backup
+    "NsBackupFailed",   # Legacy custom rule name (backward compatibility)
+    "NsBackupMissing",  # Legacy custom rule name (backward compatibility)
+}


 # ---------------------------------------------------------------------------
@@ -59,7 +92,7 @@ class AlertHandler(BaseHTTPRequestHandler):
    config: dict = {}

    def do_POST(self):
-        """Handle POST /alert — the only supported endpoint."""
+        """Handle POST /alert - the only supported endpoint."""

        # Reject any path other than /alert.
        if self.path != "/alert":
@@ -67,9 +100,7 @@ class AlertHandler(BaseHTTPRequestHandler):
            self.end_headers()
            return

-        # ---------------------------------------------------------------------------
-        # Read and parse the request body
-        # ---------------------------------------------------------------------------
+        # Read and parse the request body.
        length = int(self.headers.get("Content-Length", 0))
        body = self.rfile.read(length)

@@ -85,32 +116,39 @@ class AlertHandler(BaseHTTPRequestHandler):
        self.send_response(200)
        self.end_headers()

-        # ---------------------------------------------------------------------------
-        # Filter relevant alerts
-        # ---------------------------------------------------------------------------
-        # Only process alerts that are currently firing and match our alert names.
-        # Resolved alerts (status == "resolved") are intentionally ignored because
-        # the NS8 monitoring stack clears alerts once the condition is gone; we do
-        # not need a separate "backup OK" notification path here.
+        # Filter relevant alerts.
+        # Only process alerts that are currently firing and match one of the
+        # known backup alert names (see ALERT_NAMES above).
+        # Resolved alerts (status == "resolved") are intentionally ignored:
+        # the NS8 monitoring stack clears alerts once the condition is gone
+        # and we do not need a "backup OK" notification path here.
        alerts = payload.get("alerts", [])
-        relevant = [
-            a for a in alerts
-            if a.get("labels", {}).get("alertname") in ALERT_NAMES
-            and a.get("status") == "firing"
-        ]
+        relevant = []
+        for a in alerts:
+            labels = a.get("labels", {})
+            alertname = labels.get("alertname", "")
+            status = a.get("status", "")
+            # Log every received alert at DEBUG level to aid troubleshooting
+            # without requiring packet captures or full Alertmanager debug mode.
+            log.debug(
+                "Received alert: alertname=%r status=%r id=%r name=%r",
+                alertname,
+                status,
+                labels.get("id"),
+                labels.get("name"),
+            )
+            if alertname in ALERT_NAMES and status == "firing":
+                relevant.append(a)

-        # ---------------------------------------------------------------------------
-        # Schedule the pipeline in a background thread
-        # ---------------------------------------------------------------------------
        if relevant:
-            log.info(f"Received {len(relevant)} relevant alert(s), scheduling pipeline")
+            log.info("Received %d relevant alert(s), scheduling pipeline", len(relevant))
            wait = self.config.get("correlator", {}).get("wait_seconds", 30)
            t = threading.Thread(
                target=_run_pipeline,
                args=(relevant, self.config, wait),
                # Daemon thread: will not prevent the process from exiting if
                # systemd sends SIGTERM while a pipeline run is in progress.
-                daemon=True
+                daemon=True,
            )
            t.start()
        else:
@@ -118,7 +156,7 @@ class AlertHandler(BaseHTTPRequestHandler):

    def log_message(self, fmt, *args):
        """Redirect BaseHTTPRequestHandler access logs to the module logger."""
-        log.debug(f"HTTP: {fmt % args}")
+        log.debug("HTTP: " + fmt, *args)


 # ---------------------------------------------------------------------------
@@ -131,46 +169,52 @@ def _run_pipeline(alerts: list, config: dict, wait: int):
    Steps
    -----
    1. Sleep ``wait`` seconds so backup modules finish writing to Redis.
-    2. Extract backup_ids from alert labels when available; fall back to
-       scanning Redis for recently updated plan status keys.
+    2. Extract backup_ids from alert labels.
+       - Label ``backup_id``: used by custom / legacy rules.
+       - Label ``id``:        used by NS8 native rules (backup_failed /
+                              backup_missing carry the plan identifier here).
+       Both are checked; duplicates are removed so each plan is queried once.
+       When no IDs are found the correlator falls back to a broad Redis scan.
    3. Run the correlator to classify the overall outcome.
    4. If the outcome is not SUCCESS, run the repository health check to
       gather additional diagnostic information.
    5. Send the email notification.
    """
-    log.info(f"Waiting {wait}s before status check...")
+    log.info("Waiting %ds before status check...", wait)
    time.sleep(wait)

-    # ---------------------------------------------------------------------------
-    # Extract backup_ids from alert labels
-    # ---------------------------------------------------------------------------
-    # Alertmanager may include a ``backup_id`` label on the alert. When present
-    # it is used to read the exact Redis keys for that plan. When absent the
-    # correlator falls back to scanning for recent plan status keys.
-    backup_ids = list({
-        a["labels"].get("backup_id", "")
-        for a in alerts
-        if a["labels"].get("backup_id")
-    })
+    # Extract plan ids from alert labels.
+    # Priority:
+    #   1. Label "backup_id" - custom / legacy alert rules.
+    #   2. Label "id"        - NS8 native rules (backup_failed / backup_missing
+    #                          carry the plan identifier in this label, not in
+    #                          "backup_id"). This was the root cause of the
+    #                          pipeline not triggering on automatic scheduled
+    #                          backups when only "id" was present in the alert.
+    # A set is used to deduplicate so each plan is only queried once.
+    seen: set = set()
+    backup_ids: list = []
+    for a in alerts:
+        labels = a.get("labels", {})
+        bid = labels.get("backup_id") or labels.get("id")
+        if bid and bid not in seen:
+            seen.add(bid)
+            backup_ids.append(bid)
+
+    if backup_ids:
+        log.info("Resolved backup_ids from alert labels: %s", backup_ids)
+    else:
+        log.info("No backup_id in alert labels; correlator will scan all recent Redis keys")

-    # ---------------------------------------------------------------------------
-    # Correlation
-    # ---------------------------------------------------------------------------
    log.info("Running correlator...")
    correlation = correlate_backup_status(config, backup_ids)

-    # ---------------------------------------------------------------------------
-    # Repository health check (non-SUCCESS outcomes only)
-    # ---------------------------------------------------------------------------
-    # Skipped on SUCCESS to avoid unnecessary restic network calls.
+    # Skip repo check on SUCCESS to avoid unnecessary restic network calls.
    repo_status = None
    if correlation["outcome"] != "SUCCESS":
        log.info("Non-success outcome, running repo check...")
        repo_status = check_repositories(config, correlation)

-    # ---------------------------------------------------------------------------
-    # Email notification
-    # ---------------------------------------------------------------------------
    log.info("Sending notification...")
    send_notification(config, alerts, correlation, repo_status)

@@ -182,8 +226,10 @@ def _run_pipeline(alerts: list, config: dict, wait: int):
 def run_server(config: dict):
    """Bind the HTTP server to the configured host/port and serve forever.

-    The host and port are read from the ``receiver`` section of the config.
-    Defaults to localhost:9099 to avoid accidental exposure on public interfaces.
+    The host and port are read from the ``receiver`` section of the config
+    file. Defaults to 127.0.0.1:9099 to avoid accidental exposure on public
+    interfaces - change ``receiver.host`` to ``0.0.0.0`` only if the webhook
+    endpoint needs to be reachable from a remote Alertmanager instance.
    """
    host = config.get("receiver", {}).get("host", "127.0.0.1")
    port = config.get("receiver", {}).get("port", 9099)
@@ -194,5 +240,5 @@ def run_server(config: dict):
    AlertHandler.config = config

    server = HTTPServer((host, port), AlertHandler)
-    log.info(f"ns8-backup-monitor receiver listening on {host}:{port}")
+    log.info("ns8-backup-monitor receiver listening on %s:%d", host, port)
    server.serve_forever()