fix: add NS8 native alertnames, extract id label as backup_id fallback, expand comments

This commit is contained in:
2026-05-18 21:55:22 +00:00
parent 39b78f4995
commit 8c6d085d46
+102 -56
View File
@@ -3,9 +3,9 @@
This module exposes a minimal HTTP server that Alertmanager POSTs This module exposes a minimal HTTP server that Alertmanager POSTs
backup-failure notifications to. On receiving a relevant alert it backup-failure notifications to. On receiving a relevant alert it
spawns a background daemon thread that runs the full analysis pipeline: spawns a background daemon thread that runs the full analysis pipeline::
correlator repo_check (only on non-SUCCESS) notifier correlator -> repo_check (only on non-SUCCESS) -> notifier
Why a background thread? Why a background thread?
------------------------ ------------------------
@@ -15,6 +15,28 @@ modules have time to write their final status into Redis before we read
it. Blocking the HTTP handler for 30+ seconds would cause Alertmanager it. Blocking the HTTP handler for 30+ seconds would cause Alertmanager
to retry the webhook, so the work is offloaded to a daemon thread. to retry the webhook, so the work is offloaded to a daemon thread.
Alert name mapping
------------------
NS8 ships two sets of Prometheus alert rules that can fire backup alerts:
Native stack (node_backup_status metric, present in all NS8 clusters):
backup_failed - one or more backup plans reported result != "success".
backup_missing - expected backup did not complete within the time window.
Custom / legacy rules (added manually or present in older NS8 versions):
NsBackupFailed - same semantics as backup_failed.
NsBackupMissing - same semantics as backup_missing.
All four names are matched. Any other alertname received on this webhook
is silently ignored so unrelated Alertmanager alerts do not generate noise.
Label mapping
-------------
NS8 native alerts carry the backup plan identifier in the ``id`` label.
Custom / legacy alerts may carry it in ``backup_id`` instead.
Both labels are checked when extracting plan IDs so the correlator can
look up the correct Redis keys regardless of which rule set fired.
Endpoints Endpoints
--------- ---------
POST /alert Accepts an Alertmanager JSON payload. POST /alert Accepts an Alertmanager JSON payload.
@@ -26,7 +48,6 @@ import logging
import threading import threading
import time import time
from http.server import BaseHTTPRequestHandler, HTTPServer from http.server import BaseHTTPRequestHandler, HTTPServer
from typing import Any
from .correlator import correlate_backup_status from .correlator import correlate_backup_status
from .notifier import send_notification from .notifier import send_notification
@@ -38,10 +59,22 @@ log = logging.getLogger(__name__)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Alert filter # Alert filter
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Only these Alertmanager alert names trigger the pipeline. # Alert names that trigger the analysis pipeline.
# NsBackupFailed one or more modules reported an error. #
# NsBackupMissing expected backup did not run within the time window. # NS8 native monitoring stack (Prometheus node_backup_status rules):
ALERT_NAMES = {"NsBackupFailed", "NsBackupMissing"} # backup_failed - emitted when node_backup_status == 0 for one or more plans.
# backup_missing - emitted when no backup completed within the expected window.
#
# Custom / legacy alert rule names (kept for backward compatibility with NS8
# clusters that have manually configured or older rule sets):
# NsBackupFailed - same semantic as backup_failed.
# NsBackupMissing - same semantic as backup_missing.
ALERT_NAMES = {
"backup_failed", # NS8 native - canonical name from node_backup_status rules
"backup_missing", # NS8 native - missing / timed-out backup
"NsBackupFailed", # Legacy custom rule name (backward compatibility)
"NsBackupMissing", # Legacy custom rule name (backward compatibility)
}
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -59,7 +92,7 @@ class AlertHandler(BaseHTTPRequestHandler):
config: dict = {} config: dict = {}
def do_POST(self): def do_POST(self):
"""Handle POST /alert the only supported endpoint.""" """Handle POST /alert - the only supported endpoint."""
# Reject any path other than /alert. # Reject any path other than /alert.
if self.path != "/alert": if self.path != "/alert":
@@ -67,9 +100,7 @@ class AlertHandler(BaseHTTPRequestHandler):
self.end_headers() self.end_headers()
return return
# --------------------------------------------------------------------------- # Read and parse the request body.
# Read and parse the request body
# ---------------------------------------------------------------------------
length = int(self.headers.get("Content-Length", 0)) length = int(self.headers.get("Content-Length", 0))
body = self.rfile.read(length) body = self.rfile.read(length)
@@ -85,32 +116,39 @@ class AlertHandler(BaseHTTPRequestHandler):
self.send_response(200) self.send_response(200)
self.end_headers() self.end_headers()
# --------------------------------------------------------------------------- # Filter relevant alerts.
# Filter relevant alerts # Only process alerts that are currently firing and match one of the
# --------------------------------------------------------------------------- # known backup alert names (see ALERT_NAMES above).
# Only process alerts that are currently firing and match our alert names. # Resolved alerts (status == "resolved") are intentionally ignored:
# Resolved alerts (status == "resolved") are intentionally ignored because # the NS8 monitoring stack clears alerts once the condition is gone
# the NS8 monitoring stack clears alerts once the condition is gone; we do # and we do not need a "backup OK" notification path here.
# not need a separate "backup OK" notification path here.
alerts = payload.get("alerts", []) alerts = payload.get("alerts", [])
relevant = [ relevant = []
a for a in alerts for a in alerts:
if a.get("labels", {}).get("alertname") in ALERT_NAMES labels = a.get("labels", {})
and a.get("status") == "firing" alertname = labels.get("alertname", "")
] status = a.get("status", "")
# Log every received alert at DEBUG level to aid troubleshooting
# without requiring packet captures or full Alertmanager debug mode.
log.debug(
"Received alert: alertname=%r status=%r id=%r name=%r",
alertname,
status,
labels.get("id"),
labels.get("name"),
)
if alertname in ALERT_NAMES and status == "firing":
relevant.append(a)
# ---------------------------------------------------------------------------
# Schedule the pipeline in a background thread
# ---------------------------------------------------------------------------
if relevant: if relevant:
log.info(f"Received {len(relevant)} relevant alert(s), scheduling pipeline") log.info("Received %d relevant alert(s), scheduling pipeline", len(relevant))
wait = self.config.get("correlator", {}).get("wait_seconds", 30) wait = self.config.get("correlator", {}).get("wait_seconds", 30)
t = threading.Thread( t = threading.Thread(
target=_run_pipeline, target=_run_pipeline,
args=(relevant, self.config, wait), args=(relevant, self.config, wait),
# Daemon thread: will not prevent the process from exiting if # Daemon thread: will not prevent the process from exiting if
# systemd sends SIGTERM while a pipeline run is in progress. # systemd sends SIGTERM while a pipeline run is in progress.
daemon=True daemon=True,
) )
t.start() t.start()
else: else:
@@ -118,7 +156,7 @@ class AlertHandler(BaseHTTPRequestHandler):
def log_message(self, fmt, *args): def log_message(self, fmt, *args):
"""Redirect BaseHTTPRequestHandler access logs to the module logger.""" """Redirect BaseHTTPRequestHandler access logs to the module logger."""
log.debug(f"HTTP: {fmt % args}") log.debug("HTTP: " + fmt, *args)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -131,46 +169,52 @@ def _run_pipeline(alerts: list, config: dict, wait: int):
Steps Steps
----- -----
1. Sleep ``wait`` seconds so backup modules finish writing to Redis. 1. Sleep ``wait`` seconds so backup modules finish writing to Redis.
2. Extract backup_ids from alert labels when available; fall back to 2. Extract backup_ids from alert labels.
scanning Redis for recently updated plan status keys. - Label ``backup_id``: used by custom / legacy rules.
- Label ``id``: used by NS8 native rules (backup_failed /
backup_missing carry the plan identifier here).
Both are checked; duplicates are removed so each plan is queried once.
When no IDs are found the correlator falls back to a broad Redis scan.
3. Run the correlator to classify the overall outcome. 3. Run the correlator to classify the overall outcome.
4. If the outcome is not SUCCESS, run the repository health check to 4. If the outcome is not SUCCESS, run the repository health check to
gather additional diagnostic information. gather additional diagnostic information.
5. Send the email notification. 5. Send the email notification.
""" """
log.info(f"Waiting {wait}s before status check...") log.info("Waiting %ds before status check...", wait)
time.sleep(wait) time.sleep(wait)
# --------------------------------------------------------------------------- # Extract plan ids from alert labels.
# Extract backup_ids from alert labels # Priority:
# --------------------------------------------------------------------------- # 1. Label "backup_id" - custom / legacy alert rules.
# Alertmanager may include a ``backup_id`` label on the alert. When present # 2. Label "id" - NS8 native rules (backup_failed / backup_missing
# it is used to read the exact Redis keys for that plan. When absent the # carry the plan identifier in this label, not in
# correlator falls back to scanning for recent plan status keys. # "backup_id"). This was the root cause of the
backup_ids = list({ # pipeline not triggering on automatic scheduled
a["labels"].get("backup_id", "") # backups when only "id" was present in the alert.
for a in alerts # A set is used to deduplicate so each plan is only queried once.
if a["labels"].get("backup_id") seen: set = set()
}) backup_ids: list = []
for a in alerts:
labels = a.get("labels", {})
bid = labels.get("backup_id") or labels.get("id")
if bid and bid not in seen:
seen.add(bid)
backup_ids.append(bid)
if backup_ids:
log.info("Resolved backup_ids from alert labels: %s", backup_ids)
else:
log.info("No backup_id in alert labels; correlator will scan all recent Redis keys")
# ---------------------------------------------------------------------------
# Correlation
# ---------------------------------------------------------------------------
log.info("Running correlator...") log.info("Running correlator...")
correlation = correlate_backup_status(config, backup_ids) correlation = correlate_backup_status(config, backup_ids)
# --------------------------------------------------------------------------- # Skip repo check on SUCCESS to avoid unnecessary restic network calls.
# Repository health check (non-SUCCESS outcomes only)
# ---------------------------------------------------------------------------
# Skipped on SUCCESS to avoid unnecessary restic network calls.
repo_status = None repo_status = None
if correlation["outcome"] != "SUCCESS": if correlation["outcome"] != "SUCCESS":
log.info("Non-success outcome, running repo check...") log.info("Non-success outcome, running repo check...")
repo_status = check_repositories(config, correlation) repo_status = check_repositories(config, correlation)
# ---------------------------------------------------------------------------
# Email notification
# ---------------------------------------------------------------------------
log.info("Sending notification...") log.info("Sending notification...")
send_notification(config, alerts, correlation, repo_status) send_notification(config, alerts, correlation, repo_status)
@@ -182,8 +226,10 @@ def _run_pipeline(alerts: list, config: dict, wait: int):
def run_server(config: dict): def run_server(config: dict):
"""Bind the HTTP server to the configured host/port and serve forever. """Bind the HTTP server to the configured host/port and serve forever.
The host and port are read from the ``receiver`` section of the config. The host and port are read from the ``receiver`` section of the config
Defaults to localhost:9099 to avoid accidental exposure on public interfaces. file. Defaults to 127.0.0.1:9099 to avoid accidental exposure on public
interfaces - change ``receiver.host`` to ``0.0.0.0`` only if the webhook
endpoint needs to be reachable from a remote Alertmanager instance.
""" """
host = config.get("receiver", {}).get("host", "127.0.0.1") host = config.get("receiver", {}).get("host", "127.0.0.1")
port = config.get("receiver", {}).get("port", 9099) port = config.get("receiver", {}).get("port", 9099)
@@ -194,5 +240,5 @@ def run_server(config: dict):
AlertHandler.config = config AlertHandler.config = config
server = HTTPServer((host, port), AlertHandler) server = HTTPServer((host, port), AlertHandler)
log.info(f"ns8-backup-monitor receiver listening on {host}:{port}") log.info("ns8-backup-monitor receiver listening on %s:%d", host, port)
server.serve_forever() server.serve_forever()