fix: add NS8 native alertnames, extract id label as backup_id fallback, expand comments
This commit is contained in:
+102
-56
@@ -3,9 +3,9 @@
|
||||
|
||||
This module exposes a minimal HTTP server that Alertmanager POSTs
|
||||
backup-failure notifications to. On receiving a relevant alert it
|
||||
spawns a background daemon thread that runs the full analysis pipeline:
|
||||
spawns a background daemon thread that runs the full analysis pipeline::
|
||||
|
||||
correlator → repo_check (only on non-SUCCESS) → notifier
|
||||
correlator -> repo_check (only on non-SUCCESS) -> notifier
|
||||
|
||||
Why a background thread?
|
||||
------------------------
|
||||
@@ -15,6 +15,28 @@ modules have time to write their final status into Redis before we read
|
||||
it. Blocking the HTTP handler for 30+ seconds would cause Alertmanager
|
||||
to retry the webhook, so the work is offloaded to a daemon thread.
|
||||
|
||||
Alert name mapping
|
||||
------------------
|
||||
NS8 ships two sets of Prometheus alert rules that can fire backup alerts:
|
||||
|
||||
Native stack (node_backup_status metric, present in all NS8 clusters):
|
||||
backup_failed - one or more backup plans reported result != "success".
|
||||
backup_missing - expected backup did not complete within the time window.
|
||||
|
||||
Custom / legacy rules (added manually or present in older NS8 versions):
|
||||
NsBackupFailed - same semantics as backup_failed.
|
||||
NsBackupMissing - same semantics as backup_missing.
|
||||
|
||||
All four names are matched. Any other alertname received on this webhook
|
||||
is silently ignored so unrelated Alertmanager alerts do not generate noise.
|
||||
|
||||
Label mapping
|
||||
-------------
|
||||
NS8 native alerts carry the backup plan identifier in the ``id`` label.
|
||||
Custom / legacy alerts may carry it in ``backup_id`` instead.
|
||||
Both labels are checked when extracting plan IDs so the correlator can
|
||||
look up the correct Redis keys regardless of which rule set fired.
|
||||
|
||||
Endpoints
|
||||
---------
|
||||
POST /alert Accepts an Alertmanager JSON payload.
|
||||
@@ -26,7 +48,6 @@ import logging
|
||||
import threading
|
||||
import time
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
from typing import Any
|
||||
|
||||
from .correlator import correlate_backup_status
|
||||
from .notifier import send_notification
|
||||
@@ -38,10 +59,22 @@ log = logging.getLogger(__name__)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Alert filter
|
||||
# ---------------------------------------------------------------------------
|
||||
# Only these Alertmanager alert names trigger the pipeline.
|
||||
# NsBackupFailed – one or more modules reported an error.
|
||||
# NsBackupMissing – expected backup did not run within the time window.
|
||||
ALERT_NAMES = {"NsBackupFailed", "NsBackupMissing"}
|
||||
# Alert names that trigger the analysis pipeline.
|
||||
#
|
||||
# NS8 native monitoring stack (Prometheus node_backup_status rules):
|
||||
# backup_failed - emitted when node_backup_status == 0 for one or more plans.
|
||||
# backup_missing - emitted when no backup completed within the expected window.
|
||||
#
|
||||
# Custom / legacy alert rule names (kept for backward compatibility with NS8
|
||||
# clusters that have manually configured or older rule sets):
|
||||
# NsBackupFailed - same semantic as backup_failed.
|
||||
# NsBackupMissing - same semantic as backup_missing.
|
||||
ALERT_NAMES = {
|
||||
"backup_failed", # NS8 native - canonical name from node_backup_status rules
|
||||
"backup_missing", # NS8 native - missing / timed-out backup
|
||||
"NsBackupFailed", # Legacy custom rule name (backward compatibility)
|
||||
"NsBackupMissing", # Legacy custom rule name (backward compatibility)
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -59,7 +92,7 @@ class AlertHandler(BaseHTTPRequestHandler):
|
||||
config: dict = {}
|
||||
|
||||
def do_POST(self):
|
||||
"""Handle POST /alert — the only supported endpoint."""
|
||||
"""Handle POST /alert - the only supported endpoint."""
|
||||
|
||||
# Reject any path other than /alert.
|
||||
if self.path != "/alert":
|
||||
@@ -67,9 +100,7 @@ class AlertHandler(BaseHTTPRequestHandler):
|
||||
self.end_headers()
|
||||
return
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Read and parse the request body
|
||||
# ---------------------------------------------------------------------------
|
||||
# Read and parse the request body.
|
||||
length = int(self.headers.get("Content-Length", 0))
|
||||
body = self.rfile.read(length)
|
||||
|
||||
@@ -85,32 +116,39 @@ class AlertHandler(BaseHTTPRequestHandler):
|
||||
self.send_response(200)
|
||||
self.end_headers()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Filter relevant alerts
|
||||
# ---------------------------------------------------------------------------
|
||||
# Only process alerts that are currently firing and match our alert names.
|
||||
# Resolved alerts (status == "resolved") are intentionally ignored because
|
||||
# the NS8 monitoring stack clears alerts once the condition is gone; we do
|
||||
# not need a separate "backup OK" notification path here.
|
||||
# Filter relevant alerts.
|
||||
# Only process alerts that are currently firing and match one of the
|
||||
# known backup alert names (see ALERT_NAMES above).
|
||||
# Resolved alerts (status == "resolved") are intentionally ignored:
|
||||
# the NS8 monitoring stack clears alerts once the condition is gone
|
||||
# and we do not need a "backup OK" notification path here.
|
||||
alerts = payload.get("alerts", [])
|
||||
relevant = [
|
||||
a for a in alerts
|
||||
if a.get("labels", {}).get("alertname") in ALERT_NAMES
|
||||
and a.get("status") == "firing"
|
||||
]
|
||||
relevant = []
|
||||
for a in alerts:
|
||||
labels = a.get("labels", {})
|
||||
alertname = labels.get("alertname", "")
|
||||
status = a.get("status", "")
|
||||
# Log every received alert at DEBUG level to aid troubleshooting
|
||||
# without requiring packet captures or full Alertmanager debug mode.
|
||||
log.debug(
|
||||
"Received alert: alertname=%r status=%r id=%r name=%r",
|
||||
alertname,
|
||||
status,
|
||||
labels.get("id"),
|
||||
labels.get("name"),
|
||||
)
|
||||
if alertname in ALERT_NAMES and status == "firing":
|
||||
relevant.append(a)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Schedule the pipeline in a background thread
|
||||
# ---------------------------------------------------------------------------
|
||||
if relevant:
|
||||
log.info(f"Received {len(relevant)} relevant alert(s), scheduling pipeline")
|
||||
log.info("Received %d relevant alert(s), scheduling pipeline", len(relevant))
|
||||
wait = self.config.get("correlator", {}).get("wait_seconds", 30)
|
||||
t = threading.Thread(
|
||||
target=_run_pipeline,
|
||||
args=(relevant, self.config, wait),
|
||||
# Daemon thread: will not prevent the process from exiting if
|
||||
# systemd sends SIGTERM while a pipeline run is in progress.
|
||||
daemon=True
|
||||
daemon=True,
|
||||
)
|
||||
t.start()
|
||||
else:
|
||||
@@ -118,7 +156,7 @@ class AlertHandler(BaseHTTPRequestHandler):
|
||||
|
||||
def log_message(self, fmt, *args):
|
||||
"""Redirect BaseHTTPRequestHandler access logs to the module logger."""
|
||||
log.debug(f"HTTP: {fmt % args}")
|
||||
log.debug("HTTP: " + fmt, *args)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -131,46 +169,52 @@ def _run_pipeline(alerts: list, config: dict, wait: int):
|
||||
Steps
|
||||
-----
|
||||
1. Sleep ``wait`` seconds so backup modules finish writing to Redis.
|
||||
2. Extract backup_ids from alert labels when available; fall back to
|
||||
scanning Redis for recently updated plan status keys.
|
||||
2. Extract backup_ids from alert labels.
|
||||
- Label ``backup_id``: used by custom / legacy rules.
|
||||
- Label ``id``: used by NS8 native rules (backup_failed /
|
||||
backup_missing carry the plan identifier here).
|
||||
Both are checked; duplicates are removed so each plan is queried once.
|
||||
When no IDs are found the correlator falls back to a broad Redis scan.
|
||||
3. Run the correlator to classify the overall outcome.
|
||||
4. If the outcome is not SUCCESS, run the repository health check to
|
||||
gather additional diagnostic information.
|
||||
5. Send the email notification.
|
||||
"""
|
||||
log.info(f"Waiting {wait}s before status check...")
|
||||
log.info("Waiting %ds before status check...", wait)
|
||||
time.sleep(wait)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Extract backup_ids from alert labels
|
||||
# ---------------------------------------------------------------------------
|
||||
# Alertmanager may include a ``backup_id`` label on the alert. When present
|
||||
# it is used to read the exact Redis keys for that plan. When absent the
|
||||
# correlator falls back to scanning for recent plan status keys.
|
||||
backup_ids = list({
|
||||
a["labels"].get("backup_id", "")
|
||||
for a in alerts
|
||||
if a["labels"].get("backup_id")
|
||||
})
|
||||
# Extract plan ids from alert labels.
|
||||
# Priority:
|
||||
# 1. Label "backup_id" - custom / legacy alert rules.
|
||||
# 2. Label "id" - NS8 native rules (backup_failed / backup_missing
|
||||
# carry the plan identifier in this label, not in
|
||||
# "backup_id"). This was the root cause of the
|
||||
# pipeline not triggering on automatic scheduled
|
||||
# backups when only "id" was present in the alert.
|
||||
# A set is used to deduplicate so each plan is only queried once.
|
||||
seen: set = set()
|
||||
backup_ids: list = []
|
||||
for a in alerts:
|
||||
labels = a.get("labels", {})
|
||||
bid = labels.get("backup_id") or labels.get("id")
|
||||
if bid and bid not in seen:
|
||||
seen.add(bid)
|
||||
backup_ids.append(bid)
|
||||
|
||||
if backup_ids:
|
||||
log.info("Resolved backup_ids from alert labels: %s", backup_ids)
|
||||
else:
|
||||
log.info("No backup_id in alert labels; correlator will scan all recent Redis keys")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Correlation
|
||||
# ---------------------------------------------------------------------------
|
||||
log.info("Running correlator...")
|
||||
correlation = correlate_backup_status(config, backup_ids)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Repository health check (non-SUCCESS outcomes only)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Skipped on SUCCESS to avoid unnecessary restic network calls.
|
||||
# Skip repo check on SUCCESS to avoid unnecessary restic network calls.
|
||||
repo_status = None
|
||||
if correlation["outcome"] != "SUCCESS":
|
||||
log.info("Non-success outcome, running repo check...")
|
||||
repo_status = check_repositories(config, correlation)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Email notification
|
||||
# ---------------------------------------------------------------------------
|
||||
log.info("Sending notification...")
|
||||
send_notification(config, alerts, correlation, repo_status)
|
||||
|
||||
@@ -182,8 +226,10 @@ def _run_pipeline(alerts: list, config: dict, wait: int):
|
||||
def run_server(config: dict):
|
||||
"""Bind the HTTP server to the configured host/port and serve forever.
|
||||
|
||||
The host and port are read from the ``receiver`` section of the config.
|
||||
Defaults to localhost:9099 to avoid accidental exposure on public interfaces.
|
||||
The host and port are read from the ``receiver`` section of the config
|
||||
file. Defaults to 127.0.0.1:9099 to avoid accidental exposure on public
|
||||
interfaces - change ``receiver.host`` to ``0.0.0.0`` only if the webhook
|
||||
endpoint needs to be reachable from a remote Alertmanager instance.
|
||||
"""
|
||||
host = config.get("receiver", {}).get("host", "127.0.0.1")
|
||||
port = config.get("receiver", {}).get("port", 9099)
|
||||
@@ -194,5 +240,5 @@ def run_server(config: dict):
|
||||
AlertHandler.config = config
|
||||
|
||||
server = HTTPServer((host, port), AlertHandler)
|
||||
log.info(f"ns8-backup-monitor receiver listening on {host}:{port}")
|
||||
log.info("ns8-backup-monitor receiver listening on %s:%d", host, port)
|
||||
server.serve_forever()
|
||||
|
||||
Reference in New Issue
Block a user