fix: add NS8 native alertnames, extract id label as backup_id fallback, expand comments
This commit is contained in:
+102
-56
@@ -3,9 +3,9 @@
|
|||||||
|
|
||||||
This module exposes a minimal HTTP server that Alertmanager POSTs
|
This module exposes a minimal HTTP server that Alertmanager POSTs
|
||||||
backup-failure notifications to. On receiving a relevant alert it
|
backup-failure notifications to. On receiving a relevant alert it
|
||||||
spawns a background daemon thread that runs the full analysis pipeline:
|
spawns a background daemon thread that runs the full analysis pipeline::
|
||||||
|
|
||||||
correlator → repo_check (only on non-SUCCESS) → notifier
|
correlator -> repo_check (only on non-SUCCESS) -> notifier
|
||||||
|
|
||||||
Why a background thread?
|
Why a background thread?
|
||||||
------------------------
|
------------------------
|
||||||
@@ -15,6 +15,28 @@ modules have time to write their final status into Redis before we read
|
|||||||
it. Blocking the HTTP handler for 30+ seconds would cause Alertmanager
|
it. Blocking the HTTP handler for 30+ seconds would cause Alertmanager
|
||||||
to retry the webhook, so the work is offloaded to a daemon thread.
|
to retry the webhook, so the work is offloaded to a daemon thread.
|
||||||
|
|
||||||
|
Alert name mapping
|
||||||
|
------------------
|
||||||
|
NS8 ships two sets of Prometheus alert rules that can fire backup alerts:
|
||||||
|
|
||||||
|
Native stack (node_backup_status metric, present in all NS8 clusters):
|
||||||
|
backup_failed - one or more backup plans reported result != "success".
|
||||||
|
backup_missing - expected backup did not complete within the time window.
|
||||||
|
|
||||||
|
Custom / legacy rules (added manually or present in older NS8 versions):
|
||||||
|
NsBackupFailed - same semantics as backup_failed.
|
||||||
|
NsBackupMissing - same semantics as backup_missing.
|
||||||
|
|
||||||
|
All four names are matched. Any other alertname received on this webhook
|
||||||
|
is silently ignored so unrelated Alertmanager alerts do not generate noise.
|
||||||
|
|
||||||
|
Label mapping
|
||||||
|
-------------
|
||||||
|
NS8 native alerts carry the backup plan identifier in the ``id`` label.
|
||||||
|
Custom / legacy alerts may carry it in ``backup_id`` instead.
|
||||||
|
Both labels are checked when extracting plan IDs so the correlator can
|
||||||
|
look up the correct Redis keys regardless of which rule set fired.
|
||||||
|
|
||||||
Endpoints
|
Endpoints
|
||||||
---------
|
---------
|
||||||
POST /alert Accepts an Alertmanager JSON payload.
|
POST /alert Accepts an Alertmanager JSON payload.
|
||||||
@@ -26,7 +48,6 @@ import logging
|
|||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
from .correlator import correlate_backup_status
|
from .correlator import correlate_backup_status
|
||||||
from .notifier import send_notification
|
from .notifier import send_notification
|
||||||
@@ -38,10 +59,22 @@ log = logging.getLogger(__name__)
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Alert filter
|
# Alert filter
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Only these Alertmanager alert names trigger the pipeline.
|
# Alert names that trigger the analysis pipeline.
|
||||||
# NsBackupFailed – one or more modules reported an error.
|
#
|
||||||
# NsBackupMissing – expected backup did not run within the time window.
|
# NS8 native monitoring stack (Prometheus node_backup_status rules):
|
||||||
ALERT_NAMES = {"NsBackupFailed", "NsBackupMissing"}
|
# backup_failed - emitted when node_backup_status == 0 for one or more plans.
|
||||||
|
# backup_missing - emitted when no backup completed within the expected window.
|
||||||
|
#
|
||||||
|
# Custom / legacy alert rule names (kept for backward compatibility with NS8
|
||||||
|
# clusters that have manually configured or older rule sets):
|
||||||
|
# NsBackupFailed - same semantic as backup_failed.
|
||||||
|
# NsBackupMissing - same semantic as backup_missing.
|
||||||
|
ALERT_NAMES = {
|
||||||
|
"backup_failed", # NS8 native - canonical name from node_backup_status rules
|
||||||
|
"backup_missing", # NS8 native - missing / timed-out backup
|
||||||
|
"NsBackupFailed", # Legacy custom rule name (backward compatibility)
|
||||||
|
"NsBackupMissing", # Legacy custom rule name (backward compatibility)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -59,7 +92,7 @@ class AlertHandler(BaseHTTPRequestHandler):
|
|||||||
config: dict = {}
|
config: dict = {}
|
||||||
|
|
||||||
def do_POST(self):
|
def do_POST(self):
|
||||||
"""Handle POST /alert — the only supported endpoint."""
|
"""Handle POST /alert - the only supported endpoint."""
|
||||||
|
|
||||||
# Reject any path other than /alert.
|
# Reject any path other than /alert.
|
||||||
if self.path != "/alert":
|
if self.path != "/alert":
|
||||||
@@ -67,9 +100,7 @@ class AlertHandler(BaseHTTPRequestHandler):
|
|||||||
self.end_headers()
|
self.end_headers()
|
||||||
return
|
return
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# Read and parse the request body.
|
||||||
# Read and parse the request body
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
length = int(self.headers.get("Content-Length", 0))
|
length = int(self.headers.get("Content-Length", 0))
|
||||||
body = self.rfile.read(length)
|
body = self.rfile.read(length)
|
||||||
|
|
||||||
@@ -85,32 +116,39 @@ class AlertHandler(BaseHTTPRequestHandler):
|
|||||||
self.send_response(200)
|
self.send_response(200)
|
||||||
self.end_headers()
|
self.end_headers()
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# Filter relevant alerts.
|
||||||
# Filter relevant alerts
|
# Only process alerts that are currently firing and match one of the
|
||||||
# ---------------------------------------------------------------------------
|
# known backup alert names (see ALERT_NAMES above).
|
||||||
# Only process alerts that are currently firing and match our alert names.
|
# Resolved alerts (status == "resolved") are intentionally ignored:
|
||||||
# Resolved alerts (status == "resolved") are intentionally ignored because
|
# the NS8 monitoring stack clears alerts once the condition is gone
|
||||||
# the NS8 monitoring stack clears alerts once the condition is gone; we do
|
# and we do not need a "backup OK" notification path here.
|
||||||
# not need a separate "backup OK" notification path here.
|
|
||||||
alerts = payload.get("alerts", [])
|
alerts = payload.get("alerts", [])
|
||||||
relevant = [
|
relevant = []
|
||||||
a for a in alerts
|
for a in alerts:
|
||||||
if a.get("labels", {}).get("alertname") in ALERT_NAMES
|
labels = a.get("labels", {})
|
||||||
and a.get("status") == "firing"
|
alertname = labels.get("alertname", "")
|
||||||
]
|
status = a.get("status", "")
|
||||||
|
# Log every received alert at DEBUG level to aid troubleshooting
|
||||||
|
# without requiring packet captures or full Alertmanager debug mode.
|
||||||
|
log.debug(
|
||||||
|
"Received alert: alertname=%r status=%r id=%r name=%r",
|
||||||
|
alertname,
|
||||||
|
status,
|
||||||
|
labels.get("id"),
|
||||||
|
labels.get("name"),
|
||||||
|
)
|
||||||
|
if alertname in ALERT_NAMES and status == "firing":
|
||||||
|
relevant.append(a)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Schedule the pipeline in a background thread
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
if relevant:
|
if relevant:
|
||||||
log.info(f"Received {len(relevant)} relevant alert(s), scheduling pipeline")
|
log.info("Received %d relevant alert(s), scheduling pipeline", len(relevant))
|
||||||
wait = self.config.get("correlator", {}).get("wait_seconds", 30)
|
wait = self.config.get("correlator", {}).get("wait_seconds", 30)
|
||||||
t = threading.Thread(
|
t = threading.Thread(
|
||||||
target=_run_pipeline,
|
target=_run_pipeline,
|
||||||
args=(relevant, self.config, wait),
|
args=(relevant, self.config, wait),
|
||||||
# Daemon thread: will not prevent the process from exiting if
|
# Daemon thread: will not prevent the process from exiting if
|
||||||
# systemd sends SIGTERM while a pipeline run is in progress.
|
# systemd sends SIGTERM while a pipeline run is in progress.
|
||||||
daemon=True
|
daemon=True,
|
||||||
)
|
)
|
||||||
t.start()
|
t.start()
|
||||||
else:
|
else:
|
||||||
@@ -118,7 +156,7 @@ class AlertHandler(BaseHTTPRequestHandler):
|
|||||||
|
|
||||||
def log_message(self, fmt, *args):
|
def log_message(self, fmt, *args):
|
||||||
"""Redirect BaseHTTPRequestHandler access logs to the module logger."""
|
"""Redirect BaseHTTPRequestHandler access logs to the module logger."""
|
||||||
log.debug(f"HTTP: {fmt % args}")
|
log.debug("HTTP: " + fmt, *args)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -131,46 +169,52 @@ def _run_pipeline(alerts: list, config: dict, wait: int):
|
|||||||
Steps
|
Steps
|
||||||
-----
|
-----
|
||||||
1. Sleep ``wait`` seconds so backup modules finish writing to Redis.
|
1. Sleep ``wait`` seconds so backup modules finish writing to Redis.
|
||||||
2. Extract backup_ids from alert labels when available; fall back to
|
2. Extract backup_ids from alert labels.
|
||||||
scanning Redis for recently updated plan status keys.
|
- Label ``backup_id``: used by custom / legacy rules.
|
||||||
|
- Label ``id``: used by NS8 native rules (backup_failed /
|
||||||
|
backup_missing carry the plan identifier here).
|
||||||
|
Both are checked; duplicates are removed so each plan is queried once.
|
||||||
|
When no IDs are found the correlator falls back to a broad Redis scan.
|
||||||
3. Run the correlator to classify the overall outcome.
|
3. Run the correlator to classify the overall outcome.
|
||||||
4. If the outcome is not SUCCESS, run the repository health check to
|
4. If the outcome is not SUCCESS, run the repository health check to
|
||||||
gather additional diagnostic information.
|
gather additional diagnostic information.
|
||||||
5. Send the email notification.
|
5. Send the email notification.
|
||||||
"""
|
"""
|
||||||
log.info(f"Waiting {wait}s before status check...")
|
log.info("Waiting %ds before status check...", wait)
|
||||||
time.sleep(wait)
|
time.sleep(wait)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# Extract plan ids from alert labels.
|
||||||
# Extract backup_ids from alert labels
|
# Priority:
|
||||||
# ---------------------------------------------------------------------------
|
# 1. Label "backup_id" - custom / legacy alert rules.
|
||||||
# Alertmanager may include a ``backup_id`` label on the alert. When present
|
# 2. Label "id" - NS8 native rules (backup_failed / backup_missing
|
||||||
# it is used to read the exact Redis keys for that plan. When absent the
|
# carry the plan identifier in this label, not in
|
||||||
# correlator falls back to scanning for recent plan status keys.
|
# "backup_id"). This was the root cause of the
|
||||||
backup_ids = list({
|
# pipeline not triggering on automatic scheduled
|
||||||
a["labels"].get("backup_id", "")
|
# backups when only "id" was present in the alert.
|
||||||
for a in alerts
|
# A set is used to deduplicate so each plan is only queried once.
|
||||||
if a["labels"].get("backup_id")
|
seen: set = set()
|
||||||
})
|
backup_ids: list = []
|
||||||
|
for a in alerts:
|
||||||
|
labels = a.get("labels", {})
|
||||||
|
bid = labels.get("backup_id") or labels.get("id")
|
||||||
|
if bid and bid not in seen:
|
||||||
|
seen.add(bid)
|
||||||
|
backup_ids.append(bid)
|
||||||
|
|
||||||
|
if backup_ids:
|
||||||
|
log.info("Resolved backup_ids from alert labels: %s", backup_ids)
|
||||||
|
else:
|
||||||
|
log.info("No backup_id in alert labels; correlator will scan all recent Redis keys")
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Correlation
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
log.info("Running correlator...")
|
log.info("Running correlator...")
|
||||||
correlation = correlate_backup_status(config, backup_ids)
|
correlation = correlate_backup_status(config, backup_ids)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# Skip repo check on SUCCESS to avoid unnecessary restic network calls.
|
||||||
# Repository health check (non-SUCCESS outcomes only)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Skipped on SUCCESS to avoid unnecessary restic network calls.
|
|
||||||
repo_status = None
|
repo_status = None
|
||||||
if correlation["outcome"] != "SUCCESS":
|
if correlation["outcome"] != "SUCCESS":
|
||||||
log.info("Non-success outcome, running repo check...")
|
log.info("Non-success outcome, running repo check...")
|
||||||
repo_status = check_repositories(config, correlation)
|
repo_status = check_repositories(config, correlation)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Email notification
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
log.info("Sending notification...")
|
log.info("Sending notification...")
|
||||||
send_notification(config, alerts, correlation, repo_status)
|
send_notification(config, alerts, correlation, repo_status)
|
||||||
|
|
||||||
@@ -182,8 +226,10 @@ def _run_pipeline(alerts: list, config: dict, wait: int):
|
|||||||
def run_server(config: dict):
|
def run_server(config: dict):
|
||||||
"""Bind the HTTP server to the configured host/port and serve forever.
|
"""Bind the HTTP server to the configured host/port and serve forever.
|
||||||
|
|
||||||
The host and port are read from the ``receiver`` section of the config.
|
The host and port are read from the ``receiver`` section of the config
|
||||||
Defaults to localhost:9099 to avoid accidental exposure on public interfaces.
|
file. Defaults to 127.0.0.1:9099 to avoid accidental exposure on public
|
||||||
|
interfaces - change ``receiver.host`` to ``0.0.0.0`` only if the webhook
|
||||||
|
endpoint needs to be reachable from a remote Alertmanager instance.
|
||||||
"""
|
"""
|
||||||
host = config.get("receiver", {}).get("host", "127.0.0.1")
|
host = config.get("receiver", {}).get("host", "127.0.0.1")
|
||||||
port = config.get("receiver", {}).get("port", 9099)
|
port = config.get("receiver", {}).get("port", 9099)
|
||||||
@@ -194,5 +240,5 @@ def run_server(config: dict):
|
|||||||
AlertHandler.config = config
|
AlertHandler.config = config
|
||||||
|
|
||||||
server = HTTPServer((host, port), AlertHandler)
|
server = HTTPServer((host, port), AlertHandler)
|
||||||
log.info(f"ns8-backup-monitor receiver listening on {host}:{port}")
|
log.info("ns8-backup-monitor receiver listening on %s:%d", host, port)
|
||||||
server.serve_forever()
|
server.serve_forever()
|
||||||
|
|||||||
Reference in New Issue
Block a user