From e77cef13e201260ecbdf462a6afcc0ade523180c Mon Sep 17 00:00:00 2001
From: Philipp
Date: Sun, 14 Sep 2025 17:12:25 +0200
Subject: [PATCH] Add prometheus alert example

---
 contrib/prometheus.alert.yml | 72 ++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 contrib/prometheus.alert.yml

diff --git a/contrib/prometheus.alert.yml b/contrib/prometheus.alert.yml
new file mode 100644
index 0000000..3b64c94
--- /dev/null
+++ b/contrib/prometheus.alert.yml
@@ -0,0 +1,72 @@
+groups:
+  # Recording rules: backlog series precomputed here so the alerts below need no subqueries.
+  - name: friendica-exporter.records
+    rules:
+      # Outstanding worker tasks per instance, summed over every priority level.
+      - record: friendica:worker_backlog
+        expr: sum by (instance) (friendica_worker_tasks_total{job="friendica"})
+
+      # Net backlog change: current value minus the value 10 minutes ago.
+      - record: friendica:worker_backlog_10m_delta
+        expr: friendica:worker_backlog - (friendica:worker_backlog offset 10m)
+
+      # Net backlog change: current value minus the value 2 hours ago.
+      - record: friendica:worker_backlog_2h_delta
+        expr: friendica:worker_backlog - (friendica:worker_backlog offset 2h)
+
+  # Alerting rules.
+  - name: friendica-exporter.alerts
+    rules:
+
+      # 1) Fires when the scrape target is down (up == 0) or the exporter reports friendica_up == 0.
+      - alert: FriendicaExporterUnavailable
+        expr: |
+          (up{job="friendica"} == 0)
+          or (friendica_up{job="friendica"} == 0)
+        for: 5m
+        labels:
+          severity: critical
+          service: friendica
+          component: exporter
+          tier: app
+        annotations:
+          summary: "Friendica exporter unavailable on {{ $labels.instance }}"
+          description: "Target down or exporter cannot read Friendica (friendica_up=0) for >5m."
+
+      # 2) Worker must be active (<15 minutes since last execution), with JPM fallback
+      - alert: FriendicaWorkerStale
+        expr: |
+          (
+            time() - max by (instance) (friendica_worker_last_execution{job="friendica"})
+            > 15 * 60
+          )
+          or
+          (
+            # Aggregated to "instance" like the branch above, so "or" yields one alert per instance.
+            max by (instance) (max_over_time(friendica_worker_jpm{job="friendica", frequency="1 minute"}[15m])) <= 0
+          )
+        for: 5m
+        labels:
+          severity: warning
+          service: friendica
+          component: worker
+          tier: app
+        annotations:
+          summary: "Friendica worker inactive (>15m) on {{ $labels.instance }}"
+          description: "Last worker execution is older than 15 minutes or 1-minute JPM stayed 0 for the last 15 minutes."
+
+      # 3) Backlog grows without relief (deadlock / stall)
+      # A: backlog rose >200 over 2h; B: no 10-minute delta sample was negative in the last 2h
+      - alert: FriendicaWorkerBacklogMonotonicIncrease
+        expr: |
+          (friendica:worker_backlog_2h_delta > 200)
+          and (min_over_time(friendica:worker_backlog_10m_delta[2h]) >= 0)
+        for: 15m
+        labels:
+          severity: critical
+          service: friendica
+          component: worker
+          tier: app
+        annotations:
+          summary: "Backlog grows without decreases on {{ $labels.instance }}"
+          description: "Outstanding worker tasks rose by >200 in the last 2h and never dropped in any 10-minute slice → likely deadlock or stalled workers. Δ2h={{ $value }}."
-- 
2.52.0