Compare commits

...

2 commits

View file

@ -0,0 +1,72 @@
groups:
# Recording rules. Every series below is built from plain selectors and the
# `offset` modifier only, so no subqueries are required.
- name: friendica-exporter.records
  rules:
  # Outstanding worker tasks per instance: friendica_worker_tasks_total summed
  # over every other label (the priority levels).
  - record: friendica:worker_backlog
    expr: >-
      sum by (instance)
      (friendica_worker_tasks_total{job="friendica"})
  # Net backlog change versus 10 minutes ago (positive = backlog grew).
  - record: friendica:worker_backlog_10m_delta
    expr: >-
      friendica:worker_backlog
      - friendica:worker_backlog offset 10m
  # Net backlog change versus 2 hours ago (positive = backlog grew).
  - record: friendica:worker_backlog_2h_delta
    expr: >-
      friendica:worker_backlog
      - friendica:worker_backlog offset 2h
# --- Alerts ---
- name: friendica-exporter.alerts
  rules:
  # 1) Exporter must be available.
  #    Fires when the scrape target is down (up == 0) or the exporter itself
  #    reports friendica_up == 0, sustained for 5 minutes.
  - alert: FriendicaExporterUnavailable
    expr: |
      (up{job="friendica"} == 0)
      or (friendica_up{job="friendica"} == 0)
    for: 5m
    labels:
      severity: critical
      service: friendica
      component: exporter
      tier: app
    annotations:
      summary: "Friendica exporter unavailable on {{ $labels.instance }}"
      description: "Target down or exporter cannot read Friendica (friendica_up=0) for >5m."
  # 2) Worker must be active (<15 minutes since last execution), with JPM
  #    fallback.
  #    Branch A: seconds since the newest friendica_worker_last_execution per
  #    instance exceed 15 * 60.
  #    Branch B: the 1-minute JPM never rose above 0 during the last 15 minutes.
  #    Both branches are aggregated down to the `instance` label. `or` matches
  #    on the full label set, so if branch B kept all labels of
  #    friendica_worker_jpm it would never match branch A and a single instance
  #    could raise two separate alerts.
  #    NOTE(review): `max by (instance)` in branch B assumes one
  #    frequency="1 minute" series per instance - confirm against the exporter.
  #    `for: 5m` is applied on top of the 15-minute thresholds.
  - alert: FriendicaWorkerStale
    expr: |
      (
        time()
          - max by (instance) (friendica_worker_last_execution{job="friendica"})
        > 15 * 60
      )
      or
      (
        max by (instance) (
          max_over_time(friendica_worker_jpm{job="friendica", frequency="1 minute"}[15m])
        ) <= 0
      )
    for: 5m
    labels:
      severity: warning
      service: friendica
      component: worker
      tier: app
    annotations:
      summary: "Friendica worker inactive (>15m) on {{ $labels.instance }}"
      description: "Last worker execution is older than 15 minutes or 1-minute JPM stayed 0 for the last 15 minutes."
  # 3) Backlog grows without relief (deadlock / stall).
  #    A: backlog rose by more than 200 over 2h.
  #    B: the 10-minute delta was never negative at any sample in the last 2h.
  #    `and` keeps the left-hand element, so {{ $value }} in the description is
  #    the 2h delta. Both recorded series carry only the `instance` label, so
  #    the two sides match one-to-one.
  - alert: FriendicaWorkerBacklogMonotonicIncrease
    expr: |
      (friendica:worker_backlog_2h_delta > 200)
      and (min_over_time(friendica:worker_backlog_10m_delta[2h]) >= 0)
    for: 15m
    labels:
      severity: critical
      service: friendica
      component: worker
      tier: app
    annotations:
      summary: "Backlog grows without decreases on {{ $labels.instance }}"
      description: "Outstanding worker tasks rose by >200 in the last 2h and never dropped in any 10-minute slice → likely deadlock or stalled workers. Δ2h={{ $value }}."