---
# PrometheusRule holding the custom Stackspin alerts.
# Each entry under spec.groups is a rule group; each rule fires its alert once
# `expr` has matched continuously for the duration given in `for`.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-custom-alerts-stackspin.rules
  namespace: stackspin
  labels:
    app.kubernetes.io/managed-by: flux
    app.kubernetes.io/part-of: custom-flux-config
spec:
  groups:
    - name: stackspin-general
      rules:
        # Pages when a series reports `up == 0` or `probe_success == 0`
        # for 5 minutes.
        - alert: service_down
          expr: up == 0 or probe_success == 0
          for: 5m
          labels:
            severity: page
          annotations:
            summary: "Instance {{ $labels.instance }} down"
            description: >-
              {{ $labels.instance }} of job {{ $labels.job }} has been down
              for more than 5 minutes.

    - name: stackspin-systemd
      rules:
        # Pages when any series with state="failed" is non-zero for 5 minutes.
        - alert: failed_systemd_units
          expr: node_systemd_unit_state{state="failed"} !=0
          for: 5m
          labels:
            severity: page
          annotations:
            summary: "Systemd unit failed on {{ $labels.instance }}"
            description: >-
              Warning: Systemd unit failed on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 5 min.

    - name: stackspin-maintenance
      rules:
        # Warns when apt_upgrades_pending stays non-zero for 2 days.
        # NOTE(review): the exporter providing this metric is not visible
        # here - confirm it is actually scraped.
        - alert: apt_upgrades_pending
          expr: apt_upgrades_pending !=0
          for: 2d
          labels:
            severity: warning
          annotations:
            summary: "Apt upgrades available on {{ $labels.instance }}"
            description: >-
              Warning: Apt upgrades available on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 2 days.

        # Warns when node_reboot_required stays non-zero for 2 days.
        # NOTE(review): the exporter providing this metric is not visible
        # here - confirm it is actually scraped.
        - alert: node_reboot_required
          expr: node_reboot_required !=0
          for: 2d
          labels:
            severity: warning
          annotations:
            summary: "Reboot required on {{ $labels.instance }}"
            description: >-
              Warning: Reboot required on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 2 days.