---
# PrometheusRule (Prometheus Operator CRD) carrying the custom Stackspin
# alerting rules. The labels below mark the object as managed by Flux.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-custom-alerts-stackspin.rules
  namespace: stackspin
  labels:
    app.kubernetes.io/managed-by: flux
    app.kubernetes.io/part-of: custom-flux-config
spec:
  groups:
    - name: stackspin-general
      rules:
        # Pages when a target's `up` series or a `probe_success` series has
        # been 0 for 5 minutes.
        - alert: service_down
          expr: up == 0 or probe_success == 0
          for: 5m
          labels:
            severity: page
          annotations:
            summary: "Instance {{ $labels.instance }} down"
            description: >-
              {{ $labels.instance }} of job {{ $labels.job }} has been down
              for more than 5 minutes.

    - name: stackspin-systemd
      rules:
        # Pages when any systemd unit has been reported in the "failed" state
        # for 5 minutes.
        - alert: failed_systemd_units
          expr: node_systemd_unit_state{state="failed"} != 0
          for: 5m
          labels:
            severity: page
          annotations:
            summary: "Systemd unit failed on {{ $labels.instance }}"
            description: >-
              Warning: Systemd unit failed on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 5 min.

    - name: stackspin-maintenance
      rules:
        # Warns when `apt_upgrades_pending` has been non-zero for 2 days.
        - alert: apt_upgrades_pending
          expr: apt_upgrades_pending != 0
          for: 2d
          labels:
            severity: warning
          annotations:
            summary: "Apt upgrades available on {{ $labels.instance }}"
            description: >-
              Warning: Apt upgrades available on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 2 days.
        # Warns when `node_reboot_required` has been non-zero for 2 days.
        - alert: node_reboot_required
          expr: node_reboot_required != 0
          for: 2d
          labels:
            severity: warning
          annotations:
            summary: "Reboot required on {{ $labels.instance }}"
            description: >-
              Warning: Reboot required on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 2 days.

    - name: stackspin-resources
      rules:
        # https://awesome-prometheus-alerts.grep.to/rules#rule-host-and-hardware-1-23
        # Fires immediately (for: 0m) when the OOM-kill counter increased
        # within the last 20 minutes.
        - alert: HostOomKillDetected
          expr: increase(node_vmstat_oom_kill[20m]) > 0
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "Host OOM kill detected (instance {{ $labels.instance }})"
            description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        # https://awesome-prometheus-alerts.grep.to/rules#rule-docker-containers-1-4
        # Working-set memory as a percentage of the container memory limit,
        # aggregated per (pod, container); containers without a limit (> 0
        # filter) are excluded. `for`, `labels` and `annotations` follow the
        # upstream rule referenced above; summary/description must live under
        # `annotations` to be valid rule fields.
        - alert: ContainerMemoryUsage
          expr: >-
            (
            sum(container_memory_working_set_bytes{container!=""}) BY (pod,container)
            /
            sum(container_spec_memory_limit_bytes{container!=""} > 0) BY (pod,container)
            * 100
            ) > 80
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "Container Memory usage ({{ $labels.container }})"
            description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        # Warns when the 1-minute load per CPU exceeds 1.
        # NOTE(review): `instance:node_load1_per_cpu:ratio` is presumably a
        # recording rule defined elsewhere - confirm it exists. There is no
        # `for:` here, so the alert fires on the first evaluation that
        # matches; consider adding a hold duration to avoid flapping.
        - alert: HostHighLoad
          expr: instance:node_load1_per_cpu:ratio > 1
          labels:
            severity: warning
          annotations:
            summary: "High load on ({{ $labels.instance }})"
            description: "Load per cpu is above 1\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"