---
# PrometheusRule holding Stackspin's custom alerting rules.
# Rules are split into four groups: general availability, systemd,
# maintenance (package upgrades / reboots) and resource usage.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-custom-alerts-stackspin.rules
  namespace: stackspin
  labels:
    app.kubernetes.io/managed-by: flux
    app.kubernetes.io/part-of: custom-flux-config
spec:
  groups:
    # Availability of scrape targets and probes.
    - name: stackspin-general
      rules:
        # Fires when `up` or `probe_success` has been 0 for 5 minutes.
        - alert: service_down
          expr: up == 0 or probe_success == 0
          for: 5m
          labels:
            severity: page
          annotations:
            summary: "Instance {{ $labels.instance }} down"
            description: >-
              {{ $labels.instance }} of job {{ $labels.job }} has been down
              for more than 5 minutes.

    # Health of systemd units.
    - name: stackspin-systemd
      rules:
        # Fires when any unit has been in the `failed` state for 5 minutes.
        - alert: failed_systemd_units
          expr: node_systemd_unit_state{state="failed"} !=0
          for: 5m
          labels:
            severity: page
          annotations:
            summary: "Systemd unit failed on {{ $labels.instance }}"
            description: >-
              Warning: Systemd unit failed on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 5 min.

    # Pending maintenance work. Both rules only warn, and only after the
    # condition has held for two days.
    - name: stackspin-maintenance
      rules:
        - alert: apt_upgrades_pending
          expr: apt_upgrades_pending !=0
          for: 2d
          labels:
            severity: warning
          annotations:
            summary: "Apt upgrades available on {{ $labels.instance }}"
            description: >-
              Warning: Apt upgrades available on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 2 days.

        - alert: node_reboot_required
          expr: node_reboot_required !=0
          for: 2d
          labels:
            severity: warning
          annotations:
            summary: "Reboot required on {{ $labels.instance }}"
            description: >-
              Warning: Reboot required on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 2 days.

    # Memory and CPU load pressure.
    - name: stackspin-resources
      rules:
        # https://awesome-prometheus-alerts.grep.to/rules#rule-host-and-hardware-1-23
        # `for: 0m` makes this fire as soon as the OOM-kill counter has
        # increased within the last 20 minutes.
        - alert: HostOomKillDetected
          expr: increase(node_vmstat_oom_kill[20m]) > 0
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "Host OOM kill detected (instance {{ $labels.instance }})"
            description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

        # https://awesome-prometheus-alerts.grep.to/rules#rule-docker-containers-1-4
        # Working-set memory as a percentage of the configured limit; series
        # without a limit are dropped by the `> 0` filter.
        # NOTE(review): both sums group by `container` only, so series that
        # share a container name are added together - confirm this is intended.
        - alert: ContainerMemoryUsage
          expr: >-
            (sum(container_memory_working_set_bytes) BY (container)
            / sum(container_spec_memory_limit_bytes > 0) BY (container)
            * 100) > 80
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "Container Memory usage ({{ $labels.container }})"
            description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

        # NOTE(review): `instance:node_load1_per_cpu:ratio` is not defined in
        # this manifest; presumably a recording rule provided elsewhere -
        # verify that it exists.
        - alert: HostHighLoad
          expr: instance:node_load1_per_cpu:ratio > 1
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High load on ({{ $labels.instance }})"
            description: "Load per cpu is above 1\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"