# stackspin-alerts-prometheusrule.yaml

# Custom Prometheus alerting rules for Stackspin, declared as a
# PrometheusRule resource (monitoring.coreos.com/v1) in the `stackspin`
# namespace.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-custom-alerts-stackspin.rules
  namespace: stackspin
  labels:
    # NOTE(review): presumably these labels mark the object as Flux-managed
    # custom configuration; confirm whether a Prometheus ruleSelector also
    # matches on them before renaming or removing either one.
    app.kubernetes.io/managed-by: flux
    app.kubernetes.io/part-of: custom-flux-config
spec:
  # Rule groups follow; each group bundles alerts for one concern.
  groups:
  # General availability alerts.
  - name: stackspin-general
    rules:
    # Pages when any series of `up` or `probe_success` has been 0 for a
    # continuous 5 minutes.
    - alert: service_down
      expr: up == 0 or probe_success == 0
      for: 5m
      labels:
        severity: page
      annotations:
        summary: 'Instance {{ $labels.instance }} down'
        # Folded scalar: the two lines below are joined with a single space.
        description: >-
          {{ $labels.instance }} of job {{ $labels.job }} has been down
          for more than 5 minutes.

  # Systemd unit health alerts.
  - name: stackspin-systemd
    rules:
    # Pages when a unit's `state="failed"` series has been non-zero for a
    # continuous 5 minutes.
    - alert: failed_systemd_units
      expr: node_systemd_unit_state{state="failed"} !=0
      for: 5m
      labels:
        severity: page
      annotations:
        summary: 'Systemd unit failed on {{ $labels.instance }}'
        # Folded scalar: the two lines below are joined with a single space.
        description: >-
          Warning: Systemd unit failed on {{ $labels.instance }}
          (job {{ $labels.job }}) for more than 5 min.

  # Host maintenance reminders. Both alerts are warnings that only fire after
  # the condition has held for 2 days, so short-lived pending states stay quiet.
  - name: stackspin-maintenance
    rules:
    # Fires while the `apt_upgrades_pending` metric is non-zero.
    - alert: apt_upgrades_pending
      expr: apt_upgrades_pending !=0
      for: 2d
      labels:
        severity: warning
      annotations:
        summary: "Apt upgrades available on {{ $labels.instance }}"
        # Folded scalar: the two lines below are joined with a single space.
        description: >-
          Warning: Apt upgrades available on {{ $labels.instance }}
          (job {{ $labels.job }}) for more than 2 days.

    # Fires while the `node_reboot_required` metric is non-zero.
    - alert: node_reboot_required
      expr: node_reboot_required !=0
      for: 2d
      labels:
        severity: warning
      annotations:
        summary: "Reboot required on {{ $labels.instance }}"
        # Folded scalar: the two lines below are joined with a single space.
        description: >-
          Warning: Reboot required on {{ $labels.instance }}
          (job {{ $labels.job }}) for more than 2 days.

  # Resource-pressure alerts: OOM kills, container memory close to its limit,
  # and sustained CPU load. All are warnings.
  - name: stackspin-resources
    rules:
    # https://awesome-prometheus-alerts.grep.to/rules#rule-host-and-hardware-1-23
    # Fires immediately (for: 0m) when the OOM-kill counter increased within
    # the last 20 minutes.
    - alert: HostOomKillDetected
      expr: increase(node_vmstat_oom_kill[20m]) > 0
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: "Host OOM kill detected (instance {{ $labels.instance }})"
        # Literal scalar: newlines and the two-space indent of the detail
        # lines are part of the rendered text.
        description: |-
          OOM kill detected
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # https://awesome-prometheus-alerts.grep.to/rules#rule-docker-containers-1-4
    # Working-set memory as a percentage of the configured memory limit, per
    # container. Containers without a limit (limit == 0) are filtered out.
    # `namespace` is part of the grouping so that pods sharing a name in
    # different namespaces are not summed into one series.
    - alert: ContainerMemoryUsage
      # Folded scalar: the lines below are joined into one PromQL expression.
      expr: >-
        (sum(container_memory_working_set_bytes{container!=""}) BY (namespace,pod,container)
        / sum(container_spec_memory_limit_bytes{container!=""} > 0) BY (namespace,pod,container)
        * 100) > 80
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "Container Memory usage ({{ $labels.container }})"
        description: |-
          Container Memory usage is above 80%
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Fires when the 1-minute load per CPU stays above 1 for 10 minutes.
    # NOTE(review): `instance:node_load1_per_cpu:ratio` looks like a recording
    # rule defined outside this file - confirm it exists in the target cluster.
    - alert: HostHighLoad
      expr: instance:node_load1_per_cpu:ratio > 1
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "High load on ({{ $labels.instance }})"
        description: |-
          Load per cpu is above 1
            VALUE = {{ $value }}
            LABELS = {{ $labels }}