# stackspin-alerts-prometheusrule.yaml

# PrometheusRule resource carrying the custom Stackspin alerting rules.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  # Resource lives in the `stackspin` namespace.
  namespace: stackspin
  name: prometheus-custom-alerts-stackspin.rules
  labels:
    app.kubernetes.io/part-of: custom-flux-config
    app.kubernetes.io/managed-by: flux
spec:
  # Each entry below is a named rule group holding one or more alerts.
  groups:
  # General availability alerting.
  - name: stackspin-general
    rules:
    - alert: service_down
      # Selects series where `up` is 0, plus series where `probe_success` is 0.
      expr: 'up == 0 or probe_success == 0'
      # The condition must hold for 5 minutes before the alert fires.
      for: 5m
      labels:
        severity: page
      annotations:
        summary: 'Instance {{ $labels.instance }} down'
        # Folded scalar: the line break below becomes a single space.
        description: >-
          {{ $labels.instance }} of job {{ $labels.job }} has been down
          for more than 5 minutes.

  # Alerting on systemd units reported in the "failed" state.
  - name: stackspin-systemd
    rules:
    - alert: failed_systemd_units
      # Selects `node_systemd_unit_state` series labelled state="failed"
      # whose value is non-zero.
      expr: 'node_systemd_unit_state{state="failed"} !=0'
      # The condition must hold for 5 minutes before the alert fires.
      for: 5m
      labels:
        severity: page
      annotations:
        summary: 'Systemd unit failed on {{ $labels.instance }}'
        # Folded scalar: the line break below becomes a single space.
        description: >-
          Warning: Systemd unit failed on {{ $labels.instance }}
          (job {{ $labels.job }}) for more than 5 min.

  # Low-urgency maintenance reminders (severity: warning).
  - name: stackspin-maintenance
    rules:
    - alert: apt_upgrades_pending
      # Fires when the `apt_upgrades_pending` metric is non-zero.
      expr: apt_upgrades_pending !=0
      # Only alert once the condition has persisted for 2 days.
      for: 2d
      labels:
        severity: warning
      annotations:
        summary: "Apt upgrades available on {{ $labels.instance }}"
        # Folded scalar: the line break below becomes a single space.
        description: >-
          Warning: Apt upgrades available on {{ $labels.instance }}
          (job {{ $labels.job }}) for more than 2 days.

    - alert: node_reboot_required
      # Fires when the `node_reboot_required` metric is non-zero.
      expr: node_reboot_required !=0
      # Only alert once the condition has persisted for 2 days.
      for: 2d
      labels:
        severity: warning
      annotations:
        summary: "Reboot required on {{ $labels.instance }}"
        # Folded scalar: the line break below becomes a single space.
        description: >-
          Warning: Reboot required on {{ $labels.instance }}
          (job {{ $labels.job }}) for more than 2 days.