---
# Custom Prometheus alerting rules, declared as a Prometheus Operator
# PrometheusRule resource in the "oas" namespace.
# The rules are split into three groups: general availability, systemd unit
# health, and host maintenance (pending upgrades / reboots).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-custom-alerts-openappstack.rules
  namespace: oas
  labels:
    app.kubernetes.io/managed-by: flux
    app.kubernetes.io/part-of: custom-flux-config
spec:
  groups:
    # Availability: a target or probe has been reporting 0 for 5 minutes.
    - name: oas-general
      rules:
        - alert: service_down
          # Matches series where either `up` or `probe_success` equals 0.
          expr: up == 0 or probe_success == 0
          for: 5m
          labels:
            severity: page
          annotations:
            summary: "Instance {{ $labels.instance }} down"
            description: >-
              {{ $labels.instance }} of job {{ $labels.job }} has been down
              for more than 5 minutes.

    # Systemd: a unit has been in the "failed" state for 5 minutes.
    - name: oas-systemd
      rules:
        - alert: failed_systemd_units
          # Quoted so the braces and inner double quotes stay literal text.
          expr: 'node_systemd_unit_state{state="failed"} != 0'
          for: 5m
          labels:
            severity: page
          annotations:
            summary: "Systemd unit failed on {{ $labels.instance }}"
            description: >-
              Warning: Systemd unit failed on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 5 min.

    # Maintenance: lower-severity conditions that only alert after persisting
    # for 2 days.
    - name: oas-maintenance
      rules:
        - alert: apt_upgrades_pending
          # NOTE(review): presumably a count of pending packages exported by a
          # custom collector - confirm the metric source.
          expr: apt_upgrades_pending != 0
          for: 2d
          labels:
            severity: warning
          annotations:
            summary: "Apt upgrades available on {{ $labels.instance }}"
            description: >-
              Warning: Apt upgrades available on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 2 days.
        - alert: node_reboot_required
          expr: node_reboot_required != 0
          for: 2d
          labels:
            severity: warning
          annotations:
            summary: "Reboot required on {{ $labels.instance }}"
            description: >-
              Warning: Reboot required on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 2 days.