Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
---
# Custom Prometheus alerting rules, declared as a PrometheusRule resource
# (monitoring.coreos.com/v1) in the `oas` namespace.
#
# Each alert fires only after its `expr` has matched continuously for the
# duration given in `for`. The `severity` label is attached to the fired
# alert; how it is routed is decided outside this file.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-custom-alerts-openappstack.rules
  namespace: oas
  labels:
    app.kubernetes.io/managed-by: flux
    app.kubernetes.io/part-of: custom-flux-config
spec:
  groups:
    # Availability: any series reporting `up == 0` or `probe_success == 0`.
    - name: oas-general
      rules:
        - alert: service_down
          expr: up == 0 or probe_success == 0
          for: 5m
          labels:
            severity: page
          annotations:
            summary: "Instance {{ $labels.instance }} down"
            description: >-
              {{ $labels.instance }} of job {{ $labels.job }} has been down
              for more than 5 minutes.

    # Systemd: any unit whose `failed` state series is non-zero.
    - name: oas-systemd
      rules:
        - alert: failed_systemd_units
          expr: node_systemd_unit_state{state="failed"} !=0
          for: 5m
          labels:
            severity: page
          annotations:
            summary: "Systemd unit failed on {{ $labels.instance }}"
            description: >-
              Warning: Systemd unit failed on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 5 min.

    # Maintenance: low-urgency housekeeping that has been outstanding for
    # two days (pending package upgrades, pending reboot).
    - name: oas-maintenance
      rules:
        - alert: apt_upgrades_pending
          expr: apt_upgrades_pending !=0
          for: 2d
          labels:
            severity: warning
          annotations:
            summary: "Apt upgrades available on {{ $labels.instance }}"
            description: >-
              Warning: Apt upgrades available on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 2 days.
        - alert: node_reboot_required
          expr: node_reboot_required !=0
          for: 2d
          labels:
            severity: warning
          annotations:
            summary: "Reboot required on {{ $labels.instance }}"
            description: >-
              Warning: Reboot required on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 2 days.