From 10f59d7655ccbfc4208f0371a139d97faf66ce5f Mon Sep 17 00:00:00 2001
From: Varac <varac@varac.net>
Date: Mon, 1 Nov 2021 15:59:36 +0100
Subject: [PATCH] Add first flux prometheus alert

---
 .../flux-alerts-prometheusrule.yaml           | 32 +++++++++++
 .../stackspin-alerts-prometheusrule.yaml      | 55 +++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 flux2/config/monitoring/flux-alerts-prometheusrule.yaml
 create mode 100644 flux2/config/monitoring/stackspin-alerts-prometheusrule.yaml

diff --git a/flux2/config/monitoring/flux-alerts-prometheusrule.yaml b/flux2/config/monitoring/flux-alerts-prometheusrule.yaml
new file mode 100644
index 000000000..fb16df35f
--- /dev/null
+++ b/flux2/config/monitoring/flux-alerts-prometheusrule.yaml
@@ -0,0 +1,32 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: flux-prometheus.rules
+  namespace: oas
+  labels:
+    app.kubernetes.io/managed-by: flux
+    app.kubernetes.io/part-of: custom-flux-config
+spec:
+  # Alert rules built on the Flux controller metrics; see
+  # https://fluxcd.io/docs/guides/monitoring/#metrics
+  groups:
+    - name: flux
+      rules:
+        - alert: ReconciliationFailure
+          # `*` binds tighter than `+`, and `==` is applied last, so this is
+          # (notReady + 2 * deleted) == 1: it matches objects whose Ready
+          # condition is False while their Deleted series is 0.
+          # NOTE(review): presumes both series are 0/1 gauges - confirm.
+          expr: >-
+            max(gotk_reconcile_condition{status="False",type="Ready"})
+            by (namespace, name, kind)
+            + on(namespace, name, kind)
+            (max(gotk_reconcile_condition{status="Deleted"}) by (namespace, name, kind))
+            * 2 == 1
+          for: 10m
+          labels:
+            severity: page
+          annotations:
+            summary: >-
+              {{ $labels.kind }} {{ $labels.namespace }}/{{ $labels.name }}
+              reconciliation has been failing for more than ten minutes.
diff --git a/flux2/config/monitoring/stackspin-alerts-prometheusrule.yaml b/flux2/config/monitoring/stackspin-alerts-prometheusrule.yaml
new file mode 100644
index 000000000..5e476ff60
--- /dev/null
+++ b/flux2/config/monitoring/stackspin-alerts-prometheusrule.yaml
@@ -0,0 +1,55 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: prometheus-custom-alerts-openappstack.rules
+  namespace: oas
+  labels:
+    app.kubernetes.io/managed-by: flux
+    app.kubernetes.io/part-of: custom-flux-config
+spec:
+  groups:
+    - name: oas-general  # availability of scraped targets and probes
+      rules:
+        - alert: service_down
+          expr: up == 0 or probe_success == 0
+          for: 5m  # condition must hold this long before the alert fires
+          labels:
+            severity: page
+          annotations:
+            summary: "Instance {{ $labels.instance }} down"
+            description: >-
+              {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.
+
+    - name: oas-systemd  # systemd units reported in the "failed" state
+      rules:
+        - alert: failed_systemd_units
+          expr: node_systemd_unit_state{state="failed"} !=0
+          for: 5m
+          labels:
+            severity: page
+          annotations:
+            summary: "Systemd unit failed on {{ $labels.instance }}"
+            description: >-
+              Warning: Systemd unit failed on {{ $labels.instance }} (job {{ $labels.job }}) for more than 5 min.
+
+    - name: oas-maintenance  # pending apt upgrades and required reboots
+      rules:
+        - alert: apt_upgrades_pending
+          expr: apt_upgrades_pending !=0
+          for: 2d  # long grace period; severity is only "warning"
+          labels:
+            severity: warning
+          annotations:
+            summary: "Apt upgrades available on {{ $labels.instance }}"
+            description: >-
+              Warning: Apt upgrades available on {{ $labels.instance }} (job {{ $labels.job }}) for more than 2 days.
+
+        - alert: node_reboot_required
+          expr: node_reboot_required !=0
+          for: 2d
+          labels:
+            severity: warning
+          annotations:
+            summary: "Reboot required on {{ $labels.instance }}"
+            description: >-
+              Warning: Reboot required on {{ $labels.instance }} (job {{ $labels.job }}) for more than 2 days.
-- 
GitLab