Skip to content
Snippets Groups Projects
Unverified Commit 10f59d76 authored by Varac's avatar Varac
Browse files

Add first flux prometheus alert

parent 54d818c8
No related branches found
No related tags found
No related merge requests found
---
# PrometheusRule holding the alerting rules for Flux (GitOps Toolkit)
# reconciliation metrics.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: flux-prometheus.rules
  namespace: oas
  labels:
    app.kubernetes.io/managed-by: flux
    app.kubernetes.io/part-of: custom-flux-config
spec:
  # https://fluxcd.io/docs/guides/monitoring/#metrics
  groups:
    - name: flux
      rules:
        - alert: ReconciliationFailure
          # Adds the Ready=False series to twice the Deleted series, matched
          # on (namespace, name, kind), and keeps results equal to 1.
          # Assuming both series are 0/1 gauges (confirm against the Flux
          # metrics docs), that is true only when Ready=False is 1 and
          # Deleted is 0, i.e. the object is failing and still exists.
          # The folded scalar below joins its lines with single spaces, so
          # Prometheus receives one single-line expression.
          expr: >-
            max(gotk_reconcile_condition{status="False",type="Ready"}) by (namespace, name, kind)
            + on(namespace, name, kind)
            (max(gotk_reconcile_condition{status="Deleted"}) by (namespace, name, kind)) * 2
            == 1
          # The condition must hold continuously for ten minutes before firing.
          for: 10m
          labels:
            severity: page
          annotations:
            summary: '{{ $labels.kind }} {{ $labels.namespace }}/{{ $labels.name }} reconciliation has been failing for more than ten minutes.'
---
# PrometheusRule holding the custom alerts: general availability, systemd
# unit state, and host maintenance (pending upgrades / reboots).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-custom-alerts-openappstack.rules
  namespace: oas
  labels:
    app.kubernetes.io/managed-by: flux
    app.kubernetes.io/part-of: custom-flux-config
spec:
  groups:
    # Availability: fires when `up` or `probe_success` has been 0 for 5 minutes.
    - name: oas-general
      rules:
        - alert: service_down
          expr: up == 0 or probe_success == 0
          for: 5m
          labels:
            severity: page
          annotations:
            summary: 'Instance {{ $labels.instance }} down'
            description: >-
              {{ $labels.instance }} of job {{ $labels.job }} has been down
              for more than 5 minutes.

    # Systemd: fires when any unit reports a non-zero `failed` state for
    # 5 minutes.
    - name: oas-systemd
      rules:
        - alert: failed_systemd_units
          expr: node_systemd_unit_state{state="failed"} !=0
          for: 5m
          labels:
            severity: page
          annotations:
            summary: 'Systemd unit failed on {{ $labels.instance }}'
            description: >-
              Warning: Systemd unit failed on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 5 min.

    # Maintenance: lower-severity reminders that only fire after the
    # condition has persisted for two days.
    - name: oas-maintenance
      rules:
        - alert: apt_upgrades_pending
          expr: apt_upgrades_pending !=0
          for: 2d
          labels:
            severity: warning
          annotations:
            summary: 'Apt upgrades available on {{ $labels.instance }}'
            description: >-
              Warning: Apt upgrades available on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 2 days.
        - alert: node_reboot_required
          expr: node_reboot_required !=0
          for: 2d
          labels:
            severity: warning
          annotations:
            summary: 'Reboot required on {{ $labels.instance }}'
            description: >-
              Warning: Reboot required on {{ $labels.instance }}
              (job {{ $labels.job }}) for more than 2 days.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment