diff --git a/ansible/roles/apps/tasks/prometheus-stack.yml b/ansible/roles/apps/tasks/prometheus-stack.yml
index 72d64954c4fedcdf344378146881f00b717e8d51..bc5158afce88dfa01ba0ddd53385c47cec161a69 100644
--- a/ansible/roles/apps/tasks/prometheus-stack.yml
+++ b/ansible/roles/apps/tasks/prometheus-stack.yml
@@ -33,7 +33,7 @@
     - config
     - flux
     - monitoring
-    - prometheus
+    - prometheus-stack
 
 - name: Create prometheus-related persistent volumes
   tags:
@@ -51,7 +51,7 @@
   tags:
     - config
     - monitoring
-    - prometheus
+    - prometheus-stack
   loop:
     - name: "alertmanager"
       size: "2Gi"
diff --git a/ansible/roles/apps/templates/settings/prometheus-stack.yaml b/ansible/roles/apps/templates/settings/prometheus-stack.yaml
index 275e201d29fa1525bf8e3cbbc8a55491d92e09f2..64bb277757eb3d25fd95eb7f9a227e02c5833e06 100644
--- a/ansible/roles/apps/templates/settings/prometheus-stack.yaml
+++ b/ansible/roles/apps/templates/settings/prometheus-stack.yaml
@@ -14,43 +14,45 @@ alertmanager:
   persistentVolume:
     existingClaim: "alertmanager"
   config:
+{% if outgoing_mail.enabled %}
     global:
-      # smtp_from: you@gmail.com
-      # smtp_smarthost: mailhog:1025
-      # smtp_require_tls: false
-      # smtp_smarthost: smtp.gmail.com:587
-      # smtp_auth_username: you@gmail.com
-      # smtp_auth_password: yourapppassword # https://support.google.com/mail/answer/185833?hl=en-GB
-      # smtp_auth_identity: you@gmail.com
+      smtp_from: "alertmanager@{{ domain }}"
+      smtp_smarthost: "{{ outgoing_mail.smtp.host }}:{{ outgoing_mail.smtp.port }}"
+      smtp_auth_username: "{{ outgoing_mail.smtp.user }}"
+      smtp_auth_password: "{{ outgoing_mail.smtp.password }}"
+{% endif %}
 
     route:
       group_by: ['job']
       group_wait: 30s
       group_interval: 5m
       repeat_interval: 1h
+{% if outgoing_mail.enabled %}
       receiver: email
+{% else %}
+      # The receiver name must be the quoted string 'null': a bare null is
+      # YAML's null value, which leaves the root route without a receiver.
+      receiver: 'null'
+{% endif %}
       routes:
         - match:
+            # This is an alert meant to ensure that the entire alerting pipeline is functional.
+            # This alert is always firing, therefore it should always be firing in Alertmanager
+            # and always fire against a receiver. There are integrations with various notification
+            # mechanisms that send a notification when this alert is not firing. For example the
+            # "DeadMansSnitch" integration in PagerDuty.
             alertname: Watchdog
           receiver: 'null'
-        - match:
-            alertname: CPUThrottlingHigh
-          receiver: 'null'
-        - match:
-            alertname: KubeMemoryOvercommit
-          receiver: 'null'
-        - match:
-            alertname: KubeCPUOvercommit
-          receiver: 'null'
-        - match:
-            alertname: KubeletTooManyPods
-          receiver: 'null'
-    # receivers:
-    #   - name: 'null'
-    #   - name: email
-    #     email_configs:
-    #     - send_resolved: true
-    #       to: youremail@gmail.com
+    receivers:
+      - name: 'null'
+      # The email receiver is only rendered when outgoing mail is enabled;
+      # without the global SMTP settings above, Alertmanager rejects it.
+{% if outgoing_mail.enabled %}
+      - name: email
+        email_configs:
+          - send_resolved: true
+            to: "{{ admin_email }}"
+{% endif %}
 
     # Inhibition rules allow to mute a set of alerts given that another alert is firing.
     # We use this to mute any warning-level notifications if the same alert is already critical.