Commit 6a59b5d6 authored by Maarten de Waard :angel:

Merge branch '368-fix-prom-resource-alerts' into 'master'

Resolve "Ignore prometheus KubeCPUOvercommit and KubeMemOvercommit alerts"

Closes #368

See merge request openappstack/openappstack!149
parents 6787c483 418beebf
@@ -5,9 +5,12 @@
- git
- helm
- helmfile
- cert-manager
- prometheus
- nginx
- nextcloud
- onlyoffice
- local-storage
- cert-manager
synchronize:
src: '../../helmfiles'
dest: '{{ data_directory }}/source'
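
The per-application tags added in this hunk (and in the hunks below) make it possible to run only a slice of the provisioning while still syncing the helmfile sources first. A hypothetical invocation; the playbook and inventory file names are purely illustrative and not taken from this merge request:

    # Re-run only the prometheus-related tasks; the synchronize task above now
    # carries the prometheus tag as well, so the helmfiles get copied first.
    ansible-playbook -i inventory.yml provision.yml --tags prometheus
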
@@ -32,6 +35,9 @@
tags:
- config
- oas
- nextcloud
- prometheus
- nginx
file:
state: directory
path: '{{ configuration_directory }}/values/apps'
@@ -55,7 +61,14 @@
- name: Check if there are failed helm deployments
tags:
- helm
- helmfile
- prometheus
- nginx
- nextcloud
- onlyoffice
- local-storage
- cert-manager
command: 'helm ls --failed --short'
failed_when: false
register: helm_failed_deployments
@@ -63,6 +76,13 @@
- name: Remove failed helm deployments
tags:
- helm
- helmfile
- prometheus
- nginx
- nextcloud
- onlyoffice
- local-storage
- cert-manager
shell: 'helm ls --failed --short | xargs -L1 helm delete --purge'
when: helm_failed_deployments.stdout != ""
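
For context, the two tasks above first record the names of failed Helm releases (`failed_when: false` keeps a non-zero exit code from aborting the play) and then purge them only when the registered output is non-empty. A sketch of what the cleanup pipeline expands to, using a hypothetical failed release name and the Helm 2 CLI:

    helm ls --failed --short
    # prints e.g.: oas-prometheus        (hypothetical release name)
    helm ls --failed --short | xargs -L1 helm delete --purge
    # xargs -L1 runs one 'helm delete --purge <release>' per printed line
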
@@ -49,6 +49,72 @@ prometheus:
prometheusOperator:
createCustomResource: false
defaultRules:
rules:
# Currently OAS focuses on a single-node cluster only, and therefore one
# `KubeCPUOvercommit` and one `KubeMemOvercommit` alert always fires,
# because a single-node cluster is not considered a redundant setup.
# We disable the whole `kubernetesResources` alert bundle and re-add the
# alerts from this bundle that still make sense in the next step below
# (`additionalPrometheusRulesMap`).
# See https://open.greenhost.net/openappstack/openappstack/issues/368 for
# details.
kubernetesResources: false
additionalPrometheusRulesMap:
kubernetes-resources:
groups:
- name: kubernetes-resources
rules:
- alert: KubeCPUOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
expr: |-
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
/
sum(node:node_num_cpu:sum)
> 1.5
for: 5m
labels:
severity: warning
- alert: KubeMemOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for Namespaces.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
expr: |-
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
/
sum(node_memory_MemTotal_bytes{job="node-exporter"})
> 1.5
for: 5m
labels:
severity: warning
- alert: KubeQuotaExceeded
annotations:
message: Namespace {{`{{ $labels.namespace }}`}} is using {{`{{ printf "%0.0f" $value }}`}}% of its {{`{{ $labels.resource }}`}} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
expr: |-
100 * kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 90
for: 15m
labels:
severity: warning
- alert: CPUThrottlingHigh
annotations:
message: '{{`{{ printf "%0.0f" $value }}`}}% throttling of CPU in namespace {{`{{ $labels.namespace }}`}} for container {{`{{ $labels.container_name }}`}} in pod {{`{{ $labels.pod_name }}`}}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
expr: |-
100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!="", }[5m])) by (container_name, pod_name, namespace)
/
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)
> 25
for: 15m
labels:
severity: warning
grafana:
adminPassword: "{{ requiredEnv "GRAFANA_ADMIN_PASSWORD" }}"
......
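
Background on the override: in the upstream kubernetes-mixin rules shipped with the chart, the requests-based overcommit alerts compare aggregated CPU/memory requests against a threshold of roughly (number of nodes - 1) / number of nodes of total capacity, which evaluates to 0 on a single-node cluster, so they fire as soon as anything is scheduled. The quota-based rules re-added above do not have that problem. A minimal sketch for checking that the re-added rule group is loaded after deployment; the namespace and service names are assumptions, not taken from this merge request:

    # Port-forward the Prometheus instance created by the prometheus-operator
    # chart and list the loaded rule groups via the HTTP API.
    kubectl -n oas port-forward svc/prometheus-operated 9090:9090 &
    curl -s http://localhost:9090/api/v1/rules | grep -o 'kubernetes-resources'
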