diff --git a/ansible/roles/apps/templates/prometheus-settings.yaml b/ansible/roles/apps/templates/prometheus-settings.yaml
index 95eb9d1da97b3c3f4850a01383af59ffaa23ce08..8a503053ca599f0c082edc1933427bc17b7f5457 100644
--- a/ansible/roles/apps/templates/prometheus-settings.yaml
+++ b/ansible/roles/apps/templates/prometheus-settings.yaml
@@ -1,64 +1,71 @@
 alertmanager:
   persistentVolume:
     existingClaim: "alertmanager"
-# server:
-#   persistentVolume:
-#     existingClaim: "prometheus-server"
-#
-# serverFiles:
-#   alerting_rules.yml:
-#     - name: kubernetes-resources
-#       rules:
-#         - alert: KubeCPUOvercommit
-#           annotations:
-#             message: Cluster has overcommitted CPU resource requests for Namespaces.
-#             runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
-#           expr: |-
-#             sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
-#             /
-#             sum(node:node_num_cpu:sum)
-#             > 1.5
-#           for: 5m
-#           labels:
-#             severity: warning
-#         - alert: KubeMemOvercommit
-#           annotations:
-#             message: Cluster has overcommitted memory resource requests for Namespaces.
-#             runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
-#           expr: |-
-#             sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
-#             /
-#             sum(node_memory_MemTotal_bytes{job="node-exporter"})
-#             > 1.5
-#           for: 5m
-#           labels:
-#             severity: warning
-#         - alert: KubeQuotaExceeded
-#           annotations:
-# {%- raw %}
-#             message: Namespace {{`{{ $labels.namespace }}`}} is using {{`{{ printf "%0.0f" $value }}`}}% of its {{`{{ $labels.resource }}`}} quota.
-# {% endraw %}
-#             runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
-#           expr: |-
-#             100 * kube_resourcequota{job="kube-state-metrics", type="used"}
-#             / ignoring(instance, job, type)
-#             (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
-#             > 90
-#           for: 15m
-#           labels:
-#             severity: warning
-#         - alert: CPUThrottlingHigh
-#           annotations:
-# {%- raw %}
-#             message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name }}.'
-# {% endraw %}
-#             runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
-#           expr: |-
-#             100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!="", }[5m])) by (container_name, pod_name, namespace)
-#             /
-#             sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)
-#             > 25
-#           for: 15m
-#           labels:
-#             severity: warning
-#
+server:
+  persistentVolume:
+    existingClaim: "prometheus-server"
+
+serverFiles:
+  alerting_rules.yml:
+    # Prometheus rule files require every rule list to sit under a top-level
+    # `groups` key; each group carries a name and its rules.
+    groups:
+      - name: kubernetes-resources
+        rules:
+          - alert: KubeCPUOvercommit
+            annotations:
+              message: Cluster has overcommitted CPU resource requests for Namespaces.
+              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
+            expr: |-
+              sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
+              /
+              sum(node:node_num_cpu:sum)
+              > 1.5
+            for: 5m
+            labels:
+              severity: warning
+          - alert: KubeMemOvercommit
+            annotations:
+              message: Cluster has overcommitted memory resource requests for Namespaces.
+              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
+            expr: |-
+              sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
+              /
+              sum(node_memory_MemTotal_bytes{job="node-exporter"})
+              > 1.5
+            for: 5m
+            labels:
+              severity: warning
+          - alert: KubeQuotaExceeded
+            annotations:
+              # The raw block stops Jinja from evaluating the double-brace
+              # placeholders in the alert message. Its tags must not use the
+              # leading-minus whitespace control: that strips the newline
+              # after "annotations:" and fuses it with "message:".
+{% raw %}
+              message: Namespace {{`{{ $labels.namespace }}`}} is using {{`{{ printf "%0.0f" $value }}`}}% of its {{`{{ $labels.resource }}`}} quota.
+{% endraw %}
+              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
+            expr: |-
+              100 * kube_resourcequota{job="kube-state-metrics", type="used"}
+              / ignoring(instance, job, type)
+              (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
+              > 90
+            for: 15m
+            labels:
+              severity: warning
+          - alert: CPUThrottlingHigh
+            annotations:
+{% raw %}
+              message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name }}.'
+{% endraw %}
+              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
+            expr: |-
+              100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!="", }[5m])) by (container_name, pod_name, namespace)
+              /
+              sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)
+              > 25
+            for: 15m
+            labels:
+              severity: warning
+
diff --git a/test/pytest.ini b/test/pytest.ini
index 7676b2263cecd2a2fa0ec4c894af54b8b3ccd1ae..083c14bcc77323e71050ef750798ad43e3c615ff 100644
--- a/test/pytest.ini
+++ b/test/pytest.ini
@@ -5,6 +5,7 @@ markers =
     testinfra: Run testinfra tests (test OS/package versions etc)
     prometheus: Test prometheus
     helmreleases: Test deployed helmreleases installed by flux
+    apps_running: Test if all the pods for the helmreleases are running
 
 # https://docs.pytest.org/en/latest/warnings.html
 filterwarnings =