diff --git a/ansible/roles/apps/tasks/init.yml b/ansible/roles/apps/tasks/init.yml
index cebf1faf90a04ff6c65b7722c5b757ff139e1ffb..d02a739d32e962297ca21b93f7994fea21e0aa75 100644
--- a/ansible/roles/apps/tasks/init.yml
+++ b/ansible/roles/apps/tasks/init.yml
@@ -5,9 +5,12 @@
     - git
     - helm
     - helmfile
-    - cert-manager
     - prometheus
+    - nginx
     - nextcloud
+    - onlyoffice
+    - local-storage
+    - cert-manager
   synchronize:
     src: '../../helmfiles'
     dest: '{{ data_directory }}/source'
@@ -32,6 +35,9 @@
   tags:
     - config
     - oas
+    - nextcloud
+    - prometheus
+    - nginx
   file:
     state: directory
     path: '{{ configuration_directory }}/values/apps'
@@ -55,7 +61,14 @@
 
 - name: Check if there are failed helm deployments
   tags:
+    - helm
     - helmfile
+    - prometheus
+    - nginx
+    - nextcloud
+    - onlyoffice
+    - local-storage
+    - cert-manager
   command: 'helm ls --failed --short'
   failed_when: false
   register: helm_failed_deployments
@@ -63,6 +76,13 @@
 
 - name: Remove failed helm deployments
   tags:
+    - helm
     - helmfile
+    - prometheus
+    - nginx
+    - nextcloud
+    - onlyoffice
+    - local-storage
+    - cert-manager
   shell: 'helm ls --failed --short | xargs -L1 helm delete --purge'
   when: helm_failed_deployments.stdout != ""
diff --git a/helmfiles/values/prometheus.yaml.gotmpl b/helmfiles/values/prometheus.yaml.gotmpl
index 1f6b5571b3c3b9c08ff20ff78d84ea9e87986cb6..a0d2d2564badd6039e92c5c259213fd961a2e32e 100644
--- a/helmfiles/values/prometheus.yaml.gotmpl
+++ b/helmfiles/values/prometheus.yaml.gotmpl
@@ -49,6 +49,72 @@ prometheus:
 
 prometheusOperator:
   createCustomResource: false
 
+defaultRules:
+  rules:
+    # Currently OAS only focuses on a single-node cluster, and therefore
+    # the `KubeCPUOvercommit` and `KubeMemOvercommit` alerts always fire,
+    # because a single-node cluster is not considered a redundant setup.
+    # We disable the whole `kubernetesResources` alert bundle here and
+    # reinstall the alerts from this bundle that still make sense in the
+    # next step below (`additionalPrometheusRulesMap`).
+    # See https://open.greenhost.net/openappstack/openappstack/issues/368 for
+    # details.
+    kubernetesResources: false
+
+additionalPrometheusRulesMap:
+  kubernetes-resources:
+    groups:
+      - name: kubernetes-resources
+        rules:
+          - alert: KubeCPUOvercommit
+            annotations:
+              message: Cluster has overcommitted CPU resource requests for Namespaces.
+              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
+            expr: |-
+              sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
+                /
+              sum(node:node_num_cpu:sum)
+                > 1.5
+            for: 5m
+            labels:
+              severity: warning
+          - alert: KubeMemOvercommit
+            annotations:
+              message: Cluster has overcommitted memory resource requests for Namespaces.
+              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
+            expr: |-
+              sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
+                /
+              sum(node_memory_MemTotal_bytes{job="node-exporter"})
+                > 1.5
+            for: 5m
+            labels:
+              severity: warning
+          - alert: KubeQuotaExceeded
+            annotations:
+              message: Namespace {{`{{ $labels.namespace }}`}} is using {{`{{ printf "%0.0f" $value }}`}}% of its {{`{{ $labels.resource }}`}} quota.
+              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
+            expr: |-
+              100 * kube_resourcequota{job="kube-state-metrics", type="used"}
+                / ignoring(instance, job, type)
+              (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
+                > 90
+            for: 15m
+            labels:
+              severity: warning
+          - alert: CPUThrottlingHigh
+            annotations:
+              message: '{{`{{ printf "%0.0f" $value }}`}}% throttling of CPU in namespace {{`{{ $labels.namespace }}`}} for container {{`{{ $labels.container_name }}`}} in pod {{`{{ $labels.pod_name }}`}}.'
+              runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
+            expr: |-
+              100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!="", }[5m])) by (container_name, pod_name, namespace)
+                /
+              sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)
+                > 25
+            for: 15m
+            labels:
+              severity: warning
+
 grafana:
   adminPassword: "{{ requiredEnv "GRAFANA_ADMIN_PASSWORD" }}"
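
A note on the failed-release cleanup tasks: `helm delete --purge` is Helm 2 syntax (matching the `createCustomResource` option of the older prometheus-operator chart), and the `when:` guard keeps the shell task from running `xargs` against empty input. A minimal sketch of what the task executes, assuming a Helm 2 client configured against the cluster, which can be run by hand before letting Ansible do it:

```sh
# List releases stuck in FAILED state, one bare release name per line (Helm 2).
helm ls --failed --short

# Delete each failed release; --purge also removes the release history so the
# same name can be reused by the next helmfile run.
helm ls --failed --short | xargs -L1 helm delete --purge
```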
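To verify the rule override after a deploy: the chart renders each `additionalPrometheusRulesMap` entry into a `PrometheusRule` resource named after its map key, so the custom `kubernetes-resources` group should replace the disabled default bundle. A hedged sketch for checking this, assuming the Prometheus Operator CRDs are installed (namespace and exact resource names depend on the release):

```sh
# List all PrometheusRule resources; the custom group appears under a rule
# whose name is derived from the "kubernetes-resources" map key.
kubectl get prometheusrules --all-namespaces

# KubeCPUOvercommit should now be defined exactly once, by the custom group,
# since the kubernetesResources default bundle is disabled.
kubectl get prometheusrules --all-namespaces -o yaml \
  | grep -c 'alert: KubeCPUOvercommit'   # expect 1
```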