diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 21784a30bee429c719bbdd3b662d130da723c740..eddd484878fd549b17f218a10bba4f1b6ae74cc4 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -41,7 +41,7 @@ # - A pipeline gets started from the UI and the job name is included in the # CI variable `TRIGGER_JOBS` # - A commit is pushed containing the pattern TRIGGER_JOBS=.*<job name> -# (i.e. TRIGGER_JOBS=ci-test-image-build,enable-grafana) +# (i.e. TRIGGER_JOBS=ci-test-image-build,enable-nextcloud) # # Gitlab CI allows pushing CI vars via `git push` but a bug prevents this when # using merge request pipelines (see https://gitlab.com/gitlab-org/gitlab/-/issues/326098) @@ -49,16 +49,6 @@ rules: - when: always -.grafana_rules: - rules: - - changes: - - flux/**/$APP*.yaml - - ansible/roles/apps/templates/settings/$APP.yaml - - ansible/roles/apps/tasks/$APP.yaml - - test/behave/features/$APP.feature - - if: '$TRIGGER_JOBS =~ /enable-grafana/' - - if: '$CI_COMMIT_MESSAGE =~ /TRIGGER_JOBS=.*enable-grafana/' - - if: '$CI_COMMIT_BRANCH == "master"' .loki_stack_rules: rules: @@ -75,7 +65,7 @@ - if: '$CI_COMMIT_MESSAGE =~ /TRIGGER_JOBS=.*enable-nextcloud/' - if: '$CI_COMMIT_BRANCH == "master"' -.prometheus_rules: +.prometheus_stack_rules: rules: - when: always @@ -239,12 +229,6 @@ enable-eventrouter: - .enable_app_template - .eventrouter_rules -enable-grafana: - variables: - APP: "grafana" - extends: - - .enable_app_template - - .grafana_rules enable-loki-stack: variables: @@ -260,12 +244,12 @@ enable-nextcloud: - .enable_app_template - .nextcloud_rules -enable-prometheus: +enable-prometheus-stack: variables: - APP: "prometheus" + APP: "prometheus-stack" extends: - .enable_app_template - - .prometheus_rules + - .prometheus_stack_rules enable-rocketchat: variables: @@ -373,12 +357,6 @@ eventrouter-helm-release: - .helm-release - .eventrouter_rules -grafana-helm-release: - variables: - APP: "grafana" - extends: - - .helm-release - - .grafana_rules loki-stack-helm-release: variables: @@ -394,12 +372,12 @@ nextcloud-helm-release: - .helm-release - .nextcloud_rules -prometheus-helm-release: +prometheus-stack-helm-release: variables: - APP: "prometheus" + APP: "prometheus-stack" extends: - .helm-release - - .prometheus_rules + - .prometheus_stack_rules rocketchat-helm-release: variables: @@ -454,15 +432,6 @@ eventrouter-ready: - .apps-ready - .eventrouter_rules -grafana-ready: - variables: - APP: "grafana" - needs: - - job: grafana-helm-release - extends: - - .apps-ready - - .grafana_rules - loki-stack-ready: variables: APP: "loki-stack" @@ -481,14 +450,14 @@ nextcloud-ready: - .apps-ready - .nextcloud_rules -prometheus-ready: +prometheus-stack-ready: variables: - APP: "prometheus" + APP: "prometheus-stack" needs: - - job: prometheus-helm-release + - job: prometheus-stack-helm-release extends: - .apps-ready - - .prometheus_rules + - .prometheus_stack_rules rocketchat-ready: variables: @@ -537,15 +506,6 @@ wordpress-ready: - .ssh_setup interruptible: true -grafana-cert: - variables: - APP: "grafana" - needs: - - job: grafana-ready - extends: - - .apps-cert - - .grafana_rules - nextcloud-cert: variables: APP: "nextcloud" @@ -555,14 +515,14 @@ nextcloud-cert: - .apps-cert - .nextcloud_rules -prometheus-cert: +prometheus-stack-cert: variables: - APP: "prometheus" + APP: "prometheus-stack" needs: - - job: prometheus-ready + - job: prometheus-stack-ready extends: - .apps-cert - - .prometheus_rules + - .prometheus_stack_rules rocketchat-cert: variables: @@ -614,11 +574,11 @@ testinfra: - .ssh_setup interruptible: true 
-prometheus-alerts:
+prometheus-stack-alerts:
   stage: health-test
   variables:
-    # APP var is used in job specific rules (i.e. .grafana_rules)
-    APP: "prometheus"
+    # APP var is used in job specific rules (i.e. .prometheus_stack_rules)
+    APP: "prometheus-stack"
   allow_failure: true
   script:
     - *debug_information
@@ -626,9 +586,9 @@ prometheus-alerts:
     - pytest -s -m 'prometheus' --connection=ansible --ansible-inventory=${CLUSTER_DIR}/inventory.yml --hosts='ansible://*'
   extends:
     - .ssh_setup
-    - .prometheus_rules
+    - .prometheus_stack_rules
   needs:
-    - job: prometheus-ready
+    - job: prometheus-stack-ready
   interruptible: true
 
@@ -653,14 +613,14 @@ prometheus-alerts:
     - .ssh_setup
   interruptible: true
 
-grafana-behave:
+prometheus-stack-behave:
   variables:
-    APP: "grafana"
+    APP: "prometheus-stack"
   needs:
-    - job: grafana-cert
+    - job: prometheus-stack-cert
   extends:
     - .behave
-    - .grafana_rules
+    - .prometheus_stack_rules
 
 nextcloud-behave:
   variables:
@@ -736,4 +696,3 @@ gitlab-merge-workaround:
   script:
     - echo "Not building anything, no changes."
   interruptible: true
-
diff --git a/ansible/group_vars/all/settings.yml.example b/ansible/group_vars/all/settings.yml.example
index b4797befaec007ac3912d7a2146f0d607700024c..6d711141e1b20b7c66f3588db063455210aedc08 100644
--- a/ansible/group_vars/all/settings.yml.example
+++ b/ansible/group_vars/all/settings.yml.example
@@ -92,6 +92,12 @@ enabled_applications:
   - 'ingress'
   - 'local-path-provisioner'
   # - 'single-sign-on'
+  #
+  # Monitoring components (auto-enabled by GitLab CI)
+  # - 'prometheus-stack'
+  # - 'loki-stack'
+  # - 'eventrouter'
+  #
   # The backup system Velero is disabled by default, see settings under `backup` above.
   # - 'velero'
   #
@@ -99,14 +105,10 @@ enabled_applications:
   # see https://docs.openappstack.net/en/latest/customization.html for details
   # - 'flux-custom'
   #
-  # Applications.
-  # - 'grafana'
-  # - 'loki-stack'
-  # - 'eventrouter'
+  # Applications
   # - 'nextcloud'
-  # - 'prometheus'
-  # 'rocketchat'
-  # - 'wordpress'
+  # - 'rocketchat'
+  # - 'wordpress'
 
 prometheus_enable_ingress: false
diff --git a/ansible/roles/apps/files/ingress_hr.yaml b/ansible/roles/apps/files/ingress_hr.yaml
new file mode 120000
index 0000000000000000000000000000000000000000..fa3810d6c3f82f8ee433c91738e4c8848abd7894
--- /dev/null
+++ b/ansible/roles/apps/files/ingress_hr.yaml
@@ -0,0 +1 @@
+../../../../flux/oas/ingress_hr.yaml
\ No newline at end of file
diff --git a/ansible/roles/apps/tasks/core.yml b/ansible/roles/apps/tasks/core.yml
index 1530ee233d5f9dedad5781bcbbdc1f1459c3d786..84dc81312dad96c046c31a69c2c764f5ca9e17fb 100644
--- a/ansible/roles/apps/tasks/core.yml
+++ b/ansible/roles/apps/tasks/core.yml
@@ -96,6 +96,36 @@
     resource_definition: "{{ lookup('file', 'local-path-provisioner_hr.yaml') | from_yaml }}"
   when: "'local-path-provisioner' in enabled_applications"
 
+- name: Create Kubernetes secret with nginx-ingress settings
+  tags:
+    - config
+    - flux
+    - nginx
+  vars:
+    flux_secret:
+      name: "ingress"
+      namespace: "oas"
+  include_tasks:
+    file: flux_secret.yml
+    apply:
+      tags:
+        - config
+        - flux
+        - nginx
+
+# We have to install nginx-ingress before other charts so that the ingress
+# validation webhook exists before it is used.
+# It will still be managed by flux afterwards.
+- name: Create ingress HelmResource
+  tags:
+    - config
+    - flux
+    - nginx
+  k8s:
+    state: present
+    resource_definition: "{{ lookup('file', 'ingress_hr.yaml') | from_yaml }}"
+  when: "'ingress' in enabled_applications"
+
 - name: Install flux
   tags:
     - flux
diff --git a/ansible/roles/apps/tasks/grafana.yml b/ansible/roles/apps/tasks/grafana.yml
deleted file mode 100644
index 803c5457ef1401c25848adffd42af8d9679722c8..0000000000000000000000000000000000000000
--- a/ansible/roles/apps/tasks/grafana.yml
+++ /dev/null
@@ -1,41 +0,0 @@
----
-
-- name: Create Kubernetes secret with grafana settings
-  tags:
-    - config
-    - flux
-    - monitoring
-    - grafana
-  vars:
-    flux_secret:
-      name: "grafana"
-      namespace: "oas"
-  include_tasks:
-    file: flux_secret.yml
-    apply:
-      tags:
-        - config
-        - flux
-        - monitoring
-        - grafana
-
-- name: Create monitoring-related persistent volumes
-  tags:
-    - config
-    - monitoring
-    - grafana
-  vars:
-    pvc:
-      name: "{{ item.name }}"
-      namespace: "oas"
-      size: "{{ item.size }}"
-  include_tasks:
-    file: pvc.yml
-    apply:
-      tags:
-        - config
-        - monitoring
-        - grafana
-  loop:
-    - name: "grafana"
-      size: "2Gi"
diff --git a/ansible/roles/apps/tasks/main.yml b/ansible/roles/apps/tasks/main.yml
index 18c388289e4d5381f57fe50abf25a47ba37cc3a4..b1a6f9e326583b70316b4171c1a1ad405fa24358 100644
--- a/ansible/roles/apps/tasks/main.yml
+++ b/ansible/roles/apps/tasks/main.yml
@@ -9,11 +9,8 @@
 - name: Tasks pertaining to letsencrypt
   import_tasks: letsencrypt.yml
 
-- name: Tasks pertaining to nginx
-  import_tasks: nginx.yml
-
-- name: Tasks pertaining to prometheus
-  import_tasks: prometheus.yml
+- name: Tasks pertaining to prometheus and grafana
+  import_tasks: prometheus-stack.yml
 
 - name: Tasks pertaining to loki-stack
   import_tasks: loki-stack.yml
@@ -21,9 +18,6 @@
 - name: Tasks pertaining to eventrouter
   import_tasks: eventrouter.yml
 
-- name: Tasks pertaining to grafana
-  import_tasks: grafana.yml
-
 - name: Tasks pertaining to Single sign-on
   import_tasks: single-sign-on.yml
 
diff --git a/ansible/roles/apps/tasks/nginx.yml b/ansible/roles/apps/tasks/nginx.yml
index 9224d81d35f5f9851d3395b6972517f74820d975..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 100644
--- a/ansible/roles/apps/tasks/nginx.yml
+++ b/ansible/roles/apps/tasks/nginx.yml
@@ -1,18 +0,0 @@
----
-
-- name: Create Kubernetes secret with nginx-ingress settings
-  tags:
-    - config
-    - flux
-    - nginx
-  vars:
-    flux_secret:
-      name: "ingress"
-      namespace: "oas"
-  include_tasks:
-    file: flux_secret.yml
-    apply:
-      tags:
-        - config
-        - flux
-        - nginx
diff --git a/ansible/roles/apps/tasks/prometheus.yml b/ansible/roles/apps/tasks/prometheus-stack.yml
similarity index 88%
rename from ansible/roles/apps/tasks/prometheus.yml
rename to ansible/roles/apps/tasks/prometheus-stack.yml
index 8dd149d119001cfcf877245a90db48ea976661ca..72d64954c4fedcdf344378146881f00b717e8d51 100644
--- a/ansible/roles/apps/tasks/prometheus.yml
+++ b/ansible/roles/apps/tasks/prometheus-stack.yml
@@ -1,7 +1,7 @@
 ---
 - name: Create auth secret for basic auth
   tags:
-    - prometheus
+    - prometheus-stack
     - config
     - secret
   k8s:
@@ -15,15 +15,16 @@
     data:
       auth: "{{ ('admin:' + (prometheus_basic_auth | password_hash('apr_md5_crypt')) + '\n') | b64encode }}"
   when: prometheus_enable_ingress is defined and prometheus_enable_ingress is true
+
 - name: Create Kubernetes secret with prometheus settings
   tags:
     - config
     - flux
     - monitoring
-    - prometheus
+    - prometheus-stack
   vars:
     flux_secret:
-      name: "prometheus"
+      name: "prometheus-stack"
       namespace: "oas"
   include_tasks:
     file: flux_secret.yml
     apply:
@@ -38,7 +39,7 @@
   tags:
     - config
     - monitoring
-    - prometheus
+    - prometheus-stack
   vars:
     pvc:
       name: "{{ item.name }}"
@@ -54,5 +55,5 @@
   loop:
     - name: "alertmanager"
       size: "2Gi"
-    - name: "prometheus-server"
-      size: "5Gi"
+    - name: "grafana"
+      size: "2Gi"
diff --git a/ansible/roles/apps/templates/settings/grafana.yaml b/ansible/roles/apps/templates/settings/grafana.yaml
deleted file mode 100644
index 7668e853df1fdf4777ff9f990170cebaa5745826..0000000000000000000000000000000000000000
--- a/ansible/roles/apps/templates/settings/grafana.yaml
+++ /dev/null
@@ -1,73 +0,0 @@
-adminPassword: "{{ grafana_admin_password }}"
-grafana.ini:
-  server:
-    root_url: "https://grafana.{{ domain }}"
-  auth.generic_oauth:
-    name: OpenAppStack
-    enabled: true
-    client_id: grafana
-    client_secret: "{{ grafana_oauth_client_secret }}"
-    scopes: "openid profile email openappstack_roles"
-    auth_url: "https://sso.{{ domain }}/oauth2/auth"
-    token_url: "https://sso.{{ domain }}/oauth2/token"
-    api_url: "https://sso.{{ domain }}/userinfo"
-    role_attribute_path: contains(openappstack_roles[*], 'admin') && 'Admin' || 'Editor'
-ingress:
-  enabled: true
-  annotations:
-    kubernetes.io/tls-acme: "true"
-  hosts:
-    - "grafana.{{ domain }}"
-  tls:
-    - secretName: grafana-tls
-      hosts:
-        - "grafana.{{ domain }}"
-persistence:
-  enabled: true
-  existingClaim: "grafana"
-podAnnotations:
-  backup.velero.io/backup-volumes: "storage"
-
-# This allows us to pick up the Loki datasource
-sidecar:
-  datasources:
-    enabled: true
-    label: grafana_datasource
-  # Make a configmap with the label `grafana_dashboard` to add dashboards to
-  # Grafana.
-  dashboards:
-    enabled: true
-    lablel: grafana_dashboard
-
-dashboardProviders:
-  dashboardproviders.yaml:
-    apiVersion: 1
-    providers:
-      - name: 'default'
-        orgId: 1
-        folder: ''
-        type: file
-        disableDeletion: false
-        editable: true
-        options:
-          path: /var/lib/grafana/dashboards
-dashboards:
-  default:
-    kube-dash:
-      gnetId: 11074
-      revision: 2
-      datasource: Prometheus
-    loki-dash:
-      gnetId: 10880
-      revision: 1
-      datasource: Loki
-
-datasources:
-  datasources.yaml:
-    apiVersion: 1
-    datasources:
-      - name: Prometheus
-        type: prometheus
-        url: http://prometheus-server
-        access: proxy
-        isDefault: true
diff --git a/ansible/roles/apps/templates/settings/prometheus-stack.yaml b/ansible/roles/apps/templates/settings/prometheus-stack.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..275e201d29fa1525bf8e3cbbc8a55491d92e09f2
--- /dev/null
+++ b/ansible/roles/apps/templates/settings/prometheus-stack.yaml
@@ -0,0 +1,254 @@
+#jinja2:lstrip_blocks:"True"
+# From: https://github.com/cablespaghetti/k3s-monitoring/blob/master/kube-prometheus-stack-values.yaml
+# Disable etcd monitoring. See https://github.com/cablespaghetti/k3s-monitoring/issues/4
+kubeEtcd:
+  enabled: false
+
+# Disable kube-controller-manager and kube-scheduler monitoring. See https://github.com/cablespaghetti/k3s-monitoring/issues/2
+kubeControllerManager:
+  enabled: false
+kubeScheduler:
+  enabled: false
+
+alertmanager:
+  persistentVolume:
+    existingClaim: "alertmanager"
+  config:
+    global:
+      # smtp_from: you@gmail.com
+      # smtp_smarthost: mailhog:1025
+      # smtp_require_tls: false
+      # smtp_smarthost: smtp.gmail.com:587
+      # smtp_auth_username: you@gmail.com
+      # smtp_auth_password: yourapppassword # https://support.google.com/mail/answer/185833?hl=en-GB
+      # smtp_auth_identity: you@gmail.com
+    route:
+      group_by: ['job']
+      group_wait: 30s
+      group_interval: 5m
+      repeat_interval: 1h
+      receiver: email
+      routes:
+        - match:
+            alertname: Watchdog
+          receiver: 'null'
+        - match:
+            alertname: CPUThrottlingHigh
+          receiver: 'null'
+        - match:
+            alertname: KubeMemoryOvercommit
+          receiver: 'null'
+        - match:
+            alertname: KubeCPUOvercommit
+          receiver: 'null'
+        - match:
+            alertname: KubeletTooManyPods
+          receiver: 'null'
+
+    # receivers:
+    # - name: 'null'
+    # - name: email
+    # email_configs:
+    # - send_resolved: true
+    # to: youremail@gmail.com
+
+    # Inhibition rules allow to mute a set of alerts given that another alert is firing.
+    # We use this to mute any warning-level notifications if the same alert is already critical.
+    inhibit_rules:
+      - source_match:
+          severity: 'critical'
+        target_match:
+          severity: 'warning'
+        # Apply inhibition if the alertname is the same.
+        equal: ['alertname', 'namespace']
+
+  alertmanagerSpec:
+# replicas: 3
+# podAntiAffinity: "soft"
+    storage:
+      volumeClaimTemplate:
+        spec:
+          accessModes: ["ReadWriteOnce"]
+          resources:
+            requests:
+              storage: 1Gi
+# resources:
+# limits:
+# cpu: 500m
+# memory: 64Mi
+# requests:
+# cpu: 25m
+# memory: 32Mi
+# priorityClassName: high-priority
+
+
+prometheus:
+  prometheusSpec:
+    scrapeInterval: "3m"
+    evaluationInterval: "3m"
+    retention: "10d"
+
+# replicas: 2
+# podAntiAffinity: "hard"
+    storageSpec:
+      volumeClaimTemplate:
+        spec:
+          accessModes: ["ReadWriteOnce"]
+          resources:
+            requests:
+              storage: 10Gi
+
+  {% if prometheus_enable_ingress is defined and prometheus_enable_ingress %}
+  ingress:
+    enabled: true
+    annotations:
+      nginx.ingress.kubernetes.io/auth-type: basic
+      nginx.ingress.kubernetes.io/auth-secret: prometheus-basic-auth
+      nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
+      kubernetes.io/tls-acme: "true"
+    hosts:
+      - "prometheus.{{ domain }}"
+    tls:
+      - secretName: prometheus-tls
+        hosts:
+          - "prometheus.{{ domain }}"
+  {% endif %}
+
+# resources:
+# limits:
+# cpu: "2"
+# memory: 5Gi
+# requests:
+# cpu: 100m
+# memory: 4Gi
+# priorityClassName: high-priority
+#
+# service:
+# sessionAffinity: "ClientIP"
+#
+
+grafana:
+  adminPassword: "{{ grafana_admin_password }}"
+  grafana.ini:
+    server:
+      root_url: "https://grafana.{{ domain }}"
+    auth.generic_oauth:
+      name: OpenAppStack
+      enabled: true
+      client_id: grafana
+      client_secret: "{{ grafana_oauth_client_secret }}"
+      scopes: "openid profile email openappstack_roles"
+      auth_url: "https://sso.{{ domain }}/oauth2/auth"
+      token_url: "https://sso.{{ domain }}/oauth2/token"
+      api_url: "https://sso.{{ domain }}/userinfo"
+      role_attribute_path: contains(openappstack_roles[*], 'admin') && 'Admin' || 'Editor'
+  ingress:
+    enabled: true
+    annotations:
+      kubernetes.io/tls-acme: "true"
+    hosts:
+      - "grafana.{{ domain }}"
+    tls:
+      - secretName: grafana-tls
+        hosts:
+          - "grafana.{{ domain }}"
+  persistence:
+    enabled: true
+    existingClaim: "grafana"
+  podAnnotations:
+    backup.velero.io/backup-volumes: "storage"
+
+  # This allows us to pick up the Loki datasource
+  # sidecar:
+  #   datasources:
+  #     enabled: true
+  #     label: grafana_datasource
+  #   # Make a configmap with the label `grafana_dashboard` to add dashboards to
+  #   # Grafana.
+  #   dashboards:
+  #     enabled: true
+  #     label: grafana_dashboard
+
+  # dashboardProviders:
+  #   dashboardproviders.yaml:
+  #     apiVersion: 1
+  #     providers:
+  #       - name: 'default'
+  #         orgId: 1
+  #         folder: ''
+  #         type: file
+  #         disableDeletion: false
+  #         editable: true
+  #         options:
+  #           path: /var/lib/grafana/dashboards
+  # dashboards:
+  #   default:
+  #     kube-dash:
+  #       gnetId: 11074
+  #       revision: 2
+  #       datasource: Prometheus
+  #     loki-dash:
+  #       gnetId: 10880
+  #       revision: 1
+  #       datasource: Loki
+
+  # datasources:
+  #   datasources.yaml:
+  #     apiVersion: 1
+  #     datasources:
+  #       - name: Prometheus
+  #         type: prometheus
+  #         url: http://prometheus-server
+  #         access: proxy
+  #         isDefault: true
+
+  plugins:
+    - grafana-piechart-panel
+
+# Resource limits suggested by cablespaghetti
+# resources:
+# limits:
+# cpu: 500m
+# memory: 128Mi
+# requests:
+# cpu: 25m
+# memory: 64Mi
+#
+# sidecar:
+# resources:
+# limits:
+# cpu: 100m
+# memory: 128Mi
+# requests:
+# cpu: 5m
+# memory: 64Mi
+
+#prometheusOperator:
+# resources:
+# limits:
+# cpu: 1
+# memory: 512Mi
+# requests:
+# cpu: 50m
+# memory: 128Mi
+# priorityClassName: high-priority
+
+#prometheus-node-exporter:
+# resources:
+# limits:
+# cpu: 50m
+# memory: 50Mi
+# requests:
+# cpu: 5m
+# memory: 16Mi
+# priorityClassName: high-priority
+
+kube-state-metrics:
+# resources:
+# limits:
+# cpu: 1
+# memory: 512Mi
+# requests:
+# cpu: 5m
+# memory: 128Mi
+# priorityClassName: high-priority
diff --git a/ansible/roles/apps/templates/settings/prometheus.yaml b/ansible/roles/apps/templates/settings/prometheus.yaml
deleted file mode 100644
index 1839cd7d6d6bc6ce2178c2731883d9a3dd7c4707..0000000000000000000000000000000000000000
--- a/ansible/roles/apps/templates/settings/prometheus.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-#jinja2:lstrip_blocks:"True"
-alertmanager:
-  persistentVolume:
-    existingClaim: "alertmanager"
-server:
-  global:
-    scrape_interval: "3m"
-    evaluation_interval: "3m"
-  persistentVolume:
-    existingClaim: "prometheus-server"
-  retention: "10d"
-  {% if prometheus_enable_ingress is defined and prometheus_enable_ingress %}
-  ingress:
-    enabled: true
-    annotations:
-      nginx.ingress.kubernetes.io/auth-type: basic
-      nginx.ingress.kubernetes.io/auth-secret: prometheus-basic-auth
-      nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
-      kubernetes.io/tls-acme: "true"
-    hosts:
-      - "prometheus.{{ domain }}"
-    tls:
-      - secretName: prometheus-tls
-        hosts:
-          - "prometheus.{{ domain }}"
-  {% endif %}
-  extraConfigmapMounts:
-    - name: alerts-custom
-      mountPath: /etc/custom-config
-      configMap: prometheus-alerts-custom
-      readOnly: true
-
-# https://github.com/helm/charts/issues/9254#issuecomment-443554083
-configmapReload:
-  prometheus:
-    extraConfigmapMounts:
-      - name: alerts-custom
-        mountPath: /etc/custom-config
-        configMap: prometheus-alerts-custom
-        readOnly: true
-    extraVolumeDirs:
-      - /etc/custom-config
-
-serverFiles:
-  prometheus.yml:
-    rule_files:
-      - /etc/custom-config/custom-openappstack-alerts.yaml
-      - /etc/custom-config/alertmanager-alerts.yaml
-      - /etc/custom-config/cert-manager-alerts.yaml
-      - /etc/custom-config/coredns-alerts.yaml
-      - /etc/custom-config/kube-state-metrics-alerts.yaml
-      - /etc/custom-config/kubernetes-alerts.yaml
-      - /etc/custom-config/loki-alerts.yaml
-      - /etc/custom-config/node-exporter-alerts.yaml
-      - /etc/custom-config/prometheus-alerts.yaml
- - /etc/custom-config/promtail-alerts.yaml diff --git a/flux/oas/grafana_hr.yaml b/flux/oas/grafana_hr.yaml deleted file mode 100644 index 1b7bbfe80025f540a50c33d54747bccd844743c9..0000000000000000000000000000000000000000 --- a/flux/oas/grafana_hr.yaml +++ /dev/null @@ -1,18 +0,0 @@ ---- -apiVersion: helm.fluxcd.io/v1 -kind: HelmRelease -metadata: - name: grafana - namespace: oas - annotations: - flux.weave.works/automated: "false" -spec: - releaseName: grafana - chart: - repository: https://grafana.github.io/helm-charts - name: grafana - version: 6.2.1 - valuesFrom: - - secretKeyRef: - name: grafana-settings - key: values.yaml diff --git a/flux/oas/prometheus_hr.yaml b/flux/oas/prometheus-stack_hr.yaml similarity index 62% rename from flux/oas/prometheus_hr.yaml rename to flux/oas/prometheus-stack_hr.yaml index 8e4f40b3918d6bc12a321052d5220fff9a2df2dd..659914478bf3d766bd673af42c3e48b0d21b59ea 100644 --- a/flux/oas/prometheus_hr.yaml +++ b/flux/oas/prometheus-stack_hr.yaml @@ -2,17 +2,18 @@ apiVersion: helm.fluxcd.io/v1 kind: HelmRelease metadata: - name: prometheus + name: prometheus-stack namespace: oas annotations: flux.weave.works/automated: "false" spec: - releaseName: prometheus + releaseName: prometheus-stack chart: repository: https://prometheus-community.github.io/helm-charts - name: prometheus - version: 13.2.1 + name: kube-prometheus-stack + version: 14.5.0 valuesFrom: - secretKeyRef: - name: prometheus-settings + name: prometheus-stack-settings key: values.yaml + timeout: 300 diff --git a/flux/oas/prometheus_alerts_custom_cm.yaml b/flux/oas/prometheus_alerts_custom_cm.yaml deleted file mode 100644 index 340d772da3eaa7c24d06441ec16a245a659376ed..0000000000000000000000000000000000000000 --- a/flux/oas/prometheus_alerts_custom_cm.yaml +++ /dev/null @@ -1,1567 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: prometheus-alerts-custom - namespace: oas -data: - custom-openappstack-alerts.yaml: | - groups: - - name: general - rules: - - alert: service_down - expr: up == 0 or probe_success == 0 - for: 5m - labels: - severity: page - annotations: - summary: "Instance {{ $labels.instance }} down" - description: "{{ $labels.instance }} of job {{ $labels.job }} has been down\ - \ for more than 5 minutes." - - - name: systemd - rules: - - alert: failed_systemd_units - expr: node_systemd_unit_state{state="failed"} !=0 - for: 5m - labels: - severity: page - annotations: - summary: "Systemd unit failed on {{ $labels.instance }}" - description: "Warning: Systemd unit failed on {{ $labels.instance }} (job {{\ - \ $labels.job }}) for more than 5 min." - - - name: maintenance - rules: - - alert: apt_upgrades_pending - expr: apt_upgrades_pending !=0 - for: 2d - labels: - severity: warning - annotations: - summary: "Apt upgrades available on {{ $labels.instance }}" - description: "Warning: Apt upgrades available on {{ $labels.instance }} (job\ - \ {{ $labels.job }}) for more then 2 days." - - - alert: node_reboot_required - expr: node_reboot_required !=0 - for: 2d - labels: - severity: warning - annotations: - summary: "Reboot required on {{ $labels.instance }}" - description: "Warning: Reboot required on {{ $labels.instance }} (job {{ $labels.job\ - \ }}) for more then 2 days." - alertmanager-alerts.yaml: | - groups: - - name: alertmanager.rules - rules: - - alert: AlertmanagerFailedReload - annotations: - description: Configuration has failed to load for {{$labels.instance}}. - summary: Reloading an Alertmanager configuration has failed. 
- expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(alertmanager_config_last_reload_successful{job="alertmanager"}[5m]) == 0 - for: 10m - labels: - severity: critical - - alert: AlertmanagerMembersInconsistent - annotations: - description: Alertmanager {{$labels.instance}} has only found {{ $value }} members - of the {{$labels.job}} cluster. - summary: A member of an Alertmanager cluster has not found all other cluster - members. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(alertmanager_cluster_members{job="alertmanager"}[5m]) - < on (job) group_left - count by (job) (max_over_time(alertmanager_cluster_members{job="alertmanager"}[5m])) - for: 10m - labels: - severity: critical - - alert: AlertmanagerFailedToSendAlerts - annotations: - description: Alertmanager {{$labels.instance}} failed to send {{ $value | humanizePercentage - }} of notifications to {{ $labels.integration }}. - summary: An Alertmanager instance failed to send notifications. - expr: | - ( - rate(alertmanager_notifications_failed_total{job="alertmanager"}[5m]) - / - rate(alertmanager_notifications_total{job="alertmanager"}[5m]) - ) - > 0.01 - for: 5m - labels: - severity: warning - - alert: AlertmanagerClusterFailedToSendAlerts - annotations: - description: The minimum notification failure rate to {{ $labels.integration - }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage - }}. - summary: All Alertmanager instances in a cluster failed to send notifications - to a critical integration. - expr: | - min by (job, integration) ( - rate(alertmanager_notifications_failed_total{job="alertmanager", integration=~`.*`}[5m]) - / - rate(alertmanager_notifications_total{job="alertmanager", integration=~`.*`}[5m]) - ) - > 0.01 - for: 5m - labels: - severity: critical - - alert: AlertmanagerClusterFailedToSendAlerts - annotations: - description: The minimum notification failure rate to {{ $labels.integration - }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage - }}. - summary: All Alertmanager instances in a cluster failed to send notifications - to a non-critical integration. - expr: | - min by (job, integration) ( - rate(alertmanager_notifications_failed_total{job="alertmanager", integration!~`.*`}[5m]) - / - rate(alertmanager_notifications_total{job="alertmanager", integration!~`.*`}[5m]) - ) - > 0.01 - for: 5m - labels: - severity: warning - - alert: AlertmanagerConfigInconsistent - annotations: - description: Alertmanager instances within the {{$labels.job}} cluster have - different configurations. - summary: Alertmanager instances within the same cluster have different configurations. - expr: | - count by (job) ( - count_values by (job) ("config_hash", alertmanager_config_hash{job="alertmanager"}) - ) - != 1 - for: 20m - labels: - severity: critical - - alert: AlertmanagerClusterDown - annotations: - description: '{{ $value | humanizePercentage }} of Alertmanager instances within - the {{$labels.job}} cluster have been up for less than half of the last 5m.' - summary: Half or more of the Alertmanager instances within the same cluster - are down. 
- expr: | - ( - count by (job) ( - avg_over_time(up{job="alertmanager"}[5m]) < 0.5 - ) - / - count by (job) ( - up{job="alertmanager"} - ) - ) - >= 0.5 - for: 5m - labels: - severity: critical - - alert: AlertmanagerClusterCrashlooping - annotations: - description: '{{ $value | humanizePercentage }} of Alertmanager instances within - the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.' - summary: Half or more of the Alertmanager instances within the same cluster - are crashlooping. - expr: | - ( - count by (job) ( - changes(process_start_time_seconds{job="alertmanager"}[10m]) > 4 - ) - / - count by (job) ( - up{job="alertmanager"} - ) - ) - >= 0.5 - for: 5m - labels: - severity: critical - cert-manager-alerts.yaml: | - groups: - - name: cert-manager - rules: [] - - name: certificates - rules: - - alert: CertManagerCertExpirySoon - annotations: - dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager - description: The domain that this cert covers will be unavailable after {{ $value - | humanizeDuration }}. Clients using endpoints that this cert protects will - start to fail in {{ $value | humanizeDuration }}. - runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertexpirysoon - summary: The cert `{{ $labels.name }}` is {{ $value | humanizeDuration }} from - expiry, it should have renewed over a week ago. - expr: | - avg by (exported_namespace, namespace, name) ( - certmanager_certificate_expiration_timestamp_seconds - time() - ) < (21 * 24 * 3600) # 21 days in seconds - for: 1h - labels: - severity: warning - - alert: CertManagerCertNotReady - annotations: - dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager - description: This certificate has not been ready to serve traffic for at least - 10m. If the cert is being renewed or there is another valid cert, the ingress - controller _may_ be able to serve that instead. - runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagercertnotready - summary: The cert `{{ $labels.name }}` is not ready to serve traffic. - expr: | - max by (name, exported_namespace, namespace, condition) ( - certmanager_certificate_ready_status{condition!="True"} == 1 - ) - for: 10m - labels: - severity: critical - - alert: CertManagerHittingRateLimits - annotations: - dashboard_url: https://grafana.example.com/d/TvuRo2iMk/cert-manager - description: Depending on the rate limit, cert-manager may be unable to generate - certificates for up to a week. - runbook_url: https://gitlab.com/uneeq-oss/cert-manager-mixin/-/blob/master/RUNBOOK.md#certmanagerhittingratelimits - summary: Cert manager hitting LetsEncrypt rate limits. - expr: | - sum by (host) ( - rate(certmanager_http_acme_client_request_count{status="429"}[5m]) - ) > 0 - for: 5m - labels: - severity: critical - coredns-alerts.yaml: | - groups: - - name: coredns - rules: - - alert: CoreDNSLatencyHigh - annotations: - message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server - {{ $labels.server }} zone {{ $labels.zone }} . - runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh - expr: | - histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 4 - for: 10m - labels: - severity: critical - - alert: CoreDNSErrorsHigh - annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} - of requests. 
- runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh - expr: | - sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m])) - / - sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns"}[5m])) > 0.03 - for: 10m - labels: - severity: critical - - alert: CoreDNSErrorsHigh - annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} - of requests. - runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh - expr: | - sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m])) - / - sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns"}[5m])) > 0.01 - for: 10m - labels: - severity: warning - - name: coredns_forward - rules: - - alert: CoreDNSForwardLatencyHigh - annotations: - message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding - requests to {{ $labels.to }}. - runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh - expr: | - histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(to, le)) > 4 - for: 10m - labels: - severity: critical - - alert: CoreDNSForwardErrorsHigh - annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} - of forward requests to {{ $labels.to }}. - runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh - expr: | - sum(rate(coredns_forward_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m])) - / - sum(rate(coredns_forward_response_rcode_count_total{job="kube-dns"}[5m])) > 0.03 - for: 10m - labels: - severity: critical - - alert: CoreDNSForwardErrorsHigh - annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} - of forward requests to {{ $labels.to }}. - runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh - expr: | - sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m])) - / - sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns"}[5m])) > 0.01 - for: 10m - labels: - severity: warning - kube-state-metrics-alerts.yaml: | - groups: - - name: kube-state-metrics - rules: - - alert: KubeStateMetricsListErrors - annotations: - description: kube-state-metrics is experiencing errors at an elevated rate in - list operations. This is likely causing it to not be able to expose metrics - about Kubernetes objects correctly or at all. - summary: kube-state-metrics is experiencing errors in list operations. - expr: | - (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) - / - sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) - > 0.01 - for: 15m - labels: - severity: critical - - alert: KubeStateMetricsWatchErrors - annotations: - description: kube-state-metrics is experiencing errors at an elevated rate in - watch operations. This is likely causing it to not be able to expose metrics - about Kubernetes objects correctly or at all. - summary: kube-state-metrics is experiencing errors in watch operations. 
- expr: | - (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) - / - sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) - > 0.01 - for: 15m - labels: - severity: critical - - alert: KubeStateMetricsShardingMismatch - annotations: - description: kube-state-metrics pods are running with different --total-shards - configuration, some Kubernetes objects may be exposed multiple times or not - exposed at all. - summary: kube-state-metrics sharding is misconfigured. - expr: | - stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0 - for: 15m - labels: - severity: critical - - alert: KubeStateMetricsShardsMissing - annotations: - description: kube-state-metrics shards are missing, some Kubernetes objects - are not being exposed. - summary: kube-state-metrics shards are missing. - expr: | - 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1 - - - sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) - != 0 - for: 15m - labels: - severity: critical - kubernetes-alerts.yaml: | - groups: - - name: kubernetes-apps - rules: - - alert: KubePodCrashLooping - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container - }}) is restarting {{ printf "%.2f" $value }} times / 10 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping - summary: Pod is crash looping. - expr: | - rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[10m]) * 60 * 5 > 0 - for: 15m - labels: - severity: warning - - alert: KubePodNotReady - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready - state for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready - summary: Pod has been in a non-ready state for more than 15 minutes. - expr: | - sum by (namespace, pod) ( - max by(namespace, pod) ( - kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} - ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( - 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) - ) - ) > 0 - for: 15m - labels: - severity: warning - - alert: KubeDeploymentGenerationMismatch - annotations: - description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment - }} does not match, this indicates that the Deployment has failed but has not - been rolled back. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch - summary: Deployment generation mismatch due to possible roll-back - expr: | - kube_deployment_status_observed_generation{job="kube-state-metrics"} - != - kube_deployment_metadata_generation{job="kube-state-metrics"} - for: 15m - labels: - severity: warning - - alert: KubeDeploymentReplicasMismatch - annotations: - description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has - not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch - summary: Deployment has not matched the expected number of replicas. 
- expr: | - ( - kube_deployment_spec_replicas{job="kube-state-metrics"} - != - kube_deployment_status_replicas_available{job="kube-state-metrics"} - ) and ( - changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetReplicasMismatch - annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has - not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch - summary: Deployment has not matched the expected number of replicas. - expr: | - ( - kube_statefulset_status_replicas_ready{job="kube-state-metrics"} - != - kube_statefulset_status_replicas{job="kube-state-metrics"} - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[10m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetGenerationMismatch - annotations: - description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset - }} does not match, this indicates that the StatefulSet has failed but has - not been rolled back. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch - summary: StatefulSet generation mismatch due to possible roll-back - expr: | - kube_statefulset_status_observed_generation{job="kube-state-metrics"} - != - kube_statefulset_metadata_generation{job="kube-state-metrics"} - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetUpdateNotRolledOut - annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update - has not been rolled out. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout - summary: StatefulSet update has not been rolled out. - expr: | - ( - max without (revision) ( - kube_statefulset_status_current_revision{job="kube-state-metrics"} - unless - kube_statefulset_status_update_revision{job="kube-state-metrics"} - ) - * - ( - kube_statefulset_replicas{job="kube-state-metrics"} - != - kube_statefulset_status_replicas_updated{job="kube-state-metrics"} - ) - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeDaemonSetRolloutStuck - annotations: - description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not - finished or progressed for at least 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck - summary: DaemonSet rollout is stuck. 
- expr: | - ( - ( - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) or ( - kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} - != - 0 - ) or ( - kube_daemonset_updated_number_scheduled{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) or ( - kube_daemonset_status_number_available{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) - ) and ( - changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeContainerWaiting - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} - has been in waiting state for longer than 1 hour. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting - summary: Pod container waiting longer than 1 hour - expr: | - sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 - for: 1h - labels: - severity: warning - - alert: KubeDaemonSetNotScheduled - annotations: - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset - }} are not scheduled.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled - summary: DaemonSet pods are not scheduled. - expr: | - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - - - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 - for: 10m - labels: - severity: warning - - alert: KubeDaemonSetMisScheduled - annotations: - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset - }} are running where they are not supposed to run.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled - summary: DaemonSet pods are misscheduled. - expr: | - kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 - for: 15m - labels: - severity: warning - - alert: KubeJobCompletion - annotations: - description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more - than 12 hours to complete. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion - summary: Job did not complete in time - expr: | - kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 - for: 12h - labels: - severity: warning - - alert: KubeJobFailed - annotations: - description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. - Removing failed job after investigation should clear this alert. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed - summary: Job failed to complete. - expr: | - kube_job_failed{job="kube-state-metrics"} > 0 - for: 15m - labels: - severity: warning - - alert: KubeHpaReplicasMismatch - annotations: - description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the - desired number of replicas for longer than 15 minutes. 
- runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch - summary: HPA has not matched descired number of replicas. - expr: | - (kube_hpa_status_desired_replicas{job="kube-state-metrics"} - != - kube_hpa_status_current_replicas{job="kube-state-metrics"}) - and - (kube_hpa_status_current_replicas{job="kube-state-metrics"} - > - kube_hpa_spec_min_replicas{job="kube-state-metrics"}) - and - (kube_hpa_status_current_replicas{job="kube-state-metrics"} - < - kube_hpa_spec_max_replicas{job="kube-state-metrics"}) - and - changes(kube_hpa_status_current_replicas[15m]) == 0 - for: 15m - labels: - severity: warning - - alert: KubeHpaMaxedOut - annotations: - description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running - at max replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout - summary: HPA is running at max replicas - expr: | - kube_hpa_status_current_replicas{job="kube-state-metrics"} - == - kube_hpa_spec_max_replicas{job="kube-state-metrics"} - for: 15m - labels: - severity: warning - - name: kubernetes-resources - rules: - - alert: KubeCPUOvercommit - annotations: - description: Cluster has overcommitted CPU resource requests for Pods and cannot - tolerate node failure. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: | - sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{}) - / - sum(kube_node_status_allocatable_cpu_cores) - > - (count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores) - for: 5m - labels: - severity: warning - - alert: KubeMemoryOvercommit - annotations: - description: Cluster has overcommitted memory resource requests for Pods and - cannot tolerate node failure. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit - summary: Cluster has overcommitted memory resource requests. - expr: | - sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{}) - / - sum(kube_node_status_allocatable_memory_bytes) - > - (count(kube_node_status_allocatable_memory_bytes)-1) - / - count(kube_node_status_allocatable_memory_bytes) - for: 5m - labels: - severity: warning - - alert: KubeCPUQuotaOvercommit - annotations: - description: Cluster has overcommitted CPU resource requests for Namespaces. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) - / - sum(kube_node_status_allocatable_cpu_cores) - > 1.5 - for: 5m - labels: - severity: warning - - alert: KubeMemoryQuotaOvercommit - annotations: - description: Cluster has overcommitted memory resource requests for Namespaces. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit - summary: Cluster has overcommitted memory resource requests. 
- expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) - / - sum(kube_node_status_allocatable_memory_bytes{job="kube-state-metrics"}) - > 1.5 - for: 5m - labels: - severity: warning - - alert: KubeQuotaAlmostFull - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage - }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull - summary: Namespace quota is going to be full. - expr: | - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 0.9 < 1 - for: 15m - labels: - severity: info - - alert: KubeQuotaFullyUsed - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage - }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused - summary: Namespace quota is fully used. - expr: | - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - == 1 - for: 15m - labels: - severity: info - - alert: KubeQuotaExceeded - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage - }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded - summary: Namespace quota has exceeded the limits. - expr: | - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 1 - for: 15m - labels: - severity: warning - - alert: CPUThrottlingHigh - annotations: - description: '{{ $value | humanizePercentage }} throttling of CPU in namespace - {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod - }}.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh - summary: Processes experience elevated CPU throttling. - expr: | - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) - / - sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) - > ( 25 / 100 ) - for: 15m - labels: - severity: info - - name: kubernetes-storage - rules: - - alert: KubePersistentVolumeFillingUp - annotations: - description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim - }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage - }} free. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: | - kubelet_volume_stats_available_bytes{job="kubelet"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet"} - < 0.03 - for: 1m - labels: - severity: critical - - alert: KubePersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim - }} in Namespace {{ $labels.namespace }} is expected to fill up within four - days. Currently {{ $value | humanizePercentage }} is available. 
- runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: | - ( - kubelet_volume_stats_available_bytes{job="kubelet"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet"} - ) < 0.15 - and - predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 - for: 1h - labels: - severity: warning - - alert: KubePersistentVolumeErrors - annotations: - description: The persistent volume {{ $labels.persistentvolume }} has status - {{ $labels.phase }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors - summary: PersistentVolume is having issues with provisioning. - expr: | - kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 - for: 5m - labels: - severity: critical - - name: kubernetes-system - rules: - - alert: KubeVersionMismatch - annotations: - description: There are {{ $value }} different semantic versions of Kubernetes - components running. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch - summary: Different semantic versions of Kubernetes components running. - expr: | - count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 - for: 15m - labels: - severity: warning - - alert: KubeClientErrors - annotations: - description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance - }}' is experiencing {{ $value | humanizePercentage }} errors.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors - summary: Kubernetes API server client is experiencing errors. - expr: | - (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) - / - sum(rate(rest_client_requests_total[5m])) by (instance, job)) - > 0.01 - for: 15m - labels: - severity: warning - - name: kube-apiserver-slos - rules: - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) - and - sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) - for: 2m - labels: - long: 1h - severity: critical - short: 5m - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) - and - sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) - for: 15m - labels: - long: 6h - severity: critical - short: 30m - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. 
- expr: | - sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) - and - sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) - for: 1h - labels: - long: 1d - severity: warning - short: 2h - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) - and - sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) - for: 3h - labels: - long: 3d - severity: warning - short: 6h - - name: kubernetes-system-apiserver - rules: - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to the apiserver is expiring - in less than 7.0 days. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration - summary: Client certificate is about to expire. - expr: | - apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800 - labels: - severity: warning - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to the apiserver is expiring - in less than 24.0 hours. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration - summary: Client certificate is about to expire. - expr: | - apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400 - labels: - severity: critical - - alert: AggregatedAPIErrors - annotations: - description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has - reported errors. It has appeared unavailable {{ $value | humanize }} times - averaged over the past 10m. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors - summary: An aggregated API has reported errors. - expr: | - sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[10m])) > 4 - labels: - severity: warning - - alert: AggregatedAPIDown - annotations: - description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has - been only {{ $value | humanize }}% available over the last 10m. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown - summary: An aggregated API is down. - expr: | - (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 - for: 5m - labels: - severity: warning - - alert: KubeAPITerminatedRequests - annotations: - description: The apiserver has terminated {{ $value | humanizePercentage }} - of its incoming requests. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapiterminatedrequests - summary: The apiserver has terminated {{ $value | humanizePercentage }} of its - incoming requests. 
- expr: | - sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="kube-apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) ) > 0.20 - for: 5m - labels: - severity: warning - - name: kubernetes-system-kubelet - rules: - - alert: KubeNodeNotReady - annotations: - description: '{{ $labels.node }} has been unready for more than 15 minutes.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready - summary: Node is not ready. - expr: | - kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 - for: 15m - labels: - severity: warning - - alert: KubeNodeUnreachable - annotations: - description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable - summary: Node is unreachable. - expr: | - (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 - for: 15m - labels: - severity: warning - - alert: KubeletTooManyPods - annotations: - description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage - }} of its Pod capacity. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods - summary: Kubelet is running at capacity. - expr: | - count by(node) ( - (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) - ) - / - max by(node) ( - kube_node_status_capacity_pods{job="kube-state-metrics"} != 1 - ) > 0.95 - for: 15m - labels: - severity: warning - - alert: KubeNodeReadinessFlapping - annotations: - description: The readiness status of node {{ $labels.node }} has changed {{ - $value }} times in the last 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping - summary: Node readiness status is flapping. - expr: | - sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 - for: 15m - labels: - severity: warning - - alert: KubeletPlegDurationHigh - annotations: - description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile - duration of {{ $value }} seconds on node {{ $labels.node }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh - summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. - expr: | - node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 - for: 5m - labels: - severity: warning - - alert: KubeletPodStartUpLatencyHigh - annotations: - description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds - on node {{ $labels.node }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh - summary: Kubelet Pod startup latency is too high. 
- expr: | - histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60 - for: 15m - labels: - severity: warning - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{ $labels.node }} expires - in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. - expr: | - kubelet_certificate_manager_client_ttl_seconds < 604800 - labels: - severity: warning - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{ $labels.node }} expires - in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. - expr: | - kubelet_certificate_manager_client_ttl_seconds < 86400 - labels: - severity: critical - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{ $labels.node }} expires - in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: | - kubelet_certificate_manager_server_ttl_seconds < 604800 - labels: - severity: warning - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{ $labels.node }} expires - in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: | - kubelet_certificate_manager_server_ttl_seconds < 86400 - labels: - severity: critical - - alert: KubeletClientCertificateRenewalErrors - annotations: - description: Kubelet on node {{ $labels.node }} has failed to renew its client - certificate ({{ $value | humanize }} errors in the last 5 minutes). - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors - summary: Kubelet has failed to renew its client certificate. - expr: | - increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: KubeletServerCertificateRenewalErrors - annotations: - description: Kubelet on node {{ $labels.node }} has failed to renew its server - certificate ({{ $value | humanize }} errors in the last 5 minutes). - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors - summary: Kubelet has failed to renew its server certificate. 
- expr: | - increase(kubelet_server_expiration_renew_errors[5m]) > 0 - for: 15m - labels: - severity: warning - - name: kubernetes-system-scheduler - rules: [] - - name: kubernetes-system-controller-manager - rules: [] - loki-alerts.yaml: | - groups: - - name: loki_alerts - rules: - - alert: LokiRequestErrors - annotations: - message: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - expr: | - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) - / - sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) - > 10 - for: 15m - labels: - severity: critical - - alert: LokiRequestPanics - annotations: - message: | - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. - expr: | - sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 - labels: - severity: critical - - alert: LokiRequestLatency - annotations: - message: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - expr: | - namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > 1 - for: 15m - labels: - severity: critical - node-exporter-alerts.yaml: | - groups: - - name: node-exporter - rules: - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available space left and is filling up. - summary: Filesystem is predicted to run out of space within the next 24 hours. - expr: | - ( - node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 40 - and - predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available space left and is filling up fast. - summary: Filesystem is predicted to run out of space within the next 4 hours. - expr: | - ( - node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 20 - and - predict_linear(node_filesystem_avail_bytes{job="node",fstype!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available space left. - summary: Filesystem has less than 5% space left. - expr: | - ( - node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available space left. - summary: Filesystem has less than 3% space left. 
- expr: | - ( - node_filesystem_avail_bytes{job="node",fstype!=""} / node_filesystem_size_bytes{job="node",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left and is filling up. - summary: Filesystem is predicted to run out of inodes within the next 24 hours. - expr: | - ( - node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 40 - and - predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. - summary: Filesystem is predicted to run out of inodes within the next 4 hours. - expr: | - ( - node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 20 - and - predict_linear(node_filesystem_files_free{job="node",fstype!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left. - summary: Filesystem has less than 5% inodes left. - expr: | - ( - node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 5 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left. - summary: Filesystem has less than 3% inodes left. - expr: | - ( - node_filesystem_files_free{job="node",fstype!=""} / node_filesystem_files{job="node",fstype!=""} * 100 < 3 - and - node_filesystem_readonly{job="node",fstype!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeNetworkReceiveErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - {{ printf "%.0f" $value }} receive errors in the last two minutes.' - summary: Network interface is reporting many receive errors. - expr: | - rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 - for: 1h - labels: - severity: warning - - alert: NodeNetworkTransmitErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - summary: Network interface is reporting many transmit errors. - expr: | - rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 - for: 1h - labels: - severity: warning - - alert: NodeHighNumberConntrackEntriesUsed - annotations: - description: '{{ $value | humanizePercentage }} of conntrack entries are used.' - summary: Number of conntrack are getting close to the limit. 
- expr: | - (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 - labels: - severity: warning - - alert: NodeTextFileCollectorScrapeError - annotations: - description: Node Exporter text file collector failed to scrape. - summary: Node Exporter text file collector failed to scrape. - expr: | - node_textfile_scrape_error{job="node"} == 1 - labels: - severity: warning - - alert: NodeClockSkewDetected - annotations: - description: Clock on {{ $labels.instance }} is out of sync by more than 300s. - Ensure NTP is configured correctly on this host. - summary: Clock skew detected. - expr: | - ( - node_timex_offset_seconds > 0.05 - and - deriv(node_timex_offset_seconds[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds < -0.05 - and - deriv(node_timex_offset_seconds[5m]) <= 0 - ) - for: 10m - labels: - severity: warning - - alert: NodeClockNotSynchronising - annotations: - description: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP - is configured on this host. - summary: Clock not synchronising. - expr: | - min_over_time(node_timex_sync_status[5m]) == 0 - and - node_timex_maxerror_seconds >= 16 - for: 10m - labels: - severity: warning - - alert: NodeRAIDDegraded - annotations: - description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is - in degraded state due to one or more disks failures. Number of spare drives - is insufficient to fix issue automatically. - summary: RAID Array is degraded - expr: | - node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 - for: 15m - labels: - severity: critical - - alert: NodeRAIDDiskFailure - annotations: - description: At least one device in RAID array on {{ $labels.instance }} failed. - Array '{{ $labels.device }}' needs attention and possibly a disk swap. - summary: Failed device in RAID array - expr: | - node_md_disks{state="failed"} > 0 - labels: - severity: warning - prometheus-alerts.yaml: | - groups: - - name: prometheus - rules: - - alert: PrometheusBadConfig - annotations: - description: Prometheus {{$labels.instance}} has failed to reload its configuration. - summary: Failed Prometheus configuration reload. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_config_last_reload_successful{job="prometheus"}[5m]) == 0 - for: 10m - labels: - severity: critical - - alert: PrometheusNotificationQueueRunningFull - annotations: - description: Alert notification queue of Prometheus {{$labels.instance}} is - running full. - summary: Prometheus alert notification queue predicted to run full in less than - 30m. - expr: | - # Without min_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - predict_linear(prometheus_notifications_queue_length{job="prometheus"}[5m], 60 * 30) - > - min_over_time(prometheus_notifications_queue_capacity{job="prometheus"}[5m]) - ) - for: 15m - labels: - severity: warning - - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers - annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus - {{$labels.instance}} to Alertmanager {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts to a - specific Alertmanager. 
- expr: | - ( - rate(prometheus_notifications_errors_total{job="prometheus"}[5m]) - / - rate(prometheus_notifications_sent_total{job="prometheus"}[5m]) - ) - * 100 - > 1 - for: 15m - labels: - severity: warning - - alert: PrometheusNotConnectedToAlertmanagers - annotations: - description: Prometheus {{$labels.instance}} is not connected to any Alertmanagers. - summary: Prometheus is not connected to any Alertmanagers. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus"}[5m]) < 1 - for: 10m - labels: - severity: warning - - alert: PrometheusTSDBReloadsFailing - annotations: - description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} - reload failures over the last 3h. - summary: Prometheus has issues reloading blocks from disk. - expr: | - increase(prometheus_tsdb_reloads_failures_total{job="prometheus"}[3h]) > 0 - for: 4h - labels: - severity: warning - - alert: PrometheusTSDBCompactionsFailing - annotations: - description: Prometheus {{$labels.instance}} has detected {{$value | humanize}} - compaction failures over the last 3h. - summary: Prometheus has issues compacting blocks. - expr: | - increase(prometheus_tsdb_compactions_failed_total{job="prometheus"}[3h]) > 0 - for: 4h - labels: - severity: warning - - alert: PrometheusNotIngestingSamples - annotations: - description: Prometheus {{$labels.instance}} is not ingesting samples. - summary: Prometheus is not ingesting samples. - expr: | - ( - rate(prometheus_tsdb_head_samples_appended_total{job="prometheus"}[5m]) <= 0 - and - ( - sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus"}) > 0 - or - sum without(rule_group) (prometheus_rule_group_rules{job="prometheus"}) > 0 - ) - ) - for: 10m - labels: - severity: warning - - alert: PrometheusDuplicateTimestamps - annotations: - description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} - samples/s with different values but duplicated timestamp. - summary: Prometheus is dropping samples with duplicate timestamps. - expr: | - rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus"}[5m]) > 0 - for: 10m - labels: - severity: warning - - alert: PrometheusOutOfOrderTimestamps - annotations: - description: Prometheus {{$labels.instance}} is dropping {{ printf "%.4g" $value }} - samples/s with timestamps arriving out of order. - summary: Prometheus drops samples with out-of-order timestamps. - expr: | - rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus"}[5m]) > 0 - for: 10m - labels: - severity: warning - - alert: PrometheusRemoteStorageFailures - annotations: - description: Prometheus {{$labels.instance}} failed to send {{ printf "%.1f" - $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} - summary: Prometheus fails to send samples to remote storage. 
- expr: | - ( - (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m])) - / - ( - (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus"}[5m])) - + - (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus"}[5m])) - ) - ) - * 100 - > 1 - for: 15m - labels: - severity: critical - - alert: PrometheusRemoteWriteBehind - annotations: - description: Prometheus {{$labels.instance}} remote write is {{ printf "%.1f" - $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. - summary: Prometheus remote write is behind. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus"}[5m]) - - ignoring(remote_name, url) group_right - max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus"}[5m]) - ) - > 120 - for: 15m - labels: - severity: critical - - alert: PrometheusRemoteWriteDesiredShards - annotations: - description: Prometheus {{$labels.instance}} remote write desired shards calculation - wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url - }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus"}` - $labels.instance | query | first | value }}. - summary: Prometheus remote write desired shards calculation wants to run more - than configured max shards. - expr: | - # Without max_over_time, failed scrapes could create false negatives, see - # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - ( - max_over_time(prometheus_remote_storage_shards_desired{job="prometheus"}[5m]) - > - max_over_time(prometheus_remote_storage_shards_max{job="prometheus"}[5m]) - ) - for: 15m - labels: - severity: warning - - alert: PrometheusRuleFailures - annotations: - description: Prometheus {{$labels.instance}} has failed to evaluate {{ printf - "%.0f" $value }} rules in the last 5m. - summary: Prometheus is failing rule evaluations. - expr: | - increase(prometheus_rule_evaluation_failures_total{job="prometheus"}[5m]) > 0 - for: 15m - labels: - severity: critical - - alert: PrometheusMissingRuleEvaluations - annotations: - description: Prometheus {{$labels.instance}} has missed {{ printf "%.0f" $value - }} rule group evaluations in the last 5m. - summary: Prometheus is missing rule evaluations due to slow rule group evaluation. - expr: | - increase(prometheus_rule_group_iterations_missed_total{job="prometheus"}[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: PrometheusTargetLimitHit - annotations: - description: Prometheus {{$labels.instance}} has dropped {{ printf "%.0f" $value - }} targets because the number of targets exceeded the configured target_limit. - summary: Prometheus has dropped targets because some scrape configs have exceeded - the targets limit. 
- expr: | - increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus"}[5m]) > 0 - for: 15m - labels: - severity: warning - - alert: PrometheusErrorSendingAlertsToAnyAlertmanager - annotations: - description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts - from Prometheus {{$labels.instance}} to any Alertmanager.' - summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. - expr: | - min without (alertmanager) ( - rate(prometheus_notifications_errors_total{job="prometheus",alertmanager!~``}[5m]) - / - rate(prometheus_notifications_sent_total{job="prometheus",alertmanager!~``}[5m]) - ) - * 100 - > 3 - for: 15m - labels: - severity: critical - promtail-alerts.yaml: | - groups: - - name: promtail_alerts - rules: - - alert: PromtailRequestsErrors - annotations: - message: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - expr: | - 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) - / - sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) - > 10 - for: 15m - labels: - severity: critical - - alert: PromtailRequestLatency - annotations: - message: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - expr: | - job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1 - for: 15m - labels: - severity: critical - - alert: PromtailFileLagging - annotations: - message: | - {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m. - expr: | - abs(promtail_file_bytes_total - promtail_read_bytes_total) > 1e6 - for: 15m - labels: - severity: warning - - alert: PromtailFileMissing - annotations: - message: | - {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed. 
-      expr: |
-        promtail_file_bytes_total unless promtail_read_bytes_total
-      for: 15m
-      labels:
-        severity: critical
diff --git a/test/behave/features/environment.py b/test/behave/features/environment.py
index c07e349a24b5f16b276943f3ed2f7f62f178e754..8f696569f1c5e9f08a57db14349881c608df288f 100644
--- a/test/behave/features/environment.py
+++ b/test/behave/features/environment.py
@@ -5,7 +5,7 @@ def before_tag(context, tag):
     """Define steps run before each tag."""
     userdata = context.config.userdata

-    if tag == 'grafana':
+    if tag == 'prometheus-stack':
         context.grafana = get_values(userdata, 'grafana')

     if tag == 'nextcloud':
diff --git a/test/behave/features/grafana.feature b/test/behave/features/prometheus-stack.feature
similarity index 82%
rename from test/behave/features/grafana.feature
rename to test/behave/features/prometheus-stack.feature
index e4d961b60ea089cc0f18ac77db0a98eebd0afa50..0be22f9ef1582192b2fc071b0c46c86b57782bcf 100644
--- a/test/behave/features/grafana.feature
+++ b/test/behave/features/prometheus-stack.feature
@@ -1,4 +1,4 @@
-@grafana
+@prometheus-stack
 Feature: Test grafana admin login
   As an OAS admin
   I want to be able to login to grafana as the user admin
@@ -20,7 +20,7 @@ Scenario: Login to grafana
   Then I wait on element "sidemenu.sidemenu" for 60000ms to be visible
   And I expect that the path is "/"

-Scenario: As an admin I want to look at the eventrouter logs
-  When I open the grafana explore eventrouter URL
-  Then I wait on element ".logs-panel" for 25000ms to be visible
+Scenario: As an admin I want to look at the helm-operator logs
+  When I open the grafana explore helm-operator URL
+  Then I wait on element ".graph-panel" for 25000ms to be visible
   And I expect that element ".datapoints-warning" does not exist
diff --git a/test/behave/features/steps/steps.py b/test/behave/features/steps/steps.py
index 6d82a58205b38ac1870dffd8047c2c9df4abac1b..b869f588296147571d3aaf2eeb7e48faea4d94f8 100644
--- a/test/behave/features/steps/steps.py
+++ b/test/behave/features/steps/steps.py
@@ -15,13 +15,13 @@ def step_impl(context):
     """Open wordpress URL."""
     context.behave_driver.get(context.wordpress['url'])

-@when(u'I open the grafana explore eventrouter URL')
-@given(u'I open the grafana explore eventrouter URL')
+@when(u'I open the grafana explore helm-operator URL')
+@given(u'I open the grafana explore helm-operator URL')
 def step_impl(context):
     """Open wordpress URL."""
-    eventrouter_url = str(context.grafana["url"]) + '/explore?orgId=1&left=["now-1h","now","Loki",{"expr":"{app=\\\"eventrouter\\\"}"}]'
-    print(eventrouter_url)
-    context.behave_driver.get(eventrouter_url)
+    helm_operator_url = str(context.grafana["url"]) + '/explore?orgId=1&left=["now-1h","now","Loki",{"expr":"{app=\\\"helm-operator\\\"}"}]'
+    print(helm_operator_url)
+    context.behave_driver.get(helm_operator_url)

 @when(u'I wait on element "{element}" to be clickable')
 @given(u'I wait on element "{element}" to be clickable')
diff --git a/test/pytest/test_app_deployments.py b/test/pytest/test_app_deployments.py
index a6e99c5a9b38559a1fad601e9f1efdc8de13f873..01fadbdc4650c7294134f1b7975fb9285902d18d 100644
--- a/test/pytest/test_app_deployments.py
+++ b/test/pytest/test_app_deployments.py
@@ -16,8 +16,7 @@ EXPECTED_RELEASES = {
     'kube-system': ['local-path-provisioner'],
     'oas': [
         'ingress',
-        'prometheus',
-        'grafana',
+        'prometheus-stack',
         'loki-stack',
         'eventrouter',
         'single-sign-on'
@@ -32,15 +31,12 @@ EXPECTED_APP_LABELS = {
     'loki-stack': {
         'namespace': 'oas',
         'label_selector': 'app=loki'},
-    'grafana': {
-        'namespace': 'oas',
-        'label_selector': 'app=grafana'},
     'nextcloud': {
         'namespace': 'oas-apps',
         'label_selector': 'app.kubernetes.io/instance=nc'},
-    'prometheus': {
+    'prometheus-stack': {
         'namespace': 'oas',
-        'label_selector': 'app=prometheus'},
+        'label_selector': 'app in (grafana,prometheus)'},
     'rocketchat': {
         'namespace': 'oas-apps',
         'label_selector': 'app.kubernetes.io/instance=rocketchat'},
@@ -114,7 +110,6 @@ def test_helmreleases(app):
     Checks if all desired HelmReleases installed by weave flux are in
     'deployed' state.
     """
-
     if app != 'all':
         apps = [item for sublist in EXPECTED_RELEASES.values()
                 for item in sublist]
diff --git a/test/pytest/test_certs.py b/test/pytest/test_certs.py
index c3fde585dcdd61139ed96fb8a9508c6727751bdd..bd4d48c142f6153a91717f29f1bcea4d2cfe0388 100755
--- a/test/pytest/test_certs.py
+++ b/test/pytest/test_certs.py
@@ -105,6 +105,8 @@ def test_cert_validation(host, app):  # pylint: disable=too-many-statements

     if app == 'all':
         apps = list(app_subdomains.keys())
+    elif app == 'prometheus-stack':
+        apps = ['grafana', 'prometheus']
     else:
         assert app in app_subdomains, "Error: Unknown app: {}".format(app)
         apps = [app]
diff --git a/test/pytest/test_prometheus.py b/test/pytest/test_prometheus.py
index 9ddcdb9a3d29f9dd2a334e54f54f308f00085435..a89ce5b9bf922bdd6c420e0d36c1c7c32b8d6c4d 100755
--- a/test/pytest/test_prometheus.py
+++ b/test/pytest/test_prometheus.py
@@ -64,7 +64,7 @@ def test_prometheus_alerts(host):

     print("Starting prometheus test...")

-    url = 'http://prometheus-server.oas.svc.cluster.local/api/v1/alerts'
+    url = 'http://prometheus-stack-kube-prom-prometheus.oas.svc.cluster.local:9090/api/v1/alerts'
     alert_json = json.loads(host.check_output('curl ' + url))

     status = alert_json["status"]
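
Note on the test_prometheus.py change above: the alerts endpoint now points at the kube-prometheus-stack service (prometheus-stack-kube-prom-prometheus on port 9090) instead of the old prometheus-server service. A minimal standalone sketch of the same check, assuming the requests library is available on the test runner (the repository itself shells out to curl via testinfra) and treating the Watchdog exclusion and the helper name as illustrative only:

# Sketch: query the kube-prometheus-stack alerts API and list firing alerts.
# The service DNS name and port come from the diff above; the use of requests
# and the Watchdog exclusion are assumptions for illustration.
import requests

PROMETHEUS_ALERTS_URL = (
    'http://prometheus-stack-kube-prom-prometheus.oas.svc.cluster.local:9090'
    '/api/v1/alerts'
)


def firing_alerts():
    """Return the names of currently firing alerts, ignoring the Watchdog."""
    response = requests.get(PROMETHEUS_ALERTS_URL, timeout=10)
    response.raise_for_status()
    alerts = response.json()['data']['alerts']
    return [
        alert['labels']['alertname']
        for alert in alerts
        if alert['state'] == 'firing'
        and alert['labels']['alertname'] != 'Watchdog'
    ]


if __name__ == '__main__':
    names = firing_alerts()
    assert not names, 'Unexpected firing alerts: {}'.format(names)

This only works from somewhere that can resolve the in-cluster service name, for example a pod in the cluster or over the same ansible connection the pytest suite uses.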
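
Similarly, test_app_deployments.py now uses the set-based label selector app in (grafana,prometheus), since the single prometheus-stack HelmRelease deploys both Prometheus and Grafana pods into the oas namespace. A rough sketch of that selector with the official kubernetes Python client, assuming the client package is installed and a kubeconfig (or in-cluster config) is reachable; the helper name is illustrative:

# Sketch: list the pods that belong to the combined prometheus-stack release,
# using the same set-based label selector as the updated pytest expectations.
# Assumes the kubernetes client package and a reachable kubeconfig.
from kubernetes import client, config


def prometheus_stack_pods(namespace='oas'):
    """Return (name, phase) for every pod matching the prometheus-stack selector."""
    config.load_kube_config()  # use config.load_incluster_config() inside a pod
    core = client.CoreV1Api()
    pods = core.list_namespaced_pod(
        namespace,
        label_selector='app in (grafana,prometheus)',
    )
    return [(pod.metadata.name, pod.status.phase) for pod in pods.items]


if __name__ == '__main__':
    for name, phase in prometheus_stack_pods():
        print(name, phase)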