diff --git a/flux2/apps/monitoring/kube-prometheus-stack-values-configmap.yaml b/flux2/apps/monitoring/kube-prometheus-stack-values-configmap.yaml index 026880cf4faa181d23ab195301cd898dda648de0..1f4729181818e341650ab0ce637128fc6a70a263 100644 --- a/flux2/apps/monitoring/kube-prometheus-stack-values-configmap.yaml +++ b/flux2/apps/monitoring/kube-prometheus-stack-values-configmap.yaml @@ -102,9 +102,6 @@ data: scrapeInterval: "3m" evaluationInterval: "3m" retention: "30d" - - # replicas: 2 - # podAntiAffinity: "hard" storageSpec: volumeClaimTemplate: spec: @@ -112,7 +109,6 @@ data: resources: requests: storage: 10Gi - resources: limits: cpu: 1 @@ -120,6 +116,13 @@ data: requests: cpu: 300m memory: 1Gi + # Discover ALL custom serviceMonitors, podMonitors and prometheusrules, + # not only those with particular release labels set by + # kube-prometheus-stack + # https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#prometheusioscrape + serviceMonitorSelectorNilUsesHelmValues: false + ruleSelectorNilUsesHelmValues: false + podMonitorSelectorNilUsesHelmValues: false ingress: enabled: true @@ -185,20 +188,34 @@ data: # enabled: true # lablel: grafana_dashboard - # dashboardProviders: - # dashboardproviders.yaml: - # apiVersion: 1 - # providers: - # - name: 'default' - # orgId: 1 - # folder: '' - # type: file - # disableDeletion: false - # editable: true - # options: - # path: /var/lib/grafana/dashboards - # dashboards: - # default: + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + # Needed for dashboards configured by the `dashboards` key below + - name: 'default' + folder: '' + options: + path: /var/lib/grafana/dashboards/default + # Default kube-prometheus-stack sidecarProvider + - name: 'sidecarProvider' + orgId: 1 + folder: '' + type: file + disableDeletion: false + allowUiUpdates: false + updateIntervalSeconds: 30 + options: + foldersFromFilesStructure: false + path: /tmp/dashboards + + # 
https://github.com/grafana/helm-charts/blob/main/charts/grafana/README.md#import-dashboards + dashboards: + default: + flux-control-plane: + url: https://raw.githubusercontent.com/fluxcd/flux2/main/manifests/monitoring/grafana/dashboards/control-plane.json + flux-cluster: + url: https://raw.githubusercontent.com/fluxcd/flux2/main/manifests/monitoring/grafana/dashboards/cluster.json # kube-dash: # gnetId: 11074 # revision: 2 diff --git a/flux2/cluster/optional/monitoring/monitoring-config.yaml b/flux2/cluster/optional/monitoring/monitoring-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..483b7a2975e9577218f8cf9d973a4ed6da6d1d6f --- /dev/null +++ b/flux2/cluster/optional/monitoring/monitoring-config.yaml @@ -0,0 +1,16 @@ +--- +apiVersion: kustomize.toolkit.fluxcd.io/v1beta1 +kind: Kustomization +metadata: + name: monitoring-config + namespace: flux-system +spec: + interval: 1h + dependsOn: + - name: monitoring + sourceRef: + kind: GitRepository + name: openappstack + path: ./flux2/config/monitoring + prune: true + validation: client diff --git a/flux2/config/README.md b/flux2/config/README.md new file mode 100644 index 0000000000000000000000000000000000000000..071c0ecbdf9d16791140420f7f5f9a0e1e918287 --- /dev/null +++ b/flux2/config/README.md @@ -0,0 +1,5 @@ +# Additional Stackspin configuration + +This directory holds resources that depend on custom CRDs installed by other +apps, e.g. the `podMonitor` CRD installed by `kube-prometheus-stack`. +They can't be installed by the same `kustomization` which installs the CRDs. 
diff --git a/flux2/config/monitoring/flux-alerts-prometheusrule.yaml b/flux2/config/monitoring/flux-alerts-prometheusrule.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fb16df35f72f4cdb5cf9faf00bb25fdd46dd7893 --- /dev/null +++ b/flux2/config/monitoring/flux-alerts-prometheusrule.yaml @@ -0,0 +1,20 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: flux-prometheus.rules + namespace: oas + labels: + app.kubernetes.io/managed-by: flux + app.kubernetes.io/part-of: custom-flux-config +spec: + # https://fluxcd.io/docs/guides/monitoring/#metrics + groups: + - name: flux + rules: + - alert: ReconciliationFailure + expr: max(gotk_reconcile_condition{status="False",type="Ready"}) by (namespace, name, kind) + on(namespace, name, kind) (max(gotk_reconcile_condition{status="Deleted"}) by (namespace, name, kind)) * 2 == 1 + for: 10m + labels: + severity: page + annotations: + summary: '{{ $labels.kind }} {{ $labels.namespace }}/{{ $labels.name }} reconciliation has been failing for more than ten minutes.' 
diff --git a/flux2/config/monitoring/kube-prometheus-stack-flux-podmonitor.yaml b/flux2/config/monitoring/kube-prometheus-stack-flux-podmonitor.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c20248c90e47f43886aa9d271b5e79d6836acee8 --- /dev/null +++ b/flux2/config/monitoring/kube-prometheus-stack-flux-podmonitor.yaml @@ -0,0 +1,22 @@ +# https://fluxcd.io/docs/guides/monitoring/ +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: flux-system + namespace: flux-system + labels: + app.kubernetes.io/part-of: flux +spec: + namespaceSelector: + matchNames: + - flux-system + selector: + matchExpressions: + - key: app + operator: In + values: + - helm-controller + - source-controller + - kustomize-controller + podMetricsEndpoints: + - targetPort: http-prom diff --git a/flux2/config/monitoring/stackspin-alerts-prometheusrule.yaml b/flux2/config/monitoring/stackspin-alerts-prometheusrule.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e476ff60ad2b2084e360c5e37cd2df31b1f0411 --- /dev/null +++ b/flux2/config/monitoring/stackspin-alerts-prometheusrule.yaml @@ -0,0 +1,55 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: prometheus-custom-alerts-openappstack.rules + namespace: oas + labels: + app.kubernetes.io/managed-by: flux + app.kubernetes.io/part-of: custom-flux-config +spec: + groups: + - name: oas-general + rules: + - alert: service_down + expr: up == 0 or probe_success == 0 + for: 5m + labels: + severity: page + annotations: + summary: "Instance {{ $labels.instance }} down" + description: "{{ $labels.instance }} of job {{ $labels.job }} has been down\ + \ for more than 5 minutes." 
+ + - name: oas-systemd + rules: + - alert: failed_systemd_units + expr: node_systemd_unit_state{state="failed"} !=0 + for: 5m + labels: + severity: page + annotations: + summary: "Systemd unit failed on {{ $labels.instance }}" + description: "Warning: Systemd unit failed on {{ $labels.instance }} (job {{\ \ $labels.job }}) for more than 5 min." + + - name: oas-maintenance + rules: + - alert: apt_upgrades_pending + expr: apt_upgrades_pending !=0 + for: 2d + labels: + severity: warning + annotations: + summary: "Apt upgrades available on {{ $labels.instance }}" + description: "Warning: Apt upgrades available on {{ $labels.instance }} (job\ \ {{ $labels.job }}) for more than 2 days." + + - alert: node_reboot_required + expr: node_reboot_required !=0 + for: 2d + labels: + severity: warning + annotations: + summary: "Reboot required on {{ $labels.instance }}" + description: "Warning: Reboot required on {{ $labels.instance }} (job {{ $labels.job\ \ }}) for more than 2 days."