Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • varac/stackspin
  • xeruf/stackspin
  • stackspin/stackspin
3 results
Show changes
Commits on Source (3685)
Showing
with 1758 additions and 953 deletions
......@@ -12,9 +12,12 @@
/.direnv
.envrc
# Ignore files created during CI
# Ignore files created during CI or development
/test/group_vars/all/
/test/inventory*
/test/cypress/videos/
/test/cypress/screenshots/
/test/cypress/logs/
/clusters
/install/installation-kustomization/*.txt
......@@ -31,3 +34,6 @@ Taskfile.yaml
# Documentation files
/docs/_build
# node modules for cypress
node_modules
This diff is collapsed.
---
# Child pipeline that checks a single app's readiness and runs its
# integration tests. Indentation reconstructed: the scraped source had all
# YAML nesting flattened to column 0.
workflow:
  rules:
    # Only run our pipelines for merge requests.
    - if: $CI_MERGE_REQUEST_IID
    # Or when the pipeline was scheduled (nightly
    # run on main in particular).
    - if: '$CI_PIPELINE_SOURCE == "schedule"'

include:
  - local: .gitlab/ci_templates/stackspin_common.yml

stages:
  - get-artifacts
  - base-ready
  - integration-test

# Stage: optional-apps-ready
# ================
#
# Check that the kustomizations of all installed apps are ready.

# Downloads the artifacts from the parent pipeline, used by all the other jobs
# in this pipeline. This needs to happen in a separate job, because all the
# other jobs are optional.
get-artifacts:
  stage: get-artifacts
  # This job needs a script, otherwise GitLab CI lint will complain.
  script:
    - echo "This job only exists to get artifacts from the parent pipeline"
  extends:
    - .child_require_artifacts
    - .report_artifacts

kustomization-ready:
  extends:
    - .kustomization_ready
  rules:
    - if: '$SKIP_KUSTOMIZATION_READY == "true"'
      when: never
    - when: always

cert:
  stage: base-ready
  script:
    - cd test/
    - pytest -v -s -m 'certs' --resource="$RESOURCE" --desired-revision "${STACKSPIN_BRANCH}@sha1:${STACKSPIN_COMMIT_SHA}" --reruns 120 --reruns-delay 10
  interruptible: true
  rules:
    - if: '$SKIP_CERT == "true"'
      when: never
    - when: always

cypress:
  stage: integration-test
  image:
    # https://hub.docker.com/r/cypress/included/tags
    name: cypress/included:14.1.0
    entrypoint: [""]
  variables:
    CYPRESS_USE_SSO_LOGIN: "true"
    CYPRESS_SSO_USER: "admin@ci.stackspin.net"
    CYPRESS_SSO_PW: "$SSO_PASSWORD"
    CYPRESS_SSO_LOGOUT: "https://sso.$FQDN/oauth2/sessions/logout"
    GIT_STRATEGY: "${INSTALL_GIT_STRATEGY}"
  script:
    - !reference [.checkout_branch, script]
    - cd test
    - npm install
    - cypress run --record --key "$CYPRESS_RECORD_KEY" --spec "cypress/e2e/$RESOURCE.cy.js"
  interruptible: true
  rules:
    - if: '$CYPRESS_TEST == "true"'
      when: always
    - when: never

helm-test:
  stage: integration-test
  image:
    name: alpine/helm:3.17.1
    entrypoint: ["/bin/sh", "-c"]
  script:
    - export KUBECONFIG="${PWD}/clusters/${HOSTNAME}/kube_config_cluster.yml"
    - time helm test -n $HELM_RELEASE_NAMESPACE --logs --timeout 30m $HELM_RELEASE_NAME
  interruptible: true
  rules:
    - if: '$HELM_TEST == "true"'
      when: always
    - when: never
---
# Main ("default") pipeline: builds the CI image, creates a VPS, installs
# Stackspin and fans out per-app readiness/test child pipelines.
# Indentation reconstructed: the scraped source had all YAML nesting
# flattened to column 0.
workflow:
  rules:
    # Only run our pipelines for merge requests.
    - if: $CI_MERGE_REQUEST_IID
    # Or when the pipeline was scheduled (nightly
    # run on main in particular).
    - if: '$CI_PIPELINE_SOURCE == "schedule"'

include:
  - /.gitlab/ci_templates/kaniko.yml
  - /.gitlab/ci_templates/stackspin_common.yml

stages:
  - build
  - install-cluster
  - install-stackspin
  - apps-ready
  - cluster-health

variables:
  # Useful for debugging
  # !!! CAREFUL - this will expose private keys in the CI logs !!!
  # !!! Make sure to delete the resulting job output from the  !!!
  # !!! job's output site (Trash can symbol at the top)        !!!
  # CI_DEBUG_TRACE: "true"
  # This decides which branch gets used by the create-vps and install-stackspin
  # jobs. It also indirectly influences the commit sha that's checked for in the
  # kustomization ready jobs
  INSTALL_STACKSPIN_BRANCH: ${CI_COMMIT_REF_NAME}
  # This pipeline uses the "fetch" (default) git strategy, but the
  # "upgrade_test" pipeline needs the slower "clone" git strategy to be able to
  # install an older version of Stackspin
  INSTALL_GIT_STRATEGY: "fetch"
  HOSTNAME: ${CI_COMMIT_REF_SLUG}

# Stage: build
# ============
#
# Builds CI test container image
# There are 2 moments in which we (re)build the container image. If some files are
# changed, or when the job is triggered with TRIGGER_JOBS.
ci-test-image-build:
  stage: build
  after_script:
    - |
      echo "CI_CONTAINER_TAG=${CI_COMMIT_REF_SLUG}" | tee .ci.env
  artifacts:
    paths:
      - .ci.env
    expire_in: 1 month
    when: always
    reports:
      dotenv: .ci.env
  environment:
    name: image/${CI_COMMIT_REF_SLUG}
    url: https://open.greenhost.net:4567/stackspin/stackspin/stackspin-ci:${CI_COMMIT_REF_SLUG}
    auto_stop_in: 3 weeks
  rules:
    # Automatically rebuild the container image if this file, the Dockerfile,
    # the installed requirements or the kaniko template change
    - changes:
        - Dockerfile
        - requirements.txt
        - .gitlab/ci_templates/kaniko.yml
    # Also rebuild when the CI variable contain this jobs name
    # or commit msg contains /TRIGGER_JOBS=.*ci-test-image-build/
    - if: '$TRIGGER_JOBS =~ /ci-test-image-build/'
    - if: '$CI_COMMIT_MESSAGE =~ /TRIGGER_JOBS=.*ci-test-image-build/'
  extends:
    - .kaniko_build
  interruptible: true

report-ci-image-tag:
  stage: build
  image: "curlimages/curl"
  script:
    - |
      TAG_INFORMATION=$(curl -sS https://open.greenhost.net/api/v4/projects/stackspin%2Fstackspin/registry/repositories/73/tags/${CI_COMMIT_REF_SLUG});
      echo "Tag information: ${TAG_INFORMATION}"
      if [ "$TAG_INFORMATION" == '{"message":"404 Tag Not Found"}' ]; then
        CI_CONTAINER_TAG="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME"
      else
        CI_CONTAINER_TAG="${CI_COMMIT_REF_SLUG}"
      fi
      echo "CI_CONTAINER_TAG=${CI_CONTAINER_TAG}" | tee .ci.env
  artifacts:
    paths:
      - .ci.env
    expire_in: 1 month
    when: always
    reports:
      dotenv: .ci.env
  rules:
    # Make sure this job does not run if ci-test-image-build runs
    - changes:
        - Dockerfile
        - requirements.txt
        - .gitlab/ci_templates/kaniko.yml
      when: never  # Never run on file changes that trigger ci-test-image-build
    - if: '$TRIGGER_JOBS =~ /ci-test-image-build/'
      when: never  # Never run when ci-test-image is triggered manually
    - if: '$CI_COMMIT_MESSAGE =~ /TRIGGER_JOBS=.*ci-test-image-build/'
      when: never  # Never run when ci-test-image is triggered manually
    - when: always
  interruptible: true

# Stage: install-cluster
# ======================
#
# * Creates the vps for the pipeline
# * Installs k8s with ansible
create-vps:
  stage: install-cluster
  variables:
    SUBDOMAIN: "${HOSTNAME}.ci"
    DOMAIN: "stackspin.net"
    GIT_STRATEGY: "${INSTALL_GIT_STRATEGY}"
    API_URL: "https://open.greenhost.net/api/v4"
  script:
    # Find out which apps will get installed so the create_vps.sh script
    # can create a droplet with proper memory size.
    # Then create a VPS based on a custom CI image for which the ansible playbook
    # has already run.
    - |
      curl --silent --show-error --header "PRIVATE-TOKEN: $CREATE_VPS_TOKEN" "${API_URL}/projects/6/pipelines/${CI_PIPELINE_ID}/bridges" -o bridges.json
      export ENABLED_APPS=$(jq -r '.[] | select(.name | contains("app-ready")) | .name' bridges.json | sed 's/-app-ready//' | tr '\n' ',' | sed 's/,$//')
      # See CONTRIBUTING.md#ci-pipeline-image for more info
      bash .gitlab/ci_scripts/create_vps.sh
  extends:
    - .ssh_setup
    - .report_artifacts
    - .general_rules
  environment:
    name: $CI_COMMIT_REF_SLUG
    url: https://$FQDN
    on_stop: terminate-droplet
    auto_stop_in: 1 week
  retry: 1
  interruptible: true

test-dns:
  stage: install-cluster
  needs:
    - job: create-vps
  # Needs a pytest ansible connection to get the configured system resolvers
  script:
    - cd ansible/
    - pytest -v -s -m 'dns' --connection=ansible --ansible-inventory=../${CLUSTER_DIR}/inventory.yml --hosts='ansible://*'
  extends:
    - .general_rules
  interruptible: true

# Terminates a droplet and deletes the branch container image once the MR for it is merged
terminate-droplet:
  # Stage has to be the same as the step that created the VPS
  # https://docs.gitlab.com/ee/ci/environments.html#automatically-stopping-an-environment
  stage: install-cluster
  # Gets triggered by on_stop of create-vps job
  when: manual
  variables:
    GIT_STRATEGY: none
  before_script:
    - echo "Default before script is overwritten because this job has no access to the Git repo"
  script:
    # Delete droplet
    - python3 -c "import greenhost_cloud; greenhost_cloud.terminate_droplets_by_name(\"^${HOSTNAME}\")"
    # Delete container image if one was created. Do not delete `main` nor `v*`.
    - >
      if [[ "$CI_CONTAINER_TAG" != "main" && ! "$CI_CONTAINER_TAG" =~ v\d+ ]]; then
        curl --request DELETE --header "PRIVATE-TOKEN: ${CLEANER_TOKEN}" https://open.greenhost.net/api/v4/projects/stackspin%2Fstackspin/registry/repositories/73/tags/${CI_CONTAINER_TAG};
      fi
  # We have problems with cosmos not allowing deletion even though the droplet
  # was just stopped.
  retry: 2
  environment:
    name: $CI_COMMIT_REF_SLUG
    action: stop

install-stackspin:
  stage: install-stackspin
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: create-vps
    STACKSPIN_BRANCH: $INSTALL_STACKSPIN_BRANCH
  trigger:
    include:
      - local: .gitlab/ci_pipelines/install_stackspin.yml
    strategy: depend
  extends:
    - .general_rules

# Stage: apps
# ================
#
# Check that the kustomizations of all installed apps are ready.
hedgedoc-app-ready:
  stage: apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: create-vps
    RESOURCE: "hedgedoc"
    STACKSPIN_BRANCH: $INSTALL_STACKSPIN_BRANCH
    CYPRESS_TEST: "true"
    CYPRESS_BASE_URL: "https://hedgedoc.$FQDN"
  extends:
    - .hedgedoc_rules
    - .trigger_apps_ready_pipeline

monitoring-app-ready:
  stage: apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: create-vps
    RESOURCE: "monitoring"
    STACKSPIN_BRANCH: $INSTALL_STACKSPIN_BRANCH
    CYPRESS_TEST: "true"
    CYPRESS_BASE_URL: "https://grafana.$FQDN"
  extends:
    - .monitoring_rules
    - .trigger_apps_ready_pipeline

nextcloud-app-ready:
  stage: apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: create-vps
    RESOURCE: "nextcloud"
    STACKSPIN_BRANCH: $INSTALL_STACKSPIN_BRANCH
    # Use `helm test` to run Cypress tests instead of directly in CI image
    HELM_TEST: "true"
    HELM_RELEASE_NAME: "nc"
    HELM_RELEASE_NAMESPACE: "stackspin-apps"
  extends:
    - .nextcloud_rules
    - .trigger_apps_ready_pipeline

velero-app-ready:
  stage: apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: create-vps
    RESOURCE: "velero"
    STACKSPIN_BRANCH: $INSTALL_STACKSPIN_BRANCH
    # Do not run `cert` and cypress tests, because we don't have them for velero
    SKIP_CERT: "true"
  extends:
    - .velero_rules
    - .trigger_apps_ready_pipeline

wekan-app-ready:
  stage: apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: create-vps
    RESOURCE: "wekan"
    STACKSPIN_BRANCH: $INSTALL_STACKSPIN_BRANCH
    CYPRESS_TEST: "true"
    CYPRESS_BASE_URL: "https://wekan.$FQDN"
  extends:
    - .wekan_rules
    - .trigger_apps_ready_pipeline

wordpress-app-ready:
  stage: apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: create-vps
    RESOURCE: "wordpress"
    STACKSPIN_BRANCH: $INSTALL_STACKSPIN_BRANCH
    CYPRESS_TEST: "true"
    CYPRESS_BASE_URL: "https://www.$FQDN"
  extends:
    - .wordpress_rules
    - .trigger_apps_ready_pipeline

zulip-app-ready:
  stage: apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: create-vps
    RESOURCE: "zulip"
    STACKSPIN_BRANCH: $INSTALL_STACKSPIN_BRANCH
    CYPRESS_TEST: "true"
    CYPRESS_BASE_URL: "https://zulip.$FQDN"
  extends:
    - .zulip_rules
    - .trigger_apps_ready_pipeline

single-sign-on-app-ready:
  stage: apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: create-vps
    RESOURCE: "single-sign-on"
    STACKSPIN_BRANCH: $INSTALL_STACKSPIN_BRANCH
    # kustomization-ready has already run as part of the base-ready in
    # the install-stackspin pipeline
    SKIP_KUSTOMIZATION_READY: "true"
  extends:
    - .general_rules
    - .trigger_apps_ready_pipeline

dashboard-app-ready:
  stage: apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: create-vps
    RESOURCE: "dashboard"
    STACKSPIN_BRANCH: $INSTALL_STACKSPIN_BRANCH
    # kustomization-ready has already run as part of the base-ready in
    # the install-stackspin pipeline
    SKIP_KUSTOMIZATION_READY: "true"
    HELM_TEST: "true"
    HELM_RELEASE_NAME: "dashboard"
    HELM_RELEASE_NAMESPACE: "stackspin"
  extends:
    - .general_rules
    - .trigger_apps_ready_pipeline

# Stage: cluster-health
# =====================
#
# General cluster health checks
testinfra:
  stage: cluster-health
  extends:
    - .testinfra

prometheus-alerts:
  stage: cluster-health
  extends:
    - .prometheus_alerts
---
# Child pipeline: installs flux + Stackspin on the freshly created cluster,
# enables the optional apps, then applies CI-specific configuration.
# Indentation reconstructed: the scraped source had all YAML nesting (and
# heredoc content) flattened to column 0.
workflow:
  rules:
    # Only run our pipelines for merge requests.
    - if: $CI_MERGE_REQUEST_IID
    # Or when the pipeline was scheduled (nightly
    # run on main in particular).
    - if: '$CI_PIPELINE_SOURCE == "schedule"'

include:
  - local: .gitlab/ci_templates/stackspin_common.yml
  - local: .gitlab/ci_templates/base_ready.yml

stages:
  - install-stackspin
  - base-ready
  - enable-apps
  - configure-stackspin

# Stage: install-stackspin
# ========================
#
# Installs flux and stackspin with it
install-stackspin:
  stage: install-stackspin
  variables:
    GIT_STRATEGY: "${INSTALL_GIT_STRATEGY}"
    SSO_PASSWORD: "$SSO_PASSWORD"
  script:
    # For upgrade_test pipelines, we install an older version (latest stable)
    - !reference [.checkout_branch, script]
    # Customize env file, remove all comments and empty lines
    - cp install/.flux.env.example ${CLUSTER_DIR}/.flux.env
    - sed -i "s/1.2.3.4/$IP_ADDRESS/" ${CLUSTER_DIR}/.flux.env
    # Set admin user email address
    - sed -i "s/admin_email=admin@example.org/admin_email=admin@ci.stackspin.net/" ${CLUSTER_DIR}/.flux.env
    # Delete comments
    - sed -i "/^\s*#.*$/d; /^\s*$/d" ${CLUSTER_DIR}/.flux.env
    # Disable outgoing mail
    - sed -i "s/outgoing_mail_enabled=true/outgoing_mail_enabled=false/" ${CLUSTER_DIR}/.flux.env
    # Some apps (looking at you Zulip) don't have a way to disable email, so we
    # set the mail domain like this so we don't contact the Greenhost server
    # anyway (with known wrong credentials).
    - sed -i "s/outgoing_mail_smtp_host=.*/outgoing_mail_smtp_host=localhost/" ${CLUSTER_DIR}/.flux.env
    - sed -i "s/example.org/$FQDN/" ${CLUSTER_DIR}/.flux.env
    # Deploy secret/stackspin-cluster-variables
    - cp install/kustomization.yaml ${CLUSTER_DIR}
    - |
      kubectl create namespace flux-system -o yaml --dry-run=client --save-config | kubectl apply -f -
      kubectl create namespace cert-manager -o yaml --dry-run=client --save-config | kubectl apply -f -
      kubectl create namespace stackspin -o yaml --dry-run=client --save-config | kubectl apply -f -
    - kubectl apply -k ${CLUSTER_DIR}
    # Add an override so cert-manager uses the SSL.com ClusterIssuer
    - kubectl apply -f ./install/overrides/ci/stackspin-cert-manager-override.yaml
    # TODO: this is only necessary for the upgrade pipeline between 2.6 and
    # 2.7, and can be removed after 2.7 is released.
    - |
      if [ ! -e ./install/overrides/ci/stackspin-dashboard-override.yaml ]; then cat > ./install/overrides/ci/stackspin-dashboard-override.yaml <<EOF
      ---
      apiVersion: v1
      kind: ConfigMap
      metadata:
        namespace: stackspin
        name: stackspin-dashboard-override
      data:
        values.yaml: |
      EOF
      fi
    # Add an override to populate the dashboard helm test credentials.
    # NOTE(review): the heredoc body must stay indented under the ConfigMap's
    # `values.yaml: |` key — confirm the 4-space offset against the repo copy.
    - |
      cat >> ./install/overrides/ci/stackspin-dashboard-override.yaml <<EOF
          tests:
            credentials:
              user: 'admin@ci.stackspin.net'
              password: '$SSO_PASSWORD'
      EOF
    - cat ./install/overrides/ci/stackspin-dashboard-override.yaml
    - kubectl apply -f ./install/overrides/ci/stackspin-dashboard-override.yaml
    # Install flux and general, non-app specific secrets
    - ./install/install-stackspin.sh
  extends:
    - .general_rules
    - .child_require_artifacts
    - .report_artifacts
  interruptible: true

.enable_app_template:
  stage: enable-apps
  needs:
    - install-stackspin
    - dashboard-kustomization-ready
  # Make sure only one enable-app job runs against a given cluster at the same
  # time.
  # TODO: this actually doesn't work, not sure why
  resource_group: $CI_ENVIRONMENT_NAME
  variables:
    GIT_STRATEGY: "${INSTALL_GIT_STRATEGY}"
  before_script:
    # For upgrade_test pipelines, we install an older version (latest stable)
    - |
      if [[ "$CI_COMMIT_REF_NAME" != "$STACKSPIN_BRANCH" ]]; then
        # NOTE: this command will fail if GIT_STRATEGY is not set to "clone"
        git checkout "$STACKSPIN_BRANCH"
      fi
  script:
    # Add optional override values we need for the CI pipeline only
    - >
      [ -f ./install/overrides/ci/stackspin-${RESOURCE}-override.yaml ] &&
      kubectl apply -f ./install/overrides/ci/stackspin-${RESOURCE}-override.yaml
    - kubectl exec -n stackspin deploy/dashboard-backend -- flask cli app install ${RESOURCE}
  interruptible: true

enable-monitoring:
  variables:
    RESOURCE: "monitoring"
  extends:
    - .enable_app_template
    - .monitoring_rules

enable-nextcloud:
  variables:
    RESOURCE: "nextcloud"
  before_script:
    # First do the git checkout, otherwise that will refuse to override the
    # changes we make to the override file.
    - !reference [.enable_app_template, before_script]
    # Add Cypress record key so screenshots/videos of helm test are uploaded to
    # the Cypress dashboard
    - |
      cat >> ./install/overrides/ci/stackspin-nextcloud-override.yaml<< EOF
          tests:
            ssoLogin:
              enabled: true
              username: 'admin@ci.stackspin.net'
              password: '$SSO_PASSWORD'
            cypress:
              projectId: '$CYPRESS_PROJECT_ID'
              recordKey: '$CYPRESS_RECORD_KEY'
              commitInfo:
                branch: '$CI_COMMIT_REF_NAME'
                message: |
                  $CI_COMMIT_TITLE
                author: |
                  $CI_COMMIT_AUTHOR
                sha: '$CI_COMMIT_SHORT_SHA'
      EOF
  extends:
    - .enable_app_template
    - .nextcloud_rules

enable-velero:
  variables:
    RESOURCE: "velero"
  extends:
    - .enable_app_template
    - .velero_rules

enable-wekan:
  variables:
    RESOURCE: "wekan"
  extends:
    - .enable_app_template
    - .wekan_rules

enable-wordpress:
  variables:
    RESOURCE: "wordpress"
  extends:
    - .enable_app_template
    - .wordpress_rules

enable-zulip:
  variables:
    RESOURCE: "zulip"
  extends:
    - .enable_app_template
    - .zulip_rules

enable-hedgedoc:
  variables:
    RESOURCE: "hedgedoc"
  extends:
    - .enable_app_template
    - .hedgedoc_rules

# Stage: configure-stackspin
#
# Configure cluster after basic installation
# i.e. CI-related config like clusterIssuer
configure-sslcom-issuer:
  stage: configure-stackspin
  needs:
    - cert-manager-kustomization-ready
    - install-stackspin
  script:
    # Install custom ClusterIssuer for SSL.com production certificates
    - bash ./.gitlab/ci_scripts/install_sslcom_issuer.sh
  extends:
    - .general_rules
  interruptible: true

configure-zerossl-issuer:
  stage: configure-stackspin
  needs:
    - cert-manager-kustomization-ready
    - install-stackspin
  script:
    # Install custom ClusterIssuer for zerossl.com production certificates
    - bash ./.gitlab/ci_scripts/install_zerossl_issuer.sh
  extends:
    - .general_rules
  interruptible: true

# Set the password for the initial Stackspin admin user to a fixed value, for
# easy testing and debugging.
admin-user-password:
  stage: configure-stackspin
  needs:
    - install-stackspin
    - dashboard-kustomization-ready
  script:
    - kubectl exec -n stackspin deploy/dashboard-backend -- flask cli user setpassword admin@ci.stackspin.net $SSO_PASSWORD
---
# This pipeline first runs the whole `default.yml` pipeline, but instead of
# installing the newest version of Stackspin, it will install the latest stable
# (defined in the INSTALL_STACKSPIN_BRANCH variable).
#
# After the default pipeline is finished, the stackspin `GitRepository` object
# is patched to track the current branch and commit. Then, the apps and
# cluster-health jobs are run again, to make sure that the upgrade broke
# nothing.
#
# Indentation reconstructed: the scraped source had all YAML nesting
# flattened to column 0.
include:
  - /.gitlab/ci_pipelines/default.yml
  - local: .gitlab/ci_templates/base_ready.yml

stages:
  # First 5 stages are just the Default pipeline with different variables
  - build
  - install-cluster
  - install-stackspin
  - apps-ready
  - cluster-health
  # Upgrade the cluster to the current version
  - upgrade-stackspin
  # Run tests again
  - base-ready
  - upgraded-apps-ready
  - upgrade-stackspin-health

# Overrides stackspin branch used for create-vps and install-stackspin jobs
variables:
  INSTALL_STACKSPIN_BRANCH: v2
  INSTALL_GIT_STRATEGY: clone
  UPGRADE_STACKSPIN_BRANCH: $CI_COMMIT_REF_NAME
  UPGRADE_STACKSPIN_COMMIT_SHA: $CI_COMMIT_SHA
  # This variable is used by the `base_ready` template and the `app_tests` child
  # pipelines after the upgrade has been applied. It is overridden in all jobs
  # that rely on the INSTALL_STACKSPIN_BRANCH.
  STACKSPIN_BRANCH: $UPGRADE_STACKSPIN_BRANCH
  HOSTNAME: "upgrade-test"
  CLUSTER_DIR: "clusters/upgrade-test"
  # Increase disk size to 40G for upgrade test, see
  # https://open.greenhost.net/stackspin/stackspin/-/issues/1271
  # Outdated as we've grown the default disk size.
  # DISK_SIZE: "40"

# override environment name for upgrade-test, otherwise it will be `main`
create-vps:
  environment:
    name: upgrade-test

terminate-droplet:
  environment:
    name: upgrade-test

upgrade-stackspin:
  stage: upgrade-stackspin
  # Make sure this has the create-vps artifacts
  dependencies:
    - create-vps
  script:
    # Change the `stackspin` GitRepo to switch to the new branch.
    - kubectl patch -n flux-system gitrepository stackspin -p "{\"spec\":{\"ref\":{\"branch\":\"$UPGRADE_STACKSPIN_BRANCH\",\"commit\":\"$UPGRADE_STACKSPIN_COMMIT_SHA\"}}}" --type="merge"
    # Make sure flux acts on that change right away, instead of waiting for the
    # next nightly maintenance window to start upgrading.
    - flux resume source git stackspin
    # Set new STACKSPIN_COMMIT_SHA variable
    - sed -i "s/STACKSPIN_COMMIT_SHA=.*/STACKSPIN_COMMIT_SHA=${UPGRADE_STACKSPIN_COMMIT_SHA}/" "${CLUSTER_DIR}/.cluster.env"
  extends:
    - .general_rules
    - .report_artifacts
  interruptible: true

# These are imported, but we have to override `dependencies` for them in the
# upgrade scenario, so that gitlab knows which exported environment (dotenv) to
# use for these jobs. We have two jobs that export the `cluster.env` file:
# first the create-vps one that first creates the cluster, but then later the
# upgrade-stackspin one above which changes the `STACKSPIN_COMMIT_SHA`. We need
# the latter for these jobs, because they should check that the new versions
# (after the upgrade) of the kustomizations are ready.
.kustomization_ready:
  dependencies:
    - upgrade-stackspin

upgraded-monitoring-app-ready:
  stage: upgraded-apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: upgrade-stackspin
    RESOURCE: "monitoring"
    CYPRESS_TEST: "true"
    CYPRESS_BASE_URL: "https://grafana.$FQDN"
  extends:
    - .monitoring_rules
    - .trigger_apps_ready_pipeline

# Override the script in the report-ci-image-tag job from
# .gitlab/ci_pipelines/default.yml to always report current stable (`v2`) as
# the image tag.
report-ci-image-tag:
  script:
    - echo "CI_CONTAINER_TAG=v2" | tee .ci.env

# Similar to report-ci-image-tag: we override the script to use the `v2`
# container tag in pre-release pipelines.
ci-test-image-build:
  script:
    - echo "CI_CONTAINER_TAG=v2" | tee .ci.env
  after_script:
    - echo "Removed default after_script behavior"
  environment: null

upgraded-nextcloud-app-ready:
  stage: upgraded-apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: upgrade-stackspin
    RESOURCE: "nextcloud"
  extends:
    - .nextcloud_rules
    - .trigger_apps_ready_pipeline

upgraded-velero-app-ready:
  stage: upgraded-apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: upgrade-stackspin
    RESOURCE: "velero"
    # Do not run `cert` and cypress tests, because we don't have them for velero
    SKIP_CERT: "true"
  extends:
    - .velero_rules
    - .trigger_apps_ready_pipeline

upgraded-wekan-app-ready:
  stage: upgraded-apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: upgrade-stackspin
    RESOURCE: "wekan"
  extends:
    - .wekan_rules
    - .trigger_apps_ready_pipeline

upgraded-wordpress-app-ready:
  stage: upgraded-apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: upgrade-stackspin
    RESOURCE: "wordpress"
  extends:
    - .wordpress_rules
    - .trigger_apps_ready_pipeline

upgraded-zulip-app-ready:
  stage: upgraded-apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: upgrade-stackspin
    RESOURCE: "zulip"
  extends:
    - .zulip_rules
    - .trigger_apps_ready_pipeline

upgraded-hedgedoc-app-ready:
  stage: upgraded-apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: upgrade-stackspin
    RESOURCE: "hedgedoc"
  extends:
    - .hedgedoc_rules
    - .trigger_apps_ready_pipeline

upgraded-single-sign-on-app-ready:
  stage: upgraded-apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: upgrade-stackspin
    RESOURCE: "single-sign-on"
    # kustomization-ready has already run as part of the base-ready in
    # the install-stackspin pipeline
    SKIP_KUSTOMIZATION_READY: "true"
  extends:
    - .general_rules
    - .trigger_apps_ready_pipeline

upgraded-dashboard-app-ready:
  stage: upgraded-apps-ready
  variables:
    PARENT_PIPELINE_ID: $CI_PIPELINE_ID
    ARTIFACT_JOB: upgrade-stackspin
    RESOURCE: "dashboard"
    # kustomization-ready has already run as part of the base-ready in
    # the install-stackspin pipeline
    SKIP_KUSTOMIZATION_READY: "true"
  extends:
    - .general_rules
    - .trigger_apps_ready_pipeline

# Stage: upgrade-cluster-health
# =====================
#
# Check if cluster is still healthy after upgrade
upgrade-testinfra:
  stage: upgrade-stackspin-health
  extends:
    - .testinfra

upgrade-prometheus-alerts:
  stage: upgrade-stackspin-health
  extends:
    - .prometheus_alerts
#!/usr/bin/env sh
# Check if cluster directory was available from cache
#!/usr/bin/env bash
# NOTE(review): the scraped diff contained both the old (`set -ve`) and new
# (`set -euo pipefail`) lines; the stricter new form is kept.
set -euo pipefail
mem_requirements() {
# Adds all mem requirements from all value configmaps of a given app
# This is not perfect but ok for a rough estimate
#
# $1: app name. single-sign-on and dashboard live under flux2/core/base;
# everything else is looked up under flux2/apps.
app="$1"
if [[ "$app" =~ (single-sign-on|dashboard) ]]; then
root_dir='flux2/core/base'
else
root_dir='flux2/apps'
fi
# Pipeline: take everything after the 'values.yaml' key in the app's
# *-values-configmap.yaml files, keep the two lines following each
# 'requests:' key, keep only 'memory:' lines, strip the key, convert
# unit-suffixed values (Mi, Gi, ...) to plain bytes with numfmt, then join
# them with '+' (trailing '+0' terminates the expression) and evaluate the
# sum with bash arithmetic. Prints the total in bytes.
echo $(($(grep -A9999 'values.yaml' \
"$root_dir"/"$app"/*-values-configmap.yaml |
grep ' *requests:' -iA2 | grep ' *memory:' | cut -d':' -f2 | numfmt \
--from=auto | tr '\n' '+' | sed 's/$/0/')))
}
# Calculate droplet mem size
# NOTE(review): this span was a merged diff (old and new lines interleaved);
# the new side is kept: single terminate call, args-array based create call.
echo "ENABLED_APPS: $ENABLED_APPS"
total_apps_mem=0
IFS="," read -ra apps <<<"$ENABLED_APPS"
for app in "${apps[@]}"; do
  app_mem=$(mem_requirements "$app")
  total_apps_mem=$((total_apps_mem + app_mem))
  echo "Memory request for $app: $(echo "$app_mem" | numfmt --to=iec-i)"
done

# Total mem utilization for a barebone stackspin cluster without any optional apps
# 2.8 GiB
total_core_mem=3006477107
total_mem=$((total_core_mem + total_apps_mem))
echo -e "\nMemory requests for core components: $(echo "$total_core_mem" | numfmt --to=iec-i)"
echo "Memory request for all additional apps: ""$(echo "$total_apps_mem" | numfmt --to=iec-i)"
echo "Total memory requests: ""$(echo "$total_mem" | numfmt --to=iec-i)"
total_mb=$((total_mem / 1024 / 1024))
echo "Creating a droplet with $total_mb MB."

# shellcheck disable=SC2039,SC3028
VPS_HOSTNAME="$HOSTNAME"

if [[ "$CI_COMMIT_REF_NAME" != "$INSTALL_STACKSPIN_BRANCH" ]]; then
  # NOTE: this command will fail if GIT_STRATEGY is not set to "clone"
  git checkout "$INSTALL_STACKSPIN_BRANCH"
fi

# Delete old machine if it still exists
echo "Deleting old machine"
python3 -c "import greenhost_cloud; greenhost_cloud.terminate_droplets_by_name(\"^${VPS_HOSTNAME}$\")"

echo "Creating new machine"
# Uses a custom disk image built from commit
# 33ccc519e5d239395e94e655a3fbd5cfc3640a60 (around 2.12) on 2024-06-18. See
# CONTRIBUTING.md#ci-pipeline-image for more info.
args=(
  --create-droplet "$DOMAIN"
  --create-hostname "$VPS_HOSTNAME"
  --ssh-key-id "$SSH_KEY_ID"
  --mem "$total_mb"
  --create-domain-records
  --subdomain "$SUBDOMAIN"
  --disk-image-id '-22353'
  --truncate-subdomain
)
if [ "${DISK_SIZE:-}" != "" ]; then
  args+=(--disk-size "$DISK_SIZE")
fi
python3 -m stackspin "$VPS_HOSTNAME" create "${args[@]}"
# Disabled for now, see https://open.greenhost.net/stackspin/stackspin/-/issues/1057
#   --docker-mirror-server="${CI_DEPENDENCY_PROXY_SERVER}" \
#   --docker-mirror-endpoint="${CI_DEPENDENCY_PROXY_GROUP_IMAGE_PREFIX}" \
#   --docker-mirror-username="${CI_RUNNER_PERSONAL_ACCESS_USERNAME}" \
#   --docker-mirror-password="${CI_RUNNER_PERSONAL_ACCESS_TOKEN}"

# Install Kubernetes, firewall, etc.
python3 -m stackspin "$VPS_HOSTNAME" install

# Make sure .ci.env variables are not lost
cat .ci.env >>"$CLUSTER_DIR/.cluster.env"

# Save with which git commit we're installing Stackspin, so we can make sure
# our Kustomizations are up to date
echo "STACKSPIN_COMMIT_SHA=$(git rev-parse HEAD)" >>"$CLUSTER_DIR/.cluster.env"
#!/usr/bin/env bash
#
# Installs a ClusterIssuer for zerossl.com with our credentials.
#
# Usage:
#
#   ./install_zerossl_issuer.sh
#
# Requires ZEROSSL_EAB_HMAC_KEY, ZEROSSL_EAB_KID and ZEROSSL_EMAIL_ADDRESS in
# the environment, and a working kubectl context.
set -euo pipefail

# Create secret with HMAC key (base64, single line as required by k8s Secret data)
b64secret=$(echo -n "${ZEROSSL_EAB_HMAC_KEY}" | base64 -w0)

# Add zerossl ClusterIssuer
# NOTE(review): the heredoc YAML indentation was flattened in the scraped
# source and has been reconstructed here.
kubectl apply -n cert-manager -f - <<EOF
---
apiVersion: v1
kind: Secret
metadata:
  namespace: cert-manager
  name: zerossl-eabsecret
data:
  secret: ${b64secret}
---
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: zerossl-issuer
spec:
  acme:
    # The email address used for signing up with zerossl.com
    email: ${ZEROSSL_EMAIL_ADDRESS}
    # The ACME server URL
    server: "https://acme.zerossl.com/v2/DV90"
    externalAccountBinding:
      keyID: ${ZEROSSL_EAB_KID}
      keySecretRef:
        name: zerossl-eabsecret
        key: secret
    # Name of a secret used to store the ACME account private key
    privateKeySecretRef:
      name: zerossl-prod
    solvers:
      - http01:
          ingress:
            class: nginx
EOF
#!/usr/bin/env sh
# Debug helper: prints a whitelist of CI environment variables and, when
# present, the contents of the .ci.env / .cluster.env dotenv artifact files.
echo "Env vars:"
echo
# Only print the variables relevant to the Stackspin CI jobs, not the whole
# environment (which may contain secrets).
env | grep -E '^(HOSTNAME|CLUSTER_NAME|FQDN|IP_ADDRESS|CLUSTER_DIR|ANSIBLE_HOST_KEY_CHECKING|KANIKO_BUILD_IMAGENAME|SSH_KEY_ID|SHELL|CI_PROJECT_DIR|STACKSPIN_BRANCH|STACKSPIN_COMMIT_SHA)='
echo
echo "Uptime: $(uptime)"
echo "KANIKO build image ref: ${CI_REGISTRY_IMAGE}/${KANIKO_BUILD_IMAGENAME}:${CI_CONTAINER_TAG}"
echo
# Show dotenv artifacts from earlier jobs, if they were produced.
if [ -f .ci.env ]; then
echo "Content of .ci.env:"
cat .ci.env
fi
if [ -f .cluster.env ]; then
echo "Content of .cluster.env:"
cat .cluster.env
fi
---
# Stage: base-ready
# ====================
#
# Test if base kustomizations are ready, before configuration can get applied
# that makes use of CRDs, i.e. clusterIssuer
#
# Indentation reconstructed: the scraped source had all YAML nesting
# flattened to column 0.
cert-manager-kustomization-ready:
  variables:
    RESOURCE: "cert-manager"
  extends:
    - .kustomization_ready

dashboard-kustomization-ready:
  variables:
    RESOURCE: "dashboard"
  extends:
    - .kustomization_ready

letsencrypt-issuer-kustomization-ready:
  variables:
    RESOURCE: "letsencrypt-issuer"
  extends:
    - .kustomization_ready

local-path-provisioner-kustomization-ready:
  variables:
    RESOURCE: "local-path-provisioner"
  extends:
    - .kustomization_ready

metallb-kustomization-ready:
  variables:
    RESOURCE: "metallb"
  extends:
    - .kustomization_ready

namespaces-kustomization-ready:
  variables:
    RESOURCE: "namespaces"
  extends:
    - .kustomization_ready

nginx-kustomization-ready:
  variables:
    RESOURCE: "nginx"
  extends:
    - .kustomization_ready

single-sign-on-kustomization-ready:
  variables:
    RESOURCE: "single-sign-on"
  extends:
    - .kustomization_ready

sources-kustomization-ready:
  variables:
    RESOURCE: "sources"
  extends:
    - .kustomization_ready

stackspin-kustomization-ready:
  variables:
    RESOURCE: "stackspin"
  extends:
    - .kustomization_ready

system-upgrade-config-ready:
  variables:
    RESOURCE: "system-upgrade-config"
  extends:
    - .kustomization_ready

system-upgrade-controller-ready:
  variables:
    RESOURCE: "system-upgrade-controller"
  extends:
    - .kustomization_ready
......@@ -14,10 +14,18 @@ variables:
- changes:
- ${CHART_DIR}Chart.yaml
.chart_changes_rules:
rules:
- changes:
- ${CHART_DIR}templates/**/*
- ${CHART_DIR}values-local.yaml.example
- ${CHART_DIR}values.yaml
- ${CHART_DIR}Chart.yaml
lint-helm:
stage: lint-helm-chart
image:
name: alpine/helm:3.7.1
name: alpine/helm:3.17.1
entrypoint: ["/bin/sh", "-c"]
script:
- cd ${CHART_DIR:-"."}
......@@ -29,15 +37,13 @@ lint-helm:
expire_in: 1 week
# Even if lint fails, upload the charts/ folder as artifact
when: always
rules:
- changes:
- ${CHART_DIR}*.yaml
- ${CHART_DIR}templates/*.yaml
extends:
- .chart_changes_rules
package-chart:
stage: package-helm-chart
image:
name: alpine/helm:3.7.1
name: alpine/helm:3.17.1
entrypoint: ["/bin/sh", "-c"]
script:
- cd ${CHART_DIR:-"."}
......@@ -56,7 +62,7 @@ release-helm:
stage: release-helm-chart
script:
- cd ${CHART_DIR:-"."}
- if [ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ]; then export HELM_CHANNEL='stable'; else export HELM_CHANNEL='unstable'; fi
- if [ "$CI_COMMIT_BRANCH" == "$CI_DEFAULT_BRANCH" ] || [ -n "$CI_COMMIT_TAG" ]; then export HELM_CHANNEL='stable'; else export HELM_CHANNEL='unstable'; fi
- export CHART_FILE=$(ls ${CHART_NAME}-*.tgz)
- curl --fail --request POST --user gitlab-ci-token:$CI_JOB_TOKEN --form "chart=@${CHART_FILE}" "${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/${HELM_CHANNEL}/charts"
- echo "Chart '${CHART_FILE}' published to helm repository '${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/${HELM_CHANNEL}/charts'"
......
......@@ -17,7 +17,8 @@
image:
# We need a shell to provide the registry credentials, so we need to use the
# kaniko debug image (https://github.com/GoogleContainerTools/kaniko#debug-image)
name: gcr.io/kaniko-project/executor:v1.6.0-debug
# https://console.cloud.google.com/gcr/images/kaniko-project/global/executor
name: gcr.io/kaniko-project/executor:v1.23.2-debug
entrypoint: [""]
script:
- mkdir -p /kaniko/.docker/
......
# Prepare SSH access for jobs that connect to a freshly created VPS:
# disable strict host key checking (hosts are new, so their keys are
# unknown) and load the CI private key from $SSH_PRIVATE_KEY into an
# ssh-agent. `tr -d '\r'` strips carriage returns a CI variable may carry.
.ssh_setup:
  before_script:
    - mkdir ~/.ssh
    - echo -e 'Host *\n stricthostkeychecking no' > ~/.ssh/config
    - eval $(ssh-agent -s)
    - echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add - > /dev/null
# General rules
# =============
#
# Rules that enable the cluster to be built and are applied to most steps
# (except for application-specific steps)
.general_rules:
  rules:
    # Run when any file involved in provisioning, installing or testing
    # the cluster changes.
    - changes:
        - .gitlab-ci.yml
        - .gitlab/ci_scripts/*
        - .gitlab/ci_templates/*
        - .gitlab/ci_pipelines/*
        - Dockerfile
        - ansible/**/*
        - flux/**/*
        - flux2/**/*
        - install/**/*
        - test/**/*
        - stackspin/**/*
        - requirements.txt
        - VERSION
        - CHANGELOG.md
    # Run when jobs are requested explicitly, either via the TRIGGER_JOBS
    # CI variable or via a TRIGGER_JOBS=... pattern in the commit message.
    - if: '$TRIGGER_JOBS =~ /enable-.*/'
    - if: '$CI_COMMIT_MESSAGE =~ /TRIGGER_JOBS=.*enable-/'
    # Always run on protected branches (e.g. main).
    - if: '$CI_COMMIT_REF_PROTECTED == "true"'
# app rules
#
# Define the rules when/if app specific jobs are run.
# Just add the variable RESOURCE to the job like this:
#   variables:
#     RESOURCE: "nextcloud"
# and import the templates with i.e.
#   extends: .nextcloud_rules
# .nextcloud_rules will ensure that the job is only executed:
# - when files related to the app changed in the repo
# - A pipeline gets started from the UI and the job name is included in the
#   CI variable `TRIGGER_JOBS`
# - A commit is pushed containing the pattern TRIGGER_JOBS=.*<job name>
#   (i.e. TRIGGER_JOBS=ci-test-image-build,enable-nextcloud)
#
# Gitlab CI allows pushing CI vars via `git push` but a bug prevents this when
# using merge request pipelines (see https://gitlab.com/gitlab-org/gitlab/-/issues/326098)

# Note: unlike the other app rules below, this one hardcodes the
# `monitoring` paths instead of interpolating $RESOURCE.
.monitoring_rules:
  rules:
    - changes:
        - flux2/apps/monitoring/*.yaml
        - flux2/cluster/optional/monitoring/*.yaml
        - flux2/core/base/sources/grafana-helmrepository.yaml
        - flux2/core/base/sources/wikimedia-helmrepository.yaml
        - flux2/core/base/sources/prometheus-community-helmrepository.yaml
        - flux2/config/monitoring/*.yaml
        # Install and test if we change anything about SSO.
        - flux2/core/base/single-sign-on/*.yaml
        - install/install-app.sh
        - install/flux-version-check.sh
        - test/cypress/e2e/monitoring.cy.js
        - .gitlab/ci_pipelines/apps_ready.yml
    - if: '$TRIGGER_JOBS =~ /enable-monitoring/'
    - if: '$CI_COMMIT_MESSAGE =~ /TRIGGER_JOBS=.*enable-monitoring/'
    - if: '$CI_COMMIT_REF_PROTECTED == "true"'
# Per-app rule anchors. All follow the same pattern: run when app-related
# files change, when explicitly triggered (TRIGGER_JOBS variable or commit
# message), or on protected branches. $RESOURCE in the change paths is
# interpolated from the job that extends the anchor.
.nextcloud_rules:
  rules:
    - changes:
        - flux2/apps/$RESOURCE/*.yaml
        - flux2/cluster/optional/$RESOURCE/*.yaml
        - flux2/core/base/sources/nextcloud-helmrepository.yaml
        # Install and test if we change anything about SSO.
        - flux2/core/base/single-sign-on/*.yaml
        - install/install-app.sh
        - install/flux-version-check.sh
        - .gitlab/ci_pipelines/apps_ready.yml
    - if: '$TRIGGER_JOBS =~ /enable-nextcloud/'
    - if: '$CI_COMMIT_MESSAGE =~ /TRIGGER_JOBS=.*enable-nextcloud/'
    - if: '$CI_COMMIT_REF_PROTECTED == "true"'
# single-sign-on is a core app, so its paths live under flux2/core/base
# and it is installed by install-stackspin.sh rather than install-app.sh.
.single_sign_on_rules:
  rules:
    - changes:
        - flux2/core/base/$RESOURCE/*.yaml
        - flux2/infrastructure/sources/single-sign-on-helmrepository.yaml
        - install/install-stackspin.sh
        - .gitlab/ci_pipelines/apps_ready.yml
    - if: '$TRIGGER_JOBS =~ /enable-single-sign-on/'
    - if: '$CI_COMMIT_MESSAGE =~ /TRIGGER_JOBS=.*enable-single-sign-on/'
    - if: '$CI_COMMIT_REF_PROTECTED == "true"'
.velero_rules:
  rules:
    - changes:
        - flux2/apps/$RESOURCE/*.yaml
        - flux2/cluster/optional/$RESOURCE/*.yaml
        - flux2/core/base/sources/vmware-tanzu-helmrepository.yaml
        - install/install-app.sh
        - install/flux-version-check.sh
        - .gitlab/ci_pipelines/apps_ready.yml
    - if: '$TRIGGER_JOBS =~ /enable-velero/'
    - if: '$CI_COMMIT_MESSAGE =~ /TRIGGER_JOBS=.*enable-velero/'
    - if: '$CI_COMMIT_REF_PROTECTED == "true"'
.wekan_rules:
  rules:
    - changes:
        - flux2/apps/$RESOURCE/*.yaml
        - flux2/cluster/optional/$RESOURCE/*.yaml
        - flux2/core/base/sources/wekan-helmrepository.yaml
        # Install and test if we change anything about SSO.
        - flux2/core/base/single-sign-on/*.yaml
        - install/install-app.sh
        - install/flux-version-check.sh
        - test/cypress/e2e/wekan.cy.js
        - .gitlab/ci_pipelines/apps_ready.yml
    - if: '$TRIGGER_JOBS =~ /enable-wekan/'
    - if: '$CI_COMMIT_MESSAGE =~ /TRIGGER_JOBS=.*enable-wekan/'
    - if: '$CI_COMMIT_REF_PROTECTED == "true"'
.wordpress_rules:
  rules:
    - changes:
        - flux2/apps/$RESOURCE/*.yaml
        - flux2/cluster/optional/$RESOURCE/*.yaml
        - flux2/core/base/sources/wordpress-helmrepository.yaml
        # Install and test if we change anything about SSO.
        - flux2/core/base/single-sign-on/*.yaml
        - install/install-app.sh
        - install/flux-version-check.sh
        - test/cypress/e2e/wordpress.cy.js
        - .gitlab/ci_pipelines/apps_ready.yml
    - if: '$TRIGGER_JOBS =~ /enable-wordpress/'
    - if: '$CI_COMMIT_MESSAGE =~ /TRIGGER_JOBS=.*enable-wordpress/'
    - if: '$CI_COMMIT_REF_PROTECTED == "true"'
.zulip_rules:
  rules:
    - changes:
        - flux2/apps/$RESOURCE/*.yaml
        - flux2/cluster/optional/$RESOURCE/*.yaml
        - flux2/core/base/sources/zulip-helmrepository.yaml
        # Install and test if we change anything about SSO.
        - flux2/core/base/single-sign-on/*.yaml
        - install/install-app.sh
        - install/flux-version-check.sh
        - test/cypress/e2e/zulip.cy.js
        - .gitlab/ci_pipelines/apps_ready.yml
    - if: '$TRIGGER_JOBS =~ /enable-zulip/'
    - if: '$CI_COMMIT_MESSAGE =~ /TRIGGER_JOBS=.*enable-zulip/'
    - if: '$CI_COMMIT_REF_PROTECTED == "true"'
.hedgedoc_rules:
  rules:
    - changes:
        - flux2/apps/$RESOURCE/*.yaml
        - flux2/cluster/optional/$RESOURCE/*.yaml
        - flux2/core/base/sources/hedgedoc-helmrepository.yaml
        # Install and test if we change anything about SSO.
        - flux2/core/base/single-sign-on/*.yaml
        - install/install-app.sh
        - install/flux-version-check.sh
        - test/cypress/e2e/hedgedoc.cy.js
        - .gitlab/ci_pipelines/apps_ready.yml
    - if: '$TRIGGER_JOBS =~ /enable-hedgedoc/'
    - if: '$CI_COMMIT_MESSAGE =~ /TRIGGER_JOBS=.*enable-hedgedoc/'
    - if: '$CI_COMMIT_REF_PROTECTED == "true"'
# Common variables
# ================
#
# Note on variables: some variables are defined in the .cluster.env file
# generated by the `create-vps` job.
variables:
  # NOTE(review): presumably the ID of the SSH key registered with the
  # VPS provider that gets installed on new machines -- confirm.
  SSH_KEY_ID: "411"
  ANSIBLE_HOST_KEY_CHECKING: "False"
  # Name of the CI container image built with kaniko.
  KANIKO_BUILD_IMAGENAME: "stackspin-ci"
  # Per-branch directory that holds the cluster state/artifacts.
  CLUSTER_DIR: "clusters/${CI_COMMIT_REF_SLUG}"
# All jobs default to the CI image and print debug info before running.
default:
  image: "${CI_REGISTRY_IMAGE}/${KANIKO_BUILD_IMAGENAME}:${CI_CONTAINER_TAG}"
  before_script:
    - sh .gitlab/ci_scripts/print_debug_info.sh
# Common job definitions
# ======================
#
# Re-usable job definitions

# Wait until the Flux kustomization named in $RESOURCE is reconciled at
# the expected git revision; retries pytest up to 120 times with a 20s
# delay (~40 minutes total).
.kustomization_ready:
  stage: base-ready
  script:
    - cd test/
    - export KUBECONFIG="${PWD}/../clusters/${HOSTNAME}/kube_config_cluster.yml"
    - pytest -v -s -m 'kustomizations' --resource="$RESOURCE" --desired-revision "${STACKSPIN_BRANCH}@sha1:${STACKSPIN_COMMIT_SHA}" --reruns 120 --reruns-delay 20
  extends:
    - .general_rules
  interruptible: true
# The dotenv report requires us to report the artifacts in every job that is
# required with a `needs:` from another job.
.report_artifacts:
  artifacts:
    paths:
      - clusters
    expire_in: 1 month
    # Upload artifacts even when the job fails.
    when: always
    reports:
      dotenv:
        $CLUSTER_DIR/.cluster.env
# SSH setup for jobs connecting to the freshly created VPS; also prints
# debug info first. Host key checking is disabled because the hosts are
# newly provisioned.
.ssh_setup:
  before_script:
    - bash .gitlab/ci_scripts/print_debug_info.sh
    - mkdir ~/.ssh
    - echo -e 'Host *\n stricthostkeychecking no' > ~/.ssh/config
    - eval $(ssh-agent -s)
    - echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add - > /dev/null
# Get create-vps artifacts in a child pipeline. Requires you to set a
# PARENT_PIPELINE_ID from the parent pipeline
.child_require_artifacts:
  needs:
    - pipeline: $PARENT_PIPELINE_ID
      job: $ARTIFACT_JOB
# Download artifacts from the create-vps job within the same pipeline.
.require_artifacts:
  dependencies:
    - create-vps
# Trigger the apps_ready child pipeline and mirror its status
# (strategy: depend makes this job wait for the child pipeline).
.trigger_apps_ready_pipeline:
  trigger:
    include:
      - local: .gitlab/ci_pipelines/apps_ready.yml
    strategy: depend
# Run testinfra checks over ansible against the hosts in the cluster
# inventory; needs SSH access and the create-vps artifacts.
.testinfra:
  script:
    - cd ansible/
    - pytest -v -s -m 'testinfra' --connection=ansible --ansible-inventory=../${CLUSTER_DIR}/inventory.yml --hosts='ansible://*'
  extends:
    - .ssh_setup
    - .general_rules
    - .require_artifacts
  interruptible: true
# Check Prometheus alerts: fetch the basic-auth password from the cluster
# secrets, then retry the prometheus-marked pytest suite (30 attempts,
# 10s apart).
.prometheus_alerts:
  variables:
    # RESOURCE var is used in job specific rules (i.e. .monitoring_rules)
    RESOURCE: "kube-prometheus-stack"
  script:
    - export BASIC_AUTH_PW=$(python3 -m stackspin $HOSTNAME secrets | grep stackspin-prometheus-basic-auth | awk '{ print $4 }')
    - cd test/
    - bash ../.gitlab/ci_scripts/retry_cmd_until_success.sh 30 10 pytest -s -m prometheus
  extends:
    - .monitoring_rules
    - .require_artifacts
  interruptible: true
# Checkout older stackspin branch
# used i.e. in upgrade pipelines where an older version branch of Stackspin
# (defined by the STACKSPIN_BRANCH env var)
# is installed and tested before an upgrade to the current version is performed
.checkout_branch: &checkout_branch
  script:
    - |
      if [[ "$CI_COMMIT_REF_NAME" != "$STACKSPIN_BRANCH" ]]; then
        # NOTE: this command will fail if GIT_STRATEGY is not set to "clone"
        git checkout "$STACKSPIN_BRANCH"
      fi
......@@ -4,6 +4,9 @@
# Additional CI directives
#
# Trigger upgrade-test pipeline:
# UPGRADE_TEST
#
# Include apps to install even if there's no code change that would
# trigger an automated installation, i.e.:
# TRIGGER_JOBS=enable-nextcloud
......
## Preparation
1. [ ] Received necessary information
- [ ] SMTP login credentials
- [ ] Domain
- [ ] List of requested applications
- [ ] Customization (custom subdomains, etc.)
- [ ] IP address of VPS
- [ ] Access to VPS
- [ ] Email address of the first admin
2. [ ] DNS is configured, one of:
- [ ] Wildcard CNAME `*.{domain}` & A record for `{domain}`, or
- [ ] Domains for all the requested apps
## Installation
3. [ ] Steps from [create
cluster](https://docs.stackspin.net/en/v2/installation/create_cluster.html)
finished
4. [ ] Edited `.flux.env`
5. [ ] Steps from [Install
Stackspin](https://docs.stackspin.net/en/v2/installation/install_stackspin.html)
finished
6. [ ] Installed apps
7. [ ] Copied cluster folder from `./clusters/<client>` into `cluster-configs` repository
## Test
8. [ ] Confirmed all `kustomizations` and `helmreleases` are `Ready`.
9. [ ] Confirmed apps are reachable through requested domains
## Communication
10. [ ] Communicated admin user credentials?
11. [ ] Sent the following email:
- [ ] Replaced `{customer}` with customer name
- [ ] Replaced `{me}` with my name
- [ ] Replaced `{domain}` with right domain (and changed any custom (sub)domains)
- [ ] Removed apps that were not installed
> Dear `{customer}`,
>
> I have installed Stackspin and created an admin account for you.
> You can set a password for your account by going to the following link:
>
> https://dashboard.{domain}/web/recovery
>
> There, you have to enter your email address
> (the one I'm sending this email to)
> and then you'll be emailed another link
> which allows you to set the password you use to log in.
>
> After setting that password you can use it to log in to:
>
> - The dashboard at: https://dashboard.{domain}
>
> - The Zulip hosted at: https://talk.prototypefund.de
>
> - Nextcloud (file storage and sharing, password storage and a calendar)
> at https://files.{domain}
>
> - WordPress (Website CMS) at https://www.{domain}
>
> - Wekan ("Kanban" planning board) at https://wekan.{domain}
>
> Feel free to contact us if you have any questions.
>
> Kind regards,
> `{me}`
> Stackspin
# Integrate new app
# Integrate a new app
Based on Wekan as a Stackspin app example.
## Flux
* [ ] Create new source if needed in `flux2/infrastructure/sources/APP.yaml`
* [ ] Include `APP.yaml` in `flux2/infrastructure/sources/kustomization.yaml`
* [ ] In case of a core app, add healthChecks to `core` `Kustomization`
(`flux2/cluster/base/core.yaml`)
* [ ] Add app secret: `install/templates/stackspin-APP-variables.yaml.jinja`
* [ ] Flux kustomization: `flux2/cluster/optional/APP/APP.yaml`
* [ ] K8s kustomization: `flux2/apps/APP/kustomization.yaml`
* [ ] If needed, add PVCs in `flux2/apps/APP/pvc.yaml`
* [ ] Add `HelmRelease` in `flux2/apps/APP/release.yaml`
* Mem resource requests/limits: See https://open.greenhost.net/stackspin/stackspin/-/issues/1027
* [ ] mem request: the median weekly memory usage of an app
* [ ] mem limit: 150% of the weekly max memory usage
### Single sign-on
### Cluster config
* Integrate the new app into the single sign-on system
* [ ] Add OAuth client secret to `install/templates/stackspin-oauth-variables.yaml.jinja`
* In `flux2/core/base/single-sign-on/release.yaml`:
* [ ] Add app `userbackend.applications`
* [ ] Add app to `oAuthClients`
* [ ] Configure app OIDC settings in `HelmRelease` `flux2/apps/APP/release.yaml`
* [ ] Disable user/pw login if possible
* [ ] Admin-login should grant admin privileges
* [ ] Non-admin should not grant admin privileges
* [ ] Flux kustomization: `flux2/cluster/optional/APP/APP.yaml` (Example: `flux2/cluster/optional/wekan/wekan.yaml`)
### Source (helmRepository / gitRepository)
* [ ] Create new `helmRepository` if needed in `flux2/core/base/sources/APP-helmrepository.yaml` (Example: `flux2/core/base/sources/wekan-helmrepository.yaml`)
* [ ] Include `APP-helmrepository.yaml` in `flux2/core/base/sources/kustomization.yaml`
### App config
* [ ] Add app secrets template (in `dashboard` repo): `backend/areas/apps/templates/stackspin-APP-variables.yaml.jinja` (Example: `backend/areas/apps/templates/stackspin-wekan-variables.yaml.jinja`)
* [ ] If the app is storing state to disk, add PVCs in `flux2/apps/APP/pvc.yaml` (Example: `flux2/apps/wekan/pvc.yaml`)
* [ ] Helm chart values configmap: `flux2/apps/APP/APP-values-configmap.yaml` (Example: `flux2/apps/wekan/wekan-values-configmap.yaml`)
* [ ] `HelmRelease` in `flux2/apps/APP/release.yaml` (Example: `flux2/apps/wekan/release.yaml`)
* [ ] Create a kustomization config map for your app that sets the app subdomain by adding an entry to `flux2/core/base/migration-scripts/create-kustomization-variables-configmaps-script-configmap.yaml`
* [ ] Add your app in the right places in the dashboard source, in particular including an app icon.
### Backup/restore
#### Single sign-on
* Integrate the new app into the single sign-on system
* [ ] Oauth client resource: `flux2/apps/APP/APP-oauth-client.yaml` (Example: `flux2/apps/wekan/wekan-oauth-client.yaml`)
* [ ] Configure app OIDC settings, probably via its helm values: `flux2/apps/APP/APP-values-configmap.yaml` (Example: `flux2/apps/wekan/wekan-values-configmap.yaml`)
* [ ] Add the app to the list of dashboard apps and oauthclients:
- in `flux2/core/base/dashboard/dashboard-apps-configmap.yaml` if this app will be part of the default Stackspin app set.
- If you have a self-managed cluster please create new configmaps `stackspin-apps-custom` and `stackspin-oauthclients-custom` with the same structure as the official ones.
* [ ] Disable user/pw login if possible (#881)
* Test SSO:
* [ ] Admin login should grant admin privileges
* [ ] Non-admin login should not grant admin privileges
#### Backup/restore
This applies if the app has any persistent storage that needs to be part of
backups.
* [ ] Add the label `stackspin.net/backupSet=myApp` to some kubernetes
resources. This label is used by velero when instructed to restore a single
* [ ] Add the label `stackspin.net/backupSet: "APP"` to some kubernetes
resources. This label is used by Velero when instructed to restore a single
app. Typically you should add it to:
* [ ] the pvc(s) in `flux2/apps/myApp/pvc*.yaml`;
* [ ] the pvc(s) in `flux2/apps/APP/pvc*.yaml` (Example: `flux2/apps/wekan/pvc.yaml`)
* [ ] any pod(s) that use those pvc(s); this would go in the chart's helm
values, with the value typically called `podLabels`, or if it doesn't have
that maybe `commonLabels`;
values configmap, with the value typically called `podLabels`, or if it doesn't have
that maybe `commonLabels`: `flux2/apps/APP/APP-values-configmap.yaml` (Example: `flux2/apps/wekan/wekan-values-configmap.yaml`)
* [ ] the kubernetes objects controlling those pods, typically a deployment
(`deploymentLabels` or `commonLabels`) or statefulset (`statefulSetLabels`
or `commonLabels`).
......@@ -54,55 +61,58 @@ backups.
* [ ] Add app to `dump_secrets()` in `stackspin/cluster.py`
If you have made it this far, you have completed all the necessary steps for
adding a custom app to your cluster.
If you intend to contribute a new app to Stackspin via a merge request, please
continue and follow the rest of the steps below.
## Tests
* [ ] Make sure testing app resources work (`test/pytest/test_resources.py`)
* [ ] Make sure testing app cert works (`test/pytest/test_certs.py`)
* [ ] Add app to `test()` in `stackspin/__main__.py`
* [ ] Add Taiko test (`tests/taiko`)
* [ ] Add automatic tests for your app. We use the cypress framework for this.
See `test/cypress/e2e` for the existing tests.
## CI
* Add the following elements to `.gitlab-ci.yml`:
* [ ] `.APP-rules` partial
* [ ] `enable-APP` job
* [ ] `APP-kustomization-ready` job
* [ ] `APP-cert` job
* [ ] `APP-taiko` test job
* Add the following elements:
* [ ] `.APP-rules` partial to `.gitlab/ci_templates/stackspin_common.yml`
* [ ] `enable-APP` job to `.gitlab/ci_pipelines/install_stackspin.yml`
* [ ] `APP-app-ready` job to `.gitlab/ci_pipelines/default.yml`
* [ ] `upgraded-APP-app-ready` job to `.gitlab/ci_pipelines/upgrade_test.yml`
* [ ] an entry to `app_subdomains` in `test/pytest/test_certs.py`
## Renovatebot
* [ ] Make sure the needed `HelmRelease` fields for renovatebot are in place and
order, i.e.
```
# renovate: registryUrl=https://helm-charts.wikimedia.org/stable/
chart: eventrouter
version: 0.3.6
```
**Note**: the order of these lines is important for renovatebot's regular expression to match them.
For adding the registryUrl automatically, you could use the
[renovate-helm-releases](https://github.com/k8s-at-home/renovate-helm-releases)
script:
```
git clone https://github.com/k8s-at-home/renovate-helm-releases /tmp/renovate-helm-releases
/tmp/renovate.py --cluster-path flux2
```
* [ ] Make sure that latest [renovate pipeline](https://open.greenhost.net/stackspin/renovate/-/pipelines)
checks for app updates **after the new app is merged into the main branch**
## Documentation
* Add app to:
* [ ] `docs/installation_instructions.rst`
* [ ] `docs/testing_instructions.rst`
* [ ] `docs/usage.rst`
* [ ] `Step 2: Install additional applications` in `docs/installation/install_stackspin.rst`
* [ ] `Step 3: Install additional applications` in `docs/installation/install_stackspin.rst`
* [ ] `docs/installation/testing.rst`
* [ ] `docs/system_administration/migrating.rst`
* [ ] `docs/usage/applications.rst`
## Demo instance
* [ ] Install the app on `demo.stackspin.net` so it can be previewed by anyone.
* [ ] Update the nightly reset script for the demo instance, on `cli.stackspin.net`,
in `/srv/stackspin/clusters/demo.stackspin.net/custom-scripts/vars.sh`.
## Follow-up issues
Create follow-up issue with:
* [ ] Fine-tune CPU and mem limits (https://open.greenhost.net/stackspin/stackspin/-/issues/1027)
* [ ] Limit settings (<https://blog.kubecost.com/blog/requests-and-limits/#our-solution>)
* [ ] Set CPU request
* CPU limits are unset.
* Set CPU request to average CPU usage.
* [ ] Set memory requests and limits
* Set memory request to:
```
1.5 * avg(quantile_over_time(.99,container_memory_working_set_bytes{container_name!="POD",container_name!=""}[7d])) by (container_name,pod_name,namespace)`
```
* Limit set to `2 * request`
- [ ] Remove elevated permissions from Gitlab account on `open.stackspin.net`.
- Disable/remove Stackspin accounts on `stackspin.net`:
- [ ] Remove Stackspin account in Stackspin dashboard.
- [ ] Deactivate user in Zulip.
- [ ] Disable user in Nextcloud.
- [ ] Disable user in Wordpress (change role to “no role for this site”).
- [ ] For developers/ops: remove from Signal group "Stackspin escalation".
- [ ] Remove email forwards from `info@stackspin.net`, `admin@stackspin.net`.
- [ ] Remove personal ceph object storage credentials if present.
- [ ] Assign permissions to Gitlab account on `open.stackspin.net`.
- [ ] Create Stackspin account on `stackspin.net`, possibly with admin role.
- [ ] For developers/ops: add to Signal group "Stackspin escalation".
- [ ] If relevant: add email forward to `info@stackspin.net`, `admin@stackspin.net`.
# Release checklist
## Before finalizing release
We use a simple two-component version `x.y`, where `y` is updated for
auto-update releases ("minor release"), and `x` is updated for releases that
require manual intervention ("major release").
Make these changes in the main branch before releasing:
In the checklist below, we'll use a literal `X.Y` to stand for the new version.
In case of a major release, `Y = 0`.
* [ ] Ensure all applications/dependencies/charts are at their latest versions
see `.gitlab/issue_templates/update_all_components.md`
* Update/review `docs/*` and make sure it matches the current state
* [ ] Update how to checkout the latest stable tag
`find docs/ -name '*.md' -o -name '*.rst' -exec grep -H '0.7' {} \;`
* [ ] Document how to upgrade in `docs/upgrading.rst`
* [ ] Update dependencies in `requirements.txt` by following the
instructions in `requirements.in`
* [ ] Do a manual upgrade test from last release version to `main`
* [ ] Create a release merge request for the main branch
## Pre-release chores
## In the release merge request
### Update our custom charts
If necessary, do a release of the following charts. Don't forget to change the
version used in Stackspin (or wait for renovatebot to do that automatically).
* [ ] [dashboard](https://open.greenhost.net/stackspin/dashboard) (see also dashboard#10)
* [ ] [nextcloud](https://open.greenhost.net/stackspin/nextcloud) (see also nextcloud#1012)
* [ ] [wordpress](https://open.greenhost.net/stackspin/wordpress-helm)
* [ ] [hedgedoc](https://open.greenhost.net/stackspin/hedgedoc-helm-chart)
* [ ] [local-path-provisioner](https://open.greenhost.net/stackspin/local-path-provisioner)
## Release candidate
* [ ] Create a new branch, `release-candidate/vX.Y`, from main:
```sh
git checkout main
git pull
git checkout -b release-candidate/vX.Y
```
### Only for major releases
* [ ] Update how to checkout the latest stable tag, in
`docs/installation/install_cli.rst`.
* [ ] Document how to upgrade in `docs/system_administration/upgrading.rst`
### For all releases
* [ ] Update the version number in the `VERSION` file
* Update [CHANGELOG.md](https://keepachangelog.com)
* [ ] Include all merged MR since last release, i.e. using [lab](https://github.com/zaquestion/lab#installation):
```
lab mr list -s merged -a | awk '{first = $1; $1=""; print "*" $0, "(" first ")"}'
```
* [ ] Include app charts and versions table (Use [tablemark-cli](https://github.com/haltcase/tablemark-cli)):
```
helm ls -A -o json | jq 'map({name, chart, app_version})' | jq 'map(.chart |= split("-")[-1])' > /tmp/versions.json
tablemark /tmp/versions.json
```
```sh
lab mr list -s merged -a | \
awk '{first = $1; $1=""; print "*" $0, "(" first ")"}'
```
* [ ] Include app charts and versions table. You can use
[tablemark-cli](https://github.com/haltcase/tablemark-cli), or
[an online tool](https://tableconvert.com/json-to-markdown):
```sh
helm ls -A -o json | jq 'map({name, chart, app_version})' | \
jq 'map(.chart |= split("-")[-1])' > /tmp/versions.json
```
Now look for image tag overrides and fix those app versions in the
table:
```sh
find ./flux2 -name '*.yaml' | xargs grep -A4 '^ *image:'
```
Then produce the final table:
```
tablemark /tmp/versions.json
```
* [ ] Include `Known issues`
* [ ] Update the version number in the `VERSION` file
* [ ] Commit (signed)
* [ ] Push to MR
* [ ] Make sure the main pipeline is successful before merging the MR!
* [ ] Wait for MR to get merged into main
* [ ] Update app versions in `flux2/core/base/dashboard/dashboard-apps-configmap.yaml`.
* [ ] Only for major versions: prepare a release blog post. You can
use the `CHANGELOG.md` as basis. (It will be published at a later step.)
* [ ] Decide where to link to from the dashboard "release notes" link. Could be
the release blog post, other release notes page, or as fallback the
CHANGELOG.md entry for this version.
* [ ] Update `flux2/cluster/base/stackspin-static-info.yaml` to reflect the new
version and the release notes URL.
* [ ] Commit, push to `release-candidate/vX.Y`.
* [ ] Only for minor versions: create MR to merge `release-candidate/vX.Y` into
`pre-release/vX`
### Manual pre-release upgrade testing (only major releases)
* [ ] Try the proposed upgrade procedure on a fresh cluster installed with the
previous major version. Make sure to document any required actions and
potential problems in the upgrade docs.
### Automatic pre-release upgrade testing (only minor releases)
We have a special upgrade pipeline to test upgrading from the previous release
to this new (candidate) release. This pipeline runs when the target branch of a
MR matches `pre-release/*`. In principle you could make a MR to merge `main`
into `pre-release/vX`, but that has the unfortunate side-effect that the
pipeline will restart whenever the source branch changes, which happens all the
time because of renovatebot automerging minor updates and colleagues that work
too hard. Therefore:
* [ ] Make sure that the resulting upgrade-test pipeline is successful
* [ ] Before merging, notify #general on `stackspin.net` that we're about to
release, and that `stackspin.net` itself can experience short downtime while
the upgrade is in progress.
* [ ] Wait until MR gets reviewed and merged
* [ ] The CI machine created by the upgrade-test pipeline doesn't get destroyed
automatically, so please remove it yourself.
### Check automatic upgrade on `stackspin.net` (only minor releases)
Now that the new code has been merged to `pre-release/vX`, it will be picked
up by the `stackspin.net` cluster which is set to follow that branch. Even
though the upgrade pipeline already tested the upgrade process, it's still good
to check if the upgrade goes well there:
* [ ] do a `flux reconcile source git stackspin` so you don't have to wait
until flux decides it's time to reconcile;
* [ ] `watch flux get kustomization` to see components being upgraded. If
necessary check the status using `kubectl describe hr ...` and debug.
## Quality assurance
Now is a good time to spend some time using the `stackspin.net` instance: do
some basic manual testing of the dashboard, log in to all the apps, check if
the Nextcloud apps are all working, check any major changes or new features,
etc. Typically we keep the release candidate on `stackspin.net` for a week or
so before pushing out the release, but that really depends on the type of
changes.
## Release branch (only minor releases)
* [ ] Merge the just-updated `pre-release/vX` into the release branch `vX`.
## Release branches (only major releases)
* [ ] Create a new branch `vX` from `v(X-1)`.
* [ ] Change all occurrences of the last release to the new release version:
```sh
sed -i 's/v(X-1)/vX/g' .gitlab/ci_pipelines/upgrade_test.yml \
.gitlab/issue_templates/update_all_components.md \
.gitlab/issue_templates/deploy_stackspin.md \
.gitlab/issue_templates/release.md \
docs/conf.py
```
* [ ] Create a merge request for merging the release candidate
into the release branch `vX`. Ask for review and merge.
* [ ] Create a new branch `pre-release/vX` from `vX`.
### Upgrade `stackspin.net` (only major releases)
Note that `stackspin.net` doesn't follow a release branch (`vX`) but a
pre-release branch.
* [ ] Apply the upgrade procedure to `stackspin.net`: following
https://docs.stackspin.net/en/latest/system_administration/upgrading.html
and any actions specific to this release.
* [ ] Configure the cluster to follow the new pre-release branch: `pre-release/vX`.
### Upgrade `staging.stackspin.net` (only major releases)
Note that `staging.stackspin.net` doesn't follow a release branch (`vX`) but the
`main` branch.
* [ ] Apply the upgrade procedure to `staging.stackspin.net`, but do not change
the branch followed by flux (keep it at `main`).
## Push a signed tag
* [ ] Make sure the main pipeline succeeds for the last commit before tagging. This
  is important because tags should not get retagged!
* [ ] Create and push signed tag
  (`git tag -s X.Y -m 'Release X.Y'; git push --tags`)
* If this is a major release:
* [ ] Log into <https://readthedocs.org> and update documentation for tagged versions.
If the new branch is not shown, rebuild the docs for any existing branch,
which will as a side-effect fetch all branches from our gitlab.
* [ ] Publish the release blog post.
* [ ] Create an issue for upgrading managed clusters:
* our own production clusters;
* our managed customer clusters.
* [ ] Create a new CI pipeline image:
https://docs.stackspin.net/en/latest/for_developers/contributing.html#ci-pipeline-image
## Post-release chores
* [ ] Announce the release in the public Stackspin matrix room.
* [ ] Notify Greenhost sysops that an upgrade to their cluster will happen
overnight.
* [ ] Update the "stable" CI docker image (used as base for the upgrade pipeline):
- `tag="open.greenhost.net:4567/stackspin/stackspin/stackspin-ci:vX"`
- `docker build -t $tag .`
- `docker push $tag`
* [ ] Merge the release branch back into `main`. This is necessary to propagate
the changes to CHANGELOG etc.
* [ ] After `demo.stackspin.net` is upgraded, create a new backup for the new
version that will be used for the nightly reset. See `cli.stackspin.net`, directory
`/srv/stackspin/clusters/demo.stackspin.net/custom-scripts`.
* [ ] Update the version of the stackspin repo on `cli.stackspin.net`:
- `git pull` in `/srv/stackspin` to get the latest version of the release branch.
- Update python requirements: `pip install -r requirements.txt`. Note
that we use a virtualenv on `cli.stackspin.net` that's in the PATH of the
root user.
* [ ] Close released milestone and set start date for the new milestone.
### Update flux
See available [releases](https://github.com/fluxcd/flux2/releases).
* [ ] Update the flux CLI version. In the new release branch:
```sh
current_flux_version=$(yq eval .flux.version ansible/group_vars/all/stackspin.yml)
new_flux_version=$(curl -s https://api.github.com/repos/fluxcd/flux2/releases/latest | jq -r .name | tr -d 'v')
sed -i "s/$current_flux_version/$new_flux_version/g" Dockerfile install/flux-version-check.sh docs/installation/install_cli.rst ansible/group_vars/all/stackspin.yml
```
* [ ] If this is a major release, create a new release branch (e.g. `v0.7`)
* [ ] After changing the required flux version in `flux-version-check.sh`,
you also need to rebuild the CI image to contain the new binary, otherwise
the check will fail in the CI.
Create a MR for the new branch with the following changes:
* [ ] Update flux version (daemon components running on clusters): see
`dev-tools/generate-flux-manifests/README.md`.
* Ensure all dependencies are updated and locked (locking should be done
only on the release branch):
* [ ] flux chart versions: `find flux2/* -type f -exec yq eval .spec.chart {} \;`
* [ ] Wait for MR to get merged
* [ ] After the new version of the CLI in particular has been merged to `main`,
manually run a pipeline for `main` (with `CI_PIPELINE_SOURCE: schedule`) so
the `main` CI image gets rebuilt with the new flux. Although we don't
regularly run pipelines for `main` anymore, this image is used by pipelines for
MRs that don't change the CI image.
## After release
## Celebration
* [ ] Log into <https://readthedocs.org> and update documentation for tagged versions
* [ ] Cherry-pick all changes from the release branch into main which
  shouldn't live only in the release branch (e.g. not the commit locking the
  dependencies)
* [ ] Create an issue for creating a release blog post
* [ ] Create an issue for cleaning up
* old branches/tags
* [ ] Celebrate :clinking_glass: !!