From fcfade8bf8a3878c6cc3e5dd92a24a117556990f Mon Sep 17 00:00:00 2001
From: Prem Kumar Kalle
Date: Fri, 12 Jun 2026 18:30:26 -0700
Subject: [PATCH] Fix(ci): support Ubuntu Noble stemcell in create-bosh-lite

The cf-deployment default stemcell moved from ubuntu-jammy to ubuntu-noble
(cloudfoundry/cf-deployment#1224), which broke the create-bosh-lite workflow.
Three Noble-specific problems plus supporting fixes:

- Warden agents wouldn't connect: Noble warden containers boot under systemd,
  which requires bbl >= 9.0.41 (warden_cpi start_containers_with_systemd:true).
  Provided via the BBL_CLI_VERSION repository variable (>= 9.0.41; set to
  9.0.45).
- External DNS broken inside containers: the Noble bosh-dns config lives under
  the "bosh-dns-systemd" addon with disable_recursors:true, so diego-cells
  couldn't resolve buildpacks.cloudfoundry.org and app staging failed ("server
  misbehaving"). bosh-dns-noble-bosh-lite.yml enables recursion with the
  169.254.169.254 recursor, applied to the dns runtime-config.
- App Envoy sidecars crashed on start ("inotify_fd_ >= 0", exit 134): with
  systemd in every warden container the director host's
  fs.inotify.max_user_instances (128) was exhausted. director-inotify.yml adds
  an os-conf sysctl job (1024 / 524288) to the director via
  create-director-override.sh. inotify limits bind at the host root user
  namespace, so this is set on the director VM, not the diego-cell.
- Increase the bosh-lite director VM to n2-standard-16 (64 GB): the whole
  deployment runs as warden containers on one VM, and 32 GB (n2-standard-8)
  overcommitted memory.
- Fix the failure-cleanup step: `bbl down` was passed
  --gcp-service-account-key=key.json (no such file is created), so it parsed
  the literal string as JSON and failed, leaving orphaned infrastructure on any
  failed run. Authenticate via BBL_GCP_SERVICE_ACCOUNT_KEY, like `bbl up`.

Requires the BBL_CLI_VERSION repository variable to be >= 9.0.41.

Signed-off-by: Prem Kumar Kalle --- .../create-director-override.sh | 23 +++++++++++++++++ .../ops-files/bosh-dns-noble-bosh-lite.yml | 15 +++++++++++ .github/ops-files/bosh-lite-vm-type.yml | 8 ++++-- .github/ops-files/director-inotify.yml | 25 +++++++++++++++++++ .github/workflows/create-bosh-lite.yml | 12 +++++++-- 5 files changed, 79 insertions(+), 4 deletions(-) create mode 100644 .github/bosh-lite-files/create-director-override.sh create mode 100644 .github/ops-files/bosh-dns-noble-bosh-lite.yml create mode 100644 .github/ops-files/director-inotify.yml diff --git a/.github/bosh-lite-files/create-director-override.sh b/.github/bosh-lite-files/create-director-override.sh new file mode 100644 index 00000000000..803d10c77ae --- /dev/null +++ b/.github/bosh-lite-files/create-director-override.sh @@ -0,0 +1,23 @@ +#!/bin/sh +# Overrides bbl's generated create-director.sh so we can apply extra ops files +# to the BOSH Lite director. Mirrors the stock bosh-lite-gcp plan-patch override +# and adds director-inotify.yml (raises fs.inotify limits on the director host +# so Noble app Envoy sidecars don't crash with "inotify_fd_ >= 0"). 
+bosh create-env \
+  "${BBL_STATE_DIR}/bosh-deployment/bosh.yml" \
+  --state "${BBL_STATE_DIR}/vars/bosh-state.json" \
+  --vars-store "${BBL_STATE_DIR}/vars/director-vars-store.yml" \
+  --vars-file "${BBL_STATE_DIR}/vars/director-vars-file.yml" \
+  --var-file gcp_credentials_json="${BBL_GCP_SERVICE_ACCOUNT_KEY_PATH}" \
+  -v project_id="${BBL_GCP_PROJECT_ID}" \
+  -v zone="${BBL_GCP_ZONE}" \
+  -o "${BBL_STATE_DIR}/bosh-deployment/gcp/cpi.yml" \
+  -o "${BBL_STATE_DIR}/bosh-deployment/jumpbox-user.yml" \
+  -o "${BBL_STATE_DIR}/bosh-deployment/uaa.yml" \
+  -o "${BBL_STATE_DIR}/bosh-deployment/credhub.yml" \
+  -o "${BBL_STATE_DIR}/bosh-deployment/bosh-lite.yml" \
+  -o "${BBL_STATE_DIR}/bosh-deployment/bosh-lite-runc.yml" \
+  -o "${BBL_STATE_DIR}/bosh-deployment/gcp/bosh-lite-vm-type.yml" \
+  -o "${BBL_STATE_DIR}/bosh-deployment/gcp/director-inotify.yml" \
+  -o "${BBL_STATE_DIR}/external-ip-gcp.yml" \
+  -o "${BBL_STATE_DIR}/ip-forwarding.yml"
diff --git a/.github/ops-files/bosh-dns-noble-bosh-lite.yml b/.github/ops-files/bosh-dns-noble-bosh-lite.yml
new file mode 100644
index 00000000000..ff7ae00a8c4
--- /dev/null
+++ b/.github/ops-files/bosh-dns-noble-bosh-lite.yml
@@ -0,0 +1,15 @@
+---
+# BOSH DNS recursor fix for Ubuntu Noble on BOSH Lite (GCP).
+# bosh-deployment's dns.yml places the Noble bosh-dns config under the
+# "bosh-dns-systemd" addon (NOT the "bosh-dns" addon, which only covers
+# trusty/xenial/bionic/jammy) with disable_recursors: true. That blocks
+# external DNS resolution (e.g. buildpacks.cloudfoundry.org) inside diego-cell
+# containers, so app staging fails with "lookup ... server misbehaving".
+# Enable recursion and forward to GCP's metadata resolver.
+- type: replace
+  path: /addons/name=bosh-dns-systemd/jobs/name=bosh-dns/properties/disable_recursors
+  value: false
+- type: replace
+  path: /addons/name=bosh-dns-systemd/jobs/name=bosh-dns/properties/recursors?
+ value: + - 169.254.169.254 diff --git a/.github/ops-files/bosh-lite-vm-type.yml b/.github/ops-files/bosh-lite-vm-type.yml index eebd6a6e886..00cd2b2d30b 100644 --- a/.github/ops-files/bosh-lite-vm-type.yml +++ b/.github/ops-files/bosh-lite-vm-type.yml @@ -1,8 +1,12 @@ --- -# Configure sizes for bosh-lite on gcp +# Configure sizes for bosh-lite on gcp. +# n2-standard-16 (16 vCPU / 64 GB): the whole cf-deployment runs as warden +# containers on this single director VM; on Ubuntu Noble each container runs a +# full systemd PID 1, so 32 GB (n2-standard-8) overcommits memory and a random +# instance-group agent fails to boot ("Timed out pinging VM"). 64 GB gives headroom. - type: replace path: /resource_pools/name=vms/cloud_properties/machine_type - value: n2-standard-8 + value: n2-standard-16 - type: replace path: /disk_pools/name=disks/disk_size value: 250000 diff --git a/.github/ops-files/director-inotify.yml b/.github/ops-files/director-inotify.yml new file mode 100644 index 00000000000..e41e33808fe --- /dev/null +++ b/.github/ops-files/director-inotify.yml @@ -0,0 +1,25 @@ +--- +# Raise inotify limits on the BOSH Lite director VM (the host running every +# warden container). On Ubuntu Noble each warden container runs systemd as PID 1 +# (start_containers_with_systemd), a heavy inotify consumer, so the host's +# default fs.inotify.max_user_instances (128) is exhausted. App Envoy sidecars +# then abort with "assert failure: inotify_fd_ >= 0" (Exit status 134), which +# marks every app instance CRASHED even though staging succeeds. +# +# inotify limits are enforced at the host root user namespace (a new userns +# defaults to unlimited and inc_ucount checks every ancestor up to root), so +# this MUST be set on the director VM, NOT on the diego-cell. +# +# The os-conf release is already declared by bosh-deployment's bosh-lite.yml +# (which create-director-override.sh applies before this file, for its +# disable_agent job), so we only add the sysctl job here. 
Re-declaring the +# release fails with "releases[N].name 'os-conf' must be unique". +- type: replace + path: /instance_groups/name=bosh/jobs/- + value: + name: sysctl + release: os-conf + properties: + sysctl: + - fs.inotify.max_user_instances=1024 + - fs.inotify.max_user_watches=524288 diff --git a/.github/workflows/create-bosh-lite.yml b/.github/workflows/create-bosh-lite.yml index c9e31fc90e3..3ae45cc7f4c 100644 --- a/.github/workflows/create-bosh-lite.yml +++ b/.github/workflows/create-bosh-lite.yml @@ -81,6 +81,12 @@ jobs: cp ${GITHUB_WORKSPACE}/cli/.github/bosh-lite-files/bosh-lite-dns.tf terraform/ cp ${GITHUB_WORKSPACE}/cli/.github/bosh-lite-files/bosh-lite.tfvars vars/ cp ${GITHUB_WORKSPACE}/cli/.github/ops-files/bosh-lite-vm-type.yml bosh-deployment/gcp/ + cp ${GITHUB_WORKSPACE}/cli/.github/ops-files/director-inotify.yml bosh-deployment/gcp/ + # Overwrite the plan-patch's stock create-director-override.sh with ours + # (bbl runs *-override.sh in preference to the generated create-director.sh) + # so the director gets director-inotify.yml during bosh create-env. 
+ cp ${GITHUB_WORKSPACE}/cli/.github/bosh-lite-files/create-director-override.sh create-director-override.sh + chmod +x create-director-override.sh bbl up - name: Authenticate to Google Cloud @@ -131,7 +137,9 @@ jobs: cd $env_name/bbl-state eval "$(bbl print-env --shell-type posix)" - bosh update-runtime-config ${GITHUB_WORKSPACE}/bosh-deployment/runtime-configs/dns.yml --name dns + bosh update-runtime-config ${GITHUB_WORKSPACE}/bosh-deployment/runtime-configs/dns.yml \ + -o ${GITHUB_WORKSPACE}/cli/.github/ops-files/bosh-dns-noble-bosh-lite.yml \ + --name dns STEMCELL_VERSION=$(bosh interpolate ${GITHUB_WORKSPACE}/cf-deployment/cf-deployment.yml --path /stemcells/alias=default/version) bosh upload-stemcell "https://bosh.io/d/stemcells/bosh-warden-boshlite-ubuntu-noble?v=${STEMCELL_VERSION}" bosh update-cloud-config ${GITHUB_WORKSPACE}/cf-deployment/iaas-support/bosh-lite/cloud-config.yml @@ -167,7 +175,7 @@ jobs: eval "$(bbl print-env --shell-type posix)" echo "Deleting env ${env_name}" - bbl down --no-confirm --gcp-service-account-key=key.json + bbl down --no-confirm echo "Deleting bbl state directory" if gsutil ls gs://cf-cli-bosh-lites | grep -q /${env_name}/; then