From 79cd52cd0d098498c2e7023ec3746790cb730ed2 Mon Sep 17 00:00:00 2001 From: Kris Hicks Date: Mon, 1 Jun 2026 09:36:25 -0700 Subject: [PATCH] ci(release): add canary upgrade coverage The release canary now exercises both install paths for dev artifacts: a clean install of the rolling dev release, and an upgrade from the latest stable release to dev. The upgrade jobs catch package-manager regressions that only appear when replacing an installed stable package, while the clean-install jobs keep coverage for first-time dev installs. Installer sourcing now follows the release under test. Stable installs use STABLE_INSTALL_SH_URL, which always points at main/install.sh, so the canary does not use a new dev installer to install an older stable release. Dev installs use DEV_INSTALL_SH_URL: automatic workflow_run canaries source install.sh from the moving dev tag, while manual dispatches source it from the dispatched branch ref so workflow and dev installer changes can be tested before they merge. Signed-off-by: Kris Hicks --- .agents/skills/test-release-canary/SKILL.md | 13 +- .github/workflows/release-canary.yml | 337 ++++++++++++++++++-- 2 files changed, 328 insertions(+), 22 deletions(-) diff --git a/.agents/skills/test-release-canary/SKILL.md b/.agents/skills/test-release-canary/SKILL.md index 4bf7d38ae..9be114419 100644 --- a/.agents/skills/test-release-canary/SKILL.md +++ b/.agents/skills/test-release-canary/SKILL.md @@ -33,7 +33,11 @@ on: - **Automatic.** Every successful `Release Dev` run (on `main` or a manual dispatch of Release Dev) fires the canary. Each job gates on `github.event.workflow_run.conclusion == 'success'` so a failed Release Dev does not run the canary. - **Manual.** `workflow_dispatch` lets you run the canary on demand against any branch's workflow definition. -When dispatched manually, `github.event.workflow_run.head_sha` is empty and the workflow falls back to `github.sha` (the branch tip) for the `install.sh` URL. 
+Stable installs use `STABLE_INSTALL_SH_URL`, which always points at +`main/install.sh`. Dev installs use `DEV_INSTALL_SH_URL`: automatic runs point +at the moving `dev` tag so the installer matches the published development +release, while manual dispatches use the dispatched ref name so branch changes +to the canary or dev installer can still be exercised. ## Manual dispatch @@ -59,9 +63,12 @@ gh run view --log-failed ## Iterating on the canary itself -When you change `release-canary.yml` on a branch, a manual dispatch on that branch tests *your branch's workflow logic* against *main's published artifacts* (`0.0.0-dev` chart, `:dev` images, latest tagged install.sh assets). This is what you want for iterating on the canary — you're validating that the canary still works against known-good artifacts. +When you change `release-canary.yml` on a branch, a manual dispatch on that branch tests *your branch's workflow logic* against *the latest published stable release artifacts* and the current published dev artifacts (`0.0.0-dev` chart, `:dev` images). This is what you want for iterating on the canary — you're validating that the canary still works against known-good artifacts. -Note `install.sh` is pulled from `raw.githubusercontent.com/NVIDIA/OpenShell/${head_sha}/install.sh`, so changes to `install.sh` on your branch *are* exercised even though the binaries it downloads are from the latest public tag. +Note that stable package installs always pull `install.sh` from `main`, while dev +package installs pull `install.sh` from the dispatched branch ref for manual +runs. Changes to dev installer behavior on your branch are exercised without +using that new installer to install an older stable release.
## Testing artifacts from a specific SHA diff --git a/.github/workflows/release-canary.yml b/.github/workflows/release-canary.yml index aa6d2a1aa..bbfc1195c 100644 --- a/.github/workflows/release-canary.yml +++ b/.github/workflows/release-canary.yml @@ -10,13 +10,43 @@ permissions: actions: read contents: read +env: + STABLE_INSTALL_SH_URL: https://raw.githubusercontent.com/NVIDIA/OpenShell/main/install.sh + DEV_INSTALL_SH_URL: https://raw.githubusercontent.com/NVIDIA/OpenShell/${{ github.event_name == 'workflow_run' && 'dev' || github.ref_name }}/install.sh + defaults: run: shell: bash jobs: macos: - name: macOS Homebrew + name: macOS Homebrew Upgrade + if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' }} + runs-on: macos-latest-xlarge + timeout-minutes: 30 + steps: + - name: Ensure VM driver + run: | + launchctl setenv OPENSHELL_DRIVERS vm + + - name: Clean install latest release and check status + run: | + set -euo pipefail + curl -LsSf "$STABLE_INSTALL_SH_URL" | sh + openshell --version + openshell status + + - name: Upgrade to dev release and check status + run: | + set -euo pipefail + before_version="$(openshell --version)" + curl -LsSf "$DEV_INSTALL_SH_URL" | OPENSHELL_VERSION=dev sh + after_version="$(openshell --version)" + echo "Upgraded OpenShell from ${before_version} to ${after_version}" + openshell status + + macos-dev-clean: + name: macOS Homebrew Clean Install if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' }} runs-on: macos-latest-xlarge timeout-minutes: 20 @@ -25,13 +55,48 @@ jobs: run: | launchctl setenv OPENSHELL_DRIVERS vm - - name: Install and check status + - name: Clean install and check status run: | - curl -LsSf https://raw.githubusercontent.com/NVIDIA/OpenShell/${{ github.event.workflow_run.head_sha || github.sha }}/install.sh | sh + set -euo pipefail + curl -LsSf "$DEV_INSTALL_SH_URL" | OPENSHELL_VERSION=dev sh + openshell --version 
openshell status ubuntu: - name: Ubuntu Docker + name: Ubuntu Docker Upgrade + if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' }} + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Ensure Docker + run: | + if ! command -v docker >/dev/null 2>&1; then + sudo apt-get update + sudo apt-get install -y docker.io + fi + sudo systemctl start docker || sudo service docker start + mkdir -p "${HOME}/.config/openshell" + printf 'OPENSHELL_DRIVERS=docker\n' > "${HOME}/.config/openshell/gateway.env" + docker info + + - name: Clean install latest release and check status + run: | + set -euo pipefail + curl -LsSf "$STABLE_INSTALL_SH_URL" | sh + openshell --version + openshell status + + - name: Upgrade to dev release and check status + run: | + set -euo pipefail + before_version="$(openshell --version)" + curl -LsSf "$DEV_INSTALL_SH_URL" | OPENSHELL_VERSION=dev sh + after_version="$(openshell --version)" + echo "Upgraded OpenShell from ${before_version} to ${after_version}" + openshell status + + ubuntu-dev-clean: + name: Ubuntu Docker Clean Install if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' }} runs-on: ubuntu-latest timeout-minutes: 20 @@ -47,13 +112,47 @@ jobs: printf 'OPENSHELL_DRIVERS=docker\n' > "${HOME}/.config/openshell/gateway.env" docker info - - name: Install and check status + - name: Clean install and check status run: | - curl -LsSf https://raw.githubusercontent.com/NVIDIA/OpenShell/${{ github.event.workflow_run.head_sha || github.sha }}/install.sh | sh + set -euo pipefail + curl -LsSf "$DEV_INSTALL_SH_URL" | OPENSHELL_VERSION=dev sh + openshell --version openshell status fedora: - name: Fedora RPM + name: Fedora RPM Upgrade + if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' }} + runs-on: linux-amd64-cpu8 + timeout-minutes: 30 + container: + image: fedora:latest + options: --privileged + 
steps: + - name: Ensure Podman + run: | + dnf install -y curl podman + mkdir -p "${HOME}/.config/openshell" + printf 'OPENSHELL_DRIVERS=podman\n' > "${HOME}/.config/openshell/gateway.env" + podman info + + - name: Clean install latest release and check status + run: | + set -euo pipefail + curl -LsSf "$STABLE_INSTALL_SH_URL" | sh + openshell --version + openshell status + + - name: Upgrade to dev release and check status + run: | + set -euo pipefail + before_version="$(openshell --version)" + curl -LsSf "$DEV_INSTALL_SH_URL" | OPENSHELL_VERSION=dev sh + after_version="$(openshell --version)" + echo "Upgraded OpenShell from ${before_version} to ${after_version}" + openshell status + + fedora-dev-clean: + name: Fedora RPM Clean Install if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' }} runs-on: linux-amd64-cpu8 timeout-minutes: 20 @@ -122,7 +221,7 @@ jobs: fi EOF - - name: Install and check status + - name: Clean install and check status run: | set -euo pipefail @@ -130,13 +229,14 @@ jobs: HOME=/root \ XDG_RUNTIME_DIR=/run/user/0 \ DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/0/bus \ - INSTALL_SH_URL="https://raw.githubusercontent.com/NVIDIA/OpenShell/${{ github.event.workflow_run.head_sha || github.sha }}/install.sh" \ + DEV_INSTALL_SH_URL="${DEV_INSTALL_SH_URL}" \ bash -s <<'EOF' set -euo pipefail mkdir -p "${HOME}/.config/openshell" printf 'OPENSHELL_DRIVERS=podman\n' > "${HOME}/.config/openshell/gateway.env" podman info - curl -LsSf "${INSTALL_SH_URL}" | sh + curl -LsSf "${DEV_INSTALL_SH_URL}" | OPENSHELL_VERSION=dev sh + openshell --version openshell status EOF @@ -146,8 +246,70 @@ jobs: docker rm -f "${FEDORA_CANARY_CONTAINER}" >/dev/null 2>&1 || true ubuntu-snap: - name: Ubuntu Snap - if: ${{ github.event.workflow_run.conclusion == 'success' }} + name: Ubuntu Snap Upgrade + if: ${{ github.event.workflow_run.conclusion == 'success' }} + runs-on: ubuntu-latest +
timeout-minutes: 30 + steps: + - name: Install snapd + run: | + set -euo pipefail + sudo apt-get update + sudo apt-get install -y snapd + sudo systemctl enable --now snapd.socket + sudo systemctl start snapd + sudo snap wait system seed.loaded + + - name: Install Docker snap + run: | + set -euo pipefail + sudo snap install docker + + - name: Clean install latest stable snap and check status + run: | + set -euo pipefail + sudo snap install openshell --channel latest/stable + + - name: Connect interfaces + run: | + set -euo pipefail + sudo snap connect openshell:docker docker:docker-daemon + sudo snap connect openshell:log-observe + sudo snap connect openshell:system-observe + sudo snap connect openshell:ssh-keys + + - name: Check clean snap install status + run: | + set -euo pipefail + openshell --version + sudo snap services openshell + openshell gateway add http://127.0.0.1:17670 --local --name snap-docker + openshell gateway select snap-docker + openshell status + + - name: Download snap from release-dev artifacts + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + github-token: ${{ github.token }} + run-id: ${{ github.event.workflow_run.id }} + pattern: snap-linux-amd64 + path: release/ + merge-multiple: true + + - name: Upgrade to release-dev snap artifact and check status + run: | + set -euo pipefail + before_version="$(openshell --version)" + sudo snap install ./release/*.snap --dangerous + sudo snap restart openshell.gateway + after_version="$(openshell --version)" + echo "Upgraded OpenShell snap from ${before_version} to ${after_version}" + sudo snap services openshell + openshell status + + ubuntu-snap-dev-clean: + name: Ubuntu Snap Clean Install + if: ${{ github.event.workflow_run.conclusion == 'success' }} runs-on: ubuntu-latest timeout-minutes: 20 steps: @@ -174,7 +336,7 @@ jobs: path: release/ merge-multiple: true - - name: Install snap (dangerous — from release, not
store) + - name: Install snap from release-dev artifact run: | set -euo pipefail sudo snap install ./release/*.snap --dangerous @@ -197,10 +359,10 @@ jobs: openshell status kubernetes: - name: Kubernetes Helm (kind) + name: Kubernetes Helm Upgrade (kind) if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' }} runs-on: ubuntu-latest - timeout-minutes: 20 + timeout-minutes: 30 env: KIND_CLUSTER_NAME: release-canary-${{ github.run_id }} RELEASE_NAME: openshell @@ -216,16 +378,62 @@ jobs: cluster_name: ${{ env.KIND_CLUSTER_NAME }} wait: 120s - - name: Install OpenShell Helm chart from GHCR OCI + - name: Resolve latest public chart version + run: | + set -euo pipefail + OCI_CHART="oci://ghcr.io/nvidia/openshell/helm-chart" + latest_url="$(curl -fLsS -o /dev/null -w '%{url_effective}' https://github.com/NVIDIA/OpenShell/releases/latest)" + latest_tag="${latest_url##*/}" + latest_version="${latest_tag#v}" + if [ -z "$latest_version" ] || [ "$latest_version" = "latest" ]; then + echo "Could not resolve latest public release from ${latest_url}" >&2 + exit 1 + fi + + candidates=("$latest_version") + if [ "$latest_tag" != "$latest_version" ]; then + candidates+=("$latest_tag") + fi + + for candidate in "${candidates[@]}"; do + if helm show chart "$OCI_CHART" --version "$candidate" >/dev/null 2>&1; then + echo "LATEST_CHART_VERSION=${candidate}" >> "$GITHUB_ENV" + echo "Resolved latest public chart version ${candidate} from release ${latest_tag}" + exit 0 + fi + done + + echo "Latest public release ${latest_tag} has no installable Helm chart at ${OCI_CHART}." 
>&2 + echo "Tried chart versions: ${candidates[*]}" >&2 + exit 1 + + - name: Clean install latest public Helm chart from GHCR OCI run: | set -euo pipefail helm install "$RELEASE_NAME" oci://ghcr.io/nvidia/openshell/helm-chart \ - --version 0.0.0-dev \ + --version "$LATEST_CHART_VERSION" \ --namespace "$RELEASE_NAMESPACE" --create-namespace \ --set server.disableTls=true \ --wait --timeout 5m - - name: Verify gateway pod is Ready + - name: Verify clean install gateway pod is Ready + run: | + set -euo pipefail + kubectl wait --namespace "$RELEASE_NAMESPACE" \ + --for=condition=Ready pod \ + --selector="app.kubernetes.io/name=openshell,app.kubernetes.io/instance=${RELEASE_NAME}" \ + --timeout=300s + + - name: Upgrade to dev Helm chart + run: | + set -euo pipefail + helm upgrade "$RELEASE_NAME" oci://ghcr.io/nvidia/openshell/helm-chart \ + --version 0.0.0-dev \ + --namespace "$RELEASE_NAMESPACE" \ + --set server.disableTls=true \ + --wait --timeout 5m + + - name: Verify upgraded gateway pod is Ready run: | set -euo pipefail kubectl wait --namespace "$RELEASE_NAMESPACE" \ @@ -256,7 +464,7 @@ jobs: set -euo pipefail mkdir -p "${HOME}/.config/openshell" printf 'OPENSHELL_DRIVERS=docker\n' > "${HOME}/.config/openshell/gateway.env" - curl -LsSf https://raw.githubusercontent.com/NVIDIA/OpenShell/${{ github.event.workflow_run.head_sha || github.sha }}/install.sh | sh + curl -LsSf "$STABLE_INSTALL_SH_URL" | sh - name: Register kind gateway and check status run: | @@ -286,3 +494,94 @@ jobs: openshell gateway list 2>/dev/null echo "--- openshell version ---" openshell --version 2>/dev/null + + kubernetes-dev-clean: + name: Kubernetes Helm Clean Install (kind) + if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' }} + runs-on: ubuntu-latest + timeout-minutes: 20 + env: + KIND_CLUSTER_NAME: release-canary-dev-${{ github.run_id }} + RELEASE_NAME: openshell + RELEASE_NAMESPACE: openshell + KIND_GATEWAY_NAME: kind-dev + steps: + - 
name: Install Helm + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 + + - name: Create kind cluster + uses: helm/kind-action@ef37e7f390d99f746eb8b610417061a60e82a6cc # v1.14.0 + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait: 120s + + - name: Clean install Helm chart from GHCR OCI + run: | + set -euo pipefail + helm install "$RELEASE_NAME" oci://ghcr.io/nvidia/openshell/helm-chart \ + --version 0.0.0-dev \ + --namespace "$RELEASE_NAMESPACE" --create-namespace \ + --set server.disableTls=true \ + --wait --timeout 5m + + - name: Verify clean install gateway pod is Ready + run: | + set -euo pipefail + kubectl wait --namespace "$RELEASE_NAMESPACE" \ + --for=condition=Ready pod \ + --selector="app.kubernetes.io/name=openshell,app.kubernetes.io/instance=${RELEASE_NAME}" \ + --timeout=300s + + - name: Port-forward gateway service + run: | + set -euo pipefail + nohup kubectl port-forward --namespace "$RELEASE_NAMESPACE" \ + "svc/${RELEASE_NAME}" 8080:8080 \ + > port-forward-dev.log 2>&1 & + echo $! 
> port-forward-dev.pid + for _ in $(seq 1 30); do + if (echo > /dev/tcp/127.0.0.1/8080) >/dev/null 2>&1; then + echo "port-forward is reachable" + exit 0 + fi + sleep 1 + done + echo "port-forward did not become reachable" >&2 + cat port-forward-dev.log >&2 + exit 1 + + - name: Install OpenShell CLI + run: | + set -euo pipefail + mkdir -p "${HOME}/.config/openshell" + printf 'OPENSHELL_DRIVERS=docker\n' > "${HOME}/.config/openshell/gateway.env" + curl -LsSf "$DEV_INSTALL_SH_URL" | OPENSHELL_VERSION=dev sh + + - name: Register kind gateway and check status + run: | + set -euo pipefail + openshell gateway add http://127.0.0.1:8080 --local --name "$KIND_GATEWAY_NAME" + openshell status + + - name: Diagnostics on failure + if: failure() + run: | + set +e + echo "--- helm status ---" + helm status "$RELEASE_NAME" --namespace "$RELEASE_NAMESPACE" + echo "--- helm get manifest ---" + helm get manifest "$RELEASE_NAME" --namespace "$RELEASE_NAMESPACE" + echo "--- get all ---" + kubectl get all --namespace "$RELEASE_NAMESPACE" + echo "--- describe pods ---" + kubectl describe pods --namespace "$RELEASE_NAMESPACE" + echo "--- pod logs ---" + kubectl logs --namespace "$RELEASE_NAMESPACE" \ + --selector="app.kubernetes.io/name=openshell,app.kubernetes.io/instance=${RELEASE_NAME}" \ + --tail=200 --all-containers --prefix + echo "--- port-forward log ---" + cat port-forward-dev.log 2>/dev/null + echo "--- openshell gateway list ---" + openshell gateway list 2>/dev/null + echo "--- openshell version ---" + openshell --version 2>/dev/null