|
| 1 | +#!/usr/bin/env bash |
| 2 | +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 3 | +# SPDX-License-Identifier: Apache-2.0 |
| 4 | + |
| 5 | +# Run an e2e command against a Helm-deployed OpenShell gateway in Kubernetes. |
| 6 | +# |
| 7 | +# Modes: |
| 8 | +# - OPENSHELL_E2E_KUBE_CONTEXT set: |
| 9 | +# Target the named kubectl context, install the chart into an ephemeral |
| 10 | +# namespace, and port-forward the gateway. Cluster lifecycle is the |
| 11 | +# caller's responsibility (e.g. CI provisions kind via helm/kind-action). |
| 12 | +# - OPENSHELL_E2E_KUBE_CONTEXT unset: |
| 13 | +# Create a local k3d cluster via tasks/scripts/helm-k3s-local.sh, install |
| 14 | +# the chart, port-forward, and tear the cluster down on exit. |
| 15 | +# |
| 16 | +# Helm e2e currently uses plaintext gateway traffic (ci/values-tls-disabled.yaml). |
| 17 | +# |
| 18 | +# Image source: helm install pulls from ${OPENSHELL_REGISTRY}/{gateway,supervisor}:${IMAGE_TAG} |
| 19 | +# (defaults: ghcr.io/nvidia/openshell, latest). CI sets IMAGE_TAG to the commit SHA; |
| 20 | +# local devs should set it to a tag pulled from a registry the cluster can reach, |
| 21 | +# or build and import images via a separate bootstrap step before running this script. |
| 22 | + |
set -euo pipefail

# The positional arguments are the e2e command to execute against the gateway.
[ "$#" -ge 1 ] || {
  echo "Usage: e2e/with-kube-gateway.sh <command> [args...]" >&2
  exit 2
}

# Repository root (this script lives under e2e/), then the shared gateway
# helpers (provides e2e_pick_port and e2e_register_plaintext_gateway).
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# shellcheck source=e2e/support/gateway-common.sh
source "${ROOT}/e2e/support/gateway-common.sh"
| 33 | + |
# Scratch area for the kubeconfig, logs, and isolated CLI state; removed in
# cleanup(). Honors TMPDIR, trimming a trailing slash before calling mktemp.
tmp_root="${TMPDIR:-/tmp}"
WORKDIR="$(mktemp -d "${tmp_root%/}/openshell-e2e-kube.XXXXXX")"

# Mutable run state, consulted by cleanup() to decide what to tear down.
CLUSTER_CREATED_BY_US=0   # 1 only when this script created the k3d cluster
CLUSTER_NAME=""           # set in self-managed (k3d) mode
KUBE_CONTEXT=""           # kubectl context used by every kctl/helmctl call
NAMESPACE="openshell"
RELEASE_NAME="openshell"
PORTFORWARD_PID=""        # background kubectl port-forward, once started
PORTFORWARD_LOG="${WORKDIR}/portforward.log"
HELM_INSTALLED=0          # 1 once `helm install` has succeeded

# Isolate CLI/SDK gateway metadata from the developer's real config.
export XDG_CONFIG_HOME="${WORKDIR}/config"
export XDG_DATA_HOME="${WORKDIR}/data"
| 50 | + |
# kubectl pinned to the context selected for this run.
kctl() {
  local -a base=(kubectl --context "${KUBE_CONTEXT}")
  "${base[@]}" "$@"
}
| 54 | + |
# helm pinned to the context selected for this run.
helmctl() {
  local -a base=(helm --kube-context "${KUBE_CONTEXT}")
  "${base[@]}" "$@"
}
| 58 | + |
# EXIT-trap teardown, run on every exit path (success, failure, signal).
# Order matters: stop the port-forward first, dump diagnostics while the
# resources still exist, uninstall the release, delete the cluster we own,
# then remove the scratch directory. Every step is best-effort (|| true) so
# a failed teardown step cannot mask the script's real exit code.
cleanup() {
  # Capture $? first so later commands don't clobber the script's status.
  local exit_code=$?

  # Reap the background kubectl port-forward if it was started.
  if [ -n "${PORTFORWARD_PID}" ]; then
    kill "${PORTFORWARD_PID}" >/dev/null 2>&1 || true
    wait "${PORTFORWARD_PID}" >/dev/null 2>&1 || true
  fi

  # On failure only: emit pod state, recent events, gateway logs, and the
  # port-forward log for CI debugging, before anything is uninstalled.
  if [ "${exit_code}" -ne 0 ] && [ -n "${KUBE_CONTEXT}" ] && [ -n "${NAMESPACE}" ]; then
    if command -v kubectl >/dev/null 2>&1 \
      && kctl get namespace "${NAMESPACE}" >/dev/null 2>&1; then
      echo "=== gateway pod state (preserved for debugging) ==="
      kctl -n "${NAMESPACE}" get pods -o wide 2>&1 || true
      echo "=== gateway events ==="
      kctl -n "${NAMESPACE}" get events --sort-by=.lastTimestamp 2>&1 \
        | tail -n 80 || true
      echo "=== gateway logs (last 200 lines) ==="
      kctl -n "${NAMESPACE}" logs \
        -l "app.kubernetes.io/instance=${RELEASE_NAME}" --tail=200 \
        --all-containers --prefix 2>&1 || true
      echo "=== end gateway debug output ==="
    fi
    if [ -f "${PORTFORWARD_LOG}" ]; then
      echo "=== port-forward log ==="
      cat "${PORTFORWARD_LOG}" || true
      echo "=== end port-forward log ==="
    fi
  fi

  # Uninstall only when install actually succeeded (HELM_INSTALLED is set
  # right after `helm install` returns). Namespace deletion is async
  # (--wait=false) to keep teardown fast.
  if [ "${HELM_INSTALLED}" = "1" ] && [ -n "${KUBE_CONTEXT}" ] && [ -n "${NAMESPACE}" ]; then
    if command -v helm >/dev/null 2>&1; then
      helmctl uninstall "${RELEASE_NAME}" --namespace "${NAMESPACE}" --wait \
        --timeout 60s >/dev/null 2>&1 || true
    fi
    if command -v kubectl >/dev/null 2>&1; then
      kctl delete namespace "${NAMESPACE}" --wait=false \
        --ignore-not-found >/dev/null 2>&1 || true
    fi
  fi

  # Delete the k3d cluster only in self-managed mode; caller-provided
  # clusters (OPENSHELL_E2E_KUBE_CONTEXT) are never touched.
  if [ "${CLUSTER_CREATED_BY_US}" = "1" ] && [ -n "${CLUSTER_NAME}" ]; then
    if command -v k3d >/dev/null 2>&1 && k3d cluster list "${CLUSTER_NAME}" \
      >/dev/null 2>&1; then
      echo "Deleting ephemeral k3d cluster ${CLUSTER_NAME}..."
      k3d cluster delete "${CLUSTER_NAME}" >/dev/null 2>&1 || true
    fi
  fi

  rm -rf "${WORKDIR}" 2>/dev/null || true
}
trap cleanup EXIT
| 110 | + |
# Abort (exit 2) with a diagnostic on stderr when a required tool is absent.
require_cmd() {
  command -v "$1" >/dev/null 2>&1 && return 0
  echo "ERROR: $1 is required to run Helm-backed e2e tests" >&2
  exit 2
}
| 117 | + |
require_cmd helm
require_cmd kubectl
require_cmd curl

if [ -n "${OPENSHELL_E2E_KUBE_CONTEXT:-}" ]; then
  # Caller-managed cluster: only validate that the named context answers.
  KUBE_CONTEXT="${OPENSHELL_E2E_KUBE_CONTEXT}"
  echo "Using existing kubectl context: ${KUBE_CONTEXT}"
  if ! kctl cluster-info >/dev/null 2>&1; then
    echo "ERROR: kubectl context '${KUBE_CONTEXT}' is not reachable." >&2
    exit 2
  fi
else
  # Self-managed mode: create a throwaway k3d cluster. The name embeds the
  # PID plus the trailing digits of the epoch so concurrent runs on the same
  # host don't collide.
  require_cmd k3d
  CLUSTER_NAME="oshe2e-$$-$(date +%s | tail -c 8)"
  echo "Creating ephemeral k3d cluster ${CLUSTER_NAME}..."
  HELM_K3S_CLUSTER_NAME="${CLUSTER_NAME}" \
  HELM_K3S_KUBECONFIG="${WORKDIR}/kubeconfig" \
  bash "${ROOT}/tasks/scripts/helm-k3s-local.sh" create
  CLUSTER_CREATED_BY_US=1
  # The helper presumably writes the kubeconfig at HELM_K3S_KUBECONFIG and
  # k3d names contexts "k3d-<cluster>" — TODO confirm against the helper.
  export KUBECONFIG="${WORKDIR}/kubeconfig"
  KUBE_CONTEXT="k3d-${CLUSTER_NAME}"
fi
| 140 | + |
# Image coordinates for the chart: CI sets IMAGE_TAG to the commit SHA,
# local runs fall back to "latest". A trailing slash on the registry is
# trimmed so "/gateway" and "/supervisor" can be appended cleanly.
IMAGE_TAG_VALUE="${IMAGE_TAG:-latest}"
registry_raw="${OPENSHELL_REGISTRY:-ghcr.io/nvidia/openshell}"
REGISTRY_VALUE="${registry_raw%/}"
| 144 | + |
# When this script created the cluster, import locally-available gateway and
# supervisor images so devs without a registry login can iterate. Best-effort:
# missing images fall through to the cluster's pull behavior at install time.
if [ "${CLUSTER_CREATED_BY_US}" = "1" ]; then
  for image in \
    "${REGISTRY_VALUE}/gateway:${IMAGE_TAG_VALUE}" \
    "${REGISTRY_VALUE}/supervisor:${IMAGE_TAG_VALUE}"; do
    # `docker image inspect` doubles as the existence check; if docker is
    # absent or the image isn't present locally, the condition simply fails
    # and the image is skipped (no abort under set -e inside an if).
    if docker image inspect "${image}" >/dev/null 2>&1; then
      echo "Importing ${image} into k3d cluster ${CLUSTER_NAME}..."
      # --mode direct selects k3d's import strategy — see k3d docs.
      k3d image import "${image}" --cluster "${CLUSTER_NAME}" \
        --mode direct >/dev/null
    fi
  done
fi
| 159 | + |
# The Kubernetes compute driver creates and watches Sandbox CRs reconciled
# by the upstream agent-sandbox-controller. Without the CRD + controller,
# every gateway K8s call 404s and CreateSandbox never produces a Pod.
echo "Installing agent-sandbox CRDs and controller..."
kctl apply -f "${ROOT}/deploy/kube/manifests/agent-sandbox.yaml"
# Wait for the CRD to be servable before anything depends on it.
kctl wait --for=condition=Established crd/sandboxes.agents.x-k8s.io --timeout=120s
# The controller runs as a StatefulSet in its own namespace (per the manifest).
kctl -n agent-sandbox-system rollout status statefulset/agent-sandbox-controller --timeout=300s
| 167 | + |
# Install the chart and block (--wait) until workloads are Ready. TLS is
# disabled via the CI values file, so the forwarded endpoint speaks plain
# HTTP. fullnameOverride pins the Service name "openshell" that the
# port-forward below targets.
echo "Installing Helm chart (release=${RELEASE_NAME}, namespace=${NAMESPACE}, tag=${IMAGE_TAG_VALUE})..."
helmctl install "${RELEASE_NAME}" "${ROOT}/deploy/helm/openshell" \
  --namespace "${NAMESPACE}" --create-namespace \
  --values "${ROOT}/deploy/helm/openshell/ci/values-tls-disabled.yaml" \
  --set "fullnameOverride=openshell" \
  --set "image.repository=${REGISTRY_VALUE}/gateway" \
  --set "image.tag=${IMAGE_TAG_VALUE}" \
  --set "supervisor.image.repository=${REGISTRY_VALUE}/supervisor" \
  --set "supervisor.image.tag=${IMAGE_TAG_VALUE}" \
  --wait --timeout 5m
# Set only after success: cleanup() keys uninstall/namespace-delete off this.
# NOTE(review): if install itself fails on a caller-provided cluster, the
# --create-namespace namespace is left behind — confirm this is intentional.
HELM_INSTALLED=1
| 179 | + |
# e2e_pick_port comes from gateway-common.sh; presumably returns a free
# local TCP port — confirm in the sourced helper. The port-forward runs in
# the background; its log is kept for the failure dump in cleanup().
LOCAL_PORT="$(e2e_pick_port)"
echo "Starting kubectl port-forward svc/openshell ${LOCAL_PORT}:8080..."
kctl -n "${NAMESPACE}" port-forward "svc/openshell" \
  "${LOCAL_PORT}:8080" >"${PORTFORWARD_LOG}" 2>&1 &
PORTFORWARD_PID=$!
| 185 | + |
# Wait up to ${timeout}s for the forwarded port to answer, polling once per
# second. Exits 1 (triggering cleanup's debug dump) on death or timeout.
elapsed=0
timeout=30
while [ "${elapsed}" -lt "${timeout}" ]; do
  # Fail fast if the port-forward process itself has already died.
  if ! kill -0 "${PORTFORWARD_PID}" 2>/dev/null; then
    echo "ERROR: kubectl port-forward exited before becoming reachable" >&2
    cat "${PORTFORWARD_LOG}" >&2 || true
    exit 1
  fi
  # --max-time caps the whole request: without it, a socket that accepts the
  # connection but never responds would hang curl indefinitely and the
  # ${timeout}s budget would never be enforced.
  if curl -s -o /dev/null --connect-timeout 1 --max-time 2 \
      "http://127.0.0.1:${LOCAL_PORT}"; then
    break
  fi
  sleep 1
  elapsed=$((elapsed + 1))
done
if [ "${elapsed}" -ge "${timeout}" ]; then
  echo "ERROR: port-forward did not accept TCP within ${timeout}s" >&2
  cat "${PORTFORWARD_LOG}" >&2 || true
  exit 1
fi
| 205 | + |
# Register the forwarded endpoint as a plaintext gateway inside the isolated
# XDG config tree (helper sourced from gateway-common.sh).
GATEWAY_NAME="openshell-e2e-kube-${LOCAL_PORT}"
GATEWAY_ENDPOINT="http://127.0.0.1:${LOCAL_PORT}"
e2e_register_plaintext_gateway "${XDG_CONFIG_HOME}" "${GATEWAY_NAME}" \
  "${GATEWAY_ENDPOINT}" "${LOCAL_PORT}"

# Environment contract consumed by the e2e command under test.
export OPENSHELL_GATEWAY="${GATEWAY_NAME}"
export OPENSHELL_E2E_DRIVER="kubernetes"
export OPENSHELL_E2E_SANDBOX_NAMESPACE="${NAMESPACE}"
export OPENSHELL_PROVISION_TIMEOUT="${OPENSHELL_PROVISION_TIMEOUT:-300}"

# Finally, run the caller's command; its exit status becomes ours (and
# drives the cleanup trap's debug-dump decision).
printf 'Running e2e command against %s: %s\n' "${GATEWAY_ENDPOINT}" "$*"
"$@"