Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions .github/workflows/e2e-gpu-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ permissions:

jobs:
e2e-gpu:
name: "E2E GPU (${{ matrix.name }})"
name: "E2E Docker GPU (${{ matrix.name }})"
runs-on: ${{ matrix.runner }}
continue-on-error: ${{ matrix.experimental }}
timeout-minutes: 30
Expand Down Expand Up @@ -55,8 +55,12 @@ jobs:
- name: Log in to GHCR
run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin

- name: Install Python dependencies and generate protobuf stubs
run: uv sync --frozen && mise run --no-deps python:proto
- name: Check Docker GPU prerequisites
run: |
docker info --format '{{json .CDISpecDirs}}'
GPU_PROBE_IMAGE="$(awk '$1 == "FROM" && $3 == "AS" && $4 == "gateway" { print $2; exit }' deploy/docker/Dockerfile.images)"
test -n "${GPU_PROBE_IMAGE}"
docker run --rm --device nvidia.com/gpu=all "${GPU_PROBE_IMAGE}" nvidia-smi -L

- name: Run tests
run: mise run --no-deps --skip-deps e2e:python:gpu
run: mise run --no-deps --skip-deps e2e:docker:gpu
8 changes: 8 additions & 0 deletions TESTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,14 @@ Suites:
- Docker suite (`--features e2e-docker`) - common suite plus Docker-only coverage such as Dockerfile image builds, Docker preflight checks, and managed Docker gateway resume.
- Docker GPU suite (`--features e2e-docker-gpu`) - Docker suite plus GPU sandbox smoke coverage.

GPU device-selection tests compare OpenShell sandboxes against a plain Docker or
Podman container that requests `--device nvidia.com/gpu=all`. The probe image
defaults to the image used by the `gateway` stage in
`deploy/docker/Dockerfile.images`; set `OPENSHELL_E2E_GPU_PROBE_IMAGE` to
override it. Per-device checks run only for NVIDIA CDI device IDs reported by
the runtime's discovered devices list, so WSL2 hosts that expose only
`nvidia.com/gpu=all` skip the index-based cases.

Run the Docker-backed Rust CLI e2e suite:

```shell
Expand Down
5 changes: 3 additions & 2 deletions crates/openshell-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1061,8 +1061,9 @@ enum SandboxCommands {
#[arg(long)]
gpu: bool,

/// Target a specific GPU by PCI address (e.g. "0000:2d:00.0") or index (e.g. "0", "1").
/// Only valid with --gpu. When omitted with --gpu, the first available GPU is assigned.
/// Target a driver-specific GPU device. Docker and Podman use CDI device IDs
/// (for example "nvidia.com/gpu=0"); VM uses a PCI BDF or index.
/// Only valid with --gpu. When omitted with --gpu, the driver uses its default GPU selection.
#[arg(long, requires = "gpu")]
gpu_device: Option<String>,

Expand Down
48 changes: 48 additions & 0 deletions crates/openshell-core/src/gpu.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

//! Shared GPU request helpers.

use crate::config::CDI_GPU_DEVICE_ALL;

/// Resolve the existing GPU request fields into CDI device identifiers.
///
/// Returns `None` when no GPU was requested (`gpu == false`). When a GPU is
/// requested without an explicit device ID (empty or whitespace-only
/// `gpu_device`), the CDI all-GPU request is used; otherwise the
/// driver-native ID passes through with surrounding whitespace trimmed.
#[must_use]
pub fn cdi_gpu_device_ids(gpu: bool, gpu_device: &str) -> Option<Vec<String>> {
    gpu.then(|| {
        // Treat whitespace-only IDs the same as unset, matching the
        // `.trim().is_empty()` convention used for other spec fields, so a
        // stray " " never becomes a literal (invalid) CDI device ID.
        let device = gpu_device.trim();
        if device.is_empty() {
            vec![CDI_GPU_DEVICE_ALL.to_string()]
        } else {
            vec![device.to_string()]
        }
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn cdi_gpu_device_ids_returns_none_when_absent() {
        // No GPU requested: the device string is irrelevant.
        assert!(cdi_gpu_device_ids(false, "").is_none());
    }

    #[test]
    fn cdi_gpu_device_ids_defaults_empty_request_to_all_gpus() {
        // A GPU request without an explicit ID resolves to the all-GPU entry.
        let ids = cdi_gpu_device_ids(true, "");
        assert_eq!(ids, Some(vec![CDI_GPU_DEVICE_ALL.to_string()]));
    }

    #[test]
    fn cdi_gpu_device_ids_passes_explicit_device_id_through() {
        // Driver-native IDs are forwarded unchanged.
        let requested = "nvidia.com/gpu=0";
        let ids = cdi_gpu_device_ids(true, requested);
        assert_eq!(ids, Some(vec![requested.to_string()]));
    }
}
1 change: 1 addition & 0 deletions crates/openshell-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
pub mod config;
pub mod error;
pub mod forward;
pub mod gpu;
pub mod image;
pub mod inference;
pub mod metadata;
Expand Down
2 changes: 1 addition & 1 deletion crates/openshell-driver-docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ contract:
| `cap_add` | Grants supervisor-only capabilities required for namespace setup and process inspection. |
| `apparmor=unconfined` | Avoids Docker's default profile blocking required mount operations. |
| `restart_policy = unless-stopped` | Keeps managed sandboxes resumable across daemon or gateway restarts. |
| CDI GPU request | Requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. |
| CDI GPU request | Uses the sandbox `gpu_device` value when set; otherwise requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. |

The agent child process does not retain these supervisor privileges.

Expand Down
28 changes: 16 additions & 12 deletions crates/openshell-driver-docker/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@ use bollard::query_parameters::{
};
use bytes::Bytes;
use futures::{Stream, StreamExt};
use openshell_core::config::{
CDI_GPU_DEVICE_ALL, DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS,
};
use openshell_core::config::{DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_STOP_TIMEOUT_SECS};
use openshell_core::gpu::cdi_gpu_device_ids;
use openshell_core::proto::compute::v1::{
CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse,
DriverCondition, DriverSandbox, DriverSandboxStatus, DriverSandboxTemplate,
Expand Down Expand Up @@ -309,11 +308,7 @@ impl DockerComputeDriver {
"docker sandboxes require a template image",
));
}
if spec.gpu && !config.supports_gpu {
return Err(Status::failed_precondition(
"docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.",
));
}
Self::validate_gpu_request(spec.gpu, config.supports_gpu)?;
if !template.agent_socket_path.trim().is_empty() {
return Err(Status::failed_precondition(
"docker compute driver does not support template.agent_socket_path",
Expand All @@ -333,6 +328,15 @@ impl DockerComputeDriver {
Ok(())
}

/// Reject GPU sandbox requests when the Docker daemon lacks detected CDI support.
fn validate_gpu_request(gpu: bool, supports_gpu: bool) -> Result<(), Status> {
    match (gpu, supports_gpu) {
        // GPU requested but the daemon never advertised CDI capability.
        (true, false) => Err(Status::failed_precondition(
            "docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.",
        )),
        _ => Ok(()),
    }
}

async fn get_sandbox_snapshot(
&self,
sandbox_id: &str,
Expand Down Expand Up @@ -945,11 +949,11 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig
.collect()
}

fn docker_gpu_device_requests(gpu: bool) -> Option<Vec<DeviceRequest>> {
gpu.then(|| {
fn docker_gpu_device_requests(gpu: bool, gpu_device: &str) -> Option<Vec<DeviceRequest>> {
cdi_gpu_device_ids(gpu, gpu_device).map(|device_ids| {
vec![DeviceRequest {
driver: Some("cdi".to_string()),
device_ids: Some(vec![CDI_GPU_DEVICE_ALL.to_string()]),
device_ids: Some(device_ids),
..Default::default()
}]
})
Expand Down Expand Up @@ -996,7 +1000,7 @@ fn build_container_create_body(
host_config: Some(HostConfig {
nano_cpus: resource_limits.nano_cpus,
memory: resource_limits.memory_bytes,
device_requests: docker_gpu_device_requests(spec.gpu),
device_requests: docker_gpu_device_requests(spec.gpu, &spec.gpu_device),
mounts: Some(build_mounts(config)),
restart_policy: Some(RestartPolicy {
name: Some(RestartPolicyNameEnum::UNLESS_STOPPED),
Expand Down
26 changes: 25 additions & 1 deletion crates/openshell-driver-docker/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// SPDX-License-Identifier: Apache-2.0

use super::*;
use openshell_core::config::DEFAULT_SERVER_PORT;
use openshell_core::config::{CDI_GPU_DEVICE_ALL, DEFAULT_SERVER_PORT};
use openshell_core::proto::compute::v1::{
DriverResourceRequirements, DriverSandboxSpec, DriverSandboxTemplate,
};
Expand Down Expand Up @@ -425,6 +425,30 @@ fn build_container_create_body_maps_gpu_to_all_cdi_device() {
);
}

#[test]
fn build_container_create_body_passes_explicit_cdi_device_id_through() {
    // GPU-capable runtime config plus a sandbox naming one explicit CDI device.
    let mut config = runtime_config();
    config.supports_gpu = true;
    let mut sandbox = test_sandbox();
    {
        let spec = sandbox.spec.as_mut().unwrap();
        spec.gpu = true;
        spec.gpu_device = "nvidia.com/gpu=0".to_string();
    }

    let create_body = build_container_create_body(&sandbox, &config).unwrap();
    let device_requests = create_body
        .host_config
        .and_then(|host_config| host_config.device_requests)
        .expect("GPU request should add a Docker device request");
    let request = &device_requests[0];

    // The explicit ID must be forwarded verbatim under the CDI driver.
    assert_eq!(request.driver.as_deref(), Some("cdi"));
    assert_eq!(
        request.device_ids,
        Some(vec!["nvidia.com/gpu=0".to_string()])
    );
}

#[test]
fn require_sandbox_identifier_rejects_when_id_and_name_are_empty() {
// Regression test: `delete_sandbox` (and the other identifier-keyed
Expand Down
1 change: 1 addition & 0 deletions crates/openshell-driver-podman/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ The container spec in `container.rs` sets these security-critical fields:
| `no_new_privileges` | `true` | Prevents privilege escalation after exec. |
| `seccomp_profile_path` | `unconfined` | The supervisor installs its own policy-aware BPF filter. A container-level profile can block Landlock/seccomp syscalls during setup. |
| `mounts` | Private tmpfs at `/run/netns` | Lets the supervisor create named network namespaces in rootless Podman. |
| CDI GPU devices | Sandbox `gpu_device` value when set, otherwise all NVIDIA GPUs | Exposes requested GPUs to GPU-enabled sandbox containers. |

The restricted agent child does not retain these supervisor privileges.

Expand Down
63 changes: 55 additions & 8 deletions crates/openshell-driver-podman/src/container.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
//! Container spec construction for the Podman driver.

use crate::config::PodmanComputeConfig;
use openshell_core::config::CDI_GPU_DEVICE_ALL;
use openshell_core::gpu::cdi_gpu_device_ids;
use openshell_core::proto::compute::v1::DriverSandbox;
use serde::Serialize;
use serde_json::Value;
Expand Down Expand Up @@ -345,13 +345,13 @@ fn build_resource_limits(sandbox: &DriverSandbox) -> ResourceLimits {

/// Build CDI GPU device list if GPU is requested.
fn build_devices(sandbox: &DriverSandbox) -> Option<Vec<LinuxDevice>> {
if sandbox.spec.as_ref().is_some_and(|s| s.gpu) {
Some(vec![LinuxDevice {
path: CDI_GPU_DEVICE_ALL.into(),
}])
} else {
None
}
let spec = sandbox.spec.as_ref()?;
cdi_gpu_device_ids(spec.gpu, &spec.gpu_device).map(|device_ids| {
device_ids
.into_iter()
.map(|path| LinuxDevice { path })
.collect()
})
}

/// Build the Podman container creation JSON spec.
Expand Down Expand Up @@ -687,6 +687,53 @@ mod tests {
assert_eq!(short_id("short"), "short");
}

#[test]
fn container_spec_omits_devices_without_gpu_request() {
    // A default sandbox never asks for a GPU, so no `devices` key is emitted.
    let config = test_config();
    let sandbox = test_sandbox("test-id", "test-name");

    let spec = build_container_spec(&sandbox, &config);
    assert!(spec.get("devices").is_none());
}

#[test]
fn container_spec_maps_empty_gpu_request_to_all_cdi_device() {
    use openshell_core::config::CDI_GPU_DEVICE_ALL;
    use openshell_core::proto::compute::v1::DriverSandboxSpec;

    // GPU requested with no explicit device ID: expect the CDI all-GPU entry.
    let mut sandbox = test_sandbox("test-id", "test-name");
    sandbox.spec = Some(DriverSandboxSpec {
        gpu: true,
        ..DriverSandboxSpec::default()
    });

    let spec = build_container_spec(&sandbox, &test_config());
    let first_device_path = spec["devices"][0]["path"].as_str();
    assert_eq!(first_device_path, Some(CDI_GPU_DEVICE_ALL));
}

#[test]
fn container_spec_passes_explicit_cdi_device_id_through() {
    use openshell_core::proto::compute::v1::DriverSandboxSpec;

    // An explicit CDI device ID must survive untouched into the spec JSON.
    let device_id = "nvidia.com/gpu=0";
    let mut sandbox = test_sandbox("test-id", "test-name");
    sandbox.spec = Some(DriverSandboxSpec {
        gpu: true,
        gpu_device: device_id.to_string(),
        ..DriverSandboxSpec::default()
    });

    let spec = build_container_spec(&sandbox, &test_config());
    assert_eq!(spec["devices"][0]["path"].as_str(), Some(device_id));
}

#[test]
fn container_spec_includes_required_capabilities() {
let sandbox = test_sandbox("test-id", "test-name");
Expand Down
4 changes: 4 additions & 0 deletions crates/openshell-driver-podman/src/driver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,10 @@ impl PodmanComputeDriver {
sandbox: &DriverSandbox,
) -> Result<(), ComputeDriverError> {
let gpu_requested = sandbox.spec.as_ref().is_some_and(|s| s.gpu);
Self::validate_gpu_request(gpu_requested)
}

fn validate_gpu_request(gpu_requested: bool) -> Result<(), ComputeDriverError> {
if gpu_requested && !Self::has_gpu_capacity() {
return Err(ComputeDriverError::Precondition(
"GPU sandbox requested, but no NVIDIA GPU devices are available.".to_string(),
Expand Down
15 changes: 9 additions & 6 deletions e2e/rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,16 @@ publish = false
[features]
e2e = []
e2e-docker = ["e2e"]
e2e-docker-gpu = ["e2e-docker"]
e2e-gpu = ["e2e"]
e2e-docker-gpu = ["e2e-docker", "e2e-gpu"]
e2e-podman = ["e2e"]
e2e-podman-gpu = ["e2e-podman", "e2e-gpu"]

[[test]]
name = "custom_image"
path = "tests/custom_image.rs"
required-features = ["e2e-docker"]

[[test]]
name = "docker_gpu"
path = "tests/docker_gpu.rs"
required-features = ["e2e-docker-gpu"]

[[test]]
name = "docker_preflight"
path = "tests/docker_preflight.rs"
Expand All @@ -40,6 +38,11 @@ name = "gateway_resume"
path = "tests/gateway_resume.rs"
required-features = ["e2e-docker"]

[[test]]
name = "gpu_device_selection"
path = "tests/gpu_device_selection.rs"
required-features = ["e2e-gpu"]

[dependencies]
tokio = { version = "1.43", features = ["full"] }
tempfile = "3"
Expand Down
15 changes: 12 additions & 3 deletions e2e/rust/e2e-podman.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,19 @@
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
E2E_TEST="${OPENSHELL_E2E_PODMAN_TEST:-}"
E2E_FEATURES="${OPENSHELL_E2E_PODMAN_FEATURES:-e2e}"

cargo build -p openshell-cli --features openshell-core/dev-settings

TEST_ARGS=(
cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml"
--features "${E2E_FEATURES}"
)
if [ -n "${E2E_TEST}" ]; then
TEST_ARGS+=(--test "${E2E_TEST}")
fi
TEST_ARGS+=(-- --nocapture)

exec "${ROOT}/e2e/with-podman-gateway.sh" \
cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \
--features e2e \
-- --nocapture
"${TEST_ARGS[@]}"
Loading
Loading