Skip to content

Commit ea7a545

Browse files
committed
add gpu support
1 parent 1de95c9 commit ea7a545

12 files changed

Lines changed: 1037 additions & 17 deletions

File tree

crates/openshell-driver-vm/scripts/openshell-vm-sandbox-init.sh

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,17 @@ create_gpu_device_nodes_mknod() {
243243
setup_gpu() {
244244
ts "GPU_ENABLED=true — initializing GPU passthrough"
245245

246+
# Kernel modules are built for a specific guest kernel version.
247+
# If the running kernel doesn't match, depmod/modprobe look under a different lib/modules/<version>/ directory and may fail to find them.
248+
local expected_kver="6.12.76"
249+
local actual_kver
250+
actual_kver="$(uname -r)"
251+
if [ "${actual_kver}" != "${expected_kver}" ]; then
252+
ts "WARNING: kernel version mismatch: expected ${expected_kver}, got ${actual_kver}"
253+
ts " GPU modules are installed under lib/modules/${expected_kver}/"
254+
ts " modprobe may fail to find them"
255+
fi
256+
246257
if ! command -v modprobe >/dev/null 2>&1; then
247258
ts "FATAL: modprobe not found; cannot load nvidia kernel modules"
248259
return 1
@@ -258,6 +269,11 @@ setup_gpu() {
258269
fi
259270
fi
260271

272+
ts "generating module dependency index"
273+
if ! depmod -a "$(uname -r)" 2>/dev/null; then
274+
ts "WARNING: depmod failed; modprobe may not find modules"
275+
fi
276+
261277
ts "loading nvidia kernel modules"
262278
modprobe nvidia || { ts "FATAL: modprobe nvidia failed"; return 1; }
263279
modprobe nvidia_uvm 2>/dev/null || true

crates/openshell-driver-vm/src/driver.rs

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@ use crate::gpu::{
55
GpuInventory, SubnetAllocator, allocate_vsock_cid, mac_from_sandbox_id, tap_device_name,
66
};
77
use crate::rootfs::{
8-
create_rootfs_archive_from_dir, extract_rootfs_archive_to,
9-
prepare_sandbox_rootfs_from_image_root, sandbox_guest_init_path,
8+
create_rootfs_archive_from_dir, extract_rootfs_archive_to, inject_gpu_modules,
9+
prepare_sandbox_rootfs_from_image_root, refresh_runtime_artifacts, sandbox_guest_init_path,
1010
};
1111
use bollard::Docker;
1212
use bollard::errors::Error as BollardError;
@@ -419,6 +419,28 @@ impl VmDriver {
419419
return Err(err);
420420
}
421421
};
422+
if is_gpu {
423+
let rootfs_for_gpu = rootfs.clone();
424+
let driver_state_dir = self.config.state_dir.clone();
425+
if let Err(err) = tokio::task::spawn_blocking(move || {
426+
inject_gpu_modules(&rootfs_for_gpu, &driver_state_dir)
427+
})
428+
.await
429+
.map_err(|e| Status::internal(format!("GPU module injection panicked: {e}")))?
430+
{
431+
warn!(
432+
sandbox_id = %sandbox.id,
433+
error = %err,
434+
"vm driver: GPU module injection failed"
435+
);
436+
let _ = tokio::fs::remove_dir_all(&state_dir).await;
437+
return Err(Status::failed_precondition(format!(
438+
"GPU module injection failed: {err}"
439+
)));
440+
}
441+
info!(sandbox_id = %sandbox.id, "vm driver: GPU modules injected into rootfs");
442+
}
443+
422444
if let Some(tls_paths) = tls_paths.as_ref()
423445
&& let Err(err) = prepare_guest_tls_materials(&rootfs, tls_paths).await
424446
{
@@ -738,10 +760,13 @@ impl VmDriver {
738760
.await?;
739761
let archive_path = image_cache_rootfs_archive(&self.config.state_dir, &image_identity);
740762
let rootfs_dest = rootfs.to_path_buf();
741-
tokio::task::spawn_blocking(move || extract_rootfs_archive_to(&archive_path, &rootfs_dest))
742-
.await
743-
.map_err(|err| Status::internal(format!("sandbox rootfs extraction panicked: {err}")))?
744-
.map_err(|err| Status::internal(format!("extract sandbox rootfs failed: {err}")))?;
763+
tokio::task::spawn_blocking(move || {
764+
extract_rootfs_archive_to(&archive_path, &rootfs_dest)?;
765+
refresh_runtime_artifacts(&rootfs_dest)
766+
})
767+
.await
768+
.map_err(|err| Status::internal(format!("sandbox rootfs extraction panicked: {err}")))?
769+
.map_err(|err| Status::internal(format!("extract sandbox rootfs failed: {err}")))?;
745770

746771
Ok(image_identity)
747772
}

0 commit comments

Comments
 (0)