Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion architecture/compute-runtimes.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,10 @@ users.
Custom sandbox images must include the agent runtime and any system
dependencies, but they should not need to include the gateway. GPU-capable
images must include the user-space libraries required by the workload. The
runtime still owns GPU device injection.
runtime still owns GPU device injection. GPU requests can include a driver-native
device identifier or a requested count; the gateway validates the request shape
and each runtime enforces the GPU allocation modes it supports. Kubernetes uses
counted `nvidia.com/gpu` resources and rejects driver-native device identifiers.

## Deployment Shape

Expand Down
83 changes: 81 additions & 2 deletions crates/openshell-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1215,10 +1215,15 @@ enum SandboxCommands {

/// Target a driver-specific GPU device. Docker and Podman use CDI device IDs
/// (for example "nvidia.com/gpu=0"); VM uses a PCI BDF or index.
/// Only valid with --gpu. When omitted with --gpu, the driver uses its default GPU selection.
#[arg(long, requires = "gpu")]
/// Specifying --gpu-device also requests GPU resources.
/// When omitted with --gpu, the driver uses its default GPU selection.
#[arg(long, conflicts_with = "gpu_count")]
gpu_device: Option<String>,

/// Request a specific number of GPUs. Mutually exclusive with --gpu-device.
#[arg(long, value_parser = clap::value_parser!(u32).range(1..), conflicts_with = "gpu_device")]
gpu_count: Option<u32>,

/// CPU limit for the sandbox (for example: 500m, 1, 2.5).
#[arg(long)]
cpu: Option<String>,
Expand Down Expand Up @@ -2547,6 +2552,7 @@ async fn main() -> Result<()> {
editor,
gpu,
gpu_device,
gpu_count,
cpu,
memory,
driver_config_json,
Expand Down Expand Up @@ -2628,6 +2634,7 @@ async fn main() -> Result<()> {
keep,
gpu,
gpu_device.as_deref(),
gpu_count,
cpu.as_deref(),
memory.as_deref(),
driver_config_json.as_deref(),
Expand Down Expand Up @@ -4371,6 +4378,78 @@ mod tests {
}
}

/// The parser must accept `--gpu-device` without `--gpu`, leaving the boolean
/// flag unset while still capturing the device identifier.
#[test]
fn sandbox_create_gpu_device_parses_without_gpu_flag() {
    let argv = [
        "openshell",
        "sandbox",
        "create",
        "--gpu-device",
        "nvidia.com/gpu=0",
    ];
    let parsed = Cli::try_parse_from(argv)
        .expect("sandbox create --gpu-device should parse without --gpu");

    // Extract only the fields under test; any other command shape is a failure.
    let (gpu_flag, device) = match parsed.command {
        Some(Commands::Sandbox {
            command:
                Some(SandboxCommands::Create {
                    gpu, gpu_device, ..
                }),
            ..
        }) => (gpu, gpu_device),
        other => panic!("expected SandboxCommands::Create, got: {other:?}"),
    };

    assert!(!gpu_flag);
    assert_eq!(device.as_deref(), Some("nvidia.com/gpu=0"));
}

/// The parser must accept `--gpu-count` without `--gpu`, leaving the boolean
/// flag unset while still capturing the requested count.
#[test]
fn sandbox_create_gpu_count_parses_without_gpu_flag() {
    let argv = ["openshell", "sandbox", "create", "--gpu-count", "2"];
    let parsed = Cli::try_parse_from(argv).expect("sandbox create --gpu-count should parse");

    // Extract only the fields under test; any other command shape is a failure.
    let (gpu_flag, count) = match parsed.command {
        Some(Commands::Sandbox {
            command: Some(SandboxCommands::Create { gpu, gpu_count, .. }),
            ..
        }) => (gpu, gpu_count),
        other => panic!("expected SandboxCommands::Create, got: {other:?}"),
    };

    assert!(!gpu_flag);
    assert_eq!(count, Some(2));
}

/// A GPU count of zero must be rejected by the parser's value validation.
#[test]
fn sandbox_create_gpu_count_rejects_zero() {
    let result = Cli::try_parse_from(["openshell", "sandbox", "create", "--gpu-count", "0"]);

    // Pin the error kind: a bare `is_err()` would also pass for unrelated
    // failures, such as the flag being renamed or removed.
    match result {
        Err(err) => assert_eq!(
            err.kind(),
            clap::error::ErrorKind::ValueValidation,
            "sandbox create --gpu-count 0 should be rejected: {err}"
        ),
        Ok(_) => panic!("sandbox create --gpu-count 0 should be rejected"),
    }
}

/// `--gpu-count` and `--gpu-device` are mutually exclusive; supplying both must
/// fail with an argument-conflict error.
#[test]
fn sandbox_create_gpu_count_conflicts_with_gpu_device() {
    let result = Cli::try_parse_from([
        "openshell",
        "sandbox",
        "create",
        "--gpu",
        "--gpu-device",
        "nvidia.com/gpu=0",
        "--gpu-count",
        "2",
    ]);

    // Pin the error kind: a bare `is_err()` would also pass for unrelated
    // failures, such as one of the flags being renamed or removed.
    match result {
        Err(err) => assert_eq!(
            err.kind(),
            clap::error::ErrorKind::ArgumentConflict,
            "sandbox create should reject --gpu-count with --gpu-device: {err}"
        ),
        Ok(_) => panic!("sandbox create should reject --gpu-count with --gpu-device"),
    }
}

#[test]
fn service_expose_accepts_positional_target_port_and_service() {
let cli = Cli::try_parse_from([
Expand Down
7 changes: 6 additions & 1 deletion crates/openshell-cli/src/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1745,6 +1745,7 @@ pub async fn sandbox_create(
keep: bool,
gpu: bool,
gpu_device: Option<&str>,
gpu_count: Option<u32>,
cpu: Option<&str>,
memory: Option<&str>,
driver_config_json: Option<&str>,
Expand Down Expand Up @@ -1799,7 +1800,10 @@ pub async fn sandbox_create(
}
None => None,
};
let requested_gpu = gpu || image.as_deref().is_some_and(image_requests_gpu);
let requested_gpu = gpu
|| gpu_device.is_some_and(|device_id| !device_id.is_empty())
|| gpu_count.is_some()
|| image.as_deref().is_some_and(image_requests_gpu);

let providers_v2_enabled = gateway_providers_v2_enabled(&mut client).await?;
let inferred_types: Vec<String> = if providers_v2_enabled {
Expand Down Expand Up @@ -1836,6 +1840,7 @@ pub async fn sandbox_create(
spec: Some(SandboxSpec {
gpu: requested_gpu,
gpu_device: gpu_device.unwrap_or_default().to_string(),
gpu_count,
policy,
providers: configured_providers,
template,
Expand Down
104 changes: 104 additions & 0 deletions crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,7 @@ async fn sandbox_create_keeps_command_sessions_by_default() {
None,
None,
None,
None,
&[],
None,
None,
Expand Down Expand Up @@ -827,6 +828,7 @@ async fn sandbox_create_sends_cpu_and_memory_limits_only() {
true,
false,
None,
None,
Some("500m"),
Some("2Gi"),
None,
Expand Down Expand Up @@ -886,6 +888,99 @@ async fn sandbox_create_sends_cpu_and_memory_limits_only() {
assert!(!resources.fields.contains_key("requests"));
}

/// Supplying only a GPU device ID (with the `gpu` flag false) must still send a
/// create request whose spec has `gpu` set and carries the device ID unchanged.
#[tokio::test]
async fn sandbox_create_sends_gpu_device_request_without_gpu_flag() {
    let gateway = run_server().await;
    let ssh_home = tempfile::tempdir().unwrap();
    let config_home = tempfile::tempdir().unwrap();
    // Held for the whole test so the environment set up by `test_env` stays in place.
    let _env_guard = test_env(&ssh_home, &config_home);
    let tls_options = test_tls(&gateway);
    install_fake_ssh(&ssh_home);

    let device_id = "nvidia.com/gpu=0";
    let command = ["echo".to_string(), "OK".to_string()];

    let outcome = run::sandbox_create(
        &gateway.endpoint,
        Some("gpu-device"),
        None,
        "openshell",
        &[],
        true,            // keep
        false,           // gpu
        Some(device_id), // gpu_device
        None,            // gpu_count
        None,            // cpu
        None,            // memory
        None,            // driver_config_json
        None,
        &[],
        None,
        None,
        &command,
        Some(false),
        Some(false),
        &HashMap::new(),
        "manual",
        &tls_options,
    )
    .await;
    outcome.expect("sandbox create should succeed");

    let sent = create_requests(&gateway).await;
    let sent_spec = sent[0]
        .spec
        .as_ref()
        .expect("sandbox spec should be sent");

    assert!(sent_spec.gpu);
    assert_eq!(sent_spec.gpu_device, device_id);
}

/// Supplying only a GPU count (with the `gpu` flag false) must still send a
/// create request whose spec has `gpu` set, carries the count, and leaves the
/// device ID empty.
#[tokio::test]
async fn sandbox_create_sends_gpu_count_request_without_gpu_flag() {
    let gateway = run_server().await;
    let ssh_home = tempfile::tempdir().unwrap();
    let config_home = tempfile::tempdir().unwrap();
    // Held for the whole test so the environment set up by `test_env` stays in place.
    let _env_guard = test_env(&ssh_home, &config_home);
    let tls_options = test_tls(&gateway);
    install_fake_ssh(&ssh_home);

    let command = ["echo".to_string(), "OK".to_string()];

    let outcome = run::sandbox_create(
        &gateway.endpoint,
        Some("gpu-count"),
        None,
        "openshell",
        &[],
        true,    // keep
        false,   // gpu
        None,    // gpu_device
        Some(2), // gpu_count
        None,    // cpu
        None,    // memory
        None,    // driver_config_json
        None,
        &[],
        None,
        None,
        &command,
        Some(false),
        Some(false),
        &HashMap::new(),
        "manual",
        &tls_options,
    )
    .await;
    outcome.expect("sandbox create should succeed");

    let sent = create_requests(&gateway).await;
    let sent_spec = sent[0]
        .spec
        .as_ref()
        .expect("sandbox spec should be sent");

    assert!(sent_spec.gpu);
    assert_eq!(sent_spec.gpu_count, Some(2));
    assert!(sent_spec.gpu_device.is_empty());
}

#[tokio::test]
async fn sandbox_create_sends_driver_config_json() {
let server = run_server().await;
Expand All @@ -906,6 +1001,7 @@ async fn sandbox_create_sends_driver_config_json() {
None,
None,
None,
None,
Some(r#"{"kubernetes":{"pod":{"priority_class_name":"batch-low"}}}"#),
None,
&[],
Expand Down Expand Up @@ -982,6 +1078,7 @@ async fn sandbox_create_does_not_infer_command_providers_when_v2_enabled() {
None,
None,
None,
None,
&[],
None,
None,
Expand Down Expand Up @@ -1040,6 +1137,7 @@ async fn sandbox_create_returns_vm_error_without_waiting_for_timeout() {
None,
None,
None,
None,
&[],
None,
None,
Expand Down Expand Up @@ -1094,6 +1192,7 @@ async fn sandbox_create_keeps_waiting_while_vm_progress_arrives() {
None,
None,
None,
None,
&[],
None,
None,
Expand Down Expand Up @@ -1140,6 +1239,7 @@ async fn sandbox_create_times_out_when_only_logs_arrive() {
None,
None,
None,
None,
&[],
None,
None,
Expand Down Expand Up @@ -1182,6 +1282,7 @@ async fn sandbox_create_deletes_command_sessions_with_no_keep() {
None,
None,
None,
None,
&[],
None,
None,
Expand Down Expand Up @@ -1228,6 +1329,7 @@ async fn sandbox_create_deletes_shell_sessions_with_no_keep() {
None,
None,
None,
None,
&[],
None,
None,
Expand Down Expand Up @@ -1274,6 +1376,7 @@ async fn sandbox_create_keeps_sandbox_with_hidden_keep_flag() {
None,
None,
None,
None,
&[],
None,
None,
Expand Down Expand Up @@ -1320,6 +1423,7 @@ async fn sandbox_create_keeps_sandbox_with_forwarding() {
None,
None,
None,
None,
&[],
None,
Some(openshell_core::forward::ForwardSpec::new(forward_port)),
Expand Down
2 changes: 1 addition & 1 deletion crates/openshell-driver-docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ contract:
| `apparmor=unconfined` | Avoids Docker's default profile blocking required mount operations. |
| `restart_policy = unless-stopped` | Keeps managed sandboxes resumable across daemon or gateway restarts. |
| `PidsLimit` | Enforces the sandbox PID budget at the Docker cgroup layer. Set `[openshell.drivers.docker].sandbox_pids_limit = 0` to inherit the Docker/runtime default. |
| CDI GPU request | Uses the sandbox `gpu_device` value when set; otherwise requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. |
| CDI GPU request | Uses the sandbox `gpu_device` value when set; otherwise requests all NVIDIA GPUs when the sandbox spec asks for GPU support and daemon CDI support is detected. Count-based GPU requests are rejected. |

The agent child process does not retain these supervisor privileges.

Expand Down
14 changes: 12 additions & 2 deletions crates/openshell-driver-docker/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ impl DockerComputeDriver {
"docker sandboxes require a template image",
));
}
Self::validate_gpu_request(spec.gpu, config.supports_gpu)?;
Self::validate_gpu_request(spec.gpu, spec.gpu_count, config.supports_gpu)?;
if !template.agent_socket_path.trim().is_empty() {
return Err(Status::failed_precondition(
"docker compute driver does not support template.agent_socket_path",
Expand Down Expand Up @@ -409,7 +409,17 @@ impl DockerComputeDriver {
))
}

fn validate_gpu_request(gpu: bool, supports_gpu: bool) -> Result<(), Status> {
fn validate_gpu_request(
gpu: bool,
gpu_count: Option<u32>,
supports_gpu: bool,
) -> Result<(), Status> {
if gpu_count.is_some() {
return Err(Status::invalid_argument(
"docker GPU count requests are not supported; use --gpu or --gpu-device",
));
}

if gpu && !supports_gpu {
return Err(Status::failed_precondition(
"docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.",
Expand Down
Loading
Loading