diff --git a/.gitignore b/.gitignore index fb8679fa7..a3d613775 100644 --- a/.gitignore +++ b/.gitignore @@ -168,6 +168,7 @@ logs/ tmp/ temp/ *.tmp +e2e/gpu/images/.build/ # Secrets/credentials (should never be committed) *.pem diff --git a/e2e/gpu/README.md b/e2e/gpu/README.md new file mode 100644 index 000000000..fbdf6a10a --- /dev/null +++ b/e2e/gpu/README.md @@ -0,0 +1,176 @@ + + + +# GPU workload images + +This directory defines workload test images currently used by the OpenShell GPU +e2e suite. + +## Contract + +Each workload image must: + +- Use the standard OpenShell sandbox base image as its final-stage base or + ensure that the requirements for a sandbox image are met. +- Provide a manifest command that runs the workload inside the sandbox image. +- Run the same workload as the image default entrypoint for direct + container-engine validation. +- Require no network access after the image is pulled. +- Print `OPENSHELL_GPU_WORKLOAD_SUCCESS` only when validation succeeds. +- Print `OPENSHELL_GPU_WORKLOAD_FAILURE` and exit non-zero when validation + fails. +- Be usable as an OpenShell sandbox image when OpenShell invokes the manifest + command explicitly. + +OpenShell sandbox creation replaces the image entrypoint with the supervisor and +does not run the OCI image `CMD`. E2e tests that use these images through +OpenShell run the command from each manifest entry explicitly. + +The test harness is manifest-driven. Each workload entry carries: + +- `name` +- `image` +- `command` +- `expect` +- `requirements` + +## Images + +| Source directory | Image name | Purpose | +| --- | --- | --- | +| `smoke-pass` | `gpu-workload-smoke-pass` | Always succeeds and prints the success marker. | +| `smoke-fail` | `gpu-workload-smoke-fail` | Always fails and prints the failure marker. | +| `cuda-basic` | `gpu-workload-cuda-basic` | Runs CUDA `deviceQuery` and `vectorAdd` validation. 
| + +## Build + +Build all workload images: + +```shell +mise run e2e:workloads:build +``` + +Build a subset by source directory name: + +```shell +OPENSHELL_GPU_WORKLOAD_IMAGES=smoke-pass,smoke-fail \ +mise run e2e:workloads:build +``` + +The build task uses `tasks/scripts/container-engine.sh`. Set +`CONTAINER_ENGINE=docker` or `CONTAINER_ENGINE=podman` to choose an engine +explicitly. When unset, the helper uses its existing auto-detection behavior. + +Local tags use the current commit short SHA. Dirty local trees append `-dirty`. +Set `OPENSHELL_GPU_WORKLOAD_IMAGE_TAG=<tag>` to override the tag. + +The task writes the latest build refs to: + +```text +e2e/gpu/images/.build/latest.env +``` + +The task also writes the local workload manifest used by the Rust e2e runner: + +```text +e2e/gpu/images/.build/workloads.yaml +``` + +That local manifest is created by `mise run e2e:workloads:build`. It contains +the full image reference, command, expected outcome, and requirements for each +selected workload. + +Use the env file in later commands: + +```shell +source e2e/gpu/images/.build/latest.env +``` + +That env file exports `OPENSHELL_E2E_WORKLOAD_MANIFEST` pointing at the local +manifest. The per-image refs remain available as a convenience for direct +container-engine validation. + +## Direct Validation + +Validate smoke pass: + +```shell +docker run --rm "${OPENSHELL_E2E_GPU_SMOKE_PASS_IMAGE}" +``` + +Validate smoke fail: + +```shell +docker run --rm "${OPENSHELL_E2E_GPU_SMOKE_FAIL_IMAGE}" +``` + +The smoke fail command should exit non-zero and print +`OPENSHELL_GPU_WORKLOAD_FAILURE`. + +Validate CUDA with Docker CDI: + +```shell +docker run --rm --device nvidia.com/gpu=all \ + "${OPENSHELL_E2E_GPU_CUDA_WORKLOAD_IMAGE}" +``` + +Use `podman run` with the same `--device nvidia.com/gpu=all` option on hosts +where Podman CDI is configured. 
+ +Direct container-engine validation catches image, CDI, CUDA, and host GPU setup +issues before OpenShell sandbox behavior is involved. + +## Manifest-Driven Validation + +The Rust GPU validation target is: + +```shell +cargo test --manifest-path e2e/rust/Cargo.toml --features e2e-docker-gpu --test gpu -- --nocapture +``` + +The workload validation path reads: + +```text +OPENSHELL_E2E_WORKLOAD_MANIFEST +``` + +When that variable is unset, the runner uses the default local manifest path: + +```text +e2e/gpu/images/.build/workloads.yaml +``` + +If the variable is unset and the default manifest does not exist, the workload +validation test prints a clear skip message telling you to run: + +```shell +mise run e2e:workloads:build +``` + +or to set `OPENSHELL_E2E_WORKLOAD_MANIFEST` to an external manifest. An explicitly set manifest path that cannot be read fails the test instead of skipping. + +Each manifest entry supplies the sandbox image and command. OpenShell runs that +command through `openshell sandbox create --gpu --from <image> -- <command>`. +The test runner iterates all GPU-tagged workload entries and enforces each +entry's declared expectation: + +- `expect: pass` requires `OPENSHELL_GPU_WORKLOAD_SUCCESS` +- `expect: fail` requires `OPENSHELL_GPU_WORKLOAD_FAILURE` + +The current local manifest includes three workloads: + +- `smoke-pass` expected to pass +- `smoke-fail` expected to fail +- `cuda-basic` expected to pass + +## External Manifests + +External workload catalogs can use the same schema. Point the runner at one +with: + +```shell +export OPENSHELL_E2E_WORKLOAD_MANIFEST=/abs/path/to/workloads.yaml +``` + +That lets alternate workload manifests use the same test runner without +introducing per-workload env vars. diff --git a/e2e/gpu/images/cuda-basic/Dockerfile b/e2e/gpu/images/cuda-basic/Dockerfile new file mode 100644 index 000000000..a7dde7422 --- /dev/null +++ b/e2e/gpu/images/cuda-basic/Dockerfile @@ -0,0 +1,72 @@ +# syntax=docker/dockerfile:1 + +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0

# Two-stage build: the CUDA samples are compiled in a CUDA builder image and
# only the resulting binaries (plus the samples LICENSE) are copied into the
# OpenShell sandbox base image.

# Both base images are overridable at build time. They are declared before the
# first FROM so they can be referenced in the FROM lines below.
ARG CUDA_BUILD_IMAGE=nvcr.io/nvidia/cuda:12.8.1-base-ubuntu22.04
ARG OPENSHELL_SANDBOX_BASE_IMAGE=ghcr.io/nvidia/openshell-community/sandboxes/base:latest

FROM ${CUDA_BUILD_IMAGE} AS builder

ARG DEBIAN_FRONTEND=noninteractive
# Tag and repository of NVIDIA/cuda-samples to download and build.
ARG CUDA_SAMPLES_REF=v12.8
ARG CUDA_SAMPLES_REPO=https://github.com/NVIDIA/cuda-samples

# Toolchain needed to fetch and compile the samples; apt lists are removed in
# the same layer to keep the builder layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    cuda-nvcc-12-8 \
    curl \
    g++ \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /build/cuda-samples

# Download the tagged samples archive, extract only the shared Common/cmake
# trees, the two samples that are built, and the LICENSE, then build
# deviceQuery and vectorAdd with the CUDA runtime linked statically
# (deviceQuery's CMakeLists is patched to CUDA::cudart_static). The binaries
# and LICENSE are staged under /opt/openshell-gpu-workload for the final stage.
RUN set -eux; \
    curl -fsSL "${CUDA_SAMPLES_REPO}/archive/refs/tags/${CUDA_SAMPLES_REF}.tar.gz" \
        -o /tmp/cuda-samples.tar.gz; \
    tar -xzf /tmp/cuda-samples.tar.gz \
        --strip-components=1 \
        --wildcards \
        '*/Common/*' \
        '*/cmake/*' \
        '*/Samples/0_Introduction/vectorAdd/*' \
        '*/Samples/1_Utilities/deviceQuery/*' \
        '*/LICENSE'; \
    sed -i 's/CUDA::cudart/CUDA::cudart_static/g' \
        Samples/1_Utilities/deviceQuery/CMakeLists.txt; \
    cmake -S Samples/1_Utilities/deviceQuery -B /tmp/build-device-query \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_CUDA_RUNTIME_LIBRARY=Static; \
    cmake --build /tmp/build-device-query --parallel; \
    cmake -S Samples/0_Introduction/vectorAdd -B /tmp/build-vector-add \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_CUDA_RUNTIME_LIBRARY=Static; \
    cmake --build /tmp/build-vector-add --parallel; \
    mkdir -p /opt/openshell-gpu-workload; \
    cp /tmp/build-device-query/deviceQuery /opt/openshell-gpu-workload/deviceQuery; \
    cp /tmp/build-vector-add/vectorAdd /opt/openshell-gpu-workload/vectorAdd; \
    cp LICENSE /opt/openshell-gpu-workload/cuda-samples.LICENSE; \
    rm -f /tmp/cuda-samples.tar.gz

FROM ${OPENSHELL_SANDBOX_BASE_IMAGE}

# Re-declared in this stage so the value is available to the LABEL below.
ARG CUDA_SAMPLES_REF=v12.8

LABEL com.nvidia.openshell.gpu-workload.name="cuda-basic" \
      com.nvidia.openshell.gpu-workload.cuda-samples-ref="${CUDA_SAMPLES_REF}"

# Install files as root, then switch to the `sandbox` user for runtime.
USER root
RUN mkdir -p /usr/local/lib/openshell-gpu-workload \
    /usr/local/share/doc/openshell-gpu-workload
COPY --from=builder /opt/openshell-gpu-workload/deviceQuery /usr/local/lib/openshell-gpu-workload/deviceQuery
COPY --from=builder /opt/openshell-gpu-workload/vectorAdd /usr/local/lib/openshell-gpu-workload/vectorAdd
COPY --from=builder /opt/openshell-gpu-workload/cuda-samples.LICENSE /usr/local/share/doc/openshell-gpu-workload/cuda-samples.LICENSE
COPY workload.sh /usr/local/bin/openshell-gpu-workload
RUN chmod 0755 /usr/local/bin/openshell-gpu-workload \
    /usr/local/lib/openshell-gpu-workload/deviceQuery \
    /usr/local/lib/openshell-gpu-workload/vectorAdd

USER sandbox
# The workload wrapper is the default entrypoint for direct container-engine
# runs of this image.
ENTRYPOINT ["/usr/local/bin/openshell-gpu-workload"]
#######################################
# Run one CUDA sample binary and verify that it reports success.
#
# The sample's combined stdout/stderr is captured in memory rather than in a
# temp file, so nothing is left behind if the script is interrupted and the
# check does not depend on a writable temp directory. (Previously a failing
# `mktemp` aborted the script under `set -e` without printing the failure
# marker.)
#
# Globals:
#   WORKLOAD_DIR   (read) directory containing the sample binaries
#   FAILURE_MARKER (read) marker printed on any validation failure
# Arguments:
#   $1 - sample binary name, relative to WORKLOAD_DIR
#   $2 - literal text the sample must print to count as passing
# Outputs:
#   Progress line and the sample's output on stdout; failure marker on stderr.
# Returns:
#   0 when the sample exits zero and printed the expected text. On any
#   failure this function exits the whole script with status 1.
#######################################
run_sample() {
  local name=$1
  local expected=$2
  local binary="${WORKLOAD_DIR}/${name}"
  local output
  local status=0

  echo "running CUDA sample: ${name}"

  # `|| status=$?` keeps errexit from aborting before the marker is printed.
  output="$("${binary}" 2>&1)" || status=$?

  # Always replay what the sample printed so failures can be diagnosed.
  if [[ -n "${output}" ]]; then
    printf '%s\n' "${output}"
  fi

  if (( status != 0 )); then
    echo "${FAILURE_MARKER} ${name} exited non-zero" >&2
    exit 1
  fi

  # Literal substring match (RHS is quoted, so no glob interpretation).
  if [[ "${output}" != *"${expected}"* ]]; then
    echo "${FAILURE_MARKER} ${name} did not print expected output: ${expected}" >&2
    exit 1
  fi
}
#!/usr/bin/env bash

# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Negative-path smoke workload: emits the failure marker on stderr and exits
# with a fixed non-zero status. It performs no GPU work.

set -euo pipefail

printf '%s\n' "OPENSHELL_GPU_WORKLOAD_FAILURE smoke-fail intentional failure" >&2
exit 42
#!/usr/bin/env bash

# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Positive-path smoke workload: prints the success marker on stdout and exits
# zero. It performs no GPU work.

set -euo pipefail

printf '%s\n' "OPENSHELL_GPU_WORKLOAD_SUCCESS smoke-pass"
"registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "version_check" version = "0.9.5" diff --git a/e2e/rust/Cargo.toml b/e2e/rust/Cargo.toml index 31c6a3347..d158bd0e2 100644 --- a/e2e/rust/Cargo.toml +++ b/e2e/rust/Cargo.toml @@ -92,8 +92,8 @@ path = "tests/forward_proxy_graphql_l7.rs" required-features = ["e2e-host-gateway"] [[test]] -name = "gpu_device_selection" -path = "tests/gpu_device_selection.rs" +name = "gpu" +path = "tests/gpu.rs" required-features = ["e2e-gpu"] [dependencies] @@ -109,7 +109,9 @@ sha1 = "0.10" sha2 = "0.10" hex = "0.4" rand = "0.9" +serde = { version = "1", features = ["derive"] } serde_json = "1" +serde_yaml = "0.9" [lints.rust] unsafe_code = "warn" diff --git a/e2e/rust/e2e-docker.sh b/e2e/rust/e2e-docker.sh index a020f87c8..7b21939fe 100755 --- a/e2e/rust/e2e-docker.sh +++ b/e2e/rust/e2e-docker.sh @@ -11,9 +11,14 @@ set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" E2E_TEST="${OPENSHELL_E2E_DOCKER_TEST:-smoke}" E2E_FEATURES="${OPENSHELL_E2E_DOCKER_FEATURES:-e2e,e2e-docker}" +DEFAULT_WORKLOAD_MANIFEST="${ROOT}/e2e/gpu/images/.build/workloads.yaml" cargo build -p openshell-cli --features openshell-core/dev-settings +if [ "${E2E_TEST}" = "gpu" ] && [ -z "${OPENSHELL_E2E_WORKLOAD_MANIFEST:-}" ] && [ ! -f "${DEFAULT_WORKLOAD_MANIFEST}" ]; then + echo "note: running GPU e2e without a workload manifest; workload validation will log an explicit skip. Build one with 'mise run e2e:workloads:build' or set OPENSHELL_E2E_WORKLOAD_MANIFEST." 
+fi + exec "${ROOT}/e2e/with-docker-gateway.sh" \ cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \ --features "${E2E_FEATURES}" \ diff --git a/e2e/rust/tests/gpu.rs b/e2e/rust/tests/gpu.rs new file mode 100644 index 000000000..62930bb7d --- /dev/null +++ b/e2e/rust/tests/gpu.rs @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e-gpu")] + +#[path = "gpu/device_selection.rs"] +mod device_selection; +#[path = "gpu/workloads.rs"] +mod workloads; diff --git a/e2e/rust/tests/gpu_device_selection.rs b/e2e/rust/tests/gpu/device_selection.rs similarity index 99% rename from e2e/rust/tests/gpu_device_selection.rs rename to e2e/rust/tests/gpu/device_selection.rs index 5f5314b9c..336141047 100644 --- a/e2e/rust/tests/gpu_device_selection.rs +++ b/e2e/rust/tests/gpu/device_selection.rs @@ -1,8 +1,6 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -#![cfg(feature = "e2e-gpu")] - //! GPU device selection e2e tests. //! //! Requires a GPU-backed gateway and a sandbox image containing `nvidia-smi`. diff --git a/e2e/rust/tests/gpu/workloads.rs b/e2e/rust/tests/gpu/workloads.rs new file mode 100644 index 000000000..b8ccb0b2e --- /dev/null +++ b/e2e/rust/tests/gpu/workloads.rs @@ -0,0 +1,181 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! GPU workload validation e2e tests. 
+ +use std::fs; +use std::path::{Path, PathBuf}; + +use openshell_e2e::harness::output::strip_ansi; +use openshell_e2e::harness::sandbox::SandboxGuard; +use serde::Deserialize; + +const WORKLOAD_MANIFEST_ENV: &str = "OPENSHELL_E2E_WORKLOAD_MANIFEST"; +const GPU_WORKLOAD_SUCCESS_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_SUCCESS"; +const GPU_WORKLOAD_FAILURE_MARKER: &str = "OPENSHELL_GPU_WORKLOAD_FAILURE"; + +#[derive(Debug, Deserialize)] +struct WorkloadManifest { + workloads: Vec, +} + +#[derive(Clone, Debug, Deserialize)] +struct WorkloadDefinition { + name: String, + image: String, + command: Vec, + expect: WorkloadExpectation, + #[serde(default)] + requirements: WorkloadRequirements, +} + +#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)] +#[serde(rename_all = "lowercase")] +enum WorkloadExpectation { + Pass, + Fail, +} + +#[derive(Clone, Debug, Default, Deserialize)] +struct WorkloadRequirements { + #[serde(default)] + gpu: bool, +} + +fn default_workload_manifest_path() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")).join("../gpu/images/.build/workloads.yaml") +} + +fn workload_manifest_path() -> PathBuf { + std::env::var(WORKLOAD_MANIFEST_ENV) + .ok() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) + .map(PathBuf::from) + .unwrap_or_else(default_workload_manifest_path) +} + +fn load_workload_manifest() -> Option { + let path = workload_manifest_path(); + let explicit_override = std::env::var(WORKLOAD_MANIFEST_ENV) + .ok() + .map(|value| !value.trim().is_empty()) + .unwrap_or(false); + + let contents = match fs::read_to_string(&path) { + Ok(contents) => contents, + Err(err) if !explicit_override && err.kind() == std::io::ErrorKind::NotFound => { + eprintln!( + "skipping GPU workload validation: no workload manifest at {}. 
\ + Run `mise run e2e:workloads:build` to create the local manifest \ + or set {WORKLOAD_MANIFEST_ENV} to an external manifest.", + path.display() + ); + return None; + } + Err(err) => panic!("failed to read workload manifest {}: {err}", path.display()), + }; + + let manifest: WorkloadManifest = serde_yaml::from_str(&contents).unwrap_or_else(|err| { + panic!( + "failed to parse workload manifest {}: {err}", + path.display() + ) + }); + assert!( + !manifest.workloads.is_empty(), + "workload manifest {} contains no workloads", + path.display() + ); + Some(manifest) +} + +async fn assert_expected_pass(workload: &WorkloadDefinition) { + let mut args = vec![ + "--gpu".to_string(), + "--from".to_string(), + workload.image.clone(), + "--".to_string(), + ]; + args.extend(workload.command.clone()); + let arg_refs = args.iter().map(String::as_str).collect::>(); + + let mut guard = SandboxGuard::create(&arg_refs).await.unwrap_or_else(|err| { + panic!( + "GPU workload '{}' expected success but sandbox create failed:\n{err}", + workload.name + ) + }); + + let clean_output = strip_ansi(&guard.create_output); + assert!( + clean_output.contains(GPU_WORKLOAD_SUCCESS_MARKER), + "expected success marker {GPU_WORKLOAD_SUCCESS_MARKER} for workload '{}' image {} in sandbox output:\n{clean_output}", + workload.name, + workload.image, + ); + + guard.cleanup().await; +} + +async fn assert_expected_fail(workload: &WorkloadDefinition) { + let mut args = vec![ + "--gpu".to_string(), + "--from".to_string(), + workload.image.clone(), + "--".to_string(), + ]; + args.extend(workload.command.clone()); + let arg_refs = args.iter().map(String::as_str).collect::>(); + + match SandboxGuard::create(&arg_refs).await { + Ok(mut guard) => { + let clean_output = strip_ansi(&guard.create_output); + guard.cleanup().await; + panic!( + "GPU workload '{}' unexpectedly succeeded. 
#!/usr/bin/env bash

# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Build the GPU e2e workload images and write the local build outputs.
#
# Outputs (under e2e/gpu/images/.build):
#   latest.env      - shell exports describing the most recent build
#   workloads.yaml  - workload manifest listing each built image
#
# Environment:
#   OPENSHELL_GPU_WORKLOAD_IMAGES     comma-separated source directory names to
#                                     build (default: every discovered image)
#   OPENSHELL_GPU_WORKLOAD_IMAGE_TAG  image tag override (default: short SHA,
#                                     with "-dirty" appended for dirty trees)
#   OPENSHELL_SANDBOX_BASE_IMAGE      final-stage base image override
#   CUDA_BUILD_IMAGE, CUDA_SAMPLES_REPO, CUDA_SAMPLES_REF
#                                     cuda-basic builder overrides
#
# NOTE(review): `ce_build` and `CONTAINER_ENGINE` are used but not defined
# here; presumably they come from the sourced container-engine.sh - confirm.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

source "${SCRIPT_DIR}/container-engine.sh"

IMAGES_ROOT="${ROOT}/e2e/gpu/images"
BUILD_DIR="${IMAGES_ROOT}/.build"
BASE_IMAGE="${OPENSHELL_SANDBOX_BASE_IMAGE:-ghcr.io/nvidia/openshell-community/sandboxes/base:latest}"
CUDA_BUILD_IMAGE="${CUDA_BUILD_IMAGE:-nvcr.io/nvidia/cuda:12.8.1-base-ubuntu22.04}"
CUDA_SAMPLES_REPO="${CUDA_SAMPLES_REPO:-https://github.com/NVIDIA/cuda-samples}"
CUDA_SAMPLES_REF="${CUDA_SAMPLES_REF:-v12.8}"

# Print $1 wrapped in single quotes, with embedded single quotes escaped, so
# the result can be safely re-read by a shell.
shell_quote() {
  local value=$1
  printf "'%s'" "${value//\'/\'\\\'\'}"
}

# Print an `export NAME='value'` line. $1 = variable name, $2 = value.
write_env_var() {
  local name=$1
  local value=$2
  printf 'export %s=%s\n' "${name}" "$(shell_quote "${value}")"
}

# Print $1 as a double-quoted YAML scalar, escaping backslash, double quote,
# newline, carriage return and tab.
yaml_quote() {
  local value=$1
  value=${value//\\/\\\\}
  value=${value//\"/\\\"}
  value=${value//$'\n'/\\n}
  value=${value//$'\r'/\\r}
  value=${value//$'\t'/\\t}
  printf '"%s"' "${value}"
}

# Print one image source directory name per line: the known images first, in
# a fixed order, followed by any other directory under IMAGES_ROOT that
# contains a Dockerfile, sorted by path.
available_image_dirs() {
  local dockerfile
  local preferred
  local name
  local seen=" "

  for preferred in smoke-pass smoke-fail cuda-basic; do
    if [[ -f "${IMAGES_ROOT}/${preferred}/Dockerfile" ]]; then
      echo "${preferred}"
      seen+="${preferred} "
    fi
  done

  while IFS= read -r dockerfile; do
    name="$(basename "$(dirname "${dockerfile}")")"
    if [[ "${seen}" == *" ${name} "* ]]; then
      continue
    fi
    echo "${name}"
  done < <(find "${IMAGES_ROOT}" -mindepth 2 -maxdepth 2 -name Dockerfile -type f | sort)
}

# Return 0 when $1 equals one of the remaining arguments, 1 otherwise.
contains_image() {
  local needle=$1
  shift
  local item
  for item in "$@"; do
    [[ "${item}" == "${needle}" ]] && return 0
  done
  return 1
}

# Print the latest.env variable name that carries the image ref for source
# directory $1. Exits non-zero for an unmapped directory; call it from a plain
# assignment so errexit sees that failure.
image_env_var() {
  case "$1" in
    smoke-pass) echo "OPENSHELL_E2E_GPU_SMOKE_PASS_IMAGE" ;;
    smoke-fail) echo "OPENSHELL_E2E_GPU_SMOKE_FAIL_IMAGE" ;;
    cuda-basic) echo "OPENSHELL_E2E_GPU_CUDA_WORKLOAD_IMAGE" ;;
    *)
      echo "unsupported GPU workload image source directory: $1" >&2
      exit 1
      ;;
  esac
}

# Print the manifest expectation ("pass" or "fail") for source directory $1.
# Exits non-zero for an unmapped directory; call it from a plain assignment so
# errexit sees that failure.
image_expectation() {
  case "$1" in
    smoke-fail) echo "fail" ;;
    smoke-pass|cuda-basic) echo "pass" ;;
    *)
      echo "unsupported GPU workload image source directory: $1" >&2
      exit 1
      ;;
  esac
}

mapfile -t available < <(available_image_dirs)
if [[ ${#available[@]} -eq 0 ]]; then
  echo "No GPU workload image Dockerfiles found under ${IMAGES_ROOT}" >&2
  exit 1
fi

selected=()
if [[ -n "${OPENSHELL_GPU_WORKLOAD_IMAGES:-}" ]]; then
  IFS=',' read -r -a requested <<< "${OPENSHELL_GPU_WORKLOAD_IMAGES}"
  for raw in "${requested[@]}"; do
    name="${raw//[[:space:]]/}"
    [[ -z "${name}" ]] && continue
    if ! contains_image "${name}" "${available[@]}"; then
      echo "Unknown GPU workload image source directory: ${name}" >&2
      echo "Available: ${available[*]}" >&2
      exit 1
    fi
    # Ignore repeats so an image is built and listed in the manifest once.
    # The ${arr[@]+...} form keeps `set -u` happy while the array is empty.
    if contains_image "${name}" ${selected[@]+"${selected[@]}"}; then
      continue
    fi
    selected+=("${name}")
  done
else
  selected=("${available[@]}")
fi

if [[ ${#selected[@]} -eq 0 ]]; then
  echo "No GPU workload images selected" >&2
  exit 1
fi

# Resolve per-image metadata before building anything. The lookups run in
# plain assignments, where errexit sees their exit status. When they were
# nested inside another command's arguments, `exit 1` only left the subshell
# and an unmapped directory produced `export =...` and `expect: ""` entries
# after every image had already been built.
declare -A image_env_vars=()
declare -A image_expectations=()
for name in "${selected[@]}"; do
  env_var="$(image_env_var "${name}")"
  expectation="$(image_expectation "${name}")"
  image_env_vars["${name}"]="${env_var}"
  image_expectations["${name}"]="${expectation}"
done

source_sha="$(git -C "${ROOT}" rev-parse HEAD)"
source_short_sha="$(git -C "${ROOT}" rev-parse --short HEAD)"
source_dirty=false
if [[ -n "$(git -C "${ROOT}" status --short)" ]]; then
  source_dirty=true
fi

if [[ -n "${OPENSHELL_GPU_WORKLOAD_IMAGE_TAG:-}" ]]; then
  image_tag="${OPENSHELL_GPU_WORKLOAD_IMAGE_TAG}"
else
  image_tag="${source_short_sha}"
  if [[ "${source_dirty}" == "true" ]]; then
    image_tag="${image_tag}-dirty"
  fi
fi

declare -A image_refs=()

echo "Building GPU workload images with ${CONTAINER_ENGINE}"
echo "Source: ${source_short_sha} (dirty: ${source_dirty})"
echo "Tag: ${image_tag}"

for name in "${selected[@]}"; do
  image_name="gpu-workload-${name}"
  image_ref="localhost/openshell/${image_name}:${image_tag}"
  context="${IMAGES_ROOT}/${name}"

  build_args=(
    --build-arg "OPENSHELL_SANDBOX_BASE_IMAGE=${BASE_IMAGE}"
  )
  # Only the cuda-basic Dockerfile declares the CUDA builder arguments.
  if [[ "${name}" == "cuda-basic" ]]; then
    build_args+=(
      --build-arg "CUDA_BUILD_IMAGE=${CUDA_BUILD_IMAGE}"
      --build-arg "CUDA_SAMPLES_REPO=${CUDA_SAMPLES_REPO}"
      --build-arg "CUDA_SAMPLES_REF=${CUDA_SAMPLES_REF}"
    )
  fi

  echo
  echo "Building ${name} as ${image_ref}"
  ce_build \
    --load \
    --provenance=false \
    -t "${image_ref}" \
    --label "com.nvidia.openshell.gpu-workload.source=${name}" \
    --label "org.opencontainers.image.revision=${source_sha}" \
    "${build_args[@]}" \
    "${context}"

  image_refs["${name}"]="${image_ref}"
done

mkdir -p "${BUILD_DIR}"
latest_env="${BUILD_DIR}/latest.env"
manifest_path="${BUILD_DIR}/workloads.yaml"
{
  echo "# Generated by mise run e2e:workloads:build"
  echo "# Source this file to use the most recently built GPU workload images."
  write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_TAG "${image_tag}"
  write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_PATH "${IMAGES_ROOT}"
  write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_SHA "${source_sha}"
  write_env_var OPENSHELL_GPU_WORKLOAD_IMAGE_SOURCE_DIRTY "${source_dirty}"
  write_env_var OPENSHELL_GPU_WORKLOAD_CONTAINER_ENGINE "${CONTAINER_ENGINE}"
  write_env_var OPENSHELL_E2E_WORKLOAD_MANIFEST "${manifest_path}"
  for name in "${selected[@]}"; do
    write_env_var "${image_env_vars[${name}]}" "${image_refs[${name}]}"
  done
} > "${latest_env}"

{
  echo "schema_version: 1"
  echo "generated_by: $(yaml_quote "mise run e2e:workloads:build")"
  echo "source:"
  echo "  path: $(yaml_quote "${IMAGES_ROOT}")"
  echo "  revision: $(yaml_quote "${source_sha}")"
  echo "  dirty: ${source_dirty}"
  echo "  container_engine: $(yaml_quote "${CONTAINER_ENGINE}")"
  echo "workloads:"
  for name in "${selected[@]}"; do
    echo "  - name: $(yaml_quote "${name}")"
    echo "    image: $(yaml_quote "${image_refs[${name}]}")"
    echo "    command:"
    echo "      - $(yaml_quote "/usr/local/bin/openshell-gpu-workload")"
    echo "    expect: $(yaml_quote "${image_expectations[${name}]}")"
    echo "    requirements:"
    echo "      gpu: true"
  done
} > "${manifest_path}"

echo
echo "Wrote ${latest_env}"
echo "Wrote ${manifest_path}"
echo "Built images:"
for name in "${selected[@]}"; do
  echo "  ${name}: ${image_refs[${name}]}"
done
b/tasks/test.toml @@ -25,6 +25,10 @@ depends = ["e2e:rust", "e2e:python"] description = "Run Docker GPU end-to-end tests" depends = ["e2e:docker:gpu"] +["e2e:workloads:build"] +description = "Build local workload test images and manifest for e2e validation" +run = "bash tasks/scripts/e2e-gpu-build-images.sh" + ["e2e:k3s:gpu"] description = "Run k3s GPU end-to-end tests" depends = ["e2e:python:gpu"] @@ -82,7 +86,7 @@ run = "e2e/rust/e2e-podman-rootless.sh" ["e2e:podman:gpu"] description = "Run GPU e2e against a standalone gateway with the Podman compute driver" -env = { OPENSHELL_E2E_PODMAN_GPU = "1", OPENSHELL_E2E_PODMAN_TEST = "gpu_device_selection", OPENSHELL_E2E_PODMAN_FEATURES = "e2e-podman-gpu" } +env = { OPENSHELL_E2E_PODMAN_GPU = "1", OPENSHELL_E2E_PODMAN_TEST = "gpu", OPENSHELL_E2E_PODMAN_FEATURES = "e2e-podman-gpu" } run = "e2e/rust/e2e-podman.sh" ["e2e:kubernetes"] @@ -111,7 +115,7 @@ run = [ ["e2e:docker:gpu"] description = "Run GPU e2e against a standalone gateway with the Docker compute driver" -env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "gpu_device_selection", OPENSHELL_E2E_DOCKER_FEATURES = "e2e-docker-gpu" } +env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "gpu", OPENSHELL_E2E_DOCKER_FEATURES = "e2e-docker-gpu" } run = "e2e/rust/e2e-docker.sh" ["e2e:openshift"]