diff --git a/.ci/scripts/setup-webgpu-linux-deps.sh b/.ci/scripts/setup-webgpu-linux-deps.sh
index 8ece5899489..b24ffb460a9 100644
--- a/.ci/scripts/setup-webgpu-linux-deps.sh
+++ b/.ci/scripts/setup-webgpu-linux-deps.sh
@@ -5,26 +5,93 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+# Vendor Dawn (Tint) + SwiftShader for the WebGPU backend CI WITHOUT hosting a
+# private prebuilt:
+# * Dawn : Google's official nightly prebuilt, downloaded directly from
+# github.com/google/dawn/releases (pinned tag+rev+sha256) -- the same
+# "fetch a pinned upstream prebuilt" pattern used for other CI deps.
+# * SwiftShader : built from source at a pinned rev compatible with the Dawn
+# above (the ossci prebuilt is from 2020, too old for current Dawn). No S3.
+# Dawn (Chrome's WebGPU impl; its WGSL compiler Tint is the spec reference) on
+# SwiftShader gives a headless, deterministic, spec-faithful CLI backend.
+#
+# Exports Dawn_DIR / VK_ICD_FILENAMES / LD_LIBRARY_PATH for the cmake build+run.
+# Local/rig override: set DAWN_PREBUILT_DIR=
to
+# skip the Dawn download.
set -ex
-# SwiftShader: software Vulkan adapter for GPU-less CI (LunarG SDK not needed).
-install_swiftshader() {
- _https_amazon_aws=https://ossci-android.s3.amazonaws.com
- _swiftshader_archive=swiftshader-abe07b943-prebuilt.tar.gz
- _swiftshader_dir=/tmp/swiftshader
- mkdir -p $_swiftshader_dir
+# --- pinned versions (bump rev+sha together when upgrading Dawn) --------------
+DAWN_TAG="${DAWN_TAG:-v20260423.175430}"
+DAWN_REV="${DAWN_REV:-31e25af254ab572c77054edec4946d2244e184dd}"
+DAWN_SHA256="${DAWN_SHA256:-ac76fac090162dc1ecea5ed0f28a557bb8f49efc47faab01886105ace82b7b64}"
+# SwiftShader rev verified compatible with DAWN_REV (the old ossci prebuilt is
+# from 2020 and is incompatible with current Dawn -> no adapter / zero compute).
+SWIFTSHADER_REV="${SWIFTSHADER_REV:-9898204d91d6a60b6a08ad74fe4ac52a6913111b}"
- _tmp_archive="/tmp/${_swiftshader_archive}"
+_dawn_dir="${DAWN_PREBUILT_DIR:-/tmp/dawn-ci}"
+_ss_dir=/tmp/swiftshader
- curl --silent --show-error --location --fail --retry 3 --retry-all-errors \
- --output "${_tmp_archive}" "$_https_amazon_aws/${_swiftshader_archive}"
+# --- toolchain prereqs --------------------------------------------------------
+# Dawn dlopens the system Vulkan loader at runtime (libvulkan1). And the
+# ubuntu-latest prebuilt is built with a bleeding-edge GCC: it references
+# libstdc++ symbols newer than ubuntu-22.04's default (e.g. _M_replace_cold,
+# GCC 13+), so the static .a won't link against the stock runtime. Pull a current
+# libstdc++ from the ubuntu-toolchain-r PPA when the symbol floor isn't met. All
+# of this is scoped to the WebGPU CI job; newer libstdc++ is backward-compatible.
+if command -v apt-get >/dev/null 2>&1; then
+ _SUDO=""; command -v sudo >/dev/null 2>&1 && _SUDO="sudo"
+ ${_SUDO} apt-get update -y || true
+ ${_SUDO} apt-get install -y libvulkan1 software-properties-common || true
+ if ! strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 2>/dev/null \
+ | grep -q "GLIBCXX_3.4.32"; then
+ ${_SUDO} add-apt-repository -y ppa:ubuntu-toolchain-r/test || true
+ ${_SUDO} apt-get update -y || true
+ ${_SUDO} apt-get install -y libstdc++6 || true # newest GCC runtime
+ fi
+fi
- tar -C "${_swiftshader_dir}" -xzf "${_tmp_archive}"
+# The native binaries / pybind lib run INSIDE the CI conda env, whose libstdc++
+# predates GLIBCXX_3.4.32 (the Dawn prebuilt's floor) -- the same wall ssjia hit
+# for the vulkan op tests. Upgrade the conda runtime libstdc++ so the loaded
+# libstdc++.so.6 (conda's, not the system one) satisfies Dawn at run time.
+if command -v conda >/dev/null 2>&1; then
+ conda install -y -c conda-forge "libstdcxx-ng>=14" || true
+fi
+
+# --- Dawn: official prebuilt from GitHub (no S3) ------------------------------
+mkdir -p "${_dawn_dir}"
+if [[ ! -d "${_dawn_dir}/lib64/cmake/Dawn" ]]; then
+ _dawn_tar="/tmp/Dawn-${DAWN_REV}-ubuntu-latest-Release.tar.gz"
+ curl --silent --show-error --location --fail --retry 3 --retry-all-errors \
+ --output "${_dawn_tar}" \
+ "https://github.com/google/dawn/releases/download/${DAWN_TAG}/Dawn-${DAWN_REV}-ubuntu-latest-Release.tar.gz"
+ echo "${DAWN_SHA256} ${_dawn_tar}" | sha256sum -c -
+ # archive top dir is Dawn--ubuntu-latest-Release/{lib64,include,bin}
+ tar -C "${_dawn_dir}" --strip-components=1 -xzf "${_dawn_tar}"
+fi
- export VK_ICD_FILENAMES="${_swiftshader_dir}/swiftshader/build/Linux/vk_swiftshader_icd.json"
- export LD_LIBRARY_PATH="${_swiftshader_dir}/swiftshader/build/Linux/:${LD_LIBRARY_PATH}"
- export ETVK_USING_SWIFTSHADER=1
-}
+# --- SwiftShader: build from source at a pinned rev (no S3) -------------------
+# The old ossci prebuilt (swiftshader-abe07b943, 2020) is incompatible with the
+# current Dawn; build a matching modern SwiftShader instead. Self-contained
+# cmake build (vendored LLVM); the ICD lands under build//.
+if [[ ! -d "${_ss_dir}/build" ]]; then
+ if [[ ! -d "${_ss_dir}/.git" ]]; then
+ git clone https://github.com/google/swiftshader "${_ss_dir}"
+ fi
+ git -C "${_ss_dir}" checkout "${SWIFTSHADER_REV}"
+ # vk_swiftshader's deps are vendored in-tree; tolerate unreachable
+ # disabled-feature submodules (angle, test-only) failing to fetch.
+ git -C "${_ss_dir}" submodule update --init --recursive || true
+ cmake -S "${_ss_dir}" -B "${_ss_dir}/build" -DCMAKE_BUILD_TYPE=Release \
+ -DSWIFTSHADER_BUILD_TESTS=OFF -DSWIFTSHADER_BUILD_PVR=OFF \
+ -DSWIFTSHADER_BUILD_BENCHMARKS=OFF
+ cmake --build "${_ss_dir}/build" --parallel "$(nproc)" --target vk_swiftshader
+fi
+_ss_icd="$(find "${_ss_dir}/build" -name vk_swiftshader_icd.json 2>/dev/null | head -1)"
+[[ -n "${_ss_icd}" ]] || { echo "ERROR: SwiftShader ICD not found after build" >&2; exit 1; }
-install_swiftshader
-bash backends/webgpu/scripts/setup-wgpu-native.sh
+_ss_libdir="$(dirname "${_ss_icd}")"
+export Dawn_DIR="${_dawn_dir}/lib64/cmake/Dawn"
+export VK_ICD_FILENAMES="${_ss_icd}"
+export LD_LIBRARY_PATH="${_ss_libdir}:${LD_LIBRARY_PATH:-}"
+export WEBGPU_USING_SWIFTSHADER=1
diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh
index fe9b564a18f..8b7a36cd79d 100755
--- a/.ci/scripts/test_backend.sh
+++ b/.ci/scripts/test_backend.sh
@@ -58,11 +58,10 @@ if [[ "$FLOW" == *vulkan* ]]; then
fi
if [[ "$FLOW" == *webgpu* ]]; then
- # Setup swiftshader (software Vulkan adapter for GPU-less runners) and wgpu-native,
- # which are required to build and run the WebGPU delegate.
+ # Dawn (Tint) + SwiftShader, the spec-faithful headless WebGPU backend.
source .ci/scripts/setup-webgpu-linux-deps.sh
- EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_WEBGPU=ON"
+ EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_WEBGPU=ON -DDawn_DIR=$Dawn_DIR"
fi
if [[ "$FLOW" == *arm* ]]; then
diff --git a/.github/workflows/test-webgpu-native.yml b/.github/workflows/test-webgpu-native.yml
new file mode 100644
index 00000000000..7220ef9f7b5
--- /dev/null
+++ b/.github/workflows/test-webgpu-native.yml
@@ -0,0 +1,54 @@
+name: Test WebGPU Native (Dawn)
+
+# The substantive WebGPU op-coverage gate. The shared operators suite only
+# delegates add.Tensor to WebGPU (everything else is CPU fallback), so the real
+# Dawn coverage comes from the native test executables (rms_norm, multi-dispatch
+# ordering, scratch). This runs them on Dawn (Tint) + SwiftShader, headless, on a
+# CPU runner -- separate from _test_backend.yml so that reusable template stays
+# untouched.
+
+# Nightly-only for now: this job builds SwiftShader from source (vendored LLVM),
+# which is too expensive to run on every PR while the workflow's reliability is
+# still being established. Once it has proven stable, re-enable a scoped PR
+# trigger with a paths: filter (backends/webgpu/**, the webgpu CI scripts, and
+# this file). The pull_request-aware ref/concurrency expressions below are kept
+# intentionally so that re-enable is a one-line change.
+on:
+ schedule:
+ - cron: 0 2 * * *
+ push:
+ branches:
+ - main
+ - release/*
+ tags:
+ - ciflow/nightly/*
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+ cancel-in-progress: true
+
+jobs:
+ test-webgpu-native:
+ uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+ with:
+ ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+ runner: linux.4xlarge.memory
+ docker-image: ci-image:executorch-ubuntu-22.04-clang12
+ submodules: recursive
+ timeout: 120
+ script: |
+ set -eux
+
+ # The generic Linux job uses the base conda env, not the image's; activate
+ # the image env (it has the pinned from-source torch). Mirrors
+ # test-vulkan-operators-linux in pull.yml.
+ CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+ conda activate "${CONDA_ENV}"
+
+ # Install the python package + runtime deps (the .pte exporters).
+ PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+
+ # Vendor Dawn (Tint) + SwiftShader, then build + run the native executables.
+ source .ci/scripts/setup-webgpu-linux-deps.sh
+ bash backends/webgpu/scripts/test_webgpu_native_ci.sh
diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index 719d86b3008..b6b41fb6587 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -54,29 +54,14 @@ target_include_directories(
target_link_libraries(webgpu_backend PRIVATE vulkan_schema executorch_core)
-# Native build: link against wgpu-native
-set(WGPU_NATIVE_DIR
- "${CMAKE_CURRENT_SOURCE_DIR}/third-party/wgpu-native"
- CACHE PATH "Path to wgpu-native installation"
-)
-
-# Link the shared lib; the static .a carries LLVM bitcode that breaks LTO.
-# Suffix resolves per platform: .so on Linux, .dylib on macOS.
-set(WGPU_LIB_NAME "libwgpu_native${CMAKE_SHARED_LIBRARY_SUFFIX}")
-set(WGPU_LIB "${WGPU_NATIVE_DIR}/lib/${WGPU_LIB_NAME}")
-if(NOT EXISTS "${WGPU_LIB}")
- message(FATAL_ERROR "wgpu-native not found at ${WGPU_NATIVE_DIR}. "
- "Run: bash backends/webgpu/scripts/setup-wgpu-native.sh"
- )
-endif()
-
-add_library(wgpu_native SHARED IMPORTED)
-set_target_properties(wgpu_native PROPERTIES IMPORTED_LOCATION "${WGPU_LIB}")
-
-target_include_directories(
- webgpu_backend PUBLIC $
-)
-target_link_libraries(webgpu_backend PRIVATE wgpu_native)
+# Native WebGPU backend: Dawn (Tint) + SwiftShader; deps script sets Dawn_DIR.
+# Native-only: browser/Emscripten builds use the system webgpu.h and never reach
+# this find_package (root CMake gates it via EXECUTORCH_BUILD_WEBGPU).
+# dawn::webgpu_dawn's link interface references Threads::Threads.
+find_package(Threads REQUIRED)
+find_package(Dawn REQUIRED)
+set(WEBGPU_GPU_LIB dawn::webgpu_dawn)
+target_link_libraries(webgpu_backend PUBLIC ${WEBGPU_GPU_LIB})
if(APPLE)
target_link_libraries(
@@ -100,19 +85,17 @@ install(
DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
-# Native test target
-if(EXECUTORCH_BUILD_WEBGPU_TEST)
- add_executable(webgpu_native_test test/test_webgpu_native.cpp)
-
+# Native test targets. Helper mirrors backends/vulkan's vulkan_op_test: every
+# test executable links the same backend + runtime libs.
+function(add_webgpu_native_test test_name test_src)
+ add_executable(${test_name} ${test_src})
target_include_directories(
- webgpu_native_test PRIVATE $
- "${WGPU_NATIVE_DIR}/include"
+ ${test_name} PRIVATE $
)
-
target_link_libraries(
- webgpu_native_test
+ ${test_name}
PRIVATE webgpu_backend
- wgpu_native
+ ${WEBGPU_GPU_LIB}
executorch_core
extension_module_static
extension_data_loader
@@ -120,47 +103,25 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST)
portable_kernels
portable_ops_lib
)
-
if(APPLE)
target_link_libraries(
- webgpu_native_test PRIVATE "-framework Metal" "-framework QuartzCore"
- "-framework CoreGraphics"
+ ${test_name} PRIVATE "-framework Metal" "-framework QuartzCore"
+ "-framework CoreGraphics"
)
else()
- target_link_libraries(webgpu_native_test PRIVATE dl m pthread)
+ target_link_libraries(${test_name} PRIVATE dl m pthread)
endif()
+ target_compile_options(${test_name} PRIVATE -fexceptions)
+ set_property(TARGET ${test_name} PROPERTY CXX_STANDARD 17)
+endfunction()
- target_compile_options(webgpu_native_test PRIVATE -fexceptions)
- set_property(TARGET webgpu_native_test PROPERTY CXX_STANDARD 17)
-
- add_executable(webgpu_rms_norm_test test/native/test_rms_norm.cpp)
-
- target_include_directories(
- webgpu_rms_norm_test PRIVATE $
- "${WGPU_NATIVE_DIR}/include"
+if(EXECUTORCH_BUILD_WEBGPU_TEST)
+ add_webgpu_native_test(webgpu_native_test test/test_webgpu_native.cpp)
+ add_webgpu_native_test(webgpu_rms_norm_test test/native/test_rms_norm.cpp)
+ add_webgpu_native_test(
+ webgpu_dispatch_order_test test/native/test_dispatch_order.cpp
)
-
- target_link_libraries(
- webgpu_rms_norm_test
- PRIVATE webgpu_backend
- wgpu_native
- executorch_core
- extension_module_static
- extension_data_loader
- extension_tensor
- portable_kernels
- portable_ops_lib
+ add_webgpu_native_test(
+ webgpu_scratch_buffer_test test/native/test_scratch_buffer.cpp
)
-
- if(APPLE)
- target_link_libraries(
- webgpu_rms_norm_test PRIVATE "-framework Metal" "-framework QuartzCore"
- "-framework CoreGraphics"
- )
- else()
- target_link_libraries(webgpu_rms_norm_test PRIVATE dl m pthread)
- endif()
-
- target_compile_options(webgpu_rms_norm_test PRIVATE -fexceptions)
- set_property(TARGET webgpu_rms_norm_test PROPERTY CXX_STANDARD 17)
endif()
diff --git a/backends/webgpu/runtime/WebGPUCompat.h b/backends/webgpu/runtime/WebGPUCompat.h
new file mode 100644
index 00000000000..06715e0fc81
--- /dev/null
+++ b/backends/webgpu/runtime/WebGPUCompat.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+
+#include
+
+namespace executorch::backends::webgpu {
+
+// Caller's instance must enable TimedWaitAny; returns the WaitAny status.
+inline WGPUWaitStatus webgpu_wait(WGPUInstance instance, WGPUFuture future) {
+ WGPUFutureWaitInfo info = {};
+ info.future = future;
+ return wgpuInstanceWaitAny(instance, 1, &info, UINT64_MAX);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/WebGPUDevice.cpp b/backends/webgpu/runtime/WebGPUDevice.cpp
index a5bbf8e5806..041cbe5a703 100644
--- a/backends/webgpu/runtime/WebGPUDevice.cpp
+++ b/backends/webgpu/runtime/WebGPUDevice.cpp
@@ -6,6 +6,7 @@
* LICENSE file in the root directory of this source tree.
*/
+#include
#include
#include
@@ -21,12 +22,10 @@ namespace {
struct AdapterResult {
WGPUAdapter adapter = nullptr;
- bool done = false;
};
struct DeviceResult {
WGPUDevice device = nullptr;
- bool done = false;
};
void on_adapter_request(
@@ -46,7 +45,6 @@ void on_adapter_request(
static_cast(message.length),
message.data);
}
- result->done = true;
}
void on_device_request(
@@ -66,7 +64,6 @@ void on_device_request(
static_cast(message.length),
message.data);
}
- result->done = true;
}
void on_device_error(
@@ -88,25 +85,36 @@ void on_device_error(
WebGPUContext create_webgpu_context() {
WebGPUContext ctx;
- ctx.instance = wgpuCreateInstance(nullptr);
+ // TimedWaitAny lets webgpu_wait() block on futures via wgpuInstanceWaitAny.
+ WGPUInstanceDescriptor instance_desc = {};
+#if defined(__EMSCRIPTEN__)
+ instance_desc.capabilities.timedWaitAnyEnable = true;
+ instance_desc.capabilities.timedWaitAnyMaxCount = 1;
+#else
+ WGPUInstanceFeatureName features[1] = {WGPUInstanceFeatureName_TimedWaitAny};
+ instance_desc.requiredFeatureCount = 1;
+ instance_desc.requiredFeatures = features;
+#endif
+ ctx.instance = wgpuCreateInstance(&instance_desc);
if (!ctx.instance) {
throw std::runtime_error("Failed to create WebGPU instance");
}
- // Request adapter using AllowSpontaneous mode (fires during
- // wgpuInstanceProcessEvents or any other API call).
AdapterResult adapter_result;
WGPURequestAdapterCallbackInfo adapter_cb = {};
- adapter_cb.mode = WGPUCallbackMode_AllowSpontaneous;
+ adapter_cb.mode = WGPUCallbackMode_WaitAnyOnly;
adapter_cb.callback = on_adapter_request;
adapter_cb.userdata1 = &adapter_result;
- wgpuInstanceRequestAdapter(ctx.instance, nullptr, adapter_cb);
- while (!adapter_result.done) {
- wgpuInstanceProcessEvents(ctx.instance);
- }
+ // No backend pin or forced fallback; Dawn auto-selects the adapter.
+ WGPURequestAdapterOptions adapter_opts = {};
+ adapter_opts.powerPreference = WGPUPowerPreference_HighPerformance;
+ adapter_opts.forceFallbackAdapter = false;
+ WGPUWaitStatus adapter_wait = webgpu_wait(
+ ctx.instance,
+ wgpuInstanceRequestAdapter(ctx.instance, &adapter_opts, adapter_cb));
- if (!adapter_result.adapter) {
+ if (adapter_wait != WGPUWaitStatus_Success || !adapter_result.adapter) {
wgpuInstanceRelease(ctx.instance);
ctx.instance = nullptr;
throw std::runtime_error(
@@ -118,7 +126,7 @@ WebGPUContext create_webgpu_context() {
// Request device
DeviceResult device_result;
WGPURequestDeviceCallbackInfo device_cb = {};
- device_cb.mode = WGPUCallbackMode_AllowSpontaneous;
+ device_cb.mode = WGPUCallbackMode_WaitAnyOnly;
device_cb.callback = on_device_request;
device_cb.userdata1 = &device_result;
@@ -131,12 +139,11 @@ WebGPUContext create_webgpu_context() {
}
device_desc.uncapturedErrorCallbackInfo.callback = on_device_error;
- wgpuAdapterRequestDevice(ctx.adapter, &device_desc, device_cb);
- while (!device_result.done) {
- wgpuInstanceProcessEvents(ctx.instance);
- }
+ WGPUWaitStatus device_wait = webgpu_wait(
+ ctx.instance,
+ wgpuAdapterRequestDevice(ctx.adapter, &device_desc, device_cb));
- if (!device_result.device) {
+ if (device_wait != WGPUWaitStatus_Success || !device_result.device) {
wgpuAdapterRelease(ctx.adapter);
wgpuInstanceRelease(ctx.instance);
ctx.adapter = nullptr;
diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index a11b188f428..a60bfc18e3b 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -12,8 +12,8 @@
#include
#include
+#include
#include
-#include
#include
#include
@@ -471,7 +471,6 @@ void WebGPUGraph::execute() {
namespace {
struct MapCallbackData {
- bool done = false;
WGPUMapAsyncStatus status = WGPUMapAsyncStatus_Error;
};
@@ -482,7 +481,6 @@ void buffer_map_callback(
void* /*userdata2*/) {
auto* data = static_cast(userdata1);
data->status = status;
- data->done = true;
}
} // namespace
@@ -491,18 +489,18 @@ void WebGPUGraph::copy_outputs(std::vector>& outputs) {
const size_t count = std::min(outputs.size(), output_staging_buffers_.size());
std::vector cb_data(count);
+ std::vector map_futures(count, WGPUFuture{});
for (size_t i = 0; i < count; i++) {
if (outputs[i].second == 0) {
- cb_data[i].done = true;
cb_data[i].status = WGPUMapAsyncStatus_Success;
continue;
}
WGPUBufferMapCallbackInfo cb_info = {};
- cb_info.mode = WGPUCallbackMode_AllowSpontaneous;
+ cb_info.mode = WGPUCallbackMode_WaitAnyOnly;
cb_info.callback = buffer_map_callback;
cb_info.userdata1 = &cb_data[i];
- wgpuBufferMapAsync(
+ map_futures[i] = wgpuBufferMapAsync(
output_staging_buffers_[i],
WGPUMapMode_Read,
0,
@@ -510,7 +508,12 @@ void WebGPUGraph::copy_outputs(std::vector>& outputs) {
cb_info);
}
- wgpuDevicePoll(device_, true, nullptr);
+ for (size_t i = 0; i < count; i++) {
+ if (outputs[i].second != 0 &&
+ webgpu_wait(instance_, map_futures[i]) != WGPUWaitStatus_Success) {
+ throw std::runtime_error("WebGPU: WaitAny failed for output map");
+ }
+ }
for (size_t i = 0; i < count; i++) {
if (outputs[i].second == 0) {
diff --git a/backends/webgpu/scripts/setup-wgpu-native.sh b/backends/webgpu/scripts/setup-wgpu-native.sh
deleted file mode 100755
index 12ca2afdc46..00000000000
--- a/backends/webgpu/scripts/setup-wgpu-native.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# Download prebuilt wgpu-native binaries for native (non-browser) WebGPU testing.
-# Usage: bash backends/webgpu/scripts/setup-wgpu-native.sh
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-WGPU_DIR="${SCRIPT_DIR}/../third-party/wgpu-native"
-
-WGPU_VERSION="v27.0.4.0"
-WGPU_BASE_URL="https://github.com/gfx-rs/wgpu-native/releases/download/${WGPU_VERSION}"
-
-OS="$(uname -s)"
-case "${OS}" in
- Darwin) PLATFORM="macos"; LIB_EXT="dylib" ;;
- Linux) PLATFORM="linux"; LIB_EXT="so" ;;
- *)
- echo "Unsupported OS: ${OS}"
- exit 1
- ;;
-esac
-
-if [[ -f "${WGPU_DIR}/lib/libwgpu_native.${LIB_EXT}" ]]; then
- echo "wgpu-native already installed at ${WGPU_DIR}"
- exit 0
-fi
-
-ARCH="$(uname -m)"
-
-case "${ARCH}" in
- x86_64) WGPU_ARCH="x86_64" ;;
- aarch64|arm64) WGPU_ARCH="aarch64" ;;
- *)
- echo "Unsupported architecture: ${ARCH}"
- exit 1
- ;;
-esac
-
-ZIP_NAME="wgpu-${PLATFORM}-${WGPU_ARCH}-release.zip"
-URL="${WGPU_BASE_URL}/${ZIP_NAME}"
-
-echo "Downloading wgpu-native ${WGPU_VERSION} for ${PLATFORM}-${WGPU_ARCH}..."
-TMPDIR_DL="$(mktemp -d)"
-trap "rm -rf ${TMPDIR_DL}" EXIT
-
-curl -sL "${URL}" -o "${TMPDIR_DL}/${ZIP_NAME}"
-
-mkdir -p "${WGPU_DIR}"
-unzip -qo "${TMPDIR_DL}/${ZIP_NAME}" -d "${WGPU_DIR}"
-
-echo "Installed wgpu-native to ${WGPU_DIR}"
-ls -la "${WGPU_DIR}/lib/"
diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh
new file mode 100644
index 00000000000..af014efb228
--- /dev/null
+++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Build + run the WebGPU native test executables on Dawn (Tint) + SwiftShader.
+# This is the substantive op-coverage gate: unlike the python operators suite
+# (which only delegates add.Tensor to WebGPU, the rest CPU-fallback), these
+# executables run rms_norm / multi-dispatch ordering / scratch through the real
+# WebGPU backend on Dawn.
+#
+# Assumes the Dawn env is already sourced (Dawn_DIR + VK_ICD_FILENAMES +
+# LD_LIBRARY_PATH) via .ci/scripts/setup-webgpu-linux-deps.sh. For local runs:
+# source .ci/scripts/setup-webgpu-linux-deps.sh
+# bash backends/webgpu/scripts/test_webgpu_native_ci.sh
+#
+# Builds whatever native test targets are present in the landed tree (NOT a fixed
+# list). This stack lands: webgpu_native_test, webgpu_rms_norm_test (base) +
+# webgpu_dispatch_order_test, webgpu_scratch_buffer_test (D107576199). update_cache
+# / SDPA executables join automatically once their sibling diffs land.
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+EXECUTORCH_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
+PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}"
+NPROC=$(nproc 2>/dev/null || sysctl -n hw.ncpu)
+BUILD_DIR="${EXECUTORCH_ROOT}/cmake-out-webgpu-dawn-ci"
+
+if [[ -z "${Dawn_DIR:-}" ]]; then
+ echo "ERROR: Dawn_DIR not set. Source .ci/scripts/setup-webgpu-linux-deps.sh first." >&2
+ exit 1
+fi
+
+cd "${EXECUTORCH_ROOT}"
+
+# ── Exports for the model-driven executables (best-effort) ───────────────────
+# native_test + rms_norm + dispatch_order read .pte/golden inputs via env/dir and
+# self-skip if absent; scratch is standalone (generates its own inputs).
+PTE_MODEL="/tmp/webgpu_add_test.pte"
+PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
+RMS_NORM_DIR="/tmp/rmsn"
+RMS_NORM_OK=1
+DISPATCH_ORDER_DIR="/tmp/dispatch_order"
+DISPATCH_ORDER_OK=1
+
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
+export_add_model('${PTE_MODEL}')
+export_chained_add_model('${PTE_CHAINED_MODEL}')
+" || echo "WARN: add export failed; webgpu_native_test self-skips models whose .pte is absent"
+
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases
+export_rms_norm_cases('${RMS_NORM_DIR}')
+" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_OK=0; }
+
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases
+export_dispatch_order_cases('${DISPATCH_ORDER_DIR}')
+" || { echo "WARN: dispatch_order export failed; skipping dispatch_order native test"; DISPATCH_ORDER_OK=0; }
+
+# ── Configure (Dawn-only: no -DWEBGPU_IMPL; Dawn is the sole backend) ─────────
+echo "=== Configure WebGPU native tests on Dawn ==="
+rm -rf "${BUILD_DIR}"
+cmake \
+ -DEXECUTORCH_BUILD_WEBGPU=ON \
+ -DEXECUTORCH_BUILD_WEBGPU_TEST=ON \
+ -DDawn_DIR="${Dawn_DIR}" \
+ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+ -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+ -DCMAKE_BUILD_TYPE=Release \
+ -B "${BUILD_DIR}" \
+ "${EXECUTORCH_ROOT}"
+
+# ── Build + run every native test target that exists in this tree ────────────
+TARGETS=(webgpu_native_test webgpu_rms_norm_test webgpu_dispatch_order_test webgpu_scratch_buffer_test)
+BIN_DIR="${BUILD_DIR}/backends/webgpu"
+
+# Which targets are defined depends on which diffs are landed (native_test +
+# rms_norm here; dispatch_order + scratch from D107576199). Query the configured
+# target list ONCE so a not-yet-landed target is skipped WITHOUT masking a real
+# compile failure of a target that IS defined (CI uses the Make generator).
+DEFINED_TARGETS="$(cmake --build "${BUILD_DIR}" --target help 2>/dev/null || true)"
+
+# Fail loud if the probe found nothing (e.g. a non-Make generator or a cmake
+# regression): otherwise every target would skip and the job would go green
+# having tested nothing. webgpu_native_test is always defined at/after this diff.
+if ! printf '%s\n' "${DEFINED_TARGETS}" | grep -qw webgpu_native_test; then
+ echo "ERROR: cmake target probe returned no webgpu_native_test; aborting" >&2
+ exit 1
+fi
+
+for t in "${TARGETS[@]}"; do
+ if printf '%s\n' "${DEFINED_TARGETS}" | grep -qw "${t}"; then
+ # Defined target: build with stderr visible; set -e fails the job on a real
+ # build error (never silently skipped).
+ cmake --build "${BUILD_DIR}" --target "${t}" -j"${NPROC}"
+ echo "built ${t}"
+ else
+ echo "(target ${t} not defined in this tree — skipping)"
+ fi
+done
+
+echo "=== Run native tests on Dawn + SwiftShader ==="
+# native_test is model-driven; only run it if the export produced its .pte
+# (CI's setup-linux.sh provides the executorch wheel so exports succeed; a bare
+# local run without the wheel self-skips here rather than hard-failing on load).
+if [[ -x "${BIN_DIR}/webgpu_native_test" && -f "${PTE_MODEL}" ]]; then
+ env WEBGPU_TEST_MODEL="${PTE_MODEL}" \
+ WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \
+ WEBGPU_TEST_SDPA_DIR=/tmp/ \
+ "${BIN_DIR}/webgpu_native_test"
+else
+ echo "(skipping webgpu_native_test: no exported .pte — needs the executorch python wheel)"
+fi
+if [[ "${RMS_NORM_OK}" == "1" && -x "${BIN_DIR}/webgpu_rms_norm_test" ]]; then
+ "${BIN_DIR}/webgpu_rms_norm_test" "${RMS_NORM_DIR}"
+fi
+if [[ "${DISPATCH_ORDER_OK}" == "1" && -x "${BIN_DIR}/webgpu_dispatch_order_test" ]]; then
+ "${BIN_DIR}/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}"
+fi
+[[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test"
+
+echo "=== WebGPU native tests on Dawn: all run targets passed ==="
diff --git a/backends/webgpu/test/native/test_dispatch_order.cpp b/backends/webgpu/test/native/test_dispatch_order.cpp
new file mode 100644
index 00000000000..0f3eb5dea8e
--- /dev/null
+++ b/backends/webgpu/test/native/test_dispatch_order.cpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace executorch::backends::webgpu;
+using namespace executorch::extension;
+using namespace executorch::runtime;
+
+namespace {
+
+struct Case {
+ const char* name;
+ std::vector sizes;
+};
+
+// Mirrors _CASES in test_dispatch_order.py (add-chain or rms_norm+add chain).
+const std::vector kCases = {
+ {"single", {16, 16}},
+ {"chain3", {64, 64}},
+ {"chain5_tiny", {1, 1}},
+ {"chain5_wide", {7, 896}},
+ {"chain8", {256, 256}},
+ {"deep32", {128, 128}},
+ {"large_chain", {1024, 1024}},
+ {"het_small", {1, 1, 7, 896}},
+ {"het_deep", {1, 1, 5, 256}},
+};
+
+std::vector read_f32_bin(const std::string& path) {
+ std::ifstream f(path, std::ios::binary | std::ios::ate);
+ if (!f) {
+ return {};
+ }
+ const auto file_size = static_cast(f.tellg());
+ if (file_size % sizeof(float) != 0) {
+ return {}; // truncated/corrupt golden; caller treats empty as failure
+ }
+ f.seekg(0);
+ std::vector data(file_size / sizeof(float));
+ f.read(
+ reinterpret_cast(data.data()),
+ static_cast(file_size));
+ return data;
+}
+
+bool run_case(const std::string& dir, const Case& tc) {
+ printf("\n--- dispatch_order[%s] ---\n", tc.name);
+ const std::string base = dir + "/" + tc.name;
+ std::vector input = read_f32_bin(base + ".input.bin");
+ std::vector golden = read_f32_bin(base + ".golden.bin");
+ if (input.empty() || golden.empty()) {
+ printf("FAIL: could not read input/golden for %s\n", tc.name);
+ return false;
+ }
+
+ Module module(base + ".pte");
+ if (module.load_forward() != Error::Ok) {
+ printf("FAIL: could not load %s.pte\n", tc.name);
+ return false;
+ }
+
+ size_t expected = 1;
+ for (int32_t d : tc.sizes) {
+ expected *= static_cast(d);
+ }
+ if (input.size() != expected) {
+ printf(
+ "FAIL: input numel %zu != expected %zu for %s\n",
+ input.size(),
+ expected,
+ tc.name);
+ return false;
+ }
+ auto x = make_tensor_ptr(tc.sizes, std::vector(input));
+ auto result = module.forward({EValue(x)});
+ if (!result.ok()) {
+ printf("FAIL: forward failed (error %d)\n", (int)result.error());
+ return false;
+ }
+ const auto& outputs = result.get();
+ if (outputs.empty() || !outputs[0].isTensor()) {
+ printf("FAIL: no tensor output\n");
+ return false;
+ }
+ const auto& out_tensor = outputs[0].toTensor();
+ if (static_cast(out_tensor.numel()) != golden.size()) {
+ printf(
+ "FAIL: output numel %zu != golden %zu\n",
+ (size_t)out_tensor.numel(),
+ golden.size());
+ return false;
+ }
+ const float* out_data = out_tensor.const_data_ptr();
+
+ float max_abs_err = 0.0f;
+ float max_rel_err = 0.0f;
+ for (size_t i = 0; i < golden.size(); i++) {
+ const float abs_err = std::abs(out_data[i] - golden[i]);
+ max_abs_err = std::max(max_abs_err, abs_err);
+ const float denom = std::max(std::abs(golden[i]), 1e-6f);
+ max_rel_err = std::max(max_rel_err, abs_err / denom);
+ }
+ printf(
+ "Max abs error: %e Max rel error: %e (%zu elements)\n",
+ max_abs_err,
+ max_rel_err,
+ golden.size());
+ // Lenient gate: pass iff abs<=tol OR rel<=tol (near-zero goldens).
+ if (max_abs_err > 1e-3f && max_rel_err > 1e-3f) {
+ printf("FAIL: dispatch_order[%s] exceeds tolerance 1e-3\n", tc.name);
+ return false;
+ }
+ printf("PASS: dispatch_order[%s]\n", tc.name);
+ return true;
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+ std::string dir = "/tmp/dispatch_order";
+ if (argc > 1) {
+ dir = argv[1];
+ }
+ if (const char* env = std::getenv("WEBGPU_DISPATCH_ORDER_DIR")) {
+ dir = env;
+ }
+
+ WebGPUContext ctx;
+ try {
+ ctx = create_webgpu_context();
+ } catch (const std::exception& e) {
+ printf("SKIP: %s\n", e.what());
+ return 0;
+ }
+ set_default_webgpu_context(&ctx);
+ printf("WebGPU device acquired (native); case dir: %s\n", dir.c_str());
+
+ bool ok = true;
+ for (const auto& tc : kCases) {
+ ok = run_case(dir, tc) && ok;
+ }
+
+ set_default_webgpu_context(nullptr);
+ destroy_webgpu_context(ctx);
+
+ if (!ok) {
+ return 1;
+ }
+ printf("\nAll dispatch_order tests passed\n");
+ return 0;
+}
diff --git a/backends/webgpu/test/native/test_scratch_buffer.cpp b/backends/webgpu/test/native/test_scratch_buffer.cpp
new file mode 100644
index 00000000000..7a4df6e9d00
--- /dev/null
+++ b/backends/webgpu/test/native/test_scratch_buffer.cpp
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// White-box unit tests for WebGPUGraph::create_scratch_buffer.
+
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace executorch::backends::webgpu;
+
+namespace {
+
+struct MapCb {
+ std::atomic status{WGPUMapAsyncStatus_Error};
+};
+
+void map_cb(
+ WGPUMapAsyncStatus status,
+ WGPUStringView /*message*/,
+ void* userdata1,
+ void* /*userdata2*/) {
+ auto* d = static_cast(userdata1);
+ d->status.store(status, std::memory_order_release);
+}
+
+// Copy `src` (must carry CopySrc) into a staging buffer and read it back.
+std::vector readback(
+ WGPUInstance instance,
+ WGPUDevice device,
+ WGPUQueue queue,
+ WGPUBuffer src,
+ size_t nbytes) {
+ WGPUBufferDescriptor sd = {};
+ sd.size = nbytes;
+ sd.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst;
+ WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &sd);
+
+ WGPUCommandEncoderDescriptor ed = {};
+ WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, &ed);
+ wgpuCommandEncoderCopyBufferToBuffer(enc, src, 0, staging, 0, nbytes);
+ WGPUCommandBufferDescriptor cd = {};
+ WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, &cd);
+ wgpuQueueSubmit(queue, 1, &cmd);
+ wgpuCommandBufferRelease(cmd);
+ wgpuCommandEncoderRelease(enc);
+
+ MapCb cb;
+ WGPUBufferMapCallbackInfo ci = {};
+ ci.mode = WGPUCallbackMode_WaitAnyOnly;
+ ci.callback = map_cb;
+ ci.userdata1 = &cb;
+ webgpu_wait(
+ instance, wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, nbytes, ci));
+
+ std::vector out(nbytes / sizeof(float));
+ if (cb.status.load(std::memory_order_acquire) == WGPUMapAsyncStatus_Success) {
+ const void* m = wgpuBufferGetConstMappedRange(staging, 0, nbytes);
+ if (m != nullptr) {
+ std::memcpy(out.data(), m, nbytes);
+ }
+ wgpuBufferUnmap(staging);
+ }
+ wgpuBufferRelease(staging);
+ return out;
+}
+
+// Tier 1: allocation, zero-size guard, distinct non-null handles.
+bool tier1_alloc(WGPUDevice device) {
+ printf("\n--- scratch[tier1: allocation] ---\n");
+ WebGPUGraph g;
+ g.set_device(device);
+ WGPUBuffer a = g.create_scratch_buffer(64 * sizeof(float));
+ WGPUBuffer z = g.create_scratch_buffer(0); // guarded to 4 bytes
+ WGPUBuffer b = g.create_scratch_buffer(64 * sizeof(float));
+ const bool ok = a && z && b && a != b && a != z && b != z;
+ printf(ok ? "PASS: allocation\n" : "FAIL: allocation\n");
+ return ok; // graph dtor releases all three here
+}
+
+// Tier 2: host->scratch write, scratch->staging copy, read-back round-trip.
+bool tier2_roundtrip(
+ WGPUInstance instance,
+ WGPUDevice device,
+ WGPUQueue queue) {
+ printf("\n--- scratch[tier2: copy round-trip] ---\n");
+ bool ok = true;
+ for (int n : {1, 7, 1024}) {
+ WebGPUGraph g;
+ g.set_device(device);
+ WGPUBuffer s = g.create_scratch_buffer(n * sizeof(float));
+ std::vector in(n);
+ for (int i = 0; i < n; i++) {
+ in[i] = static_cast(i) * 0.5f + 1.0f;
+ }
+ wgpuQueueWriteBuffer(queue, s, 0, in.data(), n * sizeof(float));
+ std::vector back =
+ readback(instance, device, queue, s, n * sizeof(float));
+ float max_err = 0.0f;
+ for (int i = 0; i < n; i++) {
+ max_err = std::max(max_err, std::abs(back[i] - in[i]));
+ }
+ printf(" n=%d max abs error %e\n", n, max_err);
+ if (max_err != 0.0f) { // pure copy: must be bit-exact
+ ok = false;
+ }
+ }
+ printf(ok ? "PASS: copy round-trip\n" : "FAIL: copy round-trip\n");
+ return ok;
+}
+
+// Tier 3a: bind scratch as a Storage buffer in a compute pass (its real use).
+bool tier3_compute(WGPUInstance instance, WGPUDevice device, WGPUQueue queue) {
+ printf("\n--- scratch[tier3: compute Storage round-trip] ---\n");
+ const int n = 256;
+ WebGPUGraph g;
+ g.set_device(device);
+ WGPUBuffer s = g.create_scratch_buffer(n * sizeof(float));
+
+ const char* kWgsl =
+ "@group(0) @binding(0) var buf: array;\n"
+ "@compute @workgroup_size(64)\n"
+ "fn main(@builtin(global_invocation_id) gid: vec3) {\n"
+ " let i = gid.x;\n"
+ " if (i < arrayLength(&buf)) { buf[i] = f32(i) * 2.0 + 1.0; }\n"
+ "}\n";
+
+ WGPUShaderSourceWGSL wgsl = {};
+ wgsl.chain.sType = WGPUSType_ShaderSourceWGSL;
+ wgsl.code = {kWgsl, WGPU_STRLEN};
+ WGPUShaderModuleDescriptor smd = {};
+ smd.nextInChain = &wgsl.chain;
+ WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &smd);
+
+ WGPUBindGroupLayoutEntry ble = {};
+ ble.binding = 0;
+ ble.visibility = WGPUShaderStage_Compute;
+ ble.buffer.type = WGPUBufferBindingType_Storage;
+ WGPUBindGroupLayoutDescriptor bld = {};
+ bld.entryCount = 1;
+ bld.entries = &ble;
+ WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bld);
+
+ WGPUPipelineLayoutDescriptor pld = {};
+ pld.bindGroupLayoutCount = 1;
+ pld.bindGroupLayouts = &bgl;
+ WGPUPipelineLayout pl = wgpuDeviceCreatePipelineLayout(device, &pld);
+
+ WGPUComputePipelineDescriptor cpd = {};
+ cpd.layout = pl;
+ cpd.compute.module = shader;
+ cpd.compute.entryPoint = {"main", WGPU_STRLEN};
+ WGPUComputePipeline pipe = wgpuDeviceCreateComputePipeline(device, &cpd);
+
+ WGPUBindGroupEntry bge = {};
+ bge.binding = 0;
+ bge.buffer = s;
+ bge.size = n * sizeof(float);
+ WGPUBindGroupDescriptor bgd = {};
+ bgd.layout = bgl;
+ bgd.entryCount = 1;
+ bgd.entries = &bge;
+ WGPUBindGroup bg = wgpuDeviceCreateBindGroup(device, &bgd);
+
+ WGPUCommandEncoderDescriptor ed = {};
+ WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, &ed);
+ WGPUComputePassDescriptor pd = {};
+ WGPUComputePassEncoder pass = wgpuCommandEncoderBeginComputePass(enc, &pd);
+ wgpuComputePassEncoderSetPipeline(pass, pipe);
+ wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, nullptr);
+ wgpuComputePassEncoderDispatchWorkgroups(pass, (n + 63) / 64, 1, 1);
+ wgpuComputePassEncoderEnd(pass);
+ wgpuComputePassEncoderRelease(pass);
+ WGPUCommandBufferDescriptor cd = {};
+ WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, &cd);
+ wgpuQueueSubmit(queue, 1, &cmd);
+ wgpuCommandBufferRelease(cmd);
+ wgpuCommandEncoderRelease(enc);
+
+ std::vector back =
+ readback(instance, device, queue, s, n * sizeof(float));
+ float max_err = 0.0f;
+ for (int i = 0; i < n; i++) {
+ const float expected = static_cast(i) * 2.0f + 1.0f;
+ max_err = std::max(max_err, std::abs(back[i] - expected));
+ }
+ printf(" max abs error %e (%d elements)\n", max_err, n);
+
+ wgpuBindGroupRelease(bg);
+ wgpuComputePipelineRelease(pipe);
+ wgpuPipelineLayoutRelease(pl);
+ wgpuBindGroupLayoutRelease(bgl);
+ wgpuShaderModuleRelease(shader);
+
+ const bool ok = max_err == 0.0f;
+ printf(
+ ok ? "PASS: compute Storage round-trip\n" : "FAIL: compute round-trip\n");
+ return ok;
+}
+
+// Tier 3b: many scratch buffers across repeated graphs; dtor must release all.
+bool tier3_lifecycle(WGPUDevice device) {
+ printf("\n--- scratch[tier3: lifecycle/stress] ---\n");
+ bool ok = true;
+ for (int iter = 0; iter < 50; iter++) {
+ WebGPUGraph g;
+ g.set_device(device);
+ for (int k = 0; k < 256; k++) {
+ WGPUBuffer b =
+ g.create_scratch_buffer(static_cast(k % 17) * sizeof(float));
+ ok = ok && b != nullptr;
+ }
+ } // each graph's dtor releases its 256 buffers here
+ printf(
+ ok ? "PASS: lifecycle/stress (50 graphs x 256 buffers)\n"
+ : "FAIL: lifecycle/stress (null buffer)\n");
+ return ok;
+}
+
+} // namespace
+
+int main() {
+ WebGPUContext ctx;
+ try {
+ ctx = create_webgpu_context();
+ } catch (const std::exception& e) {
+ printf("SKIP: %s\n", e.what());
+ return 0;
+ }
+ set_default_webgpu_context(&ctx);
+ printf("WebGPU device acquired (native)\n");
+
+ bool ok = true;
+ ok = tier1_alloc(ctx.device) && ok;
+ ok = tier2_roundtrip(ctx.instance, ctx.device, ctx.queue) && ok;
+ ok = tier3_compute(ctx.instance, ctx.device, ctx.queue) && ok;
+ ok = tier3_lifecycle(ctx.device) && ok;
+
+ set_default_webgpu_context(nullptr);
+ destroy_webgpu_context(ctx);
+
+ if (!ok) {
+ return 1;
+ }
+ printf("\nAll scratch_buffer tests passed\n");
+ return 0;
+}
diff --git a/backends/webgpu/test/ops/dispatch_order/__init__.py b/backends/webgpu/test/ops/dispatch_order/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py b/backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py
new file mode 100644
index 00000000000..fbb13ff6426
--- /dev/null
+++ b/backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py
@@ -0,0 +1,119 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Multi-dispatch ordering coverage for WebGPUGraph::execute().
+
+Each model is a dependency chain whose dispatches must execute in order (one
+compute pass per dispatch is the implicit barrier). Vehicle A is a single-input
+add self-chain; Vehicle B chains add on a reused RmsNormModule (a heterogeneous
+cross-pipeline RAW edge). Numerics are checked in test/native/test_dispatch_order.cpp.
+"""
+
+import os
+import unittest
+
+import torch
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import RmsNormModule
+from executorch.backends.webgpu.test.tester import WEBGPU_SUPPORTED_OPS
+from executorch.exir import to_edge_transform_and_lower
+
+
+class ChainAddModule(torch.nn.Module):
+ """z = x + x; z = z + x; ... (depth adds) -> (depth + 1) * x."""
+
+ def __init__(self, depth: int) -> None:
+ super().__init__()
+ self.depth = depth
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ z = x + x
+ for _ in range(self.depth - 1):
+ z = z + x
+ return z
+
+
+class RmsNormAddModule(torch.nn.Module):
+ """t = rms_norm(x); z = t + x; ... (adds adds) -- heterogeneous RAW chain."""
+
+ def __init__(self, width: int, adds: int) -> None:
+ super().__init__()
+ self.rms = RmsNormModule(width, eps=1e-6)
+ self.adds = adds
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ z = self.rms(x) + x
+ for _ in range(self.adds - 1):
+ z = z + x
+ return z
+
+
+# (name, kind, shape, depth) -- MUST match kCases in test_dispatch_order.cpp.
+_CASES = [
+ ("single", "chain", (16, 16), 1),
+ ("chain3", "chain", (64, 64), 3),
+ ("chain5_tiny", "chain", (1, 1), 5),
+ ("chain5_wide", "chain", (7, 896), 5),
+ ("chain8", "chain", (256, 256), 8),
+ ("deep32", "chain", (128, 128), 32),
+ ("large_chain", "chain", (1024, 1024), 6),
+ ("het_small", "rms", (1, 1, 7, 896), 2),
+ ("het_deep", "rms", (1, 1, 5, 256), 3),
+]
+
+
+def _model(kind: str, shape, depth: int) -> torch.nn.Module:
+ if kind == "chain":
+ return ChainAddModule(depth)
+ return RmsNormAddModule(shape[-1], depth)
+
+
+def _lower(model: torch.nn.Module, x: torch.Tensor):
+ ep = torch.export.export(model, (x,))
+ return to_edge_transform_and_lower(
+ ep,
+ partitioner=[VulkanPartitioner(operator_allowlist=WEBGPU_SUPPORTED_OPS)],
+ ).to_executorch()
+
+
+class TestDispatchOrder(unittest.TestCase):
+ def _assert_delegated(self, prog) -> None:
+ found = any(
+ d.id == "VulkanBackend"
+ for p in prog.executorch_program.execution_plan
+ for d in p.delegates
+ )
+ self.assertTrue(found, "Expected VulkanBackend delegate in .pte")
+
+ def test_chain_add(self) -> None:
+ self._assert_delegated(_lower(ChainAddModule(5), torch.randn(64, 64)))
+
+ def test_rms_norm_add(self) -> None:
+ self._assert_delegated(
+ _lower(RmsNormAddModule(896, 2), torch.randn(1, 1, 7, 896))
+ )
+
+
+def export_dispatch_order_cases(out_dir: str) -> None:
+ """Write .pte, .input.bin, .golden.bin (raw le fp32) per case."""
+ os.makedirs(out_dir, exist_ok=True)
+ torch.manual_seed(0)
+ for name, kind, shape, depth in _CASES:
+ x = torch.randn(*shape)
+ model = _model(kind, shape, depth)
+ prog = _lower(model, x)
+ with torch.no_grad():
+ golden = model(x)
+ base = os.path.join(out_dir, name)
+ x.detach().cpu().numpy().astype("