From 9ac735e281978f4dc9c7234a150078548923b9a7 Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Wed, 10 Jun 2026 10:13:29 -0700 Subject: [PATCH 1/2] [ExecuTorch][WebGPU] Switch native backend from wgpu-native to Dawn (Tint) + SwiftShader Pull Request resolved: https://github.com/pytorch/executorch/pull/20079 Make Dawn (Chrome's WebGPU implementation, whose WGSL compiler Tint is the spec reference) running on SwiftShader the sole native WebGPU backend, replacing wgpu-native (naga), so the op tests run on a spec-faithful, headless, deterministic CLI backend. The `WEBGPU_IMPL` cache variable, the wgpu-native CMake branch, and the `WEBGPU_IMPL_DAWN` compile define are removed -- CMake now unconditionally `find_package(Dawn REQUIRED)` and links `dawn::webgpu_dawn`. `WebGPUCompat.h` drives pending callbacks via Dawn's `wgpuInstanceProcessEvents` on native and yields to the JS event loop under Emscripten. Dawn is vendored with NO new S3 artifact: `oss/.ci/scripts/setup-webgpu-linux-deps.sh` downloads Google's official `ubuntu-latest-Release` prebuilt directly from github.com/google/dawn/releases (pinned tag + sha256, the same pattern as `setup-wgpu-native.sh`), and reuses the SwiftShader prebuilt already on the ossci bucket. The release exports `dawn::webgpu_dawn` (a static lib) which drops into the existing `find_package(Dawn)`. It has no bundled SwiftShader, so `WebGPUDevice.cpp` requests a normal Vulkan adapter (`forceFallbackAdapter=false`) and `VK_ICD_FILENAMES` makes SwiftShader the only device. The release is built with a recent GCC, so the deps script also pulls a current libstdc++ from the `ubuntu-toolchain-r` PPA (its lib references `_M_replace_cold`, a GCC 13+ symbol) plus `libvulkan1` (Dawn dlopens the Vulkan loader) -- all scoped to the WebGPU CI job, backward-compatible, no repo-wide impact. Authored with assistance from Claude. ghstack-source-id: 391968389 @exported-using-ghexport Differential Revision: [D107589774](https://our.internmc.facebook.com/intern/diff/D107589774/) --- .ci/scripts/setup-webgpu-linux-deps.sh | 99 +++++++++++--- .ci/scripts/test_backend.sh | 5 +- .github/workflows/test-webgpu-native.yml | 54 ++++++++ backends/webgpu/CMakeLists.txt | 93 ++++---------- backends/webgpu/runtime/WebGPUCompat.h | 24 ++++ backends/webgpu/runtime/WebGPUDevice.cpp | 45 ++++--- backends/webgpu/runtime/WebGPUGraph.cpp | 17 ++- backends/webgpu/scripts/setup-wgpu-native.sh | 58 --------- .../webgpu/scripts/test_webgpu_native_ci.sh | 121 ++++++++++++++++++ backends/webgpu/test/test_build_webgpu.sh | 18 ++- 10 files changed, 352 insertions(+), 182 deletions(-) create mode 100644 .github/workflows/test-webgpu-native.yml create mode 100644 backends/webgpu/runtime/WebGPUCompat.h delete mode 100755 backends/webgpu/scripts/setup-wgpu-native.sh create mode 100644 backends/webgpu/scripts/test_webgpu_native_ci.sh diff --git a/.ci/scripts/setup-webgpu-linux-deps.sh b/.ci/scripts/setup-webgpu-linux-deps.sh index 8ece5899489..b24ffb460a9 100644 --- a/.ci/scripts/setup-webgpu-linux-deps.sh +++ b/.ci/scripts/setup-webgpu-linux-deps.sh @@ -5,26 +5,93 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +# Vendor Dawn (Tint) + SwiftShader for the WebGPU backend CI WITHOUT hosting a +# private prebuilt: +# * Dawn : Google's official nightly prebuilt, downloaded directly from +# github.com/google/dawn/releases (pinned tag+rev+sha256) -- the same +# "fetch a pinned upstream prebuilt" pattern used for other CI deps. +# * SwiftShader : built from source at a pinned rev compatible with the Dawn +# above (the ossci prebuilt is from 2020, too old for current Dawn). No S3. +# Dawn (Chrome's WebGPU impl; its WGSL compiler Tint is the spec reference) on +# SwiftShader gives a headless, deterministic, spec-faithful CLI backend. +# +# Exports Dawn_DIR / VK_ICD_FILENAMES / LD_LIBRARY_PATH for the cmake build+run. +# Local/rig override: set DAWN_PREBUILT_DIR= to +# skip the Dawn download. set -ex -# SwiftShader: software Vulkan adapter for GPU-less CI (LunarG SDK not needed). -install_swiftshader() { - _https_amazon_aws=https://ossci-android.s3.amazonaws.com - _swiftshader_archive=swiftshader-abe07b943-prebuilt.tar.gz - _swiftshader_dir=/tmp/swiftshader - mkdir -p $_swiftshader_dir +# --- pinned versions (bump rev+sha together when upgrading Dawn) -------------- +DAWN_TAG="${DAWN_TAG:-v20260423.175430}" +DAWN_REV="${DAWN_REV:-31e25af254ab572c77054edec4946d2244e184dd}" +DAWN_SHA256="${DAWN_SHA256:-ac76fac090162dc1ecea5ed0f28a557bb8f49efc47faab01886105ace82b7b64}" +# SwiftShader rev verified compatible with DAWN_REV (the old ossci prebuilt is +# from 2020 and is incompatible with current Dawn -> no adapter / zero compute). +SWIFTSHADER_REV="${SWIFTSHADER_REV:-9898204d91d6a60b6a08ad74fe4ac52a6913111b}" - _tmp_archive="/tmp/${_swiftshader_archive}" +_dawn_dir="${DAWN_PREBUILT_DIR:-/tmp/dawn-ci}" +_ss_dir=/tmp/swiftshader - curl --silent --show-error --location --fail --retry 3 --retry-all-errors \ - --output "${_tmp_archive}" "$_https_amazon_aws/${_swiftshader_archive}" +# --- toolchain prereqs -------------------------------------------------------- +# Dawn dlopens the system Vulkan loader at runtime (libvulkan1). And the +# ubuntu-latest prebuilt is built with a bleeding-edge GCC: it references +# libstdc++ symbols newer than ubuntu-22.04's default (e.g. _M_replace_cold, +# GCC 13+), so the static .a won't link against the stock runtime. Pull a current +# libstdc++ from the ubuntu-toolchain-r PPA when the symbol floor isn't met. All +# of this is scoped to the WebGPU CI job; newer libstdc++ is backward-compatible. +if command -v apt-get >/dev/null 2>&1; then + _SUDO=""; command -v sudo >/dev/null 2>&1 && _SUDO="sudo" + ${_SUDO} apt-get update -y || true + ${_SUDO} apt-get install -y libvulkan1 software-properties-common || true + if ! strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 2>/dev/null \ + | grep -q "GLIBCXX_3.4.32"; then + ${_SUDO} add-apt-repository -y ppa:ubuntu-toolchain-r/test || true + ${_SUDO} apt-get update -y || true + ${_SUDO} apt-get install -y libstdc++6 || true # newest GCC runtime + fi +fi - tar -C "${_swiftshader_dir}" -xzf "${_tmp_archive}" +# The native binaries / pybind lib run INSIDE the CI conda env, whose libstdc++ +# predates GLIBCXX_3.4.32 (the Dawn prebuilt's floor) -- the same wall ssjia hit +# for the vulkan op tests. Upgrade the conda runtime libstdc++ so the loaded +# libstdc++.so.6 (conda's, not the system one) satisfies Dawn at run time. +if command -v conda >/dev/null 2>&1; then + conda install -y -c conda-forge "libstdcxx-ng>=14" || true +fi + +# --- Dawn: official prebuilt from GitHub (no S3) ------------------------------ +mkdir -p "${_dawn_dir}" +if [[ ! -d "${_dawn_dir}/lib64/cmake/Dawn" ]]; then + _dawn_tar="/tmp/Dawn-${DAWN_REV}-ubuntu-latest-Release.tar.gz" + curl --silent --show-error --location --fail --retry 3 --retry-all-errors \ + --output "${_dawn_tar}" \ + "https://github.com/google/dawn/releases/download/${DAWN_TAG}/Dawn-${DAWN_REV}-ubuntu-latest-Release.tar.gz" + echo "${DAWN_SHA256} ${_dawn_tar}" | sha256sum -c - + # archive top dir is Dawn--ubuntu-latest-Release/{lib64,include,bin} + tar -C "${_dawn_dir}" --strip-components=1 -xzf "${_dawn_tar}" +fi - export VK_ICD_FILENAMES="${_swiftshader_dir}/swiftshader/build/Linux/vk_swiftshader_icd.json" - export LD_LIBRARY_PATH="${_swiftshader_dir}/swiftshader/build/Linux/:${LD_LIBRARY_PATH}" - export ETVK_USING_SWIFTSHADER=1 -} +# --- SwiftShader: build from source at a pinned rev (no S3) ------------------- +# The old ossci prebuilt (swiftshader-abe07b943, 2020) is incompatible with the +# current Dawn; build a matching modern SwiftShader instead. Self-contained +# cmake build (vendored LLVM); the ICD lands under build//. +if [[ ! -d "${_ss_dir}/build" ]]; then + if [[ ! -d "${_ss_dir}/.git" ]]; then + git clone https://github.com/google/swiftshader "${_ss_dir}" + fi + git -C "${_ss_dir}" checkout "${SWIFTSHADER_REV}" + # vk_swiftshader's deps are vendored in-tree; tolerate unreachable + # disabled-feature submodules (angle, test-only) failing to fetch. + git -C "${_ss_dir}" submodule update --init --recursive || true + cmake -S "${_ss_dir}" -B "${_ss_dir}/build" -DCMAKE_BUILD_TYPE=Release \ + -DSWIFTSHADER_BUILD_TESTS=OFF -DSWIFTSHADER_BUILD_PVR=OFF \ + -DSWIFTSHADER_BUILD_BENCHMARKS=OFF + cmake --build "${_ss_dir}/build" --parallel "$(nproc)" --target vk_swiftshader +fi +_ss_icd="$(find "${_ss_dir}/build" -name vk_swiftshader_icd.json 2>/dev/null | head -1)" +[[ -n "${_ss_icd}" ]] || { echo "ERROR: SwiftShader ICD not found after build" >&2; exit 1; } -install_swiftshader -bash backends/webgpu/scripts/setup-wgpu-native.sh +_ss_libdir="$(dirname "${_ss_icd}")" +export Dawn_DIR="${_dawn_dir}/lib64/cmake/Dawn" +export VK_ICD_FILENAMES="${_ss_icd}" +export LD_LIBRARY_PATH="${_ss_libdir}:${LD_LIBRARY_PATH:-}" +export WEBGPU_USING_SWIFTSHADER=1 diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh index fe9b564a18f..8b7a36cd79d 100755 --- a/.ci/scripts/test_backend.sh +++ b/.ci/scripts/test_backend.sh @@ -58,11 +58,10 @@ if [[ "$FLOW" == *vulkan* ]]; then fi if [[ "$FLOW" == *webgpu* ]]; then - # Setup swiftshader (software Vulkan adapter for GPU-less runners) and wgpu-native, - # which are required to build and run the WebGPU delegate. + # Dawn (Tint) + SwiftShader, the spec-faithful headless WebGPU backend. source .ci/scripts/setup-webgpu-linux-deps.sh - EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_WEBGPU=ON" + EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_WEBGPU=ON -DDawn_DIR=$Dawn_DIR" fi if [[ "$FLOW" == *arm* ]]; then diff --git a/.github/workflows/test-webgpu-native.yml b/.github/workflows/test-webgpu-native.yml new file mode 100644 index 00000000000..7220ef9f7b5 --- /dev/null +++ b/.github/workflows/test-webgpu-native.yml @@ -0,0 +1,54 @@ +name: Test WebGPU Native (Dawn) + +# The substantive WebGPU op-coverage gate. The shared operators suite only +# delegates add.Tensor to WebGPU (everything else is CPU fallback), so the real +# Dawn coverage comes from the native test executables (rms_norm, multi-dispatch +# ordering, scratch). This runs them on Dawn (Tint) + SwiftShader, headless, on a +# CPU runner -- separate from _test_backend.yml so that reusable template stays +# untouched. + +# Nightly-only for now: this job builds SwiftShader from source (vendored LLVM), +# which is too expensive to run on every PR while the workflow's reliability is +# still being established. Once it has proven stable, re-enable a scoped PR +# trigger with a paths: filter (backends/webgpu/**, the webgpu CI scripts, and +# this file). The pull_request-aware ref/concurrency expressions below are kept +# intentionally so that re-enable is a one-line change. +on: + schedule: + - cron: 0 2 * * * + push: + branches: + - main + - release/* + tags: + - ciflow/nightly/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + test-webgpu-native: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + runner: linux.4xlarge.memory + docker-image: ci-image:executorch-ubuntu-22.04-clang12 + submodules: recursive + timeout: 120 + script: | + set -eux + + # The generic Linux job uses the base conda env, not the image's; activate + # the image env (it has the pinned from-source torch). Mirrors + # test-vulkan-operators-linux in pull.yml. + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + conda activate "${CONDA_ENV}" + + # Install the python package + runtime deps (the .pte exporters). + PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake + + # Vendor Dawn (Tint) + SwiftShader, then build + run the native executables. + source .ci/scripts/setup-webgpu-linux-deps.sh + bash backends/webgpu/scripts/test_webgpu_native_ci.sh diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index 719d86b3008..d6ad80fdd20 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -54,29 +54,14 @@ target_include_directories( target_link_libraries(webgpu_backend PRIVATE vulkan_schema executorch_core) -# Native build: link against wgpu-native -set(WGPU_NATIVE_DIR - "${CMAKE_CURRENT_SOURCE_DIR}/third-party/wgpu-native" - CACHE PATH "Path to wgpu-native installation" -) - -# Link the shared lib; the static .a carries LLVM bitcode that breaks LTO. -# Suffix resolves per platform: .so on Linux, .dylib on macOS. -set(WGPU_LIB_NAME "libwgpu_native${CMAKE_SHARED_LIBRARY_SUFFIX}") -set(WGPU_LIB "${WGPU_NATIVE_DIR}/lib/${WGPU_LIB_NAME}") -if(NOT EXISTS "${WGPU_LIB}") - message(FATAL_ERROR "wgpu-native not found at ${WGPU_NATIVE_DIR}. " - "Run: bash backends/webgpu/scripts/setup-wgpu-native.sh" - ) -endif() - -add_library(wgpu_native SHARED IMPORTED) -set_target_properties(wgpu_native PROPERTIES IMPORTED_LOCATION "${WGPU_LIB}") - -target_include_directories( - webgpu_backend PUBLIC $ -) -target_link_libraries(webgpu_backend PRIVATE wgpu_native) +# Native WebGPU backend: Dawn (Tint) + SwiftShader; deps script sets Dawn_DIR. +# Native-only: browser/Emscripten builds use the system webgpu.h and never reach +# this find_package (root CMake gates it via EXECUTORCH_BUILD_WEBGPU). +# dawn::webgpu_dawn's link interface references Threads::Threads. +find_package(Threads REQUIRED) +find_package(Dawn REQUIRED) +set(WEBGPU_GPU_LIB dawn::webgpu_dawn) +target_link_libraries(webgpu_backend PUBLIC ${WEBGPU_GPU_LIB}) if(APPLE) target_link_libraries( @@ -100,50 +85,17 @@ install( DESTINATION ${CMAKE_INSTALL_LIBDIR} ) -# Native test target -if(EXECUTORCH_BUILD_WEBGPU_TEST) - add_executable(webgpu_native_test test/test_webgpu_native.cpp) - - target_include_directories( - webgpu_native_test PRIVATE $ - "${WGPU_NATIVE_DIR}/include" - ) - - target_link_libraries( - webgpu_native_test - PRIVATE webgpu_backend - wgpu_native - executorch_core - extension_module_static - extension_data_loader - extension_tensor - portable_kernels - portable_ops_lib - ) - - if(APPLE) - target_link_libraries( - webgpu_native_test PRIVATE "-framework Metal" "-framework QuartzCore" - "-framework CoreGraphics" - ) - else() - target_link_libraries(webgpu_native_test PRIVATE dl m pthread) - endif() - - target_compile_options(webgpu_native_test PRIVATE -fexceptions) - set_property(TARGET webgpu_native_test PROPERTY CXX_STANDARD 17) - - add_executable(webgpu_rms_norm_test test/native/test_rms_norm.cpp) - +# Native test targets. Helper mirrors backends/vulkan's vulkan_op_test: every +# test executable links the same backend + runtime libs. +function(add_webgpu_native_test test_name test_src) + add_executable(${test_name} ${test_src}) target_include_directories( - webgpu_rms_norm_test PRIVATE $ - "${WGPU_NATIVE_DIR}/include" + ${test_name} PRIVATE $ ) - target_link_libraries( - webgpu_rms_norm_test + ${test_name} PRIVATE webgpu_backend - wgpu_native + ${WEBGPU_GPU_LIB} executorch_core extension_module_static extension_data_loader @@ -151,16 +103,19 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST) portable_kernels portable_ops_lib ) - if(APPLE) target_link_libraries( - webgpu_rms_norm_test PRIVATE "-framework Metal" "-framework QuartzCore" - "-framework CoreGraphics" + ${test_name} PRIVATE "-framework Metal" "-framework QuartzCore" + "-framework CoreGraphics" ) else() - target_link_libraries(webgpu_rms_norm_test PRIVATE dl m pthread) + target_link_libraries(${test_name} PRIVATE dl m pthread) endif() + target_compile_options(${test_name} PRIVATE -fexceptions) + set_property(TARGET ${test_name} PROPERTY CXX_STANDARD 17) +endfunction() - target_compile_options(webgpu_rms_norm_test PRIVATE -fexceptions) - set_property(TARGET webgpu_rms_norm_test PROPERTY CXX_STANDARD 17) +if(EXECUTORCH_BUILD_WEBGPU_TEST) + add_webgpu_native_test(webgpu_native_test test/test_webgpu_native.cpp) + add_webgpu_native_test(webgpu_rms_norm_test test/native/test_rms_norm.cpp) endif() diff --git a/backends/webgpu/runtime/WebGPUCompat.h b/backends/webgpu/runtime/WebGPUCompat.h new file mode 100644 index 00000000000..06715e0fc81 --- /dev/null +++ b/backends/webgpu/runtime/WebGPUCompat.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace executorch::backends::webgpu { + +// Caller's instance must enable TimedWaitAny; returns the WaitAny status. +inline WGPUWaitStatus webgpu_wait(WGPUInstance instance, WGPUFuture future) { + WGPUFutureWaitInfo info = {}; + info.future = future; + return wgpuInstanceWaitAny(instance, 1, &info, UINT64_MAX); +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/WebGPUDevice.cpp b/backends/webgpu/runtime/WebGPUDevice.cpp index a5bbf8e5806..041cbe5a703 100644 --- a/backends/webgpu/runtime/WebGPUDevice.cpp +++ b/backends/webgpu/runtime/WebGPUDevice.cpp @@ -6,6 +6,7 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include @@ -21,12 +22,10 @@ namespace { struct AdapterResult { WGPUAdapter adapter = nullptr; - bool done = false; }; struct DeviceResult { WGPUDevice device = nullptr; - bool done = false; }; void on_adapter_request( @@ -46,7 +45,6 @@ void on_adapter_request( static_cast(message.length), message.data); } - result->done = true; } void on_device_request( @@ -66,7 +64,6 @@ void on_device_request( static_cast(message.length), message.data); } - result->done = true; } void on_device_error( @@ -88,25 +85,36 @@ void on_device_error( WebGPUContext create_webgpu_context() { WebGPUContext ctx; - ctx.instance = wgpuCreateInstance(nullptr); + // TimedWaitAny lets webgpu_wait() block on futures via wgpuInstanceWaitAny. + WGPUInstanceDescriptor instance_desc = {}; +#if defined(__EMSCRIPTEN__) + instance_desc.capabilities.timedWaitAnyEnable = true; + instance_desc.capabilities.timedWaitAnyMaxCount = 1; +#else + WGPUInstanceFeatureName features[1] = {WGPUInstanceFeatureName_TimedWaitAny}; + instance_desc.requiredFeatureCount = 1; + instance_desc.requiredFeatures = features; +#endif + ctx.instance = wgpuCreateInstance(&instance_desc); if (!ctx.instance) { throw std::runtime_error("Failed to create WebGPU instance"); } - // Request adapter using AllowSpontaneous mode (fires during - // wgpuInstanceProcessEvents or any other API call). AdapterResult adapter_result; WGPURequestAdapterCallbackInfo adapter_cb = {}; - adapter_cb.mode = WGPUCallbackMode_AllowSpontaneous; + adapter_cb.mode = WGPUCallbackMode_WaitAnyOnly; adapter_cb.callback = on_adapter_request; adapter_cb.userdata1 = &adapter_result; - wgpuInstanceRequestAdapter(ctx.instance, nullptr, adapter_cb); - while (!adapter_result.done) { - wgpuInstanceProcessEvents(ctx.instance); - } + // No backend pin or forced fallback; Dawn auto-selects the adapter. + WGPURequestAdapterOptions adapter_opts = {}; + adapter_opts.powerPreference = WGPUPowerPreference_HighPerformance; + adapter_opts.forceFallbackAdapter = false; + WGPUWaitStatus adapter_wait = webgpu_wait( + ctx.instance, + wgpuInstanceRequestAdapter(ctx.instance, &adapter_opts, adapter_cb)); - if (!adapter_result.adapter) { + if (adapter_wait != WGPUWaitStatus_Success || !adapter_result.adapter) { wgpuInstanceRelease(ctx.instance); ctx.instance = nullptr; throw std::runtime_error( @@ -118,7 +126,7 @@ WebGPUContext create_webgpu_context() { // Request device DeviceResult device_result; WGPURequestDeviceCallbackInfo device_cb = {}; - device_cb.mode = WGPUCallbackMode_AllowSpontaneous; + device_cb.mode = WGPUCallbackMode_WaitAnyOnly; device_cb.callback = on_device_request; device_cb.userdata1 = &device_result; @@ -131,12 +139,11 @@ WebGPUContext create_webgpu_context() { } device_desc.uncapturedErrorCallbackInfo.callback = on_device_error; - wgpuAdapterRequestDevice(ctx.adapter, &device_desc, device_cb); - while (!device_result.done) { - wgpuInstanceProcessEvents(ctx.instance); - } + WGPUWaitStatus device_wait = webgpu_wait( + ctx.instance, + wgpuAdapterRequestDevice(ctx.adapter, &device_desc, device_cb)); - if (!device_result.device) { + if (device_wait != WGPUWaitStatus_Success || !device_result.device) { wgpuAdapterRelease(ctx.adapter); wgpuInstanceRelease(ctx.instance); ctx.adapter = nullptr; diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp index a11b188f428..a60bfc18e3b 100644 --- a/backends/webgpu/runtime/WebGPUGraph.cpp +++ b/backends/webgpu/runtime/WebGPUGraph.cpp @@ -12,8 +12,8 @@ #include #include +#include #include -#include #include #include @@ -471,7 +471,6 @@ void WebGPUGraph::execute() { namespace { struct MapCallbackData { - bool done = false; WGPUMapAsyncStatus status = WGPUMapAsyncStatus_Error; }; @@ -482,7 +481,6 @@ void buffer_map_callback( void* /*userdata2*/) { auto* data = static_cast(userdata1); data->status = status; - data->done = true; } } // namespace @@ -491,18 +489,18 @@ void WebGPUGraph::copy_outputs(std::vector>& outputs) { const size_t count = std::min(outputs.size(), output_staging_buffers_.size()); std::vector cb_data(count); + std::vector map_futures(count, WGPUFuture{}); for (size_t i = 0; i < count; i++) { if (outputs[i].second == 0) { - cb_data[i].done = true; cb_data[i].status = WGPUMapAsyncStatus_Success; continue; } WGPUBufferMapCallbackInfo cb_info = {}; - cb_info.mode = WGPUCallbackMode_AllowSpontaneous; + cb_info.mode = WGPUCallbackMode_WaitAnyOnly; cb_info.callback = buffer_map_callback; cb_info.userdata1 = &cb_data[i]; - wgpuBufferMapAsync( + map_futures[i] = wgpuBufferMapAsync( output_staging_buffers_[i], WGPUMapMode_Read, 0, @@ -510,7 +508,12 @@ void WebGPUGraph::copy_outputs(std::vector>& outputs) { cb_info); } - wgpuDevicePoll(device_, true, nullptr); + for (size_t i = 0; i < count; i++) { + if (outputs[i].second != 0 && + webgpu_wait(instance_, map_futures[i]) != WGPUWaitStatus_Success) { + throw std::runtime_error("WebGPU: WaitAny failed for output map"); + } + } for (size_t i = 0; i < count; i++) { if (outputs[i].second == 0) { diff --git a/backends/webgpu/scripts/setup-wgpu-native.sh b/backends/webgpu/scripts/setup-wgpu-native.sh deleted file mode 100755 index 12ca2afdc46..00000000000 --- a/backends/webgpu/scripts/setup-wgpu-native.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -# Download prebuilt wgpu-native binaries for native (non-browser) WebGPU testing. -# Usage: bash backends/webgpu/scripts/setup-wgpu-native.sh - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -WGPU_DIR="${SCRIPT_DIR}/../third-party/wgpu-native" - -WGPU_VERSION="v27.0.4.0" -WGPU_BASE_URL="https://github.com/gfx-rs/wgpu-native/releases/download/${WGPU_VERSION}" - -OS="$(uname -s)" -case "${OS}" in - Darwin) PLATFORM="macos"; LIB_EXT="dylib" ;; - Linux) PLATFORM="linux"; LIB_EXT="so" ;; - *) - echo "Unsupported OS: ${OS}" - exit 1 - ;; -esac - -if [[ -f "${WGPU_DIR}/lib/libwgpu_native.${LIB_EXT}" ]]; then - echo "wgpu-native already installed at ${WGPU_DIR}" - exit 0 -fi - -ARCH="$(uname -m)" - -case "${ARCH}" in - x86_64) WGPU_ARCH="x86_64" ;; - aarch64|arm64) WGPU_ARCH="aarch64" ;; - *) - echo "Unsupported architecture: ${ARCH}" - exit 1 - ;; -esac - -ZIP_NAME="wgpu-${PLATFORM}-${WGPU_ARCH}-release.zip" -URL="${WGPU_BASE_URL}/${ZIP_NAME}" - -echo "Downloading wgpu-native ${WGPU_VERSION} for ${PLATFORM}-${WGPU_ARCH}..." -TMPDIR_DL="$(mktemp -d)" -trap "rm -rf ${TMPDIR_DL}" EXIT - -curl -sL "${URL}" -o "${TMPDIR_DL}/${ZIP_NAME}" - -mkdir -p "${WGPU_DIR}" -unzip -qo "${TMPDIR_DL}/${ZIP_NAME}" -d "${WGPU_DIR}" - -echo "Installed wgpu-native to ${WGPU_DIR}" -ls -la "${WGPU_DIR}/lib/" diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh new file mode 100644 index 00000000000..e8cdfe8a955 --- /dev/null +++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Build + run the WebGPU native test executables on Dawn (Tint) + SwiftShader. +# This is the substantive op-coverage gate: unlike the python operators suite +# (which only delegates add.Tensor to WebGPU, the rest CPU-fallback), these +# executables run rms_norm / multi-dispatch ordering / scratch through the real +# WebGPU backend on Dawn. +# +# Assumes the Dawn env is already sourced (Dawn_DIR + VK_ICD_FILENAMES + +# LD_LIBRARY_PATH) via .ci/scripts/setup-webgpu-linux-deps.sh. For local runs: +# source .ci/scripts/setup-webgpu-linux-deps.sh +# bash backends/webgpu/scripts/test_webgpu_native_ci.sh +# +# Builds whatever native test targets are present in the landed tree (NOT a fixed +# list). This stack lands: webgpu_native_test, webgpu_rms_norm_test (base) + +# webgpu_dispatch_order_test, webgpu_scratch_buffer_test (D107576199). update_cache +# / SDPA executables join automatically once their sibling diffs land. + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +EXECUTORCH_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" +PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}" +NPROC=$(nproc 2>/dev/null || sysctl -n hw.ncpu) +BUILD_DIR="${EXECUTORCH_ROOT}/cmake-out-webgpu-dawn-ci" + +if [[ -z "${Dawn_DIR:-}" ]]; then + echo "ERROR: Dawn_DIR not set. Source .ci/scripts/setup-webgpu-linux-deps.sh first." >&2 + exit 1 +fi + +cd "${EXECUTORCH_ROOT}" + +# ── Exports for the model-driven executables (best-effort) ─────────────────── +# native_test + rms_norm read .pte/golden inputs via WEBGPU_TEST_* env and +# self-skip if absent; dispatch_order + scratch are standalone (no exports). +PTE_MODEL="/tmp/webgpu_add_test.pte" +PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte" +RMS_NORM_DIR="/tmp/rmsn" +RMS_NORM_OK=1 + +$PYTHON_EXECUTABLE -c " +from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model +export_add_model('${PTE_MODEL}') +export_chained_add_model('${PTE_CHAINED_MODEL}') +" || echo "WARN: add export failed; webgpu_native_test self-skips models whose .pte is absent" + +$PYTHON_EXECUTABLE -c " +from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases +export_rms_norm_cases('${RMS_NORM_DIR}') +" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_OK=0; } + +# ── Configure (Dawn-only: no -DWEBGPU_IMPL; Dawn is the sole backend) ───────── +echo "=== Configure WebGPU native tests on Dawn ===" +rm -rf "${BUILD_DIR}" +cmake \ + -DEXECUTORCH_BUILD_WEBGPU=ON \ + -DEXECUTORCH_BUILD_WEBGPU_TEST=ON \ + -DDawn_DIR="${Dawn_DIR}" \ + -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -B "${BUILD_DIR}" \ + "${EXECUTORCH_ROOT}" + +# ── Build + run every native test target that exists in this tree ──────────── +TARGETS=(webgpu_native_test webgpu_rms_norm_test webgpu_dispatch_order_test webgpu_scratch_buffer_test) +BIN_DIR="${BUILD_DIR}/backends/webgpu" + +# Which targets are defined depends on which diffs are landed (native_test + +# rms_norm here; dispatch_order + scratch from D107576199). Query the configured +# target list ONCE so a not-yet-landed target is skipped WITHOUT masking a real +# compile failure of a target that IS defined (CI uses the Make generator). +DEFINED_TARGETS="$(cmake --build "${BUILD_DIR}" --target help 2>/dev/null || true)" + +# Fail loud if the probe found nothing (e.g. a non-Make generator or a cmake +# regression): otherwise every target would skip and the job would go green +# having tested nothing. webgpu_native_test is always defined at/after this diff. +if ! printf '%s\n' "${DEFINED_TARGETS}" | grep -qw webgpu_native_test; then + echo "ERROR: cmake target probe returned no webgpu_native_test; aborting" >&2 + exit 1 +fi + +for t in "${TARGETS[@]}"; do + if printf '%s\n' "${DEFINED_TARGETS}" | grep -qw "${t}"; then + # Defined target: build with stderr visible; set -e fails the job on a real + # build error (never silently skipped). + cmake --build "${BUILD_DIR}" --target "${t}" -j"${NPROC}" + echo "built ${t}" + else + echo "(target ${t} not defined in this tree — skipping)" + fi +done + +echo "=== Run native tests on Dawn + SwiftShader ===" +# native_test is model-driven; only run it if the export produced its .pte +# (CI's setup-linux.sh provides the executorch wheel so exports succeed; a bare +# local run without the wheel self-skips here rather than hard-failing on load). +if [[ -x "${BIN_DIR}/webgpu_native_test" && -f "${PTE_MODEL}" ]]; then + env WEBGPU_TEST_MODEL="${PTE_MODEL}" \ + WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \ + WEBGPU_TEST_SDPA_DIR=/tmp/ \ + "${BIN_DIR}/webgpu_native_test" +else + echo "(skipping webgpu_native_test: no exported .pte — needs the executorch python wheel)" +fi +if [[ "${RMS_NORM_OK}" == "1" && -x "${BIN_DIR}/webgpu_rms_norm_test" ]]; then + "${BIN_DIR}/webgpu_rms_norm_test" "${RMS_NORM_DIR}" +fi +[[ -x "${BIN_DIR}/webgpu_dispatch_order_test" ]] && "${BIN_DIR}/webgpu_dispatch_order_test" +[[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test" + +echo "=== WebGPU native tests on Dawn: all run targets passed ===" diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh index 5e3a20e96ac..1b90bdcb593 100755 --- a/backends/webgpu/test/test_build_webgpu.sh +++ b/backends/webgpu/test/test_build_webgpu.sh @@ -5,7 +5,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# End-to-end build and test script for the WebGPU backend (native via wgpu-native). +# End-to-end build and test script for the WebGPU backend (native via Dawn). # Usage: bash backends/webgpu/test/test_build_webgpu.sh set -e @@ -51,22 +51,20 @@ export_rms_norm_cases('${RMS_NORM_DIR}') " || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_PYTEST_OK=0; } fi -# ── Step 3: Native build + test (wgpu-native) ──────────────────────────────── +# ── Step 3: Native build + test (Dawn + SwiftShader) ───────────────────────── -WGPU_DIR="${EXECUTORCH_ROOT}/backends/webgpu/third-party/wgpu-native" +# Vendor Dawn (Tint) + SwiftShader and export Dawn_DIR/VK_ICD_FILENAMES. Set +# DAWN_PREBUILT_DIR to an existing Dawn install to skip the download locally. +echo "=== Installing Dawn (Tint) + SwiftShader ===" +source "${EXECUTORCH_ROOT}/.ci/scripts/setup-webgpu-linux-deps.sh" -# Auto-download wgpu-native if not present -if [[ ! -d "${WGPU_DIR}/lib" ]]; then - echo "=== Installing wgpu-native ===" - bash "${EXECUTORCH_ROOT}/backends/webgpu/scripts/setup-wgpu-native.sh" -fi - -echo "=== Step 3: Native build with wgpu-native ===" +echo "=== Step 3: Native build with Dawn ===" NATIVE_BUILD_DIR="${EXECUTORCH_ROOT}/cmake-out-webgpu-native" rm -rf "${NATIVE_BUILD_DIR}" cmake \ -DEXECUTORCH_BUILD_WEBGPU=ON \ + -DDawn_DIR="${Dawn_DIR}" \ -DEXECUTORCH_BUILD_WEBGPU_TEST=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ From 938c62d50cb5402e048b731dc1739bc2d44cbf1b Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Wed, 10 Jun 2026 10:13:30 -0700 Subject: [PATCH 2/2] [ExecuTorch][WebGPU] Add per-pass dispatch ordering + scratch buffer tests Pull Request resolved: https://github.com/pytorch/executorch/pull/20080 Native unit tests for two runtime enablers: per-pass compute-dispatch ordering (D107543258) and graph-owned scratch buffers (D107543259). `test/native/test_dispatch_order.cpp` exercises multi-dispatch read-after-write ordering through a single `execute()` using dependency chains -- a single-input `add` self-chain and a heterogeneous `rms_norm` -> `add` chain, both lowered via `VulkanPartitioner` -- comparing GPU output to a torch-computed golden per element. `test/native/test_scratch_buffer.cpp` is a white-box test of `WebGPUGraph::create_scratch_buffer` (no black-box consumer exists below the SDPA op): allocation + zero-size guard, copy round-trip, a compute Storage round-trip (its actual use), and a create/destroy lifecycle stress. Authored with assistance from Claude. ghstack-source-id: 391979580 @exported-using-ghexport Differential Revision: [D107576199](https://our.internmc.facebook.com/intern/diff/D107576199/) --- backends/webgpu/CMakeLists.txt | 6 + .../webgpu/scripts/test_webgpu_native_ci.sh | 15 +- .../test/native/test_dispatch_order.cpp | 167 +++++++++++ .../test/native/test_scratch_buffer.cpp | 261 ++++++++++++++++++ .../test/ops/dispatch_order/__init__.py | 0 .../ops/dispatch_order/test_dispatch_order.py | 119 ++++++++ backends/webgpu/test/test_build_webgpu.sh | 10 + backends/webgpu/test/tester.py | 1 + 8 files changed, 576 insertions(+), 3 deletions(-) create mode 100644 backends/webgpu/test/native/test_dispatch_order.cpp create mode 100644 backends/webgpu/test/native/test_scratch_buffer.cpp create mode 100644 backends/webgpu/test/ops/dispatch_order/__init__.py create mode 100644 backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index d6ad80fdd20..b6b41fb6587 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -118,4 +118,10 @@ endfunction() if(EXECUTORCH_BUILD_WEBGPU_TEST) add_webgpu_native_test(webgpu_native_test test/test_webgpu_native.cpp) add_webgpu_native_test(webgpu_rms_norm_test test/native/test_rms_norm.cpp) + add_webgpu_native_test( + webgpu_dispatch_order_test test/native/test_dispatch_order.cpp + ) + add_webgpu_native_test( + webgpu_scratch_buffer_test test/native/test_scratch_buffer.cpp + ) endif() diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh index e8cdfe8a955..af014efb228 100644 --- a/backends/webgpu/scripts/test_webgpu_native_ci.sh +++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh @@ -37,12 +37,14 @@ fi cd "${EXECUTORCH_ROOT}" # ── Exports for the model-driven executables (best-effort) ─────────────────── -# native_test + rms_norm read .pte/golden inputs via WEBGPU_TEST_* env and -# self-skip if absent; dispatch_order + scratch are standalone (no exports). +# native_test + rms_norm + dispatch_order read .pte/golden inputs via env/dir and +# self-skip if absent; scratch is standalone (generates its own inputs). PTE_MODEL="/tmp/webgpu_add_test.pte" PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte" RMS_NORM_DIR="/tmp/rmsn" RMS_NORM_OK=1 +DISPATCH_ORDER_DIR="/tmp/dispatch_order" +DISPATCH_ORDER_OK=1 $PYTHON_EXECUTABLE -c " from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model @@ -55,6 +57,11 @@ from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rm export_rms_norm_cases('${RMS_NORM_DIR}') " || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_OK=0; } +$PYTHON_EXECUTABLE -c " +from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases +export_dispatch_order_cases('${DISPATCH_ORDER_DIR}') +" || { echo "WARN: dispatch_order export failed; skipping dispatch_order native test"; DISPATCH_ORDER_OK=0; } + # ── Configure (Dawn-only: no -DWEBGPU_IMPL; Dawn is the sole backend) ───────── echo "=== Configure WebGPU native tests on Dawn ===" rm -rf "${BUILD_DIR}" @@ -115,7 +122,9 @@ fi if [[ "${RMS_NORM_OK}" == "1" && -x "${BIN_DIR}/webgpu_rms_norm_test" ]]; then "${BIN_DIR}/webgpu_rms_norm_test" "${RMS_NORM_DIR}" fi -[[ -x "${BIN_DIR}/webgpu_dispatch_order_test" ]] && "${BIN_DIR}/webgpu_dispatch_order_test" +if [[ "${DISPATCH_ORDER_OK}" == "1" && -x "${BIN_DIR}/webgpu_dispatch_order_test" ]]; then + "${BIN_DIR}/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}" +fi [[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test" echo "=== WebGPU native tests on Dawn: all run targets passed ===" diff --git a/backends/webgpu/test/native/test_dispatch_order.cpp b/backends/webgpu/test/native/test_dispatch_order.cpp new file mode 100644 index 00000000000..0f3eb5dea8e --- /dev/null +++ b/backends/webgpu/test/native/test_dispatch_order.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::webgpu; +using namespace executorch::extension; +using namespace executorch::runtime; + +namespace { + +struct Case { + const char* name; + std::vector sizes; +}; + +// Mirrors _CASES in test_dispatch_order.py (add-chain or rms_norm+add chain). +const std::vector kCases = { + {"single", {16, 16}}, + {"chain3", {64, 64}}, + {"chain5_tiny", {1, 1}}, + {"chain5_wide", {7, 896}}, + {"chain8", {256, 256}}, + {"deep32", {128, 128}}, + {"large_chain", {1024, 1024}}, + {"het_small", {1, 1, 7, 896}}, + {"het_deep", {1, 1, 5, 256}}, +}; + +std::vector read_f32_bin(const std::string& path) { + std::ifstream f(path, std::ios::binary | std::ios::ate); + if (!f) { + return {}; + } + const auto file_size = static_cast(f.tellg()); + if (file_size % sizeof(float) != 0) { + return {}; // truncated/corrupt golden; caller treats empty as failure + } + f.seekg(0); + std::vector data(file_size / sizeof(float)); + f.read( + reinterpret_cast(data.data()), + static_cast(file_size)); + return data; +} + +bool run_case(const std::string& dir, const Case& tc) { + printf("\n--- dispatch_order[%s] ---\n", tc.name); + const std::string base = dir + "/" + tc.name; + std::vector input = read_f32_bin(base + ".input.bin"); + std::vector golden = read_f32_bin(base + ".golden.bin"); + if (input.empty() || golden.empty()) { + printf("FAIL: could not read input/golden for %s\n", tc.name); + return false; + } + + Module module(base + ".pte"); + if (module.load_forward() != Error::Ok) { + printf("FAIL: could not load %s.pte\n", tc.name); + return false; + } + + size_t expected = 1; + for (int32_t d : tc.sizes) { + expected *= static_cast(d); + } + if (input.size() != expected) { + printf( + "FAIL: input numel %zu != expected %zu for %s\n", + input.size(), + expected, + tc.name); + return false; + } + auto x = make_tensor_ptr(tc.sizes, std::vector(input)); + auto result = module.forward({EValue(x)}); + if (!result.ok()) { + printf("FAIL: forward failed (error %d)\n", (int)result.error()); + return false; + } + const auto& outputs = result.get(); + if (outputs.empty() || !outputs[0].isTensor()) { + printf("FAIL: no tensor output\n"); + return false; + } + const auto& out_tensor = outputs[0].toTensor(); + if (static_cast(out_tensor.numel()) != golden.size()) { + printf( + "FAIL: output numel %zu != golden %zu\n", + (size_t)out_tensor.numel(), + golden.size()); + return false; + } + const float* out_data = out_tensor.const_data_ptr(); + + float max_abs_err = 0.0f; + float max_rel_err = 0.0f; + for (size_t i = 0; i < golden.size(); i++) { + const float abs_err = std::abs(out_data[i] - golden[i]); + max_abs_err = std::max(max_abs_err, abs_err); + const float denom = std::max(std::abs(golden[i]), 1e-6f); + max_rel_err = std::max(max_rel_err, abs_err / denom); + } + printf( + "Max abs error: %e Max rel error: %e (%zu elements)\n", + max_abs_err, + max_rel_err, + golden.size()); + // Lenient gate: pass iff abs<=tol OR rel<=tol (near-zero goldens). + if (max_abs_err > 1e-3f && max_rel_err > 1e-3f) { + printf("FAIL: dispatch_order[%s] exceeds tolerance 1e-3\n", tc.name); + return false; + } + printf("PASS: dispatch_order[%s]\n", tc.name); + return true; +} + +} // namespace + +int main(int argc, char** argv) { + std::string dir = "/tmp/dispatch_order"; + if (argc > 1) { + dir = argv[1]; + } + if (const char* env = std::getenv("WEBGPU_DISPATCH_ORDER_DIR")) { + dir = env; + } + + WebGPUContext ctx; + try { + ctx = create_webgpu_context(); + } catch (const std::exception& e) { + printf("SKIP: %s\n", e.what()); + return 0; + } + set_default_webgpu_context(&ctx); + printf("WebGPU device acquired (native); case dir: %s\n", dir.c_str()); + + bool ok = true; + for (const auto& tc : kCases) { + ok = run_case(dir, tc) && ok; + } + + set_default_webgpu_context(nullptr); + destroy_webgpu_context(ctx); + + if (!ok) { + return 1; + } + printf("\nAll dispatch_order tests passed\n"); + return 0; +} diff --git a/backends/webgpu/test/native/test_scratch_buffer.cpp b/backends/webgpu/test/native/test_scratch_buffer.cpp new file mode 100644 index 00000000000..7a4df6e9d00 --- /dev/null +++ b/backends/webgpu/test/native/test_scratch_buffer.cpp @@ -0,0 +1,261 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// White-box unit tests for WebGPUGraph::create_scratch_buffer. + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +using namespace executorch::backends::webgpu; + +namespace { + +struct MapCb { + std::atomic status{WGPUMapAsyncStatus_Error}; +}; + +void map_cb( + WGPUMapAsyncStatus status, + WGPUStringView /*message*/, + void* userdata1, + void* /*userdata2*/) { + auto* d = static_cast(userdata1); + d->status.store(status, std::memory_order_release); +} + +// Copy `src` (must carry CopySrc) into a staging buffer and read it back. +std::vector readback( + WGPUInstance instance, + WGPUDevice device, + WGPUQueue queue, + WGPUBuffer src, + size_t nbytes) { + WGPUBufferDescriptor sd = {}; + sd.size = nbytes; + sd.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst; + WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &sd); + + WGPUCommandEncoderDescriptor ed = {}; + WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, &ed); + wgpuCommandEncoderCopyBufferToBuffer(enc, src, 0, staging, 0, nbytes); + WGPUCommandBufferDescriptor cd = {}; + WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, &cd); + wgpuQueueSubmit(queue, 1, &cmd); + wgpuCommandBufferRelease(cmd); + wgpuCommandEncoderRelease(enc); + + MapCb cb; + WGPUBufferMapCallbackInfo ci = {}; + ci.mode = WGPUCallbackMode_WaitAnyOnly; + ci.callback = map_cb; + ci.userdata1 = &cb; + webgpu_wait( + instance, wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, nbytes, ci)); + + std::vector out(nbytes / sizeof(float)); + if (cb.status.load(std::memory_order_acquire) == WGPUMapAsyncStatus_Success) { + const void* m = wgpuBufferGetConstMappedRange(staging, 0, nbytes); + if (m != nullptr) { + std::memcpy(out.data(), m, nbytes); + } + wgpuBufferUnmap(staging); + } + wgpuBufferRelease(staging); + return out; +} + +// Tier 1: allocation, zero-size guard, distinct non-null handles. +bool tier1_alloc(WGPUDevice device) { + printf("\n--- scratch[tier1: allocation] ---\n"); + WebGPUGraph g; + g.set_device(device); + WGPUBuffer a = g.create_scratch_buffer(64 * sizeof(float)); + WGPUBuffer z = g.create_scratch_buffer(0); // guarded to 4 bytes + WGPUBuffer b = g.create_scratch_buffer(64 * sizeof(float)); + const bool ok = a && z && b && a != b && a != z && b != z; + printf(ok ? "PASS: allocation\n" : "FAIL: allocation\n"); + return ok; // graph dtor releases all three here +} + +// Tier 2: host->scratch write, scratch->staging copy, read-back round-trip. +bool tier2_roundtrip( + WGPUInstance instance, + WGPUDevice device, + WGPUQueue queue) { + printf("\n--- scratch[tier2: copy round-trip] ---\n"); + bool ok = true; + for (int n : {1, 7, 1024}) { + WebGPUGraph g; + g.set_device(device); + WGPUBuffer s = g.create_scratch_buffer(n * sizeof(float)); + std::vector in(n); + for (int i = 0; i < n; i++) { + in[i] = static_cast(i) * 0.5f + 1.0f; + } + wgpuQueueWriteBuffer(queue, s, 0, in.data(), n * sizeof(float)); + std::vector back = + readback(instance, device, queue, s, n * sizeof(float)); + float max_err = 0.0f; + for (int i = 0; i < n; i++) { + max_err = std::max(max_err, std::abs(back[i] - in[i])); + } + printf(" n=%d max abs error %e\n", n, max_err); + if (max_err != 0.0f) { // pure copy: must be bit-exact + ok = false; + } + } + printf(ok ? "PASS: copy round-trip\n" : "FAIL: copy round-trip\n"); + return ok; +} + +// Tier 3a: bind scratch as a Storage buffer in a compute pass (its real use). +bool tier3_compute(WGPUInstance instance, WGPUDevice device, WGPUQueue queue) { + printf("\n--- scratch[tier3: compute Storage round-trip] ---\n"); + const int n = 256; + WebGPUGraph g; + g.set_device(device); + WGPUBuffer s = g.create_scratch_buffer(n * sizeof(float)); + + const char* kWgsl = + "@group(0) @binding(0) var buf: array;\n" + "@compute @workgroup_size(64)\n" + "fn main(@builtin(global_invocation_id) gid: vec3) {\n" + " let i = gid.x;\n" + " if (i < arrayLength(&buf)) { buf[i] = f32(i) * 2.0 + 1.0; }\n" + "}\n"; + + WGPUShaderSourceWGSL wgsl = {}; + wgsl.chain.sType = WGPUSType_ShaderSourceWGSL; + wgsl.code = {kWgsl, WGPU_STRLEN}; + WGPUShaderModuleDescriptor smd = {}; + smd.nextInChain = &wgsl.chain; + WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &smd); + + WGPUBindGroupLayoutEntry ble = {}; + ble.binding = 0; + ble.visibility = WGPUShaderStage_Compute; + ble.buffer.type = WGPUBufferBindingType_Storage; + WGPUBindGroupLayoutDescriptor bld = {}; + bld.entryCount = 1; + bld.entries = &ble; + WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bld); + + WGPUPipelineLayoutDescriptor pld = {}; + pld.bindGroupLayoutCount = 1; + pld.bindGroupLayouts = &bgl; + WGPUPipelineLayout pl = wgpuDeviceCreatePipelineLayout(device, &pld); + + WGPUComputePipelineDescriptor cpd = {}; + cpd.layout = pl; + cpd.compute.module = shader; + cpd.compute.entryPoint = {"main", WGPU_STRLEN}; + WGPUComputePipeline pipe = wgpuDeviceCreateComputePipeline(device, &cpd); + + WGPUBindGroupEntry bge = {}; + bge.binding = 0; + bge.buffer = s; + bge.size = n * sizeof(float); + WGPUBindGroupDescriptor bgd = {}; + bgd.layout = bgl; + bgd.entryCount = 1; + bgd.entries = &bge; + WGPUBindGroup bg = wgpuDeviceCreateBindGroup(device, &bgd); + + WGPUCommandEncoderDescriptor ed = {}; + WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, &ed); + WGPUComputePassDescriptor pd = {}; + WGPUComputePassEncoder pass = wgpuCommandEncoderBeginComputePass(enc, &pd); + wgpuComputePassEncoderSetPipeline(pass, pipe); + wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, nullptr); + wgpuComputePassEncoderDispatchWorkgroups(pass, (n + 63) / 64, 1, 1); + wgpuComputePassEncoderEnd(pass); + wgpuComputePassEncoderRelease(pass); + WGPUCommandBufferDescriptor cd = {}; + WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, &cd); + wgpuQueueSubmit(queue, 1, &cmd); + wgpuCommandBufferRelease(cmd); + wgpuCommandEncoderRelease(enc); + + std::vector back = + readback(instance, device, queue, s, n * sizeof(float)); + float max_err = 0.0f; + for (int i = 0; i < n; i++) { + const float expected = static_cast(i) * 2.0f + 1.0f; + max_err = std::max(max_err, std::abs(back[i] - expected)); + } + printf(" max abs error %e (%d elements)\n", max_err, n); + + wgpuBindGroupRelease(bg); + wgpuComputePipelineRelease(pipe); + wgpuPipelineLayoutRelease(pl); + wgpuBindGroupLayoutRelease(bgl); + wgpuShaderModuleRelease(shader); + + const bool ok = max_err == 0.0f; + printf( + ok ? "PASS: compute Storage round-trip\n" : "FAIL: compute round-trip\n"); + return ok; +} + +// Tier 3b: many scratch buffers across repeated graphs; dtor must release all. +bool tier3_lifecycle(WGPUDevice device) { + printf("\n--- scratch[tier3: lifecycle/stress] ---\n"); + bool ok = true; + for (int iter = 0; iter < 50; iter++) { + WebGPUGraph g; + g.set_device(device); + for (int k = 0; k < 256; k++) { + WGPUBuffer b = + g.create_scratch_buffer(static_cast(k % 17) * sizeof(float)); + ok = ok && b != nullptr; + } + } // each graph's dtor releases its 256 buffers here + printf( + ok ? "PASS: lifecycle/stress (50 graphs x 256 buffers)\n" + : "FAIL: lifecycle/stress (null buffer)\n"); + return ok; +} + +} // namespace + +int main() { + WebGPUContext ctx; + try { + ctx = create_webgpu_context(); + } catch (const std::exception& e) { + printf("SKIP: %s\n", e.what()); + return 0; + } + set_default_webgpu_context(&ctx); + printf("WebGPU device acquired (native)\n"); + + bool ok = true; + ok = tier1_alloc(ctx.device) && ok; + ok = tier2_roundtrip(ctx.instance, ctx.device, ctx.queue) && ok; + ok = tier3_compute(ctx.instance, ctx.device, ctx.queue) && ok; + ok = tier3_lifecycle(ctx.device) && ok; + + set_default_webgpu_context(nullptr); + destroy_webgpu_context(ctx); + + if (!ok) { + return 1; + } + printf("\nAll scratch_buffer tests passed\n"); + return 0; +} diff --git a/backends/webgpu/test/ops/dispatch_order/__init__.py b/backends/webgpu/test/ops/dispatch_order/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py b/backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py new file mode 100644 index 00000000000..fbb13ff6426 --- /dev/null +++ b/backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py @@ -0,0 +1,119 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Multi-dispatch ordering coverage for WebGPUGraph::execute(). + +Each model is a dependency chain whose dispatches must execute in order (one +compute pass per dispatch is the implicit barrier). Vehicle A is a single-input +add self-chain; Vehicle B chains add on a reused RmsNormModule (a heterogeneous +cross-pipeline RAW edge). Numerics are checked in test/native/test_dispatch_order.cpp. +""" + +import os +import unittest + +import torch +from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner +from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import RmsNormModule +from executorch.backends.webgpu.test.tester import WEBGPU_SUPPORTED_OPS +from executorch.exir import to_edge_transform_and_lower + + +class ChainAddModule(torch.nn.Module): + """z = x + x; z = z + x; ... (depth adds) -> (depth + 1) * x.""" + + def __init__(self, depth: int) -> None: + super().__init__() + self.depth = depth + + def forward(self, x: torch.Tensor) -> torch.Tensor: + z = x + x + for _ in range(self.depth - 1): + z = z + x + return z + + +class RmsNormAddModule(torch.nn.Module): + """t = rms_norm(x); z = t + x; ... (adds adds) -- heterogeneous RAW chain.""" + + def __init__(self, width: int, adds: int) -> None: + super().__init__() + self.rms = RmsNormModule(width, eps=1e-6) + self.adds = adds + + def forward(self, x: torch.Tensor) -> torch.Tensor: + z = self.rms(x) + x + for _ in range(self.adds - 1): + z = z + x + return z + + +# (name, kind, shape, depth) -- MUST match kCases in test_dispatch_order.cpp. +_CASES = [ + ("single", "chain", (16, 16), 1), + ("chain3", "chain", (64, 64), 3), + ("chain5_tiny", "chain", (1, 1), 5), + ("chain5_wide", "chain", (7, 896), 5), + ("chain8", "chain", (256, 256), 8), + ("deep32", "chain", (128, 128), 32), + ("large_chain", "chain", (1024, 1024), 6), + ("het_small", "rms", (1, 1, 7, 896), 2), + ("het_deep", "rms", (1, 1, 5, 256), 3), +] + + +def _model(kind: str, shape, depth: int) -> torch.nn.Module: + if kind == "chain": + return ChainAddModule(depth) + return RmsNormAddModule(shape[-1], depth) + + +def _lower(model: torch.nn.Module, x: torch.Tensor): + ep = torch.export.export(model, (x,)) + return to_edge_transform_and_lower( + ep, + partitioner=[VulkanPartitioner(operator_allowlist=WEBGPU_SUPPORTED_OPS)], + ).to_executorch() + + +class TestDispatchOrder(unittest.TestCase): + def _assert_delegated(self, prog) -> None: + found = any( + d.id == "VulkanBackend" + for p in prog.executorch_program.execution_plan + for d in p.delegates + ) + self.assertTrue(found, "Expected VulkanBackend delegate in .pte") + + def test_chain_add(self) -> None: + self._assert_delegated(_lower(ChainAddModule(5), torch.randn(64, 64))) + + def test_rms_norm_add(self) -> None: + self._assert_delegated( + _lower(RmsNormAddModule(896, 2), torch.randn(1, 1, 7, 896)) + ) + + +def export_dispatch_order_cases(out_dir: str) -> None: + """Write .pte, .input.bin, .golden.bin (raw le fp32) per case.""" + os.makedirs(out_dir, exist_ok=True) + torch.manual_seed(0) + for name, kind, shape, depth in _CASES: + x = torch.randn(*shape) + model = _model(kind, shape, depth) + prog = _lower(model, x) + with torch.no_grad(): + golden = model(x) + base = os.path.join(out_dir, name) + x.detach().cpu().numpy().astype("