diff --git a/.ci/scripts/setup-webgpu-linux-deps.sh b/.ci/scripts/setup-webgpu-linux-deps.sh
index 8ece5899489..b24ffb460a9 100644
--- a/.ci/scripts/setup-webgpu-linux-deps.sh
+++ b/.ci/scripts/setup-webgpu-linux-deps.sh
@@ -5,26 +5,93 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# Vendor Dawn (Tint) + SwiftShader for the WebGPU backend CI WITHOUT hosting a
+# private prebuilt:
+#   * Dawn  : Google's official nightly prebuilt, downloaded directly from
+#             github.com/google/dawn/releases (pinned tag+rev+sha256) -- the same
+#             "fetch a pinned upstream prebuilt" pattern used for other CI deps.
+#   * SwiftShader : built from source at a pinned rev compatible with the Dawn
+#             above (the ossci prebuilt is from 2020, too old for current Dawn). No S3.
+# Dawn (Chrome's WebGPU impl; its WGSL compiler Tint is the spec reference) on
+# SwiftShader gives a headless, deterministic, spec-faithful CLI backend.
+#
+# Exports Dawn_DIR / VK_ICD_FILENAMES / LD_LIBRARY_PATH for the cmake build+run.
+# Local/rig override: set DAWN_PREBUILT_DIR=<dir containing lib64/cmake/Dawn> to
+# skip the Dawn download.
 set -ex
 
-# SwiftShader: software Vulkan adapter for GPU-less CI (LunarG SDK not needed).
-install_swiftshader() {
-  _https_amazon_aws=https://ossci-android.s3.amazonaws.com
-  _swiftshader_archive=swiftshader-abe07b943-prebuilt.tar.gz
-  _swiftshader_dir=/tmp/swiftshader
-  mkdir -p $_swiftshader_dir
+# --- pinned versions (bump rev+sha together when upgrading Dawn) --------------
+DAWN_TAG="${DAWN_TAG:-v20260423.175430}"
+DAWN_REV="${DAWN_REV:-31e25af254ab572c77054edec4946d2244e184dd}"
+DAWN_SHA256="${DAWN_SHA256:-ac76fac090162dc1ecea5ed0f28a557bb8f49efc47faab01886105ace82b7b64}"
+# SwiftShader rev verified compatible with DAWN_REV (the old ossci prebuilt is
+# from 2020 and is incompatible with current Dawn -> no adapter / zero compute).
+SWIFTSHADER_REV="${SWIFTSHADER_REV:-9898204d91d6a60b6a08ad74fe4ac52a6913111b}"
 
-  _tmp_archive="/tmp/${_swiftshader_archive}"
+_dawn_dir="${DAWN_PREBUILT_DIR:-/tmp/dawn-ci}"
+_ss_dir=/tmp/swiftshader
 
-  curl --silent --show-error --location --fail --retry 3 --retry-all-errors \
-    --output "${_tmp_archive}" "$_https_amazon_aws/${_swiftshader_archive}"
+# --- toolchain prereqs --------------------------------------------------------
+# Dawn dlopens the system Vulkan loader at runtime (libvulkan1). And the
+# ubuntu-latest prebuilt is built with a bleeding-edge GCC: it references
+# libstdc++ symbols newer than ubuntu-22.04's default (e.g. _M_replace_cold,
+# GCC 13+), so the static .a won't link against the stock runtime. Pull a current
+# libstdc++ from the ubuntu-toolchain-r PPA when the symbol floor isn't met. All
+# of this is scoped to the WebGPU CI job; newer libstdc++ is backward-compatible.
+if command -v apt-get >/dev/null 2>&1; then
+  _SUDO=""; command -v sudo >/dev/null 2>&1 && _SUDO="sudo"
+  ${_SUDO} apt-get update -y || true
+  ${_SUDO} apt-get install -y libvulkan1 software-properties-common || true
+  if ! strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 2>/dev/null \
+      | grep -q "GLIBCXX_3.4.32"; then
+    ${_SUDO} add-apt-repository -y ppa:ubuntu-toolchain-r/test || true
+    ${_SUDO} apt-get update -y || true
+    ${_SUDO} apt-get install -y libstdc++6 || true  # newest GCC runtime
+  fi
+fi
 
-  tar -C "${_swiftshader_dir}" -xzf "${_tmp_archive}"
+# The native binaries / pybind lib run INSIDE the CI conda env, whose libstdc++
+# predates GLIBCXX_3.4.32 (the Dawn prebuilt's floor) -- the same wall ssjia hit
+# for the vulkan op tests. Upgrade the conda runtime libstdc++ so the loaded
+# libstdc++.so.6 (conda's, not the system one) satisfies Dawn at run time.
+if command -v conda >/dev/null 2>&1; then
+  conda install -y -c conda-forge "libstdcxx-ng>=14" || true
+fi
+
+# --- Dawn: official prebuilt from GitHub (no S3) ------------------------------
+mkdir -p "${_dawn_dir}"
+if [[ ! -d "${_dawn_dir}/lib64/cmake/Dawn" ]]; then
+  _dawn_tar="/tmp/Dawn-${DAWN_REV}-ubuntu-latest-Release.tar.gz"
+  curl --silent --show-error --location --fail --retry 3 --retry-all-errors \
+    --output "${_dawn_tar}" \
+    "https://github.com/google/dawn/releases/download/${DAWN_TAG}/Dawn-${DAWN_REV}-ubuntu-latest-Release.tar.gz"
+  echo "${DAWN_SHA256}  ${_dawn_tar}" | sha256sum -c -
+  # archive top dir is Dawn-<rev>-ubuntu-latest-Release/{lib64,include,bin}
+  tar -C "${_dawn_dir}" --strip-components=1 -xzf "${_dawn_tar}"
+fi
 
-  export VK_ICD_FILENAMES="${_swiftshader_dir}/swiftshader/build/Linux/vk_swiftshader_icd.json"
-  export LD_LIBRARY_PATH="${_swiftshader_dir}/swiftshader/build/Linux/:${LD_LIBRARY_PATH}"
-  export ETVK_USING_SWIFTSHADER=1
-}
+# --- SwiftShader: build from source at a pinned rev (no S3) -------------------
+# The old ossci prebuilt (swiftshader-abe07b943, 2020) is incompatible with the
+# current Dawn; build a matching modern SwiftShader instead. Self-contained
+# cmake build (vendored LLVM); the ICD lands under build/<OS>/.
+if [[ ! -d "${_ss_dir}/build" ]]; then
+  if [[ ! -d "${_ss_dir}/.git" ]]; then
+    git clone https://github.com/google/swiftshader "${_ss_dir}"
+  fi
+  git -C "${_ss_dir}" checkout "${SWIFTSHADER_REV}"
+  # vk_swiftshader's deps are vendored in-tree; tolerate unreachable
+  # disabled-feature submodules (angle, test-only) failing to fetch.
+  git -C "${_ss_dir}" submodule update --init --recursive || true
+  cmake -S "${_ss_dir}" -B "${_ss_dir}/build" -DCMAKE_BUILD_TYPE=Release \
+    -DSWIFTSHADER_BUILD_TESTS=OFF -DSWIFTSHADER_BUILD_PVR=OFF \
+    -DSWIFTSHADER_BUILD_BENCHMARKS=OFF
+  cmake --build "${_ss_dir}/build" --parallel "$(nproc)" --target vk_swiftshader
+fi
+_ss_icd="$(find "${_ss_dir}/build" -name vk_swiftshader_icd.json 2>/dev/null | head -1)"
+[[ -n "${_ss_icd}" ]] || { echo "ERROR: SwiftShader ICD not found after build" >&2; exit 1; }
 
-install_swiftshader
-bash backends/webgpu/scripts/setup-wgpu-native.sh
+_ss_libdir="$(dirname "${_ss_icd}")"
+export Dawn_DIR="${_dawn_dir}/lib64/cmake/Dawn"
+export VK_ICD_FILENAMES="${_ss_icd}"
+export LD_LIBRARY_PATH="${_ss_libdir}:${LD_LIBRARY_PATH:-}"
+export WEBGPU_USING_SWIFTSHADER=1
diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh
index fe9b564a18f..8b7a36cd79d 100755
--- a/.ci/scripts/test_backend.sh
+++ b/.ci/scripts/test_backend.sh
@@ -58,11 +58,10 @@ if [[ "$FLOW" == *vulkan* ]]; then
 fi
 
 if [[ "$FLOW" == *webgpu* ]]; then
-    # Setup swiftshader (software Vulkan adapter for GPU-less runners) and wgpu-native,
-    # which are required to build and run the WebGPU delegate.
+    # Dawn (Tint) + SwiftShader, the spec-faithful headless WebGPU backend.
     source .ci/scripts/setup-webgpu-linux-deps.sh
 
-    EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_WEBGPU=ON"
+    EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_WEBGPU=ON -DDawn_DIR=$Dawn_DIR"
 fi
 
 if [[ "$FLOW" == *arm* ]]; then
diff --git a/.github/workflows/test-webgpu-native.yml b/.github/workflows/test-webgpu-native.yml
new file mode 100644
index 00000000000..7220ef9f7b5
--- /dev/null
+++ b/.github/workflows/test-webgpu-native.yml
@@ -0,0 +1,54 @@
+name: Test WebGPU Native (Dawn)
+
+# The substantive WebGPU op-coverage gate. The shared operators suite only
+# delegates add.Tensor to WebGPU (everything else is CPU fallback), so the real
+# Dawn coverage comes from the native test executables (rms_norm, multi-dispatch
+# ordering, scratch). This runs them on Dawn (Tint) + SwiftShader, headless, on a
+# CPU runner -- separate from _test_backend.yml so that reusable template stays
+# untouched.
+
+# Nightly-only for now: this job builds SwiftShader from source (vendored LLVM),
+# which is too expensive to run on every PR while the workflow's reliability is
+# still being established. Once it has proven stable, re-enable a scoped PR
+# trigger with a paths: filter (backends/webgpu/**, the webgpu CI scripts, and
+# this file). The pull_request-aware ref/concurrency expressions below are kept
+# intentionally so that re-enable is a one-line change.
+on:
+  schedule:
+    - cron: 0 2 * * *
+  push:
+    branches:
+      - main
+      - release/*
+    tags:
+      - ciflow/nightly/*
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  test-webgpu-native:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      runner: linux.4xlarge.memory
+      docker-image: ci-image:executorch-ubuntu-22.04-clang12
+      submodules: recursive
+      timeout: 120
+      script: |
+        set -eux
+
+        # The generic Linux job uses the base conda env, not the image's; activate
+        # the image env (it has the pinned from-source torch). Mirrors
+        # test-vulkan-operators-linux in pull.yml.
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Install the python package + runtime deps (the .pte exporters).
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+
+        # Vendor Dawn (Tint) + SwiftShader, then build + run the native executables.
+        source .ci/scripts/setup-webgpu-linux-deps.sh
+        bash backends/webgpu/scripts/test_webgpu_native_ci.sh
diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index 719d86b3008..b6b41fb6587 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -54,29 +54,14 @@ target_include_directories(
 
 target_link_libraries(webgpu_backend PRIVATE vulkan_schema executorch_core)
 
-# Native build: link against wgpu-native
-set(WGPU_NATIVE_DIR
-    "${CMAKE_CURRENT_SOURCE_DIR}/third-party/wgpu-native"
-    CACHE PATH "Path to wgpu-native installation"
-)
-
-# Link the shared lib; the static .a carries LLVM bitcode that breaks LTO.
-# Suffix resolves per platform: .so on Linux, .dylib on macOS.
-set(WGPU_LIB_NAME "libwgpu_native${CMAKE_SHARED_LIBRARY_SUFFIX}")
-set(WGPU_LIB "${WGPU_NATIVE_DIR}/lib/${WGPU_LIB_NAME}")
-if(NOT EXISTS "${WGPU_LIB}")
-  message(FATAL_ERROR "wgpu-native not found at ${WGPU_NATIVE_DIR}. "
-                      "Run: bash backends/webgpu/scripts/setup-wgpu-native.sh"
-  )
-endif()
-
-add_library(wgpu_native SHARED IMPORTED)
-set_target_properties(wgpu_native PROPERTIES IMPORTED_LOCATION "${WGPU_LIB}")
-
-target_include_directories(
-  webgpu_backend PUBLIC $<BUILD_INTERFACE:${WGPU_NATIVE_DIR}/include>
-)
-target_link_libraries(webgpu_backend PRIVATE wgpu_native)
+# Native WebGPU backend: Dawn (Tint) + SwiftShader; deps script sets Dawn_DIR.
+# Native-only: browser/Emscripten builds use the system webgpu.h and never reach
+# this find_package (root CMake gates it via EXECUTORCH_BUILD_WEBGPU).
+# dawn::webgpu_dawn's link interface references Threads::Threads.
+find_package(Threads REQUIRED)
+find_package(Dawn REQUIRED)
+set(WEBGPU_GPU_LIB dawn::webgpu_dawn)
+target_link_libraries(webgpu_backend PUBLIC ${WEBGPU_GPU_LIB})
 
 if(APPLE)
   target_link_libraries(
@@ -100,19 +85,17 @@ install(
   DESTINATION ${CMAKE_INSTALL_LIBDIR}
 )
 
-# Native test target
-if(EXECUTORCH_BUILD_WEBGPU_TEST)
-  add_executable(webgpu_native_test test/test_webgpu_native.cpp)
-
+# Native test targets. Helper mirrors backends/vulkan's vulkan_op_test: every
+# test executable links the same backend + runtime libs.
+function(add_webgpu_native_test test_name test_src)
+  add_executable(${test_name} ${test_src})
   target_include_directories(
-    webgpu_native_test PRIVATE $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
-                               "${WGPU_NATIVE_DIR}/include"
+    ${test_name} PRIVATE $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
   )
-
   target_link_libraries(
-    webgpu_native_test
+    ${test_name}
     PRIVATE webgpu_backend
-            wgpu_native
+            ${WEBGPU_GPU_LIB}
             executorch_core
             extension_module_static
             extension_data_loader
@@ -120,47 +103,25 @@ if(EXECUTORCH_BUILD_WEBGPU_TEST)
             portable_kernels
             portable_ops_lib
   )
-
   if(APPLE)
     target_link_libraries(
-      webgpu_native_test PRIVATE "-framework Metal" "-framework QuartzCore"
-                                 "-framework CoreGraphics"
+      ${test_name} PRIVATE "-framework Metal" "-framework QuartzCore"
+                           "-framework CoreGraphics"
     )
   else()
-    target_link_libraries(webgpu_native_test PRIVATE dl m pthread)
+    target_link_libraries(${test_name} PRIVATE dl m pthread)
   endif()
+  target_compile_options(${test_name} PRIVATE -fexceptions)
+  set_property(TARGET ${test_name} PROPERTY CXX_STANDARD 17)
+endfunction()
 
-  target_compile_options(webgpu_native_test PRIVATE -fexceptions)
-  set_property(TARGET webgpu_native_test PROPERTY CXX_STANDARD 17)
-
-  add_executable(webgpu_rms_norm_test test/native/test_rms_norm.cpp)
-
-  target_include_directories(
-    webgpu_rms_norm_test PRIVATE $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
-                                 "${WGPU_NATIVE_DIR}/include"
+if(EXECUTORCH_BUILD_WEBGPU_TEST)
+  add_webgpu_native_test(webgpu_native_test test/test_webgpu_native.cpp)
+  add_webgpu_native_test(webgpu_rms_norm_test test/native/test_rms_norm.cpp)
+  add_webgpu_native_test(
+    webgpu_dispatch_order_test test/native/test_dispatch_order.cpp
   )
-
-  target_link_libraries(
-    webgpu_rms_norm_test
-    PRIVATE webgpu_backend
-            wgpu_native
-            executorch_core
-            extension_module_static
-            extension_data_loader
-            extension_tensor
-            portable_kernels
-            portable_ops_lib
+  add_webgpu_native_test(
+    webgpu_scratch_buffer_test test/native/test_scratch_buffer.cpp
   )
-
-  if(APPLE)
-    target_link_libraries(
-      webgpu_rms_norm_test PRIVATE "-framework Metal" "-framework QuartzCore"
-                                   "-framework CoreGraphics"
-    )
-  else()
-    target_link_libraries(webgpu_rms_norm_test PRIVATE dl m pthread)
-  endif()
-
-  target_compile_options(webgpu_rms_norm_test PRIVATE -fexceptions)
-  set_property(TARGET webgpu_rms_norm_test PROPERTY CXX_STANDARD 17)
 endif()
diff --git a/backends/webgpu/runtime/WebGPUCompat.h b/backends/webgpu/runtime/WebGPUCompat.h
new file mode 100644
index 00000000000..06715e0fc81
--- /dev/null
+++ b/backends/webgpu/runtime/WebGPUCompat.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <webgpu/webgpu.h>
+
+#include <cstdint>
+
+namespace executorch::backends::webgpu {
+
+// Caller's instance must enable TimedWaitAny; returns the WaitAny status.
+inline WGPUWaitStatus webgpu_wait(WGPUInstance instance, WGPUFuture future) {
+  WGPUFutureWaitInfo info = {};
+  info.future = future;
+  return wgpuInstanceWaitAny(instance, 1, &info, UINT64_MAX);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/WebGPUDevice.cpp b/backends/webgpu/runtime/WebGPUDevice.cpp
index a5bbf8e5806..041cbe5a703 100644
--- a/backends/webgpu/runtime/WebGPUDevice.cpp
+++ b/backends/webgpu/runtime/WebGPUDevice.cpp
@@ -6,6 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/backends/webgpu/runtime/WebGPUCompat.h>
 #include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
 
 #include <cstdio>
@@ -21,12 +22,10 @@ namespace {
 
 struct AdapterResult {
   WGPUAdapter adapter = nullptr;
-  bool done = false;
 };
 
 struct DeviceResult {
   WGPUDevice device = nullptr;
-  bool done = false;
 };
 
 void on_adapter_request(
@@ -46,7 +45,6 @@ void on_adapter_request(
         static_cast<int>(message.length),
         message.data);
   }
-  result->done = true;
 }
 
 void on_device_request(
@@ -66,7 +64,6 @@ void on_device_request(
         static_cast<int>(message.length),
         message.data);
   }
-  result->done = true;
 }
 
 void on_device_error(
@@ -88,25 +85,36 @@ void on_device_error(
 WebGPUContext create_webgpu_context() {
   WebGPUContext ctx;
 
-  ctx.instance = wgpuCreateInstance(nullptr);
+  // TimedWaitAny lets webgpu_wait() block on futures via wgpuInstanceWaitAny.
+  WGPUInstanceDescriptor instance_desc = {};
+#if defined(__EMSCRIPTEN__)
+  instance_desc.capabilities.timedWaitAnyEnable = true;
+  instance_desc.capabilities.timedWaitAnyMaxCount = 1;
+#else
+  WGPUInstanceFeatureName features[1] = {WGPUInstanceFeatureName_TimedWaitAny};
+  instance_desc.requiredFeatureCount = 1;
+  instance_desc.requiredFeatures = features;
+#endif
+  ctx.instance = wgpuCreateInstance(&instance_desc);
   if (!ctx.instance) {
     throw std::runtime_error("Failed to create WebGPU instance");
   }
 
-  // Request adapter using AllowSpontaneous mode (fires during
-  // wgpuInstanceProcessEvents or any other API call).
   AdapterResult adapter_result;
   WGPURequestAdapterCallbackInfo adapter_cb = {};
-  adapter_cb.mode = WGPUCallbackMode_AllowSpontaneous;
+  adapter_cb.mode = WGPUCallbackMode_WaitAnyOnly;
   adapter_cb.callback = on_adapter_request;
   adapter_cb.userdata1 = &adapter_result;
 
-  wgpuInstanceRequestAdapter(ctx.instance, nullptr, adapter_cb);
-  while (!adapter_result.done) {
-    wgpuInstanceProcessEvents(ctx.instance);
-  }
+  // No backend pin or forced fallback; Dawn auto-selects the adapter.
+  WGPURequestAdapterOptions adapter_opts = {};
+  adapter_opts.powerPreference = WGPUPowerPreference_HighPerformance;
+  adapter_opts.forceFallbackAdapter = false;
+  WGPUWaitStatus adapter_wait = webgpu_wait(
+      ctx.instance,
+      wgpuInstanceRequestAdapter(ctx.instance, &adapter_opts, adapter_cb));
 
-  if (!adapter_result.adapter) {
+  if (adapter_wait != WGPUWaitStatus_Success || !adapter_result.adapter) {
     wgpuInstanceRelease(ctx.instance);
     ctx.instance = nullptr;
     throw std::runtime_error(
@@ -118,7 +126,7 @@ WebGPUContext create_webgpu_context() {
   // Request device
   DeviceResult device_result;
   WGPURequestDeviceCallbackInfo device_cb = {};
-  device_cb.mode = WGPUCallbackMode_AllowSpontaneous;
+  device_cb.mode = WGPUCallbackMode_WaitAnyOnly;
   device_cb.callback = on_device_request;
   device_cb.userdata1 = &device_result;
 
@@ -131,12 +139,11 @@ WebGPUContext create_webgpu_context() {
   }
   device_desc.uncapturedErrorCallbackInfo.callback = on_device_error;
 
-  wgpuAdapterRequestDevice(ctx.adapter, &device_desc, device_cb);
-  while (!device_result.done) {
-    wgpuInstanceProcessEvents(ctx.instance);
-  }
+  WGPUWaitStatus device_wait = webgpu_wait(
+      ctx.instance,
+      wgpuAdapterRequestDevice(ctx.adapter, &device_desc, device_cb));
 
-  if (!device_result.device) {
+  if (device_wait != WGPUWaitStatus_Success || !device_result.device) {
     wgpuAdapterRelease(ctx.adapter);
     wgpuInstanceRelease(ctx.instance);
     ctx.adapter = nullptr;
diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index a11b188f428..a60bfc18e3b 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -12,8 +12,8 @@
 #include <executorch/backends/vulkan/serialization/schema_generated.h>
 #include <executorch/runtime/core/named_data_map.h>
 
+#include <executorch/backends/webgpu/runtime/WebGPUCompat.h>
 #include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
-#include <webgpu/wgpu.h>
 
 #include <cstring>
 #include <stdexcept>
@@ -471,7 +471,6 @@ void WebGPUGraph::execute() {
 namespace {
 
 struct MapCallbackData {
-  bool done = false;
   WGPUMapAsyncStatus status = WGPUMapAsyncStatus_Error;
 };
 
@@ -482,7 +481,6 @@ void buffer_map_callback(
     void* /*userdata2*/) {
   auto* data = static_cast<MapCallbackData*>(userdata1);
   data->status = status;
-  data->done = true;
 }
 
 } // namespace
@@ -491,18 +489,18 @@ void WebGPUGraph::copy_outputs(std::vector<std::pair<void*, size_t>>& outputs) {
   const size_t count = std::min(outputs.size(), output_staging_buffers_.size());
 
   std::vector<MapCallbackData> cb_data(count);
+  std::vector<WGPUFuture> map_futures(count, WGPUFuture{});
 
   for (size_t i = 0; i < count; i++) {
     if (outputs[i].second == 0) {
-      cb_data[i].done = true;
       cb_data[i].status = WGPUMapAsyncStatus_Success;
       continue;
     }
     WGPUBufferMapCallbackInfo cb_info = {};
-    cb_info.mode = WGPUCallbackMode_AllowSpontaneous;
+    cb_info.mode = WGPUCallbackMode_WaitAnyOnly;
     cb_info.callback = buffer_map_callback;
     cb_info.userdata1 = &cb_data[i];
-    wgpuBufferMapAsync(
+    map_futures[i] = wgpuBufferMapAsync(
         output_staging_buffers_[i],
         WGPUMapMode_Read,
         0,
@@ -510,7 +508,12 @@ void WebGPUGraph::copy_outputs(std::vector<std::pair<void*, size_t>>& outputs) {
         cb_info);
   }
 
-  wgpuDevicePoll(device_, true, nullptr);
+  for (size_t i = 0; i < count; i++) {
+    if (outputs[i].second != 0 &&
+        webgpu_wait(instance_, map_futures[i]) != WGPUWaitStatus_Success) {
+      throw std::runtime_error("WebGPU: WaitAny failed for output map");
+    }
+  }
 
   for (size_t i = 0; i < count; i++) {
     if (outputs[i].second == 0) {
diff --git a/backends/webgpu/scripts/setup-wgpu-native.sh b/backends/webgpu/scripts/setup-wgpu-native.sh
deleted file mode 100755
index 12ca2afdc46..00000000000
--- a/backends/webgpu/scripts/setup-wgpu-native.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# Download prebuilt wgpu-native binaries for native (non-browser) WebGPU testing.
-# Usage: bash backends/webgpu/scripts/setup-wgpu-native.sh
-
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-WGPU_DIR="${SCRIPT_DIR}/../third-party/wgpu-native"
-
-WGPU_VERSION="v27.0.4.0"
-WGPU_BASE_URL="https://github.com/gfx-rs/wgpu-native/releases/download/${WGPU_VERSION}"
-
-OS="$(uname -s)"
-case "${OS}" in
-    Darwin) PLATFORM="macos"; LIB_EXT="dylib" ;;
-    Linux)  PLATFORM="linux"; LIB_EXT="so" ;;
-    *)
-        echo "Unsupported OS: ${OS}"
-        exit 1
-        ;;
-esac
-
-if [[ -f "${WGPU_DIR}/lib/libwgpu_native.${LIB_EXT}" ]]; then
-    echo "wgpu-native already installed at ${WGPU_DIR}"
-    exit 0
-fi
-
-ARCH="$(uname -m)"
-
-case "${ARCH}" in
-    x86_64)  WGPU_ARCH="x86_64" ;;
-    aarch64|arm64) WGPU_ARCH="aarch64" ;;
-    *)
-        echo "Unsupported architecture: ${ARCH}"
-        exit 1
-        ;;
-esac
-
-ZIP_NAME="wgpu-${PLATFORM}-${WGPU_ARCH}-release.zip"
-URL="${WGPU_BASE_URL}/${ZIP_NAME}"
-
-echo "Downloading wgpu-native ${WGPU_VERSION} for ${PLATFORM}-${WGPU_ARCH}..."
-TMPDIR_DL="$(mktemp -d)"
-trap "rm -rf ${TMPDIR_DL}" EXIT
-
-curl -sL "${URL}" -o "${TMPDIR_DL}/${ZIP_NAME}"
-
-mkdir -p "${WGPU_DIR}"
-unzip -qo "${TMPDIR_DL}/${ZIP_NAME}" -d "${WGPU_DIR}"
-
-echo "Installed wgpu-native to ${WGPU_DIR}"
-ls -la "${WGPU_DIR}/lib/"
diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh
new file mode 100644
index 00000000000..af014efb228
--- /dev/null
+++ b/backends/webgpu/scripts/test_webgpu_native_ci.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Build + run the WebGPU native test executables on Dawn (Tint) + SwiftShader.
+# This is the substantive op-coverage gate: unlike the python operators suite
+# (which only delegates add.Tensor to WebGPU, the rest CPU-fallback), these
+# executables run rms_norm / multi-dispatch ordering / scratch through the real
+# WebGPU backend on Dawn.
+#
+# Assumes the Dawn env is already sourced (Dawn_DIR + VK_ICD_FILENAMES +
+# LD_LIBRARY_PATH) via .ci/scripts/setup-webgpu-linux-deps.sh. For local runs:
+#   source .ci/scripts/setup-webgpu-linux-deps.sh
+#   bash backends/webgpu/scripts/test_webgpu_native_ci.sh
+#
+# Builds whatever native test targets are present in the landed tree (NOT a fixed
+# list). This stack lands: webgpu_native_test, webgpu_rms_norm_test (base) +
+# webgpu_dispatch_order_test, webgpu_scratch_buffer_test (D107576199). update_cache
+# / SDPA executables join automatically once their sibling diffs land.
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+EXECUTORCH_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
+PYTHON_EXECUTABLE="${PYTHON_EXECUTABLE:-python3}"
+NPROC=$(nproc 2>/dev/null || sysctl -n hw.ncpu)
+BUILD_DIR="${EXECUTORCH_ROOT}/cmake-out-webgpu-dawn-ci"
+
+if [[ -z "${Dawn_DIR:-}" ]]; then
+  echo "ERROR: Dawn_DIR not set. Source .ci/scripts/setup-webgpu-linux-deps.sh first." >&2
+  exit 1
+fi
+
+cd "${EXECUTORCH_ROOT}"
+
+# ── Exports for the model-driven executables (best-effort) ───────────────────
+# native_test + rms_norm + dispatch_order read .pte/golden inputs via env/dir and
+# self-skip if absent; scratch is standalone (generates its own inputs).
+PTE_MODEL="/tmp/webgpu_add_test.pte"
+PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
+RMS_NORM_DIR="/tmp/rmsn"
+RMS_NORM_OK=1
+DISPATCH_ORDER_DIR="/tmp/dispatch_order"
+DISPATCH_ORDER_OK=1
+
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
+export_add_model('${PTE_MODEL}')
+export_chained_add_model('${PTE_CHAINED_MODEL}')
+" || echo "WARN: add export failed; webgpu_native_test self-skips models whose .pte is absent"
+
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases
+export_rms_norm_cases('${RMS_NORM_DIR}')
+" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_OK=0; }
+
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases
+export_dispatch_order_cases('${DISPATCH_ORDER_DIR}')
+" || { echo "WARN: dispatch_order export failed; skipping dispatch_order native test"; DISPATCH_ORDER_OK=0; }
+
+# ── Configure (Dawn-only: no -DWEBGPU_IMPL; Dawn is the sole backend) ─────────
+echo "=== Configure WebGPU native tests on Dawn ==="
+rm -rf "${BUILD_DIR}"
+cmake \
+    -DEXECUTORCH_BUILD_WEBGPU=ON \
+    -DEXECUTORCH_BUILD_WEBGPU_TEST=ON \
+    -DDawn_DIR="${Dawn_DIR}" \
+    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+    -DCMAKE_BUILD_TYPE=Release \
+    -B "${BUILD_DIR}" \
+    "${EXECUTORCH_ROOT}"
+
+# ── Build + run every native test target that exists in this tree ────────────
+TARGETS=(webgpu_native_test webgpu_rms_norm_test webgpu_dispatch_order_test webgpu_scratch_buffer_test)
+BIN_DIR="${BUILD_DIR}/backends/webgpu"
+
+# Which targets are defined depends on which diffs are landed (native_test +
+# rms_norm here; dispatch_order + scratch from D107576199). Query the configured
+# target list ONCE so a not-yet-landed target is skipped WITHOUT masking a real
+# compile failure of a target that IS defined (CI uses the Make generator).
+DEFINED_TARGETS="$(cmake --build "${BUILD_DIR}" --target help 2>/dev/null || true)"
+
+# Fail loud if the probe found nothing (e.g. a non-Make generator or a cmake
+# regression): otherwise every target would skip and the job would go green
+# having tested nothing. webgpu_native_test is always defined at/after this diff.
+if ! printf '%s\n' "${DEFINED_TARGETS}" | grep -qw webgpu_native_test; then
+  echo "ERROR: cmake target probe returned no webgpu_native_test; aborting" >&2
+  exit 1
+fi
+
+for t in "${TARGETS[@]}"; do
+  if printf '%s\n' "${DEFINED_TARGETS}" | grep -qw "${t}"; then
+    # Defined target: build with stderr visible; set -e fails the job on a real
+    # build error (never silently skipped).
+    cmake --build "${BUILD_DIR}" --target "${t}" -j"${NPROC}"
+    echo "built ${t}"
+  else
+    echo "(target ${t} not defined in this tree — skipping)"
+  fi
+done
+
+echo "=== Run native tests on Dawn + SwiftShader ==="
+# native_test is model-driven; only run it if the export produced its .pte
+# (CI's setup-linux.sh provides the executorch wheel so exports succeed; a bare
+# local run without the wheel self-skips here rather than hard-failing on load).
+if [[ -x "${BIN_DIR}/webgpu_native_test" && -f "${PTE_MODEL}" ]]; then
+  env WEBGPU_TEST_MODEL="${PTE_MODEL}" \
+      WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \
+      WEBGPU_TEST_SDPA_DIR=/tmp/ \
+      "${BIN_DIR}/webgpu_native_test"
+else
+  echo "(skipping webgpu_native_test: no exported .pte — needs the executorch python wheel)"
+fi
+if [[ "${RMS_NORM_OK}" == "1" && -x "${BIN_DIR}/webgpu_rms_norm_test" ]]; then
+  "${BIN_DIR}/webgpu_rms_norm_test" "${RMS_NORM_DIR}"
+fi
+if [[ "${DISPATCH_ORDER_OK}" == "1" && -x "${BIN_DIR}/webgpu_dispatch_order_test" ]]; then
+  "${BIN_DIR}/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}"
+fi
+[[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test"
+
+echo "=== WebGPU native tests on Dawn: all run targets passed ==="
diff --git a/backends/webgpu/test/native/test_dispatch_order.cpp b/backends/webgpu/test/native/test_dispatch_order.cpp
new file mode 100644
index 00000000000..0f3eb5dea8e
--- /dev/null
+++ b/backends/webgpu/test/native/test_dispatch_order.cpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <vector>
+
+using namespace executorch::backends::webgpu;
+using namespace executorch::extension;
+using namespace executorch::runtime;
+
+namespace {
+
+struct Case {
+  const char* name;
+  std::vector<int32_t> sizes;
+};
+
+// Mirrors _CASES in test_dispatch_order.py (add-chain or rms_norm+add chain).
+const std::vector<Case> kCases = {
+    {"single", {16, 16}},
+    {"chain3", {64, 64}},
+    {"chain5_tiny", {1, 1}},
+    {"chain5_wide", {7, 896}},
+    {"chain8", {256, 256}},
+    {"deep32", {128, 128}},
+    {"large_chain", {1024, 1024}},
+    {"het_small", {1, 1, 7, 896}},
+    {"het_deep", {1, 1, 5, 256}},
+};
+
+std::vector<float> read_f32_bin(const std::string& path) {
+  std::ifstream f(path, std::ios::binary | std::ios::ate);
+  if (!f) {
+    return {};
+  }
+  const auto file_size = static_cast<size_t>(f.tellg());
+  if (file_size % sizeof(float) != 0) {
+    return {}; // truncated/corrupt golden; caller treats empty as failure
+  }
+  f.seekg(0);
+  std::vector<float> data(file_size / sizeof(float));
+  f.read(
+      reinterpret_cast<char*>(data.data()),
+      static_cast<std::streamsize>(file_size));
+  return data;
+}
+
+bool run_case(const std::string& dir, const Case& tc) {
+  printf("\n--- dispatch_order[%s] ---\n", tc.name);
+  const std::string base = dir + "/" + tc.name;
+  std::vector<float> input = read_f32_bin(base + ".input.bin");
+  std::vector<float> golden = read_f32_bin(base + ".golden.bin");
+  if (input.empty() || golden.empty()) {
+    printf("FAIL: could not read input/golden for %s\n", tc.name);
+    return false;
+  }
+
+  Module module(base + ".pte");
+  if (module.load_forward() != Error::Ok) {
+    printf("FAIL: could not load %s.pte\n", tc.name);
+    return false;
+  }
+
+  size_t expected = 1;
+  for (int32_t d : tc.sizes) {
+    expected *= static_cast<size_t>(d);
+  }
+  if (input.size() != expected) {
+    printf(
+        "FAIL: input numel %zu != expected %zu for %s\n",
+        input.size(),
+        expected,
+        tc.name);
+    return false;
+  }
+  auto x = make_tensor_ptr(tc.sizes, std::vector<float>(input));
+  auto result = module.forward({EValue(x)});
+  if (!result.ok()) {
+    printf("FAIL: forward failed (error %d)\n", (int)result.error());
+    return false;
+  }
+  const auto& outputs = result.get();
+  if (outputs.empty() || !outputs[0].isTensor()) {
+    printf("FAIL: no tensor output\n");
+    return false;
+  }
+  const auto& out_tensor = outputs[0].toTensor();
+  if (static_cast<size_t>(out_tensor.numel()) != golden.size()) {
+    printf(
+        "FAIL: output numel %zu != golden %zu\n",
+        (size_t)out_tensor.numel(),
+        golden.size());
+    return false;
+  }
+  const float* out_data = out_tensor.const_data_ptr<float>();
+
+  float max_abs_err = 0.0f;
+  float max_rel_err = 0.0f;
+  for (size_t i = 0; i < golden.size(); i++) {
+    const float abs_err = std::abs(out_data[i] - golden[i]);
+    max_abs_err = std::max(max_abs_err, abs_err);
+    const float denom = std::max(std::abs(golden[i]), 1e-6f);
+    max_rel_err = std::max(max_rel_err, abs_err / denom);
+  }
+  printf(
+      "Max abs error: %e   Max rel error: %e (%zu elements)\n",
+      max_abs_err,
+      max_rel_err,
+      golden.size());
+  // Lenient gate: pass iff abs<=tol OR rel<=tol (near-zero goldens).
+  if (max_abs_err > 1e-3f && max_rel_err > 1e-3f) {
+    printf("FAIL: dispatch_order[%s] exceeds tolerance 1e-3\n", tc.name);
+    return false;
+  }
+  printf("PASS: dispatch_order[%s]\n", tc.name);
+  return true;
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+  std::string dir = "/tmp/dispatch_order";
+  if (argc > 1) {
+    dir = argv[1];
+  }
+  if (const char* env = std::getenv("WEBGPU_DISPATCH_ORDER_DIR")) {
+    dir = env;
+  }
+
+  WebGPUContext ctx;
+  try {
+    ctx = create_webgpu_context();
+  } catch (const std::exception& e) {
+    printf("SKIP: %s\n", e.what());
+    return 0;
+  }
+  set_default_webgpu_context(&ctx);
+  printf("WebGPU device acquired (native); case dir: %s\n", dir.c_str());
+
+  bool ok = true;
+  for (const auto& tc : kCases) {
+    ok = run_case(dir, tc) && ok;
+  }
+
+  set_default_webgpu_context(nullptr);
+  destroy_webgpu_context(ctx);
+
+  if (!ok) {
+    return 1;
+  }
+  printf("\nAll dispatch_order tests passed\n");
+  return 0;
+}
diff --git a/backends/webgpu/test/native/test_scratch_buffer.cpp b/backends/webgpu/test/native/test_scratch_buffer.cpp
new file mode 100644
index 00000000000..7a4df6e9d00
--- /dev/null
+++ b/backends/webgpu/test/native/test_scratch_buffer.cpp
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// White-box unit tests for WebGPUGraph::create_scratch_buffer.
+
+#include <executorch/backends/webgpu/runtime/WebGPUCompat.h>
+#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+
+#include <webgpu/webgpu.h>
+
+#include <atomic>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+using namespace executorch::backends::webgpu;
+
+namespace {
+
+struct MapCb {
+  std::atomic<WGPUMapAsyncStatus> status{WGPUMapAsyncStatus_Error};
+};
+
+void map_cb(
+    WGPUMapAsyncStatus status,
+    WGPUStringView /*message*/,
+    void* userdata1,
+    void* /*userdata2*/) {
+  auto* d = static_cast<MapCb*>(userdata1);
+  d->status.store(status, std::memory_order_release);
+}
+
+// Copy `src` (must carry CopySrc) into a staging buffer and read it back.
+std::vector<float> readback(
+    WGPUInstance instance,
+    WGPUDevice device,
+    WGPUQueue queue,
+    WGPUBuffer src,
+    size_t nbytes) {
+  WGPUBufferDescriptor sd = {};
+  sd.size = nbytes;
+  sd.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst;
+  WGPUBuffer staging = wgpuDeviceCreateBuffer(device, &sd);
+
+  WGPUCommandEncoderDescriptor ed = {};
+  WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, &ed);
+  wgpuCommandEncoderCopyBufferToBuffer(enc, src, 0, staging, 0, nbytes);
+  WGPUCommandBufferDescriptor cd = {};
+  WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, &cd);
+  wgpuQueueSubmit(queue, 1, &cmd);
+  wgpuCommandBufferRelease(cmd);
+  wgpuCommandEncoderRelease(enc);
+
+  MapCb cb;
+  WGPUBufferMapCallbackInfo ci = {};
+  ci.mode = WGPUCallbackMode_WaitAnyOnly;
+  ci.callback = map_cb;
+  ci.userdata1 = &cb;
+  webgpu_wait(
+      instance, wgpuBufferMapAsync(staging, WGPUMapMode_Read, 0, nbytes, ci));
+
+  std::vector<float> out(nbytes / sizeof(float));
+  if (cb.status.load(std::memory_order_acquire) == WGPUMapAsyncStatus_Success) {
+    const void* m = wgpuBufferGetConstMappedRange(staging, 0, nbytes);
+    if (m != nullptr) {
+      std::memcpy(out.data(), m, nbytes);
+    }
+    wgpuBufferUnmap(staging);
+  }
+  wgpuBufferRelease(staging);
+  return out;
+}
+
+// Tier 1: allocation, zero-size guard, distinct non-null handles.
+bool tier1_alloc(WGPUDevice device) {
+  printf("\n--- scratch[tier1: allocation] ---\n");
+  WebGPUGraph g;
+  g.set_device(device);
+  WGPUBuffer a = g.create_scratch_buffer(64 * sizeof(float));
+  WGPUBuffer z = g.create_scratch_buffer(0); // guarded to 4 bytes
+  WGPUBuffer b = g.create_scratch_buffer(64 * sizeof(float));
+  const bool ok = a && z && b && a != b && a != z && b != z;
+  printf(ok ? "PASS: allocation\n" : "FAIL: allocation\n");
+  return ok; // graph dtor releases all three here
+}
+
+// Tier 2: host->scratch write, scratch->staging copy, read-back round-trip.
+bool tier2_roundtrip(
+    WGPUInstance instance,
+    WGPUDevice device,
+    WGPUQueue queue) {
+  printf("\n--- scratch[tier2: copy round-trip] ---\n");
+  bool ok = true;
+  for (int n : {1, 7, 1024}) {
+    WebGPUGraph g;
+    g.set_device(device);
+    WGPUBuffer s = g.create_scratch_buffer(n * sizeof(float));
+    std::vector<float> in(n);
+    for (int i = 0; i < n; i++) {
+      in[i] = static_cast<float>(i) * 0.5f + 1.0f;
+    }
+    wgpuQueueWriteBuffer(queue, s, 0, in.data(), n * sizeof(float));
+    std::vector<float> back =
+        readback(instance, device, queue, s, n * sizeof(float));
+    float max_err = 0.0f;
+    for (int i = 0; i < n; i++) {
+      max_err = std::max(max_err, std::abs(back[i] - in[i]));
+    }
+    printf("  n=%d max abs error %e\n", n, max_err);
+    if (max_err != 0.0f) { // pure copy: must be bit-exact
+      ok = false;
+    }
+  }
+  printf(ok ? "PASS: copy round-trip\n" : "FAIL: copy round-trip\n");
+  return ok;
+}
+
+// Tier 3a: bind scratch as a Storage buffer in a compute pass (its real use).
+bool tier3_compute(WGPUInstance instance, WGPUDevice device, WGPUQueue queue) {
+  printf("\n--- scratch[tier3: compute Storage round-trip] ---\n");
+  const int n = 256;
+  WebGPUGraph g;
+  g.set_device(device);
+  WGPUBuffer s = g.create_scratch_buffer(n * sizeof(float));
+
+  const char* kWgsl =
+      "@group(0) @binding(0) var<storage, read_write> buf: array<f32>;\n"
+      "@compute @workgroup_size(64)\n"
+      "fn main(@builtin(global_invocation_id) gid: vec3<u32>) {\n"
+      "  let i = gid.x;\n"
+      "  if (i < arrayLength(&buf)) { buf[i] = f32(i) * 2.0 + 1.0; }\n"
+      "}\n";
+
+  WGPUShaderSourceWGSL wgsl = {};
+  wgsl.chain.sType = WGPUSType_ShaderSourceWGSL;
+  wgsl.code = {kWgsl, WGPU_STRLEN};
+  WGPUShaderModuleDescriptor smd = {};
+  smd.nextInChain = &wgsl.chain;
+  WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &smd);
+
+  WGPUBindGroupLayoutEntry ble = {};
+  ble.binding = 0;
+  ble.visibility = WGPUShaderStage_Compute;
+  ble.buffer.type = WGPUBufferBindingType_Storage;
+  WGPUBindGroupLayoutDescriptor bld = {};
+  bld.entryCount = 1;
+  bld.entries = &ble;
+  WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bld);
+
+  WGPUPipelineLayoutDescriptor pld = {};
+  pld.bindGroupLayoutCount = 1;
+  pld.bindGroupLayouts = &bgl;
+  WGPUPipelineLayout pl = wgpuDeviceCreatePipelineLayout(device, &pld);
+
+  WGPUComputePipelineDescriptor cpd = {};
+  cpd.layout = pl;
+  cpd.compute.module = shader;
+  cpd.compute.entryPoint = {"main", WGPU_STRLEN};
+  WGPUComputePipeline pipe = wgpuDeviceCreateComputePipeline(device, &cpd);
+
+  WGPUBindGroupEntry bge = {};
+  bge.binding = 0;
+  bge.buffer = s;
+  bge.size = n * sizeof(float);
+  WGPUBindGroupDescriptor bgd = {};
+  bgd.layout = bgl;
+  bgd.entryCount = 1;
+  bgd.entries = &bge;
+  WGPUBindGroup bg = wgpuDeviceCreateBindGroup(device, &bgd);
+
+  WGPUCommandEncoderDescriptor ed = {};
+  WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, &ed);
+  WGPUComputePassDescriptor pd = {};
+  WGPUComputePassEncoder pass = wgpuCommandEncoderBeginComputePass(enc, &pd);
+  wgpuComputePassEncoderSetPipeline(pass, pipe);
+  wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, nullptr);
+  wgpuComputePassEncoderDispatchWorkgroups(pass, (n + 63) / 64, 1, 1);
+  wgpuComputePassEncoderEnd(pass);
+  wgpuComputePassEncoderRelease(pass);
+  WGPUCommandBufferDescriptor cd = {};
+  WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, &cd);
+  wgpuQueueSubmit(queue, 1, &cmd);
+  wgpuCommandBufferRelease(cmd);
+  wgpuCommandEncoderRelease(enc);
+
+  std::vector<float> back =
+      readback(instance, device, queue, s, n * sizeof(float));
+  float max_err = 0.0f;
+  for (int i = 0; i < n; i++) {
+    const float expected = static_cast<float>(i) * 2.0f + 1.0f;
+    max_err = std::max(max_err, std::abs(back[i] - expected));
+  }
+  printf("  max abs error %e (%d elements)\n", max_err, n);
+
+  wgpuBindGroupRelease(bg);
+  wgpuComputePipelineRelease(pipe);
+  wgpuPipelineLayoutRelease(pl);
+  wgpuBindGroupLayoutRelease(bgl);
+  wgpuShaderModuleRelease(shader);
+
+  const bool ok = max_err == 0.0f;
+  printf(
+      ok ? "PASS: compute Storage round-trip\n" : "FAIL: compute round-trip\n");
+  return ok;
+}
+
+// Tier 3b: many scratch buffers across repeated graphs; dtor must release all.
+bool tier3_lifecycle(WGPUDevice device) {
+  printf("\n--- scratch[tier3: lifecycle/stress] ---\n");
+  bool ok = true;
+  for (int iter = 0; iter < 50; iter++) {
+    WebGPUGraph g;
+    g.set_device(device);
+    for (int k = 0; k < 256; k++) {
+      WGPUBuffer b =
+          g.create_scratch_buffer(static_cast<size_t>(k % 17) * sizeof(float));
+      ok = ok && b != nullptr;
+    }
+  } // each graph's dtor releases its 256 buffers here
+  printf(
+      ok ? "PASS: lifecycle/stress (50 graphs x 256 buffers)\n"
+         : "FAIL: lifecycle/stress (null buffer)\n");
+  return ok;
+}
+
+} // namespace
+
+int main() {
+  WebGPUContext ctx;
+  try {
+    ctx = create_webgpu_context();
+  } catch (const std::exception& e) {
+    printf("SKIP: %s\n", e.what());
+    return 0;
+  }
+  set_default_webgpu_context(&ctx);
+  printf("WebGPU device acquired (native)\n");
+
+  bool ok = true;
+  ok = tier1_alloc(ctx.device) && ok;
+  ok = tier2_roundtrip(ctx.instance, ctx.device, ctx.queue) && ok;
+  ok = tier3_compute(ctx.instance, ctx.device, ctx.queue) && ok;
+  ok = tier3_lifecycle(ctx.device) && ok;
+
+  set_default_webgpu_context(nullptr);
+  destroy_webgpu_context(ctx);
+
+  if (!ok) {
+    return 1;
+  }
+  printf("\nAll scratch_buffer tests passed\n");
+  return 0;
+}
diff --git a/backends/webgpu/test/ops/dispatch_order/__init__.py b/backends/webgpu/test/ops/dispatch_order/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py b/backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py
new file mode 100644
index 00000000000..fbb13ff6426
--- /dev/null
+++ b/backends/webgpu/test/ops/dispatch_order/test_dispatch_order.py
@@ -0,0 +1,119 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Multi-dispatch ordering coverage for WebGPUGraph::execute().
+
+Each model is a dependency chain whose dispatches must execute in order (one
+compute pass per dispatch is the implicit barrier). Vehicle A is a single-input
+add self-chain; Vehicle B chains add on a reused RmsNormModule (a heterogeneous
+cross-pipeline RAW edge). Numerics are checked in test/native/test_dispatch_order.cpp.
+"""
+
+import os
+import unittest
+
+import torch
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner
+from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import RmsNormModule
+from executorch.backends.webgpu.test.tester import WEBGPU_SUPPORTED_OPS
+from executorch.exir import to_edge_transform_and_lower
+
+
+class ChainAddModule(torch.nn.Module):
+    """z = x + x; z = z + x; ... (depth adds) -> (depth + 1) * x."""
+
+    def __init__(self, depth: int) -> None:
+        super().__init__()
+        self.depth = depth
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        z = x + x
+        for _ in range(self.depth - 1):
+            z = z + x
+        return z
+
+
+class RmsNormAddModule(torch.nn.Module):
+    """t = rms_norm(x); z = t + x; ... (adds adds) -- heterogeneous RAW chain."""
+
+    def __init__(self, width: int, adds: int) -> None:
+        super().__init__()
+        self.rms = RmsNormModule(width, eps=1e-6)
+        self.adds = adds
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        z = self.rms(x) + x
+        for _ in range(self.adds - 1):
+            z = z + x
+        return z
+
+
+# (name, kind, shape, depth) -- MUST match kCases in test_dispatch_order.cpp.
+_CASES = [
+    ("single", "chain", (16, 16), 1),
+    ("chain3", "chain", (64, 64), 3),
+    ("chain5_tiny", "chain", (1, 1), 5),
+    ("chain5_wide", "chain", (7, 896), 5),
+    ("chain8", "chain", (256, 256), 8),
+    ("deep32", "chain", (128, 128), 32),
+    ("large_chain", "chain", (1024, 1024), 6),
+    ("het_small", "rms", (1, 1, 7, 896), 2),
+    ("het_deep", "rms", (1, 1, 5, 256), 3),
+]
+
+
+def _model(kind: str, shape, depth: int) -> torch.nn.Module:
+    if kind == "chain":
+        return ChainAddModule(depth)
+    return RmsNormAddModule(shape[-1], depth)
+
+
+def _lower(model: torch.nn.Module, x: torch.Tensor):
+    ep = torch.export.export(model, (x,))
+    return to_edge_transform_and_lower(
+        ep,
+        partitioner=[VulkanPartitioner(operator_allowlist=WEBGPU_SUPPORTED_OPS)],
+    ).to_executorch()
+
+
+class TestDispatchOrder(unittest.TestCase):
+    def _assert_delegated(self, prog) -> None:
+        found = any(
+            d.id == "VulkanBackend"
+            for p in prog.executorch_program.execution_plan
+            for d in p.delegates
+        )
+        self.assertTrue(found, "Expected VulkanBackend delegate in .pte")
+
+    def test_chain_add(self) -> None:
+        self._assert_delegated(_lower(ChainAddModule(5), torch.randn(64, 64)))
+
+    def test_rms_norm_add(self) -> None:
+        self._assert_delegated(
+            _lower(RmsNormAddModule(896, 2), torch.randn(1, 1, 7, 896))
+        )
+
+
+def export_dispatch_order_cases(out_dir: str) -> None:
+    """Write <name>.pte, <name>.input.bin, <name>.golden.bin (raw le fp32) per case."""
+    os.makedirs(out_dir, exist_ok=True)
+    torch.manual_seed(0)
+    for name, kind, shape, depth in _CASES:
+        x = torch.randn(*shape)
+        model = _model(kind, shape, depth)
+        prog = _lower(model, x)
+        with torch.no_grad():
+            golden = model(x)
+        base = os.path.join(out_dir, name)
+        x.detach().cpu().numpy().astype("<f4").tofile(base + ".input.bin")
+        golden.detach().cpu().numpy().astype("<f4").tofile(base + ".golden.bin")
+        with open(base + ".pte", "wb") as f:
+            f.write(prog.buffer)
+        print(f"Exported case {name} {tuple(shape)}")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh
index 5e3a20e96ac..152c47b74b9 100755
--- a/backends/webgpu/test/test_build_webgpu.sh
+++ b/backends/webgpu/test/test_build_webgpu.sh
@@ -5,7 +5,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# End-to-end build and test script for the WebGPU backend (native via wgpu-native).
+# End-to-end build and test script for the WebGPU backend (native via Dawn).
 # Usage: bash backends/webgpu/test/test_build_webgpu.sh
 
 set -e
@@ -38,12 +38,17 @@ echo "=== Step 2: Export test models ==="
 PTE_MODEL="/tmp/webgpu_add_test.pte"
 PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
 RMS_NORM_DIR="/tmp/rmsn"
+DISPATCH_ORDER_DIR="/tmp/dispatch_order"
 cd "${EXECUTORCH_ROOT}"
 $PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
 export_add_model('${PTE_MODEL}')
 export_chained_add_model('${PTE_CHAINED_MODEL}')
 "
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases
+export_dispatch_order_cases('${DISPATCH_ORDER_DIR}')
+"
 if [[ "${RMS_NORM_PYTEST_OK}" == "1" ]]; then
   $PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases
@@ -51,22 +56,20 @@ export_rms_norm_cases('${RMS_NORM_DIR}')
 " || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_PYTEST_OK=0; }
 fi
 
-# ── Step 3: Native build + test (wgpu-native) ────────────────────────────────
+# ── Step 3: Native build + test (Dawn + SwiftShader) ─────────────────────────
 
-WGPU_DIR="${EXECUTORCH_ROOT}/backends/webgpu/third-party/wgpu-native"
+# Vendor Dawn (Tint) + SwiftShader and export Dawn_DIR/VK_ICD_FILENAMES. Set
+# DAWN_PREBUILT_DIR to an existing Dawn install to skip the download locally.
+echo "=== Installing Dawn (Tint) + SwiftShader ==="
+source "${EXECUTORCH_ROOT}/.ci/scripts/setup-webgpu-linux-deps.sh"
 
-# Auto-download wgpu-native if not present
-if [[ ! -d "${WGPU_DIR}/lib" ]]; then
-    echo "=== Installing wgpu-native ==="
-    bash "${EXECUTORCH_ROOT}/backends/webgpu/scripts/setup-wgpu-native.sh"
-fi
-
-echo "=== Step 3: Native build with wgpu-native ==="
+echo "=== Step 3: Native build with Dawn ==="
 NATIVE_BUILD_DIR="${EXECUTORCH_ROOT}/cmake-out-webgpu-native"
 rm -rf "${NATIVE_BUILD_DIR}"
 
 cmake \
     -DEXECUTORCH_BUILD_WEBGPU=ON \
+    -DDawn_DIR="${Dawn_DIR}" \
     -DEXECUTORCH_BUILD_WEBGPU_TEST=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
@@ -79,6 +82,8 @@ cmake \
 
 cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_native_test -j${NPROC}
 cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_rms_norm_test -j${NPROC}
+cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_dispatch_order_test -j${NPROC}
+cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_scratch_buffer_test -j${NPROC}
 
 echo "=== Step 4: Run native tests ==="
 env \
@@ -92,4 +97,7 @@ else
   echo "(skipping rms_norm native test: pytest or export did not complete)"
 fi
 
+"${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}"
+"${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_scratch_buffer_test"
+
 echo "=== Done ==="
diff --git a/backends/webgpu/test/tester.py b/backends/webgpu/test/tester.py
index f0f861eda60..2e67df442e6 100644
--- a/backends/webgpu/test/tester.py
+++ b/backends/webgpu/test/tester.py
@@ -20,6 +20,7 @@
 # Edge ops the WebGPU runtime implements; restricts the Vulkan partitioner.
 WEBGPU_SUPPORTED_OPS = [
     exir_ops.edge.aten.add.Tensor,
+    exir_ops.edge.et_vk.rms_norm.default,
 ]