From 25c045f397421bb014684610f164bbdcf35032dd Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Tue, 9 Jun 2026 14:16:20 -0700 Subject: [PATCH] Update [ghstack-poisoned] --- backends/webgpu/CMakeLists.txt | 1 + backends/webgpu/runtime/WebGPUDevice.cpp | 11 + backends/webgpu/runtime/WebGPUDevice.h | 8 + backends/webgpu/runtime/WebGPUGraph.cpp | 56 ++++- backends/webgpu/runtime/WebGPUGraph.h | 1 + backends/webgpu/runtime/WebGPUQueryPool.cpp | 220 ++++++++++++++++++++ backends/webgpu/runtime/WebGPUQueryPool.h | 84 ++++++++ backends/webgpu/runtime/ops/sdpa/Sdpa.cpp | 17 +- backends/webgpu/test/test_webgpu_native.cpp | 127 ++++++++++- 9 files changed, 517 insertions(+), 8 deletions(-) create mode 100644 backends/webgpu/runtime/WebGPUQueryPool.cpp create mode 100644 backends/webgpu/runtime/WebGPUQueryPool.h diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index 67e283ab779..a3fe7b85481 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -30,6 +30,7 @@ set(WEBGPU_SRCS runtime/WebGPUGraph.cpp runtime/WebGPUDelegateHeader.cpp runtime/WebGPUDevice.cpp + runtime/WebGPUQueryPool.cpp runtime/ops/OperatorRegistry.cpp runtime/ops/add/BinaryOp.cpp runtime/ops/rms_norm/RmsNorm.cpp diff --git a/backends/webgpu/runtime/WebGPUDevice.cpp b/backends/webgpu/runtime/WebGPUDevice.cpp index 041cbe5a703..6672459320d 100644 --- a/backends/webgpu/runtime/WebGPUDevice.cpp +++ b/backends/webgpu/runtime/WebGPUDevice.cpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace executorch { namespace backends { @@ -137,6 +138,16 @@ WebGPUContext create_webgpu_context() { WGPUStatus_Success) { device_desc.requiredLimits = &supported_limits; } + + // Bench: enable TimestampQuery if available; fail-open (skip timing if not). + std::vector required_features; + if (wgpuAdapterHasFeature(ctx.adapter, WGPUFeatureName_TimestampQuery)) { + required_features.push_back(WGPUFeatureName_TimestampQuery); + device_desc.requiredFeatureCount = required_features.size(); + device_desc.requiredFeatures = required_features.data(); + ctx.timestamp_supported = true; + } + device_desc.uncapturedErrorCallbackInfo.callback = on_device_error; WGPUWaitStatus device_wait = webgpu_wait( diff --git a/backends/webgpu/runtime/WebGPUDevice.h b/backends/webgpu/runtime/WebGPUDevice.h index 78afd96316a..90100fa831a 100644 --- a/backends/webgpu/runtime/WebGPUDevice.h +++ b/backends/webgpu/runtime/WebGPUDevice.h @@ -10,6 +10,10 @@ #include +#include + +#include + namespace executorch { namespace backends { namespace webgpu { @@ -19,6 +23,10 @@ struct WebGPUContext { WGPUAdapter adapter = nullptr; WGPUDevice device = nullptr; WGPUQueue queue = nullptr; + // True if the device was created with the TimestampQuery feature (bench). + bool timestamp_supported = false; + // Bench-only: timestamp-query pool, lazily created in execute() (env-gated). + std::unique_ptr querypool; }; WebGPUContext create_webgpu_context(); diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp index b3ae5511d13..954a609a96e 100644 --- a/backends/webgpu/runtime/WebGPUGraph.cpp +++ b/backends/webgpu/runtime/WebGPUGraph.cpp @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -496,18 +497,48 @@ void WebGPUGraph::copy_inputs( } } +namespace { +// Bench gate: WEBGPU_TIMESTAMP_QUERY enables per-pass GPU timestamp queries. +bool should_timestamp_query() { + static const bool enabled = std::getenv("WEBGPU_TIMESTAMP_QUERY") != nullptr; + return enabled; +} +} // namespace + void WebGPUGraph::execute() { const size_t n = dispatches_.size(); const size_t chunk = execute_config_.chunk_size; if (chunk == 0 || n <= chunk) { + // Bench: timestamp-query pool, null unless env-gated + feature present. + WebGPUQueryPool* qp = nullptr; + if (should_timestamp_query() && n > 0) { + if (auto* ctx = get_default_webgpu_context()) { + if (ctx->timestamp_supported) { + if (!ctx->querypool || ctx->querypool->capacity() < n) { + ctx->querypool = std::make_unique(); + ctx->querypool->initialize(device_, static_cast(n)); + } + qp = ctx->querypool.get(); + qp->reset(static_cast(n)); + } + } + } + WGPUCommandEncoderDescriptor enc_desc = {}; WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device_, &enc_desc); // One pass per dispatch: enforces storage RAW ordering across deps. - for (const auto& dispatch : dispatches_) { + for (size_t i = 0; i < n; i++) { + const auto& dispatch = dispatches_[i]; + // tw must outlive BeginComputePass (the descriptor points at it). + WGPUComputePassTimestampWrites tw = {}; WGPUComputePassDescriptor pass_desc = {}; + if (qp) { + tw = qp->writes_for(static_cast(i)); + pass_desc.timestampWrites = &tw; + } WGPUComputePassEncoder pass = wgpuCommandEncoderBeginComputePass(encoder, &pass_desc); wgpuComputePassEncoderSetPipeline(pass, dispatch.pipeline); @@ -517,6 +548,13 @@ void WebGPUGraph::execute() { pass, dispatch.workgroup_count_x, 1, 1); wgpuComputePassEncoderEnd(pass); wgpuComputePassEncoderRelease(pass); + if (qp) { + qp->record( + static_cast(i), + dispatch.kernel_name, + {dispatch.workgroup_count_x, 1, 1}, + {1, 1, 1}); + } } for (const auto& copy : output_copies_) { @@ -524,15 +562,31 @@ void WebGPUGraph::execute() { encoder, copy.src_buffer, 0, copy.staging_buffer, 0, copy.nbytes); } + if (qp) { + qp->resolve(encoder); + } + WGPUCommandBufferDescriptor cmd_desc = {}; WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc); wgpuQueueSubmit(queue_, 1, &cmd); wgpuCommandBufferRelease(cmd); wgpuCommandEncoderRelease(encoder); + + if (qp) { + qp->extract_results(instance_); + qp->print_results(); + } return; } + // GPU timestamp queries assume one submit; chunked execute is multi-submit. + if (should_timestamp_query()) { + throw std::runtime_error( + "WebGPU: WEBGPU_TIMESTAMP_QUERY is incompatible with chunked execute " + "(multi-submit); disable chunking to use GPU timestamp queries"); + } + const size_t first_chunk = execute_config_.initial_chunk_size > 0 ? execute_config_.initial_chunk_size : chunk; diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h index 7eb0ffbd7d2..3cff09ecb6d 100644 --- a/backends/webgpu/runtime/WebGPUGraph.h +++ b/backends/webgpu/runtime/WebGPUGraph.h @@ -31,6 +31,7 @@ struct WebGPUDispatch { WGPUComputePipeline pipeline = nullptr; WGPUBindGroup bind_group = nullptr; uint32_t workgroup_count_x = 1; + std::string kernel_name; // bench label }; struct OutputCopy { diff --git a/backends/webgpu/runtime/WebGPUQueryPool.cpp b/backends/webgpu/runtime/WebGPUQueryPool.cpp new file mode 100644 index 00000000000..356a9110fef --- /dev/null +++ b/backends/webgpu/runtime/WebGPUQueryPool.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include +#include +#include + +namespace executorch::backends::webgpu { + +namespace { + +struct MapCallbackData { + WGPUMapAsyncStatus status = WGPUMapAsyncStatus_Error; +}; + +void map_callback( + WGPUMapAsyncStatus status, + WGPUStringView /*message*/, + void* userdata1, + void* /*userdata2*/) { + auto* data = static_cast(userdata1); + data->status = status; +} + +constexpr uint64_t kTimestampBytes = sizeof(uint64_t); + +} // namespace + +WebGPUQueryPool::~WebGPUQueryPool() { + if (readback_buf_) { + wgpuBufferRelease(readback_buf_); + } + if (resolve_buf_) { + wgpuBufferRelease(resolve_buf_); + } + if (qset_) { + wgpuQuerySetRelease(qset_); + } +} + +void WebGPUQueryPool::initialize(WGPUDevice device, uint32_t max_pairs) { + if (max_pairs == 0) { + return; + } + // Re-init guard; mirrors Vulkan QueryPool (avoids leaking a prior QuerySet). + if (qset_ != nullptr) { + return; + } + capacity_pairs_ = max_pairs; + const uint32_t count = 2 * max_pairs; + const uint64_t bytes = static_cast(count) * kTimestampBytes; + + WGPUQuerySetDescriptor qsd = {}; + qsd.type = WGPUQueryType_Timestamp; + qsd.count = count; + qset_ = wgpuDeviceCreateQuerySet(device, &qsd); + + WGPUBufferDescriptor rbd = {}; + rbd.size = bytes; + rbd.usage = WGPUBufferUsage_QueryResolve | WGPUBufferUsage_CopySrc; + resolve_buf_ = wgpuDeviceCreateBuffer(device, &rbd); + + WGPUBufferDescriptor mbd = {}; + mbd.size = bytes; + mbd.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst; + readback_buf_ = wgpuDeviceCreateBuffer(device, &mbd); + // WebGPU timestamps are already nanoseconds, so ns_per_tick_ stays 1.0. +} + +void WebGPUQueryPool::reset(uint32_t num_dispatches) { + // Fail loud on overrun; mirrors Vulkan QueryPool VK_CHECK_COND guard. + if (num_dispatches > capacity_pairs_) { + throw std::runtime_error( + "WebGPUQueryPool: num_dispatches " + std::to_string(num_dispatches) + + " exceeds capacity " + std::to_string(capacity_pairs_)); + } + num_pairs_ = num_dispatches; + durations_.clear(); +} + +WGPUComputePassTimestampWrites WebGPUQueryPool::writes_for(uint32_t i) { + WGPUComputePassTimestampWrites tw = {}; + tw.querySet = qset_; + tw.beginningOfPassWriteIndex = 2 * i; + tw.endOfPassWriteIndex = 2 * i + 1; + return tw; +} + +void WebGPUQueryPool::record( + uint32_t i, + const std::string& name, + std::array gwg, + std::array lwg) { + ShaderDuration d; + d.idx = i; + d.kernel_name = name; + d.global_wg = gwg; + d.local_wg = lwg; + durations_.push_back(d); +} + +void WebGPUQueryPool::resolve(WGPUCommandEncoder encoder) { + if (num_pairs_ == 0) { + return; + } + const uint32_t count = 2 * num_pairs_; + wgpuCommandEncoderResolveQuerySet(encoder, qset_, 0, count, resolve_buf_, 0); + wgpuCommandEncoderCopyBufferToBuffer( + encoder, + resolve_buf_, + 0, + readback_buf_, + 0, + static_cast(count) * kTimestampBytes); +} + +void WebGPUQueryPool::extract_results(WGPUInstance instance) { + if (num_pairs_ == 0) { + return; + } + const uint32_t count = 2 * num_pairs_; + const uint64_t bytes = static_cast(count) * kTimestampBytes; + + MapCallbackData cb; + WGPUBufferMapCallbackInfo cb_info = {}; + cb_info.mode = WGPUCallbackMode_WaitAnyOnly; + cb_info.callback = map_callback; + cb_info.userdata1 = &cb; + webgpu_wait( + instance, + wgpuBufferMapAsync(readback_buf_, WGPUMapMode_Read, 0, bytes, cb_info)); + + if (cb.status != WGPUMapAsyncStatus_Success) { + printf( + "WebGPUQueryPool: readback map failed (status %d)\n", (int)cb.status); + return; + } + const uint64_t* ticks = static_cast( + wgpuBufferGetConstMappedRange(readback_buf_, 0, bytes)); + if (ticks != nullptr) { + for (auto& d : durations_) { + const uint64_t t0 = ticks[2 * d.idx]; + const uint64_t t1 = ticks[2 * d.idx + 1]; + d.start_time_ns = static_cast(t0 * ns_per_tick_); + d.end_time_ns = static_cast(t1 * ns_per_tick_); + d.execution_duration_ns = + (t1 >= t0) ? static_cast((t1 - t0) * ns_per_tick_) : 0; + } + } + wgpuBufferUnmap(readback_buf_); +} + +void WebGPUQueryPool::print_results(bool tsv) const { + const char* sep = tsv ? "\t" : " "; + if (tsv) { + printf("idx%skernel%sgwg%sduration_us\n", sep, sep, sep); + } else { + printf("=== WebGPUQueryPool: per-dispatch GPU time ===\n"); + } + for (const auto& d : durations_) { + const double us = d.execution_duration_ns / 1000.0; + printf( + "%u%s%s%s(%u,%u,%u)%s%.3f\n", + d.idx, + sep, + d.kernel_name.empty() ? "dispatch" : d.kernel_name.c_str(), + sep, + d.global_wg[0], + d.global_wg[1], + d.global_wg[2], + sep, + us); + } + if (tsv) { + return; + } + std::map> totals; + for (const auto& d : durations_) { + auto& t = totals[d.kernel_name.empty() ? "dispatch" : d.kernel_name]; + t.first += d.execution_duration_ns; + t.second += 1; + } + printf("--- per-kernel mean / total (us) ---\n"); + for (const auto& kv : totals) { + const double mean_us = kv.second.first / kv.second.second / 1000.0; + const double total_us = kv.second.first / 1000.0; + printf( + "%s%smean %.3f%stotal %.3f (n=%u)\n", + kv.first.c_str(), + sep, + mean_us, + sep, + total_us, + kv.second.second); + } +} + +uint64_t WebGPUQueryPool::get_mean_shader_ns( + const std::string& kernel_name) const { + uint64_t sum = 0; + uint32_t n = 0; + for (const auto& d : durations_) { + if (d.kernel_name == kernel_name) { + sum += d.execution_duration_ns; + n += 1; + } + } + return n == 0 ? 0 : sum / n; +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/WebGPUQueryPool.h b/backends/webgpu/runtime/WebGPUQueryPool.h new file mode 100644 index 00000000000..c22c44cbc67 --- /dev/null +++ b/backends/webgpu/runtime/WebGPUQueryPool.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace executorch::backends::webgpu { + +// Per-dispatch GPU timing; mirrors Vulkan QueryPool ShaderDuration. +struct ShaderDuration { + uint32_t idx = 0; + std::string kernel_name; + std::array global_wg{}; + std::array local_wg{}; + uint64_t start_time_ns = 0; + uint64_t end_time_ns = 0; + uint64_t execution_duration_ns = 0; +}; + +// GPU timestamp-query pool; re-port of Vulkan vk_api/QueryPool. +class WebGPUQueryPool { + public: + WebGPUQueryPool() = default; + ~WebGPUQueryPool(); + + WebGPUQueryPool(const WebGPUQueryPool&) = delete; + WebGPUQueryPool& operator=(const WebGPUQueryPool&) = delete; + + // Create the QuerySet + readback buffers; query the ns-per-tick period. + void initialize(WGPUDevice device, uint32_t max_pairs); + bool is_initialized() const { + return qset_ != nullptr; + } + uint32_t capacity() const { + return capacity_pairs_; + } + + // Clear durations and set the dispatch count for this run. + void reset(uint32_t num_dispatches); + + // timestampWrites for pass i: begin=2i, end=2i+1. + WGPUComputePassTimestampWrites writes_for(uint32_t i); + + // Record pass i's label + workgroup sizes (start/end filled by extract). + void record( + uint32_t i, + const std::string& name, + std::array gwg, + std::array lwg); + + // Resolve the QuerySet into the readback buffer; call before submit. + void resolve(WGPUCommandEncoder encoder); + + // Map the readback, convert ticks->ns, fill durations; call after submit. + void extract_results(WGPUInstance instance); + + const std::vector& results() const { + return durations_; + } + void print_results(bool tsv = false) const; + uint64_t get_mean_shader_ns(const std::string& kernel_name) const; + + private: + WGPUQuerySet qset_ = nullptr; + WGPUBuffer resolve_buf_ = nullptr; // QueryResolve | CopySrc + WGPUBuffer readback_buf_ = nullptr; // MapRead | CopyDst + uint32_t capacity_pairs_ = 0; + uint32_t num_pairs_ = 0; + double ns_per_tick_ = 1.0; // WebGPU timestamps are already nanoseconds + std::vector durations_; +}; + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/ops/sdpa/Sdpa.cpp b/backends/webgpu/runtime/ops/sdpa/Sdpa.cpp index 3a381e839f1..392f4bcb9da 100644 --- a/backends/webgpu/runtime/ops/sdpa/Sdpa.cpp +++ b/backends/webgpu/runtime/ops/sdpa/Sdpa.cpp @@ -156,7 +156,8 @@ void build_dispatch( uint64_t uniform_size, uint32_t workgroup_count_x, uint32_t wg_size, - bool retain_uniform = false) { + bool retain_uniform = false, + const char* kernel_name = "") { WGPUDevice device = graph.device(); WGPUShaderSourceWGSL wgsl_desc = {}; @@ -227,7 +228,7 @@ void build_dispatch( bg_desc.entries = bg_entries; WGPUBindGroup bind_group = wgpuDeviceCreateBindGroup(device, &bg_desc); - graph.add_dispatch({pipeline, bind_group, workgroup_count_x}); + graph.add_dispatch({pipeline, bind_group, workgroup_count_x, kernel_name}); wgpuShaderModuleRelease(shader); wgpuBindGroupLayoutRelease(bgl); @@ -269,7 +270,8 @@ static WGPUBuffer record_update_cache_dispatch( sizeof(uc), wgc, uc_wg, - dynamic_pos); + dynamic_pos, + "update_cache"); return ubuf; } @@ -471,7 +473,8 @@ void sdpa_with_kv_cache_impl(WebGPUGraph& graph, const std::vector& args) { sizeof(p), wgc, qk_wg, - dynamic_pos); + dynamic_pos, + "sdpa_compute_attn_weights"); qk_buf = ubuf; qk_idx = graph.num_dispatches() - 1; } @@ -494,7 +497,8 @@ void sdpa_with_kv_cache_impl(WebGPUGraph& graph, const std::vector& args) { sizeof(p), wgc, 0, - dynamic_pos); + dynamic_pos, + "sdpa_softmax"); softmax_buf = ubuf; } @@ -519,7 +523,8 @@ void sdpa_with_kv_cache_impl(WebGPUGraph& graph, const std::vector& args) { sizeof(p), wgc, av_wg, - dynamic_pos); + dynamic_pos, + "sdpa_compute_out"); av_buf = ubuf; } diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp index b608235ac5e..f216eae4edb 100644 --- a/backends/webgpu/test/test_webgpu_native.cpp +++ b/backends/webgpu/test/test_webgpu_native.cpp @@ -1116,6 +1116,129 @@ static bool test_resize_hook(const std::string& blob_path) { return true; } +// Capacity-overrun must throw; runs without a device or TimestampQuery. +static bool test_query_pool_overrun_throws() { + printf("\n--- Test: WebGPUQueryPool capacity-overrun guard ---\n"); + WebGPUQueryPool qp; + try { + qp.reset(1); + } catch (const std::exception&) { + printf("PASS: reset beyond capacity throws\n"); + return true; + } + printf("FAIL: reset beyond capacity did not throw\n"); + return false; +} + +// WebGPUQueryPool roundtrip: time a probe pass; assert non-zero GPU duration. +static bool test_query_pool_roundtrip(const WebGPUContext& ctx) { + printf("\n--- Test: WebGPUQueryPool roundtrip ---\n"); + if (!ctx.timestamp_supported) { + printf("SKIP: adapter lacks TimestampQuery feature\n"); + return true; + } + WGPUDevice device = ctx.device; + + // Probe loop iterates enough to burn a measurable, non-zero GPU duration. + const char* kProbeWGSL = + "@group(0) @binding(0) var out: array;\n" + "@compute @workgroup_size(64)\n" + "fn main(@builtin(global_invocation_id) gid: vec3) {\n" + " var acc = 0.0;\n" + " for (var i = 0u; i < 8192u; i = i + 1u) {\n" + " acc = acc + f32(i) * 1.000001;\n" + " }\n" + " out[gid.x] = acc;\n" + "}\n"; + + WGPUShaderSourceWGSL wgsl_desc = {}; + wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL; + wgsl_desc.code = {kProbeWGSL, WGPU_STRLEN}; + WGPUShaderModuleDescriptor shader_desc = {}; + shader_desc.nextInChain = &wgsl_desc.chain; + WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device, &shader_desc); + + WGPUBindGroupLayoutEntry bgl_entry = {}; + bgl_entry.binding = 0; + bgl_entry.visibility = WGPUShaderStage_Compute; + bgl_entry.buffer.type = WGPUBufferBindingType_Storage; + WGPUBindGroupLayoutDescriptor bgl_desc = {}; + bgl_desc.entryCount = 1; + bgl_desc.entries = &bgl_entry; + WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device, &bgl_desc); + + WGPUPipelineLayoutDescriptor pl_desc = {}; + pl_desc.bindGroupLayoutCount = 1; + pl_desc.bindGroupLayouts = &bgl; + WGPUPipelineLayout pl = wgpuDeviceCreatePipelineLayout(device, &pl_desc); + + WGPUComputePipelineDescriptor pipe_desc = {}; + pipe_desc.layout = pl; + pipe_desc.compute.module = shader; + pipe_desc.compute.entryPoint = {"main", WGPU_STRLEN}; + WGPUComputePipeline pipe = + wgpuDeviceCreateComputePipeline(device, &pipe_desc); + + WGPUBufferDescriptor obd = {}; + obd.size = 64 * sizeof(float); + obd.usage = WGPUBufferUsage_Storage; + WGPUBuffer out_buf = wgpuDeviceCreateBuffer(device, &obd); + + WGPUBindGroupEntry bg_entry = {}; + bg_entry.binding = 0; + bg_entry.buffer = out_buf; + bg_entry.size = obd.size; + WGPUBindGroupDescriptor bg_desc = {}; + bg_desc.layout = bgl; + bg_desc.entryCount = 1; + bg_desc.entries = &bg_entry; + WGPUBindGroup bg = wgpuDeviceCreateBindGroup(device, &bg_desc); + + WebGPUQueryPool qp; + qp.initialize(device, 1); + qp.reset(1); + + WGPUCommandEncoder enc = wgpuDeviceCreateCommandEncoder(device, nullptr); + WGPUComputePassTimestampWrites tw = qp.writes_for(0); + WGPUComputePassDescriptor pass_desc = {}; + pass_desc.timestampWrites = &tw; + WGPUComputePassEncoder pass = + wgpuCommandEncoderBeginComputePass(enc, &pass_desc); + wgpuComputePassEncoderSetPipeline(pass, pipe); + wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, nullptr); + wgpuComputePassEncoderDispatchWorkgroups(pass, 1, 1, 1); + wgpuComputePassEncoderEnd(pass); + wgpuComputePassEncoderRelease(pass); + qp.record(0, "probe", {1, 1, 1}, {64, 1, 1}); + qp.resolve(enc); + WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(enc, nullptr); + wgpuQueueSubmit(ctx.queue, 1, &cmd); + wgpuCommandBufferRelease(cmd); + wgpuCommandEncoderRelease(enc); + + qp.extract_results(ctx.instance); + + wgpuBufferRelease(out_buf); + wgpuComputePipelineRelease(pipe); + wgpuPipelineLayoutRelease(pl); + wgpuBindGroupLayoutRelease(bgl); + wgpuBindGroupRelease(bg); + wgpuShaderModuleRelease(shader); + + if (qp.results().size() != 1) { + printf("FAIL: expected 1 duration, got %zu\n", qp.results().size()); + return false; + } + const uint64_t dur = qp.results()[0].execution_duration_ns; + printf(" probe duration: %llu ns\n", (unsigned long long)dur); + if (dur == 0) { + printf("FAIL: probe duration is zero (expected monotonic non-zero)\n"); + return false; + } + printf("PASS: WebGPUQueryPool roundtrip -- non-zero GPU kernel duration\n"); + return true; +} + int main(int argc, char** argv) { std::string model_path = "webgpu_add_test.pte"; if (argc > 1) { @@ -1157,7 +1280,9 @@ int main(int argc, char** argv) { set_default_webgpu_context(&ctx); printf("WebGPU device acquired (native)\n"); - bool ok = test_single_add(model_path); + bool ok = test_query_pool_overrun_throws(); + ok = test_query_pool_roundtrip(ctx) && ok; + ok = test_single_add(model_path) && ok; if (!chained_model_path.empty()) { ok = test_chained_add(chained_model_path) && ok;