diff --git a/examples/models/eagle3/CMakeLists.txt b/examples/models/eagle3/CMakeLists.txt
new file mode 100644
index 00000000000..f7fe225f4f3
--- /dev/null
+++ b/examples/models/eagle3/CMakeLists.txt
@@ -0,0 +1,69 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+cmake_minimum_required(VERSION 3.24)
+project(eagle3_speculator)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
+# gflags
+set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
+find_package(gflags REQUIRED)
+
+# executorch
+list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
+find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
+executorch_target_link_options_shared_lib(executorch)
+
+set(link_libraries executorch gflags)
+
+# CPU ops (host-side helpers not delegated to CUDA)
+list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
+executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+
+# Extensions
+list(
+ APPEND
+ link_libraries
+ extension_llm_runner
+ extension_module
+ extension_data_loader
+ extension_tensor
+ extension_flat_tensor
+)
+
+# Backend: CUDA (AOTI). The EAGLE-3 speculator export is CUDA-only.
+if(EXECUTORCH_BUILD_CUDA)
+ find_package(CUDAToolkit REQUIRED)
+ list(APPEND link_libraries aoti_cuda_backend)
+ executorch_target_link_options_shared_lib(aoti_cuda_backend)
+ add_compile_definitions(EXECUTORCH_BUILD_CUDA)
+else()
+ message(FATAL_ERROR "EAGLE-3 speculator runner requires EXECUTORCH_BUILD_CUDA=ON")
+endif()
+
+# Tokenizer (HuggingFace tokenizer.json)
+list(APPEND link_libraries tokenizers::tokenizers)
+
+add_executable(eagle3_speculator_runner main.cpp)
+target_include_directories(
+ eagle3_speculator_runner PUBLIC ${_common_include_directories}
+)
+target_link_libraries(eagle3_speculator_runner PUBLIC ${link_libraries})
+
+if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+ target_link_options_gc_sections(eagle3_speculator_runner)
+ if(NOT APPLE AND NOT MSVC)
+ target_link_options(eagle3_speculator_runner PRIVATE "LINKER:-s")
+ endif()
+endif()
diff --git a/examples/models/eagle3/CMakePresets.json b/examples/models/eagle3/CMakePresets.json
new file mode 100644
index 00000000000..9d9b75b262d
--- /dev/null
+++ b/examples/models/eagle3/CMakePresets.json
@@ -0,0 +1,29 @@
+{
+ "version": 6,
+ "configurePresets": [
+ {
+ "name": "eagle3-cuda",
+ "displayName": "EAGLE-3 speculator runner (CUDA)",
+ "binaryDir": "${sourceDir}/../../../cmake-out/examples/models/eagle3",
+ "cacheVariables": {
+ "CMAKE_BUILD_TYPE": "Release",
+ "CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out",
+ "CMAKE_PREFIX_PATH": "${sourceDir}/../../../cmake-out",
+ "EXECUTORCH_BUILD_CUDA": "ON"
+ },
+ "condition": {
+ "type": "inList",
+ "string": "${hostSystemName}",
+ "list": ["Linux", "Windows"]
+ }
+ }
+ ],
+ "buildPresets": [
+ {
+ "name": "eagle3-cuda",
+ "displayName": "Build EAGLE-3 speculator runner (CUDA)",
+ "configurePreset": "eagle3-cuda",
+ "targets": ["eagle3_speculator_runner"]
+ }
+ ]
+}
diff --git a/examples/models/eagle3/main.cpp b/examples/models/eagle3/main.cpp
new file mode 100644
index 00000000000..6a68e89eaaa
--- /dev/null
+++ b/examples/models/eagle3/main.cpp
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// EAGLE-3 speculative-decoding runner for ExecuTorch (CUDA/AOTI backend).
+//
+// Loads the speculator .pte (examples/models/eagle3/export.py) exposing three
+// methods that share the target / draft KV caches:
+// prefill(tokens[1,T], pos[T]) -> (next_token[1,1], feat[1,T,H])
+// target_verify(tokens[1,C], pos[C]) -> (greedy_ids[1,C], feat[1,C,H])
+// draft_decode(tokens[1,T], feat[1,T,H], pos[T]) -> (target_ids[1,T], g[1,T,H])
+// where feat is the fused (hidden-size) draft feature and H is the draft hidden
+// size. Verification is greedy (argmax), so emitted tokens equal greedy target
+// decoding (lossless) by construction.
+//
+// Scheme: the shifted EAGLE convention (vllm/v1/spec_decode/eagle.py,
+// set_inputs_first_pass: "Shift the input ids by one token" with unshifted
+// hidden_states). The draft pairs target hidden_state_t with token_{t+1}, so a
+// new draft chain seeds from the hidden states target_verify already produced
+// for the just-confirmed positions plus the corrected token's embedding -- the
+// corrected/bonus token never needs its own target forward, giving one target
+// forward per round (speedup ~= acceptance length tau).
+//
+// Features round-trip through the host between method calls (D2H copy + re-feed
+// as host tensors). They are small (<= max_prefill x H bf16), so the cost is
+// negligible next to the INT4 31B target forward, and it keeps device-tensor
+// lifetimes simple.
+//
+// Run (after exporting model.pte + aoti_cuda_blob.ptd via export.py, sourcing the
+// CUDA env, and building the eagle3-cuda preset):
+// eagle3_speculator_runner --model_path
/model.pte \
+// --data_path /aoti_cuda_blob.ptd --tokenizer_path \
+// --prompt "..." --max_new_tokens 128
+// The chat template and stop tokens default to Gemma 4 IT; override
+// --chat_prefix/--chat_suffix/--stop_ids/--stop_token (and --bos_id -1) for other
+// target/tokenizer pairs. Per-run timing counters (tau, verify/draft ms) print at
+// the end.
+//
+// Scope: a single-sequence, greedy, fixed-shape demo runner -- not a generic
+// EAGLE serving path. No batching, sampler stack (top-k/p/temperature), grammar/
+// tool constraints, streaming API, or integration with the standard ExecuTorch
+// LLM runner. The host feature round-trip above is a first-implementation choice
+// (the target forward dominates here); a device-resident handoff is future work.
+// The target, draft, and tokenizer must be a matched, co-trained set -- a
+// mismatch can pass export and silently degrade acceptance/output.
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+extern "C" void et_pal_emit_log_message(
+ ET_UNUSED et_timestamp_t timestamp,
+ et_pal_log_level_t level,
+ const char* filename,
+ ET_UNUSED const char* function,
+ size_t line,
+ const char* message,
+ ET_UNUSED size_t length) {
+ if (level == 'D' || level == 'I') {
+ return;
+ }
+ fprintf(stderr, "%c [%s:%zu] %s\n", (char)level, filename, line, message);
+}
+
+#ifdef EXECUTORCH_BUILD_CUDA
+#include
+#endif
+
+DEFINE_string(model_path, "", "Speculator model.pte path.");
+DEFINE_string(data_path, "", "Tensor data (.ptd) path for the CUDA backend.");
+DEFINE_string(tokenizer_path, "", "HuggingFace tokenizer.json path.");
+DEFINE_string(prompt, "Explain why the sky is blue.", "Prompt text.");
+DEFINE_bool(raw_prompt, false, "Skip the Gemma 4 IT chat template.");
+DEFINE_int32(max_new_tokens, 128, "Maximum tokens to generate.");
+DEFINE_int32(bos_id, 2, "BOS token id (-1 to skip; Gemma convention: 2).");
+DEFINE_int32(eos_id, 1, "EOS token id (Gemma convention: 1).");
+DEFINE_bool(cuda_graph, true, "Capture target_verify as a CUDA graph (CUDA only).");
+// Chat template + stop tokens default to Gemma 4 IT; override for other models.
+DEFINE_string(chat_prefix, "<|turn>user\n", "Chat-template text before the prompt.");
+DEFINE_string(
+ chat_suffix,
+ "\n<|turn>model\n<|channel>thought\n",
+ "Chat-template text after the prompt.");
+DEFINE_string(
+ stop_ids,
+ "1,50,106",
+ "Comma-separated extra stop token ids (empty to add none).");
+DEFINE_string(
+ stop_token,
+ "",
+ "A stop-delimiter string to encode and add to EOS (empty to skip).");
+
+using executorch::extension::from_blob;
+using executorch::extension::Module;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+namespace llm = executorch::extension::llm;
+using SizesType = executorch::aten::SizesType;
+
+namespace {
+
+// D2H-copy a tensor's raw bytes into a host buffer (the AOTI backend returns
+// device tensors). Works for any dtype; caller reinterprets.
+std::vector to_host_bytes(const executorch::aten::Tensor& t) {
+ std::vector out(t.nbytes());
+ const void* ptr = t.const_data_ptr();
+#ifdef EXECUTORCH_BUILD_CUDA
+ cudaPointerAttributes attrs{};
+ if (cudaPointerGetAttributes(&attrs, ptr) == cudaSuccess &&
+ attrs.type == cudaMemoryTypeDevice) {
+ cudaError_t err = cudaMemcpy(out.data(), ptr, out.size(), cudaMemcpyDeviceToHost);
+ if (err != cudaSuccess) {
+ ET_LOG(Error, "D2H copy failed: %s", cudaGetErrorString(err));
+ exit(1);
+ }
+ return out;
+ }
+#endif
+ memcpy(out.data(), ptr, out.size());
+ return out;
+}
+
+// Read an int64 (1, N) tensor to a host vector.
+std::vector read_ids(const executorch::aten::Tensor& t) {
+ auto bytes = to_host_bytes(t);
+ size_t n = bytes.size() / sizeof(int64_t);
+ std::vector ids(n);
+ memcpy(ids.data(), bytes.data(), bytes.size());
+ return ids;
+}
+
+// A draft feature held on the host as raw bf16 (uint16) so it can be re-fed.
+struct HostFeature {
+ std::vector data; // row-major (T, H)
+ int64_t T = 0;
+ int64_t H = 0;
+};
+
+HostFeature read_feature(const executorch::aten::Tensor& t) {
+ // t is (1, T, H) bf16.
+ HostFeature f;
+ f.T = t.size(1);
+ f.H = t.size(2);
+ auto bytes = to_host_bytes(t);
+ f.data.resize(bytes.size() / sizeof(uint16_t));
+ memcpy(f.data.data(), bytes.data(), bytes.size());
+ return f;
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+ gflags::ParseCommandLineFlags(&argc, &argv, true);
+ if (FLAGS_model_path.empty() || FLAGS_tokenizer_path.empty()) {
+ ET_LOG(Error, "Must specify --model_path and --tokenizer_path");
+ return 1;
+ }
+
+ llm::Stats stats;
+ stats.model_load_start_ms = llm::time_in_ms();
+
+ auto tokenizer = std::make_unique();
+ if (tokenizer->load(FLAGS_tokenizer_path) != tokenizers::Error::Ok) {
+ ET_LOG(Error, "Failed to load tokenizer from %s", FLAGS_tokenizer_path.c_str());
+ return 1;
+ }
+
+ std::vector data_files;
+ if (!FLAGS_data_path.empty()) {
+ data_files.push_back(FLAGS_data_path);
+ }
+ auto module = std::make_unique(
+ FLAGS_model_path,
+ data_files,
+ Module::LoadMode::MmapUseMlockIgnoreErrors,
+ /*event_tracer=*/nullptr,
+ /*memory_allocator=*/nullptr,
+ /*temp_allocator=*/nullptr,
+ /*share_memory_arenas=*/true);
+
+ // Weight sharing across methods (prefill and target_verify share the target).
+#ifdef EXECUTORCH_BUILD_CUDA
+ {
+ executorch::runtime::BackendOptions<1> backend_options;
+ backend_options.set_option("weight_sharing_across_methods", true);
+ executorch::runtime::set_option("CudaBackend", backend_options.view());
+ }
+ if (FLAGS_cuda_graph) {
+ // target_verify is the one target forward per round and has a static shape
+ // (chain+1 tokens), so capture it as a CUDA graph to avoid paying the
+ // 60-layer per-kernel launch overhead every round (the dominant cost
+ // otherwise). Its input tensors must wrap stable host buffers (below).
+ executorch::runtime::BackendOptions<1> g;
+ g.set_option("enable_cuda_graph_for_method", "target_verify");
+ executorch::runtime::set_option("CudaBackend", g.view());
+ }
+#endif
+
+ for (const char* m : {"prefill", "target_verify", "draft_decode"}) {
+ if (module->load_method(m) != Error::Ok) {
+ ET_LOG(Error, "Failed to load method %s", m);
+ return 1;
+ }
+ }
+
+ if (FLAGS_max_new_tokens <= 0) {
+ ET_LOG(Error, "--max_new_tokens must be >= 1");
+ return 1;
+ }
+
+ // Metadata baked in by export.py (required: a missing key means a
+ // mismatched/old .pte, so fail loudly instead of guessing).
+ auto meta = [&](const char* name) -> int64_t {
+ auto r = module->get(name);
+ if (!r.ok()) {
+ ET_LOG(Error, "missing required .pte metadata: %s", name);
+ exit(1);
+ }
+ return r->toScalar().to();
+ };
+ const int64_t chain_len = meta("get_chain_len");
+ const int64_t max_prefill = meta("get_max_prefill_chunk");
+ const int64_t min_prefill = meta("get_min_prefill_chunk");
+ const int64_t max_seq_len = meta("get_max_seq_len");
+ const int64_t K = chain_len;
+
+ // EOS: tokenizer/metadata ids, the configured eos, any --stop_ids, and the
+ // encoded --stop_token delimiter (all default to the Gemma 4 IT conventions).
+ auto eos_ids = llm::get_eos_ids(tokenizer.get(), module.get());
+ eos_ids.insert(static_cast(FLAGS_eos_id));
+ for (size_t b = 0, e; b <= FLAGS_stop_ids.size(); b = e + 1) {
+ e = FLAGS_stop_ids.find(',', b);
+ if (e == std::string::npos) {
+ e = FLAGS_stop_ids.size();
+ }
+ std::string tok = FLAGS_stop_ids.substr(b, e - b);
+ if (!tok.empty()) {
+ eos_ids.insert(static_cast(std::stoll(tok)));
+ }
+ }
+ if (!FLAGS_stop_token.empty()) {
+ if (auto t = tokenizer->encode(FLAGS_stop_token, /*bos=*/0, /*eos=*/0);
+ t.ok() && t->size() == 1) {
+ eos_ids.insert(t.get()[0]);
+ }
+ }
+
+ std::string prompt_text = FLAGS_prompt;
+ if (!FLAGS_raw_prompt) {
+ prompt_text = FLAGS_chat_prefix + prompt_text + FLAGS_chat_suffix;
+ }
+ auto enc = tokenizer->encode(prompt_text);
+ if (!enc.ok()) {
+ ET_LOG(Error, "Failed to encode prompt");
+ return 1;
+ }
+ std::vector prompt(enc->begin(), enc->end());
+ if (FLAGS_bos_id >= 0) {
+ prompt.insert(prompt.begin(), static_cast(FLAGS_bos_id));
+ }
+ const int64_t L = static_cast(prompt.size());
+ // The runner does not chunk: the whole prompt must fit one prefill, and its
+ // length must be within the exported prefill range [min_prefill, max_prefill].
+ if (L > max_prefill) {
+ ET_LOG(Error, "Prompt (%" PRId64 " tokens) exceeds max_prefill %" PRId64
+ "; this runner does not chunk prefill.", L, max_prefill);
+ return 1;
+ }
+ if (L < min_prefill) {
+ ET_LOG(Error, "Prompt (%" PRId64 " tokens) is below the exported prefill "
+ "minimum %" PRId64 "; use a longer prompt.", L, min_prefill);
+ return 1;
+ }
+ // The prefill bonus token is always emittable (no KV write past the prompt).
+ // Each speculative round, however, writes a K-token verify window, so it needs
+ // anchor_pos + K <= max_seq_len - 1 (enforced in the loop below). Cap the total
+ // at the positions available; max_new >= 1 since L <= max_prefill < max_seq_len.
+ int64_t max_new = std::min(FLAGS_max_new_tokens, max_seq_len - L);
+ printf("Prompt tokens: %" PRId64 ", chain K=%" PRId64 ", max_new=%" PRId64
+ "\n", L, K, max_new);
+
+ auto S = [](int64_t v) { return static_cast(v); };
+
+ // Persistent host buffers backing the tensors handed to each execute() call.
+ std::vector tok_buf, pos_buf;
+ std::vector feat_buf;
+
+ auto long_tensor = [&](std::vector& buf) {
+ return from_blob(
+ buf.data(), {1, S((int64_t)buf.size())}, executorch::aten::ScalarType::Long);
+ };
+ auto pos_tensor = [&](std::vector& buf) {
+ return from_blob(
+ buf.data(), {S((int64_t)buf.size())}, executorch::aten::ScalarType::Long);
+ };
+
+ // draft_decode over (tokens, feat rows, positions); returns proposals + the
+ // last row of g (the recurrent feature for the next chain step).
+ auto draft_decode = [&](const std::vector& tokens,
+ const uint16_t* feat_rows,
+ int64_t feat_T,
+ int64_t H,
+ int64_t start_pos,
+ std::vector& out_ids,
+ HostFeature& out_last_g) {
+ tok_buf.assign(tokens.begin(), tokens.end());
+ pos_buf.resize(tokens.size());
+ for (size_t i = 0; i < tokens.size(); i++) {
+ pos_buf[i] = start_pos + static_cast(i);
+ }
+ feat_buf.assign(feat_rows, feat_rows + feat_T * H);
+ auto t_tok = long_tensor(tok_buf);
+ auto t_feat = from_blob(
+ feat_buf.data(), {1, S(feat_T), S(H)}, executorch::aten::ScalarType::BFloat16);
+ auto t_pos = pos_tensor(pos_buf);
+ auto r = module->execute(
+ "draft_decode", {EValue(t_tok), EValue(t_feat), EValue(t_pos)});
+ if (r.error() != Error::Ok) {
+ ET_LOG(Error, "draft_decode failed");
+ exit(1);
+ }
+ out_ids = read_ids(r->at(0).toTensor());
+ HostFeature g = read_feature(r->at(1).toTensor());
+ out_last_g.T = 1;
+ out_last_g.H = g.H;
+ out_last_g.data.assign(
+ g.data.end() - g.H, g.data.end()); // last row of g
+ };
+
+ // Run a draft chain seeded by (seed_tokens, seed_feat) at seed positions; the
+ // last seeded slot predicts proposal 0, then K-1 recurrent steps.
+ auto chain = [&](const std::vector& seed_tokens,
+ const HostFeature& seed_feat,
+ int64_t seed_start_pos) {
+ std::vector proposals;
+ std::vector ids;
+ HostFeature last_g;
+ draft_decode(
+ seed_tokens, seed_feat.data.data(), seed_feat.T, seed_feat.H,
+ seed_start_pos, ids, last_g);
+ proposals.push_back(ids.back());
+ int64_t last_pos = seed_start_pos + seed_feat.T - 1;
+ for (int64_t k = 1; k < K; k++) {
+ std::vector step_ids;
+ HostFeature step_g;
+ draft_decode(
+ {proposals.back()}, last_g.data.data(), 1, last_g.H,
+ last_pos + k, step_ids, step_g);
+ proposals.push_back(step_ids[0]);
+ last_g = step_g;
+ }
+ return proposals;
+ };
+
+ stats.model_load_end_ms = llm::time_in_ms();
+ stats.inference_start_ms = stats.model_load_end_ms;
+
+ // --- Prefill: target over the prompt -> bonus token + per-position feature. ---
+ tok_buf = prompt;
+ pos_buf.resize(L);
+ for (int64_t i = 0; i < L; i++) {
+ pos_buf[i] = i;
+ }
+ auto pf = module->execute(
+ "prefill", {EValue(long_tensor(tok_buf)), EValue(pos_tensor(pos_buf))});
+ if (pf.error() != Error::Ok) {
+ ET_LOG(Error, "prefill failed");
+ return 1;
+ }
+ int64_t anchor = read_ids(pf->at(0).toTensor())[0]; // bonus token at position L
+ HostFeature feat_prompt = read_feature(pf->at(1).toTensor());
+ const int64_t H = feat_prompt.H;
+ int64_t anchor_pos = L;
+
+ stats.prompt_eval_end_ms = llm::time_in_ms();
+ stats.first_token_ms = stats.prompt_eval_end_ms;
+
+ std::vector emitted = {anchor};
+ uint64_t prev = static_cast(prompt.back());
+ {
+ auto s = tokenizer->decode(prev, static_cast(anchor));
+ if (s.ok()) { printf("%s", s->c_str()); fflush(stdout); }
+ prev = static_cast(anchor);
+ }
+
+ // We only run the speculative loop if more than the (already emitted) prefill
+ // bonus is wanted, the bonus wasn't EOS, and there is room for a K-token verify
+ // window. Otherwise we are done -- no draft seeding needed.
+ bool hit_eos = eos_ids.count(static_cast(anchor)) > 0;
+ bool speculate = max_new > 1 && !hit_eos && anchor_pos + K <= max_seq_len - 1;
+ std::vector proposals;
+ if (speculate) {
+ // Seed the first chain (shifted): draft slot p pairs feat_prompt[p] with
+ // token_{p+1}; the last slot pairs feat_prompt[L-1] with the bonus and
+ // predicts position L+1.
+ std::vector seed_tokens(prompt.begin() + 1, prompt.end());
+ seed_tokens.push_back(anchor);
+ proposals = chain(seed_tokens, feat_prompt, 0);
+ }
+
+ // Stable buffers for target_verify (fixed length K+1) so the CUDA graph
+ // replays against the same input addresses; we mutate the contents in place.
+ std::vector vtok_buf(K + 1), vpos_buf(K + 1);
+ auto vtok_t = from_blob(
+ vtok_buf.data(), {1, S(K + 1)}, executorch::aten::ScalarType::Long);
+ auto vpos_t = from_blob(
+ vpos_buf.data(), {S(K + 1)}, executorch::aten::ScalarType::Long);
+
+ // --- Speculative rounds: one target forward (target_verify) per round. ---
+ int64_t rounds = 0;
+ int64_t verify_ms = 0, draft_ms = 0; // instrumentation
+ while (speculate && (int64_t)emitted.size() < max_new && !hit_eos &&
+ anchor_pos + K <= max_seq_len - 1) {
+ rounds++;
+ // Verify [anchor, p0..p_{K-1}] at positions [anchor_pos .. anchor_pos+K].
+ vtok_buf[0] = anchor;
+ for (int64_t j = 0; j < K; j++) {
+ vtok_buf[j + 1] = proposals[j];
+ }
+ for (int64_t i = 0; i <= K; i++) {
+ vpos_buf[i] = anchor_pos + i;
+ }
+ int64_t t_v = llm::time_in_ms();
+ auto vr = module->execute("target_verify", {EValue(vtok_t), EValue(vpos_t)});
+ if (vr.error() != Error::Ok) {
+ ET_LOG(Error, "target_verify failed");
+ return 1;
+ }
+ std::vector verify_ids = read_ids(vr->at(0).toTensor());
+ HostFeature verify_feat = read_feature(vr->at(1).toTensor());
+ verify_ms += llm::time_in_ms() - t_v;
+
+ // Greedy acceptance: verify_ids[j] is the greedy token after token j, so it
+ // checks proposal j (which sits at position anchor_pos+1+j).
+ int64_t a = 0;
+ for (int64_t j = 0; j < K; j++) {
+ if (proposals[j] == verify_ids[j]) {
+ a++;
+ } else {
+ break;
+ }
+ }
+ int64_t corrected = verify_ids[a];
+
+ std::vector newly(proposals.begin(), proposals.begin() + a);
+ newly.push_back(corrected);
+ for (int64_t t : newly) {
+ if ((int64_t)emitted.size() >= max_new) break;
+ emitted.push_back(t);
+ auto s = tokenizer->decode(prev, static_cast(t));
+ if (s.ok()) { printf("%s", s->c_str()); fflush(stdout); }
+ prev = static_cast(t);
+ if (eos_ids.count(static_cast(t)) > 0) {
+ // Stop at the first accepted EOS; do not emit the rest of this batch.
+ // An accepted proposal (not just the corrected/bonus token) can be EOS,
+ // so this truncates newly at the first stop token, matching the eager
+ // reference.
+ hit_eos = true;
+ break;
+ }
+ }
+ if (hit_eos || (int64_t)emitted.size() >= max_new) break;
+
+ // Reseed the draft (shifted): slot anchor_pos+i holds (verify_feat[i],
+ // token_{anchor_pos+i+1}) where token = p_i (i reseed_tokens(proposals.begin(), proposals.begin() + a);
+ reseed_tokens.push_back(corrected);
+ HostFeature reseed_feat;
+ reseed_feat.T = a + 1;
+ reseed_feat.H = H;
+ reseed_feat.data.assign(
+ verify_feat.data.begin(), verify_feat.data.begin() + (a + 1) * H);
+ int64_t t_d = llm::time_in_ms();
+ proposals = chain(reseed_tokens, reseed_feat, anchor_pos);
+ draft_ms += llm::time_in_ms() - t_d;
+ anchor = corrected;
+ anchor_pos = anchor_pos + 1 + a;
+ }
+ printf("\n");
+ printf("[timing] verify=%" PRId64 "ms draft=%" PRId64 "ms over %" PRId64
+ " rounds (%.1f / %.1f ms per round)\n",
+ verify_ms, draft_ms, rounds,
+ rounds ? (double)verify_ms / rounds : 0.0,
+ rounds ? (double)draft_ms / rounds : 0.0);
+
+ stats.inference_end_ms = llm::time_in_ms();
+ stats.num_prompt_tokens = L;
+ stats.num_generated_tokens = static_cast(emitted.size());
+
+#ifdef EXECUTORCH_BUILD_CUDA
+ cudaDeviceSynchronize();
+#endif
+
+ // tau = mean tokens emitted per verify round; emitted[0] is the free prefill
+ // bonus (not produced by a round), so exclude it.
+ double tau =
+ rounds ? static_cast(emitted.size() - 1) / rounds : 0.0;
+ double gen_s =
+ (stats.inference_end_ms - stats.prompt_eval_end_ms) / 1000.0;
+ printf("\n--- EAGLE-3 speculative decode ---\n");
+ printf("generated %zu tokens in %" PRId64 " rounds (tau=%.3f)\n",
+ emitted.size(), rounds, tau);
+ if (gen_s > 0) {
+ printf("decode: %.2f tok/s (%.3f s)\n", emitted.size() / gen_s, gen_s);
+ }
+ llm::print_report(stats);
+ return 0;
+}