From 5725feeef6ea109a49b7418978fad871b1eb5f86 Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Tue, 9 Jun 2026 09:03:55 -0700 Subject: [PATCH] Update [ghstack-poisoned] --- examples/models/eagle3/CMakeLists.txt | 69 +++ examples/models/eagle3/CMakePresets.json | 29 ++ examples/models/eagle3/main.cpp | 532 +++++++++++++++++++++++ 3 files changed, 630 insertions(+) create mode 100644 examples/models/eagle3/CMakeLists.txt create mode 100644 examples/models/eagle3/CMakePresets.json create mode 100644 examples/models/eagle3/main.cpp diff --git a/examples/models/eagle3/CMakeLists.txt b/examples/models/eagle3/CMakeLists.txt new file mode 100644 index 00000000000..f7fe225f4f3 --- /dev/null +++ b/examples/models/eagle3/CMakeLists.txt @@ -0,0 +1,69 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +cmake_minimum_required(VERSION 3.24) +project(eagle3_speculator) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) + +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) + +set(_common_include_directories ${EXECUTORCH_ROOT}/..) + +# gflags +set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags) +find_package(gflags REQUIRED) + +# executorch +list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..) +find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH) +executorch_target_link_options_shared_lib(executorch) + +set(link_libraries executorch gflags) + +# CPU ops (host-side helpers not delegated to CUDA) +list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas) +executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib) + +# Extensions +list( + APPEND + link_libraries + extension_llm_runner + extension_module + extension_data_loader + extension_tensor + extension_flat_tensor +) + +# Backend: CUDA (AOTI). The EAGLE-3 speculator export is CUDA-only. +if(EXECUTORCH_BUILD_CUDA) + find_package(CUDAToolkit REQUIRED) + list(APPEND link_libraries aoti_cuda_backend) + executorch_target_link_options_shared_lib(aoti_cuda_backend) + add_compile_definitions(EXECUTORCH_BUILD_CUDA) +else() + message(FATAL_ERROR "EAGLE-3 speculator runner requires EXECUTORCH_BUILD_CUDA=ON") +endif() + +# Tokenizer (HuggingFace tokenizer.json) +list(APPEND link_libraries tokenizers::tokenizers) + +add_executable(eagle3_speculator_runner main.cpp) +target_include_directories( + eagle3_speculator_runner PUBLIC ${_common_include_directories} +) +target_link_libraries(eagle3_speculator_runner PUBLIC ${link_libraries}) + +if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + target_link_options_gc_sections(eagle3_speculator_runner) + if(NOT APPLE AND NOT MSVC) + target_link_options(eagle3_speculator_runner PRIVATE "LINKER:-s") + endif() +endif() diff --git a/examples/models/eagle3/CMakePresets.json b/examples/models/eagle3/CMakePresets.json new file mode 100644 index 00000000000..9d9b75b262d --- /dev/null +++ b/examples/models/eagle3/CMakePresets.json @@ -0,0 +1,29 @@ +{ + "version": 6, + "configurePresets": [ + { + "name": "eagle3-cuda", + "displayName": "EAGLE-3 speculator runner (CUDA)", + "binaryDir": "${sourceDir}/../../../cmake-out/examples/models/eagle3", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out", + "CMAKE_PREFIX_PATH": "${sourceDir}/../../../cmake-out", + "EXECUTORCH_BUILD_CUDA": "ON" + }, + "condition": { + "type": "inList", + "string": "${hostSystemName}", + "list": ["Linux", "Windows"] + } + } + ], + "buildPresets": [ + { + "name": "eagle3-cuda", + "displayName": "Build EAGLE-3 speculator runner (CUDA)", + "configurePreset": "eagle3-cuda", + "targets": ["eagle3_speculator_runner"] + } + ] +} diff --git a/examples/models/eagle3/main.cpp b/examples/models/eagle3/main.cpp new file mode 100644 index 00000000000..6a68e89eaaa --- /dev/null +++ b/examples/models/eagle3/main.cpp @@ -0,0 +1,532 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// EAGLE-3 speculative-decoding runner for ExecuTorch (CUDA/AOTI backend). +// +// Loads the speculator .pte (examples/models/eagle3/export.py) exposing three +// methods that share the target / draft KV caches: +// prefill(tokens[1,T], pos[T]) -> (next_token[1,1], feat[1,T,H]) +// target_verify(tokens[1,C], pos[C]) -> (greedy_ids[1,C], feat[1,C,H]) +// draft_decode(tokens[1,T], feat[1,T,H], pos[T]) -> (target_ids[1,T], g[1,T,H]) +// where feat is the fused (hidden-size) draft feature and H is the draft hidden +// size. Verification is greedy (argmax), so emitted tokens equal greedy target +// decoding (lossless) by construction. +// +// Scheme: the shifted EAGLE convention (vllm/v1/spec_decode/eagle.py, +// set_inputs_first_pass: "Shift the input ids by one token" with unshifted +// hidden_states). The draft pairs target hidden_state_t with token_{t+1}, so a +// new draft chain seeds from the hidden states target_verify already produced +// for the just-confirmed positions plus the corrected token's embedding -- the +// corrected/bonus token never needs its own target forward, giving one target +// forward per round (speedup ~= acceptance length tau). +// +// Features round-trip through the host between method calls (D2H copy + re-feed +// as host tensors). They are small (<= max_prefill x H bf16), so the cost is +// negligible next to the INT4 31B target forward, and it keeps device-tensor +// lifetimes simple. +// +// Run (after exporting model.pte + aoti_cuda_blob.ptd via export.py, sourcing the +// CUDA env, and building the eagle3-cuda preset): +// eagle3_speculator_runner --model_path /model.pte \ +// --data_path /aoti_cuda_blob.ptd --tokenizer_path \ +// --prompt "..." --max_new_tokens 128 +// The chat template and stop tokens default to Gemma 4 IT; override +// --chat_prefix/--chat_suffix/--stop_ids/--stop_token (and --bos_id -1) for other +// target/tokenizer pairs. Per-run timing counters (tau, verify/draft ms) print at +// the end. +// +// Scope: a single-sequence, greedy, fixed-shape demo runner -- not a generic +// EAGLE serving path. No batching, sampler stack (top-k/p/temperature), grammar/ +// tool constraints, streaming API, or integration with the standard ExecuTorch +// LLM runner. The host feature round-trip above is a first-implementation choice +// (the target forward dominates here); a device-resident handoff is future work. +// The target, draft, and tokenizer must be a matched, co-trained set -- a +// mismatch can pass export and silently degrade acceptance/output. + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +extern "C" void et_pal_emit_log_message( + ET_UNUSED et_timestamp_t timestamp, + et_pal_log_level_t level, + const char* filename, + ET_UNUSED const char* function, + size_t line, + const char* message, + ET_UNUSED size_t length) { + if (level == 'D' || level == 'I') { + return; + } + fprintf(stderr, "%c [%s:%zu] %s\n", (char)level, filename, line, message); +} + +#ifdef EXECUTORCH_BUILD_CUDA +#include +#endif + +DEFINE_string(model_path, "", "Speculator model.pte path."); +DEFINE_string(data_path, "", "Tensor data (.ptd) path for the CUDA backend."); +DEFINE_string(tokenizer_path, "", "HuggingFace tokenizer.json path."); +DEFINE_string(prompt, "Explain why the sky is blue.", "Prompt text."); +DEFINE_bool(raw_prompt, false, "Skip the Gemma 4 IT chat template."); +DEFINE_int32(max_new_tokens, 128, "Maximum tokens to generate."); +DEFINE_int32(bos_id, 2, "BOS token id (-1 to skip; Gemma convention: 2)."); +DEFINE_int32(eos_id, 1, "EOS token id (Gemma convention: 1)."); +DEFINE_bool(cuda_graph, true, "Capture target_verify as a CUDA graph (CUDA only)."); +// Chat template + stop tokens default to Gemma 4 IT; override for other models. +DEFINE_string(chat_prefix, "<|turn>user\n", "Chat-template text before the prompt."); +DEFINE_string( + chat_suffix, + "\n<|turn>model\n<|channel>thought\n", + "Chat-template text after the prompt."); +DEFINE_string( + stop_ids, + "1,50,106", + "Comma-separated extra stop token ids (empty to add none)."); +DEFINE_string( + stop_token, + "", + "A stop-delimiter string to encode and add to EOS (empty to skip)."); + +using executorch::extension::from_blob; +using executorch::extension::Module; +using executorch::runtime::Error; +using executorch::runtime::EValue; +namespace llm = executorch::extension::llm; +using SizesType = executorch::aten::SizesType; + +namespace { + +// D2H-copy a tensor's raw bytes into a host buffer (the AOTI backend returns +// device tensors). Works for any dtype; caller reinterprets. +std::vector to_host_bytes(const executorch::aten::Tensor& t) { + std::vector out(t.nbytes()); + const void* ptr = t.const_data_ptr(); +#ifdef EXECUTORCH_BUILD_CUDA + cudaPointerAttributes attrs{}; + if (cudaPointerGetAttributes(&attrs, ptr) == cudaSuccess && + attrs.type == cudaMemoryTypeDevice) { + cudaError_t err = cudaMemcpy(out.data(), ptr, out.size(), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) { + ET_LOG(Error, "D2H copy failed: %s", cudaGetErrorString(err)); + exit(1); + } + return out; + } +#endif + memcpy(out.data(), ptr, out.size()); + return out; +} + +// Read an int64 (1, N) tensor to a host vector. +std::vector read_ids(const executorch::aten::Tensor& t) { + auto bytes = to_host_bytes(t); + size_t n = bytes.size() / sizeof(int64_t); + std::vector ids(n); + memcpy(ids.data(), bytes.data(), bytes.size()); + return ids; +} + +// A draft feature held on the host as raw bf16 (uint16) so it can be re-fed. +struct HostFeature { + std::vector data; // row-major (T, H) + int64_t T = 0; + int64_t H = 0; +}; + +HostFeature read_feature(const executorch::aten::Tensor& t) { + // t is (1, T, H) bf16. + HostFeature f; + f.T = t.size(1); + f.H = t.size(2); + auto bytes = to_host_bytes(t); + f.data.resize(bytes.size() / sizeof(uint16_t)); + memcpy(f.data.data(), bytes.data(), bytes.size()); + return f; +} + +} // namespace + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_path.empty() || FLAGS_tokenizer_path.empty()) { + ET_LOG(Error, "Must specify --model_path and --tokenizer_path"); + return 1; + } + + llm::Stats stats; + stats.model_load_start_ms = llm::time_in_ms(); + + auto tokenizer = std::make_unique(); + if (tokenizer->load(FLAGS_tokenizer_path) != tokenizers::Error::Ok) { + ET_LOG(Error, "Failed to load tokenizer from %s", FLAGS_tokenizer_path.c_str()); + return 1; + } + + std::vector data_files; + if (!FLAGS_data_path.empty()) { + data_files.push_back(FLAGS_data_path); + } + auto module = std::make_unique( + FLAGS_model_path, + data_files, + Module::LoadMode::MmapUseMlockIgnoreErrors, + /*event_tracer=*/nullptr, + /*memory_allocator=*/nullptr, + /*temp_allocator=*/nullptr, + /*share_memory_arenas=*/true); + + // Weight sharing across methods (prefill and target_verify share the target). +#ifdef EXECUTORCH_BUILD_CUDA + { + executorch::runtime::BackendOptions<1> backend_options; + backend_options.set_option("weight_sharing_across_methods", true); + executorch::runtime::set_option("CudaBackend", backend_options.view()); + } + if (FLAGS_cuda_graph) { + // target_verify is the one target forward per round and has a static shape + // (chain+1 tokens), so capture it as a CUDA graph to avoid paying the + // 60-layer per-kernel launch overhead every round (the dominant cost + // otherwise). Its input tensors must wrap stable host buffers (below). + executorch::runtime::BackendOptions<1> g; + g.set_option("enable_cuda_graph_for_method", "target_verify"); + executorch::runtime::set_option("CudaBackend", g.view()); + } +#endif + + for (const char* m : {"prefill", "target_verify", "draft_decode"}) { + if (module->load_method(m) != Error::Ok) { + ET_LOG(Error, "Failed to load method %s", m); + return 1; + } + } + + if (FLAGS_max_new_tokens <= 0) { + ET_LOG(Error, "--max_new_tokens must be >= 1"); + return 1; + } + + // Metadata baked in by export.py (required: a missing key means a + // mismatched/old .pte, so fail loudly instead of guessing). + auto meta = [&](const char* name) -> int64_t { + auto r = module->get(name); + if (!r.ok()) { + ET_LOG(Error, "missing required .pte metadata: %s", name); + exit(1); + } + return r->toScalar().to(); + }; + const int64_t chain_len = meta("get_chain_len"); + const int64_t max_prefill = meta("get_max_prefill_chunk"); + const int64_t min_prefill = meta("get_min_prefill_chunk"); + const int64_t max_seq_len = meta("get_max_seq_len"); + const int64_t K = chain_len; + + // EOS: tokenizer/metadata ids, the configured eos, any --stop_ids, and the + // encoded --stop_token delimiter (all default to the Gemma 4 IT conventions). + auto eos_ids = llm::get_eos_ids(tokenizer.get(), module.get()); + eos_ids.insert(static_cast(FLAGS_eos_id)); + for (size_t b = 0, e; b <= FLAGS_stop_ids.size(); b = e + 1) { + e = FLAGS_stop_ids.find(',', b); + if (e == std::string::npos) { + e = FLAGS_stop_ids.size(); + } + std::string tok = FLAGS_stop_ids.substr(b, e - b); + if (!tok.empty()) { + eos_ids.insert(static_cast(std::stoll(tok))); + } + } + if (!FLAGS_stop_token.empty()) { + if (auto t = tokenizer->encode(FLAGS_stop_token, /*bos=*/0, /*eos=*/0); + t.ok() && t->size() == 1) { + eos_ids.insert(t.get()[0]); + } + } + + std::string prompt_text = FLAGS_prompt; + if (!FLAGS_raw_prompt) { + prompt_text = FLAGS_chat_prefix + prompt_text + FLAGS_chat_suffix; + } + auto enc = tokenizer->encode(prompt_text); + if (!enc.ok()) { + ET_LOG(Error, "Failed to encode prompt"); + return 1; + } + std::vector prompt(enc->begin(), enc->end()); + if (FLAGS_bos_id >= 0) { + prompt.insert(prompt.begin(), static_cast(FLAGS_bos_id)); + } + const int64_t L = static_cast(prompt.size()); + // The runner does not chunk: the whole prompt must fit one prefill, and its + // length must be within the exported prefill range [min_prefill, max_prefill]. + if (L > max_prefill) { + ET_LOG(Error, "Prompt (%" PRId64 " tokens) exceeds max_prefill %" PRId64 + "; this runner does not chunk prefill.", L, max_prefill); + return 1; + } + if (L < min_prefill) { + ET_LOG(Error, "Prompt (%" PRId64 " tokens) is below the exported prefill " + "minimum %" PRId64 "; use a longer prompt.", L, min_prefill); + return 1; + } + // The prefill bonus token is always emittable (no KV write past the prompt). + // Each speculative round, however, writes a K-token verify window, so it needs + // anchor_pos + K <= max_seq_len - 1 (enforced in the loop below). Cap the total + // at the positions available; max_new >= 1 since L <= max_prefill < max_seq_len. + int64_t max_new = std::min(FLAGS_max_new_tokens, max_seq_len - L); + printf("Prompt tokens: %" PRId64 ", chain K=%" PRId64 ", max_new=%" PRId64 + "\n", L, K, max_new); + + auto S = [](int64_t v) { return static_cast(v); }; + + // Persistent host buffers backing the tensors handed to each execute() call. + std::vector tok_buf, pos_buf; + std::vector feat_buf; + + auto long_tensor = [&](std::vector& buf) { + return from_blob( + buf.data(), {1, S((int64_t)buf.size())}, executorch::aten::ScalarType::Long); + }; + auto pos_tensor = [&](std::vector& buf) { + return from_blob( + buf.data(), {S((int64_t)buf.size())}, executorch::aten::ScalarType::Long); + }; + + // draft_decode over (tokens, feat rows, positions); returns proposals + the + // last row of g (the recurrent feature for the next chain step). + auto draft_decode = [&](const std::vector& tokens, + const uint16_t* feat_rows, + int64_t feat_T, + int64_t H, + int64_t start_pos, + std::vector& out_ids, + HostFeature& out_last_g) { + tok_buf.assign(tokens.begin(), tokens.end()); + pos_buf.resize(tokens.size()); + for (size_t i = 0; i < tokens.size(); i++) { + pos_buf[i] = start_pos + static_cast(i); + } + feat_buf.assign(feat_rows, feat_rows + feat_T * H); + auto t_tok = long_tensor(tok_buf); + auto t_feat = from_blob( + feat_buf.data(), {1, S(feat_T), S(H)}, executorch::aten::ScalarType::BFloat16); + auto t_pos = pos_tensor(pos_buf); + auto r = module->execute( + "draft_decode", {EValue(t_tok), EValue(t_feat), EValue(t_pos)}); + if (r.error() != Error::Ok) { + ET_LOG(Error, "draft_decode failed"); + exit(1); + } + out_ids = read_ids(r->at(0).toTensor()); + HostFeature g = read_feature(r->at(1).toTensor()); + out_last_g.T = 1; + out_last_g.H = g.H; + out_last_g.data.assign( + g.data.end() - g.H, g.data.end()); // last row of g + }; + + // Run a draft chain seeded by (seed_tokens, seed_feat) at seed positions; the + // last seeded slot predicts proposal 0, then K-1 recurrent steps. + auto chain = [&](const std::vector& seed_tokens, + const HostFeature& seed_feat, + int64_t seed_start_pos) { + std::vector proposals; + std::vector ids; + HostFeature last_g; + draft_decode( + seed_tokens, seed_feat.data.data(), seed_feat.T, seed_feat.H, + seed_start_pos, ids, last_g); + proposals.push_back(ids.back()); + int64_t last_pos = seed_start_pos + seed_feat.T - 1; + for (int64_t k = 1; k < K; k++) { + std::vector step_ids; + HostFeature step_g; + draft_decode( + {proposals.back()}, last_g.data.data(), 1, last_g.H, + last_pos + k, step_ids, step_g); + proposals.push_back(step_ids[0]); + last_g = step_g; + } + return proposals; + }; + + stats.model_load_end_ms = llm::time_in_ms(); + stats.inference_start_ms = stats.model_load_end_ms; + + // --- Prefill: target over the prompt -> bonus token + per-position feature. --- + tok_buf = prompt; + pos_buf.resize(L); + for (int64_t i = 0; i < L; i++) { + pos_buf[i] = i; + } + auto pf = module->execute( + "prefill", {EValue(long_tensor(tok_buf)), EValue(pos_tensor(pos_buf))}); + if (pf.error() != Error::Ok) { + ET_LOG(Error, "prefill failed"); + return 1; + } + int64_t anchor = read_ids(pf->at(0).toTensor())[0]; // bonus token at position L + HostFeature feat_prompt = read_feature(pf->at(1).toTensor()); + const int64_t H = feat_prompt.H; + int64_t anchor_pos = L; + + stats.prompt_eval_end_ms = llm::time_in_ms(); + stats.first_token_ms = stats.prompt_eval_end_ms; + + std::vector emitted = {anchor}; + uint64_t prev = static_cast(prompt.back()); + { + auto s = tokenizer->decode(prev, static_cast(anchor)); + if (s.ok()) { printf("%s", s->c_str()); fflush(stdout); } + prev = static_cast(anchor); + } + + // We only run the speculative loop if more than the (already emitted) prefill + // bonus is wanted, the bonus wasn't EOS, and there is room for a K-token verify + // window. Otherwise we are done -- no draft seeding needed. + bool hit_eos = eos_ids.count(static_cast(anchor)) > 0; + bool speculate = max_new > 1 && !hit_eos && anchor_pos + K <= max_seq_len - 1; + std::vector proposals; + if (speculate) { + // Seed the first chain (shifted): draft slot p pairs feat_prompt[p] with + // token_{p+1}; the last slot pairs feat_prompt[L-1] with the bonus and + // predicts position L+1. + std::vector seed_tokens(prompt.begin() + 1, prompt.end()); + seed_tokens.push_back(anchor); + proposals = chain(seed_tokens, feat_prompt, 0); + } + + // Stable buffers for target_verify (fixed length K+1) so the CUDA graph + // replays against the same input addresses; we mutate the contents in place. + std::vector vtok_buf(K + 1), vpos_buf(K + 1); + auto vtok_t = from_blob( + vtok_buf.data(), {1, S(K + 1)}, executorch::aten::ScalarType::Long); + auto vpos_t = from_blob( + vpos_buf.data(), {S(K + 1)}, executorch::aten::ScalarType::Long); + + // --- Speculative rounds: one target forward (target_verify) per round. --- + int64_t rounds = 0; + int64_t verify_ms = 0, draft_ms = 0; // instrumentation + while (speculate && (int64_t)emitted.size() < max_new && !hit_eos && + anchor_pos + K <= max_seq_len - 1) { + rounds++; + // Verify [anchor, p0..p_{K-1}] at positions [anchor_pos .. anchor_pos+K]. + vtok_buf[0] = anchor; + for (int64_t j = 0; j < K; j++) { + vtok_buf[j + 1] = proposals[j]; + } + for (int64_t i = 0; i <= K; i++) { + vpos_buf[i] = anchor_pos + i; + } + int64_t t_v = llm::time_in_ms(); + auto vr = module->execute("target_verify", {EValue(vtok_t), EValue(vpos_t)}); + if (vr.error() != Error::Ok) { + ET_LOG(Error, "target_verify failed"); + return 1; + } + std::vector verify_ids = read_ids(vr->at(0).toTensor()); + HostFeature verify_feat = read_feature(vr->at(1).toTensor()); + verify_ms += llm::time_in_ms() - t_v; + + // Greedy acceptance: verify_ids[j] is the greedy token after token j, so it + // checks proposal j (which sits at position anchor_pos+1+j). + int64_t a = 0; + for (int64_t j = 0; j < K; j++) { + if (proposals[j] == verify_ids[j]) { + a++; + } else { + break; + } + } + int64_t corrected = verify_ids[a]; + + std::vector newly(proposals.begin(), proposals.begin() + a); + newly.push_back(corrected); + for (int64_t t : newly) { + if ((int64_t)emitted.size() >= max_new) break; + emitted.push_back(t); + auto s = tokenizer->decode(prev, static_cast(t)); + if (s.ok()) { printf("%s", s->c_str()); fflush(stdout); } + prev = static_cast(t); + if (eos_ids.count(static_cast(t)) > 0) { + // Stop at the first accepted EOS; do not emit the rest of this batch. + // An accepted proposal (not just the corrected/bonus token) can be EOS, + // so this truncates newly at the first stop token, matching the eager + // reference. + hit_eos = true; + break; + } + } + if (hit_eos || (int64_t)emitted.size() >= max_new) break; + + // Reseed the draft (shifted): slot anchor_pos+i holds (verify_feat[i], + // token_{anchor_pos+i+1}) where token = p_i (i