diff --git a/examples/specdec_bench/run.py b/examples/specdec_bench/run.py index f4fbf06c0e8..94932c787b8 100644 --- a/examples/specdec_bench/run.py +++ b/examples/specdec_bench/run.py @@ -265,7 +265,7 @@ def run_simple(args): type=str, required=False, default="EAGLE3", - choices=["EAGLE3", "EAGLE", "DRAFT_TARGET", "NGRAM", "MTP", "NONE"], + choices=["EAGLE3", "EAGLE", "DRAFT_TARGET", "NGRAM", "MTP", "DFLASH", "NONE"], help="Speculative algorithm to use", ) parser.add_argument("--model_dir", type=str, required=True, help="Path to the model directory") diff --git a/examples/specdec_bench/specdec_bench/models/sglang.py b/examples/specdec_bench/specdec_bench/models/sglang.py index d5ff890ffd7..5e39695aff7 100644 --- a/examples/specdec_bench/specdec_bench/models/sglang.py +++ b/examples/specdec_bench/specdec_bench/models/sglang.py @@ -43,44 +43,48 @@ def __init__( speculative_algorithm = "LOOKAHEAD" elif speculative_algorithm == "NONE": speculative_algorithm = None + + engine_kwargs = dict( + model_path=model_dir, + skip_tokenizer_init=True, + trust_remote_code=kwargs.get("trust_remote_code", False), + mem_fraction_static=kwargs.get("mem_fraction_static", 0.8), + disable_overlap_schedule=kwargs.get("disable_overlap_schedule", False), + tp_size=kwargs.get("tensor_parallel_size", 1), + ep_size=kwargs.get("moe_expert_parallel_size", 1), + torch_compile_max_bs=max_concurrent_requests, + max_running_requests=max_concurrent_requests, + attention_backend=kwargs.get("attention_backend"), + enable_torch_compile=kwargs.get("enable_torch_compile", False), + cuda_graph_max_bs=max_concurrent_requests, + disable_cuda_graph=False, + disable_cuda_graph_padding=True, + ) if speculative_algorithm is not None: # https://github.com/sgl-project/sglang/pull/3582 - self.model = sgl.Engine( - model_path=model_dir, - skip_tokenizer_init=True, - trust_remote_code=kwargs.get("trust_remote_code", False), - mem_fraction_static=0.8, - disable_overlap_schedule=kwargs.get("disable_overlap_schedule", False), - tp_size=kwargs.get("tensor_parallel_size", 1), - ep_size=kwargs.get("moe_expert_parallel_size", 1), - speculative_algorithm=speculative_algorithm, - speculative_num_steps=kwargs.get("speculative_num_steps", 3), - speculative_eagle_topk=kwargs.get("speculative_eagle_topk", 1), - speculative_num_draft_tokens=kwargs.get("speculative_num_draft_tokens", 4), - speculative_draft_model_path=kwargs.get("draft_model_dir"), - torch_compile_max_bs=max_concurrent_requests, - max_running_requests=max_concurrent_requests, - attention_backend=kwargs.get("attention_backend"), - enable_torch_compile=kwargs.get("enable_torch_compile", False), - cuda_graph_max_bs=max_concurrent_requests, - disable_cuda_graph=False, - ) - else: - self.model = sgl.Engine( - model_path=model_dir, - skip_tokenizer_init=True, - trust_remote_code=kwargs.get("trust_remote_code", False), - mem_fraction_static=0.8, - disable_overlap_schedule=kwargs.get("disable_overlap_schedule", False), - tp_size=kwargs.get("tensor_parallel_size", 1), - ep_size=kwargs.get("moe_expert_parallel_size", 1), - torch_compile_max_bs=max_concurrent_requests, - max_running_requests=max_concurrent_requests, - attention_backend=kwargs.get("attention_backend"), - enable_torch_compile=kwargs.get("enable_torch_compile", False), - cuda_graph_max_bs=max_concurrent_requests, - disable_cuda_graph=False, - ) + engine_kwargs["speculative_algorithm"] = speculative_algorithm + engine_kwargs["speculative_draft_model_path"] = kwargs.get("draft_model_dir") + if speculative_algorithm == "DFLASH": + engine_kwargs["speculative_num_draft_tokens"] = kwargs.get("speculative_num_draft_tokens", 8) + if "speculative_dflash_draft_window_size" in kwargs: + engine_kwargs["speculative_dflash_draft_window_size"] = kwargs[ + "speculative_dflash_draft_window_size" + ] + print( + f"[specdec_bench] DFLASH ignores --draft_length / speculative_num_steps / " + f"speculative_eagle_topk; effective draft block = " + f"speculative_num_draft_tokens={engine_kwargs['speculative_num_draft_tokens']}" + ) + else: + engine_kwargs["speculative_num_draft_tokens"] = kwargs.get("speculative_num_draft_tokens", 4) + engine_kwargs["speculative_num_steps"] = kwargs.get("speculative_num_steps", 3) + engine_kwargs["speculative_eagle_topk"] = kwargs.get("speculative_eagle_topk", 1) + + # extra engine arg needed for qwen3.5 + if "mamba_scheduler_strategy" in kwargs: + engine_kwargs["mamba_scheduler_strategy"] = kwargs["mamba_scheduler_strategy"] + + self.model = sgl.Engine(**engine_kwargs) self.sampling_config = sampling_kwargs diff --git a/examples/specdec_bench/specdec_bench/models/vllm.py b/examples/specdec_bench/specdec_bench/models/vllm.py index 2e312e7aec8..fc595c1d579 100644 --- a/examples/specdec_bench/specdec_bench/models/vllm.py +++ b/examples/specdec_bench/specdec_bench/models/vllm.py @@ -63,6 +63,12 @@ def __init__(self, model_dir, max_concurrent_requests, sampling_kwargs, **kwargs "method": "mtp", "num_speculative_tokens": kwargs.get("speculative_num_steps", 3), } + elif kwargs.get("speculative_algorithm") == "DFLASH": + specdec = { + "method": "dflash", + "model": kwargs.get("draft_model_dir"), + "num_speculative_tokens": kwargs.get("speculative_num_draft_tokens", 8), + } elif kwargs.get("speculative_algorithm") == "NONE": specdec = None