From 2c830f53f6b4a1840d0e17c2dfff73a8a4772e2c Mon Sep 17 00:00:00 2001 From: fszontagh Date: Sat, 6 Jun 2026 17:16:42 +0200 Subject: [PATCH 1/2] perf: cap planner budget when model dwarfs the streaming budget --- src/ggml_extend.hpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 1f32c9bc9..d54ab694c 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -2470,10 +2470,22 @@ struct GGMLRunner { *effective_budget_out = effective_budget; } + // When the model dwarfs the budget, cap the planner at a quarter so + // it builds smaller merged segments and chunk-K can fit alongside. + // Otherwise leave the planner free to merge into one large segment. + size_t total_params_bytes = 0; + for (const ggml_tensor* t : params_tensor_set_) { + if (t != nullptr) { + total_params_bytes += ggml_nbytes(t); + } + } + const size_t planner_budget = + (total_params_bytes * 4 > effective_budget * 3) ? effective_budget / 4 : effective_budget; + *plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend, gf, &graph_cut_plan_cache_, - effective_budget, + planner_budget, params_tensor_set_, get_desc().c_str()); if (stream_layers_enabled) { From 88a5ee44ab7bf91084831f90600e6bba360a32a7 Mon Sep 17 00:00:00 2001 From: fszontagh Date: Sun, 7 Jun 2026 22:29:45 +0200 Subject: [PATCH 2/2] perf: gate planner budget cap on stream_layers_enabled --- src/ggml_extend.hpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 3cb00db9f..26ba0bbdf 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -2470,17 +2470,21 @@ struct GGMLRunner { *effective_budget_out = effective_budget; } - // When the model dwarfs the budget, cap the planner at a quarter so - // it builds smaller merged segments and chunk-K can fit alongside. - // Otherwise leave the planner free to merge into one large segment. - size_t total_params_bytes = 0; - for (const ggml_tensor* t : params_tensor_set_) { - if (t != nullptr) { - total_params_bytes += ggml_nbytes(t); + // When streaming and the model dwarfs the budget, cap the planner at + // a quarter so it builds smaller merged segments and chunk-K can fit + // alongside. Without streaming the cap only adds dispatch overhead. + size_t planner_budget = effective_budget; + if (stream_layers_enabled) { + size_t total_params_bytes = 0; + for (const ggml_tensor* t : params_tensor_set_) { + if (t != nullptr) { + total_params_bytes += ggml_nbytes(t); + } + } + if (total_params_bytes * 4 > effective_budget * 3) { + planner_budget = effective_budget / 4; } } - const size_t planner_budget = - (total_params_bytes * 4 > effective_budget * 3) ? effective_budget / 4 : effective_budget; *plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend, gf,