From 2c830f53f6b4a1840d0e17c2dfff73a8a4772e2c Mon Sep 17 00:00:00 2001
From: fszontagh <szf@fsociety.hu>
Date: Sat, 6 Jun 2026 17:16:42 +0200
Subject: [PATCH 1/2] perf: cap planner budget when model dwarfs the streaming
 budget

---
 src/ggml_extend.hpp | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index 1f32c9bc9..d54ab694c 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -2470,10 +2470,22 @@ struct GGMLRunner {
             *effective_budget_out = effective_budget;
         }
 
+        // When the model dwarfs the budget, cap the planner at a quarter so
+        // it builds smaller merged segments and chunk-K can fit alongside.
+        // Otherwise leave the planner free to merge into one large segment.
+        size_t total_params_bytes = 0;
+        for (const ggml_tensor* t : params_tensor_set_) {
+            if (t != nullptr) {
+                total_params_bytes += ggml_nbytes(t);
+            }
+        }
+        const size_t planner_budget =
+            (total_params_bytes * 4 > effective_budget * 3) ? effective_budget / 4 : effective_budget;
+
         *plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend,
                                                      gf,
                                                      &graph_cut_plan_cache_,
-                                                     effective_budget,
+                                                     planner_budget,
                                                      params_tensor_set_,
                                                      get_desc().c_str());
         if (stream_layers_enabled) {

From 88a5ee44ab7bf91084831f90600e6bba360a32a7 Mon Sep 17 00:00:00 2001
From: fszontagh <szf@fsociety.hu>
Date: Sun, 7 Jun 2026 22:29:45 +0200
Subject: [PATCH 2/2] perf: gate planner budget cap on stream_layers_enabled

---
 src/ggml_extend.hpp | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp
index 3cb00db9f..26ba0bbdf 100644
--- a/src/ggml_extend.hpp
+++ b/src/ggml_extend.hpp
@@ -2470,17 +2470,21 @@ struct GGMLRunner {
             *effective_budget_out = effective_budget;
         }
 
-        // When the model dwarfs the budget, cap the planner at a quarter so
-        // it builds smaller merged segments and chunk-K can fit alongside.
-        // Otherwise leave the planner free to merge into one large segment.
-        size_t total_params_bytes = 0;
-        for (const ggml_tensor* t : params_tensor_set_) {
-            if (t != nullptr) {
-                total_params_bytes += ggml_nbytes(t);
+        // When streaming and params exceed 3/4 of the budget, cap the planner
+        // at a quarter so it builds smaller merged segments and chunk-K can
+        // fit alongside. Without streaming the cap only adds dispatch overhead.
+        size_t planner_budget = effective_budget;
+        if (stream_layers_enabled) {
+            size_t total_params_bytes = 0;
+            for (const ggml_tensor* t : params_tensor_set_) {
+                if (t != nullptr) {
+                    total_params_bytes += ggml_nbytes(t);
+                }
+            }
+            if (total_params_bytes * 4 > effective_budget * 3) {
+                planner_budget = effective_budget / 4;
             }
         }
-        const size_t planner_budget =
-            (total_params_bytes * 4 > effective_budget * 3) ? effective_budget / 4 : effective_budget;
 
         *plan_out = sd::ggml_graph_cut::resolve_plan(runtime_backend,
                                                      gf,