pytorch · mergennachin · Jun 9, 2026 · Jun 9, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md
@@ -173,8 +173,8 @@ serve.py            (control plane: FastAPI/asyncio, OpenAI protocol, chat
                      templating, tool parsing, validation — NO CUDA, NO pybind)
    │  JSONL over stdin/stdout
    ▼
-qwen3_5_moe_worker  (C++ binary: one Qwen35MoEEngine + one session, synchronous
-                     loop — the CUDA model; NO asyncio server)
+qwen3_5_moe_worker  (C++ binary: one Qwen35MoEEngine, many isolated sessions,
+                     synchronous loop — the CUDA model; NO asyncio server)
 ```
 
 The model runs in a **separate worker process** because executing the AOTI CUDA
@@ -196,13 +196,42 @@ is safe under asyncio.
 | `--host` / `--port` | `127.0.0.1` / `8000` | Bind address |
 | `--max-context` | (none) | Reject prompts that exceed it with 400 |
 | `--no-think` | off | Default reasoning off (`enable_thinking=False`) |
+| `--max-sessions` | `1` | Isolated sessions on one weight load (see Sessions) |
 
-### V1 limitations
+### Sessions
+
+One worker loads the weights once (~18 GB) and hosts multiple **isolated**
+sessions on that single allocation — each with its own KV/recurrent state, via
+CUDA per-session mutable rebinding. Set `--max-sessions N` (clamped to 1 if the
+backend cannot rebind); one slot is reserved for anonymous requests, so up to
+`N - 1` named `session_id`s are addressable.
+
+Route a request to a persistent session with the `session_id` body field or, as
+aliases, the `X-ExecuTorch-Session-ID` / `session_id` / `x-session-affinity`
+headers (body wins, then that header order). The header aliases let a client that
+already emits a stable per-conversation affinity id (e.g. pi's
+`sendSessionAffinityHeaders`) route with no extra config. Requests without any
+share a transient scratch session. Free a session with `DELETE /v1/sessions/{id}`.
+
+```bash
+curl http://127.0.0.1:8000/v1/chat/completions \
+  -H 'Content-Type: application/json' \
+  -d '{"model":"qwen3.5-moe","session_id":"alice",
+       "messages":[{"role":"user","content":"hi"}]}'
+```
+
+Admission is up front: an explicit `session_id` on a single-session server
+returns **400** (`unsupported_session`); past capacity it returns **429**
+(`capacity_exhausted`) before any response bytes.
+
+This is **isolation, not concurrency or warm resume**: execution is still
+synchronous (one in-flight request; `--num-runners > 1` is rejected since more
+workers would duplicate the weights), and each request resets its session — the
+recurrent/conv state cannot be rewound by position (`seek()` is NotSupported), so
+turn-to-turn KV reuse (append-only warm resume) is a follow-up.
+
+### Other limitations
 
-- **Single-slot** (`serving_capacity=1`): one worker, one session, one model
-  load. `--num-runners > 1` is rejected; concurrent requests queue on the worker.
-- **No prefix cache**: the recurrent/conv state cannot be rewound by position
-  (`seek()` is NotSupported), so turn-to-turn KV reuse is off.
 - Supports the chat-completions contract of the generic server; `top_p != 1`,
   `seed`, `top_k`, `logprobs`, etc. are rejected (only temperature is plumbed).
 

diff --git a/examples/models/qwen3_5_moe/qwen35_moe_worker.cpp b/examples/models/qwen3_5_moe/qwen35_moe_worker.cpp
@@ -18,23 +18,29 @@
 // process segfaults in the int4 matmul (validated). Here the model runs in a
 // plain synchronous loop in its own process, which is reliable.
 //
-// Single-slot serving: this worker creates one session and the control plane
-// queues concurrent requests on it. (The engine itself can host multiple
-// sessions on the one ~18GB weight allocation; exposing that over the worker
-// protocol is a follow-up.)
+// Multi-session (isolation): the engine loads weights once and hosts multiple
+// isolated sessions on that one ~18GB allocation; the shared worker loop
+// (worker_loop.h) routes requests to per-session_id state, up to
+// --max_sessions. Execution is still synchronous (one in-flight request); warm
+// context reuse across requests is a follow-up.
 
 #include <gflags/gflags.h>
 
 #include <executorch/examples/models/qwen3_5_moe/qwen35_moe_engine.h>
-#include <executorch/extension/llm/runner/llm_session.h>
 #include <executorch/extension/llm/server/cpp/worker_loop.h>
 #include <executorch/runtime/platform/log.h>
 
-#include <utility>
+#include <cstdint>
 
 DEFINE_string(model_path, "", "Model .pte file path.");
 DEFINE_string(tokenizer_path, "", "HuggingFace tokenizer.json path.");
 DEFINE_string(data_path, "", "Data file (.ptd) for the CUDA backend.");
+DEFINE_int32(
+    max_sessions,
+    1,
+    "Max physical sessions to host on the one weight allocation (CUDA "
+    "per-session mutable rebinding). Clamped to 1 if the backend cannot "
+    "rebind.");
 
 namespace {
 namespace llm = ::executorch::extension::llm;
@@ -54,6 +60,7 @@ int main(int argc, char** argv) {
   config.model_path = FLAGS_model_path;
   config.data_path = FLAGS_data_path;
   config.tokenizer_path = FLAGS_tokenizer_path;
+  config.max_sessions = FLAGS_max_sessions;
 
   auto engine_result = llm::Qwen35MoEEngine::create(config);
   if (engine_result.error() != Error::Ok) {
@@ -62,16 +69,9 @@ int main(int argc, char** argv) {
   }
   auto engine = std::move(engine_result.get());
 
-  auto session_result = engine->create_session();
-  if (session_result.error() != Error::Ok) {
-    ET_LOG(Error, "qwen35_moe_worker: failed to create session");
-    return 1;
-  }
-  auto session = std::move(session_result.get());
-
-  // The engine's tokenizer encodes the rendered prompt to ids; the session
-  // decodes ids back to text internally.
+  // The engine's tokenizer encodes the rendered prompt to ids; sessions decode
+  // ids back to text internally. The shared loop owns per-session_id state.
   ::tokenizers::Tokenizer* tokenizer = engine->tokenizer();
 
-  return llm::run_worker_stdio_loop(*session, *tokenizer, engine->metadata());
+  return llm::run_worker_stdio_loop(*engine, *tokenizer, engine->metadata());
 }
diff --git a/examples/models/qwen3_5_moe/serve.py b/examples/models/qwen3_5_moe/serve.py
@@ -17,9 +17,14 @@
 CUDA execution while a live asyncio loop is resident). Isolating CUDA in a plain
 (no-asyncio) C++ worker process is the reliable shape, and it loads weights once.
 
-V1 constraints:
-  * single-slot: one worker, one session; concurrent HTTP requests queue.
-  * prefix cache off (Qwen seek() is NotSupported).
+Sessions and constraints:
+  * One worker hosts many isolated sessions on a single ~18GB weight load (CUDA
+    per-session mutable rebinding); requests route by session_id (anonymous
+    requests share a scratch session). See --max-sessions.
+  * Execution is synchronous: one in-flight request at a time, concurrent HTTP
+    requests queue. Sessions provide isolation, not concurrent throughput.
+  * No warm context reuse yet: each request resets its session (Qwen seek() is
+    NotSupported; append-only reuse is a follow-up).
   * The control plane only does blocking pipe I/O on its executor thread (no
     CUDA), which is safe under asyncio.
 
@@ -77,6 +82,7 @@ def _spawn(args):
     ]
     if args.data_path:
         cmd += ["--data_path", args.data_path]
+    cmd += ["--max_sessions", str(args.max_sessions)]
     logger.info("Starting Qwen worker subprocess (loads the model once)...")
     return spawn_worker(cmd, env=env)
 
@@ -88,7 +94,7 @@ def build_app_from_args(args):
         args.hf_tokenizer, default_template_kwargs=default_template_kwargs
     )
 
-    worker = _spawn(args)  # one worker == one session (single-slot V1)
+    worker = _spawn(args)  # one worker, weights once, many isolated sessions
     runtime = SessionRuntime(worker)
     serving = ServingChat(
         runtime,
@@ -144,7 +150,17 @@ def main() -> None:
         "--num-runners",
         type=int,
         default=1,
-        help="V1 supports 1 only (single-slot).",
+        help="Workers (processes). 1 only: a worker hosts many isolated sessions "
+        "on one weight load; more workers would duplicate the ~18GB weights.",
+    )
+    p.add_argument(
+        "--max-sessions",
+        type=int,
+        default=1,
+        help="Isolated sessions the one worker hosts on a single weight load "
+        "(CUDA per-session mutable rebinding); clamped to 1 if the backend "
+        "cannot rebind. One slot is reserved for anonymous requests, so the "
+        "number of addressable session_ids is max-sessions - 1.",
     )
     p.add_argument(
         "--worker-bin",
@@ -157,8 +173,8 @@ def main() -> None:
 
     if args.num_runners != 1:
         p.error(
-            "Qwen3.5 MoE V1 is single-slot: one worker serves one session; "
-            "concurrent requests queue."
+            "Only 1 worker process is supported (it hosts many isolated sessions "
+            "on one ~18GB weight load); more workers would duplicate the weights."
         )
 
     app, _ = build_app_from_args(args)

diff --git a/examples/models/qwen3_5_moe/test_serve.py b/examples/models/qwen3_5_moe/test_serve.py
@@ -8,7 +8,7 @@
 
 Hermetic: no model, GPU, or worker subprocess. Covers layering (Qwen stays an
 example; the control plane runs no CUDA and imports no model pybind), the worker
-spawn command, and the single-slot CLI guard. The generic JSONL protocol is
+spawn command, and the single-worker CLI guard. The generic JSONL protocol is
 covered by extension/llm/server/python/tests/test_worker_client.py; the live
 HTTP smoke test is documented in README.md and run on a CUDA box.
 """
@@ -75,6 +75,7 @@ def fake_spawn(cmd, env=None):
             model_path="m.pte",
             tokenizer_path="t.json",
             data_path="d.ptd",
+            max_sessions=4,
         )
     )
     assert captured["cmd"] == [
@@ -85,6 +86,8 @@ def fake_spawn(cmd, env=None):
         "t.json",
         "--data_path",
         "d.ptd",
+        "--max_sessions",
+        "4",
     ]
 
 
@@ -95,7 +98,11 @@ def test_spawn_defaults_worker_bin_and_omits_empty_data_path(monkeypatch):
     )
     serve._spawn(
         SimpleNamespace(
-            worker_bin=None, model_path="m.pte", tokenizer_path="t.json", data_path=None
+            worker_bin=None,
+            model_path="m.pte",
+            tokenizer_path="t.json",
+            data_path=None,
+            max_sessions=4,
         )
     )
     cmd = captured["cmd"]

@@ -12,25 +12,24 @@
 // the stable serving abstraction) — no Python model code, no pybind, no
 // in-process Python serving. The OpenAI control plane (Python) spawns this
 // process and drives it over JSONL on stdin/stdout (see worker_client.py). The
-// JSONL protocol and the decode loop are shared across all workers in
-// worker_loop.h; this file only constructs the engine/session/tokenizer.
+// JSONL protocol, session management, and the decode loop are shared across all
+// workers in worker_loop.h; this file only constructs the engine/tokenizer.
+// TextLLMEngine hosts a single session, so the worker serves anonymous requests
+// via the shared loop's scratch session and reports no named sessions.
 
 #include <gflags/gflags.h>
 
 #include <executorch/extension/llm/runner/llm_runner_helper.h>
-#include <executorch/extension/llm/runner/llm_session.h>
 #include <executorch/extension/llm/server/cpp/worker_loop.h>
 #include <executorch/runtime/platform/log.h>
 
 #include <optional>
-#include <utility>
 
 DEFINE_string(model_path, "", "Self-contained model .pte file path.");
 DEFINE_string(tokenizer_path, "", "HuggingFace tokenizer.json path.");
 
 namespace {
 namespace llm = ::executorch::extension::llm;
-using ::executorch::runtime::Error;
 } // namespace
 
 int main(int argc, char** argv) {
@@ -50,12 +49,6 @@ int main(int argc, char** argv) {
     ET_LOG(Error, "text_llm_worker: failed to create engine");
     return 1;
   }
-  auto session_result = engine->create_session();
-  if (session_result.error() != Error::Ok) {
-    ET_LOG(Error, "text_llm_worker: failed to create session");
-    return 1;
-  }
-  auto session = std::move(session_result.get());
 
   // The session decodes token ids to text internally; this tokenizer encodes
   // the rendered prompt to ids. Same tokenizer.json -> same vocabulary.
@@ -65,5 +58,5 @@ int main(int argc, char** argv) {
     return 1;
   }
 
-  return llm::run_worker_stdio_loop(*session, *tokenizer, engine->metadata());
+  return llm::run_worker_stdio_loop(*engine, *tokenizer, engine->metadata());
 }