diff --git a/backends/cuda/runtime/cuda_mutable_state.cpp b/backends/cuda/runtime/cuda_mutable_state.cpp
index ff60da9b3d3..32ee1496720 100644
--- a/backends/cuda/runtime/cuda_mutable_state.cpp
+++ b/backends/cuda/runtime/cuda_mutable_state.cpp
@@ -398,7 +398,7 @@ void mutable_state_set_active(MutableStateContext ctx, int token) {
 void mutable_state_note_handle(CudaDelegateHandle* handle) {
   MutableStateContext ctx = tl_loading_ctx;
   if (ctx == kInvalidMutableContext) {
-    return; // not loading within a managed context (e.g. non-V2 path)
+    return; // not loading within a managed context (single-session path)
   }
   auto& m = mgr();
   std::lock_guard<std::mutex> g(m.mu);
diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md
index 899c816e859..399cb83a712 100644
--- a/examples/models/qwen3_5_moe/README.md
+++ b/examples/models/qwen3_5_moe/README.md
@@ -161,9 +161,12 @@ LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH \
     --data-path   qwen35_moe_exports/aoti_cuda_blob.ptd \
     --tokenizer-path ~/models/Qwen3.5-35B-A3B/tokenizer.json \
     --hf-tokenizer   ~/models/Qwen3.5-35B-A3B \
-    --model-id qwen3.5-moe --no-think
+    --model-id qwen3.5-moe --no-think --max-sessions 4
 ```
 
+`--max-sessions >= 2` is required for named sessions and warm resume; the default
+`1` is scratch-only (one slot is reserved for anonymous requests).
+
 ### Architecture (process isolation)
 
 Two processes, one model load:
@@ -202,16 +205,16 @@ is safe under asyncio.
 ### Sessions
 
 One worker loads the weights once (~18 GB) and hosts multiple **isolated**
-sessions on that single allocation — each with its own KV/recurrent state, via
-CUDA per-session mutable rebinding. Set `--max-sessions N` (clamped to 1 if the
-backend cannot rebind); one slot is reserved for anonymous requests, so up to
-`N - 1` named `session_id`s are addressable.
+sessions on that single allocation, each with its own KV/recurrent state. Set
+`--max-sessions N` (clamped to 1 if the backend hosts a single session); one slot
+is reserved for anonymous requests, so up to `N - 1` named `session_id`s are
+addressable.
 
 Route a request to a persistent session with the `session_id` body field or, as
 aliases, the `X-ExecuTorch-Session-ID` / `session_id` / `x-session-affinity`
 headers (body wins, then that header order). The header aliases let a client that
-already emits a stable per-conversation affinity id (e.g. pi's
-`sendSessionAffinityHeaders`) route with no extra config. Requests without any
+emits a stable per-conversation affinity id route per conversation (for pi, set
+`compat.sendSessionAffinityHeaders: true` in models.json). Requests without an id
 share a transient scratch session.
 
 ```bash
@@ -243,15 +246,11 @@ Each `done` event reports
 (`new`/`exact_prefix`/`dirty`/`mismatch`/`equal`) for measuring the hit rate.
 `--no-warm-resume` forces a full prefill every request (for A/B comparison).
 
-**Tool-call turns (token-ID continuation):** an assistant turn re-rendered from
-its parsed tool call rarely re-tokenizes to the tokens the model actually
-generated, so plain warm resume misses on agent loops. The server stores the
-exact generated token ids per session and, on the next turn, sends the prompt as
-segments (`{"text"}` / `{"ids"}`) that splice those ids back in for prior
-assistant turns instead of re-rendering them — so the resident state stays an
-exact token prefix and resume hits. Tool *results* remain text (re-tokenized
-deterministically). The worker's exact-token check still backstops everything, so
-a mismatch just falls back to a full prefill.
+**Tool-call turns** also warm-resume: an assistant turn re-rendered from its
+parsed tool call rarely re-tokenizes to the tokens the model generated, so the
+server replays the exact generated token ids for prior turns to keep the resident
+state an exact prefix (tool *results* stay text). A mismatch still falls back to a
+full prefill.
 
 This is **isolation + warm resume, not concurrency**: execution is still
 synchronous (one in-flight request; `--num-runners > 1` is rejected since more
diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp
index b14ab38e656..805afbd7038 100644
--- a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp
+++ b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp
@@ -95,8 +95,8 @@ Result<std::unique_ptr<Module>> build_qwen_module(
 
 #ifdef EXECUTORCH_BUILD_CUDA
   // Backend options are read during backend init(), so they must be set before
-  // load_method. (CUDA graph is intentionally not enabled: V2 rebinds each
-  // session's mutable buffers before execute, which a captured graph's baked
+  // load_method. (CUDA graph is intentionally not enabled: each session
+  // rebinds its mutable buffers before execute, which a captured graph's baked
   // pointers would ignore.)
   {
     // Cross-method per-FQN weight sharing: prefill and decode reuse one weight
@@ -124,7 +124,7 @@ Error register_mutable_fqns(Module* module, int mutable_ctx) {
     ET_LOG(
         Error,
         "Qwen35MoEEngine: model has no get_mutable_buffer_metadata; re-export "
-        "for V2 multi-session");
+        "for multi-session");
     return res.error();
   }
   const auto& outs = res.get();
@@ -368,7 +368,7 @@ class Qwen35MoESession : public LLMSession {
   Error seek(int64_t pos) override {
     // The hybrid model carries recurrent/conv state that cannot be safely
     // rewound by logical position the way contiguous KV can. Fail closed so the
-    // prefix cache falls back to reset + full prefill (V1).
+    // prefix cache falls back to reset + full prefill.
     (void)pos;
     return Error::NotSupported;
   }
diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.h b/examples/models/qwen3_5_moe/qwen35_moe_engine.h
index 26e2fbb55b4..b930ed1f5d5 100644
--- a/examples/models/qwen3_5_moe/qwen35_moe_engine.h
+++ b/examples/models/qwen3_5_moe/qwen35_moe_engine.h
@@ -17,7 +17,7 @@
 // isolated points are where an MLX runtime would slot in. MLX is NOT
 // implemented or validated here.
 //
-// V2 (CUDA): the ENGINE is multi-session — one shared Module (weights loaded
+// CUDA: the ENGINE is multi-session — one shared Module (weights loaded
 // once); create_session() hands out multiple logical sessions, each rebinding
 // its own GPU buffers for the model's mutable state (KV/conv/recurrent) before
 // execute, serialized by the engine lock. serving_capacity() reports how many
@@ -26,9 +26,9 @@
 // backends/cuda/runtime/cuda_mutable_state).
 //
 // The SERVING path (qwen3_5_moe_worker + control plane) exposes this over the
-// worker protocol: the worker routes requests to per-session_id state (V2a) and
+// worker protocol: the worker routes requests to per-session_id state and
 // reuses each session's resident context across requests (warm append-only
-// resume, V2b.1). Execution stays serialized (one in-flight request).
+// resume). Execution stays serialized (one in-flight request).
 
 #pragma once
 
@@ -53,7 +53,7 @@ struct Qwen35MoEConfig {
   std::string model_path; // .pte
   std::string data_path; // .ptd (CUDA delegate blob); empty if none
   std::string tokenizer_path; // HuggingFace tokenizer.json
-  // V2 multi-session: max physical sessions to advertise when the backend can
+  // Multi-session: max physical sessions to advertise when the backend can
   // host them without weight duplication (CUDA per-session mutable rebinding).
   // Clamped to 1 if the backend cannot rebind.
   int32_t max_sessions = 1;
@@ -74,7 +74,7 @@ class ET_EXPERIMENTAL Qwen35MoEEngine : public LLMEngine {
   ::executorch::runtime::Result<std::unique_ptr<LLMSession>> create_session()
       override;
 
-  // CUDA V2: one shared Module (one weight allocation); each session rebinds
+  // CUDA: one shared Module (one weight allocation); each session rebinds
   // its own GPU buffers for the model's mutable state. Reports
   // config.max_sessions when the backend supports per-session rebinding, else
   // fails closed to 1.
diff --git a/examples/models/qwen3_5_moe/qwen35_moe_worker.cpp b/examples/models/qwen3_5_moe/qwen35_moe_worker.cpp
index ac2e3536a14..3b4c395677b 100644
--- a/examples/models/qwen3_5_moe/qwen35_moe_worker.cpp
+++ b/examples/models/qwen3_5_moe/qwen35_moe_worker.cpp
@@ -14,9 +14,8 @@
 // protocol and decode loop every worker uses (worker_loop.h); this file only
 // constructs the engine/session.
 //
-// Isolation rationale: executing the AOTI CUDA model inside a live asyncio HTTP
-// process segfaults in the int4 matmul (validated). Here the model runs in a
-// plain synchronous loop in its own process, which is reliable.
+// Model execution is isolated in this C++ worker for CUDA/AOTI reliability (see
+// the example README for the full rationale).
 //
 // Multi-session: the engine loads weights once and hosts multiple isolated
 // sessions on that one ~18GB allocation; the shared worker loop (worker_loop.h)
diff --git a/examples/models/qwen3_5_moe/serve.py b/examples/models/qwen3_5_moe/serve.py
index 9075ef8fe17..424b43cb7ca 100644
--- a/examples/models/qwen3_5_moe/serve.py
+++ b/examples/models/qwen3_5_moe/serve.py
@@ -12,10 +12,8 @@
 process (qwen3_5_moe_worker) that this process drives over JSONL via the generic
 WorkerClient — the same protocol the generic text_llm_worker speaks.
 
-Why two processes: executing the AOTI CUDA model inside a live asyncio server
-process segfaults in the int4 matmul (validated by elimination — the trigger is
-CUDA execution while a live asyncio loop is resident). Isolating CUDA in a plain
-(no-asyncio) C++ worker process is the reliable shape, and it loads weights once.
+Model execution is isolated in the C++ worker for CUDA/AOTI reliability; the
+worker loads weights once. (See the example README for the full rationale.)
 
 Sessions and constraints:
   * One worker hosts many isolated sessions on a single ~18GB weight load (CUDA
@@ -121,7 +119,7 @@ def _stop_worker():
 
 def main() -> None:
     p = argparse.ArgumentParser(
-        description="OpenAI-compatible LLM server for Qwen3.5 MoE (process-isolated, V1)"
+        description="OpenAI-compatible LLM server for Qwen3.5 MoE (process-isolated)"
     )
     p.add_argument("--model-path", required=True, help="Path to the .pte model")
     p.add_argument(
diff --git a/examples/models/qwen3_5_moe/test_qwen35_moe_nobleed.cpp b/examples/models/qwen3_5_moe/test_qwen35_moe_nobleed.cpp
index 6af51adf22c..5ec4895362d 100644
--- a/examples/models/qwen3_5_moe/test_qwen35_moe_nobleed.cpp
+++ b/examples/models/qwen3_5_moe/test_qwen35_moe_nobleed.cpp
@@ -6,11 +6,11 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// GPU no-bleed integration proof for the CUDA V2 per-session mutable-state
+// GPU no-bleed integration proof for the CUDA per-session mutable-state
 // rebind -- the REAL guard for mutable-buffer completeness (an under-declared
 // buffer would be shared across sessions; only behavior catches that, not the
 // declared-subset-of-discovered bookkeeping check). This is the automated form
-// of the manual "A solo / A inter" proof in the V2 foundation commit.
+// of the manual "A solo / A inter" multi-session isolation proof.
 //
 // CRITICAL: sessions are interleaved at EXECUTE granularity (A prefill, B
 // prefill, A decode, B decode, ...). The mechanism under test is the
diff --git a/extension/llm/server/README.md b/extension/llm/server/README.md
index 06b873b525b..3c2d949c8a8 100644
--- a/extension/llm/server/README.md
+++ b/extension/llm/server/README.md
@@ -11,6 +11,11 @@ extension/llm/server/
   # cpp/         # future: no-Python single-binary server
 ```
 
+**Which entry point:** use `extension.llm.server.python.server` for generic
+TextLLM `.pte` models; use `examples.models.qwen3_5_moe.serve` for Qwen3.5-MoE
+CUDA (it needs the `.ptd` delegate blob, Qwen XML tool parsing, and the Qwen
+engine/session worker).
+
 Why this layout: the OpenAI contract is identical across languages, so the
 **spec** and **conformance** suite are shared, and each language gets its own
 implementation directory. The real cross-language reuse comes from the C++
@@ -26,8 +31,8 @@ Hugging Face chat templates (`--hf-tokenizer`), `temperature` / `max_tokens` /
 (`<tool_call>...</tool_call>` JSON, complete calls only; model-specific launchers
 may select the Qwen XML format) with `tool_choice="none"`,
 structured API errors, and best-effort cancellation. One worker process with
-serialized execution; it hosts many isolated sessions on one weight load (warm
-append-only resume across turns). KV/prefix state lives inside the
+serialized execution; a worker can host isolated sessions on one weight load when its engine reports
+capacity > 1 (with warm append-only resume across turns). KV/prefix state lives inside the
 worker/session, not the control plane. Unsupported params (including `top_p`,
 `seed`, `n>1`, `reasoning_effort`, penalties, `logit_bias`, `response_format`,
 `logprobs`, and `tool_choice="required"`) are rejected with a structured 400
@@ -63,7 +68,8 @@ Point pi at the server via `~/.pi/agent/models.json`:
 ```json
 { "providers": { "executorch": {
     "baseUrl": "http://127.0.0.1:8000/v1", "api": "openai-completions",
-    "apiKey": "x", "models": [ { "id": "<model-id>" } ] } } }
+    "apiKey": "x", "models": [ { "id": "<model-id>",
+      "compat": { "sendSessionAffinityHeaders": true } } ] } } }
 ```
 
 Other OpenAI-compatible clients use their own schema — generically: base URL
diff --git a/extension/llm/server/cpp/worker_loop.h b/extension/llm/server/cpp/worker_loop.h
index f580d21d356..3cf4541a4e2 100644
--- a/extension/llm/server/cpp/worker_loop.h
+++ b/extension/llm/server/cpp/worker_loop.h
@@ -8,67 +8,47 @@
 
 #pragma once
 
-// Shared model-worker generation loop + JSONL protocol, used by every model
-// worker (the generic text_llm_worker and model-specific workers like
-// qwen3_5_moe_worker). A worker only constructs its engine/tokenizer and calls
-// run_worker_stdio_loop(); the protocol, session management, and the decode
-// loop live here once, so protocol changes land in a single place.
+// Shared model-worker generation loop + JSONL protocol for every model worker
+// (the generic text_llm_worker and model-specific workers like
+// qwen3_5_moe_worker): a worker constructs its engine + tokenizer and calls
+// run_worker_stdio_loop(); the protocol, session routing, and decode loop live
+// here once.
 //
-// V2a (isolation): the worker owns one LLMEngine (weights loaded once) and
-// hands out multiple isolated LLMSessions keyed by session_id, each with its
-// own KV/recurrent state, up to the engine's serving capacity. Execution is
-// synchronous -- one in-flight request at a time, the control plane serializes.
+// The worker owns one LLMEngine (weights loaded once) and serves multiple
+// isolated LLMSessions keyed by session_id, up to the engine's serving
+// capacity; anonymous requests (no session_id) share one scratch session that
+// is reset every request. Execution is synchronous: one in-flight request at a
+// time.
 //
-// V2b.1 (warm append-only resume): a named session keeps its decoded context
-// across requests. On the next request the worker compares the new prompt's
-// token ids against the session's resident token ids; if the resident ids are
-// an exact prefix, it prefills ONLY the suffix (continuing the KV/recurrent
-// state at pos>0) instead of resetting and re-prefilling the whole prompt. The
-// check is exact-token (never string/retokenized text) and falls back to a full
-// reset+prefill whenever exact reuse can't be proven, so it is always correct;
-// the win is when the prompt is a genuine token extension of the prior turn.
+// Warm resume: a named session keeps its decoded context across requests. The
+// new prompt's token ids are matched against the session's resident token ids;
+// on an exact prefix only the suffix is prefilled (continuing at pos>0). The
+// match is exact-token (never retokenized text) and falls back to a full
+// reset+prefill whenever exact reuse can't be proven, so it is always correct.
 // See plan_prefill().
 //
-// Sessions:
-//   - Named: an explicit session_id -> session + resident token ids, created on
-//     first use (or via an `open` op), capped at max_named_sessions = capacity
-//     - 1 (the scratch slot is reserved). 0 when the backend hosts one session.
-//     Warm resume applies to named sessions (unless disabled).
-//   - Scratch: one session for anonymous requests (no session_id), reset every
-//     request -- distinct anonymous callers must never reuse each other's
-//     state.
-//
-// Protocol (one JSON object per line; matches worker_client.py):
+// Protocol (one JSON object per line; matches worker_client.py). stdout carries
+// ONLY protocol JSON; logs go to stderr (ET_LOG):
 //   worker -> stdout, once:    {"ready": true, "max_sessions": int,
 //                               "max_named_sessions": int}
 //   client -> stdin:
-//     generate:   {"max_new_tokens": int, "temperature": float,
-//                  "stop": [str, ...], "session_id"?: str,
-//                  and exactly one prompt form:
-//                    "prompt": str
-//                    "prompt_segments": [{"text": str} | {"ids": [int, ...]}]}
-//     open:       {"op": "open",  "session_id": str}
-//     close:      {"op": "close", "session_id": str}
-//     reset:      {"op": "reset", "session_id": str}  // clear context, keep
-//     slot
+//     generate: {"max_new_tokens": int, "temperature": float,
+//                "stop": [str,...],
+//                "session_id"?: str, and exactly one prompt form:
+//                  "prompt": str
+//                  "prompt_segments": [{"text": str} | {"ids": [int,...]}]}
+//     open/close/reset: {"op": "open"|"close"|"reset", "session_id": str}
 //   worker -> stdout:
-//     generate:   {"token": str} *   (streamed)
-//                 {"done": true, "prompt_tokens": int, "completion_tokens":
-//                 int,
-//                  "finish_reason": "stop"|"length",
-//                  "reused_prompt_tokens": int, "prefilled_prompt_tokens": int,
-//                  "session_reset_reason": "new"|"exact_prefix"|"dirty"|
-//                                          "mismatch"|"equal",
-//                  "generated_token_ids"?: [int, ...]}  // omitted if
-//                  stop-trimmed
-//     open:       {"opened": true, "session_id": str}
-//     close:      {"closed": true, "session_id": str}
-//     reset:      {"reset": true,  "session_id": str}
-//     error:      {"error": str, "code"?: str}  // code: "capacity_exhausted",
-//                                               // "unsupported_session"
-//
-// stdout carries ONLY protocol JSON; all logs go to stderr (ET_LOG). One
-// request at a time (the control plane serializes).
+//     generate: {"token": str} *  (streamed), then
+//               {"done": true, "prompt_tokens": int, "completion_tokens": int,
+//                "finish_reason": "stop"|"length",
+//                "reused_prompt_tokens": int, "prefilled_prompt_tokens": int,
+//                "session_reset_reason": "new"|"exact_prefix"|"mismatch"|
+//                                        "dirty"|"equal",
+//                "generated_token_ids"?: [int,...]}  // omitted if stop-trimmed
+//     open/close/reset: {"opened"|"closed"|"reset": true, "session_id": str}
+//     error:    {"error": str, "code"?: str}  // capacity_exhausted |
+//                                              // unsupported_session
 
 #include <nlohmann/json.hpp>
 
@@ -242,7 +222,12 @@ inline void worker_handle_request(
     const auto& d = step_result.get();
     if (d.is_terminal) {
       finish = "stop";
-      break; // terminal step (EOS / cooperative stop): not emitted or counted
+      // Terminal step (EOS / cooperative stop): the terminal token is neither
+      // emitted as text nor counted in num_generated -> completion_tokens. This
+      // is intentional -- completion_tokens reflects the visible completion the
+      // client received, not internal forward steps; an EOS the user never sees
+      // is not part of that count.
+      break;
     }
     // The token was forwarded into the cache (pos advanced); track it so the
     // resident-ids/position invariant holds. EOS/terminal tokens are not
@@ -264,9 +249,15 @@ inline void worker_handle_request(
     if (stop_hit) {
       finish = "stop"; // reached a stop string: drop it and everything after
       stop_string = true;
-      // The emitted text was trimmed at the stop string, so the next turn's
-      // rendered prompt won't be an exact token extension of resident: force a
-      // reset rather than risk a false prefix match.
+      // Trimming at the stop means the next turn's prompt won't be an exact
+      // token extension of resident, so force a reset (no false prefix match).
+      //
+      // CONTRACT: every *string* stop is non-resumable this way (trim + dirty +
+      // omit generated_token_ids) -- right for user/request and content-cleanup
+      // stops, which change visible text. A clean turn terminator stays
+      // warm-resumable only if the engine surfaces it as a terminal/EOS token
+      // id (handled above via d.is_terminal; e.g. Qwen adds <|im_end|> to
+      // eos_ids).
       st.dirty = true;
       break;
     }
@@ -400,8 +391,8 @@ class WorkerSessions {
 // Emit {"ready": true, ...}, then read JSONL requests from stdin and dispatch
 // each (generate / open / close / reset), reporting exceptions as
 // {"error": ...} and continuing to serve. Returns 0 when stdin closes.
-// enable_warm_resume gates V2b.1 warm suffix reuse for named sessions (off ->
-// every request resets, the V2a behavior; useful for A/B measurement).
+// enable_warm_resume gates warm suffix reuse for named sessions (off -> every
+// request resets and re-prefills; useful for A/B measurement).
 inline int run_worker_stdio_loop(
     LLMEngine& engine,
     ::tokenizers::Tokenizer& tokenizer,
diff --git a/extension/llm/server/python/README.md b/extension/llm/server/python/README.md
index f0c1003d009..0871d588124 100644
--- a/extension/llm/server/python/README.md
+++ b/extension/llm/server/python/README.md
@@ -72,6 +72,28 @@ Key flags:
 | `--num-runners N` | Worker processes — **1 only** (one worker hosts many isolated sessions on one weight load; more would duplicate weights) |
 | `--worker-bin PATH` | path to the `text_llm_worker` binary (default: `cmake-out/extension/llm/server/cpp/text_llm_worker`) |
 
+## Smoke test
+
+```bash
+curl http://127.0.0.1:8000/health
+curl http://127.0.0.1:8000/v1/models
+curl http://127.0.0.1:8000/v1/chat/completions \
+  -H 'Content-Type: application/json' \
+  -d '{"model":"<model-id>","messages":[{"role":"user","content":"hello"}]}'
+```
+
+## Sessions
+
+When the worker reports named-session capacity (a worker whose engine supports it,
+launched with `--max-sessions N >= 2`; the generic `text_llm_worker` reports
+none), a request can target a persistent per-conversation session:
+
+- body `session_id`, or headers `X-ExecuTorch-Session-ID` / `session_id` /
+  `x-session-affinity` (body wins) — a stable id reuses that session's KV across
+  turns (warm resume).
+- `POST /v1/sessions/{id}/reset` — clear its context, keep the slot.
+- `DELETE /v1/sessions/{id}` — free its context and slot.
+
 ## Use from an agent harness
 
 - **opencode** (`opencode.json`):
@@ -85,8 +107,12 @@ Key flags:
   ```json
   { "providers": { "executorch": {
       "baseUrl": "http://127.0.0.1:8000/v1", "api": "openai-completions",
-      "apiKey": "x", "models": [ { "id": "qwen2.5-coder" } ] } } }
+      "apiKey": "x", "models": [ { "id": "qwen2.5-coder",
+        "compat": { "sendSessionAffinityHeaders": true } } ] } } }
   ```
+  `compat.sendSessionAffinityHeaders` makes pi route each conversation to its own
+  session (per-conversation isolation + warm resume); without it every request
+  uses the anonymous scratch session.
 
 ## Validate
 
@@ -115,15 +141,11 @@ plane (C++): a worker process (`text_llm_worker`) that owns all model state
 all token stepping and KV mutation; it speaks one JSON object per line on
 stdin/stdout.
 
-JSONL protocol (stdout carries protocol JSON only; logs go to stderr):
-
-```
-worker -> stdout, once at startup:  {"ready": true}
-client -> stdin,  per request:      {"prompt", "max_new_tokens", "temperature"}
-worker -> stdout, per request:      {"token": str} *        (streamed)
-                                    {"done": true, "prompt_tokens", "completion_tokens"}
-                                or  {"error": str}
-```
+The JSONL protocol — `generate` / `open` / `close` / `reset` ops, the `prompt` /
+`prompt_segments` prompt forms, warm-resume stats, and `generated_token_ids` — is
+defined in `cpp/worker_loop.h` (the worker side, the canonical reference) and
+driven by `worker_client.py` (the Python transport); stdout carries protocol JSON
+only, logs go to stderr.
 
 Process isolation is the reliable shape for CUDA/AOTI models: executing the model
 inside a live asyncio server process can segfault (validated with Qwen3.5-MoE);
@@ -161,7 +183,7 @@ Session capacity is determined by the worker/engine — a single worker hosts ma
 isolated sessions on one weight load — so `--num-runners` accepts 1; extra worker
 processes would each carry their own copy of the weights.
 
-The **generic `text_llm_worker` is scratch-only (V1)**: `TextLLMEngine::serving_capacity()`
+The **generic `text_llm_worker` is scratch-only**: `TextLLMEngine::serving_capacity()`
 is a conservative 1, so `max_named = max(0, capacity-1) = 0` — the default
 `server.py` serves only the anonymous scratch session (no named `session_id`s, no
 warm resume). The named-session / warm-resume / token-ID machinery is exercised
@@ -169,10 +191,29 @@ by a model-specific worker whose engine reports capacity > 1 (the Qwen3.5-MoE CU
 worker). This is intentional; the generic worker stays minimal until a backend is
 proven to host multiple physical sessions without duplicating weights.
 
-Cancellation is best-effort: a worker request runs to completion and is not
-interruptible mid-generation in V1, so `runner.stop()` means "the control plane
-stops consuming and the worker finishes the current request" rather than a hard
-cancel. There is **no prefix cache in V1 serving**; if KV prefix reuse returns it
-will live inside the worker/session, not in the Python control plane. Multiple
-workers, weight sharing across sessions on a backend that supports it, adaptive
-thinking, and multi-session subagents are future work.
+**Cancellation is best-effort, and it head-of-line blocks.** `WorkerClient.stop()`
+is a no-op, and `SessionRuntime.generate_stream()` holds the single worker lock until
+the worker naturally finishes. On client disconnect/cancellation the server calls
+`stop()` then awaits the in-flight worker request, so the abandoned generation runs to
+completion **and blocks every other session on that worker until it does** — a long or
+runaway generation stalls all concurrent requests (including a subagent fan-out).
+A disconnected client does **not** interrupt the C++ worker mid-generation. Real
+interruption needs a future protocol change — e.g. a control pipe, non-blocking stdin
+polling between decode steps, or request ids plus an out-of-band cancel op.
+
+**Warm resume needs true turn terminators surfaced as EOS/terminal token ids, not just
+string stops.** The worker treats every *string* stop the same — it trims the output,
+marks the session dirty, and omits `generated_token_ids` — which is correct for
+user/request stops and broad content-cleanup stops (they change visible text, so the
+turn is non-resumable). A clean model turn terminator is only resumable if the engine
+surfaces it as a terminal/EOS **token id** (the Qwen engine adds `<|im_end|>` to
+`eos_ids`, so it ends the turn before string-stop matching and stays resumable). A
+backend whose terminator is only a string stop would mark every turn dirty and never
+warm-resume; distinguishing resumable terminators from trim-stops in the protocol is
+future work.
+
+There is **no global (cross-session) prefix cache**; per-session append-only warm
+resume is worker-side (for engines that support it), and all KV/resident state
+lives inside the worker/session, never the Python control plane. Multiple workers,
+weight sharing across sessions in the generic worker (on a backend that supports
+it), adaptive thinking, and multi-session subagents are future work.
diff --git a/extension/llm/server/python/openai_transcript.py b/extension/llm/server/python/openai_transcript.py
index 2eaff5fd7f0..d6e614822ec 100644
--- a/extension/llm/server/python/openai_transcript.py
+++ b/extension/llm/server/python/openai_transcript.py
@@ -4,24 +4,22 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""OpenAI/chat-template transcript state for token-ID warm resume (V2b.1.5).
-
-This is the OpenAI-adapter-specific glue that makes warm resume work across the
-chat template's lossy re-render of prior assistant turns (especially tool calls,
-which re-render from parsed structure and don't re-tokenize to what the model
-generated). It is NOT generic runtime infrastructure: it knows ChatMessages,
-tool_calls, the ChatTemplate, sentinels, and assistant fingerprints. The runtime
-(session_runtime) only sees PromptInput.
-
-Per session we keep one record per assistant turn we produced, in order:
-{"fp": fingerprint of the response we returned, "ids": exact generated token ids
-| None}. On the next request each prior assistant turn is replaced with a unique
-sentinel, the conversation is rendered once, and the rendered text is split on
-the sentinels with the stored ids spliced back in -- but only for turns whose
-fingerprint matches the incoming message (so an edited/branched history, or a
-session reused for another conversation, is never substituted with stale ids)
-and whose ids are present (a stop-trimmed turn has None and is left as text).
-Everything is backstopped by the worker's exact-token prefix check.
+"""OpenAI/chat-template transcript state for token-ID warm resume.
+
+Adapter-specific glue, not runtime infrastructure: it knows ChatMessages,
+tool_calls, the ChatTemplate, sentinels, and assistant fingerprints (the runtime
+only sees a PromptInput). It makes warm resume survive the chat template's lossy
+re-render of prior assistant turns -- especially tool calls, which re-render from
+parsed structure and don't re-tokenize to what the model generated.
+
+Per session it stores, for each assistant turn it produced, the exact generated
+token ids and a fingerprint of the response. On the next request each prior
+assistant turn is replaced with a sentinel, the conversation is rendered once,
+and the rendered text is split on the sentinels with the stored ids spliced back
+in -- only for turns whose fingerprint matches the incoming message (an edited,
+branched, or reused history is never substituted with stale ids) and whose ids
+are present (a stop-trimmed turn is left as text). The worker's exact-token
+prefix check is the final backstop.
 """
 
 import hashlib
@@ -88,14 +86,11 @@ def _assistant_fingerprint(content, tool_calls) -> str:
 
     @staticmethod
     def _normalize_scaffold(text_chunk: str, preamble: str) -> Optional[str]:
-        """Force the scaffold region -- the text between the last assistant header
-        in `text_chunk` and its end -- to equal `preamble`, so the worker
-        re-tokenizes the exact generation scaffold it made resident for this turn.
-        The region (the content was replaced by a sentinel) is empty when history
-        stripped the scaffold (insert) or a think scaffold when history preserved
-        it (replace, possibly with a different form than `preamble`). Returns the
-        adjusted text, or None if the region is not a recognized scaffold
-        (ambiguous -> caller falls back to plain text)."""
+        """Force the scaffold region (between the last assistant header in
+        `text_chunk` and its end) to equal `preamble`, so the worker re-tokenizes
+        the exact resident scaffold. The region is empty (history stripped it ->
+        insert) or a think scaffold (history preserved it -> replace). Returns the
+        adjusted text, or None if it isn't a recognized scaffold (-> text fallback)."""
         # No scaffold for this turn's mode/template: nothing to reproduce, so
         # leave the chunk untouched -- and don't require the Qwen/ChatML header,
         # so token-id splicing still works for templates with a different
@@ -159,14 +154,11 @@ def build_prompt_input(
         stored = self._turns.get(session_id or "")
         if not stored:
             return PromptInput(text=rendered_prompt)
-        # ORDINAL ASSUMPTION: stored[k] is the k-th assistant turn WE generated
-        # for this session, matched positionally against the k-th assistant
-        # message in the request. A client-injected assistant turn we did not
-        # generate -- a few-shot exemplar, a pre-seeded turn, or any reused
-        # session -- shifts that alignment, so the fingerprint at k mismatches and
-        # we stop splicing from there. This is always SAFE (text fallback +
-        # worker exact-prefix backstop); it only lowers the warm-resume hit rate,
-        # silently, for such conversations.
+        # Positional: stored[k] is the k-th assistant turn WE generated, matched
+        # against the k-th assistant message in the request. A client-injected
+        # turn (few-shot exemplar, pre-seeded turn, reused session) shifts that
+        # alignment -> fingerprint mismatch at k -> stop splicing. Always safe
+        # (text fallback + worker prefix backstop); just a lower hit rate.
         positions = [i for i, m in enumerate(messages) if m.role == "assistant"]
         splice: dict[int, dict] = {}  # message index -> {"ids", "preamble"}
         diverged_at = None
@@ -184,13 +176,9 @@ def build_prompt_input(
                 }
         if diverged_at is not None:
             # Drop the stale tail from the first mismatch so an edited/branched
-            # earlier turn can't keep shadowing future requests; the matched
-            # prefix [:diverged_at] is untouched and still splices. We have no
-            # exact ids for the edited turn itself (the client authored it, we
-            # didn't generate it), so warm resume for that turn and the ones after
-            # it stays text until the session is reset/closed. Safe regardless:
-            # stale ids are never spliced and the worker's exact-token prefix
-            # check backstops correctness.
+            # earlier turn can't shadow future requests; the matched prefix still
+            # splices, the rest stays text until reset/close. Safe either way:
+            # stale ids are never spliced and the worker's prefix check backstops.
             del stored[diverged_at:]
         if not splice:
             return PromptInput(text=rendered_prompt)
@@ -230,16 +218,13 @@ def record_assistant_turn(
         prior_turns: int,
         preamble: str = "",
     ) -> None:
-        """Record this turn's {fingerprint, exact generated ids, generation
-        preamble} at position `prior_turns` -- the count of assistant turns in the
-        request this response answers. Stored records at/after that index are
-        dropped first, so a regenerated or branched turn under the same session_id
-        replaces stale records instead of leaving them to shadow future
-        warm-resume hits with a stale fingerprint. ids is None when the worker
-        omitted them (stop-trimmed turn) -- recorded as non-resumable but kept for
-        positional alignment. `preamble` is the generation scaffold resident ahead
-        of these ids (mode-specific, e.g. Qwen3 `<think>` block), reproduced ahead
-        of the spliced ids on the next request so the prefix stays exact."""
+        """Record this turn's {fingerprint, generated ids, generation preamble} at
+        `prior_turns` (the assistant-turn count of the request it answers).
+        Records at/after that index are dropped first, so a regenerated/branched
+        turn replaces stale records rather than shadowing later hits. ids is None
+        when the worker omitted them (stop-trimmed -> non-resumable), kept for
+        positional alignment. `preamble` is the generation scaffold (e.g. the
+        Qwen3 `<think>` block) reproduced ahead of the spliced ids next request."""
         if not session_id:
             return
         turns = self._turns.setdefault(session_id, [])
diff --git a/extension/llm/server/python/serving_chat.py b/extension/llm/server/python/serving_chat.py
index 53d32978251..1b85f8fba3d 100644
--- a/extension/llm/server/python/serving_chat.py
+++ b/extension/llm/server/python/serving_chat.py
@@ -78,11 +78,12 @@ def __init__(
         #    _truncate_raw (pre-parse truncation on the tool path).
         #  * _content_specials: BROAD all-special-tokens set. For PLAIN chat it is
         #    added to the worker/clean stop set (create() -> gen_stops) so a leaked
-        #    special halts the worker and never reaches the client, AND it backs
-        #    _strip_specials for final cleanup of already-parsed visible content.
+        #    special halts the worker (-> dirty, ids omitted, non-resumable) and
+        #    never reaches the client, AND it backs _strip_specials for final
+        #    cleanup of already-parsed visible content.
         self._stops = template.turn_stop_sequences()
         self._content_specials = template.special_tokens()
-        # OpenAI/chat-template token-ID warm-resume state (V2b.1.5). Adapter-side,
+        # OpenAI/chat-template token-ID warm-resume state. Adapter-side,
         # not runtime; kept in lockstep with the worker's session state by
         # clearing both on reset/close.
         self._transcript = OpenAITranscriptState(template)
@@ -206,13 +207,8 @@ def _options(
         return GenerationOptions(
             max_new_tokens=req.resolved_max_tokens(),
             temperature=req.temperature if req.temperature is not None else 0.0,
-            # Worker stop set, decided per path in create(): narrow turn
-            # terminators (+ request stops) for tool turns so a structural/tool
-            # delimiter is never cut before the parser sees it; plus the broad
-            # content specials for plain chat so a leaked special halts the worker
-            # -- which then marks the turn dirty and omits its ids -- instead of
-            # streaming a token the client should not see. The server re-applies
-            # the same set in _clean/_collect_until_stop as a backstop.
+            # Worker stop set, chosen per path in create() (see __init__ for the
+            # two sets); the server re-applies it in _clean/_collect_until_stop.
             stop=stops,
         )
 
@@ -375,11 +371,10 @@ async def create(self, req: ChatCompletionRequest):
         prompt = self._template.render(
             req.messages, tools=template_tools, template_kwargs=req.chat_template_kwargs
         )
-        # Build the prompt input first: token-ID segments (V2b.1.5) splice this
-        # session's prior assistant turns' exact ids so warm resume stays exact
-        # across the chat template's lossy re-render of tool-call turns; plain
-        # rendered text when there's nothing to splice / on any ambiguity (the
-        # worker verifies the exact-token prefix regardless).
+        # Token-ID segments splice prior assistant turns' exact ids so warm resume
+        # survives the template's lossy tool-call re-render; plain text when
+        # there's nothing to splice or on ambiguity (the worker verifies the
+        # exact-token prefix regardless).
         prompt_input = self._transcript.build_prompt_input(
             session_id=req.session_id,
             messages=req.messages,
@@ -402,13 +397,8 @@ async def create(self, req: ChatCompletionRequest):
                 requested = req.resolved_max_tokens()
                 if requested > 0 and count + requested > self._max_context:
                     raise ContextLengthExceeded(count, self._max_context, requested)
-        # Stop-set split by path. Tool turns use only the narrow turn terminators
-        # (+ request stops) so a structural/tool delimiter is never halted before
-        # the parser sees it. Plain chat adds the broad content specials so a
-        # leaked special (one non-streaming would strip) halts the worker -- which
-        # marks the turn dirty and omits its ids -- instead of reaching the client
-        # or being recorded as resumable ids for text never shown. Both paths
-        # reuse this exact set in the control-plane cut (_clean/_collect_until_stop).
+        # Per-path worker stop set (see __init__ for the two sets and why): tool
+        # turns use the narrow set; plain chat adds the broad content specials.
         if self._tools_active(req):
             gen_stops = self._stops + self._request_stops(req)
         else:
@@ -524,7 +514,7 @@ def chunk(delta: DeltaMessage, finish=None) -> str:
         )
         try:
             if use_tools:
-                # v1: buffer the (usually short) tool response, parse once.
+                # Buffer the (usually short) tool response, parse once.
                 # Halt early at a stop boundary, and bound the raw output
                 # BEFORE parsing so post-stop tool calls / text don't leak.
                 raw, stop_hit[0] = await self._collect_until_stop(
diff --git a/extension/llm/server/python/session_runtime.py b/extension/llm/server/python/session_runtime.py
index e73dbafadc4..c0cdd852ad0 100644
--- a/extension/llm/server/python/session_runtime.py
+++ b/extension/llm/server/python/session_runtime.py
@@ -65,12 +65,12 @@ class GenStats:
     completion_tokens: int = 0
     # Worker-reported stop reason ("stop" | "length"), or None if not reported.
     finish_reason: Optional[str] = None
-    # Warm-resume accounting (V2b.1): tokens served from the session's resident
+    # Warm-resume accounting: tokens served from the session's resident
     # state vs prefilled this request, and why.
     reused_prompt_tokens: int = 0
     prefilled_prompt_tokens: int = 0
     session_reset_reason: Optional[str] = None
-    # Exact token ids generated this turn (V2b.1.5), for an adapter's transcript
+    # Exact token ids generated this turn, for an adapter's transcript
     # store. Empty when the worker doesn't report them (e.g. a stop-trimmed turn).
     generated_token_ids: list = field(default_factory=list)
 
@@ -180,6 +180,11 @@ def run() -> None:
                         raise item
                     yield item
             except asyncio.CancelledError:
+                # stop() is a no-op and we still `await fut` below, so a
+                # cancelled/disconnected client does NOT interrupt the worker --
+                # the in-flight generation runs to completion and head-of-line
+                # blocks other sessions until it does. Real interruption needs a
+                # worker-protocol cancel (see WorkerClient.stop).
                 self._worker.stop()
                 raise
             finally:
diff --git a/extension/llm/server/python/tests/conftest.py b/extension/llm/server/python/tests/conftest.py
index b91f0aec26e..8435e0f3fc2 100644
--- a/extension/llm/server/python/tests/conftest.py
+++ b/extension/llm/server/python/tests/conftest.py
@@ -45,7 +45,7 @@ def __init__(
         self._tokens = list(tokens)
         self._fail = fail
         self._finish_reason = finish_reason  # worker-reported stop reason, if any
-        self._gen_ids = list(gen_ids or [])  # ids reported per turn (V2b.1.5)
+        self._gen_ids = list(gen_ids or [])  # ids reported per turn
         self.captured_config = None
         self.stopped = False
         self.reset_count = 0
diff --git a/extension/llm/server/python/tests/test_sampling_params.py b/extension/llm/server/python/tests/test_sampling_params.py
index fe3166d1bb5..ae48cd95c3d 100644
--- a/extension/llm/server/python/tests/test_sampling_params.py
+++ b/extension/llm/server/python/tests/test_sampling_params.py
@@ -118,7 +118,7 @@ def test_zero_penalties_and_unknown_fields_accepted(make_client):
 
 def test_unsupported_tool_choice_rejected(make_client):
     # "required" / a specific-function choice would need constrained decoding to
-    # force/restrict the call; v1 rejects rather than silently treating as "auto".
+    # force/restrict the call; rejected rather than silently treated as "auto".
     client, _ = make_client()
     for choice in (
         "required",
diff --git a/extension/llm/server/python/tests/test_sessions.py b/extension/llm/server/python/tests/test_sessions.py
index 38bf62630fe..23c9f33e498 100644
--- a/extension/llm/server/python/tests/test_sessions.py
+++ b/extension/llm/server/python/tests/test_sessions.py
@@ -6,7 +6,7 @@
 
 """Session-routing contract tests (fake worker, no model/GPU).
 
-V2a: one worker hosts multiple isolated sessions, routed by session_id, admitted
+One worker hosts multiple isolated sessions, routed by session_id, admitted
 up front so capacity refusals are HTTP statuses rather than mid-stream errors.
 These assert the HTTP/wire contract only.
 """
@@ -152,9 +152,8 @@ def _chat_msgs(client, messages, session_id):
 
 
 def test_token_id_segments_splice_prior_assistant_turn(make_client):
-    # V2b.1.5: the server stores turn-1's generated ids and, on turn 2, sends
-    # prompt_segments that splice them back as an exact {ids} run (not text) --
-    # but only because the client echoes back the assistant turn we generated.
+    # Splices turn-1's stored generated ids back as an exact {ids} run on turn 2,
+    # only because the client echoes back the assistant turn we generated.
     client, fake = make_client(max_named_sessions=2, gen_ids=[7, 8, 9])
     assert (
         _chat_msgs(client, [{"role": "user", "content": "hi"}], "s").status_code == 200
@@ -368,7 +367,7 @@ class _HFToolSpecials:
 
 
 def test_stop_set_narrow_but_strip_set_broad():
-    # Two-set split (work item 1): the generation/pre-parse-truncation set is
+    # The generation/pre-parse-truncation set is
     # NARROW (turn terminators only) so a <tool_call> is never halted or cut
     # before the parser sees it; the final content-strip set stays BROAD so stray
     # specials can't leak into visible content.
@@ -396,7 +395,7 @@ def test_stop_set_narrow_but_strip_set_broad():
 
 
 def test_injected_assistant_exemplar_falls_back_to_text():
-    # 5d: a client-injected assistant turn we never generated (few-shot exemplar /
+    # A client-injected assistant turn we never generated (few-shot exemplar /
     # pre-seeded turn) shifts the ordinal alignment -> fingerprint mismatch ->
     # safe text fallback (no stale ids spliced).
     from executorch.extension.llm.server.python.openai_transcript import (
diff --git a/extension/llm/server/python/tests/test_streaming_stops.py b/extension/llm/server/python/tests/test_streaming_stops.py
index f295422c7db..ca926d92421 100644
--- a/extension/llm/server/python/tests/test_streaming_stops.py
+++ b/extension/llm/server/python/tests/test_streaming_stops.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Plain-chat streaming special-token cleanup (WI1).
+"""Plain-chat streaming special-token cleanup.
 
 Non-streaming scrubs broad content specials (the full all_special_tokens set) from
 visible content via _strip_specials. Plain-chat streaming must be consistent: a
diff --git a/extension/llm/server/python/tests/test_template.py b/extension/llm/server/python/tests/test_template.py
index e99b897a19e..c8a53f1dec8 100644
--- a/extension/llm/server/python/tests/test_template.py
+++ b/extension/llm/server/python/tests/test_template.py
@@ -237,7 +237,7 @@ def test_turn_stop_fallback_without_hf_is_narrow():
 
 
 def test_fallback_extracts_text_parts_not_repr():
-    # 5e: the ChatML fallback renders list-content text parts, not a Python repr.
+    # The ChatML fallback renders list-content text parts, not a Python repr.
     t = ChatTemplate(hf_tokenizer_path=None, allow_fallback=True)  # no _hf
     msg = ChatMessage(
         role="user",
diff --git a/extension/llm/server/python/tests/test_warm_resume_scaffold.py b/extension/llm/server/python/tests/test_warm_resume_scaffold.py
index fff89db5c94..5f01c553379 100644
--- a/extension/llm/server/python/tests/test_warm_resume_scaffold.py
+++ b/extension/llm/server/python/tests/test_warm_resume_scaffold.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Warm-resume generation-scaffold reproduction (V2b.1.5).
+"""Warm-resume generation-scaffold reproduction.
 
 Qwen3's template prefills a deterministic ``<think>`` scaffold into the
 generation prompt (so it lands in resident KV) but strips it when re-rendering a
@@ -510,7 +510,7 @@ def test_token_level_exact_prefix_toolloop_think():
     assert assembled[: len(resident)] == resident
 
 
-# --- WI4a: generation_preamble threads tools --------------------------------
+# --- generation_preamble threads tools ------------------------------------
 
 
 def test_generation_preamble_threads_tools():
diff --git a/extension/llm/server/python/tool_parsers/hermes.py b/extension/llm/server/python/tool_parsers/hermes.py
index 6ba19f89407..1f65982505f 100644
--- a/extension/llm/server/python/tool_parsers/hermes.py
+++ b/extension/llm/server/python/tool_parsers/hermes.py
@@ -32,7 +32,7 @@
 
 
 class _UndefinedToolCall(Exception):
-    """A <tool_call> named a tool not in the request's `tools`. v1 degrades the
+    """A <tool_call> named a tool not in the request's `tools`. Degrades the
     WHOLE response to visible text rather than emitting a partial set — never
     silently drop an undefined call while keeping its siblings (spec)."""
 
diff --git a/extension/llm/server/python/tool_parsers/qwen.py b/extension/llm/server/python/tool_parsers/qwen.py
index 8b72f890d64..02bdf12729c 100644
--- a/extension/llm/server/python/tool_parsers/qwen.py
+++ b/extension/llm/server/python/tool_parsers/qwen.py
@@ -56,7 +56,7 @@
 
 
 class _UndefinedToolCall(Exception):
-    """A call named a tool not in the request's `tools`. v1 degrades the WHOLE
+    """A call named a tool not in the request's `tools`. Degrades the WHOLE
     response to visible text rather than emitting a partial set (spec)."""
 
 
diff --git a/extension/llm/server/python/worker_client.py b/extension/llm/server/python/worker_client.py
index 6b7d4e84132..084a862e43d 100644
--- a/extension/llm/server/python/worker_client.py
+++ b/extension/llm/server/python/worker_client.py
@@ -13,30 +13,12 @@
 client serves a TextLLM worker, a Qwen worker, or any future model worker; only
 the binary and its launch args differ.
 
-Protocol (one JSON object per line):
-  worker -> stdout, once at startup:  {"ready": true, "max_sessions": int,
-                                       "max_named_sessions": int}
-  client -> stdin:
-    generate:  {"max_new_tokens": int, "temperature": float, "stop": [str, ...],
-                "session_id"?: str, and exactly one prompt form:
-                  "prompt": str
-                  "prompt_segments": [{"text": str} | {"ids": [int, ...]}]}
-    open:      {"op": "open",  "session_id": str}
-    close:     {"op": "close", "session_id": str}
-    reset:     {"op": "reset", "session_id": str}   # clear context, keep slot
-  worker -> stdout:
-    generate:  {"token": str} *   (streamed)
-               {"done": true, "prompt_tokens": int, "completion_tokens": int,
-                "finish_reason": "stop" | "length",
-                "reused_prompt_tokens": int, "prefilled_prompt_tokens": int,
-                "session_reset_reason": "new"|"exact_prefix"|"dirty"|"mismatch"
-                                        |"equal",
-                "generated_token_ids"?: [int, ...]}   # omitted if stop-trimmed
-    open:      {"opened": true, "session_id": str}
-    close:     {"closed": true, "session_id": str}
-    reset:     {"reset": true,  "session_id": str}
-    error:     {"error": str, "code"?: str}   # capacity_exhausted /
-                                              # unsupported_session
+Protocol (one JSON object per line; full reference in cpp/worker_loop.h): a
+per-request `generate` (a `prompt` or `prompt_segments` form, optional
+`session_id`) streams `{"token": str}` lines, then one `{"done": true, ...}`
+carrying warm-resume stats and optional `generated_token_ids`; `open`/`close`/
+`reset` ops manage named sessions; failures return `{"error": str, "code"?}`.
+The shapes this client builds and parses are in generate()/_on_done() below.
 
 The worker's stdout carries ONLY protocol JSON; its logs go to stderr. One
 request at a time per worker; the caller (SessionRuntime) serializes. A worker
@@ -64,7 +46,7 @@ class WorkerStats:
     # stop) or "length" (ran to max_new, possibly clamped to the context window).
     # None if the worker didn't report it (older worker / fake).
     finish_reason: Optional[str] = None
-    # Warm-resume accounting (V2b.1): how many prompt tokens were served from the
+    # Warm-resume accounting: how many prompt tokens were served from the
     # session's resident KV state vs actually prefilled this request, and why
     # ("new"|"exact_prefix"|"dirty"|"mismatch"|"equal"). Not exposed as OpenAI
     # usage; logged for measuring warm-resume hit rate. None on older workers.
@@ -74,7 +56,7 @@ class WorkerStats:
     # The exact (non-terminal) token ids generated this turn. The control plane
     # stores these per session and splices them back as an `ids` prompt segment
     # next turn, so a prior assistant span is an exact token extension instead of
-    # a lossy chat-template re-render (V2b.1.5). Empty on older workers.
+    # a lossy chat-template re-render. Empty on older workers.
     generated_token_ids: list = field(default_factory=list)
 
 
@@ -108,13 +90,17 @@ def __init__(self, proc: subprocess.Popen, max_named_sessions: int = 0):
         self.max_named_sessions = max_named_sessions
 
     def reset(self) -> None:
-        # The worker resets its session at the start of each request; nothing to
-        # do here.
+        # Legacy no-op; reset is explicit via reset_session, or handled by the
+        # worker's prefill plan.
         pass
 
     def stop(self) -> None:
-        # Best-effort: a request is synchronous and not interruptible mid-
-        # generation in V1.
+        # No-op: a worker request is synchronous over the JSONL pipe and is
+        # NOT interruptible mid-generation. The in-flight request runs to
+        # completion and head-of-line blocks every other session on this worker
+        # until it finishes. Real cancellation needs a protocol change (a control
+        # pipe, non-blocking stdin polling between decode steps, or request ids +
+        # an out-of-band cancel op).
         pass
 
     def open_session(self, session_id: str) -> None:
@@ -181,7 +167,7 @@ def generate(self, prompt, config, token_callback=None, stats_callback=None):
             "temperature": getattr(config, "temperature", 0.0),
             "stop": list(getattr(config, "stop", []) or []),
         }
-        # Token-ID segments (V2b.1.5) take precedence over the rendered string:
+        # Token-ID segments take precedence over the rendered string:
         # they let prior assistant spans be exact id runs, not lossy re-renders.
         # `is not None` (not truthiness): segments is a distinct prompt form, kept
         # whatever its content (the worker validates non-empty).
diff --git a/extension/llm/server/spec/README.md b/extension/llm/server/spec/README.md
index 58e0e46ef57..12b7645660a 100644
--- a/extension/llm/server/spec/README.md
+++ b/extension/llm/server/spec/README.md
@@ -27,7 +27,7 @@ rather than silently ignored — a client relying on them would otherwise get
 wrong behavior: `top_p` (anything other than `1.0`), `seed`, `n` (> 1),
 `reasoning_effort`, `frequency_penalty`/`presence_penalty` (nonzero), `top_k`,
 `logit_bias`, `tool_choice` = `"required"` or a specific-function choice
-(forcing/restricting a call needs constrained decoding, which v1 lacks),
+(forcing/restricting a call needs constrained decoding, not implemented),
 `response_format` other than `{"type": "text"}` (no constrained JSON),
 `logprobs`/`top_logprobs` (not returned), and `parallel_tool_calls: false`
 (single-call can't be guaranteed without constraining). Unknown fields that
@@ -62,12 +62,18 @@ status (e.g. `400 context_length_exceeded` when `--max-context` is set and the
 prompt exceeds it). A mid-stream failure emits an `error` SSE event then
 `[DONE]` rather than dropping the socket. Cancellation is best-effort: on a
 client disconnect the control plane stops consuming the stream (`stop()`), but
-the worker runs the in-flight request to completion — V1 has no mid-generation
-interrupt protocol.
+the worker runs the in-flight request to completion — there is no mid-generation
+interrupt protocol. Because execution is serialized on one worker, an abandoned
+generation also **head-of-line blocks** every other session until it finishes;
+real interruption (a control pipe / between-step stdin poll / request-id cancel
+op) is future work.
 
-### Prefix cache
+### Prefix / KV reuse
 
-Not in V1 serving. The control plane holds no KV state and does no prefix-reuse
-routing; each request is an independent prompt to the worker. If turn-to-turn KV
-prefix reuse returns, it will live inside the worker/session (where the KV cache
-is), not in the control plane.
+No global (cross-session) prefix cache: the control plane holds no KV state and
+does no prefix-reuse routing, so a system prompt shared by two different sessions
+is prefilled independently for each. Per-session append-only warm resume *is*
+implemented worker-side for engines that support it — a named session whose next
+request is an exact-token extension of its resident context prefills only the new
+suffix. All KV/resident state lives inside the worker/session, never the control
+plane.