From d9e5354dc5fe40d6beb766753090c789a3a8d53f Mon Sep 17 00:00:00 2001
From: andreaonofrei01
Date: Mon, 4 May 2026 14:03:46 +0200
Subject: [PATCH 1/4] docs(examples): add chat reasoning_effort examples

Add four examples in examples/mistral/chat/ demonstrating
reasoning_effort on mistral-medium-3-5:

- reasoning_response_shape.py: dump the raw response shape for
  reasoning_effort="high" vs "none" so users see the ThinkChunk /
  TextChunk JSON before consuming it.
- reasoning.py: single-turn call, iterate ThinkChunk and TextChunk in
  message.content.
- reasoning_with_streaming.py: handle streaming deltas where chunks
  arrive as ThinkChunk lists during thinking and as plain string
  fragments after thinking ends.
- reasoning_multi_turn.py: run the same 3-turn math chain with two
  replay strategies (keep vs drop ThinkChunks) and print token usage
  so the cost difference is visible.
---
 examples/mistral/chat/reasoning.py            | 56 +++++++++++
 examples/mistral/chat/reasoning_multi_turn.py | 96 +++++++++++++++++++
 .../mistral/chat/reasoning_response_shape.py  | 47 +++++++++
 .../mistral/chat/reasoning_with_streaming.py  | 68 +++++++++++++
 4 files changed, 267 insertions(+)
 create mode 100644 examples/mistral/chat/reasoning.py
 create mode 100644 examples/mistral/chat/reasoning_multi_turn.py
 create mode 100644 examples/mistral/chat/reasoning_response_shape.py
 create mode 100644 examples/mistral/chat/reasoning_with_streaming.py

diff --git a/examples/mistral/chat/reasoning.py b/examples/mistral/chat/reasoning.py
new file mode 100644
index 00000000..dbdb14cf
--- /dev/null
+++ b/examples/mistral/chat/reasoning.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+
+import os
+
+import httpx
+
+from mistralai.client import Mistral
+from mistralai.client.models import TextChunk, ThinkChunk, UserMessage
+
+
+def main():
+    api_key = os.environ["MISTRAL_API_KEY"]
+    model = "mistral-medium-3-5"
+
+    client = Mistral(
+        api_key=api_key,
+        client=httpx.Client(timeout=httpx.Timeout(300.0)),
+    )
+
+    chat_response = client.chat.complete(
+        model=model,
+        messages=[
+            UserMessage(
+                content=(
+                    "John is one of 4 children. The first sister is 4 years old. "
+                    "Next year, the second sister will be twice as old as the first sister. "
+                    "The third sister is two years older than the second sister. "
+                    "The third sister is half the age of her older brother. "
+                    "How old is John?"
+                )
+            )
+        ],
+        reasoning_effort="high",
+        temperature=0.7,
+    )
+
+    # With reasoning_effort="high", message.content is a list of chunks.
+    # With reasoning_effort="none", message.content is a plain string.
+    content = chat_response.choices[0].message.content
+    if isinstance(content, str):
+        print(content)
+        return
+
+    for chunk in content or []:
+        if isinstance(chunk, ThinkChunk):
+            print("--- thinking ---")
+            for inner in chunk.thinking:
+                if isinstance(inner, TextChunk):
+                    print(inner.text)
+            print("--- /thinking ---")
+        elif isinstance(chunk, TextChunk):
+            print(chunk.text)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/mistral/chat/reasoning_multi_turn.py b/examples/mistral/chat/reasoning_multi_turn.py
new file mode 100644
index 00000000..fb0a756d
--- /dev/null
+++ b/examples/mistral/chat/reasoning_multi_turn.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+
+# Multi-turn conversation with a reasoning model.
+#
+# When the assistant returns a list of chunks (ThinkChunk + TextChunk),
+# you must choose what to put back into `messages` for the next turn.
+# This example runs the same 3-turn math chain with two strategies and +# prints the resulting token usage so you can see the tradeoff: +# +# A) keep ThinkChunks -> the prompt grows fast as reasoning accumulates +# B) drop ThinkChunks -> only the final answer is replayed +# +# Both produce the correct answer here. Pick based on whether your task +# benefits from the model seeing its prior reasoning. + +import os + +import httpx + +from mistralai.client import Mistral +from mistralai.client.models import ( + AssistantMessage, + TextChunk, + ThinkChunk, + UserMessage, +) + +MODEL = "mistral-medium-3-5" +TURNS = [ + "What is 17 * 23?", + "Now multiply that by 3.", + "And subtract 100 from the result.", +] + + +def final_text(content): + if isinstance(content, str): + return content + return "".join(c.text for c in (content or []) if isinstance(c, TextChunk)) + + +def keep_thinking(content): + return content + + +def drop_thinking(content): + if isinstance(content, str): + return content + return [c for c in (content or []) if not isinstance(c, ThinkChunk)] + + +def run_chain(client, label, build_history): + print(f"\n========== {label} ==========") + messages = [] + total_prompt = 0 + total_completion = 0 + last_answer = "" + + for i, user_text in enumerate(TURNS, start=1): + messages.append(UserMessage(content=user_text)) + response = client.chat.complete( + model=MODEL, + messages=messages, + reasoning_effort="high", + temperature=0.7, + ) + content = response.choices[0].message.content + usage = response.usage + total_prompt += usage.prompt_tokens + total_completion += usage.completion_tokens + last_answer = final_text(content) + + print( + f"turn {i}: prompt={usage.prompt_tokens:>4} " + f"completion={usage.completion_tokens:>4} -> {last_answer}" + ) + messages.append(AssistantMessage(content=build_history(content))) + + print( + f"TOTAL: prompt={total_prompt} completion={total_completion} " + f"(sum {total_prompt + total_completion})" + ) + + +def main(): + client = Mistral( + api_key=os.environ["MISTRAL_API_KEY"], + client=httpx.Client(timeout=httpx.Timeout(300.0)), + ) + + run_chain(client, "A) keep ThinkChunks across turns", keep_thinking) + run_chain(client, "B) drop ThinkChunks across turns", drop_thinking) + + +if __name__ == "__main__": + main() diff --git a/examples/mistral/chat/reasoning_response_shape.py b/examples/mistral/chat/reasoning_response_shape.py new file mode 100644 index 00000000..ac93078f --- /dev/null +++ b/examples/mistral/chat/reasoning_response_shape.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python + +# Print the raw shape of a chat response when using `reasoning_effort`. +# Run this first to see what ThinkChunk / TextChunk look like in the wire +# format, then move on to the other reasoning_*.py examples. + +import json +import os + +import httpx + +from mistralai.client import Mistral +from mistralai.client.models import UserMessage + + +def main(): + client = Mistral( + api_key=os.environ["MISTRAL_API_KEY"], + client=httpx.Client(timeout=httpx.Timeout(300.0)), + ) + + prompt = "What is 12 * 14? Answer in one short sentence." 
+ + for effort in ["high", "none"]: + print(f"\n========== reasoning_effort={effort!r} ==========") + response = client.chat.complete( + model="mistral-medium-3-5", + messages=[UserMessage(content=prompt)], + reasoning_effort=effort, + temperature=0.7, + ) + message = response.choices[0].message + print(f"type(message.content) = {type(message.content).__name__}") + print("message.content =") + if isinstance(message.content, str): + print(json.dumps(message.content, indent=2)) + else: + print( + json.dumps( + [chunk.model_dump() for chunk in message.content], + indent=2, + ) + ) + + +if __name__ == "__main__": + main() diff --git a/examples/mistral/chat/reasoning_with_streaming.py b/examples/mistral/chat/reasoning_with_streaming.py new file mode 100644 index 00000000..7b761519 --- /dev/null +++ b/examples/mistral/chat/reasoning_with_streaming.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python + +import os + +import httpx + +from mistralai.client import Mistral +from mistralai.client.models import TextChunk, ThinkChunk, UserMessage + + +def main(): + api_key = os.environ["MISTRAL_API_KEY"] + model = "mistral-medium-3-5" + + client = Mistral( + api_key=api_key, + client=httpx.Client(timeout=httpx.Timeout(300.0)), + ) + + # While the model is thinking, delta.content is a list containing a + # ThinkChunk. After the thinking phase ends, delta.content arrives as + # plain string fragments. The transition event may contain both a closing + # ThinkChunk and the first TextChunk in a single list. + in_thinking = False + for event in client.chat.stream( + model=model, + messages=[ + UserMessage( + content=( + "If a train leaves Paris at 9am going 120 km/h and another " + "leaves Lyon at 10am going 150 km/h on the same track, " + "when do they meet? Paris-Lyon is 465 km." + ) + ) + ], + reasoning_effort="high", + temperature=0.7, + ): + delta = event.data.choices[0].delta.content + if not delta: + continue + + if isinstance(delta, str): + if in_thinking: + print("\n--- /thinking ---") + in_thinking = False + print(delta, end="", flush=True) + continue + + for chunk in delta: + if isinstance(chunk, ThinkChunk): + if not in_thinking: + print("--- thinking ---") + in_thinking = True + for inner in chunk.thinking: + if isinstance(inner, TextChunk): + print(inner.text, end="", flush=True) + elif isinstance(chunk, TextChunk): + if in_thinking: + print("\n--- /thinking ---") + in_thinking = False + print(chunk.text, end="", flush=True) + + print() + + +if __name__ == "__main__": + main() From e9793b674d565278fdf10b5fe0db98e2d5698a3d Mon Sep 17 00:00:00 2001 From: andreaonofrei01 Date: Mon, 4 May 2026 14:38:33 +0200 Subject: [PATCH 2/4] address review: keep ThinkChunks for MM3.5; use timeout_ms - reasoning_multi_turn.py: drop the keep-vs-drop comparison and recommend keeping ThinkChunks across turns. Per reviewer feedback, dropping the reasoning trace degrades MM3.5 performance. - All four files: replace httpx.Client(timeout=...) with the SDK's timeout_ms parameter; remove the httpx import. 
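
Before/after for the client construction (illustrative snippet; the
300-second timeout value is the one the examples already used):

    # before: custom httpx client, only to raise the request timeout
    client = Mistral(
        api_key=api_key,
        client=httpx.Client(timeout=httpx.Timeout(300.0)),
    )

    # after: the SDK's own timeout parameter, in milliseconds
    client = Mistral(api_key=api_key, timeout_ms=300_000)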
--- examples/mistral/chat/reasoning.py | 8 +--- examples/mistral/chat/reasoning_multi_turn.py | 48 +++++-------------- .../mistral/chat/reasoning_response_shape.py | 8 +--- .../mistral/chat/reasoning_with_streaming.py | 8 +--- 4 files changed, 18 insertions(+), 54 deletions(-) diff --git a/examples/mistral/chat/reasoning.py b/examples/mistral/chat/reasoning.py index dbdb14cf..1e2b1348 100644 --- a/examples/mistral/chat/reasoning.py +++ b/examples/mistral/chat/reasoning.py @@ -2,8 +2,6 @@ import os -import httpx - from mistralai.client import Mistral from mistralai.client.models import TextChunk, ThinkChunk, UserMessage @@ -12,10 +10,8 @@ def main(): api_key = os.environ["MISTRAL_API_KEY"] model = "mistral-medium-3-5" - client = Mistral( - api_key=api_key, - client=httpx.Client(timeout=httpx.Timeout(300.0)), - ) + # Bump request timeout because reasoning runs can be long. + client = Mistral(api_key=api_key, timeout_ms=300_000) chat_response = client.chat.complete( model=model, diff --git a/examples/mistral/chat/reasoning_multi_turn.py b/examples/mistral/chat/reasoning_multi_turn.py index fb0a756d..4c970ded 100644 --- a/examples/mistral/chat/reasoning_multi_turn.py +++ b/examples/mistral/chat/reasoning_multi_turn.py @@ -2,21 +2,16 @@ # Multi-turn conversation with a reasoning model. # -# When the assistant returns a list of chunks (ThinkChunk + TextChunk), -# you must choose what to put back into `messages` for the next turn. -# This example runs the same 3-turn math chain with two strategies and -# prints the resulting token usage so you can see the tradeoff: +# IMPORTANT: for Mistral Medium 3.5, always replay the assistant turn +# back into `messages` with its ThinkChunks intact. Dropping the +# reasoning trace across turns DEGRADES the model's performance. # -# A) keep ThinkChunks -> the prompt grows fast as reasoning accumulates -# B) drop ThinkChunks -> only the final answer is replayed -# -# Both produce the correct answer here. Pick based on whether your task -# benefits from the model seeing its prior reasoning. +# This example runs a 3-turn math chain and prints per-turn token +# usage. The prompt grows as the reasoning trace accumulates; that +# growth is expected. import os -import httpx - from mistralai.client import Mistral from mistralai.client.models import ( AssistantMessage, @@ -39,22 +34,13 @@ def final_text(content): return "".join(c.text for c in (content or []) if isinstance(c, TextChunk)) -def keep_thinking(content): - return content - - -def drop_thinking(content): - if isinstance(content, str): - return content - return [c for c in (content or []) if not isinstance(c, ThinkChunk)] - +def main(): + # Bump request timeout because reasoning runs can be long. 
+ client = Mistral(api_key=os.environ["MISTRAL_API_KEY"], timeout_ms=300_000) -def run_chain(client, label, build_history): - print(f"\n========== {label} ==========") messages = [] total_prompt = 0 total_completion = 0 - last_answer = "" for i, user_text in enumerate(TURNS, start=1): messages.append(UserMessage(content=user_text)) @@ -68,13 +54,13 @@ def run_chain(client, label, build_history): usage = response.usage total_prompt += usage.prompt_tokens total_completion += usage.completion_tokens - last_answer = final_text(content) print( f"turn {i}: prompt={usage.prompt_tokens:>4} " - f"completion={usage.completion_tokens:>4} -> {last_answer}" + f"completion={usage.completion_tokens:>4} -> {final_text(content)}" ) - messages.append(AssistantMessage(content=build_history(content))) + # Replay the full assistant content (ThinkChunks included). + messages.append(AssistantMessage(content=content)) print( f"TOTAL: prompt={total_prompt} completion={total_completion} " @@ -82,15 +68,5 @@ def run_chain(client, label, build_history): ) -def main(): - client = Mistral( - api_key=os.environ["MISTRAL_API_KEY"], - client=httpx.Client(timeout=httpx.Timeout(300.0)), - ) - - run_chain(client, "A) keep ThinkChunks across turns", keep_thinking) - run_chain(client, "B) drop ThinkChunks across turns", drop_thinking) - - if __name__ == "__main__": main() diff --git a/examples/mistral/chat/reasoning_response_shape.py b/examples/mistral/chat/reasoning_response_shape.py index ac93078f..6cb820ab 100644 --- a/examples/mistral/chat/reasoning_response_shape.py +++ b/examples/mistral/chat/reasoning_response_shape.py @@ -7,17 +7,13 @@ import json import os -import httpx - from mistralai.client import Mistral from mistralai.client.models import UserMessage def main(): - client = Mistral( - api_key=os.environ["MISTRAL_API_KEY"], - client=httpx.Client(timeout=httpx.Timeout(300.0)), - ) + # Bump request timeout because reasoning runs can be long. + client = Mistral(api_key=os.environ["MISTRAL_API_KEY"], timeout_ms=300_000) prompt = "What is 12 * 14? Answer in one short sentence." diff --git a/examples/mistral/chat/reasoning_with_streaming.py b/examples/mistral/chat/reasoning_with_streaming.py index 7b761519..377844a2 100644 --- a/examples/mistral/chat/reasoning_with_streaming.py +++ b/examples/mistral/chat/reasoning_with_streaming.py @@ -2,8 +2,6 @@ import os -import httpx - from mistralai.client import Mistral from mistralai.client.models import TextChunk, ThinkChunk, UserMessage @@ -12,10 +10,8 @@ def main(): api_key = os.environ["MISTRAL_API_KEY"] model = "mistral-medium-3-5" - client = Mistral( - api_key=api_key, - client=httpx.Client(timeout=httpx.Timeout(300.0)), - ) + # Bump request timeout because reasoning runs can be long. + client = Mistral(api_key=api_key, timeout_ms=300_000) # While the model is thinking, delta.content is a list containing a # ThinkChunk. 
After the thinking phase ends, delta.content arrives as From 7dd374d7b30b970f45529c38abddf7db74eccb99 Mon Sep 17 00:00:00 2001 From: andreaonofrei01 Date: Mon, 4 May 2026 14:50:05 +0200 Subject: [PATCH 3/4] fix lint: remove unused ThinkChunk import in reasoning_multi_turn --- examples/mistral/chat/reasoning_multi_turn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/mistral/chat/reasoning_multi_turn.py b/examples/mistral/chat/reasoning_multi_turn.py index 4c970ded..3ba40e4e 100644 --- a/examples/mistral/chat/reasoning_multi_turn.py +++ b/examples/mistral/chat/reasoning_multi_turn.py @@ -16,7 +16,6 @@ from mistralai.client.models import ( AssistantMessage, TextChunk, - ThinkChunk, UserMessage, ) From 1af9da5fbd996a7e0892b3112b07692a653dd7bd Mon Sep 17 00:00:00 2001 From: andreaonofrei01 Date: Mon, 4 May 2026 15:07:47 +0200 Subject: [PATCH 4/4] simplify multi-turn: append response.choices[0].message directly Avoids the unnecessary AssistantMessage(content=content) re-wrap and forwards any future fields on AssistantMessage automatically. Verified end-to-end: 3/3 runs of the math chain produce 391 -> 1173 -> 1073, and history inspection confirms each AssistantMessage slot preserves [ThinkChunk, TextChunk]. --- examples/mistral/chat/reasoning_multi_turn.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/examples/mistral/chat/reasoning_multi_turn.py b/examples/mistral/chat/reasoning_multi_turn.py index 3ba40e4e..84b880de 100644 --- a/examples/mistral/chat/reasoning_multi_turn.py +++ b/examples/mistral/chat/reasoning_multi_turn.py @@ -13,11 +13,7 @@ import os from mistralai.client import Mistral -from mistralai.client.models import ( - AssistantMessage, - TextChunk, - UserMessage, -) +from mistralai.client.models import TextChunk, UserMessage MODEL = "mistral-medium-3-5" TURNS = [ @@ -49,17 +45,18 @@ def main(): reasoning_effort="high", temperature=0.7, ) - content = response.choices[0].message.content + message = response.choices[0].message usage = response.usage total_prompt += usage.prompt_tokens total_completion += usage.completion_tokens print( f"turn {i}: prompt={usage.prompt_tokens:>4} " - f"completion={usage.completion_tokens:>4} -> {final_text(content)}" + f"completion={usage.completion_tokens:>4} -> {final_text(message.content)}" ) - # Replay the full assistant content (ThinkChunks included). - messages.append(AssistantMessage(content=content)) + # Append the full assistant message back into history so the + # ThinkChunks are preserved across turns. + messages.append(message) print( f"TOTAL: prompt={total_prompt} completion={total_completion} "
         f"(sum {total_prompt + total_completion})"
     )