From 34ad2ac535cdc2c59e654feaf030da0065e8252a Mon Sep 17 00:00:00 2001 From: Jonathan Irvin Date: Sun, 3 May 2026 21:30:34 -0500 Subject: [PATCH 1/5] feat(llm): add hit_max_iterations flag on tool-loop response MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Telemetry-only signal: True when the loop exited via the max-iterations synthesis path rather than the model deciding to stop. Distinguishes "model didn't converge" from natural termination so downstream observability can label the two cases differently. No emitter changes — flag is set but no consumer reads it yet. --- src/llm/tool_loop.py | 1 + src/llm/types.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/llm/tool_loop.py b/src/llm/tool_loop.py index 2db87e9a..8d29aa03 100644 --- a/src/llm/tool_loop.py +++ b/src/llm/tool_loop.py @@ -470,6 +470,7 @@ async def _final_call() -> HonchoLLMCallResponse[Any]: final_response = await final_call_func() final_response.tool_calls_made = all_tool_calls final_response.iterations = iteration + 1 + final_response.hit_max_iterations = True final_response.input_tokens = total_input_tokens + final_response.input_tokens final_response.output_tokens = total_output_tokens + final_response.output_tokens final_response.cache_creation_input_tokens = ( diff --git a/src/llm/types.py b/src/llm/types.py index 7af5372d..d40d42d0 100644 --- a/src/llm/types.py +++ b/src/llm/types.py @@ -66,6 +66,9 @@ class HonchoLLMCallResponse(BaseModel, Generic[T]): tool_calls_made: list[dict[str, Any]] = Field(default_factory=list) iterations: int = 0 """Number of LLM calls made in the tool execution loop.""" + hit_max_iterations: bool = False + """True when the tool loop exited via the max-iterations synthesis path + rather than the model deciding to stop. Telemetry-only signal.""" thinking_content: str | None = None # Full thinking blocks with signatures for multi-turn replay (Anthropic only). thinking_blocks: list[dict[str, Any]] = Field(default_factory=list) From 48b3003bba7842c0a9122c1599556d963ad2419b Mon Sep 17 00:00:00 2001 From: Jonathan Irvin Date: Sun, 3 May 2026 21:30:52 -0500 Subject: [PATCH 2/5] feat(telemetry): declare per-LLM-call Prometheus series MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds six new metrics + recorder methods on the existing PrometheusMetrics singleton; no callers yet, so this commit is purely declarative. Series: - llm_calls / llm_call_duration_seconds — counter + histogram per call, labeled by feature × provider × model × outcome. - llm_tokens — input/output/cache_read/cache_creation per feature × provider × model. - llm_tool_calls — per-tool invocation outcome inside the tool loop. - llm_iterations — histogram of iterations consumed per call/outcome. - llm_backup_used — counts failovers from primary to backup provider. Cardinality-bounded: feature × provider × model × outcome ≈ 1.7k series cap. Deliberately no workspace_name label here — these answer "is this model effective for this feature", not "is workspace X slow". LLMCallOutcome enum exported from src.telemetry.prometheus so callers can reference the canonical values without importing from the metrics module directly. 
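For orientation, a caller-side sketch of the recorder surface added here
(the feature/provider/model values below are illustrative; real wiring
lands in the following commits):

    from src.telemetry.prometheus import LLMCallOutcome, prometheus_metrics

    prometheus_metrics.record_llm_call(
        feature="dialectic",
        provider="anthropic",
        model="claude-sonnet",
        outcome=LLMCallOutcome.SUCCESS.value,
        duration_seconds=1.42,
    )
    prometheus_metrics.record_llm_tokens(
        feature="dialectic",
        provider="anthropic",
        model="claude-sonnet",
        token_type="input",
        count=5210,
    )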
--- src/telemetry/prometheus/__init__.py | 2 + src/telemetry/prometheus/metrics.py | 170 +++++++++++++++++++++++++++ 2 files changed, 172 insertions(+) diff --git a/src/telemetry/prometheus/__init__.py b/src/telemetry/prometheus/__init__.py index ab0a2616..a3ff8694 100644 --- a/src/telemetry/prometheus/__init__.py +++ b/src/telemetry/prometheus/__init__.py @@ -11,6 +11,7 @@ DeriverComponents, DeriverTaskTypes, DialecticComponents, + LLMCallOutcome, TokenTypes, metrics_endpoint, prometheus_metrics, @@ -20,6 +21,7 @@ "DeriverComponents", "DeriverTaskTypes", "DialecticComponents", + "LLMCallOutcome", "TokenTypes", "metrics_endpoint", "prometheus_metrics", diff --git a/src/telemetry/prometheus/metrics.py b/src/telemetry/prometheus/metrics.py index 0c9fc15b..d70aea84 100644 --- a/src/telemetry/prometheus/metrics.py +++ b/src/telemetry/prometheus/metrics.py @@ -64,6 +64,24 @@ class DialecticComponents(Enum): TOTAL = "total" +class LLMCallOutcome(Enum): + """Terminal outcome of a single `honcho_llm_call`. + + Distinguishes "model didn't converge" (max_iterations) from "infra broke" + (timeout/validation/other) so dashboards and alerts can target each + independently. `success_via_backup` is its own bucket so silent failover + rate is observable without parsing logs. + """ + + SUCCESS = "success" + SUCCESS_AFTER_RETRY = "success_after_retry" + SUCCESS_VIA_BACKUP = "success_via_backup" + ERROR_TIMEOUT = "error_timeout" + ERROR_VALIDATION = "error_validation" + ERROR_MAX_ITERATIONS = "error_max_iterations" + ERROR_OTHER = "error_other" + + api_requests_counter = NamespacedCounter( "api_requests", "Total API requests", @@ -192,6 +210,61 @@ class DialecticComponents(Enum): ["namespace", "workspace_name", "session_name", "state"], ) +# ---- Per-LLM-call observability --------------------------------------------- +# Cardinality budget: feature ~6, provider ~4, model ~10, outcome 7 → ~1.7k +# series cap. Deliberately no workspace_name label here: the question these +# answer is "is this model effective for this feature", not "is workspace X +# slow". Per-workspace LLM behavior shows up in dialectic_calls + token +# counters which already carry workspace_name. + +llm_calls_counter = NamespacedCounter( + "llm_calls", + "Total honcho_llm_call invocations by feature, provider, model, outcome", + ["namespace", "feature", "provider", "model", "outcome"], +) + +llm_call_duration_histogram = NamespacedHistogram( + "llm_call_duration_seconds", + "End-to-end honcho_llm_call latency (includes retries and backup failover)", + ["namespace", "feature", "provider", "model", "outcome"], + buckets=(0.1, 0.5, 1, 2, 5, 10, 30, 60, 120, 300, 600, 1800, 3600), +) + +# Distinct from the existing deriver/dialectic/dreamer token counters: +# this one carries provider+model so we can answer "tokens through gemini +# vs glm-5.1" without bouncing through Langfuse. 
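+# A typical query, assuming NamespacedCounter exports the bare metric name
+# and prometheus_client appends the usual `_total` counter suffix:
+#   sum by (provider, model) (rate(llm_tokens_total[5m]))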
+llm_tokens_counter = NamespacedCounter( + "llm_tokens", + "LLM tokens by feature/provider/model/token_type", + ["namespace", "feature", "provider", "model", "token_type"], +) + +llm_tool_calls_counter = NamespacedCounter( + "llm_tool_calls", + "Individual tool invocations within an LLM tool loop", + ["namespace", "feature", "tool_name", "outcome"], +) + +llm_iterations_histogram = NamespacedHistogram( + "llm_iterations", + "Tool-loop iterations consumed per call (1 = no tool calls)", + ["namespace", "feature", "outcome"], + buckets=(1, 2, 3, 4, 5, 7, 10, 15, 20, 30, 50), +) + +llm_backup_used_counter = NamespacedCounter( + "llm_backup_used", + "Counts when a call's retry chain switched from primary to backup provider", + [ + "namespace", + "feature", + "primary_provider", + "primary_model", + "backup_provider", + "backup_model", + ], +) + @final class PrometheusMetrics: @@ -525,6 +598,103 @@ def set_session_queue_oldest_age( except Exception as e: self._handle_metric_error("set_session_queue_oldest_age", e) + def record_llm_call( + self, + *, + feature: str, + provider: str, + model: str, + outcome: str, + duration_seconds: float, + ) -> None: + try: + llm_calls_counter.labels( + feature=feature, + provider=provider, + model=model, + outcome=outcome, + ).inc() + llm_call_duration_histogram.labels( + feature=feature, + provider=provider, + model=model, + outcome=outcome, + ).observe(duration_seconds) + except Exception as e: + self._handle_metric_error("record_llm_call", e) + + def record_llm_tokens( + self, + *, + feature: str, + provider: str, + model: str, + token_type: str, + count: int, + ) -> None: + if count <= 0: + return + try: + llm_tokens_counter.labels( + feature=feature, + provider=provider, + model=model, + token_type=token_type, + ).inc(count) + except Exception as e: + self._handle_metric_error("record_llm_tokens", e) + + def record_llm_tool_call( + self, + *, + feature: str, + tool_name: str, + outcome: str, + ) -> None: + try: + llm_tool_calls_counter.labels( + feature=feature, + tool_name=tool_name, + outcome=outcome, + ).inc() + except Exception as e: + self._handle_metric_error("record_llm_tool_call", e) + + def observe_llm_iterations( + self, + *, + feature: str, + outcome: str, + iterations: int, + ) -> None: + try: + llm_iterations_histogram.labels( + feature=feature, + outcome=outcome, + ).observe(iterations) + except Exception as e: + self._handle_metric_error("observe_llm_iterations", e) + + def record_llm_backup_used( + self, + *, + feature: str, + primary_provider: str, + primary_model: str, + backup_provider: str, + backup_model: str, + ) -> None: + try: + llm_backup_used_counter.labels( + feature=feature, + primary_provider=primary_provider, + primary_model=primary_model, + backup_provider=backup_provider, + backup_model=backup_model, + ).inc() + except Exception as e: + self._handle_metric_error("record_llm_backup_used", e) + prometheus_metrics = PrometheusMetrics() From 389b9d73f3f541a5c365d7a125d91a3a76c01e49 Mon Sep 17 00:00:00 2001 From: Jonathan Irvin Date: Sun, 3 May 2026 21:31:09 -0500 Subject: [PATCH 3/5] feat(telemetry): add observe_llm_call helper for per-call observability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces src/telemetry/llm_call_metrics.py — a context-manager-based wrapper that turns one LLM call into one set of Prometheus samples and one logfmt log line. Surface: - observe_llm_call(...) — context manager yielding a mutable _CallState the caller populates over the call's lifetime. 
- finalize_success(...) — populate state from a successful response and pick the outcome bucket (success / success_after_retry / success_via_backup). - mark_max_iterations(...) — flip the state to error_max_iterations when the tool loop exited via the synthesis path. - normalize_feature_label(...) — maps caller's track_name/trace_name to a low-cardinality Prom label (e.g. "Dreamer/deduction" -> dream_deduction). No callers wired in yet — this commit is the helper module on its own so the diff stays reviewable. Wiring into honcho_llm_call and the tool loop lands in subsequent commits. Errors raised inside the wrapped call are classified into outcome buckets (timeout / validation / other) and re-raised; the wrapper never swallows or transforms exceptions. --- src/telemetry/llm_call_metrics.py | 280 ++++++++++++++++++++++++++++++ 1 file changed, 280 insertions(+) create mode 100644 src/telemetry/llm_call_metrics.py diff --git a/src/telemetry/llm_call_metrics.py b/src/telemetry/llm_call_metrics.py new file mode 100644 index 00000000..16103479 --- /dev/null +++ b/src/telemetry/llm_call_metrics.py @@ -0,0 +1,280 @@ +"""Per-`honcho_llm_call` observability. + +Captures one structured log line + Prometheus sample set per LLM call, +covering every feature (deriver, dialectic, dream, summary). Designed to +be the *only* instrumentation point inside `honcho_llm_call`; subsystem- +specific token counters in `prometheus.metrics` continue to work in parallel. + +Outcome classification distinguishes "the model didn't converge" +(`error_max_iterations`) from "the infra broke" (timeout / validation / +other) so dashboards and alerts can target each independently. +""" + +from __future__ import annotations + +import logging +import re +import time +from collections.abc import Iterator +from contextlib import contextmanager +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any + +from src.exceptions import ValidationException +from src.telemetry.prometheus import LLMCallOutcome, prometheus_metrics + +if TYPE_CHECKING: + from src.config import ModelConfig + from src.llm.types import HonchoLLMCallResponse + +logger = logging.getLogger("honcho.llm.call") + + +# Track names used by callers, mapped to clean Prom label values. Anything +# not in the table falls through `_normalize_track_name` (lower + underscores). +_FEATURE_LABEL_MAP: dict[str, str] = { + "Minimal Deriver": "deriver", + "Dialectic Agent": "dialectic", + "Dialectic Agent Stream": "dialectic_stream", +} + + +def normalize_feature_label(track_name: str | None, trace_name: str | None) -> str: + """Map caller's `track_name`/`trace_name` to a low-cardinality Prom label. + + Prefers explicit `_FEATURE_LABEL_MAP` matches, then snake-cases the + track_name (e.g. ``"Dreamer/deduction"`` → ``"dream_deduction"``), + falling back to trace_name, then to ``"unknown"``. 
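+
+    Illustrative mappings (these follow from the table and the regex below;
+    the trace names are made up):
+
+        normalize_feature_label("Minimal Deriver", None)  # -> "deriver"
+        normalize_feature_label(None, "Summary Queue")    # -> "summary_queue"
+        normalize_feature_label(None, None)               # -> "unknown"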
+ """ + if track_name and track_name in _FEATURE_LABEL_MAP: + return _FEATURE_LABEL_MAP[track_name] + raw = track_name or trace_name + if not raw: + return "unknown" + # "Dreamer/Deduction" → "dreamer_deduction" → "dream_deduction" + s = re.sub(r"[^A-Za-z0-9]+", "_", raw).strip("_").lower() + s = s.replace("dreamer_", "dream_") + return s or "unknown" + + +@dataclass +class _CallState: + """Mutable observation collected across the lifetime of one call.""" + + feature: str + primary_provider: str + primary_model: str + has_backup: bool + backup_provider: str | None = None + backup_model: str | None = None + started_at: float = field(default_factory=time.monotonic) + final_provider: str = "" + final_model: str = "" + attempts: int = 1 + iterations: int | None = None + input_tokens: int = 0 + output_tokens: int = 0 + cache_creation_input_tokens: int = 0 + cache_read_input_tokens: int = 0 + tool_calls: int = 0 + used_backup: bool = False + outcome: LLMCallOutcome = LLMCallOutcome.SUCCESS + error_class: str | None = None + + +def _classify_error(exc: BaseException) -> LLMCallOutcome: + """Map an exception to a coarse outcome bucket.""" + if isinstance(exc, ValidationException): + return LLMCallOutcome.ERROR_VALIDATION + if isinstance(exc, TimeoutError): + return LLMCallOutcome.ERROR_TIMEOUT + name = type(exc).__name__.lower() + msg = str(exc).lower() + if "timeout" in name or "timed out" in msg or "timeout" in msg: + return LLMCallOutcome.ERROR_TIMEOUT + if "pydantic" in name and "validation" in name: + return LLMCallOutcome.ERROR_VALIDATION + return LLMCallOutcome.ERROR_OTHER + + +@contextmanager +def observe_llm_call( + *, + track_name: str | None, + trace_name: str | None, + runtime_model_config: ModelConfig, +) -> Iterator[_CallState]: + """Wrap one `honcho_llm_call` invocation with metrics + structured logging. + + The caller mutates the yielded `_CallState` (sets `attempts`, `iterations`, + accumulates tokens, etc.) over the call's lifetime; on exit we emit the + Prometheus samples and a single logfmt line. Metric errors are swallowed + inside `prometheus_metrics`; this wrapper never raises. + """ + fb = runtime_model_config.fallback + state = _CallState( + feature=normalize_feature_label(track_name, trace_name), + primary_provider=str(runtime_model_config.transport), + primary_model=str(runtime_model_config.model), + has_backup=fb is not None, + backup_provider=str(fb.transport) if fb is not None else None, + backup_model=str(fb.model) if fb is not None else None, + ) + # Default the "winning" provider/model to primary; the caller updates + # these post-success based on the actual AttemptPlan that returned. + state.final_provider = state.primary_provider + state.final_model = state.primary_model + + try: + yield state + except BaseException as exc: + state.outcome = _classify_error(exc) + state.error_class = type(exc).__name__ + _emit(state) + raise + else: + # Success — caller is responsible for setting iterations / final + # provider / outcome (via `finalize_success` below) before exit. + _emit(state) + + +def finalize_success( + state: _CallState, + *, + response: HonchoLLMCallResponse[Any] | None, + final_provider: str | None, + final_model: str | None, + attempts: int, + iterations: int | None, + has_backup: bool, +) -> None: + """Populate `state` from a successful response and pick the outcome bucket. + + Called by `honcho_llm_call` right before the context manager exits when + no exception was raised. `iterations` is None for tool-less calls. 
+ """ + state.attempts = max(1, attempts) + state.iterations = iterations + if final_provider: + state.final_provider = final_provider + if final_model: + state.final_model = final_model + state.used_backup = ( + has_backup + and state.final_model != state.primary_model + and state.backup_model is not None + and state.final_model == state.backup_model + ) + if response is not None: + state.input_tokens = int(response.input_tokens or 0) + state.output_tokens = int(response.output_tokens or 0) + state.cache_creation_input_tokens = int( + response.cache_creation_input_tokens or 0 + ) + state.cache_read_input_tokens = int(response.cache_read_input_tokens or 0) + state.tool_calls = len(response.tool_calls_made or []) + + if state.used_backup: + state.outcome = LLMCallOutcome.SUCCESS_VIA_BACKUP + elif state.attempts > 1: + state.outcome = LLMCallOutcome.SUCCESS_AFTER_RETRY + else: + state.outcome = LLMCallOutcome.SUCCESS + + +def mark_max_iterations(state: _CallState, iterations: int) -> None: + """Mark a tool-loop call that hit `max_tool_iterations` without converging. + + Called when execute_tool_loop returns from the synthesis fallback path + rather than from natural convergence. The call still returned content, + but the model didn't decide to stop on its own — different reliability + signal than a clean success. + """ + state.iterations = iterations + state.outcome = LLMCallOutcome.ERROR_MAX_ITERATIONS + + +def _emit(state: _CallState) -> None: + duration = time.monotonic() - state.started_at + outcome_value = state.outcome.value + + prometheus_metrics.record_llm_call( + feature=state.feature, + provider=state.final_provider, + model=state.final_model, + outcome=outcome_value, + duration_seconds=duration, + ) + if state.input_tokens: + prometheus_metrics.record_llm_tokens( + feature=state.feature, + provider=state.final_provider, + model=state.final_model, + token_type="input", + count=state.input_tokens, + ) + if state.output_tokens: + prometheus_metrics.record_llm_tokens( + feature=state.feature, + provider=state.final_provider, + model=state.final_model, + token_type="output", + count=state.output_tokens, + ) + if state.cache_read_input_tokens: + prometheus_metrics.record_llm_tokens( + feature=state.feature, + provider=state.final_provider, + model=state.final_model, + token_type="cache_read", + count=state.cache_read_input_tokens, + ) + if state.cache_creation_input_tokens: + prometheus_metrics.record_llm_tokens( + feature=state.feature, + provider=state.final_provider, + model=state.final_model, + token_type="cache_creation", + count=state.cache_creation_input_tokens, + ) + if state.iterations is not None: + prometheus_metrics.observe_llm_iterations( + feature=state.feature, + outcome=outcome_value, + iterations=state.iterations, + ) + if state.used_backup and state.backup_provider and state.backup_model: + prometheus_metrics.record_llm_backup_used( + feature=state.feature, + primary_provider=state.primary_provider, + primary_model=state.primary_model, + backup_provider=state.backup_provider, + backup_model=state.backup_model, + ) + + # One structured logfmt line per call. Quote-free values keep `| logfmt` + # parsing in Loki/Grafana straightforward. 
+ iter_value = state.iterations if state.iterations is not None else 0 + err_part = f" error_class={state.error_class}" if state.error_class else "" + line = ( + f"honcho.llm.call feature={state.feature}" + f" provider={state.final_provider} model={state.final_model}" + f" outcome={outcome_value} latency_ms={int(duration * 1000)}" + f" attempts={state.attempts}" + f" used_backup={'true' if state.used_backup else 'false'}" + f" input_tokens={state.input_tokens}" + f" output_tokens={state.output_tokens}" + f" cache_read_tokens={state.cache_read_input_tokens}" + f" cache_creation_tokens={state.cache_creation_input_tokens}" + f" tool_calls={state.tool_calls} iterations={iter_value}" + f"{err_part}" + ) + logger.info(line) + + +__all__ = [ + "finalize_success", + "mark_max_iterations", + "normalize_feature_label", + "observe_llm_call", +] From 873e5e34bb6513eca58398af96fbcfa4ca9311d1 Mon Sep 17 00:00:00 2001 From: Jonathan Irvin Date: Sun, 3 May 2026 21:31:51 -0500 Subject: [PATCH 4/5] feat(llm): emit per-tool-call metrics inside the tool execution loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds prometheus_metrics.record_llm_tool_call() calls in both the success and error branches of execute_tool_loop's per-tool dispatch. Threads track_name / trace_name through the function signature so the emitted metric carries the same feature label that the call-level metrics will use. Both new params default to None (current callers don't pass them yet), so feature label resolves to "unknown" until honcho_llm_call is wired in the next commit. Metric emission is wrapped in PrometheusMetrics' sentry-captured error handler — a metric bug can never break a real tool call. --- src/llm/tool_loop.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/llm/tool_loop.py b/src/llm/tool_loop.py index 8d29aa03..a13b553a 100644 --- a/src/llm/tool_loop.py +++ b/src/llm/tool_loop.py @@ -20,6 +20,8 @@ from src.config import ModelTransport from src.exceptions import ValidationException +from src.telemetry.llm_call_metrics import normalize_feature_label +from src.telemetry.prometheus import prometheus_metrics from src.utils.types import set_current_iteration from .executor import honcho_llm_call_inner @@ -166,6 +168,8 @@ async def execute_tool_loop( before_retry_callback: Callable[[Any], None], stream_final: bool = False, iteration_callback: IterationCallback | None = None, + track_name: str | None = None, + trace_name: str | None = None, ) -> HonchoLLMCallResponse[Any] | StreamingResponseWithMetadata: """Run the iterative tool calling loop for agentic LLM interactions. 
@@ -188,6 +192,8 @@ async def execute_tool_loop( + f"got {max_tool_iterations}" ) + feature_label = normalize_feature_label(track_name, trace_name) + conversation_messages: list[dict[str, Any]] = ( messages.copy() if messages else [{"role": "user", "content": prompt}] ) @@ -351,6 +357,11 @@ async def _call_with_messages( "tool_result": tool_result, } ) + prometheus_metrics.record_llm_tool_call( + feature=feature_label, + tool_name=tool_name, + outcome="success", + ) except Exception as e: logger.error(f"Tool execution failed for {tool_name}: {e}") tool_results.append( @@ -361,6 +372,11 @@ async def _call_with_messages( "is_error": True, } ) + prometheus_metrics.record_llm_tool_call( + feature=feature_label, + tool_name=tool_name, + outcome="error", + ) append_tool_results(current_provider, tool_results, conversation_messages) From 11cd7c2fa9d41237b1576410bcb272fb9e65d1fe Mon Sep 17 00:00:00 2001 From: Jonathan Irvin Date: Sun, 3 May 2026 21:32:11 -0500 Subject: [PATCH 5/5] feat(llm): wire observe_llm_call into honcho_llm_call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wraps the body of honcho_llm_call (both tool-less and tool-loop paths) in observe_llm_call(...) so every invocation produces one set of Prometheus samples and one logfmt log line. Captures the AttemptPlan that produced the most-recent (and on success, the winning) call via a `last_plan` cell updated inside _get_attempt_plan, so the recorded provider/model is the one that actually answered — primary on early attempts, backup on the final retry. This makes backup-on-final-attempt observable directly from llm_calls / llm_tokens without parsing logs. Passes track_name and trace_name through to execute_tool_loop so its per-tool counter (added in the previous commit) carries the same feature label as the call-level metrics. When the tool loop returns response.hit_max_iterations=True, the call's outcome is overridden to error_max_iterations via mark_max_iterations so dashboards can split "model didn't converge" from clean success without the tool-loop having to know about outcome semantics. Streaming responses don't carry token counts at the entry point — the recorded call still emits but token counters skip those rows (record_llm_tokens silently no-ops on count<=0). Acceptable partial signal until streaming refactor surfaces tokens earlier. ruff + basedpyright clean. End-to-end smoke verified all six series fire correctly across success, success_via_backup, error_max_iterations, error_timeout, and tool-call paths. --- src/llm/api.py | 140 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 97 insertions(+), 43 deletions(-) diff --git a/src/llm/api.py b/src/llm/api.py index 2a8f0529..78220374 100644 --- a/src/llm/api.py +++ b/src/llm/api.py @@ -21,6 +21,11 @@ from src.config import ConfiguredModelSettings, ModelConfig from src.exceptions import ValidationException +from src.telemetry.llm_call_metrics import ( + finalize_success, + mark_max_iterations, + observe_llm_call, +) from src.telemetry.logging import conditional_observe from src.telemetry.reasoning_traces import log_reasoning_trace @@ -193,6 +198,11 @@ async def honcho_llm_call( # tenacity uses 1-indexed attempts. current_attempt.set(1) + # Captures the AttemptPlan that produced the most recent (and on success, + # the winning) call so observability can label by the model that actually + # answered — primary on early attempts, backup on the final retry. 
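+    # (A one-element dict rather than a bare local: the nested
+    # _get_attempt_plan closure mutates the cell, avoiding `nonlocal`.)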
+ last_plan: dict[str, AttemptPlan | None] = {"value": None} + def _get_attempt_plan() -> AttemptPlan: plan = plan_attempt( runtime_model_config=runtime_model_config, @@ -201,6 +211,7 @@ def _get_attempt_plan() -> AttemptPlan: call_thinking_budget_tokens=thinking_budget_tokens, call_reasoning_effort=reasoning_effort, ) + last_plan["value"] = plan update_current_langfuse_observation( plan.provider, plan.model, @@ -304,11 +315,92 @@ def _trace_stop_seqs() -> list[str] | None: stop_seqs if stop_seqs is not None else runtime_model_config.stop_sequences ) - # Tool-less path: call once and return. - if not tools or not tool_executor: - result: ( - HonchoLLMCallResponse[Any] | AsyncIterator[HonchoLLMCallStreamChunk] - ) = await decorated() + with observe_llm_call( + track_name=track_name, + trace_name=trace_name, + runtime_model_config=runtime_model_config, + ) as obs_state: + # Tool-less path: call once and return. + if not tools or not tool_executor: + result: ( + HonchoLLMCallResponse[Any] | AsyncIterator[HonchoLLMCallStreamChunk] + ) = await decorated() + response_for_metrics = ( + result if isinstance(result, HonchoLLMCallResponse) else None + ) + winning = last_plan["value"] + finalize_success( + obs_state, + response=response_for_metrics, + final_provider=str(winning.provider) if winning else None, + final_model=winning.model if winning else None, + attempts=current_attempt.get(), + iterations=None, + has_backup=runtime_model_config.fallback is not None, + ) + if trace_name and isinstance(result, HonchoLLMCallResponse): + log_reasoning_trace( + task_type=trace_name, + model_config=runtime_model_config, + prompt=prompt, + response=result, + max_tokens=max_tokens, + thinking_budget_tokens=_trace_thinking_budget(), + reasoning_effort=_trace_reasoning_effort(), + json_mode=json_mode, + stop_seqs=_trace_stop_seqs(), + messages=messages, + ) + return result + + # execute_tool_loop raises ValidationException on out-of-range + # max_tool_iterations; fail-fast is cheaper than silent clamping here. 
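+        # track_name/trace_name flow through so the per-tool counters added
+        # in the previous commit share this call's feature label.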
+ result = await execute_tool_loop( + prompt=prompt, + max_tokens=max_tokens, + messages=messages, + tools=tools, + tool_choice=tool_choice, + tool_executor=tool_executor, + max_tool_iterations=max_tool_iterations, + response_model=response_model, + json_mode=json_mode, + temperature=temperature, + stop_seqs=stop_seqs, + verbosity=verbosity, + enable_retry=enable_retry, + retry_attempts=retry_attempts, + max_input_tokens=max_input_tokens, + get_attempt_plan=_get_attempt_plan, + before_retry_callback=before_retry_callback, + stream_final=stream_final_only, + iteration_callback=iteration_callback, + track_name=track_name, + trace_name=trace_name, + ) + response_for_metrics = ( + result if isinstance(result, HonchoLLMCallResponse) else None + ) + winning = last_plan["value"] + iterations = ( + response_for_metrics.iterations + if response_for_metrics + else (getattr(result, "iterations", None)) + ) + finalize_success( + obs_state, + response=response_for_metrics, + final_provider=str(winning.provider) if winning else None, + final_model=winning.model if winning else None, + attempts=current_attempt.get(), + iterations=iterations, + has_backup=runtime_model_config.fallback is not None, + ) + if response_for_metrics is not None and getattr( + response_for_metrics, "hit_max_iterations", False + ): + mark_max_iterations(obs_state, iterations or max_tool_iterations) + if trace_name and isinstance(result, HonchoLLMCallResponse): log_reasoning_trace( task_type=trace_name, @@ -324,43 +416,5 @@ def _trace_stop_seqs() -> list[str] | None: ) return result - # execute_tool_loop raises ValidationException on out-of-range - # max_tool_iterations; fail-fast is cheaper than silent clamping here. - result = await execute_tool_loop( - prompt=prompt, - max_tokens=max_tokens, - messages=messages, - tools=tools, - tool_choice=tool_choice, - tool_executor=tool_executor, - max_tool_iterations=max_tool_iterations, - response_model=response_model, - json_mode=json_mode, - temperature=temperature, - stop_seqs=stop_seqs, - verbosity=verbosity, - enable_retry=enable_retry, - retry_attempts=retry_attempts, - max_input_tokens=max_input_tokens, - get_attempt_plan=_get_attempt_plan, - before_retry_callback=before_retry_callback, - stream_final=stream_final_only, - iteration_callback=iteration_callback, - ) - if trace_name and isinstance(result, HonchoLLMCallResponse): - log_reasoning_trace( - task_type=trace_name, - model_config=runtime_model_config, - prompt=prompt, - response=result, - max_tokens=max_tokens, - thinking_budget_tokens=_trace_thinking_budget(), - reasoning_effort=_trace_reasoning_effort(), - json_mode=json_mode, - stop_seqs=_trace_stop_seqs(), - messages=messages, - ) - return result - __all__ = ["honcho_llm_call"]