From 34ad2ac535cdc2c59e654feaf030da0065e8252a Mon Sep 17 00:00:00 2001 From: Jonathan Irvin Date: Sun, 3 May 2026 21:30:34 -0500 Subject: [PATCH 1/5] feat(llm): add hit_max_iterations flag on tool-loop response MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Telemetry-only signal: True when the loop exited via the max-iterations synthesis path rather than the model deciding to stop. Distinguishes "model didn't converge" from natural termination so downstream observability can label the two cases differently. No emitter changes — flag is set but no consumer reads it yet. --- src/llm/tool_loop.py | 1 + src/llm/types.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/llm/tool_loop.py b/src/llm/tool_loop.py index 2db87e9a..8d29aa03 100644 --- a/src/llm/tool_loop.py +++ b/src/llm/tool_loop.py @@ -470,6 +470,7 @@ async def _final_call() -> HonchoLLMCallResponse[Any]: final_response = await final_call_func() final_response.tool_calls_made = all_tool_calls final_response.iterations = iteration + 1 + final_response.hit_max_iterations = True final_response.input_tokens = total_input_tokens + final_response.input_tokens final_response.output_tokens = total_output_tokens + final_response.output_tokens final_response.cache_creation_input_tokens = ( diff --git a/src/llm/types.py b/src/llm/types.py index 7af5372d..d40d42d0 100644 --- a/src/llm/types.py +++ b/src/llm/types.py @@ -66,6 +66,9 @@ class HonchoLLMCallResponse(BaseModel, Generic[T]): tool_calls_made: list[dict[str, Any]] = Field(default_factory=list) iterations: int = 0 """Number of LLM calls made in the tool execution loop.""" + hit_max_iterations: bool = False + """True when the tool loop exited via the max-iterations synthesis path + rather than the model deciding to stop. Telemetry-only signal.""" thinking_content: str | None = None # Full thinking blocks with signatures for multi-turn replay (Anthropic only). thinking_blocks: list[dict[str, Any]] = Field(default_factory=list) From 48b3003bba7842c0a9122c1599556d963ad2419b Mon Sep 17 00:00:00 2001 From: Jonathan Irvin Date: Sun, 3 May 2026 21:30:52 -0500 Subject: [PATCH 2/5] feat(telemetry): declare per-LLM-call Prometheus series MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds six new metrics + recorder methods on the existing PrometheusMetrics singleton; no callers yet, so this commit is purely declarative. Series: - llm_calls / llm_call_duration_seconds — counter + histogram per call, labeled by feature × provider × model × outcome. - llm_tokens — input/output/cache_read/cache_creation per feature × provider × model. - llm_tool_calls — per-tool invocation outcome inside the tool loop. - llm_iterations — histogram of iterations consumed per call/outcome. - llm_backup_used — counts failovers from primary to backup provider. Cardinality-bounded: feature × provider × model × outcome ≈ 1.7k series cap. Deliberately no workspace_name label here — these answer "is this model effective for this feature", not "is workspace X slow". LLMCallOutcome enum exported from src.telemetry.prometheus so callers can reference the canonical values without importing from the metrics module directly. 
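For orientation, a caller-side sketch of the recorder surface added here
(the feature/provider/model values below are illustrative; real wiring
lands in the following commits):

    from src.telemetry.prometheus import LLMCallOutcome, prometheus_metrics

    prometheus_metrics.record_llm_call(
        feature="dialectic",
        provider="anthropic",
        model="claude-sonnet",
        outcome=LLMCallOutcome.SUCCESS.value,
        duration_seconds=1.42,
    )
    prometheus_metrics.record_llm_tokens(
        feature="dialectic",
        provider="anthropic",
        model="claude-sonnet",
        token_type="input",
        count=5210,
    )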
--- src/telemetry/prometheus/__init__.py | 2 + src/telemetry/prometheus/metrics.py | 170 +++++++++++++++++++++++++++ 2 files changed, 172 insertions(+) diff --git a/src/telemetry/prometheus/__init__.py b/src/telemetry/prometheus/__init__.py index ab0a2616..a3ff8694 100644 --- a/src/telemetry/prometheus/__init__.py +++ b/src/telemetry/prometheus/__init__.py @@ -11,6 +11,7 @@ DeriverComponents, DeriverTaskTypes, DialecticComponents, + LLMCallOutcome, TokenTypes, metrics_endpoint, prometheus_metrics, @@ -20,6 +21,7 @@ "DeriverComponents", "DeriverTaskTypes", "DialecticComponents", + "LLMCallOutcome", "TokenTypes", "metrics_endpoint", "prometheus_metrics", diff --git a/src/telemetry/prometheus/metrics.py b/src/telemetry/prometheus/metrics.py index 0c9fc15b..d70aea84 100644 --- a/src/telemetry/prometheus/metrics.py +++ b/src/telemetry/prometheus/metrics.py @@ -64,6 +64,24 @@ class DialecticComponents(Enum): TOTAL = "total" +class LLMCallOutcome(Enum): + """Terminal outcome of a single `honcho_llm_call`. + + Distinguishes "model didn't converge" (max_iterations) from "infra broke" + (timeout/validation/other) so dashboards and alerts can target each + independently. `success_via_backup` is its own bucket so silent failover + rate is observable without parsing logs. + """ + + SUCCESS = "success" + SUCCESS_AFTER_RETRY = "success_after_retry" + SUCCESS_VIA_BACKUP = "success_via_backup" + ERROR_TIMEOUT = "error_timeout" + ERROR_VALIDATION = "error_validation" + ERROR_MAX_ITERATIONS = "error_max_iterations" + ERROR_OTHER = "error_other" + + api_requests_counter = NamespacedCounter( "api_requests", "Total API requests", @@ -192,6 +210,61 @@ class DialecticComponents(Enum): ["namespace", "workspace_name", "session_name", "state"], ) +# ---- Per-LLM-call observability --------------------------------------------- +# Cardinality budget: feature ~6, provider ~4, model ~10, outcome 7 → ~1.7k +# series cap. Deliberately no workspace_name label here: the question these +# answer is "is this model effective for this feature", not "is workspace X +# slow". Per-workspace LLM behavior shows up in dialectic_calls + token +# counters which already carry workspace_name. + +llm_calls_counter = NamespacedCounter( + "llm_calls", + "Total honcho_llm_call invocations by feature, provider, model, outcome", + ["namespace", "feature", "provider", "model", "outcome"], +) + +llm_call_duration_histogram = NamespacedHistogram( + "llm_call_duration_seconds", + "End-to-end honcho_llm_call latency (includes retries and backup failover)", + ["namespace", "feature", "provider", "model", "outcome"], + buckets=(0.1, 0.5, 1, 2, 5, 10, 30, 60, 120, 300, 600, 1800, 3600), +) + +# Distinct from the existing deriver/dialectic/dreamer token counters: +# this one carries provider+model so we can answer "tokens through gemini +# vs glm-5.1" without bouncing through Langfuse. 
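+# A typical query, assuming NamespacedCounter exports the bare metric name
+# and prometheus_client appends the usual `_total` counter suffix:
+#   sum by (provider, model) (rate(llm_tokens_total[5m]))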
+llm_tokens_counter = NamespacedCounter( + "llm_tokens", + "LLM tokens by feature/provider/model/token_type", + ["namespace", "feature", "provider", "model", "token_type"], +) + +llm_tool_calls_counter = NamespacedCounter( + "llm_tool_calls", + "Individual tool invocations within an LLM tool loop", + ["namespace", "feature", "tool_name", "outcome"], +) + +llm_iterations_histogram = NamespacedHistogram( + "llm_iterations", + "Tool-loop iterations consumed per call (1 = no tool calls)", + ["namespace", "feature", "outcome"], + buckets=(1, 2, 3, 4, 5, 7, 10, 15, 20, 30, 50), +) + +llm_backup_used_counter = NamespacedCounter( + "llm_backup_used", + "Counts when a call's retry chain switched from primary to backup provider", + [ + "namespace", + "feature", + "primary_provider", + "primary_model", + "backup_provider", + "backup_model", + ], +) + @final class PrometheusMetrics: @@ -525,6 +598,103 @@ def set_session_queue_oldest_age( except Exception as e: self._handle_metric_error("set_session_queue_oldest_age", e) + def record_llm_call( + self, + *, + feature: str, + provider: str, + model: str, + outcome: str, + duration_seconds: float, + ) -> None: + try: + llm_calls_counter.labels( + feature=feature, + provider=provider, + model=model, + outcome=outcome, + ).inc() + llm_call_duration_histogram.labels( + feature=feature, + provider=provider, + model=model, + outcome=outcome, + ).observe(duration_seconds) + except Exception as e: + self._handle_metric_error("record_llm_call", e) + + def record_llm_tokens( + self, + *, + feature: str, + provider: str, + model: str, + token_type: str, + count: int, + ) -> None: + if count <= 0: + return + try: + llm_tokens_counter.labels( + feature=feature, + provider=provider, + model=model, + token_type=token_type, + ).inc(count) + except Exception as e: + self._handle_metric_error("record_llm_tokens", e) + + def record_llm_tool_call( + self, + *, + feature: str, + tool_name: str, + outcome: str, + ) -> None: + try: + llm_tool_calls_counter.labels( + feature=feature, + tool_name=tool_name, + outcome=outcome, + ).inc() + except Exception as e: + self._handle_metric_error("record_llm_tool_call", e) + + def observe_llm_iterations( + self, + *, + feature: str, + outcome: str, + iterations: int, + ) -> None: + try: + llm_iterations_histogram.labels( + feature=feature, + outcome=outcome, + ).observe(iterations) + except Exception as e: + self._handle_metric_error("observe_llm_iterations", e) + + def record_llm_backup_used( + self, + *, + feature: str, + primary_provider: str, + primary_model: str, + backup_provider: str, + backup_model: str, + ) -> None: + try: + llm_backup_used_counter.labels( + feature=feature, + primary_provider=primary_provider, + primary_model=primary_model, + backup_provider=backup_provider, + backup_model=backup_model, + ).inc() + except Exception as e: + self._handle_metric_error("record_llm_backup_used", e) + prometheus_metrics = PrometheusMetrics() From 389b9d73f3f541a5c365d7a125d91a3a76c01e49 Mon Sep 17 00:00:00 2001 From: Jonathan Irvin Date: Sun, 3 May 2026 21:31:09 -0500 Subject: [PATCH 3/5] feat(telemetry): add observe_llm_call helper for per-call observability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces src/telemetry/llm_call_metrics.py — a context-manager-based wrapper that turns one LLM call into one set of Prometheus samples and one logfmt log line. Surface: - observe_llm_call(...) — context manager yielding a mutable _CallState the caller populates over the call's lifetime. 
- finalize_success(...) — populate state from a successful response and pick the outcome bucket (success / success_after_retry / success_via_backup). - mark_max_iterations(...) — flip the state to error_max_iterations when the tool loop exited via the synthesis path. - normalize_feature_label(...) — maps caller's track_name/trace_name to a low-cardinality Prom label (e.g. "Dreamer/deduction" -> dream_deduction). No callers wired in yet — this commit is the helper module on its own so the diff stays reviewable. Wiring into honcho_llm_call and the tool loop lands in subsequent commits. Errors raised inside the wrapped call are classified into outcome buckets (timeout / validation / other) and re-raised; the wrapper never swallows or transforms exceptions. --- src/telemetry/llm_call_metrics.py | 280 ++++++++++++++++++++++++++++++ 1 file changed, 280 insertions(+) create mode 100644 src/telemetry/llm_call_metrics.py diff --git a/src/telemetry/llm_call_metrics.py b/src/telemetry/llm_call_metrics.py new file mode 100644 index 00000000..16103479 --- /dev/null +++ b/src/telemetry/llm_call_metrics.py @@ -0,0 +1,280 @@ +"""Per-`honcho_llm_call` observability. + +Captures one structured log line + Prometheus sample set per LLM call, +covering every feature (deriver, dialectic, dream, summary). Designed to +be the *only* instrumentation point inside `honcho_llm_call`; subsystem- +specific token counters in `prometheus.metrics` continue to work in parallel. + +Outcome classification distinguishes "the model didn't converge" +(`error_max_iterations`) from "the infra broke" (timeout / validation / +other) so dashboards and alerts can target each independently. +""" + +from __future__ import annotations + +import logging +import re +import time +from collections.abc import Iterator +from contextlib import contextmanager +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any + +from src.exceptions import ValidationException +from src.telemetry.prometheus import LLMCallOutcome, prometheus_metrics + +if TYPE_CHECKING: + from src.config import ModelConfig + from src.llm.types import HonchoLLMCallResponse + +logger = logging.getLogger("honcho.llm.call") + + +# Track names used by callers, mapped to clean Prom label values. Anything +# not in the table falls through `_normalize_track_name` (lower + underscores). +_FEATURE_LABEL_MAP: dict[str, str] = { + "Minimal Deriver": "deriver", + "Dialectic Agent": "dialectic", + "Dialectic Agent Stream": "dialectic_stream", +} + + +def normalize_feature_label(track_name: str | None, trace_name: str | None) -> str: + """Map caller's `track_name`/`trace_name` to a low-cardinality Prom label. + + Prefers explicit `_FEATURE_LABEL_MAP` matches, then snake-cases the + track_name (e.g. ``"Dreamer/deduction"`` → ``"dream_deduction"``), + falling back to trace_name, then to ``"unknown"``. 
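+
+    Illustrative mappings (these follow from the table and the regex below;
+    the trace names are made up):
+
+        normalize_feature_label("Minimal Deriver", None)  # -> "deriver"
+        normalize_feature_label(None, "Summary Queue")    # -> "summary_queue"
+        normalize_feature_label(None, None)               # -> "unknown"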
+ """ + if track_name and track_name in _FEATURE_LABEL_MAP: + return _FEATURE_LABEL_MAP[track_name] + raw = track_name or trace_name + if not raw: + return "unknown" + # "Dreamer/Deduction" → "dreamer_deduction" → "dream_deduction" + s = re.sub(r"[^A-Za-z0-9]+", "_", raw).strip("_").lower() + s = s.replace("dreamer_", "dream_") + return s or "unknown" + + +@dataclass +class _CallState: + """Mutable observation collected across the lifetime of one call.""" + + feature: str + primary_provider: str + primary_model: str + has_backup: bool + backup_provider: str | None = None + backup_model: str | None = None + started_at: float = field(default_factory=time.monotonic) + final_provider: str = "" + final_model: str = "" + attempts: int = 1 + iterations: int | None = None + input_tokens: int = 0 + output_tokens: int = 0 + cache_creation_input_tokens: int = 0 + cache_read_input_tokens: int = 0 + tool_calls: int = 0 + used_backup: bool = False + outcome: LLMCallOutcome = LLMCallOutcome.SUCCESS + error_class: str | None = None + + +def _classify_error(exc: BaseException) -> LLMCallOutcome: + """Map an exception to a coarse outcome bucket.""" + if isinstance(exc, ValidationException): + return LLMCallOutcome.ERROR_VALIDATION + if isinstance(exc, TimeoutError): + return LLMCallOutcome.ERROR_TIMEOUT + name = type(exc).__name__.lower() + msg = str(exc).lower() + if "timeout" in name or "timed out" in msg or "timeout" in msg: + return LLMCallOutcome.ERROR_TIMEOUT + if "pydantic" in name and "validation" in name: + return LLMCallOutcome.ERROR_VALIDATION + return LLMCallOutcome.ERROR_OTHER + + +@contextmanager +def observe_llm_call( + *, + track_name: str | None, + trace_name: str | None, + runtime_model_config: ModelConfig, +) -> Iterator[_CallState]: + """Wrap one `honcho_llm_call` invocation with metrics + structured logging. + + The caller mutates the yielded `_CallState` (sets `attempts`, `iterations`, + accumulates tokens, etc.) over the call's lifetime; on exit we emit the + Prometheus samples and a single logfmt line. Metric errors are swallowed + inside `prometheus_metrics`; this wrapper never raises. + """ + fb = runtime_model_config.fallback + state = _CallState( + feature=normalize_feature_label(track_name, trace_name), + primary_provider=str(runtime_model_config.transport), + primary_model=str(runtime_model_config.model), + has_backup=fb is not None, + backup_provider=str(fb.transport) if fb is not None else None, + backup_model=str(fb.model) if fb is not None else None, + ) + # Default the "winning" provider/model to primary; the caller updates + # these post-success based on the actual AttemptPlan that returned. + state.final_provider = state.primary_provider + state.final_model = state.primary_model + + try: + yield state + except BaseException as exc: + state.outcome = _classify_error(exc) + state.error_class = type(exc).__name__ + _emit(state) + raise + else: + # Success — caller is responsible for setting iterations / final + # provider / outcome (via `finalize_success` below) before exit. + _emit(state) + + +def finalize_success( + state: _CallState, + *, + response: HonchoLLMCallResponse[Any] | None, + final_provider: str | None, + final_model: str | None, + attempts: int, + iterations: int | None, + has_backup: bool, +) -> None: + """Populate `state` from a successful response and pick the outcome bucket. + + Called by `honcho_llm_call` right before the context manager exits when + no exception was raised. `iterations` is None for tool-less calls. 
+ """ + state.attempts = max(1, attempts) + state.iterations = iterations + if final_provider: + state.final_provider = final_provider + if final_model: + state.final_model = final_model + state.used_backup = ( + has_backup + and state.final_model != state.primary_model + and state.backup_model is not None + and state.final_model == state.backup_model + ) + if response is not None: + state.input_tokens = int(response.input_tokens or 0) + state.output_tokens = int(response.output_tokens or 0) + state.cache_creation_input_tokens = int( + response.cache_creation_input_tokens or 0 + ) + state.cache_read_input_tokens = int(response.cache_read_input_tokens or 0) + state.tool_calls = len(response.tool_calls_made or []) + + if state.used_backup: + state.outcome = LLMCallOutcome.SUCCESS_VIA_BACKUP + elif state.attempts > 1: + state.outcome = LLMCallOutcome.SUCCESS_AFTER_RETRY + else: + state.outcome = LLMCallOutcome.SUCCESS + + +def mark_max_iterations(state: _CallState, iterations: int) -> None: + """Mark a tool-loop call that hit `max_tool_iterations` without converging. + + Called when execute_tool_loop returns from the synthesis fallback path + rather than from natural convergence. The call still returned content, + but the model didn't decide to stop on its own — different reliability + signal than a clean success. + """ + state.iterations = iterations + state.outcome = LLMCallOutcome.ERROR_MAX_ITERATIONS + + +def _emit(state: _CallState) -> None: + duration = time.monotonic() - state.started_at + outcome_value = state.outcome.value + + prometheus_metrics.record_llm_call( + feature=state.feature, + provider=state.final_provider, + model=state.final_model, + outcome=outcome_value, + duration_seconds=duration, + ) + if state.input_tokens: + prometheus_metrics.record_llm_tokens( + feature=state.feature, + provider=state.final_provider, + model=state.final_model, + token_type="input", + count=state.input_tokens, + ) + if state.output_tokens: + prometheus_metrics.record_llm_tokens( + feature=state.feature, + provider=state.final_provider, + model=state.final_model, + token_type="output", + count=state.output_tokens, + ) + if state.cache_read_input_tokens: + prometheus_metrics.record_llm_tokens( + feature=state.feature, + provider=state.final_provider, + model=state.final_model, + token_type="cache_read", + count=state.cache_read_input_tokens, + ) + if state.cache_creation_input_tokens: + prometheus_metrics.record_llm_tokens( + feature=state.feature, + provider=state.final_provider, + model=state.final_model, + token_type="cache_creation", + count=state.cache_creation_input_tokens, + ) + if state.iterations is not None: + prometheus_metrics.observe_llm_iterations( + feature=state.feature, + outcome=outcome_value, + iterations=state.iterations, + ) + if state.used_backup and state.backup_provider and state.backup_model: + prometheus_metrics.record_llm_backup_used( + feature=state.feature, + primary_provider=state.primary_provider, + primary_model=state.primary_model, + backup_provider=state.backup_provider, + backup_model=state.backup_model, + ) + + # One structured logfmt line per call. Quote-free values keep `| logfmt` + # parsing in Loki/Grafana straightforward. 
+ iter_value = state.iterations if state.iterations is not None else 0 + err_part = f" error_class={state.error_class}" if state.error_class else "" + line = ( + f"honcho.llm.call feature={state.feature}" + f" provider={state.final_provider} model={state.final_model}" + f" outcome={outcome_value} latency_ms={int(duration * 1000)}" + f" attempts={state.attempts}" + f" used_backup={'true' if state.used_backup else 'false'}" + f" input_tokens={state.input_tokens}" + f" output_tokens={state.output_tokens}" + f" cache_read_tokens={state.cache_read_input_tokens}" + f" cache_creation_tokens={state.cache_creation_input_tokens}" + f" tool_calls={state.tool_calls} iterations={iter_value}" + f"{err_part}" + ) + logger.info(line) + + +__all__ = [ + "finalize_success", + "mark_max_iterations", + "normalize_feature_label", + "observe_llm_call", +] From 873e5e34bb6513eca58398af96fbcfa4ca9311d1 Mon Sep 17 00:00:00 2001 From: Jonathan Irvin Date: Sun, 3 May 2026 21:31:51 -0500 Subject: [PATCH 4/5] feat(llm): emit per-tool-call metrics inside the tool execution loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds prometheus_metrics.record_llm_tool_call() calls in both the success and error branches of execute_tool_loop's per-tool dispatch. Threads track_name / trace_name through the function signature so the emitted metric carries the same feature label that the call-level metrics will use. Both new params default to None (current callers don't pass them yet), so feature label resolves to "unknown" until honcho_llm_call is wired in the next commit. Metric emission is wrapped in PrometheusMetrics' sentry-captured error handler — a metric bug can never break a real tool call. --- src/llm/tool_loop.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/llm/tool_loop.py b/src/llm/tool_loop.py index 8d29aa03..a13b553a 100644 --- a/src/llm/tool_loop.py +++ b/src/llm/tool_loop.py @@ -20,6 +20,8 @@ from src.config import ModelTransport from src.exceptions import ValidationException +from src.telemetry.llm_call_metrics import normalize_feature_label +from src.telemetry.prometheus import prometheus_metrics from src.utils.types import set_current_iteration from .executor import honcho_llm_call_inner @@ -166,6 +168,8 @@ async def execute_tool_loop( before_retry_callback: Callable[[Any], None], stream_final: bool = False, iteration_callback: IterationCallback | None = None, + track_name: str | None = None, + trace_name: str | None = None, ) -> HonchoLLMCallResponse[Any] | StreamingResponseWithMetadata: """Run the iterative tool calling loop for agentic LLM interactions. 
@@ -188,6 +192,8 @@ async def execute_tool_loop( + f"got {max_tool_iterations}" ) + feature_label = normalize_feature_label(track_name, trace_name) + conversation_messages: list[dict[str, Any]] = ( messages.copy() if messages else [{"role": "user", "content": prompt}] ) @@ -351,6 +357,11 @@ async def _call_with_messages( "tool_result": tool_result, } ) + prometheus_metrics.record_llm_tool_call( + feature=feature_label, + tool_name=tool_name, + outcome="success", + ) except Exception as e: logger.error(f"Tool execution failed for {tool_name}: {e}") tool_results.append( @@ -361,6 +372,11 @@ async def _call_with_messages( "is_error": True, } ) + prometheus_metrics.record_llm_tool_call( + feature=feature_label, + tool_name=tool_name, + outcome="error", + ) append_tool_results(current_provider, tool_results, conversation_messages) From 11cd7c2fa9d41237b1576410bcb272fb9e65d1fe Mon Sep 17 00:00:00 2001 From: Jonathan Irvin Date: Sun, 3 May 2026 21:32:11 -0500 Subject: [PATCH 5/5] feat(llm): wire observe_llm_call into honcho_llm_call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wraps the body of honcho_llm_call (both tool-less and tool-loop paths) in observe_llm_call(...) so every invocation produces one set of Prometheus samples and one logfmt log line. Captures the AttemptPlan that produced the most-recent (and on success, the winning) call via a `last_plan` cell updated inside _get_attempt_plan, so the recorded provider/model is the one that actually answered — primary on early attempts, backup on the final retry. This makes backup-on-final-attempt observable directly from llm_calls / llm_tokens without parsing logs. Passes track_name and trace_name through to execute_tool_loop so its per-tool counter (added in the previous commit) carries the same feature label as the call-level metrics. When the tool loop returns response.hit_max_iterations=True, the call's outcome is overridden to error_max_iterations via mark_max_iterations so dashboards can split "model didn't converge" from clean success without the tool-loop having to know about outcome semantics. Streaming responses don't carry token counts at the entry point — the recorded call still emits but token counters skip those rows (record_llm_tokens silently no-ops on count<=0). Acceptable partial signal until streaming refactor surfaces tokens earlier. ruff + basedpyright clean. End-to-end smoke verified all six series fire correctly across success, success_via_backup, error_max_iterations, error_timeout, and tool-call paths. --- src/llm/api.py | 140 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 97 insertions(+), 43 deletions(-) diff --git a/src/llm/api.py b/src/llm/api.py index 2a8f0529..78220374 100644 --- a/src/llm/api.py +++ b/src/llm/api.py @@ -21,6 +21,11 @@ from src.config import ConfiguredModelSettings, ModelConfig from src.exceptions import ValidationException +from src.telemetry.llm_call_metrics import ( + finalize_success, + mark_max_iterations, + observe_llm_call, +) from src.telemetry.logging import conditional_observe from src.telemetry.reasoning_traces import log_reasoning_trace @@ -193,6 +198,11 @@ async def honcho_llm_call( # tenacity uses 1-indexed attempts. current_attempt.set(1) + # Captures the AttemptPlan that produced the most recent (and on success, + # the winning) call so observability can label by the model that actually + # answered — primary on early attempts, backup on the final retry. 
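+    # (A one-element dict rather than a bare local: the nested
+    # _get_attempt_plan closure mutates the cell, avoiding `nonlocal`.)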
+ last_plan: dict[str, AttemptPlan | None] = {"value": None} + def _get_attempt_plan() -> AttemptPlan: plan = plan_attempt( runtime_model_config=runtime_model_config, @@ -201,6 +211,7 @@ def _get_attempt_plan() -> AttemptPlan: call_thinking_budget_tokens=thinking_budget_tokens, call_reasoning_effort=reasoning_effort, ) + last_plan["value"] = plan update_current_langfuse_observation( plan.provider, plan.model, @@ -304,11 +315,92 @@ def _trace_stop_seqs() -> list[str] | None: stop_seqs if stop_seqs is not None else runtime_model_config.stop_sequences ) - # Tool-less path: call once and return. - if not tools or not tool_executor: - result: ( - HonchoLLMCallResponse[Any] | AsyncIterator[HonchoLLMCallStreamChunk] - ) = await decorated() + with observe_llm_call( + track_name=track_name, + trace_name=trace_name, + runtime_model_config=runtime_model_config, + ) as obs_state: + # Tool-less path: call once and return. + if not tools or not tool_executor: + result: ( + HonchoLLMCallResponse[Any] | AsyncIterator[HonchoLLMCallStreamChunk] + ) = await decorated() + response_for_metrics = ( + result if isinstance(result, HonchoLLMCallResponse) else None + ) + winning = last_plan["value"] + finalize_success( + obs_state, + response=response_for_metrics, + final_provider=str(winning.provider) if winning else None, + final_model=winning.model if winning else None, + attempts=current_attempt.get(), + iterations=None, + has_backup=runtime_model_config.fallback is not None, + ) + if trace_name and isinstance(result, HonchoLLMCallResponse): + log_reasoning_trace( + task_type=trace_name, + model_config=runtime_model_config, + prompt=prompt, + response=result, + max_tokens=max_tokens, + thinking_budget_tokens=_trace_thinking_budget(), + reasoning_effort=_trace_reasoning_effort(), + json_mode=json_mode, + stop_seqs=_trace_stop_seqs(), + messages=messages, + ) + return result + + # execute_tool_loop raises ValidationException on out-of-range + # max_tool_iterations; fail-fast is cheaper than silent clamping here. 
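+        # track_name/trace_name flow through so the per-tool counters added
+        # in the previous commit share this call's feature label.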
+ result = await execute_tool_loop( + prompt=prompt, + max_tokens=max_tokens, + messages=messages, + tools=tools, + tool_choice=tool_choice, + tool_executor=tool_executor, + max_tool_iterations=max_tool_iterations, + response_model=response_model, + json_mode=json_mode, + temperature=temperature, + stop_seqs=stop_seqs, + verbosity=verbosity, + enable_retry=enable_retry, + retry_attempts=retry_attempts, + max_input_tokens=max_input_tokens, + get_attempt_plan=_get_attempt_plan, + before_retry_callback=before_retry_callback, + stream_final=stream_final_only, + iteration_callback=iteration_callback, + track_name=track_name, + trace_name=trace_name, + ) + response_for_metrics = ( + result if isinstance(result, HonchoLLMCallResponse) else None + ) + winning = last_plan["value"] + iterations = ( + response_for_metrics.iterations + if response_for_metrics + else (getattr(result, "iterations", None)) + ) + finalize_success( + obs_state, + response=response_for_metrics, + final_provider=str(winning.provider) if winning else None, + final_model=winning.model if winning else None, + attempts=current_attempt.get(), + iterations=iterations, + has_backup=runtime_model_config.fallback is not None, + ) + if response_for_metrics is not None and getattr( + response_for_metrics, "hit_max_iterations", False + ): + mark_max_iterations(obs_state, iterations or max_tool_iterations) + if trace_name and isinstance(result, HonchoLLMCallResponse): log_reasoning_trace( task_type=trace_name, @@ -324,43 +416,5 @@ def _trace_stop_seqs() -> list[str] | None: ) return result - # execute_tool_loop raises ValidationException on out-of-range - # max_tool_iterations; fail-fast is cheaper than silent clamping here. - result = await execute_tool_loop( - prompt=prompt, - max_tokens=max_tokens, - messages=messages, - tools=tools, - tool_choice=tool_choice, - tool_executor=tool_executor, - max_tool_iterations=max_tool_iterations, - response_model=response_model, - json_mode=json_mode, - temperature=temperature, - stop_seqs=stop_seqs, - verbosity=verbosity, - enable_retry=enable_retry, - retry_attempts=retry_attempts, - max_input_tokens=max_input_tokens, - get_attempt_plan=_get_attempt_plan, - before_retry_callback=before_retry_callback, - stream_final=stream_final_only, - iteration_callback=iteration_callback, - ) - if trace_name and isinstance(result, HonchoLLMCallResponse): - log_reasoning_trace( - task_type=trace_name, - model_config=runtime_model_config, - prompt=prompt, - response=result, - max_tokens=max_tokens, - thinking_budget_tokens=_trace_thinking_budget(), - reasoning_effort=_trace_reasoning_effort(), - json_mode=json_mode, - stop_seqs=_trace_stop_seqs(), - messages=messages, - ) - return result - __all__ = ["honcho_llm_call"]