140 changes: 97 additions & 43 deletions src/llm/api.py
@@ -21,6 +21,11 @@

from src.config import ConfiguredModelSettings, ModelConfig
from src.exceptions import ValidationException
from src.telemetry.llm_call_metrics import (
    finalize_success,
    mark_max_iterations,
    observe_llm_call,
)
from src.telemetry.logging import conditional_observe
from src.telemetry.reasoning_traces import log_reasoning_trace

@@ -193,6 +198,11 @@ async def honcho_llm_call(
    # tenacity uses 1-indexed attempts.
    current_attempt.set(1)

    # Captures the AttemptPlan that produced the most recent (and on success,
    # the winning) call so observability can label by the model that actually
    # answered — primary on early attempts, backup on the final retry.
    last_plan: dict[str, AttemptPlan | None] = {"value": None}

    def _get_attempt_plan() -> AttemptPlan:
        plan = plan_attempt(
            runtime_model_config=runtime_model_config,
@@ -201,6 +211,7 @@ def _get_attempt_plan() -> AttemptPlan:
            call_thinking_budget_tokens=thinking_budget_tokens,
            call_reasoning_effort=reasoning_effort,
        )
        last_plan["value"] = plan
        update_current_langfuse_observation(
            plan.provider,
            plan.model,
@@ -304,11 +315,92 @@ def _trace_stop_seqs() -> list[str] | None:
            stop_seqs if stop_seqs is not None else runtime_model_config.stop_sequences
        )

    # Tool-less path: call once and return.
    if not tools or not tool_executor:
        result: (
            HonchoLLMCallResponse[Any] | AsyncIterator[HonchoLLMCallStreamChunk]
        ) = await decorated()
    with observe_llm_call(
        track_name=track_name,
        trace_name=trace_name,
        runtime_model_config=runtime_model_config,
    ) as obs_state:
        # Tool-less path: call once and return.
        if not tools or not tool_executor:
            result: (
                HonchoLLMCallResponse[Any] | AsyncIterator[HonchoLLMCallStreamChunk]
            ) = await decorated()
            response_for_metrics = (
                result if isinstance(result, HonchoLLMCallResponse) else None
            )
            winning = last_plan["value"]
            finalize_success(
                obs_state,
                response=response_for_metrics,
                final_provider=str(winning.provider) if winning else None,
                final_model=winning.model if winning else None,
                attempts=current_attempt.get(),
                iterations=None,
                has_backup=runtime_model_config.fallback is not None,
            )
            if trace_name and isinstance(result, HonchoLLMCallResponse):
                log_reasoning_trace(
                    task_type=trace_name,
                    model_config=runtime_model_config,
                    prompt=prompt,
                    response=result,
                    max_tokens=max_tokens,
                    thinking_budget_tokens=_trace_thinking_budget(),
                    reasoning_effort=_trace_reasoning_effort(),
                    json_mode=json_mode,
                    stop_seqs=_trace_stop_seqs(),
                    messages=messages,
                )
            return result

        # execute_tool_loop raises ValidationException on out-of-range
        # max_tool_iterations; fail-fast is cheaper than silent clamping here.
        result = await execute_tool_loop(
            prompt=prompt,
            max_tokens=max_tokens,
            messages=messages,
            tools=tools,
            tool_choice=tool_choice,
            tool_executor=tool_executor,
            max_tool_iterations=max_tool_iterations,
            response_model=response_model,
            json_mode=json_mode,
            temperature=temperature,
            stop_seqs=stop_seqs,
            verbosity=verbosity,
            enable_retry=enable_retry,
            retry_attempts=retry_attempts,
            max_input_tokens=max_input_tokens,
            get_attempt_plan=_get_attempt_plan,
            before_retry_callback=before_retry_callback,
            stream_final=stream_final_only,
            iteration_callback=iteration_callback,
            track_name=track_name,
            trace_name=trace_name,
        )
        response_for_metrics = (
            result if isinstance(result, HonchoLLMCallResponse) else None
        )
        winning = last_plan["value"]
        iterations = (
            response_for_metrics.iterations
            if response_for_metrics
            else getattr(result, "iterations", None)
        )
        finalize_success(
            obs_state,
            response=response_for_metrics,
            final_provider=str(winning.provider) if winning else None,
            final_model=winning.model if winning else None,
            attempts=current_attempt.get(),
            iterations=iterations,
            has_backup=runtime_model_config.fallback is not None,
        )
        if response_for_metrics is not None and getattr(
            response_for_metrics, "hit_max_iterations", False
        ):
            mark_max_iterations(obs_state, iterations or max_tool_iterations)

        if trace_name and isinstance(result, HonchoLLMCallResponse):
            log_reasoning_trace(
                task_type=trace_name,
@@ -324,43 +416,5 @@ def _trace_stop_seqs() -> list[str] | None:
            )
        return result

    # execute_tool_loop raises ValidationException on out-of-range
    # max_tool_iterations; fail-fast is cheaper than silent clamping here.
    result = await execute_tool_loop(
        prompt=prompt,
        max_tokens=max_tokens,
        messages=messages,
        tools=tools,
        tool_choice=tool_choice,
        tool_executor=tool_executor,
        max_tool_iterations=max_tool_iterations,
        response_model=response_model,
        json_mode=json_mode,
        temperature=temperature,
        stop_seqs=stop_seqs,
        verbosity=verbosity,
        enable_retry=enable_retry,
        retry_attempts=retry_attempts,
        max_input_tokens=max_input_tokens,
        get_attempt_plan=_get_attempt_plan,
        before_retry_callback=before_retry_callback,
        stream_final=stream_final_only,
        iteration_callback=iteration_callback,
    )
    if trace_name and isinstance(result, HonchoLLMCallResponse):
        log_reasoning_trace(
            task_type=trace_name,
            model_config=runtime_model_config,
            prompt=prompt,
            response=result,
            max_tokens=max_tokens,
            thinking_budget_tokens=_trace_thinking_budget(),
            reasoning_effort=_trace_reasoning_effort(),
            json_mode=json_mode,
            stop_seqs=_trace_stop_seqs(),
            messages=messages,
        )
    return result


__all__ = ["honcho_llm_call"]
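
For readers without access to `src/telemetry/llm_call_metrics.py` (it is not part of this diff), here is a minimal sketch of the contract that `api.py` relies on above. The `ObservationState` shape, label names, and timing logic are assumptions for illustration only, not the real implementation; the context-manager design means a raised exception still closes the observation window, which is why success labels are applied only by an explicit `finalize_success` call.

```python
import time
from contextlib import contextmanager
from dataclasses import dataclass, field
from typing import Any, Iterator


@dataclass
class ObservationState:
    """Mutable bag of labels for one logical LLM call (assumed shape)."""

    track_name: str | None
    trace_name: str | None
    started_at: float = field(default_factory=time.monotonic)
    labels: dict[str, Any] = field(default_factory=dict)


@contextmanager
def observe_llm_call(
    track_name: str | None,
    trace_name: str | None,
    runtime_model_config: Any,
) -> Iterator[ObservationState]:
    # Open an observation window; the caller fills in the outcome via
    # finalize_success / mark_max_iterations before the block exits.
    state = ObservationState(track_name=track_name, trace_name=trace_name)
    try:
        yield state
    except Exception:
        # The caller never reached finalize_success; label and re-raise.
        state.labels["outcome"] = "error"
        raise


def finalize_success(
    state: ObservationState,
    *,
    response: Any | None,
    final_provider: str | None,
    final_model: str | None,
    attempts: int,
    iterations: int | None,
    has_backup: bool,
) -> None:
    # Label by the model that actually answered, not the one first planned.
    state.labels.update(
        outcome="success",
        provider=final_provider,
        model=final_model,
        attempts=attempts,
        iterations=iterations,
        has_backup=has_backup,
        duration_s=time.monotonic() - state.started_at,
    )


def mark_max_iterations(state: ObservationState, iterations: int) -> None:
    # The tool loop exited via the synthesis path rather than by choice.
    state.labels["hit_max_iterations"] = True
    state.labels["iterations_at_cap"] = iterations
```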
17 changes: 17 additions & 0 deletions src/llm/tool_loop.py
@@ -20,6 +20,8 @@

from src.config import ModelTransport
from src.exceptions import ValidationException
from src.telemetry.llm_call_metrics import normalize_feature_label
from src.telemetry.prometheus import prometheus_metrics
from src.utils.types import set_current_iteration

from .executor import honcho_llm_call_inner
@@ -166,6 +168,8 @@ async def execute_tool_loop(
    before_retry_callback: Callable[[Any], None],
    stream_final: bool = False,
    iteration_callback: IterationCallback | None = None,
    track_name: str | None = None,
    trace_name: str | None = None,
) -> HonchoLLMCallResponse[Any] | StreamingResponseWithMetadata:
    """Run the iterative tool calling loop for agentic LLM interactions.

@@ -188,6 +192,8 @@
+ f"got {max_tool_iterations}"
)

feature_label = normalize_feature_label(track_name, trace_name)

conversation_messages: list[dict[str, Any]] = (
messages.copy() if messages else [{"role": "user", "content": prompt}]
)
@@ -351,6 +357,11 @@ async def _call_with_messages(
"tool_result": tool_result,
}
)
prometheus_metrics.record_llm_tool_call(
feature=feature_label,
tool_name=tool_name,
outcome="success",
)
except Exception as e:
logger.error(f"Tool execution failed for {tool_name}: {e}")
tool_results.append(
@@ -361,6 +372,11 @@ async def _call_with_messages(
"is_error": True,
}
)
prometheus_metrics.record_llm_tool_call(
feature=feature_label,
tool_name=tool_name,
outcome="error",
)

append_tool_results(current_provider, tool_results, conversation_messages)

@@ -470,6 +486,7 @@ async def _final_call() -> HonchoLLMCallResponse[Any]:
    final_response = await final_call_func()
    final_response.tool_calls_made = all_tool_calls
    final_response.iterations = iteration + 1
    final_response.hit_max_iterations = True
    final_response.input_tokens = total_input_tokens + final_response.input_tokens
    final_response.output_tokens = total_output_tokens + final_response.output_tokens
    final_response.cache_creation_input_tokens = (
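
Neither `normalize_feature_label` nor `prometheus_metrics.record_llm_tool_call` is shown in this diff. A plausible sketch, assuming the label prefers `track_name` over `trace_name` with a fixed fallback, and that tool outcomes feed a labeled Prometheus counter; the metric name below is hypothetical:

```python
from prometheus_client import Counter

# Hypothetical metric; the real registry and name live in src/telemetry/prometheus.py.
_LLM_TOOL_CALLS = Counter(
    "llm_tool_calls_total",
    "Tool invocations made inside the LLM tool loop",
    labelnames=("feature", "tool_name", "outcome"),
)


def normalize_feature_label(track_name: str | None, trace_name: str | None) -> str:
    # Prefer the explicit track name, fall back to the trace name, and pin a
    # fixed default so Prometheus label cardinality stays bounded.
    return track_name or trace_name or "unknown"


def record_llm_tool_call(*, feature: str, tool_name: str, outcome: str) -> None:
    # outcome is "success" or "error", mirroring the two call sites above.
    _LLM_TOOL_CALLS.labels(feature=feature, tool_name=tool_name, outcome=outcome).inc()
```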
3 changes: 3 additions & 0 deletions src/llm/types.py
@@ -66,6 +66,9 @@ class HonchoLLMCallResponse(BaseModel, Generic[T]):
    tool_calls_made: list[dict[str, Any]] = Field(default_factory=list)
    iterations: int = 0
    """Number of LLM calls made in the tool execution loop."""
    hit_max_iterations: bool = False
    """True when the tool loop exited via the max-iterations synthesis path
    rather than the model deciding to stop. Telemetry-only signal."""
    thinking_content: str | None = None
    # Full thinking blocks with signatures for multi-turn replay (Anthropic only).
    thinking_blocks: list[dict[str, Any]] = Field(default_factory=list)
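
A self-contained illustration of the new flag's contract; the stand-in model below is not the real `HonchoLLMCallResponse`, it only mirrors the two fields relevant here:

```python
from pydantic import BaseModel


class _ResponseSketch(BaseModel):
    # Mirrors only the two HonchoLLMCallResponse fields discussed here.
    iterations: int = 0
    hit_max_iterations: bool = False


# Normal exit: the model decided to stop, so the flag stays False.
resp = _ResponseSketch(iterations=3)
assert resp.hit_max_iterations is False

# Synthesis exit: tool_loop.py flips the flag when the iteration cap is hit.
resp = _ResponseSketch(iterations=5, hit_max_iterations=True)
if resp.hit_max_iterations:
    # Telemetry-only: api.py records mark_max_iterations but never branches
    # business logic on this flag.
    print(f"tool loop was cut off after {resp.iterations} iterations")
```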