From ddcb72986cd13655a0768b6443be465fd9e21645 Mon Sep 17 00:00:00 2001 From: SentienceDEV Date: Thu, 30 Apr 2026 21:23:57 -0700 Subject: [PATCH] fail fast config for agent --- examples/agent/README.md | 1 + .../planner_executor_strict_fail_fast.py | 154 ++++++++++++++++++ predicate/agents/planner_executor_agent.py | 17 +- tests/unit/test_planner_executor_agent.py | 75 +++++++++ 4 files changed, 245 insertions(+), 2 deletions(-) create mode 100644 examples/agent/planner_executor_strict_fail_fast.py diff --git a/examples/agent/README.md b/examples/agent/README.md index c737716..c1ad9ed 100644 --- a/examples/agent/README.md +++ b/examples/agent/README.md @@ -3,4 +3,5 @@ Predicate agent examples. - `predicate_browser_agent_minimal.py`: minimal `PredicateBrowserAgent` usage. - `predicate_browser_agent_custom_prompt.py`: customize the compact prompt builder. - `predicate_browser_agent_video_recording_playwright.py`: enable Playwright video recording via context options (recommended). +- `planner_executor_strict_fail_fast.py`: demonstrate `PlannerExecutorConfig(strict_fail_fast=True)` vs default retry/replan behavior. diff --git a/examples/agent/planner_executor_strict_fail_fast.py b/examples/agent/planner_executor_strict_fail_fast.py new file mode 100644 index 0000000..faab607 --- /dev/null +++ b/examples/agent/planner_executor_strict_fail_fast.py @@ -0,0 +1,154 @@ +""" +Example: PlannerExecutorAgent strict fail-fast behavior. + +This demo runs the same failing required step in two modes: +- default mode (allows recovery/replan policy) +- strict fail-fast mode (abort immediately on required-step failure) + +Why this example is deterministic: +- We inject a fixed single-step plan. +- We inject a fixed failed step outcome. +- We count whether recovery/replan hooks are reached. + +Usage: + python examples/agent/planner_executor_strict_fail_fast.py +""" + +from __future__ import annotations + +import asyncio + +from predicate.agents import ( + Plan, + PlanStep, + PlannerExecutorAgent, + PlannerExecutorConfig, + PredicateSpec, + RetryConfig, + StepOutcome, + StepStatus, +) +from predicate.llm_provider import LLMProvider, LLMResponse + + +class FixedProvider(LLMProvider): + """Minimal provider used only to satisfy agent construction.""" + + def __init__(self) -> None: + super().__init__(model="fixed-provider") + + def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> LLMResponse: + _ = system_prompt, user_prompt, kwargs + return LLMResponse(content="{}", model_name=self.model_name) + + def supports_json_mode(self) -> bool: + return True + + @property + def model_name(self) -> str: + return "fixed-provider" + + +class DemoRuntime: + """Tiny runtime for the fail-fast demo.""" + + def __init__(self, start_url: str = "https://shop.example.com/search") -> None: + self._url = start_url + + async def get_url(self) -> str: + return self._url + + async def goto(self, url: str) -> None: + self._url = url + + async def stabilize(self) -> None: + return None + + +async def run_demo(strict_fail_fast: bool) -> None: + config = PlannerExecutorConfig( + strict_fail_fast=strict_fail_fast, + retry=RetryConfig(max_replans=1), + auto_fallback_to_stepwise=False, + ) + agent = PlannerExecutorAgent( + planner=FixedProvider(), + executor=FixedProvider(), + config=config, + ) + runtime = DemoRuntime() + + plan = Plan( + task="Open a product details page", + steps=[ + PlanStep( + id=1, + goal="Click a product link", + action="CLICK", + intent="product link", + verify=[PredicateSpec(predicate="url_contains", args=["/product/"])], + required=True, + ) + ], + ) + + failed_step = StepOutcome( + step_id=1, + goal="Click a product link", + status=StepStatus.FAILED, + action_taken="CLICK(1)", + verification_passed=False, + error="verification_failed", + ) + + call_counts = {"recovery": 0, "replan": 0} + + async def fake_plan(*args, **kwargs) -> Plan: + _ = args, kwargs + return plan + + async def fake_execute_step(*args, **kwargs) -> StepOutcome: + _ = args, kwargs + return failed_step + + async def fake_attempt_recovery(*args, **kwargs) -> bool: + _ = args, kwargs + call_counts["recovery"] += 1 + return False + + async def fake_replan(*args, **kwargs) -> Plan: + _ = args, kwargs + call_counts["replan"] += 1 + # Mirror internal replan accounting so the loop exits after one replan. + agent._replans_used += 1 # type: ignore[attr-defined] + return plan + + agent.plan = fake_plan # type: ignore[method-assign] + agent._execute_step = fake_execute_step # type: ignore[method-assign] + agent._attempt_recovery = fake_attempt_recovery # type: ignore[method-assign] + agent.replan = fake_replan # type: ignore[method-assign] + + result = await agent.run( + runtime=runtime, + task="Open a product details page", + start_url="https://shop.example.com", + ) + + mode = "STRICT_FAIL_FAST" if strict_fail_fast else "DEFAULT" + print(f"\n=== {mode} ===") + print(f"success={result.success}") + print(f"error={result.error}") + print(f"steps_completed={result.steps_completed}") + print(f"replans_used={result.replans_used}") + print(f"recovery_calls={call_counts['recovery']}") + print(f"replan_calls={call_counts['replan']}") + + +async def main() -> None: + print("PlannerExecutorAgent strict fail-fast demo") + await run_demo(strict_fail_fast=False) + await run_demo(strict_fail_fast=True) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/predicate/agents/planner_executor_agent.py b/predicate/agents/planner_executor_agent.py index a5f7f09..614912b 100644 --- a/predicate/agents/planner_executor_agent.py +++ b/predicate/agents/planner_executor_agent.py @@ -696,6 +696,11 @@ class PlannerExecutorConfig: # Pre-step verification (skip step if predicates already pass) pre_step_verification: bool = True + # Strict fail-fast mode: + # - required step failures abort the run immediately + # - disables recovery/replan and intra-step fallback recoveries + strict_fail_fast: bool = False + # Scroll-to-find: automatically scroll to find elements when not in viewport scroll_to_find_enabled: bool = True scroll_to_find_max_scrolls: int = 3 # Max scroll attempts per direction @@ -4799,7 +4804,7 @@ async def _execute_step( pass # Ignore snapshot errors # If verification failed and we have optional substeps, try them - if not verification_passed and step.optional_substeps: + if not verification_passed and step.optional_substeps and not self.config.strict_fail_fast: substep_outcomes = await self._execute_optional_substeps( step.optional_substeps, runtime, @@ -4812,7 +4817,11 @@ async def _execute_step( # Fallback: For navigation-causing actions, if URL changed significantly, # consider the action successful even if predicate verification failed. # This handles cases where local LLMs generate imprecise predicates. - if not verification_passed and original_action in ("TYPE_AND_SUBMIT", "CLICK"): + if ( + not verification_passed + and original_action in ("TYPE_AND_SUBMIT", "CLICK") + and not self.config.strict_fail_fast + ): current_url = await runtime.get_url() if hasattr(runtime, "get_url") else None if current_url and pre_url and current_url != pre_url: # Check if this is a meaningful URL change (not just anchor change) @@ -5233,6 +5242,10 @@ async def run( # Handle failure if outcome.status == StepStatus.FAILED and step.required: + if self.config.strict_fail_fast: + error = f"Step {step.id} failed: {outcome.error or 'verification_failed'}" + break + # Check if we've reached an authentication boundary # This is a graceful terminal state - agent did all it could if self.config.auth_boundary.enabled: diff --git a/tests/unit/test_planner_executor_agent.py b/tests/unit/test_planner_executor_agent.py index 7aee77f..68430e6 100644 --- a/tests/unit/test_planner_executor_agent.py +++ b/tests/unit/test_planner_executor_agent.py @@ -789,6 +789,81 @@ def test_page_context_max_chars_customizable(self) -> None: config = PlannerExecutorConfig(use_page_context=True, page_context_max_chars=4000) assert config.page_context_max_chars == 4000 + def test_strict_fail_fast_default_disabled(self) -> None: + config = PlannerExecutorConfig() + assert config.strict_fail_fast is False + + def test_strict_fail_fast_can_be_enabled(self) -> None: + config = PlannerExecutorConfig(strict_fail_fast=True) + assert config.strict_fail_fast is True + + +class TestStrictFailFastBehavior: + """Behavioral tests for strict fail-fast mode.""" + + @pytest.mark.asyncio + async def test_run_aborts_required_failure_without_recovery_or_replan(self) -> None: + from unittest.mock import AsyncMock, MagicMock + + from predicate.agents.planner_executor_agent import ( + PlannerExecutorAgent, + StepOutcome, + StepStatus, + ) + + config = PlannerExecutorConfig(strict_fail_fast=True) + agent = PlannerExecutorAgent( + planner=MockLLMProvider(), + executor=MockLLMProvider(), + config=config, + ) + + plan = Plan( + task="Search for product", + steps=[ + PlanStep( + id=1, + goal="Click product result", + action="CLICK", + intent="product link", + verify=[PredicateSpec(predicate="url_contains", args=["/product"])], + required=True, + ) + ], + ) + + failed_outcome = StepOutcome( + step_id=1, + goal="Click product result", + status=StepStatus.FAILED, + action_taken="CLICK(1)", + verification_passed=False, + error="verification_failed", + ) + + runtime = MagicMock() + runtime.get_url = AsyncMock(return_value="https://shop.example.com/search") + runtime.goto = AsyncMock() + runtime.read_markdown = AsyncMock(return_value=None) + + agent.plan = AsyncMock(return_value=plan) # type: ignore[method-assign] + agent._execute_step = AsyncMock(return_value=failed_outcome) # type: ignore[method-assign] + agent.replan = AsyncMock(side_effect=RuntimeError("should not replan")) # type: ignore[method-assign] + agent._attempt_recovery = AsyncMock(return_value=True) # type: ignore[method-assign] + + outcome = await agent.run( + runtime, + task="Search for product", + start_url="https://shop.example.com", + ) + + assert outcome.success is False + assert outcome.replans_used == 0 + assert outcome.error == "Step 1 failed: verification_failed" + assert len(outcome.step_outcomes) == 1 + agent.replan.assert_not_awaited() # type: ignore[attr-defined] + agent._attempt_recovery.assert_not_awaited() # type: ignore[attr-defined] + # --------------------------------------------------------------------------- # Test PlanStep with optional_substeps