From 8477420f924ec5d04c7199d4997e9b1c81339b33 Mon Sep 17 00:00:00 2001 From: Behnam Ousat Date: Wed, 6 May 2026 15:17:53 -0700 Subject: [PATCH 01/10] refactor default scenario targets --- .env_example | 5 + pyrit/scenario/core/__init__.py | 5 +- pyrit/scenario/core/scenario.py | 42 +++++++-- .../scenario/core/scenario_target_defaults.py | 91 +++++++++++++++++++ pyrit/scenario/core/scenario_techniques.py | 41 +-------- pyrit/scenario/scenarios/airt/cyber.py | 49 +--------- pyrit/scenario/scenarios/airt/jailbreak.py | 4 +- pyrit/scenario/scenarios/airt/leakage.py | 71 +-------------- pyrit/scenario/scenarios/airt/psychosocial.py | 22 +---- pyrit/scenario/scenarios/airt/scam.py | 66 +------------- .../setup/initializers/components/targets.py | 9 ++ tests/unit/scenario/test_jailbreak.py | 12 ++- tests/unit/scenario/test_rapid_response.py | 2 +- tests/unit/scenario/test_scenario.py | 16 +++- 14 files changed, 183 insertions(+), 252 deletions(-) create mode 100644 pyrit/scenario/core/scenario_target_defaults.py diff --git a/.env_example b/.env_example index fdf9d715e1..b925fb097c 100644 --- a/.env_example +++ b/.env_example @@ -79,6 +79,11 @@ ADVERSARIAL_CHAT_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1" ADVERSARIAL_CHAT_KEY="xxxxx" ADVERSARIAL_CHAT_MODEL="deployment-name" +# Objective Scorer chat target (used in scorers in scenarios) +OBJECTIVE_SCORER_CHAT_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1" +OBJECTIVE_SCORER_CHAT_KEY="xxxxx" +OBJECTIVE_SCORER_CHAT_MODEL="deployment-name" + AZURE_FOUNDRY_DEEPSEEK_ENDPOINT="https://xxxxx.eastus2.models.ai.azure.com" AZURE_FOUNDRY_DEEPSEEK_KEY="xxxxx" AZURE_FOUNDRY_DEEPSEEK_MODEL="" diff --git a/pyrit/scenario/core/__init__.py b/pyrit/scenario/core/__init__.py index c470d31c29..5442c2dd2b 100644 --- a/pyrit/scenario/core/__init__.py +++ b/pyrit/scenario/core/__init__.py @@ -10,9 +10,9 @@ from pyrit.scenario.core.dataset_configuration import EXPLICIT_SEED_GROUPS_KEY, DatasetConfiguration from 
pyrit.scenario.core.scenario import Scenario from pyrit.scenario.core.scenario_strategy import ScenarioCompositeStrategy, ScenarioStrategy +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target, get_default_scorer_target from pyrit.scenario.core.scenario_techniques import ( SCENARIO_TECHNIQUES, - get_default_adversarial_target, register_scenario_techniques, ) @@ -27,6 +27,7 @@ "Scenario", "ScenarioCompositeStrategy", "ScenarioStrategy", - "get_default_adversarial_target", "register_scenario_techniques", + "get_default_scorer_target", + "get_default_adversarial_target", ] diff --git a/pyrit/scenario/core/scenario.py b/pyrit/scenario/core/scenario.py index 38a450cec4..145abba8ba 100644 --- a/pyrit/scenario/core/scenario.py +++ b/pyrit/scenario/core/scenario.py @@ -16,6 +16,7 @@ import uuid from abc import ABC, abstractmethod from collections.abc import Sequence +from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union, cast, get_origin from tqdm.auto import tqdm @@ -27,14 +28,20 @@ from pyrit.memory.memory_models import ScenarioResultEntry from pyrit.models import AttackResult, SeedAttackGroup from pyrit.models.scenario_result import ScenarioIdentifier, ScenarioResult -from pyrit.prompt_target import OpenAIChatTarget, PromptTarget +from pyrit.prompt_target import PromptTarget from pyrit.prompt_target.common.target_requirements import TargetRequirements -from pyrit.registry import ScorerRegistry +from pyrit.registry.object_registries.scorer_registry import ScorerRegistry from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario_strategy import ScenarioStrategy -from pyrit.score import Scorer, SelfAskRefusalScorer, TrueFalseInverterScorer, TrueFalseScorer +from pyrit.scenario.core.scenario_target_defaults import 
get_default_scorer_target +from pyrit.score import Scorer, TrueFalseScorer +from pyrit.score.true_false.self_ask_refusal_scorer import SelfAskRefusalScorer +from pyrit.score.true_false.self_ask_true_false_scorer import SelfAskTrueFalseScorer +from pyrit.score.true_false.true_false_composite_scorer import TrueFalseCompositeScorer +from pyrit.score.true_false.true_false_inverter_scorer import TrueFalseInverterScorer +from pyrit.score.true_false.true_false_score_aggregator import TrueFalseScoreAggregator if TYPE_CHECKING: from pyrit.executor.attack.core.attack_config import AttackScoringConfig @@ -107,6 +114,11 @@ class Scenario(ABC): #: what the scenario needs. Validated in ``initialize_async`` once the target is supplied. TARGET_REQUIREMENTS: ClassVar[TargetRequirements] = TargetRequirements() + #: Optional true/false question prompt path for objective scoring. + #: When set, the default objective scorer becomes + #: ``SelfAskTrueFalseScorer(path) AND NOT(SelfAskRefusalScorer)``. + OBJECTIVE_TRUE_FALSE_QUESTION_PATH: ClassVar[Path | None] = None + def __init__( self, *, @@ -310,17 +322,27 @@ def _build_display_group(self, *, technique_name: str, seed_group_name: str) -> return technique_name def _get_default_objective_scorer(self) -> TrueFalseScorer: - # Deferred import to avoid circular dependency: + if type(self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH is not None: + chat_target = get_default_scorer_target() + objective_scorer = SelfAskTrueFalseScorer( + chat_target=chat_target, + true_false_question_path=type(self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH, + ) + backstop_scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) + return TrueFalseCompositeScorer( + aggregator=TrueFalseScoreAggregator.AND, + scorers=[objective_scorer, backstop_scorer], + ) + + # Deferred import to avoid circular dependency. 
from pyrit.setup.initializers.components.scorers import ScorerInitializerTags entries = ScorerRegistry.get_registry_singleton().get_by_tag(tag=ScorerInitializerTags.DEFAULT_OBJECTIVE_SCORER) if entries and isinstance(entries[0].instance, TrueFalseScorer): - scorer = entries[0].instance - logger.info(f"Using registered default objective scorer: {type(scorer).__name__}") - return scorer - scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=OpenAIChatTarget())) - logger.info(f"No registered default objective scorer found, using fallback: {type(scorer).__name__}") - return scorer + return entries[0].instance + + chat_target = get_default_scorer_target() + return TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) def set_params_from_args(self, *, args: dict[str, Any]) -> None: """ diff --git a/pyrit/scenario/core/scenario_target_defaults.py b/pyrit/scenario/core/scenario_target_defaults.py new file mode 100644 index 0000000000..bc6fe084ae --- /dev/null +++ b/pyrit/scenario/core/scenario_target_defaults.py @@ -0,0 +1,91 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget +from pyrit.prompt_target.common.target_capabilities import CapabilityName +from pyrit.registry import TargetRegistry + + +def get_default_scorer_target() -> PromptChatTarget: + """ + Resolve the default objective scorer chat target. + + First checks the ``TargetRegistry`` for an ``"objective_scorer_chat"`` entry + (populated by ``TargetInitializer`` from ``OBJECTIVE_SCORER_CHAT_*`` env vars). + Falls back to a plain ``OpenAIChatTarget``. + + Returns: + PromptChatTarget: The resolved objective scorer chat target. + + Raises: + ValueError: If the registry entry exists but is not a PromptChatTarget. 
+ """ + return _get_default_chat_target(preferred_target_key="objective_scorer_chat") + + +def get_default_adversarial_target() -> PromptChatTarget: + """ + Resolve the default adversarial chat target. + + First checks the ``TargetRegistry`` for an ``"adversarial_chat"`` entry + (populated by ``TargetInitializer`` from ``ADVERSARIAL_CHAT_*`` env vars). + Falls back to a plain ``OpenAIChatTarget`` with ``temperature=1.2``. + + Returns: + PromptChatTarget: The resolved adversarial chat target. + + Raises: + ValueError: If the registered target does not support multi-turn. + """ + return _get_default_chat_target( + preferred_target_key="adversarial_chat", + required_capabilities={CapabilityName.MULTI_TURN}, + fallback_temperature=1.2, + ) + + +def _get_default_chat_target( + *, + preferred_target_key: str, + required_capabilities: set[CapabilityName] | None = None, + fallback_temperature: float | None = None, +) -> PromptChatTarget: + """ + Resolve a chat target from TargetRegistry with configurable fallback behavior. + + Resolution order: + 1. ``preferred_target_key`` entry from ``TargetRegistry`` + 2. ``OpenAIChatTarget(...)`` with optional temperature + + Args: + preferred_target_key (str): TargetRegistry key to resolve first. + required_capabilities (set[CapabilityName] | None): Optional capabilities + that a resolved target must support. + fallback_temperature (float | None): Optional temperature for fallback + ``OpenAIChatTarget`` construction. + + Returns: + PromptChatTarget: The resolved chat target. + + Raises: + ValueError: If the resolved target does not satisfy required capabilities. + ValueError: If the registry entry exists but is not a PromptChatTarget. 
+ """ + registry = TargetRegistry.get_registry_singleton() + target = registry.get(preferred_target_key) + if target is not None: + # Check required capabilities first (fail fast) + if required_capabilities: + for capability in required_capabilities: + if not target.capabilities.includes(capability=capability): + raise ValueError(f"Registry entry '{preferred_target_key}' must support {capability.value}.") + + # Then check type + if not isinstance(target, PromptChatTarget): + raise ValueError( + f"Registry entry '{preferred_target_key}' must be a PromptChatTarget, but got {type(target).__name__}" + ) + + return target + + return OpenAIChatTarget(temperature=fallback_temperature) diff --git a/pyrit/scenario/core/scenario_techniques.py b/pyrit/scenario/core/scenario_techniques.py index 818ba8a530..46e723e762 100644 --- a/pyrit/scenario/core/scenario_techniques.py +++ b/pyrit/scenario/core/scenario_techniques.py @@ -22,6 +22,7 @@ import inspect import logging from pathlib import Path +from typing import TYPE_CHECKING from pyrit.common.path import EXECUTOR_SEED_PROMPT_PATH from pyrit.executor.attack import ( @@ -34,13 +35,15 @@ ) from pyrit.models import SeedAttackTechniqueGroup, SeedSimulatedConversation from pyrit.models.seeds.seed_simulated_conversation import NextMessageSystemPromptPaths -from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget -from pyrit.prompt_target.common.target_capabilities import CapabilityName from pyrit.registry import TargetRegistry from pyrit.registry.object_registries.attack_technique_registry import ( AttackTechniqueRegistry, AttackTechniqueSpec, ) +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target + +if TYPE_CHECKING: + from pyrit.prompt_target import PromptChatTarget logger = logging.getLogger(__name__) @@ -99,40 +102,6 @@ ] -# --------------------------------------------------------------------------- -# Default adversarial target -# 
--------------------------------------------------------------------------- - - -def get_default_adversarial_target() -> PromptChatTarget: - """ - Resolve the default adversarial chat target. - - First checks the ``TargetRegistry`` for an ``"adversarial_chat"`` entry - (populated by ``TargetInitializer`` from ``ADVERSARIAL_CHAT_*`` env vars). - Falls back to a plain ``OpenAIChatTarget(temperature=1.2)`` using - ``@apply_defaults`` resolution. - - Returns: - PromptChatTarget: The resolved adversarial chat target. - - Raises: - ValueError: If the registered target does not support multi-turn. - """ - registry = TargetRegistry.get_registry_singleton() - if "adversarial_chat" in registry: - target = registry.get("adversarial_chat") - if target: - if not target.capabilities.includes(capability=CapabilityName.MULTI_TURN): - raise ValueError( - f"Registry entry 'adversarial_chat' must support multi-turn conversations, " - f"but {type(target).__name__} does not." - ) - return target - - return OpenAIChatTarget(temperature=1.2) - - # --------------------------------------------------------------------------- # Runtime spec builder # --------------------------------------------------------------------------- diff --git a/pyrit/scenario/scenarios/airt/cyber.py b/pyrit/scenario/scenarios/airt/cyber.py index 7f0250911a..ac67174790 100644 --- a/pyrit/scenario/scenarios/airt/cyber.py +++ b/pyrit/scenario/scenarios/airt/cyber.py @@ -12,26 +12,16 @@ from __future__ import annotations import logging -import os from typing import TYPE_CHECKING, ClassVar -from pyrit.auth import get_azure_openai_auth from pyrit.common import apply_defaults from pyrit.common.path import SCORER_SEED_PROMPT_PATH -from pyrit.prompt_target import OpenAIChatTarget from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import Scenario -from pyrit.score import ( - SelfAskRefusalScorer, - SelfAskTrueFalseScorer, - TrueFalseCompositeScorer, - 
TrueFalseInverterScorer, - TrueFalseScoreAggregator, - TrueFalseScorer, -) if TYPE_CHECKING: from pyrit.scenario.core.scenario_strategy import ScenarioStrategy + from pyrit.score import TrueFalseScorer logger = logging.getLogger(__name__) @@ -74,6 +64,7 @@ class Cyber(Scenario): """ VERSION: int = 2 + OBJECTIVE_TRUE_FALSE_QUESTION_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "malware.yaml" _cached_strategy_class: ClassVar[type[ScenarioStrategy] | None] = None @classmethod @@ -138,39 +129,3 @@ def __init__( include_default_baseline=include_baseline, scenario_result_id=scenario_result_id, ) - - def _get_default_objective_scorer(self) -> TrueFalseCompositeScorer: - """ - Retrieve the default objective scorer. - - The scoring mechanism is as follows: - 1. Detect for the presence of malware using a SelfAskTrueFalseScorer. - 2. Detect for NON-refusal as a backstop. (If the target model refuses to produce malware, we want that - to register as a negative score). - - Returns: - TrueFalseCompositeScorer: Default objective scorer with backstop and malware evaluation. 
- """ - endpoint = os.getenv("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - presence_of_malware = SelfAskTrueFalseScorer( - chat_target=OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - ), - true_false_question_path=SCORER_SEED_PROMPT_PATH / "true_false_question" / "malware.yaml", - ) - - backstop = TrueFalseInverterScorer( - scorer=SelfAskRefusalScorer( - chat_target=OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - ) - ) - ) - - return TrueFalseCompositeScorer( - aggregator=TrueFalseScoreAggregator.AND, scorers=[presence_of_malware, backstop] - ) diff --git a/pyrit/scenario/scenarios/airt/jailbreak.py b/pyrit/scenario/scenarios/airt/jailbreak.py index 1870fdc0cf..60e7c0146f 100644 --- a/pyrit/scenario/scenarios/airt/jailbreak.py +++ b/pyrit/scenario/scenarios/airt/jailbreak.py @@ -5,7 +5,6 @@ from pathlib import Path from typing import Any, Optional, Union -from pyrit.auth import get_azure_openai_auth from pyrit.common import apply_defaults from pyrit.datasets import TextJailBreak from pyrit.executor.attack.core.attack_config import ( @@ -199,9 +198,10 @@ def _create_adversarial_target(self) -> OpenAIChatTarget: OpenAIChatTarget: A fresh adversarial target using an unfiltered endpoint. 
""" endpoint = os.getenv("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") + api_key = os.getenv("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY") return OpenAIChatTarget( endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), + api_key=api_key, model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), temperature=1.2, ) diff --git a/pyrit/scenario/scenarios/airt/leakage.py b/pyrit/scenario/scenarios/airt/leakage.py index 801522d82e..687c159809 100644 --- a/pyrit/scenario/scenarios/airt/leakage.py +++ b/pyrit/scenario/scenarios/airt/leakage.py @@ -1,13 +1,11 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import os from pathlib import Path from typing import Optional from PIL import Image -from pyrit.auth import get_azure_openai_auth from pyrit.common import apply_defaults from pyrit.common.path import DATASETS_PATH, SCORER_SEED_PROMPT_PATH from pyrit.executor.attack import ( @@ -22,20 +20,14 @@ from pyrit.models import SeedAttackGroup, SeedObjective from pyrit.prompt_converter import AddImageTextConverter, FirstLetterConverter, PromptConverter from pyrit.prompt_normalizer import PromptConverterConfiguration -from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget +from pyrit.prompt_target import PromptChatTarget from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import Scenario from pyrit.scenario.core.scenario_strategy import ScenarioStrategy -from pyrit.score import ( - SelfAskRefusalScorer, - SelfAskTrueFalseScorer, - TrueFalseCompositeScorer, - TrueFalseInverterScorer, - TrueFalseScoreAggregator, - TrueFalseScorer, -) +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target +from pyrit.score import TrueFalseScorer class LeakageStrategy(ScenarioStrategy): @@ -89,6 +81,7 @@ class Leakage(Scenario): """ 
VERSION: int = 1 + OBJECTIVE_TRUE_FALSE_QUESTION_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "leakage.yaml" @classmethod def get_strategy_class(cls) -> type[ScenarioStrategy]: @@ -155,7 +148,7 @@ def __init__( self._scorer_config = AttackScoringConfig(objective_scorer=objective_scorer) # Initialize adversarial chat for multi-turn attacks - self._adversarial_chat = adversarial_chat if adversarial_chat else self._get_default_adversarial_target() + self._adversarial_chat = adversarial_chat if adversarial_chat else get_default_adversarial_target() self._adversarial_config = AttackAdversarialConfig(target=self._adversarial_chat) super().__init__( @@ -168,60 +161,6 @@ def __init__( self._objectives = objectives if objectives else self._get_default_objectives() - def _get_default_objective_scorer(self) -> TrueFalseCompositeScorer: - """ - Retrieve the default objective scorer. - - The scoring mechanism is as follows: - 1. Detect for objective achieved using a SelfAskTrueFalseScorer with leakage detection. - 2. Detect for NON-refusal as a backstop. (If the target model refuses to leak data, - we want that to register as a negative score). - - Returns: - TrueFalseCompositeScorer: Default objective scorer with backstop and leakage evaluation. - """ - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - presence_of_leakage = SelfAskTrueFalseScorer( - chat_target=OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - ), - true_false_question_path=SCORER_SEED_PROMPT_PATH / "true_false_question" / "leakage.yaml", - ) - - # Backstop scorer: Ensures the model didn't simply refuse the request. - # Uses SelfAskRefusalScorer inverted - if model refused, this returns False. - # Combined with AND aggregator: response must contain leakage AND not be a refusal. 
- backstop = TrueFalseInverterScorer( - scorer=SelfAskRefusalScorer( - chat_target=OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - ) - ) - ) - - return TrueFalseCompositeScorer( - aggregator=TrueFalseScoreAggregator.AND, scorers=[presence_of_leakage, backstop] - ) - - def _get_default_adversarial_target(self) -> OpenAIChatTarget: - """ - Create and retrieve the default adversarial target. - - Returns: - OpenAIChatTarget: Default adversarial target using an unfiltered endpoint. - """ - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - return OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=1.2, - ) - def _get_default_objectives(self) -> list[str]: """ Get the default seed prompts for leakage tests. diff --git a/pyrit/scenario/scenarios/airt/psychosocial.py b/pyrit/scenario/scenarios/airt/psychosocial.py index 8a0fc924b9..2e156e6f34 100644 --- a/pyrit/scenario/scenarios/airt/psychosocial.py +++ b/pyrit/scenario/scenarios/airt/psychosocial.py @@ -9,7 +9,6 @@ import yaml -from pyrit.auth import get_azure_openai_auth from pyrit.common import apply_defaults from pyrit.common.path import DATASETS_PATH from pyrit.executor.attack import ( @@ -37,6 +36,7 @@ from pyrit.scenario.core.scenario_strategy import ( ScenarioStrategy, ) +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target from pyrit.score import ( FloatScaleScorer, FloatScaleThresholdScorer, @@ -253,7 +253,7 @@ def __init__( "objectives is deprecated and will be removed in a future version. " "Use dataset_config in initialize_async instead." 
) - self._adversarial_chat = adversarial_chat if adversarial_chat else self._get_default_adversarial_target() + self._adversarial_chat = adversarial_chat if adversarial_chat else get_default_adversarial_target() # Merge user-provided configs with defaults (user-provided takes precedence) self._subharm_configs = {**self.DEFAULT_SUBHARM_CONFIGS, **(subharm_configs or {})} @@ -356,21 +356,6 @@ def _filter_by_harm_category( filtered_groups.append(SeedAttackGroup(seeds=filtered_seeds)) return filtered_groups - def _get_default_adversarial_target(self) -> OpenAIChatTarget: - """ - Create default adversarial chat target for multi-turn attacks. - - Returns: - OpenAIChatTarget: Default adversarial target, using an unfiltered endpoint. - """ - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - return OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=0.7, - ) - def _get_scorer(self, subharm: Optional[str] = None) -> FloatScaleThresholdScorer: """ Create scorer for psychosocial harms evaluation. @@ -404,9 +389,10 @@ def _get_scorer(self, subharm: Optional[str] = None) -> FloatScaleThresholdScore psychosocial_harm_rubric = yaml_data["value"] endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") + api_key = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY") azure_openai_chat_target = OpenAIChatTarget( endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), + api_key=api_key, model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), ) diff --git a/pyrit/scenario/scenarios/airt/scam.py b/pyrit/scenario/scenarios/airt/scam.py index e714b73a57..cf693836b8 100644 --- a/pyrit/scenario/scenarios/airt/scam.py +++ b/pyrit/scenario/scenarios/airt/scam.py @@ -2,11 +2,9 @@ # Licensed under the MIT license. 
import logging -import os from pathlib import Path from typing import TYPE_CHECKING, Any, Optional -from pyrit.auth import get_azure_openai_auth from pyrit.common import Parameter, apply_defaults from pyrit.common.path import ( EXECUTOR_RED_TEAM_PATH, @@ -23,20 +21,14 @@ AttackScoringConfig, ) from pyrit.models import SeedAttackGroup -from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget +from pyrit.prompt_target import PromptChatTarget from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import Scenario from pyrit.scenario.core.scenario_strategy import ScenarioStrategy -from pyrit.score import ( - SelfAskRefusalScorer, - SelfAskTrueFalseScorer, - TrueFalseCompositeScorer, - TrueFalseInverterScorer, - TrueFalseScoreAggregator, - TrueFalseScorer, -) +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target +from pyrit.score import TrueFalseScorer if TYPE_CHECKING: from pyrit.executor.attack.core.attack_strategy import AttackStrategy @@ -91,6 +83,7 @@ class Scam(Scenario): """ VERSION: int = 1 + OBJECTIVE_TRUE_FALSE_QUESTION_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "scams.yaml" @classmethod def get_strategy_class(cls) -> type[ScenarioStrategy]: @@ -172,7 +165,7 @@ def __init__( self._scorer_config = AttackScoringConfig(objective_scorer=objective_scorer) # Used for multiturn strategies and RolePlayAttack rephrasing - self._adversarial_chat = adversarial_chat if adversarial_chat else self._get_default_adversarial_target() + self._adversarial_chat = adversarial_chat if adversarial_chat else get_default_adversarial_target() self._adversarial_config = AttackAdversarialConfig(target=self._adversarial_chat) super().__init__( @@ -186,55 +179,6 @@ def __init__( # Will be resolved in _get_atomic_attacks_async self._seed_groups: 
Optional[list[SeedAttackGroup]] = None - def _get_default_objective_scorer(self) -> TrueFalseCompositeScorer: - """ - Retrieve the default objective scorer. - The scoring mechanism is as follows: - 1. Detect for objective achieved using a SelfAskTrueFalseScorer with a custom true/false question path. - 2. Detect for NON-refusal as a backstop. (If the target model refuses to produce scam materials, we want that - to register as a negative score). - - Returns: - TrueFalseCompositeScorer: Default objective scorer with backstop and scam materials evaluation. - """ - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - scam_materials = SelfAskTrueFalseScorer( - chat_target=OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=0.9, - ), - true_false_question_path=SCORER_SEED_PROMPT_PATH / "true_false_question" / "scams.yaml", - ) - - backstop = TrueFalseInverterScorer( - scorer=SelfAskRefusalScorer( - chat_target=OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - ) - ) - ) - - return TrueFalseCompositeScorer(aggregator=TrueFalseScoreAggregator.AND, scorers=[scam_materials, backstop]) - - def _get_default_adversarial_target(self) -> OpenAIChatTarget: - """ - Provide an OpenAI target for the role-play rephrasing step. - - Returns: - OpenAIChatTarget: Target that supplies the persuasion script rephrasing. - """ - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - return OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=1.2, - ) - def _resolve_seed_groups(self) -> list[SeedAttackGroup]: """ Resolve seed groups from dataset configuration. 
diff --git a/pyrit/setup/initializers/components/targets.py b/pyrit/setup/initializers/components/targets.py index 055e5c40d1..3a8049ca3a 100644 --- a/pyrit/setup/initializers/components/targets.py +++ b/pyrit/setup/initializers/components/targets.py @@ -180,6 +180,15 @@ class TargetConfig: temperature=1.2, tags=[TargetInitializerTags.DEFAULT, TargetInitializerTags.ADVERSARIAL], ), + TargetConfig( + registry_name="objective_scorer_chat", + target_class=OpenAIChatTarget, + endpoint_var="OBJECTIVE_SCORER_CHAT_ENDPOINT", + key_var="OBJECTIVE_SCORER_CHAT_KEY", + model_var="OBJECTIVE_SCORER_CHAT_MODEL", + underlying_model_var="OBJECTIVE_SCORER_CHAT_UNDERLYING_MODEL", + tags=[TargetInitializerTags.DEFAULT, TargetInitializerTags.SCORER], + ), TargetConfig( registry_name="azure_foundry_deepseek", target_class=OpenAIChatTarget, diff --git a/tests/unit/scenario/test_jailbreak.py b/tests/unit/scenario/test_jailbreak.py index c873465c6b..1ef7c2090b 100644 --- a/tests/unit/scenario/test_jailbreak.py +++ b/tests/unit/scenario/test_jailbreak.py @@ -15,7 +15,7 @@ from pyrit.executor.attack.single_turn.skeleton_key import SkeletonKeyAttack from pyrit.identifiers import ComponentIdentifier from pyrit.models import SeedGroup, SeedObjective -from pyrit.prompt_target import OpenAIChatTarget, PromptTarget +from pyrit.prompt_target import PromptTarget from pyrit.scenario.scenarios.airt.jailbreak import Jailbreak, JailbreakStrategy from pyrit.score.true_false.true_false_inverter_scorer import TrueFalseInverterScorer @@ -447,11 +447,13 @@ async def test_no_target_duplication_async( class TestJailbreakAdversarialTarget: """Tests for adversarial target creation and caching.""" - def test_create_adversarial_target_returns_openai_chat_target(self) -> None: - """Test that _create_adversarial_target returns a new OpenAIChatTarget.""" + def test_get_or_create_adversarial_target_returns_prompt_chat_target(self) -> None: + """Test that _get_or_create_adversarial_target returns a 
PromptChatTarget.""" + from pyrit.prompt_target import PromptChatTarget + scenario = Jailbreak() - target = scenario._create_adversarial_target() - assert isinstance(target, OpenAIChatTarget) + target = scenario._get_or_create_adversarial_target() + assert isinstance(target, PromptChatTarget) def test_get_or_create_adversarial_target_reuses_instance(self) -> None: """Test that _get_or_create_adversarial_target returns the same instance on repeated calls.""" diff --git a/tests/unit/scenario/test_rapid_response.py b/tests/unit/scenario/test_rapid_response.py index ddf95df2e6..5b37d95c65 100644 --- a/tests/unit/scenario/test_rapid_response.py +++ b/tests/unit/scenario/test_rapid_response.py @@ -651,7 +651,7 @@ def test_get_default_adversarial_target_capability_check(self): mock_target = MagicMock(spec=PromptTarget) mock_target.capabilities.includes.return_value = False target_registry.register(name="adversarial_chat", instance=mock_target) - with pytest.raises(ValueError, match="must support multi-turn"): + with pytest.raises(ValueError, match="must support"): get_default_adversarial_target() diff --git a/tests/unit/scenario/test_scenario.py b/tests/unit/scenario/test_scenario.py index 2c285dc420..5ca197753f 100644 --- a/tests/unit/scenario/test_scenario.py +++ b/tests/unit/scenario/test_scenario.py @@ -854,12 +854,16 @@ def test_returns_registry_scorer_when_tagged(self, mock_registry_cls) -> None: mock_registry.get_by_tag.return_value = [mock_entry] mock_registry_cls.get_registry_singleton.return_value = mock_registry - result = Scenario._get_default_objective_scorer(MagicMock()) + # Mock self with OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + mock_self = MagicMock() + type(mock_self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + + result = Scenario._get_default_objective_scorer(mock_self) assert result is mock_scorer - @patch("pyrit.scenario.core.scenario.OpenAIChatTarget") + @patch("pyrit.scenario.core.scenario.get_default_scorer_target") 
@patch("pyrit.scenario.core.scenario.ScorerRegistry") - def test_returns_fallback_when_registry_empty(self, mock_registry_cls, mock_oai_target) -> None: + def test_returns_fallback_when_registry_empty(self, mock_registry_cls, mock_get_scorer_target) -> None: """Test fallback to TrueFalseInverterScorer when no tagged scorer exists.""" from pyrit.score import TrueFalseInverterScorer @@ -867,7 +871,11 @@ def test_returns_fallback_when_registry_empty(self, mock_registry_cls, mock_oai_ mock_registry.get_by_tag.return_value = [] mock_registry_cls.get_registry_singleton.return_value = mock_registry - result = Scenario._get_default_objective_scorer(MagicMock()) + # Mock self with OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + mock_self = MagicMock() + type(mock_self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + + result = Scenario._get_default_objective_scorer(mock_self) assert isinstance(result, TrueFalseInverterScorer) From 554554355e74c50b98a37e30370fc5ecebf1c1e7 Mon Sep 17 00:00:00 2001 From: Behnam Ousat Date: Wed, 6 May 2026 16:18:16 -0700 Subject: [PATCH 02/10] rename --- pyrit/scenario/core/scenario.py | 8 +++++--- pyrit/scenario/scenarios/airt/cyber.py | 2 +- pyrit/scenario/scenarios/airt/leakage.py | 2 +- pyrit/scenario/scenarios/airt/scam.py | 2 +- tests/unit/scenario/test_scenario.py | 8 ++++---- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pyrit/scenario/core/scenario.py b/pyrit/scenario/core/scenario.py index 145abba8ba..92bf290896 100644 --- a/pyrit/scenario/core/scenario.py +++ b/pyrit/scenario/core/scenario.py @@ -117,7 +117,7 @@ class Scenario(ABC): #: Optional true/false question prompt path for objective scoring. #: When set, the default objective scorer becomes #: ``SelfAskTrueFalseScorer(path) AND NOT(SelfAskRefusalScorer)``. 
- OBJECTIVE_TRUE_FALSE_QUESTION_PATH: ClassVar[Path | None] = None + COMPOSITE_SCORER_QUESTIONS_PATH: ClassVar[Path | None] = None def __init__( self, @@ -322,11 +322,13 @@ def _build_display_group(self, *, technique_name: str, seed_group_name: str) -> return technique_name def _get_default_objective_scorer(self) -> TrueFalseScorer: - if type(self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH is not None: + composite_scorer_questions_path = type(self).COMPOSITE_SCORER_QUESTIONS_PATH + + if composite_scorer_questions_path is not None: chat_target = get_default_scorer_target() objective_scorer = SelfAskTrueFalseScorer( chat_target=chat_target, - true_false_question_path=type(self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH, + true_false_question_path=composite_scorer_questions_path, ) backstop_scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) return TrueFalseCompositeScorer( diff --git a/pyrit/scenario/scenarios/airt/cyber.py b/pyrit/scenario/scenarios/airt/cyber.py index ac67174790..49d0445630 100644 --- a/pyrit/scenario/scenarios/airt/cyber.py +++ b/pyrit/scenario/scenarios/airt/cyber.py @@ -64,7 +64,7 @@ class Cyber(Scenario): """ VERSION: int = 2 - OBJECTIVE_TRUE_FALSE_QUESTION_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "malware.yaml" + COMPOSITE_SCORER_QUESTIONS_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "malware.yaml" _cached_strategy_class: ClassVar[type[ScenarioStrategy] | None] = None @classmethod diff --git a/pyrit/scenario/scenarios/airt/leakage.py b/pyrit/scenario/scenarios/airt/leakage.py index 687c159809..3f58d20622 100644 --- a/pyrit/scenario/scenarios/airt/leakage.py +++ b/pyrit/scenario/scenarios/airt/leakage.py @@ -81,7 +81,7 @@ class Leakage(Scenario): """ VERSION: int = 1 - OBJECTIVE_TRUE_FALSE_QUESTION_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "leakage.yaml" + COMPOSITE_SCORER_QUESTIONS_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "leakage.yaml" @classmethod def 
get_strategy_class(cls) -> type[ScenarioStrategy]: diff --git a/pyrit/scenario/scenarios/airt/scam.py b/pyrit/scenario/scenarios/airt/scam.py index cf693836b8..da8007ce1e 100644 --- a/pyrit/scenario/scenarios/airt/scam.py +++ b/pyrit/scenario/scenarios/airt/scam.py @@ -83,7 +83,7 @@ class Scam(Scenario): """ VERSION: int = 1 - OBJECTIVE_TRUE_FALSE_QUESTION_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "scams.yaml" + COMPOSITE_SCORER_QUESTIONS_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "scams.yaml" @classmethod def get_strategy_class(cls) -> type[ScenarioStrategy]: diff --git a/tests/unit/scenario/test_scenario.py b/tests/unit/scenario/test_scenario.py index 5ca197753f..917ef4922b 100644 --- a/tests/unit/scenario/test_scenario.py +++ b/tests/unit/scenario/test_scenario.py @@ -854,9 +854,9 @@ def test_returns_registry_scorer_when_tagged(self, mock_registry_cls) -> None: mock_registry.get_by_tag.return_value = [mock_entry] mock_registry_cls.get_registry_singleton.return_value = mock_registry - # Mock self with OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + # Mock self with COMPOSITE_SCORER_QUESTIONS_PATH = None mock_self = MagicMock() - type(mock_self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + type(mock_self).COMPOSITE_SCORER_QUESTIONS_PATH = None result = Scenario._get_default_objective_scorer(mock_self) assert result is mock_scorer @@ -871,9 +871,9 @@ def test_returns_fallback_when_registry_empty(self, mock_registry_cls, mock_get_ mock_registry.get_by_tag.return_value = [] mock_registry_cls.get_registry_singleton.return_value = mock_registry - # Mock self with OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + # Mock self with COMPOSITE_SCORER_QUESTIONS_PATH = None mock_self = MagicMock() - type(mock_self).OBJECTIVE_TRUE_FALSE_QUESTION_PATH = None + type(mock_self).COMPOSITE_SCORER_QUESTIONS_PATH = None result = Scenario._get_default_objective_scorer(mock_self) assert isinstance(result, TrueFalseInverterScorer) From 
b26f5534a77ca70428d07ad0dbe5f1e7956324c9 Mon Sep 17 00:00:00 2001 From: Behnam Ousat Date: Wed, 6 May 2026 16:27:06 -0700 Subject: [PATCH 03/10] log --- pyrit/scenario/core/scenario.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pyrit/scenario/core/scenario.py b/pyrit/scenario/core/scenario.py index 92bf290896..f669a0c054 100644 --- a/pyrit/scenario/core/scenario.py +++ b/pyrit/scenario/core/scenario.py @@ -331,20 +331,26 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer: true_false_question_path=composite_scorer_questions_path, ) backstop_scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) - return TrueFalseCompositeScorer( + scorer = TrueFalseCompositeScorer( aggregator=TrueFalseScoreAggregator.AND, scorers=[objective_scorer, backstop_scorer], ) + logger.info(f"Using composite default objective scorer: {type(scorer).__name__}") + return scorer # Deferred import to avoid circular dependency. from pyrit.setup.initializers.components.scorers import ScorerInitializerTags entries = ScorerRegistry.get_registry_singleton().get_by_tag(tag=ScorerInitializerTags.DEFAULT_OBJECTIVE_SCORER) if entries and isinstance(entries[0].instance, TrueFalseScorer): - return entries[0].instance + scorer = entries[0].instance + logger.info(f"Using registered default objective scorer: {type(scorer).__name__}") + return scorer chat_target = get_default_scorer_target() - return TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) + scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) + logger.info(f"Using fallback default objective scorer: {type(scorer).__name__}") + return scorer def set_params_from_args(self, *, args: dict[str, Any]) -> None: """ From 6921832008245d1d756c939e20815f1db3652e6e Mon Sep 17 00:00:00 2001 From: behnam Date: Wed, 6 May 2026 22:00:38 -0700 Subject: [PATCH 04/10] more removals --- pyrit/scenario/scenarios/airt/jailbreak.py | 
30 +++++-------------- pyrit/scenario/scenarios/airt/psychosocial.py | 15 +++------- .../scenarios/foundry/red_team_agent.py | 15 ++-------- 3 files changed, 13 insertions(+), 47 deletions(-) diff --git a/pyrit/scenario/scenarios/airt/jailbreak.py b/pyrit/scenario/scenarios/airt/jailbreak.py index 60e7c0146f..69673e766c 100644 --- a/pyrit/scenario/scenarios/airt/jailbreak.py +++ b/pyrit/scenario/scenarios/airt/jailbreak.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import os from pathlib import Path from typing import Any, Optional, Union @@ -19,12 +18,13 @@ from pyrit.models import SeedAttackGroup from pyrit.prompt_converter import TextJailbreakConverter from pyrit.prompt_normalizer import PromptConverterConfiguration -from pyrit.prompt_target import OpenAIChatTarget +from pyrit.prompt_target.common.prompt_chat_target import PromptChatTarget from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import Scenario from pyrit.scenario.core.scenario_strategy import ScenarioStrategy +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target from pyrit.score import ( TrueFalseScorer, ) @@ -161,7 +161,7 @@ def __init__( self._num_templates = num_templates self._num_attempts = num_attempts - self._adversarial_target: Optional[OpenAIChatTarget] = None + self._adversarial_target: Optional[PromptChatTarget] = None # Note that num_templates and jailbreak_names are mutually exclusive. # If self._num_templates is None, then this returns all discoverable jailbreak templates. @@ -190,34 +190,18 @@ def __init__( # Will be resolved in _get_atomic_attacks_async self._seed_groups: Optional[list[SeedAttackGroup]] = None - def _create_adversarial_target(self) -> OpenAIChatTarget: - """ - Create a new adversarial target instance. 
- - Returns: - OpenAIChatTarget: A fresh adversarial target using an unfiltered endpoint. - """ - endpoint = os.getenv("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - api_key = os.getenv("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY") - return OpenAIChatTarget( - endpoint=endpoint, - api_key=api_key, - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=1.2, - ) - - def _get_or_create_adversarial_target(self) -> OpenAIChatTarget: + def _get_or_create_adversarial_target(self) -> PromptChatTarget: """ Return the shared adversarial target, creating it on first access. - Reuses a single OpenAIChatTarget instance across all role-play attacks + Reuses a single PromptChatTarget instance across all role-play attacks to avoid repeated client and TLS setup. Returns: - OpenAIChatTarget: The shared adversarial target. + PromptChatTarget: The shared adversarial target. """ if self._adversarial_target is None: - self._adversarial_target = self._create_adversarial_target() + self._adversarial_target = get_default_adversarial_target() return self._adversarial_target def _resolve_seed_groups(self) -> list[SeedAttackGroup]: diff --git a/pyrit/scenario/scenarios/airt/psychosocial.py b/pyrit/scenario/scenarios/airt/psychosocial.py index 2e156e6f34..308e6db50e 100644 --- a/pyrit/scenario/scenarios/airt/psychosocial.py +++ b/pyrit/scenario/scenarios/airt/psychosocial.py @@ -2,7 +2,6 @@ # Licensed under the MIT license. 
import logging -import os import pathlib from dataclasses import dataclass from typing import Any, Optional, TypeVar @@ -26,7 +25,7 @@ from pyrit.prompt_normalizer.prompt_converter_configuration import ( PromptConverterConfiguration, ) -from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget +from pyrit.prompt_target import PromptChatTarget from pyrit.prompt_target.common.target_capabilities import CapabilityName from pyrit.prompt_target.common.target_requirements import TargetRequirements from pyrit.scenario.core.atomic_attack import AtomicAttack @@ -36,7 +35,7 @@ from pyrit.scenario.core.scenario_strategy import ( ScenarioStrategy, ) -from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target, get_default_scorer_target from pyrit.score import ( FloatScaleScorer, FloatScaleThresholdScorer, @@ -388,17 +387,11 @@ def _get_scorer(self, subharm: Optional[str] = None) -> FloatScaleThresholdScore # Extract the 'value' field which contains the actual rubric text psychosocial_harm_rubric = yaml_data["value"] - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - api_key = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY") - azure_openai_chat_target = OpenAIChatTarget( - endpoint=endpoint, - api_key=api_key, - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - ) + scorer_target = get_default_scorer_target() # Create the base crisis scorer psych_scorer = SelfAskGeneralFloatScaleScorer( - chat_target=azure_openai_chat_target, + chat_target=scorer_target, system_prompt_format_string=psychosocial_harm_rubric, rationale_output_key="reasoning", # Match the YAML JSON schema key category="psychosocial_harm", diff --git a/pyrit/scenario/scenarios/foundry/red_team_agent.py b/pyrit/scenario/scenarios/foundry/red_team_agent.py index a875e186a5..6b8f271d9b 100644 --- a/pyrit/scenario/scenarios/foundry/red_team_agent.py 
+++ b/pyrit/scenario/scenarios/foundry/red_team_agent.py @@ -10,13 +10,11 @@ """ import logging -import os from collections.abc import Sequence from dataclasses import dataclass, field from inspect import signature from typing import TYPE_CHECKING, Any, Optional, TypeVar, cast -from pyrit.auth import get_azure_openai_auth from pyrit.common import REQUIRED_VALUE, apply_defaults from pyrit.datasets import TextJailBreak from pyrit.executor.attack import ( @@ -62,12 +60,12 @@ ) from pyrit.prompt_target import PromptTarget from pyrit.prompt_target.common.prompt_chat_target import PromptChatTarget -from pyrit.prompt_target.openai.openai_chat_target import OpenAIChatTarget from pyrit.scenario.core.atomic_attack import AtomicAttack from pyrit.scenario.core.attack_technique import AttackTechnique from pyrit.scenario.core.dataset_configuration import DatasetConfiguration from pyrit.scenario.core.scenario import Scenario from pyrit.scenario.core.scenario_strategy import ScenarioCompositeStrategy, ScenarioStrategy +from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target if TYPE_CHECKING: from pyrit.executor.attack.core.attack_strategy import AttackStrategy @@ -270,7 +268,7 @@ def __init__( Raises: ValueError: If attack_strategies is empty or contains unsupported strategies. 
""" - self._adversarial_chat = adversarial_chat if adversarial_chat else self._get_default_adversarial_target() + self._adversarial_chat = adversarial_chat if adversarial_chat else get_default_adversarial_target() if not attack_scoring_config: attack_scoring_config = AttackScoringConfig(objective_scorer=self._get_default_objective_scorer()) self._attack_scoring_config = attack_scoring_config @@ -426,15 +424,6 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]: return [self._get_attack_from_strategy(composition) for composition in self._scenario_composites] - def _get_default_adversarial_target(self) -> OpenAIChatTarget: - endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT") - return OpenAIChatTarget( - endpoint=endpoint, - api_key=get_azure_openai_auth(endpoint or ""), - model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), - temperature=1.2, - ) - def _get_attack_from_strategy(self, composite: FoundryComposite) -> AtomicAttack: """ Get an atomic attack for the specified FoundryComposite. 
From 11cb3695e625777821519b864f91fe17dc0ee21b Mon Sep 17 00:00:00 2001 From: Behnam Ousat Date: Thu, 7 May 2026 09:25:49 -0700 Subject: [PATCH 05/10] update test --- tests/unit/setup/test_scenarios_initializer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/unit/setup/test_scenarios_initializer.py b/tests/unit/setup/test_scenarios_initializer.py index 4d9cfa841f..d56414279b 100644 --- a/tests/unit/setup/test_scenarios_initializer.py +++ b/tests/unit/setup/test_scenarios_initializer.py @@ -235,12 +235,11 @@ async def test_idempotent(self, mock_adversarial_target): @pytest.mark.asyncio async def test_falls_back_to_default_target_when_registry_empty(self): """With no 'adversarial_chat' in TargetRegistry, the fallback constructs an OpenAIChatTarget.""" - # Patch OpenAIChatTarget at the import site inside scenario_techniques - # (which is what get_default_adversarial_target calls), so the test does - # not depend on OPENAI_CHAT_MODEL or any other env var being set. + # Patch OpenAIChatTarget at the fallback construction site so the test + # does not depend on OPENAI_CHAT_MODEL or any other env var being set. 
fallback_target = MagicMock(spec=PromptChatTarget) with patch( - "pyrit.scenario.core.scenario_techniques.OpenAIChatTarget", + "pyrit.scenario.core.scenario_target_defaults.OpenAIChatTarget", return_value=fallback_target, ) as mock_openai: init = ScenarioTechniqueInitializer() From 19a495e05ee6a20a544e6c353a993955032a1140 Mon Sep 17 00:00:00 2001 From: Behnam Ousat Date: Thu, 7 May 2026 11:26:17 -0700 Subject: [PATCH 06/10] update foundry tests --- tests/unit/scenario/test_foundry.py | 233 +++------------------------- 1 file changed, 22 insertions(+), 211 deletions(-) diff --git a/tests/unit/scenario/test_foundry.py b/tests/unit/scenario/test_foundry.py index ab1995e947..7811bf4144 100644 --- a/tests/unit/scenario/test_foundry.py +++ b/tests/unit/scenario/test_foundry.py @@ -91,18 +91,29 @@ def mock_float_threshold_scorer(): return mock -@pytest.mark.usefixtures("patch_central_database") -class TestFoundryInitialization: - """Tests for RedTeamAgent initialization.""" - - @patch.dict( +@pytest.fixture +def mock_runtime_env(): + with patch.dict( "os.environ", { "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", + "OPENAI_CHAT_ENDPOINT": "https://test.openai.azure.com/", + "OPENAI_CHAT_KEY": "test-key", + "OPENAI_CHAT_MODEL": "gpt-4", }, - ) + ): + yield + + +FIXTURES = ["patch_central_database", "mock_runtime_env"] + + +@pytest.mark.usefixtures(*FIXTURES) +class TestFoundryInitialization: + """Tests for RedTeamAgent initialization.""" + async def test_init_with_single_strategy( self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config ): @@ -120,14 +131,6 @@ async def test_init_with_single_strategy( assert scenario.atomic_attack_count > 0 assert scenario.name == "RedTeamAgent" - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - 
"AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_init_with_multiple_strategies( self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config ): @@ -150,14 +153,6 @@ async def test_init_with_multiple_strategies( ) assert scenario.atomic_attack_count >= len(strategies) - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) def test_init_with_custom_adversarial_target( self, mock_objective_target, mock_adversarial_target, mock_objective_scorer ): @@ -169,14 +164,6 @@ def test_init_with_custom_adversarial_target( assert scenario._adversarial_chat == mock_adversarial_target - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) def test_init_with_custom_scorer(self, mock_objective_target, mock_objective_scorer): """Test initialization with custom objective scorer.""" scenario = RedTeamAgent( @@ -185,14 +172,6 @@ def test_init_with_custom_scorer(self, mock_objective_target, mock_objective_sco assert scenario._attack_scoring_config.objective_scorer == mock_objective_scorer - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_init_with_memory_labels( self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config ): @@ -215,14 +194,6 @@ async def test_init_with_memory_labels( assert scenario._memory_labels == memory_labels @patch("pyrit.scenario.core.scenario.Scenario._get_default_objective_scorer") - 
@patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) def test_init_creates_default_scorer_when_not_provided( self, mock_get_scorer, mock_objective_target, mock_memory_seed_groups ): @@ -240,14 +211,6 @@ def test_init_creates_default_scorer_when_not_provided( # seed_groups are resolved lazily during _get_atomic_attacks_async assert scenario._attack_scoring_config.objective_scorer == mock_scorer_instance - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_init_raises_exception_when_no_datasets_available(self, mock_objective_target, mock_objective_scorer): """Test that initialization raises ValueError when datasets are not available in memory.""" # Don't mock _resolve_seed_groups, let it try to load from empty memory @@ -258,18 +221,10 @@ async def test_init_raises_exception_when_no_datasets_available(self, mock_objec await scenario.initialize_async(objective_target=mock_objective_target) -@pytest.mark.usefixtures("patch_central_database") +@pytest.mark.usefixtures(*FIXTURES) class TestFoundryStrategyNormalization: """Tests for attack strategy normalization.""" - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_normalize_easy_strategies( self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config ): @@ -287,14 +242,6 @@ async def test_normalize_easy_strategies( # EASY should expand to multiple attack strategies assert scenario.atomic_attack_count > 1 - @patch.dict( - "os.environ", - { - 
"AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_normalize_moderate_strategies( self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config ): @@ -312,14 +259,6 @@ async def test_normalize_moderate_strategies( # MODERATE should expand to moderate attack strategies (currently only 1: Tense) assert scenario.atomic_attack_count >= 1 - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_normalize_difficult_strategies( self, mock_objective_target, mock_float_threshold_scorer, mock_memory_seed_groups, mock_dataset_config ): @@ -338,14 +277,6 @@ async def test_normalize_difficult_strategies( # DIFFICULT should expand to multiple attack strategies assert scenario.atomic_attack_count > 1 - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_normalize_mixed_difficulty_levels( self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config ): @@ -363,14 +294,6 @@ async def test_normalize_mixed_difficulty_levels( # Combined difficulty levels should expand to multiple strategies assert scenario.atomic_attack_count > 5 # EASY has 20, MODERATE has 1, combined should have more - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_normalize_with_specific_and_difficulty_levels( self, mock_objective_target, 
mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config ): @@ -392,18 +315,10 @@ async def test_normalize_with_specific_and_difficulty_levels( assert scenario.atomic_attack_count >= 20 -@pytest.mark.usefixtures("patch_central_database") +@pytest.mark.usefixtures(*FIXTURES) class TestFoundryAttackCreation: """Tests for attack creation from strategies.""" - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_get_attack_from_single_turn_strategy( self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config ): @@ -426,14 +341,6 @@ async def test_get_attack_from_single_turn_strategy( assert isinstance(atomic_attack, AtomicAttack) assert atomic_attack.seed_groups == mock_memory_seed_groups - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_get_attack_from_multi_turn_strategy( self, mock_objective_target, @@ -463,18 +370,10 @@ async def test_get_attack_from_multi_turn_strategy( assert atomic_attack.seed_groups == mock_memory_seed_groups -@pytest.mark.usefixtures("patch_central_database") +@pytest.mark.usefixtures(*FIXTURES) class TestFoundryGetAttack: """Tests for the _get_attack method.""" - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_get_attack_single_turn_with_converters( self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config ): @@ -497,14 +396,6 @@ async def test_get_attack_single_turn_with_converters( assert 
isinstance(attack, PromptSendingAttack) - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_get_attack_multi_turn_with_adversarial_target( self, mock_objective_target, @@ -534,18 +425,10 @@ async def test_get_attack_multi_turn_with_adversarial_target( assert isinstance(attack, CrescendoAttack) -@pytest.mark.usefixtures("patch_central_database") +@pytest.mark.usefixtures(*FIXTURES) class TestFoundryAllStrategies: """Tests that all strategies can be instantiated.""" - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) @pytest.mark.parametrize( "strategy", [ @@ -592,14 +475,6 @@ async def test_all_single_turn_strategies_create_attack_runs( atomic_attack = scenario._get_attack_from_strategy(composite_strategy) assert isinstance(atomic_attack, AtomicAttack) - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) @pytest.mark.parametrize( "strategy", [ @@ -635,18 +510,10 @@ async def test_all_multi_turn_strategies_create_attack_runs( assert isinstance(atomic_attack, AtomicAttack) -@pytest.mark.usefixtures("patch_central_database") +@pytest.mark.usefixtures(*FIXTURES) class TestFoundryProperties: """Tests for RedTeamAgent properties and attributes.""" - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_scenario_composites_set_after_initialize( self, 
mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config ): @@ -672,14 +539,6 @@ async def test_scenario_composites_set_after_initialize( assert len(scenario._scenario_composites) == len(strategies) assert scenario.atomic_attack_count == len(strategies) - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) def test_scenario_version_is_set(self, mock_objective_target, mock_objective_scorer): """Test that scenario version is properly set.""" scenario = RedTeamAgent( @@ -688,14 +547,6 @@ def test_scenario_version_is_set(self, mock_objective_target, mock_objective_sco assert scenario.VERSION == 1 - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_scenario_atomic_attack_count_matches_strategies( self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config ): @@ -719,14 +570,6 @@ async def test_scenario_atomic_attack_count_matches_strategies( # Should have at least as many runs as specific strategies provided assert scenario.atomic_attack_count >= len(strategies) - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_initialize_with_foundry_composite_directly( self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config ): @@ -750,14 +593,6 @@ async def test_initialize_with_foundry_composite_directly( assert result.converters == [FoundryStrategy.Base64] assert result.name == "ComposedStrategy(crescendo, base64)" - @patch.dict( - 
"os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) async def test_initialize_with_mixed_composites_and_strategies( self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config ): @@ -780,14 +615,6 @@ async def test_initialize_with_mixed_composites_and_strategies( assert scenario._scenario_composites[1].attack is None assert scenario._scenario_composites[1].converters == [FoundryStrategy.ROT13] - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) @pytest.mark.filterwarnings("ignore::DeprecationWarning") async def test_initialize_converts_scenario_composite_strategy_to_foundry_composite( self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config @@ -811,14 +638,6 @@ async def test_initialize_converts_scenario_composite_strategy_to_foundry_compos assert result.attack == FoundryStrategy.Crescendo assert result.converters == [FoundryStrategy.Base64] - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": "https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) @pytest.mark.filterwarnings("ignore::DeprecationWarning") async def test_initialize_converts_converter_first_composite_strategy( self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config @@ -841,14 +660,6 @@ async def test_initialize_converts_converter_first_composite_strategy( assert result.attack == FoundryStrategy.Crescendo assert result.converters == [FoundryStrategy.Base64] - @patch.dict( - "os.environ", - { - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT": 
"https://test.openai.azure.com/", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY": "test-key", - "AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL": "gpt-4", - }, - ) @pytest.mark.filterwarnings("ignore::DeprecationWarning") async def test_initialize_converts_converter_only_composite_strategy( self, mock_objective_target, mock_objective_scorer, mock_memory_seed_groups, mock_dataset_config From 52dc2a2c486f5e65ca4bb79df866e8b22d722be0 Mon Sep 17 00:00:00 2001 From: Behnam Ousat Date: Thu, 7 May 2026 12:08:59 -0700 Subject: [PATCH 07/10] override method instead of classvar --- pyrit/scenario/core/scenario.py | 31 ++++++++++++------- .../scenario/core/scenario_target_defaults.py | 7 +++++ pyrit/scenario/scenarios/airt/cyber.py | 13 +++++++- pyrit/scenario/scenarios/airt/leakage.py | 13 +++++++- pyrit/scenario/scenarios/airt/scam.py | 11 ++++++- tests/unit/scenario/test_scenario.py | 8 ++--- 6 files changed, 65 insertions(+), 18 deletions(-) diff --git a/pyrit/scenario/core/scenario.py b/pyrit/scenario/core/scenario.py index 42e8222d0d..98ec21c469 100644 --- a/pyrit/scenario/core/scenario.py +++ b/pyrit/scenario/core/scenario.py @@ -114,10 +114,19 @@ class Scenario(ABC): #: what the scenario needs. Validated in ``initialize_async`` once the target is supplied. TARGET_REQUIREMENTS: ClassVar[TargetRequirements] = TargetRequirements() - #: Optional true/false question prompt path for objective scoring. - #: When set, the default objective scorer becomes - #: ``SelfAskTrueFalseScorer(path) AND NOT(SelfAskRefusalScorer)``. - COMPOSITE_SCORER_QUESTIONS_PATH: ClassVar[Path | None] = None + @classmethod + def get_override_composite_scorer_questions_path(cls) -> Sequence[Path]: + """ + Override to provide true/false question prompt paths for objective scoring. + + When overridden to return a non-empty sequence, the default objective scorer becomes + one ``SelfAskTrueFalseScorer`` per path AND-ed together with ``NOT(SelfAskRefusalScorer)`` + instead of the scenario-level default. 
+ + Returns: + Sequence[Path]: Paths to true/false question prompts, or an empty sequence to use the default scorer. + """ + return [] def __init__( self, @@ -322,18 +331,18 @@ def _build_display_group(self, *, technique_name: str, seed_group_name: str) -> return technique_name def _get_default_objective_scorer(self) -> TrueFalseScorer: - composite_scorer_questions_path = type(self).COMPOSITE_SCORER_QUESTIONS_PATH + composite_scorer_questions_paths = type(self).get_override_composite_scorer_questions_path() - if composite_scorer_questions_path is not None: + if composite_scorer_questions_paths: chat_target = get_default_scorer_target() - objective_scorer = SelfAskTrueFalseScorer( - chat_target=chat_target, - true_false_question_path=composite_scorer_questions_path, - ) + path_scorers: list[TrueFalseScorer] = [ + SelfAskTrueFalseScorer(chat_target=chat_target, true_false_question_path=path) + for path in composite_scorer_questions_paths + ] backstop_scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) scorer = TrueFalseCompositeScorer( aggregator=TrueFalseScoreAggregator.AND, - scorers=[objective_scorer, backstop_scorer], + scorers=[*path_scorers, backstop_scorer], ) logger.info(f"Using composite default objective scorer: {type(scorer).__name__}") return scorer diff --git a/pyrit/scenario/core/scenario_target_defaults.py b/pyrit/scenario/core/scenario_target_defaults.py index bc6fe084ae..856ae729f7 100644 --- a/pyrit/scenario/core/scenario_target_defaults.py +++ b/pyrit/scenario/core/scenario_target_defaults.py @@ -1,10 +1,14 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
+import logging + from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget from pyrit.prompt_target.common.target_capabilities import CapabilityName from pyrit.registry import TargetRegistry +logger = logging.getLogger(__name__) + def get_default_scorer_target() -> PromptChatTarget: """ @@ -88,4 +92,7 @@ def _get_default_chat_target( return target + logger.warning( + f"TargetRegistry entry '{preferred_target_key}' not found. Falling back to default OpenAIChatTarget." + ) return OpenAIChatTarget(temperature=fallback_temperature) diff --git a/pyrit/scenario/scenarios/airt/cyber.py b/pyrit/scenario/scenarios/airt/cyber.py index 9c7abbe022..4f865507da 100644 --- a/pyrit/scenario/scenarios/airt/cyber.py +++ b/pyrit/scenario/scenarios/airt/cyber.py @@ -12,6 +12,8 @@ from pyrit.scenario.core.scenario import Scenario if TYPE_CHECKING: + from pathlib import Path + from pyrit.scenario.core.scenario_strategy import ScenarioStrategy from pyrit.score import TrueFalseScorer @@ -56,9 +58,18 @@ class Cyber(Scenario): """ VERSION: int = 2 - COMPOSITE_SCORER_QUESTIONS_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "malware.yaml" _cached_strategy_class: ClassVar[type[ScenarioStrategy] | None] = None + @classmethod + def get_override_composite_scorer_questions_path(cls) -> list[Path]: + """ + Override true/false question paths for cyber objective scoring. + + Returns: + Sequence[Path]: Paths to true/false question paths for cyber objective scoring. 
+ """ + return [SCORER_SEED_PROMPT_PATH / "true_false_question" / "malware.yaml"] + @classmethod def get_strategy_class(cls) -> type[ScenarioStrategy]: """ diff --git a/pyrit/scenario/scenarios/airt/leakage.py b/pyrit/scenario/scenarios/airt/leakage.py index 8be71be6ef..3033fca351 100644 --- a/pyrit/scenario/scenarios/airt/leakage.py +++ b/pyrit/scenario/scenarios/airt/leakage.py @@ -24,6 +24,8 @@ from pyrit.scenario.core.scenario_strategy import ScenarioStrategy if TYPE_CHECKING: + from pathlib import Path + from pyrit.scenario.core.attack_technique_factory import AttackTechniqueFactory from pyrit.scenario.core.scenario_strategy import ScenarioStrategy from pyrit.score import TrueFalseScorer @@ -97,7 +99,16 @@ class Leakage(Scenario): VERSION: int = 2 _cached_strategy_class: ClassVar[type[ScenarioStrategy] | None] = None - COMPOSITE_SCORER_QUESTIONS_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "leakage.yaml" + + @classmethod + def get_override_composite_scorer_questions_path(cls) -> list[Path]: + """ + Override true/false question paths for leakage objective scoring. + + Returns: + Sequence[Path]: Paths to true/false question paths for leakage objective scoring. + """ + return [SCORER_SEED_PROMPT_PATH / "true_false_question" / "leakage.yaml"] @classmethod def get_strategy_class(cls) -> type[ScenarioStrategy]: diff --git a/pyrit/scenario/scenarios/airt/scam.py b/pyrit/scenario/scenarios/airt/scam.py index da8007ce1e..a3c9f45de6 100644 --- a/pyrit/scenario/scenarios/airt/scam.py +++ b/pyrit/scenario/scenarios/airt/scam.py @@ -83,7 +83,16 @@ class Scam(Scenario): """ VERSION: int = 1 - COMPOSITE_SCORER_QUESTIONS_PATH = SCORER_SEED_PROMPT_PATH / "true_false_question" / "scams.yaml" + + @classmethod + def get_override_composite_scorer_questions_path(cls) -> list[Path]: + """ + Override true/false question paths for scam objective scoring. + + Returns: + Sequence[Path]: Paths to true/false question paths for scam objective scoring. 
+ """ + return [SCORER_SEED_PROMPT_PATH / "true_false_question" / "scams.yaml"] @classmethod def get_strategy_class(cls) -> type[ScenarioStrategy]: diff --git a/tests/unit/scenario/test_scenario.py b/tests/unit/scenario/test_scenario.py index 7e1b1c2e64..303bc168f1 100644 --- a/tests/unit/scenario/test_scenario.py +++ b/tests/unit/scenario/test_scenario.py @@ -854,9 +854,9 @@ def test_returns_registry_scorer_when_tagged(self, mock_registry_cls) -> None: mock_registry.get_by_tag.return_value = [mock_entry] mock_registry_cls.get_registry_singleton.return_value = mock_registry - # Mock self with COMPOSITE_SCORER_QUESTIONS_PATH = None + # Mock self with get_override_composite_scorer_questions_path returning empty sequence mock_self = MagicMock() - type(mock_self).COMPOSITE_SCORER_QUESTIONS_PATH = None + type(mock_self).get_override_composite_scorer_questions_path = classmethod(lambda cls: []) result = Scenario._get_default_objective_scorer(mock_self) assert result is mock_scorer @@ -871,9 +871,9 @@ def test_returns_fallback_when_registry_empty(self, mock_registry_cls, mock_get_ mock_registry.get_by_tag.return_value = [] mock_registry_cls.get_registry_singleton.return_value = mock_registry - # Mock self with COMPOSITE_SCORER_QUESTIONS_PATH = None + # Mock self with get_override_composite_scorer_questions_path returning empty sequence mock_self = MagicMock() - type(mock_self).COMPOSITE_SCORER_QUESTIONS_PATH = None + type(mock_self).get_override_composite_scorer_questions_path = classmethod(lambda cls: []) result = Scenario._get_default_objective_scorer(mock_self) assert isinstance(result, TrueFalseInverterScorer) From ea8acd860a9f4e65859f811d18580840b73935a8 Mon Sep 17 00:00:00 2001 From: Behnam Ousat Date: Thu, 7 May 2026 12:15:10 -0700 Subject: [PATCH 08/10] warn --- pyrit/scenario/core/scenario.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyrit/scenario/core/scenario.py b/pyrit/scenario/core/scenario.py index 98ec21c469..ca81437b68 100644 --- 
a/pyrit/scenario/core/scenario.py +++ b/pyrit/scenario/core/scenario.py @@ -358,7 +358,7 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer: chat_target = get_default_scorer_target() scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) - logger.info(f"Using fallback default objective scorer: {type(scorer).__name__}") + logger.warning(f"Using fallback default objective scorer: {type(scorer).__name__}") return scorer def set_params_from_args(self, *, args: dict[str, Any]) -> None: From 1fddeb940c3bee334d64cf4362d9d6a9bc7d669e Mon Sep 17 00:00:00 2001 From: Behnam Ousat Date: Thu, 7 May 2026 13:26:13 -0700 Subject: [PATCH 09/10] register main and fallback scorers in scorer initializer --- .../setup/initializers/components/scorers.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pyrit/setup/initializers/components/scorers.py b/pyrit/setup/initializers/components/scorers.py index 3b2d3aee6c..92c4899313 100644 --- a/pyrit/setup/initializers/components/scorers.py +++ b/pyrit/setup/initializers/components/scorers.py @@ -76,6 +76,8 @@ class ScorerInitializerTags(str, Enum): # Target registry names used by scorer configurations. +MAIN_SCORER_TARGET: str = "objective_scorer_chat" +FALLBACK_SCORER_TARGET: str = "openai_chat" GPT4O_TARGET: str = "azure_openai_gpt4o" GPT4O_TEMP0_TARGET: str = "azure_openai_gpt4o_temp0" GPT4O_TEMP9_TARGET: str = "azure_openai_gpt4o_temp9" @@ -86,6 +88,8 @@ class ScorerInitializerTags(str, Enum): GPT5_1_TARGET: str = "azure_openai_gpt5_1" # Scorer registry names. +MAIN: str = "main" +FALLBACK: str = "fallback" REFUSAL_GPT4O_OBJECTIVE_STRICT: str = "refusal_gpt4o_objective_strict" REFUSAL_GPT4O_OBJECTIVE_LENIENT: str = "refusal_gpt4o_objective_lenient" REFUSAL_GPT4O_NO_OBJECTIVE_STRICT: str = "refusal_gpt4o_no_objective_strict" @@ -173,6 +177,7 @@ async def initialize_async(self) -> None: "Ensure TargetInitializer is included in the initializers list." 
) + self._register_fallback_scorers() self._register_refusal_scorers() self._register_scale_scorers() self._register_acs_threshold_scorers() @@ -187,6 +192,27 @@ async def initialize_async(self) -> None: # Core scorer registration # --------------------------------------------------------------------------- + def _register_fallback_scorers(self) -> None: + """ + Register scorers used as fallback in scenarios. + """ + main = self._get_chat_target(MAIN_SCORER_TARGET) + fallback = self._get_chat_target(FALLBACK_SCORER_TARGET) + self._try_register( + name=MAIN, + factory=lambda: TrueFalseInverterScorer( + scorer=SelfAskRefusalScorer(chat_target=self._require_dependency(main, name=MAIN_SCORER_TARGET)) + ), + required_targets=[main], + ) + self._try_register( + name=FALLBACK, + factory=lambda: TrueFalseInverterScorer( + scorer=SelfAskRefusalScorer(chat_target=self._require_dependency(fallback, name=FALLBACK_SCORER_TARGET)) + ), + required_targets=[fallback], + ) + def _register_refusal_scorers(self) -> None: """ Register base refusal scorer variants and tag the best one. From 00c839b87a2de96b832b6075e4001b76712f1ef6 Mon Sep 17 00:00:00 2001 From: Behnam Ousat Date: Thu, 7 May 2026 13:51:29 -0700 Subject: [PATCH 10/10] use the registry default scorer's target if available --- pyrit/scenario/core/scenario.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/pyrit/scenario/core/scenario.py b/pyrit/scenario/core/scenario.py index ca81437b68..95ee8dd227 100644 --- a/pyrit/scenario/core/scenario.py +++ b/pyrit/scenario/core/scenario.py @@ -331,10 +331,24 @@ def _build_display_group(self, *, technique_name: str, seed_group_name: str) -> return technique_name def _get_default_objective_scorer(self) -> TrueFalseScorer: - composite_scorer_questions_paths = type(self).get_override_composite_scorer_questions_path() + # Deferred import to avoid circular dependency. 
+ from pyrit.setup.initializers.components.scorers import ScorerInitializerTags + + # first check if the registry has a default objective scorer + # if available either itself, or its chat target will be used + chat_target: PromptTarget | None = None + registry_default_scorer: TrueFalseScorer | None = None + entries = ScorerRegistry.get_registry_singleton().get_by_tag(tag=ScorerInitializerTags.DEFAULT_OBJECTIVE_SCORER) + if entries and isinstance(entries[0].instance, TrueFalseScorer): + registry_default_scorer = entries[0].instance + chat_target = registry_default_scorer.get_chat_target() + logger.info(f"The registry contains default objective scorer: {type(registry_default_scorer).__name__}") + + chat_target = chat_target or get_default_scorer_target() + # if the scenario has override composite scorer questions, use them to build a composite scorer + composite_scorer_questions_paths = type(self).get_override_composite_scorer_questions_path() if composite_scorer_questions_paths: - chat_target = get_default_scorer_target() path_scorers: list[TrueFalseScorer] = [ SelfAskTrueFalseScorer(chat_target=chat_target, true_false_question_path=path) for path in composite_scorer_questions_paths @@ -347,16 +361,10 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer: logger.info(f"Using composite default objective scorer: {type(scorer).__name__}") return scorer - # Deferred import to avoid circular dependency. 
- from pyrit.setup.initializers.components.scorers import ScorerInitializerTags - - entries = ScorerRegistry.get_registry_singleton().get_by_tag(tag=ScorerInitializerTags.DEFAULT_OBJECTIVE_SCORER) - if entries and isinstance(entries[0].instance, TrueFalseScorer): - scorer = entries[0].instance - logger.info(f"Using registered default objective scorer: {type(scorer).__name__}") - return scorer + if registry_default_scorer: + logger.info(f"Using registry default objective scorer: {type(registry_default_scorer).__name__}") + return registry_default_scorer - chat_target = get_default_scorer_target() scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) logger.warning(f"Using fallback default objective scorer: {type(scorer).__name__}") return scorer