Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .env_example
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ ADVERSARIAL_CHAT_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1"
ADVERSARIAL_CHAT_KEY="xxxxx"
ADVERSARIAL_CHAT_MODEL="deployment-name"

# Objective Scorer chat target (used in scorers in scenarios)
Comment thread
behnam-o marked this conversation as resolved.
OBJECTIVE_SCORER_CHAT_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1"
OBJECTIVE_SCORER_CHAT_KEY="xxxxx"
OBJECTIVE_SCORER_CHAT_MODEL="deployment-name"
Comment on lines +83 to +85
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just making sure this is updated in our global env and probably should be whichever endpoint our best performing scorer uses


AZURE_FOUNDRY_DEEPSEEK_ENDPOINT="https://xxxxx.eastus2.models.ai.azure.com"
AZURE_FOUNDRY_DEEPSEEK_KEY="xxxxx"
AZURE_FOUNDRY_DEEPSEEK_MODEL=""
Expand Down
5 changes: 3 additions & 2 deletions pyrit/scenario/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
from pyrit.scenario.core.dataset_configuration import EXPLICIT_SEED_GROUPS_KEY, DatasetConfiguration
from pyrit.scenario.core.scenario import Scenario
from pyrit.scenario.core.scenario_strategy import ScenarioCompositeStrategy, ScenarioStrategy
from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target, get_default_scorer_target
from pyrit.scenario.core.scenario_techniques import (
SCENARIO_TECHNIQUES,
get_default_adversarial_target,
register_scenario_techniques,
)

Expand All @@ -28,6 +28,7 @@
"ScenarioCompositeStrategy",
"ScenarioStrategy",
"ScorerOverridePolicy",
"get_default_adversarial_target",
"register_scenario_techniques",
"get_default_scorer_target",
"get_default_adversarial_target",
]
63 changes: 55 additions & 8 deletions pyrit/scenario/core/scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import uuid
from abc import ABC, abstractmethod
from collections.abc import Sequence
from pathlib import Path
from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union, cast, get_origin

from tqdm.auto import tqdm
Expand All @@ -27,14 +28,20 @@
from pyrit.memory.memory_models import ScenarioResultEntry
from pyrit.models import AttackResult, SeedAttackGroup
from pyrit.models.scenario_result import ScenarioIdentifier, ScenarioResult
from pyrit.prompt_target import OpenAIChatTarget, PromptTarget
from pyrit.prompt_target import PromptTarget
from pyrit.prompt_target.common.target_requirements import TargetRequirements
from pyrit.registry import ScorerRegistry
from pyrit.registry.object_registries.scorer_registry import ScorerRegistry
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a VERY long import from another module. Can we make it work with pyrit.registry? There aren't that many, are there?

from pyrit.scenario.core.atomic_attack import AtomicAttack
from pyrit.scenario.core.attack_technique import AttackTechnique
from pyrit.scenario.core.dataset_configuration import DatasetConfiguration
from pyrit.scenario.core.scenario_strategy import ScenarioStrategy
from pyrit.score import Scorer, SelfAskRefusalScorer, TrueFalseInverterScorer, TrueFalseScorer
from pyrit.scenario.core.scenario_target_defaults import get_default_scorer_target
from pyrit.score import Scorer, TrueFalseScorer
from pyrit.score.true_false.self_ask_refusal_scorer import SelfAskRefusalScorer
from pyrit.score.true_false.self_ask_true_false_scorer import SelfAskTrueFalseScorer
from pyrit.score.true_false.true_false_composite_scorer import TrueFalseCompositeScorer
from pyrit.score.true_false.true_false_inverter_scorer import TrueFalseInverterScorer
from pyrit.score.true_false.true_false_score_aggregator import TrueFalseScoreAggregator
Comment on lines +40 to +44
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here about short imports.


if TYPE_CHECKING:
from pyrit.executor.attack.core.attack_config import AttackScoringConfig
Expand Down Expand Up @@ -107,6 +114,20 @@ class Scenario(ABC):
#: what the scenario needs. Validated in ``initialize_async`` once the target is supplied.
TARGET_REQUIREMENTS: ClassVar[TargetRequirements] = TargetRequirements()

@classmethod
def get_override_composite_scorer_questions_path(cls) -> Sequence[Path]:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def get_override_composite_scorer_questions_path(cls) -> Sequence[Path]:
def get_override_composite_scorer_questions_paths(cls) -> Sequence[Path]:

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also NIT: would it be more clear to have this be get_override_default_scorer_questions_paths (to make it more obvious it's for the default scorer) or maybe get_override_true_false_scorer_questions_paths since it applies to the true_false scorer portion of the composite.

"""
Override to provide true/false question prompt paths for objective scoring.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Override to provide true/false question prompt paths for objective scoring.
Get the true/false question prompt paths for objective scoring.

I think the override part is redundant, if a class implements this, it is overridden inherently


When overridden to return a non-empty sequence, the default objective scorer becomes
one ``SelfAskTrueFalseScorer`` per path AND-ed together with ``NOT(SelfAskRefusalScorer)``
instead of the scenario-level default.
Comment on lines +122 to +124
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I feel like this part is confusing here because the logic for this is not in this specific function and this should be moved to the doc string for _get_default_objective_scorer


Returns:
Sequence[Path]: Paths to true/false question prompts, or an empty sequence to use the default scorer.
"""
return []
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so this function subclasses one input — a list of paths — but silently determines:

  1. The per-path scorer type (always SelfAskTrueFalseScorer)
  2. The aggregator (always AND)
  3. That a NOT(SelfAskRefusalScorer) backstop is appended
  4. The chat target (via a 3-step fallback inside _get_default_objective_scorer

From the subclass author's perspective in scam.py:88-95, leakage.py:104, cyber.py:64, they "return paths" — and have no idea (without reading the base class) what scorer actually gets built. The method name suggests the mechanism ("composite") while hiding the behavior. so why not have something like :

def _get_default_objective_scorer(self) -> TrueFalseScorer:
    return self._build_composite_with_refusal_backstop(
        chat_target=self._resolve_default_scorer_chat_target(),
        true_false_question_paths=[SCORER_SEED_PROMPT_PATH / "true_false_question" / "scams.yaml"],
    )

@staticmethod
def _build_composite_with_refusal_backstop(
    *,
    chat_target: PromptChatTarget,
    true_false_question_paths: Sequence[Path],
) -> TrueFalseCompositeScorer:
    path_scorers = [
        SelfAskTrueFalseScorer(chat_target=chat_target, true_false_question_path=p)
        for p in true_false_question_paths
    ]
    backstop = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target))
    return TrueFalseCompositeScorer(
        aggregator=TrueFalseScoreAggregator.AND,
        scorers=[*path_scorers, backstop],
    )

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My reasoning behind this is ... as an author of a new scenario: "I don't want to think about what kind of scorer is used by default in my scenario ... just give me whatever all scenarios use, and btw, here's a list of questions I care about ... I'm happy with whatever other scenarios are doing with it"

and if a scenario author truely cares about the specific default scorer used for their scenario, they can still override _get_default_objective_scorer

(and if a user, as opposed to author, of a scenario wants a specific scorer, they can create one themselves and pass it as an init param)

that's why I take out the "construction of the scorers" out of individual scenarios, and only leave a way for them to specify what questions they want to be considered for scoring ... and if the maintainers decide that using a different type of scorer is more appropriate as default, scenario authors don't care and don't need to update their default scorer construction ...

I think maybe the method name could actually just be "additional_scoring_questions" or something like that, to indicate these are something the base scenario will "consider", but the author doesn't need to know how exactly.

Am I making sense? :D

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"the subclass authors have no idea what scorer actually gets built." is the goal and a good thing in my opinion ...

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gotcha yeah i guess i forgot that a user could pass in their own non-default scorer, so this is just the case that the user doesn't pass in any scorer and then we create a custom scorer using the paths. I do think the function should be renamed to default_true_false_question_paths (we have a weird inconsistency with using get_* and default_* in our class functions) and leave the scorer portion out of it but i won't die on that hill 😅


def __init__(
self,
*,
Expand Down Expand Up @@ -310,16 +331,42 @@ def _build_display_group(self, *, technique_name: str, seed_group_name: str) ->
return technique_name

def _get_default_objective_scorer(self) -> TrueFalseScorer:
# Deferred import to avoid circular dependency:
# Deferred import to avoid circular dependency.
from pyrit.setup.initializers.components.scorers import ScorerInitializerTags

# first check if the registry has a default objective scorer
# if available either itself, or its chat target will be used
chat_target: PromptTarget | None = None
registry_default_scorer: TrueFalseScorer | None = None
entries = ScorerRegistry.get_registry_singleton().get_by_tag(tag=ScorerInitializerTags.DEFAULT_OBJECTIVE_SCORER)
if entries and isinstance(entries[0].instance, TrueFalseScorer):
scorer = entries[0].instance
logger.info(f"Using registered default objective scorer: {type(scorer).__name__}")
registry_default_scorer = entries[0].instance
chat_target = registry_default_scorer.get_chat_target()
logger.info(f"The registry contains default objective scorer: {type(registry_default_scorer).__name__}")

chat_target = chat_target or get_default_scorer_target()

# if the scenario has override composite scorer questions, use them to build a composite scorer
composite_scorer_questions_paths = type(self).get_override_composite_scorer_questions_path()
if composite_scorer_questions_paths:
path_scorers: list[TrueFalseScorer] = [
SelfAskTrueFalseScorer(chat_target=chat_target, true_false_question_path=path)
for path in composite_scorer_questions_paths
]
backstop_scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target))
scorer = TrueFalseCompositeScorer(
aggregator=TrueFalseScoreAggregator.AND,
scorers=[*path_scorers, backstop_scorer],
)
logger.info(f"Using composite default objective scorer: {type(scorer).__name__}")
return scorer
scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=OpenAIChatTarget()))
logger.info(f"No registered default objective scorer found, using fallback: {type(scorer).__name__}")

if registry_default_scorer:
logger.info(f"Using registry default objective scorer: {type(registry_default_scorer).__name__}")
return registry_default_scorer

scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target))
logger.warning(f"Using fallback default objective scorer: {type(scorer).__name__}")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in all the logging for this method, would it also be helpful to output target information?

return scorer

def set_params_from_args(self, *, args: dict[str, Any]) -> None:
Expand Down
98 changes: 98 additions & 0 deletions pyrit/scenario/core/scenario_target_defaults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging

from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget
from pyrit.prompt_target.common.target_capabilities import CapabilityName
from pyrit.registry import TargetRegistry

logger = logging.getLogger(__name__)


def get_default_scorer_target() -> PromptChatTarget:
Comment thread
behnam-o marked this conversation as resolved.
"""
Resolve the default objective scorer chat target.

First checks the ``TargetRegistry`` for an ``"objective_scorer_chat"`` entry
(populated by ``TargetInitializer`` from ``OBJECTIVE_SCORER_CHAT_*`` env vars).
Falls back to a plain ``OpenAIChatTarget``

Returns:
PromptChatTarget: The resolved objective scorer chat target.

Raises:
ValueError: If the registered target does not support multi-turn.
"""
return _get_default_chat_target(preferred_target_key="objective_scorer_chat")

Comment thread
behnam-o marked this conversation as resolved.

def get_default_adversarial_target() -> PromptChatTarget:
"""
Resolve the default adversarial chat target.

First checks the ``TargetRegistry`` for an ``"adversarial_chat"`` entry
(populated by ``TargetInitializer`` from ``ADVERSARIAL_CHAT_*`` env vars).
Falls back to a default fallback target with temperature=1.2

Returns:
PromptChatTarget: The resolved adversarial chat target.

Raises:
ValueError: If the registered target does not support multi-turn.
"""
return _get_default_chat_target(
preferred_target_key="adversarial_chat",
required_capabilities={CapabilityName.MULTI_TURN},
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does SYSTEM_PROMPT also need to be a require_capability? It seems like most of our multi-turn attacks require it.

fallback_temperature=1.2,
)


def _get_default_chat_target(
*,
preferred_target_key: str,
required_capabilities: set[CapabilityName] | None = None,
fallback_temperature: float | None = None,
) -> PromptChatTarget:
"""
Resolve a chat target from TargetRegistry with configurable fallback behavior.

Resolution order:
1. ``preferred_target_key`` entry from ``TargetRegistry``
2. ``OpenAIChatTarget(...)`` with optional temperature

Args:
preferred_target_key (str): TargetRegistry key to resolve first.
required_capabilities (set[CapabilityName] | None): Optional capabilities
that a resolved target must support.
fallback_temperature (float | None): Optional temperature for fallback
``OpenAIChatTarget`` construction.

Returns:
PromptChatTarget: The resolved chat target.

Raises:
ValueError: If the resolved target does not satisfy required capabilities.
ValueError: If the registry entry exists but is not a PromptChatTarget.
"""
registry = TargetRegistry.get_registry_singleton()
target = registry.get(preferred_target_key)
if target is not None:
# Check required capabilities first (fail fast)
if required_capabilities:
for capability in required_capabilities:
if not target.capabilities.includes(capability=capability):
raise ValueError(f"Registry entry '{preferred_target_key}' must support {capability.value}.")

# Then check type
if not isinstance(target, PromptChatTarget):
raise ValueError(
f"Registry entry '{preferred_target_key}' must be a PromptChatTarget, but got {type(target).__name__}"
)

return target

logger.warning(
f"TargetRegistry entry '{preferred_target_key}' not found. Falling back to default OpenAIChatTarget."
)
return OpenAIChatTarget(temperature=fallback_temperature)
41 changes: 5 additions & 36 deletions pyrit/scenario/core/scenario_techniques.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import inspect
import logging
from pathlib import Path
from typing import TYPE_CHECKING

from pyrit.common.path import EXECUTOR_SEED_PROMPT_PATH
from pyrit.executor.attack import (
Expand All @@ -35,13 +36,15 @@
)
from pyrit.models import SeedAttackTechniqueGroup, SeedSimulatedConversation
from pyrit.models.seeds.seed_simulated_conversation import NextMessageSystemPromptPaths
from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget
from pyrit.prompt_target.common.target_capabilities import CapabilityName
from pyrit.registry import TargetRegistry
from pyrit.registry.object_registries.attack_technique_registry import (
AttackTechniqueRegistry,
AttackTechniqueSpec,
)
from pyrit.scenario.core.scenario_target_defaults import get_default_adversarial_target

if TYPE_CHECKING:
from pyrit.prompt_target import PromptChatTarget

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -104,40 +107,6 @@
]


# ---------------------------------------------------------------------------
# Default adversarial target
# ---------------------------------------------------------------------------


def get_default_adversarial_target() -> PromptChatTarget:
"""
Resolve the default adversarial chat target.

First checks the ``TargetRegistry`` for an ``"adversarial_chat"`` entry
(populated by ``TargetInitializer`` from ``ADVERSARIAL_CHAT_*`` env vars).
Falls back to a plain ``OpenAIChatTarget(temperature=1.2)`` using
``@apply_defaults`` resolution.

Returns:
PromptChatTarget: The resolved adversarial chat target.

Raises:
ValueError: If the registered target does not support multi-turn.
"""
registry = TargetRegistry.get_registry_singleton()
if "adversarial_chat" in registry:
target = registry.get("adversarial_chat")
if target:
if not target.capabilities.includes(capability=CapabilityName.MULTI_TURN):
raise ValueError(
f"Registry entry 'adversarial_chat' must support multi-turn conversations, "
f"but {type(target).__name__} does not."
)
return target

return OpenAIChatTarget(temperature=1.2)


# ---------------------------------------------------------------------------
# Runtime spec builder
# ---------------------------------------------------------------------------
Expand Down
51 changes: 13 additions & 38 deletions pyrit/scenario/scenarios/airt/cyber.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,14 @@

from pyrit.common import apply_defaults
from pyrit.common.path import SCORER_SEED_PROMPT_PATH
from pyrit.prompt_target import OpenAIChatTarget
from pyrit.scenario.core.dataset_configuration import DatasetConfiguration
from pyrit.scenario.core.scenario import Scenario
from pyrit.score import (
SelfAskRefusalScorer,
SelfAskTrueFalseScorer,
TrueFalseCompositeScorer,
TrueFalseInverterScorer,
TrueFalseScoreAggregator,
TrueFalseScorer,
)

if TYPE_CHECKING:
from pathlib import Path

from pyrit.scenario.core.scenario_strategy import ScenarioStrategy
from pyrit.score import TrueFalseScorer

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -66,6 +60,16 @@ class Cyber(Scenario):
VERSION: int = 2
_cached_strategy_class: ClassVar[type[ScenarioStrategy] | None] = None

@classmethod
def get_override_composite_scorer_questions_path(cls) -> list[Path]:
"""
Override true/false question paths for cyber objective scoring.

Returns:
Sequence[Path]: Paths to true/false question paths for cyber objective scoring.
"""
return [SCORER_SEED_PROMPT_PATH / "true_false_question" / "malware.yaml"]

@classmethod
def get_strategy_class(cls) -> type[ScenarioStrategy]:
"""
Expand Down Expand Up @@ -128,32 +132,3 @@ def __init__(
include_default_baseline=include_baseline,
scenario_result_id=scenario_result_id,
)

def _get_default_objective_scorer(self) -> TrueFalseCompositeScorer:
"""
Retrieve the default objective scorer.

The scoring mechanism is as follows:
1. Detect for the presence of malware using a SelfAskTrueFalseScorer.
2. Detect for NON-refusal as a backstop. (If the target model refuses to produce malware, we want that
to register as a negative score).

The chat target is resolved from the base class default scorer if available,
otherwise falls back to OpenAIChatTarget().

Returns:
TrueFalseCompositeScorer: Default objective scorer with backstop and malware evaluation.
"""
base_scorer = super()._get_default_objective_scorer()
chat_target = base_scorer.get_chat_target() or OpenAIChatTarget()

presence_of_malware = SelfAskTrueFalseScorer(
chat_target=chat_target,
true_false_question_path=SCORER_SEED_PROMPT_PATH / "true_false_question" / "malware.yaml",
)

backstop = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target))

return TrueFalseCompositeScorer(
aggregator=TrueFalseScoreAggregator.AND, scorers=[presence_of_malware, backstop]
)
Loading
Loading