-
Notifications
You must be signed in to change notification settings - Fork 754
MAINT unify how default adversarial and scorer targets are set in scenarios #1695
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
8477420
5545543
b26f553
3e22987
6921832
11cb369
07c8b56
19a495e
52dc2a2
ea8acd8
1fddeb9
00c839b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -79,6 +79,11 @@ ADVERSARIAL_CHAT_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1" | |
| ADVERSARIAL_CHAT_KEY="xxxxx" | ||
| ADVERSARIAL_CHAT_MODEL="deployment-name" | ||
|
|
||
| # Objective Scorer chat target (used in scorers in scenarios) | ||
| OBJECTIVE_SCORER_CHAT_ENDPOINT="https://xxxxx.openai.azure.com/openai/v1" | ||
| OBJECTIVE_SCORER_CHAT_KEY="xxxxx" | ||
| OBJECTIVE_SCORER_CHAT_MODEL="deployment-name" | ||
|
Comment on lines
+83
to
+85
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just making sure this is updated in our global env and probably should be whichever endpoint our best performing scorer uses |
||
|
|
||
| AZURE_FOUNDRY_DEEPSEEK_ENDPOINT="https://xxxxx.eastus2.models.ai.azure.com" | ||
| AZURE_FOUNDRY_DEEPSEEK_KEY="xxxxx" | ||
| AZURE_FOUNDRY_DEEPSEEK_MODEL="" | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -16,6 +16,7 @@ | |||||
| import uuid | ||||||
| from abc import ABC, abstractmethod | ||||||
| from collections.abc import Sequence | ||||||
| from pathlib import Path | ||||||
| from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union, cast, get_origin | ||||||
|
|
||||||
| from tqdm.auto import tqdm | ||||||
|
|
@@ -27,14 +28,20 @@ | |||||
| from pyrit.memory.memory_models import ScenarioResultEntry | ||||||
| from pyrit.models import AttackResult, SeedAttackGroup | ||||||
| from pyrit.models.scenario_result import ScenarioIdentifier, ScenarioResult | ||||||
| from pyrit.prompt_target import OpenAIChatTarget, PromptTarget | ||||||
| from pyrit.prompt_target import PromptTarget | ||||||
| from pyrit.prompt_target.common.target_requirements import TargetRequirements | ||||||
| from pyrit.registry import ScorerRegistry | ||||||
| from pyrit.registry.object_registries.scorer_registry import ScorerRegistry | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a VERY long import from another module. Can we make it work with |
||||||
| from pyrit.scenario.core.atomic_attack import AtomicAttack | ||||||
| from pyrit.scenario.core.attack_technique import AttackTechnique | ||||||
| from pyrit.scenario.core.dataset_configuration import DatasetConfiguration | ||||||
| from pyrit.scenario.core.scenario_strategy import ScenarioStrategy | ||||||
| from pyrit.score import Scorer, SelfAskRefusalScorer, TrueFalseInverterScorer, TrueFalseScorer | ||||||
| from pyrit.scenario.core.scenario_target_defaults import get_default_scorer_target | ||||||
| from pyrit.score import Scorer, TrueFalseScorer | ||||||
| from pyrit.score.true_false.self_ask_refusal_scorer import SelfAskRefusalScorer | ||||||
| from pyrit.score.true_false.self_ask_true_false_scorer import SelfAskTrueFalseScorer | ||||||
| from pyrit.score.true_false.true_false_composite_scorer import TrueFalseCompositeScorer | ||||||
| from pyrit.score.true_false.true_false_inverter_scorer import TrueFalseInverterScorer | ||||||
| from pyrit.score.true_false.true_false_score_aggregator import TrueFalseScoreAggregator | ||||||
|
Comment on lines
+40
to
+44
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same here about short imports. |
||||||
|
|
||||||
| if TYPE_CHECKING: | ||||||
| from pyrit.executor.attack.core.attack_config import AttackScoringConfig | ||||||
|
|
@@ -107,6 +114,20 @@ class Scenario(ABC): | |||||
| #: what the scenario needs. Validated in ``initialize_async`` once the target is supplied. | ||||||
| TARGET_REQUIREMENTS: ClassVar[TargetRequirements] = TargetRequirements() | ||||||
|
|
||||||
| @classmethod | ||||||
| def get_override_composite_scorer_questions_path(cls) -> Sequence[Path]: | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also NIT: would it be more clear to have this be |
||||||
| """ | ||||||
| Override to provide true/false question prompt paths for objective scoring. | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
I think the override part is redundant, if a class implements this, it is overridden inherently |
||||||
|
|
||||||
| When overridden to return a non-empty sequence, the default objective scorer becomes | ||||||
| one ``SelfAskTrueFalseScorer`` per path AND-ed together with ``NOT(SelfAskRefusalScorer)`` | ||||||
| instead of the scenario-level default. | ||||||
|
Comment on lines
+122
to
+124
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: I feel like this part is confusing here because the logic for this is not in this specific function and this should be moved to the doc string for _get_default_objective_scorer |
||||||
|
|
||||||
| Returns: | ||||||
| Sequence[Path]: Paths to true/false question prompts, or an empty sequence to use the default scorer. | ||||||
| """ | ||||||
| return [] | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so this function subclasses one input — a list of paths — but silently determines:
From the subclass author's perspective in scam.py:88-95, leakage.py:104, cyber.py:64, they "return paths" — and have no idea (without reading the base class) what scorer actually gets built. The method name suggests the mechanism ("composite") while hiding the behavior. so why not have something like :
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My reasoning behind this is ... as an author of a new scenario: "I don't want to think about what kind of scorer is used by default in my scenario ... just give me whatever all scenarios use, and btw, here's a list of questions I care about ... I'm happy with whatever other scenarios are doing with it" and if a scenario author truely cares about the specific default scorer used for their scenario, they can still override _get_default_objective_scorer (and if a user, as opposed to author, of a scenario wants a specific scorer, they can create one themselves and pass it as an init param) that's why I take out the "construction of the scorers" out of individual scenarios, and only leave a way for them to specify what questions they want to be considered for scoring ... and if the maintainers decide that using a different type of scorer is more appropriate as default, scenario authors don't care and don't need to update their default scorer construction ... I think maybe the method name could actually just be "additional_scoring_questions" or something like that, to indicate these are something the base scenario will "consider", but the author doesn't need to know how exactly. Am I making sense? :D
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "the subclass authors have no idea what scorer actually gets built." is the goal and a good thing in my opinion ...
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. gotcha yeah i guess i forgot that a user could pass in their own non-default scorer, so this is just the case that the user doesn't pass in any scorer and then we create a custom scorer using the paths. I do think the function should be renamed to default_true_false_question_paths (we have a weird inconsistency with using get_* and default_* in our class functions) and leave the scorer portion out of it but i won't die on that hill 😅 |
||||||
|
|
||||||
| def __init__( | ||||||
| self, | ||||||
| *, | ||||||
|
|
@@ -310,16 +331,42 @@ def _build_display_group(self, *, technique_name: str, seed_group_name: str) -> | |||||
| return technique_name | ||||||
|
|
||||||
| def _get_default_objective_scorer(self) -> TrueFalseScorer: | ||||||
| # Deferred import to avoid circular dependency: | ||||||
| # Deferred import to avoid circular dependency. | ||||||
| from pyrit.setup.initializers.components.scorers import ScorerInitializerTags | ||||||
|
|
||||||
| # first check if the registry has a default objective scorer | ||||||
| # if available either itself, or its chat target will be used | ||||||
| chat_target: PromptTarget | None = None | ||||||
| registry_default_scorer: TrueFalseScorer | None = None | ||||||
| entries = ScorerRegistry.get_registry_singleton().get_by_tag(tag=ScorerInitializerTags.DEFAULT_OBJECTIVE_SCORER) | ||||||
| if entries and isinstance(entries[0].instance, TrueFalseScorer): | ||||||
| scorer = entries[0].instance | ||||||
| logger.info(f"Using registered default objective scorer: {type(scorer).__name__}") | ||||||
| registry_default_scorer = entries[0].instance | ||||||
| chat_target = registry_default_scorer.get_chat_target() | ||||||
| logger.info(f"The registry contains default objective scorer: {type(registry_default_scorer).__name__}") | ||||||
|
|
||||||
| chat_target = chat_target or get_default_scorer_target() | ||||||
|
|
||||||
| # if the scenario has override composite scorer questions, use them to build a composite scorer | ||||||
| composite_scorer_questions_paths = type(self).get_override_composite_scorer_questions_path() | ||||||
| if composite_scorer_questions_paths: | ||||||
| path_scorers: list[TrueFalseScorer] = [ | ||||||
| SelfAskTrueFalseScorer(chat_target=chat_target, true_false_question_path=path) | ||||||
| for path in composite_scorer_questions_paths | ||||||
| ] | ||||||
| backstop_scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) | ||||||
| scorer = TrueFalseCompositeScorer( | ||||||
| aggregator=TrueFalseScoreAggregator.AND, | ||||||
| scorers=[*path_scorers, backstop_scorer], | ||||||
| ) | ||||||
| logger.info(f"Using composite default objective scorer: {type(scorer).__name__}") | ||||||
| return scorer | ||||||
| scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=OpenAIChatTarget())) | ||||||
| logger.info(f"No registered default objective scorer found, using fallback: {type(scorer).__name__}") | ||||||
|
|
||||||
| if registry_default_scorer: | ||||||
| logger.info(f"Using registry default objective scorer: {type(registry_default_scorer).__name__}") | ||||||
| return registry_default_scorer | ||||||
|
|
||||||
| scorer = TrueFalseInverterScorer(scorer=SelfAskRefusalScorer(chat_target=chat_target)) | ||||||
| logger.warning(f"Using fallback default objective scorer: {type(scorer).__name__}") | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. in all the logging for this method, would it also be helpful to output target information? |
||||||
| return scorer | ||||||
|
|
||||||
| def set_params_from_args(self, *, args: dict[str, Any]) -> None: | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,98 @@ | ||
| # Copyright (c) Microsoft Corporation. | ||
| # Licensed under the MIT license. | ||
|
|
||
| import logging | ||
|
|
||
| from pyrit.prompt_target import OpenAIChatTarget, PromptChatTarget | ||
| from pyrit.prompt_target.common.target_capabilities import CapabilityName | ||
| from pyrit.registry import TargetRegistry | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| def get_default_scorer_target() -> PromptChatTarget: | ||
|
behnam-o marked this conversation as resolved.
|
||
| """ | ||
| Resolve the default objective scorer chat target. | ||
|
|
||
| First checks the ``TargetRegistry`` for an ``"objective_scorer_chat"`` entry | ||
| (populated by ``TargetInitializer`` from ``OBJECTIVE_SCORER_CHAT_*`` env vars). | ||
| Falls back to a plain ``OpenAIChatTarget`` | ||
|
|
||
| Returns: | ||
| PromptChatTarget: The resolved objective scorer chat target. | ||
|
|
||
| Raises: | ||
| ValueError: If the registered target does not support multi-turn. | ||
| """ | ||
| return _get_default_chat_target(preferred_target_key="objective_scorer_chat") | ||
|
|
||
|
behnam-o marked this conversation as resolved.
|
||
|
|
||
| def get_default_adversarial_target() -> PromptChatTarget: | ||
| """ | ||
| Resolve the default adversarial chat target. | ||
|
|
||
| First checks the ``TargetRegistry`` for an ``"adversarial_chat"`` entry | ||
| (populated by ``TargetInitializer`` from ``ADVERSARIAL_CHAT_*`` env vars). | ||
| Falls back to a default fallback target with temperature=1.2 | ||
|
|
||
| Returns: | ||
| PromptChatTarget: The resolved adversarial chat target. | ||
|
|
||
| Raises: | ||
| ValueError: If the registered target does not support multi-turn. | ||
| """ | ||
| return _get_default_chat_target( | ||
| preferred_target_key="adversarial_chat", | ||
| required_capabilities={CapabilityName.MULTI_TURN}, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does SYSTEM_PROMPT also need to be a require_capability? It seems like most of our multi-turn attacks require it. |
||
| fallback_temperature=1.2, | ||
| ) | ||
|
|
||
|
|
||
| def _get_default_chat_target( | ||
| *, | ||
| preferred_target_key: str, | ||
| required_capabilities: set[CapabilityName] | None = None, | ||
| fallback_temperature: float | None = None, | ||
| ) -> PromptChatTarget: | ||
| """ | ||
| Resolve a chat target from TargetRegistry with configurable fallback behavior. | ||
|
|
||
| Resolution order: | ||
| 1. ``preferred_target_key`` entry from ``TargetRegistry`` | ||
| 2. ``OpenAIChatTarget(...)`` with optional temperature | ||
|
|
||
| Args: | ||
| preferred_target_key (str): TargetRegistry key to resolve first. | ||
| required_capabilities (set[CapabilityName] | None): Optional capabilities | ||
| that a resolved target must support. | ||
| fallback_temperature (float | None): Optional temperature for fallback | ||
| ``OpenAIChatTarget`` construction. | ||
|
|
||
| Returns: | ||
| PromptChatTarget: The resolved chat target. | ||
|
|
||
| Raises: | ||
| ValueError: If the resolved target does not satisfy required capabilities. | ||
| ValueError: If the registry entry exists but is not a PromptChatTarget. | ||
| """ | ||
| registry = TargetRegistry.get_registry_singleton() | ||
| target = registry.get(preferred_target_key) | ||
| if target is not None: | ||
| # Check required capabilities first (fail fast) | ||
| if required_capabilities: | ||
| for capability in required_capabilities: | ||
| if not target.capabilities.includes(capability=capability): | ||
| raise ValueError(f"Registry entry '{preferred_target_key}' must support {capability.value}.") | ||
|
|
||
| # Then check type | ||
| if not isinstance(target, PromptChatTarget): | ||
| raise ValueError( | ||
| f"Registry entry '{preferred_target_key}' must be a PromptChatTarget, but got {type(target).__name__}" | ||
| ) | ||
|
|
||
| return target | ||
|
|
||
| logger.warning( | ||
| f"TargetRegistry entry '{preferred_target_key}' not found. Falling back to default OpenAIChatTarget." | ||
| ) | ||
| return OpenAIChatTarget(temperature=fallback_temperature) | ||
Uh oh!
There was an error while loading. Please reload this page.