Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pyrit/executor/attack/multi_turn/crescendo.py
Original file line number Diff line number Diff line change
Expand Up @@ -676,7 +676,9 @@ async def _check_refusal_async(self, context: CrescendoAttackContext, objective:
objective=context.objective,
):
scores = await self._refusal_scorer.score_async(
message=context.last_response, objective=objective, skip_on_error_result=False
message=context.last_response,
objective=objective,
skip_on_error_result=False,
)
return scores[0]

Expand Down
10 changes: 10 additions & 0 deletions pyrit/executor/attack/multi_turn/tree_of_attacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,11 @@ def __init__(
corresponding score is assigned instead of invoking the scorer. This prevents
premature branch pruning when targets return blocked/filtered responses.
Defaults to {"blocked": 0.0}. Pass an empty dict to disable.

Note: This check runs before the scorer, so if ``score_blocked_content``
is set on the objective scorer, it will have no effect for error types
present in this map. To evaluate partial content from blocked responses,
pass ``error_score_map={}`` to disable the early-return.
"""
# Store configuration
self._objective_target = objective_target
Expand Down Expand Up @@ -1398,6 +1403,11 @@ def __init__(
content policy violations from image generation targets). Defaults to
{"blocked": 0.0}. Pass an empty dict to disable.

Note: This check runs before the scorer, so if ``score_blocked_content``
is set on the objective scorer, it will have no effect for error types
present in this map. To evaluate partial content from blocked responses,
pass ``error_score_map={}`` to disable the early-return.

Raises:
ValueError: If attack_scoring_config uses a non-FloatScaleThresholdScorer objective scorer,
if the adversarial target does not natively support the capabilities TAP needs,
Expand Down
20 changes: 20 additions & 0 deletions pyrit/prompt_target/openai/openai_chat_target.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,26 @@ def _check_content_filter(self, response: Any) -> bool:
pass
return False

def _extract_partial_content(self, response: Any) -> Optional[str]:
"""
Extract partial content from a Chat Completions response with finish_reason=content_filter.

When Azure Content Safety triggers mid-generation, the model may have produced partial
text in ``response.choices[0].message.content`` before being cut off.

Args:
response: A ChatCompletion object from the OpenAI SDK.

Returns:
The partial text content, or None if no content was generated.
"""
try:
if response.choices and response.choices[0].message and response.choices[0].message.content:
return response.choices[0].message.content
except (AttributeError, IndexError):
pass
return None

def _validate_response(self, response: Any, request: MessagePiece) -> Optional[Message]:
"""
Validate a Chat Completions API response for errors.
Expand Down
30 changes: 29 additions & 1 deletion pyrit/prompt_target/openai/openai_target.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,10 @@ def _handle_content_filter_response(self, response: Any, request: MessagePiece)
"""
Handle content filter errors by creating a proper error Message.

If the subclass provides partial content via ``_extract_partial_content``,
it is attached to each response piece as ``prompt_metadata["partial_content"]``
so that scorers with ``score_blocked_content=True`` can evaluate it.

Args:
response: The response object from OpenAI SDK.
request: The original request message piece.
Expand All @@ -567,13 +571,37 @@ def _handle_content_filter_response(self, response: Any, request: MessagePiece)
Message object with error type indicating content was filtered.
"""
logger.warning("Output content filtered by content policy.")
return handle_bad_request_exception(

partial_content = self._extract_partial_content(response)

error_message = handle_bad_request_exception(
response_text=response.model_dump_json(),
request=request,
error_code=200,
is_content_filter=True,
)

if partial_content:
Comment thread
jsong468 marked this conversation as resolved.
for piece in error_message.message_pieces:
piece.prompt_metadata["partial_content"] = partial_content

return error_message

def _extract_partial_content(self, response: Any) -> Optional[str]:
"""
Extract any partial content the model generated before the content filter triggered.

Override this in subclasses to extract partial content from API-specific response
structures. The base implementation returns None (no partial content).

Args:
response: The response object from OpenAI SDK.

Returns:
The partial text content, or None if no content was generated.
"""
return None

def _validate_response(self, response: Any, request: MessagePiece) -> Optional[Message]:
"""
Validate the response and return error Message if needed.
Expand Down
12 changes: 11 additions & 1 deletion pyrit/score/conversation_scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,17 @@ async def _score_async(self, message: Message, *, objective: Optional[str] = Non
# Only include user and assistant messages in the conversation text
if piece.api_role in ["user", "assistant", "tool"]:
role_display = "Assistant (simulated)" if piece.is_simulated else piece.api_role.capitalize()
conversation_text += f"{role_display}: {piece.converted_value}\n"
# For blocked pieces with partial content, use the partial content
# instead of the error JSON when score_blocked_content is enabled
if (
self.score_blocked_content
and piece.is_blocked()
and piece.prompt_metadata.get("partial_content")
):
text = str(piece.prompt_metadata["partial_content"])
else:
text = piece.converted_value
conversation_text += f"{role_display}: {text}\n"

# Create a new message with the concatenated conversation text
# Preserve the original message piece metadata
Expand Down
97 changes: 93 additions & 4 deletions pyrit/score/scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,15 @@ class Scorer(Identifiable, abc.ABC):

_identifier: Optional[ComponentIdentifier] = None

#: When True, blocked responses that contain partial content
#: (in prompt_metadata["partial_content"]) will be scored using that content
#: instead of being filtered out or short-circuited.
#: Set this on scorer instances before use. Defaults to False.
#:
#: Note: This attribute will only take effect if the target supports partial content extraction when content
#: filters are triggered (e.g., Chat Completions model via OpenAIChatTarget).
score_blocked_content: bool = False

def __init__(self, *, validator: ScorerPromptValidator, chat_target: Optional[PromptTarget] = None) -> None:
"""
Initialize the Scorer.
Expand Down Expand Up @@ -174,7 +183,9 @@ async def score_async(
role_filter (Optional[ChatMessageRole]): Only score messages with this exact stored role.
Use "assistant" to score only real assistant responses, or "simulated_assistant"
to score only simulated responses. Defaults to None (no filtering).
skip_on_error_result (bool): If True, skip scoring if the message contains an error. Defaults to False.
skip_on_error_result (bool): If True, skip scoring if the message contains an error.
When self.score_blocked_content is also True, blocked responses with partial content
will still be scored instead of skipping. Defaults to False.
infer_objective_from_request (bool): If True, infer the objective from the message's previous request
when objective is not provided. Defaults to False.

Expand All @@ -192,15 +203,25 @@ async def score_async(
return []

if skip_on_error_result and message.is_error():
logger.debug("Skipping scoring due to error in message and skip_on_error=True.")
return []
# When score_blocked_content is enabled and the message has partial content,
# don't skip — let _score_async handle the substitution.
has_partial = any(
p.prompt_metadata.get("partial_content") for p in message.message_pieces if p.is_blocked()
)
if not (self.score_blocked_content and has_partial):
logger.debug("Skipping scoring due to error in message and skip_on_error=True.")
return []

if infer_objective_from_request and (not objective):
objective = self._extract_objective_from_response(message)

# When score_blocked_content is enabled, create a modified message where blocked pieces
# with partial content are replaced with text-type substitutes (response_error="none").
scoring_message = self._apply_blocked_content_substitution(message) if self.score_blocked_content else message

try:
scores = await self._score_async(
message,
scoring_message,
objective=objective,
)
except PyritException as e:
Expand Down Expand Up @@ -253,6 +274,74 @@ async def _score_async(self, message: Message, *, objective: Optional[str] = Non
async def _score_piece_async(self, message_piece: MessagePiece, *, objective: Optional[str] = None) -> list[Score]:
raise NotImplementedError

@staticmethod
def _create_text_piece_from_blocked(piece: MessagePiece) -> Optional[MessagePiece]:
    """
    Build a text-typed substitute for a blocked MessagePiece from its partial content.

    The copy keeps the original piece's id (so scores still link back), carries
    the partial content as converted_value with converted_value_data_type="text",
    and marks response_error="none" so blocked-response short-circuits (e.g. the
    refusal scorer's blocked check) do not fire on the substitute.

    Args:
        piece: A blocked MessagePiece with prompt_metadata["partial_content"].

    Returns:
        MessagePiece with text content, or None if partial content is empty.
    """
    text = str(piece.prompt_metadata.get("partial_content", ""))
    if not text:
        return None

    return MessagePiece(
        id=piece.id,
        role=piece.api_role,
        original_value=piece.original_value,
        converted_value=text,
        original_value_data_type=piece.original_value_data_type,
        converted_value_data_type="text",
        conversation_id=piece.conversation_id,
        sequence=piece.sequence,
        labels=piece.labels,
        prompt_metadata=piece.prompt_metadata,
        converter_identifiers=list(piece.converter_identifiers),  # type: ignore[arg-type]
        prompt_target_identifier=piece.prompt_target_identifier,
        attack_identifier=piece.attack_identifier,
        response_error="none",
        timestamp=piece.timestamp,
    )

def _apply_blocked_content_substitution(self, message: Message) -> Message:
    """
    Return a copy of ``message`` with blocked pieces swapped for their partial content.

    Any blocked piece carrying prompt_metadata["partial_content"] is replaced by a
    text-typed copy (response_error="none", converted_value set to the partial
    content). Everything else — non-blocked pieces and blocked pieces without
    usable partial content — passes through untouched.

    Args:
        message: The original message potentially containing blocked pieces.

    Returns:
        A new Message with substituted pieces, or the original if nothing changed.
    """
    replacements = 0
    pieces: list[MessagePiece] = []
    for original_piece in message.message_pieces:
        substitute = None
        if original_piece.is_blocked() and "partial_content" in original_piece.prompt_metadata:
            substitute = self._create_text_piece_from_blocked(original_piece)
        if substitute:
            pieces.append(substitute)
            replacements += 1
        else:
            # No substitution possible (or not blocked) — keep the piece as-is.
            pieces.append(original_piece)

    # Avoid allocating a new Message when nothing was actually replaced.
    return Message(message_pieces=pieces) if replacements else message

def _get_supported_pieces(self, message: Message) -> list[MessagePiece]:
"""
Get a list of supported message pieces for this scorer.
Expand Down
56 changes: 56 additions & 0 deletions tests/unit/prompt_target/target/test_openai_chat_target.py
Original file line number Diff line number Diff line change
Expand Up @@ -1596,6 +1596,62 @@ async def test_save_audio_response_async_pcm16_format(patch_central_database):
assert result == "/path/to/saved/audio.wav"


# ── _extract_partial_content tests ──────────────────────────────────────────


class TestExtractPartialContentChatTarget:
    """Unit tests for OpenAIChatTarget._extract_partial_content."""

    def test_extracts_partial_content_from_content_filter_response(self, target: OpenAIChatTarget):
        completion = create_mock_completion(
            content="Partial harmful content before cutoff", finish_reason="content_filter"
        )
        assert target._extract_partial_content(completion) == "Partial harmful content before cutoff"

    def test_returns_none_when_no_content(self, target: OpenAIChatTarget):
        completion = create_mock_completion(content=None, finish_reason="content_filter")
        assert target._extract_partial_content(completion) is None

    def test_returns_none_when_empty_content(self, target: OpenAIChatTarget):
        completion = create_mock_completion(content="", finish_reason="content_filter")
        assert target._extract_partial_content(completion) is None

    def test_returns_none_when_no_choices(self, target: OpenAIChatTarget):
        completion = MagicMock(spec=ChatCompletion)
        completion.choices = []
        assert target._extract_partial_content(completion) is None


class TestContentFilterPreservesPartialContent:
    """Integration-style tests: a 200 response with finish_reason=content_filter
    must surface any partial model output via prompt_metadata["partial_content"].

    Note: this block contained stray review-thread artifacts ("Comment thread",
    "jsong468 marked this conversation as resolved.") pasted into the code; they
    have been removed so the class is syntactically valid.
    """

    async def test_200_content_filter_attaches_partial_content_metadata(self, target: OpenAIChatTarget):
        """Integration: 200 + content_filter response preserves partial content in metadata."""
        message = Message(
            message_pieces=[MessagePiece(role="user", conversation_id="test-convo", original_value="test prompt")]
        )
        mock_completion = create_mock_completion(content="Harmful partial content here", finish_reason="content_filter")
        target._async_client.chat.completions.create = AsyncMock(return_value=mock_completion)  # type: ignore[method-assign]

        response = await target.send_prompt_async(message=message)

        assert response[0].message_pieces[0].response_error == "blocked"
        assert response[0].message_pieces[0].prompt_metadata["partial_content"] == "Harmful partial content here"

    async def test_200_content_filter_no_metadata_when_no_content(self, target: OpenAIChatTarget):
        """200 + content_filter with no content doesn't attach metadata."""
        message = Message(
            message_pieces=[MessagePiece(role="user", conversation_id="test-convo", original_value="test prompt")]
        )
        mock_completion = create_mock_completion(content=None, finish_reason="content_filter")
        target._async_client.chat.completions.create = AsyncMock(return_value=mock_completion)  # type: ignore[method-assign]

        response = await target.send_prompt_async(message=message)

        assert response[0].message_pieces[0].response_error == "blocked"
        assert "partial_content" not in response[0].message_pieces[0].prompt_metadata


async def test_save_audio_response_async_flac_format(patch_central_database):
"""Test saving audio response with flac format."""
audio_config = OpenAIChatAudioConfig(voice="alloy", audio_format="flac")
Expand Down
Loading
Loading