From 982eba22db2dd2235a46bfc91cbd18777ce398df Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Wed, 17 Dec 2025 16:42:50 -0800 Subject: [PATCH] updated tests --- .../test_glm_streaming_compliance.py | 921 +++++++++++++++--- 1 file changed, 799 insertions(+), 122 deletions(-) diff --git a/eval_protocol/benchmarks/test_glm_streaming_compliance.py b/eval_protocol/benchmarks/test_glm_streaming_compliance.py index b570cfa4..6943096e 100644 --- a/eval_protocol/benchmarks/test_glm_streaming_compliance.py +++ b/eval_protocol/benchmarks/test_glm_streaming_compliance.py @@ -5,12 +5,14 @@ import re from typing import Any +import pytest + from eval_protocol.models import ( EvaluateResult, EvaluationRow, Message, MetricResult, - ChatCompletionContentPartTextParam, + ChatCompletionContentPartParam, ) from eval_protocol.pytest.default_single_turn_rollout_process import ( SingleTurnRolloutProcessor, @@ -21,9 +23,23 @@ DEFAULT_MODEL_ID = "fireworks_ai/accounts/fireworks/models/glm-4p6" DEFAULT_MAX_TOKENS = 10000 +# ============================================================================ +# Feature Flags (read from environment variables) +# ============================================================================ + +# EP_SUPPORTS_MULTIPLE_TOOL_CALLS: Whether model supports multiple tool calls in one response +# Default: True ("1"). Set to "0" to skip multiple tool call tests. +SUPPORTS_MULTIPLE_TOOL_CALLS = os.getenv("EP_SUPPORTS_MULTIPLE_TOOL_CALLS", "1") == "1" + +# EP_SUPPORTS_REASONING: Whether model supports the reasoning_effort parameter +# Default: True ("1"). Set to "0" to skip reasoning-specific tests and NOT pass reasoning_effort param. 
+# When True: Include reasoning tests, pass reasoning_effort parameter, check reasoning_content +# When False: Skip reasoning-specific tests, do NOT pass reasoning_effort parameter at all +SUPPORTS_REASONING = os.getenv("EP_SUPPORTS_REASONING", "1") == "1" + def _coerce_content_to_str( - content: str | list[ChatCompletionContentPartTextParam] | None, + content: str | list[ChatCompletionContentPartParam] | None, ) -> str: if isinstance(content, list): texts: list[str] = [] @@ -525,15 +541,30 @@ def _build_completion_params_from_payload(payload: dict[str, Any]) -> dict[str, "model": DEFAULT_MODEL_ID, "stream": True, "return_reasoning_with_separate_field": True, - "reasoning_effort": "none", # Default: no reasoning unless explicitly requested + "raw_output": True, # Include raw model output for debugging } - passthrough_keys = {"temperature", "top_p", "max_tokens", "response_format", "reasoning_effort"} + # Only include reasoning_effort if model supports it + if SUPPORTS_REASONING: + params["reasoning_effort"] = "none" # Default: no reasoning unless explicitly requested + + passthrough_keys = {"temperature", "top_p", "max_tokens", "response_format", "tool_choice"} + # Only passthrough reasoning_effort if model supports it + if SUPPORTS_REASONING: + passthrough_keys.add("reasoning_effort") + for key in passthrough_keys: if key in payload: params[key] = payload[key] return params +def _maybe_add_reasoning_effort(params: dict[str, Any], effort: str = "none") -> dict[str, Any]: + """Add reasoning_effort to params only if model supports it.""" + if SUPPORTS_REASONING: + params["reasoning_effort"] = effort + return params + + def _normalize_tool_call(tc: Any) -> tuple[str | None, dict[str, Any] | None]: """Convert LiteLLM tool call objects/dicts into (name, arguments dict).""" @@ -712,6 +743,7 @@ def _debug_log_assistant_message(test_name: str, assistant_message: Message | No "max_tokens": DEFAULT_MAX_TOKENS, "response_format": STRUCTURED_RESPONSE_FORMAT, 
"reasoning_effort": "none", # No reasoning expected for structured output + "raw_output": True, } ], rollout_processor=SingleTurnRolloutProcessor(), @@ -980,6 +1012,7 @@ def test_streaming_json_preservation(row: EvaluationRow) -> EvaluationRow: "top_p": 1.0, "max_tokens": DEFAULT_MAX_TOKENS, "reasoning_effort": "none", # No reasoning expected for tool calls + "raw_output": True, } ], rollout_processor=SingleTurnRolloutProcessor(), @@ -1252,6 +1285,7 @@ def test_streaming_tool_complex_arguments(row: EvaluationRow) -> EvaluationRow: _MULTI_TOOL_CALLS_ROW = _build_row_from_payload("multi-tool-calls", MULTI_TOOL_CALLS_PAYLOAD) +@pytest.mark.skipif(not SUPPORTS_MULTIPLE_TOOL_CALLS, reason="Model does not support multiple tool calls") @evaluation_test( input_rows=[[_MULTI_TOOL_CALLS_ROW]], completion_params=[_build_completion_params_from_payload(MULTI_TOOL_CALLS_PAYLOAD)], @@ -1755,16 +1789,20 @@ def test_streaming_tool_retry_behavior(row: EvaluationRow) -> EvaluationRow: } +@pytest.mark.skipif(not SUPPORTS_REASONING, reason="Model does not support reasoning_effort parameter") @evaluation_test( input_rows=[[REASONING_DISABLED_ROW]], completion_params=[ - { - "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", # Reasoning-capable model - "reasoning_effort": "none", # Explicitly disable reasoning - "max_tokens": DEFAULT_MAX_TOKENS, - "temperature": 0.0, - "stream": True, - } + _maybe_add_reasoning_effort( + { + "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", # Reasoning-capable model + "max_tokens": DEFAULT_MAX_TOKENS, + "temperature": 0.0, + "stream": True, + "raw_output": True, + }, + "none", + ) # Explicitly disable reasoning ], rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=1.0, @@ -1865,16 +1903,20 @@ def test_reasoning_effort_none_no_reasoning(row: EvaluationRow) -> EvaluationRow } +@pytest.mark.skipif(not SUPPORTS_REASONING, reason="Model does not support reasoning_effort parameter") @evaluation_test( 
input_rows=[[REASONING_ENABLED_ROW]], completion_params=[ - { - "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", # Reasoning-capable model - "reasoning_effort": "low", # Enable reasoning - "max_tokens": DEFAULT_MAX_TOKENS, - "temperature": 0.0, - "stream": True, - } + _maybe_add_reasoning_effort( + { + "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", # Reasoning-capable model + "max_tokens": DEFAULT_MAX_TOKENS, + "temperature": 0.0, + "stream": True, + "raw_output": True, + }, + "low", + ) # Enable reasoning ], rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=1.0, @@ -2003,13 +2045,16 @@ def test_reasoning_effort_low_has_reasoning(row: EvaluationRow) -> EvaluationRow @evaluation_test( input_rows=[[TOOLS_WITH_REASONING_ROW]], completion_params=[ - { - "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", # Reasoning-capable model - "reasoning_effort": "low", # Enable reasoning - "max_tokens": DEFAULT_MAX_TOKENS, - "temperature": 0.0, - "stream": True, - } + _maybe_add_reasoning_effort( + { + "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", # Reasoning-capable model + "max_tokens": DEFAULT_MAX_TOKENS, + "temperature": 0.0, + "stream": True, + "raw_output": True, + }, + "low", + ) # Enable reasoning ], rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=1.0, @@ -2020,7 +2065,7 @@ def test_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow: Verify that streaming works correctly when BOTH tools and reasoning are present. 
Requirements: - - reasoning_content should be present + - reasoning_content should be present (only checked if SUPPORTS_REASONING) - tool_calls should be present - finish_reason should be "tool_calls" - No XML tags or reasoning leakage @@ -2047,12 +2092,6 @@ def test_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow: break metrics = { - "reasoning_present": MetricResult( - score=1.0 if reasoning_present else 0.0, - is_score_valid=True, - reason="reasoning_content present" if reasoning_present else "reasoning_content missing", - data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, - ), "has_tool_calls": MetricResult( score=1.0 if has_tool_calls else 0.0, is_score_valid=True, @@ -2074,12 +2113,24 @@ def test_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow: ), } + # Only add reasoning metric if reasoning is supported + if SUPPORTS_REASONING: + metrics["reasoning_present"] = MetricResult( + score=1.0 if reasoning_present else 0.0, + is_score_valid=True, + reason="reasoning_content present" if reasoning_present else "reasoning_content missing", + data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, + ) + finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks( metrics, finish_reason, content_str, reasoning_str ) + # Reasoning check is only required if SUPPORTS_REASONING is True + reasoning_check_passed = reasoning_present if SUPPORTS_REASONING else True + all_checks_passed = ( - reasoning_present + reasoning_check_passed and has_tool_calls and finish_reason_tool_calls and tool_call_valid @@ -2091,7 +2142,7 @@ def test_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow: # Build detailed failure reason failure_reasons = [] - if not reasoning_present: + if SUPPORTS_REASONING and not reasoning_present: failure_reasons.append("reasoning_content missing") if not has_tool_calls: 
failure_reasons.append("no tool calls") @@ -2147,6 +2198,7 @@ def test_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow: "max_tokens": os.getenv("EP_MAX_TOKENS", DEFAULT_MAX_TOKENS), "temperature": 0.0, # Deterministic for consistency "stream": False, # Will be overridden by custom rollout + "raw_output": True, } ], rollout_processor=SingleTurnRolloutProcessor(), @@ -2298,6 +2350,7 @@ async def test_streaming_output_consistency(row: EvaluationRow) -> EvaluationRow "max_tokens": DEFAULT_MAX_TOKENS, "response_format": STRUCTURED_RESPONSE_FORMAT, "reasoning_effort": "none", + "raw_output": True, } ], rollout_processor=SingleTurnRolloutProcessor(), @@ -2505,6 +2558,7 @@ def test_non_streaming_simple_completion(row: EvaluationRow) -> EvaluationRow: "top_p": 1.0, "max_tokens": DEFAULT_MAX_TOKENS, "reasoning_effort": "none", + "raw_output": True, } ], rollout_processor=SingleTurnRolloutProcessor(), @@ -2629,6 +2683,7 @@ def test_non_streaming_single_tool_call(row: EvaluationRow) -> EvaluationRow: _MULTI_TOOL_CALLS_NON_STREAM_ROW = _build_row_from_payload("multi-tool-calls-non-stream", MULTI_TOOL_CALLS_PAYLOAD) +@pytest.mark.skipif(not SUPPORTS_MULTIPLE_TOOL_CALLS, reason="Model does not support multiple tool calls") @evaluation_test( input_rows=[[_MULTI_TOOL_CALLS_NON_STREAM_ROW]], completion_params=[ @@ -2723,16 +2778,20 @@ def test_non_streaming_multiple_tool_calls(row: EvaluationRow) -> EvaluationRow: } +@pytest.mark.skipif(not SUPPORTS_REASONING, reason="Model does not support reasoning_effort parameter") @evaluation_test( input_rows=[[REASONING_DISABLED_NON_STREAM_ROW]], completion_params=[ - { - "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", - "reasoning_effort": "none", - "max_tokens": DEFAULT_MAX_TOKENS, - "temperature": 0.0, - "stream": False, # Non-streaming - } + _maybe_add_reasoning_effort( + { + "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", + "max_tokens": DEFAULT_MAX_TOKENS, + "temperature": 0.0, + 
"stream": False, # Non-streaming + "raw_output": True, + }, + "none", + ) ], rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=1.0, @@ -2830,16 +2889,20 @@ def test_reasoning_effort_none_no_reasoning_non_stream(row: EvaluationRow) -> Ev } +@pytest.mark.skipif(not SUPPORTS_REASONING, reason="Model does not support reasoning_effort parameter") @evaluation_test( input_rows=[[REASONING_ENABLED_NON_STREAM_ROW]], completion_params=[ - { - "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", - "reasoning_effort": "low", - "max_tokens": DEFAULT_MAX_TOKENS, - "temperature": 0.0, - "stream": False, # Non-streaming - } + _maybe_add_reasoning_effort( + { + "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", + "max_tokens": DEFAULT_MAX_TOKENS, + "temperature": 0.0, + "stream": False, # Non-streaming + "raw_output": True, + }, + "low", + ) ], rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=1.0, @@ -2961,13 +3024,16 @@ def test_reasoning_effort_low_has_reasoning_non_stream(row: EvaluationRow) -> Ev @evaluation_test( input_rows=[[TOOLS_WITH_REASONING_NON_STREAM_ROW]], completion_params=[ - { - "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", - "reasoning_effort": "low", - "max_tokens": DEFAULT_MAX_TOKENS, - "temperature": 0.0, - "stream": False, # Non-streaming - } + _maybe_add_reasoning_effort( + { + "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", + "max_tokens": DEFAULT_MAX_TOKENS, + "temperature": 0.0, + "stream": False, # Non-streaming + "raw_output": True, + }, + "low", + ) ], rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=1.0, @@ -2997,12 +3063,6 @@ def test_non_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow break metrics = { - "reasoning_present": MetricResult( - score=1.0 if reasoning_present else 0.0, - is_score_valid=True, - reason="reasoning_content present" if reasoning_present else "reasoning_content missing", - data={"reasoning_length": 
len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, - ), "has_tool_calls": MetricResult( score=1.0 if has_tool_calls else 0.0, is_score_valid=True, @@ -3024,12 +3084,24 @@ def test_non_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow ), } + # Only add reasoning metric if reasoning is supported + if SUPPORTS_REASONING: + metrics["reasoning_present"] = MetricResult( + score=1.0 if reasoning_present else 0.0, + is_score_valid=True, + reason="reasoning_content present" if reasoning_present else "reasoning_content missing", + data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, + ) + finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks( metrics, finish_reason, content_str, reasoning_str ) + # Reasoning check is only required if SUPPORTS_REASONING is True + reasoning_check_passed = reasoning_present if SUPPORTS_REASONING else True + all_checks_passed = ( - reasoning_present + reasoning_check_passed and has_tool_calls and finish_reason_tool_calls and tool_call_valid @@ -3041,7 +3113,7 @@ def test_non_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow # Build detailed failure reason failure_reasons = [] - if not reasoning_present: + if SUPPORTS_REASONING and not reasoning_present: failure_reasons.append("reasoning_content missing") if not has_tool_calls: failure_reasons.append("no tool calls") @@ -3059,7 +3131,7 @@ def test_non_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow failure_reasons.append("thinking phrases in content") reason = ( - "Tools + reasoning work together in streaming" + "Tools + reasoning work together in non-streaming" if all_checks_passed else f"Compliance failed: {', '.join(failure_reasons)}" ) @@ -3107,14 +3179,17 @@ def test_non_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow @evaluation_test( input_rows=[[STRUCTURED_OUTPUT_WITH_REASONING_ROW]], completion_params=[ - { 
- "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", - "stream": True, - "reasoning_effort": "low", - "response_format": STRUCTURED_JSON_SCHEMA, - "temperature": 0.0, - "max_tokens": DEFAULT_MAX_TOKENS, - } + _maybe_add_reasoning_effort( + { + "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", + "stream": True, + "response_format": STRUCTURED_JSON_SCHEMA, + "temperature": 0.0, + "max_tokens": DEFAULT_MAX_TOKENS, + "raw_output": True, + }, + "low", + ) ], rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=1.0, @@ -3153,12 +3228,6 @@ def test_streaming_structured_output_with_reasoning(row: EvaluationRow) -> Evalu is_score_valid=content_is_json, reason="speed_kmh is numeric" if speed_is_number else "speed_kmh not numeric", ), - "reasoning_present": MetricResult( - score=1.0 if reasoning_present else 0.0, - is_score_valid=True, - reason="reasoning_content present" if reasoning_present else "reasoning_content missing", - data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, - ), "finish_reason_stop": MetricResult( score=1.0 if finish_reason_stop else 0.0, is_score_valid=True, @@ -3166,15 +3235,27 @@ def test_streaming_structured_output_with_reasoning(row: EvaluationRow) -> Evalu ), } + # Only add reasoning metric if reasoning is supported + if SUPPORTS_REASONING: + metrics["reasoning_present"] = MetricResult( + score=1.0 if reasoning_present else 0.0, + is_score_valid=True, + reason="reasoning_content present" if reasoning_present else "reasoning_content missing", + data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, + ) + finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks( metrics, finish_reason, content_str, reasoning_str ) + # Reasoning check is only required if SUPPORTS_REASONING is True + reasoning_check_passed = reasoning_present if SUPPORTS_REASONING else True + all_checks_passed = ( 
content_is_json and has_required_keys and speed_is_number - and reasoning_present + and reasoning_check_passed and finish_reason_stop and finish_reason_present and no_forbidden_tags @@ -3210,14 +3291,17 @@ def test_streaming_structured_output_with_reasoning(row: EvaluationRow) -> Evalu @evaluation_test( input_rows=[[STRUCTURED_OUTPUT_WITH_REASONING_NON_STREAM_ROW]], completion_params=[ - { - "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", - "stream": False, - "reasoning_effort": "low", - "response_format": STRUCTURED_JSON_SCHEMA, - "temperature": 0.0, - "max_tokens": DEFAULT_MAX_TOKENS, - } + _maybe_add_reasoning_effort( + { + "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", + "stream": False, + "response_format": STRUCTURED_JSON_SCHEMA, + "temperature": 0.0, + "max_tokens": DEFAULT_MAX_TOKENS, + "raw_output": True, + }, + "low", + ) ], rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=1.0, @@ -3256,12 +3340,6 @@ def test_non_streaming_structured_output_with_reasoning(row: EvaluationRow) -> E is_score_valid=content_is_json, reason="speed_kmh is numeric" if speed_is_number else "speed_kmh not numeric", ), - "reasoning_present": MetricResult( - score=1.0 if reasoning_present else 0.0, - is_score_valid=True, - reason="reasoning_content present" if reasoning_present else "reasoning_content missing", - data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, - ), "finish_reason_stop": MetricResult( score=1.0 if finish_reason_stop else 0.0, is_score_valid=True, @@ -3269,15 +3347,27 @@ def test_non_streaming_structured_output_with_reasoning(row: EvaluationRow) -> E ), } + # Only add reasoning metric if reasoning is supported + if SUPPORTS_REASONING: + metrics["reasoning_present"] = MetricResult( + score=1.0 if reasoning_present else 0.0, + is_score_valid=True, + reason="reasoning_content present" if reasoning_present else "reasoning_content missing", + data={"reasoning_length": 
len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, + ) + finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks( metrics, finish_reason, content_str, reasoning_str ) + # Reasoning check is only required if SUPPORTS_REASONING is True + reasoning_check_passed = reasoning_present if SUPPORTS_REASONING else True + all_checks_passed = ( content_is_json and has_required_keys and speed_is_number - and reasoning_present + and reasoning_check_passed and finish_reason_stop and finish_reason_present and no_forbidden_tags @@ -3330,16 +3420,20 @@ def test_non_streaming_structured_output_with_reasoning(row: EvaluationRow) -> E } +@pytest.mark.skipif(not SUPPORTS_MULTIPLE_TOOL_CALLS, reason="Model does not support multiple tool calls") @evaluation_test( input_rows=[[MULTIPLE_TOOLS_WITH_REASONING_ROW]], completion_params=[ - { - "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", - "stream": True, - "reasoning_effort": "low", - "temperature": 0.0, - "max_tokens": DEFAULT_MAX_TOKENS, - } + _maybe_add_reasoning_effort( + { + "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", + "stream": True, + "temperature": 0.0, + "max_tokens": DEFAULT_MAX_TOKENS, + "raw_output": True, + }, + "low", + ) ], rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=1.0, @@ -3373,12 +3467,6 @@ def test_streaming_multiple_tools_with_reasoning(row: EvaluationRow) -> Evaluati all_cities_covered = len(cities_covered) == 3 metrics = { - "reasoning_present": MetricResult( - score=1.0 if reasoning_present else 0.0, - is_score_valid=True, - reason="reasoning_content present" if reasoning_present else "reasoning_content missing", - data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, - ), "has_multiple_tools": MetricResult( score=1.0 if has_multiple_tools else 0.0, is_score_valid=True, @@ -3398,12 +3486,24 @@ def test_streaming_multiple_tools_with_reasoning(row: 
EvaluationRow) -> Evaluati ), } + # Only add reasoning metric if reasoning is supported + if SUPPORTS_REASONING: + metrics["reasoning_present"] = MetricResult( + score=1.0 if reasoning_present else 0.0, + is_score_valid=True, + reason="reasoning_content present" if reasoning_present else "reasoning_content missing", + data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, + ) + finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks( metrics, finish_reason, content_str, reasoning_str ) + # Reasoning check is only required if SUPPORTS_REASONING is True + reasoning_check_passed = reasoning_present if SUPPORTS_REASONING else True + all_checks_passed = ( - reasoning_present + reasoning_check_passed and has_multiple_tools and all_cities_covered and finish_reason_tool_calls @@ -3457,16 +3557,20 @@ def test_streaming_multiple_tools_with_reasoning(row: EvaluationRow) -> Evaluati } +@pytest.mark.skipif(not SUPPORTS_MULTIPLE_TOOL_CALLS, reason="Model does not support multiple tool calls") @evaluation_test( input_rows=[[MULTIPLE_TOOLS_WITH_REASONING_NON_STREAM_ROW]], completion_params=[ - { - "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", - "stream": False, - "reasoning_effort": "low", - "temperature": 0.0, - "max_tokens": DEFAULT_MAX_TOKENS, - } + _maybe_add_reasoning_effort( + { + "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", + "stream": False, + "temperature": 0.0, + "max_tokens": DEFAULT_MAX_TOKENS, + "raw_output": True, + }, + "low", + ) ], rollout_processor=SingleTurnRolloutProcessor(), passed_threshold=1.0, @@ -3500,12 +3604,6 @@ def test_non_streaming_multiple_tools_with_reasoning(row: EvaluationRow) -> Eval all_cities_covered = len(cities_covered) == 3 metrics = { - "reasoning_present": MetricResult( - score=1.0 if reasoning_present else 0.0, - is_score_valid=True, - reason="reasoning_content present" if reasoning_present else 
"reasoning_content missing", - data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, - ), "has_multiple_tools": MetricResult( score=1.0 if has_multiple_tools else 0.0, is_score_valid=True, @@ -3525,12 +3623,24 @@ def test_non_streaming_multiple_tools_with_reasoning(row: EvaluationRow) -> Eval ), } + # Only add reasoning metric if reasoning is supported + if SUPPORTS_REASONING: + metrics["reasoning_present"] = MetricResult( + score=1.0 if reasoning_present else 0.0, + is_score_valid=True, + reason="reasoning_content present" if reasoning_present else "reasoning_content missing", + data={"reasoning_length": len(reasoning_str), "reasoning_preview": reasoning_str[:200]}, + ) + finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks( metrics, finish_reason, content_str, reasoning_str ) + # Reasoning check is only required if SUPPORTS_REASONING is True + reasoning_check_passed = reasoning_present if SUPPORTS_REASONING else True + all_checks_passed = ( - reasoning_present + reasoning_check_passed and has_multiple_tools and all_cities_covered and finish_reason_tool_calls @@ -3549,3 +3659,570 @@ def test_non_streaming_multiple_tools_with_reasoning(row: EvaluationRow) -> Eval metrics=metrics, ) return row + + +# ============================================================================ +# Tool Choice Tests +# ============================================================================ + +TOOL_CHOICE_NONE_ROW = EvaluationRow( + messages=[ + Message(role="system", content="You are a helpful assistant."), + Message( + role="user", + content="What's the weather in San Francisco?", + ), + ], + tools=WEATHER_TOOL_DEFINITION, # Tools are passed but should be ignored with tool_choice=none +) +TOOL_CHOICE_NONE_ROW.input_metadata.dataset_info = { + "test_name": "tool_choice_none_stream", + "description": "Streaming: tool_choice=none should not produce tool calls and tools should not appear 
in raw_output prompt", +} + + +@evaluation_test( + input_rows=[[TOOL_CHOICE_NONE_ROW]], + completion_params=[ + { + "model": DEFAULT_MODEL_ID, + "stream": True, + "temperature": 0.0, + "max_tokens": DEFAULT_MAX_TOKENS, + "reasoning_effort": "none", + "tool_choice": "none", + "raw_output": True, + } + ], + rollout_processor=SingleTurnRolloutProcessor(), + passed_threshold=1.0, + mode="pointwise", +) +def test_streaming_tool_choice_none(row: EvaluationRow) -> EvaluationRow: + """ + Verify that tool_choice=none prevents tool calls and tools are not in raw_output prompt. + + Requirements: + - No tool_calls should be returned + - raw_output should NOT mention tools in the prompt + - finish_reason should be "stop" + - Content should be present + """ + assistant_msg = row.last_assistant_message() + finish_reason = row.execution_metadata.finish_reason + + content_str = _coerce_content_to_str(assistant_msg.content) if assistant_msg else "" + reasoning_str = (assistant_msg.reasoning_content or "").strip() if assistant_msg else "" + tool_calls = assistant_msg.tool_calls if assistant_msg else [] + + has_content = bool(content_str.strip()) + no_tool_calls = not tool_calls or len(tool_calls) == 0 + finish_reason_stop = finish_reason == "stop" + + # Check raw_output for tool mentions (if available) + raw_output = getattr(row.execution_metadata, "raw_output", None) or {} + prompt_fragments = raw_output.get("prompt_fragments", []) if isinstance(raw_output, dict) else [] + + # Check if tools are mentioned in prompt_fragments + tools_in_prompt = False + tool_keywords = ["get_current_weather", "function", "tool"] + for fragment in prompt_fragments: + fragment_lower = fragment.lower() if isinstance(fragment, str) else "" + if any(keyword in fragment_lower for keyword in tool_keywords): + tools_in_prompt = True + break + + # If raw_output not available, skip the prompt check (is_score_valid=False) + raw_output_available = bool(raw_output and prompt_fragments) + + metrics = { + 
"has_content": MetricResult( + score=1.0 if has_content else 0.0, + is_score_valid=True, + reason="Content present" if has_content else "No content", + data={"content_preview": content_str[:100]}, + ), + "no_tool_calls": MetricResult( + score=1.0 if no_tool_calls else 0.0, + is_score_valid=True, + reason="No tool calls (as expected with tool_choice=none)" + if no_tool_calls + else f"Unexpected tool calls: {len(tool_calls) if tool_calls else 0}", + data={"tool_call_count": len(tool_calls) if tool_calls else 0}, + ), + "tools_not_in_prompt": MetricResult( + score=1.0 if not tools_in_prompt else 0.0, + is_score_valid=raw_output_available, # Only valid if raw_output is available + reason="Tools not mentioned in raw_output prompt" + if not tools_in_prompt + else "Tools found in raw_output prompt (unexpected with tool_choice=none)", + data={ + "prompt_fragments": prompt_fragments[:3] if prompt_fragments else [], + "raw_output_available": raw_output_available, + }, + ), + "finish_reason_stop": MetricResult( + score=1.0 if finish_reason_stop else 0.0, + is_score_valid=True, + reason="finish_reason is stop" if finish_reason_stop else f"Unexpected finish_reason: {finish_reason}", + ), + } + + finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks( + metrics, finish_reason, content_str, reasoning_str + ) + + # For all_checks_passed, only consider tools_in_prompt if raw_output is available + tools_check_passed = not tools_in_prompt if raw_output_available else True + + all_checks_passed = ( + has_content + and no_tool_calls + and tools_check_passed + and finish_reason_stop + and finish_reason_present + and no_forbidden_tags + and no_xml_tags + and no_reasoning_leakage + ) + + # Build detailed failure reason + failure_reasons: list[str] = [] + if not has_content: + failure_reasons.append("no content") + if not no_tool_calls: + failure_reasons.append("tool calls present") + if raw_output_available and tools_in_prompt: + 
# Non-streaming version
TOOL_CHOICE_NONE_NON_STREAM_ROW = EvaluationRow(
    messages=[
        Message(role="system", content="You are a helpful assistant."),
        Message(
            role="user",
            content="What's the weather in San Francisco?",
        ),
    ],
    tools=WEATHER_TOOL_DEFINITION,  # Tools are passed but should be ignored with tool_choice=none
)
TOOL_CHOICE_NONE_NON_STREAM_ROW.input_metadata.dataset_info = {
    "test_name": "tool_choice_none_non_stream",
    "description": "Non-streaming: tool_choice=none should not produce tool calls and tools should not appear in raw_output prompt",
}


@evaluation_test(
    input_rows=[[TOOL_CHOICE_NONE_NON_STREAM_ROW]],
    completion_params=[
        # reasoning_effort is attached by the helper only when SUPPORTS_REASONING is set,
        # so models without that parameter never receive it.
        _maybe_add_reasoning_effort(
            {
                "model": DEFAULT_MODEL_ID,
                "stream": False,
                "temperature": 0.0,
                "max_tokens": DEFAULT_MAX_TOKENS,
                "tool_choice": "none",
                "raw_output": True,
            }
        )
    ],
    rollout_processor=SingleTurnRolloutProcessor(),
    passed_threshold=1.0,
    mode="pointwise",
)
def test_non_streaming_tool_choice_none(row: EvaluationRow) -> EvaluationRow:
    """Non-streaming version: Verify tool_choice=none prevents tool calls.

    Checks performed on the last assistant message of ``row``:
    - non-empty content is present
    - no tool calls were returned
    - finish_reason is "stop"
    - when ``raw_output`` exposes ``prompt_fragments``, none of them mention
      the tool (otherwise this check is skipped and marked not valid)
    - the shared checks added by ``_augment_metrics_with_common_checks``

    Returns:
        The same ``row`` with ``evaluation_result`` populated (score 1.0 only
        when every applicable check passes).
    """
    assistant_msg = row.last_assistant_message()
    finish_reason = row.execution_metadata.finish_reason

    content_str = _coerce_content_to_str(assistant_msg.content) if assistant_msg else ""
    reasoning_str = (assistant_msg.reasoning_content or "").strip() if assistant_msg else ""
    # Normalize a missing message / None tool_calls to an empty list.
    tool_calls = (assistant_msg.tool_calls if assistant_msg else None) or []
    tool_call_count = len(tool_calls)

    has_content = bool(content_str.strip())
    no_tool_calls = not tool_calls
    finish_reason_stop = finish_reason == "stop"

    # Check raw_output for tool mentions (if available)
    raw_output = getattr(row.execution_metadata, "raw_output", None) or {}
    prompt_fragments = raw_output.get("prompt_fragments") if isinstance(raw_output, dict) else None
    if not isinstance(prompt_fragments, list):
        # Missing, None, or unexpectedly shaped value: treat as "no fragments available".
        prompt_fragments = []

    # Check if tools are mentioned in prompt_fragments (non-string fragments are ignored)
    tool_keywords = ["get_current_weather", "function", "tool"]
    tools_in_prompt = any(
        isinstance(fragment, str) and any(keyword in fragment.lower() for keyword in tool_keywords)
        for fragment in prompt_fragments
    )

    # If raw_output not available, skip the prompt check (is_score_valid=False)
    raw_output_available = bool(raw_output and prompt_fragments)

    metrics = {
        "has_content": MetricResult(
            score=1.0 if has_content else 0.0,
            is_score_valid=True,
            reason="Content present" if has_content else "No content",
            data={"content_preview": content_str[:100]},
        ),
        "no_tool_calls": MetricResult(
            score=1.0 if no_tool_calls else 0.0,
            is_score_valid=True,
            reason="No tool calls (as expected with tool_choice=none)"
            if no_tool_calls
            else f"Unexpected tool calls: {tool_call_count}",
            data={"tool_call_count": tool_call_count},
        ),
        "tools_not_in_prompt": MetricResult(
            score=1.0 if not tools_in_prompt else 0.0,
            is_score_valid=raw_output_available,  # Only valid if raw_output is available
            reason="Tools not mentioned in raw_output prompt"
            if not tools_in_prompt
            else "Tools found in raw_output prompt (unexpected with tool_choice=none)",
            data={
                "prompt_fragments": prompt_fragments[:3],
                "raw_output_available": raw_output_available,
            },
        ),
        "finish_reason_stop": MetricResult(
            score=1.0 if finish_reason_stop else 0.0,
            is_score_valid=True,
            reason="finish_reason is stop" if finish_reason_stop else f"Unexpected finish_reason: {finish_reason}",
        ),
    }

    finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks(
        metrics, finish_reason, content_str, reasoning_str
    )

    # For all_checks_passed, only consider tools_in_prompt if raw_output is available
    tools_check_passed = not tools_in_prompt if raw_output_available else True

    all_checks_passed = (
        has_content
        and no_tool_calls
        and tools_check_passed
        and finish_reason_stop
        and finish_reason_present
        and no_forbidden_tags
        and no_xml_tags
        and no_reasoning_leakage
    )

    # Build detailed failure reason; every term of all_checks_passed has a matching
    # entry so a failure never yields an empty explanation.
    failure_reasons: list[str] = []
    if not has_content:
        failure_reasons.append("no content")
    if not no_tool_calls:
        failure_reasons.append("tool calls present")
    if not tools_check_passed:
        failure_reasons.append("tools in raw_output prompt")
    if not finish_reason_stop:
        failure_reasons.append(f"finish_reason={finish_reason}")
    if not finish_reason_present:
        failure_reasons.append("finish_reason null")
    if not no_forbidden_tags:
        failure_reasons.append("forbidden tags detected")
    if not no_xml_tags:
        failure_reasons.append("XML tags detected")
    if not no_reasoning_leakage:
        failure_reasons.append("reasoning leakage detected")

    reason = (
        "tool_choice=none respected: no tool calls, tools not in prompt"
        if all_checks_passed
        else f"Compliance failed: {', '.join(failure_reasons)}"
    )

    row.evaluation_result = EvaluateResult(
        score=1.0 if all_checks_passed else 0.0,
        is_score_valid=True,
        reason=reason,
        metrics=metrics,
    )
    return row
# ============================================================================
# Tool Choice Required Tests
# ============================================================================

TOOL_CHOICE_REQUIRED_ROW = EvaluationRow(
    messages=[
        Message(role="system", content="You are a helpful assistant with access to tools."),
        Message(
            role="user",
            content="What's the weather in Boston?",
        ),
    ],
    tools=WEATHER_TOOL_DEFINITION,
)
TOOL_CHOICE_REQUIRED_ROW.input_metadata.dataset_info = {
    "test_name": "tool_choice_required_stream",
    "description": "Streaming: tool_choice=required must produce at least one tool call",
}


@evaluation_test(
    input_rows=[[TOOL_CHOICE_REQUIRED_ROW]],
    completion_params=[
        # reasoning_effort is attached by the helper only when SUPPORTS_REASONING is set,
        # so models without that parameter never receive it.
        _maybe_add_reasoning_effort(
            {
                "model": DEFAULT_MODEL_ID,
                "stream": True,
                "temperature": 0.0,
                "max_tokens": DEFAULT_MAX_TOKENS,
                "tool_choice": "required",
                "raw_output": True,
            }
        )
    ],
    rollout_processor=SingleTurnRolloutProcessor(),
    passed_threshold=1.0,
    mode="pointwise",
)
def test_streaming_tool_choice_required(row: EvaluationRow) -> EvaluationRow:
    """
    Verify that tool_choice=required forces the model to make a tool call.

    Requirements:
    - At least one tool_call should be returned
    - finish_reason should be "tool_calls"
    - A get_current_weather call must carry dict arguments whose string
      "location" mentions Boston
    - The shared checks added by _augment_metrics_with_common_checks must pass

    Returns:
        The same row with evaluation_result populated (score 1.0 only when
        every check passes).
    """
    assistant_msg = row.last_assistant_message()
    finish_reason = row.execution_metadata.finish_reason

    content_str = _coerce_content_to_str(assistant_msg.content) if assistant_msg else ""
    reasoning_str = (assistant_msg.reasoning_content or "").strip() if assistant_msg else ""
    tool_calls = assistant_msg.tool_calls if assistant_msg else []
    calls = _collect_tool_calls(tool_calls)

    has_tool_calls = len(calls) > 0
    finish_reason_tool_calls = finish_reason == "tool_calls"

    # Validate tool call arguments. tool_call_args keeps the most recently inspected
    # weather call so the metric data shows what was rejected when nothing matches.
    tool_call_valid = False
    tool_call_args = None
    for name, args in calls:
        if name != "get_current_weather" or not isinstance(args, dict):
            continue
        tool_call_args = args
        location = args.get("location")
        # A non-string location (None, list, dict, ...) is invalid rather than a crash.
        if isinstance(location, str) and "boston" in location.lower():
            tool_call_valid = True
            break

    metrics = {
        "has_tool_calls": MetricResult(
            score=1.0 if has_tool_calls else 0.0,
            is_score_valid=True,
            reason="Tool calls present (as expected with tool_choice=required)"
            if has_tool_calls
            else "No tool calls (unexpected with tool_choice=required)",
            data={"tool_call_count": len(calls)},
        ),
        "tool_call_valid": MetricResult(
            score=1.0 if tool_call_valid else 0.0,
            is_score_valid=has_tool_calls,
            reason="Tool call arguments valid" if tool_call_valid else "Tool call arguments invalid or missing",
            data={"arguments": tool_call_args, "tool_calls": [{"name": n, "args": a} for n, a in calls]},
        ),
        "finish_reason_tool_calls": MetricResult(
            score=1.0 if finish_reason_tool_calls else 0.0,
            is_score_valid=True,
            reason="finish_reason is tool_calls"
            if finish_reason_tool_calls
            else f"Unexpected finish_reason: {finish_reason}",
        ),
    }

    finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks(
        metrics, finish_reason, content_str, reasoning_str
    )

    all_checks_passed = (
        has_tool_calls
        and tool_call_valid
        and finish_reason_tool_calls
        and finish_reason_present
        and no_forbidden_tags
        and no_xml_tags
        and no_reasoning_leakage
    )

    # Build detailed failure reason; every term of all_checks_passed has a matching
    # entry so a failure never yields an empty explanation.
    failure_reasons: list[str] = []
    if not has_tool_calls:
        failure_reasons.append("no tool calls")
    elif not tool_call_valid:
        # Only meaningful when there was a call to validate.
        failure_reasons.append("tool call invalid")
    if not finish_reason_tool_calls:
        failure_reasons.append(f"finish_reason={finish_reason}")
    if not finish_reason_present:
        failure_reasons.append("finish_reason null")
    if not no_forbidden_tags:
        failure_reasons.append("forbidden tags detected")
    if not no_xml_tags:
        failure_reasons.append("XML tags detected")
    if not no_reasoning_leakage:
        failure_reasons.append("reasoning leakage detected")

    reason = (
        "tool_choice=required respected: tool call made"
        if all_checks_passed
        else f"Compliance failed: {', '.join(failure_reasons)}"
    )

    row.evaluation_result = EvaluateResult(
        score=1.0 if all_checks_passed else 0.0,
        is_score_valid=True,
        reason=reason,
        metrics=metrics,
    )
    return row
# Non-streaming version
TOOL_CHOICE_REQUIRED_NON_STREAM_ROW = EvaluationRow(
    messages=[
        Message(role="system", content="You are a helpful assistant with access to tools."),
        Message(
            role="user",
            content="What's the weather in Boston?",
        ),
    ],
    tools=WEATHER_TOOL_DEFINITION,
)
TOOL_CHOICE_REQUIRED_NON_STREAM_ROW.input_metadata.dataset_info = {
    "test_name": "tool_choice_required_non_stream",
    "description": "Non-streaming: tool_choice=required must produce at least one tool call",
}


@evaluation_test(
    input_rows=[[TOOL_CHOICE_REQUIRED_NON_STREAM_ROW]],
    completion_params=[
        # reasoning_effort is attached by the helper only when SUPPORTS_REASONING is set,
        # so models without that parameter never receive it.
        _maybe_add_reasoning_effort(
            {
                "model": DEFAULT_MODEL_ID,
                "stream": False,
                "temperature": 0.0,
                "max_tokens": DEFAULT_MAX_TOKENS,
                "tool_choice": "required",
                "raw_output": True,
            }
        )
    ],
    rollout_processor=SingleTurnRolloutProcessor(),
    passed_threshold=1.0,
    mode="pointwise",
)
def test_non_streaming_tool_choice_required(row: EvaluationRow) -> EvaluationRow:
    """Non-streaming version: Verify tool_choice=required forces tool call.

    Requirements:
    - At least one tool_call should be returned
    - finish_reason should be "tool_calls"
    - A get_current_weather call must carry dict arguments whose string
      "location" mentions Boston
    - The shared checks added by _augment_metrics_with_common_checks must pass

    Returns:
        The same row with evaluation_result populated (score 1.0 only when
        every check passes).
    """
    assistant_msg = row.last_assistant_message()
    finish_reason = row.execution_metadata.finish_reason

    content_str = _coerce_content_to_str(assistant_msg.content) if assistant_msg else ""
    reasoning_str = (assistant_msg.reasoning_content or "").strip() if assistant_msg else ""
    tool_calls = assistant_msg.tool_calls if assistant_msg else []
    calls = _collect_tool_calls(tool_calls)

    has_tool_calls = len(calls) > 0
    finish_reason_tool_calls = finish_reason == "tool_calls"

    # Validate tool call arguments. tool_call_args keeps the most recently inspected
    # weather call so the metric data shows what was rejected when nothing matches.
    tool_call_valid = False
    tool_call_args = None
    for name, args in calls:
        if name != "get_current_weather" or not isinstance(args, dict):
            continue
        tool_call_args = args
        location = args.get("location")
        # A non-string location (None, list, dict, ...) is invalid rather than a crash.
        if isinstance(location, str) and "boston" in location.lower():
            tool_call_valid = True
            break

    metrics = {
        "has_tool_calls": MetricResult(
            score=1.0 if has_tool_calls else 0.0,
            is_score_valid=True,
            reason="Tool calls present (as expected with tool_choice=required)"
            if has_tool_calls
            else "No tool calls (unexpected with tool_choice=required)",
            data={"tool_call_count": len(calls)},
        ),
        "tool_call_valid": MetricResult(
            score=1.0 if tool_call_valid else 0.0,
            is_score_valid=has_tool_calls,
            reason="Tool call arguments valid" if tool_call_valid else "Tool call arguments invalid or missing",
            data={"arguments": tool_call_args, "tool_calls": [{"name": n, "args": a} for n, a in calls]},
        ),
        "finish_reason_tool_calls": MetricResult(
            score=1.0 if finish_reason_tool_calls else 0.0,
            is_score_valid=True,
            reason="finish_reason is tool_calls"
            if finish_reason_tool_calls
            else f"Unexpected finish_reason: {finish_reason}",
        ),
    }

    finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks(
        metrics, finish_reason, content_str, reasoning_str
    )

    all_checks_passed = (
        has_tool_calls
        and tool_call_valid
        and finish_reason_tool_calls
        and finish_reason_present
        and no_forbidden_tags
        and no_xml_tags
        and no_reasoning_leakage
    )

    # Build detailed failure reason; every term of all_checks_passed has a matching
    # entry so a failure never yields an empty explanation.
    failure_reasons: list[str] = []
    if not has_tool_calls:
        failure_reasons.append("no tool calls")
    elif not tool_call_valid:
        # Only meaningful when there was a call to validate.
        failure_reasons.append("tool call invalid")
    if not finish_reason_tool_calls:
        failure_reasons.append(f"finish_reason={finish_reason}")
    if not finish_reason_present:
        failure_reasons.append("finish_reason null")
    if not no_forbidden_tags:
        failure_reasons.append("forbidden tags detected")
    if not no_xml_tags:
        failure_reasons.append("XML tags detected")
    if not no_reasoning_leakage:
        failure_reasons.append("reasoning leakage detected")

    reason = (
        "tool_choice=required respected: tool call made"
        if all_checks_passed
        else f"Compliance failed: {', '.join(failure_reasons)}"
    )

    row.evaluation_result = EvaluateResult(
        score=1.0 if all_checks_passed else 0.0,
        is_score_valid=True,
        reason=reason,
        metrics=metrics,
    )
    return row