1818from eval_protocol .pytest .evaluation_test import evaluation_test
1919
2020
21- DEFAULT_MODEL_ID = "fireworks_ai/accounts/fireworks/models/glm-4p6 "
21+ DEFAULT_MODEL_ID = "fireworks_ai/accounts/pyroworks/deployedModels/minimax-m2-zmi4qk9f "
2222DEFAULT_MAX_TOKENS = 10000
2323
2424
@@ -153,7 +153,34 @@ def _safe_json_loads(payload: str) -> Any | None:
153153 "content" : "Call test_brace_bug with param1='test_value', param2=42, and param3=true" ,
154154 }
155155 ],
156- "tools" : WEATHER_TOOL_DEFINITION ,
156+ "tools" : [
157+ {
158+ "type" : "function" ,
159+ "function" : {
160+ "name" : "test_brace_bug" ,
161+ "description" : "A test function to validate JSON brace handling in tool arguments" ,
162+ "parameters" : {
163+ "type" : "object" ,
164+ "properties" : {
165+ "param1" : {
166+ "type" : "string" ,
167+ "description" : "A string parameter" ,
168+ },
169+ "param2" : {
170+ "type" : "integer" ,
171+ "description" : "An integer parameter" ,
172+ },
173+ "param3" : {
174+ "type" : "boolean" ,
175+ "description" : "A boolean parameter" ,
176+ },
177+ },
178+ "required" : ["param1" , "param2" , "param3" ],
179+ "additionalProperties" : False ,
180+ },
181+ },
182+ }
183+ ],
157184 "temperature" : 0.1 ,
158185 "top_p" : 1 ,
159186}
@@ -468,48 +495,6 @@ def _safe_json_loads(payload: str) -> Any | None:
468495 "stream" : True ,
469496}
470497
471- PEER_TOOL_RECOVERY_FAILURE_PAYLOAD = {
472- "messages" : [
473- {
474- "role" : "user" ,
475- "content" : (
476- "View the file at /tmp/test.txt. If that fails, try again with the correct parameters. "
477- "Keep retrying until it works."
478- ),
479- }
480- ],
481- "tools" : [
482- {
483- "type" : "function" ,
484- "function" : {
485- "name" : "view" ,
486- "description" : "View a file or directory" ,
487- "strict" : True ,
488- "parameters" : {
489- "type" : "object" ,
490- "properties" : {
491- "path" : {
492- "type" : "string" ,
493- "description" : "Path to the file or directory to view" ,
494- },
495- "type" : {
496- "type" : "string" ,
497- "enum" : ["file" , "directory" ],
498- "description" : "Type of the path (file or directory)" ,
499- },
500- },
501- "required" : ["path" , "type" ],
502- "additionalProperties" : False ,
503- },
504- },
505- }
506- ],
507- "tool_choice" : "required" ,
508- "temperature" : 0.1 ,
509- "max_tokens" : 4000 ,
510- "stream" : True ,
511- }
512-
513498
514499def _build_row_from_payload (case : str , payload : dict [str , Any ]) -> EvaluationRow :
515500 messages = [
@@ -1329,47 +1314,50 @@ def test_streaming_multiple_tool_calls(row: EvaluationRow) -> EvaluationRow:
13291314 return row
13301315
13311316
1332- _PEER_TOOL_MISSING_REQUIRED_ROW = _build_row_from_payload (
1333- "peer-tool-missing- required-param " , PEER_TOOL_MISSING_REQUIRED_PARAM_PAYLOAD
1317+ _PEER_TOOL_REQUIRED_PARAMS_ROW = _build_row_from_payload (
1318+ "peer-tool-required-params " , PEER_TOOL_MISSING_REQUIRED_PARAM_PAYLOAD
13341319)
13351320
13361321
13371322@evaluation_test (
1338- input_rows = [[_PEER_TOOL_MISSING_REQUIRED_ROW ]],
1323+ input_rows = [[_PEER_TOOL_REQUIRED_PARAMS_ROW ]],
13391324 completion_params = [_build_completion_params_from_payload (PEER_TOOL_MISSING_REQUIRED_PARAM_PAYLOAD )],
13401325 rollout_processor = SingleTurnRolloutProcessor (),
13411326 aggregation_method = "mean" ,
13421327 passed_threshold = 0.0 ,
13431328 num_runs = 1 ,
13441329 mode = "pointwise" ,
13451330)
1346- def test_streaming_tool_missing_required_param (row : EvaluationRow ) -> EvaluationRow :
1347- """Detect whether required parameters are omitted during streaming."""
1331+ def test_streaming_tool_required_params_present (row : EvaluationRow ) -> EvaluationRow :
1332+ """Verify that tool calls include all required parameters during streaming."""
13481333
13491334 assistant_msg = row .last_assistant_message ()
13501335 finish_reason = row .execution_metadata .finish_reason
1351- _debug_log_assistant_message ("tool_missing_required_param " , assistant_msg , finish_reason )
1336+ _debug_log_assistant_message ("tool_required_params " , assistant_msg , finish_reason )
13521337 content_str = _coerce_content_to_str (assistant_msg .content ) if assistant_msg else ""
13531338 reasoning_str = (assistant_msg .reasoning_content or "" ).strip () if assistant_msg else ""
13541339 calls = _collect_tool_calls (assistant_msg .tool_calls if assistant_msg else [])
13551340
1356- missing_required = False
1341+ required_params_present = False
13571342 arguments = None
13581343 for _ , args in calls :
13591344 if args :
13601345 arguments = args
1361- missing_required = "type" not in args or args .get ("type" ) not in {"file" , "directory" }
1346+ # Check that required 'type' param is present and valid
1347+ required_params_present = "type" in args and args .get ("type" ) in {"file" , "directory" }
13621348
13631349 metrics = {
13641350 "tool_call_emitted" : MetricResult (
13651351 score = 1.0 if calls else 0.0 ,
13661352 is_score_valid = True ,
13671353 reason = "Tool call emitted" if calls else "No tool call emitted" ,
13681354 ),
1369- "missing_required_param " : MetricResult (
1370- score = 1.0 if missing_required else 0.0 ,
1355+ "required_params_present " : MetricResult (
1356+ score = 1.0 if required_params_present else 0.0 ,
13711357 is_score_valid = bool (calls ),
1372- reason = "Required parameter missing or invalid" if missing_required else "All required parameters present" ,
1358+ reason = "All required parameters present"
1359+ if required_params_present
1360+ else "Required parameter missing or invalid" ,
13731361 data = {"arguments" : arguments },
13741362 ),
13751363 "finish_reason" : MetricResult (
@@ -1386,15 +1374,19 @@ def test_streaming_tool_missing_required_param(row: EvaluationRow) -> Evaluation
13861374 )
13871375
13881376 all_checks_passed = (
1389- missing_required and finish_reason_present and no_forbidden_tags and no_xml_tags and no_reasoning_leakage
1377+ required_params_present
1378+ and finish_reason_present
1379+ and no_forbidden_tags
1380+ and no_xml_tags
1381+ and no_reasoning_leakage
13901382 )
13911383
13921384 row .evaluation_result = EvaluateResult (
13931385 score = 1.0 if all_checks_passed else 0.0 ,
13941386 is_score_valid = True ,
1395- reason = "Detected missing required parameter "
1387+ reason = "All required parameters included in tool call "
13961388 if all_checks_passed
1397- else "Required parameters satisfied or response invalid" ,
1389+ else "Required parameters missing or response invalid" ,
13981390 metrics = metrics ,
13991391 )
14001392 return row
@@ -1674,71 +1666,6 @@ def test_streaming_tool_parameter_types(row: EvaluationRow) -> EvaluationRow:
16741666 return row
16751667
16761668
1677- _PEER_TOOL_RECOVERY_ROW = _build_row_from_payload ("peer-tool-recovery-failure" , PEER_TOOL_RECOVERY_FAILURE_PAYLOAD )
1678-
1679-
1680- @evaluation_test (
1681- input_rows = [[_PEER_TOOL_RECOVERY_ROW ]],
1682- completion_params = [_build_completion_params_from_payload (PEER_TOOL_RECOVERY_FAILURE_PAYLOAD )],
1683- rollout_processor = SingleTurnRolloutProcessor (),
1684- aggregation_method = "mean" ,
1685- passed_threshold = 0.0 ,
1686- num_runs = 1 ,
1687- mode = "pointwise" ,
1688- )
1689- def test_streaming_tool_retry_behavior (row : EvaluationRow ) -> EvaluationRow :
1690- """Check whether the assistant retries tool calls when instructed to recover."""
1691-
1692- assistant_msg = row .last_assistant_message ()
1693- print (f"assistant_msg: { assistant_msg } " )
1694- finish_reason = row .execution_metadata .finish_reason
1695- _debug_log_assistant_message ("tool_recovery" , assistant_msg , finish_reason )
1696- content_str = _coerce_content_to_str (assistant_msg .content ) if assistant_msg else ""
1697- calls = _collect_tool_calls (assistant_msg .tool_calls if assistant_msg else [])
1698- reasoning = (assistant_msg .reasoning_content or "" ).strip () if assistant_msg else ""
1699-
1700- multiple_attempts = len (calls ) >= 2
1701- metrics = {
1702- "tool_call_attempts" : MetricResult (
1703- score = 1.0 if multiple_attempts else 0.0 ,
1704- is_score_valid = True ,
1705- reason = "Multiple tool call attempts" if multiple_attempts else "Single/no tool call attempt" ,
1706- data = {"tool_call_count" : len (calls )},
1707- ),
1708- "reasoning_present" : MetricResult (
1709- score = 1.0 if reasoning else 0.0 ,
1710- is_score_valid = True ,
1711- reason = "Reasoning present" if reasoning else "No reasoning provided" ,
1712- data = {"reasoning" : reasoning [:160 ]},
1713- ),
1714- "finish_reason" : MetricResult (
1715- score = 1.0 if finish_reason in {"tool_calls" , "stop" } else 0.0 ,
1716- is_score_valid = True ,
1717- reason = "finish_reason acceptable"
1718- if finish_reason in {"tool_calls" , "stop" }
1719- else f"Unexpected finish_reason: { finish_reason } " ,
1720- ),
1721- }
1722-
1723- finish_reason_present , no_forbidden_tags , no_xml_tags , no_reasoning_leakage = _augment_metrics_with_common_checks (
1724- metrics , finish_reason , content_str , reasoning
1725- )
1726-
1727- all_checks_passed = (
1728- multiple_attempts and finish_reason_present and no_forbidden_tags and no_xml_tags and no_reasoning_leakage
1729- )
1730-
1731- row .evaluation_result = EvaluateResult (
1732- score = 1.0 if all_checks_passed else 0.0 ,
1733- is_score_valid = True ,
1734- reason = "Multiple recovery attempts observed"
1735- if all_checks_passed
1736- else "Recovery attempts missing or response invalid" ,
1737- metrics = metrics ,
1738- )
1739- return row
1740-
1741-
17421669# ============================================================================
17431670# Reasoning Effort Tests
17441671# ============================================================================
@@ -1759,7 +1686,7 @@ def test_streaming_tool_retry_behavior(row: EvaluationRow) -> EvaluationRow:
17591686 input_rows = [[REASONING_DISABLED_ROW ]],
17601687 completion_params = [
17611688 {
1762- "model" : "fireworks_ai/accounts/fireworks/models/deepseek-v3p1" , # Reasoning-capable model
1689+ "model" : DEFAULT_MODEL_ID , # Reasoning-capable model
17631690 "reasoning_effort" : "none" , # Explicitly disable reasoning
17641691 "max_tokens" : DEFAULT_MAX_TOKENS ,
17651692 "temperature" : 0.0 ,
@@ -1869,7 +1796,7 @@ def test_reasoning_effort_none_no_reasoning(row: EvaluationRow) -> EvaluationRow
18691796 input_rows = [[REASONING_ENABLED_ROW ]],
18701797 completion_params = [
18711798 {
1872- "model" : "fireworks_ai/accounts/fireworks/models/deepseek-v3p1" , # Reasoning-capable model
1799+ "model" : DEFAULT_MODEL_ID , # Reasoning-capable model
18731800 "reasoning_effort" : "low" , # Enable reasoning
18741801 "max_tokens" : DEFAULT_MAX_TOKENS ,
18751802 "temperature" : 0.0 ,
@@ -2004,7 +1931,7 @@ def test_reasoning_effort_low_has_reasoning(row: EvaluationRow) -> EvaluationRow
20041931 input_rows = [[TOOLS_WITH_REASONING_ROW ]],
20051932 completion_params = [
20061933 {
2007- "model" : "fireworks_ai/accounts/fireworks/models/deepseek-v3p1" , # Reasoning-capable model
1934+ "model" : DEFAULT_MODEL_ID , # Reasoning-capable model
20081935 "reasoning_effort" : "low" , # Enable reasoning
20091936 "max_tokens" : DEFAULT_MAX_TOKENS ,
20101937 "temperature" : 0.0 ,
@@ -2727,7 +2654,7 @@ def test_non_streaming_multiple_tool_calls(row: EvaluationRow) -> EvaluationRow:
27272654 input_rows = [[REASONING_DISABLED_NON_STREAM_ROW ]],
27282655 completion_params = [
27292656 {
2730- "model" : "fireworks_ai/accounts/fireworks/models/deepseek-v3p1" ,
2657+ "model" : DEFAULT_MODEL_ID ,
27312658 "reasoning_effort" : "none" ,
27322659 "max_tokens" : DEFAULT_MAX_TOKENS ,
27332660 "temperature" : 0.0 ,
@@ -2834,7 +2761,7 @@ def test_reasoning_effort_none_no_reasoning_non_stream(row: EvaluationRow) -> Ev
28342761 input_rows = [[REASONING_ENABLED_NON_STREAM_ROW ]],
28352762 completion_params = [
28362763 {
2837- "model" : "fireworks_ai/accounts/fireworks/models/deepseek-v3p1" ,
2764+ "model" : DEFAULT_MODEL_ID ,
28382765 "reasoning_effort" : "low" ,
28392766 "max_tokens" : DEFAULT_MAX_TOKENS ,
28402767 "temperature" : 0.0 ,
@@ -2962,7 +2889,7 @@ def test_reasoning_effort_low_has_reasoning_non_stream(row: EvaluationRow) -> Ev
29622889 input_rows = [[TOOLS_WITH_REASONING_NON_STREAM_ROW ]],
29632890 completion_params = [
29642891 {
2965- "model" : "fireworks_ai/accounts/fireworks/models/deepseek-v3p1" ,
2892+ "model" : DEFAULT_MODEL_ID ,
29662893 "reasoning_effort" : "low" ,
29672894 "max_tokens" : DEFAULT_MAX_TOKENS ,
29682895 "temperature" : 0.0 ,
@@ -3108,7 +3035,7 @@ def test_non_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow
31083035 input_rows = [[STRUCTURED_OUTPUT_WITH_REASONING_ROW ]],
31093036 completion_params = [
31103037 {
3111- "model" : "fireworks_ai/accounts/fireworks/models/deepseek-v3p1" ,
3038+ "model" : DEFAULT_MODEL_ID ,
31123039 "stream" : True ,
31133040 "reasoning_effort" : "low" ,
31143041 "response_format" : STRUCTURED_JSON_SCHEMA ,
@@ -3211,7 +3138,7 @@ def test_streaming_structured_output_with_reasoning(row: EvaluationRow) -> Evalu
32113138 input_rows = [[STRUCTURED_OUTPUT_WITH_REASONING_NON_STREAM_ROW ]],
32123139 completion_params = [
32133140 {
3214- "model" : "fireworks_ai/accounts/fireworks/models/deepseek-v3p1" ,
3141+ "model" : DEFAULT_MODEL_ID ,
32153142 "stream" : False ,
32163143 "reasoning_effort" : "low" ,
32173144 "response_format" : STRUCTURED_JSON_SCHEMA ,
@@ -3334,7 +3261,7 @@ def test_non_streaming_structured_output_with_reasoning(row: EvaluationRow) -> E
33343261 input_rows = [[MULTIPLE_TOOLS_WITH_REASONING_ROW ]],
33353262 completion_params = [
33363263 {
3337- "model" : "fireworks_ai/accounts/fireworks/models/deepseek-v3p1" ,
3264+ "model" : DEFAULT_MODEL_ID ,
33383265 "stream" : True ,
33393266 "reasoning_effort" : "low" ,
33403267 "temperature" : 0.0 ,
@@ -3461,7 +3388,7 @@ def test_streaming_multiple_tools_with_reasoning(row: EvaluationRow) -> Evaluati
34613388 input_rows = [[MULTIPLE_TOOLS_WITH_REASONING_NON_STREAM_ROW ]],
34623389 completion_params = [
34633390 {
3464- "model" : "fireworks_ai/accounts/fireworks/models/deepseek-v3p1" ,
3391+ "model" : DEFAULT_MODEL_ID ,
34653392 "stream" : False ,
34663393 "reasoning_effort" : "low" ,
34673394 "temperature" : 0.0 ,
0 commit comments