Skip to content

Commit 688e87f

Browse files
committed
yo
1 parent 406ed5b commit 688e87f

File tree

1 file changed

+60
-133
lines changed

1 file changed

+60
-133
lines changed

eval_protocol/benchmarks/test_glm_streaming_compliance.py

Lines changed: 60 additions & 133 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@
1818
from eval_protocol.pytest.evaluation_test import evaluation_test
1919

2020

21-
DEFAULT_MODEL_ID = "fireworks_ai/accounts/fireworks/models/glm-4p6"
21+
DEFAULT_MODEL_ID = "fireworks_ai/accounts/pyroworks/deployedModels/minimax-m2-zmi4qk9f"
2222
DEFAULT_MAX_TOKENS = 10000
2323

2424

@@ -153,7 +153,34 @@ def _safe_json_loads(payload: str) -> Any | None:
153153
"content": "Call test_brace_bug with param1='test_value', param2=42, and param3=true",
154154
}
155155
],
156-
"tools": WEATHER_TOOL_DEFINITION,
156+
"tools": [
157+
{
158+
"type": "function",
159+
"function": {
160+
"name": "test_brace_bug",
161+
"description": "A test function to validate JSON brace handling in tool arguments",
162+
"parameters": {
163+
"type": "object",
164+
"properties": {
165+
"param1": {
166+
"type": "string",
167+
"description": "A string parameter",
168+
},
169+
"param2": {
170+
"type": "integer",
171+
"description": "An integer parameter",
172+
},
173+
"param3": {
174+
"type": "boolean",
175+
"description": "A boolean parameter",
176+
},
177+
},
178+
"required": ["param1", "param2", "param3"],
179+
"additionalProperties": False,
180+
},
181+
},
182+
}
183+
],
157184
"temperature": 0.1,
158185
"top_p": 1,
159186
}
@@ -468,48 +495,6 @@ def _safe_json_loads(payload: str) -> Any | None:
468495
"stream": True,
469496
}
470497

471-
PEER_TOOL_RECOVERY_FAILURE_PAYLOAD = {
472-
"messages": [
473-
{
474-
"role": "user",
475-
"content": (
476-
"View the file at /tmp/test.txt. If that fails, try again with the correct parameters. "
477-
"Keep retrying until it works."
478-
),
479-
}
480-
],
481-
"tools": [
482-
{
483-
"type": "function",
484-
"function": {
485-
"name": "view",
486-
"description": "View a file or directory",
487-
"strict": True,
488-
"parameters": {
489-
"type": "object",
490-
"properties": {
491-
"path": {
492-
"type": "string",
493-
"description": "Path to the file or directory to view",
494-
},
495-
"type": {
496-
"type": "string",
497-
"enum": ["file", "directory"],
498-
"description": "Type of the path (file or directory)",
499-
},
500-
},
501-
"required": ["path", "type"],
502-
"additionalProperties": False,
503-
},
504-
},
505-
}
506-
],
507-
"tool_choice": "required",
508-
"temperature": 0.1,
509-
"max_tokens": 4000,
510-
"stream": True,
511-
}
512-
513498

514499
def _build_row_from_payload(case: str, payload: dict[str, Any]) -> EvaluationRow:
515500
messages = [
@@ -1329,47 +1314,50 @@ def test_streaming_multiple_tool_calls(row: EvaluationRow) -> EvaluationRow:
13291314
return row
13301315

13311316

1332-
_PEER_TOOL_MISSING_REQUIRED_ROW = _build_row_from_payload(
1333-
"peer-tool-missing-required-param", PEER_TOOL_MISSING_REQUIRED_PARAM_PAYLOAD
1317+
_PEER_TOOL_REQUIRED_PARAMS_ROW = _build_row_from_payload(
1318+
"peer-tool-required-params", PEER_TOOL_MISSING_REQUIRED_PARAM_PAYLOAD
13341319
)
13351320

13361321

13371322
@evaluation_test(
1338-
input_rows=[[_PEER_TOOL_MISSING_REQUIRED_ROW]],
1323+
input_rows=[[_PEER_TOOL_REQUIRED_PARAMS_ROW]],
13391324
completion_params=[_build_completion_params_from_payload(PEER_TOOL_MISSING_REQUIRED_PARAM_PAYLOAD)],
13401325
rollout_processor=SingleTurnRolloutProcessor(),
13411326
aggregation_method="mean",
13421327
passed_threshold=0.0,
13431328
num_runs=1,
13441329
mode="pointwise",
13451330
)
1346-
def test_streaming_tool_missing_required_param(row: EvaluationRow) -> EvaluationRow:
1347-
"""Detect whether required parameters are omitted during streaming."""
1331+
def test_streaming_tool_required_params_present(row: EvaluationRow) -> EvaluationRow:
1332+
"""Verify that tool calls include all required parameters during streaming."""
13481333

13491334
assistant_msg = row.last_assistant_message()
13501335
finish_reason = row.execution_metadata.finish_reason
1351-
_debug_log_assistant_message("tool_missing_required_param", assistant_msg, finish_reason)
1336+
_debug_log_assistant_message("tool_required_params", assistant_msg, finish_reason)
13521337
content_str = _coerce_content_to_str(assistant_msg.content) if assistant_msg else ""
13531338
reasoning_str = (assistant_msg.reasoning_content or "").strip() if assistant_msg else ""
13541339
calls = _collect_tool_calls(assistant_msg.tool_calls if assistant_msg else [])
13551340

1356-
missing_required = False
1341+
required_params_present = False
13571342
arguments = None
13581343
for _, args in calls:
13591344
if args:
13601345
arguments = args
1361-
missing_required = "type" not in args or args.get("type") not in {"file", "directory"}
1346+
# Check that required 'type' param is present and valid
1347+
required_params_present = "type" in args and args.get("type") in {"file", "directory"}
13621348

13631349
metrics = {
13641350
"tool_call_emitted": MetricResult(
13651351
score=1.0 if calls else 0.0,
13661352
is_score_valid=True,
13671353
reason="Tool call emitted" if calls else "No tool call emitted",
13681354
),
1369-
"missing_required_param": MetricResult(
1370-
score=1.0 if missing_required else 0.0,
1355+
"required_params_present": MetricResult(
1356+
score=1.0 if required_params_present else 0.0,
13711357
is_score_valid=bool(calls),
1372-
reason="Required parameter missing or invalid" if missing_required else "All required parameters present",
1358+
reason="All required parameters present"
1359+
if required_params_present
1360+
else "Required parameter missing or invalid",
13731361
data={"arguments": arguments},
13741362
),
13751363
"finish_reason": MetricResult(
@@ -1386,15 +1374,19 @@ def test_streaming_tool_missing_required_param(row: EvaluationRow) -> Evaluation
13861374
)
13871375

13881376
all_checks_passed = (
1389-
missing_required and finish_reason_present and no_forbidden_tags and no_xml_tags and no_reasoning_leakage
1377+
required_params_present
1378+
and finish_reason_present
1379+
and no_forbidden_tags
1380+
and no_xml_tags
1381+
and no_reasoning_leakage
13901382
)
13911383

13921384
row.evaluation_result = EvaluateResult(
13931385
score=1.0 if all_checks_passed else 0.0,
13941386
is_score_valid=True,
1395-
reason="Detected missing required parameter"
1387+
reason="All required parameters included in tool call"
13961388
if all_checks_passed
1397-
else "Required parameters satisfied or response invalid",
1389+
else "Required parameters missing or response invalid",
13981390
metrics=metrics,
13991391
)
14001392
return row
@@ -1674,71 +1666,6 @@ def test_streaming_tool_parameter_types(row: EvaluationRow) -> EvaluationRow:
16741666
return row
16751667

16761668

1677-
_PEER_TOOL_RECOVERY_ROW = _build_row_from_payload("peer-tool-recovery-failure", PEER_TOOL_RECOVERY_FAILURE_PAYLOAD)
1678-
1679-
1680-
@evaluation_test(
1681-
input_rows=[[_PEER_TOOL_RECOVERY_ROW]],
1682-
completion_params=[_build_completion_params_from_payload(PEER_TOOL_RECOVERY_FAILURE_PAYLOAD)],
1683-
rollout_processor=SingleTurnRolloutProcessor(),
1684-
aggregation_method="mean",
1685-
passed_threshold=0.0,
1686-
num_runs=1,
1687-
mode="pointwise",
1688-
)
1689-
def test_streaming_tool_retry_behavior(row: EvaluationRow) -> EvaluationRow:
1690-
"""Check whether the assistant retries tool calls when instructed to recover."""
1691-
1692-
assistant_msg = row.last_assistant_message()
1693-
print(f"assistant_msg: {assistant_msg}")
1694-
finish_reason = row.execution_metadata.finish_reason
1695-
_debug_log_assistant_message("tool_recovery", assistant_msg, finish_reason)
1696-
content_str = _coerce_content_to_str(assistant_msg.content) if assistant_msg else ""
1697-
calls = _collect_tool_calls(assistant_msg.tool_calls if assistant_msg else [])
1698-
reasoning = (assistant_msg.reasoning_content or "").strip() if assistant_msg else ""
1699-
1700-
multiple_attempts = len(calls) >= 2
1701-
metrics = {
1702-
"tool_call_attempts": MetricResult(
1703-
score=1.0 if multiple_attempts else 0.0,
1704-
is_score_valid=True,
1705-
reason="Multiple tool call attempts" if multiple_attempts else "Single/no tool call attempt",
1706-
data={"tool_call_count": len(calls)},
1707-
),
1708-
"reasoning_present": MetricResult(
1709-
score=1.0 if reasoning else 0.0,
1710-
is_score_valid=True,
1711-
reason="Reasoning present" if reasoning else "No reasoning provided",
1712-
data={"reasoning": reasoning[:160]},
1713-
),
1714-
"finish_reason": MetricResult(
1715-
score=1.0 if finish_reason in {"tool_calls", "stop"} else 0.0,
1716-
is_score_valid=True,
1717-
reason="finish_reason acceptable"
1718-
if finish_reason in {"tool_calls", "stop"}
1719-
else f"Unexpected finish_reason: {finish_reason}",
1720-
),
1721-
}
1722-
1723-
finish_reason_present, no_forbidden_tags, no_xml_tags, no_reasoning_leakage = _augment_metrics_with_common_checks(
1724-
metrics, finish_reason, content_str, reasoning
1725-
)
1726-
1727-
all_checks_passed = (
1728-
multiple_attempts and finish_reason_present and no_forbidden_tags and no_xml_tags and no_reasoning_leakage
1729-
)
1730-
1731-
row.evaluation_result = EvaluateResult(
1732-
score=1.0 if all_checks_passed else 0.0,
1733-
is_score_valid=True,
1734-
reason="Multiple recovery attempts observed"
1735-
if all_checks_passed
1736-
else "Recovery attempts missing or response invalid",
1737-
metrics=metrics,
1738-
)
1739-
return row
1740-
1741-
17421669
# ============================================================================
17431670
# Reasoning Effort Tests
17441671
# ============================================================================
@@ -1759,7 +1686,7 @@ def test_streaming_tool_retry_behavior(row: EvaluationRow) -> EvaluationRow:
17591686
input_rows=[[REASONING_DISABLED_ROW]],
17601687
completion_params=[
17611688
{
1762-
"model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", # Reasoning-capable model
1689+
"model": DEFAULT_MODEL_ID, # Reasoning-capable model
17631690
"reasoning_effort": "none", # Explicitly disable reasoning
17641691
"max_tokens": DEFAULT_MAX_TOKENS,
17651692
"temperature": 0.0,
@@ -1869,7 +1796,7 @@ def test_reasoning_effort_none_no_reasoning(row: EvaluationRow) -> EvaluationRow
18691796
input_rows=[[REASONING_ENABLED_ROW]],
18701797
completion_params=[
18711798
{
1872-
"model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", # Reasoning-capable model
1799+
"model": DEFAULT_MODEL_ID, # Reasoning-capable model
18731800
"reasoning_effort": "low", # Enable reasoning
18741801
"max_tokens": DEFAULT_MAX_TOKENS,
18751802
"temperature": 0.0,
@@ -2004,7 +1931,7 @@ def test_reasoning_effort_low_has_reasoning(row: EvaluationRow) -> EvaluationRow
20041931
input_rows=[[TOOLS_WITH_REASONING_ROW]],
20051932
completion_params=[
20061933
{
2007-
"model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", # Reasoning-capable model
1934+
"model": DEFAULT_MODEL_ID, # Reasoning-capable model
20081935
"reasoning_effort": "low", # Enable reasoning
20091936
"max_tokens": DEFAULT_MAX_TOKENS,
20101937
"temperature": 0.0,
@@ -2727,7 +2654,7 @@ def test_non_streaming_multiple_tool_calls(row: EvaluationRow) -> EvaluationRow:
27272654
input_rows=[[REASONING_DISABLED_NON_STREAM_ROW]],
27282655
completion_params=[
27292656
{
2730-
"model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
2657+
"model": DEFAULT_MODEL_ID,
27312658
"reasoning_effort": "none",
27322659
"max_tokens": DEFAULT_MAX_TOKENS,
27332660
"temperature": 0.0,
@@ -2834,7 +2761,7 @@ def test_reasoning_effort_none_no_reasoning_non_stream(row: EvaluationRow) -> Ev
28342761
input_rows=[[REASONING_ENABLED_NON_STREAM_ROW]],
28352762
completion_params=[
28362763
{
2837-
"model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
2764+
"model": DEFAULT_MODEL_ID,
28382765
"reasoning_effort": "low",
28392766
"max_tokens": DEFAULT_MAX_TOKENS,
28402767
"temperature": 0.0,
@@ -2962,7 +2889,7 @@ def test_reasoning_effort_low_has_reasoning_non_stream(row: EvaluationRow) -> Ev
29622889
input_rows=[[TOOLS_WITH_REASONING_NON_STREAM_ROW]],
29632890
completion_params=[
29642891
{
2965-
"model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
2892+
"model": DEFAULT_MODEL_ID,
29662893
"reasoning_effort": "low",
29672894
"max_tokens": DEFAULT_MAX_TOKENS,
29682895
"temperature": 0.0,
@@ -3108,7 +3035,7 @@ def test_non_streaming_tools_with_reasoning(row: EvaluationRow) -> EvaluationRow
31083035
input_rows=[[STRUCTURED_OUTPUT_WITH_REASONING_ROW]],
31093036
completion_params=[
31103037
{
3111-
"model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
3038+
"model": DEFAULT_MODEL_ID,
31123039
"stream": True,
31133040
"reasoning_effort": "low",
31143041
"response_format": STRUCTURED_JSON_SCHEMA,
@@ -3211,7 +3138,7 @@ def test_streaming_structured_output_with_reasoning(row: EvaluationRow) -> Evalu
32113138
input_rows=[[STRUCTURED_OUTPUT_WITH_REASONING_NON_STREAM_ROW]],
32123139
completion_params=[
32133140
{
3214-
"model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
3141+
"model": DEFAULT_MODEL_ID,
32153142
"stream": False,
32163143
"reasoning_effort": "low",
32173144
"response_format": STRUCTURED_JSON_SCHEMA,
@@ -3334,7 +3261,7 @@ def test_non_streaming_structured_output_with_reasoning(row: EvaluationRow) -> E
33343261
input_rows=[[MULTIPLE_TOOLS_WITH_REASONING_ROW]],
33353262
completion_params=[
33363263
{
3337-
"model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
3264+
"model": DEFAULT_MODEL_ID,
33383265
"stream": True,
33393266
"reasoning_effort": "low",
33403267
"temperature": 0.0,
@@ -3461,7 +3388,7 @@ def test_streaming_multiple_tools_with_reasoning(row: EvaluationRow) -> Evaluati
34613388
input_rows=[[MULTIPLE_TOOLS_WITH_REASONING_NON_STREAM_ROW]],
34623389
completion_params=[
34633390
{
3464-
"model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1",
3391+
"model": DEFAULT_MODEL_ID,
34653392
"stream": False,
34663393
"reasoning_effort": "low",
34673394
"temperature": 0.0,

0 commit comments

Comments
 (0)