From 2bc7b58f23faf457ce548d4e8d65cbc6e2ac5f30 Mon Sep 17 00:00:00 2001
From: openhands
Date: Mon, 5 Jan 2026 19:01:22 +0000
Subject: [PATCH] Add configurable conversation timeout to all benchmarks

Apply the same timeout configuration from commit0 to all other benchmarks:
- gaia
- multiswebench
- openagentsafety
- swebench
- swebenchmultimodal
- swtbench

Default timeout is 3600 seconds (1 hour), configurable via
CONVERSATION_TIMEOUT env var.

Co-authored-by: openhands
---
 benchmarks/gaia/run_infer.py               | 3 ++-
 benchmarks/multiswebench/run_infer.py      | 3 ++-
 benchmarks/openagentsafety/run_infer.py    | 3 ++-
 benchmarks/swebench/run_infer.py           | 3 ++-
 benchmarks/swebenchmultimodal/run_infer.py | 3 ++-
 benchmarks/swtbench/run_infer.py           | 3 ++-
 6 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
index 0a0569ab..f95d5846 100644
--- a/benchmarks/gaia/run_infer.py
+++ b/benchmarks/gaia/run_infer.py
@@ -303,7 +303,8 @@ def evaluate_instance(
             conversation.send_message(msg)
         else:
             conversation.send_message(instruction)
-        conversation.run()
+        run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600"))
+        conversation.run(timeout=run_timeout)
 
         # Extract answer from conversation history
         model_answer_raw = self._extract_answer_from_history(conversation.state.events)
diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py
index 9a64c73d..1c10ec33 100644
--- a/benchmarks/multiswebench/run_infer.py
+++ b/benchmarks/multiswebench/run_infer.py
@@ -340,7 +340,8 @@ def _log_event(ev): # keep it simple
             workspace_path=workspace.working_dir,
         )
         conversation.send_message(instruction)
-        conversation.run()
+        run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600"))
+        conversation.run(timeout=run_timeout)
 
         # git add
         workspace.execute_command(f"cd {repo_path} ; git add -A")
diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py
index d9206456..48e12273 100644
--- a/benchmarks/openagentsafety/run_infer.py
+++ b/benchmarks/openagentsafety/run_infer.py
@@ -434,10 +434,11 @@ def event_callback(event) -> None:
         conversation.send_message(instruction)
 
         # Run conversation with error handling
+        run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600"))
         try:
             with warnings.catch_warnings():
                 warnings.filterwarnings("ignore", category=UserWarning)
-                conversation.run()
+                conversation.run(timeout=run_timeout)
             logger.info(f"Conversation completed for {instance.id}")
         except ValidationError as e:
             logger.warning(f"Validation error from custom events (continuing): {e}")
diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index 94846c4d..8cefd913 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -247,7 +247,8 @@ def _log_event(ev): # keep it simple
             workspace_path=workspace.working_dir,
         )
         conversation.send_message(instruction)
-        conversation.run()
+        run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600"))
+        conversation.run(timeout=run_timeout)
 
         # git add
         workspace.execute_command(f"cd {repo_path} ; git add -A")
diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py
index 2311c778..5eaf2a1e 100644
--- a/benchmarks/swebenchmultimodal/run_infer.py
+++ b/benchmarks/swebenchmultimodal/run_infer.py
@@ -291,7 +291,8 @@ def _log_event(ev): # keep it simple
         else:
             logger.info("No image_assets found, sending text-only instruction")
             conversation.send_message(instruction)
-        conversation.run()
+        run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600"))
+        conversation.run(timeout=run_timeout)
 
         # git add
         workspace.execute_command(f"cd {repo_path} ; git add -A")
diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index 983bf6d8..82f14f9a 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -274,7 +274,8 @@ def _log_event(ev): # keep it simple
             workspace_path=workspace.working_dir,
         )
         conversation.send_message(instruction)
-        conversation.run()
+        run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600"))
+        conversation.run(timeout=run_timeout)
 
         # git add
         workspace.execute_command(f"cd {repo_path} ; git add -A")