From 2bc7b58f23faf457ce548d4e8d65cbc6e2ac5f30 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 5 Jan 2026 19:01:22 +0000
Subject: [PATCH] Add configurable conversation timeout to all benchmarks

Apply the conversation timeout configuration already used by the commit0
benchmark to all other benchmarks:
- gaia
- multiswebench
- openagentsafety
- swebench
- swebenchmultimodal
- swtbench

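The timeout is passed to conversation.run(timeout=...). It defaults to
3600 seconds (1 hour) and can be overridden by setting the
CONVERSATION_TIMEOUT environment variable to an integer number of seconds.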

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/gaia/run_infer.py               | 3 ++-
 benchmarks/multiswebench/run_infer.py      | 3 ++-
 benchmarks/openagentsafety/run_infer.py    | 3 ++-
 benchmarks/swebench/run_infer.py           | 3 ++-
 benchmarks/swebenchmultimodal/run_infer.py | 3 ++-
 benchmarks/swtbench/run_infer.py           | 3 ++-
 6 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
index 0a0569ab..f95d5846 100644
--- a/benchmarks/gaia/run_infer.py
+++ b/benchmarks/gaia/run_infer.py
@@ -303,7 +303,8 @@ def evaluate_instance(
             conversation.send_message(msg)
         else:
             conversation.send_message(instruction)
-        conversation.run()
+        run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600"))
+        conversation.run(timeout=run_timeout)
 
         # Extract answer from conversation history
         model_answer_raw = self._extract_answer_from_history(conversation.state.events)
diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py
index 9a64c73d..1c10ec33 100644
--- a/benchmarks/multiswebench/run_infer.py
+++ b/benchmarks/multiswebench/run_infer.py
@@ -340,7 +340,8 @@ def _log_event(ev):  # keep it simple
             workspace_path=workspace.working_dir,
         )
         conversation.send_message(instruction)
-        conversation.run()
+        run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600"))
+        conversation.run(timeout=run_timeout)
 
         # git add
         workspace.execute_command(f"cd {repo_path} ; git add -A")
diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py
index d9206456..48e12273 100644
--- a/benchmarks/openagentsafety/run_infer.py
+++ b/benchmarks/openagentsafety/run_infer.py
@@ -434,10 +434,11 @@ def event_callback(event) -> None:
         conversation.send_message(instruction)
 
         # Run conversation with error handling
+        run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600"))
         try:
             with warnings.catch_warnings():
                 warnings.filterwarnings("ignore", category=UserWarning)
-                conversation.run()
+                conversation.run(timeout=run_timeout)
             logger.info(f"Conversation completed for {instance.id}")
         except ValidationError as e:
             logger.warning(f"Validation error from custom events (continuing): {e}")
diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index 94846c4d..8cefd913 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -247,7 +247,8 @@ def _log_event(ev):  # keep it simple
             workspace_path=workspace.working_dir,
         )
         conversation.send_message(instruction)
-        conversation.run()
+        run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600"))
+        conversation.run(timeout=run_timeout)
 
         # git add
         workspace.execute_command(f"cd {repo_path} ; git add -A")
diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py
index 2311c778..5eaf2a1e 100644
--- a/benchmarks/swebenchmultimodal/run_infer.py
+++ b/benchmarks/swebenchmultimodal/run_infer.py
@@ -291,7 +291,8 @@ def _log_event(ev):  # keep it simple
         else:
             logger.info("No image_assets found, sending text-only instruction")
             conversation.send_message(instruction)
-        conversation.run()
+        run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600"))
+        conversation.run(timeout=run_timeout)
 
         # git add
         workspace.execute_command(f"cd {repo_path} ; git add -A")
diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index 983bf6d8..82f14f9a 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -274,7 +274,8 @@ def _log_event(ev):  # keep it simple
             workspace_path=workspace.working_dir,
         )
         conversation.send_message(instruction)
-        conversation.run()
+        run_timeout = int(os.getenv("CONVERSATION_TIMEOUT", "3600"))
+        conversation.run(timeout=run_timeout)
 
         # git add
         workspace.execute_command(f"cd {repo_path} ; git add -A")