From 4330d40138460e74a303f2d62ad4a165b9c110fe Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Thu, 26 Feb 2026 17:55:40 -0800 Subject: [PATCH] Fix OpenHands Modal startup and eval sandbox reliability --- .../agents/openhands_agent_sdk/adapter.py | 64 ++----------------- .../openhands/sdk/conversation/event_store.py | 10 +-- .../openhands/sdk/observability/laminar.py | 7 +- .../openhands-sdk/openhands/sdk/tool/tool.py | 5 +- .../openhands/sdk/utils/paging.py | 9 ++- .../terminal/terminal/terminal_session.py | 5 +- src/cooperbench/eval/backends/modal.py | 5 ++ 7 files changed, 34 insertions(+), 71 deletions(-) diff --git a/src/cooperbench/agents/openhands_agent_sdk/adapter.py b/src/cooperbench/agents/openhands_agent_sdk/adapter.py index ce07c2b..68bc42e 100644 --- a/src/cooperbench/agents/openhands_agent_sdk/adapter.py +++ b/src/cooperbench/agents/openhands_agent_sdk/adapter.py @@ -666,8 +666,9 @@ def _collect_credentials(self) -> dict[str, str]: def __enter__(self) -> str: """Start sandbox, run agent-server, and return the tunnel URL.""" - # Build image and clear entrypoint (Modal will add its own default command) - image = modal.Image.from_registry(self.image_name).entrypoint([]) + # Preserve image ENTRYPOINT. + # The `-oh` images set ENTRYPOINT to launch `openhands.agent_server`. + image = modal.Image.from_registry(self.image_name) # Get or create app app = modal.App.lookup("cooperbench", create_if_missing=True) @@ -682,66 +683,13 @@ def __enter__(self) -> str: timeout=self.timeout, app=app, secrets=secrets, + # Start outside /workspace/repo to avoid import shadowing + # (e.g., openai_tiktoken_task shadows litellm's `tiktoken` import). + workdir="/", # Expose port 8000 for the agent-server encrypted_ports=[8000], ) - - # IMPORTANT: We use a Python wrapper to ensure collaboration tools are - # imported in the SAME process as the agent-server. This is needed because - # Modal may cache Docker images and the __init__.py auto-import might not - # be in the cached image. - # - # We write the wrapper to a file first, then execute it (heredocs don't - # work well with sandbox.exec's bash -c). - - wrapper_script = ''' -import sys -import os - -sys.argv = ['agent_server', '--host', '0.0.0.0', '--port', '8000'] - -# Force import collaboration tools to register them BEFORE server starts -try: - from openhands.tools.collaboration import SendMessageTool, ReceiveMessageTool - print('[STARTUP] Collaboration tools registered:', SendMessageTool.name, ReceiveMessageTool.name, flush=True) -except Exception as e: - print('[STARTUP] WARNING: Failed to import collaboration tools:', e, flush=True) - -# Now run the agent server (tools are registered in this process) -from openhands.agent_server.__main__ import main -main() -''' - - # Bash script to set up credentials and run the Python wrapper - startup_script = """ -#!/bin/bash -set -e - -# Write Google Cloud credentials if provided -if [ -n "$GOOGLE_APPLICATION_CREDENTIALS_JSON" ]; then - echo "$GOOGLE_APPLICATION_CREDENTIALS_JSON" > /tmp/gcp-credentials.json - export GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcp-credentials.json -fi - -# Run the Python wrapper -exec /opt/agent-server-venv/bin/python /tmp/agent_wrapper.py -""" - - # Write the Python wrapper script to the sandbox using base64 (safe encoding) - import base64 - wrapper_b64 = base64.b64encode(wrapper_script.encode()).decode() - write_wrapper = self._sandbox.exec("bash", "-c", f"echo '{wrapper_b64}' | base64 -d > /tmp/agent_wrapper.py") - write_wrapper.wait() - - # Start the agent-server manually (since we cleared the entrypoint) - self._server_proc = self._sandbox.exec( - "bash", "-c", startup_script, - ) - - # Give the server a moment to start and capture initial output - time.sleep(3) - # Get tunnel URL tunnel_info = self._sandbox.tunnels()[8000] tunnel_url = tunnel_info.url diff --git a/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/conversation/event_store.py b/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/conversation/event_store.py index cec748b..92c48f2 100644 --- a/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/conversation/event_store.py +++ b/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/conversation/event_store.py @@ -172,11 +172,11 @@ def __len__(self) -> int: return self._length def _path(self, idx: int, *, event_id: EventID | None = None) -> str: - return f"{self._dir}/{ - EVENT_FILE_PATTERN.format( - idx=idx, event_id=event_id or self._idx_to_id[idx] - ) - }" + filename = EVENT_FILE_PATTERN.format( + idx=idx, + event_id=event_id or self._idx_to_id[idx], + ) + return f"{self._dir}/{filename}" def _scan_and_build_index(self) -> int: try: diff --git a/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/observability/laminar.py b/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/observability/laminar.py index 830fc0d..5cab7b0 100644 --- a/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/observability/laminar.py +++ b/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/observability/laminar.py @@ -2,6 +2,8 @@ from typing import ( Any, Literal, + ParamSpec, + TypeVar, ) import litellm @@ -19,6 +21,9 @@ logger = get_logger(__name__) +P = ParamSpec("P") +R = TypeVar("R") + def maybe_init_laminar(): """Initialize Laminar if the environment variables are set. @@ -54,7 +59,7 @@ def maybe_init_laminar(): ) -def observe[**P, R]( +def observe( *, name: str | None = None, session_id: str | None = None, diff --git a/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/tool/tool.py b/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/tool/tool.py index 0079efb..30877ca 100644 --- a/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/tool/tool.py +++ b/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/tool/tool.py @@ -5,6 +5,7 @@ TYPE_CHECKING, Any, ClassVar, + Generic, Protocol, Self, TypeVar, @@ -93,7 +94,7 @@ class ToolAnnotations(BaseModel): ) -class ToolExecutor[ActionT, ObservationT](ABC): +class ToolExecutor(ABC, Generic[ActionT, ObservationT]): """Executor function type for a Tool.""" @abstractmethod @@ -145,7 +146,7 @@ def __call__( ... -class ToolDefinition[ActionT, ObservationT](DiscriminatedUnionMixin, ABC): +class ToolDefinition(DiscriminatedUnionMixin, ABC, Generic[ActionT, ObservationT]): """Base class for all tool implementations. This class serves as a base for the discriminated union of all tool types. diff --git a/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/utils/paging.py b/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/utils/paging.py index bc196e8..96ac494 100644 --- a/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/utils/paging.py +++ b/src/cooperbench/agents/openhands_agent_sdk/openhands-sdk/openhands/sdk/utils/paging.py @@ -1,10 +1,13 @@ """Pagination utilities for iterating over paginated search results.""" from collections.abc import AsyncGenerator, Awaitable, Callable -from typing import Any, Protocol +from typing import Any, Protocol, TypeVar -class PageProtocol[T](Protocol): +T = TypeVar("T") + + +class PageProtocol(Protocol[T]): """Protocol for page objects returned by search functions. All page objects should have: @@ -16,7 +19,7 @@ class PageProtocol[T](Protocol): next_page_id: str | None -async def page_iterator[T]( +async def page_iterator( search_func: Callable[..., Awaitable[PageProtocol[T]]], *args: Any, **kwargs: Any, diff --git a/src/cooperbench/agents/openhands_agent_sdk/openhands-tools/openhands/tools/terminal/terminal/terminal_session.py b/src/cooperbench/agents/openhands_agent_sdk/openhands-tools/openhands/tools/terminal/terminal/terminal_session.py index ffa3470..bfddd9f 100644 --- a/src/cooperbench/agents/openhands_agent_sdk/openhands-tools/openhands/tools/terminal/terminal/terminal_session.py +++ b/src/cooperbench/agents/openhands_agent_sdk/openhands-tools/openhands/tools/terminal/terminal/terminal_session.py @@ -429,11 +429,12 @@ def execute(self, action: TerminalAction) -> TerminalObservation: logger.debug( f"TERMINAL CONTENT GOT after {time.time() - _start_time:.2f} seconds" ) + terminal_lines = cur_terminal_output.splitlines() logger.debug( - f"BEGIN OF TERMINAL CONTENT: {cur_terminal_output.split('\n')[:10]}" + f"BEGIN OF TERMINAL CONTENT: {terminal_lines[:10]}" ) logger.debug( - f"END OF TERMINAL CONTENT: {cur_terminal_output.split('\n')[-10:]}" + f"END OF TERMINAL CONTENT: {terminal_lines[-10:]}" ) ps1_matches = CmdOutputMetadata.matches_ps1_metadata(cur_terminal_output) current_ps1_count = len(ps1_matches) diff --git a/src/cooperbench/eval/backends/modal.py b/src/cooperbench/eval/backends/modal.py index 9e8c676..d770f22 100644 --- a/src/cooperbench/eval/backends/modal.py +++ b/src/cooperbench/eval/backends/modal.py @@ -67,8 +67,13 @@ def create_sandbox( workdir: str = "/workspace", ) -> Sandbox: """Create a Modal sandbox for evaluation.""" + # Eval runs need a stable long-running process for exec calls. + # Task images may have entrypoints that exit immediately, so clear it + # and run `sleep infinity` explicitly (mirrors Docker backend behavior). modal_image = modal.Image.from_registry(image).entrypoint([]) sb = modal.Sandbox.create( + "sleep", + "infinity", image=modal_image, timeout=timeout, workdir=workdir,