Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 6 additions & 58 deletions src/cooperbench/agents/openhands_agent_sdk/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,8 +666,9 @@ def _collect_credentials(self) -> dict[str, str]:
def __enter__(self) -> str:
"""Start sandbox, run agent-server, and return the tunnel URL."""

# Build image and clear entrypoint (Modal will add its own default command)
image = modal.Image.from_registry(self.image_name).entrypoint([])
# Preserve image ENTRYPOINT.
# The `-oh` images set ENTRYPOINT to launch `openhands.agent_server`.
image = modal.Image.from_registry(self.image_name)

# Get or create app
app = modal.App.lookup("cooperbench", create_if_missing=True)
Expand All @@ -682,66 +683,13 @@ def __enter__(self) -> str:
timeout=self.timeout,
app=app,
secrets=secrets,
# Start outside /workspace/repo to avoid import shadowing
# (e.g., openai_tiktoken_task shadows litellm's `tiktoken` import).
workdir="/",
# Expose port 8000 for the agent-server
encrypted_ports=[8000],
)


# IMPORTANT: We use a Python wrapper to ensure collaboration tools are
# imported in the SAME process as the agent-server. This is needed because
# Modal may cache Docker images and the __init__.py auto-import might not
# be in the cached image.
#
# We write the wrapper to a file first, then execute it (heredocs don't
# work well with sandbox.exec's bash -c).

wrapper_script = '''
import sys
import os

sys.argv = ['agent_server', '--host', '0.0.0.0', '--port', '8000']

# Force import collaboration tools to register them BEFORE server starts
try:
from openhands.tools.collaboration import SendMessageTool, ReceiveMessageTool
print('[STARTUP] Collaboration tools registered:', SendMessageTool.name, ReceiveMessageTool.name, flush=True)
except Exception as e:
print('[STARTUP] WARNING: Failed to import collaboration tools:', e, flush=True)

# Now run the agent server (tools are registered in this process)
from openhands.agent_server.__main__ import main
main()
'''

# Bash script to set up credentials and run the Python wrapper
startup_script = """
#!/bin/bash
set -e

# Write Google Cloud credentials if provided
if [ -n "$GOOGLE_APPLICATION_CREDENTIALS_JSON" ]; then
echo "$GOOGLE_APPLICATION_CREDENTIALS_JSON" > /tmp/gcp-credentials.json
export GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcp-credentials.json
fi

# Run the Python wrapper
exec /opt/agent-server-venv/bin/python /tmp/agent_wrapper.py
"""

# Write the Python wrapper script to the sandbox using base64 (safe encoding)
import base64
wrapper_b64 = base64.b64encode(wrapper_script.encode()).decode()
write_wrapper = self._sandbox.exec("bash", "-c", f"echo '{wrapper_b64}' | base64 -d > /tmp/agent_wrapper.py")
write_wrapper.wait()

# Start the agent-server manually (since we cleared the entrypoint)
self._server_proc = self._sandbox.exec(
"bash", "-c", startup_script,
)

# Give the server a moment to start and capture initial output
time.sleep(3)

# Get tunnel URL
tunnel_info = self._sandbox.tunnels()[8000]
tunnel_url = tunnel_info.url
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,11 +172,11 @@ def __len__(self) -> int:
return self._length

def _path(self, idx: int, *, event_id: EventID | None = None) -> str:
return f"{self._dir}/{
EVENT_FILE_PATTERN.format(
idx=idx, event_id=event_id or self._idx_to_id[idx]
)
}"
filename = EVENT_FILE_PATTERN.format(
idx=idx,
event_id=event_id or self._idx_to_id[idx],
)
return f"{self._dir}/{filename}"

def _scan_and_build_index(self) -> int:
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from typing import (
Any,
Literal,
ParamSpec,
TypeVar,
)

import litellm
Expand All @@ -19,6 +21,9 @@

logger = get_logger(__name__)

P = ParamSpec("P")
R = TypeVar("R")


def maybe_init_laminar():
"""Initialize Laminar if the environment variables are set.
Expand Down Expand Up @@ -54,7 +59,7 @@ def maybe_init_laminar():
)


def observe[**P, R](
def observe(
*,
name: str | None = None,
session_id: str | None = None,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
TYPE_CHECKING,
Any,
ClassVar,
Generic,
Protocol,
Self,
TypeVar,
Expand Down Expand Up @@ -93,7 +94,7 @@ class ToolAnnotations(BaseModel):
)


class ToolExecutor[ActionT, ObservationT](ABC):
class ToolExecutor(ABC, Generic[ActionT, ObservationT]):
"""Executor function type for a Tool."""

@abstractmethod
Expand Down Expand Up @@ -145,7 +146,7 @@ def __call__(
...


class ToolDefinition[ActionT, ObservationT](DiscriminatedUnionMixin, ABC):
class ToolDefinition(DiscriminatedUnionMixin, ABC, Generic[ActionT, ObservationT]):
"""Base class for all tool implementations.

This class serves as a base for the discriminated union of all tool types.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
"""Pagination utilities for iterating over paginated search results."""

from collections.abc import AsyncGenerator, Awaitable, Callable
from typing import Any, Protocol
from typing import Any, Protocol, TypeVar


class PageProtocol[T](Protocol):
T = TypeVar("T")


class PageProtocol(Protocol[T]):
"""Protocol for page objects returned by search functions.

All page objects should have:
Expand All @@ -16,7 +19,7 @@ class PageProtocol[T](Protocol):
next_page_id: str | None


async def page_iterator[T](
async def page_iterator(
search_func: Callable[..., Awaitable[PageProtocol[T]]],
*args: Any,
**kwargs: Any,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -429,11 +429,12 @@ def execute(self, action: TerminalAction) -> TerminalObservation:
logger.debug(
f"TERMINAL CONTENT GOT after {time.time() - _start_time:.2f} seconds"
)
terminal_lines = cur_terminal_output.splitlines()
logger.debug(
f"BEGIN OF TERMINAL CONTENT: {cur_terminal_output.split('\n')[:10]}"
f"BEGIN OF TERMINAL CONTENT: {terminal_lines[:10]}"
)
logger.debug(
f"END OF TERMINAL CONTENT: {cur_terminal_output.split('\n')[-10:]}"
f"END OF TERMINAL CONTENT: {terminal_lines[-10:]}"
)
ps1_matches = CmdOutputMetadata.matches_ps1_metadata(cur_terminal_output)
current_ps1_count = len(ps1_matches)
Expand Down
5 changes: 5 additions & 0 deletions src/cooperbench/eval/backends/modal.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,13 @@ def create_sandbox(
workdir: str = "/workspace",
) -> Sandbox:
"""Create a Modal sandbox for evaluation."""
# Eval runs need a stable long-running process for exec calls.
# Task images may have entrypoints that exit immediately, so clear the
# entrypoint and run `sleep infinity` explicitly (mirrors Docker backend behavior).
modal_image = modal.Image.from_registry(image).entrypoint([])
sb = modal.Sandbox.create(
"sleep",
"infinity",
image=modal_image,
timeout=timeout,
workdir=workdir,
Expand Down