.github/workflows/ci.yml (9 additions, 0 deletions)
@@ -82,6 +82,12 @@ jobs:
       - name: Install tau2 for testing
         run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main
 
+      - name: Install OpenEnv for integration tests
+        run: |
+          # Install OpenEnv core and echo environment
+          uv pip install "openenv-core"
+          uv pip install "openenv @ git+https://github.com/meta-pytorch/OpenEnv.git"
+
Contributor:
Hmm, I would prefer not to install more dependencies in ci.yml like this, as the tests are already getting quite slow.

Contributor:
Can we create a separate job that runs in parallel, specifically for the tests you want to add?
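
A minimal sketch of such a job, reusing the uv-based install this PR adds; the job name, the setup-uv action version, and the step layout are illustrative, not part of this PR. Jobs without a needs: dependency run in parallel with the existing test job by default, so the OpenEnv setup cost stays off the critical path of the core tests:

  openenv-integration:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # Assumed setup step; version is illustrative
      - name: Install uv
        uses: astral-sh/setup-uv@v5
      - name: Install OpenEnv for integration tests
        run: |
          uv pip install "openenv-core"
          uv pip install "openenv @ git+https://github.com/meta-pytorch/OpenEnv.git"
      - name: Run OpenEnv integration tests only
        run: |
          pytest tests/pytest/test_openenv_browsergym_basic.py \
            tests/pytest/test_openenv_browsergym_eval.py \
            tests/pytest/test_openenv_textarena_docker.py -v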

       - name: Run Core Tests with pytest-xdist
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -109,6 +115,9 @@ jobs:
             --ignore=tests/remote_server/test_remote_fireworks.py \
             --ignore=tests/remote_server/test_remote_fireworks_propagate_status.py \
             --ignore=tests/logging/test_elasticsearch_direct_http_handler.py \
+            --ignore=tests/pytest/test_openenv_browsergym_basic.py \
+            --ignore=tests/pytest/test_openenv_browsergym_eval.py \
+            --ignore=tests/pytest/test_openenv_textarena_docker.py \
             --ignore=eval_protocol/benchmarks/ \
             --ignore=eval_protocol/quickstart/ \
             --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10
tests/pytest/test_openenv_browsergym_basic.py (6 additions, 2 deletions)
@@ -12,6 +12,12 @@
 # Skip these integration-heavy tests on CI runners by default
 pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip OpenEnv integration tests on CI")
 
+# Skip if OpenEnv not installed
+try:
+    from envs.browsergym_env import BrowserGymEnv, BrowserGymAction  # type: ignore
+except ImportError:
+    pytest.skip("OpenEnv browsergym_env not installed", allow_module_level=True)
+
 
 @pytest.mark.integration
 def test_openenv_browsergym_basic():
@@ -43,8 +49,6 @@ def test_openenv_browsergym_basic():

     # Construct the processor with a trivial action_parser; the model output will still be generated
     # but we parse to a safe noop action to minimize flakiness for the environment step.
-    from envs.browsergym_env import BrowserGymAction, BrowserGymEnv  # type: ignore
-
     processor = OpenEnvRolloutProcessor(
         env_factory=None,
         prompt_builder=lambda obs, step, history: "Do nothing",
tests/pytest/test_openenv_browsergym_eval.py (6 additions, 0 deletions)
@@ -10,6 +10,12 @@
 # Skip these integration-heavy tests on CI runners by default
 pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip OpenEnv integration tests on CI")
 
+# Skip if OpenEnv not installed
+try:
+    from envs.browsergym_env import BrowserGymEnv, BrowserGymAction  # type: ignore
+except ImportError:
+    pytest.skip("OpenEnv browsergym_env not installed", allow_module_level=True)
+
 
 def openenv_dataset_to_rows(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
     """
tests/pytest/test_openenv_echo_hub.py (12 additions, 32 deletions)
@@ -8,13 +8,8 @@
 from eval_protocol.pytest.openenv_rollout_processor import OpenEnvRolloutProcessor
 import pytest
 
-
-# Preferred import when using the monolithic `openenv` package
-from envs.echo_env import EchoEnv  # type: ignore
-
-
-# Skip these integration-heavy tests on CI runners by default
-pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip OpenEnv integration tests on CI")
+# Import OpenEnv Echo environment
+from envs.echo_env import EchoEnv, EchoAction  # type: ignore
 
 
 def echo_dataset_to_rows(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
@@ -39,23 +34,10 @@ def action_parser(response_text: str):
"""
Convert raw model response to EchoAction.
"""
try:
from envs.echo_env import EchoAction # type: ignore
except Exception:
pytest.skip("OpenEnv (openenv.envs.echo_env) is not installed; skipping Echo hub test.")
raise
text = response_text.strip() if isinstance(response_text, str) else ""
return EchoAction(message=text or "hello")


# try:
# from envs.echo_env import EchoEnv # type: ignore

# _HAS_ECHO = True
# except Exception:
# _HAS_ECHO = False


# Inline test data
ECHO_INLINE_DATA: List[Dict[str, Any]] = [
{"id": "echo-1", "prompt": "hello"},
@@ -76,18 +58,16 @@ def action_parser(response_text: str):
     num_runs=1,
     max_concurrent_rollouts=2,
     mode="pointwise",
-    rollout_processor=(
-        OpenEnvRolloutProcessor(
-            # Use HF Hub to launch the environment container automatically
-            env_client_cls=EchoEnv,  # type: ignore
-            hub_repo_id=os.getenv("OPENENV_ECHO_REPO", "openenv/echo-env"),
-            # Simple prompt+parser above
-            prompt_builder=prompt_builder,
-            action_parser=action_parser,
-            # Keep defaults for timeouts/viewport/etc. (not relevant for echo)
-            timeout_ms=5000,
-            num_generations=1,
-        )
+    rollout_processor=OpenEnvRolloutProcessor(
+        # Use HF Hub to launch the environment container automatically
+        env_client_cls=EchoEnv,
+        hub_repo_id=os.getenv("OPENENV_ECHO_REPO", "openenv/echo-env"),
+        # Simple prompt+parser above
+        prompt_builder=prompt_builder,
+        action_parser=action_parser,
+        # Keep defaults for timeouts/viewport/etc. (not relevant for echo)
+        timeout_ms=5000,
+        num_generations=1,
     ),
 )
 def test_openenv_echo_hub(row: EvaluationRow) -> EvaluationRow: