diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d29a04a3..5e2db39d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -82,6 +82,12 @@ jobs: - name: Install tau2 for testing run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main + - name: Install OpenEnv for integration tests + run: | + # Install OpenEnv core and echo environment + uv pip install "openenv-core" + uv pip install "openenv @ git+https://github.com/meta-pytorch/OpenEnv.git" + - name: Run Core Tests with pytest-xdist env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} @@ -109,6 +115,9 @@ jobs: --ignore=tests/remote_server/test_remote_fireworks.py \ --ignore=tests/remote_server/test_remote_fireworks_propagate_status.py \ --ignore=tests/logging/test_elasticsearch_direct_http_handler.py \ + --ignore=tests/pytest/test_openenv_browsergym_basic.py \ + --ignore=tests/pytest/test_openenv_browsergym_eval.py \ + --ignore=tests/pytest/test_openenv_textarena_docker.py \ --ignore=eval_protocol/benchmarks/ \ --ignore=eval_protocol/quickstart/ \ --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10 diff --git a/tests/pytest/test_openenv_browsergym_basic.py b/tests/pytest/test_openenv_browsergym_basic.py index f87a663b..26dbeb34 100644 --- a/tests/pytest/test_openenv_browsergym_basic.py +++ b/tests/pytest/test_openenv_browsergym_basic.py @@ -12,6 +12,12 @@ # Skip these integration-heavy tests on CI runners by default pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip OpenEnv integration tests on CI") +# Skip if OpenEnv not installed +try: + from envs.browsergym_env import BrowserGymEnv, BrowserGymAction # type: ignore +except ImportError: + pytest.skip("OpenEnv browsergym_env not installed", allow_module_level=True) + @pytest.mark.integration def test_openenv_browsergym_basic(): @@ -43,8 +49,6 @@ def test_openenv_browsergym_basic(): # Construct the processor with a trivial action_parser; the model output will still be generated # but we parse to a safe noop action to minimize flakiness for the environment step. - from envs.browsergym_env import BrowserGymAction, BrowserGymEnv # type: ignore - processor = OpenEnvRolloutProcessor( env_factory=None, prompt_builder=lambda obs, step, history: "Do nothing", diff --git a/tests/pytest/test_openenv_browsergym_eval.py b/tests/pytest/test_openenv_browsergym_eval.py index 505336ae..dabad0fb 100644 --- a/tests/pytest/test_openenv_browsergym_eval.py +++ b/tests/pytest/test_openenv_browsergym_eval.py @@ -10,6 +10,12 @@ # Skip these integration-heavy tests on CI runners by default pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip OpenEnv integration tests on CI") +# Skip if OpenEnv not installed +try: + from envs.browsergym_env import BrowserGymEnv, BrowserGymAction # type: ignore +except ImportError: + pytest.skip("OpenEnv browsergym_env not installed", allow_module_level=True) + def openenv_dataset_to_rows(data: List[Dict[str, Any]]) -> List[EvaluationRow]: """ diff --git a/tests/pytest/test_openenv_echo_hub.py b/tests/pytest/test_openenv_echo_hub.py index ae9c2cdc..b3aa8004 100644 --- a/tests/pytest/test_openenv_echo_hub.py +++ b/tests/pytest/test_openenv_echo_hub.py @@ -8,13 +8,8 @@ from eval_protocol.pytest.openenv_rollout_processor import OpenEnvRolloutProcessor import pytest - -# Preferred import when using the monolithic `openenv` package -from envs.echo_env import EchoEnv # type: ignore - - -# Skip these integration-heavy tests on CI runners by default -pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip OpenEnv integration tests on CI") +# Import OpenEnv Echo environment +from envs.echo_env import EchoEnv, EchoAction # type: ignore def echo_dataset_to_rows(data: List[Dict[str, Any]]) -> List[EvaluationRow]: @@ -39,23 +34,10 @@ def action_parser(response_text: str): """ Convert raw model response to EchoAction. """ - try: - from envs.echo_env import EchoAction # type: ignore - except Exception: - pytest.skip("OpenEnv (openenv.envs.echo_env) is not installed; skipping Echo hub test.") - raise text = response_text.strip() if isinstance(response_text, str) else "" return EchoAction(message=text or "hello") -# try: -# from envs.echo_env import EchoEnv # type: ignore - -# _HAS_ECHO = True -# except Exception: -# _HAS_ECHO = False - - # Inline test data ECHO_INLINE_DATA: List[Dict[str, Any]] = [ {"id": "echo-1", "prompt": "hello"}, @@ -76,18 +58,16 @@ def action_parser(response_text: str): num_runs=1, max_concurrent_rollouts=2, mode="pointwise", - rollout_processor=( - OpenEnvRolloutProcessor( - # Use HF Hub to launch the environment container automatically - env_client_cls=EchoEnv, # type: ignore - hub_repo_id=os.getenv("OPENENV_ECHO_REPO", "openenv/echo-env"), - # Simple prompt+parser above - prompt_builder=prompt_builder, - action_parser=action_parser, - # Keep defaults for timeouts/viewport/etc. (not relevant for echo) - timeout_ms=5000, - num_generations=1, - ) + rollout_processor=OpenEnvRolloutProcessor( + # Use HF Hub to launch the environment container automatically + env_client_cls=EchoEnv, + hub_repo_id=os.getenv("OPENENV_ECHO_REPO", "openenv/echo-env"), + # Simple prompt+parser above + prompt_builder=prompt_builder, + action_parser=action_parser, + # Keep defaults for timeouts/viewport/etc. (not relevant for echo) + timeout_ms=5000, + num_generations=1, ), ) def test_openenv_echo_hub(row: EvaluationRow) -> EvaluationRow: