.github/workflows/ci.yml (9 additions, 0 deletions)
@@ -82,6 +82,12 @@ jobs:
       - name: Install tau2 for testing
         run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main
 
+      - name: Install OpenEnv for integration tests
+        run: |
+          # Install OpenEnv core and echo environment
+          uv pip install "openenv-core"
+          uv pip install "openenv @ git+https://github.com/meta-pytorch/OpenEnv.git"
+
Contributor:
Hmm, I would prefer not to install more dependencies in ci.yml like this, as the tests are already getting quite slow.

Contributor:
Can we create a separate job that runs in parallel, specifically for the tests you want to add?
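
A minimal sketch of such a job, reusing the uv-based install this PR adds; the job name, the setup-uv action version, and the step layout are illustrative, not part of this PR. Jobs without a needs: dependency run in parallel with the existing test job by default, so the OpenEnv setup cost stays off the critical path of the core tests:

  openenv-integration:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # Assumed setup step; version is illustrative
      - name: Install uv
        uses: astral-sh/setup-uv@v5
      - name: Install OpenEnv for integration tests
        run: |
          uv pip install "openenv-core"
          uv pip install "openenv @ git+https://github.com/meta-pytorch/OpenEnv.git"
      - name: Run OpenEnv integration tests only
        run: |
          pytest tests/pytest/test_openenv_browsergym_basic.py \
            tests/pytest/test_openenv_browsergym_eval.py \
            tests/pytest/test_openenv_textarena_docker.py -v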

       - name: Run Core Tests with pytest-xdist
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -109,6 +115,9 @@ jobs:
             --ignore=tests/remote_server/test_remote_fireworks.py \
             --ignore=tests/remote_server/test_remote_fireworks_propagate_status.py \
             --ignore=tests/logging/test_elasticsearch_direct_http_handler.py \
+            --ignore=tests/pytest/test_openenv_browsergym_basic.py \
+            --ignore=tests/pytest/test_openenv_browsergym_eval.py \
+            --ignore=tests/pytest/test_openenv_textarena_docker.py \
             --ignore=eval_protocol/benchmarks/ \
             --ignore=eval_protocol/quickstart/ \
             --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10
tests/pytest/test_openenv_browsergym_basic.py (6 additions, 2 deletions)
@@ -12,6 +12,12 @@
 # Skip these integration-heavy tests on CI runners by default
 pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip OpenEnv integration tests on CI")
 
+# Skip if OpenEnv not installed
+try:
+    from envs.browsergym_env import BrowserGymEnv, BrowserGymAction  # type: ignore
+except ImportError:
+    pytest.skip("OpenEnv browsergym_env not installed", allow_module_level=True)
+
 
 @pytest.mark.integration
 def test_openenv_browsergym_basic():
@@ -43,8 +49,6 @@ def test_openenv_browsergym_basic():

     # Construct the processor with a trivial action_parser; the model output will still be generated
     # but we parse to a safe noop action to minimize flakiness for the environment step.
-    from envs.browsergym_env import BrowserGymAction, BrowserGymEnv  # type: ignore
-
     processor = OpenEnvRolloutProcessor(
         env_factory=None,
         prompt_builder=lambda obs, step, history: "Do nothing",
tests/pytest/test_openenv_browsergym_eval.py (6 additions, 0 deletions)
@@ -10,6 +10,12 @@
 # Skip these integration-heavy tests on CI runners by default
 pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip OpenEnv integration tests on CI")
 
+# Skip if OpenEnv not installed
+try:
+    from envs.browsergym_env import BrowserGymEnv, BrowserGymAction  # type: ignore
+except ImportError:
+    pytest.skip("OpenEnv browsergym_env not installed", allow_module_level=True)
+
 
 def openenv_dataset_to_rows(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
     """
tests/pytest/test_openenv_echo_hub.py (12 additions, 32 deletions)
@@ -8,13 +8,8 @@
 from eval_protocol.pytest.openenv_rollout_processor import OpenEnvRolloutProcessor
 import pytest
 
-
-# Preferred import when using the monolithic `openenv` package
-from envs.echo_env import EchoEnv  # type: ignore
-
-
-# Skip these integration-heavy tests on CI runners by default
-pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip OpenEnv integration tests on CI")
+# Import OpenEnv Echo environment
+from envs.echo_env import EchoEnv, EchoAction  # type: ignore
 
 
 def echo_dataset_to_rows(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
@@ -39,23 +34,10 @@ def action_parser(response_text: str):
"""
Convert raw model response to EchoAction.
"""
try:
from envs.echo_env import EchoAction # type: ignore
except Exception:
pytest.skip("OpenEnv (openenv.envs.echo_env) is not installed; skipping Echo hub test.")
raise
text = response_text.strip() if isinstance(response_text, str) else ""
return EchoAction(message=text or "hello")


# try:
# from envs.echo_env import EchoEnv # type: ignore

# _HAS_ECHO = True
# except Exception:
# _HAS_ECHO = False


# Inline test data
ECHO_INLINE_DATA: List[Dict[str, Any]] = [
{"id": "echo-1", "prompt": "hello"},
@@ -76,18 +58,16 @@ def action_parser(response_text: str):
     num_runs=1,
     max_concurrent_rollouts=2,
     mode="pointwise",
-    rollout_processor=(
-        OpenEnvRolloutProcessor(
-            # Use HF Hub to launch the environment container automatically
-            env_client_cls=EchoEnv,  # type: ignore
-            hub_repo_id=os.getenv("OPENENV_ECHO_REPO", "openenv/echo-env"),
-            # Simple prompt+parser above
-            prompt_builder=prompt_builder,
-            action_parser=action_parser,
-            # Keep defaults for timeouts/viewport/etc. (not relevant for echo)
-            timeout_ms=5000,
-            num_generations=1,
-        )
+    rollout_processor=OpenEnvRolloutProcessor(
+        # Use HF Hub to launch the environment container automatically
+        env_client_cls=EchoEnv,
+        hub_repo_id=os.getenv("OPENENV_ECHO_REPO", "openenv/echo-env"),
+        # Simple prompt+parser above
+        prompt_builder=prompt_builder,
+        action_parser=action_parser,
+        # Keep defaults for timeouts/viewport/etc. (not relevant for echo)
+        timeout_ms=5000,
+        num_generations=1,
     ),
 )
 def test_openenv_echo_hub(row: EvaluationRow) -> EvaluationRow: