local test command

xzrderek · xzrderek · commit cd9cc91c34f9 · 2025-11-10T17:41:27.000-08:00
diff --git a/eval_protocol/cli.py b/eval_protocol/cli.py
@@ -427,6 +427,27 @@ def parse_args(args=None):
     rft_parser.add_argument("--dry-run", action="store_true", help="Print planned REST calls without sending")
     rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
 
+    # Local test command
+    local_test_parser = subparsers.add_parser(
+        "local-test",
+        help="Select an evaluation test and run it locally. If a Dockerfile exists, build and run via Docker; otherwise run on host.",
+    )
+    local_test_parser.add_argument(
+        "--entry",
+        help="Entrypoint to run (path::function or path). If not provided, a selector will be shown (unless --yes).",
+    )
+    local_test_parser.add_argument(
+        "--ignore-docker",
+        action="store_true",
+        help="Ignore Dockerfile even if present; run pytest on host",
+    )
+    local_test_parser.add_argument(
+        "--yes",
+        "-y",
+        action="store_true",
+        help="Non-interactive: if multiple tests exist and no --entry, fails with guidance",
+    )
+
     # Run command (for Hydra-based evaluations)
     # This subparser intentionally defines no arguments itself.
     # All arguments after 'run' will be passed to Hydra by parse_known_args.
@@ -559,6 +580,10 @@ def _extract_flag_value(argv_list, flag_name):
             return create_rft_command(args)
         print("Error: missing subcommand for 'create'. Try: eval-protocol create rft")
         return 1
+    elif args.command == "local-test":
+        from .cli_commands.local_test import local_test_command
+
+        return local_test_command(args)
     elif args.command == "run":
         # For the 'run' command, Hydra takes over argument parsing.
 
diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py
@@ -0,0 +1,140 @@
+import argparse
+import os
+import subprocess
+import sys
+from typing import List
+
+from .upload import _discover_tests, _prompt_select
+
+
+def _find_dockerfiles(root: str) -> List[str]:
+    """Return the path of every file named exactly ``Dockerfile`` under *root*.
+
+    Hidden directories (names starting with ".") and common dependency/build
+    directories are pruned from the walk, so Dockerfiles inside them are not
+    reported. Each result is the walked directory joined with the file name.
+    """
+    ignored = {".venv", "venv", "node_modules", "dist", "build", "__pycache__", ".git", "vendor"}
+    found: List[str] = []
+    for current_dir, subdirs, files in os.walk(root):
+        # Prune in place so os.walk does not descend into skipped directories.
+        subdirs[:] = [name for name in subdirs if not (name in ignored or name.startswith("."))]
+        if "Dockerfile" in files:
+            found.append(os.path.join(current_dir, "Dockerfile"))
+    return found
+
+
+def _run_pytest_host(pytest_target: str) -> int:
+    """Run pytest on the host for *pytest_target* and return its exit code.
+
+    pytest is launched as ``python -m pytest`` with the current interpreter
+    (``sys.executable``) and the ``-vs`` flags.
+    """
+    print(f"Running locally: pytest {pytest_target} -vs")
+    command = [sys.executable, "-m", "pytest", pytest_target, "-vs"]
+    return subprocess.run(command).returncode
+
+
+def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
+    """Build *dockerfile_path* into a Docker image tagged *image_tag*.
+
+    The directory containing the Dockerfile is used as the build context. The
+    combined stdout/stderr of ``docker build`` is captured and printed once the
+    build has finished.
+
+    Returns:
+        True when ``docker build`` exits with status 0; False when it exits
+        non-zero or the ``docker`` executable cannot be found.
+    """
+    # A bare "Dockerfile" has an empty dirname; fall back to the current
+    # directory so docker still receives a valid build context.
+    context_dir = os.path.dirname(dockerfile_path) or "."
+    print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...")
+    try:
+        proc = subprocess.run(
+            ["docker", "build", "-t", image_tag, "-f", dockerfile_path, context_dir],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+        )
+    except FileNotFoundError:
+        print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
+        return False
+    print(proc.stdout)
+    return proc.returncode == 0
+
+
+def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int:
+    """Run ``pytest <pytest_target> -vs`` in a throwaway container of *image_tag*.
+
+    *project_root* is bind-mounted at ``/workspace``, which is also set as the
+    container's working directory. Returns the ``docker run`` exit code, or 1
+    when the ``docker`` executable cannot be found.
+    """
+    container_dir = "/workspace"
+    # The mount is read-write on purpose: read-only would be safer, but tests may write artifacts.
+    docker_prefix = ["docker", "run", "--rm", "-v", f"{project_root}:{container_dir}", "-w", container_dir]
+    command = docker_prefix + [image_tag, "pytest", pytest_target, "-vs"]
+    print("Running in Docker:", " ".join(command))
+    try:
+        return subprocess.run(command).returncode
+    except FileNotFoundError:
+        print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
+        return 1
+
+
+def _to_project_relative(path: str, project_root: str) -> str:
+    """Return *path* relative to *project_root*, or its absolute form if that is impossible."""
+    abs_path = path if os.path.isabs(path) else os.path.abspath(os.path.join(project_root, path))
+    try:
+        return os.path.relpath(abs_path, project_root)
+    except ValueError:
+        # os.path.relpath raises ValueError on Windows when the paths are on different drives.
+        return abs_path
+
+
+def local_test_command(args: argparse.Namespace) -> int:
+    """Resolve one pytest target and run it in Docker or on the host.
+
+    The target comes from ``args.entry`` (``path`` or ``path::function``) or,
+    when no entry is given, from the discovered evaluation tests via the
+    selector. The file part of the target is made relative to the current
+    working directory (the project root) so the same target is valid on the
+    host and inside the container, where the project is mounted separately.
+
+    Execution mode:
+      * ``args.ignore_docker`` set, or no Dockerfile found: run pytest on the host.
+      * exactly one Dockerfile found: build it and run pytest in the container.
+      * more than one Dockerfile found: report them and fail.
+
+    Args:
+        args: Parsed CLI namespace; ``entry``, ``yes`` and ``ignore_docker`` are
+            read with ``getattr`` defaults, so missing attributes are tolerated.
+
+    Returns:
+        The pytest / ``docker run`` exit code, or 1 on any resolution, selection
+        or build failure.
+    """
+    project_root = os.getcwd()
+
+    # Selection and pytest target resolution
+    entry = getattr(args, "entry", None)
+    if entry:
+        # Normalize only the file part; any "::function" suffix is kept verbatim.
+        file_part, separator, node_part = entry.partition("::")
+        pytest_target = _to_project_relative(file_part, project_root) + separator + node_part
+    else:
+        tests = _discover_tests(project_root)
+        if not tests:
+            print("No evaluation tests found.\nHint: Ensure @evaluation_test is applied.")
+            return 1
+        non_interactive = bool(getattr(args, "yes", False))
+        selected = _prompt_select(tests, non_interactive=non_interactive)
+        if not selected:
+            print("No tests selected.")
+            return 1
+        if len(selected) != 1:
+            print("Error: Please select exactly one evaluation test for 'local-test'.")
+            return 1
+        # NOTE(review): only the selected test's file is targeted, so every test
+        # in that file runs, not just the selected one - confirm this is intended.
+        pytest_target = _to_project_relative(selected[0].file_path, project_root)
+
+    # Validate once, before any Docker work, instead of after a potentially slow build.
+    if not pytest_target:
+        print("Error: Failed to resolve a pytest target to run.")
+        return 1
+
+    if bool(getattr(args, "ignore_docker", False)):
+        return _run_pytest_host(pytest_target)
+
+    dockerfiles = _find_dockerfiles(project_root)
+    if len(dockerfiles) > 1:
+        print("Error: Multiple Dockerfiles found. Only one Dockerfile is allowed for local-test.")
+        for df in dockerfiles:
+            print(f" - {df}")
+        print("Hint: use --ignore-docker to bypass Docker.")
+        return 1
+    if not dockerfiles:
+        # No Dockerfile: run on host
+        return _run_pytest_host(pytest_target)
+
+    image_tag = "ep-evaluator:local"
+    if not _build_docker_image(dockerfiles[0], image_tag):
+        print("Docker build failed. See logs above.")
+        return 1
+    return _run_pytest_in_docker(project_root, image_tag, pytest_target)
diff --git a/tests/test_cli_local_test.py b/tests/test_cli_local_test.py
@@ -0,0 +1,145 @@
+import os
+from types import SimpleNamespace
+
+import pytest
+
+
+def test_local_test_runs_host_pytest_with_entry(tmp_path, monkeypatch):
+    """A plain-path entry with no Dockerfile runs on the host with a project-relative target."""
+    from eval_protocol.cli_commands import local_test as lt
+
+    project = tmp_path / "proj"
+    project.mkdir()
+    monkeypatch.chdir(project)
+
+    # Dummy test file that the entry points at
+    entry_file = project / "metric" / "test_one.py"
+    entry_file.parent.mkdir(parents=True, exist_ok=True)
+    entry_file.write_text("def test_dummy():\n    assert True\n", encoding="utf-8")
+
+    # Report no Dockerfile so the host path is taken
+    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])
+
+    seen_targets = []
+
+    def _record_host_run(target: str) -> int:
+        seen_targets.append(target)
+        return 0
+
+    monkeypatch.setattr(lt, "_run_pytest_host", _record_host_run)
+
+    namespace = SimpleNamespace(entry=str(entry_file), ignore_docker=False, yes=True)
+    exit_code = lt.local_test_command(namespace)  # pyright: ignore[reportArgumentType]
+    assert exit_code == 0
+    # The last target handed to the host runner must be the relative path
+    assert seen_targets[-1:] == [os.path.relpath(str(entry_file), str(project))]
+
+
+def test_local_test_ignores_docker_when_flag_set(tmp_path, monkeypatch):
+    """With ignore_docker=True the host runner is used even though a Dockerfile is reported."""
+    from eval_protocol.cli_commands import local_test as lt
+
+    project = tmp_path / "proj"
+    project.mkdir()
+    monkeypatch.chdir(project)
+
+    entry_file = project / "metric" / "test_two.py"
+    entry_file.parent.mkdir(parents=True, exist_ok=True)
+    entry_file.write_text("def test_dummy():\n    assert True\n", encoding="utf-8")
+
+    # A Dockerfile is reported, but the flag should bypass the Docker path
+    dockerfile = str(project / "Dockerfile")
+    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [dockerfile])
+
+    host_calls = []
+
+    def _record_host_run(target: str) -> int:
+        host_calls.append(target)
+        return 0
+
+    monkeypatch.setattr(lt, "_run_pytest_host", _record_host_run)
+
+    namespace = SimpleNamespace(entry=str(entry_file), ignore_docker=True, yes=True)
+    exit_code = lt.local_test_command(namespace)  # pyright: ignore[reportArgumentType]
+    assert exit_code == 0
+    assert host_calls
+
+
+def test_local_test_errors_on_multiple_dockerfiles(tmp_path, monkeypatch):
+    """Two reported Dockerfiles (without ignore_docker) make the command return 1."""
+    from eval_protocol.cli_commands import local_test as lt
+
+    project = tmp_path / "proj"
+    project.mkdir()
+    monkeypatch.chdir(project)
+
+    entry_file = project / "metric" / "test_three.py"
+    entry_file.parent.mkdir(parents=True, exist_ok=True)
+    entry_file.write_text("def test_dummy():\n    assert True\n", encoding="utf-8")
+
+    reported = [str(project / "Dockerfile"), str(project / "another" / "Dockerfile")]
+    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: reported)
+
+    namespace = SimpleNamespace(entry=str(entry_file), ignore_docker=False, yes=True)
+    exit_code = lt.local_test_command(namespace)  # pyright: ignore[reportArgumentType]
+    assert exit_code == 1
+
+
+def test_local_test_builds_and_runs_in_docker(tmp_path, monkeypatch):
+    """A single Dockerfile leads to a build followed by a run in the container."""
+    from eval_protocol.cli_commands import local_test as lt
+
+    project = tmp_path / "proj"
+    project.mkdir()
+    monkeypatch.chdir(project)
+
+    entry_file = project / "metric" / "test_four.py"
+    entry_file.parent.mkdir(parents=True, exist_ok=True)
+    entry_file.write_text("def test_dummy():\n    assert True\n", encoding="utf-8")
+
+    # Exactly one Dockerfile, and a build that always succeeds
+    dockerfile = str(project / "Dockerfile")
+    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [dockerfile])
+    monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag: True)
+
+    docker_runs = []
+
+    def _record_docker_run(root: str, image_tag: str, pytest_target: str) -> int:
+        docker_runs.append((image_tag, pytest_target))
+        return 0
+
+    monkeypatch.setattr(lt, "_run_pytest_in_docker", _record_docker_run)
+
+    namespace = SimpleNamespace(entry=str(entry_file), ignore_docker=False, yes=True)
+    exit_code = lt.local_test_command(namespace)  # pyright: ignore[reportArgumentType]
+    assert exit_code == 0
+    # Check the image tag and target of the most recent container run
+    expected_run = ("ep-evaluator:local", os.path.relpath(str(entry_file), str(project)))
+    assert docker_runs[-1:] == [expected_run]
+
+
+def test_local_test_selector_single_test(tmp_path, monkeypatch):
+    """Without an entry, the discovered-and-selected test's file is run on the host."""
+    project = tmp_path / "proj"
+    project.mkdir()
+    monkeypatch.chdir(project)
+
+    test_file = project / "metric" / "test_sel.py"
+    test_file.parent.mkdir(parents=True, exist_ok=True)
+    test_file.write_text("def test_dummy():\n    assert True\n", encoding="utf-8")
+
+    from eval_protocol.cli_commands import local_test as lt
+
+    # No entry; force discover + selector
+    disc = SimpleNamespace(qualname="metric.test_sel", file_path=str(test_file))
+    monkeypatch.setattr(lt, "_discover_tests", lambda root: [disc])
+    # local_test binds _prompt_select by name at import time, so the stub must be
+    # installed on local_test itself; patching the upload module has no effect here.
+    monkeypatch.setattr(lt, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
+    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])
+
+    called = {"host": False, "target": ""}
+
+    def _fake_host(target: str) -> int:
+        called["host"] = True
+        called["target"] = target
+        return 0
+
+    monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)
+
+    args = SimpleNamespace(entry=None, ignore_docker=False, yes=True)
+    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
+    assert rc == 0
+    assert called["host"] is True
+    # The selected test's file is passed as a project-relative target
+    assert called["target"] == os.path.relpath(str(test_file), str(project))