From 710e17f9746b1433bb60eb82e86a77056844b625 Mon Sep 17 00:00:00 2001 From: edgarpavlovsky Date: Fri, 7 Nov 2025 19:12:02 -0700 Subject: [PATCH 1/3] Fix terminal-bench integration test in CI - Fix pyproject.toml: Add [tool.hatch.build.targets.wheel] packages specification - Enable terminal-bench test in CI workflow with proper conditions - Add PATH fixes for uv and terminal-bench binaries - Add timeouts to prevent hanging (20min job, 15min step) - Add .actrc for local GitHub Actions testing with act - Add .secrets to .gitignore The terminal-bench adapter package was failing to build because hatchling didn't know which files to include. Now it correctly includes the adapters/ directory and the test can run in CI. --- .actrc | 12 ++++++++++++ .github/workflows/test.yml | 24 +++++++++++++++++++----- .gitignore | 1 + benchmark/pyproject.toml | 3 +++ 4 files changed, 35 insertions(+), 5 deletions(-) create mode 100644 .actrc diff --git a/.actrc b/.actrc new file mode 100644 index 0000000..36fedd5 --- /dev/null +++ b/.actrc @@ -0,0 +1,12 @@ +# Act configuration for running GitHub Actions locally +# Uses larger Docker image to support full Ubuntu functionality + +# Use medium-sized Ubuntu image (ubuntu-latest equivalent) +-P ubuntu-latest=catthehacker/ubuntu:act-latest + +# Enable verbose output for debugging +--verbose + +# Container architecture (automatically detected) +--container-architecture linux/amd64 + diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6d82292..718ea31 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -99,8 +99,12 @@ jobs: integration-tests: name: Terminal-bench Integration runs-on: ubuntu-latest - # Temporarily disabled - needs debugging - if: false + timeout-minutes: 20 # Fail fast if tests hang + # Run on main branch and e/* branches for testing + if: | + github.ref == 'refs/heads/main' || + startsWith(github.ref, 'refs/heads/e/') || + startsWith(github.head_ref, 'e/') steps: - uses: 
actions/checkout@v4 @@ -114,10 +118,15 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Install uv - run: curl -LsSf https://astral.sh/uv/install.sh | sh + run: | + curl -LsSf https://astral.sh/uv/install.sh | sh + echo "$HOME/.local/bin" >> $GITHUB_PATH - name: Install terminal-bench - run: uv tool install terminal-bench + run: | + export PATH="$HOME/.local/bin:$PATH" + uv tool install terminal-bench + echo "$HOME/.local/bin" >> $GITHUB_PATH - name: Create virtual environment run: uv venv @@ -134,9 +143,14 @@ jobs: uv pip install -e . - name: Run terminal-bench integration test + timeout-minutes: 15 # Per-step timeout env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + PYTHONUNBUFFERED: "1" # Force immediate output run: | source .venv/bin/activate - pytest tests/ -m "integration" -v --tb=short + export PATH="$HOME/.local/bin:$PATH" + echo "Starting terminal-bench integration tests at $(date)" + pytest tests/ -m "integration" -v --tb=short -s --log-cli-level=INFO + echo "Terminal-bench tests completed at $(date)" diff --git a/.gitignore b/.gitignore index 0d1da53..7ccd72a 100644 --- a/.gitignore +++ b/.gitignore @@ -37,3 +37,4 @@ logs/ # Benchmark runs runs/ +.secrets diff --git a/benchmark/pyproject.toml b/benchmark/pyproject.toml index 2c995ac..0675483 100644 --- a/benchmark/pyproject.toml +++ b/benchmark/pyproject.toml @@ -12,6 +12,9 @@ dependencies = [ requires = ["hatchling"] build-backend = "hatchling.build" +[tool.hatch.build.targets.wheel] +packages = ["adapters"] + [dependency-groups] dev = [] From ba2dfc510f0cccd670a1fc1ff156827599957f4f Mon Sep 17 00:00:00 2001 From: edgarpavlovsky Date: Fri, 7 Nov 2025 19:21:26 -0700 Subject: [PATCH 2/3] Fix: Copy memory module to terminal-bench container The adapter was missing the memory module when copying Fireteam code into the terminal-bench container, causing ModuleNotFoundError when orchestrator.py tried to import memory.manager. 
This wasn't caught in local act testing because we interrupted the test before it reached actual execution inside the container. --- benchmark/adapters/fireteam_adapter.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/benchmark/adapters/fireteam_adapter.py b/benchmark/adapters/fireteam_adapter.py index f8252af..ddf16e1 100644 --- a/benchmark/adapters/fireteam_adapter.py +++ b/benchmark/adapters/fireteam_adapter.py @@ -141,7 +141,7 @@ def perform_task(self, instruction, session, logging_dir): fireteam_root = Path(__file__).parent.parent.parent # Create directory structure in container first - session.container.exec_run(["mkdir", "-p", "/fireteam/src/agents", "/fireteam/src/state"]) + session.container.exec_run(["mkdir", "-p", "/fireteam/src/agents", "/fireteam/src/state", "/fireteam/src/memory"]) # Copy main files session.copy_to_container( @@ -176,6 +176,14 @@ def perform_task(self, instruction, session, logging_dir): container_filename=state_file.name ) + # Copy memory module files + for memory_file in (fireteam_root / "src" / "memory").glob("*.py"): + session.copy_to_container( + paths=[memory_file], + container_dir="/fireteam/src/memory", + container_filename=memory_file.name + ) + # Run parent's setup and execution return super().perform_task(instruction, session, logging_dir) From 856770c4b9a089ea8ae01a5935a4f58ce509f8bb Mon Sep 17 00:00:00 2001 From: edgarpavlovsky Date: Fri, 7 Nov 2025 19:36:33 -0700 Subject: [PATCH 3/3] Remove --livestream flag to show terminal-bench output in logs The --livestream flag was causing terminal-bench to hide console output and stream to tmux instead, making it impossible to see progress or debug issues in CI/act logs. Removing it allows output to appear normally in pytest/CI logs. 
--- tests/test_terminal_bench_integration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_terminal_bench_integration.py b/tests/test_terminal_bench_integration.py index afe858c..b6593a8 100644 --- a/tests/test_terminal_bench_integration.py +++ b/tests/test_terminal_bench_integration.py @@ -33,8 +33,8 @@ def test_hello_world_task(self): '--dataset', 'terminal-bench-core==0.1.1', '--task-id', 'hello-world', '--global-agent-timeout-sec', '600', - '--log-level', 'debug', - '--livestream' # Enable real-time output + '--log-level', 'debug' + # Note: --livestream removed to show output in CI/act logs ] print("\n🚀 Running terminal-bench hello-world task...")