148 changes: 137 additions & 11 deletions benchmarks/swtbench/eval_infer.py
@@ -15,6 +15,7 @@
import shutil
import subprocess
import sys
import time
from pathlib import Path
from time import monotonic

@@ -78,6 +79,64 @@ def patch_swt_bench_for_micromamba(swt_bench_dir: Path) -> None:
)


def _write_profile_sitecustomize(swt_bench_dir: Path, profile_output: Path) -> None:
"""
Drop a sitecustomize.py into the swt-bench checkout to capture internal timings.

This script is picked up automatically by Python when running swt-bench's
src/main.py. It records coarse phases (docker builds, run_instances, per-instance
execution) and writes them to the SWT-Bench profile JSON file named by the
SWTBENCH_PROFILE_JSON environment variable.
"""
site_path = swt_bench_dir / "sitecustomize.py"
template_path = Path(__file__).parent / "swtbench_sitecustomize.py"
site_path.write_text(template_path.read_text())
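
A sitecustomize module is imported automatically by CPython's site machinery at
interpreter startup, and PYTHONPATH entries are already on sys.path by that point,
which is what makes this injection work. A minimal sketch of the mechanism, assuming
a standard CPython build with site processing enabled (paths and messages below are
illustrative only):

    import os
    import subprocess
    import sys
    import tempfile
    from pathlib import Path

    with tempfile.TemporaryDirectory() as hook_dir:
        # Any importable module named sitecustomize runs at interpreter startup.
        Path(hook_dir, "sitecustomize.py").write_text("print('profiling hook active')\n")
        env = {**os.environ, "PYTHONPATH": hook_dir}
        # The child prints 'profiling hook active' before its -c payload executes.
        subprocess.run([sys.executable, "-c", "print('main work')"], env=env, check=True)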


def _patch_swtbench_circular_import(swt_bench_dir: Path) -> None:
"""
Remove the src.main import from swt-bench/src/__init__.py to avoid the
circular import that breaks src/main.py when run as a script.
"""
init_file = swt_bench_dir / "src" / "__init__.py"
if not init_file.exists():
logger.warning("swt-bench src/__init__.py not found; skipping patch")
return

original = init_file.read_text()
lines = original.splitlines()

patched: list[str] = []
skipping_block = False
paren_balance = 0
removed = False

for line in lines:
if skipping_block:
paren_balance += line.count("(") - line.count(")")
if paren_balance <= 0:
skipping_block = False
continue

if "from src.main import" in line:
removed = True
paren_balance = line.count("(") - line.count(")")
if paren_balance > 0:
skipping_block = True
continue

patched.append(line)

if not removed:
logger.info("No src.main re-export found in %s; no patch needed", init_file)
return

trailing_newline = "\n" if original.endswith("\n") else ""
init_file.write_text("\n".join(patched) + trailing_newline)
logger.info(
"Removed src.main re-export from %s to avoid circular import", init_file
)
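
To make the brace counting concrete: the loop above drops a parenthesized, multi-line
re-export while leaving unrelated imports untouched. A small self-contained sketch of
the same logic (the import bodies below are illustrative, not copied from swt-bench):

    sample = (
        "from src.main import (\n"
        "    run_instances,\n"
        ")\n"
        "from src.helpers import something_else\n"
    )
    kept: list[str] = []
    skipping, balance = False, 0
    for line in sample.splitlines():
        if skipping:
            balance += line.count("(") - line.count(")")
            if balance <= 0:
                skipping = False
            continue
        if "from src.main import" in line:
            balance = line.count("(") - line.count(")")
            skipping = balance > 0
            continue
        kept.append(line)
    assert kept == ["from src.helpers import something_else"]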


def _load_prediction_instance_ids(predictions_file: Path) -> list[str]:
instance_ids: list[str] = []
seen = set()
Expand Down Expand Up @@ -244,13 +303,37 @@ def run_swtbench_evaluation(
"""
logger.info(f"Running SWT-Bench evaluation on {predictions_file}")

timeline: list[dict[str, object]] = []
eval_start_ns = time.perf_counter_ns()
success = False
predictions_path = Path(predictions_file).resolve()
profile_output = predictions_path.parent / (
predictions_path.stem + ".swtbench_harness.profile.json"
)
timeline_file = predictions_path.parent / (
predictions_path.stem + ".swtbench_eval.timeline.json"
)

def record(phase: str, start_ns: int, extra: dict[str, object] | None = None):
end_ns = time.perf_counter_ns()
entry: dict[str, object] = {
"phase": phase,
"start_ns": start_ns,
"end_ns": end_ns,
"duration_ms": (end_ns - start_ns) / 1_000_000,
}
if extra:
entry.update(extra)
timeline.append(entry)

try:
# Use a global cache directory for SWT-Bench source
cache_dir = Path.home() / ".cache" / "openhands" / "swt-bench"
swt_bench_dir = cache_dir / "swt-bench"

# Clone SWT-Bench repository if it doesn't exist
if not swt_bench_dir.exists():
clone_start = time.perf_counter_ns()
logger.info("Setting up SWT-Bench source in global cache...")
cache_dir.mkdir(parents=True, exist_ok=True)

@@ -266,22 +349,34 @@
raise subprocess.CalledProcessError(result.returncode, clone_cmd)

logger.info(f"SWT-Bench source installed at {swt_bench_dir}")
record("clone_swt_bench", clone_start)
else:
record("reuse_swt_bench_cache", time.perf_counter_ns())

# Patch upstream sources for micromamba and circular import issues
patch_swt_bench_for_micromamba(swt_bench_dir)

# Get the directory and filename of the predictions file
predictions_path = Path(predictions_file).resolve()
predictions_filename = predictions_path.name

# Copy predictions file to swt-bench directory
copy_start = time.perf_counter_ns()
swt_predictions_file = swt_bench_dir / predictions_filename
shutil.copy2(predictions_file, swt_predictions_file)
record("copy_predictions", copy_start)

# Install a profiling sitecustomize so we can capture harness timings
_write_profile_sitecustomize(swt_bench_dir, profile_output)
# Patch upstream circular import (src/__init__.py -> src.main -> run_evaluation)
_patch_swtbench_circular_import(swt_bench_dir)

# Run the SWT-Bench evaluation directly from the swt-bench directory, using the uv
# environment's Python executable, which has all required dependencies
benchmarks_dir = Path(__file__).parent.parent.parent

# Get the python executable from the uv environment
python_start = time.perf_counter_ns()
python_executable = subprocess.run(
[
"uv",
@@ -296,10 +391,16 @@
text=True,
cwd=benchmarks_dir,
).stdout.strip()
record("resolve_python_executable", python_start)

# Set up environment with PYTHONPATH to include swt-bench directory
env = os.environ.copy()
env["PYTHONPATH"] = str(swt_bench_dir)
env["PYTHONPATH"] = (
f"{swt_bench_dir}:{env['PYTHONPATH']}"
if env.get("PYTHONPATH")
else str(swt_bench_dir)
)
env["SWTBENCH_PROFILE_JSON"] = str(profile_output)

cmd = [
python_executable,
@@ -322,25 +423,29 @@
logger.info("SWT-Bench evaluation output:")
print("-" * 80)

-eval_start = monotonic()
# Stream output directly to console, running from swt-bench directory
harness_start = time.perf_counter_ns()
result = subprocess.run(cmd, text=True, cwd=swt_bench_dir, env=env)
-eval_end = monotonic()
record(
"swtbench_harness",
harness_start,
{"returncode": result.returncode, "cmd": cmd},
)

print("-" * 80)
if result.returncode == 0:
-logger.info(
-"SWT-Bench evaluation completed successfully in %.2fs",
-eval_end - eval_start,
-)
logger.info("SWT-Bench evaluation completed successfully")
else:
logger.error(
"SWT-Bench evaluation failed with return code %s after %.2fs",
result.returncode,
eval_end - eval_start,
f"SWT-Bench evaluation failed with return code {result.returncode}"
)
raise subprocess.CalledProcessError(result.returncode, cmd)

record(
"swtbench_eval_total",
eval_start_ns,
{"events_recorded": len(timeline)},
)
success = True
except FileNotFoundError:
logger.error(
"SWT-Bench evaluation command not found. "
@@ -350,6 +455,27 @@
except Exception as e:
logger.error(f"Error running SWT-Bench evaluation: {e}")
raise
finally:
if not success:
record(
"swtbench_eval_total",
eval_start_ns,
{"events_recorded": len(timeline), "status": "error"},
)
timeline_payload = {
"predictions_file": str(predictions_file),
"dataset": dataset,
"workers": workers,
"started_ns": eval_start_ns,
"ended_ns": time.perf_counter_ns(),
"status": "ok" if success else "error",
"events": timeline,
}
try:
timeline_file.write_text(json.dumps(timeline_payload, indent=2))
logger.info("Wrote timeline to %s", timeline_file)
except Exception as e: # noqa: BLE001
logger.warning("Failed to write SWTBench timeline: %s", e)


def main() -> None:
124 changes: 124 additions & 0 deletions benchmarks/swtbench/swtbench_sitecustomize.py
@@ -0,0 +1,124 @@
"""
Runtime-injected sitecustomize for SWT-Bench harness profiling.

This file is copied into the swt-bench checkout as sitecustomize.py to collect
coarse-grained timing events without modifying upstream code. The output path is
taken from the SWTBENCH_PROFILE_JSON environment variable set by the caller
(falling back to swtbench_profile.json in the working directory).
"""

import atexit
import importlib
import json
import os
import threading
import time
from pathlib import Path
from typing import Any, Dict, Optional


PROFILE_PATH = os.environ.get("SWTBENCH_PROFILE_JSON", "swtbench_profile.json")
_events: list[Dict[str, Any]] = []
_lock = threading.Lock()
_start_ns = time.perf_counter_ns()


def _record(name: str, extra: Optional[Dict[str, Any]] = None):
start_ns = time.perf_counter_ns()

def _end(status: str = "ok", extra_end: Optional[Dict[str, Any]] = None):
end_ns = time.perf_counter_ns()
payload: Dict[str, Any] = {
"name": name,
"status": status,
"start_ns": start_ns,
"end_ns": end_ns,
"duration_ms": (end_ns - start_ns) / 1_000_000,
}
if extra:
payload.update(extra)
if extra_end:
payload.update(extra_end)
with _lock:
_events.append(payload)

return _end


def _safe_patch(module, attr: str, wrapper):
try:
original = getattr(module, attr)
setattr(module, attr, wrapper(original))
except Exception:
# If patching fails, skip silently to avoid impacting the harness.
return


# Patch swt-bench functions if available
try:
run_evaluation = importlib.import_module("run_evaluation") # type: ignore[assignment]

def _wrap_run_instances(original):
def _inner(predictions, instances, *args, **kwargs):
done = _record(
"run_instances",
{"instance_count": len(instances) if instances is not None else None},
)
try:
return original(predictions, instances, *args, **kwargs)
finally:
done()

return _inner

def _wrap_run_eval_exec_spec(original):
def _inner(exec_spec, model_patch, *args, **kwargs):
done = _record(
"run_eval_exec_spec",
{"instance_id": getattr(exec_spec, "instance_id", None)},
)
try:
return original(exec_spec, model_patch, *args, **kwargs)
finally:
done()

return _inner

_safe_patch(run_evaluation, "run_instances", _wrap_run_instances)
_safe_patch(run_evaluation, "run_eval_exec_spec", _wrap_run_eval_exec_spec)
except Exception:
pass

try:
docker_build = importlib.import_module("src.docker_build") # type: ignore[assignment]

def _wrap_build_image(original):
def _inner(image_name, *args, **kwargs):
done = _record("docker_build", {"image_name": image_name})
try:
return original(image_name, *args, **kwargs)
finally:
done()

return _inner

_safe_patch(docker_build, "build_image", _wrap_build_image)
except Exception:
pass


def _flush() -> None:
end_ns = time.perf_counter_ns()
payload = {
"started_ns": _start_ns,
"ended_ns": end_ns,
"duration_ms": (end_ns - _start_ns) / 1_000_000,
"events": _events,
}
try:
Path(PROFILE_PATH).write_text(json.dumps(payload, indent=2))
except Exception:
# Avoid raising during interpreter shutdown
return


atexit.register(_flush)
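
The file named by SWTBENCH_PROFILE_JSON therefore collects one event per patched call
plus overall start/end timestamps; an illustrative payload (placeholder values) might
look like:

    {
      "started_ns": 0,
      "ended_ns": 95000000000,
      "duration_ms": 95000.0,
      "events": [
        {"name": "docker_build", "status": "ok", "start_ns": 1000000, "end_ns": 41001000000, "duration_ms": 41000.0, "image_name": "<image tag>"},
        {"name": "run_eval_exec_spec", "status": "ok", "start_ns": 42000000000, "end_ns": 90000000000, "duration_ms": 48000.0, "instance_id": "<instance id>"},
        {"name": "run_instances", "status": "ok", "start_ns": 41900000000, "end_ns": 94000000000, "duration_ms": 52100.0, "instance_count": 1}
      ]
    }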