diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py
index 7501bf23..22f25144 100644
--- a/benchmarks/swtbench/eval_infer.py
+++ b/benchmarks/swtbench/eval_infer.py
@@ -15,6 +15,7 @@
 import shutil
 import subprocess
 import sys
+import time
 from pathlib import Path
 from time import monotonic
 
@@ -78,6 +79,64 @@ def patch_swt_bench_for_micromamba(swt_bench_dir: Path) -> None:
     )
 
 
+def _write_profile_sitecustomize(swt_bench_dir: Path, profile_output: Path) -> None:
+    """
+    Drop a sitecustomize.py into the swt-bench checkout to capture internal timings.
+
+    This script is picked up automatically by Python when running swt-bench's
+    src/main.py. It records coarse phases (docker builds, run_instances, per-instance
+    execution) and writes them to SWTBench profile JSON.
+    """
+    site_path = swt_bench_dir / "sitecustomize.py"
+    template_path = Path(__file__).parent / "swtbench_sitecustomize.py"
+    site_path.write_text(template_path.read_text())
+
+
+def _patch_swtbench_circular_import(swt_bench_dir: Path) -> None:
+    """
+    Remove the src.main import from swt-bench/src/__init__.py to avoid the
+    circular import that breaks src/main.py when run as a script.
+    """
+    init_file = swt_bench_dir / "src" / "__init__.py"
+    if not init_file.exists():
+        logger.warning("swt-bench src/__init__.py not found; skipping patch")
+        return
+
+    original = init_file.read_text()
+    lines = original.splitlines()
+
+    patched: list[str] = []
+    skipping_block = False
+    paren_balance = 0
+    removed = False
+
+    for line in lines:
+        if skipping_block:
+            paren_balance += line.count("(") - line.count(")")
+            if paren_balance <= 0:
+                skipping_block = False
+            continue
+
+        if "from src.main import" in line:
+            removed = True
+            paren_balance = line.count("(") - line.count(")")
+            if paren_balance > 0:
+                skipping_block = True
+            continue
+
+        patched.append(line)
+
+    if not removed:
+        logger.info("No src.main re-export found in %s; no patch needed", init_file)
+        return
+
+    trailing_newline = "\n" if original.endswith("\n") else ""
+    init_file.write_text("\n".join(patched) + trailing_newline)
+    logger.info(
+        "Removed src.main re-export from %s to avoid circular import", init_file
+    )
+
+
 def _load_prediction_instance_ids(predictions_file: Path) -> list[str]:
     instance_ids: list[str] = []
     seen = set()
@@ -244,6 +303,29 @@ def run_swtbench_evaluation(
     """
     logger.info(f"Running SWT-Bench evaluation on {predictions_file}")
 
+    timeline: list[dict[str, object]] = []
+    eval_start_ns = time.perf_counter_ns()
+    success = False
+    predictions_path = Path(predictions_file).resolve()
+    profile_output = predictions_path.parent / (
+        predictions_path.stem + ".swtbench_harness.profile.json"
+    )
+    timeline_file = predictions_path.parent / (
+        predictions_path.stem + ".swtbench_eval.timeline.json"
+    )
+
+    def record(phase: str, start_ns: int, extra: dict[str, object] | None = None):
+        end_ns = time.perf_counter_ns()
+        entry: dict[str, object] = {
+            "phase": phase,
+            "start_ns": start_ns,
+            "end_ns": end_ns,
+            "duration_ms": (end_ns - start_ns) / 1_000_000,
+        }
+        if extra:
+            entry.update(extra)
+        timeline.append(entry)
+
     try:
         # Use a global cache directory for SWT-Bench source
         cache_dir = Path.home() / ".cache" / "openhands" / "swt-bench"
@@ -251,6 +333,7 @@
 
         # Clone SWT-Bench repository if it doesn't exist
         if not swt_bench_dir.exists():
+            clone_start = time.perf_counter_ns()
             logger.info("Setting up SWT-Bench source in global cache...")
             cache_dir.mkdir(parents=True, exist_ok=True)
 
@@ -266,7 +349,11 @@ def run_swtbench_evaluation(
                 raise subprocess.CalledProcessError(result.returncode, clone_cmd)
 
             logger.info(f"SWT-Bench source installed at {swt_bench_dir}")
+            record("clone_swt_bench", clone_start)
+        else:
+            record("reuse_swt_bench_cache", time.perf_counter_ns())
+
+        # Patch upstream sources for micromamba and circular import issues
         patch_swt_bench_for_micromamba(swt_bench_dir)
 
         # Get the directory and filename of the predictions file
@@ -274,14 +361,22 @@
         predictions_filename = predictions_path.name
 
         # Copy predictions file to swt-bench directory
+        copy_start = time.perf_counter_ns()
         swt_predictions_file = swt_bench_dir / predictions_filename
         shutil.copy2(predictions_file, swt_predictions_file)
+        record("copy_predictions", copy_start)
+
+        # Install a profiling sitecustomize so we can capture harness timings
+        _write_profile_sitecustomize(swt_bench_dir, profile_output)
+        # Patch upstream circular import (src/__init__.py -> src.main -> run_evaluation)
+        _patch_swtbench_circular_import(swt_bench_dir)
 
         # Run SWT-Bench evaluation by running python directly from the swt-bench directory
        # but using the uv environment's python executable which has all dependencies
         benchmarks_dir = Path(__file__).parent.parent.parent
 
         # Get the python executable from the uv environment
+        python_start = time.perf_counter_ns()
         python_executable = subprocess.run(
             [
                 "uv",
@@ -296,10 +391,16 @@
             text=True,
             cwd=benchmarks_dir,
         ).stdout.strip()
+        record("resolve_python_executable", python_start)
 
         # Set up environment with PYTHONPATH to include swt-bench directory
         env = os.environ.copy()
-        env["PYTHONPATH"] = str(swt_bench_dir)
+        env["PYTHONPATH"] = (
+            f"{swt_bench_dir}:{env['PYTHONPATH']}"
+            if env.get("PYTHONPATH")
+            else str(swt_bench_dir)
+        )
+        env["SWTBENCH_PROFILE_JSON"] = str(profile_output)
 
         cmd = [
             python_executable,
@@ -322,25 +423,29 @@
         logger.info("SWT-Bench evaluation output:")
         print("-" * 80)
-        eval_start = monotonic()
         # Stream output directly to console, running from swt-bench directory
+        harness_start = time.perf_counter_ns()
         result = subprocess.run(cmd, text=True, cwd=swt_bench_dir, env=env)
-        eval_end = monotonic()
+        record(
+            "swtbench_harness",
+            harness_start,
+            {"returncode": result.returncode, "cmd": cmd},
+        )
 
         print("-" * 80)
 
         if result.returncode == 0:
-            logger.info(
-                "SWT-Bench evaluation completed successfully in %.2fs",
-                eval_end - eval_start,
-            )
+            logger.info("SWT-Bench evaluation completed successfully")
         else:
             logger.error(
-                "SWT-Bench evaluation failed with return code %s after %.2fs",
-                result.returncode,
-                eval_end - eval_start,
+                f"SWT-Bench evaluation failed with return code {result.returncode}"
             )
             raise subprocess.CalledProcessError(result.returncode, cmd)
-
+        record(
+            "swtbench_eval_total",
+            eval_start_ns,
+            {"events_recorded": len(timeline)},
+        )
+        success = True
     except FileNotFoundError:
         logger.error(
             "SWT-Bench evaluation command not found. "
@@ -350,6 +455,27 @@ def run_swtbench_evaluation(
     except Exception as e:
         logger.error(f"Error running SWT-Bench evaluation: {e}")
         raise
+    finally:
+        if not success:
+            record(
+                "swtbench_eval_total",
+                eval_start_ns,
+                {"events_recorded": len(timeline), "status": "error"},
+            )
+        timeline_payload = {
+            "predictions_file": str(predictions_file),
+            "dataset": dataset,
+            "workers": workers,
+            "started_ns": eval_start_ns,
+            "ended_ns": time.perf_counter_ns(),
+            "status": "ok" if success else "error",
+            "events": timeline,
+        }
+        try:
+            timeline_file.write_text(json.dumps(timeline_payload, indent=2))
+            logger.info("Wrote timeline to %s", timeline_file)
+        except Exception as e:  # noqa: BLE001
+            logger.warning("Failed to write SWTBench timeline: %s", e)
 
 
 def main() -> None:
diff --git a/benchmarks/swtbench/swtbench_sitecustomize.py b/benchmarks/swtbench/swtbench_sitecustomize.py
new file mode 100644
index 00000000..10ccc401
--- /dev/null
+++ b/benchmarks/swtbench/swtbench_sitecustomize.py
@@ -0,0 +1,124 @@
+"""
+Runtime-injected sitecustomize for SWT-Bench harness profiling.
+
+This file is copied into the swt-bench checkout as sitecustomize.py to collect
+coarse-grained timing events without modifying upstream code. It is activated
+only when PROFILE_SWTBENCH/SWTBENCH_PROFILE_JSON are set by the caller.
+"""
+
+import atexit
+import importlib
+import json
+import os
+import threading
+import time
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+
+PROFILE_PATH = os.environ.get("SWTBENCH_PROFILE_JSON", "swtbench_profile.json")
+_events: list[Dict[str, Any]] = []
+_lock = threading.Lock()
+_start_ns = time.perf_counter_ns()
+
+
+def _record(name: str, extra: Optional[Dict[str, Any]] = None):
+    start_ns = time.perf_counter_ns()
+
+    def _end(status: str = "ok", extra_end: Optional[Dict[str, Any]] = None):
+        end_ns = time.perf_counter_ns()
+        payload: Dict[str, Any] = {
+            "name": name,
+            "status": status,
+            "start_ns": start_ns,
+            "end_ns": end_ns,
+            "duration_ms": (end_ns - start_ns) / 1_000_000,
+        }
+        if extra:
+            payload.update(extra)
+        if extra_end:
+            payload.update(extra_end)
+        with _lock:
+            _events.append(payload)
+
+    return _end
+
+
+def _safe_patch(module, attr: str, wrapper):
+    try:
+        original = getattr(module, attr)
+        setattr(module, attr, wrapper(original))
+    except Exception:
+        # If patching fails, skip silently to avoid impacting the harness.
+        return
+
+
+# Patch swt-bench functions if available
+try:
+    run_evaluation = importlib.import_module("run_evaluation")  # type: ignore[assignment]
+
+    def _wrap_run_instances(original):
+        def _inner(predictions, instances, *args, **kwargs):
+            done = _record(
+                "run_instances",
+                {"instance_count": len(instances) if instances is not None else None},
+            )
+            try:
+                return original(predictions, instances, *args, **kwargs)
+            finally:
+                done()
+
+        return _inner
+
+    def _wrap_run_eval_exec_spec(original):
+        def _inner(exec_spec, model_patch, *args, **kwargs):
+            done = _record(
+                "run_eval_exec_spec",
+                {"instance_id": getattr(exec_spec, "instance_id", None)},
+            )
+            try:
+                return original(exec_spec, model_patch, *args, **kwargs)
+            finally:
+                done()
+
+        return _inner
+
+    _safe_patch(run_evaluation, "run_instances", _wrap_run_instances)
+    _safe_patch(run_evaluation, "run_eval_exec_spec", _wrap_run_eval_exec_spec)
+except Exception:
+    pass
+
+try:
+    docker_build = importlib.import_module("src.docker_build")  # type: ignore[assignment]
+
+    def _wrap_build_image(original):
+        def _inner(image_name, *args, **kwargs):
+            done = _record("docker_build", {"image_name": image_name})
+            try:
+                return original(image_name, *args, **kwargs)
+            finally:
+                done()
+
+        return _inner
+
+    _safe_patch(docker_build, "build_image", _wrap_build_image)
+except Exception:
+    pass
+
+
+def _flush() -> None:
+    end_ns = time.perf_counter_ns()
+    payload = {
+        "started_ns": _start_ns,
+        "ended_ns": end_ns,
+        "duration_ms": (end_ns - _start_ns) / 1_000_000,
+        "events": _events,
+    }
+    try:
+        Path(PROFILE_PATH).write_text(json.dumps(payload, indent=2))
+    except Exception:
+        # Avoid raising during interpreter shutdown
+        return
+
+
+atexit.register(_flush)
diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py
index e2ae1db5..ec1e1945 100644
--- a/benchmarks/utils/evaluation.py
+++ b/benchmarks/utils/evaluation.py
@@ -6,6 +6,7 @@
 import json
 import os
 import sys
+import time
 from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from contextlib import contextmanager
@@ -14,6 +15,7 @@
 from typing import Callable, List, Optional, Tuple
 from uuid import UUID
 
+import numpy as np
 from lmnr import Laminar
 from pydantic import BaseModel, Field
 from tqdm import tqdm
@@ -298,6 +300,17 @@ def _run_iterative_mode(
         # Create attempt-specific output callback
         attempt_outputs: List[EvalOutput] = []
 
+        def _make_json_safe(value: object) -> object:
+            if isinstance(value, np.ndarray):
+                return value.tolist()
+            if isinstance(value, np.generic):
+                return value.item()
+            if isinstance(value, dict):
+                return {k: _make_json_safe(v) for k, v in value.items()}
+            if isinstance(value, (list, tuple)):
+                return [_make_json_safe(v) for v in value]
+            return value
+
         def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None:
             attempt_outputs.append(out)
             # Write to attempt-specific file
@@ -306,8 +319,9 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None:
                 f"output.critic_attempt_{attempt}.jsonl",
             )
             try:
-                with open(attempt_file, "a") as f:
-                    f.write(out.model_dump_json() + "\n")
+                payload = _make_json_safe(out.model_dump(mode="json"))
+                with open(attempt_file, "a", encoding="utf-8") as f:
+                    f.write(json.dumps(payload) + "\n")
             except Exception as e:
                 logger.warning(
                     f"Failed to write to attempt file {attempt_file}: {e}"
@@ -461,6 +475,13 @@ def _process_one_mp(
         - Ensures proper context-managed cleanup
         - Returns (instance, output) so the parent can stream results
         """
+        timeline_dir = Path(self.metadata.eval_output_dir) / "timelines"
+        timeline_dir.mkdir(parents=True, exist_ok=True)
+
+        def write_timeline(entry: dict[str, object]) -> None:
+            path = timeline_dir / f"{instance.id}.attempt{entry.get('attempt', 0)}.json"
+            path.write_text(json.dumps(entry, indent=2))
+
         # Set up instance-specific logging
         log_dir = os.path.join(self.metadata.eval_output_dir, "logs")
         reset_logger_for_multiprocessing(log_dir, instance.id)
@@ -480,6 +501,12 @@
 
         while retry_count <= max_retries:
             workspace = None
+            attempt_index = retry_count + 1
+            attempt_start_ns = time.perf_counter_ns()
+            attempt_start_ts = datetime.now(timezone.utc).isoformat()
+            attempt_status = "error"
+            phases: list[dict[str, int | str]] = []
+
             resource_factor = self.metadata.base_resource_factor
             # Start Laminar execution span and inject context into os.environ so workspace can pick it up
             # Escape the serialized context to safely pass as a cli argument
@@ -506,11 +533,19 @@
                     f"resource_factor={resource_factor}"
                 )
 
+                ws_start = time.perf_counter_ns()
                 workspace = self.prepare_workspace(
                     instance,
                     resource_factor=resource_factor,
                     forward_env=LMNR_ENV_VARS,
                 )
+                phases.append(
+                    {
+                        "name": "prepare_workspace",
+                        "start_ns": int(ws_start),
+                        "end_ns": int(time.perf_counter_ns()),
+                    }
+                )
 
                 # Record runtime/pod mapping only for remote runtimes
                 if isinstance(workspace, APIRemoteWorkspace):
@@ -535,10 +570,19 @@
                         runtime_run.session_id,
                         runtime_run.resource_factor,
                     )
+                eval_start = time.perf_counter_ns()
                 out = self.evaluate_instance(instance, workspace)
+                phases.append(
+                    {
+                        "name": "evaluate_instance",
+                        "start_ns": int(eval_start),
+                        "end_ns": int(time.perf_counter_ns()),
+                    }
+                )
                 if runtime_runs:
                     out.runtime_runs = runtime_runs
                 logger.info("[child] done id=%s", instance.id)
+                attempt_status = "ok"
                 return instance, out
             except Exception as e:
                 last_error = e
@@ -593,6 +637,7 @@
                     return instance, error_output
             finally:
                 # Ensure workspace cleanup happens regardless of success or failure
+                cleanup_start = time.perf_counter_ns()
                 if workspace is not None:
                     try:
                         self._capture_conversation_archive(workspace, instance)
@@ -614,6 +659,43 @@
                             f"{str(cleanup_error)[:50]}"
                         )
                 lmnr_span.end()
+                phases.append(
+                    {
+                        "name": "cleanup",
+                        "start_ns": int(cleanup_start),
+                        "end_ns": int(time.perf_counter_ns()),
+                    }
+                )
+                attempt_end_ns = time.perf_counter_ns()
+                write_timeline(
+                    {
+                        "instance_id": instance.id,
+                        "attempt": attempt_index,
+                        "critic_attempt": critic_attempt,
+                        "status": attempt_status,
+                        "error": (
+                            str(last_error) if attempt_status != "ok" else None
+                        ),
+                        "start_ts": attempt_start_ts,
+                        "end_ts": datetime.now(timezone.utc).isoformat(),
+                        "duration_ms": (attempt_end_ns - attempt_start_ns)
+                        / 1_000_000,
+                        "resource_factor": resource_factor,
+                        "runtime_failure_count": runtime_failure_count,
+                        "phases": [
+                            {
+                                "name": p["name"],
+                                "duration_ms": (
+                                    (int(p["end_ns"]) - int(p["start_ns"]))
+                                    / 1_000_000
+                                ),
+                                "start_ns": int(p["start_ns"]),
+                                "end_ns": int(p["end_ns"]),
+                            }
+                            for p in phases
+                        ],
+                    }
+                )
 
         # This should never be reached, but added for type safety
         error_output = self._create_error_output(
diff --git a/uv.lock b/uv.lock
index e7351742..7b04cc65 100644
--- a/uv.lock
+++ b/uv.lock
@@ -947,11 +947,11 @@ wheels = [
 
 [[package]]
 name = "filelock"
-version = "3.19.1"
+version = "3.20.3"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/40/bb/0ab3e58d22305b6f5440629d20683af28959bf793d98d11950e305c1c326/filelock-3.19.1.tar.gz", hash = "sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58", size = 17687, upload-time = "2025-08-14T16:56:03.016Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/1d/65/ce7f1b70157833bf3cb851b556a37d4547ceafc158aa9b34b36782f23696/filelock-3.20.3.tar.gz", hash = "sha256:18c57ee915c7ec61cff0ecf7f0f869936c7c30191bb0cf406f1341778d0834e1", size = 19485, upload-time = "2026-01-09T17:55:05.421Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/36/7fb70f04bf00bc646cd5bb45aa9eddb15e19437a28b8fb2b4a5249fac770/filelock-3.20.3-py3-none-any.whl", hash = "sha256:4b0dda527ee31078689fc205ec4f1c1bf7d56cf88b6dc9426c4f230e46c2dce1", size = 16701, upload-time = "2026-01-09T17:55:04.334Z" },
 ]
 
 [[package]]
@@ -1678,11 +1678,11 @@ wheels = [
 
 [[package]]
 name = "libtmux"
-version = "0.46.2"
+version = "0.53.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/9c/aa/7e1dcaa097156d6f3a7d8669be4389dced997feeb81744e3ff4681d65ee8/libtmux-0.46.2.tar.gz", hash = "sha256:9a398fec5d714129c8344555d466e1a903dfc0f741ba07aabe75a8ceb25c5dda", size = 346887, upload-time = "2025-05-26T19:40:04.096Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/e7/28/e2b252817cb181aec2f42fe2d1d7fac5ec9c4d15bfb2b8ea4bd1179e4244/libtmux-0.53.0.tar.gz", hash = "sha256:1d19af4cea0c19543954d7e7317c7025c0739b029cccbe3b843212fae238f1bd", size = 405001, upload-time = "2025-12-14T11:59:11.337Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/d6/2f/9d207039fcfa00d3b30e4d765f062fbcc42c873c7518a8cfebb3eafd00e0/libtmux-0.46.2-py3-none-any.whl", hash = "sha256:6c32dbf22bde8e5e33b2714a4295f6e838dc640f337cd4c085a044f6828c7793", size = 60873, upload-time = "2025-05-26T19:40:02.284Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/d0/2e8bc5caa639ebb9f8801ba0be7070a28d48d8ed60e2a428d40f71fb88b8/libtmux-0.53.0-py3-none-any.whl", hash = "sha256:024b7ae6a12aae55358e8feb914c8632b3ab9bd61c0987c53559643c6a58ee4f", size = 77582, upload-time = "2025-12-14T11:59:09.739Z" },
 ]
 
 [[package]]
@@ -2269,7 +2269,7 @@ wheels = [
 
 [[package]]
 name = "openhands-agent-server"
-version = "1.7.2"
+version = "1.8.1"
 source = { editable = "vendor/software-agent-sdk/openhands-agent-server" }
 dependencies = [
     { name = "aiosqlite" },
@@ -2407,11 +2407,12 @@ dev = [
 
 [[package]]
 name = "openhands-sdk"
-version = "1.7.2"
+version = "1.8.1"
 source = { editable = "vendor/software-agent-sdk/openhands-sdk" }
 dependencies = [
     { name = "deprecation" },
     { name = "fastmcp" },
+    { name = "filelock" },
     { name = "httpx" },
     { name = "litellm" },
     { name = "lmnr" },
@@ -2432,10 +2433,11 @@ requires-dist = [
     { name = "boto3", marker = "extra == 'boto3'", specifier = ">=1.35.0" },
     { name = "deprecation", specifier = ">=2.1.0" },
     { name = "fastmcp", specifier = ">=2.11.3" },
+    { name = "filelock", specifier = ">=3.20.1" },
     { name = "httpx", specifier = ">=0.27.0" },
     { name = "litellm", specifier = ">=1.80.10" },
     { name = "lmnr", specifier = ">=0.7.24" },
-    { name = "pydantic", specifier = ">=2.11.7" },
+    { name = "pydantic", specifier = ">=2.12.5" },
     { name = "python-frontmatter", specifier = ">=1.1.0" },
     { name = "python-json-logger", specifier = ">=3.3.0" },
     { name = "tenacity", specifier = ">=9.1.2" },
@@ -2445,7 +2447,7 @@ provides-extras = ["boto3"]
 
 [[package]]
 name = "openhands-tools"
-version = "1.7.2"
+version = "1.8.1"
 source = { editable = "vendor/software-agent-sdk/openhands-tools" }
 dependencies = [
     { name = "bashlex" },
@@ -2466,7 +2468,7 @@ requires-dist = [
     { name = "browser-use", specifier = ">=0.8.0" },
     { name = "cachetools" },
     { name = "func-timeout", specifier = ">=4.3.5" },
-    { name = "libtmux", specifier = ">=0.46.2" },
+    { name = "libtmux", specifier = ">=0.53.0" },
     { name = "openhands-sdk", editable = "vendor/software-agent-sdk/openhands-sdk" },
     { name = "pydantic", specifier = ">=2.11.7" },
     { name = "tom-swe", specifier = ">=1.0.3" },
@@ -2474,7 +2476,7 @@
 
 [[package]]
 name = "openhands-workspace"
-version = "1.7.2"
+version = "1.8.1"
 source = { editable = "vendor/software-agent-sdk/openhands-workspace" }
 dependencies = [
     { name = "openhands-agent-server" },