diff --git a/eval_protocol/utils/evaluation_row_utils.py b/eval_protocol/utils/evaluation_row_utils.py index d89f0c55..bb1e94c7 100644 --- a/eval_protocol/utils/evaluation_row_utils.py +++ b/eval_protocol/utils/evaluation_row_utils.py @@ -9,6 +9,7 @@ from typing import List from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import InputMetadata def serialize_message(msg: Message) -> str: @@ -134,3 +135,27 @@ def assistant_to_ground_truth(data: List[EvaluationRow]) -> List[EvaluationRow]: ) return processed_rows + + +def create_rows_from_indices(count: int, **metadata) -> List[EvaluationRow]: + """Create evaluation rows with sequential row_ids. + + Useful for remote processors where the server determines content based on row_id. + + Args: + count: Number of rows to create + **metadata: Additional metadata to include in each row + + Returns: + List of EvaluationRows with row_id set to "0", "1", "2", ... + """ + rows = [] + for idx in range(count): + row_metadata = {"row_id": str(idx), **metadata} + rows.append( + EvaluationRow( + messages=[], + input_metadata=InputMetadata(**row_metadata), + ) + ) + return rows diff --git a/examples/swebench/README.md b/examples/swebench/README.md new file mode 100644 index 00000000..a696ed6f --- /dev/null +++ b/examples/swebench/README.md @@ -0,0 +1,300 @@ +# SWE-bench Evaluation Example + +This example shows how to evaluate LLM models on the SWE-bench software engineering benchmark using eval-protocol. + +## Quick Start + +### 1. Install Dependencies + +```bash +# From the python-sdk repository root +cd python-sdk + +# Install eval-protocol with swebench support +pip install -e ".[swebench]" +``` + +### 2. Set up mini-swe-agent + +mini-swe-agent requires a Fireworks API key to function: + +```bash +# Configure API key for mini-swe-agent +mini-extra config set FIREWORKS_API_KEY your_fireworks_api_key + +# Verify it's set +mini-extra config get FIREWORKS_API_KEY +``` + +### 3. Install SWE-bench Harness + +```bash +# Navigate to the swebench example directory +cd examples/swebench + +# Clone and install SWE-bench +git clone https://github.com/princeton-nlp/SWE-bench +pip install -e SWE-bench +``` + +### 4. Set Environment Variables + +```bash +export FIREWORKS_API_KEY="your_fireworks_api_key" +``` + +## Running the Evaluation + +**IMPORTANT:** Always run both the server and tests from the `examples/swebench/` directory. + +### Step 1: Start the Server + +Open a terminal and run: + +```bash +cd examples/swebench +python server.py +``` + +You should see: +``` +INFO: Uvicorn running on http://127.0.0.1:3000 (Press CTRL+C to quit) +``` + +### Step 2: Configure Your Test + +Edit `tests/test_swebench.py` to set your model and parameters: + +```python +completion_params=[{ + "model": "accounts/fireworks/models/your-model-name", # Edit this + "model_kwargs": { + "temperature": 0.2, # Optional + # "max_tokens": 2048, # Optional + # "reasoning": "high", # Optional + } +}], +max_concurrent_rollouts=3, # How many instances to run in parallel +``` + +To test different numbers of instances, edit line 26: +```python +def rows() -> List[EvaluationRow]: + return rows_from_indices(2) # Change 2 to desired number (max 500) +``` + +### Step 3: Run the Test + +Open a second terminal: + +```bash +cd examples/swebench +pytest tests/test_swebench.py -v -s +``` + +## What Happens During a Run + +For each instance (row): + +1. **Server receives request** from pytest +2. **Wrapper script** (`run_swe_agent_fw.py`) is called with the instance index +3. 
**mini-swe-agent** runs in a Docker container for that specific repository +4. **Agent attempts to solve** the issue by editing code +5. **Patch is generated** and saved to `preds.json` +6. **SWE-bench harness** applies the patch and runs tests +7. **Results** are written to the row directory +8. **Test fetches results** and displays pass/fail in the UI + +## Understanding the Output + +### Directory Structure + +Each instance creates its own `row_N/` directory: + +``` +examples/swebench/ +├── row_0/ # First instance +│ ├── preds.json # ← Model's generated patch +│ ├── astropy__astropy-12907/ # Instance-specific folder +│ │ └── astropy__astropy-12907.traj.json # Agent's execution trace +│ ├── logs/ # Harness execution logs +│ │ └── run_evaluation/ +│ │ └── eval-run/ +│ │ └── / +│ │ └── astropy__astropy-12907/ +│ │ ├── report.json # ← Test results (pass/fail) +│ │ ├── test_output.txt # Test execution output +│ │ ├── patch.diff # Applied patch +│ │ └── eval.sh # Evaluation script +│ ├── agent_0.log # Agent console output +│ ├── exit_statuses_*.yaml # Exit status if failed +│ └── .eval-run.json # Overall run summary +├── row_1/ # Second instance +│ └── ... +└── ... +``` + +### Key Files Explained + +#### `preds.json` - Model Predictions +Location: `row_N/preds.json` + +Contains the patch generated by the model: +```json +{ + "astropy__astropy-12907": { + "model_name_or_path": "accounts/fireworks/models/...", + "instance_id": "astropy__astropy-12907", + "model_patch": "diff --git a/... (the actual patch)" + } +} +``` + +**If missing:** Agent failed before generating a patch (check `exit_statuses_*.yaml`) + +#### `report.json` - Test Results +Location: `row_N/logs/run_evaluation/eval-run///report.json` + +Contains pass/fail status after running tests: +```json +{ + "astropy__astropy-12907": { + "patch_is_None": false, + "patch_exists": true, + "patch_successfully_applied": true, + "resolved": true, // ← Was the issue fixed? + "tests_status": { + "FAIL_TO_PASS": {"success": [...], "failure": []}, + "PASS_TO_PASS": {"success": [...], "failure": []} + } + } +} +``` + +- `resolved: true` = Instance solved! All required tests pass. +- `resolved: false` = Instance not solved (tests still failing) + +**If missing:** Agent didn't generate a patch or harness didn't run + +#### `exit_statuses_*.yaml` - Why Runs Failed +Location: `row_N/exit_statuses_*.yaml` + +```yaml +instances_by_exit_status: + Submitted: [] + LimitsExceeded: ["astropy__astropy-12907"] # Hit step/cost limits + Error: [] +``` + +Common statuses: +- `Submitted`: Completed normally +- `LimitsExceeded`: Agent hit max steps or cost limit +- `Error`: Unexpected error during execution + +#### `agent_N.log` - Agent Execution +Location: `row_N/agent_N.log` + +Full console output from the agent run, including: +- Docker container startup +- Model API calls +- Commands executed +- Errors (if any) + +#### `*.traj.json` - Agent Trajectory +Location: `row_N//.traj.json` + +Complete record of the agent's execution: +```json +{ + "instance_id": "astropy__astropy-12907", + "info": { + "submission": "...", // The patch + "exit_status": "Submitted", + "model_stats": { + "instance_cost": 0.05, + "api_calls": 15 + } + }, + "messages": [...] 
// All agent messages +} +``` + +## Viewing Results + +### In the Terminal + +The test output shows: +``` +INFO:test_swebench:[Row 0] Found instance_id: astropy__astropy-12907 +INFO:test_swebench:[Row 0] Report says resolved=True +INFO:test_swebench:[Row 0] Final: resolved=True, reason=harness_resolved=True +``` + +### In the Eval Protocol UI + +If Elasticsearch is running, visit: `http://localhost:8000` +- View aggregate scores +- Inspect individual trajectories +- Filter by resolved/unresolved +- See cost and token usage + +### Check Individual Files + +```bash +# Check if instance was solved +cat row_0/logs/run_evaluation/eval-run//astropy__astropy-12907/report.json | jq '.["astropy__astropy-12907"].resolved' + +# View the generated patch +cat row_0/preds.json | jq '.["astropy__astropy-12907"].model_patch' + +# Check exit status +cat row_0/exit_statuses_*.yaml +``` + +## Performance Notes + +- **Small test (2 instances):** ~10-30 minutes +- **Full dataset (500 instances):** 24-48 hours on a 16-core machine +- **Concurrent runs:** Recommended 3-5 based on CPU/memory +- **Docker space:** ~100GB for all images (downloads happen automatically) + +## Troubleshooting + +### Docker container fails to start +```bash +# Check Docker is running +docker ps + +# Check disk space +df -h +``` + +### Agent hits step limits +Instances that consistently hit limits may need: +- Higher step limit (edit mini-swe-agent config) +- Different prompting strategy +- More capable model + +### Server not responding +```bash +# Check server is running +curl http://127.0.0.1:3000/status?rollout_id=test + +# Check server logs for errors +# (shown in terminal where server.py is running) +``` + +## Next Steps + +- Review results in `row_*/logs/.../report.json` +- Analyze failed instances to improve your model +- Run on larger subsets to get statistical significance +- Export results for further analysis + +## Support + +For issues: +- Check agent logs: `row_N/agent_N.log` +- Check exit statuses: `row_N/exit_statuses_*.yaml` +- Verify Docker has sufficient resources +- Ensure API key is valid and has credits diff --git a/examples/swebench/SWE-bench b/examples/swebench/SWE-bench new file mode 160000 index 00000000..5cd4be9f --- /dev/null +++ b/examples/swebench/SWE-bench @@ -0,0 +1 @@ +Subproject commit 5cd4be9fb23971679cbbafe5a0ecade27cef99be diff --git a/examples/swebench/run_swe_agent_fw.py b/examples/swebench/run_swe_agent_fw.py new file mode 100755 index 00000000..4d145038 --- /dev/null +++ b/examples/swebench/run_swe_agent_fw.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python3 +""" +Fireworks-compatible wrapper for mini-swe-agent SWE-bench evaluations. + +This script handles Fireworks API compatibility by stripping non-standard fields +that mini-swe-agent adds for internal tracking. 
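+For example, a message that carries extra bookkeeping keys is reduced to just
+role/content (plus name/tool_calls when present) before it is sent to the API.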
+ +Requires fully qualified Fireworks model paths: +- Serverless models: fireworks_ai/accounts/fireworks/models/{model_name} +- Deployed models: fireworks_ai/accounts/{account}/deployedModels/{model_name} + +Usage: + python run_swe_agent_fw.py [options] + + +Requirements: + - mini-swe-agent: pip install mini-swe-agent + - Fireworks API key: Set via 'mini-extra config set FIREWORKS_API_KEY ' +""" + +import argparse +import os +import sys +import subprocess +import tempfile +from pathlib import Path +from typing import Any + +# Import required dependencies +from minisweagent.models.litellm_model import LitellmModel, LitellmModelConfig +import litellm + + +def __get_api_key(): + """Get Fireworks API key from environment or mini-swe-agent config.""" + # Environment variable takes precedence + api_key = os.environ.get("FIREWORKS_API_KEY") + if api_key: + return api_key + + # Try to get API key from mini-swe-agent's config system + try: + from minisweagent.config import get_config + + config = get_config() + return config.get("FIREWORKS_API_KEY") + except (ImportError, AttributeError, KeyError): + # Fallback: check common config file locations + config_paths = [ + Path.home() / ".config" / "mini-swe-agent" / ".env", + Path.home() / "Library" / "Application Support" / "mini-swe-agent" / ".env", + ] + + for config_path in config_paths: + if config_path.exists(): + try: + with open(config_path) as f: + for line in f: + if line.startswith("FIREWORKS_API_KEY="): + value = line.split("=", 1)[1].strip() + return value.strip("'\"") + except (IOError, OSError): + continue + + return None + + +def __test_model(model_id): + """Test model connectivity with a simple completion.""" + from litellm import completion + + # Verify API key exists + api_key = __get_api_key() + if not api_key: + print("Error: FIREWORKS_API_KEY not found.") + return False + + # Configure environment for litellm + os.environ["FIREWORKS_API_KEY"] = api_key + # Assume model_id is fully qualified + model_name = model_id + + print(f"Testing model: {model_name}") + + try: + # Send test completion + response = completion( + model=model_name, + messages=[{"role": "user", "content": "Test message. Reply with OK."}], + temperature=0.0, + max_tokens=10, + ) + + print(f"Success. 
Response: {response.choices[0].message.content}") + print(f"Tokens used: {response.usage.total_tokens}") + return True + + except Exception as e: + print(f"Error: {e}") + return False + + +def __validate_environment(): + """Check for required API key.""" + if not __get_api_key(): + print("Warning: FIREWORKS_API_KEY not found.") + print("Set it with: mini-extra config set FIREWORKS_API_KEY ") + + +def __build_command(args, wrapper_module_path): + """Build mini-swe-agent command with appropriate arguments.""" + # Construct model class path + wrapper_module = wrapper_module_path.stem + model_class = f"{wrapper_module}.FireworksCompatibleModel" + + # Base command - assume model_id is fully qualified + cmd = [ + sys.executable, + "-m", + "minisweagent.run.mini_extra", + "swebench-single" if args.single is not None else "swebench", + "--model", + args.model_id, + "--model-class", + "tracing_model.FireworksCompatibleModel", + "--subset", + args.subset, + "--split", + args.split, + ] + if args.model_class: + cmd.extend(["--model-class", args.model_class]) + print(f"Output: {args.output}") + print(args.single) + # Mode-specific arguments + print(f"Output: {args.output}") + print(args.single) + # Mode-specific arguments + if args.single is not None: + # Use batch mode for a single index via slice and write to a per-row directory + from pathlib import Path + + slice_spec = f"{args.single}:{args.single + 1}" + row_dir = str((Path(args.output) if args.output else Path.cwd()) / f"row_{args.single}") + cmd = [ + sys.executable, + "-m", + "minisweagent.run.mini_extra", + "swebench", + "--model", + args.model_id, + "--model-class", + model_class, + "--subset", + args.subset, + "--split", + args.split, + "--slice", + slice_spec, + "--output", + row_dir, + ] + if args.model_class: + cmd.extend(["--model-class", args.model_class]) + print(f"DEBUG: Using batch mode with slice {slice_spec}, output={row_dir}") + else: + if args.instances: + cmd.extend(["--slice", f"0:{args.instances}"]) + cmd.extend(["--workers", str(args.workers), "--output", args.output]) + + return cmd + + +def main(): + parser = argparse.ArgumentParser( + description="Run mini-swe-agent with Fireworks models on SWE-bench", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + # Required model ID + parser.add_argument("model_id", help="Fireworks model ID") + parser.add_argument("--model-class", type=str, default=None, help="Optional mini-swe-agent model-class") + # Execution options + parser.add_argument("--instances", type=int, help="Number of instances to run") + parser.add_argument("--workers", type=int, default=1, help="Parallel workers (default: 1)") + parser.add_argument("--output", help="Output directory") + parser.add_argument("--subset", default="verified", choices=["verified", "lite", "full"]) + parser.add_argument("--split", default="test", choices=["dev", "test"]) + parser.add_argument("--single", type=int, metavar="INDEX", help="Run single instance") + parser.add_argument("--exit-immediately", action="store_true") + parser.add_argument("--test", action="store_true", help="Test model connectivity") + parser.add_argument( + "--reasoning", + type=str, + choices=["low", "medium", "high"], + default=None, + help="Provider-specific reasoning effort", + ) + parser.add_argument("--temperature", type=float, default=None, help="Model temperature override") + parser.add_argument("--max-tokens", type=int, default=None, help="Max tokens override") + args = parser.parse_args() + + # Handle test mode + if args.test: 
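+        # Connectivity check only: __test_model sends a one-shot completion and the
+        # exit code reflects whether it succeeded; no SWE-bench work happens here.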
+        sys.exit(0 if __test_model(args.model_id) else 1)
+
+    # Validate API key
+    __validate_environment()
+
+    # Set default output directory
+    if args.output is None:
+        safe_model_id = args.model_id.replace("/", "-").replace(":", "-")
+        script_dir = Path(__file__).parent.resolve()
+        args.output = str(script_dir / f"swebench-{safe_model_id}-results")
+
+    # Create temporary module for importing FireworksCompatibleModel
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+        with open(__file__, "r") as current_file:
+            f.write(current_file.read())
+        # Inject per-run model overrides directly into the temp module
+        f.write("\n# --- Injected by wrapper: per-run model overrides ---\n")
+        f.write("WRAPPER_MODEL_OVERRIDES = {\n")
+        f.write(f"    'reasoning': {repr(args.reasoning)},\n")
+        f.write(f"    'temperature': {repr(args.temperature)},\n")
+        f.write(f"    'max_tokens': {repr(args.max_tokens)},\n")
+        f.write("}\n")
+        temp_module_path = Path(f.name)
+
+    try:
+        # Configure environment
+        env = os.environ.copy()
+        env["PYTHONPATH"] = f"{temp_module_path.parent}:{env.get('PYTHONPATH', '')}"
+        # Pass the fully qualified model path to the subprocess
+        env["FIREWORKS_MODEL_ID"] = args.model_id
+
+        # Ensure API key is passed to subprocess
+        api_key = __get_api_key()
+        if api_key:
+            env["FIREWORKS_API_KEY"] = api_key
+
+        # No environment variables for model kwargs; overrides are injected into the temp module
+
+        # Build command
+        cmd = __build_command(args, temp_module_path)
+
+        # Display configuration
+        print(f"Model: {args.model_id}")
+        print(f"Output: {args.output}")
+        print(f"Workers: {args.workers}")
+        if args.instances:
+            print(f"Instances: {args.instances}")
+
+        # Debug: Show the actual command being run
+        print(f"Command: {' '.join(cmd)}")
+        print(f"Model path in command: {cmd[cmd.index('--model') + 1] if '--model' in cmd else 'NOT FOUND'}")
+
+        # Execute mini-swe-agent
+        subprocess.run(cmd, env=env, check=True)
+
+    finally:
+        # Clean up temporary module
+        if temp_module_path.exists():
+            temp_module_path.unlink()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/swebench/server.py b/examples/swebench/server.py
new file mode 100644
index 00000000..1c1fbb03
--- /dev/null
+++ b/examples/swebench/server.py
@@ -0,0 +1,246 @@
+"""Minimal SWE-bench server - wraps run_swe_agent_fw.py with tracing via model_base_url."""
+
+import os
+import threading
+import subprocess
+import logging
+from fastapi import FastAPI
+import uvicorn
+
+from eval_protocol import Status, InitRequest, RolloutIdFilter
+from eval_protocol.log_utils.init import init_external_logging_from_env
+
+app = FastAPI()
+
+# Attach Elasticsearch handler to root logger (Eval Protocol UI)
+init_external_logging_from_env()
+# rollout_states = {}
+
+
+@app.post("/init")
+def init(req: InitRequest):
+    # Allow Eval Protocol to dynamically configure ES endpoint
+
+    # Tag all logs for this rollout_id
+    logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}")
+    logger.addFilter(RolloutIdFilter(req.metadata.rollout_id))
+
+    def _worker():
+        # Pre-bind result fields so the finally block can always log a result,
+        # even if the worker fails before the harness step runs.
+        instance_id = None
+        resolved = None
+        exit_reason = None
+        try:
+            # Validate model
+            if not req.model:
+                raise ValueError("model is required")
+
+            if not req.metadata or not req.metadata.row_id:
+                raise ValueError("metadata.row_id is required and must be an integer index as string, e.g. 
'0'") + try: + single_index = int(str(req.metadata.row_id)) + except ValueError: + raise ValueError(f"row_id must be an integer index for --single, got: {req.metadata.row_id}") + env = os.environ.copy() + # Build environment for subprocess + if "FIREWORKS_API_KEY" in os.environ: + env["FIREWORKS_API_KEY"] = os.environ["FIREWORKS_API_KEY"] + # Make sure the tracing model module is importable by the subprocess + # so "tracing_model.TracingFireworksModel" can be imported + from pathlib import Path + + script_dir = Path(__file__).parent + env["PYTHONPATH"] = f"{script_dir}:{env.get('PYTHONPATH', '')}" + + # Sandbox by invocation_id to isolate concurrent test runs + from pathlib import Path + + invocation_id = req.metadata.invocation_id + base_dir = Path(os.getcwd()) / invocation_id + base_dir.mkdir(parents=True, exist_ok=True) + + script_path = str((Path(__file__).parent / "run_swe_agent_fw.py").resolve()) + + # Extract model_kwargs from req.metadata (forwarded from input_metadata) + model_kwargs = {} + # convert to logger.debug everywhere, remove debug then + logger.debug(f"req.metadata attributes: {dir(req.metadata)}") + + if hasattr(req.metadata, "model_kwargs"): + mk = getattr(req.metadata, "model_kwargs", None) + logger.debug(f"Found req.metadata.model_kwargs = {mk}") + if isinstance(mk, dict): + model_kwargs = mk + logger.debug(f"Extracted model_kwargs from metadata: {model_kwargs}") + else: + logger.debug("req.metadata has NO model_kwargs attribute") + + # Set tracing URL + if req.model_base_url: + env["TRACING_BASE_URL"] = req.model_base_url + + cmd = [ + "python3", + script_path, + req.model, + "--single", + str(single_index), + "--exit-immediately", + "--output", + str(base_dir), + "--model-class", + "tracing_model.TracingFireworksModel", + ] + # Forward model kwargs as CLI flags to the wrapper + if model_kwargs.get("reasoning") in ("low", "medium", "high"): + cmd.extend(["--reasoning", str(model_kwargs["reasoning"])]) + if model_kwargs.get("temperature") is not None: + cmd.extend(["--temperature", str(model_kwargs["temperature"])]) + if model_kwargs.get("max_tokens") is not None: + cmd.extend(["--max-tokens", str(model_kwargs["max_tokens"])]) + import json + + # Log path inside row directory for this run + row_dir = base_dir / f"row_{single_index}" + row_dir.mkdir(parents=True, exist_ok=True) + log_path = row_dir / f"agent_{single_index}.log" + + # Run without streaming; write all output to a log file; wait until completion + with open(log_path, "w") as lf: + proc = subprocess.Popen( + cmd, + env=env, + stdout=lf, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + ret = proc.wait() + + # Use row-specific preds.json to avoid cross-run interference + preds_path = row_dir / "preds.json" + if preds_path.exists(): + logger.info(f"Using preds.json at: {preds_path}") + else: + logger.error(f"No preds.json found at {preds_path}") + + # 2) Run SWE-bench evaluation harness on preds.json + preds_path_str = str(preds_path) + unique_run_id = f"eval-{invocation_id}" + eval_cmd = [ + "python3", + "-m", + "swebench.harness.run_evaluation", + "--dataset_name", + "princeton-nlp/SWE-bench_Verified", + "--predictions_path", + preds_path_str, + "--max_workers", + str(os.getenv("SWEBENCH_EVAL_WORKERS", "5")), + "--run_id", + unique_run_id, + ] + logger.info("Starting SWE-bench harness: %s", " ".join(map(str, eval_cmd))) + eval_proc = subprocess.Popen( + eval_cmd, cwd=str(row_dir), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1 + ) + assert eval_proc.stdout is not None + 
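+            # Stream the harness output line by line into the rollout-tagged logger
+            # so it is visible in the Eval Protocol UI.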
+            for line in eval_proc.stdout:
+                logger.info(line.rstrip("\n"))
+            eval_rc = eval_proc.wait()
+            logger.info(f"SWE-bench harness exited with code {eval_rc}")
+
+            # Collect evaluation results to send via Elasticsearch
+            import yaml
+
+            instance_id = None
+            resolved = None
+            exit_reason = None
+
+            if preds_path.exists():
+                try:
+                    preds = json.loads(preds_path.read_text())
+                    instance_id = next(iter(preds.keys()), None)
+                except Exception:
+                    pass
+
+            if instance_id:
+                model_id = req.model
+                if model_id:
+                    safe_model = model_id.replace("/", "__").replace(":", "-")
+                    # The harness writes reports under logs/run_evaluation/<run_id>/<model>/<instance>/
+                    report_path = (
+                        row_dir / "logs" / "run_evaluation" / unique_run_id / safe_model / instance_id / "report.json"
+                    )
+
+                    if report_path.exists():
+                        try:
+                            report_data = json.loads(report_path.read_text())
+                            resolved = bool(report_data.get(instance_id, {}).get("resolved", False))
+                        except Exception:
+                            pass
+
+                    if resolved is None:
+                        exit_files = sorted(row_dir.glob("exit_statuses_*.yaml"))
+                        if exit_files:
+                            try:
+                                status_doc = yaml.safe_load(exit_files[-1].read_text()) or {}
+                                by_status = status_doc.get("instances_by_exit_status", {})
+                                for status_name, ids in by_status.items():
+                                    if instance_id in (ids or []):
+                                        resolved = False
+                                        exit_reason = status_name
+                                        break
+                            except Exception:
+                                pass
+
+            results_data = {
+                "instance_id": instance_id,
+                "resolved": resolved,
+                "exit_reason": exit_reason,
+                "row_id": str(single_index),
+            }
+
+        except Exception as e:
+            # Best-effort: mark error but still finish to unblock polling
+            results_data = {"error": str(e), "row_id": str(single_index)}
+            logger.error(f"Rollout error: {e}", extra={"status": Status.rollout_error(str(e))})
+        finally:
+            # Create and log EvaluateResult in standardized format
+            from eval_protocol.models import EvaluateResult, MetricResult
+
+            if resolved is not None:
+                reason = f"instance={instance_id}, resolved={resolved}"
+                if exit_reason:
+                    reason += f", exit_reason={exit_reason}"
+
+                eval_result = EvaluateResult(
+                    score=1.0 if resolved else 0.0,
+                    reason=reason,
+                    is_score_valid=True,
+                    metrics={
+                        "resolved": MetricResult(
+                            score=1.0 if resolved else 0.0,
+                            is_score_valid=True,
+                            reason=f"resolved={resolved}",
+                            value=int(resolved),
+                        )
+                    },
+                )
+                logger.info(
+                    f"EVAL_RESULT:{eval_result.model_dump_json()}", extra={"status": Status.rollout_finished()}
+                )
+            else:
+                logger.info("EVAL_RESULT:null", extra={"status": Status.rollout_finished()})
+
+    threading.Thread(target=_worker, daemon=True).start()
+    return {"status": "accepted"}
+
+
+# @app.get("/status")
+# def status(rollout_id: str):
+#     return rollout_states.get(rollout_id, {"terminated": False})
+
+
+def main():
+    host = os.getenv("REMOTE_SERVER_HOST", "127.0.0.1")
+    port = int(os.getenv("REMOTE_SERVER_PORT", "3000"))
+    uvicorn.run(app, host=host, port=port)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py
new file mode 100644
index 00000000..87158ed1
--- /dev/null
+++ b/examples/swebench/tests/test_swebench.py
@@ -0,0 +1,79 @@
+from typing import List
+from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
+from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult
+from eval_protocol.pytest import evaluation_test
+from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor
+from eval_protocol.utils.evaluation_row_utils import create_rows_from_indices
+
+
+def rows() -> List[EvaluationRow]:
+    return create_rows_from_indices(500)  # Full SWE-bench Verified split; max_dataset_rows below limits the run
+
+
+# -------------------- Harness result attachment (UI pass/fail) --------------------
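+# How the pieces fit together: DynamicDataLoader feeds the index-only rows from
+# rows(), RemoteRolloutProcessor hands each row to the local server at
+# http://127.0.0.1:3000 (the POST /init endpoint in server.py), and
+# max_dataset_rows caps how many SWE-bench instances are attempted per run.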
+@evaluation_test( + data_loaders=DynamicDataLoader( + generators=[rows], + ), + max_dataset_rows=2, + rollout_processor=RemoteRolloutProcessor( + remote_base_url="http://127.0.0.1:3000", + model_base_url="https://tracing.fireworks.ai", + timeout_seconds=1800, + disable_elastic_search_setup=True, + ), + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], + max_concurrent_rollouts=3, +) +async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: + """Evaluate SWE-bench instance by reading results from Fireworks tracing logs.""" + import logging + + logger = logging.getLogger(__name__) + + rollout_id = row.execution_metadata.rollout_id + logger.info(f"[DEBUG] Processing rollout_id: {rollout_id}") + + if not rollout_id: + logger.warning("[DEBUG] No rollout_id") + return row + + try: + from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter + + adapter = FireworksTracingAdapter(base_url="https://tracing.fireworks.ai") + logger.info("[DEBUG] Created adapter for https://tracing.fireworks.ai") + + # Fetch logs for this rollout + logger.info(f"[DEBUG] Searching for tag: rollout_id:{rollout_id}") + log_entries = adapter.search_logs(tags=[f"rollout_id:{rollout_id}"], limit=100, hours_back=24) + + logger.info(f"[DEBUG] Received {len(log_entries)} log entries") + if log_entries: + logger.info(f"[DEBUG] Sample messages: {[e.get('message', '')[:50] for e in log_entries[:3]]}") + + # Find EVAL_RESULT message + found = False + for entry in log_entries: + message = entry.get("message", "") + if message.startswith("EVAL_RESULT:"): + logger.info("[DEBUG] Found EVAL_RESULT message!") + result_json = message.replace("EVAL_RESULT:", "") + logger.info(f"[DEBUG] Parsing JSON: {result_json[:100]}...") + + if result_json != "null": + row.evaluation_result = EvaluateResult.model_validate_json(result_json) + logger.info( + f"[DEBUG] Attached result: score={row.evaluation_result.score}, reason={row.evaluation_result.reason}" + ) + found = True + break + + if not found: + logger.warning(f"[DEBUG] No EVAL_RESULT message found in {len(log_entries)} logs") + + except Exception as e: + logger.error(f"[DEBUG] Exception: {e}", exc_info=True) + + logger.info(f"[DEBUG] Returning row, has evaluation_result: {row.evaluation_result is not None}") + return row diff --git a/examples/swebench/tracing_model.py b/examples/swebench/tracing_model.py new file mode 100644 index 00000000..11375fc0 --- /dev/null +++ b/examples/swebench/tracing_model.py @@ -0,0 +1,192 @@ +""" +Custom model classes for integrating mini-swe-agent with eval-protocol's tracing infrastructure. + +## Why This File Exists + +mini-swe-agent is an autonomous agent that makes 20-100+ LLM API calls per SWE-bench instance +(e.g., reading files, editing code, running tests). To debug agent behavior and display results +in eval-protocol's UI, we need to capture and analyze every LLM call. + +This file bridges mini-swe-agent (which uses LitellmModel) with the Fireworks tracing proxy +(which requires specific URL patterns and SDK usage). 
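+At a high level: mini-swe-agent calls TracingFireworksModel._query, which sends the
+request with the OpenAI SDK to the proxy at TRACING_BASE_URL; the proxy forwards it
+to Fireworks while recording the call for the eval-protocol UI.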
+ +## Problem Without This File + +By default, mini-swe-agent would: +- Call Fireworks API directly (no tracing) +- Agent conversations invisible in eval-protocol UI +- Can't debug why agent failed +- No cost tracking per call +- Model names get mangled by litellm routing + +## What These Classes Do + +### FireworksCompatibleModel (Base) +- Extends mini-swe-agent's LitellmModel +- Handles Fireworks API compatibility: + * Strips non-standard message fields that Fireworks API rejects + * Adds stop sequences to prevent common agent failure modes + * Applies temperature/reasoning overrides from wrapper script +- Used when tracing isn't needed (direct Fireworks API calls) + +### TracingFireworksModel (For eval-protocol integration) +- Extends FireworksCompatibleModel +- Routes ALL LLM calls through Fireworks tracing proxy instead of direct API +- Uses OpenAI SDK (not litellm) to preserve full model names +""" + +import sys +import os + +from minisweagent.models.litellm_model import LitellmModel + + +class FireworksCompatibleModel(LitellmModel): + """ + Fireworks-compatible wrapper for LitellmModel. + """ + + def __init__(self, **kwargs): + model_id = os.environ.get("FIREWORKS_MODEL_ID") + if model_id: + kwargs["model_name"] = model_id + + if "model_kwargs" not in kwargs: + kwargs["model_kwargs"] = {} + + # CRITICAL: Set drop_params to False so stop sequences aren't stripped! + kwargs["model_kwargs"]["drop_params"] = False + + # Get existing stop sequences + existing_stop = kwargs["model_kwargs"].get("stop", []) + if isinstance(existing_stop, str): + existing_stop = [existing_stop] + elif existing_stop is None: + existing_stop = [] + + # Add stop sequences (only the non-natural ones) + # stop_sequences = existing_stop + [ + # # ASCII versions + # "<|User|>", + # "<|Assistant|>", + # # Full-width PIPE versions (U+FF5C) + # "<|User|>", # \uff5c + # "<|Assistant|>", + # "```<|", + # "<|User", + # "<|Ass", + # # Full-width LETTER L versions (U+FF4C) + # "<lUser|>", # \uff4c + # "<lAssistant|>", + # "```<l", + # "<lUser", + # "<lAss", + # ] + # kwargs["model_kwargs"]["stop"] = stop_sequences + kwargs["model_kwargs"]["max_tokens"] = 1024 # Reduce to 1024 to save tokens + + if "temperature" not in kwargs["model_kwargs"]: + kwargs["model_kwargs"]["temperature"] = 0.0 + + # Apply per-run overrides injected by the wrapper (no environment variables) + overrides = globals().get("WRAPPER_MODEL_OVERRIDES") + if isinstance(overrides, dict): + if overrides.get("reasoning") in ("low", "medium", "high"): + kwargs["model_kwargs"]["reasoning_effort"] = overrides["reasoning"] + if overrides.get("temperature") is not None: + try: + kwargs["model_kwargs"]["temperature"] = float(overrides["temperature"]) + except Exception: + pass + if overrides.get("max_tokens") is not None: + try: + kwargs["model_kwargs"]["max_tokens"] = int(overrides["max_tokens"]) + except Exception: + pass + + super().__init__(**kwargs) + + def _query(self, messages: list[dict[str, str]], **kwargs): + """Remove non-standard fields before sending to Fireworks API.""" + # Keep only standard OpenAI-compatible fields + clean_messages = [] + for msg in messages: + clean_msg = {"role": msg["role"], "content": msg["content"]} + if "tool_calls" in msg: + clean_msg["tool_calls"] = msg["tool_calls"] + if "name" in msg: + clean_msg["name"] = msg["name"] + clean_messages.append(clean_msg) + + # IMPORTANT: Ensure drop_params stays False in the actual query + kwargs_with_stop = kwargs.copy() + if "drop_params" not in kwargs_with_stop: + 
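+            # litellm's drop_params=True would silently strip parameters it thinks the
+            # provider does not support (including the stop list), so keep it False here.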
kwargs_with_stop["drop_params"] = False + + return super()._query(clean_messages, **kwargs_with_stop) + + +class TracingFireworksModel(FireworksCompatibleModel): + """Routes LLM calls through tracing using OpenAI SDK (preserves model name).""" + + def _query(self, messages, **kwargs): + """Use OpenAI SDK directly to preserve model name through tracing.""" + from openai import OpenAI + import traceback + + tracing_url = os.environ.get("TRACING_BASE_URL", "") + api_key = os.environ.get("FIREWORKS_API_KEY", "") + + if not tracing_url: + print("⚠️ No TRACING_BASE_URL - using parent litellm") + return super()._query(messages, **kwargs) + + print("\n🔗 OpenAI SDK Call:") + print(f" URL: {tracing_url[:60]}...") + print(f" Model: {self.config.model_name}") + + try: + client = OpenAI(base_url=tracing_url, api_key=api_key) + + # Build OpenAI-compatible params + openai_kwargs = {} + if self.config.model_kwargs.get("stop"): + openai_kwargs["stop"] = self.config.model_kwargs["stop"] + print(f" Stop sequences: {len(openai_kwargs['stop'])}") + if self.config.model_kwargs.get("max_tokens"): + openai_kwargs["max_tokens"] = self.config.model_kwargs["max_tokens"] + if self.config.model_kwargs.get("temperature") is not None: + openai_kwargs["temperature"] = self.config.model_kwargs["temperature"] + + # CRITICAL: Clean messages - remove 'extra' fields that OpenAI API doesn't accept! + clean_messages = [] + for msg in messages: + clean_msg = {"role": msg["role"], "content": msg["content"]} + # Preserve standard fields only + if "name" in msg: + clean_msg["name"] = msg["name"] + if "tool_calls" in msg: + clean_msg["tool_calls"] = msg["tool_calls"] + clean_messages.append(clean_msg) + + print(f" Messages: {len(clean_messages)} (cleaned)") + print(" Making call...") + + # OpenAI SDK call + response = client.chat.completions.create( + model=self.config.model_name, + messages=clean_messages, # ← Use cleaned messages! + **openai_kwargs, + ) + + print(" ✅ Call succeeded!") + print(f" Response ID: {response.id}") + print(f" Tokens: {response.usage.total_tokens if response.usage else 'N/A'}\n") + + return response + + except Exception as e: + print("\n❌ ERROR in TracingFireworksModel._query:") + print(f" {type(e).__name__}: {e}") + traceback.print_exc() + raise