From 71f416584ff1dbe7772dd02acce36a00e0744ee0 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Wed, 15 Oct 2025 14:33:24 -0700 Subject: [PATCH 01/10] swe-bench --- examples/swebench/README.md | 57 ++ ...works__deployments__r5dfiiwp.eval-run.json | 521 ++++++++++++++++++ examples/swebench/run_swe_agent_fw.py | 347 ++++++++++++ examples/swebench/server.py | 169 ++++++ examples/swebench/tests/conftest.py | 32 ++ examples/swebench/tests/test_swebench.py | 250 +++++++++ examples/swebench/tracing_model.py | 75 +++ 7 files changed, 1451 insertions(+) create mode 100644 examples/swebench/README.md create mode 100644 examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json create mode 100755 examples/swebench/run_swe_agent_fw.py create mode 100644 examples/swebench/server.py create mode 100644 examples/swebench/tests/conftest.py create mode 100644 examples/swebench/tests/test_swebench.py create mode 100644 examples/swebench/tracing_model.py diff --git a/examples/swebench/README.md b/examples/swebench/README.md new file mode 100644 index 00000000..04993e02 --- /dev/null +++ b/examples/swebench/README.md @@ -0,0 +1,57 @@ +SWE-bench (Remote) - Local (non-Docker) Setup and Usage + +Prerequisites +- Python 3.12 environment (same one you use for this repo) +- Fireworks API key +- mini-swe-agent and datasets (for patch generation) +- SWE-bench harness installed (for evaluation) + +Setup mini-swe-agent (non-Docker) +1) Install dependencies +```bash +pip install mini-swe-agent datasets +``` + +2) Configure API key for mini-swe-agent +```bash +mini-extra config set FIREWORKS_API_KEY +``` + +3) (Optional) Test connectivity +```bash +python3 examples/swebench/run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905 --test +``` + +Install SWE-bench evaluation harness +```bash +git clone https://github.com/princeton-nlp/SWE-bench +pip install -e SWE-bench +``` + +Environment +```bash +export FIREWORKS_API_KEY="" +``` + +Run the server +```bash +python examples/swebench/server.py +``` + +What the server does +- Invokes `run_swe_agent_fw.py` in batch mode with a single-slice per request +- Writes outputs to a per-row directory: `./row_{index}/` + - `row_{index}/preds.json` + - `row_{index}//.traj.json` +- Runs the SWE-bench harness on `row_{index}/preds.json` + +Run pytest to evaluate a model on SWE-bench +```bash +cd /Users/shrey/Documents/python-sdk +pytest examples/swebench/tests/test_swebench.py -v -s +``` + +Notes +- The test currently generates 10 rows by numeric index (0–9) +- Each request triggers the server to run one SWE-bench instance and write to its own `row_{index}` +- Control harness workers via: `export SWEBENCH_EVAL_WORKERS=5` \ No newline at end of file diff --git a/examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json b/examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json new file mode 100644 index 00000000..a9e10524 --- /dev/null +++ b/examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json @@ -0,0 +1,521 @@ +{ + "total_instances": 500, + "submitted_instances": 1, + "completed_instances": 0, + "resolved_instances": 0, + "unresolved_instances": 0, + "empty_patch_instances": 1, + "error_instances": 0, + "completed_ids": [], + "incomplete_ids": [ + "astropy__astropy-12907", + 
"astropy__astropy-13033", + "astropy__astropy-13236", + "astropy__astropy-13398", + "astropy__astropy-13579", + "astropy__astropy-13977", + "astropy__astropy-14096", + "astropy__astropy-14182", + "astropy__astropy-14309", + "astropy__astropy-14365", + "astropy__astropy-14369", + "astropy__astropy-14508", + "astropy__astropy-14539", + "astropy__astropy-14598", + "astropy__astropy-14995", + "astropy__astropy-7166", + "astropy__astropy-7336", + "astropy__astropy-7606", + "astropy__astropy-7671", + "astropy__astropy-8707", + "astropy__astropy-8872", + "django__django-10097", + "django__django-10554", + "django__django-10880", + "django__django-10914", + "django__django-10973", + "django__django-10999", + "django__django-11066", + "django__django-11087", + "django__django-11095", + "django__django-11099", + "django__django-11119", + "django__django-11133", + "django__django-11138", + "django__django-11141", + "django__django-11149", + "django__django-11163", + "django__django-11179", + "django__django-11206", + "django__django-11211", + "django__django-11239", + "django__django-11265", + "django__django-11276", + "django__django-11292", + "django__django-11299", + "django__django-11333", + "django__django-11400", + "django__django-11433", + "django__django-11451", + "django__django-11477", + "django__django-11490", + "django__django-11532", + "django__django-11551", + "django__django-11555", + "django__django-11603", + "django__django-11728", + "django__django-11734", + "django__django-11740", + "django__django-11749", + "django__django-11790", + "django__django-11815", + "django__django-11820", + "django__django-11848", + "django__django-11880", + "django__django-11885", + "django__django-11951", + "django__django-11964", + "django__django-11999", + "django__django-12039", + "django__django-12050", + "django__django-12125", + "django__django-12143", + "django__django-12155", + "django__django-12193", + "django__django-12209", + "django__django-12262", + "django__django-12273", + "django__django-12276", + "django__django-12304", + "django__django-12308", + "django__django-12325", + "django__django-12406", + "django__django-12419", + "django__django-12663", + "django__django-12708", + "django__django-12713", + "django__django-12741", + "django__django-12754", + "django__django-12774", + "django__django-12858", + "django__django-12965", + "django__django-13012", + "django__django-13023", + "django__django-13028", + "django__django-13033", + "django__django-13089", + "django__django-13109", + "django__django-13112", + "django__django-13121", + "django__django-13128", + "django__django-13158", + "django__django-13195", + "django__django-13212", + "django__django-13279", + "django__django-13297", + "django__django-13315", + "django__django-13343", + "django__django-13344", + "django__django-13346", + "django__django-13363", + "django__django-13401", + "django__django-13406", + "django__django-13410", + "django__django-13417", + "django__django-13449", + "django__django-13512", + "django__django-13513", + "django__django-13516", + "django__django-13551", + "django__django-13568", + "django__django-13569", + "django__django-13590", + "django__django-13658", + "django__django-13670", + "django__django-13741", + "django__django-13786", + "django__django-13794", + "django__django-13807", + "django__django-13809", + "django__django-13810", + "django__django-13820", + "django__django-13821", + "django__django-13837", + "django__django-13925", + "django__django-13933", + "django__django-13964", + 
"django__django-14007", + "django__django-14011", + "django__django-14017", + "django__django-14034", + "django__django-14053", + "django__django-14089", + "django__django-14122", + "django__django-14140", + "django__django-14155", + "django__django-14170", + "django__django-14238", + "django__django-14311", + "django__django-14315", + "django__django-14349", + "django__django-14351", + "django__django-14373", + "django__django-14376", + "django__django-14404", + "django__django-14434", + "django__django-14493", + "django__django-14500", + "django__django-14534", + "django__django-14539", + "django__django-14559", + "django__django-14580", + "django__django-14608", + "django__django-14631", + "django__django-14672", + "django__django-14725", + "django__django-14752", + "django__django-14765", + "django__django-14771", + "django__django-14787", + "django__django-14792", + "django__django-14855", + "django__django-14915", + "django__django-14999", + "django__django-15022", + "django__django-15037", + "django__django-15098", + "django__django-15103", + "django__django-15104", + "django__django-15127", + "django__django-15128", + "django__django-15161", + "django__django-15252", + "django__django-15268", + "django__django-15277", + "django__django-15278", + "django__django-15280", + "django__django-15315", + "django__django-15368", + "django__django-15375", + "django__django-15380", + "django__django-15382", + "django__django-15467", + "django__django-15499", + "django__django-15503", + "django__django-15525", + "django__django-15554", + "django__django-15561", + "django__django-15563", + "django__django-15569", + "django__django-15572", + "django__django-15629", + "django__django-15695", + "django__django-15731", + "django__django-15732", + "django__django-15741", + "django__django-15814", + "django__django-15851", + "django__django-15863", + "django__django-15916", + "django__django-15930", + "django__django-15957", + "django__django-15973", + "django__django-15987", + "django__django-16032", + "django__django-16082", + "django__django-16100", + "django__django-16116", + "django__django-16136", + "django__django-16139", + "django__django-16145", + "django__django-16255", + "django__django-16256", + "django__django-16263", + "django__django-16315", + "django__django-16333", + "django__django-16429", + "django__django-16454", + "django__django-16485", + "django__django-16493", + "django__django-16502", + "django__django-16527", + "django__django-16560", + "django__django-16569", + "django__django-16595", + "django__django-16612", + "django__django-16631", + "django__django-16642", + "django__django-16661", + "django__django-16662", + "django__django-16667", + "django__django-16801", + "django__django-16819", + "django__django-16877", + "django__django-16899", + "django__django-16901", + "django__django-16938", + "django__django-16950", + "django__django-17029", + "django__django-17084", + "django__django-17087", + "django__django-7530", + "django__django-9296", + "matplotlib__matplotlib-13989", + "matplotlib__matplotlib-14623", + "matplotlib__matplotlib-20488", + "matplotlib__matplotlib-20676", + "matplotlib__matplotlib-20826", + "matplotlib__matplotlib-20859", + "matplotlib__matplotlib-21568", + "matplotlib__matplotlib-22719", + "matplotlib__matplotlib-22865", + "matplotlib__matplotlib-22871", + "matplotlib__matplotlib-23299", + "matplotlib__matplotlib-23314", + "matplotlib__matplotlib-23412", + "matplotlib__matplotlib-23476", + "matplotlib__matplotlib-24026", + 
"matplotlib__matplotlib-24149", + "matplotlib__matplotlib-24177", + "matplotlib__matplotlib-24570", + "matplotlib__matplotlib-24627", + "matplotlib__matplotlib-24637", + "matplotlib__matplotlib-24870", + "matplotlib__matplotlib-24970", + "matplotlib__matplotlib-25122", + "matplotlib__matplotlib-25287", + "matplotlib__matplotlib-25311", + "matplotlib__matplotlib-25332", + "matplotlib__matplotlib-25479", + "matplotlib__matplotlib-25775", + "matplotlib__matplotlib-25960", + "matplotlib__matplotlib-26113", + "matplotlib__matplotlib-26208", + "matplotlib__matplotlib-26291", + "matplotlib__matplotlib-26342", + "matplotlib__matplotlib-26466", + "mwaskom__seaborn-3069", + "mwaskom__seaborn-3187", + "pallets__flask-5014", + "psf__requests-1142", + "psf__requests-1724", + "psf__requests-1766", + "psf__requests-1921", + "psf__requests-2317", + "psf__requests-2931", + "psf__requests-5414", + "psf__requests-6028", + "pydata__xarray-2905", + "pydata__xarray-3095", + "pydata__xarray-3151", + "pydata__xarray-3305", + "pydata__xarray-3677", + "pydata__xarray-3993", + "pydata__xarray-4075", + "pydata__xarray-4094", + "pydata__xarray-4356", + "pydata__xarray-4629", + "pydata__xarray-4687", + "pydata__xarray-4695", + "pydata__xarray-4966", + "pydata__xarray-6461", + "pydata__xarray-6599", + "pydata__xarray-6721", + "pydata__xarray-6744", + "pydata__xarray-6938", + "pydata__xarray-6992", + "pydata__xarray-7229", + "pydata__xarray-7233", + "pydata__xarray-7393", + "pylint-dev__pylint-4551", + "pylint-dev__pylint-4604", + "pylint-dev__pylint-4661", + "pylint-dev__pylint-4970", + "pylint-dev__pylint-6386", + "pylint-dev__pylint-6528", + "pylint-dev__pylint-6903", + "pylint-dev__pylint-7080", + "pylint-dev__pylint-7277", + "pylint-dev__pylint-8898", + "pytest-dev__pytest-10051", + "pytest-dev__pytest-10081", + "pytest-dev__pytest-10356", + "pytest-dev__pytest-5262", + "pytest-dev__pytest-5631", + "pytest-dev__pytest-5787", + "pytest-dev__pytest-5809", + "pytest-dev__pytest-5840", + "pytest-dev__pytest-6197", + "pytest-dev__pytest-6202", + "pytest-dev__pytest-7205", + "pytest-dev__pytest-7236", + "pytest-dev__pytest-7324", + "pytest-dev__pytest-7432", + "pytest-dev__pytest-7490", + "pytest-dev__pytest-7521", + "pytest-dev__pytest-7571", + "pytest-dev__pytest-7982", + "pytest-dev__pytest-8399", + "scikit-learn__scikit-learn-10297", + "scikit-learn__scikit-learn-10844", + "scikit-learn__scikit-learn-10908", + "scikit-learn__scikit-learn-11310", + "scikit-learn__scikit-learn-11578", + "scikit-learn__scikit-learn-12585", + "scikit-learn__scikit-learn-12682", + "scikit-learn__scikit-learn-12973", + "scikit-learn__scikit-learn-13124", + "scikit-learn__scikit-learn-13135", + "scikit-learn__scikit-learn-13142", + "scikit-learn__scikit-learn-13328", + "scikit-learn__scikit-learn-13439", + "scikit-learn__scikit-learn-13496", + "scikit-learn__scikit-learn-13779", + "scikit-learn__scikit-learn-14053", + "scikit-learn__scikit-learn-14087", + "scikit-learn__scikit-learn-14141", + "scikit-learn__scikit-learn-14496", + "scikit-learn__scikit-learn-14629", + "scikit-learn__scikit-learn-14710", + "scikit-learn__scikit-learn-14894", + "scikit-learn__scikit-learn-14983", + "scikit-learn__scikit-learn-15100", + "scikit-learn__scikit-learn-25102", + "scikit-learn__scikit-learn-25232", + "scikit-learn__scikit-learn-25747", + "scikit-learn__scikit-learn-25931", + "scikit-learn__scikit-learn-25973", + "scikit-learn__scikit-learn-26194", + "scikit-learn__scikit-learn-26323", + "scikit-learn__scikit-learn-9288", + "sphinx-doc__sphinx-10323", 
+ "sphinx-doc__sphinx-10435", + "sphinx-doc__sphinx-10449", + "sphinx-doc__sphinx-10466", + "sphinx-doc__sphinx-10614", + "sphinx-doc__sphinx-10673", + "sphinx-doc__sphinx-11445", + "sphinx-doc__sphinx-11510", + "sphinx-doc__sphinx-7440", + "sphinx-doc__sphinx-7454", + "sphinx-doc__sphinx-7462", + "sphinx-doc__sphinx-7590", + "sphinx-doc__sphinx-7748", + "sphinx-doc__sphinx-7757", + "sphinx-doc__sphinx-7889", + "sphinx-doc__sphinx-7910", + "sphinx-doc__sphinx-7985", + "sphinx-doc__sphinx-8035", + "sphinx-doc__sphinx-8056", + "sphinx-doc__sphinx-8120", + "sphinx-doc__sphinx-8265", + "sphinx-doc__sphinx-8269", + "sphinx-doc__sphinx-8459", + "sphinx-doc__sphinx-8475", + "sphinx-doc__sphinx-8548", + "sphinx-doc__sphinx-8551", + "sphinx-doc__sphinx-8593", + "sphinx-doc__sphinx-8595", + "sphinx-doc__sphinx-8621", + "sphinx-doc__sphinx-8638", + "sphinx-doc__sphinx-8721", + "sphinx-doc__sphinx-9229", + "sphinx-doc__sphinx-9230", + "sphinx-doc__sphinx-9258", + "sphinx-doc__sphinx-9281", + "sphinx-doc__sphinx-9320", + "sphinx-doc__sphinx-9367", + "sphinx-doc__sphinx-9461", + "sphinx-doc__sphinx-9591", + "sphinx-doc__sphinx-9602", + "sphinx-doc__sphinx-9658", + "sphinx-doc__sphinx-9673", + "sphinx-doc__sphinx-9698", + "sphinx-doc__sphinx-9711", + "sympy__sympy-11618", + "sympy__sympy-12096", + "sympy__sympy-12419", + "sympy__sympy-12481", + "sympy__sympy-12489", + "sympy__sympy-13031", + "sympy__sympy-13091", + "sympy__sympy-13372", + "sympy__sympy-13480", + "sympy__sympy-13551", + "sympy__sympy-13615", + "sympy__sympy-13647", + "sympy__sympy-13757", + "sympy__sympy-13798", + "sympy__sympy-13852", + "sympy__sympy-13877", + "sympy__sympy-13878", + "sympy__sympy-13974", + "sympy__sympy-14248", + "sympy__sympy-14531", + "sympy__sympy-14711", + "sympy__sympy-14976", + "sympy__sympy-15017", + "sympy__sympy-15345", + "sympy__sympy-15349", + "sympy__sympy-15599", + "sympy__sympy-15809", + "sympy__sympy-15875", + "sympy__sympy-15976", + "sympy__sympy-16450", + "sympy__sympy-16597", + "sympy__sympy-16766", + "sympy__sympy-16792", + "sympy__sympy-16886", + "sympy__sympy-17139", + "sympy__sympy-17318", + "sympy__sympy-17630", + "sympy__sympy-17655", + "sympy__sympy-18189", + "sympy__sympy-18199", + "sympy__sympy-18211", + "sympy__sympy-18698", + "sympy__sympy-18763", + "sympy__sympy-19040", + "sympy__sympy-19346", + "sympy__sympy-19495", + "sympy__sympy-19637", + "sympy__sympy-19783", + "sympy__sympy-19954", + "sympy__sympy-20154", + "sympy__sympy-20428", + "sympy__sympy-20438", + "sympy__sympy-20590", + "sympy__sympy-20801", + "sympy__sympy-20916", + "sympy__sympy-21379", + "sympy__sympy-21596", + "sympy__sympy-21612", + "sympy__sympy-21847", + "sympy__sympy-21930", + "sympy__sympy-22080", + "sympy__sympy-22456", + "sympy__sympy-22714", + "sympy__sympy-22914", + "sympy__sympy-23262", + "sympy__sympy-23413", + "sympy__sympy-23534", + "sympy__sympy-23824", + "sympy__sympy-23950", + "sympy__sympy-24066", + "sympy__sympy-24213", + "sympy__sympy-24443", + "sympy__sympy-24539", + "sympy__sympy-24562", + "sympy__sympy-24661" + ], + "empty_patch_ids": [ + "astropy__astropy-13453" + ], + "submitted_ids": [ + "astropy__astropy-13453" + ], + "resolved_ids": [], + "unresolved_ids": [], + "error_ids": [], + "schema_version": 2 +} diff --git a/examples/swebench/run_swe_agent_fw.py b/examples/swebench/run_swe_agent_fw.py new file mode 100755 index 00000000..f1ae1a51 --- /dev/null +++ b/examples/swebench/run_swe_agent_fw.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python3 +""" +Fireworks-compatible wrapper for mini-swe-agent 
SWE-bench evaluations. + +This script handles Fireworks API compatibility by stripping non-standard fields +that mini-swe-agent adds for internal tracking. + +Requires fully qualified Fireworks model paths: +- Serverless models: fireworks_ai/accounts/fireworks/models/{model_name} +- Deployed models: fireworks_ai/accounts/{account}/deployedModels/{model_name} + +Usage: + python run_swe_agent_fw.py [options] + +Examples: + # Serverless models + python run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/kimi-k2-instruct --instances 10 --workers 5 + python run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/llama-v3p1-70b-instruct --subset full --workers 8 + + # Deployed models + python run_swe_agent_fw.py fireworks_ai/accounts/cognition/deployedModels/swe-1-mtp-tc1huggf --single 0 + python run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/kimi-k2-instruct --test + +Requirements: + - mini-swe-agent: pip install mini-swe-agent + - Fireworks API key: Set via 'mini-extra config set FIREWORKS_API_KEY ' +""" + +import argparse +import os +import sys +import subprocess +import tempfile +from pathlib import Path +from typing import Any + +# Import required dependencies +from minisweagent.models.litellm_model import LitellmModel, LitellmModelConfig +import litellm + + +class FireworksCompatibleModel(LitellmModel): + """ + Fireworks-compatible wrapper for LitellmModel. + """ + + def __init__(self, **kwargs): + if model_id := os.environ.get('FIREWORKS_MODEL_ID'): + kwargs['model_name'] = model_id + print(f"kwargs: {kwargs}") + if 'model_kwargs' not in kwargs: + kwargs['model_kwargs'] = {} + + # CRITICAL: Set drop_params to False so stop sequences aren't stripped! + kwargs['model_kwargs']['drop_params'] = False + + # Get existing stop sequences + existing_stop = kwargs['model_kwargs'].get('stop', []) + if isinstance(existing_stop, str): + existing_stop = [existing_stop] + elif existing_stop is None: + existing_stop = [] + + # Add stop sequences (only the non-natural ones) + stop_sequences = existing_stop + [ + # ASCII versions + "<|User|>", + "<|Assistant|>", + + # Full-width PIPE versions (U+FF5C) + "<|User|>", # \uff5c + "<|Assistant|>", + "```<|", + "<|User", + "<|Ass", + + # Full-width LETTER L versions (U+FF4C) + "<lUser|>", # \uff4c + "<lAssistant|>", + "```<l", + "<lUser", + "<lAss", + ] + kwargs['model_kwargs']['stop'] = stop_sequences + kwargs['model_kwargs']['max_tokens'] = 1024 # Reduce to 1024 to save tokens + + if 'temperature' not in kwargs['model_kwargs']: + kwargs['model_kwargs']['temperature'] = 0.0 + + # Apply per-run overrides injected by the wrapper (no environment variables) + overrides = globals().get('WRAPPER_MODEL_OVERRIDES') + if isinstance(overrides, dict): + if overrides.get('reasoning') in ('low', 'medium', 'high'): + kwargs['model_kwargs']['reasoning_effort'] = overrides['reasoning'] + if overrides.get('temperature') is not None: + try: + kwargs['model_kwargs']['temperature'] = float(overrides['temperature']) + except Exception: + pass + if overrides.get('max_tokens') is not None: + try: + kwargs['model_kwargs']['max_tokens'] = int(overrides['max_tokens']) + except Exception: + pass + + super().__init__(**kwargs) + + def _query(self, messages: list[dict[str, str]], **kwargs): + """Remove non-standard fields before sending to Fireworks API.""" + # Keep only standard OpenAI-compatible fields + clean_messages = [] + for msg in messages: + clean_msg = { + "role": msg["role"], + "content": msg["content"] + } + if "tool_calls" in msg: + 
clean_msg["tool_calls"] = msg["tool_calls"] + if "name" in msg: + clean_msg["name"] = msg["name"] + clean_messages.append(clean_msg) + + # IMPORTANT: Ensure drop_params stays False in the actual query + kwargs_with_stop = kwargs.copy() + if 'drop_params' not in kwargs_with_stop: + kwargs_with_stop['drop_params'] = False + + return super()._query(clean_messages, **kwargs_with_stop) + +def __get_api_key(): + """Get Fireworks API key from environment or mini-swe-agent config.""" + # Environment variable takes precedence + if api_key := os.environ.get('FIREWORKS_API_KEY'): + return api_key + + # Try to get API key from mini-swe-agent's config system + try: + from minisweagent.config import get_config + config = get_config() + return config.get('FIREWORKS_API_KEY') + except (ImportError, AttributeError, KeyError): + # Fallback: check common config file locations + config_paths = [ + Path.home() / ".config" / "mini-swe-agent" / ".env", + Path.home() / "Library" / "Application Support" / "mini-swe-agent" / ".env" + ] + + for config_path in config_paths: + if config_path.exists(): + try: + with open(config_path) as f: + for line in f: + if line.startswith('FIREWORKS_API_KEY='): + value = line.split('=', 1)[1].strip() + return value.strip("'\"") + except (IOError, OSError): + continue + + return None + + +def __test_model(model_id): + """Test model connectivity with a simple completion.""" + from litellm import completion + + # Verify API key exists + api_key = __get_api_key() + if not api_key: + print("Error: FIREWORKS_API_KEY not found.") + return False + + # Configure environment for litellm + os.environ['FIREWORKS_API_KEY'] = api_key + # Assume model_id is fully qualified + model_name = model_id + + print(f"Testing model: {model_name}") + + try: + # Send test completion + response = completion( + model=model_name, + messages=[{"role": "user", "content": "Test message. Reply with OK."}], + temperature=0.0, + max_tokens=10 + ) + + print(f"Success. 
Response: {response.choices[0].message.content}")
+        print(f"Tokens used: {response.usage.total_tokens}")
+        return True
+
+    except Exception as e:
+        print(f"Error: {e}")
+        return False
+
+
+def __validate_environment():
+    """Check for required API key."""
+    if not __get_api_key():
+        print("Warning: FIREWORKS_API_KEY not found.")
+        print("Set it with: mini-extra config set FIREWORKS_API_KEY ")
+
+
+
+
+def __build_command(args, wrapper_module_path):
+    """Build mini-swe-agent command with appropriate arguments."""
+    # Construct model class path
+    wrapper_module = wrapper_module_path.stem
+    model_class = f"{wrapper_module}.FireworksCompatibleModel"
+
+    # Base command - assume model_id is fully qualified
+    cmd = [
+        sys.executable,
+        "-m", "minisweagent.run.mini_extra",
+        "swebench-single" if args.single is not None else "swebench",
+        "--model", args.model_id,
+        "--model-class", model_class,
+        "--subset", args.subset,
+        "--split", args.split
+    ]
+    if args.model_class:
+        cmd.extend(["--model-class", args.model_class])
+    print(f"Output: {args.output}")
+    print(args.single)
+    # Mode-specific arguments
+    print(f"Output: {args.output}")
+    print(args.single)
+    # Mode-specific arguments
+    if args.single is not None:
+        # Use batch mode for a single index via slice and write to a per-row directory
+        from pathlib import Path
+        slice_spec = f"{args.single}:{args.single+1}"
+        row_dir = str((Path(args.output) if args.output else Path.cwd()) / f"row_{args.single}")
+        cmd = [
+            sys.executable,
+            "-m", "minisweagent.run.mini_extra",
+            "swebench",
+            "--model", args.model_id,
+            "--model-class", model_class,
+            "--subset", args.subset,
+            "--split", args.split,
+            "--slice", slice_spec,
+            "--output", row_dir,
+        ]
+        if args.model_class:
+            cmd.extend(["--model-class", args.model_class])
+        print(f"DEBUG: Using batch mode with slice {slice_spec}, output={row_dir}")
+    else:
+        if args.instances:
+            cmd.extend(["--slice", f"0:{args.instances}"])
+        cmd.extend(["--workers", str(args.workers), "--output", args.output])
+
+    return cmd
+
+
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Run mini-swe-agent with Fireworks models on SWE-bench',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__
+    )
+
+    # Required model ID
+    parser.add_argument('model_id', help='Fireworks model ID')
+    parser.add_argument('--model-class', type=str, default=None, help='Optional mini-swe-agent model-class')
+    # Execution options
+    parser.add_argument('--instances', type=int, help='Number of instances to run')
+    parser.add_argument('--workers', type=int, default=1, help='Parallel workers (default: 1)')
+    parser.add_argument('--output', help='Output directory')
+    parser.add_argument('--subset', default='verified', choices=['verified', 'lite', 'full'])
+    parser.add_argument('--split', default='test', choices=['dev', 'test'])
+    parser.add_argument('--single', type=int, metavar='INDEX', help='Run single instance')
+    parser.add_argument('--exit-immediately', action='store_true')
+    parser.add_argument('--test', action='store_true', help='Test model connectivity')
+    parser.add_argument('--reasoning', type=str, choices=['low', 'medium', 'high'], default=None, help='Provider-specific reasoning effort')
+    parser.add_argument('--temperature', type=float, default=None, help='Model temperature override')
+    parser.add_argument('--max-tokens', type=int, default=None, help='Max tokens override')
+    args = parser.parse_args()
+
+    # Handle test mode
+    if args.test:
+        sys.exit(0 if __test_model(args.model_id) else 1)
+
+    # Validate 
API key + __validate_environment() + + # Set default output directory + if args.output is None: + safe_model_id = args.model_id.replace("/", "-").replace(":", "-") + script_dir = Path(__file__).parent.resolve() + args.output = str(script_dir / f'swebench-{safe_model_id}-results') + + # Create temporary module for importing FireworksCompatibleModel + with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: + with open(__file__, 'r') as current_file: + f.write(current_file.read()) + # Inject per-run model overrides directly into the temp module + f.write("\n# --- Injected by wrapper: per-run model overrides ---\n") + f.write("WRAPPER_MODEL_OVERRIDES = {\n") + f.write(f" 'reasoning': {repr(args.reasoning)},\n") + f.write(f" 'temperature': {repr(args.temperature)},\n") + f.write(f" 'max_tokens': {repr(args.max_tokens)},\n") + f.write("}\n") + temp_module_path = Path(f.name) + + try: + # Configure environment + env = os.environ.copy() + env['PYTHONPATH'] = f"{temp_module_path.parent}:{env.get('PYTHONPATH', '')}" + # Pass the fully qualified model path to the subprocess + env['FIREWORKS_MODEL_ID'] = args.model_id + + # Ensure API key is passed to subprocess + api_key = __get_api_key() + if api_key: + env['FIREWORKS_API_KEY'] = api_key + + # No environment variables for model kwargs; overrides are injected into the temp module + + # Build command + cmd = __build_command(args, temp_module_path) + + # Display configuration + print(f"Model: {args.model_id}") + print(f"Output: {args.output}") + print(f"Workers: {args.workers}") + if args.instances: + print(f"Instances: {args.instances}") + + # Debug: Show the actual command being run + print(f"Command: {' '.join(cmd)}") + print(f"Model path in command: {cmd[cmd.index('--model') + 1] if '--model' in cmd else 'NOT FOUND'}") + + # Execute mini-swe-agent + subprocess.run(cmd, env=env, check=True) + + finally: + # Clean up temporary module + if temp_module_path.exists(): + temp_module_path.unlink() + + +if __name__ == '__main__': + main() diff --git a/examples/swebench/server.py b/examples/swebench/server.py new file mode 100644 index 00000000..80ddabe4 --- /dev/null +++ b/examples/swebench/server.py @@ -0,0 +1,169 @@ +"""Minimal SWE-bench server - wraps run_swe_agent_fw.py with tracing via model_base_url.""" +import os +import threading +import subprocess +import logging +from fastapi import FastAPI +import uvicorn + +from eval_protocol import Status, InitRequest, ElasticsearchDirectHttpHandler, RolloutIdFilter + +app = FastAPI() + +# Attach Elasticsearch handler to root logger (Eval Protocol UI) +handler = ElasticsearchDirectHttpHandler() +logging.getLogger().addHandler(handler) +rollout_states = {} + +@app.post("/init") +def init(req: InitRequest): + # Allow Eval Protocol to dynamically configure ES endpoint + if req.elastic_search_config: + handler.configure(req.elastic_search_config) + + # Tag all logs for this rollout_id + logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}") + logger.addFilter(RolloutIdFilter(req.metadata.rollout_id)) + + rollout_states[req.metadata.rollout_id] = { + "terminated": False, + "status": "running", + "instance_id": req.metadata.row_id, + } + + def _worker(): + try: + # Validate model + if not req.model: + raise ValueError("model is required") + + + if not req.metadata or not req.metadata.row_id: + raise ValueError("metadata.row_id is required and must be an integer index as string, e.g. 
'0'") + try: + single_index = int(str(req.metadata.row_id)) + except ValueError: + raise ValueError(f"row_id must be an integer index for --single, got: {req.metadata.row_id}") + env = os.environ.copy() + # Build environment for subprocess + if "FIREWORKS_API_KEY" in os.environ: + env["FIREWORKS_API_KEY"] = os.environ["FIREWORKS_API_KEY"] + # Make sure the tracing model module is importable by the subprocess + # so "tracing_model.TracingFireworksModel" can be imported + env["PYTHONPATH"] = "/Users/shrey/Documents/python-sdk/examples/swebench:" + env.get("PYTHONPATH", "") + + # Determine output directory (from env or default) + out_dir = os.getcwd() + + from pathlib import Path + + script_path = str((Path(__file__).parent / "run_swe_agent_fw.py").resolve()) + + # Extract model_kwargs from req.metadata (forwarded from input_metadata) + model_kwargs = {} + logger.info(f"DEBUG: req.metadata attributes: {dir(req.metadata)}") + if hasattr(req.metadata, "model_kwargs"): + mk = getattr(req.metadata, "model_kwargs", None) + logger.info(f"DEBUG: Found req.metadata.model_kwargs = {mk}") + if isinstance(mk, dict): + model_kwargs = mk + logger.info(f"Extracted model_kwargs from metadata: {model_kwargs}") + else: + logger.info(f"DEBUG: req.metadata has NO model_kwargs attribute") + + # Set tracing URL + if req.model_base_url: + env["TRACING_BASE_URL"] = req.model_base_url + + cmd = [ + "python3", + script_path, + req.model, + "--single", str(single_index), + "--exit-immediately", + "--output", str(out_dir), + "--model-class", "tracing_model.TracingFireworksModel", + ] + # Forward model kwargs as CLI flags to the wrapper + if model_kwargs.get("reasoning") in ("low", "medium", "high"): + cmd.extend(["--reasoning", str(model_kwargs["reasoning"])]) + if model_kwargs.get("temperature") is not None: + cmd.extend(["--temperature", str(model_kwargs["temperature"])]) + if model_kwargs.get("max_tokens") is not None: + cmd.extend(["--max-tokens", str(model_kwargs["max_tokens"])]) + import json + # Log path inside row directory for this run + row_dir = Path(out_dir) / f"row_{single_index}" + row_dir.mkdir(parents=True, exist_ok=True) + log_path = row_dir / f"agent_{single_index}.log" + + # Run without streaming; write all output to a log file; wait until completion + with open(log_path, "w") as lf: + proc = subprocess.Popen( + cmd, + env=env, + stdout=lf, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + ret = proc.wait() + + + # Stream stdout/stderr to logs + # assert proc.stdout is not None and proc.stderr is not None + # for line in proc.stdout: + # logger.info(line.rstrip("\n")) + # for line in proc.stderr: + # logger.warning(line.rstrip("\n")) + + # ret = proc.wait() + # logger.info(f"mini-swe-agent exited with code {ret}") + + # Use row-specific preds.json to avoid cross-run interference + preds_path = row_dir / "preds.json" + if preds_path.exists(): + logger.info(f"Using preds.json at: {preds_path}") + else: + logger.error(f"No preds.json found at {preds_path}") + + # 2) Run SWE-bench evaluation harness on preds.json + preds_path_str = str(preds_path) + eval_cmd = [ + "python3", "-m", "swebench.harness.run_evaluation", + "--dataset_name", "princeton-nlp/SWE-bench_Verified", + "--predictions_path", preds_path_str, + "--max_workers", str(os.getenv("SWEBENCH_EVAL_WORKERS", "5")), + "--run_id", "eval-run", + ] + logger.info("Starting SWE-bench harness: %s", " ".join(map(str, eval_cmd))) + eval_proc = subprocess.Popen( + eval_cmd, cwd=str(row_dir), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, 
text=True, bufsize=1 + ) + assert eval_proc.stdout is not None + for line in eval_proc.stdout: + logger.info(line.rstrip("\n")) + eval_rc = eval_proc.wait() + # logger.info(f"SWE-bench harness exited with code {eval_rc}") + + except Exception as e: + # Best-effort: mark error but still finish to unblock polling + logger.error(f"Rollout error: {e}", extra={"status": Status.rollout_error(str(e))}) + finally: + # Always mark finished so RemoteRolloutProcessor stops polling + logger.info("Rollout completed", extra={"status": Status.rollout_finished()}) + + threading.Thread(target=_worker, daemon=True).start() + return {"status": "accepted"} + +@app.get("/status") +def status(rollout_id: str): + return rollout_states.get(rollout_id, {"terminated": False}) + +def main(): + host = os.getenv("REMOTE_SERVER_HOST", "127.0.0.1") + port = int(os.getenv("REMOTE_SERVER_PORT", "3000")) + uvicorn.run(app, host=host, port=port) + +if __name__ == "__main__": + main() diff --git a/examples/swebench/tests/conftest.py b/examples/swebench/tests/conftest.py new file mode 100644 index 00000000..8bca079e --- /dev/null +++ b/examples/swebench/tests/conftest.py @@ -0,0 +1,32 @@ +import os +import pytest + +import os +import pytest + +MODEL_ID_OPT = None +CONCURRENCY_OPT = None +MODEL_KWARGS_OPT = None + +def pytest_addoption(parser): + parser.addoption("--model-id", action="store", default=None, help="Fireworks model ID") + parser.addoption("--concurrent-workers", action="store", type=int, default=None, help="Max concurrent rollouts") + parser.addoption("--temperature", action="store", type=float, default=None, help="Model temperature") + parser.addoption("--max-tokens", action="store", type=int, default=None, help="Max tokens") + parser.addoption("--reasoning", action="store", choices=["low", "medium", "high"], default=None, help="Reasoning effort") + +def pytest_configure(config): + global MODEL_ID_OPT, CONCURRENCY_OPT, MODEL_KWARGS_OPT + MODEL_ID_OPT = config.getoption("--model-id") + CONCURRENCY_OPT = config.getoption("--concurrent-workers") + temp = config.getoption("--temperature") + mtok = config.getoption("--max-tokens") + reas = config.getoption("--reasoning") + mk = {} + if temp is not None: + mk["temperature"] = float(temp) + if mtok is not None: + mk["max_tokens"] = int(mtok) + if reas is not None: + mk["reasoning"] = reas + MODEL_KWARGS_OPT = mk or None \ No newline at end of file diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py new file mode 100644 index 00000000..37fef735 --- /dev/null +++ b/examples/swebench/tests/test_swebench.py @@ -0,0 +1,250 @@ +from typing import List +import os +import pytest +import requests +import yaml +from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader +from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult +from eval_protocol.pytest import evaluation_test +from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor +from eval_protocol.types.remote_rollout_processor import DataLoaderConfig +from eval_protocol.quickstart.utils import filter_longest_conversation +# Reuse the converter used by the built-in adapter +from eval_protocol.adapters.fireworks_tracing import convert_trace_dict_to_evaluation_row +import conftest + + +MODEL_ID = conftest.MODEL_ID_OPT +if not MODEL_ID: + raise RuntimeError("--model-id is required. 
Example: --model-id 'fireworks_ai/accounts/.../models/'") +CLI_CONCURRENCY = conftest.CONCURRENCY_OPT +CLI_MODEL_KWARGS = conftest.MODEL_KWARGS_OPT + +# Build completion_params once (used by decorator) +COMPLETION_PARAMS = {"model": MODEL_ID} +if CLI_MODEL_KWARGS: + COMPLETION_PARAMS["model_kwargs"] = CLI_MODEL_KWARGS + +def fetch_traces_with_auth(config: DataLoaderConfig) -> List[EvaluationRow]: + """ + Fetch traces directly from the Fireworks tracing proxy with Authorization header + and convert them into EvaluationRows using the same converter as the adapter. + """ + base_url = (config.model_base_url or "https://tracing.fireworks.ai").rstrip("/") + api_key = os.environ.get("FIREWORKS_API_KEY") + if not api_key: + return [] + + url = f"{base_url}/v1/traces" + headers = {"Authorization": f"Bearer {api_key}"} + params = { + "tags": [f"rollout_id:{config.rollout_id}"], + "max_retries": 5, + "sleep_between_gets": 0.1, + } + + try: + resp = requests.get(url, params=params, headers=headers, timeout=300) + print(f"[fetch_traces] status={resp.status_code} url={resp.url}") # debug + resp.raise_for_status() + body = resp.json() or {} + traces = body.get("traces", []) + print(f"[fetch_traces] traces_found={len(traces)}") + except Exception as e: + print(f"[fetch_traces] error={e}") + return [] + + rows: List[EvaluationRow] = [] + for tr in traces: + row = convert_trace_dict_to_evaluation_row(tr, include_tool_calls=True, span_name=None) + if row: + rows.append(row) + return rows + + +def _merge_rows_into_one(rows: List[EvaluationRow]) -> List[EvaluationRow]: + if not rows: + return [] + # Use the first row as the base; merge messages from all rows + base = rows[0] + seen = set() + merged_msgs: List[Message] = [] + for r in rows: + for m in (r.messages or []): + # Dedup by role+name+content+tool_calls signature + tool_sig = None + if getattr(m, "tool_calls", None): + tool_sig = tuple( + (tc.get("id"), tc.get("type"), (tc.get("function") or {}).get("name")) + for tc in m.tool_calls + ) + key = (m.role, getattr(m, "name", None), m.content, tool_sig) + if key in seen: + continue + seen.add(key) + merged_msgs.append(m) + base.messages = merged_msgs + return [base] + +def fireworks_output_data_loader(config: DataLoaderConfig) -> DynamicDataLoader: + return DynamicDataLoader( + generators=[lambda: fetch_traces_with_auth(config)], + preprocess_fn=_merge_rows_into_one, # merge all tool/LLM traces into one row + ) + +def rows_from_instance_ids(ids: list[str]) -> List[EvaluationRow]: + out = [] + for idx, iid in enumerate(ids): + out.append( + EvaluationRow( + messages=[Message(role="user", content=f"Run SWE-bench instance {iid}")], + input_metadata={ + "row_id": str(idx), # ← use instance_id here + "instance_id": iid, # ← explicit for debugging + "instance_index": str(idx), # ← optional: keep index + "completion_params": {"model": MODEL_ID}, + }, + ) + ) + return out + +def rows_from_indices(count: int) -> List[EvaluationRow]: + out: List[EvaluationRow] = [] + for idx in range(count): + metadata = { + "row_id": str(idx), + "instance_index": str(idx), + } + # Add model_kwargs to metadata so server can read from req.metadata + if CLI_MODEL_KWARGS: + metadata["model_kwargs"] = CLI_MODEL_KWARGS + + out.append( + EvaluationRow( + messages=[Message(role="user", content=f"Run SWE-bench index {idx}")], + input_metadata=metadata, + ) + ) + return out + +def rows() -> List[EvaluationRow]: + # Generate 10 rows by index; server maps index -> dataset instance via --slice + return rows_from_indices(10) + + +# 
-------------------- Harness result attachment (UI pass/fail) -------------------- +import json +from pathlib import Path + +def _safe_model_id(model_id: str) -> str: + return model_id.replace("/", "__").replace(":", "-") + +def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: + """Attach evaluation result by reading harness report or exit status.""" + import logging + logger = logging.getLogger(__name__) + + # Get row_id and instance_id + try: + row_id = str(row.input_metadata.row_id) # ← use attribute, not .get() + except Exception as e: + logger.warning(f"Could not get row_id: {e}") + return row + + row_dir = Path.cwd() / f"row_{row_id}" + logger.info(f"[Row {row_id}] Looking for results in {row_dir}") + + # Find instance_id from preds.json + preds_path = row_dir / "preds.json" + instance_id = None + if preds_path.exists(): + try: + preds = json.loads(preds_path.read_text()) + instance_id = next(iter(preds.keys()), None) + logger.info(f"[Row {row_id}] Found instance_id: {instance_id}") + except Exception as e: + logger.warning(f"[Row {row_id}] Could not read preds.json: {e}") + + if not instance_id: + logger.warning(f"[Row {row_id}] No instance_id found, skipping eval result") + return row + + resolved: bool | None = None + reason_text: str | None = None + + # 1. Try to read from report.json (harness ran tests) + safe_model = _safe_model_id(model_id) + report_path = row_dir / "logs" / "run_evaluation" / "eval-run" / safe_model / instance_id / "report.json" + + if report_path.exists(): + logger.info(f"[Row {row_id}] Found report.json at {report_path}") + try: + report_data = json.loads(report_path.read_text()) + instance_data = report_data.get(instance_id, {}) + resolved = bool(instance_data.get("resolved", False)) + reason_text = f"harness_resolved={resolved}" + logger.info(f"[Row {row_id}] Report says resolved={resolved}") + except Exception as e: + logger.error(f"[Row {row_id}] Failed to parse report.json: {e}") + else: + logger.info(f"[Row {row_id}] No report.json found at {report_path}") + + # 2. If no report, check exit status YAML (agent didn't produce a patch) + if resolved is None: + exit_status_files = sorted(row_dir.glob("exit_statuses_*.yaml")) + if exit_status_files: + exit_file = exit_status_files[-1] + logger.info(f"[Row {row_id}] Reading exit status from {exit_file.name}") + try: + status_doc = yaml.safe_load(exit_file.read_text()) or {} + by_status = status_doc.get("instances_by_exit_status", {}) + for status_name, ids in by_status.items(): + if instance_id in (ids or []): + resolved = False + reason_text = f"exit_status={status_name}" + logger.info(f"[Row {row_id}] Exit status: {status_name}") + break + except Exception as e: + logger.error(f"[Row {row_id}] Failed to parse exit status: {e}") + else: + logger.warning(f"[Row {row_id}] No exit status YAML found") + + # 3. 
Attach result if we found anything + if resolved is not None: + logger.info(f"[Row {row_id}] Final: resolved={resolved}, reason={reason_text}") + row.evaluation_result = EvaluateResult( + score=1.0 if resolved else 0.0, + reason=reason_text or f"resolved={resolved}", + is_score_valid=True, + metrics={ + "resolved": MetricResult( + score=1.0 if resolved else 0.0, + is_score_valid=True, + reason=reason_text or f"resolved={resolved}", + value=int(resolved), + ) + }, + ) + else: + logger.warning(f"[Row {row_id}] Could not determine resolved status") + + return row + + +@evaluation_test( + data_loaders=DynamicDataLoader( + generators=[rows], + ), + rollout_processor=RemoteRolloutProcessor( + remote_base_url="http://127.0.0.1:3000", + model_base_url="https://tracing.fireworks.ai", + timeout_seconds=1800, + output_data_loader=fireworks_output_data_loader, + ), + completion_params=[COMPLETION_PARAMS], + max_concurrent_rollouts=(CLI_CONCURRENCY or 2), +) +async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: + assert len(row.messages) >= 1 + row = attach_eval_result(row, MODEL_ID) + return row \ No newline at end of file diff --git a/examples/swebench/tracing_model.py b/examples/swebench/tracing_model.py new file mode 100644 index 00000000..e00dc460 --- /dev/null +++ b/examples/swebench/tracing_model.py @@ -0,0 +1,75 @@ +""" +TracingFireworksModel - Routes through tracing using OpenAI SDK. +""" +import sys +import os + +sys.path.insert(0, "/Users/shrey/Documents/cookbook-internal/recipes/eval/swe_bench") + +from run_swe_agent_fw import FireworksCompatibleModel + + +class TracingFireworksModel(FireworksCompatibleModel): + """Routes LLM calls through tracing using OpenAI SDK (preserves model name).""" + + def _query(self, messages, **kwargs): + """Use OpenAI SDK directly to preserve model name through tracing.""" + from openai import OpenAI + import traceback + + tracing_url = os.environ.get('TRACING_BASE_URL', '') + api_key = os.environ.get('FIREWORKS_API_KEY', '') + + if not tracing_url: + print("⚠️ No TRACING_BASE_URL - using parent litellm") + return super()._query(messages, **kwargs) + + print(f"\n🔗 OpenAI SDK Call:") + print(f" URL: {tracing_url[:60]}...") + print(f" Model: {self.config.model_name}") + + try: + client = OpenAI(base_url=tracing_url, api_key=api_key) + + # Build OpenAI-compatible params + openai_kwargs = {} + if self.config.model_kwargs.get('stop'): + openai_kwargs['stop'] = self.config.model_kwargs['stop'] + print(f" Stop sequences: {len(openai_kwargs['stop'])}") + if self.config.model_kwargs.get('max_tokens'): + openai_kwargs['max_tokens'] = self.config.model_kwargs['max_tokens'] + if self.config.model_kwargs.get('temperature') is not None: + openai_kwargs['temperature'] = self.config.model_kwargs['temperature'] + + # CRITICAL: Clean messages - remove 'extra' fields that OpenAI API doesn't accept! + clean_messages = [] + for msg in messages: + clean_msg = {"role": msg["role"], "content": msg["content"]} + # Preserve standard fields only + if "name" in msg: + clean_msg["name"] = msg["name"] + if "tool_calls" in msg: + clean_msg["tool_calls"] = msg["tool_calls"] + clean_messages.append(clean_msg) + + print(f" Messages: {len(clean_messages)} (cleaned)") + print(f" Making call...") + + # OpenAI SDK call + response = client.chat.completions.create( + model=self.config.model_name, + messages=clean_messages, # ← Use cleaned messages! 
+ **openai_kwargs, + ) + + print(f" ✅ Call succeeded!") + print(f" Response ID: {response.id}") + print(f" Tokens: {response.usage.total_tokens if response.usage else 'N/A'}\n") + + return response + + except Exception as e: + print(f"\n❌ ERROR in TracingFireworksModel._query:") + print(f" {type(e).__name__}: {e}") + traceback.print_exc() + raise \ No newline at end of file From 2dad5181e99c750cb2211d3b9d2d1d817180cec5 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Thu, 16 Oct 2025 13:56:23 -0700 Subject: [PATCH 02/10] linterrors --- examples/swebench/README.md | 2 +- examples/swebench/SWE-bench | 1 + examples/swebench/run_swe_agent_fw.py | 176 ++++++++++++----------- examples/swebench/server.py | 41 ++++-- examples/swebench/tests/conftest.py | 10 +- examples/swebench/tests/test_swebench.py | 46 +++--- examples/swebench/tracing_model.py | 51 +++---- 7 files changed, 182 insertions(+), 145 deletions(-) create mode 160000 examples/swebench/SWE-bench diff --git a/examples/swebench/README.md b/examples/swebench/README.md index 04993e02..f4082f76 100644 --- a/examples/swebench/README.md +++ b/examples/swebench/README.md @@ -54,4 +54,4 @@ pytest examples/swebench/tests/test_swebench.py -v -s Notes - The test currently generates 10 rows by numeric index (0–9) - Each request triggers the server to run one SWE-bench instance and write to its own `row_{index}` -- Control harness workers via: `export SWEBENCH_EVAL_WORKERS=5` \ No newline at end of file +- Control harness workers via: `export SWEBENCH_EVAL_WORKERS=5` diff --git a/examples/swebench/SWE-bench b/examples/swebench/SWE-bench new file mode 160000 index 00000000..5cd4be9f --- /dev/null +++ b/examples/swebench/SWE-bench @@ -0,0 +1 @@ +Subproject commit 5cd4be9fb23971679cbbafe5a0ecade27cef99be diff --git a/examples/swebench/run_swe_agent_fw.py b/examples/swebench/run_swe_agent_fw.py index f1ae1a51..cef3de9a 100755 --- a/examples/swebench/run_swe_agent_fw.py +++ b/examples/swebench/run_swe_agent_fw.py @@ -45,64 +45,62 @@ class FireworksCompatibleModel(LitellmModel): """ def __init__(self, **kwargs): - if model_id := os.environ.get('FIREWORKS_MODEL_ID'): - kwargs['model_name'] = model_id + if model_id := os.environ.get("FIREWORKS_MODEL_ID"): + kwargs["model_name"] = model_id print(f"kwargs: {kwargs}") - if 'model_kwargs' not in kwargs: - kwargs['model_kwargs'] = {} - + if "model_kwargs" not in kwargs: + kwargs["model_kwargs"] = {} + # CRITICAL: Set drop_params to False so stop sequences aren't stripped! 
- kwargs['model_kwargs']['drop_params'] = False - + kwargs["model_kwargs"]["drop_params"] = False + # Get existing stop sequences - existing_stop = kwargs['model_kwargs'].get('stop', []) + existing_stop = kwargs["model_kwargs"].get("stop", []) if isinstance(existing_stop, str): existing_stop = [existing_stop] elif existing_stop is None: existing_stop = [] - + # Add stop sequences (only the non-natural ones) stop_sequences = existing_stop + [ - # ASCII versions + # ASCII versions "<|User|>", "<|Assistant|>", - # Full-width PIPE versions (U+FF5C) - "<|User|>", # \uff5c + "<|User|>", # \uff5c "<|Assistant|>", "```<|", "<|User", "<|Ass", - - # Full-width LETTER L versions (U+FF4C) - "<lUser|>", # \uff4c + # Full-width LETTER L versions (U+FF4C) + "<lUser|>", # \uff4c "<lAssistant|>", "```<l", "<lUser", "<lAss", ] - kwargs['model_kwargs']['stop'] = stop_sequences - kwargs['model_kwargs']['max_tokens'] = 1024 # Reduce to 1024 to save tokens - - if 'temperature' not in kwargs['model_kwargs']: - kwargs['model_kwargs']['temperature'] = 0.0 + kwargs["model_kwargs"]["stop"] = stop_sequences + kwargs["model_kwargs"]["max_tokens"] = 1024 # Reduce to 1024 to save tokens + + if "temperature" not in kwargs["model_kwargs"]: + kwargs["model_kwargs"]["temperature"] = 0.0 # Apply per-run overrides injected by the wrapper (no environment variables) - overrides = globals().get('WRAPPER_MODEL_OVERRIDES') + overrides = globals().get("WRAPPER_MODEL_OVERRIDES") if isinstance(overrides, dict): - if overrides.get('reasoning') in ('low', 'medium', 'high'): - kwargs['model_kwargs']['reasoning_effort'] = overrides['reasoning'] - if overrides.get('temperature') is not None: + if overrides.get("reasoning") in ("low", "medium", "high"): + kwargs["model_kwargs"]["reasoning_effort"] = overrides["reasoning"] + if overrides.get("temperature") is not None: try: - kwargs['model_kwargs']['temperature'] = float(overrides['temperature']) + kwargs["model_kwargs"]["temperature"] = float(overrides["temperature"]) except Exception: pass - if overrides.get('max_tokens') is not None: + if overrides.get("max_tokens") is not None: try: - kwargs['model_kwargs']['max_tokens'] = int(overrides['max_tokens']) + kwargs["model_kwargs"]["max_tokens"] = int(overrides["max_tokens"]) except Exception: pass - + super().__init__(**kwargs) def _query(self, messages: list[dict[str, str]], **kwargs): @@ -110,39 +108,38 @@ def _query(self, messages: list[dict[str, str]], **kwargs): # Keep only standard OpenAI-compatible fields clean_messages = [] for msg in messages: - clean_msg = { - "role": msg["role"], - "content": msg["content"] - } + clean_msg = {"role": msg["role"], "content": msg["content"]} if "tool_calls" in msg: clean_msg["tool_calls"] = msg["tool_calls"] if "name" in msg: clean_msg["name"] = msg["name"] clean_messages.append(clean_msg) - + # IMPORTANT: Ensure drop_params stays False in the actual query kwargs_with_stop = kwargs.copy() - if 'drop_params' not in kwargs_with_stop: - kwargs_with_stop['drop_params'] = False - + if "drop_params" not in kwargs_with_stop: + kwargs_with_stop["drop_params"] = False + return super()._query(clean_messages, **kwargs_with_stop) + def __get_api_key(): """Get Fireworks API key from environment or mini-swe-agent config.""" # Environment variable takes precedence - if api_key := os.environ.get('FIREWORKS_API_KEY'): + if api_key := os.environ.get("FIREWORKS_API_KEY"): return api_key # Try to get API key from mini-swe-agent's config system try: from minisweagent.config import get_config + config = get_config() - 
return config.get('FIREWORKS_API_KEY') + return config.get("FIREWORKS_API_KEY") except (ImportError, AttributeError, KeyError): # Fallback: check common config file locations config_paths = [ Path.home() / ".config" / "mini-swe-agent" / ".env", - Path.home() / "Library" / "Application Support" / "mini-swe-agent" / ".env" + Path.home() / "Library" / "Application Support" / "mini-swe-agent" / ".env", ] for config_path in config_paths: @@ -150,8 +147,8 @@ def __get_api_key(): try: with open(config_path) as f: for line in f: - if line.startswith('FIREWORKS_API_KEY='): - value = line.split('=', 1)[1].strip() + if line.startswith("FIREWORKS_API_KEY="): + value = line.split("=", 1)[1].strip() return value.strip("'\"") except (IOError, OSError): continue @@ -170,7 +167,7 @@ def __test_model(model_id): return False # Configure environment for litellm - os.environ['FIREWORKS_API_KEY'] = api_key + os.environ["FIREWORKS_API_KEY"] = api_key # Assume model_id is fully qualified model_name = model_id @@ -182,7 +179,7 @@ def __test_model(model_id): model=model_name, messages=[{"role": "user", "content": "Test message. Reply with OK."}], temperature=0.0, - max_tokens=10 + max_tokens=10, ) print(f"Success. Response: {response.choices[0].message.content}") @@ -201,8 +198,6 @@ def __validate_environment(): print("Set it with: mini-extra config set FIREWORKS_API_KEY ") - - def __build_command(args, wrapper_module_path): """Build mini-swe-agent command with appropriate arguments.""" # Construct model class path @@ -212,12 +207,17 @@ def __build_command(args, wrapper_module_path): # Base command - assume model_id is fully qualified cmd = [ sys.executable, - "-m", "minisweagent.run.mini_extra", + "-m", + "minisweagent.run.mini_extra", "swebench-single" if args.single is not None else "swebench", - "--model", args.model_id, - "--model-class", model_class, - "--subset", args.subset, - "--split", args.split + "--model", + args.model_id, + "--model-class", + model_class, + "--subset", + args.subset, + "--split", + args.split, ] if args.model_class: cmd.extend(["--model-class", args.model_class]) @@ -230,18 +230,26 @@ def __build_command(args, wrapper_module_path): if args.single is not None: # Use batch mode for a single index via slice and write to a per-row directory from pathlib import Path - slice_spec = f"{args.single}:{args.single+1}" + + slice_spec = f"{args.single}:{args.single + 1}" row_dir = str((Path(args.output) if args.output else Path.cwd()) / f"row_{args.single}") cmd = [ sys.executable, - "-m", "minisweagent.run.mini_extra", + "-m", + "minisweagent.run.mini_extra", "swebench", - "--model", args.model_id, - "--model-class", model_class, - "--subset", args.subset, - "--split", args.split, - "--slice", slice_spec, - "--output", row_dir, + "--model", + args.model_id, + "--model-class", + model_class, + "--subset", + args.subset, + "--split", + args.split, + "--slice", + slice_spec, + "--output", + row_dir, ] if args.model_class: cmd.extend(["--model-class", args.model_class]) @@ -253,31 +261,35 @@ def __build_command(args, wrapper_module_path): return cmd - - def main(): parser = argparse.ArgumentParser( - description='Run mini-swe-agent with Fireworks models on SWE-bench', + description="Run mini-swe-agent with Fireworks models on SWE-bench", formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__ + epilog=__doc__, ) # Required model ID - parser.add_argument('model_id', help='Fireworks model ID') - parser.add_argument('--model-class', type=str, default=None, help='Optional mini-swe-agent 
model-class') + parser.add_argument("model_id", help="Fireworks model ID") + parser.add_argument("--model-class", type=str, default=None, help="Optional mini-swe-agent model-class") # Execution options - parser.add_argument('--instances', type=int, help='Number of instances to run') - parser.add_argument('--workers', type=int, default=1, help='Parallel workers (default: 1)') - parser.add_argument('--output', help='Output directory') - parser.add_argument('--subset', default='verified', choices=['verified', 'lite', 'full']) - parser.add_argument('--split', default='test', choices=['dev', 'test']) - parser.add_argument('--single', type=int, metavar='INDEX', help='Run single instance') - parser.add_argument('--exit-immediately', action='store_true') - parser.add_argument('--test', action='store_true', help='Test model connectivity') - parser.add_argument('--reasoning', type=str, choices=['low', 'medium', 'high'], default=None, help='Provider-specific reasoning effort') - parser.add_argument('--temperature', type=float, default=None, help='Model temperature override') - parser.add_argument('--max-tokens', type=int, default=None, help='Max tokens override') + parser.add_argument("--instances", type=int, help="Number of instances to run") + parser.add_argument("--workers", type=int, default=1, help="Parallel workers (default: 1)") + parser.add_argument("--output", help="Output directory") + parser.add_argument("--subset", default="verified", choices=["verified", "lite", "full"]) + parser.add_argument("--split", default="test", choices=["dev", "test"]) + parser.add_argument("--single", type=int, metavar="INDEX", help="Run single instance") + parser.add_argument("--exit-immediately", action="store_true") + parser.add_argument("--test", action="store_true", help="Test model connectivity") + parser.add_argument( + "--reasoning", + type=str, + choices=["low", "medium", "high"], + default=None, + help="Provider-specific reasoning effort", + ) + parser.add_argument("--temperature", type=float, default=None, help="Model temperature override") + parser.add_argument("--max-tokens", type=int, default=None, help="Max tokens override") args = parser.parse_args() # Handle test mode @@ -291,11 +303,11 @@ def main(): if args.output is None: safe_model_id = args.model_id.replace("/", "-").replace(":", "-") script_dir = Path(__file__).parent.resolve() - args.output = str(script_dir / f'swebench-{safe_model_id}-results') + args.output = str(script_dir / f"swebench-{safe_model_id}-results") # Create temporary module for importing FireworksCompatibleModel - with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: - with open(__file__, 'r') as current_file: + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f: + with open(__file__, "r") as current_file: f.write(current_file.read()) # Inject per-run model overrides directly into the temp module f.write("\n# --- Injected by wrapper: per-run model overrides ---\n") @@ -309,14 +321,14 @@ def main(): try: # Configure environment env = os.environ.copy() - env['PYTHONPATH'] = f"{temp_module_path.parent}:{env.get('PYTHONPATH', '')}" + env["PYTHONPATH"] = f"{temp_module_path.parent}:{env.get('PYTHONPATH', '')}" # Pass the fully qualified model path to the subprocess - env['FIREWORKS_MODEL_ID'] = args.model_id + env["FIREWORKS_MODEL_ID"] = args.model_id # Ensure API key is passed to subprocess api_key = __get_api_key() if api_key: - env['FIREWORKS_API_KEY'] = api_key + env["FIREWORKS_API_KEY"] = api_key # No environment variables 
for model kwargs; overrides are injected into the temp module @@ -343,5 +355,5 @@ def main(): temp_module_path.unlink() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/swebench/server.py b/examples/swebench/server.py index 80ddabe4..ae3df983 100644 --- a/examples/swebench/server.py +++ b/examples/swebench/server.py @@ -1,4 +1,5 @@ """Minimal SWE-bench server - wraps run_swe_agent_fw.py with tracing via model_base_url.""" + import os import threading import subprocess @@ -15,6 +16,7 @@ logging.getLogger().addHandler(handler) rollout_states = {} + @app.post("/init") def init(req: InitRequest): # Allow Eval Protocol to dynamically configure ES endpoint @@ -37,7 +39,6 @@ def _worker(): if not req.model: raise ValueError("model is required") - if not req.metadata or not req.metadata.row_id: raise ValueError("metadata.row_id is required and must be an integer index as string, e.g. '0'") try: @@ -56,9 +57,9 @@ def _worker(): out_dir = os.getcwd() from pathlib import Path - + script_path = str((Path(__file__).parent / "run_swe_agent_fw.py").resolve()) - + # Extract model_kwargs from req.metadata (forwarded from input_metadata) model_kwargs = {} logger.info(f"DEBUG: req.metadata attributes: {dir(req.metadata)}") @@ -69,8 +70,8 @@ def _worker(): model_kwargs = mk logger.info(f"Extracted model_kwargs from metadata: {model_kwargs}") else: - logger.info(f"DEBUG: req.metadata has NO model_kwargs attribute") - + logger.info("DEBUG: req.metadata has NO model_kwargs attribute") + # Set tracing URL if req.model_base_url: env["TRACING_BASE_URL"] = req.model_base_url @@ -79,10 +80,13 @@ def _worker(): "python3", script_path, req.model, - "--single", str(single_index), + "--single", + str(single_index), "--exit-immediately", - "--output", str(out_dir), - "--model-class", "tracing_model.TracingFireworksModel", + "--output", + str(out_dir), + "--model-class", + "tracing_model.TracingFireworksModel", ] # Forward model kwargs as CLI flags to the wrapper if model_kwargs.get("reasoning") in ("low", "medium", "high"): @@ -92,6 +96,7 @@ def _worker(): if model_kwargs.get("max_tokens") is not None: cmd.extend(["--max-tokens", str(model_kwargs["max_tokens"])]) import json + # Log path inside row directory for this run row_dir = Path(out_dir) / f"row_{single_index}" row_dir.mkdir(parents=True, exist_ok=True) @@ -109,7 +114,6 @@ def _worker(): ) ret = proc.wait() - # Stream stdout/stderr to logs # assert proc.stdout is not None and proc.stderr is not None # for line in proc.stdout: @@ -130,11 +134,17 @@ def _worker(): # 2) Run SWE-bench evaluation harness on preds.json preds_path_str = str(preds_path) eval_cmd = [ - "python3", "-m", "swebench.harness.run_evaluation", - "--dataset_name", "princeton-nlp/SWE-bench_Verified", - "--predictions_path", preds_path_str, - "--max_workers", str(os.getenv("SWEBENCH_EVAL_WORKERS", "5")), - "--run_id", "eval-run", + "python3", + "-m", + "swebench.harness.run_evaluation", + "--dataset_name", + "princeton-nlp/SWE-bench_Verified", + "--predictions_path", + preds_path_str, + "--max_workers", + str(os.getenv("SWEBENCH_EVAL_WORKERS", "5")), + "--run_id", + "eval-run", ] logger.info("Starting SWE-bench harness: %s", " ".join(map(str, eval_cmd))) eval_proc = subprocess.Popen( @@ -156,14 +166,17 @@ def _worker(): threading.Thread(target=_worker, daemon=True).start() return {"status": "accepted"} + @app.get("/status") def status(rollout_id: str): return rollout_states.get(rollout_id, {"terminated": False}) + def main(): host = os.getenv("REMOTE_SERVER_HOST", 
"127.0.0.1") port = int(os.getenv("REMOTE_SERVER_PORT", "3000")) uvicorn.run(app, host=host, port=port) + if __name__ == "__main__": main() diff --git a/examples/swebench/tests/conftest.py b/examples/swebench/tests/conftest.py index 8bca079e..3b81f7ad 100644 --- a/examples/swebench/tests/conftest.py +++ b/examples/swebench/tests/conftest.py @@ -1,19 +1,21 @@ import os import pytest -import os -import pytest MODEL_ID_OPT = None CONCURRENCY_OPT = None MODEL_KWARGS_OPT = None + def pytest_addoption(parser): parser.addoption("--model-id", action="store", default=None, help="Fireworks model ID") parser.addoption("--concurrent-workers", action="store", type=int, default=None, help="Max concurrent rollouts") parser.addoption("--temperature", action="store", type=float, default=None, help="Model temperature") parser.addoption("--max-tokens", action="store", type=int, default=None, help="Max tokens") - parser.addoption("--reasoning", action="store", choices=["low", "medium", "high"], default=None, help="Reasoning effort") + parser.addoption( + "--reasoning", action="store", choices=["low", "medium", "high"], default=None, help="Reasoning effort" + ) + def pytest_configure(config): global MODEL_ID_OPT, CONCURRENCY_OPT, MODEL_KWARGS_OPT @@ -29,4 +31,4 @@ def pytest_configure(config): mk["max_tokens"] = int(mtok) if reas is not None: mk["reasoning"] = reas - MODEL_KWARGS_OPT = mk or None \ No newline at end of file + MODEL_KWARGS_OPT = mk or None diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py index 37fef735..48c130a9 100644 --- a/examples/swebench/tests/test_swebench.py +++ b/examples/swebench/tests/test_swebench.py @@ -9,6 +9,7 @@ from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor from eval_protocol.types.remote_rollout_processor import DataLoaderConfig from eval_protocol.quickstart.utils import filter_longest_conversation + # Reuse the converter used by the built-in adapter from eval_protocol.adapters.fireworks_tracing import convert_trace_dict_to_evaluation_row import conftest @@ -25,6 +26,7 @@ if CLI_MODEL_KWARGS: COMPLETION_PARAMS["model_kwargs"] = CLI_MODEL_KWARGS + def fetch_traces_with_auth(config: DataLoaderConfig) -> List[EvaluationRow]: """ Fetch traces directly from the Fireworks tracing proxy with Authorization header @@ -70,13 +72,12 @@ def _merge_rows_into_one(rows: List[EvaluationRow]) -> List[EvaluationRow]: seen = set() merged_msgs: List[Message] = [] for r in rows: - for m in (r.messages or []): + for m in r.messages or []: # Dedup by role+name+content+tool_calls signature tool_sig = None if getattr(m, "tool_calls", None): tool_sig = tuple( - (tc.get("id"), tc.get("type"), (tc.get("function") or {}).get("name")) - for tc in m.tool_calls + (tc.get("id"), tc.get("type"), (tc.get("function") or {}).get("name")) for tc in m.tool_calls ) key = (m.role, getattr(m, "name", None), m.content, tool_sig) if key in seen: @@ -86,12 +87,14 @@ def _merge_rows_into_one(rows: List[EvaluationRow]) -> List[EvaluationRow]: base.messages = merged_msgs return [base] + def fireworks_output_data_loader(config: DataLoaderConfig) -> DynamicDataLoader: return DynamicDataLoader( generators=[lambda: fetch_traces_with_auth(config)], preprocess_fn=_merge_rows_into_one, # merge all tool/LLM traces into one row ) + def rows_from_instance_ids(ids: list[str]) -> List[EvaluationRow]: out = [] for idx, iid in enumerate(ids): @@ -99,15 +102,16 @@ def rows_from_instance_ids(ids: list[str]) -> List[EvaluationRow]: EvaluationRow( 
messages=[Message(role="user", content=f"Run SWE-bench instance {iid}")], input_metadata={ - "row_id": str(idx), # ← use instance_id here - "instance_id": iid, # ← explicit for debugging - "instance_index": str(idx), # ← optional: keep index + "row_id": str(idx), # ← use instance_id here + "instance_id": iid, # ← explicit for debugging + "instance_index": str(idx), # ← optional: keep index "completion_params": {"model": MODEL_ID}, }, ) ) return out + def rows_from_indices(count: int) -> List[EvaluationRow]: out: List[EvaluationRow] = [] for idx in range(count): @@ -118,7 +122,7 @@ def rows_from_indices(count: int) -> List[EvaluationRow]: # Add model_kwargs to metadata so server can read from req.metadata if CLI_MODEL_KWARGS: metadata["model_kwargs"] = CLI_MODEL_KWARGS - + out.append( EvaluationRow( messages=[Message(role="user", content=f"Run SWE-bench index {idx}")], @@ -127,6 +131,7 @@ def rows_from_indices(count: int) -> List[EvaluationRow]: ) return out + def rows() -> List[EvaluationRow]: # Generate 10 rows by index; server maps index -> dataset instance via --slice return rows_from_indices(10) @@ -136,24 +141,27 @@ def rows() -> List[EvaluationRow]: import json from pathlib import Path + def _safe_model_id(model_id: str) -> str: return model_id.replace("/", "__").replace(":", "-") + def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: """Attach evaluation result by reading harness report or exit status.""" import logging + logger = logging.getLogger(__name__) - + # Get row_id and instance_id try: row_id = str(row.input_metadata.row_id) # ← use attribute, not .get() except Exception as e: logger.warning(f"Could not get row_id: {e}") return row - + row_dir = Path.cwd() / f"row_{row_id}" logger.info(f"[Row {row_id}] Looking for results in {row_dir}") - + # Find instance_id from preds.json preds_path = row_dir / "preds.json" instance_id = None @@ -164,18 +172,18 @@ def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: logger.info(f"[Row {row_id}] Found instance_id: {instance_id}") except Exception as e: logger.warning(f"[Row {row_id}] Could not read preds.json: {e}") - + if not instance_id: logger.warning(f"[Row {row_id}] No instance_id found, skipping eval result") return row - + resolved: bool | None = None reason_text: str | None = None - + # 1. Try to read from report.json (harness ran tests) safe_model = _safe_model_id(model_id) report_path = row_dir / "logs" / "run_evaluation" / "eval-run" / safe_model / instance_id / "report.json" - + if report_path.exists(): logger.info(f"[Row {row_id}] Found report.json at {report_path}") try: @@ -188,7 +196,7 @@ def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: logger.error(f"[Row {row_id}] Failed to parse report.json: {e}") else: logger.info(f"[Row {row_id}] No report.json found at {report_path}") - + # 2. If no report, check exit status YAML (agent didn't produce a patch) if resolved is None: exit_status_files = sorted(row_dir.glob("exit_statuses_*.yaml")) @@ -208,7 +216,7 @@ def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: logger.error(f"[Row {row_id}] Failed to parse exit status: {e}") else: logger.warning(f"[Row {row_id}] No exit status YAML found") - + # 3. 
Attach result if we found anything if resolved is not None: logger.info(f"[Row {row_id}] Final: resolved={resolved}, reason={reason_text}") @@ -227,10 +235,10 @@ def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: ) else: logger.warning(f"[Row {row_id}] Could not determine resolved status") - + return row - + @evaluation_test( data_loaders=DynamicDataLoader( generators=[rows], @@ -247,4 +255,4 @@ def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: assert len(row.messages) >= 1 row = attach_eval_result(row, MODEL_ID) - return row \ No newline at end of file + return row diff --git a/examples/swebench/tracing_model.py b/examples/swebench/tracing_model.py index e00dc460..8862388c 100644 --- a/examples/swebench/tracing_model.py +++ b/examples/swebench/tracing_model.py @@ -1,6 +1,7 @@ """ TracingFireworksModel - Routes through tracing using OpenAI SDK. """ + import sys import os @@ -11,36 +12,36 @@ class TracingFireworksModel(FireworksCompatibleModel): """Routes LLM calls through tracing using OpenAI SDK (preserves model name).""" - + def _query(self, messages, **kwargs): """Use OpenAI SDK directly to preserve model name through tracing.""" from openai import OpenAI import traceback - - tracing_url = os.environ.get('TRACING_BASE_URL', '') - api_key = os.environ.get('FIREWORKS_API_KEY', '') - + + tracing_url = os.environ.get("TRACING_BASE_URL", "") + api_key = os.environ.get("FIREWORKS_API_KEY", "") + if not tracing_url: print("⚠️ No TRACING_BASE_URL - using parent litellm") return super()._query(messages, **kwargs) - - print(f"\n🔗 OpenAI SDK Call:") + + print("\n🔗 OpenAI SDK Call:") print(f" URL: {tracing_url[:60]}...") print(f" Model: {self.config.model_name}") - + try: client = OpenAI(base_url=tracing_url, api_key=api_key) - + # Build OpenAI-compatible params openai_kwargs = {} - if self.config.model_kwargs.get('stop'): - openai_kwargs['stop'] = self.config.model_kwargs['stop'] + if self.config.model_kwargs.get("stop"): + openai_kwargs["stop"] = self.config.model_kwargs["stop"] print(f" Stop sequences: {len(openai_kwargs['stop'])}") - if self.config.model_kwargs.get('max_tokens'): - openai_kwargs['max_tokens'] = self.config.model_kwargs['max_tokens'] - if self.config.model_kwargs.get('temperature') is not None: - openai_kwargs['temperature'] = self.config.model_kwargs['temperature'] - + if self.config.model_kwargs.get("max_tokens"): + openai_kwargs["max_tokens"] = self.config.model_kwargs["max_tokens"] + if self.config.model_kwargs.get("temperature") is not None: + openai_kwargs["temperature"] = self.config.model_kwargs["temperature"] + # CRITICAL: Clean messages - remove 'extra' fields that OpenAI API doesn't accept! clean_messages = [] for msg in messages: @@ -51,25 +52,25 @@ def _query(self, messages, **kwargs): if "tool_calls" in msg: clean_msg["tool_calls"] = msg["tool_calls"] clean_messages.append(clean_msg) - + print(f" Messages: {len(clean_messages)} (cleaned)") - print(f" Making call...") - + print(" Making call...") + # OpenAI SDK call response = client.chat.completions.create( model=self.config.model_name, messages=clean_messages, # ← Use cleaned messages! 
**openai_kwargs, ) - - print(f" ✅ Call succeeded!") + + print(" ✅ Call succeeded!") print(f" Response ID: {response.id}") print(f" Tokens: {response.usage.total_tokens if response.usage else 'N/A'}\n") - + return response - + except Exception as e: - print(f"\n❌ ERROR in TracingFireworksModel._query:") + print("\n❌ ERROR in TracingFireworksModel._query:") print(f" {type(e).__name__}: {e}") traceback.print_exc() - raise \ No newline at end of file + raise From 9ffbf9e4b60347df96bb36376e5c4a4e631cc99a Mon Sep 17 00:00:00 2001 From: shreymodi1 <82307545+shreymodi1@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:27:39 -0700 Subject: [PATCH 03/10] Delete examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json --- ...works__deployments__r5dfiiwp.eval-run.json | 521 ------------------ 1 file changed, 521 deletions(-) delete mode 100644 examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json diff --git a/examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json b/examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json deleted file mode 100644 index a9e10524..00000000 --- a/examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json +++ /dev/null @@ -1,521 +0,0 @@ -{ - "total_instances": 500, - "submitted_instances": 1, - "completed_instances": 0, - "resolved_instances": 0, - "unresolved_instances": 0, - "empty_patch_instances": 1, - "error_instances": 0, - "completed_ids": [], - "incomplete_ids": [ - "astropy__astropy-12907", - "astropy__astropy-13033", - "astropy__astropy-13236", - "astropy__astropy-13398", - "astropy__astropy-13579", - "astropy__astropy-13977", - "astropy__astropy-14096", - "astropy__astropy-14182", - "astropy__astropy-14309", - "astropy__astropy-14365", - "astropy__astropy-14369", - "astropy__astropy-14508", - "astropy__astropy-14539", - "astropy__astropy-14598", - "astropy__astropy-14995", - "astropy__astropy-7166", - "astropy__astropy-7336", - "astropy__astropy-7606", - "astropy__astropy-7671", - "astropy__astropy-8707", - "astropy__astropy-8872", - "django__django-10097", - "django__django-10554", - "django__django-10880", - "django__django-10914", - "django__django-10973", - "django__django-10999", - "django__django-11066", - "django__django-11087", - "django__django-11095", - "django__django-11099", - "django__django-11119", - "django__django-11133", - "django__django-11138", - "django__django-11141", - "django__django-11149", - "django__django-11163", - "django__django-11179", - "django__django-11206", - "django__django-11211", - "django__django-11239", - "django__django-11265", - "django__django-11276", - "django__django-11292", - "django__django-11299", - "django__django-11333", - "django__django-11400", - "django__django-11433", - "django__django-11451", - "django__django-11477", - "django__django-11490", - "django__django-11532", - "django__django-11551", - "django__django-11555", - "django__django-11603", - "django__django-11728", - "django__django-11734", - "django__django-11740", - "django__django-11749", - "django__django-11790", - "django__django-11815", - "django__django-11820", - "django__django-11848", - "django__django-11880", - "django__django-11885", - "django__django-11951", - "django__django-11964", - 
"django__django-11999", - "django__django-12039", - "django__django-12050", - "django__django-12125", - "django__django-12143", - "django__django-12155", - "django__django-12193", - "django__django-12209", - "django__django-12262", - "django__django-12273", - "django__django-12276", - "django__django-12304", - "django__django-12308", - "django__django-12325", - "django__django-12406", - "django__django-12419", - "django__django-12663", - "django__django-12708", - "django__django-12713", - "django__django-12741", - "django__django-12754", - "django__django-12774", - "django__django-12858", - "django__django-12965", - "django__django-13012", - "django__django-13023", - "django__django-13028", - "django__django-13033", - "django__django-13089", - "django__django-13109", - "django__django-13112", - "django__django-13121", - "django__django-13128", - "django__django-13158", - "django__django-13195", - "django__django-13212", - "django__django-13279", - "django__django-13297", - "django__django-13315", - "django__django-13343", - "django__django-13344", - "django__django-13346", - "django__django-13363", - "django__django-13401", - "django__django-13406", - "django__django-13410", - "django__django-13417", - "django__django-13449", - "django__django-13512", - "django__django-13513", - "django__django-13516", - "django__django-13551", - "django__django-13568", - "django__django-13569", - "django__django-13590", - "django__django-13658", - "django__django-13670", - "django__django-13741", - "django__django-13786", - "django__django-13794", - "django__django-13807", - "django__django-13809", - "django__django-13810", - "django__django-13820", - "django__django-13821", - "django__django-13837", - "django__django-13925", - "django__django-13933", - "django__django-13964", - "django__django-14007", - "django__django-14011", - "django__django-14017", - "django__django-14034", - "django__django-14053", - "django__django-14089", - "django__django-14122", - "django__django-14140", - "django__django-14155", - "django__django-14170", - "django__django-14238", - "django__django-14311", - "django__django-14315", - "django__django-14349", - "django__django-14351", - "django__django-14373", - "django__django-14376", - "django__django-14404", - "django__django-14434", - "django__django-14493", - "django__django-14500", - "django__django-14534", - "django__django-14539", - "django__django-14559", - "django__django-14580", - "django__django-14608", - "django__django-14631", - "django__django-14672", - "django__django-14725", - "django__django-14752", - "django__django-14765", - "django__django-14771", - "django__django-14787", - "django__django-14792", - "django__django-14855", - "django__django-14915", - "django__django-14999", - "django__django-15022", - "django__django-15037", - "django__django-15098", - "django__django-15103", - "django__django-15104", - "django__django-15127", - "django__django-15128", - "django__django-15161", - "django__django-15252", - "django__django-15268", - "django__django-15277", - "django__django-15278", - "django__django-15280", - "django__django-15315", - "django__django-15368", - "django__django-15375", - "django__django-15380", - "django__django-15382", - "django__django-15467", - "django__django-15499", - "django__django-15503", - "django__django-15525", - "django__django-15554", - "django__django-15561", - "django__django-15563", - "django__django-15569", - "django__django-15572", - "django__django-15629", - "django__django-15695", - "django__django-15731", - 
"django__django-15732", - "django__django-15741", - "django__django-15814", - "django__django-15851", - "django__django-15863", - "django__django-15916", - "django__django-15930", - "django__django-15957", - "django__django-15973", - "django__django-15987", - "django__django-16032", - "django__django-16082", - "django__django-16100", - "django__django-16116", - "django__django-16136", - "django__django-16139", - "django__django-16145", - "django__django-16255", - "django__django-16256", - "django__django-16263", - "django__django-16315", - "django__django-16333", - "django__django-16429", - "django__django-16454", - "django__django-16485", - "django__django-16493", - "django__django-16502", - "django__django-16527", - "django__django-16560", - "django__django-16569", - "django__django-16595", - "django__django-16612", - "django__django-16631", - "django__django-16642", - "django__django-16661", - "django__django-16662", - "django__django-16667", - "django__django-16801", - "django__django-16819", - "django__django-16877", - "django__django-16899", - "django__django-16901", - "django__django-16938", - "django__django-16950", - "django__django-17029", - "django__django-17084", - "django__django-17087", - "django__django-7530", - "django__django-9296", - "matplotlib__matplotlib-13989", - "matplotlib__matplotlib-14623", - "matplotlib__matplotlib-20488", - "matplotlib__matplotlib-20676", - "matplotlib__matplotlib-20826", - "matplotlib__matplotlib-20859", - "matplotlib__matplotlib-21568", - "matplotlib__matplotlib-22719", - "matplotlib__matplotlib-22865", - "matplotlib__matplotlib-22871", - "matplotlib__matplotlib-23299", - "matplotlib__matplotlib-23314", - "matplotlib__matplotlib-23412", - "matplotlib__matplotlib-23476", - "matplotlib__matplotlib-24026", - "matplotlib__matplotlib-24149", - "matplotlib__matplotlib-24177", - "matplotlib__matplotlib-24570", - "matplotlib__matplotlib-24627", - "matplotlib__matplotlib-24637", - "matplotlib__matplotlib-24870", - "matplotlib__matplotlib-24970", - "matplotlib__matplotlib-25122", - "matplotlib__matplotlib-25287", - "matplotlib__matplotlib-25311", - "matplotlib__matplotlib-25332", - "matplotlib__matplotlib-25479", - "matplotlib__matplotlib-25775", - "matplotlib__matplotlib-25960", - "matplotlib__matplotlib-26113", - "matplotlib__matplotlib-26208", - "matplotlib__matplotlib-26291", - "matplotlib__matplotlib-26342", - "matplotlib__matplotlib-26466", - "mwaskom__seaborn-3069", - "mwaskom__seaborn-3187", - "pallets__flask-5014", - "psf__requests-1142", - "psf__requests-1724", - "psf__requests-1766", - "psf__requests-1921", - "psf__requests-2317", - "psf__requests-2931", - "psf__requests-5414", - "psf__requests-6028", - "pydata__xarray-2905", - "pydata__xarray-3095", - "pydata__xarray-3151", - "pydata__xarray-3305", - "pydata__xarray-3677", - "pydata__xarray-3993", - "pydata__xarray-4075", - "pydata__xarray-4094", - "pydata__xarray-4356", - "pydata__xarray-4629", - "pydata__xarray-4687", - "pydata__xarray-4695", - "pydata__xarray-4966", - "pydata__xarray-6461", - "pydata__xarray-6599", - "pydata__xarray-6721", - "pydata__xarray-6744", - "pydata__xarray-6938", - "pydata__xarray-6992", - "pydata__xarray-7229", - "pydata__xarray-7233", - "pydata__xarray-7393", - "pylint-dev__pylint-4551", - "pylint-dev__pylint-4604", - "pylint-dev__pylint-4661", - "pylint-dev__pylint-4970", - "pylint-dev__pylint-6386", - "pylint-dev__pylint-6528", - "pylint-dev__pylint-6903", - "pylint-dev__pylint-7080", - "pylint-dev__pylint-7277", - "pylint-dev__pylint-8898", - 
"pytest-dev__pytest-10051", - "pytest-dev__pytest-10081", - "pytest-dev__pytest-10356", - "pytest-dev__pytest-5262", - "pytest-dev__pytest-5631", - "pytest-dev__pytest-5787", - "pytest-dev__pytest-5809", - "pytest-dev__pytest-5840", - "pytest-dev__pytest-6197", - "pytest-dev__pytest-6202", - "pytest-dev__pytest-7205", - "pytest-dev__pytest-7236", - "pytest-dev__pytest-7324", - "pytest-dev__pytest-7432", - "pytest-dev__pytest-7490", - "pytest-dev__pytest-7521", - "pytest-dev__pytest-7571", - "pytest-dev__pytest-7982", - "pytest-dev__pytest-8399", - "scikit-learn__scikit-learn-10297", - "scikit-learn__scikit-learn-10844", - "scikit-learn__scikit-learn-10908", - "scikit-learn__scikit-learn-11310", - "scikit-learn__scikit-learn-11578", - "scikit-learn__scikit-learn-12585", - "scikit-learn__scikit-learn-12682", - "scikit-learn__scikit-learn-12973", - "scikit-learn__scikit-learn-13124", - "scikit-learn__scikit-learn-13135", - "scikit-learn__scikit-learn-13142", - "scikit-learn__scikit-learn-13328", - "scikit-learn__scikit-learn-13439", - "scikit-learn__scikit-learn-13496", - "scikit-learn__scikit-learn-13779", - "scikit-learn__scikit-learn-14053", - "scikit-learn__scikit-learn-14087", - "scikit-learn__scikit-learn-14141", - "scikit-learn__scikit-learn-14496", - "scikit-learn__scikit-learn-14629", - "scikit-learn__scikit-learn-14710", - "scikit-learn__scikit-learn-14894", - "scikit-learn__scikit-learn-14983", - "scikit-learn__scikit-learn-15100", - "scikit-learn__scikit-learn-25102", - "scikit-learn__scikit-learn-25232", - "scikit-learn__scikit-learn-25747", - "scikit-learn__scikit-learn-25931", - "scikit-learn__scikit-learn-25973", - "scikit-learn__scikit-learn-26194", - "scikit-learn__scikit-learn-26323", - "scikit-learn__scikit-learn-9288", - "sphinx-doc__sphinx-10323", - "sphinx-doc__sphinx-10435", - "sphinx-doc__sphinx-10449", - "sphinx-doc__sphinx-10466", - "sphinx-doc__sphinx-10614", - "sphinx-doc__sphinx-10673", - "sphinx-doc__sphinx-11445", - "sphinx-doc__sphinx-11510", - "sphinx-doc__sphinx-7440", - "sphinx-doc__sphinx-7454", - "sphinx-doc__sphinx-7462", - "sphinx-doc__sphinx-7590", - "sphinx-doc__sphinx-7748", - "sphinx-doc__sphinx-7757", - "sphinx-doc__sphinx-7889", - "sphinx-doc__sphinx-7910", - "sphinx-doc__sphinx-7985", - "sphinx-doc__sphinx-8035", - "sphinx-doc__sphinx-8056", - "sphinx-doc__sphinx-8120", - "sphinx-doc__sphinx-8265", - "sphinx-doc__sphinx-8269", - "sphinx-doc__sphinx-8459", - "sphinx-doc__sphinx-8475", - "sphinx-doc__sphinx-8548", - "sphinx-doc__sphinx-8551", - "sphinx-doc__sphinx-8593", - "sphinx-doc__sphinx-8595", - "sphinx-doc__sphinx-8621", - "sphinx-doc__sphinx-8638", - "sphinx-doc__sphinx-8721", - "sphinx-doc__sphinx-9229", - "sphinx-doc__sphinx-9230", - "sphinx-doc__sphinx-9258", - "sphinx-doc__sphinx-9281", - "sphinx-doc__sphinx-9320", - "sphinx-doc__sphinx-9367", - "sphinx-doc__sphinx-9461", - "sphinx-doc__sphinx-9591", - "sphinx-doc__sphinx-9602", - "sphinx-doc__sphinx-9658", - "sphinx-doc__sphinx-9673", - "sphinx-doc__sphinx-9698", - "sphinx-doc__sphinx-9711", - "sympy__sympy-11618", - "sympy__sympy-12096", - "sympy__sympy-12419", - "sympy__sympy-12481", - "sympy__sympy-12489", - "sympy__sympy-13031", - "sympy__sympy-13091", - "sympy__sympy-13372", - "sympy__sympy-13480", - "sympy__sympy-13551", - "sympy__sympy-13615", - "sympy__sympy-13647", - "sympy__sympy-13757", - "sympy__sympy-13798", - "sympy__sympy-13852", - "sympy__sympy-13877", - "sympy__sympy-13878", - "sympy__sympy-13974", - "sympy__sympy-14248", - "sympy__sympy-14531", - "sympy__sympy-14711", 
- "sympy__sympy-14976", - "sympy__sympy-15017", - "sympy__sympy-15345", - "sympy__sympy-15349", - "sympy__sympy-15599", - "sympy__sympy-15809", - "sympy__sympy-15875", - "sympy__sympy-15976", - "sympy__sympy-16450", - "sympy__sympy-16597", - "sympy__sympy-16766", - "sympy__sympy-16792", - "sympy__sympy-16886", - "sympy__sympy-17139", - "sympy__sympy-17318", - "sympy__sympy-17630", - "sympy__sympy-17655", - "sympy__sympy-18189", - "sympy__sympy-18199", - "sympy__sympy-18211", - "sympy__sympy-18698", - "sympy__sympy-18763", - "sympy__sympy-19040", - "sympy__sympy-19346", - "sympy__sympy-19495", - "sympy__sympy-19637", - "sympy__sympy-19783", - "sympy__sympy-19954", - "sympy__sympy-20154", - "sympy__sympy-20428", - "sympy__sympy-20438", - "sympy__sympy-20590", - "sympy__sympy-20801", - "sympy__sympy-20916", - "sympy__sympy-21379", - "sympy__sympy-21596", - "sympy__sympy-21612", - "sympy__sympy-21847", - "sympy__sympy-21930", - "sympy__sympy-22080", - "sympy__sympy-22456", - "sympy__sympy-22714", - "sympy__sympy-22914", - "sympy__sympy-23262", - "sympy__sympy-23413", - "sympy__sympy-23534", - "sympy__sympy-23824", - "sympy__sympy-23950", - "sympy__sympy-24066", - "sympy__sympy-24213", - "sympy__sympy-24443", - "sympy__sympy-24539", - "sympy__sympy-24562", - "sympy__sympy-24661" - ], - "empty_patch_ids": [ - "astropy__astropy-13453" - ], - "submitted_ids": [ - "astropy__astropy-13453" - ], - "resolved_ids": [], - "unresolved_ids": [], - "error_ids": [], - "schema_version": 2 -} From 0d1231101247719168c2354ebef95a58fcac8239 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Thu, 16 Oct 2025 17:19:39 -0700 Subject: [PATCH 04/10] addressing dereks comments --- examples/swebench/README.md | 309 ++++++++++++++++++++--- examples/swebench/run_swe_agent_fw.py | 97 +------ examples/swebench/server.py | 26 +- examples/swebench/tests/conftest.py | 34 --- examples/swebench/tests/test_swebench.py | 226 ++++------------- examples/swebench/tracing_model.py | 89 ++++++- pyproject.toml | 4 + 7 files changed, 423 insertions(+), 362 deletions(-) delete mode 100644 examples/swebench/tests/conftest.py diff --git a/examples/swebench/README.md b/examples/swebench/README.md index f4082f76..a696ed6f 100644 --- a/examples/swebench/README.md +++ b/examples/swebench/README.md @@ -1,57 +1,300 @@ -SWE-bench (Remote) - Local (non-Docker) Setup and Usage +# SWE-bench Evaluation Example -Prerequisites -- Python 3.12 environment (same one you use for this repo) -- Fireworks API key -- mini-swe-agent and datasets (for patch generation) -- SWE-bench harness installed (for evaluation) +This example shows how to evaluate LLM models on the SWE-bench software engineering benchmark using eval-protocol. -Setup mini-swe-agent (non-Docker) -1) Install dependencies -```bash -pip install mini-swe-agent datasets -``` +## Quick Start + +### 1. Install Dependencies -2) Configure API key for mini-swe-agent ```bash -mini-extra config set FIREWORKS_API_KEY +# From the python-sdk repository root +cd python-sdk + +# Install eval-protocol with swebench support +pip install -e ".[swebench]" ``` -3) (Optional) Test connectivity +### 2. Set up mini-swe-agent + +mini-swe-agent requires a Fireworks API key to function: + ```bash -python3 examples/swebench/run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905 --test +# Configure API key for mini-swe-agent +mini-extra config set FIREWORKS_API_KEY your_fireworks_api_key + +# Verify it's set +mini-extra config get FIREWORKS_API_KEY ``` -Install SWE-bench evaluation harness +### 3. 
Install SWE-bench Harness + ```bash +# Navigate to the swebench example directory +cd examples/swebench + +# Clone and install SWE-bench git clone https://github.com/princeton-nlp/SWE-bench pip install -e SWE-bench ``` -Environment +### 4. Set Environment Variables + +```bash +export FIREWORKS_API_KEY="your_fireworks_api_key" +``` + +## Running the Evaluation + +**IMPORTANT:** Always run both the server and tests from the `examples/swebench/` directory. + +### Step 1: Start the Server + +Open a terminal and run: + +```bash +cd examples/swebench +python server.py +``` + +You should see: +``` +INFO: Uvicorn running on http://127.0.0.1:3000 (Press CTRL+C to quit) +``` + +### Step 2: Configure Your Test + +Edit `tests/test_swebench.py` to set your model and parameters: + +```python +completion_params=[{ + "model": "accounts/fireworks/models/your-model-name", # Edit this + "model_kwargs": { + "temperature": 0.2, # Optional + # "max_tokens": 2048, # Optional + # "reasoning": "high", # Optional + } +}], +max_concurrent_rollouts=3, # How many instances to run in parallel +``` + +To test different numbers of instances, edit line 26: +```python +def rows() -> List[EvaluationRow]: + return rows_from_indices(2) # Change 2 to desired number (max 500) +``` + +### Step 3: Run the Test + +Open a second terminal: + +```bash +cd examples/swebench +pytest tests/test_swebench.py -v -s +``` + +## What Happens During a Run + +For each instance (row): + +1. **Server receives request** from pytest +2. **Wrapper script** (`run_swe_agent_fw.py`) is called with the instance index +3. **mini-swe-agent** runs in a Docker container for that specific repository +4. **Agent attempts to solve** the issue by editing code +5. **Patch is generated** and saved to `preds.json` +6. **SWE-bench harness** applies the patch and runs tests +7. **Results** are written to the row directory +8. **Test fetches results** and displays pass/fail in the UI + +## Understanding the Output + +### Directory Structure + +Each instance creates its own `row_N/` directory: + +``` +examples/swebench/ +├── row_0/ # First instance +│ ├── preds.json # ← Model's generated patch +│ ├── astropy__astropy-12907/ # Instance-specific folder +│ │ └── astropy__astropy-12907.traj.json # Agent's execution trace +│ ├── logs/ # Harness execution logs +│ │ └── run_evaluation/ +│ │ └── eval-run/ +│ │ └── / +│ │ └── astropy__astropy-12907/ +│ │ ├── report.json # ← Test results (pass/fail) +│ │ ├── test_output.txt # Test execution output +│ │ ├── patch.diff # Applied patch +│ │ └── eval.sh # Evaluation script +│ ├── agent_0.log # Agent console output +│ ├── exit_statuses_*.yaml # Exit status if failed +│ └── .eval-run.json # Overall run summary +├── row_1/ # Second instance +│ └── ... +└── ... +``` + +### Key Files Explained + +#### `preds.json` - Model Predictions +Location: `row_N/preds.json` + +Contains the patch generated by the model: +```json +{ + "astropy__astropy-12907": { + "model_name_or_path": "accounts/fireworks/models/...", + "instance_id": "astropy__astropy-12907", + "model_patch": "diff --git a/... (the actual patch)" + } +} +``` + +**If missing:** Agent failed before generating a patch (check `exit_statuses_*.yaml`) + +#### `report.json` - Test Results +Location: `row_N/logs/run_evaluation/eval-run///report.json` + +Contains pass/fail status after running tests: +```json +{ + "astropy__astropy-12907": { + "patch_is_None": false, + "patch_exists": true, + "patch_successfully_applied": true, + "resolved": true, // ← Was the issue fixed? 
+ "tests_status": { + "FAIL_TO_PASS": {"success": [...], "failure": []}, + "PASS_TO_PASS": {"success": [...], "failure": []} + } + } +} +``` + +- `resolved: true` = Instance solved! All required tests pass. +- `resolved: false` = Instance not solved (tests still failing) + +**If missing:** Agent didn't generate a patch or harness didn't run + +#### `exit_statuses_*.yaml` - Why Runs Failed +Location: `row_N/exit_statuses_*.yaml` + +```yaml +instances_by_exit_status: + Submitted: [] + LimitsExceeded: ["astropy__astropy-12907"] # Hit step/cost limits + Error: [] +``` + +Common statuses: +- `Submitted`: Completed normally +- `LimitsExceeded`: Agent hit max steps or cost limit +- `Error`: Unexpected error during execution + +#### `agent_N.log` - Agent Execution +Location: `row_N/agent_N.log` + +Full console output from the agent run, including: +- Docker container startup +- Model API calls +- Commands executed +- Errors (if any) + +#### `*.traj.json` - Agent Trajectory +Location: `row_N//.traj.json` + +Complete record of the agent's execution: +```json +{ + "instance_id": "astropy__astropy-12907", + "info": { + "submission": "...", // The patch + "exit_status": "Submitted", + "model_stats": { + "instance_cost": 0.05, + "api_calls": 15 + } + }, + "messages": [...] // All agent messages +} +``` + +## Viewing Results + +### In the Terminal + +The test output shows: +``` +INFO:test_swebench:[Row 0] Found instance_id: astropy__astropy-12907 +INFO:test_swebench:[Row 0] Report says resolved=True +INFO:test_swebench:[Row 0] Final: resolved=True, reason=harness_resolved=True +``` + +### In the Eval Protocol UI + +If Elasticsearch is running, visit: `http://localhost:8000` +- View aggregate scores +- Inspect individual trajectories +- Filter by resolved/unresolved +- See cost and token usage + +### Check Individual Files + ```bash -export FIREWORKS_API_KEY="" +# Check if instance was solved +cat row_0/logs/run_evaluation/eval-run//astropy__astropy-12907/report.json | jq '.["astropy__astropy-12907"].resolved' + +# View the generated patch +cat row_0/preds.json | jq '.["astropy__astropy-12907"].model_patch' + +# Check exit status +cat row_0/exit_statuses_*.yaml ``` -Run the server +## Performance Notes + +- **Small test (2 instances):** ~10-30 minutes +- **Full dataset (500 instances):** 24-48 hours on a 16-core machine +- **Concurrent runs:** Recommended 3-5 based on CPU/memory +- **Docker space:** ~100GB for all images (downloads happen automatically) + +## Troubleshooting + +### Docker container fails to start ```bash -python examples/swebench/server.py +# Check Docker is running +docker ps + +# Check disk space +df -h ``` -What the server does -- Invokes `run_swe_agent_fw.py` in batch mode with a single-slice per request -- Writes outputs to a per-row directory: `./row_{index}/` - - `row_{index}/preds.json` - - `row_{index}//.traj.json` -- Runs the SWE-bench harness on `row_{index}/preds.json` +### Agent hits step limits +Instances that consistently hit limits may need: +- Higher step limit (edit mini-swe-agent config) +- Different prompting strategy +- More capable model -Run pytest to evaluate a model on SWE-bench +### Server not responding ```bash -cd /Users/shrey/Documents/python-sdk -pytest examples/swebench/tests/test_swebench.py -v -s +# Check server is running +curl http://127.0.0.1:3000/status?rollout_id=test + +# Check server logs for errors +# (shown in terminal where server.py is running) ``` -Notes -- The test currently generates 10 rows by numeric index (0–9) -- Each request triggers the 
server to run one SWE-bench instance and write to its own `row_{index}` -- Control harness workers via: `export SWEBENCH_EVAL_WORKERS=5` +## Next Steps + +- Review results in `row_*/logs/.../report.json` +- Analyze failed instances to improve your model +- Run on larger subsets to get statistical significance +- Export results for further analysis + +## Support + +For issues: +- Check agent logs: `row_N/agent_N.log` +- Check exit statuses: `row_N/exit_statuses_*.yaml` +- Verify Docker has sufficient resources +- Ensure API key is valid and has credits diff --git a/examples/swebench/run_swe_agent_fw.py b/examples/swebench/run_swe_agent_fw.py index cef3de9a..4d145038 100755 --- a/examples/swebench/run_swe_agent_fw.py +++ b/examples/swebench/run_swe_agent_fw.py @@ -12,14 +12,6 @@ Usage: python run_swe_agent_fw.py [options] -Examples: - # Serverless models - python run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/kimi-k2-instruct --instances 10 --workers 5 - python run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/llama-v3p1-70b-instruct --subset full --workers 8 - - # Deployed models - python run_swe_agent_fw.py fireworks_ai/accounts/cognition/deployedModels/swe-1-mtp-tc1huggf --single 0 - python run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/kimi-k2-instruct --test Requirements: - mini-swe-agent: pip install mini-swe-agent @@ -39,94 +31,11 @@ import litellm -class FireworksCompatibleModel(LitellmModel): - """ - Fireworks-compatible wrapper for LitellmModel. - """ - - def __init__(self, **kwargs): - if model_id := os.environ.get("FIREWORKS_MODEL_ID"): - kwargs["model_name"] = model_id - print(f"kwargs: {kwargs}") - if "model_kwargs" not in kwargs: - kwargs["model_kwargs"] = {} - - # CRITICAL: Set drop_params to False so stop sequences aren't stripped! 
- kwargs["model_kwargs"]["drop_params"] = False - - # Get existing stop sequences - existing_stop = kwargs["model_kwargs"].get("stop", []) - if isinstance(existing_stop, str): - existing_stop = [existing_stop] - elif existing_stop is None: - existing_stop = [] - - # Add stop sequences (only the non-natural ones) - stop_sequences = existing_stop + [ - # ASCII versions - "<|User|>", - "<|Assistant|>", - # Full-width PIPE versions (U+FF5C) - "<|User|>", # \uff5c - "<|Assistant|>", - "```<|", - "<|User", - "<|Ass", - # Full-width LETTER L versions (U+FF4C) - "<lUser|>", # \uff4c - "<lAssistant|>", - "```<l", - "<lUser", - "<lAss", - ] - kwargs["model_kwargs"]["stop"] = stop_sequences - kwargs["model_kwargs"]["max_tokens"] = 1024 # Reduce to 1024 to save tokens - - if "temperature" not in kwargs["model_kwargs"]: - kwargs["model_kwargs"]["temperature"] = 0.0 - - # Apply per-run overrides injected by the wrapper (no environment variables) - overrides = globals().get("WRAPPER_MODEL_OVERRIDES") - if isinstance(overrides, dict): - if overrides.get("reasoning") in ("low", "medium", "high"): - kwargs["model_kwargs"]["reasoning_effort"] = overrides["reasoning"] - if overrides.get("temperature") is not None: - try: - kwargs["model_kwargs"]["temperature"] = float(overrides["temperature"]) - except Exception: - pass - if overrides.get("max_tokens") is not None: - try: - kwargs["model_kwargs"]["max_tokens"] = int(overrides["max_tokens"]) - except Exception: - pass - - super().__init__(**kwargs) - - def _query(self, messages: list[dict[str, str]], **kwargs): - """Remove non-standard fields before sending to Fireworks API.""" - # Keep only standard OpenAI-compatible fields - clean_messages = [] - for msg in messages: - clean_msg = {"role": msg["role"], "content": msg["content"]} - if "tool_calls" in msg: - clean_msg["tool_calls"] = msg["tool_calls"] - if "name" in msg: - clean_msg["name"] = msg["name"] - clean_messages.append(clean_msg) - - # IMPORTANT: Ensure drop_params stays False in the actual query - kwargs_with_stop = kwargs.copy() - if "drop_params" not in kwargs_with_stop: - kwargs_with_stop["drop_params"] = False - - return super()._query(clean_messages, **kwargs_with_stop) - - def __get_api_key(): """Get Fireworks API key from environment or mini-swe-agent config.""" # Environment variable takes precedence - if api_key := os.environ.get("FIREWORKS_API_KEY"): + api_key = os.environ.get("FIREWORKS_API_KEY") + if api_key: return api_key # Try to get API key from mini-swe-agent's config system @@ -213,7 +122,7 @@ def __build_command(args, wrapper_module_path): "--model", args.model_id, "--model-class", - model_class, + "tracing_model.FireworksCompatibleModel", "--subset", args.subset, "--split", diff --git a/examples/swebench/server.py b/examples/swebench/server.py index ae3df983..01063645 100644 --- a/examples/swebench/server.py +++ b/examples/swebench/server.py @@ -51,7 +51,10 @@ def _worker(): env["FIREWORKS_API_KEY"] = os.environ["FIREWORKS_API_KEY"] # Make sure the tracing model module is importable by the subprocess # so "tracing_model.TracingFireworksModel" can be imported - env["PYTHONPATH"] = "/Users/shrey/Documents/python-sdk/examples/swebench:" + env.get("PYTHONPATH", "") + from pathlib import Path + + script_dir = Path(__file__).parent + env["PYTHONPATH"] = f"{script_dir}:{env.get('PYTHONPATH', '')}" # Determine output directory (from env or default) out_dir = os.getcwd() @@ -62,15 +65,17 @@ def _worker(): # Extract model_kwargs from req.metadata (forwarded from input_metadata) model_kwargs 
= {} - logger.info(f"DEBUG: req.metadata attributes: {dir(req.metadata)}") + # convert to logger.debug everywhere, remove debug then + logger.debug(f"req.metadata attributes: {dir(req.metadata)}") + if hasattr(req.metadata, "model_kwargs"): mk = getattr(req.metadata, "model_kwargs", None) - logger.info(f"DEBUG: Found req.metadata.model_kwargs = {mk}") + logger.debug(f"Found req.metadata.model_kwargs = {mk}") if isinstance(mk, dict): model_kwargs = mk - logger.info(f"Extracted model_kwargs from metadata: {model_kwargs}") + logger.debug(f"Extracted model_kwargs from metadata: {model_kwargs}") else: - logger.info("DEBUG: req.metadata has NO model_kwargs attribute") + logger.debug("req.metadata has NO model_kwargs attribute") # Set tracing URL if req.model_base_url: @@ -114,16 +119,6 @@ def _worker(): ) ret = proc.wait() - # Stream stdout/stderr to logs - # assert proc.stdout is not None and proc.stderr is not None - # for line in proc.stdout: - # logger.info(line.rstrip("\n")) - # for line in proc.stderr: - # logger.warning(line.rstrip("\n")) - - # ret = proc.wait() - # logger.info(f"mini-swe-agent exited with code {ret}") - # Use row-specific preds.json to avoid cross-run interference preds_path = row_dir / "preds.json" if preds_path.exists(): @@ -154,7 +149,6 @@ def _worker(): for line in eval_proc.stdout: logger.info(line.rstrip("\n")) eval_rc = eval_proc.wait() - # logger.info(f"SWE-bench harness exited with code {eval_rc}") except Exception as e: # Best-effort: mark error but still finish to unblock polling diff --git a/examples/swebench/tests/conftest.py b/examples/swebench/tests/conftest.py deleted file mode 100644 index 3b81f7ad..00000000 --- a/examples/swebench/tests/conftest.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -import pytest - - -MODEL_ID_OPT = None -CONCURRENCY_OPT = None -MODEL_KWARGS_OPT = None - - -def pytest_addoption(parser): - parser.addoption("--model-id", action="store", default=None, help="Fireworks model ID") - parser.addoption("--concurrent-workers", action="store", type=int, default=None, help="Max concurrent rollouts") - parser.addoption("--temperature", action="store", type=float, default=None, help="Model temperature") - parser.addoption("--max-tokens", action="store", type=int, default=None, help="Max tokens") - parser.addoption( - "--reasoning", action="store", choices=["low", "medium", "high"], default=None, help="Reasoning effort" - ) - - -def pytest_configure(config): - global MODEL_ID_OPT, CONCURRENCY_OPT, MODEL_KWARGS_OPT - MODEL_ID_OPT = config.getoption("--model-id") - CONCURRENCY_OPT = config.getoption("--concurrent-workers") - temp = config.getoption("--temperature") - mtok = config.getoption("--max-tokens") - reas = config.getoption("--reasoning") - mk = {} - if temp is not None: - mk["temperature"] = float(temp) - if mtok is not None: - mk["max_tokens"] = int(mtok) - if reas is not None: - mk["reasoning"] = reas - MODEL_KWARGS_OPT = mk or None diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py index 48c130a9..6e2410a6 100644 --- a/examples/swebench/tests/test_swebench.py +++ b/examples/swebench/tests/test_swebench.py @@ -1,132 +1,24 @@ from typing import List -import os -import pytest -import requests import yaml from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor 
-from eval_protocol.types.remote_rollout_processor import DataLoaderConfig -from eval_protocol.quickstart.utils import filter_longest_conversation - -# Reuse the converter used by the built-in adapter -from eval_protocol.adapters.fireworks_tracing import convert_trace_dict_to_evaluation_row -import conftest - - -MODEL_ID = conftest.MODEL_ID_OPT -if not MODEL_ID: - raise RuntimeError("--model-id is required. Example: --model-id 'fireworks_ai/accounts/.../models/'") -CLI_CONCURRENCY = conftest.CONCURRENCY_OPT -CLI_MODEL_KWARGS = conftest.MODEL_KWARGS_OPT - -# Build completion_params once (used by decorator) -COMPLETION_PARAMS = {"model": MODEL_ID} -if CLI_MODEL_KWARGS: - COMPLETION_PARAMS["model_kwargs"] = CLI_MODEL_KWARGS - - -def fetch_traces_with_auth(config: DataLoaderConfig) -> List[EvaluationRow]: - """ - Fetch traces directly from the Fireworks tracing proxy with Authorization header - and convert them into EvaluationRows using the same converter as the adapter. - """ - base_url = (config.model_base_url or "https://tracing.fireworks.ai").rstrip("/") - api_key = os.environ.get("FIREWORKS_API_KEY") - if not api_key: - return [] - - url = f"{base_url}/v1/traces" - headers = {"Authorization": f"Bearer {api_key}"} - params = { - "tags": [f"rollout_id:{config.rollout_id}"], - "max_retries": 5, - "sleep_between_gets": 0.1, - } - - try: - resp = requests.get(url, params=params, headers=headers, timeout=300) - print(f"[fetch_traces] status={resp.status_code} url={resp.url}") # debug - resp.raise_for_status() - body = resp.json() or {} - traces = body.get("traces", []) - print(f"[fetch_traces] traces_found={len(traces)}") - except Exception as e: - print(f"[fetch_traces] error={e}") - return [] - - rows: List[EvaluationRow] = [] - for tr in traces: - row = convert_trace_dict_to_evaluation_row(tr, include_tool_calls=True, span_name=None) - if row: - rows.append(row) - return rows - - -def _merge_rows_into_one(rows: List[EvaluationRow]) -> List[EvaluationRow]: - if not rows: - return [] - # Use the first row as the base; merge messages from all rows - base = rows[0] - seen = set() - merged_msgs: List[Message] = [] - for r in rows: - for m in r.messages or []: - # Dedup by role+name+content+tool_calls signature - tool_sig = None - if getattr(m, "tool_calls", None): - tool_sig = tuple( - (tc.get("id"), tc.get("type"), (tc.get("function") or {}).get("name")) for tc in m.tool_calls - ) - key = (m.role, getattr(m, "name", None), m.content, tool_sig) - if key in seen: - continue - seen.add(key) - merged_msgs.append(m) - base.messages = merged_msgs - return [base] - - -def fireworks_output_data_loader(config: DataLoaderConfig) -> DynamicDataLoader: - return DynamicDataLoader( - generators=[lambda: fetch_traces_with_auth(config)], - preprocess_fn=_merge_rows_into_one, # merge all tool/LLM traces into one row - ) - - -def rows_from_instance_ids(ids: list[str]) -> List[EvaluationRow]: - out = [] - for idx, iid in enumerate(ids): - out.append( - EvaluationRow( - messages=[Message(role="user", content=f"Run SWE-bench instance {iid}")], - input_metadata={ - "row_id": str(idx), # ← use instance_id here - "instance_id": iid, # ← explicit for debugging - "instance_index": str(idx), # ← optional: keep index - "completion_params": {"model": MODEL_ID}, - }, - ) - ) - return out +from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader +import json +from pathlib import Path def rows_from_indices(count: int) -> List[EvaluationRow]: out: List[EvaluationRow] = [] for idx in range(count): - 
metadata = { - "row_id": str(idx), - "instance_index": str(idx), - } - # Add model_kwargs to metadata so server can read from req.metadata - if CLI_MODEL_KWARGS: - metadata["model_kwargs"] = CLI_MODEL_KWARGS - out.append( EvaluationRow( - messages=[Message(role="user", content=f"Run SWE-bench index {idx}")], - input_metadata=metadata, + messages=[], + input_metadata={ + "row_id": str(idx), + "instance_index": str(idx), + }, ) ) return out @@ -134,33 +26,33 @@ def rows_from_indices(count: int) -> List[EvaluationRow]: def rows() -> List[EvaluationRow]: # Generate 10 rows by index; server maps index -> dataset instance via --slice - return rows_from_indices(10) + return rows_from_indices(2) # -------------------- Harness result attachment (UI pass/fail) -------------------- -import json -from pathlib import Path - - -def _safe_model_id(model_id: str) -> str: - return model_id.replace("/", "__").replace(":", "-") - - -def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: - """Attach evaluation result by reading harness report or exit status.""" - import logging - - logger = logging.getLogger(__name__) +@evaluation_test( + data_loaders=DynamicDataLoader( + generators=[rows], + ), + rollout_processor=RemoteRolloutProcessor( + remote_base_url="http://127.0.0.1:3000", + model_base_url="https://tracing.fireworks.ai", + timeout_seconds=1800, + output_data_loader=default_fireworks_output_data_loader, + ), + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], + max_concurrent_rollouts=3, +) +async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: + """Evaluate SWE-bench instance by reading harness report or exit status.""" - # Get row_id and instance_id + # Get row_id try: - row_id = str(row.input_metadata.row_id) # ← use attribute, not .get() - except Exception as e: - logger.warning(f"Could not get row_id: {e}") + row_id = str(row.input_metadata.row_id) + except Exception: return row row_dir = Path.cwd() / f"row_{row_id}" - logger.info(f"[Row {row_id}] Looking for results in {row_dir}") # Find instance_id from preds.json preds_path = row_dir / "preds.json" @@ -169,57 +61,48 @@ def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: try: preds = json.loads(preds_path.read_text()) instance_id = next(iter(preds.keys()), None) - logger.info(f"[Row {row_id}] Found instance_id: {instance_id}") - except Exception as e: - logger.warning(f"[Row {row_id}] Could not read preds.json: {e}") + except Exception: + pass if not instance_id: - logger.warning(f"[Row {row_id}] No instance_id found, skipping eval result") return row resolved: bool | None = None reason_text: str | None = None - # 1. 
Try to read from report.json (harness ran tests) - safe_model = _safe_model_id(model_id) - report_path = row_dir / "logs" / "run_evaluation" / "eval-run" / safe_model / instance_id / "report.json" + # Get model from completion_params and convert to safe directory name (matching SWE-bench convention) + model_id = row.input_metadata.completion_params.get("model") if row.input_metadata.completion_params else None + if not model_id: + return row + safe_model = model_id.replace("/", "__").replace(":", "-") + # Read from report.json (harness ran tests) + report_path = row_dir / "logs" / "run_evaluation" / "eval-run" / safe_model / instance_id / "report.json" if report_path.exists(): - logger.info(f"[Row {row_id}] Found report.json at {report_path}") try: report_data = json.loads(report_path.read_text()) - instance_data = report_data.get(instance_id, {}) - resolved = bool(instance_data.get("resolved", False)) + resolved = bool(report_data.get(instance_id, {}).get("resolved", False)) reason_text = f"harness_resolved={resolved}" - logger.info(f"[Row {row_id}] Report says resolved={resolved}") - except Exception as e: - logger.error(f"[Row {row_id}] Failed to parse report.json: {e}") - else: - logger.info(f"[Row {row_id}] No report.json found at {report_path}") + except Exception: + pass - # 2. If no report, check exit status YAML (agent didn't produce a patch) + # If no report, check exit status YAML if resolved is None: exit_status_files = sorted(row_dir.glob("exit_statuses_*.yaml")) if exit_status_files: - exit_file = exit_status_files[-1] - logger.info(f"[Row {row_id}] Reading exit status from {exit_file.name}") try: - status_doc = yaml.safe_load(exit_file.read_text()) or {} + status_doc = yaml.safe_load(exit_status_files[-1].read_text()) or {} by_status = status_doc.get("instances_by_exit_status", {}) for status_name, ids in by_status.items(): if instance_id in (ids or []): resolved = False reason_text = f"exit_status={status_name}" - logger.info(f"[Row {row_id}] Exit status: {status_name}") break - except Exception as e: - logger.error(f"[Row {row_id}] Failed to parse exit status: {e}") - else: - logger.warning(f"[Row {row_id}] No exit status YAML found") + except Exception: + pass - # 3. 
Attach result if we found anything + # Attach result if resolved is not None: - logger.info(f"[Row {row_id}] Final: resolved={resolved}, reason={reason_text}") row.evaluation_result = EvaluateResult( score=1.0 if resolved else 0.0, reason=reason_text or f"resolved={resolved}", @@ -233,26 +116,5 @@ def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: ) }, ) - else: - logger.warning(f"[Row {row_id}] Could not determine resolved status") - - return row - -@evaluation_test( - data_loaders=DynamicDataLoader( - generators=[rows], - ), - rollout_processor=RemoteRolloutProcessor( - remote_base_url="http://127.0.0.1:3000", - model_base_url="https://tracing.fireworks.ai", - timeout_seconds=1800, - output_data_loader=fireworks_output_data_loader, - ), - completion_params=[COMPLETION_PARAMS], - max_concurrent_rollouts=(CLI_CONCURRENCY or 2), -) -async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: - assert len(row.messages) >= 1 - row = attach_eval_result(row, MODEL_ID) return row diff --git a/examples/swebench/tracing_model.py b/examples/swebench/tracing_model.py index 8862388c..6f482efd 100644 --- a/examples/swebench/tracing_model.py +++ b/examples/swebench/tracing_model.py @@ -5,9 +5,92 @@ import sys import os -sys.path.insert(0, "/Users/shrey/Documents/cookbook-internal/recipes/eval/swe_bench") - -from run_swe_agent_fw import FireworksCompatibleModel +from minisweagent.models.litellm_model import LitellmModel + + +class FireworksCompatibleModel(LitellmModel): + """ + Fireworks-compatible wrapper for LitellmModel. + """ + + def __init__(self, **kwargs): + model_id = os.environ.get("FIREWORKS_MODEL_ID") + if model_id: + kwargs["model_name"] = model_id + + if "model_kwargs" not in kwargs: + kwargs["model_kwargs"] = {} + + # CRITICAL: Set drop_params to False so stop sequences aren't stripped! 
+ kwargs["model_kwargs"]["drop_params"] = False + + # Get existing stop sequences + existing_stop = kwargs["model_kwargs"].get("stop", []) + if isinstance(existing_stop, str): + existing_stop = [existing_stop] + elif existing_stop is None: + existing_stop = [] + + # Add stop sequences (only the non-natural ones) + # stop_sequences = existing_stop + [ + # # ASCII versions + # "<|User|>", + # "<|Assistant|>", + # # Full-width PIPE versions (U+FF5C) + # "<|User|>", # \uff5c + # "<|Assistant|>", + # "```<|", + # "<|User", + # "<|Ass", + # # Full-width LETTER L versions (U+FF4C) + # "<lUser|>", # \uff4c + # "<lAssistant|>", + # "```<l", + # "<lUser", + # "<lAss", + # ] + # kwargs["model_kwargs"]["stop"] = stop_sequences + kwargs["model_kwargs"]["max_tokens"] = 1024 # Reduce to 1024 to save tokens + + if "temperature" not in kwargs["model_kwargs"]: + kwargs["model_kwargs"]["temperature"] = 0.0 + + # Apply per-run overrides injected by the wrapper (no environment variables) + overrides = globals().get("WRAPPER_MODEL_OVERRIDES") + if isinstance(overrides, dict): + if overrides.get("reasoning") in ("low", "medium", "high"): + kwargs["model_kwargs"]["reasoning_effort"] = overrides["reasoning"] + if overrides.get("temperature") is not None: + try: + kwargs["model_kwargs"]["temperature"] = float(overrides["temperature"]) + except Exception: + pass + if overrides.get("max_tokens") is not None: + try: + kwargs["model_kwargs"]["max_tokens"] = int(overrides["max_tokens"]) + except Exception: + pass + + super().__init__(**kwargs) + + def _query(self, messages: list[dict[str, str]], **kwargs): + """Remove non-standard fields before sending to Fireworks API.""" + # Keep only standard OpenAI-compatible fields + clean_messages = [] + for msg in messages: + clean_msg = {"role": msg["role"], "content": msg["content"]} + if "tool_calls" in msg: + clean_msg["tool_calls"] = msg["tool_calls"] + if "name" in msg: + clean_msg["name"] = msg["name"] + clean_messages.append(clean_msg) + + # IMPORTANT: Ensure drop_params stays False in the actual query + kwargs_with_stop = kwargs.copy() + if "drop_params" not in kwargs_with_stop: + kwargs_with_stop["drop_params"] = False + + return super()._query(clean_messages, **kwargs_with_stop) class TracingFireworksModel(FireworksCompatibleModel): diff --git a/pyproject.toml b/pyproject.toml index fd7e6961..f3981f25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,6 +97,10 @@ box2d = [ "gymnasium[box2d]>=0.29.0", "Pillow", ] +swebench = [ + "mini-swe-agent<=1.14.0", + "datasets>=4.2.0", +] langfuse = [ "langfuse>=2.0.0", ] From b16bd50a1e90a83c39f857f3504c4f459ca08bf8 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Thu, 16 Oct 2025 17:42:00 -0700 Subject: [PATCH 05/10] pyproject removal due to dependancy issue --- pyproject.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f3981f25..fd7e6961 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,10 +97,6 @@ box2d = [ "gymnasium[box2d]>=0.29.0", "Pillow", ] -swebench = [ - "mini-swe-agent<=1.14.0", - "datasets>=4.2.0", -] langfuse = [ "langfuse>=2.0.0", ] From 14c6f46a7a0af8c4ba6d9aa12280b42ffa8e8e53 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Fri, 17 Oct 2025 11:58:27 -0700 Subject: [PATCH 06/10] changepyproject.toml --- pyproject.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index fd7e6961..b50ac5b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,6 +97,11 @@ box2d = [ "gymnasium[box2d]>=0.29.0", "Pillow", ] +swebench = [ + 
"mini-swe-agent>=1.14.0", + "datasets>=2.0.0", + "litellm>=1.75.0", # Note: Overrides core litellm<1.75.0 for swebench compatibility +] langfuse = [ "langfuse>=2.0.0", ] From 47ef37b885dbb86c7e06f9c573ce9d74dab5f2f8 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Fri, 17 Oct 2025 15:53:46 -0700 Subject: [PATCH 07/10] added sandboxing of runs and remote server support --- examples/swebench/server.py | 64 ++++++++++++-- examples/swebench/tests/test_swebench.py | 102 ++++++++--------------- 2 files changed, 93 insertions(+), 73 deletions(-) diff --git a/examples/swebench/server.py b/examples/swebench/server.py index 01063645..0928c5f9 100644 --- a/examples/swebench/server.py +++ b/examples/swebench/server.py @@ -56,11 +56,13 @@ def _worker(): script_dir = Path(__file__).parent env["PYTHONPATH"] = f"{script_dir}:{env.get('PYTHONPATH', '')}" - # Determine output directory (from env or default) - out_dir = os.getcwd() - + # Sandbox by invocation_id to isolate concurrent test runs from pathlib import Path + invocation_id = req.metadata.invocation_id + base_dir = Path(os.getcwd()) / invocation_id + base_dir.mkdir(parents=True, exist_ok=True) + script_path = str((Path(__file__).parent / "run_swe_agent_fw.py").resolve()) # Extract model_kwargs from req.metadata (forwarded from input_metadata) @@ -89,7 +91,7 @@ def _worker(): str(single_index), "--exit-immediately", "--output", - str(out_dir), + str(base_dir), "--model-class", "tracing_model.TracingFireworksModel", ] @@ -103,7 +105,7 @@ def _worker(): import json # Log path inside row directory for this run - row_dir = Path(out_dir) / f"row_{single_index}" + row_dir = base_dir / f"row_{single_index}" row_dir.mkdir(parents=True, exist_ok=True) log_path = row_dir / f"agent_{single_index}.log" @@ -150,12 +152,60 @@ def _worker(): logger.info(line.rstrip("\n")) eval_rc = eval_proc.wait() + # Collect evaluation results to send via Elasticsearch + import yaml + + instance_id = None + resolved = None + + if preds_path.exists(): + try: + preds = json.loads(preds_path.read_text()) + instance_id = next(iter(preds.keys()), None) + except Exception: + pass + + if instance_id: + model_id = req.completion_params.get("model") if req.completion_params else None + if model_id: + safe_model = model_id.replace("/", "__").replace(":", "-") + report_path = ( + row_dir / "logs" / "run_evaluation" / "eval-run" / safe_model / instance_id / "report.json" + ) + + if report_path.exists(): + try: + report_data = json.loads(report_path.read_text()) + resolved = bool(report_data.get(instance_id, {}).get("resolved", False)) + except Exception: + pass + + if resolved is None: + exit_files = sorted(row_dir.glob("exit_statuses_*.yaml")) + if exit_files: + try: + status_doc = yaml.safe_load(exit_files[-1].read_text()) or {} + by_status = status_doc.get("instances_by_exit_status", {}) + for status_name, ids in by_status.items(): + if instance_id in (ids or []): + resolved = False + break + except Exception: + pass + + results_data = { + "instance_id": instance_id, + "resolved": resolved, + "row_id": str(single_index), + } + except Exception as e: # Best-effort: mark error but still finish to unblock polling + results_data = {"error": str(e), "row_id": str(single_index)} logger.error(f"Rollout error: {e}", extra={"status": Status.rollout_error(str(e))}) finally: - # Always mark finished so RemoteRolloutProcessor stops polling - logger.info("Rollout completed", extra={"status": Status.rollout_finished()}) + # Log results and mark finished + logger.info("Evaluation results", 
extra={"results": results_data, "status": Status.rollout_finished()}) threading.Thread(target=_worker, daemon=True).start() return {"status": "accepted"} diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py index 6e2410a6..6dd09f63 100644 --- a/examples/swebench/tests/test_swebench.py +++ b/examples/swebench/tests/test_swebench.py @@ -1,12 +1,9 @@ from typing import List -import yaml from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader -from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult +from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult from eval_protocol.pytest import evaluation_test -from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor +from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor, create_elasticsearch_config_from_env from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader -import json -from pathlib import Path def rows_from_indices(count: int) -> List[EvaluationRow]: @@ -39,82 +36,55 @@ def rows() -> List[EvaluationRow]: model_base_url="https://tracing.fireworks.ai", timeout_seconds=1800, output_data_loader=default_fireworks_output_data_loader, + disable_elastic_search_setup=True, + elastic_search_config=create_elasticsearch_config_from_env(), ), completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_concurrent_rollouts=3, ) async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: - """Evaluate SWE-bench instance by reading harness report or exit status.""" + """Evaluate SWE-bench instance by reading results from Elasticsearch.""" + import logging - # Get row_id - try: - row_id = str(row.input_metadata.row_id) - except Exception: - return row - - row_dir = Path.cwd() / f"row_{row_id}" - - # Find instance_id from preds.json - preds_path = row_dir / "preds.json" - instance_id = None - if preds_path.exists(): - try: - preds = json.loads(preds_path.read_text()) - instance_id = next(iter(preds.keys()), None) - except Exception: - pass + logger = logging.getLogger(__name__) - if not instance_id: + rollout_id = row.execution_metadata.rollout_id + if not rollout_id: return row - resolved: bool | None = None - reason_text: str | None = None + # Query Elasticsearch for results logged by server + try: + from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient - # Get model from completion_params and convert to safe directory name (matching SWE-bench convention) - model_id = row.input_metadata.completion_params.get("model") if row.input_metadata.completion_params else None - if not model_id: - return row - safe_model = model_id.replace("/", "__").replace(":", "-") + es_config = create_elasticsearch_config_from_env() + es_client = ElasticsearchClient(es_config) - # Read from report.json (harness ran tests) - report_path = row_dir / "logs" / "run_evaluation" / "eval-run" / safe_model / instance_id / "report.json" - if report_path.exists(): - try: - report_data = json.loads(report_path.read_text()) - resolved = bool(report_data.get(instance_id, {}).get("resolved", False)) - reason_text = f"harness_resolved={resolved}" - except Exception: - pass + # Search for results log from this rollout + query = {"bool": {"must": [{"term": {"rollout_id.keyword": rollout_id}}, {"exists": {"field": "results"}}]}} - # If no report, check exit status YAML - if resolved is None: - exit_status_files = 
sorted(row_dir.glob("exit_statuses_*.yaml")) - if exit_status_files: - try: - status_doc = yaml.safe_load(exit_status_files[-1].read_text()) or {} - by_status = status_doc.get("instances_by_exit_status", {}) - for status_name, ids in by_status.items(): - if instance_id in (ids or []): - resolved = False - reason_text = f"exit_status={status_name}" - break - except Exception: - pass + search_results = es_client.es.search(index=es_config.index_name, query=query, size=1) - # Attach result - if resolved is not None: - row.evaluation_result = EvaluateResult( - score=1.0 if resolved else 0.0, - reason=reason_text or f"resolved={resolved}", - is_score_valid=True, - metrics={ - "resolved": MetricResult( + if search_results["hits"]["total"]["value"] > 0: + hit = search_results["hits"]["hits"][0]["_source"] + results_data = hit.get("results", {}) + resolved = results_data.get("resolved") + instance_id = results_data.get("instance_id") + + if resolved is not None: + row.evaluation_result = EvaluateResult( score=1.0 if resolved else 0.0, + reason=f"instance={instance_id}, resolved={resolved}", is_score_valid=True, - reason=reason_text or f"resolved={resolved}", - value=int(resolved), + metrics={ + "resolved": MetricResult( + score=1.0 if resolved else 0.0, + is_score_valid=True, + reason=f"resolved={resolved}", + value=int(resolved), + ) + }, ) - }, - ) + except Exception as e: + logger.warning(f"Could not read results from Elasticsearch: {e}") return row From e447ad679538d22e92fafd5e3365a97be97cc05f Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Sun, 19 Oct 2025 23:32:45 -0700 Subject: [PATCH 08/10] remote server changes --- examples/swebench/server.py | 51 +++++++-- examples/swebench/tests/test_swebench.py | 138 +++++++++++++++++------ pyproject.toml | 5 - 3 files changed, 145 insertions(+), 49 deletions(-) diff --git a/examples/swebench/server.py b/examples/swebench/server.py index 0928c5f9..3118a1cf 100644 --- a/examples/swebench/server.py +++ b/examples/swebench/server.py @@ -14,7 +14,7 @@ # Attach Elasticsearch handler to root logger (Eval Protocol UI) handler = ElasticsearchDirectHttpHandler() logging.getLogger().addHandler(handler) -rollout_states = {} +# rollout_states = {} @app.post("/init") @@ -27,11 +27,11 @@ def init(req: InitRequest): logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}") logger.addFilter(RolloutIdFilter(req.metadata.rollout_id)) - rollout_states[req.metadata.rollout_id] = { - "terminated": False, - "status": "running", - "instance_id": req.metadata.row_id, - } + # rollout_states[req.metadata.rollout_id] = { + # "terminated": False, + # "status": "running", + # "instance_id": req.metadata.row_id, + # } def _worker(): try: @@ -157,6 +157,7 @@ def _worker(): instance_id = None resolved = None + exit_reason = None if preds_path.exists(): try: @@ -166,7 +167,7 @@ def _worker(): pass if instance_id: - model_id = req.completion_params.get("model") if req.completion_params else None + model_id = req.model if model_id: safe_model = model_id.replace("/", "__").replace(":", "-") report_path = ( @@ -189,6 +190,7 @@ def _worker(): for status_name, ids in by_status.items(): if instance_id in (ids or []): resolved = False + exit_reason = status_name break except Exception: pass @@ -196,6 +198,7 @@ def _worker(): results_data = { "instance_id": instance_id, "resolved": resolved, + "exit_reason": exit_reason, "row_id": str(single_index), } @@ -204,16 +207,40 @@ def _worker(): results_data = {"error": str(e), "row_id": str(single_index)} logger.error(f"Rollout error: {e}", 
extra={"status": Status.rollout_error(str(e))}) finally: - # Log results and mark finished - logger.info("Evaluation results", extra={"results": results_data, "status": Status.rollout_finished()}) + # Create and log EvaluateResult in standardized format + from eval_protocol.models import EvaluateResult, MetricResult + + if resolved is not None: + reason = f"instance={instance_id}, resolved={resolved}" + if exit_reason: + reason += f", exit_reason={exit_reason}" + + eval_result = EvaluateResult( + score=1.0 if resolved else 0.0, + reason=reason, + is_score_valid=True, + metrics={ + "resolved": MetricResult( + score=1.0 if resolved else 0.0, + is_score_valid=True, + reason=f"resolved={resolved}", + value=int(resolved), + ) + }, + ) + logger.info( + f"EVAL_RESULT:{eval_result.model_dump_json()}", extra={"status": Status.rollout_finished()} + ) + else: + logger.info("EVAL_RESULT:null", extra={"status": Status.rollout_finished()}) threading.Thread(target=_worker, daemon=True).start() return {"status": "accepted"} -@app.get("/status") -def status(rollout_id: str): - return rollout_states.get(rollout_id, {"terminated": False}) +# @app.get("/status") +# def status(rollout_id: str): +# return rollout_states.get(rollout_id, {"terminated": False}) def main(): diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py index 6dd09f63..aa3f3300 100644 --- a/examples/swebench/tests/test_swebench.py +++ b/examples/swebench/tests/test_swebench.py @@ -32,7 +32,7 @@ def rows() -> List[EvaluationRow]: generators=[rows], ), rollout_processor=RemoteRolloutProcessor( - remote_base_url="http://127.0.0.1:3000", + remote_base_url="http://35.209.134.123:3000", model_base_url="https://tracing.fireworks.ai", timeout_seconds=1800, output_data_loader=default_fireworks_output_data_loader, @@ -42,49 +42,123 @@ def rows() -> List[EvaluationRow]: completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_concurrent_rollouts=3, ) -async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: - """Evaluate SWE-bench instance by reading results from Elasticsearch.""" - import logging +# async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: +# """Evaluate SWE-bench instance by reading results from Elasticsearch.""" +# import logging +# logger = logging.getLogger(__name__) + +# rollout_id = row.execution_metadata.rollout_id +# logger.info(f"[DEBUG] Processing rollout_id: {rollout_id}") + +# if not rollout_id: +# logger.warning("[DEBUG] No rollout_id, returning early") +# return row + +# try: +# from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient + +# es_config = create_elasticsearch_config_from_env() +# es_client = ElasticsearchClient(es_config) +# logger.info(f"[DEBUG] ES client created for index: {es_config.index_name}") + +# # Search for EVAL_RESULT log by message prefix +# query = {"match": {"rollout_id": rollout_id}} +# search_results = es_client.search(query=query, size=50) # Get more to find EVAL_RESULT +# logger.info(f"[DEBUG] Total logs: {search_results['hits']['total']['value']}") + +# # Filter for EVAL_RESULT in Python +# if search_results and search_results["hits"]["total"]["value"] > 0: +# for hit in search_results["hits"]["hits"]: +# message = hit["_source"].get("message", "") + +# if message.startswith("EVAL_RESULT:"): +# logger.info(f"[DEBUG] Found EVAL_RESULT message!") +# result_json = message.replace("EVAL_RESULT:", "") +# row.evaluation_result = EvaluateResult.model_validate_json(result_json) +# 
logger.info(f"[DEBUG] Attached evaluation_result: score={row.evaluation_result.score}") +# break +# else: +# logger.warning("[DEBUG] EVAL_RESULT message not found in logs") +# else: +# logger.warning("[DEBUG] No logs found for rollout") + +# logger.info(f"[DEBUG] Searching ES for EVAL_RESULT") +# import asyncio +# search_results = None +# for attempt in range(5): +# search_results = es_client.search(query=query, size=1) +# if search_results and search_results["hits"]["total"]["value"] > 0: +# logger.info(f"[DEBUG] Found result on attempt {attempt + 1}") +# break +# logger.info(f"[DEBUG] Attempt {attempt + 1}: No hits, retrying in 1s...") +# await asyncio.sleep(1) + +# logger.info(f"[DEBUG] Final: ES returned {search_results['hits']['total']['value'] if search_results else 0} hits") +# debug_query = {"match": {"rollout_id": rollout_id}} +# debug_results = es_client.search(query=debug_query, size=26) +# logger.info(f"[DEBUG] Total logs for {rollout_id}: {debug_results['hits']['total']['value']}") - logger = logging.getLogger(__name__) +# if debug_results["hits"]["total"]["value"] > 0: +# for hit in debug_results["hits"]["hits"]: +# msg = hit["_source"].get("message", "")[:80] +# logger.info(f"[DEBUG] Sample message: {msg}") +# else: +# logger.warning("[DEBUG] No logs at all for this rollout_id!") +# if search_results and search_results["hits"]["total"]["value"] > 0: +# hit = search_results["hits"]["hits"][0]["_source"] +# message = hit.get("message", "") +# logger.info(f"[DEBUG] Found message: {message[:100]}...") +# if message.startswith("EVAL_RESULT:"): +# result_json = message.replace("EVAL_RESULT:", "") +# logger.info(f"[DEBUG] Parsing EvaluateResult JSON") + +# if result_json != "null": +# # Deserialize directly to EvaluateResult +# row.evaluation_result = EvaluateResult.model_validate_json(result_json) +# logger.info(f"[DEBUG] Attached evaluation_result: score={row.evaluation_result.score}, reason={row.evaluation_result.reason}") +# else: +# logger.warning("[DEBUG] Result was null (no resolved status available)") +# else: +# logger.warning(f"[DEBUG] Message doesn't start with EVAL_RESULT: {message[:50]}") +# else: +# logger.warning("[DEBUG] No EVAL_RESULT found in Elasticsearch") + +# except Exception as e: +# logger.error(f"[DEBUG] Exception in test: {e}", exc_info=True) + +# logger.info(f"[DEBUG] Returning row, has evaluation_result: {row.evaluation_result is not None}") +# return row + + +async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: + """Evaluate SWE-bench instance by reading results from Elasticsearch.""" rollout_id = row.execution_metadata.rollout_id if not rollout_id: return row - # Query Elasticsearch for results logged by server try: from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient es_config = create_elasticsearch_config_from_env() es_client = ElasticsearchClient(es_config) - # Search for results log from this rollout - query = {"bool": {"must": [{"term": {"rollout_id.keyword": rollout_id}}, {"exists": {"field": "results"}}]}} - - search_results = es_client.es.search(index=es_config.index_name, query=query, size=1) - - if search_results["hits"]["total"]["value"] > 0: - hit = search_results["hits"]["hits"][0]["_source"] - results_data = hit.get("results", {}) - resolved = results_data.get("resolved") - instance_id = results_data.get("instance_id") - - if resolved is not None: - row.evaluation_result = EvaluateResult( - score=1.0 if resolved else 0.0, - reason=f"instance={instance_id}, resolved={resolved}", - 
is_score_valid=True, - metrics={ - "resolved": MetricResult( - score=1.0 if resolved else 0.0, - is_score_valid=True, - reason=f"resolved={resolved}", - value=int(resolved), - ) - }, - ) + # Get all logs for this rollout and find EVAL_RESULT message + query = {"match": {"rollout_id": rollout_id}} + search_results = es_client.search(query=query, size=50) + + if search_results and search_results["hits"]["total"]["value"] > 0: + for hit in search_results["hits"]["hits"]: + message = hit["_source"].get("message", "") + + if message.startswith("EVAL_RESULT:"): + result_json = message.replace("EVAL_RESULT:", "") + row.evaluation_result = EvaluateResult.model_validate_json(result_json) + break + except Exception as e: - logger.warning(f"Could not read results from Elasticsearch: {e}") + import logging + + logging.getLogger(__name__).warning(f"Could not read results from Elasticsearch: {e}") return row diff --git a/pyproject.toml b/pyproject.toml index b50ac5b8..fd7e6961 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,11 +97,6 @@ box2d = [ "gymnasium[box2d]>=0.29.0", "Pillow", ] -swebench = [ - "mini-swe-agent>=1.14.0", - "datasets>=2.0.0", - "litellm>=1.75.0", # Note: Overrides core litellm<1.75.0 for swebench compatibility -] langfuse = [ "langfuse>=2.0.0", ] From e08ca9aee3274259336feb21789a8523ea15ab6a Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Mon, 20 Oct 2025 11:07:48 -0700 Subject: [PATCH 09/10] addressed comments --- eval_protocol/utils/evaluation_row_utils.py | 25 +++++ examples/swebench/tests/test_swebench.py | 113 +------------------- examples/swebench/tracing_model.py | 35 +++++- 3 files changed, 64 insertions(+), 109 deletions(-) diff --git a/eval_protocol/utils/evaluation_row_utils.py b/eval_protocol/utils/evaluation_row_utils.py index d89f0c55..bb1e94c7 100644 --- a/eval_protocol/utils/evaluation_row_utils.py +++ b/eval_protocol/utils/evaluation_row_utils.py @@ -9,6 +9,7 @@ from typing import List from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import InputMetadata def serialize_message(msg: Message) -> str: @@ -134,3 +135,27 @@ def assistant_to_ground_truth(data: List[EvaluationRow]) -> List[EvaluationRow]: ) return processed_rows + + +def create_rows_from_indices(count: int, **metadata) -> List[EvaluationRow]: + """Create evaluation rows with sequential row_ids. + + Useful for remote processors where the server determines content based on row_id. + + Args: + count: Number of rows to create + **metadata: Additional metadata to include in each row + + Returns: + List of EvaluationRows with row_id set to "0", "1", "2", ... 
+ """ + rows = [] + for idx in range(count): + row_metadata = {"row_id": str(idx), **metadata} + rows.append( + EvaluationRow( + messages=[], + input_metadata=InputMetadata(**row_metadata), + ) + ) + return rows diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py index aa3f3300..e3ee0955 100644 --- a/examples/swebench/tests/test_swebench.py +++ b/examples/swebench/tests/test_swebench.py @@ -3,27 +3,13 @@ from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor, create_elasticsearch_config_from_env -from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader - -def rows_from_indices(count: int) -> List[EvaluationRow]: - out: List[EvaluationRow] = [] - for idx in range(count): - out.append( - EvaluationRow( - messages=[], - input_metadata={ - "row_id": str(idx), - "instance_index": str(idx), - }, - ) - ) - return out +# from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader +from eval_protocol.utils.evaluation_row_utils import create_rows_from_indices def rows() -> List[EvaluationRow]: - # Generate 10 rows by index; server maps index -> dataset instance via --slice - return rows_from_indices(2) + return create_rows_from_indices(500) # All instances # -------------------- Harness result attachment (UI pass/fail) -------------------- @@ -31,106 +17,17 @@ def rows() -> List[EvaluationRow]: data_loaders=DynamicDataLoader( generators=[rows], ), + max_dataset_rows=2, rollout_processor=RemoteRolloutProcessor( - remote_base_url="http://35.209.134.123:3000", + remote_base_url="http://127.0.0.1:3000", model_base_url="https://tracing.fireworks.ai", timeout_seconds=1800, - output_data_loader=default_fireworks_output_data_loader, disable_elastic_search_setup=True, elastic_search_config=create_elasticsearch_config_from_env(), ), completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_concurrent_rollouts=3, ) -# async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: -# """Evaluate SWE-bench instance by reading results from Elasticsearch.""" -# import logging -# logger = logging.getLogger(__name__) - -# rollout_id = row.execution_metadata.rollout_id -# logger.info(f"[DEBUG] Processing rollout_id: {rollout_id}") - -# if not rollout_id: -# logger.warning("[DEBUG] No rollout_id, returning early") -# return row - -# try: -# from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient - -# es_config = create_elasticsearch_config_from_env() -# es_client = ElasticsearchClient(es_config) -# logger.info(f"[DEBUG] ES client created for index: {es_config.index_name}") - -# # Search for EVAL_RESULT log by message prefix -# query = {"match": {"rollout_id": rollout_id}} -# search_results = es_client.search(query=query, size=50) # Get more to find EVAL_RESULT -# logger.info(f"[DEBUG] Total logs: {search_results['hits']['total']['value']}") - -# # Filter for EVAL_RESULT in Python -# if search_results and search_results["hits"]["total"]["value"] > 0: -# for hit in search_results["hits"]["hits"]: -# message = hit["_source"].get("message", "") - -# if message.startswith("EVAL_RESULT:"): -# logger.info(f"[DEBUG] Found EVAL_RESULT message!") -# result_json = message.replace("EVAL_RESULT:", "") -# row.evaluation_result = EvaluateResult.model_validate_json(result_json) -# logger.info(f"[DEBUG] Attached evaluation_result: 
score={row.evaluation_result.score}") -# break -# else: -# logger.warning("[DEBUG] EVAL_RESULT message not found in logs") -# else: -# logger.warning("[DEBUG] No logs found for rollout") - -# logger.info(f"[DEBUG] Searching ES for EVAL_RESULT") -# import asyncio -# search_results = None -# for attempt in range(5): -# search_results = es_client.search(query=query, size=1) -# if search_results and search_results["hits"]["total"]["value"] > 0: -# logger.info(f"[DEBUG] Found result on attempt {attempt + 1}") -# break -# logger.info(f"[DEBUG] Attempt {attempt + 1}: No hits, retrying in 1s...") -# await asyncio.sleep(1) - -# logger.info(f"[DEBUG] Final: ES returned {search_results['hits']['total']['value'] if search_results else 0} hits") -# debug_query = {"match": {"rollout_id": rollout_id}} -# debug_results = es_client.search(query=debug_query, size=26) -# logger.info(f"[DEBUG] Total logs for {rollout_id}: {debug_results['hits']['total']['value']}") - -# if debug_results["hits"]["total"]["value"] > 0: -# for hit in debug_results["hits"]["hits"]: -# msg = hit["_source"].get("message", "")[:80] -# logger.info(f"[DEBUG] Sample message: {msg}") -# else: -# logger.warning("[DEBUG] No logs at all for this rollout_id!") -# if search_results and search_results["hits"]["total"]["value"] > 0: -# hit = search_results["hits"]["hits"][0]["_source"] -# message = hit.get("message", "") -# logger.info(f"[DEBUG] Found message: {message[:100]}...") - -# if message.startswith("EVAL_RESULT:"): -# result_json = message.replace("EVAL_RESULT:", "") -# logger.info(f"[DEBUG] Parsing EvaluateResult JSON") - -# if result_json != "null": -# # Deserialize directly to EvaluateResult -# row.evaluation_result = EvaluateResult.model_validate_json(result_json) -# logger.info(f"[DEBUG] Attached evaluation_result: score={row.evaluation_result.score}, reason={row.evaluation_result.reason}") -# else: -# logger.warning("[DEBUG] Result was null (no resolved status available)") -# else: -# logger.warning(f"[DEBUG] Message doesn't start with EVAL_RESULT: {message[:50]}") -# else: -# logger.warning("[DEBUG] No EVAL_RESULT found in Elasticsearch") - -# except Exception as e: -# logger.error(f"[DEBUG] Exception in test: {e}", exc_info=True) - -# logger.info(f"[DEBUG] Returning row, has evaluation_result: {row.evaluation_result is not None}") -# return row - - async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: """Evaluate SWE-bench instance by reading results from Elasticsearch.""" rollout_id = row.execution_metadata.rollout_id diff --git a/examples/swebench/tracing_model.py b/examples/swebench/tracing_model.py index 6f482efd..11375fc0 100644 --- a/examples/swebench/tracing_model.py +++ b/examples/swebench/tracing_model.py @@ -1,5 +1,38 @@ """ -TracingFireworksModel - Routes through tracing using OpenAI SDK. +Custom model classes for integrating mini-swe-agent with eval-protocol's tracing infrastructure. + +## Why This File Exists + +mini-swe-agent is an autonomous agent that makes 20-100+ LLM API calls per SWE-bench instance +(e.g., reading files, editing code, running tests). To debug agent behavior and display results +in eval-protocol's UI, we need to capture and analyze every LLM call. + +This file bridges mini-swe-agent (which uses LitellmModel) with the Fireworks tracing proxy +(which requires specific URL patterns and SDK usage). 
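+
+In this example the wiring is done by server.py, which launches run_swe_agent_fw.py
+with --model-class tracing_model.TracingFireworksModel; the concrete model id is
+read from the FIREWORKS_MODEL_ID environment variable inside FireworksCompatibleModel.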
+ +## Problem Without This File + +By default, mini-swe-agent would: +- Call Fireworks API directly (no tracing) +- Agent conversations invisible in eval-protocol UI +- Can't debug why agent failed +- No cost tracking per call +- Model names get mangled by litellm routing + +## What These Classes Do + +### FireworksCompatibleModel (Base) +- Extends mini-swe-agent's LitellmModel +- Handles Fireworks API compatibility: + * Strips non-standard message fields that Fireworks API rejects + * Adds stop sequences to prevent common agent failure modes + * Applies temperature/reasoning overrides from wrapper script +- Used when tracing isn't needed (direct Fireworks API calls) + +### TracingFireworksModel (For eval-protocol integration) +- Extends FireworksCompatibleModel +- Routes ALL LLM calls through Fireworks tracing proxy instead of direct API +- Uses OpenAI SDK (not litellm) to preserve full model names """ import sys From 867d94757599b913f7eb04100c78dbac3a34e966 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Tue, 21 Oct 2025 10:12:53 -0700 Subject: [PATCH 10/10] porting to fireworks tracing --- examples/swebench/server.py | 17 ++----- examples/swebench/tests/test_swebench.py | 58 ++++++++++++++++-------- 2 files changed, 43 insertions(+), 32 deletions(-) diff --git a/examples/swebench/server.py b/examples/swebench/server.py index 3118a1cf..1c1fbb03 100644 --- a/examples/swebench/server.py +++ b/examples/swebench/server.py @@ -7,32 +7,24 @@ from fastapi import FastAPI import uvicorn -from eval_protocol import Status, InitRequest, ElasticsearchDirectHttpHandler, RolloutIdFilter +from eval_protocol import Status, InitRequest, RolloutIdFilter +from eval_protocol.log_utils.init import init_external_logging_from_env app = FastAPI() # Attach Elasticsearch handler to root logger (Eval Protocol UI) -handler = ElasticsearchDirectHttpHandler() -logging.getLogger().addHandler(handler) +init_external_logging_from_env() # rollout_states = {} @app.post("/init") def init(req: InitRequest): # Allow Eval Protocol to dynamically configure ES endpoint - if req.elastic_search_config: - handler.configure(req.elastic_search_config) # Tag all logs for this rollout_id logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}") logger.addFilter(RolloutIdFilter(req.metadata.rollout_id)) - # rollout_states[req.metadata.rollout_id] = { - # "terminated": False, - # "status": "running", - # "instance_id": req.metadata.row_id, - # } - def _worker(): try: # Validate model @@ -130,6 +122,7 @@ def _worker(): # 2) Run SWE-bench evaluation harness on preds.json preds_path_str = str(preds_path) + unique_run_id = f"eval-{invocation_id}" eval_cmd = [ "python3", "-m", @@ -141,7 +134,7 @@ def _worker(): "--max_workers", str(os.getenv("SWEBENCH_EVAL_WORKERS", "5")), "--run_id", - "eval-run", + unique_run_id, ] logger.info("Starting SWE-bench harness: %s", " ".join(map(str, eval_cmd))) eval_proc = subprocess.Popen( diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py index e3ee0955..87158ed1 100644 --- a/examples/swebench/tests/test_swebench.py +++ b/examples/swebench/tests/test_swebench.py @@ -2,9 +2,7 @@ from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult from eval_protocol.pytest import evaluation_test -from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor, create_elasticsearch_config_from_env - -# from eval_protocol.pytest.tracing_utils import 
default_fireworks_output_data_loader +from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor from eval_protocol.utils.evaluation_row_utils import create_rows_from_indices @@ -23,39 +21,59 @@ def rows() -> List[EvaluationRow]: model_base_url="https://tracing.fireworks.ai", timeout_seconds=1800, disable_elastic_search_setup=True, - elastic_search_config=create_elasticsearch_config_from_env(), ), completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_concurrent_rollouts=3, ) async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: - """Evaluate SWE-bench instance by reading results from Elasticsearch.""" + """Evaluate SWE-bench instance by reading results from Fireworks tracing logs.""" + import logging + + logger = logging.getLogger(__name__) + rollout_id = row.execution_metadata.rollout_id + logger.info(f"[DEBUG] Processing rollout_id: {rollout_id}") + if not rollout_id: + logger.warning("[DEBUG] No rollout_id") return row try: - from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient + from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter - es_config = create_elasticsearch_config_from_env() - es_client = ElasticsearchClient(es_config) + adapter = FireworksTracingAdapter(base_url="https://tracing.fireworks.ai") + logger.info("[DEBUG] Created adapter for https://tracing.fireworks.ai") - # Get all logs for this rollout and find EVAL_RESULT message - query = {"match": {"rollout_id": rollout_id}} - search_results = es_client.search(query=query, size=50) + # Fetch logs for this rollout + logger.info(f"[DEBUG] Searching for tag: rollout_id:{rollout_id}") + log_entries = adapter.search_logs(tags=[f"rollout_id:{rollout_id}"], limit=100, hours_back=24) - if search_results and search_results["hits"]["total"]["value"] > 0: - for hit in search_results["hits"]["hits"]: - message = hit["_source"].get("message", "") + logger.info(f"[DEBUG] Received {len(log_entries)} log entries") + if log_entries: + logger.info(f"[DEBUG] Sample messages: {[e.get('message', '')[:50] for e in log_entries[:3]]}") - if message.startswith("EVAL_RESULT:"): - result_json = message.replace("EVAL_RESULT:", "") + # Find EVAL_RESULT message + found = False + for entry in log_entries: + message = entry.get("message", "") + if message.startswith("EVAL_RESULT:"): + logger.info("[DEBUG] Found EVAL_RESULT message!") + result_json = message.replace("EVAL_RESULT:", "") + logger.info(f"[DEBUG] Parsing JSON: {result_json[:100]}...") + + if result_json != "null": row.evaluation_result = EvaluateResult.model_validate_json(result_json) - break + logger.info( + f"[DEBUG] Attached result: score={row.evaluation_result.score}, reason={row.evaluation_result.reason}" + ) + found = True + break - except Exception as e: - import logging + if not found: + logger.warning(f"[DEBUG] No EVAL_RESULT message found in {len(log_entries)} logs") - logging.getLogger(__name__).warning(f"Could not read results from Elasticsearch: {e}") + except Exception as e: + logger.error(f"[DEBUG] Exception: {e}", exc_info=True) + logger.info(f"[DEBUG] Returning row, has evaluation_result: {row.evaluation_result is not None}") return row
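
The final revision moves the harness verdict out of Elasticsearch documents and into a single tracing log line prefixed with `EVAL_RESULT:`, which the test then parses back into an `EvaluateResult`. A minimal sketch of that round-trip follows; it assumes only the `eval_protocol` models already used in the diffs above, and the log transport is simulated with a plain list rather than the Fireworks tracing backend.

```python
# Sketch of the EVAL_RESULT protocol shared by server.py and test_swebench.py:
# the server serializes an EvaluateResult into one log message, the test finds
# that message among the retrieved log entries and deserializes it.
from eval_protocol.models import EvaluateResult, MetricResult


def encode_eval_result(resolved: bool, instance_id: str) -> str:
    """Server side: build the log message carrying the harness verdict."""
    result = EvaluateResult(
        score=1.0 if resolved else 0.0,
        reason=f"instance={instance_id}, resolved={resolved}",
        is_score_valid=True,
        metrics={
            "resolved": MetricResult(
                score=1.0 if resolved else 0.0,
                is_score_valid=True,
                reason=f"resolved={resolved}",
                value=int(resolved),
            )
        },
    )
    return f"EVAL_RESULT:{result.model_dump_json()}"


def decode_eval_result(log_entries: list[dict]) -> EvaluateResult | None:
    """Test side: locate the EVAL_RESULT message and parse it back."""
    for entry in log_entries:
        message = entry.get("message", "")
        if message.startswith("EVAL_RESULT:"):
            payload = message[len("EVAL_RESULT:"):]
            if payload != "null":  # server logs "EVAL_RESULT:null" when no verdict
                return EvaluateResult.model_validate_json(payload)
    return None


# Simulate one log entry flowing from server to test.
logs = [{"message": encode_eval_result(True, "astropy__astropy-12907")}]
parsed = decode_eval_result(logs)
assert parsed is not None and parsed.score == 1.0
```

Keeping the verdict in one self-describing log message means the test no longer needs filesystem access to the server's per-row output directories; it only needs log search by the rollout_id tag.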