From 71f416584ff1dbe7772dd02acce36a00e0744ee0 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Wed, 15 Oct 2025 14:33:24 -0700 Subject: [PATCH 01/10] swe-bench --- examples/swebench/README.md | 57 ++ ...works__deployments__r5dfiiwp.eval-run.json | 521 ++++++++++++++++++ examples/swebench/run_swe_agent_fw.py | 347 ++++++++++++ examples/swebench/server.py | 169 ++++++ examples/swebench/tests/conftest.py | 32 ++ examples/swebench/tests/test_swebench.py | 250 +++++++++ examples/swebench/tracing_model.py | 75 +++ 7 files changed, 1451 insertions(+) create mode 100644 examples/swebench/README.md create mode 100644 examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json create mode 100755 examples/swebench/run_swe_agent_fw.py create mode 100644 examples/swebench/server.py create mode 100644 examples/swebench/tests/conftest.py create mode 100644 examples/swebench/tests/test_swebench.py create mode 100644 examples/swebench/tracing_model.py diff --git a/examples/swebench/README.md b/examples/swebench/README.md new file mode 100644 index 00000000..04993e02 --- /dev/null +++ b/examples/swebench/README.md @@ -0,0 +1,57 @@ +SWE-bench (Remote) - Local (non-Docker) Setup and Usage + +Prerequisites +- Python 3.12 environment (same one you use for this repo) +- Fireworks API key +- mini-swe-agent and datasets (for patch generation) +- SWE-bench harness installed (for evaluation) + +Setup mini-swe-agent (non-Docker) +1) Install dependencies +```bash +pip install mini-swe-agent datasets +``` + +2) Configure API key for mini-swe-agent +```bash +mini-extra config set FIREWORKS_API_KEY +``` + +3) (Optional) Test connectivity +```bash +python3 examples/swebench/run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905 --test +``` + +Install SWE-bench evaluation harness +```bash +git clone https://github.com/princeton-nlp/SWE-bench +pip install -e SWE-bench +``` + +Environment +```bash +export FIREWORKS_API_KEY="" +``` + +Run the server +```bash +python examples/swebench/server.py +``` + +What the server does +- Invokes `run_swe_agent_fw.py` in batch mode with a single-slice per request +- Writes outputs to a per-row directory: `./row_{index}/` + - `row_{index}/preds.json` + - `row_{index}//.traj.json` +- Runs the SWE-bench harness on `row_{index}/preds.json` + +Run pytest to evaluate a model on SWE-bench +```bash +cd /Users/shrey/Documents/python-sdk +pytest examples/swebench/tests/test_swebench.py -v -s +``` + +Notes +- The test currently generates 10 rows by numeric index (0–9) +- Each request triggers the server to run one SWE-bench instance and write to its own `row_{index}` +- Control harness workers via: `export SWEBENCH_EVAL_WORKERS=5` \ No newline at end of file diff --git a/examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json b/examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json new file mode 100644 index 00000000..a9e10524 --- /dev/null +++ b/examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json @@ -0,0 +1,521 @@ +{ + "total_instances": 500, + "submitted_instances": 1, + "completed_instances": 0, + "resolved_instances": 0, + "unresolved_instances": 0, + "empty_patch_instances": 1, + "error_instances": 0, + "completed_ids": [], + "incomplete_ids": [ + "astropy__astropy-12907", + 
"astropy__astropy-13033", + "astropy__astropy-13236", + "astropy__astropy-13398", + "astropy__astropy-13579", + "astropy__astropy-13977", + "astropy__astropy-14096", + "astropy__astropy-14182", + "astropy__astropy-14309", + "astropy__astropy-14365", + "astropy__astropy-14369", + "astropy__astropy-14508", + "astropy__astropy-14539", + "astropy__astropy-14598", + "astropy__astropy-14995", + "astropy__astropy-7166", + "astropy__astropy-7336", + "astropy__astropy-7606", + "astropy__astropy-7671", + "astropy__astropy-8707", + "astropy__astropy-8872", + "django__django-10097", + "django__django-10554", + "django__django-10880", + "django__django-10914", + "django__django-10973", + "django__django-10999", + "django__django-11066", + "django__django-11087", + "django__django-11095", + "django__django-11099", + "django__django-11119", + "django__django-11133", + "django__django-11138", + "django__django-11141", + "django__django-11149", + "django__django-11163", + "django__django-11179", + "django__django-11206", + "django__django-11211", + "django__django-11239", + "django__django-11265", + "django__django-11276", + "django__django-11292", + "django__django-11299", + "django__django-11333", + "django__django-11400", + "django__django-11433", + "django__django-11451", + "django__django-11477", + "django__django-11490", + "django__django-11532", + "django__django-11551", + "django__django-11555", + "django__django-11603", + "django__django-11728", + "django__django-11734", + "django__django-11740", + "django__django-11749", + "django__django-11790", + "django__django-11815", + "django__django-11820", + "django__django-11848", + "django__django-11880", + "django__django-11885", + "django__django-11951", + "django__django-11964", + "django__django-11999", + "django__django-12039", + "django__django-12050", + "django__django-12125", + "django__django-12143", + "django__django-12155", + "django__django-12193", + "django__django-12209", + "django__django-12262", + "django__django-12273", + "django__django-12276", + "django__django-12304", + "django__django-12308", + "django__django-12325", + "django__django-12406", + "django__django-12419", + "django__django-12663", + "django__django-12708", + "django__django-12713", + "django__django-12741", + "django__django-12754", + "django__django-12774", + "django__django-12858", + "django__django-12965", + "django__django-13012", + "django__django-13023", + "django__django-13028", + "django__django-13033", + "django__django-13089", + "django__django-13109", + "django__django-13112", + "django__django-13121", + "django__django-13128", + "django__django-13158", + "django__django-13195", + "django__django-13212", + "django__django-13279", + "django__django-13297", + "django__django-13315", + "django__django-13343", + "django__django-13344", + "django__django-13346", + "django__django-13363", + "django__django-13401", + "django__django-13406", + "django__django-13410", + "django__django-13417", + "django__django-13449", + "django__django-13512", + "django__django-13513", + "django__django-13516", + "django__django-13551", + "django__django-13568", + "django__django-13569", + "django__django-13590", + "django__django-13658", + "django__django-13670", + "django__django-13741", + "django__django-13786", + "django__django-13794", + "django__django-13807", + "django__django-13809", + "django__django-13810", + "django__django-13820", + "django__django-13821", + "django__django-13837", + "django__django-13925", + "django__django-13933", + "django__django-13964", + 
"django__django-14007", + "django__django-14011", + "django__django-14017", + "django__django-14034", + "django__django-14053", + "django__django-14089", + "django__django-14122", + "django__django-14140", + "django__django-14155", + "django__django-14170", + "django__django-14238", + "django__django-14311", + "django__django-14315", + "django__django-14349", + "django__django-14351", + "django__django-14373", + "django__django-14376", + "django__django-14404", + "django__django-14434", + "django__django-14493", + "django__django-14500", + "django__django-14534", + "django__django-14539", + "django__django-14559", + "django__django-14580", + "django__django-14608", + "django__django-14631", + "django__django-14672", + "django__django-14725", + "django__django-14752", + "django__django-14765", + "django__django-14771", + "django__django-14787", + "django__django-14792", + "django__django-14855", + "django__django-14915", + "django__django-14999", + "django__django-15022", + "django__django-15037", + "django__django-15098", + "django__django-15103", + "django__django-15104", + "django__django-15127", + "django__django-15128", + "django__django-15161", + "django__django-15252", + "django__django-15268", + "django__django-15277", + "django__django-15278", + "django__django-15280", + "django__django-15315", + "django__django-15368", + "django__django-15375", + "django__django-15380", + "django__django-15382", + "django__django-15467", + "django__django-15499", + "django__django-15503", + "django__django-15525", + "django__django-15554", + "django__django-15561", + "django__django-15563", + "django__django-15569", + "django__django-15572", + "django__django-15629", + "django__django-15695", + "django__django-15731", + "django__django-15732", + "django__django-15741", + "django__django-15814", + "django__django-15851", + "django__django-15863", + "django__django-15916", + "django__django-15930", + "django__django-15957", + "django__django-15973", + "django__django-15987", + "django__django-16032", + "django__django-16082", + "django__django-16100", + "django__django-16116", + "django__django-16136", + "django__django-16139", + "django__django-16145", + "django__django-16255", + "django__django-16256", + "django__django-16263", + "django__django-16315", + "django__django-16333", + "django__django-16429", + "django__django-16454", + "django__django-16485", + "django__django-16493", + "django__django-16502", + "django__django-16527", + "django__django-16560", + "django__django-16569", + "django__django-16595", + "django__django-16612", + "django__django-16631", + "django__django-16642", + "django__django-16661", + "django__django-16662", + "django__django-16667", + "django__django-16801", + "django__django-16819", + "django__django-16877", + "django__django-16899", + "django__django-16901", + "django__django-16938", + "django__django-16950", + "django__django-17029", + "django__django-17084", + "django__django-17087", + "django__django-7530", + "django__django-9296", + "matplotlib__matplotlib-13989", + "matplotlib__matplotlib-14623", + "matplotlib__matplotlib-20488", + "matplotlib__matplotlib-20676", + "matplotlib__matplotlib-20826", + "matplotlib__matplotlib-20859", + "matplotlib__matplotlib-21568", + "matplotlib__matplotlib-22719", + "matplotlib__matplotlib-22865", + "matplotlib__matplotlib-22871", + "matplotlib__matplotlib-23299", + "matplotlib__matplotlib-23314", + "matplotlib__matplotlib-23412", + "matplotlib__matplotlib-23476", + "matplotlib__matplotlib-24026", + 
"matplotlib__matplotlib-24149", + "matplotlib__matplotlib-24177", + "matplotlib__matplotlib-24570", + "matplotlib__matplotlib-24627", + "matplotlib__matplotlib-24637", + "matplotlib__matplotlib-24870", + "matplotlib__matplotlib-24970", + "matplotlib__matplotlib-25122", + "matplotlib__matplotlib-25287", + "matplotlib__matplotlib-25311", + "matplotlib__matplotlib-25332", + "matplotlib__matplotlib-25479", + "matplotlib__matplotlib-25775", + "matplotlib__matplotlib-25960", + "matplotlib__matplotlib-26113", + "matplotlib__matplotlib-26208", + "matplotlib__matplotlib-26291", + "matplotlib__matplotlib-26342", + "matplotlib__matplotlib-26466", + "mwaskom__seaborn-3069", + "mwaskom__seaborn-3187", + "pallets__flask-5014", + "psf__requests-1142", + "psf__requests-1724", + "psf__requests-1766", + "psf__requests-1921", + "psf__requests-2317", + "psf__requests-2931", + "psf__requests-5414", + "psf__requests-6028", + "pydata__xarray-2905", + "pydata__xarray-3095", + "pydata__xarray-3151", + "pydata__xarray-3305", + "pydata__xarray-3677", + "pydata__xarray-3993", + "pydata__xarray-4075", + "pydata__xarray-4094", + "pydata__xarray-4356", + "pydata__xarray-4629", + "pydata__xarray-4687", + "pydata__xarray-4695", + "pydata__xarray-4966", + "pydata__xarray-6461", + "pydata__xarray-6599", + "pydata__xarray-6721", + "pydata__xarray-6744", + "pydata__xarray-6938", + "pydata__xarray-6992", + "pydata__xarray-7229", + "pydata__xarray-7233", + "pydata__xarray-7393", + "pylint-dev__pylint-4551", + "pylint-dev__pylint-4604", + "pylint-dev__pylint-4661", + "pylint-dev__pylint-4970", + "pylint-dev__pylint-6386", + "pylint-dev__pylint-6528", + "pylint-dev__pylint-6903", + "pylint-dev__pylint-7080", + "pylint-dev__pylint-7277", + "pylint-dev__pylint-8898", + "pytest-dev__pytest-10051", + "pytest-dev__pytest-10081", + "pytest-dev__pytest-10356", + "pytest-dev__pytest-5262", + "pytest-dev__pytest-5631", + "pytest-dev__pytest-5787", + "pytest-dev__pytest-5809", + "pytest-dev__pytest-5840", + "pytest-dev__pytest-6197", + "pytest-dev__pytest-6202", + "pytest-dev__pytest-7205", + "pytest-dev__pytest-7236", + "pytest-dev__pytest-7324", + "pytest-dev__pytest-7432", + "pytest-dev__pytest-7490", + "pytest-dev__pytest-7521", + "pytest-dev__pytest-7571", + "pytest-dev__pytest-7982", + "pytest-dev__pytest-8399", + "scikit-learn__scikit-learn-10297", + "scikit-learn__scikit-learn-10844", + "scikit-learn__scikit-learn-10908", + "scikit-learn__scikit-learn-11310", + "scikit-learn__scikit-learn-11578", + "scikit-learn__scikit-learn-12585", + "scikit-learn__scikit-learn-12682", + "scikit-learn__scikit-learn-12973", + "scikit-learn__scikit-learn-13124", + "scikit-learn__scikit-learn-13135", + "scikit-learn__scikit-learn-13142", + "scikit-learn__scikit-learn-13328", + "scikit-learn__scikit-learn-13439", + "scikit-learn__scikit-learn-13496", + "scikit-learn__scikit-learn-13779", + "scikit-learn__scikit-learn-14053", + "scikit-learn__scikit-learn-14087", + "scikit-learn__scikit-learn-14141", + "scikit-learn__scikit-learn-14496", + "scikit-learn__scikit-learn-14629", + "scikit-learn__scikit-learn-14710", + "scikit-learn__scikit-learn-14894", + "scikit-learn__scikit-learn-14983", + "scikit-learn__scikit-learn-15100", + "scikit-learn__scikit-learn-25102", + "scikit-learn__scikit-learn-25232", + "scikit-learn__scikit-learn-25747", + "scikit-learn__scikit-learn-25931", + "scikit-learn__scikit-learn-25973", + "scikit-learn__scikit-learn-26194", + "scikit-learn__scikit-learn-26323", + "scikit-learn__scikit-learn-9288", + "sphinx-doc__sphinx-10323", 
+ "sphinx-doc__sphinx-10435", + "sphinx-doc__sphinx-10449", + "sphinx-doc__sphinx-10466", + "sphinx-doc__sphinx-10614", + "sphinx-doc__sphinx-10673", + "sphinx-doc__sphinx-11445", + "sphinx-doc__sphinx-11510", + "sphinx-doc__sphinx-7440", + "sphinx-doc__sphinx-7454", + "sphinx-doc__sphinx-7462", + "sphinx-doc__sphinx-7590", + "sphinx-doc__sphinx-7748", + "sphinx-doc__sphinx-7757", + "sphinx-doc__sphinx-7889", + "sphinx-doc__sphinx-7910", + "sphinx-doc__sphinx-7985", + "sphinx-doc__sphinx-8035", + "sphinx-doc__sphinx-8056", + "sphinx-doc__sphinx-8120", + "sphinx-doc__sphinx-8265", + "sphinx-doc__sphinx-8269", + "sphinx-doc__sphinx-8459", + "sphinx-doc__sphinx-8475", + "sphinx-doc__sphinx-8548", + "sphinx-doc__sphinx-8551", + "sphinx-doc__sphinx-8593", + "sphinx-doc__sphinx-8595", + "sphinx-doc__sphinx-8621", + "sphinx-doc__sphinx-8638", + "sphinx-doc__sphinx-8721", + "sphinx-doc__sphinx-9229", + "sphinx-doc__sphinx-9230", + "sphinx-doc__sphinx-9258", + "sphinx-doc__sphinx-9281", + "sphinx-doc__sphinx-9320", + "sphinx-doc__sphinx-9367", + "sphinx-doc__sphinx-9461", + "sphinx-doc__sphinx-9591", + "sphinx-doc__sphinx-9602", + "sphinx-doc__sphinx-9658", + "sphinx-doc__sphinx-9673", + "sphinx-doc__sphinx-9698", + "sphinx-doc__sphinx-9711", + "sympy__sympy-11618", + "sympy__sympy-12096", + "sympy__sympy-12419", + "sympy__sympy-12481", + "sympy__sympy-12489", + "sympy__sympy-13031", + "sympy__sympy-13091", + "sympy__sympy-13372", + "sympy__sympy-13480", + "sympy__sympy-13551", + "sympy__sympy-13615", + "sympy__sympy-13647", + "sympy__sympy-13757", + "sympy__sympy-13798", + "sympy__sympy-13852", + "sympy__sympy-13877", + "sympy__sympy-13878", + "sympy__sympy-13974", + "sympy__sympy-14248", + "sympy__sympy-14531", + "sympy__sympy-14711", + "sympy__sympy-14976", + "sympy__sympy-15017", + "sympy__sympy-15345", + "sympy__sympy-15349", + "sympy__sympy-15599", + "sympy__sympy-15809", + "sympy__sympy-15875", + "sympy__sympy-15976", + "sympy__sympy-16450", + "sympy__sympy-16597", + "sympy__sympy-16766", + "sympy__sympy-16792", + "sympy__sympy-16886", + "sympy__sympy-17139", + "sympy__sympy-17318", + "sympy__sympy-17630", + "sympy__sympy-17655", + "sympy__sympy-18189", + "sympy__sympy-18199", + "sympy__sympy-18211", + "sympy__sympy-18698", + "sympy__sympy-18763", + "sympy__sympy-19040", + "sympy__sympy-19346", + "sympy__sympy-19495", + "sympy__sympy-19637", + "sympy__sympy-19783", + "sympy__sympy-19954", + "sympy__sympy-20154", + "sympy__sympy-20428", + "sympy__sympy-20438", + "sympy__sympy-20590", + "sympy__sympy-20801", + "sympy__sympy-20916", + "sympy__sympy-21379", + "sympy__sympy-21596", + "sympy__sympy-21612", + "sympy__sympy-21847", + "sympy__sympy-21930", + "sympy__sympy-22080", + "sympy__sympy-22456", + "sympy__sympy-22714", + "sympy__sympy-22914", + "sympy__sympy-23262", + "sympy__sympy-23413", + "sympy__sympy-23534", + "sympy__sympy-23824", + "sympy__sympy-23950", + "sympy__sympy-24066", + "sympy__sympy-24213", + "sympy__sympy-24443", + "sympy__sympy-24539", + "sympy__sympy-24562", + "sympy__sympy-24661" + ], + "empty_patch_ids": [ + "astropy__astropy-13453" + ], + "submitted_ids": [ + "astropy__astropy-13453" + ], + "resolved_ids": [], + "unresolved_ids": [], + "error_ids": [], + "schema_version": 2 +} diff --git a/examples/swebench/run_swe_agent_fw.py b/examples/swebench/run_swe_agent_fw.py new file mode 100755 index 00000000..f1ae1a51 --- /dev/null +++ b/examples/swebench/run_swe_agent_fw.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python3 +""" +Fireworks-compatible wrapper for mini-swe-agent 
SWE-bench evaluations. + +This script handles Fireworks API compatibility by stripping non-standard fields +that mini-swe-agent adds for internal tracking. + +Requires fully qualified Fireworks model paths: +- Serverless models: fireworks_ai/accounts/fireworks/models/{model_name} +- Deployed models: fireworks_ai/accounts/{account}/deployedModels/{model_name} + +Usage: + python run_swe_agent_fw.py [options] + +Examples: + # Serverless models + python run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/kimi-k2-instruct --instances 10 --workers 5 + python run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/llama-v3p1-70b-instruct --subset full --workers 8 + + # Deployed models + python run_swe_agent_fw.py fireworks_ai/accounts/cognition/deployedModels/swe-1-mtp-tc1huggf --single 0 + python run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/kimi-k2-instruct --test + +Requirements: + - mini-swe-agent: pip install mini-swe-agent + - Fireworks API key: Set via 'mini-extra config set FIREWORKS_API_KEY ' +""" + +import argparse +import os +import sys +import subprocess +import tempfile +from pathlib import Path +from typing import Any + +# Import required dependencies +from minisweagent.models.litellm_model import LitellmModel, LitellmModelConfig +import litellm + + +class FireworksCompatibleModel(LitellmModel): + """ + Fireworks-compatible wrapper for LitellmModel. + """ + + def __init__(self, **kwargs): + if model_id := os.environ.get('FIREWORKS_MODEL_ID'): + kwargs['model_name'] = model_id + print(f"kwargs: {kwargs}") + if 'model_kwargs' not in kwargs: + kwargs['model_kwargs'] = {} + + # CRITICAL: Set drop_params to False so stop sequences aren't stripped! + kwargs['model_kwargs']['drop_params'] = False + + # Get existing stop sequences + existing_stop = kwargs['model_kwargs'].get('stop', []) + if isinstance(existing_stop, str): + existing_stop = [existing_stop] + elif existing_stop is None: + existing_stop = [] + + # Add stop sequences (only the non-natural ones) + stop_sequences = existing_stop + [ + # ASCII versions + "<|User|>", + "<|Assistant|>", + + # Full-width PIPE versions (U+FF5C) + "<|User|>", # \uff5c + "<|Assistant|>", + "```<|", + "<|User", + "<|Ass", + + # Full-width LETTER L versions (U+FF4C) + "<lUser|>", # \uff4c + "<lAssistant|>", + "```<l", + "<lUser", + "<lAss", + ] + kwargs['model_kwargs']['stop'] = stop_sequences + kwargs['model_kwargs']['max_tokens'] = 1024 # Reduce to 1024 to save tokens + + if 'temperature' not in kwargs['model_kwargs']: + kwargs['model_kwargs']['temperature'] = 0.0 + + # Apply per-run overrides injected by the wrapper (no environment variables) + overrides = globals().get('WRAPPER_MODEL_OVERRIDES') + if isinstance(overrides, dict): + if overrides.get('reasoning') in ('low', 'medium', 'high'): + kwargs['model_kwargs']['reasoning_effort'] = overrides['reasoning'] + if overrides.get('temperature') is not None: + try: + kwargs['model_kwargs']['temperature'] = float(overrides['temperature']) + except Exception: + pass + if overrides.get('max_tokens') is not None: + try: + kwargs['model_kwargs']['max_tokens'] = int(overrides['max_tokens']) + except Exception: + pass + + super().__init__(**kwargs) + + def _query(self, messages: list[dict[str, str]], **kwargs): + """Remove non-standard fields before sending to Fireworks API.""" + # Keep only standard OpenAI-compatible fields + clean_messages = [] + for msg in messages: + clean_msg = { + "role": msg["role"], + "content": msg["content"] + } + if "tool_calls" in msg: + 
clean_msg["tool_calls"] = msg["tool_calls"] + if "name" in msg: + clean_msg["name"] = msg["name"] + clean_messages.append(clean_msg) + + # IMPORTANT: Ensure drop_params stays False in the actual query + kwargs_with_stop = kwargs.copy() + if 'drop_params' not in kwargs_with_stop: + kwargs_with_stop['drop_params'] = False + + return super()._query(clean_messages, **kwargs_with_stop) + +def __get_api_key(): + """Get Fireworks API key from environment or mini-swe-agent config.""" + # Environment variable takes precedence + if api_key := os.environ.get('FIREWORKS_API_KEY'): + return api_key + + # Try to get API key from mini-swe-agent's config system + try: + from minisweagent.config import get_config + config = get_config() + return config.get('FIREWORKS_API_KEY') + except (ImportError, AttributeError, KeyError): + # Fallback: check common config file locations + config_paths = [ + Path.home() / ".config" / "mini-swe-agent" / ".env", + Path.home() / "Library" / "Application Support" / "mini-swe-agent" / ".env" + ] + + for config_path in config_paths: + if config_path.exists(): + try: + with open(config_path) as f: + for line in f: + if line.startswith('FIREWORKS_API_KEY='): + value = line.split('=', 1)[1].strip() + return value.strip("'\"") + except (IOError, OSError): + continue + + return None + + +def __test_model(model_id): + """Test model connectivity with a simple completion.""" + from litellm import completion + + # Verify API key exists + api_key = __get_api_key() + if not api_key: + print("Error: FIREWORKS_API_KEY not found.") + return False + + # Configure environment for litellm + os.environ['FIREWORKS_API_KEY'] = api_key + # Assume model_id is fully qualified + model_name = model_id + + print(f"Testing model: {model_name}") + + try: + # Send test completion + response = completion( + model=model_name, + messages=[{"role": "user", "content": "Test message. Reply with OK."}], + temperature=0.0, + max_tokens=10 + ) + + print(f"Success. 
Response: {response.choices[0].message.content}")
+        print(f"Tokens used: {response.usage.total_tokens}")
+        return True
+
+    except Exception as e:
+        print(f"Error: {e}")
+        return False
+
+
+def __validate_environment():
+    """Check for required API key."""
+    if not __get_api_key():
+        print("Warning: FIREWORKS_API_KEY not found.")
+        print("Set it with: mini-extra config set FIREWORKS_API_KEY ")
+
+
+
+
+def __build_command(args, wrapper_module_path):
+    """Build mini-swe-agent command with appropriate arguments."""
+    # Construct model class path
+    wrapper_module = wrapper_module_path.stem
+    model_class = f"{wrapper_module}.FireworksCompatibleModel"
+
+    # Base command - assume model_id is fully qualified
+    cmd = [
+        sys.executable,
+        "-m", "minisweagent.run.mini_extra",
+        "swebench-single" if args.single is not None else "swebench",
+        "--model", args.model_id,
+        "--model-class", model_class,
+        "--subset", args.subset,
+        "--split", args.split
+    ]
+    if args.model_class:
+        cmd.extend(["--model-class", args.model_class])
+    print(f"Output: {args.output}")
+    print(args.single)
+    # Mode-specific arguments
+    print(f"Output: {args.output}")
+    print(args.single)
+    # Mode-specific arguments
+    if args.single is not None:
+        # Use batch mode for a single index via slice and write to a per-row directory
+        from pathlib import Path
+        slice_spec = f"{args.single}:{args.single+1}"
+        row_dir = str((Path(args.output) if args.output else Path.cwd()) / f"row_{args.single}")
+        cmd = [
+            sys.executable,
+            "-m", "minisweagent.run.mini_extra",
+            "swebench",
+            "--model", args.model_id,
+            "--model-class", model_class,
+            "--subset", args.subset,
+            "--split", args.split,
+            "--slice", slice_spec,
+            "--output", row_dir,
+        ]
+        if args.model_class:
+            cmd.extend(["--model-class", args.model_class])
+        print(f"DEBUG: Using batch mode with slice {slice_spec}, output={row_dir}")
+    else:
+        if args.instances:
+            cmd.extend(["--slice", f"0:{args.instances}"])
+        cmd.extend(["--workers", str(args.workers), "--output", args.output])
+
+    return cmd
+
+
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Run mini-swe-agent with Fireworks models on SWE-bench',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__
+    )
+
+    # Required model ID
+    parser.add_argument('model_id', help='Fireworks model ID')
+    parser.add_argument('--model-class', type=str, default=None, help='Optional mini-swe-agent model-class')
+    # Execution options
+    parser.add_argument('--instances', type=int, help='Number of instances to run')
+    parser.add_argument('--workers', type=int, default=1, help='Parallel workers (default: 1)')
+    parser.add_argument('--output', help='Output directory')
+    parser.add_argument('--subset', default='verified', choices=['verified', 'lite', 'full'])
+    parser.add_argument('--split', default='test', choices=['dev', 'test'])
+    parser.add_argument('--single', type=int, metavar='INDEX', help='Run single instance')
+    parser.add_argument('--exit-immediately', action='store_true')
+    parser.add_argument('--test', action='store_true', help='Test model connectivity')
+    parser.add_argument('--reasoning', type=str, choices=['low', 'medium', 'high'], default=None, help='Provider-specific reasoning effort')
+    parser.add_argument('--temperature', type=float, default=None, help='Model temperature override')
+    parser.add_argument('--max-tokens', type=int, default=None, help='Max tokens override')
+    args = parser.parse_args()
+
+    # Handle test mode
+    if args.test:
+        sys.exit(0 if __test_model(args.model_id) else 1)
+
+    # Validate 
API key + __validate_environment() + + # Set default output directory + if args.output is None: + safe_model_id = args.model_id.replace("/", "-").replace(":", "-") + script_dir = Path(__file__).parent.resolve() + args.output = str(script_dir / f'swebench-{safe_model_id}-results') + + # Create temporary module for importing FireworksCompatibleModel + with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: + with open(__file__, 'r') as current_file: + f.write(current_file.read()) + # Inject per-run model overrides directly into the temp module + f.write("\n# --- Injected by wrapper: per-run model overrides ---\n") + f.write("WRAPPER_MODEL_OVERRIDES = {\n") + f.write(f" 'reasoning': {repr(args.reasoning)},\n") + f.write(f" 'temperature': {repr(args.temperature)},\n") + f.write(f" 'max_tokens': {repr(args.max_tokens)},\n") + f.write("}\n") + temp_module_path = Path(f.name) + + try: + # Configure environment + env = os.environ.copy() + env['PYTHONPATH'] = f"{temp_module_path.parent}:{env.get('PYTHONPATH', '')}" + # Pass the fully qualified model path to the subprocess + env['FIREWORKS_MODEL_ID'] = args.model_id + + # Ensure API key is passed to subprocess + api_key = __get_api_key() + if api_key: + env['FIREWORKS_API_KEY'] = api_key + + # No environment variables for model kwargs; overrides are injected into the temp module + + # Build command + cmd = __build_command(args, temp_module_path) + + # Display configuration + print(f"Model: {args.model_id}") + print(f"Output: {args.output}") + print(f"Workers: {args.workers}") + if args.instances: + print(f"Instances: {args.instances}") + + # Debug: Show the actual command being run + print(f"Command: {' '.join(cmd)}") + print(f"Model path in command: {cmd[cmd.index('--model') + 1] if '--model' in cmd else 'NOT FOUND'}") + + # Execute mini-swe-agent + subprocess.run(cmd, env=env, check=True) + + finally: + # Clean up temporary module + if temp_module_path.exists(): + temp_module_path.unlink() + + +if __name__ == '__main__': + main() diff --git a/examples/swebench/server.py b/examples/swebench/server.py new file mode 100644 index 00000000..80ddabe4 --- /dev/null +++ b/examples/swebench/server.py @@ -0,0 +1,169 @@ +"""Minimal SWE-bench server - wraps run_swe_agent_fw.py with tracing via model_base_url.""" +import os +import threading +import subprocess +import logging +from fastapi import FastAPI +import uvicorn + +from eval_protocol import Status, InitRequest, ElasticsearchDirectHttpHandler, RolloutIdFilter + +app = FastAPI() + +# Attach Elasticsearch handler to root logger (Eval Protocol UI) +handler = ElasticsearchDirectHttpHandler() +logging.getLogger().addHandler(handler) +rollout_states = {} + +@app.post("/init") +def init(req: InitRequest): + # Allow Eval Protocol to dynamically configure ES endpoint + if req.elastic_search_config: + handler.configure(req.elastic_search_config) + + # Tag all logs for this rollout_id + logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}") + logger.addFilter(RolloutIdFilter(req.metadata.rollout_id)) + + rollout_states[req.metadata.rollout_id] = { + "terminated": False, + "status": "running", + "instance_id": req.metadata.row_id, + } + + def _worker(): + try: + # Validate model + if not req.model: + raise ValueError("model is required") + + + if not req.metadata or not req.metadata.row_id: + raise ValueError("metadata.row_id is required and must be an integer index as string, e.g. 
'0'") + try: + single_index = int(str(req.metadata.row_id)) + except ValueError: + raise ValueError(f"row_id must be an integer index for --single, got: {req.metadata.row_id}") + env = os.environ.copy() + # Build environment for subprocess + if "FIREWORKS_API_KEY" in os.environ: + env["FIREWORKS_API_KEY"] = os.environ["FIREWORKS_API_KEY"] + # Make sure the tracing model module is importable by the subprocess + # so "tracing_model.TracingFireworksModel" can be imported + env["PYTHONPATH"] = "/Users/shrey/Documents/python-sdk/examples/swebench:" + env.get("PYTHONPATH", "") + + # Determine output directory (from env or default) + out_dir = os.getcwd() + + from pathlib import Path + + script_path = str((Path(__file__).parent / "run_swe_agent_fw.py").resolve()) + + # Extract model_kwargs from req.metadata (forwarded from input_metadata) + model_kwargs = {} + logger.info(f"DEBUG: req.metadata attributes: {dir(req.metadata)}") + if hasattr(req.metadata, "model_kwargs"): + mk = getattr(req.metadata, "model_kwargs", None) + logger.info(f"DEBUG: Found req.metadata.model_kwargs = {mk}") + if isinstance(mk, dict): + model_kwargs = mk + logger.info(f"Extracted model_kwargs from metadata: {model_kwargs}") + else: + logger.info(f"DEBUG: req.metadata has NO model_kwargs attribute") + + # Set tracing URL + if req.model_base_url: + env["TRACING_BASE_URL"] = req.model_base_url + + cmd = [ + "python3", + script_path, + req.model, + "--single", str(single_index), + "--exit-immediately", + "--output", str(out_dir), + "--model-class", "tracing_model.TracingFireworksModel", + ] + # Forward model kwargs as CLI flags to the wrapper + if model_kwargs.get("reasoning") in ("low", "medium", "high"): + cmd.extend(["--reasoning", str(model_kwargs["reasoning"])]) + if model_kwargs.get("temperature") is not None: + cmd.extend(["--temperature", str(model_kwargs["temperature"])]) + if model_kwargs.get("max_tokens") is not None: + cmd.extend(["--max-tokens", str(model_kwargs["max_tokens"])]) + import json + # Log path inside row directory for this run + row_dir = Path(out_dir) / f"row_{single_index}" + row_dir.mkdir(parents=True, exist_ok=True) + log_path = row_dir / f"agent_{single_index}.log" + + # Run without streaming; write all output to a log file; wait until completion + with open(log_path, "w") as lf: + proc = subprocess.Popen( + cmd, + env=env, + stdout=lf, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + ret = proc.wait() + + + # Stream stdout/stderr to logs + # assert proc.stdout is not None and proc.stderr is not None + # for line in proc.stdout: + # logger.info(line.rstrip("\n")) + # for line in proc.stderr: + # logger.warning(line.rstrip("\n")) + + # ret = proc.wait() + # logger.info(f"mini-swe-agent exited with code {ret}") + + # Use row-specific preds.json to avoid cross-run interference + preds_path = row_dir / "preds.json" + if preds_path.exists(): + logger.info(f"Using preds.json at: {preds_path}") + else: + logger.error(f"No preds.json found at {preds_path}") + + # 2) Run SWE-bench evaluation harness on preds.json + preds_path_str = str(preds_path) + eval_cmd = [ + "python3", "-m", "swebench.harness.run_evaluation", + "--dataset_name", "princeton-nlp/SWE-bench_Verified", + "--predictions_path", preds_path_str, + "--max_workers", str(os.getenv("SWEBENCH_EVAL_WORKERS", "5")), + "--run_id", "eval-run", + ] + logger.info("Starting SWE-bench harness: %s", " ".join(map(str, eval_cmd))) + eval_proc = subprocess.Popen( + eval_cmd, cwd=str(row_dir), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, 
text=True, bufsize=1 + ) + assert eval_proc.stdout is not None + for line in eval_proc.stdout: + logger.info(line.rstrip("\n")) + eval_rc = eval_proc.wait() + # logger.info(f"SWE-bench harness exited with code {eval_rc}") + + except Exception as e: + # Best-effort: mark error but still finish to unblock polling + logger.error(f"Rollout error: {e}", extra={"status": Status.rollout_error(str(e))}) + finally: + # Always mark finished so RemoteRolloutProcessor stops polling + logger.info("Rollout completed", extra={"status": Status.rollout_finished()}) + + threading.Thread(target=_worker, daemon=True).start() + return {"status": "accepted"} + +@app.get("/status") +def status(rollout_id: str): + return rollout_states.get(rollout_id, {"terminated": False}) + +def main(): + host = os.getenv("REMOTE_SERVER_HOST", "127.0.0.1") + port = int(os.getenv("REMOTE_SERVER_PORT", "3000")) + uvicorn.run(app, host=host, port=port) + +if __name__ == "__main__": + main() diff --git a/examples/swebench/tests/conftest.py b/examples/swebench/tests/conftest.py new file mode 100644 index 00000000..8bca079e --- /dev/null +++ b/examples/swebench/tests/conftest.py @@ -0,0 +1,32 @@ +import os +import pytest + +import os +import pytest + +MODEL_ID_OPT = None +CONCURRENCY_OPT = None +MODEL_KWARGS_OPT = None + +def pytest_addoption(parser): + parser.addoption("--model-id", action="store", default=None, help="Fireworks model ID") + parser.addoption("--concurrent-workers", action="store", type=int, default=None, help="Max concurrent rollouts") + parser.addoption("--temperature", action="store", type=float, default=None, help="Model temperature") + parser.addoption("--max-tokens", action="store", type=int, default=None, help="Max tokens") + parser.addoption("--reasoning", action="store", choices=["low", "medium", "high"], default=None, help="Reasoning effort") + +def pytest_configure(config): + global MODEL_ID_OPT, CONCURRENCY_OPT, MODEL_KWARGS_OPT + MODEL_ID_OPT = config.getoption("--model-id") + CONCURRENCY_OPT = config.getoption("--concurrent-workers") + temp = config.getoption("--temperature") + mtok = config.getoption("--max-tokens") + reas = config.getoption("--reasoning") + mk = {} + if temp is not None: + mk["temperature"] = float(temp) + if mtok is not None: + mk["max_tokens"] = int(mtok) + if reas is not None: + mk["reasoning"] = reas + MODEL_KWARGS_OPT = mk or None \ No newline at end of file diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py new file mode 100644 index 00000000..37fef735 --- /dev/null +++ b/examples/swebench/tests/test_swebench.py @@ -0,0 +1,250 @@ +from typing import List +import os +import pytest +import requests +import yaml +from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader +from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult +from eval_protocol.pytest import evaluation_test +from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor +from eval_protocol.types.remote_rollout_processor import DataLoaderConfig +from eval_protocol.quickstart.utils import filter_longest_conversation +# Reuse the converter used by the built-in adapter +from eval_protocol.adapters.fireworks_tracing import convert_trace_dict_to_evaluation_row +import conftest + + +MODEL_ID = conftest.MODEL_ID_OPT +if not MODEL_ID: + raise RuntimeError("--model-id is required. 
Example: --model-id 'fireworks_ai/accounts/.../models/'") +CLI_CONCURRENCY = conftest.CONCURRENCY_OPT +CLI_MODEL_KWARGS = conftest.MODEL_KWARGS_OPT + +# Build completion_params once (used by decorator) +COMPLETION_PARAMS = {"model": MODEL_ID} +if CLI_MODEL_KWARGS: + COMPLETION_PARAMS["model_kwargs"] = CLI_MODEL_KWARGS + +def fetch_traces_with_auth(config: DataLoaderConfig) -> List[EvaluationRow]: + """ + Fetch traces directly from the Fireworks tracing proxy with Authorization header + and convert them into EvaluationRows using the same converter as the adapter. + """ + base_url = (config.model_base_url or "https://tracing.fireworks.ai").rstrip("/") + api_key = os.environ.get("FIREWORKS_API_KEY") + if not api_key: + return [] + + url = f"{base_url}/v1/traces" + headers = {"Authorization": f"Bearer {api_key}"} + params = { + "tags": [f"rollout_id:{config.rollout_id}"], + "max_retries": 5, + "sleep_between_gets": 0.1, + } + + try: + resp = requests.get(url, params=params, headers=headers, timeout=300) + print(f"[fetch_traces] status={resp.status_code} url={resp.url}") # debug + resp.raise_for_status() + body = resp.json() or {} + traces = body.get("traces", []) + print(f"[fetch_traces] traces_found={len(traces)}") + except Exception as e: + print(f"[fetch_traces] error={e}") + return [] + + rows: List[EvaluationRow] = [] + for tr in traces: + row = convert_trace_dict_to_evaluation_row(tr, include_tool_calls=True, span_name=None) + if row: + rows.append(row) + return rows + + +def _merge_rows_into_one(rows: List[EvaluationRow]) -> List[EvaluationRow]: + if not rows: + return [] + # Use the first row as the base; merge messages from all rows + base = rows[0] + seen = set() + merged_msgs: List[Message] = [] + for r in rows: + for m in (r.messages or []): + # Dedup by role+name+content+tool_calls signature + tool_sig = None + if getattr(m, "tool_calls", None): + tool_sig = tuple( + (tc.get("id"), tc.get("type"), (tc.get("function") or {}).get("name")) + for tc in m.tool_calls + ) + key = (m.role, getattr(m, "name", None), m.content, tool_sig) + if key in seen: + continue + seen.add(key) + merged_msgs.append(m) + base.messages = merged_msgs + return [base] + +def fireworks_output_data_loader(config: DataLoaderConfig) -> DynamicDataLoader: + return DynamicDataLoader( + generators=[lambda: fetch_traces_with_auth(config)], + preprocess_fn=_merge_rows_into_one, # merge all tool/LLM traces into one row + ) + +def rows_from_instance_ids(ids: list[str]) -> List[EvaluationRow]: + out = [] + for idx, iid in enumerate(ids): + out.append( + EvaluationRow( + messages=[Message(role="user", content=f"Run SWE-bench instance {iid}")], + input_metadata={ + "row_id": str(idx), # ← use instance_id here + "instance_id": iid, # ← explicit for debugging + "instance_index": str(idx), # ← optional: keep index + "completion_params": {"model": MODEL_ID}, + }, + ) + ) + return out + +def rows_from_indices(count: int) -> List[EvaluationRow]: + out: List[EvaluationRow] = [] + for idx in range(count): + metadata = { + "row_id": str(idx), + "instance_index": str(idx), + } + # Add model_kwargs to metadata so server can read from req.metadata + if CLI_MODEL_KWARGS: + metadata["model_kwargs"] = CLI_MODEL_KWARGS + + out.append( + EvaluationRow( + messages=[Message(role="user", content=f"Run SWE-bench index {idx}")], + input_metadata=metadata, + ) + ) + return out + +def rows() -> List[EvaluationRow]: + # Generate 10 rows by index; server maps index -> dataset instance via --slice + return rows_from_indices(10) + + +# 
-------------------- Harness result attachment (UI pass/fail) -------------------- +import json +from pathlib import Path + +def _safe_model_id(model_id: str) -> str: + return model_id.replace("/", "__").replace(":", "-") + +def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: + """Attach evaluation result by reading harness report or exit status.""" + import logging + logger = logging.getLogger(__name__) + + # Get row_id and instance_id + try: + row_id = str(row.input_metadata.row_id) # ← use attribute, not .get() + except Exception as e: + logger.warning(f"Could not get row_id: {e}") + return row + + row_dir = Path.cwd() / f"row_{row_id}" + logger.info(f"[Row {row_id}] Looking for results in {row_dir}") + + # Find instance_id from preds.json + preds_path = row_dir / "preds.json" + instance_id = None + if preds_path.exists(): + try: + preds = json.loads(preds_path.read_text()) + instance_id = next(iter(preds.keys()), None) + logger.info(f"[Row {row_id}] Found instance_id: {instance_id}") + except Exception as e: + logger.warning(f"[Row {row_id}] Could not read preds.json: {e}") + + if not instance_id: + logger.warning(f"[Row {row_id}] No instance_id found, skipping eval result") + return row + + resolved: bool | None = None + reason_text: str | None = None + + # 1. Try to read from report.json (harness ran tests) + safe_model = _safe_model_id(model_id) + report_path = row_dir / "logs" / "run_evaluation" / "eval-run" / safe_model / instance_id / "report.json" + + if report_path.exists(): + logger.info(f"[Row {row_id}] Found report.json at {report_path}") + try: + report_data = json.loads(report_path.read_text()) + instance_data = report_data.get(instance_id, {}) + resolved = bool(instance_data.get("resolved", False)) + reason_text = f"harness_resolved={resolved}" + logger.info(f"[Row {row_id}] Report says resolved={resolved}") + except Exception as e: + logger.error(f"[Row {row_id}] Failed to parse report.json: {e}") + else: + logger.info(f"[Row {row_id}] No report.json found at {report_path}") + + # 2. If no report, check exit status YAML (agent didn't produce a patch) + if resolved is None: + exit_status_files = sorted(row_dir.glob("exit_statuses_*.yaml")) + if exit_status_files: + exit_file = exit_status_files[-1] + logger.info(f"[Row {row_id}] Reading exit status from {exit_file.name}") + try: + status_doc = yaml.safe_load(exit_file.read_text()) or {} + by_status = status_doc.get("instances_by_exit_status", {}) + for status_name, ids in by_status.items(): + if instance_id in (ids or []): + resolved = False + reason_text = f"exit_status={status_name}" + logger.info(f"[Row {row_id}] Exit status: {status_name}") + break + except Exception as e: + logger.error(f"[Row {row_id}] Failed to parse exit status: {e}") + else: + logger.warning(f"[Row {row_id}] No exit status YAML found") + + # 3. 
Attach result if we found anything + if resolved is not None: + logger.info(f"[Row {row_id}] Final: resolved={resolved}, reason={reason_text}") + row.evaluation_result = EvaluateResult( + score=1.0 if resolved else 0.0, + reason=reason_text or f"resolved={resolved}", + is_score_valid=True, + metrics={ + "resolved": MetricResult( + score=1.0 if resolved else 0.0, + is_score_valid=True, + reason=reason_text or f"resolved={resolved}", + value=int(resolved), + ) + }, + ) + else: + logger.warning(f"[Row {row_id}] Could not determine resolved status") + + return row + + +@evaluation_test( + data_loaders=DynamicDataLoader( + generators=[rows], + ), + rollout_processor=RemoteRolloutProcessor( + remote_base_url="http://127.0.0.1:3000", + model_base_url="https://tracing.fireworks.ai", + timeout_seconds=1800, + output_data_loader=fireworks_output_data_loader, + ), + completion_params=[COMPLETION_PARAMS], + max_concurrent_rollouts=(CLI_CONCURRENCY or 2), +) +async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: + assert len(row.messages) >= 1 + row = attach_eval_result(row, MODEL_ID) + return row \ No newline at end of file diff --git a/examples/swebench/tracing_model.py b/examples/swebench/tracing_model.py new file mode 100644 index 00000000..e00dc460 --- /dev/null +++ b/examples/swebench/tracing_model.py @@ -0,0 +1,75 @@ +""" +TracingFireworksModel - Routes through tracing using OpenAI SDK. +""" +import sys +import os + +sys.path.insert(0, "/Users/shrey/Documents/cookbook-internal/recipes/eval/swe_bench") + +from run_swe_agent_fw import FireworksCompatibleModel + + +class TracingFireworksModel(FireworksCompatibleModel): + """Routes LLM calls through tracing using OpenAI SDK (preserves model name).""" + + def _query(self, messages, **kwargs): + """Use OpenAI SDK directly to preserve model name through tracing.""" + from openai import OpenAI + import traceback + + tracing_url = os.environ.get('TRACING_BASE_URL', '') + api_key = os.environ.get('FIREWORKS_API_KEY', '') + + if not tracing_url: + print("⚠️ No TRACING_BASE_URL - using parent litellm") + return super()._query(messages, **kwargs) + + print(f"\n🔗 OpenAI SDK Call:") + print(f" URL: {tracing_url[:60]}...") + print(f" Model: {self.config.model_name}") + + try: + client = OpenAI(base_url=tracing_url, api_key=api_key) + + # Build OpenAI-compatible params + openai_kwargs = {} + if self.config.model_kwargs.get('stop'): + openai_kwargs['stop'] = self.config.model_kwargs['stop'] + print(f" Stop sequences: {len(openai_kwargs['stop'])}") + if self.config.model_kwargs.get('max_tokens'): + openai_kwargs['max_tokens'] = self.config.model_kwargs['max_tokens'] + if self.config.model_kwargs.get('temperature') is not None: + openai_kwargs['temperature'] = self.config.model_kwargs['temperature'] + + # CRITICAL: Clean messages - remove 'extra' fields that OpenAI API doesn't accept! + clean_messages = [] + for msg in messages: + clean_msg = {"role": msg["role"], "content": msg["content"]} + # Preserve standard fields only + if "name" in msg: + clean_msg["name"] = msg["name"] + if "tool_calls" in msg: + clean_msg["tool_calls"] = msg["tool_calls"] + clean_messages.append(clean_msg) + + print(f" Messages: {len(clean_messages)} (cleaned)") + print(f" Making call...") + + # OpenAI SDK call + response = client.chat.completions.create( + model=self.config.model_name, + messages=clean_messages, # ← Use cleaned messages! 
+ **openai_kwargs, + ) + + print(f" ✅ Call succeeded!") + print(f" Response ID: {response.id}") + print(f" Tokens: {response.usage.total_tokens if response.usage else 'N/A'}\n") + + return response + + except Exception as e: + print(f"\n❌ ERROR in TracingFireworksModel._query:") + print(f" {type(e).__name__}: {e}") + traceback.print_exc() + raise \ No newline at end of file From 2dad5181e99c750cb2211d3b9d2d1d817180cec5 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Thu, 16 Oct 2025 13:56:23 -0700 Subject: [PATCH 02/10] linterrors --- examples/swebench/README.md | 2 +- examples/swebench/SWE-bench | 1 + examples/swebench/run_swe_agent_fw.py | 176 ++++++++++++----------- examples/swebench/server.py | 41 ++++-- examples/swebench/tests/conftest.py | 10 +- examples/swebench/tests/test_swebench.py | 46 +++--- examples/swebench/tracing_model.py | 51 +++---- 7 files changed, 182 insertions(+), 145 deletions(-) create mode 160000 examples/swebench/SWE-bench diff --git a/examples/swebench/README.md b/examples/swebench/README.md index 04993e02..f4082f76 100644 --- a/examples/swebench/README.md +++ b/examples/swebench/README.md @@ -54,4 +54,4 @@ pytest examples/swebench/tests/test_swebench.py -v -s Notes - The test currently generates 10 rows by numeric index (0–9) - Each request triggers the server to run one SWE-bench instance and write to its own `row_{index}` -- Control harness workers via: `export SWEBENCH_EVAL_WORKERS=5` \ No newline at end of file +- Control harness workers via: `export SWEBENCH_EVAL_WORKERS=5` diff --git a/examples/swebench/SWE-bench b/examples/swebench/SWE-bench new file mode 160000 index 00000000..5cd4be9f --- /dev/null +++ b/examples/swebench/SWE-bench @@ -0,0 +1 @@ +Subproject commit 5cd4be9fb23971679cbbafe5a0ecade27cef99be diff --git a/examples/swebench/run_swe_agent_fw.py b/examples/swebench/run_swe_agent_fw.py index f1ae1a51..cef3de9a 100755 --- a/examples/swebench/run_swe_agent_fw.py +++ b/examples/swebench/run_swe_agent_fw.py @@ -45,64 +45,62 @@ class FireworksCompatibleModel(LitellmModel): """ def __init__(self, **kwargs): - if model_id := os.environ.get('FIREWORKS_MODEL_ID'): - kwargs['model_name'] = model_id + if model_id := os.environ.get("FIREWORKS_MODEL_ID"): + kwargs["model_name"] = model_id print(f"kwargs: {kwargs}") - if 'model_kwargs' not in kwargs: - kwargs['model_kwargs'] = {} - + if "model_kwargs" not in kwargs: + kwargs["model_kwargs"] = {} + # CRITICAL: Set drop_params to False so stop sequences aren't stripped! 
- kwargs['model_kwargs']['drop_params'] = False - + kwargs["model_kwargs"]["drop_params"] = False + # Get existing stop sequences - existing_stop = kwargs['model_kwargs'].get('stop', []) + existing_stop = kwargs["model_kwargs"].get("stop", []) if isinstance(existing_stop, str): existing_stop = [existing_stop] elif existing_stop is None: existing_stop = [] - + # Add stop sequences (only the non-natural ones) stop_sequences = existing_stop + [ - # ASCII versions + # ASCII versions "<|User|>", "<|Assistant|>", - # Full-width PIPE versions (U+FF5C) - "<|User|>", # \uff5c + "<|User|>", # \uff5c "<|Assistant|>", "```<|", "<|User", "<|Ass", - - # Full-width LETTER L versions (U+FF4C) - "<lUser|>", # \uff4c + # Full-width LETTER L versions (U+FF4C) + "<lUser|>", # \uff4c "<lAssistant|>", "```<l", "<lUser", "<lAss", ] - kwargs['model_kwargs']['stop'] = stop_sequences - kwargs['model_kwargs']['max_tokens'] = 1024 # Reduce to 1024 to save tokens - - if 'temperature' not in kwargs['model_kwargs']: - kwargs['model_kwargs']['temperature'] = 0.0 + kwargs["model_kwargs"]["stop"] = stop_sequences + kwargs["model_kwargs"]["max_tokens"] = 1024 # Reduce to 1024 to save tokens + + if "temperature" not in kwargs["model_kwargs"]: + kwargs["model_kwargs"]["temperature"] = 0.0 # Apply per-run overrides injected by the wrapper (no environment variables) - overrides = globals().get('WRAPPER_MODEL_OVERRIDES') + overrides = globals().get("WRAPPER_MODEL_OVERRIDES") if isinstance(overrides, dict): - if overrides.get('reasoning') in ('low', 'medium', 'high'): - kwargs['model_kwargs']['reasoning_effort'] = overrides['reasoning'] - if overrides.get('temperature') is not None: + if overrides.get("reasoning") in ("low", "medium", "high"): + kwargs["model_kwargs"]["reasoning_effort"] = overrides["reasoning"] + if overrides.get("temperature") is not None: try: - kwargs['model_kwargs']['temperature'] = float(overrides['temperature']) + kwargs["model_kwargs"]["temperature"] = float(overrides["temperature"]) except Exception: pass - if overrides.get('max_tokens') is not None: + if overrides.get("max_tokens") is not None: try: - kwargs['model_kwargs']['max_tokens'] = int(overrides['max_tokens']) + kwargs["model_kwargs"]["max_tokens"] = int(overrides["max_tokens"]) except Exception: pass - + super().__init__(**kwargs) def _query(self, messages: list[dict[str, str]], **kwargs): @@ -110,39 +108,38 @@ def _query(self, messages: list[dict[str, str]], **kwargs): # Keep only standard OpenAI-compatible fields clean_messages = [] for msg in messages: - clean_msg = { - "role": msg["role"], - "content": msg["content"] - } + clean_msg = {"role": msg["role"], "content": msg["content"]} if "tool_calls" in msg: clean_msg["tool_calls"] = msg["tool_calls"] if "name" in msg: clean_msg["name"] = msg["name"] clean_messages.append(clean_msg) - + # IMPORTANT: Ensure drop_params stays False in the actual query kwargs_with_stop = kwargs.copy() - if 'drop_params' not in kwargs_with_stop: - kwargs_with_stop['drop_params'] = False - + if "drop_params" not in kwargs_with_stop: + kwargs_with_stop["drop_params"] = False + return super()._query(clean_messages, **kwargs_with_stop) + def __get_api_key(): """Get Fireworks API key from environment or mini-swe-agent config.""" # Environment variable takes precedence - if api_key := os.environ.get('FIREWORKS_API_KEY'): + if api_key := os.environ.get("FIREWORKS_API_KEY"): return api_key # Try to get API key from mini-swe-agent's config system try: from minisweagent.config import get_config + config = get_config() - 
return config.get('FIREWORKS_API_KEY') + return config.get("FIREWORKS_API_KEY") except (ImportError, AttributeError, KeyError): # Fallback: check common config file locations config_paths = [ Path.home() / ".config" / "mini-swe-agent" / ".env", - Path.home() / "Library" / "Application Support" / "mini-swe-agent" / ".env" + Path.home() / "Library" / "Application Support" / "mini-swe-agent" / ".env", ] for config_path in config_paths: @@ -150,8 +147,8 @@ def __get_api_key(): try: with open(config_path) as f: for line in f: - if line.startswith('FIREWORKS_API_KEY='): - value = line.split('=', 1)[1].strip() + if line.startswith("FIREWORKS_API_KEY="): + value = line.split("=", 1)[1].strip() return value.strip("'\"") except (IOError, OSError): continue @@ -170,7 +167,7 @@ def __test_model(model_id): return False # Configure environment for litellm - os.environ['FIREWORKS_API_KEY'] = api_key + os.environ["FIREWORKS_API_KEY"] = api_key # Assume model_id is fully qualified model_name = model_id @@ -182,7 +179,7 @@ def __test_model(model_id): model=model_name, messages=[{"role": "user", "content": "Test message. Reply with OK."}], temperature=0.0, - max_tokens=10 + max_tokens=10, ) print(f"Success. Response: {response.choices[0].message.content}") @@ -201,8 +198,6 @@ def __validate_environment(): print("Set it with: mini-extra config set FIREWORKS_API_KEY ") - - def __build_command(args, wrapper_module_path): """Build mini-swe-agent command with appropriate arguments.""" # Construct model class path @@ -212,12 +207,17 @@ def __build_command(args, wrapper_module_path): # Base command - assume model_id is fully qualified cmd = [ sys.executable, - "-m", "minisweagent.run.mini_extra", + "-m", + "minisweagent.run.mini_extra", "swebench-single" if args.single is not None else "swebench", - "--model", args.model_id, - "--model-class", model_class, - "--subset", args.subset, - "--split", args.split + "--model", + args.model_id, + "--model-class", + model_class, + "--subset", + args.subset, + "--split", + args.split, ] if args.model_class: cmd.extend(["--model-class", args.model_class]) @@ -230,18 +230,26 @@ def __build_command(args, wrapper_module_path): if args.single is not None: # Use batch mode for a single index via slice and write to a per-row directory from pathlib import Path - slice_spec = f"{args.single}:{args.single+1}" + + slice_spec = f"{args.single}:{args.single + 1}" row_dir = str((Path(args.output) if args.output else Path.cwd()) / f"row_{args.single}") cmd = [ sys.executable, - "-m", "minisweagent.run.mini_extra", + "-m", + "minisweagent.run.mini_extra", "swebench", - "--model", args.model_id, - "--model-class", model_class, - "--subset", args.subset, - "--split", args.split, - "--slice", slice_spec, - "--output", row_dir, + "--model", + args.model_id, + "--model-class", + model_class, + "--subset", + args.subset, + "--split", + args.split, + "--slice", + slice_spec, + "--output", + row_dir, ] if args.model_class: cmd.extend(["--model-class", args.model_class]) @@ -253,31 +261,35 @@ def __build_command(args, wrapper_module_path): return cmd - - def main(): parser = argparse.ArgumentParser( - description='Run mini-swe-agent with Fireworks models on SWE-bench', + description="Run mini-swe-agent with Fireworks models on SWE-bench", formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__ + epilog=__doc__, ) # Required model ID - parser.add_argument('model_id', help='Fireworks model ID') - parser.add_argument('--model-class', type=str, default=None, help='Optional mini-swe-agent 
model-class') + parser.add_argument("model_id", help="Fireworks model ID") + parser.add_argument("--model-class", type=str, default=None, help="Optional mini-swe-agent model-class") # Execution options - parser.add_argument('--instances', type=int, help='Number of instances to run') - parser.add_argument('--workers', type=int, default=1, help='Parallel workers (default: 1)') - parser.add_argument('--output', help='Output directory') - parser.add_argument('--subset', default='verified', choices=['verified', 'lite', 'full']) - parser.add_argument('--split', default='test', choices=['dev', 'test']) - parser.add_argument('--single', type=int, metavar='INDEX', help='Run single instance') - parser.add_argument('--exit-immediately', action='store_true') - parser.add_argument('--test', action='store_true', help='Test model connectivity') - parser.add_argument('--reasoning', type=str, choices=['low', 'medium', 'high'], default=None, help='Provider-specific reasoning effort') - parser.add_argument('--temperature', type=float, default=None, help='Model temperature override') - parser.add_argument('--max-tokens', type=int, default=None, help='Max tokens override') + parser.add_argument("--instances", type=int, help="Number of instances to run") + parser.add_argument("--workers", type=int, default=1, help="Parallel workers (default: 1)") + parser.add_argument("--output", help="Output directory") + parser.add_argument("--subset", default="verified", choices=["verified", "lite", "full"]) + parser.add_argument("--split", default="test", choices=["dev", "test"]) + parser.add_argument("--single", type=int, metavar="INDEX", help="Run single instance") + parser.add_argument("--exit-immediately", action="store_true") + parser.add_argument("--test", action="store_true", help="Test model connectivity") + parser.add_argument( + "--reasoning", + type=str, + choices=["low", "medium", "high"], + default=None, + help="Provider-specific reasoning effort", + ) + parser.add_argument("--temperature", type=float, default=None, help="Model temperature override") + parser.add_argument("--max-tokens", type=int, default=None, help="Max tokens override") args = parser.parse_args() # Handle test mode @@ -291,11 +303,11 @@ def main(): if args.output is None: safe_model_id = args.model_id.replace("/", "-").replace(":", "-") script_dir = Path(__file__).parent.resolve() - args.output = str(script_dir / f'swebench-{safe_model_id}-results') + args.output = str(script_dir / f"swebench-{safe_model_id}-results") # Create temporary module for importing FireworksCompatibleModel - with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: - with open(__file__, 'r') as current_file: + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f: + with open(__file__, "r") as current_file: f.write(current_file.read()) # Inject per-run model overrides directly into the temp module f.write("\n# --- Injected by wrapper: per-run model overrides ---\n") @@ -309,14 +321,14 @@ def main(): try: # Configure environment env = os.environ.copy() - env['PYTHONPATH'] = f"{temp_module_path.parent}:{env.get('PYTHONPATH', '')}" + env["PYTHONPATH"] = f"{temp_module_path.parent}:{env.get('PYTHONPATH', '')}" # Pass the fully qualified model path to the subprocess - env['FIREWORKS_MODEL_ID'] = args.model_id + env["FIREWORKS_MODEL_ID"] = args.model_id # Ensure API key is passed to subprocess api_key = __get_api_key() if api_key: - env['FIREWORKS_API_KEY'] = api_key + env["FIREWORKS_API_KEY"] = api_key # No environment variables 
for model kwargs; overrides are injected into the temp module @@ -343,5 +355,5 @@ def main(): temp_module_path.unlink() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/swebench/server.py b/examples/swebench/server.py index 80ddabe4..ae3df983 100644 --- a/examples/swebench/server.py +++ b/examples/swebench/server.py @@ -1,4 +1,5 @@ """Minimal SWE-bench server - wraps run_swe_agent_fw.py with tracing via model_base_url.""" + import os import threading import subprocess @@ -15,6 +16,7 @@ logging.getLogger().addHandler(handler) rollout_states = {} + @app.post("/init") def init(req: InitRequest): # Allow Eval Protocol to dynamically configure ES endpoint @@ -37,7 +39,6 @@ def _worker(): if not req.model: raise ValueError("model is required") - if not req.metadata or not req.metadata.row_id: raise ValueError("metadata.row_id is required and must be an integer index as string, e.g. '0'") try: @@ -56,9 +57,9 @@ def _worker(): out_dir = os.getcwd() from pathlib import Path - + script_path = str((Path(__file__).parent / "run_swe_agent_fw.py").resolve()) - + # Extract model_kwargs from req.metadata (forwarded from input_metadata) model_kwargs = {} logger.info(f"DEBUG: req.metadata attributes: {dir(req.metadata)}") @@ -69,8 +70,8 @@ def _worker(): model_kwargs = mk logger.info(f"Extracted model_kwargs from metadata: {model_kwargs}") else: - logger.info(f"DEBUG: req.metadata has NO model_kwargs attribute") - + logger.info("DEBUG: req.metadata has NO model_kwargs attribute") + # Set tracing URL if req.model_base_url: env["TRACING_BASE_URL"] = req.model_base_url @@ -79,10 +80,13 @@ def _worker(): "python3", script_path, req.model, - "--single", str(single_index), + "--single", + str(single_index), "--exit-immediately", - "--output", str(out_dir), - "--model-class", "tracing_model.TracingFireworksModel", + "--output", + str(out_dir), + "--model-class", + "tracing_model.TracingFireworksModel", ] # Forward model kwargs as CLI flags to the wrapper if model_kwargs.get("reasoning") in ("low", "medium", "high"): @@ -92,6 +96,7 @@ def _worker(): if model_kwargs.get("max_tokens") is not None: cmd.extend(["--max-tokens", str(model_kwargs["max_tokens"])]) import json + # Log path inside row directory for this run row_dir = Path(out_dir) / f"row_{single_index}" row_dir.mkdir(parents=True, exist_ok=True) @@ -109,7 +114,6 @@ def _worker(): ) ret = proc.wait() - # Stream stdout/stderr to logs # assert proc.stdout is not None and proc.stderr is not None # for line in proc.stdout: @@ -130,11 +134,17 @@ def _worker(): # 2) Run SWE-bench evaluation harness on preds.json preds_path_str = str(preds_path) eval_cmd = [ - "python3", "-m", "swebench.harness.run_evaluation", - "--dataset_name", "princeton-nlp/SWE-bench_Verified", - "--predictions_path", preds_path_str, - "--max_workers", str(os.getenv("SWEBENCH_EVAL_WORKERS", "5")), - "--run_id", "eval-run", + "python3", + "-m", + "swebench.harness.run_evaluation", + "--dataset_name", + "princeton-nlp/SWE-bench_Verified", + "--predictions_path", + preds_path_str, + "--max_workers", + str(os.getenv("SWEBENCH_EVAL_WORKERS", "5")), + "--run_id", + "eval-run", ] logger.info("Starting SWE-bench harness: %s", " ".join(map(str, eval_cmd))) eval_proc = subprocess.Popen( @@ -156,14 +166,17 @@ def _worker(): threading.Thread(target=_worker, daemon=True).start() return {"status": "accepted"} + @app.get("/status") def status(rollout_id: str): return rollout_states.get(rollout_id, {"terminated": False}) + def main(): host = os.getenv("REMOTE_SERVER_HOST", 
"127.0.0.1") port = int(os.getenv("REMOTE_SERVER_PORT", "3000")) uvicorn.run(app, host=host, port=port) + if __name__ == "__main__": main() diff --git a/examples/swebench/tests/conftest.py b/examples/swebench/tests/conftest.py index 8bca079e..3b81f7ad 100644 --- a/examples/swebench/tests/conftest.py +++ b/examples/swebench/tests/conftest.py @@ -1,19 +1,21 @@ import os import pytest -import os -import pytest MODEL_ID_OPT = None CONCURRENCY_OPT = None MODEL_KWARGS_OPT = None + def pytest_addoption(parser): parser.addoption("--model-id", action="store", default=None, help="Fireworks model ID") parser.addoption("--concurrent-workers", action="store", type=int, default=None, help="Max concurrent rollouts") parser.addoption("--temperature", action="store", type=float, default=None, help="Model temperature") parser.addoption("--max-tokens", action="store", type=int, default=None, help="Max tokens") - parser.addoption("--reasoning", action="store", choices=["low", "medium", "high"], default=None, help="Reasoning effort") + parser.addoption( + "--reasoning", action="store", choices=["low", "medium", "high"], default=None, help="Reasoning effort" + ) + def pytest_configure(config): global MODEL_ID_OPT, CONCURRENCY_OPT, MODEL_KWARGS_OPT @@ -29,4 +31,4 @@ def pytest_configure(config): mk["max_tokens"] = int(mtok) if reas is not None: mk["reasoning"] = reas - MODEL_KWARGS_OPT = mk or None \ No newline at end of file + MODEL_KWARGS_OPT = mk or None diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py index 37fef735..48c130a9 100644 --- a/examples/swebench/tests/test_swebench.py +++ b/examples/swebench/tests/test_swebench.py @@ -9,6 +9,7 @@ from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor from eval_protocol.types.remote_rollout_processor import DataLoaderConfig from eval_protocol.quickstart.utils import filter_longest_conversation + # Reuse the converter used by the built-in adapter from eval_protocol.adapters.fireworks_tracing import convert_trace_dict_to_evaluation_row import conftest @@ -25,6 +26,7 @@ if CLI_MODEL_KWARGS: COMPLETION_PARAMS["model_kwargs"] = CLI_MODEL_KWARGS + def fetch_traces_with_auth(config: DataLoaderConfig) -> List[EvaluationRow]: """ Fetch traces directly from the Fireworks tracing proxy with Authorization header @@ -70,13 +72,12 @@ def _merge_rows_into_one(rows: List[EvaluationRow]) -> List[EvaluationRow]: seen = set() merged_msgs: List[Message] = [] for r in rows: - for m in (r.messages or []): + for m in r.messages or []: # Dedup by role+name+content+tool_calls signature tool_sig = None if getattr(m, "tool_calls", None): tool_sig = tuple( - (tc.get("id"), tc.get("type"), (tc.get("function") or {}).get("name")) - for tc in m.tool_calls + (tc.get("id"), tc.get("type"), (tc.get("function") or {}).get("name")) for tc in m.tool_calls ) key = (m.role, getattr(m, "name", None), m.content, tool_sig) if key in seen: @@ -86,12 +87,14 @@ def _merge_rows_into_one(rows: List[EvaluationRow]) -> List[EvaluationRow]: base.messages = merged_msgs return [base] + def fireworks_output_data_loader(config: DataLoaderConfig) -> DynamicDataLoader: return DynamicDataLoader( generators=[lambda: fetch_traces_with_auth(config)], preprocess_fn=_merge_rows_into_one, # merge all tool/LLM traces into one row ) + def rows_from_instance_ids(ids: list[str]) -> List[EvaluationRow]: out = [] for idx, iid in enumerate(ids): @@ -99,15 +102,16 @@ def rows_from_instance_ids(ids: list[str]) -> List[EvaluationRow]: EvaluationRow( 
messages=[Message(role="user", content=f"Run SWE-bench instance {iid}")], input_metadata={ - "row_id": str(idx), # ← use instance_id here - "instance_id": iid, # ← explicit for debugging - "instance_index": str(idx), # ← optional: keep index + "row_id": str(idx), # ← use instance_id here + "instance_id": iid, # ← explicit for debugging + "instance_index": str(idx), # ← optional: keep index "completion_params": {"model": MODEL_ID}, }, ) ) return out + def rows_from_indices(count: int) -> List[EvaluationRow]: out: List[EvaluationRow] = [] for idx in range(count): @@ -118,7 +122,7 @@ def rows_from_indices(count: int) -> List[EvaluationRow]: # Add model_kwargs to metadata so server can read from req.metadata if CLI_MODEL_KWARGS: metadata["model_kwargs"] = CLI_MODEL_KWARGS - + out.append( EvaluationRow( messages=[Message(role="user", content=f"Run SWE-bench index {idx}")], @@ -127,6 +131,7 @@ def rows_from_indices(count: int) -> List[EvaluationRow]: ) return out + def rows() -> List[EvaluationRow]: # Generate 10 rows by index; server maps index -> dataset instance via --slice return rows_from_indices(10) @@ -136,24 +141,27 @@ def rows() -> List[EvaluationRow]: import json from pathlib import Path + def _safe_model_id(model_id: str) -> str: return model_id.replace("/", "__").replace(":", "-") + def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: """Attach evaluation result by reading harness report or exit status.""" import logging + logger = logging.getLogger(__name__) - + # Get row_id and instance_id try: row_id = str(row.input_metadata.row_id) # ← use attribute, not .get() except Exception as e: logger.warning(f"Could not get row_id: {e}") return row - + row_dir = Path.cwd() / f"row_{row_id}" logger.info(f"[Row {row_id}] Looking for results in {row_dir}") - + # Find instance_id from preds.json preds_path = row_dir / "preds.json" instance_id = None @@ -164,18 +172,18 @@ def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: logger.info(f"[Row {row_id}] Found instance_id: {instance_id}") except Exception as e: logger.warning(f"[Row {row_id}] Could not read preds.json: {e}") - + if not instance_id: logger.warning(f"[Row {row_id}] No instance_id found, skipping eval result") return row - + resolved: bool | None = None reason_text: str | None = None - + # 1. Try to read from report.json (harness ran tests) safe_model = _safe_model_id(model_id) report_path = row_dir / "logs" / "run_evaluation" / "eval-run" / safe_model / instance_id / "report.json" - + if report_path.exists(): logger.info(f"[Row {row_id}] Found report.json at {report_path}") try: @@ -188,7 +196,7 @@ def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: logger.error(f"[Row {row_id}] Failed to parse report.json: {e}") else: logger.info(f"[Row {row_id}] No report.json found at {report_path}") - + # 2. If no report, check exit status YAML (agent didn't produce a patch) if resolved is None: exit_status_files = sorted(row_dir.glob("exit_statuses_*.yaml")) @@ -208,7 +216,7 @@ def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: logger.error(f"[Row {row_id}] Failed to parse exit status: {e}") else: logger.warning(f"[Row {row_id}] No exit status YAML found") - + # 3. 
Attach result if we found anything if resolved is not None: logger.info(f"[Row {row_id}] Final: resolved={resolved}, reason={reason_text}") @@ -227,10 +235,10 @@ def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: ) else: logger.warning(f"[Row {row_id}] Could not determine resolved status") - + return row - + @evaluation_test( data_loaders=DynamicDataLoader( generators=[rows], @@ -247,4 +255,4 @@ def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: assert len(row.messages) >= 1 row = attach_eval_result(row, MODEL_ID) - return row \ No newline at end of file + return row diff --git a/examples/swebench/tracing_model.py b/examples/swebench/tracing_model.py index e00dc460..8862388c 100644 --- a/examples/swebench/tracing_model.py +++ b/examples/swebench/tracing_model.py @@ -1,6 +1,7 @@ """ TracingFireworksModel - Routes through tracing using OpenAI SDK. """ + import sys import os @@ -11,36 +12,36 @@ class TracingFireworksModel(FireworksCompatibleModel): """Routes LLM calls through tracing using OpenAI SDK (preserves model name).""" - + def _query(self, messages, **kwargs): """Use OpenAI SDK directly to preserve model name through tracing.""" from openai import OpenAI import traceback - - tracing_url = os.environ.get('TRACING_BASE_URL', '') - api_key = os.environ.get('FIREWORKS_API_KEY', '') - + + tracing_url = os.environ.get("TRACING_BASE_URL", "") + api_key = os.environ.get("FIREWORKS_API_KEY", "") + if not tracing_url: print("⚠️ No TRACING_BASE_URL - using parent litellm") return super()._query(messages, **kwargs) - - print(f"\n🔗 OpenAI SDK Call:") + + print("\n🔗 OpenAI SDK Call:") print(f" URL: {tracing_url[:60]}...") print(f" Model: {self.config.model_name}") - + try: client = OpenAI(base_url=tracing_url, api_key=api_key) - + # Build OpenAI-compatible params openai_kwargs = {} - if self.config.model_kwargs.get('stop'): - openai_kwargs['stop'] = self.config.model_kwargs['stop'] + if self.config.model_kwargs.get("stop"): + openai_kwargs["stop"] = self.config.model_kwargs["stop"] print(f" Stop sequences: {len(openai_kwargs['stop'])}") - if self.config.model_kwargs.get('max_tokens'): - openai_kwargs['max_tokens'] = self.config.model_kwargs['max_tokens'] - if self.config.model_kwargs.get('temperature') is not None: - openai_kwargs['temperature'] = self.config.model_kwargs['temperature'] - + if self.config.model_kwargs.get("max_tokens"): + openai_kwargs["max_tokens"] = self.config.model_kwargs["max_tokens"] + if self.config.model_kwargs.get("temperature") is not None: + openai_kwargs["temperature"] = self.config.model_kwargs["temperature"] + # CRITICAL: Clean messages - remove 'extra' fields that OpenAI API doesn't accept! clean_messages = [] for msg in messages: @@ -51,25 +52,25 @@ def _query(self, messages, **kwargs): if "tool_calls" in msg: clean_msg["tool_calls"] = msg["tool_calls"] clean_messages.append(clean_msg) - + print(f" Messages: {len(clean_messages)} (cleaned)") - print(f" Making call...") - + print(" Making call...") + # OpenAI SDK call response = client.chat.completions.create( model=self.config.model_name, messages=clean_messages, # ← Use cleaned messages! 
**openai_kwargs, ) - - print(f" ✅ Call succeeded!") + + print(" ✅ Call succeeded!") print(f" Response ID: {response.id}") print(f" Tokens: {response.usage.total_tokens if response.usage else 'N/A'}\n") - + return response - + except Exception as e: - print(f"\n❌ ERROR in TracingFireworksModel._query:") + print("\n❌ ERROR in TracingFireworksModel._query:") print(f" {type(e).__name__}: {e}") traceback.print_exc() - raise \ No newline at end of file + raise From 9ffbf9e4b60347df96bb36376e5c4a4e631cc99a Mon Sep 17 00:00:00 2001 From: shreymodi1 <82307545+shreymodi1@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:27:39 -0700 Subject: [PATCH 03/10] Delete examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json --- ...works__deployments__r5dfiiwp.eval-run.json | 521 ------------------ 1 file changed, 521 deletions(-) delete mode 100644 examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json diff --git a/examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json b/examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json deleted file mode 100644 index a9e10524..00000000 --- a/examples/swebench/fireworks_ai__accounts__pyroworks__models__swe-1-mtp#accounts__pyroworks__deployments__r5dfiiwp.eval-run.json +++ /dev/null @@ -1,521 +0,0 @@ -{ - "total_instances": 500, - "submitted_instances": 1, - "completed_instances": 0, - "resolved_instances": 0, - "unresolved_instances": 0, - "empty_patch_instances": 1, - "error_instances": 0, - "completed_ids": [], - "incomplete_ids": [ - "astropy__astropy-12907", - "astropy__astropy-13033", - "astropy__astropy-13236", - "astropy__astropy-13398", - "astropy__astropy-13579", - "astropy__astropy-13977", - "astropy__astropy-14096", - "astropy__astropy-14182", - "astropy__astropy-14309", - "astropy__astropy-14365", - "astropy__astropy-14369", - "astropy__astropy-14508", - "astropy__astropy-14539", - "astropy__astropy-14598", - "astropy__astropy-14995", - "astropy__astropy-7166", - "astropy__astropy-7336", - "astropy__astropy-7606", - "astropy__astropy-7671", - "astropy__astropy-8707", - "astropy__astropy-8872", - "django__django-10097", - "django__django-10554", - "django__django-10880", - "django__django-10914", - "django__django-10973", - "django__django-10999", - "django__django-11066", - "django__django-11087", - "django__django-11095", - "django__django-11099", - "django__django-11119", - "django__django-11133", - "django__django-11138", - "django__django-11141", - "django__django-11149", - "django__django-11163", - "django__django-11179", - "django__django-11206", - "django__django-11211", - "django__django-11239", - "django__django-11265", - "django__django-11276", - "django__django-11292", - "django__django-11299", - "django__django-11333", - "django__django-11400", - "django__django-11433", - "django__django-11451", - "django__django-11477", - "django__django-11490", - "django__django-11532", - "django__django-11551", - "django__django-11555", - "django__django-11603", - "django__django-11728", - "django__django-11734", - "django__django-11740", - "django__django-11749", - "django__django-11790", - "django__django-11815", - "django__django-11820", - "django__django-11848", - "django__django-11880", - "django__django-11885", - "django__django-11951", - "django__django-11964", - 
"django__django-11999", - "django__django-12039", - "django__django-12050", - "django__django-12125", - "django__django-12143", - "django__django-12155", - "django__django-12193", - "django__django-12209", - "django__django-12262", - "django__django-12273", - "django__django-12276", - "django__django-12304", - "django__django-12308", - "django__django-12325", - "django__django-12406", - "django__django-12419", - "django__django-12663", - "django__django-12708", - "django__django-12713", - "django__django-12741", - "django__django-12754", - "django__django-12774", - "django__django-12858", - "django__django-12965", - "django__django-13012", - "django__django-13023", - "django__django-13028", - "django__django-13033", - "django__django-13089", - "django__django-13109", - "django__django-13112", - "django__django-13121", - "django__django-13128", - "django__django-13158", - "django__django-13195", - "django__django-13212", - "django__django-13279", - "django__django-13297", - "django__django-13315", - "django__django-13343", - "django__django-13344", - "django__django-13346", - "django__django-13363", - "django__django-13401", - "django__django-13406", - "django__django-13410", - "django__django-13417", - "django__django-13449", - "django__django-13512", - "django__django-13513", - "django__django-13516", - "django__django-13551", - "django__django-13568", - "django__django-13569", - "django__django-13590", - "django__django-13658", - "django__django-13670", - "django__django-13741", - "django__django-13786", - "django__django-13794", - "django__django-13807", - "django__django-13809", - "django__django-13810", - "django__django-13820", - "django__django-13821", - "django__django-13837", - "django__django-13925", - "django__django-13933", - "django__django-13964", - "django__django-14007", - "django__django-14011", - "django__django-14017", - "django__django-14034", - "django__django-14053", - "django__django-14089", - "django__django-14122", - "django__django-14140", - "django__django-14155", - "django__django-14170", - "django__django-14238", - "django__django-14311", - "django__django-14315", - "django__django-14349", - "django__django-14351", - "django__django-14373", - "django__django-14376", - "django__django-14404", - "django__django-14434", - "django__django-14493", - "django__django-14500", - "django__django-14534", - "django__django-14539", - "django__django-14559", - "django__django-14580", - "django__django-14608", - "django__django-14631", - "django__django-14672", - "django__django-14725", - "django__django-14752", - "django__django-14765", - "django__django-14771", - "django__django-14787", - "django__django-14792", - "django__django-14855", - "django__django-14915", - "django__django-14999", - "django__django-15022", - "django__django-15037", - "django__django-15098", - "django__django-15103", - "django__django-15104", - "django__django-15127", - "django__django-15128", - "django__django-15161", - "django__django-15252", - "django__django-15268", - "django__django-15277", - "django__django-15278", - "django__django-15280", - "django__django-15315", - "django__django-15368", - "django__django-15375", - "django__django-15380", - "django__django-15382", - "django__django-15467", - "django__django-15499", - "django__django-15503", - "django__django-15525", - "django__django-15554", - "django__django-15561", - "django__django-15563", - "django__django-15569", - "django__django-15572", - "django__django-15629", - "django__django-15695", - "django__django-15731", - 
"django__django-15732", - "django__django-15741", - "django__django-15814", - "django__django-15851", - "django__django-15863", - "django__django-15916", - "django__django-15930", - "django__django-15957", - "django__django-15973", - "django__django-15987", - "django__django-16032", - "django__django-16082", - "django__django-16100", - "django__django-16116", - "django__django-16136", - "django__django-16139", - "django__django-16145", - "django__django-16255", - "django__django-16256", - "django__django-16263", - "django__django-16315", - "django__django-16333", - "django__django-16429", - "django__django-16454", - "django__django-16485", - "django__django-16493", - "django__django-16502", - "django__django-16527", - "django__django-16560", - "django__django-16569", - "django__django-16595", - "django__django-16612", - "django__django-16631", - "django__django-16642", - "django__django-16661", - "django__django-16662", - "django__django-16667", - "django__django-16801", - "django__django-16819", - "django__django-16877", - "django__django-16899", - "django__django-16901", - "django__django-16938", - "django__django-16950", - "django__django-17029", - "django__django-17084", - "django__django-17087", - "django__django-7530", - "django__django-9296", - "matplotlib__matplotlib-13989", - "matplotlib__matplotlib-14623", - "matplotlib__matplotlib-20488", - "matplotlib__matplotlib-20676", - "matplotlib__matplotlib-20826", - "matplotlib__matplotlib-20859", - "matplotlib__matplotlib-21568", - "matplotlib__matplotlib-22719", - "matplotlib__matplotlib-22865", - "matplotlib__matplotlib-22871", - "matplotlib__matplotlib-23299", - "matplotlib__matplotlib-23314", - "matplotlib__matplotlib-23412", - "matplotlib__matplotlib-23476", - "matplotlib__matplotlib-24026", - "matplotlib__matplotlib-24149", - "matplotlib__matplotlib-24177", - "matplotlib__matplotlib-24570", - "matplotlib__matplotlib-24627", - "matplotlib__matplotlib-24637", - "matplotlib__matplotlib-24870", - "matplotlib__matplotlib-24970", - "matplotlib__matplotlib-25122", - "matplotlib__matplotlib-25287", - "matplotlib__matplotlib-25311", - "matplotlib__matplotlib-25332", - "matplotlib__matplotlib-25479", - "matplotlib__matplotlib-25775", - "matplotlib__matplotlib-25960", - "matplotlib__matplotlib-26113", - "matplotlib__matplotlib-26208", - "matplotlib__matplotlib-26291", - "matplotlib__matplotlib-26342", - "matplotlib__matplotlib-26466", - "mwaskom__seaborn-3069", - "mwaskom__seaborn-3187", - "pallets__flask-5014", - "psf__requests-1142", - "psf__requests-1724", - "psf__requests-1766", - "psf__requests-1921", - "psf__requests-2317", - "psf__requests-2931", - "psf__requests-5414", - "psf__requests-6028", - "pydata__xarray-2905", - "pydata__xarray-3095", - "pydata__xarray-3151", - "pydata__xarray-3305", - "pydata__xarray-3677", - "pydata__xarray-3993", - "pydata__xarray-4075", - "pydata__xarray-4094", - "pydata__xarray-4356", - "pydata__xarray-4629", - "pydata__xarray-4687", - "pydata__xarray-4695", - "pydata__xarray-4966", - "pydata__xarray-6461", - "pydata__xarray-6599", - "pydata__xarray-6721", - "pydata__xarray-6744", - "pydata__xarray-6938", - "pydata__xarray-6992", - "pydata__xarray-7229", - "pydata__xarray-7233", - "pydata__xarray-7393", - "pylint-dev__pylint-4551", - "pylint-dev__pylint-4604", - "pylint-dev__pylint-4661", - "pylint-dev__pylint-4970", - "pylint-dev__pylint-6386", - "pylint-dev__pylint-6528", - "pylint-dev__pylint-6903", - "pylint-dev__pylint-7080", - "pylint-dev__pylint-7277", - "pylint-dev__pylint-8898", - 
"pytest-dev__pytest-10051", - "pytest-dev__pytest-10081", - "pytest-dev__pytest-10356", - "pytest-dev__pytest-5262", - "pytest-dev__pytest-5631", - "pytest-dev__pytest-5787", - "pytest-dev__pytest-5809", - "pytest-dev__pytest-5840", - "pytest-dev__pytest-6197", - "pytest-dev__pytest-6202", - "pytest-dev__pytest-7205", - "pytest-dev__pytest-7236", - "pytest-dev__pytest-7324", - "pytest-dev__pytest-7432", - "pytest-dev__pytest-7490", - "pytest-dev__pytest-7521", - "pytest-dev__pytest-7571", - "pytest-dev__pytest-7982", - "pytest-dev__pytest-8399", - "scikit-learn__scikit-learn-10297", - "scikit-learn__scikit-learn-10844", - "scikit-learn__scikit-learn-10908", - "scikit-learn__scikit-learn-11310", - "scikit-learn__scikit-learn-11578", - "scikit-learn__scikit-learn-12585", - "scikit-learn__scikit-learn-12682", - "scikit-learn__scikit-learn-12973", - "scikit-learn__scikit-learn-13124", - "scikit-learn__scikit-learn-13135", - "scikit-learn__scikit-learn-13142", - "scikit-learn__scikit-learn-13328", - "scikit-learn__scikit-learn-13439", - "scikit-learn__scikit-learn-13496", - "scikit-learn__scikit-learn-13779", - "scikit-learn__scikit-learn-14053", - "scikit-learn__scikit-learn-14087", - "scikit-learn__scikit-learn-14141", - "scikit-learn__scikit-learn-14496", - "scikit-learn__scikit-learn-14629", - "scikit-learn__scikit-learn-14710", - "scikit-learn__scikit-learn-14894", - "scikit-learn__scikit-learn-14983", - "scikit-learn__scikit-learn-15100", - "scikit-learn__scikit-learn-25102", - "scikit-learn__scikit-learn-25232", - "scikit-learn__scikit-learn-25747", - "scikit-learn__scikit-learn-25931", - "scikit-learn__scikit-learn-25973", - "scikit-learn__scikit-learn-26194", - "scikit-learn__scikit-learn-26323", - "scikit-learn__scikit-learn-9288", - "sphinx-doc__sphinx-10323", - "sphinx-doc__sphinx-10435", - "sphinx-doc__sphinx-10449", - "sphinx-doc__sphinx-10466", - "sphinx-doc__sphinx-10614", - "sphinx-doc__sphinx-10673", - "sphinx-doc__sphinx-11445", - "sphinx-doc__sphinx-11510", - "sphinx-doc__sphinx-7440", - "sphinx-doc__sphinx-7454", - "sphinx-doc__sphinx-7462", - "sphinx-doc__sphinx-7590", - "sphinx-doc__sphinx-7748", - "sphinx-doc__sphinx-7757", - "sphinx-doc__sphinx-7889", - "sphinx-doc__sphinx-7910", - "sphinx-doc__sphinx-7985", - "sphinx-doc__sphinx-8035", - "sphinx-doc__sphinx-8056", - "sphinx-doc__sphinx-8120", - "sphinx-doc__sphinx-8265", - "sphinx-doc__sphinx-8269", - "sphinx-doc__sphinx-8459", - "sphinx-doc__sphinx-8475", - "sphinx-doc__sphinx-8548", - "sphinx-doc__sphinx-8551", - "sphinx-doc__sphinx-8593", - "sphinx-doc__sphinx-8595", - "sphinx-doc__sphinx-8621", - "sphinx-doc__sphinx-8638", - "sphinx-doc__sphinx-8721", - "sphinx-doc__sphinx-9229", - "sphinx-doc__sphinx-9230", - "sphinx-doc__sphinx-9258", - "sphinx-doc__sphinx-9281", - "sphinx-doc__sphinx-9320", - "sphinx-doc__sphinx-9367", - "sphinx-doc__sphinx-9461", - "sphinx-doc__sphinx-9591", - "sphinx-doc__sphinx-9602", - "sphinx-doc__sphinx-9658", - "sphinx-doc__sphinx-9673", - "sphinx-doc__sphinx-9698", - "sphinx-doc__sphinx-9711", - "sympy__sympy-11618", - "sympy__sympy-12096", - "sympy__sympy-12419", - "sympy__sympy-12481", - "sympy__sympy-12489", - "sympy__sympy-13031", - "sympy__sympy-13091", - "sympy__sympy-13372", - "sympy__sympy-13480", - "sympy__sympy-13551", - "sympy__sympy-13615", - "sympy__sympy-13647", - "sympy__sympy-13757", - "sympy__sympy-13798", - "sympy__sympy-13852", - "sympy__sympy-13877", - "sympy__sympy-13878", - "sympy__sympy-13974", - "sympy__sympy-14248", - "sympy__sympy-14531", - "sympy__sympy-14711", 
- "sympy__sympy-14976", - "sympy__sympy-15017", - "sympy__sympy-15345", - "sympy__sympy-15349", - "sympy__sympy-15599", - "sympy__sympy-15809", - "sympy__sympy-15875", - "sympy__sympy-15976", - "sympy__sympy-16450", - "sympy__sympy-16597", - "sympy__sympy-16766", - "sympy__sympy-16792", - "sympy__sympy-16886", - "sympy__sympy-17139", - "sympy__sympy-17318", - "sympy__sympy-17630", - "sympy__sympy-17655", - "sympy__sympy-18189", - "sympy__sympy-18199", - "sympy__sympy-18211", - "sympy__sympy-18698", - "sympy__sympy-18763", - "sympy__sympy-19040", - "sympy__sympy-19346", - "sympy__sympy-19495", - "sympy__sympy-19637", - "sympy__sympy-19783", - "sympy__sympy-19954", - "sympy__sympy-20154", - "sympy__sympy-20428", - "sympy__sympy-20438", - "sympy__sympy-20590", - "sympy__sympy-20801", - "sympy__sympy-20916", - "sympy__sympy-21379", - "sympy__sympy-21596", - "sympy__sympy-21612", - "sympy__sympy-21847", - "sympy__sympy-21930", - "sympy__sympy-22080", - "sympy__sympy-22456", - "sympy__sympy-22714", - "sympy__sympy-22914", - "sympy__sympy-23262", - "sympy__sympy-23413", - "sympy__sympy-23534", - "sympy__sympy-23824", - "sympy__sympy-23950", - "sympy__sympy-24066", - "sympy__sympy-24213", - "sympy__sympy-24443", - "sympy__sympy-24539", - "sympy__sympy-24562", - "sympy__sympy-24661" - ], - "empty_patch_ids": [ - "astropy__astropy-13453" - ], - "submitted_ids": [ - "astropy__astropy-13453" - ], - "resolved_ids": [], - "unresolved_ids": [], - "error_ids": [], - "schema_version": 2 -} From 0d1231101247719168c2354ebef95a58fcac8239 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Thu, 16 Oct 2025 17:19:39 -0700 Subject: [PATCH 04/10] addressing dereks comments --- examples/swebench/README.md | 309 ++++++++++++++++++++--- examples/swebench/run_swe_agent_fw.py | 97 +------ examples/swebench/server.py | 26 +- examples/swebench/tests/conftest.py | 34 --- examples/swebench/tests/test_swebench.py | 226 ++++------------- examples/swebench/tracing_model.py | 89 ++++++- pyproject.toml | 4 + 7 files changed, 423 insertions(+), 362 deletions(-) delete mode 100644 examples/swebench/tests/conftest.py diff --git a/examples/swebench/README.md b/examples/swebench/README.md index f4082f76..a696ed6f 100644 --- a/examples/swebench/README.md +++ b/examples/swebench/README.md @@ -1,57 +1,300 @@ -SWE-bench (Remote) - Local (non-Docker) Setup and Usage +# SWE-bench Evaluation Example -Prerequisites -- Python 3.12 environment (same one you use for this repo) -- Fireworks API key -- mini-swe-agent and datasets (for patch generation) -- SWE-bench harness installed (for evaluation) +This example shows how to evaluate LLM models on the SWE-bench software engineering benchmark using eval-protocol. -Setup mini-swe-agent (non-Docker) -1) Install dependencies -```bash -pip install mini-swe-agent datasets -``` +## Quick Start + +### 1. Install Dependencies -2) Configure API key for mini-swe-agent ```bash -mini-extra config set FIREWORKS_API_KEY +# From the python-sdk repository root +cd python-sdk + +# Install eval-protocol with swebench support +pip install -e ".[swebench]" ``` -3) (Optional) Test connectivity +### 2. Set up mini-swe-agent + +mini-swe-agent requires a Fireworks API key to function: + ```bash -python3 examples/swebench/run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905 --test +# Configure API key for mini-swe-agent +mini-extra config set FIREWORKS_API_KEY your_fireworks_api_key + +# Verify it's set +mini-extra config get FIREWORKS_API_KEY ``` -Install SWE-bench evaluation harness +### 3. 
Install SWE-bench Harness + ```bash +# Navigate to the swebench example directory +cd examples/swebench + +# Clone and install SWE-bench git clone https://github.com/princeton-nlp/SWE-bench pip install -e SWE-bench ``` -Environment +### 4. Set Environment Variables + +```bash +export FIREWORKS_API_KEY="your_fireworks_api_key" +``` + +## Running the Evaluation + +**IMPORTANT:** Always run both the server and tests from the `examples/swebench/` directory. + +### Step 1: Start the Server + +Open a terminal and run: + +```bash +cd examples/swebench +python server.py +``` + +You should see: +``` +INFO: Uvicorn running on http://127.0.0.1:3000 (Press CTRL+C to quit) +``` + +### Step 2: Configure Your Test + +Edit `tests/test_swebench.py` to set your model and parameters: + +```python +completion_params=[{ + "model": "accounts/fireworks/models/your-model-name", # Edit this + "model_kwargs": { + "temperature": 0.2, # Optional + # "max_tokens": 2048, # Optional + # "reasoning": "high", # Optional + } +}], +max_concurrent_rollouts=3, # How many instances to run in parallel +``` + +To test different numbers of instances, edit line 26: +```python +def rows() -> List[EvaluationRow]: + return rows_from_indices(2) # Change 2 to desired number (max 500) +``` + +### Step 3: Run the Test + +Open a second terminal: + +```bash +cd examples/swebench +pytest tests/test_swebench.py -v -s +``` + +## What Happens During a Run + +For each instance (row): + +1. **Server receives request** from pytest +2. **Wrapper script** (`run_swe_agent_fw.py`) is called with the instance index +3. **mini-swe-agent** runs in a Docker container for that specific repository +4. **Agent attempts to solve** the issue by editing code +5. **Patch is generated** and saved to `preds.json` +6. **SWE-bench harness** applies the patch and runs tests +7. **Results** are written to the row directory +8. **Test fetches results** and displays pass/fail in the UI + +## Understanding the Output + +### Directory Structure + +Each instance creates its own `row_N/` directory: + +``` +examples/swebench/ +├── row_0/ # First instance +│ ├── preds.json # ← Model's generated patch +│ ├── astropy__astropy-12907/ # Instance-specific folder +│ │ └── astropy__astropy-12907.traj.json # Agent's execution trace +│ ├── logs/ # Harness execution logs +│ │ └── run_evaluation/ +│ │ └── eval-run/ +│ │ └── / +│ │ └── astropy__astropy-12907/ +│ │ ├── report.json # ← Test results (pass/fail) +│ │ ├── test_output.txt # Test execution output +│ │ ├── patch.diff # Applied patch +│ │ └── eval.sh # Evaluation script +│ ├── agent_0.log # Agent console output +│ ├── exit_statuses_*.yaml # Exit status if failed +│ └── .eval-run.json # Overall run summary +├── row_1/ # Second instance +│ └── ... +└── ... +``` + +### Key Files Explained + +#### `preds.json` - Model Predictions +Location: `row_N/preds.json` + +Contains the patch generated by the model: +```json +{ + "astropy__astropy-12907": { + "model_name_or_path": "accounts/fireworks/models/...", + "instance_id": "astropy__astropy-12907", + "model_patch": "diff --git a/... (the actual patch)" + } +} +``` + +**If missing:** Agent failed before generating a patch (check `exit_statuses_*.yaml`) + +#### `report.json` - Test Results +Location: `row_N/logs/run_evaluation/eval-run///report.json` + +Contains pass/fail status after running tests: +```json +{ + "astropy__astropy-12907": { + "patch_is_None": false, + "patch_exists": true, + "patch_successfully_applied": true, + "resolved": true, // ← Was the issue fixed? 
+ "tests_status": { + "FAIL_TO_PASS": {"success": [...], "failure": []}, + "PASS_TO_PASS": {"success": [...], "failure": []} + } + } +} +``` + +- `resolved: true` = Instance solved! All required tests pass. +- `resolved: false` = Instance not solved (tests still failing) + +**If missing:** Agent didn't generate a patch or harness didn't run + +#### `exit_statuses_*.yaml` - Why Runs Failed +Location: `row_N/exit_statuses_*.yaml` + +```yaml +instances_by_exit_status: + Submitted: [] + LimitsExceeded: ["astropy__astropy-12907"] # Hit step/cost limits + Error: [] +``` + +Common statuses: +- `Submitted`: Completed normally +- `LimitsExceeded`: Agent hit max steps or cost limit +- `Error`: Unexpected error during execution + +#### `agent_N.log` - Agent Execution +Location: `row_N/agent_N.log` + +Full console output from the agent run, including: +- Docker container startup +- Model API calls +- Commands executed +- Errors (if any) + +#### `*.traj.json` - Agent Trajectory +Location: `row_N//.traj.json` + +Complete record of the agent's execution: +```json +{ + "instance_id": "astropy__astropy-12907", + "info": { + "submission": "...", // The patch + "exit_status": "Submitted", + "model_stats": { + "instance_cost": 0.05, + "api_calls": 15 + } + }, + "messages": [...] // All agent messages +} +``` + +## Viewing Results + +### In the Terminal + +The test output shows: +``` +INFO:test_swebench:[Row 0] Found instance_id: astropy__astropy-12907 +INFO:test_swebench:[Row 0] Report says resolved=True +INFO:test_swebench:[Row 0] Final: resolved=True, reason=harness_resolved=True +``` + +### In the Eval Protocol UI + +If Elasticsearch is running, visit: `http://localhost:8000` +- View aggregate scores +- Inspect individual trajectories +- Filter by resolved/unresolved +- See cost and token usage + +### Check Individual Files + ```bash -export FIREWORKS_API_KEY="" +# Check if instance was solved +cat row_0/logs/run_evaluation/eval-run//astropy__astropy-12907/report.json | jq '.["astropy__astropy-12907"].resolved' + +# View the generated patch +cat row_0/preds.json | jq '.["astropy__astropy-12907"].model_patch' + +# Check exit status +cat row_0/exit_statuses_*.yaml ``` -Run the server +## Performance Notes + +- **Small test (2 instances):** ~10-30 minutes +- **Full dataset (500 instances):** 24-48 hours on a 16-core machine +- **Concurrent runs:** Recommended 3-5 based on CPU/memory +- **Docker space:** ~100GB for all images (downloads happen automatically) + +## Troubleshooting + +### Docker container fails to start ```bash -python examples/swebench/server.py +# Check Docker is running +docker ps + +# Check disk space +df -h ``` -What the server does -- Invokes `run_swe_agent_fw.py` in batch mode with a single-slice per request -- Writes outputs to a per-row directory: `./row_{index}/` - - `row_{index}/preds.json` - - `row_{index}//.traj.json` -- Runs the SWE-bench harness on `row_{index}/preds.json` +### Agent hits step limits +Instances that consistently hit limits may need: +- Higher step limit (edit mini-swe-agent config) +- Different prompting strategy +- More capable model -Run pytest to evaluate a model on SWE-bench +### Server not responding ```bash -cd /Users/shrey/Documents/python-sdk -pytest examples/swebench/tests/test_swebench.py -v -s +# Check server is running +curl http://127.0.0.1:3000/status?rollout_id=test + +# Check server logs for errors +# (shown in terminal where server.py is running) ``` -Notes -- The test currently generates 10 rows by numeric index (0–9) -- Each request triggers the 
server to run one SWE-bench instance and write to its own `row_{index}` -- Control harness workers via: `export SWEBENCH_EVAL_WORKERS=5` +## Next Steps + +- Review results in `row_*/logs/.../report.json` +- Analyze failed instances to improve your model +- Run on larger subsets to get statistical significance +- Export results for further analysis + +## Support + +For issues: +- Check agent logs: `row_N/agent_N.log` +- Check exit statuses: `row_N/exit_statuses_*.yaml` +- Verify Docker has sufficient resources +- Ensure API key is valid and has credits diff --git a/examples/swebench/run_swe_agent_fw.py b/examples/swebench/run_swe_agent_fw.py index cef3de9a..4d145038 100755 --- a/examples/swebench/run_swe_agent_fw.py +++ b/examples/swebench/run_swe_agent_fw.py @@ -12,14 +12,6 @@ Usage: python run_swe_agent_fw.py [options] -Examples: - # Serverless models - python run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/kimi-k2-instruct --instances 10 --workers 5 - python run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/llama-v3p1-70b-instruct --subset full --workers 8 - - # Deployed models - python run_swe_agent_fw.py fireworks_ai/accounts/cognition/deployedModels/swe-1-mtp-tc1huggf --single 0 - python run_swe_agent_fw.py fireworks_ai/accounts/fireworks/models/kimi-k2-instruct --test Requirements: - mini-swe-agent: pip install mini-swe-agent @@ -39,94 +31,11 @@ import litellm -class FireworksCompatibleModel(LitellmModel): - """ - Fireworks-compatible wrapper for LitellmModel. - """ - - def __init__(self, **kwargs): - if model_id := os.environ.get("FIREWORKS_MODEL_ID"): - kwargs["model_name"] = model_id - print(f"kwargs: {kwargs}") - if "model_kwargs" not in kwargs: - kwargs["model_kwargs"] = {} - - # CRITICAL: Set drop_params to False so stop sequences aren't stripped! 
- kwargs["model_kwargs"]["drop_params"] = False - - # Get existing stop sequences - existing_stop = kwargs["model_kwargs"].get("stop", []) - if isinstance(existing_stop, str): - existing_stop = [existing_stop] - elif existing_stop is None: - existing_stop = [] - - # Add stop sequences (only the non-natural ones) - stop_sequences = existing_stop + [ - # ASCII versions - "<|User|>", - "<|Assistant|>", - # Full-width PIPE versions (U+FF5C) - "<|User|>", # \uff5c - "<|Assistant|>", - "```<|", - "<|User", - "<|Ass", - # Full-width LETTER L versions (U+FF4C) - "<lUser|>", # \uff4c - "<lAssistant|>", - "```<l", - "<lUser", - "<lAss", - ] - kwargs["model_kwargs"]["stop"] = stop_sequences - kwargs["model_kwargs"]["max_tokens"] = 1024 # Reduce to 1024 to save tokens - - if "temperature" not in kwargs["model_kwargs"]: - kwargs["model_kwargs"]["temperature"] = 0.0 - - # Apply per-run overrides injected by the wrapper (no environment variables) - overrides = globals().get("WRAPPER_MODEL_OVERRIDES") - if isinstance(overrides, dict): - if overrides.get("reasoning") in ("low", "medium", "high"): - kwargs["model_kwargs"]["reasoning_effort"] = overrides["reasoning"] - if overrides.get("temperature") is not None: - try: - kwargs["model_kwargs"]["temperature"] = float(overrides["temperature"]) - except Exception: - pass - if overrides.get("max_tokens") is not None: - try: - kwargs["model_kwargs"]["max_tokens"] = int(overrides["max_tokens"]) - except Exception: - pass - - super().__init__(**kwargs) - - def _query(self, messages: list[dict[str, str]], **kwargs): - """Remove non-standard fields before sending to Fireworks API.""" - # Keep only standard OpenAI-compatible fields - clean_messages = [] - for msg in messages: - clean_msg = {"role": msg["role"], "content": msg["content"]} - if "tool_calls" in msg: - clean_msg["tool_calls"] = msg["tool_calls"] - if "name" in msg: - clean_msg["name"] = msg["name"] - clean_messages.append(clean_msg) - - # IMPORTANT: Ensure drop_params stays False in the actual query - kwargs_with_stop = kwargs.copy() - if "drop_params" not in kwargs_with_stop: - kwargs_with_stop["drop_params"] = False - - return super()._query(clean_messages, **kwargs_with_stop) - - def __get_api_key(): """Get Fireworks API key from environment or mini-swe-agent config.""" # Environment variable takes precedence - if api_key := os.environ.get("FIREWORKS_API_KEY"): + api_key = os.environ.get("FIREWORKS_API_KEY") + if api_key: return api_key # Try to get API key from mini-swe-agent's config system @@ -213,7 +122,7 @@ def __build_command(args, wrapper_module_path): "--model", args.model_id, "--model-class", - model_class, + "tracing_model.FireworksCompatibleModel", "--subset", args.subset, "--split", diff --git a/examples/swebench/server.py b/examples/swebench/server.py index ae3df983..01063645 100644 --- a/examples/swebench/server.py +++ b/examples/swebench/server.py @@ -51,7 +51,10 @@ def _worker(): env["FIREWORKS_API_KEY"] = os.environ["FIREWORKS_API_KEY"] # Make sure the tracing model module is importable by the subprocess # so "tracing_model.TracingFireworksModel" can be imported - env["PYTHONPATH"] = "/Users/shrey/Documents/python-sdk/examples/swebench:" + env.get("PYTHONPATH", "") + from pathlib import Path + + script_dir = Path(__file__).parent + env["PYTHONPATH"] = f"{script_dir}:{env.get('PYTHONPATH', '')}" # Determine output directory (from env or default) out_dir = os.getcwd() @@ -62,15 +65,17 @@ def _worker(): # Extract model_kwargs from req.metadata (forwarded from input_metadata) model_kwargs 
= {} - logger.info(f"DEBUG: req.metadata attributes: {dir(req.metadata)}") + # convert to logger.debug everywhere, remove debug then + logger.debug(f"req.metadata attributes: {dir(req.metadata)}") + if hasattr(req.metadata, "model_kwargs"): mk = getattr(req.metadata, "model_kwargs", None) - logger.info(f"DEBUG: Found req.metadata.model_kwargs = {mk}") + logger.debug(f"Found req.metadata.model_kwargs = {mk}") if isinstance(mk, dict): model_kwargs = mk - logger.info(f"Extracted model_kwargs from metadata: {model_kwargs}") + logger.debug(f"Extracted model_kwargs from metadata: {model_kwargs}") else: - logger.info("DEBUG: req.metadata has NO model_kwargs attribute") + logger.debug("req.metadata has NO model_kwargs attribute") # Set tracing URL if req.model_base_url: @@ -114,16 +119,6 @@ def _worker(): ) ret = proc.wait() - # Stream stdout/stderr to logs - # assert proc.stdout is not None and proc.stderr is not None - # for line in proc.stdout: - # logger.info(line.rstrip("\n")) - # for line in proc.stderr: - # logger.warning(line.rstrip("\n")) - - # ret = proc.wait() - # logger.info(f"mini-swe-agent exited with code {ret}") - # Use row-specific preds.json to avoid cross-run interference preds_path = row_dir / "preds.json" if preds_path.exists(): @@ -154,7 +149,6 @@ def _worker(): for line in eval_proc.stdout: logger.info(line.rstrip("\n")) eval_rc = eval_proc.wait() - # logger.info(f"SWE-bench harness exited with code {eval_rc}") except Exception as e: # Best-effort: mark error but still finish to unblock polling diff --git a/examples/swebench/tests/conftest.py b/examples/swebench/tests/conftest.py deleted file mode 100644 index 3b81f7ad..00000000 --- a/examples/swebench/tests/conftest.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -import pytest - - -MODEL_ID_OPT = None -CONCURRENCY_OPT = None -MODEL_KWARGS_OPT = None - - -def pytest_addoption(parser): - parser.addoption("--model-id", action="store", default=None, help="Fireworks model ID") - parser.addoption("--concurrent-workers", action="store", type=int, default=None, help="Max concurrent rollouts") - parser.addoption("--temperature", action="store", type=float, default=None, help="Model temperature") - parser.addoption("--max-tokens", action="store", type=int, default=None, help="Max tokens") - parser.addoption( - "--reasoning", action="store", choices=["low", "medium", "high"], default=None, help="Reasoning effort" - ) - - -def pytest_configure(config): - global MODEL_ID_OPT, CONCURRENCY_OPT, MODEL_KWARGS_OPT - MODEL_ID_OPT = config.getoption("--model-id") - CONCURRENCY_OPT = config.getoption("--concurrent-workers") - temp = config.getoption("--temperature") - mtok = config.getoption("--max-tokens") - reas = config.getoption("--reasoning") - mk = {} - if temp is not None: - mk["temperature"] = float(temp) - if mtok is not None: - mk["max_tokens"] = int(mtok) - if reas is not None: - mk["reasoning"] = reas - MODEL_KWARGS_OPT = mk or None diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py index 48c130a9..6e2410a6 100644 --- a/examples/swebench/tests/test_swebench.py +++ b/examples/swebench/tests/test_swebench.py @@ -1,132 +1,24 @@ from typing import List -import os -import pytest -import requests import yaml from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor 
-from eval_protocol.types.remote_rollout_processor import DataLoaderConfig -from eval_protocol.quickstart.utils import filter_longest_conversation - -# Reuse the converter used by the built-in adapter -from eval_protocol.adapters.fireworks_tracing import convert_trace_dict_to_evaluation_row -import conftest - - -MODEL_ID = conftest.MODEL_ID_OPT -if not MODEL_ID: - raise RuntimeError("--model-id is required. Example: --model-id 'fireworks_ai/accounts/.../models/'") -CLI_CONCURRENCY = conftest.CONCURRENCY_OPT -CLI_MODEL_KWARGS = conftest.MODEL_KWARGS_OPT - -# Build completion_params once (used by decorator) -COMPLETION_PARAMS = {"model": MODEL_ID} -if CLI_MODEL_KWARGS: - COMPLETION_PARAMS["model_kwargs"] = CLI_MODEL_KWARGS - - -def fetch_traces_with_auth(config: DataLoaderConfig) -> List[EvaluationRow]: - """ - Fetch traces directly from the Fireworks tracing proxy with Authorization header - and convert them into EvaluationRows using the same converter as the adapter. - """ - base_url = (config.model_base_url or "https://tracing.fireworks.ai").rstrip("/") - api_key = os.environ.get("FIREWORKS_API_KEY") - if not api_key: - return [] - - url = f"{base_url}/v1/traces" - headers = {"Authorization": f"Bearer {api_key}"} - params = { - "tags": [f"rollout_id:{config.rollout_id}"], - "max_retries": 5, - "sleep_between_gets": 0.1, - } - - try: - resp = requests.get(url, params=params, headers=headers, timeout=300) - print(f"[fetch_traces] status={resp.status_code} url={resp.url}") # debug - resp.raise_for_status() - body = resp.json() or {} - traces = body.get("traces", []) - print(f"[fetch_traces] traces_found={len(traces)}") - except Exception as e: - print(f"[fetch_traces] error={e}") - return [] - - rows: List[EvaluationRow] = [] - for tr in traces: - row = convert_trace_dict_to_evaluation_row(tr, include_tool_calls=True, span_name=None) - if row: - rows.append(row) - return rows - - -def _merge_rows_into_one(rows: List[EvaluationRow]) -> List[EvaluationRow]: - if not rows: - return [] - # Use the first row as the base; merge messages from all rows - base = rows[0] - seen = set() - merged_msgs: List[Message] = [] - for r in rows: - for m in r.messages or []: - # Dedup by role+name+content+tool_calls signature - tool_sig = None - if getattr(m, "tool_calls", None): - tool_sig = tuple( - (tc.get("id"), tc.get("type"), (tc.get("function") or {}).get("name")) for tc in m.tool_calls - ) - key = (m.role, getattr(m, "name", None), m.content, tool_sig) - if key in seen: - continue - seen.add(key) - merged_msgs.append(m) - base.messages = merged_msgs - return [base] - - -def fireworks_output_data_loader(config: DataLoaderConfig) -> DynamicDataLoader: - return DynamicDataLoader( - generators=[lambda: fetch_traces_with_auth(config)], - preprocess_fn=_merge_rows_into_one, # merge all tool/LLM traces into one row - ) - - -def rows_from_instance_ids(ids: list[str]) -> List[EvaluationRow]: - out = [] - for idx, iid in enumerate(ids): - out.append( - EvaluationRow( - messages=[Message(role="user", content=f"Run SWE-bench instance {iid}")], - input_metadata={ - "row_id": str(idx), # ← use instance_id here - "instance_id": iid, # ← explicit for debugging - "instance_index": str(idx), # ← optional: keep index - "completion_params": {"model": MODEL_ID}, - }, - ) - ) - return out +from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader +import json +from pathlib import Path def rows_from_indices(count: int) -> List[EvaluationRow]: out: List[EvaluationRow] = [] for idx in range(count): - 
metadata = { - "row_id": str(idx), - "instance_index": str(idx), - } - # Add model_kwargs to metadata so server can read from req.metadata - if CLI_MODEL_KWARGS: - metadata["model_kwargs"] = CLI_MODEL_KWARGS - out.append( EvaluationRow( - messages=[Message(role="user", content=f"Run SWE-bench index {idx}")], - input_metadata=metadata, + messages=[], + input_metadata={ + "row_id": str(idx), + "instance_index": str(idx), + }, ) ) return out @@ -134,33 +26,33 @@ def rows_from_indices(count: int) -> List[EvaluationRow]: def rows() -> List[EvaluationRow]: # Generate 10 rows by index; server maps index -> dataset instance via --slice - return rows_from_indices(10) + return rows_from_indices(2) # -------------------- Harness result attachment (UI pass/fail) -------------------- -import json -from pathlib import Path - - -def _safe_model_id(model_id: str) -> str: - return model_id.replace("/", "__").replace(":", "-") - - -def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: - """Attach evaluation result by reading harness report or exit status.""" - import logging - - logger = logging.getLogger(__name__) +@evaluation_test( + data_loaders=DynamicDataLoader( + generators=[rows], + ), + rollout_processor=RemoteRolloutProcessor( + remote_base_url="http://127.0.0.1:3000", + model_base_url="https://tracing.fireworks.ai", + timeout_seconds=1800, + output_data_loader=default_fireworks_output_data_loader, + ), + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], + max_concurrent_rollouts=3, +) +async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: + """Evaluate SWE-bench instance by reading harness report or exit status.""" - # Get row_id and instance_id + # Get row_id try: - row_id = str(row.input_metadata.row_id) # ← use attribute, not .get() - except Exception as e: - logger.warning(f"Could not get row_id: {e}") + row_id = str(row.input_metadata.row_id) + except Exception: return row row_dir = Path.cwd() / f"row_{row_id}" - logger.info(f"[Row {row_id}] Looking for results in {row_dir}") # Find instance_id from preds.json preds_path = row_dir / "preds.json" @@ -169,57 +61,48 @@ def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: try: preds = json.loads(preds_path.read_text()) instance_id = next(iter(preds.keys()), None) - logger.info(f"[Row {row_id}] Found instance_id: {instance_id}") - except Exception as e: - logger.warning(f"[Row {row_id}] Could not read preds.json: {e}") + except Exception: + pass if not instance_id: - logger.warning(f"[Row {row_id}] No instance_id found, skipping eval result") return row resolved: bool | None = None reason_text: str | None = None - # 1. 
Try to read from report.json (harness ran tests) - safe_model = _safe_model_id(model_id) - report_path = row_dir / "logs" / "run_evaluation" / "eval-run" / safe_model / instance_id / "report.json" + # Get model from completion_params and convert to safe directory name (matching SWE-bench convention) + model_id = row.input_metadata.completion_params.get("model") if row.input_metadata.completion_params else None + if not model_id: + return row + safe_model = model_id.replace("/", "__").replace(":", "-") + # Read from report.json (harness ran tests) + report_path = row_dir / "logs" / "run_evaluation" / "eval-run" / safe_model / instance_id / "report.json" if report_path.exists(): - logger.info(f"[Row {row_id}] Found report.json at {report_path}") try: report_data = json.loads(report_path.read_text()) - instance_data = report_data.get(instance_id, {}) - resolved = bool(instance_data.get("resolved", False)) + resolved = bool(report_data.get(instance_id, {}).get("resolved", False)) reason_text = f"harness_resolved={resolved}" - logger.info(f"[Row {row_id}] Report says resolved={resolved}") - except Exception as e: - logger.error(f"[Row {row_id}] Failed to parse report.json: {e}") - else: - logger.info(f"[Row {row_id}] No report.json found at {report_path}") + except Exception: + pass - # 2. If no report, check exit status YAML (agent didn't produce a patch) + # If no report, check exit status YAML if resolved is None: exit_status_files = sorted(row_dir.glob("exit_statuses_*.yaml")) if exit_status_files: - exit_file = exit_status_files[-1] - logger.info(f"[Row {row_id}] Reading exit status from {exit_file.name}") try: - status_doc = yaml.safe_load(exit_file.read_text()) or {} + status_doc = yaml.safe_load(exit_status_files[-1].read_text()) or {} by_status = status_doc.get("instances_by_exit_status", {}) for status_name, ids in by_status.items(): if instance_id in (ids or []): resolved = False reason_text = f"exit_status={status_name}" - logger.info(f"[Row {row_id}] Exit status: {status_name}") break - except Exception as e: - logger.error(f"[Row {row_id}] Failed to parse exit status: {e}") - else: - logger.warning(f"[Row {row_id}] No exit status YAML found") + except Exception: + pass - # 3. 
Attach result if we found anything + # Attach result if resolved is not None: - logger.info(f"[Row {row_id}] Final: resolved={resolved}, reason={reason_text}") row.evaluation_result = EvaluateResult( score=1.0 if resolved else 0.0, reason=reason_text or f"resolved={resolved}", @@ -233,26 +116,5 @@ def attach_eval_result(row: EvaluationRow, model_id: str) -> EvaluationRow: ) }, ) - else: - logger.warning(f"[Row {row_id}] Could not determine resolved status") - - return row - -@evaluation_test( - data_loaders=DynamicDataLoader( - generators=[rows], - ), - rollout_processor=RemoteRolloutProcessor( - remote_base_url="http://127.0.0.1:3000", - model_base_url="https://tracing.fireworks.ai", - timeout_seconds=1800, - output_data_loader=fireworks_output_data_loader, - ), - completion_params=[COMPLETION_PARAMS], - max_concurrent_rollouts=(CLI_CONCURRENCY or 2), -) -async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: - assert len(row.messages) >= 1 - row = attach_eval_result(row, MODEL_ID) return row diff --git a/examples/swebench/tracing_model.py b/examples/swebench/tracing_model.py index 8862388c..6f482efd 100644 --- a/examples/swebench/tracing_model.py +++ b/examples/swebench/tracing_model.py @@ -5,9 +5,92 @@ import sys import os -sys.path.insert(0, "/Users/shrey/Documents/cookbook-internal/recipes/eval/swe_bench") - -from run_swe_agent_fw import FireworksCompatibleModel +from minisweagent.models.litellm_model import LitellmModel + + +class FireworksCompatibleModel(LitellmModel): + """ + Fireworks-compatible wrapper for LitellmModel. + """ + + def __init__(self, **kwargs): + model_id = os.environ.get("FIREWORKS_MODEL_ID") + if model_id: + kwargs["model_name"] = model_id + + if "model_kwargs" not in kwargs: + kwargs["model_kwargs"] = {} + + # CRITICAL: Set drop_params to False so stop sequences aren't stripped! 
+ kwargs["model_kwargs"]["drop_params"] = False + + # Get existing stop sequences + existing_stop = kwargs["model_kwargs"].get("stop", []) + if isinstance(existing_stop, str): + existing_stop = [existing_stop] + elif existing_stop is None: + existing_stop = [] + + # Add stop sequences (only the non-natural ones) + # stop_sequences = existing_stop + [ + # # ASCII versions + # "<|User|>", + # "<|Assistant|>", + # # Full-width PIPE versions (U+FF5C) + # "<|User|>", # \uff5c + # "<|Assistant|>", + # "```<|", + # "<|User", + # "<|Ass", + # # Full-width LETTER L versions (U+FF4C) + # "<lUser|>", # \uff4c + # "<lAssistant|>", + # "```<l", + # "<lUser", + # "<lAss", + # ] + # kwargs["model_kwargs"]["stop"] = stop_sequences + kwargs["model_kwargs"]["max_tokens"] = 1024 # Reduce to 1024 to save tokens + + if "temperature" not in kwargs["model_kwargs"]: + kwargs["model_kwargs"]["temperature"] = 0.0 + + # Apply per-run overrides injected by the wrapper (no environment variables) + overrides = globals().get("WRAPPER_MODEL_OVERRIDES") + if isinstance(overrides, dict): + if overrides.get("reasoning") in ("low", "medium", "high"): + kwargs["model_kwargs"]["reasoning_effort"] = overrides["reasoning"] + if overrides.get("temperature") is not None: + try: + kwargs["model_kwargs"]["temperature"] = float(overrides["temperature"]) + except Exception: + pass + if overrides.get("max_tokens") is not None: + try: + kwargs["model_kwargs"]["max_tokens"] = int(overrides["max_tokens"]) + except Exception: + pass + + super().__init__(**kwargs) + + def _query(self, messages: list[dict[str, str]], **kwargs): + """Remove non-standard fields before sending to Fireworks API.""" + # Keep only standard OpenAI-compatible fields + clean_messages = [] + for msg in messages: + clean_msg = {"role": msg["role"], "content": msg["content"]} + if "tool_calls" in msg: + clean_msg["tool_calls"] = msg["tool_calls"] + if "name" in msg: + clean_msg["name"] = msg["name"] + clean_messages.append(clean_msg) + + # IMPORTANT: Ensure drop_params stays False in the actual query + kwargs_with_stop = kwargs.copy() + if "drop_params" not in kwargs_with_stop: + kwargs_with_stop["drop_params"] = False + + return super()._query(clean_messages, **kwargs_with_stop) class TracingFireworksModel(FireworksCompatibleModel): diff --git a/pyproject.toml b/pyproject.toml index fd7e6961..f3981f25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,6 +97,10 @@ box2d = [ "gymnasium[box2d]>=0.29.0", "Pillow", ] +swebench = [ + "mini-swe-agent<=1.14.0", + "datasets>=4.2.0", +] langfuse = [ "langfuse>=2.0.0", ] From b16bd50a1e90a83c39f857f3504c4f459ca08bf8 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Thu, 16 Oct 2025 17:42:00 -0700 Subject: [PATCH 05/10] pyproject removal due to dependancy issue --- pyproject.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f3981f25..fd7e6961 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,10 +97,6 @@ box2d = [ "gymnasium[box2d]>=0.29.0", "Pillow", ] -swebench = [ - "mini-swe-agent<=1.14.0", - "datasets>=4.2.0", -] langfuse = [ "langfuse>=2.0.0", ] From 14c6f46a7a0af8c4ba6d9aa12280b42ffa8e8e53 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Fri, 17 Oct 2025 11:58:27 -0700 Subject: [PATCH 06/10] changepyproject.toml --- pyproject.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index fd7e6961..b50ac5b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,6 +97,11 @@ box2d = [ "gymnasium[box2d]>=0.29.0", "Pillow", ] +swebench = [ + 
"mini-swe-agent>=1.14.0", + "datasets>=2.0.0", + "litellm>=1.75.0", # Note: Overrides core litellm<1.75.0 for swebench compatibility +] langfuse = [ "langfuse>=2.0.0", ] From 47ef37b885dbb86c7e06f9c573ce9d74dab5f2f8 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Fri, 17 Oct 2025 15:53:46 -0700 Subject: [PATCH 07/10] added sandboxing of runs and remote server support --- examples/swebench/server.py | 64 ++++++++++++-- examples/swebench/tests/test_swebench.py | 102 ++++++++--------------- 2 files changed, 93 insertions(+), 73 deletions(-) diff --git a/examples/swebench/server.py b/examples/swebench/server.py index 01063645..0928c5f9 100644 --- a/examples/swebench/server.py +++ b/examples/swebench/server.py @@ -56,11 +56,13 @@ def _worker(): script_dir = Path(__file__).parent env["PYTHONPATH"] = f"{script_dir}:{env.get('PYTHONPATH', '')}" - # Determine output directory (from env or default) - out_dir = os.getcwd() - + # Sandbox by invocation_id to isolate concurrent test runs from pathlib import Path + invocation_id = req.metadata.invocation_id + base_dir = Path(os.getcwd()) / invocation_id + base_dir.mkdir(parents=True, exist_ok=True) + script_path = str((Path(__file__).parent / "run_swe_agent_fw.py").resolve()) # Extract model_kwargs from req.metadata (forwarded from input_metadata) @@ -89,7 +91,7 @@ def _worker(): str(single_index), "--exit-immediately", "--output", - str(out_dir), + str(base_dir), "--model-class", "tracing_model.TracingFireworksModel", ] @@ -103,7 +105,7 @@ def _worker(): import json # Log path inside row directory for this run - row_dir = Path(out_dir) / f"row_{single_index}" + row_dir = base_dir / f"row_{single_index}" row_dir.mkdir(parents=True, exist_ok=True) log_path = row_dir / f"agent_{single_index}.log" @@ -150,12 +152,60 @@ def _worker(): logger.info(line.rstrip("\n")) eval_rc = eval_proc.wait() + # Collect evaluation results to send via Elasticsearch + import yaml + + instance_id = None + resolved = None + + if preds_path.exists(): + try: + preds = json.loads(preds_path.read_text()) + instance_id = next(iter(preds.keys()), None) + except Exception: + pass + + if instance_id: + model_id = req.completion_params.get("model") if req.completion_params else None + if model_id: + safe_model = model_id.replace("/", "__").replace(":", "-") + report_path = ( + row_dir / "logs" / "run_evaluation" / "eval-run" / safe_model / instance_id / "report.json" + ) + + if report_path.exists(): + try: + report_data = json.loads(report_path.read_text()) + resolved = bool(report_data.get(instance_id, {}).get("resolved", False)) + except Exception: + pass + + if resolved is None: + exit_files = sorted(row_dir.glob("exit_statuses_*.yaml")) + if exit_files: + try: + status_doc = yaml.safe_load(exit_files[-1].read_text()) or {} + by_status = status_doc.get("instances_by_exit_status", {}) + for status_name, ids in by_status.items(): + if instance_id in (ids or []): + resolved = False + break + except Exception: + pass + + results_data = { + "instance_id": instance_id, + "resolved": resolved, + "row_id": str(single_index), + } + except Exception as e: # Best-effort: mark error but still finish to unblock polling + results_data = {"error": str(e), "row_id": str(single_index)} logger.error(f"Rollout error: {e}", extra={"status": Status.rollout_error(str(e))}) finally: - # Always mark finished so RemoteRolloutProcessor stops polling - logger.info("Rollout completed", extra={"status": Status.rollout_finished()}) + # Log results and mark finished + logger.info("Evaluation results", 
extra={"results": results_data, "status": Status.rollout_finished()}) threading.Thread(target=_worker, daemon=True).start() return {"status": "accepted"} diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py index 6e2410a6..6dd09f63 100644 --- a/examples/swebench/tests/test_swebench.py +++ b/examples/swebench/tests/test_swebench.py @@ -1,12 +1,9 @@ from typing import List -import yaml from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader -from eval_protocol.models import EvaluationRow, Message, EvaluateResult, MetricResult +from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult from eval_protocol.pytest import evaluation_test -from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor +from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor, create_elasticsearch_config_from_env from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader -import json -from pathlib import Path def rows_from_indices(count: int) -> List[EvaluationRow]: @@ -39,82 +36,55 @@ def rows() -> List[EvaluationRow]: model_base_url="https://tracing.fireworks.ai", timeout_seconds=1800, output_data_loader=default_fireworks_output_data_loader, + disable_elastic_search_setup=True, + elastic_search_config=create_elasticsearch_config_from_env(), ), completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_concurrent_rollouts=3, ) async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: - """Evaluate SWE-bench instance by reading harness report or exit status.""" + """Evaluate SWE-bench instance by reading results from Elasticsearch.""" + import logging - # Get row_id - try: - row_id = str(row.input_metadata.row_id) - except Exception: - return row - - row_dir = Path.cwd() / f"row_{row_id}" - - # Find instance_id from preds.json - preds_path = row_dir / "preds.json" - instance_id = None - if preds_path.exists(): - try: - preds = json.loads(preds_path.read_text()) - instance_id = next(iter(preds.keys()), None) - except Exception: - pass + logger = logging.getLogger(__name__) - if not instance_id: + rollout_id = row.execution_metadata.rollout_id + if not rollout_id: return row - resolved: bool | None = None - reason_text: str | None = None + # Query Elasticsearch for results logged by server + try: + from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient - # Get model from completion_params and convert to safe directory name (matching SWE-bench convention) - model_id = row.input_metadata.completion_params.get("model") if row.input_metadata.completion_params else None - if not model_id: - return row - safe_model = model_id.replace("/", "__").replace(":", "-") + es_config = create_elasticsearch_config_from_env() + es_client = ElasticsearchClient(es_config) - # Read from report.json (harness ran tests) - report_path = row_dir / "logs" / "run_evaluation" / "eval-run" / safe_model / instance_id / "report.json" - if report_path.exists(): - try: - report_data = json.loads(report_path.read_text()) - resolved = bool(report_data.get(instance_id, {}).get("resolved", False)) - reason_text = f"harness_resolved={resolved}" - except Exception: - pass + # Search for results log from this rollout + query = {"bool": {"must": [{"term": {"rollout_id.keyword": rollout_id}}, {"exists": {"field": "results"}}]}} - # If no report, check exit status YAML - if resolved is None: - exit_status_files = 
sorted(row_dir.glob("exit_statuses_*.yaml")) - if exit_status_files: - try: - status_doc = yaml.safe_load(exit_status_files[-1].read_text()) or {} - by_status = status_doc.get("instances_by_exit_status", {}) - for status_name, ids in by_status.items(): - if instance_id in (ids or []): - resolved = False - reason_text = f"exit_status={status_name}" - break - except Exception: - pass + search_results = es_client.es.search(index=es_config.index_name, query=query, size=1) - # Attach result - if resolved is not None: - row.evaluation_result = EvaluateResult( - score=1.0 if resolved else 0.0, - reason=reason_text or f"resolved={resolved}", - is_score_valid=True, - metrics={ - "resolved": MetricResult( + if search_results["hits"]["total"]["value"] > 0: + hit = search_results["hits"]["hits"][0]["_source"] + results_data = hit.get("results", {}) + resolved = results_data.get("resolved") + instance_id = results_data.get("instance_id") + + if resolved is not None: + row.evaluation_result = EvaluateResult( score=1.0 if resolved else 0.0, + reason=f"instance={instance_id}, resolved={resolved}", is_score_valid=True, - reason=reason_text or f"resolved={resolved}", - value=int(resolved), + metrics={ + "resolved": MetricResult( + score=1.0 if resolved else 0.0, + is_score_valid=True, + reason=f"resolved={resolved}", + value=int(resolved), + ) + }, ) - }, - ) + except Exception as e: + logger.warning(f"Could not read results from Elasticsearch: {e}") return row From e447ad679538d22e92fafd5e3365a97be97cc05f Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Sun, 19 Oct 2025 23:32:45 -0700 Subject: [PATCH 08/10] remote server changes --- examples/swebench/server.py | 51 +++++++-- examples/swebench/tests/test_swebench.py | 138 +++++++++++++++++------ pyproject.toml | 5 - 3 files changed, 145 insertions(+), 49 deletions(-) diff --git a/examples/swebench/server.py b/examples/swebench/server.py index 0928c5f9..3118a1cf 100644 --- a/examples/swebench/server.py +++ b/examples/swebench/server.py @@ -14,7 +14,7 @@ # Attach Elasticsearch handler to root logger (Eval Protocol UI) handler = ElasticsearchDirectHttpHandler() logging.getLogger().addHandler(handler) -rollout_states = {} +# rollout_states = {} @app.post("/init") @@ -27,11 +27,11 @@ def init(req: InitRequest): logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}") logger.addFilter(RolloutIdFilter(req.metadata.rollout_id)) - rollout_states[req.metadata.rollout_id] = { - "terminated": False, - "status": "running", - "instance_id": req.metadata.row_id, - } + # rollout_states[req.metadata.rollout_id] = { + # "terminated": False, + # "status": "running", + # "instance_id": req.metadata.row_id, + # } def _worker(): try: @@ -157,6 +157,7 @@ def _worker(): instance_id = None resolved = None + exit_reason = None if preds_path.exists(): try: @@ -166,7 +167,7 @@ def _worker(): pass if instance_id: - model_id = req.completion_params.get("model") if req.completion_params else None + model_id = req.model if model_id: safe_model = model_id.replace("/", "__").replace(":", "-") report_path = ( @@ -189,6 +190,7 @@ def _worker(): for status_name, ids in by_status.items(): if instance_id in (ids or []): resolved = False + exit_reason = status_name break except Exception: pass @@ -196,6 +198,7 @@ def _worker(): results_data = { "instance_id": instance_id, "resolved": resolved, + "exit_reason": exit_reason, "row_id": str(single_index), } @@ -204,16 +207,40 @@ def _worker(): results_data = {"error": str(e), "row_id": str(single_index)} logger.error(f"Rollout error: {e}", 
extra={"status": Status.rollout_error(str(e))}) finally: - # Log results and mark finished - logger.info("Evaluation results", extra={"results": results_data, "status": Status.rollout_finished()}) + # Create and log EvaluateResult in standardized format + from eval_protocol.models import EvaluateResult, MetricResult + + if resolved is not None: + reason = f"instance={instance_id}, resolved={resolved}" + if exit_reason: + reason += f", exit_reason={exit_reason}" + + eval_result = EvaluateResult( + score=1.0 if resolved else 0.0, + reason=reason, + is_score_valid=True, + metrics={ + "resolved": MetricResult( + score=1.0 if resolved else 0.0, + is_score_valid=True, + reason=f"resolved={resolved}", + value=int(resolved), + ) + }, + ) + logger.info( + f"EVAL_RESULT:{eval_result.model_dump_json()}", extra={"status": Status.rollout_finished()} + ) + else: + logger.info("EVAL_RESULT:null", extra={"status": Status.rollout_finished()}) threading.Thread(target=_worker, daemon=True).start() return {"status": "accepted"} -@app.get("/status") -def status(rollout_id: str): - return rollout_states.get(rollout_id, {"terminated": False}) +# @app.get("/status") +# def status(rollout_id: str): +# return rollout_states.get(rollout_id, {"terminated": False}) def main(): diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py index 6dd09f63..aa3f3300 100644 --- a/examples/swebench/tests/test_swebench.py +++ b/examples/swebench/tests/test_swebench.py @@ -32,7 +32,7 @@ def rows() -> List[EvaluationRow]: generators=[rows], ), rollout_processor=RemoteRolloutProcessor( - remote_base_url="http://127.0.0.1:3000", + remote_base_url="http://35.209.134.123:3000", model_base_url="https://tracing.fireworks.ai", timeout_seconds=1800, output_data_loader=default_fireworks_output_data_loader, @@ -42,49 +42,123 @@ def rows() -> List[EvaluationRow]: completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_concurrent_rollouts=3, ) -async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: - """Evaluate SWE-bench instance by reading results from Elasticsearch.""" - import logging +# async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: +# """Evaluate SWE-bench instance by reading results from Elasticsearch.""" +# import logging +# logger = logging.getLogger(__name__) + +# rollout_id = row.execution_metadata.rollout_id +# logger.info(f"[DEBUG] Processing rollout_id: {rollout_id}") + +# if not rollout_id: +# logger.warning("[DEBUG] No rollout_id, returning early") +# return row + +# try: +# from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient + +# es_config = create_elasticsearch_config_from_env() +# es_client = ElasticsearchClient(es_config) +# logger.info(f"[DEBUG] ES client created for index: {es_config.index_name}") + +# # Search for EVAL_RESULT log by message prefix +# query = {"match": {"rollout_id": rollout_id}} +# search_results = es_client.search(query=query, size=50) # Get more to find EVAL_RESULT +# logger.info(f"[DEBUG] Total logs: {search_results['hits']['total']['value']}") + +# # Filter for EVAL_RESULT in Python +# if search_results and search_results["hits"]["total"]["value"] > 0: +# for hit in search_results["hits"]["hits"]: +# message = hit["_source"].get("message", "") + +# if message.startswith("EVAL_RESULT:"): +# logger.info(f"[DEBUG] Found EVAL_RESULT message!") +# result_json = message.replace("EVAL_RESULT:", "") +# row.evaluation_result = EvaluateResult.model_validate_json(result_json) +# 
logger.info(f"[DEBUG] Attached evaluation_result: score={row.evaluation_result.score}") +# break +# else: +# logger.warning("[DEBUG] EVAL_RESULT message not found in logs") +# else: +# logger.warning("[DEBUG] No logs found for rollout") + +# logger.info(f"[DEBUG] Searching ES for EVAL_RESULT") +# import asyncio +# search_results = None +# for attempt in range(5): +# search_results = es_client.search(query=query, size=1) +# if search_results and search_results["hits"]["total"]["value"] > 0: +# logger.info(f"[DEBUG] Found result on attempt {attempt + 1}") +# break +# logger.info(f"[DEBUG] Attempt {attempt + 1}: No hits, retrying in 1s...") +# await asyncio.sleep(1) + +# logger.info(f"[DEBUG] Final: ES returned {search_results['hits']['total']['value'] if search_results else 0} hits") +# debug_query = {"match": {"rollout_id": rollout_id}} +# debug_results = es_client.search(query=debug_query, size=26) +# logger.info(f"[DEBUG] Total logs for {rollout_id}: {debug_results['hits']['total']['value']}") - logger = logging.getLogger(__name__) +# if debug_results["hits"]["total"]["value"] > 0: +# for hit in debug_results["hits"]["hits"]: +# msg = hit["_source"].get("message", "")[:80] +# logger.info(f"[DEBUG] Sample message: {msg}") +# else: +# logger.warning("[DEBUG] No logs at all for this rollout_id!") +# if search_results and search_results["hits"]["total"]["value"] > 0: +# hit = search_results["hits"]["hits"][0]["_source"] +# message = hit.get("message", "") +# logger.info(f"[DEBUG] Found message: {message[:100]}...") +# if message.startswith("EVAL_RESULT:"): +# result_json = message.replace("EVAL_RESULT:", "") +# logger.info(f"[DEBUG] Parsing EvaluateResult JSON") + +# if result_json != "null": +# # Deserialize directly to EvaluateResult +# row.evaluation_result = EvaluateResult.model_validate_json(result_json) +# logger.info(f"[DEBUG] Attached evaluation_result: score={row.evaluation_result.score}, reason={row.evaluation_result.reason}") +# else: +# logger.warning("[DEBUG] Result was null (no resolved status available)") +# else: +# logger.warning(f"[DEBUG] Message doesn't start with EVAL_RESULT: {message[:50]}") +# else: +# logger.warning("[DEBUG] No EVAL_RESULT found in Elasticsearch") + +# except Exception as e: +# logger.error(f"[DEBUG] Exception in test: {e}", exc_info=True) + +# logger.info(f"[DEBUG] Returning row, has evaluation_result: {row.evaluation_result is not None}") +# return row + + +async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: + """Evaluate SWE-bench instance by reading results from Elasticsearch.""" rollout_id = row.execution_metadata.rollout_id if not rollout_id: return row - # Query Elasticsearch for results logged by server try: from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient es_config = create_elasticsearch_config_from_env() es_client = ElasticsearchClient(es_config) - # Search for results log from this rollout - query = {"bool": {"must": [{"term": {"rollout_id.keyword": rollout_id}}, {"exists": {"field": "results"}}]}} - - search_results = es_client.es.search(index=es_config.index_name, query=query, size=1) - - if search_results["hits"]["total"]["value"] > 0: - hit = search_results["hits"]["hits"][0]["_source"] - results_data = hit.get("results", {}) - resolved = results_data.get("resolved") - instance_id = results_data.get("instance_id") - - if resolved is not None: - row.evaluation_result = EvaluateResult( - score=1.0 if resolved else 0.0, - reason=f"instance={instance_id}, resolved={resolved}", - 
is_score_valid=True, - metrics={ - "resolved": MetricResult( - score=1.0 if resolved else 0.0, - is_score_valid=True, - reason=f"resolved={resolved}", - value=int(resolved), - ) - }, - ) + # Get all logs for this rollout and find EVAL_RESULT message + query = {"match": {"rollout_id": rollout_id}} + search_results = es_client.search(query=query, size=50) + + if search_results and search_results["hits"]["total"]["value"] > 0: + for hit in search_results["hits"]["hits"]: + message = hit["_source"].get("message", "") + + if message.startswith("EVAL_RESULT:"): + result_json = message.replace("EVAL_RESULT:", "") + row.evaluation_result = EvaluateResult.model_validate_json(result_json) + break + except Exception as e: - logger.warning(f"Could not read results from Elasticsearch: {e}") + import logging + + logging.getLogger(__name__).warning(f"Could not read results from Elasticsearch: {e}") return row diff --git a/pyproject.toml b/pyproject.toml index b50ac5b8..fd7e6961 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,11 +97,6 @@ box2d = [ "gymnasium[box2d]>=0.29.0", "Pillow", ] -swebench = [ - "mini-swe-agent>=1.14.0", - "datasets>=2.0.0", - "litellm>=1.75.0", # Note: Overrides core litellm<1.75.0 for swebench compatibility -] langfuse = [ "langfuse>=2.0.0", ] From e08ca9aee3274259336feb21789a8523ea15ab6a Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Mon, 20 Oct 2025 11:07:48 -0700 Subject: [PATCH 09/10] addressed comments --- eval_protocol/utils/evaluation_row_utils.py | 25 +++++ examples/swebench/tests/test_swebench.py | 113 +------------------- examples/swebench/tracing_model.py | 35 +++++- 3 files changed, 64 insertions(+), 109 deletions(-) diff --git a/eval_protocol/utils/evaluation_row_utils.py b/eval_protocol/utils/evaluation_row_utils.py index d89f0c55..bb1e94c7 100644 --- a/eval_protocol/utils/evaluation_row_utils.py +++ b/eval_protocol/utils/evaluation_row_utils.py @@ -9,6 +9,7 @@ from typing import List from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import InputMetadata def serialize_message(msg: Message) -> str: @@ -134,3 +135,27 @@ def assistant_to_ground_truth(data: List[EvaluationRow]) -> List[EvaluationRow]: ) return processed_rows + + +def create_rows_from_indices(count: int, **metadata) -> List[EvaluationRow]: + """Create evaluation rows with sequential row_ids. + + Useful for remote processors where the server determines content based on row_id. + + Args: + count: Number of rows to create + **metadata: Additional metadata to include in each row + + Returns: + List of EvaluationRows with row_id set to "0", "1", "2", ... 
+ """ + rows = [] + for idx in range(count): + row_metadata = {"row_id": str(idx), **metadata} + rows.append( + EvaluationRow( + messages=[], + input_metadata=InputMetadata(**row_metadata), + ) + ) + return rows diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py index aa3f3300..e3ee0955 100644 --- a/examples/swebench/tests/test_swebench.py +++ b/examples/swebench/tests/test_swebench.py @@ -3,27 +3,13 @@ from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor, create_elasticsearch_config_from_env -from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader - -def rows_from_indices(count: int) -> List[EvaluationRow]: - out: List[EvaluationRow] = [] - for idx in range(count): - out.append( - EvaluationRow( - messages=[], - input_metadata={ - "row_id": str(idx), - "instance_index": str(idx), - }, - ) - ) - return out +# from eval_protocol.pytest.tracing_utils import default_fireworks_output_data_loader +from eval_protocol.utils.evaluation_row_utils import create_rows_from_indices def rows() -> List[EvaluationRow]: - # Generate 10 rows by index; server maps index -> dataset instance via --slice - return rows_from_indices(2) + return create_rows_from_indices(500) # All instances # -------------------- Harness result attachment (UI pass/fail) -------------------- @@ -31,106 +17,17 @@ def rows() -> List[EvaluationRow]: data_loaders=DynamicDataLoader( generators=[rows], ), + max_dataset_rows=2, rollout_processor=RemoteRolloutProcessor( - remote_base_url="http://35.209.134.123:3000", + remote_base_url="http://127.0.0.1:3000", model_base_url="https://tracing.fireworks.ai", timeout_seconds=1800, - output_data_loader=default_fireworks_output_data_loader, disable_elastic_search_setup=True, elastic_search_config=create_elasticsearch_config_from_env(), ), completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_concurrent_rollouts=3, ) -# async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: -# """Evaluate SWE-bench instance by reading results from Elasticsearch.""" -# import logging -# logger = logging.getLogger(__name__) - -# rollout_id = row.execution_metadata.rollout_id -# logger.info(f"[DEBUG] Processing rollout_id: {rollout_id}") - -# if not rollout_id: -# logger.warning("[DEBUG] No rollout_id, returning early") -# return row - -# try: -# from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient - -# es_config = create_elasticsearch_config_from_env() -# es_client = ElasticsearchClient(es_config) -# logger.info(f"[DEBUG] ES client created for index: {es_config.index_name}") - -# # Search for EVAL_RESULT log by message prefix -# query = {"match": {"rollout_id": rollout_id}} -# search_results = es_client.search(query=query, size=50) # Get more to find EVAL_RESULT -# logger.info(f"[DEBUG] Total logs: {search_results['hits']['total']['value']}") - -# # Filter for EVAL_RESULT in Python -# if search_results and search_results["hits"]["total"]["value"] > 0: -# for hit in search_results["hits"]["hits"]: -# message = hit["_source"].get("message", "") - -# if message.startswith("EVAL_RESULT:"): -# logger.info(f"[DEBUG] Found EVAL_RESULT message!") -# result_json = message.replace("EVAL_RESULT:", "") -# row.evaluation_result = EvaluateResult.model_validate_json(result_json) -# logger.info(f"[DEBUG] Attached evaluation_result: 
score={row.evaluation_result.score}") -# break -# else: -# logger.warning("[DEBUG] EVAL_RESULT message not found in logs") -# else: -# logger.warning("[DEBUG] No logs found for rollout") - -# logger.info(f"[DEBUG] Searching ES for EVAL_RESULT") -# import asyncio -# search_results = None -# for attempt in range(5): -# search_results = es_client.search(query=query, size=1) -# if search_results and search_results["hits"]["total"]["value"] > 0: -# logger.info(f"[DEBUG] Found result on attempt {attempt + 1}") -# break -# logger.info(f"[DEBUG] Attempt {attempt + 1}: No hits, retrying in 1s...") -# await asyncio.sleep(1) - -# logger.info(f"[DEBUG] Final: ES returned {search_results['hits']['total']['value'] if search_results else 0} hits") -# debug_query = {"match": {"rollout_id": rollout_id}} -# debug_results = es_client.search(query=debug_query, size=26) -# logger.info(f"[DEBUG] Total logs for {rollout_id}: {debug_results['hits']['total']['value']}") - -# if debug_results["hits"]["total"]["value"] > 0: -# for hit in debug_results["hits"]["hits"]: -# msg = hit["_source"].get("message", "")[:80] -# logger.info(f"[DEBUG] Sample message: {msg}") -# else: -# logger.warning("[DEBUG] No logs at all for this rollout_id!") -# if search_results and search_results["hits"]["total"]["value"] > 0: -# hit = search_results["hits"]["hits"][0]["_source"] -# message = hit.get("message", "") -# logger.info(f"[DEBUG] Found message: {message[:100]}...") - -# if message.startswith("EVAL_RESULT:"): -# result_json = message.replace("EVAL_RESULT:", "") -# logger.info(f"[DEBUG] Parsing EvaluateResult JSON") - -# if result_json != "null": -# # Deserialize directly to EvaluateResult -# row.evaluation_result = EvaluateResult.model_validate_json(result_json) -# logger.info(f"[DEBUG] Attached evaluation_result: score={row.evaluation_result.score}, reason={row.evaluation_result.reason}") -# else: -# logger.warning("[DEBUG] Result was null (no resolved status available)") -# else: -# logger.warning(f"[DEBUG] Message doesn't start with EVAL_RESULT: {message[:50]}") -# else: -# logger.warning("[DEBUG] No EVAL_RESULT found in Elasticsearch") - -# except Exception as e: -# logger.error(f"[DEBUG] Exception in test: {e}", exc_info=True) - -# logger.info(f"[DEBUG] Returning row, has evaluation_result: {row.evaluation_result is not None}") -# return row - - async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: """Evaluate SWE-bench instance by reading results from Elasticsearch.""" rollout_id = row.execution_metadata.rollout_id diff --git a/examples/swebench/tracing_model.py b/examples/swebench/tracing_model.py index 6f482efd..11375fc0 100644 --- a/examples/swebench/tracing_model.py +++ b/examples/swebench/tracing_model.py @@ -1,5 +1,38 @@ """ -TracingFireworksModel - Routes through tracing using OpenAI SDK. +Custom model classes for integrating mini-swe-agent with eval-protocol's tracing infrastructure. + +## Why This File Exists + +mini-swe-agent is an autonomous agent that makes 20-100+ LLM API calls per SWE-bench instance +(e.g., reading files, editing code, running tests). To debug agent behavior and display results +in eval-protocol's UI, we need to capture and analyze every LLM call. + +This file bridges mini-swe-agent (which uses LitellmModel) with the Fireworks tracing proxy +(which requires specific URL patterns and SDK usage). 
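+
+In this example the wiring is done by server.py, which launches run_swe_agent_fw.py
+with --model-class tracing_model.TracingFireworksModel; the concrete model id is
+read from the FIREWORKS_MODEL_ID environment variable inside FireworksCompatibleModel.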
+ +## Problem Without This File + +By default, mini-swe-agent would: +- Call Fireworks API directly (no tracing) +- Agent conversations invisible in eval-protocol UI +- Can't debug why agent failed +- No cost tracking per call +- Model names get mangled by litellm routing + +## What These Classes Do + +### FireworksCompatibleModel (Base) +- Extends mini-swe-agent's LitellmModel +- Handles Fireworks API compatibility: + * Strips non-standard message fields that Fireworks API rejects + * Adds stop sequences to prevent common agent failure modes + * Applies temperature/reasoning overrides from wrapper script +- Used when tracing isn't needed (direct Fireworks API calls) + +### TracingFireworksModel (For eval-protocol integration) +- Extends FireworksCompatibleModel +- Routes ALL LLM calls through Fireworks tracing proxy instead of direct API +- Uses OpenAI SDK (not litellm) to preserve full model names """ import sys From 867d94757599b913f7eb04100c78dbac3a34e966 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Tue, 21 Oct 2025 10:12:53 -0700 Subject: [PATCH 10/10] porting to fireworks tracing --- examples/swebench/server.py | 17 ++----- examples/swebench/tests/test_swebench.py | 58 ++++++++++++++++-------- 2 files changed, 43 insertions(+), 32 deletions(-) diff --git a/examples/swebench/server.py b/examples/swebench/server.py index 3118a1cf..1c1fbb03 100644 --- a/examples/swebench/server.py +++ b/examples/swebench/server.py @@ -7,32 +7,24 @@ from fastapi import FastAPI import uvicorn -from eval_protocol import Status, InitRequest, ElasticsearchDirectHttpHandler, RolloutIdFilter +from eval_protocol import Status, InitRequest, RolloutIdFilter +from eval_protocol.log_utils.init import init_external_logging_from_env app = FastAPI() # Attach Elasticsearch handler to root logger (Eval Protocol UI) -handler = ElasticsearchDirectHttpHandler() -logging.getLogger().addHandler(handler) +init_external_logging_from_env() # rollout_states = {} @app.post("/init") def init(req: InitRequest): # Allow Eval Protocol to dynamically configure ES endpoint - if req.elastic_search_config: - handler.configure(req.elastic_search_config) # Tag all logs for this rollout_id logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}") logger.addFilter(RolloutIdFilter(req.metadata.rollout_id)) - # rollout_states[req.metadata.rollout_id] = { - # "terminated": False, - # "status": "running", - # "instance_id": req.metadata.row_id, - # } - def _worker(): try: # Validate model @@ -130,6 +122,7 @@ def _worker(): # 2) Run SWE-bench evaluation harness on preds.json preds_path_str = str(preds_path) + unique_run_id = f"eval-{invocation_id}" eval_cmd = [ "python3", "-m", @@ -141,7 +134,7 @@ def _worker(): "--max_workers", str(os.getenv("SWEBENCH_EVAL_WORKERS", "5")), "--run_id", - "eval-run", + unique_run_id, ] logger.info("Starting SWE-bench harness: %s", " ".join(map(str, eval_cmd))) eval_proc = subprocess.Popen( diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py index e3ee0955..87158ed1 100644 --- a/examples/swebench/tests/test_swebench.py +++ b/examples/swebench/tests/test_swebench.py @@ -2,9 +2,7 @@ from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult from eval_protocol.pytest import evaluation_test -from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor, create_elasticsearch_config_from_env - -# from eval_protocol.pytest.tracing_utils import 
default_fireworks_output_data_loader +from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor from eval_protocol.utils.evaluation_row_utils import create_rows_from_indices @@ -23,39 +21,59 @@ def rows() -> List[EvaluationRow]: model_base_url="https://tracing.fireworks.ai", timeout_seconds=1800, disable_elastic_search_setup=True, - elastic_search_config=create_elasticsearch_config_from_env(), ), completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], max_concurrent_rollouts=3, ) async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: - """Evaluate SWE-bench instance by reading results from Elasticsearch.""" + """Evaluate SWE-bench instance by reading results from Fireworks tracing logs.""" + import logging + + logger = logging.getLogger(__name__) + rollout_id = row.execution_metadata.rollout_id + logger.info(f"[DEBUG] Processing rollout_id: {rollout_id}") + if not rollout_id: + logger.warning("[DEBUG] No rollout_id") return row try: - from eval_protocol.log_utils.elasticsearch_client import ElasticsearchClient + from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter - es_config = create_elasticsearch_config_from_env() - es_client = ElasticsearchClient(es_config) + adapter = FireworksTracingAdapter(base_url="https://tracing.fireworks.ai") + logger.info("[DEBUG] Created adapter for https://tracing.fireworks.ai") - # Get all logs for this rollout and find EVAL_RESULT message - query = {"match": {"rollout_id": rollout_id}} - search_results = es_client.search(query=query, size=50) + # Fetch logs for this rollout + logger.info(f"[DEBUG] Searching for tag: rollout_id:{rollout_id}") + log_entries = adapter.search_logs(tags=[f"rollout_id:{rollout_id}"], limit=100, hours_back=24) - if search_results and search_results["hits"]["total"]["value"] > 0: - for hit in search_results["hits"]["hits"]: - message = hit["_source"].get("message", "") + logger.info(f"[DEBUG] Received {len(log_entries)} log entries") + if log_entries: + logger.info(f"[DEBUG] Sample messages: {[e.get('message', '')[:50] for e in log_entries[:3]]}") - if message.startswith("EVAL_RESULT:"): - result_json = message.replace("EVAL_RESULT:", "") + # Find EVAL_RESULT message + found = False + for entry in log_entries: + message = entry.get("message", "") + if message.startswith("EVAL_RESULT:"): + logger.info("[DEBUG] Found EVAL_RESULT message!") + result_json = message.replace("EVAL_RESULT:", "") + logger.info(f"[DEBUG] Parsing JSON: {result_json[:100]}...") + + if result_json != "null": row.evaluation_result = EvaluateResult.model_validate_json(result_json) - break + logger.info( + f"[DEBUG] Attached result: score={row.evaluation_result.score}, reason={row.evaluation_result.reason}" + ) + found = True + break - except Exception as e: - import logging + if not found: + logger.warning(f"[DEBUG] No EVAL_RESULT message found in {len(log_entries)} logs") - logging.getLogger(__name__).warning(f"Could not read results from Elasticsearch: {e}") + except Exception as e: + logger.error(f"[DEBUG] Exception: {e}", exc_info=True) + logger.info(f"[DEBUG] Returning row, has evaluation_result: {row.evaluation_result is not None}") return row
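
The final revision moves the harness verdict out of Elasticsearch documents and into a single tracing log line prefixed with `EVAL_RESULT:`, which the test then parses back into an `EvaluateResult`. A minimal sketch of that round-trip follows; it assumes only the `eval_protocol` models already used in the diffs above, and the log transport is simulated with a plain list rather than the Fireworks tracing backend.

```python
# Sketch of the EVAL_RESULT protocol shared by server.py and test_swebench.py:
# the server serializes an EvaluateResult into one log message, the test finds
# that message among the retrieved log entries and deserializes it.
from eval_protocol.models import EvaluateResult, MetricResult


def encode_eval_result(resolved: bool, instance_id: str) -> str:
    """Server side: build the log message carrying the harness verdict."""
    result = EvaluateResult(
        score=1.0 if resolved else 0.0,
        reason=f"instance={instance_id}, resolved={resolved}",
        is_score_valid=True,
        metrics={
            "resolved": MetricResult(
                score=1.0 if resolved else 0.0,
                is_score_valid=True,
                reason=f"resolved={resolved}",
                value=int(resolved),
            )
        },
    )
    return f"EVAL_RESULT:{result.model_dump_json()}"


def decode_eval_result(log_entries: list[dict]) -> EvaluateResult | None:
    """Test side: locate the EVAL_RESULT message and parse it back."""
    for entry in log_entries:
        message = entry.get("message", "")
        if message.startswith("EVAL_RESULT:"):
            payload = message[len("EVAL_RESULT:"):]
            if payload != "null":  # server logs "EVAL_RESULT:null" when no verdict
                return EvaluateResult.model_validate_json(payload)
    return None


# Simulate one log entry flowing from server to test.
logs = [{"message": encode_eval_result(True, "astropy__astropy-12907")}]
parsed = decode_eval_result(logs)
assert parsed is not None and parsed.score == 1.0
```

Keeping the verdict in one self-describing log message means the test no longer needs filesystem access to the server's per-row output directories; it only needs log search by the rollout_id tag.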