diff --git a/eval_protocol/utils/evaluation_row_utils.py b/eval_protocol/utils/evaluation_row_utils.py index d89f0c55..bb1e94c7 100644 --- a/eval_protocol/utils/evaluation_row_utils.py +++ b/eval_protocol/utils/evaluation_row_utils.py @@ -9,6 +9,7 @@ from typing import List from eval_protocol.models import EvaluationRow, Message +from eval_protocol.models import InputMetadata def serialize_message(msg: Message) -> str: @@ -134,3 +135,27 @@ def assistant_to_ground_truth(data: List[EvaluationRow]) -> List[EvaluationRow]: ) return processed_rows + + +def create_rows_from_indices(count: int, **metadata) -> List[EvaluationRow]: + """Create evaluation rows with sequential row_ids. + + Useful for remote processors where the server determines content based on row_id. + + Args: + count: Number of rows to create + **metadata: Additional metadata to include in each row + + Returns: + List of EvaluationRows with row_id set to "0", "1", "2", ... + """ + rows = [] + for idx in range(count): + row_metadata = {"row_id": str(idx), **metadata} + rows.append( + EvaluationRow( + messages=[], + input_metadata=InputMetadata(**row_metadata), + ) + ) + return rows diff --git a/examples/swebench/README.md b/examples/swebench/README.md new file mode 100644 index 00000000..a696ed6f --- /dev/null +++ b/examples/swebench/README.md @@ -0,0 +1,300 @@ +# SWE-bench Evaluation Example + +This example shows how to evaluate LLM models on the SWE-bench software engineering benchmark using eval-protocol. + +## Quick Start + +### 1. Install Dependencies + +```bash +# From the python-sdk repository root +cd python-sdk + +# Install eval-protocol with swebench support +pip install -e ".[swebench]" +``` + +### 2. Set up mini-swe-agent + +mini-swe-agent requires a Fireworks API key to function: + +```bash +# Configure API key for mini-swe-agent +mini-extra config set FIREWORKS_API_KEY your_fireworks_api_key + +# Verify it's set +mini-extra config get FIREWORKS_API_KEY +``` + +### 3. Install SWE-bench Harness + +```bash +# Navigate to the swebench example directory +cd examples/swebench + +# Clone and install SWE-bench +git clone https://github.com/princeton-nlp/SWE-bench +pip install -e SWE-bench +``` + +### 4. Set Environment Variables + +```bash +export FIREWORKS_API_KEY="your_fireworks_api_key" +``` + +## Running the Evaluation + +**IMPORTANT:** Always run both the server and tests from the `examples/swebench/` directory. + +### Step 1: Start the Server + +Open a terminal and run: + +```bash +cd examples/swebench +python server.py +``` + +You should see: +``` +INFO: Uvicorn running on http://127.0.0.1:3000 (Press CTRL+C to quit) +``` + +### Step 2: Configure Your Test + +Edit `tests/test_swebench.py` to set your model and parameters: + +```python +completion_params=[{ + "model": "accounts/fireworks/models/your-model-name", # Edit this + "model_kwargs": { + "temperature": 0.2, # Optional + # "max_tokens": 2048, # Optional + # "reasoning": "high", # Optional + } +}], +max_concurrent_rollouts=3, # How many instances to run in parallel +``` + +To test different numbers of instances, edit line 26: +```python +def rows() -> List[EvaluationRow]: + return rows_from_indices(2) # Change 2 to desired number (max 500) +``` + +### Step 3: Run the Test + +Open a second terminal: + +```bash +cd examples/swebench +pytest tests/test_swebench.py -v -s +``` + +## What Happens During a Run + +For each instance (row): + +1. **Server receives request** from pytest +2. **Wrapper script** (`run_swe_agent_fw.py`) is called with the instance index +3. 
**mini-swe-agent** runs in a Docker container for that specific repository +4. **Agent attempts to solve** the issue by editing code +5. **Patch is generated** and saved to `preds.json` +6. **SWE-bench harness** applies the patch and runs tests +7. **Results** are written to the row directory +8. **Test fetches results** and displays pass/fail in the UI + +## Understanding the Output + +### Directory Structure + +Each instance creates its own `row_N/` directory: + +``` +examples/swebench/ +├── row_0/ # First instance +│ ├── preds.json # ← Model's generated patch +│ ├── astropy__astropy-12907/ # Instance-specific folder +│ │ └── astropy__astropy-12907.traj.json # Agent's execution trace +│ ├── logs/ # Harness execution logs +│ │ └── run_evaluation/ +│ │ └── eval-run/ +│ │ └── / +│ │ └── astropy__astropy-12907/ +│ │ ├── report.json # ← Test results (pass/fail) +│ │ ├── test_output.txt # Test execution output +│ │ ├── patch.diff # Applied patch +│ │ └── eval.sh # Evaluation script +│ ├── agent_0.log # Agent console output +│ ├── exit_statuses_*.yaml # Exit status if failed +│ └── .eval-run.json # Overall run summary +├── row_1/ # Second instance +│ └── ... +└── ... +``` + +### Key Files Explained + +#### `preds.json` - Model Predictions +Location: `row_N/preds.json` + +Contains the patch generated by the model: +```json +{ + "astropy__astropy-12907": { + "model_name_or_path": "accounts/fireworks/models/...", + "instance_id": "astropy__astropy-12907", + "model_patch": "diff --git a/... (the actual patch)" + } +} +``` + +**If missing:** Agent failed before generating a patch (check `exit_statuses_*.yaml`) + +#### `report.json` - Test Results +Location: `row_N/logs/run_evaluation/eval-run///report.json` + +Contains pass/fail status after running tests: +```json +{ + "astropy__astropy-12907": { + "patch_is_None": false, + "patch_exists": true, + "patch_successfully_applied": true, + "resolved": true, // ← Was the issue fixed? + "tests_status": { + "FAIL_TO_PASS": {"success": [...], "failure": []}, + "PASS_TO_PASS": {"success": [...], "failure": []} + } + } +} +``` + +- `resolved: true` = Instance solved! All required tests pass. +- `resolved: false` = Instance not solved (tests still failing) + +**If missing:** Agent didn't generate a patch or harness didn't run + +#### `exit_statuses_*.yaml` - Why Runs Failed +Location: `row_N/exit_statuses_*.yaml` + +```yaml +instances_by_exit_status: + Submitted: [] + LimitsExceeded: ["astropy__astropy-12907"] # Hit step/cost limits + Error: [] +``` + +Common statuses: +- `Submitted`: Completed normally +- `LimitsExceeded`: Agent hit max steps or cost limit +- `Error`: Unexpected error during execution + +#### `agent_N.log` - Agent Execution +Location: `row_N/agent_N.log` + +Full console output from the agent run, including: +- Docker container startup +- Model API calls +- Commands executed +- Errors (if any) + +#### `*.traj.json` - Agent Trajectory +Location: `row_N//.traj.json` + +Complete record of the agent's execution: +```json +{ + "instance_id": "astropy__astropy-12907", + "info": { + "submission": "...", // The patch + "exit_status": "Submitted", + "model_stats": { + "instance_cost": 0.05, + "api_calls": 15 + } + }, + "messages": [...] 
// All agent messages +} +``` + +## Viewing Results + +### In the Terminal + +The test output shows: +``` +INFO:test_swebench:[Row 0] Found instance_id: astropy__astropy-12907 +INFO:test_swebench:[Row 0] Report says resolved=True +INFO:test_swebench:[Row 0] Final: resolved=True, reason=harness_resolved=True +``` + +### In the Eval Protocol UI + +If Elasticsearch is running, visit: `http://localhost:8000` +- View aggregate scores +- Inspect individual trajectories +- Filter by resolved/unresolved +- See cost and token usage + +### Check Individual Files + +```bash +# Check if instance was solved +cat row_0/logs/run_evaluation/eval-run//astropy__astropy-12907/report.json | jq '.["astropy__astropy-12907"].resolved' + +# View the generated patch +cat row_0/preds.json | jq '.["astropy__astropy-12907"].model_patch' + +# Check exit status +cat row_0/exit_statuses_*.yaml +``` + +## Performance Notes + +- **Small test (2 instances):** ~10-30 minutes +- **Full dataset (500 instances):** 24-48 hours on a 16-core machine +- **Concurrent runs:** Recommended 3-5 based on CPU/memory +- **Docker space:** ~100GB for all images (downloads happen automatically) + +## Troubleshooting + +### Docker container fails to start +```bash +# Check Docker is running +docker ps + +# Check disk space +df -h +``` + +### Agent hits step limits +Instances that consistently hit limits may need: +- Higher step limit (edit mini-swe-agent config) +- Different prompting strategy +- More capable model + +### Server not responding +```bash +# Check server is running +curl http://127.0.0.1:3000/status?rollout_id=test + +# Check server logs for errors +# (shown in terminal where server.py is running) +``` + +## Next Steps + +- Review results in `row_*/logs/.../report.json` +- Analyze failed instances to improve your model +- Run on larger subsets to get statistical significance +- Export results for further analysis + +## Support + +For issues: +- Check agent logs: `row_N/agent_N.log` +- Check exit statuses: `row_N/exit_statuses_*.yaml` +- Verify Docker has sufficient resources +- Ensure API key is valid and has credits diff --git a/examples/swebench/SWE-bench b/examples/swebench/SWE-bench new file mode 160000 index 00000000..5cd4be9f --- /dev/null +++ b/examples/swebench/SWE-bench @@ -0,0 +1 @@ +Subproject commit 5cd4be9fb23971679cbbafe5a0ecade27cef99be diff --git a/examples/swebench/run_swe_agent_fw.py b/examples/swebench/run_swe_agent_fw.py new file mode 100755 index 00000000..4d145038 --- /dev/null +++ b/examples/swebench/run_swe_agent_fw.py @@ -0,0 +1,268 @@ +#!/usr/bin/env python3 +""" +Fireworks-compatible wrapper for mini-swe-agent SWE-bench evaluations. + +This script handles Fireworks API compatibility by stripping non-standard fields +that mini-swe-agent adds for internal tracking. 
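+For example, a message that carries extra bookkeeping keys is reduced to just
+role/content (plus name/tool_calls when present) before it is sent to the API.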
+ +Requires fully qualified Fireworks model paths: +- Serverless models: fireworks_ai/accounts/fireworks/models/{model_name} +- Deployed models: fireworks_ai/accounts/{account}/deployedModels/{model_name} + +Usage: + python run_swe_agent_fw.py [options] + + +Requirements: + - mini-swe-agent: pip install mini-swe-agent + - Fireworks API key: Set via 'mini-extra config set FIREWORKS_API_KEY ' +""" + +import argparse +import os +import sys +import subprocess +import tempfile +from pathlib import Path +from typing import Any + +# Import required dependencies +from minisweagent.models.litellm_model import LitellmModel, LitellmModelConfig +import litellm + + +def __get_api_key(): + """Get Fireworks API key from environment or mini-swe-agent config.""" + # Environment variable takes precedence + api_key = os.environ.get("FIREWORKS_API_KEY") + if api_key: + return api_key + + # Try to get API key from mini-swe-agent's config system + try: + from minisweagent.config import get_config + + config = get_config() + return config.get("FIREWORKS_API_KEY") + except (ImportError, AttributeError, KeyError): + # Fallback: check common config file locations + config_paths = [ + Path.home() / ".config" / "mini-swe-agent" / ".env", + Path.home() / "Library" / "Application Support" / "mini-swe-agent" / ".env", + ] + + for config_path in config_paths: + if config_path.exists(): + try: + with open(config_path) as f: + for line in f: + if line.startswith("FIREWORKS_API_KEY="): + value = line.split("=", 1)[1].strip() + return value.strip("'\"") + except (IOError, OSError): + continue + + return None + + +def __test_model(model_id): + """Test model connectivity with a simple completion.""" + from litellm import completion + + # Verify API key exists + api_key = __get_api_key() + if not api_key: + print("Error: FIREWORKS_API_KEY not found.") + return False + + # Configure environment for litellm + os.environ["FIREWORKS_API_KEY"] = api_key + # Assume model_id is fully qualified + model_name = model_id + + print(f"Testing model: {model_name}") + + try: + # Send test completion + response = completion( + model=model_name, + messages=[{"role": "user", "content": "Test message. Reply with OK."}], + temperature=0.0, + max_tokens=10, + ) + + print(f"Success. 
Response: {response.choices[0].message.content}") + print(f"Tokens used: {response.usage.total_tokens}") + return True + + except Exception as e: + print(f"Error: {e}") + return False + + +def __validate_environment(): + """Check for required API key.""" + if not __get_api_key(): + print("Warning: FIREWORKS_API_KEY not found.") + print("Set it with: mini-extra config set FIREWORKS_API_KEY ") + + +def __build_command(args, wrapper_module_path): + """Build mini-swe-agent command with appropriate arguments.""" + # Construct model class path + wrapper_module = wrapper_module_path.stem + model_class = f"{wrapper_module}.FireworksCompatibleModel" + + # Base command - assume model_id is fully qualified + cmd = [ + sys.executable, + "-m", + "minisweagent.run.mini_extra", + "swebench-single" if args.single is not None else "swebench", + "--model", + args.model_id, + "--model-class", + "tracing_model.FireworksCompatibleModel", + "--subset", + args.subset, + "--split", + args.split, + ] + if args.model_class: + cmd.extend(["--model-class", args.model_class]) + print(f"Output: {args.output}") + print(args.single) + # Mode-specific arguments + print(f"Output: {args.output}") + print(args.single) + # Mode-specific arguments + if args.single is not None: + # Use batch mode for a single index via slice and write to a per-row directory + from pathlib import Path + + slice_spec = f"{args.single}:{args.single + 1}" + row_dir = str((Path(args.output) if args.output else Path.cwd()) / f"row_{args.single}") + cmd = [ + sys.executable, + "-m", + "minisweagent.run.mini_extra", + "swebench", + "--model", + args.model_id, + "--model-class", + model_class, + "--subset", + args.subset, + "--split", + args.split, + "--slice", + slice_spec, + "--output", + row_dir, + ] + if args.model_class: + cmd.extend(["--model-class", args.model_class]) + print(f"DEBUG: Using batch mode with slice {slice_spec}, output={row_dir}") + else: + if args.instances: + cmd.extend(["--slice", f"0:{args.instances}"]) + cmd.extend(["--workers", str(args.workers), "--output", args.output]) + + return cmd + + +def main(): + parser = argparse.ArgumentParser( + description="Run mini-swe-agent with Fireworks models on SWE-bench", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + # Required model ID + parser.add_argument("model_id", help="Fireworks model ID") + parser.add_argument("--model-class", type=str, default=None, help="Optional mini-swe-agent model-class") + # Execution options + parser.add_argument("--instances", type=int, help="Number of instances to run") + parser.add_argument("--workers", type=int, default=1, help="Parallel workers (default: 1)") + parser.add_argument("--output", help="Output directory") + parser.add_argument("--subset", default="verified", choices=["verified", "lite", "full"]) + parser.add_argument("--split", default="test", choices=["dev", "test"]) + parser.add_argument("--single", type=int, metavar="INDEX", help="Run single instance") + parser.add_argument("--exit-immediately", action="store_true") + parser.add_argument("--test", action="store_true", help="Test model connectivity") + parser.add_argument( + "--reasoning", + type=str, + choices=["low", "medium", "high"], + default=None, + help="Provider-specific reasoning effort", + ) + parser.add_argument("--temperature", type=float, default=None, help="Model temperature override") + parser.add_argument("--max-tokens", type=int, default=None, help="Max tokens override") + args = parser.parse_args() + + # Handle test mode + if args.test: 
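+        # Connectivity check only: __test_model sends a one-shot completion and the
+        # exit code reflects whether it succeeded; no SWE-bench work happens here.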
+        sys.exit(0 if __test_model(args.model_id) else 1)
+
+    # Validate API key
+    __validate_environment()
+
+    # Set default output directory
+    if args.output is None:
+        safe_model_id = args.model_id.replace("/", "-").replace(":", "-")
+        script_dir = Path(__file__).parent.resolve()
+        args.output = str(script_dir / f"swebench-{safe_model_id}-results")
+
+    # Create temporary module for importing FireworksCompatibleModel
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+        with open(__file__, "r") as current_file:
+            f.write(current_file.read())
+        # Inject per-run model overrides directly into the temp module
+        f.write("\n# --- Injected by wrapper: per-run model overrides ---\n")
+        f.write("WRAPPER_MODEL_OVERRIDES = {\n")
+        f.write(f"    'reasoning': {repr(args.reasoning)},\n")
+        f.write(f"    'temperature': {repr(args.temperature)},\n")
+        f.write(f"    'max_tokens': {repr(args.max_tokens)},\n")
+        f.write("}\n")
+        temp_module_path = Path(f.name)
+
+    try:
+        # Configure environment
+        env = os.environ.copy()
+        env["PYTHONPATH"] = f"{temp_module_path.parent}:{env.get('PYTHONPATH', '')}"
+        # Pass the fully qualified model path to the subprocess
+        env["FIREWORKS_MODEL_ID"] = args.model_id
+
+        # Ensure API key is passed to subprocess
+        api_key = __get_api_key()
+        if api_key:
+            env["FIREWORKS_API_KEY"] = api_key
+
+        # No environment variables for model kwargs; overrides are injected into the temp module
+
+        # Build command
+        cmd = __build_command(args, temp_module_path)
+
+        # Display configuration
+        print(f"Model: {args.model_id}")
+        print(f"Output: {args.output}")
+        print(f"Workers: {args.workers}")
+        if args.instances:
+            print(f"Instances: {args.instances}")
+
+        # Debug: Show the actual command being run
+        print(f"Command: {' '.join(cmd)}")
+        print(f"Model path in command: {cmd[cmd.index('--model') + 1] if '--model' in cmd else 'NOT FOUND'}")
+
+        # Execute mini-swe-agent
+        subprocess.run(cmd, env=env, check=True)
+
+    finally:
+        # Clean up temporary module
+        if temp_module_path.exists():
+            temp_module_path.unlink()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/swebench/server.py b/examples/swebench/server.py
new file mode 100644
index 00000000..1c1fbb03
--- /dev/null
+++ b/examples/swebench/server.py
@@ -0,0 +1,246 @@
+"""Minimal SWE-bench server - wraps run_swe_agent_fw.py with tracing via model_base_url."""
+
+import os
+import threading
+import subprocess
+import logging
+from fastapi import FastAPI
+import uvicorn
+
+from eval_protocol import Status, InitRequest, RolloutIdFilter
+from eval_protocol.log_utils.init import init_external_logging_from_env
+
+app = FastAPI()
+
+# Attach Elasticsearch handler to root logger (Eval Protocol UI)
+init_external_logging_from_env()
+# rollout_states = {}
+
+
+@app.post("/init")
+def init(req: InitRequest):
+    # Allow Eval Protocol to dynamically configure ES endpoint
+
+    # Tag all logs for this rollout_id
+    logger = logging.getLogger(f"{__name__}.{req.metadata.rollout_id}")
+    logger.addFilter(RolloutIdFilter(req.metadata.rollout_id))
+
+    def _worker():
+        # Pre-bind result fields so the finally block can always log a result,
+        # even if the worker fails before the harness step runs.
+        instance_id = None
+        resolved = None
+        exit_reason = None
+        try:
+            # Validate model
+            if not req.model:
+                raise ValueError("model is required")
+
+            if not req.metadata or not req.metadata.row_id:
+                raise ValueError("metadata.row_id is required and must be an integer index as string, e.g. 
'0'") + try: + single_index = int(str(req.metadata.row_id)) + except ValueError: + raise ValueError(f"row_id must be an integer index for --single, got: {req.metadata.row_id}") + env = os.environ.copy() + # Build environment for subprocess + if "FIREWORKS_API_KEY" in os.environ: + env["FIREWORKS_API_KEY"] = os.environ["FIREWORKS_API_KEY"] + # Make sure the tracing model module is importable by the subprocess + # so "tracing_model.TracingFireworksModel" can be imported + from pathlib import Path + + script_dir = Path(__file__).parent + env["PYTHONPATH"] = f"{script_dir}:{env.get('PYTHONPATH', '')}" + + # Sandbox by invocation_id to isolate concurrent test runs + from pathlib import Path + + invocation_id = req.metadata.invocation_id + base_dir = Path(os.getcwd()) / invocation_id + base_dir.mkdir(parents=True, exist_ok=True) + + script_path = str((Path(__file__).parent / "run_swe_agent_fw.py").resolve()) + + # Extract model_kwargs from req.metadata (forwarded from input_metadata) + model_kwargs = {} + # convert to logger.debug everywhere, remove debug then + logger.debug(f"req.metadata attributes: {dir(req.metadata)}") + + if hasattr(req.metadata, "model_kwargs"): + mk = getattr(req.metadata, "model_kwargs", None) + logger.debug(f"Found req.metadata.model_kwargs = {mk}") + if isinstance(mk, dict): + model_kwargs = mk + logger.debug(f"Extracted model_kwargs from metadata: {model_kwargs}") + else: + logger.debug("req.metadata has NO model_kwargs attribute") + + # Set tracing URL + if req.model_base_url: + env["TRACING_BASE_URL"] = req.model_base_url + + cmd = [ + "python3", + script_path, + req.model, + "--single", + str(single_index), + "--exit-immediately", + "--output", + str(base_dir), + "--model-class", + "tracing_model.TracingFireworksModel", + ] + # Forward model kwargs as CLI flags to the wrapper + if model_kwargs.get("reasoning") in ("low", "medium", "high"): + cmd.extend(["--reasoning", str(model_kwargs["reasoning"])]) + if model_kwargs.get("temperature") is not None: + cmd.extend(["--temperature", str(model_kwargs["temperature"])]) + if model_kwargs.get("max_tokens") is not None: + cmd.extend(["--max-tokens", str(model_kwargs["max_tokens"])]) + import json + + # Log path inside row directory for this run + row_dir = base_dir / f"row_{single_index}" + row_dir.mkdir(parents=True, exist_ok=True) + log_path = row_dir / f"agent_{single_index}.log" + + # Run without streaming; write all output to a log file; wait until completion + with open(log_path, "w") as lf: + proc = subprocess.Popen( + cmd, + env=env, + stdout=lf, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + ret = proc.wait() + + # Use row-specific preds.json to avoid cross-run interference + preds_path = row_dir / "preds.json" + if preds_path.exists(): + logger.info(f"Using preds.json at: {preds_path}") + else: + logger.error(f"No preds.json found at {preds_path}") + + # 2) Run SWE-bench evaluation harness on preds.json + preds_path_str = str(preds_path) + unique_run_id = f"eval-{invocation_id}" + eval_cmd = [ + "python3", + "-m", + "swebench.harness.run_evaluation", + "--dataset_name", + "princeton-nlp/SWE-bench_Verified", + "--predictions_path", + preds_path_str, + "--max_workers", + str(os.getenv("SWEBENCH_EVAL_WORKERS", "5")), + "--run_id", + unique_run_id, + ] + logger.info("Starting SWE-bench harness: %s", " ".join(map(str, eval_cmd))) + eval_proc = subprocess.Popen( + eval_cmd, cwd=str(row_dir), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1 + ) + assert eval_proc.stdout is not None + 
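+            # Stream the harness output line by line into the rollout-tagged logger
+            # so it is visible in the Eval Protocol UI.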
+            for line in eval_proc.stdout:
+                logger.info(line.rstrip("\n"))
+            eval_rc = eval_proc.wait()
+            logger.info(f"SWE-bench harness exited with code {eval_rc}")
+
+            # Collect evaluation results to send via Elasticsearch
+            import yaml
+
+            instance_id = None
+            resolved = None
+            exit_reason = None
+
+            if preds_path.exists():
+                try:
+                    preds = json.loads(preds_path.read_text())
+                    instance_id = next(iter(preds.keys()), None)
+                except Exception:
+                    pass
+
+            if instance_id:
+                model_id = req.model
+                if model_id:
+                    safe_model = model_id.replace("/", "__").replace(":", "-")
+                    # The harness writes reports under logs/run_evaluation/<run_id>/<model>/<instance>/
+                    report_path = (
+                        row_dir / "logs" / "run_evaluation" / unique_run_id / safe_model / instance_id / "report.json"
+                    )
+
+                    if report_path.exists():
+                        try:
+                            report_data = json.loads(report_path.read_text())
+                            resolved = bool(report_data.get(instance_id, {}).get("resolved", False))
+                        except Exception:
+                            pass
+
+                    if resolved is None:
+                        exit_files = sorted(row_dir.glob("exit_statuses_*.yaml"))
+                        if exit_files:
+                            try:
+                                status_doc = yaml.safe_load(exit_files[-1].read_text()) or {}
+                                by_status = status_doc.get("instances_by_exit_status", {})
+                                for status_name, ids in by_status.items():
+                                    if instance_id in (ids or []):
+                                        resolved = False
+                                        exit_reason = status_name
+                                        break
+                            except Exception:
+                                pass
+
+            results_data = {
+                "instance_id": instance_id,
+                "resolved": resolved,
+                "exit_reason": exit_reason,
+                "row_id": str(single_index),
+            }
+
+        except Exception as e:
+            # Best-effort: mark error but still finish to unblock polling
+            results_data = {"error": str(e), "row_id": str(single_index)}
+            logger.error(f"Rollout error: {e}", extra={"status": Status.rollout_error(str(e))})
+        finally:
+            # Create and log EvaluateResult in standardized format
+            from eval_protocol.models import EvaluateResult, MetricResult
+
+            if resolved is not None:
+                reason = f"instance={instance_id}, resolved={resolved}"
+                if exit_reason:
+                    reason += f", exit_reason={exit_reason}"
+
+                eval_result = EvaluateResult(
+                    score=1.0 if resolved else 0.0,
+                    reason=reason,
+                    is_score_valid=True,
+                    metrics={
+                        "resolved": MetricResult(
+                            score=1.0 if resolved else 0.0,
+                            is_score_valid=True,
+                            reason=f"resolved={resolved}",
+                            value=int(resolved),
+                        )
+                    },
+                )
+                logger.info(
+                    f"EVAL_RESULT:{eval_result.model_dump_json()}", extra={"status": Status.rollout_finished()}
+                )
+            else:
+                logger.info("EVAL_RESULT:null", extra={"status": Status.rollout_finished()})
+
+    threading.Thread(target=_worker, daemon=True).start()
+    return {"status": "accepted"}
+
+
+# @app.get("/status")
+# def status(rollout_id: str):
+#     return rollout_states.get(rollout_id, {"terminated": False})
+
+
+def main():
+    host = os.getenv("REMOTE_SERVER_HOST", "127.0.0.1")
+    port = int(os.getenv("REMOTE_SERVER_PORT", "3000"))
+    uvicorn.run(app, host=host, port=port)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/swebench/tests/test_swebench.py b/examples/swebench/tests/test_swebench.py
new file mode 100644
index 00000000..87158ed1
--- /dev/null
+++ b/examples/swebench/tests/test_swebench.py
@@ -0,0 +1,79 @@
+from typing import List
+from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader
+from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult
+from eval_protocol.pytest import evaluation_test
+from eval_protocol.pytest.remote_rollout_processor import RemoteRolloutProcessor
+from eval_protocol.utils.evaluation_row_utils import create_rows_from_indices
+
+
+def rows() -> List[EvaluationRow]:
+    return create_rows_from_indices(500)  # Full SWE-bench Verified split; max_dataset_rows below limits the run
+
+
+# -------------------- Harness result attachment (UI pass/fail) --------------------
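+# How the pieces fit together: DynamicDataLoader feeds the index-only rows from
+# rows(), RemoteRolloutProcessor hands each row to the local server at
+# http://127.0.0.1:3000 (the POST /init endpoint in server.py), and
+# max_dataset_rows caps how many SWE-bench instances are attempted per run.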
+@evaluation_test( + data_loaders=DynamicDataLoader( + generators=[rows], + ), + max_dataset_rows=2, + rollout_processor=RemoteRolloutProcessor( + remote_base_url="http://127.0.0.1:3000", + model_base_url="https://tracing.fireworks.ai", + timeout_seconds=1800, + disable_elastic_search_setup=True, + ), + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}], + max_concurrent_rollouts=3, +) +async def test_swebench_remote(row: EvaluationRow) -> EvaluationRow: + """Evaluate SWE-bench instance by reading results from Fireworks tracing logs.""" + import logging + + logger = logging.getLogger(__name__) + + rollout_id = row.execution_metadata.rollout_id + logger.info(f"[DEBUG] Processing rollout_id: {rollout_id}") + + if not rollout_id: + logger.warning("[DEBUG] No rollout_id") + return row + + try: + from eval_protocol.adapters.fireworks_tracing import FireworksTracingAdapter + + adapter = FireworksTracingAdapter(base_url="https://tracing.fireworks.ai") + logger.info("[DEBUG] Created adapter for https://tracing.fireworks.ai") + + # Fetch logs for this rollout + logger.info(f"[DEBUG] Searching for tag: rollout_id:{rollout_id}") + log_entries = adapter.search_logs(tags=[f"rollout_id:{rollout_id}"], limit=100, hours_back=24) + + logger.info(f"[DEBUG] Received {len(log_entries)} log entries") + if log_entries: + logger.info(f"[DEBUG] Sample messages: {[e.get('message', '')[:50] for e in log_entries[:3]]}") + + # Find EVAL_RESULT message + found = False + for entry in log_entries: + message = entry.get("message", "") + if message.startswith("EVAL_RESULT:"): + logger.info("[DEBUG] Found EVAL_RESULT message!") + result_json = message.replace("EVAL_RESULT:", "") + logger.info(f"[DEBUG] Parsing JSON: {result_json[:100]}...") + + if result_json != "null": + row.evaluation_result = EvaluateResult.model_validate_json(result_json) + logger.info( + f"[DEBUG] Attached result: score={row.evaluation_result.score}, reason={row.evaluation_result.reason}" + ) + found = True + break + + if not found: + logger.warning(f"[DEBUG] No EVAL_RESULT message found in {len(log_entries)} logs") + + except Exception as e: + logger.error(f"[DEBUG] Exception: {e}", exc_info=True) + + logger.info(f"[DEBUG] Returning row, has evaluation_result: {row.evaluation_result is not None}") + return row diff --git a/examples/swebench/tracing_model.py b/examples/swebench/tracing_model.py new file mode 100644 index 00000000..11375fc0 --- /dev/null +++ b/examples/swebench/tracing_model.py @@ -0,0 +1,192 @@ +""" +Custom model classes for integrating mini-swe-agent with eval-protocol's tracing infrastructure. + +## Why This File Exists + +mini-swe-agent is an autonomous agent that makes 20-100+ LLM API calls per SWE-bench instance +(e.g., reading files, editing code, running tests). To debug agent behavior and display results +in eval-protocol's UI, we need to capture and analyze every LLM call. + +This file bridges mini-swe-agent (which uses LitellmModel) with the Fireworks tracing proxy +(which requires specific URL patterns and SDK usage). 
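+At a high level: mini-swe-agent calls TracingFireworksModel._query, which sends the
+request with the OpenAI SDK to the proxy at TRACING_BASE_URL; the proxy forwards it
+to Fireworks while recording the call for the eval-protocol UI.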
+ +## Problem Without This File + +By default, mini-swe-agent would: +- Call Fireworks API directly (no tracing) +- Agent conversations invisible in eval-protocol UI +- Can't debug why agent failed +- No cost tracking per call +- Model names get mangled by litellm routing + +## What These Classes Do + +### FireworksCompatibleModel (Base) +- Extends mini-swe-agent's LitellmModel +- Handles Fireworks API compatibility: + * Strips non-standard message fields that Fireworks API rejects + * Adds stop sequences to prevent common agent failure modes + * Applies temperature/reasoning overrides from wrapper script +- Used when tracing isn't needed (direct Fireworks API calls) + +### TracingFireworksModel (For eval-protocol integration) +- Extends FireworksCompatibleModel +- Routes ALL LLM calls through Fireworks tracing proxy instead of direct API +- Uses OpenAI SDK (not litellm) to preserve full model names +""" + +import sys +import os + +from minisweagent.models.litellm_model import LitellmModel + + +class FireworksCompatibleModel(LitellmModel): + """ + Fireworks-compatible wrapper for LitellmModel. + """ + + def __init__(self, **kwargs): + model_id = os.environ.get("FIREWORKS_MODEL_ID") + if model_id: + kwargs["model_name"] = model_id + + if "model_kwargs" not in kwargs: + kwargs["model_kwargs"] = {} + + # CRITICAL: Set drop_params to False so stop sequences aren't stripped! + kwargs["model_kwargs"]["drop_params"] = False + + # Get existing stop sequences + existing_stop = kwargs["model_kwargs"].get("stop", []) + if isinstance(existing_stop, str): + existing_stop = [existing_stop] + elif existing_stop is None: + existing_stop = [] + + # Add stop sequences (only the non-natural ones) + # stop_sequences = existing_stop + [ + # # ASCII versions + # "<|User|>", + # "<|Assistant|>", + # # Full-width PIPE versions (U+FF5C) + # "<|User|>", # \uff5c + # "<|Assistant|>", + # "```<|", + # "<|User", + # "<|Ass", + # # Full-width LETTER L versions (U+FF4C) + # "<lUser|>", # \uff4c + # "<lAssistant|>", + # "```<l", + # "<lUser", + # "<lAss", + # ] + # kwargs["model_kwargs"]["stop"] = stop_sequences + kwargs["model_kwargs"]["max_tokens"] = 1024 # Reduce to 1024 to save tokens + + if "temperature" not in kwargs["model_kwargs"]: + kwargs["model_kwargs"]["temperature"] = 0.0 + + # Apply per-run overrides injected by the wrapper (no environment variables) + overrides = globals().get("WRAPPER_MODEL_OVERRIDES") + if isinstance(overrides, dict): + if overrides.get("reasoning") in ("low", "medium", "high"): + kwargs["model_kwargs"]["reasoning_effort"] = overrides["reasoning"] + if overrides.get("temperature") is not None: + try: + kwargs["model_kwargs"]["temperature"] = float(overrides["temperature"]) + except Exception: + pass + if overrides.get("max_tokens") is not None: + try: + kwargs["model_kwargs"]["max_tokens"] = int(overrides["max_tokens"]) + except Exception: + pass + + super().__init__(**kwargs) + + def _query(self, messages: list[dict[str, str]], **kwargs): + """Remove non-standard fields before sending to Fireworks API.""" + # Keep only standard OpenAI-compatible fields + clean_messages = [] + for msg in messages: + clean_msg = {"role": msg["role"], "content": msg["content"]} + if "tool_calls" in msg: + clean_msg["tool_calls"] = msg["tool_calls"] + if "name" in msg: + clean_msg["name"] = msg["name"] + clean_messages.append(clean_msg) + + # IMPORTANT: Ensure drop_params stays False in the actual query + kwargs_with_stop = kwargs.copy() + if "drop_params" not in kwargs_with_stop: + 
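+            # litellm's drop_params=True would silently strip parameters it thinks the
+            # provider does not support (including the stop list), so keep it False here.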
kwargs_with_stop["drop_params"] = False + + return super()._query(clean_messages, **kwargs_with_stop) + + +class TracingFireworksModel(FireworksCompatibleModel): + """Routes LLM calls through tracing using OpenAI SDK (preserves model name).""" + + def _query(self, messages, **kwargs): + """Use OpenAI SDK directly to preserve model name through tracing.""" + from openai import OpenAI + import traceback + + tracing_url = os.environ.get("TRACING_BASE_URL", "") + api_key = os.environ.get("FIREWORKS_API_KEY", "") + + if not tracing_url: + print("⚠️ No TRACING_BASE_URL - using parent litellm") + return super()._query(messages, **kwargs) + + print("\n🔗 OpenAI SDK Call:") + print(f" URL: {tracing_url[:60]}...") + print(f" Model: {self.config.model_name}") + + try: + client = OpenAI(base_url=tracing_url, api_key=api_key) + + # Build OpenAI-compatible params + openai_kwargs = {} + if self.config.model_kwargs.get("stop"): + openai_kwargs["stop"] = self.config.model_kwargs["stop"] + print(f" Stop sequences: {len(openai_kwargs['stop'])}") + if self.config.model_kwargs.get("max_tokens"): + openai_kwargs["max_tokens"] = self.config.model_kwargs["max_tokens"] + if self.config.model_kwargs.get("temperature") is not None: + openai_kwargs["temperature"] = self.config.model_kwargs["temperature"] + + # CRITICAL: Clean messages - remove 'extra' fields that OpenAI API doesn't accept! + clean_messages = [] + for msg in messages: + clean_msg = {"role": msg["role"], "content": msg["content"]} + # Preserve standard fields only + if "name" in msg: + clean_msg["name"] = msg["name"] + if "tool_calls" in msg: + clean_msg["tool_calls"] = msg["tool_calls"] + clean_messages.append(clean_msg) + + print(f" Messages: {len(clean_messages)} (cleaned)") + print(" Making call...") + + # OpenAI SDK call + response = client.chat.completions.create( + model=self.config.model_name, + messages=clean_messages, # ← Use cleaned messages! + **openai_kwargs, + ) + + print(" ✅ Call succeeded!") + print(f" Response ID: {response.id}") + print(f" Tokens: {response.usage.total_tokens if response.usage else 'N/A'}\n") + + return response + + except Exception as e: + print("\n❌ ERROR in TracingFireworksModel._query:") + print(f" {type(e).__name__}: {e}") + traceback.print_exc() + raise