95 changes: 95 additions & 0 deletions Makefile
@@ -1,6 +1,101 @@
PYTHON_DIRS = tests examples scripts eval_protocol
PY ?= uv run python

.PHONY: clean build dist upload test lint typecheck format release sync-docs version tag-version show-version bump-major bump-minor bump-patch full-release quick-release
## -----------------------------
## Local Langfuse + LiteLLM E2E
## -----------------------------

.PHONY: local-install local-langfuse-up local-langfuse-up-local local-langfuse-wait local-litellm-up local-litellm-smoke local-adapter-smoke local-generate-traces local-generate-chinook local-eval local-eval-fireworks-only local-quick-run

local-install:
uv pip install -e ".[langfuse]"

# 1) Start Langfuse per official docs (run from Langfuse repo). Here we just export env.
local-langfuse-up:
@echo "Ensure you started Langfuse via docker compose as per docs."
@echo "Docs: https://langfuse.com/self-hosting/deployment/docker-compose"
@echo "Exporting LANGFUSE env vars for SDK..."
LANGFUSE_PUBLIC_KEY=$${LANGFUSE_PUBLIC_KEY:-local}; \
LANGFUSE_SECRET_KEY=$${LANGFUSE_SECRET_KEY:-local}; \
LANGFUSE_HOST=$${LANGFUSE_HOST:-http://localhost:3000}; \
printf "LANGFUSE_PUBLIC_KEY=%s\nLANGFUSE_SECRET_KEY=%s\nLANGFUSE_HOST=%s\n" $$LANGFUSE_PUBLIC_KEY $$LANGFUSE_SECRET_KEY $$LANGFUSE_HOST

# Start Langfuse using local compose file
local-langfuse-up-local:
docker compose -f examples/local_langfuse_litellm_ollama/langfuse-docker-compose.yml up -d

# Wait until Langfuse UI responds
local-langfuse-wait:
LANGFUSE_HOST=$${LANGFUSE_HOST:-http://localhost:3000}; \
echo "Waiting for $$LANGFUSE_HOST ..."; \
for i in $$(seq 1 60); do \
code=$$(curl -s -o /dev/null -w "%{http_code}" $$LANGFUSE_HOST); \
if [ "$$code" = "200" ] || [ "$$code" = "302" ]; then echo "Langfuse is up (HTTP $$code)"; exit 0; fi; \
sleep 2; \
done; \
echo "Langfuse did not become ready in time."; exit 1

# 2) Start the LiteLLM router (requires litellm installed); runs in the foreground.
local-litellm-up:
LITELLM_API_KEY=$${LITELLM_API_KEY:-local-demo-key}; \
printf "LITELLM_API_KEY=%s\n" $$LITELLM_API_KEY; \
LITELLM_API_KEY=$$LITELLM_API_KEY uv run litellm --config examples/local_langfuse_litellm_ollama/litellm-config.yaml --port 4000

# 2b) Smoke test LiteLLM endpoints
local-litellm-smoke:
@test -n "$$LITELLM_API_KEY" || (echo "LITELLM_API_KEY not set" && exit 1)
curl -s -H "Authorization: Bearer $$LITELLM_API_KEY" http://127.0.0.1:4000/v1/models | head -n 5 | cat
curl -s \
-H "Authorization: Bearer $$LITELLM_API_KEY" \
-H "Content-Type: application/json" \
http://127.0.0.1:4000/v1/chat/completions \
-d '{"model":"ollama/llama3.1","messages":[{"role":"user","content":"Say hi"}]}' \
| head -n 40 | cat

# 3) Seed traces into Langfuse (see local-generate-traces / local-generate-chinook below)

# 4) Adapter smoke test (fetch 1 row)
local-adapter-smoke:
LANGFUSE_HOST=$${LANGFUSE_HOST:-http://localhost:3000}; \
code=$$(curl -s -o /dev/null -w "%{http_code}" $$LANGFUSE_HOST); \
if [ "$$code" != "200" ] && [ "$$code" != "302" ]; then \
echo "Langfuse not reachable at $$LANGFUSE_HOST (HTTP $$code). Start it per docs."; \
exit 1; \
fi; \
LANGFUSE_PUBLIC_KEY=$${LANGFUSE_PUBLIC_KEY:-local}; \
LANGFUSE_SECRET_KEY=$${LANGFUSE_SECRET_KEY:-local}; \
LANGFUSE_PUBLIC_KEY=$$LANGFUSE_PUBLIC_KEY LANGFUSE_SECRET_KEY=$$LANGFUSE_SECRET_KEY LANGFUSE_HOST=$$LANGFUSE_HOST \
$(PY) -c "from eval_protocol.adapters.langfuse import create_langfuse_adapter; a=create_langfuse_adapter(); rows=a.get_evaluation_rows(limit=1, sample_size=1); print('Fetched rows:', len(rows))"

# Generate realistic traces into Langfuse (Chinook) using Fireworks models
local-generate-traces:
@test -n "$$FIREWORKS_API_KEY" || (echo "FIREWORKS_API_KEY not set" && exit 1)
uv pip install -e ".[pydantic,fireworks,chinook]" >/dev/null || true
CHINOOK_USE_STUB_DB=1 uv run pytest tests/chinook/langfuse/generate_traces.py -q

# Force-run Chinook generator with stub DB and Langfuse observe
local-generate-chinook:
@test -n "$$FIREWORKS_API_KEY" || (echo "FIREWORKS_API_KEY not set" && exit 1)
uv pip install -e ".[pydantic,fireworks,chinook]" >/dev/null || true
CHINOOK_USE_STUB_DB=1 uv run pytest tests/chinook/langfuse/generate_traces.py -q

# Note: both generator targets above run with CHINOOK_USE_STUB_DB=1, so no external database is required.

# 5) Run the local evaluation test (uses Fireworks as judge; requires FIREWORKS_API_KEY)
local-eval:
@test -n "$$FIREWORKS_API_KEY" || (echo "FIREWORKS_API_KEY not set" && exit 1)
uv run pytest eval_protocol/quickstart/llm_judge_langfuse_local.py -k test_llm_judge_local -q

# Run evaluation by calling Fireworks directly (skip LiteLLM router)
local-eval-fireworks-only:
@test -n "$$FIREWORKS_API_KEY" || (echo "FIREWORKS_API_KEY not set" && exit 1)
uv run pytest eval_protocol/quickstart/llm_judge_langfuse_fireworks_only.py -k test_llm_judge_fireworks_only -q

# One-shot: assumes Langfuse is already up externally and LiteLLM already running in another shell
local-quick-run: local-generate-chinook local-adapter-smoke local-eval
@echo "Done. Check Langfuse UI for scores."


clean:
rm -rf build/ dist/ *.egg-info/
67 changes: 66 additions & 1 deletion README.md
@@ -18,7 +18,66 @@ With hundreds of models and configs, you need objective data to choose the right
- **LLM judge**: Stack-rank models using pairwise Arena-Hard-Auto
- **Local UI**: Pivot/table views for real-time analysis

## ⚡ Quickstart (no labels needed)
## ⚡ Quickstart (local traces + local models)

This end-to-end flow runs Langfuse locally via Docker Compose, seeds application traces, and then runs a model picker that pairs a Fireworks-based judge with your local models (Ollama or llama.cpp). See `examples/local_langfuse_litellm_ollama/README.md` for the full guide.

### 1) Start Langfuse locally (compose file included)

```bash
# From repo root
docker compose -f examples/local_langfuse_litellm_ollama/langfuse-docker-compose.yml up -d
export LANGFUSE_HOST=http://localhost:3000
export LANGFUSE_PUBLIC_KEY=... # create in Langfuse UI
export LANGFUSE_SECRET_KEY=...
export LANGFUSE_ENVIRONMENT=local
```

Open `http://localhost:3000` and confirm the UI loads.
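
If you prefer a scripted readiness check, a minimal shell probe (the same HTTP 200/302 check that the `local-langfuse-wait` Make target performs) looks like this:

```bash
# Poll the Langfuse UI until it answers (mirrors `make local-langfuse-wait`)
LANGFUSE_HOST=${LANGFUSE_HOST:-http://localhost:3000}
for i in $(seq 1 60); do
  code=$(curl -s -o /dev/null -w "%{http_code}" "$LANGFUSE_HOST")
  if [ "$code" = "200" ] || [ "$code" = "302" ]; then echo "Langfuse is up (HTTP $code)"; break; fi
  sleep 2
done
```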

### 2) Seed traces (PydanticAgent, no external DB required)

```bash
export FIREWORKS_API_KEY=...
export CHINOOK_USE_STUB_DB=1
make -C . local-generate-chinook
```

Optionally verify the adapter can fetch rows:

```bash
make -C . local-adapter-smoke
```
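
For reference, this smoke test boils down to the single adapter call below (the same one-liner the Makefile target runs, assuming the default `local` keys):

```bash
# Roughly what `make local-adapter-smoke` runs once Langfuse is reachable
LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY:-local} \
LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY:-local} \
LANGFUSE_HOST=${LANGFUSE_HOST:-http://localhost:3000} \
uv run python -c "from eval_protocol.adapters.langfuse import create_langfuse_adapter; a = create_langfuse_adapter(); rows = a.get_evaluation_rows(limit=1, sample_size=1); print('Fetched rows:', len(rows))"
```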

### 3) Evaluate with local models

Ollama only, called directly (bypassing the LiteLLM router):

```bash
export DIRECT_OLLAMA=1
export OLLAMA_BASE_URL=http://127.0.0.1:11434
export OLLAMA_MODELS='ollama/llama3.1' # comma-separated to compare multiple
export FIREWORKS_API_KEY=...
# Optional debug to verify calls and logging
export EP_DEBUG=1
pytest eval_protocol/quickstart/llm_judge_langfuse_local.py -k test_llm_judge_local -q
```

Optional: via LiteLLM router (Ollama/llama.cpp):

```bash
export LITELLM_API_KEY=local-demo-key
litellm --config examples/local_langfuse_litellm_ollama/litellm-config.yaml --port 4000
export LITELLM_BASE_URL=http://127.0.0.1:4000
export OLLAMA_MODELS='ollama/llama3.1,ollama/llama3.2:1b'
# Optional debug to verify router calls and logging
export EP_DEBUG=1
pytest eval_protocol/quickstart/llm_judge_langfuse_local.py -k test_llm_judge_local -q
```

The pytest output includes local links for a leaderboard and row-level traces at `http://localhost:8000`.

## Basic AHA judge example (remote APIs)

Install with your tracing platform extras and set API keys:
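
For example, a minimal sketch assuming the `langfuse` extra and a Fireworks-based judge (adjust for your tracing platform):

```bash
# Minimal setup sketch; substitute your own tracing platform, keys, and install method
uv pip install "eval-protocol[langfuse]"
export LANGFUSE_PUBLIC_KEY=...
export LANGFUSE_SECRET_KEY=...
export LANGFUSE_HOST=...      # your self-hosted or cloud Langfuse URL
export FIREWORKS_API_KEY=...  # used by the AHA judge
```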

@@ -104,6 +163,12 @@ curl -LsSf https://astral.sh/uv/install.sh | sh
uv add eval-protocol
```

## 🧑‍💻 Developer notes

- The `eval-protocol logs` command may currently show no rows in some local setups even when Langfuse traces exist; use the local UI links printed by pytest and the Langfuse UI to inspect results instead. We’re tracking improvements to unify local logs with external trace sources.
- For Langfuse seeding, prefer `tests/chinook/langfuse/generate_traces.py` with `CHINOOK_USE_STUB_DB=1` to avoid external DBs.
- To compare multiple local models, set `OLLAMA_MODELS` (comma-separated) or use the LiteLLM config for mix-and-match backends.

## 📚 Resources

- **[Documentation](https://evalprotocol.io)** – Guides and API reference
29 changes: 24 additions & 5 deletions eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -4,7 +4,6 @@
import time
from typing import List

from litellm import acompletion
from typing import Dict

from eval_protocol.dataset_logger import default_logger
@@ -67,10 +66,23 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:

_litellm = importlib.import_module("litellm")
acompletion = getattr(_litellm, "acompletion")
if os.getenv("EP_DEBUG", "0").strip() == "1":
try:
dbg_model = request_params.get("model")
dbg_base = request_params.get("base_url")
print(
f"[EP-Debug] LiteLLM call: model={dbg_model}, base_url={dbg_base}, tools={'yes' if 'tools' in request_params else 'no'}"
)
except Exception:
pass
response = await acompletion(**request_params)

assistant_content = response.choices[0].message.content or ""
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
usage = {
"prompt_tokens": response.usage.prompt_tokens,
"completion_tokens": response.usage.completion_tokens,
"total_tokens": response.usage.total_tokens,
}

converted_tool_calls = None
if tool_calls:
@@ -112,16 +124,23 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
]

row.execution_metadata.usage = CompletionUsage(
prompt_tokens=response.usage.prompt_tokens,
completion_tokens=response.usage.completion_tokens,
total_tokens=response.usage.total_tokens,
prompt_tokens=usage["prompt_tokens"],
completion_tokens=usage["completion_tokens"],
total_tokens=usage["total_tokens"],
)

row.messages = messages

row.execution_metadata.duration_seconds = time.perf_counter() - start_time

default_logger.log(row)
if os.getenv("EP_DEBUG", "0").strip() == "1":
try:
print(
f"[EP-Debug] Logged row to EP: rollout_id={row.execution_metadata.rollout_id}, invoc_id={row.execution_metadata.invocation_id}, msg_count={len(row.messages)}"
)
except Exception:
pass
return row

semaphore = config.semaphore
12 changes: 11 additions & 1 deletion eval_protocol/quickstart/llm_judge.py
@@ -2,6 +2,7 @@
Default LLM judge for Eval Protocol. Inspired by Arena-Hard-Auto.
"""

import os
from typing import Optional

from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult
@@ -85,6 +86,15 @@ async def aha_judge(
# Upload score to adapter if provided
if adapter and row.evaluation_result and row.evaluation_result.is_score_valid:
model_name = row.input_metadata.completion_params.get("model", "unknown_model")
adapter.upload_score(row, model_name)
try:
if os.getenv("EP_DEBUG", "0").strip() == "1":
print(
f"[EP-Debug] Uploading score to Langfuse: model={model_name}, score={row.evaluation_result.score}"
)
adapter.upload_score(row, model_name)
if os.getenv("EP_DEBUG", "0").strip() == "1":
print("[EP-Debug] Upload score success")
except Exception as e:
print(f"[EP-Debug] Upload score failed: {repr(e)}")

return row
65 changes: 65 additions & 0 deletions eval_protocol/quickstart/llm_judge_langfuse_fireworks_only.py
@@ -0,0 +1,65 @@
"""Evaluate Langfuse traces with Fireworks-only rollout (no LiteLLM router).

This uses SingleTurnRolloutProcessor to call Fireworks directly via the
litellm client (not the proxy server) and then runs the AHA judge (also on
Fireworks by default). Scores are pushed back to Langfuse.
"""

from datetime import datetime
import os

import pytest

from eval_protocol import (
DynamicDataLoader,
EvaluationRow,
SingleTurnRolloutProcessor,
aha_judge,
create_langfuse_adapter,
evaluation_test,
multi_turn_assistant_to_ground_truth,
)


def langfuse_fireworks_data_generator() -> list[EvaluationRow]:
adapter = create_langfuse_adapter()
return adapter.get_evaluation_rows(
environment=os.getenv("LANGFUSE_ENVIRONMENT", "local"),
limit=int(os.getenv("LANGFUSE_LIMIT", "100")),
sample_size=int(os.getenv("LANGFUSE_SAMPLE_SIZE", "20")),
include_tool_calls=bool(int(os.getenv("LANGFUSE_INCLUDE_TOOL_CALLS", "1"))),
sleep_between_gets=float(os.getenv("LANGFUSE_SLEEP", "0.5")),
max_retries=int(os.getenv("LANGFUSE_MAX_RETRIES", "3")),
from_timestamp=None,
to_timestamp=datetime.utcnow(),
)


@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
@pytest.mark.skipif(
not os.getenv("FIREWORKS_API_KEY"),
reason="Requires FIREWORKS_API_KEY",
)
@pytest.mark.parametrize(
"completion_params",
[
{
"model": os.getenv("FIREWORKS_COMPLETION_MODEL", "accounts/fireworks/models/kimi-k2-instruct"),
"api_key": os.getenv("FIREWORKS_API_KEY"),
"base_url": os.getenv("FIREWORKS_BASE_URL", "https://api.fireworks.ai/inference/v1"),
"temperature": float(os.getenv("FIREWORKS_TEMPERATURE", "0.2")),
"max_tokens": int(os.getenv("FIREWORKS_MAX_TOKENS", "2048")),
},
],
)
@evaluation_test(
data_loaders=DynamicDataLoader(
generators=[langfuse_fireworks_data_generator],
preprocess_fn=multi_turn_assistant_to_ground_truth,
),
rollout_processor=SingleTurnRolloutProcessor(),
max_concurrent_evaluations=int(os.getenv("FIREWORKS_MAX_CONCURRENCY", "2")),
)
async def test_llm_judge_fireworks_only(row: EvaluationRow) -> EvaluationRow:
adapter = create_langfuse_adapter()
return await aha_judge(row, adapter=adapter)