95 changes: 95 additions & 0 deletions Makefile
@@ -1,6 +1,101 @@
PYTHON_DIRS = tests examples scripts eval_protocol
PY ?= uv run python

.PHONY: clean build dist upload test lint typecheck format release sync-docs version tag-version show-version bump-major bump-minor bump-patch full-release quick-release
## -----------------------------
## Local Langfuse + LiteLLM E2E
## -----------------------------

.PHONY: local-install local-langfuse-up local-langfuse-up-local local-langfuse-wait local-litellm-up local-litellm-smoke local-adapter-smoke local-generate-traces local-generate-chinook local-eval local-eval-fireworks-only local-quick-run

local-install:
uv pip install -e ".[langfuse]"

# 1) Start Langfuse per official docs (run from Langfuse repo). Here we just export env.
local-langfuse-up:
@echo "Ensure you started Langfuse via docker compose as per docs."
@echo "Docs: https://langfuse.com/self-hosting/deployment/docker-compose"
@echo "Exporting LANGFUSE env vars for SDK..."
LANGFUSE_PUBLIC_KEY=$${LANGFUSE_PUBLIC_KEY:-local}; \
LANGFUSE_SECRET_KEY=$${LANGFUSE_SECRET_KEY:-local}; \
LANGFUSE_HOST=$${LANGFUSE_HOST:-http://localhost:3000}; \
printf "LANGFUSE_PUBLIC_KEY=%s\nLANGFUSE_SECRET_KEY=%s\nLANGFUSE_HOST=%s\n" $$LANGFUSE_PUBLIC_KEY $$LANGFUSE_SECRET_KEY $$LANGFUSE_HOST

# Start Langfuse using local compose file
local-langfuse-up-local:
docker compose -f examples/local_langfuse_litellm_ollama/langfuse-docker-compose.yml up -d

# Wait until Langfuse UI responds
local-langfuse-wait:
LANGFUSE_HOST=$${LANGFUSE_HOST:-http://localhost:3000}; \
echo "Waiting for $$LANGFUSE_HOST ..."; \
for i in $$(seq 1 60); do \
code=$$(curl -s -o /dev/null -w "%{http_code}" $$LANGFUSE_HOST); \
if [ "$$code" = "200" ] || [ "$$code" = "302" ]; then echo "Langfuse is up (HTTP $$code)"; exit 0; fi; \
sleep 2; \
done; \
echo "Langfuse did not become ready in time."; exit 1

# 2) Start the LiteLLM router (requires litellm installed); runs in the foreground.
local-litellm-up:
LITELLM_API_KEY=$${LITELLM_API_KEY:-local-demo-key}; \
printf "LITELLM_API_KEY=%s\n" $$LITELLM_API_KEY; \
LITELLM_API_KEY=$$LITELLM_API_KEY uv run litellm --config examples/local_langfuse_litellm_ollama/litellm-config.yaml --port 4000

# 2b) Smoke test LiteLLM endpoints
local-litellm-smoke:
@test -n "$$LITELLM_API_KEY" || (echo "LITELLM_API_KEY not set" && exit 1)
curl -s -H "Authorization: Bearer $$LITELLM_API_KEY" http://127.0.0.1:4000/v1/models | head -n 5 | cat
curl -s \
-H "Authorization: Bearer $$LITELLM_API_KEY" \
-H "Content-Type: application/json" \
http://127.0.0.1:4000/v1/chat/completions \
-d '{"model":"ollama/llama3.1","messages":[{"role":"user","content":"Say hi"}]}' \
| head -n 40 | cat

# 3) Seed traces into Langfuse (see local-generate-traces / local-generate-chinook below)

# 4) Adapter smoke test (fetch 1 row)
local-adapter-smoke:
LANGFUSE_HOST=$${LANGFUSE_HOST:-http://localhost:3000}; \
code=$$(curl -s -o /dev/null -w "%{http_code}" $$LANGFUSE_HOST); \
if [ "$$code" != "200" ] && [ "$$code" != "302" ]; then \
echo "Langfuse not reachable at $$LANGFUSE_HOST (HTTP $$code). Start it per docs."; \
exit 1; \
fi; \
LANGFUSE_PUBLIC_KEY=$${LANGFUSE_PUBLIC_KEY:-local}; \
LANGFUSE_SECRET_KEY=$${LANGFUSE_SECRET_KEY:-local}; \
LANGFUSE_PUBLIC_KEY=$$LANGFUSE_PUBLIC_KEY LANGFUSE_SECRET_KEY=$$LANGFUSE_SECRET_KEY LANGFUSE_HOST=$$LANGFUSE_HOST \
$(PY) -c "from eval_protocol.adapters.langfuse import create_langfuse_adapter; a=create_langfuse_adapter(); rows=a.get_evaluation_rows(limit=1, sample_size=1); print('Fetched rows:', len(rows))"

# Generate realistic traces into Langfuse (Chinook) using Fireworks models
local-generate-traces:
@test -n "$$FIREWORKS_API_KEY" || (echo "FIREWORKS_API_KEY not set" && exit 1)
uv pip install -e ".[pydantic,fireworks,chinook]" >/dev/null || true
CHINOOK_USE_STUB_DB=1 uv run pytest tests/chinook/langfuse/generate_traces.py -q

# Force-run Chinook generator with stub DB and Langfuse observe
local-generate-chinook:
@test -n "$$FIREWORKS_API_KEY" || (echo "FIREWORKS_API_KEY not set" && exit 1)
uv pip install -e ".[pydantic,fireworks,chinook]" >/dev/null || true
CHINOOK_USE_STUB_DB=1 uv run pytest tests/chinook/langfuse/generate_traces.py -q

# Note: both generator targets above run with CHINOOK_USE_STUB_DB=1, so no external database is required.

# 5) Run the local evaluation test (uses Fireworks as judge; requires FIREWORKS_API_KEY)
local-eval:
@test -n "$$FIREWORKS_API_KEY" || (echo "FIREWORKS_API_KEY not set" && exit 1)
uv run pytest eval_protocol/quickstart/llm_judge_langfuse_local.py -k test_llm_judge_local -q

# Run evaluation by calling Fireworks directly (skip LiteLLM router)
local-eval-fireworks-only:
@test -n "$$FIREWORKS_API_KEY" || (echo "FIREWORKS_API_KEY not set" && exit 1)
uv run pytest eval_protocol/quickstart/llm_judge_langfuse_fireworks_only.py -k test_llm_judge_fireworks_only -q

# One-shot: assumes Langfuse is already up externally and LiteLLM already running in another shell
local-quick-run: local-generate-chinook local-adapter-smoke local-eval
@echo "Done. Check Langfuse UI for scores."


clean:
rm -rf build/ dist/ *.egg-info/
67 changes: 66 additions & 1 deletion README.md
@@ -18,7 +18,66 @@ With hundreds of models and configs, you need objective data to choose the right
- **LLM judge**: Stack-rank models using pairwise Arena-Hard-Auto
- **Local UI**: Pivot/table views for real-time analysis

## ⚡ Quickstart (no labels needed)
## ⚡ Quickstart (local traces + local models)

This end-to-end flow runs Langfuse locally via Docker Compose, seeds application traces, and then runs a model picker that pairs a Fireworks-based judge with your local models (Ollama or llama.cpp). See `examples/local_langfuse_litellm_ollama/README.md` for the full guide.

### 1) Start Langfuse locally (compose file included)

```bash
# From repo root
docker compose -f examples/local_langfuse_litellm_ollama/langfuse-docker-compose.yml up -d
export LANGFUSE_HOST=http://localhost:3000
export LANGFUSE_PUBLIC_KEY=... # create in Langfuse UI
export LANGFUSE_SECRET_KEY=...
export LANGFUSE_ENVIRONMENT=local
```

Open `http://localhost:3000` and confirm the UI loads.
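
If you prefer a scripted readiness check, a minimal shell probe (the same HTTP 200/302 check that the `local-langfuse-wait` Make target performs) looks like this:

```bash
# Poll the Langfuse UI until it answers (mirrors `make local-langfuse-wait`)
LANGFUSE_HOST=${LANGFUSE_HOST:-http://localhost:3000}
for i in $(seq 1 60); do
  code=$(curl -s -o /dev/null -w "%{http_code}" "$LANGFUSE_HOST")
  if [ "$code" = "200" ] || [ "$code" = "302" ]; then echo "Langfuse is up (HTTP $code)"; break; fi
  sleep 2
done
```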

### 2) Seed traces (PydanticAgent, no external DB required)

```bash
export FIREWORKS_API_KEY=...
export CHINOOK_USE_STUB_DB=1
make -C . local-generate-chinook
```

Optionally verify the adapter can fetch rows:

```bash
make -C . local-adapter-smoke
```
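
For reference, this smoke test boils down to the single adapter call below (the same one-liner the Makefile target runs, assuming the default `local` keys):

```bash
# Roughly what `make local-adapter-smoke` runs once Langfuse is reachable
LANGFUSE_PUBLIC_KEY=${LANGFUSE_PUBLIC_KEY:-local} \
LANGFUSE_SECRET_KEY=${LANGFUSE_SECRET_KEY:-local} \
LANGFUSE_HOST=${LANGFUSE_HOST:-http://localhost:3000} \
uv run python -c "from eval_protocol.adapters.langfuse import create_langfuse_adapter; a = create_langfuse_adapter(); rows = a.get_evaluation_rows(limit=1, sample_size=1); print('Fetched rows:', len(rows))"
```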

### 3) Evaluate with local models

Ollama only, called directly (bypassing the LiteLLM router):

```bash
export DIRECT_OLLAMA=1
export OLLAMA_BASE_URL=http://127.0.0.1:11434
export OLLAMA_MODELS='ollama/llama3.1' # comma-separated to compare multiple
export FIREWORKS_API_KEY=...
# Optional debug to verify calls and logging
export EP_DEBUG=1
pytest eval_protocol/quickstart/llm_judge_langfuse_local.py -k test_llm_judge_local -q
```

Optional: via LiteLLM router (Ollama/llama.cpp):

```bash
export LITELLM_API_KEY=local-demo-key
litellm --config examples/local_langfuse_litellm_ollama/litellm-config.yaml --port 4000
export LITELLM_BASE_URL=http://127.0.0.1:4000
export OLLAMA_MODELS='ollama/llama3.1,ollama/llama3.2:1b'
# Optional debug to verify router calls and logging
export EP_DEBUG=1
pytest eval_protocol/quickstart/llm_judge_langfuse_local.py -k test_llm_judge_local -q
```

The pytest output includes local links for a leaderboard and row-level traces at `http://localhost:8000`.

## Basic AHA judge example (remote APIs)

Install with your tracing platform extras and set API keys:
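
For example, a minimal sketch assuming the `langfuse` extra and a Fireworks-based judge (adjust for your tracing platform):

```bash
# Minimal setup sketch; substitute your own tracing platform, keys, and install method
uv pip install "eval-protocol[langfuse]"
export LANGFUSE_PUBLIC_KEY=...
export LANGFUSE_SECRET_KEY=...
export LANGFUSE_HOST=...      # your self-hosted or cloud Langfuse URL
export FIREWORKS_API_KEY=...  # used by the AHA judge
```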

@@ -104,6 +163,12 @@ curl -LsSf https://astral.sh/uv/install.sh | sh
uv add eval-protocol
```

## 🧑‍💻 Developer notes

- The `eval-protocol logs` command may currently show no rows in some local setups even when Langfuse traces exist; use the local UI links printed by pytest and the Langfuse UI to inspect results instead. We’re tracking improvements to unify local logs with external trace sources.
- For Langfuse seeding, prefer `tests/chinook/langfuse/generate_traces.py` with `CHINOOK_USE_STUB_DB=1` to avoid external DBs.
- To compare multiple local models, set `OLLAMA_MODELS` (comma-separated) or use the LiteLLM config for mix-and-match backends.

## 📚 Resources

- **[Documentation](https://evalprotocol.io)** – Guides and API reference
29 changes: 24 additions & 5 deletions eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -4,7 +4,6 @@
import time
from typing import List

from litellm import acompletion
from typing import Dict

from eval_protocol.dataset_logger import default_logger
@@ -67,10 +66,23 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:

_litellm = importlib.import_module("litellm")
acompletion = getattr(_litellm, "acompletion")
if os.getenv("EP_DEBUG", "0").strip() == "1":
try:
dbg_model = request_params.get("model")
dbg_base = request_params.get("base_url")
print(
f"[EP-Debug] LiteLLM call: model={dbg_model}, base_url={dbg_base}, tools={'yes' if 'tools' in request_params else 'no'}"
)
except Exception:
pass
response = await acompletion(**request_params)

assistant_content = response.choices[0].message.content or ""
tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
usage = {
"prompt_tokens": response.usage.prompt_tokens,
"completion_tokens": response.usage.completion_tokens,
"total_tokens": response.usage.total_tokens,
}

converted_tool_calls = None
if tool_calls:
@@ -112,16 +124,23 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
]

row.execution_metadata.usage = CompletionUsage(
prompt_tokens=response.usage.prompt_tokens,
completion_tokens=response.usage.completion_tokens,
total_tokens=response.usage.total_tokens,
prompt_tokens=usage["prompt_tokens"],
completion_tokens=usage["completion_tokens"],
total_tokens=usage["total_tokens"],
)

row.messages = messages

row.execution_metadata.duration_seconds = time.perf_counter() - start_time

default_logger.log(row)
if os.getenv("EP_DEBUG", "0").strip() == "1":
try:
print(
f"[EP-Debug] Logged row to EP: rollout_id={row.execution_metadata.rollout_id}, invoc_id={row.execution_metadata.invocation_id}, msg_count={len(row.messages)}"
)
except Exception:
pass
return row

semaphore = config.semaphore
12 changes: 11 additions & 1 deletion eval_protocol/quickstart/llm_judge.py
@@ -2,6 +2,7 @@
Default LLM judge for Eval Protocol. Inspired by Arena-Hard-Auto.
"""

import os
from typing import Optional

from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult
@@ -85,6 +86,15 @@ async def aha_judge(
# Upload score to adapter if provided
if adapter and row.evaluation_result and row.evaluation_result.is_score_valid:
model_name = row.input_metadata.completion_params.get("model", "unknown_model")
adapter.upload_score(row, model_name)
try:
if os.getenv("EP_DEBUG", "0").strip() == "1":
print(
f"[EP-Debug] Uploading score to Langfuse: model={model_name}, score={row.evaluation_result.score}"
)
adapter.upload_score(row, model_name)
if os.getenv("EP_DEBUG", "0").strip() == "1":
print("[EP-Debug] Upload score success")
except Exception as e:
print(f"[EP-Debug] Upload score failed: {repr(e)}")

return row
65 changes: 65 additions & 0 deletions eval_protocol/quickstart/llm_judge_langfuse_fireworks_only.py
@@ -0,0 +1,65 @@
"""Evaluate Langfuse traces with Fireworks-only rollout (no LiteLLM router).

This uses SingleTurnRolloutProcessor to call Fireworks directly via the
litellm client (not the proxy server) and then runs the AHA judge (also on
Fireworks by default). Scores are pushed back to Langfuse.
"""

from datetime import datetime
import os

import pytest

from eval_protocol import (
DynamicDataLoader,
EvaluationRow,
SingleTurnRolloutProcessor,
aha_judge,
create_langfuse_adapter,
evaluation_test,
multi_turn_assistant_to_ground_truth,
)


def langfuse_fireworks_data_generator() -> list[EvaluationRow]:
adapter = create_langfuse_adapter()
return adapter.get_evaluation_rows(
environment=os.getenv("LANGFUSE_ENVIRONMENT", "local"),
limit=int(os.getenv("LANGFUSE_LIMIT", "100")),
sample_size=int(os.getenv("LANGFUSE_SAMPLE_SIZE", "20")),
include_tool_calls=bool(int(os.getenv("LANGFUSE_INCLUDE_TOOL_CALLS", "1"))),
sleep_between_gets=float(os.getenv("LANGFUSE_SLEEP", "0.5")),
max_retries=int(os.getenv("LANGFUSE_MAX_RETRIES", "3")),
from_timestamp=None,
to_timestamp=datetime.utcnow(),
)


@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
@pytest.mark.skipif(
not os.getenv("FIREWORKS_API_KEY"),
reason="Requires FIREWORKS_API_KEY",
)
@pytest.mark.parametrize(
"completion_params",
[
{
"model": os.getenv("FIREWORKS_COMPLETION_MODEL", "accounts/fireworks/models/kimi-k2-instruct"),
"api_key": os.getenv("FIREWORKS_API_KEY"),
"base_url": os.getenv("FIREWORKS_BASE_URL", "https://api.fireworks.ai/inference/v1"),
"temperature": float(os.getenv("FIREWORKS_TEMPERATURE", "0.2")),
"max_tokens": int(os.getenv("FIREWORKS_MAX_TOKENS", "2048")),
},
],
)
@evaluation_test(
data_loaders=DynamicDataLoader(
generators=[langfuse_fireworks_data_generator],
preprocess_fn=multi_turn_assistant_to_ground_truth,
),
rollout_processor=SingleTurnRolloutProcessor(),
max_concurrent_evaluations=int(os.getenv("FIREWORKS_MAX_CONCURRENCY", "2")),
)
async def test_llm_judge_fireworks_only(row: EvaluationRow) -> EvaluationRow:
adapter = create_langfuse_adapter()
return await aha_judge(row, adapter=adapter)