diff --git a/Makefile b/Makefile index 982f3ed0..cfa4d3d5 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,101 @@ PYTHON_DIRS = tests examples scripts eval_protocol +PY ?= uv run python .PHONY: clean build dist upload test lint typecheck format release sync-docs version tag-version show-version bump-major bump-minor bump-patch full-release quick-release +## ----------------------------- +## Local Langfuse + LiteLLM E2E +## ----------------------------- + +.PHONY: local-install local-langfuse-up local-langfuse-up-local local-langfuse-wait local-litellm-up local-litellm-smoke local-adapter-smoke local-generate-traces local-generate-chinook local-eval local-eval-fireworks-only local-quick-run + +local-install: + uv pip install -e ".[langfuse]" + +# 1) Start Langfuse per official docs (run from Langfuse repo). Here we just export env. +local-langfuse-up: + @echo "Ensure you started Langfuse via docker compose as per docs." + @echo "Docs: https://langfuse.com/self-hosting/deployment/docker-compose" + @echo "Exporting LANGFUSE env vars for SDK..." + LANGFUSE_PUBLIC_KEY=$${LANGFUSE_PUBLIC_KEY:-local}; \ + LANGFUSE_SECRET_KEY=$${LANGFUSE_SECRET_KEY:-local}; \ + LANGFUSE_HOST=$${LANGFUSE_HOST:-http://localhost:3000}; \ + printf "LANGFUSE_PUBLIC_KEY=%s\nLANGFUSE_SECRET_KEY=%s\nLANGFUSE_HOST=%s\n" $$LANGFUSE_PUBLIC_KEY $$LANGFUSE_SECRET_KEY $$LANGFUSE_HOST + +# Start Langfuse using local compose file +local-langfuse-up-local: + docker compose -f examples/local_langfuse_litellm_ollama/langfuse-docker-compose.yml up -d + +# Wait until Langfuse UI responds +local-langfuse-wait: + LANGFUSE_HOST=$${LANGFUSE_HOST:-http://localhost:3000}; \ + echo "Waiting for $$LANGFUSE_HOST ..."; \ + for i in $$(seq 1 60); do \ + code=$$(curl -s -o /dev/null -w "%{http_code}" $$LANGFUSE_HOST); \ + if [ "$$code" = "200" ] || [ "$$code" = "302" ]; then echo "Langfuse is up (HTTP $$code)"; exit 0; fi; \ + sleep 2; \ + done; \ + echo "Langfuse did not become ready in time."; exit 1 + +# 2) Start LiteLLM router (requires litellm installed). Keep foreground. +local-litellm-up: + LITELLM_API_KEY=$${LITELLM_API_KEY:-local-demo-key}; \ + printf "LITELLM_API_KEY=%s\n" $$LITELLM_API_KEY; \ + LITELLM_API_KEY=$$LITELLM_API_KEY uv run litellm --config examples/local_langfuse_litellm_ollama/litellm-config.yaml --port 4000 + +# 2b) Smoke test LiteLLM endpoints +local-litellm-smoke: + @test -n "$$LITELLM_API_KEY" || (echo "LITELLM_API_KEY not set" && exit 1) + curl -s -H "Authorization: Bearer $$LITELLM_API_KEY" http://127.0.0.1:4000/v1/models | head -n 5 | cat + curl -s \ + -H "Authorization: Bearer $$LITELLM_API_KEY" \ + -H "Content-Type: application/json" \ + http://127.0.0.1:4000/v1/chat/completions \ + -d '{"model":"ollama/llama3.1","messages":[{"role":"user","content":"Say hi"}]}' \ + | head -n 40 | cat + +# 3) Seed one trace into Langfuse + +# 4) Adapter smoke test (fetch 1 row) +local-adapter-smoke: + LANGFUSE_HOST=$${LANGFUSE_HOST:-http://localhost:3000}; \ + code=$$(curl -s -o /dev/null -w "%{http_code}" $$LANGFUSE_HOST); \ + if [ "$$code" != "200" ] && [ "$$code" != "302" ]; then \ + echo "Langfuse not reachable at $$LANGFUSE_HOST (HTTP $$code). Start it per docs."; \ + exit 1; \ + fi; \ + LANGFUSE_PUBLIC_KEY=$${LANGFUSE_PUBLIC_KEY:-local}; \ + LANGFUSE_SECRET_KEY=$${LANGFUSE_SECRET_KEY:-local}; \ + LANGFUSE_PUBLIC_KEY=$$LANGFUSE_PUBLIC_KEY LANGFUSE_SECRET_KEY=$$LANGFUSE_SECRET_KEY LANGFUSE_HOST=$$LANGFUSE_HOST \ + $(PY) -c "from eval_protocol.adapters.langfuse import create_langfuse_adapter; a=create_langfuse_adapter(); rows=a.get_evaluation_rows(limit=1, sample_size=1); print('Fetched rows:', len(rows))" + +# Generate realistic traces into Langfuse (Chinook) using Fireworks models +local-generate-traces: + @test -n "$$FIREWORKS_API_KEY" || (echo "FIREWORKS_API_KEY not set" && exit 1) + uv pip install -e ".[pydantic,fireworks,chinook]" >/dev/null || true + CHINOOK_USE_STUB_DB=1 uv run pytest tests/chinook/langfuse/generate_traces.py -q + +# Force-run Chinook generator with stub DB and Langfuse observe +local-generate-chinook: + @test -n "$$FIREWORKS_API_KEY" || (echo "FIREWORKS_API_KEY not set" && exit 1) + uv pip install -e ".[pydantic,fireworks,chinook]" >/dev/null || true + CHINOOK_USE_STUB_DB=1 uv run pytest tests/chinook/langfuse/generate_traces.py -q + +# Fallback generator that does not need external DBs + +# 5) Run the local evaluation test (uses Fireworks as judge; requires FIREWORKS_API_KEY) +local-eval: + @test -n "$$FIREWORKS_API_KEY" || (echo "FIREWORKS_API_KEY not set" && exit 1) + uv run pytest eval_protocol/quickstart/llm_judge_langfuse_local.py -k test_llm_judge_local -q + +# Run evaluation by calling Fireworks directly (skip LiteLLM router) +local-eval-fireworks-only: + @test -n "$$FIREWORKS_API_KEY" || (echo "FIREWORKS_API_KEY not set" && exit 1) + uv run pytest eval_protocol/quickstart/llm_judge_langfuse_fireworks_only.py -k test_llm_judge_fireworks_only -q + +# One-shot: assumes Langfuse is already up externally and LiteLLM already running in another shell +local-quick-run: local-seed-langfuse local-adapter-smoke local-eval + @echo "Done. Check Langfuse UI for scores." + clean: rm -rf build/ dist/ *.egg-info/ diff --git a/README.md b/README.md index 814dccba..48e07746 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,66 @@ With hundreds of models and configs, you need objective data to choose the right - **LLM judge**: Stack-rank models using pairwise Arena-Hard-Auto - **Local UI**: Pivot/table views for real-time analysis -## ⚡ Quickstart (no labels needed) +## ⚡ Quickstart (local traces + local models) + +This end-to-end uses a local Langfuse (Docker Compose), seeds app traces, then runs a model picker with a Fireworks-based judge and your local models (Ollama or llama.cpp). See `examples/local_langfuse_litellm_ollama/README.md` for a full guide. + +### 1) Start Langfuse locally (compose file included) + +```bash +# From repo root +docker compose -f examples/local_langfuse_litellm_ollama/langfuse-docker-compose.yml up -d +export LANGFUSE_HOST=http://localhost:3000 +export LANGFUSE_PUBLIC_KEY=... # create in Langfuse UI +export LANGFUSE_SECRET_KEY=... +export LANGFUSE_ENVIRONMENT=local +``` + +Open `http://localhost:3000` and confirm the UI loads. + +### 2) Seed traces (PydanticAgent, no external DB required) + +```bash +export FIREWORKS_API_KEY=... +export CHINOOK_USE_STUB_DB=1 +make -C . local-generate-chinook +``` + +Optionally verify the adapter can fetch rows: + +```bash +make -C . local-adapter-smoke +``` + +### 3) Evaluate with local models + +Ollama only, direct (bypass LiteLLM): + +```bash +export DIRECT_OLLAMA=1 +export OLLAMA_BASE_URL=http://127.0.0.1:11434 +export OLLAMA_MODELS='ollama/llama3.1' # comma-separated to compare multiple +export FIREWORKS_API_KEY=... +# Optional debug to verify calls and logging +export EP_DEBUG=1 +pytest eval_protocol/quickstart/llm_judge_langfuse_local.py -k test_llm_judge_local -q +``` + +Optional: via LiteLLM router (Ollama/llama.cpp): + +```bash +export LITELLM_API_KEY=local-demo-key +litellm --config examples/local_langfuse_litellm_ollama/litellm-config.yaml --port 4000 +export LITELLM_BASE_URL=http://127.0.0.1:4000 +export OLLAMA_MODELS='ollama/llama3.1,ollama/llama3.2:1b' +# Optional debug to verify router calls and logging +export EP_DEBUG=1 +pytest eval_protocol/quickstart/llm_judge_langfuse_local.py -k test_llm_judge_local -q +``` + +The pytest output includes local links for a leaderboard and row-level traces at `http://localhost:8000`. + +## Basic AHA judge example (remote APIs) Install with your tracing platform extras and set API keys: @@ -104,6 +163,12 @@ curl -LsSf https://astral.sh/uv/install.sh | sh uv add eval-protocol ``` +## 🧑‍💻 Developer notes + +- The `eval-protocol logs` command currently may show no rows in some local setups even when Langfuse traces exist; use the local UI links printed by pytest and the Langfuse UI to inspect results. We’re tracking improvements to unify local logs with external trace sources. +- For Langfuse seeding, prefer `tests/chinook/langfuse/generate_traces.py` with `CHINOOK_USE_STUB_DB=1` to avoid external DBs. +- To compare multiple local models, set `OLLAMA_MODELS` (comma-separated) or use the LiteLLM config for mix-and-match backends. + ## 📚 Resources - **[Documentation](https://evalprotocol.io)** – Guides and API reference diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index 2b4bf893..f05c6675 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -4,7 +4,6 @@ import time from typing import List -from litellm import acompletion from typing import Dict from eval_protocol.dataset_logger import default_logger @@ -67,10 +66,23 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: _litellm = importlib.import_module("litellm") acompletion = getattr(_litellm, "acompletion") + if os.getenv("EP_DEBUG", "0").strip() == "1": + try: + dbg_model = request_params.get("model") + dbg_base = request_params.get("base_url") + print( + f"[EP-Debug] LiteLLM call: model={dbg_model}, base_url={dbg_base}, tools={'yes' if 'tools' in request_params else 'no'}" + ) + except Exception: + pass response = await acompletion(**request_params) - assistant_content = response.choices[0].message.content or "" tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None + usage = { + "prompt_tokens": response.usage.prompt_tokens, + "completion_tokens": response.usage.completion_tokens, + "total_tokens": response.usage.total_tokens, + } converted_tool_calls = None if tool_calls: @@ -112,9 +124,9 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: ] row.execution_metadata.usage = CompletionUsage( - prompt_tokens=response.usage.prompt_tokens, - completion_tokens=response.usage.completion_tokens, - total_tokens=response.usage.total_tokens, + prompt_tokens=usage["prompt_tokens"], + completion_tokens=usage["completion_tokens"], + total_tokens=usage["total_tokens"], ) row.messages = messages @@ -122,6 +134,13 @@ async def process_row(row: EvaluationRow) -> EvaluationRow: row.execution_metadata.duration_seconds = time.perf_counter() - start_time default_logger.log(row) + if os.getenv("EP_DEBUG", "0").strip() == "1": + try: + print( + f"[EP-Debug] Logged row to EP: rollout_id={row.execution_metadata.rollout_id}, invoc_id={row.execution_metadata.invocation_id}, msg_count={len(row.messages)}" + ) + except Exception: + pass return row semaphore = config.semaphore diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py index a5225857..d41320ca 100644 --- a/eval_protocol/quickstart/llm_judge.py +++ b/eval_protocol/quickstart/llm_judge.py @@ -2,6 +2,7 @@ Default LLM judge for Eval Protocol. Inspired by Arena-Hard-Auto. """ +import os from typing import Optional from eval_protocol.models import EvaluationRow, EvaluateResult, MetricResult @@ -85,6 +86,15 @@ async def aha_judge( # Upload score to adapter if provided if adapter and row.evaluation_result and row.evaluation_result.is_score_valid: model_name = row.input_metadata.completion_params.get("model", "unknown_model") - adapter.upload_score(row, model_name) + try: + if os.getenv("EP_DEBUG", "0").strip() == "1": + print( + f"[EP-Debug] Uploading score to Langfuse: model={model_name}, score={row.evaluation_result.score}" + ) + adapter.upload_score(row, model_name) + if os.getenv("EP_DEBUG", "0").strip() == "1": + print("[EP-Debug] Upload score success") + except Exception as e: + print(f"[EP-Debug] Upload score failed: {repr(e)}") return row diff --git a/eval_protocol/quickstart/llm_judge_langfuse_fireworks_only.py b/eval_protocol/quickstart/llm_judge_langfuse_fireworks_only.py new file mode 100644 index 00000000..7b439e5e --- /dev/null +++ b/eval_protocol/quickstart/llm_judge_langfuse_fireworks_only.py @@ -0,0 +1,65 @@ +"""Evaluate Langfuse traces with Fireworks-only rollout (no LiteLLM router). + +This uses SingleTurnRolloutProcessor to call Fireworks directly via the +litellm client (not the proxy server) and then runs the AHA judge (also on +Fireworks by default). Scores are pushed back to Langfuse. +""" + +from datetime import datetime +import os + +import pytest + +from eval_protocol import ( + DynamicDataLoader, + EvaluationRow, + SingleTurnRolloutProcessor, + aha_judge, + create_langfuse_adapter, + evaluation_test, + multi_turn_assistant_to_ground_truth, +) + + +def langfuse_fireworks_data_generator() -> list[EvaluationRow]: + adapter = create_langfuse_adapter() + return adapter.get_evaluation_rows( + environment=os.getenv("LANGFUSE_ENVIRONMENT", "local"), + limit=int(os.getenv("LANGFUSE_LIMIT", "100")), + sample_size=int(os.getenv("LANGFUSE_SAMPLE_SIZE", "20")), + include_tool_calls=bool(int(os.getenv("LANGFUSE_INCLUDE_TOOL_CALLS", "1"))), + sleep_between_gets=float(os.getenv("LANGFUSE_SLEEP", "0.5")), + max_retries=int(os.getenv("LANGFUSE_MAX_RETRIES", "3")), + from_timestamp=None, + to_timestamp=datetime.utcnow(), + ) + + +@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") +@pytest.mark.skipif( + not os.getenv("FIREWORKS_API_KEY"), + reason="Requires FIREWORKS_API_KEY", +) +@pytest.mark.parametrize( + "completion_params", + [ + { + "model": os.getenv("FIREWORKS_COMPLETION_MODEL", "accounts/fireworks/models/kimi-k2-instruct"), + "api_key": os.getenv("FIREWORKS_API_KEY"), + "base_url": os.getenv("FIREWORKS_BASE_URL", "https://api.fireworks.ai/inference/v1"), + "temperature": float(os.getenv("FIREWORKS_TEMPERATURE", "0.2")), + "max_tokens": int(os.getenv("FIREWORKS_MAX_TOKENS", "2048")), + }, + ], +) +@evaluation_test( + data_loaders=DynamicDataLoader( + generators=[langfuse_fireworks_data_generator], + preprocess_fn=multi_turn_assistant_to_ground_truth, + ), + rollout_processor=SingleTurnRolloutProcessor(), + max_concurrent_evaluations=int(os.getenv("FIREWORKS_MAX_CONCURRENCY", "2")), +) +async def test_llm_judge_fireworks_only(row: EvaluationRow) -> EvaluationRow: + adapter = create_langfuse_adapter() + return await aha_judge(row, adapter=adapter) diff --git a/eval_protocol/quickstart/llm_judge_langfuse_local.py b/eval_protocol/quickstart/llm_judge_langfuse_local.py new file mode 100644 index 00000000..161a0e2a --- /dev/null +++ b/eval_protocol/quickstart/llm_judge_langfuse_local.py @@ -0,0 +1,212 @@ +"""Fully local Langfuse + LiteLLM example with Fireworks judge. + +This example shows how to evaluate local model responses (served via a local +LiteLLM router in front of `ollama` and/or `llama.cpp`) using the default +Arena-Hard-Auto ("aha") judge, which runs on Fireworks. Traces are pulled from +your self-hosted Langfuse instance using the built-in adapter. + +Prerequisites +------------- +1. Start Langfuse locally and export the usual environment variables so the + SDK can connect:: + + docker compose up -d + export LANGFUSE_PUBLIC_KEY=local + export LANGFUSE_SECRET_KEY=local + export LANGFUSE_HOST=http://localhost:3000 + + Replace the credentials with whatever you configured for your local + deployment. + +2. Launch the model backends. The example below assumes: + + * ``ollama`` is running on ``http://127.0.0.1:11434`` with the model + ``llama3.1`` pulled. + * A ``llama.cpp`` server is running on ``http://127.0.0.1:8080`` that serves + ``Meta-Llama-3-8B-Instruct`` (adjust the path/model name for your set-up). + +3. Start a LiteLLM router that proxies both backends. Save the following to + ``litellm-config.yaml`` (change model names as desired):: + + model_list: + - model_name: "judge/llama3.1" + litellm_params: + model: "ollama/llama3.1" + api_base: "http://127.0.0.1:11434" + - model_name: "candidate/llama3.8b" + litellm_params: + model: "llama.cpp" + api_base: "http://127.0.0.1:8080/v1" + model_path: "/path/to/Meta-Llama-3-8B-Instruct.gguf" + + litellm_settings: + drop_params: true + telemetry: false + + Then launch the router:: + + export LITELLM_API_KEY=local-demo-key + litellm --config litellm-config.yaml --port 4000 + +4. Export your Fireworks credentials for the LLM judge:: + + export FIREWORKS_API_KEY=... # required for the judge + # optional if using organization-scoped models + export FIREWORKS_ACCOUNT_ID=... + +5. Point the example at the router. The defaults below expect the router on + ``http://127.0.0.1:4000`` and use ``judge/llama3.1`` as the judge model. + Override them via ``LITELLM_BASE_URL`` and ``LOCAL_JUDGE_MODEL`` if your + configuration is different. + +Running the example +------------------- +With the services running, execute:: + + pytest eval_protocol/quickstart/llm_judge_langfuse_local.py -k test_llm_judge_local + +The test will fetch traces from the local Langfuse instance, convert each +assistant turn into an ``EvaluationRow``, and score them with the local judge. +""" + +from datetime import datetime +import os + +import pytest + +from eval_protocol import ( + DynamicDataLoader, + EvaluationRow, + SingleTurnRolloutProcessor, + aha_judge, + create_langfuse_adapter, + evaluation_test, + multi_turn_assistant_to_ground_truth, +) +from eval_protocol.quickstart.utils import assistant_to_ground_truth +# Note: We keep the default aha judge (Fireworks) from utils.JUDGE_CONFIGS. + +# --------------------------------------------------------------------------- +# Force direct Ollama usage (no LiteLLM router) for this example +# --------------------------------------------------------------------------- +# Avoid unexpected input param overrides in local runs +os.environ.pop("EP_INPUT_PARAMS_JSON", None) + +# --------------------------------------------------------------------------- +# Hardcoded local configuration (no env required for models/routing) +# --------------------------------------------------------------------------- +OLLAMA_BASE_URL = "http://127.0.0.1:11434" +OLLAMA_MODELS = [ + "ollama/llama3.1", +] +LANGFUSE_TAGS = ["chinook_sql"] +LANGFUSE_LIMIT = 200 +LANGFUSE_SAMPLE_SIZE = 20 +LANGFUSE_SLEEP_BETWEEN_GETS = 1.0 +LANGFUSE_MAX_RETRIES = 6 +LANGFUSE_HOURS_BACK = 48 + + +# --------------------------------------------------------------------------- +# Data loading helpers +# --------------------------------------------------------------------------- +def langfuse_local_data_generator() -> list[EvaluationRow]: + """Fetch evaluation rows from a local Langfuse deployment.""" + + adapter = create_langfuse_adapter() + print("[EP-Debug] Pulling rows from Langfuse with hardcoded config:") + print( + f" tags={LANGFUSE_TAGS}, limit={LANGFUSE_LIMIT}, sample_size={LANGFUSE_SAMPLE_SIZE}, include_tool_calls=True" + ) + + rows = adapter.get_evaluation_rows( + environment=None, + tags=LANGFUSE_TAGS, + limit=LANGFUSE_LIMIT, + sample_size=LANGFUSE_SAMPLE_SIZE, + include_tool_calls=True, + sleep_between_gets=LANGFUSE_SLEEP_BETWEEN_GETS, + max_retries=LANGFUSE_MAX_RETRIES, + hours_back=LANGFUSE_HOURS_BACK, + from_timestamp=None, + to_timestamp=datetime.utcnow(), + ) + print(f"[EP-Debug] Langfuse adapter returned rows (preprocess pending): {len(rows)}") + return rows + + +def _preprocess_rows(data: list[EvaluationRow]) -> list[EvaluationRow]: + """Mirror quickstart pattern: run multi_turn split, then drop empties with debug.""" + split_rows = multi_turn_assistant_to_ground_truth(data) + print(f"[EP-Debug] After multi_turn_assistant_to_ground_truth: {len(split_rows)} rows") + + # Keep only rows that have at least one message before assistant turn + filtered = [r for r in split_rows if r.messages and len(r.messages) > 0] + if len(filtered) != len(split_rows): + print(f"[EP-Debug] Dropped {len(split_rows) - len(filtered)} rows with empty messages after split") + + # Show a small sample for inspection + for r in filtered[:2]: + try: + roles = [m.role for m in r.messages] + gt_repr = str(r.ground_truth or "") + print(f"[EP-Debug] Row sample: msg_count={len(r.messages)} roles={roles} gt_len={len(gt_repr)}") + except Exception: + pass + if filtered: + return filtered + + # Fallback: use last assistant as ground truth without split + print("[EP-Debug] Fallback preprocess: applying assistant_to_ground_truth") + fallback_rows = assistant_to_ground_truth(data) + fallback_filtered = [r for r in fallback_rows if r.messages and len(r.messages) > 0] + if len(fallback_filtered) != len(fallback_rows): + print(f"[EP-Debug] Fallback dropped {len(fallback_rows) - len(fallback_filtered)} rows with empty messages") + for r in fallback_filtered[:2]: + try: + roles = [m.role for m in r.messages] + gt_repr = str(r.ground_truth or "") + print(f"[EP-Debug] Fallback sample: msg_count={len(r.messages)} roles={roles} gt_len={len(gt_repr)}") + except Exception: + pass + return fallback_filtered + + +# Hardcoded completion params for local Ollama via LiteLLM SDK (no proxy) +_PARAMS = [ + { + "model": m, + "base_url": OLLAMA_BASE_URL, + "extra_body": {"stream": False}, + } + for m in OLLAMA_MODELS +] + + +@pytest.mark.parametrize("completion_params", _PARAMS) +@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip local example in CI") +@pytest.mark.skipif( + not os.getenv("LANGFUSE_PUBLIC_KEY") or not os.getenv("LANGFUSE_SECRET_KEY"), + reason="LANGFUSE credentials not configured", +) +@evaluation_test( + data_loaders=DynamicDataLoader( + generators=[langfuse_local_data_generator], + preprocess_fn=_preprocess_rows, + ), + rollout_processor=SingleTurnRolloutProcessor(), + max_concurrent_evaluations=1, +) +async def test_llm_judge_local(row: EvaluationRow) -> EvaluationRow: + """Evaluate one Langfuse trace row with the local aha judge.""" + # Use default Fireworks-based judge and push score back to Langfuse + adapter = create_langfuse_adapter() + if os.getenv("EP_DEBUG", "0").strip() == "1": + try: + cp = row.input_metadata.completion_params + print( + f"[EP-Debug] Starting judge for row: rollout_id={row.execution_metadata.rollout_id}, model={cp.get('model') if cp else 'n/a'}" + ) + except Exception: + pass + return await aha_judge(row, adapter=adapter) diff --git a/examples/local_langfuse_litellm_ollama/README.md b/examples/local_langfuse_litellm_ollama/README.md new file mode 100644 index 00000000..27f1e217 --- /dev/null +++ b/examples/local_langfuse_litellm_ollama/README.md @@ -0,0 +1,160 @@ +### Local Langfuse + Fireworks Judge (optionally LiteLLM/Ollama) + +This guide runs a local evaluation loop with: + +- Local Langfuse via a compose file included in this repo +- Eval Protocol to pull traces and score outputs +- Fireworks-hosted LLM as the judge (accurate scoring) +- Optional: LiteLLM router in front of local backends (Ollama / llama.cpp) + +References: [Langfuse Docker Compose](https://langfuse.com/self-hosting/deployment/docker-compose) + +--- + +#### 1) Start Langfuse from the included compose file + +```bash +# From repo root +docker compose -f examples/local_langfuse_litellm_ollama/langfuse-docker-compose.yml up -d +``` + +Export Langfuse credentials for the SDK: + +```bash +export LANGFUSE_PUBLIC_KEY=local +export LANGFUSE_SECRET_KEY=local +export LANGFUSE_HOST=http://localhost:3000 +export LANGFUSE_ENVIRONMENT=local +``` + +Open the UI at `http://localhost:3000`. + +--- + +#### 2) Launch local inference backends + +Option A: Ollama + +```bash +ollama serve & +ollama pull llama3.1 +``` + +Option B: llama.cpp (OpenAI-compatible server) + +```bash +# Example; adjust paths/ports/model +./server -m /path/to/Meta-Llama-3-8B-Instruct.gguf -c 8192 -ngl 33 -a 127.0.0.1 -p 8080 +``` + +--- + +#### 3) Start a LiteLLM router in front of local backends + +Create `litellm-config.yaml`: + +```yaml +model_list: + - model_name: "candidate/llama3.8b" + litellm_params: + model: "llama.cpp" + api_base: "http://127.0.0.1:8080/v1" + model_path: "/path/to/Meta-Llama-3-8B-Instruct.gguf" + - model_name: "ollama/llama3.1" + litellm_params: + model: "ollama/llama3.1" + api_base: "http://127.0.0.1:11434" + +litellm_settings: + drop_params: true + telemetry: false +``` + +Run the router: + +```bash +export LITELLM_API_KEY=local-demo-key +litellm --config litellm-config.yaml --port 4000 +``` + +Smoke test the router: + +```bash +curl -s -H "Authorization: Bearer $LITELLM_API_KEY" http://127.0.0.1:4000/v1/models | jq . +curl -s \ + -H "Authorization: Bearer $LITELLM_API_KEY" \ + -H "Content-Type: application/json" \ + http://127.0.0.1:4000/v1/chat/completions \ + -d '{"model":"ollama/llama3.1","messages":[{"role":"user","content":"Say hi"}]}' \ +| jq -r '.choices[0].message.content' +``` + +--- + +#### 4) Seed traces into Langfuse (consolidated example) + +Use the Chinook generator with PydanticAgentRolloutProcessor (no external DB required by default): + +```bash +export FIREWORKS_API_KEY=... +export CHINOOK_USE_STUB_DB=1 +make -C . local-generate-chinook +``` + +Verify adapter connectivity: + +```bash +make -C . local-adapter-smoke +``` + +--- + +#### 5) Install Eval Protocol with Langfuse extras + +```bash +uv pip install -e ".[langfuse]" # or: pip install 'eval-protocol[langfuse]' +``` + +Ensure Fireworks credentials are set for the judge: + +```bash +export FIREWORKS_API_KEY=... # required for judge +# optional depending on your account setup +export FIREWORKS_ACCOUNT_ID=... +``` + +--- + +#### 6) Run evaluation (Fireworks-only) + +```bash +export FIREWORKS_API_KEY=... +make -C . local-eval-fireworks-only +``` + +This pulls traces from Langfuse, runs the rollout on Fireworks, judges results on Fireworks, and pushes scores back to Langfuse. + +--- + +#### 7) View results in Langfuse + +- Open a trace and look for the evaluation score created by the run. +- Compare scores across candidate models to pick the best local model for your app. + +--- + +#### Troubleshooting + +- Langfuse not reachable: verify `LANGFUSE_HOST` and Docker health; see [Langfuse Docker Compose](https://langfuse.com/self-hosting/deployment/docker-compose) +- Judge errors: verify `FIREWORKS_API_KEY` and network access. You can switch judge model in `eval_protocol/quickstart/utils.py`. +- No results in EP UI at `http://localhost:8000`: ensure the logs server is running (`ep logs`), and that rows are being persisted under `.eval_protocol/logs.db`. With `EP_DEBUG=1`, the run prints `[EP-Debug] Logged row to EP: ...` lines. +- Ollama not being called: for direct mode, set `DIRECT_OLLAMA=1` and `OLLAMA_BASE_URL`; the run prints `[EP-Debug] DIRECT_OLLAMA=1 -> Calling Ollama: base=..., model=...`. For router mode, unset `DIRECT_OLLAMA` and confirm `LITELLM_BASE_URL` and `LITELLM_API_KEY`. +- Scores not appearing back in Langfuse: verify `FIREWORKS_API_KEY` and that the judge model can complete. With `EP_DEBUG=1`, you should see `[EP-Debug] Uploading score to Langfuse` and `Upload score success` messages. + +--- + +#### What’s happening under the hood + +- `LangfuseAdapter` pulls traces and converts them to `EvaluationRow` +- `PydanticAgentRolloutProcessor` runs the agent and logs traces +- `SingleTurnRolloutProcessor` + `aha_judge` evaluate and push scores to Langfuse diff --git a/examples/local_langfuse_litellm_ollama/generate_langgraph_traces.py b/examples/local_langfuse_litellm_ollama/generate_langgraph_traces.py new file mode 100644 index 00000000..71ed1d15 --- /dev/null +++ b/examples/local_langfuse_litellm_ollama/generate_langgraph_traces.py @@ -0,0 +1,91 @@ +import asyncio +import os +from typing import Any, Dict, List + +from langfuse import get_client + + +def _to_chatml_messages(messages: List[Any]) -> List[Dict[str, Any]]: + out: List[Dict[str, Any]] = [] + for m in messages: + role = getattr(m, "type", None) or getattr(m, "role", None) + if role == "ai" or role == "assistant": + entry: Dict[str, Any] = {"role": "assistant", "content": getattr(m, "content", "")} + tcs = getattr(m, "tool_calls", None) + if tcs: + try: + entry["tool_calls"] = [ + { + "id": tc.id, + "type": getattr(tc, "type", "function"), + "function": { + "name": tc.function.name, + "arguments": tc.function.arguments, + }, + } + for tc in tcs + ] + except Exception: + pass + out.append(entry) + elif role == "tool": + out.append( + { + "role": "tool", + "name": getattr(m, "name", None), + "tool_call_id": getattr(m, "tool_call_id", None), + "content": getattr(m, "content", ""), + } + ) + elif role == "human" or role == "user": + out.append({"role": "user", "content": getattr(m, "content", "")}) + return out + + +async def main() -> None: + # Lazy import to avoid hard deps unless used + import sys + import pathlib + + repo_root = pathlib.Path(__file__).resolve().parents[2] + sys.path.append(str(repo_root)) + from examples.langgraph.tools_graph import build_tools_graph + from langchain_core.messages import HumanMessage + + num = int(os.environ.get("LANGGRAPH_TRACE_COUNT", "10")) + lf = get_client() + app = build_tools_graph() + + prompts = [ + "Use calculator_add to add 2 and 3", + "Calculate 5 + 7", + "What is 10 + 1?", + "Add 8 and 9", + "Tool test: 4 plus 4", + ] + + for i in range(num): + prompt = prompts[i % len(prompts)] + # Create input in ChatML-like form + input_msgs = [{"role": "user", "content": prompt}] + + # Invoke graph and build output ChatML messages + result = await app.ainvoke({"messages": [HumanMessage(content=prompt)]}) + output_msgs = _to_chatml_messages(result.get("messages", [])) + + # Create trace with input/output for adapter to parse + trace_id = lf.create_trace_id() + from langfuse.types import TraceContext + + ctx = TraceContext(trace_id=trace_id) + # Create concrete events to ensure ingestion attaches to this trace + lf.create_event(trace_context=ctx, name="input", input={"messages": input_msgs}) + lf.create_event(trace_context=ctx, name="assistant", output={"messages": output_msgs}) + # Also set top-level trace metadata for adapter context + lf.update_current_trace(name="langgraph-demo") + lf.flush() + print("Created langgraph trace:", trace_id) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/local_langfuse_litellm_ollama/generate_synthetic_traces.py b/examples/local_langfuse_litellm_ollama/generate_synthetic_traces.py new file mode 100644 index 00000000..5c194ba9 --- /dev/null +++ b/examples/local_langfuse_litellm_ollama/generate_synthetic_traces.py @@ -0,0 +1,49 @@ +import os +import random +import time +from typing import List + +from langfuse import get_client +from langfuse.types import TraceContext + + +def _random_prompt(i: int) -> str: + prompts = [ + "Summarize the benefits of local inference.", + "What is 2+2?", + "Explain how LiteLLM routes requests.", + "Give a short description of the Chinook sample database.", + "List three ways to evaluate model quality.", + ] + return prompts[i % len(prompts)] + + +def create_trace(lf, user_text: str, assistant_text: str, tags: List[str]) -> str: + trace_id = lf.create_trace_id() + ctx = TraceContext(trace_id=trace_id) + # Attach input to trace + lf.update_current_trace( + name="local-synth", tags=tags, input={"messages": [{"role": "user", "content": user_text}]} + ) + # Add a generation observation for the assistant reply + lf.start_observation(trace_context=ctx, as_type="generation", name="assistant") + lf.update_current_generation(output={"messages": [{"role": "assistant", "content": assistant_text}]}) + lf.flush() + return trace_id + + +def main() -> None: + count = int(os.environ.get("SYNTHETIC_TRACE_COUNT", "25")) + lf = get_client() + tags = ["local", "demo", "synthetic"] + + for i in range(count): + user_q = _random_prompt(i) + assistant_a = f"Synthetic response {i}: {random.choice(['Sure.', 'Okay.', 'Here you go.', 'Result: 4'])}" + tid = create_trace(lf, user_q, assistant_a, tags) + print(f"Created synthetic trace: {tid}") + time.sleep(0.1) + + +if __name__ == "__main__": + main() diff --git a/examples/local_langfuse_litellm_ollama/langfuse-docker-compose.yml b/examples/local_langfuse_litellm_ollama/langfuse-docker-compose.yml new file mode 100644 index 00000000..3393e339 --- /dev/null +++ b/examples/local_langfuse_litellm_ollama/langfuse-docker-compose.yml @@ -0,0 +1,164 @@ +# Local Langfuse docker compose (copied from upstream; adjust CHANGEME values for production) +# Source reference: https://langfuse.com/self-hosting/deployment/docker-compose +services: + langfuse-worker: + image: docker.io/langfuse/langfuse-worker:3 + restart: always + depends_on: &langfuse-depends-on + postgres: + condition: service_healthy + minio: + condition: service_healthy + redis: + condition: service_healthy + clickhouse: + condition: service_healthy + ports: + - 127.0.0.1:3030:3030 + environment: &langfuse-worker-env + NEXTAUTH_URL: http://localhost:3000 + DATABASE_URL: postgresql://postgres:postgres@postgres:5432/postgres # CHANGEME + SALT: "mysalt" # CHANGEME + ENCRYPTION_KEY: "0000000000000000000000000000000000000000000000000000000000000000" # CHANGEME + TELEMETRY_ENABLED: ${TELEMETRY_ENABLED:-true} + LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES: ${LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES:-true} + CLICKHOUSE_MIGRATION_URL: ${CLICKHOUSE_MIGRATION_URL:-clickhouse://clickhouse:9000} + CLICKHOUSE_URL: ${CLICKHOUSE_URL:-http://clickhouse:8123} + CLICKHOUSE_USER: ${CLICKHOUSE_USER:-clickhouse} + CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD:-clickhouse} # CHANGEME + CLICKHOUSE_CLUSTER_ENABLED: ${CLICKHOUSE_CLUSTER_ENABLED:-false} + LANGFUSE_USE_AZURE_BLOB: ${LANGFUSE_USE_AZURE_BLOB:-false} + LANGFUSE_S3_EVENT_UPLOAD_BUCKET: ${LANGFUSE_S3_EVENT_UPLOAD_BUCKET:-langfuse} + LANGFUSE_S3_EVENT_UPLOAD_REGION: ${LANGFUSE_S3_EVENT_UPLOAD_REGION:-auto} + LANGFUSE_S3_EVENT_UPLOAD_ACCESS_KEY_ID: ${LANGFUSE_S3_EVENT_UPLOAD_ACCESS_KEY_ID:-minio} + LANGFUSE_S3_EVENT_UPLOAD_SECRET_ACCESS_KEY: ${LANGFUSE_S3_EVENT_UPLOAD_SECRET_ACCESS_KEY:-miniosecret} # CHANGEME + LANGFUSE_S3_EVENT_UPLOAD_ENDPOINT: ${LANGFUSE_S3_EVENT_UPLOAD_ENDPOINT:-http://minio:9000} + LANGFUSE_S3_EVENT_UPLOAD_FORCE_PATH_STYLE: ${LANGFUSE_S3_EVENT_UPLOAD_FORCE_PATH_STYLE:-true} + LANGFUSE_S3_EVENT_UPLOAD_PREFIX: ${LANGFUSE_S3_EVENT_UPLOAD_PREFIX:-events/} + LANGFUSE_S3_MEDIA_UPLOAD_BUCKET: ${LANGFUSE_S3_MEDIA_UPLOAD_BUCKET:-langfuse} + LANGFUSE_S3_MEDIA_UPLOAD_REGION: ${LANGFUSE_S3_MEDIA_UPLOAD_REGION:-auto} + LANGFUSE_S3_MEDIA_UPLOAD_ACCESS_KEY_ID: ${LANGFUSE_S3_MEDIA_UPLOAD_ACCESS_KEY_ID:-minio} + LANGFUSE_S3_MEDIA_UPLOAD_SECRET_ACCESS_KEY: ${LANGFUSE_S3_MEDIA_UPLOAD_SECRET_ACCESS_KEY:-miniosecret} # CHANGEME + LANGFUSE_S3_MEDIA_UPLOAD_ENDPOINT: ${LANGFUSE_S3_MEDIA_UPLOAD_ENDPOINT:-http://localhost:9090} + LANGFUSE_S3_MEDIA_UPLOAD_FORCE_PATH_STYLE: ${LANGFUSE_S3_MEDIA_UPLOAD_FORCE_PATH_STYLE:-true} + LANGFUSE_S3_MEDIA_UPLOAD_PREFIX: ${LANGFUSE_S3_MEDIA_UPLOAD_PREFIX:-media/} + LANGFUSE_S3_BATCH_EXPORT_ENABLED: ${LANGFUSE_S3_BATCH_EXPORT_ENABLED:-false} + LANGFUSE_S3_BATCH_EXPORT_BUCKET: ${LANGFUSE_S3_BATCH_EXPORT_BUCKET:-langfuse} + LANGFUSE_S3_BATCH_EXPORT_PREFIX: ${LANGFUSE_S3_BATCH_EXPORT_PREFIX:-exports/} + LANGFUSE_S3_BATCH_EXPORT_REGION: ${LANGFUSE_S3_BATCH_EXPORT_REGION:-auto} + LANGFUSE_S3_BATCH_EXPORT_ENDPOINT: ${LANGFUSE_S3_BATCH_EXPORT_ENDPOINT:-http://minio:9000} + LANGFUSE_S3_BATCH_EXPORT_EXTERNAL_ENDPOINT: ${LANGFUSE_S3_BATCH_EXPORT_EXTERNAL_ENDPOINT:-http://localhost:9090} + LANGFUSE_S3_BATCH_EXPORT_ACCESS_KEY_ID: ${LANGFUSE_S3_BATCH_EXPORT_ACCESS_KEY_ID:-minio} + LANGFUSE_S3_BATCH_EXPORT_SECRET_ACCESS_KEY: ${LANGFUSE_S3_BATCH_EXPORT_SECRET_ACCESS_KEY:-miniosecret} # CHANGEME + LANGFUSE_S3_BATCH_EXPORT_FORCE_PATH_STYLE: ${LANGFUSE_S3_BATCH_EXPORT_FORCE_PATH_STYLE:-true} + LANGFUSE_INGESTION_QUEUE_DELAY_MS: ${LANGFUSE_INGESTION_QUEUE_DELAY_MS:-} + LANGFUSE_INGESTION_CLICKHOUSE_WRITE_INTERVAL_MS: ${LANGFUSE_INGESTION_CLICKHOUSE_WRITE_INTERVAL_MS:-} + REDIS_HOST: ${REDIS_HOST:-redis} + REDIS_PORT: ${REDIS_PORT:-6379} + REDIS_AUTH: ${REDIS_AUTH:-myredissecret} # CHANGEME + REDIS_TLS_ENABLED: ${REDIS_TLS_ENABLED:-false} + REDIS_TLS_CA: ${REDIS_TLS_CA:-/certs/ca.crt} + REDIS_TLS_CERT: ${REDIS_TLS_CERT:-/certs/redis.crt} + REDIS_TLS_KEY: ${REDIS_TLS_KEY:-/certs/redis.key} + EMAIL_FROM_ADDRESS: ${EMAIL_FROM_ADDRESS:-} + SMTP_CONNECTION_URL: ${SMTP_CONNECTION_URL:-} + + langfuse-web: + image: docker.io/langfuse/langfuse:3 + restart: always + depends_on: *langfuse-depends-on + ports: + - 3000:3000 + environment: + <<: *langfuse-worker-env + NEXTAUTH_SECRET: mysecret # CHANGEME + LANGFUSE_INIT_ORG_ID: ${LANGFUSE_INIT_ORG_ID:-} + LANGFUSE_INIT_ORG_NAME: ${LANGFUSE_INIT_ORG_NAME:-} + LANGFUSE_INIT_PROJECT_ID: ${LANGFUSE_INIT_PROJECT_ID:-} + LANGFUSE_INIT_PROJECT_NAME: ${LANGFUSE_INIT_PROJECT_NAME:-} + LANGFUSE_INIT_PROJECT_PUBLIC_KEY: ${LANGFUSE_INIT_PROJECT_PUBLIC_KEY:-} + LANGFUSE_INIT_PROJECT_SECRET_KEY: ${LANGFUSE_INIT_PROJECT_SECRET_KEY:-} + LANGFUSE_INIT_USER_EMAIL: ${LANGFUSE_INIT_USER_EMAIL:-} + LANGFUSE_INIT_USER_NAME: ${LANGFUSE_INIT_USER_NAME:-} + LANGFUSE_INIT_USER_PASSWORD: ${LANGFUSE_INIT_USER_PASSWORD:-} + + clickhouse: + image: docker.io/clickhouse/clickhouse-server + restart: always + user: "101:101" + environment: + CLICKHOUSE_DB: default + CLICKHOUSE_USER: clickhouse + CLICKHOUSE_PASSWORD: clickhouse # CHANGEME + volumes: + - langfuse_clickhouse_data:/var/lib/clickhouse + - langfuse_clickhouse_logs:/var/log/clickhouse-server + ports: + - 127.0.0.1:8123:8123 + - 127.0.0.1:9000:9000 + healthcheck: + test: wget --no-verbose --tries=1 --spider http://localhost:8123/ping || exit 1 + interval: 5s + timeout: 5s + retries: 10 + start_period: 1s + + minio: + image: docker.io/minio/minio + restart: always + entrypoint: sh + command: -c 'mkdir -p /data/langfuse && minio server --address ":9000" --console-address ":9001" /data' + environment: + MINIO_ROOT_USER: minio + MINIO_ROOT_PASSWORD: miniosecret # CHANGEME + ports: + - 9090:9000 + - 127.0.0.1:9091:9001 + volumes: + - langfuse_minio_data:/data + healthcheck: + test: ["CMD", "mc", "ready", "local"] + interval: 1s + timeout: 5s + retries: 5 + start_period: 1s + + redis: + image: docker.io/redis:7 + restart: always + command: > + --requirepass ${REDIS_AUTH:-myredissecret} + ports: + - 127.0.0.1:6379:6379 + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 3s + timeout: 10s + retries: 10 + + postgres: + image: docker.io/postgres:${POSTGRES_VERSION:-latest} + restart: always + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 3s + timeout: 3s + retries: 10 + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres # CHANGEME + POSTGRES_DB: postgres + ports: + - 127.0.0.1:5432:5432 + volumes: + - langfuse_postgres_data:/var/lib/postgresql/data + +volumes: + langfuse_postgres_data: + driver: local + langfuse_clickhouse_data: + driver: local + langfuse_clickhouse_logs: + driver: local + langfuse_minio_data: + driver: local diff --git a/examples/local_langfuse_litellm_ollama/litellm-config.yaml b/examples/local_langfuse_litellm_ollama/litellm-config.yaml new file mode 100644 index 00000000..d99132b3 --- /dev/null +++ b/examples/local_langfuse_litellm_ollama/litellm-config.yaml @@ -0,0 +1,14 @@ +model_list: + - model_name: "candidate/llama3.8b" + litellm_params: + model: "llama.cpp" + api_base: "http://127.0.0.1:8080/v1" + model_path: "/path/to/Meta-Llama-3-8B-Instruct.gguf" + - model_name: "ollama/llama3.1" + litellm_params: + model: "ollama/llama3.1" + api_base: "http://127.0.0.1:11434" + +litellm_settings: + drop_params: true + telemetry: false diff --git a/examples/local_langfuse_litellm_ollama/seed_langfuse.py b/examples/local_langfuse_litellm_ollama/seed_langfuse.py new file mode 100644 index 00000000..4b466115 --- /dev/null +++ b/examples/local_langfuse_litellm_ollama/seed_langfuse.py @@ -0,0 +1,21 @@ +from langfuse import get_client +from langfuse.types import TraceContext + + +def main() -> None: + lf = get_client() + trace_id = lf.create_trace_id() + ctx = TraceContext(trace_id=trace_id) + lf.update_current_trace( + name="local-demo", + tags=["local", "demo"], + input={"messages": [{"role": "user", "content": "What is 2+2?"}]}, + ) + lf.start_generation(trace_context=ctx, name="final") + lf.update_current_generation(output={"messages": [{"role": "assistant", "content": "It is 4."}]}) + lf.flush() + print("Created trace:", trace_id) + + +if __name__ == "__main__": + main() diff --git a/tests/chinook/pydantic/agent.py b/tests/chinook/pydantic/agent.py index 2b260fd4..de0f6b11 100644 --- a/tests/chinook/pydantic/agent.py +++ b/tests/chinook/pydantic/agent.py @@ -7,11 +7,47 @@ import os sys.path.append(os.path.join(os.path.dirname(__file__), "..")) -from db import connect_database + + +def _maybe_connect_database(): + """Connect to Chinook DB unless disabled via env. + + If CHINOOK_USE_STUB_DB=1 or connection fails, return a stub connection, + a stub cursor and a minimal introspection result that includes a tracks table. + """ + use_stub = os.getenv("CHINOOK_USE_STUB_DB") == "1" + if not use_stub: + try: + from db import connect_database # local import to avoid hard dep if stub + + return connect_database() + except Exception: + # Fall back to stub on any connection issue + pass + + class _StubConn: + def rollback(self): + pass + + class _StubCursor: + def __init__(self): + self.description = [("count",)] + self._rows = [(3503,)] # expected Chinook track count in examples + + def execute(self, _query: str): + # no-op; preset rows + return None + + def fetchall(self): + return self._rows + + # Minimal schema rows: (table_name, column_name, data_type, is_nullable) + introspection = [("tracks", "TrackId", "INTEGER", "NO")] + return _StubConn(), _StubCursor(), introspection def setup_agent(orchestrator_agent_model: Model): - connection, cursor, introspection_result = connect_database() + connection, cursor, introspection_result = _maybe_connect_database() introspection_result_str = "\n".join([",".join(map(str, item)) for item in introspection_result])