2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "uipath"
version = "2.3.0"
version = "2.3.1"
description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools."
readme = { file = "README.md", content-type = "text/markdown" }
requires-python = ">=3.11"
57 changes: 48 additions & 9 deletions src/uipath/_cli/_evals/_evaluator_factory.py
@@ -1,4 +1,5 @@
import importlib.util
import logging
import sys
from pathlib import Path
from typing import Any
@@ -68,6 +69,8 @@
ToolCallOutputEvaluatorConfig,
)

logger = logging.getLogger(__name__)


class EvaluatorFactory:
"""Factory class for creating evaluator instances based on configuration."""
@@ -106,12 +109,15 @@ def _prepare_evaluator_config(data: dict[str, Any]) -> dict[str, Any]:

@classmethod
def create_evaluator(
cls, data: dict[str, Any], evaluators_dir: Path | None = None
cls,
data: dict[str, Any],
evaluators_dir: Path | None = None,
agent_model: str | None = None,
) -> BaseEvaluator[Any, Any, Any]:
if data.get("version", None) == "1.0":
return cls._create_evaluator_internal(data, evaluators_dir)
else:
return cls._create_legacy_evaluator_internal(data)
return cls._create_legacy_evaluator_internal(data, agent_model)

@staticmethod
def _create_evaluator_internal(
@@ -371,11 +377,14 @@ def _create_llm_judge_simulation_trajectory_evaluator(
@staticmethod
def _create_legacy_evaluator_internal(
data: dict[str, Any],
agent_model: str | None = None,
) -> LegacyBaseEvaluator[Any]:
"""Create an evaluator instance from configuration data.

Args:
data: Dictionary containing evaluator configuration from JSON file
agent_model: Optional model name from agent settings for resolving
'same-as-agent' model configuration

Returns:
Appropriate evaluator instance based on category
@@ -391,9 +400,13 @@ def _create_legacy_evaluator_internal(
case JsonSimilarityEvaluatorParams():
return EvaluatorFactory._create_legacy_json_similarity_evaluator(params)
case LLMEvaluatorParams():
return EvaluatorFactory._create_legacy_llm_as_judge_evaluator(params)
return EvaluatorFactory._create_legacy_llm_as_judge_evaluator(
params, agent_model
)
case TrajectoryEvaluatorParams():
return EvaluatorFactory._create_legacy_trajectory_evaluator(params)
return EvaluatorFactory._create_legacy_trajectory_evaluator(
params, agent_model
)
case _:
raise ValueError(f"Unknown evaluator category: {params}")

@@ -414,33 +427,59 @@ def _create_legacy_json_similarity_evaluator(
@staticmethod
def _create_legacy_llm_as_judge_evaluator(
params: LLMEvaluatorParams,
agent_model: str | None = None,
) -> LegacyLlmAsAJudgeEvaluator:
"""Create an LLM-as-a-judge evaluator."""
if not params.prompt:
raise ValueError("LLM evaluator must include 'prompt' field")

if not params.model:
raise ValueError("LLM evaluator must include 'model' field")

# Resolve 'same-as-agent' to actual agent model
if params.model == "same-as-agent":
raise ValueError(
"'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
if not agent_model:
raise ValueError(
"'same-as-agent' model option requires agent settings. "
"Ensure agent.json contains valid model settings."
)
logger.info(
f"Resolving 'same-as-agent' to agent model: {agent_model} "
f"for evaluator '{params.name}'"
)
params = params.model_copy(update={"model": agent_model})

logger.info(
f"Creating LLM-as-judge evaluator '{params.name}' with model: {params.model}"
)
return LegacyLlmAsAJudgeEvaluator(**params.model_dump(), config={})

@staticmethod
def _create_legacy_trajectory_evaluator(
params: TrajectoryEvaluatorParams,
agent_model: str | None = None,
) -> LegacyTrajectoryEvaluator:
"""Create a trajectory evaluator."""
if not params.prompt:
raise ValueError("Trajectory evaluator must include 'prompt' field")

if not params.model:
raise ValueError("LLM evaluator must include 'model' field")
raise ValueError("Trajectory evaluator must include 'model' field")

# Resolve 'same-as-agent' to actual agent model
if params.model == "same-as-agent":
raise ValueError(
"'same-as-agent' model option is not supported by coded agents evaluations. Please select a specific model for the evaluator."
if not agent_model:
raise ValueError(
"'same-as-agent' model option requires agent settings. "
"Ensure agent.json contains valid model settings."
)
logger.info(
f"Resolving 'same-as-agent' to agent model: {agent_model} "
f"for evaluator '{params.name}'"
)
params = params.model_copy(update={"model": agent_model})

logger.info(
f"Creating trajectory evaluator '{params.name}' with model: {params.model}"
)
return LegacyTrajectoryEvaluator(**params.model_dump(), config={})
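
Behaviorally, the change to both legacy evaluator constructors reduces to the resolution rule sketched below. This is a standalone illustration only: `resolve_evaluator_model` is not part of the SDK and the model names are placeholders; only the `same-as-agent` sentinel and the error message mirror the diff.

```python
from typing import Optional


def resolve_evaluator_model(configured_model: str, agent_model: Optional[str]) -> str:
    """Standalone mirror of the 'same-as-agent' branch added to the legacy
    LLM-as-judge and trajectory evaluator constructors (illustrative helper,
    not an SDK function)."""
    if configured_model == "same-as-agent":
        if not agent_model:
            raise ValueError(
                "'same-as-agent' model option requires agent settings. "
                "Ensure agent.json contains valid model settings."
            )
        return agent_model        # resolved to the agent's configured model
    return configured_model       # explicit evaluator models pass through unchanged


# Resolution succeeds when the eval runtime could read a model from agent settings.
assert resolve_evaluator_model("same-as-agent", "gpt-4o-2024-11-20") == "gpt-4o-2024-11-20"

# Explicit evaluator models are left untouched.
assert resolve_evaluator_model("gpt-4.1-mini", "gpt-4o-2024-11-20") == "gpt-4.1-mini"

# Without agent settings, the sentinel is rejected with the new error message.
try:
    resolve_evaluator_model("same-as-agent", None)
except ValueError as exc:
    print(exc)
```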
117 changes: 107 additions & 10 deletions src/uipath/_cli/_evals/_runtime.py
@@ -5,7 +5,16 @@
from contextlib import contextmanager
from pathlib import Path
from time import time
from typing import Any, Awaitable, Iterable, Iterator, Sequence, Tuple
from typing import (
Any,
Awaitable,
Iterable,
Iterator,
Protocol,
Sequence,
Tuple,
runtime_checkable,
)

import coverage
from opentelemetry import context as context_api
@@ -67,6 +76,27 @@
set_execution_context,
)

logger = logging.getLogger(__name__)


@runtime_checkable
class LLMAgentRuntimeProtocol(Protocol):
"""Protocol for runtimes that can provide agent model information.

Runtimes that implement this protocol can be queried for
the agent's configured LLM model, enabling features like 'same-as-agent'
model resolution for evaluators.
"""

def get_agent_model(self) -> str | None:
"""Return the agent's configured LLM model name.

Returns:
The model name from agent settings (e.g., 'gpt-4o-2024-11-20'),
or None if no model is configured.
"""
...


class ExecutionSpanExporter(SpanExporter):
"""Custom exporter that stores spans grouped by execution ids."""
@@ -180,6 +210,8 @@ def __init__(
self.logs_exporter: ExecutionLogsExporter = ExecutionLogsExporter()
self.execution_id = str(uuid.uuid4())
self.schema: UiPathRuntimeSchema | None = None
self._agent_model: str | None = None
self._metadata_loaded: bool = False
self.coverage = coverage.Coverage(branch=True)

async def __aenter__(self) -> "UiPathEvalRuntime":
@@ -192,14 +224,33 @@ async def __aexit__(self, *args: Any) -> None:
self.coverage.stop()
self.coverage.report(include=["./*"], show_missing=True)

async def get_schema(self) -> UiPathRuntimeSchema:
if not self.schema:
temp_runtime = await self.factory.new_runtime(
entrypoint=self.context.entrypoint or "",
runtime_id="default",
)
async def _ensure_metadata_loaded(self) -> None:
"""Load metadata (schema, agent model) from a single temporary runtime.

This method creates one temporary runtime to fetch both schema and agent
model, avoiding the overhead of creating multiple runtimes for metadata
queries. Results are cached for subsequent access.
"""
if self._metadata_loaded:
return

temp_runtime = await self.factory.new_runtime(
entrypoint=self.context.entrypoint or "",
runtime_id="metadata-query",
)
try:
self.schema = await temp_runtime.get_schema()
self._agent_model = self._find_agent_model_in_runtime(temp_runtime)
if self._agent_model:
logger.debug(f"Got agent model from runtime: {self._agent_model}")
self._metadata_loaded = True
finally:
await temp_runtime.dispose()

async def get_schema(self) -> UiPathRuntimeSchema:
await self._ensure_metadata_loaded()
if self.schema is None:
raise ValueError("Schema could not be loaded")
return self.schema

@contextmanager
@@ -232,7 +283,7 @@ async def initiate_evaluation(
evaluation_set, _ = EvalHelpers.load_eval_set(
self.context.eval_set, self.context.eval_ids
)
evaluators = self._load_evaluators(evaluation_set)
evaluators = await self._load_evaluators(evaluation_set)

await self.event_bus.publish(
EvaluationEvents.CREATE_EVAL_SET_RUN,
@@ -601,7 +652,48 @@ async def run_evaluator(

return result

def _load_evaluators(
async def _get_agent_model(self) -> str | None:
"""Get agent model from the runtime.

Uses the cached metadata from _ensure_metadata_loaded(), which creates
a single temporary runtime to fetch both schema and agent model.

Returns:
The model name from agent settings, or None if not found.
"""
try:
await self._ensure_metadata_loaded()
return self._agent_model
except Exception:
return None

def _find_agent_model_in_runtime(self, runtime: Any) -> str | None:
"""Recursively search for get_agent_model in runtime and its delegates.

Runtimes may be wrapped (e.g., ResumableRuntime wraps TelemetryWrapper
which wraps the base runtime). This method traverses the wrapper chain
to find a runtime that implements LLMAgentRuntimeProtocol.

Args:
runtime: The runtime to check (may be a wrapper)

Returns:
The model name if found, None otherwise.
"""
# Check if this runtime implements the protocol
if isinstance(runtime, LLMAgentRuntimeProtocol):
return runtime.get_agent_model()

# Check for delegate property (used by UiPathResumableRuntime, TelemetryRuntimeWrapper)
Collaborator: I'd prefer adding this method in UiPathResumableRuntime and TelemetryRuntimeWrapper, but we can do this later.

delegate = getattr(runtime, "delegate", None) or getattr(
runtime, "_delegate", None
)
if delegate is not None:
return self._find_agent_model_in_runtime(delegate)

return None

async def _load_evaluators(
self, evaluation_set: EvaluationSet
) -> list[BaseEvaluator[Any, Any, Any]]:
"""Load evaluators referenced by the evaluation set."""
@@ -611,6 +703,9 @@ def _load_evaluators(
raise ValueError("eval_set cannot be None")
evaluators_dir = Path(eval_set).parent.parent / "evaluators"

# Load agent model for 'same-as-agent' resolution in legacy evaluators
agent_model = await self._get_agent_model()

# If evaluatorConfigs is specified, use that (new field with weights)
# Otherwise, fall back to evaluatorRefs (old field without weights)
if (
@@ -638,7 +733,9 @@ def _load_evaluators(
try:
evaluator_id = data.get("id")
if evaluator_id in evaluator_ref_ids:
evaluator = EvaluatorFactory.create_evaluator(data, evaluators_dir)
evaluator = EvaluatorFactory.create_evaluator(
data, evaluators_dir, agent_model=agent_model
)
evaluators.append(evaluator)
found_evaluator_ids.add(evaluator_id)
except Exception as e:
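
For readers unfamiliar with `runtime_checkable` protocols, the sketch below shows how the new `LLMAgentRuntimeProtocol` and the delegate-chain lookup in `_find_agent_model_in_runtime` fit together. The wrapper and base runtime classes are hypothetical stand-ins, and `find_agent_model` is a standalone copy of the traversal logic rather than the SDK method itself.

```python
from typing import Any, Optional, Protocol, runtime_checkable


@runtime_checkable
class LLMAgentRuntimeProtocol(Protocol):
    """Mirror of the protocol added in this PR (structural typing only)."""

    def get_agent_model(self) -> Optional[str]: ...


class BaseAgentRuntime:
    """Hypothetical base runtime that knows its configured model."""

    def get_agent_model(self) -> Optional[str]:
        return "gpt-4o-2024-11-20"  # e.g. read from agent.json model settings


class TelemetryWrapper:
    """Hypothetical wrapper that delegates to an inner runtime."""

    def __init__(self, delegate: Any) -> None:
        self.delegate = delegate


def find_agent_model(runtime: Any) -> Optional[str]:
    """Standalone equivalent of _find_agent_model_in_runtime: walk the
    delegate chain until a runtime satisfies the protocol."""
    if isinstance(runtime, LLMAgentRuntimeProtocol):
        return runtime.get_agent_model()
    delegate = getattr(runtime, "delegate", None) or getattr(runtime, "_delegate", None)
    if delegate is not None:
        return find_agent_model(delegate)
    return None


# The wrapper itself does not implement get_agent_model, but the traversal
# still finds the model on the wrapped base runtime.
wrapped = TelemetryWrapper(BaseAgentRuntime())
assert find_agent_model(wrapped) == "gpt-4o-2024-11-20"
```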