From c3203f5b19df4cd01d5297b9a8d699523e1c9914 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Mon, 29 Dec 2025 10:02:14 -0800 Subject: [PATCH 1/3] lilac adapter --- eval_protocol/adapters/lilac.py | 265 ++++++++++++++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 eval_protocol/adapters/lilac.py diff --git a/eval_protocol/adapters/lilac.py b/eval_protocol/adapters/lilac.py new file mode 100644 index 00000000..684adea9 --- /dev/null +++ b/eval_protocol/adapters/lilac.py @@ -0,0 +1,265 @@ +""" +Lilac ML integration for Eval Protocol. + +This adapter provides utilities for converting between EvaluationRow format +and Lilac dataset format, enabling powerful data curation features like: +- Clustering and deduplication +- Semantic search and filtering +- Quality scoring with embeddings +- Interactive data exploration + +Prerequisites: + pip install 'lilac[all]' + +Example usage: + >>> from eval_protocol.adapters.lilac import ( + ... evaluation_rows_to_lilac_dataset, + ... lilac_dataset_to_evaluation_rows, + ... ) + >>> + >>> # Convert EvaluationRows to Lilac dataset + >>> dataset = evaluation_rows_to_lilac_dataset(rows, name='my-traces') + >>> + >>> # Do Lilac operations (cluster, filter, etc.) + >>> dataset.cluster('messages_json') # or create your own text column + >>> + >>> # Convert back to EvaluationRows + >>> processed_rows = lilac_dataset_to_evaluation_rows(dataset) +""" + +from __future__ import annotations + +import json +import logging +from typing import Any, TYPE_CHECKING + +import pandas as pd + +from eval_protocol.models import ( + EvaluateResult, + EvaluationRow, + ExecutionMetadata, + InputMetadata, + Message, +) + +if TYPE_CHECKING: + import lilac as ll + +logger = logging.getLogger(__name__) + +# Check if lilac is available +try: + import lilac as ll + + LILAC_AVAILABLE = True +except ImportError: + LILAC_AVAILABLE = False + ll = None # type: ignore + + +def _ensure_lilac_available() -> None: + """Raise ImportError if lilac is not installed.""" + if not LILAC_AVAILABLE: + raise ImportError("Lilac is not installed. Install it with: pip install 'lilac[all]'") + + +# ============================================================================= +# Core Conversion Functions +# ============================================================================= + + +def _serialize_message(msg: Message) -> dict[str, Any]: + """Serialize a Message to a dictionary.""" + return msg.model_dump(exclude_none=True) + + +def _deserialize_messages(messages_json: str | None) -> list[Message]: + """Deserialize messages JSON back to Message objects.""" + if not messages_json: + return [] + try: + messages_data = json.loads(messages_json) + return [Message.model_validate(m) for m in messages_data] + except (json.JSONDecodeError, ValueError) as e: + logger.warning(f"Failed to deserialize messages: {e}") + return [] + + +def evaluation_row_to_dict(row: EvaluationRow) -> dict[str, Any]: + """Convert a single EvaluationRow to a dictionary for Lilac. + + The output contains JSON-serialized fields that can be reconstructed back + to EvaluationRow. Users can add their own text columns for clustering. + """ + result: dict[str, Any] = { + # Identifiers + "row_id": row.input_metadata.row_id if row.input_metadata else None, + # Full data as JSON (for reconstruction) + "messages_json": json.dumps([_serialize_message(m) for m in row.messages]), + "tools_json": json.dumps(row.tools) if row.tools else None, + "ground_truth_json": json.dumps(row.ground_truth) if row.ground_truth else None, + "input_metadata_json": row.input_metadata.model_dump_json() if row.input_metadata else None, + "execution_metadata_json": row.execution_metadata.model_dump_json() if row.execution_metadata else None, + "evaluation_result_json": row.evaluation_result.model_dump_json() if row.evaluation_result else None, + # Scalar fields for filtering + "score": row.evaluation_result.score if row.evaluation_result else None, + "message_count": len(row.messages), + "has_tools": bool(row.tools), + } + + return result + + +def dict_to_evaluation_row(data: dict[str, Any]) -> EvaluationRow: + """Convert a Lilac row dictionary back to an EvaluationRow.""" + # Parse messages + messages = _deserialize_messages(data.get("messages_json")) + + # Parse tools + tools = None + if data.get("tools_json"): + try: + tools = json.loads(data["tools_json"]) + except json.JSONDecodeError: + pass + + # Parse ground truth + ground_truth = None + if data.get("ground_truth_json"): + try: + ground_truth = json.loads(data["ground_truth_json"]) + except json.JSONDecodeError: + pass + + # Parse input metadata + input_metadata = InputMetadata() + if data.get("input_metadata_json"): + try: + input_metadata = InputMetadata.model_validate_json(data["input_metadata_json"]) + except (json.JSONDecodeError, ValueError): + input_metadata = InputMetadata(row_id=data.get("row_id")) + + # Parse execution metadata + execution_metadata = ExecutionMetadata() + if data.get("execution_metadata_json"): + try: + execution_metadata = ExecutionMetadata.model_validate_json(data["execution_metadata_json"]) + except (json.JSONDecodeError, ValueError): + pass + + # Parse evaluation result + evaluation_result = None + if data.get("evaluation_result_json"): + try: + evaluation_result = EvaluateResult.model_validate_json(data["evaluation_result_json"]) + except (json.JSONDecodeError, ValueError): + pass + + return EvaluationRow( + messages=messages, + tools=tools, + ground_truth=ground_truth, + input_metadata=input_metadata, + execution_metadata=execution_metadata, + evaluation_result=evaluation_result, + ) + + +# ============================================================================= +# Main Conversion Functions +# ============================================================================= + + +def evaluation_rows_to_lilac_dataset( + rows: list[EvaluationRow], + namespace: str = "local", + name: str = "eval-data", + project_dir: str | None = None, +) -> Any: + """Convert EvaluationRows to a Lilac dataset. + + Args: + rows: List of EvaluationRow objects + namespace: Lilac namespace (default: 'local') + name: Dataset name + project_dir: Lilac project directory (uses default if None) + + Returns: + Lilac Dataset object ready for clustering, filtering, etc. + + Example: + >>> dataset = evaluation_rows_to_lilac_dataset(rows, name='my-traces') + >>> + >>> # Add your own text column for clustering + >>> df = dataset.to_pandas() + >>> df['user_query'] = df['messages_json'].apply(extract_user_query) + >>> # Re-create dataset with new column, then cluster + """ + _ensure_lilac_available() + import lilac as ll_module # Re-import after ensuring available + + if project_dir: + ll_module.set_project_dir(project_dir) + + # Convert to DataFrame + records = [evaluation_row_to_dict(row) for row in rows] + df = pd.DataFrame(records) + + config = ll_module.DatasetConfig( + namespace=namespace, + name=name, + source=ll_module.PandasSource(df), + ) + + return ll_module.create_dataset(config) + + +def lilac_dataset_to_evaluation_rows( + dataset: Any, + filters: list[tuple[str, str, Any]] | None = None, + limit: int | None = None, +) -> list[EvaluationRow]: + """Convert a Lilac dataset back to EvaluationRows. + + Args: + dataset: Lilac Dataset object + filters: Optional Lilac filter tuples, e.g. [('score', 'greater', 0.5)] + limit: Maximum number of rows to return + + Returns: + List of EvaluationRow objects + """ + _ensure_lilac_available() + + # Build query + kwargs: dict[str, Any] = {} + if filters: + kwargs["filters"] = filters + if limit: + kwargs["limit"] = limit + + df = dataset.select_rows(**kwargs).df() + return dataframe_to_evaluation_rows(df) + + +def evaluation_rows_to_dataframe(rows: list[EvaluationRow]) -> pd.DataFrame: + """Convert EvaluationRows to a pandas DataFrame. + + Useful if you want to work with the DataFrame directly. + """ + records = [evaluation_row_to_dict(row) for row in rows] + return pd.DataFrame(records) + + +def dataframe_to_evaluation_rows(df: pd.DataFrame) -> list[EvaluationRow]: + """Convert a pandas DataFrame back to EvaluationRows.""" + rows = [] + for _, row_data in df.iterrows(): + try: + row = dict_to_evaluation_row(row_data.to_dict()) + rows.append(row) + except Exception as e: + logger.warning(f"Failed to convert row: {e}") + continue + return rows From 74cca5dc646b16fd0c72124dc86c438296c4d926 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Mon, 29 Dec 2025 10:29:32 -0800 Subject: [PATCH 2/3] updated adapter --- eval_protocol/adapters/lilac.py | 142 ++++++++------------------------ 1 file changed, 33 insertions(+), 109 deletions(-) diff --git a/eval_protocol/adapters/lilac.py b/eval_protocol/adapters/lilac.py index 684adea9..bc66fe6d 100644 --- a/eval_protocol/adapters/lilac.py +++ b/eval_protocol/adapters/lilac.py @@ -2,36 +2,39 @@ Lilac ML integration for Eval Protocol. This adapter provides utilities for converting between EvaluationRow format -and Lilac dataset format, enabling powerful data curation features like: +and pandas DataFrame format, enabling integration with Lilac for data curation: - Clustering and deduplication - Semantic search and filtering - Quality scoring with embeddings - Interactive data exploration -Prerequisites: - pip install 'lilac[all]' - Example usage: >>> from eval_protocol.adapters.lilac import ( - ... evaluation_rows_to_lilac_dataset, - ... lilac_dataset_to_evaluation_rows, + ... evaluation_rows_to_dataframe, + ... dataframe_to_evaluation_rows, ... ) >>> - >>> # Convert EvaluationRows to Lilac dataset - >>> dataset = evaluation_rows_to_lilac_dataset(rows, name='my-traces') + >>> # Convert EvaluationRows to DataFrame for Lilac + >>> df = evaluation_rows_to_dataframe(rows) + >>> df['user_query'] = df['messages_json'].apply(extract_user_message) >>> - >>> # Do Lilac operations (cluster, filter, etc.) - >>> dataset.cluster('messages_json') # or create your own text column + >>> # Use with Lilac for clustering + >>> import lilac as ll + >>> dataset = ll.create_dataset(ll.DatasetConfig( + ... namespace='local', name='my-data', source=ll.PandasSource(df) + ... )) + >>> dataset.cluster('user_query') >>> >>> # Convert back to EvaluationRows - >>> processed_rows = lilac_dataset_to_evaluation_rows(dataset) + >>> processed_df = dataset.to_pandas(include_signals=True) + >>> processed_rows = dataframe_to_evaluation_rows(processed_df) """ from __future__ import annotations import json import logging -from typing import Any, TYPE_CHECKING +from typing import Any import pandas as pd @@ -43,29 +46,11 @@ Message, ) -if TYPE_CHECKING: - import lilac as ll - logger = logging.getLogger(__name__) -# Check if lilac is available -try: - import lilac as ll - - LILAC_AVAILABLE = True -except ImportError: - LILAC_AVAILABLE = False - ll = None # type: ignore - - -def _ensure_lilac_available() -> None: - """Raise ImportError if lilac is not installed.""" - if not LILAC_AVAILABLE: - raise ImportError("Lilac is not installed. Install it with: pip install 'lilac[all]'") - # ============================================================================= -# Core Conversion Functions +# Internal Helpers # ============================================================================= @@ -86,13 +71,13 @@ def _deserialize_messages(messages_json: str | None) -> list[Message]: return [] -def evaluation_row_to_dict(row: EvaluationRow) -> dict[str, Any]: - """Convert a single EvaluationRow to a dictionary for Lilac. +def _evaluation_row_to_dict(row: EvaluationRow) -> dict[str, Any]: + """Convert a single EvaluationRow to a dictionary. The output contains JSON-serialized fields that can be reconstructed back to EvaluationRow. Users can add their own text columns for clustering. """ - result: dict[str, Any] = { + return { # Identifiers "row_id": row.input_metadata.row_id if row.input_metadata else None, # Full data as JSON (for reconstruction) @@ -108,11 +93,9 @@ def evaluation_row_to_dict(row: EvaluationRow) -> dict[str, Any]: "has_tools": bool(row.tools), } - return result - -def dict_to_evaluation_row(data: dict[str, Any]) -> EvaluationRow: - """Convert a Lilac row dictionary back to an EvaluationRow.""" +def _dict_to_evaluation_row(data: dict[str, Any]) -> EvaluationRow: + """Convert a dictionary back to an EvaluationRow.""" # Parse messages messages = _deserialize_messages(data.get("messages_json")) @@ -167,97 +150,38 @@ def dict_to_evaluation_row(data: dict[str, Any]) -> EvaluationRow: # ============================================================================= -# Main Conversion Functions +# Public API # ============================================================================= -def evaluation_rows_to_lilac_dataset( - rows: list[EvaluationRow], - namespace: str = "local", - name: str = "eval-data", - project_dir: str | None = None, -) -> Any: - """Convert EvaluationRows to a Lilac dataset. +def evaluation_rows_to_dataframe(rows: list[EvaluationRow]) -> pd.DataFrame: + """Convert EvaluationRows to a pandas DataFrame. + + The DataFrame can be used directly with Lilac for clustering and curation. Args: rows: List of EvaluationRow objects - namespace: Lilac namespace (default: 'local') - name: Dataset name - project_dir: Lilac project directory (uses default if None) Returns: - Lilac Dataset object ready for clustering, filtering, etc. - - Example: - >>> dataset = evaluation_rows_to_lilac_dataset(rows, name='my-traces') - >>> - >>> # Add your own text column for clustering - >>> df = dataset.to_pandas() - >>> df['user_query'] = df['messages_json'].apply(extract_user_query) - >>> # Re-create dataset with new column, then cluster + DataFrame with JSON-serialized fields for reconstruction """ - _ensure_lilac_available() - import lilac as ll_module # Re-import after ensuring available - - if project_dir: - ll_module.set_project_dir(project_dir) - - # Convert to DataFrame - records = [evaluation_row_to_dict(row) for row in rows] - df = pd.DataFrame(records) - - config = ll_module.DatasetConfig( - namespace=namespace, - name=name, - source=ll_module.PandasSource(df), - ) - - return ll_module.create_dataset(config) + records = [_evaluation_row_to_dict(row) for row in rows] + return pd.DataFrame(records) -def lilac_dataset_to_evaluation_rows( - dataset: Any, - filters: list[tuple[str, str, Any]] | None = None, - limit: int | None = None, -) -> list[EvaluationRow]: - """Convert a Lilac dataset back to EvaluationRows. +def dataframe_to_evaluation_rows(df: pd.DataFrame) -> list[EvaluationRow]: + """Convert a pandas DataFrame back to EvaluationRows. Args: - dataset: Lilac Dataset object - filters: Optional Lilac filter tuples, e.g. [('score', 'greater', 0.5)] - limit: Maximum number of rows to return + df: DataFrame with messages_json and other serialized fields Returns: List of EvaluationRow objects """ - _ensure_lilac_available() - - # Build query - kwargs: dict[str, Any] = {} - if filters: - kwargs["filters"] = filters - if limit: - kwargs["limit"] = limit - - df = dataset.select_rows(**kwargs).df() - return dataframe_to_evaluation_rows(df) - - -def evaluation_rows_to_dataframe(rows: list[EvaluationRow]) -> pd.DataFrame: - """Convert EvaluationRows to a pandas DataFrame. - - Useful if you want to work with the DataFrame directly. - """ - records = [evaluation_row_to_dict(row) for row in rows] - return pd.DataFrame(records) - - -def dataframe_to_evaluation_rows(df: pd.DataFrame) -> list[EvaluationRow]: - """Convert a pandas DataFrame back to EvaluationRows.""" rows = [] for _, row_data in df.iterrows(): try: - row = dict_to_evaluation_row(row_data.to_dict()) + row = _dict_to_evaluation_row(row_data.to_dict()) rows.append(row) except Exception as e: logger.warning(f"Failed to convert row: {e}") From e529f7fa10e6533dfee3a7b4db02e1a67beea459 Mon Sep 17 00:00:00 2001 From: Shrey Modi Date: Fri, 2 Jan 2026 13:39:29 -0800 Subject: [PATCH 3/3] updated adapters --- eval_protocol/adapters/__init__.py | 16 +++ eval_protocol/adapters/dataframe.py | 66 ++++++++++ eval_protocol/adapters/lilac.py | 189 ---------------------------- eval_protocol/models.py | 43 +++++++ 4 files changed, 125 insertions(+), 189 deletions(-) create mode 100644 eval_protocol/adapters/dataframe.py delete mode 100644 eval_protocol/adapters/lilac.py diff --git a/eval_protocol/adapters/__init__.py b/eval_protocol/adapters/__init__.py index 6f031f3d..b6e7c1e9 100644 --- a/eval_protocol/adapters/__init__.py +++ b/eval_protocol/adapters/__init__.py @@ -99,3 +99,19 @@ __all__.extend(["WeaveAdapter"]) except ImportError: pass + +# DataFrame adapter (pandas integration for Lilac, etc.) +try: + from .dataframe import ( + evaluation_rows_to_dataframe, + dataframe_to_evaluation_rows, + ) + + __all__.extend( + [ + "evaluation_rows_to_dataframe", + "dataframe_to_evaluation_rows", + ] + ) +except ImportError: + pass diff --git a/eval_protocol/adapters/dataframe.py b/eval_protocol/adapters/dataframe.py new file mode 100644 index 00000000..f67077f4 --- /dev/null +++ b/eval_protocol/adapters/dataframe.py @@ -0,0 +1,66 @@ +""" +Pandas DataFrame adapter for Eval Protocol. + +This module provides utilities for converting between EvaluationRow format +and pandas DataFrame format, enabling integration with data curation tools +such as Lilac, Great Expectations, or any pandas-based workflow. + +Example usage: + >>> from eval_protocol.adapters.dataframe import ( + ... evaluation_rows_to_dataframe, + ... dataframe_to_evaluation_rows, + ... ) + >>> + >>> # Convert EvaluationRows to DataFrame + >>> df = evaluation_rows_to_dataframe(rows) + >>> + >>> # Convert back to EvaluationRows + >>> rows = dataframe_to_evaluation_rows(df) +""" + +from __future__ import annotations + +import logging + +import pandas as pd + +from ..models import EvaluationRow + +logger = logging.getLogger(__name__) + + +def evaluation_rows_to_dataframe(rows: list[EvaluationRow]) -> pd.DataFrame: + """Convert EvaluationRows to a pandas DataFrame. + + Uses EvaluationRow.to_dict() for serialization. + + Args: + rows: List of EvaluationRow objects + + Returns: + DataFrame with 'data_json' containing serialized rows plus convenience fields + """ + records = [row.to_dict() for row in rows] + return pd.DataFrame(records) + + +def dataframe_to_evaluation_rows(df: pd.DataFrame) -> list[EvaluationRow]: + """Convert a pandas DataFrame back to EvaluationRows. + + Uses EvaluationRow.from_dict() for deserialization. + + Args: + df: DataFrame with 'data_json' column containing serialized EvaluationRows + + Returns: + List of EvaluationRow objects + """ + rows = [] + for _, row_data in df.iterrows(): + try: + row = EvaluationRow.from_dict(row_data.to_dict()) + rows.append(row) + except Exception as e: + logger.warning(f"Failed to convert row: {e}") + continue + return rows diff --git a/eval_protocol/adapters/lilac.py b/eval_protocol/adapters/lilac.py deleted file mode 100644 index bc66fe6d..00000000 --- a/eval_protocol/adapters/lilac.py +++ /dev/null @@ -1,189 +0,0 @@ -""" -Lilac ML integration for Eval Protocol. - -This adapter provides utilities for converting between EvaluationRow format -and pandas DataFrame format, enabling integration with Lilac for data curation: -- Clustering and deduplication -- Semantic search and filtering -- Quality scoring with embeddings -- Interactive data exploration - -Example usage: - >>> from eval_protocol.adapters.lilac import ( - ... evaluation_rows_to_dataframe, - ... dataframe_to_evaluation_rows, - ... ) - >>> - >>> # Convert EvaluationRows to DataFrame for Lilac - >>> df = evaluation_rows_to_dataframe(rows) - >>> df['user_query'] = df['messages_json'].apply(extract_user_message) - >>> - >>> # Use with Lilac for clustering - >>> import lilac as ll - >>> dataset = ll.create_dataset(ll.DatasetConfig( - ... namespace='local', name='my-data', source=ll.PandasSource(df) - ... )) - >>> dataset.cluster('user_query') - >>> - >>> # Convert back to EvaluationRows - >>> processed_df = dataset.to_pandas(include_signals=True) - >>> processed_rows = dataframe_to_evaluation_rows(processed_df) -""" - -from __future__ import annotations - -import json -import logging -from typing import Any - -import pandas as pd - -from eval_protocol.models import ( - EvaluateResult, - EvaluationRow, - ExecutionMetadata, - InputMetadata, - Message, -) - -logger = logging.getLogger(__name__) - - -# ============================================================================= -# Internal Helpers -# ============================================================================= - - -def _serialize_message(msg: Message) -> dict[str, Any]: - """Serialize a Message to a dictionary.""" - return msg.model_dump(exclude_none=True) - - -def _deserialize_messages(messages_json: str | None) -> list[Message]: - """Deserialize messages JSON back to Message objects.""" - if not messages_json: - return [] - try: - messages_data = json.loads(messages_json) - return [Message.model_validate(m) for m in messages_data] - except (json.JSONDecodeError, ValueError) as e: - logger.warning(f"Failed to deserialize messages: {e}") - return [] - - -def _evaluation_row_to_dict(row: EvaluationRow) -> dict[str, Any]: - """Convert a single EvaluationRow to a dictionary. - - The output contains JSON-serialized fields that can be reconstructed back - to EvaluationRow. Users can add their own text columns for clustering. - """ - return { - # Identifiers - "row_id": row.input_metadata.row_id if row.input_metadata else None, - # Full data as JSON (for reconstruction) - "messages_json": json.dumps([_serialize_message(m) for m in row.messages]), - "tools_json": json.dumps(row.tools) if row.tools else None, - "ground_truth_json": json.dumps(row.ground_truth) if row.ground_truth else None, - "input_metadata_json": row.input_metadata.model_dump_json() if row.input_metadata else None, - "execution_metadata_json": row.execution_metadata.model_dump_json() if row.execution_metadata else None, - "evaluation_result_json": row.evaluation_result.model_dump_json() if row.evaluation_result else None, - # Scalar fields for filtering - "score": row.evaluation_result.score if row.evaluation_result else None, - "message_count": len(row.messages), - "has_tools": bool(row.tools), - } - - -def _dict_to_evaluation_row(data: dict[str, Any]) -> EvaluationRow: - """Convert a dictionary back to an EvaluationRow.""" - # Parse messages - messages = _deserialize_messages(data.get("messages_json")) - - # Parse tools - tools = None - if data.get("tools_json"): - try: - tools = json.loads(data["tools_json"]) - except json.JSONDecodeError: - pass - - # Parse ground truth - ground_truth = None - if data.get("ground_truth_json"): - try: - ground_truth = json.loads(data["ground_truth_json"]) - except json.JSONDecodeError: - pass - - # Parse input metadata - input_metadata = InputMetadata() - if data.get("input_metadata_json"): - try: - input_metadata = InputMetadata.model_validate_json(data["input_metadata_json"]) - except (json.JSONDecodeError, ValueError): - input_metadata = InputMetadata(row_id=data.get("row_id")) - - # Parse execution metadata - execution_metadata = ExecutionMetadata() - if data.get("execution_metadata_json"): - try: - execution_metadata = ExecutionMetadata.model_validate_json(data["execution_metadata_json"]) - except (json.JSONDecodeError, ValueError): - pass - - # Parse evaluation result - evaluation_result = None - if data.get("evaluation_result_json"): - try: - evaluation_result = EvaluateResult.model_validate_json(data["evaluation_result_json"]) - except (json.JSONDecodeError, ValueError): - pass - - return EvaluationRow( - messages=messages, - tools=tools, - ground_truth=ground_truth, - input_metadata=input_metadata, - execution_metadata=execution_metadata, - evaluation_result=evaluation_result, - ) - - -# ============================================================================= -# Public API -# ============================================================================= - - -def evaluation_rows_to_dataframe(rows: list[EvaluationRow]) -> pd.DataFrame: - """Convert EvaluationRows to a pandas DataFrame. - - The DataFrame can be used directly with Lilac for clustering and curation. - - Args: - rows: List of EvaluationRow objects - - Returns: - DataFrame with JSON-serialized fields for reconstruction - """ - records = [_evaluation_row_to_dict(row) for row in rows] - return pd.DataFrame(records) - - -def dataframe_to_evaluation_rows(df: pd.DataFrame) -> list[EvaluationRow]: - """Convert a pandas DataFrame back to EvaluationRows. - - Args: - df: DataFrame with messages_json and other serialized fields - - Returns: - List of EvaluationRow objects - """ - rows = [] - for _, row_data in df.iterrows(): - try: - row = _dict_to_evaluation_row(row_data.to_dict()) - rows.append(row) - except Exception as e: - logger.warning(f"Failed to convert row: {e}") - continue - return rows diff --git a/eval_protocol/models.py b/eval_protocol/models.py index 13d059d3..a7bc767b 100644 --- a/eval_protocol/models.py +++ b/eval_protocol/models.py @@ -953,6 +953,49 @@ def get_termination_reason(self) -> str: return str(reason) return "unknown" + def to_dict(self) -> Dict[str, Any]: + """Serialize this EvaluationRow to a dictionary. + + The entire EvaluationRow is serialized to JSON, allowing full reconstruction. + Additional scalar fields are included for convenient filtering/grouping. + + Returns: + Dictionary with 'data_json' containing the full serialized row, + plus convenience fields for filtering. + """ + return { + "data_json": self.model_dump_json(), + "row_id": self.input_metadata.row_id if self.input_metadata else None, + "score": self.evaluation_result.score if self.evaluation_result else None, + "message_count": len(self.messages), + "has_tools": bool(self.tools), + "created_at": self.created_at.isoformat() if self.created_at else None, + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "EvaluationRow": + """Reconstruct an EvaluationRow from a dictionary. + + Args: + data: Dictionary containing 'data_json' with the serialized EvaluationRow. + + Returns: + Reconstructed EvaluationRow instance. + + Raises: + ValueError: If 'data_json' is missing or invalid. + """ + from pydantic import ValidationError + + data_json = data.get("data_json") + if not data_json: + raise ValueError("Missing 'data_json' field in dictionary") + + try: + return cls.model_validate_json(data_json) + except ValidationError as e: + raise ValueError(f"Failed to deserialize EvaluationRow: {e}") from e + def __hash__(self) -> int: # Use a stable hash that works across Python processes return self._stable_hash()