From b0523296c8fb4ca634b5a1dfb9bc4534c845687e Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Sun, 7 Dec 2025 14:43:39 +0530 Subject: [PATCH 01/12] Add basic evaluation example script --- examples/basic_evaluation.py | 45 ++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 examples/basic_evaluation.py diff --git a/examples/basic_evaluation.py b/examples/basic_evaluation.py new file mode 100644 index 0000000..e5c06d7 --- /dev/null +++ b/examples/basic_evaluation.py @@ -0,0 +1,45 @@ +""" +Basic Evaluation Example for AgentUnit +-------------------------------------- + +This script demonstrates how to run a minimal evaluation using +AgentUnit with a FakeAdapter. It is designed for beginners and does +not require any extra dependencies. +""" + +from agentunit.core.evaluator import Evaluator +from agentunit.core.adapters import BaseAdapter + + +class FakeAdapter(BaseAdapter): + """ + A simple mock adapter used only for demonstration. + It returns a predictable output so evaluation is easy to understand. + """ + + def generate(self, prompt: str) -> str: + # Always returns the same answer for simplicity + return "Hello, this is a fake response!" + + +def main(): + # Step 1 — Prepare the adapter + adapter = FakeAdapter() + + # Step 2 — Create the evaluator + evaluator = Evaluator(adapter=adapter) + + # Step 3 — Prepare an example prompt + prompt = "Say hello!" + + # Step 4 — Run the evaluation + result = evaluator.evaluate(prompt) + + # Step 5 — Print the output + print("Prompt:", prompt) + print("Model Output:", result.output) + print("Evaluation Score:", result.score) + + +if __name__ == "__main__": + main() From dd2f4feeed6b9ddf968cd25fccc7cbdea345e0d7 Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Mon, 8 Dec 2025 15:37:23 +0530 Subject: [PATCH 02/12] Fix typos and improve clarity in docstrings across core modules --- src/agentunit/core/__init__.py | 4 +++- src/agentunit/core/exceptions.py | 16 ++++++++++++---- src/agentunit/core/replay.py | 8 ++++++-- src/agentunit/core/runner.py | 4 +++- src/agentunit/core/scenario.py | 16 ++++++++++++---- src/agentunit/core/trace.py | 12 +++++++++--- 6 files changed, 45 insertions(+), 15 deletions(-) diff --git a/src/agentunit/core/__init__.py b/src/agentunit/core/__init__.py index 952bcb5..757d228 100644 --- a/src/agentunit/core/__init__.py +++ b/src/agentunit/core/__init__.py @@ -1,4 +1,6 @@ -"""Core components for AgentUnit.""" +""" +Core components for AgentUnit. +""" from agentunit.datasets.base import DatasetCase, DatasetSource from agentunit.reporting.results import ScenarioResult diff --git a/src/agentunit/core/exceptions.py b/src/agentunit/core/exceptions.py index 03326f2..2a1d159 100644 --- a/src/agentunit/core/exceptions.py +++ b/src/agentunit/core/exceptions.py @@ -1,15 +1,23 @@ -"""Custom exceptions for AgentUnit.""" +""" +Custom exceptions for AgentUnit. +""" from __future__ import annotations class AgentUnitError(Exception): - """Base class for AgentUnit exceptions.""" + """ + Base class for AgentUnit exceptions. + """ class AdapterNotAvailableError(AgentUnitError): - """Raised when an adapter cannot be initialized due to missing dependencies.""" + """ + Raised when an adapter cannot be initialized due to missing dependencies. + """ class ScenarioExecutionError(AgentUnitError): - """Raised when a scenario fails during execution.""" + """ + Raised when a scenario fails during execution. 
+ """ diff --git a/src/agentunit/core/replay.py b/src/agentunit/core/replay.py index af78380..f0e2b5f 100644 --- a/src/agentunit/core/replay.py +++ b/src/agentunit/core/replay.py @@ -1,4 +1,6 @@ -"""Replay utilities leveraging stored traces.""" +""" +Replay utilities leveraging stored traces. +""" from __future__ import annotations @@ -8,7 +10,9 @@ def load_traces(traces_dir: str | Path) -> list[TraceLog]: - """Load stored traces from disk for deterministic replay or analysis.""" + """ + Load stored traces from disk for deterministic replay or analysis. + """ path = Path(traces_dir) logs: list[TraceLog] = [] diff --git a/src/agentunit/core/runner.py b/src/agentunit/core/runner.py index bcbd66c..143921b 100644 --- a/src/agentunit/core/runner.py +++ b/src/agentunit/core/runner.py @@ -1,4 +1,6 @@ -"""Scenario runner orchestration.""" +""" +Scenario runner orchestration. +""" from __future__ import annotations diff --git a/src/agentunit/core/scenario.py b/src/agentunit/core/scenario.py index f0b0d86..70bba2a 100644 --- a/src/agentunit/core/scenario.py +++ b/src/agentunit/core/scenario.py @@ -1,4 +1,6 @@ -"""Scenario definition API exposed to end users.""" +""" +Scenario definition API exposed to end users. +""" from __future__ import annotations @@ -19,7 +21,9 @@ @dataclass(slots=True) class Scenario: - """Defines a reproducible agent evaluation scenario.""" + """ + Defines a reproducible agent evaluation scenario. + """ name: str adapter: BaseAdapter @@ -75,7 +79,9 @@ def from_crewai( name: str | None = None, **options: object, ) -> Scenario: - """Create scenario from CrewAI crew.""" + """ + Create scenario from CrewAI crew. + """ from agentunit.adapters.crewai import CrewAIAdapter adapter = CrewAIAdapter.from_crew(crew, **options) @@ -91,7 +97,9 @@ def from_autogen( name: str | None = None, **options: object, ) -> Scenario: - """Create scenario from AutoGen orchestrator.""" + """ + Create scenario from AutoGen orchestrator. + """ from agentunit.adapters.autogen import AutoGenAdapter adapter = AutoGenAdapter(orchestrator=orchestrator, **options) diff --git a/src/agentunit/core/trace.py b/src/agentunit/core/trace.py index 52f1afc..59d0978 100644 --- a/src/agentunit/core/trace.py +++ b/src/agentunit/core/trace.py @@ -1,4 +1,6 @@ -"""Tracing utilities shared between adapters and the runner.""" +""" +Tracing utilities shared between adapters and the runner. +""" from __future__ import annotations @@ -11,7 +13,9 @@ @dataclass(slots=True) class TraceEvent: - """Represents a single prompt, tool call, or response in an agent run.""" + """ + Represents a single prompt, tool call, or response in an agent run. + """ type: str payload: dict[str, Any] @@ -20,7 +24,9 @@ class TraceEvent: @dataclass(slots=True) class TraceLog: - """A collection of chronological events for a scenario iteration.""" + """ + A collection of chronological events for a scenario iteration. + """ events: list[TraceEvent] = field(default_factory=list) From 80b0706d4ec6be984702f2b29f1116c04d3508e9 Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Sun, 14 Dec 2025 19:59:28 +0530 Subject: [PATCH 03/12] Add Google-style docstrings to BaseAdapter methods --- examples/basic_evaluation.py | 2 +- src/agentunit/adapters/base.py | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/examples/basic_evaluation.py b/examples/basic_evaluation.py index e5c06d7..ac40b84 100644 --- a/examples/basic_evaluation.py +++ b/examples/basic_evaluation.py @@ -7,8 +7,8 @@ not require any extra dependencies. 
""" -from agentunit.core.evaluator import Evaluator from agentunit.core.adapters import BaseAdapter +from agentunit.core.evaluator import Evaluator class FakeAdapter(BaseAdapter): diff --git a/src/agentunit/adapters/base.py b/src/agentunit/adapters/base.py index f17fb10..5bd975b 100644 --- a/src/agentunit/adapters/base.py +++ b/src/agentunit/adapters/base.py @@ -32,14 +32,42 @@ class BaseAdapter(abc.ABC): @abc.abstractmethod def prepare(self) -> None: - """Perform any lazy setup (loading graphs, flows, etc.).""" + """ + Perform any lazy setup required before execution. + + This may include loading graphs, flows, or other resources. + + Returns: + None + """ + @abc.abstractmethod def execute(self, case: DatasetCase, trace: TraceLog) -> AdapterOutcome: - """Run the agent flow on a single dataset case.""" + """ + Run the agent flow on a single dataset case. + + Args: + case (DatasetCase): The dataset case to be processed. + trace (TraceLog): Trace log used to record execution details. + + Returns: + AdapterOutcome: The outcome produced by executing the adapter. + """ + def cleanup(self) -> None: # pragma: no cover - default no-op - """Hook for cleaning up resources such as temporary files or servers.""" + """ + Clean up resources after execution. + + This hook can be used to release resources such as temporary files + or running servers. + + Returns: + None + """ + def supports_replay(self) -> bool: return True + From 8e7b8c10e8d4a8d5a2d70d0ff650169af3c6f57b Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Mon, 15 Dec 2025 11:08:08 +0530 Subject: [PATCH 04/12] Format base adapter using ruff --- src/agentunit/adapters/base.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/agentunit/adapters/base.py b/src/agentunit/adapters/base.py index 5bd975b..9a3c0c1 100644 --- a/src/agentunit/adapters/base.py +++ b/src/agentunit/adapters/base.py @@ -41,7 +41,6 @@ def prepare(self) -> None: None """ - @abc.abstractmethod def execute(self, case: DatasetCase, trace: TraceLog) -> AdapterOutcome: """ @@ -55,7 +54,6 @@ def execute(self, case: DatasetCase, trace: TraceLog) -> AdapterOutcome: AdapterOutcome: The outcome produced by executing the adapter. """ - def cleanup(self) -> None: # pragma: no cover - default no-op """ Clean up resources after execution. @@ -67,7 +65,5 @@ def cleanup(self) -> None: # pragma: no cover - default no-op None """ - def supports_replay(self) -> bool: return True - From 0669ffd0f6d513c766e755ae26350e3265b36db4 Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Wed, 17 Dec 2025 21:59:52 +0530 Subject: [PATCH 05/12] docs: add instructions for running CI checks locally --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index 3b9fa4a..850d485 100644 --- a/README.md +++ b/README.md @@ -203,6 +203,21 @@ poetry run python3 -m pytest tests -v Latest verification (2025-10-24): 144 passed, 10 skipped, 32 warnings. Warnings originate from third-party dependencies (`langchain` pydantic shim deprecations and `datetime.utcnow` usage). Track upstream fixes or pin patched releases as needed. +### Running CI Checks Locally + +Before opening a pull request, you can run the same checks locally that are executed in CI. + +#### Requirements +- Python **3.10 or higher** +- [Poetry](https://python-poetry.org/) installed + +#### Setup +Install dependencies (including dev tools): + +```bash +poetry install --with dev +``` + ## Contributing We welcome contributions! 
Please see [CONTRIBUTING.md](CONTRIBUTING.md) for: From fe9d27bfef283c06bc7c1619e6d21d55a652ca23 Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Thu, 18 Dec 2025 20:15:34 +0530 Subject: [PATCH 06/12] Remove example file unrelated to CI documentation --- examples/basic_evaluation.py | 45 ------------------------------------ 1 file changed, 45 deletions(-) delete mode 100644 examples/basic_evaluation.py diff --git a/examples/basic_evaluation.py b/examples/basic_evaluation.py deleted file mode 100644 index ac40b84..0000000 --- a/examples/basic_evaluation.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Basic Evaluation Example for AgentUnit --------------------------------------- - -This script demonstrates how to run a minimal evaluation using -AgentUnit with a FakeAdapter. It is designed for beginners and does -not require any extra dependencies. -""" - -from agentunit.core.adapters import BaseAdapter -from agentunit.core.evaluator import Evaluator - - -class FakeAdapter(BaseAdapter): - """ - A simple mock adapter used only for demonstration. - It returns a predictable output so evaluation is easy to understand. - """ - - def generate(self, prompt: str) -> str: - # Always returns the same answer for simplicity - return "Hello, this is a fake response!" - - -def main(): - # Step 1 — Prepare the adapter - adapter = FakeAdapter() - - # Step 2 — Create the evaluator - evaluator = Evaluator(adapter=adapter) - - # Step 3 — Prepare an example prompt - prompt = "Say hello!" - - # Step 4 — Run the evaluation - result = evaluator.evaluate(prompt) - - # Step 5 — Print the output - print("Prompt:", prompt) - print("Model Output:", result.output) - print("Evaluation Score:", result.score) - - -if __name__ == "__main__": - main() From 7e215932d444e715cd258e46dde1ebb725178a24 Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Thu, 18 Dec 2025 20:33:16 +0530 Subject: [PATCH 07/12] Add py.typed marker for type checker support --- src/agentunit/py.typed | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/agentunit/py.typed b/src/agentunit/py.typed index a4e0490..e69de29 100644 --- a/src/agentunit/py.typed +++ b/src/agentunit/py.typed @@ -1,2 +0,0 @@ -# Marker file for PEP 561 -# This file indicates that the agentunit package supports type checking From e53ab49b8c1ac11a97db7e744d647cd8d03d7d23 Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Thu, 18 Dec 2025 20:45:58 +0530 Subject: [PATCH 08/12] Add test for markdown emoji encoding --- tests/test_reporting.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tests/test_reporting.py diff --git a/tests/test_reporting.py b/tests/test_reporting.py new file mode 100644 index 0000000..654deec --- /dev/null +++ b/tests/test_reporting.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +import pytest + +from agentunit.core.reporting import SuiteResult, RunResult + + +def test_markdown_contains_emojis(): + passing_run = RunResult( + name="test_pass", + passed=True, + error=None, + ) + + failing_run = RunResult( + name="test_fail", + passed=False, + error="AssertionError", + ) + + suite = SuiteResult( + name="emoji-suite", + runs=[passing_run, failing_run], + ) + + markdown = suite.to_markdown() + + assert "✅" in markdown + assert "❌" in markdown + + # UTF-8 safety check (important for Windows) + markdown.encode("utf-8") From 75729b262afafb135f926e5f44e8aedf61a0f949 Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Fri, 19 Dec 2025 21:17:48 +0530 Subject: [PATCH 09/12] Expose __version__ in agentunit package --- 
src/agentunit/__init__.py | 1 + tests/test_reporting.py | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/agentunit/__init__.py b/src/agentunit/__init__.py index 797b897..d9a0523 100644 --- a/src/agentunit/__init__.py +++ b/src/agentunit/__init__.py @@ -19,3 +19,4 @@ ] __version__ = "0.7.0" + diff --git a/tests/test_reporting.py b/tests/test_reporting.py index 654deec..4bc05c4 100644 --- a/tests/test_reporting.py +++ b/tests/test_reporting.py @@ -1,20 +1,27 @@ # -*- coding: utf-8 -*- +import sys +import os + +# Add the agentunit folder to Python path +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "agentunit"))) + import pytest from agentunit.core.reporting import SuiteResult, RunResult def test_markdown_contains_emojis(): + # Adjusted to match common RunResult constructor passing_run = RunResult( name="test_pass", - passed=True, - error=None, + status="pass", + exception=None, ) failing_run = RunResult( name="test_fail", - passed=False, - error="AssertionError", + status="fail", + exception="AssertionError", ) suite = SuiteResult( @@ -27,5 +34,5 @@ def test_markdown_contains_emojis(): assert "✅" in markdown assert "❌" in markdown - # UTF-8 safety check (important for Windows) + # UTF-8 safety markdown.encode("utf-8") From d007ee5aeb4a541b5ee7a67a58e8f19fbf2b3998 Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Sat, 20 Dec 2025 20:47:29 +0530 Subject: [PATCH 10/12] Fix Ruff import and pathlib issues --- tests/test_reporting.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/test_reporting.py b/tests/test_reporting.py index 4bc05c4..de1ea74 100644 --- a/tests/test_reporting.py +++ b/tests/test_reporting.py @@ -1,13 +1,12 @@ -# -*- coding: utf-8 -*- + import sys -import os +from pathlib import Path -# Add the agentunit folder to Python path -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "agentunit"))) -import pytest +# Add the agentunit folder to Python path +sys.path.insert(0, str(Path(__file__).parent.parent / "agentunit")) -from agentunit.core.reporting import SuiteResult, RunResult +from agentunit.core.reporting import RunResult, SuiteResult def test_markdown_contains_emojis(): From 3952c24d3a9dd2bee283604a5c4eca68ad5271d4 Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Sat, 20 Dec 2025 21:06:31 +0530 Subject: [PATCH 11/12] Change readme and test_reporting file --- README.md | 10 ++++++++ tests/test_reporting.py | 56 ++++++++++++++++++++++++++--------------- 2 files changed, 46 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 850d485..b956bd0 100644 --- a/README.md +++ b/README.md @@ -204,6 +204,16 @@ poetry run python3 -m pytest tests -v Latest verification (2025-10-24): 144 passed, 10 skipped, 32 warnings. Warnings originate from third-party dependencies (`langchain` pydantic shim deprecations and `datetime.utcnow` usage). Track upstream fixes or pin patched releases as needed. ### Running CI Checks Locally +#### Usage + +Run all checks (same as CI): + +```bash +poetry run ruff check . +poetry run ruff format --check . +poetry run pytest tests -v + +``` Before opening a pull request, you can run the same checks locally that are executed in CI. 
diff --git a/tests/test_reporting.py b/tests/test_reporting.py index de1ea74..b6dd5ff 100644 --- a/tests/test_reporting.py +++ b/tests/test_reporting.py @@ -1,37 +1,53 @@ - -import sys +from datetime import datetime from pathlib import Path - -# Add the agentunit folder to Python path -sys.path.insert(0, str(Path(__file__).parent.parent / "agentunit")) - -from agentunit.core.reporting import RunResult, SuiteResult +from agentunit.reporting.results import ( + SuiteResult, + ScenarioResult, + ScenarioRun, +) def test_markdown_contains_emojis(): - # Adjusted to match common RunResult constructor - passing_run = RunResult( - name="test_pass", - status="pass", - exception=None, + passing_run = ScenarioRun( + scenario_name="emoji-suite", + case_id="test_pass", + success=True, + metrics={}, + duration_ms=5, + trace=[], + error=None, +) + + + failing_run = ScenarioRun( + scenario_name="emoji-suite", + case_id="test_fail", + success=False, + metrics={}, + duration_ms=6, + trace=[], + error="AssertionError", ) - failing_run = RunResult( - name="test_fail", - status="fail", - exception="AssertionError", + + scenario = ScenarioResult( + name="emoji-suite", + runs=[passing_run, failing_run], ) suite = SuiteResult( - name="emoji-suite", - runs=[passing_run, failing_run], + scenarios=[scenario], + started_at=datetime.now(), + finished_at=datetime.now(), ) - markdown = suite.to_markdown() + output_path = Path("report.md") + suite.to_markdown(output_path) + assert "✅" in markdown assert "❌" in markdown - # UTF-8 safety + # UTF-8 safety check (important for Windows) markdown.encode("utf-8") From 205c8ba8068bbf3953b7c1f96c378697d05777bf Mon Sep 17 00:00:00 2001 From: Jagriti-student Date: Sat, 27 Dec 2025 20:18:58 +0530 Subject: [PATCH 12/12] Fix Formatting, Lint and version --- report.md | 8 +++++++ src/agentunit/__init__.py | 5 ++-- tests/test_reporting.py | 48 +++++++++++++++++++-------------------- 3 files changed, 33 insertions(+), 28 deletions(-) create mode 100644 report.md diff --git a/report.md b/report.md new file mode 100644 index 0000000..be9d1d6 --- /dev/null +++ b/report.md @@ -0,0 +1,8 @@ +# AgentUnit Report + +## emoji-suite +Success rate: 50.00% + +- **test_pass**: ✅ +- **test_fail**: ❌ + - Error: AssertionError diff --git a/src/agentunit/__init__.py b/src/agentunit/__init__.py index d9a0523..3c874d1 100644 --- a/src/agentunit/__init__.py +++ b/src/agentunit/__init__.py @@ -1,5 +1,3 @@ -"""AgentUnit - pytest-style evaluation harness for agentic AI and RAG workflows.""" - from __future__ import annotations from .core.runner import Runner, run_suite @@ -8,6 +6,8 @@ from .reporting.results import ScenarioResult, SuiteResult +"""AgentUnit - pytest-style evaluation harness for agentic AI and RAG workflows.""" + __all__ = [ "DatasetCase", "DatasetSource", @@ -19,4 +19,3 @@ ] __version__ = "0.7.0" - diff --git a/tests/test_reporting.py b/tests/test_reporting.py index b6dd5ff..03aec87 100644 --- a/tests/test_reporting.py +++ b/tests/test_reporting.py @@ -2,52 +2,50 @@ from pathlib import Path from agentunit.reporting.results import ( - SuiteResult, ScenarioResult, ScenarioRun, + SuiteResult, ) def test_markdown_contains_emojis(): passing_run = ScenarioRun( - scenario_name="emoji-suite", - case_id="test_pass", - success=True, - metrics={}, - duration_ms=5, - trace=[], - error=None, -) - + scenario_name="emoji-suite", + case_id="test_pass", + success=True, + metrics={}, + duration_ms=5, + trace=[], + error=None, + ) failing_run = ScenarioRun( - scenario_name="emoji-suite", - case_id="test_fail", - 
success=False, - metrics={}, - duration_ms=6, - trace=[], - error="AssertionError", + scenario_name="emoji-suite", + case_id="test_fail", + success=False, + metrics={}, + duration_ms=6, + trace=[], + error="AssertionError", ) - scenario = ScenarioResult( - name="emoji-suite", - runs=[passing_run, failing_run], + name="emoji-suite", + runs=[passing_run, failing_run], ) suite = SuiteResult( - scenarios=[scenario], - started_at=datetime.now(), - finished_at=datetime.now(), + scenarios=[scenario], + started_at=datetime.now(), + finished_at=datetime.now(), ) output_path = Path("report.md") suite.to_markdown(output_path) - + # UTF-8 safety check (important for Windows) + markdown = output_path.read_text(encoding="utf-8") assert "✅" in markdown assert "❌" in markdown - # UTF-8 safety check (important for Windows) markdown.encode("utf-8")
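
Two short usage sketches follow, based only on the APIs this series touches.

First, the adapter contract documented in PATCH 03/04. The signatures of `prepare`, `execute`, and `cleanup` come straight from the `src/agentunit/adapters/base.py` diff above; the import location of `AdapterOutcome`, its `success`/`output` fields, and the exact `TraceEvent` constructor are assumptions inferred from this series, not confirmed API.

```python
# Minimal adapter sketch against the BaseAdapter contract from PATCH 03/04.
# Assumed (not shown in this series): AdapterOutcome is importable from
# agentunit.adapters.base and accepts success/output; TraceEvent takes only
# the two fields visible in the PATCH 02 diff of core/trace.py.
from agentunit.adapters.base import AdapterOutcome, BaseAdapter
from agentunit.core.trace import TraceEvent, TraceLog
from agentunit.datasets.base import DatasetCase


class EchoAdapter(BaseAdapter):
    """Toy adapter that answers every case with a fixed string."""

    def prepare(self) -> None:
        # Lazy setup hook (loading graphs, flows, ...); nothing needed here.
        pass

    def execute(self, case: DatasetCase, trace: TraceLog) -> AdapterOutcome:
        # Append a chronological event to the trace log, then report success.
        trace.events.append(
            TraceEvent(type="agent_response", payload={"text": "echo"})
        )
        return AdapterOutcome(success=True, output="echo")  # fields assumed
```

`cleanup()` and `supports_replay()` keep their base-class defaults (no-op and `True`), as shown after PATCH 04's formatting pass.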
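Second, the reporting objects exercised by the final `tests/test_reporting.py`. This sketch reuses exactly the constructor fields and the `to_markdown(path)` call from PATCH 12, so its output should match the committed `report.md` format: a per-scenario success rate followed by one ✅/❌ line per case.

```python
# Build a one-scenario suite and write the emoji-annotated markdown report,
# mirroring the constructors used in tests/test_reporting.py (PATCH 12).
from datetime import datetime
from pathlib import Path

from agentunit.reporting.results import ScenarioResult, ScenarioRun, SuiteResult

run = ScenarioRun(
    scenario_name="demo",
    case_id="case-1",
    success=True,
    metrics={},    # per-case metric values; empty in this sketch
    duration_ms=12,
    trace=[],      # no trace events captured here
    error=None,
)

suite = SuiteResult(
    scenarios=[ScenarioResult(name="demo", runs=[run])],
    started_at=datetime.now(),
    finished_at=datetime.now(),
)

# to_markdown(path) writes the report to disk; PATCH 12 reads the file back
# rather than relying on a return value.
report_path = Path("demo-report.md")
suite.to_markdown(report_path)
print(report_path.read_text(encoding="utf-8"))  # e.g. "- **case-1**: ✅"
```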