diff --git a/.env.example b/.env.example index 5918be8..33b4a17 100644 --- a/.env.example +++ b/.env.example @@ -8,4 +8,7 @@ TAVILY_API_KEY=your_tavily_api_key_here ANTHROPIC_API_KEY=YOUR_ANTHROPIC_API_KEY ANTHROPIC_API_BASE=YOUR_ANTHROPIC_API_BASE OPTIMIZER_MODEL_NAME=gpt-4o -EXECUTOR_MODEL_NAME=gpt-4o-mini \ No newline at end of file +EXECUTOR_MODEL_NAME=gpt-4o-mini + +# agentops API configuration +AGENTOPS_API_KEY=your_agentops_api_key_here \ No newline at end of file diff --git a/docs/extending.md b/docs/extending.md index c8f70af..eff55f5 100644 --- a/docs/extending.md +++ b/docs/extending.md @@ -24,6 +24,7 @@ A comprehensive guide to extending MASArena with custom Multi-Agent Systems and ### ๐Ÿ“‹ Implementation Requirements **โœ… Essential Requirements:** + - Extend `AgentSystem` base class - Implement `run_agent()` method (abstract method - required) - Include `evaluator` in config during initialization @@ -31,6 +32,7 @@ A comprehensive guide to extending MASArena with custom Multi-Agent Systems and - Register with `AgentSystemRegistry` **๐Ÿ’ก Optional but Recommended:** + - Implement `_create_agents()` for tool integration support - Use `self.format_prompt` for benchmark-specific formatting - Handle async execution properly if needed @@ -40,18 +42,19 @@ A comprehensive guide to extending MASArena with custom Multi-Agent Systems and #### Step 1: Create Agent System Class Structure โœ… Langgraph supported -โœ… Customizable agent and multi-agent interaction +โœ… Customizable agent and multi-agent interaction **๐Ÿ“‹ Implementation Guide:** - - Inherit from `AgentSystem` base class - - Initialize configuration parameters (num_agents, num_rounds, model_name) - - Set up agent components using `_create_agents()` method - - Extract workers and result extractors from created components - - Validate that required components are available + +- Inherit from `AgentSystem` base class +- Initialize configuration parameters (num_agents, num_rounds, model_name) +- Set up 
agent components using `_create_agents()` method +- Extract workers and result extractors from created components +- Validate that required components are available **๐Ÿ’ก SupervisorMAS Implementation Example (LangGraph Structure):** -``` +```python # mas_arena/agents/supervisor_mas.py def _init_graph_if_needed(self, problem_input: Optional[Any] = None, feedback: Optional[Any] = None): @@ -93,7 +96,8 @@ A comprehensive guide to extending MASArena with custom Multi-Agent Systems and ``` **๐Ÿ’ก ChatEval Implementation Example (Basic Structure):** -``` + +```python # mas_arena/agents/chateval.py class ChatEval(AgentSystem): """Multi-agent evaluation system based on iterative debate""" @@ -118,15 +122,17 @@ class ChatEval(AgentSystem): #### Step 2: Implement Core `run_agent` Method **๐Ÿ“‹ Implementation Guide:** - - Extract problem text from input dictionary - - Initialize message storage for tracking LLM responses - - Implement multi-round agent interaction logic - - Collect and process agent responses with proper metadata - - Extract final answer using result extractor - - Return formatted result with messages and final answer + +- Extract problem text from input dictionary +- Initialize message storage for tracking LLM responses +- Implement multi-round agent interaction logic +- Collect and process agent responses with proper metadata +- Extract final answer using result extractor +- Return formatted result with messages and final answer **๐Ÿ’ก ChatEval Implementation Example (run_agent Core Method):** -``` + +```python # mas_arena/agents/chateval.py async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: """Run iterative debate process""" @@ -174,14 +180,16 @@ class ChatEval(AgentSystem): #### Step 3: Implement `_create_agents` Method (Tool Integration Support) **๐Ÿ“‹ Implementation Guide:** - - Create specialized `AgentNode` instances for each role - - Set agent names, models, and system prompts - - Create result extractor with format 
prompt integration - - Return dictionary with "workers" key containing all components - - Ensure each worker has `.name` and `.llm` attributes for tool binding + +- Create specialized `AgentNode` instances for each role +- Set agent names, models, and system prompts +- Create result extractor with format prompt integration +- Return dictionary with "workers" key containing all components +- Ensure each worker has `.name` and `.llm` attributes for tool binding **๐Ÿ’ก ChatEval Implementation Example (_create_agents Tool Integration):** -``` + +```python # mas_arena/agents/chateval.py def _create_agents(self) -> List[Agent]: """Create multiple agent instances and result extractor""" @@ -213,14 +221,16 @@ class ChatEval(AgentSystem): #### Step 4: Register System with Framework **๐Ÿ“‹ Implementation Guide:** - - Use `AgentSystemRegistry.register()` to make system available - - Provide system name as string identifier - - Pass class reference (not instance) - - Include default configuration parameters - - These defaults can be overridden during initialization + +- Use `AgentSystemRegistry.register()` to make system available +- Provide system name as string identifier +- Pass class reference (not instance) +- Include default configuration parameters +- These defaults can be overridden during initialization **๐Ÿ’ก ChatEval Implementation Example (Registration):** -``` + +```python # mas_arena/agents/chateval.py # register agent system AgentSystemRegistry.register( @@ -231,18 +241,88 @@ AgentSystemRegistry.register( ) ``` +#### Step 5: \[Optional\] Add AgentOps Decorator for Better Tracing + +**๐Ÿ“‹ Implementation Guide:** + +- Import `agentops.sdk.decorators` +- Add tracing to your agent system for better observability +- Reference the [AgentOps Decorators Documentation](https://docs.agentops.ai/v2/concepts/decorators) for detailed usage + +**๐Ÿ’ก AgentOps Integration Example:** + +```python +# mas_arena/agents/chateval.py +from agentops.sdk.decorators import agent, 
operation, trace + +@agent +class ResultExtractor: + """Extract final results from conversation history with AgentOps tracking""" + def __init__(self, model_name: str = None, format_prompt: str = ""): + self.model_name = model_name or os.getenv("MODEL_NAME", "gpt-4o-mini") + self.format_prompt = format_prompt + self.llm = ChatOpenAI( + model=self.model_name, + request_timeout=60, + max_retries=2 + ) + self.name = "result_extractor" + + @operation + async def extract(self, all_histories: List[List[Dict[str, str]]], problem: str) -> Dict[str, Any]: + """Extract final answer with AgentOps operation tracking""" + # ...existing extraction implementation... + +@dataclass +class Agent: + """Represents an LLM agent with AgentOps tracking""" + agent_id: str + name: str + model_name: str + system_prompt: str + chat_history: List[Dict[str, str]] = None + + def __post_init__(self): + self.chat_history = [] + self.llm = ChatOpenAI( + model=self.model_name, + request_timeout=60, + max_retries=2 + ) + + @operation + async def generate_response(self, context: str) -> Any: + """Generate agent response with AgentOps operation tracking""" + # ...existing generation implementation... + +class ChatEval(AgentSystem): + """Multi-agent evaluation system with AgentOps tracing""" + + def __init__(self, name: str = "chateval", config: Dict[str, Any] = None): + super().__init__(name, config) + # ...existing initialization code... + + @trace + async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: + """Run iterative debate process with AgentOps trace tracking""" + # ...existing run_agent implementation... 
+ return result +``` + ### โšก Advanced Features #### ๐ŸŽจ Format Prompt Integration **๐Ÿ“‹ Implementation Guide:** - - Accept `format_prompt` parameter in initialization - - Store format prompt for benchmark-specific requirements - - Use format prompt in result extraction and agent prompts - - Configure timeout and retry settings for robust operation + +- Accept `format_prompt` parameter in initialization +- Store format prompt for benchmark-specific requirements +- Use format prompt in result extraction and agent prompts +- Configure timeout and retry settings for robust operation **๐Ÿ’ก ChatEval Implementation Example (Format Prompt Integration):** -``` + +```python # mas_arena/agents/chateval.py def __init__(self, model_name: str = None, format_prompt: str = ""): self.model_name = model_name or os.getenv("MODEL_NAME", "gpt-4o-mini") @@ -258,14 +338,16 @@ AgentSystemRegistry.register( #### ๐Ÿค– Agent Node Pattern **๐Ÿ“‹ Implementation Guide:** - - Use dataclass decorator for clean agent definition - - Include required attributes: agent_id, name, model_name, system_prompt - - Initialize chat history as empty list - - Set up LLM instance with timeout and retry configuration - - Ensure compatibility with tool integration framework + +- Use dataclass decorator for clean agent definition +- Include required attributes: agent_id, name, model_name, system_prompt +- Initialize chat history as empty list +- Set up LLM instance with timeout and retry configuration +- Ensure compatibility with tool integration framework **๐Ÿ’ก ChatEval Implementation Example (Agent Class Definition):** -``` + +```python # mas_arena/agents/chateval.py @dataclass class Agent: @@ -288,14 +370,16 @@ class Agent: #### ๐Ÿ”„ Usage Metadata Handling **๐Ÿ“‹ Implementation Guide:** - - For native OpenAI API calls or non-structured output: No manual handling required - - For structured output: Use `self.llm.with_structured_output(schema=AgentResponse, include_raw=True)` - - Usage metadata is 
automatically handled by the framework - - Focus on implementing the structured output schema instead + +- For native OpenAI API calls or non-structured output: No manual handling required +- For structured output: Use `self.llm.with_structured_output(schema=AgentResponse, include_raw=True)` +- Usage metadata is automatically handled by the framework +- Focus on implementing the structured output schema instead ### ๐Ÿ“‹ Key Implementation Summary **๐Ÿ”ง Implementation Points:** + - Inherit from `AgentSystem` base class - Implement required `run_agent()` method - Ensure config includes `evaluator` key @@ -316,14 +400,16 @@ Use `AgentSystemRegistry.register()` to register system and provide default conf #### Step 1: Basic Structure and Registration **๐Ÿ“‹ Implementation Guide:** - - Use `@register_benchmark` decorator to register evaluator - - Define normalization keys mapping for data field standardization - - Inherit from `BaseEvaluator` base class - - Provide comprehensive docstring explaining evaluator purpose - - Set up evaluator name and supported answer formats + +- Use `@register_benchmark` decorator to register evaluator +- Define normalization keys mapping for data field standardization +- Inherit from `BaseEvaluator` base class +- Provide comprehensive docstring explaining evaluator purpose +- Set up evaluator name and supported answer formats **๐Ÿ’ก MMLU_pro Implementation Example (Registration and Class Definition):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py @register_benchmark( name="mmlu_pro", @@ -345,14 +431,16 @@ class MMLU_ProEvaluator(BaseEvaluator): #### Step 2: Initialize Configuration **๐Ÿ“‹ Implementation Guide:** - - Call parent class initialization with name and config - - Set up evaluation-specific weights and parameters - - Configure dataset loading and validation - - Set up logging and error handling - - Define evaluation metrics and scoring methods + +- Call parent class initialization with name and config +- Set up 
evaluation-specific weights and parameters +- Configure dataset loading and validation +- Set up logging and error handling +- Define evaluation metrics and scoring methods **๐Ÿ’ก MMLU_pro Implementation Example (Initialization):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py def __init__(self, name="mmlu_pro", config=None): """ @@ -376,15 +464,17 @@ class MMLU_ProEvaluator(BaseEvaluator): #### Step 3: Implement Core Evaluation Method **๐Ÿ“‹ Implementation Guide:** - - Extract final answer and reference solution from inputs - - Use specialized answer extraction method for response parsing - - Apply scoring logic (exact match, numerical comparison, etc.) - - Calculate evaluation metrics and scores - - Return standardized evaluation results dictionary - - Include extracted answer and original final answer + +- Extract final answer and reference solution from inputs +- Use specialized answer extraction method for response parsing +- Apply scoring logic (exact match, numerical comparison, etc.) 
+- Calculate evaluation metrics and scores +- Return standardized evaluation results dictionary +- Include extracted answer and original final answer **๐Ÿ’ก MMLU_pro Implementation Example (evaluate Method):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py def evaluate(self, problem: Dict[str, Any], run_result: Dict[str, Any]) -> Dict[str, Any]: """ @@ -424,14 +514,16 @@ class MMLU_ProEvaluator(BaseEvaluator): #### ๐Ÿ” Answer Extraction **๐Ÿ“‹ Implementation Guide:** - - Use regular expressions to extract formatted answers - - Handle multiple answer formats (tags, patterns, raw text) - - Implement fallback strategies for unformatted responses - - Clean and normalize extracted text - - Support flexible answer parsing for different benchmarks + +- Use regular expressions to extract formatted answers +- Handle multiple answer formats (tags, patterns, raw text) +- Implement fallback strategies for unformatted responses +- Clean and normalize extracted text +- Support flexible answer parsing for different benchmarks **๐Ÿ’ก MMLU_pro Implementation Example (Answer Extraction):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py def extract_answer_from_response(self, response: str) -> str: """ @@ -455,14 +547,16 @@ class MMLU_ProEvaluator(BaseEvaluator): #### โœ… Answer Verification **๐Ÿ“‹ Implementation Guide:** - - Implement case-insensitive comparison for text answers - - Handle numerical index to letter conversion (1โ†’A, 2โ†’B, etc.) - - Apply normalization and cleaning to both reference and candidate - - Return numerical score (1.0 for match, 0.0 for no match) - - Include error handling for malformed inputs + +- Implement case-insensitive comparison for text answers +- Handle numerical index to letter conversion (1โ†’A, 2โ†’B, etc.) 
+- Apply normalization and cleaning to both reference and candidate +- Return numerical score (1.0 for match, 0.0 for no match) +- Include error handling for malformed inputs **๐Ÿ’ก MMLU_pro Implementation Example (Exact Match Verification):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py def check_exact_match(self, reference: str, candidate: str) -> float: """ @@ -499,15 +593,17 @@ class MMLU_ProEvaluator(BaseEvaluator): #### ๐Ÿ“Š Batch Evaluation **๐Ÿ“‹ Implementation Guide:** - - Iterate through all problems in the batch - - Extract problem IDs and reference answers for each item - - Apply evaluation logic consistently across all problems - - Collect comprehensive results with metadata - - Log evaluation progress and summary statistics - - Return standardized results format for benchmark runner + +- Iterate through all problems in the batch +- Extract problem IDs and reference answers for each item +- Apply evaluation logic consistently across all problems +- Collect comprehensive results with metadata +- Log evaluation progress and summary statistics +- Return standardized results format for benchmark runner **๐Ÿ’ก MMLU_pro Implementation Example (Batch Evaluation):** -``` + +```python # mas_arena/evaluators/mmlu_pro_evaluator.py def batch_evaluate(self, problems: List[Dict[str, Any]], **kwargs) -> List[Dict[str, Any]]: """ @@ -554,6 +650,7 @@ class MMLU_ProEvaluator(BaseEvaluator): ### ๐Ÿ’ป Code Evaluation **๐Ÿ”ง Code Evaluator Key Points:** + - Inherit from `BaseCodeEvaluator` base class (not BaseEvaluator) - Implement `check_solution(code, test, entry_point)` method - Implement `extract_code(text)` to extract code from responses @@ -561,6 +658,7 @@ class MMLU_ProEvaluator(BaseEvaluator): - Use isolated environments for code execution **๐Ÿ“Š Core Process Flow:** + 1. **Code Extraction** - Extract Python code from agent responses 2. **Environment Isolation** - Create secure execution environment 3. 
**Test Execution** - Run test cases to verify code correctness @@ -569,6 +667,7 @@ class MMLU_ProEvaluator(BaseEvaluator): ### ๐Ÿ“‹ Evaluator Implementation Summary **๐Ÿ”ง Core Components:** + - Use `@register_benchmark` decorator for registration - Inherit from `BaseEvaluator` base class - Implement required `evaluate()` method @@ -576,12 +675,14 @@ class MMLU_ProEvaluator(BaseEvaluator): - Optional: Implement answer extraction and verification methods **๐Ÿ“Š Evaluation Process:** + 1. **Data Normalization** - Map fields using normalization_keys 2. **Answer Extraction** - Extract final answer from messages 3. **Answer Verification** - Compare predicted vs reference answers 4. **Result Return** - Return score, extracted_answer, final_answer fields -> ๐Ÿ“„ **Complete Implementation References**: +> ๐Ÿ“„ **Complete Implementation References**: +> > - Text Evaluator: [`mas_arena/evaluators/mmlu_pro_evaluator.py`](../mas_arena/evaluators/mmlu_pro_evaluator.py) > - Code Evaluator: [`mas_arena/evaluators/humaneval_evaluator.py`](../mas_arena/evaluators/humaneval_evaluator.py) @@ -611,6 +712,7 @@ class MMLU_ProEvaluator(BaseEvaluator): ### ๐Ÿ“‹ Implementation Checklist **For MAS Extensions:** + - [ ] โœ… Config includes `evaluator` key - [ ] ๐Ÿ“Š Messages have `usage_metadata` for token tracking - [ ] ๐Ÿท๏ธ Agents have `name` and `llm` attributes (for tool integration) @@ -619,9 +721,9 @@ class MMLU_ProEvaluator(BaseEvaluator): - [ ] ๐Ÿ“‹ Proper registration with `AgentSystemRegistry` **For Evaluator Extensions:** + - [ ] ๐ŸŽฏ Used `@register_benchmark` decorator - [ ] โœ… Implemented `evaluate` method - [ ] ๐Ÿ—๏ธ Proper normalization_keys mapping - [ ] ๐Ÿ›ก๏ธ Error handling for malformed inputs - [ ] โฑ๏ธ Timeout handling for long operations - diff --git a/docs/index.md b/docs/index.md index 1c205c2..92e37b5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -12,6 +12,7 @@ A comprehensive framework for benchmarking single and multi-agent systems across * **๐Ÿ”ง 
Tool Support**: Manage tool selection via pluggable wrappers. * **๐Ÿงฉ Easy Extensions**: Add agents via subclassingโ€”no core changes. * **๐Ÿ“‚ Paired Datasets & Evaluators**: Add new benchmarks with minimal effort. +* **โฑ๏ธ Real-time Monitoring with AgentOps**: Track agent calls and costs instantly using AgentOps integration. ## ๐Ÿ™Œ Contributing diff --git a/docs/quick_start/usage.md b/docs/quick_start/usage.md index 3b8adff..aecbf58 100644 --- a/docs/quick_start/usage.md +++ b/docs/quick_start/usage.md @@ -4,18 +4,23 @@ This guide explains how to run benchmarks and use the automated workflow optimiz ## Prerequisites -1. **Install dependencies:** +1. **Install dependencies:** If you haven't already, install the required packages. We recommend using `uv`. + ```bash uv sync ``` -2. **Configure Environment Variables:** +2. **Configure Environment Variables:** Create a `.env` file in the project root and set your OpenAI API key and desired model. + + `AGENTOPS_API_KEY` is required to enable monitoring with AgentOps. You can obtain it from the [AgentOps website](https://app.agentops.ai/settings/projects). + ```bash OPENAI_API_KEY=your_openai_api_key MODEL_NAME=gpt-4o-mini OPENAI_API_BASE=https://api.openai.com/v1 + AGENTOPS_API_KEY=your_agentops_api_key ``` ## Running Benchmarks @@ -27,6 +32,7 @@ You can run benchmarks using the convenience shell script `run_benchmark.sh` (re The `run_benchmark.sh` script is the simplest way to run evaluations. **Syntax:** + ```bash # Usage: ./run_benchmark.sh [benchmark] [agent_system] [limit] [mcp_config] [concurrency] [optimizer] ./run_benchmark.sh math supervisor_mas 10 @@ -45,7 +51,7 @@ The `run_benchmark.sh` script is the simplest way to run evaluations. ## Automated Workflow Optimization (AFlow) -MASArena includes AFlow implementation, an automated optimizer for agent workflows. +MASArena includes AFlow implementation, an automated optimizer for agent workflows. 
**Example:** To run AFlow to optimize an agent for the `humaneval` benchmark, provide `aflow` as the optimizer argument to the shell script: diff --git a/main.py b/main.py index 5d01d96..62fa2fa 100644 --- a/main.py +++ b/main.py @@ -6,12 +6,15 @@ import time from pathlib import Path import asyncio +import dotenv +import agentops from mas_arena.benchmark_runner import BenchmarkRunner import logging logger = logging.getLogger(__name__) +dotenv.load_dotenv() def main(): # Parse command line arguments @@ -194,6 +197,16 @@ def main(): if args.verbose: print(f"Warning: {args.benchmark} benchmark does not support concurrency. Running synchronously.\n") + # Set up agent system monitoring with AgentOps + if not os.getenv("AGENTOPS_API_KEY"): + logger.warning( + """AGENTOPS_API_KEY cannot be found in `.env`. To view tracing data in agentops, please set the api key. +You can get the key at https://app.agentops.ai/settings/projects. +""" + ) + agentops.init(api_key=os.getenv("AGENTOPS_API_KEY", "")) + + # Run benchmark try: if run_async: diff --git a/mas_arena/agents/ChatDev.py b/mas_arena/agents/ChatDev.py index 016705b..5302527 100644 --- a/mas_arena/agents/ChatDev.py +++ b/mas_arena/agents/ChatDev.py @@ -11,10 +11,11 @@ from langchain_core.messages import SystemMessage, HumanMessage, AIMessage from mas_arena.agents.base import AgentSystem, AgentSystemRegistry +from agentops.sdk.decorators import trace, operation + # Load environment variables load_dotenv() - @dataclass class ChatDevAgent: """Base agent class in ChatDev system""" @@ -73,12 +74,10 @@ async def generate_response(self, context: str) -> Dict[str, Any]: "content": f"Error: {str(e)}" } - class Instructor(ChatDevAgent): """Instructor role (CTO, CEO, Tester, Reviewer)""" pass - class Assistant(ChatDevAgent): """Assistant role (CTO, Programmer)""" pass @@ -284,6 +283,7 @@ def check_solution(self, code: str, test: str, entry_point: str) -> Tuple[bool, except Exception as exc: return False, f"Execution error: {exc}", 
"" + @trace async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: """Run complete workflow of ChatDev system""" try: @@ -330,6 +330,7 @@ async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: "final_answer": f"Error in ChatDev workflow: {str(e)}" } + @operation async def _demand_analysis_phase(self, task: str, all_messages: List) -> str: """Demand Analysis phase - CEO and CPO discuss product form""" self.get_agent_by_role("CEO").clear_history() @@ -362,6 +363,7 @@ async def _demand_analysis_phase(self, task: str, all_messages: List) -> str: return modality + @operation async def _coding_phase(self, task: str, modality: str, language: str, all_messages: List, problem: Dict[str, Any]) -> str: """Coding phase - CTO guides Programmer to write code, includes up to 3 rounds of debate""" @@ -466,6 +468,7 @@ async def _coding_phase(self, task: str, modality: str, language: str, all_messa return current_code + @operation async def _code_complete_all_phase(self, task: str, modality: str, language: str, codes: str, all_messages: List) -> str: """Code Completion phase - Loop to complete all unimplemented files""" current_codes = codes @@ -557,6 +560,7 @@ async def _code_review_phase(self, task: str, modality: str, language: str, code return current_codes + @operation async def _test_phase(self, codes: str, problem: Dict[str, Any], all_messages: List) -> str: """Real Testing phase - Use real code execution and testing""" current_codes = codes @@ -652,6 +656,3 @@ async def _test_phase(self, codes: str, problem: Dict[str, Any], all_messages: L description="ChatDev multi-agent software development system, implementing complete software development workflow", max_iterations=3 ) - - - diff --git a/mas_arena/agents/EvoMAC.py b/mas_arena/agents/EvoMAC.py index 762f463..6cddad0 100644 --- a/mas_arena/agents/EvoMAC.py +++ b/mas_arena/agents/EvoMAC.py @@ -25,6 +25,9 @@ from mas_arena.agents.base import AgentSystem, AgentSystemRegistry 
from asyncio import Lock + +from agentops.sdk.decorators import agent, operation, trace + # Load environment variables for configuration load_dotenv(override=True) @@ -879,6 +882,7 @@ async def _call_llm_async(self, messages: List) -> Tuple[str, Optional[Any]]: print(f"LLM call failed: {e}") return "", None + @operation async def _generate_initial_implementation(self, problem_statement: str) -> Tuple[str, Optional[Any]]: """ Generate initial code implementation. @@ -896,6 +900,7 @@ async def _generate_initial_implementation(self, problem_statement: str) -> Tupl ) return await self._call_llm_async(messages) + @operation async def _organize_workflow(self, problem_statement: str) -> Tuple[str, Optional[Any]]: """ Generate workflow organization from CTO agent. @@ -984,6 +989,7 @@ async def _execute_single_task(self, problem_statement: str, task_name: str, tas # Update implementation with new code self.code_manager.update_from_response(response_content) + @operation async def _execute_testing_workflow(self, problem_statement: str) -> Tuple[bool, str]: """ Execute comprehensive testing workflow. @@ -1024,6 +1030,7 @@ async def _execute_testing_workflow(self, problem_statement: str) -> Tuple[bool, return has_any_bugs, "\n\n".join(all_test_reports) + @operation async def _organize_testing(self, problem_statement: str) -> Tuple[str, Optional[Any]]: """ Generate test organization plan. @@ -1047,6 +1054,7 @@ async def _organize_testing(self, problem_statement: str) -> Tuple[str, Optional ) return await self._call_llm_async(messages) + @operation async def _execute_test_task(self, problem_statement: str, task_name: str, task_description: str) -> Tuple[bool, str]: """ Execute a single test task. 
@@ -1087,6 +1095,7 @@ async def _execute_test_task(self, problem_statement: str, task_name: str, task_ return True, "Failed to extract test code" + @operation async def _perform_iterative_optimization(self, problem_statement: str, test_reports: str) -> Tuple[bool, str]: """ Perform iterative optimization to fix bugs. @@ -1120,6 +1129,7 @@ async def _perform_iterative_optimization(self, problem_statement: str, test_rep return True, current_reports + @operation async def _organize_updates(self, problem_statement: str, test_reports: str) -> Tuple[str, Optional[Any]]: """ Generate update organization to fix current issues. @@ -1165,6 +1175,7 @@ def _create_message_record(self, content: str, agent_name: str, response_obj: Op return message + @trace async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: """ Run the complete EvoMAC agent system on a given problem. diff --git a/mas_arena/agents/agentverse.py b/mas_arena/agents/agentverse.py index 88813a0..fa5a09a 100644 --- a/mas_arena/agents/agentverse.py +++ b/mas_arena/agents/agentverse.py @@ -11,6 +11,10 @@ from langchain_core.messages import SystemMessage, HumanMessage from mas_arena.agents.base import AgentSystem, AgentSystemRegistry + +from agentops.sdk.decorators import agent, trace, operation + + # Define TypedDict classes for structured output class ExpertTeam(TypedDict): """Expert team configuration""" @@ -55,6 +59,7 @@ class Discussion(TypedDict): class SumDiscussion(TypedDict): sum_context: List[Discussion] +@agent(name="recruiter_agent") class RecruiterAgent: """Recruitment agent: generates descriptions for work agents""" def __init__(self, agent_id: str, model_name: str = None, num_agents: int = 3): @@ -105,6 +110,7 @@ def _create_prompt(self, problem: str, feedback: str = None) -> str: Agent ID: {self.agent_id} """ + @operation async def describe(self, problem: str, feedback: str = None): messages = [ SystemMessage(content=self.system_prompt), @@ -225,6 +231,7 @@ def 
_create_default_experts(self) -> List[Dict[str, Any]]: return default_experts +@agent(name="work_agent") class WorkAgent: """Work agent that solves specific aspects of a problem""" def __init__(self, agent_id: str, system_prompt: str = None, format_prompt: str = ""): @@ -246,6 +253,7 @@ def __init__(self, agent_id: str, system_prompt: str = None, format_prompt: str max_tokens=1000 ) + @operation async def solve(self, problem: str, feedback: str = None): """Solve a problem with optional feedback""" feedback_section = "" @@ -314,6 +322,7 @@ async def solve(self, problem: str, feedback: str = None): "message": response, } +@agent(name="evaluator_agent") class Evaluator: """Evaluates agent solutions and decides whether to recruit new experts or provide final solution""" def __init__(self, model_name: str = None, max_iterations: int = 3, min_quality_threshold: float = 0.7, min_improvement_threshold: float = 0.1): @@ -326,6 +335,7 @@ def __init__(self, model_name: str = None, max_iterations: int = 3, min_quality_ model=self.model_name ) + @operation async def evaluate(self, problem: str, solutions: List[Dict[str, Any]], iteration: int, previous_solutions: List[Dict[str, Any]] = None, format_prompt: str = "") -> Dict[str, Any]: """ Evaluate solutions from multiple agents and decide whether to: @@ -718,6 +728,7 @@ async def _async_solve_problem(self, problem: str, workers: List[WorkAgent], fee return solutions + @trace async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: """ Run the agent system on a given problem. 
@@ -731,6 +742,7 @@ async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: Dictionary of run results including messages with usage metadata """ problem_text = problem["problem"] + # Initialize messages and solutions all_messages = [] diff --git a/mas_arena/agents/autogen.py b/mas_arena/agents/autogen.py index 95d9aff..587b5cf 100644 --- a/mas_arena/agents/autogen.py +++ b/mas_arena/agents/autogen.py @@ -3,6 +3,7 @@ from openai import AsyncOpenAI from dotenv import load_dotenv from mas_arena.agents.base import AgentSystem, AgentSystemRegistry +from agentops.sdk.decorators import agent, trace load_dotenv() @@ -31,6 +32,7 @@ def __init__(self, name: str = "autogen", config: Dict[str, Any] = None): } ] + @trace async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: problem_text = problem["problem"] diff --git a/mas_arena/agents/chateval.py b/mas_arena/agents/chateval.py index 2759ff4..8a9ceef 100644 --- a/mas_arena/agents/chateval.py +++ b/mas_arena/agents/chateval.py @@ -1,10 +1,14 @@ import os +import time from typing import Dict, List, Any, TypedDict from dataclasses import dataclass from langchain_openai import ChatOpenAI from langchain_core.messages import SystemMessage, HumanMessage, AIMessage from mas_arena.agents.base import AgentSystem, AgentSystemRegistry +from agentops.sdk.decorators import agent, operation, trace + + # define structured output class, use TypedDict instead of Pydantic class AgentResponse(TypedDict): """Structured output for agent responses""" @@ -12,6 +16,7 @@ class AgentResponse(TypedDict): solution: str # Solution confidence: int # Confidence level in the solution, range 1-5 +# @agent @dataclass class Agent: """Represents an LLM agent""" @@ -29,6 +34,7 @@ def __post_init__(self): max_retries=2 # Set maximum retry attempts to 2 ) + @operation async def generate_response(self, context: str) -> Any: """Generate agent response""" messages = [ @@ -96,6 +102,7 @@ async def generate_response(self, 
context: str) -> Any: "solution": response.content } +@agent class ResultExtractor: """Extract final results from conversation history""" def __init__(self, model_name: str = None, format_prompt: str = ""): @@ -108,6 +115,7 @@ def __init__(self, model_name: str = None, format_prompt: str = ""): ) self.name = "result_extractor" + @operation async def extract(self, all_histories: List[List[Dict[str, str]]], problem: str) -> Dict[str, Any]: """ Extract final answer from all agents' conversation histories @@ -233,6 +241,7 @@ def _get_agent_prompt(self, agent_index: int) -> str: You are the Critical Thinking Expert, focused on providing multi-angle perspective analysis.""" + @trace async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: """Run iterative debate process""" problem_text = problem["problem"] @@ -311,6 +320,6 @@ def _build_context(self, problem: str, agent_index: int, round_num: int) -> str: problem = { "problem": "A positive integer, its square root is 452, find this positive integer." 
} - agent = ChatEval(name="chateval", config={"num_agents": 3, "num_rounds": 2}) - result = agent.run_agent(problem) + chateval_agent = ChatEval(name="chateval", config={"num_agents": 3, "num_rounds": 2}) + result = chateval_agent.run_agent(problem) print(result) diff --git a/mas_arena/agents/evoagent.py b/mas_arena/agents/evoagent.py index e209dc2..1122b1f 100644 --- a/mas_arena/agents/evoagent.py +++ b/mas_arena/agents/evoagent.py @@ -11,6 +11,8 @@ from langchain_community.callbacks.openai_info import OpenAICallbackHandler from mas_arena.agents.base import AgentSystem, AgentSystemRegistry +from agentops.sdk.decorators import agent, trace, operation + import nest_asyncio nest_asyncio.apply() @@ -39,6 +41,7 @@ def print_agent_info(agent: 'Agent', score: float = None): print(f"{Colors.CYAN}Agent: {agent.name}{score_str}{Colors.ENDC}") print(f" System Prompt: {agent.system_prompt[:100]}...") +@agent @dataclass class Agent: """Represents an LLM agent""" @@ -166,6 +169,7 @@ def _initialize_base_agents(self) -> List[Agent]: return base_agents + @operation async def _crossover(self, parent1: Agent, parent2: Agent) -> Agent: """ Crossover operation: combine features of two parent agents to create offspring @@ -288,6 +292,7 @@ async def _crossover(self, parent1: Agent, parent2: Agent) -> Agent: return child + @operation async def _mutation(self, parent: Agent) -> Agent: """ Mutation operation: create a mutated offspring based on the parent agent @@ -426,6 +431,7 @@ def _calculate_score(self, result: Dict[str, Any], problem: Dict[str, Any]) -> f except Exception: return 0.0 + @operation async def _summarize_results(self, problem: str, results: List[Dict[str, Any]]) -> Tuple[str, Dict[str, Any]]: """ Use LLM to summarize results from multiple agents @@ -493,6 +499,7 @@ async def _summarize_results(self, problem: str, results: List[Dict[str, Any]]) best_result = max(results, key=lambda x: x.get("score", 0)) return best_result.get("extracted_answer", f"Unable to summarize 
results: {str(e)}"), {} + @trace async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: """ Run evolutionary agent system to solve given problem (async version) diff --git a/mas_arena/agents/jarvis.py b/mas_arena/agents/jarvis.py index f3fbfdf..18f4be5 100644 --- a/mas_arena/agents/jarvis.py +++ b/mas_arena/agents/jarvis.py @@ -28,6 +28,7 @@ from mas_arena.agents.base import AgentSystem, AgentSystemRegistry +from agentops.sdk.decorators import agent, operation, trace DEMONSTRATIONS: list[dict] = [] @@ -201,6 +202,7 @@ async def aplan( ) return self.output_parser.parse(llm_response, inputs["hf_tools"]) +@agent class ResponseGenerator: """Generates a response based on the input.""" @@ -208,6 +210,7 @@ def __init__(self, llm_chain: LLMChain, stop: Optional[List] = None): self.llm_chain = llm_chain self.stop = stop + @operation def generate(self, inputs: dict, callbacks: Callbacks = None, **kwargs: Any) -> str: """Given input, decided what to do.""" llm_response = self.llm_chain.run(**inputs, stop=self.stop, callbacks=callbacks) @@ -432,6 +435,7 @@ def load_chat_planner(llm: BaseLanguageModel) -> TaskPlanner: return TaskPlanner(llm = llm) +@agent class HuggingGPT: """Agent for interacting with HuggingGPT - Text Processing Version.""" @@ -443,6 +447,7 @@ def __init__(self, llm: BaseLanguageModel, tools: List[BaseTool], name: str = "j self.response_generator = load_response_generator(llm) self.task_executor = None + @operation def run(self, input: str) -> str: """Process text input through planning, execution, and response generation.""" # Plan tasks based on input @@ -623,6 +628,7 @@ def __init__(self, name: str = "jarvis", config: Dict[str, Any] | None = None): self.agent = HuggingGPT(self.llm, self.tools, name=self.name) # self.format_prompt is inherited from AgentSystem and set in super().__init__ + @trace async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: """ Runs the HuggingGPT agent. 
diff --git a/mas_arena/agents/llm_debate.py b/mas_arena/agents/llm_debate.py index 299f0d3..5eccd33 100644 --- a/mas_arena/agents/llm_debate.py +++ b/mas_arena/agents/llm_debate.py @@ -13,6 +13,9 @@ from dotenv import load_dotenv from mas_arena.agents.base import AgentSystem, AgentSystemRegistry +from agentops.sdk.decorators import operation, trace + + # Load environment variables load_dotenv(override=True) @@ -45,6 +48,7 @@ def __init__(self, name: str = "llm_debate", config: Dict[str, Any] = None): timeout=40 ) + @trace async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: """ Run the LLM Debate system on a given problem. @@ -215,6 +219,7 @@ def _construct_assistant_message(self, completion) -> Dict[str, str]: # ๅค‡็”จๆ–นๆกˆ return {"role": "assistant", "content": "ๆŠฑๆญ‰๏ผŒๆ— ๆณ•่Žทๅ–ๅ›ž็ญ”ๅ†…ๅฎนใ€‚"} + @operation async def _generate_answer_async(self, answer_context: List[Dict]) -> Any: """ ๅผ‚ๆญฅ็‰ˆๆœฌ็š„API่ฐƒ็”จๅ‡ฝๆ•ฐ @@ -257,6 +262,7 @@ def __init__(self): return None + @operation async def _call_llm(self, messages: List[Dict]) -> Dict[str, Any]: """ Call the LLM with given messages and return response with usage metadata. @@ -283,6 +289,7 @@ async def _call_llm(self, messages: List[Dict]) -> Dict[str, Any]: 'usage': None } + @operation async def _aggregate_answers(self, query: str, answers: List[str]) -> Dict[str, Any]: """ Aggregate all agents' final answers into a single result. 
diff --git a/mas_arena/agents/mad.py b/mas_arena/agents/mad.py index 30eb7fb..d401e32 100644 --- a/mas_arena/agents/mad.py +++ b/mas_arena/agents/mad.py @@ -8,6 +8,8 @@ from langchain_core.messages import SystemMessage, HumanMessage from mas_arena.agents.base import AgentSystem, AgentSystemRegistry +from agentops.sdk.decorators import operation, trace + load_dotenv(override=True) @dataclass @@ -41,6 +43,7 @@ def add_memory(self, memory: str): """Add generated response to memory""" self.memory_lst.append({"role": "assistant", "content": memory}) + @operation async def ask(self): """Query and get response""" from langchain_core.messages import AIMessage @@ -124,6 +127,7 @@ def round_dct(self, num: int) -> str: } return dct.get(num, str(num)) + @trace async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: """Run debate process""" diff --git a/mas_arena/agents/metagpt.py b/mas_arena/agents/metagpt.py index e924223..6a5b34c 100644 --- a/mas_arena/agents/metagpt.py +++ b/mas_arena/agents/metagpt.py @@ -8,7 +8,9 @@ from langchain_core.messages import SystemMessage, HumanMessage, AIMessage from mas_arena.agents.base import AgentSystem, AgentSystemRegistry +from agentops.sdk.decorators import agent, operation, trace +@agent @dataclass class Agent: name: str @@ -277,6 +279,7 @@ def _subscribe_messages(self, agent_name: str, message_type: str = None) -> List messages.append(message) return messages + @operation async def _run_agent_task(self, agent_name: str, task: Dict[str, Any]) -> Dict[str, Any]: agent = self.agents[agent_name] messages = [SystemMessage(content=agent.system_prompt), HumanMessage(content=str(task))] @@ -368,6 +371,7 @@ def _extract_suggestions(self, qa_content: str) -> List[str]: def _need_iteration(self, qa_content: str) -> bool: return bool(self._extract_bugs(qa_content) or self._extract_suggestions(qa_content)) + @trace async def run_agent(self, task_data: Dict[str, Any], **kwargs) -> Dict[str, Any]: """ Run the agent system on a 
task. diff --git a/mas_arena/agents/single_agent.py b/mas_arena/agents/single_agent.py index 7ac3f17..78658ec 100644 --- a/mas_arena/agents/single_agent.py +++ b/mas_arena/agents/single_agent.py @@ -13,6 +13,8 @@ from dotenv import load_dotenv from mas_arena.agents.base import AgentSystem, AgentSystemRegistry +from agentops.sdk.decorators import trace + # Load environment variables load_dotenv() @@ -39,6 +41,7 @@ def __init__(self, name: str = "single_agent", config: Dict[str, Any] = None): else: self.client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url=os.getenv("OPENAI_API_BASE")) + @trace async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: """ Run the agent system on a given problem. diff --git a/mas_arena/agents/swarm.py b/mas_arena/agents/swarm.py index 497ad82..b88dfa4 100644 --- a/mas_arena/agents/swarm.py +++ b/mas_arena/agents/swarm.py @@ -17,10 +17,12 @@ from mas_arena.agents.base import AgentSystem, AgentSystemRegistry +from agentops.sdk.decorators import agent, operation, trace + # Load environment variables load_dotenv() - +@agent class SwarmAgent: """Individual agent in the swarm""" @@ -41,7 +43,8 @@ def __init__(self, agent_id: str, model_name: str = None, system_prompt: str = N ) self.llm = ChatOpenAI(model=self.model_name) self.name = agent_id - + + @operation async def solve(self, problem: str) -> Dict[str, Any]: """ Solve a problem independently. @@ -85,7 +88,7 @@ def _create_prompt(self, problem: str) -> str: Agent ID: {self.agent_id} """ - +@agent class Aggregator: """Aggregates results from swarm agents to produce a final solution""" @@ -101,6 +104,7 @@ def __init__(self, model_name: str = None, format_prompt: str = None): self.name = "aggregator" self.format_prompt = format_prompt + @operation async def aggregate(self, problem: str, solutions: List[Dict[str, Any]]) -> Dict[str, Any]: """ Aggregate solutions from multiple agents. 
@@ -196,9 +200,10 @@ def _create_agents(self, problem_input: Dict[str, Any], feedback: Dict[str, Any] def _get_system_prompt(self) -> str: """Get system prompt for an agent based on its index""" base_prompt = "You are an intelligent AI assistant specialized in solving problems carefully and step by step." return base_prompt + @trace async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]: """ Run the agent system on a given problem. diff --git a/pyproject.toml b/pyproject.toml index 2e8ff02..e4ec6d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "project-multi-agents-benchmark" +name = "MASArena" version = "0.1.0" description = "Add your description here" readme = "README.md" @@ -21,7 +21,7 @@ dependencies = [ "networkx>=3.4.2", "nltk>=3.9.1", "numpy>=1.26.4", - "openai>=1.37.1", + "openai==1.85.0", "pandas>=2.2.3", "playwright>=1.46.0", "psutil>=7.0.0", @@ -51,6 +51,7 @@ dependencies = [ "pyyaml", "regex", "pydantic_core", + "agentops>=0.4.19", ] [tool.ruff] diff --git a/requirements.txt b/requirements.txt index 83f54d4..203c565 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,21 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile --all-extras --output-file requirements.txt pyproject.toml absl-py==2.2.2 + # via masarena (pyproject.toml) +agentops==0.4.19 + # via masarena (pyproject.toml) aiofiles==24.1.0 + # via masarena (pyproject.toml) aiohappyeyeballs==2.4.6 # via aiohttp aiohttp==3.11.13 - # via langchain-community + # via + # agentops + # langchain-community aiosignal==1.3.1 # via aiohttp aliyun-python-sdk-core==2.13.3 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) annotated-types==0.7.0 # via pydantic antlr4-python3-runtime==4.13.2 @@ -20,7 +28,7 @@ anyio==4.9.0 # sse-starlette # starlette asteval==1.0.6 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) attrs==25.1.0 # 
via aiohttp certifi==2025.1.31 @@ -43,12 +51,13 @@ dataclasses-json==0.6.7 distro==1.9.0 # via openai docx2markdown==0.1.1 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) filelock==3.17.0 # via # huggingface-hub # torch # transformers + # triton fonttools==4.58.5 # via matplotlib frozenlist==1.5.0 @@ -59,8 +68,10 @@ fsspec==2024.12.0 # via # huggingface-hub # torch +googleapis-common-protos==1.70.0 + # via opentelemetry-exporter-otlp-proto-http gputil==1.4.0 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) greenlet==3.1.1 # via # playwright @@ -75,6 +86,7 @@ httpcore==1.0.7 # via httpx httpx==0.28.1 # via + # agentops # langgraph-sdk # langsmith # mcp @@ -96,7 +108,9 @@ idna==3.10 # requests # yarl immutabledict==4.2.1 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) +importlib-metadata==8.7.0 + # via opentelemetry-api jinja2==3.1.6 # via torch jiter==0.8.2 @@ -115,13 +129,13 @@ kiwisolver==1.4.8 # via matplotlib langchain==0.3.20 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # langchain-community langchain-community==0.3.19 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) langchain-core==0.3.43 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # langchain # langchain-community # langchain-mcp-adapters @@ -130,28 +144,28 @@ langchain-core==0.3.43 # langgraph # langgraph-checkpoint langchain-mcp-adapters==0.1.7 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) langchain-openai==0.3.7 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) langchain-text-splitters==0.3.6 # via langchain langdetect==1.0.9 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) langgraph==0.2.74 - # via project-multi-agents-benchmark 
(pyproject.toml) + # via masarena (pyproject.toml) langgraph-checkpoint==2.0.16 # via langgraph langgraph-sdk==0.1.53 # via langgraph langsmith==0.3.11 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # langchain # langchain-community # langchain-core latex2sympy2-extended==1.10.1 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # math-verify lxml==5.4.0 # via @@ -164,10 +178,10 @@ markupsafe==3.0.2 marshmallow==3.26.1 # via dataclasses-json math-verify==0.7.0 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) matplotlib==3.10.3 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # seaborn mcp==1.9.4 # via langchain-mcp-adapters @@ -184,16 +198,16 @@ multidict==6.1.0 mypy-extensions==1.0.0 # via typing-inspect nest-asyncio==1.6.0 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) networkx==3.4.2 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # torch nltk==3.9.1 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) numpy==1.26.4 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # contourpy # langchain-community # matplotlib @@ -202,43 +216,110 @@ numpy==1.26.4 # scipy # seaborn # transformers -openai==1.64.0 +nvidia-cublas-cu12==12.1.3.1 # via - # project-multi-agents-benchmark (pyproject.toml) + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via torch +nvidia-cuda-runtime-cu12==12.1.105 + # via torch +nvidia-cudnn-cu12==8.9.2.26 + # via torch +nvidia-cufft-cu12==11.0.2.54 + # via torch +nvidia-curand-cu12==10.3.2.106 + # via torch +nvidia-cusolver-cu12==11.4.5.107 + # via torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # nvidia-cusolver-cu12 + # torch 
+nvidia-nccl-cu12==2.19.3 + # via torch +nvidia-nvjitlink-cu12==12.9.86 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via torch +openai==1.85.0 + # via + # masarena (pyproject.toml) # langchain-openai +opentelemetry-api==1.36.0 + # via + # agentops + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-instrumentation + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp-proto-common==1.36.0 + # via opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-http==1.36.0 + # via agentops +opentelemetry-instrumentation==0.57b0 + # via agentops +opentelemetry-proto==1.36.0 + # via + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.36.0 + # via + # agentops + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.57b0 + # via + # agentops + # opentelemetry-instrumentation + # opentelemetry-sdk +ordered-set==4.1.0 + # via agentops orjson==3.10.15 # via # langgraph-sdk # langsmith packaging==24.2 # via + # agentops # huggingface-hub # langchain-core # langsmith # marshmallow # matplotlib + # opentelemetry-instrumentation # transformers pandas==2.2.3 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # seaborn pillow==11.1.0 # via # matplotlib # sentence-transformers playwright==1.52.0 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) propcache==0.3.0 # via # aiohttp # yarl +protobuf==6.32.0 + # via + # googleapis-common-protos + # opentelemetry-proto psutil==7.0.0 - # via project-multi-agents-benchmark (pyproject.toml) + # via + # masarena (pyproject.toml) + # agentops pycryptodome==3.23.0 # via aliyun-python-sdk-core pydantic==2.10.6 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # langchain # langchain-core # langsmith @@ -247,7 +328,7 @@ pydantic==2.10.6 # 
pydantic-settings pydantic-core==2.27.2 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # pydantic pydantic-settings==2.8.1 # via @@ -260,7 +341,7 @@ pygments==2.19.2 pyparsing==3.2.3 # via matplotlib pypdf2==3.0.1 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) python-dateutil==2.9.0.post0 # via # matplotlib @@ -269,7 +350,7 @@ python-docx==1.2.0 # via docx2markdown python-dotenv==1.0.1 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # pydantic-settings python-multipart==0.0.20 # via mcp @@ -277,7 +358,8 @@ pytz==2025.1 # via pandas pyyaml==6.0.2 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) + # agentops # huggingface-hub # langchain # langchain-community @@ -285,17 +367,19 @@ pyyaml==6.0.2 # transformers regex==2024.11.6 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # nltk # tiktoken # transformers requests==2.32.3 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) + # agentops # huggingface-hub # langchain # langchain-community # langsmith + # opentelemetry-exporter-otlp-proto-http # requests-toolbelt # tavily-python # tika @@ -304,7 +388,7 @@ requests==2.32.3 requests-toolbelt==1.0.0 # via langsmith rich==14.0.0 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) safetensors==0.5.3 # via transformers scikit-learn==1.7.0 @@ -314,9 +398,9 @@ scipy==1.15.3 # scikit-learn # sentence-transformers seaborn==0.13.2 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) sentence-transformers==4.1.0 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) setuptools==80.9.0 # via tika six==1.17.0 @@ -336,22 +420,24 @@ sse-starlette==2.3.6 starlette==0.47.0 # via mcp stopit==1.1.2 - # via project-multi-agents-benchmark 
(pyproject.toml) + # via masarena (pyproject.toml) sympy==1.13.3 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # latex2sympy2-extended # torch tavily-python==0.7.7 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) tenacity==9.0.0 # via # langchain-community # langchain-core +termcolor==2.4.0 + # via agentops threadpoolctl==3.6.0 # via scikit-learn tika==3.1.0 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) tiktoken==0.9.0 # via # langchain-openai @@ -360,11 +446,11 @@ tokenizers==0.21.1 # via transformers torch==2.2.2 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # sentence-transformers tqdm==4.67.1 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # huggingface-hub # nltk # openai @@ -372,18 +458,24 @@ tqdm==4.67.1 # transformers transformers==4.52.4 # via - # project-multi-agents-benchmark (pyproject.toml) + # masarena (pyproject.toml) # sentence-transformers tree-sitter==0.24.0 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) tree-sitter-python==0.23.6 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) +triton==2.2.0 + # via torch typing-extensions==4.12.2 # via # anyio # huggingface-hub # langchain-core # openai + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-sdk + # opentelemetry-semantic-conventions # pydantic # pydantic-core # pyee @@ -400,9 +492,15 @@ urllib3==2.3.0 # via requests uvicorn==0.34.3 # via mcp +wrapt==1.17.3 + # via + # agentops + # opentelemetry-instrumentation xmltodict==0.14.2 - # via project-multi-agents-benchmark (pyproject.toml) + # via masarena (pyproject.toml) yarl==1.18.3 # via aiohttp +zipp==3.23.0 + # via importlib-metadata zstandard==0.23.0 # via langsmith diff --git a/uv.lock b/uv.lock index 1b8a5cd..520d1ba 100644 --- 
a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 1 +revision = 2 requires-python = ">=3.11, <3.13" resolution-markers = [ "python_full_version >= '3.12.4'", @@ -17,6 +17,31 @@ wheels = [ { url = "https://mirrors.aliyun.com/pypi/packages/8f/aa/ba0014cc4659328dc818a28827be78e6d97312ab0cb98105a770924dc11e/absl_py-2.3.1-py3-none-any.whl", hash = "sha256:eeecf07f0c2a93ace0772c92e596ace6d3d3996c042b2128459aaae2a76de11d" }, ] +[[package]] +name = "agentops" +version = "0.4.19" +source = { registry = "https://mirrors.aliyun.com/pypi/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "httpx" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-http" }, + { name = "opentelemetry-instrumentation" }, + { name = "opentelemetry-sdk" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "ordered-set" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyyaml" }, + { name = "requests" }, + { name = "termcolor" }, + { name = "wrapt" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/b0/d0/28a12fc847ff1594f1ff42b8ad0d9ab0b6f601eb7bda9624847f02ea24f4/agentops-0.4.19.tar.gz", hash = "sha256:63e5b770cf6b0c2fac5eb783054d506eb739a53e163cc7fb237b70c8facc37d9" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/b5/5c/034f99ce2cfb26ffad0236e5b25d1b667fa4464157577e14d80717f1c342/agentops-0.4.19-py3-none-any.whl", hash = "sha256:848f679075d6f95f4c9345ce2d89cce59f8827f5fb8a70a68c870b1611ba8193" }, +] + [[package]] name = "aiofiles" version = "24.1.0" @@ -457,6 +482,18 @@ wheels = [ { url = "https://mirrors.aliyun.com/pypi/packages/bb/61/78c7b3851add1481b048b5fdc29067397a1784e2910592bc81bb3f608635/fsspec-2025.5.1-py3-none-any.whl", hash = "sha256:24d3a2e663d5fc735ab256263c4075f374a174c3410c0b25e5bd1970bceaa462" }, ] +[[package]] +name = "googleapis-common-protos" +version = "1.70.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple" } +dependencies = [ + { name = "protobuf" 
}, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/39/24/33db22342cf4a2ea27c9955e6713140fedd51e8b141b5ce5260897020f1a/googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8" }, +] + [[package]] name = "gputil" version = "1.4.0" @@ -587,6 +624,18 @@ wheels = [ { url = "https://mirrors.aliyun.com/pypi/packages/59/56/25ca7b848164b7d93dbd5fc97dd7751700c93e324fe854afbeb562ee2f98/immutabledict-4.2.1-py3-none-any.whl", hash = "sha256:c56a26ced38c236f79e74af3ccce53772827cef5c3bce7cab33ff2060f756373" }, ] +[[package]] +name = "importlib-metadata" +version = "8.7.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple" } +dependencies = [ + { name = "zipp" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/76/66/650a33bd90f786193e4de4b3ad86ea60b53c89b669a5c7be931fac31cdb0/importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/20/b0/36bd937216ec521246249be3bf9855081de4c5e06a0c9b4219dbeda50373/importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd" }, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -1378,7 +1427,7 @@ wheels = [ [[package]] name = "openai" -version = "1.64.0" +version = "1.85.0" source = { registry = "https://mirrors.aliyun.com/pypi/simple" } dependencies = [ { name = "anyio" }, @@ -1390,9 +1439,115 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = 
"https://mirrors.aliyun.com/pypi/packages/7b/1d/aae78d8ecc571d672c4a27794a8f248bc46437a22ddcb9c4eb6fd6616c03/openai-1.64.0.tar.gz", hash = "sha256:2861053538704d61340da56e2f176853d19f1dc5704bc306b7597155f850d57a" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/22/3c/1143dc0a865d06482454fddb35d739c9260b18d721f01287f79cc53a315f/openai-1.85.0.tar.gz", hash = "sha256:6ba76e4ebc5725f71f2f6126c7cb5169ca8de60dd5aa61f350f9448ad162c913" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/a0/73/b4427c7873f4f778ec7a6d2b1724fd3aadc85719a12e324615b9c2bc614f/openai-1.85.0-py3-none-any.whl", hash = "sha256:7dc3e839cb8bb8747979a90c63ad4cb25a8e0cbec17b53eec009532c9965cecf" }, +] + +[[package]] +name = "opentelemetry-api" +version = "1.36.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple" } +dependencies = [ + { name = "importlib-metadata" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/27/d2/c782c88b8afbf961d6972428821c302bd1e9e7bc361352172f0ca31296e2/opentelemetry_api-1.36.0.tar.gz", hash = "sha256:9a72572b9c416d004d492cbc6e61962c0501eaf945ece9b5a0f56597d8348aa0" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/bb/ee/6b08dde0a022c463b88f55ae81149584b125a42183407dc1045c486cc870/opentelemetry_api-1.36.0-py3-none-any.whl", hash = "sha256:02f20bcacf666e1333b6b1f04e647dc1d5111f86b8e510238fcc56d7762cda8c" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-common" +version = "1.36.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple" } +dependencies = [ + { name = "opentelemetry-proto" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/34/da/7747e57eb341c59886052d733072bc878424bf20f1d8cf203d508bbece5b/opentelemetry_exporter_otlp_proto_common-1.36.0.tar.gz", hash = "sha256:6c496ccbcbe26b04653cecadd92f73659b814c6e3579af157d8716e5f9f25cbf" } +wheels = [ + { url = 
"https://mirrors.aliyun.com/pypi/packages/d0/ed/22290dca7db78eb32e0101738366b5bbda00d0407f00feffb9bf8c3fdf87/opentelemetry_exporter_otlp_proto_common-1.36.0-py3-none-any.whl", hash = "sha256:0fc002a6ed63eac235ada9aa7056e5492e9a71728214a61745f6ad04b923f840" }, +] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-http" +version = "1.36.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple" } +dependencies = [ + { name = "googleapis-common-protos" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-exporter-otlp-proto-common" }, + { name = "opentelemetry-proto" }, + { name = "opentelemetry-sdk" }, + { name = "requests" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/25/85/6632e7e5700ba1ce5b8a065315f92c1e6d787ccc4fb2bdab15139eaefc82/opentelemetry_exporter_otlp_proto_http-1.36.0.tar.gz", hash = "sha256:dd3637f72f774b9fc9608ab1ac479f8b44d09b6fb5b2f3df68a24ad1da7d356e" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/7f/41/a680d38b34f8f5ddbd78ed9f0042e1cc712d58ec7531924d71cb1e6c629d/opentelemetry_exporter_otlp_proto_http-1.36.0-py3-none-any.whl", hash = "sha256:3d769f68e2267e7abe4527f70deb6f598f40be3ea34c6adc35789bea94a32902" }, +] + +[[package]] +name = "opentelemetry-instrumentation" +version = "0.57b0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "packaging" }, + { name = "wrapt" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/12/37/cf17cf28f945a3aca5a038cfbb45ee01317d4f7f3a0e5209920883fe9b08/opentelemetry_instrumentation-0.57b0.tar.gz", hash = "sha256:f2a30135ba77cdea2b0e1df272f4163c154e978f57214795d72f40befd4fcf05" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/d0/6f/f20cd1542959f43fb26a5bf9bb18cd81a1ea0700e8870c8f369bd07f5c65/opentelemetry_instrumentation-0.57b0-py3-none-any.whl", hash = 
"sha256:9109280f44882e07cec2850db28210b90600ae9110b42824d196de357cbddf7e" }, +] + +[[package]] +name = "opentelemetry-proto" +version = "1.36.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple" } +dependencies = [ + { name = "protobuf" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/fd/02/f6556142301d136e3b7e95ab8ea6a5d9dc28d879a99f3dd673b5f97dca06/opentelemetry_proto-1.36.0.tar.gz", hash = "sha256:0f10b3c72f74c91e0764a5ec88fd8f1c368ea5d9c64639fb455e2854ef87dd2f" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/b3/57/3361e06136225be8180e879199caea520f38026f8071366241ac458beb8d/opentelemetry_proto-1.36.0-py3-none-any.whl", hash = "sha256:151b3bf73a09f94afc658497cf77d45a565606f62ce0c17acb08cd9937ca206e" }, +] + +[[package]] +name = "opentelemetry-sdk" +version = "1.36.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/4c/85/8567a966b85a2d3f971c4d42f781c305b2b91c043724fa08fd37d158e9dc/opentelemetry_sdk-1.36.0.tar.gz", hash = "sha256:19c8c81599f51b71670661ff7495c905d8fdf6976e41622d5245b791b06fa581" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/0b/59/7bed362ad1137ba5886dac8439e84cd2df6d087be7c09574ece47ae9b22c/opentelemetry_sdk-1.36.0-py3-none-any.whl", hash = "sha256:19fe048b42e98c5c1ffe85b569b7073576ad4ce0bcb6e9b4c6a39e890a6c45fb" }, +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.57b0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/7e/31/67dfa252ee88476a29200b0255bda8dfc2cf07b56ad66dc9a6221f7dc787/opentelemetry_semantic_conventions-0.57b0.tar.gz", hash = 
"sha256:609a4a79c7891b4620d64c7aac6898f872d790d75f22019913a660756f27ff32" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/05/75/7d591371c6c39c73de5ce5da5a2cc7b72d1d1cd3f8f4638f553c01c37b11/opentelemetry_semantic_conventions-0.57b0-py3-none-any.whl", hash = "sha256:757f7e76293294f124c827e514c2a3144f191ef175b069ce8d1211e1e38e9e78" }, +] + +[[package]] +name = "ordered-set" +version = "4.1.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/4c/ca/bfac8bc689799bcca4157e0e0ced07e70ce125193fc2e166d2e685b7e2fe/ordered-set-4.1.0.tar.gz", hash = "sha256:694a8e44c87657c59292ede72891eb91d34131f6531463aab3009191c77364a8" } wheels = [ - { url = "https://mirrors.aliyun.com/pypi/packages/9a/1a/e62718f311daa26d208800976d7944e5ee6d503e1ea474522b2a15a904bb/openai-1.64.0-py3-none-any.whl", hash = "sha256:20f85cde9e95e9fbb416e3cb5a6d3119c0b28308afd6e3cc47bf100623dac623" }, + { url = "https://mirrors.aliyun.com/pypi/packages/33/55/af02708f230eb77084a299d7b08175cff006dea4f2721074b92cdb0296c0/ordered_set-4.1.0-py3-none-any.whl", hash = "sha256:046e1132c71fcf3330438a539928932caf51ddbc582496833e23de611de14562" }, ] [[package]] @@ -1522,6 +1677,7 @@ version = "0.1.0" source = { virtual = "." 
} dependencies = [ { name = "absl-py" }, + { name = "agentops" }, { name = "aiofiles" }, { name = "aliyun-python-sdk-core" }, { name = "asteval" }, @@ -1573,6 +1729,7 @@ dependencies = [ [package.metadata] requires-dist = [ { name = "absl-py", specifier = ">=2.2.2" }, + { name = "agentops", specifier = ">=0.4.19" }, { name = "aiofiles", specifier = ">=24.1.0" }, { name = "aliyun-python-sdk-core" }, { name = "asteval", specifier = ">=0.9.31" }, @@ -1594,7 +1751,7 @@ requires-dist = [ { name = "networkx", specifier = ">=3.4.2" }, { name = "nltk", specifier = ">=3.9.1" }, { name = "numpy", specifier = ">=1.26.4" }, - { name = "openai", specifier = ">=1.37.1" }, + { name = "openai", specifier = "==1.85.0" }, { name = "pandas", specifier = ">=2.2.3" }, { name = "playwright", specifier = ">=1.46.0" }, { name = "psutil", specifier = ">=7.0.0" }, @@ -1661,6 +1818,20 @@ wheels = [ { url = "https://mirrors.aliyun.com/pypi/packages/b5/35/6c4c6fc8774a9e3629cd750dc24a7a4fb090a25ccd5c3246d127b70f9e22/propcache-0.3.0-py3-none-any.whl", hash = "sha256:67dda3c7325691c2081510e92c561f465ba61b975f481735aefdfc845d2cd043" }, ] +[[package]] +name = "protobuf" +version = "6.32.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/c0/df/fb4a8eeea482eca989b51cffd274aac2ee24e825f0bf3cbce5281fa1567b/protobuf-6.32.0.tar.gz", hash = "sha256:a81439049127067fc49ec1d36e25c6ee1d1a2b7be930675f919258d03c04e7d2" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/33/18/df8c87da2e47f4f1dcc5153a81cd6bca4e429803f4069a299e236e4dd510/protobuf-6.32.0-cp310-abi3-win32.whl", hash = "sha256:84f9e3c1ff6fb0308dbacb0950d8aa90694b0d0ee68e75719cb044b7078fe741" }, + { url = "https://mirrors.aliyun.com/pypi/packages/e1/59/0a820b7310f8139bd8d5a9388e6a38e1786d179d6f33998448609296c229/protobuf-6.32.0-cp310-abi3-win_amd64.whl", hash = "sha256:a8bdbb2f009cfc22a36d031f22a625a38b615b5e19e558a7b756b3279723e68e" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/cc/5b/0d421533c59c789e9c9894683efac582c06246bf24bb26b753b149bd88e4/protobuf-6.32.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:d52691e5bee6c860fff9a1c86ad26a13afbeb4b168cd4445c922b7e2cf85aaf0" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ec/7b/607764ebe6c7a23dcee06e054fd1de3d5841b7648a90fd6def9a3bb58c5e/protobuf-6.32.0-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:501fe6372fd1c8ea2a30b4d9be8f87955a64d6be9c88a973996cef5ef6f0abf1" }, + { url = "https://mirrors.aliyun.com/pypi/packages/40/01/2e730bd1c25392fc32e3268e02446f0d77cb51a2c3a8486b1798e34d5805/protobuf-6.32.0-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:75a2aab2bd1aeb1f5dc7c5f33bcb11d82ea8c055c9becbb41c26a8c43fd7092c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9c/f2/80ffc4677aac1bc3519b26bc7f7f5de7fce0ee2f7e36e59e27d8beb32dd1/protobuf-6.32.0-py3-none-any.whl", hash = "sha256:ba377e5b67b908c8f3072a57b63e2c6a4cbd18aea4ed98d2584350dbf46f2783" }, +] + [[package]] name = "psutil" version = "7.0.0" @@ -2179,6 +2350,15 @@ wheels = [ { url = "https://mirrors.aliyun.com/pypi/packages/b6/cb/b86984bed139586d01532a587464b5805f12e397594f19f931c4c2fbfa61/tenacity-9.0.0-py3-none-any.whl", hash = "sha256:93de0c98785b27fcf659856aa9f54bfbd399e29969b0621bc7f762bd441b4539" }, ] +[[package]] +name = "termcolor" +version = "2.4.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/10/56/d7d66a84f96d804155f6ff2873d065368b25a07222a6fd51c4f24ef6d764/termcolor-2.4.0.tar.gz", hash = "sha256:aab9e56047c8ac41ed798fa36d892a37aca6b3e9159f3e0c24bc64a9b3ac7b7a" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/d9/5f/8c716e47b3a50cbd7c146f45881e11d9414def768b7cd9c5e6650ec2a80a/termcolor-2.4.0-py3-none-any.whl", hash = "sha256:9297c0df9c99445c2412e832e882a7884038a25617c60cea2ad69488d4040d63" }, +] + [[package]] name = "threadpoolctl" version = "3.6.0" @@ -2425,6 
+2605,35 @@ wheels = [ { url = "https://mirrors.aliyun.com/pypi/packages/6d/0d/8adfeaa62945f90d19ddc461c55f4a50c258af7662d34b6a3d5d1f8646f6/uvicorn-0.34.3-py3-none-any.whl", hash = "sha256:16246631db62bdfbf069b0645177d6e8a77ba950cfedbfd093acef9444e4d885" }, ] +[[package]] +name = "wrapt" +version = "1.17.3" +source = { registry = "https://mirrors.aliyun.com/pypi/simple" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/95/8f/aeb76c5b46e273670962298c23e7ddde79916cb74db802131d49a85e4b7d/wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/52/db/00e2a219213856074a213503fdac0511203dceefff26e1daa15250cc01a0/wrapt-1.17.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:273a736c4645e63ac582c60a56b0acb529ef07f78e08dc6bfadf6a46b19c0da7" }, + { url = "https://mirrors.aliyun.com/pypi/packages/5e/30/ca3c4a5eba478408572096fe9ce36e6e915994dd26a4e9e98b4f729c06d9/wrapt-1.17.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5531d911795e3f935a9c23eb1c8c03c211661a5060aab167065896bbf62a5f85" }, + { url = "https://mirrors.aliyun.com/pypi/packages/31/25/3e8cc2c46b5329c5957cec959cb76a10718e1a513309c31399a4dad07eb3/wrapt-1.17.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0610b46293c59a3adbae3dee552b648b984176f8562ee0dba099a56cfbe4df1f" }, + { url = "https://mirrors.aliyun.com/pypi/packages/5d/8f/a32a99fc03e4b37e31b57cb9cefc65050ea08147a8ce12f288616b05ef54/wrapt-1.17.3-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b32888aad8b6e68f83a8fdccbf3165f5469702a7544472bdf41f582970ed3311" }, + { url = "https://mirrors.aliyun.com/pypi/packages/31/57/4930cb8d9d70d59c27ee1332a318c20291749b4fba31f113c2f8ac49a72e/wrapt-1.17.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cccf4f81371f257440c88faed6b74f1053eef90807b77e31ca057b2db74edb1" }, + { url = 
"https://mirrors.aliyun.com/pypi/packages/a8/f3/1afd48de81d63dd66e01b263a6fbb86e1b5053b419b9b33d13e1f6d0f7d0/wrapt-1.17.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8a210b158a34164de8bb68b0e7780041a903d7b00c87e906fb69928bf7890d5" }, + { url = "https://mirrors.aliyun.com/pypi/packages/1e/d7/4ad5327612173b144998232f98a85bb24b60c352afb73bc48e3e0d2bdc4e/wrapt-1.17.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:79573c24a46ce11aab457b472efd8d125e5a51da2d1d24387666cd85f54c05b2" }, + { url = "https://mirrors.aliyun.com/pypi/packages/bb/59/e0adfc831674a65694f18ea6dc821f9fcb9ec82c2ce7e3d73a88ba2e8718/wrapt-1.17.3-cp311-cp311-win32.whl", hash = "sha256:c31eebe420a9a5d2887b13000b043ff6ca27c452a9a22fa71f35f118e8d4bf89" }, + { url = "https://mirrors.aliyun.com/pypi/packages/83/88/16b7231ba49861b6f75fc309b11012ede4d6b0a9c90969d9e0db8d991aeb/wrapt-1.17.3-cp311-cp311-win_amd64.whl", hash = "sha256:0b1831115c97f0663cb77aa27d381237e73ad4f721391a9bfb2fe8bc25fa6e77" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9a/1e/c4d4f3398ec073012c51d1c8d87f715f56765444e1a4b11e5180577b7e6e/wrapt-1.17.3-cp311-cp311-win_arm64.whl", hash = "sha256:5a7b3c1ee8265eb4c8f1b7d29943f195c00673f5ab60c192eba2d4a7eae5f46a" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9f/41/cad1aba93e752f1f9268c77270da3c469883d56e2798e7df6240dcb2287b/wrapt-1.17.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ab232e7fdb44cdfbf55fc3afa31bcdb0d8980b9b95c38b6405df2acb672af0e0" }, + { url = "https://mirrors.aliyun.com/pypi/packages/60/f8/096a7cc13097a1869fe44efe68dace40d2a16ecb853141394047f0780b96/wrapt-1.17.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9baa544e6acc91130e926e8c802a17f3b16fbea0fd441b5a60f5cf2cc5c3deba" }, + { url = "https://mirrors.aliyun.com/pypi/packages/33/df/bdf864b8997aab4febb96a9ae5c124f700a5abd9b5e13d2a3214ec4be705/wrapt-1.17.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6b538e31eca1a7ea4605e44f81a48aa24c4632a277431a6ed3f328835901f4fd" }, + { 
url = "https://mirrors.aliyun.com/pypi/packages/9f/81/5d931d78d0eb732b95dc3ddaeeb71c8bb572fb01356e9133916cd729ecdd/wrapt-1.17.3-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:042ec3bb8f319c147b1301f2393bc19dba6e176b7da446853406d041c36c7828" }, + { url = "https://mirrors.aliyun.com/pypi/packages/ca/38/2e1785df03b3d72d34fc6252d91d9d12dc27a5c89caef3335a1bbb8908ca/wrapt-1.17.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3af60380ba0b7b5aeb329bc4e402acd25bd877e98b3727b0135cb5c2efdaefe9" }, + { url = "https://mirrors.aliyun.com/pypi/packages/b3/8b/48cdb60fe0603e34e05cffda0b2a4adab81fd43718e11111a4b0100fd7c1/wrapt-1.17.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0b02e424deef65c9f7326d8c19220a2c9040c51dc165cddb732f16198c168396" }, + { url = "https://mirrors.aliyun.com/pypi/packages/3c/51/d81abca783b58f40a154f1b2c56db1d2d9e0d04fa2d4224e357529f57a57/wrapt-1.17.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:74afa28374a3c3a11b3b5e5fca0ae03bef8450d6aa3ab3a1e2c30e3a75d023dc" }, + { url = "https://mirrors.aliyun.com/pypi/packages/9e/b1/43b286ca1392a006d5336412d41663eeef1ad57485f3e52c767376ba7e5a/wrapt-1.17.3-cp312-cp312-win32.whl", hash = "sha256:4da9f45279fff3543c371d5ababc57a0384f70be244de7759c85a7f989cb4ebe" }, + { url = "https://mirrors.aliyun.com/pypi/packages/28/de/49493f962bd3c586ab4b88066e967aa2e0703d6ef2c43aa28cb83bf7b507/wrapt-1.17.3-cp312-cp312-win_amd64.whl", hash = "sha256:e71d5c6ebac14875668a1e90baf2ea0ef5b7ac7918355850c0908ae82bcb297c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/f1/48/0f7102fe9cb1e8a5a77f80d4f0956d62d97034bbe88d33e94699f99d181d/wrapt-1.17.3-cp312-cp312-win_arm64.whl", hash = "sha256:604d076c55e2fdd4c1c03d06dc1a31b95130010517b5019db15365ec4a405fc6" }, + { url = "https://mirrors.aliyun.com/pypi/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = 
"sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22" }, +] + [[package]] name = "xmltodict" version = "0.14.2" @@ -2480,6 +2689,15 @@ wheels = [ { url = "https://mirrors.aliyun.com/pypi/packages/f5/4b/a06e0ec3d155924f77835ed2d167ebd3b211a7b0853da1cf8d8414d784ef/yarl-1.18.3-py3-none-any.whl", hash = "sha256:b57f4f58099328dfb26c6a771d09fb20dbbae81d20cfb66141251ea063bd101b" }, ] +[[package]] +name = "zipp" +version = "3.23.0" +source = { registry = "https://mirrors.aliyun.com/pypi/simple" } +sdist = { url = "https://mirrors.aliyun.com/pypi/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e" }, +] + [[package]] name = "zstandard" version = "0.23.0"