diff --git a/.env.example b/.env.example
index 228d697..7cc4641 100644
--- a/.env.example
+++ b/.env.example
@@ -1,25 +1,25 @@
-# LLM Provider Configuration
-# ===========================
+# ══════════════════════════════════════════════════════════════
+# REQUIRED: At least ONE LLM provider key
+# ══════════════════════════════════════════════════════════════
+OPENAI_API_KEY=sk-...
+# ANTHROPIC_API_KEY=
+# GEMINI_API_KEY=
+# DEEPSEEK_API_KEY=
 
-# Default provider to use (openai, anthropic, gemini)
-DEFAULT_LLM_CLIENT=openai
+# ══════════════════════════════════════════════════════════════
+# REQUIRED for visual element detection (Agent.Find Visual Element)
+# ══════════════════════════════════════════════════════════════
+HUGGINGFACE_API_KEY=hf_...
 
-# OpenAI Configuration
-OPENAI_API_KEY=your_openai_api_key_here
-DEFAULT_OPENAI_MODEL=gpt-4o-mini
-
-# Anthropic Configuration
-ANTHROPIC_API_KEY=your_anthropic_api_key_here
-DEFAULT_ANTHROPIC_MODEL=claude-sonnet-4-5-20250929
-
-# Google Gemini Configuration
-GEMINI_API_KEY=your_gemini_api_key_here
-DEFAULT_GEMINI_MODEL=gemini-1.5-flash
-
-# Image Upload Services
-IMGBB_API_KEY=your_imgbb_api_key_here
-FREEIMAGEHOST_API_KEY=your_freeimagehost_api_key_here
-
-# HuggingFace (Future)
-HUGGINGFACE_API_KEY=your_huggingface_api_key_here
+# ══════════════════════════════════════════════════════════════
+# OPTIONAL: Customize defaults
+# ══════════════════════════════════════════════════════════════
+# DEFAULT_LLM_CLIENT=openai
+# DEFAULT_OPENAI_MODEL=gpt-4o
+# OLLAMA_BASE_URL=http://localhost:11434/v1
 
+# ══════════════════════════════════════════════════════════════
+# OPTIONAL: Image upload (falls back to base64 if not set)
+# ══════════════════════════════════════════════════════════════
+# IMGBB_API_KEY=
+# FREEIMAGEHOST_API_KEY=
\ No newline at end of file
diff --git a/Agent/AgentKeywords.py b/Agent/AgentKeywords.py
index c5c50b1..25471bf 100644
--- a/Agent/AgentKeywords.py
+++ b/Agent/AgentKeywords.py
@@ -23,18 +23,20 @@ class AgentKeywords:
     def __init__(
         self,
-        llm_client: str = "openai",
-        llm_model: str = "gpt-4o-mini",
+        llm_client: str | None = None,
+        llm_model: str | None = None,
         platform_type: str = "auto",
-        click_mode: str = "xml",
-        input_mode: str = "text",
+        element_source: str = "accessibility",
+        llm_input_format: str = "som",
     ):
+        from Agent.config.config import Config
+
         self.engine = AgentEngine(
-            llm_client=llm_client,
+            llm_client=llm_client or Config.DEFAULT_LLM_CLIENT,
             llm_model=llm_model,
             platform_type=platform_type,
-            click_mode=click_mode,
-            input_mode=input_mode,
+            element_source=element_source,
+            llm_input_format=llm_input_format,
         )
 
     # ----------------------- Public RF Keywords -----------------------
@@ -64,11 +66,23 @@ def find_visual_element(self, description: str, format: str = "center"):
         """
         return self.engine.find_visual_element(description, format=format)
 
-    def autonumous(self, instruction: str):
-        """Agent.Autonumous
-        This keyword is designed to autonomously plan and execute a test based on the
-        given single instruction.
+    def set_element_source(self, source: str):
+        """Agent.Set Element Source accessibility|vision
+        Example: Agent.Set Element Source vision
+        """
+        self.engine.set_element_source(source)
+
+    # def set_llm_input_format(self, format: str):
+    #     """Agent.Set LLM Input Format text|som
+    #     Example: Agent.Set LLM Input Format som
+    #     """
+    #     self.engine.set_llm_input_format(format)
+
+    # def autonumous(self, instruction: str):
+    #     """Agent.Autonumous
+    #     This keyword is designed to autonomously plan and execute a test based on the
+    #     given single instruction.
 
-        Example: Agent.Autonumous Navigate to settings, change language to French,
-        ... then go back to home screen and verify the interface is in French"""
-        raise NotImplementedError("Agent.Autonumous is not implemented yet")
\ No newline at end of file
+    #     Example: Agent.Autonumous Navigate to settings, change language to French,
+    #     ... then go back to home screen and verify the interface is in French"""
+    #     raise NotImplementedError("Agent.Autonumous is not implemented yet")
\ No newline at end of file
diff --git a/Agent/__init__.py b/Agent/__init__.py
index ae3af54..137da1b 100644
--- a/Agent/__init__.py
+++ b/Agent/__init__.py
@@ -1,7 +1,7 @@
 from Agent.AgentKeywords import AgentKeywords
 
-__version__ = "0.1.0"
+__version__ = "0.0.1"
 
 __all__ = ["AgentKeywords"]
 
diff --git a/Agent/agent_engine.py b/Agent/agent_engine.py
index 48b5702..dbba091 100644
--- a/Agent/agent_engine.py
+++ b/Agent/agent_engine.py
@@ -1,6 +1,6 @@
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional
 
-from Agent.platforms import DeviceConnector, WebConnectorRF, create_platform
+from Agent.platforms import DeviceConnector, create_platform
 from Agent.ai.llm.facade import UnifiedLLMFacade
 from Agent.ai._promptcomposer import AgentPromptComposer
 from Agent.utilities.imguploader.imghandler import ImageUploader
@@ -8,114 +8,84 @@
 from Agent.tools.base import ToolCategory
 from Agent.core.keyword_runner import KeywordRunner
 from Agent.tools.mobile import MOBILE_TOOLS
-from Agent.tools.web import WEB_TOOLS
 from Agent.tools.visual import VISUAL_TOOLS
 from robot.api import logger
 
 
 class AgentEngine:
-    """Core engine for AI-driven test automation.
-
-    Orchestrates the complete Agent.Do and Agent.VisualCheck flows:
-    - Capturing UI context and screenshots
-    - Composing AI prompts and calling LLMs
-    - Tool-based action execution and visual verification
-
-    This is the main orchestrator that coordinates platform connectors,
-    AI services, tool registry, and executors.
- """ + """Core engine for AI-driven Android test automation.""" def __init__( self, llm_client: str = "openai", llm_model: str = "gpt-4o-mini", - platform: Optional[Union[DeviceConnector, WebConnectorRF]] = None, + platform: Optional[DeviceConnector] = None, platform_type: str = "auto", - click_mode: str = "xml", - input_mode: str = "text", + element_source: str = "accessibility", + llm_input_format: str = "text", ) -> None: - # Platform connector - create or use provided if platform is None: self.platform = create_platform(platform_type) else: self.platform = platform - # Detect platform type - platform_name = self.platform.get_platform() - logger.info(f"🌐 Platform detected: {platform_name}") + logger.info("📱 Platform: mobile") - # AI components self.llm = UnifiedLLMFacade(provider=llm_client, model=llm_model) - self.prompt_composer = AgentPromptComposer( - platform_connector=self.platform - ) self.image_uploader = ImageUploader(service="auto") - # Tool execution components self.tool_registry = ToolRegistry() self.executor = KeywordRunner(self.platform) - # Register tools based on platform - if platform_name == "web": - self._register_web_tools() - else: - self._register_mobile_tools() + self._register_mobile_tools() self._register_visual_tools() - # Click strategy and input mode - self.click_mode = click_mode - self.input_mode = input_mode - logger.info(f"🎯 Click mode: {click_mode}, Input mode: {input_mode}") + self.prompt_composer = AgentPromptComposer( + tool_registry=self.tool_registry, + platform_connector=self.platform + ) + + self.element_source = element_source + self.llm_input_format = llm_input_format + logger.info(f"🎯 Element source: {element_source}, LLM input format: {llm_input_format}") def _register_mobile_tools(self) -> None: - """Register all mobile tools in the registry.""" for ToolClass in MOBILE_TOOLS: self.tool_registry.register(ToolClass()) - mobile_tools_count = len(self.tool_registry.get_by_category(ToolCategory.MOBILE)) logger.debug(f"📱 Registered {mobile_tools_count} mobile tools") - def _register_web_tools(self) -> None: - """Register all web tools in the registry.""" - for ToolClass in WEB_TOOLS: - self.tool_registry.register(ToolClass()) - - web_tools_count = len(self.tool_registry.get_by_category(ToolCategory.WEB)) - logger.debug(f"🌐 Registered {web_tools_count} web tools") - def _register_visual_tools(self) -> None: - """Register all visual verification tools in the registry.""" for ToolClass in VISUAL_TOOLS: self.tool_registry.register(ToolClass()) - visual_tools_count = len(self.tool_registry.get_by_category(ToolCategory.VISUAL)) logger.debug(f"👁️ Registered {visual_tools_count} visual tools") # ----------------------- Public API ----------------------- - def set_click_mode(self, mode: str) -> None: - """Change click mode dynamically during test execution. + def set_element_source(self, source: str) -> None: + """Change element source dynamically. Args: - mode: 'xml' or 'visual' + source: 'accessibility' or 'vision' """ - if mode not in ["xml", "visual"]: - raise ValueError(f"Invalid click_mode: {mode}. Choose: xml, visual") + if source not in ["accessibility", "vision"]: + raise ValueError(f"Invalid element_source: {source}. Choose: accessibility, vision") - self.click_mode = mode - logger.info(f"🔧 Click mode changed to: {mode}") + self.element_source = source + logger.info(f"🔧 Element source changed to: {source}") - def set_input_mode(self, mode: str) -> None: - """Change input mode dynamically during test execution. 
+    def set_llm_input_format(self, format: str) -> None:
+        """Change LLM input format dynamically.
 
         Args:
-            mode: 'text' (numbered list) or 'som' (screenshot with numbered boxes)
+            format: 'text' or 'som'
         """
-        if mode not in ["text", "som"]:
-            raise ValueError(f"Invalid input_mode: {mode}. Choose: text, som")
+        if format not in ["text", "som"]:
+            raise ValueError(f"Invalid llm_input_format: {format}. Choose: text, som")
 
-        self.input_mode = mode
-        logger.info(f"🔧 Input mode changed to: {mode}")
+        self.llm_input_format = format
+        logger.info(f"🔧 LLM input format changed to: {format}")
 
     def do(self, instruction: str) -> None:
         """Execute AI-driven action based on natural language instruction.
@@ -125,21 +95,64 @@ def do(self, instruction: str) -> None:
         """
         logger.info(f"🚀 Starting Agent.Do: '{instruction}'")
 
-        # Collect UI context (skip in visual-only mode)
+        if hasattr(self.platform, 'wait_for_page_stable'):
+            self.platform.wait_for_page_stable()
+
+        screenshot_base64 = None
         ui_candidates = []
-        if self.click_mode != "visual":
-            ui_candidates = self.platform.collect_ui_candidates()
-        else:
-            logger.debug("⚡ UI collection skipped (mode: visual)")
+        annotated_image_path = None
 
-        # Capture screenshot if needed (for SoM mode or visual click mode)
-        screenshot_base64 = None
-        need_screenshot = self.input_mode == "som" or self.click_mode == "visual"
-        if need_screenshot:
+        # Collect UI elements based on element source
+        if self.element_source == "accessibility":
+            ui_candidates = self.platform.collect_ui_candidates()
+            logger.debug(f"📋 Collected {len(ui_candidates)} accessibility elements")
+        elif self.element_source == "vision":
             screenshot_base64 = self.platform.get_screenshot_base64()
-            logger.debug(f"📸 Screenshot captured (input_mode: {self.input_mode})")
-        else:
-            logger.debug(f"⚡ Screenshot skipped (input_mode: {self.input_mode})")
+            from Agent.ai.vlm._client import OmniParserClient
+            from Agent.ai.vlm._parser import OmniParserResultProcessor
+            from PIL import Image
+
+            client = OmniParserClient()
+            image_temp_path, parsed_text = client.parse_image(image_base64=screenshot_base64)
+            annotated_image_path = image_temp_path
+
+            if parsed_text:
+                processor = OmniParserResultProcessor(
+                    response_text=parsed_text,
+                    image_temp_path=image_temp_path,
+                )
+                elements_data = processor.get_parsed_ui_elements(element_type="all")
+
+                with Image.open(image_temp_path) as img:
+                    width, height = img.size
+
+                for key, data in elements_data.items():
+                    bbox_norm = data.get("bbox", [0, 0, 0, 0])
+                    x1 = int(bbox_norm[0] * width)
+                    y1 = int(bbox_norm[1] * height)
+                    x2 = int(bbox_norm[2] * width)
+                    y2 = int(bbox_norm[3] * height)
+
+                    element = {
+                        "text": data.get("content", ""),
+                        "class_name": data.get("type", "unknown"),
+                        "bbox": {
+                            "x": x1,
+                            "y": y1,
+                            "width": x2 - x1,
+                            "height": y2 - y1
+                        },
+                        "source": "omniparser",
+                        "interactivity": data.get("interactivity", "unknown")
+                    }
+                    ui_candidates.append(element)
+
+                logger.debug(f"👁️ Detected {len(ui_candidates)} visual elements")
+
+        # Capture screenshot if needed for SoM mode and not already captured
+        if self.llm_input_format == "som" and not screenshot_base64:
+            screenshot_base64 = self.platform.get_screenshot_base64()
+            logger.debug("📸 Screenshot captured for SoM mode")
 
         # Prepare context for tool execution
         context = {
@@ -150,17 +163,28 @@ def do(self, instruction: str) -> None:
         if screenshot_base64:
             context["screenshot_base64"] = screenshot_base64
 
+        logger.info(f"Elements sent to AI: {ui_candidates}")
+
         # Prepare AI request
         platform_name = self.platform.get_platform()
+        tool_category = "mobile"
         messages = self.prompt_composer.compose_do_messages(
            instruction=instruction,
            ui_elements=ui_candidates,
            platform=platform_name,
-            click_mode=self.click_mode,
-            input_mode=self.input_mode,
+            element_source=self.element_source,
+            llm_input_format=self.llm_input_format,
            screenshot_base64=screenshot_base64,
+            annotated_image_path=annotated_image_path,
         )
 
-        tools = self.prompt_composer.get_do_tools(category=platform_name, click_mode=self.click_mode)
+        if annotated_image_path:
+            logger.info(f"Annotated image: {annotated_image_path}")
+
+        tools = self.prompt_composer.get_do_tools(category=tool_category, element_source=self.element_source)
+        logger.debug(f"Tools for {tool_category}: {len(tools)} tools")
+
+        if not tools:
+            raise RuntimeError(f"No tools registered for platform '{platform_name}'. Check tool registration.")
 
         # Call AI
         result = self.llm.send_ai_request_with_tools(
@@ -169,12 +193,15 @@ def do(self, instruction: str) -> None:
             tools=tools,
             tool_choice="required",
             temperature=0
         )
-
-        logger.debug(f"AI response: {result}")
+
+        tool_call = result.get("tool_calls", [{}])[0]
+        tool_name = tool_call.get("function", {}).get("name", "unknown")
+        tool_args = tool_call.get("function", {}).get("arguments", {})
+        logger.info(f"🤖 AI chose: {tool_name}({tool_args})")
 
         # Execute tool
         self._execute_do_from_tool_calls(result, context, instruction)
-        logger.info("✅ Agent.Do completed")
+        logger.info("Agent.Do completed")
 
     def visual_check(self, instruction: str) -> None:
         """Execute visual verification based on natural language instruction.
@@ -184,6 +211,10 @@ def visual_check(self, instruction: str) -> None:
             (e.g., "verify the home screen is displayed")
         """
         logger.info(f"👁️ Starting Agent.VisualCheck: '{instruction}'")
+
+        if hasattr(self.platform, 'wait_for_page_stable'):
+            self.platform.wait_for_page_stable()
+
         screenshot_base64 = self.platform.get_screenshot_base64()
 
         # Embed screenshot to Robot Framework log
@@ -194,6 +225,10 @@ def visual_check(self, instruction: str) -> None:
         # Prepare AI request
         messages = self.prompt_composer.compose_visual_check_messages(instruction, image_url)
         tools = self.prompt_composer.get_visual_check_tools()
+        logger.debug(f"Visual check tools: {len(tools)} tools")
+
+        if not tools:
+            raise RuntimeError("No visual tools registered. Check tool registration.")
 
         # Call AI
         result = self.llm.send_ai_request_with_tools(
@@ -219,6 +254,10 @@ def ask(self, question: str, response_format: str = "text") -> str:
         """
         import json
         logger.info(f"❓ Agent.Ask: '{question}'")
+
+        if hasattr(self.platform, 'wait_for_page_stable'):
+            self.platform.wait_for_page_stable()
+
         screenshot_base64 = self.platform.get_screenshot_base64()
         self.platform.embed_image_to_log(screenshot_base64)
 
@@ -248,6 +287,10 @@ def find_visual_element(self, description: str, format: str = "center") -> Dict[
         from Agent.ai.vlm.interface import OmniParserOrchestrator
 
         logger.info(f"🔍 Agent.Find Visual Element: '{description}'")
+
+        if hasattr(self.platform, 'wait_for_page_stable'):
+            self.platform.wait_for_page_stable()
+
         screenshot_base64 = self.platform.get_screenshot_base64()
         self.platform.embed_image_to_log(screenshot_base64)
 
@@ -259,7 +302,7 @@ def find_visual_element(self, description: str, format: str = "center") -> Dict[
         result = orchestrator.find_element(
             element_description=description,
             image_base64=screenshot_base64,
-            element_type="interactive"
+            element_type="all"
         )
 
         if not result:
diff --git a/Agent/ai/_promptcomposer.py b/Agent/ai/_promptcomposer.py
index 20834eb..df3324c 100644
--- a/Agent/ai/_promptcomposer.py
+++ b/Agent/ai/_promptcomposer.py
@@ -1,6 +1,10 @@
 from typing import List, Dict, Optional, Any
 from Agent.tools.registry import ToolRegistry
 from Agent.tools.base import ToolCategory
+from robot.api import logger
+import base64
+import os
+from datetime import datetime
 
 
 class AgentPromptComposer:
@@ -13,25 +17,46 @@ def __init__(
     ) -> None:
         self.registry = tool_registry or ToolRegistry()
         self.platform = platform_connector
+        self._annotated_dir = None
+
+    def _get_annotated_dir(self) -> str:
+        if self._annotated_dir is None:
+            from Agent.utilities._logdir import set_artifacts_subdir
+            self._annotated_dir = set_artifacts_subdir("RF_Agent/Annotated")
+        return self._annotated_dir
+
+    def _save_annotated_image(self, image_base64: str, source: str = "som") -> str:
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
+        filename = f"annotated_{source}_{timestamp}.png"
+        filepath = os.path.join(self._get_annotated_dir(), filename)
+
+        image_bytes = base64.b64decode(image_base64)
+        with open(filepath, "wb") as f:
+            f.write(image_bytes)
+
+        logger.info(f"📸 Saved annotated image: {filepath}")
+        return filepath
 
     def compose_do_messages(
         self,
         instruction: str,
         ui_elements: Optional[List[Dict[str, Any]]] = None,
         platform: str = "mobile",
-        click_mode: str = "xml",
-        input_mode: str = "text",
+        element_source: str = "accessibility",
+        llm_input_format: str = "text",
         screenshot_base64: Optional[str] = None,
+        annotated_image_path: Optional[str] = None,
     ) -> List[Dict[str, Any]]:
         """Build DO action messages using tool calling approach.
 
         Args:
             instruction: User instruction
             ui_elements: List of UI elements
-            platform: 'mobile' or 'web' - determines system prompt
-            click_mode: 'xml' or 'visual' - guides AI on click strategy
-            input_mode: 'text' (numbered list) or 'som' (screenshot with numbered boxes)
-            screenshot_base64: Screenshot for SoM mode (required if input_mode='som')
+            platform: 'android' or 'ios'
+            element_source: 'accessibility' or 'vision'
+            llm_input_format: 'text' or 'som'
+            screenshot_base64: Screenshot (required for SoM mode)
+            annotated_image_path: Pre-annotated image from OmniParser
         """
         # Base system prompt
         is_mobile = platform in ("android", "ios")
@@ -41,18 +66,19 @@ def compose_do_messages(
                 "Your job: analyze the instruction and call the appropriate function to interact with the mobile UI.\n"
             )
 
-            # Add click guidance based on mode
-            if click_mode == "visual":
+            if element_source == "vision":
                 system_content += (
-                    "\nFOR CLICKING: Use click_visual_element(description) - describe the element visually.\n"
-                    "You will receive a screenshot. Analyze it and use visual descriptions.\n"
+                    "\nELEMENTS DETECTED VIA COMPUTER VISION (OmniParser):\n"
+                    "- tap_element(element_index): Click element by INDEX from numbered list\n"
+                    "- input_text(element_index, text): Type text into element by INDEX\n"
+                    "- The screenshot shows NUMBERED bounding boxes - use those numbers!\n"
                 )
-            else: # xml (default)
+            else:
                 system_content += (
-                    "\nACTION SELECTION RULES:\n"
-                    "1. FOR TEXT INPUT: Use input_text(element_index, text) - select from numbered list\n"
-                    "2. FOR CLICKING: Use tap_element(index) - select from numbered list\n"
-                    "3. OTHER ACTIONS: scroll_down(), swipe_left/right/up(), long_press(index), hide_keyboard(), go_back()\n"
+                    "\nUSE LOCATOR TOOLS:\n"
+                    "1. FOR TEXT INPUT: input_text(element_index, text) - select from numbered list\n"
+                    "2. FOR CLICKING: tap_element(index) - select from numbered list\n"
+                    "3. OTHER: scroll_down(), swipe_left/right/up(), long_press(index), hide_keyboard(), go_back()\n"
                 )
 
             system_content += (
@@ -64,22 +90,21 @@ def compose_do_messages(
                 "Your job: analyze the instruction and call the appropriate function to interact with the web page.\n"
             )
 
-            # Add click guidance based on mode
-            if click_mode == "visual":
+            if element_source == "vision":
                 system_content += (
-                    "\nFOR INTERACTION: Use VISUAL tools:\n"
-                    "- click_visual_element(description): Click element by visual description\n"
-                    "- input_text_visual(description, text): Input text into element by visual description\n"
-                    "- hover_visual(description): Hover over element by visual description\n"
-                    "- double_click_visual(description): Double click element by visual description\n"
-                    "You will receive a screenshot. Analyze it and use visual descriptions.\n"
+                    "\nUSE VISUAL TOOLS:\n"
+                    "- click_visual_element(description): Click by visual description\n"
+                    "- input_text_visual(description, text): Input text by visual description\n"
+                    "- hover_visual(description): Hover by visual description\n"
+                    "- double_click_visual(description): Double click by visual description\n"
+                    "- Elements were detected using computer vision (OmniParser)\n"
                 )
-            else: # xml (default)
+            else:
                 system_content += (
-                    "\nACTION SELECTION RULES:\n"
-                    "1. FOR TEXT INPUT: Use input_text(index, text) for or