diff --git a/docs.json b/docs.json
index 0878a67b..3738b767 100644
--- a/docs.json
+++ b/docs.json
@@ -215,7 +215,8 @@
         {
           "group": "Extensions",
           "pages": [
-            "openhands/usage/cli/mcp-servers"
+            "openhands/usage/cli/mcp-servers",
+            "openhands/usage/cli/critic"
           ]
         },
         {
@@ -268,7 +269,8 @@
           "sdk/guides/agent-custom",
           "sdk/guides/convo-custom-visualizer",
           "sdk/guides/agent-stuck-detector",
-          "sdk/guides/agent-tom-agent"
+          "sdk/guides/agent-tom-agent",
+          "sdk/guides/critic"
         ]
       },
       {
diff --git a/openhands/usage/cli/critic-demo.mp4 b/openhands/usage/cli/critic-demo.mp4
new file mode 100644
index 00000000..2cc6ed27
Binary files /dev/null and b/openhands/usage/cli/critic-demo.mp4 differ
diff --git a/openhands/usage/cli/critic.mdx b/openhands/usage/cli/critic.mdx
new file mode 100644
index 00000000..5bc1f930
--- /dev/null
+++ b/openhands/usage/cli/critic.mdx
@@ -0,0 +1,41 @@
+---
+title: Critic (Experimental)
+description: Automatic task success prediction for OpenHands LLM Provider users
+---
+
+<Warning>
+**This feature is highly experimental** and subject to change. The API, configuration, and behavior may evolve significantly based on feedback and testing.
+</Warning>
+
+## Overview
+
+If you're using the [OpenHands LLM Provider](/openhands/usage/llms/openhands-llms), an experimental **critic feature** is automatically enabled to predict task success in real time.
+
+For detailed information about the critic feature, including programmatic access and advanced usage, see the [SDK Critic Guide](/sdk/guides/critic).
+
+## What is the Critic?
+
+The critic is an LLM-based evaluator that analyzes agent actions and conversation history to predict the quality or success probability of agent decisions. It provides:
+
+- **Quality scores**: Probability scores between 0.0 and 1.0 indicating predicted success
+- **Real-time feedback**: Scores computed during agent execution, not just at completion
+
+<video controls src="./critic-demo.mp4"></video>
+
+![Critic output in CLI](./screenshots/critic-cli-output.png)
+
+## Pricing
+
+The critic feature is **free during the public beta phase** for all OpenHands LLM Provider users.
+
+## Disabling the Critic
+
+If you prefer not to use the critic feature, you can disable it in your settings.
+
+![Critic settings in CLI](./screenshots/critic-cli-settings.png)
diff --git a/openhands/usage/cli/screenshots/critic-cli-output.png b/openhands/usage/cli/screenshots/critic-cli-output.png
new file mode 100644
index 00000000..1dc97ea6
Binary files /dev/null and b/openhands/usage/cli/screenshots/critic-cli-output.png differ
diff --git a/openhands/usage/cli/screenshots/critic-cli-settings.png b/openhands/usage/cli/screenshots/critic-cli-settings.png
new file mode 100644
index 00000000..3eb41695
Binary files /dev/null and b/openhands/usage/cli/screenshots/critic-cli-settings.png differ
diff --git a/sdk/api-reference/openhands.sdk.agent.mdx b/sdk/api-reference/openhands.sdk.agent.mdx
index 07de2190..f55127f3 100644
--- a/sdk/api-reference/openhands.sdk.agent.mdx
+++ b/sdk/api-reference/openhands.sdk.agent.mdx
@@ -26,18 +26,8 @@ AgentBase and implements the agent execution logic.
 
 #### Properties
 
-- `agent_context`: AgentContext | None
-- `condenser`: CondenserBase | None
-- `filter_tools_regex`: str | None
-- `include_default_tools`: list[str]
-- `llm`: LLM
-- `mcp_config`: dict[str, Any]
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
-- `security_policy_filename`: str
-- `system_prompt_filename`: str
-- `system_prompt_kwargs`: dict[str, object]
-- `tools`: list[Tool]
 
 #### Methods
 
@@ -94,11 +84,12 @@ agent implementations must follow.
 
 - `agent_context`: AgentContext | None
 - `condenser`: CondenserBase | None
+- `critic`: CriticBase | None
 - `filter_tools_regex`: str | None
 - `include_default_tools`: list[str]
 - `llm`: LLM
 - `mcp_config`: dict[str, Any]
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `name`: str
   Returns the name of the Agent.
diff --git a/sdk/api-reference/openhands.sdk.conversation.mdx b/sdk/api-reference/openhands.sdk.conversation.mdx
index 7a58929d..ee27a282 100644
--- a/sdk/api-reference/openhands.sdk.conversation.mdx
+++ b/sdk/api-reference/openhands.sdk.conversation.mdx
@@ -126,6 +126,10 @@ Send a message to the agent.
 
 Set the confirmation policy for the conversation.
 
+#### abstractmethod set_security_analyzer()
+
+Set the security analyzer for the conversation.
+
 #### abstractmethod update_secrets()
 
 ### class Conversation
@@ -197,8 +201,6 @@ Bases: `OpenHandsModel`
 
 - `execution_status`: [ConversationExecutionStatus](#class-conversationexecutionstatus)
 - `id`: UUID
 - `max_iterations`: int
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `persistence_dir`: str | None
 - `secret_registry`: [SecretRegistry](#class-secretregistry)
 - `security_analyzer`: SecurityAnalyzerBase | None
@@ -280,6 +282,10 @@ actions that are pending confirmation or execution.
 
 Return True if the lock is currently held by any thread.
 
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
 #### model_post_init()
 
 This function is meant to behave like a BaseModel method to initialise private attributes.
@@ -352,7 +358,25 @@ Conversation will then calls MyVisualizer() followed by initialize(state)
 
 Initialize the visualizer base.
 
-#### initialize()
+#### create_sub_visualizer()
+
+Create a visualizer for a sub-agent during delegation.
+
+Override this method to support sub-agent visualization in multi-agent
+delegation scenarios. The sub-visualizer will be used to display events
+from the spawned sub-agent.
+
+By default, returns None, which means sub-agents will not have visualization.
+Subclasses that support delegation (like DelegationVisualizer) should
+override this method to create appropriate sub-visualizers.
+
+* Parameters:
+  `agent_id` – The identifier of the sub-agent being spawned
+* Returns:
+  A visualizer instance for the sub-agent, or None if sub-agent
+  visualization is not supported
+
+#### final initialize()
 
 Initialize the visualizer with conversation state.
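
The new `create_sub_visualizer()` hook is an override point for custom visualizers. A minimal sketch, assuming a `MyVisualizer` subclass of the visualizer base as in the docstring above (the `prefix` argument is illustrative, not part of the API):

```python
class DelegationAwareVisualizer(MyVisualizer):
    def create_sub_visualizer(self, agent_id: str):
        # Return a visualizer for the spawned sub-agent; returning None
        # (the default) leaves the sub-agent's events unvisualized.
        return MyVisualizer(prefix=f"[{agent_id}] ")
```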
@@ -772,8 +796,6 @@ even when callable secrets fail on subsequent calls.
 
 #### Properties
 
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `secret_sources`: dict[str, SecretSource]
 
 #### Methods
 
@@ -808,6 +830,10 @@ fresh values from callables to ensure comprehensive masking.
 
 * Returns:
   Text with secret values replaced by ``
 
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
 #### model_post_init()
 
 This function is meant to behave like a BaseModel method to initialise private attributes.
diff --git a/sdk/api-reference/openhands.sdk.event.mdx b/sdk/api-reference/openhands.sdk.event.mdx
index 35e19600..5e2fbcaa 100644
--- a/sdk/api-reference/openhands.sdk.event.mdx
+++ b/sdk/api-reference/openhands.sdk.event.mdx
@@ -12,8 +12,9 @@ Bases: [`LLMConvertibleEvent`](#class-llmconvertibleevent)
 
 #### Properties
 
 - `action`: Action | None
+- `critic_result`: CriticResult | None
 - `llm_response_id`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `reasoning_content`: str | None
 - `responses_reasoning_item`: ReasoningItemModel | None
@@ -47,7 +48,7 @@ represents an error produced by the agent/scaffold, not model output.
 
 #### Properties
 
 - `error`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `source`: Literal['agent', 'user', 'environment']
 - `visualize`: Text
@@ -68,7 +69,7 @@ This action indicates a condensation of the conversation history is happening.
 
 - `forgotten_event_ids`: list[str]
 - `llm_response_id`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `source`: Literal['agent', 'user', 'environment']
 - `summary`: str | None
@@ -86,7 +87,7 @@ This action is used to request a condensation of the conversation history.
 
 #### Properties
 
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `source`: Literal['agent', 'user', 'environment']
 - `visualize`: Text
@@ -112,7 +113,7 @@ This event represents a summary generated by a condenser.
 
 #### Properties
 
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `source`: Literal['agent', 'user', 'environment']
 - `summary`: str
@@ -138,7 +139,7 @@ to ensure compatibility with websocket transmission.
 
 #### Properties
 
 - `key`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `source`: Literal['agent', 'user', 'environment']
 - `value`: Any
@@ -194,7 +195,7 @@ instead of writing it to a file inside the Docker container.
 
 - `filename`: str
 - `log_data`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `model_name`: str
 - `source`: Literal['agent', 'user', 'environment']
@@ -208,11 +209,8 @@ Base class for events that can be converted to LLM messages.
 
 #### Properties
 
-- `id`: EventID
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
-- `source`: SourceType
-- `timestamp`: str
 
 #### Methods
 
@@ -234,8 +232,8 @@ This is originally the “MessageAction”, but it suppose not to be tool call.
 
 #### Properties
 
 - `activated_skills`: list[str]
+- `critic_result`: CriticResult | None
 - `extended_content`: list[TextContent]
-- `id`: EventID
 - `llm_message`: Message
 - `llm_response_id`: str | None
 - `model_config`: ClassVar[ConfigDict] = (configuration object)
@@ -245,7 +243,6 @@ This is originally the “MessageAction”, but it suppose not to be tool call.
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `source`: Literal['agent', 'user', 'environment']
 - `thinking_blocks`: Sequence[ThinkingBlock | RedactedThinkingBlock]
   Return the Anthropic thinking blocks from the LLM message.
-- `timestamp`: str
 - `visualize`: Text
   Return Rich Text representation of this message event.
@@ -264,7 +261,7 @@ Examples include tool execution, error, user reject.
 
 #### Properties
 
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `source`: Literal['agent', 'user', 'environment']
 - `tool_call_id`: str
@@ -277,7 +274,7 @@ Bases: [`ObservationBaseEvent`](#class-observationbaseevent)
 
 #### Properties
 
 - `action_id`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `observation`: Observation
 - `visualize`: Text
@@ -296,7 +293,7 @@ Event indicating that the agent execution was paused by user request.
 
 #### Properties
 
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `source`: Literal['agent', 'user', 'environment']
 - `visualize`: Text
@@ -310,7 +307,7 @@ System prompt added by the agent.
 
 #### Properties
 
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `source`: Literal['agent', 'user', 'environment']
 - `system_prompt`: TextContent
@@ -331,7 +328,7 @@ Event from VLLM representing token IDs used in LLM interaction.
 
 #### Properties
 
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `prompt_token_ids`: list[int]
 - `response_token_ids`: list[int]
@@ -346,7 +343,7 @@ Observation when user rejects an action in confirmation mode.
 
 #### Properties
 
 - `action_id`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `rejection_reason`: str
 - `visualize`: Text
diff --git a/sdk/api-reference/openhands.sdk.llm.mdx b/sdk/api-reference/openhands.sdk.llm.mdx
index 8e22367b..fc63ab18 100644
--- a/sdk/api-reference/openhands.sdk.llm.mdx
+++ b/sdk/api-reference/openhands.sdk.llm.mdx
@@ -11,14 +11,15 @@ Bases: `BaseContent`
 
 #### Properties
 
-- `cache_prompt`: bool
 - `image_urls`: list[str]
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `type`: Literal['image']
 
 #### Methods
 
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
 #### to_llm_dict()
 
 Convert to LLM API format.
@@ -330,8 +331,6 @@ Bases: `BaseModel`
 
 - `content`: Sequence[[TextContent](#class-textcontent) | [ImageContent](#class-imagecontent)]
 - `force_string_serializer`: bool
 - `function_calling_enabled`: bool
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `name`: str | None
 - `reasoning_content`: str | None
 - `responses_reasoning_item`: [ReasoningItemModel](#class-reasoningitemmodel) | None
@@ -360,6 +359,10 @@ Policy (non-stream):
 
 - Collect assistant text by concatenating output_text parts from message items
 - Normalize function_call items to MessageToolCall list
 
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
 #### to_chat_dict()
 
 Serialize message for OpenAI Chat Completions.
@@ -401,10 +404,11 @@ for Responses function_call_output call_id.
 
 - `arguments`: str
 - `id`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `name`: str
 - `origin`: Literal['completion', 'responses']
+- `costs`: list[Cost]
+- `response_latencies`: list[ResponseLatency]
+- `token_usages`: list[TokenUsage]
 
 #### Methods
 
@@ -418,6 +422,10 @@ Create a MessageToolCall from a typed OpenAI Responses function_call item.
 
 Note: OpenAI Responses function_call.arguments is already a JSON string.
 
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
 #### to_chat_dict()
 
 Serialize to OpenAI Chat Completions tool_calls format.
 
@@ -426,29 +434,6 @@ Serialize to OpenAI Chat Completions tool_calls format.
 
 Serialize to OpenAI Responses ‘function_call’ input item format.
 
-### class Metrics
-
-Bases: [`MetricsSnapshot`](#class-metricssnapshot)
-
-Metrics class can record various metrics during running and evaluation.
-We track:
-
-- accumulated_cost and costs
-- max_budget_per_task (budget limit)
-- A list of ResponseLatency
-- A list of TokenUsage (one per call).
-
-#### Properties
-
-- `costs`: list[Cost]
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
-- `response_latencies`: list[ResponseLatency]
-- `token_usages`: list[TokenUsage]
-
-#### Methods
-
 #### add_cost()
 
 #### add_response_latency()
 
@@ -490,6 +475,10 @@ Log the metrics.
 
 Merge ‘other’ metrics into this one.
 
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
 #### classmethod validate_accumulated_cost()
 
 ### class MetricsSnapshot
@@ -506,9 +495,14 @@ Does not include lists of individual costs, latencies, or token usages.
 
 - `accumulated_cost`: float
 - `accumulated_token_usage`: TokenUsage | None
 - `max_budget_per_task`: float | None
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `model_name`: str
+
+#### Methods
+
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
 ### class ReasoningItemModel
 
 Bases: `BaseModel`
 
@@ -523,10 +517,15 @@ Do not log or render encrypted_content.
 
 - `content`: list[str] | None
 - `encrypted_content`: str | None
 - `id`: str | None
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `status`: str | None
 - `summary`: list[str]
+
+#### Methods
+
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
 ### class RedactedThinkingBlock
 
 Bases: `BaseModel`
 
@@ -540,9 +539,14 @@ before extended thinking was enabled.
 
 #### Properties
 
 - `data`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `type`: Literal['redacted_thinking']
+
+#### Methods
+
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
 ### class RegistryEvent
 
 Bases: `BaseModel`
 
@@ -571,7 +575,7 @@ Key features:
 
 - `active_llm`: [LLM](#class-llm) | None
 - `llms_for_routing`: dict[str, [LLM](#class-llm)]
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `router_name`: str
 
@@ -631,7 +635,6 @@ Bases: `BaseContent`
 
 #### Properties
 
-- `cache_prompt`: bool
 - `enable_truncation`: bool
 - `model_config`: ClassVar[ConfigDict] = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 
@@ -657,8 +660,12 @@ and passed back to the API for tool use scenarios.
 
 #### Properties
 
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `signature`: str | None
 - `thinking`: str
-- `type`: Literal['thinking']
\ No newline at end of file
+- `type`: Literal['thinking']
+
+#### Methods
+
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
diff --git a/sdk/api-reference/openhands.sdk.tool.mdx b/sdk/api-reference/openhands.sdk.tool.mdx
index 1e6234ac..62b85a29 100644
--- a/sdk/api-reference/openhands.sdk.tool.mdx
+++ b/sdk/api-reference/openhands.sdk.tool.mdx
@@ -13,7 +13,7 @@ Base schema for input action.
 
 #### Properties
 
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `visualize`: Text
   Return Rich Text representation of this action.
@@ -47,9 +47,8 @@ Tool for signaling the completion of a task or conversation.
 
 #### Properties
 
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
-- `name`: ClassVar[str] = 'finish'
 
 #### Methods
 
@@ -65,6 +64,8 @@ Create FinishTool instance.
 
 * Raises:
   `ValueError` – If any parameters are provided.
 
+#### name = 'finish'
+
 ### class Observation
 
 Bases: `Schema`, `ABC`
 
 Base schema for output observation.
 
 #### Properties
 
 - `ERROR_MESSAGE_HEADER`: ClassVar[str] = '[An error occurred during execution.]\n'
 - `content`: list[TextContent | ImageContent]
 - `is_error`: bool
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `text`: str
   Extract all text content from the observation.
@@ -113,9 +114,8 @@ Tool for logging thoughts without making changes.
 
 #### Properties
 
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config`: = (configuration object)
   Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
-- `name`: ClassVar[str] = 'think'
 
 #### Methods
 
@@ -131,6 +131,8 @@ Create ThinkTool instance.
 
 * Raises:
   `ValueError` – If any parameters are provided.
 
+#### name = 'think'
+
 ### class Tool
 
 Bases: `BaseModel`
 
@@ -142,13 +144,15 @@ This is only used in agent-sdk for type schema for server use.
 
 #### Properties
 
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `name`: str
 - `params`: dict[str, Any]
 
 #### Methods
 
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
 #### classmethod validate_name()
 
 Validate that name is not empty.
diff --git a/sdk/api-reference/openhands.sdk.workspace.mdx b/sdk/api-reference/openhands.sdk.workspace.mdx
index a1427c87..48066655 100644
--- a/sdk/api-reference/openhands.sdk.workspace.mdx
+++ b/sdk/api-reference/openhands.sdk.workspace.mdx
@@ -25,8 +25,6 @@ support the context manager protocol for safe resource management.
 
 #### Properties
 
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `working_dir`: Annotated[str, BeforeValidator(func=_convert_path_to_str, json_schema_input_type=PydanticUndefined), FieldInfo(annotation=NoneType, required=True, description='The working directory for agent operations and tool execution. Accepts both string paths and Path objects. Path objects are automatically converted to strings.')]
 
 #### Methods
 
@@ -101,6 +99,10 @@ Get the git diff for the file at the path given.
 
 * Raises:
   `Exception` – If path is not a git repository or getting diff failed
 
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
 #### pause()
 
 Pause the workspace to conserve resources.
 
@@ -132,11 +134,16 @@ Result of executing a command in the workspace.
 
 - `command`: str
 - `exit_code`: int
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `stderr`: str
 - `stdout`: str
 - `timeout_occurred`: bool
+
+#### Methods
+
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
 ### class FileOperationResult
 
 Bases: `BaseModel`
 
@@ -149,10 +156,15 @@ Result of a file upload or download operation.
 
 - `destination_path`: str
 - `error`: str | None
 - `file_size`: int | None
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
 - `source_path`: str
 - `success`: bool
+
+#### Methods
+
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
 ### class LocalWorkspace
 
 Bases: [`BaseWorkspace`](#class-baseworkspace)
 
@@ -172,13 +184,6 @@ should operate directly on the host system.
 
 ...     content = workspace.read_file("README.md")
 ```
 
-
-#### Properties
-
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
-- `working_dir`: Annotated[str, BeforeValidator(_convert_path_to_str), Field(description='The working directory for agent operations and tool execution. Accepts both string paths and Path objects. Path objects are automatically converted to strings.')]
-
 #### Methods
 
 #### __init__()
 
@@ -263,6 +268,10 @@ Get the git diff for the file at the path given.
 
 * Raises:
   `Exception` – If path is not a git repository or getting diff failed
 
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
 #### pause()
 
 Pause the workspace (no-op for local workspaces).
 
@@ -306,12 +315,7 @@ as it provides better isolation and security.
 
   Check if the remote workspace is alive by querying the health endpoint.
   * Returns: True if the health endpoint returns a successful response, False otherwise.
-- `api_key`: str | None
 - `client`: Client
-- `host`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
-  Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
-- `working_dir`: str
 
 #### Methods
 
@@ -385,6 +389,10 @@ Get the git diff for the file at the path given.
 
 * Raises:
   `Exception` – If path is not a git repository or getting diff failed
 
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
 #### model_post_init()
 
 Override this method to perform additional initialization after __init__ and model_construct.
diff --git a/sdk/guides/critic.mdx b/sdk/guides/critic.mdx
new file mode 100644
index 00000000..c0098fbf
--- /dev/null
+++ b/sdk/guides/critic.mdx
@@ -0,0 +1,179 @@
+---
+title: Critic (Experimental)
+description: Real-time evaluation of agent actions using an LLM-based critic model.
+---
+
+<Warning>
+**This feature is highly experimental** and subject to change.
+The API, configuration, and behavior may evolve significantly based on feedback and testing.
+</Warning>
+
+<Note>
+The critic model is hosted by the OpenHands LLM Provider and is currently free to use. This example is available on GitHub: [examples/01_standalone_sdk/34_critic_example.py](https://github.com/OpenHands/software-agent-sdk/blob/main/examples/01_standalone_sdk/34_critic_example.py)
+</Note>
+
+## What is a Critic?
+
+A **critic** is an evaluator that analyzes agent actions and conversation history to predict the quality or success probability of agent decisions. The critic runs alongside the agent and provides:
+
+- **Quality scores**: Probability scores between 0.0 and 1.0 indicating predicted success
+- **Real-time feedback**: Scores computed during agent execution, not just at completion
+
+You can use critic scores to build automated workflows, such as triggering the agent to reflect on and fix its previous solution when the critic indicates poor task performance.
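+
+As a minimal sketch of that idea (not part of the packaged example; it assumes an `agent` already configured with a critic as in the Quick Start below, and the 0.5 cutoff simply mirrors the `success` property described under Understanding Critic Results):
+
+```python
+from openhands.sdk import ActionEvent, Conversation, Event, MessageEvent
+
+latest_score: float | None = None
+
+def track_critic(event: Event) -> None:
+    """Remember the most recent critic score seen on the event stream."""
+    global latest_score
+    if isinstance(event, (ActionEvent, MessageEvent)) and event.critic_result is not None:
+        latest_score = event.critic_result.score
+
+conversation = Conversation(agent=agent, callbacks=[track_critic])
+conversation.send_message("Fix the failing test in tests/test_app.py.")
+conversation.run()
+
+# One round of self-review if the critic is pessimistic about the outcome
+if latest_score is not None and latest_score < 0.5:
+    conversation.send_message(
+        "The critic flagged the previous solution. Review it and fix any issues."
+    )
+    conversation.run()
+```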
+ """ + base_url = llm.base_url + api_key = llm.api_key + if base_url is None or api_key is None: + return None + + # Match: llm-proxy.{env}.all-hands.dev (e.g., staging, prod, eval) + pattern = r"^https?://llm-proxy\.[^./]+\.all-hands\.dev" + if not re.match(pattern, base_url): + return None + + return APIBasedCritic( + server_url=f"{base_url.rstrip('/')}/vllm", + api_key=api_key, + model_name="critic", + ) + + +llm_api_key = get_required_env("LLM_API_KEY") + +llm = LLM( + model=os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"), + api_key=llm_api_key, + base_url=os.getenv("LLM_BASE_URL", None), +) + +# Try auto-configuration for All-Hands proxy, fall back to explicit env vars +critic = get_default_critic(llm) +if critic is None: + critic = APIBasedCritic( + server_url=get_required_env("CRITIC_SERVER_URL"), + api_key=get_required_env("CRITIC_API_KEY"), + model_name=get_required_env("CRITIC_MODEL_NAME"), + ) + + +# Configure agent with critic +agent = Agent( + llm=llm, + tools=[ + Tool(name=TerminalTool.name), + Tool(name=FileEditorTool.name), + Tool(name=TaskTrackerTool.name), + ], + # Add critic to evaluate agent actions + critic=critic, +) + +cwd = os.getcwd() +conversation = Conversation(agent=agent, workspace=cwd) + +conversation.send_message( + "Create a file called GREETING.txt with a friendly greeting message." +) +conversation.run() + +print("\nAll done! Check the output above for 'Critic Score' in the visualizer.") +``` + +```bash Running the Example +uv run python examples/01_standalone_sdk/34_critic_example.py +``` + +## Understanding Critic Results + +Critic evaluations produce scores and feedback: + +- **`score`**: Float between 0.0 and 1.0 representing predicted success probability +- **`message`**: Optional feedback with detailed probabilities +- **`success`**: Boolean property (True if score >= 0.5) + +Results are automatically displayed in the conversation visualizer: + +![Critic results in SDK visualizer](./screenshots/critic-sdk-visualizer.png) + +### Accessing Results Programmatically + +```python +from openhands.sdk import Event, ActionEvent, MessageEvent + +def callback(event: Event): + if isinstance(event, (ActionEvent, MessageEvent)): + if event.critic_result is not None: + print(f"Critic score: {event.critic_result.score:.3f}") + print(f"Success: {event.critic_result.success}") + +conversation = Conversation(agent=agent, callbacks=[callback]) +``` + +## Troubleshooting + +### Critic Evaluations Not Appearing + +- Verify the critic is properly configured and passed to the Agent +- Ensure you're using the OpenHands LLM Provider (`llm-proxy.*.all-hands.dev`) + +### API Authentication Errors + +- Verify `LLM_API_KEY` is set correctly +- Check that the API key has not expired + +## Next Steps + +- **[Observability](/sdk/guides/observability)** - Monitor and log agent behavior +- **[Metrics](/sdk/guides/metrics)** - Collect performance metrics +- **[Stuck Detector](/sdk/guides/agent-stuck-detector)** - Detect unproductive agent patterns diff --git a/sdk/guides/hooks.mdx b/sdk/guides/hooks.mdx index eaee0390..620cd38c 100644 --- a/sdk/guides/hooks.mdx +++ b/sdk/guides/hooks.mdx @@ -37,7 +37,7 @@ from pathlib import Path from pydantic import SecretStr from openhands.sdk import LLM, Conversation -from openhands.sdk.hooks import HookConfig +from openhands.sdk.hooks import HookConfig, HookDefinition, HookMatcher from openhands.tools.preset.default import get_default_agent @@ -66,60 +66,65 @@ with tempfile.TemporaryDirectory() as tmpdir: log_file = workspace / 
"tool_usage.log" summary_file = workspace / "summary.txt" - # Configure ALL hook types in one config - hook_config = HookConfig.from_dict( - { - "hooks": { - "PreToolUse": [ - { - "matcher": "terminal", - "hooks": [ - { - "type": "command", - "command": str(SCRIPT_DIR / "block_dangerous.sh"), - "timeout": 10, - } - ], - } + # Configure hooks using the typed approach (recommended) + # This provides better type safety and IDE support + hook_config = HookConfig( + pre_tool_use=[ + HookMatcher( + matcher="terminal", + hooks=[ + HookDefinition( + command=str(SCRIPT_DIR / "block_dangerous.sh"), + timeout=10, + ) ], - "PostToolUse": [ - { - "matcher": "*", - "hooks": [ - { - "type": "command", - "command": f"LOG_FILE={log_file} " - f"{SCRIPT_DIR / 'log_tools.sh'}", - "timeout": 5, - } - ], - } + ) + ], + post_tool_use=[ + HookMatcher( + matcher="*", + hooks=[ + HookDefinition( + command=(f"LOG_FILE={log_file} {SCRIPT_DIR / 'log_tools.sh'}"), + timeout=5, + ) ], - "UserPromptSubmit": [ - { - "hooks": [ - { - "type": "command", - "command": str(SCRIPT_DIR / "inject_git_context.sh"), - } - ], - } + ) + ], + user_prompt_submit=[ + HookMatcher( + hooks=[ + HookDefinition( + command=str(SCRIPT_DIR / "inject_git_context.sh"), + ) ], - "Stop": [ - { - "hooks": [ - { - "type": "command", - "command": f"SUMMARY_FILE={summary_file} " - f"{SCRIPT_DIR / 'require_summary.sh'}", - } - ], - } + ) + ], + stop=[ + HookMatcher( + hooks=[ + HookDefinition( + command=( + f"SUMMARY_FILE={summary_file} " + f"{SCRIPT_DIR / 'require_summary.sh'}" + ), + ) ], - } - } + ) + ], ) + # Alternative: You can also use .from_dict() for loading from JSON config files + # Example with a single hook matcher: + # hook_config = HookConfig.from_dict({ + # "hooks": { + # "PreToolUse": [{ + # "matcher": "terminal", + # "hooks": [{"command": "path/to/script.sh", "timeout": 10}] + # }] + # } + # }) + agent = get_default_agent(llm=llm) conversation = Conversation( agent=agent, diff --git a/sdk/guides/plugins.mdx b/sdk/guides/plugins.mdx index 8f8287d5..b38531e8 100644 --- a/sdk/guides/plugins.mdx +++ b/sdk/guides/plugins.mdx @@ -91,10 +91,23 @@ for skill in plugin.skills: print(f" Triggers: {skill.trigger}") # Hooks -print(f"\nHooks: {'Configured' if plugin.hooks else 'None'}") -if plugin.hooks: - for event_type, matchers in plugin.hooks.hooks.items(): - print(f" - {event_type}: {len(matchers)} matcher(s)") +hook_config = plugin.hooks +has_hooks = hook_config is not None and not hook_config.is_empty() +print(f"\nHooks: {'Configured' if has_hooks else 'None'}") +if has_hooks: + assert hook_config is not None + if hook_config.pre_tool_use: + print(f" - PreToolUse: {len(hook_config.pre_tool_use)} matcher(s)") + if hook_config.post_tool_use: + print(f" - PostToolUse: {len(hook_config.post_tool_use)} matcher(s)") + if hook_config.user_prompt_submit: + print(f" - UserPromptSubmit: {len(hook_config.user_prompt_submit)} matcher(s)") + if hook_config.session_start: + print(f" - SessionStart: {len(hook_config.session_start)} matcher(s)") + if hook_config.session_end: + print(f" - SessionEnd: {len(hook_config.session_end)} matcher(s)") + if hook_config.stop: + print(f" - Stop: {len(hook_config.stop)} matcher(s)") # MCP Config print(f"\nMCP Config: {'Configured' if plugin.mcp_config else 'None'}") @@ -138,6 +151,7 @@ if not api_key: print("Skipping agent demo (LLM_API_KEY not set)") print("\nTo run the full demo, set the LLM_API_KEY environment variable:") print(" export LLM_API_KEY=your-api-key") + print("EXAMPLE_COST: 0") sys.exit(0) # Configure LLM 
@@ -138,6 +151,7 @@ if not api_key:
     print("Skipping agent demo (LLM_API_KEY not set)")
     print("\nTo run the full demo, set the LLM_API_KEY environment variable:")
     print("  export LLM_API_KEY=your-api-key")
+    print("EXAMPLE_COST: 0")
     sys.exit(0)
 
 # Configure LLM
@@ -146,6 +160,7 @@ llm = LLM(
     usage_id="plugin-demo",
     model=model,
     api_key=SecretStr(api_key),
+    base_url=os.getenv("LLM_BASE_URL"),
 )
 
 # Create agent context with plugin skills
@@ -212,6 +227,7 @@ with tempfile.TemporaryDirectory() as tmpdir:
         print("No hook log file found (hooks may not have executed)")
 
     print(f"\nTotal cost: ${llm.metrics.accumulated_cost:.4f}")
+    print(f"EXAMPLE_COST: {llm.metrics.accumulated_cost:.4f}")
 ```
 
 ```bash Running the Example
diff --git a/sdk/guides/screenshots/critic-sdk-visualizer.png b/sdk/guides/screenshots/critic-sdk-visualizer.png
new file mode 100644
index 00000000..b8a7473c
Binary files /dev/null and b/sdk/guides/screenshots/critic-sdk-visualizer.png differ
diff --git a/sdk/guides/skill.mdx b/sdk/guides/skill.mdx
index 6ac5ec04..8b9ff62f 100644
--- a/sdk/guides/skill.mdx
+++ b/sdk/guides/skill.mdx
@@ -378,6 +378,7 @@ when triggered, plus the agent can proactively read them anytime.
 """
 
 import os
+import sys
 from pathlib import Path
 
 from pydantic import SecretStr
@@ -392,108 +393,102 @@ from openhands.tools.file_editor import FileEditorTool
 from openhands.tools.terminal import TerminalTool
 
 
-def main():
-    # Get the directory containing this script
-    script_dir = Path(__file__).parent
-    example_skills_dir = script_dir / "example_skills"
-
-    # =========================================================================
-    # Part 1: Loading Skills from a Directory
-    # =========================================================================
-    print("=" * 80)
-    print("Part 1: Loading Skills from a Directory")
-    print("=" * 80)
-
-    print(f"Loading skills from: {example_skills_dir}")
-
-    # Discover resources in the skill directory
-    skill_subdir = example_skills_dir / "rot13-encryption"
-    resources = discover_skill_resources(skill_subdir)
-    print("\nDiscovered resources in rot13-encryption/:")
-    print(f"  - scripts: {resources.scripts}")
-    print(f"  - references: {resources.references}")
-    print(f"  - assets: {resources.assets}")
-
-    # Load skills from the directory
-    repo_skills, knowledge_skills, agent_skills = load_skills_from_dir(
-        example_skills_dir
-    )
-
-    print("\nLoaded skills from directory:")
-    print(f"  - Repo skills: {list(repo_skills.keys())}")
-    print(f"  - Knowledge skills: {list(knowledge_skills.keys())}")
-    print(f"  - Agent skills (SKILL.md): {list(agent_skills.keys())}")
-
-    # Access the loaded skill and show all AgentSkills standard fields
-    if agent_skills:
-        skill_name = list(agent_skills.keys())[0]
-        loaded_skill = agent_skills[skill_name]
-        print(f"\nDetails for '{skill_name}' (AgentSkills standard fields):")
-        print(f"  - Name: {loaded_skill.name}")
-        desc = loaded_skill.description or ""
-        print(f"  - Description: {desc[:70]}...")
-        print(f"  - License: {loaded_skill.license}")
-        print(f"  - Compatibility: {loaded_skill.compatibility}")
-        print(f"  - Metadata: {loaded_skill.metadata}")
-        if loaded_skill.resources:
-            print("  - Resources:")
-            print(f"    - Scripts: {loaded_skill.resources.scripts}")
-            print(f"    - References: {loaded_skill.resources.references}")
-            print(f"    - Assets: {loaded_skill.resources.assets}")
-            print(f"    - Skill root: {loaded_skill.resources.skill_root}")
-
-    # =========================================================================
-    # Part 2: Using Skills with an Agent
-    # =========================================================================
-    print("\n" + "=" * 80)
-    print("Part 2: Using Skills with an Agent")
-    print("=" * 80)
-
-    # Check for API key
-    api_key = os.getenv("LLM_API_KEY")
-    if not api_key:
-        print("Skipping agent demo (LLM_API_KEY not set)")
-        print("\nTo run the full demo, set the LLM_API_KEY environment variable:")
-        print("  export LLM_API_KEY=your-api-key")
-        return
-
-    # Configure LLM
-    model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929")
-    llm = LLM(
-        usage_id="skills-demo",
-        model=model,
-        api_key=SecretStr(api_key),
-        base_url=os.getenv("LLM_BASE_URL"),
-    )
-
-    # Create agent context with loaded skills
-    agent_context = AgentContext(
-        skills=list(agent_skills.values()),
-        # Disable public skills for this demo to keep output focused
-        load_public_skills=False,
-    )
-
-    # Create agent with tools so it can read skill resources
-    tools = [
-        Tool(name=TerminalTool.name),
-        Tool(name=FileEditorTool.name),
-    ]
-    agent = Agent(llm=llm, tools=tools, agent_context=agent_context)
+# Get the directory containing this script
+script_dir = Path(__file__).parent
+example_skills_dir = script_dir / "example_skills"
+
+# =========================================================================
+# Part 1: Loading Skills from a Directory
+# =========================================================================
+print("=" * 80)
+print("Part 1: Loading Skills from a Directory")
+print("=" * 80)
+
+print(f"Loading skills from: {example_skills_dir}")
+
+# Discover resources in the skill directory
+skill_subdir = example_skills_dir / "rot13-encryption"
+resources = discover_skill_resources(skill_subdir)
+print("\nDiscovered resources in rot13-encryption/:")
+print(f"  - scripts: {resources.scripts}")
+print(f"  - references: {resources.references}")
+print(f"  - assets: {resources.assets}")
+
+# Load skills from the directory
+repo_skills, knowledge_skills, agent_skills = load_skills_from_dir(example_skills_dir)
+
+print("\nLoaded skills from directory:")
+print(f"  - Repo skills: {list(repo_skills.keys())}")
+print(f"  - Knowledge skills: {list(knowledge_skills.keys())}")
+print(f"  - Agent skills (SKILL.md): {list(agent_skills.keys())}")
+
+# Access the loaded skill and show all AgentSkills standard fields
+if agent_skills:
+    skill_name = next(iter(agent_skills))
+    loaded_skill = agent_skills[skill_name]
+    print(f"\nDetails for '{skill_name}' (AgentSkills standard fields):")
+    print(f"  - Name: {loaded_skill.name}")
+    desc = loaded_skill.description or ""
+    print(f"  - Description: {desc[:70]}...")
+    print(f"  - License: {loaded_skill.license}")
+    print(f"  - Compatibility: {loaded_skill.compatibility}")
+    print(f"  - Metadata: {loaded_skill.metadata}")
+    if loaded_skill.resources:
+        print("  - Resources:")
+        print(f"    - Scripts: {loaded_skill.resources.scripts}")
+        print(f"    - References: {loaded_skill.resources.references}")
+        print(f"    - Assets: {loaded_skill.resources.assets}")
+        print(f"    - Skill root: {loaded_skill.resources.skill_root}")
+
+# =========================================================================
+# Part 2: Using Skills with an Agent
+# =========================================================================
+print("\n" + "=" * 80)
+print("Part 2: Using Skills with an Agent")
+print("=" * 80)
+
+# Check for API key
+api_key = os.getenv("LLM_API_KEY")
+if not api_key:
+    print("Skipping agent demo (LLM_API_KEY not set)")
+    print("\nTo run the full demo, set the LLM_API_KEY environment variable:")
+    print("  export LLM_API_KEY=your-api-key")
+    sys.exit(0)
+
+# Configure LLM
+model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929")
+llm = LLM(
+    usage_id="skills-demo",
+    model=model,
+    api_key=SecretStr(api_key),
+    base_url=os.getenv("LLM_BASE_URL"),
+)
 
-    # Create conversation
-    conversation = Conversation(agent=agent, workspace=os.getcwd())
+# Create agent context with loaded skills
+agent_context = AgentContext(
+    skills=list(agent_skills.values()),
+    # Disable public skills for this demo to keep output focused
+    load_public_skills=False,
+)
 
-    # Test the skill (triggered by "encrypt" keyword)
-    # The skill provides instructions and a script for ROT13 encryption
-    print("\nSending message with 'encrypt' keyword to trigger skill...")
-    conversation.send_message("Encrypt the message 'hello world'.")
-    conversation.run()
+# Create agent with tools so it can read skill resources
+tools = [
+    Tool(name=TerminalTool.name),
+    Tool(name=FileEditorTool.name),
+]
+agent = Agent(llm=llm, tools=tools, agent_context=agent_context)
 
-    print(f"\nTotal cost: ${llm.metrics.accumulated_cost:.4f}")
+# Create conversation
+conversation = Conversation(agent=agent, workspace=os.getcwd())
 
+# Test the skill (triggered by "encrypt" keyword)
+# The skill provides instructions and a script for ROT13 encryption
+print("\nSending message with 'encrypt' keyword to trigger skill...")
+conversation.send_message("Encrypt the message 'hello world'.")
+conversation.run()
 
-if __name__ == "__main__":
-    main()
+print(f"\nTotal cost: ${llm.metrics.accumulated_cost:.4f}")
+print(f"EXAMPLE_COST: {llm.metrics.accumulated_cost:.4f}")
 ```
 
 ```bash Running the Example