diff --git a/docs.json b/docs.json
index 0878a67b..3738b767 100644
--- a/docs.json
+++ b/docs.json
@@ -215,7 +215,8 @@
{
"group": "Extensions",
"pages": [
- "openhands/usage/cli/mcp-servers"
+ "openhands/usage/cli/mcp-servers",
+ "openhands/usage/cli/critic"
]
},
{
@@ -268,7 +269,8 @@
"sdk/guides/agent-custom",
"sdk/guides/convo-custom-visualizer",
"sdk/guides/agent-stuck-detector",
- "sdk/guides/agent-tom-agent"
+ "sdk/guides/agent-tom-agent",
+ "sdk/guides/critic"
]
},
{
diff --git a/openhands/usage/cli/critic-demo.mp4 b/openhands/usage/cli/critic-demo.mp4
new file mode 100644
index 00000000..2cc6ed27
Binary files /dev/null and b/openhands/usage/cli/critic-demo.mp4 differ
diff --git a/openhands/usage/cli/critic.mdx b/openhands/usage/cli/critic.mdx
new file mode 100644
index 00000000..5bc1f930
--- /dev/null
+++ b/openhands/usage/cli/critic.mdx
@@ -0,0 +1,41 @@
+---
+title: Critic (Experimental)
+description: Automatic task success prediction for OpenHands LLM Provider users
+---
+
+
+**This feature is highly experimental** and subject to change. The API, configuration, and behavior may evolve significantly based on feedback and testing.
+
+
+## Overview
+
+If you're using the [OpenHands LLM Provider](/openhands/usage/llms/openhands-llms), an experimental **critic feature** is automatically enabled to predict task success in real time.
+
+For detailed information about the critic feature, including programmatic access and advanced usage, see the [SDK Critic Guide](/sdk/guides/critic).
+
+
+## What is the Critic?
+
+The critic is an LLM-based evaluator that analyzes agent actions and conversation history to predict the quality or success probability of agent decisions. It provides:
+
+- **Quality scores**: Probability scores between 0.0 and 1.0 indicating predicted success
+- **Real-time feedback**: Scores computed during agent execution, not just at completion
+
+
+
+
+
+## Pricing
+
+The critic feature is **free during the public beta phase** for all OpenHands LLM Provider users.
+
+## Disabling the Critic
+
+If you prefer not to use the critic feature, you can disable it in your settings.
+
+![Critic option in the CLI settings](/openhands/usage/cli/screenshots/critic-cli-settings.png)
+
diff --git a/openhands/usage/cli/screenshots/critic-cli-output.png b/openhands/usage/cli/screenshots/critic-cli-output.png
new file mode 100644
index 00000000..1dc97ea6
Binary files /dev/null and b/openhands/usage/cli/screenshots/critic-cli-output.png differ
diff --git a/openhands/usage/cli/screenshots/critic-cli-settings.png b/openhands/usage/cli/screenshots/critic-cli-settings.png
new file mode 100644
index 00000000..3eb41695
Binary files /dev/null and b/openhands/usage/cli/screenshots/critic-cli-settings.png differ
diff --git a/sdk/api-reference/openhands.sdk.agent.mdx b/sdk/api-reference/openhands.sdk.agent.mdx
index 07de2190..f55127f3 100644
--- a/sdk/api-reference/openhands.sdk.agent.mdx
+++ b/sdk/api-reference/openhands.sdk.agent.mdx
@@ -26,18 +26,8 @@ AgentBase and implements the agent execution logic.
#### Properties
-- `agent_context`: AgentContext | None
-- `condenser`: CondenserBase | None
-- `filter_tools_regex`: str | None
-- `include_default_tools`: list[str]
-- `llm`: LLM
-- `mcp_config`: dict[str, Any]
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
-- `security_policy_filename`: str
-- `system_prompt_filename`: str
-- `system_prompt_kwargs`: dict[str, object]
-- `tools`: list[Tool]
#### Methods
@@ -94,11 +84,12 @@ agent implementations must follow.
- `agent_context`: AgentContext | None
- `condenser`: CondenserBase | None
+- `critic`: CriticBase | None
- `filter_tools_regex`: str | None
- `include_default_tools`: list[str]
- `llm`: LLM
- `mcp_config`: dict[str, Any]
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `name`: str
Returns the name of the Agent.
diff --git a/sdk/api-reference/openhands.sdk.conversation.mdx b/sdk/api-reference/openhands.sdk.conversation.mdx
index 7a58929d..ee27a282 100644
--- a/sdk/api-reference/openhands.sdk.conversation.mdx
+++ b/sdk/api-reference/openhands.sdk.conversation.mdx
@@ -126,6 +126,10 @@ Send a message to the agent.
Set the confirmation policy for the conversation.
+#### abstractmethod set_security_analyzer()
+
+Set the security analyzer for the conversation.
+
#### abstractmethod update_secrets()
### class Conversation
@@ -197,8 +201,6 @@ Bases: `OpenHandsModel`
- `execution_status`: [ConversationExecutionStatus](#class-conversationexecutionstatus)
- `id`: UUID
- `max_iterations`: int
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `persistence_dir`: str | None
- `secret_registry`: [SecretRegistry](#class-secretregistry)
- `security_analyzer`: SecurityAnalyzerBase | None
@@ -280,6 +282,10 @@ actions that are pending confirmation or execution.
Return True if the lock is currently held by any thread.
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
#### model_post_init()
This function is meant to behave like a BaseModel method to initialise private attributes.
@@ -352,7 +358,25 @@ Conversation will then calls MyVisualizer() followed by initialize(state)
Initialize the visualizer base.
-#### initialize()
+#### create_sub_visualizer()
+
+Create a visualizer for a sub-agent during delegation.
+
+Override this method to support sub-agent visualization in multi-agent
+delegation scenarios. The sub-visualizer will be used to display events
+from the spawned sub-agent.
+
+By default, returns None which means sub-agents will not have visualization.
+Subclasses that support delegation (like DelegationVisualizer) should
+override this method to create appropriate sub-visualizers.
+
+* Parameters:
+ `agent_id` – The identifier of the sub-agent being spawned
+* Returns:
+ A visualizer instance for the sub-agent, or None if sub-agent
+ visualization is not supported
+
+#### final initialize()
Initialize the visualizer with conversation state.
@@ -772,8 +796,6 @@ even when callable secrets fail on subsequent calls.
#### Properties
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `secret_sources`: dict[str, SecretSource]
#### Methods
@@ -808,6 +830,10 @@ fresh values from callables to ensure comprehensive masking.
* Returns:
Text with secret values replaced by ``
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
#### model_post_init()
This function is meant to behave like a BaseModel method to initialise private attributes.
diff --git a/sdk/api-reference/openhands.sdk.event.mdx b/sdk/api-reference/openhands.sdk.event.mdx
index 35e19600..5e2fbcaa 100644
--- a/sdk/api-reference/openhands.sdk.event.mdx
+++ b/sdk/api-reference/openhands.sdk.event.mdx
@@ -12,8 +12,9 @@ Bases: [`LLMConvertibleEvent`](#class-llmconvertibleevent)
#### Properties
- `action`: Action | None
+- `critic_result`: CriticResult | None
- `llm_response_id`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `reasoning_content`: str | None
- `responses_reasoning_item`: ReasoningItemModel | None
@@ -47,7 +48,7 @@ represents an error produced by the agent/scaffold, not model output.
#### Properties
- `error`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `source`: Literal['agent', 'user', 'environment']
- `visualize`: Text
@@ -68,7 +69,7 @@ This action indicates a condensation of the conversation history is happening.
- `forgotten_event_ids`: list[str]
- `llm_response_id`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `source`: Literal['agent', 'user', 'environment']
- `summary`: str | None
@@ -86,7 +87,7 @@ This action is used to request a condensation of the conversation history.
#### Properties
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `source`: Literal['agent', 'user', 'environment']
- `visualize`: Text
@@ -112,7 +113,7 @@ This event represents a summary generated by a condenser.
#### Properties
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `source`: Literal['agent', 'user', 'environment']
- `summary`: str
@@ -138,7 +139,7 @@ to ensure compatibility with websocket transmission.
#### Properties
- `key`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `source`: Literal['agent', 'user', 'environment']
- `value`: Any
@@ -194,7 +195,7 @@ instead of writing it to a file inside the Docker container.
- `filename`: str
- `log_data`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `model_name`: str
- `source`: Literal['agent', 'user', 'environment']
@@ -208,11 +209,8 @@ Base class for events that can be converted to LLM messages.
#### Properties
-- `id`: EventID
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
-- `source`: SourceType
-- `timestamp`: str
#### Methods
@@ -234,8 +232,8 @@ This is originally the “MessageAction”, but it suppose not to be tool call.
#### Properties
- `activated_skills`: list[str]
+- `critic_result`: CriticResult | None
- `extended_content`: list[TextContent]
-- `id`: EventID
- `llm_message`: Message
- `llm_response_id`: str | None
- `model_config`: ClassVar[ConfigDict] = (configuration object)
@@ -245,7 +243,6 @@ This is originally the “MessageAction”, but it suppose not to be tool call.
- `source`: Literal['agent', 'user', 'environment']
- `thinking_blocks`: Sequence[ThinkingBlock | RedactedThinkingBlock]
Return the Anthropic thinking blocks from the LLM message.
-- `timestamp`: str
- `visualize`: Text
Return Rich Text representation of this message event.
@@ -264,7 +261,7 @@ Examples include tool execution, error, user reject.
#### Properties
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `source`: Literal['agent', 'user', 'environment']
- `tool_call_id`: str
@@ -277,7 +274,7 @@ Bases: [`ObservationBaseEvent`](#class-observationbaseevent)
#### Properties
- `action_id`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `observation`: Observation
- `visualize`: Text
@@ -296,7 +293,7 @@ Event indicating that the agent execution was paused by user request.
#### Properties
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `source`: Literal['agent', 'user', 'environment']
- `visualize`: Text
@@ -310,7 +307,7 @@ System prompt added by the agent.
#### Properties
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `source`: Literal['agent', 'user', 'environment']
- `system_prompt`: TextContent
@@ -331,7 +328,7 @@ Event from VLLM representing token IDs used in LLM interaction.
#### Properties
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `prompt_token_ids`: list[int]
- `response_token_ids`: list[int]
@@ -346,7 +343,7 @@ Observation when user rejects an action in confirmation mode.
#### Properties
- `action_id`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `rejection_reason`: str
- `visualize`: Text
diff --git a/sdk/api-reference/openhands.sdk.llm.mdx b/sdk/api-reference/openhands.sdk.llm.mdx
index 8e22367b..fc63ab18 100644
--- a/sdk/api-reference/openhands.sdk.llm.mdx
+++ b/sdk/api-reference/openhands.sdk.llm.mdx
@@ -11,14 +11,15 @@ Bases: `BaseContent`
#### Properties
-- `cache_prompt`: bool
- `image_urls`: list[str]
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `type`: Literal['image']
#### Methods
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
#### to_llm_dict()
Convert to LLM API format.
@@ -330,8 +331,6 @@ Bases: `BaseModel`
- `content`: Sequence[[TextContent](#class-textcontent) | [ImageContent](#class-imagecontent)]
- `force_string_serializer`: bool
- `function_calling_enabled`: bool
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `name`: str | None
- `reasoning_content`: str | None
- `responses_reasoning_item`: [ReasoningItemModel](#class-reasoningitemmodel) | None
@@ -360,6 +359,10 @@ Policy (non-stream):
- Collect assistant text by concatenating output_text parts from message items
- Normalize function_call items to MessageToolCall list
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
#### to_chat_dict()
Serialize message for OpenAI Chat Completions.
@@ -401,10 +404,11 @@ for Responses function_call_output call_id.
- `arguments`: str
- `id`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `name`: str
- `origin`: Literal['completion', 'responses']
+- `costs`: list[Cost]
+- `response_latencies`: list[ResponseLatency]
+- `token_usages`: list[TokenUsage]
#### Methods
@@ -418,6 +422,10 @@ Create a MessageToolCall from a typed OpenAI Responses function_call item.
Note: OpenAI Responses function_call.arguments is already a JSON string.
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
#### to_chat_dict()
Serialize to OpenAI Chat Completions tool_calls format.
@@ -426,29 +434,6 @@ Serialize to OpenAI Chat Completions tool_calls format.
Serialize to OpenAI Responses ‘function_call’ input item format.
-### class Metrics
-
-Bases: [`MetricsSnapshot`](#class-metricssnapshot)
-
-Metrics class can record various metrics during running and evaluation.
-We track:
-
- - accumulated_cost and costs
- - max_budget_per_task (budget limit)
- - A list of ResponseLatency
- - A list of TokenUsage (one per call).
-
-
-#### Properties
-
-- `costs`: list[Cost]
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
-- `response_latencies`: list[ResponseLatency]
-- `token_usages`: list[TokenUsage]
-
-#### Methods
-
#### add_cost()
#### add_response_latency()
@@ -490,6 +475,10 @@ Log the metrics.
Merge ‘other’ metrics into this one.
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
#### classmethod validate_accumulated_cost()
### class MetricsSnapshot
@@ -506,9 +495,14 @@ Does not include lists of individual costs, latencies, or token usages.
- `accumulated_cost`: float
- `accumulated_token_usage`: TokenUsage | None
- `max_budget_per_task`: float | None
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `model_name`: str
+
+#### Methods
+
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
### class ReasoningItemModel
Bases: `BaseModel`
@@ -523,10 +517,15 @@ Do not log or render encrypted_content.
- `content`: list[str] | None
- `encrypted_content`: str | None
- `id`: str | None
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `status`: str | None
- `summary`: list[str]
+
+#### Methods
+
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
### class RedactedThinkingBlock
Bases: `BaseModel`
@@ -540,9 +539,14 @@ before extended thinking was enabled.
#### Properties
- `data`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `type`: Literal['redacted_thinking']
+
+#### Methods
+
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
### class RegistryEvent
Bases: `BaseModel`
@@ -571,7 +575,7 @@ Key features:
- `active_llm`: [LLM](#class-llm) | None
- `llms_for_routing`: dict[str, [LLM](#class-llm)]
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `router_name`: str
@@ -631,7 +635,6 @@ Bases: `BaseContent`
#### Properties
-- `cache_prompt`: bool
- `enable_truncation`: bool
- `model_config`: ClassVar[ConfigDict] = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
@@ -657,8 +660,12 @@ and passed back to the API for tool use scenarios.
#### Properties
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `signature`: str | None
- `thinking`: str
-- `type`: Literal['thinking']
\ No newline at end of file
+- `type`: Literal['thinking']
+
+#### Methods
+
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
diff --git a/sdk/api-reference/openhands.sdk.tool.mdx b/sdk/api-reference/openhands.sdk.tool.mdx
index 1e6234ac..62b85a29 100644
--- a/sdk/api-reference/openhands.sdk.tool.mdx
+++ b/sdk/api-reference/openhands.sdk.tool.mdx
@@ -13,7 +13,7 @@ Base schema for input action.
#### Properties
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `visualize`: Text
Return Rich Text representation of this action.
@@ -47,9 +47,8 @@ Tool for signaling the completion of a task or conversation.
#### Properties
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
-- `name`: ClassVar[str] = 'finish'
#### Methods
@@ -65,6 +64,8 @@ Create FinishTool instance.
* Raises:
`ValueError` – If any parameters are provided.
+#### name = 'finish'
+
### class Observation
Bases: `Schema`, `ABC`
@@ -77,7 +78,7 @@ Base schema for output observation.
- `ERROR_MESSAGE_HEADER`: ClassVar[str] = '[An error occurred during execution.]\n'
- `content`: list[TextContent | ImageContent]
- `is_error`: bool
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `text`: str
Extract all text content from the observation.
@@ -113,9 +114,8 @@ Tool for logging thoughts without making changes.
#### Properties
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
+- `model_config` = (configuration object)
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
-- `name`: ClassVar[str] = 'think'
#### Methods
@@ -131,6 +131,8 @@ Create ThinkTool instance.
* Raises:
`ValueError` – If any parameters are provided.
+#### name = 'think'
+
### class Tool
Bases: `BaseModel`
@@ -142,13 +144,15 @@ This is only used in agent-sdk for type schema for server use.
#### Properties
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `name`: str
- `params`: dict[str, Any]
#### Methods
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
#### classmethod validate_name()
Validate that name is not empty.
diff --git a/sdk/api-reference/openhands.sdk.workspace.mdx b/sdk/api-reference/openhands.sdk.workspace.mdx
index a1427c87..48066655 100644
--- a/sdk/api-reference/openhands.sdk.workspace.mdx
+++ b/sdk/api-reference/openhands.sdk.workspace.mdx
@@ -25,8 +25,6 @@ support the context manager protocol for safe resource management.
#### Properties
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `working_dir`: Annotated[str, BeforeValidator(func=_convert_path_to_str, json_schema_input_type=PydanticUndefined), FieldInfo(annotation=NoneType, required=True, description='The working directory for agent operations and tool execution. Accepts both string paths and Path objects. Path objects are automatically converted to strings.')]
#### Methods
@@ -101,6 +99,10 @@ Get the git diff for the file at the path given.
* Raises:
`Exception` – If path is not a git repository or getting diff failed
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
#### pause()
Pause the workspace to conserve resources.
@@ -132,11 +134,16 @@ Result of executing a command in the workspace.
- `command`: str
- `exit_code`: int
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `stderr`: str
- `stdout`: str
- `timeout_occurred`: bool
+
+#### Methods
+
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
### class FileOperationResult
Bases: `BaseModel`
@@ -149,10 +156,15 @@ Result of a file upload or download operation.
- `destination_path`: str
- `error`: str | None
- `file_size`: int | None
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
- `source_path`: str
- `success`: bool
+
+#### Methods
+
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
### class LocalWorkspace
Bases: [`BaseWorkspace`](#class-baseworkspace)
@@ -172,13 +184,6 @@ should operate directly on the host system.
... content = workspace.read_file("README.md")
```
-
-#### Properties
-
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
-- `working_dir`: Annotated[str, BeforeValidator(_convert_path_to_str), Field(description='The working directory for agent operations and tool execution. Accepts both string paths and Path objects. Path objects are automatically converted to strings.')]
-
#### Methods
#### __init__()
@@ -263,6 +268,10 @@ Get the git diff for the file at the path given.
* Raises:
`Exception` – If path is not a git repository or getting diff failed
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
#### pause()
Pause the workspace (no-op for local workspaces).
@@ -306,12 +315,7 @@ as it provides better isolation and security.
Check if the remote workspace is alive by querying the health endpoint.
* Returns:
True if the health endpoint returns a successful response, False otherwise.
-- `api_key`: str | None
- `client`: Client
-- `host`: str
-- `model_config`: ClassVar[ConfigDict] = (configuration object)
- Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
-- `working_dir`: str
#### Methods
@@ -385,6 +389,10 @@ Get the git diff for the file at the path given.
* Raises:
`Exception` – If path is not a git repository or getting diff failed
+#### model_config = (configuration object)
+
+Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
+
#### model_post_init()
Override this method to perform additional initialization after __init__ and model_construct.
diff --git a/sdk/guides/critic.mdx b/sdk/guides/critic.mdx
new file mode 100644
index 00000000..c0098fbf
--- /dev/null
+++ b/sdk/guides/critic.mdx
@@ -0,0 +1,179 @@
+---
+title: Critic (Experimental)
+description: Real-time evaluation of agent actions using an LLM-based critic model.
+---
+
+
+**This feature is highly experimental** and subject to change. The API, configuration, and behavior may evolve significantly based on feedback and testing.
+
+
+
+The critic model is hosted by the OpenHands LLM Provider and is currently free to use. This example is available on GitHub: [examples/01_standalone_sdk/34_critic_example.py](https://github.com/OpenHands/software-agent-sdk/blob/main/examples/01_standalone_sdk/34_critic_example.py)
+
+
+## What is a Critic?
+
+A **critic** is an evaluator that analyzes agent actions and conversation history to predict the quality or success probability of agent decisions. The critic runs alongside the agent and provides:
+
+- **Quality scores**: Probability scores between 0.0 and 1.0 indicating predicted success
+- **Real-time feedback**: Scores computed during agent execution, not just at completion
+
+You can use critic scores to build automated workflows, such as triggering the agent to reflect on and fix its previous solution when the critic indicates poor task performance.
+
+
+This critic is a more advanced extension of the approach described in our blog post [SOTA on SWE-Bench Verified with Inference-Time Scaling and Critic Model](https://openhands.dev/blog/sota-on-swe-bench-verified-with-inference-time-scaling-and-critic-model). A technical report with detailed evaluation metrics is forthcoming.
+
+
+## Quick Start
+
+When using the OpenHands LLM Provider (`llm-proxy.*.all-hands.dev`), the critic is **automatically configured**; no additional setup is required.
+
+```python icon="python" expandable examples/01_standalone_sdk/34_critic_example.py
+"""Example demonstrating critic-based evaluation of agent actions.
+
+This is EXPERIMENTAL.
+
+This shows how to configure an agent with a critic to evaluate action quality
+in real time. The critic scores are displayed in the conversation visualizer.
+
+For All-Hands LLM proxy (llm-proxy.*.all-hands.dev), the critic is auto-configured
+using the same base_url with /vllm suffix and "critic" as the model name.
+"""
+
+import os
+import re
+
+from openhands.sdk import LLM, Agent, Conversation, Tool
+from openhands.sdk.critic import APIBasedCritic
+from openhands.sdk.critic.base import CriticBase
+from openhands.tools.file_editor import FileEditorTool
+from openhands.tools.task_tracker import TaskTrackerTool
+from openhands.tools.terminal import TerminalTool
+
+
+def get_required_env(name: str) -> str:
+ value = os.getenv(name)
+ if value:
+ return value
+ raise ValueError(
+ f"Missing required environment variable: {name}. "
+ f"Set {name} before running this example."
+ )
+
+
+def get_default_critic(llm: LLM) -> CriticBase | None:
+ """Auto-configure critic for All-Hands LLM proxy.
+
+ When the LLM base_url matches `llm-proxy.*.all-hands.dev`, returns an
+ APIBasedCritic configured with:
+ - server_url: {base_url}/vllm
+ - api_key: same as LLM
+ - model_name: "critic"
+
+ Returns None if base_url doesn't match or api_key is not set.
+ """
+ base_url = llm.base_url
+ api_key = llm.api_key
+ if base_url is None or api_key is None:
+ return None
+
+ # Match: llm-proxy.{env}.all-hands.dev (e.g., staging, prod, eval)
+ pattern = r"^https?://llm-proxy\.[^./]+\.all-hands\.dev"
+ if not re.match(pattern, base_url):
+ return None
+
+ return APIBasedCritic(
+ server_url=f"{base_url.rstrip('/')}/vllm",
+ api_key=api_key,
+ model_name="critic",
+ )
+
+
+llm_api_key = get_required_env("LLM_API_KEY")
+
+llm = LLM(
+ model=os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929"),
+ api_key=llm_api_key,
+ base_url=os.getenv("LLM_BASE_URL", None),
+)
+
+# Try auto-configuration for All-Hands proxy, fall back to explicit env vars
+critic = get_default_critic(llm)
+if critic is None:
+ critic = APIBasedCritic(
+ server_url=get_required_env("CRITIC_SERVER_URL"),
+ api_key=get_required_env("CRITIC_API_KEY"),
+ model_name=get_required_env("CRITIC_MODEL_NAME"),
+ )
+
+
+# Configure agent with critic
+agent = Agent(
+ llm=llm,
+ tools=[
+ Tool(name=TerminalTool.name),
+ Tool(name=FileEditorTool.name),
+ Tool(name=TaskTrackerTool.name),
+ ],
+ # Add critic to evaluate agent actions
+ critic=critic,
+)
+
+cwd = os.getcwd()
+conversation = Conversation(agent=agent, workspace=cwd)
+
+conversation.send_message(
+ "Create a file called GREETING.txt with a friendly greeting message."
+)
+conversation.run()
+
+print("\nAll done! Check the output above for 'Critic Score' in the visualizer.")
+```
+
+```bash Running the Example
+uv run python examples/01_standalone_sdk/34_critic_example.py
+```
+
+## Understanding Critic Results
+
+Critic evaluations produce scores and feedback:
+
+- **`score`**: Float between 0.0 and 1.0 representing predicted success probability
+- **`message`**: Optional feedback with detailed probabilities
+- **`success`**: Boolean property (True if score >= 0.5)
+
+Results are automatically displayed in the conversation visualizer:
+
+![Critic score displayed in the conversation visualizer](/sdk/guides/screenshots/critic-sdk-visualizer.png)
+
+### Accessing Results Programmatically
+
+```python
+from openhands.sdk import Event, ActionEvent, MessageEvent
+
+def callback(event: Event):
+ if isinstance(event, (ActionEvent, MessageEvent)):
+ if event.critic_result is not None:
+ print(f"Critic score: {event.critic_result.score:.3f}")
+ print(f"Success: {event.critic_result.success}")
+
+conversation = Conversation(agent=agent, callbacks=[callback])
+```
+
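+You can extend this callback pattern into the automated reflection workflow mentioned earlier in this guide. The sketch below is illustrative rather than an official recipe: it assumes the `agent` configured in the quick-start example, and the score threshold and follow-up prompt are assumptions to tune for your workload.
+
+```python
+from openhands.sdk import ActionEvent, Conversation, Event, MessageEvent
+
+REFLECTION_THRESHOLD = 0.3  # hypothetical cutoff; scores range from 0.0 to 1.0
+low_scores: list[float] = []
+
+
+def track_low_scores(event: Event) -> None:
+    # Record critic scores that fall below the threshold.
+    if isinstance(event, (ActionEvent, MessageEvent)):
+        result = event.critic_result
+        if result is not None and result.score < REFLECTION_THRESHOLD:
+            low_scores.append(result.score)
+
+
+conversation = Conversation(agent=agent, callbacks=[track_low_scores])
+conversation.send_message(
+    "Create a file called GREETING.txt with a friendly greeting message."
+)
+conversation.run()
+
+# If the critic flagged low-scoring steps, ask the agent to review and fix them.
+if low_scores:
+    conversation.send_message(
+        "Some of your previous steps received low critic scores. "
+        "Review your solution, identify what went wrong, and fix it."
+    )
+    conversation.run()
+```
+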
+## Troubleshooting
+
+### Critic Evaluations Not Appearing
+
+- Verify the critic is properly configured and passed to the Agent
+- Ensure you're using the OpenHands LLM Provider (`llm-proxy.*.all-hands.dev`)
+
+### API Authentication Errors
+
+- Verify `LLM_API_KEY` is set correctly (a quick check is sketched below)
+- Check that the API key has not expired
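+
+If the key looks correct but requests still fail, this minimal sanity check (a sketch, assuming the same `LLM_API_KEY` variable the examples above read) can rule out a missing export:
+
+```python
+import os
+
+# Fail fast if the key the examples expect is missing from the environment.
+if not os.getenv("LLM_API_KEY"):
+    raise SystemExit("LLM_API_KEY is not set; export it before running the example.")
+```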
+
+## Next Steps
+
+- **[Observability](/sdk/guides/observability)** - Monitor and log agent behavior
+- **[Metrics](/sdk/guides/metrics)** - Collect performance metrics
+- **[Stuck Detector](/sdk/guides/agent-stuck-detector)** - Detect unproductive agent patterns
diff --git a/sdk/guides/hooks.mdx b/sdk/guides/hooks.mdx
index eaee0390..620cd38c 100644
--- a/sdk/guides/hooks.mdx
+++ b/sdk/guides/hooks.mdx
@@ -37,7 +37,7 @@ from pathlib import Path
from pydantic import SecretStr
from openhands.sdk import LLM, Conversation
-from openhands.sdk.hooks import HookConfig
+from openhands.sdk.hooks import HookConfig, HookDefinition, HookMatcher
from openhands.tools.preset.default import get_default_agent
@@ -66,60 +66,65 @@ with tempfile.TemporaryDirectory() as tmpdir:
log_file = workspace / "tool_usage.log"
summary_file = workspace / "summary.txt"
- # Configure ALL hook types in one config
- hook_config = HookConfig.from_dict(
- {
- "hooks": {
- "PreToolUse": [
- {
- "matcher": "terminal",
- "hooks": [
- {
- "type": "command",
- "command": str(SCRIPT_DIR / "block_dangerous.sh"),
- "timeout": 10,
- }
- ],
- }
+ # Configure hooks using the typed approach (recommended)
+ # This provides better type safety and IDE support
+ hook_config = HookConfig(
+ pre_tool_use=[
+ HookMatcher(
+ matcher="terminal",
+ hooks=[
+ HookDefinition(
+ command=str(SCRIPT_DIR / "block_dangerous.sh"),
+ timeout=10,
+ )
],
- "PostToolUse": [
- {
- "matcher": "*",
- "hooks": [
- {
- "type": "command",
- "command": f"LOG_FILE={log_file} "
- f"{SCRIPT_DIR / 'log_tools.sh'}",
- "timeout": 5,
- }
- ],
- }
+ )
+ ],
+ post_tool_use=[
+ HookMatcher(
+ matcher="*",
+ hooks=[
+ HookDefinition(
+ command=(f"LOG_FILE={log_file} {SCRIPT_DIR / 'log_tools.sh'}"),
+ timeout=5,
+ )
],
- "UserPromptSubmit": [
- {
- "hooks": [
- {
- "type": "command",
- "command": str(SCRIPT_DIR / "inject_git_context.sh"),
- }
- ],
- }
+ )
+ ],
+ user_prompt_submit=[
+ HookMatcher(
+ hooks=[
+ HookDefinition(
+ command=str(SCRIPT_DIR / "inject_git_context.sh"),
+ )
],
- "Stop": [
- {
- "hooks": [
- {
- "type": "command",
- "command": f"SUMMARY_FILE={summary_file} "
- f"{SCRIPT_DIR / 'require_summary.sh'}",
- }
- ],
- }
+ )
+ ],
+ stop=[
+ HookMatcher(
+ hooks=[
+ HookDefinition(
+ command=(
+ f"SUMMARY_FILE={summary_file} "
+ f"{SCRIPT_DIR / 'require_summary.sh'}"
+ ),
+ )
],
- }
- }
+ )
+ ],
)
+ # Alternative: You can also use .from_dict() for loading from JSON config files
+ # Example with a single hook matcher:
+ # hook_config = HookConfig.from_dict({
+ # "hooks": {
+ # "PreToolUse": [{
+ # "matcher": "terminal",
+ # "hooks": [{"command": "path/to/script.sh", "timeout": 10}]
+ # }]
+ # }
+ # })
+
agent = get_default_agent(llm=llm)
conversation = Conversation(
agent=agent,
diff --git a/sdk/guides/plugins.mdx b/sdk/guides/plugins.mdx
index 8f8287d5..b38531e8 100644
--- a/sdk/guides/plugins.mdx
+++ b/sdk/guides/plugins.mdx
@@ -91,10 +91,23 @@ for skill in plugin.skills:
print(f" Triggers: {skill.trigger}")
# Hooks
-print(f"\nHooks: {'Configured' if plugin.hooks else 'None'}")
-if plugin.hooks:
- for event_type, matchers in plugin.hooks.hooks.items():
- print(f" - {event_type}: {len(matchers)} matcher(s)")
+hook_config = plugin.hooks
+has_hooks = hook_config is not None and not hook_config.is_empty()
+print(f"\nHooks: {'Configured' if has_hooks else 'None'}")
+if has_hooks:
+ assert hook_config is not None
+ if hook_config.pre_tool_use:
+ print(f" - PreToolUse: {len(hook_config.pre_tool_use)} matcher(s)")
+ if hook_config.post_tool_use:
+ print(f" - PostToolUse: {len(hook_config.post_tool_use)} matcher(s)")
+ if hook_config.user_prompt_submit:
+ print(f" - UserPromptSubmit: {len(hook_config.user_prompt_submit)} matcher(s)")
+ if hook_config.session_start:
+ print(f" - SessionStart: {len(hook_config.session_start)} matcher(s)")
+ if hook_config.session_end:
+ print(f" - SessionEnd: {len(hook_config.session_end)} matcher(s)")
+ if hook_config.stop:
+ print(f" - Stop: {len(hook_config.stop)} matcher(s)")
# MCP Config
print(f"\nMCP Config: {'Configured' if plugin.mcp_config else 'None'}")
@@ -138,6 +151,7 @@ if not api_key:
print("Skipping agent demo (LLM_API_KEY not set)")
print("\nTo run the full demo, set the LLM_API_KEY environment variable:")
print(" export LLM_API_KEY=your-api-key")
+ print("EXAMPLE_COST: 0")
sys.exit(0)
# Configure LLM
@@ -146,6 +160,7 @@ llm = LLM(
usage_id="plugin-demo",
model=model,
api_key=SecretStr(api_key),
+ base_url=os.getenv("LLM_BASE_URL"),
)
# Create agent context with plugin skills
@@ -212,6 +227,7 @@ with tempfile.TemporaryDirectory() as tmpdir:
print("No hook log file found (hooks may not have executed)")
print(f"\nTotal cost: ${llm.metrics.accumulated_cost:.4f}")
+ print(f"EXAMPLE_COST: {llm.metrics.accumulated_cost:.4f}")
```
```bash Running the Example
diff --git a/sdk/guides/screenshots/critic-sdk-visualizer.png b/sdk/guides/screenshots/critic-sdk-visualizer.png
new file mode 100644
index 00000000..b8a7473c
Binary files /dev/null and b/sdk/guides/screenshots/critic-sdk-visualizer.png differ
diff --git a/sdk/guides/skill.mdx b/sdk/guides/skill.mdx
index 6ac5ec04..8b9ff62f 100644
--- a/sdk/guides/skill.mdx
+++ b/sdk/guides/skill.mdx
@@ -378,6 +378,7 @@ when triggered, plus the agent can proactively read them anytime.
"""
import os
+import sys
from pathlib import Path
from pydantic import SecretStr
@@ -392,108 +393,102 @@ from openhands.tools.file_editor import FileEditorTool
from openhands.tools.terminal import TerminalTool
-def main():
- # Get the directory containing this script
- script_dir = Path(__file__).parent
- example_skills_dir = script_dir / "example_skills"
-
- # =========================================================================
- # Part 1: Loading Skills from a Directory
- # =========================================================================
- print("=" * 80)
- print("Part 1: Loading Skills from a Directory")
- print("=" * 80)
-
- print(f"Loading skills from: {example_skills_dir}")
-
- # Discover resources in the skill directory
- skill_subdir = example_skills_dir / "rot13-encryption"
- resources = discover_skill_resources(skill_subdir)
- print("\nDiscovered resources in rot13-encryption/:")
- print(f" - scripts: {resources.scripts}")
- print(f" - references: {resources.references}")
- print(f" - assets: {resources.assets}")
-
- # Load skills from the directory
- repo_skills, knowledge_skills, agent_skills = load_skills_from_dir(
- example_skills_dir
- )
-
- print("\nLoaded skills from directory:")
- print(f" - Repo skills: {list(repo_skills.keys())}")
- print(f" - Knowledge skills: {list(knowledge_skills.keys())}")
- print(f" - Agent skills (SKILL.md): {list(agent_skills.keys())}")
-
- # Access the loaded skill and show all AgentSkills standard fields
- if agent_skills:
- skill_name = list(agent_skills.keys())[0]
- loaded_skill = agent_skills[skill_name]
- print(f"\nDetails for '{skill_name}' (AgentSkills standard fields):")
- print(f" - Name: {loaded_skill.name}")
- desc = loaded_skill.description or ""
- print(f" - Description: {desc[:70]}...")
- print(f" - License: {loaded_skill.license}")
- print(f" - Compatibility: {loaded_skill.compatibility}")
- print(f" - Metadata: {loaded_skill.metadata}")
- if loaded_skill.resources:
- print(" - Resources:")
- print(f" - Scripts: {loaded_skill.resources.scripts}")
- print(f" - References: {loaded_skill.resources.references}")
- print(f" - Assets: {loaded_skill.resources.assets}")
- print(f" - Skill root: {loaded_skill.resources.skill_root}")
-
- # =========================================================================
- # Part 2: Using Skills with an Agent
- # =========================================================================
- print("\n" + "=" * 80)
- print("Part 2: Using Skills with an Agent")
- print("=" * 80)
-
- # Check for API key
- api_key = os.getenv("LLM_API_KEY")
- if not api_key:
- print("Skipping agent demo (LLM_API_KEY not set)")
- print("\nTo run the full demo, set the LLM_API_KEY environment variable:")
- print(" export LLM_API_KEY=your-api-key")
- return
-
- # Configure LLM
- model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929")
- llm = LLM(
- usage_id="skills-demo",
- model=model,
- api_key=SecretStr(api_key),
- base_url=os.getenv("LLM_BASE_URL"),
- )
-
- # Create agent context with loaded skills
- agent_context = AgentContext(
- skills=list(agent_skills.values()),
- # Disable public skills for this demo to keep output focused
- load_public_skills=False,
- )
-
- # Create agent with tools so it can read skill resources
- tools = [
- Tool(name=TerminalTool.name),
- Tool(name=FileEditorTool.name),
- ]
- agent = Agent(llm=llm, tools=tools, agent_context=agent_context)
+# Get the directory containing this script
+script_dir = Path(__file__).parent
+example_skills_dir = script_dir / "example_skills"
+
+# =========================================================================
+# Part 1: Loading Skills from a Directory
+# =========================================================================
+print("=" * 80)
+print("Part 1: Loading Skills from a Directory")
+print("=" * 80)
+
+print(f"Loading skills from: {example_skills_dir}")
+
+# Discover resources in the skill directory
+skill_subdir = example_skills_dir / "rot13-encryption"
+resources = discover_skill_resources(skill_subdir)
+print("\nDiscovered resources in rot13-encryption/:")
+print(f" - scripts: {resources.scripts}")
+print(f" - references: {resources.references}")
+print(f" - assets: {resources.assets}")
+
+# Load skills from the directory
+repo_skills, knowledge_skills, agent_skills = load_skills_from_dir(example_skills_dir)
+
+print("\nLoaded skills from directory:")
+print(f" - Repo skills: {list(repo_skills.keys())}")
+print(f" - Knowledge skills: {list(knowledge_skills.keys())}")
+print(f" - Agent skills (SKILL.md): {list(agent_skills.keys())}")
+
+# Access the loaded skill and show all AgentSkills standard fields
+if agent_skills:
+ skill_name = next(iter(agent_skills))
+ loaded_skill = agent_skills[skill_name]
+ print(f"\nDetails for '{skill_name}' (AgentSkills standard fields):")
+ print(f" - Name: {loaded_skill.name}")
+ desc = loaded_skill.description or ""
+ print(f" - Description: {desc[:70]}...")
+ print(f" - License: {loaded_skill.license}")
+ print(f" - Compatibility: {loaded_skill.compatibility}")
+ print(f" - Metadata: {loaded_skill.metadata}")
+ if loaded_skill.resources:
+ print(" - Resources:")
+ print(f" - Scripts: {loaded_skill.resources.scripts}")
+ print(f" - References: {loaded_skill.resources.references}")
+ print(f" - Assets: {loaded_skill.resources.assets}")
+ print(f" - Skill root: {loaded_skill.resources.skill_root}")
+
+# =========================================================================
+# Part 2: Using Skills with an Agent
+# =========================================================================
+print("\n" + "=" * 80)
+print("Part 2: Using Skills with an Agent")
+print("=" * 80)
+
+# Check for API key
+api_key = os.getenv("LLM_API_KEY")
+if not api_key:
+ print("Skipping agent demo (LLM_API_KEY not set)")
+ print("\nTo run the full demo, set the LLM_API_KEY environment variable:")
+ print(" export LLM_API_KEY=your-api-key")
+ sys.exit(0)
+
+# Configure LLM
+model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929")
+llm = LLM(
+ usage_id="skills-demo",
+ model=model,
+ api_key=SecretStr(api_key),
+ base_url=os.getenv("LLM_BASE_URL"),
+)
- # Create conversation
- conversation = Conversation(agent=agent, workspace=os.getcwd())
+# Create agent context with loaded skills
+agent_context = AgentContext(
+ skills=list(agent_skills.values()),
+ # Disable public skills for this demo to keep output focused
+ load_public_skills=False,
+)
- # Test the skill (triggered by "encrypt" keyword)
- # The skill provides instructions and a script for ROT13 encryption
- print("\nSending message with 'encrypt' keyword to trigger skill...")
- conversation.send_message("Encrypt the message 'hello world'.")
- conversation.run()
+# Create agent with tools so it can read skill resources
+tools = [
+ Tool(name=TerminalTool.name),
+ Tool(name=FileEditorTool.name),
+]
+agent = Agent(llm=llm, tools=tools, agent_context=agent_context)
- print(f"\nTotal cost: ${llm.metrics.accumulated_cost:.4f}")
+# Create conversation
+conversation = Conversation(agent=agent, workspace=os.getcwd())
+# Test the skill (triggered by "encrypt" keyword)
+# The skill provides instructions and a script for ROT13 encryption
+print("\nSending message with 'encrypt' keyword to trigger skill...")
+conversation.send_message("Encrypt the message 'hello world'.")
+conversation.run()
-if __name__ == "__main__":
- main()
+print(f"\nTotal cost: ${llm.metrics.accumulated_cost:.4f}")
+print(f"EXAMPLE_COST: {llm.metrics.accumulated_cost:.4f}")
```
```bash Running the Example