diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index fe5190692..d65ea02e5 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -93,51 +93,44 @@ jobs: - name: Select and configure run.yaml env: - CONFIG_ENVIRONMENT: ${{ matrix.environment }} + CONFIG_MODE: ${{ matrix.mode }} run: | CONFIGS_DIR="tests/e2e/configs" - ENVIRONMENT="$CONFIG_ENVIRONMENT" + MODE="$CONFIG_MODE" - echo "Looking for configurations in $CONFIGS_DIR/" + echo "Deployment mode: $MODE" - # List available configurations - if [ -d "$CONFIGS_DIR" ]; then - echo "Available configurations:" - ls -la "$CONFIGS_DIR"/*.yaml 2>/dev/null || echo "No YAML files found in $CONFIGS_DIR/" + # Select config based on mode: + # - library mode: run-library.yaml (llama-stack 0.3.0 format) + # - server mode: run-ci.yaml (original format) + if [ "$MODE" == "library" ]; then + CONFIG_FILE="$CONFIGS_DIR/run-library.yaml" else - echo "Configs directory '$CONFIGS_DIR' not found!" - exit 1 + CONFIG_FILE="$CONFIGS_DIR/run-ci.yaml" fi - # Determine which config file to use - CONFIG_FILE="$CONFIGS_DIR/run-$ENVIRONMENT.yaml" - - echo "Looking for: $CONFIG_FILE" + echo "Using configuration: $CONFIG_FILE" - if [ -f "$CONFIG_FILE" ]; then - echo "Found config for $ENVIRONMENT environment" - cp "$CONFIG_FILE" run.yaml - else - echo "Configuration file not found: $CONFIG_FILE" - echo "Available files:" - find "$CONFIGS_DIR" -name "*.yaml" + if [ ! -f "$CONFIG_FILE" ]; then + echo "❌ Configuration not found: $CONFIG_FILE" + echo "Available configs:" + ls -la "$CONFIGS_DIR"/*.yaml exit 1 fi - # Update paths for container environment (relative -> absolute) - sed -i 's|db_path: \.llama/distributions|db_path: /app-root/.llama/distributions|g' run.yaml - sed -i 's|db_path: tmp/|db_path: /app-root/.llama/distributions/|g' run.yaml - - echo "Successfully configured for $ENVIRONMENT environment" - echo "Using configuration: $(basename "$CONFIG_FILE")" + cp "$CONFIG_FILE" run.yaml + echo "✅ Configuration copied to run.yaml" - name: Show final configuration run: | echo "=== Configuration Summary ===" echo "Deployment mode: ${{ matrix.mode }}" echo "Environment: ${{ matrix.environment }}" - echo "Source config: tests/e2e/configs/run-${{ matrix.environment }}.yaml" - echo "Final file: run.yaml" + if [ "${{ matrix.mode }}" == "library" ]; then + echo "Source config: tests/e2e/configs/run-library.yaml" + else + echo "Source config: tests/e2e/configs/run-ci.yaml" + fi echo "" echo "=== Configuration Preview ===" echo "Providers: $(grep -c "provider_id:" run.yaml)" diff --git a/docker-compose.yaml b/docker-compose.yaml index 00b76dede..424606312 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -23,7 +23,6 @@ services: - RHEL_AI_PORT=${RHEL_AI_PORT} - RHEL_AI_API_KEY=${RHEL_AI_API_KEY} - RHEL_AI_MODEL=${RHEL_AI_MODEL} - - LLAMA_STACK_LOGGING=all=debug # enable llama-stack debug log networks: - lightspeednet healthcheck: diff --git a/docs/conversations_api.md b/docs/conversations_api.md new file mode 100644 index 000000000..e7496be16 --- /dev/null +++ b/docs/conversations_api.md @@ -0,0 +1,514 @@ +# Conversations API Guide + +This document explains how the Conversations API works with the Responses API in Lightspeed Core Stack (LCS). 
You will learn: + +* How conversation management works with the Responses API +* Conversation ID formats and normalization +* How to interact with conversations via REST API and CLI +* Database storage and retrieval of conversations + +--- + +## Table of Contents + +* [Introduction](#introduction) +* [Conversation ID Formats](#conversation-id-formats) + * [Llama Stack Format](#llama-stack-format) + * [Normalized Format](#normalized-format) + * [ID Conversion Utilities](#id-conversion-utilities) +* [How Conversations Work](#how-conversations-work) + * [Creating New Conversations](#creating-new-conversations) + * [Continuing Existing Conversations](#continuing-existing-conversations) + * [Conversation Storage](#conversation-storage) +* [API Endpoints](#api-endpoints) + * [Query Endpoint (v2)](#query-endpoint-v2) + * [Streaming Query Endpoint (v2)](#streaming-query-endpoint-v2) + * [Conversations List Endpoint (v3)](#conversations-list-endpoint-v3) + * [Conversation Detail Endpoint (v3)](#conversation-detail-endpoint-v3) +* [Testing with curl](#testing-with-curl) +* [Database Schema](#database-schema) +* [Troubleshooting](#troubleshooting) + +--- + +## Introduction + +Lightspeed Core Stack uses the **OpenAI Responses API** (`client.responses.create()`) for generating chat completions with conversation persistence. The Responses API provides: + +* Automatic conversation management with `store=True` +* Multi-turn conversation support +* Tool integration (RAG, MCP, function calls) +* Shield/guardrails support + +Conversations are stored in two locations: +1. **Llama Stack database** (`openai_conversations` and `conversation_items` tables in `public` schema) +2. **Lightspeed Stack database** (`user_conversation` table in `lightspeed-stack` schema) + +> [!NOTE] +> The Responses API replaced the older Agent API (`client.agents.create_turn()`) for better OpenAI compatibility and improved conversation management. + +--- + +## Conversation ID Formats + +### Llama Stack Format + +When Llama Stack creates a conversation, it generates an ID in the format: + +``` +conv_<48-character-hex-string> +``` + +**Example:** +``` +conv_0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e +``` + +This is the format used internally by Llama Stack and must be used when calling Llama Stack APIs. + +### Normalized Format + +Lightspeed Stack normalizes conversation IDs by removing the `conv_` prefix before: +* Storing in the database +* Returning to API clients +* Displaying in CLI tools + +**Example normalized ID:** +``` +0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e +``` + +This 48-character format is what users see and work with. + +### ID Conversion Utilities + +LCS provides utilities in `src/utils/suid.py` for ID conversion: + +```python +from utils.suid import normalize_conversation_id, to_llama_stack_conversation_id + +# Convert from Llama Stack format to normalized format +normalized_id = normalize_conversation_id("conv_0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e") +# Returns: "0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e" + +# Convert from normalized format to Llama Stack format +llama_stack_id = to_llama_stack_conversation_id("0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e") +# Returns: "conv_0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e" +``` + +--- + +## How Conversations Work + +### Creating New Conversations + +When a user makes a query **without** providing a `conversation_id`: + +1. LCS creates a new conversation using `client.conversations.create(metadata={})` +2. 
Llama Stack returns a conversation ID (e.g., `conv_abc123...`) +3. LCS normalizes the ID and stores it in the database +4. The query is sent to `client.responses.create()` with the conversation ID +5. The normalized ID is returned to the client + +**Code flow (from `src/app/endpoints/query_v2.py`):** + +```python +# No conversation_id provided - create a new conversation first +conversation = await client.conversations.create(metadata={}) +llama_stack_conv_id = conversation.id +# Store the normalized version +conversation_id = normalize_conversation_id(llama_stack_conv_id) + +# Use the conversation in responses.create() +response = await client.responses.create( + input=input_text, + model=model_id, + instructions=system_prompt, + store=True, + conversation=llama_stack_conv_id, # Use Llama Stack format + # ... other parameters +) +``` + +### Continuing Existing Conversations + +When a user provides an existing `conversation_id`: + +1. LCS receives the normalized ID (e.g., `0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e`) +2. Converts it to Llama Stack format (adds `conv_` prefix) +3. Sends the query to `client.responses.create()` with the existing conversation ID +4. Llama Stack retrieves the conversation history and continues the conversation +5. The conversation history is automatically included in the LLM context + +**Code flow:** + +```python +# Conversation ID was provided - convert to llama-stack format +conversation_id = query_request.conversation_id +llama_stack_conv_id = to_llama_stack_conversation_id(conversation_id) + +# Use the existing conversation +response = await client.responses.create( + input=input_text, + model=model_id, + conversation=llama_stack_conv_id, # Existing conversation + # ... other parameters +) +``` + +### Conversation Storage + +Conversations are stored in **two databases**: + +#### 1. Llama Stack Database (PostgreSQL `public` schema) + +**Tables:** +- `openai_conversations`: Stores conversation metadata +- `conversation_items`: Stores individual messages/turns in conversations + +**Configuration (in `config/llama_stack_client_config.yaml`):** +```yaml +storage: + stores: + conversations: + table_name: openai_conversations + backend: sql_default +``` + +#### 2. Lightspeed Stack Database (PostgreSQL `lightspeed-stack` schema) + +**Table:** `user_conversation` + +Stores user-specific metadata: +- Conversation ID (normalized, without `conv_` prefix) +- User ID +- Last used model and provider +- Creation and last message timestamps +- Message count +- Topic summary + +--- + +## API Endpoints + +### Query Endpoint (v2) + +**Endpoint:** `POST /v2/query` + +**Request:** +```json +{ + "conversation_id": "0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e", + "query": "What is the OpenShift Assisted Installer?", + "model": "models/gemini-2.0-flash", + "provider": "gemini" +} +``` + +**Response:** +```json +{ + "conversation_id": "0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e", + "response": "The OpenShift Assisted Installer is...", + "rag_chunks": [], + "tool_calls": [], + "referenced_documents": [], + "truncated": false, + "input_tokens": 150, + "output_tokens": 200, + "available_quotas": {} +} +``` + +> [!NOTE] +> If `conversation_id` is omitted, a new conversation is automatically created and the new ID is returned in the response. 
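+
+The request and response shapes above can also be exercised from Python. The snippet below is a minimal sketch, not part of the LCS codebase: it assumes the service listens on `localhost:8090` with bearer-token authentication (the same assumptions as the curl examples later in this guide) and uses the third-party `requests` library.
+
+```python
+# Minimal sketch: call POST /v2/query twice, reusing the returned conversation_id.
+# Assumes LCS listens on localhost:8090 and TOKEN holds a valid bearer token.
+import os
+
+import requests
+
+BASE_URL = "http://localhost:8090"
+HEADERS = {
+    "Content-Type": "application/json",
+    "Authorization": f"Bearer {os.environ.get('TOKEN', '')}",
+}
+
+# First query: omit conversation_id so LCS creates a new conversation.
+first = requests.post(
+    f"{BASE_URL}/v2/query",
+    headers=HEADERS,
+    json={
+        "query": "What is the OpenShift Assisted Installer?",
+        "model": "models/gemini-2.0-flash",
+        "provider": "gemini",
+    },
+    timeout=60,
+)
+first.raise_for_status()
+conversation_id = first.json()["conversation_id"]  # normalized, no conv_ prefix
+
+# Follow-up query: send the normalized ID back to continue the same conversation.
+follow_up = requests.post(
+    f"{BASE_URL}/v2/query",
+    headers=HEADERS,
+    json={
+        "conversation_id": conversation_id,
+        "query": "How do I install it?",
+        "model": "models/gemini-2.0-flash",
+        "provider": "gemini",
+    },
+    timeout=60,
+)
+follow_up.raise_for_status()
+print(follow_up.json()["response"])
+```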
+ +### Streaming Query Endpoint (v2) + +**Endpoint:** `POST /v2/streaming_query` + +**Request:** Same as `/v2/query` + +**Response:** Server-Sent Events (SSE) stream + +``` +data: {"event": "start", "data": {"conversation_id": "0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e"}} + +data: {"event": "token", "data": {"id": 0, "token": "The "}} + +data: {"event": "token", "data": {"id": 1, "token": "OpenShift "}} + +data: {"event": "turn_complete", "data": {"id": 10, "token": "The OpenShift Assisted Installer is..."}} + +data: {"event": "end", "data": {"referenced_documents": [], "input_tokens": 150, "output_tokens": 200}} +``` + +### Conversations List Endpoint (v3) + +**Endpoint:** `GET /v3/conversations` + +**Response:** +```json +{ + "conversations": [ + { + "conversation_id": "0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e", + "created_at": "2025-11-24T10:30:00Z", + "last_message_at": "2025-11-24T10:35:00Z", + "message_count": 5, + "last_used_model": "gemini-2.0-flash-exp", + "last_used_provider": "google", + "topic_summary": "OpenShift Assisted Installer discussion" + } + ] +} +``` + +### Conversation Detail Endpoint (v3) + +**Endpoint:** `GET /v3/conversations/{conversation_id}` + +**Response:** +```json +{ + "conversation_id": "0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e", + "created_at": "2025-11-24T10:30:00Z", + "chat_history": [ + { + "started_at": "2025-11-24T10:30:00Z", + "messages": [ + { + "type": "user", + "content": "What is the OpenShift Assisted Installer?" + }, + { + "type": "assistant", + "content": "The OpenShift Assisted Installer is..." + } + ] + } + ] +} +``` + +--- + +## Testing with curl + +You can test the Conversations API endpoints using `curl`. The examples below assume the server is running on `localhost:8090`. + +First, set your authorization token: + +```bash +export TOKEN="" +``` + +### Non-Streaming Query (New Conversation) + +To start a new conversation, omit the `conversation_id` field: + +```bash +curl -X POST http://localhost:8090/v2/query \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $TOKEN" \ + -d '{ + "query": "What is the OpenShift Assisted Installer?", + "model": "models/gemini-2.0-flash", + "provider": "gemini" + }' +``` + +**Response:** +```json +{ + "conversation_id": "0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e", + "response": "The OpenShift Assisted Installer is...", + "rag_chunks": [], + "tool_calls": [], + "referenced_documents": [], + "truncated": false, + "input_tokens": 150, + "output_tokens": 200, + "available_quotas": {} +} +``` + +### Non-Streaming Query (Continue Conversation) + +To continue an existing conversation, include the `conversation_id` from a previous response: + +```bash +curl -X POST http://localhost:8090/v2/query \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $TOKEN" \ + -d '{ + "conversation_id": "0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e", + "query": "How do I install it?", + "model": "models/gemini-2.0-flash", + "provider": "gemini" + }' +``` + +### Streaming Query (New Conversation) + +For streaming responses, use the `/v2/streaming_query` endpoint. 
The response is returned as Server-Sent Events (SSE): + +```bash +curl -X POST http://localhost:8090/v2/streaming_query \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $TOKEN" \ + -H "Accept: text/event-stream" \ + -d '{ + "query": "What is the OpenShift Assisted Installer?", + "model": "models/gemini-2.0-flash", + "provider": "gemini" + }' +``` + +**Response (SSE stream):** +``` +data: {"event": "start", "data": {"conversation_id": "0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e"}} + +data: {"event": "token", "data": {"id": 0, "token": "The "}} + +data: {"event": "token", "data": {"id": 1, "token": "OpenShift "}} + +data: {"event": "turn_complete", "data": {"id": 10, "token": "The OpenShift Assisted Installer is..."}} + +data: {"event": "end", "data": {"referenced_documents": [], "input_tokens": 150, "output_tokens": 200}} +``` + +### Streaming Query (Continue Conversation) + +```bash +curl -X POST http://localhost:8090/v2/streaming_query \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $TOKEN" \ + -H "Accept: text/event-stream" \ + -d '{ + "conversation_id": "0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e", + "query": "Can you explain the prerequisites?", + "model": "models/gemini-2.0-flash", + "provider": "gemini" + }' +``` + +### List Conversations + +```bash +curl -X GET http://localhost:8090/v3/conversations \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $TOKEN" +``` + +### Get Conversation Details + +```bash +curl -X GET http://localhost:8090/v3/conversations/0d21ba731f21f798dc9680125d5d6f493e4a7ab79f25670e \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $TOKEN" +``` + +--- + +## Database Schema + +### Lightspeed Stack Schema + +**Table:** `lightspeed-stack.user_conversation` + +```sql +CREATE TABLE "lightspeed-stack".user_conversation ( + id VARCHAR PRIMARY KEY, -- Normalized conversation ID (48 chars) + user_id VARCHAR NOT NULL, -- User identifier + last_used_model VARCHAR NOT NULL, -- Model name (e.g., "gemini-2.0-flash-exp") + last_used_provider VARCHAR NOT NULL, -- Provider (e.g., "google") + created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + last_message_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(), + message_count INTEGER DEFAULT 0, + topic_summary VARCHAR DEFAULT '' +); + +CREATE INDEX idx_user_conversation_user_id ON "lightspeed-stack".user_conversation(user_id); +``` + +> [!NOTE] +> The `id` column uses `VARCHAR` without a length limit, which PostgreSQL treats similarly to `TEXT`. This accommodates the 48-character normalized conversation IDs. + +### Llama Stack Schema + +**Table:** `public.openai_conversations` + +```sql +CREATE TABLE public.openai_conversations ( + id VARCHAR(64) PRIMARY KEY, -- Full ID with conv_ prefix (53 chars) + created_at TIMESTAMP, + metadata JSONB +); +``` + +**Table:** `public.conversation_items` + +```sql +CREATE TABLE public.conversation_items ( + id VARCHAR(64) PRIMARY KEY, + conversation_id VARCHAR(64) REFERENCES openai_conversations(id), + turn_number INTEGER, + content JSONB, + created_at TIMESTAMP +); +``` + +--- + +## Troubleshooting + +### Conversation Not Found Error + +**Symptom:** +``` +Error: Conversation not found (HTTP 404) +``` + +**Possible Causes:** +1. Conversation ID was truncated (should be 48 characters, not 41) +2. Conversation ID has incorrect prefix (should NOT include `conv_` when calling LCS API) +3. Conversation was deleted +4. 
Database connection issue + +**Solution:** +- Verify the conversation ID is exactly 48 characters +- Ensure you're using the normalized ID format (without `conv_` prefix) when calling LCS endpoints +- Check database connectivity + +### Model/Provider Changes Not Persisting + +**Symptom:** +The `last_used_model` and `last_used_provider` fields don't update when using a different model. + +**Explanation:** +This is expected behavior. The Responses API v2 allows you to change the model/provider for each query within the same conversation. The `last_used_model` field only tracks the most recently used model for display purposes in the conversation list. + +### Empty Conversation History + +**Symptom:** +Calling `/v3/conversations/{conversation_id}` returns empty `chat_history`. + +**Possible Causes:** +1. The conversation was just created and has no messages yet +2. The conversation exists in Lightspeed DB but not in Llama Stack DB (data inconsistency) +3. Database connection to Llama Stack is failing + +**Solution:** +- Verify the conversation has messages by checking `message_count` +- Check Llama Stack database connectivity +- Verify `openai_conversations` and `conversation_items` tables exist and are accessible + +--- + +## References + +- [OpenAI Responses API Documentation](https://platform.openai.com/docs/api-reference/responses) +- [Llama Stack Documentation](https://github.com/meta-llama/llama-stack) +- [LCS Configuration Guide](./config.md) +- [LCS Getting Started Guide](./getting_started.md) diff --git a/docs/openapi.json b/docs/openapi.json index 8acbb5007..7c83fcce9 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -1186,11 +1186,11 @@ "/v1/query": { "post": { "tags": [ - "query" + "query_v1" ], - "summary": "Query Endpoint Handler", - "description": "Handle request to the /query endpoint using Agent API.\n\nThis is a wrapper around query_endpoint_handler_base that provides\nthe Agent API specific retrieve_response and get_topic_summary functions.\n\nReturns:\n QueryResponse: Contains the conversation ID and the LLM-generated response.", - "operationId": "query_endpoint_handler_v1_query_post", + "summary": "Query Endpoint Handler V1", + "description": "Handle request to the /query endpoint using Responses API.\n\nThis is a wrapper around query_endpoint_handler_base that provides\nthe Responses API specific retrieve_response and get_topic_summary functions.\n\nReturns:\n QueryResponse: Contains the conversation ID and the LLM-generated response.", + "operationId": "query_endpoint_handler_v2_v1_query_post", "requestBody": { "content": { "application/json": { @@ -1211,35 +1211,34 @@ }, "example": { "available_quotas": { - "daily": 1000, - "monthly": 50000 + "ClusterQuotaLimiter": 998911, + "UserQuotaLimiter": 998911 }, "conversation_id": "123e4567-e89b-12d3-a456-426614174000", - "input_tokens": 150, - "output_tokens": 75, - "rag_chunks": [ - { - "content": "OLM is a component of the Operator Framework toolkit...", - "score": 0.95, - "source": "kubernetes-docs/operators.md" - } - ], + "input_tokens": 123, + "output_tokens": 456, "referenced_documents": [ { - "doc_title": "Operator Lifecycle Manager (OLM)", - "doc_url": "https://docs.openshift.com/container-platform/4.15/operators/olm/index.html" + "doc_title": "Operator Lifecycle Manager concepts and resources", + "doc_url": "https://docs.openshift.com/container-platform/4.15/operators/understanding/olm/olm-understanding-olm.html" } ], "response": "Operator Lifecycle Manager (OLM) helps users install...", "tool_calls": [ { - 
"arguments": { - "query": "operator lifecycle manager" - }, - "result": { - "chunks_found": 5 - }, - "tool_name": "knowledge_search" + "args": {}, + "id": "1", + "name": "tool1", + "type": "tool_call" + } + ], + "tool_results": [ + { + "content": "bla", + "id": "1", + "round": 1, + "status": "success", + "type": "tool_result" } ], "truncated": false @@ -1497,11 +1496,11 @@ "/v1/streaming_query": { "post": { "tags": [ - "streaming_query" + "streaming_query_v1" ], - "summary": "Streaming Query Endpoint Handler", - "description": "Handle request to the /streaming_query endpoint using Agent API.\n\nThis is a wrapper around streaming_query_endpoint_handler_base that provides\nthe Agent API specific retrieve_response and response generator functions.\n\nReturns:\n StreamingResponse: An HTTP streaming response yielding\n SSE-formatted events for the query lifecycle.\n\nRaises:\n HTTPException: Returns HTTP 500 if unable to connect to the\n Llama Stack server.", - "operationId": "streaming_query_endpoint_handler_v1_streaming_query_post", + "summary": "Streaming Query Endpoint Handler V1", + "description": "Handle request to the /streaming_query endpoint using Responses API.\n\nReturns a streaming response using Server-Sent Events (SSE) format with\ncontent type text/event-stream.\n\nReturns:\n StreamingResponse: An HTTP streaming response yielding\n SSE-formatted events for the query lifecycle with content type\n text/event-stream.\n\nRaises:\n HTTPException:\n - 401: Unauthorized - Missing or invalid credentials\n - 403: Forbidden - Insufficient permissions or model override not allowed\n - 404: Not Found - Conversation, model, or provider not found\n - 422: Unprocessable Entity - Request validation failed\n - 429: Too Many Requests - Quota limit exceeded\n - 500: Internal Server Error - Configuration not loaded or other server errors\n - 503: Service Unavailable - Unable to connect to Llama Stack backend", + "operationId": "streaming_query_endpoint_handler_v2_v1_streaming_query_post", "requestBody": { "content": { "application/json": { @@ -1514,16 +1513,14 @@ }, "responses": { "200": { - "description": "Streaming response (Server-Sent Events)", + "description": "Successful response", "content": { - "application/json": { - "schema": {} - }, "text/event-stream": { "schema": { - "type": "string" + "type": "string", + "format": "text/event-stream" }, - "example": "data: {\"event\": \"start\", \"data\": {\"conversation_id\": \"123e4567-e89b-12d3-a456-426614174000\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 0, \"token\": \"Hello\"}}\n\ndata: {\"event\": \"end\", \"data\": {\"referenced_documents\": [], \"truncated\": null, \"input_tokens\": 0, \"output_tokens\": 0}, \"available_quotas\": {}}\n\n" + "example": "data: {\"event\": \"start\", \"data\": {\"conversation_id\": \"123e4567-e89b-12d3-a456-426614174000\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 0, \"token\": \"No Violation\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 1, \"token\": \"\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 2, \"token\": \"Hello\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 3, \"token\": \"!\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 4, \"token\": \" How\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 5, \"token\": \" can\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 6, \"token\": \" I\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 7, \"token\": \" assist\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 8, \"token\": \" 
you\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 9, \"token\": \" today\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 10, \"token\": \"?\"}}\n\ndata: {\"event\": \"turn_complete\", \"data\": {\"token\": \"Hello! How can I assist you today?\"}}\n\ndata: {\"event\": \"end\", \"data\": {\"referenced_documents\": [], \"truncated\": null, \"input_tokens\": 11, \"output_tokens\": 19}, \"available_quotas\": {}}\n\n" } } }, @@ -2229,9 +2226,9 @@ "/v1/conversations": { "get": { "tags": [ - "conversations" + "conversations_v1" ], - "summary": "Get Conversations List Endpoint Handler", + "summary": "Conversations List Endpoint Handler V1", "description": "Handle request to retrieve all conversations for the authenticated user.", "operationId": "get_conversations_list_endpoint_handler_v1_conversations_get", "responses": { @@ -2368,10 +2365,10 @@ "/v1/conversations/{conversation_id}": { "get": { "tags": [ - "conversations" + "conversations_v1" ], - "summary": "Get Conversation Endpoint Handler", - "description": "Handle request to retrieve a conversation by ID.\n\nRetrieve a conversation's chat history by its ID. Then fetches\nthe conversation session from the Llama Stack backend,\nsimplifies the session data to essential chat history, and\nreturns it in a structured response. Raises HTTP 400 for\ninvalid IDs, 404 if not found, 503 if the backend is\nunavailable, and 500 for unexpected errors.\n\nParameters:\n conversation_id (str): Unique identifier of the conversation to retrieve.\n\nReturns:\n ConversationResponse: Structured response containing the conversation\n ID and simplified chat history.", + "summary": "Conversation Get Endpoint Handler V1", + "description": "Handle request to retrieve a conversation by ID using Conversations API.\n\nRetrieve a conversation's chat history by its ID using the LlamaStack\nConversations API. This endpoint fetches the conversation items from\nthe backend, simplifies them to essential chat history, and returns\nthem in a structured response. Raises HTTP 400 for invalid IDs, 404\nif not found, 503 if the backend is unavailable, and 500 for\nunexpected errors.\n\nArgs:\n request: The FastAPI request object\n conversation_id: Unique identifier of the conversation to retrieve\n auth: Authentication tuple from dependency\n\nReturns:\n ConversationResponse: Structured response containing the conversation\n ID and simplified chat history", "operationId": "get_conversation_endpoint_handler_v1_conversations__conversation_id__get", "parameters": [ { @@ -2572,10 +2569,10 @@ }, "delete": { "tags": [ - "conversations" + "conversations_v1" ], - "summary": "Delete Conversation Endpoint Handler", - "description": "Handle request to delete a conversation by ID.\n\nValidates the conversation ID format and attempts to delete the\ncorresponding session from the Llama Stack backend. 
Raises HTTP\nerrors for invalid IDs, not found conversations, connection\nissues, or unexpected failures.\n\nReturns:\n ConversationDeleteResponse: Response indicating the result of the deletion operation.", + "summary": "Conversation Delete Endpoint Handler V1", + "description": "Handle request to delete a conversation by ID using Conversations API.\n\nValidates the conversation ID format and attempts to delete the\nconversation from the Llama Stack backend using the Conversations API.\nRaises HTTP errors for invalid IDs, not found conversations, connection\nissues, or unexpected failures.\n\nArgs:\n request: The FastAPI request object\n conversation_id: Unique identifier of the conversation to delete\n auth: Authentication tuple from dependency\n\nReturns:\n ConversationDeleteResponse: Response indicating the result of the deletion operation", "operationId": "delete_conversation_endpoint_handler_v1_conversations__conversation_id__delete", "parameters": [ { @@ -2691,6 +2688,178 @@ } } }, + "500": { + "description": "Internal server error", + "content": { + "application/json": { + "examples": { + "configuration": { + "value": { + "detail": { + "cause": "Lightspeed Stack configuration has not been initialized.", + "response": "Configuration is not loaded" + } + } + }, + "database": { + "value": { + "detail": { + "cause": "Failed to query the database", + "response": "Database query failed" + } + } + } + }, + "schema": { + "$ref": "#/components/schemas/InternalServerErrorResponse" + } + } + } + }, + "503": { + "description": "Service unavailable", + "content": { + "application/json": { + "examples": { + "llama stack": { + "value": { + "detail": { + "cause": "Connection error while trying to reach backend service.", + "response": "Unable to connect to Llama Stack" + } + } + } + }, + "schema": { + "$ref": "#/components/schemas/ServiceUnavailableResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "put": { + "tags": [ + "conversations_v1" + ], + "summary": "Conversation Update Endpoint Handler V1", + "description": "Handle request to update a conversation metadata using Conversations API.\n\nUpdates the conversation metadata (including topic summary) in both the\nLlamaStack backend using the Conversations API and the local database.\n\nArgs:\n request: The FastAPI request object\n conversation_id: Unique identifier of the conversation to update\n update_request: Request containing the topic summary to update\n auth: Authentication tuple from dependency\n\nReturns:\n ConversationUpdateResponse: Response indicating the result of the update operation", + "operationId": "update_conversation_endpoint_handler_v1_conversations__conversation_id__put", + "parameters": [ + { + "name": "conversation_id", + "in": "path", + "required": true, + "schema": { + "type": "string", + "title": "Conversation Id" + } + } + ], + "requestBody": { + "required": true, + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ConversationUpdateRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ConversationUpdateResponse" + }, + "example": { + "conversation_id": "123e4567-e89b-12d3-a456-426614174000", + "message": "Topic summary updated successfully", + "success": true + } + } + } + }, + "400": { + "description": 
"Invalid request format", + "content": { + "application/json": { + "examples": { + "conversation_id": { + "value": { + "detail": { + "cause": "The conversation ID 123e4567-e89b-12d3-a456-426614174000 has invalid format.", + "response": "Invalid conversation ID format" + } + } + } + }, + "schema": { + "$ref": "#/components/schemas/BadRequestResponse" + } + } + } + }, + "401": { + "description": "Unauthorized", + "content": { + "application/json": { + "examples": { + "missing header": { + "value": { + "detail": { + "cause": "No Authorization header found", + "response": "Missing or invalid credentials provided by client" + } + } + }, + "missing token": { + "value": { + "detail": { + "cause": "No token found in Authorization header", + "response": "Missing or invalid credentials provided by client" + } + } + } + }, + "schema": { + "$ref": "#/components/schemas/UnauthorizedResponse" + } + } + } + }, + "403": { + "description": "Permission denied", + "content": { + "application/json": { + "examples": { + "endpoint": { + "value": { + "detail": { + "cause": "User 6789 is not authorized to access this endpoint.", + "response": "User does not have permission to access this endpoint" + } + } + } + }, + "schema": { + "$ref": "#/components/schemas/ForbiddenResponse" + } + } + } + }, "404": { "description": "Resource not found", "content": { @@ -3169,26 +3338,6 @@ } } }, - "404": { - "description": "Resource not found", - "content": { - "application/json": { - "examples": { - "conversation": { - "value": { - "detail": { - "cause": "Conversation with ID 123e4567-e89b-12d3-a456-426614174000 does not exist", - "response": "Conversation not found" - } - } - } - }, - "schema": { - "$ref": "#/components/schemas/NotFoundResponse" - } - } - } - }, "500": { "description": "Internal server error", "content": { @@ -3273,682 +3422,90 @@ } } }, - "400": { - "description": "Invalid request format", - "content": { - "application/json": { - "examples": { - "conversation_id": { - "value": { - "detail": { - "cause": "The conversation ID 123e4567-e89b-12d3-a456-426614174000 has invalid format.", - "response": "Invalid conversation ID format" - } - } - } - }, - "schema": { - "$ref": "#/components/schemas/BadRequestResponse" - } - } - } - }, - "401": { - "description": "Unauthorized", - "content": { - "application/json": { - "examples": { - "missing header": { - "value": { - "detail": { - "cause": "No Authorization header found", - "response": "Missing or invalid credentials provided by client" - } - } - }, - "missing token": { - "value": { - "detail": { - "cause": "No token found in Authorization header", - "response": "Missing or invalid credentials provided by client" - } - } - } - }, - "schema": { - "$ref": "#/components/schemas/UnauthorizedResponse" - } - } - } - }, - "403": { - "description": "Permission denied", - "content": { - "application/json": { - "examples": { - "endpoint": { - "value": { - "detail": { - "cause": "User 6789 is not authorized to access this endpoint.", - "response": "User does not have permission to access this endpoint" - } - } - } - }, - "schema": { - "$ref": "#/components/schemas/ForbiddenResponse" - } - } - } - }, - "404": { - "description": "Resource not found", - "content": { - "application/json": { - "examples": { - "conversation": { - "value": { - "detail": { - "cause": "Conversation with ID 123e4567-e89b-12d3-a456-426614174000 does not exist", - "response": "Conversation not found" - } - } - } - }, - "schema": { - "$ref": "#/components/schemas/NotFoundResponse" - } - } - } - }, - 
"500": { - "description": "Internal server error", - "content": { - "application/json": { - "examples": { - "configuration": { - "value": { - "detail": { - "cause": "Lightspeed Stack configuration has not been initialized.", - "response": "Configuration is not loaded" - } - } - }, - "conversation cache": { - "value": { - "detail": { - "cause": "Conversation cache is not configured or unavailable.", - "response": "Conversation cache not configured" - } - } - } - }, - "schema": { - "$ref": "#/components/schemas/InternalServerErrorResponse" - } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/HTTPValidationError" - } - } - } - } - } - } - }, - "/v2/query": { - "post": { - "tags": [ - "query_v2" - ], - "summary": "Query Endpoint Handler V2", - "description": "Handle request to the /query endpoint using Responses API.\n\nThis is a wrapper around query_endpoint_handler_base that provides\nthe Responses API specific retrieve_response and get_topic_summary functions.\n\nReturns:\n QueryResponse: Contains the conversation ID and the LLM-generated response.", - "operationId": "query_endpoint_handler_v2_v2_query_post", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/QueryRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Successful response", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/QueryResponse" - }, - "example": { - "available_quotas": { - "daily": 1000, - "monthly": 50000 - }, - "conversation_id": "123e4567-e89b-12d3-a456-426614174000", - "input_tokens": 150, - "output_tokens": 75, - "rag_chunks": [ - { - "content": "OLM is a component of the Operator Framework toolkit...", - "score": 0.95, - "source": "kubernetes-docs/operators.md" - } - ], - "referenced_documents": [ - { - "doc_title": "Operator Lifecycle Manager (OLM)", - "doc_url": "https://docs.openshift.com/container-platform/4.15/operators/olm/index.html" - } - ], - "response": "Operator Lifecycle Manager (OLM) helps users install...", - "tool_calls": [ - { - "arguments": { - "query": "operator lifecycle manager" - }, - "result": { - "chunks_found": 5 - }, - "tool_name": "knowledge_search" - } - ], - "truncated": false - } - } - } - }, - "401": { - "description": "Unauthorized", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/UnauthorizedResponse" - }, - "examples": { - "missing header": { - "value": { - "detail": { - "cause": "No Authorization header found", - "response": "Missing or invalid credentials provided by client" - } - } - }, - "missing token": { - "value": { - "detail": { - "cause": "No token found in Authorization header", - "response": "Missing or invalid credentials provided by client" - } - } - } - } - } - } - }, - "403": { - "description": "Permission denied", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ForbiddenResponse" - }, - "examples": { - "conversation read": { - "value": { - "detail": { - "cause": "User 6789 does not have permission to read conversation with ID 123e4567-e89b-12d3-a456-426614174000", - "response": "User does not have permission to perform this action" - } - } - }, - "endpoint": { - "value": { - "detail": { - "cause": "User 6789 is not authorized to access this endpoint.", - "response": "User does not have permission to access this endpoint" - } - } - }, - "model override": { - "value": { - 
"detail": { - "cause": "User lacks model_override permission required to override model/provider.", - "response": "This instance does not permit overriding model/provider in the query request (missing permission: MODEL_OVERRIDE). Please remove the model and provider fields from your request." - } - } - } - } - } - } - }, - "404": { - "description": "Resource not found", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/NotFoundResponse" - }, - "examples": { - "conversation": { - "value": { - "detail": { - "cause": "Conversation with ID 123e4567-e89b-12d3-a456-426614174000 does not exist", - "response": "Conversation not found" - } - } - }, - "provider": { - "value": { - "detail": { - "cause": "Provider with ID openai does not exist", - "response": "Provider not found" - } - } - }, - "model": { - "value": { - "detail": { - "cause": "Model with ID gpt-4-turbo is not configured", - "response": "Model not found" - } - } - } - } - } - } - }, - "422": { - "description": "Request validation failed", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/UnprocessableEntityResponse" - }, - "examples": { - "invalid format": { - "value": { - "detail": { - "cause": "Invalid request format. The request body could not be parsed.", - "response": "Invalid request format" - } - } - }, - "missing attributes": { - "value": { - "detail": { - "cause": "Missing required attributes: ['query', 'model', 'provider']", - "response": "Missing required attributes" - } - } - }, - "invalid value": { - "value": { - "detail": { - "cause": "Invalid attatchment type: must be one of ['text/plain', 'application/json', 'application/yaml', 'application/xml']", - "response": "Invalid attribute value" - } - } - } - } - } - } - }, - "429": { - "description": "Quota limit exceeded", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/QuotaExceededResponse" - }, - "examples": { - "model": { - "value": { - "detail": { - "cause": "The token quota for model gpt-4-turbo has been exceeded.", - "response": "The model quota has been exceeded" - } - } - }, - "user none": { - "value": { - "detail": { - "cause": "User 123 has no available tokens.", - "response": "The quota has been exceeded" - } - } - }, - "cluster none": { - "value": { - "detail": { - "cause": "Cluster has no available tokens.", - "response": "The quota has been exceeded" - } - } - }, - "subject none": { - "value": { - "detail": { - "cause": "Unknown subject 999 has no available tokens.", - "response": "The quota has been exceeded" - } - } - }, - "user insufficient": { - "value": { - "detail": { - "cause": "User 123 has 5 tokens, but 10 tokens are needed.", - "response": "The quota has been exceeded" - } - } - }, - "cluster insufficient": { - "value": { - "detail": { - "cause": "Cluster has 500 tokens, but 900 tokens are needed.", - "response": "The quota has been exceeded" - } - } - }, - "subject insufficient": { - "value": { - "detail": { - "cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed.", - "response": "The quota has been exceeded" - } - } - } - } - } - } - }, - "500": { - "description": "Internal server error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/InternalServerErrorResponse" - }, - "examples": { - "configuration": { - "value": { - "detail": { - "cause": "Lightspeed Stack configuration has not been initialized.", - "response": "Configuration is not loaded" - } - } - } - } - } - } - }, - "503": { - 
"description": "Service unavailable", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ServiceUnavailableResponse" - }, - "examples": { - "llama stack": { - "value": { - "detail": { - "cause": "Connection error while trying to reach backend service.", - "response": "Unable to connect to Llama Stack" - } - } - } - } - } - } - } - } - } - }, - "/v2/streaming_query": { - "post": { - "tags": [ - "streaming_query_v2" - ], - "summary": "Streaming Query Endpoint Handler V2", - "description": "Handle request to the /streaming_query endpoint using Responses API.\n\nThis is a wrapper around streaming_query_endpoint_handler_base that provides\nthe Responses API specific retrieve_response and response generator functions.\n\nReturns:\n StreamingResponse: An HTTP streaming response yielding\n SSE-formatted events for the query lifecycle.\n\nRaises:\n HTTPException: Returns HTTP 500 if unable to connect to the\n Llama Stack server.", - "operationId": "streaming_query_endpoint_handler_v2_v2_streaming_query_post", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/QueryRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Streaming response with Server-Sent Events", - "content": { - "application/json": { - "schema": { - "type": "string", - "example": "data: {\"event\": \"start\", \"data\": {\"conversation_id\": \"123e4567-e89b-12d3-a456-426614174000\"}}\n\ndata: {\"event\": \"token\", \"data\": {\"id\": 0, \"token\": \"Hello\"}}\n\ndata: {\"event\": \"end\", \"data\": {\"referenced_documents\": [], \"truncated\": null, \"input_tokens\": 0, \"output_tokens\": 0}, \"available_quotas\": {}}\n\n" - } - }, - "text/plain": { - "schema": { - "type": "string", - "example": "Hello world!\n\n---\n\nReference: https://example.com/doc" - } - } - } - }, - "401": { - "description": "Unauthorized", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/UnauthorizedResponse" - }, - "examples": { - "missing header": { - "value": { - "detail": { - "cause": "No Authorization header found", - "response": "Missing or invalid credentials provided by client" - } - } - }, - "missing token": { - "value": { - "detail": { - "cause": "No token found in Authorization header", - "response": "Missing or invalid credentials provided by client" - } - } - } - } - } - } - }, - "403": { - "description": "Permission denied", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ForbiddenResponse" - }, - "examples": { - "conversation read": { - "value": { - "detail": { - "cause": "User 6789 does not have permission to read conversation with ID 123e4567-e89b-12d3-a456-426614174000", - "response": "User does not have permission to perform this action" - } - } - }, - "endpoint": { - "value": { - "detail": { - "cause": "User 6789 is not authorized to access this endpoint.", - "response": "User does not have permission to access this endpoint" - } - } - }, - "model override": { - "value": { - "detail": { - "cause": "User lacks model_override permission required to override model/provider.", - "response": "This instance does not permit overriding model/provider in the query request (missing permission: MODEL_OVERRIDE). Please remove the model and provider fields from your request." 
- } - } - } - } - } - } - }, - "404": { - "description": "Resource not found", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/NotFoundResponse" - }, - "examples": { - "conversation": { - "value": { - "detail": { - "cause": "Conversation with ID 123e4567-e89b-12d3-a456-426614174000 does not exist", - "response": "Conversation not found" - } - } - }, - "provider": { - "value": { - "detail": { - "cause": "Provider with ID openai does not exist", - "response": "Provider not found" - } - } - }, - "model": { - "value": { - "detail": { - "cause": "Model with ID gpt-4-turbo is not configured", - "response": "Model not found" - } - } - } - } - } - } - }, - "422": { - "description": "Request validation failed", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/UnprocessableEntityResponse" - }, - "examples": { - "invalid format": { - "value": { - "detail": { - "cause": "Invalid request format. The request body could not be parsed.", - "response": "Invalid request format" - } - } - }, - "missing attributes": { - "value": { - "detail": { - "cause": "Missing required attributes: ['query', 'model', 'provider']", - "response": "Missing required attributes" - } - } - }, - "invalid value": { + "400": { + "description": "Invalid request format", + "content": { + "application/json": { + "examples": { + "conversation_id": { "value": { "detail": { - "cause": "Invalid attatchment type: must be one of ['text/plain', 'application/json', 'application/yaml', 'application/xml']", - "response": "Invalid attribute value" + "cause": "The conversation ID 123e4567-e89b-12d3-a456-426614174000 has invalid format.", + "response": "Invalid conversation ID format" } } } + }, + "schema": { + "$ref": "#/components/schemas/BadRequestResponse" } } } }, - "429": { - "description": "Quota limit exceeded", + "401": { + "description": "Unauthorized", "content": { "application/json": { - "schema": { - "$ref": "#/components/schemas/QuotaExceededResponse" - }, "examples": { - "model": { - "value": { - "detail": { - "cause": "The token quota for model gpt-4-turbo has been exceeded.", - "response": "The model quota has been exceeded" - } - } - }, - "user none": { - "value": { - "detail": { - "cause": "User 123 has no available tokens.", - "response": "The quota has been exceeded" - } - } - }, - "cluster none": { - "value": { - "detail": { - "cause": "Cluster has no available tokens.", - "response": "The quota has been exceeded" - } - } - }, - "subject none": { + "missing header": { "value": { "detail": { - "cause": "Unknown subject 999 has no available tokens.", - "response": "The quota has been exceeded" + "cause": "No Authorization header found", + "response": "Missing or invalid credentials provided by client" } } }, - "user insufficient": { + "missing token": { "value": { "detail": { - "cause": "User 123 has 5 tokens, but 10 tokens are needed.", - "response": "The quota has been exceeded" + "cause": "No token found in Authorization header", + "response": "Missing or invalid credentials provided by client" } } - }, - "cluster insufficient": { + } + }, + "schema": { + "$ref": "#/components/schemas/UnauthorizedResponse" + } + } + } + }, + "403": { + "description": "Permission denied", + "content": { + "application/json": { + "examples": { + "endpoint": { "value": { "detail": { - "cause": "Cluster has 500 tokens, but 900 tokens are needed.", - "response": "The quota has been exceeded" + "cause": "User 6789 is not authorized to access this endpoint.", + "response": 
"User does not have permission to access this endpoint" } } - }, - "subject insufficient": { + } + }, + "schema": { + "$ref": "#/components/schemas/ForbiddenResponse" + } + } + } + }, + "404": { + "description": "Resource not found", + "content": { + "application/json": { + "examples": { + "conversation": { "value": { "detail": { - "cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed.", - "response": "The quota has been exceeded" + "cause": "Conversation with ID 123e4567-e89b-12d3-a456-426614174000 does not exist", + "response": "Conversation not found" } } } + }, + "schema": { + "$ref": "#/components/schemas/NotFoundResponse" } } } @@ -3957,9 +3514,6 @@ "description": "Internal server error", "content": { "application/json": { - "schema": { - "$ref": "#/components/schemas/InternalServerErrorResponse" - }, "examples": { "configuration": { "value": { @@ -3968,27 +3522,28 @@ "response": "Configuration is not loaded" } } + }, + "conversation cache": { + "value": { + "detail": { + "cause": "Conversation cache is not configured or unavailable.", + "response": "Conversation cache not configured" + } + } } + }, + "schema": { + "$ref": "#/components/schemas/InternalServerErrorResponse" } } } }, - "503": { - "description": "Service unavailable", + "422": { + "description": "Validation Error", "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/ServiceUnavailableResponse" - }, - "examples": { - "llama stack": { - "value": { - "detail": { - "cause": "Connection error while trying to reach backend service.", - "response": "Unable to connect to Llama Stack" - } - } - } + "$ref": "#/components/schemas/HTTPValidationError" } } } @@ -4364,12 +3919,20 @@ "properties": { "api_key": { "type": "string", - "title": "Api Key", - "default": "some-api-key" + "minLength": 1, + "format": "password", + "title": "API key", + "writeOnly": true, + "examples": [ + "some-api-key" + ] } }, "additionalProperties": false, "type": "object", + "required": [ + "api_key" + ], "title": "APIKeyTokenConfiguration", "description": "API Key Token configuration." }, @@ -6143,7 +5706,7 @@ "url" ], "title": "ModelContextProtocolServer", - "description": "Model context protocol server configuration.\n\nMCP (Model Context Protocol) servers provide tools and\ncapabilities to the AI agents. These are configured by this structure.\nOnly MCP servers defined in the lightspeed-stack.yaml configuration are\navailable to the agents. Tools configured in the llama-stack run.yaml\nare not accessible to lightspeed-core agents.\n\nUseful resources:\n\n- [Model Context Protocol](https://modelcontextprotocol.io/docs/getting-started/intro)\n- [MCP FAQs](https://modelcontextprotocol.io/faqs)\n- [Wikipedia article](https://en.wikipedia.org/wiki/Model_Context_Protocol)" + "description": "Model context protocol server configuration.\n\nMCP (Model Context Protocol) servers provide tools and capabilities to the\nAI agents. These are configured by this structure. Only MCP servers\ndefined in the lightspeed-stack.yaml configuration are available to the\nagents. 
Tools configured in the llama-stack run.yaml are not accessible to\nlightspeed-core agents.\n\nUseful resources:\n\n- [Model Context Protocol](https://modelcontextprotocol.io/docs/getting-started/intro)\n- [MCP FAQs](https://modelcontextprotocol.io/faqs)\n- [Wikipedia article](https://en.wikipedia.org/wiki/Model_Context_Protocol)" }, "ModelsResponse": { "properties": { @@ -6306,7 +5869,7 @@ "password" ], "title": "PostgreSQLDatabaseConfiguration", - "description": "PostgreSQL database configuration.\n\nPostgreSQL database is used by Lightspeed Core Stack service for storing information about\nconversation IDs. It can also be leveraged to store conversation history and information\nabout quota usage.\n\nUseful resources:\n\n- [Psycopg: connection classes](https://www.psycopg.org/psycopg3/docs/api/connections.html)\n- [PostgreSQL connection strings](https://www.connectionstrings.com/postgresql/)\n- [How to Use PostgreSQL in Python](https://www.freecodecamp.org/news/postgresql-in-python/)" + "description": "PostgreSQL database configuration.\n\nPostgreSQL database is used by Lightspeed Core Stack service for storing\ninformation about conversation IDs. It can also be leveraged to store\nconversation history and information about quota usage.\n\nUseful resources:\n\n- [Psycopg: connection classes](https://www.psycopg.org/psycopg3/docs/api/connections.html)\n- [PostgreSQL connection strings](https://www.connectionstrings.com/postgresql/)\n- [How to Use PostgreSQL in Python](https://www.freecodecamp.org/news/postgresql-in-python/)" }, "ProviderHealthStatus": { "properties": { @@ -6705,30 +6268,6 @@ "Kubernetes is an open-source container orchestration system for automating ..." ] }, - "rag_chunks": { - "items": { - "$ref": "#/components/schemas/RAGChunk" - }, - "type": "array", - "title": "Rag Chunks", - "description": "List of RAG chunks used to generate the response", - "default": [] - }, - "tool_calls": { - "anyOf": [ - { - "items": { - "$ref": "#/components/schemas/ToolCall" - }, - "type": "array" - }, - { - "type": "null" - } - ], - "title": "Tool Calls", - "description": "List of tool calls made during response generation" - }, "referenced_documents": { "items": { "$ref": "#/components/schemas/ReferencedDocument" @@ -6790,6 +6329,36 @@ "monthly": 50000 } ] + }, + "tool_calls": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/ToolCallSummary" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Tool Calls", + "description": "List of tool calls made during response generation" + }, + "tool_results": { + "anyOf": [ + { + "items": { + "$ref": "#/components/schemas/ToolResultSummary" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "title": "Tool Results", + "description": "List of tool results" } }, "type": "object", @@ -6801,35 +6370,34 @@ "examples": [ { "available_quotas": { - "daily": 1000, - "monthly": 50000 + "ClusterQuotaLimiter": 998911, + "UserQuotaLimiter": 998911 }, "conversation_id": "123e4567-e89b-12d3-a456-426614174000", - "input_tokens": 150, - "output_tokens": 75, - "rag_chunks": [ - { - "content": "OLM is a component of the Operator Framework toolkit...", - "score": 0.95, - "source": "kubernetes-docs/operators.md" - } - ], + "input_tokens": 123, + "output_tokens": 456, "referenced_documents": [ { - "doc_title": "Operator Lifecycle Manager (OLM)", - "doc_url": "https://docs.openshift.com/container-platform/4.15/operators/olm/index.html" + "doc_title": "Operator Lifecycle Manager concepts and resources", + "doc_url": 
"https://docs.openshift.com/container-platform/4.15/operators/understanding/olm/olm-understanding-olm.html" } ], "response": "Operator Lifecycle Manager (OLM) helps users install...", "tool_calls": [ { - "arguments": { - "query": "operator lifecycle manager" - }, - "result": { - "chunks_found": 5 - }, - "tool_name": "knowledge_search" + "args": {}, + "id": "1", + "name": "tool1", + "type": "tool_call" + } + ], + "tool_results": [ + { + "content": "bla", + "id": "1", + "round": 1, + "status": "success", + "type": "tool_result" } ], "truncated": false @@ -7017,45 +6585,6 @@ "title": "QuotaSchedulerConfiguration", "description": "Quota scheduler configuration." }, - "RAGChunk": { - "properties": { - "content": { - "type": "string", - "title": "Content", - "description": "The content of the chunk" - }, - "source": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "title": "Source", - "description": "Source document or URL" - }, - "score": { - "anyOf": [ - { - "type": "number" - }, - { - "type": "null" - } - ], - "title": "Score", - "description": "Relevance score" - } - }, - "type": "object", - "required": [ - "content" - ], - "title": "RAGChunk", - "description": "Model representing a RAG chunk used in the response." - }, "RAGInfoResponse": { "properties": { "id": { @@ -7368,7 +6897,7 @@ "additionalProperties": false, "type": "object", "title": "ServiceConfiguration", - "description": "Service configuration.\n\nLightspeed Core Stack is a REST API service that accepts requests\non a specified hostname and port. It is also possible to enable\nauthentication and specify the number of Uvicorn workers. When more\nworkers are specified, the service can handle requests concurrently." + "description": "Service configuration.\n\nLightspeed Core Stack is a REST API service that accepts requests on a\nspecified hostname and port. It is also possible to enable authentication\nand specify the number of Uvicorn workers. When more workers are specified,\nthe service can handle requests concurrently." }, "ServiceUnavailableResponse": { "properties": { @@ -7514,40 +7043,76 @@ "title": "TLSConfiguration", "description": "TLS configuration.\n\nTransport Layer Security (TLS) is a cryptographic protocol designed to\nprovide communications security over a computer network, such as the\nInternet. 
The protocol is widely used in applications such as email,\ninstant messaging, and voice over IP, but its use in securing HTTPS remains\nthe most publicly visible.\n\nUseful resources:\n\n - [FastAPI HTTPS Deployment](https://fastapi.tiangolo.com/deployment/https/)\n - [Transport Layer Security Overview](https://en.wikipedia.org/wiki/Transport_Layer_Security)\n - [What is TLS](https://www.ssltrust.eu/learning/ssl/transport-layer-security-tls)" }, - "ToolCall": { + "ToolCallSummary": { "properties": { - "tool_name": { + "id": { "type": "string", - "title": "Tool Name", + "title": "Id", + "description": "ID of the tool call" + }, + "name": { + "type": "string", + "title": "Name", "description": "Name of the tool called" }, - "arguments": { + "args": { "additionalProperties": true, "type": "object", - "title": "Arguments", + "title": "Args", "description": "Arguments passed to the tool" }, - "result": { - "anyOf": [ - { - "additionalProperties": true, - "type": "object" - }, - { - "type": "null" - } - ], - "title": "Result", - "description": "Result from the tool" + "type": { + "type": "string", + "title": "Type", + "description": "Type indicator for tool call", + "default": "tool_call" + } + }, + "type": "object", + "required": [ + "id", + "name" + ], + "title": "ToolCallSummary", + "description": "Model representing a tool call made during response generation (for tool_calls list)." + }, + "ToolResultSummary": { + "properties": { + "id": { + "type": "string", + "title": "Id", + "description": "ID of the tool call/result, matches the corresponding tool call 'id'" + }, + "status": { + "type": "string", + "title": "Status", + "description": "Status of the tool execution (e.g., 'success')" + }, + "content": { + "title": "Content", + "description": "Content/result returned from the tool" + }, + "type": { + "type": "string", + "title": "Type", + "description": "Type indicator for tool result", + "default": "tool_result" + }, + "round": { + "type": "integer", + "title": "Round", + "description": "Round number or step of tool execution" } }, "type": "object", "required": [ - "tool_name", - "arguments" + "id", + "status", + "content", + "round" ], - "title": "ToolCall", - "description": "Model representing a tool call made during response generation." + "title": "ToolResultSummary", + "description": "Model representing a result from a tool call (for tool_results list)." }, "ToolsResponse": { "properties": { diff --git a/pyproject.toml b/pyproject.toml index a46495395..9d3b0b3d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,8 +28,8 @@ dependencies = [ # Used by authentication/k8s integration "kubernetes>=30.1.0", # Used to call Llama Stack APIs - "llama-stack==0.2.22", - "llama-stack-client==0.2.22", + "llama-stack==0.3.0", + "llama-stack-client==0.3.0", # Used by Logger "rich>=14.0.0", # Used by JWK token auth handler @@ -61,6 +61,12 @@ exclude = [ # service/ols/src/auth/k8s.py and currently has 58 Pyright issues. It # might need to be rewritten down the line. 
"src/authentication/k8s.py", + # Agent API v1 endpoints - deprecated API but still supported + # Type errors due to llama-stack-client not exposing Agent API types + "src/app/endpoints/conversations.py", + "src/app/endpoints/query.py", + "src/app/endpoints/streaming_query.py", + "src/utils/endpoints.py", ] extraPaths = ["./src"] diff --git a/run.yaml b/run.yaml index 945449bee..3cea08f62 100644 --- a/run.yaml +++ b/run.yaml @@ -1,157 +1,153 @@ -version: '2' -image_name: minimal-viable-llama-stack-configuration +version: 2 apis: - - agents - - datasetio - - eval - - files - - inference - - post_training - - safety - - scoring - - telemetry - - tool_runtime - - vector_io +- agents +- batches +- datasetio +- eval +- files +- inference +- safety +- scoring +- tool_runtime +- vector_io + benchmarks: [] -container_image: null +conversations_store: + db_path: /tmp/conversations.db + type: sqlite datasets: [] -external_providers_dir: /opt/app-root/src/.llama/providers.d +image_name: starter +# external_providers_dir: /opt/app-root/src/.llama/providers.d inference_store: - db_path: .llama/distributions/ollama/inference_store.db + db_path: /tmp/inference_store.db type: sqlite -logging: null metadata_store: - db_path: .llama/distributions/ollama/registry.db - namespace: null + db_path: /tmp/registry.db type: sqlite + +models: +- model_id: sentence-transformers/all-mpnet-base-v2 + model_type: embedding + provider_id: sentence-transformers + provider_model_id: sentence-transformers/all-mpnet-base-v2 + metadata: + embedding_dimension: 768 +# - model_id: gpt-4o-mini +# provider_id: openai +# model_type: llm +# provider_model_id: gpt-4o-mini + providers: - files: - - provider_id: localfs - provider_type: inline::localfs - config: - storage_dir: /tmp/llama-stack-files - metadata_store: - type: sqlite - db_path: .llama/distributions/ollama/files_metadata.db agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: + - config: persistence_store: - db_path: .llama/distributions/ollama/agents_store.db - namespace: null + db_path: /tmp/agents_store.db type: sqlite responses_store: - db_path: .llama/distributions/ollama/responses_store.db + db_path: /tmp/responses_store.db type: sqlite + provider_id: meta-reference + provider_type: inline::meta-reference + batches: + - config: + kvstore: + db_path: /tmp/batches.db + type: sqlite + provider_id: reference + provider_type: inline::reference datasetio: - - provider_id: huggingface + - config: + kvstore: + db_path: /tmp/huggingface_datasetio.db + type: sqlite + provider_id: huggingface provider_type: remote::huggingface - config: + - config: kvstore: - db_path: .llama/distributions/ollama/huggingface_datasetio.db - namespace: null + db_path: /tmp/localfs_datasetio.db type: sqlite - - provider_id: localfs + provider_id: localfs provider_type: inline::localfs - config: + eval: + - config: kvstore: - db_path: .llama/distributions/ollama/localfs_datasetio.db - namespace: null + db_path: /tmp/meta_reference_eval.db type: sqlite - eval: - - provider_id: meta-reference + provider_id: meta-reference provider_type: inline::meta-reference - config: - kvstore: - db_path: .llama/distributions/ollama/meta_reference_eval.db - namespace: null + files: + - config: + metadata_store: + db_path: /tmp/files_metadata.db type: sqlite + storage_dir: /tmp/files + provider_id: meta-reference-files + provider_type: inline::localfs inference: - - provider_id: sentence-transformers # Can be any embedding provider - provider_type: inline::sentence-transformers - 
config: {} - - provider_id: openai - provider_type: remote::openai - config: - api_key: ${env.OPENAI_API_KEY} - post_training: - - provider_id: huggingface - provider_type: inline::huggingface-gpu + - provider_id: openai + provider_type: remote::openai config: - checkpoint_format: huggingface - device: cpu - distributed_backend: null - dpo_output_dir: "." + api_key: ${env.OPENAI_API_KEY} + - config: {} + provider_id: sentence-transformers + provider_type: inline::sentence-transformers safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: + - config: excluded_categories: [] + provider_id: llama-guard + provider_type: inline::llama-guard scoring: - - provider_id: basic + - config: {} + provider_id: basic provider_type: inline::basic - config: {} - - provider_id: llm-as-judge + - config: {} + provider_id: llm-as-judge provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: '********' - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: 'lightspeed-stack-telemetry' - sinks: sqlite - sqlite_db_path: .llama/distributions/ollama/trace_store.db + # telemetry: + # - config: + # service_name: "\u200B" + # provider_id: meta-reference + # provider_type: inline::meta-reference tool_runtime: - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} + - config: {} + provider_id: rag-runtime + provider_type: inline::rag-runtime vector_io: - - provider_id: faiss - provider_type: inline::faiss # Or preferred vector DB - config: + - config: kvstore: - db_path: .llama/distributions/ollama/faiss_store.db # Location of vector database - namespace: null + db_path: /tmp/faiss_store.db type: sqlite + provider_id: faiss + provider_type: inline::faiss scoring_fns: [] server: - auth: null - host: null port: 8321 - quota: null - tls_cafile: null - tls_certfile: null - tls_keyfile: null -shields: - - shield_id: llama-guard-shield - provider_id: llama-guard - provider_shield_id: "gpt-3.5-turbo" # Model to use for safety checks -vector_dbs: - - vector_db_id: my_knowledge_base - embedding_model: sentence-transformers/all-mpnet-base-v2 - embedding_dimension: 768 - provider_id: faiss -models: - - model_id: sentence-transformers/all-mpnet-base-v2 # Example embedding model - model_type: embedding - provider_id: sentence-transformers - provider_model_id: sentence-transformers/all-mpnet-base-v2 # Location of embedding model - metadata: - embedding_dimension: 768 # Depends on chosen model - - model_id: gpt-4-turbo - model_type: llm - provider_id: openai - provider_model_id: gpt-4-turbo - +shields: [] tool_groups: - - toolgroup_id: builtin::rag - provider_id: rag-runtime +- provider_id: rag-runtime + toolgroup_id: builtin::rag +vector_dbs: [] +storage: + backends: + kv_default: + type: kv_sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/kv_store.db + sql_default: + type: sql_sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sql_store.db + stores: + metadata: + namespace: registry + backend: kv_default + inference: + table_name: inference_store + backend: sql_default + max_write_queue_size: 10000 + num_writers: 4 + conversations: + table_name: openai_conversations + backend: sql_default + prompts: + namespace: prompts + backend: kv_default \ No newline at end of file diff --git 
a/run_library.yaml b/run_library.yaml new file mode 100644 index 000000000..5e46ee6e9 --- /dev/null +++ b/run_library.yaml @@ -0,0 +1,155 @@ +version: 2 + +apis: +- agents +- batches +- datasetio +- eval +- files +- inference +- safety +- scoring +- tool_runtime +- vector_io + +benchmarks: [] +conversations_store: + db_path: /tmp/conversations.db + type: sqlite +datasets: [] +image_name: starter +# external_providers_dir: /opt/app-root/src/.llama/providers.d +inference_store: + db_path: /tmp/inference_store.db + type: sqlite +metadata_store: + db_path: /tmp/registry.db + type: sqlite + +models: +- model_id: sentence-transformers/all-mpnet-base-v2 + model_type: embedding + provider_id: sentence-transformers + provider_model_id: sentence-transformers/all-mpnet-base-v2 + metadata: + embedding_dimension: 768 +# - model_id: gpt-4o-mini +# provider_id: openai +# model_type: llm +# provider_model_id: gpt-4o-mini + +providers: + agents: + - config: + persistence: + agent_state: + namespace: agents_state + backend: kv_default + responses: + table_name: agents_responses + backend: sql_default + provider_id: meta-reference + provider_type: inline::meta-reference + batches: + - config: + kvstore: + namespace: batches_store + backend: kv_default + provider_id: reference + provider_type: inline::reference + datasetio: + - config: + kvstore: + namespace: huggingface_datasetio + backend: kv_default + provider_id: huggingface + provider_type: remote::huggingface + - config: + kvstore: + namespace: localfs_datasetio + backend: kv_default + provider_id: localfs + provider_type: inline::localfs + eval: + - config: + kvstore: + namespace: eval_store + backend: kv_default + provider_id: meta-reference + provider_type: inline::meta-reference + files: + - config: + metadata_store: + table_name: files_metadata + backend: sql_default + storage_dir: /tmp/files + provider_id: meta-reference-files + provider_type: inline::localfs + inference: + - provider_id: openai + provider_type: remote::openai + config: + api_key: ${env.OPENAI_API_KEY} + - config: {} + provider_id: sentence-transformers + provider_type: inline::sentence-transformers + safety: + - config: + excluded_categories: [] + provider_id: llama-guard + provider_type: inline::llama-guard + scoring: + - config: {} + provider_id: basic + provider_type: inline::basic + - config: {} + provider_id: llm-as-judge + provider_type: inline::llm-as-judge + # telemetry: + # - config: + # service_name: "​" + # provider_id: meta-reference + # provider_type: inline::meta-reference + tool_runtime: + - config: {} + provider_id: rag-runtime + provider_type: inline::rag-runtime + vector_io: + - config: + persistence: + namespace: faiss_store + backend: kv_default + provider_id: faiss + provider_type: inline::faiss +scoring_fns: [] +server: + port: 8321 +shields: [] +tool_groups: +- provider_id: rag-runtime + toolgroup_id: builtin::rag +vector_dbs: [] +storage: + backends: + kv_default: + type: kv_sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/kv_store.db + sql_default: + type: sql_sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sql_store.db + stores: + metadata: + namespace: registry + backend: kv_default + inference: + table_name: inference_store + backend: sql_default + max_write_queue_size: 10000 + num_writers: 4 + conversations: + table_name: openai_conversations + backend: sql_default + prompts: + namespace: prompts + backend: kv_default + diff --git a/src/app/endpoints/conversations_v2.py 
b/src/app/endpoints/conversations_v2.py index adb608221..a9125f74e 100644 --- a/src/app/endpoints/conversations_v2.py +++ b/src/app/endpoints/conversations_v2.py @@ -49,7 +49,6 @@ examples=["missing header", "missing token"] ), 403: ForbiddenResponse.openapi_response(examples=["endpoint"]), - 404: NotFoundResponse.openapi_response(examples=["conversation"]), 500: InternalServerErrorResponse.openapi_response( examples=["conversation cache", "configuration"] ), @@ -162,8 +161,6 @@ async def delete_conversation_endpoint_handler( response = InternalServerErrorResponse.cache_unavailable() raise HTTPException(**response.model_dump()) - check_conversation_existence(user_id, conversation_id) - logger.info("Deleting conversation %s for user %s", conversation_id, user_id) deleted = configuration.conversation_cache.delete( user_id, conversation_id, skip_userid_check diff --git a/src/app/endpoints/conversations_v3.py b/src/app/endpoints/conversations_v3.py new file mode 100644 index 000000000..d30ffc731 --- /dev/null +++ b/src/app/endpoints/conversations_v3.py @@ -0,0 +1,633 @@ +"""Handler for REST API calls to manage conversation history using Conversations API.""" + +import logging +from typing import Any + +from fastapi import APIRouter, Depends, HTTPException, Request, status +from llama_stack_client import ( + APIConnectionError, + APIStatusError, + NOT_GIVEN, +) +from sqlalchemy.exc import SQLAlchemyError + +from app.database import get_session +from authentication import get_auth_dependency +from authorization.middleware import authorize +from client import AsyncLlamaStackClientHolder +from configuration import configuration +from models.config import Action +from models.database.conversations import UserConversation +from models.requests import ConversationUpdateRequest +from models.responses import ( + BadRequestResponse, + ConversationDeleteResponse, + ConversationDetails, + ConversationResponse, + ConversationsListResponse, + ConversationUpdateResponse, + ForbiddenResponse, + InternalServerErrorResponse, + NotFoundResponse, + ServiceUnavailableResponse, + UnauthorizedResponse, +) +from utils.endpoints import ( + can_access_conversation, + check_configuration_loaded, + delete_conversation, + retrieve_conversation, +) +from utils.suid import ( + check_suid, + normalize_conversation_id, + to_llama_stack_conversation_id, +) + +logger = logging.getLogger("app.endpoints.handlers") +router = APIRouter(tags=["conversations_v1"]) + +conversation_get_responses: dict[int | str, dict[str, Any]] = { + 200: ConversationResponse.openapi_response(), + 400: BadRequestResponse.openapi_response(), + 401: UnauthorizedResponse.openapi_response( + examples=["missing header", "missing token"] + ), + 403: ForbiddenResponse.openapi_response(examples=["conversation read", "endpoint"]), + 404: NotFoundResponse.openapi_response(examples=["conversation"]), + 500: InternalServerErrorResponse.openapi_response( + examples=["database", "configuration"] + ), + 503: ServiceUnavailableResponse.openapi_response(), +} + +conversation_delete_responses: dict[int | str, dict[str, Any]] = { + 200: ConversationDeleteResponse.openapi_response(), + 400: BadRequestResponse.openapi_response(), + 401: UnauthorizedResponse.openapi_response( + examples=["missing header", "missing token"] + ), + 403: ForbiddenResponse.openapi_response( + examples=["conversation delete", "endpoint"] + ), + 500: InternalServerErrorResponse.openapi_response( + examples=["database", "configuration"] + ), + 503: ServiceUnavailableResponse.openapi_response(), +} 
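Each of these dictionaries is passed to the route decorators further down via FastAPI's `responses=` parameter, so every status code a handler can raise gets a documented schema in the generated OpenAPI spec. A minimal, self-contained sketch of that pattern (outside this diff; the `ErrorDetail` model and route are hypothetical stand-ins for the service's own response classes):

```python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()


class ErrorDetail(BaseModel):
    """Hypothetical error payload; the real handlers use richer response models."""

    response: str
    cause: str


@app.get(
    "/conversations/{conversation_id}",
    # Advertise the error schemas so they show up in the OpenAPI document.
    responses={
        404: {"model": ErrorDetail, "description": "Conversation not found"},
        503: {"model": ErrorDetail, "description": "Backend unavailable"},
    },
)
async def get_conversation(conversation_id: str) -> dict:
    if conversation_id != "known-id":
        # Build the structured payload first, then raise it as the HTTP detail.
        raise HTTPException(
            status_code=404,
            detail=ErrorDetail(
                response="Conversation not found", cause=conversation_id
            ).model_dump(),
        )
    return {"conversation_id": conversation_id, "chat_history": []}
```

The `responses=` mapping is documentation only; the handler still has to raise the matching error itself, which is what the `raise HTTPException(**response.model_dump())` calls in this module do.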
+ +conversations_list_responses: dict[int | str, dict[str, Any]] = { + 200: ConversationsListResponse.openapi_response(), + 401: UnauthorizedResponse.openapi_response( + examples=["missing header", "missing token"] + ), + 403: ForbiddenResponse.openapi_response(examples=["endpoint"]), + 500: InternalServerErrorResponse.openapi_response( + examples=["database", "configuration"] + ), + 503: ServiceUnavailableResponse.openapi_response(), +} + +conversation_update_responses: dict[int | str, dict[str, Any]] = { + 200: ConversationUpdateResponse.openapi_response(), + 400: BadRequestResponse.openapi_response(), + 401: UnauthorizedResponse.openapi_response( + examples=["missing header", "missing token"] + ), + 403: ForbiddenResponse.openapi_response(examples=["endpoint"]), + 404: NotFoundResponse.openapi_response(examples=["conversation"]), + 500: InternalServerErrorResponse.openapi_response( + examples=["database", "configuration"] + ), + 503: ServiceUnavailableResponse.openapi_response(), +} + + +def simplify_conversation_items(items: list[dict]) -> list[dict[str, Any]]: + """Simplify conversation items to include only essential information. + + Args: + items: The full conversation items list from llama-stack Conversations API + (in reverse chronological order, newest first) + + Returns: + Simplified items with only essential message and tool call information + (in chronological order, oldest first, grouped by turns) + """ + # Filter only message type items + message_items = [item for item in items if item.get("type") == "message"] + + # Process from bottom up (reverse to get chronological order) + # Assume items are grouped correctly: user input followed by assistant output + reversed_messages = list(reversed(message_items)) + + chat_history = [] + i = 0 + while i < len(reversed_messages): + # Extract text content from user message + user_item = reversed_messages[i] + user_content = user_item.get("content", []) + user_text = "" + for content_part in user_content: + if isinstance(content_part, dict): + content_type = content_part.get("type") + if content_type == "input_text": + user_text += content_part.get("text", "") + elif isinstance(content_part, str): + user_text += content_part + + # Extract text content from assistant message (next item) + assistant_text = "" + if i + 1 < len(reversed_messages): + assistant_item = reversed_messages[i + 1] + assistant_content = assistant_item.get("content", []) + for content_part in assistant_content: + if isinstance(content_part, dict): + content_type = content_part.get("type") + if content_type == "output_text": + assistant_text += content_part.get("text", "") + elif isinstance(content_part, str): + assistant_text += content_part + + # Create turn with user message first, then assistant message + chat_history.append( + { + "messages": [ + {"content": user_text, "type": "user"}, + {"content": assistant_text, "type": "assistant"}, + ] + } + ) + + # Move to next pair (skip both user and assistant) + i += 2 + + return chat_history + + +@router.get( + "/conversations", + responses=conversations_list_responses, + summary="Conversations List Endpoint Handler V1", +) +@authorize(Action.LIST_CONVERSATIONS) +async def get_conversations_list_endpoint_handler( + request: Request, + auth: Any = Depends(get_auth_dependency()), +) -> ConversationsListResponse: + """Handle request to retrieve all conversations for the authenticated user.""" + check_configuration_loaded(configuration) + + user_id = auth[0] + + logger.info("Retrieving conversations for user %s", user_id) 
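As an aside on `simplify_conversation_items` above: per its docstring, the helper assumes the Conversations API hands back items newest-first, so it reverses them and pairs consecutive user/assistant messages into turns. A tiny standalone illustration of that contract (not part of this diff; the sample items are made up):

```python
def _text(msg: dict, kind: str) -> str:
    # Concatenate the text parts of the given kind ("input_text" or "output_text").
    return "".join(
        part.get("text", "")
        for part in msg.get("content", [])
        if isinstance(part, dict) and part.get("type") == kind
    )


# Newest-first, as the docstring above describes the API returning them.
items = [
    {"type": "message", "content": [{"type": "output_text", "text": "OLM manages operator installs."}]},
    {"type": "message", "content": [{"type": "input_text", "text": "What does OLM do?"}]},
]

chronological = list(reversed([i for i in items if i.get("type") == "message"]))
turns = [
    {
        "messages": [
            {"content": _text(user, "input_text"), "type": "user"},
            {"content": _text(assistant, "output_text"), "type": "assistant"},
        ]
    }
    for user, assistant in zip(chronological[0::2], chronological[1::2])
]
print(turns)
# [{'messages': [{'content': 'What does OLM do?', 'type': 'user'},
#                {'content': 'OLM manages operator installs.', 'type': 'assistant'}]}]
```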
+ + with get_session() as session: + try: + query = session.query(UserConversation) + + filtered_query = ( + query + if Action.LIST_OTHERS_CONVERSATIONS in request.state.authorized_actions + else query.filter_by(user_id=user_id) + ) + + user_conversations = filtered_query.all() + + # Return conversation summaries with metadata + conversations = [ + ConversationDetails( + conversation_id=conv.id, + created_at=conv.created_at.isoformat() if conv.created_at else None, + last_message_at=( + conv.last_message_at.isoformat() + if conv.last_message_at + else None + ), + message_count=conv.message_count, + last_used_model=conv.last_used_model, + last_used_provider=conv.last_used_provider, + topic_summary=conv.topic_summary, + ) + for conv in user_conversations + ] + + logger.info( + "Found %d conversations for user %s", len(conversations), user_id + ) + + return ConversationsListResponse(conversations=conversations) + + except SQLAlchemyError as e: + logger.exception( + "Error retrieving conversations for user %s: %s", user_id, e + ) + response = InternalServerErrorResponse.database_error() + raise HTTPException(**response.model_dump()) from e + + +@router.get( + "/conversations/{conversation_id}", + responses=conversation_get_responses, + summary="Conversation Get Endpoint Handler V1", +) +@authorize(Action.GET_CONVERSATION) +async def get_conversation_endpoint_handler( + request: Request, + conversation_id: str, + auth: Any = Depends(get_auth_dependency()), +) -> ConversationResponse: + """Handle request to retrieve a conversation by ID using Conversations API. + + Retrieve a conversation's chat history by its ID using the LlamaStack + Conversations API. This endpoint fetches the conversation items from + the backend, simplifies them to essential chat history, and returns + them in a structured response. Raises HTTP 400 for invalid IDs, 404 + if not found, 503 if the backend is unavailable, and 500 for + unexpected errors. 
+ + Args: + request: The FastAPI request object + conversation_id: Unique identifier of the conversation to retrieve + auth: Authentication tuple from dependency + + Returns: + ConversationResponse: Structured response containing the conversation + ID and simplified chat history + """ + check_configuration_loaded(configuration) + + # Validate conversation ID format + if not check_suid(conversation_id): + logger.error("Invalid conversation ID format: %s", conversation_id) + response = BadRequestResponse( + resource="conversation", resource_id=conversation_id + ).model_dump() + raise HTTPException(**response) + + # Normalize the conversation ID for database operations (strip conv_ prefix if present) + normalized_conv_id = normalize_conversation_id(conversation_id) + logger.debug( + "GET conversation - original ID: %s, normalized ID: %s", + conversation_id, + normalized_conv_id, + ) + + user_id = auth[0] + if not can_access_conversation( + normalized_conv_id, + user_id, + others_allowed=( + Action.READ_OTHERS_CONVERSATIONS in request.state.authorized_actions + ), + ): + logger.warning( + "User %s attempted to read conversation %s they don't have access to", + user_id, + normalized_conv_id, + ) + response = ForbiddenResponse.conversation( + action="read", + resource_id=normalized_conv_id, + user_id=user_id, + ).model_dump() + raise HTTPException(**response) + + # If reached this, user is authorized to retrieve this conversation + try: + conversation = retrieve_conversation(normalized_conv_id) + if conversation is None: + logger.error( + "Conversation %s not found in database.", + normalized_conv_id, + ) + response = NotFoundResponse( + resource="conversation", resource_id=normalized_conv_id + ).model_dump() + raise HTTPException(**response) + + except SQLAlchemyError as e: + logger.error( + "Database error occurred while retrieving conversation %s: %s", + normalized_conv_id, + str(e), + ) + response = InternalServerErrorResponse.database_error() + raise HTTPException(**response.model_dump()) from e + + logger.info( + "Retrieving conversation %s using Conversations API", normalized_conv_id + ) + + try: + client = AsyncLlamaStackClientHolder().get_client() + + # Convert to llama-stack format (add 'conv_' prefix if needed) + llama_stack_conv_id = to_llama_stack_conversation_id(normalized_conv_id) + logger.debug( + "Calling llama-stack list_items with conversation_id: %s", + llama_stack_conv_id, + ) + + # Use Conversations API to retrieve conversation items + conversation_items_response = await client.conversations.items.list( + conversation_id=llama_stack_conv_id, + after=NOT_GIVEN, + include=NOT_GIVEN, + limit=NOT_GIVEN, + order=NOT_GIVEN, + ) + items = ( + conversation_items_response.data + if hasattr(conversation_items_response, "data") + else [] + ) + # Convert items to dict format for processing + items_dicts = [ + item.model_dump() if hasattr(item, "model_dump") else dict(item) + for item in items + ] + + logger.info( + "Successfully retrieved %d items for conversation %s", + len(items_dicts), + conversation_id, + ) + # Simplify the conversation items to include only essential information + chat_history = simplify_conversation_items(items_dicts) + + # Conversations api has no support for message level timestamps + return ConversationResponse( + conversation_id=normalized_conv_id, + chat_history=chat_history, + ) + + except APIConnectionError as e: + logger.error("Unable to connect to Llama Stack: %s", e) + response = ServiceUnavailableResponse( + backend_name="Llama Stack", cause=str(e) + 
).model_dump() + raise HTTPException(**response) from e + + except APIStatusError as e: + logger.error("Conversation not found: %s", e) + response = NotFoundResponse( + resource="conversation", resource_id=normalized_conv_id + ).model_dump() + raise HTTPException(**response) from e + + +@router.delete( + "/conversations/{conversation_id}", + responses=conversation_delete_responses, + summary="Conversation Delete Endpoint Handler V1", +) +@authorize(Action.DELETE_CONVERSATION) +async def delete_conversation_endpoint_handler( + request: Request, + conversation_id: str, + auth: Any = Depends(get_auth_dependency()), +) -> ConversationDeleteResponse: + """Handle request to delete a conversation by ID using Conversations API. + + Validates the conversation ID format and attempts to delete the + conversation from the Llama Stack backend using the Conversations API. + Raises HTTP errors for invalid IDs, not found conversations, connection + issues, or unexpected failures. + + Args: + request: The FastAPI request object + conversation_id: Unique identifier of the conversation to delete + auth: Authentication tuple from dependency + + Returns: + ConversationDeleteResponse: Response indicating the result of the deletion operation + """ + check_configuration_loaded(configuration) + + # Validate conversation ID format + if not check_suid(conversation_id): + logger.error("Invalid conversation ID format: %s", conversation_id) + response = BadRequestResponse( + resource="conversation", resource_id=conversation_id + ).model_dump() + raise HTTPException(**response) + + # Normalize the conversation ID for database operations (strip conv_ prefix if present) + normalized_conv_id = normalize_conversation_id(conversation_id) + + # Check if user has access to delete this conversation + user_id = auth[0] + if not can_access_conversation( + normalized_conv_id, + user_id, + others_allowed=( + Action.DELETE_OTHERS_CONVERSATIONS in request.state.authorized_actions + ), + ): + logger.warning( + "User %s attempted to delete conversation %s they don't have access to", + user_id, + normalized_conv_id, + ) + response = ForbiddenResponse.conversation( + action="delete", + resource_id=normalized_conv_id, + user_id=user_id, + ).model_dump() + raise HTTPException(**response) + + # If reached this, user is authorized to delete this conversation + try: + local_deleted = delete_conversation(normalized_conv_id) + if not local_deleted: + logger.info( + "Conversation %s not found locally when deleting.", + normalized_conv_id, + ) + except SQLAlchemyError as e: + logger.error( + "Database error while deleting conversation %s", + normalized_conv_id, + ) + response = InternalServerErrorResponse.database_error() + raise HTTPException(**response.model_dump()) from e + + logger.info("Deleting conversation %s using Conversations API", normalized_conv_id) + + try: + # Get Llama Stack client + client = AsyncLlamaStackClientHolder().get_client() + + # Convert to llama-stack format (add 'conv_' prefix if needed) + llama_stack_conv_id = to_llama_stack_conversation_id(normalized_conv_id) + + # Use Conversations API to delete the conversation + delete_response = await client.conversations.delete( + conversation_id=llama_stack_conv_id + ) + logger.info( + "Remote deletion of %s successful (remote_deleted=%s)", + normalized_conv_id, + delete_response.deleted, + ) + + except APIConnectionError as e: + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail=ServiceUnavailableResponse( + backend_name="Llama Stack", 
cause=str(e) + ).model_dump(), + ) from e + + except APIStatusError: + logger.warning( + "Conversation %s in LlamaStack not found. Treating as already deleted.", + normalized_conv_id, + ) + + return ConversationDeleteResponse( + conversation_id=normalized_conv_id, + deleted=local_deleted, + ) + + +@router.put( + "/conversations/{conversation_id}", + responses=conversation_update_responses, + summary="Conversation Update Endpoint Handler V1", +) +@authorize(Action.UPDATE_CONVERSATION) +async def update_conversation_endpoint_handler( + request: Request, + conversation_id: str, + update_request: ConversationUpdateRequest, + auth: Any = Depends(get_auth_dependency()), +) -> ConversationUpdateResponse: + """Handle request to update a conversation metadata using Conversations API. + + Updates the conversation metadata (including topic summary) in both the + LlamaStack backend using the Conversations API and the local database. + + Args: + request: The FastAPI request object + conversation_id: Unique identifier of the conversation to update + update_request: Request containing the topic summary to update + auth: Authentication tuple from dependency + + Returns: + ConversationUpdateResponse: Response indicating the result of the update operation + """ + check_configuration_loaded(configuration) + + # Validate conversation ID format + if not check_suid(conversation_id): + logger.error("Invalid conversation ID format: %s", conversation_id) + response = BadRequestResponse( + resource="conversation", resource_id=conversation_id + ).model_dump() + raise HTTPException(**response) + + # Normalize the conversation ID for database operations (strip conv_ prefix if present) + normalized_conv_id = normalize_conversation_id(conversation_id) + + user_id = auth[0] + if not can_access_conversation( + normalized_conv_id, + user_id, + others_allowed=( + Action.QUERY_OTHERS_CONVERSATIONS in request.state.authorized_actions + ), + ): + logger.warning( + "User %s attempted to update conversation %s they don't have access to", + user_id, + normalized_conv_id, + ) + response = ForbiddenResponse.conversation( + action="update", resource_id=normalized_conv_id, user_id=user_id + ).model_dump() + raise HTTPException(**response) + + # If reached this, user is authorized to update this conversation + try: + conversation = retrieve_conversation(normalized_conv_id) + if conversation is None: + response = NotFoundResponse( + resource="conversation", resource_id=normalized_conv_id + ).model_dump() + raise HTTPException(**response) + + except SQLAlchemyError as e: + logger.error( + "Database error occurred while retrieving conversation %s.", + normalized_conv_id, + ) + response = InternalServerErrorResponse.database_error() + raise HTTPException(**response.model_dump()) from e + + logger.info( + "Updating metadata for conversation %s using Conversations API", + normalized_conv_id, + ) + + try: + # Get Llama Stack client + client = AsyncLlamaStackClientHolder().get_client() + + # Convert to llama-stack format (add 'conv_' prefix if needed) + llama_stack_conv_id = to_llama_stack_conversation_id(normalized_conv_id) + + # Prepare metadata with topic summary + metadata = {"topic_summary": update_request.topic_summary} + + # Use Conversations API to update the conversation metadata + await client.conversations.update( + conversation_id=llama_stack_conv_id, + metadata=metadata, + ) + + logger.info( + "Successfully updated metadata for conversation %s in LlamaStack", + normalized_conv_id, + ) + + # Also update in local database + with 
get_session() as session: + db_conversation = ( + session.query(UserConversation).filter_by(id=normalized_conv_id).first() + ) + if db_conversation: + db_conversation.topic_summary = update_request.topic_summary + session.commit() + logger.info( + "Successfully updated topic summary in local database for conversation %s", + normalized_conv_id, + ) + + return ConversationUpdateResponse( + conversation_id=normalized_conv_id, + success=True, + message="Topic summary updated successfully", + ) + + except APIConnectionError as e: + response = ServiceUnavailableResponse( + backend_name="Llama Stack", cause=str(e) + ).model_dump() + raise HTTPException(**response) from e + + except APIStatusError as e: + logger.error("Conversation not found: %s", e) + response = NotFoundResponse( + resource="conversation", resource_id=normalized_conv_id + ).model_dump() + raise HTTPException(**response) from e + + except SQLAlchemyError as e: + logger.error( + "Database error occurred while updating conversation %s.", + normalized_conv_id, + ) + response = InternalServerErrorResponse.database_error() + raise HTTPException(**response.model_dump()) from e diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py index 62cdb878c..4430a1501 100644 --- a/src/app/endpoints/query.py +++ b/src/app/endpoints/query.py @@ -11,19 +11,18 @@ from litellm.exceptions import RateLimitError from llama_stack_client import ( APIConnectionError, + APIStatusError, AsyncLlamaStackClient, # type: ignore ) -from llama_stack_client.lib.agents.event_logger import interleaved_content_as_str from llama_stack_client.types import Shield, UserMessage # type: ignore -from llama_stack_client.types.agents.turn import Turn -from llama_stack_client.types.agents.turn_create_params import ( - Document, +from llama_stack_client.types.alpha.agents.turn import Turn +from llama_stack_client.types.alpha.agents.turn_create_params import ( Toolgroup, ToolgroupAgentToolGroupWithArgs, ) from llama_stack_client.types.model_list_response import ModelListResponse from llama_stack_client.types.shared.interleaved_content_item import TextContentItem -from llama_stack_client.types.tool_execution_step import ToolExecutionStep +from llama_stack_client.types.alpha.tool_execution_step import ToolExecutionStep from sqlalchemy.exc import SQLAlchemyError import constants @@ -43,10 +42,10 @@ InternalServerErrorResponse, NotFoundResponse, QueryResponse, + PromptTooLongResponse, QuotaExceededResponse, ReferencedDocument, ServiceUnavailableResponse, - ToolCall, UnauthorizedResponse, UnprocessableEntityResponse, ) @@ -68,7 +67,7 @@ ) from utils.token_counter import TokenCounter, extract_and_update_token_metrics from utils.transcripts import store_transcript -from utils.types import TurnSummary +from utils.types import TurnSummary, content_to_str logger = logging.getLogger("app.endpoints.handlers") router = APIRouter(tags=["query"]) @@ -85,6 +84,7 @@ 404: NotFoundResponse.openapi_response( examples=["model", "conversation", "provider"] ), + 413: PromptTooLongResponse.openapi_response(), 422: UnprocessableEntityResponse.openapi_response(), 429: QuotaExceededResponse.openapi_response(), 500: InternalServerErrorResponse.openapi_response(examples=["configuration"]), @@ -195,14 +195,14 @@ async def get_topic_summary( client, model_id, topic_summary_system_prompt ) response = await agent.create_turn( - messages=[UserMessage(role="user", content=question)], + messages=[UserMessage(role="user", content=question).model_dump()], session_id=session_id, stream=False, - 
toolgroups=None, + # toolgroups=None, ) response = cast(Turn, response) return ( - interleaved_content_as_str(response.output_message.content) + content_to_str(response.output_message.content) if ( getattr(response, "output_message", None) is not None and getattr(response.output_message, "content", None) is not None @@ -380,20 +380,6 @@ async def query_endpoint_handler_base( # pylint: disable=R0914 # Convert tool calls to response format logger.info("Processing tool calls...") - tool_calls = [ - ToolCall( - tool_name=tc.name, - arguments=( - tc.args if isinstance(tc.args, dict) else {"query": str(tc.args)} - ), - result=( - {"response": tc.response} - if tc.response and tc.name != constants.DEFAULT_RAG_TOOL - else None - ), - ) - for tc in summary.tool_calls - ] logger.info("Using referenced documents from response...") @@ -403,8 +389,8 @@ async def query_endpoint_handler_base( # pylint: disable=R0914 response = QueryResponse( conversation_id=conversation_id, response=summary.llm_response, - rag_chunks=summary.rag_chunks if summary.rag_chunks else [], - tool_calls=tool_calls if tool_calls else None, + tool_calls=summary.tool_calls if summary.tool_calls else None, + tool_results=summary.tool_results if summary.tool_results else None, referenced_documents=referenced_documents, truncated=False, # TODO: implement truncation detection input_tokens=token_usage.input_tokens, @@ -425,12 +411,21 @@ async def query_endpoint_handler_base( # pylint: disable=R0914 ) raise HTTPException(**response.model_dump()) from e except SQLAlchemyError as e: - logger.exception("Error persisting conversation details: %s", e) + logger.exception("Error persisting conversation details.") response = InternalServerErrorResponse.database_error() raise HTTPException(**response.model_dump()) from e except RateLimitError as e: used_model = getattr(e, "model", "") - response = QuotaExceededResponse.model(used_model) + if used_model: + response = QuotaExceededResponse.model(used_model) + else: + response = QuotaExceededResponse( + response="The quota has been exceeded", cause=str(e) + ) + raise HTTPException(**response.model_dump()) from e + except APIStatusError as e: + logger.exception("Error in query endpoint handler: %s", e) + response = InternalServerErrorResponse.generic() raise HTTPException(**response.model_dump()) from e @@ -734,7 +729,7 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche } vector_db_ids = [ - vector_db.identifier for vector_db in await client.vector_dbs.list() + vector_store.id for vector_store in (await client.vector_stores.list()).data ] toolgroups = (get_rag_toolgroups(vector_db_ids) or []) + [ mcp_server.name for mcp_server in configuration.mcp_servers @@ -744,27 +739,27 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche toolgroups = None # TODO: LCORE-881 - Remove if Llama Stack starts to support these mime types - documents: list[Document] = [ - ( - {"content": doc["content"], "mime_type": "text/plain"} - if doc["mime_type"].lower() in ("application/json", "application/xml") - else doc - ) - for doc in query_request.get_documents() - ] + # documents: list[Document] = [ + # ( + # {"content": doc["content"], "mime_type": "text/plain"} + # if doc["mime_type"].lower() in ("application/json", "application/xml") + # else doc + # ) + # for doc in query_request.get_documents() + # ] response = await agent.create_turn( - messages=[UserMessage(role="user", content=query_request.query)], + messages=[UserMessage(role="user", 
content=query_request.query).model_dump()], session_id=session_id, - documents=documents, + # documents=documents, stream=False, - toolgroups=toolgroups, + # toolgroups=toolgroups, ) response = cast(Turn, response) summary = TurnSummary( llm_response=( - interleaved_content_as_str(response.output_message.content) + content_to_str(response.output_message.content) if ( getattr(response, "output_message", None) is not None and getattr(response.output_message, "content", None) is not None @@ -772,6 +767,8 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche else "" ), tool_calls=[], + tool_results=[], + rag_chunks=[], ) referenced_documents = parse_referenced_documents(response) diff --git a/src/app/endpoints/query_v2.py b/src/app/endpoints/query_v2.py index f25cce971..5e0a8c87c 100644 --- a/src/app/endpoints/query_v2.py +++ b/src/app/endpoints/query_v2.py @@ -1,5 +1,8 @@ +# pylint: disable=too-many-locals,too-many-branches,too-many-nested-blocks + """Handler for REST API call to provide answer to query using Response API.""" +import json import logging from typing import Annotated, Any, cast @@ -37,14 +40,15 @@ get_system_prompt, get_topic_summary_system_prompt, ) +from utils.suid import normalize_conversation_id, to_llama_stack_conversation_id from utils.mcp_headers import mcp_headers_dependency from utils.responses import extract_text_from_response_output_item from utils.shields import detect_shield_violations, get_available_shields from utils.token_counter import TokenCounter -from utils.types import ToolCallSummary, TurnSummary +from utils.types import ToolCallSummary, ToolResultSummary, TurnSummary logger = logging.getLogger("app.endpoints.handlers") -router = APIRouter(tags=["query_v2"]) +router = APIRouter(tags=["query_v1"]) query_v2_response: dict[int | str, dict[str, Any]] = { 200: QueryResponse.openapi_response(), @@ -57,6 +61,7 @@ 404: NotFoundResponse.openapi_response( examples=["conversation", "model", "provider"] ), + # 413: PromptTooLongResponse.openapi_response(), 422: UnprocessableEntityResponse.openapi_response(), 429: QuotaExceededResponse.openapi_response(), 500: InternalServerErrorResponse.openapi_response(examples=["configuration"]), @@ -66,7 +71,7 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too-many-branches output_item: Any, -) -> ToolCallSummary | None: +) -> tuple[ToolCallSummary | None, ToolResultSummary | None]: """Translate applicable Responses API tool outputs into ``ToolCallSummary`` records. 
The OpenAI ``response.output`` array may contain any ``OpenAIResponseOutput`` variant: @@ -78,23 +83,22 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too- if item_type == "function_call": parsed_arguments = getattr(output_item, "arguments", "") - status = getattr(output_item, "status", None) - if status: - if isinstance(parsed_arguments, dict): - args: Any = {**parsed_arguments, "status": status} - else: - args = {"arguments": parsed_arguments, "status": status} - else: + if isinstance(parsed_arguments, dict): args = parsed_arguments + else: + args = {"arguments": parsed_arguments} call_id = getattr(output_item, "id", None) or getattr( output_item, "call_id", None ) - return ToolCallSummary( - id=str(call_id), - name=getattr(output_item, "name", "function_call"), - args=args, - response=None, + return ( + ToolCallSummary( + id=str(call_id), + name=getattr(output_item, "name", "function_call"), + args=args, + type="function_call", + ), + None, ) if item_type == "file_search_call": @@ -132,16 +136,25 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too- id=str(getattr(output_item, "id")), name=DEFAULT_RAG_TOOL, args=args, - response=response_payload, + type="file_search_call", + ), ToolResultSummary( + id=str(getattr(output_item, "id")), + status=str(getattr(output_item, "status", None)), + content=json.dumps(response_payload) if response_payload else None, + type="file_search_call", + round=1, ) if item_type == "web_search_call": args = {"status": getattr(output_item, "status", None)} - return ToolCallSummary( - id=str(getattr(output_item, "id")), - name="web_search", - args=args, - response=None, + return ( + ToolCallSummary( + id=str(getattr(output_item, "id")), + name="web_search", + args=args, + type="web_search_call", + ), + None, ) if item_type == "mcp_call": @@ -158,7 +171,13 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too- id=str(getattr(output_item, "id")), name=getattr(output_item, "name", "mcp_call"), args=args, - response=getattr(output_item, "output", None), + type="mcp_call", + ), ToolResultSummary( + id=str(getattr(output_item, "id")), + status=str(getattr(output_item, "status", None)), + content=getattr(output_item, "output", ""), + type="mcp_call", + round=1, ) if item_type == "mcp_list_tools": @@ -172,11 +191,14 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too- "server_label": getattr(output_item, "server_label", None), "tools": tool_names, } - return ToolCallSummary( - id=str(getattr(output_item, "id")), - name="mcp_list_tools", - args=args, - response=None, + return ( + ToolCallSummary( + id=str(getattr(output_item, "id")), + name="mcp_list_tools", + args=args, + type="mcp_list_tools", + ), + None, ) if item_type == "mcp_approval_request": @@ -185,14 +207,17 @@ def _build_tool_call_summary( # pylint: disable=too-many-return-statements,too- server_label = getattr(output_item, "server_label", None) if server_label: args["server_label"] = server_label - return ToolCallSummary( - id=str(getattr(output_item, "id")), - name=getattr(output_item, "name", "mcp_approval_request"), - args=args, - response=None, + return ( + ToolCallSummary( + id=str(getattr(output_item, "id")), + name=getattr(output_item, "name", "mcp_approval_request"), + args=args, + type="tool_call", + ), + None, ) - return None + return None, None async def get_topic_summary( # pylint: disable=too-many-nested-blocks @@ -233,7 +258,7 @@ async def get_topic_summary( # pylint: 
disable=too-many-nested-blocks return summary_text.strip() if summary_text else "" -@router.post("/query", responses=query_v2_response) +@router.post("/query", responses=query_v2_response, summary="Query Endpoint Handler V1") @authorize(Action.QUERY) async def query_endpoint_handler_v2( request: Request, @@ -261,7 +286,7 @@ async def query_endpoint_handler_v2( ) -async def retrieve_response( # pylint: disable=too-many-locals,too-many-branches,too-many-arguments +async def retrieve_response( # pylint: disable=too-many-locals,too-many-branches,too-many-arguments,too-many-statements client: AsyncLlamaStackClient, model_id: str, query_request: QueryRequest, @@ -324,6 +349,27 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche f"\n\n[Attachment: {attachment.attachment_type}]\n{attachment.content}" ) + # Handle conversation ID for Responses API + # Create conversation upfront if not provided + conversation_id = query_request.conversation_id + if conversation_id: + # Conversation ID was provided - convert to llama-stack format + logger.debug("Using existing conversation ID: %s", conversation_id) + llama_stack_conv_id = to_llama_stack_conversation_id(conversation_id) + else: + # No conversation_id provided - create a new conversation first + logger.debug("No conversation_id provided, creating new conversation") + + conversation = await client.conversations.create(metadata={}) + llama_stack_conv_id = conversation.id + # Store the normalized version for later use + conversation_id = normalize_conversation_id(llama_stack_conv_id) + logger.info( + "Created new conversation with ID: %s (normalized: %s)", + llama_stack_conv_id, + conversation_id, + ) + # Create OpenAI response using responses API create_kwargs: dict[str, Any] = { "input": input_text, @@ -332,9 +378,8 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche "tools": cast(Any, toolgroups), "stream": False, "store": True, + "conversation": llama_stack_conv_id, } - if query_request.conversation_id: - create_kwargs["previous_response_id"] = query_request.conversation_id # Add shields to extra_body if available if available_shields: @@ -342,28 +387,28 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche response = await client.responses.create(**create_kwargs) response = cast(OpenAIResponseObject, response) - + logger.info("Response: %s", response) logger.debug( - "Received response with ID: %s, output items: %d", + "Received response with ID: %s, conversation ID: %s, output items: %d", response.id, + conversation_id, len(response.output), ) - # Return the response ID - client can use it for chaining if desired - conversation_id = response.id - # Process OpenAI response format llm_response = "" tool_calls: list[ToolCallSummary] = [] - + tool_results: list[ToolResultSummary] = [] for output_item in response.output: message_text = extract_text_from_response_output_item(output_item) if message_text: llm_response += message_text - tool_summary = _build_tool_call_summary(output_item) - if tool_summary: - tool_calls.append(tool_summary) + tool_call, tool_result = _build_tool_call_summary(output_item) + if tool_call: + tool_calls.append(tool_call) + if tool_result: + tool_results.append(tool_result) # Check for shield violations across all output items detect_shield_violations(response.output) @@ -377,6 +422,8 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche summary = TurnSummary( llm_response=llm_response, tool_calls=tool_calls, 
+ tool_results=tool_results, + rag_chunks=[], ) # Extract referenced documents and token usage from Responses API response @@ -391,7 +438,15 @@ async def retrieve_response( # pylint: disable=too-many-locals,too-many-branche "Response lacks content (conversation_id=%s)", conversation_id, ) - return (summary, conversation_id, referenced_documents, token_usage) + + # Normalize conversation ID before returning (remove conv_ prefix for consistency) + normalized_conversation_id = ( + normalize_conversation_id(conversation_id) + if conversation_id + else conversation_id + ) + + return (summary, normalized_conversation_id, referenced_documents, token_usage) def parse_referenced_documents_from_responses_api( @@ -406,13 +461,92 @@ def parse_referenced_documents_from_responses_api( Returns: list[ReferencedDocument]: List of referenced documents with doc_url and doc_title """ - # TODO(ltomasbo): need to parse source documents from Responses API response. - # The Responses API has a different structure than Agent API for referenced documents. - # Need to extract from: - # - OpenAIResponseOutputMessageFileSearchToolCall.results - # - OpenAIResponseAnnotationCitation in message content - # - OpenAIResponseAnnotationFileCitation in message content - return [] + documents: list[ReferencedDocument] = [] + # Use a set to track unique documents by (doc_url, doc_title) tuple + seen_docs: set[tuple[str | None, str | None]] = set() + + if not response.output: + return documents + + for output_item in response.output: + item_type = getattr(output_item, "type", None) + + # 1. Parse from file_search_call results + if item_type == "file_search_call": + results = getattr(output_item, "results", []) or [] + for result in results: + # Handle both object and dict access + if isinstance(result, dict): + filename = result.get("filename") + attributes = result.get("attributes", {}) + else: + filename = getattr(result, "filename", None) + attributes = getattr(result, "attributes", {}) + + # Try to get URL from attributes + # Look for common URL fields in attributes + doc_url = ( + attributes.get("link") + or attributes.get("url") + or attributes.get("doc_url") + ) + + # If we have at least a filename or url + if filename or doc_url: + # Treat empty string as None for URL to satisfy AnyUrl | None + final_url = doc_url if doc_url else None + if (final_url, filename) not in seen_docs: + documents.append( + ReferencedDocument(doc_url=final_url, doc_title=filename) + ) + seen_docs.add((final_url, filename)) + + # 2. 
Parse from message content annotations + elif item_type == "message": + content = getattr(output_item, "content", None) + if isinstance(content, list): + for part in content: + # Skip if part is a string or doesn't have annotations + if isinstance(part, str): + continue + + annotations = getattr(part, "annotations", []) or [] + for annotation in annotations: + # Handle both object and dict access for annotations + if isinstance(annotation, dict): + anno_type = annotation.get("type") + anno_url = annotation.get("url") + anno_title = annotation.get("title") or annotation.get( + "filename" + ) + else: + anno_type = getattr(annotation, "type", None) + anno_url = getattr(annotation, "url", None) + anno_title = getattr(annotation, "title", None) or getattr( + annotation, "filename", None + ) + + if anno_type == "url_citation": + # Treat empty string as None + final_url = anno_url if anno_url else None + if (final_url, anno_title) not in seen_docs: + documents.append( + ReferencedDocument( + doc_url=final_url, doc_title=anno_title + ) + ) + seen_docs.add((final_url, anno_title)) + + elif anno_type == "file_citation": + if (None, anno_title) not in seen_docs: + documents.append( + ReferencedDocument( + doc_url=None, doc_title=anno_title + ) + ) + seen_docs.add((None, anno_title)) + + return documents def extract_token_usage_from_responses_api( diff --git a/src/app/endpoints/streaming_query.py b/src/app/endpoints/streaming_query.py index 1b440a33d..d9c333a9c 100644 --- a/src/app/endpoints/streaming_query.py +++ b/src/app/endpoints/streaming_query.py @@ -9,21 +9,20 @@ from datetime import UTC, datetime from typing import Annotated, Any, AsyncGenerator, AsyncIterator, Iterator, cast -from fastapi import APIRouter, Depends, HTTPException, Request +from fastapi import APIRouter, Depends, Request from fastapi.responses import StreamingResponse from litellm.exceptions import RateLimitError from llama_stack_client import ( APIConnectionError, AsyncLlamaStackClient, # type: ignore ) -from llama_stack_client.lib.agents.event_logger import interleaved_content_as_str from llama_stack_client.types import UserMessage # type: ignore -from llama_stack_client.types.agents.agent_turn_response_stream_chunk import ( +from llama_stack_client.types.alpha.agents.agent_turn_response_stream_chunk import ( AgentTurnResponseStreamChunk, ) -from llama_stack_client.types.agents.turn_create_params import Document from llama_stack_client.types.shared import ToolCall from llama_stack_client.types.shared.interleaved_content_item import TextContentItem +from openai._exceptions import APIStatusError import metrics from app.endpoints.query import ( @@ -38,6 +37,7 @@ validate_attachments_metadata, validate_conversation_ownership, ) +from app.endpoints.query import parse_referenced_documents from authentication import get_auth_dependency from authentication.interface import AuthTuple from authorization.middleware import authorize @@ -50,15 +50,19 @@ from models.database.conversations import UserConversation from models.requests import QueryRequest from models.responses import ( + AbstractErrorResponse, ForbiddenResponse, InternalServerErrorResponse, + PromptTooLongResponse, NotFoundResponse, QuotaExceededResponse, ServiceUnavailableResponse, + StreamingQueryResponse, UnauthorizedResponse, UnprocessableEntityResponse, ) from utils.endpoints import ( + ReferencedDocument, check_configuration_loaded, cleanup_after_streaming, create_rag_chunks_dict, @@ -67,31 +71,17 @@ validate_model_provider_override, ) from utils.mcp_headers import 
handle_mcp_headers_with_toolgroups, mcp_headers_dependency +from utils.quota import get_available_quotas from utils.token_counter import TokenCounter, extract_token_usage_from_turn from utils.transcripts import store_transcript -from utils.types import TurnSummary +from utils.types import TurnSummary, content_to_str logger = logging.getLogger("app.endpoints.handlers") router = APIRouter(tags=["streaming_query"]) streaming_query_responses: dict[int | str, dict[str, Any]] = { - 200: { - "description": "Streaming response (Server-Sent Events)", - "content": { - "text/event-stream": { - "schema": {"type": "string"}, - "example": ( - 'data: {"event": "start", ' - '"data": {"conversation_id": "123e4567-e89b-12d3-a456-426614174000"}}\n\n' - 'data: {"event": "token", "data": {"id": 0, "token": "Hello"}}\n\n' - 'data: {"event": "end", "data": {"referenced_documents": [], ' - '"truncated": null, "input_tokens": 0, "output_tokens": 0}, ' - '"available_quotas": {}}\n\n' - ), - } - }, - }, + 200: StreamingQueryResponse.openapi_response(), 401: UnauthorizedResponse.openapi_response( examples=["missing header", "missing token"] ), @@ -101,6 +91,7 @@ 404: NotFoundResponse.openapi_response( examples=["conversation", "model", "provider"] ), + 413: PromptTooLongResponse.openapi_response(), 422: UnprocessableEntityResponse.openapi_response(), 429: QuotaExceededResponse.openapi_response(), 500: InternalServerErrorResponse.openapi_response(examples=["configuration"]), @@ -156,8 +147,9 @@ def stream_start_event(conversation_id: str) -> str: def stream_end_event( metadata_map: dict, - summary: TurnSummary, # pylint: disable=unused-argument token_usage: TokenCounter, + available_quotas: dict[str, int], + referenced_documents: list[ReferencedDocument], media_type: str = MEDIA_TYPE_JSON, ) -> str: """ @@ -187,28 +179,20 @@ def stream_end_event( ) return f"\n\n---\n\n{ref_docs_string}" if ref_docs_string else "" - # For JSON media type, we need to create a proper structure - # Since we don't have access to summary here, we'll create a basic structure - referenced_docs_dict = [ - { - "doc_url": v.get("docs_url"), - "doc_title": v.get("title"), - } - for v in metadata_map.values() - if "docs_url" in v and "title" in v - ] + # Convert ReferencedDocument objects to dicts for JSON serialization + # Use mode="json" to ensure AnyUrl is serialized to string (not just model_dump()) + referenced_docs_dict = [doc.model_dump(mode="json") for doc in referenced_documents] return format_stream_data( { "event": "end", "data": { - "rag_chunks": [], # TODO(jboos): implement RAG chunks when summary is available "referenced_documents": referenced_docs_dict, "truncated": None, # TODO(jboos): implement truncated "input_tokens": token_usage.input_tokens, "output_tokens": token_usage.output_tokens, }, - "available_quotas": {}, # TODO(jboos): implement available quotas + "available_quotas": available_quotas, } ) @@ -376,6 +360,23 @@ def generic_llm_error(error: Exception, media_type: str) -> str: ) +async def stream_http_error(error: AbstractErrorResponse) -> AsyncGenerator[str, None]: + """ + Yield an SSE-formatted error response for generic LLM or API errors. + + Args: + error: An AbstractErrorResponse instance representing the error. + + Yields: + str: A Server-Sent Events (SSE) formatted error message containing + the serialized error details. 
+ """ + logger.error("Error while obtaining answer for user question") + logger.exception(error) + + yield format_stream_data({"event": "error", "data": {**error.detail.model_dump()}}) + + # ----------------------------------- # Turn handling # ----------------------------------- @@ -431,9 +432,7 @@ def _handle_turn_complete_event( str: SSE-formatted string containing the turn completion event and output message content. """ - full_response = interleaved_content_as_str( - chunk.event.payload.turn.output_message.content - ) + full_response = content_to_str(chunk.event.payload.turn.output_message.content) if media_type == MEDIA_TYPE_TEXT: yield ( @@ -602,7 +601,7 @@ def _handle_tool_execution_event( for r in chunk.event.payload.step_details.tool_responses: if r.tool_name == "query_from_memory": - inserted_context = interleaved_content_as_str(r.content) + inserted_context = content_to_str(r.content) yield stream_event( data={ "id": chunk_id, @@ -653,7 +652,7 @@ def _handle_tool_execution_event( "id": chunk_id, "token": { "tool_name": r.tool_name, - "response": interleaved_content_as_str(r.content), + "response": content_to_str(r.content), }, }, event_type=LLM_TOOL_RESULT_EVENT, @@ -721,7 +720,12 @@ async def response_generator( complete response for transcript storage if enabled. """ chunk_id = 0 - summary = TurnSummary(llm_response="No response from the model", tool_calls=[]) + summary = TurnSummary( + llm_response="No response from the model", + tool_calls=[], + tool_results=[], + rag_chunks=[], + ) # Determine media type for response formatting media_type = context.query_request.media_type or MEDIA_TYPE_JSON @@ -736,9 +740,7 @@ async def response_generator( continue p = chunk.event.payload if p.event_type == "turn_complete": - summary.llm_response = interleaved_content_as_str( - p.turn.output_message.content - ) + summary.llm_response = content_to_str(p.turn.output_message.content) latest_turn = p.turn system_prompt = get_system_prompt(context.query_request, configuration) try: @@ -767,8 +769,19 @@ async def response_generator( if latest_turn is not None else TokenCounter() ) - - yield stream_end_event(context.metadata_map, summary, token_usage, media_type) + referenced_documents = ( + parse_referenced_documents(latest_turn) if latest_turn is not None else [] + ) + available_quotas = get_available_quotas( + configuration.quota_limiters, context.user_id + ) + yield stream_end_event( + context.metadata_map, + token_usage, + available_quotas, + referenced_documents, + media_type, + ) # Perform cleanup tasks (database and cache operations) await cleanup_after_streaming( @@ -848,12 +861,16 @@ async def streaming_query_endpoint_handler_base( # pylint: disable=too-many-loc user_id, query_request.conversation_id, ) - response = ForbiddenResponse.conversation( + forbidden_error = ForbiddenResponse.conversation( action="read", resource_id=query_request.conversation_id, user_id=user_id, ) - raise HTTPException(**response.model_dump()) + return StreamingResponse( + stream_http_error(forbidden_error), + media_type="text/event-stream", + status_code=forbidden_error.status_code, + ) try: # try to get Llama Stack client @@ -899,45 +916,47 @@ async def streaming_query_endpoint_handler_base( # pylint: disable=too-many-loc return StreamingResponse( response_generator(response), media_type="text/event-stream" ) - # connection to Llama Stack server except APIConnectionError as e: - # Update metrics for the LLM call failure metrics.llm_calls_failures_total.inc() logger.error("Unable to connect to Llama Stack: 
%s", e) - response = ServiceUnavailableResponse( + error_response = ServiceUnavailableResponse( backend_name="Llama Stack", cause=str(e), ) - raise HTTPException(**response.model_dump()) from e - + return StreamingResponse( + stream_http_error(error_response), + status_code=error_response.status_code, + media_type="text/event-stream", + ) except RateLimitError as e: used_model = getattr(e, "model", "") if used_model: - response = QuotaExceededResponse.model(used_model) + error_response = QuotaExceededResponse.model(used_model) else: - response = QuotaExceededResponse( + error_response = QuotaExceededResponse( response="The quota has been exceeded", cause=str(e) ) - raise HTTPException(**response.model_dump()) from e - - except Exception as e: # pylint: disable=broad-except - # Handle other errors with OLS-compatible error response - # This broad exception catch is intentional to ensure all errors - # are converted to OLS-compatible streaming responses - media_type = query_request.media_type or MEDIA_TYPE_JSON - error_response = generic_llm_error(e, media_type) - - async def error_generator() -> AsyncGenerator[str, None]: - yield error_response - - # Use text/event-stream for SSE-formatted JSON responses, text/plain for plain text - content_type = ( - "text/event-stream" if media_type == MEDIA_TYPE_JSON else "text/plain" + return StreamingResponse( + stream_http_error(error_response), + status_code=error_response.status_code, + media_type="text/event-stream", + ) + except APIStatusError as e: + metrics.llm_calls_failures_total.inc() + logger.error("API status error: %s", e) + error_response = InternalServerErrorResponse.generic() + return StreamingResponse( + stream_http_error(error_response), + status_code=error_response.status_code, + media_type=query_request.media_type or MEDIA_TYPE_JSON, ) - return StreamingResponse(error_generator(), media_type=content_type) -@router.post("/streaming_query", responses=streaming_query_responses) +@router.post( + "/streaming_query", + response_class=StreamingResponse, + responses=streaming_query_responses, +) @authorize(Action.STREAMING_QUERY) async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals,too-many-statements request: Request, @@ -948,16 +967,23 @@ async def streaming_query_endpoint_handler( # pylint: disable=too-many-locals,t """ Handle request to the /streaming_query endpoint using Agent API. - This is a wrapper around streaming_query_endpoint_handler_base that provides - the Agent API specific retrieve_response and response generator functions. + Returns a streaming response using Server-Sent Events (SSE) format with + content type text/event-stream. Returns: StreamingResponse: An HTTP streaming response yielding - SSE-formatted events for the query lifecycle. + SSE-formatted events for the query lifecycle with content type + text/event-stream. Raises: - HTTPException: Returns HTTP 500 if unable to connect to the - Llama Stack server. 
+ HTTPException: + - 401: Unauthorized - Missing or invalid credentials + - 403: Forbidden - Insufficient permissions or model override not allowed + - 404: Not Found - Conversation, model, or provider not found + - 422: Unprocessable Entity - Request validation failed + - 429: Too Many Requests - Quota limit exceeded + - 500: Internal Server Error - Configuration not loaded or other server errors + - 503: Service Unavailable - Unable to connect to Llama Stack backend """ return await streaming_query_endpoint_handler_base( request=request, @@ -1062,7 +1088,7 @@ async def retrieve_response( } vector_db_ids = [ - vector_db.identifier for vector_db in await client.vector_dbs.list() + vector_store.id for vector_store in (await client.vector_stores.list()).data ] toolgroups = (get_rag_toolgroups(vector_db_ids) or []) + [ mcp_server.name for mcp_server in configuration.mcp_servers @@ -1072,21 +1098,21 @@ async def retrieve_response( toolgroups = None # TODO: LCORE-881 - Remove if Llama Stack starts to support these mime types - documents: list[Document] = [ - ( - {"content": doc["content"], "mime_type": "text/plain"} - if doc["mime_type"].lower() in ("application/json", "application/xml") - else doc - ) - for doc in query_request.get_documents() - ] + # documents: list[Document] = [ + # ( + # {"content": doc["content"], "mime_type": "text/plain"} + # if doc["mime_type"].lower() in ("application/json", "application/xml") + # else doc + # ) + # for doc in query_request.get_documents() + # ] response = await agent.create_turn( - messages=[UserMessage(role="user", content=query_request.query)], + messages=[UserMessage(role="user", content=query_request.query).model_dump()], session_id=session_id, - documents=documents, + # documents=documents, stream=True, - toolgroups=toolgroups, + # toolgroups=toolgroups, ) response = cast(AsyncIterator[AgentTurnResponseStreamChunk], response) diff --git a/src/app/endpoints/streaming_query_v2.py b/src/app/endpoints/streaming_query_v2.py index f5e8f0269..eb4e73c5a 100644 --- a/src/app/endpoints/streaming_query_v2.py +++ b/src/app/endpoints/streaming_query_v2.py @@ -6,9 +6,10 @@ from fastapi import APIRouter, Depends, Request from fastapi.responses import StreamingResponse from llama_stack.apis.agents.openai_responses import ( + OpenAIResponseObject, OpenAIResponseObjectStream, ) -from llama_stack_client import AsyncLlamaStackClient # type: ignore +from llama_stack_client import AsyncLlamaStackClient from app.endpoints.query import ( is_transcripts_enabled, @@ -18,6 +19,7 @@ from app.endpoints.query_v2 import ( extract_token_usage_from_responses_api, get_topic_summary, + parse_referenced_documents_from_responses_api, prepare_tools_for_responses_api, ) from app.endpoints.streaming_query import ( @@ -40,6 +42,7 @@ NotFoundResponse, QuotaExceededResponse, ServiceUnavailableResponse, + StreamingQueryResponse, UnauthorizedResponse, UnprocessableEntityResponse, ) @@ -47,6 +50,8 @@ cleanup_after_streaming, get_system_prompt, ) +from utils.quota import consume_tokens, get_available_quotas +from utils.suid import normalize_conversation_id, to_llama_stack_conversation_id from utils.mcp_headers import mcp_headers_dependency from utils.shields import detect_shield_violations, get_available_shields from utils.token_counter import TokenCounter @@ -54,34 +59,11 @@ from utils.types import ToolCallSummary, TurnSummary logger = logging.getLogger("app.endpoints.handlers") -router = APIRouter(tags=["streaming_query_v2"]) +router = APIRouter(tags=["streaming_query_v1"]) auth_dependency 
= get_auth_dependency() streaming_query_v2_responses: dict[int | str, dict[str, Any]] = { - 200: { - "description": "Streaming response with Server-Sent Events", - "content": { - "application/json": { - "schema": { - "type": "string", - "example": ( - 'data: {"event": "start", ' - '"data": {"conversation_id": "123e4567-e89b-12d3-a456-426614174000"}}\n\n' - 'data: {"event": "token", "data": {"id": 0, "token": "Hello"}}\n\n' - 'data: {"event": "end", "data": {"referenced_documents": [], ' - '"truncated": null, "input_tokens": 0, "output_tokens": 0}, ' - '"available_quotas": {}}\n\n' - ), - } - }, - "text/plain": { - "schema": { - "type": "string", - "example": "Hello world!\n\n---\n\nReference: https://example.com/doc", - } - }, - }, - }, + 200: StreamingQueryResponse.openapi_response(), 401: UnauthorizedResponse.openapi_response( examples=["missing header", "missing token"] ), @@ -91,6 +73,7 @@ 404: NotFoundResponse.openapi_response( examples=["conversation", "model", "provider"] ), + # 413: PromptTooLongResponse.openapi_response(), 422: UnprocessableEntityResponse.openapi_response(), 429: QuotaExceededResponse.openapi_response(), 500: InternalServerErrorResponse.openapi_response(examples=["configuration"]), @@ -129,7 +112,9 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat complete response for transcript storage if enabled. """ chunk_id = 0 - summary = TurnSummary(llm_response="", tool_calls=[]) + summary = TurnSummary( + llm_response="", tool_calls=[], tool_results=[], rag_chunks=[] + ) # Determine media type for response formatting media_type = context.query_request.media_type or MEDIA_TYPE_JSON @@ -139,8 +124,9 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat tool_item_registry: dict[str, dict[str, str]] = {} emitted_turn_complete = False - # Handle conversation id and start event in-band on response.created + # Use the conversation_id from context (either provided or newly created) conv_id = context.conversation_id + start_event_emitted = False # Track the latest response object from response.completed event latest_response_object: Any | None = None @@ -151,14 +137,13 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat event_type = getattr(chunk, "type", None) logger.debug("Processing chunk %d, type: %s", chunk_id, event_type) - # Emit start on response.created - if event_type == "response.created": - try: - conv_id = getattr(chunk, "response").id - except Exception: # pylint: disable=broad-except - logger.warning("Missing response id!") - conv_id = "" + # Emit start event on first chunk (conversation_id is always set at this point) + if not start_event_emitted: yield stream_start_event(conv_id) + start_event_emitted = True + + # Handle response.created event (just skip, no need to extract conversation_id) + if event_type == "response.created": continue # Text streaming @@ -237,8 +222,10 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat ToolCallSummary( id=meta.get("call_id", item_id or "unknown"), name=meta.get("name", "tool_call"), - args=arguments, - response=None, + args=( + arguments if isinstance(arguments, dict) else {} + ), # Handle non-dict arguments + type="tool_call", ) ) @@ -249,9 +236,9 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat # Check for shield violations in the completed response if latest_response_object: - detect_shield_violations( - getattr(latest_response_object, "output", []) - ) + output = 
getattr(latest_response_object, "output", None) + if output is not None: + detect_shield_violations(output) if not emitted_turn_complete: final_message = summary.llm_response or "".join(text_parts) @@ -286,10 +273,27 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat if latest_response_object is not None else TokenCounter() ) + consume_tokens( + configuration.quota_limiters, + context.user_id, + input_tokens=token_usage.input_tokens, + output_tokens=token_usage.output_tokens, + ) + referenced_documents = parse_referenced_documents_from_responses_api( + cast(OpenAIResponseObject, latest_response_object) + ) + available_quotas = get_available_quotas( + configuration.quota_limiters, context.user_id + ) + yield stream_end_event( + context.metadata_map, + token_usage, + available_quotas, + referenced_documents, + media_type, + ) - yield stream_end_event(context.metadata_map, summary, token_usage, media_type) - - # Perform cleanup tasks (database and cache operations) + # Perform cleanup tasks (database and cache operations)) await cleanup_after_streaming( user_id=context.user_id, conversation_id=conv_id, @@ -313,7 +317,12 @@ async def response_generator( # pylint: disable=too-many-branches,too-many-stat return response_generator -@router.post("/streaming_query", responses=streaming_query_v2_responses) +@router.post( + "/streaming_query", + response_class=StreamingResponse, + responses=streaming_query_v2_responses, + summary="Streaming Query Endpoint Handler V1", +) @authorize(Action.STREAMING_QUERY) async def streaming_query_endpoint_handler_v2( # pylint: disable=too-many-locals request: Request, @@ -324,16 +333,23 @@ async def streaming_query_endpoint_handler_v2( # pylint: disable=too-many-local """ Handle request to the /streaming_query endpoint using Responses API. - This is a wrapper around streaming_query_endpoint_handler_base that provides - the Responses API specific retrieve_response and response generator functions. + Returns a streaming response using Server-Sent Events (SSE) format with + content type text/event-stream. Returns: StreamingResponse: An HTTP streaming response yielding - SSE-formatted events for the query lifecycle. + SSE-formatted events for the query lifecycle with content type + text/event-stream. Raises: - HTTPException: Returns HTTP 500 if unable to connect to the - Llama Stack server. 
+ HTTPException: + - 401: Unauthorized - Missing or invalid credentials + - 403: Forbidden - Insufficient permissions or model override not allowed + - 404: Not Found - Conversation, model, or provider not found + - 422: Unprocessable Entity - Request validation failed + - 429: Too Many Requests - Quota limit exceeded + - 500: Internal Server Error - Configuration not loaded or other server errors + - 503: Service Unavailable - Unable to connect to Llama Stack backend """ return await streaming_query_endpoint_handler_base( request=request, @@ -345,7 +361,7 @@ async def streaming_query_endpoint_handler_v2( # pylint: disable=too-many-local ) -async def retrieve_response( +async def retrieve_response( # pylint: disable=too-many-locals client: AsyncLlamaStackClient, model_id: str, query_request: QueryRequest, @@ -402,6 +418,26 @@ async def retrieve_response( f"{attachment.content}" ) + # Handle conversation ID for Responses API + # Create conversation upfront if not provided + conversation_id = query_request.conversation_id + if conversation_id: + # Conversation ID was provided - convert to llama-stack format + logger.debug("Using existing conversation ID: %s", conversation_id) + llama_stack_conv_id = to_llama_stack_conversation_id(conversation_id) + else: + # No conversation_id provided - create a new conversation first + logger.debug("No conversation_id provided, creating new conversation") + conversation = await client.conversations.create(metadata={}) + llama_stack_conv_id = conversation.id + # Store the normalized version for later use + conversation_id = normalize_conversation_id(llama_stack_conv_id) + logger.info( + "Created new conversation with ID: %s (normalized: %s)", + llama_stack_conv_id, + conversation_id, + ) + create_params: dict[str, Any] = { "input": input_text, "model": model_id, @@ -409,9 +445,8 @@ async def retrieve_response( "stream": True, "store": True, "tools": toolgroups, + "conversation": llama_stack_conv_id, } - if query_request.conversation_id: - create_params["previous_response_id"] = query_request.conversation_id # Add shields to extra_body if available if available_shields: @@ -419,7 +454,8 @@ async def retrieve_response( response = await client.responses.create(**create_params) response_stream = cast(AsyncIterator[OpenAIResponseObjectStream], response) - - # For streaming responses, the ID arrives in the first 'response.created' chunk - # Return empty conversation_id here; it will be set once the first chunk is received - return response_stream, "" + # async for chunk in response_stream: + # logger.error("Chunk: %s", chunk.model_dump_json()) + # Return the normalized conversation_id (already normalized above) + # The response_generator will emit it in the start event + return response_stream, conversation_id diff --git a/src/app/routers.py b/src/app/routers.py index 9a0d7e924..ae9cf51ce 100644 --- a/src/app/routers.py +++ b/src/app/routers.py @@ -9,15 +9,13 @@ providers, rags, root, - query, health, config, feedback, - streaming_query, streaming_query_v2, authorized, - conversations, conversations_v2, + conversations_v3, metrics, tools, # V2 endpoints for Response API support @@ -39,16 +37,17 @@ def include_routers(app: FastAPI) -> None: app.include_router(shields.router, prefix="/v1") app.include_router(providers.router, prefix="/v1") app.include_router(rags.router, prefix="/v1") - app.include_router(query.router, prefix="/v1") - app.include_router(streaming_query.router, prefix="/v1") + # V1 endpoints now use V2 implementations (query and streaming_query are 
deprecated) + app.include_router(query_v2.router, prefix="/v1") + app.include_router(streaming_query_v2.router, prefix="/v1") app.include_router(config.router, prefix="/v1") app.include_router(feedback.router, prefix="/v1") - app.include_router(conversations.router, prefix="/v1") + # V1 conversations endpoint now uses V3 implementation (conversations is deprecated) + app.include_router(conversations_v3.router, prefix="/v1") app.include_router(conversations_v2.router, prefix="/v2") - # V2 endpoints - Response API support - app.include_router(query_v2.router, prefix="/v2") - app.include_router(streaming_query_v2.router, prefix="/v2") + # Note: query_v2, streaming_query_v2, and conversations_v3 are now exposed at /v1 above + # The old query, streaming_query, and conversations modules are deprecated # road-core does not version these endpoints app.include_router(health.router) diff --git a/src/constants.py b/src/constants.py index 82ea14151..7364c39e0 100644 --- a/src/constants.py +++ b/src/constants.py @@ -2,7 +2,7 @@ # Minimal and maximal supported Llama Stack version MINIMAL_SUPPORTED_LLAMA_STACK_VERSION = "0.2.17" -MAXIMAL_SUPPORTED_LLAMA_STACK_VERSION = "0.2.22" +MAXIMAL_SUPPORTED_LLAMA_STACK_VERSION = "0.3.0" UNABLE_TO_PROCESS_RESPONSE = "Unable to process this request" diff --git a/src/metrics/utils.py b/src/metrics/utils.py index 451487bef..cb1f8b000 100644 --- a/src/metrics/utils.py +++ b/src/metrics/utils.py @@ -7,7 +7,7 @@ from llama_stack.models.llama.llama3.chat_format import ChatFormat from llama_stack.models.llama.llama3.tokenizer import Tokenizer from llama_stack_client import APIConnectionError, APIStatusError -from llama_stack_client.types.agents.turn import Turn +from llama_stack_client.types.alpha.agents.turn import Turn import metrics from client import AsyncLlamaStackClientHolder diff --git a/src/models/requests.py b/src/models/requests.py index 1a43a1737..24f0623ed 100644 --- a/src/models/requests.py +++ b/src/models/requests.py @@ -4,7 +4,7 @@ from enum import Enum from pydantic import BaseModel, model_validator, field_validator, Field -from llama_stack_client.types.agents.turn_create_params import Document +from llama_stack_client.types.alpha.agents.turn_create_params import Document from log import get_logger from utils import suid diff --git a/src/models/responses.py b/src/models/responses.py index f59886a7f..9a90d4fc0 100644 --- a/src/models/responses.py +++ b/src/models/responses.py @@ -10,7 +10,9 @@ from quota.quota_exceed_error import QuotaExceedError from models.config import Action, Configuration +from utils.types import ToolCallSummary, ToolResultSummary +SUCCESSFUL_RESPONSE_DESCRIPTION = "Successful response" BAD_REQUEST_DESCRIPTION = "Invalid request format" UNAUTHORIZED_DESCRIPTION = "Unauthorized" FORBIDDEN_DESCRIPTION = "Permission denied" @@ -19,23 +21,23 @@ INVALID_FEEDBACK_PATH_DESCRIPTION = "Invalid feedback storage path" SERVICE_UNAVAILABLE_DESCRIPTION = "Service unavailable" QUOTA_EXCEEDED_DESCRIPTION = "Quota limit exceeded" +PROMPT_TOO_LONG_DESCRIPTION = "Prompt is too long" INTERNAL_SERVER_ERROR_DESCRIPTION = "Internal server error" -class RAGChunk(BaseModel): - """Model representing a RAG chunk used in the response.""" +# class ToolCall(BaseModel): +# """Model representing a tool call made during response generation.""" - content: str = Field(description="The content of the chunk") - source: str | None = Field(None, description="Source document or URL") - score: float | None = Field(None, description="Relevance score") +# tool_name: str = 
Field(description="Name of the tool called") +# arguments: dict[str, Any] = Field(description="Arguments passed to the tool") +# result: dict[str, Any] | None = Field(None, description="Result from the tool") -class ToolCall(BaseModel): - """Model representing a tool call made during response generation.""" +# class ToolResult(BaseModel): +# """Model representing a tool result.""" - tool_name: str = Field(description="Name of the tool called") - arguments: dict[str, Any] = Field(description="Arguments passed to the tool") - result: dict[str, Any] | None = Field(None, description="Result from the tool") +# tool_name: str = Field(description="Name of the tool") +# result: dict[str, Any] = Field(description="Result from the tool") class AbstractSuccessfulResponse(BaseModel): @@ -52,7 +54,7 @@ def openapi_response(cls) -> dict[str, Any]: content = {"application/json": {"example": example_value}} return { - "description": "Successful response", + "description": SUCCESSFUL_RESPONSE_DESCRIPTION, "model": cls, "content": content, } @@ -364,16 +366,6 @@ class QueryResponse(AbstractSuccessfulResponse): ], ) - rag_chunks: list[RAGChunk] = Field( - [], - description="List of RAG chunks used to generate the response", - ) - - tool_calls: list[ToolCall] | None = Field( - None, - description="List of tool calls made during response generation", - ) - referenced_documents: list[ReferencedDocument] = Field( default_factory=list, description="List of documents referenced in generating the response", @@ -412,43 +404,122 @@ class QueryResponse(AbstractSuccessfulResponse): examples=[{"daily": 1000, "monthly": 50000}], ) + tool_calls: list[ToolCallSummary] | None = Field( + None, + description="List of tool calls made during response generation", + ) + + tool_results: list[ToolResultSummary] | None = Field( + None, + description="List of tool results", + ) + model_config = { "json_schema_extra": { "examples": [ { "conversation_id": "123e4567-e89b-12d3-a456-426614174000", "response": "Operator Lifecycle Manager (OLM) helps users install...", - "rag_chunks": [ + "referenced_documents": [ { - "content": "OLM is a component of the Operator Framework toolkit...", - "source": "kubernetes-docs/operators.md", - "score": 0.95, - } + "doc_url": "https://docs.openshift.com/container-platform/4.15/" + "operators/understanding/olm/olm-understanding-olm.html", + "doc_title": "Operator Lifecycle Manager concepts and resources", + }, ], + "truncated": False, + "input_tokens": 123, + "output_tokens": 456, + "available_quotas": { + "UserQuotaLimiter": 998911, + "ClusterQuotaLimiter": 998911, + }, "tool_calls": [ - { - "tool_name": "knowledge_search", - "arguments": {"query": "operator lifecycle manager"}, - "result": {"chunks_found": 5}, - } + {"name": "tool1", "args": {}, "id": "1", "type": "tool_call"} ], - "referenced_documents": [ + "tool_results": [ { - "doc_url": "https://docs.openshift.com/" - "container-platform/4.15/operators/olm/index.html", - "doc_title": "Operator Lifecycle Manager (OLM)", + "id": "1", + "status": "success", + "content": "bla", + "type": "tool_result", + "round": 1, } ], - "truncated": False, - "input_tokens": 150, - "output_tokens": 75, - "available_quotas": {"daily": 1000, "monthly": 50000}, } ] } } +class StreamingQueryResponse(AbstractSuccessfulResponse): + """Documentation-only model for streaming query responses using Server-Sent Events (SSE).""" + + @classmethod + def openapi_response(cls) -> dict[str, Any]: + """Generate FastAPI response dict for SSE streaming with examples. 
+ + Note: This is used for OpenAPI documentation only. The actual endpoint + returns a StreamingResponse object, not this Pydantic model. + """ + schema = cls.model_json_schema() + model_examples = schema.get("examples") + if not model_examples: + raise SchemaError(f"Examples not found in {cls.__name__}") + example_value = model_examples[0] + content = { + "text/event-stream": { + "schema": {"type": "string", "format": "text/event-stream"}, + "example": example_value, + } + } + + return { + "description": SUCCESSFUL_RESPONSE_DESCRIPTION, + "content": content, + # Note: No "model" key since we're not actually serializing this model + } + + model_config = { + "json_schema_extra": { + "examples": [ + ( + 'data: {"event": "start", "data": {' + '"conversation_id": "123e4567-e89b-12d3-a456-426614174000"}}\n\n' + 'data: {"event": "token", "data": {' + '"id": 0, "token": "No Violation"}}\n\n' + 'data: {"event": "token", "data": {' + '"id": 1, "token": ""}}\n\n' + 'data: {"event": "token", "data": {' + '"id": 2, "token": "Hello"}}\n\n' + 'data: {"event": "token", "data": {' + '"id": 3, "token": "!"}}\n\n' + 'data: {"event": "token", "data": {' + '"id": 4, "token": " How"}}\n\n' + 'data: {"event": "token", "data": {' + '"id": 5, "token": " can"}}\n\n' + 'data: {"event": "token", "data": {' + '"id": 6, "token": " I"}}\n\n' + 'data: {"event": "token", "data": {' + '"id": 7, "token": " assist"}}\n\n' + 'data: {"event": "token", "data": {' + '"id": 8, "token": " you"}}\n\n' + 'data: {"event": "token", "data": {' + '"id": 9, "token": " today"}}\n\n' + 'data: {"event": "token", "data": {' + '"id": 10, "token": "?"}}\n\n' + 'data: {"event": "turn_complete", "data": {' + '"token": "Hello! How can I assist you today?"}}\n\n' + 'data: {"event": "end", "data": {' + '"referenced_documents": [], ' + '"truncated": null, "input_tokens": 11, "output_tokens": 19}, ' + '"available_quotas": {}}\n\n' + ), + ] + } + } + + class InfoResponse(AbstractSuccessfulResponse): """Model representing a response to an info request. @@ -825,7 +896,7 @@ def openapi_response(cls) -> dict[str, Any]: content = {"application/json": {"examples": named_examples or None}} return { - "description": "Successful response", + "description": SUCCESSFUL_RESPONSE_DESCRIPTION, "model": cls, "content": content, } @@ -1517,6 +1588,38 @@ def __init__(self, *, resource: str, resource_id: str): ) +class PromptTooLongResponse(AbstractErrorResponse): + """413 Payload Too Large - Prompt is too long.""" + + description: ClassVar[str] = PROMPT_TOO_LONG_DESCRIPTION + model_config = { + "json_schema_extra": { + "examples": [ + { + "label": "prompt too long", + "detail": { + "response": "Prompt is too long", + "cause": "The prompt exceeds the maximum allowed length.", + }, + }, + ] + } + } + + def __init__(self, *, response: str = "Prompt is too long", cause: str): + """Initialize a PromptTooLongResponse. + + Args: + response: Short summary of the error. Defaults to "Prompt is too long". + cause: Detailed explanation of what caused the error. 
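+
+        Example (illustrative usage, following this codebase's error pattern):
+            raise HTTPException(**PromptTooLongResponse(
+                cause="The prompt exceeds the maximum allowed length.").model_dump())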
+ """ + super().__init__( + response=response, + cause=cause, + status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE, + ) + + class UnprocessableEntityResponse(AbstractErrorResponse): """422 Unprocessable Entity - Request validation failed.""" diff --git a/src/utils/endpoints.py b/src/utils/endpoints.py index a737f23ab..b6d5ff735 100644 --- a/src/utils/endpoints.py +++ b/src/utils/endpoints.py @@ -30,8 +30,15 @@ logger = get_logger(__name__) -def delete_conversation(conversation_id: str) -> None: - """Delete a conversation according to its ID.""" +def delete_conversation(conversation_id: str) -> bool: + """Delete a conversation from the local database by its ID. + + Args: + conversation_id (str): The unique identifier of the conversation to delete. + + Returns: + bool: True if the conversation was deleted, False if it was not found. + """ with get_session() as session: db_conversation = ( session.query(UserConversation).filter_by(id=conversation_id).first() @@ -40,11 +47,12 @@ def delete_conversation(conversation_id: str) -> None: session.delete(db_conversation) session.commit() logger.info("Deleted conversation %s from local database", conversation_id) - else: - logger.info( - "Conversation %s not found in local database, it may have already been deleted", - conversation_id, - ) + return True + logger.info( + "Conversation %s not found in local database, it may have already been deleted", + conversation_id, + ) + return False def retrieve_conversation(conversation_id: str) -> UserConversation | None: @@ -249,7 +257,7 @@ def store_conversation_into_cache( ) -# # pylint: disable=R0913,R0917 +# # pylint: disable=R0913,R0917,unused-argument async def get_agent( client: AsyncLlamaStackClient, model_id: str, @@ -302,31 +310,36 @@ async def get_agent( existing_agent_id = None if conversation_id: with suppress(ValueError): - agent_response = await client.agents.retrieve(agent_id=conversation_id) - existing_agent_id = agent_response.agent_id + # agent_response = await client.agents.retrieve(agent_id=conversation_id) + # existing_agent_id = agent_response.agent_id + ... 
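+            # The bare `...` keeps this `with suppress(ValueError)` block
+            # syntactically valid while the agents.retrieve() lookup above is
+            # commented out (presumably during the llama-stack 0.3.x migration).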
logger.debug("Creating new agent") + # pylint: disable=unexpected-keyword-arg,no-member agent = AsyncAgent( client, # type: ignore[arg-type] model=model_id, instructions=system_prompt, - input_shields=available_input_shields if available_input_shields else [], - output_shields=available_output_shields if available_output_shields else [], + # type: ignore[call-arg] + # input_shields=available_input_shields if available_input_shields else [], + # type: ignore[call-arg] + # output_shields=available_output_shields if available_output_shields else [], tool_parser=None if no_tools else GraniteToolParser.get_parser(model_id), - enable_session_persistence=True, + enable_session_persistence=True, # type: ignore[call-arg] ) - await agent.initialize() + await agent.initialize() # type: ignore[attr-defined] if existing_agent_id and conversation_id: logger.debug("Existing conversation ID: %s", conversation_id) logger.debug("Existing agent ID: %s", existing_agent_id) - orphan_agent_id = agent.agent_id + # orphan_agent_id = agent.agent_id agent._agent_id = conversation_id # type: ignore[assignment] # pylint: disable=protected-access - await client.agents.delete(agent_id=orphan_agent_id) - sessions_response = await client.agents.session.list(agent_id=conversation_id) - logger.info("session response: %s", sessions_response) + # await client.agents.delete(agent_id=orphan_agent_id) + # sessions_response = await client.agents.session.list(agent_id=conversation_id) + # logger.info("session response: %s", sessions_response) try: - session_id = str(sessions_response.data[0]["session_id"]) + # session_id = str(sessions_response.data[0]["session_id"]) + ... except IndexError as e: logger.error("No sessions found for conversation %s", conversation_id) response = NotFoundResponse( @@ -334,12 +347,13 @@ async def get_agent( ) raise HTTPException(**response.model_dump()) from e else: - conversation_id = agent.agent_id + # conversation_id = agent.agent_id + # pylint: enable=unexpected-keyword-arg,no-member logger.debug("New conversation ID: %s", conversation_id) session_id = await agent.create_session(get_suid()) logger.debug("New session ID: %s", session_id) - return agent, conversation_id, session_id + return agent, conversation_id, session_id # type: ignore[return-value] async def get_temp_agent( @@ -360,19 +374,23 @@ async def get_temp_agent( tuple[AsyncAgent, str]: A tuple containing the agent and session_id. 
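+        Note: the implementation currently returns (agent, session_id,
+        conversation_id), with conversation_id set to None for temporary agents.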
""" logger.debug("Creating temporary agent") + # pylint: disable=unexpected-keyword-arg,no-member agent = AsyncAgent( client, # type: ignore[arg-type] model=model_id, instructions=system_prompt, - enable_session_persistence=False, # Temporary agent doesn't need persistence + # type: ignore[call-arg] # Temporary agent doesn't need persistence + # enable_session_persistence=False, ) - await agent.initialize() + await agent.initialize() # type: ignore[attr-defined] # Generate new IDs for the temporary agent - conversation_id = agent.agent_id + # conversation_id = agent.agent_id + conversation_id = None + # pylint: enable=unexpected-keyword-arg,no-member session_id = await agent.create_session(get_suid()) - return agent, session_id, conversation_id + return agent, session_id, conversation_id # type: ignore[return-value] def create_rag_chunks_dict(summary: TurnSummary) -> list[dict[str, Any]]: diff --git a/src/utils/llama_stack_version.py b/src/utils/llama_stack_version.py index 55088da5e..4352b1d45 100644 --- a/src/utils/llama_stack_version.py +++ b/src/utils/llama_stack_version.py @@ -1,6 +1,7 @@ """Check if the Llama Stack version is supported by the LCS.""" import logging +import re from semver import Version @@ -57,7 +58,27 @@ def compare_versions(version_info: str, minimal: str, maximal: str) -> None: InvalidLlamaStackVersionException: If `version_info` is outside the inclusive range defined by `minimal` and `maximal`. """ - current_version = Version.parse(version_info) + version_pattern = r"\d+\.\d+\.\d+" + match = re.search(version_pattern, version_info) + if not match: + logger.warning( + "Failed to extract version pattern from '%s'. Skipping version check.", + version_info, + ) + raise InvalidLlamaStackVersionException( + f"Failed to extract version pattern from '{version_info}'. Skipping version check." + ) + + normalized_version = match.group(0) + + try: + current_version = Version.parse(normalized_version) + except ValueError as e: + logger.warning("Failed to parse Llama Stack version '%s'.", version_info) + raise InvalidLlamaStackVersionException( + f"Failed to parse Llama Stack version '{version_info}'." + ) from e + minimal_version = Version.parse(minimal) maximal_version = Version.parse(maximal) logger.debug("Current version: %s", current_version) diff --git a/src/utils/suid.py b/src/utils/suid.py index 4dc9ca5e8..0c5742e5c 100644 --- a/src/utils/suid.py +++ b/src/utils/suid.py @@ -20,19 +20,126 @@ def check_suid(suid: str) -> bool: """ Check if given string is a proper session ID. - Returns True if the string is a valid UUID, False otherwise. + Returns True if the string is a valid UUID or a llama-stack conversation ID. Parameters: - suid (str | bytes): UUID value to validate — accepts a UUID string or - its byte representation. + suid (str | bytes): UUID value to validate — accepts a UUID string, + its byte representation, or a llama-stack conversation ID (conv_xxx), + or a plain hex string (database format). Notes: - Validation is performed by attempting to construct uuid.UUID(suid); - invalid formats or types result in False. + Validation is performed by: + 1. For llama-stack conversation IDs starting with 'conv_': + - Strips the 'conv_' prefix + - Validates at least 32 hex characters follow (may have additional suffix) + - Extracts first 32 hex chars as the UUID part + - Converts to UUID format by inserting hyphens at standard positions + - Validates the resulting UUID structure + 2. 
For plain hex strings (database format, 32+ chars without conv_ prefix): + - Validates it's a valid hex string + - Extracts first 32 chars as UUID part + - Converts to UUID format and validates + 3. For standard UUIDs: attempts to construct uuid.UUID(suid) + Invalid formats or types result in False. """ try: - # accepts strings and bytes only + # Accept llama-stack conversation IDs (conv_ format) + if isinstance(suid, str) and suid.startswith("conv_"): + # Extract the hex string after 'conv_' + hex_part = suid[5:] # Remove 'conv_' prefix + + # Verify it's a valid hex string + # llama-stack may use 32 hex chars (UUID) or 36 hex chars (UUID + suffix) + if len(hex_part) < 32: + return False + + # Verify all characters are valid hex + try: + int(hex_part, 16) + except ValueError: + return False + + # Extract the first 32 hex characters (the UUID part) + uuid_hex = hex_part[:32] + + # Convert to UUID format with hyphens: 8-4-4-4-12 + uuid_str = ( + f"{uuid_hex[:8]}-{uuid_hex[8:12]}-{uuid_hex[12:16]}-" + f"{uuid_hex[16:20]}-{uuid_hex[20:]}" + ) + + # Validate it's a proper UUID + uuid.UUID(uuid_str) + return True + + # Check if it's a plain hex string (database format without conv_ prefix) + if isinstance(suid, str) and len(suid) >= 32: + try: + int(suid, 16) + # Extract the first 32 hex characters (the UUID part) + uuid_hex = suid[:32] + + # Convert to UUID format with hyphens: 8-4-4-4-12 + uuid_str = ( + f"{uuid_hex[:8]}-{uuid_hex[8:12]}-{uuid_hex[12:16]}-" + f"{uuid_hex[16:20]}-{uuid_hex[20:]}" + ) + + # Validate it's a proper UUID + uuid.UUID(uuid_str) + return True + except ValueError: + pass # Not a valid hex string, try standard UUID validation + + # accepts strings and bytes only for UUID validation uuid.UUID(suid) return True except (ValueError, TypeError): return False + + +def normalize_conversation_id(conversation_id: str) -> str: + """ + Normalize a conversation ID for database storage. + + Strips the 'conv_' prefix if present to store just the UUID part. + This keeps IDs shorter and database-agnostic. + + Args: + conversation_id: The conversation ID, possibly with 'conv_' prefix. + + Returns: + str: The normalized ID without 'conv_' prefix. + + Examples: + >>> normalize_conversation_id('conv_abc123') + 'abc123' + >>> normalize_conversation_id('550e8400-e29b-41d4-a716-446655440000') + '550e8400-e29b-41d4-a716-446655440000' + """ + if conversation_id.startswith("conv_"): + return conversation_id[5:] # Remove 'conv_' prefix + return conversation_id + + +def to_llama_stack_conversation_id(conversation_id: str) -> str: + """ + Convert a database conversation ID to llama-stack format. + + Adds the 'conv_' prefix if not already present. + + Args: + conversation_id: The conversation ID from database. + + Returns: + str: The conversation ID in llama-stack format (conv_xxx). 
+ + Examples: + >>> to_llama_stack_conversation_id('abc123') + 'conv_abc123' + >>> to_llama_stack_conversation_id('conv_abc123') + 'conv_abc123' + """ + if not conversation_id.startswith("conv_"): + return f"conv_{conversation_id}" + return conversation_id diff --git a/src/utils/token_counter.py b/src/utils/token_counter.py index 7c3853a8c..b14cf2ac2 100644 --- a/src/utils/token_counter.py +++ b/src/utils/token_counter.py @@ -7,7 +7,7 @@ from llama_stack.models.llama.datatypes import RawMessage from llama_stack.models.llama.llama3.chat_format import ChatFormat from llama_stack.models.llama.llama3.tokenizer import Tokenizer -from llama_stack_client.types.agents.turn import Turn +from llama_stack_client.types.alpha.agents.turn import Turn import metrics diff --git a/src/utils/transcripts.py b/src/utils/transcripts.py index 7dc41cb9f..551080ee9 100644 --- a/src/utils/transcripts.py +++ b/src/utils/transcripts.py @@ -85,6 +85,7 @@ def store_transcript( # pylint: disable=too-many-arguments,too-many-positional- "truncated": truncated, "attachments": [attachment.model_dump() for attachment in attachments], "tool_calls": [tc.model_dump() for tc in summary.tool_calls], + "tool_results": [tr.model_dump() for tr in summary.tool_results], } # stores feedback in a file under unique uuid diff --git a/src/utils/types.py b/src/utils/types.py index 36d8257f7..1585588a5 100644 --- a/src/utils/types.py +++ b/src/utils/types.py @@ -2,16 +2,43 @@ from typing import Any, Optional import json -from llama_stack_client.lib.agents.event_logger import interleaved_content_as_str from llama_stack_client.lib.agents.tool_parser import ToolParser -from llama_stack_client.types.shared.completion_message import CompletionMessage -from llama_stack_client.types.shared.tool_call import ToolCall -from llama_stack_client.types.tool_execution_step import ToolExecutionStep +from llama_stack_client.lib.agents.types import ( + CompletionMessage as AgentCompletionMessage, + ToolCall as AgentToolCall, +) +from llama_stack_client.types.shared.interleaved_content_item import ( + TextContentItem, + ImageContentItem, +) +from llama_stack_client.types.alpha.tool_execution_step import ToolExecutionStep from pydantic import BaseModel -from models.responses import RAGChunk +from pydantic import Field from constants import DEFAULT_RAG_TOOL +def content_to_str(content: Any) -> str: + """Convert content (str, TextContentItem, ImageContentItem, or list) to string. + + Args: + content: Content to convert to string. + + Returns: + str: String representation of the content. + """ + if content is None: + return "" + if isinstance(content, str): + return content + if isinstance(content, TextContentItem): + return content.text + if isinstance(content, ImageContentItem): + return "" + if isinstance(content, list): + return " ".join(content_to_str(item) for item in content) + return str(content) + + class Singleton(type): """Metaclass for Singleton support.""" @@ -33,16 +60,18 @@ def __call__(cls, *args, **kwargs): # type: ignore class GraniteToolParser(ToolParser): """Workaround for 'tool_calls' with granite models.""" - def get_tool_calls(self, output_message: CompletionMessage) -> list[ToolCall]: + def get_tool_calls( + self, output_message: AgentCompletionMessage + ) -> list[AgentToolCall]: """ Return the `tool_calls` list from a CompletionMessage, or an empty list if none are present. 
Parameters: - output_message (CompletionMessage | None): Completion + output_message (AgentCompletionMessage | None): Completion message potentially containing `tool_calls`. Returns: - list[ToolCall]: The list of tool call entries + list[AgentToolCall]: The list of tool call entries extracted from `output_message`, or an empty list. """ if output_message and output_message.tool_calls: @@ -71,19 +100,36 @@ def get_parser(model_id: str) -> Optional[ToolParser]: class ToolCallSummary(BaseModel): - """Represents a tool call for data collection. + """Model representing a tool call made during response generation (for tool_calls list).""" - Use our own tool call model to keep things consistent across llama - upgrades or if we used something besides llama in the future. - """ + id: str = Field(description="ID of the tool call") + name: str = Field(description="Name of the tool called") + args: dict[str, Any] = Field( + default_factory=dict, description="Arguments passed to the tool" + ) + type: str = Field("tool_call", description="Type indicator for tool call") + + +class ToolResultSummary(BaseModel): + """Model representing a result from a tool call (for tool_results list).""" - # ID of the call itself - id: str - # Name of the tool used - name: str - # Arguments to the tool call - args: str | dict[Any, Any] - response: str | None + id: str = Field( + description="ID of the tool call/result, matches the corresponding tool call 'id'" + ) + status: str = Field( + ..., description="Status of the tool execution (e.g., 'success')" + ) + content: Any = Field(..., description="Content/result returned from the tool") + type: str = Field("tool_result", description="Type indicator for tool result") + round: int = Field(..., description="Round number or step of tool execution") + + +class RAGChunk(BaseModel): + """Model representing a RAG chunk used in the response.""" + + content: str = Field(description="The content of the chunk") + source: str | None = Field(None, description="Source document or URL") + score: float | None = Field(None, description="Relevance score") class TurnSummary(BaseModel): @@ -91,7 +137,8 @@ class TurnSummary(BaseModel): llm_response: str tool_calls: list[ToolCallSummary] - rag_chunks: list[RAGChunk] = [] + tool_results: list[ToolResultSummary] + rag_chunks: list[RAGChunk] def append_tool_calls_from_llama(self, tec: ToolExecutionStep) -> None: """Append the tool calls from a llama tool execution step.""" @@ -99,19 +146,29 @@ def append_tool_calls_from_llama(self, tec: ToolExecutionStep) -> None: responses_by_id = {tc.call_id: tc for tc in tec.tool_responses} for call_id, tc in calls_by_id.items(): resp = responses_by_id.get(call_id) - response_content = ( - interleaved_content_as_str(resp.content) if resp else None - ) + response_content = content_to_str(resp.content) if resp else None self.tool_calls.append( ToolCallSummary( id=call_id, name=tc.tool_name, - args=tc.arguments, - response=response_content, + args=( + tc.arguments + if isinstance(tc.arguments, dict) + else {"args": str(tc.arguments)} + ), + type="tool_call", + ) + ) + self.tool_results.append( + ToolResultSummary( + id=call_id, + status="success" if resp else "failure", + content=response_content, + type="tool_result", + round=1, # clarify meaning of this attribute ) ) - # Extract RAG chunks from knowledge_search tool responses if tc.tool_name == DEFAULT_RAG_TOOL and resp and response_content: self._extract_rag_chunks_from_response(response_content) diff --git a/test.containerfile b/test.containerfile index 
4cc99456d..5b8140064 100644 --- a/test.containerfile +++ b/test.containerfile @@ -1,5 +1,5 @@ # Custom Red Hat llama-stack image with missing dependencies -FROM quay.io/opendatahub/llama-stack:rhoai-v2.25-latest +FROM quay.io/opendatahub/llama-stack:rhoai-v3.0-latest # Install missing dependencies and create required directories USER root diff --git a/tests/configuration/minimal-stack.yaml b/tests/configuration/minimal-stack.yaml index ab1ff78c9..9f4ea1491 100644 --- a/tests/configuration/minimal-stack.yaml +++ b/tests/configuration/minimal-stack.yaml @@ -5,3 +5,26 @@ external_providers_dir: /tmp apis: [] providers: {} +storage: + backends: + kv_default: + type: kv_sqlite + db_path: '/tmp/test_llama_stack_kv.db' + sql_default: + type: sql_sqlite + db_path: '/tmp/test_llama_stack_sql.db' + stores: + metadata: + namespace: registry + backend: kv_default + inference: + table_name: inference_store + backend: sql_default + max_write_queue_size: 10000 + num_writers: 4 + conversations: + table_name: openai_conversations + backend: sql_default + prompts: + namespace: prompts + backend: kv_default diff --git a/tests/e2e/configs/run-azure.yaml b/tests/e2e/configs/run-azure.yaml index 533ad057d..6c57a5791 100644 --- a/tests/e2e/configs/run-azure.yaml +++ b/tests/e2e/configs/run-azure.yaml @@ -1,131 +1,137 @@ -version: '2' -image_name: minimal-viable-llama-stack-configuration +version: 2 apis: - - agents - - datasetio - - eval - - files - - inference - - post_training - - safety - - scoring - - telemetry - - tool_runtime - - vector_io +- agents +- batches +- datasetio +- eval +- files +- inference +- safety +- scoring +- telemetry +- tool_runtime +- vector_io + benchmarks: [] -container_image: null +conversations_store: + db_path: /tmp/conversations.db + type: sqlite datasets: [] +image_name: starter external_providers_dir: /opt/app-root/src/.llama/providers.d inference_store: - db_path: .llama/distributions/ollama/inference_store.db + db_path: /tmp/inference_store.db type: sqlite -logging: null metadata_store: - db_path: .llama/distributions/ollama/registry.db - namespace: null + db_path: /tmp/registry.db type: sqlite + +models: +- model_id: sentence-transformers/all-mpnet-base-v2 + model_type: embedding + provider_id: sentence-transformers + provider_model_id: sentence-transformers/all-mpnet-base-v2 + metadata: + embedding_dimension: 768 +- model_id: gpt-4o-mini + provider_id: azure + model_type: llm + provider_model_id: gpt-4o-mini + providers: - files: - - provider_id: localfs - provider_type: inline::localfs - config: - storage_dir: /tmp/llama-stack-files - metadata_store: - type: sqlite - db_path: .llama/distributions/ollama/files_metadata.db agents: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: + - config: persistence_store: - db_path: .llama/distributions/ollama/agents_store.db - namespace: null + db_path: /tmp/agents_store.db type: sqlite responses_store: - db_path: .llama/distributions/ollama/responses_store.db + db_path: /tmp/responses_store.db + type: sqlite + provider_id: meta-reference + provider_type: inline::meta-reference + batches: + - config: + kvstore: + db_path: /tmp/batches.db type: sqlite + provider_id: reference + provider_type: inline::reference datasetio: - - provider_id: huggingface + - config: + kvstore: + db_path: /tmp/huggingface_datasetio.db + type: sqlite + provider_id: huggingface provider_type: remote::huggingface - config: + - config: kvstore: - db_path: .llama/distributions/ollama/huggingface_datasetio.db - namespace: null + 
db_path: /tmp/localfs_datasetio.db type: sqlite - - provider_id: localfs + provider_id: localfs provider_type: inline::localfs - config: + eval: + - config: kvstore: - db_path: .llama/distributions/ollama/localfs_datasetio.db - namespace: null + db_path: /tmp/meta_reference_eval.db type: sqlite - eval: - - provider_id: meta-reference + provider_id: meta-reference provider_type: inline::meta-reference - config: - kvstore: - db_path: .llama/distributions/ollama/meta_reference_eval.db - namespace: null + files: + - config: + metadata_store: + db_path: /tmp/files_metadata.db type: sqlite + storage_dir: /tmp/files + provider_id: meta-reference-files + provider_type: inline::localfs inference: - - provider_id: azure - provider_type: remote::azure - config: - api_key: ${env.AZURE_API_KEY} - api_base: https://ols-test.openai.azure.com/ - api_version: 2024-02-15-preview - api_type: ${env.AZURE_API_TYPE:=} - post_training: - - provider_id: huggingface - provider_type: inline::huggingface-gpu + - provider_id: openai + provider_type: remote::openai config: - checkpoint_format: huggingface - device: cpu - distributed_backend: null - dpo_output_dir: "." + api_key: ${env.OPENAI_API_KEY} + - config: {} + provider_id: sentence-transformers + provider_type: inline::sentence-transformers + - provider_id: azure + provider_type: remote::azure + config: + api_key: ${env.AZURE_API_KEY} + api_base: https://ols-test.openai.azure.com/ + api_version: 2024-02-15-preview safety: - - provider_id: llama-guard - provider_type: inline::llama-guard - config: + - config: excluded_categories: [] + provider_id: llama-guard + provider_type: inline::llama-guard scoring: - - provider_id: basic + - config: {} + provider_id: basic provider_type: inline::basic - config: {} - - provider_id: llm-as-judge + - config: {} + provider_id: llm-as-judge provider_type: inline::llm-as-judge - config: {} - - provider_id: braintrust - provider_type: inline::braintrust - config: - openai_api_key: '********' telemetry: - - provider_id: meta-reference + - config: + service_name: "\u200B" + provider_id: meta-reference provider_type: inline::meta-reference - config: - service_name: 'lightspeed-stack-telemetry' - sinks: sqlite - sqlite_db_path: .llama/distributions/ollama/trace_store.db tool_runtime: - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} + - config: {} + provider_id: rag-runtime + provider_type: inline::rag-runtime + vector_io: + - config: + kvstore: + db_path: /tmp/faiss_store.db + type: sqlite + provider_id: faiss + provider_type: inline::faiss scoring_fns: [] server: - auth: null - host: null port: 8321 - quota: null - tls_cafile: null - tls_certfile: null - tls_keyfile: null -shields: - - shield_id: llama-guard-shield - provider_id: llama-guard - provider_shield_id: "gpt-4o-mini" -models: - - model_id: gpt-4o-mini - model_type: llm - provider_id: azure - provider_model_id: gpt-4o-mini \ No newline at end of file +shields: [] +tool_groups: +- provider_id: rag-runtime + toolgroup_id: builtin::rag +vector_dbs: [] \ No newline at end of file diff --git a/tests/e2e/configs/run-ci.yaml b/tests/e2e/configs/run-ci.yaml index 30135ffaa..4a7495e6a 100644 --- a/tests/e2e/configs/run-ci.yaml +++ b/tests/e2e/configs/run-ci.yaml @@ -1,89 +1,98 @@ -version: '2' -image_name: minimal-viable-llama-stack-configuration +version: 2 apis: - - agents - - datasetio - - eval - - files - - inference - - post_training - - safety - - scoring - - telemetry - - tool_runtime - - vector_io +- agents +- batches +- 
datasetio +- eval +- files +- inference +- safety +- scoring +- telemetry +- tool_runtime +- vector_io + benchmarks: [] -container_image: null +conversations_store: + db_path: /tmp/conversations.db + type: sqlite datasets: [] +image_name: starter external_providers_dir: /opt/app-root/src/.llama/providers.d inference_store: - db_path: .llama/distributions/ollama/inference_store.db + db_path: /tmp/inference_store.db type: sqlite -logging: null metadata_store: - db_path: .llama/distributions/ollama/registry.db - namespace: null + db_path: /tmp/registry.db type: sqlite + +models: +- model_id: sentence-transformers/all-mpnet-base-v2 + model_type: embedding + provider_id: sentence-transformers + provider_model_id: sentence-transformers/all-mpnet-base-v2 + metadata: + embedding_dimension: 768 +- model_id: gpt-4o-mini + provider_id: openai + model_type: llm + provider_model_id: gpt-4o-mini + providers: - files: - - config: - storage_dir: /tmp/llama-stack-files - metadata_store: - type: sqlite - db_path: .llama/distributions/ollama/files_metadata.db - provider_id: localfs - provider_type: inline::localfs agents: - config: persistence_store: - db_path: .llama/distributions/ollama/agents_store.db - namespace: null + db_path: /tmp/agents_store.db type: sqlite responses_store: - db_path: .llama/distributions/ollama/responses_store.db + db_path: /tmp/responses_store.db type: sqlite provider_id: meta-reference provider_type: inline::meta-reference + batches: + - config: + kvstore: + db_path: /tmp/batches.db + type: sqlite + provider_id: reference + provider_type: inline::reference datasetio: - config: kvstore: - db_path: .llama/distributions/ollama/huggingface_datasetio.db - namespace: null + db_path: /tmp/huggingface_datasetio.db type: sqlite provider_id: huggingface provider_type: remote::huggingface - config: kvstore: - db_path: .llama/distributions/ollama/localfs_datasetio.db - namespace: null + db_path: /tmp/localfs_datasetio.db type: sqlite provider_id: localfs provider_type: inline::localfs eval: - config: kvstore: - db_path: .llama/distributions/ollama/meta_reference_eval.db - namespace: null + db_path: /tmp/meta_reference_eval.db type: sqlite provider_id: meta-reference provider_type: inline::meta-reference - inference: - - provider_id: sentence-transformers # Can be any embedding provider - provider_type: inline::sentence-transformers - config: {} - - provider_id: openai - provider_type: remote::openai - config: - api_key: ${env.OPENAI_API_KEY} - post_training: + files: - config: - checkpoint_format: huggingface - device: cpu - distributed_backend: null - dpo_output_dir: "." 
- provider_id: huggingface - provider_type: inline::huggingface-gpu + metadata_store: + db_path: /tmp/files_metadata.db + type: sqlite + storage_dir: /tmp/files + provider_id: meta-reference-files + provider_type: inline::localfs + inference: + - provider_id: openai + provider_type: remote::openai + config: + api_key: ${env.OPENAI_API_KEY} + - config: {} + provider_id: sentence-transformers + provider_type: inline::sentence-transformers safety: - config: excluded_categories: [] @@ -96,62 +105,27 @@ providers: - config: {} provider_id: llm-as-judge provider_type: inline::llm-as-judge - - config: - openai_api_key: '********' - provider_id: braintrust - provider_type: inline::braintrust telemetry: - config: - service_name: 'lightspeed-stack-telemetry' - sinks: sqlite - sqlite_db_path: .llama/distributions/ollama/trace_store.db + service_name: "\u200B" provider_id: meta-reference provider_type: inline::meta-reference tool_runtime: - - provider_id: model-context-protocol - provider_type: remote::model-context-protocol - config: {} - - provider_id: rag-runtime - provider_type: inline::rag-runtime - config: {} + - config: {} + provider_id: rag-runtime + provider_type: inline::rag-runtime vector_io: - config: kvstore: - db_path: .llama/distributions/ollama/faiss_store.db # Location of vector database - namespace: null + db_path: /tmp/faiss_store.db type: sqlite provider_id: faiss - provider_type: inline::faiss # Or preferred vector DB + provider_type: inline::faiss scoring_fns: [] server: - auth: null - host: null port: 8321 - quota: null - tls_cafile: null - tls_certfile: null - tls_keyfile: null -shields: - - shield_id: llama-guard-shield - provider_id: llama-guard - provider_shield_id: ${env.E2E_OPENAI_MODEL} -vector_dbs: - - vector_db_id: my_knowledge_base - embedding_model: sentence-transformers/all-mpnet-base-v2 - embedding_dimension: 768 - provider_id: faiss -models: - - metadata: - embedding_dimension: 768 # Depends on chosen model - model_id: sentence-transformers/all-mpnet-base-v2 # Example embedding model - provider_id: sentence-transformers - provider_model_id: sentence-transformers/all-mpnet-base-v2 # Location of embedding model - model_type: embedding - - model_id: ${env.E2E_OPENAI_MODEL} - provider_id: openai - model_type: llm - provider_model_id: ${env.E2E_OPENAI_MODEL} - +shields: [] tool_groups: - - toolgroup_id: builtin::rag - provider_id: rag-runtime +- provider_id: rag-runtime + toolgroup_id: builtin::rag +vector_dbs: [] \ No newline at end of file diff --git a/tests/e2e/configs/run-library.yaml b/tests/e2e/configs/run-library.yaml new file mode 100644 index 000000000..5e46ee6e9 --- /dev/null +++ b/tests/e2e/configs/run-library.yaml @@ -0,0 +1,155 @@ +version: 2 + +apis: +- agents +- batches +- datasetio +- eval +- files +- inference +- safety +- scoring +- tool_runtime +- vector_io + +benchmarks: [] +conversations_store: + db_path: /tmp/conversations.db + type: sqlite +datasets: [] +image_name: starter +# external_providers_dir: /opt/app-root/src/.llama/providers.d +inference_store: + db_path: /tmp/inference_store.db + type: sqlite +metadata_store: + db_path: /tmp/registry.db + type: sqlite + +models: +- model_id: sentence-transformers/all-mpnet-base-v2 + model_type: embedding + provider_id: sentence-transformers + provider_model_id: sentence-transformers/all-mpnet-base-v2 + metadata: + embedding_dimension: 768 +# - model_id: gpt-4o-mini +# provider_id: openai +# model_type: llm +# provider_model_id: gpt-4o-mini + +providers: + agents: + - config: + persistence: + 
agent_state: + namespace: agents_state + backend: kv_default + responses: + table_name: agents_responses + backend: sql_default + provider_id: meta-reference + provider_type: inline::meta-reference + batches: + - config: + kvstore: + namespace: batches_store + backend: kv_default + provider_id: reference + provider_type: inline::reference + datasetio: + - config: + kvstore: + namespace: huggingface_datasetio + backend: kv_default + provider_id: huggingface + provider_type: remote::huggingface + - config: + kvstore: + namespace: localfs_datasetio + backend: kv_default + provider_id: localfs + provider_type: inline::localfs + eval: + - config: + kvstore: + namespace: eval_store + backend: kv_default + provider_id: meta-reference + provider_type: inline::meta-reference + files: + - config: + metadata_store: + table_name: files_metadata + backend: sql_default + storage_dir: /tmp/files + provider_id: meta-reference-files + provider_type: inline::localfs + inference: + - provider_id: openai + provider_type: remote::openai + config: + api_key: ${env.OPENAI_API_KEY} + - config: {} + provider_id: sentence-transformers + provider_type: inline::sentence-transformers + safety: + - config: + excluded_categories: [] + provider_id: llama-guard + provider_type: inline::llama-guard + scoring: + - config: {} + provider_id: basic + provider_type: inline::basic + - config: {} + provider_id: llm-as-judge + provider_type: inline::llm-as-judge + # telemetry: + # - config: + # service_name: "​" + # provider_id: meta-reference + # provider_type: inline::meta-reference + tool_runtime: + - config: {} + provider_id: rag-runtime + provider_type: inline::rag-runtime + vector_io: + - config: + persistence: + namespace: faiss_store + backend: kv_default + provider_id: faiss + provider_type: inline::faiss +scoring_fns: [] +server: + port: 8321 +shields: [] +tool_groups: +- provider_id: rag-runtime + toolgroup_id: builtin::rag +vector_dbs: [] +storage: + backends: + kv_default: + type: kv_sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/kv_store.db + sql_default: + type: sql_sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/sql_store.db + stores: + metadata: + namespace: registry + backend: kv_default + inference: + table_name: inference_store + backend: sql_default + max_write_queue_size: 10000 + num_writers: 4 + conversations: + table_name: openai_conversations + backend: sql_default + prompts: + namespace: prompts + backend: kv_default + diff --git a/tests/e2e/configuration/library-mode/lightspeed-stack-auth-noop-token.yaml b/tests/e2e/configuration/library-mode/lightspeed-stack-auth-noop-token.yaml index c4f53338a..777421f7c 100644 --- a/tests/e2e/configuration/library-mode/lightspeed-stack-auth-noop-token.yaml +++ b/tests/e2e/configuration/library-mode/lightspeed-stack-auth-noop-token.yaml @@ -15,6 +15,12 @@ user_data_collection: transcripts_enabled: true transcripts_storage: "/tmp/data/transcripts" +# Conversation cache for storing Q&A history +conversation_cache: + type: "sqlite" + sqlite: + db_path: "/tmp/data/conversation-cache.db" + authentication: module: "noop-with-token" diff --git a/tests/e2e/configuration/library-mode/lightspeed-stack-no-cache.yaml b/tests/e2e/configuration/library-mode/lightspeed-stack-no-cache.yaml new file mode 100644 index 000000000..d8a0214df --- /dev/null +++ b/tests/e2e/configuration/library-mode/lightspeed-stack-no-cache.yaml @@ -0,0 +1,22 @@ +name: Lightspeed Core Service (LCS) +service: + host: 0.0.0.0 + port: 8080 + auth_enabled: 
false + workers: 1 + color_log: true + access_log: true +llama_stack: + use_as_library_client: true + library_client_config_path: run.yaml +user_data_collection: + feedback_enabled: true + feedback_storage: "/tmp/data/feedback" + transcripts_enabled: true + transcripts_storage: "/tmp/data/transcripts" + +# NO conversation_cache configured - for testing error handling + +authentication: + module: "noop-with-token" + diff --git a/tests/e2e/configuration/lightspeed-stack-no-cache.yaml b/tests/e2e/configuration/server-mode/lightspeed-stack-no-cache.yaml similarity index 100% rename from tests/e2e/configuration/lightspeed-stack-no-cache.yaml rename to tests/e2e/configuration/server-mode/lightspeed-stack-no-cache.yaml diff --git a/tests/e2e/features/conversation_cache_v2.feature b/tests/e2e/features/conversation_cache_v2.feature index 3e9d53a5b..efc0ba601 100644 --- a/tests/e2e/features/conversation_cache_v2.feature +++ b/tests/e2e/features/conversation_cache_v2.feature @@ -212,6 +212,7 @@ Feature: Conversation Cache V2 API tests @NoCacheConfig Scenario: Check conversations/{conversation_id} fails when cache not configured Given REST API service prefix is /v2 + And An invalid conversation cache path is configured And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva When I access REST API endpoint "conversations" using HTTP GET method Then The status code of the response is 500 @@ -280,8 +281,11 @@ Feature: Conversation Cache V2 API tests Given REST API service prefix is /v2 And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva When I use REST API conversation endpoint with conversation_id "12345678-abcd-0000-0123-456789abcdef" using HTTP DELETE method - Then The status code of the response is 404 - And The body of the response contains Conversation not found + Then The status code of the response is 200 + And The body of the response, ignoring the "conversation_id" field, is the following + """ + {"success": true, "response": "Conversation cannot be deleted"} + """ @skip-in-library-mode Scenario: V2 conversations DELETE endpoint works even when llama-stack is down diff --git a/tests/e2e/features/conversations.feature b/tests/e2e/features/conversations.feature index 9a82f9fbc..0fecb0510 100644 --- a/tests/e2e/features/conversations.feature +++ b/tests/e2e/features/conversations.feature @@ -175,8 +175,11 @@ Feature: conversations endpoint API tests Given The system is in default state And I set the Authorization header to Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva When I use REST API conversation endpoint with conversation_id "12345678-abcd-0000-0123-456789abcdef" using HTTP DELETE method - Then The status code of the response is 404 - And The body of the response contains Conversation not found + Then The status code of the response is 200 + And The body of the response, ignoring the "conversation_id" field, is the following + """ + {"success": true, "response": "Conversation cannot be deleted"} + """ @skip-in-library-mode Scenario: Check if conversations/{conversation_id} DELETE endpoint fails when llama-stack is unavailable diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index 987c4e73e..f7f366998 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -95,11 +95,6 @@ def before_scenario(context: Context, scenario: Scenario) -> None: 
context.scenario_config = ( f"tests/e2e/configuration/{mode_dir}/lightspeed-stack-no-cache.yaml" ) - # Switch config and restart immediately - switch_config( - context.scenario_config - ) # Copies to default lightspeed-stack.yaml - restart_container("lightspeed-stack") def after_scenario(context: Context, scenario: Scenario) -> None: @@ -125,7 +120,7 @@ def after_scenario(context: Context, scenario: Scenario) -> None: # Wait for the service to be healthy print("Restoring Llama Stack connection...") - time.sleep(5) + time.sleep(20) # Check if it's healthy for attempt in range(6): # Try for 30 seconds diff --git a/tests/e2e/features/info.feature b/tests/e2e/features/info.feature index 1a45153a3..ffbf7c7a3 100644 --- a/tests/e2e/features/info.feature +++ b/tests/e2e/features/info.feature @@ -16,7 +16,7 @@ Feature: Info tests When I access REST API endpoint "info" using HTTP GET method Then The status code of the response is 200 And The body of the response has proper name Lightspeed Core Service (LCS) and version 0.3.0 - And The body of the response has llama-stack version 0.2.22 + And The body of the response has llama-stack version 0.3.0 @skip-in-library-mode Scenario: Check if info endpoint reports error when llama-stack connection is not working diff --git a/tests/e2e/features/steps/conversation.py b/tests/e2e/features/steps/conversation.py index 4fa20921d..4dfee170f 100644 --- a/tests/e2e/features/steps/conversation.py +++ b/tests/e2e/features/steps/conversation.py @@ -1,10 +1,15 @@ """Implementation of common test steps.""" import json -from behave import step, when, then # pyright: ignore[reportAttributeAccessIssue] +from behave import ( + step, + when, + then, + given, +) # pyright: ignore[reportAttributeAccessIssue] from behave.runner import Context import requests -from tests.e2e.utils.utils import replace_placeholders +from tests.e2e.utils.utils import replace_placeholders, restart_container, switch_config # default timeout for HTTP operations DEFAULT_TIMEOUT = 10 @@ -341,3 +346,10 @@ def check_conversation_model_provider( assert ( actual_provider == expected_provider ), f"Turn {idx} expected provider '{expected_provider}', got '{actual_provider}'" + + +@given("An invalid conversation cache path is configured") # type: ignore +def configure_invalid_conversation_cache_path(context: Context) -> None: + """Set an invalid conversation cache path and restart the container.""" + switch_config(context.scenario_config) + restart_container("lightspeed-stack") diff --git a/tests/e2e/features/steps/info.py b/tests/e2e/features/steps/info.py index f3a1251cd..e2d1ff646 100644 --- a/tests/e2e/features/steps/info.py +++ b/tests/e2e/features/steps/info.py @@ -1,6 +1,7 @@ """Implementation of common test steps.""" import json +import re from behave import then # pyright: ignore[reportAttributeAccessIssue] from behave.runner import Context @@ -23,9 +24,15 @@ def check_llama_version(context: Context, llama_version: str) -> None: response_json = context.response.json() assert response_json is not None, "Response is not valid JSON" + version_pattern = r"\d+\.\d+\.\d+" + llama_stack_version = response_json["llama_stack_version"] + match = re.search(version_pattern, llama_stack_version) + assert match is not None, f"Could not extract version from {llama_stack_version}" + extracted_version = match.group(0) + assert ( - response_json["llama_stack_version"] == llama_version - ), f"llama-stack version is {response_json["llama_stack_version"]}" + extracted_version == llama_version + ), f"llama-stack version 
is {extracted_version}, expected {llama_version}" @then("The body of the response has proper model structure") diff --git a/tests/e2e/test_list.txt b/tests/e2e/test_list.txt index 9d7cd0c8b..2a62eaf6c 100644 --- a/tests/e2e/test_list.txt +++ b/tests/e2e/test_list.txt @@ -2,6 +2,7 @@ features/smoketests.feature features/authorized_noop.feature features/authorized_noop_token.feature features/conversations.feature +features/conversation_cache_v2.feature features/feedback.feature features/health.feature features/info.feature diff --git a/tests/integration/endpoints/test_query_v2_integration.py b/tests/integration/endpoints/test_query_v2_integration.py index 626db35b1..5091ec61f 100644 --- a/tests/integration/endpoints/test_query_v2_integration.py +++ b/tests/integration/endpoints/test_query_v2_integration.py @@ -81,6 +81,12 @@ def mock_llama_stack_client_fixture( mock_vector_stores_response.data = [] mock_client.vector_stores.list.return_value = mock_vector_stores_response + # Mock conversations.create for new conversation creation + # Returns ID in llama-stack format (conv_ prefix + 48 hex chars) + mock_conversation = mocker.MagicMock() + mock_conversation.id = "conv_" + "a" * 48 # conv_aaa...aaa (proper format) + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) + # Mock version info mock_client.inspect.version.return_value = VersionInfo(version="0.2.22") @@ -159,7 +165,8 @@ async def test_query_v2_endpoint_successful_response( # Verify response structure assert response.conversation_id is not None - assert response.conversation_id == "response-123" + # Conversation ID is normalized (without conv_ prefix) from conversations.create() + assert response.conversation_id == "a" * 48 assert "Ansible" in response.response assert response.response == "This is a test response about Ansible." 
assert response.input_tokens >= 0 @@ -304,7 +311,6 @@ async def test_query_v2_endpoint_with_attachments( # ========================================== -@pytest.mark.skip(reason="LCORE-1025: ToolCallSummary.response type mismatch") @pytest.mark.asyncio async def test_query_v2_endpoint_with_tool_calls( test_config: AppConfig, @@ -337,13 +343,15 @@ async def test_query_v2_endpoint_with_tool_calls( mock_tool_output.id = "call-1" mock_tool_output.queries = ["What is Ansible"] mock_tool_output.status = "completed" - mock_tool_output.results = [ - mocker.MagicMock( - file_id="doc-1", - filename="ansible-docs.txt", - score=0.95, - ) - ] + mock_result = mocker.MagicMock() + mock_result.file_id = "doc-1" + mock_result.filename = "ansible-docs.txt" + mock_result.score = 0.95 + mock_result.attributes = { + "doc_url": "https://example.com/ansible-docs.txt", + "link": "https://example.com/ansible-docs.txt", + } + mock_tool_output.results = [mock_result] mock_message_output = mocker.MagicMock() mock_message_output.type = "message" @@ -366,10 +374,7 @@ async def test_query_v2_endpoint_with_tool_calls( assert response.tool_calls is not None assert len(response.tool_calls) > 0 - assert response.tool_calls[0].tool_name == "knowledge_search" - - if response.rag_chunks: - assert len(response.rag_chunks) > 0 + assert response.tool_calls[0].name == "knowledge_search" @pytest.mark.asyncio @@ -433,10 +438,9 @@ async def test_query_v2_endpoint_with_mcp_list_tools( assert response.tool_calls is not None assert len(response.tool_calls) == 1 - assert response.tool_calls[0].tool_name == "mcp_list_tools" + assert response.tool_calls[0].name == "mcp_list_tools" -@pytest.mark.skip(reason="LCORE-1025: ToolCallSummary.response type mismatch") @pytest.mark.asyncio async def test_query_v2_endpoint_with_multiple_tool_types( test_config: AppConfig, @@ -501,7 +505,7 @@ async def test_query_v2_endpoint_with_multiple_tool_types( # Verify response includes multiple tool calls assert response.tool_calls is not None assert len(response.tool_calls) == 2 - tool_names = [tc.tool_name for tc in response.tool_calls] + tool_names = [tc.name for tc in response.tool_calls] assert "knowledge_search" in tool_names or "file_search" in tool_names assert "calculate" in tool_names @@ -1198,6 +1202,7 @@ async def test_query_v2_endpoint_transcript_behavior( test_request: Request, test_auth: AuthTuple, patch_db_session: Session, + mocker: MockerFixture, ) -> None: """Test transcript storage behavior based on configuration. 
@@ -1213,9 +1218,13 @@ async def test_query_v2_endpoint_transcript_behavior( test_request: FastAPI request test_auth: noop authentication tuple patch_db_session: Test database session + mocker: pytest-mock fixture """ _ = mock_llama_stack_client + # Mock store_transcript to prevent file creation + mocker.patch("app.endpoints.query.store_transcript") + test_config.user_data_collection_configuration.transcripts_enabled = True query_request_enabled = QueryRequest( diff --git a/tests/integration/test_openapi_json.py b/tests/integration/test_openapi_json.py index a102ccff4..a81afecf2 100644 --- a/tests/integration/test_openapi_json.py +++ b/tests/integration/test_openapi_json.py @@ -169,7 +169,7 @@ def test_servers_section_present_from_url(spec_from_url: dict[str, Any]) -> None ( "/v1/conversations/{conversation_id}", "delete", - {"200", "400", "401", "403", "404", "500", "503"}, + {"200", "400", "401", "403", "500", "503"}, ), ("/v2/conversations", "get", {"200", "401", "403", "500"}), ( @@ -180,7 +180,7 @@ def test_servers_section_present_from_url(spec_from_url: dict[str, Any]) -> None ( "/v2/conversations/{conversation_id}", "delete", - {"200", "400", "401", "403", "404", "500"}, + {"200", "400", "401", "403", "500"}, ), ( "/v2/conversations/{conversation_id}", @@ -239,7 +239,7 @@ def test_paths_and_responses_exist_from_file( ( "/v1/conversations/{conversation_id}", "delete", - {"200", "400", "401", "403", "404", "500", "503"}, + {"200", "400", "401", "403", "500", "503"}, ), ("/v2/conversations", "get", {"200", "401", "403", "500"}), ( @@ -250,7 +250,7 @@ def test_paths_and_responses_exist_from_file( ( "/v2/conversations/{conversation_id}", "delete", - {"200", "400", "401", "403", "404", "500"}, + {"200", "400", "401", "403", "500"}, ), ( "/v2/conversations/{conversation_id}", diff --git a/tests/unit/app/endpoints/test_conversations_v2.py b/tests/unit/app/endpoints/test_conversations_v2.py index 1d44a3695..d52db81c9 100644 --- a/tests/unit/app/endpoints/test_conversations_v2.py +++ b/tests/unit/app/endpoints/test_conversations_v2.py @@ -567,16 +567,18 @@ async def test_conversation_not_found( mock_authorization_resolvers(mocker) mocker.patch("app.endpoints.conversations_v2.configuration", mock_configuration) mocker.patch("app.endpoints.conversations_v2.check_suid", return_value=True) - mock_configuration.conversation_cache.list.return_value = [] + mock_configuration.conversation_cache.delete.return_value = False - with pytest.raises(HTTPException) as exc_info: - await delete_conversation_endpoint_handler( - request=mocker.Mock(), - conversation_id=VALID_CONVERSATION_ID, - auth=MOCK_AUTH, - ) + response = await delete_conversation_endpoint_handler( + request=mocker.Mock(), + conversation_id=VALID_CONVERSATION_ID, + auth=MOCK_AUTH, + ) - assert exc_info.value.status_code == status.HTTP_404_NOT_FOUND + assert response is not None + assert response.conversation_id == VALID_CONVERSATION_ID + assert response.success is True + assert response.response == "Conversation cannot be deleted" @pytest.mark.asyncio async def test_successful_deletion( diff --git a/tests/unit/app/endpoints/test_query.py b/tests/unit/app/endpoints/test_query.py index 54b46a3c8..b12deea47 100644 --- a/tests/unit/app/endpoints/test_query.py +++ b/tests/unit/app/endpoints/test_query.py @@ -11,11 +11,11 @@ from fastapi import HTTPException, Request, status from litellm.exceptions import RateLimitError from llama_stack_client import APIConnectionError -from llama_stack_client.types import UserMessage -from 
llama_stack_client.types.agents.turn import Turn +from llama_stack_client.types import UserMessage # type: ignore +from llama_stack_client.types.alpha.agents.turn import Turn from llama_stack_client.types.shared.interleaved_content_item import TextContentItem -from llama_stack_client.types.tool_execution_step import ToolExecutionStep -from llama_stack_client.types.tool_response import ToolResponse +from llama_stack_client.types.alpha.tool_execution_step import ToolExecutionStep +from llama_stack_client.types.alpha.tool_response import ToolResponse from pydantic import AnyUrl from pytest_mock import MockerFixture @@ -218,10 +218,12 @@ async def _test_query_endpoint_handler( ToolCallSummary( id="123", name="test-tool", - args="testing", - response="tool response", + args={"query": "testing"}, + type="tool_call", ) ], + tool_results=[], + rag_chunks=[], ) conversation_id = "00000000-0000-0000-0000-000000000000" query = "What is OpenStack?" @@ -604,7 +606,7 @@ async def test_retrieve_response_message_without_content( assert response.llm_response == "" -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_vector_db_available( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -652,7 +654,7 @@ async def test_retrieve_response_vector_db_available( ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_no_available_shields( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -695,7 +697,7 @@ async def test_retrieve_response_no_available_shields( ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_one_available_shield( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -751,7 +753,7 @@ def __repr__(self) -> str: ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_two_available_shields( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -810,7 +812,7 @@ def __repr__(self) -> str: ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_four_available_shields( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -883,7 +885,7 @@ def __repr__(self) -> str: ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_with_one_attachment( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -939,7 +941,7 @@ async def test_retrieve_response_with_one_attachment( ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_with_two_attachments( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -1121,7 +1123,7 @@ def test_parse_referenced_documents_ignores_other_tools(mocker: MockerFixture) - assert not docs -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_with_mcp_servers( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -1202,7 +1204,7 @@ async def test_retrieve_response_with_mcp_servers( ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_with_mcp_servers_empty_token( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -1261,7 +1263,7 @@ async def test_retrieve_response_with_mcp_servers_empty_token( ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") 
async def test_retrieve_response_with_mcp_servers_and_mcp_headers( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -1361,7 +1363,7 @@ async def test_retrieve_response_with_mcp_servers_and_mcp_headers( ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_shield_violation( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -1486,10 +1488,12 @@ async def test_auth_tuple_unpacking_in_query_endpoint_handler( ToolCallSummary( id="123", name="test-tool", - args="testing", - response="tool response", + args={"query": "testing"}, + type="tool_call", ) ], + tool_results=[], + rag_chunks=[], ) mock_retrieve_response = mocker.patch( "app.endpoints.query.retrieve_response", @@ -1546,10 +1550,12 @@ async def test_query_endpoint_handler_no_tools_true( ToolCallSummary( id="123", name="test-tool", - args="testing", - response="tool response", + args={"query": "testing"}, + type="tool_call", ) ], + tool_results=[], + rag_chunks=[], ) conversation_id = "00000000-0000-0000-0000-000000000000" query = "What is OpenStack?" @@ -1605,10 +1611,12 @@ async def test_query_endpoint_handler_no_tools_false( ToolCallSummary( id="123", name="test-tool", - args="testing", - response="tool response", + args={"query": "testing"}, + type="tool_call", ) ], + tool_results=[], + rag_chunks=[], ) conversation_id = "00000000-0000-0000-0000-000000000000" query = "What is OpenStack?" @@ -1641,7 +1649,7 @@ async def test_query_endpoint_handler_no_tools_false( assert response.conversation_id == conversation_id -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_no_tools_bypasses_mcp_and_rag( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -1696,7 +1704,7 @@ async def test_retrieve_response_no_tools_bypasses_mcp_and_rag( ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_no_tools_false_preserves_functionality( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -1917,7 +1925,7 @@ async def test_query_endpoint_rejects_model_provider_override_without_permission assert detail["response"] == expected_msg -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_get_topic_summary_successful_response(mocker: MockerFixture) -> None: """Test get_topic_summary with successful response from agent.""" # Mock the dependencies @@ -1935,9 +1943,9 @@ async def test_get_topic_summary_successful_response(mocker: MockerFixture) -> N # Mock the agent's create_turn method mock_agent.create_turn.return_value = mock_response - # Mock the interleaved_content_as_str function + # Mock the content_to_str function mocker.patch( - "app.endpoints.query.interleaved_content_as_str", + "app.endpoints.query.content_to_str", return_value="This is a topic summary about OpenStack", ) @@ -2068,9 +2076,9 @@ async def test_get_topic_summary_with_interleaved_content( # Mock the agent's create_turn method mock_agent.create_turn.return_value = mock_response - # Mock the interleaved_content_as_str function - mock_interleaved_content_as_str = mocker.patch( - "app.endpoints.query.interleaved_content_as_str", return_value="Topic summary" + # Mock the content_to_str function + mock_content_to_str = mocker.patch( + "app.endpoints.query.content_to_str", return_value="Topic summary" ) # Mock the get_topic_summary_system_prompt function @@ -2091,8 +2099,8 @@ async def 
test_get_topic_summary_with_interleaved_content( # Assertions assert result == "Topic summary" - # Verify interleaved_content_as_str was called with the content - mock_interleaved_content_as_str.assert_called_once_with(mock_content) + # Verify content_to_str was called with the content + mock_content_to_str.assert_called_once_with(mock_content) @pytest.mark.asyncio @@ -2113,10 +2121,8 @@ async def test_get_topic_summary_system_prompt_retrieval(mocker: MockerFixture) # Mock the agent's create_turn method mock_agent.create_turn.return_value = mock_response - # Mock the interleaved_content_as_str function - mocker.patch( - "app.endpoints.query.interleaved_content_as_str", return_value="Topic summary" - ) + # Mock the content_to_str function + mocker.patch("app.endpoints.query.content_to_str", return_value="Topic summary") # Mock the get_topic_summary_system_prompt function mock_get_topic_summary_system_prompt = mocker.patch( @@ -2189,10 +2195,8 @@ async def test_get_topic_summary_agent_creation_parameters( # Mock the agent's create_turn method mock_agent.create_turn.return_value = mock_response - # Mock the interleaved_content_as_str function - mocker.patch( - "app.endpoints.query.interleaved_content_as_str", return_value="Topic summary" - ) + # Mock the content_to_str function + mocker.patch("app.endpoints.query.content_to_str", return_value="Topic summary") # Mock the get_topic_summary_system_prompt function mocker.patch( @@ -2218,7 +2222,7 @@ async def test_get_topic_summary_agent_creation_parameters( ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_get_topic_summary_create_turn_parameters(mocker: MockerFixture) -> None: """Test that get_topic_summary calls create_turn with correct parameters.""" # Mock the dependencies @@ -2236,10 +2240,8 @@ async def test_get_topic_summary_create_turn_parameters(mocker: MockerFixture) - # Mock the agent's create_turn method mock_agent.create_turn.return_value = mock_response - # Mock the interleaved_content_as_str function - mocker.patch( - "app.endpoints.query.interleaved_content_as_str", return_value="Topic summary" - ) + # Mock the content_to_str function + mocker.patch("app.endpoints.query.content_to_str", return_value="Topic summary") # Mock the get_topic_summary_system_prompt function mocker.patch( @@ -2281,6 +2283,7 @@ async def test_query_endpoint_quota_exceeded( model="gpt-4-turbo", ) # type: ignore mock_client = mocker.AsyncMock() + mock_client.models.list = mocker.AsyncMock(return_value=[]) mock_agent = mocker.AsyncMock() mock_agent.create_turn.side_effect = RateLimitError( model="gpt-4-turbo", llm_provider="openai", message="" @@ -2301,6 +2304,9 @@ async def test_query_endpoint_quota_exceeded( mocker.patch( "app.endpoints.query.handle_mcp_headers_with_toolgroups", return_value={} ) + mocker.patch("app.endpoints.query.check_tokens_available") + mocker.patch("app.endpoints.query.get_session") + mocker.patch("app.endpoints.query.is_transcripts_enabled", return_value=False) with pytest.raises(HTTPException) as exc_info: await query_endpoint_handler( @@ -2328,7 +2334,9 @@ async def test_query_endpoint_generate_topic_summary_default_true( mock_config.quota_limiters = [] mocker.patch("app.endpoints.query.configuration", mock_config) - summary = TurnSummary(llm_response="Test response", tool_calls=[]) + summary = TurnSummary( + llm_response="Test response", tool_calls=[], tool_results=[], rag_chunks=[] + ) mocker.patch( "app.endpoints.query.retrieve_response", return_value=( @@ -2376,7 +2384,9 @@ async def 
test_query_endpoint_generate_topic_summary_explicit_false( mock_config.quota_limiters = [] mocker.patch("app.endpoints.query.configuration", mock_config) - summary = TurnSummary(llm_response="Test response", tool_calls=[]) + summary = TurnSummary( + llm_response="Test response", tool_calls=[], tool_results=[], rag_chunks=[] + ) mocker.patch( "app.endpoints.query.retrieve_response", return_value=( diff --git a/tests/unit/app/endpoints/test_query_v2.py b/tests/unit/app/endpoints/test_query_v2.py index 4adfca306..38330eaaf 100644 --- a/tests/unit/app/endpoints/test_query_v2.py +++ b/tests/unit/app/endpoints/test_query_v2.py @@ -1,4 +1,4 @@ -# pylint: disable=redefined-outer-name, import-error +# pylint: disable=redefined-outer-name, import-error,too-many-locals """Unit tests for the /query (v2) REST API endpoint using Responses API.""" from typing import Any @@ -115,6 +115,10 @@ async def test_retrieve_response_no_tools_bypasses_tools(mocker: MockerFixture) response_obj.output = [] response_obj.usage = None # No usage info mock_client.responses.create = mocker.AsyncMock(return_value=response_obj) + # Mock conversations.create for new conversation creation + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123def456" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) # vector_stores.list should not matter when no_tools=True, but keep it valid mock_vector_stores = mocker.Mock() mock_vector_stores.data = [] @@ -131,7 +135,7 @@ async def test_retrieve_response_no_tools_bypasses_tools(mocker: MockerFixture) mock_client, "model-x", qr, token="tkn" ) - assert conv_id == "resp-1" + assert conv_id == "abc123def456" # Normalized (without conv_ prefix) assert summary.llm_response == "" assert referenced_docs == [] assert token_usage.input_tokens == 0 # No usage info, so 0 @@ -144,7 +148,7 @@ async def test_retrieve_response_no_tools_bypasses_tools(mocker: MockerFixture) @pytest.mark.asyncio -async def test_retrieve_response_builds_rag_and_mcp_tools( +async def test_retrieve_response_builds_rag_and_mcp_tools( # pylint: disable=too-many-locals mocker: MockerFixture, ) -> None: """Test that retrieve_response correctly builds RAG and MCP tools from configuration.""" @@ -154,6 +158,10 @@ async def test_retrieve_response_builds_rag_and_mcp_tools( response_obj.output = [] response_obj.usage = None mock_client.responses.create = mocker.AsyncMock(return_value=response_obj) + # Mock conversations.create for new conversation creation + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123def456" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) mock_vector_stores = mocker.Mock() mock_vector_stores.data = [mocker.Mock(id="dbA")] mock_client.vector_stores.list = mocker.AsyncMock(return_value=mock_vector_stores) @@ -172,7 +180,7 @@ async def test_retrieve_response_builds_rag_and_mcp_tools( mock_client, "model-y", qr, token="mytoken" ) - assert conv_id == "resp-2" + assert conv_id == "abc123def456" # Normalized (without conv_ prefix) assert referenced_docs == [] assert token_usage.input_tokens == 0 # No usage info, so 0 assert token_usage.output_tokens == 0 @@ -198,10 +206,15 @@ async def test_retrieve_response_parses_output_and_tool_calls( mock_client = mocker.Mock() # Build output with content variants and tool calls + part1 = mocker.Mock(text="Hello ") + part1.annotations = [] # Ensure annotations is a list to avoid iteration error + part2 = mocker.Mock(text="world") + part2.annotations = [] + output_item_1 = 
mocker.Mock() output_item_1.type = "message" output_item_1.role = "assistant" - output_item_1.content = [mocker.Mock(text="Hello "), mocker.Mock(text="world")] + output_item_1.content = [part1, part2] output_item_2 = mocker.Mock() output_item_2.type = "message" @@ -222,6 +235,10 @@ async def test_retrieve_response_parses_output_and_tool_calls( response_obj.usage = None mock_client.responses.create = mocker.AsyncMock(return_value=response_obj) + # Mock conversations.create for new conversation creation + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123def456" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) mock_vector_stores = mocker.Mock() mock_vector_stores.data = [] mock_client.vector_stores.list = mocker.AsyncMock(return_value=mock_vector_stores) @@ -236,7 +253,7 @@ async def test_retrieve_response_parses_output_and_tool_calls( mock_client, "model-z", qr, token="tkn" ) - assert conv_id == "resp-3" + assert conv_id == "abc123def456" # Normalized (without conv_ prefix) assert summary.llm_response == "Hello world!" assert len(summary.tool_calls) == 1 assert summary.tool_calls[0].id == "tc-1" @@ -269,6 +286,10 @@ async def test_retrieve_response_with_usage_info(mocker: MockerFixture) -> None: response_obj.usage = mock_usage mock_client.responses.create = mocker.AsyncMock(return_value=response_obj) + # Mock conversations.create for new conversation creation + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123def456" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) mock_vector_stores = mocker.Mock() mock_vector_stores.data = [] mock_client.vector_stores.list = mocker.AsyncMock(return_value=mock_vector_stores) @@ -283,7 +304,7 @@ async def test_retrieve_response_with_usage_info(mocker: MockerFixture) -> None: mock_client, "model-usage", qr, token="tkn", provider_id="test-provider" ) - assert conv_id == "resp-with-usage" + assert conv_id == "abc123def456" # Normalized (without conv_ prefix) assert summary.llm_response == "Test response" assert token_usage.input_tokens == 150 assert token_usage.output_tokens == 75 @@ -308,6 +329,10 @@ async def test_retrieve_response_with_usage_dict(mocker: MockerFixture) -> None: response_obj.usage = {"input_tokens": 200, "output_tokens": 100} mock_client.responses.create = mocker.AsyncMock(return_value=response_obj) + # Mock conversations.create for new conversation creation + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123def456" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) mock_vector_stores = mocker.Mock() mock_vector_stores.data = [] mock_client.vector_stores.list = mocker.AsyncMock(return_value=mock_vector_stores) @@ -322,7 +347,7 @@ async def test_retrieve_response_with_usage_dict(mocker: MockerFixture) -> None: mock_client, "model-usage-dict", qr, token="tkn", provider_id="test-provider" ) - assert conv_id == "resp-with-usage-dict" + assert conv_id == "abc123def456" # Normalized (without conv_ prefix) assert summary.llm_response == "Test response dict" assert token_usage.input_tokens == 200 assert token_usage.output_tokens == 100 @@ -347,6 +372,10 @@ async def test_retrieve_response_with_empty_usage_dict(mocker: MockerFixture) -> response_obj.usage = {} # Empty dict mock_client.responses.create = mocker.AsyncMock(return_value=response_obj) + # Mock conversations.create for new conversation creation + mock_conversation = mocker.Mock() + mock_conversation.id = 
"conv_abc123def456" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) mock_vector_stores = mocker.Mock() mock_vector_stores.data = [] mock_client.vector_stores.list = mocker.AsyncMock(return_value=mock_vector_stores) @@ -361,7 +390,7 @@ async def test_retrieve_response_with_empty_usage_dict(mocker: MockerFixture) -> mock_client, "model-empty-usage", qr, token="tkn", provider_id="test-provider" ) - assert conv_id == "resp-empty-usage" + assert conv_id == "abc123def456" # Normalized (without conv_ prefix) assert summary.llm_response == "Test response empty usage" assert token_usage.input_tokens == 0 assert token_usage.output_tokens == 0 @@ -377,6 +406,10 @@ async def test_retrieve_response_validates_attachments(mocker: MockerFixture) -> response_obj.output = [] response_obj.usage = None mock_client.responses.create = mocker.AsyncMock(return_value=response_obj) + # Mock conversations.create for new conversation creation + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123def456" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) mock_vector_stores = mocker.Mock() mock_vector_stores.data = [] mock_client.vector_stores.list = mocker.AsyncMock(return_value=mock_vector_stores) @@ -432,7 +465,9 @@ async def test_query_endpoint_handler_v2_success( return_value=("llama/m", "m", "p"), ) - summary = mocker.Mock(llm_response="ANSWER", tool_calls=[], rag_chunks=[]) + summary = mocker.Mock( + llm_response="ANSWER", tool_calls=[], tool_results=[], rag_chunks=[] + ) token_usage = mocker.Mock(input_tokens=10, output_tokens=20) mocker.patch( "app.endpoints.query_v2.retrieve_response", @@ -520,9 +555,14 @@ async def test_query_endpoint_quota_exceeded( attachments=[], ) # type: ignore mock_client = mocker.AsyncMock() + mock_client.models.list = mocker.AsyncMock(return_value=[]) mock_client.responses.create.side_effect = RateLimitError( model="gpt-4-turbo", llm_provider="openai", message="" ) + # Mock conversation creation (needed for query_v2) + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) mocker.patch( "app.endpoints.query.select_model_and_provider_id", return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai"), @@ -532,6 +572,13 @@ async def test_query_endpoint_quota_exceeded( "client.AsyncLlamaStackClientHolder.get_client", return_value=mock_client, ) + mocker.patch("app.endpoints.query.check_tokens_available") + mocker.patch("app.endpoints.query.get_session") + mocker.patch("app.endpoints.query.is_transcripts_enabled", return_value=False) + mocker.patch("app.endpoints.query_v2.get_available_shields", return_value=[]) + mocker.patch( + "app.endpoints.query_v2.prepare_tools_for_responses_api", return_value=None + ) with pytest.raises(HTTPException) as exc_info: await query_endpoint_handler_v2( @@ -567,6 +614,10 @@ async def test_retrieve_response_with_shields_available(mocker: MockerFixture) - response_obj.usage = None mock_client.responses.create = mocker.AsyncMock(return_value=response_obj) + # Mock conversations.create for new conversation creation + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123def456" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) mock_vector_stores = mocker.Mock() mock_vector_stores.data = [] mock_client.vector_stores.list = mocker.AsyncMock(return_value=mock_vector_stores) @@ -579,7 +630,7 @@ async def 
test_retrieve_response_with_shields_available(mocker: MockerFixture) - mock_client, "model-shields", qr, token="tkn", provider_id="test-provider" ) - assert conv_id == "resp-shields" + assert conv_id == "abc123def456" # Normalized (without conv_ prefix) assert summary.llm_response == "Safe response" # Verify that shields were passed in extra_body @@ -610,6 +661,10 @@ async def test_retrieve_response_with_no_shields_available( response_obj.usage = None mock_client.responses.create = mocker.AsyncMock(return_value=response_obj) + # Mock conversations.create for new conversation creation + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123def456" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) mock_vector_stores = mocker.Mock() mock_vector_stores.data = [] mock_client.vector_stores.list = mocker.AsyncMock(return_value=mock_vector_stores) @@ -622,7 +677,7 @@ async def test_retrieve_response_with_no_shields_available( mock_client, "model-no-shields", qr, token="tkn", provider_id="test-provider" ) - assert conv_id == "resp-no-shields" + assert conv_id == "abc123def456" # Normalized (without conv_ prefix) assert summary.llm_response == "Response without shields" # Verify that no extra_body was added @@ -655,6 +710,10 @@ async def test_retrieve_response_detects_shield_violation( response_obj.usage = None mock_client.responses.create = mocker.AsyncMock(return_value=response_obj) + # Mock conversations.create for new conversation creation + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123def456" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) mock_vector_stores = mocker.Mock() mock_vector_stores.data = [] mock_client.vector_stores.list = mocker.AsyncMock(return_value=mock_vector_stores) @@ -670,7 +729,7 @@ async def test_retrieve_response_detects_shield_violation( mock_client, "model-violation", qr, token="tkn", provider_id="test-provider" ) - assert conv_id == "resp-violation" + assert conv_id == "abc123def456" # Normalized (without conv_ prefix) assert summary.llm_response == "I cannot help with that request" # Verify that the validation error metric was incremented @@ -702,6 +761,10 @@ async def test_retrieve_response_no_violation_with_shields( response_obj.usage = None mock_client.responses.create = mocker.AsyncMock(return_value=response_obj) + # Mock conversations.create for new conversation creation + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123def456" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) mock_vector_stores = mocker.Mock() mock_vector_stores.data = [] mock_client.vector_stores.list = mocker.AsyncMock(return_value=mock_vector_stores) @@ -717,8 +780,93 @@ async def test_retrieve_response_no_violation_with_shields( mock_client, "model-safe", qr, token="tkn", provider_id="test-provider" ) - assert conv_id == "resp-safe" + assert conv_id == "abc123def456" # Normalized (without conv_ prefix) assert summary.llm_response == "Safe response" # Verify that the validation error metric was NOT incremented validation_metric.inc.assert_not_called() + + +@pytest.mark.asyncio +async def test_retrieve_response_parses_referenced_documents( + mocker: MockerFixture, +) -> None: + """Test that retrieve_response correctly parses referenced documents from response.""" + mock_client = mocker.AsyncMock() + + # 1. 
Output item with message content annotations (citations) + output_item_1 = mocker.Mock() + output_item_1.type = "message" + output_item_1.role = "assistant" + + # Mock content with annotations + content_part = mocker.Mock() + content_part.type = "output_text" + content_part.text = "Here is a citation." + + annotation1 = mocker.Mock() + annotation1.type = "url_citation" + annotation1.url = "http://example.com/doc1" + annotation1.title = "Doc 1" + + annotation2 = mocker.Mock() + annotation2.type = "file_citation" + annotation2.filename = "file1.txt" + annotation2.url = None + annotation2.title = None + + content_part.annotations = [annotation1, annotation2] + output_item_1.content = [content_part] + + # 2. Output item with file search tool call results + output_item_2 = mocker.Mock() + output_item_2.type = "file_search_call" + output_item_2.queries = ( + [] + ) # Ensure queries is a list to avoid iteration error in tool summary + output_item_2.status = "completed" + output_item_2.results = [ + {"filename": "file2.pdf", "attributes": {"url": "http://example.com/doc2"}}, + {"filename": "file3.docx", "attributes": {}}, # No URL + ] + + response_obj = mocker.Mock() + response_obj.id = "resp-docs" + response_obj.output = [output_item_1, output_item_2] + response_obj.usage = None + + mock_client.responses.create = mocker.AsyncMock(return_value=response_obj) + mock_vector_stores = mocker.Mock() + mock_vector_stores.data = [] + mock_client.vector_stores.list = mocker.AsyncMock(return_value=mock_vector_stores) + mock_client.shields.list = mocker.AsyncMock(return_value=[]) + + mocker.patch("app.endpoints.query_v2.get_system_prompt", return_value="PROMPT") + mocker.patch("app.endpoints.query_v2.configuration", mocker.Mock(mcp_servers=[])) + + qr = QueryRequest(query="query with docs") + _summary, _conv_id, referenced_docs, _token_usage = await retrieve_response( + mock_client, "model-docs", qr, token="tkn", provider_id="test-provider" + ) + + assert len(referenced_docs) == 4 + + # Verify Doc 1 (URL citation) + doc1 = next((d for d in referenced_docs if d.doc_title == "Doc 1"), None) + assert doc1 + assert str(doc1.doc_url) == "http://example.com/doc1" + + # Verify file1.txt (File citation) + doc2 = next((d for d in referenced_docs if d.doc_title == "file1.txt"), None) + assert doc2 + assert doc2.doc_url is None + + # Verify file2.pdf (File search result with URL) + doc3 = next((d for d in referenced_docs if d.doc_title == "file2.pdf"), None) + assert doc3 + assert str(doc3.doc_url) == "http://example.com/doc2" + + # Verify file3.docx (File search result without URL) + doc4 = next((d for d in referenced_docs if d.doc_title == "file3.docx"), None) + assert doc4 + assert doc4.doc_url is None diff --git a/tests/unit/app/endpoints/test_streaming_query.py b/tests/unit/app/endpoints/test_streaming_query.py index e93f95660..1e5595d1c 100644 --- a/tests/unit/app/endpoints/test_streaming_query.py +++ b/tests/unit/app/endpoints/test_streaming_query.py @@ -1,8 +1,9 @@ """Unit tests for the /streaming-query REST API endpoint.""" -# pylint: disable=too-many-lines +# pylint: disable=too-many-lines,too-many-function-args import json from datetime import datetime +from typing import Any, cast import pytest from fastapi import HTTPException, Request, status @@ -10,26 +11,13 @@ from litellm.exceptions import RateLimitError from llama_stack_client import APIConnectionError from llama_stack_client.types import UserMessage # type: ignore -from llama_stack_client.types.agents import Turn -from 
llama_stack_client.types.agents.agent_turn_response_stream_chunk import ( - AgentTurnResponseStreamChunk, -) -from llama_stack_client.types.agents.turn_response_event import TurnResponseEvent -from llama_stack_client.types.agents.turn_response_event_payload import ( - AgentTurnResponseStepCompletePayload, - AgentTurnResponseStepProgressPayload, - AgentTurnResponseTurnAwaitingInputPayload, - AgentTurnResponseTurnCompletePayload, - AgentTurnResponseTurnStartPayload, -) +from llama_stack_client.types.alpha.agents.turn import Turn +from llama_stack_client.types.alpha.shield_call_step import ShieldCallStep from llama_stack_client.types.shared.completion_message import CompletionMessage -from llama_stack_client.types.shared.content_delta import TextDelta, ToolCallDelta from llama_stack_client.types.shared.interleaved_content_item import TextContentItem from llama_stack_client.types.shared.safety_violation import SafetyViolation from llama_stack_client.types.shared.tool_call import ToolCall -from llama_stack_client.types.shield_call_step import ShieldCallStep -from llama_stack_client.types.tool_execution_step import ToolExecutionStep -from llama_stack_client.types.tool_response import ToolResponse +from pydantic import AnyUrl from pytest_mock import MockerFixture from app.endpoints.query import get_rag_toolgroups @@ -50,10 +38,110 @@ from constants import MEDIA_TYPE_JSON, MEDIA_TYPE_TEXT from models.config import Action, ModelContextProtocolServer from models.requests import Attachment, QueryRequest +from models.responses import ReferencedDocument from tests.unit.conftest import AgentFixtures from tests.unit.utils.auth_helpers import mock_authorization_resolvers from utils.token_counter import TokenCounter -from utils.types import TurnSummary + + +# Note: content_delta module doesn't exist in llama-stack-client 0.3.x +# These are mock classes for backward compatibility with Agent API tests +# pylint: disable=too-few-public-methods,redefined-builtin + + +class TextDelta: + """Mock TextDelta for Agent API tests.""" + + def __init__(self, text: str, type: str = "text"): # noqa: A002 + self.text = text + self.type = type + + +class ToolCallDelta: + """Mock ToolCallDelta for Agent API tests.""" + + def __init__(self, **kwargs: Any): + for key, value in kwargs.items(): + setattr(self, key, value) + + +# Note: Agent API types don't exist in llama-stack-client 0.3.x +# These are mock classes for backward compatibility with Agent API tests + + +class TurnResponseEvent: + """Mock TurnResponseEvent for Agent API tests.""" + + def __init__(self, **kwargs: Any): + for key, value in kwargs.items(): + setattr(self, key, value) + + +class AgentTurnResponseStreamChunk: + """Mock AgentTurnResponseStreamChunk for Agent API tests.""" + + def __init__(self, **kwargs: Any): + for key, value in kwargs.items(): + setattr(self, key, value) + + +class AgentTurnResponseStepCompletePayload: + """Mock AgentTurnResponseStepCompletePayload for Agent API tests.""" + + def __init__(self, **kwargs: Any): + for key, value in kwargs.items(): + setattr(self, key, value) + + +class AgentTurnResponseStepProgressPayload: + """Mock AgentTurnResponseStepProgressPayload for Agent API tests.""" + + def __init__(self, **kwargs: Any): + for key, value in kwargs.items(): + setattr(self, key, value) + + +class AgentTurnResponseTurnAwaitingInputPayload: + """Mock AgentTurnResponseTurnAwaitingInputPayload for Agent API tests.""" + + def __init__(self, **kwargs: Any): + for key, value in kwargs.items(): + setattr(self, key, value) + + +class 
AgentTurnResponseTurnCompletePayload: + """Mock AgentTurnResponseTurnCompletePayload for Agent API tests.""" + + def __init__(self, **kwargs: Any): + for key, value in kwargs.items(): + setattr(self, key, value) + + +class AgentTurnResponseTurnStartPayload: + """Mock AgentTurnResponseTurnStartPayload for Agent API tests.""" + + def __init__(self, **kwargs: Any): + for key, value in kwargs.items(): + setattr(self, key, value) + + +class ToolExecutionStep: + """Mock ToolExecutionStep for Agent API tests.""" + + def __init__(self, **kwargs: Any): + for key, value in kwargs.items(): + setattr(self, key, value) + + +class ToolResponse: + """Mock ToolResponse for Agent API tests.""" + + def __init__(self, **kwargs: Any): + for key, value in kwargs.items(): + setattr(self, key, value) + + +# pylint: enable=too-few-public-methods,redefined-builtin MOCK_AUTH = ( "017adfa4-7cc6-46e4-b663-3653e1ae69df", @@ -186,12 +274,18 @@ async def test_streaming_query_endpoint_on_connection_error( query_request = QueryRequest(query=query) # type: ignore # simulate situation when it is not possible to connect to Llama Stack - mock_client = mocker.AsyncMock() - mock_client.models.side_effect = APIConnectionError(request=query_request) # type: ignore - mock_lsc = mocker.patch("client.AsyncLlamaStackClientHolder.get_client") - mock_lsc.return_value = mock_client - mock_async_lsc = mocker.patch("client.AsyncLlamaStackClientHolder.get_client") - mock_async_lsc.return_value = mock_client + def _raise_connection_error(*args: Any, **kwargs: Any) -> None: + raise APIConnectionError(request=None) # type: ignore[arg-type] + + mocker.patch( + "client.AsyncLlamaStackClientHolder.get_client", + side_effect=_raise_connection_error, + ) + mocker.patch("app.endpoints.streaming_query.check_configuration_loaded") + mocker.patch( + "app.endpoints.streaming_query.evaluate_model_hints", + return_value=(None, None), + ) request = Request( scope={ @@ -268,7 +362,7 @@ async def _test_streaming_query_endpoint_handler(mocker: MockerFixture) -> None: ToolCall( call_id="t1", tool_name="knowledge_search", - arguments={}, + arguments="{}", ) ], ), @@ -292,7 +386,33 @@ async def _test_streaming_query_endpoint_handler(mocker: MockerFixture) -> None: ), session_id="test_session_id", started_at=datetime.now(), - steps=[], + steps=cast( + Any, + [ # type: ignore[assignment] + ToolExecutionStep( + turn_id="t1", + step_id="s3", + step_type="tool_execution", + tool_responses=[ + ToolResponse( + call_id="t1", + tool_name="knowledge_search", + content=[ + TextContentItem(text=s, type="text") + for s in SAMPLE_KNOWLEDGE_SEARCH_RESULTS + ], + ) + ], + tool_calls=[ + ToolCall( + call_id="t1", + tool_name="knowledge_search", + arguments="{}", + ) + ], + ) + ], + ), completed_at=datetime.now(), output_attachments=[], ), @@ -353,6 +473,7 @@ async def _test_streaming_query_endpoint_handler(mocker: MockerFixture) -> None: assert referenced_documents[1]["doc_title"] == "Doc2" +@pytest.mark.skip(reason="Deprecated API test") @pytest.mark.asyncio async def test_streaming_query_endpoint_handler(mocker: MockerFixture) -> None: """Test the streaming query endpoint handler.""" @@ -361,6 +482,7 @@ async def test_streaming_query_endpoint_handler(mocker: MockerFixture) -> None: @pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_streaming_query_endpoint_handler_store_transcript( mocker: MockerFixture, ) -> None: @@ -369,6 +491,7 @@ async def test_streaming_query_endpoint_handler_store_transcript( await 
_test_streaming_query_endpoint_handler(mocker) +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_vector_db_available( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -414,6 +537,7 @@ async def test_retrieve_response_vector_db_available( ) +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_no_available_shields( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -457,6 +581,7 @@ async def test_retrieve_response_no_available_shields( ) +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_one_available_shield( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -511,6 +636,7 @@ def __repr__(self) -> str: ) +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_two_available_shields( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -568,6 +694,7 @@ def __repr__(self) -> str: ) +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_four_available_shields( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -639,6 +766,7 @@ def __repr__(self) -> str: ) +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_with_one_attachment( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -693,6 +821,7 @@ async def test_retrieve_response_with_one_attachment( ) +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_with_two_attachments( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -993,7 +1122,7 @@ def test_stream_build_event_step_progress_tool_call_tool_call() -> None: delta=ToolCallDelta( parse_status="succeeded", tool_call=ToolCall( - arguments={}, call_id="tc1", tool_name="my-tool" + arguments="{}", call_id="tc1", tool_name="my-tool" ), type="tool_call", ), @@ -1039,7 +1168,7 @@ def test_stream_build_event_step_complete() -> None: ], tool_calls=[ ToolCall( - call_id="t1", tool_name="knowledge_search", arguments={} + call_id="t1", tool_name="knowledge_search", arguments="{}" ) ], ), @@ -1053,7 +1182,7 @@ def test_stream_build_event_step_complete() -> None: assert result is not None assert "data: " in result assert '"event": "tool_call"' in result - assert '"token": {"tool_name": "knowledge_search", "arguments": {}}' in result + assert '"token": {"tool_name": "knowledge_search", "arguments": "{}"}' in result result = next(itr) assert ( @@ -1104,6 +1233,7 @@ def test_stream_build_event_returns_heartbeat() -> None: assert '"token": "heartbeat"' in result +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_with_mcp_servers( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -1183,6 +1313,7 @@ async def test_retrieve_response_with_mcp_servers( ) +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_with_mcp_servers_empty_token( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -1246,6 +1377,7 @@ async def test_retrieve_response_with_mcp_servers_empty_token( ) +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_with_mcp_servers_and_mcp_headers( mocker: MockerFixture, ) -> None: @@ -1502,7 +1634,7 @@ async def test_streaming_query_endpoint_handler_no_tools_false( assert isinstance(response, StreamingResponse) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def 
test_retrieve_response_no_tools_bypasses_mcp_and_rag( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -1552,7 +1684,7 @@ async def test_retrieve_response_no_tools_bypasses_mcp_and_rag( ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_retrieve_response_no_tools_false_preserves_functionality( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -1714,14 +1846,18 @@ async def test_streaming_query_handles_none_event(mocker: MockerFixture) -> None @pytest.mark.asyncio async def test_query_endpoint_quota_exceeded(mocker: MockerFixture) -> None: - """Test that streaming query endpoint raises HTTP 429 when model quota is exceeded.""" + """Test that streaming query endpoint streams HTTP 429 when model quota is exceeded.""" query_request = QueryRequest( query="What is OpenStack?", provider="openai", model="gpt-4-turbo", ) # type: ignore request = Request(scope={"type": "http"}) + request.state.authorized_actions = set() mock_client = mocker.AsyncMock() + mock_client.models.list = mocker.AsyncMock(return_value=[]) + mock_client.shields.list = mocker.AsyncMock(return_value=[]) + mock_client.vector_stores.list = mocker.AsyncMock(return_value=mocker.Mock(data=[])) mock_agent = mocker.AsyncMock() mock_agent.create_turn.side_effect = RateLimitError( model="gpt-4-turbo", llm_provider="openai", message="" @@ -1743,16 +1879,40 @@ async def test_query_endpoint_quota_exceeded(mocker: MockerFixture) -> None: "app.endpoints.streaming_query.handle_mcp_headers_with_toolgroups", return_value={}, ) + mocker.patch("app.endpoints.streaming_query.check_configuration_loaded") + mocker.patch( + "app.endpoints.streaming_query.is_transcripts_enabled", return_value=False + ) + mocker.patch( + "app.endpoints.streaming_query.get_system_prompt", return_value="PROMPT" + ) + mocker.patch( + "app.endpoints.streaming_query.evaluate_model_hints", + return_value=(None, None), + ) - with pytest.raises(HTTPException) as exc_info: - await streaming_query_endpoint_handler( - request, query_request=query_request, auth=MOCK_AUTH - ) - assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS - detail = exc_info.value.detail - assert isinstance(detail, dict) - assert detail["response"] == "The model quota has been exceeded" # type: ignore - assert "gpt-4-turbo" in detail["cause"] # type: ignore + response = await streaming_query_endpoint_handler( + request, query_request=query_request, auth=MOCK_AUTH + ) + assert isinstance(response, StreamingResponse) + assert response.status_code == status.HTTP_429_TOO_MANY_REQUESTS + + # Read the streamed error response (SSE format) + content = b"" + async for chunk in response.body_iterator: + if isinstance(chunk, bytes): + content += chunk + elif isinstance(chunk, str): + content += chunk.encode() + else: + # Handle memoryview or other types + content += bytes(chunk) + + content_str = content.decode() + # The error is formatted as SSE: data: {"event":"error","response":"...","cause":"..."}\n\n + # Check for the error message in the content + assert "The model quota has been exceeded" in content_str + assert "gpt-4-turbo" in content_str # ============================================================================ @@ -1859,10 +2019,22 @@ def test_stream_end_event_json(self) -> None: "doc2": {"title": "Test Doc 2", "docs_url": "https://example.com/doc2"}, } # Create mock objects for the test - mock_summary = TurnSummary(llm_response="Test response", tool_calls=[]) mock_token_usage = 
TokenCounter(input_tokens=100, output_tokens=50) + available_quotas: dict[str, int] = {} + referenced_documents = [ + ReferencedDocument( + doc_url=AnyUrl("https://example.com/doc1"), doc_title="Test Doc 1" + ), + ReferencedDocument( + doc_url=AnyUrl("https://example.com/doc2"), doc_title="Test Doc 2" + ), + ] result = stream_end_event( - metadata_map, mock_summary, mock_token_usage, MEDIA_TYPE_JSON + metadata_map, + mock_token_usage, + available_quotas, + referenced_documents, + MEDIA_TYPE_JSON, ) # Parse the result to verify structure @@ -1887,10 +2059,22 @@ def test_stream_end_event_text(self) -> None: "doc2": {"title": "Test Doc 2", "docs_url": "https://example.com/doc2"}, } # Create mock objects for the test - mock_summary = TurnSummary(llm_response="Test response", tool_calls=[]) mock_token_usage = TokenCounter(input_tokens=100, output_tokens=50) + available_quotas: dict[str, int] = {} + referenced_documents = [ + ReferencedDocument( + doc_url=AnyUrl("https://example.com/doc1"), doc_title="Test Doc 1" + ), + ReferencedDocument( + doc_url=AnyUrl("https://example.com/doc2"), doc_title="Test Doc 2" + ), + ] result = stream_end_event( - metadata_map, mock_summary, mock_token_usage, MEDIA_TYPE_TEXT + metadata_map, + mock_token_usage, + available_quotas, + referenced_documents, + MEDIA_TYPE_TEXT, ) expected = ( @@ -1904,10 +2088,15 @@ def test_stream_end_event_text_no_docs(self) -> None: metadata_map: dict = {} # Create mock objects for the test - mock_summary = TurnSummary(llm_response="Test response", tool_calls=[]) mock_token_usage = TokenCounter(input_tokens=100, output_tokens=50) + available_quotas: dict[str, int] = {} + referenced_documents: list[ReferencedDocument] = [] result = stream_end_event( - metadata_map, mock_summary, mock_token_usage, MEDIA_TYPE_TEXT + metadata_map, + mock_token_usage, + available_quotas, + referenced_documents, + MEDIA_TYPE_TEXT, ) assert result == "" @@ -2027,10 +2216,19 @@ def test_ols_end_event_structure(self) -> None: "doc1": {"title": "Test Doc", "docs_url": "https://example.com/doc"} } # Create mock objects for the test - mock_summary = TurnSummary(llm_response="Test response", tool_calls=[]) mock_token_usage = TokenCounter(input_tokens=100, output_tokens=50) + available_quotas: dict[str, int] = {} + referenced_documents = [ + ReferencedDocument( + doc_url=AnyUrl("https://example.com/doc"), doc_title="Test Doc" + ), + ] end_event = stream_end_event( - metadata_map, mock_summary, mock_token_usage, MEDIA_TYPE_JSON + metadata_map, + mock_token_usage, + available_quotas, + referenced_documents, + MEDIA_TYPE_JSON, ) data_part = end_event.replace("data: ", "").strip() parsed = json.loads(data_part) diff --git a/tests/unit/app/endpoints/test_streaming_query_v2.py b/tests/unit/app/endpoints/test_streaming_query_v2.py index 461bc515f..9ba0900fc 100644 --- a/tests/unit/app/endpoints/test_streaming_query_v2.py +++ b/tests/unit/app/endpoints/test_streaming_query_v2.py @@ -1,12 +1,13 @@ -# pylint: disable=redefined-outer-name, import-error +# pylint: disable=redefined-outer-name,import-error, too-many-function-args """Unit tests for the /streaming_query (v2) endpoint using Responses API.""" from types import SimpleNamespace from typing import Any, AsyncIterator import pytest -from fastapi import HTTPException, Request, status +from fastapi import Request, status from fastapi.responses import StreamingResponse +from litellm.exceptions import RateLimitError from llama_stack_client import APIConnectionError from pytest_mock import MockerFixture @@ -37,6 +38,10 @@ 
async def test_retrieve_response_builds_rag_and_mcp_tools( mock_vector_stores.data = [mocker.Mock(id="db1")] mock_client.vector_stores.list = mocker.AsyncMock(return_value=mock_vector_stores) mock_client.responses.create = mocker.AsyncMock(return_value=mocker.Mock()) + # Mock conversations.create for new conversation creation + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123def456" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) # Mock shields.list mock_client.shields.list = mocker.AsyncMock(return_value=[]) @@ -69,6 +74,10 @@ async def test_retrieve_response_no_tools_passes_none(mocker: MockerFixture) -> mock_vector_stores.data = [] mock_client.vector_stores.list = mocker.AsyncMock(return_value=mock_vector_stores) mock_client.responses.create = mocker.AsyncMock(return_value=mocker.Mock()) + # Mock conversations.create for new conversation creation + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123def456" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) # Mock shields.list mock_client.shields.list = mocker.AsyncMock(return_value=[]) @@ -121,7 +130,7 @@ async def test_streaming_query_endpoint_handler_v2_success_yields_events( ) mocker.patch( "app.endpoints.streaming_query_v2.stream_end_event", - lambda _m, _s, _t, _media: "END\n", + lambda _m, _t, _aq, _rd, _media: "END\n", ) # Mock the cleanup function that handles all post-streaming database/cache work @@ -153,11 +162,13 @@ async def fake_stream() -> AsyncIterator[SimpleNamespace]: arguments='{"q":"x"}', ) yield SimpleNamespace(type="response.output_text.done", text="Hello world") - yield SimpleNamespace(type="response.completed") + # Include a response object with output attribute for shield violation detection + mock_response = SimpleNamespace(output=[]) + yield SimpleNamespace(type="response.completed", response=mock_response) mocker.patch( "app.endpoints.streaming_query_v2.retrieve_response", - return_value=(fake_stream(), ""), + return_value=(fake_stream(), "abc123def456"), ) metric = mocker.patch("metrics.llm_calls_total") @@ -179,7 +190,7 @@ async def fake_stream() -> AsyncIterator[SimpleNamespace]: events.append(s) # Validate event sequence and content - assert events[0] == "START:conv-xyz\n" + assert events[0] == "START:abc123def456\n" # content_part.added triggers empty token assert events[1] == "EV:token:\n" assert events[2] == "EV:token:Hello \n" @@ -195,7 +206,7 @@ async def fake_stream() -> AsyncIterator[SimpleNamespace]: # Verify cleanup was called with correct user_id and conversation_id call_args = cleanup_spy.call_args assert call_args.kwargs["user_id"] == "user123" - assert call_args.kwargs["conversation_id"] == "conv-xyz" + assert call_args.kwargs["conversation_id"] == "abc123def456" assert call_args.kwargs["model_id"] == "m" assert call_args.kwargs["provider_id"] == "p" @@ -214,16 +225,20 @@ def _raise(*_a: Any, **_k: Any) -> None: fail_metric = mocker.patch("metrics.llm_calls_failures_total") - with pytest.raises(HTTPException) as exc: - await streaming_query_endpoint_handler_v2( - request=dummy_request, - query_request=QueryRequest(query="hi"), - auth=("user123", "", False, "tok"), - mcp_headers={}, - ) + mocker.patch( + "app.endpoints.streaming_query.evaluate_model_hints", + return_value=(None, None), + ) + + response = await streaming_query_endpoint_handler_v2( + request=dummy_request, + query_request=QueryRequest(query="hi"), + auth=("user123", "", False, "tok"), + mcp_headers={}, + ) - assert 
exc.value.status_code == status.HTTP_503_SERVICE_UNAVAILABLE - assert "Unable to connect to Llama Stack" in str(exc.value.detail) + assert isinstance(response, StreamingResponse) + assert response.status_code == status.HTTP_503_SERVICE_UNAVAILABLE fail_metric.inc.assert_called_once() @@ -243,6 +258,10 @@ async def test_retrieve_response_with_shields_available(mocker: MockerFixture) - mock_vector_stores.data = [] mock_client.vector_stores.list = mocker.AsyncMock(return_value=mock_vector_stores) mock_client.responses.create = mocker.AsyncMock(return_value=mocker.Mock()) + # Mock conversations.create for new conversation creation + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123def456" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) mocker.patch( "app.endpoints.streaming_query_v2.get_system_prompt", return_value="PROMPT" @@ -275,6 +294,10 @@ async def test_retrieve_response_with_no_shields_available( mock_vector_stores.data = [] mock_client.vector_stores.list = mocker.AsyncMock(return_value=mock_vector_stores) mock_client.responses.create = mocker.AsyncMock(return_value=mocker.Mock()) + # Mock conversations.create for new conversation creation + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123def456" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) mocker.patch( "app.endpoints.streaming_query_v2.get_system_prompt", return_value="PROMPT" @@ -325,7 +348,7 @@ async def test_streaming_response_detects_shield_violation( ) mocker.patch( "app.endpoints.streaming_query_v2.stream_end_event", - lambda _m, _s, _t, _media: "END\n", + lambda _m, _t, _aq, _rd, _media: "END\n", ) # Mock the cleanup function that handles all post-streaming database/cache work @@ -417,7 +440,7 @@ async def test_streaming_response_no_shield_violation( ) mocker.patch( "app.endpoints.streaming_query_v2.stream_end_event", - lambda _m, _s, _t, _media: "END\n", + lambda _m, _t, _aq, _rd, _media: "END\n", ) # Mock the cleanup function that handles all post-streaming database/cache work @@ -469,3 +492,81 @@ async def fake_stream_without_violation() -> AsyncIterator[SimpleNamespace]: # Verify that the validation error metric was NOT incremented validation_metric.inc.assert_not_called() + + +@pytest.mark.asyncio +async def test_streaming_query_endpoint_handler_v2_quota_exceeded( + mocker: MockerFixture, dummy_request: Request +) -> None: + """Test that streaming query endpoint v2 streams HTTP 429 when model quota is exceeded.""" + mocker.patch("app.endpoints.streaming_query.check_configuration_loaded") + + mock_client = mocker.Mock() + mock_client.models.list = mocker.AsyncMock(return_value=[mocker.Mock()]) + mock_client.responses.create.side_effect = RateLimitError( + model="gpt-4-turbo", llm_provider="openai", message="" + ) + # Mock conversation creation (needed for query_v2) + mock_conversation = mocker.Mock() + mock_conversation.id = "conv_abc123" + mock_client.conversations.create = mocker.AsyncMock(return_value=mock_conversation) + mock_client.vector_stores.list = mocker.AsyncMock(return_value=mocker.Mock(data=[])) + mock_client.shields.list = mocker.AsyncMock(return_value=[]) + + mocker.patch( + "client.AsyncLlamaStackClientHolder.get_client", return_value=mock_client + ) + mocker.patch( + "app.endpoints.streaming_query.evaluate_model_hints", + return_value=(None, None), + ) + mocker.patch( + "app.endpoints.streaming_query.select_model_and_provider_id", + return_value=("openai/gpt-4-turbo", "gpt-4-turbo", 
"openai"), + ) + mocker.patch("app.endpoints.streaming_query.validate_model_provider_override") + mocker.patch( + "app.endpoints.streaming_query_v2.get_available_shields", return_value=[] + ) + mocker.patch( + "app.endpoints.streaming_query_v2.prepare_tools_for_responses_api", + return_value=None, + ) + mocker.patch( + "app.endpoints.streaming_query_v2.get_system_prompt", return_value="PROMPT" + ) + mocker.patch( + "app.endpoints.streaming_query_v2.to_llama_stack_conversation_id", + return_value="conv_abc123", + ) + mocker.patch( + "app.endpoints.streaming_query_v2.normalize_conversation_id", + return_value="abc123", + ) + + response = await streaming_query_endpoint_handler_v2( + request=dummy_request, + query_request=QueryRequest(query="What is OpenStack?"), + auth=("user123", "", False, "token-abc"), + mcp_headers={}, + ) + + assert isinstance(response, StreamingResponse) + assert response.status_code == status.HTTP_429_TOO_MANY_REQUESTS + + # Read the streamed error response (SSE format) + content = b"" + async for chunk in response.body_iterator: + if isinstance(chunk, bytes): + content += chunk + elif isinstance(chunk, str): + content += chunk.encode() + else: + # Handle memoryview or other types + content += bytes(chunk) + + content_str = content.decode() + # The error is formatted as SSE: data: {"event":"error","response":"...","cause":"..."}\n\n + # Check for the error message in the content + assert "The model quota has been exceeded" in content_str + assert "gpt-4-turbo" in content_str diff --git a/tests/unit/app/test_routers.py b/tests/unit/app/test_routers.py index 3723aed72..1245a07ba 100644 --- a/tests/unit/app/test_routers.py +++ b/tests/unit/app/test_routers.py @@ -7,20 +7,18 @@ from app.routers import include_routers # noqa:E402 from app.endpoints import ( - conversations, conversations_v2, + conversations_v3, root, info, models, shields, rags, providers, - query, query_v2, health, config, feedback, - streaming_query, streaming_query_v2, authorized, metrics, @@ -67,23 +65,24 @@ def test_include_routers() -> None: include_routers(app) # are all routers added? - assert len(app.routers) == 18 + assert len(app.routers) == 16 assert root.router in app.get_routers() assert info.router in app.get_routers() assert models.router in app.get_routers() assert tools.router in app.get_routers() assert shields.router in app.get_routers() assert providers.router in app.get_routers() - assert query.router in app.get_routers() + # assert query.router in app.get_routers() assert query_v2.router in app.get_routers() - assert streaming_query.router in app.get_routers() + # assert streaming_query.router in app.get_routers() assert streaming_query_v2.router in app.get_routers() assert config.router in app.get_routers() assert feedback.router in app.get_routers() assert health.router in app.get_routers() assert authorized.router in app.get_routers() - assert conversations.router in app.get_routers() + # assert conversations.router in app.get_routers() assert conversations_v2.router in app.get_routers() + assert conversations_v3.router in app.get_routers() assert metrics.router in app.get_routers() @@ -93,7 +92,7 @@ def test_check_prefixes() -> None: include_routers(app) # are all routers added? 
-    assert len(app.routers) == 18
+    assert len(app.routers) == 16
     assert app.get_router_prefix(root.router) == ""
     assert app.get_router_prefix(info.router) == "/v1"
     assert app.get_router_prefix(models.router) == "/v1"
@@ -101,14 +100,15 @@
     assert app.get_router_prefix(shields.router) == "/v1"
     assert app.get_router_prefix(providers.router) == "/v1"
     assert app.get_router_prefix(rags.router) == "/v1"
-    assert app.get_router_prefix(query.router) == "/v1"
-    assert app.get_router_prefix(streaming_query.router) == "/v1"
-    assert app.get_router_prefix(query_v2.router) == "/v2"
-    assert app.get_router_prefix(streaming_query_v2.router) == "/v2"
+    # assert app.get_router_prefix(query.router) == "/v1"
+    # assert app.get_router_prefix(streaming_query.router) == "/v1"
+    assert app.get_router_prefix(query_v2.router) == "/v1"
+    assert app.get_router_prefix(streaming_query_v2.router) == "/v1"
     assert app.get_router_prefix(config.router) == "/v1"
     assert app.get_router_prefix(feedback.router) == "/v1"
     assert app.get_router_prefix(health.router) == ""
     assert app.get_router_prefix(authorized.router) == ""
-    assert app.get_router_prefix(conversations.router) == "/v1"
-    assert app.get_router_prefix(metrics.router) == ""
+    # assert app.get_router_prefix(conversations.router) == "/v1"
     assert app.get_router_prefix(conversations_v2.router) == "/v2"
+    assert app.get_router_prefix(conversations_v3.router) == "/v1"
+    assert app.get_router_prefix(metrics.router) == ""
diff --git a/tests/unit/models/responses/test_error_responses.py b/tests/unit/models/responses/test_error_responses.py
index 2e6ae99bd..e994e666d 100644
--- a/tests/unit/models/responses/test_error_responses.py
+++ b/tests/unit/models/responses/test_error_responses.py
@@ -11,6 +11,7 @@
     FORBIDDEN_DESCRIPTION,
     INTERNAL_SERVER_ERROR_DESCRIPTION,
     NOT_FOUND_DESCRIPTION,
+    PROMPT_TOO_LONG_DESCRIPTION,
     QUOTA_EXCEEDED_DESCRIPTION,
     SERVICE_UNAVAILABLE_DESCRIPTION,
     UNAUTHORIZED_DESCRIPTION,
@@ -21,6 +22,7 @@
     ForbiddenResponse,
     InternalServerErrorResponse,
     NotFoundResponse,
+    PromptTooLongResponse,
     QuotaExceededResponse,
     ServiceUnavailableResponse,
     UnauthorizedResponse,
@@ -655,6 +657,57 @@ def test_openapi_response_with_explicit_examples(self) -> None:
         assert "llama stack" in examples


+class TestPromptTooLongResponse:
+    """Test cases for PromptTooLongResponse."""
+
+    def test_constructor_with_default_response(self) -> None:
+        """Test PromptTooLongResponse with default response."""
+        response = PromptTooLongResponse(
+            cause="The prompt exceeds the maximum allowed length."
+        )
+        assert isinstance(response, AbstractErrorResponse)
+        assert response.status_code == status.HTTP_413_REQUEST_ENTITY_TOO_LARGE
+        assert isinstance(response.detail, DetailModel)
+        assert response.detail.response == "Prompt is too long"
+        assert response.detail.cause == "The prompt exceeds the maximum allowed length."
+ + def test_openapi_response(self) -> None: + """Test PromptTooLongResponse.openapi_response() method.""" + schema = PromptTooLongResponse.model_json_schema() + model_examples = schema.get("examples", []) + expected_count = len(model_examples) + + result = PromptTooLongResponse.openapi_response() + assert result["description"] == PROMPT_TOO_LONG_DESCRIPTION + assert result["model"] == PromptTooLongResponse + assert "examples" in result["content"]["application/json"] + examples = result["content"]["application/json"]["examples"] + + # Verify example count matches schema examples count + assert len(examples) == expected_count + assert expected_count == 1 + + # Verify example structure + assert "prompt too long" in examples + prompt_example = examples["prompt too long"] + assert "value" in prompt_example + assert "detail" in prompt_example["value"] + assert prompt_example["value"]["detail"]["response"] == "Prompt is too long" + assert ( + prompt_example["value"]["detail"]["cause"] + == "The prompt exceeds the maximum allowed length." + ) + + def test_openapi_response_with_explicit_examples(self) -> None: + """Test PromptTooLongResponse.openapi_response() with explicit examples.""" + result = PromptTooLongResponse.openapi_response(examples=["prompt too long"]) + examples = result["content"]["application/json"]["examples"] + + # Verify only 1 example is returned when explicitly specified + assert len(examples) == 1 + assert "prompt too long" in examples + + class TestAbstractErrorResponse: # pylint: disable=too-few-public-methods """Test cases for AbstractErrorResponse edge cases.""" diff --git a/tests/unit/models/responses/test_query_response.py b/tests/unit/models/responses/test_query_response.py index 68333616b..050f91ef8 100644 --- a/tests/unit/models/responses/test_query_response.py +++ b/tests/unit/models/responses/test_query_response.py @@ -1,6 +1,9 @@ """Unit tests for QueryResponse model.""" -from models.responses import QueryResponse, RAGChunk, ToolCall, ReferencedDocument +from pydantic import AnyUrl + +from models.responses import QueryResponse, ReferencedDocument +from utils.types import ToolCallSummary, ToolResultSummary class TestQueryResponse: @@ -8,7 +11,7 @@ class TestQueryResponse: def test_constructor(self) -> None: """Test the QueryResponse constructor.""" - qr = QueryResponse( + qr = QueryResponse( # type: ignore[call-arg] conversation_id="123e4567-e89b-12d3-a456-426614174000", response="LLM answer", ) @@ -17,89 +20,33 @@ def test_constructor(self) -> None: def test_optional_conversation_id(self) -> None: """Test the QueryResponse with default conversation ID.""" - qr = QueryResponse(response="LLM answer") + qr = QueryResponse(response="LLM answer") # type: ignore[call-arg] assert qr.conversation_id is None assert qr.response == "LLM answer" - def test_rag_chunks_empty_by_default(self) -> None: - """Test that rag_chunks is empty by default.""" - qr = QueryResponse(response="LLM answer") - assert not qr.rag_chunks - - def test_rag_chunks_with_data(self) -> None: - """Test QueryResponse with RAG chunks.""" - rag_chunks = [ - RAGChunk( - content="Kubernetes is an open-source container orchestration system", - source="kubernetes-docs/overview.md", - score=0.95, - ), - RAGChunk( - content="Container orchestration automates deployment and management", - source="kubernetes-docs/concepts.md", - score=0.87, - ), - ] - - qr = QueryResponse( - conversation_id="123e4567-e89b-12d3-a456-426614174000", - response="LLM answer with RAG context", - rag_chunks=rag_chunks, - ) - - assert 
len(qr.rag_chunks) == 2 - assert ( - qr.rag_chunks[0].content - == "Kubernetes is an open-source container orchestration system" - ) - assert qr.rag_chunks[0].source == "kubernetes-docs/overview.md" - assert qr.rag_chunks[0].score == 0.95 - assert ( - qr.rag_chunks[1].content - == "Container orchestration automates deployment and management" - ) - assert qr.rag_chunks[1].source == "kubernetes-docs/concepts.md" - assert qr.rag_chunks[1].score == 0.87 - - def test_rag_chunks_with_optional_fields(self) -> None: - """Test RAG chunks with optional source and score fields.""" - rag_chunks = [ - RAGChunk(content="Some content without source or score"), - RAGChunk(content="Content with source only", source="docs/guide.md"), - RAGChunk(content="Content with score only", score=0.75), - ] - - qr = QueryResponse(response="LLM answer", rag_chunks=rag_chunks) - - assert len(qr.rag_chunks) == 3 - assert qr.rag_chunks[0].source is None - assert qr.rag_chunks[0].score is None - assert qr.rag_chunks[1].source == "docs/guide.md" - assert qr.rag_chunks[1].score is None - assert qr.rag_chunks[2].source is None - assert qr.rag_chunks[2].score == 0.75 - def test_complete_query_response_with_all_fields(self) -> None: - """Test QueryResponse with all fields including RAG chunks, tool calls, and docs.""" - rag_chunks = [ - RAGChunk( - content="OLM is a component of the Operator Framework toolkit", - source="kubernetes-docs/operators.md", - score=0.95, + """Test QueryResponse with all fields including tool calls, and tool results.""" + tool_calls = [ + ToolCallSummary( + id="call-1", + name="knowledge_search", + args={"query": "operator lifecycle manager"}, + type="tool_call", ) ] - - tool_calls = [ - ToolCall( - tool_name="knowledge_search", - arguments={"query": "operator lifecycle manager"}, - result={"chunks_found": 5}, + tool_results = [ + ToolResultSummary( + id="call-1", + status="success", + content={"chunks_found": 5}, + type="tool_result", + round=1, ) ] referenced_documents = [ ReferencedDocument( - doc_url=( + doc_url=AnyUrl( "https://docs.openshift.com/container-platform/4.15/operators/olm/index.html" ), doc_title="Operator Lifecycle Manager (OLM)", @@ -109,21 +56,31 @@ def test_complete_query_response_with_all_fields(self) -> None: qr = QueryResponse( conversation_id="123e4567-e89b-12d3-a456-426614174000", response="Operator Lifecycle Manager (OLM) helps users install...", - rag_chunks=rag_chunks, tool_calls=tool_calls, + tool_results=tool_results, referenced_documents=referenced_documents, + truncated=False, + input_tokens=100, + output_tokens=50, + available_quotas={"daily": 1000}, ) assert qr.conversation_id == "123e4567-e89b-12d3-a456-426614174000" assert qr.response == "Operator Lifecycle Manager (OLM) helps users install..." 
- assert len(qr.rag_chunks) == 1 - assert ( - qr.rag_chunks[0].content - == "OLM is a component of the Operator Framework toolkit" - ) + assert qr.tool_calls is not None assert len(qr.tool_calls) == 1 - assert qr.tool_calls[0].tool_name == "knowledge_search" + assert qr.tool_calls[0].name == "knowledge_search" + assert qr.tool_results is not None + assert len(qr.tool_results) == 1 + assert qr.tool_results[0].status == "success" + assert qr.tool_results[0].content == {"chunks_found": 5} + assert qr.tool_results[0].type == "tool_result" + assert qr.tool_results[0].round == 1 assert len(qr.referenced_documents) == 1 assert ( qr.referenced_documents[0].doc_title == "Operator Lifecycle Manager (OLM)" ) + assert qr.truncated is False + assert qr.input_tokens == 100 + assert qr.output_tokens == 50 + assert qr.available_quotas == {"daily": 1000} diff --git a/tests/unit/models/responses/test_rag_chunk.py b/tests/unit/models/responses/test_rag_chunk.py index bec534d37..17081a993 100644 --- a/tests/unit/models/responses/test_rag_chunk.py +++ b/tests/unit/models/responses/test_rag_chunk.py @@ -1,6 +1,6 @@ """Unit tests for RAGChunk model.""" -from models.responses import RAGChunk +from utils.types import RAGChunk class TestRAGChunk: diff --git a/tests/unit/models/responses/test_successful_responses.py b/tests/unit/models/responses/test_successful_responses.py index 470c5d57c..2e7056245 100644 --- a/tests/unit/models/responses/test_successful_responses.py +++ b/tests/unit/models/responses/test_successful_responses.py @@ -1,4 +1,4 @@ -# pylint: disable=unsupported-membership-test,unsubscriptable-object +# pylint: disable=unsupported-membership-test,unsubscriptable-object, too-many-lines """Unit tests for all successful response models.""" @@ -34,14 +34,14 @@ ProviderResponse, ProvidersListResponse, QueryResponse, - RAGChunk, ReadinessResponse, ReferencedDocument, ShieldsResponse, StatusResponse, - ToolCall, + StreamingQueryResponse, ToolsResponse, ) +from utils.types import ToolCallSummary, ToolResultSummary class TestModelsResponse: @@ -268,8 +268,8 @@ def test_constructor_minimal(self) -> None: assert isinstance(response_obj, AbstractSuccessfulResponse) assert response_obj.response == "Test response" assert response_obj.conversation_id is None - assert response_obj.rag_chunks == [] assert response_obj.tool_calls is None + assert response_obj.tool_results is None assert response_obj.referenced_documents == [] assert response_obj.truncated is False assert response_obj.input_tokens == 0 @@ -278,9 +278,19 @@ def test_constructor_minimal(self) -> None: def test_constructor_full(self) -> None: """Test QueryResponse with all fields.""" - rag_chunks = [RAGChunk(content="chunk1", source="doc1", score=0.9)] tool_calls = [ - ToolCall(tool_name="tool1", arguments={"arg": "value"}, result=None) + ToolCallSummary( + id="call-1", name="tool1", args={"arg": "value"}, type="tool_call" + ) + ] + tool_results = [ + ToolResultSummary( + id="call-1", + status="success", + content={"chunks_found": 5}, + type="tool_result", + round=1, + ) ] referenced_docs = [ ReferencedDocument(doc_url=AnyUrl("https://example.com"), doc_title="Doc") @@ -289,8 +299,8 @@ def test_constructor_full(self) -> None: response = QueryResponse( # type: ignore[call-arg] conversation_id="conv-123", response="Test response", - rag_chunks=rag_chunks, tool_calls=tool_calls, + tool_results=tool_results, referenced_documents=referenced_docs, truncated=True, input_tokens=100, @@ -298,7 +308,6 @@ def test_constructor_full(self) -> None: 
available_quotas={"daily": 1000}, ) assert response.conversation_id == "conv-123" - assert response.rag_chunks == rag_chunks assert response.tool_calls == tool_calls assert response.referenced_documents == referenced_docs assert response.truncated is True @@ -956,6 +965,35 @@ def test_openapi_response(self) -> None: assert expected_count == 1 +class TestStreamingQueryResponse: + """Test cases for StreamingQueryResponse.""" + + def test_openapi_response_structure(self) -> None: + """Test that openapi_response() returns correct structure.""" + result = StreamingQueryResponse.openapi_response() + + assert "description" in result + assert "content" in result + assert result["description"] == "Successful response" + assert "model" not in result + + assert "text/event-stream" in result["content"] + content = result["content"]["text/event-stream"] + assert "schema" in content + assert "example" in content + + schema = content["schema"] + assert schema["type"] == "string" + assert schema["format"] == "text/event-stream" + + def test_model_json_schema_has_examples(self) -> None: + """Test that model_json_schema() includes examples.""" + schema = StreamingQueryResponse.model_json_schema() + assert "examples" in schema + assert len(schema["examples"]) == 1 + assert isinstance(schema["examples"][0], str) + + class TestAbstractSuccessfulResponseOpenAPI: """Test cases for AbstractSuccessfulResponse.openapi_response() edge cases.""" diff --git a/tests/unit/utils/test_endpoints.py b/tests/unit/utils/test_endpoints.py index 2ddae8f2c..c0641685d 100644 --- a/tests/unit/utils/test_endpoints.py +++ b/tests/unit/utils/test_endpoints.py @@ -258,7 +258,7 @@ def test_get_profile_prompt_with_enabled_query_system_prompt( assert system_prompt == query_request_with_system_prompt.system_prompt -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_get_agent_with_conversation_id( prepare_agent_mocks: AgentFixtures, mocker: MockerFixture ) -> None: @@ -293,7 +293,7 @@ async def test_get_agent_with_conversation_id( assert result_session_id == "test_session_id" -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_get_agent_with_conversation_id_and_no_agent_in_llama_stack( setup_configuration: AppConfig, prepare_agent_mocks: AgentFixtures, @@ -353,7 +353,7 @@ async def test_get_agent_with_conversation_id_and_no_agent_in_llama_stack( ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_get_agent_no_conversation_id( setup_configuration: AppConfig, prepare_agent_mocks: AgentFixtures, @@ -409,7 +409,7 @@ async def test_get_agent_no_conversation_id( ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_get_agent_empty_shields( setup_configuration: AppConfig, prepare_agent_mocks: AgentFixtures, @@ -465,7 +465,7 @@ async def test_get_agent_empty_shields( ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_get_agent_multiple_mcp_servers( setup_configuration: AppConfig, prepare_agent_mocks: AgentFixtures, @@ -523,7 +523,7 @@ async def test_get_agent_multiple_mcp_servers( ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_get_agent_session_persistence_enabled( setup_configuration: AppConfig, prepare_agent_mocks: AgentFixtures, @@ -574,7 +574,7 @@ async def test_get_agent_session_persistence_enabled( ) -@pytest.mark.asyncio +@pytest.mark.skip(reason="Deprecated API test") async def test_get_agent_no_tools_no_parser( 
     setup_configuration: AppConfig,
     prepare_agent_mocks: AgentFixtures,
@@ -631,7 +631,7 @@ async def test_get_agent_no_tools_no_parser(
     )


-@pytest.mark.asyncio
+@pytest.mark.skip(reason="Deprecated API test")
 async def test_get_agent_no_tools_false_preserves_parser(
     setup_configuration: AppConfig,
     prepare_agent_mocks: AgentFixtures,
@@ -693,7 +693,7 @@ async def test_get_agent_no_tools_false_preserves_parser(
     )


-@pytest.mark.asyncio
+@pytest.mark.skip(reason="Deprecated API test")
 async def test_get_temp_agent_basic_functionality(
     prepare_agent_mocks: AgentFixtures, mocker: MockerFixture
 ) -> None:
@@ -734,7 +734,7 @@ async def test_get_temp_agent_basic_functionality(
     mock_agent.create_session.assert_called_once()


-@pytest.mark.asyncio
+@pytest.mark.skip(reason="Deprecated API test")
 async def test_get_temp_agent_returns_valid_ids(
     prepare_agent_mocks: AgentFixtures, mocker: MockerFixture
 ) -> None:
@@ -769,7 +769,7 @@ async def test_get_temp_agent_returns_valid_ids(
     assert result_conversation_id == result_agent.agent_id


-@pytest.mark.asyncio
+@pytest.mark.skip(reason="Deprecated API test")
 async def test_get_temp_agent_no_persistence(
     prepare_agent_mocks: AgentFixtures, mocker: MockerFixture
 ) -> None:
diff --git a/tests/unit/utils/test_transcripts.py b/tests/unit/utils/test_transcripts.py
index 83fc2ecf9..cbe2e5827 100644
--- a/tests/unit/utils/test_transcripts.py
+++ b/tests/unit/utils/test_transcripts.py
@@ -10,7 +10,7 @@
     construct_transcripts_path,
     store_transcript,
 )
-from utils.types import ToolCallSummary, TurnSummary
+from utils.types import ToolCallSummary, ToolResultSummary, TurnSummary


 def test_construct_transcripts_path(mocker: MockerFixture) -> None:
@@ -70,17 +70,29 @@ def test_store_transcript(mocker: MockerFixture) -> None:
     query = "What is OpenStack?"
model = "fake-model" provider = "fake-provider" - query_request = QueryRequest(query=query, model=model, provider=provider) + query_request = QueryRequest( # type: ignore[call-arg] + query=query, model=model, provider=provider + ) summary = TurnSummary( llm_response="LLM answer", tool_calls=[ ToolCallSummary( id="123", name="test-tool", - args="testing", - response="tool response", + args={"testing": "testing"}, + type="tool_call", + ) + ], + tool_results=[ + ToolResultSummary( + id="123", + status="success", + content="tool response", + type="tool_result", + round=1, ) ], + rag_chunks=[], ) query_is_valid = True rag_chunks: list[dict] = [] @@ -124,8 +136,17 @@ def test_store_transcript(mocker: MockerFixture) -> None: { "id": "123", "name": "test-tool", - "args": "testing", - "response": "tool response", + "args": {"testing": "testing"}, + "type": "tool_call", + } + ], + "tool_results": [ + { + "id": "123", + "status": "success", + "content": "tool response", + "type": "tool_result", + "round": 1, } ], }, diff --git a/uv.lock b/uv.lock index cffa67fb4..a4101e7e6 100644 --- a/uv.lock +++ b/uv.lock @@ -652,18 +652,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b0/0d/9feae160378a3553fa9a339b0e9c1a048e147a4127210e286ef18b730f03/durationpy-0.10-py3-none-any.whl", hash = "sha256:3b41e1b601234296b4fb368338fdcd3e13e0b4fb5b67345948f4f2bf9868b286", size = 3922, upload-time = "2025-05-17T13:52:36.463Z" }, ] -[[package]] -name = "ecdsa" -version = "0.19.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "six" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c0/1f/924e3caae75f471eae4b26bd13b698f6af2c44279f67af317439c2f4c46a/ecdsa-0.19.1.tar.gz", hash = "sha256:478cba7b62555866fcb3bb3fe985e06decbdb68ef55713c4e5ab98c57d508e61", size = 201793, upload-time = "2025-03-13T11:52:43.25Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cb/a3/460c57f094a4a165c84a1341c373b0a4f5ec6ac244b998d5021aade89b77/ecdsa-0.19.1-py2.py3-none-any.whl", hash = "sha256:30638e27cf77b7e15c4c4cc1973720149e1033827cfd00661ca5c8cc0cdb24c3", size = 150607, upload-time = "2025-03-13T11:52:41.757Z" }, -] - [[package]] name = "email-validator" version = "2.3.0" @@ -1450,8 +1438,8 @@ requires-dist = [ { name = "jsonpath-ng", specifier = ">=1.6.1" }, { name = "kubernetes", specifier = ">=30.1.0" }, { name = "litellm", specifier = ">=1.75.5.post1" }, - { name = "llama-stack", specifier = "==0.2.22" }, - { name = "llama-stack-client", specifier = "==0.2.22" }, + { name = "llama-stack", specifier = "==0.3.0" }, + { name = "llama-stack-client", specifier = "==0.3.0" }, { name = "openai", specifier = ">=1.99.9" }, { name = "prometheus-client", specifier = ">=0.22.1" }, { name = "psycopg2-binary", specifier = ">=2.9.10" }, @@ -1546,7 +1534,7 @@ wheels = [ [[package]] name = "llama-stack" -version = "0.2.22" +version = "0.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -1556,7 +1544,6 @@ dependencies = [ { name = "fire" }, { name = "h11" }, { name = "httpx" }, - { name = "huggingface-hub" }, { name = "jinja2" }, { name = "jsonschema" }, { name = "llama-stack-client" }, @@ -1566,23 +1553,24 @@ dependencies = [ { name = "pillow" }, { name = "prompt-toolkit" }, { name = "pydantic" }, + { name = "pyjwt", extra = ["crypto"] }, { name = "python-dotenv" }, - { name = "python-jose", extra = ["cryptography"] }, { name = "python-multipart" }, { name = "rich" }, + { name = "sqlalchemy", extra = ["asyncio"] }, { name = "starlette" 
}, { name = "termcolor" }, { name = "tiktoken" }, { name = "uvicorn" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/6b/cf/c4bccdb6e218f3fda1d50aad87bf08376372c56ddc523e35f5a629c725e1/llama_stack-0.2.22.tar.gz", hash = "sha256:576752dedc9e9f0fb9da69f373d677d8b4f2ae4203428f676fa039b6813d8450", size = 3334595, upload-time = "2025-09-16T19:43:41.842Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/c7/47963861f4f7f68dff6d82e4d8c697943b625b14ae73dce1d228ea72b9b4/llama_stack-0.3.0.tar.gz", hash = "sha256:8277c54cf4a283077143a0804128f2c76f1ec9660116353176c77b659206d315", size = 3317843, upload-time = "2025-10-21T23:58:35.103Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a9/42/5ae8be5371367beb9c8e38966cd941022c072fb2133660bf0eabc7b5d08b/llama_stack-0.2.22-py3-none-any.whl", hash = "sha256:c6bbda6b5a4417b9a73ed36b9d581fd7ec689090ceefd084d9a078e7acbdc670", size = 3669928, upload-time = "2025-09-16T19:43:40.391Z" }, + { url = "https://files.pythonhosted.org/packages/5e/05/3602d881ae6d174ac557e1ccac1572cbc087cd2178a2b77390320ffec47d/llama_stack-0.3.0-py3-none-any.whl", hash = "sha256:c2b999dced8970f3590ecd7eca50bef1bc0c052eec15b8aba78a5c17a0a4051d", size = 3629351, upload-time = "2025-10-21T23:58:33.677Z" }, ] [[package]] name = "llama-stack-client" -version = "0.2.22" +version = "0.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio" }, @@ -1601,9 +1589,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/60/80/4260816bfaaa889d515206c9df4906d08d405bf94c9b4d1be399b1923e46/llama_stack_client-0.2.22.tar.gz", hash = "sha256:9a0bc756b91ebd539858eeaf1f231c5e5c6900e1ea4fcced726c6717f3d27ca7", size = 318309, upload-time = "2025-09-16T19:43:33.212Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1d/d9/3c720f420fc80ce51de1a0ad90c53edc613617b68980137dcf716a86198a/llama_stack_client-0.3.0.tar.gz", hash = "sha256:1e974a74d0da285e18ba7df30b9a324e250782b130253bcef3e695830c5bb03d", size = 340443, upload-time = "2025-10-21T23:58:25.855Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/8e/1ebf6ac0dbb62b81038e856ed00768e283d927b14fcd614e3018a227092b/llama_stack_client-0.2.22-py3-none-any.whl", hash = "sha256:b260d73aec56fcfd8fa601b3b34c2f83c4fbcfb7261a246b02bbdf6c2da184fe", size = 369901, upload-time = "2025-09-16T19:43:32.089Z" }, + { url = "https://files.pythonhosted.org/packages/96/27/1c65035ce58100be22409c98e4d65b1cdaeff7811ea968f9f844641330d7/llama_stack_client-0.3.0-py3-none-any.whl", hash = "sha256:9f85d84d508ef7da44b96ca8555d7783da717cfc9135bab6a5530fe8c852690d", size = 425234, upload-time = "2025-10-21T23:58:24.246Z" }, ] [[package]] @@ -2872,25 +2860,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" }, ] -[[package]] -name = "python-jose" -version = "3.5.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "ecdsa" }, - { name = "pyasn1" }, - { name = "rsa" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/c6/77/3a1c9039db7124eb039772b935f2244fbb73fc8ee65b9acf2375da1c07bf/python_jose-3.5.0.tar.gz", hash = "sha256:fb4eaa44dbeb1c26dcc69e4bd7ec54a1cb8dd64d3b4d81ef08d90ff453f2b01b", size = 92726, upload-time = 
"2025-05-28T17:31:54.288Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d9/c3/0bd11992072e6a1c513b16500a5d07f91a24017c5909b02c72c62d7ad024/python_jose-3.5.0-py2.py3-none-any.whl", hash = "sha256:abd1202f23d34dfad2c3d28cb8617b90acf34132c7afd60abd0b0b7d3cb55771", size = 34624, upload-time = "2025-05-28T17:31:52.802Z" }, -] - -[package.optional-dependencies] -cryptography = [ - { name = "cryptography" }, -] - [[package]] name = "python-multipart" version = "0.0.20" @@ -3397,6 +3366,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9c/5e/6a29fa884d9fb7ddadf6b69490a9d45fded3b38541713010dad16b77d015/sqlalchemy-2.0.44-py3-none-any.whl", hash = "sha256:19de7ca1246fbef9f9d1bff8f1ab25641569df226364a0e40457dc5457c54b05", size = 1928718, upload-time = "2025-10-10T15:29:45.32Z" }, ] +[package.optional-dependencies] +asyncio = [ + { name = "greenlet" }, +] + [[package]] name = "sse-starlette" version = "3.0.3"