From 3ed60950ada2df41fedba306a367cc2b36fd04d0 Mon Sep 17 00:00:00 2001
From: Andrej Simurka
Date: Thu, 6 Nov 2025 14:58:06 +0100
Subject: [PATCH] Added quota limit exception handling

---
 docs/openapi.json                          | 87 +++++++++++++++++++
 pyproject.toml                             |  3 +-
 src/app/endpoints/query.py                 | 15 ++++
 src/app/endpoints/query_v2.py              |  5 ++
 src/app/endpoints/streaming_query.py       | 20 ++++-
 src/models/responses.py                    | 68 +++++++++++++++
 tests/unit/app/endpoints/test_query.py     | 44 ++++++++++
 tests/unit/app/endpoints/test_query_v2.py  | 45 ++++++++++
 .../app/endpoints/test_streaming_query.py  | 44 ++++++++++
 uv.lock                                    |  4 +-
 10 files changed, 330 insertions(+), 5 deletions(-)

diff --git a/docs/openapi.json b/docs/openapi.json
index 529d9b417..82ada88a9 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -375,6 +375,16 @@
             }
           }
         },
+        "429": {
+          "description": "The quota has been exceeded",
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/QuotaExceededResponse"
+              }
+            }
+          }
+        },
         "500": {
           "description": "Internal Server Error",
           "detail": {
@@ -461,6 +471,16 @@
             }
           }
         },
+        "429": {
+          "description": "The quota has been exceeded",
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/QuotaExceededResponse"
+              }
+            }
+          }
+        },
         "500": {
           "description": "Internal Server Error",
           "detail": {
@@ -1256,6 +1276,16 @@
             }
           }
         },
+        "429": {
+          "description": "The quota has been exceeded",
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/QuotaExceededResponse"
+              }
+            }
+          }
+        },
         "500": {
           "description": "Internal Server Error",
           "detail": {
@@ -3577,6 +3607,63 @@
        }
       ]
     },
+    "QuotaExceededResponse": {
+      "properties": {
+        "detail": {
+          "$ref": "#/components/schemas/DetailModel"
+        }
+      },
+      "type": "object",
+      "required": [
+        "detail"
+      ],
+      "title": "QuotaExceededResponse",
+      "description": "429 Too Many Requests - LLM quota exceeded.",
+      "examples": [
+        {
+          "detail": {
+            "cause": "User 123 has no available tokens.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "Cluster has no available tokens.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "Unknown subject 999 has no available tokens.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "User 123 has 5 tokens, but 10 tokens are needed.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "Cluster has 500 tokens, but 900 tokens are needed.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed.",
+            "response": "The quota has been exceeded"
+          }
+        },
+        {
+          "detail": {
+            "cause": "The token quota for model gpt-4-turbo has been exceeded.",
+            "response": "The model quota has been exceeded"
+          }
+        }
+      ]
+    },
     "QuotaHandlersConfiguration": {
       "properties": {
         "sqlite": {
diff --git a/pyproject.toml b/pyproject.toml
index 079b4f12a..5c8ff3944 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,7 @@ dependencies = [
     # Used by authorization resolvers
     "jsonpath-ng>=1.6.1",
     "psycopg2-binary>=2.9.10",
+    "litellm>=1.75.5.post1",
 ]


@@ -129,8 +130,6 @@ llslibdev = [
     "langdetect>=1.0.9",
     "emoji>=2.1.0",
     "nltk>=3.8.1",
-    # API inference: remote::gemini
-    "litellm>=1.75.5.post1",
     # API inference: inline::sentence-transformers
     "sentence-transformers>=5.0.0",
     # API vector_io: inline::faiss
diff --git a/src/app/endpoints/query.py b/src/app/endpoints/query.py
index 8040ed7eb..4993da1a6 100644
--- a/src/app/endpoints/query.py
+++ b/src/app/endpoints/query.py
@@ -8,6 +8,7 @@
 from typing import Annotated, Any, Optional, cast

 from fastapi import APIRouter, Depends, HTTPException, Request, status
+from litellm.exceptions import RateLimitError
 from llama_stack_client import (
     APIConnectionError,
     AsyncLlamaStackClient,  # type: ignore
@@ -42,6 +43,7 @@
     ReferencedDocument,
     ToolCall,
     UnauthorizedResponse,
+    QuotaExceededResponse,
 )
 from utils.endpoints import (
     check_configuration_loaded,
@@ -86,6 +88,10 @@
         "description": "Client does not have permission to access conversation",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",
@@ -421,6 +427,15 @@ async def query_endpoint_handler_base(  # pylint: disable=R0914
                 "cause": str(e),
             },
         ) from e
+    except RateLimitError as e:
+        used_model = getattr(e, "model", "unknown")
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail={
+                "response": "Model quota exceeded",
+                "cause": f"The token quota for model {used_model} has been exceeded.",
+            },
+        ) from e


 @router.post("/query", responses=query_response)
diff --git a/src/app/endpoints/query_v2.py b/src/app/endpoints/query_v2.py
index 3ebcf4f8d..42f93709a 100644
--- a/src/app/endpoints/query_v2.py
+++ b/src/app/endpoints/query_v2.py
@@ -27,6 +27,7 @@
     QueryResponse,
     ReferencedDocument,
     UnauthorizedResponse,
+    QuotaExceededResponse,
 )
 from utils.endpoints import (
     get_system_prompt,
@@ -59,6 +60,10 @@
         "description": "Client does not have permission to access conversation",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",
diff --git a/src/app/endpoints/streaming_query.py b/src/app/endpoints/streaming_query.py
index c1d2c0492..abc661c46 100644
--- a/src/app/endpoints/streaming_query.py
+++ b/src/app/endpoints/streaming_query.py
@@ -8,6 +8,7 @@
 from datetime import UTC, datetime
 from typing import Annotated, Any, AsyncGenerator, AsyncIterator, Iterator, cast

+from litellm.exceptions import RateLimitError
 from fastapi import APIRouter, Depends, HTTPException, Request, status
 from fastapi.responses import StreamingResponse
 from llama_stack_client import (
@@ -48,7 +49,11 @@
 from models.config import Action
 from models.database.conversations import UserConversation
 from models.requests import QueryRequest
-from models.responses import ForbiddenResponse, UnauthorizedResponse
+from models.responses import (
+    ForbiddenResponse,
+    UnauthorizedResponse,
+    QuotaExceededResponse,
+)
 from utils.endpoints import (
     check_configuration_loaded,
     create_referenced_documents_with_metadata,
@@ -104,6 +109,10 @@
         "description": "Client does not have permission to access conversation",
         "model": ForbiddenResponse,
     },
+    429: {
+        "description": "The quota has been exceeded",
+        "model": QuotaExceededResponse,
+    },
     500: {
         "detail": {
             "response": "Unable to connect to Llama Stack",
@@ -922,6 +931,15 @@ async def response_generator(
                 "cause": str(e),
             },
         ) from e
+    except RateLimitError as e:
+        used_model = getattr(e, "model", "unknown")
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail={
+                "response": "Model quota exceeded",
+                "cause": f"The token quota for model {used_model} has been exceeded.",
+            },
+        ) from e
     except Exception as e:  # pylint: disable=broad-except
         # Handle other errors with OLS-compatible error response
         # This broad exception catch is intentional to ensure all errors
diff --git a/src/models/responses.py b/src/models/responses.py
index 1c03bbe84..c3a32d244 100644
--- a/src/models/responses.py
+++ b/src/models/responses.py
@@ -1142,6 +1142,74 @@ def __init__(self, user_id: str, resource: str, resource_id: str):
         }


+class QuotaExceededResponse(AbstractErrorResponse):
+    """429 Too Many Requests - LLM quota exceeded."""
+
+    def __init__(
+        self,
+        user_id: str,
+        model_name: str,  # pylint: disable=unused-argument
+        limit: int,  # pylint: disable=unused-argument
+    ):
+        """Initialize a QuotaExceededResponse."""
+        super().__init__(
+            detail=DetailModel(
+                response="The quota has been exceeded",
+                cause=(f"User {user_id} has no available tokens."),
+            )
+        )
+        # TODO(LCORE-837): add factories for custom cause creation
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "User 123 has no available tokens.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Cluster has no available tokens.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Unknown subject 999 has no available tokens.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "User 123 has 5 tokens, but 10 tokens are needed.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Cluster has 500 tokens, but 900 tokens are needed.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The quota has been exceeded",
+                        "cause": "Unknown subject 999 has 3 tokens, but 6 tokens are needed.",
+                    }
+                },
+                {
+                    "detail": {
+                        "response": "The model quota has been exceeded",
+                        "cause": "The token quota for model gpt-4-turbo has been exceeded.",
+                    }
+                },
+            ]
+        }
+    }
+
+
 class InvalidFeedbackStoragePathResponse(AbstractErrorResponse):
     """500 Internal Error - Invalid feedback storage path."""

diff --git a/tests/unit/app/endpoints/test_query.py b/tests/unit/app/endpoints/test_query.py
index 0b18dc438..9b10d970e 100644
--- a/tests/unit/app/endpoints/test_query.py
+++ b/tests/unit/app/endpoints/test_query.py
@@ -10,6 +10,7 @@

 import pytest
 from pytest_mock import MockerFixture
 from fastapi import HTTPException, Request, status
+from litellm.exceptions import RateLimitError
 from llama_stack_client import APIConnectionError
 from llama_stack_client.types import UserMessage  # type: ignore
@@ -2261,3 +2262,46 @@ async def test_get_topic_summary_create_turn_parameters(mocker: MockerFixture) -> None:
         stream=False,
         toolgroups=None,
     )
+
+
+@pytest.mark.asyncio
+async def test_query_endpoint_quota_exceeded(
+    mocker: MockerFixture, dummy_request: Request
+) -> None:
+    """Test that query endpoint raises HTTP 429 when model quota is exceeded."""
+    query_request = QueryRequest(
+        query="What is OpenStack?",
+        provider="openai",
+        model="gpt-4-turbo",
+    )  # type: ignore
+    mock_client = mocker.AsyncMock()
+    mock_agent = mocker.AsyncMock()
+    mock_agent.create_turn.side_effect = RateLimitError(
+        model="gpt-4-turbo", llm_provider="openai", message=""
+    )
+    mocker.patch(
+        "app.endpoints.query.get_agent",
+        return_value=(mock_agent, "conv-123", "sess-123"),
+    )
+    mocker.patch(
+        "app.endpoints.query.select_model_and_provider_id",
+        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai"),
+    )
+    mocker.patch("app.endpoints.query.validate_model_provider_override")
+    mocker.patch(
+        "client.AsyncLlamaStackClientHolder.get_client",
+        return_value=mock_client,
+    )
+    mocker.patch(
+        "app.endpoints.query.handle_mcp_headers_with_toolgroups", return_value={}
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await query_endpoint_handler(
+            dummy_request, query_request=query_request, auth=MOCK_AUTH
+        )
+    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
+    detail = exc_info.value.detail
+    assert isinstance(detail, dict)
+    assert detail["response"] == "Model quota exceeded"  # type: ignore
+    assert "gpt-4-turbo" in detail["cause"]  # type: ignore
diff --git a/tests/unit/app/endpoints/test_query_v2.py b/tests/unit/app/endpoints/test_query_v2.py
index 3b214e121..1436bb34d 100644
--- a/tests/unit/app/endpoints/test_query_v2.py
+++ b/tests/unit/app/endpoints/test_query_v2.py
@@ -2,6 +2,7 @@
 """Unit tests for the /query (v2) REST API endpoint using Responses API."""

 from typing import Any
+from litellm.exceptions import RateLimitError
 import pytest
 from pytest_mock import MockerFixture
 from fastapi import HTTPException, status, Request
@@ -18,6 +19,14 @@
     query_endpoint_handler_v2,
 )

+# User ID must be proper UUID
+MOCK_AUTH = (
+    "00000001-0001-0001-0001-000000000001",
+    "mock_username",
+    False,
+    "mock_token",
+)
+

 @pytest.fixture
 def dummy_request() -> Request:
@@ -432,3 +441,39 @@ def _raise(*_args: Any, **_kwargs: Any) -> Exception:
     assert exc.value.status_code == status.HTTP_500_INTERNAL_SERVER_ERROR
     assert "Unable to connect to Llama Stack" in str(exc.value.detail)
     fail_metric.inc.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_query_endpoint_quota_exceeded(
+    mocker: MockerFixture, dummy_request: Request
+) -> None:
+    """Test that query endpoint raises HTTP 429 when model quota is exceeded."""
+    query_request = QueryRequest(
+        query="What is OpenStack?",
+        provider="openai",
+        model="gpt-4-turbo",
+        attachments=[],
+    )  # type: ignore
+    mock_client = mocker.AsyncMock()
+    mock_client.responses.create.side_effect = RateLimitError(
+        model="gpt-4-turbo", llm_provider="openai", message=""
+    )
+    mocker.patch(
+        "app.endpoints.query.select_model_and_provider_id",
+        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai"),
+    )
+    mocker.patch("app.endpoints.query.validate_model_provider_override")
+    mocker.patch(
+        "client.AsyncLlamaStackClientHolder.get_client",
+        return_value=mock_client,
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await query_endpoint_handler_v2(
+            dummy_request, query_request=query_request, auth=MOCK_AUTH
+        )
+    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
+    detail = exc_info.value.detail
+    assert isinstance(detail, dict)
+    assert detail["response"] == "Model quota exceeded"  # type: ignore
+    assert "gpt-4-turbo" in detail["cause"]  # type: ignore
diff --git a/tests/unit/app/endpoints/test_streaming_query.py b/tests/unit/app/endpoints/test_streaming_query.py
index 5dd77ece4..2c82b8522 100644
--- a/tests/unit/app/endpoints/test_streaming_query.py
+++ b/tests/unit/app/endpoints/test_streaming_query.py
@@ -6,6 +6,7 @@

 import json

+from litellm.exceptions import RateLimitError
 import pytest
 from pytest_mock import MockerFixture

@@ -1795,6 +1796,49 @@ async def test_streaming_query_handles_none_event(mocker: MockerFixture) -> None:
     assert isinstance(response, StreamingResponse)


+@pytest.mark.asyncio
+async def test_query_endpoint_quota_exceeded(mocker: MockerFixture) -> None:
+    """Test that streaming query endpoint raises HTTP 429 when model quota is exceeded."""
+    query_request = QueryRequest(
+        query="What is OpenStack?",
+        provider="openai",
+        model="gpt-4-turbo",
+    )  # type: ignore
+    request = Request(scope={"type": "http"})
+    mock_client = mocker.AsyncMock()
+    mock_agent = mocker.AsyncMock()
+    mock_agent.create_turn.side_effect = RateLimitError(
+        model="gpt-4-turbo", llm_provider="openai", message=""
+    )
+    mocker.patch(
+        "app.endpoints.streaming_query.get_agent",
+        return_value=(mock_agent, "conv-123", "sess-123"),
+    )
+    mocker.patch(
+        "app.endpoints.streaming_query.select_model_and_provider_id",
+        return_value=("openai/gpt-4-turbo", "gpt-4-turbo", "openai"),
+    )
+    mocker.patch("app.endpoints.streaming_query.validate_model_provider_override")
+    mocker.patch(
+        "client.AsyncLlamaStackClientHolder.get_client",
+        return_value=mock_client,
+    )
+    mocker.patch(
+        "app.endpoints.streaming_query.handle_mcp_headers_with_toolgroups",
+        return_value={},
+    )
+
+    with pytest.raises(HTTPException) as exc_info:
+        await streaming_query_endpoint_handler(
+            request, query_request=query_request, auth=MOCK_AUTH
+        )
+    assert exc_info.value.status_code == status.HTTP_429_TOO_MANY_REQUESTS
+    detail = exc_info.value.detail
+    assert isinstance(detail, dict)
+    assert detail["response"] == "Model quota exceeded"  # type: ignore
+    assert "gpt-4-turbo" in detail["cause"]  # type: ignore
+
+
 # ============================================================================
 # OLS Compatibility Tests
 # ============================================================================
diff --git a/uv.lock b/uv.lock
index 5e6c2f629..acfc6dcd4 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1351,6 +1351,7 @@ dependencies = [
     { name = "fastapi" },
     { name = "jsonpath-ng" },
     { name = "kubernetes" },
+    { name = "litellm" },
     { name = "llama-stack" },
     { name = "llama-stack-client" },
     { name = "openai" },
@@ -1399,7 +1400,6 @@ llslibdev = [
     { name = "faiss-cpu" },
     { name = "fire" },
     { name = "langdetect" },
-    { name = "litellm" },
     { name = "matplotlib" },
     { name = "mcp" },
     { name = "nltk" },
@@ -1432,6 +1432,7 @@ requires-dist = [
     { name = "fastapi", specifier = ">=0.115.12" },
     { name = "jsonpath-ng", specifier = ">=1.6.1" },
     { name = "kubernetes", specifier = ">=30.1.0" },
+    { name = "litellm", specifier = ">=1.75.5.post1" },
     { name = "llama-stack", specifier = "==0.2.22" },
     { name = "llama-stack-client", specifier = "==0.2.22" },
     { name = "openai", specifier = ">=1.99.9" },
@@ -1480,7 +1481,6 @@ llslibdev = [
     { name = "faiss-cpu", specifier = ">=1.11.0" },
     { name = "fire", specifier = ">=0.7.0" },
     { name = "langdetect", specifier = ">=1.0.9" },
-    { name = "litellm", specifier = ">=1.75.5.post1" },
     { name = "matplotlib", specifier = ">=3.10.0" },
     { name = "mcp", specifier = ">=1.9.4" },
     { name = "nltk", specifier = ">=3.8.1" },
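
Usage note (illustrative, not part of the patch): with this change, the query endpoints
return HTTP 429 with a body of the form {"detail": {"response": ..., "cause": ...}}
when the upstream model reports a rate/quota limit. A minimal client-side sketch of
handling that response follows; the base URL, path, and payload are assumptions made
for the example, and the `requests` library is used only for illustration.

    import requests  # third-party HTTP client, used here only for the sketch

    # Hypothetical base URL and path; adjust to the actual deployment.
    resp = requests.post(
        "http://localhost:8080/v1/query",
        json={"query": "What is OpenStack?", "provider": "openai", "model": "gpt-4-turbo"},
        timeout=60,
    )

    if resp.status_code == 429:
        # FastAPI wraps the HTTPException detail as {"detail": {"response": ..., "cause": ...}}.
        detail = resp.json().get("detail", {})
        print(f"Quota exceeded: {detail.get('cause', 'no cause given')}")
    else:
        resp.raise_for_status()
        print(resp.json())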