Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 108 additions & 4 deletions docs/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -1787,6 +1787,9 @@
},
"type": "array",
"title": "Byok Rag"
},
"quota_handlers": {
"$ref": "#/components/schemas/QuotaHandlersConfiguration"
}
},
"additionalProperties": false,
Expand Down Expand Up @@ -3590,6 +3593,103 @@
}
]
},
"QuotaHandlersConfiguration": {
"properties": {
"sqlite": {
"anyOf": [
{
"$ref": "#/components/schemas/SQLiteDatabaseConfiguration"
},
{
"type": "null"
}
]
},
"postgres": {
"anyOf": [
{
"$ref": "#/components/schemas/PostgreSQLDatabaseConfiguration"
},
{
"type": "null"
}
]
},
"limiters": {
"items": {
"$ref": "#/components/schemas/QuotaLimiterConfiguration"
},
"type": "array",
"title": "Limiters"
},
"scheduler": {
"$ref": "#/components/schemas/QuotaSchedulerConfiguration"
},
"enable_token_history": {
"type": "boolean",
"title": "Enable Token History",
"default": false
}
},
"additionalProperties": false,
"type": "object",
"title": "QuotaHandlersConfiguration",
"description": "Quota limiter configuration."
},
"QuotaLimiterConfiguration": {
"properties": {
"type": {
"type": "string",
"enum": [
"user_limiter",
"cluster_limiter"
],
"title": "Type"
},
"name": {
"type": "string",
"title": "Name"
},
"initial_quota": {
"type": "integer",
"minimum": 0.0,
"title": "Initial Quota"
},
"quota_increase": {
"type": "integer",
"minimum": 0.0,
"title": "Quota Increase"
},
"period": {
"type": "string",
"title": "Period"
}
},
"additionalProperties": false,
"type": "object",
"required": [
"type",
"name",
"initial_quota",
"quota_increase",
"period"
],
"title": "QuotaLimiterConfiguration",
"description": "Configuration for one quota limiter."
},
"QuotaSchedulerConfiguration": {
"properties": {
"period": {
"type": "integer",
"exclusiveMinimum": 0.0,
"title": "Period",
"default": 1
}
},
"type": "object",
"title": "QuotaSchedulerConfiguration",
"description": "Quota scheduler configuration."
},
"RAGChunk": {
"properties": {
"content": {
Expand Down Expand Up @@ -3691,15 +3791,19 @@
"description": "URL of the referenced document"
},
"doc_title": {
"type": "string",
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Doc Title",
"description": "Title of the referenced document"
}
},
"type": "object",
"required": [
"doc_title"
],
"title": "ReferencedDocument",
"description": "Model representing a document referenced in generating a response.\n\nAttributes:\n doc_url: Url to the referenced doc.\n doc_title: Title of the referenced doc."
},
Expand Down
14 changes: 10 additions & 4 deletions src/app/endpoints/conversations_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,13 +314,19 @@ def check_conversation_existence(user_id: str, conversation_id: str) -> None:

def transform_chat_message(entry: CacheEntry) -> dict[str, Any]:
    """Convert a cached conversation entry into the response-payload message format.

    Builds a user message from the cached query and an assistant message from
    the cached response. When the cache entry carries referenced documents,
    each one is serialized in JSON mode and attached to the assistant message.
    """
    messages: list[dict[str, Any]] = [{"content": entry.query, "type": "user"}]

    assistant: dict[str, Any] = {"content": entry.response, "type": "assistant"}
    docs = entry.referenced_documents
    # Only attach the key when documents were actually recorded for this turn.
    if docs is not None:
        assistant["referenced_documents"] = [
            document.model_dump(mode="json") for document in docs
        ]
    messages.append(assistant)

    return {
        "provider": entry.provider,
        "model": entry.model,
        "messages": messages,
        "started_at": entry.started_at,
        "completed_at": entry.completed_at,
    }
19 changes: 13 additions & 6 deletions src/app/endpoints/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from authorization.middleware import authorize
from client import AsyncLlamaStackClientHolder
from configuration import configuration
from models.cache_entry import CacheEntry
from models.config import Action
from models.database.conversations import UserConversation
from models.requests import Attachment, QueryRequest
Expand Down Expand Up @@ -331,16 +332,22 @@ async def query_endpoint_handler( # pylint: disable=R0914
)

completed_at = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")

cache_entry = CacheEntry(
query=query_request.query,
response=summary.llm_response,
provider=provider_id,
model=model_id,
started_at=started_at,
completed_at=completed_at,
referenced_documents=referenced_documents if referenced_documents else None,
)

store_conversation_into_cache(
configuration,
user_id,
conversation_id,
provider_id,
model_id,
query_request.query,
summary.llm_response,
started_at,
completed_at,
cache_entry,
_skip_userid_check,
topic_summary,
)
Expand Down
26 changes: 20 additions & 6 deletions src/app/endpoints/streaming_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,14 @@
from constants import DEFAULT_RAG_TOOL, MEDIA_TYPE_JSON, MEDIA_TYPE_TEXT
import metrics
from metrics.utils import update_llm_token_count_from_turn
from models.cache_entry import CacheEntry
from models.config import Action
from models.database.conversations import UserConversation
from models.requests import QueryRequest
from models.responses import ForbiddenResponse, UnauthorizedResponse
from utils.endpoints import (
check_configuration_loaded,
create_referenced_documents_with_metadata,
create_rag_chunks_dict,
get_agent,
get_system_prompt,
Expand Down Expand Up @@ -863,16 +865,28 @@ async def response_generator(
)

completed_at = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")

referenced_documents = create_referenced_documents_with_metadata(
summary, metadata_map
)

cache_entry = CacheEntry(
query=query_request.query,
response=summary.llm_response,
provider=provider_id,
model=model_id,
started_at=started_at,
completed_at=completed_at,
referenced_documents=(
referenced_documents if referenced_documents else None
),
)

store_conversation_into_cache(
configuration,
user_id,
conversation_id,
provider_id,
model_id,
query_request.query,
summary.llm_response,
started_at,
completed_at,
cache_entry,
_skip_userid_check,
topic_summary,
)
Expand Down
3 changes: 2 additions & 1 deletion src/cache/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

from abc import ABC, abstractmethod

from models.cache_entry import CacheEntry, ConversationData
from models.cache_entry import CacheEntry
from models.responses import ConversationData
from utils.suid import check_suid


Expand Down
3 changes: 2 additions & 1 deletion src/cache/in_memory_cache.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""In-memory cache implementation."""

from cache.cache import Cache
from models.cache_entry import CacheEntry, ConversationData
from models.cache_entry import CacheEntry
from models.config import InMemoryCacheConfig
from models.responses import ConversationData
from log import get_logger
from utils.connection_decorator import connection

Expand Down
3 changes: 2 additions & 1 deletion src/cache/noop_cache.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""No-operation cache implementation."""

from cache.cache import Cache
from models.cache_entry import CacheEntry, ConversationData
from models.cache_entry import CacheEntry
from models.responses import ConversationData
from log import get_logger
from utils.connection_decorator import connection

Expand Down
Loading
Loading