Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 108 additions & 4 deletions docs/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -1787,6 +1787,9 @@
},
"type": "array",
"title": "Byok Rag"
},
"quota_handlers": {
"$ref": "#/components/schemas/QuotaHandlersConfiguration"
}
},
"additionalProperties": false,
Expand Down Expand Up @@ -3590,6 +3593,103 @@
}
]
},
"QuotaHandlersConfiguration": {
"properties": {
"sqlite": {
"anyOf": [
{
"$ref": "#/components/schemas/SQLiteDatabaseConfiguration"
},
{
"type": "null"
}
]
},
"postgres": {
"anyOf": [
{
"$ref": "#/components/schemas/PostgreSQLDatabaseConfiguration"
},
{
"type": "null"
}
]
},
"limiters": {
"items": {
"$ref": "#/components/schemas/QuotaLimiterConfiguration"
},
"type": "array",
"title": "Limiters"
},
"scheduler": {
"$ref": "#/components/schemas/QuotaSchedulerConfiguration"
},
"enable_token_history": {
"type": "boolean",
"title": "Enable Token History",
"default": false
}
},
"additionalProperties": false,
"type": "object",
"title": "QuotaHandlersConfiguration",
"description": "Quota limiter configuration."
},
"QuotaLimiterConfiguration": {
"properties": {
"type": {
"type": "string",
"enum": [
"user_limiter",
"cluster_limiter"
],
"title": "Type"
},
"name": {
"type": "string",
"title": "Name"
},
"initial_quota": {
"type": "integer",
"minimum": 0.0,
"title": "Initial Quota"
},
"quota_increase": {
"type": "integer",
"minimum": 0.0,
"title": "Quota Increase"
},
"period": {
"type": "string",
"title": "Period"
}
},
"additionalProperties": false,
"type": "object",
"required": [
"type",
"name",
"initial_quota",
"quota_increase",
"period"
],
"title": "QuotaLimiterConfiguration",
"description": "Configuration for one quota limiter."
},
"QuotaSchedulerConfiguration": {
"properties": {
"period": {
"type": "integer",
"exclusiveMinimum": 0.0,
"title": "Period",
"default": 1
}
},
"type": "object",
"title": "QuotaSchedulerConfiguration",
"description": "Quota scheduler configuration."
},
"RAGChunk": {
"properties": {
"content": {
Expand Down Expand Up @@ -3691,15 +3791,19 @@
"description": "URL of the referenced document"
},
"doc_title": {
"type": "string",
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Doc Title",
"description": "Title of the referenced document"
}
},
"type": "object",
"required": [
"doc_title"
],
"title": "ReferencedDocument",
"description": "Model representing a document referenced in generating a response.\n\nAttributes:\n doc_url: Url to the referenced doc.\n doc_title: Title of the referenced doc."
},
Expand Down
14 changes: 10 additions & 4 deletions src/app/endpoints/conversations_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,13 +314,19 @@ def check_conversation_existence(user_id: str, conversation_id: str) -> None:

def transform_chat_message(entry: CacheEntry) -> dict[str, Any]:
    """Convert a cached conversation entry into the response-payload message format.

    Builds a user message from the cached query and an assistant message from
    the cached response. When the cache entry carries referenced documents,
    each one is serialized in JSON mode and attached to the assistant message.
    """
    messages: list[dict[str, Any]] = [{"content": entry.query, "type": "user"}]

    assistant: dict[str, Any] = {"content": entry.response, "type": "assistant"}
    docs = entry.referenced_documents
    # Only attach the key when documents were actually recorded for this turn.
    if docs is not None:
        assistant["referenced_documents"] = [
            document.model_dump(mode="json") for document in docs
        ]
    messages.append(assistant)

    return {
        "provider": entry.provider,
        "model": entry.model,
        "messages": messages,
        "started_at": entry.started_at,
        "completed_at": entry.completed_at,
    }
19 changes: 13 additions & 6 deletions src/app/endpoints/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from authorization.middleware import authorize
from client import AsyncLlamaStackClientHolder
from configuration import configuration
from models.cache_entry import CacheEntry
from models.config import Action
from models.database.conversations import UserConversation
from models.requests import Attachment, QueryRequest
Expand Down Expand Up @@ -331,16 +332,22 @@ async def query_endpoint_handler( # pylint: disable=R0914
)

completed_at = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")

cache_entry = CacheEntry(
query=query_request.query,
response=summary.llm_response,
provider=provider_id,
model=model_id,
started_at=started_at,
completed_at=completed_at,
referenced_documents=referenced_documents if referenced_documents else None,
)

store_conversation_into_cache(
configuration,
user_id,
conversation_id,
provider_id,
model_id,
query_request.query,
summary.llm_response,
started_at,
completed_at,
cache_entry,
_skip_userid_check,
topic_summary,
)
Expand Down
26 changes: 20 additions & 6 deletions src/app/endpoints/streaming_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,14 @@
from constants import DEFAULT_RAG_TOOL, MEDIA_TYPE_JSON, MEDIA_TYPE_TEXT
import metrics
from metrics.utils import update_llm_token_count_from_turn
from models.cache_entry import CacheEntry
from models.config import Action
from models.database.conversations import UserConversation
from models.requests import QueryRequest
from models.responses import ForbiddenResponse, UnauthorizedResponse
from utils.endpoints import (
check_configuration_loaded,
create_referenced_documents_with_metadata,
create_rag_chunks_dict,
get_agent,
get_system_prompt,
Expand Down Expand Up @@ -863,16 +865,28 @@ async def response_generator(
)

completed_at = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")

referenced_documents = create_referenced_documents_with_metadata(
summary, metadata_map
)

cache_entry = CacheEntry(
query=query_request.query,
response=summary.llm_response,
provider=provider_id,
model=model_id,
started_at=started_at,
completed_at=completed_at,
referenced_documents=(
referenced_documents if referenced_documents else None
),
)

store_conversation_into_cache(
configuration,
user_id,
conversation_id,
provider_id,
model_id,
query_request.query,
summary.llm_response,
started_at,
completed_at,
cache_entry,
_skip_userid_check,
topic_summary,
)
Expand Down
3 changes: 2 additions & 1 deletion src/cache/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

from abc import ABC, abstractmethod

from models.cache_entry import CacheEntry, ConversationData
from models.cache_entry import CacheEntry
from models.responses import ConversationData
from utils.suid import check_suid


Expand Down
3 changes: 2 additions & 1 deletion src/cache/in_memory_cache.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""In-memory cache implementation."""

from cache.cache import Cache
from models.cache_entry import CacheEntry, ConversationData
from models.cache_entry import CacheEntry
from models.config import InMemoryCacheConfig
from models.responses import ConversationData
from log import get_logger
from utils.connection_decorator import connection

Expand Down
3 changes: 2 additions & 1 deletion src/cache/noop_cache.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""No-operation cache implementation."""

from cache.cache import Cache
from models.cache_entry import CacheEntry, ConversationData
from models.cache_entry import CacheEntry
from models.responses import ConversationData
from log import get_logger
from utils.connection_decorator import connection

Expand Down
Loading
Loading