From 35d837770cc546dd465c01e2311a7b2dbc961fc2 Mon Sep 17 00:00:00 2001
From: Sri Roopa Ramesh Babu
Date: Tue, 13 Jan 2026 08:58:24 -0500
Subject: [PATCH] RAG chunk metadata propagation.

---
 src/app/endpoints/query_v2.py                 |  29 ++-
 src/models/responses.py                       |  25 ++-
 src/utils/endpoints.py                        | 207 ++++++++++++++----
 tests/unit/cache/test_postgres_cache.py       |  11 +-
 tests/unit/models/responses/test_rag_chunk.py | 113 +++++++++-
 tests/unit/utils/test_endpoints.py            | 148 ++++++++++++-
 6 files changed, 484 insertions(+), 49 deletions(-)

diff --git a/src/app/endpoints/query_v2.py b/src/app/endpoints/query_v2.py
index 69422c42..45c88fbb 100644
--- a/src/app/endpoints/query_v2.py
+++ b/src/app/endpoints/query_v2.py
@@ -539,7 +539,16 @@ def parse_referenced_documents_from_responses_api(
             final_url = doc_url if doc_url else None
             if (final_url, filename) not in seen_docs:
                 documents.append(
-                    ReferencedDocument(doc_url=final_url, doc_title=filename)
+                    ReferencedDocument(
+                        doc_url=final_url,
+                        doc_title=filename,
+                        document_id=None,
+                        product_name=None,
+                        product_version=None,
+                        source_path=None,
+                        score=None,
+                        chunk_metadata=None,
+                    )
                 )
                 seen_docs.add((final_url, filename))

@@ -574,7 +583,14 @@
                     if (final_url, anno_title) not in seen_docs:
                         documents.append(
                             ReferencedDocument(
-                                doc_url=final_url, doc_title=anno_title
+                                doc_url=final_url,
+                                doc_title=anno_title,
+                                document_id=None,
+                                product_name=None,
+                                product_version=None,
+                                source_path=None,
+                                score=None,
+                                chunk_metadata=None,
                             )
                         )
                         seen_docs.add((final_url, anno_title))
@@ -583,7 +599,14 @@
                     if (None, anno_title) not in seen_docs:
                         documents.append(
                             ReferencedDocument(
-                                doc_url=None, doc_title=anno_title
+                                doc_url=None,
+                                doc_title=anno_title,
+                                document_id=None,
+                                product_name=None,
+                                product_version=None,
+                                source_path=None,
+                                score=None,
+                                chunk_metadata=None,
                             )
                         )
                         seen_docs.add((None, anno_title))
diff --git a/src/models/responses.py b/src/models/responses.py
index bf4da269..2dea5377 100644
--- a/src/models/responses.py
+++ b/src/models/responses.py
@@ -329,8 +329,14 @@ class ReferencedDocument(BaseModel):
     """Model representing a document referenced in generating a response.

     Attributes:
-        doc_url: Url to the referenced doc.
+        doc_url: URL to the referenced doc.
         doc_title: Title of the referenced doc.
+        document_id: Unique identifier for the document in the RAG system.
+        product_name: Product name (e.g., "Red Hat OpenShift").
+        product_version: Product version (e.g., "4.15").
+        source_path: Source path or identifier for local/filesystem documents.
+        score: Relevance score from RAG retrieval (0.0 to 1.0).
+        chunk_metadata: Additional metadata fields from the RAG chunk.
     """

     doc_url: Optional[AnyUrl] = Field(
@@ -341,6 +347,23 @@
         None, description="Title of the referenced document"
     )

+    document_id: str | None = Field(
+        None, description="Document identifier from RAG system"
+    )
+
+    product_name: str | None = Field(None, description="Product name")
+
+    product_version: str | None = Field(None, description="Product version")
+
+    source_path: str | None = Field(None, description="Source path for local documents")
+
+    score: float | None = Field(None, description="Relevance score from RAG retrieval")
+
+    chunk_metadata: dict[str, Any] | None = Field(
+        None,
+        description="Additional metadata from RAG chunk",
+    )
+

 class QueryResponse(AbstractSuccessfulResponse):
     """Model representing LLM response to a query.
diff --git a/src/utils/endpoints.py b/src/utils/endpoints.py
index 0db9d503..c5cc6d15 100644
--- a/src/utils/endpoints.py
+++ b/src/utils/endpoints.py
@@ -441,9 +441,9 @@ def create_rag_chunks_dict(summary: TurnSummary) -> list[dict[str, Any]]:

 def _process_http_source(
     src: str, doc_urls: set[str]
-) -> Optional[tuple[Optional[AnyUrl], str]]:
+) -> tuple[AnyUrl | None, str, dict[str, Any]] | None:
     """
-    Process HTTP source and return (doc_url, doc_title) tuple.
+    Process HTTP source and return (doc_url, doc_title, metadata_dict) tuple.

     Parameters:
         src (str): The source URL string to process.
@@ -451,12 +451,14 @@
             will add `src` to this set when it is new.

     Returns:
-        Optional[tuple[Optional[AnyUrl], str]]: A tuple (validated_url, doc_title)
-            when `src` was not previously seen:
+        tuple[AnyUrl | None, str, dict[str, Any]] | None: A tuple
+            (validated_url, doc_title, metadata_dict) when `src` was not
+            previously seen:
            - `validated_url`: an `AnyUrl` instance if `src` is a valid URL,
              or `None` if validation failed.
            - `doc_title`: the last path segment of the URL or `src` if no
              path segment is present.
+            - `metadata_dict`: dict containing document_id (the URL itself).
        Returns `None` if `src` was already present in `doc_urls`.
     """
     if src not in doc_urls:
@@ -468,19 +470,27 @@
             validated_url = None

         doc_title = src.rsplit("/", 1)[-1] or src
-        return (validated_url, doc_title)
+        # For HTTP sources, document_id is the URL itself
+        metadata_dict = {
+            "document_id": src,
+            "product_name": None,
+            "product_version": None,
+            "source_path": None,
+            "chunk_metadata": None,
+        }
+        return (validated_url, doc_title, metadata_dict)
     return None


-def _process_document_id(
+def _process_document_id(  # pylint: disable=too-many-locals
     src: str,
     doc_ids: set[str],
     doc_urls: set[str],
     metas_by_id: dict[str, dict[str, Any]],
-    metadata_map: Optional[dict[str, Any]],
-) -> Optional[tuple[Optional[AnyUrl], str]]:
+    metadata_map: dict[str, Any] | None,
+) -> tuple[AnyUrl | None, str, dict[str, Any]] | None:
     """
-    Process document ID and return (doc_url, doc_title) tuple.
+    Process document ID and return (doc_url, doc_title, metadata_dict) tuple.

     Parameters:
         src (str): Document identifier to process.
@@ -491,15 +501,16 @@
             metadata dicts that may contain `docs_url` and `title`.

-        metadata_map (Optional[dict[str, Any]]): If provided (truthy), indicates
+        metadata_map (dict[str, Any] | None): If provided (truthy), indicates
             metadata is available and enables metadata lookup; when falsy,
             metadata lookup is skipped.

     Returns:
-        Optional[tuple[Optional[AnyUrl], str]]: `(validated_url, doc_title)` where
-            `validated_url` is a validated `AnyUrl` or `None` and `doc_title` is
-            the chosen title string; returns `None` if the `src` or its URL was
+        tuple[AnyUrl | None, str, dict[str, Any]] | None: `(validated_url,
+            doc_title, metadata_dict)` where `validated_url` is a validated `AnyUrl`
+            or `None` and `doc_title` is the chosen title string, and `metadata_dict`
+            contains enriched metadata; returns `None` if the `src` or its URL was
            already processed.
""" if src in doc_ids: @@ -509,6 +520,12 @@ def _process_document_id( meta = metas_by_id.get(src, {}) if metadata_map else {} doc_url = meta.get("docs_url") title = meta.get("title") + + # Extract additional metadata fields + product_name = meta.get("product_name") + product_version = meta.get("product_version") + source_path = meta.get("source_path") or meta.get("source") + # Type check to ensure we have the right types if not isinstance(doc_url, (str, type(None))): doc_url = None @@ -529,23 +546,53 @@ def _process_document_id( validated_doc_url = None doc_title = title or (doc_url.rsplit("/", 1)[-1] if doc_url else src) - return (validated_doc_url, doc_title) + + # Build metadata dict with additional fields not in top-level ReferencedDocument + excluded_fields = { + "docs_url", + "title", + "document_id", + "product_name", + "product_version", + "source_path", + "source", + } + additional_metadata = ( + {k: v for k, v in meta.items() if k not in excluded_fields} if meta else {} + ) + + metadata_dict = { + "document_id": src, + "product_name": product_name, + "product_version": product_version, + "source_path": source_path, + "chunk_metadata": additional_metadata if additional_metadata else None, + } + + return (validated_doc_url, doc_title, metadata_dict) def _add_additional_metadata_docs( doc_urls: set[str], metas_by_id: dict[str, dict[str, Any]], -) -> list[tuple[Optional[AnyUrl], str]]: +) -> list[tuple[AnyUrl | None, str, dict[str, Any]]]: """Add additional referenced documents from metadata_map.""" - additional_entries: list[tuple[Optional[AnyUrl], str]] = [] - for meta in metas_by_id.values(): + additional_entries: list[tuple[AnyUrl | None, str, dict[str, Any]]] = [] + for doc_id, meta in metas_by_id.items(): doc_url = meta.get("docs_url") title = meta.get("title") # Note: must be "title", not "Title" + + # Extract additional metadata fields + product_name = meta.get("product_name") + product_version = meta.get("product_version") + source_path = meta.get("source_path") or meta.get("source") + # Type check to ensure we have the right types if not isinstance(doc_url, (str, type(None))): doc_url = None if not isinstance(title, (str, type(None))): title = None + if doc_url and doc_url not in doc_urls and title is not None: doc_urls.add(doc_url) try: @@ -556,69 +603,118 @@ def _add_additional_metadata_docs( logger.warning("Invalid URL in metadata_map: %s", doc_url) validated_url = None - additional_entries.append((validated_url, title)) + # Build metadata dict + excluded_fields = { + "docs_url", + "title", + "document_id", + "product_name", + "product_version", + "source_path", + "source", + } + additional_metadata = { + k: v for k, v in meta.items() if k not in excluded_fields + } + + metadata_dict = { + "document_id": doc_id, + "product_name": product_name, + "product_version": product_version, + "source_path": source_path, + "chunk_metadata": additional_metadata if additional_metadata else None, + } + + additional_entries.append((validated_url, title, metadata_dict)) return additional_entries -def _process_rag_chunks_for_documents( +def _process_rag_chunks_for_documents( # pylint: disable=too-many-locals,too-many-branches rag_chunks: list, - metadata_map: Optional[dict[str, Any]] = None, -) -> list[tuple[Optional[AnyUrl], str]]: + metadata_map: dict[str, Any] | None = None, +) -> list[tuple[AnyUrl | None, str, dict[str, Any], float | None]]: """ - Process RAG chunks and return a list of (doc_url, doc_title) tuples. 
+    Process RAG chunks and return enriched document tuples with metadata and scores.

     This is the core logic shared between both return formats.

     Parameters:
         rag_chunks (list): Iterable of RAG chunk objects; each chunk must
             provide a `source` attribute (e.g., an HTTP URL or a document ID).
-        metadata_map (Optional[dict[str, Any]]): Optional mapping of document IDs
+        metadata_map (dict[str, Any] | None): Optional mapping of document IDs
             to metadata dictionaries used to resolve titles and document URLs.

     Returns:
-        list[tuple[Optional[AnyUrl], str]]: Ordered list of tuples where the first
-            element is a validated URL object or `None` (if no URL is available)
-            and the second element is the document title.
+        list[tuple[AnyUrl | None, str, dict[str, Any], float | None]]: Ordered list of tuples where:
+            - First element is a validated URL object or `None` (if no URL is available)
+            - Second element is the document title
+            - Third element is a dict with metadata (document_id, product_name, product_version, etc.)
+            - Fourth element is the relevance score or `None`
     """
     doc_urls: set[str] = set()
     doc_ids: set[str] = set()
+    # Track scores by document source identifier
+    doc_scores: dict[str, float | None] = {}
+
     # Process metadata_map if provided
     metas_by_id: dict[str, dict[str, Any]] = {}
     if metadata_map:
         metas_by_id = {k: v for k, v in metadata_map.items() if isinstance(v, dict)}

-    document_entries: list[tuple[Optional[AnyUrl], str]] = []
+    document_entries: list[tuple[AnyUrl | None, str, dict[str, Any]]] = []

     for chunk in rag_chunks:
         src = chunk.source
         if not src or src == constants.DEFAULT_RAG_TOOL:
             continue

+        # Extract score from chunk if available
+        score = getattr(chunk, "score", None)
+
         if src.startswith("http"):
             entry = _process_http_source(src, doc_urls)
             if entry:
                 document_entries.append(entry)
+                # Track score by source
+                if src not in doc_scores:
+                    doc_scores[src] = score
         else:
             entry = _process_document_id(
                 src, doc_ids, doc_urls, metas_by_id, metadata_map
             )
             if entry:
                 document_entries.append(entry)
+                # Track score by source
+                if src not in doc_scores:
+                    doc_scores[src] = score

     # Add any additional referenced documents from metadata_map not already present
     if metadata_map:
         additional_entries = _add_additional_metadata_docs(doc_urls, metas_by_id)
-        document_entries.extend(additional_entries)
+        for additional_entry in additional_entries:
+            document_entries.append(additional_entry)
+            # Additional entries don't have scores from chunks
+            doc_id = additional_entry[2].get("document_id")
+            if doc_id and doc_id not in doc_scores:
+                doc_scores[doc_id] = None
+
+    # Build final result with scores
+    result: list[tuple[AnyUrl | None, str, dict[str, Any], float | None]] = []
+    for doc_url, doc_title, metadata_dict in document_entries:
+        # Get score using document_id from metadata
+        doc_id = metadata_dict.get("document_id")
+        score = doc_scores.get(doc_id) if doc_id else None
+        result.append((doc_url, doc_title, metadata_dict, score))

-    return document_entries
+    return result


 def create_referenced_documents(
     rag_chunks: list,
     metadata_map: Optional[dict[str, Any]] = None,
     return_dict_format: bool = False,
-) -> list[ReferencedDocument] | list[dict[str, Optional[str]]]:
+) -> list[ReferencedDocument] | list[dict[str, str | None]]:
     """
     Create referenced documents from RAG chunks with optional metadata enrichment.
@@ -633,7 +729,9 @@ def create_referenced_documents(
             ReferencedDocument objects

     Returns:
-        List of ReferencedDocument objects or dictionaries with doc_url and doc_title
+        List of ReferencedDocument objects or dictionaries with all metadata fields
+        including doc_url, doc_title, document_id, product_name, product_version,
+        source_path, score, and chunk_metadata
     """
     document_entries = _process_rag_chunks_for_documents(rag_chunks, metadata_map)

@@ -642,12 +740,27 @@
             {
                 "doc_url": str(doc_url) if doc_url else None,
                 "doc_title": doc_title,
+                "document_id": metadata_dict.get("document_id"),
+                "product_name": metadata_dict.get("product_name"),
+                "product_version": metadata_dict.get("product_version"),
+                "source_path": metadata_dict.get("source_path"),
+                "score": score,
+                "chunk_metadata": metadata_dict.get("chunk_metadata"),
             }
-            for doc_url, doc_title in document_entries
+            for doc_url, doc_title, metadata_dict, score in document_entries
         ]

     return [
-        ReferencedDocument(doc_url=doc_url, doc_title=doc_title)
-        for doc_url, doc_title in document_entries
+        ReferencedDocument(
+            doc_url=doc_url,
+            doc_title=doc_title,
+            document_id=metadata_dict.get("document_id"),
+            product_name=metadata_dict.get("product_name"),
+            product_version=metadata_dict.get("product_version"),
+            source_path=metadata_dict.get("source_path"),
+            score=score,
+            chunk_metadata=metadata_dict.get("chunk_metadata"),
+        )
+        for doc_url, doc_title, metadata_dict, score in document_entries
     ]

@@ -674,8 +787,17 @@ def create_referenced_documents_with_metadata(
         summary.rag_chunks, metadata_map
     )
     return [
-        ReferencedDocument(doc_url=doc_url, doc_title=doc_title)
-        for doc_url, doc_title in document_entries
+        ReferencedDocument(
+            doc_url=doc_url,
+            doc_title=doc_title,
+            document_id=metadata_dict.get("document_id"),
+            product_name=metadata_dict.get("product_name"),
+            product_version=metadata_dict.get("product_version"),
+            source_path=metadata_dict.get("source_path"),
+            score=score,
+            chunk_metadata=metadata_dict.get("chunk_metadata"),
+        )
+        for doc_url, doc_title, metadata_dict, score in document_entries
     ]

@@ -698,8 +820,17 @@ def create_referenced_documents_from_chunks(
     """
     document_entries = _process_rag_chunks_for_documents(rag_chunks)
     return [
-        ReferencedDocument(doc_url=doc_url, doc_title=doc_title)
-        for doc_url, doc_title in document_entries
+        ReferencedDocument(
+            doc_url=doc_url,
+            doc_title=doc_title,
+            document_id=metadata_dict.get("document_id"),
+            product_name=metadata_dict.get("product_name"),
+            product_version=metadata_dict.get("product_version"),
+            source_path=metadata_dict.get("source_path"),
+            score=score,
+            chunk_metadata=metadata_dict.get("chunk_metadata"),
+        )
+        for doc_url, doc_title, metadata_dict, score in document_entries
     ]
diff --git a/tests/unit/cache/test_postgres_cache.py b/tests/unit/cache/test_postgres_cache.py
index b720de61..94fc458b 100644
--- a/tests/unit/cache/test_postgres_cache.py
+++ b/tests/unit/cache/test_postgres_cache.py
@@ -602,7 +602,16 @@ def test_insert_and_get_with_referenced_documents(
     inserted_json_str = sql_params[-3]

     assert json.loads(inserted_json_str) == [
-        {"doc_url": "http://example.com/", "doc_title": "Test Doc"}
+        {
+            "doc_url": "http://example.com/",
+            "doc_title": "Test Doc",
+            "document_id": None,
+            "product_name": None,
+            "product_version": None,
+            "source_path": None,
+            "score": None,
+            "chunk_metadata": None,
+        }
     ]

     # Simulate the database returning that data
diff --git a/tests/unit/models/responses/test_rag_chunk.py b/tests/unit/models/responses/test_rag_chunk.py
index 53f72f68..f658bc94 100644
--- a/tests/unit/models/responses/test_rag_chunk.py
+++ b/tests/unit/models/responses/test_rag_chunk.py
@@ -1,5 +1,8 @@
-"""Unit tests for RAGChunk model."""
+"""Unit tests for RAGChunk and ReferencedDocument models."""

+from pydantic import AnyUrl
+
+from models.responses import ReferencedDocument
 from utils.types import RAGChunk

@@ -110,3 +113,111 @@ def test_url_as_source(self) -> None:
         )
         assert chunk.source == url_source
         assert chunk.score == 0.92
+
+
+class TestReferencedDocument:
+    """Test cases for the ReferencedDocument model."""
+
+    def test_referenced_document_with_full_metadata(self) -> None:
+        """Test ReferencedDocument construction with all fields."""
+        doc = ReferencedDocument(
+            doc_url=AnyUrl("https://example.com/doc"),
+            doc_title="Test Document",
+            document_id="doc-123",
+            product_name="Red Hat OpenShift",
+            product_version="4.15",
+            source_path="/docs/install.md",
+            score=0.95,
+            chunk_metadata={"author": "Red Hat", "custom": "value"},
+        )
+
+        assert doc.doc_url == AnyUrl("https://example.com/doc")
+        assert doc.doc_title == "Test Document"
+        assert doc.document_id == "doc-123"
+        assert doc.product_name == "Red Hat OpenShift"
+        assert doc.product_version == "4.15"
+        assert doc.source_path == "/docs/install.md"
+        assert doc.score == 0.95
+        assert doc.chunk_metadata is not None
+        assert doc.chunk_metadata["author"] == "Red Hat"
+        assert doc.chunk_metadata["custom"] == "value"
+
+    def test_referenced_document_minimal_fields(self) -> None:
+        """Test ReferencedDocument with minimal fields (backward compatibility)."""
+        doc = ReferencedDocument()
+
+        assert doc.doc_url is None
+        assert doc.doc_title is None
+        assert doc.document_id is None
+        assert doc.product_name is None
+        assert doc.product_version is None
+        assert doc.source_path is None
+        assert doc.score is None
+        assert doc.chunk_metadata is None
+
+    def test_referenced_document_backward_compatible(self) -> None:
+        """Test that existing code using only doc_url and doc_title still works."""
+        doc = ReferencedDocument(
+            doc_url=AnyUrl("https://example.com/doc"), doc_title="Test Document"
+        )
+
+        assert doc.doc_url == AnyUrl("https://example.com/doc")
+        assert doc.doc_title == "Test Document"
+        # New fields default to None
+        assert doc.document_id is None
+        assert doc.product_name is None
+        assert doc.product_version is None
+        assert doc.source_path is None
+        assert doc.score is None
+        assert doc.chunk_metadata is None
+
+    def test_referenced_document_with_product_metadata_only(self) -> None:
+        """Test ReferencedDocument with only product metadata fields."""
+        doc = ReferencedDocument(
+            product_name="Red Hat OpenStack", product_version="17.1"
+        )
+
+        assert doc.product_name == "Red Hat OpenStack"
+        assert doc.product_version == "17.1"
+        assert doc.doc_url is None
+        assert doc.doc_title is None
+        assert doc.document_id is None
+        assert doc.source_path is None
+        assert doc.score is None
+        assert doc.chunk_metadata is None
+
+    def test_referenced_document_with_score(self) -> None:
+        """Test ReferencedDocument with relevance score."""
+        doc = ReferencedDocument(
+            doc_url=AnyUrl("https://example.com/doc"),
+            doc_title="Scored Document",
+            score=0.87,
+        )
+
+        assert doc.score == 0.87
+        assert doc.doc_url == AnyUrl("https://example.com/doc")
+        assert doc.doc_title == "Scored Document"
+
+    def test_referenced_document_empty_chunk_metadata(self) -> None:
+        """Test ReferencedDocument with empty chunk_metadata dict."""
+        doc = ReferencedDocument(
+            doc_url=AnyUrl("https://example.com/doc"),
+            doc_title="Test Document",
+            chunk_metadata={},
+        )
+
+        assert doc.chunk_metadata == {}
+        assert doc.doc_url == AnyUrl("https://example.com/doc")
+
+    def test_referenced_document_with_document_id_and_source_path(self) -> None:
+        """Test ReferencedDocument with document_id and source_path."""
+        doc = ReferencedDocument(
+            document_id="doc-456",
+            source_path="/local/path/to/document.md",
+            doc_title="Local Document",
+        )
+
+        assert doc.document_id == "doc-456"
+        assert doc.source_path == "/local/path/to/document.md"
+        assert doc.doc_title == "Local Document"
+        assert doc.doc_url is None
diff --git a/tests/unit/utils/test_endpoints.py b/tests/unit/utils/test_endpoints.py
index b1343674..2de406d4 100644
--- a/tests/unit/utils/test_endpoints.py
+++ b/tests/unit/utils/test_endpoints.py
@@ -924,16 +924,34 @@ def test_create_referenced_documents_http_urls_referenced_document_format(
         assert result[0].doc_title == "doc1"
         assert result[1].doc_url == AnyUrl("https://example.com/doc2")
         assert result[1].doc_title == "doc2"
+        # Verify new fields are present (HTTP sources use URL as document_id)
+        assert result[0].document_id == "https://example.com/doc1"
+        assert result[0].product_name is None
+        assert result[0].product_version is None
+        assert result[0].source_path is None
+        assert result[0].score is None
+        assert result[0].chunk_metadata is None

     def test_create_referenced_documents_document_ids_with_metadata(self) -> None:
-        """Test document IDs with metadata enrichment."""
+        """Test document IDs with metadata enrichment including product fields."""

-        mock_chunk1 = type("MockChunk", (), {"source": "doc_id_1"})()
-        mock_chunk2 = type("MockChunk", (), {"source": "doc_id_2"})()
+        mock_chunk1 = type("MockChunk", (), {"source": "doc_id_1", "score": 0.95})()
+        mock_chunk2 = type("MockChunk", (), {"source": "doc_id_2", "score": 0.87})()

         metadata_map = {
-            "doc_id_1": {"docs_url": "https://example.com/doc1", "title": "Document 1"},
-            "doc_id_2": {"docs_url": "https://example.com/doc2", "title": "Document 2"},
+            "doc_id_1": {
+                "docs_url": "https://example.com/doc1",
+                "title": "Document 1",
+                "product_name": "Red Hat OpenShift",
+                "product_version": "4.15",
+                "source_path": "/docs/openshift/install.md",
+            },
+            "doc_id_2": {
+                "docs_url": "https://example.com/doc2",
+                "title": "Document 2",
+                "product_name": "Red Hat OpenStack",
+                "product_version": "17.1",
+            },
         }

         result = endpoints.create_referenced_documents(
@@ -948,10 +966,22 @@ def test_create_referenced_documents_document_ids_with_metadata(self) -> None:
         # results must be of the right type
         assert isinstance(result[0], ReferencedDocument)
         assert isinstance(result[1], ReferencedDocument)
+        # Verify existing fields
         assert result[0].doc_url == AnyUrl("https://example.com/doc1")
         assert result[0].doc_title == "Document 1"
         assert result[1].doc_url == AnyUrl("https://example.com/doc2")
         assert result[1].doc_title == "Document 2"
+        # Verify new metadata fields
+        assert result[0].document_id == "doc_id_1"
+        assert result[0].product_name == "Red Hat OpenShift"
+        assert result[0].product_version == "4.15"
+        assert result[0].source_path == "/docs/openshift/install.md"
+        assert result[0].score == 0.95
+        assert result[1].document_id == "doc_id_2"
+        assert result[1].product_name == "Red Hat OpenStack"
+        assert result[1].product_version == "17.1"
+        assert result[1].source_path is None
+        assert result[1].score == 0.87

     def test_create_referenced_documents_skips_tool_names(self) -> None:
        """Test that tool names like 'knowledge_search' are skipped."""
@@ -1036,6 +1066,114 @@ def test_create_referenced_documents_invalid_urls(self) -> None:
         assert result[1].doc_url == AnyUrl("https://example.com/doc1")
         assert result[1].doc_title == "doc1"

+    def test_create_referenced_documents_with_full_metadata(self) -> None:
+        """Test document creation with complete metadata including scores and custom fields."""
+
+        # Mock chunks with scores
+        mock_chunk1 = type("MockChunk", (), {"source": "doc_id_1", "score": 0.95})()
+        mock_chunk2 = type("MockChunk", (), {"source": "doc_id_2", "score": 0.87})()
+
+        metadata_map = {
+            "doc_id_1": {
+                "docs_url": "https://example.com/doc1",
+                "title": "Product Documentation",
+                "document_id": "doc_id_1",
+                "product_name": "Red Hat OpenShift",
+                "product_version": "4.15",
+                "source_path": "/docs/openshift/4.15/install.md",
+                "author": "Red Hat",
+                "creation_date": "2024-01-01",
+            },
+            "doc_id_2": {
+                "docs_url": "https://example.com/doc2",
+                "title": "Configuration Guide",
+                "document_id": "doc_id_2",
+                "product_name": "Red Hat OpenShift",
+                "product_version": "4.14",
+                "category": "configuration",
+            },
+        }
+
+        result = endpoints.create_referenced_documents(
+            [mock_chunk1, mock_chunk2], metadata_map
+        )
+
+        assert len(result) == 2
+
+        # Verify first document with full metadata
+        assert isinstance(result[0], ReferencedDocument)
+        assert result[0].doc_url == AnyUrl("https://example.com/doc1")
+        assert result[0].doc_title == "Product Documentation"
+        assert result[0].document_id == "doc_id_1"
+        assert result[0].product_name == "Red Hat OpenShift"
+        assert result[0].product_version == "4.15"
+        assert result[0].source_path == "/docs/openshift/4.15/install.md"
+        assert result[0].score == 0.95
+        assert result[0].chunk_metadata is not None
+        assert result[0].chunk_metadata["author"] == "Red Hat"
+        assert result[0].chunk_metadata["creation_date"] == "2024-01-01"
+
+        # Verify second document
+        assert isinstance(result[1], ReferencedDocument)
+        assert result[1].doc_url == AnyUrl("https://example.com/doc2")
+        assert result[1].doc_title == "Configuration Guide"
+        assert result[1].document_id == "doc_id_2"
+        assert result[1].product_name == "Red Hat OpenShift"
+        assert result[1].product_version == "4.14"
+        assert result[1].source_path is None
+        assert result[1].score == 0.87
+        assert result[1].chunk_metadata is not None
+        assert result[1].chunk_metadata["category"] == "configuration"
+
+    def test_create_referenced_documents_backward_compatibility(self) -> None:
+        """Test that new fields default to None for backward compatibility."""
+
+        mock_chunk = type("MockChunk", (), {"source": "https://example.com/doc1"})()
+
+        result = endpoints.create_referenced_documents([mock_chunk])
+
+        assert len(result) == 1
+        assert isinstance(result[0], ReferencedDocument)
+        assert result[0].doc_url == AnyUrl("https://example.com/doc1")
+        assert result[0].doc_title == "doc1"
+        # HTTP sources use URL as document_id
+        assert result[0].document_id == "https://example.com/doc1"
+        # Other new fields should be None
+        assert result[0].product_name is None
+        assert result[0].product_version is None
+        assert result[0].source_path is None
+        assert result[0].score is None
+        assert result[0].chunk_metadata is None
+
+    def test_create_referenced_documents_dict_format_with_metadata(self) -> None:
+        """Test dictionary format return with all metadata fields."""
+
+        mock_chunk = type("MockChunk", (), {"source": "doc_id_1", "score": 0.92})()
+
+        metadata_map = {
+            "doc_id_1": {
+                "docs_url": "https://example.com/doc1",
+                "title": "Test Document",
Document", + "product_name": "Test Product", + "product_version": "1.0", + } + } + + result = endpoints.create_referenced_documents( + [mock_chunk], metadata_map, return_dict_format=True + ) + + assert len(result) == 1 + assert isinstance(result[0], dict) + assert result[0]["doc_url"] == "https://example.com/doc1" + assert result[0]["doc_title"] == "Test Document" + assert result[0]["document_id"] == "doc_id_1" + assert result[0]["product_name"] == "Test Product" + assert result[0]["product_version"] == "1.0" + assert result[0]["score"] == 0.92 + assert result[0]["source_path"] is None + assert result[0]["chunk_metadata"] is None + @pytest.mark.asyncio async def test_cleanup_after_streaming_generate_topic_summary_default_true(