# ---------------------------------------------------------------------------
# File: src/infrastructure/database/chromadb/conector.py
# ---------------------------------------------------------------------------
import uuid
from typing import List, Optional

try:
    import chromadb
    from chromadb.config import Settings  # noqa: F401 - kept from the original import list
except ImportError:
    # Defer the failure to ChromaDB._connect() so that merely importing this
    # module does not crash when the dependency is missing.
    chromadb = None
    Settings = None


class ChromaDB:
    """Manager for a ChromaDB HTTP connection and its collection operations.

    NOTE: the public methods are coroutines, but the client calls inside
    them are not awaited, so each call runs to completion before control
    returns to the event loop.
    """

    def __init__(self, host: str = 'localhost', port: int = 8000):
        """Connect to a ChromaDB server.

        Args:
            host: Host name of the ChromaDB server.
            port: TCP port of the ChromaDB server.
        """
        self.host = host
        self.port = port
        self.client = self._connect()
        # Most recently used collection (attribute kept for existing callers).
        self.collection = None
        # One cached collection object per collection name.
        self._collections = {}

    def _connect(self):
        """Create the HTTP client for ``self.host`` / ``self.port``.

        Raises:
            ImportError: If the ``chromadb`` package is not installed.
        """
        if chromadb is None:
            raise ImportError("The 'chromadb' package is required to use ChromaDB")
        return chromadb.HttpClient(host=self.host, port=self.port)

    def _create_collection(self, collection_name: str):
        """Create or get an existing collection and remember it.

        Args:
            collection_name: Name of the collection to create/get.

        Returns:
            The collection object returned by the client.
        """
        if self.client is None:
            # close() was called earlier; reconnect on demand.
            self.client = self._connect()
        self.collection = self.client.get_or_create_collection(
            name=collection_name
        )
        return self.collection

    def _get_collection(self, collection_name: str):
        """Return the collection named ``collection_name``.

        The collection is looked up per name, so consecutive calls that
        target different collections never reuse the wrong one.
        """
        collection = self._collections.get(collection_name)
        if collection is None:
            collection = self._create_collection(collection_name)
            self._collections[collection_name] = collection
        self.collection = collection
        return collection

    async def add_documents(
        self,
        documents: List[str],
        collection_name: str,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None
    ):
        """Add documents to a collection.

        Args:
            documents: List of document texts.
            collection_name: Collection that receives the documents; it is
                created when it does not exist yet.
            metadatas: Optional metadata, one dict per document.
            ids: Optional unique document IDs. When omitted, a random UUID
                is generated for every document.

        Returns:
            Whatever the collection's ``add`` call returns.
        """
        collection = self._get_collection(collection_name)
        if ids is None:
            ids = [str(uuid.uuid4()) for _ in documents]
        return collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )

    async def query_documents(
        self,
        query_text: str,
        collection_name: str,
        n_results: int = 5,
        where: Optional[dict] = None
    ):
        """Query a collection for documents similar to ``query_text``.

        Args:
            query_text: Text to search for similarity.
            collection_name: Collection to search; it is created when it
                does not exist yet.
            n_results: Number of desired results.
            where: Optional additional filters.

        Returns:
            Whatever the collection's ``query`` call returns.
        """
        collection = self._get_collection(collection_name)
        return collection.query(
            query_texts=[query_text],
            n_results=n_results,
            where=where
        )

    async def delete_documents(self, ids: List[str], collection_name: str):
        """Delete documents from a collection by ID.

        Args:
            ids: List of document IDs to delete.
            collection_name: Collection to delete from; it is created when
                it does not exist yet.

        Returns:
            Whatever the collection's ``delete`` call returns.
        """
        collection = self._get_collection(collection_name)
        return collection.delete(ids=ids)

    async def close(self):
        """Release the client and every cached collection.

        The stored data is left untouched: ``client.reset()`` is
        deliberately NOT called here because it wipes the whole database
        instead of closing the connection.
        """
        self._collections.clear()
        self.collection = None
        self.client = None


# ---------------------------------------------------------------------------
# File: src/services/docmuent_extration/extractor.py
# ---------------------------------------------------------------------------
import json
import re  # noqa: F401 - kept from the original import list
from pathlib import Path
from typing import Dict, List, Optional, Tuple

try:
    from docx import Document
except ImportError:
    # Only needed for .docx files; see Reader._read_docx.
    Document = None

try:
    from PyPDF2 import PdfReader
except ImportError:
    # Only needed for .pdf files; see Reader._read_pdf.
    PdfReader = None


class Reader:
    """Generic reader for several file types (JSON, PDF, DOCX, TXT)."""

    # Content keys holding one value per document, mapped to the name the
    # value gets in each document's metadata (the JSON item field name).
    _PER_DOCUMENT_KEYS = {
        'titulos': 'titulo',
        'subtitulos': 'subtitulo',
        'datas': 'data',
    }

    def __init__(self):
        """Initialize the document reader."""
        self.supported_extensions = {'.json', '.pdf', '.docx', '.txt'}

    def __call__(self, file_path: str, document_content: str) -> Tuple[List[str], List[str], List[Dict]]:
        """Read a file and return its documents, IDs and metadata.

        IDs have the form ``"<document_content>_<n>"`` with ``n`` starting
        at 1, so they are only unique within a single file.

        Args:
            file_path: Path to the file.
            document_content: Document type (e.g. 'noticia', 'artigo',
                'contrato').

        Returns:
            A ``(documents, ids, metadata)`` tuple of equally long lists.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file extension is not supported.
        """
        path = Path(file_path)

        if not path.exists():
            raise FileNotFoundError(f"Arquivo não encontrado: {file_path}")

        suffix = path.suffix.lower()
        if suffix not in self.supported_extensions:
            raise ValueError(f"Extensão não suportada: {path.suffix}")

        # Pick the reader that matches the extension.
        readers = {
            '.json': self._read_json,
            '.pdf': self._read_pdf,
            '.docx': self._read_docx,
            '.txt': self._read_txt
        }
        reader = readers.get(suffix)
        if reader is None:
            # Extension was added to supported_extensions without a reader.
            raise ValueError(f"Extensão não suportada: {path.suffix}")

        documents, content = reader(file_path)

        # Expose the documents to _generate_metadata through the content.
        content['documents'] = documents

        ids = [f"{document_content}_{number}" for number in range(1, len(documents) + 1)]
        metadata = self._generate_metadata(content, document_content, path.name)

        return documents, ids, metadata

    def _generate_metadata(self, content: Dict, document_content: str, file_name: str) -> List[Dict]:
        """Build one metadata dict per document.

        Every dict carries ``document_type`` and ``file_name``. Per-document
        lists in ``content`` ('titulos', 'subtitulos', 'datas') contribute
        their element for that document under the singular key; every other
        entry of ``content`` (except 'documents') is copied unchanged.

        Args:
            content: Reader output, including the 'documents' list.
            document_content: Document type label.
            file_name: Name of the source file.

        Returns:
            A list with one metadata dict per document.
        """
        total = len(content.get('documents', []))
        metadata = []
        for index in range(total):
            entry = {'document_type': document_content, 'file_name': file_name}
            for key, value in content.items():
                if key == 'documents':
                    continue
                if key in self._PER_DOCUMENT_KEYS:
                    # Tolerate a list shorter than the documents list.
                    entry[self._PER_DOCUMENT_KEYS[key]] = value[index] if index < len(value) else ''
                else:
                    entry[key] = value
            metadata.append(entry)
        return metadata

    @staticmethod
    def _file_info(file_path: str, file_type: str) -> Dict:
        """Return name, type, size and timestamps of ``file_path``.

        Timestamps are the raw ``stat`` values converted to strings.
        ``st_ctime`` is the creation time on Windows but the time of the
        last metadata change on Unix.
        """
        path = Path(file_path)
        stats = path.stat()
        return {
            'file_name': path.name,
            'file_type': file_type,
            'file_size': stats.st_size,  # size in bytes
            'created_at': str(stats.st_ctime),
            'modified_at': str(stats.st_mtime),
        }

    def _read_json(self, file_path: str) -> Tuple[List[str], Dict]:
        """Read a JSON file holding one object or a list of objects.

        Each object contributes its 'texto' as a document and its 'titulo',
        'subtitulo' and 'data' fields to the content lists; missing fields
        default to an empty string.
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # A single object is handled as a one-element list.
        items = data if isinstance(data, list) else [data]

        documents = [item.get('texto', '') for item in items]
        content = {
            'titulos': [item.get('titulo', '') for item in items],
            'subtitulos': [item.get('subtitulo', '') for item in items],
            'datas': [item.get('data', '') for item in items]
        }

        return documents, content

    def _read_pdf(self, file_path: str) -> Tuple[List[str], Dict]:
        """Read a PDF file as a single document.

        Raises:
            ImportError: If the ``PyPDF2`` package is not installed.
        """
        if PdfReader is None:
            raise ImportError("The 'PyPDF2' package is required to read PDF files")

        pages = PdfReader(file_path).pages

        # Join the text of every page into one document.
        full_text = " ".join(page.extract_text() or "" for page in pages).strip()

        content = {'total_pages': len(pages)}
        content.update(self._file_info(file_path, 'pdf'))

        return [full_text], content

    def _read_docx(self, file_path: str) -> Tuple[List[str], Dict]:
        """Read a DOCX file as a single document (paragraphs joined by newlines).

        Raises:
            ImportError: If the ``python-docx`` package is not installed.
        """
        if Document is None:
            raise ImportError("The 'python-docx' package is required to read DOCX files")

        paragraphs = Document(file_path).paragraphs
        full_text = "\n".join(paragraph.text for paragraph in paragraphs).strip()

        return [full_text], self._file_info(file_path, 'docx')

    def _read_txt(self, file_path: str) -> Tuple[List[str], Dict]:
        """Read a UTF-8 text file as a single document."""
        with open(file_path, 'r', encoding='utf-8') as file:
            full_text = file.read().strip()

        return [full_text], self._file_info(file_path, 'txt')