CuriousGu · PrimeRibs2501 · Feb 12, 2025 · Feb 13, 2025
diff --git a/src/infrastructure/database/chromadb/conector.py b/src/infrastructure/database/chromadb/conector.py
@@ -0,0 +1,110 @@
+import chromadb
+from chromadb.config import Settings
+from typing import List, Optional
+
+
+class ChromaDB:
+    """Manager for a ChromaDB HTTP connection and basic collection operations.
+
+    The instance keeps one client plus a handle to the most recently used
+    collection. Every public operation takes the collection name explicitly
+    and resolves the handle by that name, so a single instance can be used
+    against several collections.
+
+    NOTE(review): the public methods are declared ``async`` but the
+    underlying client calls are not awaited here; if the client is
+    synchronous these calls will run inline on the event loop.
+    """
+
+    def __init__(self, host: str = 'localhost', port: int = 8000):
+        """Initialize ChromaDB connection.
+
+        Args:
+            host (str): Host name of the ChromaDB server.
+            port (int): TCP port of the ChromaDB server.
+        """
+        self.host = host
+        self.port = port
+        self.client = self._connect()
+        # Handle of the collection used by the last operation (None until
+        # the first operation runs).
+        self.collection = None
+
+    def _connect(self):
+        """Create an HTTP client for the configured host and port.
+
+        Returns:
+            The client object returned by ``chromadb.HttpClient``.
+        """
+        return chromadb.HttpClient(host=self.host, port=self.port)
+
+    def _create_collection(self, collection_name: str):
+        """Create or get an existing collection and remember it.
+
+        Args:
+            collection_name (str): Name of the collection to create/get
+
+        Returns:
+            Collection: Created/retrieved collection object
+        """
+        self.collection = self.client.get_or_create_collection(
+            name=collection_name
+        )
+        return self.collection
+
+    def _get_collection(self, collection_name: str):
+        """Return the collection called ``collection_name``.
+
+        The cached handle is reused only when it refers to the requested
+        collection; otherwise the collection is fetched (or created) by name.
+        Without the name check, a call for collection "b" made after a call
+        for collection "a" would silently operate on "a".
+
+        Args:
+            collection_name (str): Name of the collection to operate on
+
+        Returns:
+            Collection: Handle for the requested collection
+        """
+        cached = self.collection
+        # Explicit None check: do not rely on the truthiness of the handle.
+        if cached is not None and getattr(cached, 'name', None) == collection_name:
+            return cached
+        return self._create_collection(collection_name)
+
+    async def add_documents(
+        self,
+        documents: List[str],
+        collection_name: str,
+        metadatas: Optional[List[dict]] = None,
+        ids: Optional[List[str]] = None
+    ):
+        """Add documents to the named collection.
+
+        Args:
+            documents (List[str]): List of document texts
+            collection_name (str): Collection to add to (created if missing)
+            metadatas (Optional[List[dict]]): Document metadata
+            ids (Optional[List[str]]): Unique document IDs
+
+        Returns:
+            Whatever the collection's ``add`` call returns.
+        """
+        collection = self._get_collection(collection_name)
+        return collection.add(
+            documents=documents,
+            metadatas=metadatas,
+            ids=ids
+        )
+
+    async def query_documents(
+        self,
+        query_text: str,
+        collection_name: str,
+        n_results: int = 5,
+        where: Optional[dict] = None
+    ):
+        """Query similar documents in the named collection.
+
+        Args:
+            query_text (str): Text to search for similarity
+            collection_name (str): Collection to query (created if missing)
+            n_results (int): Number of desired results
+            where (Optional[dict]): Additional filters
+
+        Returns:
+            Whatever the collection's ``query`` call returns.
+        """
+        collection = self._get_collection(collection_name)
+        return collection.query(
+            query_texts=[query_text],
+            n_results=n_results,
+            where=where
+        )
+
+    async def delete_documents(self, ids: List[str], collection_name: str):
+        """Delete documents from the named collection by IDs.
+
+        Args:
+            ids (List[str]): List of document IDs to delete
+            collection_name (str): Collection to delete from (created if missing)
+
+        Returns:
+            Whatever the collection's ``delete`` call returns.
+        """
+        collection = self._get_collection(collection_name)
+        return collection.delete(ids=ids)
+
+    async def close(self):
+        """Release the client and collection handles held by this manager.
+
+        This intentionally does NOT call ``client.reset()``: per the Chroma
+        client documentation, ``reset()`` empties the whole database, which
+        is not what closing a connection should do. Calling ``close`` more
+        than once is harmless.
+        """
+        self.collection = None
+        self.client = None
diff --git a/src/services/docmuent_extration/extractor.py b/src/services/docmuent_extration/extractor.py
@@ -0,0 +1,98 @@
+import json
+from typing import List, Dict, Tuple, Optional
+from pathlib import Path
+from docx import Document
+from PyPDF2 import PdfReader
+import re
+
+class Reader:
+    """
+    Classe genérica para leitura de diferentes tipos de arquivos (JSON, PDF, DOCX, TXT).
+    """
+
+    def __init__(self):
+        """Set up the document reader with the file extensions it accepts."""
+        # Lowercase suffixes, leading dot included.
+        self.supported_extensions = {
+            '.txt',
+            '.docx',
+            '.pdf',
+            '.json',
+        }
+
+    def __call__(self, file_path: str, document_content: str) -> Tuple[List[str], List[str], List[Dict]]:
+        """
+        Read a file and return its documents, IDs and metadata.
+
+        Args:
+            file_path (str): Path to the file
+            document_content (str): Document type label (e.g. 'noticia', 'artigo',
+                'contrato'); used as the prefix of every generated ID and passed
+                on to the metadata builder
+
+        Returns:
+            Tuple[List[str], List[str], List[Dict]]: (documents, ids, metadata)
+
+        Raises:
+            FileNotFoundError: If ``file_path`` does not exist
+            ValueError: If the file suffix is not in ``supported_extensions``
+        """
+        path = Path(file_path)
+        if not path.exists():
+            raise FileNotFoundError(f"Arquivo não encontrado: {file_path}")
+
+        # Matching is case-insensitive; the error message keeps the suffix as given.
+        extension = path.suffix.lower()
+        if extension not in self.supported_extensions:
+            raise ValueError(f"Extensão não suportada: {path.suffix}")
+
+        # Dispatch table: suffix -> reader method.
+        dispatch = {
+            '.json': self._read_json,
+            '.pdf': self._read_pdf,
+            '.docx': self._read_docx,
+            '.txt': self._read_txt
+        }
+        read = dispatch.get(extension)
+        documents, content = read(file_path)
+
+        # The metadata builder receives the documents through the content dict.
+        content['documents'] = documents
+
+        # IDs are 1-based: "<document_content>_1", "<document_content>_2", ...
+        ids = [
+            f"{document_content}_{position}"
+            for position, _ in enumerate(documents, start=1)
+        ]
+        metadata = self._generate_metadata(content, document_content, path.name)
+
+        return documents, ids, metadata
+
+    def _read_json(self, file_path: str) -> Tuple[List[str], Dict]:
+        """Read a JSON file holding one record or a list of records.
+
+        Each record is read through ``.get`` with an empty-string default for
+        the keys 'texto', 'titulo', 'subtitulo' and 'data'.
+
+        Args:
+            file_path (str): Path to a UTF-8 encoded JSON file
+
+        Returns:
+            Tuple[List[str], Dict]: the 'texto' values, and a dict with the
+            parallel lists 'titulos', 'subtitulos' and 'datas'
+        """
+        with open(file_path, 'r', encoding='utf-8') as handle:
+            payload = json.load(handle)
+
+        # A top-level value that is not a list is handled as a single record.
+        records = payload if isinstance(payload, list) else [payload]
+
+        def column(key: str) -> List[str]:
+            # One value per record, '' where the key is absent.
+            return [record.get(key, '') for record in records]
+
+        documents = column('texto')
+        content = {
+            'titulos': column('titulo'),
+            'subtitulos': column('subtitulo'),
+            'datas': column('data')
+        }
+        return documents, content
+
+    def _read_pdf(self, file_path: str) -> Tuple[List[str], Dict]:
+        """Read a PDF file as a single document.
+
+        The text of every page is extracted (pages with no extractable text
+        contribute an empty string), joined with single spaces and stripped.
+
+        Args:
+            file_path (str): Path to the PDF file
+
+        Returns:
+            Tuple[List[str], Dict]: a one-element list with the full text, and
+            a dict with 'total_pages', 'file_name', 'file_type', 'file_size'
+            (bytes), 'created_at' and 'modified_at' (stringified epoch seconds)
+        """
+        pdf = PdfReader(file_path)
+        pages = pdf.pages
+
+        # Merge the text of all pages into one document.
+        full_text = " ".join(page.extract_text() or "" for page in pages).strip()
+
+        # Take a single stat() snapshot so size and timestamps describe the
+        # same state of the file (and the file system is queried only once).
+        source = Path(file_path)
+        info = source.stat()
+
+        content = {
+            'total_pages': len(pages),
+            'file_name': source.name,
+            'file_type': 'pdf',
+            'file_size': info.st_size,  # size in bytes
+            # NOTE: st_ctime is the creation time only on Windows; on Unix it
+            # is the time of the last metadata change.
+            'created_at': str(info.st_ctime),
+            'modified_at': str(info.st_mtime)  # last content modification
+        }
+
+        return [full_text], content