Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions src/infrastructure/database/chromadb/conector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import chromadb
from chromadb.config import Settings
from typing import List, Optional


class ChromaDB:
    """Manager for a ChromaDB HTTP connection and basic collection operations.

    Every public operation takes the name of the collection it targets.
    Collection handles are fetched with ``get_or_create_collection`` the first
    time a name is used and cached per name afterwards, so one manager can
    serve several collections. ``self.collection`` always refers to the
    collection used by the most recent operation.

    NOTE(review): the public methods are coroutines, but the client calls
    inside them are made synchronously — confirm whether callers expect them
    to be non-blocking.
    """

    def __init__(self, host: str = 'localhost', port: int = 8000):
        """Initialize the ChromaDB connection.

        Args:
            host (str): Host name of the ChromaDB server.
            port (int): TCP port of the ChromaDB server.
        """
        self.host = host
        self.port = port
        # Collection handles keyed by collection name.
        self._collections = {}
        self.client = self._connect()
        self.collection = None

    def _connect(self):
        """Create and return an HTTP client for ``self.host``/``self.port``."""
        return chromadb.HttpClient(host=self.host, port=self.port)

    def _create_collection(self, collection_name: str):
        """Create or get an existing collection and make it the current one.

        Args:
            collection_name (str): Name of the collection to create/get

        Returns:
            Collection: Created/retrieved collection object
        """
        if self.client is None:
            # close() dropped the client; reconnect so the manager stays usable.
            self.client = self._connect()
        self.collection = self.client.get_or_create_collection(
            name=collection_name
        )
        self._collections[collection_name] = self.collection
        return self.collection

    def _get_collection(self, collection_name: str):
        """Return the handle for ``collection_name``, fetching it on first use."""
        handle = self._collections.get(collection_name)
        if handle is None:
            handle = self._create_collection(collection_name)
        # Keep the "current collection" attribute in sync with the last call.
        self.collection = handle
        return handle

    async def add_documents(
        self,
        documents: List[str],
        collection_name: str,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None
    ):
        """Add documents to the named collection.

        Args:
            documents (List[str]): List of document texts
            collection_name (str): Collection to add to (created if missing)
            metadatas (Optional[List[dict]]): Document metadata
            ids (Optional[List[str]]): Unique document IDs

        Returns:
            Whatever the collection's ``add`` call returns.
        """
        target = self._get_collection(collection_name)
        return target.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )

    async def query_documents(
        self,
        query_text: str,
        collection_name: str,
        n_results: int = 5,
        where: Optional[dict] = None
    ):
        """Query the named collection for documents similar to a text.

        Args:
            query_text (str): Text to search for similarity
            collection_name (str): Collection to query (created if missing)
            n_results (int): Number of desired results
            where (Optional[dict]): Additional filters

        Returns:
            Whatever the collection's ``query`` call returns.
        """
        target = self._get_collection(collection_name)
        return target.query(
            query_texts=[query_text],
            n_results=n_results,
            where=where
        )

    async def delete_documents(self, ids: List[str], collection_name: str):
        """Delete documents from the named collection by IDs.

        Args:
            ids (List[str]): List of document IDs to delete
            collection_name (str): Collection to delete from (created if missing)

        Returns:
            Whatever the collection's ``delete`` call returns.
        """
        target = self._get_collection(collection_name)
        return target.delete(ids=ids)

    async def close(self):
        """Release the client and all cached collection handles.

        This deliberately does NOT call ``client.reset()``: in ChromaDB that
        call resets the database (dropping collections and their entries)
        instead of closing the connection. A later operation reconnects
        automatically.
        """
        self._collections.clear()
        self.collection = None
        self.client = None
98 changes: 98 additions & 0 deletions src/services/docmuent_extration/extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import json
from typing import List, Dict, Tuple, Optional
from pathlib import Path
from docx import Document
from PyPDF2 import PdfReader
import re

class Reader:
"""
Classe genérica para leitura de diferentes tipos de arquivos (JSON, PDF, DOCX, TXT).
"""

def __init__(self):
"""Inicializa o leitor de documentos."""
self.supported_extensions = {'.json', '.pdf', '.docx', '.txt'}

def __call__(self, file_path: str, document_content: str) -> Tuple[List[str], List[str], List[Dict]]:
"""
Lê um arquivo e retorna seus documentos, IDs e metadados.

Args:
file_path (str): Caminho para o arquivo
document_content (str): Tipo do documento (ex: 'noticia', 'artigo', 'contrato')

Returns:
Tuple[List[str], List[str], List[Dict]]: (documentos, ids, metadados)
"""
path = Path(file_path)

if not path.exists():
raise FileNotFoundError(f"Arquivo não encontrado: {file_path}")

if path.suffix.lower() not in self.supported_extensions:
raise ValueError(f"Extensão não suportada: {path.suffix}")

# Seleciona o método apropriado baseado na extensão
readers = {
'.json': self._read_json,
'.pdf': self._read_pdf,
'.docx': self._read_docx,
'.txt': self._read_txt
}

reader = readers.get(path.suffix.lower())
documents, content = reader(file_path)

# Adiciona os documentos ao content para uso no _generate_metadata
content['documents'] = documents

# Gera IDs e metadados
ids = [f"{document_content}_{i+1}" for i in range(len(documents))]
metadata = self._generate_metadata(content, document_content, path.name)

return documents, ids, metadata

def _read_json(self, file_path: str) -> Tuple[List[str], Dict]:
"""Lê arquivo JSON."""
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)

if isinstance(data, list):
documents = [item.get('texto', '') for item in data]
content = {
'titulos': [item.get('titulo', '') for item in data],
'subtitulos': [item.get('subtitulo', '') for item in data],
'datas': [item.get('data', '') for item in data]
}
else:
documents = [data.get('texto', '')]
content = {
'titulos': [data.get('titulo', '')],
'subtitulos': [data.get('subtitulo', '')],
'datas': [data.get('data', '')]
}

return documents, content

def _read_pdf(self, file_path: str) -> Tuple[List[str], Dict]:
    """Read a PDF file as one single document.

    The text of every page is joined with single spaces; a page whose
    extraction yields nothing contributes an empty string.

    Returns:
        Tuple[List[str], Dict]: a one-element list with the full text, and a
        dict describing the file (page count, name, type, size in bytes and
        stringified timestamps).
    """
    pdf = PdfReader(file_path)
    pages = pdf.pages

    # Merge all page texts into one document
    full_text = " ".join(page.extract_text() or "" for page in pages).strip()

    source = Path(file_path)
    # One stat() snapshot keeps size and timestamps mutually consistent
    file_stat = source.stat()

    content = {
        'total_pages': len(pages),
        'file_name': source.name,
        'file_type': 'pdf',
        'file_size': file_stat.st_size,  # size in bytes
        # NOTE(review): st_ctime is the creation time only on Windows; on Unix
        # it is the last metadata change — confirm what consumers expect.
        'created_at': str(file_stat.st_ctime),
        'modified_at': str(file_stat.st_mtime),  # last modification time
    }

    return [full_text], content
Loading