""" Embedding Processor - Processador para geração e gestão de embeddings Descomplicar - Agência de Aceleração Digital https://www.descomplicar.pt """ import requests import numpy as np from typing import List, Dict, Any, Optional import os from datetime import datetime import psycopg2 from psycopg2.extras import Json, execute_values import json import tiktoken class EmbeddingProcessor: """Processador para geração e gestão de embeddings.""" def __init__(self): """Inicializa o processador de embeddings.""" self.api_key = "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" self.model = "text-embedding-ada-002" self.encoding = tiktoken.encoding_for_model(self.model) self.max_tokens = 8000 # Deixar margem de segurança # Configurações do banco de dados self.db_config = { 'dbname': 'superbot_kb', 'user': 'superbot_user', 'password': 'KufQ4La5jaAk', 'host': 'easy.descomplicar.pt', 'port': '5433' } # Inicializar conexão self.conn = None self.cur = None def connect(self) -> None: """Estabelece conexão com o banco de dados.""" try: self.conn = psycopg2.connect(**self.db_config) self.cur = self.conn.cursor() except Exception as e: raise Exception(f"Erro ao conectar ao banco de dados: {str(e)}") def disconnect(self) -> None: """Fecha a conexão com o banco de dados.""" if self.cur: self.cur.close() if self.conn: self.conn.close() def split_text(self, text: str) -> List[str]: """ Divide o texto em chunks menores respeitando o limite de tokens. Args: text (str): Texto para dividir Returns: List[str]: Lista de chunks de texto """ tokens = self.encoding.encode(text) chunks = [] current_chunk = [] current_size = 0 for token in tokens: if current_size + 1 > self.max_tokens: # Converter tokens atuais para texto chunk_text = self.encoding.decode(current_chunk) chunks.append(chunk_text) current_chunk = [token] current_size = 1 else: current_chunk.append(token) current_size += 1 # Adicionar último chunk if current_chunk: chunk_text = self.encoding.decode(current_chunk) chunks.append(chunk_text) return chunks def generate_embedding(self, text: str) -> List[float]: """ Gera embedding para um texto usando a API da OpenAI. Args: text (str): Texto para gerar embedding Returns: List[float]: Vetor de embedding """ try: headers = { "Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}" } data = { "model": self.model, "input": text } response = requests.post( "https://api.openai.com/v1/embeddings", headers=headers, json=data ) if response.status_code != 200: raise Exception(f"API Error: {response.status_code} - {response.text}") return response.json()["data"][0]["embedding"] except Exception as e: raise Exception(f"Erro ao gerar embedding: {str(e)}") def update_document_embeddings(self, document_id: int) -> None: """ Atualiza os embeddings de um documento. Args: document_id (int): ID do documento """ try: # Conectar ao banco de dados self.connect() # Buscar conteúdo do documento self.cur.execute(""" SELECT content FROM documents WHERE id = %s """, (document_id,)) result = self.cur.fetchone() if not result: raise Exception(f"Documento não encontrado: {document_id}") content = result[0] # Dividir em chunks chunks = self.split_text(content) print(f"Documento dividido em {len(chunks)} chunks") # Gerar embeddings para cada chunk embeddings_data = [] for i, chunk in enumerate(chunks, 1): print(f"Processando chunk {i}/{len(chunks)}...") embedding = self.generate_embedding(chunk) embeddings_data.append({ 'document_id': document_id, 'chunk_index': i-1, 'chunk_text': chunk, 'embedding': embedding }) # Salvar embeddings self.cur.execute(""" DELETE FROM embeddings WHERE document_id = %s """, (document_id,)) execute_values( self.cur, """ INSERT INTO embeddings (document_id, chunk_index, chunk_text, embedding) VALUES %s """, [( d['document_id'], d['chunk_index'], d['chunk_text'], d['embedding'] ) for d in embeddings_data] ) self.conn.commit() except Exception as e: if self.conn: self.conn.rollback() raise Exception(f"Erro ao atualizar embeddings: {str(e)}") finally: self.disconnect() def search_similar(self, query: str, limit: int = 5) -> List[Dict[str, Any]]: """ Busca documentos similares à query. Args: query (str): Texto para buscar limit (int): Número máximo de resultados Returns: List[Dict[str, Any]]: Lista de documentos similares """ try: # Gerar embedding para a query query_embedding = self.generate_embedding(query) # Conectar ao banco de dados self.connect() # Buscar chunks mais similares self.cur.execute(""" WITH similarity AS ( SELECT d.id as document_id, d.title, d.metadata, e.chunk_text, 1 - (e.embedding <=> %s) as similarity FROM embeddings e JOIN documents d ON e.document_id = d.id ORDER BY similarity DESC LIMIT 20 ) SELECT document_id, title, metadata, array_agg(chunk_text ORDER BY similarity DESC) as chunks, max(similarity) as doc_similarity FROM similarity GROUP BY document_id, title, metadata ORDER BY doc_similarity DESC LIMIT %s """, (query_embedding, limit)) results = [] for row in self.cur.fetchall(): results.append({ 'document_id': row[0], 'title': row[1], 'metadata': row[2], 'relevant_chunks': row[3], 'doc_similarity': row[4] }) return results except Exception as e: raise Exception(f"Erro ao buscar documentos: {str(e)}") finally: self.disconnect()