"""
|
|
Embedding Processor - Processador para geração e gestão de embeddings
|
|
Descomplicar - Agência de Aceleração Digital
|
|
https://www.descomplicar.pt
|
|
"""

import os
from typing import Any, Dict, List

import psycopg2
import requests
import tiktoken
from psycopg2.extras import execute_values
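
# The SQL in this module assumes a pgvector-backed schema along these
# lines (a sketch for reference only; the real DDL lives outside this
# file and may differ):
#
#   CREATE EXTENSION IF NOT EXISTS vector;
#
#   CREATE TABLE documents (
#       id       serial PRIMARY KEY,
#       title    text,
#       content  text,
#       metadata jsonb
#   );
#
#   CREATE TABLE embeddings (
#       document_id int REFERENCES documents (id),
#       chunk_index int,
#       chunk_text  text,
#       embedding   vector(1536)  -- text-embedding-ada-002 is 1536-dimensional
#   );
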
class EmbeddingProcessor:
    """Generates and manages embeddings."""

    def __init__(self):
        """Initializes the embedding processor."""
        # Credentials are read from the environment instead of being
        # hardcoded in the source. (The env var names are conventions
        # assumed here, not defined elsewhere in this file.)
        self.api_key = os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise RuntimeError("OPENAI_API_KEY environment variable is not set")

        self.model = "text-embedding-ada-002"
        self.encoding = tiktoken.encoding_for_model(self.model)
        self.max_tokens = 8000  # Leave a safety margin below the model's 8191-token limit

        # Database configuration
        self.db_config = {
            'dbname': 'superbot_kb',
            'user': 'superbot_user',
            'password': os.getenv('SUPERBOT_DB_PASSWORD'),
            'host': 'easy.descomplicar.pt',
            'port': '5433'
        }

        # Connection state
        self.conn = None
        self.cur = None

    def connect(self) -> None:
        """Opens a connection to the database."""
        try:
            self.conn = psycopg2.connect(**self.db_config)
            self.cur = self.conn.cursor()
        except Exception as e:
            raise Exception(f"Error connecting to the database: {str(e)}")

    def disconnect(self) -> None:
        """Closes the database connection."""
        if self.cur:
            self.cur.close()
        if self.conn:
            self.conn.close()

    def split_text(self, text: str) -> List[str]:
        """
        Splits the text into smaller chunks that respect the token limit.

        Args:
            text (str): Text to split

        Returns:
            List[str]: List of text chunks
        """
        tokens = self.encoding.encode(text)

        # Slice the token list into windows of at most max_tokens and
        # decode each window back into text.
        chunks = []
        for start in range(0, len(tokens), self.max_tokens):
            chunk_tokens = tokens[start:start + self.max_tokens]
            chunks.append(self.encoding.decode(chunk_tokens))

        return chunks

    def generate_embedding(self, text: str) -> List[float]:
        """
        Generates an embedding for a text using the OpenAI API.

        Args:
            text (str): Text to embed

        Returns:
            List[float]: Embedding vector
        """
        try:
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.api_key}"
            }

            data = {
                "model": self.model,
                "input": text
            }

            response = requests.post(
                "https://api.openai.com/v1/embeddings",
                headers=headers,
                json=data,
                timeout=60  # Fail fast instead of hanging on a stalled request
            )

            if response.status_code != 200:
                raise Exception(f"API Error: {response.status_code} - {response.text}")

            return response.json()["data"][0]["embedding"]
        except Exception as e:
            raise Exception(f"Error generating embedding: {str(e)}")

    def update_document_embeddings(self, document_id: int) -> None:
        """
        Regenerates the embeddings for a document.

        Args:
            document_id (int): Document ID
        """
        try:
            # Connect to the database
            self.connect()

            # Fetch the document content
            self.cur.execute("""
                SELECT content FROM documents
                WHERE id = %s
            """, (document_id,))

            result = self.cur.fetchone()
            if not result:
                raise Exception(f"Document not found: {document_id}")

            content = result[0]

            # Split into chunks
            chunks = self.split_text(content)
            print(f"Document split into {len(chunks)} chunks")

            # Generate an embedding for each chunk
            embeddings_data = []
            for i, chunk in enumerate(chunks, 1):
                print(f"Processing chunk {i}/{len(chunks)}...")
                embedding = self.generate_embedding(chunk)

                embeddings_data.append({
                    'document_id': document_id,
                    'chunk_index': i - 1,
                    'chunk_text': chunk,
                    'embedding': embedding
                })

            # Replace any existing embeddings for this document
            self.cur.execute("""
                DELETE FROM embeddings
                WHERE document_id = %s
            """, (document_id,))

            # The explicit ::vector cast in the template ensures the
            # Python list (adapted by psycopg2 to a plain array) lands
            # in the pgvector column.
            execute_values(
                self.cur,
                """
                INSERT INTO embeddings (document_id, chunk_index, chunk_text, embedding)
                VALUES %s
                """,
                [(
                    d['document_id'],
                    d['chunk_index'],
                    d['chunk_text'],
                    d['embedding']
                ) for d in embeddings_data],
                template="(%s, %s, %s, %s::vector)"
            )

            self.conn.commit()

        except Exception as e:
            if self.conn:
                self.conn.rollback()
            raise Exception(f"Error updating embeddings: {str(e)}")

        finally:
            self.disconnect()

    def search_similar(self, query: str, limit: int = 5) -> List[Dict[str, Any]]:
        """
        Searches for documents similar to the query.

        Args:
            query (str): Text to search for
            limit (int): Maximum number of results

        Returns:
            List[Dict[str, Any]]: List of similar documents
        """
        try:
            # Generate an embedding for the query
            query_embedding = self.generate_embedding(query)

            # Connect to the database
            self.connect()

            # Fetch the most similar chunks, then aggregate them per
            # document. The ::vector cast is needed because psycopg2
            # sends the embedding as a plain array.
            self.cur.execute("""
                WITH similarity AS (
                    SELECT
                        d.id as document_id,
                        d.title,
                        d.metadata,
                        e.chunk_text,
                        1 - (e.embedding <=> %s::vector) as similarity
                    FROM embeddings e
                    JOIN documents d ON e.document_id = d.id
                    ORDER BY similarity DESC
                    LIMIT 20
                )
                SELECT
                    document_id,
                    title,
                    metadata,
                    array_agg(chunk_text ORDER BY similarity DESC) as chunks,
                    max(similarity) as doc_similarity
                FROM similarity
                GROUP BY document_id, title, metadata
                ORDER BY doc_similarity DESC
                LIMIT %s
            """, (query_embedding, limit))

            results = []
            for row in self.cur.fetchall():
                results.append({
                    'document_id': row[0],
                    'title': row[1],
                    'metadata': row[2],
                    'relevant_chunks': row[3],
                    'doc_similarity': row[4]
                })

            return results

        except Exception as e:
            raise Exception(f"Error searching documents: {str(e)}")

        finally:
            self.disconnect()
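

# Minimal usage sketch (illustrative only; the document ID and query
# below are assumptions, not values from this project). Requires
# OPENAI_API_KEY and SUPERBOT_DB_PASSWORD in the environment.
if __name__ == "__main__":
    processor = EmbeddingProcessor()

    # Re-embed a document, then run a similarity search.
    processor.update_document_embeddings(document_id=1)

    for doc in processor.search_similar("how to accelerate a digital project", limit=3):
        print(f"{doc['doc_similarity']:.3f}  {doc['title']}")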