init: scripts diversos (crawlers, conversores, scrapers)

This commit is contained in:
2026-03-05 20:38:36 +00:00
commit 6ac6f4be2a
925 changed files with 850330 additions and 0 deletions

View File

@@ -0,0 +1,252 @@
"""
Embedding Processor - Processador para geração e gestão de embeddings
Descomplicar - Agência de Aceleração Digital
https://www.descomplicar.pt
"""
import requests
import numpy as np
from typing import List, Dict, Any, Optional
import os
from datetime import datetime
import psycopg2
from psycopg2.extras import Json, execute_values
import json
import tiktoken
class EmbeddingProcessor:
    """Generate, store, and query text embeddings backed by PostgreSQL.

    Embeddings are produced via the OpenAI embeddings HTTP API and
    persisted in an ``embeddings`` table; similarity search uses the
    ``<=>`` distance operator (pgvector-style) on the stored vectors.
    """

    def __init__(self):
        """Initialize the embedding processor.

        SECURITY: credentials are read from the environment instead of
        being hard-coded — the original revision committed a live OpenAI
        API key and a database password to source control; both must be
        considered compromised and rotated.

        Environment variables:
            OPENAI_API_KEY       OpenAI API key (required for API calls).
            SUPERBOT_DB_NAME     Database name (default: superbot_kb).
            SUPERBOT_DB_USER     Database user (default: superbot_user).
            SUPERBOT_DB_PASSWORD Database password (required).
            SUPERBOT_DB_HOST     Database host (default: easy.descomplicar.pt).
            SUPERBOT_DB_PORT     Database port (default: 5433).
        """
        self.api_key = os.getenv("OPENAI_API_KEY", "")
        self.model = "text-embedding-ada-002"
        self.encoding = tiktoken.encoding_for_model(self.model)
        # Safety margin below the model's input-token limit.
        self.max_tokens = 8000
        # Database connection parameters, overridable via environment.
        self.db_config = {
            'dbname': os.getenv('SUPERBOT_DB_NAME', 'superbot_kb'),
            'user': os.getenv('SUPERBOT_DB_USER', 'superbot_user'),
            'password': os.getenv('SUPERBOT_DB_PASSWORD', ''),
            'host': os.getenv('SUPERBOT_DB_HOST', 'easy.descomplicar.pt'),
            'port': os.getenv('SUPERBOT_DB_PORT', '5433'),
        }
        # Connection and cursor are opened lazily by connect().
        self.conn = None
        self.cur = None

    def connect(self) -> None:
        """Open a connection and cursor to the database.

        Raises:
            Exception: if the connection cannot be established.
        """
        try:
            self.conn = psycopg2.connect(**self.db_config)
            self.cur = self.conn.cursor()
        except Exception as e:
            # Chain the original error so the root cause is preserved.
            raise Exception(f"Erro ao conectar ao banco de dados: {str(e)}") from e

    def disconnect(self) -> None:
        """Close the cursor and connection, if open.

        Safe to call multiple times; resets handles to None so a
        half-open state is never reused.
        """
        if self.cur:
            self.cur.close()
            self.cur = None
        if self.conn:
            self.conn.close()
            self.conn = None

    def split_text(self, text: str) -> List[str]:
        """Split text into chunks of at most ``self.max_tokens`` tokens.

        Args:
            text: Text to split.

        Returns:
            List of text chunks (empty list for empty input). Chunk
            boundaries fall exactly every ``max_tokens`` tokens, matching
            the original token-by-token accumulation logic.
        """
        tokens = self.encoding.encode(text)
        # Slice the token stream directly instead of growing a chunk one
        # token at a time — equivalent boundaries, far less Python-level work.
        return [
            self.encoding.decode(tokens[i:i + self.max_tokens])
            for i in range(0, len(tokens), self.max_tokens)
        ]

    def generate_embedding(self, text: str) -> List[float]:
        """Generate an embedding vector for ``text`` via the OpenAI API.

        Args:
            text: Text to embed.

        Returns:
            The embedding vector as a list of floats.

        Raises:
            Exception: on any HTTP or API error.
        """
        try:
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.api_key}"
            }
            data = {
                "model": self.model,
                "input": text
            }
            response = requests.post(
                "https://api.openai.com/v1/embeddings",
                headers=headers,
                json=data,
                timeout=30  # avoid hanging forever on a stalled connection
            )
            if response.status_code != 200:
                raise Exception(f"API Error: {response.status_code} - {response.text}")
            return response.json()["data"][0]["embedding"]
        except Exception as e:
            raise Exception(f"Erro ao gerar embedding: {str(e)}") from e

    def update_document_embeddings(self, document_id: int) -> None:
        """Regenerate and store all embeddings for one document.

        Fetches the document content, splits it into token-limited
        chunks, embeds each chunk, then atomically replaces the
        document's rows in ``embeddings`` (delete + bulk insert in a
        single transaction).

        Args:
            document_id: ID of the document in the ``documents`` table.

        Raises:
            Exception: if the document is missing or any step fails
                (the transaction is rolled back).
        """
        try:
            self.connect()
            # Fetch the document content.
            self.cur.execute("""
                SELECT content FROM documents
                WHERE id = %s
            """, (document_id,))
            result = self.cur.fetchone()
            if not result:
                raise Exception(f"Documento não encontrado: {document_id}")
            content = result[0]
            # Split into token-limited chunks.
            chunks = self.split_text(content)
            print(f"Documento dividido em {len(chunks)} chunks")
            # Generate one embedding per chunk.
            embeddings_data = []
            for i, chunk in enumerate(chunks, 1):
                print(f"Processando chunk {i}/{len(chunks)}...")
                embedding = self.generate_embedding(chunk)
                embeddings_data.append({
                    'document_id': document_id,
                    'chunk_index': i - 1,  # stored index is zero-based
                    'chunk_text': chunk,
                    'embedding': embedding
                })
            # Replace existing rows for this document, then bulk-insert.
            self.cur.execute("""
                DELETE FROM embeddings
                WHERE document_id = %s
            """, (document_id,))
            # NOTE(review): the embedding is passed as a Python list, which
            # psycopg2 adapts to a SQL ARRAY — verify this matches the
            # column type (pgvector columns typically need a ::vector cast).
            execute_values(
                self.cur,
                """
                INSERT INTO embeddings (document_id, chunk_index, chunk_text, embedding)
                VALUES %s
                """,
                [(
                    d['document_id'],
                    d['chunk_index'],
                    d['chunk_text'],
                    d['embedding']
                ) for d in embeddings_data]
            )
            self.conn.commit()
        except Exception as e:
            if self.conn:
                self.conn.rollback()
            raise Exception(f"Erro ao atualizar embeddings: {str(e)}") from e
        finally:
            self.disconnect()

    def search_similar(self, query: str, limit: int = 5) -> List[Dict[str, Any]]:
        """Find documents most similar to a free-text query.

        Embeds the query, ranks the 20 nearest chunks by cosine-style
        similarity (``1 - (embedding <=> query)``), then aggregates
        chunks per document and returns the top documents.

        Args:
            query: Query text.
            limit: Maximum number of documents to return.

        Returns:
            List of dicts with keys ``document_id``, ``title``,
            ``metadata``, ``relevant_chunks`` (best-first), and
            ``doc_similarity``.

        Raises:
            Exception: on embedding or database errors.
        """
        try:
            # Embed the query before touching the database.
            query_embedding = self.generate_embedding(query)
            self.connect()
            # Top-20 chunk pre-filter, then group to document level.
            # NOTE(review): as above, the list parameter's adaptation to
            # the vector column type should be verified against pgvector.
            self.cur.execute("""
                WITH similarity AS (
                    SELECT
                        d.id as document_id,
                        d.title,
                        d.metadata,
                        e.chunk_text,
                        1 - (e.embedding <=> %s) as similarity
                    FROM embeddings e
                    JOIN documents d ON e.document_id = d.id
                    ORDER BY similarity DESC
                    LIMIT 20
                )
                SELECT
                    document_id,
                    title,
                    metadata,
                    array_agg(chunk_text ORDER BY similarity DESC) as chunks,
                    max(similarity) as doc_similarity
                FROM similarity
                GROUP BY document_id, title, metadata
                ORDER BY doc_similarity DESC
                LIMIT %s
            """, (query_embedding, limit))
            results = []
            for row in self.cur.fetchall():
                results.append({
                    'document_id': row[0],
                    'title': row[1],
                    'metadata': row[2],
                    'relevant_chunks': row[3],
                    'doc_similarity': row[4]
                })
            return results
        except Exception as e:
            raise Exception(f"Erro ao buscar documentos: {str(e)}") from e
        finally:
            self.disconnect()