init: scripts diversos (crawlers, conversores, scrapers)

This commit is contained in:
2026-03-05 20:38:36 +00:00
commit 6ac6f4be2a
925 changed files with 850330 additions and 0 deletions

View File

@@ -0,0 +1,252 @@
"""
Embedding Processor - Processador para geração e gestão de embeddings
Descomplicar - Agência de Aceleração Digital
https://www.descomplicar.pt
"""
import requests
import numpy as np
from typing import List, Dict, Any, Optional
import os
from datetime import datetime
import psycopg2
from psycopg2.extras import Json, execute_values
import json
import tiktoken
class EmbeddingProcessor:
    """Generate, store, and query text embeddings backed by PostgreSQL.

    Embeddings are produced via the OpenAI embeddings HTTP API and
    persisted in an ``embeddings`` table; similarity search uses the
    ``<=>`` distance operator (pgvector-style) on the stored vectors.
    """

    def __init__(self):
        """Initialize the embedding processor.

        SECURITY: credentials are read from the environment instead of
        being hard-coded — the original revision committed a live OpenAI
        API key and a database password to source control; both must be
        considered compromised and rotated.

        Environment variables:
            OPENAI_API_KEY       OpenAI API key (required for API calls).
            SUPERBOT_DB_NAME     Database name (default: superbot_kb).
            SUPERBOT_DB_USER     Database user (default: superbot_user).
            SUPERBOT_DB_PASSWORD Database password (required).
            SUPERBOT_DB_HOST     Database host (default: easy.descomplicar.pt).
            SUPERBOT_DB_PORT     Database port (default: 5433).
        """
        self.api_key = os.getenv("OPENAI_API_KEY", "")
        self.model = "text-embedding-ada-002"
        self.encoding = tiktoken.encoding_for_model(self.model)
        # Safety margin below the model's input-token limit.
        self.max_tokens = 8000
        # Database connection parameters, overridable via environment.
        self.db_config = {
            'dbname': os.getenv('SUPERBOT_DB_NAME', 'superbot_kb'),
            'user': os.getenv('SUPERBOT_DB_USER', 'superbot_user'),
            'password': os.getenv('SUPERBOT_DB_PASSWORD', ''),
            'host': os.getenv('SUPERBOT_DB_HOST', 'easy.descomplicar.pt'),
            'port': os.getenv('SUPERBOT_DB_PORT', '5433'),
        }
        # Connection and cursor are opened lazily by connect().
        self.conn = None
        self.cur = None

    def connect(self) -> None:
        """Open a connection and cursor to the database.

        Raises:
            Exception: if the connection cannot be established.
        """
        try:
            self.conn = psycopg2.connect(**self.db_config)
            self.cur = self.conn.cursor()
        except Exception as e:
            # Chain the original error so the root cause is preserved.
            raise Exception(f"Erro ao conectar ao banco de dados: {str(e)}") from e

    def disconnect(self) -> None:
        """Close the cursor and connection, if open.

        Safe to call multiple times; resets handles to None so a
        half-open state is never reused.
        """
        if self.cur:
            self.cur.close()
            self.cur = None
        if self.conn:
            self.conn.close()
            self.conn = None

    def split_text(self, text: str) -> List[str]:
        """Split text into chunks of at most ``self.max_tokens`` tokens.

        Args:
            text: Text to split.

        Returns:
            List of text chunks (empty list for empty input). Chunk
            boundaries fall exactly every ``max_tokens`` tokens, matching
            the original token-by-token accumulation logic.
        """
        tokens = self.encoding.encode(text)
        # Slice the token stream directly instead of growing a chunk one
        # token at a time — equivalent boundaries, far less Python-level work.
        return [
            self.encoding.decode(tokens[i:i + self.max_tokens])
            for i in range(0, len(tokens), self.max_tokens)
        ]

    def generate_embedding(self, text: str) -> List[float]:
        """Generate an embedding vector for ``text`` via the OpenAI API.

        Args:
            text: Text to embed.

        Returns:
            The embedding vector as a list of floats.

        Raises:
            Exception: on any HTTP or API error.
        """
        try:
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.api_key}"
            }
            data = {
                "model": self.model,
                "input": text
            }
            response = requests.post(
                "https://api.openai.com/v1/embeddings",
                headers=headers,
                json=data,
                timeout=30  # avoid hanging forever on a stalled connection
            )
            if response.status_code != 200:
                raise Exception(f"API Error: {response.status_code} - {response.text}")
            return response.json()["data"][0]["embedding"]
        except Exception as e:
            raise Exception(f"Erro ao gerar embedding: {str(e)}") from e

    def update_document_embeddings(self, document_id: int) -> None:
        """Regenerate and store all embeddings for one document.

        Fetches the document content, splits it into token-limited
        chunks, embeds each chunk, then atomically replaces the
        document's rows in ``embeddings`` (delete + bulk insert in a
        single transaction).

        Args:
            document_id: ID of the document in the ``documents`` table.

        Raises:
            Exception: if the document is missing or any step fails
                (the transaction is rolled back).
        """
        try:
            self.connect()
            # Fetch the document content.
            self.cur.execute("""
                SELECT content FROM documents
                WHERE id = %s
            """, (document_id,))
            result = self.cur.fetchone()
            if not result:
                raise Exception(f"Documento não encontrado: {document_id}")
            content = result[0]
            # Split into token-limited chunks.
            chunks = self.split_text(content)
            print(f"Documento dividido em {len(chunks)} chunks")
            # Generate one embedding per chunk.
            embeddings_data = []
            for i, chunk in enumerate(chunks, 1):
                print(f"Processando chunk {i}/{len(chunks)}...")
                embedding = self.generate_embedding(chunk)
                embeddings_data.append({
                    'document_id': document_id,
                    'chunk_index': i - 1,  # stored index is zero-based
                    'chunk_text': chunk,
                    'embedding': embedding
                })
            # Replace existing rows for this document, then bulk-insert.
            self.cur.execute("""
                DELETE FROM embeddings
                WHERE document_id = %s
            """, (document_id,))
            # NOTE(review): the embedding is passed as a Python list, which
            # psycopg2 adapts to a SQL ARRAY — verify this matches the
            # column type (pgvector columns typically need a ::vector cast).
            execute_values(
                self.cur,
                """
                INSERT INTO embeddings (document_id, chunk_index, chunk_text, embedding)
                VALUES %s
                """,
                [(
                    d['document_id'],
                    d['chunk_index'],
                    d['chunk_text'],
                    d['embedding']
                ) for d in embeddings_data]
            )
            self.conn.commit()
        except Exception as e:
            if self.conn:
                self.conn.rollback()
            raise Exception(f"Erro ao atualizar embeddings: {str(e)}") from e
        finally:
            self.disconnect()

    def search_similar(self, query: str, limit: int = 5) -> List[Dict[str, Any]]:
        """Find documents most similar to a free-text query.

        Embeds the query, ranks the 20 nearest chunks by cosine-style
        similarity (``1 - (embedding <=> query)``), then aggregates
        chunks per document and returns the top documents.

        Args:
            query: Query text.
            limit: Maximum number of documents to return.

        Returns:
            List of dicts with keys ``document_id``, ``title``,
            ``metadata``, ``relevant_chunks`` (best-first), and
            ``doc_similarity``.

        Raises:
            Exception: on embedding or database errors.
        """
        try:
            # Embed the query before touching the database.
            query_embedding = self.generate_embedding(query)
            self.connect()
            # Top-20 chunk pre-filter, then group to document level.
            # NOTE(review): as above, the list parameter's adaptation to
            # the vector column type should be verified against pgvector.
            self.cur.execute("""
                WITH similarity AS (
                    SELECT
                        d.id as document_id,
                        d.title,
                        d.metadata,
                        e.chunk_text,
                        1 - (e.embedding <=> %s) as similarity
                    FROM embeddings e
                    JOIN documents d ON e.document_id = d.id
                    ORDER BY similarity DESC
                    LIMIT 20
                )
                SELECT
                    document_id,
                    title,
                    metadata,
                    array_agg(chunk_text ORDER BY similarity DESC) as chunks,
                    max(similarity) as doc_similarity
                FROM similarity
                GROUP BY document_id, title, metadata
                ORDER BY doc_similarity DESC
                LIMIT %s
            """, (query_embedding, limit))
            results = []
            for row in self.cur.fetchall():
                results.append({
                    'document_id': row[0],
                    'title': row[1],
                    'metadata': row[2],
                    'relevant_chunks': row[3],
                    'doc_similarity': row[4]
                })
            return results
        except Exception as e:
            raise Exception(f"Erro ao buscar documentos: {str(e)}") from e
        finally:
            self.disconnect()