init: scripts diversos (crawlers, conversores, scrapers)

2026-03-05 20:38:36 +00:00
commit 6ac6f4be2a
925 changed files with 850330 additions and 0 deletions
--- a/kb-processor/src/processors/web_processor.py
+++ b/kb-processor/src/processors/web_processor.py
@@ -0,0 +1,213 @@
+"""
+Web Processor - Módulo para processamento de conteúdo web
+Descomplicar - Agência de Aceleração Digital
+https://www.descomplicar.pt
+"""
+
+import os
+import time
+from datetime import datetime
+from typing import List, Dict
+from urllib.parse import urljoin, urlparse
+
+import requests
+from bs4 import BeautifulSoup
+from openai import OpenAI
+
+from .base_processor import BaseProcessor
+
+class WebProcessor(BaseProcessor):
+    """Processador específico para conteúdo web."""
+    
+    def __init__(self, url: str):
+        """
+        Inicializa o processador web.
+        
+        Args:
+            url (str): URL do conteúdo web
+        """
+        super().__init__(url)
+        self.url = url
+        self.base_domain = urlparse(url).netloc
+        self.soup = None
+        self.links = []
+        self.images = []
+        
+        # Configurar cliente OpenAI para traduções
+        self.translator = OpenAI(
+            api_key="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"
+        )
+        
+        # Atualizar metadata
+        self.metadata.update({
+            "tipo_documento": "web",
+            "fonte": url,
+            "data_original": datetime.now().strftime("%d-%m-%Y")
+        })
+    
+    def translate_batch(self, texts: List[str]) -> List[str]:
+        """
+        Traduz um lote de textos de uma vez.
+        
+        Args:
+            texts (List[str]): Lista de textos para traduzir
+            
+        Returns:
+            List[str]: Lista de textos traduzidos
+        """
+        if not texts:
+            return []
+            
+        # Juntar textos com marcadores
+        combined_text = "\n---SPLIT---\n".join(texts)
+        
+        try:
+            response = self.translator.chat.completions.create(
+                model="gpt-4",
+                messages=[
+                    {"role": "system", "content": "Traduza o seguinte texto para português de Portugal. Mantenha termos técnicos em inglês quando apropriado. Mantenha os marcadores ---SPLIT--- para separar os textos:"},
+                    {"role": "user", "content": combined_text}
+                ]
+            )
+            
+            # Separar textos traduzidos
+            translated = response.choices[0].message.content.split("\n---SPLIT---\n")
+            return [t.strip() for t in translated]
+            
+        except Exception as e:
+            print(f"Erro ao traduzir textos: {str(e)}")
+            return texts
+    
+    def read_content(self) -> str:
+        """
+        Lê o conteúdo da URL.
+        
+        Returns:
+            str: Conteúdo da página web
+        """
+        try:
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            }
+            response = requests.get(self.url, headers=headers, timeout=30)
+            response.raise_for_status()
+            return response.text
+        except Exception as e:
+            print(f"Erro ao ler URL {self.url}: {str(e)}")
+            return ""
+    
+    def process_content(self):
+        """Processa o conteúdo web."""
+        start_time = time.time()
+        
+        # Definir timeout de 5 minutos
+        timeout = 300
+        
+        html = self.read_content()
+        if not html:
+            return
+            
+        self.soup = BeautifulSoup(html, 'html.parser')
+        
+        # Extrair título
+        title = self.soup.title.string if self.soup.title else "Sem título"
+        self.metadata["título"] = title.strip()
+        
+        # Extrair texto principal
+        self.content = self._extract_main_content()
+        
+        # Extrair links importantes
+        self._extract_links()
+        
+        # Dividir em secções
+        self._split_into_sections()
+        
+        # Verificar timeout
+        if time.time() - start_time > timeout:
+            print("Tempo limite excedido. Interrompendo processamento.")
+            return
+    
+    def _extract_main_content(self) -> str:
+        """
+        Extrai o conteúdo principal da página.
+        
+        Returns:
+            str: Texto principal da página
+        """
+        # Remover elementos indesejados
+        for elem in self.soup.select('script, style, nav, footer, header, .sidebar, .menu, .ads'):
+            elem.decompose()
+        
+        # Tentar encontrar o conteúdo principal
+        main_content = None
+        for selector in ['article', 'main', '.content', '.main-content', '#content', '#main']:
+            main_content = self.soup.select_one(selector)
+            if main_content:
+                break
+        
+        # Se não encontrar conteúdo principal, usar body
+        if not main_content:
+            main_content = self.soup.body
+        
+        # Extrair texto
+        if main_content:
+            # Extrair apenas parágrafos e cabeçalhos relevantes
+            elements = main_content.find_all(['p', 'h1', 'h2', 'h3', 'li'])
+            texts = [elem.get_text(strip=True) for elem in elements]
+            texts = [t for t in texts if len(t) > 20]  # Filtrar textos muito curtos
+            
+            # Traduzir em lotes de 5 textos
+            batch_size = 5
+            translated_texts = []
+            for i in range(0, len(texts), batch_size):
+                batch = texts[i:i + batch_size]
+                translated_batch = self.translate_batch(batch)
+                translated_texts.extend(translated_batch)
+            
+            return '\n\n'.join(translated_texts)
+        
+        return ""
+    
+    def _extract_links(self):
+        """Extrai links importantes da página."""
+        main_content = self.soup.select_one('article, main, .content, #content')
+        if not main_content:
+            return
+            
+        for link in main_content.find_all('a', href=True):
+            href = link.get('href')
+            text = link.get_text(strip=True)
+            if href and text and len(text) > 5:  # Ignorar links muito curtos
+                absolute_url = urljoin(self.url, href)
+                if urlparse(absolute_url).netloc == self.base_domain:
+                    self.links.append({
+                        'text': text,
+                        'url': absolute_url
+                    })
+    
+    def _split_into_sections(self):
+        """Divide o conteúdo em secções baseado em cabeçalhos."""
+        if not self.content:
+            return
+            
+        # Dividir por linhas vazias para encontrar parágrafos
+        paragraphs = [p.strip() for p in self.content.split('\n\n') if p.strip()]
+        
+        current_section = {
+            'title': self.metadata['título'],
+            'content': '',
+            'faqs': []
+        }
+        
+        for p in paragraphs:
+            # Se o parágrafo parece um título (curto e termina sem pontuação)
+            if len(p) < 100 and not p[-1] in '.!?':
+                if current_section['content']:
+                    self.chapters.append(current_section)
+                current_section = {
+                    'title': p,
+                    'content': '',
+                    'faqs': []
+                }
+            else:
+                current_section['content'] += p + '\n\n'
+        
+        if current_section['content']:
+            self.chapters.append(current_section)