# scripts/kb-processor/src/processors/web_processor.py

"""
Web Processor - Módulo para processamento de conteúdo web
Descomplicar - Agência de Aceleração Digital
https://www.descomplicar.pt
"""

import time
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from openai import OpenAI

from .base_processor import BaseProcessor

class WebProcessor(BaseProcessor):
    """Processor for web content.

    Downloads a page, extracts its main text, translates that text to
    European Portuguese through the OpenAI chat API, collects same-domain
    links and splits the translated text into titled sections.

    NOTE(review): ``self.metadata``, ``self.content`` and ``self.chapters``
    are used here but never created in this class; presumably they are
    provided by ``BaseProcessor`` - confirm against the base class.
    """

    # Marker placed between texts so a whole batch can be translated in a
    # single request and split apart again afterwards.
    _SPLIT_MARKER = "---SPLIT---"

    # Number of text fragments sent to the translator per request.
    _TRANSLATION_BATCH_SIZE = 5

    def __init__(self, url: str, api_key: Optional[str] = None):
        """
        Initialize the web processor.

        Args:
            url (str): URL of the web content.
            api_key (Optional[str]): OpenAI API key used for translations.
                When omitted, the OpenAI client falls back to the
                ``OPENAI_API_KEY`` environment variable.
        """
        super().__init__(url)
        self.url = url
        self.base_domain = urlparse(url).netloc
        self.soup = None
        self.links = []
        self.images = []

        # SECURITY: an API key used to be hard-coded on this line. Secrets
        # must never live in source control; the previously committed key
        # should be considered leaked and revoked. The key is now supplied
        # by the caller or read from the environment by the OpenAI client.
        self.translator = OpenAI(api_key=api_key)

        # Record where and when this document was obtained.
        self.metadata.update({
            "tipo_documento": "web",
            "fonte": url,
            "data_original": datetime.now().strftime("%d-%m-%Y")
        })

    def translate_batch(self, texts: List[str]) -> List[str]:
        """
        Translate a batch of texts with a single API request.

        The texts are joined with a split marker, translated together and
        split apart again on that marker.

        Args:
            texts (List[str]): Texts to translate.

        Returns:
            List[str]: Translated texts, stripped of surrounding whitespace.
                If the request fails, the original ``texts`` are returned
                unchanged. The result may contain a different number of
                items than ``texts`` if the model adds or drops markers.
        """
        if not texts:
            return []

        combined_text = f"\n{self._SPLIT_MARKER}\n".join(texts)

        try:
            response = self.translator.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "Traduza o seguinte texto para português de Portugal. Mantenha termos técnicos em inglês quando apropriado. Mantenha os marcadores ---SPLIT--- para separar os textos:"},
                    {"role": "user", "content": combined_text}
                ]
            )

            # Split on the bare marker and strip each piece: the model does
            # not always reproduce the exact newlines around the marker, and
            # an exact-match split would leave markers inside the content.
            reply = response.choices[0].message.content
            return [piece.strip() for piece in reply.split(self._SPLIT_MARKER)]

        except Exception as e:
            # Best effort: keep the untranslated texts rather than fail.
            print(f"Erro ao traduzir textos: {str(e)}")
            return texts

    def read_content(self) -> str:
        """
        Fetch the page at ``self.url``.

        Returns:
            str: The response body, or an empty string if the request fails
                or returns an HTTP error status.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        try:
            response = requests.get(self.url, headers=headers, timeout=30)
            response.raise_for_status()
            return response.text
        except Exception as e:
            print(f"Erro ao ler URL {self.url}: {str(e)}")
            return ""

    def process_content(self):
        """
        Run the full pipeline: download, parse, extract, translate, split.

        Does nothing further if the page cannot be downloaded. On success it
        sets ``self.soup``, ``self.metadata["título"]`` and ``self.content``,
        and appends to ``self.links`` and ``self.chapters``.
        """
        start_time = time.time()

        # Time budget in seconds (5 minutes).
        timeout = 300

        html = self.read_content()
        if not html:
            return

        self.soup = BeautifulSoup(html, 'html.parser')

        # Use get_text() rather than .string: .string is None when <title>
        # is empty or contains nested tags, which would crash on .strip().
        title_tag = self.soup.title
        title = title_tag.get_text(strip=True) if title_tag else ""
        self.metadata["título"] = title or "Sem título"

        self.content = self._extract_main_content()
        self._extract_links()
        self._split_into_sections()

        # NOTE: this check runs after every stage has completed, so it only
        # reports that the budget was exceeded; it cannot interrupt the work.
        if time.time() - start_time > timeout:
            print("Tempo limite excedido. Interrompendo processamento.")

    def _extract_main_content(self) -> str:
        """
        Extract and translate the main text of the parsed page.

        Scripts, styles and navigation-like elements are removed from
        ``self.soup`` (this mutates the tree). The first container matching
        the known selectors is used, falling back to ``<body>``.

        Returns:
            str: Translated paragraphs joined by blank lines, or an empty
                string when the page has no usable container.
        """
        # Drop elements that never carry article text.
        for unwanted in self.soup.select('script, style, nav, footer, header, .sidebar, .menu, .ads'):
            unwanted.decompose()

        # Selectors are tried in priority order; the first hit wins.
        container = None
        for selector in ['article', 'main', '.content', '.main-content', '#content', '#main']:
            container = self.soup.select_one(selector)
            if container:
                break

        if not container:
            container = self.soup.body
        if not container:
            return ""

        # Keep paragraphs, headings and list items, skipping fragments of
        # 20 characters or fewer.
        fragments = [
            elem.get_text(strip=True)
            for elem in container.find_all(['p', 'h1', 'h2', 'h3', 'li'])
        ]
        fragments = [text for text in fragments if len(text) > 20]

        # Translate in small batches to keep each request compact.
        step = self._TRANSLATION_BATCH_SIZE
        translated_texts = []
        for start in range(0, len(fragments), step):
            translated_texts.extend(self.translate_batch(fragments[start:start + step]))

        return '\n\n'.join(translated_texts)

    def _extract_links(self):
        """
        Collect same-domain links from the main content container.

        Appends ``{'text', 'url'}`` dicts to ``self.links`` for anchors with
        link text longer than 5 characters whose resolved URL is on
        ``self.base_domain``. Does nothing if no container is found.
        """
        container = self.soup.select_one('article, main, .content, #content')
        if not container:
            return

        for anchor in container.find_all('a', href=True):
            href = anchor.get('href')
            label = anchor.get_text(strip=True)
            # Skip empty targets and very short labels.
            if not (href and label and len(label) > 5):
                continue

            absolute_url = urljoin(self.url, href)
            if urlparse(absolute_url).netloc == self.base_domain:
                self.links.append({
                    'text': label,
                    'url': absolute_url
                })

    @staticmethod
    def _build_section(title: str, paragraphs: List[str]) -> Dict:
        """Build a section dict; each paragraph is followed by a blank line."""
        return {
            'title': title,
            'content': ''.join(f"{paragraph}\n\n" for paragraph in paragraphs),
            'faqs': []
        }

    def _split_into_sections(self):
        """
        Split ``self.content`` into sections and append them to ``self.chapters``.

        A paragraph shorter than 100 characters that does not end in ``.``,
        ``!`` or ``?`` is treated as a heading and starts a new section. The
        first section is titled with the page title. Sections without body
        text are discarded.
        """
        if not self.content:
            return

        # Paragraphs are separated by blank lines.
        paragraphs = [p.strip() for p in self.content.split('\n\n') if p.strip()]

        section_title = self.metadata['título']
        section_body = []

        for paragraph in paragraphs:
            looks_like_heading = len(paragraph) < 100 and paragraph[-1] not in '.!?'
            if looks_like_heading:
                # Close the running section (if it has text) and start anew.
                if section_body:
                    self.chapters.append(self._build_section(section_title, section_body))
                section_title = paragraph
                section_body = []
            else:
                section_body.append(paragraph)

        if section_body:
            self.chapters.append(self._build_section(section_title, section_body))