init: scripts diversos (crawlers, conversores, scrapers)

2026-03-05 20:38:36 +00:00
commit 6ac6f4be2a
925 changed files with 850330 additions and 0 deletions
@@ -0,0 +1,262 @@
+"""
+format_content.py
+
+Author: Descomplicar® Crescimento Digital
+Link: https://descomplicar.pt
+Copyright: 2025 Descomplicar®
+"""
+
+import os
+import re
+import json
+import logging
+import requests
+import time
+from typing import List, Dict, Optional, Generator
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dotenv import load_dotenv
+
+# Load environment variables (python-dotenv) before reading any setting below
+load_dotenv()
+
+# Settings: source directory, destination directory and the OpenRouter API key.
+# API_KEY is None when OPENROUTER_API_KEY is not set in the environment.
+INPUT_DIR = "output_cleaned"
+OUTPUT_DIR = "formatted"
+API_KEY = os.getenv("OPENROUTER_API_KEY")
+
+# Configure logging: INFO level, "timestamp - level - message" line format
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+class ContentFormatter:
+    """Format Markdown content through an OpenRouter chat-completion model.
+
+    A document is split into size-bounded chunks (``extract_sections``), each
+    chunk is sent to the model together with a fixed system prompt
+    (``format_chunk``), and the formatted chunks are written to one output
+    file per input file (``format_file``).
+    """
+
+    def __init__(self):
+        # API key as read from the OPENROUTER_API_KEY environment variable
+        # by the module-level settings; may be None.
+        self.api_key = API_KEY
+        self.api_url = "https://openrouter.ai/api/v1/chat/completions"
+        self.model = "mistralai/ministral-3b"
+
+        # Concise base instructions (system prompt) to save tokens
+        self.base_instructions = """
+        És um especialista em formatação de conteúdo. Formata o texto seguindo estas regras:
+
+        1. ESTRUTURA:
+        - Título principal (H1)
+        - Breve introdução
+        - Seções com subtítulos
+        - Conclusão
+
+        2. FORMATAÇÃO:
+        - Markdown para títulos e listas
+        - Parágrafos curtos
+        - Listas para pontos-chave
+        - Links importantes
+
+        3. CONTEÚDO:
+        - Remove conteúdo promocional
+        - Mantém informação relevante
+        - Linguagem profissional
+        - Português de Portugal
+        """
+
+    def extract_sections(
+        self, content: str, max_section_size: int = 400000
+    ) -> Generator[str, None, None]:
+        """Yield chunks of ``content`` no longer than ``max_section_size``.
+
+        The text is split at Markdown heading lines. Each heading stays
+        attached to the body that follows it, and consecutive sections are
+        packed together while they fit in the size limit. A section that is
+        larger than the limit on its own is split at blank lines, and a
+        paragraph that is still too large is cut at the character limit.
+
+        Chunks are always yielded in document order, and chunks containing
+        only whitespace are skipped.
+
+        Args:
+            content: Full Markdown text.
+            max_section_size: Maximum chunk length in characters. The default
+                of 400000 corresponds to roughly 100K tokens at 4 characters
+                per token, leaving margin below a 128K-token context.
+
+        Yields:
+            Non-empty chunks of the original text.
+
+        Raises:
+            ValueError: If ``max_section_size`` is not positive (raised when
+                the generator is first advanced).
+        """
+        if max_section_size <= 0:
+            raise ValueError("max_section_size must be positive")
+
+        # With one capture group re.split returns
+        # [preamble, heading, body, heading, body, ...]; every piece keeps its
+        # own newlines, so plain concatenation reproduces the original text.
+        pieces = re.split(r'(?m)^(#+\s+.*$)', content)
+        units = [pieces[0]]
+        units.extend(
+            pieces[i] + pieces[i + 1] for i in range(1, len(pieces) - 1, 2)
+        )
+
+        pending = []
+        pending_size = 0
+
+        for unit in units:
+            if not unit.strip():
+                continue
+
+            if len(unit) > max_section_size:
+                # Flush what was accumulated first so document order is kept.
+                if pending:
+                    yield ''.join(pending)
+                    pending, pending_size = [], 0
+                yield from self._split_oversized(unit, max_section_size)
+                continue
+
+            if pending and pending_size + len(unit) > max_section_size:
+                yield ''.join(pending)
+                pending, pending_size = [], 0
+
+            pending.append(unit)
+            pending_size += len(unit)
+
+        if pending:
+            yield ''.join(pending)
+
+    @staticmethod
+    def _split_oversized(section: str, max_size: int) -> Generator[str, None, None]:
+        """Split one oversized section into parts of at most ``max_size`` characters."""
+        separator = '\n\n'
+        part = []
+        part_size = 0
+
+        for paragraph in section.split(separator):
+            if len(paragraph) > max_size:
+                # A single paragraph beyond the limit cannot be kept whole:
+                # emit what is pending, then cut the paragraph by length.
+                if part:
+                    text = separator.join(part)
+                    if text.strip():
+                        yield text
+                    part, part_size = [], 0
+                for start in range(0, len(paragraph), max_size):
+                    piece = paragraph[start:start + max_size]
+                    if piece.strip():
+                        yield piece
+                continue
+
+            # Account for the separator that join() will re-insert.
+            added = len(paragraph) + (len(separator) if part else 0)
+            if part and part_size + added > max_size:
+                text = separator.join(part)
+                if text.strip():
+                    yield text
+                part, part_size = [], 0
+                added = len(paragraph)
+
+            part.append(paragraph)
+            part_size += added
+
+        if part:
+            text = separator.join(part)
+            if text.strip():
+                yield text
+
+    def format_chunk(self, chunk: str, retries: int = 3, timeout: int = 60) -> Optional[str]:
+        """Format one chunk of content with the LLM.
+
+        Every failed attempt that is worth retrying (timeout, network error,
+        rate limit, server error, malformed response) is followed by a wait
+        before the next attempt: exponential backoff, or the server-provided
+        ``Retry-After`` delay on HTTP 429. Client errors other than 408/429
+        are not retried because repeating the same request cannot succeed.
+
+        Args:
+            chunk: Text to format.
+            retries: Maximum number of attempts.
+            timeout: Per-request timeout in seconds.
+
+        Returns:
+            The formatted text, or None when no attempt produced a usable
+            answer.
+        """
+        if not self.api_key:
+            logger.error("OPENROUTER_API_KEY não definida; impossível chamar a API")
+            return None
+
+        # Request headers and payload do not change between attempts.
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+            "HTTP-Referer": "https://openrouter.ai/docs",
+            "X-Title": "Cascade IDE"
+        }
+        data = {
+            "model": self.model,
+            "messages": [
+                {"role": "system", "content": self.base_instructions},
+                {"role": "user", "content": chunk}
+            ],
+            "temperature": 0.7,
+            "max_tokens": None  # Lets the model decide the response length
+        }
+
+        for attempt in range(retries):
+            is_last = attempt == retries - 1
+            wait_time = 2 ** attempt  # Exponential backoff
+
+            try:
+                response = requests.post(
+                    self.api_url,
+                    headers=headers,
+                    json=data,
+                    timeout=timeout
+                )
+                status = response.status_code
+
+                if status == 200:
+                    content = self._extract_content(response.json())
+                    if content:
+                        return content
+                elif status == 429:  # Rate limit
+                    wait_time = self._retry_after_seconds(
+                        response.headers.get('Retry-After')
+                    )
+                    if is_last:
+                        logger.error("Rate limit atingido e sem tentativas restantes")
+                    else:
+                        logger.warning(f"Rate limit atingido. Aguardando {wait_time}s...")
+                else:
+                    logger.error(f"Erro na API: {status} - {response.text}")
+                    if 400 <= status < 500 and status != 408:
+                        return None
+
+            except requests.exceptions.Timeout:
+                logger.warning(f"Timeout ao processar chunk (tentativa {attempt + 1}/{retries})")
+            except (requests.exceptions.RequestException, ValueError) as e:
+                # ValueError covers a 200 response whose body is not valid JSON.
+                logger.error(f"Erro ao formatar chunk: {str(e)}")
+
+            if not is_last:
+                time.sleep(wait_time)
+
+        return None
+
+    @staticmethod
+    def _retry_after_seconds(value, default: float = 10) -> float:
+        """Return the ``Retry-After`` delay in seconds, or ``default`` if unusable."""
+        try:
+            seconds = float(value)
+        except (TypeError, ValueError):
+            # Header missing, or given in the HTTP-date form.
+            return default
+        # Rejects negative values, NaN and infinity.
+        if not 0 <= seconds < float('inf'):
+            return default
+        return seconds
+
+    @staticmethod
+    def _extract_content(result) -> Optional[str]:
+        """Return the first choice's message content, logging unexpected shapes."""
+        choices = result.get('choices') if isinstance(result, dict) else None
+        if not isinstance(choices, list) or not choices:
+            logger.error(f"Resposta sem choices: {result}")
+            return None
+
+        first = choices[0]
+        message = first.get('message') if isinstance(first, dict) else None
+        content = message.get('content') if isinstance(message, dict) else None
+        if not content:
+            logger.error(f"Formato de resposta inesperado: {result}")
+            return None
+        return content
+
+    def format_file(self, input_file: Path, output_dir: Path) -> bool:
+        """Format a whole file, section by section.
+
+        Progress is streamed to ``formatted_<name>.partial`` and only renamed
+        to ``formatted_<name>`` after every section has been attempted. An
+        interrupted run therefore never leaves a truncated file under the
+        final name, which the "already exists" check would otherwise skip
+        forever.
+
+        Sections that fail to format are logged and left out of the output
+        (best effort).
+
+        Args:
+            input_file: Markdown file to format.
+            output_dir: Existing directory that receives the output file.
+
+        Returns:
+            True when the output already existed or at least one section was
+            formatted and saved; False otherwise, including on any error.
+        """
+        try:
+            # Build the output file name
+            output_file = output_dir / f"formatted_{input_file.name}"
+
+            # Skip files that were already completed
+            if output_file.exists():
+                logger.info(f"Arquivo já existe, pulando: {output_file}")
+                return True
+
+            # Read the content
+            with open(input_file, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            # Extract the sections
+            sections = list(self.extract_sections(content))
+            logger.info(f"Dividido {input_file.name} em {len(sections)} seções")
+
+            partial_file = output_file.with_name(output_file.name + ".partial")
+            written = 0
+
+            with open(partial_file, 'w', encoding='utf-8') as out:
+                for i, section in enumerate(sections, 1):
+                    logger.info(f"Processando seção {i}/{len(sections)} de {input_file.name}")
+                    formatted = self.format_chunk(section)
+                    if formatted:
+                        # Append and flush so progress survives a crash.
+                        if written:
+                            out.write("\n\n---\n\n")
+                        out.write(formatted)
+                        out.flush()
+                        written += 1
+                    else:
+                        logger.warning(f"Falha ao processar seção {i} de {input_file.name}")
+
+                    # Short pause between sections to avoid rate limits
+                    if i < len(sections):
+                        time.sleep(1)
+
+            if not written:
+                partial_file.unlink()
+                logger.error(f"Nenhuma seção processada com sucesso para {input_file.name}")
+                return False
+
+            # Publish under the final name only now that processing finished.
+            partial_file.replace(output_file)
+            logger.info(f"✓ Arquivo formatado salvo em: {output_file}")
+            return True
+
+        except Exception as e:
+            # Per-file boundary: one bad file must not stop the whole batch.
+            logger.error(f"Erro ao processar {input_file}: {str(e)}")
+            return False
+
+def format_markdown_files(input_dir: str, output_dir: str) -> None:
+    """Format every ``*.md`` file found directly inside ``input_dir``.
+
+    The output directory is created when missing. Files are processed one at
+    a time with a single ``ContentFormatter``, and a summary with the number
+    of successfully formatted files is logged at the end. Errors are logged,
+    never raised.
+
+    Args:
+        input_dir: Directory containing the Markdown files to format.
+        output_dir: Directory that receives the formatted files.
+    """
+    try:
+        source_dir = Path(input_dir)
+        target_dir = Path(output_dir)
+
+        # Guard: the input must be an existing directory
+        if not (source_dir.exists() and source_dir.is_dir()):
+            logger.error(f"Diretório de entrada não encontrado: {input_dir}")
+            return
+
+        target_dir.mkdir(parents=True, exist_ok=True)
+
+        # Guard: nothing to do without .md files
+        pending = list(source_dir.glob('*.md'))
+        total = len(pending)
+        if total == 0:
+            logger.warning(f"Nenhum arquivo .md encontrado em: {input_dir}")
+            return
+
+        logger.info(f"Encontrados {total} arquivos para formatar")
+
+        # One formatter instance handles the files sequentially
+        formatter = ContentFormatter()
+        done = sum(
+            1 for md_file in pending if formatter.format_file(md_file, target_dir)
+        )
+
+        logger.info(f"Processamento concluído: {done}/{total} arquivos formatados")
+
+    except Exception as error:
+        logger.error(f"Erro ao processar diretório: {str(error)}")
+
+# Script entry point: format the Markdown files from INPUT_DIR into OUTPUT_DIR,
+# printing a start and an end message around the run.
+if __name__ == '__main__':
+    print("Iniciando formatação dos arquivos Markdown...")
+    format_markdown_files(INPUT_DIR, OUTPUT_DIR)
+    print("Processo concluído!")