init: scripts diversos (crawlers, conversores, scrapers)

2026-03-05 20:38:36 +00:00
commit 6ac6f4be2a
925 changed files with 850330 additions and 0 deletions
@@ -0,0 +1,242 @@
+"""
+clean_md.py
+
+Author: Descomplicar® Crescimento Digital
+Link: https://descomplicar.pt
+Copyright: 2025 Descomplicar®
+"""
+
+import os
+import re
+import logging
+from typing import List, Tuple, Optional
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+# Configure root logging once for the whole script: INFO level, timestamped records.
+logging.basicConfig(
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    level=logging.INFO,
+)
+# Module-level logger shared by every function in this file.
+logger = logging.getLogger(__name__)
+
+class MarkdownCleaner:
+    """Remove site chrome and scraping noise from Markdown text.
+
+    ``patterns`` is an ordered list of ``(regex, replacement)`` pairs.
+    ``clean_content`` applies them one after another with ``re.sub`` and
+    then filters the remaining text line by line.
+    """
+
+    def __init__(self):
+        # Removal patterns, applied from top to bottom.
+        self.patterns = [
+            # Navigation and header items (whole line, case-insensitive prefix match)
+            (r'(?i)(?m)^(?:Skip to|ADVERTISE|CONTACT|SUBSCRIBE|Search|STORIES|RESOURCES|TOPICS|EVENTS|LATEST|TOP|PODCASTS|ABOUT|SHOP|MEMBERSHIPS|PROGRAMS|BLOG|HIRE|THINK TANK|UNDERGROUND|ACCELERATOR|SPEAKING|LOG IN).*$', ''),
+
+            # HTML comments
+            (r'(?s)<!--.*?-->', ''),
+
+            # Scripts and styles
+            (r'(?s)<script\b[^>]*>.*?</script>', ''),
+            (r'(?s)<style\b[^>]*>.*?</style>', ''),
+
+            # Specific HTML elements together with their content
+            (r'(?s)<(?:nav|header|footer|aside|menu)\b[^>]*>.*?</(?:nav|header|footer|aside|menu)>', ''),
+
+            # Divs whose class/id contains one of the listed keywords.
+            # NOTE(review): the keyword is matched as a substring, so "ad"
+            # also hits values such as "heading" - confirm this is intended.
+            (r'(?s)<div\b[^>]*(?:class|id)=(["\'])(?:(?!\1).)*(?:ad|banner|cookie|modal|popup|sidebar|footer|header|nav|menu)(?:(?!\1).)*\1[^>]*>.*?</div>', ''),
+
+            # Front matter and metadata
+            (r'(?s)^---\n.*?\n---\n', ''),
+            # MULTILINE is required here: without it ^/$ anchor to the whole
+            # document and the "# <url>" line is never matched.
+            (r'(?im)^#\s*https?://[^\n]+$', ''),
+
+            # Common website section headings
+            (r'(?im)^#+\s*(?:Menu|Navigation|Publicidade|Cookies|Newsletter|Social|Footer|Anúncio|Patrocinado|Relacionados|Compartilhar|Comentários|Share|Follow|Subscribe|Contact|About|Search|Login|Sign|Register).*$', ''),
+
+            # Links: keep only the link text
+            (r'\[([^\]]+)\]\([^)]+\)', r'\1'),
+
+            # Common UI call-to-action lines
+            (r'(?i)(?m)^(?:Click|Tap|Swipe|Read more|Learn more|View|Download|Get started|Sign up|Log in|Register|Subscribe).*$', ''),
+
+            # Boilerplate keywords: drop from the keyword to the end of the
+            # line. The newline itself is kept so the text before the
+            # keyword is not merged into the following line.
+            (r'(?i)(?:advertisement|sponsored|cookies|privacy policy|terms of service|all rights reserved)[^\n]*', ''),
+
+            # Whitespace and formatting clean-up
+            (r'(?m)^\s+$', ''),
+            (r'(?m)[ \t]+$', ''),
+            (r'\n{3,}', '\n\n'),
+            (r'(?m)^[-_*]{3,}$', ''),  # Horizontal rules
+
+            # Bare URLs
+            (r'(?i)https?://\S+', ''),
+
+            # Date/time lines
+            (r'(?i)(?m)^\d{1,2}[-/]\d{1,2}[-/]\d{2,4}.*$', ''),
+            (r'(?i)(?m)^(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}.*$', '')
+        ]
+
+    def clean_content(self, content: str) -> str:
+        """Clean ``content`` using every configured pattern.
+
+        Args:
+            content: Raw Markdown text, possibly containing embedded HTML.
+
+        Returns:
+            The surviving lines, each stripped and separated from the next
+            by one blank line. Returns ``""`` when ``content`` is empty or
+            whitespace only.
+        """
+        if not content or not content.strip():
+            return ""
+
+        cleaned = content
+
+        # Apply every pattern in order; a failing pattern is logged and skipped.
+        for pattern, replacement in self.patterns:
+            try:
+                cleaned = re.sub(pattern, replacement, cleaned)
+            except Exception as e:
+                logger.error(f"Erro ao aplicar padrão {pattern}: {str(e)}")
+                continue
+
+        # Line-level filter. Blank lines are dropped here and re-created by
+        # the final join, so no separate blank-line collapsing is needed.
+        kept_lines = []
+        for raw_line in cleaned.splitlines():
+            line = raw_line.strip()
+            # Skip empty or very short lines
+            if len(line) < 3:
+                continue
+            # Skip lines without any letter (pure numbers, symbols, rules)
+            if not any(ch.isalpha() for ch in line):
+                continue
+            # Skip lines that consist of a URL and nothing else. Lines that
+            # merely start with "#", "-", ">" or a digit are kept so that
+            # headings, lists and quotes survive.
+            if re.fullmatch(r'(?:https?://|www\.)\S+', line):
+                continue
+            kept_lines.append(line)
+
+        return '\n\n'.join(kept_lines)
+
+def process_file(input_file: Path, output_dir: Path, cleaner: MarkdownCleaner) -> bool:
+    """Clean a single Markdown file and store the result in ``output_dir``.
+
+    The output keeps the input's file name. The file is read as UTF-8,
+    falling back to Latin-1 when decoding fails, and is always written
+    as UTF-8.
+
+    Args:
+        input_file: Markdown file to read.
+        output_dir: Directory that receives the cleaned copy.
+        cleaner: Object whose ``clean_content`` does the actual cleaning.
+
+    Returns:
+        ``True`` when a cleaned file was written; ``False`` when the input
+        is empty, the cleaned text is empty, or any exception occurred
+        (the exception is logged, not raised).
+    """
+    try:
+        # Size on disk, used for the empty check and the final statistics
+        original_size = input_file.stat().st_size
+        if original_size == 0:
+            logger.warning(f"Arquivo vazio ignorado: {input_file}")
+            return False
+
+        logger.info(f"Processando: {input_file} ({original_size/1024/1024:.1f} MB)")
+
+        # Decode as UTF-8 first; Latin-1 accepts any byte sequence
+        try:
+            content = input_file.read_text(encoding='utf-8')
+        except UnicodeDecodeError:
+            content = input_file.read_text(encoding='latin-1')
+
+        # Nothing but whitespace: skip
+        if not (content and content.strip()):
+            logger.warning(f"Arquivo com conteúdo vazio ignorado: {input_file}")
+            return False
+
+        cleaned_content = cleaner.clean_content(content)
+
+        # Cleaning removed everything: skip
+        if not (cleaned_content and cleaned_content.strip()):
+            logger.warning(f"Resultado vazio após limpeza: {input_file}")
+            return False
+
+        # Warn when the character count shrank by less than 10%
+        reduction = ((len(content) - len(cleaned_content)) / len(content)) * 100
+        if reduction < 10:
+            logger.warning(f"Aviso: Redução menor que 10% para {input_file}")
+
+        # Persist the cleaned text under the same file name
+        output_file = output_dir / input_file.name
+        output_file.write_text(cleaned_content, encoding='utf-8')
+
+        # Report the on-disk size reduction
+        final_size = output_file.stat().st_size
+        size_reduction = ((original_size - final_size) / original_size) * 100
+
+        logger.info(f"✓ Arquivo limpo: {output_file} ({final_size/1024/1024:.1f} MB, redução de {size_reduction:.1f}%)")
+        return True
+
+    except Exception as e:
+        logger.error(f"Erro ao processar {input_file}: {str(e)}")
+        return False
+
+def clean_markdown_files(input_dir: str, output_dir: str) -> Tuple[int, int]:
+    """
+    Clean every ``*.md`` file found directly inside a directory.
+
+    Files are handed to ``process_file`` through a thread pool, all sharing
+    one ``MarkdownCleaner`` instance. Sub-directories are not searched.
+
+    Args:
+        input_dir: Directory containing the Markdown files
+        output_dir: Directory for the processed files (created if missing)
+
+    Returns:
+        Tuple[int, int]: (files processed successfully, total files found);
+        ``(0, 0)`` when the input directory is missing, holds no ``.md``
+        files, or an exception occurred.
+    """
+    try:
+        input_path = Path(input_dir)
+        output_path = Path(output_dir)
+
+        # The input must be an existing directory
+        if not (input_path.exists() and input_path.is_dir()):
+            logger.error(f"Diretório de entrada não encontrado: {input_dir}")
+            return (0, 0)
+
+        # Make sure the destination exists
+        output_path.mkdir(parents=True, exist_ok=True)
+
+        # Collect the top-level .md files
+        markdown_files = list(input_path.glob('*.md'))
+        total_files = len(markdown_files)
+
+        if not total_files:
+            logger.warning(f"Nenhum arquivo .md encontrado em: {input_dir}")
+            return (0, 0)
+
+        logger.info(f"Encontrados {total_files} arquivos .md para processar")
+
+        cleaner = MarkdownCleaner()
+
+        # Fan the files out to worker threads and count the truthy results
+        with ThreadPoolExecutor() as executor:
+            pending = [
+                executor.submit(process_file, md_file, output_path, cleaner)
+                for md_file in markdown_files
+            ]
+            successful = sum(1 for done in as_completed(pending) if done.result())
+
+        logger.info(f"Processamento concluído: {successful}/{total_files} arquivos processados com sucesso")
+        return (successful, total_files)
+
+    except Exception as e:
+        logger.error(f"Erro ao processar diretório {input_dir}: {str(e)}")
+        return (0, 0)
+
+if __name__ == '__main__':
+    import sys
+
+    # Exactly two positional arguments are expected: input and output directory
+    if len(sys.argv) != 3:
+        print("Uso: python clean_md.py <diretorio_entrada> <diretorio_saida>")
+        sys.exit(1)
+
+    _, input_dir, output_dir = sys.argv
+
+    successful, total = clean_markdown_files(input_dir, output_dir)
+
+    # Exit status is 0 only when every file found was processed
+    if successful == 0:
+        message, exit_code = "Erro: Nenhum arquivo foi processado com sucesso", 1
+    elif successful < total:
+        message, exit_code = f"Aviso: Apenas {successful} de {total} arquivos foram processados com sucesso", 1
+    else:
+        message, exit_code = f"Sucesso: Todos os {total} arquivos foram processados", 0
+
+    print(message)
+    sys.exit(exit_code)