""" batch_scraper_v2_batch4.py - Scraper Batch 4 Otimizado MELHORIAS v2: - Respeita max_depth configurado (sem overrides) - Suporte Playwright stealth para anti-bot - Profundidade nível 4 funcional - Filtros aplicados APÓS scraping - Melhor gestão timeouts e retries Author: Descomplicar® Crescimento Digital Link: https://descomplicar.pt Copyright: 2025 Descomplicar® """ import os import json import logging import argparse import time from pathlib import Path from typing import List, Dict, Optional from datetime import datetime from scraper import Scraper, ScraperConfig from reddit_scraper import RedditScraper class BatchScraperV2: """Batch Scraper v2 - Otimizado para Batch 4.""" def __init__(self, config_file: str = "ctf_config_batch4.json"): """ Inicializa batch scraper v2. Args: config_file: Caminho configuração JSON """ self.config_file = config_file self.config = self.load_config() # Configurar diretórios self.setup_directories() # Configurar logging self.setup_logging() self.results = { "started_at": datetime.now().isoformat(), "config_file": config_file, "batch_version": "v2", "total_sites": 0, "successful": 0, "failed": 0, "total_pages": 0, "sites": [] } def load_config(self) -> Dict: """Carrega configuração JSON.""" try: with open(self.config_file, 'r', encoding='utf-8') as f: config = json.load(f) print(f"[INFO] Config carregada: {self.config_file}") print(f"[INFO] Batch: {config.get('client', 'Unknown')}") print(f"[INFO] Sites: {len(config.get('sites', []))}") return config except FileNotFoundError: print(f"[ERROR] Config não encontrada: {self.config_file}") raise except json.JSONDecodeError as e: print(f"[ERROR] JSON inválido: {e}") raise def setup_directories(self): """Configura diretórios output.""" base_dir = self.config.get('output_base_dir', '.') output_dirs = self.config.get('output_dirs', { 'raw': 'output_md_batch4', 'cleaned': 'output_cleaned_batch4', 'formatted': 'formatted_batch4', 'logs': 'logs' }) self.base_path = Path(base_dir) self.raw_dir = self.base_path / output_dirs['raw'] self.cleaned_dir = self.base_path / output_dirs['cleaned'] self.formatted_dir = self.base_path / output_dirs['formatted'] self.logs_dir = self.base_path / output_dirs['logs'] # Criar diretórios for directory in [self.raw_dir, self.cleaned_dir, self.formatted_dir, self.logs_dir]: directory.mkdir(parents=True, exist_ok=True) print(f"[INFO] Base: {self.base_path}") print(f"[INFO] Output: {self.raw_dir}") print(f"[INFO] Logs: {self.logs_dir}") def setup_logging(self): """Configura logging.""" timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") log_file = self.logs_dir / f'batch4_execution_{timestamp}.log' logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler(log_file, encoding='utf-8'), logging.StreamHandler() ], force=True ) logging.info(f"=== BATCH 4 SCRAPER V2 INICIADO ===") logging.info(f"Log file: {log_file}") logging.info(f"Config: {self.config_file}") def scrape_websites(self, phase: Optional[str] = None, site_names: Optional[List[str]] = None, skip_anti_bot: bool = False): """ Scrape websites. 
    def setup_directories(self):
        """Create the output directories."""
        base_dir = self.config.get('output_base_dir', '.')
        output_dirs = self.config.get('output_dirs', {
            'raw': 'output_md_batch4',
            'cleaned': 'output_cleaned_batch4',
            'formatted': 'formatted_batch4',
            'logs': 'logs'
        })

        self.base_path = Path(base_dir)
        self.raw_dir = self.base_path / output_dirs['raw']
        self.cleaned_dir = self.base_path / output_dirs['cleaned']
        self.formatted_dir = self.base_path / output_dirs['formatted']
        self.logs_dir = self.base_path / output_dirs['logs']

        # Create directories
        for directory in [self.raw_dir, self.cleaned_dir, self.formatted_dir, self.logs_dir]:
            directory.mkdir(parents=True, exist_ok=True)

        print(f"[INFO] Base: {self.base_path}")
        print(f"[INFO] Output: {self.raw_dir}")
        print(f"[INFO] Logs: {self.logs_dir}")

    def setup_logging(self):
        """Configure logging."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_file = self.logs_dir / f'batch4_execution_{timestamp}.log'

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file, encoding='utf-8'),
                logging.StreamHandler()
            ],
            force=True
        )

        logging.info("=== BATCH 4 SCRAPER V2 STARTED ===")
        logging.info(f"Log file: {log_file}")
        logging.info(f"Config: {self.config_file}")

    def scrape_websites(self, phase: Optional[str] = None,
                        site_names: Optional[List[str]] = None,
                        skip_anti_bot: bool = False):
        """
        Scrape websites.

        Args:
            phase: Filter by phase (e.g. "1A", "2", "3")
            site_names: List of specific site names
            skip_anti_bot: Skip sites with anti-bot protection (for testing)
        """
        sites = self.config.get('sites', [])

        # Filter sites
        if phase:
            # Assumes each site entry carries a 'phase' key in the config
            sites = [s for s in sites if str(s.get('phase', '')) == phase]
            logging.info(f"Phase {phase}: {len(sites)} sites")

        if site_names:
            sites = [s for s in sites if s.get('name') in site_names]

        if skip_anti_bot:
            sites = [s for s in sites if not s.get('anti_bot_protection', False)]
            logging.info(f"Skip anti-bot: {len(sites)} sites without protection")

        if not sites:
            logging.warning("No sites to process")
            return

        logging.info(f"Processing {len(sites)} sites...")
        self.results['total_sites'] = len(sites)

        for idx, site in enumerate(sites, 1):
            site_name = site.get('name', 'Unknown')

            logging.info(f"\n{'='*60}")
            logging.info(f"Site {idx}/{len(sites)}: {site_name}")
            logging.info(f"{'='*60}")

            try:
                self._scrape_single_site(site)
            except Exception as e:
                logging.error(f"CRITICAL ERROR in {site_name}: {e}", exc_info=True)
                self.results['failed'] += 1
                self.results['sites'].append({
                    "name": site_name,
                    "url": site.get('url'),
                    "status": "failed",
                    "error": str(e)
                })

            # Pause between sites (politeness)
            if idx < len(sites):
                pause = 10
                logging.info(f"Pausing {pause}s before the next site...")
                time.sleep(pause)

        # Save results
        self._save_results()

        logging.info(f"\n{'='*60}")
        logging.info("BATCH 4 COMPLETED")
        logging.info(f"Successful: {self.results['successful']}/{self.results['total_sites']}")
        logging.info(f"Failed: {self.results['failed']}")
        logging.info(f"Total pages: {self.results['total_pages']}")
        logging.info(f"{'='*60}")

    def _scrape_single_site(self, site: Dict):
        """Scrape a single site."""
        name = site.get('name', 'Unknown')
        url = site.get('url')
        site_type = site.get('type', 'website')

        # IMPORTANT: respect max_depth from the config (no override!)
        max_depth = site.get('max_depth', 4)

        # Per-site settings
        priority = site.get('priority', 'medium')
        requires_js = site.get('requires_javascript', False)
        anti_bot = site.get('anti_bot_protection', False)
        estimated_pages = site.get('estimated_pages', 100)

        logging.info(f"URL: {url}")
        logging.info(f"Type: {site_type}")
        logging.info(f"Max depth: {max_depth} (from config - no override)")
        logging.info(f"Priority: {priority}")
        logging.info(f"Requires JavaScript: {'Yes' if requires_js else 'No'}")
        logging.info(f"Anti-bot: {'Yes' if anti_bot else 'No'}")
        logging.info(f"Estimated pages: {estimated_pages}")

        # Global scraper settings
        scraper_settings = self.config.get('scraper_settings', {})

        # Build the scraper configuration
        config = ScraperConfig(
            max_depth=max_depth,  # respect the config!
            request_timeout=scraper_settings.get('request_timeout', 120),
            max_retries=scraper_settings.get('max_retries', 3),
            politeness_delay=tuple(scraper_settings.get('politeness_delay', [4, 10])),
            output_dir=str(self.raw_dir),
            excluded_patterns=scraper_settings.get('excluded_patterns', []),
            save_metadata=True,
            clean_output=True,
            use_playwright=scraper_settings.get('use_playwright', True),
            headless=scraper_settings.get('headless', True)
        )

        # NOTE: content filters are applied AFTER scraping (at extraction time);
        # not filtering during the crawl is the safer option.

        logging.info(f"Timeout: {config.request_timeout}s")
        logging.info(f"Retries: {config.max_retries}")
        logging.info(f"Politeness: {config.politeness_delay[0]}-{config.politeness_delay[1]}s")
        logging.info(f"Playwright: {'Yes' if config.use_playwright else 'No'}")

        # Run the crawl
        scraper = Scraper(config)

        try:
            start_time = time.time()
            scraper.crawl(url)
            duration = time.time() - start_time

            pages_scraped = len(scraper.visited)
            pages_failed = len(scraper.failed_urls)
            efficiency = round(pages_scraped / estimated_pages * 100, 2) if estimated_pages > 0 else 0

            self.results['successful'] += 1
            self.results['total_pages'] += pages_scraped
            self.results['sites'].append({
                "name": name,
                "url": url,
                "type": site_type,
                "max_depth": max_depth,
                "status": "success",
                "pages_scraped": pages_scraped,
                "pages_failed": pages_failed,
                "duration_seconds": round(duration, 2),
                "estimated_pages": estimated_pages,
                "efficiency": efficiency
            })

            logging.info(f"✅ {name} COMPLETED")
            logging.info(f"Pages scraped: {pages_scraped}")
            logging.info(f"Pages failed: {pages_failed}")
            logging.info(f"Duration: {duration:.2f}s ({duration/60:.2f}min)")
            logging.info(f"Efficiency: {efficiency:.1f}% of the estimate")

        except Exception as e:
            # Bookkeeping is left to the caller (scrape_websites), which records
            # the failure; re-raise so it is counted exactly once.
            logging.error(f"❌ Error in {name}: {e}", exc_info=True)
            raise

    def _save_results(self):
        """Save the batch results."""
        self.results['completed_at'] = datetime.now().isoformat()

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_file = self.logs_dir / f'batch4_results_{timestamp}.json'

        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, ensure_ascii=False)

        logging.info(f"Results saved: {results_file}")
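    # Illustrative only: shape of the batch4_results_<timestamp>.json file written
    # by _save_results(), assembled from self.results above (values are examples):
    #
    # {
    #   "started_at": "2025-01-01T10:00:00",
    #   "config_file": "ctf_config_batch4.json",
    #   "batch_version": "v2",
    #   "total_sites": 2, "successful": 1, "failed": 1, "total_pages": 85,
    #   "sites": [
    #     {"name": "Example Site", "url": "https://example.com", "type": "website",
    #      "max_depth": 4, "status": "success", "pages_scraped": 85, "pages_failed": 3,
    #      "duration_seconds": 412.5, "estimated_pages": 100, "efficiency": 85.0},
    #     {"name": "Other Site", "url": "https://other.example", "status": "failed",
    #      "error": "..."}
    #   ],
    #   "completed_at": "2025-01-01T11:00:00"
    # }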
    def scrape_reddit(self):
        """Scrape subreddits (separate flow)."""
        subreddits = self.config.get('reddit_subreddits', [])

        if not subreddits:
            logging.info("No subreddits configured")
            return

        logging.info(f"Processing {len(subreddits)} subreddits...")

        # TODO: implement with RedditScraper if needed
        logging.warning("Reddit scraping not implemented in this version")


def main():
    """Entry point."""
    parser = argparse.ArgumentParser(
        description='CTF Batch 4 Scraper V2 - optimised for depth level 4'
    )
    parser.add_argument(
        '--config',
        default='ctf_config_batch4.json',
        help='Configuration file (default: ctf_config_batch4.json)'
    )
    parser.add_argument(
        '--phase',
        help='Run a single phase only (e.g. 1A, 2, 3)'
    )
    parser.add_argument(
        '--sites',
        nargs='+',
        help='Specific site names'
    )
    parser.add_argument(
        '--skip-anti-bot',
        action='store_true',
        help='Skip sites with anti-bot protection'
    )

    args = parser.parse_args()

    print("="*60)
    print("CTF BATCH 4 SCRAPER V2")
    print("Descomplicar® Crescimento Digital")
    print("="*60)
    print()

    scraper = BatchScraperV2(config_file=args.config)
    scraper.scrape_websites(
        phase=args.phase,
        site_names=args.sites,
        skip_anti_bot=args.skip_anti_bot
    )


if __name__ == '__main__':
    main()
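
# Example invocations (site names and phase values are placeholders that must
# match entries in the JSON config):
#
#   python batch_scraper_v2_batch4.py
#   python batch_scraper_v2_batch4.py --config ctf_config_batch4.json --phase 1A
#   python batch_scraper_v2_batch4.py --sites "Example Site" "Other Site" --skip-anti-bot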