scripts/scraper/batch_scraper_v2_batch4.py

"""
batch_scraper_v2_batch4.py - Scraper Batch 4 Otimizado
MELHORIAS v2:
- Respeita max_depth configurado (sem overrides)
- Suporte Playwright stealth para anti-bot
- Profundidade nível 4 funcional
- Filtros aplicados APÓS scraping
- Melhor gestão timeouts e retries
Author: Descomplicar® Crescimento Digital
Link: https://descomplicar.pt
Copyright: 2025 Descomplicar®
"""
import os
import json
import logging
import argparse
import time
from pathlib import Path
from typing import List, Dict, Optional
from datetime import datetime
from scraper import Scraper, ScraperConfig
from reddit_scraper import RedditScraper


class BatchScraperV2:
    """Batch scraper v2 - optimized for Batch 4."""

    def __init__(self, config_file: str = "ctf_config_batch4.json"):
        """
        Initialize the v2 batch scraper.

        Args:
            config_file: Path to the JSON configuration file
        """
        self.config_file = config_file
        self.config = self.load_config()

        # Set up output directories and logging
        self.setup_directories()
        self.setup_logging()

        self.results = {
            "started_at": datetime.now().isoformat(),
            "config_file": config_file,
            "batch_version": "v2",
            "total_sites": 0,
            "successful": 0,
            "failed": 0,
            "total_pages": 0,
            "sites": []
        }

    def load_config(self) -> Dict:
        """Load the JSON configuration."""
        try:
            with open(self.config_file, 'r', encoding='utf-8') as f:
                config = json.load(f)
            print(f"[INFO] Config loaded: {self.config_file}")
            print(f"[INFO] Batch: {config.get('client', 'Unknown')}")
            print(f"[INFO] Sites: {len(config.get('sites', []))}")
            return config
        except FileNotFoundError:
            print(f"[ERROR] Config not found: {self.config_file}")
            raise
        except json.JSONDecodeError as e:
            print(f"[ERROR] Invalid JSON: {e}")
            raise
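
    # Illustrative shape of the JSON config this script reads - a sketch inferred
    # from the keys accessed throughout this file; the exact field names in the
    # real ctf_config_batch4.json may differ:
    #
    # {
    #   "client": "Batch 4",
    #   "output_base_dir": ".",
    #   "output_dirs": {"raw": "output_md_batch4", "cleaned": "output_cleaned_batch4",
    #                   "formatted": "formatted_batch4", "logs": "logs"},
    #   "scraper_settings": {"request_timeout": 120, "max_retries": 3,
    #                        "politeness_delay": [4, 10], "excluded_patterns": [],
    #                        "use_playwright": true, "headless": true},
    #   "sites": [{"name": "...", "url": "https://...", "type": "website", "phase": "1A",
    #              "max_depth": 4, "priority": "high", "requires_javascript": false,
    #              "anti_bot_protection": false, "estimated_pages": 100}],
    #   "reddit_subreddits": []
    # }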

    def setup_directories(self):
        """Set up the output directories."""
        base_dir = self.config.get('output_base_dir', '.')
        output_dirs = self.config.get('output_dirs', {
            'raw': 'output_md_batch4',
            'cleaned': 'output_cleaned_batch4',
            'formatted': 'formatted_batch4',
            'logs': 'logs'
        })

        self.base_path = Path(base_dir)
        self.raw_dir = self.base_path / output_dirs['raw']
        self.cleaned_dir = self.base_path / output_dirs['cleaned']
        self.formatted_dir = self.base_path / output_dirs['formatted']
        self.logs_dir = self.base_path / output_dirs['logs']

        # Create the directories
        for directory in [self.raw_dir, self.cleaned_dir, self.formatted_dir, self.logs_dir]:
            directory.mkdir(parents=True, exist_ok=True)

        print(f"[INFO] Base: {self.base_path}")
        print(f"[INFO] Output: {self.raw_dir}")
        print(f"[INFO] Logs: {self.logs_dir}")

    def setup_logging(self):
        """Set up logging to file and console."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_file = self.logs_dir / f'batch4_execution_{timestamp}.log'

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file, encoding='utf-8'),
                logging.StreamHandler()
            ],
            force=True
        )
        logging.info("=== BATCH 4 SCRAPER V2 STARTED ===")
        logging.info(f"Log file: {log_file}")
        logging.info(f"Config: {self.config_file}")

    def scrape_websites(self,
                        phase: Optional[str] = None,
                        site_names: Optional[List[str]] = None,
                        skip_anti_bot: bool = False):
        """
        Scrape the configured websites.

        Args:
            phase: Filter by phase (e.g. "1A", "2", "3")
            site_names: List of specific site names to run
            skip_anti_bot: Skip sites with anti-bot protection (for testing)
        """
        sites = self.config.get('sites', [])

        # Filter sites
        if phase:
            # Assumes each site entry in the config carries a 'phase' key
            sites = [s for s in sites if s.get('phase') == phase]
            logging.info(f"Phase {phase}: {len(sites)} sites")
        if site_names:
            sites = [s for s in sites if s.get('name') in site_names]
        if skip_anti_bot:
            sites = [s for s in sites if not s.get('anti_bot_protection', False)]
            logging.info(f"Skip anti-bot: {len(sites)} sites without protection")

        if not sites:
            logging.warning("No sites to process")
            return

        logging.info(f"Processing {len(sites)} sites...")
        self.results['total_sites'] = len(sites)

        for idx, site in enumerate(sites, 1):
            site_name = site.get('name', 'Unknown')
            logging.info(f"\n{'='*60}")
            logging.info(f"Site {idx}/{len(sites)}: {site_name}")
            logging.info(f"{'='*60}")

            try:
                self._scrape_single_site(site)
            except Exception as e:
                logging.error(f"CRITICAL ERROR in {site_name}: {e}", exc_info=True)
                self.results['failed'] += 1
                self.results['sites'].append({
                    "name": site_name,
                    "url": site.get('url'),
                    "status": "failed",
                    "error": str(e)
                })

            # Pause between sites (politeness)
            if idx < len(sites):
                pause = 10
                logging.info(f"Pausing {pause}s before the next site...")
                time.sleep(pause)

        # Save the results
        self._save_results()
        logging.info(f"\n{'='*60}")
        logging.info("BATCH 4 FINISHED")
        logging.info(f"Succeeded: {self.results['successful']}/{self.results['total_sites']}")
        logging.info(f"Failed: {self.results['failed']}")
        logging.info(f"Total pages: {self.results['total_pages']}")
        logging.info(f"{'='*60}")

    def _scrape_single_site(self, site: Dict):
        """Process a single site."""
        name = site.get('name', 'Unknown')
        url = site.get('url')
        site_type = site.get('type', 'website')

        # IMPORTANT: respect max_depth from the config (no override!)
        max_depth = site.get('max_depth', 4)

        # Per-site settings
        priority = site.get('priority', 'medium')
        requires_js = site.get('requires_javascript', False)
        anti_bot = site.get('anti_bot_protection', False)
        estimated_pages = site.get('estimated_pages', 100)

        logging.info(f"URL: {url}")
        logging.info(f"Type: {site_type}")
        logging.info(f"Max depth: {max_depth} (from config - no override)")
        logging.info(f"Priority: {priority}")
        logging.info(f"Requires JavaScript: {'yes' if requires_js else 'no'}")
        logging.info(f"Anti-bot: {'yes' if anti_bot else 'no'}")
        logging.info(f"Estimated pages: {estimated_pages}")

        # General scraper settings
        scraper_settings = self.config.get('scraper_settings', {})

        # Build the scraper configuration
        config = ScraperConfig(
            max_depth=max_depth,  # respect the config!
            request_timeout=scraper_settings.get('request_timeout', 120),
            max_retries=scraper_settings.get('max_retries', 3),
            politeness_delay=tuple(scraper_settings.get('politeness_delay', [4, 10])),
            output_dir=str(self.raw_dir),
            excluded_patterns=scraper_settings.get('excluded_patterns', []),
            save_metadata=True,
            clean_output=True,
            use_playwright=scraper_settings.get('use_playwright', True),
            headless=scraper_settings.get('headless', True)
        )
        # NOTE: content filters are applied AFTER scraping (during extraction);
        # nothing is filtered out while crawling, to stay on the safe side.

        logging.info(f"Timeout: {config.request_timeout}s")
        logging.info(f"Retries: {config.max_retries}")
        logging.info(f"Politeness: {config.politeness_delay[0]}-{config.politeness_delay[1]}s")
        logging.info(f"Playwright: {'yes' if config.use_playwright else 'no'}")

        # Run the scraper
        scraper = Scraper(config)
        try:
            start_time = time.time()
            scraper.crawl(url)
            duration = time.time() - start_time

            pages_scraped = len(scraper.visited)
            pages_failed = len(scraper.failed_urls)

            self.results['successful'] += 1
            self.results['total_pages'] += pages_scraped
            self.results['sites'].append({
                "name": name,
                "url": url,
                "type": site_type,
                "max_depth": max_depth,
                "status": "success",
                "pages_scraped": pages_scraped,
                "pages_failed": pages_failed,
                "duration_seconds": round(duration, 2),
                "estimated_pages": estimated_pages,
                "efficiency": round((pages_scraped / estimated_pages * 100), 2) if estimated_pages > 0 else 0
            })

            logging.info(f"{name} DONE")
            logging.info(f"Pages extracted: {pages_scraped}")
            logging.info(f"Pages failed: {pages_failed}")
            logging.info(f"Duration: {duration:.2f}s ({duration/60:.2f}min)")
            if estimated_pages > 0:
                logging.info(f"Efficiency: {pages_scraped/estimated_pages*100:.1f}% of the estimate")
        except Exception as e:
            logging.error(f"❌ Error in {name}: {e}", exc_info=True)
            # Re-raise so the caller (scrape_websites) records the failure once;
            # recording it here as well would double-count it.
            raise

    def _save_results(self):
        """Save the batch results to a JSON file."""
        self.results['completed_at'] = datetime.now().isoformat()
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_file = self.logs_dir / f'batch4_results_{timestamp}.json'

        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, ensure_ascii=False)
        logging.info(f"Results saved: {results_file}")

    def scrape_reddit(self):
        """Scrape subreddits (handled separately)."""
        subreddits = self.config.get('reddit_subreddits', [])
        if not subreddits:
            logging.info("No subreddits configured")
            return
        logging.info(f"Processing {len(subreddits)} subreddits...")
        # TODO: implement if needed (RedditScraper is imported above for this purpose)
        logging.warning("Reddit scraping is not implemented in this version")


def main():
    """Entry point."""
    parser = argparse.ArgumentParser(
        description='CTF Batch 4 Scraper V2 - optimized for depth level 4 crawling'
    )
    parser.add_argument(
        '--config',
        default='ctf_config_batch4.json',
        help='Configuration file (default: ctf_config_batch4.json)'
    )
    parser.add_argument(
        '--phase',
        help='Run only one phase (e.g. 1A, 2, 3)'
    )
    parser.add_argument(
        '--sites',
        nargs='+',
        help='Specific site names to run'
    )
    parser.add_argument(
        '--skip-anti-bot',
        action='store_true',
        help='Skip sites with anti-bot protection'
    )
    args = parser.parse_args()

    print("=" * 60)
    print("CTF BATCH 4 SCRAPER V2")
    print("Descomplicar® Crescimento Digital")
    print("=" * 60)
    print()

    scraper = BatchScraperV2(config_file=args.config)
    scraper.scrape_websites(
        phase=args.phase,
        site_names=args.sites,
        skip_anti_bot=args.skip_anti_bot
    )


if __name__ == '__main__':
    main()
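
# Example invocations - a sketch; it assumes ctf_config_batch4.json sits in the
# working directory and that the scraper / reddit_scraper modules are importable:
#
#   python batch_scraper_v2_batch4.py
#   python batch_scraper_v2_batch4.py --config ctf_config_batch4.json --skip-anti-bot
#   python batch_scraper_v2_batch4.py --phase 1A
#   python batch_scraper_v2_batch4.py --sites "Site A" "Site B"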