"""
batch_scraper_v2_batch4.py - Scraper Batch 4 Otimizado

MELHORIAS v2:
- Respeita max_depth configurado (sem overrides)
- Suporte Playwright stealth para anti-bot
- Profundidade nível 4 funcional
- Filtros aplicados APÓS scraping
- Melhor gestão timeouts e retries

Author: Descomplicar® Crescimento Digital
Link: https://descomplicar.pt
Copyright: 2025 Descomplicar®
"""
import os
import json
import logging
import argparse
import time
from pathlib import Path
from typing import List, Dict, Optional
from datetime import datetime

from scraper import Scraper, ScraperConfig
from reddit_scraper import RedditScraper
class BatchScraperV2:
    """Batch Scraper v2 - optimized for Batch 4.

    Loads a JSON config, prepares the output directory tree and logging,
    then crawls each configured site with the project `Scraper`, collecting
    per-site and aggregate statistics in ``self.results``.
    """

    def __init__(self, config_file: str = "ctf_config_batch4.json"):
        """Initialize the batch scraper.

        Args:
            config_file: Path to the JSON configuration file.
        """
        self.config_file = config_file
        self.config = self.load_config()

        # Directories first: the log file created by setup_logging()
        # lives inside self.logs_dir.
        self.setup_directories()
        self.setup_logging()

        # Aggregate run statistics, persisted by _save_results().
        self.results = {
            "started_at": datetime.now().isoformat(),
            "config_file": config_file,
            "batch_version": "v2",
            "total_sites": 0,
            "successful": 0,
            "failed": 0,
            "total_pages": 0,
            "sites": []
        }

    def load_config(self) -> Dict:
        """Load and return the JSON configuration.

        Raises:
            FileNotFoundError: if the config file does not exist.
            json.JSONDecodeError: if the file is not valid JSON.
        """
        try:
            with open(self.config_file, 'r', encoding='utf-8') as f:
                config = json.load(f)
            print(f"[INFO] Config carregada: {self.config_file}")
            print(f"[INFO] Batch: {config.get('client', 'Unknown')}")
            print(f"[INFO] Sites: {len(config.get('sites', []))}")
            return config
        except FileNotFoundError:
            print(f"[ERROR] Config não encontrada: {self.config_file}")
            raise
        except json.JSONDecodeError as e:
            print(f"[ERROR] JSON inválido: {e}")
            raise

    def setup_directories(self):
        """Create the output directory tree described by the config.

        Reads ``output_base_dir`` and ``output_dirs`` from the config,
        falling back to the Batch-4 defaults, and creates every directory.
        """
        base_dir = self.config.get('output_base_dir', '.')
        output_dirs = self.config.get('output_dirs', {
            'raw': 'output_md_batch4',
            'cleaned': 'output_cleaned_batch4',
            'formatted': 'formatted_batch4',
            'logs': 'logs'
        })

        self.base_path = Path(base_dir)
        self.raw_dir = self.base_path / output_dirs['raw']
        self.cleaned_dir = self.base_path / output_dirs['cleaned']
        self.formatted_dir = self.base_path / output_dirs['formatted']
        self.logs_dir = self.base_path / output_dirs['logs']

        for directory in (self.raw_dir, self.cleaned_dir,
                          self.formatted_dir, self.logs_dir):
            directory.mkdir(parents=True, exist_ok=True)

        print(f"[INFO] Base: {self.base_path}")
        print(f"[INFO] Output: {self.raw_dir}")
        print(f"[INFO] Logs: {self.logs_dir}")

    def setup_logging(self):
        """Configure root logging to a timestamped file plus the console."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_file = self.logs_dir / f'batch4_execution_{timestamp}.log'

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file, encoding='utf-8'),
                logging.StreamHandler()
            ],
            # Replace any handlers installed by earlier runs/imports.
            force=True
        )

        logging.info("=== BATCH 4 SCRAPER V2 INICIADO ===")
        logging.info(f"Log file: {log_file}")
        logging.info(f"Config: {self.config_file}")

    def scrape_websites(self,
                        phase: Optional[str] = None,
                        site_names: Optional[List[str]] = None,
                        skip_anti_bot: bool = False):
        """Scrape the configured websites.

        Args:
            phase: Only process sites whose ``phase`` matches (e.g. "1A", "2", "3").
            site_names: Only process sites with these names.
            skip_anti_bot: Skip sites flagged ``anti_bot_protection`` (for tests).
        """
        sites = self.config.get('sites', [])

        # FIX: `phase` was documented but never applied before.
        # Assumes each site entry carries a 'phase' key — confirm in config.
        if phase:
            sites = [s for s in sites if s.get('phase') == phase]

        if site_names:
            sites = [s for s in sites if s.get('name') in site_names]

        if skip_anti_bot:
            sites = [s for s in sites if not s.get('anti_bot_protection', False)]
            logging.info(f"Skip anti-bot: {len(sites)} sites sem proteção")

        if not sites:
            logging.warning("Nenhum site para processar")
            return

        logging.info(f"Processando {len(sites)} sites...")
        self.results['total_sites'] = len(sites)

        for idx, site in enumerate(sites, 1):
            site_name = site.get('name', 'Unknown')
            logging.info(f"\n{'='*60}")
            logging.info(f"Site {idx}/{len(sites)}: {site_name}")
            logging.info(f"{'='*60}")

            try:
                self._scrape_single_site(site)
            except Exception as e:
                # Single place where failures are recorded (see
                # _scrape_single_site, which only logs and re-raises).
                logging.error(f"ERRO CRÍTICO em {site_name}: {e}", exc_info=True)
                self.results['failed'] += 1
                self.results['sites'].append({
                    "name": site_name,
                    "url": site.get('url'),
                    "status": "failed",
                    "error": str(e)
                })

            # Politeness pause between sites (not after the last one).
            if idx < len(sites):
                pause = 10
                logging.info(f"Pausa {pause}s antes próximo site...")
                time.sleep(pause)

        self._save_results()

        logging.info(f"\n{'='*60}")
        logging.info("BATCH 4 CONCLUÍDO")
        logging.info(f"Sucesso: {self.results['successful']}/{self.results['total_sites']}")
        logging.info(f"Falhas: {self.results['failed']}")
        logging.info(f"Total páginas: {self.results['total_pages']}")
        logging.info(f"{'='*60}")

    def _scrape_single_site(self, site: Dict):
        """Scrape one site; raises on failure so the caller records it once."""
        name = site.get('name', 'Unknown')
        url = site.get('url')
        site_type = site.get('type', 'website')

        # IMPORTANT: honour max_depth from the config (no override!).
        max_depth = site.get('max_depth', 4)

        priority = site.get('priority', 'medium')
        anti_bot = site.get('anti_bot_protection', False)
        estimated_pages = site.get('estimated_pages', 100)

        logging.info(f"URL: {url}")
        logging.info(f"Tipo: {site_type}")
        logging.info(f"Max Depth: {max_depth} (CONFIGURADO - sem override)")
        logging.info(f"Prioridade: {priority}")
        logging.info(f"Anti-bot: {'Sim' if anti_bot else 'Não'}")
        logging.info(f"Páginas estimadas: {estimated_pages}")

        scraper_settings = self.config.get('scraper_settings', {})

        config = ScraperConfig(
            max_depth=max_depth,  # respect the configured depth
            request_timeout=scraper_settings.get('request_timeout', 120),
            max_retries=scraper_settings.get('max_retries', 3),
            politeness_delay=tuple(scraper_settings.get('politeness_delay', [4, 10])),
            output_dir=str(self.raw_dir),
            excluded_patterns=scraper_settings.get('excluded_patterns', []),
            save_metadata=True,
            clean_output=True,
            use_playwright=scraper_settings.get('use_playwright', True),
            headless=scraper_settings.get('headless', True)
        )

        # NOTE: content filters are applied AFTER scraping (at extraction);
        # nothing is filtered during the crawl itself.
        logging.info(f"Timeout: {config.request_timeout}s")
        logging.info(f"Retries: {config.max_retries}")
        logging.info(f"Politeness: {config.politeness_delay[0]}-{config.politeness_delay[1]}s")
        logging.info(f"Playwright: {'Sim' if config.use_playwright else 'Não'}")

        scraper = Scraper(config)

        try:
            start_time = time.time()
            scraper.crawl(url)
            duration = time.time() - start_time
        except Exception as e:
            # FIX: previously this block recorded the failure in self.results
            # AND re-raised, so scrape_websites() recorded it a second time
            # (double-counted 'failed' and duplicate site entries). Now we
            # only log here; the caller is the single recording point.
            logging.error(f"❌ Erro em {name}: {e}", exc_info=True)
            raise

        pages_scraped = len(scraper.visited)
        pages_failed = len(scraper.failed_urls)
        # Guarded: estimated_pages may legitimately be 0 in the config.
        efficiency = round((pages_scraped / estimated_pages * 100), 2) if estimated_pages > 0 else 0

        self.results['successful'] += 1
        self.results['total_pages'] += pages_scraped

        self.results['sites'].append({
            "name": name,
            "url": url,
            "type": site_type,
            "max_depth": max_depth,
            "status": "success",
            "pages_scraped": pages_scraped,
            "pages_failed": pages_failed,
            "duration_seconds": round(duration, 2),
            "estimated_pages": estimated_pages,
            "efficiency": efficiency
        })

        logging.info(f"✅ {name} CONCLUÍDO")
        logging.info(f"Páginas extraídas: {pages_scraped}")
        logging.info(f"Páginas falhadas: {pages_failed}")
        logging.info(f"Duração: {duration:.2f}s ({duration/60:.2f}min)")
        # FIX: this line previously divided by estimated_pages unconditionally
        # (ZeroDivisionError when 0); reuse the guarded value instead.
        logging.info(f"Eficiência: {efficiency:.1f}% da estimativa")

    def _save_results(self):
        """Persist the batch results as a timestamped JSON file in logs_dir."""
        self.results['completed_at'] = datetime.now().isoformat()

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_file = self.logs_dir / f'batch4_results_{timestamp}.json'

        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2, ensure_ascii=False)

        logging.info(f"Resultados guardados: {results_file}")

    def scrape_reddit(self):
        """Scrape configured subreddits (separate pipeline; not implemented)."""
        subreddits = self.config.get('reddit_subreddits', [])

        if not subreddits:
            logging.info("Sem subreddits configurados")
            return

        logging.info(f"Processando {len(subreddits)} subreddits...")

        # TODO: implement if/when needed.
        logging.warning("Reddit scraping não implementado nesta versão")
|
def main():
    """CLI entry point: parse arguments and run the batch scraper."""
    parser = argparse.ArgumentParser(
        description='CTF Batch 4 Scraper V2 - Otimizado para profundidade nível 4'
    )
    parser.add_argument(
        '--config',
        default='ctf_config_batch4.json',
        help='Ficheiro configuração (default: ctf_config_batch4.json)'
    )
    parser.add_argument(
        '--phase',
        help='Executar apenas uma fase (ex: 1A, 2, 3)'
    )
    parser.add_argument(
        '--sites',
        nargs='+',
        help='Nomes específicos de sites'
    )
    parser.add_argument(
        '--skip-anti-bot',
        action='store_true',
        help='Pular sites com proteção anti-bot'
    )

    args = parser.parse_args()

    print("="*60)
    print("CTF BATCH 4 SCRAPER V2")
    print("Descomplicar® Crescimento Digital")
    print("="*60)
    print()

    scraper = BatchScraperV2(config_file=args.config)
    scraper.scrape_websites(
        phase=args.phase,
        site_names=args.sites,
        skip_anti_bot=args.skip_anti_bot
    )


if __name__ == '__main__':
    main()
|