""" scraper.py Author: Descomplicar® Crescimento Digital Link: https://descomplicar.pt Copyright: 2025 Descomplicar® """ import os import time import logging import random import json from dataclasses import dataclass, asdict from urllib.parse import urljoin, urlparse, urlunparse from multiprocessing import Pool, cpu_count from typing import List, Dict, Optional, Set from pathlib import Path from playwright.sync_api import sync_playwright, TimeoutError, Page import markdownify from dotenv import load_dotenv # Carregar variáveis de ambiente load_dotenv() # Configuração de logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler('scraper.log'), logging.StreamHandler() ] ) @dataclass class ScraperConfig: user_agents: List[str] = None proxies: List[str] = None max_depth: int = 3 request_timeout: int = 30 max_retries: int = 3 backoff_factor: float = 0.5 politeness_delay: tuple = (1, 3) output_dir: str = "output_md" allowed_domains: List[str] = None excluded_patterns: List[str] = None save_metadata: bool = True clean_output: bool = True def to_dict(self) -> Dict: return asdict(self) @classmethod def from_dict(cls, data: Dict) -> 'ScraperConfig': return cls(**data) def save(self, filepath: str): with open(filepath, 'w') as f: json.dump(self.to_dict(), f, indent=2) @classmethod def load(cls, filepath: str) -> 'ScraperConfig': with open(filepath) as f: return cls.from_dict(json.load(f)) class Scraper: def __init__(self, config: ScraperConfig): self.config = config self.visited: Set[str] = set() self.failed_urls: Set[str] = set() self.metadata: Dict = {} self.current_proxy = None self.current_user_agent = None # Criar diretórios necessários self.setup_directories() # Inicializar valores aleatórios self._rotate_user_agent() self._rotate_proxy() def setup_directories(self): """Criar estrutura de diretórios necessária.""" os.makedirs(self.config.output_dir, exist_ok=True) os.makedirs(f"{self.config.output_dir}/metadata", exist_ok=True) os.makedirs(f"{self.config.output_dir}/raw", exist_ok=True) def _rotate_user_agent(self): if self.config.user_agents: self.current_user_agent = random.choice(self.config.user_agents) else: self.current_user_agent = ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/90.0.4430.212 Safari/537.36" ) def _rotate_proxy(self): if self.config.proxies: self.current_proxy = random.choice(self.config.proxies) def _get_browser_config(self): config = { "headless": True, "timeout": self.config.request_timeout * 1000 } if self.current_proxy: config["proxy"] = { "server": self.current_proxy, "username": os.getenv("PROXY_USER"), "password": os.getenv("PROXY_PASS") } return config def _extract_metadata(self, page: Page, url: str) -> Dict: """Extrair metadados relevantes da página.""" metadata = { "url": url, "title": page.title(), "timestamp": time.time(), "headers": {}, "meta_tags": {} } # Extrair headers (h1-h6) for i in range(1, 7): headers = page.query_selector_all(f"h{i}") metadata["headers"][f"h{i}"] = [h.inner_text() for h in headers] # Extrair meta tags meta_tags = page.query_selector_all("meta") for tag in meta_tags: name = tag.get_attribute("name") or tag.get_attribute("property") content = tag.get_attribute("content") if name and content: metadata["meta_tags"][name] = content return metadata def _html_to_markdown(self, html: str, url: str) -> str: try: md = markdownify.markdownify( html, heading_style="ATX", bullets='•◦▪‣⁃', code_language_callback=lambda el: 

    def _clean_markdown(self, content: str) -> str:
        """Clean up and improve the Markdown content."""
        lines = content.split('\n')
        cleaned_lines = []

        for line in lines:
            # Skip consecutive empty lines
            if not line.strip() and cleaned_lines and not cleaned_lines[-1].strip():
                continue
            # Skip lines made of a single repeated character
            if len(set(line.strip())) == 1 and len(line.strip()) > 3:
                continue
            # Skip very short lines that are only punctuation
            if len(line.strip()) < 3 and not any(c.isalnum() for c in line):
                continue
            # Skip loading/javascript placeholder lines
            if "loading" in line.lower() or "javascript" in line.lower():
                continue
            # Skip lines that are only URLs
            if line.strip().startswith('http') and len(line.split()) == 1:
                continue
            # Skip lines that are only numbers or dates
            if line.strip().replace('/', '').replace('-', '').replace('.', '').isdigit():
                continue
            cleaned_lines.append(line)

        content = '\n'.join(cleaned_lines)

        # Collapse runs of spaces/tabs within each line while keeping line breaks
        # (joining the whole document on a single space would flatten the Markdown structure)
        content = '\n'.join(' '.join(line.split()) for line in content.split('\n'))

        # Collapse repeated punctuation characters
        for char in '.,!?-':
            content = content.replace(char + char, char)

        return content

    def _extract_article_content(self, page: Page) -> str:
        """Extract the main article content from the page."""
        # Try to locate the main content container
        selectors = [
            "article",
            "main",
            ".post-content",
            ".entry-content",
            ".article-content",
            "#content",
            ".content"
        ]

        content = None
        for selector in selectors:
            content = page.query_selector(selector)
            if content:
                break

        if not content:
            content = page.query_selector("body")

        if not content:
            return ""

        # Remove unwanted elements
        remove_selectors = [
            "header", "footer", "nav",
            ".sidebar", "#sidebar", ".widget",
            ".comments", ".related-posts", ".social-share",
            "script", "style", ".advertisement",
            ".ad-", "#cookie-notice"
        ]

        for selector in remove_selectors:
            elements = content.query_selector_all(selector)
            for element in elements:
                try:
                    page.evaluate("element => element.remove()", element)
                except Exception:
                    pass

        return content.inner_html()

    def _scrape_page(self, url: str, depth: int) -> Optional[str]:
        if self._should_skip_url(url):
            logging.info(f"URL skipped by filters: {url}")
            return None

        retries = 0
        while retries < self.config.max_retries:
            try:
                logging.info(f"Attempt {retries + 1} for {url}")
                with sync_playwright() as playwright:
                    browser = playwright.chromium.launch(**self._get_browser_config())
                    context = browser.new_context(
                        java_script_enabled=True,
                        ignore_https_errors=True,
                        user_agent=self.current_user_agent
                    )
                    logging.info(f"Browser started for {url}")
                    page = context.new_page()

                    # URL normalization
                    original_url = url
                    parsed = urlparse(url)
                    if not parsed.scheme:
                        url = f"https://{url.lstrip('/')}"
                        parsed = urlparse(url)
                    if not parsed.netloc or '.' not in parsed.netloc or ' ' in parsed.netloc:
                        raise ValueError(f"Invalid domain: {original_url}")

                    clean_path = parsed.path.split('//')[0]
                    clean_url = urlunparse(parsed._replace(path=clean_path))

                    logging.info(f"Loading page: {clean_url}")
                    try:
                        response = page.goto(
                            clean_url,
                            timeout=self.config.request_timeout * 1000,
                            wait_until="load",
                            referer="https://www.google.com/"
                        )
                        if response and response.status >= 400:
                            raise ConnectionError(f"HTTP error {response.status} - {clean_url}")
                        logging.info(f"Page loaded: {clean_url}")
                    except Exception as e:
                        logging.error(f"Failed to load {url}: {str(e)}")
                        raise

                    page.wait_for_load_state("load")
                    logging.info(f"Load state reached for {url}")

                    # Extract the main content
                    html = self._extract_article_content(page)
                    if not html:
                        logging.error(f"No main content found at {url}")
                        return None
                    logging.info(f"HTML content extracted: {len(html)} characters")

                    # Extract metadata
                    title = page.title()
                    links = page.query_selector_all("a")
                    logging.info(f"Title: {title}")
                    logging.info(f"Links found: {len(links)}")

                    # Build the Markdown document
                    md_content = f"# {title}\n\n"
                    md_content += f"**URL**: [{url}]({url})\n\n"
                    md_content += self._html_to_markdown(html, url)

                    # Add related links only when they belong to the same domain
                    related_links = []
                    for link in links:
                        href = link.get_attribute("href")
                        if href and not href.startswith('#'):
                            full_url = urljoin(url, href)
                            if urlparse(full_url).netloc == parsed.netloc:
                                text = link.inner_text().strip()
                                if text:  # only add links that have anchor text
                                    related_links.append(f"- [{text}]({full_url})")

                    if related_links:
                        md_content += "\n\n## Related Links\n\n"
                        md_content += "\n".join(related_links)

                    context.close()
                    browser.close()

                    logging.info(f"Page processed successfully: {url}")
                    return md_content

            except TimeoutError:
                logging.warning(f"Timeout while accessing {url} (attempt {retries + 1})")
                retries += 1
                time.sleep(self.config.backoff_factor * (2 ** retries))
            except Exception as e:
                logging.error(f"Critical error at {url}: {str(e)}")
                retries += 1
                self.failed_urls.add(url)
                return None

        # All retries exhausted (repeated timeouts): record the failure
        self.failed_urls.add(url)
        return None

    def _should_skip_url(self, url: str) -> bool:
        """Check whether a URL should be skipped."""
        parsed = urlparse(url)

        # Check excluded patterns
        if self.config.excluded_patterns:
            for pattern in self.config.excluded_patterns:
                if pattern in url:
                    return True

        # Check file extensions
        if parsed.path.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')):
            return True

        # Check allowed domains
        if self.config.allowed_domains:
            if parsed.netloc not in self.config.allowed_domains:
                return True

        return False

    def crawl(self, start_url: str):
        try:
            domain = urlparse(start_url).netloc
            queue = [(start_url, 0)]

            while queue:
                url, depth = queue.pop(0)

                if depth > self.config.max_depth:
                    continue
                if url in self.visited:
                    continue
                if domain != urlparse(url).netloc:
                    continue

                logging.info(f"Processing: {url} (depth {depth})")
                content = self._scrape_page(url, depth)

                if content:
                    # Simplified file name
                    filename = f"{urlparse(url).netloc.replace('www.', '')}.md"
                    filepath = os.path.join(self.config.output_dir, filename)

                    # If it already exists, append a counter
                    counter = 1
                    while os.path.exists(filepath):
                        base, ext = os.path.splitext(filename)
                        filepath = os.path.join(self.config.output_dir, f"{base}_{counter}{ext}")
                        counter += 1

                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write(content)

                    self.visited.add(url)

                    # Extract new links to process
                    new_links = self._extract_links_from_content(content)
                    for link in new_links:
                        if link not in self.visited:
                            queue.append((link, depth + 1))

                # Pause between requests
                time.sleep(random.uniform(*self.config.politeness_delay))

                # Rotate proxy and user agent periodically
                if len(self.visited) % 10 == 0:
                    self._rotate_proxy()
                    self._rotate_user_agent()

            # Save the final report
            self._save_crawl_report(start_url)

        except Exception as e:
            logging.error(f"Error during crawl: {str(e)}")
            raise

    def _extract_links_from_content(self, content: str) -> List[str]:
        """Extract links from the Markdown content."""
        links = []
        excluded = self.config.excluded_patterns or []
        for line in content.split('\n'):
            if line.startswith('- [') and '](' in line and ')' in line:
                start = line.find('](') + 2
                end = line.find(')', start)
                if start > 1 and end > start:
                    link = line[start:end]
                    if not any(skip in link for skip in excluded):
                        links.append(link)
        return list(set(links))

    def _save_crawl_report(self, start_url: str):
        """Save a report of the crawl."""
        report = {
            "start_url": start_url,
            "timestamp": time.time(),
            "failed_urls": list(self.failed_urls),
            "visited_urls": list(self.visited),
            "config": self.config.to_dict()
        }

        report_file = f"{self.config.output_dir}/crawl_report_{int(time.time())}.json"
        with open(report_file, 'w') as f:
            json.dump(report, f, indent=2)


def run_scraper():
    # Improved default configuration
    config = ScraperConfig(
        max_depth=3,
        request_timeout=60,  # increased to 60 seconds
        max_retries=3,
        politeness_delay=(2, 5),
        output_dir="output_md",
        excluded_patterns=[
            '/tag/', '/category/', '/author/', '/page/',
            '/wp-content/', '/wp-admin/', '/feed/', '/rss/'
        ],
        save_metadata=True,
        clean_output=True,
        allowed_domains=["www.wpbeginner.com"]
    )

    # URL to process
    urls = ["https://www.wpbeginner.com"]

    scraper = Scraper(config)
    for url in urls:
        try:
            logging.info(f"Starting crawl of: {url}")
            scraper.crawl(url)
        except Exception as e:
            logging.error(f"Error while processing {url}: {str(e)}")
            continue


if __name__ == "__main__":
    run_scraper()