Files
scripts/scraper/scraper.py

503 lines
18 KiB
Python
Executable File

"""
scraper.py
Author: Descomplicar® Crescimento Digital
Link: https://descomplicar.pt
Copyright: 2025 Descomplicar®
"""
import os
import time
import logging
import random
import json
from dataclasses import dataclass, asdict
from urllib.parse import urljoin, urlparse, urlunparse
from multiprocessing import Pool, cpu_count
from typing import List, Dict, Optional, Set
from pathlib import Path
from playwright.sync_api import sync_playwright, TimeoutError, Page
import markdownify
from dotenv import load_dotenv
# Load environment variables from a local .env file (if present).
load_dotenv()

# Logging goes both to a file and to the console, with timestamped entries.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('scraper.log'),
        logging.StreamHandler(),
    ],
)
@dataclass
class ScraperConfig:
    """Runtime configuration for the scraper (crawl limits, politeness, output).

    List-valued options default to None, meaning "use built-in defaults".
    """

    # FIX: these fields default to None, so their annotations must be Optional.
    user_agents: Optional[List[str]] = None
    proxies: Optional[List[str]] = None
    max_depth: int = 3
    request_timeout: int = 30          # seconds; multiplied by 1000 for Playwright
    max_retries: int = 3
    backoff_factor: float = 0.5        # base for exponential backoff between retries
    politeness_delay: tuple = (1, 3)   # (min, max) seconds slept between requests
    output_dir: str = "output_md"
    allowed_domains: Optional[List[str]] = None
    excluded_patterns: Optional[List[str]] = None
    save_metadata: bool = True
    clean_output: bool = True

    def to_dict(self) -> Dict:
        """Return the configuration as a plain, JSON-serializable dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict) -> 'ScraperConfig':
        """Build a configuration from a dict produced by `to_dict`."""
        return cls(**data)

    def save(self, filepath: str):
        """Persist the configuration as pretty-printed JSON (UTF-8)."""
        # FIX: write with an explicit encoding instead of the platform default.
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def load(cls, filepath: str) -> 'ScraperConfig':
        """Load a configuration previously written by `save`."""
        with open(filepath, encoding='utf-8') as f:
            return cls.from_dict(json.load(f))
class Scraper:
    """Playwright-based web scraper that saves crawled pages as Markdown files."""

    def __init__(self, config: ScraperConfig):
        """Initialize crawl state and prepare the output directory tree.

        Args:
            config: Scraper settings (limits, delays, output paths, filters).
        """
        self.config = config
        # URLs already processed during this crawl.
        self.visited: Set[str] = set()
        # URLs that raised errors while being scraped (see _scrape_page).
        self.failed_urls: Set[str] = set()
        self.metadata: Dict = {}
        # Identity currently used for requests; rotated periodically by crawl().
        self.current_proxy = None
        self.current_user_agent = None
        # Create the required directories.
        self.setup_directories()
        # Initialize with random identity values.
        self._rotate_user_agent()
        self._rotate_proxy()
def setup_directories(self):
"""Criar estrutura de diretórios necessária."""
os.makedirs(self.config.output_dir, exist_ok=True)
os.makedirs(f"{self.config.output_dir}/metadata", exist_ok=True)
os.makedirs(f"{self.config.output_dir}/raw", exist_ok=True)
def _rotate_user_agent(self):
if self.config.user_agents:
self.current_user_agent = random.choice(self.config.user_agents)
else:
self.current_user_agent = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/90.0.4430.212 Safari/537.36"
)
def _rotate_proxy(self):
if self.config.proxies:
self.current_proxy = random.choice(self.config.proxies)
def _get_browser_config(self):
config = {
"headless": True,
"timeout": self.config.request_timeout * 1000
}
if self.current_proxy:
config["proxy"] = {
"server": self.current_proxy,
"username": os.getenv("PROXY_USER"),
"password": os.getenv("PROXY_PASS")
}
return config
def _extract_metadata(self, page: Page, url: str) -> Dict:
"""Extrair metadados relevantes da página."""
metadata = {
"url": url,
"title": page.title(),
"timestamp": time.time(),
"headers": {},
"meta_tags": {}
}
# Extrair headers (h1-h6)
for i in range(1, 7):
headers = page.query_selector_all(f"h{i}")
metadata["headers"][f"h{i}"] = [h.inner_text() for h in headers]
# Extrair meta tags
meta_tags = page.query_selector_all("meta")
for tag in meta_tags:
name = tag.get_attribute("name") or tag.get_attribute("property")
content = tag.get_attribute("content")
if name and content:
metadata["meta_tags"][name] = content
return metadata
def _html_to_markdown(self, html: str, url: str) -> str:
    """Convert an HTML fragment to Markdown, prefixed with the URL path as a heading.

    Returns an empty string when the conversion raises.
    """
    try:
        markdown = markdownify.markdownify(
            html,
            heading_style="ATX",
            bullets='•◦▪‣⁃',
            code_language_callback=lambda el: 'text',
        )
        # Optionally run the noise filter over the converted text.
        if self.config.clean_output:
            markdown = self._clean_markdown(markdown)
        return f"# {urlparse(url).path}\n\n{markdown}\n\n---\n"
    except Exception as e:
        logging.error(f"Erro na conversão Markdown: {e}")
        return ""
def _clean_markdown(self, content: str) -> str:
"""Limpa e melhora o conteúdo Markdown."""
lines = content.split('\n')
cleaned_lines = []
for line in lines:
# Remove linhas vazias consecutivas
if not line.strip() and cleaned_lines and not cleaned_lines[-1].strip():
continue
# Remove linhas com caracteres repetidos
if len(set(line.strip())) == 1 and len(line.strip()) > 3:
continue
# Remove linhas muito curtas que são apenas pontuação
if len(line.strip()) < 3 and not any(c.isalnum() for c in line):
continue
# Remove linhas de loading e javascript
if "loading" in line.lower() or "javascript" in line.lower():
continue
# Remove linhas que são apenas URLs
if line.strip().startswith('http') and len(line.split()) == 1:
continue
# Remove linhas que são apenas números ou datas
if line.strip().replace('/', '').replace('-', '').replace('.', '').isdigit():
continue
cleaned_lines.append(line)
content = '\n'.join(cleaned_lines)
# Remove múltiplos espaços em branco
content = ' '.join(content.split())
# Remove caracteres especiais repetidos
for char in '.,!?-':
content = content.replace(char + char, char)
return content
def _extract_article_content(self, page) -> str:
"""Extrai o conteúdo principal do artigo."""
# Tenta encontrar o conteúdo principal
selectors = [
"article",
"main",
".post-content",
".entry-content",
".article-content",
"#content",
".content"
]
content = None
for selector in selectors:
content = page.query_selector(selector)
if content:
break
if not content:
content = page.query_selector("body")
if not content:
return ""
# Remove elementos indesejados
remove_selectors = [
"header",
"footer",
"nav",
".sidebar",
"#sidebar",
".widget",
".comments",
".related-posts",
".social-share",
"script",
"style",
".advertisement",
".ad-",
"#cookie-notice"
]
for selector in remove_selectors:
elements = content.query_selector_all(selector)
for element in elements:
try:
page.evaluate("element => element.remove()", element)
except:
pass
return content.inner_html()
def _scrape_page(self, url: str, depth: int) -> Optional[str]:
    """Fetch a single page with Playwright and return it as a Markdown document.

    Retries up to `config.max_retries` times with exponential backoff.
    Returns None when the URL is filtered out, when no main content is found,
    or when every retry fails (in which case the URL is recorded in
    `failed_urls`). `depth` is currently unused but kept for the caller.
    """
    if self._should_skip_url(url):
        logging.info(f"URL ignorada pelos filtros: {url}")
        return None
    retries = 0
    while retries < self.config.max_retries:
        try:
            logging.info(f"Tentativa {retries + 1} para {url}")
            with sync_playwright() as playwright:
                browser = playwright.chromium.launch(**self._get_browser_config())
                context = browser.new_context(
                    java_script_enabled=True,
                    ignore_https_errors=True,
                    user_agent=self.current_user_agent
                )
                logging.info(f"Browser iniciado para {url}")
                page = context.new_page()
                # Normalize the URL: default to https and validate the host.
                original_url = url
                parsed = urlparse(url)
                if not parsed.scheme:
                    url = f"https://{url.lstrip('/')}"
                    parsed = urlparse(url)
                if not parsed.netloc or '.' not in parsed.netloc or ' ' in parsed.netloc:
                    raise ValueError(f"Domínio inválido: {original_url}")
                # Collapse accidental double slashes in the path.
                clean_path = parsed.path.split('//')[0]
                clean_url = urlunparse(parsed._replace(path=clean_path))
                logging.info(f"Carregando página: {clean_url}")
                try:
                    response = page.goto(
                        clean_url,
                        timeout=self.config.request_timeout * 1000,
                        wait_until="load",
                        referer="https://www.google.com/"
                    )
                    # Treat HTTP errors (4xx/5xx) as failures so they are retried.
                    if response and response.status >= 400:
                        raise ConnectionError(f"Erro HTTP {response.status} - {clean_url}")
                    logging.info(f"Página carregada: {clean_url}")
                except Exception as e:
                    logging.error(f"Falha ao carregar {url}: {str(e)}")
                    raise
                page.wait_for_load_state("load")
                logging.info(f"Estado de carregamento atingido para {url}")
                # Extract the main content block; a page without one is skipped.
                html = self._extract_article_content(page)
                if not html:
                    logging.error(f"Nenhum conteúdo principal encontrado em {url}")
                    return None
                logging.info(f"Conteúdo HTML extraído: {len(html)} caracteres")
                title = page.title()
                links = page.query_selector_all("a")
                logging.info(f"Título: {title}")
                logging.info(f"Links encontrados: {len(links)}")
                # Build the Markdown document: title, source URL, body.
                md_content = f"# {title}\n\n"
                md_content += f"**URL**: [{url}]({url})\n\n"
                md_content += self._html_to_markdown(html, url)
                # Append related links, keeping only same-domain anchors with text.
                related_links = []
                for link in links:
                    href = link.get_attribute("href")
                    if href and not href.startswith('#'):
                        full_url = urljoin(url, href)
                        if urlparse(full_url).netloc == parsed.netloc:
                            text = link.inner_text().strip()
                            if text:
                                related_links.append(f"- [{text}]({full_url})")
                if related_links:
                    md_content += "\n\n## Links Relacionados\n\n"
                    md_content += "\n".join(related_links)
                context.close()
                browser.close()
                logging.info(f"Página processada com sucesso: {url}")
                return md_content
        except TimeoutError:
            logging.warning(f"Timeout ao acessar {url} (tentativa {retries+1})")
            retries += 1
            time.sleep(self.config.backoff_factor * (2 ** retries))
        except Exception as e:
            logging.error(f"Erro crítico em {url}: {str(e)}")
            retries += 1
            # BUG FIX: the URL used to be added to failed_urls on the *first*
            # generic error even when a later retry succeeded, while timeouts
            # that exhausted all retries were never recorded at all. Back off
            # here the same way as for timeouts and record the failure once,
            # below, only after every retry has been used up.
            time.sleep(self.config.backoff_factor * (2 ** retries))
    # All retries exhausted: record the permanent failure.
    self.failed_urls.add(url)
    return None
def _should_skip_url(self, url: str) -> bool:
"""Verifica se uma URL deve ser ignorada."""
parsed = urlparse(url)
# Verificar padrões excluídos
if self.config.excluded_patterns:
for pattern in self.config.excluded_patterns:
if pattern in url:
return True
# Verificar extensões de arquivo
if parsed.path.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')):
return True
# Verificar domínios permitidos
if self.config.allowed_domains:
if parsed.netloc not in self.config.allowed_domains:
return True
return False
def crawl(self, start_url: str):
    """Breadth-first crawl starting at `start_url`, restricted to its domain.

    Each successfully scraped page is written as a Markdown file in the
    output directory; links discovered in the page are queued until
    `config.max_depth` is reached. A JSON report is saved at the end.

    Raises:
        Exception: re-raises any error after logging it.
    """
    try:
        domain = urlparse(start_url).netloc
        # BFS queue of (url, depth) pairs.
        queue = [(start_url, 0)]
        while queue:
            url, depth = queue.pop(0)
            if depth > self.config.max_depth:
                continue
            if url in self.visited:
                continue
            # Never leave the starting domain.
            if domain != urlparse(url).netloc:
                continue
            logging.info(f"Processando: {url} (profundidade {depth})")
            content = self._scrape_page(url, depth)
            if content:
                # Simplified file name derived from the domain.
                filename = f"{urlparse(url).netloc.replace('www.','')}.md"
                filepath = os.path.join(self.config.output_dir, filename)
                # If the file already exists, append a counter.
                counter = 1
                while os.path.exists(filepath):
                    base, ext = os.path.splitext(filename)
                    filepath = os.path.join(self.config.output_dir, f"{base}_{counter}{ext}")
                    counter += 1
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(content)
                self.visited.add(url)
                # Queue newly discovered links for processing.
                new_links = self._extract_links_from_content(content)
                for link in new_links:
                    if link not in self.visited:
                        queue.append((link, depth + 1))
            # Politeness pause between requests.
            time.sleep(random.uniform(*self.config.politeness_delay))
            # Rotate proxy and user agent periodically.
            if len(self.visited) % 10 == 0:
                self._rotate_proxy()
                self._rotate_user_agent()
        # Save the final crawl report.
        self._save_crawl_report(start_url)
    except Exception as e:
        logging.error(f"Erro durante o crawl: {str(e)}")
        raise
def _extract_links_from_content(self, content: str) -> List[str]:
"""Extrai links do conteúdo markdown."""
links = []
for line in content.split('\n'):
if line.startswith('- [') and '](' in line and ')' in line:
start = line.find('](') + 2
end = line.find(')', start)
if start > 1 and end > start:
link = line[start:end]
if not any(skip in link for skip in self.config.excluded_patterns):
links.append(link)
return list(set(links))
def _save_crawl_report(self, start_url: str):
"""Salva um relatório do crawl."""
report = {
"start_url": start_url,
"timestamp": time.time(),
"failed_urls": list(self.failed_urls),
"visited_urls": list(self.visited),
"config": self.config.to_dict()
}
report_file = f"{self.config.output_dir}/crawl_report_{int(time.time())}.json"
with open(report_file, 'w') as f:
json.dump(report, f, indent=2)
def run_scraper():
    """Entry point: crawl a hard-coded list of sites with a tuned default config."""
    # Improved default configuration.
    config = ScraperConfig(
        max_depth=3,
        request_timeout=60,  # raised to 60 seconds for slow pages
        max_retries=3,
        politeness_delay=(2, 5),
        output_dir="output_md",
        # Skip archive/meta pages that mostly duplicate content.
        excluded_patterns=[
            '/tag/', '/category/', '/author/', '/page/',
            '/wp-content/', '/wp-admin/', '/feed/', '/rss/'
        ],
        save_metadata=True,
        clean_output=True,
        allowed_domains=["www.wpbeginner.com"]
    )
    # URLs to process.
    urls = ["https://www.wpbeginner.com"]
    scraper = Scraper(config)
    for url in urls:
        try:
            logging.info(f"Iniciando crawl de: {url}")
            scraper.crawl(url)
        except Exception as e:
            # Log and continue with the next URL instead of aborting the batch.
            logging.error(f"Erro ao processar {url}: {str(e)}")
            continue


if __name__ == "__main__":
    run_scraper()