#!/usr/bin/env python3
"""
|
|
scraper.py
|
|
|
|
Author: Descomplicar® Crescimento Digital
|
|
Link: https://descomplicar.pt
|
|
Copyright: 2025 Descomplicar®
|
|
"""
|
|
|
|
import os
|
|
import time
|
|
import logging
|
|
import random
|
|
import json
|
|
from dataclasses import dataclass, asdict
|
|
from urllib.parse import urljoin, urlparse, urlunparse
|
|
from multiprocessing import Pool, cpu_count
|
|
from typing import List, Dict, Optional, Set
|
|
from pathlib import Path
|
|
|
|
from playwright.sync_api import sync_playwright, TimeoutError, Page
|
|
import markdownify
|
|
from dotenv import load_dotenv
|
|
|
|
# Load environment variables (e.g. PROXY_USER / PROXY_PASS) from a .env file.
load_dotenv()

# Log everything at INFO level, both to a file and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('scraper.log'), logging.StreamHandler()],
)
|
|
|
|
@dataclass
class ScraperConfig:
    """Runtime configuration for the scraper.

    All fields have sensible defaults; list-valued fields default to None,
    which the scraper treats as "not configured".
    """
    # Pool of User-Agent strings to rotate through; None -> built-in default.
    user_agents: Optional[List[str]] = None
    # Proxy server URLs to rotate through; None -> direct connection.
    proxies: Optional[List[str]] = None
    # Maximum crawl depth from the start URL.
    max_depth: int = 3
    # Per-request timeout, in seconds.
    request_timeout: int = 30
    # Attempts per page before giving up.
    max_retries: int = 3
    # Multiplier for the exponential retry backoff.
    backoff_factor: float = 0.5
    # (min, max) random sleep between requests, in seconds.
    politeness_delay: tuple = (1, 3)
    # Directory where Markdown output is written.
    output_dir: str = "output_md"
    # Only crawl these netlocs; None -> no restriction.
    allowed_domains: Optional[List[str]] = None
    # Skip any URL containing one of these substrings; None -> no filter.
    excluded_patterns: Optional[List[str]] = None
    # Whether page metadata should be persisted.
    save_metadata: bool = True
    # Whether Markdown output is run through the cleanup pass.
    clean_output: bool = True

    def to_dict(self) -> Dict:
        """Serialize the config to a plain dict (JSON-friendly)."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict) -> 'ScraperConfig':
        """Rebuild a config from a dict produced by to_dict()."""
        return cls(**data)

    def save(self, filepath: str):
        """Persist the config as pretty-printed JSON (UTF-8)."""
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def load(cls, filepath: str) -> 'ScraperConfig':
        """Load a config previously written by save()."""
        with open(filepath, encoding='utf-8') as f:
            return cls.from_dict(json.load(f))
|
|
|
|
class Scraper:
|
|
def __init__(self, config: ScraperConfig):
|
|
self.config = config
|
|
self.visited: Set[str] = set()
|
|
self.failed_urls: Set[str] = set()
|
|
self.metadata: Dict = {}
|
|
self.current_proxy = None
|
|
self.current_user_agent = None
|
|
|
|
# Criar diretórios necessários
|
|
self.setup_directories()
|
|
|
|
# Inicializar valores aleatórios
|
|
self._rotate_user_agent()
|
|
self._rotate_proxy()
|
|
|
|
def setup_directories(self):
|
|
"""Criar estrutura de diretórios necessária."""
|
|
os.makedirs(self.config.output_dir, exist_ok=True)
|
|
os.makedirs(f"{self.config.output_dir}/metadata", exist_ok=True)
|
|
os.makedirs(f"{self.config.output_dir}/raw", exist_ok=True)
|
|
|
|
def _rotate_user_agent(self):
|
|
if self.config.user_agents:
|
|
self.current_user_agent = random.choice(self.config.user_agents)
|
|
else:
|
|
self.current_user_agent = (
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
"Chrome/90.0.4430.212 Safari/537.36"
|
|
)
|
|
|
|
def _rotate_proxy(self):
|
|
if self.config.proxies:
|
|
self.current_proxy = random.choice(self.config.proxies)
|
|
|
|
def _get_browser_config(self):
|
|
config = {
|
|
"headless": True,
|
|
"timeout": self.config.request_timeout * 1000
|
|
}
|
|
|
|
if self.current_proxy:
|
|
config["proxy"] = {
|
|
"server": self.current_proxy,
|
|
"username": os.getenv("PROXY_USER"),
|
|
"password": os.getenv("PROXY_PASS")
|
|
}
|
|
|
|
return config
|
|
|
|
def _extract_metadata(self, page: Page, url: str) -> Dict:
|
|
"""Extrair metadados relevantes da página."""
|
|
metadata = {
|
|
"url": url,
|
|
"title": page.title(),
|
|
"timestamp": time.time(),
|
|
"headers": {},
|
|
"meta_tags": {}
|
|
}
|
|
|
|
# Extrair headers (h1-h6)
|
|
for i in range(1, 7):
|
|
headers = page.query_selector_all(f"h{i}")
|
|
metadata["headers"][f"h{i}"] = [h.inner_text() for h in headers]
|
|
|
|
# Extrair meta tags
|
|
meta_tags = page.query_selector_all("meta")
|
|
for tag in meta_tags:
|
|
name = tag.get_attribute("name") or tag.get_attribute("property")
|
|
content = tag.get_attribute("content")
|
|
if name and content:
|
|
metadata["meta_tags"][name] = content
|
|
|
|
return metadata
|
|
|
|
def _html_to_markdown(self, html: str, url: str) -> str:
|
|
try:
|
|
md = markdownify.markdownify(
|
|
html,
|
|
heading_style="ATX",
|
|
bullets='•◦▪‣⁃',
|
|
code_language_callback=lambda el: 'text'
|
|
)
|
|
|
|
# Limpeza adicional
|
|
if self.config.clean_output:
|
|
md = self._clean_markdown(md)
|
|
|
|
return f"# {urlparse(url).path}\n\n{md}\n\n---\n"
|
|
except Exception as e:
|
|
logging.error(f"Erro na conversão Markdown: {e}")
|
|
return ""
|
|
|
|
def _clean_markdown(self, content: str) -> str:
|
|
"""Limpa e melhora o conteúdo Markdown."""
|
|
lines = content.split('\n')
|
|
cleaned_lines = []
|
|
|
|
for line in lines:
|
|
# Remove linhas vazias consecutivas
|
|
if not line.strip() and cleaned_lines and not cleaned_lines[-1].strip():
|
|
continue
|
|
|
|
# Remove linhas com caracteres repetidos
|
|
if len(set(line.strip())) == 1 and len(line.strip()) > 3:
|
|
continue
|
|
|
|
# Remove linhas muito curtas que são apenas pontuação
|
|
if len(line.strip()) < 3 and not any(c.isalnum() for c in line):
|
|
continue
|
|
|
|
# Remove linhas de loading e javascript
|
|
if "loading" in line.lower() or "javascript" in line.lower():
|
|
continue
|
|
|
|
# Remove linhas que são apenas URLs
|
|
if line.strip().startswith('http') and len(line.split()) == 1:
|
|
continue
|
|
|
|
# Remove linhas que são apenas números ou datas
|
|
if line.strip().replace('/', '').replace('-', '').replace('.', '').isdigit():
|
|
continue
|
|
|
|
cleaned_lines.append(line)
|
|
|
|
content = '\n'.join(cleaned_lines)
|
|
|
|
# Remove múltiplos espaços em branco
|
|
content = ' '.join(content.split())
|
|
|
|
# Remove caracteres especiais repetidos
|
|
for char in '.,!?-':
|
|
content = content.replace(char + char, char)
|
|
|
|
return content
|
|
|
|
def _extract_article_content(self, page) -> str:
|
|
"""Extrai o conteúdo principal do artigo."""
|
|
# Tenta encontrar o conteúdo principal
|
|
selectors = [
|
|
"article",
|
|
"main",
|
|
".post-content",
|
|
".entry-content",
|
|
".article-content",
|
|
"#content",
|
|
".content"
|
|
]
|
|
|
|
content = None
|
|
for selector in selectors:
|
|
content = page.query_selector(selector)
|
|
if content:
|
|
break
|
|
|
|
if not content:
|
|
content = page.query_selector("body")
|
|
|
|
if not content:
|
|
return ""
|
|
|
|
# Remove elementos indesejados
|
|
remove_selectors = [
|
|
"header",
|
|
"footer",
|
|
"nav",
|
|
".sidebar",
|
|
"#sidebar",
|
|
".widget",
|
|
".comments",
|
|
".related-posts",
|
|
".social-share",
|
|
"script",
|
|
"style",
|
|
".advertisement",
|
|
".ad-",
|
|
"#cookie-notice"
|
|
]
|
|
|
|
for selector in remove_selectors:
|
|
elements = content.query_selector_all(selector)
|
|
for element in elements:
|
|
try:
|
|
page.evaluate("element => element.remove()", element)
|
|
except:
|
|
pass
|
|
|
|
return content.inner_html()
|
|
|
|
def _scrape_page(self, url: str, depth: int) -> Optional[str]:
|
|
if self._should_skip_url(url):
|
|
logging.info(f"URL ignorada pelos filtros: {url}")
|
|
return None
|
|
|
|
retries = 0
|
|
while retries < self.config.max_retries:
|
|
try:
|
|
logging.info(f"Tentativa {retries + 1} para {url}")
|
|
with sync_playwright() as playwright:
|
|
browser = playwright.chromium.launch(**self._get_browser_config())
|
|
context = browser.new_context(
|
|
java_script_enabled=True,
|
|
ignore_https_errors=True,
|
|
user_agent=self.current_user_agent
|
|
)
|
|
|
|
logging.info(f"Browser iniciado para {url}")
|
|
page = context.new_page()
|
|
|
|
# Normalização da URL
|
|
original_url = url
|
|
parsed = urlparse(url)
|
|
|
|
if not parsed.scheme:
|
|
url = f"https://{url.lstrip('/')}"
|
|
|
|
parsed = urlparse(url)
|
|
|
|
if not parsed.netloc or '.' not in parsed.netloc or ' ' in parsed.netloc:
|
|
raise ValueError(f"Domínio inválido: {original_url}")
|
|
|
|
clean_path = parsed.path.split('//')[0]
|
|
clean_url = urlunparse(parsed._replace(path=clean_path))
|
|
|
|
logging.info(f"Carregando página: {clean_url}")
|
|
try:
|
|
response = page.goto(
|
|
clean_url,
|
|
timeout=self.config.request_timeout * 1000,
|
|
wait_until="load",
|
|
referer="https://www.google.com/"
|
|
)
|
|
|
|
if response and response.status >= 400:
|
|
raise ConnectionError(f"Erro HTTP {response.status} - {clean_url}")
|
|
|
|
logging.info(f"Página carregada: {clean_url}")
|
|
|
|
except Exception as e:
|
|
logging.error(f"Falha ao carregar {url}: {str(e)}")
|
|
raise
|
|
|
|
page.wait_for_load_state("load")
|
|
logging.info(f"Estado de carregamento atingido para {url}")
|
|
|
|
# Extrair conteúdo principal
|
|
html = self._extract_article_content(page)
|
|
if not html:
|
|
logging.error(f"Nenhum conteúdo principal encontrado em {url}")
|
|
return None
|
|
|
|
logging.info(f"Conteúdo HTML extraído: {len(html)} caracteres")
|
|
|
|
# Extrair metadados
|
|
title = page.title()
|
|
links = page.query_selector_all("a")
|
|
logging.info(f"Título: {title}")
|
|
logging.info(f"Links encontrados: {len(links)}")
|
|
|
|
# Construir documento Markdown
|
|
md_content = f"# {title}\n\n"
|
|
md_content += f"**URL**: [{url}]({url})\n\n"
|
|
md_content += self._html_to_markdown(html, url)
|
|
|
|
# Adicionar links relacionados apenas se forem do mesmo domínio
|
|
related_links = []
|
|
for link in links:
|
|
href = link.get_attribute("href")
|
|
if href and not href.startswith('#'):
|
|
full_url = urljoin(url, href)
|
|
if urlparse(full_url).netloc == parsed.netloc:
|
|
text = link.inner_text().strip()
|
|
if text: # Só adiciona se tiver texto
|
|
related_links.append(f"- [{text}]({full_url})")
|
|
|
|
if related_links:
|
|
md_content += "\n\n## Links Relacionados\n\n"
|
|
md_content += "\n".join(related_links)
|
|
|
|
context.close()
|
|
browser.close()
|
|
|
|
logging.info(f"Página processada com sucesso: {url}")
|
|
return md_content
|
|
|
|
except TimeoutError:
|
|
logging.warning(f"Timeout ao acessar {url} (tentativa {retries+1})")
|
|
retries += 1
|
|
time.sleep(self.config.backoff_factor * (2 ** retries))
|
|
except Exception as e:
|
|
logging.error(f"Erro crítico em {url}: {str(e)}")
|
|
retries += 1
|
|
|
|
self.failed_urls.add(url)
|
|
return None
|
|
|
|
def _should_skip_url(self, url: str) -> bool:
|
|
"""Verifica se uma URL deve ser ignorada."""
|
|
parsed = urlparse(url)
|
|
|
|
# Verificar padrões excluídos
|
|
if self.config.excluded_patterns:
|
|
for pattern in self.config.excluded_patterns:
|
|
if pattern in url:
|
|
return True
|
|
|
|
# Verificar extensões de arquivo
|
|
if parsed.path.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')):
|
|
return True
|
|
|
|
# Verificar domínios permitidos
|
|
if self.config.allowed_domains:
|
|
if parsed.netloc not in self.config.allowed_domains:
|
|
return True
|
|
|
|
return False
|
|
|
|
def crawl(self, start_url: str):
|
|
try:
|
|
domain = urlparse(start_url).netloc
|
|
queue = [(start_url, 0)]
|
|
|
|
while queue:
|
|
url, depth = queue.pop(0)
|
|
|
|
if depth > self.config.max_depth:
|
|
continue
|
|
|
|
if url in self.visited:
|
|
continue
|
|
|
|
if domain != urlparse(url).netloc:
|
|
continue
|
|
|
|
logging.info(f"Processando: {url} (profundidade {depth})")
|
|
|
|
content = self._scrape_page(url, depth)
|
|
if content:
|
|
# Nome do ficheiro simplificado
|
|
filename = f"{urlparse(url).netloc.replace('www.','')}.md"
|
|
filepath = os.path.join(self.config.output_dir, filename)
|
|
|
|
# Se já existe, adiciona um número
|
|
counter = 1
|
|
while os.path.exists(filepath):
|
|
base, ext = os.path.splitext(filename)
|
|
filepath = os.path.join(self.config.output_dir, f"{base}_{counter}{ext}")
|
|
counter += 1
|
|
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
f.write(content)
|
|
|
|
self.visited.add(url)
|
|
|
|
# Extrair novos links para processar
|
|
new_links = self._extract_links_from_content(content)
|
|
for link in new_links:
|
|
if link not in self.visited:
|
|
queue.append((link, depth + 1))
|
|
|
|
# Pausa entre requisições
|
|
time.sleep(random.uniform(*self.config.politeness_delay))
|
|
|
|
# Rotacionar proxy e user agent periodicamente
|
|
if len(self.visited) % 10 == 0:
|
|
self._rotate_proxy()
|
|
self._rotate_user_agent()
|
|
|
|
# Salvar relatório final
|
|
self._save_crawl_report(start_url)
|
|
|
|
except Exception as e:
|
|
logging.error(f"Erro durante o crawl: {str(e)}")
|
|
raise
|
|
|
|
def _extract_links_from_content(self, content: str) -> List[str]:
|
|
"""Extrai links do conteúdo markdown."""
|
|
links = []
|
|
for line in content.split('\n'):
|
|
if line.startswith('- [') and '](' in line and ')' in line:
|
|
start = line.find('](') + 2
|
|
end = line.find(')', start)
|
|
if start > 1 and end > start:
|
|
link = line[start:end]
|
|
if not any(skip in link for skip in self.config.excluded_patterns):
|
|
links.append(link)
|
|
return list(set(links))
|
|
|
|
def _save_crawl_report(self, start_url: str):
|
|
"""Salva um relatório do crawl."""
|
|
report = {
|
|
"start_url": start_url,
|
|
"timestamp": time.time(),
|
|
"failed_urls": list(self.failed_urls),
|
|
"visited_urls": list(self.visited),
|
|
"config": self.config.to_dict()
|
|
}
|
|
|
|
report_file = f"{self.config.output_dir}/crawl_report_{int(time.time())}.json"
|
|
with open(report_file, 'w') as f:
|
|
json.dump(report, f, indent=2)
|
|
|
|
def run_scraper():
    """Entry point: build a default configuration and crawl the target site."""
    config = ScraperConfig(
        max_depth=3,
        request_timeout=60,  # generous timeout: 60 seconds
        max_retries=3,
        politeness_delay=(2, 5),
        output_dir="output_md",
        excluded_patterns=[
            '/tag/', '/category/', '/author/', '/page/',
            '/wp-content/', '/wp-admin/', '/feed/', '/rss/'
        ],
        save_metadata=True,
        clean_output=True,
        allowed_domains=["www.wpbeginner.com"]
    )

    scraper = Scraper(config)

    # One crawl per start URL; a failure on one URL never stops the rest.
    for url in ["https://www.wpbeginner.com"]:
        try:
            logging.info(f"Iniciando crawl de: {url}")
            scraper.crawl(url)
        except Exception as e:
            logging.error(f"Erro ao processar {url}: {str(e)}")
            continue


if __name__ == "__main__":
    run_scraper()
|