init: scripts diversos (crawlers, conversores, scrapers)

2026-03-05 20:38:36 +00:00
commit 6ac6f4be2a
925 changed files with 850330 additions and 0 deletions
@@ -0,0 +1,502 @@
+"""
+scraper.py
+
+Author: Descomplicar® Crescimento Digital
+Link: https://descomplicar.pt
+Copyright: 2025 Descomplicar®
+"""
+
+import os
+import time
+import logging
+import random
+import json
+from collections import deque
+from dataclasses import dataclass, asdict
+from urllib.parse import urljoin, urlparse, urlunparse
+from multiprocessing import Pool, cpu_count
+from typing import List, Dict, Optional, Set
+from pathlib import Path
+
+from playwright.sync_api import sync_playwright, TimeoutError, Page
+import markdownify
+from dotenv import load_dotenv
+
+# Carregar variáveis de ambiente
+load_dotenv()
+
+# Configuração de logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('scraper.log'),
+        logging.StreamHandler()
+    ]
+)
+
+@dataclass
+class ScraperConfig:
+    user_agents: List[str] = None
+    proxies: List[str] = None
+    max_depth: int = 3
+    request_timeout: int = 30
+    max_retries: int = 3
+    backoff_factor: float = 0.5
+    politeness_delay: tuple = (1, 3)
+    output_dir: str = "output_md"
+    allowed_domains: List[str] = None
+    excluded_patterns: List[str] = None
+    save_metadata: bool = True
+    clean_output: bool = True
+    
+    def to_dict(self) -> Dict:
+        return asdict(self)
+    
+    @classmethod
+    def from_dict(cls, data: Dict) -> 'ScraperConfig':
+        return cls(**data)
+    
+    def save(self, filepath: str):
+        with open(filepath, 'w') as f:
+            json.dump(self.to_dict(), f, indent=2)
+    
+    @classmethod
+    def load(cls, filepath: str) -> 'ScraperConfig':
+        with open(filepath) as f:
+            return cls.from_dict(json.load(f))
+
+class Scraper:
+    def __init__(self, config: ScraperConfig):
+        self.config = config
+        self.visited: Set[str] = set()
+        self.failed_urls: Set[str] = set()
+        self.metadata: Dict = {}
+        self.current_proxy = None
+        self.current_user_agent = None
+        
+        # Criar diretórios necessários
+        self.setup_directories()
+        
+        # Inicializar valores aleatórios
+        self._rotate_user_agent()
+        self._rotate_proxy()
+    
+    def setup_directories(self):
+        """Criar estrutura de diretórios necessária."""
+        os.makedirs(self.config.output_dir, exist_ok=True)
+        os.makedirs(f"{self.config.output_dir}/metadata", exist_ok=True)
+        os.makedirs(f"{self.config.output_dir}/raw", exist_ok=True)
+
+    def _rotate_user_agent(self):
+        if self.config.user_agents:
+            self.current_user_agent = random.choice(self.config.user_agents)
+        else:
+            self.current_user_agent = (
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/90.0.4430.212 Safari/537.36"
+            )
+
+    def _rotate_proxy(self):
+        if self.config.proxies:
+            self.current_proxy = random.choice(self.config.proxies)
+
+    def _get_browser_config(self):
+        config = {
+            "headless": True,
+            "timeout": self.config.request_timeout * 1000
+        }
+        
+        if self.current_proxy:
+            config["proxy"] = {
+                "server": self.current_proxy,
+                "username": os.getenv("PROXY_USER"),
+                "password": os.getenv("PROXY_PASS")
+            }
+            
+        return config
+
+    def _extract_metadata(self, page: Page, url: str) -> Dict:
+        """Extrair metadados relevantes da página."""
+        metadata = {
+            "url": url,
+            "title": page.title(),
+            "timestamp": time.time(),
+            "headers": {},
+            "meta_tags": {}
+        }
+        
+        # Extrair headers (h1-h6)
+        for i in range(1, 7):
+            headers = page.query_selector_all(f"h{i}")
+            metadata["headers"][f"h{i}"] = [h.inner_text() for h in headers]
+        
+        # Extrair meta tags
+        meta_tags = page.query_selector_all("meta")
+        for tag in meta_tags:
+            name = tag.get_attribute("name") or tag.get_attribute("property")
+            content = tag.get_attribute("content")
+            if name and content:
+                metadata["meta_tags"][name] = content
+        
+        return metadata
+
+    def _html_to_markdown(self, html: str, url: str) -> str:
+        try:
+            md = markdownify.markdownify(
+                html,
+                heading_style="ATX",
+                bullets='•◦▪‣⁃',
+                code_language_callback=lambda el: 'text'
+            )
+            
+            # Limpeza adicional
+            if self.config.clean_output:
+                md = self._clean_markdown(md)
+            
+            return f"# {urlparse(url).path}\n\n{md}\n\n---\n"
+        except Exception as e:
+            logging.error(f"Erro na conversão Markdown: {e}")
+            return ""
+    
+    def _clean_markdown(self, content: str) -> str:
+        """Limpa e melhora o conteúdo Markdown."""
+        lines = content.split('\n')
+        cleaned_lines = []
+        
+        for line in lines:
+            # Remove linhas vazias consecutivas
+            if not line.strip() and cleaned_lines and not cleaned_lines[-1].strip():
+                continue
+                
+            # Remove linhas com caracteres repetidos
+            if len(set(line.strip())) == 1 and len(line.strip()) > 3:
+                continue
+                
+            # Remove linhas muito curtas que são apenas pontuação
+            if len(line.strip()) < 3 and not any(c.isalnum() for c in line):
+                continue
+                
+            # Remove linhas de loading e javascript
+            if "loading" in line.lower() or "javascript" in line.lower():
+                continue
+                
+            # Remove linhas que são apenas URLs
+            if line.strip().startswith('http') and len(line.split()) == 1:
+                continue
+                
+            # Remove linhas que são apenas números ou datas
+            if line.strip().replace('/', '').replace('-', '').replace('.', '').isdigit():
+                continue
+                
+            cleaned_lines.append(line)
+        
+        content = '\n'.join(cleaned_lines)
+        
+        # Remove múltiplos espaços em branco
+        content = ' '.join(content.split())
+        
+        # Remove caracteres especiais repetidos
+        for char in '.,!?-':
+            content = content.replace(char + char, char)
+        
+        return content
+
+    def _extract_article_content(self, page) -> str:
+        """Extrai o conteúdo principal do artigo."""
+        # Tenta encontrar o conteúdo principal
+        selectors = [
+            "article", 
+            "main", 
+            ".post-content",
+            ".entry-content",
+            ".article-content",
+            "#content",
+            ".content"
+        ]
+        
+        content = None
+        for selector in selectors:
+            content = page.query_selector(selector)
+            if content:
+                break
+                
+        if not content:
+            content = page.query_selector("body")
+            
+        if not content:
+            return ""
+            
+        # Remove elementos indesejados
+        remove_selectors = [
+            "header",
+            "footer", 
+            "nav",
+            ".sidebar",
+            "#sidebar",
+            ".widget",
+            ".comments",
+            ".related-posts",
+            ".social-share",
+            "script",
+            "style",
+            ".advertisement",
+            ".ad-",
+            "#cookie-notice"
+        ]
+        
+        for selector in remove_selectors:
+            elements = content.query_selector_all(selector)
+            for element in elements:
+                try:
+                    page.evaluate("element => element.remove()", element)
+                except:
+                    pass
+        
+        return content.inner_html()
+
+    def _scrape_page(self, url: str, depth: int) -> Optional[str]:
+        if self._should_skip_url(url):
+            logging.info(f"URL ignorada pelos filtros: {url}")
+            return None
+            
+        retries = 0
+        while retries < self.config.max_retries:
+            try:
+                logging.info(f"Tentativa {retries + 1} para {url}")
+                with sync_playwright() as playwright:
+                    browser = playwright.chromium.launch(**self._get_browser_config())
+                    context = browser.new_context(
+                        java_script_enabled=True,
+                        ignore_https_errors=True,
+                        user_agent=self.current_user_agent
+                    )
+                    
+                    logging.info(f"Browser iniciado para {url}")
+                    page = context.new_page()
+                    
+                    # Normalização da URL
+                    original_url = url
+                    parsed = urlparse(url)
+                    
+                    if not parsed.scheme:
+                        url = f"https://{url.lstrip('/')}"
+                    
+                    parsed = urlparse(url)
+                    
+                    if not parsed.netloc or '.' not in parsed.netloc or ' ' in parsed.netloc:
+                        raise ValueError(f"Domínio inválido: {original_url}")
+                    
+                    clean_path = parsed.path.split('//')[0]
+                    clean_url = urlunparse(parsed._replace(path=clean_path))
+                    
+                    logging.info(f"Carregando página: {clean_url}")
+                    try:
+                        response = page.goto(
+                            clean_url,
+                            timeout=self.config.request_timeout * 1000,
+                            wait_until="load",
+                            referer="https://www.google.com/"
+                        )
+                        
+                        if response and response.status >= 400:
+                            raise ConnectionError(f"Erro HTTP {response.status} - {clean_url}")
+                            
+                        logging.info(f"Página carregada: {clean_url}")
+                        
+                    except Exception as e:
+                        logging.error(f"Falha ao carregar {url}: {str(e)}")
+                        raise
+                        
+                    page.wait_for_load_state("load")
+                    logging.info(f"Estado de carregamento atingido para {url}")
+                    
+                    # Extrair conteúdo principal
+                    html = self._extract_article_content(page)
+                    if not html:
+                        logging.error(f"Nenhum conteúdo principal encontrado em {url}")
+                        return None
+                        
+                    logging.info(f"Conteúdo HTML extraído: {len(html)} caracteres")
+                    
+                    # Extrair metadados
+                    title = page.title()
+                    links = page.query_selector_all("a")
+                    logging.info(f"Título: {title}")
+                    logging.info(f"Links encontrados: {len(links)}")
+                    
+                    # Construir documento Markdown
+                    md_content = f"# {title}\n\n"
+                    md_content += f"**URL**: [{url}]({url})\n\n"
+                    md_content += self._html_to_markdown(html, url)
+                    
+                    # Adicionar links relacionados apenas se forem do mesmo domínio
+                    related_links = []
+                    for link in links:
+                        href = link.get_attribute("href")
+                        if href and not href.startswith('#'):
+                            full_url = urljoin(url, href)
+                            if urlparse(full_url).netloc == parsed.netloc:
+                                text = link.inner_text().strip()
+                                if text:  # Só adiciona se tiver texto
+                                    related_links.append(f"- [{text}]({full_url})")
+                    
+                    if related_links:
+                        md_content += "\n\n## Links Relacionados\n\n"
+                        md_content += "\n".join(related_links)
+                    
+                    context.close()
+                    browser.close()
+                    
+                    logging.info(f"Página processada com sucesso: {url}")
+                    return md_content
+                    
+            except TimeoutError:
+                logging.warning(f"Timeout ao acessar {url} (tentativa {retries+1})")
+                retries += 1
+                time.sleep(self.config.backoff_factor * (2 ** retries))
+            except Exception as e:
+                logging.error(f"Erro crítico em {url}: {str(e)}")
+                retries += 1
+                
+        self.failed_urls.add(url)
+        return None
+
+    def _should_skip_url(self, url: str) -> bool:
+        """Verifica se uma URL deve ser ignorada."""
+        parsed = urlparse(url)
+        
+        # Verificar padrões excluídos
+        if self.config.excluded_patterns:
+            for pattern in self.config.excluded_patterns:
+                if pattern in url:
+                    return True
+        
+        # Verificar extensões de arquivo
+        if parsed.path.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')):
+            return True
+        
+        # Verificar domínios permitidos
+        if self.config.allowed_domains:
+            if parsed.netloc not in self.config.allowed_domains:
+                return True
+        
+        return False
+
+    def crawl(self, start_url: str):
+        try:
+            domain = urlparse(start_url).netloc
+            queue = [(start_url, 0)]
+            
+            while queue:
+                url, depth = queue.pop(0)
+                
+                if depth > self.config.max_depth:
+                    continue
+                    
+                if url in self.visited:
+                    continue
+                    
+                if domain != urlparse(url).netloc:
+                    continue
+
+                logging.info(f"Processando: {url} (profundidade {depth})")
+                
+                content = self._scrape_page(url, depth)
+                if content:
+                    # Nome do ficheiro simplificado
+                    filename = f"{urlparse(url).netloc.replace('www.','')}.md"
+                    filepath = os.path.join(self.config.output_dir, filename)
+                    
+                    # Se já existe, adiciona um número
+                    counter = 1
+                    while os.path.exists(filepath):
+                        base, ext = os.path.splitext(filename)
+                        filepath = os.path.join(self.config.output_dir, f"{base}_{counter}{ext}")
+                        counter += 1
+                    
+                    with open(filepath, 'w', encoding='utf-8') as f:
+                        f.write(content)
+                    
+                    self.visited.add(url)
+                    
+                    # Extrair novos links para processar
+                    new_links = self._extract_links_from_content(content)
+                    for link in new_links:
+                        if link not in self.visited:
+                            queue.append((link, depth + 1))
+                    
+                    # Pausa entre requisições
+                    time.sleep(random.uniform(*self.config.politeness_delay))
+                
+                # Rotacionar proxy e user agent periodicamente
+                if len(self.visited) % 10 == 0:
+                    self._rotate_proxy()
+                    self._rotate_user_agent()
+            
+            # Salvar relatório final
+            self._save_crawl_report(start_url)
+            
+        except Exception as e:
+            logging.error(f"Erro durante o crawl: {str(e)}")
+            raise
+
+    def _extract_links_from_content(self, content: str) -> List[str]:
+        """Extrai links do conteúdo markdown."""
+        links = []
+        for line in content.split('\n'):
+            if line.startswith('- [') and '](' in line and ')' in line:
+                start = line.find('](') + 2
+                end = line.find(')', start)
+                if start > 1 and end > start:
+                    link = line[start:end]
+                    if not any(skip in link for skip in self.config.excluded_patterns):
+                        links.append(link)
+        return list(set(links))
+
+    def _save_crawl_report(self, start_url: str):
+        """Salva um relatório do crawl."""
+        report = {
+            "start_url": start_url,
+            "timestamp": time.time(),
+            "failed_urls": list(self.failed_urls),
+            "visited_urls": list(self.visited),
+            "config": self.config.to_dict()
+        }
+        
+        report_file = f"{self.config.output_dir}/crawl_report_{int(time.time())}.json"
+        with open(report_file, 'w') as f:
+            json.dump(report, f, indent=2)
+
+def run_scraper():
+    # Configuração padrão melhorada
+    config = ScraperConfig(
+        max_depth=3,
+        request_timeout=60,  # Aumentado para 60 segundos
+        max_retries=3,
+        politeness_delay=(2, 5),
+        output_dir="output_md",
+        excluded_patterns=[
+            '/tag/', '/category/', '/author/', '/page/', 
+            '/wp-content/', '/wp-admin/', '/feed/', '/rss/'
+        ],
+        save_metadata=True,
+        clean_output=True,
+        allowed_domains=["www.wpbeginner.com"]
+    )
+    
+    # URL para processar
+    urls = ["https://www.wpbeginner.com"]
+    
+    scraper = Scraper(config)
+    
+    for url in urls:
+        try:
+            logging.info(f"Iniciando crawl de: {url}")
+            scraper.crawl(url)
+        except Exception as e:
+            logging.error(f"Erro ao processar {url}: {str(e)}")
+            continue
+
+# Run the crawl only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    run_scraper()