init: scripts diversos (crawlers, conversores, scrapers)

2026-03-05 20:38:36 +00:00
commit 6ac6f4be2a
925 changed files with 850330 additions and 0 deletions
@@ -0,0 +1,6 @@
+.env
+venv/
+.venv/
+output/
+__pycache__/
+*.pyc
@@ -0,0 +1,139 @@
+"""
+4ai.py
+
+Author: Descomplicar® Crescimento Digital
+Link: https://descomplicar.pt
+Copyright: 2025 Descomplicar®
+"""
+
+import asyncio
+from random import uniform
+from datetime import datetime
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+import json
+import os
+import aiofiles
+from urllib.parse import urlparse
+
+class CleanCrawler:
+    def __init__(self, max_depth=2, max_concurrency=10, base_domain=None):
+        self.max_depth = max_depth
+        self.visited = set()
+        self.content = []
+        self.semaphore = asyncio.Semaphore(max_concurrency)
+        self.base_domain = urlparse(base_domain).hostname if base_domain else None
+        
+        # Configurações reutilizáveis
+        self.browser_config = BrowserConfig(
+            headless=True,
+            viewport_width=1280,
+            viewport_height=720
+        )
+        self.crawler_config = CrawlerRunConfig(
+            word_count_threshold=10,
+            exclude_external_links=True,
+            remove_overlay_elements=True
+        )
+        self.retry_attempts = 3  # Tentativas de retry
+        self.request_timeout = 30  # Timeout em segundos
+        
+    async def crawl_page(self, url, depth=0):
+        if depth > self.max_depth or url in self.visited:
+            return
+
+        # Configurações do browser
+        browser_config = BrowserConfig(
+            headless=True,
+            viewport_width=1280,
+            viewport_height=720
+        )
+
+        # Configurações do crawler
+        crawler_config = CrawlerRunConfig(
+            word_count_threshold=10,
+            exclude_external_links=True,
+            remove_overlay_elements=True
+        )
+
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            for attempt in range(self.retry_attempts):
+                try:
+                    result = await asyncio.wait_for(
+                        crawler.arun(url=url, config=crawler_config),
+                        timeout=self.request_timeout
+                    )
+                    break
+                except (asyncio.TimeoutError, ConnectionError) as e:
+                    if attempt == self.retry_attempts - 1:
+                        raise
+                    await asyncio.sleep(2 ** attempt)
+            else:
+                return
+                
+                if result.success:
+                    self.visited.add(url)
+                    
+                    # Extrair apenas o conteúdo relevante
+                    title = f"# {url}"
+                    content = result.markdown
+                    
+                    # Armazenar de forma estruturada
+                    self.content.append({
+                        "depth": depth,
+                        "url": url,
+                        "title": title,
+                        "content": content
+                    })
+                    
+                    # Processar links da página se não atingiu profundidade máxima
+                    if depth < self.max_depth:
+                        internal_links = result.links.get("internal", [])
+                        tasks = []
+                        for link in internal_links:
+                            if link["href"] not in self.visited:
+                                tasks.append(self.crawl_page(link["href"], depth + 1))
+                        if tasks:
+                            await asyncio.gather(*tasks)
+
+            except Exception as e:
+                print(f"Erro ao processar {url}: {e}")
+
+    async def save_content(self, output_dir="output"):
+        os.makedirs(output_dir, exist_ok=True)
+        
+        # Organizar conteúdo por profundidade
+        organized_content = {}
+        for item in self.content:
+            depth = item["depth"]
+            if depth not in organized_content:
+                organized_content[depth] = []
+            organized_content[depth].append(item)
+            
+        # Salvar arquivos de forma assíncrona
+        tasks = []
+        for depth, items in organized_content.items():
+            filename = os.path.join(output_dir, f"nivel_{depth}.md")
+            tasks.append(self._async_write_file(filename, items))
+        
+        await asyncio.gather(*tasks)
+    
+    async def _async_write_file(self, filename, items):
+        async with aiofiles.open(filename, "w", encoding="utf-8") as f:
+            for item in items:
+                await f.write(f"\n{item['title']}\n\n")
+                await f.write(f"{item['content']}\n")
+                await f.write("-" * 80 + "\n")
+
+async def main():
+    url = "https://crawl4ai.com/mkdocs/"
+    
+    print("🕷️ Iniciando crawling...")
+    crawler = CleanCrawler(max_depth=2)
+    await crawler.crawl_page(url)
+    
+    print(f"✅ Páginas crawleadas: {len(crawler.visited)}")
+    crawler.save_content()
+    print("✅ Conteúdo salvo na pasta output/")
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -0,0 +1,23 @@
+# crawl4all - Async Web Crawler
+
+Crawler assíncrono baseado em crawl4ai com profundidade configurável.
+
+## Setup
+```bash
+python3 -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+```
+
+## Uso
+```bash
+# Editar config.py com URL alvo
+python r1.py
+```
+
+## Configuração
+Editar `config.py`:
+- `BASE_URL` - URL alvo
+- `MAX_DEPTH` - Profundidade máxima (default: 2)
+- `MAX_CONCURRENCY` - Workers paralelos (default: 10)
+- `OUTPUT_DIR` - Pasta de output (default: "output")
@@ -0,0 +1,18 @@
+import os
+
+# Start URL of the crawl; r1.py also passes it as the base domain filter.
+BASE_URL = "https://descomplicar.pt/"
+# Deepest link level to follow (the start URL is level 0).
+MAX_DEPTH = 2
+# Size of the semaphore that limits parallel page fetches.
+MAX_CONCURRENCY = 10
+# Number of fetch attempts per page.
+RETRY_ATTEMPTS = 3
+# Timeout of a single fetch attempt, in seconds (passed to asyncio.wait_for).
+REQUEST_TIMEOUT = 30
+# Directory that receives the generated Markdown files.
+OUTPUT_DIR = "output"
+# Keyword values forwarded to crawl4ai's BrowserConfig.
+BROWSER_CONFIG = {
+    "headless": True,
+    "viewport_width": 1280,
+    "viewport_height": 720
+}
+# Keyword values forwarded to crawl4ai's CrawlerRunConfig.
+CRAWLER_CONFIG = {
+    "word_count_threshold": 10,
+    "exclude_external_links": True,
+    "remove_overlay_elements": True
+}
@@ -0,0 +1,241 @@
+"""
+r1.py
+
+Author: Descomplicar® Crescimento Digital
+Link: https://descomplicar.pt
+Copyright: 2025 Descomplicar®
+"""
+
+import asyncio
+import logging
+import aiohttp
+from typing import Optional, Dict, List, Set, Any
+from random import uniform
+from datetime import datetime
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+import aiofiles
+import os
+from urllib.parse import urlparse
+import config  # Importando o módulo config
+from typing import Optional, Dict, List, Set, Any
+from typing import Optional, Dict, List, Set, Any
+
+# Configuração do logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+class CleanCrawler:
+    """Crawler assíncrono para coletar conteúdo de sites."""
+    
+    def __init__(self, max_depth: int = 2, max_concurrency: int = 10, base_domain: Optional[str] = None) -> None:
+        """
+        Inicializa a classe CleanCrawler com validação de configuração.
+
+        :param max_depth: Profundidade máxima de navegação (>= 0)
+        :param max_concurrency: Número máximo de requisições paralelas (>= 1)
+        :param base_domain: Domínio base para limitar a coleta (opcional)
+        :raises ValueError: Se parâmetros inválidos ou configuração ausente
+        """
+        # Validação de parâmetros
+        if max_depth < 0 or max_concurrency < 1:
+            raise ValueError("Parâmetros inválidos: max_depth deve ser >= 0 e max_concurrency >= 1")
+            
+        # Validação da configuração
+        required_config = [
+            'RETRY_ATTEMPTS', 'REQUEST_TIMEOUT', 'BROWSER_CONFIG',
+            'CRAWLER_CONFIG', 'MAX_DEPTH', 'MAX_CONCURRENCY', 'BASE_URL', 'OUTPUT_DIR'
+        ]
+        for attr in required_config:
+            if not hasattr(config, attr):
+                raise ValueError(f"Configuração obrigatória ausente: {attr}")
+
+        self.max_depth = max_depth
+        self.visited: Set[str] = set()
+        self.content: List[Dict[str, Any]] = []
+        self.semaphore = asyncio.Semaphore(max_concurrency)
+        self.base_domain = urlparse(base_domain).hostname if base_domain else None
+        self.retry_attempts: int = config.RETRY_ATTEMPTS
+        self.request_timeout: int = config.REQUEST_TIMEOUT
+        
+        # Configurações reutilizáveis
+        self.browser_config = BrowserConfig(
+            headless=config.BROWSER_CONFIG["headless"],
+            viewport_width=config.BROWSER_CONFIG["viewport_width"],
+            viewport_height=config.BROWSER_CONFIG["viewport_height"]
+        )
+        self.crawler_config = CrawlerRunConfig(
+            word_count_threshold=config.CRAWLER_CONFIG["word_count_threshold"],
+            exclude_external_links=config.CRAWLER_CONFIG["exclude_external_links"],
+            remove_overlay_elements=config.CRAWLER_CONFIG["remove_overlay_elements"]
+        )
+
+    def _is_valid_url(self, url: str) -> bool:
+        """
+        Valida URLs com lógica expandida para captura de conteúdo
+        
+        :param url: URL a ser validada
+        :return: True se a URL for válida para crawling
+        """
+        if not self.base_domain:
+            logging.debug(f"Permitindo URL sem domínio base: {url}")
+            return True
+            
+        try:
+            parsed = urlparse(url)
+            
+            # URLs relativas sempre permitidas
+            if not parsed.netloc:
+                logging.debug(f"Permitindo URL relativa: {url}")
+                return True
+                
+            # Normaliza domínios para comparação
+            target_domain = parsed.netloc.lower()
+            base_domain = self.base_domain.lower()
+            
+            # Verifica subdomínios e domínio principal
+            if target_domain == base_domain:
+                return True
+                
+            if target_domain.endswith(f".{base_domain}"):
+                logging.debug(f"Permitindo subdomínio válido: {url}")
+                return True
+                
+            logging.debug(f"Bloqueando domínio não relacionado: {url}")
+            return False
+            
+        except Exception as e:
+            logging.error(f"Erro crítico na validação de URL: {str(e)}")
+            return False
+    
+    async def crawl_page(self, url, depth=0, parent_url=None):
+        """
+        Realiza o crawling de uma página.
+
+        :param url: URL da página a ser crawleada.
+        :param depth: Profundidade atual de navegação.
+        :param parent_url: URL da página pai, se houver.
+        """
+        logging.info(f"Realizando crawling da URL: {url}, profundidade: {depth}, parent_url: {parent_url}")
+        
+        if depth > self.max_depth:
+            logging.info(f"Profundidade máxima alcançada ou URL já visitada ou URL inválida: {url}")
+            return
+        if url in self.visited:
+            logging.info(f"URL já visitada: {url}")
+            return
+        if not self._is_valid_url(url):
+            logging.info(f"URL inválida: {url}")
+            return
+
+        async with self.semaphore:
+            try:
+                # Delay para politeness policy
+                await asyncio.sleep(uniform(0.5, 1.5))
+                
+                async with AsyncWebCrawler(config=self.browser_config) as crawler:
+                    logging.info(f"Iniciando crawler para a URL: {url}, config: {self.crawler_config}")
+                    for attempt in range(self.retry_attempts):
+                        try:
+                            logging.info(f"Tentativa {attempt + 1} de {self.retry_attempts} para {url}")
+                            result = await asyncio.wait_for(
+                                crawler.arun(url=url, config=self.crawler_config),
+                                timeout=self.request_timeout
+                            )
+                            logging.info(f"Sucesso ao acessar {url}, result: {result}")
+                            break
+                        except (asyncio.TimeoutError, ConnectionError) as e:
+                            if attempt == self.retry_attempts - 1:
+                                logging.error(f"Erro ao acessar {url} após {attempt + 1} tentativas: {str(e)}")
+                                raise
+                            await asyncio.sleep(2 ** attempt)
+                            logging.info(f"Tentativa {attempt + 2} de {self.retry_attempts} para {url}")
+                        
+                        if result.success:
+                            self.visited.add(url)
+                            logging.info(f"URL visitada com sucesso: {url}")
+                            
+                            title = f"# {url}"
+                            content = result.markdown
+                            
+                            self.content.append({
+                                "depth": depth,
+                                "url": url,
+                                "title": title,
+                                "content": content,
+                                "timestamp": datetime.utcnow().isoformat(),
+                                "parent_url": parent_url,
+                                "status": "success"
+                            })
+                            
+                            if depth < self.max_depth:
+                                internal_links = result.links.get("internal", [])
+                                tasks = [
+                                    self.crawl_page(link["href"], depth + 1, parent_url=url)
+                                    for link in internal_links
+                                    if link["href"] not in self.visited
+                                ]
+                                if tasks:
+                                    logging.info(f"Iniciando {len(tasks)} tarefas para links internos de {url}")
+                                    await asyncio.gather(*tasks)
+                            else:
+                                logging.info(f"Profundidade máxima alcançada para {url}")
+                                    
+            except (aiohttp.ClientError, asyncio.TimeoutError, ValueError) as e:
+                logging.error(f"Erro ao processar {url}: {str(e)}")
+                self.content.append({
+                    "url": url,
+                    "error": str(e),
+                    "timestamp": datetime.utcnow().isoformat(),
+                    "status": "failed"
+                })
+    
+    async def save_content(self, output_dir="output"):
+        """
+        Salva o conteúdo coletado em arquivos Markdown.
+
+        :param output_dir: Diretório onde os arquivos serão salvos.
+        """
+        os.makedirs(output_dir, exist_ok=True)
+        
+        organized_content = {}
+        for item in self.content:
+            depth = item["depth"]
+            organized_content.setdefault(depth, []).append(item)
+        
+        tasks = []
+        for depth, items in organized_content.items():
+            filename = os.path.join(output_dir, f"nivel_{depth}.md")
+            tasks.append(self._async_write_file(filename, items))
+        
+        await asyncio.gather(*tasks)
+    
+    async def _async_write_file(self, filename, items):
+        """
+        Escreve o conteúdo em um arquivo Markdown de forma assíncrona.
+
+        :param filename: Nome do arquivo.
+        :param items: Conteúdo a ser escrito.
+        """
+        async with aiofiles.open(filename, "w", encoding="utf-8") as f:
+            for item in items:
+                await f.write(f"\n{item['title']}\n\n")
+                await f.write(f"{item['content']}\n" if "content" in item else "")
+                await f.write(f"Status: {item['status']}\n")
+                await f.write("-" * 80 + "\n")
+
+async def main():
+    import config
+    
+    logging.info("🕷️ Iniciando crawling...")
+    crawler = CleanCrawler(
+        max_depth=config.MAX_DEPTH,
+        max_concurrency=config.MAX_CONCURRENCY,
+        base_domain=config.BASE_URL
+    )
+    await crawler.crawl_page(config.BASE_URL)
+        
+    logging.info(f"✅ Páginas crawleadas: {len(crawler.visited)}")
+    await crawler.save_content(output_dir=config.OUTPUT_DIR)
+    logging.info(f"✅ Conteúdo salvo na pasta {config.OUTPUT_DIR}/")
+
+if __name__ == "__main__":
+    asyncio.run(main())
@@ -0,0 +1,3 @@
+crawl4ai>=0.2.0
+aiohttp>=3.9.0
+aiofiles>=23.0.0