"""Depth-limited asynchronous site crawler built on crawl4ai.

4ai.py
Author: Descomplicar® Crescimento Digital
Link: https://descomplicar.pt
Copyright: 2025 Descomplicar®
"""

import asyncio
import json
import os
from collections import defaultdict
from datetime import datetime
from random import uniform
from urllib.parse import urlparse

import aiofiles
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig


class CleanCrawler:
    """Recursively crawl internal links and collect each page's markdown.

    Successfully crawled pages are recorded in ``visited`` (URLs) and
    ``content`` (one dict per page with ``depth``, ``url``, ``title`` and
    ``content`` keys). ``save_content`` writes the collected pages to one
    markdown file per depth level.
    """

    def __init__(self, max_depth=2, max_concurrency=10, base_domain=None):
        """Set up crawl limits and the shared crawl4ai configuration.

        Args:
            max_depth: Deepest link level to follow; the start URL is depth 0.
            max_concurrency: Maximum number of page fetches in flight at once.
            base_domain: Optional URL whose hostname is stored on the
                instance. NOTE(review): it is stored but not used to filter
                links; external links are excluded through
                ``exclude_external_links`` in the crawler config instead.
        """
        self.max_depth = max_depth
        self.visited = set()
        self.content = []
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.base_domain = urlparse(base_domain).hostname if base_domain else None

        # URLs some task has already started on. Kept apart from ``visited``
        # so that ``visited`` still contains only successfully crawled pages.
        self._claimed = set()

        # Reusable configuration shared by every page fetch.
        self.browser_config = BrowserConfig(
            headless=True,
            viewport_width=1280,
            viewport_height=720,
        )
        self.crawler_config = CrawlerRunConfig(
            word_count_threshold=10,
            exclude_external_links=True,
            remove_overlay_elements=True,
        )

        self.retry_attempts = 3  # Fetch attempts per page
        self.request_timeout = 30  # Per-attempt timeout in seconds

    async def _fetch(self, url):
        """Fetch ``url`` with retries, bounded by the concurrency semaphore.

        Returns the crawl result, or ``None`` when ``retry_attempts`` is 0.
        Re-raises the timeout/connection error of the final failed attempt.
        """
        # The semaphore is held only while fetching, never while waiting on
        # child pages, so nested crawls cannot deadlock on it.
        async with self.semaphore:
            async with AsyncWebCrawler(config=self.browser_config) as crawler:
                for attempt in range(self.retry_attempts):
                    try:
                        return await asyncio.wait_for(
                            crawler.arun(url=url, config=self.crawler_config),
                            timeout=self.request_timeout,
                        )
                    except (asyncio.TimeoutError, ConnectionError):
                        if attempt == self.retry_attempts - 1:
                            raise
                        # Exponential backoff: 1s, 2s, 4s, ...
                        await asyncio.sleep(2 ** attempt)
        return None

    async def crawl_page(self, url, depth=0):
        """Crawl ``url`` and, below ``max_depth``, its internal links.

        Pages beyond ``max_depth`` and URLs already crawled or in progress
        are skipped. Any error while processing the page is printed and
        swallowed so that one bad page does not abort the whole crawl.

        Args:
            url: Absolute URL of the page to crawl.
            depth: Link distance from the start URL (0 for the start URL).
        """
        if depth > self.max_depth or url in self.visited or url in self._claimed:
            return
        # Claim the URL before the first await so concurrent tasks that found
        # the same link do not fetch it a second time.
        self._claimed.add(url)

        try:
            result = await self._fetch(url)
            if result is None or not result.success:
                return

            self.visited.add(url)

            # Keep only the relevant content, stored in a structured form.
            self.content.append({
                "depth": depth,
                "url": url,
                "title": f"# {url}",
                "content": result.markdown,
            })

            # Follow the page's links unless the maximum depth was reached.
            if depth >= self.max_depth:
                return

            tasks = []
            queued = set()
            for link in result.links.get("internal", []):
                href = link["href"]
                if href in self.visited or href in self._claimed or href in queued:
                    continue
                queued.add(href)
                tasks.append(self.crawl_page(href, depth + 1))
            if tasks:
                await asyncio.gather(*tasks)
        except Exception as e:
            print(f"Erro ao processar {url}: {e}")

    async def save_content(self, output_dir="output"):
        """Write the collected pages to ``output_dir``, one file per depth.

        Args:
            output_dir: Directory to write to; created if it does not exist.
        """
        os.makedirs(output_dir, exist_ok=True)

        # Group the collected pages by crawl depth.
        organized_content = defaultdict(list)
        for item in self.content:
            organized_content[item["depth"]].append(item)

        # Write every depth file concurrently.
        tasks = [
            self._async_write_file(
                os.path.join(output_dir, f"nivel_{depth}.md"), items
            )
            for depth, items in organized_content.items()
        ]
        await asyncio.gather(*tasks)

    async def _async_write_file(self, filename, items):
        """Write ``items`` to ``filename`` as UTF-8 markdown sections.

        Each item contributes its title, its content and an 80-dash rule.
        """
        async with aiofiles.open(filename, "w", encoding="utf-8") as f:
            for item in items:
                await f.write(f"\n{item['title']}\n\n")
                await f.write(f"{item['content']}\n")
                await f.write("-" * 80 + "\n")


async def main():
    """Crawl the crawl4ai documentation site and save it under ``output/``."""
    url = "https://crawl4ai.com/mkdocs/"

    print("🕷️ Iniciando crawling...")
    crawler = CleanCrawler(max_depth=2)
    await crawler.crawl_page(url)

    print(f"✅ Páginas crawleadas: {len(crawler.visited)}")
    # save_content is a coroutine; without ``await`` nothing is ever written.
    await crawler.save_content()
    print("✅ Conteúdo salvo na pasta output/")


if __name__ == "__main__":
    asyncio.run(main())