140 lines
4.8 KiB
Python
Executable File
140 lines
4.8 KiB
Python
Executable File
"""
4ai.py

Author: Descomplicar® Crescimento Digital
Link: https://descomplicar.pt
Copyright: 2025 Descomplicar®
"""
|
|
|
|
import asyncio
|
|
from random import uniform
|
|
from datetime import datetime
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
|
import json
|
|
import os
|
|
import aiofiles
|
|
from urllib.parse import urlparse
|
|
|
|
class CleanCrawler:
    """Asynchronous, depth-limited web crawler built on crawl4ai.

    Crawls a page and its internal links up to ``max_depth``, collecting each
    page's markdown in memory; :meth:`save_content` then writes one markdown
    file per crawl depth.
    """

    def __init__(self, max_depth=2, max_concurrency=10, base_domain=None):
        self.max_depth = max_depth
        self.visited = set()   # URLs already claimed by a crawl task
        self.content = []      # structured records of crawled pages
        # Bounds the number of concurrent page fetches (was previously
        # created but never acquired, so max_concurrency had no effect).
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.base_domain = urlparse(base_domain).hostname if base_domain else None

        # Reusable configurations (previously re-created on every
        # crawl_page call, ignoring these attributes).
        self.browser_config = BrowserConfig(
            headless=True,
            viewport_width=1280,
            viewport_height=720,
        )
        self.crawler_config = CrawlerRunConfig(
            word_count_threshold=10,
            exclude_external_links=True,
            remove_overlay_elements=True,
        )
        self.retry_attempts = 3    # fetch attempts per page
        self.request_timeout = 30  # per-request timeout in seconds

    async def crawl_page(self, url, depth=0):
        """Crawl *url*, store its markdown, and recurse into internal links.

        Stops beyond ``self.max_depth`` and skips URLs already visited.
        Fetches are retried with exponential backoff on timeout/connection
        errors; any other error is logged and swallowed so one bad page
        does not abort the whole crawl.
        """
        if depth > self.max_depth or url in self.visited:
            return
        # Claim the URL before fetching: sibling tasks launched by
        # asyncio.gather would otherwise all pass the visited check and
        # crawl the same URL multiple times.
        self.visited.add(url)

        try:
            async with self.semaphore:  # enforce max_concurrency
                async with AsyncWebCrawler(config=self.browser_config) as crawler:
                    result = None
                    for attempt in range(self.retry_attempts):
                        try:
                            result = await asyncio.wait_for(
                                crawler.arun(url=url, config=self.crawler_config),
                                timeout=self.request_timeout,
                            )
                            break
                        except (asyncio.TimeoutError, ConnectionError):
                            if attempt == self.retry_attempts - 1:
                                raise
                            # Exponential backoff: 1s, 2s, 4s, ...
                            await asyncio.sleep(2 ** attempt)

            if result is None or not result.success:
                return

            # Store only the relevant content, structured per page.
            self.content.append({
                "depth": depth,
                "url": url,
                "title": f"# {url}",
                "content": result.markdown,
            })

            # Recurse into internal links if max depth not yet reached.
            if depth < self.max_depth:
                internal_links = result.links.get("internal", [])
                tasks = [
                    self.crawl_page(link["href"], depth + 1)
                    for link in internal_links
                    if link["href"] not in self.visited
                ]
                if tasks:
                    await asyncio.gather(*tasks)
        except Exception as e:
            # Best-effort crawl: report the failure and keep going.
            print(f"Erro ao processar {url}: {e}")

    async def save_content(self, output_dir="output"):
        """Write collected pages under *output_dir*, one file per depth."""
        os.makedirs(output_dir, exist_ok=True)

        # Group the collected pages by crawl depth.
        organized_content = {}
        for item in self.content:
            organized_content.setdefault(item["depth"], []).append(item)

        # Write all depth files concurrently.
        await asyncio.gather(*(
            self._async_write_file(
                os.path.join(output_dir, f"nivel_{depth}.md"), items
            )
            for depth, items in organized_content.items()
        ))

    async def _async_write_file(self, filename, items):
        """Asynchronously write *items* to *filename*, separated by rules."""
        async with aiofiles.open(filename, "w", encoding="utf-8") as f:
            for item in items:
                await f.write(f"\n{item['title']}\n\n")
                await f.write(f"{item['content']}\n")
                await f.write("-" * 80 + "\n")
|
|
|
|
async def main():
    """Crawl the crawl4ai docs site and save the results to ./output."""
    url = "https://crawl4ai.com/mkdocs/"

    print("🕷️ Iniciando crawling...")
    crawler = CleanCrawler(max_depth=2)
    await crawler.crawl_page(url)

    print(f"✅ Páginas crawleadas: {len(crawler.visited)}")
    # BUG FIX: save_content is a coroutine — it was previously called
    # without `await`, so the coroutine was never executed and no files
    # were ever written to disk.
    await crawler.save_content()
    print("✅ Conteúdo salvo na pasta output/")


if __name__ == "__main__":
    asyncio.run(main())
|