# Files
# scripts/crawl4all/4ai.py
# 140 lines · 4.8 KiB · Python · Executable File
"""
4ai.py
Author: Descomplicar® Crescimento Digital
Link: https://descomplicar.pt
Copyright: 2025 Descomplicar®
"""
import asyncio
from random import uniform
from datetime import datetime
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
import json
import os
import aiofiles
from urllib.parse import urlparse
class CleanCrawler:
    """Asynchronous recursive crawler built on crawl4ai.

    Starting from a seed URL, fetches pages up to ``max_depth`` levels deep,
    stores each page's markdown in ``self.content``, and can dump the results
    to per-depth markdown files via :meth:`save_content`.
    """

    def __init__(self, max_depth=2, max_concurrency=10, base_domain=None):
        """Configure crawl limits and reusable crawl4ai settings.

        Args:
            max_depth: Maximum recursion depth (seed page is depth 0).
            max_concurrency: Maximum number of simultaneous page fetches
                (enforced via ``self.semaphore`` in :meth:`crawl_page`).
            base_domain: Optional URL whose hostname restricts crawling to
                that domain; ``None`` disables the restriction.
        """
        self.max_depth = max_depth
        self.visited = set()     # URLs already claimed by a crawl task
        self.content = []        # dicts: depth / url / title / content
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.base_domain = urlparse(base_domain).hostname if base_domain else None
        # Reusable configurations — built once here and shared by every
        # crawl_page call (previously crawl_page rebuilt identical copies).
        self.browser_config = BrowserConfig(
            headless=True,
            viewport_width=1280,
            viewport_height=720
        )
        self.crawler_config = CrawlerRunConfig(
            word_count_threshold=10,
            exclude_external_links=True,
            remove_overlay_elements=True
        )
        self.retry_attempts = 3    # fetch attempts per URL
        self.request_timeout = 30  # per-attempt timeout, seconds

    async def crawl_page(self, url, depth=0):
        """Fetch *url*, record its markdown, and recurse into internal links.

        Retries transient failures with exponential backoff; any other error
        is reported and swallowed so one bad page does not abort the crawl.
        """
        if depth > self.max_depth or url in self.visited:
            return
        # Optional same-domain filter (base_domain was previously parsed in
        # __init__ but never consulted).
        if self.base_domain and urlparse(url).hostname != self.base_domain:
            return
        # Claim the URL *before* fetching: sibling tasks launched by
        # asyncio.gather below would otherwise race and crawl it twice.
        self.visited.add(url)
        try:
            async with AsyncWebCrawler(config=self.browser_config) as crawler:
                # Bound concurrent fetches; released before recursing so
                # nested calls cannot deadlock on the semaphore.
                async with self.semaphore:
                    result = None
                    for attempt in range(self.retry_attempts):
                        try:
                            result = await asyncio.wait_for(
                                crawler.arun(url=url, config=self.crawler_config),
                                timeout=self.request_timeout
                            )
                            break
                        except (asyncio.TimeoutError, ConnectionError):
                            if attempt == self.retry_attempts - 1:
                                raise
                            # Exponential backoff: 1s, 2s, 4s, ...
                            await asyncio.sleep(2 ** attempt)
                    else:
                        # Loop exhausted without break — nothing fetched.
                        return
                if result.success:
                    # Store only the relevant content, structured by depth.
                    title = f"# {url}"
                    content = result.markdown
                    self.content.append({
                        "depth": depth,
                        "url": url,
                        "title": title,
                        "content": content
                    })
                    # Recurse into internal links unless already at max depth.
                    if depth < self.max_depth:
                        internal_links = result.links.get("internal", [])
                        tasks = [
                            self.crawl_page(link["href"], depth + 1)
                            for link in internal_links
                            if link["href"] not in self.visited
                        ]
                        if tasks:
                            await asyncio.gather(*tasks)
        except Exception as e:
            # Best-effort crawl: report and continue with other pages.
            print(f"Erro ao processar {url}: {e}")

    async def save_content(self, output_dir="output"):
        """Write collected pages to ``output_dir``, one file per depth level."""
        os.makedirs(output_dir, exist_ok=True)
        # Group collected pages by crawl depth.
        organized_content = {}
        for item in self.content:
            depth = item["depth"]
            if depth not in organized_content:
                organized_content[depth] = []
            organized_content[depth].append(item)
        # Write all depth files concurrently.
        tasks = []
        for depth, items in organized_content.items():
            filename = os.path.join(output_dir, f"nivel_{depth}.md")
            tasks.append(self._async_write_file(filename, items))
        await asyncio.gather(*tasks)

    async def _async_write_file(self, filename, items):
        """Asynchronously write one depth level's pages to *filename*."""
        async with aiofiles.open(filename, "w", encoding="utf-8") as f:
            for item in items:
                await f.write(f"\n{item['title']}\n\n")
                await f.write(f"{item['content']}\n")
                await f.write("-" * 80 + "\n")
async def main():
    """Crawl the crawl4ai documentation site and save results to ./output."""
    url = "https://crawl4ai.com/mkdocs/"
    print("🕷️ Iniciando crawling...")
    crawler = CleanCrawler(max_depth=2)
    await crawler.crawl_page(url)
    print(f"✅ Páginas crawleadas: {len(crawler.visited)}")
    # BUG FIX: save_content is a coroutine — the original called it without
    # await, so the coroutine was never executed and no files were written.
    await crawler.save_content()
    print("✅ Conteúdo salvo na pasta output/")


if __name__ == "__main__":
    asyncio.run(main())