140 lines
4.8 KiB
Python
Executable File
140 lines
4.8 KiB
Python
Executable File
"""
4ai.py

Author: Descomplicar® Crescimento Digital
Link: https://descomplicar.pt
Copyright: 2025 Descomplicar®
"""
|
|
|
|
import asyncio
|
|
from random import uniform
|
|
from datetime import datetime
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
|
import json
|
|
import os
|
|
import aiofiles
|
|
from urllib.parse import urlparse
|
|
|
|
class CleanCrawler:
    """Asynchronous, depth-limited web crawler built on crawl4ai.

    Crawls a page and its internal links up to ``max_depth``, collecting each
    page's markdown in memory; :meth:`save_content` then writes one markdown
    file per crawl depth.
    """

    def __init__(self, max_depth=2, max_concurrency=10, base_domain=None):
        self.max_depth = max_depth
        self.visited = set()   # URLs already claimed by a crawl task
        self.content = []      # structured records of crawled pages
        # Bounds the number of concurrent page fetches (was previously
        # created but never acquired, so max_concurrency had no effect).
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.base_domain = urlparse(base_domain).hostname if base_domain else None

        # Reusable configurations (previously re-created on every
        # crawl_page call, ignoring these attributes).
        self.browser_config = BrowserConfig(
            headless=True,
            viewport_width=1280,
            viewport_height=720,
        )
        self.crawler_config = CrawlerRunConfig(
            word_count_threshold=10,
            exclude_external_links=True,
            remove_overlay_elements=True,
        )
        self.retry_attempts = 3    # fetch attempts per page
        self.request_timeout = 30  # per-request timeout in seconds

    async def crawl_page(self, url, depth=0):
        """Crawl *url*, store its markdown, and recurse into internal links.

        Stops beyond ``self.max_depth`` and skips URLs already visited.
        Fetches are retried with exponential backoff on timeout/connection
        errors; any other error is logged and swallowed so one bad page
        does not abort the whole crawl.
        """
        if depth > self.max_depth or url in self.visited:
            return
        # Claim the URL before fetching: sibling tasks launched by
        # asyncio.gather would otherwise all pass the visited check and
        # crawl the same URL multiple times.
        self.visited.add(url)

        try:
            async with self.semaphore:  # enforce max_concurrency
                async with AsyncWebCrawler(config=self.browser_config) as crawler:
                    result = None
                    for attempt in range(self.retry_attempts):
                        try:
                            result = await asyncio.wait_for(
                                crawler.arun(url=url, config=self.crawler_config),
                                timeout=self.request_timeout,
                            )
                            break
                        except (asyncio.TimeoutError, ConnectionError):
                            if attempt == self.retry_attempts - 1:
                                raise
                            # Exponential backoff: 1s, 2s, 4s, ...
                            await asyncio.sleep(2 ** attempt)

            if result is None or not result.success:
                return

            # Store only the relevant content, structured per page.
            self.content.append({
                "depth": depth,
                "url": url,
                "title": f"# {url}",
                "content": result.markdown,
            })

            # Recurse into internal links if max depth not yet reached.
            if depth < self.max_depth:
                internal_links = result.links.get("internal", [])
                tasks = [
                    self.crawl_page(link["href"], depth + 1)
                    for link in internal_links
                    if link["href"] not in self.visited
                ]
                if tasks:
                    await asyncio.gather(*tasks)
        except Exception as e:
            # Best-effort crawl: report the failure and keep going.
            print(f"Erro ao processar {url}: {e}")

    async def save_content(self, output_dir="output"):
        """Write collected pages under *output_dir*, one file per depth."""
        os.makedirs(output_dir, exist_ok=True)

        # Group the collected pages by crawl depth.
        organized_content = {}
        for item in self.content:
            organized_content.setdefault(item["depth"], []).append(item)

        # Write all depth files concurrently.
        await asyncio.gather(*(
            self._async_write_file(
                os.path.join(output_dir, f"nivel_{depth}.md"), items
            )
            for depth, items in organized_content.items()
        ))

    async def _async_write_file(self, filename, items):
        """Asynchronously write *items* to *filename*, separated by rules."""
        async with aiofiles.open(filename, "w", encoding="utf-8") as f:
            for item in items:
                await f.write(f"\n{item['title']}\n\n")
                await f.write(f"{item['content']}\n")
                await f.write("-" * 80 + "\n")
|
|
|
|
async def main():
    """Crawl the crawl4ai docs site and save the results to ./output."""
    url = "https://crawl4ai.com/mkdocs/"

    print("🕷️ Iniciando crawling...")
    crawler = CleanCrawler(max_depth=2)
    await crawler.crawl_page(url)

    print(f"✅ Páginas crawleadas: {len(crawler.visited)}")
    # BUG FIX: save_content is a coroutine — it was previously called
    # without `await`, so the coroutine was never executed and no files
    # were ever written to disk.
    await crawler.save_content()
    print("✅ Conteúdo salvo na pasta output/")


if __name__ == "__main__":
    asyncio.run(main())
|