init: scripts diversos (crawlers, conversores, scrapers)

This commit is contained in:
2026-03-05 20:38:36 +00:00
commit 6ac6f4be2a
925 changed files with 850330 additions and 0 deletions

6
crawl4all/.gitignore vendored Normal file
View File

@@ -0,0 +1,6 @@
.env
venv/
.venv/
output/
__pycache__/
*.pyc

139
crawl4all/4ai.py Executable file
View File

@@ -0,0 +1,139 @@
"""
4ai.py
Author: Descomplicar® Crescimento Digital
Link: https://descomplicar.pt
Copyright: 2025 Descomplicar®
"""
import asyncio
from random import uniform
from datetime import datetime
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
import json
import os
import aiofiles
from urllib.parse import urlparse
class CleanCrawler:
    def __init__(self, max_depth=2, max_concurrency=10, base_domain=None):
        """Set up crawl state and the reusable crawl4ai configuration.

        :param max_depth: maximum link-following depth
        :param max_concurrency: size of the semaphore bounding parallel fetches
        :param base_domain: optional URL whose hostname limits the crawl scope
        """
        self.max_depth = max_depth
        self.visited = set()        # URLs already crawled successfully
        self.content = []           # collected page records
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.base_domain = urlparse(base_domain).hostname if base_domain else None
        self.retry_attempts = 3     # attempts per URL before giving up
        self.request_timeout = 30   # per-request timeout, seconds
        # Reusable configuration objects shared by every request.
        self.browser_config = BrowserConfig(
            headless=True,
            viewport_width=1280,
            viewport_height=720,
        )
        self.crawler_config = CrawlerRunConfig(
            word_count_threshold=10,
            exclude_external_links=True,
            remove_overlay_elements=True,
        )
async def crawl_page(self, url, depth=0):
if depth > self.max_depth or url in self.visited:
return
# Configurações do browser
browser_config = BrowserConfig(
headless=True,
viewport_width=1280,
viewport_height=720
)
# Configurações do crawler
crawler_config = CrawlerRunConfig(
word_count_threshold=10,
exclude_external_links=True,
remove_overlay_elements=True
)
async with AsyncWebCrawler(config=browser_config) as crawler:
for attempt in range(self.retry_attempts):
try:
result = await asyncio.wait_for(
crawler.arun(url=url, config=crawler_config),
timeout=self.request_timeout
)
break
except (asyncio.TimeoutError, ConnectionError) as e:
if attempt == self.retry_attempts - 1:
raise
await asyncio.sleep(2 ** attempt)
else:
return
if result.success:
self.visited.add(url)
# Extrair apenas o conteúdo relevante
title = f"# {url}"
content = result.markdown
# Armazenar de forma estruturada
self.content.append({
"depth": depth,
"url": url,
"title": title,
"content": content
})
# Processar links da página se não atingiu profundidade máxima
if depth < self.max_depth:
internal_links = result.links.get("internal", [])
tasks = []
for link in internal_links:
if link["href"] not in self.visited:
tasks.append(self.crawl_page(link["href"], depth + 1))
if tasks:
await asyncio.gather(*tasks)
except Exception as e:
print(f"Erro ao processar {url}: {e}")
async def save_content(self, output_dir="output"):
os.makedirs(output_dir, exist_ok=True)
# Organizar conteúdo por profundidade
organized_content = {}
for item in self.content:
depth = item["depth"]
if depth not in organized_content:
organized_content[depth] = []
organized_content[depth].append(item)
# Salvar arquivos de forma assíncrona
tasks = []
for depth, items in organized_content.items():
filename = os.path.join(output_dir, f"nivel_{depth}.md")
tasks.append(self._async_write_file(filename, items))
await asyncio.gather(*tasks)
async def _async_write_file(self, filename, items):
async with aiofiles.open(filename, "w", encoding="utf-8") as f:
for item in items:
await f.write(f"\n{item['title']}\n\n")
await f.write(f"{item['content']}\n")
await f.write("-" * 80 + "\n")
async def main():
    """Crawl the crawl4ai documentation site and persist the results."""
    url = "https://crawl4ai.com/mkdocs/"
    print("🕷️ Iniciando crawling...")
    crawler = CleanCrawler(max_depth=2)
    await crawler.crawl_page(url)
    print(f"✅ Páginas crawleadas: {len(crawler.visited)}")
    # Bug fix: save_content is a coroutine; without `await` it was never
    # executed and nothing was ever written to disk.
    await crawler.save_content()
    print("✅ Conteúdo salvo na pasta output/")
if __name__ == "__main__":
    asyncio.run(main())

23
crawl4all/README.md Normal file
View File

@@ -0,0 +1,23 @@
# crawl4all - Async Web Crawler
Crawler assíncrono baseado em crawl4ai com profundidade configurável.
## Setup
```bash
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```
## Uso
```bash
# Editar config.py com URL alvo
python r1.py
```
## Configuração
Editar `config.py`:
- `BASE_URL` - URL alvo
- `MAX_DEPTH` - Profundidade máxima (default: 2)
- `MAX_CONCURRENCY` - Workers paralelos (default: 10)
- `OUTPUT_DIR` - Pasta de output (default: "output")

18
crawl4all/config.py Executable file
View File

@@ -0,0 +1,18 @@
import os  # NOTE(review): os appears unused in this module — confirm before removing
# Crawl target and traversal limits.
BASE_URL = "https://descomplicar.pt/"
MAX_DEPTH = 2
MAX_CONCURRENCY = 10
# Retry/timeout policy applied to every request.
RETRY_ATTEMPTS = 3
REQUEST_TIMEOUT = 30
# Directory where the Markdown output files are written.
OUTPUT_DIR = "output"
# Keyword arguments consumed when building crawl4ai's BrowserConfig.
BROWSER_CONFIG = {
    "headless": True,
    "viewport_width": 1280,
    "viewport_height": 720
}
# Keyword arguments consumed when building crawl4ai's CrawlerRunConfig.
CRAWLER_CONFIG = {
    "word_count_threshold": 10,
    "exclude_external_links": True,
    "remove_overlay_elements": True
}

241
crawl4all/r1.py Executable file
View File

@@ -0,0 +1,241 @@
"""
r1.py
Author: Descomplicar® Crescimento Digital
Link: https://descomplicar.pt
Copyright: 2025 Descomplicar®
"""
import asyncio
import logging
import aiohttp
from typing import Optional, Dict, List, Set, Any
from random import uniform
from datetime import datetime
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
import aiofiles
import os
from urllib.parse import urlparse
import config # Importando o módulo config
from typing import Optional, Dict, List, Set, Any
from typing import Optional, Dict, List, Set, Any
# Configuração do logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class CleanCrawler:
    """Crawler assíncrono para coletar conteúdo de sites."""
    def __init__(self, max_depth: int = 2, max_concurrency: int = 10, base_domain: Optional[str] = None) -> None:
        """
        Validate arguments and module configuration, then set up crawl state.

        :param max_depth: maximum navigation depth (>= 0)
        :param max_concurrency: maximum parallel requests (>= 1)
        :param base_domain: optional base URL restricting the crawl scope
        :raises ValueError: on invalid parameters or missing configuration
        """
        if max_depth < 0 or max_concurrency < 1:
            raise ValueError("Parâmetros inválidos: max_depth deve ser >= 0 e max_concurrency >= 1")
        # Fail fast when the config module is incomplete.
        for required in ('RETRY_ATTEMPTS', 'REQUEST_TIMEOUT', 'BROWSER_CONFIG',
                         'CRAWLER_CONFIG', 'MAX_DEPTH', 'MAX_CONCURRENCY',
                         'BASE_URL', 'OUTPUT_DIR'):
            if not hasattr(config, required):
                raise ValueError(f"Configuração obrigatória ausente: {required}")
        self.max_depth = max_depth
        self.visited: Set[str] = set()
        self.content: List[Dict[str, Any]] = []
        self.semaphore = asyncio.Semaphore(max_concurrency)
        self.base_domain = urlparse(base_domain).hostname if base_domain else None
        self.retry_attempts: int = config.RETRY_ATTEMPTS
        self.request_timeout: int = config.REQUEST_TIMEOUT
        # Shared crawl4ai configuration built once from config.py.
        browser_opts = config.BROWSER_CONFIG
        self.browser_config = BrowserConfig(
            headless=browser_opts["headless"],
            viewport_width=browser_opts["viewport_width"],
            viewport_height=browser_opts["viewport_height"],
        )
        crawler_opts = config.CRAWLER_CONFIG
        self.crawler_config = CrawlerRunConfig(
            word_count_threshold=crawler_opts["word_count_threshold"],
            exclude_external_links=crawler_opts["exclude_external_links"],
            remove_overlay_elements=crawler_opts["remove_overlay_elements"],
        )
def _is_valid_url(self, url: str) -> bool:
"""
Valida URLs com lógica expandida para captura de conteúdo
:param url: URL a ser validada
:return: True se a URL for válida para crawling
"""
if not self.base_domain:
logging.debug(f"Permitindo URL sem domínio base: {url}")
return True
try:
parsed = urlparse(url)
# URLs relativas sempre permitidas
if not parsed.netloc:
logging.debug(f"Permitindo URL relativa: {url}")
return True
# Normaliza domínios para comparação
target_domain = parsed.netloc.lower()
base_domain = self.base_domain.lower()
# Verifica subdomínios e domínio principal
if target_domain == base_domain:
return True
if target_domain.endswith(f".{base_domain}"):
logging.debug(f"Permitindo subdomínio válido: {url}")
return True
logging.debug(f"Bloqueando domínio não relacionado: {url}")
return False
except Exception as e:
logging.error(f"Erro crítico na validação de URL: {str(e)}")
return False
async def crawl_page(self, url, depth=0, parent_url=None):
"""
Realiza o crawling de uma página.
:param url: URL da página a ser crawleada.
:param depth: Profundidade atual de navegação.
:param parent_url: URL da página pai, se houver.
"""
logging.info(f"Realizando crawling da URL: {url}, profundidade: {depth}, parent_url: {parent_url}")
if depth > self.max_depth:
logging.info(f"Profundidade máxima alcançada ou URL já visitada ou URL inválida: {url}")
return
if url in self.visited:
logging.info(f"URL já visitada: {url}")
return
if not self._is_valid_url(url):
logging.info(f"URL inválida: {url}")
return
async with self.semaphore:
try:
# Delay para politeness policy
await asyncio.sleep(uniform(0.5, 1.5))
async with AsyncWebCrawler(config=self.browser_config) as crawler:
logging.info(f"Iniciando crawler para a URL: {url}, config: {self.crawler_config}")
for attempt in range(self.retry_attempts):
try:
logging.info(f"Tentativa {attempt + 1} de {self.retry_attempts} para {url}")
result = await asyncio.wait_for(
crawler.arun(url=url, config=self.crawler_config),
timeout=self.request_timeout
)
logging.info(f"Sucesso ao acessar {url}, result: {result}")
break
except (asyncio.TimeoutError, ConnectionError) as e:
if attempt == self.retry_attempts - 1:
logging.error(f"Erro ao acessar {url} após {attempt + 1} tentativas: {str(e)}")
raise
await asyncio.sleep(2 ** attempt)
logging.info(f"Tentativa {attempt + 2} de {self.retry_attempts} para {url}")
if result.success:
self.visited.add(url)
logging.info(f"URL visitada com sucesso: {url}")
title = f"# {url}"
content = result.markdown
self.content.append({
"depth": depth,
"url": url,
"title": title,
"content": content,
"timestamp": datetime.utcnow().isoformat(),
"parent_url": parent_url,
"status": "success"
})
if depth < self.max_depth:
internal_links = result.links.get("internal", [])
tasks = [
self.crawl_page(link["href"], depth + 1, parent_url=url)
for link in internal_links
if link["href"] not in self.visited
]
if tasks:
logging.info(f"Iniciando {len(tasks)} tarefas para links internos de {url}")
await asyncio.gather(*tasks)
else:
logging.info(f"Profundidade máxima alcançada para {url}")
except (aiohttp.ClientError, asyncio.TimeoutError, ValueError) as e:
logging.error(f"Erro ao processar {url}: {str(e)}")
self.content.append({
"url": url,
"error": str(e),
"timestamp": datetime.utcnow().isoformat(),
"status": "failed"
})
async def save_content(self, output_dir="output"):
"""
Salva o conteúdo coletado em arquivos Markdown.
:param output_dir: Diretório onde os arquivos serão salvos.
"""
os.makedirs(output_dir, exist_ok=True)
organized_content = {}
for item in self.content:
depth = item["depth"]
organized_content.setdefault(depth, []).append(item)
tasks = []
for depth, items in organized_content.items():
filename = os.path.join(output_dir, f"nivel_{depth}.md")
tasks.append(self._async_write_file(filename, items))
await asyncio.gather(*tasks)
async def _async_write_file(self, filename, items):
"""
Escreve o conteúdo em um arquivo Markdown de forma assíncrona.
:param filename: Nome do arquivo.
:param items: Conteúdo a ser escrito.
"""
async with aiofiles.open(filename, "w", encoding="utf-8") as f:
for item in items:
await f.write(f"\n{item['title']}\n\n")
await f.write(f"{item['content']}\n" if "content" in item else "")
await f.write(f"Status: {item['status']}\n")
await f.write("-" * 80 + "\n")
async def main():
    """Entry point: crawl config.BASE_URL and save the collected content."""
    # Fix: dropped the redundant local `import config` — the module already
    # imports config at the top level.
    logging.info("🕷️ Iniciando crawling...")
    crawler = CleanCrawler(
        max_depth=config.MAX_DEPTH,
        max_concurrency=config.MAX_CONCURRENCY,
        base_domain=config.BASE_URL
    )
    await crawler.crawl_page(config.BASE_URL)
    logging.info(f"✅ Páginas crawleadas: {len(crawler.visited)}")
    await crawler.save_content(output_dir=config.OUTPUT_DIR)
    logging.info(f"✅ Conteúdo salvo na pasta {config.OUTPUT_DIR}/")
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,3 @@
crawl4ai>=0.2.0
aiohttp>=3.9.0
aiofiles>=23.0.0