init: scripts diversos (crawlers, conversores, scrapers)

This commit is contained in:
2026-03-05 20:38:36 +00:00
commit 6ac6f4be2a
925 changed files with 850330 additions and 0 deletions

38
scraper/test_single_file.py Executable file
View File

@@ -0,0 +1,38 @@
"""
Teste rápido - processar apenas 1 ficheiro
"""
import json
import sys

# Put the scripts directory first on the import path so the extractor import
# below can be resolved (presumably that is where extract_knowledge_ctf lives;
# the path is relative, so it depends on the current working directory).
sys.path.insert(0, '.claude-work/scripts')

from pathlib import Path

from extract_knowledge_ctf import KnowledgeExtractor

INPUT_DIR = "/media/ealmeida/Dados/GDrive/Cloud/Clientes_360/CTF_Carstuff/KB/Scrapper/sites/output_md"
OUTPUT_DIR = "/media/ealmeida/Dados/GDrive/Cloud/Clientes_360/CTF_Carstuff/KB/Scrapper/sites/knowledge_base_test"

# Create the destination folder (and any missing parents) up front.
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(parents=True, exist_ok=True)

extractor = KnowledgeExtractor()

# Exercise the extractor against one specific file only.
test_file = Path(INPUT_DIR, "thehogring.com_1.md")

if not test_file.exists():
    print(f"❌ Ficheiro não existe: {test_file}")
else:
    print(f"📂 Testando: {test_file.name}")
    result = extractor.process_file(test_file, output_dir)

    if not result:
        # A falsy result is reported as a failure or as non-relevant content.
        print("❌ Falha ou conteúdo não relevante")
    else:
        print("✅ SUCESSO! Ficheiro processado")

        # Look for the JSON file named after the input's stem and, when it is
        # there, show the first 500 characters of its pretty-printed content.
        output_file = output_dir / f"knowledge_{test_file.stem}.json"
        if output_file.exists():
            data = json.loads(output_file.read_text(encoding='utf-8'))
            print("\n📊 Conhecimento extraído:")
            preview = json.dumps(data, indent=2, ensure_ascii=False)
            print(preview[:500])