init: scripts diversos (crawlers, conversores, scrapers)

This commit is contained in:
2026-03-05 20:38:36 +00:00
commit 6ac6f4be2a
925 changed files with 850330 additions and 0 deletions

43
scraper/test_improved_parser.py Executable file
View File

@@ -0,0 +1,43 @@
import json
# Simulate a Gemini response: a JSON object wrapped in a ```json markdown fence.
gemini_response = '```json\n{\n "relevante": true,\n "problema": "Não há um problema explícito no texto."\n}\n```'


def strip_markdown_fence(text):
    """Return *text* with a surrounding markdown code fence removed.

    If the text contains a ```json fence, the content between that marker
    and the next ``` is returned. Otherwise, if it contains any ``` fence,
    the content between the first and second ``` is returned. Text without
    a fence is returned unchanged. In every case the result is stripped of
    leading and trailing whitespace.
    """
    if '```json' in text:
        text = text.split('```json')[1].split('```')[0]
    elif '```' in text:
        text = text.split('```')[1].split('```')[0]
    return text.strip()


def extract_braced_object(text):
    """Return the substring from the first '{' to the last '}' in *text*.

    Returns None when there is no '{', or when no '}' follows it, so the
    caller can tell "nothing to extract" apart from an extracted string.
    """
    start = text.find('{')
    # rfind() yields -1 when '}' is absent, which makes end == 0 and fails
    # the end > start check below.
    end = text.rfind('}') + 1
    if start != -1 and end > start:
        return text[start:end]
    return None


print("RESPOSTA BRUTA:")
print(repr(gemini_response))

# Apply the improved cleanup logic: drop the markdown fence, then trim.
content_text = strip_markdown_fence(gemini_response)
print("\n\nAPÓS LIMPEZA:")
print(repr(content_text))

try:
    knowledge = json.loads(content_text)
except json.JSONDecodeError as e:
    print(f"\n\n❌ JSON PARSE FALHOU: {e}")
    # Fallback: pull out the outermost { ... } span manually and retry.
    clean_json = extract_braced_object(content_text)
    if clean_json is not None:
        print("\n\nFALLBACK EXTRACT:")
        print(repr(clean_json))
        try:
            knowledge = json.loads(clean_json)
        except json.JSONDecodeError as e2:
            # Only parse errors are expected here; anything else should
            # surface instead of being reported as a parse failure.
            print(f"❌ FALLBACK FALHOU: {e2}")
        else:
            print("\n✅ FALLBACK SUCESSO!")
            print(json.dumps(knowledge, indent=2, ensure_ascii=False))
else:
    print("\n\n✅ JSON PARSE SUCESSO!")
    print(json.dumps(knowledge, indent=2, ensure_ascii=False))