init: scripts diversos (crawlers, conversores, scrapers)

2026-03-05 20:38:36 +00:00
commit 6ac6f4be2a
925 changed files with 850330 additions and 0 deletions
@@ -0,0 +1,129 @@
+"""
+Teste de extração - versão com parsing corrigido
+"""
+
+import os
+import json
+import requests
+from pathlib import Path
+from dotenv import load_dotenv
+
+# Populate the process environment from a .env file, when one is found,
+# before the key is read below.
+load_dotenv()
+
+# OpenRouter credentials and chat-completions endpoint.
+# API_KEY is None when OPENROUTER_API_KEY is not set in the environment.
+API_KEY = os.getenv("OPENROUTER_API_KEY")
+API_URL = "https://openrouter.ai/api/v1/chat/completions"
+
+def test_single_file():
+    """Testa extração em 1 ficheiro."""
+    
+    # Ler ficheiro
+    input_file = Path("/media/ealmeida/Dados/GDrive/Cloud/Clientes_360/CTF_Carstuff/KB/Scrapper/sites/output_md/thehogring.com_1.md")
+    
+    with open(input_file, 'r', encoding='utf-8') as f:
+        content = f.read()
+    
+    print(f"📂 Ficheiro: {input_file.name}")
+    print(f"📏 Tamanho: {len(content)} bytes")
+    
+    if len(content) < 1000:
+        print("⚠️  Ficheiro muito pequeno - SKIP")
+        return
+    
+    # Prompt simplificado para teste
+    prompt = f"""És um especialista em estofamento automotivo.
+
+Analisa este texto e extrai conhecimento útil.
+
+IGNORAR: navegação, publicidade, comentários genéricos.
+
+EXTRAIR APENAS SE EXISTIR:
+- Problemas técnicos de estofamento
+- Perguntas sobre materiais
+- Soluções práticas
+
+FORMATO JSON:
+{{
+  "relevante": true/false,
+  "problema": "descrição se existir",
+  "materiais": ["lista se existir"]
+}}
+
+SE NÃO HOUVER CONTEÚDO ÚTIL: {{"relevante": false}}
+
+TEXTO:
+{content[:3000]}
+
+Responde APENAS com o JSON, sem texto adicional."""
+
+    print("\n🔄 A enviar para Gemini...")
+    
+    headers = {
+        "Authorization": f"Bearer {API_KEY}",
+        "Content-Type": "application/json",
+        "HTTP-Referer": "https://descomplicar.pt",
+        "X-Title": "CTF Knowledge Test"
+    }
+
+    data = {
+        "model": "google/gemini-2.5-flash",
+        "messages": [{"role": "user", "content": prompt}],
+        "temperature": 0.2,
+        "max_tokens": 1000
+    }
+
+    try:
+        response = requests.post(API_URL, headers=headers, json=data, timeout=60)
+        
+        if response.status_code == 200:
+            result = response.json()
+            content_text = result['choices'][0]['message']['content']
+            
+            print("\n📥 Resposta bruta:")
+            print(repr(content_text)[:200])
+            
+            # Parsing robusto
+            try:
+                # Remover blocos markdown
+                if '```json' in content_text:
+                    content_text = content_text.split('```json')[1].split('```')[0]
+                elif '```' in content_text:
+                    content_text = content_text.split('```')[1].split('```')[0]
+
+                # Limpeza
+                content_text = content_text.strip()
+
+                # Parse
+                knowledge = json.loads(content_text)
+                
+                print("\n✅ JSON PARSE SUCESSO!")
+                print(json.dumps(knowledge, indent=2, ensure_ascii=False))
+                
+                return knowledge
+
+            except json.JSONDecodeError as e:
+                print(f"\n❌ JSON parse falhou: {e}")
+                print(f"Primeiros 300 chars após limpeza:")
+                print(repr(content_text[:300]))
+                
+                # Fallback: extrair { ... }
+                start = content_text.find('{')
+                end = content_text.rfind('}') + 1
+                if start != -1 and end > start:
+                    clean_json = content_text[start:end]
+                    try:
+                        knowledge = json.loads(clean_json)
+                        print("\n✅ FALLBACK SUCESSO!")
+                        print(json.dumps(knowledge, indent=2, ensure_ascii=False))
+                        return knowledge
+                    except:
+                        print("❌ Fallback também falhou")
+                        
+        else:
+            print(f"❌ Erro API: {response.status_code}")
+            print(response.text[:500])
+            
+    except Exception as e:
+        print(f"❌ Exceção: {e}")
+
+# Run the single-file extraction test only when executed as a script,
+# not when this module is imported.
+if __name__ == "__main__":
+    test_single_file()