init: scripts diversos (crawlers, conversores, scrapers)

2026-03-05 20:38:36 +00:00
commit 6ac6f4be2a
925 changed files with 850330 additions and 0 deletions
@@ -0,0 +1,279 @@
+"""
+extract_knowledge_FINAL.py - Extração Inteligente de Conhecimento (VERSÃO CORRIGIDA)
+
+Objetivo: Extrair APENAS conhecimento útil dos 3,285 ficheiros MD
+
+Author: Descomplicar® Crescimento Digital
+Link: https://descomplicar.pt
+Copyright: 2025 Descomplicar®
+"""
+
+import os
+import json
+import requests
+from pathlib import Path
+from typing import Dict, Optional
+from dotenv import load_dotenv
+
+# Load variables from a local .env file (if present) into the process
+# environment so the os.getenv() lookup below can see them.
+load_dotenv()
+
+# Configuration
+# Directory scanned by main() for the scraped Markdown (*.md) files.
+INPUT_DIR = "/media/ealmeida/Dados/GDrive/Cloud/Clientes_360/CTF_Carstuff/KB/Scrapper/sites/output_md"
+# Directory where the extracted knowledge JSON files are written.
+OUTPUT_DIR = "/media/ealmeida/Dados/GDrive/Cloud/Clientes_360/CTF_Carstuff/KB/Scrapper/sites/knowledge_base"
+# OpenRouter API key; None when the environment variable is not set.
+API_KEY = os.getenv("OPENROUTER_API_KEY")
+
+class KnowledgeExtractor:
+    def __init__(self):
+        self.api_key = API_KEY
+        self.api_url = "https://openrouter.ai/api/v1/chat/completions"
+        self.model = "google/gemini-2.5-flash"
+
+        # Prompt ULTRA-FOCADO
+        self.extraction_prompt = """
+És um especialista em estofamento automotivo, náutico, ferroviário e aeronáutico.
+
+OBJETIVO: Analisar este texto e extrair APENAS conhecimento técnico útil sobre estofamento.
+
+IGNORAR COMPLETAMENTE:
+- Navegação de site
+- Publicidade
+- Comentários genéricos ("obrigado", "bom post")
+- Conversas off-topic
+- Links sem contexto
+
+EXTRAIR APENAS SE EXISTIR:
+1. **Problema técnico específico** de estofamento
+2. **Pergunta sobre materiais** (qual tecido/couro usar, quando, porquê)
+3. **Solução prática** com detalhes técnicos
+4. **Recomendação de material** para situação específica
+
+FORMATO JSON DE SAÍDA:
+{{
+  "relevante": true/false,
+  "categoria_aplicacao": "automovel|automovel-classico|mobiliario|nautica|ferroviaria|aeronautica|geral",
+  "tipo_conteudo": "problema-tecnico|pergunta-material|tutorial|comparacao-materiais|caso-pratico",
+  "problemas": [
+    {{
+      "descricao": "Problema específico identificado",
+      "contexto": "Tipo de veículo/aplicação",
+      "severidade": "baixa|media|alta"
+    }}
+  ],
+  "perguntas_materiais": [
+    {{
+      "pergunta": "Pergunta específica sobre material",
+      "contexto": "Situação/aplicação",
+      "materiais_mencionados": ["vinyl", "leather", "alcantara", etc]
+    }}
+  ],
+  "solucoes": [
+    {{
+      "problema": "O que resolve",
+      "material_recomendado": "Material específico",
+      "tecnica": "Técnica ou método usado",
+      "resultado": "Outcome esperado"
+    }}
+  ],
+  "materiais_discutidos": {{
+    "principais": ["lista de materiais chave"],
+    "alternativos": ["opções alternativas"],
+    "nao_recomendados": ["materiais a evitar"]
+  }},
+  "keywords_tecnicas": ["lista", "de", "termos", "tecnicos"],
+  "aplicabilidade": ["tipos de veículos/situações onde se aplica"],
+  "nivel_expertise": "iniciante|intermedio|avancado"
+}}
+
+SE O TEXTO NÃO CONTIVER NADA ÚTIL, retorna: {{"relevante": false}}
+
+TEXTO PARA ANALISAR:
+---
+{content}
+---
+
+Responde APENAS com o JSON, sem texto adicional.
+"""
+
+    def extract_knowledge(self, content: str, source_file: str, retries: int = 3) -> Optional[Dict]:
+        """Extrai conhecimento útil usando AI."""
+        # Pré-filtro
+        if len(content) < 1000:
+            return {"relevante": False, "motivo": "conteudo_pequeno"}
+
+        if len(content) > 50000:
+            content = content[:50000]
+
+        for attempt in range(retries):
+            try:
+                headers = {
+                    "Authorization": f"Bearer {self.api_key}",
+                    "Content-Type": "application/json",
+                    "HTTP-Referer": "https://descomplicar.pt",
+                    "X-Title": "CTF Knowledge Extraction"
+                }
+
+                prompt = self.extraction_prompt.format(content=content)
+
+                data = {
+                    "model": self.model,
+                    "messages": [{"role": "user", "content": prompt}],
+                    "temperature": 0.2,
+                    "max_tokens": 3000
+                }
+
+                response = requests.post(
+                    self.api_url,
+                    headers=headers,
+                    json=data,
+                    timeout=60
+                )
+
+                if response.status_code == 200:
+                    result = response.json()
+                    if 'choices' in result and len(result['choices']) > 0:
+                        content_text = result['choices'][0]['message']['content']
+
+                        # Parsing robusto - VERSÃO CORRIGIDA
+                        try:
+                            # Remover blocos markdown
+                            if '```json' in content_text:
+                                content_text = content_text.split('```json')[1].split('```')[0]
+                            elif '```' in content_text:
+                                content_text = content_text.split('```')[1].split('```')[0]
+
+                            # Limpeza agressiva
+                            content_text = content_text.strip()
+
+                            # Tentar parse
+                            knowledge = json.loads(content_text)
+                            return knowledge
+
+                        except json.JSONDecodeError as e:
+                            # Fallback: extrair { ... } manualmente
+                            try:
+                                start = content_text.find('{')
+                                end = content_text.rfind('}') + 1
+                                if start != -1 and end > start:
+                                    clean_json = content_text[start:end]
+                                    knowledge = json.loads(clean_json)
+                                    return knowledge
+                            except:
+                                print(f"⚠️  JSON parse error: {source_file} ({e})")
+                                pass
+
+                            return None
+
+                elif response.status_code == 429:  # Rate limit
+                    import time
+                    time.sleep(20)
+                    continue
+
+            except Exception as e:
+                print(f"❌ Erro: {e}")
+                if attempt < retries - 1:
+                    import time
+                    time.sleep(5)
+                    continue
+
+        return None
+
+    def process_file(self, input_file: Path, output_dir: Path) -> bool:
+        """Processa um ficheiro e extrai conhecimento."""
+        try:
+            with open(input_file, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            print(f"🔍 Analisando: {input_file.name}")
+
+            knowledge = self.extract_knowledge(content, input_file.name)
+
+            if not knowledge:
+                print(f"  ⚠️  Falha na extração")
+                return False
+
+            if not knowledge.get('relevante', False):
+                print(f"  ❌ Sem conteúdo útil - SKIP")
+                return False
+
+            # RELEVANTE - Guardar!
+            output_file = output_dir / f"knowledge_{input_file.stem}.json"
+
+            with open(output_file, 'w', encoding='utf-8') as f:
+                json.dump(knowledge, f, indent=2, ensure_ascii=False)
+
+            # Estatísticas
+            n_problemas = len(knowledge.get('problemas', []))
+            n_perguntas = len(knowledge.get('perguntas_materiais', []))
+            n_solucoes = len(knowledge.get('solucoes', []))
+            categoria = knowledge.get('categoria_aplicacao', 'N/A')
+
+            print(f"  ✅ ÚTIL! [{categoria}] P:{n_problemas} Q:{n_perguntas} S:{n_solucoes}")
+
+            import time
+            time.sleep(2)  # Rate limiting
+
+            return True
+
+        except Exception as e:
+            print(f"  ❌ Erro: {e}")
+            return False
+
+
+def main():
+    """Função principal."""
+    print("═══════════════════════════════════════════════════════════")
+    print("  CTF CARSTUFF - EXTRAÇÃO INTELIGENTE (VERSÃO CORRIGIDA)")
+    print("  TESTE: 10 ficheiros")
+    print("═══════════════════════════════════════════════════════════")
+    print()
+
+    if not API_KEY:
+        print("❌ OPENROUTER_API_KEY não configurada no .env")
+        return
+
+    input_path = Path(INPUT_DIR)
+    output_path = Path(OUTPUT_DIR)
+
+    if not input_path.exists():
+        print(f"❌ Diretório não existe: {INPUT_DIR}")
+        return
+
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    # Sites prioritários
+    priority_sites = [
+        'thehogring.com',
+        'forums.pelicanparts.com',
+        'thesamba.com',
+        'sailrite.com'
+    ]
+
+    all_files = []
+    for site in priority_sites:
+        pattern = f"{site}_*.md"
+        files = list(input_path.glob(pattern))
+        all_files.extend(files)
+
+    print(f"📂 Encontrados {len(all_files)} ficheiros")
+    print(f"🎯 Testando com 10 ficheiros...") 
+    print()
+
+    extractor = KnowledgeExtractor()
+    relevant = 0
+    processed = 0
+
+    for md_file in all_files[:10]:  # TESTE: 10 ficheiros
+        if extractor.process_file(md_file, output_path):
+            relevant += 1
+        processed += 1
+
+    print()
+    print("═══════════════════════════════════════════════════════════")
+    ratio = (relevant / processed * 100) if processed > 0 else 0
+    print(f"✓ Teste: {relevant}/{processed} relevantes ({ratio:.1f}%)")
+    print(f"📁 Output: {OUTPUT_DIR}")
+    print("═══════════════════════════════════════════════════════════")
+
+
+# Run the trial extraction only when this file is executed as a script.
+if __name__ == '__main__':
+    main()