init: scripts diversos (crawlers, conversores, scrapers)

2026-03-05 20:38:36 +00:00
commit 6ac6f4be2a
925 changed files with 850330 additions and 0 deletions
@@ -0,0 +1,470 @@
+"""
+structure_content_local.py - Estruturação LOCAL com regex/heurísticas (SEM custos API)
+
+Author: Descomplicar® Crescimento Digital
+Link: https://descomplicar.pt
+Copyright: 2025 Descomplicar®
+"""
+
+import os
+import json
+import logging
+import re
+from pathlib import Path
+from typing import Dict, List, Optional
+from collections import Counter
+
+# Configuration: directory holding the scraped Markdown files, and the
+# directory that receives the structured Markdown/JSON output.
+INPUT_DIR = "/media/ealmeida/Dados/GDrive/Cloud/Clientes_360/CTF_Carstuff/KB/Scrapper/sites/output_md"
+OUTPUT_DIR = "/media/ealmeida/Dados/GDrive/Cloud/Clientes_360/CTF_Carstuff/KB/Scrapper/sites/formatted"
+
+# Configure logging: every record goes both to a log file and to the console.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        # Explicit UTF-8: the messages logged by this script contain emoji and
+        # accented characters, which the locale's default encoding may not be
+        # able to represent.
+        logging.FileHandler('structure_local_execution.log', encoding='utf-8'),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
+
+class LocalContentStructurer:
+    """Structure Markdown content locally using regexes and heuristics.
+
+    Titles, categories, topics, keywords and problem/solution/result
+    sections are all derived by pattern matching on the text itself.
+    """
+
+    # Blocks whose space-joined text is shorter than this are left out.
+    _MIN_BLOCK_CHARS = 50
+    # Blocks whose space-joined text is longer than this get "alta" relevance.
+    _HIGH_RELEVANCE_CHARS = 200
+    # Maximum number of list items kept per block.
+    _MAX_DETAILS = 5
+
+    def __init__(self):
+        """Set up the regex tables used by the detection methods."""
+        # Patterns that mark a line as opening a "problema" section
+        self.problem_keywords = [
+            r'\b(problem|issue|erro|falha|defeito|dificuldade|challenge)\b',
+            r'\b(não funciona|not working|broken|fail)\b',
+            r'\b(como resolver|how to fix|solução para)\b'
+        ]
+
+        # Patterns that mark a line as opening a "solucao" section
+        self.solution_keywords = [
+            r'\b(solução|solution|fix|repair|resolver|corrigir)\b',
+            r'\b(método|technique|process|procedimento)\b',
+            r'\b(usar|utilizar|aplicar|seguir|fazer)\b'
+        ]
+
+        # Patterns that mark a line as opening a "resultado" section
+        self.result_keywords = [
+            r'\b(resultado|result|outcome|conclusão)\b',
+            r'\b(sucesso|success|funcionou|worked)\b',
+            r'\b(melhorou|improved|fixed|resolvido)\b'
+        ]
+
+        # Category name -> patterns whose match counts score that category
+        self.category_patterns = {
+            'tutorial': [r'\bpasso\b', r'\bstep\b', r'\bguide\b', r'\bcomo fazer\b', r'\bhow to\b'],
+            'problema-tecnico': [r'\bproblema\b', r'\berro\b', r'\bfalha\b', r'\bissue\b'],
+            'showcase': [r'\bprojeto\b', r'\bproject\b', r'\bgaleria\b', r'\bgallery\b'],
+            'dica': [r'\bdica\b', r'\btip\b', r'\btruque\b', r'\btrick\b'],
+            'recurso': [r'\bferramenta\b', r'\btool\b', r'\bmaterial\b', r'\bsupply\b']
+        }
+
+        # Topic name -> pattern that signals the topic is present
+        self.automotive_topics = {
+            'estofamento': r'\b(upholstery|estofamento|estofar|tapeçaria)\b',
+            'couro': r'\b(leather|couro|pele)\b',
+            'tecido': r'\b(fabric|tecido|vinyl|vinil)\b',
+            'costura': r'\b(sewing|stitch|costura|coser)\b',
+            'bancos': r'\b(seat|banco|assento)\b',
+            'volante': r'\b(steering wheel|volante)\b',
+            'painel': r'\b(dashboard|painel|interior)\b',
+            'restauração': r'\b(restoration|restauração|restauro|renovação)\b'
+        }
+
+    def extract_title(self, content: str) -> str:
+        """Return a title for the content.
+
+        The first ``# `` or ``## `` heading among the first 10 lines wins;
+        otherwise the first non-empty line (cut to 100 characters) is used,
+        and "Sem Título" when the content has no non-empty line.
+        """
+        lines = content.strip().split('\n')
+
+        for line in lines[:10]:
+            for marker in ('# ', '## '):
+                if line.startswith(marker):
+                    # Slice the marker off; str.replace() would also delete
+                    # any later "# " inside the heading text itself.
+                    return line[len(marker):].strip()
+
+        # Fallback: first non-empty line
+        for line in lines:
+            if line.strip():
+                return line.strip()[:100]
+
+        return "Sem Título"
+
+    def classify_category(self, content: str) -> str:
+        """Return the category whose patterns match most often.
+
+        Ties go to the category listed first in ``category_patterns``;
+        "recurso" is returned when no pattern matches at all.
+        """
+        content_lower = content.lower()
+        scores = {
+            category: sum(
+                len(re.findall(pattern, content_lower, re.IGNORECASE))
+                for pattern in patterns
+            )
+            for category, patterns in self.category_patterns.items()
+        }
+
+        if scores:
+            best_category, best_score = max(scores.items(), key=lambda kv: kv[1])
+            if best_score > 0:
+                return best_category
+
+        return "recurso"  # Default
+
+    def extract_topics(self, content: str) -> List[str]:
+        """Return the topics whose pattern occurs in the content.
+
+        Topics keep the order of ``automotive_topics``; when none matches,
+        the single fallback topic "estofamento automotivo" is returned.
+        """
+        content_lower = content.lower()
+        topics = [
+            topic
+            for topic, pattern in self.automotive_topics.items()
+            if re.search(pattern, content_lower, re.IGNORECASE)
+        ]
+        return topics if topics else ["estofamento automotivo"]
+
+    def extract_keywords(self, content: str) -> List[str]:
+        """Return up to 10 keywords chosen by frequency.
+
+        Words are runs of at least 4 letters; the 20 most frequent are
+        taken, stop words are removed, and the first 10 survivors returned.
+        """
+        words = re.findall(r'\b[a-záàâãéêíóôõúçA-ZÁÀÂÃÉÊÍÓÔÕÚÇ]{4,}\b', content.lower())
+        word_freq = Counter(words)
+
+        # Common stop words to discard
+        stop_words = {'para', 'com', 'sem', 'sobre', 'mais', 'pode', 'como', 'quando', 'onde', 'the', 'and', 'for', 'with'}
+        keywords = [word for word, _ in word_freq.most_common(20) if word not in stop_words]
+
+        return keywords[:10]
+
+    def _classify_line(self, line: str) -> Optional[str]:
+        """Return the section a line opens, or None if it opens none.
+
+        Problem patterns are checked before solution patterns, and those
+        before result patterns, so the first family that matches wins.
+        """
+        lowered = line.lower()
+        families = (
+            ('problema', self.problem_keywords),
+            ('solucao', self.solution_keywords),
+            ('resultado', self.result_keywords),
+        )
+        for section, patterns in families:
+            if any(re.search(pattern, lowered, re.IGNORECASE) for pattern in patterns):
+                return section
+        return None
+
+    def _collect_blocks(self, content: str) -> Dict[str, List[List[str]]]:
+        """Group the content's lines into blocks, keyed by section.
+
+        Each block is a list of stripped, non-empty lines. A blank line ends
+        the current block (the current section is kept); a line matching a
+        section keyword ends the current block and starts a new one in that
+        section. Lines before any keyword belong to "info".
+        """
+        sections: Dict[str, List[List[str]]] = {
+            'problema': [],
+            'solucao': [],
+            'resultado': [],
+            'info': []
+        }
+        current_section = 'info'
+        current_block: List[str] = []
+
+        for raw_line in content.split('\n'):
+            line = raw_line.strip()
+
+            if not line:
+                if current_block:
+                    sections[current_section].append(current_block)
+                    current_block = []
+                continue
+
+            opened = self._classify_line(line)
+            if opened is None:
+                current_block.append(line)
+                continue
+
+            # A keyword line closes the running block and opens a new one.
+            if current_block:
+                sections[current_section].append(current_block)
+            current_section = opened
+            current_block = [line]
+
+        # Flush the last block
+        if current_block:
+            sections[current_section].append(current_block)
+
+        return sections
+
+    def detect_sections(self, content: str) -> Dict[str, List[str]]:
+        """Detect sections by regex patterns.
+
+        Returns a dict with the keys "problema", "solucao", "resultado" and
+        "info"; each value is a list of blocks, every block being its lines
+        joined with single spaces.
+        """
+        return {
+            section: [' '.join(block_lines) for block_lines in blocks]
+            for section, blocks in self._collect_blocks(content).items()
+        }
+
+    def extract_list_items(self, text: str) -> List[str]:
+        """Extract list items from newline-separated text.
+
+        Lines starting with "- " or "* " and lines starting with "<n>. " are
+        items (marker removed). When no line is an item, ``[text]`` is
+        returned.
+        """
+        items = []
+
+        for line in text.split('\n'):
+            line = line.strip()
+            # Markdown bullet
+            if line.startswith(('- ', '* ')):
+                items.append(line[2:].strip())
+            # Numbered list
+            elif re.match(r'^\d+\.\s+', line):
+                items.append(re.sub(r'^\d+\.\s+', '', line))
+
+        return items if items else [text]
+
+    @staticmethod
+    def _first_sentence(text: str) -> str:
+        """Return the text up to the first ". ", ending in punctuation.
+
+        A "." is appended only when the extracted sentence does not already
+        end with ".", "!" or "?", so single-sentence text is not given a
+        doubled full stop.
+        """
+        sentence = text.split('. ')[0]
+        if sentence.endswith(('.', '!', '?')):
+            return sentence
+        return sentence + '.'
+
+    def structure_content(self, content: str, source_file: str) -> Dict:
+        """Build the structured representation of one document.
+
+        Args:
+            content: Raw Markdown text.
+            source_file: File name; the part before the first "_" (with
+                ".md" removed) becomes the "fonte" metadata field.
+
+        Returns:
+            A dict with "metadata", "conteudo" (one entry per kept block),
+            "keywords" and "aplicabilidade".
+        """
+        title = self.extract_title(content)
+        category = self.classify_category(content)
+        topics = self.extract_topics(content)
+        keywords = self.extract_keywords(content)
+        # Line-level blocks are used (instead of detect_sections) because
+        # list items can only be recognised while line breaks still exist.
+        sections = self._collect_blocks(content)
+
+        fonte = source_file.split('_')[0].replace('.md', '')
+
+        structured = {
+            "metadata": {
+                "titulo": title,
+                "categoria": category,
+                "topicos": topics,
+                "fonte": fonte
+            },
+            "conteudo": [],
+            "keywords": keywords,
+            "aplicabilidade": ["Veículos diversos", "Estofamento automotivo"]
+        }
+
+        for secao_tipo, blocos in sections.items():
+            # Drop very small blocks first so numbering has no gaps and a
+            # single surviving block is not numbered at all.
+            kept = [
+                block_lines for block_lines in blocos
+                if len(' '.join(block_lines)) >= self._MIN_BLOCK_CHARS
+            ]
+            label = secao_tipo.capitalize()
+
+            for i, block_lines in enumerate(kept, 1):
+                bloco = ' '.join(block_lines)
+                block_text = '\n'.join(block_lines)
+
+                detalhes = self.extract_list_items(block_text)
+                if detalhes == [block_text]:
+                    detalhes = []  # No list found: description only
+
+                structured["conteudo"].append({
+                    "tipo": secao_tipo,
+                    "titulo": f"{label} {i}" if len(kept) > 1 else label,
+                    "descricao": self._first_sentence(bloco),
+                    "detalhes": detalhes[:self._MAX_DETAILS],
+                    "relevancia": "alta" if len(bloco) > self._HIGH_RELEVANCE_CHARS else "media"
+                })
+
+        return structured
+
+    @staticmethod
+    def _render_section(heading: str, items: List[Dict], default_title: str,
+                        numbered: bool = False) -> List[str]:
+        """Render one group of content items as Markdown lines.
+
+        Returns an empty list when there are no items. Details are written
+        as a numbered list when ``numbered`` is true, as bullets otherwise.
+        """
+        if not items:
+            return []
+
+        out = [heading, ""]
+        for item in items:
+            out.append(f"### {item.get('titulo', default_title)}")
+            out.append("")
+            out.append(item.get('descricao', ''))
+            out.append("")
+            detalhes = item.get('detalhes')
+            if detalhes:
+                if numbered:
+                    out.extend(f"{n}. {detalhe}" for n, detalhe in enumerate(detalhes, 1))
+                else:
+                    out.extend(f"- {detalhe}" for detalhe in detalhes)
+                out.append("")
+        return out
+
+    def format_structured_md(self, structured_data: Dict, original_file: str) -> str:
+        """Convert structured data into formatted Markdown.
+
+        Args:
+            structured_data: Dict shaped like the result of
+                ``structure_content``.
+            original_file: Used as the source name when the metadata has no
+                "fonte" entry.
+
+        Returns:
+            The Markdown document as a single string.
+        """
+        md_lines = []
+
+        # Metadata header
+        meta = structured_data.get('metadata', {})
+        md_lines.append(f"# {meta.get('titulo', 'Sem Título')}")
+        md_lines.append("")
+        md_lines.append(f"**Categoria**: {meta.get('categoria', 'Geral')}")
+        md_lines.append(f"**Fonte**: {meta.get('fonte', original_file)}")
+
+        if meta.get('topicos'):
+            md_lines.append(f"**Tópicos**: {', '.join(meta['topicos'])}")
+
+        md_lines.append("")
+        md_lines.append("---")
+        md_lines.append("")
+
+        # Content grouped by type, in fixed order; only solutions are numbered.
+        conteudo = structured_data.get('conteudo', [])
+        layout = (
+            ('problema', "## 🔍 Problemas Identificados", 'Problema', False),
+            ('solucao', "## 💡 Soluções", 'Solução', True),
+            ('resultado', "## ✅ Resultados", 'Resultado', False),
+            ('info', "## 📋 Informação Adicional", 'Info', False),
+        )
+        for tipo, heading, default_title, numbered in layout:
+            group = [c for c in conteudo if c.get('tipo') == tipo]
+            md_lines.extend(self._render_section(heading, group, default_title, numbered))
+
+        # Keywords and applicability footer
+        md_lines.append("---")
+        md_lines.append("")
+
+        if structured_data.get('keywords'):
+            md_lines.append(f"**Palavras-chave**: {', '.join(structured_data['keywords'])}")
+            md_lines.append("")
+
+        if structured_data.get('aplicabilidade'):
+            md_lines.append("**Aplicabilidade**:")
+            for app in structured_data['aplicabilidade']:
+                md_lines.append(f"- {app}")
+            md_lines.append("")
+
+        return '\n'.join(md_lines)
+
+    def process_file(self, input_file: Path, output_dir: Path) -> bool:
+        """Structure one Markdown file and write the .md and .json outputs.
+
+        Args:
+            input_file: Markdown file to read.
+            output_dir: Directory receiving ``structured_<name>`` and
+                ``structured_<stem>.json``.
+
+        Returns:
+            True when the Markdown output already existed or was written;
+            False when the input is smaller than 500 bytes or any error
+            occurred (the error is logged, not raised).
+        """
+        try:
+            output_file = output_dir / f"structured_{input_file.name}"
+
+            # Skip when the Markdown output is already there
+            if output_file.exists():
+                logger.info(f"⏭️  Já existe: {output_file.name}")
+                return True
+
+            # Reject very small files
+            file_size = input_file.stat().st_size
+            if file_size < 500:
+                logger.warning(f"⚠️  Ficheiro muito pequeno ({file_size}B): {input_file.name}")
+                return False
+
+            with open(input_file, 'r', encoding='utf-8') as f:
+                content = f.read()
+
+            logger.info(f"📄 Processando: {input_file.name} ({file_size/1024:.1f}KB)")
+
+            structured_data = self.structure_content(content, input_file.name)
+            formatted_md = self.format_structured_md(structured_data, input_file.name)
+
+            # Write the JSON first: the Markdown file is what the skip check
+            # above looks for, so it must only appear once the JSON is saved.
+            # Otherwise a failed JSON write would never be retried.
+            json_file = output_dir / f"structured_{input_file.stem}.json"
+            with open(json_file, 'w', encoding='utf-8') as f:
+                json.dump(structured_data, f, indent=2, ensure_ascii=False)
+
+            with open(output_file, 'w', encoding='utf-8') as f:
+                f.write(formatted_md)
+
+            logger.info(f"✅ Guardado: {output_file.name}")
+
+            return True
+
+        except Exception as e:
+            # Per-file boundary: one bad file is logged and reported as a
+            # failure instead of aborting the caller.
+            logger.error(f"❌ Erro ao processar {input_file.name}: {e}")
+            return False
+
+
+def main():
+    """Process every ``*.md`` file in INPUT_DIR into OUTPUT_DIR.
+
+    Logs an error and returns when INPUT_DIR does not exist. Otherwise the
+    output directory is created if needed, each input file is handed to
+    ``LocalContentStructurer.process_file``, and a summary with processed,
+    already-existing and failed counts is logged.
+    """
+    input_path = Path(INPUT_DIR)
+    output_path = Path(OUTPUT_DIR)
+
+    if not input_path.exists():
+        logger.error(f"❌ Diretório não existe: {INPUT_DIR}")
+        return
+
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    # Find ALL .md files
+    all_files = list(input_path.glob('*.md'))
+
+    logger.info(f"📊 Encontrados {len(all_files)} ficheiros para processar")
+
+    # Record which inputs already have an output BEFORE processing starts.
+    # Matching input to output by name keeps "Restantes" from being skewed
+    # by stray outputs, and lets the loop below tell a skip from a fresh
+    # write (comparing mtimes afterwards cannot: an existing output is
+    # normally newer than its input).
+    already_done = {
+        md_file for md_file in all_files
+        if (output_path / f"structured_{md_file.name}").exists()
+    }
+    logger.info(f"✅ Já processados: {len(already_done)} ficheiros")
+    logger.info(f"⏳ Restantes: {len(all_files) - len(already_done)} ficheiros")
+
+    # Process
+    structurer = LocalContentStructurer()
+    successful = 0
+    skipped = 0
+    failed = 0
+
+    for i, md_file in enumerate(all_files, 1):
+        logger.info(f"\n[{i}/{len(all_files)}]")
+
+        if not structurer.process_file(md_file, output_path):
+            failed += 1
+        elif md_file in already_done:
+            skipped += 1
+        else:
+            successful += 1
+
+    logger.info("")
+    logger.info("═══════════════════════════════════════════════════════════")
+    logger.info("✅ Concluído!")
+    logger.info(f"   • Processados: {successful}")
+    logger.info(f"   • Já existiam: {skipped}")
+    logger.info(f"   • Falhados: {failed}")
+    logger.info(f"   • Total: {len(all_files)}")
+    logger.info(f"📁 Output: {output_path}")
+    logger.info("═══════════════════════════════════════════════════════════")
+
+
+if __name__ == '__main__':
+    # Print the start-up banner, run the batch, then print the closing line.
+    banner_rule = "═══════════════════════════════════════════════════════════"
+    banner_lines = (
+        banner_rule,
+        "  🔧 CTF CARSTUFF - ESTRUTURAÇÃO LOCAL (SEM CUSTOS)",
+        "  Método: Regex + Heurísticas",
+        "  Formato: Problema → Solução → Resultado",
+        banner_rule,
+        "",
+    )
+    for banner_line in banner_lines:
+        print(banner_line)
+
+    main()
+
+    for closing_line in ("", "✅ Processo concluído!"):
+        print(closing_line)