# scripts/scraper/structure_content_local.py

"""
structure_content_local.py - Estruturação LOCAL com regex/heurísticas (SEM custos API)

Author: Descomplicar® Crescimento Digital
Link: https://descomplicar.pt
Copyright: 2025 Descomplicar®
"""

import os
import json
import logging
import re
from pathlib import Path
from typing import Dict, List, Optional
from collections import Counter

# Configuration: absolute directories used by the batch run.
# INPUT_DIR holds the scraped Markdown files; OUTPUT_DIR receives the
# structured output. Both are machine-specific paths.
INPUT_DIR = "/media/ealmeida/Dados/GDrive/Cloud/Clientes_360/CTF_Carstuff/KB/Scrapper/sites/output_md"
OUTPUT_DIR = "/media/ealmeida/Dados/GDrive/Cloud/Clientes_360/CTF_Carstuff/KB/Scrapper/sites/formatted"

# Configure logging: every record at INFO or above goes both to the file
# 'structure_local_execution.log' (relative to the current working directory)
# and to a stream handler (the console).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('structure_local_execution.log'),
        logging.StreamHandler()
    ]
)
# Module-level logger shared by the class and by main().
logger = logging.getLogger(__name__)


class LocalContentStructurer:
    """Structure scraped Markdown locally using regexes and keyword heuristics.

    The structurer extracts a title, a category, topics and keywords from the
    text, splits it into problem / solution / result / info blocks, and can
    render the result as normalized Markdown plus a JSON side-car file.
    """

    # Files smaller than this many bytes are rejected by process_file().
    MIN_FILE_BYTES = 500
    # Blocks whose flattened text is shorter than this are dropped as noise.
    MIN_BLOCK_LENGTH = 50
    # Blocks longer than this are tagged with "alta" relevance.
    HIGH_RELEVANCE_LENGTH = 200
    # Maximum number of list items kept per block.
    MAX_DETAILS = 5
    # A description already ending in one of these gets no extra full stop.
    _SENTENCE_END = ('.', '!', '?', ':', '…')

    def __init__(self):
        # Patterns that mark a line as the start of a "problem" block
        self.problem_keywords = [
            r'\b(problem|issue|erro|falha|defeito|dificuldade|challenge)\b',
            r'\b(não funciona|not working|broken|fail)\b',
            r'\b(como resolver|how to fix|solução para)\b'
        ]

        # Patterns that mark a line as the start of a "solution" block
        self.solution_keywords = [
            r'\b(solução|solution|fix|repair|resolver|corrigir)\b',
            r'\b(método|technique|process|procedimento)\b',
            r'\b(usar|utilizar|aplicar|seguir|fazer)\b'
        ]

        # Patterns that mark a line as the start of a "result" block
        self.result_keywords = [
            r'\b(resultado|result|outcome|conclusão)\b',
            r'\b(sucesso|success|funcionou|worked)\b',
            r'\b(melhorou|improved|fixed|resolvido)\b'
        ]

        # Category -> keyword patterns used to score the whole document
        self.category_patterns = {
            'tutorial': [r'\bpasso\b', r'\bstep\b', r'\bguide\b', r'\bcomo fazer\b', r'\bhow to\b'],
            'problema-tecnico': [r'\bproblema\b', r'\berro\b', r'\bfalha\b', r'\bissue\b'],
            'showcase': [r'\bprojeto\b', r'\bproject\b', r'\bgaleria\b', r'\bgallery\b'],
            'dica': [r'\bdica\b', r'\btip\b', r'\btruque\b', r'\btrick\b'],
            'recurso': [r'\bferramenta\b', r'\btool\b', r'\bmaterial\b', r'\bsupply\b']
        }

        # Automotive topic -> pattern that signals the topic is present
        self.automotive_topics = {
            'estofamento': r'\b(upholstery|estofamento|estofar|tapeçaria)\b',
            'couro': r'\b(leather|couro|pele)\b',
            'tecido': r'\b(fabric|tecido|vinyl|vinil)\b',
            'costura': r'\b(sewing|stitch|costura|coser)\b',
            'bancos': r'\b(seat|banco|assento)\b',
            'volante': r'\b(steering wheel|volante)\b',
            'painel': r'\b(dashboard|painel|interior)\b',
            'restauração': r'\b(restoration|restauração|restauro|renovação)\b'
        }

    def extract_title(self, content: str) -> str:
        """Return the document title.

        The first level-1 or level-2 Markdown heading found in the first ten
        lines wins; otherwise the first non-empty line (truncated to 100
        characters) is used; "Sem Título" is returned for empty content.
        """
        lines = content.strip().split('\n')

        for line in lines[:10]:
            for prefix in ('# ', '## '):
                if line.startswith(prefix):
                    # Remove only the leading marker; a '# ' that occurs later
                    # in the heading (e.g. "C# guide") is part of the title.
                    return line[len(prefix):].strip()

        # Fallback: first non-empty line
        for line in lines:
            stripped = line.strip()
            if stripped:
                return stripped[:100]

        return "Sem Título"

    def classify_category(self, content: str) -> str:
        """Return the category whose keyword patterns match most often.

        Ties go to the category declared first; "recurso" is returned when no
        pattern matches at all.
        """
        # re.IGNORECASE already makes matching case-insensitive, so the text
        # does not need to be lower-cased first.
        scores = {
            category: sum(
                len(re.findall(pattern, content, re.IGNORECASE))
                for pattern in patterns
            )
            for category, patterns in self.category_patterns.items()
        }

        if scores:
            best_category, best_score = max(scores.items(), key=lambda kv: kv[1])
            if best_score > 0:
                return best_category

        return "recurso"  # Default

    def extract_topics(self, content: str) -> List[str]:
        """Return the automotive topics mentioned in ``content``.

        Falls back to ``["estofamento automotivo"]`` when none is detected.
        """
        topics = [
            topic
            for topic, pattern in self.automotive_topics.items()
            if re.search(pattern, content, re.IGNORECASE)
        ]
        return topics if topics else ["estofamento automotivo"]

    def extract_keywords(self, content: str) -> List[str]:
        """Return up to ten of the most frequent words of four or more letters.

        Candidates are the 20 most common words; stop words are removed from
        that shortlist before it is cut to ten.
        """
        words = re.findall(r'\b[a-záàâãéêíóôõúçA-ZÁÀÂÃÉÊÍÓÔÕÚÇ]{4,}\b', content.lower())
        word_freq = Counter(words)

        # Common Portuguese/English stop words
        stop_words = {'para', 'com', 'sem', 'sobre', 'mais', 'pode', 'como', 'quando', 'onde', 'the', 'and', 'for', 'with'}
        keywords = [word for word, _ in word_freq.most_common(20) if word not in stop_words]

        return keywords[:10]

    def _collect_blocks(self, content: str) -> Dict[str, List[List[str]]]:
        """Split ``content`` into per-section blocks, each kept as a list of lines.

        A blank line closes the current block but keeps the current section. A
        line matching a problem, solution or result pattern (checked in that
        order) closes the current block and opens a new one in that section.
        Text before the first trigger lands in 'info'.
        """
        sections = {
            'problema': [],
            'solucao': [],
            'resultado': [],
            'info': []
        }
        # Order matters: the first matching section wins.
        triggers = (
            ('problema', self.problem_keywords),
            ('solucao', self.solution_keywords),
            ('resultado', self.result_keywords),
        )

        current_section = 'info'
        current_block = []

        for raw_line in content.split('\n'):
            line = raw_line.strip()

            if not line:
                if current_block:
                    sections[current_section].append(current_block)
                    current_block = []
                continue

            new_section = next(
                (
                    name
                    for name, patterns in triggers
                    if any(re.search(pattern, line, re.IGNORECASE) for pattern in patterns)
                ),
                None,
            )

            if new_section is None:
                current_block.append(line)
                continue

            if current_block:
                sections[current_section].append(current_block)
            current_section = new_section
            current_block = [line]

        # Flush the trailing block
        if current_block:
            sections[current_section].append(current_block)

        return sections

    def detect_sections(self, content: str) -> Dict[str, List[str]]:
        """Detect sections via regex patterns.

        Returns a dict with keys 'problema', 'solucao', 'resultado' and 'info',
        each mapping to a list of blocks; the lines of a block are joined with
        single spaces.
        """
        return {
            name: [' '.join(block) for block in blocks]
            for name, blocks in self._collect_blocks(content).items()
        }

    def extract_list_items(self, text: str) -> List[str]:
        """Return the Markdown bullet ('- ', '* ') or numbered items in ``text``.

        When ``text`` contains no list item, ``[text]`` is returned.
        """
        items = []

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            # Markdown bullet list
            if line.startswith(('- ', '* ')):
                items.append(line[2:].strip())
            # Numbered list
            elif re.match(r'^\d+\.\s+', line):
                items.append(re.sub(r'^\d+\.\s+', '', line))

        return items if items else [text]

    def _first_sentence(self, text: str) -> str:
        """Return the text up to the first '. ', terminated by punctuation."""
        sentence = text.split('. ')[0]
        # Avoid doubling the punctuation when the block is a single sentence
        # that already ends with a terminator.
        if sentence.endswith(self._SENTENCE_END):
            return sentence
        return sentence + '.'

    def structure_content(self, content: str, source_file: str) -> Dict:
        """Build the structured representation of ``content``.

        Args:
            content: Raw Markdown text.
            source_file: File name; the part before the first underscore
                (with '.md' removed) becomes the 'fonte' metadata field.

        Returns:
            A dict with 'metadata', 'conteudo' (one entry per retained block),
            'keywords' and 'aplicabilidade'.
        """
        fonte = source_file.split('_')[0].replace('.md', '')

        structured = {
            "metadata": {
                "titulo": self.extract_title(content),
                "categoria": self.classify_category(content),
                "topicos": self.extract_topics(content),
                "fonte": fonte
            },
            "conteudo": [],
            "keywords": self.extract_keywords(content),
            "aplicabilidade": ["Veículos diversos", "Estofamento automotivo"]
        }

        for secao_tipo, blocos in self._collect_blocks(content).items():
            # Drop tiny blocks first so that numbering has no gaps and a lone
            # surviving block is not given a number.
            kept = [
                linhas for linhas in blocos
                if len(' '.join(linhas)) >= self.MIN_BLOCK_LENGTH
            ]
            label = secao_tipo.capitalize()

            for i, linhas in enumerate(kept, 1):
                bloco = ' '.join(linhas)

                # List items can only be found while the line breaks are
                # still present, so search the newline-joined form.
                multiline = '\n'.join(linhas)
                detalhes = self.extract_list_items(multiline)
                if len(detalhes) == 1 and detalhes[0] == multiline:
                    detalhes = []  # No list found; the description stands alone

                structured["conteudo"].append({
                    "tipo": secao_tipo,
                    "titulo": f"{label} {i}" if len(kept) > 1 else label,
                    "descricao": self._first_sentence(bloco),
                    "detalhes": detalhes[:self.MAX_DETAILS],
                    "relevancia": "alta" if len(bloco) > self.HIGH_RELEVANCE_LENGTH else "media"
                })

        return structured

    @staticmethod
    def _render_section(heading: str, items: List[Dict], default_title: str,
                        numbered: bool = False) -> List[str]:
        """Render one group of content items as Markdown lines.

        Returns an empty list when ``items`` is empty. Details are emitted as
        a numbered list when ``numbered`` is true, otherwise as bullets.
        """
        if not items:
            return []

        lines = [heading, ""]
        for item in items:
            lines.append(f"### {item.get('titulo', default_title)}")
            lines.append("")
            lines.append(item.get('descricao', ''))
            lines.append("")
            detalhes = item.get('detalhes')
            if detalhes:
                if numbered:
                    lines.extend(f"{n}. {detalhe}" for n, detalhe in enumerate(detalhes, 1))
                else:
                    lines.extend(f"- {detalhe}" for detalhe in detalhes)
                lines.append("")
        return lines

    def format_structured_md(self, structured_data: Dict, original_file: str) -> str:
        """Convert structured data into formatted Markdown.

        Args:
            structured_data: Dict in the shape produced by structure_content().
            original_file: Used as the 'Fonte' value when the metadata has no
                'fonte' entry.

        Returns:
            The Markdown document as a single string.
        """
        meta = structured_data.get('metadata', {})
        md_lines = [
            f"# {meta.get('titulo', 'Sem Título')}",
            "",
            f"**Categoria**: {meta.get('categoria', 'Geral')}",
            f"**Fonte**: {meta.get('fonte', original_file)}",
        ]

        if meta.get('topicos'):
            md_lines.append(f"**Tópicos**: {', '.join(meta['topicos'])}")

        md_lines.extend(["", "---", ""])

        conteudo = structured_data.get('conteudo', [])

        # (tipo, heading, fallback title, numbered details?) in output order.
        layout = (
            ('problema', "## 🔍 Problemas Identificados", 'Problema', False),
            ('solucao', "## 💡 Soluções", 'Solução', True),
            ('resultado', "## ✅ Resultados", 'Resultado', False),
            ('info', "## 📋 Informação Adicional", 'Info', False),
        )
        for tipo, heading, default_title, numbered in layout:
            group = [c for c in conteudo if c.get('tipo') == tipo]
            md_lines.extend(self._render_section(heading, group, default_title, numbered))

        # Keywords and applicability footer
        md_lines.extend(["---", ""])

        if structured_data.get('keywords'):
            md_lines.append(f"**Palavras-chave**: {', '.join(structured_data['keywords'])}")
            md_lines.append("")

        if structured_data.get('aplicabilidade'):
            md_lines.append("**Aplicabilidade**:")
            md_lines.extend(f"- {app}" for app in structured_data['aplicabilidade'])
            md_lines.append("")

        return '\n'.join(md_lines)

    def process_file(self, input_file: Path, output_dir: Path) -> bool:
        """Structure one Markdown file and write the .md and .json outputs.

        Args:
            input_file: Source Markdown file.
            output_dir: Directory receiving 'structured_<name>' and
                'structured_<stem>.json'.

        Returns:
            True when the output already existed or was written; False when
            the file is smaller than MIN_FILE_BYTES or any error occurred.
        """
        try:
            output_file = output_dir / f"structured_{input_file.name}"

            # Skip when the Markdown output already exists
            if output_file.exists():
                logger.info(f"⏭️  Já existe: {output_file.name}")
                return True

            file_size = input_file.stat().st_size
            if file_size < self.MIN_FILE_BYTES:
                logger.warning(f"⚠️  Ficheiro muito pequeno ({file_size}B): {input_file.name}")
                return False

            content = input_file.read_text(encoding='utf-8')

            logger.info(f"📄 Processando: {input_file.name} ({file_size/1024:.1f}KB)")

            structured_data = self.structure_content(content, input_file.name)
            formatted_md = self.format_structured_md(structured_data, input_file.name)

            # Write the JSON first: the Markdown file is the "already done"
            # marker checked above, so it must only appear once everything
            # else has been written successfully.
            json_file = output_dir / f"structured_{input_file.stem}.json"
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(structured_data, f, indent=2, ensure_ascii=False)

            output_file.write_text(formatted_md, encoding='utf-8')

            logger.info(f"✅ Guardado: {output_file.name}")

            return True

        except Exception as e:
            # Per-file boundary of a batch job: report (with traceback) and
            # let the caller carry on with the next file.
            logger.exception(f"❌ Erro ao processar {input_file.name}: {e}")
            return False


def main():
    """Run the batch: structure every '*.md' file in INPUT_DIR into OUTPUT_DIR.

    Logs how many files were found, how many already have an output, and a
    final summary of processed / skipped / failed files. Returns early (after
    logging an error) when INPUT_DIR does not exist.
    """
    input_path = Path(INPUT_DIR)
    output_path = Path(OUTPUT_DIR)

    if not input_path.exists():
        logger.error(f"❌ Diretório não existe: {INPUT_DIR}")
        return

    output_path.mkdir(parents=True, exist_ok=True)

    # Sorted so that the processing order (and the log) is reproducible.
    all_files = sorted(input_path.glob('*.md'))

    logger.info(f"📊 Encontrados {len(all_files)} ficheiros para processar")

    # Record which inputs already have an output BEFORE processing. Comparing
    # modification times afterwards cannot tell a fresh output from one left
    # by an earlier run, since both are newer than the input. Counting only
    # outputs that match a current input also keeps "Restantes" from being
    # skewed by orphaned files in the output directory.
    already_done = {
        md_file for md_file in all_files
        if (output_path / f"structured_{md_file.name}").exists()
    }
    logger.info(f"✅ Já processados: {len(already_done)} ficheiros")
    logger.info(f"⏳ Restantes: {len(all_files) - len(already_done)} ficheiros")

    structurer = LocalContentStructurer()
    successful = 0
    skipped = 0
    failed = 0

    for i, md_file in enumerate(all_files, 1):
        logger.info(f"\n[{i}/{len(all_files)}]")

        if not structurer.process_file(md_file, output_path):
            failed += 1
        elif md_file in already_done:
            skipped += 1
        else:
            successful += 1

    logger.info("")
    logger.info("═══════════════════════════════════════════════════════════")
    logger.info("✅ Concluído!")
    logger.info(f"   • Processados: {successful}")
    logger.info(f"   • Já existiam: {skipped}")
    logger.info(f"   • Falhados: {failed}")
    logger.info(f"   • Total: {len(all_files)}")
    logger.info(f"📁 Output: {output_path}")
    logger.info("═══════════════════════════════════════════════════════════")


if __name__ == '__main__':
    # Start-up banner, printed line by line before the batch runs.
    separator = "═══════════════════════════════════════════════════════════"
    banner = (
        separator,
        "  🔧 CTF CARSTUFF - ESTRUTURAÇÃO LOCAL (SEM CUSTOS)",
        "  Método: Regex + Heurísticas",
        "  Formato: Problema → Solução → Resultado",
        separator,
        "",
    )
    for banner_line in banner:
        print(banner_line)

    main()

    # Closing message, preceded by an empty line.
    for closing_line in ("", "✅ Processo concluído!"):
        print(closing_line)