scripts/okf-hub/okf-convert-wikilinks.py

#!/usr/bin/env python3
"""
okf-convert-wikilinks.py — Fase F: converte [[wikilinks]] → [texto](path.md) no CORPO
de notas `layer: wiki`, para compatibilidade com agentes CLI (que não resolvem [[ ]]).

Propriedades de segurança:
  - FENCE-AWARE: nunca toca em code fences (``` ~~~) nem inline `code` — preserva
    sintaxe bash `[[ -f x ]]` e exemplos de documentação.
  - Resolução robusta: mesmo-directório → path-style (relativo, depois vault-root,
    depois basename único) → stem global único. Acentos normalizados (NFKD), pipe
    escapado `\\|` tratado, anchors `#`/`^` preservados no URL.
  - Conservador: alvos ambíguos (stem repetido) ou não encontrados ficam [[ ]] e são
    reportados — NUNCA se adivinha um destino.
  - Exclusões alinhadas com okf-validate.sh (MEMORY/CLAUDE/AGENTS, 90-Templates, etc).

Uso: python3 okf-convert-wikilinks.py [--dry-run] [--dir=/path/Hub] [--include-index]
"""
import os
import re
import sys
import unicodedata
from pathlib import Path
from collections import defaultdict, Counter

# Vault root used by main() when no --dir= argument is given.
HUB_DEFAULT = "/media/ealmeida/Dados/Hub"

# Directory names pruned from every os.walk over the hub. Directories whose
# name starts with "." are pruned by the walkers as well, independently of
# this set.
EXCLUDE_DIRS = {
    ".stversions", "node_modules", ".git", ".obsidian", ".trash", ".ijfw", "ijfw",
    ".github", ".wayland", ".hermes", ".vscode", ".cursor", ".gstack",
    "_templates", "99-Arquivo", "90-Templates",
}
# File names that is_excluded() always rejects (it also rejects MEMORY-*.md).
EXCLUDE_FILES = {"MEMORY.md", "CLAUDE.md", "GEMINI.md", "AGENTS.md", "copilot-instructions.md"}

# A [[...]] span; group 1 is the inner text, which may not contain "]".
WIKILINK_RE = re.compile(r'\[\[([^\]]+?)\]\]')
# Optional leading whitespace (group 1) followed by a ``` or ~~~ marker (group 2).
FENCE_RE = re.compile(r'^(\s*)(```|~~~)')
# A "---" line at the very start of the text up to the next "\n---";
# group 1 is the text in between (the front matter body).
FM_RE = re.compile(r'^---\n(.*?)\n---', re.DOTALL)
# Wikilink targets that convert_file() counts as prose about the syntax itself
# and leaves untouched.
SKIP_PROSE = {"", " ", "...", "wikilinks", "Wikilinks"}


def fold(s: str) -> str:
    """Return *s* lower-cased with its diacritics removed.

    The text is decomposed with NFKD so accents become separate combining
    characters; those are dropped and the remainder is lower-cased.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    kept = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(kept).lower()


def is_excluded(rel: str) -> bool:
    """Tell whether the file at relative path *rel* must never be converted.

    Only the final path component matters: it is excluded when it is listed
    in EXCLUDE_FILES or when it looks like ``MEMORY-<anything>.md``.
    """
    filename = Path(rel).name
    is_memory_variant = re.match(r'MEMORY-.*\.md$', filename) is not None
    return filename in EXCLUDE_FILES or is_memory_variant


def layer_of(text: str):
    """Return the ``layer:`` value from the front matter of *text*.

    Returns None when *text* has no front matter matched by FM_RE or when the
    front matter has no line starting with ``layer:``.
    """
    front = FM_RE.match(text)
    if front is not None:
        found = re.search(r'^layer:\s*(\w+)', front.group(1), re.M)
        if found is not None:
            return found.group(1)
    return None


def build_stem_index(hub: Path) -> dict:
    """Map every folded ``.md`` stem under *hub* to the notes that carry it.

    Values are lists of paths relative to *hub*; a list with more than one
    entry means the stem is ambiguous.
    """
    index = defaultdict(list)
    for root, dirs, files in os.walk(hub):
        # Prune in place so os.walk does not descend into excluded or hidden
        # directories.
        dirs[:] = [
            name for name in dirs
            if not (name in EXCLUDE_DIRS or name.startswith("."))
        ]
        base = Path(root)
        for filename in files:
            if not filename.endswith(".md"):
                continue
            note = base / filename
            index[fold(note.stem)].append(note.relative_to(hub))
    return index


def ci_dir_lookup(dirpath: Path, basename: str, hub: Path):
    """Find the note called *basename* inside *dirpath*, ignoring case and accents.

    Args:
        dirpath: Directory to search (not searched recursively).
        basename: Note name, with or without a ``.md`` extension.
        hub: Vault root; the result is expressed relative to it.

    Returns:
        The matching file as a path relative to *hub*, or None when the
        directory cannot be listed, nothing matches, the match is ambiguous,
        or the match does not lie under *hub*.
    """
    # Accept the extension in any case so "Note.MD" does not become "Note.MD.md".
    filename = basename if basename.lower().endswith(".md") else basename + ".md"
    want = fold(filename)
    try:
        matches = [e for e in dirpath.iterdir() if e.is_file() and fold(e.name) == want]
    except OSError:
        # Missing directory, not a directory, or unreadable: nothing to offer.
        return None
    if not matches:
        return None

    # fold() drops case and accents, so e.g. "Café.md" and "Cafe.md" collide.
    # Prefer the literal name; otherwise accept only a unique folded match
    # instead of whichever entry iterdir() happens to yield first.
    exact = [e for e in matches if e.name == filename]
    if len(exact) == 1:
        chosen = exact[0]
    elif len(matches) == 1:
        chosen = matches[0]
    else:
        return None

    try:
        return chosen.relative_to(hub)
    except ValueError:
        # The match lies outside the hub; it cannot be expressed relative to it.
        return None


def _stem_hits(name: str, stem_idx: dict) -> list:
    """Return the notes in *stem_idx* whose folded stem matches *name*."""
    # Strip only a literal ".md" extension. Path.stem would also truncate
    # dotted names ("Node.js" -> "Node", "image.png" -> "image") and could
    # match an unrelated note.
    stem = name[:-3] if name.lower().endswith(".md") else name
    return stem_idx.get(fold(stem), [])


def resolve(target: str, current: Path, hub: Path, stem_idx: dict):
    """Resolve a wikilink target to a path relative to the linking note.

    Resolution order:
      * target without "/": same directory as *current*, then a globally
        unique stem;
      * target with "/": relative to *current*'s directory, then relative to
        the vault root, then a globally unique basename stem.

    Args:
        target: Wikilink target, optionally followed by a ``#`` or ``^`` anchor.
        current: Path of the note that contains the link.
        hub: Vault root.
        stem_idx: Folded stem -> list of relative paths (see build_stem_index).

    Returns:
        ``(relpath, anchor)`` on success, where *relpath* uses "/" separators
        and *anchor* keeps its leading ``#``/``^`` (or is ""); ``("AMBIG", "")``
        when several notes share the stem; None when nothing matches or the
        target has no name part.
    """
    raw = target.strip()
    marker = re.search(r'[#^]', raw)
    cut = marker.start() if marker else len(raw)
    # A trailing backslash is what remains of an escaped pipe ("\|").
    name = raw[:cut].strip().rstrip("\\").strip()
    anchor = raw[cut:].strip()
    if not name:
        return None

    cand = None
    if "/" in name:
        link = Path(name)
        lookup_name = link.name
        # Compare resolved paths with a resolved hub; otherwise a relative or
        # symlinked hub never contains the resolved target directory.
        hub_real = hub.resolve()
        for base in (current.parent, hub):
            target_dir = (base / link.parent).resolve()
            # Never look outside the vault (e.g. "../../etc/note").
            if target_dir == hub_real or hub_real in target_dir.parents:
                cand = ci_dir_lookup(target_dir, lookup_name, hub_real)
            if cand is not None:
                break
    else:
        lookup_name = name
        cand = ci_dir_lookup(current.parent, name, hub)

    if cand is None:
        hits = _stem_hits(lookup_name, stem_idx)
        if len(hits) > 1:
            # Several notes share this stem: report it, never guess.
            return ("AMBIG", "")
        if not hits:
            return None
        cand = hits[0]

    relp = os.path.relpath(hub / cand, current.parent).replace("\\", "/")
    return (relp, anchor)


def _fence_run(line: str, char: str) -> int:
    """Count the run of *char* that starts *line* after leading whitespace."""
    body = line.lstrip()
    return len(body) - len(body.lstrip(char))


def _md_destination(url: str) -> str:
    """Return *url* in a form that is a valid Markdown link destination."""
    # A bare destination may not contain spaces (and parentheses must balance);
    # the <...> form allows both, provided the text has no angle brackets.
    if any(ch in url for ch in " \t()") and "<" not in url and ">" not in url:
        return f"<{url}>"
    return url


def convert_file(fp: Path, hub: Path, stem_idx: dict, stats: dict) -> str:
    """Return the text of *fp* with resolvable wikilinks turned into Markdown links.

    The front matter, fenced code blocks and inline code spans are copied
    verbatim. Links that are ambiguous or cannot be resolved stay as ``[[ ]]``.
    The file itself is not written.

    Args:
        fp: Note to convert.
        hub: Vault root.
        stem_idx: Folded stem -> list of relative paths (see build_stem_index).
        stats: Mutated in place: ``converted`` and ``prose`` are incremented,
            ``unresolved`` and ``ambiguous`` are counters keyed by target.

    Returns:
        The converted file content.
    """
    content = fp.read_text(encoding="utf-8", errors="replace")
    lines = content.split("\n")

    # Index of the line that closes the front matter, or -1 when there is none.
    fm_end = -1
    if lines and lines[0].strip() == "---":
        for i in range(1, len(lines)):
            if lines[i].strip() == "---":
                fm_end = i
                break

    def repl(mm):
        inner = mm.group(1)
        if "|" in inner:
            tgt, alias = inner.split("|", 1)
            # Inside tables the pipe is escaped as "\|"; drop the backslash.
            tgt = tgt.strip().rstrip("\\").strip()
            alias = alias.strip()
        else:
            tgt, alias = inner.strip(), None
        if tgt in SKIP_PROSE:
            stats["prose"] += 1
            return mm.group(0)
        r = resolve(tgt, fp, hub, stem_idx)
        if r is None:
            stats["unresolved"][tgt] += 1
            return mm.group(0)
        if r[0] == "AMBIG":
            stats["ambiguous"][tgt] += 1
            return mm.group(0)
        relp, anchor = r
        disp = alias if alias else tgt
        # NOTE(review): a block reference such as "#^id" is emitted as "#id";
        # confirm that dropping the "^" is intended.
        url = relp + ("#" + anchor.lstrip("#^") if anchor else "")
        stats["converted"] += 1
        return f"[{disp}]({_md_destination(url)})"

    out = []
    fence_char = None  # "`" or "~" while inside a fenced block, else None
    fence_len = 0
    for idx, line in enumerate(lines):
        if idx <= fm_end:
            out.append(line)
            continue
        if fence_char is not None:
            # A closing fence uses the same character, is at least as long as
            # the opening fence and carries no other text. Shorter fences
            # nested inside a longer one therefore stay part of the block.
            run = _fence_run(line, fence_char)
            if run >= fence_len and line.strip() == fence_char * run:
                fence_char = None
            out.append(line)
            continue
        opener = FENCE_RE.match(line)
        if opener:
            char = opener.group(2)[0]
            run = _fence_run(line, char)
            info = line.lstrip()[run:]
            # A backtick fence cannot have backticks in its info string; such
            # a line (e.g. ```x```) is inline code, not the start of a block.
            if not (char == "`" and "`" in info):
                fence_char, fence_len = char, run
                out.append(line)
                continue
        # Split out inline code spans so links inside them are left alone.
        parts = re.split(r'(`[^`]*`)', line)
        for j, seg in enumerate(parts):
            if not seg.startswith("`"):
                parts[j] = WIKILINK_RE.sub(repl, seg)
        out.append("".join(parts))
    return "\n".join(out)


def main():
    """Command-line entry point: convert wikilinks in every ``layer: wiki`` note.

    Recognised arguments: ``--dry-run`` (report only, write nothing),
    ``--dir=PATH`` (vault root, default HUB_DEFAULT) and ``--include-index``
    (also process files named ``index.md``). Exits with status 1 when the
    vault directory does not exist.
    """
    dry_run = "--dry-run" in sys.argv
    include_index = "--include-index" in sys.argv
    hub = Path(HUB_DEFAULT)
    for arg in sys.argv[1:]:
        if arg.startswith("--dir="):
            hub = Path(arg[6:])
    # The shell does not expand "~" after "--dir=".
    hub = hub.expanduser()
    if not hub.is_dir():
        print(f"ERRO: Hub não encontrado em {hub}", file=sys.stderr)
        sys.exit(1)
    # Link resolution compares resolved paths against the hub, so the hub
    # itself must be absolute and free of symlinks.
    hub = hub.resolve()

    print(f"{'[DRY-RUN] ' if dry_run else ''}A indexar ficheiros…")
    stem_idx = build_stem_index(hub)
    print(f"  {len(stem_idx)} stems indexados")

    stats = {"converted": 0, "prose": 0, "unresolved": Counter(), "ambiguous": Counter()}
    files_changed = 0
    for root, dirs, files in os.walk(hub):
        # Prune in place so os.walk skips excluded and hidden directories.
        dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS and not d.startswith(".")]
        for f in files:
            if not f.endswith(".md"):
                continue
            if f == "index.md" and not include_index:
                continue
            fp = Path(root) / f
            rel = str(fp.relative_to(hub))
            if is_excluded(rel):
                continue
            try:
                # Decode strictly: a file that is not valid UTF-8 must not be
                # rewritten, or its undecodable bytes would be replaced for good.
                txt = fp.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError) as exc:
                print(f"AVISO: a saltar {rel}: {exc}", file=sys.stderr)
                continue
            if layer_of(txt) != "wiki":
                continue
            if "[[" not in txt:
                continue
            new = convert_file(fp, hub, stem_idx, stats)
            if new != txt:
                files_changed += 1
                if not dry_run:
                    fp.write_text(new, encoding="utf-8")
                print(f"  [{'DRY' if dry_run else 'OK'}] {rel}")

    print("\n=== Resultado ===")
    print(f"Ficheiros alterados: {files_changed}")
    print(f"Wikilinks convertidos: {stats['converted']}")
    print(f"Prosa-sobre-sintaxe saltada: {stats['prose']}")
    amb = sum(stats["ambiguous"].values())
    unr = sum(stats["unresolved"].values())
    if amb:
        print(f"AMBÍGUOS (deixados): {amb} → {dict(stats['ambiguous'].most_common(10))}")
    if unr:
        print(f"NÃO-RESOLVÍVEIS (refs partidas, deixados como [[ ]]): {unr} (distintos {len(stats['unresolved'])})")
        for t, n in stats["unresolved"].most_common(30):
            print(f"   {n:3} [[{t}]]")


# Run the converter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()