#!/usr/bin/env python3
"""
okf-convert-wikilinks.py — Phase F: convert [[wikilinks]] → [text](path.md) in the
BODY of notes whose front matter declares `layer: wiki`, for compatibility with
CLI agents (which do not resolve [[ ]]).

Safety properties:
  - FENCE-AWARE: never touches fenced code blocks (``` / ~~~) nor inline `code`,
    so bash syntax such as `[[ -f x ]]` and documentation examples survive.
    A fence is only closed by a run of the same character that is at least as
    long as the opening run.
  - Robust resolution: same directory → path-style (relative to the note, then
    relative to the vault root, then unique basename) → globally unique stem.
    Accents are folded (NFKD), an escaped pipe `\\|` is handled, and `#`/`^`
    anchors are appended to the URL as a `#fragment`.
  - Conservative: ambiguous targets (repeated stem) or targets that are not
    found stay as [[ ]] and are reported — a destination is NEVER guessed.
    Only a trailing `.md` is stripped from a target before the stem lookup.
  - Spaces in generated URLs are written as `%20` so the link destination is
    valid Markdown.
  - Exclusions mirror the EXCLUDE_DIRS / EXCLUDE_FILES sets below.

Usage:
  python3 okf-convert-wikilinks.py [--dry-run] [--dir=/path/Hub] [--include-index]
"""
import os
import re
import sys
import unicodedata
from pathlib import Path
from collections import defaultdict, Counter
from typing import Optional

HUB_DEFAULT = "/media/ealmeida/Dados/Hub"
EXCLUDE_DIRS = {
    ".stversions", "node_modules", ".git", ".obsidian", ".trash", ".ijfw", "ijfw",
    ".github", ".wayland", ".hermes", ".vscode", ".cursor", ".gstack",
    "_templates", "99-Arquivo", "90-Templates",
}
EXCLUDE_FILES = {"MEMORY.md", "CLAUDE.md", "GEMINI.md", "AGENTS.md", "copilot-instructions.md"}

WIKILINK_RE = re.compile(r'\[\[([^\]]+?)\]\]')
# Group 2 is the WHOLE run of fence characters, so the closing fence can be
# required to be at least as long as the opening one.
FENCE_RE = re.compile(r'^(\s*)(`{3,}|~{3,})')
FM_RE = re.compile(r'^---\n(.*?)\n---', re.DOTALL)
INLINE_CODE_RE = re.compile(r'(`[^`]*`)')
SKIP_PROSE = {"", " ", "...", "wikilinks", "Wikilinks"}


def fold(s: str) -> str:
    """Return *s* lower-cased with combining marks (accents) removed via NFKD."""
    return "".join(
        c for c in unicodedata.normalize("NFKD", s) if not unicodedata.combining(c)
    ).lower()


def is_excluded(rel: str) -> bool:
    """Return True when the file name of *rel* is in EXCLUDE_FILES or is MEMORY-*.md."""
    base = Path(rel).name
    if base in EXCLUDE_FILES:
        return True
    if re.match(r'MEMORY-.*\.md$', base):
        return True
    return False


def layer_of(text: str) -> Optional[str]:
    """Return the `layer:` value from the leading front matter of *text*, or None."""
    m = FM_RE.match(text)
    if not m:
        return None
    lm = re.search(r'^layer:\s*(\w+)', m.group(1), re.M)
    return lm.group(1) if lm else None


def build_stem_index(hub: Path) -> dict:
    """Map folded stem → [relative Path, ...] for every .md file under *hub*.

    The value is a list so callers can detect ambiguity (more than one entry).
    Directories in EXCLUDE_DIRS and hidden directories are not descended into.
    """
    idx = defaultdict(list)
    for root, dirs, files in os.walk(hub):
        dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS and not d.startswith(".")]
        for f in files:
            if f.endswith(".md"):
                fp = Path(root) / f
                idx[fold(fp.stem)].append(fp.relative_to(hub))
    return idx


def ci_dir_lookup(dirpath: Path, basename: str, hub: Path) -> Optional[Path]:
    """Look up basename(.md) in *dirpath*, ignoring case and accents.

    Returns the match as a path relative to *hub*, or None when the directory
    does not exist, cannot be read, or contains no matching file.
    """
    if not dirpath.exists():
        return None
    want = fold(basename if basename.endswith(".md") else basename + ".md")
    try:
        for e in dirpath.iterdir():
            if e.is_file() and fold(e.name) == want:
                return e.relative_to(hub)
    except (OSError, ValueError):
        # Best effort: an unreadable directory, or an entry that is not under
        # *hub*, simply means "not found here".
        pass
    return None


def _note_stem(name: str) -> str:
    """Strip only a trailing `.md` (any case) from *name*.

    `Path(name).stem` would also strip unrelated dotted suffixes, turning
    `v1.2 notes` into `v1` or `image.png` into `image`, which could then match
    the wrong note in the stem index.
    """
    return name[:-3] if name.lower().endswith(".md") else name


def resolve(target: str, current: Path, hub: Path, stem_idx: dict):
    """Resolve a wikilink target to a path relative to the note containing it.

    Args:
        target: text inside [[ ]] before any `|alias` (may carry a #/^ anchor).
        current: path of the note that contains the link.
        hub: vault root.
        stem_idx: index produced by build_stem_index().

    Returns:
        (relpath, anchor) on success, ("AMBIG", "") when a bare name matches
        several notes by stem, or None when nothing matches.
    """
    raw = target.strip()
    split = re.split(r'(?=[#^])', raw, maxsplit=1)
    name = split[0].strip().rstrip("\\").strip()
    anchor = raw[len(split[0]):].strip() if len(split) > 1 else ""
    if not name:
        return None

    cand = None
    if "/" in name:
        # The candidate directories are resolved to absolute paths, so the
        # containment check must compare against a resolved hub as well;
        # otherwise a relative or symlinked hub never matches.
        hub_abs = hub.resolve()
        bn = Path(name).name
        sub = Path(name).parent
        # First relative to the note's directory, then relative to the vault root.
        for base in (current.parent, hub):
            td = (base / sub).resolve()
            if td == hub_abs or hub_abs in td.parents:
                cand = ci_dir_lookup(td, bn, hub_abs)
            if cand is not None:
                break
        if cand is None:
            lst = stem_idx.get(fold(_note_stem(bn)), [])
            if len(lst) == 1:
                cand = lst[0]
    else:
        cand = ci_dir_lookup(current.parent, name, hub)
        if cand is None:
            lst = stem_idx.get(fold(_note_stem(name)), [])
            if len(lst) == 1:
                cand = lst[0]
            elif len(lst) > 1:
                return ("AMBIG", "")

    if cand is None:
        return None
    relp = os.path.relpath(hub / cand, current.parent).replace("\\", "/")
    return (relp, anchor)


def _opening_fence(line: str) -> Optional[str]:
    """Return the fence run if *line* opens a fenced code block, else None."""
    m = FENCE_RE.match(line)
    if m is None:
        return None
    run = m.group(2)
    # A backtick fence may not have backticks after the run; a line such as
    # ```x``` is inline code, not a fence opener.
    if run[0] == "`" and "`" in line[m.end():]:
        return None
    return run


def _closes_fence(line: str, fence: str) -> bool:
    """Return True if *line* is only a run of the fence char, at least as long as *fence*."""
    s = line.strip()
    return len(s) >= len(fence) and s == fence[0] * len(s)


def _convert_text(content: str, fp: Path, hub: Path, stem_idx: dict, stats: dict) -> str:
    """Convert the wikilinks in *content*, the text of note *fp*; update *stats*."""
    lines = content.split("\n")

    # Index of the line closing the leading front matter (-1 when there is none).
    fm_end = -1
    if lines and lines[0].strip() == "---":
        fm_end = next(
            (i for i, ln in enumerate(lines[1:], 1) if ln.strip() == "---"), -1
        )

    def repl(mm):
        inner = mm.group(1)
        if "|" in inner:
            tgt, alias = inner.split("|", 1)
            # Inside tables the pipe is escaped as `\|`; drop the backslash.
            tgt = tgt.strip().rstrip("\\").strip()
            alias = alias.strip()
        else:
            tgt, alias = inner.strip(), None
        if tgt in SKIP_PROSE:
            stats["prose"] += 1
            return mm.group(0)
        r = resolve(tgt, fp, hub, stem_idx)
        if r is None:
            stats["unresolved"][tgt] += 1
            return mm.group(0)
        if r[0] == "AMBIG":
            stats["ambiguous"][tgt] += 1
            return mm.group(0)
        relp, anchor = r
        disp = alias if alias else tgt
        url = relp + ("#" + anchor.lstrip("#^") if anchor else "")
        # A raw space is not allowed in a Markdown link destination.
        url = url.replace(" ", "%20")
        stats["converted"] += 1
        return f"[{disp}]({url})"

    out = []
    fence = None  # fence run of the code block we are inside, if any
    for idx, line in enumerate(lines):
        if idx <= fm_end:
            out.append(line)
            continue
        if fence is None:
            fence = _opening_fence(line)
            if fence is not None:
                out.append(line)
                continue
        else:
            if _closes_fence(line, fence):
                fence = None
            out.append(line)
            continue
        parts = INLINE_CODE_RE.split(line)
        for j, seg in enumerate(parts):
            if seg.startswith("`"):
                continue  # inline code span: leave untouched
            parts[j] = WIKILINK_RE.sub(repl, seg)
        out.append("".join(parts))
    return "\n".join(out)


def convert_file(fp: Path, hub: Path, stem_idx: dict, stats: dict) -> str:
    """Read note *fp* and return its text with resolvable wikilinks converted.

    Front matter, fenced code blocks and inline code are left untouched.
    *stats* must hold the keys "converted" and "prose" (ints) and "unresolved"
    and "ambiguous" (Counters); it is updated in place. Nothing is written to disk.
    """
    content = fp.read_text(encoding="utf-8", errors="replace")
    return _convert_text(content, fp, hub, stem_idx, stats)


def main():
    """CLI entry point: walk the hub, convert eligible notes, print a report."""
    dry_run = "--dry-run" in sys.argv
    include_index = "--include-index" in sys.argv
    hub = Path(HUB_DEFAULT)
    for arg in sys.argv[1:]:
        if arg.startswith("--dir="):
            hub = Path(arg[6:])
    if not hub.exists():
        print(f"ERRO: Hub não encontrado em {hub}", file=sys.stderr)
        sys.exit(1)
    # Path-style link resolution compares resolved directories against the
    # hub, so the hub itself must be absolute and symlink-free.
    hub = hub.resolve()

    print(f"{'[DRY-RUN] ' if dry_run else ''}A indexar ficheiros…")
    stem_idx = build_stem_index(hub)
    print(f" {len(stem_idx)} stems indexados")

    stats = {"converted": 0, "prose": 0, "unresolved": Counter(), "ambiguous": Counter()}
    files_changed = 0
    for root, dirs, files in os.walk(hub):
        dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS and not d.startswith(".")]
        for f in files:
            if not f.endswith(".md"):
                continue
            if f == "index.md" and not include_index:
                continue
            fp = Path(root) / f
            rel = str(fp.relative_to(hub))
            if is_excluded(rel):
                continue
            try:
                txt = fp.read_text(encoding="utf-8", errors="replace")
            except OSError:
                continue
            if layer_of(txt) != "wiki":
                continue
            if "[[" not in txt:
                continue
            # Convert the text already in memory instead of reading the file again.
            new = _convert_text(txt, fp, hub, stem_idx, stats)
            if new != txt:
                files_changed += 1
                if not dry_run:
                    fp.write_text(new, encoding="utf-8")
                print(f" [{'DRY' if dry_run else 'OK'}] {rel}")

    print("\n=== Resultado ===")
    print(f"Ficheiros alterados: {files_changed}")
    print(f"Wikilinks convertidos: {stats['converted']}")
    print(f"Prosa-sobre-sintaxe saltada: {stats['prose']}")
    amb = sum(stats["ambiguous"].values())
    unr = sum(stats["unresolved"].values())
    if amb:
        print(f"AMBÍGUOS (deixados): {amb} → {dict(stats['ambiguous'].most_common(10))}")
    if unr:
        print(f"NÃO-RESOLVÍVEIS (refs partidas, deixados como [[ ]]): {unr} (distintos {len(stats['unresolved'])})")
        for t, n in stats["unresolved"].most_common(30):
            print(f" {n:3} [[{t}]]")


if __name__ == "__main__":
    main()