Files
scripts/okf-hub/okf-convert-wikilinks.py
T

243 lines
8.5 KiB
Python
Executable File

#!/usr/bin/env python3
"""
okf-convert-wikilinks.py — Fase F: converte [[wikilinks]] → [texto](path.md) no CORPO
de notas `layer: wiki`, para compatibilidade com agentes CLI (que não resolvem [[ ]]).
Propriedades de segurança:
- FENCE-AWARE: nunca toca em code fences (``` ~~~) nem inline `code` — preserva
sintaxe bash `[[ -f x ]]` e exemplos de documentação.
- Resolução robusta: mesmo-directório → path-style (relativo, depois vault-root,
depois basename único) → stem global único. Acentos normalizados (NFKD), pipe
escapado `\\|` tratado, anchors `#`/`^` preservados no URL.
- Conservador: alvos ambíguos (stem repetido) ou não encontrados ficam [[ ]] e são
reportados — NUNCA se adivinha um destino.
- Exclusões alinhadas com okf-validate.sh (MEMORY/CLAUDE/AGENTS, 90-Templates, etc).
Uso: python3 okf-convert-wikilinks.py [--dry-run] [--dir=/path/Hub] [--include-index]
"""
import os
import re
import sys
import unicodedata
from pathlib import Path
from collections import defaultdict, Counter
# Vault root used by main() when no --dir=<path> argument is supplied.
HUB_DEFAULT = "/media/ealmeida/Dados/Hub"
# Directory names pruned from every os.walk traversal (both the stem index
# build and the conversion pass); dot-directories are pruned separately.
EXCLUDE_DIRS = {
".stversions", "node_modules", ".git", ".obsidian", ".trash", ".ijfw", "ijfw",
".github", ".wayland", ".hermes", ".vscode", ".cursor", ".gstack",
"_templates", "99-Arquivo", "90-Templates",
}
# File basenames that is_excluded() rejects, so they are never rewritten.
EXCLUDE_FILES = {"MEMORY.md", "CLAUDE.md", "GEMINI.md", "AGENTS.md", "copilot-instructions.md"}
# A [[...]] wikilink; group 1 is the inner text (no ']' allowed inside).
WIKILINK_RE = re.compile(r'\[\[([^\]]+?)\]\]')
# A line that starts (after optional whitespace) with a ``` or ~~~ fence marker.
FENCE_RE = re.compile(r'^(\s*)(```|~~~)')
# Front matter at the very start of a text: '---' line, body (group 1), '---'.
FM_RE = re.compile(r'^---\n(.*?)\n---', re.DOTALL)
# Wikilink targets that convert_file() leaves untouched and counts under
# stats["prose"] (text that talks about the syntax rather than linking).
SKIP_PROSE = {"", " ", "...", "wikilinks", "Wikilinks"}
def fold(s: str) -> str:
    """Return *s* lower-cased with its diacritics removed.

    The text is decomposed with NFKD so accents become separate combining
    characters; those are dropped and the remainder is lower-cased.
    """
    decomposed = unicodedata.normalize("NFKD", s)
    kept = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(kept).lower()
def is_excluded(rel: str) -> bool:
    """Tell whether the file at relative path *rel* must be left untouched.

    Only the last path component is inspected: the file is excluded when its
    name is listed in EXCLUDE_FILES or looks like ``MEMORY-<anything>.md``.
    """
    filename = Path(rel).name
    listed = filename in EXCLUDE_FILES
    memory_variant = re.match(r'MEMORY-.*\.md$', filename) is not None
    return listed or memory_variant
def layer_of(text: str):
    """Return the value of the ``layer:`` key in the front matter of *text*.

    Args:
        text: Full contents of a note.

    Returns:
        The first word following ``layer:`` on the same line, or None when
        the text has no front matter (as matched by FM_RE), no ``layer:``
        key, or a ``layer:`` key with no value on its line.
    """
    fm_match = FM_RE.match(text)
    if not fm_match:
        return None
    # Only spaces/tabs may separate the key from its value. With `\s*` the
    # match could run over the line break, so an empty `layer:` followed by
    # e.g. `title: x` was reported as layer "title".
    layer_match = re.search(r'^layer:[ \t]*(\w+)', fm_match.group(1), re.M)
    return layer_match.group(1) if layer_match else None
def build_stem_index(hub: Path) -> dict:
    """Map each folded note stem to the list of notes carrying that stem.

    Walks *hub*, skipping directories named in EXCLUDE_DIRS or starting with
    a dot, and records every ``.md`` file as a path relative to *hub*. A list
    with more than one entry means the stem is ambiguous.
    """
    index = defaultdict(list)
    for dirpath, subdirs, filenames in os.walk(hub):
        # Assign through the slice so os.walk itself skips the pruned dirs.
        subdirs[:] = [
            name for name in subdirs
            if not (name in EXCLUDE_DIRS or name.startswith("."))
        ]
        base = Path(dirpath)
        for filename in filenames:
            if not filename.endswith(".md"):
                continue
            note = base / filename
            index[fold(note.stem)].append(note.relative_to(hub))
    return index
def ci_dir_lookup(dirpath: Path, basename: str, hub: Path):
    """Find ``basename(.md)`` in *dirpath*, ignoring case and accents.

    Args:
        dirpath: Directory to search (not recursive).
        basename: Note name, with or without the ``.md`` extension.
        hub: Vault root; the result is expressed relative to it.

    Returns:
        The matching file as a path relative to *hub*, or None when
        *dirpath* is not a readable directory, nothing matches, the match
        lies outside *hub*, or several files match equally well (the caller
        must not guess between them).
    """
    if not dirpath.is_dir():
        return None
    # Fold before testing the suffix so "Note.MD" is not turned into
    # "note.md.md".
    want = fold(basename)
    if not want.endswith(".md"):
        want += ".md"
    exact_name = basename if basename.lower().endswith(".md") else basename + ".md"
    try:
        matches = sorted(
            entry for entry in dirpath.iterdir()
            if entry.is_file() and fold(entry.name) == want
        )
    except OSError:
        # Unreadable directory: treat as "not found" (best-effort lookup).
        return None
    if not matches:
        return None
    # A byte-for-byte name match beats folded look-alikes; without one, only
    # a unique folded match is accepted.
    chosen = next((entry for entry in matches if entry.name == exact_name), None)
    if chosen is None:
        if len(matches) > 1:
            return None
        chosen = matches[0]
    try:
        return chosen.relative_to(hub)
    except ValueError:
        # The directory is not located under hub.
        return None
def _note_stem(name: str) -> str:
    """Return *name* without a trailing ``.md`` (any case); other dots are kept."""
    return name[:-3] if name.lower().endswith(".md") else name


def resolve(target: str, current: Path, hub: Path, stem_idx: dict):
    """Resolve a wikilink target to a path relative to the current note.

    Lookup order: for targets containing ``/``, the directory relative to the
    current note, then relative to the vault root, then a unique stem in
    *stem_idx*; for bare names, the current note's directory, then a unique
    stem in *stem_idx*.

    Args:
        target: Wikilink target, optionally followed by a ``#``/``^`` anchor.
        current: Path of the note containing the link.
        hub: Vault root.
        stem_idx: Folded stem -> list of hub-relative paths
            (see build_stem_index).

    Returns:
        ``(relpath, anchor)`` with ``relpath`` using ``/`` separators and
        ``anchor`` the raw suffix starting at the first ``#`` or ``^`` (may
        be empty); ``("AMBIG", "")`` when the stem matches several notes;
        None when the target is empty or cannot be found.
    """
    raw = target.strip()
    marker = re.search(r'[#^]', raw)
    cut = marker.start() if marker else len(raw)
    # A trailing backslash is the remains of an escaped pipe (`\|`).
    name = raw[:cut].strip().rstrip("\\").strip()
    anchor = raw[cut:].strip()
    if not name:
        return None

    here = current.parent
    cand = None
    if "/" in name:
        link = Path(name)
        bn = link.name
        # Candidate directories are fully resolved, so the containment test
        # and the lookup must use a resolved hub as well; otherwise a
        # relative or symlinked hub never matches.
        hub_abs = hub.resolve()
        for base in (here, hub):
            td = (base / link.parent).resolve()
            if td == hub_abs or hub_abs in td.parents:
                cand = ci_dir_lookup(td, bn, hub_abs)
            if cand is not None:
                break
        stem_source = bn
    else:
        cand = ci_dir_lookup(here, name, hub)
        stem_source = name

    if cand is None:
        # Strip only ".md": Path.stem would cut "Release 1.2" down to
        # "Release 1" and "image.png" down to "image", which can resolve to
        # an unrelated note.
        lst = stem_idx.get(fold(_note_stem(stem_source)), [])
        if len(lst) == 1:
            cand = lst[0]
        elif len(lst) > 1:
            return ("AMBIG", "")
    if cand is None:
        return None
    relp = os.path.relpath(hub / cand, here).replace("\\", "/")
    return (relp, anchor)
_FENCE_OPEN_RE = re.compile(r'^\s*(`{3,}|~{3,})(.*)$')
_INLINE_CODE_RE = re.compile(r'(`[^`]*`)')


def _fence_opener(line: str):
    """Return ``(char, length)`` if *line* opens a fenced code block, else None."""
    m = _FENCE_OPEN_RE.match(line)
    if not m:
        return None
    run, info = m.group(1), m.group(2)
    # A backtick fence may not carry backticks in its info string; a line
    # such as ```code``` is inline code, not a fence opener.
    if run[0] == "`" and "`" in info:
        return None
    return run[0], len(run)


def _closes_fence(line: str, char: str, length: int) -> bool:
    """Tell whether *line* closes a fence opened with *length* x *char*."""
    stripped = line.strip()
    return len(stripped) >= length and stripped.count(char) == len(stripped)


def convert_file(fp: Path, hub: Path, stem_idx: dict, stats: dict) -> str:
    """Return the text of *fp* with resolvable wikilinks turned into Markdown links.

    The front matter (when the first line is ``---`` and a closing ``---``
    exists), fenced code blocks and inline code spans are copied verbatim.
    Links that are skipped, ambiguous or unresolved are left as ``[[ ]]``.
    Nothing is written to disk.

    Args:
        fp: Note to convert.
        hub: Vault root.
        stem_idx: Folded stem -> list of hub-relative paths.
        stats: Mutated in place: ``"converted"`` and ``"prose"`` are integer
            counters; ``"unresolved"`` and ``"ambiguous"`` are per-target
            counters.

    Returns:
        The converted text.
    """
    content = fp.read_text(encoding="utf-8", errors="replace")
    lines = content.split("\n")

    # Index of the line closing the front matter, or -1 if there is none.
    fm_end = -1
    if lines and lines[0].strip() == "---":
        fm_end = next(
            (i for i in range(1, len(lines)) if lines[i].strip() == "---"),
            -1,
        )

    def repl(mm):
        inner = mm.group(1)
        if "|" in inner:
            tgt, alias = inner.split("|", 1)
            # Inside tables the pipe is escaped as `\|`; drop the backslash.
            tgt = tgt.strip().rstrip("\\").strip()
            alias = alias.strip()
        else:
            tgt, alias = inner.strip(), None
        if tgt in SKIP_PROSE:
            stats["prose"] += 1
            return mm.group(0)
        r = resolve(tgt, fp, hub, stem_idx)
        if r is None:
            stats["unresolved"][tgt] += 1
            return mm.group(0)
        if r[0] == "AMBIG":
            stats["ambiguous"][tgt] += 1
            return mm.group(0)
        relp, anchor = r
        disp = alias if alias else tgt
        url = relp
        if anchor:
            url += "#" + anchor.lstrip("#^")
        # A bare Markdown link destination cannot contain spaces.
        url = url.replace(" ", "%20")
        stats["converted"] += 1
        return f"[{disp}]({url})"

    out = []
    fence = None  # (char, length) of the currently open fence, if any
    for idx, line in enumerate(lines):
        if idx <= fm_end:
            out.append(line)
            continue
        if fence is not None:
            # Only a bare run of the same character, at least as long as the
            # opener, closes the fence; this keeps a ```` block wrapping a
            # ``` example intact.
            if _closes_fence(line, *fence):
                fence = None
            out.append(line)
            continue
        fence = _fence_opener(line)
        if fence is not None:
            out.append(line)
            continue
        parts = _INLINE_CODE_RE.split(line)
        out.append("".join(
            seg if seg.startswith("`") else WIKILINK_RE.sub(repl, seg)
            for seg in parts
        ))
    return "\n".join(out)
def main():
    """Convert wikilinks in every ``layer: wiki`` note of the vault.

    Command-line flags (read from sys.argv):
        --dry-run        report what would change without writing files
        --dir=<path>     vault root (defaults to HUB_DEFAULT)
        --include-index  also process files named index.md

    Exits with status 1 when the vault directory does not exist.
    """
    dry_run = "--dry-run" in sys.argv
    include_index = "--include-index" in sys.argv
    hub = Path(HUB_DEFAULT)
    for arg in sys.argv[1:]:
        if arg.startswith("--dir="):
            hub = Path(arg[len("--dir="):])
    if not hub.is_dir():
        print(f"ERRO: Hub não encontrado em {hub}", file=sys.stderr)
        sys.exit(1)
    # Link resolution compares fully resolved paths against the hub, so the
    # hub itself must be absolute and symlink-free.
    hub = hub.resolve()

    print(f"{'[DRY-RUN] ' if dry_run else ''}A indexar ficheiros…")
    stem_idx = build_stem_index(hub)
    print(f" {len(stem_idx)} stems indexados")

    stats = {"converted": 0, "prose": 0, "unresolved": Counter(), "ambiguous": Counter()}
    files_changed = 0
    for root, dirs, files in os.walk(hub):
        dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS and not d.startswith(".")]
        for f in files:
            if not f.endswith(".md"):
                continue
            if f == "index.md" and not include_index:
                continue
            fp = Path(root) / f
            rel = str(fp.relative_to(hub))
            if is_excluded(rel):
                continue
            try:
                # Strict decoding: a file that is not valid UTF-8 is skipped
                # rather than rewritten with U+FFFD in place of its bytes.
                txt = fp.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError) as exc:
                print(f" [SKIP] {rel}: {exc}", file=sys.stderr)
                continue
            if layer_of(txt) != "wiki":
                continue
            if "[[" not in txt:
                continue
            new = convert_file(fp, hub, stem_idx, stats)
            if new != txt:
                files_changed += 1
                if not dry_run:
                    fp.write_text(new, encoding="utf-8")
                print(f" [{'DRY' if dry_run else 'OK'}] {rel}")

    print("\n=== Resultado ===")
    print(f"Ficheiros alterados: {files_changed}")
    print(f"Wikilinks convertidos: {stats['converted']}")
    print(f"Prosa-sobre-sintaxe saltada: {stats['prose']}")
    amb = sum(stats["ambiguous"].values())
    unr = sum(stats["unresolved"].values())
    if amb:
        print(f"AMBÍGUOS (deixados): {amb} {dict(stats['ambiguous'].most_common(10))}")
    if unr:
        print(f"NÃO-RESOLVÍVEIS (refs partidas, deixados como [[ ]]): {unr} (distintos {len(stats['unresolved'])})")
        for t, n in stats["unresolved"].most_common(30):
            print(f" {n:3} [[{t}]]")


if __name__ == "__main__":
    main()