From 37522386994671d62675e8d23213c42be0117194 Mon Sep 17 00:00:00 2001 From: Emanuel Almeida Date: Sun, 28 Jun 2026 22:39:23 +0100 Subject: [PATCH] =?UTF-8?q?feat(okf-hub):=20Fase=20F=20=E2=80=94=20convert?= =?UTF-8?q?er=20fence-aware=20(path-style/acentos/pipe=20escapado)=20+=20v?= =?UTF-8?q?alidador=20fence-aware?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- okf-hub/okf-classify-dev.py | 22 +++ okf-hub/okf-convert-wikilinks.py | 300 ++++++++++++++++++++----------- okf-hub/okf-normalize.py | 9 + okf-hub/okf-validate.sh | 9 +- 4 files changed, 233 insertions(+), 107 deletions(-) create mode 100644 okf-hub/okf-classify-dev.py mode change 100644 => 100755 okf-hub/okf-convert-wikilinks.py diff --git a/okf-hub/okf-classify-dev.py b/okf-hub/okf-classify-dev.py new file mode 100644 index 0000000..f4d9d4b --- /dev/null +++ b/okf-hub/okf-classify-dev.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +"""layer raw|wiki nos .md do Dev (código: docs=wiki, efémeros/testes/samples=raw).""" +import sys,re +from pathlib import Path +ROOT=Path("/media/ealmeida/Dados/Dev"); DRY="--dry-run" in sys.argv +EXCL_DIR=re.compile(r'/(node_modules|\.git|venv|\.venv|vendor|dist|build|site-packages|__pycache__|target|3rdparty)/') +RAW=re.compile(r'(worklog|/logs?/|sess[aã]o|di[aá]rio|checkup|deep-research|/fontes?/|pesquisa|/tests?/|/__tests__/|/fixtures?/|/examples?/|/samples?/|/test-data/|/\.cache/|CHANGELOG-old|reuni[aã]o)', re.I) +add=0 +for f in ROOT.rglob("*.md"): + s=str(f) + if EXCL_DIR.search(s) or any(p.startswith(".") for p in f.parts): continue + if f.name in ("index.md","log.md"): continue + try: t=f.read_text(encoding="utf-8") + except: continue + if not t.startswith("---"): continue + e=t.find("\n---",3) + if e<0: continue + if re.search(r'^layer:',t[3:e],re.M): continue + layer = "raw" if RAW.search(s) else "wiki" + if not DRY: f.write_text(t[:e]+f"\nlayer: {layer}"+t[e:],encoding="utf-8") + add+=1 +print(f"{'[DRY] ' if DRY else ''}layer adicionado: {add}") diff --git a/okf-hub/okf-convert-wikilinks.py b/okf-hub/okf-convert-wikilinks.py old mode 100644 new mode 100755 index 4fb0464..f281a2b --- a/okf-hub/okf-convert-wikilinks.py +++ b/okf-hub/okf-convert-wikilinks.py @@ -1,153 +1,241 @@ #!/usr/bin/env python3 """ -okf-convert-wikilinks.py — Fase 3: Converte [[wikilinks]] → [texto](path.md) nos index.md -OKF §5: links bundle-relative para navegação entre conceitos +okf-convert-wikilinks.py — Fase F: converte [[wikilinks]] → [texto](path.md) no CORPO +de notas `layer: wiki`, para compatibilidade com agentes CLI (que não resolvem [[ ]]). -Âmbito: apenas ficheiros index.md (navegação) -Corpo de documentos (PROC, QR, etc.) mantém wikilinks — OKF tolera e Obsidian renderiza ambos. +Propriedades de segurança: + - FENCE-AWARE: nunca toca em code fences (``` ~~~) nem inline `code` — preserva + sintaxe bash `[[ -f x ]]` e exemplos de documentação. + - Resolução robusta: mesmo-directório → path-style (relativo, depois vault-root, + depois basename único) → stem global único. Acentos normalizados (NFKD), pipe + escapado `\\|` tratado, anchors `#`/`^` preservados no URL. + - Conservador: alvos ambíguos (stem repetido) ou não encontrados ficam [[ ]] e são + reportados — NUNCA se adivinha um destino. + - Exclusões alinhadas com okf-validate.sh (MEMORY/CLAUDE/AGENTS, 90-Templates, etc). -Uso: - python3 okf-convert-wikilinks.py [--dry-run] [--dir /path/to/Hub] - -Criado: 28-06-2026 +Uso: python3 okf-convert-wikilinks.py [--dry-run] [--dir=/path/Hub] [--include-index] """ - import os import re import sys +import unicodedata from pathlib import Path +from collections import defaultdict, Counter HUB_DEFAULT = "/media/ealmeida/Dados/Hub" -EXCLUDE_DIRS = {".stversions", "node_modules", ".git", ".obsidian", ".trash"} +EXCLUDE_DIRS = { + ".stversions", "node_modules", ".git", ".obsidian", ".trash", ".ijfw", "ijfw", + ".github", ".wayland", ".hermes", ".vscode", ".cursor", ".gstack", + "_templates", "99-Arquivo", "90-Templates", +} +EXCLUDE_FILES = {"MEMORY.md", "CLAUDE.md", "GEMINI.md", "AGENTS.md", "copilot-instructions.md"} -# Padrão wikilink: [[NomeFicheiro]] ou [[NomeFicheiro|Alias]] -WIKILINK_RE = re.compile(r'\[\[([^\]|]+)(?:\|([^\]]+))?\]\]') +WIKILINK_RE = re.compile(r'\[\[([^\]]+?)\]\]') +FENCE_RE = re.compile(r'^(\s*)(```|~~~)') +FM_RE = re.compile(r'^---\n(.*?)\n---', re.DOTALL) +SKIP_PROSE = {"", " ", "...", "wikilinks", "Wikilinks"} -def build_file_index(hub: Path) -> dict: - """Constrói índice nome→path para resolução de wikilinks.""" - index = {} # stem → Path relativo ao hub +def fold(s: str) -> str: + return "".join(c for c in unicodedata.normalize("NFKD", s) if not unicodedata.combining(c)).lower() + + +def is_excluded(rel: str) -> bool: + base = Path(rel).name + if base in EXCLUDE_FILES: + return True + if re.match(r'MEMORY-.*\.md$', base): + return True + return False + + +def layer_of(text: str): + m = FM_RE.match(text) + if not m: + return None + lm = re.search(r'^layer:\s*(\w+)', m.group(1), re.M) + return lm.group(1) if lm else None + + +def build_stem_index(hub: Path) -> dict: + """folded stem → [rel Path, ...] (lista para detectar ambiguidade).""" + idx = defaultdict(list) for root, dirs, files in os.walk(hub): dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS and not d.startswith(".")] - for fname in files: - if fname.endswith(".md"): - fp = Path(root) / fname - stem = fp.stem.lower() - rel = fp.relative_to(hub) - # Guardar o primeiro match (mais provável no vault activo) - if stem not in index: - index[stem] = rel - # Também indexar o nome completo sem extensão - full_name = fname.lower() - if full_name not in index: - index[full_name] = rel - return index + for f in files: + if f.endswith(".md"): + fp = Path(root) / f + idx[fold(fp.stem)].append(fp.relative_to(hub)) + return idx -def resolve_wikilink(target: str, current_file: Path, file_index: dict, hub: Path) -> str: - """Resolve [[target]] para um caminho relativo ao ficheiro actual.""" - # Limpar o target (remover ^anchor, #heading, etc.) - target_clean = re.split(r'[#^]', target)[0].strip() - target_lower = target_clean.lower() - target_with_ext = target_lower + ".md" if not target_lower.endswith(".md") else target_lower - - # Tentar resolver - resolved = file_index.get(target_with_ext) or file_index.get(target_lower) - - if resolved: - # Calcular path relativo a partir do directório do ficheiro actual - try: - rel_path = os.path.relpath(hub / resolved, current_file.parent) - return rel_path.replace("\\", "/") - except Exception: - return str(resolved) +def ci_dir_lookup(dirpath: Path, basename: str, hub: Path): + """Procura case-insensitive por basename(.md) em dirpath. Devolve rel Path ou None.""" + if not dirpath.exists(): + return None + want = fold(basename if basename.endswith(".md") else basename + ".md") + try: + for e in dirpath.iterdir(): + if e.is_file() and fold(e.name) == want: + return e.relative_to(hub) + except Exception: + pass return None -def convert_wikilinks_in_file(filepath: Path, file_index: dict, hub: Path, dry_run: bool) -> dict: - """Converte wikilinks no ficheiro. Retorna estatísticas.""" - result = {"file": str(filepath.relative_to(hub)), "converted": 0, "unresolved": [], "action": "skip"} - - try: - content = filepath.read_text(encoding="utf-8") - except Exception as e: - result["action"] = "error" - result["error"] = str(e) - return result - - if "[[" not in content: - result["action"] = "no_wikilinks" - return result - - def replace_wikilink(m): - target = m.group(1) - alias = m.group(2) - display = alias if alias else target - - resolved_path = resolve_wikilink(target, filepath, file_index, hub) - if resolved_path: - result["converted"] += 1 - return f"[{display}]({resolved_path})" - else: - # Manter como wikilink se não resolvível - result["unresolved"].append(target) - return m.group(0) - - new_content = WIKILINK_RE.sub(replace_wikilink, content) - - if new_content != content: - result["action"] = "converted" - if not dry_run: - filepath.write_text(new_content, encoding="utf-8") +def resolve(target: str, current: Path, hub: Path, stem_idx: dict): + """Devolve (relpath, anchor) ou None. 'AMBIG' como relpath sinaliza ambiguidade.""" + raw = target.strip() + split = re.split(r'(?=[#^])', raw, maxsplit=1) + name = split[0].strip().rstrip("\\").strip() + anchor = raw[len(split[0]):].strip() if len(split) > 1 else "" + if not name: + return None + cand = None + if "/" in name: + bn = Path(name).name + td = (current.parent / Path(name).parent).resolve() + if td == hub or hub in td.parents: + cand = ci_dir_lookup(td, bn, hub) + if cand is None: + td2 = (hub / Path(name).parent).resolve() + if td2 == hub or hub in td2.parents: + cand = ci_dir_lookup(td2, bn, hub) + if cand is None: + lst = stem_idx.get(fold(Path(bn).stem), []) + if len(lst) == 1: + cand = lst[0] else: - result["action"] = "no_changes" + cand = ci_dir_lookup(current.parent, name, hub) + if cand is None: + lst = stem_idx.get(fold(Path(name).stem), []) + if len(lst) == 1: + cand = lst[0] + elif len(lst) > 1: + return ("AMBIG", "") + if cand is None: + return None + relp = os.path.relpath(hub / cand, current.parent).replace("\\", "/") + return (relp, anchor) - return result + +def convert_file(fp: Path, hub: Path, stem_idx: dict, stats: dict) -> str: + content = fp.read_text(encoding="utf-8", errors="replace") + lines = content.split("\n") + out = [] + in_fence = False + fmark = None + fm_end = -1 + if lines and lines[0].strip() == "---": + for i in range(1, len(lines)): + if lines[i].strip() == "---": + fm_end = i + break + for idx, line in enumerate(lines): + if idx <= fm_end: + out.append(line) + continue + fm = FENCE_RE.match(line) + if not in_fence and fm: + in_fence = True + fmark = fm.group(2) + out.append(line) + continue + if in_fence: + if line.strip().startswith(fmark): + in_fence = False + out.append(line) + continue + parts = re.split(r'(`[^`]*`)', line) + for j, seg in enumerate(parts): + if seg.startswith("`"): + continue + + def repl(mm): + inner = mm.group(1) + if "|" in inner: + tgt, alias = inner.split("|", 1) + tgt = tgt.strip().rstrip("\\").strip() + alias = alias.strip() + else: + tgt, alias = inner.strip(), None + if tgt in SKIP_PROSE: + stats["prose"] += 1 + return mm.group(0) + r = resolve(tgt, fp, hub, stem_idx) + if r is None: + stats["unresolved"][tgt] += 1 + return mm.group(0) + if r[0] == "AMBIG": + stats["ambiguous"][tgt] += 1 + return mm.group(0) + relp, anchor = r + disp = alias if alias else tgt + url = relp + ("#" + anchor.lstrip("#^") if anchor else "") + stats["converted"] += 1 + return f"[{disp}]({url})" + + parts[j] = WIKILINK_RE.sub(repl, seg) + out.append("".join(parts)) + return "\n".join(out) def main(): dry_run = "--dry-run" in sys.argv + include_index = "--include-index" in sys.argv hub = Path(HUB_DEFAULT) for arg in sys.argv[1:]: if arg.startswith("--dir="): hub = Path(arg[6:]) - if not hub.exists(): print(f"ERRO: Hub não encontrado em {hub}", file=sys.stderr) sys.exit(1) - print(f"{'[DRY-RUN] ' if dry_run else ''}A construir índice de ficheiros…") - file_index = build_file_index(hub) - print(f" {len(file_index)} ficheiros indexados") + print(f"{'[DRY-RUN] ' if dry_run else ''}A indexar ficheiros…") + stem_idx = build_stem_index(hub) + print(f" {len(stem_idx)} stems indexados") - print(f"A converter wikilinks nos index.md…") - total_converted = 0 - total_unresolved = [] + stats = {"converted": 0, "prose": 0, "unresolved": Counter(), "ambiguous": Counter()} files_changed = 0 - for root, dirs, files in os.walk(hub): dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS and not d.startswith(".")] - for fname in files: - if fname != "index.md": + for f in files: + if not f.endswith(".md"): continue - filepath = Path(root) / fname - result = convert_wikilinks_in_file(filepath, file_index, hub, dry_run) - - if result["action"] == "converted": + if f == "index.md" and not include_index: + continue + rel = str((Path(root) / f).relative_to(hub)) + if is_excluded(rel): + continue + fp = Path(root) / f + try: + txt = fp.read_text(encoding="utf-8", errors="replace") + except Exception: + continue + if layer_of(txt) != "wiki": + continue + if "[[" not in txt: + continue + new = convert_file(fp, hub, stem_idx, stats) + if new != txt: files_changed += 1 - total_converted += result["converted"] - total_unresolved.extend(result["unresolved"]) - print(f" [OK] {result['file']}: {result['converted']} convertidos" - + (f", {len(result['unresolved'])} não resolvidos" if result["unresolved"] else "")) - elif result["action"] == "error": - print(f" [ERRO] {result['file']}: {result.get('error')}") + if not dry_run: + fp.write_text(new, encoding="utf-8") + print(f" [{'DRY' if dry_run else 'OK'}] {rel}") - print(f"\n=== Resultado ===") + print("\n=== Resultado ===") print(f"Ficheiros alterados: {files_changed}") - print(f"Wikilinks convertidos: {total_converted}") - if total_unresolved: - print(f"Não resolvidos ({len(total_unresolved)}): {', '.join(set(total_unresolved))[:200]}") + print(f"Wikilinks convertidos: {stats['converted']}") + print(f"Prosa-sobre-sintaxe saltada: {stats['prose']}") + amb = sum(stats["ambiguous"].values()) + unr = sum(stats["unresolved"].values()) + if amb: + print(f"AMBÍGUOS (deixados): {amb} → {dict(stats['ambiguous'].most_common(10))}") + if unr: + print(f"NÃO-RESOLVÍVEIS (refs partidas, deixados como [[ ]]): {unr} (distintos {len(stats['unresolved'])})") + for t, n in stats["unresolved"].most_common(30): + print(f" {n:3} [[{t}]]") if __name__ == "__main__": diff --git a/okf-hub/okf-normalize.py b/okf-hub/okf-normalize.py index 7124566..5295ecf 100644 --- a/okf-hub/okf-normalize.py +++ b/okf-hub/okf-normalize.py @@ -27,6 +27,15 @@ EXCLUDE_DIRS = { ".obsidian", ".trash", "99-Arquivo", + # dependências / builds / caches (relevante p/ Dev) — dot-dirs já são saltados + "venv", + "vendor", + "dist", + "build", + "site-packages", + "target", + "__pycache__", + "3rdparty", } # Ficheiros reservados OKF — sem frontmatter obrigatório diff --git a/okf-hub/okf-validate.sh b/okf-hub/okf-validate.sh index a80274f..388176e 100755 --- a/okf-hub/okf-validate.sh +++ b/okf-hub/okf-validate.sh @@ -213,7 +213,14 @@ validate_file() { # Check for wikilinks in body (wiki layer) if echo "$content" | grep -q '^layer: wiki'; then local body - body="$(sed -n '/^---$/,/^---$/d; p' "$f")" + body="$(awk ' + NR==1 && $0=="---" {infm=1; next} + infm && $0=="---" {infm=0; next} + infm {next} + /^[[:space:]]*```/ || /^[[:space:]]*~~~/ {infence=!infence; next} + infence {next} + {gsub(/`[^`]*`/,""); print} + ' "$f")" if echo "$body" | grep -q '\[\['; then echo -e "${YEL}WARN${NC} $rel [wiki]: contains wikilinks [[ ]] — convert to [text](path)" ((WARNINGS++)) || true