init: scripts diversos (crawlers, conversores, scrapers)
This commit is contained in:
967
translate-wp-plugin/wp-translate-ptpt.py
Executable file
967
translate-wp-plugin/wp-translate-ptpt.py
Executable file
@@ -0,0 +1,967 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
wp-translate-ptpt.py
|
||||
Sistema eficiente de traduções WordPress PT-PT.
|
||||
|
||||
Author: Descomplicar® Crescimento Digital
|
||||
Date: 2026-02-23
|
||||
Version: 1.0.0
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import json
|
||||
import time
|
||||
import sqlite3
|
||||
import hashlib
|
||||
import argparse
|
||||
import subprocess
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
from dataclasses import dataclass, field
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.error import URLError
|
||||
|
||||
# Version
|
||||
__version__ = "1.0.0"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Data Classes
|
||||
# =============================================================================
|
||||
|
||||
@dataclass
class PoEntry:
    """A single message entry of a gettext ``.po`` file.

    String fields hold the text found between the double quotes of the
    corresponding ``.po`` lines; continuation lines are concatenated by the
    parser. Line-number fields are 0-based indexes into the parsed file.
    """

    # Source string taken from the ``msgid`` line(s).
    msgid: str = ""
    # Plural source string taken from ``msgid_plural``; empty for singular entries.
    msgid_plural: str = ""
    # Translation taken from a plain ``msgstr`` line; empty when untranslated.
    msgstr: str = ""
    # Plural translations keyed by the N of ``msgstr[N]``.
    msgstr_plural: Dict[int, str] = field(default_factory=dict)
    # Raw ``#`` lines that precede the entry, kept verbatim for re-emission.
    comments: List[str] = field(default_factory=list)
    # Index of the last comment line seen for this entry.
    line_start: int = 0
    # Index of the ``msgid`` line.
    msgid_line: int = 0
    # Index of the last ``msgstr`` / ``msgstr[N]`` line.
    msgstr_line: int = 0
|
||||
|
||||
|
||||
@dataclass
class ProcessResult:
    """Outcome of running the translation pipeline over one ``.po`` file."""

    # True when the file was parsed, processed and written back.
    success: bool
    # Number of entries parsed from the file.
    total: int = 0
    # Number of entries that received a translation during this run.
    translated: int = 0
    # Number of translations served from the cache.
    cached: int = 0
    # Number of literal brand translations that were corrected.
    brands_fixed: int = 0
    # Per-entry problems: dicts carrying 'msgid' plus 'errors' or 'exception',
    # or a single 'warning' key for non-fatal compilation issues.
    errors: List[Dict] = field(default_factory=list)
    # Message of the exception that aborted processing (only when not success).
    error: str = ""
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# PT-BR to PT-PT Conversion Rules
|
||||
# =============================================================================
|
||||
|
||||
# Regex pattern -> PT-PT replacement. Rules are applied in insertion order and
# matched case-insensitively, so ORDER MATTERS where patterns overlap.
PTBR_TO_PTPT = {
    # Verbs
    r'\bsalvar\b': 'guardar',
    r'\bsalvo\b': 'guardado',
    r'\bsalva\b': 'guardada',
    r'\bdeletar\b': 'eliminar',
    r'\bdeletado\b': 'eliminado',
    r'\bdeletada\b': 'eliminada',
    r'\bgerenciar\b': 'gerir',
    r'\bgerenciamento\b': 'gestão',
    r'\bgerenciado\b': 'gerido',
    r'\bhabilitar\b': 'activar',
    r'\bhabilitado\b': 'activado',
    r'\bhabilitada\b': 'activada',
    r'\bdesabilitar\b': 'desactivar',
    r'\bdesabilitado\b': 'desactivado',
    r'\bdesabilitada\b': 'desactivada',
    r'\bacessar\b': 'aceder',
    r'\bacessado\b': 'acedido',
    r'\bbaixar\b': 'transferir',
    r'\bcadastrar\b': 'registar',
    r'\bcadastro\b': 'registo',
    r'\bcadastrado\b': 'registado',
    r'\bcompartilhar\b': 'partilhar',
    r'\bcompartilhado\b': 'partilhado',
    # The replacement itself contains "visualizar"; the lookbehind stops an
    # already converted "pré-visualizar" from gaining another "pré-" each run.
    r'(?<!-)\bvisualizar\b': 'pré-visualizar',

    # Nouns
    r'\bsenha\b': 'palavra-passe',
    r'\bsenhas\b': 'palavras-passe',
    r'\barquivo\b': 'ficheiro',
    r'\barquivos\b': 'ficheiros',
    r'\btela\b': 'ecrã',
    r'\btelas\b': 'ecrãs',
    r'\bcelular\b': 'telemóvel',
    r'\busuário\b': 'utilizador',
    r'\busuários\b': 'utilizadores',
    r'\bconfiguração\b': 'definição',
    r'\bconfigurações\b': 'definições',
    r'\blixeira\b': 'lixo',
    r'\bequipe\b': 'equipa',

    # Gerund to infinitive. These must come BEFORE the orthography stems:
    # the stem rule for "atualiz" would otherwise rewrite "atualizando" to
    # "actualizando" first and the gerund rule could never match.
    r'\bprocessando\b': 'a processar',
    r'\bcarregando\b': 'a carregar',
    r'\batualizando\b': 'a actualizar',
    r'\bgerando\b': 'a gerar',
    r'\bsalvando\b': 'a guardar',
    r'\bdeletando\b': 'a eliminar',

    # Orthography (silent consonants)
    r'\batualiz': 'actualiz',
    r'\bfatura': 'factura',
    r'\bselecion': 'seleccion',
    r'\bação\b': 'acção',
    r'\bações\b': 'acções',
    r'\bprojeto\b': 'projecto',
    r'\bprojetos\b': 'projectos',
    r'\bdireção\b': 'direcção',
    r'\bproteção\b': 'protecção',
    r'\bcoleção\b': 'colecção',
    r'\bcorreção\b': 'correcção',
    r'\bótimo\b': 'óptimo',
}


def _match_case(replacement: str, matched: str) -> str:
    """Return *replacement* re-cased to follow the casing of *matched*.

    All-caps matches give an all-caps replacement, a capitalised match gives
    a capitalised replacement, anything else leaves the replacement as is.
    """
    if len(matched) > 1 and matched.isupper():
        return replacement.upper()
    if matched[:1].isupper():
        return replacement[:1].upper() + replacement[1:]
    return replacement


def apply_ptbr_fixes(text: str) -> Tuple[str, int]:
    """Apply the PT-BR to PT-PT conversion rules to *text*.

    Matching is case-insensitive and the casing of each matched word is
    carried over to its replacement, so a sentence-initial "Salvar" becomes
    "Guardar" rather than "guardar".

    Args:
        text: Translated string to normalise.

    Returns:
        A ``(fixed_text, count)`` tuple where ``count`` is the number of
        rules that changed the text (not the number of replaced words).
    """
    fixed = text
    count = 0

    for pattern, replacement in PTBR_TO_PTPT.items():
        # `replacement` is bound as a default argument so the callback does
        # not depend on the loop variable after the iteration moves on.
        fixed, hits = re.subn(
            pattern,
            lambda match, repl=replacement: _match_case(repl, match.group(0)),
            fixed,
            flags=re.IGNORECASE,
        )
        if hits:
            count += 1

    return fixed, count
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Seed Brands
|
||||
# =============================================================================
|
||||
|
||||
# Brand names inserted into the ``brands`` table by ``seed_brands_db``.
SEED_BRANDS = [
    # Plugins from the current library
    "Fluent Forms", "FluentCRM", "Fluent SMTP", "Fluent Booking", "FluentCampaign Pro",
    "Fluent Support",
    "Rank Math", "Rank Math Pro",
    "Element Pack", "Element Pack Lite",
    "Elementor", "Elementor Pro",
    "ElementsKit", "ElementsKit Lite",
    "Happy Addons", "Happy Elementor Addons",
    "WooCommerce", "WPForms", "WPForms Lite", "Wordfence",
    "UpdraftPlus", "Real Cookie Banner", "Loco Translate",
    "WP Fastest Cache", "Forminator", "Bit Integrations", "Bit Social", "Bit Pi",
    "KiviCare", "KiviCare Pro", "Astra", "Branda", "TablePress",
    "AI Engine", "BetterDocs", "Cookie Notice",
    "Docket Cache", "Envato Elements", "Email Candy Pro",
    "Eventin Pro", "Fast Indexing API",
    "FileBird", "FileBird Document Library",
    "GUM Elementor Addon", "HappyFiles Pro",
    "Insert Headers and Footers",
    "Iqonic Extensions", "Iqonic Layouts",
    "JEG Elementor Kit", "Jet Engine",
    "JWT Authentication",
    "LoginPress", "MainWP BackWPup Extension",
    "MetForm", "PowerPack Elements",
    "Print My Blog", "Product Import Export for WooCommerce",
    "Shipper", "SkyBoot Custom Icons",
    "Testimonial Pro", "Ultimate Branding",
    "Uncanny Automator",
    "WebP Express", "WholesaleX",
    "WooCommerce Dashboard Stats", "Woo Save Abandoned Carts",
    "WPConsent", "WP Defender", "WP Event Solution",
    "WP Hummingbird", "WP Mail SMTP", "WPMU DEV SEO",
    "WPMU DEV Updates", "WP Optimize", "WP Rocket",
    "WP Security Audit Log", "WP Smush Pro",
    "WPFunnels", "WPFunnels Pro",

    # Common services
    "Google", "Facebook", "Instagram", "Twitter", "LinkedIn",
    "PayPal", "Stripe", "Mailchimp", "Zapier", "HubSpot",
    "OpenAI", "ChatGPT", "YouTube", "TikTok",
    "Gmail", "Outlook",

    # WordPress core
    "WordPress", "Gutenberg", "Jetpack",
]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CacheManager
|
||||
# =============================================================================
|
||||
|
||||
class CacheManager:
    """SQLite-backed store for brands, cached translations and corrections.

    The connection is opened in the constructor and must be released with
    ``close()``; the instance can also be used as a context manager, which
    closes the connection on exit.
    """

    # Schema statements. Each one is idempotent (CREATE TABLE IF NOT EXISTS),
    # so running them against an existing database is harmless.
    _SCHEMA = (
        """
        CREATE TABLE IF NOT EXISTS brands (
            id INTEGER PRIMARY KEY,
            name TEXT UNIQUE NOT NULL,
            variations TEXT,
            auto_detected BOOLEAN DEFAULT 0,
            confidence_score REAL DEFAULT 1.0,
            last_seen TIMESTAMP,
            plugin_slug TEXT
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS translations (
            msgid_hash TEXT PRIMARY KEY,
            msgid TEXT,
            msgstr TEXT,
            plugin_name TEXT,
            validated BOOLEAN DEFAULT 0,
            timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """,
        """
        CREATE TABLE IF NOT EXISTS corrections (
            id INTEGER PRIMARY KEY,
            original TEXT,
            corrected TEXT,
            rule_applied TEXT,
            plugin_name TEXT,
            timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """,
    )

    def __init__(self, db_path: str):
        """Open (or create) the database at *db_path* and ensure the schema.

        Raises:
            sqlite3.Error: If the database cannot be opened or initialised.
        """
        self.conn = sqlite3.connect(db_path)
        try:
            self._init_db()
        except Exception:
            # Do not leak the handle when schema creation fails.
            self.conn.close()
            raise

    def __enter__(self) -> "CacheManager":
        """Return the manager itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the connection when the ``with`` block ends."""
        self.close()

    @staticmethod
    def _hash_msgid(msgid: str) -> str:
        """Return the MD5 hex digest used as primary key for *msgid*.

        MD5 only serves as a compact lookup key here, not as a security measure.
        """
        return hashlib.md5(msgid.encode()).hexdigest()

    def _init_db(self):
        """Create the brands, translations and corrections tables if missing."""
        for statement in self._SCHEMA:
            self.conn.execute(statement)
        self.conn.commit()

    def get_cached_translation(self, msgid: str) -> Optional[str]:
        """Return the validated cached translation for *msgid*.

        Rows stored with ``validated = 0`` are ignored.

        Returns:
            The cached ``msgstr``, or ``None`` when there is no validated row.
        """
        row = self.conn.execute(
            "SELECT msgstr FROM translations WHERE msgid_hash = ? AND validated = 1",
            (self._hash_msgid(msgid),)
        ).fetchone()
        return row[0] if row else None

    def save_translation(self, msgid: str, msgstr: str, plugin_name: str, validated: bool = False):
        """Insert or replace the cached translation for *msgid* and commit.

        Args:
            msgid: Source string; its hash is the row key.
            msgstr: Translation to store.
            plugin_name: Name recorded alongside the translation.
            validated: Whether the row may later be served by
                ``get_cached_translation``.
        """
        self.conn.execute(
            """INSERT OR REPLACE INTO translations
            (msgid_hash, msgid, msgstr, plugin_name, validated, timestamp)
            VALUES (?, ?, ?, ?, ?, CURRENT_TIMESTAMP)""",
            (self._hash_msgid(msgid), msgid, msgstr, plugin_name, int(bool(validated)))
        )
        self.conn.commit()

    def log_correction(self, original: str, corrected: str, rule: str, plugin_name: str):
        """Append one row to the corrections history and commit."""
        self.conn.execute(
            """INSERT INTO corrections (original, corrected, rule_applied, plugin_name, timestamp)
            VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)""",
            (original, corrected, rule, plugin_name)
        )
        self.conn.commit()

    def close(self):
        """Close the database connection."""
        self.conn.close()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# BrandProtector
|
||||
# =============================================================================
|
||||
|
||||
class BrandProtector:
    """Detects brand names and protects them from literal translation."""

    # Correct brand name -> literal (wrong) translations seen in the wild.
    LITERAL_TRANSLATIONS = {
        "Fluent Forms": ["Formulários Fluentes", "Formas Fluentes"],
        "FluentCRM": ["CRM Fluente"],
        "Fluent SMTP": ["SMTP Fluente"],
        "Fluent Booking": ["Reserva Fluente"],
        "Rank Math": ["Matemática de Classificação", "SEO Matemática"],
        "Element Pack": ["Pacote de Elementos"],
        "ElementsKit": ["Kit de Elementos"],
        "Happy Addons": ["Complementos Felizes"],
        "Happy Elementor Addons": ["Complementos Elementor Felizes"],
        "Real Cookie Banner": ["Banner de Biscoito Real", "Bandeira de Biscoito Real"],
        "Cookie Banner": ["Banner de Biscoito"],
        "Loco Translate": ["Loco Traduzir"],
        "WP Fastest Cache": ["Cache Mais Rápido WP"],
        "Bit Integrations": ["Integrações Bit"],
        "Bit Social": ["Social Bit"],
        "Wordfence": ["Cerca de Palavras"],
    }

    # Placeholder as it may come back from a translation engine: lower-cased,
    # with stray spaces, or with a single underscore on either side.
    _PLACEHOLDER_RE = re.compile(r'_{1,2}\s*BRAND\s*_\s*(\d+)\s*_{1,2}', re.IGNORECASE)

    def __init__(self, db_path: str):
        """Open the brand database, unless *db_path* is ``":memory:"``.

        With ``":memory:"`` no database is opened and only the brands listed
        in ``LITERAL_TRANSLATIONS`` are known.
        """
        self.db_path = db_path
        self.cache = CacheManager(db_path) if db_path != ":memory:" else None
        self.known_brands = self._load_brands()

    def close(self) -> None:
        """Close the database connection owned by this instance, if any."""
        if self.cache:
            self.cache.close()
            self.cache = None

    def _load_brands(self) -> List[str]:
        """Return the brand names to protect.

        Names stored in the database come first, followed by every
        ``LITERAL_TRANSLATIONS`` key the database does not contain, so a brand
        with known mistranslations is always protected.
        """
        literal = list(self.LITERAL_TRANSLATIONS)
        if not self.cache:
            return literal

        rows = self.cache.conn.execute("SELECT name FROM brands").fetchall()
        brands = [row[0] for row in rows]
        already_known = set(brands)
        brands.extend(name for name in literal if name not in already_known)
        return brands

    def detect_brand_patterns(self, text: str) -> List[str]:
        """Return candidate brand names found in *text* using heuristics.

        Four patterns are tried: CamelCase words, acronyms of two or more
        capitals, words followed by ® or ™, and capitalised word runs that
        have whitespace on both sides. The result is de-duplicated and sorted
        so that it is deterministic.
        """
        candidates = []

        # Pattern 1: CamelCase
        candidates.extend(re.findall(r'\b[A-Z][a-z]+(?:[A-Z][a-z]+)+\b', text))
        # Pattern 2: Acronyms
        candidates.extend(re.findall(r'\b[A-Z]{2,}\b', text))
        # Pattern 3: Trademarks
        candidates.extend(re.findall(r'(\w+(?:\s+\w+)?)\s*[®™]', text))
        # Pattern 4: Mid-sentence capitals
        candidates.extend(
            re.findall(r'(?<=\s)[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?=\s)', text)
        )

        return sorted(set(candidates))

    def calculate_confidence(self, brand: str, occurrences: int = 1) -> float:
        """Return a confidence score in ``[0.0, 1.0]`` for a detected brand.

        The score adds 0.4 for CamelCase, 0.3 when the name contains ® or ™,
        and 0.1 per five occurrences (capped at 0.3).
        """
        score = 0.0

        if re.match(r'^[A-Z][a-z]+(?:[A-Z][a-z]+)+$', brand):
            score += 0.4

        if '®' in brand or '™' in brand:
            score += 0.3

        score += min(occurrences / 5 * 0.1, 0.3)

        return min(score, 1.0)

    def protect_brands(self, text: str) -> Tuple[str, Dict[str, str]]:
        """Replace known brand names in *text* with ``__BRAND_<i>__`` tokens.

        ``<i>`` is the brand's index in ``known_brands``. Longer names are
        substituted first so that "Elementor Pro" is protected as a unit
        instead of being split by the shorter "Elementor".

        Returns:
            ``(protected_text, placeholders)`` where ``placeholders`` maps
            each inserted token to the brand it stands for.
        """
        placeholders = {}
        protected_text = text

        # sorted() is stable, so names of equal length keep their list order.
        longest_first = sorted(
            enumerate(self.known_brands),
            key=lambda item: len(item[1]),
            reverse=True,
        )
        for index, brand in longest_first:
            # An empty name is a substring of everything; replacing it would
            # scatter placeholders between all characters.
            if brand and brand in protected_text:
                placeholder = f"__BRAND_{index}__"
                placeholders[placeholder] = brand
                protected_text = protected_text.replace(brand, placeholder)

        return protected_text, placeholders

    def restore_brands(self, text: str, placeholders: Dict[str, str]) -> str:
        """Put brand names back in place of their placeholders.

        Exact tokens are replaced first. A second, tolerant pass recovers
        tokens whose case, spacing or underscore count was altered; tokens
        with an index missing from *placeholders* are left untouched.
        """
        restored_text = text

        for placeholder, brand in placeholders.items():
            restored_text = restored_text.replace(placeholder, brand)

        def _recover(match):
            return placeholders.get(f"__BRAND_{match.group(1)}__", match.group(0))

        return self._PLACEHOLDER_RE.sub(_recover, restored_text)

    def fix_translated_brands(self, msgid: str, msgstr: str,
                              plugin_name: str = "unknown") -> Tuple[str, List[str]]:
        """Replace literal brand translations in *msgstr* with the real name.

        Args:
            msgid: Source string (currently unused, kept for the interface).
            msgstr: Translation to correct.
            plugin_name: Name recorded with each logged correction.

        Returns:
            ``(fixed_msgstr, corrections)`` where every correction is a
            ``"wrong → right"`` string. When a database is attached, each
            correction is also written to its history table.
        """
        corrections = []
        fixed_msgstr = msgstr

        for correct_name, wrong_variations in self.LITERAL_TRANSLATIONS.items():
            for wrong in wrong_variations:
                if wrong not in fixed_msgstr:
                    continue

                fixed_msgstr = fixed_msgstr.replace(wrong, correct_name)
                corrections.append(f"{wrong} → {correct_name}")

                if self.cache:
                    self.cache.log_correction(
                        original=wrong,
                        corrected=correct_name,
                        rule="literal_translation",
                        plugin_name=plugin_name
                    )

        return fixed_msgstr, corrections
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# QualityValidator
|
||||
# =============================================================================
|
||||
|
||||
class QualityValidator:
    """Checks translated entries for structural and regional problems."""

    # Terms treated as Brazilian Portuguese when they appear in a translation.
    PTBR_TERMS = [
        'você', 'vocês', 'gerenciar', 'habilitar', 'desabilitar',
        'deletar', 'salvar', 'arquivo', 'tela', 'senha', 'celular',
        'usuário', 'configuração', 'cadastro', 'lixeira', 'gerenciamento',
        'visualizar', 'acessar', 'baixar', 'compartilhar'
    ]

    # printf-style (%s, %1$d, ...), {name} / {{name}} and [name] placeholders.
    _PLACEHOLDER_PATTERN = r'%(?:\d+\$)?[sdifuxX]|\{\{?\w+\}?\}|\[\w+\]'
    # Anything that looks like an HTML/XML tag.
    _TAG_PATTERN = r'<[^>]+>'

    def validate_entry(self, entry: "PoEntry") -> Tuple[bool, List[str]]:
        """Validate one entry and return ``(is_valid, error_codes)``.

        Placeholder, HTML-tag and PT-BR checks compare ``msgid`` against the
        singular ``msgstr`` only; plural forms are only inspected to decide
        whether the entry is untranslated.

        Possible codes: ``PLACEHOLDER_MISMATCH``, ``HTML_TAG_MISMATCH``,
        ``EMPTY_TRANSLATION`` and ``PTBR_TERMS: <comma separated terms>``.
        """
        errors = []

        # 1. Check placeholders
        if not self._check_placeholders(entry.msgid, entry.msgstr):
            errors.append("PLACEHOLDER_MISMATCH")

        # 2. Check HTML tags
        if not self._check_html_tags(entry.msgid, entry.msgstr):
            errors.append("HTML_TAG_MISMATCH")

        # 3. Check for empty translations. A plural entry whose forms are all
        #    empty strings is untranslated too, even though its dict is truthy.
        has_plural_text = any(entry.msgstr_plural.values())
        if entry.msgid and not entry.msgstr and not has_plural_text:
            errors.append("EMPTY_TRANSLATION")

        # 4. Check for PT-BR terms
        ptbr_terms = self._detect_ptbr(entry.msgstr)
        if ptbr_terms:
            errors.append(f"PTBR_TERMS: {', '.join(ptbr_terms)}")

        return not errors, errors

    def _check_placeholders(self, msgid: str, msgstr: str) -> bool:
        """Return True when *msgstr* has the same placeholders as *msgid*.

        Order is ignored; an empty *msgstr* always passes.
        """
        if not msgstr:
            return True

        expected = sorted(re.findall(self._PLACEHOLDER_PATTERN, msgid))
        actual = sorted(re.findall(self._PLACEHOLDER_PATTERN, msgstr))
        return expected == actual

    def _check_html_tags(self, msgid: str, msgstr: str) -> bool:
        """Return True when *msgstr* has the same HTML tags as *msgid*.

        Order is ignored and an empty *msgstr* always passes. A translation
        that only differs by ``<forte>`` in place of ``<strong>`` is accepted;
        the string itself is not modified here.
        """
        if not msgstr:
            return True

        expected = sorted(re.findall(self._TAG_PATTERN, msgid))
        if expected == sorted(re.findall(self._TAG_PATTERN, msgstr)):
            return True

        # Tolerate the common machine-translation slip <strong> -> <forte>.
        repaired = msgstr.replace('<forte>', '<strong>').replace('</forte>', '</strong>')
        return expected == sorted(re.findall(self._TAG_PATTERN, repaired))

    def _detect_ptbr(self, text: str) -> List[str]:
        """Return the ``PTBR_TERMS`` found in *text* (case-insensitive).

        A term directly preceded by a hyphen is not reported, so the PT-PT
        form "pré-visualizar" is not flagged for containing "visualizar".
        """
        found = []

        for term in self.PTBR_TERMS:
            pattern = r'(?<![\w-])' + re.escape(term) + r'\b'
            if re.search(pattern, text, re.IGNORECASE):
                found.append(term)

        return found
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# TranslationEngine
|
||||
# =============================================================================
|
||||
|
||||
class TranslationEngine:
    """Client for a LibreTranslate-style ``/translate`` HTTP endpoint.

    Requests are spaced by ``rate_limit`` seconds and retried with
    exponential back-off. Counters are kept in ``stats``.
    """

    def __init__(self, api_url: str = "https://translate.descomplicar.pt",
                 timeout: float = 30, max_retries: int = 3):
        """Configure the client.

        Args:
            api_url: Base URL of the service; a trailing slash is ignored.
            timeout: Socket timeout, in seconds, for each request.
            max_retries: Total number of attempts per text (at least one).
        """
        self.api_url = api_url.rstrip("/")
        self.translate_endpoint = f"{self.api_url}/translate"
        self.rate_limit = 0.3
        self.last_call = 0
        self.timeout = timeout
        self.max_retries = max(1, int(max_retries))
        self.stats = {"success": 0, "failed": 0, "cached": 0}

    def _throttle(self) -> None:
        """Sleep until ``rate_limit`` seconds have passed since the last call."""
        remaining = self.rate_limit - (time.time() - self.last_call)
        # The upper bound guards against a wall clock that jumped backwards.
        if 0 < remaining <= self.rate_limit:
            time.sleep(remaining)

    def translate(self, text: str, source: str = "en", target: str = "pt") -> str:
        """Translate *text* and return the result.

        Empty or whitespace-only input is returned unchanged without
        contacting the service.

        Returns:
            The translated text, or ``""`` when every attempt failed.
        """
        if not text or text.isspace():
            return text

        payload = json.dumps({
            "q": text,
            "source": source,
            "target": target,
            "format": "text"
        }).encode('utf-8')

        for attempt in range(self.max_retries):
            self._throttle()
            try:
                req = Request(
                    self.translate_endpoint,
                    data=payload,
                    headers={"Content-Type": "application/json"}
                )
                with urlopen(req, timeout=self.timeout) as response:
                    result = json.loads(response.read().decode('utf-8'))

                # A body without the expected key (for instance an error
                # object) is a failed attempt, not an empty translation.
                if not isinstance(result, dict) or "translatedText" not in result:
                    raise ValueError("response has no 'translatedText' field")

                self.stats["success"] += 1
                return result["translatedText"]

            except Exception:
                # Deliberately broad: translation is best effort and any
                # network, HTTP or decoding problem only costs one attempt.
                if attempt < self.max_retries - 1:
                    time.sleep(2 ** attempt)
                    continue
                self.stats["failed"] += 1
                return ""
            finally:
                # Failed attempts count towards the rate limit as well.
                self.last_call = time.time()

        return ""
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# PoFileHandler
|
||||
# =============================================================================
|
||||
|
||||
class PoFileHandler:
    """Reads and writes gettext ``.po`` files as lists of ``PoEntry`` objects.

    Strings are kept exactly as written between the quotes (escape sequences
    are not decoded), so saving a parsed file reproduces the same text.
    """

    # Plural index of a ``msgstr[N]`` keyword. Matched at the start of the line
    # only, so brackets inside the translated text are not mistaken for it.
    _PLURAL_INDEX_RE = re.compile(r'msgstr\[(\d+)\]')

    def parse(self, po_file: Path) -> List["PoEntry"]:
        """Parse *po_file* into a list of entries.

        The header entry (``msgid ""``) is returned like any other entry, and
        a block consisting only of comments is returned as an entry with
        empty strings. ``msgctxt`` lines are stored verbatim at the end of
        ``comments`` so that ``save`` writes them back right before ``msgid``.

        Raises:
            OSError: If the file cannot be read.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        entries: List["PoEntry"] = []
        current = PoEntry()
        has_msgid = False   # `current` has seen its msgid line (may be empty)
        stage = None        # keyword that continuation lines currently extend
        plural_idx = None   # N of the msgstr[N] being continued, if any

        def flush() -> None:
            """Store the entry under construction and start a new one."""
            nonlocal current, has_msgid, stage, plural_idx
            if has_msgid or current.comments:
                entries.append(current)
            current = PoEntry()
            has_msgid = False
            stage = None
            plural_idx = None

        # utf-8-sig also accepts files that start with a byte-order mark.
        with open(po_file, 'r', encoding='utf-8-sig') as f:
            lines = f.readlines()

        for i, raw_line in enumerate(lines):
            line = raw_line.rstrip('\n')

            if line.startswith('#'):
                # A comment right after a translation starts the next entry
                # even when the blank separator line is missing.
                if stage == 'msgstr':
                    flush()
                current.comments.append(line)
                current.line_start = i

            elif line.startswith('msgctxt '):
                if stage in ('msgid', 'msgid_plural', 'msgstr'):
                    flush()
                current.comments.append(line)
                stage = 'msgctxt'

            elif line.startswith('msgid '):
                if stage in ('msgid', 'msgid_plural', 'msgstr'):
                    flush()
                current.msgid = self._extract_string(line)
                current.msgid_line = i
                has_msgid = True
                stage = 'msgid'

            elif line.startswith('msgid_plural '):
                current.msgid_plural = self._extract_string(line)
                stage = 'msgid_plural'

            elif line.startswith('msgstr'):
                value = self._extract_string(line)
                match = self._PLURAL_INDEX_RE.match(line)
                if match:
                    plural_idx = int(match.group(1))
                    current.msgstr_plural[plural_idx] = value
                else:
                    plural_idx = None
                    current.msgstr = value
                current.msgstr_line = i
                stage = 'msgstr'

            elif line.startswith('"'):
                continuation = self._extract_string(line)
                if stage == 'msgstr':
                    if plural_idx is not None:
                        current.msgstr_plural[plural_idx] += continuation
                    else:
                        current.msgstr += continuation
                elif stage == 'msgid_plural':
                    current.msgid_plural += continuation
                elif stage == 'msgid':
                    current.msgid += continuation
                elif stage == 'msgctxt':
                    # Keep the raw line; see the msgctxt note in the docstring.
                    current.comments.append(line)

            elif not line.strip():
                flush()

        flush()
        return entries

    def save(self, entries: List["PoEntry"], output_file: Path):
        """Write *entries* to *output_file* as UTF-8.

        Every string is written on a single line. An entry that carries only
        comments is written as a bare comment block, because emitting an empty
        ``msgid``/``msgstr`` pair for it would duplicate the header entry.

        Raises:
            OSError: If the file cannot be written.
        """
        lines: List[str] = []

        for entry in entries:
            lines.extend(entry.comments)

            has_message = bool(
                entry.msgid or entry.msgid_plural or entry.msgstr or entry.msgstr_plural
            )
            if entry.comments and not has_message:
                lines.append("")
                continue

            lines.append(f'msgid "{entry.msgid}"')

            if entry.msgid_plural:
                lines.append(f'msgid_plural "{entry.msgid_plural}"')

            if entry.msgstr_plural:
                for idx, value in sorted(entry.msgstr_plural.items()):
                    lines.append(f'msgstr[{idx}] "{value}"')
            else:
                lines.append(f'msgstr "{entry.msgstr}"')

            lines.append("")

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))

    def _extract_string(self, line: str) -> str:
        """Return the text between the first and the last double quote.

        The match is greedy on purpose: escaped quotes inside the string
        (``\\"``) must not end it early. Returns ``""`` when the line holds no
        quoted string.
        """
        match = re.search(r'"(.*)"', line)
        return match.group(1) if match else ""
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# TranslationProcessor
|
||||
# =============================================================================
|
||||
|
||||
class TranslationProcessor:
    """Orchestrates parsing, translating, fixing, validating and saving."""

    def __init__(self, db_path: str, api_url: str):
        """Create all pipeline components.

        Args:
            db_path: SQLite cache path; ``":memory:"`` disables the cache.
            api_url: Base URL of the translation service.
        """
        self.cache = CacheManager(db_path) if db_path != ":memory:" else None
        self.brand_protector = BrandProtector(db_path)
        self.translator = TranslationEngine(api_url)
        self.po_handler = PoFileHandler()
        self.validator = QualityValidator()
        # Running count of translations served from the cache.
        self._cache_hits = 0

    def close(self) -> None:
        """Close the database connections opened by this processor."""
        for cache in (self.cache, getattr(self.brand_protector, "cache", None)):
            if cache:
                cache.close()

    @staticmethod
    def _is_translated(entry: "PoEntry") -> bool:
        """Return True when the entry has a singular or any plural translation."""
        return bool(entry.msgstr) or any(entry.msgstr_plural.values())

    @staticmethod
    def _po_escape(text: str) -> str:
        """Make machine output safe to place between quotes in a ``.po`` file.

        Real line breaks become the two-character sequence ``\\n`` and double
        quotes that are not already escaped get a backslash.
        """
        text = text.replace('\r\n', '\n').replace('\r', '\n').replace('\n', '\\n')
        return re.sub(r'(?<!\\)"', r'\\"', text)

    def process_file(self, po_file: Path, mode: str = "full") -> "ProcessResult":
        """Run the whole pipeline over one ``.po`` file, in place.

        A ``.po.backup`` copy is made first; it is removed on success and
        copied back over the file when processing fails.

        Entries that fail validation are reported in ``errors``. A machine
        translation created in this run is discarded when it fails; fixes
        applied to a translation that already existed are kept.

        Args:
            po_file: File to process.
            mode: ``"full"`` to translate and fix, ``"brands-only"`` to only
                fix existing translations.

        Returns:
            A ``ProcessResult``; ``success`` is False when the file could not
            be backed up, parsed or written. ``translated`` counts every entry
            that gained a translation, including those served from the cache
            (which are also counted in ``cached``).
        """
        import copy  # function-scope import; the module import block is untouched

        po_file = Path(po_file)
        backup_path = po_file.with_suffix('.po.backup')
        try:
            if backup_path.exists():
                backup_path.unlink()
            shutil.copy2(po_file, backup_path)
        except OSError as exc:
            # Covers a missing or unreadable input file; nothing was modified.
            return ProcessResult(success=False, error=f"Cannot back up {po_file}: {exc}")

        try:
            entries = self.po_handler.parse(po_file)

            processed = []
            errors = []
            stats = {"translated": 0, "cached": 0, "brands_fixed": 0}
            cache_hits_before = self._cache_hits

            for entry in entries:
                # Header or comment-only entries carry nothing to translate
                # or validate; pass them through unchanged.
                if not entry.msgid:
                    processed.append(entry)
                    continue

                # _process_entry mutates the entry, so keep a pristine copy to
                # compare against and to fall back on.
                snapshot = copy.deepcopy(entry)
                was_translated = self._is_translated(snapshot)

                try:
                    result, brands_fixed = self._process_entry(entry, mode, po_file.stem)
                    valid, validation_errors = self.validator.validate_entry(result)
                except Exception as e:
                    errors.append({'msgid': snapshot.msgid[:50], 'exception': str(e)})
                    processed.append(snapshot)
                    continue

                if valid or not result.msgstr:
                    processed.append(result)
                    if not was_translated and self._is_translated(result):
                        stats["translated"] += 1
                    stats["brands_fixed"] += brands_fixed
                else:
                    errors.append({
                        'msgid': snapshot.msgid[:50],
                        'errors': validation_errors
                    })
                    if was_translated:
                        # The problem predates this run; keep the fixes.
                        processed.append(result)
                        stats["brands_fixed"] += brands_fixed
                    else:
                        processed.append(snapshot)

            stats["cached"] = self._cache_hits - cache_hits_before

            self.po_handler.save(processed, po_file)

            # Compile .mo (best effort - a failure is reported, not fatal)
            if not self._compile_mo(po_file):
                errors.append({'warning': 'msgfmt compilation had warnings or errors'})

            # Success - remove backup
            backup_path.unlink()

            return ProcessResult(
                success=True,
                total=len(entries),
                translated=stats["translated"],
                cached=stats["cached"],
                brands_fixed=stats["brands_fixed"],
                errors=errors
            )

        except Exception as e:
            # Rollback
            shutil.copy2(backup_path, po_file)
            return ProcessResult(success=False, error=str(e))

    def _fix_existing(self, msgid: str, text: str) -> Tuple[str, int]:
        """Apply brand and PT-BR fixes to an existing translation.

        Returns:
            ``(fixed_text, number_of_brand_corrections)``.
        """
        fixed, corrections = self.brand_protector.fix_translated_brands(msgid, text)
        fixed, _ = apply_ptbr_fixes(fixed)
        return fixed, len(corrections)

    def _translate_text(self, text: str) -> Tuple[str, int, bool]:
        """Translate one source string, using the cache when possible.

        Returns:
            ``(translation, brand_corrections, from_cache)``; the translation
            is ``""`` when the engine produced nothing.
        """
        if self.cache:
            cached = self.cache.get_cached_translation(text)
            if cached:
                self._cache_hits += 1
                return cached, 0, True

        protected, placeholders = self.brand_protector.protect_brands(text)
        translated = self.translator.translate(protected)
        if not translated:
            return "", 0, False

        translated = self.brand_protector.restore_brands(translated, placeholders)
        translated, _ = apply_ptbr_fixes(translated)
        translated, corrections = self.brand_protector.fix_translated_brands(text, translated)
        return self._po_escape(translated), len(corrections), False

    def _process_entry(self, entry: "PoEntry", mode: str, plugin_name: str) -> Tuple["PoEntry", int]:
        """Fix or translate a single entry in place.

        Entries that already have a translation (singular or plural) only
        receive brand and PT-BR fixes. Untranslated entries are translated in
        ``"full"`` mode; for plural entries form 0 is translated from
        ``msgid`` and every other form from ``msgid_plural``.

        Returns:
            ``(entry, brands_fixed)`` - the same object that was passed in.
        """
        brands_fixed = 0

        # Skip header entries
        if not entry.msgid:
            return entry, 0

        if mode == "brands-only" or self._is_translated(entry):
            if entry.msgstr:
                entry.msgstr, fixed_count = self._fix_existing(entry.msgid, entry.msgstr)
                brands_fixed += fixed_count

            for idx, value in list(entry.msgstr_plural.items()):
                entry.msgstr_plural[idx], fixed_count = self._fix_existing(entry.msgid, value)
                brands_fixed += fixed_count

            return entry, brands_fixed

        # Mode: full translation of an untranslated entry
        if entry.msgid_plural or entry.msgstr_plural:
            # Fall back to two forms when the file declared none.
            for idx in sorted(entry.msgstr_plural) or [0, 1]:
                source = entry.msgid if idx == 0 else (entry.msgid_plural or entry.msgid)
                translated, fixed_count, _ = self._translate_text(source)
                if translated:
                    entry.msgstr_plural[idx] = translated
                    brands_fixed += fixed_count
            return entry, brands_fixed

        translated, brands_fixed, from_cache = self._translate_text(entry.msgid)
        if translated:
            entry.msgstr = translated

            if self.cache and not from_cache:
                # NOTE(review): "validated" is taken to mean "passed
                # QualityValidator", so that the cache can ever be read back;
                # confirm this matches the intended meaning of the column.
                valid, _ = self.validator.validate_entry(entry)
                self.cache.save_translation(
                    entry.msgid, translated, plugin_name, validated=valid
                )

        return entry, brands_fixed

    def _compile_mo(self, po_file: Path) -> bool:
        """Compile *po_file* to a sibling ``.mo`` file with ``msgfmt``.

        Returns:
            True on success; False when ``msgfmt`` is not installed or exits
            with a non-zero status.
        """
        mo_file = po_file.with_suffix('.mo')

        try:
            subprocess.run(
                ['msgfmt', '-cv', '-o', str(mo_file), str(po_file)],
                capture_output=True,
                text=True,
                check=True
            )
        except (subprocess.CalledProcessError, FileNotFoundError):
            return False
        return True
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Seed Database
|
||||
# =============================================================================
|
||||
|
||||
def seed_brands_db(cache: "CacheManager"):
    """Insert every ``SEED_BRANDS`` name into the ``brands`` table.

    Names already present are left untouched (``INSERT OR IGNORE``). A
    database error on one brand is reported on stderr and does not stop the
    remaining inserts. The transaction is committed at the end.

    Args:
        cache: Open cache manager whose connection receives the rows.
    """
    print("🌱 Seeding brands database...")

    inserted = 0
    for brand in SEED_BRANDS:
        try:
            cursor = cache.conn.execute(
                """INSERT OR IGNORE INTO brands (name, auto_detected, confidence_score)
                VALUES (?, 0, 1.0)""",
                (brand,)
            )
        except sqlite3.Error as exc:
            print(f"⚠️  Could not seed brand {brand!r}: {exc}", file=sys.stderr)
            continue
        # rowcount is 0 when the row already existed and was ignored.
        inserted += max(cursor.rowcount, 0)

    cache.conn.commit()
    print(f"✅ Seeded {len(SEED_BRANDS)} brands ({inserted} new)")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Main CLI
|
||||
# =============================================================================
|
||||
|
||||
def main():
    """Command-line entry point.

    Handles the maintenance actions (``--init-db``, ``--export-brands``,
    ``--import-brands``) or processes the requested ``.po`` files.

    Returns:
        Process exit status: 0 on success, 1 when there was nothing to do,
        the brands file was malformed, or at least one file failed.
    """
    parser = argparse.ArgumentParser(
        description="Sistema eficiente de traduções WordPress PT-PT",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument("files", nargs="*", help="Po files to process")
    parser.add_argument("--batch", type=Path, help="Process all .po files in directory")
    parser.add_argument("--brands-only", action="store_true", help="Only fix brands")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done")
    parser.add_argument("--init-db", action="store_true", help="Initialize database")
    parser.add_argument("--export-brands", type=Path, help="Export brands to JSON")
    parser.add_argument("--import-brands", type=Path, help="Import brands from JSON")
    parser.add_argument("--db-path", type=str,
                        default=str(Path.home() / ".wp-translate-ptpt" / "cache.db"))
    parser.add_argument("--api-url", type=str,
                        default="https://translate.descomplicar.pt")
    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")

    args = parser.parse_args()

    # Ensure db directory exists
    db_dir = Path(args.db_path).parent
    db_dir.mkdir(parents=True, exist_ok=True)

    insert_brand_sql = """INSERT OR IGNORE INTO brands (name, auto_detected, confidence_score)
                VALUES (?, 0, 1.0)"""

    # Initialize database
    if args.init_db:
        cache = CacheManager(args.db_path)
        try:
            seed_brands_db(cache)
        finally:
            cache.close()
        return 0

    # Export brands
    if args.export_brands:
        cache = CacheManager(args.db_path)
        try:
            cursor = cache.conn.execute("SELECT name FROM brands ORDER BY name")
            brands = [row[0] for row in cursor.fetchall()]
        finally:
            cache.close()

        # Explicit UTF-8: the list is written with ensure_ascii=False and must
        # not depend on the locale's default encoding.
        with open(args.export_brands, 'w', encoding='utf-8') as f:
            json.dump(brands, f, indent=2, ensure_ascii=False)

        print(f"✅ Exported {len(brands)} brands to {args.export_brands}")
        return 0

    # Import brands
    if args.import_brands:
        with open(args.import_brands, 'r', encoding='utf-8') as f:
            brands = json.load(f)

        if not isinstance(brands, list):
            print("❌ Brands file must contain a JSON list of names", file=sys.stderr)
            return 1
        # Ignore anything that is not a non-empty string.
        names = [b for b in brands if isinstance(b, str) and b.strip()]

        cache = CacheManager(args.db_path)
        try:
            cache.conn.executemany(insert_brand_sql, [(name,) for name in names])
            cache.conn.commit()
        finally:
            cache.close()

        print(f"✅ Imported {len(names)} brands")
        return 0

    # Collect files
    if args.batch:
        files_to_process = list(args.batch.rglob("*-pt_PT.po"))
    elif args.files:
        files_to_process = [Path(f) for f in args.files]
    else:
        parser.print_help()
        return 1

    if not files_to_process:
        print("❌ No .po files found")
        return 1

    mode = "brands-only" if args.brands_only else "full"

    print("="*60)
    print(f"🌍 WP Translate PT-PT v{__version__}")
    print("="*60)
    print(f"Mode: {mode}")
    print(f"Files: {len(files_to_process)}")
    print(f"Dry run: {args.dry_run}")
    print("="*60)
    print()

    # A dry run touches neither the files nor the database.
    processor = None if args.dry_run else TranslationProcessor(args.db_path, args.api_url)

    results = []
    start_time = time.time()

    try:
        for i, po_file in enumerate(files_to_process, 1):
            print(f"[{i}/{len(files_to_process)}] {po_file.name}...", end=" ", flush=True)

            if processor is None:
                print("(skipped)")
                continue

            result = processor.process_file(po_file, mode=mode)
            results.append(result)

            if result.success:
                print(f"✅ {result.brands_fixed} brands fixed")
            else:
                print(f"❌ {result.error}")
    finally:
        if processor is not None:
            # The processor and its brand protector each own a connection.
            for cache in (processor.cache, processor.brand_protector.cache):
                if cache:
                    cache.close()

    # Summary
    if results:
        elapsed = time.time() - start_time

        print("\n" + "="*60)
        print("📊 SUMMARY")
        print("="*60)

        total_success = sum(1 for r in results if r.success)
        total_translated = sum(r.translated for r in results)
        total_brands_fixed = sum(r.brands_fixed for r in results)
        total_errors = sum(len(r.errors) for r in results)

        print(f"Files processed: {total_success}/{len(results)}")
        print(f"Translated: {total_translated}")
        print(f"Brands fixed: {total_brands_fixed}")
        print(f"Errors: {total_errors}")
        print(f"Time: {elapsed:.1f}s")
        print("="*60)

    # Signal failure to the shell when any file could not be processed.
    return 0 if all(r.success for r in results) else 1


if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user