{
  "client": "CTF_Carstuff_Batch4",
  "description": "BATCH 4 - Expansão massiva: 16 sites novos + Portal Clássicos recuperado",
  "output_base_dir": "/root/scraper-ctf",
  "output_dirs": {
    "raw": "output_md_batch4",
    "cleaned": "output_cleaned_batch4",
    "formatted": "formatted_batch4",
    "logs": "logs"
  },
  "sites": [
    {
      "name": "Portal dos Clássicos",
      "url": "https://portalclassicos.com/foruns/index.php",
      "type": "forum",
      "max_depth": 4,
      "priority": "high",
      "language": "pt",
      "category": "automovel-classico",
      "notes": "RECUPERADO! URL correta. Fórum PT - mercado local prioritário",
      "estimated_pages": 300,
      "relevance_keywords": ["estofamento", "interior", "banco", "couro", "vinil", "capota", "restauro"]
    },
    {
      "name": "Pelican Parts - Porsche Forum",
      "url": "https://forums.pelicanparts.com/porsche-forums/",
      "type": "forum",
      "max_depth": 4,
      "priority": "high",
      "language": "en",
      "category": "automovel-classico",
      "notes": "Fórum Porsche - interior/estofamento posts prioritários",
      "estimated_pages": 1000,
      "relevance_keywords": ["interior", "upholstery", "leather", "seat", "trim", "convertible top"]
    },
    {
      "name": "Pelican Parts - BMW Forum",
      "url": "https://forums.pelicanparts.com/bmw-forums/",
      "type": "forum",
      "max_depth": 4,
      "priority": "high",
      "language": "en",
      "category": "automovel",
      "notes": "Fórum BMW - comunidade ativa",
      "estimated_pages": 800,
      "relevance_keywords": ["interior", "upholstery", "leather", "seat", "trim", "alcantara"]
    },
    {
      "name": "Peach Parts - Mercedes Forum",
      "url": "https://www.peachparts.com/shopforum/index.php",
      "type": "forum",
      "max_depth": 4,
      "priority": "high",
      "language": "en",
      "category": "automovel-classico",
      "notes": "Fórum Mercedes especializado - MB-Tex, leather comum",
      "estimated_pages": 700,
      "relevance_keywords": ["MB-Tex", "interior", "upholstery", "leather", "seat", "trim"]
    },
    {
      "name": "Pelican Parts - VW Audi Forum",
      "url": "https://forums.pelicanparts.com/vw-audi-technical-forum/",
      "type": "forum",
      "max_depth": 4,
      "priority": "medium",
      "language": "en",
      "category": "automovel",
      "notes": "Fórum VW/Audi - mercado relevante",
      "estimated_pages": 500,
      "relevance_keywords": ["interior", "upholstery", "leather", "seat", "trim"]
    },
    {
      "name": "Pelican Parts - Saab Forum",
      "url": "https://forums.pelicanparts.com/saab-technical-forum/",
      "type": "forum",
      "max_depth": 4,
      "priority": "medium",
      "language": "en",
      "category": "automovel-classico",
      "notes": "Fórum Saab - nicho mas ativo",
      "estimated_pages": 300,
      "relevance_keywords": ["interior", "upholstery", "leather", "seat", "trim"]
    },
    {
      "name": "Pelican Parts - Mini Forum",
      "url": "https://forums.pelicanparts.com/mini-discussion-forum/",
      "type": "forum",
      "max_depth": 4,
      "priority": "medium",
      "language": "en",
      "category": "automovel",
      "notes": "Fórum Mini - comunidade Mini Cooper",
      "estimated_pages": 300,
      "relevance_keywords": ["interior", "upholstery", "leather", "seat", "trim"]
    },
    {
      "name": "Pelican Tech - Main Hub",
      "url": "https://www.pelicanparts.com/techarticles/tech_center_main.htm",
      "type": "tech_articles",
      "max_depth": 3,
      "priority": "high",
      "language": "en",
      "category": "geral",
      "notes": "ANTI-BOT (403). Hub central tech articles - usar Playwright stealth",
      "estimated_pages": 80,
      "requires_javascript": true,
      "anti_bot_protection": true,
      "relevance_keywords": ["interior", "upholstery", "trim", "seat", "restoration"]
    },
    {
      "name": "Pelican Tech - Mercedes",
      "url": "https://www.pelicanparts.com/techarticles/Mercedes-Benz/MBZ_Tech_Index.htm",
      "type": "tech_articles",
      "max_depth": 3,
      "priority": "high",
      "language": "en",
      "category": "automovel",
      "notes": "ANTI-BOT (403). Tech articles Mercedes - usar Playwright stealth",
      "estimated_pages": 40,
      "requires_javascript": true,
      "anti_bot_protection": true,
      "relevance_keywords": ["interior", "upholstery", "MB-Tex", "trim"]
    },
    {
      "name": "Pelican Tech - BMW",
      "url": "https://www.pelicanparts.com/BMW/techarticles/tech_main.htm",
      "type": "tech_articles",
      "max_depth": 3,
      "priority": "high",
      "language": "en",
      "category": "automovel",
      "notes": "ANTI-BOT (403). Tech articles BMW - usar Playwright stealth",
      "estimated_pages": 40,
      "requires_javascript": true,
      "anti_bot_protection": true,
      "relevance_keywords": ["interior", "upholstery", "leather", "trim"]
    },
    {
      "name": "Pelican Tech - Mini",
      "url": "https://www.pelicanparts.com/MINI/index-SC.htm",
      "type": "tech_articles",
      "max_depth": 3,
      "priority": "medium",
      "language": "en",
      "category": "automovel",
      "notes": "ANTI-BOT (403). Tech articles Mini - usar Playwright stealth",
      "estimated_pages": 25,
      "requires_javascript": true,
      "anti_bot_protection": true,
      "relevance_keywords": ["interior", "upholstery", "trim"]
    },
    {
      "name": "Pelican Tech - Audi",
      "url": "https://www.pelicanparts.com/techarticles/Audi_tech/Audi_Tech_Index.htm",
      "type": "tech_articles",
      "max_depth": 3,
      "priority": "medium",
      "language": "en",
      "category": "automovel",
      "notes": "ANTI-BOT (403). Tech articles Audi - usar Playwright stealth",
      "estimated_pages": 25,
      "requires_javascript": true,
      "anti_bot_protection": true,
      "relevance_keywords": ["interior", "upholstery", "leather", "trim"]
    },
    {
      "name": "Pelican Tech - VW",
      "url": "https://www.pelicanparts.com/techarticles/Volkswagen_Tech_Index.htm",
      "type": "tech_articles",
      "max_depth": 3,
      "priority": "medium",
      "language": "en",
      "category": "automovel",
      "notes": "ANTI-BOT (403). Tech articles VW - usar Playwright stealth",
      "estimated_pages": 25,
      "requires_javascript": true,
      "anti_bot_protection": true,
      "relevance_keywords": ["interior", "upholstery", "trim"]
    },
    {
      "name": "Pelican Tech - Volvo",
      "url": "https://www.pelicanparts.com/techarticles/Volvo_Tech.htm",
      "type": "tech_articles",
      "max_depth": 3,
      "priority": "low",
      "language": "en",
      "category": "automovel",
      "notes": "ANTI-BOT (403). Tech articles Volvo - usar Playwright stealth",
      "estimated_pages": 15,
      "requires_javascript": true,
      "anti_bot_protection": true,
      "relevance_keywords": ["interior", "upholstery", "trim"]
    },
    {
      "name": "Pelican Tech - Saab",
      "url": "https://www.pelicanparts.com/techarticles/Saab_Tech.htm",
      "type": "tech_articles",
      "max_depth": 3,
      "priority": "low",
      "language": "en",
      "category": "automovel",
      "notes": "ANTI-BOT (403). Tech articles Saab - usar Playwright stealth",
      "estimated_pages": 15,
      "requires_javascript": true,
      "anti_bot_protection": true,
      "relevance_keywords": ["interior", "upholstery", "trim"]
    },
    {
      "name": "Verdeck.de - Blog",
      "url": "https://www.verdeck.de/blog/",
      "type": "blog",
      "max_depth": 4,
      "priority": "high",
      "language": "de",
      "category": "capotas",
      "notes": "Alemão - especialistas capotas conversível. TRADUÇÃO NECESSÁRIA",
      "estimated_pages": 80,
      "requires_translation": true,
      "relevance_keywords": ["verdeck", "cabrio", "cabriolet", "stoffverdeck", "leder", "innenausstattung"]
    },
    {
      "name": "Verdeck.de - Material",
      "url": "https://www.verdeck.de/unser-material/",
      "type": "resources",
      "max_depth": 3,
      "priority": "high",
      "language": "de",
      "category": "capotas",
      "notes": "Alemão - catálogo materiais capotas. TRADUÇÃO NECESSÁRIA",
      "estimated_pages": 25,
      "requires_translation": true,
      "relevance_keywords": ["material", "stoff", "sonnland", "haartz"]
    },
    {
      "name": "Lederzentrum Wiki",
      "url": "https://www.lederzentrum.de/wiki/index.php/Das_Lederzentrum_Lederlexikon",
      "type": "wiki",
      "max_depth": 4,
      "priority": "high",
      "language": "de",
      "category": "couro",
      "notes": "Alemão - enciclopédia técnica couro. ALTA PRIORIDADE. TRADUÇÃO NECESSÁRIA",
      "estimated_pages": 150,
      "requires_translation": true,
      "relevance_keywords": ["leder", "autoleder", "reparatur", "pflege", "reinigung"]
    },
    {
      "name": "Piel de Toro",
      "url": "https://pieldetoro.net/web/default.php",
      "type": "forum",
      "max_depth": 4,
      "priority": "medium",
      "language": "es",
      "category": "automovel-classico",
      "notes": "Espanhol - clássicos espanhóis. TRADUÇÃO NECESSÁRIA",
      "estimated_pages": 200,
      "requires_translation": true,
      "relevance_keywords": ["tapiceria", "cuero", "interior", "restauracion"]
    },
    {
      "name": "Aircraft Interiors International",
      "url": "https://www.aircraftinteriorsinternational.com/",
      "type": "magazine",
      "max_depth": 4,
      "priority": "medium",
      "language": "en",
      "category": "aeronautica",
      "notes": "Magazine aeronáutica - CTF vende para aviação",
      "estimated_pages": 350,
      "relevance_keywords": ["aircraft interior", "cabin", "seat", "upholstery", "leather", "fabric"]
    },
    {
      "name": "AIN Online",
      "url": "https://www.ainonline.com/",
      "type": "news",
      "max_depth": 4,
      "priority": "low",
      "language": "en",
      "category": "aeronautica",
      "notes": "News aeronáutica - filtrar apenas interior/retrofit",
      "estimated_pages": 800,
      "relevance_keywords": ["interior", "cabin", "retrofit", "refurbishment", "upholstery"]
    },
    {
      "name": "Railway Interiors International",
      "url": "https://www.railwayinteriorsinternational.com/",
      "type": "magazine",
      "max_depth": 4,
      "priority": "medium",
      "language": "en",
      "category": "ferroviaria",
      "notes": "Magazine ferroviária - CTF vende para comboios",
      "estimated_pages": 350,
      "relevance_keywords": ["railway interior", "train", "seat", "upholstery", "fabric", "refurbishment"]
    },
    {
      "name": "Global Railway Review",
      "url": "https://www.globalrailwayreview.com/",
      "type": "news",
      "max_depth": 4,
      "priority": "low",
      "language": "en",
      "category": "ferroviaria",
      "notes": "News ferroviária - filtrar apenas interior/retrofit",
      "estimated_pages": 800,
      "relevance_keywords": ["interior", "passenger", "refurbishment", "retrofit", "seat"]
    },
    {
      "name": "Upholstery Resource",
      "url": "https://www.upholsteryresource.com/",
      "type": "resources",
      "max_depth": 4,
      "priority": "high",
      "language": "en",
      "category": "geral",
      "notes": "Recursos gerais estofamento - ALTA RELEVÂNCIA",
      "estimated_pages": 150,
      "relevance_keywords": ["upholstery", "fabric", "leather", "foam", "technique", "pattern"]
    }
  ],
  "scraper_settings": {
    "request_timeout": 120,
    "max_retries": 3,
    "politeness_delay": [4, 10],
    "use_playwright": true,
    "playwright_stealth": true,
    "headless": true,
    "user_agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "excluded_patterns": [
      "/tag/",
      "/category/",
      "/author/",
      "/page/",
      "/wp-content/",
      "/wp-admin/",
      "/feed/",
      "/rss/",
      "/login",
      "/register",
      "/signin",
      "/signup",
      "/cart",
      "/checkout",
      "/account",
      "/my-account",
      "/product/",
      "/shop/",
      "/store/",
      "/parts/",
      "/members/",
      "/profile/",
      "/user/",
      "/gallery/",
      "/photos/",
      "/images/",
      "/media/",
      "/calendar/",
      "/events/",
      "/search/",
      "/results/",
      "/print/",
      "/pdf/",
      "/download/",
      "/shipping/",
      "/returns/",
      "/warranty/",
      "/contact",
      "/about",
      "/privacy",
      "/terms"
    ],
    "content_filters": {
      "min_word_count": 100,
      "apply_during_scraping": false,
      "note": "Filtros aplicados APÓS scraping na fase de extração"
    }
  },
  "vps_execution": {
    "recommended_vps": "easy.descomplicar.pt",
    "ssh_port": 22,
    "ssh_user": "root",
    "working_directory": "/root/scraper-ctf",
    "estimated_duration_hours": 48,
    "estimated_storage_gb": 5,
    "recommended_cpu_cores": 4,
    "recommended_ram_gb": 8
  },
  "translation_requirements": {
    "german_sites": ["Verdeck.de - Blog", "Verdeck.de - Material", "Lederzentrum Wiki"],
    "spanish_sites": ["Piel de Toro"],
    "translation_api": "google-translate",
    "translation_stage": "after_extraction",
    "note": "Tradução apenas para casos extraídos (não todo o conteúdo)"
  },
  "execution_strategy": {
    "total_sites": 24,
    "total_estimated_pages": 7070,
    "estimated_scraping_time": "48-60 hours",
    "estimated_cases": "1000-1300 (taxa 16.5%)",
    "phases": [
      {
        "phase": "1A - Fóruns Alta Prioridade",
        "sites": [
          "Portal dos Clássicos",
          "Pelican Parts - Porsche Forum",
          "Pelican Parts - BMW Forum",
          "Peach Parts - Mercedes Forum"
        ],
        "estimated_time": "14-18h"
      },
      {
        "phase": "1B - Fóruns Média/Baixa",
        "sites": [
          "Pelican Parts - VW Audi Forum",
          "Pelican Parts - Saab Forum",
          "Pelican Parts - Mini Forum",
          "Piel de Toro"
        ],
        "estimated_time": "10-14h"
      },
      {
        "phase": "2 - Tech Articles (Anti-bot)",
        "sites": [
          "Pelican Tech - Main Hub",
          "Pelican Tech - Mercedes",
          "Pelican Tech - BMW",
          "Pelican Tech - Mini",
          "Pelican Tech - Audi",
          "Pelican Tech - VW",
          "Pelican Tech - Volvo",
          "Pelican Tech - Saab"
        ],
        "estimated_time": "6-8h",
        "note": "Requer Playwright stealth mode"
      },
      {
        "phase": "3 - Sites Alemães",
        "sites": ["Verdeck.de - Blog", "Verdeck.de - Material", "Lederzentrum Wiki"],
        "estimated_time": "8-10h"
      },
      {
        "phase": "4 - Aeronáutica/Ferroviária",
        "sites": [
          "Aircraft Interiors International",
          "Railway Interiors International",
          "AIN Online",
          "Global Railway Review"
        ],
        "estimated_time": "12-16h"
      },
      {
        "phase": "5 - Recursos Gerais",
        "sites": ["Upholstery Resource"],
        "estimated_time": "4-6h"
      }
    ]
  },
  "execution_notes": [
    "✅ 16/24 sites validados disponíveis",
    "⚠️ 8 tech articles Pelican com HTTP 403 - requer Playwright stealth",
    "✅ Portal dos Clássicos RECUPERADO (URL correta encontrada)",
    "🌐 3 sites alemães + 1 espanhol requerem tradução APÓS extração",
    "🚀 Execução VPS recomendada (48-60h tempo total)",
    "📊 Estimativa final KB: 1559-1859 casos totais (559 atuais + 1000-1300 novos)",
    "🔧 Profundidade 4 por omissão; profundidade 3 nos 8 tech articles Pelican e em Verdeck.de - Material",
    "🎯 Filtros keywords aplicados na EXTRAÇÃO (não scraping)",
    "⚡ Playwright stealth mode para anti-bot bypass",
    "💾 ~5GB storage necessário VPS"
  ]
}