Files
scripts/scraper/ctf_config_batch4.json

418 lines
15 KiB
JSON
Executable File

{
"client": "CTF_Carstuff_Batch4",
"description": "BATCH 4 - Expansão massiva: 23 sites novos + Portal dos Clássicos recuperado (24 no total)",
"output_base_dir": "/root/scraper-ctf",
"output_dirs": {
"raw": "output_md_batch4",
"cleaned": "output_cleaned_batch4",
"formatted": "formatted_batch4",
"logs": "logs"
},
"sites": [
{
"name": "Portal dos Clássicos",
"url": "https://portalclassicos.com/foruns/index.php",
"type": "forum",
"max_depth": 4,
"priority": "high",
"language": "pt",
"category": "automovel-classico",
"notes": "RECUPERADO! URL correta. Fórum PT - mercado local prioritário",
"estimated_pages": 300,
"relevance_keywords": ["estofamento", "interior", "banco", "couro", "vinil", "capota", "restauro"]
},
{
"name": "Pelican Parts - Porsche Forum",
"url": "https://forums.pelicanparts.com/porsche-forums/",
"type": "forum",
"max_depth": 4,
"priority": "high",
"language": "en",
"category": "automovel-classico",
"notes": "Fórum Porsche - interior/estofamento posts prioritários",
"estimated_pages": 1000,
"relevance_keywords": ["interior", "upholstery", "leather", "seat", "trim", "convertible top"]
},
{
"name": "Pelican Parts - BMW Forum",
"url": "https://forums.pelicanparts.com/bmw-forums/",
"type": "forum",
"max_depth": 4,
"priority": "high",
"language": "en",
"category": "automovel",
"notes": "Fórum BMW - comunidade ativa",
"estimated_pages": 800,
"relevance_keywords": ["interior", "upholstery", "leather", "seat", "trim", "alcantara"]
},
{
"name": "Peach Parts - Mercedes Forum",
"url": "https://www.peachparts.com/shopforum/index.php",
"type": "forum",
"max_depth": 4,
"priority": "high",
"language": "en",
"category": "automovel-classico",
"notes": "Fórum Mercedes especializado - MB-Tex, leather comum",
"estimated_pages": 700,
"relevance_keywords": ["MB-Tex", "interior", "upholstery", "leather", "seat", "trim"]
},
{
"name": "Pelican Parts - VW Audi Forum",
"url": "https://forums.pelicanparts.com/vw-audi-technical-forum/",
"type": "forum",
"max_depth": 4,
"priority": "medium",
"language": "en",
"category": "automovel",
"notes": "Fórum VW/Audi - mercado relevante",
"estimated_pages": 500,
"relevance_keywords": ["interior", "upholstery", "leather", "seat", "trim"]
},
{
"name": "Pelican Parts - Saab Forum",
"url": "https://forums.pelicanparts.com/saab-technical-forum/",
"type": "forum",
"max_depth": 4,
"priority": "medium",
"language": "en",
"category": "automovel-classico",
"notes": "Fórum Saab - nicho mas ativo",
"estimated_pages": 300,
"relevance_keywords": ["interior", "upholstery", "leather", "seat", "trim"]
},
{
"name": "Pelican Parts - Mini Forum",
"url": "https://forums.pelicanparts.com/mini-discussion-forum/",
"type": "forum",
"max_depth": 4,
"priority": "medium",
"language": "en",
"category": "automovel",
"notes": "Fórum Mini - comunidade Mini Cooper",
"estimated_pages": 300,
"relevance_keywords": ["interior", "upholstery", "leather", "seat", "trim"]
},
{
"name": "Pelican Tech - Main Hub",
"url": "https://www.pelicanparts.com/techarticles/tech_center_main.htm",
"type": "tech_articles",
"max_depth": 3,
"priority": "high",
"language": "en",
"category": "geral",
"notes": "ANTI-BOT (403). Hub central tech articles - usar Playwright stealth",
"estimated_pages": 80,
"requires_javascript": true,
"anti_bot_protection": true,
"relevance_keywords": ["interior", "upholstery", "trim", "seat", "restoration"]
},
{
"name": "Pelican Tech - Mercedes",
"url": "https://www.pelicanparts.com/techarticles/Mercedes-Benz/MBZ_Tech_Index.htm",
"type": "tech_articles",
"max_depth": 3,
"priority": "high",
"language": "en",
"category": "automovel",
"notes": "ANTI-BOT (403). Tech articles Mercedes - usar Playwright stealth",
"estimated_pages": 40,
"requires_javascript": true,
"anti_bot_protection": true,
"relevance_keywords": ["interior", "upholstery", "MB-Tex", "trim"]
},
{
"name": "Pelican Tech - BMW",
"url": "https://www.pelicanparts.com/BMW/techarticles/tech_main.htm",
"type": "tech_articles",
"max_depth": 3,
"priority": "high",
"language": "en",
"category": "automovel",
"notes": "ANTI-BOT (403). Tech articles BMW - usar Playwright stealth",
"estimated_pages": 40,
"requires_javascript": true,
"anti_bot_protection": true,
"relevance_keywords": ["interior", "upholstery", "leather", "trim"]
},
{
"name": "Pelican Tech - Mini",
"url": "https://www.pelicanparts.com/MINI/index-SC.htm",
"type": "tech_articles",
"max_depth": 3,
"priority": "medium",
"language": "en",
"category": "automovel",
"notes": "ANTI-BOT (403). Tech articles Mini - usar Playwright stealth",
"estimated_pages": 25,
"requires_javascript": true,
"anti_bot_protection": true,
"relevance_keywords": ["interior", "upholstery", "trim"]
},
{
"name": "Pelican Tech - Audi",
"url": "https://www.pelicanparts.com/techarticles/Audi_tech/Audi_Tech_Index.htm",
"type": "tech_articles",
"max_depth": 3,
"priority": "medium",
"language": "en",
"category": "automovel",
"notes": "ANTI-BOT (403). Tech articles Audi - usar Playwright stealth",
"estimated_pages": 25,
"requires_javascript": true,
"anti_bot_protection": true,
"relevance_keywords": ["interior", "upholstery", "leather", "trim"]
},
{
"name": "Pelican Tech - VW",
"url": "https://www.pelicanparts.com/techarticles/Volkswagen_Tech_Index.htm",
"type": "tech_articles",
"max_depth": 3,
"priority": "medium",
"language": "en",
"category": "automovel",
"notes": "ANTI-BOT (403). Tech articles VW - usar Playwright stealth",
"estimated_pages": 25,
"requires_javascript": true,
"anti_bot_protection": true,
"relevance_keywords": ["interior", "upholstery", "trim"]
},
{
"name": "Pelican Tech - Volvo",
"url": "https://www.pelicanparts.com/techarticles/Volvo_Tech.htm",
"type": "tech_articles",
"max_depth": 3,
"priority": "low",
"language": "en",
"category": "automovel",
"notes": "ANTI-BOT (403). Tech articles Volvo - usar Playwright stealth",
"estimated_pages": 15,
"requires_javascript": true,
"anti_bot_protection": true,
"relevance_keywords": ["interior", "upholstery", "trim"]
},
{
"name": "Pelican Tech - Saab",
"url": "https://www.pelicanparts.com/techarticles/Saab_Tech.htm",
"type": "tech_articles",
"max_depth": 3,
"priority": "low",
"language": "en",
"category": "automovel",
"notes": "ANTI-BOT (403). Tech articles Saab - usar Playwright stealth",
"estimated_pages": 15,
"requires_javascript": true,
"anti_bot_protection": true,
"relevance_keywords": ["interior", "upholstery", "trim"]
},
{
"name": "Verdeck.de - Blog",
"url": "https://www.verdeck.de/blog/",
"type": "blog",
"max_depth": 4,
"priority": "high",
"language": "de",
"category": "capotas",
"notes": "Alemão - especialistas capotas conversível. TRADUÇÃO NECESSÁRIA",
"estimated_pages": 80,
"requires_translation": true,
"relevance_keywords": ["verdeck", "cabrio", "cabriolet", "stoffverdeck", "leder", "innenausstattung"]
},
{
"name": "Verdeck.de - Material",
"url": "https://www.verdeck.de/unser-material/",
"type": "resources",
"max_depth": 3,
"priority": "high",
"language": "de",
"category": "capotas",
"notes": "Alemão - catálogo materiais capotas. TRADUÇÃO NECESSÁRIA",
"estimated_pages": 25,
"requires_translation": true,
"relevance_keywords": ["material", "stoff", "sonnland", "haartz"]
},
{
"name": "Lederzentrum Wiki",
"url": "https://www.lederzentrum.de/wiki/index.php/Das_Lederzentrum_Lederlexikon",
"type": "wiki",
"max_depth": 4,
"priority": "high",
"language": "de",
"category": "couro",
"notes": "Alemão - enciclopédia técnica couro. ALTA PRIORIDADE. TRADUÇÃO NECESSÁRIA",
"estimated_pages": 150,
"requires_translation": true,
"relevance_keywords": ["leder", "autoleder", "reparatur", "pflege", "reinigung"]
},
{
"name": "Piel de Toro",
"url": "https://pieldetoro.net/web/default.php",
"type": "forum",
"max_depth": 4,
"priority": "medium",
"language": "es",
"category": "automovel-classico",
"notes": "Espanhol - clássicos espanhóis. TRADUÇÃO NECESSÁRIA",
"estimated_pages": 200,
"requires_translation": true,
"relevance_keywords": ["tapiceria", "tapicería", "cuero", "interior", "restauracion", "restauración"]
},
{
"name": "Aircraft Interiors International",
"url": "https://www.aircraftinteriorsinternational.com/",
"type": "magazine",
"max_depth": 4,
"priority": "medium",
"language": "en",
"category": "aeronautica",
"notes": "Magazine aeronáutica - CTF vende para aviação",
"estimated_pages": 350,
"relevance_keywords": ["aircraft interior", "cabin", "seat", "upholstery", "leather", "fabric"]
},
{
"name": "AIN Online",
"url": "https://www.ainonline.com/",
"type": "news",
"max_depth": 4,
"priority": "low",
"language": "en",
"category": "aeronautica",
"notes": "News aeronáutica - filtrar apenas interior/retrofit",
"estimated_pages": 800,
"relevance_keywords": ["interior", "cabin", "retrofit", "refurbishment", "upholstery"]
},
{
"name": "Railway Interiors International",
"url": "https://www.railwayinteriorsinternational.com/",
"type": "magazine",
"max_depth": 4,
"priority": "medium",
"language": "en",
"category": "ferroviaria",
"notes": "Magazine ferroviária - CTF vende para comboios",
"estimated_pages": 350,
"relevance_keywords": ["railway interior", "train", "seat", "upholstery", "fabric", "refurbishment"]
},
{
"name": "Global Railway Review",
"url": "https://www.globalrailwayreview.com/",
"type": "news",
"max_depth": 4,
"priority": "low",
"language": "en",
"category": "ferroviaria",
"notes": "News ferroviária - filtrar apenas interior/retrofit",
"estimated_pages": 800,
"relevance_keywords": ["interior", "passenger", "refurbishment", "retrofit", "seat"]
},
{
"name": "Upholstery Resource",
"url": "https://www.upholsteryresource.com/",
"type": "resources",
"max_depth": 4,
"priority": "high",
"language": "en",
"category": "geral",
"notes": "Recursos gerais estofamento - ALTA RELEVÂNCIA",
"estimated_pages": 150,
"relevance_keywords": ["upholstery", "fabric", "leather", "foam", "technique", "pattern"]
}
],
"scraper_settings": {
"request_timeout": 120,
"max_retries": 3,
"politeness_delay": [4, 10],
"use_playwright": true,
"playwright_stealth": true,
"headless": true,
"user_agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"excluded_patterns": [
"/tag/", "/category/", "/author/", "/page/",
"/wp-content/", "/wp-admin/", "/feed/", "/rss/",
"/login", "/register", "/signin", "/signup",
"/cart", "/checkout", "/account", "/my-account",
"/product/", "/shop/", "/store/", "/parts/",
"/members/", "/profile/", "/user/",
"/gallery/", "/photos/", "/images/", "/media/",
"/calendar/", "/events/",
"/search/", "/results/",
"/print/", "/pdf/", "/download/",
"/shipping/", "/returns/", "/warranty/",
"/contact", "/about", "/privacy", "/terms"
],
"content_filters": {
"min_word_count": 100,
"apply_during_scraping": false,
"note": "Filtros aplicados APÓS scraping na fase de extração"
}
},
"vps_execution": {
"recommended_vps": "easy.descomplicar.pt",
"ssh_port": 22,
"ssh_user": "root",
"working_directory": "/root/scraper-ctf",
"estimated_duration_hours": 48,
"estimated_storage_gb": 5,
"recommended_cpu_cores": 4,
"recommended_ram_gb": 8
},
"translation_requirements": {
"german_sites": ["Verdeck.de - Blog", "Verdeck.de - Material", "Lederzentrum Wiki"],
"spanish_sites": ["Piel de Toro"],
"translation_api": "google-translate",
"translation_stage": "after_extraction",
"note": "Tradução apenas para casos extraídos (não todo o conteúdo)"
},
"execution_strategy": {
"total_sites": 24,
"total_estimated_pages": 7070,
"estimated_scraping_time": "48-60 hours",
"estimated_cases": "1000-1300 (taxa 16.5%)",
"phases": [
{
"phase": "1A - Fóruns Alta Prioridade",
"sites": ["Portal dos Clássicos", "Pelican Parts - Porsche Forum", "Pelican Parts - BMW Forum", "Peach Parts - Mercedes Forum"],
"estimated_time": "14-18h"
},
{
"phase": "1B - Fóruns Média/Baixa",
"sites": ["Pelican Parts - VW Audi Forum", "Pelican Parts - Saab Forum", "Pelican Parts - Mini Forum", "Piel de Toro"],
"estimated_time": "10-14h"
},
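{
"phase": "2 - Tech Articles (Anti-bot)",
"sites": ["Pelican Tech - Main Hub", "Pelican Tech - Mercedes", "Pelican Tech - BMW", "Pelican Tech - Mini", "Pelican Tech - Audi", "Pelican Tech - VW", "Pelican Tech - Volvo", "Pelican Tech - Saab"],
"estimated_time": "6-8h",
"note": "Requer Playwright stealth mode"
},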
{
"phase": "3 - Sites Alemães",
"sites": ["Verdeck.de - Blog", "Verdeck.de - Material", "Lederzentrum Wiki"],
"estimated_time": "8-10h"
},
{
"phase": "4 - Aeronáutica/Ferroviária",
"sites": ["Aircraft Interiors International", "Railway Interiors International", "AIN Online", "Global Railway Review"],
"estimated_time": "12-16h"
},
{
"phase": "5 - Recursos Gerais",
"sites": ["Upholstery Resource"],
"estimated_time": "4-6h"
}
]
},
"execution_notes": [
"✅ 16/24 sites validados disponíveis",
"⚠️ 8 tech articles Pelican com HTTP 403 - requer Playwright stealth",
"✅ Portal dos Clássicos RECUPERADO (URL correta encontrada)",
"🌐 3 sites alemães + 1 espanhol requerem tradução APÓS extração",
"🚀 Execução VPS recomendada (48-60h tempo total)",
"📊 Estimativa final KB: 1559-1859 casos totais (559 atuais + 1000-1300 novos)",
"🔧 Nível 4 profundidade por omissão; nível 3 para os 8 tech articles Pelican e Verdeck.de - Material",
"🎯 Filtros keywords aplicados na EXTRAÇÃO (não scraping)",
"⚡ Playwright stealth mode para anti-bot bypass",
"💾 ~5GB storage necessário VPS"
]
}