init: scripts diversos (crawlers, conversores, scrapers)
This commit is contained in:
417
scraper/ctf_config_batch4.json
Executable file
417
scraper/ctf_config_batch4.json
Executable file
@@ -0,0 +1,417 @@
|
||||
{
|
||||
"client": "CTF_Carstuff_Batch4",
|
||||
"description": "BATCH 4 - Expansão massiva: 16 sites novos + Portal Clássicos recuperado",
|
||||
"output_base_dir": "/root/scraper-ctf",
|
||||
"output_dirs": {
|
||||
"raw": "output_md_batch4",
|
||||
"cleaned": "output_cleaned_batch4",
|
||||
"formatted": "formatted_batch4",
|
||||
"logs": "logs"
|
||||
},
|
||||
"sites": [
|
||||
{
|
||||
"name": "Portal dos Clássicos",
|
||||
"url": "https://portalclassicos.com/foruns/index.php",
|
||||
"type": "forum",
|
||||
"max_depth": 4,
|
||||
"priority": "high",
|
||||
"language": "pt",
|
||||
"category": "automovel-classico",
|
||||
"notes": "RECUPERADO! URL correta. Fórum PT - mercado local prioritário",
|
||||
"estimated_pages": 300,
|
||||
"relevance_keywords": ["estofamento", "interior", "banco", "couro", "vinil", "capota", "restauro"]
|
||||
},
|
||||
{
|
||||
"name": "Pelican Parts - Porsche Forum",
|
||||
"url": "https://forums.pelicanparts.com/porsche-forums/",
|
||||
"type": "forum",
|
||||
"max_depth": 4,
|
||||
"priority": "high",
|
||||
"language": "en",
|
||||
"category": "automovel-classico",
|
||||
"notes": "Fórum Porsche - interior/estofamento posts prioritários",
|
||||
"estimated_pages": 1000,
|
||||
"relevance_keywords": ["interior", "upholstery", "leather", "seat", "trim", "convertible top"]
|
||||
},
|
||||
{
|
||||
"name": "Pelican Parts - BMW Forum",
|
||||
"url": "https://forums.pelicanparts.com/bmw-forums/",
|
||||
"type": "forum",
|
||||
"max_depth": 4,
|
||||
"priority": "high",
|
||||
"language": "en",
|
||||
"category": "automovel",
|
||||
"notes": "Fórum BMW - comunidade ativa",
|
||||
"estimated_pages": 800,
|
||||
"relevance_keywords": ["interior", "upholstery", "leather", "seat", "trim", "alcantara"]
|
||||
},
|
||||
{
|
||||
"name": "Peach Parts - Mercedes Forum",
|
||||
"url": "https://www.peachparts.com/shopforum/index.php",
|
||||
"type": "forum",
|
||||
"max_depth": 4,
|
||||
"priority": "high",
|
||||
"language": "en",
|
||||
"category": "automovel-classico",
|
||||
"notes": "Fórum Mercedes especializado - MB-Tex, leather comum",
|
||||
"estimated_pages": 700,
|
||||
"relevance_keywords": ["MB-Tex", "interior", "upholstery", "leather", "seat", "trim"]
|
||||
},
|
||||
{
|
||||
"name": "Pelican Parts - VW Audi Forum",
|
||||
"url": "https://forums.pelicanparts.com/vw-audi-technical-forum/",
|
||||
"type": "forum",
|
||||
"max_depth": 4,
|
||||
"priority": "medium",
|
||||
"language": "en",
|
||||
"category": "automovel",
|
||||
"notes": "Fórum VW/Audi - mercado relevante",
|
||||
"estimated_pages": 500,
|
||||
"relevance_keywords": ["interior", "upholstery", "leather", "seat", "trim"]
|
||||
},
|
||||
{
|
||||
"name": "Pelican Parts - Saab Forum",
|
||||
"url": "https://forums.pelicanparts.com/saab-technical-forum/",
|
||||
"type": "forum",
|
||||
"max_depth": 4,
|
||||
"priority": "medium",
|
||||
"language": "en",
|
||||
"category": "automovel-classico",
|
||||
"notes": "Fórum Saab - nicho mas ativo",
|
||||
"estimated_pages": 300,
|
||||
"relevance_keywords": ["interior", "upholstery", "leather", "seat", "trim"]
|
||||
},
|
||||
{
|
||||
"name": "Pelican Parts - Mini Forum",
|
||||
"url": "https://forums.pelicanparts.com/mini-discussion-forum/",
|
||||
"type": "forum",
|
||||
"max_depth": 4,
|
||||
"priority": "medium",
|
||||
"language": "en",
|
||||
"category": "automovel",
|
||||
"notes": "Fórum Mini - comunidade Mini Cooper",
|
||||
"estimated_pages": 300,
|
||||
"relevance_keywords": ["interior", "upholstery", "leather", "seat", "trim"]
|
||||
},
|
||||
{
|
||||
"name": "Pelican Tech - Main Hub",
|
||||
"url": "https://www.pelicanparts.com/techarticles/tech_center_main.htm",
|
||||
"type": "tech_articles",
|
||||
"max_depth": 3,
|
||||
"priority": "high",
|
||||
"language": "en",
|
||||
"category": "geral",
|
||||
"notes": "ANTI-BOT (403). Hub central tech articles - usar Playwright stealth",
|
||||
"estimated_pages": 80,
|
||||
"requires_javascript": true,
|
||||
"anti_bot_protection": true,
|
||||
"relevance_keywords": ["interior", "upholstery", "trim", "seat", "restoration"]
|
||||
},
|
||||
{
|
||||
"name": "Pelican Tech - Mercedes",
|
||||
"url": "https://www.pelicanparts.com/techarticles/Mercedes-Benz/MBZ_Tech_Index.htm",
|
||||
"type": "tech_articles",
|
||||
"max_depth": 3,
|
||||
"priority": "high",
|
||||
"language": "en",
|
||||
"category": "automovel",
|
||||
"notes": "ANTI-BOT (403). Tech articles Mercedes - usar Playwright stealth",
|
||||
"estimated_pages": 40,
|
||||
"requires_javascript": true,
|
||||
"anti_bot_protection": true,
|
||||
"relevance_keywords": ["interior", "upholstery", "MB-Tex", "trim"]
|
||||
},
|
||||
{
|
||||
"name": "Pelican Tech - BMW",
|
||||
"url": "https://www.pelicanparts.com/BMW/techarticles/tech_main.htm",
|
||||
"type": "tech_articles",
|
||||
"max_depth": 3,
|
||||
"priority": "high",
|
||||
"language": "en",
|
||||
"category": "automovel",
|
||||
"notes": "ANTI-BOT (403). Tech articles BMW - usar Playwright stealth",
|
||||
"estimated_pages": 40,
|
||||
"requires_javascript": true,
|
||||
"anti_bot_protection": true,
|
||||
"relevance_keywords": ["interior", "upholstery", "leather", "trim"]
|
||||
},
|
||||
{
|
||||
"name": "Pelican Tech - Mini",
|
||||
"url": "https://www.pelicanparts.com/MINI/index-SC.htm",
|
||||
"type": "tech_articles",
|
||||
"max_depth": 3,
|
||||
"priority": "medium",
|
||||
"language": "en",
|
||||
"category": "automovel",
|
||||
"notes": "ANTI-BOT (403). Tech articles Mini - usar Playwright stealth",
|
||||
"estimated_pages": 25,
|
||||
"requires_javascript": true,
|
||||
"anti_bot_protection": true,
|
||||
"relevance_keywords": ["interior", "upholstery", "trim"]
|
||||
},
|
||||
{
|
||||
"name": "Pelican Tech - Audi",
|
||||
"url": "https://www.pelicanparts.com/techarticles/Audi_tech/Audi_Tech_Index.htm",
|
||||
"type": "tech_articles",
|
||||
"max_depth": 3,
|
||||
"priority": "medium",
|
||||
"language": "en",
|
||||
"category": "automovel",
|
||||
"notes": "ANTI-BOT (403). Tech articles Audi - usar Playwright stealth",
|
||||
"estimated_pages": 25,
|
||||
"requires_javascript": true,
|
||||
"anti_bot_protection": true,
|
||||
"relevance_keywords": ["interior", "upholstery", "leather", "trim"]
|
||||
},
|
||||
{
|
||||
"name": "Pelican Tech - VW",
|
||||
"url": "https://www.pelicanparts.com/techarticles/Volkswagen_Tech_Index.htm",
|
||||
"type": "tech_articles",
|
||||
"max_depth": 3,
|
||||
"priority": "medium",
|
||||
"language": "en",
|
||||
"category": "automovel",
|
||||
"notes": "ANTI-BOT (403). Tech articles VW - usar Playwright stealth",
|
||||
"estimated_pages": 25,
|
||||
"requires_javascript": true,
|
||||
"anti_bot_protection": true,
|
||||
"relevance_keywords": ["interior", "upholstery", "trim"]
|
||||
},
|
||||
{
|
||||
"name": "Pelican Tech - Volvo",
|
||||
"url": "https://www.pelicanparts.com/techarticles/Volvo_Tech.htm",
|
||||
"type": "tech_articles",
|
||||
"max_depth": 3,
|
||||
"priority": "low",
|
||||
"language": "en",
|
||||
"category": "automovel",
|
||||
"notes": "ANTI-BOT (403). Tech articles Volvo - usar Playwright stealth",
|
||||
"estimated_pages": 15,
|
||||
"requires_javascript": true,
|
||||
"anti_bot_protection": true,
|
||||
"relevance_keywords": ["interior", "upholstery", "trim"]
|
||||
},
|
||||
{
|
||||
"name": "Pelican Tech - Saab",
|
||||
"url": "https://www.pelicanparts.com/techarticles/Saab_Tech.htm",
|
||||
"type": "tech_articles",
|
||||
"max_depth": 3,
|
||||
"priority": "low",
|
||||
"language": "en",
|
||||
"category": "automovel",
|
||||
"notes": "ANTI-BOT (403). Tech articles Saab - usar Playwright stealth",
|
||||
"estimated_pages": 15,
|
||||
"requires_javascript": true,
|
||||
"anti_bot_protection": true,
|
||||
"relevance_keywords": ["interior", "upholstery", "trim"]
|
||||
},
|
||||
{
|
||||
"name": "Verdeck.de - Blog",
|
||||
"url": "https://www.verdeck.de/blog/",
|
||||
"type": "blog",
|
||||
"max_depth": 4,
|
||||
"priority": "high",
|
||||
"language": "de",
|
||||
"category": "capotas",
|
||||
"notes": "Alemão - especialistas capotas conversível. TRADUÇÃO NECESSÁRIA",
|
||||
"estimated_pages": 80,
|
||||
"requires_translation": true,
|
||||
"relevance_keywords": ["verdeck", "cabrio", "cabriolet", "stoffverdeck", "leder", "innenausstattung"]
|
||||
},
|
||||
{
|
||||
"name": "Verdeck.de - Material",
|
||||
"url": "https://www.verdeck.de/unser-material/",
|
||||
"type": "resources",
|
||||
"max_depth": 3,
|
||||
"priority": "high",
|
||||
"language": "de",
|
||||
"category": "capotas",
|
||||
"notes": "Alemão - catálogo materiais capotas. TRADUÇÃO NECESSÁRIA",
|
||||
"estimated_pages": 25,
|
||||
"requires_translation": true,
|
||||
"relevance_keywords": ["material", "stoff", "sonnland", "haartz"]
|
||||
},
|
||||
{
|
||||
"name": "Lederzentrum Wiki",
|
||||
"url": "https://www.lederzentrum.de/wiki/index.php/Das_Lederzentrum_Lederlexikon",
|
||||
"type": "wiki",
|
||||
"max_depth": 4,
|
||||
"priority": "high",
|
||||
"language": "de",
|
||||
"category": "couro",
|
||||
"notes": "Alemão - enciclopédia técnica couro. ALTA PRIORIDADE. TRADUÇÃO NECESSÁRIA",
|
||||
"estimated_pages": 150,
|
||||
"requires_translation": true,
|
||||
"relevance_keywords": ["leder", "autoleder", "reparatur", "pflege", "reinigung"]
|
||||
},
|
||||
{
|
||||
"name": "Piel de Toro",
|
||||
"url": "https://pieldetoro.net/web/default.php",
|
||||
"type": "forum",
|
||||
"max_depth": 4,
|
||||
"priority": "medium",
|
||||
"language": "es",
|
||||
"category": "automovel-classico",
|
||||
"notes": "Espanhol - clássicos espanhóis. TRADUÇÃO NECESSÁRIA",
|
||||
"estimated_pages": 200,
|
||||
"requires_translation": true,
|
||||
"relevance_keywords": ["tapiceria", "cuero", "interior", "restauracion"]
|
||||
},
|
||||
{
|
||||
"name": "Aircraft Interiors International",
|
||||
"url": "https://www.aircraftinteriorsinternational.com/",
|
||||
"type": "magazine",
|
||||
"max_depth": 4,
|
||||
"priority": "medium",
|
||||
"language": "en",
|
||||
"category": "aeronautica",
|
||||
"notes": "Magazine aeronáutica - CTF vende para aviação",
|
||||
"estimated_pages": 350,
|
||||
"relevance_keywords": ["aircraft interior", "cabin", "seat", "upholstery", "leather", "fabric"]
|
||||
},
|
||||
{
|
||||
"name": "AIN Online",
|
||||
"url": "https://www.ainonline.com/",
|
||||
"type": "news",
|
||||
"max_depth": 4,
|
||||
"priority": "low",
|
||||
"language": "en",
|
||||
"category": "aeronautica",
|
||||
"notes": "News aeronáutica - filtrar apenas interior/retrofit",
|
||||
"estimated_pages": 800,
|
||||
"relevance_keywords": ["interior", "cabin", "retrofit", "refurbishment", "upholstery"]
|
||||
},
|
||||
{
|
||||
"name": "Railway Interiors International",
|
||||
"url": "https://www.railwayinteriorsinternational.com/",
|
||||
"type": "magazine",
|
||||
"max_depth": 4,
|
||||
"priority": "medium",
|
||||
"language": "en",
|
||||
"category": "ferroviaria",
|
||||
"notes": "Magazine ferroviária - CTF vende para comboios",
|
||||
"estimated_pages": 350,
|
||||
"relevance_keywords": ["railway interior", "train", "seat", "upholstery", "fabric", "refurbishment"]
|
||||
},
|
||||
{
|
||||
"name": "Global Railway Review",
|
||||
"url": "https://www.globalrailwayreview.com/",
|
||||
"type": "news",
|
||||
"max_depth": 4,
|
||||
"priority": "low",
|
||||
"language": "en",
|
||||
"category": "ferroviaria",
|
||||
"notes": "News ferroviária - filtrar apenas interior/retrofit",
|
||||
"estimated_pages": 800,
|
||||
"relevance_keywords": ["interior", "passenger", "refurbishment", "retrofit", "seat"]
|
||||
},
|
||||
{
|
||||
"name": "Upholstery Resource",
|
||||
"url": "https://www.upholsteryresource.com/",
|
||||
"type": "resources",
|
||||
"max_depth": 4,
|
||||
"priority": "high",
|
||||
"language": "en",
|
||||
"category": "geral",
|
||||
"notes": "Recursos gerais estofamento - ALTA RELEVÂNCIA",
|
||||
"estimated_pages": 150,
|
||||
"relevance_keywords": ["upholstery", "fabric", "leather", "foam", "technique", "pattern"]
|
||||
}
|
||||
],
|
||||
"scraper_settings": {
|
||||
"request_timeout": 120,
|
||||
"max_retries": 3,
|
||||
"politeness_delay": [4, 10],
|
||||
"use_playwright": true,
|
||||
"playwright_stealth": true,
|
||||
"headless": true,
|
||||
"user_agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
"excluded_patterns": [
|
||||
"/tag/", "/category/", "/author/", "/page/",
|
||||
"/wp-content/", "/wp-admin/", "/feed/", "/rss/",
|
||||
"/login", "/register", "/signin", "/signup",
|
||||
"/cart", "/checkout", "/account", "/my-account",
|
||||
"/product/", "/shop/", "/store/", "/parts/",
|
||||
"/members/", "/profile/", "/user/",
|
||||
"/gallery/", "/photos/", "/images/", "/media/",
|
||||
"/calendar/", "/events/",
|
||||
"/search/", "/results/",
|
||||
"/print/", "/pdf/", "/download/",
|
||||
"/shipping/", "/returns/", "/warranty/",
|
||||
"/contact", "/about", "/privacy", "/terms"
|
||||
],
|
||||
"content_filters": {
|
||||
"min_word_count": 100,
|
||||
"apply_during_scraping": false,
|
||||
"note": "Filtros aplicados APÓS scraping na fase de extração"
|
||||
}
|
||||
},
|
||||
"vps_execution": {
|
||||
"recommended_vps": "easy.descomplicar.pt",
|
||||
"ssh_port": 22,
|
||||
"ssh_user": "root",
|
||||
"working_directory": "/root/scraper-ctf",
|
||||
"estimated_duration_hours": 48,
|
||||
"estimated_storage_gb": 5,
|
||||
"recommended_cpu_cores": 4,
|
||||
"recommended_ram_gb": 8
|
||||
},
|
||||
"translation_requirements": {
|
||||
"german_sites": ["Verdeck.de - Blog", "Verdeck.de - Material", "Lederzentrum Wiki"],
|
||||
"spanish_sites": ["Piel de Toro"],
|
||||
"translation_api": "google-translate",
|
||||
"translation_stage": "after_extraction",
|
||||
"note": "Tradução apenas para casos extraídos (não todo o conteúdo)"
|
||||
},
|
||||
"execution_strategy": {
|
||||
"total_sites": 24,
|
||||
"total_estimated_pages": 7070,
|
||||
"estimated_scraping_time": "48-60 hours",
|
||||
"estimated_cases": "1000-1300 (taxa 16.5%)",
|
||||
"phases": [
|
||||
{
|
||||
"phase": "1A - Fóruns Alta Prioridade",
|
||||
"sites": ["Portal dos Clássicos", "Pelican Parts - Porsche Forum", "Pelican Parts - BMW Forum", "Peach Parts - Mercedes Forum"],
|
||||
"estimated_time": "14-18h"
|
||||
},
|
||||
{
|
||||
"phase": "1B - Fóruns Média/Baixa",
|
||||
"sites": ["Pelican Parts - VW Audi Forum", "Pelican Parts - Saab Forum", "Pelican Parts - Mini Forum", "Piel de Toro"],
|
||||
"estimated_time": "10-14h"
|
||||
},
|
||||
{
|
||||
"phase": "2 - Tech Articles (Anti-bot)",
|
||||
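"sites": ["Pelican Tech - Main Hub", "Pelican Tech - Mercedes", "Pelican Tech - BMW", "Pelican Tech - Mini", "Pelican Tech - Audi", "Pelican Tech - VW", "Pelican Tech - Volvo", "Pelican Tech - Saab"],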
|
||||
"estimated_time": "6-8h",
|
||||
"note": "Requer Playwright stealth mode"
|
||||
},
|
||||
{
|
||||
"phase": "3 - Sites Alemães",
|
||||
"sites": ["Verdeck.de - Blog", "Verdeck.de - Material", "Lederzentrum Wiki"],
|
||||
"estimated_time": "8-10h"
|
||||
},
|
||||
{
|
||||
"phase": "4 - Aeronáutica/Ferroviária",
|
||||
"sites": ["Aircraft Interiors International", "Railway Interiors International", "AIN Online", "Global Railway Review"],
|
||||
"estimated_time": "12-16h"
|
||||
},
|
||||
{
|
||||
"phase": "5 - Recursos Gerais",
|
||||
"sites": ["Upholstery Resource"],
|
||||
"estimated_time": "4-6h"
|
||||
}
|
||||
]
|
||||
},
|
||||
"execution_notes": [
|
||||
"✅ 16/24 sites validados disponíveis",
|
||||
"⚠️ 8 tech articles Pelican com HTTP 403 - requer Playwright stealth",
|
||||
"✅ Portal dos Clássicos RECUPERADO (URL correta encontrada)",
|
||||
"🌐 3 sites alemães + 1 espanhol requerem tradução APÓS extração",
|
||||
"🚀 Execução VPS recomendada (48-60h tempo total)",
|
||||
"📊 Estimativa final KB: 1559-1859 casos totais (559 atuais + 1000-1300 novos)",
|
||||
"🔧 Nível 4 profundidade por omissão; nível 3 para os 8 tech articles Pelican e Verdeck.de - Material",
|
||||
"🎯 Filtros keywords aplicados na EXTRAÇÃO (não scraping)",
|
||||
"⚡ Playwright stealth mode para anti-bot bypass",
|
||||
"💾 ~5GB storage necessário VPS"
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user