86 lines
2.3 KiB
JSON
Executable File
86 lines
2.3 KiB
JSON
Executable File
{
|
|
"client": "CTF_Carstuff_Batch3",
|
|
"output_base_dir": "/media/ealmeida/Dados/GDrive/Cloud/Clientes_360/CTF_Carstuff/KB/Scrapper/sites",
|
|
"output_dirs": {
|
|
"raw": "output_md",
|
|
"cleaned": "output_cleaned",
|
|
"formatted": "formatted",
|
|
"logs": "logs"
|
|
},
|
|
"sites": [
|
|
{
|
|
"name": "Portal dos Clássicos",
|
|
"url": "https://forum.portaldosclassicos.com",
|
|
"type": "forum",
|
|
"max_depth": 3,
|
|
"priority": "high",
|
|
"language": "pt",
|
|
"notes": "Fórum PT - prioridade alta (mercado local) - DEPTH 3 para captura completa"
|
|
},
|
|
{
|
|
"name": "Triumph Experience Forum",
|
|
"url": "https://triumphexp.com/forum/",
|
|
"type": "forum",
"language": "en",
|
|
"max_depth": 3,
|
|
"priority": "medium",
|
|
"notes": "Fórum Triumph - DEPTH 3 para captura completa"
|
|
},
|
|
{
|
|
"name": "Autosattler.de Community",
|
|
"url": "https://autosattler.de/community",
|
|
"type": "forum",
|
|
"language": "de",
|
|
"max_depth": 3,
|
|
"priority": "low",
|
|
"notes": "Alemão - comunidade estofadores - DEPTH 3 para captura completa (requer validação de URL)"
|
|
},
|
|
{
|
|
"name": "Lederzentrum Forum",
|
|
"url": "https://lederzentrum.de/forum",
|
|
"type": "forum",
|
|
"language": "de",
|
|
"max_depth": 3,
|
|
"priority": "low",
|
|
"notes": "Alemão - fórum técnico couro - DEPTH 3 para captura completa"
|
|
},
|
|
{
|
|
"name": "Foro Piel de Toro",
|
|
"url": "https://foro.pieldetoro.net",
|
|
"type": "forum",
|
|
"language": "es",
|
|
"max_depth": 3,
|
|
"priority": "low",
|
|
"notes": "Espanhol - automóveis clássicos - DEPTH 3 para captura completa"
|
|
}
|
|
],
|
|
"scraper_settings": {
|
|
"request_timeout": 90,
|
|
"max_retries": 3,
|
|
"politeness_delay": [3, 8],
|
|
"excluded_patterns": [
|
|
"/tag/",
|
|
"/category/",
|
|
"/author/",
|
|
"/page/",
|
|
"/wp-content/",
|
|
"/wp-admin/",
|
|
"/feed/",
|
|
"/rss/",
|
|
"/login",
|
|
"/register",
|
|
"/cart",
|
|
"/checkout",
|
|
"/product/",
|
|
"/shop/",
|
|
"/store/"
|
|
]
|
|
},
|
|
"execution_notes": [
|
|
"BATCH 3: Sites restantes não scrapeados",
|
|
"Portal dos Clássicos: PT - Alta prioridade (mercado local)",
|
|
"Sites alemães/espanhóis: Considerar tradução posterior",
|
|
"Reddit: Usar reddit_scraper.py separadamente",
|
|
"Total: 5 fóruns internacionais + 2 subreddits Reddit"
|
|
]
|
|
}
|