// File: DashDescomplicar/api/services/monitoring-collector.ts (TypeScript, 284 lines, 9.2 KiB)

/**
* Monitoring Data Collector
* HTTP health checks for services + EasyPanel API metrics + staleness detection
* Runs every 5 minutes via scheduler in server.ts
* @author Descomplicar® | @link descomplicar.pt | @copyright 2026
*/
import db from '../db.js'
/** One HTTP endpoint probed by the service health check. */
interface ServiceCheck {
  /** Name of the service; also used as the name of its stored monitoring record. */
  name: string
  /** URL requested to probe the service. */
  url: string
  /** Additional HTTP codes to treat as 'up' (e.g. 403 for gateway). */
  okStatuses?: number[]
}
/** Outcome of probing a single URL. */
interface CheckResult {
  /** Health classification of the probed endpoint. */
  status: 'up' | 'down' | 'warning'
  /** HTTP status code received; 0 when no response was obtained. */
  http_code: number
  /** Elapsed time of the probe in milliseconds. */
  response_time: number
  /** Failure description, set only when the request itself failed. */
  error?: string
}
/**
 * EasyPanel API config.
 * Accessible from Docker Swarm via service name 'easypanel'.
 * Token read from EASYPANEL_API_TOKEN env var.
 *
 * Both values are trimmed: values injected from env files or secret mounts can
 * carry stray whitespace (typically a trailing newline) that would corrupt the
 * request URL or the Authorization header. Trailing slashes are removed from
 * the URL so that appending "/<endpoint>" never produces a double slash.
 * An unset, blank or slash-only URL falls back to the default.
 */
const EASYPANEL_API_URL =
  process.env.EASYPANEL_API_URL?.trim().replace(/\/+$/, '') || 'http://easypanel:3000/api/trpc'
const EASYPANEL_API_TOKEN = process.env.EASYPANEL_API_TOKEN?.trim() ?? ''
/**
 * Services to monitor via HTTP health check.
 * Each entry maps to a record in tbl_eal_monitoring (category='service').
 *
 * Declared as compact [name, url, okStatuses?] rows and expanded into
 * ServiceCheck objects; okStatuses is only present on rows that declare it.
 */
const SERVICES: ServiceCheck[] = (
  [
    ['Desk CRM', 'https://desk.descomplicar.pt'],
    ['NextCloud', 'https://cloud.descomplicar.pt'],
    ['Gitea', 'https://git.descomplicar.pt'],
    ['Wiki.js', 'https://wiki.descomplicar.pt'],
    ['Syncthing', 'https://sync.descomplicar.pt'],
    ['Authentik', 'https://auth.descomplicar.pt'],
    ['Metabase', 'https://bi.descomplicar.pt'],
    ['N8N', 'https://automator.descomplicar.pt'],
    ['Outline', 'https://hub.descomplicar.pt'],
    ['WhatSMS', 'https://app.whatsms.pt'],
    ['MCP Gateway', 'http://gateway.descomplicar.pt', [403]],
  ] as [name: string, url: string, okStatuses?: number[]][]
).map(([name, url, okStatuses]) => (okStatuses ? { name, url, okStatuses } : { name, url }))
/**
 * Check a single URL and return health status.
 * Uses redirect: 'manual' so 302 (auth redirects) count as 'up'.
 *
 * Classification: 2xx/3xx -> 'up', 4xx -> 'warning', any other code -> 'down'.
 * Request failures never reject; they are reported as 'down' with http_code 0
 * and an `error` text ('Timeout' when the request was aborted by the timer).
 *
 * @param url - Address requested with GET.
 * @param timeoutMs - Milliseconds before the request is aborted (default 10000).
 * @returns Status, HTTP code and elapsed milliseconds of the probe.
 */
async function checkUrl(url: string, timeoutMs = 10000): Promise<CheckResult> {
  const start = Date.now()
  const controller = new AbortController()
  const timeout = setTimeout(() => controller.abort(), timeoutMs)
  try {
    const response = await fetch(url, {
      method: 'GET',
      redirect: 'manual',
      signal: controller.signal,
    })
    const response_time = Date.now() - start
    const http_code = response.status
    // Only the status matters: discard the unread body so the connection is
    // released right away. Best effort, a failing cancel must not alter the result.
    void response.body?.cancel().catch(() => undefined)
    // 2xx or 3xx = service is responding
    if (http_code >= 200 && http_code < 400) {
      return { status: 'up', http_code, response_time }
    }
    // 4xx = service responds but with client error
    if (http_code >= 400 && http_code < 500) {
      return { status: 'warning', http_code, response_time }
    }
    // Everything else (5xx, and anything outside 200-499) = down
    return { status: 'down', http_code, response_time }
  } catch (error: unknown) {
    const response_time = Date.now() - start
    // The controller is only ever aborted by the timer above, so its state
    // identifies a timeout without guessing from the error message.
    if (controller.signal.aborted) {
      return { status: 'down', http_code: 0, response_time, error: 'Timeout' }
    }
    const message = error instanceof Error ? error.message : 'Unknown error'
    return { status: 'down', http_code: 0, response_time, error: message }
  } finally {
    // Always release the timer, including when fetch rejects.
    clearTimeout(timeout)
  }
}
/**
 * Store the latest state of one monitored item.
 * An UPDATE matching (category, name) is attempted first; when it reports zero
 * affected rows, the record is created with an INSERT instead.
 * NOTE(review): the two statements are not atomic — confirm whether concurrent
 * writers for the same (category, name) are possible.
 *
 * @param category - Category of the record (e.g. 'service').
 * @param name - Name of the record within its category.
 * @param status - Status value to store.
 * @param details - Extra data; stored serialized as JSON.
 */
async function upsertMonitoring(category: string, name: string, status: string, details: object): Promise<void> {
  const updateSql = `UPDATE tbl_eal_monitoring SET status = ?, details = ?, last_check = NOW() WHERE category = ? AND name = ?`
  const insertSql = `INSERT INTO tbl_eal_monitoring (category, name, status, details, last_check) VALUES (?, ?, ?, ?, NOW())`
  const serialized = JSON.stringify(details)

  const [outcome] = await db.query(updateSql, [status, serialized, category, name])
  const { affectedRows } = outcome as { affectedRows?: number }
  // Anything other than an explicit zero means no insert is attempted.
  if (affectedRows !== 0) return

  await db.query(insertSql, [category, name, status, serialized])
}
/**
 * Check all services via HTTP and update DB.
 * Runs all checks in parallel for speed.
 *
 * Each entry of SERVICES is probed with checkUrl. A 'warning' result whose HTTP
 * code is listed in the service's okStatuses is promoted to 'up' before the
 * result is stored under category 'service'.
 *
 * A check that rejects is counted as 'down' and its reason is logged, so that a
 * failure while checking or storing is never reported as an outage without a trace.
 *
 * @returns Number of services checked and how many ended as 'up', 'down' or 'warning'.
 */
export async function checkAllServices(): Promise<{ checked: number; up: number; down: number; warning: number }> {
  let up = 0, down = 0, warning = 0
  const results = await Promise.allSettled(
    SERVICES.map(async (service) => {
      const result = await checkUrl(service.url)
      // Override status if HTTP code is in the service's okStatuses list
      if (result.status === 'warning' && service.okStatuses?.includes(result.http_code)) {
        result.status = 'up'
      }
      await upsertMonitoring('service', service.name, result.status, {
        url: service.url,
        http_code: result.http_code,
        response_time: `${result.response_time}ms`,
        ...(result.error ? { error: result.error } : {})
      })
      return { name: service.name, ...result }
    })
  )
  // allSettled keeps input order, so the index identifies the service of a rejection.
  for (const [index, r] of results.entries()) {
    if (r.status === 'rejected') {
      const name = SERVICES[index]?.name ?? `#${index}`
      const reason: unknown = r.reason
      console.error(`[COLLECTOR] Check for ${name} failed:`, reason instanceof Error ? reason.message : reason)
      down++
      continue
    }
    if (r.value.status === 'up') up++
    else if (r.value.status === 'warning') warning++
    else down++
  }
  return { checked: SERVICES.length, up, down, warning }
}
/**
 * Mark WP sites as warning if they haven't reported in >24h.
 * The WP plugin (descomplicar-monitor) POSTs data periodically.
 * If no data arrives, something is wrong.
 *
 * Only 'wordpress' records currently in status 'ok' or 'up' are touched; their
 * details gain `stale: true` and a `stale_reason` text.
 *
 * @returns Number of records that were marked as stale.
 */
export async function checkStaleness(): Promise<number> {
  // The statement is assembled line by line; joined with newlines it is the
  // same text the database has always received.
  const markStaleSql = [
    'UPDATE tbl_eal_monitoring',
    "SET status = 'warning',",
    "details = JSON_SET(COALESCE(details, '{}'), '$.stale', true, '$.stale_reason', 'No data received in 24h')",
    "WHERE category = 'wordpress'",
    "AND status IN ('ok', 'up')",
    'AND last_check < DATE_SUB(NOW(), INTERVAL 24 HOUR)',
  ].join('\n')

  const [outcome] = await db.query(markStaleSql)
  const { affectedRows } = outcome as { affectedRows?: number }
  return affectedRows || 0
}
/**
 * Call EasyPanel tRPC API endpoint.
 * Returns parsed JSON or null on failure.
 *
 * The value returned is the `result.data.json` member of the response body.
 * null is returned when no API token is configured, when the request fails or
 * exceeds the 10 s timeout, when the response status is not OK, or when the
 * expected member is missing. Failures other than a missing token are logged
 * as warnings so the cause is not lost.
 *
 * @param endpoint - Procedure name appended to the API base URL (e.g. 'monitor.getSystemStats').
 * @returns The decoded payload, or null.
 */
async function callEasyPanelAPI(endpoint: string): Promise<any | null> {
  if (!EASYPANEL_API_TOKEN) return null
  const controller = new AbortController()
  const timeout = setTimeout(() => controller.abort(), 10000)
  try {
    const response = await fetch(`${EASYPANEL_API_URL}/${endpoint}`, {
      headers: { 'Authorization': `Bearer ${EASYPANEL_API_TOKEN}` },
      signal: controller.signal,
    })
    if (!response.ok) {
      console.warn(`[EASYPANEL] ${endpoint} responded with HTTP ${response.status}`)
      // The body is not needed; discard it so the connection is released. Best effort.
      void response.body?.cancel().catch(() => undefined)
      return null
    }
    // The timer is still armed here, so a stalled body read is aborted as well.
    const data: any = await response.json()
    return data?.result?.data?.json ?? null
  } catch (error: unknown) {
    // The controller is only aborted by the timer above, so its state identifies a timeout.
    const reason = controller.signal.aborted ? 'Timeout' : error instanceof Error ? error.message : error
    console.warn(`[EASYPANEL] ${endpoint} request failed:`, reason)
    return null
  } finally {
    // Release the timer on every path, including rejected requests.
    clearTimeout(timeout)
  }
}
/**
 * Collect EasyPanel server metrics (CPU, RAM, disk) via API.
 * Replaces SSH-based collection for the Easy server.
 *
 * Reads 'monitor.getSystemStats', stores the figures as the 'EasyPanel' record
 * of category 'server' (always with status 'up') and logs a one-line summary.
 * Missing figures default to 0, except the disk size fields which are passed
 * through as received.
 *
 * @returns false when the API returned no data, true once the record is stored.
 */
export async function collectEasyPanelMetrics(): Promise<boolean> {
  const stats = await callEasyPanelAPI('monitor.getSystemStats')
  if (!stats) return false

  const { cpuInfo, memInfo, diskInfo } = stats
  // Key order is kept stable because it determines the layout of the stored JSON.
  const snapshot = {
    cpu: Math.round(cpuInfo?.usedPercentage ?? 0),
    // One decimal place
    ram: Math.round((memInfo?.usedMemPercentage ?? 0) * 10) / 10,
    disk: parseFloat(diskInfo?.usedPercentage ?? '0'),
    // First entry of the load-average list
    load: cpuInfo?.loadavg?.[0] ?? 0,
    // uptime is divided by 3600, i.e. treated as seconds
    uptime_hours: Math.round((stats.uptime ?? 0) / 3600),
    mem_total_mb: Math.round(memInfo?.totalMemMb ?? 0),
    mem_used_mb: Math.round(memInfo?.usedMemMb ?? 0),
    disk_total_gb: diskInfo?.totalGb,
    disk_free_gb: diskInfo?.freeGb,
  }

  await upsertMonitoring('server', 'EasyPanel', 'up', snapshot)
  console.log(`[EASYPANEL] Server: CPU=${snapshot.cpu}%, RAM=${snapshot.ram}%, Disk=${snapshot.disk}%`)
  return true
}
/**
 * Collect Docker container/task stats via EasyPanel API.
 * Updates the 'container' category in monitoring DB.
 *
 * Reads 'monitor.getDockerTaskStats'. A task counts as running when its
 * `actual` figure is at least its `desired` figure; every other task is
 * reported as down and listed (with the first 'descomplicar_' removed from its
 * name) under `unhealthy`. The record status is 'warning' when any task is
 * down, otherwise 'ok'.
 *
 * @returns false when the API returned no data, true once the record is stored.
 */
export async function collectEasyPanelContainers(): Promise<boolean> {
  const tasks = await callEasyPanelAPI('monitor.getDockerTaskStats')
  if (!tasks) return false

  const entries = Object.entries(tasks) as [string, { actual: number; desired: number }][]
  // Negated ">=" rather than "<", so non-comparable figures still count as down.
  const unhealthy = entries
    .filter(([, replicas]) => !(replicas.actual >= replicas.desired))
    .map(([taskName]) => taskName.replace('descomplicar_', ''))

  const total = entries.length
  const down = unhealthy.length
  const up = total - down
  const anyDown = down > 0

  await upsertMonitoring('container', 'EasyPanel Containers', anyDown ? 'warning' : 'ok', {
    total, up, down, restarting: 0,
    ...(anyDown ? { unhealthy } : {}),
  })

  const downSuffix = anyDown ? `, ${down} down: ${unhealthy.join(', ')}` : ''
  console.log(`[EASYPANEL] Containers: ${up}/${total} running${downSuffix}`)
  return true
}
/**
 * Main collector entry point.
 * Called by scheduler in server.ts every 5 minutes.
 *
 * Runs three stages in order — service checks, EasyPanel collection, staleness
 * check. Each stage is isolated: an error is logged and the next stage still
 * runs, so this function itself does not reject because of a failing stage.
 */
export async function collectMonitoringData(): Promise<void> {
  // Runs one stage and turns a thrown error into a log line instead of a rejection.
  const attempt = async (failureLabel: string, stage: () => Promise<void>): Promise<void> => {
    try {
      await stage()
    } catch (err: unknown) {
      console.error(`[COLLECTOR] ${failureLabel}:`, err instanceof Error ? err.message : err)
    }
  }

  console.log('[COLLECTOR] Starting monitoring collection...')

  await attempt('Service checks failed', async () => {
    const summary = await checkAllServices()
    console.log(`[COLLECTOR] Services: ${summary.up} up, ${summary.warning} warning, ${summary.down} down`)
  })

  // EasyPanel API metrics (replaces SSH for Easy server)
  await attempt('EasyPanel collection failed', async () => {
    const gotStats = await collectEasyPanelMetrics()
    const gotContainers = await collectEasyPanelContainers()
    // Warn only when neither call produced data.
    if (!(gotStats || gotContainers)) {
      console.warn('[COLLECTOR] EasyPanel API unavailable (check EASYPANEL_API_TOKEN)')
    }
  })

  await attempt('Staleness check failed', async () => {
    const stale = await checkStaleness()
    if (stale > 0) {
      console.log(`[COLLECTOR] Marked ${stale} stale WP site(s) as warning`)
    }
  })

  console.log('[COLLECTOR] Done')
}