feat: add monitoring-collector.ts - HTTP health checks for 11 services
This commit is contained in:
181
api/services/monitoring-collector.ts
Normal file
181
api/services/monitoring-collector.ts
Normal file
@@ -0,0 +1,181 @@
|
||||
/**
|
||||
* Monitoring Data Collector
|
||||
* HTTP health checks for services + staleness detection for WP sites
|
||||
* Runs every 5 minutes via scheduler in server.ts
|
||||
* @author Descomplicar® | @link descomplicar.pt | @copyright 2026
|
||||
*/
|
||||
import db from '../db.js'
|
||||
|
||||
/**
 * Describes one HTTP endpoint probed by the collector.
 */
interface ServiceCheck {
  /** Service name; also used as the record name when the result is stored. */
  name: string

  /** URL requested by the health check. */
  url: string

  /** Additional HTTP codes to treat as 'up' (e.g. 403 for gateway). */
  okStatuses?: number[]
}
|
||||
|
||||
/**
 * Outcome of a single HTTP probe.
 */
interface CheckResult {
  /** Health classification of the probed service. */
  status: 'up' | 'down' | 'warning'

  /** HTTP status code of the response; 0 when the request itself failed. */
  http_code: number

  /** Milliseconds elapsed between starting the request and its response or failure. */
  response_time: number

  /** Failure description; only set when the request itself failed. */
  error?: string
}
|
||||
|
||||
/**
 * HTTP endpoints probed on every collection run.
 * Each entry is stored in tbl_eal_monitoring under category 'service',
 * keyed by its name.
 */
const SERVICES: ServiceCheck[] = [
  {
    name: 'Desk CRM',
    url: 'https://desk.descomplicar.pt',
  },
  {
    name: 'NextCloud',
    url: 'https://cloud.descomplicar.pt',
  },
  {
    name: 'Gitea',
    url: 'https://git.descomplicar.pt',
  },
  {
    name: 'Wiki.js',
    url: 'https://wiki.descomplicar.pt',
  },
  {
    name: 'Syncthing',
    url: 'https://sync.descomplicar.pt',
  },
  {
    name: 'Authentik',
    url: 'https://auth.descomplicar.pt',
  },
  {
    name: 'Metabase',
    url: 'https://bi.descomplicar.pt',
  },
  {
    name: 'N8N',
    url: 'https://automator.descomplicar.pt',
  },
  {
    name: 'Outline',
    url: 'https://hub.descomplicar.pt',
  },
  {
    name: 'WhatSMS',
    url: 'https://whatsms.pt',
  },
  {
    // The gateway answers 403 when healthy, so that code is accepted as 'up'.
    name: 'MCP Gateway',
    url: 'http://gateway.descomplicar.pt',
    okStatuses: [403],
  },
]
|
||||
|
||||
/**
 * Probe a single URL and classify the service's health from the HTTP response.
 *
 * Uses redirect: 'manual' so a 3xx (e.g. an auth redirect) counts as 'up'
 * instead of being followed.
 *
 * Never rejects: any failure of the request itself (network error, timeout)
 * is reported as a 'down' result with http_code 0 and an error message.
 *
 * @param url - URL to request with GET.
 * @param timeoutMs - Milliseconds to wait before aborting the request.
 * @returns Status, HTTP code and elapsed time in milliseconds.
 */
async function checkUrl(url: string, timeoutMs = 10000): Promise<CheckResult> {
  const start = Date.now()
  const controller = new AbortController()
  const timeout = setTimeout(() => controller.abort(), timeoutMs)

  try {
    const response = await fetch(url, {
      method: 'GET',
      redirect: 'manual',
      signal: controller.signal,
    })

    const response_time = Date.now() - start
    const http_code = response.status

    // Only the status code is needed. Cancel the unread body so the underlying
    // connection is released right away rather than at garbage collection.
    void response.body?.cancel().catch(() => undefined)

    // 2xx or 3xx = service is responding
    if (http_code >= 200 && http_code < 400) {
      return { status: 'up', http_code, response_time }
    }

    // 4xx = service responds but with client error
    if (http_code >= 400 && http_code < 500) {
      return { status: 'warning', http_code, response_time }
    }

    // Anything else (5xx or an unexpected code) = server error
    return { status: 'down', http_code, response_time }
  } catch (error: unknown) {
    const response_time = Date.now() - start
    const message = error instanceof Error ? error.message : 'Unknown error'

    // The controller is aborted only by the timer above, so an aborted signal
    // means the deadline passed. The error name is checked as well; the error
    // message is not, because its wording differs between runtimes.
    const timedOut =
      controller.signal.aborted || (error instanceof Error && error.name === 'AbortError')

    return { status: 'down', http_code: 0, response_time, error: timedOut ? 'Timeout' : message }
  } finally {
    // Runs on success and failure, so a failed request never leaves a pending timer.
    clearTimeout(timeout)
  }
}
|
||||
|
||||
/**
 * Store a monitoring status for the row identified by (category, name).
 * An UPDATE is attempted first; if it reports zero affected rows, a new row
 * is INSERTed instead.
 *
 * @param category - Value matched against / written to the category column.
 * @param name - Value matched against / written to the name column.
 * @param status - Status value to store.
 * @param details - Extra data, serialised with JSON.stringify into the details column.
 */
async function upsertMonitoring(category: string, name: string, status: string, details: object): Promise<void> {
  const serialized = JSON.stringify(details)

  const updateSql = `UPDATE tbl_eal_monitoring SET status = ?, details = ?, last_check = NOW() WHERE category = ? AND name = ?`
  const insertSql = `INSERT INTO tbl_eal_monitoring (category, name, status, details, last_check) VALUES (?, ?, ?, ?, NOW())`

  const [updateOutcome] = await db.query(updateSql, [status, serialized, category, name])

  // A non-zero count means the UPDATE already handled the row; nothing to insert.
  const { affectedRows } = updateOutcome as any
  if (affectedRows !== 0) {
    return
  }

  await db.query(insertSql, [category, name, status, serialized])
}
|
||||
|
||||
/**
 * Check every entry of SERVICES over HTTP and store each result in the DB.
 * All checks run in parallel for speed.
 *
 * A 4xx response listed in the service's okStatuses is promoted to 'up'.
 * A failure to store a result is logged and does not change how the service
 * is counted: the summary reflects the HTTP check, not the DB write.
 *
 * @returns Number of services checked and how many were up, down or warning.
 */
export async function checkAllServices(): Promise<{ checked: number; up: number; down: number; warning: number }> {
  const outcomes = await Promise.allSettled(
    SERVICES.map(async (service) => {
      const result = await checkUrl(service.url)

      // Override status if HTTP code is in the service's okStatuses list
      if (result.status === 'warning' && service.okStatuses?.includes(result.http_code)) {
        result.status = 'up'
      }

      try {
        await upsertMonitoring('service', service.name, result.status, {
          url: service.url,
          http_code: result.http_code,
          response_time: `${result.response_time}ms`,
          ...(result.error ? { error: result.error } : {})
        })
      } catch (err: unknown) {
        // A storage problem must not make a healthy service look down,
        // and must not vanish without a trace.
        console.error(
          `[COLLECTOR] Failed to store status for ${service.name}:`,
          err instanceof Error ? err.message : err
        )
      }

      return { name: service.name, ...result }
    })
  )

  let up = 0, down = 0, warning = 0

  for (const outcome of outcomes) {
    if (outcome.status !== 'fulfilled') {
      // The check itself threw unexpectedly; count it as down, but say why.
      const reason: unknown = outcome.reason
      console.error('[COLLECTOR] Service check failed unexpectedly:', reason instanceof Error ? reason.message : reason)
      down++
      continue
    }

    if (outcome.value.status === 'up') up++
    else if (outcome.value.status === 'warning') warning++
    else down++
  }

  return { checked: SERVICES.length, up, down, warning }
}
|
||||
|
||||
/**
 * Flag WordPress records that have gone quiet for more than 24 hours.
 * The WP plugin (descomplicar-monitor) POSTs data periodically, so a record
 * whose last_check is older than that suggests something is wrong.
 *
 * Only rows of category 'wordpress' whose status is 'ok' or 'up' are touched:
 * their status becomes 'warning' and a stale marker plus reason is written
 * into the details JSON.
 *
 * @returns Number of rows the UPDATE reported as affected (0 when none).
 */
export async function checkStaleness(): Promise<number> {
  const markStaleSql = `UPDATE tbl_eal_monitoring
     SET status = 'warning',
         details = JSON_SET(COALESCE(details, '{}'), '$.stale', true, '$.stale_reason', 'No data received in 24h')
     WHERE category = 'wordpress'
       AND status IN ('ok', 'up')
       AND last_check < DATE_SUB(NOW(), INTERVAL 24 HOUR)`

  const [outcome] = await db.query(markStaleSql)
  const affected = (outcome as any).affectedRows

  // Fall back to 0 when the driver reports no usable count.
  return affected ? affected : 0
}
|
||||
|
||||
/**
 * Entry point of the collector.
 * Called by the scheduler in server.ts every 5 minutes.
 *
 * Runs the service checks and then the staleness check. Each step has its own
 * error handling, so a failure in one is logged and does not stop the other.
 */
export async function collectMonitoringData(): Promise<void> {
  // Log the message of real Error objects, anything else as-is.
  const describe = (failure: unknown): unknown => (failure instanceof Error ? failure.message : failure)

  console.log('[COLLECTOR] Starting monitoring collection...')

  try {
    const { up, warning, down } = await checkAllServices()
    console.log(`[COLLECTOR] Services: ${up} up, ${warning} warning, ${down} down`)
  } catch (err: unknown) {
    console.error('[COLLECTOR] Service checks failed:', describe(err))
  }

  try {
    const staleCount = await checkStaleness()
    // Stay quiet when nothing was marked.
    if (staleCount > 0) {
      console.log(`[COLLECTOR] Marked ${staleCount} stale WP site(s) as warning`)
    }
  } catch (err: unknown) {
    console.error('[COLLECTOR] Staleness check failed:', describe(err))
  }

  console.log('[COLLECTOR] Done')
}
|
||||
Reference in New Issue
Block a user