/**
 * Agent Monitoring Worker
 * Monitors agent status (online/offline) and creates notifications for status changes
 */
6const pool = require('../services/db');
7const { notifyAgentStatusChange, notifyMonitoringAlert } = require('../utils/notificationHelper');
// Scheduling (all durations in milliseconds).
const CHECK_INTERVAL = 2 * 60 * 1000; // Check every 2 minutes
const OFFLINE_THRESHOLD = 5 * 60 * 1000; // 5 minutes without heartbeat = offline

// Resource alert thresholds, in percent. A reading must be strictly greater
// than the threshold to raise an alert.
const ALERT_CPU_THRESHOLD = 90; // CPU % threshold
const ALERT_MEMORY_THRESHOLD = 90; // Memory % threshold
const ALERT_DISK_THRESHOLD = 90; // Disk % threshold

// Track which agents we've already notified about being offline.
// Holds agent_uuid values; it lives in memory only, so it starts empty every
// time the process starts.
const notifiedOfflineAgents = new Set();
20 * Check for agents that have gone offline and send notifications
22async function checkOfflineAgents() {
// Summary of what the visible code does, in order:
//  1. Looks up the columns 'last_seen', 'status' and 'agent_uuid' of the
//     'agents' table in information_schema.columns. When fewer than three
//     rows come back it logs that offline checks are being skipped.
//     NOTE(review): the early return after that log is not visible in this
//     excerpt - confirm it is present.
//  2. Selects agents whose last_seen is older than OFFLINE_THRESHOLD and whose
//     status is neither 'offline' nor 'uninstalled'.
//  3. For each such agent that is not yet in notifiedOfflineAgents: sets its
//     status to 'offline', calls notifyAgentStatusChange, then records the
//     agent_uuid in notifiedOfflineAgents so it is only notified once.
//  4. Runs a second query for agents whose last_seen is within
//     OFFLINE_THRESHOLD again and removes them from notifiedOfflineAgents.
//     NOTE(review): any further conditions of that query are not visible here.
// The interval value interpolated into the SQL text is derived from the
// module constant OFFLINE_THRESHOLD (milliseconds / 1000), not from caller
// input.
// Takes no arguments. Errors are reported with console.error.
24 console.log('[AgentMonitor] Checking for offline agents...');
26 // Check if agents table exists with required columns (Agent v2 system)
27 // MeshCentral uses a different monitoring approach
28 const tableCheck = await pool.query(`
30 FROM information_schema.columns
31 WHERE table_name = 'agents'
32 AND column_name IN ('last_seen', 'status', 'agent_uuid')
35 if (tableCheck.rows.length < 3) {
36 console.log('[AgentMonitor] agents table not available (using MeshCentral), skipping offline checks');
40 // Find agents that haven't sent heartbeat in OFFLINE_THRESHOLD and were previously online
41 const result = await pool.query(`
50 EXTRACT(EPOCH FROM (NOW() - a.last_seen)) AS seconds_since_last_seen
53 a.last_seen < NOW() - INTERVAL '${OFFLINE_THRESHOLD / 1000} seconds'
54 AND a.status != 'offline'
55 AND a.status != 'uninstalled'
56 ORDER BY a.last_seen DESC
59 if (result.rows.length > 0) {
60 console.log(`[AgentMonitor] Found ${result.rows.length} agents to mark offline`);
63 for (const agent of result.rows) {
64 // Only notify once per agent going offline
65 if (!notifiedOfflineAgents.has(agent.agent_uuid)) {
// seconds_since_last_seen comes from the EXTRACT(EPOCH ...) column of the
// query above; it is divided by 60 and floored to log whole minutes.
66 console.log(`[AgentMonitor] Agent ${agent.hostname} (${agent.agent_uuid}) is offline (last seen: ${Math.floor(agent.seconds_since_last_seen / 60)} minutes ago)`);
68 // Update agent status to offline
70 'UPDATE agents SET status = $1 WHERE agent_uuid = $2',
71 ['offline', agent.agent_uuid]
// NOTE(review): the status UPDATE above runs before the notification, and the
// select filters out agents whose status is 'offline'. If
// notifyAgentStatusChange throws, the agent is not added to
// notifiedOfflineAgents and appears not to be selected again on later runs,
// so the notification would not be retried - confirm this is intended.
76 await notifyAgentStatusChange({
77 agent_id: agent.agent_id,
78 hostname: agent.hostname,
80 customer_id: agent.customer_id,
81 tenant_id: agent.tenant_id
84 notifiedOfflineAgents.add(agent.agent_uuid);
85 console.log(`[AgentMonitor] Sent offline notification for ${agent.hostname}`);
86 } catch (notifError) {
// A failed notification is logged and does not abort the loop over the
// remaining agents.
87 console.error(`[AgentMonitor] Error sending notification for ${agent.hostname}:`, notifError);
92 // Check for agents that have come back online
93 const onlineAgents = await pool.query(`
94 SELECT agent_uuid, hostname
97 last_seen >= NOW() - INTERVAL '${OFFLINE_THRESHOLD / 1000} seconds'
101 // Clear notifications for agents that are back online
102 for (const agent of onlineAgents.rows) {
103 if (notifiedOfflineAgents.has(agent.agent_uuid)) {
104 notifiedOfflineAgents.delete(agent.agent_uuid);
105 console.log(`[AgentMonitor] Agent ${agent.hostname} is back online`);
110 console.error('[AgentMonitor] Error checking offline agents:', err);
115 * Check for monitoring alerts (CPU, Memory, Disk thresholds)
117async function checkMonitoringAlerts() {
// Summary of what the visible code does, in order:
//  1. Looks up the columns 'cpu_usage', 'memory_usage' and 'disk_usage' of
//     the 'agent_metrics' table in information_schema.columns. When fewer
//     than three rows come back it logs that monitoring alerts are skipped.
//     NOTE(review): the early return after that log is not visible in this
//     excerpt - confirm it is present.
//  2. Loads metrics per agent. The visible SQL reads agent_metrics rows for
//     each agent_id ordered by timestamp descending and keeps only rows whose
//     timestamp is non-null and newer than 10 minutes.
//     NOTE(review): the select list, the join and any LIMIT are not visible
//     here.
//  3. For every returned row, compares cpu_usage, memory_usage and disk_usage
//     against their ALERT_*_THRESHOLD constant and calls
//     notifyMonitoringAlert once per metric that is strictly above it.
// Takes no arguments. Errors are reported with console.error.
119 console.log('[AgentMonitor] Checking for monitoring alerts...');
121 // Check if agent_metrics table exists and has required columns
122 // This is graceful handling for legacy Agent v2 systems or missing migrations
123 const tableCheck = await pool.query(`
125 FROM information_schema.columns
126 WHERE table_name = 'agent_metrics'
127 AND column_name IN ('cpu_usage', 'memory_usage', 'disk_usage')
130 if (tableCheck.rows.length < 3) {
131 // agent_metrics table doesn't exist or missing required columns
132 // This is expected if using MeshCentral instead of Agent v2
133 console.log('[AgentMonitor] agent_metrics table not available (using MeshCentral), skipping monitoring alerts');
137 // Get latest metrics for all online agents
138 const result = await pool.query(`
151 SELECT cpu_usage, memory_usage, disk_usage, timestamp
153 WHERE agent_id = a.agent_id
154 ORDER BY timestamp DESC
159 AND am.timestamp IS NOT NULL
160 AND am.timestamp > NOW() - INTERVAL '10 minutes'
163 for (const agent of result.rows) {
// Each check below first tests the reading for truthiness, so null,
// undefined and 0 readings are skipped before the threshold comparison.
// NOTE(review): .toFixed() exists only on numbers. If the database driver
// returns these columns as strings (node-postgres does so for NUMERIC
// columns), the calls below would throw - confirm the column types.
// NOTE(review): no de-duplication is visible in this function, so an agent
// that stays above a threshold raises a new alert on every run unless
// notifyMonitoringAlert suppresses repeats - confirm.
164 // Check CPU threshold
165 if (agent.cpu_usage && agent.cpu_usage > ALERT_CPU_THRESHOLD) {
166 await notifyMonitoringAlert({
167 agent_id: agent.agent_id,
168 hostname: agent.hostname,
170 message: `CPU usage on ${agent.hostname} is at ${agent.cpu_usage.toFixed(1)}%, exceeding threshold of ${ALERT_CPU_THRESHOLD}%`,
171 value: agent.cpu_usage,
172 threshold: ALERT_CPU_THRESHOLD,
173 customer_id: agent.customer_id,
174 tenant_id: agent.tenant_id
176 console.log(`[AgentMonitor] CPU alert for ${agent.hostname}: ${agent.cpu_usage.toFixed(1)}%`);
179 // Check Memory threshold
180 if (agent.memory_usage && agent.memory_usage > ALERT_MEMORY_THRESHOLD) {
181 await notifyMonitoringAlert({
182 agent_id: agent.agent_id,
183 hostname: agent.hostname,
185 message: `Memory usage on ${agent.hostname} is at ${agent.memory_usage.toFixed(1)}%, exceeding threshold of ${ALERT_MEMORY_THRESHOLD}%`,
186 value: agent.memory_usage,
187 threshold: ALERT_MEMORY_THRESHOLD,
188 customer_id: agent.customer_id,
189 tenant_id: agent.tenant_id
191 console.log(`[AgentMonitor] Memory alert for ${agent.hostname}: ${agent.memory_usage.toFixed(1)}%`);
194 // Check Disk threshold
195 if (agent.disk_usage && agent.disk_usage > ALERT_DISK_THRESHOLD) {
196 await notifyMonitoringAlert({
197 agent_id: agent.agent_id,
198 hostname: agent.hostname,
200 message: `Disk usage on ${agent.hostname} is at ${agent.disk_usage.toFixed(1)}%, exceeding threshold of ${ALERT_DISK_THRESHOLD}%`,
201 value: agent.disk_usage,
202 threshold: ALERT_DISK_THRESHOLD,
203 customer_id: agent.customer_id,
204 tenant_id: agent.tenant_id
206 console.log(`[AgentMonitor] Disk alert for ${agent.hostname}: ${agent.disk_usage.toFixed(1)}%`);
211 console.error('[AgentMonitor] Error checking monitoring alerts:', err);
/**
 * Start the monitoring worker.
 *
 * Logs the configured check interval and offline threshold, runs both checks
 * once straight away, and then schedules each check to repeat every
 * CHECK_INTERVAL milliseconds with setInterval. The two timers are never
 * cleared by this function.
 *
 * @returns {void}
 */
function startAgentMonitoring() {
  console.log('[AgentMonitor] Starting agent monitoring worker...');
  console.log(`[AgentMonitor] Check interval: ${CHECK_INTERVAL / 1000}s`);
  console.log(`[AgentMonitor] Offline threshold: ${OFFLINE_THRESHOLD / 1000}s`);

  // Run both checks immediately instead of waiting for the first tick.
  // They are deliberately not awaited: startup must not block on the database.
  checkOfflineAgents();
  checkMonitoringAlerts();

  // Schedule periodic checks
  setInterval(checkOfflineAgents, CHECK_INTERVAL);
  setInterval(checkMonitoringAlerts, CHECK_INTERVAL);

  console.log('[AgentMonitor] Agent monitoring worker started');
}
234// Start monitoring if this file is run directly
// require.main === module holds only when Node.js loaded this file as the
// entry script, so requiring the module from elsewhere does not start the
// worker as a side effect.
235if (require.main === module) {
236 startAgentMonitoring();
238 // Keep process alive
// NOTE(review): the visible part of this SIGINT handler only logs a shutdown
// message; the rest of the handler is not visible in this excerpt.
239 process.on('SIGINT', () => {
240 console.log('[AgentMonitor] Shutting down gracefully...');
// NOTE(review): the two names below look like entries of the module's export
// object; the surrounding module.exports lines are not visible in this
// excerpt - confirm the full export list.
246 startAgentMonitoring,
248 checkMonitoringAlerts