EverydayTech Platform - Developer Reference
Complete Source Code Documentation - All Applications
Loading...
Searching...
No Matches
agentMonitor.js
Go to the documentation of this file.
1/**
2 * Agent Monitoring Worker
3 * Monitors agent status (online/offline) and creates notifications for status changes
4 */
5
6const pool = require('../services/db');
7const { notifyAgentStatusChange, notifyMonitoringAlert } = require('../utils/notificationHelper');
8
// ---------------------------------------------------------------------------
// Monitoring configuration (all durations are in milliseconds)
// ---------------------------------------------------------------------------

// How often each periodic check runs: 120 s (2 minutes).
const CHECK_INTERVAL = 120 * 1000;

// An agent whose last heartbeat is older than this is treated as offline: 300 s (5 minutes).
const OFFLINE_THRESHOLD = 300 * 1000;

// Usage percentages above which a monitoring alert is raised.
const ALERT_CPU_THRESHOLD = 90;
const ALERT_MEMORY_THRESHOLD = 90;
const ALERT_DISK_THRESHOLD = 90;

// agent_uuid values for which an offline notification has already been sent,
// so the same outage is not reported repeatedly. In-memory only.
const notifiedOfflineAgents = new Set();
18
/**
 * Check for agents that have gone offline and send notifications.
 *
 * One pass does the following:
 *  1. Returns early when the `agents` table does not expose the `last_seen`,
 *     `status` and `agent_uuid` columns.
 *  2. Selects agents whose `last_seen` is older than OFFLINE_THRESHOLD and
 *     whose status is neither 'offline' nor 'uninstalled'.
 *  3. Marks every selected agent 'offline', then sends at most one
 *     notification per agent (tracked in `notifiedOfflineAgents`).
 *  4. Removes the tracking entry of agents that are 'online' again with a
 *     recent heartbeat, so a later outage is reported again.
 *
 * @returns {Promise<void>} Settles when the pass is finished. Query and
 *   notification errors are logged rather than propagated.
 */
async function checkOfflineAgents() {
  try {
    console.log('[AgentMonitor] Checking for offline agents...');

    // Check if agents table exists with required columns (Agent v2 system).
    // MeshCentral uses a different monitoring approach.
    const tableCheck = await pool.query(`
      SELECT column_name
      FROM information_schema.columns
      WHERE table_name = 'agents'
        AND column_name IN ('last_seen', 'status', 'agent_uuid')
    `);

    if (tableCheck.rows.length < 3) {
      console.log('[AgentMonitor] agents table not available (using MeshCentral), skipping offline checks');
      return;
    }

    // OFFLINE_THRESHOLD is a module constant (not user input), so it is safe
    // to interpolate it into the interval literal.
    const thresholdSeconds = OFFLINE_THRESHOLD / 1000;

    // Agents without a heartbeat inside the threshold that are not yet flagged.
    const result = await pool.query(`
      SELECT
        a.agent_id,
        a.agent_uuid,
        a.hostname,
        a.status,
        a.last_seen,
        a.customer_id,
        a.tenant_id,
        EXTRACT(EPOCH FROM (NOW() - a.last_seen)) AS seconds_since_last_seen
      FROM agents a
      WHERE
        a.last_seen < NOW() - INTERVAL '${thresholdSeconds} seconds'
        AND a.status != 'offline'
        AND a.status != 'uninstalled'
      ORDER BY a.last_seen DESC
    `);

    if (result.rows.length > 0) {
      console.log(`[AgentMonitor] Found ${result.rows.length} agents to mark offline`);
    }

    for (const agent of result.rows) {
      console.log(`[AgentMonitor] Agent ${agent.hostname} (${agent.agent_uuid}) is offline (last seen: ${Math.floor(agent.seconds_since_last_seen / 60)} minutes ago)`);

      // Always persist the status. Previously this UPDATE was skipped when the
      // agent was still present in notifiedOfflineAgents, which left such an
      // agent with a non-offline status on every pass.
      try {
        await pool.query(
          'UPDATE agents SET status = $1 WHERE agent_uuid = $2',
          ['offline', agent.agent_uuid]
        );
      } catch (updateError) {
        // Isolate the failure: the row keeps its old status, so it is selected
        // (and retried) on the next pass, and the other agents still get handled.
        console.error(`[AgentMonitor] Error marking ${agent.hostname} offline:`, updateError);
        continue;
      }

      // Only notify once per agent going offline
      if (notifiedOfflineAgents.has(agent.agent_uuid)) {
        continue;
      }

      // NOTE(review): if this notification fails it is not retried, because the
      // row is now 'offline' and no longer matches the query above.
      try {
        await notifyAgentStatusChange({
          agent_id: agent.agent_id,
          hostname: agent.hostname,
          status: 'offline',
          customer_id: agent.customer_id,
          tenant_id: agent.tenant_id
        });

        notifiedOfflineAgents.add(agent.agent_uuid);
        console.log(`[AgentMonitor] Sent offline notification for ${agent.hostname}`);
      } catch (notifError) {
        console.error(`[AgentMonitor] Error sending notification for ${agent.hostname}:`, notifError);
      }
    }

    // Check for agents that have come back online
    const onlineAgents = await pool.query(`
      SELECT agent_uuid, hostname
      FROM agents
      WHERE
        last_seen >= NOW() - INTERVAL '${thresholdSeconds} seconds'
        AND status = 'online'
    `);

    // Clear notifications for agents that are back online.
    // Set#delete returns true only when the entry existed.
    for (const agent of onlineAgents.rows) {
      if (notifiedOfflineAgents.delete(agent.agent_uuid)) {
        console.log(`[AgentMonitor] Agent ${agent.hostname} is back online`);
      }
    }
  } catch (err) {
    console.error('[AgentMonitor] Error checking offline agents:', err);
  }
}
113
/**
 * Check for monitoring alerts (CPU, Memory, Disk thresholds).
 *
 * Reads the most recent `agent_metrics` row (not older than 10 minutes) of
 * every agent whose status is 'online' and calls `notifyMonitoringAlert` for
 * each metric that exceeds its ALERT_*_THRESHOLD constant.
 *
 * Returns early when `agent_metrics` does not expose the `cpu_usage`,
 * `memory_usage` and `disk_usage` columns.
 *
 * NOTE(review): nothing in this function de-duplicates alerts, so a metric
 * that stays above its threshold is reported on every pass unless the
 * notification helper suppresses repeats — confirm that is intended.
 *
 * @returns {Promise<void>} Settles when the pass is finished. Query and
 *   notification errors are logged rather than propagated.
 */
async function checkMonitoringAlerts() {
  try {
    console.log('[AgentMonitor] Checking for monitoring alerts...');

    // Check if agent_metrics table exists and has required columns.
    // This is graceful handling for legacy Agent v2 systems or missing migrations.
    const tableCheck = await pool.query(`
      SELECT column_name
      FROM information_schema.columns
      WHERE table_name = 'agent_metrics'
        AND column_name IN ('cpu_usage', 'memory_usage', 'disk_usage')
    `);

    if (tableCheck.rows.length < 3) {
      // agent_metrics table doesn't exist or is missing required columns.
      // This is expected if using MeshCentral instead of Agent v2.
      console.log('[AgentMonitor] agent_metrics table not available (using MeshCentral), skipping monitoring alerts');
      return;
    }

    // Get latest metrics for all online agents
    const result = await pool.query(`
      SELECT
        a.agent_id,
        a.agent_uuid,
        a.hostname,
        a.customer_id,
        a.tenant_id,
        am.cpu_usage,
        am.memory_usage,
        am.disk_usage,
        am.timestamp
      FROM agents a
      LEFT JOIN LATERAL (
        SELECT cpu_usage, memory_usage, disk_usage, timestamp
        FROM agent_metrics
        WHERE agent_id = a.agent_id
        ORDER BY timestamp DESC
        LIMIT 1
      ) am ON true
      WHERE
        a.status = 'online'
        AND am.timestamp IS NOT NULL
        AND am.timestamp > NOW() - INTERVAL '10 minutes'
    `);

    // One entry per monitored metric; built here so the thresholds are read
    // at call time.
    const metricChecks = [
      { alertType: 'cpu', label: 'CPU', column: 'cpu_usage', threshold: ALERT_CPU_THRESHOLD },
      { alertType: 'memory', label: 'Memory', column: 'memory_usage', threshold: ALERT_MEMORY_THRESHOLD },
      { alertType: 'disk', label: 'Disk', column: 'disk_usage', threshold: ALERT_DISK_THRESHOLD }
    ];

    for (const agent of result.rows) {
      for (const { alertType, label, column, threshold } of metricChecks) {
        const raw = agent[column];
        if (raw === null || raw === undefined || raw === '') {
          continue;
        }

        // The driver may hand NUMERIC/DECIMAL columns back as strings; convert
        // once so the comparison and toFixed() below operate on a real number.
        const value = Number(raw);
        if (!Number.isFinite(value) || value <= threshold) {
          continue;
        }

        // A failing notification must not prevent the remaining alerts.
        try {
          await notifyMonitoringAlert({
            agent_id: agent.agent_id,
            hostname: agent.hostname,
            alertType,
            message: `${label} usage on ${agent.hostname} is at ${value.toFixed(1)}%, exceeding threshold of ${threshold}%`,
            value,
            threshold,
            customer_id: agent.customer_id,
            tenant_id: agent.tenant_id
          });
          console.log(`[AgentMonitor] ${label} alert for ${agent.hostname}: ${value.toFixed(1)}%`);
        } catch (notifError) {
          console.error(`[AgentMonitor] Error sending ${alertType} alert for ${agent.hostname}:`, notifError);
        }
      }
    }
  } catch (err) {
    console.error('[AgentMonitor] Error checking monitoring alerts:', err);
  }
}
214
// Set once the periodic checks have been scheduled, so that a second call to
// startAgentMonitoring() does not register a duplicate pair of timers.
let agentMonitoringStarted = false;

/**
 * Start the monitoring worker.
 *
 * Runs both checks once immediately and then schedules each of them every
 * CHECK_INTERVAL milliseconds. Calling this function again after the worker
 * has started is a no-op (apart from a log line).
 *
 * @returns {void}
 */
function startAgentMonitoring() {
  if (agentMonitoringStarted) {
    console.log('[AgentMonitor] Agent monitoring worker already running, ignoring duplicate start');
    return;
  }
  agentMonitoringStarted = true;

  console.log('[AgentMonitor] Starting agent monitoring worker...');
  console.log(`[AgentMonitor] Check interval: ${CHECK_INTERVAL / 1000}s`);
  console.log(`[AgentMonitor] Offline threshold: ${OFFLINE_THRESHOLD / 1000}s`);

  // Run initial check. The promises are intentionally not awaited: both
  // checks catch and log their own errors.
  void checkOfflineAgents();
  void checkMonitoringAlerts();

  // Schedule periodic checks
  setInterval(checkOfflineAgents, CHECK_INTERVAL);
  setInterval(checkMonitoringAlerts, CHECK_INTERVAL);

  console.log('[AgentMonitor] Agent monitoring worker started');
}
233
234// Start monitoring if this file is run directly
235if (require.main === module) {
236 startAgentMonitoring();
237
238 // Keep process alive
239 process.on('SIGINT', () => {
240 console.log('[AgentMonitor] Shutting down gracefully...');
241 process.exit(0);
242 });
243}
244
245module.exports = {
246 startAgentMonitoring,
247 checkOfflineAgents,
248 checkMonitoringAlerts
249};