1 /*****************************************************************************
2 *
3 * CHECKS.C - Service and host check functions for Nagios
4 *
5 * Copyright (c) 2011 Nagios Core Development Team
6 * Copyright (c) 1999-2010 Ethan Galstad (egalstad@nagios.org)
7 * Last Modified: 01-20-2011
8 *
9 * License:
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *****************************************************************************/
25
26 #include "../include/config.h"
27 #include "../include/comments.h"
28 #include "../include/common.h"
29 #include "../include/statusdata.h"
30 #include "../include/downtime.h"
31 #include "../include/macros.h"
32 #include "../include/nagios.h"
33 #include "../include/broker.h"
34 #include "../include/perfdata.h"
35
36 /*#define DEBUG_CHECKS*/
37 /*#define DEBUG_HOST_CHECKS 1*/
38
39
40 #ifdef EMBEDDEDPERL
41 #include "../include/epn_nagios.h"
42 #endif
43
44 #ifdef USE_EVENT_BROKER
45 #include "../include/neberrors.h"
46 #endif
47
48 extern int sigshutdown;
49 extern int sigrestart;
50
51 extern char *temp_file;
52 extern char *temp_path;
53 extern char *check_result_path;
54
55 extern int interval_length;
56
57 extern int command_check_interval;
58
59 extern int log_initial_states;
60 extern int log_passive_checks;
61 extern int log_host_retries;
62
63 extern int service_check_timeout;
64 extern int host_check_timeout;
65
66 extern int check_reaper_interval;
67 extern int max_check_reaper_time;
68
69 extern int use_aggressive_host_checking;
70 extern unsigned long cached_host_check_horizon;
71 extern unsigned long cached_service_check_horizon;
72 extern int enable_predictive_host_dependency_checks;
73 extern int enable_predictive_service_dependency_checks;
74
75 extern int soft_state_dependencies;
76
77 extern int currently_running_service_checks;
78 extern int currently_running_host_checks;
79
80 extern int accept_passive_service_checks;
81 extern int execute_service_checks;
82 extern int accept_passive_host_checks;
83 extern int execute_host_checks;
84 extern int obsess_over_services;
85 extern int obsess_over_hosts;
86
87 extern int translate_passive_host_checks;
88 extern int passive_host_checks_are_soft;
89
90 extern int check_service_freshness;
91 extern int check_host_freshness;
92 extern int additional_freshness_latency;
93
94 extern int max_host_check_spread;
95 extern int max_service_check_spread;
96
97 extern int use_large_installation_tweaks;
98 extern int free_child_process_memory;
99 extern int child_processes_fork_twice;
100
101 extern time_t last_program_stop;
102 extern time_t program_start;
103 extern time_t event_start;
104
105 extern timed_event *event_list_low;
106 extern timed_event *event_list_low_tail;
107
108 extern host *host_list;
109 extern service *service_list;
110 extern servicedependency *servicedependency_list;
111 extern hostdependency *hostdependency_list;
112
113 extern unsigned long next_event_id;
114 extern unsigned long next_problem_id;
115
116 extern check_result check_result_info;
117 extern check_result *check_result_list;
118
119 extern pthread_t worker_threads[TOTAL_WORKER_THREADS];
120
121 extern unsigned long max_debug_file_size;
122
123 #ifdef EMBEDDEDPERL
124 extern int use_embedded_perl;
125 #endif
126
127
128
129
130
131 /******************************************************************/
132 /********************** CHECK REAPER FUNCTIONS ********************/
133 /******************************************************************/
134
135 /* reaps host and service check results */
reap_check_results(void)136 int reap_check_results(void) {
137 check_result *queued_check_result = NULL;
138 service *temp_service = NULL;
139 host *temp_host = NULL;
140 time_t current_time = 0L;
141 time_t reaper_start_time = 0L;
142 int reaped_checks = 0;
143
144 log_debug_info(DEBUGL_FUNCTIONS, 0, "reap_check_results() start\n");
145 log_debug_info(DEBUGL_CHECKS, 0, "Starting to reap check results.\n");
146
147 /* get the start time */
148 time(&reaper_start_time);
149
150 /* process files in the check result queue */
151 process_check_result_queue(check_result_path);
152
153 /* read all check results that have come in... */
154 while((queued_check_result = read_check_result(&check_result_list))) {
155
156 reaped_checks++;
157
158 log_debug_info(DEBUGL_CHECKS, 2, "Found a check result (#%d) to handle...\n", reaped_checks);
159
160 /* service check */
161 if(queued_check_result->object_check_type == SERVICE_CHECK) {
162
163 /* make sure the service exists */
164 if((temp_service = find_service(queued_check_result->host_name, queued_check_result->service_description)) == NULL) {
165
166 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check result queue contained results for service '%s' on host '%s', but the service could not be found! Perhaps you forgot to define the service in your config files?\n", queued_check_result->service_description, queued_check_result->host_name);
167
168 /* free memory */
169 free_check_result(queued_check_result);
170 my_free(queued_check_result);
171
172 /* TODO - add new service definition automatically */
173
174 continue;
175 }
176
177 log_debug_info(DEBUGL_CHECKS, 1, "Handling check result for service '%s' on host '%s'...\n", temp_service->description, temp_service->host_name);
178
179 /* process the check result */
180 handle_async_service_check_result(temp_service, queued_check_result);
181 }
182
183 /* host check */
184 else {
185 if((temp_host = find_host(queued_check_result->host_name)) == NULL) {
186
187 /* make sure the host exists */
188 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check result queue contained results for host '%s', but the host could not be found! Perhaps you forgot to define the host in your config files?\n", queued_check_result->host_name);
189
190 /* free memory */
191 free_check_result(queued_check_result);
192 my_free(queued_check_result);
193
194 /* TODO - add new host definition automatically */
195
196 continue;
197 }
198
199 log_debug_info(DEBUGL_CHECKS, 1, "Handling check result for host '%s'...\n", temp_host->name);
200
201 /* process the check result */
202 handle_async_host_check_result_3x(temp_host, queued_check_result);
203 }
204
205 log_debug_info(DEBUGL_CHECKS | DEBUGL_IPC, 1, "Deleted check result file '%s'\n", queued_check_result->output_file);
206
207 /* free allocated memory */
208 free_check_result(queued_check_result);
209 my_free(queued_check_result);
210
211 /* break out if we've been here too long (max_check_reaper_time seconds) */
212 time(¤t_time);
213 if((int)(current_time - reaper_start_time) > max_check_reaper_time) {
214 log_debug_info(DEBUGL_CHECKS, 0, "Breaking out of check result reaper: max reaper time exceeded\n");
215 break;
216 }
217
218 /* bail out if we encountered a signal */
219 if(sigshutdown == TRUE || sigrestart == TRUE) {
220 log_debug_info(DEBUGL_CHECKS, 0, "Breaking out of check result reaper: signal encountered\n");
221 break;
222 }
223 }
224
225 log_debug_info(DEBUGL_CHECKS, 0, "Finished reaping %d check results\n", reaped_checks);
226 log_debug_info(DEBUGL_FUNCTIONS, 0, "reap_check_results() end\n");
227
228 return OK;
229 }
230
231
232
233
234 /******************************************************************/
235 /****************** SERVICE MONITORING FUNCTIONS ******************/
236 /******************************************************************/
237
238 /* executes a scheduled service check */
run_scheduled_service_check(service * svc,int check_options,double latency)239 int run_scheduled_service_check(service *svc, int check_options, double latency) {
240 int result = OK;
241 time_t current_time = 0L;
242 time_t preferred_time = 0L;
243 time_t next_valid_time = 0L;
244 int time_is_valid = TRUE;
245
246 if(svc == NULL)
247 return ERROR;
248
249 log_debug_info(DEBUGL_FUNCTIONS, 0, "run_scheduled_service_check() start\n");
250 log_debug_info(DEBUGL_CHECKS, 0, "Attempting to run scheduled check of service '%s' on host '%s': check options=%d, latency=%lf\n", svc->description, svc->host_name, check_options, latency);
251
252 /*
253 * reset the next_check_event so we know it's
254 * no longer in the scheduling queue
255 */
256 svc->next_check_event = NULL;
257
258 /* attempt to run the check */
259 result = run_async_service_check(svc, check_options, latency, TRUE, TRUE, &time_is_valid, &preferred_time);
260
261 /* an error occurred, so reschedule the check */
262 if(result == ERROR) {
263
264 log_debug_info(DEBUGL_CHECKS, 1, "Unable to run scheduled service check at this time\n");
265
266 /* only attempt to (re)schedule checks that should get checked... */
267 if(svc->should_be_scheduled == TRUE) {
268
269 /* get current time */
270 time(¤t_time);
271
272 /* determine next time we should check the service if needed */
273 /* if service has no check interval, schedule it again for 5 minutes from now */
274 if(current_time >= preferred_time)
275 preferred_time = current_time + ((svc->check_interval <= 0) ? 300 : (svc->check_interval * interval_length));
276
277 /* make sure we rescheduled the next service check at a valid time */
278 get_next_valid_time(preferred_time, &next_valid_time, svc->check_period_ptr);
279
280 /*
281 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Service '%s' on host '%s' timeperiod check failed...\n",svc->description,svc->host_name);
282 logit(NSLOG_RUNTIME_WARNING,TRUE,"Current time: %s",ctime(¤t_time));
283 logit(NSLOG_RUNTIME_WARNING,TRUE,"Preferred time: %s",ctime(&preferred_time));
284 logit(NSLOG_RUNTIME_WARNING,TRUE,"Next valid time: %s",ctime(&next_valid_time));
285 */
286
287 /* the service could not be rescheduled properly - set the next check time for next week */
288 /*if(time_is_valid==FALSE && next_valid_time==preferred_time){*/
289 /* UPDATED 08/12/09 EG to reflect proper timeperod check logic */
290 if(time_is_valid == FALSE && check_time_against_period(next_valid_time, svc->check_period_ptr) == ERROR) {
291
292 /*
293 svc->next_check=(time_t)(next_valid_time+(60*60*24*365));
294 svc->should_be_scheduled=FALSE;
295 */
296
297 svc->next_check = (time_t)(next_valid_time + (60 * 60 * 24 * 7));
298
299 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check of service '%s' on host '%s' could not be rescheduled properly. Scheduling check for next week...\n", svc->description, svc->host_name);
300
301 log_debug_info(DEBUGL_CHECKS, 1, "Unable to find any valid times to reschedule the next service check!\n");
302 }
303
304 /* this service could be rescheduled... */
305 else {
306 svc->next_check = next_valid_time;
307 svc->should_be_scheduled = TRUE;
308
309 log_debug_info(DEBUGL_CHECKS, 1, "Rescheduled next service check for %s", ctime(&next_valid_time));
310 }
311 }
312
313 /* reschedule the next service check - unless we couldn't find a valid next check time */
314 /* 10/19/07 EG - keep original check options */
315 if(svc->should_be_scheduled == TRUE)
316 schedule_service_check(svc, svc->next_check, check_options);
317
318 /* update the status log */
319 update_service_status(svc, FALSE);
320
321 return ERROR;
322 }
323
324 return OK;
325 }
326
327
328 /* forks a child process to run a service check, but does not wait for the service check result */
run_async_service_check(service * svc,int check_options,double latency,int scheduled_check,int reschedule_check,int * time_is_valid,time_t * preferred_time)329 int run_async_service_check(service *svc, int check_options, double latency, int scheduled_check, int reschedule_check, int *time_is_valid, time_t *preferred_time) {
330 nagios_macros mac;
331 char *raw_command = NULL;
332 char *processed_command = NULL;
333 char output_buffer[MAX_INPUT_BUFFER] = "";
334 char *temp_buffer = NULL;
335 struct timeval start_time, end_time;
336 pid_t pid = 0;
337 int fork_error = FALSE;
338 int wait_result = 0;
339 host *temp_host = NULL;
340 FILE *fp = NULL;
341 int pclose_result = 0;
342 mode_t new_umask = 077;
343 mode_t old_umask;
344 char *output_file = NULL;
345 double old_latency = 0.0;
346 dbuf checkresult_dbuf;
347 int dbuf_chunk = 1024;
348 #ifdef USE_EVENT_BROKER
349 int neb_result = OK;
350 #endif
351 #ifdef EMBEDDEDPERL
352 char fname[512] = "";
353 char *args[5] = {"", DO_CLEAN, "", "", NULL };
354 char *perl_plugin_output = NULL;
355 SV *plugin_hndlr_cr = NULL;
356 int count ;
357 int use_epn = FALSE;
358 #ifdef aTHX
359 dTHX;
360 #endif
361 dSP;
362 #endif
363
364 log_debug_info(DEBUGL_FUNCTIONS, 0, "run_async_service_check()\n");
365
366 /* make sure we have something */
367 if(svc == NULL)
368 return ERROR;
369
370 /* is the service check viable at this time? */
371 if(check_service_check_viability(svc, check_options, time_is_valid, preferred_time) == ERROR)
372 return ERROR;
373
374 /* find the host associated with this service */
375 if((temp_host = svc->host_ptr) == NULL)
376 return ERROR;
377
378 /******** GOOD TO GO FOR A REAL SERVICE CHECK AT THIS POINT ********/
379
380 #ifdef USE_EVENT_BROKER
381 /* initialize start/end times */
382 start_time.tv_sec = 0L;
383 start_time.tv_usec = 0L;
384 end_time.tv_sec = 0L;
385 end_time.tv_usec = 0L;
386
387 /* send data to event broker */
388 neb_result = broker_service_check(NEBTYPE_SERVICECHECK_ASYNC_PRECHECK, NEBFLAG_NONE, NEBATTR_NONE, svc, SERVICE_CHECK_ACTIVE, start_time, end_time, svc->service_check_command, svc->latency, 0.0, 0, FALSE, 0, NULL, NULL);
389
390 /* neb module wants to cancel the service check - the check will be rescheduled for a later time by the scheduling logic */
391 if(neb_result == NEBERROR_CALLBACKCANCEL) {
392 if(preferred_time)
393 *preferred_time += (svc->check_interval * interval_length);
394 return ERROR;
395 }
396
397 /* neb module wants to override (or cancel) the service check - perhaps it will check the service itself */
398 /* NOTE: if a module does this, it has to do a lot of the stuff found below to make sure things don't get whacked out of shape! */
399 /* NOTE: if would be easier for modules to override checks when the NEBTYPE_SERVICECHECK_INITIATE event is called (later) */
400 if(neb_result == NEBERROR_CALLBACKOVERRIDE)
401 return OK;
402 #endif
403
404
405 log_debug_info(DEBUGL_CHECKS, 0, "Checking service '%s' on host '%s'...\n", svc->description, svc->host_name);
406
407 /* clear check options - we don't want old check options retained */
408 /* only clear check options for scheduled checks - ondemand checks shouldn't affected retained check options */
409 if(scheduled_check == TRUE)
410 svc->check_options = CHECK_OPTION_NONE;
411
412 /* update latency for macros, event broker, save old value for later */
413 old_latency = svc->latency;
414 svc->latency = latency;
415
416 /* grab the host and service macro variables */
417 memset(&mac, 0, sizeof(mac));
418 grab_host_macros_r(&mac, temp_host);
419 grab_service_macros_r(&mac, svc);
420
421 /* get the raw command line */
422 get_raw_command_line_r(&mac, svc->check_command_ptr, svc->service_check_command, &raw_command, 0);
423 if(raw_command == NULL) {
424 clear_volatile_macros_r(&mac);
425 log_debug_info(DEBUGL_CHECKS, 0, "Raw check command for service '%s' on host '%s' was NULL - aborting.\n", svc->description, svc->host_name);
426 if(preferred_time)
427 *preferred_time += (svc->check_interval * interval_length);
428 svc->latency = old_latency;
429 return ERROR;
430 }
431
432 /* process any macros contained in the argument */
433 process_macros_r(&mac, raw_command, &processed_command, 0);
434 my_free(raw_command);
435 if(processed_command == NULL) {
436 clear_volatile_macros_r(&mac);
437 log_debug_info(DEBUGL_CHECKS, 0, "Processed check command for service '%s' on host '%s' was NULL - aborting.\n", svc->description, svc->host_name);
438 if(preferred_time)
439 *preferred_time += (svc->check_interval * interval_length);
440 svc->latency = old_latency;
441 return ERROR;
442 }
443
444 /* get the command start time */
445 gettimeofday(&start_time, NULL);
446
447 #ifdef USE_EVENT_BROKER
448 /* send data to event broker */
449 neb_result = broker_service_check(NEBTYPE_SERVICECHECK_INITIATE, NEBFLAG_NONE, NEBATTR_NONE, svc, SERVICE_CHECK_ACTIVE, start_time, end_time, svc->service_check_command, svc->latency, 0.0, service_check_timeout, FALSE, 0, processed_command, NULL);
450
451 /* neb module wants to override the service check - perhaps it will check the service itself */
452 if(neb_result == NEBERROR_CALLBACKOVERRIDE) {
453 clear_volatile_macros_r(&mac);
454 svc->latency = old_latency;
455 my_free(processed_command);
456 return OK;
457 }
458 #endif
459
460 /* increment number of service checks that are currently running... */
461 currently_running_service_checks++;
462
463 /* set the execution flag */
464 svc->is_executing = TRUE;
465
466 /* start save check info */
467 check_result_info.object_check_type = SERVICE_CHECK;
468 check_result_info.check_type = SERVICE_CHECK_ACTIVE;
469 check_result_info.check_options = check_options;
470 check_result_info.scheduled_check = scheduled_check;
471 check_result_info.reschedule_check = reschedule_check;
472 check_result_info.start_time = start_time;
473 check_result_info.finish_time = start_time;
474 check_result_info.early_timeout = FALSE;
475 check_result_info.exited_ok = TRUE;
476 check_result_info.return_code = STATE_OK;
477 check_result_info.output = NULL;
478
479 /* open a temp file for storing check output */
480 old_umask = umask(new_umask);
481 asprintf(&output_file, "%s/checkXXXXXX", temp_path);
482 check_result_info.output_file_fd = mkstemp(output_file);
483 if(check_result_info.output_file_fd >= 0)
484 check_result_info.output_file_fp = fdopen(check_result_info.output_file_fd, "w");
485 else {
486 check_result_info.output_file_fp = NULL;
487 check_result_info.output_file_fd = -1;
488 }
489 umask(old_umask);
490
491 log_debug_info(DEBUGL_CHECKS | DEBUGL_IPC, 1, "Check result output will be written to '%s' (fd=%d)\n", output_file, check_result_info.output_file_fd);
492
493
494 /* finish save check info */
495 check_result_info.host_name = (char *)strdup(svc->host_name);
496 check_result_info.service_description = (char *)strdup(svc->description);
497 check_result_info.output_file = (check_result_info.output_file_fd < 0 || output_file == NULL) ? NULL : strdup(output_file);
498
499 /* free memory */
500 my_free(output_file);
501
502 /* write start of check result file */
503 /* if things go really bad later on down the line, the user will at least have a partial file to help debug missing output results */
504 if(check_result_info.output_file_fp) {
505
506 fprintf(check_result_info.output_file_fp, "### Active Check Result File ###\n");
507 fprintf(check_result_info.output_file_fp, "file_time=%lu\n", (unsigned long)check_result_info.start_time.tv_sec);
508 fprintf(check_result_info.output_file_fp, "\n");
509
510 fprintf(check_result_info.output_file_fp, "### Nagios Service Check Result ###\n");
511 fprintf(check_result_info.output_file_fp, "# Time: %s", ctime(&check_result_info.start_time.tv_sec));
512 fprintf(check_result_info.output_file_fp, "host_name=%s\n", check_result_info.host_name);
513 fprintf(check_result_info.output_file_fp, "service_description=%s\n", check_result_info.service_description);
514 fprintf(check_result_info.output_file_fp, "check_type=%d\n", check_result_info.check_type);
515 fprintf(check_result_info.output_file_fp, "check_options=%d\n", check_result_info.check_options);
516 fprintf(check_result_info.output_file_fp, "scheduled_check=%d\n", check_result_info.scheduled_check);
517 fprintf(check_result_info.output_file_fp, "reschedule_check=%d\n", check_result_info.reschedule_check);
518 fprintf(check_result_info.output_file_fp, "latency=%f\n", svc->latency);
519 fprintf(check_result_info.output_file_fp, "start_time=%lu.%lu\n", check_result_info.start_time.tv_sec, check_result_info.start_time.tv_usec);
520
521 /* flush output or it'll get written again when we fork() */
522 fflush(check_result_info.output_file_fp);
523 }
524
525 /* initialize dynamic buffer for storing plugin output */
526 dbuf_init(&checkresult_dbuf, dbuf_chunk);
527
528
529 /* reset latency (permanent value will be set later) */
530 svc->latency = old_latency;
531
532 /* update check statistics */
533 update_check_stats((scheduled_check == TRUE) ? ACTIVE_SCHEDULED_SERVICE_CHECK_STATS : ACTIVE_ONDEMAND_SERVICE_CHECK_STATS, start_time.tv_sec);
534
535 #ifdef EMBEDDEDPERL
536
537 /* get"filename" component of command */
538 strncpy(fname, processed_command, strcspn(processed_command, " "));
539 fname[strcspn(processed_command, " ")] = '\x0';
540
541 /* should we use the embedded Perl interpreter to run this script? */
542 use_epn = file_uses_embedded_perl(fname);
543
544 /* if yes, do some initialization */
545 if(use_epn == TRUE) {
546
547 log_debug_info(DEBUGL_CHECKS, 1, "** Using Embedded Perl interpreter to run service check...\n");
548
549 args[0] = fname;
550 args[2] = "";
551
552 if(strchr(processed_command, ' ') == NULL)
553 args[3] = "";
554 else
555 args[3] = processed_command + strlen(fname) + 1;
556
557 ENTER;
558 SAVETMPS;
559 PUSHMARK(SP);
560 XPUSHs(sv_2mortal(newSVpv(args[0], 0)));
561 XPUSHs(sv_2mortal(newSVpv(args[1], 0)));
562 XPUSHs(sv_2mortal(newSVpv(args[2], 0)));
563 XPUSHs(sv_2mortal(newSVpv(args[3], 0)));
564 PUTBACK;
565
566 /* call our perl interpreter to compile and optionally cache the command */
567
568 call_pv("Embed::Persistent::eval_file", G_SCALAR | G_EVAL);
569
570 SPAGAIN ;
571
572 if(SvTRUE(ERRSV)) {
573
574 /*
575 * if SvTRUE(ERRSV)
576 * write failure to IPC pipe
577 * return
578 */
579
580 /* remove the top element of the Perl stack (undef) */
581 (void) POPs ;
582
583 pclose_result = STATE_UNKNOWN;
584 perl_plugin_output = SvPVX(ERRSV);
585
586 log_debug_info(DEBUGL_CHECKS, 0, "Embedded Perl failed to compile %s, compile error %s - skipping plugin\n", fname, perl_plugin_output);
587
588 /* save plugin output */
589 if(perl_plugin_output != NULL) {
590 temp_buffer = escape_newlines(perl_plugin_output);
591 dbuf_strcat(&checkresult_dbuf, temp_buffer);
592 my_free(temp_buffer);
593 }
594
595 /* get the check finish time */
596 gettimeofday(&end_time, NULL);
597
598 /* record check result info */
599 check_result_info.exited_ok = FALSE;
600 check_result_info.return_code = pclose_result;
601 check_result_info.finish_time = end_time;
602
603 /* write check result to file */
604 if(check_result_info.output_file_fp) {
605
606 fprintf(check_result_info.output_file_fp, "finish_time=%lu.%lu\n", check_result_info.finish_time.tv_sec, check_result_info.finish_time.tv_usec);
607 fprintf(check_result_info.output_file_fp, "early_timeout=%d\n", check_result_info.early_timeout);
608 fprintf(check_result_info.output_file_fp, "exited_ok=%d\n", check_result_info.exited_ok);
609 fprintf(check_result_info.output_file_fp, "return_code=%d\n", check_result_info.return_code);
610 fprintf(check_result_info.output_file_fp, "output=%s\n", (checkresult_dbuf.buf == NULL) ? "(null)" : checkresult_dbuf.buf);
611
612 /* close the temp file */
613 fclose(check_result_info.output_file_fp);
614
615 /* move check result to queue directory */
616 move_check_result_to_queue(check_result_info.output_file);
617 }
618
619 /* free memory */
620 dbuf_free(&checkresult_dbuf);
621
622 /* free check result memory */
623 free_check_result(&check_result_info);
624
625 return OK;
626 }
627 else {
628
629 plugin_hndlr_cr = newSVsv(POPs);
630
631 log_debug_info(DEBUGL_CHECKS, 1, "Embedded Perl successfully compiled %s and returned code ref to plugin handler\n", fname);
632
633 PUTBACK ;
634 FREETMPS ;
635 LEAVE ;
636 }
637 }
638 #endif
639
640 /* plugin is a C plugin or a Perl plugin _without_ compilation errors */
641
642 /* fork a child process */
643 pid = fork();
644
645 /* an error occurred while trying to fork */
646 if(pid == -1) {
647
648 fork_error = TRUE;
649
650 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The check of service '%s' on host '%s' could not be performed due to a fork() error: '%s'. The check will be rescheduled.\n", svc->description, svc->host_name, strerror(errno));
651
652 log_debug_info(DEBUGL_CHECKS, 0, "Check of service '%s' on host '%s' could not be performed due to a fork() error: '%s'!\n", svc->description, svc->host_name, strerror(errno));
653 }
654
655 /* if we are in the child process... */
656 else if(pid == 0) {
657
658 /* set environment variables */
659 set_all_macro_environment_vars_r(&mac, TRUE);
660
661 /* ADDED 11/12/07 EG */
662 /* close external command file and shut down worker thread */
663 close_command_file();
664
665 /* fork again if we're not in a large installation */
666 if(child_processes_fork_twice == TRUE) {
667
668 /* fork again... */
669 pid = fork();
670
671 /* an error occurred while trying to fork again */
672 if(pid == -1)
673 exit(STATE_UNKNOWN);
674 }
675
676 /* the grandchild (or child if large install tweaks are enabled) process should run the service check... */
677 if(pid == 0 || child_processes_fork_twice == FALSE) {
678
679 /* reset signal handling */
680 reset_sighandler();
681
682 /* become the process group leader */
683 setpgid(0, 0);
684
685 /* exit on term signals at this process level */
686 signal(SIGTERM, SIG_DFL);
687
688 /* catch plugins that don't finish in a timely manner */
689 signal(SIGALRM, service_check_sighandler);
690 alarm(service_check_timeout);
691
692 /* disable rotation of the debug file */
693 max_debug_file_size = 0L;
694
695 /******** BEGIN EMBEDDED PERL INTERPRETER EXECUTION ********/
696 #ifdef EMBEDDEDPERL
697 if(use_epn == TRUE) {
698
699 /* execute our previously compiled script - from call_pv("Embed::Persistent::eval_file",..) */
700 /* NB. args[2] is _now_ a code ref (to the Perl subroutine corresp to the plugin) returned by eval_file() */
701
702 ENTER;
703 SAVETMPS;
704 PUSHMARK(SP);
705
706 XPUSHs(sv_2mortal(newSVpv(args[0], 0)));
707 XPUSHs(sv_2mortal(newSVpv(args[1], 0)));
708 XPUSHs(plugin_hndlr_cr);
709 XPUSHs(sv_2mortal(newSVpv(args[3], 0)));
710
711 PUTBACK;
712
713 count = call_pv("Embed::Persistent::run_package", G_ARRAY);
714
715 SPAGAIN;
716
717 perl_plugin_output = POPpx ;
718 pclose_result = POPi ;
719
720 /* NOTE: 07/16/07 This has to be done before FREETMPS statement below, or the POPpx pointer will be invalid (Hendrik B.) */
721 /* get perl plugin output - escape newlines */
722 if(perl_plugin_output != NULL) {
723 temp_buffer = escape_newlines(perl_plugin_output);
724 dbuf_strcat(&checkresult_dbuf, temp_buffer);
725 my_free(temp_buffer);
726 }
727
728 PUTBACK;
729 FREETMPS;
730 LEAVE;
731
732 log_debug_info(DEBUGL_CHECKS, 1, "Embedded Perl ran %s: return code=%d, plugin output=%s\n", fname, pclose_result, (perl_plugin_output == NULL) ? "NULL" : checkresult_dbuf.buf);
733
734 /* reset the alarm */
735 alarm(0);
736
737 /* get the check finish time */
738 gettimeofday(&end_time, NULL);
739
740 /* record check result info */
741 check_result_info.return_code = pclose_result;
742 check_result_info.finish_time = end_time;
743
744 /* write check result to file */
745 if(check_result_info.output_file_fp) {
746
747 fprintf(check_result_info.output_file_fp, "finish_time=%lu.%lu\n", check_result_info.finish_time.tv_sec, check_result_info.finish_time.tv_usec);
748 fprintf(check_result_info.output_file_fp, "early_timeout=%d\n", check_result_info.early_timeout);
749 fprintf(check_result_info.output_file_fp, "exited_ok=%d\n", check_result_info.exited_ok);
750 fprintf(check_result_info.output_file_fp, "return_code=%d\n", check_result_info.return_code);
751 fprintf(check_result_info.output_file_fp, "output=%s\n", (checkresult_dbuf.buf == NULL) ? "(null)" : checkresult_dbuf.buf);
752
753 /* close the temp file */
754 fclose(check_result_info.output_file_fp);
755
756 /* move check result to queue directory */
757 move_check_result_to_queue(check_result_info.output_file);
758 }
759
760 /* free memory */
761 dbuf_free(&checkresult_dbuf);
762
763 /* free check result memory */
764 free_check_result(&check_result_info);
765
766 /* return with plugin exit status - not really necessary... */
767 _exit(pclose_result);
768 }
769 #endif
770 /******** END EMBEDDED PERL INTERPRETER EXECUTION ********/
771
772
773 /* run the plugin check command */
774 fp = popen(processed_command, "r");
775 if(fp == NULL)
776 _exit(STATE_UNKNOWN);
777
778 /* initialize buffer */
779 strcpy(output_buffer, "");
780
781 /* get all lines of plugin output - escape newlines */
782 while(fgets(output_buffer, sizeof(output_buffer) - 1, fp)) {
783 temp_buffer = escape_newlines(output_buffer);
784 dbuf_strcat(&checkresult_dbuf, temp_buffer);
785 my_free(temp_buffer);
786 }
787
788 /* close the process */
789 pclose_result = pclose(fp);
790
791 /* reset the alarm and ignore SIGALRM */
792 signal(SIGALRM, SIG_IGN);
793 alarm(0);
794
795 /* get the check finish time */
796 gettimeofday(&end_time, NULL);
797
798 /* record check result info */
799 check_result_info.finish_time = end_time;
800 check_result_info.early_timeout = FALSE;
801
802 /* test for execution error */
803 if(pclose_result == -1) {
804 pclose_result = STATE_UNKNOWN;
805 check_result_info.return_code = STATE_CRITICAL;
806 check_result_info.exited_ok = FALSE;
807 }
808 else {
809 if(WEXITSTATUS(pclose_result) == 0 && WIFSIGNALED(pclose_result))
810 check_result_info.return_code = 128 + WTERMSIG(pclose_result);
811 else
812 check_result_info.return_code = WEXITSTATUS(pclose_result);
813 }
814
815 /* write check result to file */
816 if(check_result_info.output_file_fp) {
817 FILE *fp;
818
819 /* avoid races with signal handling */
820 fp = check_result_info.output_file_fp;
821 check_result_info.output_file_fp = NULL;
822
823 fprintf(fp, "finish_time=%lu.%lu\n", check_result_info.finish_time.tv_sec, check_result_info.finish_time.tv_usec);
824 fprintf(fp, "early_timeout=%d\n", check_result_info.early_timeout);
825 fprintf(fp, "exited_ok=%d\n", check_result_info.exited_ok);
826 fprintf(fp, "return_code=%d\n", check_result_info.return_code);
827 fprintf(fp, "output=%s\n", (checkresult_dbuf.buf == NULL) ? "(null)" : checkresult_dbuf.buf);
828
829 /* close the temp file */
830 fclose(fp);
831
832 /* move check result to queue directory */
833 move_check_result_to_queue(check_result_info.output_file);
834 }
835
836 /* free memory */
837 dbuf_free(&checkresult_dbuf);
838 my_free(processed_command);
839
840 /* free check result memory */
841 free_check_result(&check_result_info);
842
843 /* return with plugin exit status - not really necessary... */
844 _exit(pclose_result);
845 }
846
847 /* NOTE: this code is never reached if large install tweaks are enabled... */
848
849 /* unset environment variables */
850 set_all_macro_environment_vars_r(&mac, FALSE);
851
852 /* free allocated memory */
853 /* this needs to be done last, so we don't free memory for variables before they're used above */
854 if(free_child_process_memory == TRUE)
855 free_memory(&mac);
856
857 /* parent exits immediately - grandchild process is inherited by the INIT process, so we have no zombie problem... */
858 _exit(STATE_OK);
859 }
860
861 /* else the parent should wait for the first child to return... */
862 else if(pid > 0) {
863 clear_volatile_macros_r(&mac);
864
865 log_debug_info(DEBUGL_CHECKS, 2, "Service check is executing in child process (pid=%lu)\n", (unsigned long)pid);
866
867 /* parent should close output file */
868 if(check_result_info.output_file_fp)
869 fclose(check_result_info.output_file_fp);
870
871 /* should this be done in first child process (after spawning grandchild) as well? */
872 /* free memory allocated for IPC functionality */
873 free_check_result(&check_result_info);
874
875 /* free memory */
876 my_free(processed_command);
877
878 /* wait for the first child to return */
879 /* don't do this if large install tweaks are enabled - we'll clean up children in event loop */
880 if(child_processes_fork_twice == TRUE)
881 wait_result = waitpid(pid, NULL, 0);
882 }
883
884 /* see if we were able to run the check... */
885 if(fork_error == TRUE)
886 return ERROR;
887
888 return OK;
889 }
890
891
892
893 /* handles asynchronous service check results */
handle_async_service_check_result(service * temp_service,check_result * queued_check_result)894 int handle_async_service_check_result(service *temp_service, check_result *queued_check_result) {
895 host *temp_host = NULL;
896 time_t next_service_check = 0L;
897 time_t preferred_time = 0L;
898 time_t next_valid_time = 0L;
899 int reschedule_check = FALSE;
900 int state_change = FALSE;
901 int hard_state_change = FALSE;
902 int first_host_check_initiated = FALSE;
903 int route_result = HOST_UP;
904 time_t current_time = 0L;
905 int state_was_logged = FALSE;
906 char *old_plugin_output = NULL;
907 char *temp_plugin_output = NULL;
908 char *temp_ptr = NULL;
909 servicedependency *temp_dependency = NULL;
910 objectlist *check_servicelist = NULL;
911 objectlist *servicelist_item = NULL;
912 service *master_service = NULL;
913 int run_async_check = TRUE;
914 int state_changes_use_cached_state = TRUE; /* TODO - 09/23/07 move this to a global variable */
915 int flapping_check_done = FALSE;
916 void *ptr = NULL;
917
918
919 log_debug_info(DEBUGL_FUNCTIONS, 0, "handle_async_service_check_result()\n");
920
921 /* make sure we have what we need */
922 if(temp_service == NULL || queued_check_result == NULL)
923 return ERROR;
924
925 /* get the current time */
926 time(¤t_time);
927
928 log_debug_info(DEBUGL_CHECKS, 0, "** Handling check result for service '%s' on host '%s'...\n", temp_service->description, temp_service->host_name);
929 log_debug_info(DEBUGL_CHECKS, 1, "HOST: %s, SERVICE: %s, CHECK TYPE: %s, OPTIONS: %d, SCHEDULED: %s, RESCHEDULE: %s, EXITED OK: %s, RETURN CODE: %d, OUTPUT: %s\n", temp_service->host_name, temp_service->description, (queued_check_result->check_type == SERVICE_CHECK_ACTIVE) ? "Active" : "Passive", queued_check_result->check_options, (queued_check_result->scheduled_check == TRUE) ? "Yes" : "No", (queued_check_result->reschedule_check == TRUE) ? "Yes" : "No", (queued_check_result->exited_ok == TRUE) ? "Yes" : "No", queued_check_result->return_code, queued_check_result->output);
930
931 /* decrement the number of service checks still out there... */
932 if(queued_check_result->check_type == SERVICE_CHECK_ACTIVE && currently_running_service_checks > 0)
933 currently_running_service_checks--;
934
935 /* skip this service check result if it's passive and we aren't accepting passive check results */
936 if(queued_check_result->check_type == SERVICE_CHECK_PASSIVE) {
937 if(accept_passive_service_checks == FALSE) {
938 log_debug_info(DEBUGL_CHECKS, 0, "Discarding passive service check result because passive service checks are disabled globally.\n");
939 return ERROR;
940 }
941 if(temp_service->accept_passive_service_checks == FALSE) {
942 log_debug_info(DEBUGL_CHECKS, 0, "Discarding passive service check result because passive checks are disabled for this service.\n");
943 return ERROR;
944 }
945 }
946
947 /* clear the freshening flag (it would have been set if this service was determined to be stale) */
948 if(queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK)
949 temp_service->is_being_freshened = FALSE;
950
951 /* clear the execution flag if this was an active check */
952 if(queued_check_result->check_type == SERVICE_CHECK_ACTIVE)
953 temp_service->is_executing = FALSE;
954
955 /* DISCARD INVALID FRESHNESS CHECK RESULTS */
956 /* If a service goes stale, Nagios will initiate a forced check in order to freshen it. There is a race condition whereby a passive check
957 could arrive between the 1) initiation of the forced check and 2) the time when the forced check result is processed here. This would
958 make the service fresh again, so we do a quick check to make sure the service is still stale before we accept the check result. */
959 if((queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK) && is_service_result_fresh(temp_service, current_time, FALSE) == TRUE) {
960 log_debug_info(DEBUGL_CHECKS, 0, "Discarding service freshness check result because the service is currently fresh (race condition avoided).\n");
961 return OK;
962 }
963
964 /* check latency is passed to us */
965 temp_service->latency = queued_check_result->latency;
966
967 /* update the execution time for this check (millisecond resolution) */
968 temp_service->execution_time = (double)((double)(queued_check_result->finish_time.tv_sec - queued_check_result->start_time.tv_sec) + (double)((queued_check_result->finish_time.tv_usec - queued_check_result->start_time.tv_usec) / 1000.0) / 1000.0);
969 if(temp_service->execution_time < 0.0)
970 temp_service->execution_time = 0.0;
971
972 /* get the last check time */
973 temp_service->last_check = queued_check_result->start_time.tv_sec;
974
975 /* was this check passive or active? */
976 temp_service->check_type = (queued_check_result->check_type == SERVICE_CHECK_ACTIVE) ? SERVICE_CHECK_ACTIVE : SERVICE_CHECK_PASSIVE;
977
978 /* update check statistics for passive checks */
979 if(queued_check_result->check_type == SERVICE_CHECK_PASSIVE)
980 update_check_stats(PASSIVE_SERVICE_CHECK_STATS, queued_check_result->start_time.tv_sec);
981
982 /* should we reschedule the next service check? NOTE: This may be overridden later... */
983 reschedule_check = queued_check_result->reschedule_check;
984
985 /* save the old service status info */
986 temp_service->last_state = temp_service->current_state;
987
988 /* save old plugin output */
989 if(temp_service->plugin_output)
990 old_plugin_output = (char *)strdup(temp_service->plugin_output);
991
992 /* clear the old plugin output and perf data buffers */
993 my_free(temp_service->plugin_output);
994 my_free(temp_service->long_plugin_output);
995 my_free(temp_service->perf_data);
996
997 /* if there was some error running the command, just skip it (this shouldn't be happening) */
998 if(queued_check_result->exited_ok == FALSE) {
999
1000 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check of service '%s' on host '%s' did not exit properly!\n", temp_service->description, temp_service->host_name);
1001
1002 temp_service->plugin_output = (char *)strdup("(Service check did not exit properly)");
1003
1004 temp_service->current_state = STATE_CRITICAL;
1005 }
1006
1007 /* make sure the return code is within bounds */
1008 else if(queued_check_result->return_code < 0 || queued_check_result->return_code > 3) {
1009
1010 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Return code of %d for check of service '%s' on host '%s' was out of bounds.%s\n", queued_check_result->return_code, temp_service->description, temp_service->host_name, (queued_check_result->return_code == 126 ? "Make sure the plugin you're trying to run is executable." : (queued_check_result->return_code == 127 ? " Make sure the plugin you're trying to run actually exists." : "")));
1011
1012 asprintf(&temp_plugin_output, "\x73\x6f\x69\x67\x61\x6e\x20\x74\x68\x67\x69\x72\x79\x70\x6f\x63\x20\x6e\x61\x68\x74\x65\x20\x64\x61\x74\x73\x6c\x61\x67");
1013 my_free(temp_plugin_output);
1014 asprintf(&temp_service->plugin_output, "(Return code of %d is out of bounds%s)", queued_check_result->return_code, (queued_check_result->return_code == 126 ? " - plugin may not be executable" : (queued_check_result->return_code == 127 ? " - plugin may be missing" : "")));
1015
1016 temp_service->current_state = STATE_CRITICAL;
1017 }
1018
1019 /* else the return code is okay... */
1020 else {
1021
1022 /* parse check output to get: (1) short output, (2) long output, (3) perf data */
1023 parse_check_output(queued_check_result->output, &temp_service->plugin_output, &temp_service->long_plugin_output, &temp_service->perf_data, TRUE, TRUE);
1024
1025 /* make sure the plugin output isn't null */
1026 if(temp_service->plugin_output == NULL)
1027 temp_service->plugin_output = (char *)strdup("(No output returned from plugin)");
1028
1029 /* replace semicolons in plugin output (but not performance data) with colons */
1030 else if((temp_ptr = temp_service->plugin_output)) {
1031 while((temp_ptr = strchr(temp_ptr, ';')))
1032 * temp_ptr = ':';
1033 }
1034
1035 log_debug_info(DEBUGL_CHECKS, 2, "Parsing check output...\n");
1036 log_debug_info(DEBUGL_CHECKS, 2, "Short Output: %s\n", (temp_service->plugin_output == NULL) ? "NULL" : temp_service->plugin_output);
1037 log_debug_info(DEBUGL_CHECKS, 2, "Long Output: %s\n", (temp_service->long_plugin_output == NULL) ? "NULL" : temp_service->long_plugin_output);
1038 log_debug_info(DEBUGL_CHECKS, 2, "Perf Data: %s\n", (temp_service->perf_data == NULL) ? "NULL" : temp_service->perf_data);
1039
1040 /* grab the return code */
1041 temp_service->current_state = queued_check_result->return_code;
1042 }
1043
1044
1045 /* record the last state time */
1046 switch(temp_service->current_state) {
1047 case STATE_OK:
1048 temp_service->last_time_ok = temp_service->last_check;
1049 break;
1050 case STATE_WARNING:
1051 temp_service->last_time_warning = temp_service->last_check;
1052 break;
1053 case STATE_UNKNOWN:
1054 temp_service->last_time_unknown = temp_service->last_check;
1055 break;
1056 case STATE_CRITICAL:
1057 temp_service->last_time_critical = temp_service->last_check;
1058 break;
1059 default:
1060 break;
1061 }
1062
1063 /* log passive checks - we need to do this here, as some may bypass external commands by getting dropped in checkresults dir */
1064 if(temp_service->check_type == SERVICE_CHECK_PASSIVE) {
1065 if(log_passive_checks == TRUE)
1066 logit(NSLOG_PASSIVE_CHECK, FALSE, "PASSIVE SERVICE CHECK: %s;%s;%d;%s\n", temp_service->host_name, temp_service->description, temp_service->current_state, temp_service->plugin_output);
1067 }
1068
1069 /* get the host that this service runs on */
1070 temp_host = (host *)temp_service->host_ptr;
1071
1072 /* if the service check was okay... */
1073 if(temp_service->current_state == STATE_OK) {
1074
1075 /* if the host has never been checked before, verify its status */
1076 /* only do this if 1) the initial state was set to non-UP or 2) the host is not scheduled to be checked soon (next 5 minutes) */
1077 if(temp_host->has_been_checked == FALSE && (temp_host->initial_state != HOST_UP || (unsigned long)temp_host->next_check == 0L || (unsigned long)(temp_host->next_check - current_time) > 300)) {
1078
1079 /* set a flag to remember that we launched a check */
1080 first_host_check_initiated = TRUE;
1081
1082 /* 08/04/07 EG launch an async (parallel) host check unless aggressive host checking is enabled */
1083 /* previous logic was to simply run a sync (serial) host check */
1084 /* do NOT allow cached check results to happen here - we need the host to be checked for real... */
1085 if(use_aggressive_host_checking == TRUE)
1086 perform_on_demand_host_check(temp_host, NULL, CHECK_OPTION_NONE, FALSE, 0L);
1087 else
1088 run_async_host_check_3x(temp_host, CHECK_OPTION_NONE, 0.0, FALSE, FALSE, NULL, NULL);
1089 }
1090 }
1091
1092
1093 /**** NOTE - THIS WAS MOVED UP FROM LINE 1049 BELOW TO FIX PROBLEMS WHERE CURRENT ATTEMPT VALUE WAS ACTUALLY "LEADING" REAL VALUE ****/
1094 /* increment the current attempt number if this is a soft state (service was rechecked) */
1095 if(temp_service->state_type == SOFT_STATE && (temp_service->current_attempt < temp_service->max_attempts))
1096 temp_service->current_attempt = temp_service->current_attempt + 1;
1097
1098
1099 log_debug_info(DEBUGL_CHECKS, 2, "ST: %s CA: %d MA: %d CS: %d LS: %d LHS: %d\n", (temp_service->state_type == SOFT_STATE) ? "SOFT" : "HARD", temp_service->current_attempt, temp_service->max_attempts, temp_service->current_state, temp_service->last_state, temp_service->last_hard_state);
1100
1101 /* check for a state change (either soft or hard) */
1102 if(temp_service->current_state != temp_service->last_state) {
1103 log_debug_info(DEBUGL_CHECKS, 2, "Service has changed state since last check!\n");
1104 state_change = TRUE;
1105 }
1106
1107 /* checks for a hard state change where host was down at last service check */
1108 /* this occurs in the case where host goes down and service current attempt gets reset to 1 */
1109 /* if this check is not made, the service recovery looks like a soft recovery instead of a hard one */
1110 if(temp_service->host_problem_at_last_check == TRUE && temp_service->current_state == STATE_OK) {
1111 log_debug_info(DEBUGL_CHECKS, 2, "Service had a HARD STATE CHANGE!!\n");
1112 hard_state_change = TRUE;
1113 }
1114
1115 /* check for a "normal" hard state change where max check attempts is reached */
1116 if(temp_service->current_attempt >= temp_service->max_attempts && temp_service->current_state != temp_service->last_hard_state) {
1117 log_debug_info(DEBUGL_CHECKS, 2, "Service had a HARD STATE CHANGE!!\n");
1118 hard_state_change = TRUE;
1119 }
1120
1121 /* a state change occurred... */
1122 /* reset last and next notification times and acknowledgement flag if necessary, misc other stuff */
1123 if(state_change == TRUE || hard_state_change == TRUE) {
1124
1125 /* reschedule the service check */
1126 reschedule_check = TRUE;
1127
1128 /* reset notification times */
1129 temp_service->last_notification = (time_t)0;
1130 temp_service->next_notification = (time_t)0;
1131
1132 /* reset notification suppression option */
1133 temp_service->no_more_notifications = FALSE;
1134
1135 if(temp_service->acknowledgement_type == ACKNOWLEDGEMENT_NORMAL && (state_change == TRUE || hard_state_change == FALSE)) {
1136
1137 temp_service->problem_has_been_acknowledged = FALSE;
1138 temp_service->acknowledgement_type = ACKNOWLEDGEMENT_NONE;
1139
1140 /* remove any non-persistent comments associated with the ack */
1141 delete_service_acknowledgement_comments(temp_service);
1142 }
1143 else if(temp_service->acknowledgement_type == ACKNOWLEDGEMENT_STICKY && temp_service->current_state == STATE_OK) {
1144
1145 temp_service->problem_has_been_acknowledged = FALSE;
1146 temp_service->acknowledgement_type = ACKNOWLEDGEMENT_NONE;
1147
1148 /* remove any non-persistent comments associated with the ack */
1149 delete_service_acknowledgement_comments(temp_service);
1150 }
1151
1152 /* do NOT reset current notification number!!! */
1153 /* hard changes between non-OK states should continue to be escalated, so don't reset current notification number */
1154 /*temp_service->current_notification_number=0;*/
1155 }
1156
1157 /* initialize the last host and service state change times if necessary */
1158 if(temp_service->last_state_change == (time_t)0)
1159 temp_service->last_state_change = temp_service->last_check;
1160 if(temp_service->last_hard_state_change == (time_t)0)
1161 temp_service->last_hard_state_change = temp_service->last_check;
1162 if(temp_host->last_state_change == (time_t)0)
1163 temp_host->last_state_change = temp_service->last_check;
1164 if(temp_host->last_hard_state_change == (time_t)0)
1165 temp_host->last_hard_state_change = temp_service->last_check;
1166
1167 /* update last service state change times */
1168 if(state_change == TRUE)
1169 temp_service->last_state_change = temp_service->last_check;
1170 if(hard_state_change == TRUE)
1171 temp_service->last_hard_state_change = temp_service->last_check;
1172
1173 /* update the event and problem ids */
1174 if(state_change == TRUE) {
1175
1176 /* always update the event id on a state change */
1177 temp_service->last_event_id = temp_service->current_event_id;
1178 temp_service->current_event_id = next_event_id;
1179 next_event_id++;
1180
1181 /* update the problem id when transitioning to a problem state */
1182 if(temp_service->last_state == STATE_OK) {
1183 /* don't reset last problem id, or it will be zero the next time a problem is encountered */
1184 /* temp_service->last_problem_id=temp_service->current_problem_id;*/
1185 temp_service->current_problem_id = next_problem_id;
1186 next_problem_id++;
1187 }
1188
1189 /* clear the problem id when transitioning from a problem state to an OK state */
1190 if(temp_service->current_state == STATE_OK) {
1191 temp_service->last_problem_id = temp_service->current_problem_id;
1192 temp_service->current_problem_id = 0L;
1193 }
1194 }
1195
1196
1197 /**************************************/
1198 /******* SERVICE CHECK OK LOGIC *******/
1199 /**************************************/
1200
1201 /* if the service is up and running OK... */
1202 if(temp_service->current_state == STATE_OK) {
1203
1204 log_debug_info(DEBUGL_CHECKS, 1, "Service is OK.\n");
1205
1206 /* reset the acknowledgement flag (this should already have been done, but just in case...) */
1207 temp_service->problem_has_been_acknowledged = FALSE;
1208 temp_service->acknowledgement_type = ACKNOWLEDGEMENT_NONE;
1209
1210 /* verify the route to the host and send out host recovery notifications */
1211 if(temp_host->current_state != HOST_UP) {
1212
1213 log_debug_info(DEBUGL_CHECKS, 1, "Host is NOT UP, so we'll check it to see if it recovered...\n");
1214
1215 /* 08/04/07 EG launch an async (parallel) host check (possibly cached) unless aggressive host checking is enabled */
1216 /* previous logic was to simply run a sync (serial) host check */
1217 if(use_aggressive_host_checking == TRUE)
1218 perform_on_demand_host_check(temp_host, NULL, CHECK_OPTION_NONE, TRUE, cached_host_check_horizon);
1219 /* 09/23/07 EG don't launch a new host check if we already did so earlier */
1220 else if(first_host_check_initiated == TRUE)
1221 log_debug_info(DEBUGL_CHECKS, 1, "First host check was already initiated, so we'll skip a new host check.\n");
1222 else {
1223 /* can we use the last cached host state? */
1224 /* usually only use cached host state if no service state change has occurred */
1225 if((state_change == FALSE || state_changes_use_cached_state == TRUE) && temp_host->has_been_checked == TRUE && ((current_time - temp_host->last_check) <= cached_host_check_horizon)) {
1226 log_debug_info(DEBUGL_CHECKS, 1, "* Using cached host state: %d\n", temp_host->current_state);
1227 update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS, current_time);
1228 update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS, current_time);
1229 }
1230
1231 /* else launch an async (parallel) check of the host */
1232 else
1233 run_async_host_check_3x(temp_host, CHECK_OPTION_NONE, 0.0, FALSE, FALSE, NULL, NULL);
1234 }
1235 }
1236
1237 /* if a hard service recovery has occurred... */
1238 if(hard_state_change == TRUE) {
1239
1240 log_debug_info(DEBUGL_CHECKS, 1, "Service experienced a HARD RECOVERY.\n");
1241
1242 /* set the state type macro */
1243 temp_service->state_type = HARD_STATE;
1244
1245 /* log the service recovery */
1246 log_service_event(temp_service);
1247 state_was_logged = TRUE;
1248
1249 /* 10/04/07 check to see if the service and/or associate host is flapping */
1250 /* this should be done before a notification is sent out to ensure the host didn't just start flapping */
1251 check_for_service_flapping(temp_service, TRUE, TRUE);
1252 check_for_host_flapping(temp_host, TRUE, FALSE, TRUE);
1253 flapping_check_done = TRUE;
1254
1255 /* notify contacts about the service recovery */
1256 service_notification(temp_service, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);
1257
1258 /* run the service event handler to handle the hard state change */
1259 handle_service_event(temp_service);
1260 }
1261
1262 /* else if a soft service recovery has occurred... */
1263 else if(state_change == TRUE) {
1264
1265 log_debug_info(DEBUGL_CHECKS, 1, "Service experienced a SOFT RECOVERY.\n");
1266
1267 /* this is a soft recovery */
1268 temp_service->state_type = SOFT_STATE;
1269
1270 /* log the soft recovery */
1271 log_service_event(temp_service);
1272 state_was_logged = TRUE;
1273
1274 /* run the service event handler to handle the soft state change */
1275 handle_service_event(temp_service);
1276 }
1277
1278 /* else no service state change has occurred... */
1279 else {
1280 log_debug_info(DEBUGL_CHECKS, 1, "Service did not change state.\n");
1281 }
1282
1283 /* should we obsess over service checks? */
1284 if(obsess_over_services == TRUE)
1285 obsessive_compulsive_service_check_processor(temp_service);
1286
1287 /* reset all service variables because its okay now... */
1288 temp_service->host_problem_at_last_check = FALSE;
1289 temp_service->current_attempt = 1;
1290 temp_service->state_type = HARD_STATE;
1291 temp_service->last_hard_state = STATE_OK;
1292 temp_service->last_notification = (time_t)0;
1293 temp_service->next_notification = (time_t)0;
1294 temp_service->current_notification_number = 0;
1295 temp_service->problem_has_been_acknowledged = FALSE;
1296 temp_service->acknowledgement_type = ACKNOWLEDGEMENT_NONE;
1297 temp_service->notified_on_unknown = FALSE;
1298 temp_service->notified_on_warning = FALSE;
1299 temp_service->notified_on_critical = FALSE;
1300 temp_service->no_more_notifications = FALSE;
1301
1302 if(reschedule_check == TRUE)
1303 next_service_check = (time_t)(temp_service->last_check + (temp_service->check_interval * interval_length));
1304 }
1305
1306
1307 /*******************************************/
1308 /******* SERVICE CHECK PROBLEM LOGIC *******/
1309 /*******************************************/
1310
1311 /* hey, something's not working quite like it should... */
1312 else {
1313
1314 log_debug_info(DEBUGL_CHECKS, 1, "Service is in a non-OK state!\n");
1315
1316 /* check the route to the host if its up right now... */
1317 if(temp_host->current_state == HOST_UP) {
1318
1319 log_debug_info(DEBUGL_CHECKS, 1, "Host is currently UP, so we'll recheck its state to make sure...\n");
1320
1321 /* 08/04/07 EG launch an async (parallel) host check (possibly cached) unless aggressive host checking is enabled */
1322 /* previous logic was to simply run a sync (serial) host check */
1323 if(use_aggressive_host_checking == TRUE)
1324 perform_on_demand_host_check(temp_host, &route_result, CHECK_OPTION_NONE, TRUE, cached_host_check_horizon);
1325 else {
1326 /* can we use the last cached host state? */
1327 /* only use cached host state if no service state change has occurred */
1328 if((state_change == FALSE || state_changes_use_cached_state == TRUE) && temp_host->has_been_checked == TRUE && ((current_time - temp_host->last_check) <= cached_host_check_horizon)) {
1329 /* use current host state as route result */
1330 route_result = temp_host->current_state;
1331 log_debug_info(DEBUGL_CHECKS, 1, "* Using cached host state: %d\n", temp_host->current_state);
1332 update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS, current_time);
1333 update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS, current_time);
1334 }
1335
1336 /* else launch an async (parallel) check of the host */
1337 /* CHANGED 02/15/08 only if service changed state since service was last checked */
1338 else if(state_change == TRUE) {
1339 /* use current host state as route result */
1340 route_result = temp_host->current_state;
1341 run_async_host_check_3x(temp_host, CHECK_OPTION_NONE, 0.0, FALSE, FALSE, NULL, NULL);
1342 }
1343
1344 /* ADDED 02/15/08 */
1345 /* else assume same host state */
1346 else {
1347 route_result = temp_host->current_state;
1348 log_debug_info(DEBUGL_CHECKS, 1, "* Using last known host state: %d\n", temp_host->current_state);
1349 update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS, current_time);
1350 update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS, current_time);
1351 }
1352 }
1353 }
1354
1355 /* else the host is either down or unreachable, so recheck it if necessary */
1356 else {
1357
1358 log_debug_info(DEBUGL_CHECKS, 1, "Host is currently DOWN/UNREACHABLE.\n");
1359
1360 /* we're using aggressive host checking, so really do recheck the host... */
1361 if(use_aggressive_host_checking == TRUE) {
1362 log_debug_info(DEBUGL_CHECKS, 1, "Agressive host checking is enabled, so we'll recheck the host state...\n");
1363 perform_on_demand_host_check(temp_host, &route_result, CHECK_OPTION_NONE, TRUE, cached_host_check_horizon);
1364 }
1365
1366 /* the service wobbled between non-OK states, so check the host... */
1367 else if((state_change == TRUE && state_changes_use_cached_state == FALSE) && temp_service->last_hard_state != STATE_OK) {
1368 log_debug_info(DEBUGL_CHECKS, 1, "Service wobbled between non-OK states, so we'll recheck the host state...\n");
1369 /* 08/04/07 EG launch an async (parallel) host check unless aggressive host checking is enabled */
1370 /* previous logic was to simply run a sync (serial) host check */
1371 /* use current host state as route result */
1372 route_result = temp_host->current_state;
1373 run_async_host_check_3x(temp_host, CHECK_OPTION_NONE, 0.0, FALSE, FALSE, NULL, NULL);
1374 /*perform_on_demand_host_check(temp_host,&route_result,CHECK_OPTION_NONE,TRUE,cached_host_check_horizon);*/
1375 }
1376
1377 /* else fake the host check, but (possibly) resend host notifications to contacts... */
1378 else {
1379
1380 log_debug_info(DEBUGL_CHECKS, 1, "Assuming host is in same state as before...\n");
1381
1382 /* if the host has never been checked before, set the checked flag and last check time */
1383 /* 03/11/06 EG Note: This probably never evaluates to FALSE, present for historical reasons only, can probably be removed in the future */
1384 if(temp_host->has_been_checked == FALSE) {
1385 temp_host->has_been_checked = TRUE;
1386 temp_host->last_check = temp_service->last_check;
1387 }
1388
1389 /* fake the route check result */
1390 route_result = temp_host->current_state;
1391
1392 /* possibly re-send host notifications... */
1393 host_notification(temp_host, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);
1394 }
1395 }
1396
1397 /* if the host is down or unreachable ... */
1398 /* 05/29/2007 NOTE: The host might be in a SOFT problem state due to host check retries/caching. Not sure if we should take that into account and do something different or not... */
1399 if(route_result != HOST_UP) {
1400
1401 log_debug_info(DEBUGL_CHECKS, 2, "Host is not UP, so we mark state changes if appropriate\n");
1402
1403 /* "fake" a hard state change for the service - well, its not really fake, but it didn't get caught earlier... */
1404 if(temp_service->last_hard_state != temp_service->current_state)
1405 hard_state_change = TRUE;
1406
1407 /* update last state change times */
1408 if(state_change == TRUE || hard_state_change == TRUE)
1409 temp_service->last_state_change = temp_service->last_check;
1410 if(hard_state_change == TRUE) {
1411 temp_service->last_hard_state_change = temp_service->last_check;
1412 temp_service->state_type = HARD_STATE;
1413 temp_service->last_hard_state = temp_service->current_state;
1414 }
1415
1416 /* put service into a hard state without attempting check retries and don't send out notifications about it */
1417 temp_service->host_problem_at_last_check = TRUE;
1418 /* Below removed 08/04/2010 EG - http://tracker.nagios.org/view.php?id=128 */
1419 /*
1420 temp_service->state_type=HARD_STATE;
1421 temp_service->last_hard_state=temp_service->current_state;
1422 temp_service->current_attempt=1;
1423 */
1424 }
1425
1426 /* the host is up - it recovered since the last time the service was checked... */
1427 else if(temp_service->host_problem_at_last_check == TRUE) {
1428
1429 /* next time the service is checked we shouldn't get into this same case... */
1430 temp_service->host_problem_at_last_check = FALSE;
1431
1432 /* reset the current check counter, so we give the service a chance */
1433 /* this helps prevent the case where service has N max check attempts, N-1 of which have already occurred. */
1434 /* if we didn't do this, the next check might fail and result in a hard problem - we should really give it more time */
1435 /* ADDED IF STATEMENT 01-17-05 EG */
1436 /* 01-17-05: Services in hard problem states before hosts went down would sometimes come back as soft problem states after */
1437 /* the hosts recovered. This caused problems, so hopefully this will fix it */
1438 if(temp_service->state_type == SOFT_STATE)
1439 temp_service->current_attempt = 1;
1440 }
1441
1442 log_debug_info(DEBUGL_CHECKS, 1, "Current/Max Attempt(s): %d/%d\n", temp_service->current_attempt, temp_service->max_attempts);
1443
1444 /* if we should retry the service check, do so (except if the host is down or unreachable!) */
1445 if(temp_service->current_attempt < temp_service->max_attempts) {
1446
1447 /* the host is down or unreachable, so don't attempt to retry the service check */
1448 if(route_result != HOST_UP) {
1449
1450 log_debug_info(DEBUGL_CHECKS, 1, "Host isn't UP, so we won't retry the service check...\n");
1451
1452 /* the host is not up, so reschedule the next service check at regular interval */
1453 if(reschedule_check == TRUE)
1454 next_service_check = (time_t)(temp_service->last_check + (temp_service->check_interval * interval_length));
1455
1456 /* log the problem as a hard state if the host just went down */
1457 if(hard_state_change == TRUE) {
1458 log_service_event(temp_service);
1459 state_was_logged = TRUE;
1460
1461 /* run the service event handler to handle the hard state */
1462 handle_service_event(temp_service);
1463 }
1464 }
1465
1466 /* the host is up, so continue to retry the service check */
1467 else {
1468
1469 log_debug_info(DEBUGL_CHECKS, 1, "Host is UP, so we'll retry the service check...\n");
1470
1471 /* this is a soft state */
1472 temp_service->state_type = SOFT_STATE;
1473
1474 /* log the service check retry */
1475 log_service_event(temp_service);
1476 state_was_logged = TRUE;
1477
1478 /* run the service event handler to handle the soft state */
1479 handle_service_event(temp_service);
1480
1481 if(reschedule_check == TRUE)
1482 next_service_check = (time_t)(temp_service->last_check + (temp_service->retry_interval * interval_length));
1483 }
1484
1485 /* perform dependency checks on the second to last check of the service */
1486 if(enable_predictive_service_dependency_checks == TRUE && temp_service->current_attempt == (temp_service->max_attempts - 1)) {
1487
1488 log_debug_info(DEBUGL_CHECKS, 1, "Looking for services to check for predictive dependency checks...\n");
1489
1490 /* check services that THIS ONE depends on for notification AND execution */
1491 /* we do this because we might be sending out a notification soon and we want the dependency logic to be accurate */
1492 for(temp_dependency = get_first_servicedependency_by_dependent_service(temp_service->host_name, temp_service->description, &ptr); temp_dependency != NULL; temp_dependency = get_next_servicedependency_by_dependent_service(temp_service->host_name, temp_service->description, &ptr)) {
1493 if(temp_dependency->dependent_service_ptr == temp_service && temp_dependency->master_service_ptr != NULL) {
1494 master_service = (service *)temp_dependency->master_service_ptr;
1495 log_debug_info(DEBUGL_CHECKS, 2, "Predictive check of service '%s' on host '%s' queued.\n", master_service->description, master_service->host_name);
1496 add_object_to_objectlist(&check_servicelist, (void *)master_service);
1497 }
1498 }
1499 }
1500 }
1501
1502
1503 /* we've reached the maximum number of service rechecks, so handle the error */
1504 else {
1505
1506 log_debug_info(DEBUGL_CHECKS, 1, "Service has reached max number of rechecks, so we'll handle the error...\n");
1507
1508 /* this is a hard state */
1509 temp_service->state_type = HARD_STATE;
1510
1511 /* if we've had a hard state change... */
1512 if(hard_state_change == TRUE) {
1513
1514 /* log the service problem (even if host is not up, which is new in 0.0.5) */
1515 log_service_event(temp_service);
1516 state_was_logged = TRUE;
1517 }
1518
1519 /* else log the problem (again) if this service is flagged as being volatile */
1520 else if(temp_service->is_volatile == TRUE) {
1521 log_service_event(temp_service);
1522 state_was_logged = TRUE;
1523 }
1524
1525 /* check for start of flexible (non-fixed) scheduled downtime if we just had a hard error */
1526 /* we need to check for both, state_change (SOFT) and hard_state_change (HARD) values */
1527 if((hard_state_change == TRUE || state_change == TRUE) && temp_service->pending_flex_downtime > 0)
1528 check_pending_flex_service_downtime(temp_service);
1529
1530 /* 10/04/07 check to see if the service and/or associate host is flapping */
1531 /* this should be done before a notification is sent out to ensure the host didn't just start flapping */
1532 check_for_service_flapping(temp_service, TRUE, TRUE);
1533 check_for_host_flapping(temp_host, TRUE, FALSE, TRUE);
1534 flapping_check_done = TRUE;
1535
1536 /* (re)send notifications out about this service problem if the host is up (and was at last check also) and the dependencies were okay... */
1537 service_notification(temp_service, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);
1538
1539 /* run the service event handler if we changed state from the last hard state or if this service is flagged as being volatile */
1540 if(hard_state_change == TRUE || temp_service->is_volatile == TRUE)
1541 handle_service_event(temp_service);
1542
1543 /* save the last hard state */
1544 temp_service->last_hard_state = temp_service->current_state;
1545
1546 /* reschedule the next check at the regular interval */
1547 if(reschedule_check == TRUE)
1548 next_service_check = (time_t)(temp_service->last_check + (temp_service->check_interval * interval_length));
1549 }
1550
1551
/* should we obsess over service checks? */
1553 if(obsess_over_services == TRUE)
1554 obsessive_compulsive_service_check_processor(temp_service);
1555 }
1556
1557 /* reschedule the next service check ONLY for active, scheduled checks */
1558 if(reschedule_check == TRUE) {
1559
1560 log_debug_info(DEBUGL_CHECKS, 1, "Rescheduling next check of service at %s", ctime(&next_service_check));
1561
1562 /* default is to reschedule service check unless a test below fails... */
1563 temp_service->should_be_scheduled = TRUE;
1564
1565 /* next check time was calculated above */
1566 temp_service->next_check = next_service_check;
1567
1568 /* make sure we don't get ourselves into too much trouble... */
1569 if(current_time > temp_service->next_check)
1570 temp_service->next_check = current_time;
1571
1572 /* make sure we rescheduled the next service check at a valid time */
1573 preferred_time = temp_service->next_check;
1574 get_next_valid_time(preferred_time, &next_valid_time, temp_service->check_period_ptr);
1575 temp_service->next_check = next_valid_time;
1576
1577 /* services with non-recurring intervals do not get rescheduled */
1578 if(temp_service->check_interval == 0)
1579 temp_service->should_be_scheduled = FALSE;
1580
1581 /* services with active checks disabled do not get rescheduled */
1582 if(temp_service->checks_enabled == FALSE)
1583 temp_service->should_be_scheduled = FALSE;
1584
1585 /* schedule a non-forced check if we can */
1586 if(temp_service->should_be_scheduled == TRUE)
1587 schedule_service_check(temp_service, temp_service->next_check, CHECK_OPTION_NONE);
1588 }
1589
1590 /* if we're stalking this state type and state was not already logged AND the plugin output changed since last check, log it now.. */
1591 if(temp_service->state_type == HARD_STATE && state_change == FALSE && state_was_logged == FALSE && compare_strings(old_plugin_output, temp_service->plugin_output)) {
1592
1593 if((temp_service->current_state == STATE_OK && temp_service->stalk_on_ok == TRUE))
1594 log_service_event(temp_service);
1595
1596 else if((temp_service->current_state == STATE_WARNING && temp_service->stalk_on_warning == TRUE))
1597 log_service_event(temp_service);
1598
1599 else if((temp_service->current_state == STATE_UNKNOWN && temp_service->stalk_on_unknown == TRUE))
1600 log_service_event(temp_service);
1601
1602 else if((temp_service->current_state == STATE_CRITICAL && temp_service->stalk_on_critical == TRUE))
1603 log_service_event(temp_service);
1604 }
1605
1606 #ifdef USE_EVENT_BROKER
1607 /* send data to event broker */
1608 broker_service_check(NEBTYPE_SERVICECHECK_PROCESSED, NEBFLAG_NONE, NEBATTR_NONE, temp_service, temp_service->check_type, queued_check_result->start_time, queued_check_result->finish_time, NULL, temp_service->latency, temp_service->execution_time, service_check_timeout, queued_check_result->early_timeout, queued_check_result->return_code, NULL, NULL);
1609 #endif
1610
1611 /* set the checked flag */
1612 temp_service->has_been_checked = TRUE;
1613
1614 /* update the current service status log */
1615 update_service_status(temp_service, FALSE);
1616
1617 /* check to see if the service and/or associate host is flapping */
1618 if(flapping_check_done == FALSE) {
1619 check_for_service_flapping(temp_service, TRUE, TRUE);
1620 check_for_host_flapping(temp_host, TRUE, FALSE, TRUE);
1621 }
1622
1623 /* update service performance info */
1624 update_service_performance_data(temp_service);
1625
1626 /* free allocated memory */
1627 my_free(temp_plugin_output);
1628 my_free(old_plugin_output);
1629
1630
1631 /* run async checks of all services we added above */
1632 /* don't run a check if one is already executing or we can get by with a cached state */
1633 for(servicelist_item = check_servicelist; servicelist_item != NULL; servicelist_item = servicelist_item->next) {
1634 run_async_check = TRUE;
1635 temp_service = (service *)servicelist_item->object_ptr;
1636
1637 /* we can get by with a cached state, so don't check the service */
1638 if((current_time - temp_service->last_check) <= cached_service_check_horizon) {
1639 run_async_check = FALSE;
1640
1641 /* update check statistics */
1642 update_check_stats(ACTIVE_CACHED_SERVICE_CHECK_STATS, current_time);
1643 }
1644
1645 if(temp_service->is_executing == TRUE)
1646 run_async_check = FALSE;
1647
1648 if(run_async_check == TRUE)
1649 run_async_service_check(temp_service, CHECK_OPTION_NONE, 0.0, FALSE, FALSE, NULL, NULL);
1650 }
1651 free_objectlist(&check_servicelist);
1652
1653 return OK;
1654 }
1655
1656
1657
1658 /* schedules an immediate or delayed service check */
schedule_service_check(service * svc,time_t check_time,int options)1659 void schedule_service_check(service *svc, time_t check_time, int options) {
1660 timed_event *temp_event = NULL;
1661 timed_event *new_event = NULL;
1662 int use_original_event = TRUE;
1663
1664 log_debug_info(DEBUGL_FUNCTIONS, 0, "schedule_service_check()\n");
1665
1666 if(svc == NULL)
1667 return;
1668
1669 log_debug_info(DEBUGL_CHECKS, 0, "Scheduling a %s, active check of service '%s' on host '%s' @ %s", (options & CHECK_OPTION_FORCE_EXECUTION) ? "forced" : "non-forced", svc->description, svc->host_name, ctime(&check_time));
1670
1671 /* don't schedule a check if active checks of this service are disabled */
1672 if(svc->checks_enabled == FALSE && !(options & CHECK_OPTION_FORCE_EXECUTION)) {
1673 log_debug_info(DEBUGL_CHECKS, 0, "Active checks of this service are disabled.\n");
1674 return;
1675 }
1676
1677 /* default is to use the new event */
1678 use_original_event = FALSE;
1679
1680 temp_event = (timed_event *)svc->next_check_event;
1681
1682 /*
1683 * If the service already has a check scheduled,
1684 * we need to decide which of the events to use
1685 */
1686 if(temp_event != NULL) {
1687
1688 log_debug_info(DEBUGL_CHECKS, 2, "Found another service check event for this service @ %s", ctime(&temp_event->run_time));
1689
1690 /* use the originally scheduled check unless we decide otherwise */
1691 use_original_event = TRUE;
1692
1693 /* the original event is a forced check... */
1694 if((temp_event->event_options & CHECK_OPTION_FORCE_EXECUTION)) {
1695
1696 /* the new event is also forced and its execution time is earlier than the original, so use it instead */
1697 if((options & CHECK_OPTION_FORCE_EXECUTION) && (check_time < temp_event->run_time)) {
1698 use_original_event = FALSE;
1699 log_debug_info(DEBUGL_CHECKS, 2, "New service check event is forced and occurs before the existing event, so the new event will be used instead.\n");
1700 }
1701 }
1702
1703 /* the original event is not a forced check... */
1704 else {
1705
1706 /* the new event is a forced check, so use it instead */
1707 if((options & CHECK_OPTION_FORCE_EXECUTION)) {
1708 use_original_event = FALSE;
1709 log_debug_info(DEBUGL_CHECKS, 2, "New service check event is forced, so it will be used instead of the existing event.\n");
1710 }
1711
1712 /* the new event is not forced either and its execution time is earlier than the original, so use it instead */
1713 else if(check_time < temp_event->run_time) {
1714 use_original_event = FALSE;
1715 log_debug_info(DEBUGL_CHECKS, 2, "New service check event occurs before the existing (older) event, so it will be used instead.\n");
1716 }
1717
1718 /* the new event is older, so override the existing one */
1719 else {
1720 log_debug_info(DEBUGL_CHECKS, 2, "New service check event occurs after the existing event, so we'll ignore it.\n");
1721 }
1722 }
1723 }
1724
1725 /* schedule a new event */
1726 if(use_original_event == FALSE) {
1727
1728 /* allocate memory for a new event item */
1729 new_event = (timed_event *)malloc(sizeof(timed_event));
1730 if(new_event == NULL) {
1731 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Could not reschedule check of service '%s' on host '%s'!\n", svc->description, svc->host_name);
1732 return;
1733 }
1734
1735 /* make sure we kill off the old event */
1736 if(temp_event) {
1737 remove_event(temp_event, &event_list_low, &event_list_low_tail);
1738 my_free(temp_event);
1739 }
1740 log_debug_info(DEBUGL_CHECKS, 2, "Scheduling new service check event.\n");
1741
1742 /* set the next service check event and time */
1743 svc->next_check_event = new_event;
1744 svc->next_check = check_time;
1745
1746 /* save check options for retention purposes */
1747 svc->check_options = options;
1748
1749 /* place the new event in the event queue */
1750 new_event->event_type = EVENT_SERVICE_CHECK;
1751 new_event->event_data = (void *)svc;
1752 new_event->event_args = (void *)NULL;
1753 new_event->event_options = options;
1754 new_event->run_time = svc->next_check;
1755 new_event->recurring = FALSE;
1756 new_event->event_interval = 0L;
1757 new_event->timing_func = NULL;
1758 new_event->compensate_for_time_change = TRUE;
1759 reschedule_event(new_event, &event_list_low, &event_list_low_tail);
1760 }
1761
1762 else {
1763 /* reset the next check time (it may be out of sync) */
1764 if(temp_event != NULL)
1765 svc->next_check = temp_event->run_time;
1766
1767 log_debug_info(DEBUGL_CHECKS, 2, "Keeping original service check event (ignoring the new one).\n");
1768 }
1769
1770
1771 /* update the status log */
1772 update_service_status(svc, FALSE);
1773
1774 return;
1775 }
1776
1777
1778
1779 /* checks viability of performing a service check */
check_service_check_viability(service * svc,int check_options,int * time_is_valid,time_t * new_time)1780 int check_service_check_viability(service *svc, int check_options, int *time_is_valid, time_t *new_time) {
1781 int result = OK;
1782 int perform_check = TRUE;
1783 time_t current_time = 0L;
1784 time_t preferred_time = 0L;
1785 int check_interval = 0;
1786
1787 log_debug_info(DEBUGL_FUNCTIONS, 0, "check_service_check_viability()\n");
1788
1789 /* make sure we have a service */
1790 if(svc == NULL)
1791 return ERROR;
1792
1793 /* get the check interval to use if we need to reschedule the check */
1794 if(svc->state_type == SOFT_STATE && svc->current_state != STATE_OK)
1795 check_interval = (svc->retry_interval * interval_length);
1796 else
1797 check_interval = (svc->check_interval * interval_length);
1798
1799 /* get the current time */
1800 time(¤t_time);
1801
1802 /* initialize the next preferred check time */
1803 preferred_time = current_time;
1804
1805 /* can we check the host right now? */
1806 if(!(check_options & CHECK_OPTION_FORCE_EXECUTION)) {
1807
1808 /* if checks of the service are currently disabled... */
1809 if(svc->checks_enabled == FALSE) {
1810 preferred_time = current_time + check_interval;
1811 perform_check = FALSE;
1812
1813 log_debug_info(DEBUGL_CHECKS, 2, "Active checks of the service are currently disabled.\n");
1814 }
1815
1816 /* make sure this is a valid time to check the service */
1817 if(check_time_against_period((unsigned long)current_time, svc->check_period_ptr) == ERROR) {
1818 preferred_time = current_time;
1819 if(time_is_valid)
1820 *time_is_valid = FALSE;
1821 perform_check = FALSE;
1822
1823 log_debug_info(DEBUGL_CHECKS, 2, "This is not a valid time for this service to be actively checked.\n");
1824 }
1825
1826 /* check service dependencies for execution */
1827 if(check_service_dependencies(svc, EXECUTION_DEPENDENCY) == DEPENDENCIES_FAILED) {
1828 preferred_time = current_time + check_interval;
1829 perform_check = FALSE;
1830
1831 log_debug_info(DEBUGL_CHECKS, 2, "Execution dependencies for this service failed, so it will not be actively checked.\n");
1832 }
1833 }
1834
1835 /* pass back the next viable check time */
1836 if(new_time)
1837 *new_time = preferred_time;
1838
1839 result = (perform_check == TRUE) ? OK : ERROR;
1840
1841 return result;
1842 }
1843
1844
1845
1846 /* checks service dependencies */
check_service_dependencies(service * svc,int dependency_type)1847 int check_service_dependencies(service *svc, int dependency_type) {
1848 servicedependency *temp_dependency = NULL;
1849 service *temp_service = NULL;
1850 int state = STATE_OK;
1851 time_t current_time = 0L;
1852 void *ptr = NULL;
1853
1854
1855 log_debug_info(DEBUGL_FUNCTIONS, 0, "check_service_dependencies()\n");
1856
1857 /* check all dependencies... */
1858 for(temp_dependency = get_first_servicedependency_by_dependent_service(svc->host_name, svc->description, &ptr); temp_dependency != NULL; temp_dependency = get_next_servicedependency_by_dependent_service(svc->host_name, svc->description, &ptr)) {
1859
1860 /* only check dependencies of the desired type (notification or execution) */
1861 if(temp_dependency->dependency_type != dependency_type)
1862 continue;
1863
1864 /* find the service we depend on... */
1865 if((temp_service = temp_dependency->master_service_ptr) == NULL)
1866 continue;
1867
1868 /* skip this dependency if it has a timeperiod and the current time isn't valid */
1869 time(¤t_time);
1870 if(temp_dependency->dependency_period != NULL && check_time_against_period(current_time, temp_dependency->dependency_period_ptr) == ERROR)
1871 return FALSE;
1872
1873 /* get the status to use (use last hard state if its currently in a soft state) */
1874 if(temp_service->state_type == SOFT_STATE && soft_state_dependencies == FALSE)
1875 state = temp_service->last_hard_state;
1876 else
1877 state = temp_service->current_state;
1878
1879 /* is the service we depend on in state that fails the dependency tests? */
1880 if(state == STATE_OK && temp_dependency->fail_on_ok == TRUE)
1881 return DEPENDENCIES_FAILED;
1882 if(state == STATE_WARNING && temp_dependency->fail_on_warning == TRUE)
1883 return DEPENDENCIES_FAILED;
1884 if(state == STATE_UNKNOWN && temp_dependency->fail_on_unknown == TRUE)
1885 return DEPENDENCIES_FAILED;
1886 if(state == STATE_CRITICAL && temp_dependency->fail_on_critical == TRUE)
1887 return DEPENDENCIES_FAILED;
1888 if((state == STATE_OK && temp_service->has_been_checked == FALSE) && temp_dependency->fail_on_pending == TRUE)
1889 return DEPENDENCIES_FAILED;
1890
1891 /* immediate dependencies ok at this point - check parent dependencies if necessary */
1892 if(temp_dependency->inherits_parent == TRUE) {
1893 if(check_service_dependencies(temp_service, dependency_type) != DEPENDENCIES_OK)
1894 return DEPENDENCIES_FAILED;
1895 }
1896 }
1897
1898 return DEPENDENCIES_OK;
1899 }
1900
1901
1902
1903 /* check for services that never returned from a check... */
check_for_orphaned_services(void)1904 void check_for_orphaned_services(void) {
1905 service *temp_service = NULL;
1906 time_t current_time = 0L;
1907 time_t expected_time = 0L;
1908
1909
1910 log_debug_info(DEBUGL_FUNCTIONS, 0, "check_for_orphaned_services()\n");
1911
1912 /* get the current time */
1913 time(¤t_time);
1914
1915 /* check all services... */
1916 for(temp_service = service_list; temp_service != NULL; temp_service = temp_service->next) {
1917
1918 /* skip services that are not currently executing */
1919 if(temp_service->is_executing == FALSE)
1920 continue;
1921
1922 /* determine the time at which the check results should have come in (allow 10 minutes slack time) */
1923 expected_time = (time_t)(temp_service->next_check + temp_service->latency + service_check_timeout + check_reaper_interval + 600);
1924
1925 /* this service was supposed to have executed a while ago, but for some reason the results haven't come back in... */
1926 if(expected_time < current_time) {
1927
1928 /* log a warning */
1929 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The check of service '%s' on host '%s' looks like it was orphaned (results never came back). I'm scheduling an immediate check of the service...\n", temp_service->description, temp_service->host_name);
1930
1931 log_debug_info(DEBUGL_CHECKS, 1, "Service '%s' on host '%s' was orphaned, so we're scheduling an immediate check...\n", temp_service->description, temp_service->host_name);
1932
1933 /* decrement the number of running service checks */
1934 if(currently_running_service_checks > 0)
1935 currently_running_service_checks--;
1936
1937 /* disable the executing flag */
1938 temp_service->is_executing = FALSE;
1939
1940 /* schedule an immediate check of the service */
1941 schedule_service_check(temp_service, current_time, CHECK_OPTION_ORPHAN_CHECK);
1942 }
1943
1944 }
1945
1946 return;
1947 }
1948
1949
1950
1951 /* check freshness of service results */
check_service_result_freshness(void)1952 void check_service_result_freshness(void) {
1953 service *temp_service = NULL;
1954 time_t current_time = 0L;
1955
1956
1957 log_debug_info(DEBUGL_FUNCTIONS, 0, "check_service_result_freshness()\n");
1958 log_debug_info(DEBUGL_CHECKS, 1, "Checking the freshness of service check results...\n");
1959
1960 /* bail out if we're not supposed to be checking freshness */
1961 if(check_service_freshness == FALSE) {
1962 log_debug_info(DEBUGL_CHECKS, 1, "Service freshness checking is disabled.\n");
1963 return;
1964 }
1965
1966 /* get the current time */
1967 time(¤t_time);
1968
1969 /* check all services... */
1970 for(temp_service = service_list; temp_service != NULL; temp_service = temp_service->next) {
1971
1972 /* skip services we shouldn't be checking for freshness */
1973 if(temp_service->check_freshness == FALSE)
1974 continue;
1975
1976 /* skip services that are currently executing (problems here will be caught by orphaned service check) */
1977 if(temp_service->is_executing == TRUE)
1978 continue;
1979
1980 /* skip services that have both active and passive checks disabled */
1981 if(temp_service->checks_enabled == FALSE && temp_service->accept_passive_service_checks == FALSE)
1982 continue;
1983
1984 /* skip services that are already being freshened */
1985 if(temp_service->is_being_freshened == TRUE)
1986 continue;
1987
1988 /* see if the time is right... */
1989 if(check_time_against_period(current_time, temp_service->check_period_ptr) == ERROR)
1990 continue;
1991
1992 /* EXCEPTION */
1993 /* don't check freshness of services without regular check intervals if we're using auto-freshness threshold */
1994 if(temp_service->check_interval == 0 && temp_service->freshness_threshold == 0)
1995 continue;
1996
1997 /* the results for the last check of this service are stale! */
1998 if(is_service_result_fresh(temp_service, current_time, TRUE) == FALSE) {
1999
2000 /* set the freshen flag */
2001 temp_service->is_being_freshened = TRUE;
2002
2003 /* schedule an immediate forced check of the service */
2004 schedule_service_check(temp_service, current_time, CHECK_OPTION_FORCE_EXECUTION | CHECK_OPTION_FRESHNESS_CHECK);
2005 }
2006
2007 }
2008
2009 return;
2010 }
2011
2012
2013
2014 /* tests whether or not a service's check results are fresh */
is_service_result_fresh(service * temp_service,time_t current_time,int log_this)2015 int is_service_result_fresh(service *temp_service, time_t current_time, int log_this) {
2016 int freshness_threshold = 0;
2017 time_t expiration_time = 0L;
2018 int days = 0;
2019 int hours = 0;
2020 int minutes = 0;
2021 int seconds = 0;
2022 int tdays = 0;
2023 int thours = 0;
2024 int tminutes = 0;
2025 int tseconds = 0;
2026
2027 log_debug_info(DEBUGL_CHECKS, 2, "Checking freshness of service '%s' on host '%s'...\n", temp_service->description, temp_service->host_name);
2028
2029 /* use user-supplied freshness threshold or auto-calculate a freshness threshold to use? */
2030 if(temp_service->freshness_threshold == 0) {
2031 if(temp_service->state_type == HARD_STATE || temp_service->current_state == STATE_OK)
2032 freshness_threshold = (temp_service->check_interval * interval_length) + temp_service->latency + additional_freshness_latency;
2033 else
2034 freshness_threshold = (temp_service->retry_interval * interval_length) + temp_service->latency + additional_freshness_latency;
2035 }
2036 else
2037 freshness_threshold = temp_service->freshness_threshold;
2038
2039 log_debug_info(DEBUGL_CHECKS, 2, "Freshness thresholds: service=%d, use=%d\n", temp_service->freshness_threshold, freshness_threshold);
2040
2041 /* calculate expiration time */
2042 /*
2043 * CHANGED 11/10/05 EG -
2044 * program start is only used in expiration time calculation
2045 * if > last check AND active checks are enabled, so active checks
2046 * can become stale immediately upon program startup
2047 */
2048 /*
2049 * CHANGED 02/25/06 SG -
2050 * passive checks also become stale, so remove dependence on active
2051 * check logic
2052 */
2053 if(temp_service->has_been_checked == FALSE)
2054 expiration_time = (time_t)(event_start + freshness_threshold);
2055 /*
2056 * CHANGED 06/19/07 EG -
2057 * Per Ton's suggestion (and user requests), only use program start
2058 * time over last check if no specific threshold has been set by user.
2059 * Problems can occur if Nagios is restarted more frequently that
2060 * freshness threshold intervals (services never go stale).
2061 */
2062 /*
2063 * CHANGED 10/07/07 EG:
2064 * Only match next condition for services that
2065 * have active checks enabled...
2066 */
2067 /*
2068 * CHANGED 10/07/07 EG:
2069 * Added max_service_check_spread to expiration time as suggested
2070 * by Altinity
2071 */
2072 else if(temp_service->checks_enabled == TRUE && event_start > temp_service->last_check && temp_service->freshness_threshold == 0)
2073 expiration_time = (time_t)(event_start + freshness_threshold + (max_service_check_spread * interval_length));
2074 else
2075 expiration_time = (time_t)(temp_service->last_check + freshness_threshold);
2076
2077 /*
2078 * If the check was last done passively, we assume it's going
2079 * to continue that way and we need to handle the fact that
2080 * Nagios might have been shut off for quite a long time. If so,
2081 * we mustn't spam freshness notifications but use event_start
2082 * instead of last_check to determine freshness expiration time.
2083 * The threshold for "long time" is determined as 61.8% of the normal
2084 * freshness threshold based on vast heuristical research (ie, "some
2085 * guy once told me the golden ratio is good for loads of stuff").
2086 */
2087 if(temp_service->check_type == SERVICE_CHECK_PASSIVE) {
2088 if(temp_service->last_check < event_start &&
2089 event_start - last_program_stop > freshness_threshold * 0.618) {
2090 expiration_time = event_start + freshness_threshold;
2091 }
2092 }
2093 log_debug_info(DEBUGL_CHECKS, 2, "HBC: %d, PS: %lu, ES: %lu, LC: %lu, CT: %lu, ET: %lu\n", temp_service->has_been_checked, (unsigned long)program_start, (unsigned long)event_start, (unsigned long)temp_service->last_check, (unsigned long)current_time, (unsigned long)expiration_time);
2094
2095 /* the results for the last check of this service are stale */
2096 if(expiration_time < current_time) {
2097
2098 get_time_breakdown((current_time - expiration_time), &days, &hours, &minutes, &seconds);
2099 get_time_breakdown(freshness_threshold, &tdays, &thours, &tminutes, &tseconds);
2100
2101 /* log a warning */
2102 if(log_this == TRUE)
2103 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The results of service '%s' on host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds). I'm forcing an immediate check of the service.\n", temp_service->description, temp_service->host_name, days, hours, minutes, seconds, tdays, thours, tminutes, tseconds);
2104
2105 log_debug_info(DEBUGL_CHECKS, 1, "Check results for service '%s' on host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds). Forcing an immediate check of the service...\n", temp_service->description, temp_service->host_name, days, hours, minutes, seconds, tdays, thours, tminutes, tseconds);
2106
2107 return FALSE;
2108 }
2109
2110 log_debug_info(DEBUGL_CHECKS, 1, "Check results for service '%s' on host '%s' are fresh.\n", temp_service->description, temp_service->host_name);
2111
2112 return TRUE;
2113 }
2114
2115
2116
2117
2118 /******************************************************************/
2119 /*************** COMMON ROUTE/HOST CHECK FUNCTIONS ****************/
2120 /******************************************************************/
2121
2122 /* execute an on-demand check */
perform_on_demand_host_check(host * hst,int * check_return_code,int check_options,int use_cached_result,unsigned long check_timestamp_horizon)2123 int perform_on_demand_host_check(host *hst, int *check_return_code, int check_options, int use_cached_result, unsigned long check_timestamp_horizon) {
2124
2125 log_debug_info(DEBUGL_FUNCTIONS, 0, "perform_on_demand_host_check()\n");
2126
2127 perform_on_demand_host_check_3x(hst, check_return_code, check_options, use_cached_result, check_timestamp_horizon);
2128
2129 return OK;
2130 }
2131
2132
2133
2134 /* execute a scheduled host check using either the 2.x or 3.x logic */
perform_scheduled_host_check(host * hst,int check_options,double latency)2135 int perform_scheduled_host_check(host *hst, int check_options, double latency) {
2136
2137 log_debug_info(DEBUGL_FUNCTIONS, 0, "perform_scheduled_host_check()\n");
2138
2139 run_scheduled_host_check_3x(hst, check_options, latency);
2140
2141 return OK;
2142 }
2143
2144
2145
2146 /* schedules an immediate or delayed host check */
schedule_host_check(host * hst,time_t check_time,int options)2147 void schedule_host_check(host *hst, time_t check_time, int options) {
2148 timed_event *temp_event = NULL;
2149 timed_event *new_event = NULL;
2150 int use_original_event = TRUE;
2151
2152
2153 log_debug_info(DEBUGL_FUNCTIONS, 0, "schedule_host_check()\n");
2154
2155 if(hst == NULL)
2156 return;
2157
2158 log_debug_info(DEBUGL_CHECKS, 0, "Scheduling a %s, active check of host '%s' @ %s", (options & CHECK_OPTION_FORCE_EXECUTION) ? "forced" : "non-forced", hst->name, ctime(&check_time));
2159
2160 /* don't schedule a check if active checks of this host are disabled */
2161 if(hst->checks_enabled == FALSE && !(options & CHECK_OPTION_FORCE_EXECUTION)) {
2162 log_debug_info(DEBUGL_CHECKS, 0, "Active checks are disabled for this host.\n");
2163 return;
2164 }
2165
2166 /* default is to use the new event */
2167 use_original_event = FALSE;
2168
2169 temp_event = (timed_event *)hst->next_check_event;
2170
2171 /*
2172 * If the host already had a check scheduled we need
2173 * to decide which check event to use
2174 */
2175 if(temp_event != NULL) {
2176
2177 log_debug_info(DEBUGL_CHECKS, 2, "Found another host check event for this host @ %s", ctime(&temp_event->run_time));
2178
2179 /* use the originally scheduled check unless we decide otherwise */
2180 use_original_event = TRUE;
2181
2182 /* the original event is a forced check... */
2183 if((temp_event->event_options & CHECK_OPTION_FORCE_EXECUTION)) {
2184
2185 /* the new event is also forced and its execution time is earlier than the original, so use it instead */
2186 if((options & CHECK_OPTION_FORCE_EXECUTION) && (check_time < temp_event->run_time)) {
2187 log_debug_info(DEBUGL_CHECKS, 2, "New host check event is forced and occurs before the existing event, so the new event be used instead.\n");
2188 use_original_event = FALSE;
2189 }
2190 }
2191
2192 /* the original event is not a forced check... */
2193 else {
2194
2195 /* the new event is a forced check, so use it instead */
2196 if((options & CHECK_OPTION_FORCE_EXECUTION)) {
2197 use_original_event = FALSE;
2198 log_debug_info(DEBUGL_CHECKS, 2, "New host check event is forced, so it will be used instead of the existing event.\n");
2199 }
2200
2201 /* the new event is not forced either and its execution time is earlier than the original, so use it instead */
2202 else if(check_time < temp_event->run_time) {
2203 use_original_event = FALSE;
2204 log_debug_info(DEBUGL_CHECKS, 2, "New host check event occurs before the existing (older) event, so it will be used instead.\n");
2205 }
2206
2207 /* the new event is older, so override the existing one */
2208 else {
2209 log_debug_info(DEBUGL_CHECKS, 2, "New host check event occurs after the existing event, so we'll ignore it.\n");
2210 }
2211 }
2212 }
2213
2214 /* use the new event */
2215 if(use_original_event == FALSE) {
2216
2217 log_debug_info(DEBUGL_CHECKS, 2, "Scheduling new host check event.\n");
2218
2219 /* allocate memory for a new event item */
2220 if((new_event = (timed_event *)malloc(sizeof(timed_event))) == NULL) {
2221 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Could not reschedule check of host '%s'!\n", hst->name);
2222 return;
2223 }
2224
2225 if(temp_event) {
2226 remove_event(temp_event, &event_list_low, &event_list_low_tail);
2227 my_free(temp_event);
2228 }
2229
2230 /* set the next host check event and time */
2231 hst->next_check_event = new_event;
2232 hst->next_check = check_time;
2233
2234 /* save check options for retention purposes */
2235 hst->check_options = options;
2236
2237 /* place the new event in the event queue */
2238 new_event->event_type = EVENT_HOST_CHECK;
2239 new_event->event_data = (void *)hst;
2240 new_event->event_args = (void *)NULL;
2241 new_event->event_options = options;
2242 new_event->run_time = hst->next_check;
2243 new_event->recurring = FALSE;
2244 new_event->event_interval = 0L;
2245 new_event->timing_func = NULL;
2246 new_event->compensate_for_time_change = TRUE;
2247 reschedule_event(new_event, &event_list_low, &event_list_low_tail);
2248 }
2249
2250 else {
2251 /* reset the next check time (it may be out of sync) */
2252 if(temp_event != NULL)
2253 hst->next_check = temp_event->run_time;
2254
2255 log_debug_info(DEBUGL_CHECKS, 2, "Keeping original host check event (ignoring the new one).\n");
2256 }
2257
2258 /* update the status log */
2259 update_host_status(hst, FALSE);
2260
2261 return;
2262 }
2263
2264
2265
2266 /* checks host dependencies */
check_host_dependencies(host * hst,int dependency_type)2267 int check_host_dependencies(host *hst, int dependency_type) {
2268 hostdependency *temp_dependency = NULL;
2269 host *temp_host = NULL;
2270 int state = HOST_UP;
2271 time_t current_time = 0L;
2272 void *ptr = NULL;
2273
2274
2275 log_debug_info(DEBUGL_FUNCTIONS, 0, "check_host_dependencies()\n");
2276
2277 /* check all dependencies... */
2278 for(temp_dependency = get_first_hostdependency_by_dependent_host(hst->name, &ptr); temp_dependency != NULL; temp_dependency = get_next_hostdependency_by_dependent_host(hst->name, &ptr)) {
2279
2280 /* only check dependencies of the desired type (notification or execution) */
2281 if(temp_dependency->dependency_type != dependency_type)
2282 continue;
2283
2284 /* find the host we depend on... */
2285 if((temp_host = temp_dependency->master_host_ptr) == NULL)
2286 continue;
2287
2288 /* skip this dependency if it has a timeperiod and the current time isn't valid */
2289 time(¤t_time);
2290 if(temp_dependency->dependency_period != NULL && check_time_against_period(current_time, temp_dependency->dependency_period_ptr) == ERROR)
2291 return FALSE;
2292
2293 /* get the status to use (use last hard state if its currently in a soft state) */
2294 if(temp_host->state_type == SOFT_STATE && soft_state_dependencies == FALSE)
2295 state = temp_host->last_hard_state;
2296 else
2297 state = temp_host->current_state;
2298
2299 /* is the host we depend on in state that fails the dependency tests? */
2300 if(state == HOST_UP && temp_dependency->fail_on_up == TRUE)
2301 return DEPENDENCIES_FAILED;
2302 if(state == HOST_DOWN && temp_dependency->fail_on_down == TRUE)
2303 return DEPENDENCIES_FAILED;
2304 if(state == HOST_UNREACHABLE && temp_dependency->fail_on_unreachable == TRUE)
2305 return DEPENDENCIES_FAILED;
2306 if((state == HOST_UP && temp_host->has_been_checked == FALSE) && temp_dependency->fail_on_pending == TRUE)
2307 return DEPENDENCIES_FAILED;
2308
2309 /* immediate dependencies ok at this point - check parent dependencies if necessary */
2310 if(temp_dependency->inherits_parent == TRUE) {
2311 if(check_host_dependencies(temp_host, dependency_type) != DEPENDENCIES_OK)
2312 return DEPENDENCIES_FAILED;
2313 }
2314 }
2315
2316 return DEPENDENCIES_OK;
2317 }
2318
2319
2320
2321 /* check for hosts that never returned from a check... */
check_for_orphaned_hosts(void)2322 void check_for_orphaned_hosts(void) {
2323 host *temp_host = NULL;
2324 time_t current_time = 0L;
2325 time_t expected_time = 0L;
2326
2327
2328 log_debug_info(DEBUGL_FUNCTIONS, 0, "check_for_orphaned_hosts()\n");
2329
2330 /* get the current time */
2331 time(¤t_time);
2332
2333 /* check all hosts... */
2334 for(temp_host = host_list; temp_host != NULL; temp_host = temp_host->next) {
2335
2336 /* skip hosts that don't have a set check interval (on-demand checks are missed by the orphan logic) */
2337 if(temp_host->next_check == (time_t)0L)
2338 continue;
2339
2340 /* skip hosts that are not currently executing */
2341 if(temp_host->is_executing == FALSE)
2342 continue;
2343
2344 /* determine the time at which the check results should have come in (allow 10 minutes slack time) */
2345 expected_time = (time_t)(temp_host->next_check + temp_host->latency + host_check_timeout + check_reaper_interval + 600);
2346
2347 /* this host was supposed to have executed a while ago, but for some reason the results haven't come back in... */
2348 if(expected_time < current_time) {
2349
2350 /* log a warning */
2351 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The check of host '%s' looks like it was orphaned (results never came back). I'm scheduling an immediate check of the host...\n", temp_host->name);
2352
2353 log_debug_info(DEBUGL_CHECKS, 1, "Host '%s' was orphaned, so we're scheduling an immediate check...\n", temp_host->name);
2354
2355 /* decrement the number of running host checks */
2356 if(currently_running_host_checks > 0)
2357 currently_running_host_checks--;
2358
2359 /* disable the executing flag */
2360 temp_host->is_executing = FALSE;
2361
2362 /* schedule an immediate check of the host */
2363 schedule_host_check(temp_host, current_time, CHECK_OPTION_ORPHAN_CHECK);
2364 }
2365
2366 }
2367
2368 return;
2369 }
2370
2371
2372
2373 /* check freshness of host results */
check_host_result_freshness(void)2374 void check_host_result_freshness(void) {
2375 host *temp_host = NULL;
2376 time_t current_time = 0L;
2377
2378
2379 log_debug_info(DEBUGL_FUNCTIONS, 0, "check_host_result_freshness()\n");
2380 log_debug_info(DEBUGL_CHECKS, 2, "Attempting to check the freshness of host check results...\n");
2381
2382 /* bail out if we're not supposed to be checking freshness */
2383 if(check_host_freshness == FALSE) {
2384 log_debug_info(DEBUGL_CHECKS, 2, "Host freshness checking is disabled.\n");
2385 return;
2386 }
2387
2388 /* get the current time */
2389 time(¤t_time);
2390
2391 /* check all hosts... */
2392 for(temp_host = host_list; temp_host != NULL; temp_host = temp_host->next) {
2393
2394 /* skip hosts we shouldn't be checking for freshness */
2395 if(temp_host->check_freshness == FALSE)
2396 continue;
2397
2398 /* skip hosts that have both active and passive checks disabled */
2399 if(temp_host->checks_enabled == FALSE && temp_host->accept_passive_host_checks == FALSE)
2400 continue;
2401
2402 /* skip hosts that are currently executing (problems here will be caught by orphaned host check) */
2403 if(temp_host->is_executing == TRUE)
2404 continue;
2405
2406 /* skip hosts that are already being freshened */
2407 if(temp_host->is_being_freshened == TRUE)
2408 continue;
2409
2410 /* see if the time is right... */
2411 if(check_time_against_period(current_time, temp_host->check_period_ptr) == ERROR)
2412 continue;
2413
2414 /* the results for the last check of this host are stale */
2415 if(is_host_result_fresh(temp_host, current_time, TRUE) == FALSE) {
2416
2417 /* set the freshen flag */
2418 temp_host->is_being_freshened = TRUE;
2419
2420 /* schedule an immediate forced check of the host */
2421 schedule_host_check(temp_host, current_time, CHECK_OPTION_FORCE_EXECUTION | CHECK_OPTION_FRESHNESS_CHECK);
2422 }
2423 }
2424
2425 return;
2426 }
2427
2428
2429
2430 /* checks to see if a hosts's check results are fresh */
is_host_result_fresh(host * temp_host,time_t current_time,int log_this)2431 int is_host_result_fresh(host *temp_host, time_t current_time, int log_this) {
2432 time_t expiration_time = 0L;
2433 int freshness_threshold = 0;
2434 int days = 0;
2435 int hours = 0;
2436 int minutes = 0;
2437 int seconds = 0;
2438 int tdays = 0;
2439 int thours = 0;
2440 int tminutes = 0;
2441 int tseconds = 0;
2442 double interval = 0;
2443
2444 log_debug_info(DEBUGL_CHECKS, 2, "Checking freshness of host '%s'...\n", temp_host->name);
2445
2446 /* use user-supplied freshness threshold or auto-calculate a freshness threshold to use? */
2447 if(temp_host->freshness_threshold == 0) {
2448 if(temp_host->state_type == HARD_STATE || temp_host->current_state == STATE_OK) {
2449 interval = temp_host->check_interval;
2450 }
2451 else {
2452 interval = temp_host->retry_interval;
2453 }
2454 freshness_threshold = (interval * interval_length) + temp_host->latency + additional_freshness_latency;
2455 }
2456 else
2457 freshness_threshold = temp_host->freshness_threshold;
2458
2459 log_debug_info(DEBUGL_CHECKS, 2, "Freshness thresholds: host=%d, use=%d\n", temp_host->freshness_threshold, freshness_threshold);
2460
2461 /* calculate expiration time */
2462 /*
2463 * CHANGED 11/10/05 EG:
2464 * program start is only used in expiration time calculation
2465 * if > last check AND active checks are enabled, so active checks
2466 * can become stale immediately upon program startup
2467 */
2468 if(temp_host->has_been_checked == FALSE)
2469 expiration_time = (time_t)(event_start + freshness_threshold);
2470 /*
2471 * CHANGED 06/19/07 EG:
2472 * Per Ton's suggestion (and user requests), only use program start
2473 * time over last check if no specific threshold has been set by user.
2474 * Problems can occur if Nagios is restarted more frequently that
2475 * freshness threshold intervals (hosts never go stale).
2476 */
2477 /*
2478 * CHANGED 10/07/07 EG:
2479 * Added max_host_check_spread to expiration time as suggested by
2480 * Altinity
2481 */
2482 else if(temp_host->checks_enabled == TRUE && event_start > temp_host->last_check && temp_host->freshness_threshold == 0)
2483 expiration_time = (time_t)(event_start + freshness_threshold + (max_host_check_spread * interval_length));
2484 else
2485 expiration_time = (time_t)(temp_host->last_check + freshness_threshold);
2486
2487 /*
2488 * If the check was last done passively, we assume it's going
2489 * to continue that way and we need to handle the fact that
2490 * Nagios might have been shut off for quite a long time. If so,
2491 * we mustn't spam freshness notifications but use event_start
2492 * instead of last_check to determine freshness expiration time.
2493 * The threshold for "long time" is determined as 61.8% of the normal
2494 * freshness threshold based on vast heuristical research (ie, "some
2495 * guy once told me the golden ratio is good for loads of stuff").
2496 */
2497 if(temp_host->check_type == HOST_CHECK_PASSIVE) {
2498 if(temp_host->last_check < event_start &&
2499 event_start - last_program_stop > freshness_threshold * 0.618) {
2500 expiration_time = event_start + freshness_threshold;
2501 }
2502 }
2503
2504 log_debug_info(DEBUGL_CHECKS, 2, "HBC: %d, PS: %lu, ES: %lu, LC: %lu, CT: %lu, ET: %lu\n", temp_host->has_been_checked, (unsigned long)program_start, (unsigned long)event_start, (unsigned long)temp_host->last_check, (unsigned long)current_time, (unsigned long)expiration_time);
2505
2506 /* the results for the last check of this host are stale */
2507 if(expiration_time < current_time) {
2508
2509 get_time_breakdown((current_time - expiration_time), &days, &hours, &minutes, &seconds);
2510 get_time_breakdown(freshness_threshold, &tdays, &thours, &tminutes, &tseconds);
2511
2512 /* log a warning */
2513 if(log_this == TRUE)
2514 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The results of host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds). I'm forcing an immediate check of the host.\n", temp_host->name, days, hours, minutes, seconds, tdays, thours, tminutes, tseconds);
2515
2516 log_debug_info(DEBUGL_CHECKS, 1, "Check results for host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds). Forcing an immediate check of the host...\n", temp_host->name, days, hours, minutes, seconds, tdays, thours, tminutes, tseconds);
2517
2518 return FALSE;
2519 }
2520 else
2521 log_debug_info(DEBUGL_CHECKS, 1, "Check results for host '%s' are fresh.\n", temp_host->name);
2522
2523 return TRUE;
2524 }
2525
2526
2527
2528 /******************************************************************/
2529 /************* NAGIOS 3.X ROUTE/HOST CHECK FUNCTIONS **************/
2530 /******************************************************************/
2531
2532
2533 /*** ON-DEMAND HOST CHECKS USE THIS FUNCTION ***/
2534 /* check to see if we can reach the host */
perform_on_demand_host_check_3x(host * hst,int * check_result_code,int check_options,int use_cached_result,unsigned long check_timestamp_horizon)2535 int perform_on_demand_host_check_3x(host *hst, int *check_result_code, int check_options, int use_cached_result, unsigned long check_timestamp_horizon) {
2536 int result = OK;
2537
2538 log_debug_info(DEBUGL_FUNCTIONS, 0, "perform_on_demand_host_check_3x()\n");
2539
2540 /* make sure we have a host */
2541 if(hst == NULL)
2542 return ERROR;
2543
2544 log_debug_info(DEBUGL_CHECKS, 0, "** On-demand check for host '%s'...\n", hst->name);
2545
2546 /* check the status of the host */
2547 result = run_sync_host_check_3x(hst, check_result_code, check_options, use_cached_result, check_timestamp_horizon);
2548
2549 return result;
2550 }
2551
2552
2553
2554 /* perform a synchronous check of a host */
2555 /* on-demand host checks will use this... */
run_sync_host_check_3x(host * hst,int * check_result_code,int check_options,int use_cached_result,unsigned long check_timestamp_horizon)2556 int run_sync_host_check_3x(host *hst, int *check_result_code, int check_options, int use_cached_result, unsigned long check_timestamp_horizon) {
2557 int result = OK;
2558 time_t current_time = 0L;
2559 int host_result = HOST_UP;
2560 char *old_plugin_output = NULL;
2561 struct timeval start_time;
2562 struct timeval end_time;
2563
2564
2565 log_debug_info(DEBUGL_FUNCTIONS, 0, "run_sync_host_check_3x()\n");
2566
2567 /* make sure we have a host */
2568 if(hst == NULL)
2569 return ERROR;
2570
2571 log_debug_info(DEBUGL_CHECKS, 0, "** Run sync check of host '%s'...\n", hst->name);
2572
2573 /* is the host check viable at this time? */
2574 /* if not, return current state and bail out */
2575 if(check_host_check_viability_3x(hst, check_options, NULL, NULL) == ERROR) {
2576 if(check_result_code)
2577 *check_result_code = hst->current_state;
2578 log_debug_info(DEBUGL_CHECKS, 0, "Host check is not viable at this time.\n");
2579 return OK;
2580 }
2581
2582 /* get the current time */
2583 time(¤t_time);
2584
2585 /* high resolution start time for event broker */
2586 gettimeofday(&start_time, NULL);
2587
2588 /* can we use the last cached host state? */
2589 if(use_cached_result == TRUE && !(check_options & CHECK_OPTION_FORCE_EXECUTION)) {
2590
2591 /* we can used the cached result, so return it and get out of here... */
2592 if(hst->has_been_checked == TRUE && ((current_time - hst->last_check) <= check_timestamp_horizon)) {
2593 if(check_result_code)
2594 *check_result_code = hst->current_state;
2595
2596 log_debug_info(DEBUGL_CHECKS, 1, "* Using cached host state: %d\n", hst->current_state);
2597
2598 /* update check statistics */
2599 update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS, current_time);
2600 update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS, current_time);
2601
2602 return OK;
2603 }
2604 }
2605
2606
2607 log_debug_info(DEBUGL_CHECKS, 1, "* Running actual host check: old state=%d\n", hst->current_state);
2608
2609
2610 /******** GOOD TO GO FOR A REAL HOST CHECK AT THIS POINT ********/
2611
2612 /* update check statistics */
2613 update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS, current_time);
2614 update_check_stats(SERIAL_HOST_CHECK_STATS, start_time.tv_sec);
2615
2616 /* reset host check latency, since on-demand checks have none */
2617 hst->latency = 0.0;
2618
2619 /* adjust host check attempt */
2620 adjust_host_check_attempt_3x(hst, TRUE);
2621
2622 /* save old host state */
2623 hst->last_state = hst->current_state;
2624 if(hst->state_type == HARD_STATE)
2625 hst->last_hard_state = hst->current_state;
2626
2627 /* save old plugin output for state stalking */
2628 if(hst->plugin_output)
2629 old_plugin_output = (char *)strdup(hst->plugin_output);
2630
2631 /* set the checked flag */
2632 hst->has_been_checked = TRUE;
2633
2634 /* clear the freshness flag */
2635 hst->is_being_freshened = FALSE;
2636
2637 /* clear check options - we don't want old check options retained */
2638 hst->check_options = CHECK_OPTION_NONE;
2639
2640 /* set the check type */
2641 hst->check_type = HOST_CHECK_ACTIVE;
2642
2643
2644 /*********** EXECUTE THE CHECK AND PROCESS THE RESULTS **********/
2645
2646 #ifdef USE_EVENT_BROKER
2647 /* send data to event broker */
2648 end_time.tv_sec = 0L;
2649 end_time.tv_usec = 0L;
2650 broker_host_check(NEBTYPE_HOSTCHECK_INITIATE, NEBFLAG_NONE, NEBATTR_NONE, hst, HOST_CHECK_ACTIVE, hst->current_state, hst->state_type, start_time, end_time, hst->host_check_command, hst->latency, 0.0, host_check_timeout, FALSE, 0, NULL, NULL, NULL, NULL, NULL);
2651 #endif
2652
2653 /* execute the host check */
2654 host_result = execute_sync_host_check_3x(hst);
2655
2656 /* process the host check result */
2657 process_host_check_result_3x(hst, host_result, old_plugin_output, check_options, FALSE, use_cached_result, check_timestamp_horizon);
2658
2659 /* free memory */
2660 my_free(old_plugin_output);
2661
2662 log_debug_info(DEBUGL_CHECKS, 1, "* Sync host check done: new state=%d\n", hst->current_state);
2663
2664 /* high resolution end time for event broker */
2665 gettimeofday(&end_time, NULL);
2666
2667 #ifdef USE_EVENT_BROKER
2668 /* send data to event broker */
2669 broker_host_check(NEBTYPE_HOSTCHECK_PROCESSED, NEBFLAG_NONE, NEBATTR_NONE, hst, HOST_CHECK_ACTIVE, hst->current_state, hst->state_type, start_time, end_time, hst->host_check_command, hst->latency, hst->execution_time, host_check_timeout, FALSE, hst->current_state, NULL, hst->plugin_output, hst->long_plugin_output, hst->perf_data, NULL);
2670 #endif
2671
2672 return result;
2673 }
2674
2675
2676
2677 /* run an "alive" check on a host */
2678 /* on-demand host checks will use this... */
execute_sync_host_check_3x(host * hst)2679 int execute_sync_host_check_3x(host *hst) {
2680 nagios_macros mac;
2681 int result = STATE_OK;
2682 int return_result = HOST_UP;
2683 char *processed_command = NULL;
2684 char *raw_command = NULL;
2685 struct timeval start_time;
2686 struct timeval end_time;
2687 char *temp_ptr;
2688 int early_timeout = FALSE;
2689 double exectime;
2690 char *temp_plugin_output = NULL;
2691 #ifdef USE_EVENT_BROKER
2692 int neb_result = OK;
2693 #endif
2694
2695
2696 log_debug_info(DEBUGL_FUNCTIONS, 0, "execute_sync_host_check_3x()\n");
2697
2698 if(hst == NULL)
2699 return HOST_DOWN;
2700
2701 log_debug_info(DEBUGL_CHECKS, 0, "** Executing sync check of host '%s'...\n", hst->name);
2702
2703 #ifdef USE_EVENT_BROKER
2704 /* initialize start/end times */
2705 start_time.tv_sec = 0L;
2706 start_time.tv_usec = 0L;
2707 end_time.tv_sec = 0L;
2708 end_time.tv_usec = 0L;
2709
2710 /* send data to event broker */
2711 neb_result = broker_host_check(NEBTYPE_HOSTCHECK_SYNC_PRECHECK, NEBFLAG_NONE, NEBATTR_NONE, hst, HOST_CHECK_ACTIVE, hst->current_state, hst->state_type, start_time, end_time, hst->host_check_command, hst->latency, 0.0, host_check_timeout, FALSE, 0, NULL, NULL, NULL, NULL, NULL);
2712
2713 /* neb module wants to cancel the host check - return the current state of the host */
2714 if(neb_result == NEBERROR_CALLBACKCANCEL)
2715 return hst->current_state;
2716
2717 /* neb module wants to override the host check - perhaps it will check the host itself */
2718 /* NOTE: if a module does this, it must check the status of the host and populate the data structures BEFORE it returns from the callback! */
2719 if(neb_result == NEBERROR_CALLBACKOVERRIDE)
2720 return hst->current_state;
2721 #endif
2722
2723 /* grab the host macros */
2724 memset(&mac, 0, sizeof(mac));
2725 grab_host_macros_r(&mac, hst);
2726
2727 /* high resolution start time for event broker */
2728 gettimeofday(&start_time, NULL);
2729
2730 /* get the last host check time */
2731 time(&hst->last_check);
2732
2733 /* get the raw command line */
2734 get_raw_command_line_r(&mac, hst->check_command_ptr, hst->host_check_command, &raw_command, 0);
2735 if(raw_command == NULL) {
2736 clear_volatile_macros_r(&mac);
2737 return ERROR;
2738 }
2739
2740 /* process any macros contained in the argument */
2741 process_macros_r(&mac, raw_command, &processed_command, 0);
2742 if(processed_command == NULL) {
2743 my_free(raw_command);
2744 clear_volatile_macros_r(&mac);
2745 return ERROR;
2746 }
2747
2748 #ifdef USE_EVENT_BROKER
2749 /* send data to event broker */
2750 end_time.tv_sec = 0L;
2751 end_time.tv_usec = 0L;
2752 broker_host_check(NEBTYPE_HOSTCHECK_RAW_START, NEBFLAG_NONE, NEBATTR_NONE, hst, HOST_CHECK_ACTIVE, return_result, hst->state_type, start_time, end_time, hst->host_check_command, 0.0, 0.0, host_check_timeout, early_timeout, result, processed_command, hst->plugin_output, hst->long_plugin_output, hst->perf_data, NULL);
2753 #endif
2754
2755 log_debug_info(DEBUGL_COMMANDS, 1, "Raw host check command: %s\n", raw_command);
2756 log_debug_info(DEBUGL_COMMANDS, 0, "Processed host check ommand: %s\n", processed_command);
2757 my_free(raw_command);
2758
2759 /* clear plugin output and performance data buffers */
2760 my_free(hst->plugin_output);
2761 my_free(hst->long_plugin_output);
2762 my_free(hst->perf_data);
2763
2764 /* run the host check command */
2765 result = my_system_r(&mac, processed_command, host_check_timeout, &early_timeout, &exectime, &temp_plugin_output, MAX_PLUGIN_OUTPUT_LENGTH);
2766 clear_volatile_macros_r(&mac);
2767
2768 /* if the check timed out, report an error */
2769 if(early_timeout == TRUE) {
2770
2771 my_free(temp_plugin_output);
2772 asprintf(&temp_plugin_output, "Host check timed out after %d seconds\n", host_check_timeout);
2773
2774 /* log the timeout */
2775 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Host check command '%s' for host '%s' timed out after %d seconds\n", processed_command, hst->name, host_check_timeout);
2776 }
2777
2778 /* calculate total execution time */
2779 hst->execution_time = exectime;
2780
2781 /* record check type */
2782 hst->check_type = HOST_CHECK_ACTIVE;
2783
2784 /* parse the output: short and long output, and perf data */
2785 parse_check_output(temp_plugin_output, &hst->plugin_output, &hst->long_plugin_output, &hst->perf_data, TRUE, TRUE);
2786
2787 /* free memory */
2788 my_free(temp_plugin_output);
2789 my_free(processed_command);
2790
2791 /* a NULL host check command means we should assume the host is UP */
2792 if(hst->host_check_command == NULL) {
2793 my_free(hst->plugin_output);
2794 hst->plugin_output = (char *)strdup("(Host assumed to be UP)");
2795 result = STATE_OK;
2796 }
2797
2798 /* make sure we have some data */
2799 if(hst->plugin_output == NULL || !strcmp(hst->plugin_output, "")) {
2800 my_free(hst->plugin_output);
2801 hst->plugin_output = (char *)strdup("(No output returned from host check)");
2802 }
2803
2804 /* replace semicolons in plugin output (but not performance data) with colons */
2805 if((temp_ptr = hst->plugin_output)) {
2806 while((temp_ptr = strchr(temp_ptr, ';')))
2807 * temp_ptr = ':';
2808 }
2809
2810 /* if we're not doing aggressive host checking, let WARNING states indicate the host is up (fake the result to be STATE_OK) */
2811 if(use_aggressive_host_checking == FALSE && result == STATE_WARNING)
2812 result = STATE_OK;
2813
2814
2815 if(result == STATE_OK)
2816 return_result = HOST_UP;
2817 else
2818 return_result = HOST_DOWN;
2819
2820 /* high resolution end time for event broker */
2821 gettimeofday(&end_time, NULL);
2822
2823 #ifdef USE_EVENT_BROKER
2824 /* send data to event broker */
2825 broker_host_check(NEBTYPE_HOSTCHECK_RAW_END, NEBFLAG_NONE, NEBATTR_NONE, hst, HOST_CHECK_ACTIVE, return_result, hst->state_type, start_time, end_time, hst->host_check_command, 0.0, exectime, host_check_timeout, early_timeout, result, processed_command, hst->plugin_output, hst->long_plugin_output, hst->perf_data, NULL);
2826 #endif
2827
2828 log_debug_info(DEBUGL_CHECKS, 0, "** Sync host check done: state=%d\n", return_result);
2829
2830 return return_result;
2831 }
2832
2833
2834
2835 /* run a scheduled host check asynchronously */
run_scheduled_host_check_3x(host * hst,int check_options,double latency)2836 int run_scheduled_host_check_3x(host *hst, int check_options, double latency) {
2837 int result = OK;
2838 time_t current_time = 0L;
2839 time_t preferred_time = 0L;
2840 time_t next_valid_time = 0L;
2841 int time_is_valid = TRUE;
2842
2843
2844 log_debug_info(DEBUGL_FUNCTIONS, 0, "run_scheduled_host_check_3x()\n");
2845
2846 if(hst == NULL)
2847 return ERROR;
2848
2849 log_debug_info(DEBUGL_CHECKS, 0, "Attempting to run scheduled check of host '%s': check options=%d, latency=%lf\n", hst->name, check_options, latency);
2850
2851 /*
2852 * reset the next_check_event so we know this host
2853 * check is no longer in the scheduling queue
2854 */
2855 hst->next_check_event = NULL;
2856
2857 /* attempt to run the check */
2858 result = run_async_host_check_3x(hst, check_options, latency, TRUE, TRUE, &time_is_valid, &preferred_time);
2859
2860 /* an error occurred, so reschedule the check */
2861 if(result == ERROR) {
2862
2863 log_debug_info(DEBUGL_CHECKS, 1, "Unable to run scheduled host check at this time\n");
2864
2865 /* only attempt to (re)schedule checks that should get checked... */
2866 if(hst->should_be_scheduled == TRUE) {
2867
2868 /* get current time */
2869 time(¤t_time);
2870
2871 /* determine next time we should check the host if needed */
2872 /* if host has no check interval, schedule it again for 5 minutes from now */
2873 if(current_time >= preferred_time)
2874 preferred_time = current_time + ((hst->check_interval <= 0) ? 300 : (hst->check_interval * interval_length));
2875
2876 /* make sure we rescheduled the next host check at a valid time */
2877 get_next_valid_time(preferred_time, &next_valid_time, hst->check_period_ptr);
2878
2879 /* the host could not be rescheduled properly - set the next check time for next week */
2880 if(time_is_valid == FALSE && next_valid_time == preferred_time) {
2881
2882 /*
2883 hst->next_check=(time_t)(next_valid_time+(60*60*24*365));
2884 hst->should_be_scheduled=FALSE;
2885 */
2886
2887 hst->next_check = (time_t)(next_valid_time + (60 * 60 * 24 * 7));
2888
2889 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check of host '%s' could not be rescheduled properly. Scheduling check for next week...\n", hst->name);
2890
2891 log_debug_info(DEBUGL_CHECKS, 1, "Unable to find any valid times to reschedule the next host check!\n");
2892 }
2893
2894 /* this service could be rescheduled... */
2895 else {
2896 hst->next_check = next_valid_time;
2897 hst->should_be_scheduled = TRUE;
2898
2899 log_debug_info(DEBUGL_CHECKS, 1, "Rescheduled next host check for %s", ctime(&next_valid_time));
2900 }
2901 }
2902
2903 /* update the status log */
2904 update_host_status(hst, FALSE);
2905
2906 /* reschedule the next host check - unless we couldn't find a valid next check time */
2907 /* 10/19/07 EG - keep original check options */
2908 if(hst->should_be_scheduled == TRUE)
2909 schedule_host_check(hst, hst->next_check, check_options);
2910
2911 return ERROR;
2912 }
2913
2914 return OK;
2915 }
2916
2917
2918
2919 /* perform an asynchronous check of a host */
2920 /* scheduled host checks will use this, as will some checks that result from on-demand checks... */
run_async_host_check_3x(host * hst,int check_options,double latency,int scheduled_check,int reschedule_check,int * time_is_valid,time_t * preferred_time)2921 int run_async_host_check_3x(host *hst, int check_options, double latency, int scheduled_check, int reschedule_check, int *time_is_valid, time_t *preferred_time) {
2922 nagios_macros mac;
2923 char *raw_command = NULL;
2924 char *processed_command = NULL;
2925 char output_buffer[MAX_INPUT_BUFFER] = "";
2926 char *temp_buffer = NULL;
2927 struct timeval start_time, end_time;
2928 pid_t pid = 0;
2929 int fork_error = FALSE;
2930 int wait_result = 0;
2931 FILE *fp = NULL;
2932 int pclose_result = 0;
2933 mode_t new_umask = 077;
2934 mode_t old_umask;
2935 char *output_file = NULL;
2936 double old_latency = 0.0;
2937 dbuf checkresult_dbuf;
2938 int dbuf_chunk = 1024;
2939 #ifdef USE_EVENT_BROKER
2940 int neb_result = OK;
2941 #endif
2942
2943 log_debug_info(DEBUGL_FUNCTIONS, 0, "run_async_host_check_3x()\n");
2944
2945 /* make sure we have a host */
2946 if(hst == NULL)
2947 return ERROR;
2948
2949 log_debug_info(DEBUGL_CHECKS, 0, "** Running async check of host '%s'...\n", hst->name);
2950
2951 /* is the host check viable at this time? */
2952 if(check_host_check_viability_3x(hst, check_options, time_is_valid, preferred_time) == ERROR)
2953 return ERROR;
2954
2955 /* 08/04/07 EG don't execute a new host check if one is already running */
2956 if(hst->is_executing == TRUE && !(check_options & CHECK_OPTION_FORCE_EXECUTION)) {
2957 log_debug_info(DEBUGL_CHECKS, 1, "A check of this host is already being executed, so we'll pass for the moment...\n");
2958 return ERROR;
2959 }
2960
2961 /******** GOOD TO GO FOR A REAL HOST CHECK AT THIS POINT ********/
2962
2963 #ifdef USE_EVENT_BROKER
2964 /* initialize start/end times */
2965 start_time.tv_sec = 0L;
2966 start_time.tv_usec = 0L;
2967 end_time.tv_sec = 0L;
2968 end_time.tv_usec = 0L;
2969
2970 /* send data to event broker */
2971 neb_result = broker_host_check(NEBTYPE_HOSTCHECK_ASYNC_PRECHECK, NEBFLAG_NONE, NEBATTR_NONE, hst, HOST_CHECK_ACTIVE, hst->current_state, hst->state_type, start_time, end_time, hst->host_check_command, hst->latency, 0.0, host_check_timeout, FALSE, 0, NULL, NULL, NULL, NULL, NULL);
2972
2973 /* neb module wants to cancel the host check - the check will be rescheduled for a later time by the scheduling logic */
2974 if(neb_result == NEBERROR_CALLBACKCANCEL)
2975 return ERROR;
2976
2977 /* neb module wants to override the host check - perhaps it will check the host itself */
2978 /* NOTE: if a module does this, it has to do a lot of the stuff found below to make sure things don't get whacked out of shape! */
2979 if(neb_result == NEBERROR_CALLBACKOVERRIDE)
2980 return OK;
2981 #endif
2982
2983 log_debug_info(DEBUGL_CHECKS, 0, "Checking host '%s'...\n", hst->name);
2984
2985 /* clear check options - we don't want old check options retained */
2986 /* only clear options if this was a scheduled check - on demand check options shouldn't affect retained info */
2987 if(scheduled_check == TRUE)
2988 hst->check_options = CHECK_OPTION_NONE;
2989
2990 /* adjust host check attempt */
2991 adjust_host_check_attempt_3x(hst, TRUE);
2992
2993 /* set latency (temporarily) for macros and event broker */
2994 old_latency = hst->latency;
2995 hst->latency = latency;
2996
2997 /* grab the host macro variables */
2998 memset(&mac, 0, sizeof(mac));
2999 grab_host_macros_r(&mac, hst);
3000
3001 /* get the raw command line */
3002 get_raw_command_line_r(&mac, hst->check_command_ptr, hst->host_check_command, &raw_command, 0);
3003 if(raw_command == NULL) {
3004 clear_volatile_macros_r(&mac);
3005 log_debug_info(DEBUGL_CHECKS, 0, "Raw check command for host '%s' was NULL - aborting.\n", hst->name);
3006 return ERROR;
3007 }
3008
3009 /* process any macros contained in the argument */
3010 process_macros_r(&mac, raw_command, &processed_command, 0);
3011 my_free(raw_command);
3012 if(processed_command == NULL) {
3013 clear_volatile_macros_r(&mac);
3014 log_debug_info(DEBUGL_CHECKS, 0, "Processed check command for host '%s' was NULL - aborting.\n", hst->name);
3015 return ERROR;
3016 }
3017
3018 /* get the command start time */
3019 gettimeofday(&start_time, NULL);
3020
3021 /* set check time for on-demand checks, so they're not incorrectly detected as being orphaned - Luke Ross 5/16/08 */
3022 /* NOTE: 06/23/08 EG not sure if there will be side effects to this or not.... */
3023 if(scheduled_check == FALSE)
3024 hst->next_check = start_time.tv_sec;
3025
3026 /* increment number of host checks that are currently running... */
3027 currently_running_host_checks++;
3028
3029 /* set the execution flag */
3030 hst->is_executing = TRUE;
3031
3032 /* open a temp file for storing check output */
3033 old_umask = umask(new_umask);
3034 asprintf(&output_file, "%s/checkXXXXXX", temp_path);
3035 check_result_info.output_file_fd = mkstemp(output_file);
3036 if(check_result_info.output_file_fd >= 0)
3037 check_result_info.output_file_fp = fdopen(check_result_info.output_file_fd, "w");
3038 else {
3039 check_result_info.output_file_fp = NULL;
3040 check_result_info.output_file_fd = -1;
3041 }
3042 umask(old_umask);
3043
3044 log_debug_info(DEBUGL_CHECKS | DEBUGL_IPC, 1, "Check result output will be written to '%s' (fd=%d)\n", output_file, check_result_info.output_file_fd);
3045
3046 /* save check info */
3047 check_result_info.object_check_type = HOST_CHECK;
3048 check_result_info.host_name = (char *)strdup(hst->name);
3049 check_result_info.service_description = NULL;
3050 check_result_info.check_type = HOST_CHECK_ACTIVE;
3051 check_result_info.check_options = check_options;
3052 check_result_info.scheduled_check = scheduled_check;
3053 check_result_info.reschedule_check = reschedule_check;
3054 check_result_info.output_file = (check_result_info.output_file_fd < 0 || output_file == NULL) ? NULL : strdup(output_file);
3055 check_result_info.latency = latency;
3056 check_result_info.start_time = start_time;
3057 check_result_info.finish_time = start_time;
3058 check_result_info.early_timeout = FALSE;
3059 check_result_info.exited_ok = TRUE;
3060 check_result_info.return_code = STATE_OK;
3061 check_result_info.output = NULL;
3062
3063 /* free memory */
3064 my_free(output_file);
3065
3066 /* write initial check info to file */
3067 /* if things go bad later on, the user will at least have something to go on when debugging... */
3068 if(check_result_info.output_file_fp) {
3069
3070 fprintf(check_result_info.output_file_fp, "### Active Check Result File ###\n");
3071 fprintf(check_result_info.output_file_fp, "file_time=%lu\n", (unsigned long)check_result_info.start_time.tv_sec);
3072 fprintf(check_result_info.output_file_fp, "\n");
3073
3074 fprintf(check_result_info.output_file_fp, "### Nagios Host Check Result ###\n");
3075 fprintf(check_result_info.output_file_fp, "# Time: %s", ctime(&check_result_info.start_time.tv_sec));
3076 fprintf(check_result_info.output_file_fp, "host_name=%s\n", check_result_info.host_name);
3077 fprintf(check_result_info.output_file_fp, "check_type=%d\n", check_result_info.check_type);
3078 fprintf(check_result_info.output_file_fp, "check_options=%d\n", check_result_info.check_options);
3079 fprintf(check_result_info.output_file_fp, "scheduled_check=%d\n", check_result_info.scheduled_check);
3080 fprintf(check_result_info.output_file_fp, "reschedule_check=%d\n", check_result_info.reschedule_check);
3081 fprintf(check_result_info.output_file_fp, "latency=%f\n", hst->latency);
3082 fprintf(check_result_info.output_file_fp, "start_time=%lu.%lu\n", check_result_info.start_time.tv_sec, check_result_info.start_time.tv_usec);
3083
3084 /* flush buffer or we'll end up writing twice when we fork() */
3085 fflush(check_result_info.output_file_fp);
3086 }
3087
3088 /* initialize dynamic buffer for storing plugin output */
3089 dbuf_init(&checkresult_dbuf, dbuf_chunk);
3090
3091 #ifdef USE_EVENT_BROKER
3092 /* send data to event broker */
3093 broker_host_check(NEBTYPE_HOSTCHECK_INITIATE, NEBFLAG_NONE, NEBATTR_NONE, hst, HOST_CHECK_ACTIVE, hst->current_state, hst->state_type, start_time, end_time, hst->host_check_command, hst->latency, 0.0, host_check_timeout, FALSE, 0, processed_command, NULL, NULL, NULL, NULL);
3094 #endif
3095
3096 /* reset latency (permanent value for this check will get set later) */
3097 hst->latency = old_latency;
3098
3099 /* update check statistics */
3100 update_check_stats((scheduled_check == TRUE) ? ACTIVE_SCHEDULED_HOST_CHECK_STATS : ACTIVE_ONDEMAND_HOST_CHECK_STATS, start_time.tv_sec);
3101 update_check_stats(PARALLEL_HOST_CHECK_STATS, start_time.tv_sec);
3102
3103 /* fork a child process */
3104 pid = fork();
3105
3106 /* an error occurred while trying to fork */
3107 if(pid == -1) {
3108
3109 fork_error = TRUE;
3110
3111 /* log an error */
3112 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The check of host '%s' could not be performed due to a fork() error: '%s'.\n", hst->name, strerror(errno));
3113
3114 log_debug_info(DEBUGL_CHECKS, 0, "Check of host '%s' could not be performed due to a fork() error: '%s'!\n", hst->name, strerror(errno));
3115 }
3116
3117 /* if we are in the child process... */
3118 else if(pid == 0) {
3119
3120 /* set environment variables */
3121 set_all_macro_environment_vars_r(&mac, TRUE);
3122
3123 /* ADDED 11/12/07 EG */
3124 /* close external command file and shut down worker thread */
3125 close_command_file();
3126
3127 /* fork again if we're not in a large installation */
3128 if(child_processes_fork_twice == TRUE) {
3129
3130 /* fork again... */
3131 pid = fork();
3132
3133 /* an error occurred while trying to fork again */
3134 if(pid == -1)
3135 exit(STATE_UNKNOWN);
3136 }
3137
3138 /* the grandchild (or child if large install tweaks are enabled) process should run the host check... */
3139 if(pid == 0 || child_processes_fork_twice == FALSE) {
3140
3141 /* reset signal handling */
3142 reset_sighandler();
3143
3144 /* become the process group leader */
3145 setpgid(0, 0);
3146
3147 /* exit on term signals at this process level */
3148 signal(SIGTERM, SIG_DFL);
3149
3150 /* catch plugins that don't finish in a timely manner */
3151 signal(SIGALRM, host_check_sighandler);
3152 alarm(host_check_timeout);
3153
3154 /* disable rotation of the debug file */
3155 max_debug_file_size = 0L;
3156
3157 /* run the plugin check command */
3158 fp = popen(processed_command, "r");
3159 if(fp == NULL)
3160 _exit(STATE_UNKNOWN);
3161
3162 /* initialize buffer */
3163 strcpy(output_buffer, "");
3164
3165 /* get all lines of plugin output - escape newlines */
3166 while(fgets(output_buffer, sizeof(output_buffer) - 1, fp)) {
3167 temp_buffer = escape_newlines(output_buffer);
3168 dbuf_strcat(&checkresult_dbuf, temp_buffer);
3169 my_free(temp_buffer);
3170 }
3171
3172 /* close the process */
3173 pclose_result = pclose(fp);
3174
3175 /* reset the alarm and signal handling here */
3176 signal(SIGALRM, SIG_IGN);
3177 alarm(0);
3178
3179 /* get the check finish time */
3180 gettimeofday(&end_time, NULL);
3181
3182 /* record check result info */
3183 check_result_info.finish_time = end_time;
3184 check_result_info.early_timeout = FALSE;
3185
3186 /* test for execution error */
3187 if(pclose_result == -1) {
3188 pclose_result = STATE_UNKNOWN;
3189 check_result_info.return_code = STATE_CRITICAL;
3190 check_result_info.exited_ok = FALSE;
3191 }
3192 else {
3193 if(WEXITSTATUS(pclose_result) == 0 && WIFSIGNALED(pclose_result))
3194 check_result_info.return_code = 128 + WTERMSIG(pclose_result);
3195 else
3196 check_result_info.return_code = WEXITSTATUS(pclose_result);
3197 }
3198
3199 /* write check result to file */
3200 if(check_result_info.output_file_fp) {
3201 FILE *fp;
3202
3203 /* protect against signal races */
3204 fp = check_result_info.output_file_fp;
3205 check_result_info.output_file_fp = NULL;
3206
3207 fprintf(fp, "finish_time=%lu.%lu\n", check_result_info.finish_time.tv_sec, check_result_info.finish_time.tv_usec);
3208 fprintf(fp, "early_timeout=%d\n", check_result_info.early_timeout);
3209 fprintf(fp, "exited_ok=%d\n", check_result_info.exited_ok);
3210 fprintf(fp, "return_code=%d\n", check_result_info.return_code);
3211 fprintf(fp, "output=%s\n", (checkresult_dbuf.buf == NULL) ? "(null)" : checkresult_dbuf.buf);
3212
3213 /* close the temp file */
3214 fclose(fp);
3215
3216 /* move check result to queue directory */
3217 move_check_result_to_queue(check_result_info.output_file);
3218 }
3219
3220 /* free memory */
3221 dbuf_free(&checkresult_dbuf);
3222 my_free(processed_command);
3223
3224 /* free check result memory */
3225 free_check_result(&check_result_info);
3226
3227 /* return with plugin exit status - not really necessary... */
3228 _exit(pclose_result);
3229 }
3230
3231 /* NOTE: this code is never reached if large install tweaks are enabled... */
3232
3233 /* unset environment variables */
3234 set_all_macro_environment_vars_r(&mac, FALSE);
3235
3236 /* free allocated memory */
3237 /* this needs to be done last, so we don't free memory for variables before they're used above */
3238 if(free_child_process_memory == TRUE)
3239 free_memory(&mac);
3240
3241 /* parent exits immediately - grandchild process is inherited by the INIT process, so we have no zombie problem... */
3242 _exit(STATE_OK);
3243 }
3244
3245 /* else the parent should wait for the first child to return... */
3246 else if(pid > 0) {
3247 clear_volatile_macros_r(&mac);
3248
3249 log_debug_info(DEBUGL_CHECKS, 2, "Host check is executing in child process (pid=%lu)\n", (unsigned long)pid);
3250
3251 /* parent should close output file */
3252 if(check_result_info.output_file_fp)
3253 fclose(check_result_info.output_file_fp);
3254
3255 /* should this be done in first child process (after spawning grandchild) as well? */
3256 /* free memory allocated for IPC functionality */
3257 free_check_result(&check_result_info);
3258
3259 /* free memory */
3260 my_free(processed_command);
3261
3262 /* wait for the first child to return */
3263 /* if large install tweaks are enabled, we'll clean up the zombie process later */
3264 if(child_processes_fork_twice == TRUE)
3265 wait_result = waitpid(pid, NULL, 0);
3266 }
3267
3268 /* see if we were able to run the check... */
3269 if(fork_error == TRUE)
3270 return ERROR;
3271
3272 return OK;
3273 }
3274
3275
3276
3277 /* process results of an asynchronous host check */
handle_async_host_check_result_3x(host * temp_host,check_result * queued_check_result)3278 int handle_async_host_check_result_3x(host *temp_host, check_result *queued_check_result) {
3279 time_t current_time;
3280 int result = STATE_OK;
3281 int reschedule_check = FALSE;
3282 char *old_plugin_output = NULL;
3283 char *temp_ptr = NULL;
3284 struct timeval start_time_hires;
3285 struct timeval end_time_hires;
3286
3287 log_debug_info(DEBUGL_FUNCTIONS, 0, "handle_async_host_check_result_3x()\n");
3288
3289 /* make sure we have what we need */
3290 if(temp_host == NULL || queued_check_result == NULL)
3291 return ERROR;
3292
3293 time(¤t_time);
3294
3295 log_debug_info(DEBUGL_CHECKS, 1, "** Handling async check result for host '%s'...\n", temp_host->name);
3296
3297 log_debug_info(DEBUGL_CHECKS, 2, "\tCheck Type: %s\n", (queued_check_result->check_type == HOST_CHECK_ACTIVE) ? "Active" : "Passive");
3298 log_debug_info(DEBUGL_CHECKS, 2, "\tCheck Options: %d\n", queued_check_result->check_options);
3299 log_debug_info(DEBUGL_CHECKS, 2, "\tScheduled Check?: %s\n", (queued_check_result->scheduled_check == TRUE) ? "Yes" : "No");
3300 log_debug_info(DEBUGL_CHECKS, 2, "\tReschedule Check?: %s\n", (queued_check_result->reschedule_check == TRUE) ? "Yes" : "No");
3301 log_debug_info(DEBUGL_CHECKS, 2, "\tExited OK?: %s\n", (queued_check_result->exited_ok == TRUE) ? "Yes" : "No");
3302 log_debug_info(DEBUGL_CHECKS, 2, "\tExec Time: %.3f\n", temp_host->execution_time);
3303 log_debug_info(DEBUGL_CHECKS, 2, "\tLatency: %.3f\n", temp_host->latency);
3304 log_debug_info(DEBUGL_CHECKS, 2, "\tReturn Status: %d\n", queued_check_result->return_code);
3305 log_debug_info(DEBUGL_CHECKS, 2, "\tOutput: %s\n", (queued_check_result == NULL) ? "NULL" : queued_check_result->output);
3306
3307 /* decrement the number of host checks still out there... */
3308 if(queued_check_result->check_type == HOST_CHECK_ACTIVE && currently_running_host_checks > 0)
3309 currently_running_host_checks--;
3310
3311 /* skip this host check results if its passive and we aren't accepting passive check results */
3312 if(queued_check_result->check_type == HOST_CHECK_PASSIVE) {
3313 if(accept_passive_host_checks == FALSE) {
3314 log_debug_info(DEBUGL_CHECKS, 0, "Discarding passive host check result because passive host checks are disabled globally.\n");
3315 return ERROR;
3316 }
3317 if(temp_host->accept_passive_host_checks == FALSE) {
3318 log_debug_info(DEBUGL_CHECKS, 0, "Discarding passive host check result because passive checks are disabled for this host.\n");
3319 return ERROR;
3320 }
3321 }
3322
3323 /* clear the freshening flag (it would have been set if this host was determined to be stale) */
3324 if(queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK)
3325 temp_host->is_being_freshened = FALSE;
3326
3327 /* DISCARD INVALID FRESHNESS CHECK RESULTS */
3328 /* If a host goes stale, Nagios will initiate a forced check in order to freshen it. There is a race condition whereby a passive check
3329 could arrive between the 1) initiation of the forced check and 2) the time when the forced check result is processed here. This would
3330 make the host fresh again, so we do a quick check to make sure the host is still stale before we accept the check result. */
3331 if((queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK) && is_host_result_fresh(temp_host, current_time, FALSE) == TRUE) {
3332 log_debug_info(DEBUGL_CHECKS, 0, "Discarding host freshness check result because the host is currently fresh (race condition avoided).\n");
3333 return OK;
3334 }
3335
3336 /* was this check passive or active? */
3337 temp_host->check_type = (queued_check_result->check_type == HOST_CHECK_ACTIVE) ? HOST_CHECK_ACTIVE : HOST_CHECK_PASSIVE;
3338
3339 /* update check statistics for passive results */
3340 if(queued_check_result->check_type == HOST_CHECK_PASSIVE)
3341 update_check_stats(PASSIVE_HOST_CHECK_STATS, queued_check_result->start_time.tv_sec);
3342
3343 /* should we reschedule the next check of the host? NOTE: this might be overridden later... */
3344 reschedule_check = queued_check_result->reschedule_check;
3345
3346 /* check latency is passed to us for both active and passive checks */
3347 temp_host->latency = queued_check_result->latency;
3348
3349 /* update the execution time for this check (millisecond resolution) */
3350 temp_host->execution_time = (double)((double)(queued_check_result->finish_time.tv_sec - queued_check_result->start_time.tv_sec) + (double)((queued_check_result->finish_time.tv_usec - queued_check_result->start_time.tv_usec) / 1000.0) / 1000.0);
3351 if(temp_host->execution_time < 0.0)
3352 temp_host->execution_time = 0.0;
3353
3354 /* set the checked flag */
3355 temp_host->has_been_checked = TRUE;
3356
3357 /* clear the execution flag if this was an active check */
3358 if(queued_check_result->check_type == HOST_CHECK_ACTIVE)
3359 temp_host->is_executing = FALSE;
3360
3361 /* get the last check time */
3362 temp_host->last_check = queued_check_result->start_time.tv_sec;
3363
3364 /* was this check passive or active? */
3365 temp_host->check_type = (queued_check_result->check_type == HOST_CHECK_ACTIVE) ? HOST_CHECK_ACTIVE : HOST_CHECK_PASSIVE;
3366
3367 /* save the old host state */
3368 temp_host->last_state = temp_host->current_state;
3369 if(temp_host->state_type == HARD_STATE)
3370 temp_host->last_hard_state = temp_host->current_state;
3371
3372 /* save old plugin output */
3373 if(temp_host->plugin_output)
3374 old_plugin_output = (char *)strdup(temp_host->plugin_output);
3375
3376 /* clear the old plugin output and perf data buffers */
3377 my_free(temp_host->plugin_output);
3378 my_free(temp_host->long_plugin_output);
3379 my_free(temp_host->perf_data);
3380
3381 /* parse check output to get: (1) short output, (2) long output, (3) perf data */
3382 parse_check_output(queued_check_result->output, &temp_host->plugin_output, &temp_host->long_plugin_output, &temp_host->perf_data, TRUE, TRUE);
3383
3384 /* make sure we have some data */
3385 if(temp_host->plugin_output == NULL || !strcmp(temp_host->plugin_output, "")) {
3386 my_free(temp_host->plugin_output);
3387 temp_host->plugin_output = (char *)strdup("(No output returned from host check)");
3388 }
3389
3390 /* replace semicolons in plugin output (but not performance data) with colons */
3391 if((temp_ptr = temp_host->plugin_output)) {
3392 while((temp_ptr = strchr(temp_ptr, ';')))
3393 * temp_ptr = ':';
3394 }
3395
3396 log_debug_info(DEBUGL_CHECKS, 2, "Parsing check output...\n");
3397 log_debug_info(DEBUGL_CHECKS, 2, "Short Output: %s\n", (temp_host->plugin_output == NULL) ? "NULL" : temp_host->plugin_output);
3398 log_debug_info(DEBUGL_CHECKS, 2, "Long Output: %s\n", (temp_host->long_plugin_output == NULL) ? "NULL" : temp_host->long_plugin_output);
3399 log_debug_info(DEBUGL_CHECKS, 2, "Perf Data: %s\n", (temp_host->perf_data == NULL) ? "NULL" : temp_host->perf_data);
3400
3401 /* get the unprocessed return code */
3402 /* NOTE: for passive checks, this is the final/processed state */
3403 result = queued_check_result->return_code;
3404
3405 /* adjust return code (active checks only) */
3406 if(queued_check_result->check_type == HOST_CHECK_ACTIVE) {
3407
3408 /* if there was some error running the command, just skip it (this shouldn't be happening) */
3409 if(queued_check_result->exited_ok == FALSE) {
3410
3411 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check of host '%s' did not exit properly!\n", temp_host->name);
3412
3413 my_free(temp_host->plugin_output);
3414 my_free(temp_host->long_plugin_output);
3415 my_free(temp_host->perf_data);
3416
3417 temp_host->plugin_output = (char *)strdup("(Host check did not exit properly)");
3418
3419 result = STATE_CRITICAL;
3420 }
3421
3422 /* make sure the return code is within bounds */
3423 else if(queued_check_result->return_code < 0 || queued_check_result->return_code > 3) {
3424
3425 logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Return code of %d for check of host '%s' was out of bounds.%s\n", queued_check_result->return_code, temp_host->name, (queued_check_result->return_code == 126 || queued_check_result->return_code == 127) ? " Make sure the plugin you're trying to run actually exists." : "");
3426
3427 my_free(temp_host->plugin_output);
3428 my_free(temp_host->long_plugin_output);
3429 my_free(temp_host->perf_data);
3430
3431 asprintf(&temp_host->plugin_output, "(Return code of %d is out of bounds%s)", queued_check_result->return_code, (queued_check_result->return_code == 126 || queued_check_result->return_code == 127) ? " - plugin may be missing" : "");
3432
3433 result = STATE_CRITICAL;
3434 }
3435
3436 /* a NULL host check command means we should assume the host is UP */
3437 if(temp_host->host_check_command == NULL) {
3438 my_free(temp_host->plugin_output);
3439 temp_host->plugin_output = (char *)strdup("(Host assumed to be UP)");
3440 result = STATE_OK;
3441 }
3442 }
3443
3444 /* translate return code to basic UP/DOWN state - the DOWN/UNREACHABLE state determination is made later */
3445 /* NOTE: only do this for active checks - passive check results already have the final state */
3446 if(queued_check_result->check_type == HOST_CHECK_ACTIVE) {
3447
3448 /* if we're not doing aggressive host checking, let WARNING states indicate the host is up (fake the result to be STATE_OK) */
3449 if(use_aggressive_host_checking == FALSE && result == STATE_WARNING)
3450 result = STATE_OK;
3451
3452 /* OK states means the host is UP */
3453 if(result == STATE_OK)
3454 result = HOST_UP;
3455
3456 /* any problem state indicates the host is not UP */
3457 else
3458 result = HOST_DOWN;
3459 }
3460
3461
3462 /******************* PROCESS THE CHECK RESULTS ******************/
3463
3464 /* process the host check result */
3465 process_host_check_result_3x(temp_host, result, old_plugin_output, CHECK_OPTION_NONE, reschedule_check, TRUE, cached_host_check_horizon);
3466
3467 /* free memory */
3468 my_free(old_plugin_output);
3469
3470 log_debug_info(DEBUGL_CHECKS, 1, "** Async check result for host '%s' handled: new state=%d\n", temp_host->name, temp_host->current_state);
3471
3472 /* high resolution start time for event broker */
3473 start_time_hires = queued_check_result->start_time;
3474
3475 /* high resolution end time for event broker */
3476 gettimeofday(&end_time_hires, NULL);
3477
3478 #ifdef USE_EVENT_BROKER
3479 /* send data to event broker */
3480 broker_host_check(NEBTYPE_HOSTCHECK_PROCESSED, NEBFLAG_NONE, NEBATTR_NONE, temp_host, temp_host->check_type, temp_host->current_state, temp_host->state_type, start_time_hires, end_time_hires, temp_host->host_check_command, temp_host->latency, temp_host->execution_time, host_check_timeout, queued_check_result->early_timeout, queued_check_result->return_code, NULL, temp_host->plugin_output, temp_host->long_plugin_output, temp_host->perf_data, NULL);
3481 #endif
3482
3483 return OK;
3484 }
3485
3486
3487
3488 /* processes the result of a synchronous or asynchronous host check */
process_host_check_result_3x(host * hst,int new_state,char * old_plugin_output,int check_options,int reschedule_check,int use_cached_result,unsigned long check_timestamp_horizon)3489 int process_host_check_result_3x(host *hst, int new_state, char *old_plugin_output, int check_options, int reschedule_check, int use_cached_result, unsigned long check_timestamp_horizon) {
3490 hostsmember *temp_hostsmember = NULL;
3491 host *child_host = NULL;
3492 host *parent_host = NULL;
3493 host *master_host = NULL;
3494 host *temp_host = NULL;
3495 hostdependency *temp_dependency = NULL;
3496 objectlist *check_hostlist = NULL;
3497 objectlist *hostlist_item = NULL;
3498 int parent_state = HOST_UP;
3499 time_t current_time = 0L;
3500 time_t next_check = 0L;
3501 time_t preferred_time = 0L;
3502 time_t next_valid_time = 0L;
3503 int run_async_check = TRUE;
3504 void *ptr = NULL;
3505
3506
3507 log_debug_info(DEBUGL_FUNCTIONS, 0, "process_host_check_result_3x()\n");
3508
3509 log_debug_info(DEBUGL_CHECKS, 1, "HOST: %s, ATTEMPT=%d/%d, CHECK TYPE=%s, STATE TYPE=%s, OLD STATE=%d, NEW STATE=%d\n", hst->name, hst->current_attempt, hst->max_attempts, (hst->check_type == HOST_CHECK_ACTIVE) ? "ACTIVE" : "PASSIVE", (hst->state_type == HARD_STATE) ? "HARD" : "SOFT", hst->current_state, new_state);
3510
3511 /* get the current time */
3512 time(¤t_time);
3513
3514 /* default next check time */
3515 next_check = (unsigned long)(current_time + (hst->check_interval * interval_length));
3516
3517 /* we have to adjust current attempt # for passive checks, as it isn't done elsewhere */
3518 if(hst->check_type == HOST_CHECK_PASSIVE && passive_host_checks_are_soft == TRUE)
3519 adjust_host_check_attempt_3x(hst, FALSE);
3520
3521 /* log passive checks - we need to do this here, as some my bypass external commands by getting dropped in checkresults dir */
3522 if(hst->check_type == HOST_CHECK_PASSIVE) {
3523 if(log_passive_checks == TRUE)
3524 logit(NSLOG_PASSIVE_CHECK, FALSE, "PASSIVE HOST CHECK: %s;%d;%s\n", hst->name, new_state, hst->plugin_output);
3525 }
3526
3527
3528 /******* HOST WAS DOWN/UNREACHABLE INITIALLY *******/
3529 if(hst->current_state != HOST_UP) {
3530
3531 log_debug_info(DEBUGL_CHECKS, 1, "Host was DOWN/UNREACHABLE.\n");
3532
3533 /***** HOST IS NOW UP *****/
3534 /* the host just recovered! */
3535 if(new_state == HOST_UP) {
3536
3537 /* set the current state */
3538 hst->current_state = HOST_UP;
3539
3540 /* set the state type */
3541 /* set state type to HARD for passive checks and active checks that were previously in a HARD STATE */
3542 if(hst->state_type == HARD_STATE || (hst->check_type == HOST_CHECK_PASSIVE && passive_host_checks_are_soft == FALSE))
3543 hst->state_type = HARD_STATE;
3544 else
3545 hst->state_type = SOFT_STATE;
3546
3547 log_debug_info(DEBUGL_CHECKS, 1, "Host experienced a %s recovery (it's now UP).\n", (hst->state_type == HARD_STATE) ? "HARD" : "SOFT");
3548
3549 /* reschedule the next check of the host at the normal interval */
3550 reschedule_check = TRUE;
3551 next_check = (unsigned long)(current_time + (hst->check_interval * interval_length));
3552
3553 /* propagate checks to immediate parents if they are not already UP */
3554 /* we do this because a parent host (or grandparent) may have recovered somewhere and we should catch the recovery as soon as possible */
3555 log_debug_info(DEBUGL_CHECKS, 1, "Propagating checks to parent host(s)...\n");
3556
3557 for(temp_hostsmember = hst->parent_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
3558 if((parent_host = temp_hostsmember->host_ptr) == NULL)
3559 continue;
3560 if(parent_host->current_state != HOST_UP) {
3561 log_debug_info(DEBUGL_CHECKS, 1, "Check of parent host '%s' queued.\n", parent_host->name);
3562 add_object_to_objectlist(&check_hostlist, (void *)parent_host);
3563 }
3564 }
3565
3566 /* propagate checks to immediate children if they are not already UP */
3567 /* we do this because children may currently be UNREACHABLE, but may (as a result of this recovery) switch to UP or DOWN states */
3568 log_debug_info(DEBUGL_CHECKS, 1, "Propagating checks to child host(s)...\n");
3569
3570 for(temp_hostsmember = hst->child_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
3571 if((child_host = temp_hostsmember->host_ptr) == NULL)
3572 continue;
3573 if(child_host->current_state != HOST_UP) {
3574 log_debug_info(DEBUGL_CHECKS, 1, "Check of child host '%s' queued.\n", child_host->name);
3575 add_object_to_objectlist(&check_hostlist, (void *)child_host);
3576 }
3577 }
3578 }
3579
3580 /***** HOST IS STILL DOWN/UNREACHABLE *****/
3581 /* we're still in a problem state... */
3582 else {
3583
3584 log_debug_info(DEBUGL_CHECKS, 1, "Host is still DOWN/UNREACHABLE.\n");
3585
3586 /* passive checks are treated as HARD states by default... */
3587 if(hst->check_type == HOST_CHECK_PASSIVE && passive_host_checks_are_soft == FALSE) {
3588
3589 /* set the state type */
3590 hst->state_type = HARD_STATE;
3591
3592 /* reset the current attempt */
3593 hst->current_attempt = 1;
3594 }
3595
3596 /* active checks and passive checks (treated as SOFT states) */
3597 else {
3598
3599 /* set the state type */
3600 /* we've maxed out on the retries */
3601 if(hst->current_attempt == hst->max_attempts)
3602 hst->state_type = HARD_STATE;
3603 /* the host was in a hard problem state before, so it still is now */
3604 else if(hst->current_attempt == 1)
3605 hst->state_type = HARD_STATE;
3606 /* the host is in a soft state and the check will be retried */
3607 else
3608 hst->state_type = SOFT_STATE;
3609 }
3610
3611 /* make a determination of the host's state */
3612 /* translate host state between DOWN/UNREACHABLE (only for passive checks if enabled) */
3613 hst->current_state = new_state;
3614 if(hst->check_type == HOST_CHECK_ACTIVE || translate_passive_host_checks == TRUE)
3615 hst->current_state = determine_host_reachability(hst);
3616
3617 /* reschedule the next check if the host state changed */
3618 if(hst->last_state != hst->current_state || hst->last_hard_state != hst->current_state) {
3619
3620 reschedule_check = TRUE;
3621
3622 /* schedule a re-check of the host at the retry interval because we can't determine its final state yet... */
3623 if(hst->state_type == SOFT_STATE)
3624 next_check = (unsigned long)(current_time + (hst->retry_interval * interval_length));
3625
3626 /* host has maxed out on retries (or was previously in a hard problem state), so reschedule the next check at the normal interval */
3627 else
3628 next_check = (unsigned long)(current_time + (hst->check_interval * interval_length));
3629 }
3630
3631 }
3632
3633 }
3634
3635 /******* HOST WAS UP INITIALLY *******/
3636 else {
3637
3638 log_debug_info(DEBUGL_CHECKS, 1, "Host was UP.\n");
3639
3640 /***** HOST IS STILL UP *****/
3641 /* either the host never went down since last check */
3642 if(new_state == HOST_UP) {
3643
3644 log_debug_info(DEBUGL_CHECKS, 1, "Host is still UP.\n");
3645
3646 /* set the current state */
3647 hst->current_state = HOST_UP;
3648
3649 /* set the state type */
3650 hst->state_type = HARD_STATE;
3651
3652 /* reschedule the next check at the normal interval */
3653 if(reschedule_check == TRUE)
3654 next_check = (unsigned long)(current_time + (hst->check_interval * interval_length));
3655 }
3656
3657 /***** HOST IS NOW DOWN/UNREACHABLE *****/
3658 else {
3659
3660 log_debug_info(DEBUGL_CHECKS, 1, "Host is now DOWN/UNREACHABLE.\n");
3661
3662 /***** SPECIAL CASE FOR HOSTS WITH MAX_ATTEMPTS==1 *****/
3663 if(hst->max_attempts == 1) {
3664
3665 log_debug_info(DEBUGL_CHECKS, 1, "Max attempts = 1!.\n");
3666
3667 /* set the state type */
3668 hst->state_type = HARD_STATE;
3669
3670 /* host has maxed out on retries, so reschedule the next check at the normal interval */
3671 reschedule_check = TRUE;
3672 next_check = (unsigned long)(current_time + (hst->check_interval * interval_length));
3673
3674 /* we need to run SYNCHRONOUS checks of all parent hosts to accurately determine the state of this host */
3675 /* this is extremely inefficient (reminiscent of Nagios 2.x logic), but there's no other good way around it */
3676 /* check all parent hosts to see if we're DOWN or UNREACHABLE */
3677 /* only do this for ACTIVE checks, as PASSIVE checks contain a pre-determined state */
3678 if(hst->check_type == HOST_CHECK_ACTIVE) {
3679
3680 log_debug_info(DEBUGL_CHECKS, 1, "** WARNING: Max attempts = 1, so we have to run serial checks of all parent hosts!\n");
3681
3682 for(temp_hostsmember = hst->parent_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
3683
3684 if((parent_host = temp_hostsmember->host_ptr) == NULL)
3685 continue;
3686
3687 log_debug_info(DEBUGL_CHECKS, 1, "Running serial check parent host '%s'...\n", parent_host->name);
3688
3689 /* run an immediate check of the parent host */
3690 run_sync_host_check_3x(parent_host, &parent_state, check_options, use_cached_result, check_timestamp_horizon);
3691
3692 /* bail out as soon as we find one parent host that is UP */
3693 if(parent_state == HOST_UP) {
3694
3695 log_debug_info(DEBUGL_CHECKS, 1, "Parent host is UP, so this one is DOWN.\n");
3696
3697 /* set the current state */
3698 hst->current_state = HOST_DOWN;
3699 break;
3700 }
3701 }
3702
3703 if(temp_hostsmember == NULL) {
3704 /* host has no parents, so its up */
3705 if(hst->parent_hosts == NULL) {
3706 log_debug_info(DEBUGL_CHECKS, 1, "Host has no parents, so it's DOWN.\n");
3707 hst->current_state = HOST_DOWN;
3708 }
3709 else {
3710 /* no parents were up, so this host is UNREACHABLE */
3711 log_debug_info(DEBUGL_CHECKS, 1, "No parents were UP, so this host is UNREACHABLE.\n");
3712 hst->current_state = HOST_UNREACHABLE;
3713 }
3714 }
3715 }
3716
3717 /* set the host state for passive checks */
3718 else {
3719 /* set the state */
3720 hst->current_state = new_state;
3721
3722 /* translate host state between DOWN/UNREACHABLE for passive checks (if enabled) */
3723 /* make a determination of the host's state */
3724 if(translate_passive_host_checks == TRUE)
3725 hst->current_state = determine_host_reachability(hst);
3726
3727 }
3728
3729 /* propagate checks to immediate children if they are not UNREACHABLE */
3730 /* we do this because we may now be blocking the route to child hosts */
3731 log_debug_info(DEBUGL_CHECKS, 1, "Propagating check to immediate non-UNREACHABLE child hosts...\n");
3732
3733 for(temp_hostsmember = hst->child_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
3734 if((child_host = temp_hostsmember->host_ptr) == NULL)
3735 continue;
3736 if(child_host->current_state != HOST_UNREACHABLE) {
3737 log_debug_info(DEBUGL_CHECKS, 1, "Check of child host '%s' queued.\n", child_host->name);
3738 add_object_to_objectlist(&check_hostlist, (void *)child_host);
3739 }
3740 }
3741 }
3742
3743 /***** MAX ATTEMPTS > 1 *****/
3744 else {
3745
3746 /* active and (in some cases) passive check results are treated as SOFT states */
3747 if(hst->check_type == HOST_CHECK_ACTIVE || passive_host_checks_are_soft == TRUE) {
3748
3749 /* set the state type */
3750 hst->state_type = SOFT_STATE;
3751 }
3752
3753 /* by default, passive check results are treated as HARD states */
3754 else {
3755
3756 /* set the state type */
3757 hst->state_type = HARD_STATE;
3758
3759 /* reset the current attempt */
3760 hst->current_attempt = 1;
3761 }
3762
3763 /* make a (in some cases) preliminary determination of the host's state */
3764 /* translate host state between DOWN/UNREACHABLE (for passive checks only if enabled) */
3765 hst->current_state = new_state;
3766 if(hst->check_type == HOST_CHECK_ACTIVE || translate_passive_host_checks == TRUE)
3767 hst->current_state = determine_host_reachability(hst);
3768
3769 /* reschedule a check of the host */
3770 reschedule_check = TRUE;
3771
3772 /* schedule a re-check of the host at the retry interval because we can't determine its final state yet... */
3773 if(hst->check_type == HOST_CHECK_ACTIVE || passive_host_checks_are_soft == TRUE)
3774 next_check = (unsigned long)(current_time + (hst->retry_interval * interval_length));
3775
3776 /* schedule a re-check of the host at the normal interval */
3777 else
3778 next_check = (unsigned long)(current_time + (hst->check_interval * interval_length));
3779
3780 /* propagate checks to immediate parents if they are UP */
3781 /* we do this because a parent host (or grandparent) may have gone down and blocked our route */
3782 /* checking the parents ASAP will allow us to better determine the final state (DOWN/UNREACHABLE) of this host later */
3783 log_debug_info(DEBUGL_CHECKS, 1, "Propagating checks to immediate parent hosts that are UP...\n");
3784
3785 for(temp_hostsmember = hst->parent_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
3786 if((parent_host = temp_hostsmember->host_ptr) == NULL)
3787 continue;
3788 if(parent_host->current_state == HOST_UP) {
3789 add_object_to_objectlist(&check_hostlist, (void *)parent_host);
3790 log_debug_info(DEBUGL_CHECKS, 1, "Check of host '%s' queued.\n", parent_host->name);
3791 }
3792 }
3793
3794 /* propagate checks to immediate children if they are not UNREACHABLE */
3795 /* we do this because we may now be blocking the route to child hosts */
3796 log_debug_info(DEBUGL_CHECKS, 1, "Propagating checks to immediate non-UNREACHABLE child hosts...\n");
3797
3798 for(temp_hostsmember = hst->child_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
3799 if((child_host = temp_hostsmember->host_ptr) == NULL)
3800 continue;
3801 if(child_host->current_state != HOST_UNREACHABLE) {
3802 log_debug_info(DEBUGL_CHECKS, 1, "Check of child host '%s' queued.\n", child_host->name);
3803 add_object_to_objectlist(&check_hostlist, (void *)child_host);
3804 }
3805 }
3806
3807 /* check dependencies on second to last host check */
3808 if(enable_predictive_host_dependency_checks == TRUE && hst->current_attempt == (hst->max_attempts - 1)) {
3809
3810 /* propagate checks to hosts that THIS ONE depends on for notifications AND execution */
3811 /* we do to help ensure that the dependency checks are accurate before it comes time to notify */
3812 log_debug_info(DEBUGL_CHECKS, 1, "Propagating predictive dependency checks to hosts this one depends on...\n");
3813
3814 for(temp_dependency = get_first_hostdependency_by_dependent_host(hst->name, &ptr); temp_dependency != NULL; temp_dependency = get_next_hostdependency_by_dependent_host(hst->name, &ptr)) {
3815 if(temp_dependency->dependent_host_ptr == hst && temp_dependency->master_host_ptr != NULL) {
3816 master_host = (host *)temp_dependency->master_host_ptr;
3817 log_debug_info(DEBUGL_CHECKS, 1, "Check of host '%s' queued.\n", master_host->name);
3818 add_object_to_objectlist(&check_hostlist, (void *)master_host);
3819 }
3820 }
3821 }
3822 }
3823 }
3824 }
3825
3826 log_debug_info(DEBUGL_CHECKS, 1, "Pre-handle_host_state() Host: %s, Attempt=%d/%d, Type=%s, Final State=%d\n", hst->name, hst->current_attempt, hst->max_attempts, (hst->state_type == HARD_STATE) ? "HARD" : "SOFT", hst->current_state);
3827
3828 /* handle the host state */
3829 handle_host_state(hst);
3830
3831 log_debug_info(DEBUGL_CHECKS, 1, "Post-handle_host_state() Host: %s, Attempt=%d/%d, Type=%s, Final State=%d\n", hst->name, hst->current_attempt, hst->max_attempts, (hst->state_type == HARD_STATE) ? "HARD" : "SOFT", hst->current_state);
3832
3833
3834 /******************** POST-PROCESSING STUFF *********************/
3835
3836 /* if the plugin output differs from previous check and no state change, log the current state/output if state stalking is enabled */
3837 if(hst->last_state == hst->current_state && compare_strings(old_plugin_output, hst->plugin_output)) {
3838
3839 if(hst->current_state == HOST_UP && hst->stalk_on_up == TRUE)
3840 log_host_event(hst);
3841
3842 else if(hst->current_state == HOST_DOWN && hst->stalk_on_down == TRUE)
3843 log_host_event(hst);
3844
3845 else if(hst->current_state == HOST_UNREACHABLE && hst->stalk_on_unreachable == TRUE)
3846 log_host_event(hst);
3847 }
3848
3849 /* check to see if the associated host is flapping */
3850 check_for_host_flapping(hst, TRUE, TRUE, TRUE);
3851
3852 /* reschedule the next check of the host (usually ONLY for scheduled, active checks, unless overridden above) */
3853 if(reschedule_check == TRUE) {
3854
3855 log_debug_info(DEBUGL_CHECKS, 1, "Rescheduling next check of host at %s", ctime(&next_check));
3856
3857 /* default is to reschedule host check unless a test below fails... */
3858 hst->should_be_scheduled = TRUE;
3859
3860 /* get the new current time */
3861 time(¤t_time);
3862
3863 /* make sure we don't get ourselves into too much trouble... */
3864 if(current_time > next_check)
3865 hst->next_check = current_time;
3866 else
3867 hst->next_check = next_check;
3868
3869 /* make sure we rescheduled the next service check at a valid time */
3870 preferred_time = hst->next_check;
3871 get_next_valid_time(preferred_time, &next_valid_time, hst->check_period_ptr);
3872 hst->next_check = next_valid_time;
3873
3874 /* hosts with non-recurring intervals do not get rescheduled if we're in a HARD or UP state */
3875 if(hst->check_interval == 0 && (hst->state_type == HARD_STATE || hst->current_state == HOST_UP))
3876 hst->should_be_scheduled = FALSE;
3877
3878 /* host with active checks disabled do not get rescheduled */
3879 if(hst->checks_enabled == FALSE)
3880 hst->should_be_scheduled = FALSE;
3881
3882 /* schedule a non-forced check if we can */
3883 if(hst->should_be_scheduled == TRUE) {
3884 schedule_host_check(hst, hst->next_check, CHECK_OPTION_NONE);
3885 }
3886 }
3887
3888 /* update host status - for both active (scheduled) and passive (non-scheduled) hosts */
3889 update_host_status(hst, FALSE);
3890
3891 /* run async checks of all hosts we added above */
3892 /* don't run a check if one is already executing or we can get by with a cached state */
3893 for(hostlist_item = check_hostlist; hostlist_item != NULL; hostlist_item = hostlist_item->next) {
3894 run_async_check = TRUE;
3895 temp_host = (host *)hostlist_item->object_ptr;
3896
3897 log_debug_info(DEBUGL_CHECKS, 2, "ASYNC CHECK OF HOST: %s, CURRENTTIME: %lu, LASTHOSTCHECK: %lu, CACHEDTIMEHORIZON: %lu, USECACHEDRESULT: %d, ISEXECUTING: %d\n", temp_host->name, current_time, temp_host->last_check, check_timestamp_horizon, use_cached_result, temp_host->is_executing);
3898
3899 if(use_cached_result == TRUE && ((current_time - temp_host->last_check) <= check_timestamp_horizon))
3900 run_async_check = FALSE;
3901 if(temp_host->is_executing == TRUE)
3902 run_async_check = FALSE;
3903 if(run_async_check == TRUE)
3904 run_async_host_check_3x(temp_host, CHECK_OPTION_NONE, 0.0, FALSE, FALSE, NULL, NULL);
3905 }
3906 free_objectlist(&check_hostlist);
3907
3908 return OK;
3909 }
3910
3911
3912
3913 /* checks viability of performing a host check */
/*
 * Decides whether a check of the given host may be executed right now.
 *
 * hst            - host to evaluate; a NULL pointer yields ERROR
 * check_options  - check option bit flags; when CHECK_OPTION_FORCE_EXECUTION
 *                  is set none of the viability tests below are applied
 * time_is_valid  - optional out-parameter (may be NULL); set to FALSE when
 *                  check_time_against_period() rejects the current time for
 *                  the host's check period. It is never set to TRUE here, so
 *                  the caller has to initialize it.
 * new_time       - optional out-parameter (may be NULL); receives the
 *                  preferred time for the next check attempt
 *
 * Returns OK when the check may be performed, ERROR otherwise.
 */
int check_host_check_viability_3x(host *hst, int check_options, int *time_is_valid, time_t *new_time) {
	int perform_check = TRUE;
	time_t current_time = 0L;
	time_t preferred_time = 0L;
	int check_interval = 0;

	log_debug_info(DEBUGL_FUNCTIONS, 0, "check_host_check_viability_3x()\n");

	/* make sure we have a host */
	if(hst == NULL)
		return ERROR;

	/* get the check interval to use if we need to reschedule the check: */
	/* the retry interval for soft problem states, the normal interval otherwise */
	if(hst->state_type == SOFT_STATE && hst->current_state != HOST_UP)
		check_interval = (hst->retry_interval * interval_length);
	else
		check_interval = (hst->check_interval * interval_length);

	/* make sure check interval is positive - otherwise use 5 minutes out for next check */
	if(check_interval <= 0)
		check_interval = 300;

	/* get the current time */
	time(&current_time);

	/* initialize the next preferred check time */
	preferred_time = current_time;

	/* can we check the host right now? (forced checks skip all of these tests) */
	/* NOTE: the tests are not exclusive - a later test overrides the preferred time set by an earlier one */
	if(!(check_options & CHECK_OPTION_FORCE_EXECUTION)) {

		/* if checks of the host are currently disabled... */
		if(hst->checks_enabled == FALSE) {
			preferred_time = current_time + check_interval;
			perform_check = FALSE;
		}

		/* make sure this is a valid time to check the host */
		if(check_time_against_period((unsigned long)current_time, hst->check_period_ptr) == ERROR) {
			preferred_time = current_time;
			if(time_is_valid)
				*time_is_valid = FALSE;
			perform_check = FALSE;
		}

		/* check host dependencies for execution */
		if(check_host_dependencies(hst, EXECUTION_DEPENDENCY) == DEPENDENCIES_FAILED) {
			preferred_time = current_time + check_interval;
			perform_check = FALSE;
		}
	}

	/* pass back the next viable check time */
	if(new_time)
		*new_time = preferred_time;

	return (perform_check == TRUE) ? OK : ERROR;
}
3975
3976
3977
3978 /* adjusts current host check attempt before a new check is performed */
/*
 * Updates hst->current_attempt before a new check of the host is performed.
 *
 * hst        - host whose attempt counter is adjusted; NULL yields ERROR
 * is_active  - TRUE when the upcoming check is an active check
 *
 * The counter is set back to 1 when the host is in a hard state, or when an
 * active check finds it in a soft UP state. In every other case the counter
 * is incremented, but never past hst->max_attempts.
 *
 * Returns OK on success, ERROR when hst is NULL.
 */
int adjust_host_check_attempt_3x(host *hst, int is_active) {
	int restart_count = FALSE;

	log_debug_info(DEBUGL_FUNCTIONS, 0, "adjust_host_check_attempt_3x()\n");

	if(hst == NULL)
		return ERROR;

	log_debug_info(DEBUGL_CHECKS, 2, "Adjusting check attempt number for host '%s': current attempt=%d/%d, state=%d, state type=%d\n", hst->name, hst->current_attempt, hst->max_attempts, hst->current_state, hst->state_type);

	/* hard states always restart the count */
	if(hst->state_type == HARD_STATE)
		restart_count = TRUE;

	/* so does a soft UP state, but only for active checks */
	else if(is_active == TRUE && hst->state_type == SOFT_STATE && hst->current_state == HOST_UP)
		restart_count = TRUE;

	if(restart_count == TRUE)
		hst->current_attempt = 1;

	/* otherwise move on to the next attempt, capped at the maximum */
	else if(hst->current_attempt < hst->max_attempts)
		hst->current_attempt++;

	log_debug_info(DEBUGL_CHECKS, 2, "New check attempt number = %d\n", hst->current_attempt);

	return OK;
}
4004
4005
4006
4007 /* determination of the host's state based on route availability*/
4008 /* used only to determine difference between DOWN and UNREACHABLE states */
/*
 * Translates the state of a host according to the state of its parents.
 *
 * hst - host to evaluate; NULL yields HOST_DOWN
 *
 * Returns HOST_UP when the host's current state is UP, HOST_DOWN when the
 * host has no parents or at least one parent is UP, and HOST_UNREACHABLE
 * when the parent list contains no parent that is UP.
 */
int determine_host_reachability(host *hst) {
	hostsmember *member = NULL;
	host *parent = NULL;

	log_debug_info(DEBUGL_FUNCTIONS, 0, "determine_host_reachability()\n");

	if(hst == NULL)
		return HOST_DOWN;

	log_debug_info(DEBUGL_CHECKS, 2, "Determining state of host '%s': current state=%d\n", hst->name, hst->current_state);

	/* an UP host needs no translation */
	if(hst->current_state == HOST_UP) {
		log_debug_info(DEBUGL_CHECKS, 2, "Host is UP, no state translation needed.\n");
		return HOST_UP;
	}

	/* without parents there is no route that could be blocked */
	if(hst->parent_hosts == NULL) {
		log_debug_info(DEBUGL_CHECKS, 2, "Host has no parents, so it is DOWN.\n");
		return HOST_DOWN;
	}

	/* walk the parent list; the first parent found UP settles the question */
	member = hst->parent_hosts;
	while(member != NULL) {

		parent = member->host_ptr;

		/* list entries without a host pointer are ignored */
		if(parent != NULL && parent->current_state == HOST_UP) {
			log_debug_info(DEBUGL_CHECKS, 2, "At least one parent (%s) is up, so host is DOWN.\n", parent->name);
			return HOST_DOWN;
		}

		member = member->next;
	}

	/* the whole list was scanned without finding a parent that is UP */
	log_debug_info(DEBUGL_CHECKS, 2, "No parents were up, so host is UNREACHABLE.\n");

	return HOST_UNREACHABLE;
}
4058
4059
4060
4061 /******************************************************************/
4062 /****************** HOST STATE HANDLER FUNCTIONS ******************/
4063 /******************************************************************/
4064
4065
4066 /* top level host state handler - occurs after every host check (soft/hard and active/passive) */
/*
 * Top level host state handler - runs after every host check (soft/hard and
 * active/passive).
 *
 * hst - host whose freshly determined state is to be handled; NULL yields ERROR
 *
 * In order, this routine:
 *   - runs the obsessive-compulsive processor and the performance data update
 *   - stamps last_time_up / last_time_down / last_time_unreachable
 *   - detects (hard) state changes by comparing against last_state and
 *     last_hard_state
 *   - on a change: resets notification times and acknowledgements, assigns
 *     new event/problem ids, logs the event, checks pending flexible
 *     downtime, sends notifications for hard states and runs the host event
 *     handler
 *   - without a change: re-notifies for hard problem states and logs soft
 *     retries if log_host_retries is enabled
 *
 * Returns OK, or ERROR when hst is NULL.
 */
int handle_host_state(host *hst) {
	int state_change = FALSE;
	int hard_state_change = FALSE;
	time_t current_time = 0L;

	log_debug_info(DEBUGL_FUNCTIONS, 0, "handle_host_state()\n");

	/* make sure we have a host (same guard as the other host check routines) */
	if(hst == NULL)
		return ERROR;

	/* get current time */
	time(&current_time);

	/* obsess over this host check */
	obsessive_compulsive_host_check_processor(hst);

	/* update performance data */
	update_host_performance_data(hst);

	/* record latest time for current state */
	switch(hst->current_state) {
		case HOST_UP:
			hst->last_time_up = current_time;
			break;
		case HOST_DOWN:
			hst->last_time_down = current_time;
			break;
		case HOST_UNREACHABLE:
			hst->last_time_unreachable = current_time;
			break;
		default:
			break;
	}

	/* has the host state changed? (a soft UP state is always treated as a change) */
	if(hst->last_state != hst->current_state || (hst->current_state == HOST_UP && hst->state_type == SOFT_STATE))
		state_change = TRUE;

	if(hst->current_attempt >= hst->max_attempts && hst->last_hard_state != hst->current_state)
		hard_state_change = TRUE;

	/* if the host state has changed... */
	if(state_change == TRUE || hard_state_change == TRUE) {

		/* reset the next and last notification times */
		hst->last_host_notification = (time_t)0;
		hst->next_host_notification = (time_t)0;

		/* reset notification suppression option */
		hst->no_more_notifications = FALSE;

		/* reset the acknowledgement flag if necessary */
		/* NOTE: inside this branch the second half of the test reduces to state_change == TRUE, */
		/* i.e. a normal acknowledgement survives a hard state change that is not also a state change */
		if(hst->acknowledgement_type == ACKNOWLEDGEMENT_NORMAL && (state_change == TRUE || hard_state_change == FALSE)) {

			hst->problem_has_been_acknowledged = FALSE;
			hst->acknowledgement_type = ACKNOWLEDGEMENT_NONE;

			/* remove any non-persistent comments associated with the ack */
			delete_host_acknowledgement_comments(hst);
		}

		/* sticky acknowledgements are only dropped once the host is UP again */
		else if(hst->acknowledgement_type == ACKNOWLEDGEMENT_STICKY && hst->current_state == HOST_UP) {

			hst->problem_has_been_acknowledged = FALSE;
			hst->acknowledgement_type = ACKNOWLEDGEMENT_NONE;

			/* remove any non-persistent comments associated with the ack */
			delete_host_acknowledgement_comments(hst);
		}
	}

	/* Not sure about this, but is old behaviour */
	/* (from here on any difference to the last hard state counts, regardless of the attempt number) */
	if(hst->last_hard_state != hst->current_state)
		hard_state_change = TRUE;

	if(state_change == TRUE || hard_state_change == TRUE) {

		/* update last state change times */
		hst->last_state_change = current_time;
		if(hst->state_type == HARD_STATE)
			hst->last_hard_state_change = current_time;

		/* update the event id */
		hst->last_event_id = hst->current_event_id;
		hst->current_event_id = next_event_id;
		next_event_id++;

		/* update the problem id when transitioning to a problem state */
		/* (last_problem_id is deliberately left alone here, or it would be zero the next time a problem is encountered) */
		if(hst->last_state == HOST_UP) {
			hst->current_problem_id = next_problem_id;
			next_problem_id++;
		}

		/* clear the problem id when transitioning from a problem state to an UP state */
		if(hst->current_state == HOST_UP) {
			hst->last_problem_id = hst->current_problem_id;
			hst->current_problem_id = 0L;
		}

		/* write the host state change to the main log file */
		if(hst->state_type == HARD_STATE || (hst->state_type == SOFT_STATE && log_host_retries == TRUE))
			log_host_event(hst);

		/* check for start of flexible (non-fixed) scheduled downtime */
		/* CHANGED 08-05-2010 EG flex downtime can now start on soft states */
		check_pending_flex_host_downtime(hst);

		/* notify contacts about the recovery or problem if its a "hard" state */
		if(hst->state_type == HARD_STATE)
			host_notification(hst, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);

		/* handle the host state change */
		handle_host_event(hst);

		/* the host just recovered: reset the current attempt, the notification number */
		/* and the state flags (only now, after the recovery notification has gone out) */
		if(hst->current_state == HOST_UP) {
			hst->current_attempt = 1;
			hst->current_notification_number = 0;
			hst->notified_on_down = FALSE;
			hst->notified_on_unreachable = FALSE;
		}
	}

	/* else the host state has not changed */
	else {

		/* notify contacts if host is still down or unreachable */
		if(hst->current_state != HOST_UP && hst->state_type == HARD_STATE)
			host_notification(hst, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);

		/* if we're in a soft state and we should log host retries, do so now... */
		if(hst->state_type == SOFT_STATE && log_host_retries == TRUE)
			log_host_event(hst);
	}

	return OK;
}
4208
4209
4210 /* parse raw plugin output and return: short and long output, perf data */
parse_check_output(char * buf,char ** short_output,char ** long_output,char ** perf_data,int escape_newlines_please,int newlines_are_escaped)4211 int parse_check_output(char *buf, char **short_output, char **long_output, char **perf_data, int escape_newlines_please, int newlines_are_escaped) {
4212 int current_line = 0;
4213 int found_newline = FALSE;
4214 int eof = FALSE;
4215 int used_buf = 0;
4216 int dbuf_chunk = 1024;
4217 dbuf db1;
4218 dbuf db2;
4219 char *ptr = NULL;
4220 int in_perf_data = FALSE;
4221 char *tempbuf = NULL;
4222 register int x = 0;
4223 register int y = 0;
4224
4225 /* initialize values */
4226 if(short_output)
4227 *short_output = NULL;
4228 if(long_output)
4229 *long_output = NULL;
4230 if(perf_data)
4231 *perf_data = NULL;
4232
4233 /* nothing to do */
4234 if(buf == NULL || !strcmp(buf, ""))
4235 return OK;
4236
4237 used_buf = strlen(buf) + 1;
4238
4239 /* initialize dynamic buffers (1KB chunk size) */
4240 dbuf_init(&db1, dbuf_chunk);
4241 dbuf_init(&db2, dbuf_chunk);
4242
4243 /* unescape newlines and escaped backslashes first */
4244 if(newlines_are_escaped == TRUE) {
4245 for(x = 0, y = 0; buf[x] != '\x0'; x++) {
4246 if(buf[x] == '\\' && buf[x + 1] == '\\') {
4247 x++;
4248 buf[y++] = buf[x];
4249 }
4250 else if(buf[x] == '\\' && buf[x + 1] == 'n') {
4251 x++;
4252 buf[y++] = '\n';
4253 }
4254 else
4255 buf[y++] = buf[x];
4256 }
4257 buf[y] = '\x0';
4258 }
4259
4260 /* process each line of input */
4261 for(x = 0; eof == FALSE; x++) {
4262
4263 /* we found the end of a line */
4264 if(buf[x] == '\n')
4265 found_newline = TRUE;
4266 else if(buf[x] == '\\' && buf[x + 1] == 'n' && newlines_are_escaped == TRUE) {
4267 found_newline = TRUE;
4268 buf[x] = '\x0';
4269 x++;
4270 }
4271 else if(buf[x] == '\x0') {
4272 found_newline = TRUE;
4273 eof = TRUE;
4274 }
4275 else
4276 found_newline = FALSE;
4277
4278 if(found_newline == TRUE) {
4279
4280 current_line++;
4281
4282 /* handle this line of input */
4283 buf[x] = '\x0';
4284 if((tempbuf = (char *)strdup(buf))) {
4285
4286 /* first line contains short plugin output and optional perf data */
4287 if(current_line == 1) {
4288
4289 /* get the short plugin output */
4290 if((ptr = strtok(tempbuf, "|"))) {
4291 if(short_output)
4292 *short_output = (char *)strdup(ptr);
4293
4294 /* get the optional perf data */
4295 if((ptr = strtok(NULL, "\n")))
4296 dbuf_strcat(&db2, ptr);
4297 }
4298 }
4299
4300 /* additional lines contain long plugin output and optional perf data */
4301 else {
4302
4303 /* rest of the output is perf data */
4304 if(in_perf_data == TRUE) {
4305 dbuf_strcat(&db2, tempbuf);
4306 dbuf_strcat(&db2, " ");
4307 }
4308
4309 /* we're still in long output */
4310 else {
4311
4312 /* perf data separator has been found */
4313 if(strstr(tempbuf, "|")) {
4314
4315 /* NOTE: strtok() causes problems if first character of tempbuf='|', so use my_strtok() instead */
4316 /* get the remaining long plugin output */
4317 if((ptr = my_strtok(tempbuf, "|"))) {
4318
4319 if(current_line > 2)
4320 dbuf_strcat(&db1, "\n");
4321 dbuf_strcat(&db1, ptr);
4322
4323 /* get the perf data */
4324 if((ptr = my_strtok(NULL, "\n"))) {
4325 dbuf_strcat(&db2, ptr);
4326 dbuf_strcat(&db2, " ");
4327 }
4328 }
4329
4330 /* set the perf data flag */
4331 in_perf_data = TRUE;
4332 }
4333
4334 /* just long output */
4335 else {
4336 if(current_line > 2)
4337 dbuf_strcat(&db1, "\n");
4338 dbuf_strcat(&db1, tempbuf);
4339 }
4340 }
4341 }
4342
4343 my_free(tempbuf);
4344 tempbuf = NULL;
4345 }
4346
4347
4348 /* shift data back to front of buffer and adjust counters */
4349 memmove((void *)&buf[0], (void *)&buf[x + 1], (size_t)((int)used_buf - x - 1));
4350 used_buf -= (x + 1);
4351 buf[used_buf] = '\x0';
4352 x = -1;
4353 }
4354 }
4355
4356 /* save long output */
4357 if(long_output && (db1.buf && strcmp(db1.buf, ""))) {
4358
4359 if(escape_newlines_please == FALSE)
4360 *long_output = (char *)strdup(db1.buf);
4361
4362 else {
4363
4364 /* escape newlines (and backslashes) in long output */
4365 if((tempbuf = (char *)malloc((strlen(db1.buf) * 2) + 1))) {
4366
4367 for(x = 0, y = 0; db1.buf[x] != '\x0'; x++) {
4368
4369 if(db1.buf[x] == '\n') {
4370 tempbuf[y++] = '\\';
4371 tempbuf[y++] = 'n';
4372 }
4373 else if(db1.buf[x] == '\\') {
4374 tempbuf[y++] = '\\';
4375 tempbuf[y++] = '\\';
4376 }
4377 else
4378 tempbuf[y++] = db1.buf[x];
4379 }
4380
4381 tempbuf[y] = '\x0';
4382 *long_output = (char *)strdup(tempbuf);
4383 my_free(tempbuf);
4384 }
4385 }
4386 }
4387
4388 /* save perf data */
4389 if(perf_data && (db2.buf && strcmp(db2.buf, "")))
4390 *perf_data = (char *)strdup(db2.buf);
4391
4392 /* strip short output and perf data */
4393 if(short_output)
4394 strip(*short_output);
4395 if(perf_data)
4396 strip(*perf_data);
4397
4398 /* free dynamic buffers */
4399 dbuf_free(&db1);
4400 dbuf_free(&db2);
4401
4402 return OK;
4403 }
4404
4405
4406