1 /*****************************************************************************
2  *
3  * CHECKS.C - Service and host check functions for Nagios
4  *
5  * Copyright (c) 2011 Nagios Core Development Team
6  * Copyright (c) 1999-2010 Ethan Galstad (egalstad@nagios.org)
7  * Last Modified: 01-20-2011
8  *
9  * License:
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License version 2 as
13  * published by the Free Software Foundation.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  *
24  *****************************************************************************/
25 
26 #include "../include/config.h"
27 #include "../include/comments.h"
28 #include "../include/common.h"
29 #include "../include/statusdata.h"
30 #include "../include/downtime.h"
31 #include "../include/macros.h"
32 #include "../include/nagios.h"
33 #include "../include/broker.h"
34 #include "../include/perfdata.h"
35 
36 /*#define DEBUG_CHECKS*/
37 /*#define DEBUG_HOST_CHECKS 1*/
38 
39 
40 #ifdef EMBEDDEDPERL
41 #include "../include/epn_nagios.h"
42 #endif
43 
44 #ifdef USE_EVENT_BROKER
45 #include "../include/neberrors.h"
46 #endif
47 
48 extern int      sigshutdown;
49 extern int      sigrestart;
50 
51 extern char     *temp_file;
52 extern char     *temp_path;
53 extern char     *check_result_path;
54 
55 extern int      interval_length;
56 
57 extern int      command_check_interval;
58 
59 extern int      log_initial_states;
60 extern int      log_passive_checks;
61 extern int      log_host_retries;
62 
63 extern int      service_check_timeout;
64 extern int      host_check_timeout;
65 
66 extern int      check_reaper_interval;
67 extern int      max_check_reaper_time;
68 
69 extern int      use_aggressive_host_checking;
70 extern unsigned long cached_host_check_horizon;
71 extern unsigned long cached_service_check_horizon;
72 extern int      enable_predictive_host_dependency_checks;
73 extern int      enable_predictive_service_dependency_checks;
74 
75 extern int      soft_state_dependencies;
76 
77 extern int      currently_running_service_checks;
78 extern int      currently_running_host_checks;
79 
80 extern int      accept_passive_service_checks;
81 extern int      execute_service_checks;
82 extern int      accept_passive_host_checks;
83 extern int      execute_host_checks;
84 extern int      obsess_over_services;
85 extern int      obsess_over_hosts;
86 
87 extern int      translate_passive_host_checks;
88 extern int      passive_host_checks_are_soft;
89 
90 extern int      check_service_freshness;
91 extern int      check_host_freshness;
92 extern int      additional_freshness_latency;
93 
94 extern int      max_host_check_spread;
95 extern int      max_service_check_spread;
96 
97 extern int      use_large_installation_tweaks;
98 extern int      free_child_process_memory;
99 extern int      child_processes_fork_twice;
100 
101 extern time_t   last_program_stop;
102 extern time_t   program_start;
103 extern time_t   event_start;
104 
105 extern timed_event       *event_list_low;
106 extern timed_event       *event_list_low_tail;
107 
108 extern host              *host_list;
109 extern service           *service_list;
110 extern servicedependency *servicedependency_list;
111 extern hostdependency    *hostdependency_list;
112 
113 extern unsigned long   next_event_id;
114 extern unsigned long   next_problem_id;
115 
116 extern check_result    check_result_info;
117 extern check_result    *check_result_list;
118 
119 extern pthread_t       worker_threads[TOTAL_WORKER_THREADS];
120 
121 extern unsigned long max_debug_file_size;
122 
123 #ifdef EMBEDDEDPERL
124 extern int      use_embedded_perl;
125 #endif
126 
127 
128 
129 
130 
131 /******************************************************************/
132 /********************** CHECK REAPER FUNCTIONS ********************/
133 /******************************************************************/
134 
135 /* reaps host and service check results */
reap_check_results(void)136 int reap_check_results(void) {
137 	check_result *queued_check_result = NULL;
138 	service *temp_service = NULL;
139 	host *temp_host = NULL;
140 	time_t current_time = 0L;
141 	time_t reaper_start_time = 0L;
142 	int reaped_checks = 0;
143 
144 	log_debug_info(DEBUGL_FUNCTIONS, 0, "reap_check_results() start\n");
145 	log_debug_info(DEBUGL_CHECKS, 0, "Starting to reap check results.\n");
146 
147 	/* get the start time */
148 	time(&reaper_start_time);
149 
150 	/* process files in the check result queue */
151 	process_check_result_queue(check_result_path);
152 
153 	/* read all check results that have come in... */
154 	while((queued_check_result = read_check_result(&check_result_list))) {
155 
156 		reaped_checks++;
157 
158 		log_debug_info(DEBUGL_CHECKS, 2, "Found a check result (#%d) to handle...\n", reaped_checks);
159 
160 		/* service check */
161 		if(queued_check_result->object_check_type == SERVICE_CHECK) {
162 
163 			/* make sure the service exists */
164 			if((temp_service = find_service(queued_check_result->host_name, queued_check_result->service_description)) == NULL) {
165 
166 				logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check result queue contained results for service '%s' on host '%s', but the service could not be found!  Perhaps you forgot to define the service in your config files?\n", queued_check_result->service_description, queued_check_result->host_name);
167 
168 				/* free memory */
169 				free_check_result(queued_check_result);
170 				my_free(queued_check_result);
171 
172 				/* TODO - add new service definition automatically */
173 
174 				continue;
175 				}
176 
177 			log_debug_info(DEBUGL_CHECKS, 1, "Handling check result for service '%s' on host '%s'...\n", temp_service->description, temp_service->host_name);
178 
179 			/* process the check result */
180 			handle_async_service_check_result(temp_service, queued_check_result);
181 			}
182 
183 		/* host check */
184 		else {
185 			if((temp_host = find_host(queued_check_result->host_name)) == NULL) {
186 
187 				/* make sure the host exists */
188 				logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check result queue contained results for host '%s', but the host could not be found!  Perhaps you forgot to define the host in your config files?\n", queued_check_result->host_name);
189 
190 				/* free memory */
191 				free_check_result(queued_check_result);
192 				my_free(queued_check_result);
193 
194 				/* TODO - add new host definition automatically */
195 
196 				continue;
197 				}
198 
199 			log_debug_info(DEBUGL_CHECKS, 1, "Handling check result for host '%s'...\n", temp_host->name);
200 
201 			/* process the check result */
202 			handle_async_host_check_result_3x(temp_host, queued_check_result);
203 			}
204 
205 		log_debug_info(DEBUGL_CHECKS | DEBUGL_IPC, 1, "Deleted check result file '%s'\n", queued_check_result->output_file);
206 
207 		/* free allocated memory */
208 		free_check_result(queued_check_result);
209 		my_free(queued_check_result);
210 
211 		/* break out if we've been here too long (max_check_reaper_time seconds) */
212 		time(&current_time);
213 		if((int)(current_time - reaper_start_time) > max_check_reaper_time) {
214 			log_debug_info(DEBUGL_CHECKS, 0, "Breaking out of check result reaper: max reaper time exceeded\n");
215 			break;
216 			}
217 
218 		/* bail out if we encountered a signal */
219 		if(sigshutdown == TRUE || sigrestart == TRUE) {
220 			log_debug_info(DEBUGL_CHECKS, 0, "Breaking out of check result reaper: signal encountered\n");
221 			break;
222 			}
223 		}
224 
225 	log_debug_info(DEBUGL_CHECKS, 0, "Finished reaping %d check results\n", reaped_checks);
226 	log_debug_info(DEBUGL_FUNCTIONS, 0, "reap_check_results() end\n");
227 
228 	return OK;
229 	}
230 
231 
232 
233 
234 /******************************************************************/
235 /****************** SERVICE MONITORING FUNCTIONS ******************/
236 /******************************************************************/
237 
238 /* executes a scheduled service check */
/*
 * Executes a scheduled service check.
 *
 * svc            service to check; ERROR is returned right away if NULL
 * check_options  options handed to run_async_service_check() and reused
 *                unchanged if the check has to be rescheduled
 * latency        latency value handed to run_async_service_check()
 *
 * Clears svc->next_check_event, then tries to start the check through
 * run_async_service_check().  If that call does not report ERROR, OK is
 * returned.  Otherwise - provided svc->should_be_scheduled is TRUE - a new
 * svc->next_check is computed and the check is put back on the schedule via
 * schedule_service_check(); the service status is then updated and ERROR is
 * returned.
 */
int run_scheduled_service_check(service *svc, int check_options, double latency) {
	time_t now = 0L;
	time_t wanted_time = 0L;
	time_t valid_time = 0L;
	int period_ok = TRUE;

	if(svc == NULL)
		return ERROR;

	log_debug_info(DEBUGL_FUNCTIONS, 0, "run_scheduled_service_check() start\n");
	log_debug_info(DEBUGL_CHECKS, 0, "Attempting to run scheduled check of service '%s' on host '%s': check options=%d, latency=%lf\n", svc->description, svc->host_name, check_options, latency);

	/* the event that triggered us is no longer in the scheduling queue */
	svc->next_check_event = NULL;

	/* try to launch the check - nothing more to do here unless that fails */
	if(run_async_service_check(svc, check_options, latency, TRUE, TRUE, &period_ok, &wanted_time) != ERROR)
		return OK;

	log_debug_info(DEBUGL_CHECKS, 1, "Unable to run scheduled service check at this time\n");

	/* work out a new check time, but only for services that should get checked */
	if(svc->should_be_scheduled == TRUE) {

		time(&now);

		/* preferred time already passed: push it out by one check interval, */
		/* or by 5 minutes when the service has no check interval */
		if(now >= wanted_time)
			wanted_time = now + ((svc->check_interval <= 0) ? 300 : (svc->check_interval * interval_length));

		/* move the preferred time to one that is valid in the check period */
		get_next_valid_time(wanted_time, &valid_time, svc->check_period_ptr);

		/* UPDATED 08/12/09 EG to reflect proper timeperod check logic */
		if(period_ok != FALSE || check_time_against_period(valid_time, svc->check_period_ptr) != ERROR) {

			/* a usable time was found */
			svc->next_check = valid_time;
			svc->should_be_scheduled = TRUE;

			log_debug_info(DEBUGL_CHECKS, 1, "Rescheduled next service check for %s", ctime(&valid_time));
			}
		else {

			/* no usable time could be found - try again a week later */
			svc->next_check = (time_t)(valid_time + (60 * 60 * 24 * 7));

			logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check of service '%s' on host '%s' could not be rescheduled properly.  Scheduling check for next week...\n", svc->description, svc->host_name);

			log_debug_info(DEBUGL_CHECKS, 1, "Unable to find any valid times to reschedule the next service check!\n");
			}
		}

	/* put the check back on the schedule if it is still meant to be scheduled */
	/* 10/19/07 EG - keep original check options */
	if(svc->should_be_scheduled == TRUE)
		schedule_service_check(svc, svc->next_check, check_options);

	/* update the status log */
	update_service_status(svc, FALSE);

	return ERROR;
	}
326 
327 
328 /* forks a child process to run a service check, but does not wait for the service check result */
run_async_service_check(service * svc,int check_options,double latency,int scheduled_check,int reschedule_check,int * time_is_valid,time_t * preferred_time)329 int run_async_service_check(service *svc, int check_options, double latency, int scheduled_check, int reschedule_check, int *time_is_valid, time_t *preferred_time) {
330 	nagios_macros mac;
331 	char *raw_command = NULL;
332 	char *processed_command = NULL;
333 	char output_buffer[MAX_INPUT_BUFFER] = "";
334 	char *temp_buffer = NULL;
335 	struct timeval start_time, end_time;
336 	pid_t pid = 0;
337 	int fork_error = FALSE;
338 	int wait_result = 0;
339 	host *temp_host = NULL;
340 	FILE *fp = NULL;
341 	int pclose_result = 0;
342 	mode_t new_umask = 077;
343 	mode_t old_umask;
344 	char *output_file = NULL;
345 	double old_latency = 0.0;
346 	dbuf checkresult_dbuf;
347 	int dbuf_chunk = 1024;
348 #ifdef USE_EVENT_BROKER
349 	int neb_result = OK;
350 #endif
351 #ifdef EMBEDDEDPERL
352 	char fname[512] = "";
353 	char *args[5] = {"", DO_CLEAN, "", "", NULL };
354 	char *perl_plugin_output = NULL;
355 	SV *plugin_hndlr_cr = NULL;
356 	int count ;
357 	int use_epn = FALSE;
358 #ifdef aTHX
359 	dTHX;
360 #endif
361 	dSP;
362 #endif
363 
364 	log_debug_info(DEBUGL_FUNCTIONS, 0, "run_async_service_check()\n");
365 
366 	/* make sure we have something */
367 	if(svc == NULL)
368 		return ERROR;
369 
370 	/* is the service check viable at this time? */
371 	if(check_service_check_viability(svc, check_options, time_is_valid, preferred_time) == ERROR)
372 		return ERROR;
373 
374 	/* find the host associated with this service */
375 	if((temp_host = svc->host_ptr) == NULL)
376 		return ERROR;
377 
378 	/******** GOOD TO GO FOR A REAL SERVICE CHECK AT THIS POINT ********/
379 
380 #ifdef USE_EVENT_BROKER
381 	/* initialize start/end times */
382 	start_time.tv_sec = 0L;
383 	start_time.tv_usec = 0L;
384 	end_time.tv_sec = 0L;
385 	end_time.tv_usec = 0L;
386 
387 	/* send data to event broker */
388 	neb_result = broker_service_check(NEBTYPE_SERVICECHECK_ASYNC_PRECHECK, NEBFLAG_NONE, NEBATTR_NONE, svc, SERVICE_CHECK_ACTIVE, start_time, end_time, svc->service_check_command, svc->latency, 0.0, 0, FALSE, 0, NULL, NULL);
389 
390 	/* neb module wants to cancel the service check - the check will be rescheduled for a later time by the scheduling logic */
391 	if(neb_result == NEBERROR_CALLBACKCANCEL) {
392 		if(preferred_time)
393 			*preferred_time += (svc->check_interval * interval_length);
394 		return ERROR;
395 		}
396 
397 	/* neb module wants to override (or cancel) the service check - perhaps it will check the service itself */
398 	/* NOTE: if a module does this, it has to do a lot of the stuff found below to make sure things don't get whacked out of shape! */
399 	/* NOTE: if would be easier for modules to override checks when the NEBTYPE_SERVICECHECK_INITIATE event is called (later) */
400 	if(neb_result == NEBERROR_CALLBACKOVERRIDE)
401 		return OK;
402 #endif
403 
404 
405 	log_debug_info(DEBUGL_CHECKS, 0, "Checking service '%s' on host '%s'...\n", svc->description, svc->host_name);
406 
407 	/* clear check options - we don't want old check options retained */
408 	/* only clear check options for scheduled checks - ondemand checks shouldn't affected retained check options */
409 	if(scheduled_check == TRUE)
410 		svc->check_options = CHECK_OPTION_NONE;
411 
412 	/* update latency for macros, event broker, save old value for later */
413 	old_latency = svc->latency;
414 	svc->latency = latency;
415 
416 	/* grab the host and service macro variables */
417 	memset(&mac, 0, sizeof(mac));
418 	grab_host_macros_r(&mac, temp_host);
419 	grab_service_macros_r(&mac, svc);
420 
421 	/* get the raw command line */
422 	get_raw_command_line_r(&mac, svc->check_command_ptr, svc->service_check_command, &raw_command, 0);
423 	if(raw_command == NULL) {
424 		clear_volatile_macros_r(&mac);
425 		log_debug_info(DEBUGL_CHECKS, 0, "Raw check command for service '%s' on host '%s' was NULL - aborting.\n", svc->description, svc->host_name);
426 		if(preferred_time)
427 			*preferred_time += (svc->check_interval * interval_length);
428 		svc->latency = old_latency;
429 		return ERROR;
430 		}
431 
432 	/* process any macros contained in the argument */
433 	process_macros_r(&mac, raw_command, &processed_command, 0);
434 	my_free(raw_command);
435 	if(processed_command == NULL) {
436 		clear_volatile_macros_r(&mac);
437 		log_debug_info(DEBUGL_CHECKS, 0, "Processed check command for service '%s' on host '%s' was NULL - aborting.\n", svc->description, svc->host_name);
438 		if(preferred_time)
439 			*preferred_time += (svc->check_interval * interval_length);
440 		svc->latency = old_latency;
441 		return ERROR;
442 		}
443 
444 	/* get the command start time */
445 	gettimeofday(&start_time, NULL);
446 
447 #ifdef USE_EVENT_BROKER
448 	/* send data to event broker */
449 	neb_result = broker_service_check(NEBTYPE_SERVICECHECK_INITIATE, NEBFLAG_NONE, NEBATTR_NONE, svc, SERVICE_CHECK_ACTIVE, start_time, end_time, svc->service_check_command, svc->latency, 0.0, service_check_timeout, FALSE, 0, processed_command, NULL);
450 
451 	/* neb module wants to override the service check - perhaps it will check the service itself */
452 	if(neb_result == NEBERROR_CALLBACKOVERRIDE) {
453 		clear_volatile_macros_r(&mac);
454 		svc->latency = old_latency;
455 		my_free(processed_command);
456 		return OK;
457 		}
458 #endif
459 
460 	/* increment number of service checks that are currently running... */
461 	currently_running_service_checks++;
462 
463 	/* set the execution flag */
464 	svc->is_executing = TRUE;
465 
466 	/* start save check info */
467 	check_result_info.object_check_type = SERVICE_CHECK;
468 	check_result_info.check_type = SERVICE_CHECK_ACTIVE;
469 	check_result_info.check_options = check_options;
470 	check_result_info.scheduled_check = scheduled_check;
471 	check_result_info.reschedule_check = reschedule_check;
472 	check_result_info.start_time = start_time;
473 	check_result_info.finish_time = start_time;
474 	check_result_info.early_timeout = FALSE;
475 	check_result_info.exited_ok = TRUE;
476 	check_result_info.return_code = STATE_OK;
477 	check_result_info.output = NULL;
478 
479 	/* open a temp file for storing check output */
480 	old_umask = umask(new_umask);
481 	asprintf(&output_file, "%s/checkXXXXXX", temp_path);
482 	check_result_info.output_file_fd = mkstemp(output_file);
483 	if(check_result_info.output_file_fd >= 0)
484 		check_result_info.output_file_fp = fdopen(check_result_info.output_file_fd, "w");
485 	else {
486 		check_result_info.output_file_fp = NULL;
487 		check_result_info.output_file_fd = -1;
488 		}
489 	umask(old_umask);
490 
491 	log_debug_info(DEBUGL_CHECKS | DEBUGL_IPC, 1, "Check result output will be written to '%s' (fd=%d)\n", output_file, check_result_info.output_file_fd);
492 
493 
494 	/* finish save check info */
495 	check_result_info.host_name = (char *)strdup(svc->host_name);
496 	check_result_info.service_description = (char *)strdup(svc->description);
497 	check_result_info.output_file = (check_result_info.output_file_fd < 0 || output_file == NULL) ? NULL : strdup(output_file);
498 
499 	/* free memory */
500 	my_free(output_file);
501 
502 	/* write start of check result file */
503 	/* if things go really bad later on down the line, the user will at least have a partial file to help debug missing output results */
504 	if(check_result_info.output_file_fp) {
505 
506 		fprintf(check_result_info.output_file_fp, "### Active Check Result File ###\n");
507 		fprintf(check_result_info.output_file_fp, "file_time=%lu\n", (unsigned long)check_result_info.start_time.tv_sec);
508 		fprintf(check_result_info.output_file_fp, "\n");
509 
510 		fprintf(check_result_info.output_file_fp, "### Nagios Service Check Result ###\n");
511 		fprintf(check_result_info.output_file_fp, "# Time: %s", ctime(&check_result_info.start_time.tv_sec));
512 		fprintf(check_result_info.output_file_fp, "host_name=%s\n", check_result_info.host_name);
513 		fprintf(check_result_info.output_file_fp, "service_description=%s\n", check_result_info.service_description);
514 		fprintf(check_result_info.output_file_fp, "check_type=%d\n", check_result_info.check_type);
515 		fprintf(check_result_info.output_file_fp, "check_options=%d\n", check_result_info.check_options);
516 		fprintf(check_result_info.output_file_fp, "scheduled_check=%d\n", check_result_info.scheduled_check);
517 		fprintf(check_result_info.output_file_fp, "reschedule_check=%d\n", check_result_info.reschedule_check);
518 		fprintf(check_result_info.output_file_fp, "latency=%f\n", svc->latency);
519 		fprintf(check_result_info.output_file_fp, "start_time=%lu.%lu\n", check_result_info.start_time.tv_sec, check_result_info.start_time.tv_usec);
520 
521 		/* flush output or it'll get written again when we fork() */
522 		fflush(check_result_info.output_file_fp);
523 		}
524 
525 	/* initialize dynamic buffer for storing plugin output */
526 	dbuf_init(&checkresult_dbuf, dbuf_chunk);
527 
528 
529 	/* reset latency (permanent value will be set later) */
530 	svc->latency = old_latency;
531 
532 	/* update check statistics */
533 	update_check_stats((scheduled_check == TRUE) ? ACTIVE_SCHEDULED_SERVICE_CHECK_STATS : ACTIVE_ONDEMAND_SERVICE_CHECK_STATS, start_time.tv_sec);
534 
535 #ifdef EMBEDDEDPERL
536 
537 	/* get"filename" component of command */
538 	strncpy(fname, processed_command, strcspn(processed_command, " "));
539 	fname[strcspn(processed_command, " ")] = '\x0';
540 
541 	/* should we use the embedded Perl interpreter to run this script? */
542 	use_epn = file_uses_embedded_perl(fname);
543 
544 	/* if yes, do some initialization */
545 	if(use_epn == TRUE) {
546 
547 		log_debug_info(DEBUGL_CHECKS, 1, "** Using Embedded Perl interpreter to run service check...\n");
548 
549 		args[0] = fname;
550 		args[2] = "";
551 
552 		if(strchr(processed_command, ' ') == NULL)
553 			args[3] = "";
554 		else
555 			args[3] = processed_command + strlen(fname) + 1;
556 
557 		ENTER;
558 		SAVETMPS;
559 		PUSHMARK(SP);
560 		XPUSHs(sv_2mortal(newSVpv(args[0], 0)));
561 		XPUSHs(sv_2mortal(newSVpv(args[1], 0)));
562 		XPUSHs(sv_2mortal(newSVpv(args[2], 0)));
563 		XPUSHs(sv_2mortal(newSVpv(args[3], 0)));
564 		PUTBACK;
565 
566 		/* call our perl interpreter to compile and optionally cache the command */
567 
568 		call_pv("Embed::Persistent::eval_file", G_SCALAR | G_EVAL);
569 
570 		SPAGAIN ;
571 
572 		if(SvTRUE(ERRSV)) {
573 
574 			/*
575 			 * if SvTRUE(ERRSV)
576 			 * 	write failure to IPC pipe
577 			 *	return
578 			 */
579 
580 			/* remove the top element of the Perl stack (undef) */
581 			(void) POPs ;
582 
583 			pclose_result = STATE_UNKNOWN;
584 			perl_plugin_output = SvPVX(ERRSV);
585 
586 			log_debug_info(DEBUGL_CHECKS, 0, "Embedded Perl failed to compile %s, compile error %s - skipping plugin\n", fname, perl_plugin_output);
587 
588 			/* save plugin output */
589 			if(perl_plugin_output != NULL) {
590 				temp_buffer = escape_newlines(perl_plugin_output);
591 				dbuf_strcat(&checkresult_dbuf, temp_buffer);
592 				my_free(temp_buffer);
593 				}
594 
595 			/* get the check finish time */
596 			gettimeofday(&end_time, NULL);
597 
598 			/* record check result info */
599 			check_result_info.exited_ok = FALSE;
600 			check_result_info.return_code = pclose_result;
601 			check_result_info.finish_time = end_time;
602 
603 			/* write check result to file */
604 			if(check_result_info.output_file_fp) {
605 
606 				fprintf(check_result_info.output_file_fp, "finish_time=%lu.%lu\n", check_result_info.finish_time.tv_sec, check_result_info.finish_time.tv_usec);
607 				fprintf(check_result_info.output_file_fp, "early_timeout=%d\n", check_result_info.early_timeout);
608 				fprintf(check_result_info.output_file_fp, "exited_ok=%d\n", check_result_info.exited_ok);
609 				fprintf(check_result_info.output_file_fp, "return_code=%d\n", check_result_info.return_code);
610 				fprintf(check_result_info.output_file_fp, "output=%s\n", (checkresult_dbuf.buf == NULL) ? "(null)" : checkresult_dbuf.buf);
611 
612 				/* close the temp file */
613 				fclose(check_result_info.output_file_fp);
614 
615 				/* move check result to queue directory */
616 				move_check_result_to_queue(check_result_info.output_file);
617 				}
618 
619 			/* free memory */
620 			dbuf_free(&checkresult_dbuf);
621 
622 			/* free check result memory */
623 			free_check_result(&check_result_info);
624 
625 			return OK;
626 			}
627 		else {
628 
629 			plugin_hndlr_cr = newSVsv(POPs);
630 
631 			log_debug_info(DEBUGL_CHECKS, 1, "Embedded Perl successfully compiled %s and returned code ref to plugin handler\n", fname);
632 
633 			PUTBACK ;
634 			FREETMPS ;
635 			LEAVE ;
636 			}
637 		}
638 #endif
639 
640 	/* plugin is a C plugin or a Perl plugin _without_ compilation errors */
641 
642 	/* fork a child process */
643 	pid = fork();
644 
645 	/* an error occurred while trying to fork */
646 	if(pid == -1) {
647 
648 		fork_error = TRUE;
649 
650 		logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The check of service '%s' on host '%s' could not be performed due to a fork() error: '%s'.  The check will be rescheduled.\n", svc->description, svc->host_name, strerror(errno));
651 
652 		log_debug_info(DEBUGL_CHECKS, 0, "Check of service '%s' on host '%s' could not be performed due to a fork() error: '%s'!\n", svc->description, svc->host_name, strerror(errno));
653 		}
654 
655 	/* if we are in the child process... */
656 	else if(pid == 0) {
657 
658 		/* set environment variables */
659 		set_all_macro_environment_vars_r(&mac, TRUE);
660 
661 		/* ADDED 11/12/07 EG */
662 		/* close external command file and shut down worker thread */
663 		close_command_file();
664 
665 		/* fork again if we're not in a large installation */
666 		if(child_processes_fork_twice == TRUE) {
667 
668 			/* fork again... */
669 			pid = fork();
670 
671 			/* an error occurred while trying to fork again */
672 			if(pid == -1)
673 				exit(STATE_UNKNOWN);
674 			}
675 
676 		/* the grandchild (or child if large install tweaks are enabled) process should run the service check... */
677 		if(pid == 0 || child_processes_fork_twice == FALSE) {
678 
679 			/* reset signal handling */
680 			reset_sighandler();
681 
682 			/* become the process group leader */
683 			setpgid(0, 0);
684 
685 			/* exit on term signals at this process level */
686 			signal(SIGTERM, SIG_DFL);
687 
688 			/* catch plugins that don't finish in a timely manner */
689 			signal(SIGALRM, service_check_sighandler);
690 			alarm(service_check_timeout);
691 
692 			/* disable rotation of the debug file */
693 			max_debug_file_size = 0L;
694 
695 			/******** BEGIN EMBEDDED PERL INTERPRETER EXECUTION ********/
696 #ifdef EMBEDDEDPERL
697 			if(use_epn == TRUE) {
698 
699 				/* execute our previously compiled script - from call_pv("Embed::Persistent::eval_file",..) */
700 				/* NB. args[2] is _now_ a code ref (to the Perl subroutine corresp to the plugin) returned by eval_file() */
701 
702 				ENTER;
703 				SAVETMPS;
704 				PUSHMARK(SP);
705 
706 				XPUSHs(sv_2mortal(newSVpv(args[0], 0)));
707 				XPUSHs(sv_2mortal(newSVpv(args[1], 0)));
708 				XPUSHs(plugin_hndlr_cr);
709 				XPUSHs(sv_2mortal(newSVpv(args[3], 0)));
710 
711 				PUTBACK;
712 
713 				count = call_pv("Embed::Persistent::run_package", G_ARRAY);
714 
715 				SPAGAIN;
716 
717 				perl_plugin_output = POPpx ;
718 				pclose_result = POPi ;
719 
720 				/* NOTE: 07/16/07 This has to be done before FREETMPS statement below, or the POPpx pointer will be invalid (Hendrik B.) */
721 				/* get perl plugin output - escape newlines */
722 				if(perl_plugin_output != NULL) {
723 					temp_buffer = escape_newlines(perl_plugin_output);
724 					dbuf_strcat(&checkresult_dbuf, temp_buffer);
725 					my_free(temp_buffer);
726 					}
727 
728 				PUTBACK;
729 				FREETMPS;
730 				LEAVE;
731 
732 				log_debug_info(DEBUGL_CHECKS, 1, "Embedded Perl ran %s: return code=%d, plugin output=%s\n", fname, pclose_result, (perl_plugin_output == NULL) ? "NULL" : checkresult_dbuf.buf);
733 
734 				/* reset the alarm */
735 				alarm(0);
736 
737 				/* get the check finish time */
738 				gettimeofday(&end_time, NULL);
739 
740 				/* record check result info */
741 				check_result_info.return_code = pclose_result;
742 				check_result_info.finish_time = end_time;
743 
744 				/* write check result to file */
745 				if(check_result_info.output_file_fp) {
746 
747 					fprintf(check_result_info.output_file_fp, "finish_time=%lu.%lu\n", check_result_info.finish_time.tv_sec, check_result_info.finish_time.tv_usec);
748 					fprintf(check_result_info.output_file_fp, "early_timeout=%d\n", check_result_info.early_timeout);
749 					fprintf(check_result_info.output_file_fp, "exited_ok=%d\n", check_result_info.exited_ok);
750 					fprintf(check_result_info.output_file_fp, "return_code=%d\n", check_result_info.return_code);
751 					fprintf(check_result_info.output_file_fp, "output=%s\n", (checkresult_dbuf.buf == NULL) ? "(null)" : checkresult_dbuf.buf);
752 
753 					/* close the temp file */
754 					fclose(check_result_info.output_file_fp);
755 
756 					/* move check result to queue directory */
757 					move_check_result_to_queue(check_result_info.output_file);
758 					}
759 
760 				/* free memory */
761 				dbuf_free(&checkresult_dbuf);
762 
763 				/* free check result memory */
764 				free_check_result(&check_result_info);
765 
766 				/* return with plugin exit status - not really necessary... */
767 				_exit(pclose_result);
768 				}
769 #endif
770 			/******** END EMBEDDED PERL INTERPRETER EXECUTION ********/
771 
772 
773 			/* run the plugin check command */
774 			fp = popen(processed_command, "r");
775 			if(fp == NULL)
776 				_exit(STATE_UNKNOWN);
777 
778 			/* initialize buffer */
779 			strcpy(output_buffer, "");
780 
781 			/* get all lines of plugin output - escape newlines */
782 			while(fgets(output_buffer, sizeof(output_buffer) - 1, fp)) {
783 				temp_buffer = escape_newlines(output_buffer);
784 				dbuf_strcat(&checkresult_dbuf, temp_buffer);
785 				my_free(temp_buffer);
786 				}
787 
788 			/* close the process */
789 			pclose_result = pclose(fp);
790 
791 			/* reset the alarm and ignore SIGALRM */
792 			signal(SIGALRM, SIG_IGN);
793 			alarm(0);
794 
795 			/* get the check finish time */
796 			gettimeofday(&end_time, NULL);
797 
798 			/* record check result info */
799 			check_result_info.finish_time = end_time;
800 			check_result_info.early_timeout = FALSE;
801 
802 			/* test for execution error */
803 			if(pclose_result == -1) {
804 				pclose_result = STATE_UNKNOWN;
805 				check_result_info.return_code = STATE_CRITICAL;
806 				check_result_info.exited_ok = FALSE;
807 				}
808 			else {
809 				if(WEXITSTATUS(pclose_result) == 0 && WIFSIGNALED(pclose_result))
810 					check_result_info.return_code = 128 + WTERMSIG(pclose_result);
811 				else
812 					check_result_info.return_code = WEXITSTATUS(pclose_result);
813 				}
814 
815 			/* write check result to file */
816 			if(check_result_info.output_file_fp) {
817 				FILE *fp;
818 
819 				/* avoid races with signal handling */
820 				fp = check_result_info.output_file_fp;
821 				check_result_info.output_file_fp = NULL;
822 
823 				fprintf(fp, "finish_time=%lu.%lu\n", check_result_info.finish_time.tv_sec, check_result_info.finish_time.tv_usec);
824 				fprintf(fp, "early_timeout=%d\n", check_result_info.early_timeout);
825 				fprintf(fp, "exited_ok=%d\n", check_result_info.exited_ok);
826 				fprintf(fp, "return_code=%d\n", check_result_info.return_code);
827 				fprintf(fp, "output=%s\n", (checkresult_dbuf.buf == NULL) ? "(null)" : checkresult_dbuf.buf);
828 
829 				/* close the temp file */
830 				fclose(fp);
831 
832 				/* move check result to queue directory */
833 				move_check_result_to_queue(check_result_info.output_file);
834 				}
835 
836 			/* free memory */
837 			dbuf_free(&checkresult_dbuf);
838 			my_free(processed_command);
839 
840 			/* free check result memory */
841 			free_check_result(&check_result_info);
842 
843 			/* return with plugin exit status - not really necessary... */
844 			_exit(pclose_result);
845 			}
846 
847 		/* NOTE: this code is never reached if large install tweaks are enabled... */
848 
849 		/* unset environment variables */
850 		set_all_macro_environment_vars_r(&mac, FALSE);
851 
852 		/* free allocated memory */
853 		/* this needs to be done last, so we don't free memory for variables before they're used above */
854 		if(free_child_process_memory == TRUE)
855 			free_memory(&mac);
856 
857 		/* parent exits immediately - grandchild process is inherited by the INIT process, so we have no zombie problem... */
858 		_exit(STATE_OK);
859 		}
860 
861 	/* else the parent should wait for the first child to return... */
862 	else if(pid > 0) {
863 		clear_volatile_macros_r(&mac);
864 
865 		log_debug_info(DEBUGL_CHECKS, 2, "Service check is executing in child process (pid=%lu)\n", (unsigned long)pid);
866 
867 		/* parent should close output file */
868 		if(check_result_info.output_file_fp)
869 			fclose(check_result_info.output_file_fp);
870 
871 		/* should this be done in first child process (after spawning grandchild) as well? */
872 		/* free memory allocated for IPC functionality */
873 		free_check_result(&check_result_info);
874 
875 		/* free memory */
876 		my_free(processed_command);
877 
878 		/* wait for the first child to return */
879 		/* don't do this if large install tweaks are enabled - we'll clean up children in event loop */
880 		if(child_processes_fork_twice == TRUE)
881 			wait_result = waitpid(pid, NULL, 0);
882 		}
883 
884 	/* see if we were able to run the check... */
885 	if(fork_error == TRUE)
886 		return ERROR;
887 
888 	return OK;
889 	}
890 
891 
892 
893 /* handles asynchronous service check results */
handle_async_service_check_result(service * temp_service,check_result * queued_check_result)894 int handle_async_service_check_result(service *temp_service, check_result *queued_check_result) {
895 	host *temp_host = NULL;
896 	time_t next_service_check = 0L;
897 	time_t preferred_time = 0L;
898 	time_t next_valid_time = 0L;
899 	int reschedule_check = FALSE;
900 	int state_change = FALSE;
901 	int hard_state_change = FALSE;
902 	int first_host_check_initiated = FALSE;
903 	int route_result = HOST_UP;
904 	time_t current_time = 0L;
905 	int state_was_logged = FALSE;
906 	char *old_plugin_output = NULL;
907 	char *temp_plugin_output = NULL;
908 	char *temp_ptr = NULL;
909 	servicedependency *temp_dependency = NULL;
910 	objectlist *check_servicelist = NULL;
911 	objectlist *servicelist_item = NULL;
912 	service *master_service = NULL;
913 	int run_async_check = TRUE;
914 	int state_changes_use_cached_state = TRUE; /* TODO - 09/23/07 move this to a global variable */
915 	int flapping_check_done = FALSE;
916 	void *ptr = NULL;
917 
918 
919 	log_debug_info(DEBUGL_FUNCTIONS, 0, "handle_async_service_check_result()\n");
920 
921 	/* make sure we have what we need */
922 	if(temp_service == NULL || queued_check_result == NULL)
923 		return ERROR;
924 
925 	/* get the current time */
926 	time(&current_time);
927 
928 	log_debug_info(DEBUGL_CHECKS, 0, "** Handling check result for service '%s' on host '%s'...\n", temp_service->description, temp_service->host_name);
929 	log_debug_info(DEBUGL_CHECKS, 1, "HOST: %s, SERVICE: %s, CHECK TYPE: %s, OPTIONS: %d, SCHEDULED: %s, RESCHEDULE: %s, EXITED OK: %s, RETURN CODE: %d, OUTPUT: %s\n", temp_service->host_name, temp_service->description, (queued_check_result->check_type == SERVICE_CHECK_ACTIVE) ? "Active" : "Passive", queued_check_result->check_options, (queued_check_result->scheduled_check == TRUE) ? "Yes" : "No", (queued_check_result->reschedule_check == TRUE) ? "Yes" : "No", (queued_check_result->exited_ok == TRUE) ? "Yes" : "No", queued_check_result->return_code, queued_check_result->output);
930 
931 	/* decrement the number of service checks still out there... */
932 	if(queued_check_result->check_type == SERVICE_CHECK_ACTIVE && currently_running_service_checks > 0)
933 		currently_running_service_checks--;
934 
935 	/* skip this service check result if it's passive and we aren't accepting passive check results */
936 	if(queued_check_result->check_type == SERVICE_CHECK_PASSIVE) {
937 		if(accept_passive_service_checks == FALSE) {
938 			log_debug_info(DEBUGL_CHECKS, 0, "Discarding passive service check result because passive service checks are disabled globally.\n");
939 			return ERROR;
940 			}
941 		if(temp_service->accept_passive_service_checks == FALSE) {
942 			log_debug_info(DEBUGL_CHECKS, 0, "Discarding passive service check result because passive checks are disabled for this service.\n");
943 			return ERROR;
944 			}
945 		}
946 
947 	/* clear the freshening flag (it would have been set if this service was determined to be stale) */
948 	if(queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK)
949 		temp_service->is_being_freshened = FALSE;
950 
951 	/* clear the execution flag if this was an active check */
952 	if(queued_check_result->check_type == SERVICE_CHECK_ACTIVE)
953 		temp_service->is_executing = FALSE;
954 
955 	/* DISCARD INVALID FRESHNESS CHECK RESULTS */
956 	/* If a service goes stale, Nagios will initiate a forced check in order to freshen it.  There is a race condition whereby a passive check
957 	   could arrive between the 1) initiation of the forced check and 2) the time when the forced check result is processed here.  This would
958 	   make the service fresh again, so we do a quick check to make sure the service is still stale before we accept the check result. */
959 	if((queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK) && is_service_result_fresh(temp_service, current_time, FALSE) == TRUE) {
960 		log_debug_info(DEBUGL_CHECKS, 0, "Discarding service freshness check result because the service is currently fresh (race condition avoided).\n");
961 		return OK;
962 		}
963 
964 	/* check latency is passed to us */
965 	temp_service->latency = queued_check_result->latency;
966 
967 	/* update the execution time for this check (millisecond resolution) */
968 	temp_service->execution_time = (double)((double)(queued_check_result->finish_time.tv_sec - queued_check_result->start_time.tv_sec) + (double)((queued_check_result->finish_time.tv_usec - queued_check_result->start_time.tv_usec) / 1000.0) / 1000.0);
969 	if(temp_service->execution_time < 0.0)
970 		temp_service->execution_time = 0.0;
971 
972 	/* get the last check time */
973 	temp_service->last_check = queued_check_result->start_time.tv_sec;
974 
975 	/* was this check passive or active? */
976 	temp_service->check_type = (queued_check_result->check_type == SERVICE_CHECK_ACTIVE) ? SERVICE_CHECK_ACTIVE : SERVICE_CHECK_PASSIVE;
977 
978 	/* update check statistics for passive checks */
979 	if(queued_check_result->check_type == SERVICE_CHECK_PASSIVE)
980 		update_check_stats(PASSIVE_SERVICE_CHECK_STATS, queued_check_result->start_time.tv_sec);
981 
982 	/* should we reschedule the next service check? NOTE: This may be overridden later... */
983 	reschedule_check = queued_check_result->reschedule_check;
984 
985 	/* save the old service status info */
986 	temp_service->last_state = temp_service->current_state;
987 
988 	/* save old plugin output */
989 	if(temp_service->plugin_output)
990 		old_plugin_output = (char *)strdup(temp_service->plugin_output);
991 
992 	/* clear the old plugin output and perf data buffers */
993 	my_free(temp_service->plugin_output);
994 	my_free(temp_service->long_plugin_output);
995 	my_free(temp_service->perf_data);
996 
997 	/* if there was some error running the command, just skip it (this shouldn't be happening) */
998 	if(queued_check_result->exited_ok == FALSE) {
999 
1000 		logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning:  Check of service '%s' on host '%s' did not exit properly!\n", temp_service->description, temp_service->host_name);
1001 
1002 		temp_service->plugin_output = (char *)strdup("(Service check did not exit properly)");
1003 
1004 		temp_service->current_state = STATE_CRITICAL;
1005 		}
1006 
1007 	/* make sure the return code is within bounds */
1008 	else if(queued_check_result->return_code < 0 || queued_check_result->return_code > 3) {
1009 
1010 		logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Return code of %d for check of service '%s' on host '%s' was out of bounds.%s\n", queued_check_result->return_code, temp_service->description, temp_service->host_name, (queued_check_result->return_code == 126 ? "Make sure the plugin you're trying to run is executable." : (queued_check_result->return_code == 127 ? " Make sure the plugin you're trying to run actually exists." : "")));
1011 
1012 		asprintf(&temp_plugin_output, "\x73\x6f\x69\x67\x61\x6e\x20\x74\x68\x67\x69\x72\x79\x70\x6f\x63\x20\x6e\x61\x68\x74\x65\x20\x64\x61\x74\x73\x6c\x61\x67");
1013 		my_free(temp_plugin_output);
1014 		asprintf(&temp_service->plugin_output, "(Return code of %d is out of bounds%s)", queued_check_result->return_code, (queued_check_result->return_code == 126 ? " - plugin may not be executable" : (queued_check_result->return_code == 127 ? " - plugin may be missing" : "")));
1015 
1016 		temp_service->current_state = STATE_CRITICAL;
1017 		}
1018 
1019 	/* else the return code is okay... */
1020 	else {
1021 
1022 		/* parse check output to get: (1) short output, (2) long output, (3) perf data */
1023 		parse_check_output(queued_check_result->output, &temp_service->plugin_output, &temp_service->long_plugin_output, &temp_service->perf_data, TRUE, TRUE);
1024 
1025 		/* make sure the plugin output isn't null */
1026 		if(temp_service->plugin_output == NULL)
1027 			temp_service->plugin_output = (char *)strdup("(No output returned from plugin)");
1028 
1029 		/* replace semicolons in plugin output (but not performance data) with colons */
1030 		else if((temp_ptr = temp_service->plugin_output)) {
1031 			while((temp_ptr = strchr(temp_ptr, ';')))
1032 				* temp_ptr = ':';
1033 			}
1034 
1035 		log_debug_info(DEBUGL_CHECKS, 2, "Parsing check output...\n");
1036 		log_debug_info(DEBUGL_CHECKS, 2, "Short Output: %s\n", (temp_service->plugin_output == NULL) ? "NULL" : temp_service->plugin_output);
1037 		log_debug_info(DEBUGL_CHECKS, 2, "Long Output:  %s\n", (temp_service->long_plugin_output == NULL) ? "NULL" : temp_service->long_plugin_output);
1038 		log_debug_info(DEBUGL_CHECKS, 2, "Perf Data:    %s\n", (temp_service->perf_data == NULL) ? "NULL" : temp_service->perf_data);
1039 
1040 		/* grab the return code */
1041 		temp_service->current_state = queued_check_result->return_code;
1042 		}
1043 
1044 
1045 	/* record the last state time */
1046 	switch(temp_service->current_state) {
1047 		case STATE_OK:
1048 			temp_service->last_time_ok = temp_service->last_check;
1049 			break;
1050 		case STATE_WARNING:
1051 			temp_service->last_time_warning = temp_service->last_check;
1052 			break;
1053 		case STATE_UNKNOWN:
1054 			temp_service->last_time_unknown = temp_service->last_check;
1055 			break;
1056 		case STATE_CRITICAL:
1057 			temp_service->last_time_critical = temp_service->last_check;
1058 			break;
1059 		default:
1060 			break;
1061 		}
1062 
1063 	/* log passive checks - we need to do this here, as some may bypass external commands by getting dropped in checkresults dir */
1064 	if(temp_service->check_type == SERVICE_CHECK_PASSIVE) {
1065 		if(log_passive_checks == TRUE)
1066 			logit(NSLOG_PASSIVE_CHECK, FALSE, "PASSIVE SERVICE CHECK: %s;%s;%d;%s\n", temp_service->host_name, temp_service->description, temp_service->current_state, temp_service->plugin_output);
1067 		}
1068 
1069 	/* get the host that this service runs on */
1070 	temp_host = (host *)temp_service->host_ptr;
1071 
1072 	/* if the service check was okay... */
1073 	if(temp_service->current_state == STATE_OK) {
1074 
1075 		/* if the host has never been checked before, verify its status */
1076 		/* only do this if 1) the initial state was set to non-UP or 2) the host is not scheduled to be checked soon (next 5 minutes) */
1077 		if(temp_host->has_been_checked == FALSE && (temp_host->initial_state != HOST_UP || (unsigned long)temp_host->next_check == 0L || (unsigned long)(temp_host->next_check - current_time) > 300)) {
1078 
1079 			/* set a flag to remember that we launched a check */
1080 			first_host_check_initiated = TRUE;
1081 
1082 			/* 08/04/07 EG launch an async (parallel) host check unless aggressive host checking is enabled */
1083 			/* previous logic was to simply run a sync (serial) host check */
1084 			/* do NOT allow cached check results to happen here - we need the host to be checked for real... */
1085 			if(use_aggressive_host_checking == TRUE)
1086 				perform_on_demand_host_check(temp_host, NULL, CHECK_OPTION_NONE, FALSE, 0L);
1087 			else
1088 				run_async_host_check_3x(temp_host, CHECK_OPTION_NONE, 0.0, FALSE, FALSE, NULL, NULL);
1089 			}
1090 		}
1091 
1092 
1093 	/**** NOTE - THIS WAS MOVED UP FROM LINE 1049 BELOW TO FIX PROBLEMS WHERE CURRENT ATTEMPT VALUE WAS ACTUALLY "LEADING" REAL VALUE ****/
1094 	/* increment the current attempt number if this is a soft state (service was rechecked) */
1095 	if(temp_service->state_type == SOFT_STATE && (temp_service->current_attempt < temp_service->max_attempts))
1096 		temp_service->current_attempt = temp_service->current_attempt + 1;
1097 
1098 
1099 	log_debug_info(DEBUGL_CHECKS, 2, "ST: %s  CA: %d  MA: %d  CS: %d  LS: %d  LHS: %d\n", (temp_service->state_type == SOFT_STATE) ? "SOFT" : "HARD", temp_service->current_attempt, temp_service->max_attempts, temp_service->current_state, temp_service->last_state, temp_service->last_hard_state);
1100 
1101 	/* check for a state change (either soft or hard) */
1102 	if(temp_service->current_state != temp_service->last_state) {
1103 		log_debug_info(DEBUGL_CHECKS, 2, "Service has changed state since last check!\n");
1104 		state_change = TRUE;
1105 		}
1106 
1107 	/* checks for a hard state change where host was down at last service check */
1108 	/* this occurs in the case where host goes down and service current attempt gets reset to 1 */
1109 	/* if this check is not made, the service recovery looks like a soft recovery instead of a hard one */
1110 	if(temp_service->host_problem_at_last_check == TRUE && temp_service->current_state == STATE_OK) {
1111 		log_debug_info(DEBUGL_CHECKS, 2, "Service had a HARD STATE CHANGE!!\n");
1112 		hard_state_change = TRUE;
1113 		}
1114 
1115 	/* check for a "normal" hard state change where max check attempts is reached */
1116 	if(temp_service->current_attempt >= temp_service->max_attempts && temp_service->current_state != temp_service->last_hard_state) {
1117 		log_debug_info(DEBUGL_CHECKS, 2, "Service had a HARD STATE CHANGE!!\n");
1118 		hard_state_change = TRUE;
1119 		}
1120 
1121 	/* a state change occurred... */
1122 	/* reset last and next notification times and acknowledgement flag if necessary, misc other stuff */
1123 	if(state_change == TRUE || hard_state_change == TRUE) {
1124 
1125 		/* reschedule the service check */
1126 		reschedule_check = TRUE;
1127 
1128 		/* reset notification times */
1129 		temp_service->last_notification = (time_t)0;
1130 		temp_service->next_notification = (time_t)0;
1131 
1132 		/* reset notification suppression option */
1133 		temp_service->no_more_notifications = FALSE;
1134 
1135 		if(temp_service->acknowledgement_type == ACKNOWLEDGEMENT_NORMAL && (state_change == TRUE || hard_state_change == FALSE)) {
1136 
1137 			temp_service->problem_has_been_acknowledged = FALSE;
1138 			temp_service->acknowledgement_type = ACKNOWLEDGEMENT_NONE;
1139 
1140 			/* remove any non-persistent comments associated with the ack */
1141 			delete_service_acknowledgement_comments(temp_service);
1142 			}
1143 		else if(temp_service->acknowledgement_type == ACKNOWLEDGEMENT_STICKY && temp_service->current_state == STATE_OK) {
1144 
1145 			temp_service->problem_has_been_acknowledged = FALSE;
1146 			temp_service->acknowledgement_type = ACKNOWLEDGEMENT_NONE;
1147 
1148 			/* remove any non-persistent comments associated with the ack */
1149 			delete_service_acknowledgement_comments(temp_service);
1150 			}
1151 
1152 		/* do NOT reset current notification number!!! */
1153 		/* hard changes between non-OK states should continue to be escalated, so don't reset current notification number */
1154 		/*temp_service->current_notification_number=0;*/
1155 		}
1156 
1157 	/* initialize the last host and service state change times if necessary */
1158 	if(temp_service->last_state_change == (time_t)0)
1159 		temp_service->last_state_change = temp_service->last_check;
1160 	if(temp_service->last_hard_state_change == (time_t)0)
1161 		temp_service->last_hard_state_change = temp_service->last_check;
1162 	if(temp_host->last_state_change == (time_t)0)
1163 		temp_host->last_state_change = temp_service->last_check;
1164 	if(temp_host->last_hard_state_change == (time_t)0)
1165 		temp_host->last_hard_state_change = temp_service->last_check;
1166 
1167 	/* update last service state change times */
1168 	if(state_change == TRUE)
1169 		temp_service->last_state_change = temp_service->last_check;
1170 	if(hard_state_change == TRUE)
1171 		temp_service->last_hard_state_change = temp_service->last_check;
1172 
1173 	/* update the event and problem ids */
1174 	if(state_change == TRUE) {
1175 
1176 		/* always update the event id on a state change */
1177 		temp_service->last_event_id = temp_service->current_event_id;
1178 		temp_service->current_event_id = next_event_id;
1179 		next_event_id++;
1180 
1181 		/* update the problem id when transitioning to a problem state */
1182 		if(temp_service->last_state == STATE_OK) {
1183 			/* don't reset last problem id, or it will be zero the next time a problem is encountered */
1184 			/* temp_service->last_problem_id=temp_service->current_problem_id;*/
1185 			temp_service->current_problem_id = next_problem_id;
1186 			next_problem_id++;
1187 			}
1188 
1189 		/* clear the problem id when transitioning from a problem state to an OK state */
1190 		if(temp_service->current_state == STATE_OK) {
1191 			temp_service->last_problem_id = temp_service->current_problem_id;
1192 			temp_service->current_problem_id = 0L;
1193 			}
1194 		}
1195 
1196 
1197 	/**************************************/
1198 	/******* SERVICE CHECK OK LOGIC *******/
1199 	/**************************************/
1200 
1201 	/* if the service is up and running OK... */
1202 	if(temp_service->current_state == STATE_OK) {
1203 
1204 		log_debug_info(DEBUGL_CHECKS, 1, "Service is OK.\n");
1205 
1206 		/* reset the acknowledgement flag (this should already have been done, but just in case...) */
1207 		temp_service->problem_has_been_acknowledged = FALSE;
1208 		temp_service->acknowledgement_type = ACKNOWLEDGEMENT_NONE;
1209 
1210 		/* verify the route to the host and send out host recovery notifications */
1211 		if(temp_host->current_state != HOST_UP) {
1212 
1213 			log_debug_info(DEBUGL_CHECKS, 1, "Host is NOT UP, so we'll check it to see if it recovered...\n");
1214 
1215 			/* 08/04/07 EG launch an async (parallel) host check (possibly cached) unless aggressive host checking is enabled */
1216 			/* previous logic was to simply run a sync (serial) host check */
1217 			if(use_aggressive_host_checking == TRUE)
1218 				perform_on_demand_host_check(temp_host, NULL, CHECK_OPTION_NONE, TRUE, cached_host_check_horizon);
1219 			/* 09/23/07 EG don't launch a new host check if we already did so earlier */
1220 			else if(first_host_check_initiated == TRUE)
1221 				log_debug_info(DEBUGL_CHECKS, 1, "First host check was already initiated, so we'll skip a new host check.\n");
1222 			else {
1223 				/* can we use the last cached host state? */
1224 				/* usually only use cached host state if no service state change has occurred */
1225 				if((state_change == FALSE || state_changes_use_cached_state == TRUE) && temp_host->has_been_checked == TRUE && ((current_time - temp_host->last_check) <= cached_host_check_horizon)) {
1226 					log_debug_info(DEBUGL_CHECKS, 1, "* Using cached host state: %d\n", temp_host->current_state);
1227 					update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS, current_time);
1228 					update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS, current_time);
1229 					}
1230 
1231 				/* else launch an async (parallel) check of the host */
1232 				else
1233 					run_async_host_check_3x(temp_host, CHECK_OPTION_NONE, 0.0, FALSE, FALSE, NULL, NULL);
1234 				}
1235 			}
1236 
1237 		/* if a hard service recovery has occurred... */
1238 		if(hard_state_change == TRUE) {
1239 
1240 			log_debug_info(DEBUGL_CHECKS, 1, "Service experienced a HARD RECOVERY.\n");
1241 
1242 			/* set the state type macro */
1243 			temp_service->state_type = HARD_STATE;
1244 
1245 			/* log the service recovery */
1246 			log_service_event(temp_service);
1247 			state_was_logged = TRUE;
1248 
1249 			/* 10/04/07 check to see if the service and/or associate host is flapping */
1250 			/* this should be done before a notification is sent out to ensure the host didn't just start flapping */
1251 			check_for_service_flapping(temp_service, TRUE, TRUE);
1252 			check_for_host_flapping(temp_host, TRUE, FALSE, TRUE);
1253 			flapping_check_done = TRUE;
1254 
1255 			/* notify contacts about the service recovery */
1256 			service_notification(temp_service, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);
1257 
1258 			/* run the service event handler to handle the hard state change */
1259 			handle_service_event(temp_service);
1260 			}
1261 
1262 		/* else if a soft service recovery has occurred... */
1263 		else if(state_change == TRUE) {
1264 
1265 			log_debug_info(DEBUGL_CHECKS, 1, "Service experienced a SOFT RECOVERY.\n");
1266 
1267 			/* this is a soft recovery */
1268 			temp_service->state_type = SOFT_STATE;
1269 
1270 			/* log the soft recovery */
1271 			log_service_event(temp_service);
1272 			state_was_logged = TRUE;
1273 
1274 			/* run the service event handler to handle the soft state change */
1275 			handle_service_event(temp_service);
1276 			}
1277 
1278 		/* else no service state change has occurred... */
1279 		else {
1280 			log_debug_info(DEBUGL_CHECKS, 1, "Service did not change state.\n");
1281 			}
1282 
1283 		/* should we obsess over service checks? */
1284 		if(obsess_over_services == TRUE)
1285 			obsessive_compulsive_service_check_processor(temp_service);
1286 
1287 		/* reset all service variables because its okay now... */
1288 		temp_service->host_problem_at_last_check = FALSE;
1289 		temp_service->current_attempt = 1;
1290 		temp_service->state_type = HARD_STATE;
1291 		temp_service->last_hard_state = STATE_OK;
1292 		temp_service->last_notification = (time_t)0;
1293 		temp_service->next_notification = (time_t)0;
1294 		temp_service->current_notification_number = 0;
1295 		temp_service->problem_has_been_acknowledged = FALSE;
1296 		temp_service->acknowledgement_type = ACKNOWLEDGEMENT_NONE;
1297 		temp_service->notified_on_unknown = FALSE;
1298 		temp_service->notified_on_warning = FALSE;
1299 		temp_service->notified_on_critical = FALSE;
1300 		temp_service->no_more_notifications = FALSE;
1301 
1302 		if(reschedule_check == TRUE)
1303 			next_service_check = (time_t)(temp_service->last_check + (temp_service->check_interval * interval_length));
1304 		}
1305 
1306 
1307 	/*******************************************/
1308 	/******* SERVICE CHECK PROBLEM LOGIC *******/
1309 	/*******************************************/
1310 
1311 	/* hey, something's not working quite like it should... */
1312 	else {
1313 
1314 		log_debug_info(DEBUGL_CHECKS, 1, "Service is in a non-OK state!\n");
1315 
1316 		/* check the route to the host if its up right now... */
1317 		if(temp_host->current_state == HOST_UP) {
1318 
1319 			log_debug_info(DEBUGL_CHECKS, 1, "Host is currently UP, so we'll recheck its state to make sure...\n");
1320 
1321 			/* 08/04/07 EG launch an async (parallel) host check (possibly cached) unless aggressive host checking is enabled */
1322 			/* previous logic was to simply run a sync (serial) host check */
1323 			if(use_aggressive_host_checking == TRUE)
1324 				perform_on_demand_host_check(temp_host, &route_result, CHECK_OPTION_NONE, TRUE, cached_host_check_horizon);
1325 			else {
1326 				/* can we use the last cached host state? */
1327 				/* only use cached host state if no service state change has occurred */
1328 				if((state_change == FALSE || state_changes_use_cached_state == TRUE) && temp_host->has_been_checked == TRUE && ((current_time - temp_host->last_check) <= cached_host_check_horizon)) {
1329 					/* use current host state as route result */
1330 					route_result = temp_host->current_state;
1331 					log_debug_info(DEBUGL_CHECKS, 1, "* Using cached host state: %d\n", temp_host->current_state);
1332 					update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS, current_time);
1333 					update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS, current_time);
1334 					}
1335 
1336 				/* else launch an async (parallel) check of the host */
1337 				/* CHANGED 02/15/08 only if service changed state since service was last checked */
1338 				else if(state_change == TRUE) {
1339 					/* use current host state as route result */
1340 					route_result = temp_host->current_state;
1341 					run_async_host_check_3x(temp_host, CHECK_OPTION_NONE, 0.0, FALSE, FALSE, NULL, NULL);
1342 					}
1343 
1344 				/* ADDED 02/15/08 */
1345 				/* else assume same host state */
1346 				else {
1347 					route_result = temp_host->current_state;
1348 					log_debug_info(DEBUGL_CHECKS, 1, "* Using last known host state: %d\n", temp_host->current_state);
1349 					update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS, current_time);
1350 					update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS, current_time);
1351 					}
1352 				}
1353 			}
1354 
1355 		/* else the host is either down or unreachable, so recheck it if necessary */
1356 		else {
1357 
1358 			log_debug_info(DEBUGL_CHECKS, 1, "Host is currently DOWN/UNREACHABLE.\n");
1359 
1360 			/* we're using aggressive host checking, so really do recheck the host... */
1361 			if(use_aggressive_host_checking == TRUE) {
1362 				log_debug_info(DEBUGL_CHECKS, 1, "Agressive host checking is enabled, so we'll recheck the host state...\n");
1363 				perform_on_demand_host_check(temp_host, &route_result, CHECK_OPTION_NONE, TRUE, cached_host_check_horizon);
1364 				}
1365 
1366 			/* the service wobbled between non-OK states, so check the host... */
1367 			else if((state_change == TRUE && state_changes_use_cached_state == FALSE) && temp_service->last_hard_state != STATE_OK) {
1368 				log_debug_info(DEBUGL_CHECKS, 1, "Service wobbled between non-OK states, so we'll recheck the host state...\n");
1369 				/* 08/04/07 EG launch an async (parallel) host check unless aggressive host checking is enabled */
1370 				/* previous logic was to simply run a sync (serial) host check */
1371 				/* use current host state as route result */
1372 				route_result = temp_host->current_state;
1373 				run_async_host_check_3x(temp_host, CHECK_OPTION_NONE, 0.0, FALSE, FALSE, NULL, NULL);
1374 				/*perform_on_demand_host_check(temp_host,&route_result,CHECK_OPTION_NONE,TRUE,cached_host_check_horizon);*/
1375 				}
1376 
1377 			/* else fake the host check, but (possibly) resend host notifications to contacts... */
1378 			else {
1379 
1380 				log_debug_info(DEBUGL_CHECKS, 1, "Assuming host is in same state as before...\n");
1381 
1382 				/* if the host has never been checked before, set the checked flag and last check time */
1383 				/* 03/11/06 EG Note: This probably never evaluates to FALSE, present for historical reasons only, can probably be removed in the future */
1384 				if(temp_host->has_been_checked == FALSE) {
1385 					temp_host->has_been_checked = TRUE;
1386 					temp_host->last_check = temp_service->last_check;
1387 					}
1388 
1389 				/* fake the route check result */
1390 				route_result = temp_host->current_state;
1391 
1392 				/* possibly re-send host notifications... */
1393 				host_notification(temp_host, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);
1394 				}
1395 			}
1396 
1397 		/* if the host is down or unreachable ... */
1398 		/* 05/29/2007 NOTE: The host might be in a SOFT problem state due to host check retries/caching.  Not sure if we should take that into account and do something different or not... */
1399 		if(route_result != HOST_UP) {
1400 
1401 			log_debug_info(DEBUGL_CHECKS, 2, "Host is not UP, so we mark state changes if appropriate\n");
1402 
1403 			/* "fake" a hard state change for the service - well, its not really fake, but it didn't get caught earlier... */
1404 			if(temp_service->last_hard_state != temp_service->current_state)
1405 				hard_state_change = TRUE;
1406 
1407 			/* update last state change times */
1408 			if(state_change == TRUE || hard_state_change == TRUE)
1409 				temp_service->last_state_change = temp_service->last_check;
1410 			if(hard_state_change == TRUE) {
1411 				temp_service->last_hard_state_change = temp_service->last_check;
1412 				temp_service->state_type = HARD_STATE;
1413 				temp_service->last_hard_state = temp_service->current_state;
1414 				}
1415 
1416 			/* put service into a hard state without attempting check retries and don't send out notifications about it */
1417 			temp_service->host_problem_at_last_check = TRUE;
1418 			/* Below removed 08/04/2010 EG - http://tracker.nagios.org/view.php?id=128 */
1419 			/*
1420 			temp_service->state_type=HARD_STATE;
1421 			temp_service->last_hard_state=temp_service->current_state;
1422 			temp_service->current_attempt=1;
1423 			*/
1424 			}
1425 
1426 		/* the host is up - it recovered since the last time the service was checked... */
1427 		else if(temp_service->host_problem_at_last_check == TRUE) {
1428 
1429 			/* next time the service is checked we shouldn't get into this same case... */
1430 			temp_service->host_problem_at_last_check = FALSE;
1431 
1432 			/* reset the current check counter, so we give the service a chance */
1433 			/* this helps prevent the case where service has N max check attempts, N-1 of which have already occurred. */
1434 			/* if we didn't do this, the next check might fail and result in a hard problem - we should really give it more time */
1435 			/* ADDED IF STATEMENT 01-17-05 EG */
1436 			/* 01-17-05: Services in hard problem states before hosts went down would sometimes come back as soft problem states after */
1437 			/* the hosts recovered.  This caused problems, so hopefully this will fix it */
1438 			if(temp_service->state_type == SOFT_STATE)
1439 				temp_service->current_attempt = 1;
1440 			}
1441 
1442 		log_debug_info(DEBUGL_CHECKS, 1, "Current/Max Attempt(s): %d/%d\n", temp_service->current_attempt, temp_service->max_attempts);
1443 
1444 		/* if we should retry the service check, do so (except if the host is down or unreachable!) */
1445 		if(temp_service->current_attempt < temp_service->max_attempts) {
1446 
1447 			/* the host is down or unreachable, so don't attempt to retry the service check */
1448 			if(route_result != HOST_UP) {
1449 
1450 				log_debug_info(DEBUGL_CHECKS, 1, "Host isn't UP, so we won't retry the service check...\n");
1451 
1452 				/* the host is not up, so reschedule the next service check at regular interval */
1453 				if(reschedule_check == TRUE)
1454 					next_service_check = (time_t)(temp_service->last_check + (temp_service->check_interval * interval_length));
1455 
1456 				/* log the problem as a hard state if the host just went down */
1457 				if(hard_state_change == TRUE) {
1458 					log_service_event(temp_service);
1459 					state_was_logged = TRUE;
1460 
1461 					/* run the service event handler to handle the hard state */
1462 					handle_service_event(temp_service);
1463 					}
1464 				}
1465 
1466 			/* the host is up, so continue to retry the service check */
1467 			else {
1468 
1469 				log_debug_info(DEBUGL_CHECKS, 1, "Host is UP, so we'll retry the service check...\n");
1470 
1471 				/* this is a soft state */
1472 				temp_service->state_type = SOFT_STATE;
1473 
1474 				/* log the service check retry */
1475 				log_service_event(temp_service);
1476 				state_was_logged = TRUE;
1477 
1478 				/* run the service event handler to handle the soft state */
1479 				handle_service_event(temp_service);
1480 
1481 				if(reschedule_check == TRUE)
1482 					next_service_check = (time_t)(temp_service->last_check + (temp_service->retry_interval * interval_length));
1483 				}
1484 
1485 			/* perform dependency checks on the second to last check of the service */
1486 			if(enable_predictive_service_dependency_checks == TRUE && temp_service->current_attempt == (temp_service->max_attempts - 1)) {
1487 
1488 				log_debug_info(DEBUGL_CHECKS, 1, "Looking for services to check for predictive dependency checks...\n");
1489 
1490 				/* check services that THIS ONE depends on for notification AND execution */
1491 				/* we do this because we might be sending out a notification soon and we want the dependency logic to be accurate */
1492 				for(temp_dependency = get_first_servicedependency_by_dependent_service(temp_service->host_name, temp_service->description, &ptr); temp_dependency != NULL; temp_dependency = get_next_servicedependency_by_dependent_service(temp_service->host_name, temp_service->description, &ptr)) {
1493 					if(temp_dependency->dependent_service_ptr == temp_service && temp_dependency->master_service_ptr != NULL) {
1494 						master_service = (service *)temp_dependency->master_service_ptr;
1495 						log_debug_info(DEBUGL_CHECKS, 2, "Predictive check of service '%s' on host '%s' queued.\n", master_service->description, master_service->host_name);
1496 						add_object_to_objectlist(&check_servicelist, (void *)master_service);
1497 						}
1498 					}
1499 				}
1500 			}
1501 
1502 
1503 		/* we've reached the maximum number of service rechecks, so handle the error */
1504 		else {
1505 
1506 			log_debug_info(DEBUGL_CHECKS, 1, "Service has reached max number of rechecks, so we'll handle the error...\n");
1507 
1508 			/* this is a hard state */
1509 			temp_service->state_type = HARD_STATE;
1510 
1511 			/* if we've hard a hard state change... */
1512 			if(hard_state_change == TRUE) {
1513 
1514 				/* log the service problem (even if host is not up, which is new in 0.0.5) */
1515 				log_service_event(temp_service);
1516 				state_was_logged = TRUE;
1517 				}
1518 
1519 			/* else log the problem (again) if this service is flagged as being volatile */
1520 			else if(temp_service->is_volatile == TRUE) {
1521 				log_service_event(temp_service);
1522 				state_was_logged = TRUE;
1523 				}
1524 
1525 			/* check for start of flexible (non-fixed) scheduled downtime if we just had a hard error */
1526 			/* we need to check for both, state_change (SOFT) and hard_state_change (HARD) values */
1527 			if((hard_state_change == TRUE || state_change == TRUE) && temp_service->pending_flex_downtime > 0)
1528 				check_pending_flex_service_downtime(temp_service);
1529 
1530 			/* 10/04/07 check to see if the service and/or associate host is flapping */
1531 			/* this should be done before a notification is sent out to ensure the host didn't just start flapping */
1532 			check_for_service_flapping(temp_service, TRUE, TRUE);
1533 			check_for_host_flapping(temp_host, TRUE, FALSE, TRUE);
1534 			flapping_check_done = TRUE;
1535 
1536 			/* (re)send notifications out about this service problem if the host is up (and was at last check also) and the dependencies were okay... */
1537 			service_notification(temp_service, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);
1538 
1539 			/* run the service event handler if we changed state from the last hard state or if this service is flagged as being volatile */
1540 			if(hard_state_change == TRUE || temp_service->is_volatile == TRUE)
1541 				handle_service_event(temp_service);
1542 
1543 			/* save the last hard state */
1544 			temp_service->last_hard_state = temp_service->current_state;
1545 
1546 			/* reschedule the next check at the regular interval */
1547 			if(reschedule_check == TRUE)
1548 				next_service_check = (time_t)(temp_service->last_check + (temp_service->check_interval * interval_length));
1549 			}
1550 
1551 
1552 		/* should we obsessive over service checks? */
1553 		if(obsess_over_services == TRUE)
1554 			obsessive_compulsive_service_check_processor(temp_service);
1555 		}
1556 
1557 	/* reschedule the next service check ONLY for active, scheduled checks */
1558 	if(reschedule_check == TRUE) {
1559 
1560 		log_debug_info(DEBUGL_CHECKS, 1, "Rescheduling next check of service at %s", ctime(&next_service_check));
1561 
1562 		/* default is to reschedule service check unless a test below fails... */
1563 		temp_service->should_be_scheduled = TRUE;
1564 
1565 		/* next check time was calculated above */
1566 		temp_service->next_check = next_service_check;
1567 
1568 		/* make sure we don't get ourselves into too much trouble... */
1569 		if(current_time > temp_service->next_check)
1570 			temp_service->next_check = current_time;
1571 
1572 		/* make sure we rescheduled the next service check at a valid time */
1573 		preferred_time = temp_service->next_check;
1574 		get_next_valid_time(preferred_time, &next_valid_time, temp_service->check_period_ptr);
1575 		temp_service->next_check = next_valid_time;
1576 
1577 		/* services with non-recurring intervals do not get rescheduled */
1578 		if(temp_service->check_interval == 0)
1579 			temp_service->should_be_scheduled = FALSE;
1580 
1581 		/* services with active checks disabled do not get rescheduled */
1582 		if(temp_service->checks_enabled == FALSE)
1583 			temp_service->should_be_scheduled = FALSE;
1584 
1585 		/* schedule a non-forced check if we can */
1586 		if(temp_service->should_be_scheduled == TRUE)
1587 			schedule_service_check(temp_service, temp_service->next_check, CHECK_OPTION_NONE);
1588 		}
1589 
1590 	/* if we're stalking this state type and state was not already logged AND the plugin output changed since last check, log it now.. */
1591 	if(temp_service->state_type == HARD_STATE && state_change == FALSE && state_was_logged == FALSE && compare_strings(old_plugin_output, temp_service->plugin_output)) {
1592 
1593 		if((temp_service->current_state == STATE_OK && temp_service->stalk_on_ok == TRUE))
1594 			log_service_event(temp_service);
1595 
1596 		else if((temp_service->current_state == STATE_WARNING && temp_service->stalk_on_warning == TRUE))
1597 			log_service_event(temp_service);
1598 
1599 		else if((temp_service->current_state == STATE_UNKNOWN && temp_service->stalk_on_unknown == TRUE))
1600 			log_service_event(temp_service);
1601 
1602 		else if((temp_service->current_state == STATE_CRITICAL && temp_service->stalk_on_critical == TRUE))
1603 			log_service_event(temp_service);
1604 		}
1605 
1606 #ifdef USE_EVENT_BROKER
1607 	/* send data to event broker */
1608 	broker_service_check(NEBTYPE_SERVICECHECK_PROCESSED, NEBFLAG_NONE, NEBATTR_NONE, temp_service, temp_service->check_type, queued_check_result->start_time, queued_check_result->finish_time, NULL, temp_service->latency, temp_service->execution_time, service_check_timeout, queued_check_result->early_timeout, queued_check_result->return_code, NULL, NULL);
1609 #endif
1610 
1611 	/* set the checked flag */
1612 	temp_service->has_been_checked = TRUE;
1613 
1614 	/* update the current service status log */
1615 	update_service_status(temp_service, FALSE);
1616 
1617 	/* check to see if the service and/or associate host is flapping */
1618 	if(flapping_check_done == FALSE) {
1619 		check_for_service_flapping(temp_service, TRUE, TRUE);
1620 		check_for_host_flapping(temp_host, TRUE, FALSE, TRUE);
1621 		}
1622 
1623 	/* update service performance info */
1624 	update_service_performance_data(temp_service);
1625 
1626 	/* free allocated memory */
1627 	my_free(temp_plugin_output);
1628 	my_free(old_plugin_output);
1629 
1630 
1631 	/* run async checks of all services we added above */
1632 	/* don't run a check if one is already executing or we can get by with a cached state */
1633 	for(servicelist_item = check_servicelist; servicelist_item != NULL; servicelist_item = servicelist_item->next) {
1634 		run_async_check = TRUE;
1635 		temp_service = (service *)servicelist_item->object_ptr;
1636 
1637 		/* we can get by with a cached state, so don't check the service */
1638 		if((current_time - temp_service->last_check) <= cached_service_check_horizon) {
1639 			run_async_check = FALSE;
1640 
1641 			/* update check statistics */
1642 			update_check_stats(ACTIVE_CACHED_SERVICE_CHECK_STATS, current_time);
1643 			}
1644 
1645 		if(temp_service->is_executing == TRUE)
1646 			run_async_check = FALSE;
1647 
1648 		if(run_async_check == TRUE)
1649 			run_async_service_check(temp_service, CHECK_OPTION_NONE, 0.0, FALSE, FALSE, NULL, NULL);
1650 		}
1651 	free_objectlist(&check_servicelist);
1652 
1653 	return OK;
1654 	}
1655 
1656 
1657 
1658 /* schedules an immediate or delayed service check */
/*
 * Queue an active check of a service, reconciling it with any check event
 * the service already has pending (svc->next_check_event).
 *
 * svc         service to check; a NULL pointer is silently ignored
 * check_time  time at which the check should run
 * options     CHECK_OPTION_* flags; CHECK_OPTION_FORCE_EXECUTION marks the
 *             request as forced
 *
 * Rules applied when an event is already pending:
 *   - pending forced,   new forced   : the earlier of the two wins
 *   - pending forced,   new unforced : the pending event is kept
 *   - pending unforced, new forced   : the new event wins
 *   - pending unforced, new unforced : the earlier of the two wins
 * (on equal run times the pending event is kept)
 *
 * When the new request wins, the pending event is removed from
 * event_list_low and freed, and a non-recurring EVENT_SERVICE_CHECK is queued
 * in its place.  If memory for the new event cannot be allocated a warning
 * is logged and the pending event (if any) is left untouched.
 */
void schedule_service_check(service *svc, time_t check_time, int options) {
	timed_event *existing_event = NULL;
	timed_event *fresh_event = NULL;
	int new_is_forced = FALSE;
	int keep_existing = FALSE;

	log_debug_info(DEBUGL_FUNCTIONS, 0, "schedule_service_check()\n");

	if(svc == NULL)
		return;

	new_is_forced = (options & CHECK_OPTION_FORCE_EXECUTION) ? TRUE : FALSE;

	log_debug_info(DEBUGL_CHECKS, 0, "Scheduling a %s, active check of service '%s' on host '%s' @ %s", (new_is_forced == TRUE) ? "forced" : "non-forced", svc->description, svc->host_name, ctime(&check_time));

	/* only a forced request may be scheduled while active checks are disabled */
	if(new_is_forced == FALSE && svc->checks_enabled == FALSE) {
		log_debug_info(DEBUGL_CHECKS, 0, "Active checks of this service are disabled.\n");
		return;
		}

	/* with nothing pending the new event is used unconditionally */
	existing_event = (timed_event *)svc->next_check_event;

	if(existing_event != NULL) {

		log_debug_info(DEBUGL_CHECKS, 2, "Found another service check event for this service @ %s", ctime(&existing_event->run_time));

		if(existing_event->event_options & CHECK_OPTION_FORCE_EXECUTION) {

			/* a pending forced check only yields to an earlier forced check */
			if(new_is_forced == TRUE && check_time < existing_event->run_time)
				log_debug_info(DEBUGL_CHECKS, 2, "New service check event is forced and occurs before the existing event, so the new event will be used instead.\n");
			else
				keep_existing = TRUE;
			}

		/* pending check is unforced: a forced request always replaces it */
		else if(new_is_forced == TRUE)
			log_debug_info(DEBUGL_CHECKS, 2, "New service check event is forced, so it will be used instead of the existing event.\n");

		/* both unforced: the earlier one wins */
		else if(check_time < existing_event->run_time)
			log_debug_info(DEBUGL_CHECKS, 2, "New service check event occurs before the existing (older) event, so it will be used instead.\n");

		/* the new request is not earlier, so it is dropped */
		else {
			keep_existing = TRUE;
			log_debug_info(DEBUGL_CHECKS, 2, "New service check event occurs after the existing event, so we'll ignore it.\n");
			}
		}

	if(keep_existing == TRUE) {

		/* keep_existing is only ever set when an event is pending; */
		/* resync the service's next check time with that event */
		svc->next_check = existing_event->run_time;

		log_debug_info(DEBUGL_CHECKS, 2, "Keeping original service check event (ignoring the new one).\n");
		}

	else {

		/* get memory for the replacement before touching the pending event */
		fresh_event = (timed_event *)malloc(sizeof(*fresh_event));
		if(fresh_event == NULL) {
			logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Could not reschedule check of service '%s' on host '%s'!\n", svc->description, svc->host_name);
			return;
			}

		/* retire the superseded event, if there was one */
		if(existing_event) {
			remove_event(existing_event, &event_list_low, &event_list_low_tail);
			my_free(existing_event);
			}
		log_debug_info(DEBUGL_CHECKS, 2, "Scheduling new service check event.\n");

		/* record the new schedule on the service (options are kept for retention) */
		svc->next_check_event = fresh_event;
		svc->next_check = check_time;
		svc->check_options = options;

		/* describe a one-shot service check and queue it */
		fresh_event->event_type = EVENT_SERVICE_CHECK;
		fresh_event->event_data = (void *)svc;
		fresh_event->event_args = (void *)NULL;
		fresh_event->event_options = options;
		fresh_event->run_time = svc->next_check;
		fresh_event->recurring = FALSE;
		fresh_event->event_interval = 0L;
		fresh_event->timing_func = NULL;
		fresh_event->compensate_for_time_change = TRUE;
		reschedule_event(fresh_event, &event_list_low, &event_list_low_tail);
		}

	/* update the status log */
	update_service_status(svc, FALSE);
	}
1776 
1777 
1778 
1779 /* checks viability of performing a service check */
/*
 * Decide whether an active check of a service may be run right now.
 *
 * svc            service to evaluate
 * check_options  CHECK_OPTION_* flags; with CHECK_OPTION_FORCE_EXECUTION set
 *                none of the tests below are performed
 * time_is_valid  optional out-parameter; set to FALSE when the current time
 *                falls outside the service's check period.  It is never set
 *                to TRUE here, so the caller must initialise it.
 * new_time       optional out-parameter; receives the suggested time for the
 *                next attempt
 *
 * Returns OK if the check may run, ERROR if svc is NULL or any test failed.
 *
 * All three tests are always evaluated and each failing test overwrites the
 * suggested time, so the last failing test determines *new_time.
 */
int check_service_check_viability(service *svc, int check_options, int *time_is_valid, time_t *new_time) {
	int viable = TRUE;
	int interval_secs = 0;
	time_t now = 0L;
	time_t suggested_time = 0L;

	log_debug_info(DEBUGL_FUNCTIONS, 0, "check_service_check_viability()\n");

	/* nothing to evaluate without a service */
	if(svc == NULL)
		return ERROR;

	/* interval used to push the check back: the retry interval applies only */
	/* while the service is in a soft, non-OK state */
	if(svc->state_type != SOFT_STATE || svc->current_state == STATE_OK)
		interval_secs = (svc->check_interval * interval_length);
	else
		interval_secs = (svc->retry_interval * interval_length);

	now = time(NULL);

	/* unless a test below says otherwise, "now" is the preferred time */
	suggested_time = now;

	/* forced checks bypass every viability test */
	if(check_options & CHECK_OPTION_FORCE_EXECUTION) {
		if(new_time)
			*new_time = suggested_time;
		return OK;
		}

	/* active checks of the service are switched off */
	if(svc->checks_enabled == FALSE) {
		viable = FALSE;
		suggested_time = now + interval_secs;

		log_debug_info(DEBUGL_CHECKS, 2, "Active checks of the service are currently disabled.\n");
		}

	/* the current time lies outside the service's check period */
	if(check_time_against_period((unsigned long)now, svc->check_period_ptr) == ERROR) {
		viable = FALSE;
		suggested_time = now;
		if(time_is_valid)
			*time_is_valid = FALSE;

		log_debug_info(DEBUGL_CHECKS, 2, "This is not a valid time for this service to be actively checked.\n");
		}

	/* an execution dependency of the service has failed */
	if(check_service_dependencies(svc, EXECUTION_DEPENDENCY) == DEPENDENCIES_FAILED) {
		viable = FALSE;
		suggested_time = now + interval_secs;

		log_debug_info(DEBUGL_CHECKS, 2, "Execution dependencies for this service failed, so it will not be actively checked.\n");
		}

	/* hand the next viable check time back to the caller */
	if(new_time)
		*new_time = suggested_time;

	return (viable == TRUE) ? OK : ERROR;
	}
1843 
1844 
1845 
1846 /* checks service dependencies */
/*
 * Evaluate the dependencies of a service.
 *
 * svc              dependent service whose dependencies are examined; it is
 *                  dereferenced without a NULL check
 * dependency_type  only dependencies whose dependency_type equals this value
 *                  are considered (callers in this file pass
 *                  EXECUTION_DEPENDENCY)
 *
 * Returns DEPENDENCIES_FAILED as soon as one applicable dependency fails,
 * DEPENDENCIES_OK if none does.
 *
 * A dependency is ignored when it is of another type, has no master service
 * pointer, or has a dependency period that does not cover the current time.
 * For dependencies flagged inherits_parent the master service's own
 * dependencies are evaluated recursively.
 */
int check_service_dependencies(service *svc, int dependency_type) {
	servicedependency *temp_dependency = NULL;
	service *temp_service = NULL;
	int state = STATE_OK;
	time_t current_time = 0L;
	void *ptr = NULL;


	log_debug_info(DEBUGL_FUNCTIONS, 0, "check_service_dependencies()\n");

	/* one timestamp is enough: every dependency period is tested against the same instant */
	time(&current_time);

	/* check all dependencies... */
	for(temp_dependency = get_first_servicedependency_by_dependent_service(svc->host_name, svc->description, &ptr); temp_dependency != NULL; temp_dependency = get_next_servicedependency_by_dependent_service(svc->host_name, svc->description, &ptr)) {

		/* only check dependencies of the desired type (notification or execution) */
		if(temp_dependency->dependency_type != dependency_type)
			continue;

		/* find the service we depend on... */
		if((temp_service = temp_dependency->master_service_ptr) == NULL)
			continue;

		/* skip this dependency if it has a timeperiod and the current time isn't valid. */
		/* this must move on to the next dependency rather than return: leaving the */
		/* function here would hide failures of every dependency that follows */
		if(temp_dependency->dependency_period != NULL && check_time_against_period(current_time, temp_dependency->dependency_period_ptr) == ERROR)
			continue;

		/* get the status to use (use last hard state if its currently in a soft state) */
		if(temp_service->state_type == SOFT_STATE && soft_state_dependencies == FALSE)
			state = temp_service->last_hard_state;
		else
			state = temp_service->current_state;

		/* is the service we depend on in state that fails the dependency tests? */
		if(state == STATE_OK && temp_dependency->fail_on_ok == TRUE)
			return DEPENDENCIES_FAILED;
		if(state == STATE_WARNING && temp_dependency->fail_on_warning == TRUE)
			return DEPENDENCIES_FAILED;
		if(state == STATE_UNKNOWN && temp_dependency->fail_on_unknown == TRUE)
			return DEPENDENCIES_FAILED;
		if(state == STATE_CRITICAL && temp_dependency->fail_on_critical == TRUE)
			return DEPENDENCIES_FAILED;

		/* a master that reports OK but has never been checked counts as pending */
		if((state == STATE_OK && temp_service->has_been_checked == FALSE) && temp_dependency->fail_on_pending == TRUE)
			return DEPENDENCIES_FAILED;

		/* immediate dependencies ok at this point - check parent dependencies if necessary */
		if(temp_dependency->inherits_parent == TRUE) {
			if(check_service_dependencies(temp_service, dependency_type) != DEPENDENCIES_OK)
				return DEPENDENCIES_FAILED;
			}
		}

	return DEPENDENCIES_OK;
	}
1900 
1901 
1902 
1903 /* check for services that never returned from a check... */
check_for_orphaned_services(void)1904 void check_for_orphaned_services(void) {
1905 	service *temp_service = NULL;
1906 	time_t current_time = 0L;
1907 	time_t expected_time = 0L;
1908 
1909 
1910 	log_debug_info(DEBUGL_FUNCTIONS, 0, "check_for_orphaned_services()\n");
1911 
1912 	/* get the current time */
1913 	time(&current_time);
1914 
1915 	/* check all services... */
1916 	for(temp_service = service_list; temp_service != NULL; temp_service = temp_service->next) {
1917 
1918 		/* skip services that are not currently executing */
1919 		if(temp_service->is_executing == FALSE)
1920 			continue;
1921 
1922 		/* determine the time at which the check results should have come in (allow 10 minutes slack time) */
1923 		expected_time = (time_t)(temp_service->next_check + temp_service->latency + service_check_timeout + check_reaper_interval + 600);
1924 
1925 		/* this service was supposed to have executed a while ago, but for some reason the results haven't come back in... */
1926 		if(expected_time < current_time) {
1927 
1928 			/* log a warning */
1929 			logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The check of service '%s' on host '%s' looks like it was orphaned (results never came back).  I'm scheduling an immediate check of the service...\n", temp_service->description, temp_service->host_name);
1930 
1931 			log_debug_info(DEBUGL_CHECKS, 1, "Service '%s' on host '%s' was orphaned, so we're scheduling an immediate check...\n", temp_service->description, temp_service->host_name);
1932 
1933 			/* decrement the number of running service checks */
1934 			if(currently_running_service_checks > 0)
1935 				currently_running_service_checks--;
1936 
1937 			/* disable the executing flag */
1938 			temp_service->is_executing = FALSE;
1939 
1940 			/* schedule an immediate check of the service */
1941 			schedule_service_check(temp_service, current_time, CHECK_OPTION_ORPHAN_CHECK);
1942 			}
1943 
1944 		}
1945 
1946 	return;
1947 	}
1948 
1949 
1950 
1951 /* check freshness of service results */
check_service_result_freshness(void)1952 void check_service_result_freshness(void) {
1953 	service *temp_service = NULL;
1954 	time_t current_time = 0L;
1955 
1956 
1957 	log_debug_info(DEBUGL_FUNCTIONS, 0, "check_service_result_freshness()\n");
1958 	log_debug_info(DEBUGL_CHECKS, 1, "Checking the freshness of service check results...\n");
1959 
1960 	/* bail out if we're not supposed to be checking freshness */
1961 	if(check_service_freshness == FALSE) {
1962 		log_debug_info(DEBUGL_CHECKS, 1, "Service freshness checking is disabled.\n");
1963 		return;
1964 		}
1965 
1966 	/* get the current time */
1967 	time(&current_time);
1968 
1969 	/* check all services... */
1970 	for(temp_service = service_list; temp_service != NULL; temp_service = temp_service->next) {
1971 
1972 		/* skip services we shouldn't be checking for freshness */
1973 		if(temp_service->check_freshness == FALSE)
1974 			continue;
1975 
1976 		/* skip services that are currently executing (problems here will be caught by orphaned service check) */
1977 		if(temp_service->is_executing == TRUE)
1978 			continue;
1979 
1980 		/* skip services that have both active and passive checks disabled */
1981 		if(temp_service->checks_enabled == FALSE && temp_service->accept_passive_service_checks == FALSE)
1982 			continue;
1983 
1984 		/* skip services that are already being freshened */
1985 		if(temp_service->is_being_freshened == TRUE)
1986 			continue;
1987 
1988 		/* see if the time is right... */
1989 		if(check_time_against_period(current_time, temp_service->check_period_ptr) == ERROR)
1990 			continue;
1991 
1992 		/* EXCEPTION */
1993 		/* don't check freshness of services without regular check intervals if we're using auto-freshness threshold */
1994 		if(temp_service->check_interval == 0 && temp_service->freshness_threshold == 0)
1995 			continue;
1996 
1997 		/* the results for the last check of this service are stale! */
1998 		if(is_service_result_fresh(temp_service, current_time, TRUE) == FALSE) {
1999 
2000 			/* set the freshen flag */
2001 			temp_service->is_being_freshened = TRUE;
2002 
2003 			/* schedule an immediate forced check of the service */
2004 			schedule_service_check(temp_service, current_time, CHECK_OPTION_FORCE_EXECUTION | CHECK_OPTION_FRESHNESS_CHECK);
2005 			}
2006 
2007 		}
2008 
2009 	return;
2010 	}
2011 
2012 
2013 
2014 /* tests whether or not a service's check results are fresh */
/*
 * Report whether the most recent check result of a service is still fresh.
 *
 * temp_service  service to examine
 * current_time  time to compare the expiration time against
 * log_this      when TRUE a runtime warning is logged for a stale result
 *
 * Returns TRUE if the result is fresh, FALSE if it is stale.
 *
 * This function only reports.  The messages state that a check is being
 * forced, but scheduling that check is left to the caller.
 *
 * Threshold: the service's own freshness_threshold if non-zero, otherwise
 * (check interval or retry interval) * interval_length + latency +
 * additional_freshness_latency, where the check interval is used for hard
 * states and OK states and the retry interval otherwise.
 *
 * Expiration time, first matching rule:
 *   1. never checked                              -> event_start + threshold
 *   2. active checks enabled, last check predates
 *      event_start, automatic threshold           -> event_start + threshold
 *                                                    + max_service_check_spread
 *                                                      * interval_length
 *   3. otherwise                                  -> last_check + threshold
 * A passive-check override may then replace the result (see below).
 */
int is_service_result_fresh(service *temp_service, time_t current_time, int log_this) {
	int threshold = 0;
	time_t expires_at = 0L;
	int stale_days = 0;
	int stale_hours = 0;
	int stale_minutes = 0;
	int stale_seconds = 0;
	int thr_days = 0;
	int thr_hours = 0;
	int thr_minutes = 0;
	int thr_seconds = 0;

	log_debug_info(DEBUGL_CHECKS, 2, "Checking freshness of service '%s' on host '%s'...\n", temp_service->description, temp_service->host_name);

	/* a user-supplied threshold takes precedence over the calculated one */
	if(temp_service->freshness_threshold != 0)
		threshold = temp_service->freshness_threshold;
	else if(temp_service->state_type == HARD_STATE || temp_service->current_state == STATE_OK)
		threshold = (temp_service->check_interval * interval_length) + temp_service->latency + additional_freshness_latency;
	else
		threshold = (temp_service->retry_interval * interval_length) + temp_service->latency + additional_freshness_latency;

	log_debug_info(DEBUGL_CHECKS, 2, "Freshness thresholds: service=%d, use=%d\n", temp_service->freshness_threshold, threshold);

	/* no result has ever come in, so measure from event_start */
	if(temp_service->has_been_checked == FALSE)
		expires_at = (time_t)(event_start + threshold);

	/*
	 * The last check predates event_start: measure from event_start and
	 * add the check spread.  This applies only to actively checked services
	 * without a user-supplied threshold; with an explicit threshold,
	 * frequent restarts would otherwise keep results from ever going stale.
	 */
	else if(temp_service->checks_enabled == TRUE && event_start > temp_service->last_check && temp_service->freshness_threshold == 0)
		expires_at = (time_t)(event_start + threshold + (max_service_check_spread * interval_length));

	/* normal case: measure from the last check */
	else
		expires_at = (time_t)(temp_service->last_check + threshold);

	/*
	 * Passive-check override.  When the last result was passive and predates
	 * event_start, and the time between last_program_stop and event_start
	 * exceeds 61.8% of the threshold, measure from event_start instead so
	 * that a long shutdown does not make every passive result stale at once.
	 */
	if(temp_service->check_type == SERVICE_CHECK_PASSIVE &&
	        temp_service->last_check < event_start &&
	        event_start - last_program_stop > threshold * 0.618)
		expires_at = event_start + threshold;

	log_debug_info(DEBUGL_CHECKS, 2, "HBC: %d, PS: %lu, ES: %lu, LC: %lu, CT: %lu, ET: %lu\n", temp_service->has_been_checked, (unsigned long)program_start, (unsigned long)event_start, (unsigned long)temp_service->last_check, (unsigned long)current_time, (unsigned long)expires_at);

	/* not yet expired: the result is fresh */
	if(expires_at >= current_time) {
		log_debug_info(DEBUGL_CHECKS, 1, "Check results for service '%s' on host '%s' are fresh.\n", temp_service->description, temp_service->host_name);
		return TRUE;
		}

	/* stale: break the overdue time and the threshold into d/h/m/s for the messages */
	get_time_breakdown((current_time - expires_at), &stale_days, &stale_hours, &stale_minutes, &stale_seconds);
	get_time_breakdown(threshold, &thr_days, &thr_hours, &thr_minutes, &thr_seconds);

	if(log_this == TRUE)
		logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The results of service '%s' on host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds).  I'm forcing an immediate check of the service.\n", temp_service->description, temp_service->host_name, stale_days, stale_hours, stale_minutes, stale_seconds, thr_days, thr_hours, thr_minutes, thr_seconds);

	log_debug_info(DEBUGL_CHECKS, 1, "Check results for service '%s' on host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds).  Forcing an immediate check of the service...\n", temp_service->description, temp_service->host_name, stale_days, stale_hours, stale_minutes, stale_seconds, thr_days, thr_hours, thr_minutes, thr_seconds);

	return FALSE;
	}
2114 
2115 
2116 
2117 
2118 /******************************************************************/
2119 /*************** COMMON ROUTE/HOST CHECK FUNCTIONS ****************/
2120 /******************************************************************/
2121 
2122 /* execute an on-demand check  */
/*
 * Run an on-demand check of a host.
 *
 * Thin wrapper: all arguments are passed unchanged to
 * perform_on_demand_host_check_3x().  Whatever that function returns is
 * discarded; this wrapper always returns OK.
 */
int perform_on_demand_host_check(host *hst, int *check_return_code, int check_options, int use_cached_result, unsigned long check_timestamp_horizon) {

	log_debug_info(DEBUGL_FUNCTIONS, 0, "perform_on_demand_host_check()\n");

	/* the outcome of the 3.x routine is deliberately not propagated */
	(void)perform_on_demand_host_check_3x(hst, check_return_code, check_options, use_cached_result, check_timestamp_horizon);

	return OK;
	}
2131 
2132 
2133 
2134 /* execute a scheduled host check using either the 2.x or 3.x logic */
/*
 * Run a scheduled check of a host.
 *
 * Thin wrapper: all arguments are passed unchanged to
 * run_scheduled_host_check_3x(), the only check logic invoked here.
 * Whatever that function returns is discarded; this wrapper always
 * returns OK.
 */
int perform_scheduled_host_check(host *hst, int check_options, double latency) {

	log_debug_info(DEBUGL_FUNCTIONS, 0, "perform_scheduled_host_check()\n");

	/* the outcome of the 3.x routine is deliberately not propagated */
	(void)run_scheduled_host_check_3x(hst, check_options, latency);

	return OK;
	}
2143 
2144 
2145 
2146 /* schedules an immediate or delayed host check */
schedule_host_check(host * hst,time_t check_time,int options)2147 void schedule_host_check(host *hst, time_t check_time, int options) {
2148 	timed_event *temp_event = NULL;
2149 	timed_event *new_event = NULL;
2150 	int use_original_event = TRUE;
2151 
2152 
2153 	log_debug_info(DEBUGL_FUNCTIONS, 0, "schedule_host_check()\n");
2154 
2155 	if(hst == NULL)
2156 		return;
2157 
2158 	log_debug_info(DEBUGL_CHECKS, 0, "Scheduling a %s, active check of host '%s' @ %s", (options & CHECK_OPTION_FORCE_EXECUTION) ? "forced" : "non-forced", hst->name, ctime(&check_time));
2159 
2160 	/* don't schedule a check if active checks of this host are disabled */
2161 	if(hst->checks_enabled == FALSE && !(options & CHECK_OPTION_FORCE_EXECUTION)) {
2162 		log_debug_info(DEBUGL_CHECKS, 0, "Active checks are disabled for this host.\n");
2163 		return;
2164 		}
2165 
2166 	/* default is to use the new event */
2167 	use_original_event = FALSE;
2168 
2169 	temp_event = (timed_event *)hst->next_check_event;
2170 
2171 	/*
2172 	 * If the host already had a check scheduled we need
2173 	 * to decide which check event to use
2174 	 */
2175 	if(temp_event != NULL) {
2176 
2177 		log_debug_info(DEBUGL_CHECKS, 2, "Found another host check event for this host @ %s", ctime(&temp_event->run_time));
2178 
2179 		/* use the originally scheduled check unless we decide otherwise */
2180 		use_original_event = TRUE;
2181 
2182 		/* the original event is a forced check... */
2183 		if((temp_event->event_options & CHECK_OPTION_FORCE_EXECUTION)) {
2184 
2185 			/* the new event is also forced and its execution time is earlier than the original, so use it instead */
2186 			if((options & CHECK_OPTION_FORCE_EXECUTION) && (check_time < temp_event->run_time)) {
2187 				log_debug_info(DEBUGL_CHECKS, 2, "New host check event is forced and occurs before the existing event, so the new event be used instead.\n");
2188 				use_original_event = FALSE;
2189 				}
2190 			}
2191 
2192 		/* the original event is not a forced check... */
2193 		else {
2194 
2195 			/* the new event is a forced check, so use it instead */
2196 			if((options & CHECK_OPTION_FORCE_EXECUTION)) {
2197 				use_original_event = FALSE;
2198 				log_debug_info(DEBUGL_CHECKS, 2, "New host check event is forced, so it will be used instead of the existing event.\n");
2199 				}
2200 
2201 			/* the new event is not forced either and its execution time is earlier than the original, so use it instead */
2202 			else if(check_time < temp_event->run_time) {
2203 				use_original_event = FALSE;
2204 				log_debug_info(DEBUGL_CHECKS, 2, "New host check event occurs before the existing (older) event, so it will be used instead.\n");
2205 				}
2206 
2207 			/* the new event is older, so override the existing one */
2208 			else {
2209 				log_debug_info(DEBUGL_CHECKS, 2, "New host check event occurs after the existing event, so we'll ignore it.\n");
2210 				}
2211 			}
2212 		}
2213 
2214 	/* use the new event */
2215 	if(use_original_event == FALSE) {
2216 
2217 		log_debug_info(DEBUGL_CHECKS, 2, "Scheduling new host check event.\n");
2218 
2219 		/* allocate memory for a new event item */
2220 		if((new_event = (timed_event *)malloc(sizeof(timed_event))) == NULL) {
2221 			logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Could not reschedule check of host '%s'!\n", hst->name);
2222 			return;
2223 			}
2224 
2225 		if(temp_event) {
2226 			remove_event(temp_event, &event_list_low, &event_list_low_tail);
2227 			my_free(temp_event);
2228 			}
2229 
2230 		/* set the next host check event and time */
2231 		hst->next_check_event = new_event;
2232 		hst->next_check = check_time;
2233 
2234 		/* save check options for retention purposes */
2235 		hst->check_options = options;
2236 
2237 		/* place the new event in the event queue */
2238 		new_event->event_type = EVENT_HOST_CHECK;
2239 		new_event->event_data = (void *)hst;
2240 		new_event->event_args = (void *)NULL;
2241 		new_event->event_options = options;
2242 		new_event->run_time = hst->next_check;
2243 		new_event->recurring = FALSE;
2244 		new_event->event_interval = 0L;
2245 		new_event->timing_func = NULL;
2246 		new_event->compensate_for_time_change = TRUE;
2247 		reschedule_event(new_event, &event_list_low, &event_list_low_tail);
2248 		}
2249 
2250 	else {
2251 		/* reset the next check time (it may be out of sync) */
2252 		if(temp_event != NULL)
2253 			hst->next_check = temp_event->run_time;
2254 
2255 		log_debug_info(DEBUGL_CHECKS, 2, "Keeping original host check event (ignoring the new one).\n");
2256 		}
2257 
2258 	/* update the status log */
2259 	update_host_status(hst, FALSE);
2260 
2261 	return;
2262 	}
2263 
2264 
2265 
2266 /* checks host dependencies */
check_host_dependencies(host * hst,int dependency_type)2267 int check_host_dependencies(host *hst, int dependency_type) {
2268 	hostdependency *temp_dependency = NULL;
2269 	host *temp_host = NULL;
2270 	int state = HOST_UP;
2271 	time_t current_time = 0L;
2272 	void *ptr = NULL;
2273 
2274 
2275 	log_debug_info(DEBUGL_FUNCTIONS, 0, "check_host_dependencies()\n");
2276 
2277 	/* check all dependencies... */
2278 	for(temp_dependency = get_first_hostdependency_by_dependent_host(hst->name, &ptr); temp_dependency != NULL; temp_dependency = get_next_hostdependency_by_dependent_host(hst->name, &ptr)) {
2279 
2280 		/* only check dependencies of the desired type (notification or execution) */
2281 		if(temp_dependency->dependency_type != dependency_type)
2282 			continue;
2283 
2284 		/* find the host we depend on... */
2285 		if((temp_host = temp_dependency->master_host_ptr) == NULL)
2286 			continue;
2287 
2288 		/* skip this dependency if it has a timeperiod and the current time isn't valid */
2289 		time(&current_time);
2290 		if(temp_dependency->dependency_period != NULL && check_time_against_period(current_time, temp_dependency->dependency_period_ptr) == ERROR)
2291 			return FALSE;
2292 
2293 		/* get the status to use (use last hard state if its currently in a soft state) */
2294 		if(temp_host->state_type == SOFT_STATE && soft_state_dependencies == FALSE)
2295 			state = temp_host->last_hard_state;
2296 		else
2297 			state = temp_host->current_state;
2298 
2299 		/* is the host we depend on in state that fails the dependency tests? */
2300 		if(state == HOST_UP && temp_dependency->fail_on_up == TRUE)
2301 			return DEPENDENCIES_FAILED;
2302 		if(state == HOST_DOWN && temp_dependency->fail_on_down == TRUE)
2303 			return DEPENDENCIES_FAILED;
2304 		if(state == HOST_UNREACHABLE && temp_dependency->fail_on_unreachable == TRUE)
2305 			return DEPENDENCIES_FAILED;
2306 		if((state == HOST_UP && temp_host->has_been_checked == FALSE) && temp_dependency->fail_on_pending == TRUE)
2307 			return DEPENDENCIES_FAILED;
2308 
2309 		/* immediate dependencies ok at this point - check parent dependencies if necessary */
2310 		if(temp_dependency->inherits_parent == TRUE) {
2311 			if(check_host_dependencies(temp_host, dependency_type) != DEPENDENCIES_OK)
2312 				return DEPENDENCIES_FAILED;
2313 			}
2314 		}
2315 
2316 	return DEPENDENCIES_OK;
2317 	}
2318 
2319 
2320 
2321 /* check for hosts that never returned from a check... */
check_for_orphaned_hosts(void)2322 void check_for_orphaned_hosts(void) {
2323 	host *temp_host = NULL;
2324 	time_t current_time = 0L;
2325 	time_t expected_time = 0L;
2326 
2327 
2328 	log_debug_info(DEBUGL_FUNCTIONS, 0, "check_for_orphaned_hosts()\n");
2329 
2330 	/* get the current time */
2331 	time(&current_time);
2332 
2333 	/* check all hosts... */
2334 	for(temp_host = host_list; temp_host != NULL; temp_host = temp_host->next) {
2335 
2336 		/* skip hosts that don't have a set check interval (on-demand checks are missed by the orphan logic) */
2337 		if(temp_host->next_check == (time_t)0L)
2338 			continue;
2339 
2340 		/* skip hosts that are not currently executing */
2341 		if(temp_host->is_executing == FALSE)
2342 			continue;
2343 
2344 		/* determine the time at which the check results should have come in (allow 10 minutes slack time) */
2345 		expected_time = (time_t)(temp_host->next_check + temp_host->latency + host_check_timeout + check_reaper_interval + 600);
2346 
2347 		/* this host was supposed to have executed a while ago, but for some reason the results haven't come back in... */
2348 		if(expected_time < current_time) {
2349 
2350 			/* log a warning */
2351 			logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The check of host '%s' looks like it was orphaned (results never came back).  I'm scheduling an immediate check of the host...\n", temp_host->name);
2352 
2353 			log_debug_info(DEBUGL_CHECKS, 1, "Host '%s' was orphaned, so we're scheduling an immediate check...\n", temp_host->name);
2354 
2355 			/* decrement the number of running host checks */
2356 			if(currently_running_host_checks > 0)
2357 				currently_running_host_checks--;
2358 
2359 			/* disable the executing flag */
2360 			temp_host->is_executing = FALSE;
2361 
2362 			/* schedule an immediate check of the host */
2363 			schedule_host_check(temp_host, current_time, CHECK_OPTION_ORPHAN_CHECK);
2364 			}
2365 
2366 		}
2367 
2368 	return;
2369 	}
2370 
2371 
2372 
2373 /* check freshness of host results */
check_host_result_freshness(void)2374 void check_host_result_freshness(void) {
2375 	host *temp_host = NULL;
2376 	time_t current_time = 0L;
2377 
2378 
2379 	log_debug_info(DEBUGL_FUNCTIONS, 0, "check_host_result_freshness()\n");
2380 	log_debug_info(DEBUGL_CHECKS, 2, "Attempting to check the freshness of host check results...\n");
2381 
2382 	/* bail out if we're not supposed to be checking freshness */
2383 	if(check_host_freshness == FALSE) {
2384 		log_debug_info(DEBUGL_CHECKS, 2, "Host freshness checking is disabled.\n");
2385 		return;
2386 		}
2387 
2388 	/* get the current time */
2389 	time(&current_time);
2390 
2391 	/* check all hosts... */
2392 	for(temp_host = host_list; temp_host != NULL; temp_host = temp_host->next) {
2393 
2394 		/* skip hosts we shouldn't be checking for freshness */
2395 		if(temp_host->check_freshness == FALSE)
2396 			continue;
2397 
2398 		/* skip hosts that have both active and passive checks disabled */
2399 		if(temp_host->checks_enabled == FALSE && temp_host->accept_passive_host_checks == FALSE)
2400 			continue;
2401 
2402 		/* skip hosts that are currently executing (problems here will be caught by orphaned host check) */
2403 		if(temp_host->is_executing == TRUE)
2404 			continue;
2405 
2406 		/* skip hosts that are already being freshened */
2407 		if(temp_host->is_being_freshened == TRUE)
2408 			continue;
2409 
2410 		/* see if the time is right... */
2411 		if(check_time_against_period(current_time, temp_host->check_period_ptr) == ERROR)
2412 			continue;
2413 
2414 		/* the results for the last check of this host are stale */
2415 		if(is_host_result_fresh(temp_host, current_time, TRUE) == FALSE) {
2416 
2417 			/* set the freshen flag */
2418 			temp_host->is_being_freshened = TRUE;
2419 
2420 			/* schedule an immediate forced check of the host */
2421 			schedule_host_check(temp_host, current_time, CHECK_OPTION_FORCE_EXECUTION | CHECK_OPTION_FRESHNESS_CHECK);
2422 			}
2423 		}
2424 
2425 	return;
2426 	}
2427 
2428 
2429 
2430 /* checks to see if a hosts's check results are fresh */
is_host_result_fresh(host * temp_host,time_t current_time,int log_this)2431 int is_host_result_fresh(host *temp_host, time_t current_time, int log_this) {
2432 	time_t expiration_time = 0L;
2433 	int freshness_threshold = 0;
2434 	int days = 0;
2435 	int hours = 0;
2436 	int minutes = 0;
2437 	int seconds = 0;
2438 	int tdays = 0;
2439 	int thours = 0;
2440 	int tminutes = 0;
2441 	int tseconds = 0;
2442 	double interval = 0;
2443 
2444 	log_debug_info(DEBUGL_CHECKS, 2, "Checking freshness of host '%s'...\n", temp_host->name);
2445 
2446 	/* use user-supplied freshness threshold or auto-calculate a freshness threshold to use? */
2447 	if(temp_host->freshness_threshold == 0) {
2448 		if(temp_host->state_type == HARD_STATE || temp_host->current_state == STATE_OK) {
2449 			interval = temp_host->check_interval;
2450 			}
2451 		else {
2452 			interval = temp_host->retry_interval;
2453 			}
2454 		freshness_threshold = (interval * interval_length) + temp_host->latency + additional_freshness_latency;
2455 		}
2456 	else
2457 		freshness_threshold = temp_host->freshness_threshold;
2458 
2459 	log_debug_info(DEBUGL_CHECKS, 2, "Freshness thresholds: host=%d, use=%d\n", temp_host->freshness_threshold, freshness_threshold);
2460 
2461 	/* calculate expiration time */
2462 	/*
2463 	 * CHANGED 11/10/05 EG:
2464 	 * program start is only used in expiration time calculation
2465 	 * if > last check AND active checks are enabled, so active checks
2466 	 * can become stale immediately upon program startup
2467 	 */
2468 	if(temp_host->has_been_checked == FALSE)
2469 		expiration_time = (time_t)(event_start + freshness_threshold);
2470 	/*
2471 	 * CHANGED 06/19/07 EG:
2472 	 * Per Ton's suggestion (and user requests), only use program start
2473 	 * time over last check if no specific threshold has been set by user.
2474 	 * Problems can occur if Nagios is restarted more frequently that
2475 	 * freshness threshold intervals (hosts never go stale).
2476 	 */
2477 	/*
2478 	 * CHANGED 10/07/07 EG:
2479 	 * Added max_host_check_spread to expiration time as suggested by
2480 	 * Altinity
2481 	 */
2482 	else if(temp_host->checks_enabled == TRUE && event_start > temp_host->last_check && temp_host->freshness_threshold == 0)
2483 		expiration_time = (time_t)(event_start + freshness_threshold + (max_host_check_spread * interval_length));
2484 	else
2485 		expiration_time = (time_t)(temp_host->last_check + freshness_threshold);
2486 
2487 	/*
2488 	 * If the check was last done passively, we assume it's going
2489 	 * to continue that way and we need to handle the fact that
2490 	 * Nagios might have been shut off for quite a long time. If so,
2491 	 * we mustn't spam freshness notifications but use event_start
2492 	 * instead of last_check to determine freshness expiration time.
2493 	 * The threshold for "long time" is determined as 61.8% of the normal
2494 	 * freshness threshold based on vast heuristical research (ie, "some
2495 	 * guy once told me the golden ratio is good for loads of stuff").
2496 	 */
2497 	if(temp_host->check_type == HOST_CHECK_PASSIVE) {
2498 		if(temp_host->last_check < event_start &&
2499 		        event_start - last_program_stop > freshness_threshold * 0.618) {
2500 			expiration_time = event_start + freshness_threshold;
2501 			}
2502 		}
2503 
2504 	log_debug_info(DEBUGL_CHECKS, 2, "HBC: %d, PS: %lu, ES: %lu, LC: %lu, CT: %lu, ET: %lu\n", temp_host->has_been_checked, (unsigned long)program_start, (unsigned long)event_start, (unsigned long)temp_host->last_check, (unsigned long)current_time, (unsigned long)expiration_time);
2505 
2506 	/* the results for the last check of this host are stale */
2507 	if(expiration_time < current_time) {
2508 
2509 		get_time_breakdown((current_time - expiration_time), &days, &hours, &minutes, &seconds);
2510 		get_time_breakdown(freshness_threshold, &tdays, &thours, &tminutes, &tseconds);
2511 
2512 		/* log a warning */
2513 		if(log_this == TRUE)
2514 			logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The results of host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds).  I'm forcing an immediate check of the host.\n", temp_host->name, days, hours, minutes, seconds, tdays, thours, tminutes, tseconds);
2515 
2516 		log_debug_info(DEBUGL_CHECKS, 1, "Check results for host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds).  Forcing an immediate check of the host...\n", temp_host->name, days, hours, minutes, seconds, tdays, thours, tminutes, tseconds);
2517 
2518 		return FALSE;
2519 		}
2520 	else
2521 		log_debug_info(DEBUGL_CHECKS, 1, "Check results for host '%s' are fresh.\n", temp_host->name);
2522 
2523 	return TRUE;
2524 	}
2525 
2526 
2527 
2528 /******************************************************************/
2529 /************* NAGIOS 3.X ROUTE/HOST CHECK FUNCTIONS **************/
2530 /******************************************************************/
2531 
2532 
2533 /*** ON-DEMAND HOST CHECKS USE THIS FUNCTION ***/
2534 /* check to see if we can reach the host */
perform_on_demand_host_check_3x(host * hst,int * check_result_code,int check_options,int use_cached_result,unsigned long check_timestamp_horizon)2535 int perform_on_demand_host_check_3x(host *hst, int *check_result_code, int check_options, int use_cached_result, unsigned long check_timestamp_horizon) {
2536 	int result = OK;
2537 
2538 	log_debug_info(DEBUGL_FUNCTIONS, 0, "perform_on_demand_host_check_3x()\n");
2539 
2540 	/* make sure we have a host */
2541 	if(hst == NULL)
2542 		return ERROR;
2543 
2544 	log_debug_info(DEBUGL_CHECKS, 0, "** On-demand check for host '%s'...\n", hst->name);
2545 
2546 	/* check the status of the host */
2547 	result = run_sync_host_check_3x(hst, check_result_code, check_options, use_cached_result, check_timestamp_horizon);
2548 
2549 	return result;
2550 	}
2551 
2552 
2553 
2554 /* perform a synchronous check of a host */
2555 /* on-demand host checks will use this... */
run_sync_host_check_3x(host * hst,int * check_result_code,int check_options,int use_cached_result,unsigned long check_timestamp_horizon)2556 int run_sync_host_check_3x(host *hst, int *check_result_code, int check_options, int use_cached_result, unsigned long check_timestamp_horizon) {
2557 	int result = OK;
2558 	time_t current_time = 0L;
2559 	int host_result = HOST_UP;
2560 	char *old_plugin_output = NULL;
2561 	struct timeval start_time;
2562 	struct timeval end_time;
2563 
2564 
2565 	log_debug_info(DEBUGL_FUNCTIONS, 0, "run_sync_host_check_3x()\n");
2566 
2567 	/* make sure we have a host */
2568 	if(hst == NULL)
2569 		return ERROR;
2570 
2571 	log_debug_info(DEBUGL_CHECKS, 0, "** Run sync check of host '%s'...\n", hst->name);
2572 
2573 	/* is the host check viable at this time? */
2574 	/* if not, return current state and bail out */
2575 	if(check_host_check_viability_3x(hst, check_options, NULL, NULL) == ERROR) {
2576 		if(check_result_code)
2577 			*check_result_code = hst->current_state;
2578 		log_debug_info(DEBUGL_CHECKS, 0, "Host check is not viable at this time.\n");
2579 		return OK;
2580 		}
2581 
2582 	/* get the current time */
2583 	time(&current_time);
2584 
2585 	/* high resolution start time for event broker */
2586 	gettimeofday(&start_time, NULL);
2587 
2588 	/* can we use the last cached host state? */
2589 	if(use_cached_result == TRUE && !(check_options & CHECK_OPTION_FORCE_EXECUTION)) {
2590 
2591 		/* we can used the cached result, so return it and get out of here... */
2592 		if(hst->has_been_checked == TRUE && ((current_time - hst->last_check) <= check_timestamp_horizon)) {
2593 			if(check_result_code)
2594 				*check_result_code = hst->current_state;
2595 
2596 			log_debug_info(DEBUGL_CHECKS, 1, "* Using cached host state: %d\n", hst->current_state);
2597 
2598 			/* update check statistics */
2599 			update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS, current_time);
2600 			update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS, current_time);
2601 
2602 			return OK;
2603 			}
2604 		}
2605 
2606 
2607 	log_debug_info(DEBUGL_CHECKS, 1, "* Running actual host check: old state=%d\n", hst->current_state);
2608 
2609 
2610 	/******** GOOD TO GO FOR A REAL HOST CHECK AT THIS POINT ********/
2611 
2612 	/* update check statistics */
2613 	update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS, current_time);
2614 	update_check_stats(SERIAL_HOST_CHECK_STATS, start_time.tv_sec);
2615 
2616 	/* reset host check latency, since on-demand checks have none */
2617 	hst->latency = 0.0;
2618 
2619 	/* adjust host check attempt */
2620 	adjust_host_check_attempt_3x(hst, TRUE);
2621 
2622 	/* save old host state */
2623 	hst->last_state = hst->current_state;
2624 	if(hst->state_type == HARD_STATE)
2625 		hst->last_hard_state = hst->current_state;
2626 
2627 	/* save old plugin output for state stalking */
2628 	if(hst->plugin_output)
2629 		old_plugin_output = (char *)strdup(hst->plugin_output);
2630 
2631 	/* set the checked flag */
2632 	hst->has_been_checked = TRUE;
2633 
2634 	/* clear the freshness flag */
2635 	hst->is_being_freshened = FALSE;
2636 
2637 	/* clear check options - we don't want old check options retained */
2638 	hst->check_options = CHECK_OPTION_NONE;
2639 
2640 	/* set the check type */
2641 	hst->check_type = HOST_CHECK_ACTIVE;
2642 
2643 
2644 	/*********** EXECUTE THE CHECK AND PROCESS THE RESULTS **********/
2645 
2646 #ifdef USE_EVENT_BROKER
2647 	/* send data to event broker */
2648 	end_time.tv_sec = 0L;
2649 	end_time.tv_usec = 0L;
2650 	broker_host_check(NEBTYPE_HOSTCHECK_INITIATE, NEBFLAG_NONE, NEBATTR_NONE, hst, HOST_CHECK_ACTIVE, hst->current_state, hst->state_type, start_time, end_time, hst->host_check_command, hst->latency, 0.0, host_check_timeout, FALSE, 0, NULL, NULL, NULL, NULL, NULL);
2651 #endif
2652 
2653 	/* execute the host check */
2654 	host_result = execute_sync_host_check_3x(hst);
2655 
2656 	/* process the host check result */
2657 	process_host_check_result_3x(hst, host_result, old_plugin_output, check_options, FALSE, use_cached_result, check_timestamp_horizon);
2658 
2659 	/* free memory */
2660 	my_free(old_plugin_output);
2661 
2662 	log_debug_info(DEBUGL_CHECKS, 1, "* Sync host check done: new state=%d\n", hst->current_state);
2663 
2664 	/* high resolution end time for event broker */
2665 	gettimeofday(&end_time, NULL);
2666 
2667 #ifdef USE_EVENT_BROKER
2668 	/* send data to event broker */
2669 	broker_host_check(NEBTYPE_HOSTCHECK_PROCESSED, NEBFLAG_NONE, NEBATTR_NONE, hst, HOST_CHECK_ACTIVE, hst->current_state, hst->state_type, start_time, end_time, hst->host_check_command, hst->latency, hst->execution_time, host_check_timeout, FALSE, hst->current_state, NULL, hst->plugin_output, hst->long_plugin_output, hst->perf_data, NULL);
2670 #endif
2671 
2672 	return result;
2673 	}
2674 
2675 
2676 
2677 /* run an "alive" check on a host */
2678 /* on-demand host checks will use this... */
execute_sync_host_check_3x(host * hst)2679 int execute_sync_host_check_3x(host *hst) {
2680 	nagios_macros mac;
2681 	int result = STATE_OK;
2682 	int return_result = HOST_UP;
2683 	char *processed_command = NULL;
2684 	char *raw_command = NULL;
2685 	struct timeval start_time;
2686 	struct timeval end_time;
2687 	char *temp_ptr;
2688 	int early_timeout = FALSE;
2689 	double exectime;
2690 	char *temp_plugin_output = NULL;
2691 #ifdef USE_EVENT_BROKER
2692 	int neb_result = OK;
2693 #endif
2694 
2695 
2696 	log_debug_info(DEBUGL_FUNCTIONS, 0, "execute_sync_host_check_3x()\n");
2697 
2698 	if(hst == NULL)
2699 		return HOST_DOWN;
2700 
2701 	log_debug_info(DEBUGL_CHECKS, 0, "** Executing sync check of host '%s'...\n", hst->name);
2702 
2703 #ifdef USE_EVENT_BROKER
2704 	/* initialize start/end times */
2705 	start_time.tv_sec = 0L;
2706 	start_time.tv_usec = 0L;
2707 	end_time.tv_sec = 0L;
2708 	end_time.tv_usec = 0L;
2709 
2710 	/* send data to event broker */
2711 	neb_result = broker_host_check(NEBTYPE_HOSTCHECK_SYNC_PRECHECK, NEBFLAG_NONE, NEBATTR_NONE, hst, HOST_CHECK_ACTIVE, hst->current_state, hst->state_type, start_time, end_time, hst->host_check_command, hst->latency, 0.0, host_check_timeout, FALSE, 0, NULL, NULL, NULL, NULL, NULL);
2712 
2713 	/* neb module wants to cancel the host check - return the current state of the host */
2714 	if(neb_result == NEBERROR_CALLBACKCANCEL)
2715 		return hst->current_state;
2716 
2717 	/* neb module wants to override the host check - perhaps it will check the host itself */
2718 	/* NOTE: if a module does this, it must check the status of the host and populate the data structures BEFORE it returns from the callback! */
2719 	if(neb_result == NEBERROR_CALLBACKOVERRIDE)
2720 		return hst->current_state;
2721 #endif
2722 
2723 	/* grab the host macros */
2724 	memset(&mac, 0, sizeof(mac));
2725 	grab_host_macros_r(&mac, hst);
2726 
2727 	/* high resolution start time for event broker */
2728 	gettimeofday(&start_time, NULL);
2729 
2730 	/* get the last host check time */
2731 	time(&hst->last_check);
2732 
2733 	/* get the raw command line */
2734 	get_raw_command_line_r(&mac, hst->check_command_ptr, hst->host_check_command, &raw_command, 0);
2735 	if(raw_command == NULL) {
2736 		clear_volatile_macros_r(&mac);
2737 		return ERROR;
2738 		}
2739 
2740 	/* process any macros contained in the argument */
2741 	process_macros_r(&mac, raw_command, &processed_command, 0);
2742 	if(processed_command == NULL) {
2743 		my_free(raw_command);
2744 		clear_volatile_macros_r(&mac);
2745 		return ERROR;
2746 		}
2747 
2748 #ifdef USE_EVENT_BROKER
2749 	/* send data to event broker */
2750 	end_time.tv_sec = 0L;
2751 	end_time.tv_usec = 0L;
2752 	broker_host_check(NEBTYPE_HOSTCHECK_RAW_START, NEBFLAG_NONE, NEBATTR_NONE, hst, HOST_CHECK_ACTIVE, return_result, hst->state_type, start_time, end_time, hst->host_check_command, 0.0, 0.0, host_check_timeout, early_timeout, result, processed_command, hst->plugin_output, hst->long_plugin_output, hst->perf_data, NULL);
2753 #endif
2754 
2755 	log_debug_info(DEBUGL_COMMANDS, 1, "Raw host check command: %s\n", raw_command);
2756 	log_debug_info(DEBUGL_COMMANDS, 0, "Processed host check ommand: %s\n", processed_command);
2757 	my_free(raw_command);
2758 
2759 	/* clear plugin output and performance data buffers */
2760 	my_free(hst->plugin_output);
2761 	my_free(hst->long_plugin_output);
2762 	my_free(hst->perf_data);
2763 
2764 	/* run the host check command */
2765 	result = my_system_r(&mac, processed_command, host_check_timeout, &early_timeout, &exectime, &temp_plugin_output, MAX_PLUGIN_OUTPUT_LENGTH);
2766 	clear_volatile_macros_r(&mac);
2767 
2768 	/* if the check timed out, report an error */
2769 	if(early_timeout == TRUE) {
2770 
2771 		my_free(temp_plugin_output);
2772 		asprintf(&temp_plugin_output, "Host check timed out after %d seconds\n", host_check_timeout);
2773 
2774 		/* log the timeout */
2775 		logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Host check command '%s' for host '%s' timed out after %d seconds\n", processed_command, hst->name, host_check_timeout);
2776 		}
2777 
2778 	/* calculate total execution time */
2779 	hst->execution_time = exectime;
2780 
2781 	/* record check type */
2782 	hst->check_type = HOST_CHECK_ACTIVE;
2783 
2784 	/* parse the output: short and long output, and perf data */
2785 	parse_check_output(temp_plugin_output, &hst->plugin_output, &hst->long_plugin_output, &hst->perf_data, TRUE, TRUE);
2786 
2787 	/* free memory */
2788 	my_free(temp_plugin_output);
2789 	my_free(processed_command);
2790 
2791 	/* a NULL host check command means we should assume the host is UP */
2792 	if(hst->host_check_command == NULL) {
2793 		my_free(hst->plugin_output);
2794 		hst->plugin_output = (char *)strdup("(Host assumed to be UP)");
2795 		result = STATE_OK;
2796 		}
2797 
2798 	/* make sure we have some data */
2799 	if(hst->plugin_output == NULL || !strcmp(hst->plugin_output, "")) {
2800 		my_free(hst->plugin_output);
2801 		hst->plugin_output = (char *)strdup("(No output returned from host check)");
2802 		}
2803 
2804 	/* replace semicolons in plugin output (but not performance data) with colons */
2805 	if((temp_ptr = hst->plugin_output)) {
2806 		while((temp_ptr = strchr(temp_ptr, ';')))
2807 			* temp_ptr = ':';
2808 		}
2809 
2810 	/* if we're not doing aggressive host checking, let WARNING states indicate the host is up (fake the result to be STATE_OK) */
2811 	if(use_aggressive_host_checking == FALSE && result == STATE_WARNING)
2812 		result = STATE_OK;
2813 
2814 
2815 	if(result == STATE_OK)
2816 		return_result = HOST_UP;
2817 	else
2818 		return_result = HOST_DOWN;
2819 
2820 	/* high resolution end time for event broker */
2821 	gettimeofday(&end_time, NULL);
2822 
2823 #ifdef USE_EVENT_BROKER
2824 	/* send data to event broker */
2825 	broker_host_check(NEBTYPE_HOSTCHECK_RAW_END, NEBFLAG_NONE, NEBATTR_NONE, hst, HOST_CHECK_ACTIVE, return_result, hst->state_type, start_time, end_time, hst->host_check_command, 0.0, exectime, host_check_timeout, early_timeout, result, processed_command, hst->plugin_output, hst->long_plugin_output, hst->perf_data, NULL);
2826 #endif
2827 
2828 	log_debug_info(DEBUGL_CHECKS, 0, "** Sync host check done: state=%d\n", return_result);
2829 
2830 	return return_result;
2831 	}
2832 
2833 
2834 
2835 /* run a scheduled host check asynchronously */
run_scheduled_host_check_3x(host * hst,int check_options,double latency)2836 int run_scheduled_host_check_3x(host *hst, int check_options, double latency) {
2837 	int result = OK;
2838 	time_t current_time = 0L;
2839 	time_t preferred_time = 0L;
2840 	time_t next_valid_time = 0L;
2841 	int time_is_valid = TRUE;
2842 
2843 
2844 	log_debug_info(DEBUGL_FUNCTIONS, 0, "run_scheduled_host_check_3x()\n");
2845 
2846 	if(hst == NULL)
2847 		return ERROR;
2848 
2849 	log_debug_info(DEBUGL_CHECKS, 0, "Attempting to run scheduled check of host '%s': check options=%d, latency=%lf\n", hst->name, check_options, latency);
2850 
2851 	/*
2852 	 * reset the next_check_event so we know this host
2853 	 * check is no longer in the scheduling queue
2854 	 */
2855 	hst->next_check_event = NULL;
2856 
2857 	/* attempt to run the check */
2858 	result = run_async_host_check_3x(hst, check_options, latency, TRUE, TRUE, &time_is_valid, &preferred_time);
2859 
2860 	/* an error occurred, so reschedule the check */
2861 	if(result == ERROR) {
2862 
2863 		log_debug_info(DEBUGL_CHECKS, 1, "Unable to run scheduled host check at this time\n");
2864 
2865 		/* only attempt to (re)schedule checks that should get checked... */
2866 		if(hst->should_be_scheduled == TRUE) {
2867 
2868 			/* get current time */
2869 			time(&current_time);
2870 
2871 			/* determine next time we should check the host if needed */
2872 			/* if host has no check interval, schedule it again for 5 minutes from now */
2873 			if(current_time >= preferred_time)
2874 				preferred_time = current_time + ((hst->check_interval <= 0) ? 300 : (hst->check_interval * interval_length));
2875 
2876 			/* make sure we rescheduled the next host check at a valid time */
2877 			get_next_valid_time(preferred_time, &next_valid_time, hst->check_period_ptr);
2878 
2879 			/* the host could not be rescheduled properly - set the next check time for next week */
2880 			if(time_is_valid == FALSE && next_valid_time == preferred_time) {
2881 
2882 				/*
2883 				hst->next_check=(time_t)(next_valid_time+(60*60*24*365));
2884 				hst->should_be_scheduled=FALSE;
2885 				*/
2886 
2887 				hst->next_check = (time_t)(next_valid_time + (60 * 60 * 24 * 7));
2888 
2889 				logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Check of host '%s' could not be rescheduled properly.  Scheduling check for next week...\n", hst->name);
2890 
2891 				log_debug_info(DEBUGL_CHECKS, 1, "Unable to find any valid times to reschedule the next host check!\n");
2892 				}
2893 
2894 			/* this service could be rescheduled... */
2895 			else {
2896 				hst->next_check = next_valid_time;
2897 				hst->should_be_scheduled = TRUE;
2898 
2899 				log_debug_info(DEBUGL_CHECKS, 1, "Rescheduled next host check for %s", ctime(&next_valid_time));
2900 				}
2901 			}
2902 
2903 		/* update the status log */
2904 		update_host_status(hst, FALSE);
2905 
2906 		/* reschedule the next host check - unless we couldn't find a valid next check time */
2907 		/* 10/19/07 EG - keep original check options */
2908 		if(hst->should_be_scheduled == TRUE)
2909 			schedule_host_check(hst, hst->next_check, check_options);
2910 
2911 		return ERROR;
2912 		}
2913 
2914 	return OK;
2915 	}
2916 
2917 
2918 
2919 /* perform an asynchronous check of a host */
2920 /* scheduled host checks will use this, as will some checks that result from on-demand checks... */
run_async_host_check_3x(host * hst,int check_options,double latency,int scheduled_check,int reschedule_check,int * time_is_valid,time_t * preferred_time)2921 int run_async_host_check_3x(host *hst, int check_options, double latency, int scheduled_check, int reschedule_check, int *time_is_valid, time_t *preferred_time) {
2922 	nagios_macros mac;
2923 	char *raw_command = NULL;
2924 	char *processed_command = NULL;
2925 	char output_buffer[MAX_INPUT_BUFFER] = "";
2926 	char *temp_buffer = NULL;
2927 	struct timeval start_time, end_time;
2928 	pid_t pid = 0;
2929 	int fork_error = FALSE;
2930 	int wait_result = 0;
2931 	FILE *fp = NULL;
2932 	int pclose_result = 0;
2933 	mode_t new_umask = 077;
2934 	mode_t old_umask;
2935 	char *output_file = NULL;
2936 	double old_latency = 0.0;
2937 	dbuf checkresult_dbuf;
2938 	int dbuf_chunk = 1024;
2939 #ifdef USE_EVENT_BROKER
2940 	int neb_result = OK;
2941 #endif
2942 
2943 	log_debug_info(DEBUGL_FUNCTIONS, 0, "run_async_host_check_3x()\n");
2944 
2945 	/* make sure we have a host */
2946 	if(hst == NULL)
2947 		return ERROR;
2948 
2949 	log_debug_info(DEBUGL_CHECKS, 0, "** Running async check of host '%s'...\n", hst->name);
2950 
2951 	/* is the host check viable at this time? */
2952 	if(check_host_check_viability_3x(hst, check_options, time_is_valid, preferred_time) == ERROR)
2953 		return ERROR;
2954 
2955 	/* 08/04/07 EG don't execute a new host check if one is already running */
2956 	if(hst->is_executing == TRUE && !(check_options & CHECK_OPTION_FORCE_EXECUTION)) {
2957 		log_debug_info(DEBUGL_CHECKS, 1, "A check of this host is already being executed, so we'll pass for the moment...\n");
2958 		return ERROR;
2959 		}
2960 
2961 	/******** GOOD TO GO FOR A REAL HOST CHECK AT THIS POINT ********/
2962 
2963 #ifdef USE_EVENT_BROKER
2964 	/* initialize start/end times */
2965 	start_time.tv_sec = 0L;
2966 	start_time.tv_usec = 0L;
2967 	end_time.tv_sec = 0L;
2968 	end_time.tv_usec = 0L;
2969 
2970 	/* send data to event broker */
2971 	neb_result = broker_host_check(NEBTYPE_HOSTCHECK_ASYNC_PRECHECK, NEBFLAG_NONE, NEBATTR_NONE, hst, HOST_CHECK_ACTIVE, hst->current_state, hst->state_type, start_time, end_time, hst->host_check_command, hst->latency, 0.0, host_check_timeout, FALSE, 0, NULL, NULL, NULL, NULL, NULL);
2972 
2973 	/* neb module wants to cancel the host check - the check will be rescheduled for a later time by the scheduling logic */
2974 	if(neb_result == NEBERROR_CALLBACKCANCEL)
2975 		return ERROR;
2976 
2977 	/* neb module wants to override the host check - perhaps it will check the host itself */
2978 	/* NOTE: if a module does this, it has to do a lot of the stuff found below to make sure things don't get whacked out of shape! */
2979 	if(neb_result == NEBERROR_CALLBACKOVERRIDE)
2980 		return OK;
2981 #endif
2982 
2983 	log_debug_info(DEBUGL_CHECKS, 0, "Checking host '%s'...\n", hst->name);
2984 
2985 	/* clear check options - we don't want old check options retained */
2986 	/* only clear options if this was a scheduled check - on demand check options shouldn't affect retained info */
2987 	if(scheduled_check == TRUE)
2988 		hst->check_options = CHECK_OPTION_NONE;
2989 
2990 	/* adjust host check attempt */
2991 	adjust_host_check_attempt_3x(hst, TRUE);
2992 
2993 	/* set latency (temporarily) for macros and event broker */
2994 	old_latency = hst->latency;
2995 	hst->latency = latency;
2996 
2997 	/* grab the host macro variables */
2998 	memset(&mac, 0, sizeof(mac));
2999 	grab_host_macros_r(&mac, hst);
3000 
3001 	/* get the raw command line */
3002 	get_raw_command_line_r(&mac, hst->check_command_ptr, hst->host_check_command, &raw_command, 0);
3003 	if(raw_command == NULL) {
3004 		clear_volatile_macros_r(&mac);
3005 		log_debug_info(DEBUGL_CHECKS, 0, "Raw check command for host '%s' was NULL - aborting.\n", hst->name);
3006 		return ERROR;
3007 		}
3008 
3009 	/* process any macros contained in the argument */
3010 	process_macros_r(&mac, raw_command, &processed_command, 0);
3011 	my_free(raw_command);
3012 	if(processed_command == NULL) {
3013 		clear_volatile_macros_r(&mac);
3014 		log_debug_info(DEBUGL_CHECKS, 0, "Processed check command for host '%s' was NULL - aborting.\n", hst->name);
3015 		return ERROR;
3016 		}
3017 
3018 	/* get the command start time */
3019 	gettimeofday(&start_time, NULL);
3020 
3021 	/* set check time for on-demand checks, so they're not incorrectly detected as being orphaned - Luke Ross 5/16/08 */
3022 	/* NOTE: 06/23/08 EG not sure if there will be side effects to this or not.... */
3023 	if(scheduled_check == FALSE)
3024 		hst->next_check = start_time.tv_sec;
3025 
3026 	/* increment number of host checks that are currently running... */
3027 	currently_running_host_checks++;
3028 
3029 	/* set the execution flag */
3030 	hst->is_executing = TRUE;
3031 
3032 	/* open a temp file for storing check output */
3033 	old_umask = umask(new_umask);
3034 	asprintf(&output_file, "%s/checkXXXXXX", temp_path);
3035 	check_result_info.output_file_fd = mkstemp(output_file);
3036 	if(check_result_info.output_file_fd >= 0)
3037 		check_result_info.output_file_fp = fdopen(check_result_info.output_file_fd, "w");
3038 	else {
3039 		check_result_info.output_file_fp = NULL;
3040 		check_result_info.output_file_fd = -1;
3041 		}
3042 	umask(old_umask);
3043 
3044 	log_debug_info(DEBUGL_CHECKS | DEBUGL_IPC, 1, "Check result output will be written to '%s' (fd=%d)\n", output_file, check_result_info.output_file_fd);
3045 
3046 	/* save check info */
3047 	check_result_info.object_check_type = HOST_CHECK;
3048 	check_result_info.host_name = (char *)strdup(hst->name);
3049 	check_result_info.service_description = NULL;
3050 	check_result_info.check_type = HOST_CHECK_ACTIVE;
3051 	check_result_info.check_options = check_options;
3052 	check_result_info.scheduled_check = scheduled_check;
3053 	check_result_info.reschedule_check = reschedule_check;
3054 	check_result_info.output_file = (check_result_info.output_file_fd < 0 || output_file == NULL) ? NULL : strdup(output_file);
3055 	check_result_info.latency = latency;
3056 	check_result_info.start_time = start_time;
3057 	check_result_info.finish_time = start_time;
3058 	check_result_info.early_timeout = FALSE;
3059 	check_result_info.exited_ok = TRUE;
3060 	check_result_info.return_code = STATE_OK;
3061 	check_result_info.output = NULL;
3062 
3063 	/* free memory */
3064 	my_free(output_file);
3065 
3066 	/* write initial check info to file */
3067 	/* if things go bad later on, the user will at least have something to go on when debugging... */
3068 	if(check_result_info.output_file_fp) {
3069 
3070 		fprintf(check_result_info.output_file_fp, "### Active Check Result File ###\n");
3071 		fprintf(check_result_info.output_file_fp, "file_time=%lu\n", (unsigned long)check_result_info.start_time.tv_sec);
3072 		fprintf(check_result_info.output_file_fp, "\n");
3073 
3074 		fprintf(check_result_info.output_file_fp, "### Nagios Host Check Result ###\n");
3075 		fprintf(check_result_info.output_file_fp, "# Time: %s", ctime(&check_result_info.start_time.tv_sec));
3076 		fprintf(check_result_info.output_file_fp, "host_name=%s\n", check_result_info.host_name);
3077 		fprintf(check_result_info.output_file_fp, "check_type=%d\n", check_result_info.check_type);
3078 		fprintf(check_result_info.output_file_fp, "check_options=%d\n", check_result_info.check_options);
3079 		fprintf(check_result_info.output_file_fp, "scheduled_check=%d\n", check_result_info.scheduled_check);
3080 		fprintf(check_result_info.output_file_fp, "reschedule_check=%d\n", check_result_info.reschedule_check);
3081 		fprintf(check_result_info.output_file_fp, "latency=%f\n", hst->latency);
3082 		fprintf(check_result_info.output_file_fp, "start_time=%lu.%lu\n", check_result_info.start_time.tv_sec, check_result_info.start_time.tv_usec);
3083 
3084 		/* flush buffer or we'll end up writing twice when we fork() */
3085 		fflush(check_result_info.output_file_fp);
3086 		}
3087 
3088 	/* initialize dynamic buffer for storing plugin output */
3089 	dbuf_init(&checkresult_dbuf, dbuf_chunk);
3090 
3091 #ifdef USE_EVENT_BROKER
3092 	/* send data to event broker */
3093 	broker_host_check(NEBTYPE_HOSTCHECK_INITIATE, NEBFLAG_NONE, NEBATTR_NONE, hst, HOST_CHECK_ACTIVE, hst->current_state, hst->state_type, start_time, end_time, hst->host_check_command, hst->latency, 0.0, host_check_timeout, FALSE, 0, processed_command, NULL, NULL, NULL, NULL);
3094 #endif
3095 
3096 	/* reset latency (permanent value for this check will get set later) */
3097 	hst->latency = old_latency;
3098 
3099 	/* update check statistics */
3100 	update_check_stats((scheduled_check == TRUE) ? ACTIVE_SCHEDULED_HOST_CHECK_STATS : ACTIVE_ONDEMAND_HOST_CHECK_STATS, start_time.tv_sec);
3101 	update_check_stats(PARALLEL_HOST_CHECK_STATS, start_time.tv_sec);
3102 
3103 	/* fork a child process */
3104 	pid = fork();
3105 
3106 	/* an error occurred while trying to fork */
3107 	if(pid == -1) {
3108 
3109 		fork_error = TRUE;
3110 
3111 		/* log an error */
3112 		logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: The check of host '%s' could not be performed due to a fork() error: '%s'.\n", hst->name, strerror(errno));
3113 
3114 		log_debug_info(DEBUGL_CHECKS, 0, "Check of host '%s' could not be performed due to a fork() error: '%s'!\n", hst->name, strerror(errno));
3115 		}
3116 
3117 	/* if we are in the child process... */
3118 	else if(pid == 0) {
3119 
3120 		/* set environment variables */
3121 		set_all_macro_environment_vars_r(&mac, TRUE);
3122 
3123 		/* ADDED 11/12/07 EG */
3124 		/* close external command file and shut down worker thread */
3125 		close_command_file();
3126 
3127 		/* fork again if we're not in a large installation */
3128 		if(child_processes_fork_twice == TRUE) {
3129 
3130 			/* fork again... */
3131 			pid = fork();
3132 
3133 			/* an error occurred while trying to fork again */
3134 			if(pid == -1)
3135 				exit(STATE_UNKNOWN);
3136 			}
3137 
3138 		/* the grandchild (or child if large install tweaks are enabled) process should run the host check... */
3139 		if(pid == 0 || child_processes_fork_twice == FALSE) {
3140 
3141 			/* reset signal handling */
3142 			reset_sighandler();
3143 
3144 			/* become the process group leader */
3145 			setpgid(0, 0);
3146 
3147 			/* exit on term signals at this process level */
3148 			signal(SIGTERM, SIG_DFL);
3149 
3150 			/* catch plugins that don't finish in a timely manner */
3151 			signal(SIGALRM, host_check_sighandler);
3152 			alarm(host_check_timeout);
3153 
3154 			/* disable rotation of the debug file */
3155 			max_debug_file_size = 0L;
3156 
3157 			/* run the plugin check command */
3158 			fp = popen(processed_command, "r");
3159 			if(fp == NULL)
3160 				_exit(STATE_UNKNOWN);
3161 
3162 			/* initialize buffer */
3163 			strcpy(output_buffer, "");
3164 
3165 			/* get all lines of plugin output - escape newlines */
3166 			while(fgets(output_buffer, sizeof(output_buffer) - 1, fp)) {
3167 				temp_buffer = escape_newlines(output_buffer);
3168 				dbuf_strcat(&checkresult_dbuf, temp_buffer);
3169 				my_free(temp_buffer);
3170 				}
3171 
3172 			/* close the process */
3173 			pclose_result = pclose(fp);
3174 
3175 			/* reset the alarm and signal handling here */
3176 			signal(SIGALRM, SIG_IGN);
3177 			alarm(0);
3178 
3179 			/* get the check finish time */
3180 			gettimeofday(&end_time, NULL);
3181 
3182 			/* record check result info */
3183 			check_result_info.finish_time = end_time;
3184 			check_result_info.early_timeout = FALSE;
3185 
3186 			/* test for execution error */
3187 			if(pclose_result == -1) {
3188 				pclose_result = STATE_UNKNOWN;
3189 				check_result_info.return_code = STATE_CRITICAL;
3190 				check_result_info.exited_ok = FALSE;
3191 				}
3192 			else {
3193 				if(WEXITSTATUS(pclose_result) == 0 && WIFSIGNALED(pclose_result))
3194 					check_result_info.return_code = 128 + WTERMSIG(pclose_result);
3195 				else
3196 					check_result_info.return_code = WEXITSTATUS(pclose_result);
3197 				}
3198 
3199 			/* write check result to file */
3200 			if(check_result_info.output_file_fp) {
3201 				FILE *fp;
3202 
3203 				/* protect against signal races */
3204 				fp = check_result_info.output_file_fp;
3205 				check_result_info.output_file_fp = NULL;
3206 
3207 				fprintf(fp, "finish_time=%lu.%lu\n", check_result_info.finish_time.tv_sec, check_result_info.finish_time.tv_usec);
3208 				fprintf(fp, "early_timeout=%d\n", check_result_info.early_timeout);
3209 				fprintf(fp, "exited_ok=%d\n", check_result_info.exited_ok);
3210 				fprintf(fp, "return_code=%d\n", check_result_info.return_code);
3211 				fprintf(fp, "output=%s\n", (checkresult_dbuf.buf == NULL) ? "(null)" : checkresult_dbuf.buf);
3212 
3213 				/* close the temp file */
3214 				fclose(fp);
3215 
3216 				/* move check result to queue directory */
3217 				move_check_result_to_queue(check_result_info.output_file);
3218 				}
3219 
3220 			/* free memory */
3221 			dbuf_free(&checkresult_dbuf);
3222 			my_free(processed_command);
3223 
3224 			/* free check result memory */
3225 			free_check_result(&check_result_info);
3226 
3227 			/* return with plugin exit status - not really necessary... */
3228 			_exit(pclose_result);
3229 			}
3230 
3231 		/* NOTE: this code is never reached if large install tweaks are enabled... */
3232 
3233 		/* unset environment variables */
3234 		set_all_macro_environment_vars_r(&mac, FALSE);
3235 
3236 		/* free allocated memory */
3237 		/* this needs to be done last, so we don't free memory for variables before they're used above */
3238 		if(free_child_process_memory == TRUE)
3239 			free_memory(&mac);
3240 
3241 		/* parent exits immediately - grandchild process is inherited by the INIT process, so we have no zombie problem... */
3242 		_exit(STATE_OK);
3243 		}
3244 
3245 	/* else the parent should wait for the first child to return... */
3246 	else if(pid > 0) {
3247 		clear_volatile_macros_r(&mac);
3248 
3249 		log_debug_info(DEBUGL_CHECKS, 2, "Host check is executing in child process (pid=%lu)\n", (unsigned long)pid);
3250 
3251 		/* parent should close output file */
3252 		if(check_result_info.output_file_fp)
3253 			fclose(check_result_info.output_file_fp);
3254 
3255 		/* should this be done in first child process (after spawning grandchild) as well? */
3256 		/* free memory allocated for IPC functionality */
3257 		free_check_result(&check_result_info);
3258 
3259 		/* free memory */
3260 		my_free(processed_command);
3261 
3262 		/* wait for the first child to return */
3263 		/* if large install tweaks are enabled, we'll clean up the zombie process later */
3264 		if(child_processes_fork_twice == TRUE)
3265 			wait_result = waitpid(pid, NULL, 0);
3266 		}
3267 
3268 	/* see if we were able to run the check... */
3269 	if(fork_error == TRUE)
3270 		return ERROR;
3271 
3272 	return OK;
3273 	}
3274 
3275 
3276 
3277 /* process results of an asynchronous host check */
handle_async_host_check_result_3x(host * temp_host,check_result * queued_check_result)3278 int handle_async_host_check_result_3x(host *temp_host, check_result *queued_check_result) {
3279 	time_t current_time;
3280 	int result = STATE_OK;
3281 	int reschedule_check = FALSE;
3282 	char *old_plugin_output = NULL;
3283 	char *temp_ptr = NULL;
3284 	struct timeval start_time_hires;
3285 	struct timeval end_time_hires;
3286 
3287 	log_debug_info(DEBUGL_FUNCTIONS, 0, "handle_async_host_check_result_3x()\n");
3288 
3289 	/* make sure we have what we need */
3290 	if(temp_host == NULL || queued_check_result == NULL)
3291 		return ERROR;
3292 
3293 	time(&current_time);
3294 
3295 	log_debug_info(DEBUGL_CHECKS, 1, "** Handling async check result for host '%s'...\n", temp_host->name);
3296 
3297 	log_debug_info(DEBUGL_CHECKS, 2, "\tCheck Type:         %s\n", (queued_check_result->check_type == HOST_CHECK_ACTIVE) ? "Active" : "Passive");
3298 	log_debug_info(DEBUGL_CHECKS, 2, "\tCheck Options:      %d\n", queued_check_result->check_options);
3299 	log_debug_info(DEBUGL_CHECKS, 2, "\tScheduled Check?:   %s\n", (queued_check_result->scheduled_check == TRUE) ? "Yes" : "No");
3300 	log_debug_info(DEBUGL_CHECKS, 2, "\tReschedule Check?:  %s\n", (queued_check_result->reschedule_check == TRUE) ? "Yes" : "No");
3301 	log_debug_info(DEBUGL_CHECKS, 2, "\tExited OK?:         %s\n", (queued_check_result->exited_ok == TRUE) ? "Yes" : "No");
3302 	log_debug_info(DEBUGL_CHECKS, 2, "\tExec Time:          %.3f\n", temp_host->execution_time);
3303 	log_debug_info(DEBUGL_CHECKS, 2, "\tLatency:            %.3f\n", temp_host->latency);
3304 	log_debug_info(DEBUGL_CHECKS, 2, "\tReturn Status:      %d\n", queued_check_result->return_code);
3305 	log_debug_info(DEBUGL_CHECKS, 2, "\tOutput:             %s\n", (queued_check_result == NULL) ? "NULL" : queued_check_result->output);
3306 
3307 	/* decrement the number of host checks still out there... */
3308 	if(queued_check_result->check_type == HOST_CHECK_ACTIVE && currently_running_host_checks > 0)
3309 		currently_running_host_checks--;
3310 
3311 	/* skip this host check results if its passive and we aren't accepting passive check results */
3312 	if(queued_check_result->check_type == HOST_CHECK_PASSIVE) {
3313 		if(accept_passive_host_checks == FALSE) {
3314 			log_debug_info(DEBUGL_CHECKS, 0, "Discarding passive host check result because passive host checks are disabled globally.\n");
3315 			return ERROR;
3316 			}
3317 		if(temp_host->accept_passive_host_checks == FALSE) {
3318 			log_debug_info(DEBUGL_CHECKS, 0, "Discarding passive host check result because passive checks are disabled for this host.\n");
3319 			return ERROR;
3320 			}
3321 		}
3322 
3323 	/* clear the freshening flag (it would have been set if this host was determined to be stale) */
3324 	if(queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK)
3325 		temp_host->is_being_freshened = FALSE;
3326 
3327 	/* DISCARD INVALID FRESHNESS CHECK RESULTS */
3328 	/* If a host goes stale, Nagios will initiate a forced check in order to freshen it.  There is a race condition whereby a passive check
3329 	   could arrive between the 1) initiation of the forced check and 2) the time when the forced check result is processed here.  This would
3330 	   make the host fresh again, so we do a quick check to make sure the host is still stale before we accept the check result. */
3331 	if((queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK) && is_host_result_fresh(temp_host, current_time, FALSE) == TRUE) {
3332 		log_debug_info(DEBUGL_CHECKS, 0, "Discarding host freshness check result because the host is currently fresh (race condition avoided).\n");
3333 		return OK;
3334 		}
3335 
3336 	/* was this check passive or active? */
3337 	temp_host->check_type = (queued_check_result->check_type == HOST_CHECK_ACTIVE) ? HOST_CHECK_ACTIVE : HOST_CHECK_PASSIVE;
3338 
3339 	/* update check statistics for passive results */
3340 	if(queued_check_result->check_type == HOST_CHECK_PASSIVE)
3341 		update_check_stats(PASSIVE_HOST_CHECK_STATS, queued_check_result->start_time.tv_sec);
3342 
3343 	/* should we reschedule the next check of the host? NOTE: this might be overridden later... */
3344 	reschedule_check = queued_check_result->reschedule_check;
3345 
3346 	/* check latency is passed to us for both active and passive checks */
3347 	temp_host->latency = queued_check_result->latency;
3348 
3349 	/* update the execution time for this check (millisecond resolution) */
3350 	temp_host->execution_time = (double)((double)(queued_check_result->finish_time.tv_sec - queued_check_result->start_time.tv_sec) + (double)((queued_check_result->finish_time.tv_usec - queued_check_result->start_time.tv_usec) / 1000.0) / 1000.0);
3351 	if(temp_host->execution_time < 0.0)
3352 		temp_host->execution_time = 0.0;
3353 
3354 	/* set the checked flag */
3355 	temp_host->has_been_checked = TRUE;
3356 
3357 	/* clear the execution flag if this was an active check */
3358 	if(queued_check_result->check_type == HOST_CHECK_ACTIVE)
3359 		temp_host->is_executing = FALSE;
3360 
3361 	/* get the last check time */
3362 	temp_host->last_check = queued_check_result->start_time.tv_sec;
3363 
3364 	/* was this check passive or active? */
3365 	temp_host->check_type = (queued_check_result->check_type == HOST_CHECK_ACTIVE) ? HOST_CHECK_ACTIVE : HOST_CHECK_PASSIVE;
3366 
3367 	/* save the old host state */
3368 	temp_host->last_state = temp_host->current_state;
3369 	if(temp_host->state_type == HARD_STATE)
3370 		temp_host->last_hard_state = temp_host->current_state;
3371 
3372 	/* save old plugin output */
3373 	if(temp_host->plugin_output)
3374 		old_plugin_output = (char *)strdup(temp_host->plugin_output);
3375 
3376 	/* clear the old plugin output and perf data buffers */
3377 	my_free(temp_host->plugin_output);
3378 	my_free(temp_host->long_plugin_output);
3379 	my_free(temp_host->perf_data);
3380 
3381 	/* parse check output to get: (1) short output, (2) long output, (3) perf data */
3382 	parse_check_output(queued_check_result->output, &temp_host->plugin_output, &temp_host->long_plugin_output, &temp_host->perf_data, TRUE, TRUE);
3383 
3384 	/* make sure we have some data */
3385 	if(temp_host->plugin_output == NULL || !strcmp(temp_host->plugin_output, "")) {
3386 		my_free(temp_host->plugin_output);
3387 		temp_host->plugin_output = (char *)strdup("(No output returned from host check)");
3388 		}
3389 
3390 	/* replace semicolons in plugin output (but not performance data) with colons */
3391 	if((temp_ptr = temp_host->plugin_output)) {
3392 		while((temp_ptr = strchr(temp_ptr, ';')))
3393 			* temp_ptr = ':';
3394 		}
3395 
3396 	log_debug_info(DEBUGL_CHECKS, 2, "Parsing check output...\n");
3397 	log_debug_info(DEBUGL_CHECKS, 2, "Short Output: %s\n", (temp_host->plugin_output == NULL) ? "NULL" : temp_host->plugin_output);
3398 	log_debug_info(DEBUGL_CHECKS, 2, "Long Output:  %s\n", (temp_host->long_plugin_output == NULL) ? "NULL" : temp_host->long_plugin_output);
3399 	log_debug_info(DEBUGL_CHECKS, 2, "Perf Data:    %s\n", (temp_host->perf_data == NULL) ? "NULL" : temp_host->perf_data);
3400 
3401 	/* get the unprocessed return code */
3402 	/* NOTE: for passive checks, this is the final/processed state */
3403 	result = queued_check_result->return_code;
3404 
3405 	/* adjust return code (active checks only) */
3406 	if(queued_check_result->check_type == HOST_CHECK_ACTIVE) {
3407 
3408 		/* if there was some error running the command, just skip it (this shouldn't be happening) */
3409 		if(queued_check_result->exited_ok == FALSE) {
3410 
3411 			logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning:  Check of host '%s' did not exit properly!\n", temp_host->name);
3412 
3413 			my_free(temp_host->plugin_output);
3414 			my_free(temp_host->long_plugin_output);
3415 			my_free(temp_host->perf_data);
3416 
3417 			temp_host->plugin_output = (char *)strdup("(Host check did not exit properly)");
3418 
3419 			result = STATE_CRITICAL;
3420 			}
3421 
3422 		/* make sure the return code is within bounds */
3423 		else if(queued_check_result->return_code < 0 || queued_check_result->return_code > 3) {
3424 
3425 			logit(NSLOG_RUNTIME_WARNING, TRUE, "Warning: Return code of %d for check of host '%s' was out of bounds.%s\n", queued_check_result->return_code, temp_host->name, (queued_check_result->return_code == 126 || queued_check_result->return_code == 127) ? " Make sure the plugin you're trying to run actually exists." : "");
3426 
3427 			my_free(temp_host->plugin_output);
3428 			my_free(temp_host->long_plugin_output);
3429 			my_free(temp_host->perf_data);
3430 
3431 			asprintf(&temp_host->plugin_output, "(Return code of %d is out of bounds%s)", queued_check_result->return_code, (queued_check_result->return_code == 126 || queued_check_result->return_code == 127) ? " - plugin may be missing" : "");
3432 
3433 			result = STATE_CRITICAL;
3434 			}
3435 
3436 		/* a NULL host check command means we should assume the host is UP */
3437 		if(temp_host->host_check_command == NULL) {
3438 			my_free(temp_host->plugin_output);
3439 			temp_host->plugin_output = (char *)strdup("(Host assumed to be UP)");
3440 			result = STATE_OK;
3441 			}
3442 		}
3443 
3444 	/* translate return code to basic UP/DOWN state - the DOWN/UNREACHABLE state determination is made later */
3445 	/* NOTE: only do this for active checks - passive check results already have the final state */
3446 	if(queued_check_result->check_type == HOST_CHECK_ACTIVE) {
3447 
3448 		/* if we're not doing aggressive host checking, let WARNING states indicate the host is up (fake the result to be STATE_OK) */
3449 		if(use_aggressive_host_checking == FALSE && result == STATE_WARNING)
3450 			result = STATE_OK;
3451 
3452 		/* OK states means the host is UP */
3453 		if(result == STATE_OK)
3454 			result = HOST_UP;
3455 
3456 		/* any problem state indicates the host is not UP */
3457 		else
3458 			result = HOST_DOWN;
3459 		}
3460 
3461 
3462 	/******************* PROCESS THE CHECK RESULTS ******************/
3463 
3464 	/* process the host check result */
3465 	process_host_check_result_3x(temp_host, result, old_plugin_output, CHECK_OPTION_NONE, reschedule_check, TRUE, cached_host_check_horizon);
3466 
3467 	/* free memory */
3468 	my_free(old_plugin_output);
3469 
3470 	log_debug_info(DEBUGL_CHECKS, 1, "** Async check result for host '%s' handled: new state=%d\n", temp_host->name, temp_host->current_state);
3471 
3472 	/* high resolution start time for event broker */
3473 	start_time_hires = queued_check_result->start_time;
3474 
3475 	/* high resolution end time for event broker */
3476 	gettimeofday(&end_time_hires, NULL);
3477 
3478 #ifdef USE_EVENT_BROKER
3479 	/* send data to event broker */
3480 	broker_host_check(NEBTYPE_HOSTCHECK_PROCESSED, NEBFLAG_NONE, NEBATTR_NONE, temp_host, temp_host->check_type, temp_host->current_state, temp_host->state_type, start_time_hires, end_time_hires, temp_host->host_check_command, temp_host->latency, temp_host->execution_time, host_check_timeout, queued_check_result->early_timeout, queued_check_result->return_code, NULL, temp_host->plugin_output, temp_host->long_plugin_output, temp_host->perf_data, NULL);
3481 #endif
3482 
3483 	return OK;
3484 	}
3485 
3486 
3487 
3488 /* processes the result of a synchronous or asynchronous host check */
process_host_check_result_3x(host * hst,int new_state,char * old_plugin_output,int check_options,int reschedule_check,int use_cached_result,unsigned long check_timestamp_horizon)3489 int process_host_check_result_3x(host *hst, int new_state, char *old_plugin_output, int check_options, int reschedule_check, int use_cached_result, unsigned long check_timestamp_horizon) {
3490 	hostsmember *temp_hostsmember = NULL;
3491 	host *child_host = NULL;
3492 	host *parent_host = NULL;
3493 	host *master_host = NULL;
3494 	host *temp_host = NULL;
3495 	hostdependency *temp_dependency = NULL;
3496 	objectlist *check_hostlist = NULL;
3497 	objectlist *hostlist_item = NULL;
3498 	int parent_state = HOST_UP;
3499 	time_t current_time = 0L;
3500 	time_t next_check = 0L;
3501 	time_t preferred_time = 0L;
3502 	time_t next_valid_time = 0L;
3503 	int run_async_check = TRUE;
3504 	void *ptr = NULL;
3505 
3506 
3507 	log_debug_info(DEBUGL_FUNCTIONS, 0, "process_host_check_result_3x()\n");
3508 
3509 	log_debug_info(DEBUGL_CHECKS, 1, "HOST: %s, ATTEMPT=%d/%d, CHECK TYPE=%s, STATE TYPE=%s, OLD STATE=%d, NEW STATE=%d\n", hst->name, hst->current_attempt, hst->max_attempts, (hst->check_type == HOST_CHECK_ACTIVE) ? "ACTIVE" : "PASSIVE", (hst->state_type == HARD_STATE) ? "HARD" : "SOFT", hst->current_state, new_state);
3510 
3511 	/* get the current time */
3512 	time(&current_time);
3513 
3514 	/* default next check time */
3515 	next_check = (unsigned long)(current_time + (hst->check_interval * interval_length));
3516 
3517 	/* we have to adjust current attempt # for passive checks, as it isn't done elsewhere */
3518 	if(hst->check_type == HOST_CHECK_PASSIVE && passive_host_checks_are_soft == TRUE)
3519 		adjust_host_check_attempt_3x(hst, FALSE);
3520 
3521 	/* log passive checks - we need to do this here, as some my bypass external commands by getting dropped in checkresults dir */
3522 	if(hst->check_type == HOST_CHECK_PASSIVE) {
3523 		if(log_passive_checks == TRUE)
3524 			logit(NSLOG_PASSIVE_CHECK, FALSE, "PASSIVE HOST CHECK: %s;%d;%s\n", hst->name, new_state, hst->plugin_output);
3525 		}
3526 
3527 
3528 	/******* HOST WAS DOWN/UNREACHABLE INITIALLY *******/
3529 	if(hst->current_state != HOST_UP) {
3530 
3531 		log_debug_info(DEBUGL_CHECKS, 1, "Host was DOWN/UNREACHABLE.\n");
3532 
3533 		/***** HOST IS NOW UP *****/
3534 		/* the host just recovered! */
3535 		if(new_state == HOST_UP) {
3536 
3537 			/* set the current state */
3538 			hst->current_state = HOST_UP;
3539 
3540 			/* set the state type */
3541 			/* set state type to HARD for passive checks and active checks that were previously in a HARD STATE */
3542 			if(hst->state_type == HARD_STATE || (hst->check_type == HOST_CHECK_PASSIVE && passive_host_checks_are_soft == FALSE))
3543 				hst->state_type = HARD_STATE;
3544 			else
3545 				hst->state_type = SOFT_STATE;
3546 
3547 			log_debug_info(DEBUGL_CHECKS, 1, "Host experienced a %s recovery (it's now UP).\n", (hst->state_type == HARD_STATE) ? "HARD" : "SOFT");
3548 
3549 			/* reschedule the next check of the host at the normal interval */
3550 			reschedule_check = TRUE;
3551 			next_check = (unsigned long)(current_time + (hst->check_interval * interval_length));
3552 
3553 			/* propagate checks to immediate parents if they are not already UP */
3554 			/* we do this because a parent host (or grandparent) may have recovered somewhere and we should catch the recovery as soon as possible */
3555 			log_debug_info(DEBUGL_CHECKS, 1, "Propagating checks to parent host(s)...\n");
3556 
3557 			for(temp_hostsmember = hst->parent_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
3558 				if((parent_host = temp_hostsmember->host_ptr) == NULL)
3559 					continue;
3560 				if(parent_host->current_state != HOST_UP) {
3561 					log_debug_info(DEBUGL_CHECKS, 1, "Check of parent host '%s' queued.\n", parent_host->name);
3562 					add_object_to_objectlist(&check_hostlist, (void *)parent_host);
3563 					}
3564 				}
3565 
3566 			/* propagate checks to immediate children if they are not already UP */
3567 			/* we do this because children may currently be UNREACHABLE, but may (as a result of this recovery) switch to UP or DOWN states */
3568 			log_debug_info(DEBUGL_CHECKS, 1, "Propagating checks to child host(s)...\n");
3569 
3570 			for(temp_hostsmember = hst->child_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
3571 				if((child_host = temp_hostsmember->host_ptr) == NULL)
3572 					continue;
3573 				if(child_host->current_state != HOST_UP) {
3574 					log_debug_info(DEBUGL_CHECKS, 1, "Check of child host '%s' queued.\n", child_host->name);
3575 					add_object_to_objectlist(&check_hostlist, (void *)child_host);
3576 					}
3577 				}
3578 			}
3579 
3580 		/***** HOST IS STILL DOWN/UNREACHABLE *****/
3581 		/* we're still in a problem state... */
3582 		else {
3583 
3584 			log_debug_info(DEBUGL_CHECKS, 1, "Host is still DOWN/UNREACHABLE.\n");
3585 
3586 			/* passive checks are treated as HARD states by default... */
3587 			if(hst->check_type == HOST_CHECK_PASSIVE && passive_host_checks_are_soft == FALSE) {
3588 
3589 				/* set the state type */
3590 				hst->state_type = HARD_STATE;
3591 
3592 				/* reset the current attempt */
3593 				hst->current_attempt = 1;
3594 				}
3595 
3596 			/* active checks and passive checks (treated as SOFT states) */
3597 			else {
3598 
3599 				/* set the state type */
3600 				/* we've maxed out on the retries */
3601 				if(hst->current_attempt == hst->max_attempts)
3602 					hst->state_type = HARD_STATE;
3603 				/* the host was in a hard problem state before, so it still is now */
3604 				else if(hst->current_attempt == 1)
3605 					hst->state_type = HARD_STATE;
3606 				/* the host is in a soft state and the check will be retried */
3607 				else
3608 					hst->state_type = SOFT_STATE;
3609 				}
3610 
3611 			/* make a determination of the host's state */
3612 			/* translate host state between DOWN/UNREACHABLE (only for passive checks if enabled) */
3613 			hst->current_state = new_state;
3614 			if(hst->check_type == HOST_CHECK_ACTIVE || translate_passive_host_checks == TRUE)
3615 				hst->current_state = determine_host_reachability(hst);
3616 
3617 			/* reschedule the next check if the host state changed */
3618 			if(hst->last_state != hst->current_state || hst->last_hard_state != hst->current_state) {
3619 
3620 				reschedule_check = TRUE;
3621 
3622 				/* schedule a re-check of the host at the retry interval because we can't determine its final state yet... */
3623 				if(hst->state_type == SOFT_STATE)
3624 					next_check = (unsigned long)(current_time + (hst->retry_interval * interval_length));
3625 
3626 				/* host has maxed out on retries (or was previously in a hard problem state), so reschedule the next check at the normal interval */
3627 				else
3628 					next_check = (unsigned long)(current_time + (hst->check_interval * interval_length));
3629 				}
3630 
3631 			}
3632 
3633 		}
3634 
3635 	/******* HOST WAS UP INITIALLY *******/
3636 	else {
3637 
3638 		log_debug_info(DEBUGL_CHECKS, 1, "Host was UP.\n");
3639 
3640 		/***** HOST IS STILL UP *****/
3641 		/* either the host never went down since last check */
3642 		if(new_state == HOST_UP) {
3643 
3644 			log_debug_info(DEBUGL_CHECKS, 1, "Host is still UP.\n");
3645 
3646 			/* set the current state */
3647 			hst->current_state = HOST_UP;
3648 
3649 			/* set the state type */
3650 			hst->state_type = HARD_STATE;
3651 
3652 			/* reschedule the next check at the normal interval */
3653 			if(reschedule_check == TRUE)
3654 				next_check = (unsigned long)(current_time + (hst->check_interval * interval_length));
3655 			}
3656 
3657 		/***** HOST IS NOW DOWN/UNREACHABLE *****/
3658 		else {
3659 
3660 			log_debug_info(DEBUGL_CHECKS, 1, "Host is now DOWN/UNREACHABLE.\n");
3661 
3662 			/***** SPECIAL CASE FOR HOSTS WITH MAX_ATTEMPTS==1 *****/
3663 			if(hst->max_attempts == 1) {
3664 
3665 				log_debug_info(DEBUGL_CHECKS, 1, "Max attempts = 1!.\n");
3666 
3667 				/* set the state type */
3668 				hst->state_type = HARD_STATE;
3669 
3670 				/* host has maxed out on retries, so reschedule the next check at the normal interval */
3671 				reschedule_check = TRUE;
3672 				next_check = (unsigned long)(current_time + (hst->check_interval * interval_length));
3673 
3674 				/* we need to run SYNCHRONOUS checks of all parent hosts to accurately determine the state of this host */
3675 				/* this is extremely inefficient (reminiscent of Nagios 2.x logic), but there's no other good way around it */
3676 				/* check all parent hosts to see if we're DOWN or UNREACHABLE */
3677 				/* only do this for ACTIVE checks, as PASSIVE checks contain a pre-determined state */
3678 				if(hst->check_type == HOST_CHECK_ACTIVE) {
3679 
3680 					log_debug_info(DEBUGL_CHECKS, 1, "** WARNING: Max attempts = 1, so we have to run serial checks of all parent hosts!\n");
3681 
3682 					for(temp_hostsmember = hst->parent_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
3683 
3684 						if((parent_host = temp_hostsmember->host_ptr) == NULL)
3685 							continue;
3686 
3687 						log_debug_info(DEBUGL_CHECKS, 1, "Running serial check parent host '%s'...\n", parent_host->name);
3688 
3689 						/* run an immediate check of the parent host */
3690 						run_sync_host_check_3x(parent_host, &parent_state, check_options, use_cached_result, check_timestamp_horizon);
3691 
3692 						/* bail out as soon as we find one parent host that is UP */
3693 						if(parent_state == HOST_UP) {
3694 
3695 							log_debug_info(DEBUGL_CHECKS, 1, "Parent host is UP, so this one is DOWN.\n");
3696 
3697 							/* set the current state */
3698 							hst->current_state = HOST_DOWN;
3699 							break;
3700 							}
3701 						}
3702 
3703 					if(temp_hostsmember == NULL) {
						/* host has no parents, so it's DOWN */
3705 						if(hst->parent_hosts == NULL) {
3706 							log_debug_info(DEBUGL_CHECKS, 1, "Host has no parents, so it's DOWN.\n");
3707 							hst->current_state = HOST_DOWN;
3708 							}
3709 						else {
3710 							/* no parents were up, so this host is UNREACHABLE */
3711 							log_debug_info(DEBUGL_CHECKS, 1, "No parents were UP, so this host is UNREACHABLE.\n");
3712 							hst->current_state = HOST_UNREACHABLE;
3713 							}
3714 						}
3715 					}
3716 
3717 				/* set the host state for passive checks */
3718 				else {
3719 					/* set the state */
3720 					hst->current_state = new_state;
3721 
3722 					/* translate host state between DOWN/UNREACHABLE for passive checks (if enabled) */
3723 					/* make a determination of the host's state */
3724 					if(translate_passive_host_checks == TRUE)
3725 						hst->current_state = determine_host_reachability(hst);
3726 
3727 					}
3728 
3729 				/* propagate checks to immediate children if they are not UNREACHABLE */
3730 				/* we do this because we may now be blocking the route to child hosts */
3731 				log_debug_info(DEBUGL_CHECKS, 1, "Propagating check to immediate non-UNREACHABLE child hosts...\n");
3732 
3733 				for(temp_hostsmember = hst->child_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
3734 					if((child_host = temp_hostsmember->host_ptr) == NULL)
3735 						continue;
3736 					if(child_host->current_state != HOST_UNREACHABLE) {
3737 						log_debug_info(DEBUGL_CHECKS, 1, "Check of child host '%s' queued.\n", child_host->name);
3738 						add_object_to_objectlist(&check_hostlist, (void *)child_host);
3739 						}
3740 					}
3741 				}
3742 
3743 			/***** MAX ATTEMPTS > 1 *****/
3744 			else {
3745 
3746 				/* active and (in some cases) passive check results are treated as SOFT states */
3747 				if(hst->check_type == HOST_CHECK_ACTIVE || passive_host_checks_are_soft == TRUE) {
3748 
3749 					/* set the state type */
3750 					hst->state_type = SOFT_STATE;
3751 					}
3752 
3753 				/* by default, passive check results are treated as HARD states */
3754 				else {
3755 
3756 					/* set the state type */
3757 					hst->state_type = HARD_STATE;
3758 
3759 					/* reset the current attempt */
3760 					hst->current_attempt = 1;
3761 					}
3762 
3763 				/* make a (in some cases) preliminary determination of the host's state */
3764 				/* translate host state between DOWN/UNREACHABLE (for passive checks only if enabled) */
3765 				hst->current_state = new_state;
3766 				if(hst->check_type == HOST_CHECK_ACTIVE || translate_passive_host_checks == TRUE)
3767 					hst->current_state = determine_host_reachability(hst);
3768 
3769 				/* reschedule a check of the host */
3770 				reschedule_check = TRUE;
3771 
3772 				/* schedule a re-check of the host at the retry interval because we can't determine its final state yet... */
3773 				if(hst->check_type == HOST_CHECK_ACTIVE || passive_host_checks_are_soft == TRUE)
3774 					next_check = (unsigned long)(current_time + (hst->retry_interval * interval_length));
3775 
3776 				/* schedule a re-check of the host at the normal interval */
3777 				else
3778 					next_check = (unsigned long)(current_time + (hst->check_interval * interval_length));
3779 
3780 				/* propagate checks to immediate parents if they are UP */
3781 				/* we do this because a parent host (or grandparent) may have gone down and blocked our route */
3782 				/* checking the parents ASAP will allow us to better determine the final state (DOWN/UNREACHABLE) of this host later */
3783 				log_debug_info(DEBUGL_CHECKS, 1, "Propagating checks to immediate parent hosts that are UP...\n");
3784 
3785 				for(temp_hostsmember = hst->parent_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
3786 					if((parent_host = temp_hostsmember->host_ptr) == NULL)
3787 						continue;
3788 					if(parent_host->current_state == HOST_UP) {
3789 						add_object_to_objectlist(&check_hostlist, (void *)parent_host);
3790 						log_debug_info(DEBUGL_CHECKS, 1, "Check of host '%s' queued.\n", parent_host->name);
3791 						}
3792 					}
3793 
3794 				/* propagate checks to immediate children if they are not UNREACHABLE */
3795 				/* we do this because we may now be blocking the route to child hosts */
3796 				log_debug_info(DEBUGL_CHECKS, 1, "Propagating checks to immediate non-UNREACHABLE child hosts...\n");
3797 
3798 				for(temp_hostsmember = hst->child_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
3799 					if((child_host = temp_hostsmember->host_ptr) == NULL)
3800 						continue;
3801 					if(child_host->current_state != HOST_UNREACHABLE) {
3802 						log_debug_info(DEBUGL_CHECKS, 1, "Check of child host '%s' queued.\n", child_host->name);
3803 						add_object_to_objectlist(&check_hostlist, (void *)child_host);
3804 						}
3805 					}
3806 
3807 				/* check dependencies on second to last host check */
3808 				if(enable_predictive_host_dependency_checks == TRUE && hst->current_attempt == (hst->max_attempts - 1)) {
3809 
3810 					/* propagate checks to hosts that THIS ONE depends on for notifications AND execution */
3811 					/* we do to help ensure that the dependency checks are accurate before it comes time to notify */
3812 					log_debug_info(DEBUGL_CHECKS, 1, "Propagating predictive dependency checks to hosts this one depends on...\n");
3813 
3814 					for(temp_dependency = get_first_hostdependency_by_dependent_host(hst->name, &ptr); temp_dependency != NULL; temp_dependency = get_next_hostdependency_by_dependent_host(hst->name, &ptr)) {
3815 						if(temp_dependency->dependent_host_ptr == hst && temp_dependency->master_host_ptr != NULL) {
3816 							master_host = (host *)temp_dependency->master_host_ptr;
3817 							log_debug_info(DEBUGL_CHECKS, 1, "Check of host '%s' queued.\n", master_host->name);
3818 							add_object_to_objectlist(&check_hostlist, (void *)master_host);
3819 							}
3820 						}
3821 					}
3822 				}
3823 			}
3824 		}
3825 
3826 	log_debug_info(DEBUGL_CHECKS, 1, "Pre-handle_host_state() Host: %s, Attempt=%d/%d, Type=%s, Final State=%d\n", hst->name, hst->current_attempt, hst->max_attempts, (hst->state_type == HARD_STATE) ? "HARD" : "SOFT", hst->current_state);
3827 
3828 	/* handle the host state */
3829 	handle_host_state(hst);
3830 
3831 	log_debug_info(DEBUGL_CHECKS, 1, "Post-handle_host_state() Host: %s, Attempt=%d/%d, Type=%s, Final State=%d\n", hst->name, hst->current_attempt, hst->max_attempts, (hst->state_type == HARD_STATE) ? "HARD" : "SOFT", hst->current_state);
3832 
3833 
3834 	/******************** POST-PROCESSING STUFF *********************/
3835 
3836 	/* if the plugin output differs from previous check and no state change, log the current state/output if state stalking is enabled */
3837 	if(hst->last_state == hst->current_state && compare_strings(old_plugin_output, hst->plugin_output)) {
3838 
3839 		if(hst->current_state == HOST_UP && hst->stalk_on_up == TRUE)
3840 			log_host_event(hst);
3841 
3842 		else if(hst->current_state == HOST_DOWN && hst->stalk_on_down == TRUE)
3843 			log_host_event(hst);
3844 
3845 		else if(hst->current_state == HOST_UNREACHABLE && hst->stalk_on_unreachable == TRUE)
3846 			log_host_event(hst);
3847 		}
3848 
3849 	/* check to see if the associated host is flapping */
3850 	check_for_host_flapping(hst, TRUE, TRUE, TRUE);
3851 
3852 	/* reschedule the next check of the host (usually ONLY for scheduled, active checks, unless overridden above) */
3853 	if(reschedule_check == TRUE) {
3854 
3855 		log_debug_info(DEBUGL_CHECKS, 1, "Rescheduling next check of host at %s", ctime(&next_check));
3856 
3857 		/* default is to reschedule host check unless a test below fails... */
3858 		hst->should_be_scheduled = TRUE;
3859 
3860 		/* get the new current time */
3861 		time(&current_time);
3862 
3863 		/* make sure we don't get ourselves into too much trouble... */
3864 		if(current_time > next_check)
3865 			hst->next_check = current_time;
3866 		else
3867 			hst->next_check = next_check;
3868 
3869 		/* make sure we rescheduled the next service check at a valid time */
3870 		preferred_time = hst->next_check;
3871 		get_next_valid_time(preferred_time, &next_valid_time, hst->check_period_ptr);
3872 		hst->next_check = next_valid_time;
3873 
3874 		/* hosts with non-recurring intervals do not get rescheduled if we're in a HARD or UP state */
3875 		if(hst->check_interval == 0 && (hst->state_type == HARD_STATE || hst->current_state == HOST_UP))
3876 			hst->should_be_scheduled = FALSE;
3877 
3878 		/* host with active checks disabled do not get rescheduled */
3879 		if(hst->checks_enabled == FALSE)
3880 			hst->should_be_scheduled = FALSE;
3881 
3882 		/* schedule a non-forced check if we can */
3883 		if(hst->should_be_scheduled == TRUE) {
3884 			schedule_host_check(hst, hst->next_check, CHECK_OPTION_NONE);
3885 			}
3886 		}
3887 
3888 	/* update host status - for both active (scheduled) and passive (non-scheduled) hosts */
3889 	update_host_status(hst, FALSE);
3890 
3891 	/* run async checks of all hosts we added above */
3892 	/* don't run a check if one is already executing or we can get by with a cached state */
3893 	for(hostlist_item = check_hostlist; hostlist_item != NULL; hostlist_item = hostlist_item->next) {
3894 		run_async_check = TRUE;
3895 		temp_host = (host *)hostlist_item->object_ptr;
3896 
3897 		log_debug_info(DEBUGL_CHECKS, 2, "ASYNC CHECK OF HOST: %s, CURRENTTIME: %lu, LASTHOSTCHECK: %lu, CACHEDTIMEHORIZON: %lu, USECACHEDRESULT: %d, ISEXECUTING: %d\n", temp_host->name, current_time, temp_host->last_check, check_timestamp_horizon, use_cached_result, temp_host->is_executing);
3898 
3899 		if(use_cached_result == TRUE && ((current_time - temp_host->last_check) <= check_timestamp_horizon))
3900 			run_async_check = FALSE;
3901 		if(temp_host->is_executing == TRUE)
3902 			run_async_check = FALSE;
3903 		if(run_async_check == TRUE)
3904 			run_async_host_check_3x(temp_host, CHECK_OPTION_NONE, 0.0, FALSE, FALSE, NULL, NULL);
3905 		}
3906 	free_objectlist(&check_hostlist);
3907 
3908 	return OK;
3909 	}
3910 
3911 
3912 
3913 /* checks viability of performing a host check */
check_host_check_viability_3x(host * hst,int check_options,int * time_is_valid,time_t * new_time)3914 int check_host_check_viability_3x(host *hst, int check_options, int *time_is_valid, time_t *new_time) {
3915 	int result = OK;
3916 	int perform_check = TRUE;
3917 	time_t current_time = 0L;
3918 	time_t preferred_time = 0L;
3919 	int check_interval = 0;
3920 
3921 	log_debug_info(DEBUGL_FUNCTIONS, 0, "check_host_check_viability_3x()\n");
3922 
3923 	/* make sure we have a host */
3924 	if(hst == NULL)
3925 		return ERROR;
3926 
3927 	/* get the check interval to use if we need to reschedule the check */
3928 	if(hst->state_type == SOFT_STATE && hst->current_state != HOST_UP)
3929 		check_interval = (hst->retry_interval * interval_length);
3930 	else
3931 		check_interval = (hst->check_interval * interval_length);
3932 
3933 	/* make sure check interval is positive - otherwise use 5 minutes out for next check */
3934 	if(check_interval <= 0)
3935 		check_interval = 300;
3936 
3937 	/* get the current time */
3938 	time(&current_time);
3939 
3940 	/* initialize the next preferred check time */
3941 	preferred_time = current_time;
3942 
3943 	/* can we check the host right now? */
3944 	if(!(check_options & CHECK_OPTION_FORCE_EXECUTION)) {
3945 
3946 		/* if checks of the host are currently disabled... */
3947 		if(hst->checks_enabled == FALSE) {
3948 			preferred_time = current_time + check_interval;
3949 			perform_check = FALSE;
3950 			}
3951 
3952 		/* make sure this is a valid time to check the host */
3953 		if(check_time_against_period((unsigned long)current_time, hst->check_period_ptr) == ERROR) {
3954 			preferred_time = current_time;
3955 			if(time_is_valid)
3956 				*time_is_valid = FALSE;
3957 			perform_check = FALSE;
3958 			}
3959 
3960 		/* check host dependencies for execution */
3961 		if(check_host_dependencies(hst, EXECUTION_DEPENDENCY) == DEPENDENCIES_FAILED) {
3962 			preferred_time = current_time + check_interval;
3963 			perform_check = FALSE;
3964 			}
3965 		}
3966 
3967 	/* pass back the next viable check time */
3968 	if(new_time)
3969 		*new_time = preferred_time;
3970 
3971 	result = (perform_check == TRUE) ? OK : ERROR;
3972 
3973 	return result;
3974 	}
3975 
3976 
3977 
3978 /* adjusts current host check attempt before a new check is performed */
adjust_host_check_attempt_3x(host * hst,int is_active)3979 int adjust_host_check_attempt_3x(host *hst, int is_active) {
3980 
3981 	log_debug_info(DEBUGL_FUNCTIONS, 0, "adjust_host_check_attempt_3x()\n");
3982 
3983 	if(hst == NULL)
3984 		return ERROR;
3985 
3986 	log_debug_info(DEBUGL_CHECKS, 2, "Adjusting check attempt number for host '%s': current attempt=%d/%d, state=%d, state type=%d\n", hst->name, hst->current_attempt, hst->max_attempts, hst->current_state, hst->state_type);
3987 
3988 	/* if host is in a hard state, reset current attempt number */
3989 	if(hst->state_type == HARD_STATE)
3990 		hst->current_attempt = 1;
3991 
3992 	/* if host is in a soft UP state, reset current attempt number (active checks only) */
3993 	else if(is_active == TRUE && hst->state_type == SOFT_STATE && hst->current_state == HOST_UP)
3994 		hst->current_attempt = 1;
3995 
3996 	/* increment current attempt number */
3997 	else if(hst->current_attempt < hst->max_attempts)
3998 		hst->current_attempt++;
3999 
4000 	log_debug_info(DEBUGL_CHECKS, 2, "New check attempt number = %d\n", hst->current_attempt);
4001 
4002 	return OK;
4003 	}
4004 
4005 
4006 
4007 /* determination of the host's state based on route availability*/
4008 /* used only to determine difference between DOWN and UNREACHABLE states */
determine_host_reachability(host * hst)4009 int determine_host_reachability(host *hst) {
4010 	int state = HOST_DOWN;
4011 	host *parent_host = NULL;
4012 	hostsmember *temp_hostsmember = NULL;
4013 
4014 	log_debug_info(DEBUGL_FUNCTIONS, 0, "determine_host_reachability()\n");
4015 
4016 	if(hst == NULL)
4017 		return HOST_DOWN;
4018 
4019 	log_debug_info(DEBUGL_CHECKS, 2, "Determining state of host '%s': current state=%d\n", hst->name, hst->current_state);
4020 
4021 	/* host is UP - no translation needed */
4022 	if(hst->current_state == HOST_UP) {
4023 		state = HOST_UP;
4024 		log_debug_info(DEBUGL_CHECKS, 2, "Host is UP, no state translation needed.\n");
4025 		}
4026 
4027 	/* host has no parents, so it is DOWN */
4028 	else if(hst->parent_hosts == NULL) {
4029 		state = HOST_DOWN;
4030 		log_debug_info(DEBUGL_CHECKS, 2, "Host has no parents, so it is DOWN.\n");
4031 		}
4032 
4033 	/* check all parent hosts to see if we're DOWN or UNREACHABLE */
4034 	else {
4035 
4036 		for(temp_hostsmember = hst->parent_hosts; temp_hostsmember != NULL; temp_hostsmember = temp_hostsmember->next) {
4037 
4038 			if((parent_host = temp_hostsmember->host_ptr) == NULL)
4039 				continue;
4040 
4041 			/* bail out as soon as we find one parent host that is UP */
4042 			if(parent_host->current_state == HOST_UP) {
4043 				/* set the current state */
4044 				state = HOST_DOWN;
4045 				log_debug_info(DEBUGL_CHECKS, 2, "At least one parent (%s) is up, so host is DOWN.\n", parent_host->name);
4046 				break;
4047 				}
4048 			}
4049 		/* no parents were up, so this host is UNREACHABLE */
4050 		if(temp_hostsmember == NULL) {
4051 			state = HOST_UNREACHABLE;
4052 			log_debug_info(DEBUGL_CHECKS, 2, "No parents were up, so host is UNREACHABLE.\n");
4053 			}
4054 		}
4055 
4056 	return state;
4057 	}
4058 
4059 
4060 
4061 /******************************************************************/
4062 /****************** HOST STATE HANDLER FUNCTIONS ******************/
4063 /******************************************************************/
4064 
4065 
4066 /* top level host state handler - occurs after every host check (soft/hard and active/passive) */
handle_host_state(host * hst)4067 int handle_host_state(host *hst) {
4068 	int state_change = FALSE;
4069 	int hard_state_change = FALSE;
4070 	time_t current_time = 0L;
4071 
4072 
4073 	log_debug_info(DEBUGL_FUNCTIONS, 0, "handle_host_state()\n");
4074 
4075 	/* get current time */
4076 	time(&current_time);
4077 
4078 	/* obsess over this host check */
4079 	obsessive_compulsive_host_check_processor(hst);
4080 
4081 	/* update performance data */
4082 	update_host_performance_data(hst);
4083 
4084 	/* record latest time for current state */
4085 	switch(hst->current_state) {
4086 		case HOST_UP:
4087 			hst->last_time_up = current_time;
4088 			break;
4089 		case HOST_DOWN:
4090 			hst->last_time_down = current_time;
4091 			break;
4092 		case HOST_UNREACHABLE:
4093 			hst->last_time_unreachable = current_time;
4094 			break;
4095 		default:
4096 			break;
4097 		}
4098 
4099 	/* has the host state changed? */
4100 	if(hst->last_state != hst->current_state || (hst->current_state == HOST_UP && hst->state_type == SOFT_STATE))
4101 		state_change = TRUE;
4102 
4103 	if(hst->current_attempt >= hst->max_attempts && hst->last_hard_state != hst->current_state)
4104 		hard_state_change = TRUE;
4105 
4106 	/* if the host state has changed... */
4107 	if(state_change == TRUE || hard_state_change == TRUE) {
4108 
4109 		/* reset the next and last notification times */
4110 		hst->last_host_notification = (time_t)0;
4111 		hst->next_host_notification = (time_t)0;
4112 
4113 		/* reset notification suppression option */
4114 		hst->no_more_notifications = FALSE;
4115 
4116 		/* reset the acknowledgement flag if necessary */
4117 		if(hst->acknowledgement_type == ACKNOWLEDGEMENT_NORMAL && (state_change == TRUE || hard_state_change == FALSE)) {
4118 
4119 			hst->problem_has_been_acknowledged = FALSE;
4120 			hst->acknowledgement_type = ACKNOWLEDGEMENT_NONE;
4121 
4122 			/* remove any non-persistant comments associated with the ack */
4123 			delete_host_acknowledgement_comments(hst);
4124 			}
4125 		else if(hst->acknowledgement_type == ACKNOWLEDGEMENT_STICKY && hst->current_state == HOST_UP) {
4126 
4127 			hst->problem_has_been_acknowledged = FALSE;
4128 			hst->acknowledgement_type = ACKNOWLEDGEMENT_NONE;
4129 
4130 			/* remove any non-persistant comments associated with the ack */
4131 			delete_host_acknowledgement_comments(hst);
4132 			}
4133 
4134 		}
4135 
4136 	/* Not sure about this, but is old behaviour */
4137 	if(hst->last_hard_state != hst->current_state)
4138 		hard_state_change = TRUE;
4139 
4140 	if(state_change == TRUE || hard_state_change == TRUE) {
4141 
4142 		/* update last state change times */
4143 		hst->last_state_change = current_time;
4144 		if(hst->state_type == HARD_STATE)
4145 			hst->last_hard_state_change = current_time;
4146 
4147 		/* update the event id */
4148 		hst->last_event_id = hst->current_event_id;
4149 		hst->current_event_id = next_event_id;
4150 		next_event_id++;
4151 
4152 		/* update the problem id when transitioning to a problem state */
4153 		if(hst->last_state == HOST_UP) {
4154 			/* don't reset last problem id, or it will be zero the next time a problem is encountered */
4155 			/*hst->last_problem_id=hst->current_problem_id;*/
4156 			hst->current_problem_id = next_problem_id;
4157 			next_problem_id++;
4158 			}
4159 
4160 		/* clear the problem id when transitioning from a problem state to an UP state */
4161 		if(hst->current_state == HOST_UP) {
4162 			hst->last_problem_id = hst->current_problem_id;
4163 			hst->current_problem_id = 0L;
4164 			}
4165 
4166 		/* write the host state change to the main log file */
4167 		if(hst->state_type == HARD_STATE || (hst->state_type == SOFT_STATE && log_host_retries == TRUE))
4168 			log_host_event(hst);
4169 
4170 		/* check for start of flexible (non-fixed) scheduled downtime */
4171 		/* CHANGED 08-05-2010 EG flex downtime can now start on soft states */
4172 		/*if(hst->state_type==HARD_STATE)*/
4173 		check_pending_flex_host_downtime(hst);
4174 
4175 		/* notify contacts about the recovery or problem if its a "hard" state */
4176 		if(hst->state_type == HARD_STATE)
4177 			host_notification(hst, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);
4178 
4179 		/* handle the host state change */
4180 		handle_host_event(hst);
4181 
4182 		/* the host just recovered, so reset the current host attempt */
4183 		if(hst->current_state == HOST_UP)
4184 			hst->current_attempt = 1;
4185 
4186 		/* the host recovered, so reset the current notification number and state flags (after the recovery notification has gone out) */
4187 		if(hst->current_state == HOST_UP) {
4188 			hst->current_notification_number = 0;
4189 			hst->notified_on_down = FALSE;
4190 			hst->notified_on_unreachable = FALSE;
4191 			}
4192 		}
4193 
4194 	/* else the host state has not changed */
4195 	else {
4196 
4197 		/* notify contacts if host is still down or unreachable */
4198 		if(hst->current_state != HOST_UP && hst->state_type == HARD_STATE)
4199 			host_notification(hst, NOTIFICATION_NORMAL, NULL, NULL, NOTIFICATION_OPTION_NONE);
4200 
4201 		/* if we're in a soft state and we should log host retries, do so now... */
4202 		if(hst->state_type == SOFT_STATE && log_host_retries == TRUE)
4203 			log_host_event(hst);
4204 		}
4205 
4206 	return OK;
4207 	}
4208 
4209 
4210 /* parse raw plugin output and return: short and long output, perf data */
parse_check_output(char * buf,char ** short_output,char ** long_output,char ** perf_data,int escape_newlines_please,int newlines_are_escaped)4211 int parse_check_output(char *buf, char **short_output, char **long_output, char **perf_data, int escape_newlines_please, int newlines_are_escaped) {
4212 	int current_line = 0;
4213 	int found_newline = FALSE;
4214 	int eof = FALSE;
4215 	int used_buf = 0;
4216 	int dbuf_chunk = 1024;
4217 	dbuf db1;
4218 	dbuf db2;
4219 	char *ptr = NULL;
4220 	int in_perf_data = FALSE;
4221 	char *tempbuf = NULL;
4222 	register int x = 0;
4223 	register int y = 0;
4224 
4225 	/* initialize values */
4226 	if(short_output)
4227 		*short_output = NULL;
4228 	if(long_output)
4229 		*long_output = NULL;
4230 	if(perf_data)
4231 		*perf_data = NULL;
4232 
4233 	/* nothing to do */
4234 	if(buf == NULL || !strcmp(buf, ""))
4235 		return OK;
4236 
4237 	used_buf = strlen(buf) + 1;
4238 
4239 	/* initialize dynamic buffers (1KB chunk size) */
4240 	dbuf_init(&db1, dbuf_chunk);
4241 	dbuf_init(&db2, dbuf_chunk);
4242 
4243 	/* unescape newlines and escaped backslashes first */
4244 	if(newlines_are_escaped == TRUE) {
4245 		for(x = 0, y = 0; buf[x] != '\x0'; x++) {
4246 			if(buf[x] == '\\' && buf[x + 1] == '\\') {
4247 				x++;
4248 				buf[y++] = buf[x];
4249 				}
4250 			else if(buf[x] == '\\' && buf[x + 1] == 'n') {
4251 				x++;
4252 				buf[y++] = '\n';
4253 				}
4254 			else
4255 				buf[y++] = buf[x];
4256 			}
4257 		buf[y] = '\x0';
4258 		}
4259 
4260 	/* process each line of input */
4261 	for(x = 0; eof == FALSE; x++) {
4262 
4263 		/* we found the end of a line */
4264 		if(buf[x] == '\n')
4265 			found_newline = TRUE;
4266 		else if(buf[x] == '\\' && buf[x + 1] == 'n' && newlines_are_escaped == TRUE) {
4267 			found_newline = TRUE;
4268 			buf[x] = '\x0';
4269 			x++;
4270 			}
4271 		else if(buf[x] == '\x0') {
4272 			found_newline = TRUE;
4273 			eof = TRUE;
4274 			}
4275 		else
4276 			found_newline = FALSE;
4277 
4278 		if(found_newline == TRUE) {
4279 
4280 			current_line++;
4281 
4282 			/* handle this line of input */
4283 			buf[x] = '\x0';
4284 			if((tempbuf = (char *)strdup(buf))) {
4285 
4286 				/* first line contains short plugin output and optional perf data */
4287 				if(current_line == 1) {
4288 
4289 					/* get the short plugin output */
4290 					if((ptr = strtok(tempbuf, "|"))) {
4291 						if(short_output)
4292 							*short_output = (char *)strdup(ptr);
4293 
4294 						/* get the optional perf data */
4295 						if((ptr = strtok(NULL, "\n")))
4296 							dbuf_strcat(&db2, ptr);
4297 						}
4298 					}
4299 
4300 				/* additional lines contain long plugin output and optional perf data */
4301 				else {
4302 
4303 					/* rest of the output is perf data */
4304 					if(in_perf_data == TRUE) {
4305 						dbuf_strcat(&db2, tempbuf);
4306 						dbuf_strcat(&db2, " ");
4307 						}
4308 
4309 					/* we're still in long output */
4310 					else {
4311 
4312 						/* perf data separator has been found */
4313 						if(strstr(tempbuf, "|")) {
4314 
4315 							/* NOTE: strtok() causes problems if first character of tempbuf='|', so use my_strtok() instead */
4316 							/* get the remaining long plugin output */
4317 							if((ptr = my_strtok(tempbuf, "|"))) {
4318 
4319 								if(current_line > 2)
4320 									dbuf_strcat(&db1, "\n");
4321 								dbuf_strcat(&db1, ptr);
4322 
4323 								/* get the perf data */
4324 								if((ptr = my_strtok(NULL, "\n"))) {
4325 									dbuf_strcat(&db2, ptr);
4326 									dbuf_strcat(&db2, " ");
4327 									}
4328 								}
4329 
4330 							/* set the perf data flag */
4331 							in_perf_data = TRUE;
4332 							}
4333 
4334 						/* just long output */
4335 						else {
4336 							if(current_line > 2)
4337 								dbuf_strcat(&db1, "\n");
4338 							dbuf_strcat(&db1, tempbuf);
4339 							}
4340 						}
4341 					}
4342 
4343 				my_free(tempbuf);
4344 				tempbuf = NULL;
4345 				}
4346 
4347 
4348 			/* shift data back to front of buffer and adjust counters */
4349 			memmove((void *)&buf[0], (void *)&buf[x + 1], (size_t)((int)used_buf - x - 1));
4350 			used_buf -= (x + 1);
4351 			buf[used_buf] = '\x0';
4352 			x = -1;
4353 			}
4354 		}
4355 
4356 	/* save long output */
4357 	if(long_output && (db1.buf && strcmp(db1.buf, ""))) {
4358 
4359 		if(escape_newlines_please == FALSE)
4360 			*long_output = (char *)strdup(db1.buf);
4361 
4362 		else {
4363 
4364 			/* escape newlines (and backslashes) in long output */
4365 			if((tempbuf = (char *)malloc((strlen(db1.buf) * 2) + 1))) {
4366 
4367 				for(x = 0, y = 0; db1.buf[x] != '\x0'; x++) {
4368 
4369 					if(db1.buf[x] == '\n') {
4370 						tempbuf[y++] = '\\';
4371 						tempbuf[y++] = 'n';
4372 						}
4373 					else if(db1.buf[x] == '\\') {
4374 						tempbuf[y++] = '\\';
4375 						tempbuf[y++] = '\\';
4376 						}
4377 					else
4378 						tempbuf[y++] = db1.buf[x];
4379 					}
4380 
4381 				tempbuf[y] = '\x0';
4382 				*long_output = (char *)strdup(tempbuf);
4383 				my_free(tempbuf);
4384 				}
4385 			}
4386 		}
4387 
4388 	/* save perf data */
4389 	if(perf_data && (db2.buf && strcmp(db2.buf, "")))
4390 		*perf_data = (char *)strdup(db2.buf);
4391 
4392 	/* strip short output and perf data */
4393 	if(short_output)
4394 		strip(*short_output);
4395 	if(perf_data)
4396 		strip(*perf_data);
4397 
4398 	/* free dynamic buffers */
4399 	dbuf_free(&db1);
4400 	dbuf_free(&db2);
4401 
4402 	return OK;
4403 	}
4404 
4405 
4406