1#!/usr/bin/perl 2# 3# $Id: $ 4# 5# Copyright (c) 2007 .SE (The Internet Infrastructure Foundation). 6# All rights reserved. 7# 8# Redistribution and use in source and binary forms, with or without 9# modification, are permitted provided that the following conditions 10# are met: 11# 1. Redistributions of source code must retain the above copyright 12# notice, this list of conditions and the following disclaimer. 13# 2. Redistributions in binary form must reproduce the above copyright 14# notice, this list of conditions and the following disclaimer in the 15# documentation and/or other materials provided with the distribution. 16# 17# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 21# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 23# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER 25# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 26# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 27# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28# 29###################################################################### 30use 5.008; 31 32use warnings; 33use strict; 34 35use DNSCheck; 36 37use Getopt::Long; 38use Sys::Syslog; 39use POSIX qw(:sys_wait_h strftime); 40use Time::HiRes 'sleep'; 41 42use vars qw[ 43 %running 44 %reaped 45 %problem 46 $user 47 $verbose 48 $check 49 $limit 50 $running 51 $restart 52 @saved_argv 53 $syslog 54 $exit_timeout 55 $savelevel 56 %levels 57 $debug 58]; 59 60%running = (); 61%reaped = (); 62%problem = (); 63$debug = 0; 64$verbose = 0; 65$check = DNSCheck->new; 66$limit = $check->config->get("daemon")->{maxchild}; 67$savelevel = $check->config->get("daemon")->{savelevel} || 'INFO'; 68$running = 1; 69$restart = 0; 70$syslog = 1; 71%levels = ( 72 DEBUG => 0, 73 INFO => 1, 74 NOTICE => 2, 75 WARNING => 3, 76 ERROR => 4, 77 CRITICAL => 5, 78); 79# user to run as 80$user = 'nobody'; 81# Kick everything off 82main(); 83 84################################################################ 85# Utility functions and program setup 86################################################################ 87 88# Log something. Far, far more complex than it should have to be, to keep from 89# dying if we suddenly lose contact with syslogd. Which we do if the system is 90# too heavily loaded. 91sub slog { 92 my $priority = shift; 93 my $tries = 0; 94 95 # See perldoc on sprintf for why we have to write it like this 96 my $msg = sprintf($_[0], @_[1 .. $#_]); 97 98 printf("%s (%d): %s\n", uc($priority), $$, $msg) if $debug; 99 100 TRY: 101 eval { 102 if ($syslog) 103 { 104 syslog($priority, @_); 105 } else { 106 printf STDERR "%s (%d): %s\n", uc($priority), $$, $msg; 107 } 108 }; 109 if ($@) { 110 if ($tries < 5) { 111 print STDERR "Trying to reconnect to syslogd...\n"; 112 sleep(0.5); 113 $tries += 1; 114 openlog($check->config->get("syslog")->{ident}, 115 'pid', $check->config->get("syslog")->{facility}); 116 goto TRY; 117 } else { 118 print STDERR 119 "SYSLOG CONNECTION LOST. Switching to stderr logging.\n"; 120 $syslog = 0; 121 printf STDERR "%s (%d): %s\n", uc($priority), $$, $msg; 122 } 123 } 124} 125 126sub setup { 127 my $errfile = $check->config->get("daemon")->{errorlog}; 128 my $pidfile = $check->config->get("daemon")->{pidfile}; 129 my $uid; 130 unless ($uid = (getpwnam($user))[2]) { 131 die "Attempt to run dispatcher as non-existent user or as root\n"; 132 } 133 @saved_argv = @ARGV; # We'll use this if we're asked to restart ourselves 134 GetOptions('debug' => \$debug, 'verbose' => \$verbose); 135 openlog($check->config->get("syslog")->{ident}, 136 'pid', $check->config->get("syslog")->{facility}); 137 slog 'info', "$0 starting with %d maximum children.", 138 $check->config->get("daemon")->{maxchild}; 139 slog 'info', 'IPv4 disabled.' unless $check->config->get("net")->{ipv4}; 140 slog 'info', 'IPv6 disabled.' unless $check->config->get("net")->{ipv6}; 141 slog 'info', 'SMTP disabled.' unless $check->config->get("net")->{smtp}; 142 slog 'info', 'Logging as %s to facility %s.', 143 $check->config->get("syslog")->{ident}, 144 $check->config->get("syslog")->{facility}; 145 slog 'info', 'Reading config from %s and %s.', 146 $check->config->get("configfile"), $check->config->get("siteconfigfile"); 147 148 unless ($check->dbh) { 149 die "Failed to connect to database. Exiting.\n"; 150 } 151 detach() unless $debug; 152 open STDERR, '>>', $errfile or die "Failed to open error log: $!"; 153 printf STDERR "%s starting at %s\n", $0, scalar(localtime); 154 open PIDFILE, '>', $pidfile or die "Failed to open PID file: $!"; 155 print PIDFILE $$; 156 close PIDFILE; 157# become non-root 158 $>= $uid; 159 $SIG{CHLD} = \&REAPER; 160 $SIG{TERM} = sub { $running = 0 }; 161 $SIG{HUP} = sub { 162 $running = 0; 163 $restart = 1; 164 }; 165} 166 167sub detach 168{ # Instead of using ioctls and setfoo calls we use the old double-fork method. 169 my $pid; 170 171 # Once... 172 $pid = fork; 173 exit if $pid; 174 die "Fork failed: $!" unless defined($pid); 175 176 # ...and again 177 $pid = fork; 178 exit if $pid; 179 die "Fork failed: $!" unless defined($pid); 180 slog('info', 'Detached.'); 181} 182 183# Clean up residue from earlier run(s), if any. 184sub inital_cleanup { 185 my $dbh; 186 187 eval { $dbh = $check->dbh; }; 188 if ($@) { 189 slog 'critical', 'Database not available. Exiting.'; 190 exit(1); 191 } 192 193 $dbh->do( 194q[UPDATE queue SET inprogress = NULL WHERE inprogress IS NOT NULL AND tester_pid IS NULL] 195 ); 196 my $c = $dbh->selectall_hashref( 197q[SELECT id, domain, tester_pid FROM queue WHERE inprogress IS NOT NULL AND tester_pid IS NOT NULL], 198 'tester_pid' 199 ); 200 foreach my $k (keys %$c) { 201 if (kill 0, $c->{$k}{tester_pid}) { 202 203 # The process running this test is still alive, so just remove it from the 204 # queue. 205 $dbh->do(q[DELETE FROM queue WHERE id = ?], undef, $c->{$k}{id}); 206 slog 'info', 'Removed %s from queue', $c->{$k}{domain}; 207 } else { 208 209 # The process running this test has died, so reschedule it 210 $dbh->do(q[UPDATE queue SET inprogress = NULL WHERE id = ?], 211 undef, $c->{$k}{id}); 212 slog 'info', 'Rescheduled test for %s', $c->{$k}{domain}; 213 } 214 } 215} 216 217################################################################ 218# Dispatcher 219################################################################ 220 221sub dispatch { 222 my $domain; 223 my $id; 224 my $source; 225 my $source_data; 226 my $fake_glue; 227 my $priority; 228 229 if (scalar keys %running < $limit) { 230 ($domain, $id, $source, $source_data, $fake_glue, $priority) = 231 get_entry(); 232 slog 'debug', "Fetched $domain from database." if defined($domain); 233 } else { 234 235 # slog 'info', 'Process limit reached.'; 236 } 237 238 if (defined($domain)) { 239 unless (defined($problem{$domain}) and $problem{$domain} >= 5) { 240 process($domain, $id, $source, $source_data, $fake_glue, $priority); 241 } else { 242 slog 'error', 243"Testing $domain caused repeated abnormal termination of children. Assuming bug. Exiting."; 244 $running = 0; 245 } 246 return 247 0.0 248 ; # There was something in the queue, so check for more without delay 249 } else { 250 return 0.25; # Queue empty or process slots full. Wait a little. 251 } 252} 253 254sub get_entry { 255 my $dbh; 256 257 eval { $dbh = $check->dbh; }; 258 if ($@) { 259 slog 'critical', 'Database not available. Exiting.'; 260 exit(1); 261 } 262 263 my ($id, $domain, $source, $source_data, $fake_glue, $priority); 264 265 eval { 266 $dbh->begin_work; 267 ($id, $domain, $source, $source_data, $fake_glue, $priority) = 268 $dbh->selectrow_array( 269q[SELECT id, domain, source_id, source_data, fake_parent_glue, priority FROM queue WHERE inprogress IS NULL AND priority = (SELECT MAX(priority) FROM queue WHERE inprogress IS NULL) ORDER BY id ASC LIMIT 1 FOR UPDATE] 270 ); 271 slog 'debug', "Got $id, $domain from database." 272 if (defined($domain) or defined($id)); 273 $dbh->do(q[UPDATE queue SET inprogress = NOW() WHERE id = ?], 274 undef, $id); 275 $dbh->commit; 276 }; 277 if ($@) { 278 my $err = $@; 279 slog 'warning', "Database error in get_entry: $err"; 280 281 if ($err =~ 282/(DBD driver has not implemented the AutoCommit attribute)|(Lost connection to MySQL server during query)/ 283 and defined($id)) 284 { 285 286 # Database handle went away. Try to recover. 287 slog 'info', 288 "Known problem. Trying to clear inprogress for queue id $id."; 289 $dbh = $check->dbh; 290 $dbh->do(q[UPDATE queue SET inprogress = NULL WHERE id = ?], 291 undef, $id); 292 } 293 294 if ($err =~ m|Already in a transaction|) { 295 slog 'critical', 296 'Serious problem. Sleeping, then trying to restart.'; 297 $running = 0; 298 $restart = 1; 299 sleep(15); 300 return; 301 } 302 303 return undef; 304 } 305 306 return ($domain, $id, $source, $source_data, $fake_glue, $priority); 307} 308 309sub process { 310 my $domain = shift; 311 my $id = shift; 312 my $source = shift; 313 my $source_data = shift; 314 my $fake_glue = shift; 315 my $priority = shift; 316 317 my $pid = fork; 318 319 if ($pid) { # True values, so parent 320 $running{$pid} = $domain; 321 slog 'debug', "Child process $pid has been started."; 322 } elsif ($pid == 0) { # Zero value, so child 323 running_in_child($domain, $id, $source, $source_data, $fake_glue, 324 $priority); 325 } else { # Undefined value, so error 326 die "Fork failed: $!"; 327 } 328} 329 330sub running_in_child { 331 my $domain = shift; 332 my $id = shift; 333 my $source = shift; 334 my $source_data = shift; 335 my $fake_glue = shift; 336 my $priority = shift; 337 338 # Reuse the old configuration, but get new everything else. 339 my $dc = DNSCheck->new({ with_config_object => $check->config }); 340 my $dbh = $dc->dbh; 341 my $log = $dc->logger; 342 343 setpriority(0, $$, 20 - 2 * $priority); 344 345 if (defined($fake_glue)) { 346 my @ns = split(/\s+/, $fake_glue); 347 foreach my $n (@ns) { 348 my ($name, $ip) = split(m|/|, $n); 349 $dc->add_fake_glue($domain, $name, $ip); 350 } 351 } 352 353 # On some OS:s (including Ubuntu Linux), this is visible in the process list. 354 $0 = "dispatcher: testing $domain (queue id $id)"; 355 356 $dbh->do(q[UPDATE queue SET tester_pid = ? WHERE id = ?], undef, $$, $id); 357 $dbh->do( 358q[INSERT INTO tests (domain,begin, source_id, source_data) VALUES (?,NOW(),?,?)], 359 undef, $domain, $source, $source_data 360 ); 361 362 my $test_id = $dbh->{'mysql_insertid'}; 363 slog 'debug', "$$ running test number $test_id."; 364 my $line = 0; 365 366 # This line hides all the actual useful work. 367 $dc->zone->test($domain); 368 369 my $sth = $dbh->prepare( 370 q[ 371 INSERT INTO results 372 (test_id,line,module_id,parent_module_id,timestamp,level,message, 373 arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9) 374 VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?) 375 ] 376 ); 377 while (defined(my $e = $log->get_next_entry)) { 378 next if ($levels{ $e->{level} } < $levels{$savelevel}); 379 $line++; 380 my $time = strftime("%Y-%m-%d %H:%M:%S", localtime($e->{timestamp})); 381 $sth->execute( 382 $test_id, $line, $e->{module_id}, 383 $e->{parent_module_id}, $time, $e->{level}, 384 $e->{tag}, $e->{arg}[0], $e->{arg}[1], 385 $e->{arg}[2], $e->{arg}[3], $e->{arg}[4], 386 $e->{arg}[5], $e->{arg}[6], $e->{arg}[7], 387 $e->{arg}[8], $e->{arg}[9], 388 ); 389 } 390 391 $dbh->do( 392q[UPDATE tests SET end = NOW(), count_critical = ?, count_error = ?, count_warning = ?, count_notice = ?, count_info = ? 393 WHERE id = ?], 394 undef, $log->count_critical, $log->count_error, $log->count_warning, 395 $log->count_notice, $log->count_info, $test_id 396 ); 397 398# Everything went well, so exit nicely (if they didn't go well, we've already died not-so-nicely). 399 slog 'debug', "$$ about to exit nicely."; 400 exit(0); 401} 402 403################################################################ 404# Child process handling 405################################################################ 406 407sub monitor_children { 408 my @pids = keys 409 %reaped; # Can't trust %reaped to stay static while we work through it 410 411 foreach my $pid (@pids) { 412 slog 'debug', "Child process $pid has died."; 413 414 my $domain = $running{$pid}; 415 my $exitcode = $reaped{$pid}; 416 delete $running{$pid}; 417 delete $reaped{$pid}; 418 cleanup($domain, $exitcode, $pid); 419 } 420 421 if (defined($exit_timeout) and time() - $exit_timeout > 300) { 422 %running = (); 423 } 424} 425 426sub cleanup { 427 my $domain = shift; 428 my $exitcode = shift; 429 my $pid = shift; 430 my $dbh; 431 432 eval { $dbh = $check->dbh; }; 433 if ($@) { 434 slog 'critical', "Cannot connect to database. Exiting."; 435 exit(1); 436 } 437 438 my $status = $exitcode >> 8; 439 my $signal = $exitcode & 127; 440 441 if ($status == 0) { 442 443 # Child died nicely. 444 AGAIN: eval { 445 $dbh->do(q[DELETE FROM queue WHERE domain = ? AND tester_pid = ?], 446 undef, $domain, $pid); 447 }; 448 if ($@) 449 { # mysqld dumped us. Get a new handle and try again, after a little pause 450 slog 'warning', 451 "Failed to delete queue entry for $domain. Retrying."; 452 sleep(0.25); 453 $dbh = $check->dbh; 454 goto AGAIN; 455 } 456 457 } else { 458 459 # Child blew up. Clean up. 460 $problem{$domain} += 1; 461 slog 'warning', "Unclean exit when testing $domain (status $status)."; 462 $dbh->do(q[UPDATE queue SET inprogress = NULL WHERE domain = ?], 463 undef, $domain); 464 $dbh->do( 465q[DELETE FROM tests WHERE begin IS NOT NULL AND end IS NULL AND domain = ?], 466 undef, $domain 467 ); 468 } 469} 470 471# This code is mostly stolen from the perlipc manpage. 472sub REAPER { 473 my $child; 474 475 while (($child = waitpid(-1, WNOHANG)) > 0) { 476 $reaped{$child} = $?; 477 } 478 $SIG{CHLD} = \&REAPER; 479} 480 481################################################################ 482# Main program 483################################################################ 484 485sub main { 486 setup(); 487 inital_cleanup(); 488 while ($running) { 489 my $skip = dispatch(); 490 monitor_children(); 491 sleep($skip); 492 } 493 slog 'info', "Waiting for %d children to exit.", scalar keys %running; 494 $exit_timeout = time(); 495 monitor_children until (keys %running == 0); 496 unlink $check->config->get("daemon")->{pidfile}; 497 slog 'info', "$0 exiting normally."; 498 printf STDERR "%s exiting normally.\n", $0; 499 if ($restart) { 500 slog 'info', "Attempting to restart myself (as $0 @saved_argv)."; 501 exec($0, @saved_argv); 502 warn "Exec failed: $!"; 503 } 504} 505 506__END__ 507 508=head1 NAME 509 510dnscheck-dispatcher - daemon program to run tests from a database queue 511 512=head2 SYNOPSIS 513 514 dnscheck-dispatcher [--debug] 515 516=head2 DESCRIPTION 517 518This daemon puts itself into the background (unless the --debug flag is given) 519and repeatedly queries the table C<queue> in the configured database for 520domains to test. When it gets one, it spawns a new process to run the tests. 521If there are no domains to check, or if the configured maximum number of 522active child processes has been reached, it sleeps 0.25 seconds and then tries 523again. It keeps doing this until it is terminated by a SIGTERM. At that point, 524it will wait until all children have died and cleanups been performed before it 525removes its PID file and then exits. 526 527=head2 OPTIONS 528 529=over 530 531=item --debug 532 533Prevents the daemon from going into the background and duplicates log 534information to standard output (it still goes to syslog as well). 535 536=back 537 538=head1 CONFIGURATION 539 540L<dnscheck-dispatcher> shares configuration files with the L<DNSCheck> perl 541modules. Or, to be more precise, it creates such an object and then queries 542its configuration object for its configuration information. It also uses the 543L<DNSCheck> object to get its database connection. 544 545There are two keys in the configuration YAML files that are of interest for 546the dispatcher. The first one is C<syslog>. It has the subkeys C<ident>, which 547specifies the name the daemon will use when talking to syslogd, and 548C<facility>, which specifies the syslog facility to use. 549 550The second one is C<daemon>. It has the subkeys C<pidfile>, C<errorlog>, 551C<maxchild> and C<savelevel>. They specify, in order, the file where the 552daemon will write its PID after it has detached, the file it will redirect its 553standard error to, the maximum number of concurrent child processes it may 554have and the minumum log level to save to the database. Make sure to set the 555pathnames to values where the user the daemon is running under has write 556permission, since it will terminated if they are specified but can't be 557written to. Additionally, running with a maxchild value of n means that at 558least n+1 simultaneous connections to the database will be opened. Make sure 559that the database can actually handle that, or everything will die with more 560or less understandable error messages. 561 562If everything works as intended nothing should ever be written to the 563errorlog. All normal log outout goes to syslog (and, with the debug flag, 564standard output). 565