1#!/usr/bin/perl
2#
3# $Id: $
4#
5# Copyright (c) 2007 .SE (The Internet Infrastructure Foundation).
6#                    All rights reserved.
7#
8# Redistribution and use in source and binary forms, with or without
9# modification, are permitted provided that the following conditions
10# are met:
11# 1. Redistributions of source code must retain the above copyright
12#    notice, this list of conditions and the following disclaimer.
13# 2. Redistributions in binary form must reproduce the above copyright
14#    notice, this list of conditions and the following disclaimer in the
15#    documentation and/or other materials provided with the distribution.
16#
17# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23# GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
25# IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
26# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
27# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28#
29######################################################################
30use 5.008;
31
32use warnings;
33use strict;
34
35use DNSCheck;
36
37use Getopt::Long;
38use Sys::Syslog;
39use POSIX qw(:sys_wait_h strftime);
40use Time::HiRes 'sleep';
41
42use vars qw[
43  %running
44  %reaped
45  %problem
46  $user
47  $verbose
48  $check
49  $limit
50  $running
51  $restart
52  @saved_argv
53  $syslog
54  $exit_timeout
55  $savelevel
56  %levels
57  $debug
58];
59
60%running   = ();
61%reaped    = ();
62%problem   = ();
63$debug     = 0;
64$verbose   = 0;
65$check     = DNSCheck->new;
66$limit     = $check->config->get("daemon")->{maxchild};
67$savelevel = $check->config->get("daemon")->{savelevel} || 'INFO';
68$running   = 1;
69$restart   = 0;
70$syslog    = 1;
71%levels    = (
72    DEBUG    => 0,
73    INFO     => 1,
74    NOTICE   => 2,
75    WARNING  => 3,
76    ERROR    => 4,
77    CRITICAL => 5,
78);
79# user to run as
80$user = 'nobody';
81# Kick everything off
82main();
83
84################################################################
85# Utility functions and program setup
86################################################################
87
88# Log something. Far, far more complex than it should have to be, to keep from
89# dying if we suddenly lose contact with syslogd. Which we do if the system is
90# too heavily loaded.
91sub slog {
92    my $priority = shift;
93    my $tries    = 0;
94
95    # See perldoc on sprintf for why we have to write it like this
96    my $msg = sprintf($_[0], @_[1 .. $#_]);
97
98    printf("%s (%d): %s\n", uc($priority), $$, $msg) if $debug;
99
100  TRY:
101    eval {
102        if ($syslog)
103        {
104            syslog($priority, @_);
105        } else {
106            printf STDERR "%s (%d): %s\n", uc($priority), $$, $msg;
107        }
108    };
109    if ($@) {
110        if ($tries < 5) {
111            print STDERR "Trying to reconnect to syslogd...\n";
112            sleep(0.5);
113            $tries += 1;
114            openlog($check->config->get("syslog")->{ident},
115                'pid', $check->config->get("syslog")->{facility});
116            goto TRY;
117        } else {
118            print STDERR
119              "SYSLOG CONNECTION LOST. Switching to stderr logging.\n";
120            $syslog = 0;
121            printf STDERR "%s (%d): %s\n", uc($priority), $$, $msg;
122        }
123    }
124}
125
126sub setup {
127    my $errfile = $check->config->get("daemon")->{errorlog};
128    my $pidfile = $check->config->get("daemon")->{pidfile};
129    my $uid;
130    unless ($uid = (getpwnam($user))[2]) {
131	die "Attempt to run dispatcher as non-existent user or as root\n";
132    }
133    @saved_argv = @ARGV;    # We'll use this if we're asked to restart ourselves
134    GetOptions('debug' => \$debug, 'verbose' => \$verbose);
135    openlog($check->config->get("syslog")->{ident},
136        'pid', $check->config->get("syslog")->{facility});
137    slog 'info', "$0 starting with %d maximum children.",
138      $check->config->get("daemon")->{maxchild};
139    slog 'info', 'IPv4 disabled.' unless $check->config->get("net")->{ipv4};
140    slog 'info', 'IPv6 disabled.' unless $check->config->get("net")->{ipv6};
141    slog 'info', 'SMTP disabled.' unless $check->config->get("net")->{smtp};
142    slog 'info', 'Logging as %s to facility %s.',
143      $check->config->get("syslog")->{ident},
144      $check->config->get("syslog")->{facility};
145    slog 'info', 'Reading config from %s and %s.',
146      $check->config->get("configfile"), $check->config->get("siteconfigfile");
147
148    unless ($check->dbh) {
149        die "Failed to connect to database. Exiting.\n";
150    }
151    detach() unless $debug;
152    open STDERR, '>>', $errfile or die "Failed to open error log: $!";
153    printf STDERR "%s starting at %s\n", $0, scalar(localtime);
154    open PIDFILE, '>', $pidfile or die "Failed to open PID file: $!";
155    print PIDFILE $$;
156    close PIDFILE;
157# become non-root
158    $>= $uid;
159    $SIG{CHLD} = \&REAPER;
160    $SIG{TERM} = sub { $running = 0 };
161    $SIG{HUP}  = sub {
162        $running = 0;
163        $restart = 1;
164    };
165}
166
167sub detach
168{  # Instead of using ioctls and setfoo calls we use the old double-fork method.
169    my $pid;
170
171    # Once...
172    $pid = fork;
173    exit if $pid;
174    die "Fork failed: $!" unless defined($pid);
175
176    # ...and again
177    $pid = fork;
178    exit if $pid;
179    die "Fork failed: $!" unless defined($pid);
180    slog('info', 'Detached.');
181}
182
183# Clean up residue from earlier run(s), if any.
184sub inital_cleanup {
185    my $dbh;
186
187    eval { $dbh = $check->dbh; };
188    if ($@) {
189        slog 'critical', 'Database not available. Exiting.';
190        exit(1);
191    }
192
193    $dbh->do(
194q[UPDATE queue SET inprogress = NULL WHERE inprogress IS NOT NULL AND tester_pid IS NULL]
195    );
196    my $c = $dbh->selectall_hashref(
197q[SELECT id, domain, tester_pid FROM queue WHERE inprogress IS NOT NULL AND tester_pid IS NOT NULL],
198        'tester_pid'
199    );
200    foreach my $k (keys %$c) {
201        if (kill 0, $c->{$k}{tester_pid}) {
202
203      # The process running this test is still alive, so just remove it from the
204      # queue.
205            $dbh->do(q[DELETE FROM queue WHERE id = ?], undef, $c->{$k}{id});
206            slog 'info', 'Removed %s from queue', $c->{$k}{domain};
207        } else {
208
209            # The process running this test has died, so reschedule it
210            $dbh->do(q[UPDATE queue SET inprogress = NULL WHERE id = ?],
211                undef, $c->{$k}{id});
212            slog 'info', 'Rescheduled test for %s', $c->{$k}{domain};
213        }
214    }
215}
216
217################################################################
218# Dispatcher
219################################################################
220
221sub dispatch {
222    my $domain;
223    my $id;
224    my $source;
225    my $source_data;
226    my $fake_glue;
227    my $priority;
228
229    if (scalar keys %running < $limit) {
230        ($domain, $id, $source, $source_data, $fake_glue, $priority) =
231          get_entry();
232        slog 'debug', "Fetched $domain from database." if defined($domain);
233    } else {
234
235        # slog 'info', 'Process limit reached.';
236    }
237
238    if (defined($domain)) {
239        unless (defined($problem{$domain}) and $problem{$domain} >= 5) {
240            process($domain, $id, $source, $source_data, $fake_glue, $priority);
241        } else {
242            slog 'error',
243"Testing $domain caused repeated abnormal termination of children. Assuming bug. Exiting.";
244            $running = 0;
245        }
246        return
247          0.0
248          ;  # There was something in the queue, so check for more without delay
249    } else {
250        return 0.25;    # Queue empty or process slots full. Wait a little.
251    }
252}
253
254sub get_entry {
255    my $dbh;
256
257    eval { $dbh = $check->dbh; };
258    if ($@) {
259        slog 'critical', 'Database not available. Exiting.';
260        exit(1);
261    }
262
263    my ($id, $domain, $source, $source_data, $fake_glue, $priority);
264
265    eval {
266        $dbh->begin_work;
267        ($id, $domain, $source, $source_data, $fake_glue, $priority) =
268          $dbh->selectrow_array(
269q[SELECT id, domain, source_id, source_data, fake_parent_glue, priority FROM queue WHERE inprogress IS NULL AND priority = (SELECT MAX(priority) FROM queue WHERE inprogress IS NULL) ORDER BY id ASC LIMIT 1 FOR UPDATE]
270          );
271        slog 'debug', "Got $id, $domain from database."
272          if (defined($domain) or defined($id));
273        $dbh->do(q[UPDATE queue SET inprogress = NOW() WHERE id = ?],
274            undef, $id);
275        $dbh->commit;
276    };
277    if ($@) {
278        my $err = $@;
279        slog 'warning', "Database error in get_entry: $err";
280
281        if ($err =~
282/(DBD driver has not implemented the AutoCommit attribute)|(Lost connection to MySQL server during query)/
283            and defined($id))
284        {
285
286            # Database handle went away. Try to recover.
287            slog 'info',
288              "Known problem. Trying to clear inprogress for queue id $id.";
289            $dbh = $check->dbh;
290            $dbh->do(q[UPDATE queue SET inprogress = NULL WHERE id = ?],
291                undef, $id);
292        }
293
294        if ($err =~ m|Already in a transaction|) {
295            slog 'critical',
296              'Serious problem. Sleeping, then trying to restart.';
297            $running = 0;
298            $restart = 1;
299            sleep(15);
300            return;
301        }
302
303        return undef;
304    }
305
306    return ($domain, $id, $source, $source_data, $fake_glue, $priority);
307}
308
309sub process {
310    my $domain      = shift;
311    my $id          = shift;
312    my $source      = shift;
313    my $source_data = shift;
314    my $fake_glue   = shift;
315    my $priority    = shift;
316
317    my $pid = fork;
318
319    if ($pid) {    # True values, so parent
320        $running{$pid} = $domain;
321        slog 'debug', "Child process $pid has been started.";
322    } elsif ($pid == 0) {    # Zero value, so child
323        running_in_child($domain, $id, $source, $source_data, $fake_glue,
324            $priority);
325    } else {                 # Undefined value, so error
326        die "Fork failed: $!";
327    }
328}
329
330sub running_in_child {
331    my $domain      = shift;
332    my $id          = shift;
333    my $source      = shift;
334    my $source_data = shift;
335    my $fake_glue   = shift;
336    my $priority    = shift;
337
338    # Reuse the old configuration, but get new everything else.
339    my $dc  = DNSCheck->new({ with_config_object => $check->config });
340    my $dbh = $dc->dbh;
341    my $log = $dc->logger;
342
343    setpriority(0, $$, 20 - 2 * $priority);
344
345    if (defined($fake_glue)) {
346        my @ns = split(/\s+/, $fake_glue);
347        foreach my $n (@ns) {
348            my ($name, $ip) = split(m|/|, $n);
349            $dc->add_fake_glue($domain, $name, $ip);
350        }
351    }
352
353   # On some OS:s (including Ubuntu Linux), this is visible in the process list.
354    $0 = "dispatcher: testing $domain (queue id $id)";
355
356    $dbh->do(q[UPDATE queue SET tester_pid = ? WHERE id = ?], undef, $$, $id);
357    $dbh->do(
358q[INSERT INTO tests (domain,begin, source_id, source_data) VALUES (?,NOW(),?,?)],
359        undef, $domain, $source, $source_data
360    );
361
362    my $test_id = $dbh->{'mysql_insertid'};
363    slog 'debug', "$$ running test number $test_id.";
364    my $line = 0;
365
366    # This line hides all the actual useful work.
367    $dc->zone->test($domain);
368
369    my $sth = $dbh->prepare(
370        q[
371        INSERT INTO results
372          (test_id,line,module_id,parent_module_id,timestamp,level,message,
373          arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9)
374          VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
375        ]
376    );
377    while (defined(my $e = $log->get_next_entry)) {
378        next if ($levels{ $e->{level} } < $levels{$savelevel});
379        $line++;
380        my $time = strftime("%Y-%m-%d %H:%M:%S", localtime($e->{timestamp}));
381        $sth->execute(
382            $test_id,               $line,        $e->{module_id},
383            $e->{parent_module_id}, $time,        $e->{level},
384            $e->{tag},              $e->{arg}[0], $e->{arg}[1],
385            $e->{arg}[2],           $e->{arg}[3], $e->{arg}[4],
386            $e->{arg}[5],           $e->{arg}[6], $e->{arg}[7],
387            $e->{arg}[8],           $e->{arg}[9],
388        );
389    }
390
391    $dbh->do(
392q[UPDATE tests SET end = NOW(), count_critical = ?, count_error = ?, count_warning = ?, count_notice = ?, count_info = ?
393  WHERE id = ?],
394        undef, $log->count_critical, $log->count_error, $log->count_warning,
395        $log->count_notice, $log->count_info, $test_id
396    );
397
398# Everything went well, so exit nicely (if they didn't go well, we've already died not-so-nicely).
399    slog 'debug', "$$ about to exit nicely.";
400    exit(0);
401}
402
403################################################################
404# Child process handling
405################################################################
406
407sub monitor_children {
408    my @pids = keys
409      %reaped;    # Can't trust %reaped to stay static while we work through it
410
411    foreach my $pid (@pids) {
412        slog 'debug', "Child process $pid has died.";
413
414        my $domain   = $running{$pid};
415        my $exitcode = $reaped{$pid};
416        delete $running{$pid};
417        delete $reaped{$pid};
418        cleanup($domain, $exitcode, $pid);
419    }
420
421    if (defined($exit_timeout) and time() - $exit_timeout > 300) {
422        %running = ();
423    }
424}
425
426sub cleanup {
427    my $domain   = shift;
428    my $exitcode = shift;
429    my $pid      = shift;
430    my $dbh;
431
432    eval { $dbh = $check->dbh; };
433    if ($@) {
434        slog 'critical', "Cannot connect to database. Exiting.";
435        exit(1);
436    }
437
438    my $status = $exitcode >> 8;
439    my $signal = $exitcode & 127;
440
441    if ($status == 0) {
442
443        # Child died nicely.
444      AGAIN: eval {
445            $dbh->do(q[DELETE FROM queue WHERE domain = ? AND tester_pid = ?],
446                undef, $domain, $pid);
447        };
448        if ($@)
449        { # mysqld dumped us. Get a new handle and try again, after a little pause
450            slog 'warning',
451              "Failed to delete queue entry for $domain. Retrying.";
452            sleep(0.25);
453            $dbh = $check->dbh;
454            goto AGAIN;
455        }
456
457    } else {
458
459        # Child blew up. Clean up.
460        $problem{$domain} += 1;
461        slog 'warning', "Unclean exit when testing $domain (status $status).";
462        $dbh->do(q[UPDATE queue SET inprogress = NULL WHERE domain = ?],
463            undef, $domain);
464        $dbh->do(
465q[DELETE FROM tests WHERE begin IS NOT NULL AND end IS NULL AND domain = ?],
466            undef, $domain
467        );
468    }
469}
470
471# This code is mostly stolen from the perlipc manpage.
472sub REAPER {
473    my $child;
474
475    while (($child = waitpid(-1, WNOHANG)) > 0) {
476        $reaped{$child} = $?;
477    }
478    $SIG{CHLD} = \&REAPER;
479}
480
481################################################################
482# Main program
483################################################################
484
485sub main {
486    setup();
487    inital_cleanup();
488    while ($running) {
489        my $skip = dispatch();
490        monitor_children();
491        sleep($skip);
492    }
493    slog 'info', "Waiting for %d children to exit.", scalar keys %running;
494    $exit_timeout = time();
495    monitor_children until (keys %running == 0);
496    unlink $check->config->get("daemon")->{pidfile};
497    slog 'info', "$0 exiting normally.";
498    printf STDERR "%s exiting normally.\n", $0;
499    if ($restart) {
500        slog 'info', "Attempting to restart myself (as $0 @saved_argv).";
501        exec($0, @saved_argv);
502        warn "Exec failed: $!";
503    }
504}
505
506__END__
507
508=head1 NAME
509
510dnscheck-dispatcher - daemon program to run tests from a database queue
511
512=head2 SYNOPSIS
513
514    dnscheck-dispatcher [--debug]
515
516=head2 DESCRIPTION
517
518This daemon puts itself into the background (unless the --debug flag is given)
519and repeatedly queries the table C<queue> in the configured database for
520domains to test. When it gets one, it spawns a new process to run the tests.
521If there are no domains to check, or if the configured maximum number of
522active child processes has been reached, it sleeps 0.25 seconds and then tries
523again. It keeps doing this until it is terminated by a SIGTERM. At that point,
524it will wait until all children have died and cleanups been performed before it
525removes its PID file and then exits.
526
527=head2 OPTIONS
528
529=over
530
531=item --debug
532
533Prevents the daemon from going into the background and duplicates log
534information to standard output (it still goes to syslog as well).
535
536=back
537
538=head1 CONFIGURATION
539
540L<dnscheck-dispatcher> shares configuration files with the L<DNSCheck> perl
541modules. Or, to be more precise, it creates such an object and then queries
542its configuration object for its configuration information. It also uses the
543L<DNSCheck> object to get its database connection.
544
545There are two keys in the configuration YAML files that are of interest for
546the dispatcher. The first one is C<syslog>. It has the subkeys C<ident>, which
547specifies the name the daemon will use when talking to syslogd, and
548C<facility>, which specifies the syslog facility to use.
549
550The second one is C<daemon>. It has the subkeys C<pidfile>, C<errorlog>,
551C<maxchild> and C<savelevel>. They specify, in order, the file where the
552daemon will write its PID after it has detached, the file it will redirect its
553standard error to, the maximum number of concurrent child processes it may
554have and the minumum log level to save to the database. Make sure to set the
555pathnames to values where the user the daemon is running under has write
556permission, since it will terminated if they are specified but can't be
557written to. Additionally, running with a maxchild value of n means that at
558least n+1 simultaneous connections to the database will be opened. Make sure
559that the database can actually handle that, or everything will die with more
560or less understandable error messages.
561
562If everything works as intended nothing should ever be written to the
563errorlog. All normal log outout goes to syslog (and, with the debug flag,
564standard output).
565