1package MogileFS::Worker::JobMaster;
2# manages/monitors the internal queues for various workers.
3# decided to have one of these per tracker instead of have workers
4# all elect one per job type... should be able to reuse more code, and avoid
5# relying on too many database locks.
6
7use strict;
8use base 'MogileFS::Worker';
9use fields (
10            'fsck_queue_limit',
11            'repl_queue_limit',
12            'dele_queue_limit',
13            'rebl_queue_limit',
14            );
15use MogileFS::Util qw(every error debug encode_url_args);
16use MogileFS::Config;
17use MogileFS::Server;
18
19use constant DEF_FSCK_QUEUE_MAX => 20_000;
20use constant DEF_FSCK_QUEUE_INJECT => 1000;
21
22use constant DEF_REBAL_QUEUE_MAX => 10_000;
23use constant DEF_REBAL_QUEUE_INJECT => 500;
24
# Constructor: builds the fields-based worker object and lets the
# MogileFS::Worker base class wire up the parent socket.
sub new {
    my $class = shift;
    my ($psock) = @_;

    my $self = fields::new($class);
    $self->SUPER::new($psock);

    return $self;
}
32
33sub watchdog_timeout { 120; }
34
35# heartbeat all of the queues constantly.
36# if a queue drops below a watermark, check for more work.
37# NOTE: Uh. now that I think about it, should queue_check just return
38# the status for all queues in one roundtrip? :(
39# It's separate in case future workers want to manage their own queues, or
40# this gets split up...
sub work {
    my $self = shift;

    # Every per-queue fetch limit starts at the default of 100;
    # queue_depth_check() grows or resets them as work is found.
    $self->{$_} = 100
        for qw(fsck_queue_limit repl_queue_limit
               dele_queue_limit rebl_queue_limit);

    # Wake up whenever the parent tracker writes to us.
    Danga::Socket->AddOtherFds($self->psock_fd, sub { $self->read_from_parent });

    # kick off the initial run
    $self->check_queues;
    Danga::Socket->EventLoop;
}
55
56# 'pings' parent and populates all queues.
# 'pings' parent and populates all queues.
# Re-arms itself via a Danga::Socket timer: immediately if any queue
# produced work this round, otherwise after a one-second breather.
sub check_queues {
    my $self = shift;

    my $found_work = 0;
    if ($self->validate_dbh) {
        $self->send_to_parent("queue_depth all");
        my $store = Mgd::get_store();
        $self->parent_ping;
        $found_work += $self->$_($store)
            for qw(_check_replicate_queues _check_delete_queues
                   _check_fsck_queues _check_rebal_queues);
    }

    # don't sleep if active (just avoid recursion)
    Danga::Socket->AddTimer($found_work ? 0 : 1, sub { $self->check_queues });
}
74
# Refill the parent's internal delete queue from the database when it
# runs low.  Returns 1 if work was queued, 0 otherwise.
sub _check_delete_queues {
    my $self = shift;
    my $sto  = shift;
    my ($need_fetch, $new_limit) =
        queue_depth_check($self->queue_depth('delete'),
        $self->{dele_queue_limit});
    # Return 0 (not a bare return/undef): check_queues() sums these
    # results numerically, and undef would trigger uninitialized-value
    # warnings under -w.
    return 0 unless $need_fetch;
    my @to_del = $sto->grab_files_to_delete2($new_limit);
    # Nothing fetched means the DB queue ran dry; reset the fetch limit
    # back to the default of 100.
    $self->{dele_queue_limit} = @to_del ? $new_limit : 100;
    return 0 unless @to_del;
    for my $todo (@to_del) {
        $self->send_to_parent("queue_todo delete " .
            encode_url_args($todo));
    }
    return 1;
}
91
92# NOTE: we only maintain one queue per worker, but we can easily
93# give specialized work per worker by tagging the $todo href.
94# in the case of replication, we want a normal "replication" queue,
95# but also "drain" and "rebalance" queues. So use $todo->{type} or something.
96# Drain/rebalance will be way awesomer with a queue attached:
97# "drain 5% of devid 5" or "drain 10G off devids 7,8,9"
98# hell, drain barely works if you encounter errors. Using a work queue
99# should fix that.
100# FIXME: Don't hardcode the min queue depth.
# Refill the parent's internal replicate queue from the database when it
# runs low.  Returns 1 if work was queued, 0 otherwise.
sub _check_replicate_queues {
    my $self = shift;
    my $sto  = shift;
    my ($need_fetch, $new_limit) =
        queue_depth_check($self->queue_depth('replicate'),
        $self->{repl_queue_limit});
    # Return 0 (not a bare return/undef): check_queues() sums these
    # results numerically, and undef would trigger uninitialized-value
    # warnings under -w.
    return 0 unless $need_fetch;
    my @to_repl = $sto->grab_files_to_replicate($new_limit);
    # Nothing fetched means the DB queue ran dry; reset the fetch limit
    # back to the default of 100.
    $self->{repl_queue_limit} = @to_repl ? $new_limit : 100;
    return 0 unless @to_repl;
    # don't need to shuffle or sort, since we're the only tracker to get this
    # list.
    for my $todo (@to_repl) {
        $todo->{_type} = 'replicate'; # could be 'drain', etc.
        $self->send_to_parent("queue_todo replicate " .
            encode_url_args($todo));
    }
    return 1;
}
120
121# FSCK is going to be a little odd... We still need a single "global"
122# fsck worker to do the queue injection, but need to locally poll data.
sub _check_fsck_queues {
    my $self = shift;
    my $sto  = shift;
    # Only the elected fsck host injects new work into the shared DB
    # queue; every tracker (including this one) then drains it below.
    my $fhost = MogileFS::Config->server_setting_cached('fsck_host');
    if ($fhost && $fhost eq MogileFS::Config->hostname) {
        $self->_inject_fsck_queues($sto);
    }

    # Queue depth algorithm:
    # if internal queue is less than 70% full, fetch more.
    # if internal queue bottomed out, increase fetch limit by 50.
    # fetch more work
    # if no work fetched, reset limit to 100 (default)
    # (see queue_depth_check; the old comment said 30%, but the code
    # fetches whenever depth/limit < 0.70)
    my ($need_fetch, $new_limit) =
        queue_depth_check($self->queue_depth('fsck'),
        $self->{fsck_queue_limit});
    # Return 0 (not a bare return/undef): check_queues() sums these
    # results numerically, and undef would trigger uninitialized-value
    # warnings under -w.
    return 0 unless $need_fetch;
    my @to_fsck = $sto->grab_files_to_queued(FSCK_QUEUE,
        'type, flags', $new_limit);
    $self->{fsck_queue_limit} = @to_fsck ? $new_limit : 100;
    return 0 unless @to_fsck;
    for my $todo (@to_fsck) {
        $self->send_to_parent("queue_todo fsck " . encode_url_args($todo));
    }
    return 1;
}
149
# Runs only on the elected fsck host: tops up the shared DB fsck queue
# (FSCK_QUEUE) so any tracker's JobMaster can pull chunks from it.
# Walks fids in ascending order, checkpointed via the
# 'fsck_highest_fid_checked' server setting.
sub _inject_fsck_queues {
    my $self = shift;
    my $sto  = shift;

    # Roll accumulated per-fid fsck log rows into summary counts first.
    $sto->fsck_log_summarize;
    my $queue_size = $sto->file_queue_length(FSCK_QUEUE);
    my $max_queue  =
        MogileFS::Config->server_setting_cached('queue_size_for_fsck') ||
            DEF_FSCK_QUEUE_MAX;
    # Shared queue is full enough; inject nothing this round.
    return if ($queue_size >= $max_queue);

    my $max_checked = MogileFS::Config->server_setting('fsck_highest_fid_checked') || 0;
    my $fid_at_end = MogileFS::Config->server_setting('fsck_fid_at_end');
    # How many fids to enqueue per injection round.
    my $to_inject   =
        MogileFS::Config->server_setting_cached('queue_rate_for_fsck') ||
            DEF_FSCK_QUEUE_INJECT;
    my $fids = $sto->get_fidids_between($max_checked, $fid_at_end, $to_inject);
    unless (@$fids) {
        # No fids remain between the checkpoint and the end fid: the
        # injection pass is complete.  Re-persist the checkpoint first...
        MogileFS::Config->set_server_setting('fsck_highest_fid_checked',
            $max_checked);

        # set these last since tests/scripts may rely on these to
        # determine when fsck (injection) is complete
        $sto->set_server_setting("fsck_host", undef);
        $sto->set_server_setting("fsck_stop_time", $sto->get_db_unixtime);
        return;
    }

    $sto->enqueue_many_for_todo($fids, FSCK_QUEUE, 0);

    # Advance the checkpoint to the highest fid just enqueued
    # (get_fidids_between returns them in ascending order — the last
    # element is the max; TODO confirm against the store implementation).
    my $nmax = $fids->[-1];
    MogileFS::Config->set_server_setting('fsck_highest_fid_checked', $nmax);
}
183
# Refill the parent's internal rebalance queue from the database when it
# runs low; the elected rebalance host also injects new work into the
# shared DB queue first.  Returns 1 if work was queued, 0 otherwise.
sub _check_rebal_queues {
    my $self = shift;
    my $sto  = shift;
    my $rhost = MogileFS::Config->server_setting_cached('rebal_host');
    if ($rhost && $rhost eq MogileFS::Config->hostname) {
        $self->_inject_rebalance_queues($sto);
    }

    my ($need_fetch, $new_limit) =
        queue_depth_check($self->queue_depth('rebalance'),
        $self->{rebl_queue_limit});
    # Return 0 (not a bare return/undef): check_queues() sums these
    # results numerically, and undef would trigger uninitialized-value
    # warnings under -w.
    return 0 unless $need_fetch;
    my @to_rebal = $sto->grab_files_to_queued(REBAL_QUEUE,
        'type, flags, devid, arg', $new_limit);
    # Nothing fetched means the DB queue ran dry; reset the fetch limit
    # back to the default of 100.
    $self->{rebl_queue_limit} = @to_rebal ? $new_limit : 100;
    return 0 unless @to_rebal;
    for my $todo (@to_rebal) {
        $todo->{_type} = 'rebalance';
        $self->send_to_parent("queue_todo rebalance " . encode_url_args($todo));
    }
    return 1;
}
206
# Runs only on the elected rebalance host: loads (or initializes) the
# persisted rebalance state, honors a pending 'stop' signal, and pushes
# the next batch of work into the shared DB rebalance queue.
sub _inject_rebalance_queues {
    my $self = shift;
    my $sto  = shift;

    # Don't overfill the shared queue; let the workers drain it first.
    my $queue_size  = $sto->file_queue_length(REBAL_QUEUE);
    my $max_queue   =
        MogileFS::Config->server_setting_cached('queue_size_for_rebal') ||
            DEF_REBAL_QUEUE_MAX;
    return if ($queue_size >= $max_queue);

    # Batch size per injection round.
    my $to_inject   =
        MogileFS::Config->server_setting_cached('queue_rate_for_rebal') ||
            DEF_REBAL_QUEUE_INJECT;

    # TODO: Cache the rebal object. Requires explicitly blowing it up at the
    # end of a run or ... I guess whenever the host sees it's not the rebal
    # host.
    my $rebal       = MogileFS::Rebalance->new;
    my $signal      = MogileFS::Config->server_setting('rebal_signal');
    my $rebal_pol   = MogileFS::Config->server_setting('rebal_policy');
    my $rebal_state = MogileFS::Config->server_setting('rebal_state');
    $rebal->policy($rebal_pol);

    my @devs = Mgd::device_factory()->get_all;
    if ($rebal_state) {
        $rebal->load_state($rebal_state);
    } else {
        $rebal->init(\@devs);
    }

    # Stopping is done via signal so we can note stop time in the state,
    # and un-drain any devices that should be un-drained.
    if ($signal && $signal eq 'stop') {
        $rebal->stop;
        $rebal_state = $rebal->save_state;
        $sto->set_server_setting('rebal_signal', undef);
        $sto->set_server_setting("rebal_host", undef);
        $sto->set_server_setting('rebal_state', $rebal_state);
        return;
    }

    my $devfids = $rebal->next_fids_to_rebalance(\@devs, $sto, $to_inject);

    # undefined means there's no work left.
    if (! defined $devfids) {
        # Append some info to a rebalance log table?
        # Leave state in the system for inspection post-run.
        # TODO: Emit some sort of syslog/status line.
        $rebal->finish;
        $rebal_state = $rebal->save_state;
        $sto->set_server_setting('rebal_state', $rebal_state);
        $sto->set_server_setting("rebal_host", undef);
        return;
    }

    # Empty means nothing to queue this round.
    if (@$devfids) {
        # Flatten each entry's destination-devid arrayref into a CSV
        # string for storage.  (Was a void-context map; a for loop is
        # the idiomatic construct for side effects.)
        $_->[2] = join(',', @{$_->[2]}) for @$devfids;
        $sto->enqueue_many_for_todo($devfids, REBAL_QUEUE, 0);
    }

    # Persist progress so the next round (or another elected host)
    # resumes from here.
    $rebal_state = $rebal->save_state;
    MogileFS::Config->set_server_setting("rebal_state", $rebal_state);
}
272
273# takes the current queue depth and fetch limit
274# returns whether or not to fetch, and new fetch limit.
275# TODO: separate a fetch limit from a queue limit...
276# so we don't hammer the DB with giant transactions, but loop
277# fast trying to keep the queue full.
# takes the current queue depth and fetch limit
# returns whether or not to fetch, and new fetch limit.
# TODO: separate a fetch limit from a queue limit...
# so we don't hammer the DB with giant transactions, but loop
# fast trying to keep the queue full.
sub queue_depth_check {
    my ($depth, $limit) = @_;

    # Global cap on how large any fetch limit may grow.
    my $max_limit =
        MogileFS::Config->server_setting_cached('internal_queue_limit')
            || 500;

    if ($depth == 0) {
        # Queue bottomed out: ramp the fetch limit up (capped).
        $limit += 50 if $limit < $max_limit;
        return (1, $limit);
    }

    # Below 70% full: top up at the current limit; otherwise hold off.
    return (1, $limit) if $depth / $limit < 0.70;
    return (0, $limit);
}
292
2931;
294
295# Local Variables:
296# mode: perl
297# c-basic-indent: 4
298# indent-tabs-mode: nil
299# End:
300