package MogileFS::Worker::JobMaster;
# Manages/monitors the internal queues for various workers.
# We decided to have one of these per tracker, instead of having workers
# all elect one per job type: this way we can reuse more code and avoid
# relying on too many database locks.

use strict;
use base 'MogileFS::Worker';
use fields (
    'fsck_queue_limit',
    'repl_queue_limit',
    'dele_queue_limit',
    'rebl_queue_limit',
);
use MogileFS::Util qw(every error debug encode_url_args);
use MogileFS::Config;
use MogileFS::Server;

use constant DEF_FSCK_QUEUE_MAX => 20_000;
use constant DEF_FSCK_QUEUE_INJECT => 1000;

use constant DEF_REBAL_QUEUE_MAX => 10_000;
use constant DEF_REBAL_QUEUE_INJECT => 500;

sub new {
    my ($class, $psock) = @_;
    my $self = fields::new($class);
    $self->SUPER::new($psock);

    return $self;
}

sub watchdog_timeout { 120; }

# Heartbeat all of the queues constantly; if a queue drops below a
# watermark, check for more work.
# NOTE: should the check just return the status for all queues in one
# roundtrip? It's kept separate in case future workers want to manage
# their own queues, or in case this gets split up.
sub work {
    my $self = shift;

    $self->{fsck_queue_limit} = 100;
    $self->{repl_queue_limit} = 100;
    $self->{dele_queue_limit} = 100;
    $self->{rebl_queue_limit} = 100;

    Danga::Socket->AddOtherFds($self->psock_fd, sub { $self->read_from_parent });

    # kick off the initial run
    $self->check_queues;
    Danga::Socket->EventLoop;
}

# 'Pings' the parent and populates all queues.
sub check_queues {
    my $self = shift;

    my $active = 0;
    if ($self->validate_dbh) {
        $self->send_to_parent("queue_depth all");
        my $sto = Mgd::get_store();
        $self->parent_ping;
        $active += $self->_check_replicate_queues($sto);
        $active += $self->_check_delete_queues($sto);
        $active += $self->_check_fsck_queues($sto);
        $active += $self->_check_rebal_queues($sto);
    }

    # Don't sleep if we did work this round; rescheduling via a timer
    # (rather than calling ourselves) also avoids recursion.
    Danga::Socket->AddTimer($active ? 0 : 1, sub { $self->check_queues });
}

sub _check_delete_queues {
    my $self = shift;
    my $sto  = shift;
    my ($need_fetch, $new_limit) =
        queue_depth_check($self->queue_depth('delete'),
                          $self->{dele_queue_limit});
    return unless $need_fetch;
    my @to_del = $sto->grab_files_to_delete2($new_limit);
    $self->{dele_queue_limit} = @to_del ? $new_limit : 100;
    return unless @to_del;
    for my $todo (@to_del) {
        $self->send_to_parent("queue_todo delete " .
                              encode_url_args($todo));
    }
    return 1;
}

# NOTE: We only maintain one queue per worker, but we can easily give
# specialized work per worker by tagging the $todo href.
# In the case of replication, we want a normal "replication" queue, but
# also "drain" and "rebalance" queues; hence the $todo->{_type} tag below.
# Drain/rebalance will be far more capable with a queue attached:
# "drain 5% of devid 5" or "drain 10G off devids 7,8,9".
# As it stands, drain barely works if you encounter errors; using a work
# queue should fix that.
# FIXME: Don't hardcode the min queue depth.
sub _check_replicate_queues {
    my $self = shift;
    my $sto  = shift;
    my ($need_fetch, $new_limit) =
        queue_depth_check($self->queue_depth('replicate'),
                          $self->{repl_queue_limit});
    return unless $need_fetch;
    my @to_repl = $sto->grab_files_to_replicate($new_limit);
    $self->{repl_queue_limit} = @to_repl ? $new_limit : 100;
    return unless @to_repl;
    # We don't need to shuffle or sort, since we're the only tracker to get
    # this list.
    for my $todo (@to_repl) {
        $todo->{_type} = 'replicate'; # could be 'drain', etc.
        $self->send_to_parent("queue_todo replicate " .
                              encode_url_args($todo));
    }
    return 1;
}
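
# As a sketch of the tagging idea from the NOTE above: a consumer on the
# other end could dispatch on the _type tag without this module needing a
# separate queue per job flavor. Everything below is hypothetical; the
# helper names (process_todo, replicate_fid, drain_fid_from_device) are
# illustrative and not part of MogileFS.
#
#   sub process_todo {
#       my ($self, $todo) = @_;
#       my $type = $todo->{_type} || 'replicate';
#       if ($type eq 'drain') {
#           # a drain todo could carry its source device along
#           $self->drain_fid_from_device($todo->{fid}, $todo->{devid});
#       } else {
#           $self->replicate_fid($todo->{fid});
#       }
#   }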

# FSCK is a little odd: we still need a single "global" fsck worker to do
# the queue injection, but each tracker polls the queue data locally.
sub _check_fsck_queues {
    my $self = shift;
    my $sto  = shift;
    my $fhost = MogileFS::Config->server_setting_cached('fsck_host');
    if ($fhost && $fhost eq MogileFS::Config->hostname) {
        $self->_inject_fsck_queues($sto);
    }

    # Queue depth algorithm:
    # - if the internal queue is less than 70% full, fetch more work
    # - if the internal queue bottomed out, increase the fetch limit by 50
    # - if no work was fetched, reset the limit to the default of 100
    my ($need_fetch, $new_limit) =
        queue_depth_check($self->queue_depth('fsck'),
                          $self->{fsck_queue_limit});
    return unless $need_fetch;
    my @to_fsck = $sto->grab_files_to_queued(FSCK_QUEUE,
        'type, flags', $new_limit);
    $self->{fsck_queue_limit} = @to_fsck ? $new_limit : 100;
    return unless @to_fsck;
    for my $todo (@to_fsck) {
        $self->send_to_parent("queue_todo fsck " . encode_url_args($todo));
    }
    return 1;
}

sub _inject_fsck_queues {
    my $self = shift;
    my $sto  = shift;

    $sto->fsck_log_summarize;
    my $queue_size = $sto->file_queue_length(FSCK_QUEUE);
    my $max_queue =
        MogileFS::Config->server_setting_cached('queue_size_for_fsck') ||
        DEF_FSCK_QUEUE_MAX;
    return if ($queue_size >= $max_queue);

    my $max_checked = MogileFS::Config->server_setting('fsck_highest_fid_checked') || 0;
    my $fid_at_end = MogileFS::Config->server_setting('fsck_fid_at_end');
    my $to_inject =
        MogileFS::Config->server_setting_cached('queue_rate_for_fsck') ||
        DEF_FSCK_QUEUE_INJECT;
    my $fids = $sto->get_fidids_between($max_checked, $fid_at_end, $to_inject);
    unless (@$fids) {
        MogileFS::Config->set_server_setting('fsck_highest_fid_checked',
                                             $max_checked);

        # Set these last, since tests/scripts may rely on them to
        # determine when fsck (injection) is complete.
        $sto->set_server_setting("fsck_host", undef);
        $sto->set_server_setting("fsck_stop_time", $sto->get_db_unixtime);
        return;
    }

    $sto->enqueue_many_for_todo($fids, FSCK_QUEUE, 0);

    my $nmax = $fids->[-1];
    MogileFS::Config->set_server_setting('fsck_highest_fid_checked', $nmax);
}

sub _check_rebal_queues {
    my $self = shift;
    my $sto  = shift;
    my $rhost = MogileFS::Config->server_setting_cached('rebal_host');
    if ($rhost && $rhost eq MogileFS::Config->hostname) {
        $self->_inject_rebalance_queues($sto);
    }

    my ($need_fetch, $new_limit) =
        queue_depth_check($self->queue_depth('rebalance'),
                          $self->{rebl_queue_limit});
    return unless $need_fetch;
    my @to_rebal = $sto->grab_files_to_queued(REBAL_QUEUE,
        'type, flags, devid, arg', $new_limit);
    $self->{rebl_queue_limit} = @to_rebal ? $new_limit : 100;
    return unless @to_rebal;
    for my $todo (@to_rebal) {
        $todo->{_type} = 'rebalance';
        $self->send_to_parent("queue_todo rebalance " .
                              encode_url_args($todo));
    }
    return 1;
}
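
# For reference, the queue_todo lines sent to the parent above are plain
# text. Assuming encode_url_args() serializes the $todo hashref as
# URL-escaped key=value pairs joined by '&' (hash key order is not
# guaranteed), a rebalance todo would cross the pipe looking roughly like:
#
#   queue_todo rebalance fid=1234&devid=7&_type=rebalance&flags=0
#
# The fid/devid values here are made up for illustration.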

sub _inject_rebalance_queues {
    my $self = shift;
    my $sto  = shift;

    my $queue_size = $sto->file_queue_length(REBAL_QUEUE);
    my $max_queue =
        MogileFS::Config->server_setting_cached('queue_size_for_rebal') ||
        DEF_REBAL_QUEUE_MAX;
    return if ($queue_size >= $max_queue);

    my $to_inject =
        MogileFS::Config->server_setting_cached('queue_rate_for_rebal') ||
        DEF_REBAL_QUEUE_INJECT;

    # TODO: Cache the rebal object. That requires explicitly discarding it
    # at the end of a run, or whenever this host sees it's no longer the
    # rebal host.
    my $rebal = MogileFS::Rebalance->new;
    my $signal = MogileFS::Config->server_setting('rebal_signal');
    my $rebal_pol = MogileFS::Config->server_setting('rebal_policy');
    my $rebal_state = MogileFS::Config->server_setting('rebal_state');
    $rebal->policy($rebal_pol);

    my @devs = Mgd::device_factory()->get_all;
    if ($rebal_state) {
        $rebal->load_state($rebal_state);
    } else {
        $rebal->init(\@devs);
    }

    # Stopping is done via a signal so we can note the stop time in the
    # state, and un-drain any devices that should be un-drained.
    if ($signal && $signal eq 'stop') {
        $rebal->stop;
        $rebal_state = $rebal->save_state;
        $sto->set_server_setting('rebal_signal', undef);
        $sto->set_server_setting("rebal_host", undef);
        $sto->set_server_setting('rebal_state', $rebal_state);
        return;
    }

    my $devfids = $rebal->next_fids_to_rebalance(\@devs, $sto, $to_inject);

    # undef means there's no work left.
    if (! defined $devfids) {
        # Leave state in the system for inspection post-run.
        # TODO: Append some info to a rebalance log table, and emit some
        # sort of syslog/status line.
        $rebal->finish;
        $rebal_state = $rebal->save_state;
        $sto->set_server_setting('rebal_state', $rebal_state);
        $sto->set_server_setting("rebal_host", undef);
        return;
    }

    # An empty list means nothing to queue this round.
    if (@$devfids) {
        # I wish there was less data serialization in the world: flatten
        # each devid arrayref into a comma-separated string for storage.
        $_->[2] = join(',', @{$_->[2]}) for @$devfids;
        $sto->enqueue_many_for_todo($devfids, REBAL_QUEUE, 0);
    }

    $rebal_state = $rebal->save_state;
    MogileFS::Config->set_server_setting("rebal_state", $rebal_state);
}

# Takes the current queue depth and fetch limit; returns whether or not to
# fetch, and the new fetch limit.
# TODO: separate the fetch limit from the queue limit, so we don't hammer
# the DB with giant transactions but still loop quickly trying to keep the
# queue full.
sub queue_depth_check {
    my $max_limit =
        MogileFS::Config->server_setting_cached('internal_queue_limit')
        || 500;

    my ($depth, $limit) = @_;
    if ($depth == 0) {
        $limit += 50 unless $limit >= $max_limit;
        return (1, $limit);
    } elsif ($depth / $limit < 0.70) {
        return (1, $limit);
    }
    return (0, $limit);
}

1;

# Local Variables:
# mode: perl
# c-basic-indent: 4
# indent-tabs-mode: nil
# End:
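
# A worked example of queue_depth_check(), assuming the default
# internal_queue_limit of 500 (the numbers are illustrative):
#
#   queue_depth_check(0, 100)   => (1, 150)  # bottomed out: fetch, ramp limit up by 50
#   queue_depth_check(0, 500)   => (1, 500)  # ramp is capped at internal_queue_limit
#   queue_depth_check(60, 150)  => (1, 150)  # 40% full, under 70%: fetch at same limit
#   queue_depth_check(140, 150) => (0, 150)  # ~93% full: skip fetching this round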