1 /*
2 BAREOS® - Backup Archiving REcovery Open Sourced
3
4 Copyright (C) 2000-2011 Free Software Foundation Europe e.V.
5 Copyright (C) 2011-2012 Planets Communications B.V.
6 Copyright (C) 2013-2016 Bareos GmbH & Co. KG
7
8 This program is Free Software; you can redistribute it and/or
9 modify it under the terms of version three of the GNU Affero General Public
10 License as published by the Free Software Foundation and included
11 in the file LICENSE.
12
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Affero General Public License for more details.
17
18 You should have received a copy of the GNU Affero General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA.
22 */
23 /*
24 * Kern Sibbald, MM
25 */
26 /**
27 * @file
28 * Job control and execution for Storage Daemon
29 */
30
31 #include "include/bareos.h"
32 #include "stored/stored.h"
33 #include "stored/bsr.h"
34 #include "stored/acquire.h"
35 #include "stored/fd_cmds.h"
36 #include "stored/jcr_private.h"
37 #include "stored/ndmp_tape.h"
38 #include "stored/read_record.h"
39 #include "stored/stored_globals.h"
40 #include "lib/bsock.h"
41 #include "lib/edit.h"
42 #include "lib/parse_bsr.h"
43 #include "lib/util.h"
44 #include "include/jcr.h"
45 #include "include/protocol_types.h"
46
47 namespace storagedaemon {
48
/*
 * Mutex paired with the per-job job_start_wait / job_end_wait condition
 * variables while DoJobRun() and nextRunCmd() wait on them.
 */
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Request from the Director daemon, used as the scan format in job_cmd().
 * It contains 19 conversions; each string conversion is limited to 127
 * characters. The format is never modified, hence const.
 */
static const char jobcmd[] =
    "JobId=%d job=%127s job_name=%127s client_name=%127s "
    "type=%d level=%d FileSet=%127s NoAttr=%d SpoolAttr=%d FileSetMD5=%127s "
    "SpoolData=%d PreferMountedVols=%d SpoolSize=%127s "
    "rerunning=%d VolSessionId=%d VolSessionTime=%d Quota=%llu "
    "Protocol=%d BackupFormat=%127s\n";

/*
 * Responses sent to the Director daemon (printf-style formats, read-only).
 *  OK_job     - job accepted: session id, session time, authorization key
 *  OK_nextrun - next run accepted: new authorization key
 *  BAD_job    - job command could not be parsed: scan count, raw command
 *  Job_end    - final job summary sent by FinishCmd()
 */
static const char OK_job[] =
    "3000 OK Job SDid=%u SDtime=%u Authorization=%s\n";
static const char OK_nextrun[] = "3000 OK Job Authorization=%s\n";
static const char BAD_job[] = "3915 Bad Job command. stat=%d CMD: %s\n";
static const char Job_end[] =
    "3099 Job %s end JobStatus=%d JobFiles=%d JobBytes=%s JobErrors=%u\n";
65
66 /**
67 * Director requests us to start a job
68 * Basic tasks done here:
69 * - We pickup the JobId to be run from the Director.
70 * - We pickup the device, media, and pool from the Director
71 * - Wait for a connection from the File Daemon (FD)
72 * - Accept commands from the FD (i.e. run the job)
73 * - Return when the connection is terminated or
74 * there is an error.
75 */
job_cmd(JobControlRecord * jcr)76 bool job_cmd(JobControlRecord* jcr)
77 {
78 int32_t JobId;
79 char auth_key[MAX_NAME_LENGTH];
80 char seed[MAX_NAME_LENGTH];
81 char spool_size[MAX_NAME_LENGTH];
82 BareosSocket* dir = jcr->dir_bsock;
83 PoolMem job_name, client_name, job, fileset_name, fileset_md5, backup_format;
84 int32_t JobType, level, spool_attributes, no_attributes, spool_data;
85 int32_t PreferMountedVols, rerunning, protocol;
86 int status;
87 uint64_t quota = 0;
88 JobControlRecord* ojcr;
89
90 /*
91 * Get JobId and permissions from Director
92 */
93 Dmsg1(100, "<dird: %s", dir->msg);
94 bstrncpy(spool_size, "0", sizeof(spool_size));
95 status = sscanf(dir->msg, jobcmd, &JobId, job.c_str(), job_name.c_str(),
96 client_name.c_str(), &JobType, &level, fileset_name.c_str(),
97 &no_attributes, &spool_attributes, fileset_md5.c_str(),
98 &spool_data, &PreferMountedVols, spool_size, &rerunning,
99 &jcr->VolSessionId, &jcr->VolSessionTime, "a, &protocol,
100 backup_format.c_str());
101 if (status != 19) {
102 PmStrcpy(jcr->errmsg, dir->msg);
103 dir->fsend(BAD_job, status, jcr->errmsg);
104 Dmsg1(100, ">dird: %s", dir->msg);
105 jcr->setJobStatus(JS_ErrorTerminated);
106 return false;
107 }
108
109 jcr->rerunning = (rerunning) ? true : false;
110 jcr->setJobProtocol(protocol);
111
112 Dmsg4(100, "rerunning=%d VolSesId=%d VolSesTime=%d Protocol=%d\n",
113 jcr->rerunning, jcr->VolSessionId, jcr->VolSessionTime,
114 jcr->getJobProtocol());
115 /*
116 * Since this job could be rescheduled, we
117 * check to see if we have it already. If so
118 * free the old jcr and use the new one.
119 */
120 ojcr = get_jcr_by_full_name(job.c_str());
121 if (ojcr && !ojcr->authenticated) {
122 Dmsg2(100, "Found ojcr=0x%x Job %s\n", (unsigned)(intptr_t)ojcr,
123 job.c_str());
124 FreeJcr(ojcr);
125 }
126 jcr->JobId = JobId;
127 Dmsg2(800, "Start JobId=%d %p\n", JobId, jcr);
128 /*
129 * If job rescheduled because previous was incomplete,
130 * the Resched flag is set and VolSessionId and VolSessionTime
131 * are given to us (same as restarted job).
132 */
133 if (!jcr->rerunning) {
134 jcr->VolSessionId = NewVolSessionId();
135 jcr->VolSessionTime = vol_session_time;
136 }
137 bstrncpy(jcr->Job, job, sizeof(jcr->Job));
138 UnbashSpaces(job_name);
139 jcr->impl->job_name = GetPoolMemory(PM_NAME);
140 PmStrcpy(jcr->impl->job_name, job_name);
141 UnbashSpaces(client_name);
142 jcr->client_name = GetPoolMemory(PM_NAME);
143 PmStrcpy(jcr->client_name, client_name);
144 UnbashSpaces(fileset_name);
145 jcr->impl->fileset_name = GetPoolMemory(PM_NAME);
146 PmStrcpy(jcr->impl->fileset_name, fileset_name);
147 jcr->setJobType(JobType);
148 jcr->setJobLevel(level);
149 jcr->impl->no_attributes = no_attributes;
150 jcr->impl->spool_attributes = spool_attributes;
151 jcr->impl->spool_data = spool_data;
152 jcr->impl->spool_size = str_to_int64(spool_size);
153 jcr->impl->fileset_md5 = GetPoolMemory(PM_NAME);
154 PmStrcpy(jcr->impl->fileset_md5, fileset_md5);
155 jcr->impl->PreferMountedVols = PreferMountedVols;
156 jcr->impl->RemainingQuota = quota;
157 UnbashSpaces(backup_format);
158 jcr->impl->backup_format = GetPoolMemory(PM_NAME);
159 PmStrcpy(jcr->impl->backup_format, backup_format);
160 jcr->authenticated = false;
161
162 Dmsg1(50, "Quota set as %llu\n", quota);
163
164 /*
165 * Pass back an authorization key for the File daemon
166 */
167 Bsnprintf(seed, sizeof(seed), "%p%d", jcr, JobId);
168 MakeSessionKey(auth_key, seed, 1);
169 jcr->sd_auth_key = strdup(auth_key);
170 dir->fsend(OK_job, jcr->VolSessionId, jcr->VolSessionTime, auth_key);
171 memset(auth_key, 0, sizeof(auth_key));
172 Dmsg2(50, ">dird jid=%u: %s", (uint32_t)jcr->JobId, dir->msg);
173
174 DispatchNewPluginOptions(jcr);
175 GeneratePluginEvent(jcr, bsdEventJobStart, (void*)"JobStart");
176
177 return true;
178 }
179
DoJobRun(JobControlRecord * jcr)180 bool DoJobRun(JobControlRecord* jcr)
181 {
182 struct timeval tv;
183 struct timezone tz;
184 struct timespec timeout;
185 int errstat = 0;
186
187 jcr->sendJobStatus(JS_WaitFD); /* wait for FD to connect */
188
189 gettimeofday(&tv, &tz);
190 timeout.tv_nsec = tv.tv_usec * 1000;
191 timeout.tv_sec = tv.tv_sec + me->client_wait;
192
193 Dmsg3(50, "%s waiting %d sec for FD to contact SD key=%s\n", jcr->Job,
194 (int)(timeout.tv_sec - time(NULL)), jcr->sd_auth_key);
195 Dmsg2(800, "Wait FD for jid=%d %p\n", jcr->JobId, jcr);
196
197 /*
198 * Wait for the File daemon to contact us to start the Job,
199 * when he does, we will be released, unless the 30 minutes
200 * expires.
201 */
202 P(mutex);
203 while (!jcr->authenticated && !JobCanceled(jcr)) {
204 errstat =
205 pthread_cond_timedwait(&jcr->impl->job_start_wait, &mutex, &timeout);
206 if (errstat == ETIMEDOUT || errstat == EINVAL || errstat == EPERM) {
207 break;
208 }
209 Dmsg1(800, "=== Auth cond errstat=%d\n", errstat);
210 }
211 Dmsg3(50, "Auth=%d canceled=%d errstat=%d\n", jcr->authenticated,
212 JobCanceled(jcr), errstat);
213 V(mutex);
214 Dmsg2(800, "Auth fail or cancel for jid=%d %p\n", jcr->JobId, jcr);
215
216 memset(jcr->sd_auth_key, 0, strlen(jcr->sd_auth_key));
217 switch (jcr->getJobProtocol()) {
218 case PT_NDMP_BAREOS:
219 if (jcr->authenticated && !JobCanceled(jcr)) {
220 Dmsg2(800, "Running jid=%d %p\n", jcr->JobId, jcr);
221
222 /*
223 * Wait for the Job to finish. As we want exclusive access to
224 * things like the connection to the director we suspend this
225 * thread and let the actual NDMP connection wake us after it
226 * has performed the backup. E.g. instead of doing a busy wait
227 * we just hang on a conditional variable.
228 */
229 Dmsg2(800, "Wait for end job jid=%d %p\n", jcr->JobId, jcr);
230 P(mutex);
231 pthread_cond_wait(&jcr->impl->job_end_wait, &mutex);
232 V(mutex);
233 }
234 Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
235
236 /*
237 * For a NDMP backup we expect the protocol to send us either a nextrun
238 * cmd or a finish cmd to let us know they are finished.
239 */
240 return true;
241 default:
242 /*
243 * Handle the file daemon session.
244 */
245 if (jcr->authenticated && !JobCanceled(jcr)) {
246 Dmsg2(800, "Running jid=%d %p\n", jcr->JobId, jcr);
247 RunJob(jcr); /* Run the job */
248 }
249 Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
250
251 /*
252 * After a run cmd of a native backup we are done e.g.
253 * return false.
254 */
255 return false;
256 }
257 }
258
nextRunCmd(JobControlRecord * jcr)259 bool nextRunCmd(JobControlRecord* jcr)
260 {
261 char auth_key[MAX_NAME_LENGTH];
262 char seed[MAX_NAME_LENGTH];
263 BareosSocket* dir = jcr->dir_bsock;
264 struct timeval tv;
265 struct timezone tz;
266 struct timespec timeout;
267 int errstat = 0;
268
269 switch (jcr->getJobProtocol()) {
270 case PT_NDMP_BAREOS:
271 /*
272 * We expect a next NDMP backup stream so clear the authenticated flag
273 * and start waiting for the Next backup to Start.
274 */
275 jcr->authenticated = false;
276
277 /*
278 * Pass back a new authorization key for the File daemon
279 */
280 Bsnprintf(seed, sizeof(seed), "%p%d", jcr, jcr->JobId);
281 MakeSessionKey(auth_key, seed, 1);
282 if (jcr->sd_auth_key) { free(jcr->sd_auth_key); }
283 jcr->sd_auth_key = strdup(auth_key);
284 dir->fsend(OK_nextrun, auth_key);
285 memset(auth_key, 0, sizeof(auth_key));
286 Dmsg2(50, ">dird jid=%u: %s", (uint32_t)jcr->JobId, dir->msg);
287
288 jcr->sendJobStatus(JS_WaitFD); /* wait for FD to connect */
289
290 gettimeofday(&tv, &tz);
291 timeout.tv_nsec = tv.tv_usec * 1000;
292 timeout.tv_sec = tv.tv_sec + me->client_wait;
293
294 Dmsg3(50, "%s waiting %d sec for FD to contact SD key=%s\n", jcr->Job,
295 (int)(timeout.tv_sec - time(NULL)), jcr->sd_auth_key);
296 Dmsg2(800, "Wait FD for jid=%d %p\n", jcr->JobId, jcr);
297
298 P(mutex);
299 while (!jcr->authenticated && !JobCanceled(jcr)) {
300 errstat = pthread_cond_timedwait(&jcr->impl->job_start_wait, &mutex,
301 &timeout);
302 if (errstat == ETIMEDOUT || errstat == EINVAL || errstat == EPERM) {
303 break;
304 }
305 Dmsg1(800, "=== Auth cond errstat=%d\n", errstat);
306 }
307 Dmsg3(50, "Auth=%d canceled=%d errstat=%d\n", jcr->authenticated,
308 JobCanceled(jcr), errstat);
309 V(mutex);
310 Dmsg2(800, "Auth fail or cancel for jid=%d %p\n", jcr->JobId, jcr);
311
312 if (jcr->authenticated && !JobCanceled(jcr)) {
313 Dmsg2(800, "Running jid=%d %p\n", jcr->JobId, jcr);
314
315 /*
316 * Wait for the Job to finish. As we want exclusive access to
317 * things like the connection to the director we suspend this
318 * thread and let the actual NDMP connection wake us after it
319 * has performed the backup. E.g. instead of doing a busy wait
320 * we just hang on a conditional variable.
321 */
322 Dmsg2(800, "Wait for end job jid=%d %p\n", jcr->JobId, jcr);
323 P(mutex);
324 pthread_cond_wait(&jcr->impl->job_end_wait, &mutex);
325 V(mutex);
326 }
327 Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
328
329 /*
330 * For a NDMP backup we expect the protocol to send us either a nextrun
331 * cmd or a finish cmd to let us know they are finished.
332 */
333 return true;
334 default:
335 Dmsg1(200, "NextRunCmd: %s", jcr->dir_bsock->msg);
336 Jmsg2(jcr, M_FATAL, 0,
337 _("Hey!!!! JobId %u Job %s tries to use nextrun cmd while not part "
338 "of protocol.\n"),
339 (uint32_t)jcr->JobId, jcr->Job);
340 return false;
341 }
342 }
343
FinishCmd(JobControlRecord * jcr)344 bool FinishCmd(JobControlRecord* jcr)
345 {
346 BareosSocket* dir = jcr->dir_bsock;
347 char ec1[30];
348
349 /*
350 * See if the Job has a certain protocol. Some protocols allow the
351 * finish cmd some do not (Native backup for example does NOT)
352 */
353 switch (jcr->getJobProtocol()) {
354 case PT_NDMP_BAREOS:
355 Dmsg1(200, "Finish_cmd: %s", jcr->dir_bsock->msg);
356
357 jcr->end_time = time(NULL);
358 DequeueMessages(jcr); /* send any queued messages */
359 jcr->setJobStatus(JS_Terminated);
360
361 switch (jcr->getJobType()) {
362 case JT_BACKUP:
363 EndOfNdmpBackup(jcr);
364 break;
365 case JT_RESTORE:
366 EndOfNdmpRestore(jcr);
367 break;
368 default:
369 break;
370 }
371
372 GeneratePluginEvent(jcr, bsdEventJobEnd);
373
374 dir->fsend(Job_end, jcr->Job, jcr->JobStatus, jcr->JobFiles,
375 edit_uint64(jcr->JobBytes, ec1), jcr->JobErrors);
376 dir->signal(BNET_EOD); /* send EOD to Director daemon */
377
378 FreePlugins(jcr); /* release instantiated plugins */
379
380 Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
381
382 return false; /* Continue DIR session ? */
383 default:
384 Dmsg1(200, "Finish_cmd: %s", jcr->dir_bsock->msg);
385 Jmsg2(jcr, M_FATAL, 0,
386 _("Hey!!!! JobId %u Job %s tries to use finish cmd while not part "
387 "of protocol.\n"),
388 (uint32_t)jcr->JobId, jcr->Job);
389 return false; /* Continue DIR session ? */
390 }
391 }
392
393 /**
394 * Destroy the Job Control Record and associated
395 * resources (sockets).
396 */
StoredFreeJcr(JobControlRecord * jcr)397 void StoredFreeJcr(JobControlRecord* jcr)
398 {
399 Dmsg0(200, "Start stored FreeJcr\n");
400 Dmsg2(800, "End Job JobId=%u %p\n", jcr->JobId, jcr);
401
402 if (jcr->dir_bsock) {
403 Dmsg2(800, "Send Terminate jid=%d %p\n", jcr->JobId, jcr);
404 jcr->dir_bsock->signal(BNET_EOD);
405 jcr->dir_bsock->signal(BNET_TERMINATE);
406 }
407
408 if (jcr->store_bsock) {
409 jcr->store_bsock->close();
410 delete jcr->store_bsock;
411 jcr->store_bsock = NULL;
412 }
413
414 if (jcr->file_bsock) {
415 jcr->file_bsock->close();
416 delete jcr->file_bsock;
417 jcr->file_bsock = NULL;
418 }
419
420 if (jcr->impl->job_name) { FreePoolMemory(jcr->impl->job_name); }
421
422 if (jcr->client_name) {
423 FreeMemory(jcr->client_name);
424 jcr->client_name = NULL;
425 }
426
427 if (jcr->impl->fileset_name) { FreeMemory(jcr->impl->fileset_name); }
428
429 if (jcr->impl->fileset_md5) { FreeMemory(jcr->impl->fileset_md5); }
430
431 if (jcr->impl->backup_format) { FreeMemory(jcr->impl->backup_format); }
432
433 if (jcr->impl->read_session.bsr) {
434 libbareos::FreeBsr(jcr->impl->read_session.bsr);
435 jcr->impl->read_session.bsr = NULL;
436 }
437
438 if (jcr->impl->read_session.rctx) {
439 FreeReadContext(jcr->impl->read_session.rctx);
440 jcr->impl->read_session.rctx = NULL;
441 }
442
443 if (jcr->compress.deflate_buffer || jcr->compress.inflate_buffer) {
444 CleanupCompression(jcr);
445 }
446
447 /*
448 * Free any restore volume list created
449 */
450 FreeRestoreVolumeList(jcr);
451 if (jcr->RestoreBootstrap) {
452 SecureErase(jcr, jcr->RestoreBootstrap);
453 FreePoolMemory(jcr->RestoreBootstrap);
454 jcr->RestoreBootstrap = NULL;
455 }
456
457 if (jcr->impl->next_dev || jcr->impl->prev_dev) {
458 Emsg0(M_FATAL, 0, _("In FreeJcr(), but still attached to device!!!!\n"));
459 }
460
461 pthread_cond_destroy(&jcr->impl->job_start_wait);
462 pthread_cond_destroy(&jcr->impl->job_end_wait);
463
464 /*
465 * Avoid a double free
466 */
467 if (jcr->impl->dcr == jcr->impl->read_dcr) { jcr->impl->read_dcr = NULL; }
468
469 if (jcr->impl->dcr) {
470 FreeDeviceControlRecord(jcr->impl->dcr);
471 jcr->impl->dcr = NULL;
472 }
473
474 if (jcr->impl->read_dcr) {
475 FreeDeviceControlRecord(jcr->impl->read_dcr);
476 jcr->impl->read_dcr = NULL;
477 }
478
479 if (jcr->impl->plugin_options) { delete jcr->impl->plugin_options; }
480
481 if (jcr->impl->read_store) {
482 DirectorStorage* store = nullptr;
483 foreach_alist (store, jcr->impl->read_store) {
484 delete store->device;
485 delete store;
486 }
487 delete jcr->impl->read_store;
488 jcr->impl->read_store = NULL;
489 }
490
491 if (jcr->impl->write_store) {
492 DirectorStorage* store = nullptr;
493 foreach_alist (store, jcr->impl->write_store) {
494 delete store->device;
495 delete store;
496 }
497 delete jcr->impl->write_store;
498 jcr->impl->write_store = NULL;
499 }
500
501 FreePlugins(jcr); /* release instantiated plugins */
502
503
504 if (jcr->JobId != 0) {
505 WriteStateFile(me->working_directory, "bareos-sd",
506 GetFirstPortHostOrder(me->SDaddrs));
507 }
508
509 if (jcr->impl) {
510 delete jcr->impl;
511 jcr->impl = nullptr;
512 }
513
514 Dmsg0(200, "End stored FreeJcr\n");
515
516 return;
517 }
518
NewStoredJcr()519 JobControlRecord* NewStoredJcr()
520 {
521 JobControlRecord* jcr = new_jcr(StoredFreeJcr);
522 jcr->impl = new JobControlRecordPrivate;
523 return jcr;
524 }
525
526 } /* namespace storagedaemon */
527