1 /*
2 BAREOS® - Backup Archiving REcovery Open Sourced
3
4 Copyright (C) 2000-2011 Free Software Foundation Europe e.V.
5 Copyright (C) 2011-2012 Planets Communications B.V.
6 Copyright (C) 2013-2016 Bareos GmbH & Co. KG
7
8 This program is Free Software; you can redistribute it and/or
9 modify it under the terms of version three of the GNU Affero General Public
10 License as published by the Free Software Foundation and included
11 in the file LICENSE.
12
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Affero General Public License for more details.
17
18 You should have received a copy of the GNU Affero General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA.
22 */
23 /*
24 * Kern Sibbald, MM
25 */
26 /**
27 * @file
28 * Job control and execution for Storage Daemon
29 */
30
31 #include "include/bareos.h"
32 #include "stored/stored.h"
33 #include "stored/bsr.h"
34 #include "stored/acquire.h"
35 #include "stored/fd_cmds.h"
36 #include "stored/ndmp_tape.h"
37 #include "stored/read_record.h"
38 #include "stored/stored_globals.h"
39 #include "lib/edit.h"
40 #include "lib/parse_bsr.h"
41 #include "include/jcr.h"
42
43 namespace storagedaemon {
44
/* Mutex paired with the per-job condition variables waited on in this file */
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Request from the Director daemon.
 * job_cmd() parses it with sscanf() and expects all 19 fields to convert.
 */
static char jobcmd[] =
   "JobId=%d job=%127s job_name=%127s "
   "client_name=%127s type=%d level=%d "
   "FileSet=%127s NoAttr=%d SpoolAttr=%d "
   "FileSetMD5=%127s SpoolData=%d PreferMountedVols=%d "
   "SpoolSize=%127s rerunning=%d "
   "VolSessionId=%d VolSessionTime=%d "
   "Quota=%llu Protocol=%d BackupFormat=%127s\n";

/*
 * Responses sent to the Director daemon
 */

/* Job accepted: session id, session time and the authorization key */
static char OK_job[] = "3000 OK Job SDid=%u SDtime=%u Authorization=%s\n";

/* Next run accepted: only a fresh authorization key is returned */
static char OK_nextrun[] = "3000 OK Job Authorization=%s\n";

/* Job command could not be parsed: sscanf() result and the offending command */
static char BAD_job[] = "3915 Bad Job command. stat=%d CMD: %s\n";

/* End-of-job summary */
static char Job_end[] =
   "3099 Job %s end JobStatus=%d JobFiles=%d "
   "JobBytes=%s JobErrors=%u\n";
64
65 /**
66 * Director requests us to start a job
67 * Basic tasks done here:
68 * - We pickup the JobId to be run from the Director.
69 * - We pickup the device, media, and pool from the Director
70 * - Wait for a connection from the File Daemon (FD)
71 * - Accept commands from the FD (i.e. run the job)
72 * - Return when the connection is terminated or
73 * there is an error.
74 */
job_cmd(JobControlRecord * jcr)75 bool job_cmd(JobControlRecord *jcr)
76 {
77 int32_t JobId;
78 char auth_key[MAX_NAME_LENGTH];
79 char seed[MAX_NAME_LENGTH];
80 char spool_size[MAX_NAME_LENGTH];
81 BareosSocket *dir = jcr->dir_bsock;
82 PoolMem job_name, client_name, job, fileset_name, fileset_md5, backup_format;
83 int32_t JobType, level, spool_attributes, no_attributes, spool_data;
84 int32_t PreferMountedVols, rerunning, protocol;
85 int status;
86 uint64_t quota = 0;
87 JobControlRecord *ojcr;
88
89 /*
90 * Get JobId and permissions from Director
91 */
92 Dmsg1(100, "<dird: %s", dir->msg);
93 bstrncpy(spool_size, "0", sizeof(spool_size));
94 status = sscanf(dir->msg, jobcmd, &JobId, job.c_str(), job_name.c_str(),
95 client_name.c_str(), &JobType, &level, fileset_name.c_str(),
96 &no_attributes, &spool_attributes, fileset_md5.c_str(),
97 &spool_data, &PreferMountedVols, spool_size, &rerunning,
98 &jcr->VolSessionId, &jcr->VolSessionTime, "a, &protocol,
99 backup_format.c_str());
100 if (status != 19) {
101 PmStrcpy(jcr->errmsg, dir->msg);
102 dir->fsend(BAD_job, status, jcr->errmsg);
103 Dmsg1(100, ">dird: %s", dir->msg);
104 jcr->setJobStatus(JS_ErrorTerminated);
105 return false;
106 }
107
108 jcr->rerunning = (rerunning) ? true : false;
109 jcr->setJobProtocol(protocol);
110
111 Dmsg4(100, "rerunning=%d VolSesId=%d VolSesTime=%d Protocol=%d\n",
112 jcr->rerunning, jcr->VolSessionId, jcr->VolSessionTime, jcr->getJobProtocol());
113 /*
114 * Since this job could be rescheduled, we
115 * check to see if we have it already. If so
116 * free the old jcr and use the new one.
117 */
118 ojcr = get_jcr_by_full_name(job.c_str());
119 if (ojcr && !ojcr->authenticated) {
120 Dmsg2(100, "Found ojcr=0x%x Job %s\n", (unsigned)(intptr_t)ojcr, job.c_str());
121 FreeJcr(ojcr);
122 }
123 jcr->JobId = JobId;
124 Dmsg2(800, "Start JobId=%d %p\n", JobId, jcr);
125 /*
126 * If job rescheduled because previous was incomplete,
127 * the Resched flag is set and VolSessionId and VolSessionTime
128 * are given to us (same as restarted job).
129 */
130 if (!jcr->rerunning) {
131 jcr->VolSessionId = NewVolSessionId();
132 jcr->VolSessionTime = vol_session_time;
133 }
134 bstrncpy(jcr->Job, job, sizeof(jcr->Job));
135 UnbashSpaces(job_name);
136 jcr->job_name = GetPoolMemory(PM_NAME);
137 PmStrcpy(jcr->job_name, job_name);
138 UnbashSpaces(client_name);
139 jcr->client_name = GetPoolMemory(PM_NAME);
140 PmStrcpy(jcr->client_name, client_name);
141 UnbashSpaces(fileset_name);
142 jcr->fileset_name = GetPoolMemory(PM_NAME);
143 PmStrcpy(jcr->fileset_name, fileset_name);
144 jcr->setJobType(JobType);
145 jcr->setJobLevel(level);
146 jcr->no_attributes = no_attributes;
147 jcr->spool_attributes = spool_attributes;
148 jcr->spool_data = spool_data;
149 jcr->spool_size = str_to_int64(spool_size);
150 jcr->fileset_md5 = GetPoolMemory(PM_NAME);
151 PmStrcpy(jcr->fileset_md5, fileset_md5);
152 jcr->PreferMountedVols = PreferMountedVols;
153 jcr->RemainingQuota = quota;
154 UnbashSpaces(backup_format);
155 jcr->backup_format = GetPoolMemory(PM_NAME);
156 PmStrcpy(jcr->backup_format, backup_format);
157 jcr->authenticated = false;
158
159 Dmsg1(50, "Quota set as %llu\n", quota);
160
161 /*
162 * Pass back an authorization key for the File daemon
163 */
164 Bsnprintf(seed, sizeof(seed), "%p%d", jcr, JobId);
165 MakeSessionKey(auth_key, seed, 1);
166 jcr->sd_auth_key = bstrdup(auth_key);
167 dir->fsend(OK_job, jcr->VolSessionId, jcr->VolSessionTime, auth_key);
168 memset(auth_key, 0, sizeof(auth_key));
169 Dmsg2(50, ">dird jid=%u: %s", (uint32_t)jcr->JobId, dir->msg);
170
171 DispatchNewPluginOptions(jcr);
172 GeneratePluginEvent(jcr, bsdEventJobStart, (void *)"JobStart");
173
174 return true;
175 }
176
DoJobRun(JobControlRecord * jcr)177 bool DoJobRun(JobControlRecord *jcr)
178 {
179 struct timeval tv;
180 struct timezone tz;
181 struct timespec timeout;
182 int errstat = 0;
183
184 jcr->sendJobStatus(JS_WaitFD); /* wait for FD to connect */
185
186 gettimeofday(&tv, &tz);
187 timeout.tv_nsec = tv.tv_usec * 1000;
188 timeout.tv_sec = tv.tv_sec + me->client_wait;
189
190 Dmsg3(50, "%s waiting %d sec for FD to contact SD key=%s\n",
191 jcr->Job, (int)(timeout.tv_sec-time(NULL)), jcr->sd_auth_key);
192 Dmsg2(800, "Wait FD for jid=%d %p\n", jcr->JobId, jcr);
193
194 /*
195 * Wait for the File daemon to contact us to start the Job,
196 * when he does, we will be released, unless the 30 minutes
197 * expires.
198 */
199 P(mutex);
200 while (!jcr->authenticated && !JobCanceled(jcr)) {
201 errstat = pthread_cond_timedwait(&jcr->job_start_wait, &mutex, &timeout);
202 if (errstat == ETIMEDOUT || errstat == EINVAL || errstat == EPERM) {
203 break;
204 }
205 Dmsg1(800, "=== Auth cond errstat=%d\n", errstat);
206 }
207 Dmsg3(50, "Auth=%d canceled=%d errstat=%d\n", jcr->authenticated,
208 JobCanceled(jcr), errstat);
209 V(mutex);
210 Dmsg2(800, "Auth fail or cancel for jid=%d %p\n", jcr->JobId, jcr);
211
212 memset(jcr->sd_auth_key, 0, strlen(jcr->sd_auth_key));
213 switch (jcr->getJobProtocol()) {
214 case PT_NDMP_BAREOS:
215 if (jcr->authenticated && !JobCanceled(jcr)) {
216 Dmsg2(800, "Running jid=%d %p\n", jcr->JobId, jcr);
217
218 /*
219 * Wait for the Job to finish. As we want exclusive access to
220 * things like the connection to the director we suspend this
221 * thread and let the actual NDMP connection wake us after it
222 * has performed the backup. E.g. instead of doing a busy wait
223 * we just hang on a conditional variable.
224 */
225 Dmsg2(800, "Wait for end job jid=%d %p\n", jcr->JobId, jcr);
226 P(mutex);
227 pthread_cond_wait(&jcr->job_end_wait, &mutex);
228 V(mutex);
229 }
230 Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
231
232 /*
233 * For a NDMP backup we expect the protocol to send us either a nextrun cmd
234 * or a finish cmd to let us know they are finished.
235 */
236 return true;
237 default:
238 /*
239 * Handle the file daemon session.
240 */
241 if (jcr->authenticated && !JobCanceled(jcr)) {
242 Dmsg2(800, "Running jid=%d %p\n", jcr->JobId, jcr);
243 RunJob(jcr); /* Run the job */
244 }
245 Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
246
247 /*
248 * After a run cmd of a native backup we are done e.g.
249 * return false.
250 */
251 return false;
252 }
253 }
254
nextRunCmd(JobControlRecord * jcr)255 bool nextRunCmd(JobControlRecord *jcr)
256 {
257 char auth_key[MAX_NAME_LENGTH];
258 char seed[MAX_NAME_LENGTH];
259 BareosSocket *dir = jcr->dir_bsock;
260 struct timeval tv;
261 struct timezone tz;
262 struct timespec timeout;
263 int errstat = 0;
264
265 switch (jcr->getJobProtocol()) {
266 case PT_NDMP_BAREOS:
267 /*
268 * We expect a next NDMP backup stream so clear the authenticated flag
269 * and start waiting for the Next backup to Start.
270 */
271 jcr->authenticated = false;
272
273 /*
274 * Pass back a new authorization key for the File daemon
275 */
276 Bsnprintf(seed, sizeof(seed), "%p%d", jcr, jcr->JobId);
277 MakeSessionKey(auth_key, seed, 1);
278 if (jcr->sd_auth_key) {
279 free(jcr->sd_auth_key);
280 }
281 jcr->sd_auth_key = bstrdup(auth_key);
282 dir->fsend(OK_nextrun, auth_key);
283 memset(auth_key, 0, sizeof(auth_key));
284 Dmsg2(50, ">dird jid=%u: %s", (uint32_t)jcr->JobId, dir->msg);
285
286 jcr->sendJobStatus(JS_WaitFD); /* wait for FD to connect */
287
288 gettimeofday(&tv, &tz);
289 timeout.tv_nsec = tv.tv_usec * 1000;
290 timeout.tv_sec = tv.tv_sec + me->client_wait;
291
292 Dmsg3(50, "%s waiting %d sec for FD to contact SD key=%s\n",
293 jcr->Job, (int)(timeout.tv_sec-time(NULL)), jcr->sd_auth_key);
294 Dmsg2(800, "Wait FD for jid=%d %p\n", jcr->JobId, jcr);
295
296 P(mutex);
297 while (!jcr->authenticated && !JobCanceled(jcr)) {
298 errstat = pthread_cond_timedwait(&jcr->job_start_wait, &mutex, &timeout);
299 if (errstat == ETIMEDOUT || errstat == EINVAL || errstat == EPERM) {
300 break;
301 }
302 Dmsg1(800, "=== Auth cond errstat=%d\n", errstat);
303 }
304 Dmsg3(50, "Auth=%d canceled=%d errstat=%d\n", jcr->authenticated,
305 JobCanceled(jcr), errstat);
306 V(mutex);
307 Dmsg2(800, "Auth fail or cancel for jid=%d %p\n", jcr->JobId, jcr);
308
309 if (jcr->authenticated && !JobCanceled(jcr)) {
310 Dmsg2(800, "Running jid=%d %p\n", jcr->JobId, jcr);
311
312 /*
313 * Wait for the Job to finish. As we want exclusive access to
314 * things like the connection to the director we suspend this
315 * thread and let the actual NDMP connection wake us after it
316 * has performed the backup. E.g. instead of doing a busy wait
317 * we just hang on a conditional variable.
318 */
319 Dmsg2(800, "Wait for end job jid=%d %p\n", jcr->JobId, jcr);
320 P(mutex);
321 pthread_cond_wait(&jcr->job_end_wait, &mutex);
322 V(mutex);
323 }
324 Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
325
326 /*
327 * For a NDMP backup we expect the protocol to send us either a nextrun cmd
328 * or a finish cmd to let us know they are finished.
329 */
330 return true;
331 default:
332 Dmsg1(200, "NextRunCmd: %s", jcr->dir_bsock->msg);
333 Jmsg2(jcr, M_FATAL, 0, _("Hey!!!! JobId %u Job %s tries to use nextrun cmd while not part of protocol.\n"),
334 (uint32_t)jcr->JobId, jcr->Job);
335 return false;
336 }
337 }
338
FinishCmd(JobControlRecord * jcr)339 bool FinishCmd(JobControlRecord *jcr)
340 {
341 BareosSocket *dir = jcr->dir_bsock;
342 char ec1[30];
343
344 /*
345 * See if the Job has a certain protocol. Some protocols allow the
346 * finish cmd some do not (Native backup for example does NOT)
347 */
348 switch (jcr->getJobProtocol()) {
349 case PT_NDMP_BAREOS:
350 Dmsg1(200, "Finish_cmd: %s", jcr->dir_bsock->msg);
351
352 jcr->end_time = time(NULL);
353 DequeueMessages(jcr); /* send any queued messages */
354 jcr->setJobStatus(JS_Terminated);
355
356 switch (jcr->getJobType()) {
357 case JT_BACKUP:
358 EndOfNdmpBackup(jcr);
359 break;
360 case JT_RESTORE:
361 EndOfNdmpRestore(jcr);
362 break;
363 default:
364 break;
365 }
366
367 GeneratePluginEvent(jcr, bsdEventJobEnd);
368
369 dir->fsend(Job_end, jcr->Job, jcr->JobStatus, jcr->JobFiles,
370 edit_uint64(jcr->JobBytes, ec1), jcr->JobErrors);
371 dir->signal(BNET_EOD); /* send EOD to Director daemon */
372
373 FreePlugins(jcr); /* release instantiated plugins */
374
375 Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
376
377 return false; /* Continue DIR session ? */
378 default:
379 Dmsg1(200, "Finish_cmd: %s", jcr->dir_bsock->msg);
380 Jmsg2(jcr, M_FATAL, 0, _("Hey!!!! JobId %u Job %s tries to use finish cmd while not part of protocol.\n"),
381 (uint32_t)jcr->JobId, jcr->Job);
382 return false; /* Continue DIR session ? */
383 }
384 }
385
386 #ifdef needed
387 /**
388 * Query Device command from Director
389 * Sends Storage Daemon's information on the device to the
390 * caller (presumably the Director).
391 * This command always returns "true" so that the line is
392 * not closed on an error.
393 *
394 */
QueryCmd(JobControlRecord * jcr)395 bool QueryCmd(JobControlRecord *jcr)
396 {
397 PoolMem dev_name, VolumeName, MediaType, ChangerName;
398 BareosSocket *dir = jcr->dir_bsock;
399 DeviceResource *device;
400 AUTOCHANGER *changer;
401 bool ok;
402
403 Dmsg1(100, "Query_cmd: %s", dir->msg);
404 ok = sscanf(dir->msg, query_device, dev_name.c_str()) == 1;
405 Dmsg1(100, "<dird: %s", dir->msg);
406 if (ok) {
407 UnbashSpaces(dev_name);
408 foreach_res(device, R_DEVICE) {
409 /* Find resource, and make sure we were able to open it */
410 if (bstrcmp(dev_name.c_str(), device->name())) {
411 if (!device->dev) {
412 device->dev = InitDev(jcr, device);
413 }
414 if (!device->dev) {
415 break;
416 }
417 ok = dir_update_device(jcr, device->dev);
418 if (ok) {
419 ok = dir->fsend(OK_query);
420 } else {
421 dir->fsend(NO_query);
422 }
423 return ok;
424 }
425 }
426 foreach_res(changer, R_AUTOCHANGER) {
427 /*Find resource, and make sure we were able to open it */
428 if (bstrcmp(dev_name.c_str(), changer->name())) {
429 if (!changer->device || changer->device->size() == 0) {
430 continue; /* no devices */
431 }
432 ok = dir_update_changer(jcr, changer);
433 if (ok) {
434 ok = dir->fsend(OK_query);
435 } else {
436 dir->fsend(NO_query);
437 }
438 return ok;
439 }
440 }
441 /* If we get here, the device/autochanger was not found */
442 UnbashSpaces(dir->msg);
443 PmStrcpy(jcr->errmsg, dir->msg);
444 dir->fsend(NO_device, dev_name.c_str());
445 Dmsg1(100, ">dird: %s", dir->msg);
446 } else {
447 UnbashSpaces(dir->msg);
448 PmStrcpy(jcr->errmsg, dir->msg);
449 dir->fsend(BAD_query, jcr->errmsg);
450 Dmsg1(100, ">dird: %s", dir->msg);
451 }
452
453 return true;
454 }
455 #endif
456
457 /**
458 * Destroy the Job Control Record and associated
459 * resources (sockets).
460 */
StoredFreeJcr(JobControlRecord * jcr)461 void StoredFreeJcr(JobControlRecord *jcr)
462 {
463 Dmsg0(200, "Start stored FreeJcr\n");
464 Dmsg2(800, "End Job JobId=%u %p\n", jcr->JobId, jcr);
465
466 if (jcr->dir_bsock) {
467 Dmsg2(800, "Send Terminate jid=%d %p\n", jcr->JobId, jcr);
468 jcr->dir_bsock->signal(BNET_EOD);
469 jcr->dir_bsock->signal(BNET_TERMINATE);
470 }
471
472 if (jcr->store_bsock) {
473 jcr->store_bsock->close();
474 delete jcr->store_bsock;
475 jcr->store_bsock = NULL;
476 }
477
478 if (jcr->file_bsock) {
479 jcr->file_bsock->close();
480 delete jcr->file_bsock;
481 jcr->file_bsock = NULL;
482 }
483
484 if (jcr->job_name) {
485 FreePoolMemory(jcr->job_name);
486 }
487
488 if (jcr->client_name) {
489 FreeMemory(jcr->client_name);
490 jcr->client_name = NULL;
491 }
492
493 if (jcr->fileset_name) {
494 FreeMemory(jcr->fileset_name);
495 }
496
497 if (jcr->fileset_md5) {
498 FreeMemory(jcr->fileset_md5);
499 }
500
501 if (jcr->backup_format) {
502 FreeMemory(jcr->backup_format);
503 }
504
505 if (jcr->bsr) {
506 libbareos::FreeBsr(jcr->bsr);
507 jcr->bsr = NULL;
508 }
509
510 if (jcr->rctx) {
511 FreeReadContext(jcr->rctx);
512 jcr->rctx = NULL;
513 }
514
515 if (jcr->compress.deflate_buffer || jcr->compress.inflate_buffer) {
516 CleanupCompression(jcr);
517 }
518
519 /*
520 * Free any restore volume list created
521 */
522 FreeRestoreVolumeList(jcr);
523 if (jcr->RestoreBootstrap) {
524 SecureErase(jcr, jcr->RestoreBootstrap);
525 FreePoolMemory(jcr->RestoreBootstrap);
526 jcr->RestoreBootstrap = NULL;
527 }
528
529 if (jcr->next_dev || jcr->prev_dev) {
530 Emsg0(M_FATAL, 0, _("In FreeJcr(), but still attached to device!!!!\n"));
531 }
532
533 pthread_cond_destroy(&jcr->job_start_wait);
534 pthread_cond_destroy(&jcr->job_end_wait);
535
536 if (jcr->dcrs) {
537 delete jcr->dcrs;
538 jcr->dcrs = NULL;
539 }
540
541 /*
542 * Avoid a double free
543 */
544 if (jcr->dcr == jcr->read_dcr) {
545 jcr->read_dcr = NULL;
546 }
547
548 if (jcr->dcr) {
549 FreeDeviceControlRecord(jcr->dcr);
550 jcr->dcr = NULL;
551 }
552
553 if (jcr->read_dcr) {
554 FreeDeviceControlRecord(jcr->read_dcr);
555 jcr->read_dcr = NULL;
556 }
557
558 if (jcr->plugin_options) {
559 delete jcr->plugin_options;
560 }
561
562 if (jcr->read_store) {
563 DirectorStorage *store = nullptr;
564 foreach_alist(store, jcr->read_store) {
565 delete store->device;
566 delete store;
567 }
568 delete jcr->read_store;
569 jcr->read_store = NULL;
570 }
571
572 if (jcr->write_store) {
573 DirectorStorage *store = nullptr;
574 foreach_alist(store, jcr->write_store) {
575 delete store->device;
576 delete store;
577 }
578 delete jcr->write_store;
579 jcr->write_store = NULL;
580 }
581
582 FreePlugins(jcr); /* release instantiated plugins */
583
584 Dsm_check(200);
585
586 if (jcr->JobId != 0) {
587 WriteStateFile(me->working_directory, "bareos-sd", GetFirstPortHostOrder(me->SDaddrs));
588 }
589
590 Dmsg0(200, "End stored FreeJcr\n");
591
592 return;
593 }
594
595 } /* namespace storagedaemon */
596