1 /*
2    BAREOS® - Backup Archiving REcovery Open Sourced
3 
4    Copyright (C) 2000-2011 Free Software Foundation Europe e.V.
5    Copyright (C) 2011-2012 Planets Communications B.V.
6    Copyright (C) 2013-2016 Bareos GmbH & Co. KG
7 
8    This program is Free Software; you can redistribute it and/or
9    modify it under the terms of version three of the GNU Affero General Public
10    License as published by the Free Software Foundation and included
11    in the file LICENSE.
12 
13    This program is distributed in the hope that it will be useful, but
14    WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16    Affero General Public License for more details.
17 
18    You should have received a copy of the GNU Affero General Public License
19    along with this program; if not, write to the Free Software
20    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21    02110-1301, USA.
22 */
23 /*
24  * Kern Sibbald, MM
25  */
26 /**
27  * @file
28  * Job control and execution for Storage Daemon
29  */
30 
31 #include "include/bareos.h"
32 #include "stored/stored.h"
33 #include "stored/bsr.h"
34 #include "stored/acquire.h"
35 #include "stored/fd_cmds.h"
36 #include "stored/ndmp_tape.h"
37 #include "stored/read_record.h"
38 #include "stored/stored_globals.h"
39 #include "lib/edit.h"
40 #include "lib/parse_bsr.h"
41 #include "include/jcr.h"
42 
43 namespace storagedaemon {
44 
/*
 * File-wide mutex. It is the mutex handed to the pthread condition waits
 * on jcr->job_start_wait and jcr->job_end_wait in this file.
 */
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Request received from the Director daemon.
 * This is the sscanf() template for the job command; it carries 19
 * conversions, each string field being limited to 127 characters.
 */
static char jobcmd[] =
   "JobId=%d job=%127s job_name=%127s "
   "client_name=%127s type=%d level=%d "
   "FileSet=%127s NoAttr=%d SpoolAttr=%d "
   "FileSetMD5=%127s SpoolData=%d PreferMountedVols=%d "
   "SpoolSize=%127s rerunning=%d VolSessionId=%d "
   "VolSessionTime=%d Quota=%llu Protocol=%d "
   "BackupFormat=%127s\n";

/*
 * Responses sent to the Director daemon.
 */

/* Job accepted: session id, session time and the authorization key. */
static char OK_job[] = "3000 OK Job SDid=%u SDtime=%u Authorization=%s\n";

/* Next run accepted: only a fresh authorization key is returned. */
static char OK_nextrun[] = "3000 OK Job Authorization=%s\n";

/* Job command could not be parsed: sscanf() result and the raw command. */
static char BAD_job[] = "3915 Bad Job command. stat=%d CMD: %s\n";

/* End-of-job summary. */
static char Job_end[] =
   "3099 Job %s end JobStatus=%d JobFiles=%d JobBytes=%s JobErrors=%u\n";
64 
65 /**
66  * Director requests us to start a job
67  * Basic tasks done here:
68  *  - We pickup the JobId to be run from the Director.
69  *  - We pickup the device, media, and pool from the Director
70  *  - Wait for a connection from the File Daemon (FD)
71  *  - Accept commands from the FD (i.e. run the job)
72  *  - Return when the connection is terminated or
73  *    there is an error.
74  */
job_cmd(JobControlRecord * jcr)75 bool job_cmd(JobControlRecord *jcr)
76 {
77    int32_t JobId;
78    char auth_key[MAX_NAME_LENGTH];
79    char seed[MAX_NAME_LENGTH];
80    char spool_size[MAX_NAME_LENGTH];
81    BareosSocket *dir = jcr->dir_bsock;
82    PoolMem job_name, client_name, job, fileset_name, fileset_md5, backup_format;
83    int32_t JobType, level, spool_attributes, no_attributes, spool_data;
84    int32_t PreferMountedVols, rerunning, protocol;
85    int status;
86    uint64_t quota = 0;
87    JobControlRecord *ojcr;
88 
89    /*
90     * Get JobId and permissions from Director
91     */
92    Dmsg1(100, "<dird: %s", dir->msg);
93    bstrncpy(spool_size, "0", sizeof(spool_size));
94    status = sscanf(dir->msg, jobcmd, &JobId, job.c_str(), job_name.c_str(),
95                    client_name.c_str(), &JobType, &level, fileset_name.c_str(),
96                    &no_attributes, &spool_attributes, fileset_md5.c_str(),
97                    &spool_data, &PreferMountedVols, spool_size, &rerunning,
98                    &jcr->VolSessionId, &jcr->VolSessionTime, &quota, &protocol,
99                    backup_format.c_str());
100    if (status != 19) {
101       PmStrcpy(jcr->errmsg, dir->msg);
102       dir->fsend(BAD_job, status, jcr->errmsg);
103       Dmsg1(100, ">dird: %s", dir->msg);
104       jcr->setJobStatus(JS_ErrorTerminated);
105       return false;
106    }
107 
108    jcr->rerunning = (rerunning) ? true : false;
109    jcr->setJobProtocol(protocol);
110 
111    Dmsg4(100, "rerunning=%d VolSesId=%d VolSesTime=%d Protocol=%d\n",
112          jcr->rerunning, jcr->VolSessionId, jcr->VolSessionTime, jcr->getJobProtocol());
113    /*
114     * Since this job could be rescheduled, we
115     *  check to see if we have it already. If so
116     *  free the old jcr and use the new one.
117     */
118    ojcr = get_jcr_by_full_name(job.c_str());
119    if (ojcr && !ojcr->authenticated) {
120       Dmsg2(100, "Found ojcr=0x%x Job %s\n", (unsigned)(intptr_t)ojcr, job.c_str());
121       FreeJcr(ojcr);
122    }
123    jcr->JobId = JobId;
124    Dmsg2(800, "Start JobId=%d %p\n", JobId, jcr);
125    /*
126     * If job rescheduled because previous was incomplete,
127     * the Resched flag is set and VolSessionId and VolSessionTime
128     * are given to us (same as restarted job).
129     */
130    if (!jcr->rerunning) {
131       jcr->VolSessionId = NewVolSessionId();
132       jcr->VolSessionTime = vol_session_time;
133    }
134    bstrncpy(jcr->Job, job, sizeof(jcr->Job));
135    UnbashSpaces(job_name);
136    jcr->job_name = GetPoolMemory(PM_NAME);
137    PmStrcpy(jcr->job_name, job_name);
138    UnbashSpaces(client_name);
139    jcr->client_name = GetPoolMemory(PM_NAME);
140    PmStrcpy(jcr->client_name, client_name);
141    UnbashSpaces(fileset_name);
142    jcr->fileset_name = GetPoolMemory(PM_NAME);
143    PmStrcpy(jcr->fileset_name, fileset_name);
144    jcr->setJobType(JobType);
145    jcr->setJobLevel(level);
146    jcr->no_attributes = no_attributes;
147    jcr->spool_attributes = spool_attributes;
148    jcr->spool_data = spool_data;
149    jcr->spool_size = str_to_int64(spool_size);
150    jcr->fileset_md5 = GetPoolMemory(PM_NAME);
151    PmStrcpy(jcr->fileset_md5, fileset_md5);
152    jcr->PreferMountedVols = PreferMountedVols;
153    jcr->RemainingQuota = quota;
154    UnbashSpaces(backup_format);
155    jcr->backup_format = GetPoolMemory(PM_NAME);
156    PmStrcpy(jcr->backup_format, backup_format);
157    jcr->authenticated = false;
158 
159    Dmsg1(50, "Quota set as %llu\n", quota);
160 
161    /*
162     * Pass back an authorization key for the File daemon
163     */
164    Bsnprintf(seed, sizeof(seed), "%p%d", jcr, JobId);
165    MakeSessionKey(auth_key, seed, 1);
166    jcr->sd_auth_key = bstrdup(auth_key);
167    dir->fsend(OK_job, jcr->VolSessionId, jcr->VolSessionTime, auth_key);
168    memset(auth_key, 0, sizeof(auth_key));
169    Dmsg2(50, ">dird jid=%u: %s", (uint32_t)jcr->JobId, dir->msg);
170 
171    DispatchNewPluginOptions(jcr);
172    GeneratePluginEvent(jcr, bsdEventJobStart, (void *)"JobStart");
173 
174    return true;
175 }
176 
DoJobRun(JobControlRecord * jcr)177 bool DoJobRun(JobControlRecord *jcr)
178 {
179    struct timeval tv;
180    struct timezone tz;
181    struct timespec timeout;
182    int errstat = 0;
183 
184    jcr->sendJobStatus(JS_WaitFD);          /* wait for FD to connect */
185 
186    gettimeofday(&tv, &tz);
187    timeout.tv_nsec = tv.tv_usec * 1000;
188    timeout.tv_sec = tv.tv_sec + me->client_wait;
189 
190    Dmsg3(50, "%s waiting %d sec for FD to contact SD key=%s\n",
191          jcr->Job, (int)(timeout.tv_sec-time(NULL)), jcr->sd_auth_key);
192    Dmsg2(800, "Wait FD for jid=%d %p\n", jcr->JobId, jcr);
193 
194    /*
195     * Wait for the File daemon to contact us to start the Job,
196     * when he does, we will be released, unless the 30 minutes
197     * expires.
198     */
199    P(mutex);
200    while (!jcr->authenticated && !JobCanceled(jcr)) {
201       errstat = pthread_cond_timedwait(&jcr->job_start_wait, &mutex, &timeout);
202       if (errstat == ETIMEDOUT || errstat == EINVAL || errstat == EPERM) {
203          break;
204       }
205       Dmsg1(800, "=== Auth cond errstat=%d\n", errstat);
206    }
207    Dmsg3(50, "Auth=%d canceled=%d errstat=%d\n", jcr->authenticated,
208          JobCanceled(jcr), errstat);
209    V(mutex);
210    Dmsg2(800, "Auth fail or cancel for jid=%d %p\n", jcr->JobId, jcr);
211 
212    memset(jcr->sd_auth_key, 0, strlen(jcr->sd_auth_key));
213    switch (jcr->getJobProtocol()) {
214    case PT_NDMP_BAREOS:
215       if (jcr->authenticated && !JobCanceled(jcr)) {
216          Dmsg2(800, "Running jid=%d %p\n", jcr->JobId, jcr);
217 
218          /*
219           * Wait for the Job to finish. As we want exclusive access to
220           * things like the connection to the director we suspend this
221           * thread and let the actual NDMP connection wake us after it
222           * has performed the backup. E.g. instead of doing a busy wait
223           * we just hang on a conditional variable.
224           */
225          Dmsg2(800, "Wait for end job jid=%d %p\n", jcr->JobId, jcr);
226          P(mutex);
227          pthread_cond_wait(&jcr->job_end_wait, &mutex);
228          V(mutex);
229       }
230       Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
231 
232       /*
233        * For a NDMP backup we expect the protocol to send us either a nextrun cmd
234        * or a finish cmd to let us know they are finished.
235        */
236       return true;
237    default:
238       /*
239        * Handle the file daemon session.
240        */
241       if (jcr->authenticated && !JobCanceled(jcr)) {
242          Dmsg2(800, "Running jid=%d %p\n", jcr->JobId, jcr);
243          RunJob(jcr);                   /* Run the job */
244       }
245       Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
246 
247       /*
248        * After a run cmd of a native backup we are done e.g.
249        * return false.
250        */
251       return false;
252    }
253 }
254 
nextRunCmd(JobControlRecord * jcr)255 bool nextRunCmd(JobControlRecord *jcr)
256 {
257    char auth_key[MAX_NAME_LENGTH];
258    char seed[MAX_NAME_LENGTH];
259    BareosSocket *dir = jcr->dir_bsock;
260    struct timeval tv;
261    struct timezone tz;
262    struct timespec timeout;
263    int errstat = 0;
264 
265    switch (jcr->getJobProtocol()) {
266    case PT_NDMP_BAREOS:
267       /*
268        * We expect a next NDMP backup stream so clear the authenticated flag
269        * and start waiting for the Next backup to Start.
270        */
271       jcr->authenticated = false;
272 
273       /*
274        * Pass back a new authorization key for the File daemon
275        */
276       Bsnprintf(seed, sizeof(seed), "%p%d", jcr, jcr->JobId);
277       MakeSessionKey(auth_key, seed, 1);
278       if (jcr->sd_auth_key) {
279          free(jcr->sd_auth_key);
280       }
281       jcr->sd_auth_key = bstrdup(auth_key);
282       dir->fsend(OK_nextrun, auth_key);
283       memset(auth_key, 0, sizeof(auth_key));
284       Dmsg2(50, ">dird jid=%u: %s", (uint32_t)jcr->JobId, dir->msg);
285 
286       jcr->sendJobStatus(JS_WaitFD);          /* wait for FD to connect */
287 
288       gettimeofday(&tv, &tz);
289       timeout.tv_nsec = tv.tv_usec * 1000;
290       timeout.tv_sec = tv.tv_sec + me->client_wait;
291 
292       Dmsg3(50, "%s waiting %d sec for FD to contact SD key=%s\n",
293             jcr->Job, (int)(timeout.tv_sec-time(NULL)), jcr->sd_auth_key);
294       Dmsg2(800, "Wait FD for jid=%d %p\n", jcr->JobId, jcr);
295 
296       P(mutex);
297       while (!jcr->authenticated && !JobCanceled(jcr)) {
298          errstat = pthread_cond_timedwait(&jcr->job_start_wait, &mutex, &timeout);
299          if (errstat == ETIMEDOUT || errstat == EINVAL || errstat == EPERM) {
300             break;
301          }
302          Dmsg1(800, "=== Auth cond errstat=%d\n", errstat);
303       }
304       Dmsg3(50, "Auth=%d canceled=%d errstat=%d\n", jcr->authenticated,
305             JobCanceled(jcr), errstat);
306       V(mutex);
307       Dmsg2(800, "Auth fail or cancel for jid=%d %p\n", jcr->JobId, jcr);
308 
309       if (jcr->authenticated && !JobCanceled(jcr)) {
310          Dmsg2(800, "Running jid=%d %p\n", jcr->JobId, jcr);
311 
312          /*
313           * Wait for the Job to finish. As we want exclusive access to
314           * things like the connection to the director we suspend this
315           * thread and let the actual NDMP connection wake us after it
316           * has performed the backup. E.g. instead of doing a busy wait
317           * we just hang on a conditional variable.
318           */
319          Dmsg2(800, "Wait for end job jid=%d %p\n", jcr->JobId, jcr);
320          P(mutex);
321          pthread_cond_wait(&jcr->job_end_wait, &mutex);
322          V(mutex);
323       }
324       Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
325 
326       /*
327        * For a NDMP backup we expect the protocol to send us either a nextrun cmd
328        * or a finish cmd to let us know they are finished.
329        */
330       return true;
331    default:
332       Dmsg1(200, "NextRunCmd: %s", jcr->dir_bsock->msg);
333       Jmsg2(jcr, M_FATAL, 0, _("Hey!!!! JobId %u Job %s tries to use nextrun cmd while not part of protocol.\n"),
334             (uint32_t)jcr->JobId, jcr->Job);
335       return false;
336    }
337 }
338 
FinishCmd(JobControlRecord * jcr)339 bool FinishCmd(JobControlRecord *jcr)
340 {
341    BareosSocket *dir = jcr->dir_bsock;
342    char ec1[30];
343 
344    /*
345     * See if the Job has a certain protocol. Some protocols allow the
346     * finish cmd some do not (Native backup for example does NOT)
347     */
348    switch (jcr->getJobProtocol()) {
349    case PT_NDMP_BAREOS:
350       Dmsg1(200, "Finish_cmd: %s", jcr->dir_bsock->msg);
351 
352       jcr->end_time = time(NULL);
353       DequeueMessages(jcr);             /* send any queued messages */
354       jcr->setJobStatus(JS_Terminated);
355 
356       switch (jcr->getJobType()) {
357       case JT_BACKUP:
358          EndOfNdmpBackup(jcr);
359          break;
360       case JT_RESTORE:
361          EndOfNdmpRestore(jcr);
362          break;
363       default:
364          break;
365       }
366 
367       GeneratePluginEvent(jcr, bsdEventJobEnd);
368 
369       dir->fsend(Job_end, jcr->Job, jcr->JobStatus, jcr->JobFiles,
370                  edit_uint64(jcr->JobBytes, ec1), jcr->JobErrors);
371       dir->signal(BNET_EOD);             /* send EOD to Director daemon */
372 
373       FreePlugins(jcr);                 /* release instantiated plugins */
374 
375       Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
376 
377       return false;                      /* Continue DIR session ? */
378    default:
379       Dmsg1(200, "Finish_cmd: %s", jcr->dir_bsock->msg);
380       Jmsg2(jcr, M_FATAL, 0, _("Hey!!!! JobId %u Job %s tries to use finish cmd while not part of protocol.\n"),
381             (uint32_t)jcr->JobId, jcr->Job);
382       return false;                      /* Continue DIR session ? */
383    }
384 }
385 
#ifdef needed
/**
 * Query Device command from the Director.
 *
 * Sends the Storage Daemon's information on the named device or
 * autochanger to the caller (presumably the Director).
 *
 * Returns true when the command could not be parsed or when no matching
 * device/autochanger was found (an error response is sent in both
 * cases). When a matching resource is found, the result of the update
 * and of sending the OK response is returned instead.
 */
bool QueryCmd(JobControlRecord *jcr)
{
   PoolMem dev_name, VolumeName, MediaType, ChangerName;
   BareosSocket *dir = jcr->dir_bsock;
   DeviceResource *device;
   AUTOCHANGER *changer;
   bool ok;

   Dmsg1(100, "Query_cmd: %s", dir->msg);
   ok = sscanf(dir->msg, query_device, dev_name.c_str()) == 1;
   Dmsg1(100, "<dird: %s", dir->msg);

   /*
    * Malformed command: echo it back in a BAD_query response.
    */
   if (!ok) {
      UnbashSpaces(dir->msg);
      PmStrcpy(jcr->errmsg, dir->msg);
      dir->fsend(BAD_query, jcr->errmsg);
      Dmsg1(100, ">dird: %s", dir->msg);
      return true;
   }

   UnbashSpaces(dev_name);

   /*
    * First look for a device resource with this name.
    */
   foreach_res(device, R_DEVICE) {
      if (!bstrcmp(dev_name.c_str(), device->name())) {
         continue;
      }

      /* Found the resource, make sure we were able to open it */
      if (!device->dev) {
         device->dev = InitDev(jcr, device);
      }
      if (!device->dev) {
         break;
      }

      ok = dir_update_device(jcr, device->dev);
      if (ok) {
         ok = dir->fsend(OK_query);
      } else {
         dir->fsend(NO_query);
      }
      return ok;
   }

   /*
    * Then look for an autochanger resource with this name.
    */
   foreach_res(changer, R_AUTOCHANGER) {
      if (!bstrcmp(dev_name.c_str(), changer->name())) {
         continue;
      }
      if (!changer->device || changer->device->size() == 0) {
         continue;                     /* no devices */
      }

      ok = dir_update_changer(jcr, changer);
      if (ok) {
         ok = dir->fsend(OK_query);
      } else {
         dir->fsend(NO_query);
      }
      return ok;
   }

   /* If we get here, the device/autochanger was not found */
   UnbashSpaces(dir->msg);
   PmStrcpy(jcr->errmsg, dir->msg);
   dir->fsend(NO_device, dev_name.c_str());
   Dmsg1(100, ">dird: %s", dir->msg);

   return true;
}
#endif
456 
457 /**
458  * Destroy the Job Control Record and associated
459  * resources (sockets).
460  */
StoredFreeJcr(JobControlRecord * jcr)461 void StoredFreeJcr(JobControlRecord *jcr)
462 {
463    Dmsg0(200, "Start stored FreeJcr\n");
464    Dmsg2(800, "End Job JobId=%u %p\n", jcr->JobId, jcr);
465 
466    if (jcr->dir_bsock) {
467       Dmsg2(800, "Send Terminate jid=%d %p\n", jcr->JobId, jcr);
468       jcr->dir_bsock->signal(BNET_EOD);
469       jcr->dir_bsock->signal(BNET_TERMINATE);
470    }
471 
472    if (jcr->store_bsock) {
473       jcr->store_bsock->close();
474       delete jcr->store_bsock;
475       jcr->store_bsock = NULL;
476    }
477 
478    if (jcr->file_bsock) {
479       jcr->file_bsock->close();
480       delete jcr->file_bsock;
481       jcr->file_bsock = NULL;
482    }
483 
484    if (jcr->job_name) {
485       FreePoolMemory(jcr->job_name);
486    }
487 
488    if (jcr->client_name) {
489       FreeMemory(jcr->client_name);
490       jcr->client_name = NULL;
491    }
492 
493    if (jcr->fileset_name) {
494       FreeMemory(jcr->fileset_name);
495    }
496 
497    if (jcr->fileset_md5) {
498       FreeMemory(jcr->fileset_md5);
499    }
500 
501    if (jcr->backup_format) {
502       FreeMemory(jcr->backup_format);
503    }
504 
505    if (jcr->bsr) {
506       libbareos::FreeBsr(jcr->bsr);
507       jcr->bsr = NULL;
508    }
509 
510    if (jcr->rctx) {
511       FreeReadContext(jcr->rctx);
512       jcr->rctx = NULL;
513    }
514 
515    if (jcr->compress.deflate_buffer || jcr->compress.inflate_buffer) {
516       CleanupCompression(jcr);
517    }
518 
519    /*
520     * Free any restore volume list created
521     */
522    FreeRestoreVolumeList(jcr);
523    if (jcr->RestoreBootstrap) {
524       SecureErase(jcr, jcr->RestoreBootstrap);
525       FreePoolMemory(jcr->RestoreBootstrap);
526       jcr->RestoreBootstrap = NULL;
527    }
528 
529    if (jcr->next_dev || jcr->prev_dev) {
530       Emsg0(M_FATAL, 0, _("In FreeJcr(), but still attached to device!!!!\n"));
531    }
532 
533    pthread_cond_destroy(&jcr->job_start_wait);
534    pthread_cond_destroy(&jcr->job_end_wait);
535 
536    if (jcr->dcrs) {
537       delete jcr->dcrs;
538       jcr->dcrs = NULL;
539    }
540 
541    /*
542     * Avoid a double free
543     */
544    if (jcr->dcr == jcr->read_dcr) {
545       jcr->read_dcr = NULL;
546    }
547 
548    if (jcr->dcr) {
549       FreeDeviceControlRecord(jcr->dcr);
550       jcr->dcr = NULL;
551    }
552 
553    if (jcr->read_dcr) {
554       FreeDeviceControlRecord(jcr->read_dcr);
555       jcr->read_dcr = NULL;
556    }
557 
558    if (jcr->plugin_options) {
559       delete jcr->plugin_options;
560    }
561 
562    if (jcr->read_store) {
563       DirectorStorage *store = nullptr;
564       foreach_alist(store, jcr->read_store) {
565          delete store->device;
566          delete store;
567       }
568       delete jcr->read_store;
569       jcr->read_store = NULL;
570    }
571 
572    if (jcr->write_store) {
573       DirectorStorage *store = nullptr;
574       foreach_alist(store, jcr->write_store) {
575          delete store->device;
576          delete store;
577       }
578       delete jcr->write_store;
579       jcr->write_store = NULL;
580    }
581 
582    FreePlugins(jcr);                 /* release instantiated plugins */
583 
584    Dsm_check(200);
585 
586    if (jcr->JobId != 0) {
587       WriteStateFile(me->working_directory, "bareos-sd", GetFirstPortHostOrder(me->SDaddrs));
588    }
589 
590    Dmsg0(200, "End stored FreeJcr\n");
591 
592    return;
593 }
594 
595 } /* namespace storagedaemon */
596