1 /*
2    BAREOS® - Backup Archiving REcovery Open Sourced
3 
4    Copyright (C) 2000-2011 Free Software Foundation Europe e.V.
5    Copyright (C) 2011-2012 Planets Communications B.V.
6    Copyright (C) 2013-2016 Bareos GmbH & Co. KG
7 
8    This program is Free Software; you can redistribute it and/or
9    modify it under the terms of version three of the GNU Affero General Public
10    License as published by the Free Software Foundation and included
11    in the file LICENSE.
12 
13    This program is distributed in the hope that it will be useful, but
14    WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16    Affero General Public License for more details.
17 
18    You should have received a copy of the GNU Affero General Public License
19    along with this program; if not, write to the Free Software
20    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21    02110-1301, USA.
22 */
23 /*
24  * Kern Sibbald, MM
25  */
26 /**
27  * @file
28  * Job control and execution for Storage Daemon
29  */
30 
31 #include "include/bareos.h"
32 #include "stored/stored.h"
33 #include "stored/bsr.h"
34 #include "stored/acquire.h"
35 #include "stored/fd_cmds.h"
36 #include "stored/jcr_private.h"
37 #include "stored/ndmp_tape.h"
38 #include "stored/read_record.h"
39 #include "stored/stored_globals.h"
40 #include "lib/bsock.h"
41 #include "lib/edit.h"
42 #include "lib/parse_bsr.h"
43 #include "lib/util.h"
44 #include "include/jcr.h"
45 #include "include/protocol_types.h"
46 
47 namespace storagedaemon {
48 
/*
 * Held around the waits on the per-job job_start_wait and job_end_wait
 * condition variables in DoJobRun() and nextRunCmd().
 */
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Requests from the Director daemon.
 *
 * The format holds 19 conversions; job_cmd() rejects any command for which
 * sscanf() does not report exactly 19 assigned fields.
 */
static const char jobcmd[] =
    "JobId=%d job=%127s job_name=%127s client_name=%127s "
    "type=%d level=%d FileSet=%127s NoAttr=%d SpoolAttr=%d FileSetMD5=%127s "
    "SpoolData=%d PreferMountedVols=%d SpoolSize=%127s "
    "rerunning=%d VolSessionId=%d VolSessionTime=%d Quota=%llu "
    "Protocol=%d BackupFormat=%127s\n";

/* Responses sent to Director daemon */
static const char OK_job[] =
    "3000 OK Job SDid=%u SDtime=%u Authorization=%s\n";
static const char OK_nextrun[] = "3000 OK Job Authorization=%s\n";
static const char BAD_job[] = "3915 Bad Job command. stat=%d CMD: %s\n";
static const char Job_end[] =
    "3099 Job %s end JobStatus=%d JobFiles=%d JobBytes=%s JobErrors=%u\n";
65 
66 /**
67  * Director requests us to start a job
68  * Basic tasks done here:
69  *  - We pickup the JobId to be run from the Director.
70  *  - We pickup the device, media, and pool from the Director
71  *  - Wait for a connection from the File Daemon (FD)
72  *  - Accept commands from the FD (i.e. run the job)
73  *  - Return when the connection is terminated or
74  *    there is an error.
75  */
job_cmd(JobControlRecord * jcr)76 bool job_cmd(JobControlRecord* jcr)
77 {
78   int32_t JobId;
79   char auth_key[MAX_NAME_LENGTH];
80   char seed[MAX_NAME_LENGTH];
81   char spool_size[MAX_NAME_LENGTH];
82   BareosSocket* dir = jcr->dir_bsock;
83   PoolMem job_name, client_name, job, fileset_name, fileset_md5, backup_format;
84   int32_t JobType, level, spool_attributes, no_attributes, spool_data;
85   int32_t PreferMountedVols, rerunning, protocol;
86   int status;
87   uint64_t quota = 0;
88   JobControlRecord* ojcr;
89 
90   /*
91    * Get JobId and permissions from Director
92    */
93   Dmsg1(100, "<dird: %s", dir->msg);
94   bstrncpy(spool_size, "0", sizeof(spool_size));
95   status = sscanf(dir->msg, jobcmd, &JobId, job.c_str(), job_name.c_str(),
96                   client_name.c_str(), &JobType, &level, fileset_name.c_str(),
97                   &no_attributes, &spool_attributes, fileset_md5.c_str(),
98                   &spool_data, &PreferMountedVols, spool_size, &rerunning,
99                   &jcr->VolSessionId, &jcr->VolSessionTime, &quota, &protocol,
100                   backup_format.c_str());
101   if (status != 19) {
102     PmStrcpy(jcr->errmsg, dir->msg);
103     dir->fsend(BAD_job, status, jcr->errmsg);
104     Dmsg1(100, ">dird: %s", dir->msg);
105     jcr->setJobStatus(JS_ErrorTerminated);
106     return false;
107   }
108 
109   jcr->rerunning = (rerunning) ? true : false;
110   jcr->setJobProtocol(protocol);
111 
112   Dmsg4(100, "rerunning=%d VolSesId=%d VolSesTime=%d Protocol=%d\n",
113         jcr->rerunning, jcr->VolSessionId, jcr->VolSessionTime,
114         jcr->getJobProtocol());
115   /*
116    * Since this job could be rescheduled, we
117    *  check to see if we have it already. If so
118    *  free the old jcr and use the new one.
119    */
120   ojcr = get_jcr_by_full_name(job.c_str());
121   if (ojcr && !ojcr->authenticated) {
122     Dmsg2(100, "Found ojcr=0x%x Job %s\n", (unsigned)(intptr_t)ojcr,
123           job.c_str());
124     FreeJcr(ojcr);
125   }
126   jcr->JobId = JobId;
127   Dmsg2(800, "Start JobId=%d %p\n", JobId, jcr);
128   /*
129    * If job rescheduled because previous was incomplete,
130    * the Resched flag is set and VolSessionId and VolSessionTime
131    * are given to us (same as restarted job).
132    */
133   if (!jcr->rerunning) {
134     jcr->VolSessionId = NewVolSessionId();
135     jcr->VolSessionTime = vol_session_time;
136   }
137   bstrncpy(jcr->Job, job, sizeof(jcr->Job));
138   UnbashSpaces(job_name);
139   jcr->impl->job_name = GetPoolMemory(PM_NAME);
140   PmStrcpy(jcr->impl->job_name, job_name);
141   UnbashSpaces(client_name);
142   jcr->client_name = GetPoolMemory(PM_NAME);
143   PmStrcpy(jcr->client_name, client_name);
144   UnbashSpaces(fileset_name);
145   jcr->impl->fileset_name = GetPoolMemory(PM_NAME);
146   PmStrcpy(jcr->impl->fileset_name, fileset_name);
147   jcr->setJobType(JobType);
148   jcr->setJobLevel(level);
149   jcr->impl->no_attributes = no_attributes;
150   jcr->impl->spool_attributes = spool_attributes;
151   jcr->impl->spool_data = spool_data;
152   jcr->impl->spool_size = str_to_int64(spool_size);
153   jcr->impl->fileset_md5 = GetPoolMemory(PM_NAME);
154   PmStrcpy(jcr->impl->fileset_md5, fileset_md5);
155   jcr->impl->PreferMountedVols = PreferMountedVols;
156   jcr->impl->RemainingQuota = quota;
157   UnbashSpaces(backup_format);
158   jcr->impl->backup_format = GetPoolMemory(PM_NAME);
159   PmStrcpy(jcr->impl->backup_format, backup_format);
160   jcr->authenticated = false;
161 
162   Dmsg1(50, "Quota set as %llu\n", quota);
163 
164   /*
165    * Pass back an authorization key for the File daemon
166    */
167   Bsnprintf(seed, sizeof(seed), "%p%d", jcr, JobId);
168   MakeSessionKey(auth_key, seed, 1);
169   jcr->sd_auth_key = strdup(auth_key);
170   dir->fsend(OK_job, jcr->VolSessionId, jcr->VolSessionTime, auth_key);
171   memset(auth_key, 0, sizeof(auth_key));
172   Dmsg2(50, ">dird jid=%u: %s", (uint32_t)jcr->JobId, dir->msg);
173 
174   DispatchNewPluginOptions(jcr);
175   GeneratePluginEvent(jcr, bsdEventJobStart, (void*)"JobStart");
176 
177   return true;
178 }
179 
DoJobRun(JobControlRecord * jcr)180 bool DoJobRun(JobControlRecord* jcr)
181 {
182   struct timeval tv;
183   struct timezone tz;
184   struct timespec timeout;
185   int errstat = 0;
186 
187   jcr->sendJobStatus(JS_WaitFD); /* wait for FD to connect */
188 
189   gettimeofday(&tv, &tz);
190   timeout.tv_nsec = tv.tv_usec * 1000;
191   timeout.tv_sec = tv.tv_sec + me->client_wait;
192 
193   Dmsg3(50, "%s waiting %d sec for FD to contact SD key=%s\n", jcr->Job,
194         (int)(timeout.tv_sec - time(NULL)), jcr->sd_auth_key);
195   Dmsg2(800, "Wait FD for jid=%d %p\n", jcr->JobId, jcr);
196 
197   /*
198    * Wait for the File daemon to contact us to start the Job,
199    * when he does, we will be released, unless the 30 minutes
200    * expires.
201    */
202   P(mutex);
203   while (!jcr->authenticated && !JobCanceled(jcr)) {
204     errstat =
205         pthread_cond_timedwait(&jcr->impl->job_start_wait, &mutex, &timeout);
206     if (errstat == ETIMEDOUT || errstat == EINVAL || errstat == EPERM) {
207       break;
208     }
209     Dmsg1(800, "=== Auth cond errstat=%d\n", errstat);
210   }
211   Dmsg3(50, "Auth=%d canceled=%d errstat=%d\n", jcr->authenticated,
212         JobCanceled(jcr), errstat);
213   V(mutex);
214   Dmsg2(800, "Auth fail or cancel for jid=%d %p\n", jcr->JobId, jcr);
215 
216   memset(jcr->sd_auth_key, 0, strlen(jcr->sd_auth_key));
217   switch (jcr->getJobProtocol()) {
218     case PT_NDMP_BAREOS:
219       if (jcr->authenticated && !JobCanceled(jcr)) {
220         Dmsg2(800, "Running jid=%d %p\n", jcr->JobId, jcr);
221 
222         /*
223          * Wait for the Job to finish. As we want exclusive access to
224          * things like the connection to the director we suspend this
225          * thread and let the actual NDMP connection wake us after it
226          * has performed the backup. E.g. instead of doing a busy wait
227          * we just hang on a conditional variable.
228          */
229         Dmsg2(800, "Wait for end job jid=%d %p\n", jcr->JobId, jcr);
230         P(mutex);
231         pthread_cond_wait(&jcr->impl->job_end_wait, &mutex);
232         V(mutex);
233       }
234       Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
235 
236       /*
237        * For a NDMP backup we expect the protocol to send us either a nextrun
238        * cmd or a finish cmd to let us know they are finished.
239        */
240       return true;
241     default:
242       /*
243        * Handle the file daemon session.
244        */
245       if (jcr->authenticated && !JobCanceled(jcr)) {
246         Dmsg2(800, "Running jid=%d %p\n", jcr->JobId, jcr);
247         RunJob(jcr); /* Run the job */
248       }
249       Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
250 
251       /*
252        * After a run cmd of a native backup we are done e.g.
253        * return false.
254        */
255       return false;
256   }
257 }
258 
nextRunCmd(JobControlRecord * jcr)259 bool nextRunCmd(JobControlRecord* jcr)
260 {
261   char auth_key[MAX_NAME_LENGTH];
262   char seed[MAX_NAME_LENGTH];
263   BareosSocket* dir = jcr->dir_bsock;
264   struct timeval tv;
265   struct timezone tz;
266   struct timespec timeout;
267   int errstat = 0;
268 
269   switch (jcr->getJobProtocol()) {
270     case PT_NDMP_BAREOS:
271       /*
272        * We expect a next NDMP backup stream so clear the authenticated flag
273        * and start waiting for the Next backup to Start.
274        */
275       jcr->authenticated = false;
276 
277       /*
278        * Pass back a new authorization key for the File daemon
279        */
280       Bsnprintf(seed, sizeof(seed), "%p%d", jcr, jcr->JobId);
281       MakeSessionKey(auth_key, seed, 1);
282       if (jcr->sd_auth_key) { free(jcr->sd_auth_key); }
283       jcr->sd_auth_key = strdup(auth_key);
284       dir->fsend(OK_nextrun, auth_key);
285       memset(auth_key, 0, sizeof(auth_key));
286       Dmsg2(50, ">dird jid=%u: %s", (uint32_t)jcr->JobId, dir->msg);
287 
288       jcr->sendJobStatus(JS_WaitFD); /* wait for FD to connect */
289 
290       gettimeofday(&tv, &tz);
291       timeout.tv_nsec = tv.tv_usec * 1000;
292       timeout.tv_sec = tv.tv_sec + me->client_wait;
293 
294       Dmsg3(50, "%s waiting %d sec for FD to contact SD key=%s\n", jcr->Job,
295             (int)(timeout.tv_sec - time(NULL)), jcr->sd_auth_key);
296       Dmsg2(800, "Wait FD for jid=%d %p\n", jcr->JobId, jcr);
297 
298       P(mutex);
299       while (!jcr->authenticated && !JobCanceled(jcr)) {
300         errstat = pthread_cond_timedwait(&jcr->impl->job_start_wait, &mutex,
301                                          &timeout);
302         if (errstat == ETIMEDOUT || errstat == EINVAL || errstat == EPERM) {
303           break;
304         }
305         Dmsg1(800, "=== Auth cond errstat=%d\n", errstat);
306       }
307       Dmsg3(50, "Auth=%d canceled=%d errstat=%d\n", jcr->authenticated,
308             JobCanceled(jcr), errstat);
309       V(mutex);
310       Dmsg2(800, "Auth fail or cancel for jid=%d %p\n", jcr->JobId, jcr);
311 
312       if (jcr->authenticated && !JobCanceled(jcr)) {
313         Dmsg2(800, "Running jid=%d %p\n", jcr->JobId, jcr);
314 
315         /*
316          * Wait for the Job to finish. As we want exclusive access to
317          * things like the connection to the director we suspend this
318          * thread and let the actual NDMP connection wake us after it
319          * has performed the backup. E.g. instead of doing a busy wait
320          * we just hang on a conditional variable.
321          */
322         Dmsg2(800, "Wait for end job jid=%d %p\n", jcr->JobId, jcr);
323         P(mutex);
324         pthread_cond_wait(&jcr->impl->job_end_wait, &mutex);
325         V(mutex);
326       }
327       Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
328 
329       /*
330        * For a NDMP backup we expect the protocol to send us either a nextrun
331        * cmd or a finish cmd to let us know they are finished.
332        */
333       return true;
334     default:
335       Dmsg1(200, "NextRunCmd: %s", jcr->dir_bsock->msg);
336       Jmsg2(jcr, M_FATAL, 0,
337             _("Hey!!!! JobId %u Job %s tries to use nextrun cmd while not part "
338               "of protocol.\n"),
339             (uint32_t)jcr->JobId, jcr->Job);
340       return false;
341   }
342 }
343 
FinishCmd(JobControlRecord * jcr)344 bool FinishCmd(JobControlRecord* jcr)
345 {
346   BareosSocket* dir = jcr->dir_bsock;
347   char ec1[30];
348 
349   /*
350    * See if the Job has a certain protocol. Some protocols allow the
351    * finish cmd some do not (Native backup for example does NOT)
352    */
353   switch (jcr->getJobProtocol()) {
354     case PT_NDMP_BAREOS:
355       Dmsg1(200, "Finish_cmd: %s", jcr->dir_bsock->msg);
356 
357       jcr->end_time = time(NULL);
358       DequeueMessages(jcr); /* send any queued messages */
359       jcr->setJobStatus(JS_Terminated);
360 
361       switch (jcr->getJobType()) {
362         case JT_BACKUP:
363           EndOfNdmpBackup(jcr);
364           break;
365         case JT_RESTORE:
366           EndOfNdmpRestore(jcr);
367           break;
368         default:
369           break;
370       }
371 
372       GeneratePluginEvent(jcr, bsdEventJobEnd);
373 
374       dir->fsend(Job_end, jcr->Job, jcr->JobStatus, jcr->JobFiles,
375                  edit_uint64(jcr->JobBytes, ec1), jcr->JobErrors);
376       dir->signal(BNET_EOD); /* send EOD to Director daemon */
377 
378       FreePlugins(jcr); /* release instantiated plugins */
379 
380       Dmsg2(800, "Done jid=%d %p\n", jcr->JobId, jcr);
381 
382       return false; /* Continue DIR session ? */
383     default:
384       Dmsg1(200, "Finish_cmd: %s", jcr->dir_bsock->msg);
385       Jmsg2(jcr, M_FATAL, 0,
386             _("Hey!!!! JobId %u Job %s tries to use finish cmd while not part "
387               "of protocol.\n"),
388             (uint32_t)jcr->JobId, jcr->Job);
389       return false; /* Continue DIR session ? */
390   }
391 }
392 
393 /**
394  * Destroy the Job Control Record and associated
395  * resources (sockets).
396  */
StoredFreeJcr(JobControlRecord * jcr)397 void StoredFreeJcr(JobControlRecord* jcr)
398 {
399   Dmsg0(200, "Start stored FreeJcr\n");
400   Dmsg2(800, "End Job JobId=%u %p\n", jcr->JobId, jcr);
401 
402   if (jcr->dir_bsock) {
403     Dmsg2(800, "Send Terminate jid=%d %p\n", jcr->JobId, jcr);
404     jcr->dir_bsock->signal(BNET_EOD);
405     jcr->dir_bsock->signal(BNET_TERMINATE);
406   }
407 
408   if (jcr->store_bsock) {
409     jcr->store_bsock->close();
410     delete jcr->store_bsock;
411     jcr->store_bsock = NULL;
412   }
413 
414   if (jcr->file_bsock) {
415     jcr->file_bsock->close();
416     delete jcr->file_bsock;
417     jcr->file_bsock = NULL;
418   }
419 
420   if (jcr->impl->job_name) { FreePoolMemory(jcr->impl->job_name); }
421 
422   if (jcr->client_name) {
423     FreeMemory(jcr->client_name);
424     jcr->client_name = NULL;
425   }
426 
427   if (jcr->impl->fileset_name) { FreeMemory(jcr->impl->fileset_name); }
428 
429   if (jcr->impl->fileset_md5) { FreeMemory(jcr->impl->fileset_md5); }
430 
431   if (jcr->impl->backup_format) { FreeMemory(jcr->impl->backup_format); }
432 
433   if (jcr->impl->read_session.bsr) {
434     libbareos::FreeBsr(jcr->impl->read_session.bsr);
435     jcr->impl->read_session.bsr = NULL;
436   }
437 
438   if (jcr->impl->read_session.rctx) {
439     FreeReadContext(jcr->impl->read_session.rctx);
440     jcr->impl->read_session.rctx = NULL;
441   }
442 
443   if (jcr->compress.deflate_buffer || jcr->compress.inflate_buffer) {
444     CleanupCompression(jcr);
445   }
446 
447   /*
448    * Free any restore volume list created
449    */
450   FreeRestoreVolumeList(jcr);
451   if (jcr->RestoreBootstrap) {
452     SecureErase(jcr, jcr->RestoreBootstrap);
453     FreePoolMemory(jcr->RestoreBootstrap);
454     jcr->RestoreBootstrap = NULL;
455   }
456 
457   if (jcr->impl->next_dev || jcr->impl->prev_dev) {
458     Emsg0(M_FATAL, 0, _("In FreeJcr(), but still attached to device!!!!\n"));
459   }
460 
461   pthread_cond_destroy(&jcr->impl->job_start_wait);
462   pthread_cond_destroy(&jcr->impl->job_end_wait);
463 
464   /*
465    * Avoid a double free
466    */
467   if (jcr->impl->dcr == jcr->impl->read_dcr) { jcr->impl->read_dcr = NULL; }
468 
469   if (jcr->impl->dcr) {
470     FreeDeviceControlRecord(jcr->impl->dcr);
471     jcr->impl->dcr = NULL;
472   }
473 
474   if (jcr->impl->read_dcr) {
475     FreeDeviceControlRecord(jcr->impl->read_dcr);
476     jcr->impl->read_dcr = NULL;
477   }
478 
479   if (jcr->impl->plugin_options) { delete jcr->impl->plugin_options; }
480 
481   if (jcr->impl->read_store) {
482     DirectorStorage* store = nullptr;
483     foreach_alist (store, jcr->impl->read_store) {
484       delete store->device;
485       delete store;
486     }
487     delete jcr->impl->read_store;
488     jcr->impl->read_store = NULL;
489   }
490 
491   if (jcr->impl->write_store) {
492     DirectorStorage* store = nullptr;
493     foreach_alist (store, jcr->impl->write_store) {
494       delete store->device;
495       delete store;
496     }
497     delete jcr->impl->write_store;
498     jcr->impl->write_store = NULL;
499   }
500 
501   FreePlugins(jcr); /* release instantiated plugins */
502 
503 
504   if (jcr->JobId != 0) {
505     WriteStateFile(me->working_directory, "bareos-sd",
506                    GetFirstPortHostOrder(me->SDaddrs));
507   }
508 
509   if (jcr->impl) {
510     delete jcr->impl;
511     jcr->impl = nullptr;
512   }
513 
514   Dmsg0(200, "End stored FreeJcr\n");
515 
516   return;
517 }
518 
NewStoredJcr()519 JobControlRecord* NewStoredJcr()
520 {
521   JobControlRecord* jcr = new_jcr(StoredFreeJcr);
522   jcr->impl = new JobControlRecordPrivate;
523   return jcr;
524 }
525 
526 } /* namespace storagedaemon */
527