1 /*
2    BAREOS® - Backup Archiving REcovery Open Sourced
3 
4    Copyright (C) 2000-2012 Free Software Foundation Europe e.V.
5    Copyright (C) 2011-2012 Planets Communications B.V.
6    Copyright (C) 2013-2020 Bareos GmbH & Co. KG
7 
8    This program is Free Software; you can redistribute it and/or
9    modify it under the terms of version three of the GNU Affero General Public
10    License as published by the Free Software Foundation and included
11    in the file LICENSE.
12 
13    This program is distributed in the hope that it will be useful, but
14    WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16    Affero General Public License for more details.
17 
18    You should have received a copy of the GNU Affero General Public License
19    along with this program; if not, write to the Free Software
20    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21    02110-1301, USA.
22 */
23 /*
24  * Manipulation routines for Job Control Records and
25  *  handling of last_jobs_list.
26  *
27  *  Kern E. Sibbald, December 2000
28  *
29  *  These routines are thread safe.
30  *
31  *  The job list routines were re-written in May 2005 to
32  *  eliminate the global lock while traversing the list, and
33  *  to use the dlist subroutines.  The locking is now done
34  *  on the list each time the list is modified or traversed.
35  *  That is, it is "micro-locked" rather than globally locked.
36  *  The result is that there is one lock/unlock for each entry
37  *  in the list while traversing it rather than a single lock
38  *  at the beginning of a traversal and one at the end.  This
39  *  incurs slightly more overhead, but effectively eliminates
40  *  the possibility of race conditions.  In addition, with the
41  *  exception of the global locking of the list during the
42  *  re-reading of the config file, no recursion is needed.
43  *
44  */
45 
46 #include "include/bareos.h"
47 #include "include/jcr.h"
48 #include "lib/berrno.h"
49 #include "lib/bsignal.h"
50 #include "lib/breg.h"
51 #include "lib/edit.h"
52 #include "lib/thread_specific_data.h"
53 #include "lib/tls_conf.h"
54 #include "lib/bsock.h"
55 #include "lib/recent_job_results_list.h"
56 #include "lib/message_queue_item.h"
57 #include "lib/volume_session_info.h"
58 #include "lib/watchdog.h"
59 
60 #include <algorithm>
61 
62 const int debuglevel = 3400;
63 
64 static void JcrTimeoutCheck(watchdog_t* self);
65 
66 int num_jobs_run;
67 
68 static std::vector<std::weak_ptr<JobControlRecord>> job_control_record_cache;
69 static dlist* job_control_record_chain = nullptr;
70 static int watch_dog_timeout = 0;
71 
72 static std::mutex jcr_chain_mutex;
73 static pthread_mutex_t job_start_mutex = PTHREAD_MUTEX_INITIALIZER;
74 
75 static char Job_status[] = "Status Job=%s JobStatus=%d\n";
76 
LockJobs()77 void LockJobs() { P(job_start_mutex); }
78 
UnlockJobs()79 void UnlockJobs() { V(job_start_mutex); }
80 
81 /*
82  * Get an ASCII representation of the Operation being performed as an english
83  * Noun
84  */
get_OperationName()85 const char* JobControlRecord::get_OperationName()
86 {
87   switch (JobType_) {
88     case JT_BACKUP:
89       return _("Backup");
90     case JT_VERIFY:
91       return _("Verifying");
92     case JT_RESTORE:
93       return _("Restoring");
94     case JT_ARCHIVE:
95       return _("Archiving");
96     case JT_COPY:
97       return _("Copying");
98     case JT_MIGRATE:
99       return _("Migration");
100     case JT_SCAN:
101       return _("Scanning");
102     case JT_CONSOLIDATE:
103       return _("Consolidating");
104     default:
105       return _("Unknown operation");
106   }
107 }
108 
109 /*
110  * Get an ASCII representation of the Action being performed either an english
111  * Verb or Adjective
112  */
get_ActionName(bool past)113 const char* JobControlRecord::get_ActionName(bool past)
114 {
115   switch (JobType_) {
116     case JT_BACKUP:
117       return _("backup");
118     case JT_VERIFY:
119       return (past) ? _("verified") : _("verify");
120     case JT_RESTORE:
121       return (past) ? _("restored") : _("restore");
122     case JT_ARCHIVE:
123       return (past) ? _("archived") : _("archive");
124     case JT_COPY:
125       return (past) ? _("copied") : _("copy");
126     case JT_MIGRATE:
127       return (past) ? _("migrated") : _("migrate");
128     case JT_SCAN:
129       return (past) ? _("scanned") : _("scan");
130     case JT_CONSOLIDATE:
131       return (past) ? _("consolidated") : _("consolidate");
132     default:
133       return _("unknown action");
134   }
135 }
136 
JobReads()137 bool JobControlRecord::JobReads()
138 {
139   switch (JobType_) {
140     case JT_VERIFY:
141     case JT_RESTORE:
142     case JT_COPY:
143     case JT_MIGRATE:
144       return true;
145     case JT_BACKUP:
146       if (JobLevel_ == L_VIRTUAL_FULL) { return true; }
147       break;
148     default:
149       break;
150   }
151   return false;
152 }
153 
154 struct job_callback_item {
155   void (*JobEndCb)(JobControlRecord* jcr, void*);
156   void* ctx{};
157 };
158 
159 /*
160  * Push a job_callback_item onto the job end callback stack.
161  */
RegisterJobEndCallback(JobControlRecord * jcr,void JobEndCb (JobControlRecord * jcr,void *),void * ctx)162 void RegisterJobEndCallback(JobControlRecord* jcr,
163                             void JobEndCb(JobControlRecord* jcr, void*),
164                             void* ctx)
165 {
166   job_callback_item* item;
167 
168   item = (job_callback_item*)malloc(sizeof(job_callback_item));
169 
170   item->JobEndCb = JobEndCb;
171   item->ctx = ctx;
172 
173   jcr->job_end_callbacks.push((void*)item);
174 }
175 
176 /*
177  * Pop each job_callback_item and process it.
178  */
CallJobEndCallbacks(JobControlRecord * jcr)179 static void CallJobEndCallbacks(JobControlRecord* jcr)
180 {
181   job_callback_item* item;
182 
183   if (jcr->job_end_callbacks.size() > 0) {
184     item = (job_callback_item*)jcr->job_end_callbacks.pop();
185     while (item) {
186       item->JobEndCb(jcr, item->ctx);
187       free(item);
188       item = (job_callback_item*)jcr->job_end_callbacks.pop();
189     }
190   }
191 }
192 
JobControlRecord()193 JobControlRecord::JobControlRecord()
194 {
195   Dmsg0(100, "Construct JobControlRecord\n");
196 
197   MessageQueueItem* item = nullptr;
198   msg_queue = new dlist(item, &item->link_);  // calculate offset
199 
200   int status;
201   if ((status = pthread_mutex_init(&msg_queue_mutex, nullptr)) != 0) {
202     BErrNo be;
203     Jmsg(nullptr, M_ABORT, 0, _("Could not init msg_queue mutex. ERR=%s\n"),
204          be.bstrerror(status));
205   }
206 
207   my_thread_id = pthread_self();
208   job_end_callbacks.init(1, false);
209   sched_time = time(nullptr);
210   initial_sched_time = sched_time;
211   InitMutex();
212   IncUseCount();
213   VolumeName = GetPoolMemory(PM_FNAME);
214   VolumeName[0] = 0;
215   errmsg = GetPoolMemory(PM_MESSAGE);
216   errmsg[0] = 0;
217   comment = GetPoolMemory(PM_FNAME);
218   comment[0] = 0;
219 
220   /*
221    * Setup some dummy values
222    */
223   bstrncpy(Job, "*System*", sizeof(Job));
224   JobId = 0;
225   setJobType(JT_SYSTEM); /* internal job until defined */
226   setJobLevel(L_NONE);
227   setJobStatus(JS_Created); /* ready to run */
228   SetTimeoutHandler();
229 }
230 
new_jcr(JCR_free_HANDLER * daemon_free_jcr)231 JobControlRecord* new_jcr(JCR_free_HANDLER* daemon_free_jcr)
232 {
233   Dmsg0(debuglevel, "Enter new_jcr\n");
234 
235   JobControlRecord* jcr
236       = static_cast<JobControlRecord*>(malloc(sizeof(JobControlRecord)));
237   jcr = new (jcr) JobControlRecord();
238 
239   jcr->daemon_free_jcr = daemon_free_jcr;
240 
241   LockJobs();
242   LockJcrChain();
243   InitJcrChain();
244   job_control_record_chain->append(jcr);
245   UnlockJcrChain();
246   UnlockJobs();
247   return jcr;
248 }
249 
InitJcr(std::shared_ptr<JobControlRecord> jcr,JCR_free_HANDLER * daemon_free_jcr)250 void InitJcr(std::shared_ptr<JobControlRecord> jcr,
251              JCR_free_HANDLER* daemon_free_jcr)
252 {
253   jcr->daemon_free_jcr = daemon_free_jcr;
254 
255   LockJobs();
256   LockJcrChain();
257   job_control_record_cache.emplace_back(jcr);
258   UnlockJcrChain();
259   UnlockJobs();
260 }
261 
262 /*
263  * Remove a JobControlRecord from the chain
264  *
265  * NOTE! The chain must be locked prior to calling this routine.
266  */
RemoveJcr(JobControlRecord * jcr)267 static void RemoveJcr(JobControlRecord* jcr)
268 {
269   Dmsg0(debuglevel, "Enter RemoveJcr\n");
270   if (!jcr) { Emsg0(M_ABORT, 0, _("nullptr jcr.\n")); }
271   job_control_record_chain->remove(jcr);
272   Dmsg0(debuglevel, "Leave RemoveJcr\n");
273 }
274 
FreeCommonJcr(JobControlRecord * jcr,bool is_destructor_call=false)275 static void FreeCommonJcr(JobControlRecord* jcr,
276                           bool is_destructor_call = false)
277 {
278   Dmsg1(100, "FreeCommonJcr: %p \n", jcr);
279 
280   if (!jcr) { Dmsg0(100, "FreeCommonJcr: Invalid jcr\n"); }
281 
282   RemoveJcrFromThreadSpecificData(jcr);
283   jcr->SetKillable(false);
284 
285   jcr->DestroyMutex();
286 
287   if (jcr->msg_queue) {
288     delete jcr->msg_queue;
289     jcr->msg_queue = nullptr;
290     pthread_mutex_destroy(&jcr->msg_queue_mutex);
291   }
292 
293   if (jcr->client_name) {
294     FreePoolMemory(jcr->client_name);
295     jcr->client_name = nullptr;
296   }
297 
298   if (jcr->attr) {
299     FreePoolMemory(jcr->attr);
300     jcr->attr = nullptr;
301   }
302 
303   if (jcr->sd_auth_key) {
304     free(jcr->sd_auth_key);
305     jcr->sd_auth_key = nullptr;
306   }
307 
308   if (jcr->VolumeName) {
309     FreePoolMemory(jcr->VolumeName);
310     jcr->VolumeName = nullptr;
311   }
312 
313   if (jcr->dir_bsock) {
314     jcr->dir_bsock->close();
315     delete jcr->dir_bsock;
316     jcr->dir_bsock = nullptr;
317   }
318 
319   if (jcr->errmsg) {
320     FreePoolMemory(jcr->errmsg);
321     jcr->errmsg = nullptr;
322   }
323 
324   if (jcr->where) {
325     free(jcr->where);
326     jcr->where = nullptr;
327   }
328 
329   if (jcr->RegexWhere) {
330     free(jcr->RegexWhere);
331     jcr->RegexWhere = nullptr;
332   }
333 
334   if (jcr->where_bregexp) {
335     FreeBregexps(jcr->where_bregexp);
336     delete jcr->where_bregexp;
337     jcr->where_bregexp = nullptr;
338   }
339 
340   if (jcr->cached_path) {
341     FreePoolMemory(jcr->cached_path);
342     jcr->cached_path = nullptr;
343     jcr->cached_pnl = 0;
344   }
345 
346   if (jcr->id_list) {
347     FreeGuidList(jcr->id_list);
348     jcr->id_list = nullptr;
349   }
350 
351   if (jcr->comment) {
352     FreePoolMemory(jcr->comment);
353     jcr->comment = nullptr;
354   }
355 
356   if (!is_destructor_call) { free(jcr); }
357 }
358 
JcrCleanup(JobControlRecord * jcr,bool is_destructor_call=false)359 static void JcrCleanup(JobControlRecord* jcr, bool is_destructor_call = false)
360 {
361   DequeueMessages(jcr);
362   CallJobEndCallbacks(jcr);
363 
364   Dmsg1(debuglevel, "End job=%d\n", jcr->JobId);
365 
366   switch (jcr->getJobType()) {
367     case JT_BACKUP:
368     case JT_VERIFY:
369     case JT_RESTORE:
370     case JT_MIGRATE:
371     case JT_COPY:
372     case JT_ADMIN:
373       if (jcr->JobId > 0) {  // except Console Jobs
374         num_jobs_run++;
375         RecentJobResultsList::Append(jcr);
376       }
377       break;
378     default:
379       break;
380   }
381 
382   CloseMsg(jcr);
383 
384   if (jcr->daemon_free_jcr) { jcr->daemon_free_jcr(jcr); }
385 
386   FreeCommonJcr(jcr, is_destructor_call);
387   CloseMsg(nullptr);  // flush any daemon messages
388 }
389 
~JobControlRecord()390 JobControlRecord::~JobControlRecord()
391 {
392   Dmsg0(100, "Destruct JobControlRecord\n");
393   JcrCleanup(this, true);
394   Dmsg0(debuglevel, "JobControlRecord Destructor finished\n");
395 }
396 
RunJcrGarbageCollector(JobControlRecord * jcr)397 static bool RunJcrGarbageCollector(JobControlRecord* jcr)
398 {
399   LockJcrChain();
400   jcr->DecUseCount(); /* decrement use count */
401   if (jcr->UseCount() < 0) {
402     Jmsg2(jcr, M_ERROR, 0, _("JobControlRecord UseCount=%d JobId=%d\n"),
403           jcr->UseCount(), jcr->JobId);
404   }
405   if (jcr->JobId > 0) {
406     Dmsg3(debuglevel, "Dec FreeJcr jid=%u UseCount=%d Job=%s\n", jcr->JobId,
407           jcr->UseCount(), jcr->Job);
408   }
409   if (jcr->UseCount() > 0) { /* if in use */
410     UnlockJcrChain();
411     return false;
412   }
413   if (jcr->JobId > 0) {
414     Dmsg3(debuglevel, "remove jcr jid=%u UseCount=%d Job=%s\n", jcr->JobId,
415           jcr->UseCount(), jcr->Job);
416   }
417   RemoveJcr(jcr); /* remove Jcr from chain */
418   UnlockJcrChain();
419   return true;
420 }
421 
422 /*
423  * Global routine to free a jcr
424  */
b_free_jcr(const char * file,int line,JobControlRecord * jcr)425 void b_free_jcr(const char* file, int line, JobControlRecord* jcr)
426 {
427   Dmsg3(debuglevel, "Enter FreeJcr jid=%u from %s:%d\n", jcr->JobId, file,
428         line);
429 
430   if (RunJcrGarbageCollector(jcr)) { JcrCleanup(jcr); }
431 
432   Dmsg0(debuglevel, "Exit FreeJcr\n");
433 }
434 
SetKillable(bool killable)435 void JobControlRecord::SetKillable(bool killable)
436 {
437   lock();
438 
439   my_thread_killable = killable;
440   if (killable) {
441     my_thread_id = pthread_self();
442   } else {
443     memset(&my_thread_id, 0, sizeof(my_thread_id));
444   }
445 
446   unlock();
447 }
448 
MyThreadSendSignal(int sig)449 void JobControlRecord::MyThreadSendSignal(int sig)
450 {
451   lock();
452 
453   if (IsKillable() && !pthread_equal(my_thread_id, pthread_self())) {
454     Dmsg1(800, "Send kill to jid=%d\n", JobId);
455     pthread_kill(my_thread_id, sig);
456   } else if (!IsKillable()) {
457     Dmsg1(10, "Warning, can't send kill to jid=%d\n", JobId);
458   }
459 
460   unlock();
461 }
462 
463 
464 /*
465  * Given a JobId, find the JobControlRecord
466  *
467  * Returns: jcr on success
468  *          nullptr on failure
469  */
get_jcr_by_id(uint32_t JobId)470 JobControlRecord* get_jcr_by_id(uint32_t JobId)
471 {
472   JobControlRecord* jcr;
473 
474   foreach_jcr (jcr) {
475     if (jcr->JobId == JobId) {
476       jcr->IncUseCount();
477       Dmsg3(debuglevel, "Inc get_jcr jid=%u UseCount=%d Job=%s\n", jcr->JobId,
478             jcr->UseCount(), jcr->Job);
479       break;
480     }
481   }
482   endeach_jcr(jcr);
483 
484   return jcr;
485 }
486 
GetJcrCount()487 std::size_t GetJcrCount()
488 {
489   LockJcrChain();
490   std::size_t count = count_if(
491       job_control_record_cache.begin(), job_control_record_cache.end(),
492       [](std::weak_ptr<JobControlRecord>& p) { return !p.expired(); });
493   UnlockJcrChain();
494 
495   return count;
496 }
497 
GetJcr(std::function<bool (const JobControlRecord *)> compare)498 static std::shared_ptr<JobControlRecord> GetJcr(
499     std::function<bool(const JobControlRecord*)> compare)
500 {
501   std::shared_ptr<JobControlRecord> result;
502 
503   LockJcrChain();
504 
505   // cleanup chache
506   job_control_record_cache.erase(
507       std::remove_if(
508           job_control_record_cache.begin(), job_control_record_cache.end(),
509           [](std::weak_ptr<JobControlRecord>& p) { return p.expired(); }),
510       job_control_record_cache.end());
511 
512   find_if(job_control_record_cache.begin(), job_control_record_cache.end(),
513           [&compare, &result](std::weak_ptr<JobControlRecord>& p) {
514             auto jcr = p.lock();
515             if (compare(jcr.get())) {
516               result = jcr;
517               return true;
518             }
519             return false;
520           });
521 
522   UnlockJcrChain();
523 
524   return result;
525 }
526 
GetJcrById(uint32_t JobId)527 std::shared_ptr<JobControlRecord> GetJcrById(uint32_t JobId)
528 {
529   return GetJcr(
530       [JobId](const JobControlRecord* jcr) { return jcr->JobId == JobId; });
531 }
532 
GetJcrByFullName(std::string name)533 std::shared_ptr<JobControlRecord> GetJcrByFullName(std::string name)
534 {
535   return GetJcr([&name](const JobControlRecord* jcr) {
536     return std::string{jcr->Job} == name;
537   });
538 }
539 
GetJcrByPartialName(std::string name)540 std::shared_ptr<JobControlRecord> GetJcrByPartialName(std::string name)
541 {
542   return GetJcr([&name](const JobControlRecord* jcr) {
543     return std::string{jcr->Job}.find(name) == 0;
544   });
545 }
546 
GetJcrBySession(const VolumeSessionInfo & vsi)547 std::shared_ptr<JobControlRecord> GetJcrBySession(const VolumeSessionInfo& vsi)
548 {
549   return GetJcr([&vsi](const JobControlRecord* jcr) {
550     return (VolumeSessionInfo{jcr->VolSessionId, jcr->VolSessionTime} == vsi);
551   });
552 }
553 
554 /*
555  * Given a SessionId and SessionTime, find the JobControlRecord
556  *
557  * Returns: jcr on success
558  *          nullptr on failure
559  */
get_jcr_by_session(uint32_t SessionId,uint32_t SessionTime)560 JobControlRecord* get_jcr_by_session(uint32_t SessionId, uint32_t SessionTime)
561 {
562   JobControlRecord* jcr;
563 
564   foreach_jcr (jcr) {
565     if (jcr->VolSessionId == SessionId && jcr->VolSessionTime == SessionTime) {
566       jcr->IncUseCount();
567       Dmsg3(debuglevel, "Inc get_jcr jid=%u UseCount=%d Job=%s\n", jcr->JobId,
568             jcr->UseCount(), jcr->Job);
569       break;
570     }
571   }
572   endeach_jcr(jcr);
573 
574   return jcr;
575 }
576 
577 /*
578  * Given a Job, find the JobControlRecord compares on the number of
579  * characters in Job thus allowing partial matches.
580  *
581  * Returns: jcr on success
582  *          nullptr on failure
583  */
get_jcr_by_partial_name(char * Job)584 JobControlRecord* get_jcr_by_partial_name(char* Job)
585 {
586   JobControlRecord* jcr;
587   int len;
588 
589   if (!Job) { return nullptr; }
590 
591   len = strlen(Job);
592   foreach_jcr (jcr) {
593     if (bstrncmp(Job, jcr->Job, len)) {
594       jcr->IncUseCount();
595       Dmsg3(debuglevel, "Inc get_jcr jid=%u UseCount=%d Job=%s\n", jcr->JobId,
596             jcr->UseCount(), jcr->Job);
597       break;
598     }
599   }
600   endeach_jcr(jcr);
601 
602   return jcr;
603 }
604 
605 /*
606  * Given a Job, find the JobControlRecord requires an exact match of names.
607  *
608  * Returns: jcr on success
609  *          nullptr on failure
610  */
get_jcr_by_full_name(char * Job)611 JobControlRecord* get_jcr_by_full_name(char* Job)
612 {
613   JobControlRecord* jcr;
614 
615   if (!Job) { return nullptr; }
616 
617   foreach_jcr (jcr) {
618     if (bstrcmp(jcr->Job, Job)) {
619       jcr->IncUseCount();
620       Dmsg3(debuglevel, "Inc get_jcr jid=%u UseCount=%d Job=%s\n", jcr->JobId,
621             jcr->UseCount(), jcr->Job);
622       break;
623     }
624   }
625   endeach_jcr(jcr);
626 
627   return jcr;
628 }
629 
JcrGetAuthenticateKey(const char * unified_job_name)630 const char* JcrGetAuthenticateKey(const char* unified_job_name)
631 {
632   if (!unified_job_name) { return nullptr; }
633 
634   JobControlRecord* jcr;
635   const char* auth_key = nullptr;
636   foreach_jcr (jcr) {
637     if (bstrcmp(jcr->Job, unified_job_name)) {
638       auth_key = jcr->sd_auth_key;
639       Dmsg3(debuglevel, "Inc get_jcr jid=%u UseCount=%d Job=%s\n", jcr->JobId,
640             jcr->UseCount(), jcr->Job);
641       break;
642     }
643   }
644   endeach_jcr(jcr);
645 
646   return auth_key;
647 }
648 
JcrGetTlsPolicy(const char * unified_job_name)649 TlsPolicy JcrGetTlsPolicy(const char* unified_job_name)
650 {
651   if (!unified_job_name) { return kBnetTlsUnknown; }
652 
653   TlsPolicy policy = kBnetTlsUnknown;
654   JobControlRecord* jcr;
655 
656   foreach_jcr (jcr) {
657     if (bstrcmp(jcr->Job, unified_job_name)) {
658       policy = jcr->sd_tls_policy;
659       Dmsg4(debuglevel, "Inc get_jcr jid=%u UseCount=%d Job=%s TlsPolicy=%d\n",
660             jcr->JobId, jcr->UseCount(), jcr->Job, policy);
661       break;
662     }
663   }
664   endeach_jcr(jcr);
665 
666   return policy;
667 }
668 
UpdateWaitTime(JobControlRecord * jcr,int newJobStatus)669 static void UpdateWaitTime(JobControlRecord* jcr, int newJobStatus)
670 {
671   bool enter_in_waittime;
672   int oldJobStatus = jcr->JobStatus;
673 
674   switch (newJobStatus) {
675     case JS_WaitFD:
676     case JS_WaitSD:
677     case JS_WaitMedia:
678     case JS_WaitMount:
679     case JS_WaitStoreRes:
680     case JS_WaitJobRes:
681     case JS_WaitClientRes:
682     case JS_WaitMaxJobs:
683     case JS_WaitPriority:
684       enter_in_waittime = true;
685       break;
686     default:
687       enter_in_waittime = false; /* not a Wait situation */
688       break;
689   }
690 
691   /*
692    * If we were previously waiting and are not any more
693    * we want to update the wait_time variable, which is
694    * the start of waiting.
695    */
696   switch (oldJobStatus) {
697     case JS_WaitFD:
698     case JS_WaitSD:
699     case JS_WaitMedia:
700     case JS_WaitMount:
701     case JS_WaitStoreRes:
702     case JS_WaitJobRes:
703     case JS_WaitClientRes:
704     case JS_WaitMaxJobs:
705     case JS_WaitPriority:
706       if (!enter_in_waittime) { /* we get out the wait time */
707         jcr->wait_time_sum += (time(nullptr) - jcr->wait_time);
708         jcr->wait_time = 0;
709       }
710       break;
711     default:
712       /*
713        * If wait state is new, we keep current time for watchdog MaxWaitTime
714        */
715       if (enter_in_waittime) { jcr->wait_time = time(nullptr); }
716       break;
717   }
718 }
719 
720 /*
721  * Priority runs from 0 (lowest) to 10 (highest)
722  */
GetStatusPriority(int JobStatus)723 static int GetStatusPriority(int JobStatus)
724 {
725   int priority = 0;
726 
727   switch (JobStatus) {
728     case JS_Incomplete:
729       priority = 10;
730       break;
731     case JS_ErrorTerminated:
732     case JS_FatalError:
733     case JS_Canceled:
734       priority = 9;
735       break;
736     case JS_Error:
737       priority = 8;
738       break;
739     case JS_Differences:
740       priority = 7;
741       break;
742   }
743 
744   return priority;
745 }
746 
747 /*
748  * Send Job status to Director
749  */
sendJobStatus()750 bool JobControlRecord::sendJobStatus()
751 {
752   if (dir_bsock) { return dir_bsock->fsend(Job_status, Job, JobStatus); }
753 
754   return true;
755 }
756 
757 /*
758  * Set and send Job status to Director
759  */
sendJobStatus(int newJobStatus)760 bool JobControlRecord::sendJobStatus(int newJobStatus)
761 {
762   if (!is_JobStatus(newJobStatus)) {
763     setJobStatus(newJobStatus);
764     if (dir_bsock) { return dir_bsock->fsend(Job_status, Job, JobStatus); }
765   }
766 
767   return true;
768 }
769 
setJobStarted()770 void JobControlRecord::setJobStarted()
771 {
772   job_started = true;
773   job_started_time = time(nullptr);
774 }
775 
resetJobStatus(int newJobStatus)776 void JobControlRecord::resetJobStatus(int newJobStatus)
777 {
778   JobStatus = newJobStatus;
779 }
780 
setJobStatus(int newJobStatus)781 void JobControlRecord::setJobStatus(int newJobStatus)
782 {
783   int priority;
784   int old_priority = 0;
785   int oldJobStatus = ' ';
786 
787   if (JobStatus) {
788     oldJobStatus = JobStatus;
789     old_priority = GetStatusPriority(oldJobStatus);
790   }
791   priority = GetStatusPriority(newJobStatus);
792 
793   Dmsg2(800, "setJobStatus(%s, %c)\n", Job, newJobStatus);
794 
795   /*
796    * Update wait_time depending on newJobStatus and oldJobStatus
797    */
798   UpdateWaitTime(this, newJobStatus);
799 
800   /*
801    * For a set of errors, ... keep the current status
802    * so it isn't lost. For all others, set it.
803    */
804   Dmsg2(800, "OnEntry JobStatus=%c newJobstatus=%c\n", oldJobStatus,
805         newJobStatus);
806 
807   /*
808    * If status priority is > than proposed new status, change it.
809    * If status priority == new priority and both are zero, take the new
810    * status. If it is not zero, then we keep the first non-zero "error" that
811    * occurred.
812    */
813   if (priority > old_priority || (priority == 0 && old_priority == 0)) {
814     Dmsg4(800, "Set new stat. old: %c,%d new: %c,%d\n", oldJobStatus,
815           old_priority, newJobStatus, priority);
816     JobStatus = newJobStatus; /* replace with new status */
817   }
818 
819   if (oldJobStatus != JobStatus) {
820     Dmsg2(800, "leave setJobStatus old=%c new=%c\n", oldJobStatus,
821           newJobStatus);
822     //    GeneratePluginEvent(this, bEventStatusChange, nullptr);
823   }
824 }
825 
LockJcrChain()826 void LockJcrChain() { jcr_chain_mutex.lock(); }
827 
UnlockJcrChain()828 void UnlockJcrChain() { jcr_chain_mutex.unlock(); }
829 
830 /*
831  * Start walk of jcr chain
832  * The proper way to walk the jcr chain is:
833  *    JobControlRecord *jcr;
834  *    foreach_jcr(jcr) {
835  *      ...
836  *    }
837  *    endeach_jcr(jcr);
838  *
839  * It is possible to leave out the endeach_jcr(jcr), but
840  * in that case, the last jcr referenced must be explicitly
841  * released with:
842  *
843  * FreeJcr(jcr);
844  */
jcr_walk_start()845 JobControlRecord* jcr_walk_start()
846 {
847   JobControlRecord* jcr;
848   LockJcrChain();
849   jcr = (JobControlRecord*)job_control_record_chain->first();
850   if (jcr) {
851     jcr->IncUseCount();
852     if (jcr->JobId > 0) {
853       Dmsg3(debuglevel, "Inc walk_start jid=%u UseCount=%d Job=%s\n",
854             jcr->JobId, jcr->UseCount(), jcr->Job);
855     }
856   }
857   UnlockJcrChain();
858   return jcr;
859 }
860 
861 /*
862  * Get next jcr from chain, and release current one
863  */
jcr_walk_next(JobControlRecord * prev_jcr)864 JobControlRecord* jcr_walk_next(JobControlRecord* prev_jcr)
865 {
866   JobControlRecord* jcr;
867 
868   LockJcrChain();
869   jcr = (JobControlRecord*)job_control_record_chain->next(prev_jcr);
870   if (jcr) {
871     jcr->IncUseCount();
872     if (jcr->JobId > 0) {
873       Dmsg3(debuglevel, "Inc walk_next jid=%u UseCount=%d Job=%s\n", jcr->JobId,
874             jcr->UseCount(), jcr->Job);
875     }
876   }
877   UnlockJcrChain();
878   if (prev_jcr) { FreeJcr(prev_jcr); }
879   return jcr;
880 }
881 
882 /*
883  * Release last jcr referenced
884  */
JcrWalkEnd(JobControlRecord * jcr)885 void JcrWalkEnd(JobControlRecord* jcr)
886 {
887   if (jcr) {
888     if (jcr->JobId > 0) {
889       Dmsg3(debuglevel, "Free walk_end jid=%u UseCount=%d Job=%s\n", jcr->JobId,
890             jcr->UseCount(), jcr->Job);
891     }
892     FreeJcr(jcr);
893   }
894 }
895 
896 /*
897  * Return number of Jobs
898  */
JobCount()899 int JobCount()
900 {
901   JobControlRecord* jcr;
902   int count = 0;
903 
904   LockJcrChain();
905   for (jcr = (JobControlRecord*)job_control_record_chain->first();
906        (jcr = (JobControlRecord*)job_control_record_chain->next(jcr));) {
907     if (jcr->JobId > 0) { count++; }
908   }
909   UnlockJcrChain();
910   return count;
911 }
912 
913 /*
914  * Setup to call the timeout check routine every 30 seconds
915  * This routine will check any timers that have been enabled.
916  */
InitJcrSubsystem(int timeout)917 bool InitJcrSubsystem(int timeout)
918 {
919   watchdog_t* wd = new_watchdog();
920 
921   watch_dog_timeout = timeout;
922   wd->one_shot = false;
923   wd->interval = 30; /* FIXME: should be configurable somewhere, even
924                       if only with a #define */
925   wd->callback = JcrTimeoutCheck;
926 
927   RegisterWatchdog(wd);
928 
929   return true;
930 }
931 
InitJcrChain()932 void InitJcrChain()
933 {
934   JobControlRecord* jcr = nullptr;
935   if (!job_control_record_chain) {
936     job_control_record_chain = new dlist(jcr, &jcr->link);
937   }
938 }
939 
CleanupJcrChain()940 void CleanupJcrChain()
941 {
942   if (job_control_record_chain) {
943     delete job_control_record_chain;
944     job_control_record_chain = nullptr;
945   }
946 }
947 
JcrTimeoutCheck(watchdog_t *)948 static void JcrTimeoutCheck(watchdog_t* /* self */)
949 {
950   JobControlRecord* jcr;
951   BareosSocket* bs;
952   time_t timer_start;
953 
954   Dmsg0(debuglevel, "Start JobControlRecord timeout checks\n");
955 
956   /* Walk through all JCRs checking if any one is
957    * blocked for more than specified max time.
958    */
959   foreach_jcr (jcr) {
960     Dmsg2(debuglevel, "JcrTimeoutCheck JobId=%u jcr=0x%x\n", jcr->JobId, jcr);
961     if (jcr->JobId == 0) { continue; }
962     bs = jcr->store_bsock;
963     if (bs) {
964       timer_start = bs->timer_start;
965       if (timer_start && (watchdog_time - timer_start) > watch_dog_timeout) {
966         bs->timer_start = 0; /* turn off timer */
967         bs->SetTimedOut();
968         Qmsg(jcr, M_ERROR, 0,
969              _("Watchdog sending kill after %d secs to thread stalled reading "
970                "Storage daemon.\n"),
971              watchdog_time - timer_start);
972         jcr->MyThreadSendSignal(TIMEOUT_SIGNAL);
973       }
974     }
975     bs = jcr->file_bsock;
976     if (bs) {
977       timer_start = bs->timer_start;
978       if (timer_start && (watchdog_time - timer_start) > watch_dog_timeout) {
979         bs->timer_start = 0; /* turn off timer */
980         bs->SetTimedOut();
981         Qmsg(jcr, M_ERROR, 0,
982              _("Watchdog sending kill after %d secs to thread stalled reading "
983                "File daemon.\n"),
984              watchdog_time - timer_start);
985         jcr->MyThreadSendSignal(TIMEOUT_SIGNAL);
986       }
987     }
988     bs = jcr->dir_bsock;
989     if (bs) {
990       timer_start = bs->timer_start;
991       if (timer_start && (watchdog_time - timer_start) > watch_dog_timeout) {
992         bs->timer_start = 0; /* turn off timer */
993         bs->SetTimedOut();
994         Qmsg(jcr, M_ERROR, 0,
995              _("Watchdog sending kill after %d secs to thread stalled reading "
996                "Director.\n"),
997              watchdog_time - timer_start);
998         jcr->MyThreadSendSignal(TIMEOUT_SIGNAL);
999       }
1000     }
1001   }
1002   endeach_jcr(jcr);
1003 
1004   Dmsg0(debuglevel, "Finished JobControlRecord timeout checks\n");
1005 }
1006 
1007 /*
1008  * Return next JobId from comma separated list
1009  *
1010  * Returns:
1011  *   1 if next JobId returned
1012  *   0 if no more JobIds are in list
1013  *  -1 there is an error
1014  */
GetNextJobidFromList(const char ** p,uint32_t * JobId)1015 int GetNextJobidFromList(const char** p, uint32_t* JobId)
1016 {
1017   const int maxlen = 30;
1018   char jobid[maxlen + 1];
1019   const char* q = *p;
1020 
1021   jobid[0] = 0;
1022   for (int i = 0; i < maxlen; i++) {
1023     if (*q == 0) {
1024       break;
1025     } else if (*q == ',') {
1026       q++;
1027       break;
1028     }
1029     jobid[i] = *q++;
1030     jobid[i + 1] = 0;
1031   }
1032   if (jobid[0] == 0) {
1033     return 0;
1034   } else if (!Is_a_number(jobid)) {
1035     return -1; /* error */
1036   }
1037   *p = q;
1038   *JobId = str_to_int64(jobid);
1039   return 1;
1040 }
1041 
1042 /*
1043  * Used to display specific daemon information after a fatal signal
1044  * (like BareosDb in the director)
1045  */
1046 #define MAX_DBG_HOOK 10
1047 static dbg_jcr_hook_t* dbg_jcr_hooks[MAX_DBG_HOOK];
1048 static int dbg_jcr_handler_count;
1049 
DbgJcrAddHook(dbg_jcr_hook_t * hook)1050 void DbgJcrAddHook(dbg_jcr_hook_t* hook)
1051 {
1052   ASSERT(dbg_jcr_handler_count < MAX_DBG_HOOK);
1053   dbg_jcr_hooks[dbg_jcr_handler_count++] = hook;
1054 }
1055 
1056 /*
1057  * !!! WARNING !!!
1058  *
1059  * This function should be used ONLY after a fatal signal. We walk through the
1060  * JobControlRecord chain without doing any lock, BAREOS should not be
1061  * running.
1062  */
DbgPrintJcr(FILE * fp)1063 void DbgPrintJcr(FILE* fp)
1064 {
1065   char ed1[50], buf1[128], buf2[128], buf3[128], buf4[128];
1066   if (!job_control_record_chain) { return; }
1067 
1068   fprintf(fp, "Attempt to dump current JCRs. njcrs=%d\n",
1069           job_control_record_chain->size());
1070 
1071   for (JobControlRecord* jcr
1072        = (JobControlRecord*)job_control_record_chain->first();
1073        jcr; jcr = (JobControlRecord*)job_control_record_chain->next(jcr)) {
1074     fprintf(fp, "threadid=%s JobId=%d JobStatus=%c jcr=%p name=%s\n",
1075             edit_pthread(jcr->my_thread_id, ed1, sizeof(ed1)), (int)jcr->JobId,
1076             jcr->JobStatus, jcr, jcr->Job);
1077     fprintf(fp,
1078             "threadid=%s killable=%d JobId=%d JobStatus=%c jcr=%p name=%s\n",
1079             edit_pthread(jcr->my_thread_id, ed1, sizeof(ed1)),
1080             jcr->IsKillable(), (int)jcr->JobId, jcr->JobStatus, jcr, jcr->Job);
1081     fprintf(fp, "\tUseCount=%i\n", jcr->UseCount());
1082     fprintf(fp, "\tJobType=%c JobLevel=%c\n", jcr->getJobType(),
1083             jcr->getJobLevel());
1084     bstrftime(buf1, sizeof(buf1), jcr->sched_time);
1085     bstrftime(buf2, sizeof(buf2), jcr->start_time);
1086     bstrftime(buf3, sizeof(buf3), jcr->end_time);
1087     bstrftime(buf4, sizeof(buf4), jcr->wait_time);
1088     fprintf(fp, "\tsched_time=%s start_time=%s\n\tend_time=%s wait_time=%s\n",
1089             buf1, buf2, buf3, buf4);
1090     fprintf(fp, "\tdb=%p db_batch=%p batch_started=%i\n", jcr->db,
1091             jcr->db_batch, jcr->batch_started);
1092 
1093     /*
1094      * Call all the jcr debug hooks
1095      */
1096     for (int i = 0; i < dbg_jcr_handler_count; i++) {
1097       dbg_jcr_hook_t* hook = dbg_jcr_hooks[i];
1098       hook(jcr, fp);
1099     }
1100   }
1101 }
1102