1 /*
2 BAREOS® - Backup Archiving REcovery Open Sourced
3
4 Copyright (C) 2000-2012 Free Software Foundation Europe e.V.
5 Copyright (C) 2011-2012 Planets Communications B.V.
6 Copyright (C) 2013-2013 Bareos GmbH & Co. KG
7
8 This program is Free Software; you can redistribute it and/or
9 modify it under the terms of version three of the GNU Affero General Public
10 License as published by the Free Software Foundation and included
11 in the file LICENSE.
12
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Affero General Public License for more details.
17
18 You should have received a copy of the GNU Affero General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA.
22 */
23 /*
24 * Manipulation routines for Job Control Records and
25 * handling of last_jobs_list.
26 *
27 * Kern E. Sibbald, December 2000
28 *
29 * These routines are thread safe.
30 *
31 * The job list routines were re-written in May 2005 to
32 * eliminate the global lock while traversing the list, and
33 * to use the dlist subroutines. The locking is now done
34 * on the list each time the list is modified or traversed.
35 * That is it is "micro-locked" rather than globally locked.
36 * The result is that there is one lock/unlock for each entry
37 * in the list while traversing it rather than a single lock
38 * at the beginning of a traversal and one at the end. This
39 * incurs slightly more overhead, but effectively eliminates
 * the possibility of race conditions. In addition, with the
41 * exception of the global locking of the list during the
42 * re-reading of the config file, no recursion is needed.
43 *
44 */
45
46 #include "include/bareos.h"
47 #include "include/jcr.h"
48 #include "lib/berrno.h"
49 #include "lib/bsignal.h"
50 #include "lib/breg.h"
51 #include "lib/edit.h"
52 #include "lib/thread_specific_data.h"
53 #include "lib/tls_conf.h"
54 #include "lib/bsock.h"
55 #include "lib/recent_job_results_list.h"
56 #include "lib/message_queue_item.h"
57 #include "lib/volume_session_info.h"
58 #include "lib/watchdog.h"
59
60 #include <algorithm>
61
/* Debug level passed to most Dmsg() trace calls in this file. */
const int debuglevel = 3400;

/* Watchdog callback that scans all JCRs for stalled sockets (defined below). */
static void JcrTimeoutCheck(watchdog_t* self);

/* Incremented by JcrCleanup() for finished jobs of selected types with
 * JobId > 0. */
int num_jobs_run;

/* Weak references to the shared_ptr managed JCRs registered via InitJcr(). */
static std::vector<std::weak_ptr<JobControlRecord>> job_control_record_cache;
/* Chain of JCRs created with new_jcr(); allocated by InitJcrChain(). */
static dlist* job_control_record_chain = nullptr;
/* Timeout set by InitJcrSubsystem() and compared against socket timers in
 * JcrTimeoutCheck(). */
static int watch_dog_timeout = 0;

/* Locked/unlocked through LockJcrChain()/UnlockJcrChain(). */
static std::mutex jcr_chain_mutex;
/* Locked/unlocked through LockJobs()/UnlockJobs(). */
static pthread_mutex_t job_start_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Format of the status message sent to the Director by sendJobStatus(). */
static char Job_status[] = "Status Job=%s JobStatus=%d\n";
76
LockJobs()77 void LockJobs() { P(job_start_mutex); }
78
UnlockJobs()79 void UnlockJobs() { V(job_start_mutex); }
80
81 /*
82 * Get an ASCII representation of the Operation being performed as an english
83 * Noun
84 */
get_OperationName()85 const char* JobControlRecord::get_OperationName()
86 {
87 switch (JobType_) {
88 case JT_BACKUP:
89 return _("Backup");
90 case JT_VERIFY:
91 return _("Verifying");
92 case JT_RESTORE:
93 return _("Restoring");
94 case JT_ARCHIVE:
95 return _("Archiving");
96 case JT_COPY:
97 return _("Copying");
98 case JT_MIGRATE:
99 return _("Migration");
100 case JT_SCAN:
101 return _("Scanning");
102 case JT_CONSOLIDATE:
103 return _("Consolidating");
104 default:
105 return _("Unknown operation");
106 }
107 }
108
109 /*
110 * Get an ASCII representation of the Action being performed either an english
111 * Verb or Adjective
112 */
get_ActionName(bool past)113 const char* JobControlRecord::get_ActionName(bool past)
114 {
115 switch (JobType_) {
116 case JT_BACKUP:
117 return _("backup");
118 case JT_VERIFY:
119 return (past) ? _("verified") : _("verify");
120 case JT_RESTORE:
121 return (past) ? _("restored") : _("restore");
122 case JT_ARCHIVE:
123 return (past) ? _("archived") : _("archive");
124 case JT_COPY:
125 return (past) ? _("copied") : _("copy");
126 case JT_MIGRATE:
127 return (past) ? _("migrated") : _("migrate");
128 case JT_SCAN:
129 return (past) ? _("scanned") : _("scan");
130 case JT_CONSOLIDATE:
131 return (past) ? _("consolidated") : _("consolidate");
132 default:
133 return _("unknown action");
134 }
135 }
136
JobReads()137 bool JobControlRecord::JobReads()
138 {
139 switch (JobType_) {
140 case JT_VERIFY:
141 case JT_RESTORE:
142 case JT_COPY:
143 case JT_MIGRATE:
144 return true;
145 case JT_BACKUP:
146 if (JobLevel_ == L_VIRTUAL_FULL) { return true; }
147 break;
148 default:
149 break;
150 }
151 return false;
152 }
153
154 struct job_callback_item {
155 void (*JobEndCb)(JobControlRecord* jcr, void*);
156 void* ctx{};
157 };
158
159 /*
160 * Push a job_callback_item onto the job end callback stack.
161 */
RegisterJobEndCallback(JobControlRecord * jcr,void JobEndCb (JobControlRecord * jcr,void *),void * ctx)162 void RegisterJobEndCallback(JobControlRecord* jcr,
163 void JobEndCb(JobControlRecord* jcr, void*),
164 void* ctx)
165 {
166 job_callback_item* item;
167
168 item = (job_callback_item*)malloc(sizeof(job_callback_item));
169
170 item->JobEndCb = JobEndCb;
171 item->ctx = ctx;
172
173 jcr->job_end_callbacks.push((void*)item);
174 }
175
176 /*
177 * Pop each job_callback_item and process it.
178 */
CallJobEndCallbacks(JobControlRecord * jcr)179 static void CallJobEndCallbacks(JobControlRecord* jcr)
180 {
181 job_callback_item* item;
182
183 if (jcr->job_end_callbacks.size() > 0) {
184 item = (job_callback_item*)jcr->job_end_callbacks.pop();
185 while (item) {
186 item->JobEndCb(jcr, item->ctx);
187 free(item);
188 item = (job_callback_item*)jcr->job_end_callbacks.pop();
189 }
190 }
191 }
192
JobControlRecord()193 JobControlRecord::JobControlRecord()
194 {
195 Dmsg0(100, "Construct JobControlRecord\n");
196
197 MessageQueueItem* item = nullptr;
198 msg_queue = new dlist(item, &item->link_); // calculate offset
199
200 int status;
201 if ((status = pthread_mutex_init(&msg_queue_mutex, nullptr)) != 0) {
202 BErrNo be;
203 Jmsg(nullptr, M_ABORT, 0, _("Could not init msg_queue mutex. ERR=%s\n"),
204 be.bstrerror(status));
205 }
206
207 my_thread_id = pthread_self();
208 job_end_callbacks.init(1, false);
209 sched_time = time(nullptr);
210 initial_sched_time = sched_time;
211 InitMutex();
212 IncUseCount();
213 VolumeName = GetPoolMemory(PM_FNAME);
214 VolumeName[0] = 0;
215 errmsg = GetPoolMemory(PM_MESSAGE);
216 errmsg[0] = 0;
217 comment = GetPoolMemory(PM_FNAME);
218 comment[0] = 0;
219
220 /*
221 * Setup some dummy values
222 */
223 bstrncpy(Job, "*System*", sizeof(Job));
224 JobId = 0;
225 setJobType(JT_SYSTEM); /* internal job until defined */
226 setJobLevel(L_NONE);
227 setJobStatus(JS_Created); /* ready to run */
228 SetTimeoutHandler();
229 }
230
new_jcr(JCR_free_HANDLER * daemon_free_jcr)231 JobControlRecord* new_jcr(JCR_free_HANDLER* daemon_free_jcr)
232 {
233 Dmsg0(debuglevel, "Enter new_jcr\n");
234
235 JobControlRecord* jcr =
236 static_cast<JobControlRecord*>(malloc(sizeof(JobControlRecord)));
237 jcr = new (jcr) JobControlRecord();
238
239 jcr->daemon_free_jcr = daemon_free_jcr;
240
241 LockJobs();
242 LockJcrChain();
243 InitJcrChain();
244 job_control_record_chain->append(jcr);
245 UnlockJcrChain();
246 UnlockJobs();
247 return jcr;
248 }
249
InitJcr(std::shared_ptr<JobControlRecord> jcr,JCR_free_HANDLER * daemon_free_jcr)250 void InitJcr(std::shared_ptr<JobControlRecord> jcr,
251 JCR_free_HANDLER* daemon_free_jcr)
252 {
253 jcr->daemon_free_jcr = daemon_free_jcr;
254
255 LockJobs();
256 LockJcrChain();
257 job_control_record_cache.emplace_back(jcr);
258 UnlockJcrChain();
259 UnlockJobs();
260 }
261
262 /*
263 * Remove a JobControlRecord from the chain
264 *
265 * NOTE! The chain must be locked prior to calling this routine.
266 */
RemoveJcr(JobControlRecord * jcr)267 static void RemoveJcr(JobControlRecord* jcr)
268 {
269 Dmsg0(debuglevel, "Enter RemoveJcr\n");
270 if (!jcr) { Emsg0(M_ABORT, 0, _("nullptr jcr.\n")); }
271 job_control_record_chain->remove(jcr);
272 Dmsg0(debuglevel, "Leave RemoveJcr\n");
273 }
274
FreeCommonJcr(JobControlRecord * jcr,bool is_destructor_call=false)275 static void FreeCommonJcr(JobControlRecord* jcr,
276 bool is_destructor_call = false)
277 {
278 Dmsg1(100, "FreeCommonJcr: %p \n", jcr);
279
280 if (!jcr) { Dmsg0(100, "FreeCommonJcr: Invalid jcr\n"); }
281
282 RemoveJcrFromThreadSpecificData(jcr);
283 jcr->SetKillable(false);
284
285 jcr->DestroyMutex();
286
287 if (jcr->msg_queue) {
288 delete jcr->msg_queue;
289 jcr->msg_queue = nullptr;
290 pthread_mutex_destroy(&jcr->msg_queue_mutex);
291 }
292
293 if (jcr->client_name) {
294 FreePoolMemory(jcr->client_name);
295 jcr->client_name = nullptr;
296 }
297
298 if (jcr->attr) {
299 FreePoolMemory(jcr->attr);
300 jcr->attr = nullptr;
301 }
302
303 if (jcr->sd_auth_key) {
304 free(jcr->sd_auth_key);
305 jcr->sd_auth_key = nullptr;
306 }
307
308 if (jcr->VolumeName) {
309 FreePoolMemory(jcr->VolumeName);
310 jcr->VolumeName = nullptr;
311 }
312
313 if (jcr->dir_bsock) {
314 jcr->dir_bsock->close();
315 delete jcr->dir_bsock;
316 jcr->dir_bsock = nullptr;
317 }
318
319 if (jcr->errmsg) {
320 FreePoolMemory(jcr->errmsg);
321 jcr->errmsg = nullptr;
322 }
323
324 if (jcr->where) {
325 free(jcr->where);
326 jcr->where = nullptr;
327 }
328
329 if (jcr->RegexWhere) {
330 free(jcr->RegexWhere);
331 jcr->RegexWhere = nullptr;
332 }
333
334 if (jcr->where_bregexp) {
335 FreeBregexps(jcr->where_bregexp);
336 delete jcr->where_bregexp;
337 jcr->where_bregexp = nullptr;
338 }
339
340 if (jcr->cached_path) {
341 FreePoolMemory(jcr->cached_path);
342 jcr->cached_path = nullptr;
343 jcr->cached_pnl = 0;
344 }
345
346 if (jcr->id_list) {
347 FreeGuidList(jcr->id_list);
348 jcr->id_list = nullptr;
349 }
350
351 if (jcr->comment) {
352 FreePoolMemory(jcr->comment);
353 jcr->comment = nullptr;
354 }
355
356 if (!is_destructor_call) { free(jcr); }
357 }
358
JcrCleanup(JobControlRecord * jcr,bool is_destructor_call=false)359 static void JcrCleanup(JobControlRecord* jcr, bool is_destructor_call = false)
360 {
361 DequeueMessages(jcr);
362 CallJobEndCallbacks(jcr);
363
364 Dmsg1(debuglevel, "End job=%d\n", jcr->JobId);
365
366 switch (jcr->getJobType()) {
367 case JT_BACKUP:
368 case JT_VERIFY:
369 case JT_RESTORE:
370 case JT_MIGRATE:
371 case JT_COPY:
372 case JT_ADMIN:
373 if (jcr->JobId > 0) { // except Console Jobs
374 num_jobs_run++;
375 RecentJobResultsList::Append(jcr);
376 }
377 break;
378 default:
379 break;
380 }
381
382 CloseMsg(jcr);
383
384 if (jcr->daemon_free_jcr) { jcr->daemon_free_jcr(jcr); }
385
386 FreeCommonJcr(jcr, is_destructor_call);
387 CloseMsg(nullptr); // flush any daemon messages
388 }
389
~JobControlRecord()390 JobControlRecord::~JobControlRecord()
391 {
392 Dmsg0(100, "Destruct JobControlRecord\n");
393 JcrCleanup(this, true);
394 Dmsg0(debuglevel, "JobControlRecord Destructor finished\n");
395 }
396
RunJcrGarbageCollector(JobControlRecord * jcr)397 static bool RunJcrGarbageCollector(JobControlRecord* jcr)
398 {
399 LockJcrChain();
400 jcr->DecUseCount(); /* decrement use count */
401 if (jcr->UseCount() < 0) {
402 Jmsg2(jcr, M_ERROR, 0, _("JobControlRecord UseCount=%d JobId=%d\n"),
403 jcr->UseCount(), jcr->JobId);
404 }
405 if (jcr->JobId > 0) {
406 Dmsg3(debuglevel, "Dec FreeJcr jid=%u UseCount=%d Job=%s\n", jcr->JobId,
407 jcr->UseCount(), jcr->Job);
408 }
409 if (jcr->UseCount() > 0) { /* if in use */
410 UnlockJcrChain();
411 return false;
412 }
413 if (jcr->JobId > 0) {
414 Dmsg3(debuglevel, "remove jcr jid=%u UseCount=%d Job=%s\n", jcr->JobId,
415 jcr->UseCount(), jcr->Job);
416 }
417 RemoveJcr(jcr); /* remove Jcr from chain */
418 UnlockJcrChain();
419 return true;
420 }
421
422 /*
423 * Global routine to free a jcr
424 */
b_free_jcr(const char * file,int line,JobControlRecord * jcr)425 void b_free_jcr(const char* file, int line, JobControlRecord* jcr)
426 {
427 Dmsg3(debuglevel, "Enter FreeJcr jid=%u from %s:%d\n", jcr->JobId, file,
428 line);
429
430 if (RunJcrGarbageCollector(jcr)) { JcrCleanup(jcr); }
431
432 Dmsg0(debuglevel, "Exit FreeJcr\n");
433 }
434
SetKillable(bool killable)435 void JobControlRecord::SetKillable(bool killable)
436 {
437 lock();
438
439 my_thread_killable = killable;
440 if (killable) {
441 my_thread_id = pthread_self();
442 } else {
443 memset(&my_thread_id, 0, sizeof(my_thread_id));
444 }
445
446 unlock();
447 }
448
MyThreadSendSignal(int sig)449 void JobControlRecord::MyThreadSendSignal(int sig)
450 {
451 lock();
452
453 if (IsKillable() && !pthread_equal(my_thread_id, pthread_self())) {
454 Dmsg1(800, "Send kill to jid=%d\n", JobId);
455 pthread_kill(my_thread_id, sig);
456 } else if (!IsKillable()) {
457 Dmsg1(10, "Warning, can't send kill to jid=%d\n", JobId);
458 }
459
460 unlock();
461 }
462
463
464 /*
465 * Given a JobId, find the JobControlRecord
466 *
467 * Returns: jcr on success
468 * nullptr on failure
469 */
get_jcr_by_id(uint32_t JobId)470 JobControlRecord* get_jcr_by_id(uint32_t JobId)
471 {
472 JobControlRecord* jcr;
473
474 foreach_jcr (jcr) {
475 if (jcr->JobId == JobId) {
476 jcr->IncUseCount();
477 Dmsg3(debuglevel, "Inc get_jcr jid=%u UseCount=%d Job=%s\n", jcr->JobId,
478 jcr->UseCount(), jcr->Job);
479 break;
480 }
481 }
482 endeach_jcr(jcr);
483
484 return jcr;
485 }
486
GetJcrCount()487 std::size_t GetJcrCount()
488 {
489 LockJcrChain();
490 std::size_t count =
491 count_if(job_control_record_cache.begin(), job_control_record_cache.end(),
492 [](std::weak_ptr<JobControlRecord>& p) { return !p.expired(); });
493 UnlockJcrChain();
494
495 return count;
496 }
497
GetJcr(std::function<bool (const JobControlRecord *)> compare)498 static std::shared_ptr<JobControlRecord> GetJcr(
499 std::function<bool(const JobControlRecord*)> compare)
500 {
501 std::shared_ptr<JobControlRecord> result;
502
503 LockJcrChain();
504
505 // cleanup chache
506 job_control_record_cache.erase(
507 std::remove_if(
508 job_control_record_cache.begin(), job_control_record_cache.end(),
509 [](std::weak_ptr<JobControlRecord>& p) { return p.expired(); }),
510 job_control_record_cache.end());
511
512 find_if(job_control_record_cache.begin(), job_control_record_cache.end(),
513 [&compare, &result](std::weak_ptr<JobControlRecord>& p) {
514 auto jcr = p.lock();
515 if (compare(jcr.get())) {
516 result = jcr;
517 return true;
518 }
519 return false;
520 });
521
522 UnlockJcrChain();
523
524 return result;
525 }
526
GetJcrById(uint32_t JobId)527 std::shared_ptr<JobControlRecord> GetJcrById(uint32_t JobId)
528 {
529 return GetJcr(
530 [JobId](const JobControlRecord* jcr) { return jcr->JobId == JobId; });
531 }
532
GetJcrByFullName(std::string name)533 std::shared_ptr<JobControlRecord> GetJcrByFullName(std::string name)
534 {
535 return GetJcr([&name](const JobControlRecord* jcr) {
536 return std::string{jcr->Job} == name;
537 });
538 }
539
GetJcrByPartialName(std::string name)540 std::shared_ptr<JobControlRecord> GetJcrByPartialName(std::string name)
541 {
542 return GetJcr([&name](const JobControlRecord* jcr) {
543 return std::string{jcr->Job}.find(name) == 0;
544 });
545 }
546
GetJcrBySession(const VolumeSessionInfo & vsi)547 std::shared_ptr<JobControlRecord> GetJcrBySession(const VolumeSessionInfo& vsi)
548 {
549 return GetJcr([&vsi](const JobControlRecord* jcr) {
550 return (VolumeSessionInfo{jcr->VolSessionId, jcr->VolSessionTime} == vsi);
551 });
552 }
553
554 /*
555 * Given a SessionId and SessionTime, find the JobControlRecord
556 *
557 * Returns: jcr on success
558 * nullptr on failure
559 */
get_jcr_by_session(uint32_t SessionId,uint32_t SessionTime)560 JobControlRecord* get_jcr_by_session(uint32_t SessionId, uint32_t SessionTime)
561 {
562 JobControlRecord* jcr;
563
564 foreach_jcr (jcr) {
565 if (jcr->VolSessionId == SessionId && jcr->VolSessionTime == SessionTime) {
566 jcr->IncUseCount();
567 Dmsg3(debuglevel, "Inc get_jcr jid=%u UseCount=%d Job=%s\n", jcr->JobId,
568 jcr->UseCount(), jcr->Job);
569 break;
570 }
571 }
572 endeach_jcr(jcr);
573
574 return jcr;
575 }
576
577 /*
578 * Given a Job, find the JobControlRecord compares on the number of
579 * characters in Job thus allowing partial matches.
580 *
581 * Returns: jcr on success
582 * nullptr on failure
583 */
get_jcr_by_partial_name(char * Job)584 JobControlRecord* get_jcr_by_partial_name(char* Job)
585 {
586 JobControlRecord* jcr;
587 int len;
588
589 if (!Job) { return nullptr; }
590
591 len = strlen(Job);
592 foreach_jcr (jcr) {
593 if (bstrncmp(Job, jcr->Job, len)) {
594 jcr->IncUseCount();
595 Dmsg3(debuglevel, "Inc get_jcr jid=%u UseCount=%d Job=%s\n", jcr->JobId,
596 jcr->UseCount(), jcr->Job);
597 break;
598 }
599 }
600 endeach_jcr(jcr);
601
602 return jcr;
603 }
604
605 /*
606 * Given a Job, find the JobControlRecord requires an exact match of names.
607 *
608 * Returns: jcr on success
609 * nullptr on failure
610 */
get_jcr_by_full_name(char * Job)611 JobControlRecord* get_jcr_by_full_name(char* Job)
612 {
613 JobControlRecord* jcr;
614
615 if (!Job) { return nullptr; }
616
617 foreach_jcr (jcr) {
618 if (bstrcmp(jcr->Job, Job)) {
619 jcr->IncUseCount();
620 Dmsg3(debuglevel, "Inc get_jcr jid=%u UseCount=%d Job=%s\n", jcr->JobId,
621 jcr->UseCount(), jcr->Job);
622 break;
623 }
624 }
625 endeach_jcr(jcr);
626
627 return jcr;
628 }
629
JcrGetAuthenticateKey(const char * unified_job_name)630 const char* JcrGetAuthenticateKey(const char* unified_job_name)
631 {
632 if (!unified_job_name) { return nullptr; }
633
634 JobControlRecord* jcr;
635 const char* auth_key = nullptr;
636 foreach_jcr (jcr) {
637 if (bstrcmp(jcr->Job, unified_job_name)) {
638 auth_key = jcr->sd_auth_key;
639 Dmsg3(debuglevel, "Inc get_jcr jid=%u UseCount=%d Job=%s\n", jcr->JobId,
640 jcr->UseCount(), jcr->Job);
641 break;
642 }
643 }
644 endeach_jcr(jcr);
645
646 return auth_key;
647 }
648
JcrGetTlsPolicy(const char * unified_job_name)649 TlsPolicy JcrGetTlsPolicy(const char* unified_job_name)
650 {
651 if (!unified_job_name) { return kBnetTlsUnknown; }
652
653 TlsPolicy policy = kBnetTlsUnknown;
654 JobControlRecord* jcr;
655
656 foreach_jcr (jcr) {
657 if (bstrcmp(jcr->Job, unified_job_name)) {
658 policy = jcr->sd_tls_policy;
659 Dmsg4(debuglevel, "Inc get_jcr jid=%u UseCount=%d Job=%s TlsPolicy=%d\n",
660 jcr->JobId, jcr->UseCount(), jcr->Job, policy);
661 break;
662 }
663 }
664 endeach_jcr(jcr);
665
666 return policy;
667 }
668
UpdateWaitTime(JobControlRecord * jcr,int newJobStatus)669 static void UpdateWaitTime(JobControlRecord* jcr, int newJobStatus)
670 {
671 bool enter_in_waittime;
672 int oldJobStatus = jcr->JobStatus;
673
674 switch (newJobStatus) {
675 case JS_WaitFD:
676 case JS_WaitSD:
677 case JS_WaitMedia:
678 case JS_WaitMount:
679 case JS_WaitStoreRes:
680 case JS_WaitJobRes:
681 case JS_WaitClientRes:
682 case JS_WaitMaxJobs:
683 case JS_WaitPriority:
684 enter_in_waittime = true;
685 break;
686 default:
687 enter_in_waittime = false; /* not a Wait situation */
688 break;
689 }
690
691 /*
692 * If we were previously waiting and are not any more
693 * we want to update the wait_time variable, which is
694 * the start of waiting.
695 */
696 switch (oldJobStatus) {
697 case JS_WaitFD:
698 case JS_WaitSD:
699 case JS_WaitMedia:
700 case JS_WaitMount:
701 case JS_WaitStoreRes:
702 case JS_WaitJobRes:
703 case JS_WaitClientRes:
704 case JS_WaitMaxJobs:
705 case JS_WaitPriority:
706 if (!enter_in_waittime) { /* we get out the wait time */
707 jcr->wait_time_sum += (time(nullptr) - jcr->wait_time);
708 jcr->wait_time = 0;
709 }
710 break;
711 default:
712 /*
713 * If wait state is new, we keep current time for watchdog MaxWaitTime
714 */
715 if (enter_in_waittime) { jcr->wait_time = time(nullptr); }
716 break;
717 }
718 }
719
720 /*
721 * Priority runs from 0 (lowest) to 10 (highest)
722 */
GetStatusPriority(int JobStatus)723 static int GetStatusPriority(int JobStatus)
724 {
725 int priority = 0;
726
727 switch (JobStatus) {
728 case JS_Incomplete:
729 priority = 10;
730 break;
731 case JS_ErrorTerminated:
732 case JS_FatalError:
733 case JS_Canceled:
734 priority = 9;
735 break;
736 case JS_Error:
737 priority = 8;
738 break;
739 case JS_Differences:
740 priority = 7;
741 break;
742 }
743
744 return priority;
745 }
746
747 /*
748 * Send Job status to Director
749 */
sendJobStatus()750 bool JobControlRecord::sendJobStatus()
751 {
752 if (dir_bsock) { return dir_bsock->fsend(Job_status, Job, JobStatus); }
753
754 return true;
755 }
756
757 /*
758 * Set and send Job status to Director
759 */
sendJobStatus(int newJobStatus)760 bool JobControlRecord::sendJobStatus(int newJobStatus)
761 {
762 if (!is_JobStatus(newJobStatus)) {
763 setJobStatus(newJobStatus);
764 if (dir_bsock) { return dir_bsock->fsend(Job_status, Job, JobStatus); }
765 }
766
767 return true;
768 }
769
/*
 * Flag the job as started and record the current time as its start time.
 */
void JobControlRecord::setJobStarted()
{
  job_started = true;
  job_started_time = time(nullptr);
}
775
/*
 * Overwrite the Job status unconditionally. Unlike setJobStatus() this
 * neither applies the status priorities nor updates the wait times.
 */
void JobControlRecord::resetJobStatus(int newJobStatus)
{
  JobStatus = newJobStatus;
}
780
setJobStatus(int newJobStatus)781 void JobControlRecord::setJobStatus(int newJobStatus)
782 {
783 int priority;
784 int old_priority = 0;
785 int oldJobStatus = ' ';
786
787 if (JobStatus) {
788 oldJobStatus = JobStatus;
789 old_priority = GetStatusPriority(oldJobStatus);
790 }
791 priority = GetStatusPriority(newJobStatus);
792
793 Dmsg2(800, "setJobStatus(%s, %c)\n", Job, newJobStatus);
794
795 /*
796 * Update wait_time depending on newJobStatus and oldJobStatus
797 */
798 UpdateWaitTime(this, newJobStatus);
799
800 /*
801 * For a set of errors, ... keep the current status
802 * so it isn't lost. For all others, set it.
803 */
804 Dmsg2(800, "OnEntry JobStatus=%c newJobstatus=%c\n", oldJobStatus,
805 newJobStatus);
806
807 /*
808 * If status priority is > than proposed new status, change it.
809 * If status priority == new priority and both are zero, take the new
810 * status. If it is not zero, then we keep the first non-zero "error" that
811 * occurred.
812 */
813 if (priority > old_priority || (priority == 0 && old_priority == 0)) {
814 Dmsg4(800, "Set new stat. old: %c,%d new: %c,%d\n", oldJobStatus,
815 old_priority, newJobStatus, priority);
816 JobStatus = newJobStatus; /* replace with new status */
817 }
818
819 if (oldJobStatus != JobStatus) {
820 Dmsg2(800, "leave setJobStatus old=%c new=%c\n", oldJobStatus,
821 newJobStatus);
822 // GeneratePluginEvent(this, bEventStatusChange, nullptr);
823 }
824 }
825
LockJcrChain()826 void LockJcrChain() { jcr_chain_mutex.lock(); }
827
UnlockJcrChain()828 void UnlockJcrChain() { jcr_chain_mutex.unlock(); }
829
830 /*
831 * Start walk of jcr chain
832 * The proper way to walk the jcr chain is:
833 * JobControlRecord *jcr;
834 * foreach_jcr(jcr) {
835 * ...
836 * }
837 * endeach_jcr(jcr);
838 *
839 * It is possible to leave out the endeach_jcr(jcr), but
840 * in that case, the last jcr referenced must be explicitly
841 * released with:
842 *
843 * FreeJcr(jcr);
844 */
jcr_walk_start()845 JobControlRecord* jcr_walk_start()
846 {
847 JobControlRecord* jcr;
848 LockJcrChain();
849 jcr = (JobControlRecord*)job_control_record_chain->first();
850 if (jcr) {
851 jcr->IncUseCount();
852 if (jcr->JobId > 0) {
853 Dmsg3(debuglevel, "Inc walk_start jid=%u UseCount=%d Job=%s\n",
854 jcr->JobId, jcr->UseCount(), jcr->Job);
855 }
856 }
857 UnlockJcrChain();
858 return jcr;
859 }
860
861 /*
862 * Get next jcr from chain, and release current one
863 */
jcr_walk_next(JobControlRecord * prev_jcr)864 JobControlRecord* jcr_walk_next(JobControlRecord* prev_jcr)
865 {
866 JobControlRecord* jcr;
867
868 LockJcrChain();
869 jcr = (JobControlRecord*)job_control_record_chain->next(prev_jcr);
870 if (jcr) {
871 jcr->IncUseCount();
872 if (jcr->JobId > 0) {
873 Dmsg3(debuglevel, "Inc walk_next jid=%u UseCount=%d Job=%s\n", jcr->JobId,
874 jcr->UseCount(), jcr->Job);
875 }
876 }
877 UnlockJcrChain();
878 if (prev_jcr) { FreeJcr(prev_jcr); }
879 return jcr;
880 }
881
882 /*
883 * Release last jcr referenced
884 */
JcrWalkEnd(JobControlRecord * jcr)885 void JcrWalkEnd(JobControlRecord* jcr)
886 {
887 if (jcr) {
888 if (jcr->JobId > 0) {
889 Dmsg3(debuglevel, "Free walk_end jid=%u UseCount=%d Job=%s\n", jcr->JobId,
890 jcr->UseCount(), jcr->Job);
891 }
892 FreeJcr(jcr);
893 }
894 }
895
896 /*
897 * Return number of Jobs
898 */
JobCount()899 int JobCount()
900 {
901 JobControlRecord* jcr;
902 int count = 0;
903
904 LockJcrChain();
905 for (jcr = (JobControlRecord*)job_control_record_chain->first();
906 (jcr = (JobControlRecord*)job_control_record_chain->next(jcr));) {
907 if (jcr->JobId > 0) { count++; }
908 }
909 UnlockJcrChain();
910 return count;
911 }
912
913 /*
914 * Setup to call the timeout check routine every 30 seconds
915 * This routine will check any timers that have been enabled.
916 */
InitJcrSubsystem(int timeout)917 bool InitJcrSubsystem(int timeout)
918 {
919 watchdog_t* wd = new_watchdog();
920
921 watch_dog_timeout = timeout;
922 wd->one_shot = false;
923 wd->interval = 30; /* FIXME: should be configurable somewhere, even
924 if only with a #define */
925 wd->callback = JcrTimeoutCheck;
926
927 RegisterWatchdog(wd);
928
929 return true;
930 }
931
InitJcrChain()932 void InitJcrChain()
933 {
934 JobControlRecord* jcr = nullptr;
935 if (!job_control_record_chain) {
936 job_control_record_chain = new dlist(jcr, &jcr->link);
937 }
938 }
939
CleanupJcrChain()940 void CleanupJcrChain()
941 {
942 if (job_control_record_chain) {
943 delete job_control_record_chain;
944 job_control_record_chain = nullptr;
945 }
946 }
947
JcrTimeoutCheck(watchdog_t *)948 static void JcrTimeoutCheck(watchdog_t* /* self */)
949 {
950 JobControlRecord* jcr;
951 BareosSocket* bs;
952 time_t timer_start;
953
954 Dmsg0(debuglevel, "Start JobControlRecord timeout checks\n");
955
956 /* Walk through all JCRs checking if any one is
957 * blocked for more than specified max time.
958 */
959 foreach_jcr (jcr) {
960 Dmsg2(debuglevel, "JcrTimeoutCheck JobId=%u jcr=0x%x\n", jcr->JobId, jcr);
961 if (jcr->JobId == 0) { continue; }
962 bs = jcr->store_bsock;
963 if (bs) {
964 timer_start = bs->timer_start;
965 if (timer_start && (watchdog_time - timer_start) > watch_dog_timeout) {
966 bs->timer_start = 0; /* turn off timer */
967 bs->SetTimedOut();
968 Qmsg(jcr, M_ERROR, 0,
969 _("Watchdog sending kill after %d secs to thread stalled reading "
970 "Storage daemon.\n"),
971 watchdog_time - timer_start);
972 jcr->MyThreadSendSignal(TIMEOUT_SIGNAL);
973 }
974 }
975 bs = jcr->file_bsock;
976 if (bs) {
977 timer_start = bs->timer_start;
978 if (timer_start && (watchdog_time - timer_start) > watch_dog_timeout) {
979 bs->timer_start = 0; /* turn off timer */
980 bs->SetTimedOut();
981 Qmsg(jcr, M_ERROR, 0,
982 _("Watchdog sending kill after %d secs to thread stalled reading "
983 "File daemon.\n"),
984 watchdog_time - timer_start);
985 jcr->MyThreadSendSignal(TIMEOUT_SIGNAL);
986 }
987 }
988 bs = jcr->dir_bsock;
989 if (bs) {
990 timer_start = bs->timer_start;
991 if (timer_start && (watchdog_time - timer_start) > watch_dog_timeout) {
992 bs->timer_start = 0; /* turn off timer */
993 bs->SetTimedOut();
994 Qmsg(jcr, M_ERROR, 0,
995 _("Watchdog sending kill after %d secs to thread stalled reading "
996 "Director.\n"),
997 watchdog_time - timer_start);
998 jcr->MyThreadSendSignal(TIMEOUT_SIGNAL);
999 }
1000 }
1001 }
1002 endeach_jcr(jcr);
1003
1004 Dmsg0(debuglevel, "Finished JobControlRecord timeout checks\n");
1005 }
1006
1007 /*
1008 * Return next JobId from comma separated list
1009 *
1010 * Returns:
1011 * 1 if next JobId returned
1012 * 0 if no more JobIds are in list
1013 * -1 there is an error
1014 */
GetNextJobidFromList(char ** p,uint32_t * JobId)1015 int GetNextJobidFromList(char** p, uint32_t* JobId)
1016 {
1017 const int maxlen = 30;
1018 char jobid[maxlen + 1];
1019 char* q = *p;
1020
1021 jobid[0] = 0;
1022 for (int i = 0; i < maxlen; i++) {
1023 if (*q == 0) {
1024 break;
1025 } else if (*q == ',') {
1026 q++;
1027 break;
1028 }
1029 jobid[i] = *q++;
1030 jobid[i + 1] = 0;
1031 }
1032 if (jobid[0] == 0) {
1033 return 0;
1034 } else if (!Is_a_number(jobid)) {
1035 return -1; /* error */
1036 }
1037 *p = q;
1038 *JobId = str_to_int64(jobid);
1039 return 1;
1040 }
1041
1042 /*
1043 * Used to display specific daemon information after a fatal signal
1044 * (like BareosDb in the director)
1045 */
1046 #define MAX_DBG_HOOK 10
1047 static dbg_jcr_hook_t* dbg_jcr_hooks[MAX_DBG_HOOK];
1048 static int dbg_jcr_handler_count;
1049
DbgJcrAddHook(dbg_jcr_hook_t * hook)1050 void DbgJcrAddHook(dbg_jcr_hook_t* hook)
1051 {
1052 ASSERT(dbg_jcr_handler_count < MAX_DBG_HOOK);
1053 dbg_jcr_hooks[dbg_jcr_handler_count++] = hook;
1054 }
1055
1056 /*
1057 * !!! WARNING !!!
1058 *
1059 * This function should be used ONLY after a fatal signal. We walk through the
1060 * JobControlRecord chain without doing any lock, BAREOS should not be
1061 * running.
1062 */
DbgPrintJcr(FILE * fp)1063 void DbgPrintJcr(FILE* fp)
1064 {
1065 char ed1[50], buf1[128], buf2[128], buf3[128], buf4[128];
1066 if (!job_control_record_chain) { return; }
1067
1068 fprintf(fp, "Attempt to dump current JCRs. njcrs=%d\n",
1069 job_control_record_chain->size());
1070
1071 for (JobControlRecord* jcr =
1072 (JobControlRecord*)job_control_record_chain->first();
1073 jcr; jcr = (JobControlRecord*)job_control_record_chain->next(jcr)) {
1074 fprintf(fp, "threadid=%s JobId=%d JobStatus=%c jcr=%p name=%s\n",
1075 edit_pthread(jcr->my_thread_id, ed1, sizeof(ed1)), (int)jcr->JobId,
1076 jcr->JobStatus, jcr, jcr->Job);
1077 fprintf(fp,
1078 "threadid=%s killable=%d JobId=%d JobStatus=%c jcr=%p name=%s\n",
1079 edit_pthread(jcr->my_thread_id, ed1, sizeof(ed1)),
1080 jcr->IsKillable(), (int)jcr->JobId, jcr->JobStatus, jcr, jcr->Job);
1081 fprintf(fp, "\tUseCount=%i\n", jcr->UseCount());
1082 fprintf(fp, "\tJobType=%c JobLevel=%c\n", jcr->getJobType(),
1083 jcr->getJobLevel());
1084 bstrftime(buf1, sizeof(buf1), jcr->sched_time);
1085 bstrftime(buf2, sizeof(buf2), jcr->start_time);
1086 bstrftime(buf3, sizeof(buf3), jcr->end_time);
1087 bstrftime(buf4, sizeof(buf4), jcr->wait_time);
1088 fprintf(fp, "\tsched_time=%s start_time=%s\n\tend_time=%s wait_time=%s\n",
1089 buf1, buf2, buf3, buf4);
1090 fprintf(fp, "\tdb=%p db_batch=%p batch_started=%i\n", jcr->db,
1091 jcr->db_batch, jcr->batch_started);
1092
1093 /*
1094 * Call all the jcr debug hooks
1095 */
1096 for (int i = 0; i < dbg_jcr_handler_count; i++) {
1097 dbg_jcr_hook_t* hook = dbg_jcr_hooks[i];
1098 hook(jcr, fp);
1099 }
1100 }
1101 }
1102