1 /*
2    BAREOS® - Backup Archiving REcovery Open Sourced
3 
4    Copyright (C) 2002-2011 Free Software Foundation Europe e.V.
5    Copyright (C) 2013-2019 Bareos GmbH & Co. KG
6 
7    This program is Free Software; you can redistribute it and/or
8    modify it under the terms of version three of the GNU Affero General Public
9    License as published by the Free Software Foundation and included
10    in the file LICENSE.
11 
12    This program is distributed in the hope that it will be useful, but
13    WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15    Affero General Public License for more details.
16 
17    You should have received a copy of the GNU Affero General Public License
18    along with this program; if not, write to the Free Software
19    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
20    02110-1301, USA.
21 */
22 /*
23  * BAREOS thread watchdog routine. General routine that
24  * allows setting a watchdog timer with a callback that is
25  * called when the timer goes off.
26  *
27  * Kern Sibbald, January MMII
28  */
29 
30 #include "include/bareos.h"
31 #include "include/jcr.h"
32 #include "lib/berrno.h"
33 #include "lib/dlist.h"
34 #include "lib/thread_specific_data.h"
35 #include "lib/watchdog.h"
36 
37 
/* Exported globals */
/*
 * watchdog_time is the wall-clock time (seconds, from time()) as last
 * sampled by the watchdog: it is set in StartWatchdog() and refreshed at
 * the start of every queue walk in watchdog_thread().
 * watchdog_sleep_time is the longest the thread sleeps between two walks.
 */
utime_t watchdog_time = 0;        /* this has granularity of SLEEP_TIME */
utime_t watchdog_sleep_time = 60; /* examine things every 60 seconds */

/* Locals */
/*
 * Mutex/condition pair the watchdog thread sleeps on between walks;
 * ping_watchdog() signals the condition to wake it early.
 */
static pthread_mutex_t timer_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t timer = PTHREAD_COND_INITIALIZER;

/* Forward referenced functions */
extern "C" void* watchdog_thread(void* arg);

static void wd_lock();
static void wd_unlock();

/* Static globals */
/* quit: set by StopWatchdog() to make watchdog_thread() leave its loop */
static bool quit = false;
/* wd_is_init: true once StartWatchdog() has created the lock and queues */
static bool wd_is_init = false;
static brwlock_t lock; /* watchdog lock */

/* Thread id of the watchdog thread, filled in by pthread_create() */
static pthread_t wd_tid;
/* Registered watchdogs that are still scheduled to fire */
static dlist* wd_queue;
/* One-shot watchdogs that have already fired */
static dlist* wd_inactive;
60 
61 /*
62  * Returns: 0 if the current thread is NOT the watchdog
63  *          1 if the current thread is the watchdog
64  */
IsWatchdog()65 bool IsWatchdog()
66 {
67   if (wd_is_init && pthread_equal(pthread_self(), wd_tid)) {
68     return true;
69   } else {
70     return false;
71   }
72 }
73 
74 /*
75  * Start watchdog thread
76  *
77  *  Returns: 0 on success
78  *           errno on failure
79  */
StartWatchdog(void)80 int StartWatchdog(void)
81 {
82   int status;
83   watchdog_t* dummy = NULL;
84   int errstat;
85 
86   if (wd_is_init) { return 0; }
87   Dmsg0(800, "Initialising NicB-hacked watchdog thread\n");
88   watchdog_time = time(NULL);
89 
90   if ((errstat = RwlInit(&lock)) != 0) {
91     BErrNo be;
92     Jmsg1(NULL, M_ABORT, 0, _("Unable to initialize watchdog lock. ERR=%s\n"),
93           be.bstrerror(errstat));
94   }
95   wd_queue = new dlist(dummy, &dummy->link);
96   wd_inactive = new dlist(dummy, &dummy->link);
97   wd_is_init = true;
98 
99   if ((status = pthread_create(&wd_tid, NULL, watchdog_thread, NULL)) != 0) {
100     return status;
101   }
102   return 0;
103 }
104 
/*
 * Wake watchdog timer thread so that it walks the
 *  queue and adjusts its wait time (or exits).
 */
static void ping_watchdog()
{
  /* Signal while holding the mutex the watchdog thread waits with */
  P(timer_mutex);
  pthread_cond_signal(&timer);
  V(timer_mutex);
  /* Short pause after signalling; presumably gives the woken thread a
   * chance to run -- TODO confirm the units of Bmicrosleep()'s arguments */
  Bmicrosleep(0, 100);
}
116 
117 /*
118  * Terminate the watchdog thread
119  *
120  * Returns: 0 on success
121  *          errno on failure
122  */
StopWatchdog(void)123 int StopWatchdog(void)
124 {
125   int status;
126   watchdog_t* p;
127 
128   if (!wd_is_init) { return 0; }
129 
130   quit = true; /* notify watchdog thread to stop */
131   ping_watchdog();
132 
133   status = pthread_join(wd_tid, NULL);
134 
135   while (!wd_queue->empty()) {
136     void* item = wd_queue->first();
137     wd_queue->remove(item);
138     p = (watchdog_t*)item;
139     if (p->destructor != NULL) { p->destructor(p); }
140     free(p);
141   }
142   delete wd_queue;
143   wd_queue = NULL;
144 
145   while (!wd_inactive->empty()) {
146     void* item = wd_inactive->first();
147     wd_inactive->remove(item);
148     p = (watchdog_t*)item;
149     if (p->destructor != NULL) { p->destructor(p); }
150     free(p);
151   }
152   delete wd_inactive;
153   wd_inactive = NULL;
154   RwlDestroy(&lock);
155   wd_is_init = false;
156 
157   return status;
158 }
159 
new_watchdog(void)160 watchdog_t* new_watchdog(void)
161 {
162   watchdog_t* wd = (watchdog_t*)malloc(sizeof(watchdog_t));
163 
164   if (!wd_is_init) { StartWatchdog(); }
165 
166   if (wd == NULL) { return NULL; }
167   wd->one_shot = true;
168   wd->interval = 0;
169   wd->callback = NULL;
170   wd->destructor = NULL;
171   wd->data = NULL;
172 
173   return wd;
174 }
175 
RegisterWatchdog(watchdog_t * wd)176 bool RegisterWatchdog(watchdog_t* wd)
177 {
178   if (!wd_is_init) {
179     Jmsg0(NULL, M_ABORT, 0,
180           _("BUG! RegisterWatchdog called before StartWatchdog\n"));
181   }
182   if (wd->callback == NULL) {
183     Jmsg1(NULL, M_ABORT, 0, _("BUG! Watchdog %p has NULL callback\n"), wd);
184   }
185   if (wd->interval == 0) {
186     Jmsg1(NULL, M_ABORT, 0, _("BUG! Watchdog %p has zero interval\n"), wd);
187   }
188 
189   wd_lock();
190   wd->next_fire = watchdog_time + wd->interval;
191   wd_queue->append(wd);
192   Dmsg3(800, "Registered watchdog %p, interval %d%s\n", wd, wd->interval,
193         wd->one_shot ? " one shot" : "");
194   wd_unlock();
195   ping_watchdog();
196 
197   return false;
198 }
199 
UnregisterWatchdog(watchdog_t * wd)200 bool UnregisterWatchdog(watchdog_t* wd)
201 {
202   watchdog_t* p;
203   bool ok = false;
204 
205   if (!wd_is_init) {
206     Jmsg0(NULL, M_ABORT, 0,
207           _("BUG! unregister_watchdog_unlocked called before StartWatchdog\n"));
208   }
209 
210   wd_lock();
211   foreach_dlist (p, wd_queue) {
212     if (wd == p) {
213       wd_queue->remove(wd);
214       Dmsg1(800, "Unregistered watchdog %p\n", wd);
215       ok = true;
216       goto get_out;
217     }
218   }
219 
220   foreach_dlist (p, wd_inactive) {
221     if (wd == p) {
222       wd_inactive->remove(wd);
223       Dmsg1(800, "Unregistered inactive watchdog %p\n", wd);
224       ok = true;
225       goto get_out;
226     }
227   }
228 
229   Dmsg1(800, "Failed to unregister watchdog %p\n", wd);
230 
231 get_out:
232   wd_unlock();
233   ping_watchdog();
234   return ok;
235 }
236 
237 /*
238  * This is the thread that walks the watchdog queue
239  *  and when a queue item fires, the callback is
240  *  invoked.  If it is a one shot, the queue item
241  *  is moved to the inactive queue.
242  */
watchdog_thread(void * arg)243 extern "C" void* watchdog_thread(void* arg)
244 {
245   struct timespec timeout;
246   struct timeval tv;
247   struct timezone tz;
248   utime_t next_time;
249 
250   SetJcrInThreadSpecificData(nullptr);
251   Dmsg0(800, "NicB-reworked watchdog thread entered\n");
252 
253   while (!quit) {
254     watchdog_t* p;
255 
256     /*
257      *
258      *  NOTE. lock_jcr_chain removed, but the message below
259      *   was left until we are sure there are no deadlocks.
260      *
261      * We lock the jcr chain here because a good number of the
262      *   callback routines lock the jcr chain. We need to lock
263      *   it here *before* the watchdog lock because the SD message
264      *   thread first locks the jcr chain, then when closing the
265      *   job locks the watchdog chain. If the two threads do not
266      *   lock in the same order, we get a deadlock -- each holds
267      *   the other's needed lock.
268      */
269     wd_lock();
270 
271   walk_list:
272     watchdog_time = time(NULL);
273     next_time = watchdog_time + watchdog_sleep_time;
274     foreach_dlist (p, wd_queue) {
275       if (p->next_fire <= watchdog_time) {
276         /* Run the callback */
277         Dmsg2(3400, "Watchdog callback p=0x%p fire=%d\n", p, p->next_fire);
278         p->callback(p);
279 
280         /* Reschedule (or move to inactive list if it's a one-shot timer) */
281         if (p->one_shot) {
282           wd_queue->remove(p);
283           wd_inactive->append(p);
284           goto walk_list;
285         } else {
286           p->next_fire = watchdog_time + p->interval;
287         }
288       }
289       if (p->next_fire <= next_time) { next_time = p->next_fire; }
290     }
291     wd_unlock();
292 
293     /*
294      * Wait sleep time or until someone wakes us
295      */
296     gettimeofday(&tv, &tz);
297     timeout.tv_nsec = tv.tv_usec * 1000;
298     timeout.tv_sec = tv.tv_sec + next_time - time(NULL);
299     while (timeout.tv_nsec >= 1000000000) {
300       timeout.tv_nsec -= 1000000000;
301       timeout.tv_sec++;
302     }
303 
304     Dmsg1(1900, "pthread_cond_timedwait %d\n", timeout.tv_sec - tv.tv_sec);
305     /* Note, this unlocks mutex during the sleep */
306     P(timer_mutex);
307     pthread_cond_timedwait(&timer, &timer_mutex, &timeout);
308     V(timer_mutex);
309   }
310 
311   Dmsg0(800, "NicB-reworked watchdog thread exited\n");
312   return NULL;
313 }
314 
315 /*
316  * Watchdog lock, this can be called multiple times by the same
317  *   thread without blocking, but must be unlocked the number of
318  *   times it was locked.
319  */
wd_lock()320 static void wd_lock()
321 {
322   int errstat;
323   if ((errstat = RwlWritelock(&lock)) != 0) {
324     BErrNo be;
325     Jmsg1(NULL, M_ABORT, 0, _("RwlWritelock failure. ERR=%s\n"),
326           be.bstrerror(errstat));
327   }
328 }
329 
330 /*
331  * Unlock the watchdog. This can be called multiple times by the
332  *   same thread up to the number of times that thread called
333  *   wd_ lock()/
334  */
wd_unlock()335 static void wd_unlock()
336 {
337   int errstat;
338   if ((errstat = RwlWriteunlock(&lock)) != 0) {
339     BErrNo be;
340     Jmsg1(NULL, M_ABORT, 0, _("RwlWriteunlock failure. ERR=%s\n"),
341           be.bstrerror(errstat));
342   }
343 }
344