1 /*
2 BAREOS® - Backup Archiving REcovery Open Sourced
3
4 Copyright (C) 2002-2011 Free Software Foundation Europe e.V.
5 Copyright (C) 2013-2019 Bareos GmbH & Co. KG
6
7 This program is Free Software; you can redistribute it and/or
8 modify it under the terms of version three of the GNU Affero General Public
9 License as published by the Free Software Foundation and included
10 in the file LICENSE.
11
12 This program is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Affero General Public License for more details.
16
17 You should have received a copy of the GNU Affero General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
20 02110-1301, USA.
21 */
22 /*
23 * BAREOS thread watchdog routine. General routine that
24 * allows setting a watchdog timer with a callback that is
25 * called when the timer goes off.
26 *
27 * Kern Sibbald, January MMII
28 */
29
30 #include "include/bareos.h"
31 #include "include/jcr.h"
32 #include "lib/berrno.h"
33 #include "lib/dlist.h"
34 #include "lib/thread_specific_data.h"
35 #include "lib/watchdog.h"
36
37
38 /* Exported globals */
39 utime_t watchdog_time = 0; /* this has granularity of SLEEP_TIME */
40 utime_t watchdog_sleep_time = 60; /* examine things every 60 seconds */
41
42 /* Locals */
43 static pthread_mutex_t timer_mutex = PTHREAD_MUTEX_INITIALIZER;
44 static pthread_cond_t timer = PTHREAD_COND_INITIALIZER;
45
46 /* Forward referenced functions */
47 extern "C" void* watchdog_thread(void* arg);
48
49 static void wd_lock();
50 static void wd_unlock();
51
52 /* Static globals */
53 static bool quit = false;
54 static bool wd_is_init = false;
55 static brwlock_t lock; /* watchdog lock */
56
57 static pthread_t wd_tid;
58 static dlist* wd_queue;
59 static dlist* wd_inactive;
60
61 /*
62 * Returns: 0 if the current thread is NOT the watchdog
63 * 1 if the current thread is the watchdog
64 */
IsWatchdog()65 bool IsWatchdog()
66 {
67 if (wd_is_init && pthread_equal(pthread_self(), wd_tid)) {
68 return true;
69 } else {
70 return false;
71 }
72 }
73
74 /*
75 * Start watchdog thread
76 *
77 * Returns: 0 on success
78 * errno on failure
79 */
StartWatchdog(void)80 int StartWatchdog(void)
81 {
82 int status;
83 watchdog_t* dummy = NULL;
84 int errstat;
85
86 if (wd_is_init) { return 0; }
87 Dmsg0(800, "Initialising NicB-hacked watchdog thread\n");
88 watchdog_time = time(NULL);
89
90 if ((errstat = RwlInit(&lock)) != 0) {
91 BErrNo be;
92 Jmsg1(NULL, M_ABORT, 0, _("Unable to initialize watchdog lock. ERR=%s\n"),
93 be.bstrerror(errstat));
94 }
95 wd_queue = new dlist(dummy, &dummy->link);
96 wd_inactive = new dlist(dummy, &dummy->link);
97 wd_is_init = true;
98
99 if ((status = pthread_create(&wd_tid, NULL, watchdog_thread, NULL)) != 0) {
100 return status;
101 }
102 return 0;
103 }
104
105 /*
106 * Wake watchdog timer thread so that it walks the
107 * queue and adjusts its wait time (or exits).
108 */
ping_watchdog()109 static void ping_watchdog()
110 {
111 P(timer_mutex);
112 pthread_cond_signal(&timer);
113 V(timer_mutex);
114 Bmicrosleep(0, 100);
115 }
116
117 /*
118 * Terminate the watchdog thread
119 *
120 * Returns: 0 on success
121 * errno on failure
122 */
StopWatchdog(void)123 int StopWatchdog(void)
124 {
125 int status;
126 watchdog_t* p;
127
128 if (!wd_is_init) { return 0; }
129
130 quit = true; /* notify watchdog thread to stop */
131 ping_watchdog();
132
133 status = pthread_join(wd_tid, NULL);
134
135 while (!wd_queue->empty()) {
136 void* item = wd_queue->first();
137 wd_queue->remove(item);
138 p = (watchdog_t*)item;
139 if (p->destructor != NULL) { p->destructor(p); }
140 free(p);
141 }
142 delete wd_queue;
143 wd_queue = NULL;
144
145 while (!wd_inactive->empty()) {
146 void* item = wd_inactive->first();
147 wd_inactive->remove(item);
148 p = (watchdog_t*)item;
149 if (p->destructor != NULL) { p->destructor(p); }
150 free(p);
151 }
152 delete wd_inactive;
153 wd_inactive = NULL;
154 RwlDestroy(&lock);
155 wd_is_init = false;
156
157 return status;
158 }
159
new_watchdog(void)160 watchdog_t* new_watchdog(void)
161 {
162 watchdog_t* wd = (watchdog_t*)malloc(sizeof(watchdog_t));
163
164 if (!wd_is_init) { StartWatchdog(); }
165
166 if (wd == NULL) { return NULL; }
167 wd->one_shot = true;
168 wd->interval = 0;
169 wd->callback = NULL;
170 wd->destructor = NULL;
171 wd->data = NULL;
172
173 return wd;
174 }
175
RegisterWatchdog(watchdog_t * wd)176 bool RegisterWatchdog(watchdog_t* wd)
177 {
178 if (!wd_is_init) {
179 Jmsg0(NULL, M_ABORT, 0,
180 _("BUG! RegisterWatchdog called before StartWatchdog\n"));
181 }
182 if (wd->callback == NULL) {
183 Jmsg1(NULL, M_ABORT, 0, _("BUG! Watchdog %p has NULL callback\n"), wd);
184 }
185 if (wd->interval == 0) {
186 Jmsg1(NULL, M_ABORT, 0, _("BUG! Watchdog %p has zero interval\n"), wd);
187 }
188
189 wd_lock();
190 wd->next_fire = watchdog_time + wd->interval;
191 wd_queue->append(wd);
192 Dmsg3(800, "Registered watchdog %p, interval %d%s\n", wd, wd->interval,
193 wd->one_shot ? " one shot" : "");
194 wd_unlock();
195 ping_watchdog();
196
197 return false;
198 }
199
UnregisterWatchdog(watchdog_t * wd)200 bool UnregisterWatchdog(watchdog_t* wd)
201 {
202 watchdog_t* p;
203 bool ok = false;
204
205 if (!wd_is_init) {
206 Jmsg0(NULL, M_ABORT, 0,
207 _("BUG! unregister_watchdog_unlocked called before StartWatchdog\n"));
208 }
209
210 wd_lock();
211 foreach_dlist (p, wd_queue) {
212 if (wd == p) {
213 wd_queue->remove(wd);
214 Dmsg1(800, "Unregistered watchdog %p\n", wd);
215 ok = true;
216 goto get_out;
217 }
218 }
219
220 foreach_dlist (p, wd_inactive) {
221 if (wd == p) {
222 wd_inactive->remove(wd);
223 Dmsg1(800, "Unregistered inactive watchdog %p\n", wd);
224 ok = true;
225 goto get_out;
226 }
227 }
228
229 Dmsg1(800, "Failed to unregister watchdog %p\n", wd);
230
231 get_out:
232 wd_unlock();
233 ping_watchdog();
234 return ok;
235 }
236
237 /*
238 * This is the thread that walks the watchdog queue
239 * and when a queue item fires, the callback is
240 * invoked. If it is a one shot, the queue item
241 * is moved to the inactive queue.
242 */
watchdog_thread(void * arg)243 extern "C" void* watchdog_thread(void* arg)
244 {
245 struct timespec timeout;
246 struct timeval tv;
247 struct timezone tz;
248 utime_t next_time;
249
250 SetJcrInThreadSpecificData(nullptr);
251 Dmsg0(800, "NicB-reworked watchdog thread entered\n");
252
253 while (!quit) {
254 watchdog_t* p;
255
256 /*
257 *
258 * NOTE. lock_jcr_chain removed, but the message below
259 * was left until we are sure there are no deadlocks.
260 *
261 * We lock the jcr chain here because a good number of the
262 * callback routines lock the jcr chain. We need to lock
263 * it here *before* the watchdog lock because the SD message
264 * thread first locks the jcr chain, then when closing the
265 * job locks the watchdog chain. If the two threads do not
266 * lock in the same order, we get a deadlock -- each holds
267 * the other's needed lock.
268 */
269 wd_lock();
270
271 walk_list:
272 watchdog_time = time(NULL);
273 next_time = watchdog_time + watchdog_sleep_time;
274 foreach_dlist (p, wd_queue) {
275 if (p->next_fire <= watchdog_time) {
276 /* Run the callback */
277 Dmsg2(3400, "Watchdog callback p=0x%p fire=%d\n", p, p->next_fire);
278 p->callback(p);
279
280 /* Reschedule (or move to inactive list if it's a one-shot timer) */
281 if (p->one_shot) {
282 wd_queue->remove(p);
283 wd_inactive->append(p);
284 goto walk_list;
285 } else {
286 p->next_fire = watchdog_time + p->interval;
287 }
288 }
289 if (p->next_fire <= next_time) { next_time = p->next_fire; }
290 }
291 wd_unlock();
292
293 /*
294 * Wait sleep time or until someone wakes us
295 */
296 gettimeofday(&tv, &tz);
297 timeout.tv_nsec = tv.tv_usec * 1000;
298 timeout.tv_sec = tv.tv_sec + next_time - time(NULL);
299 while (timeout.tv_nsec >= 1000000000) {
300 timeout.tv_nsec -= 1000000000;
301 timeout.tv_sec++;
302 }
303
304 Dmsg1(1900, "pthread_cond_timedwait %d\n", timeout.tv_sec - tv.tv_sec);
305 /* Note, this unlocks mutex during the sleep */
306 P(timer_mutex);
307 pthread_cond_timedwait(&timer, &timer_mutex, &timeout);
308 V(timer_mutex);
309 }
310
311 Dmsg0(800, "NicB-reworked watchdog thread exited\n");
312 return NULL;
313 }
314
315 /*
316 * Watchdog lock, this can be called multiple times by the same
317 * thread without blocking, but must be unlocked the number of
318 * times it was locked.
319 */
wd_lock()320 static void wd_lock()
321 {
322 int errstat;
323 if ((errstat = RwlWritelock(&lock)) != 0) {
324 BErrNo be;
325 Jmsg1(NULL, M_ABORT, 0, _("RwlWritelock failure. ERR=%s\n"),
326 be.bstrerror(errstat));
327 }
328 }
329
330 /*
331 * Unlock the watchdog. This can be called multiple times by the
332 * same thread up to the number of times that thread called
333 * wd_ lock()/
334 */
wd_unlock()335 static void wd_unlock()
336 {
337 int errstat;
338 if ((errstat = RwlWriteunlock(&lock)) != 0) {
339 BErrNo be;
340 Jmsg1(NULL, M_ABORT, 0, _("RwlWriteunlock failure. ERR=%s\n"),
341 be.bstrerror(errstat));
342 }
343 }
344