1 /*
2 Bacula(R) - The Network Backup Solution
3
4 Copyright (C) 2000-2020 Kern Sibbald
5
6 The original author of Bacula is Kern Sibbald, with contributions
7 from many others, a complete list can be found in the file AUTHORS.
8
9 You may use this file and others of this release according to the
10 license defined in the LICENSE file, which includes the Affero General
11 Public License, v3.0 ("AGPLv3") and some additional permissions and
12 terms pursuant to its AGPLv3 Section 7.
13
14 This notice must be preserved when any source code is
15 conveyed and/or propagated.
16
17 Bacula(R) is a registered trademark of Kern Sibbald.
18 */
19 /*
20 * Bacula thread watchdog routine. General routine that
21 * allows setting a watchdog timer with a callback that is
22 * called when the timer goes off.
23 *
24 * Kern Sibbald, January MMII
25 *
26 */
27
28 #include "bacula.h"
29 #include "jcr.h"
30
31 /* Exported globals */
32 utime_t watchdog_time = 0; /* this has granularity of SLEEP_TIME */
33 utime_t watchdog_sleep_time = 60; /* examine things every 60 seconds */
34
35 /* Locals */
36 static pthread_mutex_t timer_mutex = PTHREAD_MUTEX_INITIALIZER;
37 static pthread_cond_t timer = PTHREAD_COND_INITIALIZER;
38
39 /* Forward referenced functions */
40 extern "C" void *watchdog_thread(void *arg);
41
42 static void wd_lock();
43 static void wd_unlock();
44
45 /* Static globals */
46 static bool quit = false;
47 static bool wd_is_init = false;
48 static brwlock_t lock; /* watchdog lock */
49
50 static pthread_t wd_tid;
51 static dlist *wd_queue;
52 static dlist *wd_inactive;
53
54 /*
55 * Returns: 0 if the current thread is NOT the watchdog
56 * 1 if the current thread is the watchdog
57 */
is_watchdog()58 bool is_watchdog()
59 {
60 if (wd_is_init && pthread_equal(pthread_self(), wd_tid)) {
61 return true;
62 } else {
63 return false;
64 }
65 }
66
67 /*
68 * Start watchdog thread
69 *
70 * Returns: 0 on success
71 * errno on failure
72 */
start_watchdog(void)73 int start_watchdog(void)
74 {
75 int stat;
76 watchdog_t *dummy = NULL;
77 int errstat;
78
79 if (wd_is_init) {
80 return 0;
81 }
82 Dmsg0(800, "Initialising NicB-hacked watchdog thread\n");
83 watchdog_time = time(NULL);
84
85 if ((errstat=rwl_init(&lock)) != 0) {
86 berrno be;
87 Jmsg1(NULL, M_ABORT, 0, _("Unable to initialize watchdog lock. ERR=%s\n"),
88 be.bstrerror(errstat));
89 }
90 wd_queue = New(dlist(dummy, &dummy->link));
91 wd_inactive = New(dlist(dummy, &dummy->link));
92 wd_is_init = true;
93
94 if ((stat = pthread_create(&wd_tid, NULL, watchdog_thread, NULL)) != 0) {
95 return stat;
96 }
97 return 0;
98 }
99
100 /*
101 * Wake watchdog timer thread so that it walks the
102 * queue and adjusts its wait time (or exits).
103 */
ping_watchdog()104 static void ping_watchdog()
105 {
106 P(timer_mutex);
107 pthread_cond_signal(&timer);
108 V(timer_mutex);
109 bmicrosleep(0, 100);
110 }
111
112 /*
113 * Terminate the watchdog thread
114 *
115 * Returns: 0 on success
116 * errno on failure
117 */
stop_watchdog(void)118 int stop_watchdog(void)
119 {
120 int stat;
121 watchdog_t *p;
122
123 if (!wd_is_init) {
124 return 0;
125 }
126
127 quit = true; /* notify watchdog thread to stop */
128 ping_watchdog();
129
130 stat = pthread_join(wd_tid, NULL);
131
132 while (!wd_queue->empty()) {
133 void *item = wd_queue->first();
134 wd_queue->remove(item);
135 p = (watchdog_t *)item;
136 if (p->destructor != NULL) {
137 p->destructor(p);
138 }
139 free(p);
140 }
141 delete wd_queue;
142 wd_queue = NULL;
143
144 while (!wd_inactive->empty()) {
145 void *item = wd_inactive->first();
146 wd_inactive->remove(item);
147 p = (watchdog_t *)item;
148 if (p->destructor != NULL) {
149 p->destructor(p);
150 }
151 free(p);
152 }
153 delete wd_inactive;
154 wd_inactive = NULL;
155 rwl_destroy(&lock);
156 wd_is_init = false;
157
158 return stat;
159 }
160
new_watchdog(void)161 watchdog_t *new_watchdog(void)
162 {
163 watchdog_t *wd = (watchdog_t *)malloc(sizeof(watchdog_t));
164
165 if (!wd_is_init) {
166 start_watchdog();
167 }
168
169 if (wd == NULL) {
170 return NULL;
171 }
172 wd->one_shot = true;
173 wd->interval = 0;
174 wd->callback = NULL;
175 wd->destructor = NULL;
176 wd->data = NULL;
177
178 return wd;
179 }
180
register_watchdog(watchdog_t * wd)181 bool register_watchdog(watchdog_t *wd)
182 {
183 if (!wd_is_init) {
184 Jmsg0(NULL, M_ABORT, 0, _("BUG! register_watchdog called before start_watchdog\n"));
185 }
186 if (wd->callback == NULL) {
187 Jmsg1(NULL, M_ABORT, 0, _("BUG! Watchdog %p has NULL callback\n"), wd);
188 }
189 if (wd->interval == 0) {
190 Jmsg1(NULL, M_ABORT, 0, _("BUG! Watchdog %p has zero interval\n"), wd);
191 }
192
193 wd_lock();
194 wd->next_fire = watchdog_time + wd->interval;
195 wd_queue->append(wd);
196 Dmsg3(800, "Registered watchdog %p, interval %d%s\n",
197 wd, wd->interval, wd->one_shot ? " one shot" : "");
198 wd_unlock();
199 ping_watchdog();
200
201 return false;
202 }
203
unregister_watchdog(watchdog_t * wd)204 bool unregister_watchdog(watchdog_t *wd)
205 {
206 watchdog_t *p;
207 bool ok = false;
208
209 if (!wd_is_init) {
210 Jmsg0(NULL, M_ABORT, 0, _("BUG! unregister_watchdog_unlocked called before start_watchdog\n"));
211 }
212
213 wd_lock();
214 foreach_dlist(p, wd_queue) {
215 if (wd == p) {
216 wd_queue->remove(wd);
217 Dmsg1(800, "Unregistered watchdog %p\n", wd);
218 ok = true;
219 goto get_out;
220 }
221 }
222
223 foreach_dlist(p, wd_inactive) {
224 if (wd == p) {
225 wd_inactive->remove(wd);
226 Dmsg1(800, "Unregistered inactive watchdog %p\n", wd);
227 ok = true;
228 goto get_out;
229 }
230 }
231
232 Dmsg1(800, "Failed to unregister watchdog %p\n", wd);
233
234 get_out:
235 wd_unlock();
236 ping_watchdog();
237 return ok;
238 }
239
240 /*
241 * This is the thread that walks the watchdog queue
242 * and when a queue item fires, the callback is
243 * invoked. If it is a one shot, the queue item
244 * is moved to the inactive queue.
245 */
watchdog_thread(void * arg)246 extern "C" void *watchdog_thread(void *arg)
247 {
248 struct timespec timeout;
249 struct timeval tv;
250 struct timezone tz;
251 utime_t next_time;
252
253 set_jcr_in_tsd(INVALID_JCR);
254 Dmsg0(800, "NicB-reworked watchdog thread entered\n");
255
256 while (!quit) {
257 watchdog_t *p;
258
259 wd_lock();
260
261 walk_list:
262 watchdog_time = time(NULL);
263 next_time = watchdog_time + watchdog_sleep_time;
264 foreach_dlist(p, wd_queue) {
265 if (p->next_fire <= watchdog_time) {
266 /* Run the callback */
267 Dmsg2(3400, "Watchdog callback p=0x%p fire=%d\n", p, p->next_fire);
268 p->callback(p);
269
270 /* Reschedule (or move to inactive list if it's a one-shot timer) */
271 if (p->one_shot) {
272 wd_queue->remove(p);
273 wd_inactive->append(p);
274 goto walk_list;
275 } else {
276 p->next_fire = watchdog_time + p->interval;
277 }
278 }
279 if (p->next_fire <= next_time) {
280 next_time = p->next_fire;
281 }
282 }
283 wd_unlock();
284
285 /*
286 * Wait sleep time or until someone wakes us
287 */
288 gettimeofday(&tv, &tz);
289 timeout.tv_nsec = tv.tv_usec * 1000;
290 timeout.tv_sec = tv.tv_sec + next_time - time(NULL);
291 while (timeout.tv_nsec >= 1000000000) {
292 timeout.tv_nsec -= 1000000000;
293 timeout.tv_sec++;
294 }
295
296 Dmsg1(1900, "pthread_cond_timedwait %d\n", timeout.tv_sec - tv.tv_sec);
297 /* Note, this unlocks mutex during the sleep */
298 P(timer_mutex);
299 pthread_cond_timedwait(&timer, &timer_mutex, &timeout);
300 V(timer_mutex);
301 }
302
303 Dmsg0(800, "NicB-reworked watchdog thread exited\n");
304 return NULL;
305 }
306
307 /*
308 * Watchdog lock, this can be called multiple times by the same
309 * thread without blocking, but must be unlocked the number of
310 * times it was locked.
311 */
wd_lock()312 static void wd_lock()
313 {
314 int errstat;
315 if ((errstat=rwl_writelock(&lock)) != 0) {
316 berrno be;
317 Jmsg1(NULL, M_ABORT, 0, _("rwl_writelock failure. ERR=%s\n"),
318 be.bstrerror(errstat));
319 }
320 }
321
322 /*
323 * Unlock the watchdog. This can be called multiple times by the
324 * same thread up to the number of times that thread called
325 * wd_ lock()/
326 */
wd_unlock()327 static void wd_unlock()
328 {
329 int errstat;
330 if ((errstat=rwl_writeunlock(&lock)) != 0) {
331 berrno be;
332 Jmsg1(NULL, M_ABORT, 0, _("rwl_writeunlock failure. ERR=%s\n"),
333 be.bstrerror(errstat));
334 }
335 }
336