1 /*
2 BAREOS® - Backup Archiving REcovery Open Sourced
3
4 Copyright (C) 2003-2012 Free Software Foundation Europe e.V.
5 Copyright (C) 2011-2012 Planets Communications B.V.
6 Copyright (C) 2013-2020 Bareos GmbH & Co. KG
7
8 This program is Free Software; you can redistribute it and/or
9 modify it under the terms of version three of the GNU Affero General Public
10 License as published by the Free Software Foundation and included
11 in the file LICENSE.
12
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Affero General Public License for more details.
17
18 You should have received a copy of the GNU Affero General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA.
22 */
23 /*
24 * Kern Sibbald, May MMIII
25 */
26 /**
27 * @file
28 * Bareos File Daemon heartbeat routines
29 * Listens for heartbeats coming from the SD
30 * If configured, sends heartbeats to Dir
31 */
32
33 #include "include/bareos.h"
34 #include "filed/filed.h"
35 #include "filed/jcr_private.h"
36 #include "filed/filed_globals.h"
37 #include "lib/bnet.h"
38 #include "lib/bsock.h"
39 #include "lib/watchdog.h"
40
41 namespace filedaemon {
42
/*
 * Number of seconds sd_heartbeat_thread() waits for data on the SD socket
 * before it re-checks whether a heartbeat to the Director is due.
 * A typed constant instead of a macro so it obeys namespace scoping.
 */
constexpr int WAIT_INTERVAL = 5;

/*
 * Thread entry points. They are declared with C linkage because they are
 * handed to pthread_create() as start routines.
 */
extern "C" void* sd_heartbeat_thread(void* arg);
extern "C" void* dir_heartbeat_thread(void* arg);

/*
 * Defined outside this file. When it is set, StartHeartbeatMonitor() does
 * not start a thread and StopHeartbeatMonitor() returns immediately.
 */
extern bool no_signals;
48
49 /**
50 * Listen on the SD socket for heartbeat signals.
51 * Send heartbeats to the Director every HB_TIME
52 * seconds.
53 */
sd_heartbeat_thread(void * arg)54 extern "C" void* sd_heartbeat_thread(void* arg)
55 {
56 int32_t n;
57 JobControlRecord* jcr = (JobControlRecord*)arg;
58 std::shared_ptr<BareosSocket> sd, dir;
59 time_t last_heartbeat = time(NULL);
60 time_t now;
61
62 pthread_detach(pthread_self());
63
64 /*
65 * Get our own local copy
66 */
67 sd.reset(jcr->store_bsock->clone());
68 dir.reset(jcr->dir_bsock->clone());
69
70 jcr->impl->hb_bsock = sd;
71 jcr->impl->hb_running = true;
72 jcr->impl->hb_dir_bsock = dir;
73 dir->suppress_error_msgs_ = true;
74 sd->suppress_error_msgs_ = true;
75 jcr->impl->hb_initialized_once
76 = true; // initialize last to avoid race condition
77
78 /* Hang reading the socket to the SD, and every time we get
79 * a heartbeat or we get a wait timeout (5 seconds), we
80 * check to see if we need to send a heartbeat to the
81 * Director.
82 */
83 while (!sd->IsStop()) {
84 n = BnetWaitDataIntr(sd.get(), WAIT_INTERVAL);
85 if (n < 0 || sd->IsStop()) { break; }
86 if (me->heartbeat_interval) {
87 now = time(NULL);
88 if (now - last_heartbeat >= me->heartbeat_interval) {
89 dir->signal(BNET_HEARTBEAT);
90 if (dir->IsStop()) { break; }
91 last_heartbeat = now;
92 }
93 }
94 if (n == 1) { /* input waiting */
95 sd->recv(); /* read it -- probably heartbeat from sd */
96 if (sd->IsStop()) { break; }
97 if (sd->message_length <= 0) {
98 Dmsg1(100, "Got BNET_SIG %d from SD\n", sd->message_length);
99 } else {
100 Dmsg2(100, "Got %d bytes from SD. MSG=%s\n", sd->message_length,
101 sd->msg);
102 }
103 }
104 Dmsg2(200, "wait_intr=%d stop=%d\n", n, IsBnetStop(sd.get()));
105 }
106
107 sd->close();
108 dir->close();
109 jcr->impl->hb_bsock.reset();
110 jcr->impl->hb_running = false;
111 jcr->impl->hb_dir_bsock = NULL;
112
113 return NULL;
114 }
115
116 /* Startup the heartbeat thread -- see above */
StartHeartbeatMonitor(JobControlRecord * jcr)117 void StartHeartbeatMonitor(JobControlRecord* jcr)
118 {
119 /*
120 * If no signals are set, do not start the heartbeat because
121 * it gives a constant stream of TIMEOUT_SIGNAL signals that
122 * make debugging impossible.
123 */
124 if (!no_signals) {
125 jcr->impl->hb_bsock = NULL;
126 jcr->impl->hb_running = false;
127 jcr->impl->hb_initialized_once = false;
128 jcr->impl->hb_dir_bsock = NULL;
129 pthread_create(&jcr->impl->heartbeat_id, NULL, sd_heartbeat_thread,
130 (void*)jcr);
131 }
132 }
133
134 /* Terminate the heartbeat thread. Used for both SD and DIR */
StopHeartbeatMonitor(JobControlRecord * jcr)135 void StopHeartbeatMonitor(JobControlRecord* jcr)
136 {
137 int cnt = 0;
138 if (no_signals) { return; }
139
140 /* Wait max 10 secs for heartbeat thread to start */
141 while (!jcr->impl->hb_initialized_once && cnt++ < 200) {
142 Bmicrosleep(0, 50000); /* wait for start */
143 }
144
145 if (jcr->impl->hb_running) {
146 jcr->impl->hb_bsock->SetTimedOut(); /* set timed_out to Terminate read */
147 jcr->impl->hb_bsock->SetTerminated(); /* set to Terminate read */
148 }
149
150 if (jcr->impl->hb_dir_bsock) {
151 jcr->impl->hb_dir_bsock
152 ->SetTimedOut(); /* set timed_out to Terminate read */
153 jcr->impl->hb_dir_bsock->SetTerminated(); /* set to Terminate read */
154 }
155
156 if (jcr->impl->hb_running) {
157 Dmsg0(100, "Send kill to heartbeat id\n");
158 pthread_kill(jcr->impl->heartbeat_id,
159 TIMEOUT_SIGNAL); /* make heartbeat thread go away */
160 Bmicrosleep(0, 50000);
161 }
162 cnt = 0;
163
164 /*
165 * Wait max 100 secs for heartbeat thread to stop
166 */
167 while (jcr->impl->hb_running && cnt++ < 200) {
168 pthread_kill(jcr->impl->heartbeat_id,
169 TIMEOUT_SIGNAL); /* make heartbeat thread go away */
170 Bmicrosleep(0, 500000);
171 }
172
173 if (jcr->impl->hb_bsock) {
174 // delete jcr->impl_->hb_bsock;
175 jcr->impl->hb_bsock.reset();
176 }
177
178 if (jcr->impl->hb_dir_bsock) {
179 // delete jcr->impl_->hb_dir_bsock;
180 jcr->impl->hb_dir_bsock.reset();
181 }
182
183 jcr->impl->hb_initialized_once = false;
184 }
185
186 /**
187 * Thread for sending heartbeats to the Director when there
188 * is no SD monitoring needed -- e.g. restore and verify Vol
189 * both do their own read() on the SD socket.
190 */
dir_heartbeat_thread(void * arg)191 extern "C" void* dir_heartbeat_thread(void* arg)
192 {
193 JobControlRecord* jcr = (JobControlRecord*)arg;
194 BareosSocket* dir;
195 time_t last_heartbeat = time(NULL);
196
197 pthread_detach(pthread_self());
198
199 /*
200 * Get our own local copy
201 */
202 dir = jcr->dir_bsock->clone();
203
204 jcr->impl->hb_bsock.reset(dir);
205 jcr->impl->hb_running = true;
206 dir->suppress_error_msgs_ = true;
207 jcr->impl->hb_initialized_once
208 = true; // initialize last to avoid race condition
209
210 while (!dir->IsStop()) {
211 time_t now, next;
212
213 now = time(NULL);
214 next = now - last_heartbeat;
215 if (next >= me->heartbeat_interval) {
216 dir->signal(BNET_HEARTBEAT);
217 if (dir->IsStop()) { break; }
218 last_heartbeat = now;
219 }
220 Bmicrosleep(next, 0);
221 }
222 dir->close();
223 jcr->impl->hb_bsock.reset();
224 jcr->impl->hb_running = false;
225 return NULL;
226 }
227
228 /**
229 * Same as above but we don't listen to the SD
230 */
StartDirHeartbeat(JobControlRecord * jcr)231 void StartDirHeartbeat(JobControlRecord* jcr)
232 {
233 if (me->heartbeat_interval) {
234 jcr->dir_bsock->SetLocking();
235 pthread_create(&jcr->impl->heartbeat_id, NULL, dir_heartbeat_thread,
236 (void*)jcr);
237 }
238 }
239
StopDirHeartbeat(JobControlRecord * jcr)240 void StopDirHeartbeat(JobControlRecord* jcr)
241 {
242 if (me->heartbeat_interval) { StopHeartbeatMonitor(jcr); }
243 }
244 } /* namespace filedaemon */
245