1 /*
2 BAREOS® - Backup Archiving REcovery Open Sourced
3
4 Copyright (C) 2003-2012 Free Software Foundation Europe e.V.
5 Copyright (C) 2011-2012 Planets Communications B.V.
6 Copyright (C) 2013-2016 Bareos GmbH & Co. KG
7
8 This program is Free Software; you can redistribute it and/or
9 modify it under the terms of version three of the GNU Affero General Public
10 License as published by the Free Software Foundation and included
11 in the file LICENSE.
12
13 This program is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Affero General Public License for more details.
17
18 You should have received a copy of the GNU Affero General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 02110-1301, USA.
22 */
23 /*
24 * Kern Sibbald, May MMIII
25 */
26 /**
27 * @file
28 * Bareos File Daemon heartbeat routines
29 * Listens for heartbeats coming from the SD
30 * If configured, sends heartbeats to Dir
31 */
32
33 #include "include/bareos.h"
34 #include "filed/filed.h"
35 #include "filed/filed_globals.h"
36 #include "lib/bnet.h"
37
38 namespace filedaemon {
39
/* Timeout, in seconds, handed to BnetWaitDataIntr() by sd_heartbeat_thread() */
#define WAIT_INTERVAL 5

/*
 * Thread entry points defined further down in this file. They are given
 * C linkage because they are passed to pthread_create().
 */
extern "C" {
void *sd_heartbeat_thread(void *arg);
void *dir_heartbeat_thread(void *arg);
}

/*
 * Defined outside this file. When set, StartHeartbeatMonitor() does not
 * create a thread and StopHeartbeatMonitor() returns immediately.
 */
extern bool no_signals;
45
46 /**
47 * Listen on the SD socket for heartbeat signals.
48 * Send heartbeats to the Director every HB_TIME
49 * seconds.
50 */
sd_heartbeat_thread(void * arg)51 extern "C" void *sd_heartbeat_thread(void *arg)
52 {
53 int32_t n;
54 JobControlRecord *jcr = (JobControlRecord *)arg;
55 std::shared_ptr<BareosSocket> sd, dir;
56 time_t last_heartbeat = time(NULL);
57 time_t now;
58
59 pthread_detach(pthread_self());
60
61 /*
62 * Get our own local copy
63 */
64 sd.reset(jcr->store_bsock->clone());
65 dir.reset(jcr->dir_bsock->clone());
66
67 jcr->hb_bsock = sd;
68 jcr->hb_started = true;
69 jcr->hb_dir_bsock = dir;
70 dir->suppress_error_msgs_ = true;
71 sd->suppress_error_msgs_ = true;
72
73 /* Hang reading the socket to the SD, and every time we get
74 * a heartbeat or we get a wait timeout (5 seconds), we
75 * check to see if we need to send a heartbeat to the
76 * Director.
77 */
78 while (!sd->IsStop()) {
79 n = BnetWaitDataIntr(sd.get(), WAIT_INTERVAL);
80 if (n < 0 || sd->IsStop()) {
81 break;
82 }
83 if (me->heartbeat_interval) {
84 now = time(NULL);
85 if (now-last_heartbeat >= me->heartbeat_interval) {
86 dir->signal(BNET_HEARTBEAT);
87 if (dir->IsStop()) {
88 break;
89 }
90 last_heartbeat = now;
91 }
92 }
93 if (n == 1) { /* input waiting */
94 sd->recv(); /* read it -- probably heartbeat from sd */
95 if (sd->IsStop()) {
96 break;
97 }
98 if (sd->message_length <= 0) {
99 Dmsg1(100, "Got BNET_SIG %d from SD\n", sd->message_length);
100 } else {
101 Dmsg2(100, "Got %d bytes from SD. MSG=%s\n", sd->message_length, sd->msg);
102 }
103 }
104 Dmsg2(200, "wait_intr=%d stop=%d\n", n, IsBnetStop(sd.get()));
105 }
106
107 sd->close();
108 dir->close();
109 jcr->hb_bsock.reset();
110 jcr->hb_started = false;
111 jcr->hb_dir_bsock = NULL;
112
113 return NULL;
114 }
115
116 /* Startup the heartbeat thread -- see above */
StartHeartbeatMonitor(JobControlRecord * jcr)117 void StartHeartbeatMonitor(JobControlRecord *jcr)
118 {
119 /*
120 * If no signals are set, do not start the heartbeat because
121 * it gives a constant stream of TIMEOUT_SIGNAL signals that
122 * make debugging impossible.
123 */
124 if (!no_signals) {
125 jcr->hb_bsock = NULL;
126 jcr->hb_started = false;
127 jcr->hb_dir_bsock = NULL;
128 pthread_create(&jcr->heartbeat_id, NULL, sd_heartbeat_thread, (void *)jcr);
129 }
130 }
131
132 /* Terminate the heartbeat thread. Used for both SD and DIR */
StopHeartbeatMonitor(JobControlRecord * jcr)133 void StopHeartbeatMonitor(JobControlRecord *jcr)
134 {
135 int cnt = 0;
136 if (no_signals) {
137 return;
138 }
139 /* Wait max 10 secs for heartbeat thread to start */
140 while (!jcr->hb_started && cnt++ < 200) {
141 Bmicrosleep(0, 50000); /* wait for start */
142 }
143
144 if (jcr->hb_started) {
145 jcr->hb_bsock->SetTimedOut(); /* set timed_out to Terminate read */
146 jcr->hb_bsock->SetTerminated(); /* set to Terminate read */
147 }
148
149 if (jcr->hb_dir_bsock) {
150 jcr->hb_dir_bsock->SetTimedOut(); /* set timed_out to Terminate read */
151 jcr->hb_dir_bsock->SetTerminated(); /* set to Terminate read */
152 }
153
154 if (jcr->hb_started) {
155 Dmsg0(100, "Send kill to heartbeat id\n");
156 pthread_kill(jcr->heartbeat_id, TIMEOUT_SIGNAL); /* make heartbeat thread go away */
157 Bmicrosleep(0, 50000);
158 }
159 cnt = 0;
160
161 /*
162 * Wait max 100 secs for heartbeat thread to stop
163 */
164 while (jcr->hb_started && cnt++ < 200) {
165 pthread_kill(jcr->heartbeat_id, TIMEOUT_SIGNAL); /* make heartbeat thread go away */
166 Bmicrosleep(0, 500000);
167 }
168
169 if (jcr->hb_bsock) {
170 // delete jcr->hb_bsock;
171 jcr->hb_bsock.reset();
172 }
173
174 if (jcr->hb_dir_bsock) {
175 // delete jcr->hb_dir_bsock;
176 jcr->hb_dir_bsock.reset();
177 }
178 }
179
180 /**
181 * Thread for sending heartbeats to the Director when there
182 * is no SD monitoring needed -- e.g. restore and verify Vol
183 * both do their own read() on the SD socket.
184 */
dir_heartbeat_thread(void * arg)185 extern "C" void *dir_heartbeat_thread(void *arg)
186 {
187 JobControlRecord *jcr = (JobControlRecord *)arg;
188 BareosSocket *dir;
189 time_t last_heartbeat = time(NULL);
190
191 pthread_detach(pthread_self());
192
193 /*
194 * Get our own local copy
195 */
196 dir = jcr->dir_bsock->clone();
197
198 jcr->hb_bsock.reset(dir);
199 jcr->hb_started = true;
200 dir->suppress_error_msgs_ = true;
201
202 while (!dir->IsStop()) {
203 time_t now, next;
204
205 now = time(NULL);
206 next = now - last_heartbeat;
207 if (next >= me->heartbeat_interval) {
208 dir->signal(BNET_HEARTBEAT);
209 if (dir->IsStop()) {
210 break;
211 }
212 last_heartbeat = now;
213 }
214 Bmicrosleep(next, 0);
215 }
216 dir->close();
217 jcr->hb_bsock.reset();
218 jcr->hb_started = false;
219 return NULL;
220 }
221
222 /**
223 * Same as above but we don't listen to the SD
224 */
StartDirHeartbeat(JobControlRecord * jcr)225 void StartDirHeartbeat(JobControlRecord *jcr)
226 {
227 if (me->heartbeat_interval) {
228 jcr->dir_bsock->SetLocking();
229 pthread_create(&jcr->heartbeat_id, NULL, dir_heartbeat_thread, (void *)jcr);
230 }
231 }
232
StopDirHeartbeat(JobControlRecord * jcr)233 void StopDirHeartbeat(JobControlRecord *jcr)
234 {
235 if (me->heartbeat_interval) {
236 StopHeartbeatMonitor(jcr);
237 }
238 }
239 } /* namespace filedaemon */
240