1 /*
2    BAREOS® - Backup Archiving REcovery Open Sourced
3 
4    Copyright (C) 2003-2012 Free Software Foundation Europe e.V.
5    Copyright (C) 2011-2012 Planets Communications B.V.
6    Copyright (C) 2013-2020 Bareos GmbH & Co. KG
7 
8    This program is Free Software; you can redistribute it and/or
9    modify it under the terms of version three of the GNU Affero General Public
10    License as published by the Free Software Foundation and included
11    in the file LICENSE.
12 
13    This program is distributed in the hope that it will be useful, but
14    WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16    Affero General Public License for more details.
17 
18    You should have received a copy of the GNU Affero General Public License
19    along with this program; if not, write to the Free Software
20    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21    02110-1301, USA.
22 */
23 /*
24  * Kern Sibbald, May MMIII
25  */
26 /**
27  * @file
28  * Bareos File Daemon heartbeat routines
29  * Listens for heartbeats coming from the SD
30  * If configured, sends heartbeats to Dir
31  */
32 
33 #include "include/bareos.h"
34 #include "filed/filed.h"
35 #include "filed/jcr_private.h"
36 #include "filed/filed_globals.h"
37 #include "lib/bnet.h"
38 #include "lib/bsock.h"
39 #include "lib/watchdog.h"
40 
41 namespace filedaemon {
42 
/* Seconds sd_heartbeat_thread() blocks waiting for SD input per loop pass */
#define WAIT_INTERVAL 5

/* Thread entry points defined further down; C linkage for pthread_create() */
extern "C" {
void* sd_heartbeat_thread(void* arg);
void* dir_heartbeat_thread(void* arg);
}

/*
 * Declared here, defined elsewhere. When set, StartHeartbeatMonitor() and
 * StopHeartbeatMonitor() return without doing anything.
 */
extern bool no_signals;
48 
49 /**
50  * Listen on the SD socket for heartbeat signals.
51  * Send heartbeats to the Director every HB_TIME
52  *   seconds.
53  */
sd_heartbeat_thread(void * arg)54 extern "C" void* sd_heartbeat_thread(void* arg)
55 {
56   int32_t n;
57   JobControlRecord* jcr = (JobControlRecord*)arg;
58   std::shared_ptr<BareosSocket> sd, dir;
59   time_t last_heartbeat = time(NULL);
60   time_t now;
61 
62   pthread_detach(pthread_self());
63 
64   /*
65    * Get our own local copy
66    */
67   sd.reset(jcr->store_bsock->clone());
68   dir.reset(jcr->dir_bsock->clone());
69 
70   jcr->impl->hb_bsock = sd;
71   jcr->impl->hb_running = true;
72   jcr->impl->hb_dir_bsock = dir;
73   dir->suppress_error_msgs_ = true;
74   sd->suppress_error_msgs_ = true;
75   jcr->impl->hb_initialized_once
76       = true;  // initialize last to avoid race condition
77 
78   /* Hang reading the socket to the SD, and every time we get
79    * a heartbeat or we get a wait timeout (5 seconds), we
80    * check to see if we need to send a heartbeat to the
81    * Director.
82    */
83   while (!sd->IsStop()) {
84     n = BnetWaitDataIntr(sd.get(), WAIT_INTERVAL);
85     if (n < 0 || sd->IsStop()) { break; }
86     if (me->heartbeat_interval) {
87       now = time(NULL);
88       if (now - last_heartbeat >= me->heartbeat_interval) {
89         dir->signal(BNET_HEARTBEAT);
90         if (dir->IsStop()) { break; }
91         last_heartbeat = now;
92       }
93     }
94     if (n == 1) { /* input waiting */
95       sd->recv(); /* read it -- probably heartbeat from sd */
96       if (sd->IsStop()) { break; }
97       if (sd->message_length <= 0) {
98         Dmsg1(100, "Got BNET_SIG %d from SD\n", sd->message_length);
99       } else {
100         Dmsg2(100, "Got %d bytes from SD. MSG=%s\n", sd->message_length,
101               sd->msg);
102       }
103     }
104     Dmsg2(200, "wait_intr=%d stop=%d\n", n, IsBnetStop(sd.get()));
105   }
106 
107   sd->close();
108   dir->close();
109   jcr->impl->hb_bsock.reset();
110   jcr->impl->hb_running = false;
111   jcr->impl->hb_dir_bsock = NULL;
112 
113   return NULL;
114 }
115 
116 /* Startup the heartbeat thread -- see above */
StartHeartbeatMonitor(JobControlRecord * jcr)117 void StartHeartbeatMonitor(JobControlRecord* jcr)
118 {
119   /*
120    * If no signals are set, do not start the heartbeat because
121    * it gives a constant stream of TIMEOUT_SIGNAL signals that
122    * make debugging impossible.
123    */
124   if (!no_signals) {
125     jcr->impl->hb_bsock = NULL;
126     jcr->impl->hb_running = false;
127     jcr->impl->hb_initialized_once = false;
128     jcr->impl->hb_dir_bsock = NULL;
129     pthread_create(&jcr->impl->heartbeat_id, NULL, sd_heartbeat_thread,
130                    (void*)jcr);
131   }
132 }
133 
134 /* Terminate the heartbeat thread. Used for both SD and DIR */
StopHeartbeatMonitor(JobControlRecord * jcr)135 void StopHeartbeatMonitor(JobControlRecord* jcr)
136 {
137   int cnt = 0;
138   if (no_signals) { return; }
139 
140   /* Wait max 10 secs for heartbeat thread to start */
141   while (!jcr->impl->hb_initialized_once && cnt++ < 200) {
142     Bmicrosleep(0, 50000); /* wait for start */
143   }
144 
145   if (jcr->impl->hb_running) {
146     jcr->impl->hb_bsock->SetTimedOut();   /* set timed_out to Terminate read */
147     jcr->impl->hb_bsock->SetTerminated(); /* set to Terminate read */
148   }
149 
150   if (jcr->impl->hb_dir_bsock) {
151     jcr->impl->hb_dir_bsock
152         ->SetTimedOut(); /* set timed_out to Terminate read */
153     jcr->impl->hb_dir_bsock->SetTerminated(); /* set to Terminate read */
154   }
155 
156   if (jcr->impl->hb_running) {
157     Dmsg0(100, "Send kill to heartbeat id\n");
158     pthread_kill(jcr->impl->heartbeat_id,
159                  TIMEOUT_SIGNAL); /* make heartbeat thread go away */
160     Bmicrosleep(0, 50000);
161   }
162   cnt = 0;
163 
164   /*
165    * Wait max 100 secs for heartbeat thread to stop
166    */
167   while (jcr->impl->hb_running && cnt++ < 200) {
168     pthread_kill(jcr->impl->heartbeat_id,
169                  TIMEOUT_SIGNAL); /* make heartbeat thread go away */
170     Bmicrosleep(0, 500000);
171   }
172 
173   if (jcr->impl->hb_bsock) {
174     // delete jcr->impl_->hb_bsock;
175     jcr->impl->hb_bsock.reset();
176   }
177 
178   if (jcr->impl->hb_dir_bsock) {
179     // delete jcr->impl_->hb_dir_bsock;
180     jcr->impl->hb_dir_bsock.reset();
181   }
182 
183   jcr->impl->hb_initialized_once = false;
184 }
185 
186 /**
187  * Thread for sending heartbeats to the Director when there
188  *   is no SD monitoring needed -- e.g. restore and verify Vol
189  *   both do their own read() on the SD socket.
190  */
dir_heartbeat_thread(void * arg)191 extern "C" void* dir_heartbeat_thread(void* arg)
192 {
193   JobControlRecord* jcr = (JobControlRecord*)arg;
194   BareosSocket* dir;
195   time_t last_heartbeat = time(NULL);
196 
197   pthread_detach(pthread_self());
198 
199   /*
200    * Get our own local copy
201    */
202   dir = jcr->dir_bsock->clone();
203 
204   jcr->impl->hb_bsock.reset(dir);
205   jcr->impl->hb_running = true;
206   dir->suppress_error_msgs_ = true;
207   jcr->impl->hb_initialized_once
208       = true;  // initialize last to avoid race condition
209 
210   while (!dir->IsStop()) {
211     time_t now, next;
212 
213     now = time(NULL);
214     next = now - last_heartbeat;
215     if (next >= me->heartbeat_interval) {
216       dir->signal(BNET_HEARTBEAT);
217       if (dir->IsStop()) { break; }
218       last_heartbeat = now;
219     }
220     Bmicrosleep(next, 0);
221   }
222   dir->close();
223   jcr->impl->hb_bsock.reset();
224   jcr->impl->hb_running = false;
225   return NULL;
226 }
227 
228 /**
229  * Same as above but we don't listen to the SD
230  */
StartDirHeartbeat(JobControlRecord * jcr)231 void StartDirHeartbeat(JobControlRecord* jcr)
232 {
233   if (me->heartbeat_interval) {
234     jcr->dir_bsock->SetLocking();
235     pthread_create(&jcr->impl->heartbeat_id, NULL, dir_heartbeat_thread,
236                    (void*)jcr);
237   }
238 }
239 
StopDirHeartbeat(JobControlRecord * jcr)240 void StopDirHeartbeat(JobControlRecord* jcr)
241 {
242   if (me->heartbeat_interval) { StopHeartbeatMonitor(jcr); }
243 }
244 } /* namespace filedaemon */
245