1 /*
2    BAREOS® - Backup Archiving REcovery Open Sourced
3 
4    Copyright (C) 2003-2012 Free Software Foundation Europe e.V.
5    Copyright (C) 2011-2012 Planets Communications B.V.
6    Copyright (C) 2013-2016 Bareos GmbH & Co. KG
7 
8    This program is Free Software; you can redistribute it and/or
9    modify it under the terms of version three of the GNU Affero General Public
10    License as published by the Free Software Foundation and included
11    in the file LICENSE.
12 
13    This program is distributed in the hope that it will be useful, but
14    WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16    Affero General Public License for more details.
17 
18    You should have received a copy of the GNU Affero General Public License
19    along with this program; if not, write to the Free Software
20    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21    02110-1301, USA.
22 */
23 /*
24  * Kern Sibbald, May MMIII
25  */
26 /**
27  * @file
28  * Bareos File Daemon heartbeat routines
29  * Listens for heartbeats coming from the SD
30  * If configured, sends heartbeats to Dir
31  */
32 
33 #include "include/bareos.h"
34 #include "filed/filed.h"
35 #include "filed/filed_globals.h"
36 #include "lib/bnet.h"
37 
38 namespace filedaemon {
39 
/* Timeout, in seconds, handed to BnetWaitDataIntr() by sd_heartbeat_thread() */
#define WAIT_INTERVAL 5

/*
 * Declared here, defined outside the visible part of this file. When true,
 * StartHeartbeatMonitor() and StopHeartbeatMonitor() do nothing.
 */
extern bool no_signals;

/* Thread entry points passed to pthread_create() further down */
extern "C" void *dir_heartbeat_thread(void *arg);
extern "C" void *sd_heartbeat_thread(void *arg);
45 
46 /**
47  * Listen on the SD socket for heartbeat signals.
48  * Send heartbeats to the Director every HB_TIME
49  *   seconds.
50  */
sd_heartbeat_thread(void * arg)51 extern "C" void *sd_heartbeat_thread(void *arg)
52 {
53    int32_t n;
54    JobControlRecord *jcr = (JobControlRecord *)arg;
55    std::shared_ptr<BareosSocket> sd, dir;
56    time_t last_heartbeat = time(NULL);
57    time_t now;
58 
59    pthread_detach(pthread_self());
60 
61    /*
62     * Get our own local copy
63     */
64    sd.reset(jcr->store_bsock->clone());
65    dir.reset(jcr->dir_bsock->clone());
66 
67    jcr->hb_bsock = sd;
68    jcr->hb_started = true;
69    jcr->hb_dir_bsock = dir;
70    dir->suppress_error_msgs_ = true;
71    sd->suppress_error_msgs_ = true;
72 
73    /* Hang reading the socket to the SD, and every time we get
74     * a heartbeat or we get a wait timeout (5 seconds), we
75     * check to see if we need to send a heartbeat to the
76     * Director.
77     */
78    while (!sd->IsStop()) {
79       n = BnetWaitDataIntr(sd.get(), WAIT_INTERVAL);
80       if (n < 0 || sd->IsStop()) {
81          break;
82       }
83       if (me->heartbeat_interval) {
84          now = time(NULL);
85          if (now-last_heartbeat >= me->heartbeat_interval) {
86             dir->signal(BNET_HEARTBEAT);
87             if (dir->IsStop()) {
88                break;
89             }
90             last_heartbeat = now;
91          }
92       }
93       if (n == 1) {               /* input waiting */
94          sd->recv();              /* read it -- probably heartbeat from sd */
95          if (sd->IsStop()) {
96             break;
97          }
98          if (sd->message_length <= 0) {
99             Dmsg1(100, "Got BNET_SIG %d from SD\n", sd->message_length);
100          } else {
101             Dmsg2(100, "Got %d bytes from SD. MSG=%s\n", sd->message_length, sd->msg);
102          }
103       }
104       Dmsg2(200, "wait_intr=%d stop=%d\n", n, IsBnetStop(sd.get()));
105    }
106 
107    sd->close();
108    dir->close();
109    jcr->hb_bsock.reset();
110    jcr->hb_started = false;
111    jcr->hb_dir_bsock = NULL;
112 
113    return NULL;
114 }
115 
116 /* Startup the heartbeat thread -- see above */
StartHeartbeatMonitor(JobControlRecord * jcr)117 void StartHeartbeatMonitor(JobControlRecord *jcr)
118 {
119    /*
120     * If no signals are set, do not start the heartbeat because
121     * it gives a constant stream of TIMEOUT_SIGNAL signals that
122     * make debugging impossible.
123     */
124    if (!no_signals) {
125       jcr->hb_bsock = NULL;
126       jcr->hb_started = false;
127       jcr->hb_dir_bsock = NULL;
128       pthread_create(&jcr->heartbeat_id, NULL, sd_heartbeat_thread, (void *)jcr);
129    }
130 }
131 
132 /* Terminate the heartbeat thread. Used for both SD and DIR */
StopHeartbeatMonitor(JobControlRecord * jcr)133 void StopHeartbeatMonitor(JobControlRecord *jcr)
134 {
135    int cnt = 0;
136    if (no_signals) {
137       return;
138    }
139    /* Wait max 10 secs for heartbeat thread to start */
140    while (!jcr->hb_started && cnt++ < 200) {
141       Bmicrosleep(0, 50000);         /* wait for start */
142    }
143 
144    if (jcr->hb_started) {
145       jcr->hb_bsock->SetTimedOut();       /* set timed_out to Terminate read */
146       jcr->hb_bsock->SetTerminated();      /* set to Terminate read */
147    }
148 
149    if (jcr->hb_dir_bsock) {
150       jcr->hb_dir_bsock->SetTimedOut();     /* set timed_out to Terminate read */
151       jcr->hb_dir_bsock->SetTerminated();    /* set to Terminate read */
152    }
153 
154    if (jcr->hb_started) {
155       Dmsg0(100, "Send kill to heartbeat id\n");
156       pthread_kill(jcr->heartbeat_id, TIMEOUT_SIGNAL);  /* make heartbeat thread go away */
157       Bmicrosleep(0, 50000);
158    }
159    cnt = 0;
160 
161    /*
162     * Wait max 100 secs for heartbeat thread to stop
163     */
164    while (jcr->hb_started && cnt++ < 200) {
165       pthread_kill(jcr->heartbeat_id, TIMEOUT_SIGNAL);  /* make heartbeat thread go away */
166       Bmicrosleep(0, 500000);
167    }
168 
169    if (jcr->hb_bsock) {
170       // delete jcr->hb_bsock;
171       jcr->hb_bsock.reset();
172    }
173 
174    if (jcr->hb_dir_bsock) {
175       // delete jcr->hb_dir_bsock;
176       jcr->hb_dir_bsock.reset();
177    }
178 }
179 
180 /**
181  * Thread for sending heartbeats to the Director when there
182  *   is no SD monitoring needed -- e.g. restore and verify Vol
183  *   both do their own read() on the SD socket.
184  */
dir_heartbeat_thread(void * arg)185 extern "C" void *dir_heartbeat_thread(void *arg)
186 {
187    JobControlRecord *jcr = (JobControlRecord *)arg;
188    BareosSocket *dir;
189    time_t last_heartbeat = time(NULL);
190 
191    pthread_detach(pthread_self());
192 
193    /*
194     * Get our own local copy
195     */
196    dir = jcr->dir_bsock->clone();
197 
198    jcr->hb_bsock.reset(dir);
199    jcr->hb_started = true;
200    dir->suppress_error_msgs_ = true;
201 
202    while (!dir->IsStop()) {
203       time_t now, next;
204 
205       now = time(NULL);
206       next = now - last_heartbeat;
207       if (next >= me->heartbeat_interval) {
208          dir->signal(BNET_HEARTBEAT);
209          if (dir->IsStop()) {
210             break;
211          }
212          last_heartbeat = now;
213       }
214       Bmicrosleep(next, 0);
215    }
216    dir->close();
217    jcr->hb_bsock.reset();
218    jcr->hb_started = false;
219    return NULL;
220 }
221 
222 /**
223  * Same as above but we don't listen to the SD
224  */
StartDirHeartbeat(JobControlRecord * jcr)225 void StartDirHeartbeat(JobControlRecord *jcr)
226 {
227    if (me->heartbeat_interval) {
228       jcr->dir_bsock->SetLocking();
229       pthread_create(&jcr->heartbeat_id, NULL, dir_heartbeat_thread, (void *)jcr);
230    }
231 }
232 
StopDirHeartbeat(JobControlRecord * jcr)233 void StopDirHeartbeat(JobControlRecord *jcr)
234 {
235    if (me->heartbeat_interval) {
236       StopHeartbeatMonitor(jcr);
237    }
238 }
239 } /* namespace filedaemon */
240