1 /* Copyright (c) 2003-2007 MySQL AB
2 Use is subject to license terms
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */
16
17 #include <signaldata/DumpStateOrd.hpp>
18 #include <NdbBackup.hpp>
19 #include <NdbOut.hpp>
20 #include <NDBT_Output.hpp>
21 #include <NdbConfig.h>
22 #include <ndb_version.h>
23 #include <NDBT.hpp>
24 #include <NdbSleep.h>
25 #include <random.h>
26 #include <NdbTick.h>
27
/**
 * CHECK(b, m): evaluate condition `b`; when it is false, print an error
 * line containing message `m`, the file and the line number to ndbout and
 * return NDBT_FAILED from the enclosing function.
 *
 * - `b` is evaluated exactly once.
 * - `m` is streamed as-is, so it may itself contain `<<` operators
 *   (e.g. "text " << value); for that reason it must NOT be parenthesized.
 * - The body is wrapped in do { } while (0) so that `CHECK(...);` behaves
 *   like a single statement, also inside an unbraced if/else.
 */
#define CHECK(b, m) \
  do { \
    int _xx = (b); \
    if (!(_xx)) { \
      ndbout << "ERR: "<< m \
             << " " << "File: " << __FILE__ \
             << " (Line: " << __LINE__ << ")" << "- " << _xx << endl; \
      return NDBT_FAILED; \
    } \
  } while (0)
33
34 #include <ConfigRetriever.hpp>
35 #include <mgmapi.h>
36 #include <mgmapi_config_parameters.h>
37 #include <mgmapi_configuration.hpp>
38
39 int
start(unsigned int & _backup_id)40 NdbBackup::start(unsigned int & _backup_id){
41
42
43 if (!isConnected())
44 return -1;
45
46 ndb_mgm_reply reply;
47 reply.return_code = 0;
48
49 if (ndb_mgm_start_backup(handle,
50 2, // wait until completed
51 &_backup_id,
52 &reply) == -1) {
53 g_err << "Error: " << ndb_mgm_get_latest_error(handle) << endl;
54 g_err << "Error msg: " << ndb_mgm_get_latest_error_msg(handle) << endl;
55 g_err << "Error desc: " << ndb_mgm_get_latest_error_desc(handle) << endl;
56 return -1;
57 }
58
59 if(reply.return_code != 0){
60 g_err << "PLEASE CHECK CODE NdbBackup.cpp line=" << __LINE__ << endl;
61 g_err << "Error: " << ndb_mgm_get_latest_error(handle) << endl;
62 g_err << "Error msg: " << ndb_mgm_get_latest_error_msg(handle) << endl;
63 g_err << "Error desc: " << ndb_mgm_get_latest_error_desc(handle) << endl;
64 return reply.return_code;
65 }
66 return 0;
67 }
68
69
70 const char *
getBackupDataDirForNode(int _node_id)71 NdbBackup::getBackupDataDirForNode(int _node_id){
72
73 /**
74 * Fetch configuration from management server
75 */
76 ndb_mgm_configuration *p;
77 if (connect())
78 return NULL;
79
80 if ((p = ndb_mgm_get_configuration(handle, 0)) == 0)
81 {
82 const char * s= ndb_mgm_get_latest_error_msg(handle);
83 if(s == 0)
84 s = "No error given!";
85
86 ndbout << "Could not fetch configuration" << endl;
87 ndbout << s << endl;
88 return NULL;
89 }
90
91 /**
92 * Setup cluster configuration data
93 */
94 ndb_mgm_configuration_iterator iter(* p, CFG_SECTION_NODE);
95 if (iter.find(CFG_NODE_ID, _node_id)){
96 ndbout << "Invalid configuration fetched, DB missing" << endl;
97 return NULL;
98 }
99
100 unsigned int type = NODE_TYPE_DB + 1;
101 if(iter.get(CFG_TYPE_OF_SECTION, &type) || type != NODE_TYPE_DB){
102 ndbout <<"type = " << type << endl;
103 ndbout <<"Invalid configuration fetched, I'm wrong type of node" << endl;
104 return NULL;
105 }
106
107 const char * path;
108 if (iter.get(CFG_DB_BACKUP_DATADIR, &path)){
109 ndbout << "BackupDataDir not found" << endl;
110 return NULL;
111 }
112
113 return path;
114
115 }
116
117 int
execRestore(bool _restore_data,bool _restore_meta,int _node_id,unsigned _backup_id)118 NdbBackup::execRestore(bool _restore_data,
119 bool _restore_meta,
120 int _node_id,
121 unsigned _backup_id){
122 const int buf_len = 1000;
123 char buf[buf_len];
124
125 ndbout << "getBackupDataDir "<< _node_id <<endl;
126
127 const char* path = getBackupDataDirForNode(_node_id);
128 if (path == NULL)
129 return -1;
130
131 ndbout << "getHostName "<< _node_id <<endl;
132 const char *host;
133 if (!getHostName(_node_id, &host)){
134 return -1;
135 }
136
137 /*
138 * Copy backup files to local dir
139 */
140
141 BaseString::snprintf(buf, buf_len,
142 "scp %s:%s/BACKUP/BACKUP-%d/BACKUP-%d*.%d.* .",
143 host, path,
144 _backup_id,
145 _backup_id,
146 _node_id);
147
148 ndbout << "buf: "<< buf <<endl;
149 int res = system(buf);
150
151 ndbout << "scp res: " << res << endl;
152
153 BaseString::snprintf(buf, 255, "%sndb_restore -c \"%s:%d\" -n %d -b %d %s %s .",
154 #if 1
155 "",
156 #else
157 "valgrind --leak-check=yes -v "
158 #endif
159 ndb_mgm_get_connected_host(handle),
160 ndb_mgm_get_connected_port(handle),
161 _node_id,
162 _backup_id,
163 _restore_data?"-r":"",
164 _restore_meta?"-m":"");
165
166 ndbout << "buf: "<< buf <<endl;
167 res = system(buf);
168
169 ndbout << "ndb_restore res: " << res << endl;
170
171 return res;
172
173 }
174
175 int
restore(unsigned _backup_id)176 NdbBackup::restore(unsigned _backup_id){
177
178 if (!isConnected())
179 return -1;
180
181 if (getStatus() != 0)
182 return -1;
183
184 int res;
185
186 // restore metadata first and data for first node
187 res = execRestore(true, true, ndbNodes[0].node_id, _backup_id);
188
189 // Restore data once for each node
190 for(size_t i = 1; i < ndbNodes.size(); i++){
191 res = execRestore(true, false, ndbNodes[i].node_id, _backup_id);
192 }
193
194 return 0;
195 }
196
/*
 * Error-insert codes for the node-failure-during-backup tests.
 * Each table is walked by NdbBackup::NF(), one backup attempt per code.
 */

// Master failure (used by NFMaster)
int NFDuringBackupM_codes[] = {
  10003, 10004, 10007, 10008,
  10009, 10010, 10012, 10013
};

// Slave failure (used by NFSlave and NFMasterAsSlave)
int NFDuringBackupS_codes[] = {
  10014, 10015, 10016,
  10017, 10018, 10020
};

// Master takeover etc...
int NFDuringBackupSL_codes[] = {
  10001, 10002, 10021
};
228
229 int
NFMaster(NdbRestarter & _restarter)230 NdbBackup::NFMaster(NdbRestarter& _restarter){
231 const int sz = sizeof(NFDuringBackupM_codes)/sizeof(NFDuringBackupM_codes[0]);
232 return NF(_restarter, NFDuringBackupM_codes, sz, true);
233 }
234
235 int
NFMasterAsSlave(NdbRestarter & _restarter)236 NdbBackup::NFMasterAsSlave(NdbRestarter& _restarter){
237 const int sz = sizeof(NFDuringBackupS_codes)/sizeof(NFDuringBackupS_codes[0]);
238 return NF(_restarter, NFDuringBackupS_codes, sz, true);
239 }
240
241 int
NFSlave(NdbRestarter & _restarter)242 NdbBackup::NFSlave(NdbRestarter& _restarter){
243 const int sz = sizeof(NFDuringBackupS_codes)/sizeof(NFDuringBackupS_codes[0]);
244 return NF(_restarter, NFDuringBackupS_codes, sz, false);
245 }
246
247 int
NF(NdbRestarter & _restarter,int * NFDuringBackup_codes,const int sz,bool onMaster)248 NdbBackup::NF(NdbRestarter& _restarter, int *NFDuringBackup_codes, const int sz, bool onMaster){
249 int nNodes = _restarter.getNumDbNodes();
250 {
251 if(nNodes == 1)
252 return NDBT_OK;
253
254 int nodeId = _restarter.getMasterNodeId();
255
256 CHECK(_restarter.restartOneDbNode(nodeId, false, true, true) == 0,
257 "Could not restart node "<< nodeId);
258
259 CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0,
260 "waitNodesNoStart failed");
261
262 CHECK(_restarter.startNodes(&nodeId, 1) == 0,
263 "failed to start node");
264 }
265
266 CHECK(_restarter.waitClusterStarted() == 0,
267 "waitClusterStarted failed");
268
269 myRandom48Init(NdbTick_CurrentMillisecond());
270
271 for(int i = 0; i<sz; i++){
272
273 int error = NFDuringBackup_codes[i];
274 unsigned int backupId;
275
276 const int masterNodeId = _restarter.getMasterNodeId();
277 CHECK(masterNodeId > 0, "getMasterNodeId failed");
278 int nodeId;
279
280 nodeId = masterNodeId;
281 if (!onMaster) {
282 int randomId;
283 while (nodeId == masterNodeId) {
284 randomId = myRandom48(nNodes);
285 nodeId = _restarter.getDbNodeId(randomId);
286 }
287 }
288
289 g_err << "NdbBackup::NF node = " << nodeId
290 << " error code = " << error << " masterNodeId = "
291 << masterNodeId << endl;
292
293
294 int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
295 CHECK(_restarter.dumpStateOneNode(nodeId, val, 2) == 0,
296 "failed to set RestartOnErrorInsert");
297 CHECK(_restarter.insertErrorInNode(nodeId, error) == 0,
298 "failed to set error insert");
299
300 g_info << "error inserted" << endl;
301 NdbSleep_SecSleep(1);
302
303 g_info << "starting backup" << endl;
304 int r = start(backupId);
305 g_info << "r = " << r
306 << " (which should fail) started with id = " << backupId << endl;
307 if (r == 0) {
308 g_err << "Backup should have failed on error_insertion " << error << endl
309 << "Master = " << masterNodeId << "Node = " << nodeId << endl;
310 return NDBT_FAILED;
311 }
312
313 CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0,
314 "waitNodesNoStart failed");
315
316 g_info << "number of nodes running " << _restarter.getNumDbNodes() << endl;
317
318 if (_restarter.getNumDbNodes() != nNodes) {
319 g_err << "Failure: cluster not up" << endl;
320 return NDBT_FAILED;
321 }
322
323 g_info << "starting new backup" << endl;
324 CHECK(start(backupId) == 0,
325 "failed to start backup");
326 g_info << "(which should succeed) started with id = " << backupId << endl;
327
328 g_info << "starting node" << endl;
329 CHECK(_restarter.startNodes(&nodeId, 1) == 0,
330 "failed to start node");
331
332 CHECK(_restarter.waitClusterStarted() == 0,
333 "waitClusterStarted failed");
334 g_info << "node started" << endl;
335
336 int val2[] = { 24, 2424 };
337 CHECK(_restarter.dumpStateAllNodes(val2, 2) == 0,
338 "failed to check backup resources RestartOnErrorInsert");
339
340 CHECK(_restarter.insertErrorInNode(nodeId, 10099) == 0,
341 "failed to set error insert");
342
343 NdbSleep_SecSleep(1);
344 }
345
346 return NDBT_OK;
347 }
348
/*
 * Error-insert codes for the backup-failure tests.
 * Each table is walked by NdbBackup::Fail(), one backup attempt per code.
 */

// Used by FailSlave and FailMasterAsSlave
int FailS_codes[] = {
  10025, 10027, 10033, 10035, 10036
};

// Used by FailMaster
int FailM_codes[] = {
  10023, 10024, 10025,
  10026, 10027, 10028,
  10031, 10033, 10035
};
370
371 int
FailMaster(NdbRestarter & _restarter)372 NdbBackup::FailMaster(NdbRestarter& _restarter){
373 const int sz = sizeof(FailM_codes)/sizeof(FailM_codes[0]);
374 return Fail(_restarter, FailM_codes, sz, true);
375 }
376
377 int
FailMasterAsSlave(NdbRestarter & _restarter)378 NdbBackup::FailMasterAsSlave(NdbRestarter& _restarter){
379 const int sz = sizeof(FailS_codes)/sizeof(FailS_codes[0]);
380 return Fail(_restarter, FailS_codes, sz, true);
381 }
382
383 int
FailSlave(NdbRestarter & _restarter)384 NdbBackup::FailSlave(NdbRestarter& _restarter){
385 const int sz = sizeof(FailS_codes)/sizeof(FailS_codes[0]);
386 return Fail(_restarter, FailS_codes, sz, false);
387 }
388
389 int
Fail(NdbRestarter & _restarter,int * Fail_codes,const int sz,bool onMaster)390 NdbBackup::Fail(NdbRestarter& _restarter, int *Fail_codes, const int sz, bool onMaster){
391
392 CHECK(_restarter.waitClusterStarted() == 0,
393 "waitClusterStarted failed");
394
395 int nNodes = _restarter.getNumDbNodes();
396
397 myRandom48Init(NdbTick_CurrentMillisecond());
398
399 for(int i = 0; i<sz; i++){
400 int error = Fail_codes[i];
401 unsigned int backupId;
402
403 const int masterNodeId = _restarter.getMasterNodeId();
404 CHECK(masterNodeId > 0, "getMasterNodeId failed");
405 int nodeId;
406
407 nodeId = masterNodeId;
408 if (!onMaster) {
409 int randomId;
410 while (nodeId == masterNodeId) {
411 randomId = myRandom48(nNodes);
412 nodeId = _restarter.getDbNodeId(randomId);
413 }
414 }
415
416 g_err << "NdbBackup::Fail node = " << nodeId
417 << " error code = " << error << " masterNodeId = "
418 << masterNodeId << endl;
419
420 CHECK(_restarter.insertErrorInNode(nodeId, error) == 0,
421 "failed to set error insert");
422
423 g_info << "error inserted" << endl;
424 g_info << "waiting some before starting backup" << endl;
425
426 g_info << "starting backup" << endl;
427 int r = start(backupId);
428 g_info << "r = " << r
429 << " (which should fail) started with id = " << backupId << endl;
430 if (r == 0) {
431 g_err << "Backup should have failed on error_insertion " << error << endl
432 << "Master = " << masterNodeId << "Node = " << nodeId << endl;
433 return NDBT_FAILED;
434 }
435
436 CHECK(_restarter.waitClusterStarted() == 0,
437 "waitClusterStarted failed");
438
439 CHECK(_restarter.insertErrorInNode(nodeId, 10099) == 0,
440 "failed to set error insert");
441
442 NdbSleep_SecSleep(5);
443
444 int val2[] = { 24, 2424 };
445 CHECK(_restarter.dumpStateAllNodes(val2, 2) == 0,
446 "failed to check backup resources RestartOnErrorInsert");
447
448 }
449
450 return NDBT_OK;
451 }
452
453