1 /* Copyright (c) 2003-2007 MySQL AB
2    Use is subject to license terms
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA */
16 
17 #include <signaldata/DumpStateOrd.hpp>
18 #include <NdbBackup.hpp>
19 #include <NdbOut.hpp>
20 #include <NDBT_Output.hpp>
21 #include <NdbConfig.h>
22 #include <ndb_version.h>
23 #include <NDBT.hpp>
24 #include <NdbSleep.h>
25 #include <random.h>
26 #include <NdbTick.h>
27 
/**
 * Evaluate the condition b exactly once. If it is false, print "ERR: "
 * followed by the message m, the source file, the line and the evaluated
 * value, then return NDBT_FAILED from the enclosing function.
 *
 * m is streamed with operator<< and may itself be a chain such as
 * "text" << value, which is why it is deliberately not parenthesised.
 *
 * The body is wrapped in do { } while (0) so that "CHECK(...);" is a
 * single statement and stays correct inside an unbraced if/else.
 */
#define CHECK(b, m) do { int _xx = (b); if (!(_xx)) { \
  ndbout << "ERR: "<< m \
           << "   " << "File: " << __FILE__ \
           << " (Line: " << __LINE__ << ")" << "- " << _xx << endl; \
  return NDBT_FAILED; } } while (0)
33 
34 #include <ConfigRetriever.hpp>
35 #include <mgmapi.h>
36 #include <mgmapi_config_parameters.h>
37 #include <mgmapi_configuration.hpp>
38 
39 int
start(unsigned int & _backup_id)40 NdbBackup::start(unsigned int & _backup_id){
41 
42 
43   if (!isConnected())
44     return -1;
45 
46   ndb_mgm_reply reply;
47   reply.return_code = 0;
48 
49   if (ndb_mgm_start_backup(handle,
50 			   2, // wait until completed
51 			   &_backup_id,
52 			   &reply) == -1) {
53     g_err << "Error: " << ndb_mgm_get_latest_error(handle) << endl;
54     g_err << "Error msg: " << ndb_mgm_get_latest_error_msg(handle) << endl;
55     g_err << "Error desc: " << ndb_mgm_get_latest_error_desc(handle) << endl;
56     return -1;
57   }
58 
59   if(reply.return_code != 0){
60     g_err  << "PLEASE CHECK CODE NdbBackup.cpp line=" << __LINE__ << endl;
61     g_err << "Error: " << ndb_mgm_get_latest_error(handle) << endl;
62     g_err << "Error msg: " << ndb_mgm_get_latest_error_msg(handle) << endl;
63     g_err << "Error desc: " << ndb_mgm_get_latest_error_desc(handle) << endl;
64     return reply.return_code;
65   }
66   return 0;
67 }
68 
69 
70 const char *
getBackupDataDirForNode(int _node_id)71 NdbBackup::getBackupDataDirForNode(int _node_id){
72 
73   /**
74    * Fetch configuration from management server
75    */
76   ndb_mgm_configuration *p;
77   if (connect())
78     return NULL;
79 
80   if ((p = ndb_mgm_get_configuration(handle, 0)) == 0)
81   {
82     const char * s= ndb_mgm_get_latest_error_msg(handle);
83     if(s == 0)
84       s = "No error given!";
85 
86     ndbout << "Could not fetch configuration" << endl;
87     ndbout << s << endl;
88     return NULL;
89   }
90 
91   /**
92    * Setup cluster configuration data
93    */
94   ndb_mgm_configuration_iterator iter(* p, CFG_SECTION_NODE);
95   if (iter.find(CFG_NODE_ID, _node_id)){
96     ndbout << "Invalid configuration fetched, DB missing" << endl;
97     return NULL;
98   }
99 
100   unsigned int type = NODE_TYPE_DB + 1;
101   if(iter.get(CFG_TYPE_OF_SECTION, &type) || type != NODE_TYPE_DB){
102     ndbout <<"type = " << type << endl;
103     ndbout <<"Invalid configuration fetched, I'm wrong type of node" << endl;
104     return NULL;
105   }
106 
107   const char * path;
108   if (iter.get(CFG_DB_BACKUP_DATADIR, &path)){
109     ndbout << "BackupDataDir not found" << endl;
110     return NULL;
111   }
112 
113   return path;
114 
115 }
116 
117 int
execRestore(bool _restore_data,bool _restore_meta,int _node_id,unsigned _backup_id)118 NdbBackup::execRestore(bool _restore_data,
119 		       bool _restore_meta,
120 		       int _node_id,
121 		       unsigned _backup_id){
122   const int buf_len = 1000;
123   char buf[buf_len];
124 
125   ndbout << "getBackupDataDir "<< _node_id <<endl;
126 
127   const char* path = getBackupDataDirForNode(_node_id);
128   if (path == NULL)
129     return -1;
130 
131   ndbout << "getHostName "<< _node_id <<endl;
132   const char *host;
133   if (!getHostName(_node_id, &host)){
134     return -1;
135   }
136 
137   /*
138    * Copy  backup files to local dir
139    */
140 
141   BaseString::snprintf(buf, buf_len,
142 	   "scp %s:%s/BACKUP/BACKUP-%d/BACKUP-%d*.%d.* .",
143 	   host, path,
144 	   _backup_id,
145 	   _backup_id,
146 	   _node_id);
147 
148   ndbout << "buf: "<< buf <<endl;
149   int res = system(buf);
150 
151   ndbout << "scp res: " << res << endl;
152 
153   BaseString::snprintf(buf, 255, "%sndb_restore -c \"%s:%d\" -n %d -b %d %s %s .",
154 #if 1
155 	   "",
156 #else
157 	   "valgrind --leak-check=yes -v "
158 #endif
159 	   ndb_mgm_get_connected_host(handle),
160 	   ndb_mgm_get_connected_port(handle),
161 	   _node_id,
162 	   _backup_id,
163 	   _restore_data?"-r":"",
164 	   _restore_meta?"-m":"");
165 
166   ndbout << "buf: "<< buf <<endl;
167   res = system(buf);
168 
169   ndbout << "ndb_restore res: " << res << endl;
170 
171   return res;
172 
173 }
174 
175 int
restore(unsigned _backup_id)176 NdbBackup::restore(unsigned _backup_id){
177 
178   if (!isConnected())
179     return -1;
180 
181   if (getStatus() != 0)
182     return -1;
183 
184   int res;
185 
186   // restore metadata first and data for first node
187   res = execRestore(true, true, ndbNodes[0].node_id, _backup_id);
188 
189   // Restore data once for each node
190   for(size_t i = 1; i < ndbNodes.size(); i++){
191     res = execRestore(true, false, ndbNodes[i].node_id, _backup_id);
192   }
193 
194   return 0;
195 }
196 
// Master failure: error-insert codes that NFMaster() hands to NF()
int NFDuringBackupM_codes[] = {
  10003, 10004, 10007, 10008,
  10009, 10010, 10012, 10013
};
209 
// Slave failure: error-insert codes that NFMasterAsSlave() and NFSlave()
// hand to NF()
int NFDuringBackupS_codes[] = {
  10014, 10015, 10016,
  10017, 10018, 10020
};
220 
// Master takeover etc...
// (not referenced by any function in the visible part of this file)
int NFDuringBackupSL_codes[] = { 10001, 10002, 10021 };
228 
229 int
NFMaster(NdbRestarter & _restarter)230 NdbBackup::NFMaster(NdbRestarter& _restarter){
231   const int sz = sizeof(NFDuringBackupM_codes)/sizeof(NFDuringBackupM_codes[0]);
232   return NF(_restarter, NFDuringBackupM_codes, sz, true);
233 }
234 
235 int
NFMasterAsSlave(NdbRestarter & _restarter)236 NdbBackup::NFMasterAsSlave(NdbRestarter& _restarter){
237   const int sz = sizeof(NFDuringBackupS_codes)/sizeof(NFDuringBackupS_codes[0]);
238   return NF(_restarter, NFDuringBackupS_codes, sz, true);
239 }
240 
241 int
NFSlave(NdbRestarter & _restarter)242 NdbBackup::NFSlave(NdbRestarter& _restarter){
243   const int sz = sizeof(NFDuringBackupS_codes)/sizeof(NFDuringBackupS_codes[0]);
244   return NF(_restarter, NFDuringBackupS_codes, sz, false);
245 }
246 
247 int
NF(NdbRestarter & _restarter,int * NFDuringBackup_codes,const int sz,bool onMaster)248 NdbBackup::NF(NdbRestarter& _restarter, int *NFDuringBackup_codes, const int sz, bool onMaster){
249   int nNodes = _restarter.getNumDbNodes();
250   {
251     if(nNodes == 1)
252       return NDBT_OK;
253 
254     int nodeId = _restarter.getMasterNodeId();
255 
256     CHECK(_restarter.restartOneDbNode(nodeId, false, true, true) == 0,
257 	  "Could not restart node "<< nodeId);
258 
259     CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0,
260 	  "waitNodesNoStart failed");
261 
262     CHECK(_restarter.startNodes(&nodeId, 1) == 0,
263 	  "failed to start node");
264   }
265 
266   CHECK(_restarter.waitClusterStarted() == 0,
267 	"waitClusterStarted failed");
268 
269   myRandom48Init(NdbTick_CurrentMillisecond());
270 
271   for(int i = 0; i<sz; i++){
272 
273     int error = NFDuringBackup_codes[i];
274     unsigned int backupId;
275 
276     const int masterNodeId = _restarter.getMasterNodeId();
277     CHECK(masterNodeId > 0, "getMasterNodeId failed");
278     int nodeId;
279 
280     nodeId = masterNodeId;
281     if (!onMaster) {
282       int randomId;
283       while (nodeId == masterNodeId) {
284 	randomId = myRandom48(nNodes);
285 	nodeId = _restarter.getDbNodeId(randomId);
286       }
287     }
288 
289     g_err << "NdbBackup::NF node = " << nodeId
290 	   << " error code = " << error << " masterNodeId = "
291 	   << masterNodeId << endl;
292 
293 
294     int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
295     CHECK(_restarter.dumpStateOneNode(nodeId, val, 2) == 0,
296 	  "failed to set RestartOnErrorInsert");
297     CHECK(_restarter.insertErrorInNode(nodeId, error) == 0,
298 	  "failed to set error insert");
299 
300     g_info << "error inserted"  << endl;
301     NdbSleep_SecSleep(1);
302 
303     g_info << "starting backup"  << endl;
304     int r = start(backupId);
305     g_info << "r = " << r
306 	   << " (which should fail) started with id = "  << backupId << endl;
307     if (r == 0) {
308       g_err << "Backup should have failed on error_insertion " << error << endl
309 	    << "Master = " << masterNodeId << "Node = " << nodeId << endl;
310       return NDBT_FAILED;
311     }
312 
313     CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0,
314 	  "waitNodesNoStart failed");
315 
316     g_info << "number of nodes running " << _restarter.getNumDbNodes() << endl;
317 
318     if (_restarter.getNumDbNodes() != nNodes) {
319       g_err << "Failure: cluster not up" << endl;
320       return NDBT_FAILED;
321     }
322 
323     g_info << "starting new backup"  << endl;
324     CHECK(start(backupId) == 0,
325 	  "failed to start backup");
326     g_info << "(which should succeed) started with id = "  << backupId << endl;
327 
328     g_info << "starting node"  << endl;
329     CHECK(_restarter.startNodes(&nodeId, 1) == 0,
330 	  "failed to start node");
331 
332     CHECK(_restarter.waitClusterStarted() == 0,
333 	  "waitClusterStarted failed");
334     g_info << "node started"  << endl;
335 
336     int val2[] = { 24, 2424 };
337     CHECK(_restarter.dumpStateAllNodes(val2, 2) == 0,
338 	  "failed to check backup resources RestartOnErrorInsert");
339 
340     CHECK(_restarter.insertErrorInNode(nodeId, 10099) == 0,
341 	  "failed to set error insert");
342 
343     NdbSleep_SecSleep(1);
344   }
345 
346   return NDBT_OK;
347 }
348 
// Error-insert codes that FailMasterAsSlave() and FailSlave() hand to Fail()
int FailS_codes[] = { 10025, 10027, 10033, 10035, 10036 };
357 
// Error-insert codes that FailMaster() hands to Fail()
int FailM_codes[] = {
  10023, 10024, 10025,
  10026, 10027, 10028,
  10031, 10033, 10035
};
370 
371 int
FailMaster(NdbRestarter & _restarter)372 NdbBackup::FailMaster(NdbRestarter& _restarter){
373   const int sz = sizeof(FailM_codes)/sizeof(FailM_codes[0]);
374   return Fail(_restarter, FailM_codes, sz, true);
375 }
376 
377 int
FailMasterAsSlave(NdbRestarter & _restarter)378 NdbBackup::FailMasterAsSlave(NdbRestarter& _restarter){
379   const int sz = sizeof(FailS_codes)/sizeof(FailS_codes[0]);
380   return Fail(_restarter, FailS_codes, sz, true);
381 }
382 
383 int
FailSlave(NdbRestarter & _restarter)384 NdbBackup::FailSlave(NdbRestarter& _restarter){
385   const int sz = sizeof(FailS_codes)/sizeof(FailS_codes[0]);
386   return Fail(_restarter, FailS_codes, sz, false);
387 }
388 
389 int
Fail(NdbRestarter & _restarter,int * Fail_codes,const int sz,bool onMaster)390 NdbBackup::Fail(NdbRestarter& _restarter, int *Fail_codes, const int sz, bool onMaster){
391 
392   CHECK(_restarter.waitClusterStarted() == 0,
393 	"waitClusterStarted failed");
394 
395   int nNodes = _restarter.getNumDbNodes();
396 
397   myRandom48Init(NdbTick_CurrentMillisecond());
398 
399   for(int i = 0; i<sz; i++){
400     int error = Fail_codes[i];
401     unsigned int backupId;
402 
403     const int masterNodeId = _restarter.getMasterNodeId();
404     CHECK(masterNodeId > 0, "getMasterNodeId failed");
405     int nodeId;
406 
407     nodeId = masterNodeId;
408     if (!onMaster) {
409       int randomId;
410       while (nodeId == masterNodeId) {
411 	randomId = myRandom48(nNodes);
412 	nodeId = _restarter.getDbNodeId(randomId);
413       }
414     }
415 
416     g_err << "NdbBackup::Fail node = " << nodeId
417 	   << " error code = " << error << " masterNodeId = "
418 	   << masterNodeId << endl;
419 
420     CHECK(_restarter.insertErrorInNode(nodeId, error) == 0,
421 	  "failed to set error insert");
422 
423     g_info << "error inserted"  << endl;
424     g_info << "waiting some before starting backup"  << endl;
425 
426     g_info << "starting backup"  << endl;
427     int r = start(backupId);
428     g_info << "r = " << r
429 	   << " (which should fail) started with id = "  << backupId << endl;
430     if (r == 0) {
431       g_err << "Backup should have failed on error_insertion " << error << endl
432 	    << "Master = " << masterNodeId << "Node = " << nodeId << endl;
433       return NDBT_FAILED;
434     }
435 
436     CHECK(_restarter.waitClusterStarted() == 0,
437 	  "waitClusterStarted failed");
438 
439     CHECK(_restarter.insertErrorInNode(nodeId, 10099) == 0,
440 	  "failed to set error insert");
441 
442     NdbSleep_SecSleep(5);
443 
444     int val2[] = { 24, 2424 };
445     CHECK(_restarter.dumpStateAllNodes(val2, 2) == 0,
446 	  "failed to check backup resources RestartOnErrorInsert");
447 
448   }
449 
450   return NDBT_OK;
451 }
452 
453