/*
   Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
   as published by the Free Software Foundation.

   This program is also distributed with certain software (including
   but not limited to OpenSSL) that is licensed under separate terms,
   as designated in a particular file or component or in included license
   documentation.  The authors of MySQL hereby grant you an additional
   permission to link the program and your derivative works with the
   separately licensed software that they have included with MySQL.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License, version 2.0, for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
*/

#include <NdbRestarts.hpp>
#include <NDBT.hpp>
#include <string.h>
#include <NdbSleep.h>
#include <kernel/ndb_limits.h>
#include <signaldata/DumpStateOrd.hpp>
#include <NdbEnv.h>
#include <NDBT_Test.hpp>

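/* Common argument list passed to every restart implementation function below */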
#define F_ARGS NDBT_Context* ctx, NdbRestarter& _restarter, const NdbRestarts::NdbRestart* _restart

int restartRandomNodeGraceful(F_ARGS);
int restartRandomNodeAbort(F_ARGS);
int restartRandomNodeError(F_ARGS);
int restartRandomNodeInitial(F_ARGS);
int restartNFDuringNR(F_ARGS);
int restartMasterNodeError(F_ARGS);
int twoNodeFailure(F_ARGS);
int fiftyPercentFail(F_ARGS);
int twoMasterNodeFailure(F_ARGS);
int restartAllNodesGraceful(F_ARGS);
int restartAllNodesAbort(F_ARGS);
int restartAllNodesError9999(F_ARGS);
int fiftyPercentStopAndWait(F_ARGS);
int restartNodeDuringLCP(F_ARGS);
int stopOnError(F_ARGS);
int getRandomNodeId(NdbRestarter& _restarter);

/**
 * Define the list of restarts
 *  - name of the restart
 *  - function performing the restart
 *  - required number of nodes
 *  - ...
 *  - arg1, used depending on the restart
 *  - arg2, used depending on the restart
 */

const NdbRestarts::NdbRestart NdbRestarts::m_restarts[] = {
  /*********************************************************
   *
   *  NODE RESTARTS with 1 node restarted
   *
   *********************************************************/
  /**
   * Restart a randomly selected node
   * with graceful shutdown
   */
  NdbRestart("RestartRandomNode",
             NODE_RESTART,
             restartRandomNodeGraceful,
             2),
  /**
   * Restart a randomly selected node
   * with immediate (abort) shutdown
   */
  NdbRestart("RestartRandomNodeAbort",
             NODE_RESTART,
             restartRandomNodeAbort,
             2),
  /**
   * Restart a randomly selected node
   * with error insert
   *
   */
  NdbRestart("RestartRandomNodeError",
             NODE_RESTART,
             restartRandomNodeError,
             2),
  /**
   * Restart the master node
   * with error insert
   */
  NdbRestart("RestartMasterNodeError",
             NODE_RESTART,
             restartMasterNodeError,
             2),
  /**
   * Restart a randomly selected node without filesystem
   *
   */
  NdbRestart("RestartRandomNodeInitial",
             NODE_RESTART,
             restartRandomNodeInitial,
             2),
  /**
   * Restart a randomly selected node and then
   * crash it while restarting
   *
   */
  NdbRestart("RestartNFDuringNR",
             NODE_RESTART,
             restartNFDuringNR,
             2),

  /**
   * Set StopOnError and crash the node by sending
   * SYSTEM_ERROR to it
   *
   */
  NdbRestart("StopOnError",
             NODE_RESTART,
             stopOnError,
             1),

  /*********************************************************
   *
   *  MULTIPLE NODE RESTARTS with more than 1 node
   *
   *********************************************************/
  /**
   * 2 nodes restart, select nodes to restart randomly and restart
   * with a small random delay between restarts
   */
  NdbRestart("TwoNodeFailure",
             MULTIPLE_NODE_RESTART,
             twoNodeFailure,
             4),
  /**
   * 2 nodes restart, select master nodes and restart with
   * a small random delay between restarts
   */

  NdbRestart("TwoMasterNodeFailure",
             MULTIPLE_NODE_RESTART,
             twoMasterNodeFailure,
             4),

  NdbRestart("FiftyPercentFail",
             MULTIPLE_NODE_RESTART,
             fiftyPercentFail,
             2),

  /*********************************************************
   *
   *  SYSTEM RESTARTS
   *
   *********************************************************/
  /**
   * Restart all nodes with graceful shutdown
   *
   */

  NdbRestart("RestartAllNodes",
             SYSTEM_RESTART,
             restartAllNodesGraceful,
             1),
  /**
   * Restart all nodes immediately without
   * graceful shutdown
   */
  NdbRestart("RestartAllNodesAbort",
             SYSTEM_RESTART,
             restartAllNodesAbort,
             1),
  /**
   * Restart all nodes with error insert 9999
   * TODO! We can later add more errors like 9998, 9997 etc.
   */
  NdbRestart("RestartAllNodesError9999",
             SYSTEM_RESTART,
             restartAllNodesError9999,
             1),
  /**
   * Stop 50% of all nodes with error insert 9999
   * Wait a random number of seconds (up to two minutes)
   * Stop the rest of the nodes and then start all again
   */
  NdbRestart("FiftyPercentStopAndWait",
             SYSTEM_RESTART,
             fiftyPercentStopAndWait,
             2),
  /**
   * Restart a master node during LCP with error inserts.
   */
  NdbRestart("RestartNodeDuringLCP",
             NODE_RESTART,
             restartNodeDuringLCP,
             2),
};

const int NdbRestarts::m_NoOfRestarts = sizeof(m_restarts) / sizeof(NdbRestart);


const NdbRestarts::NdbErrorInsert NdbRestarts::m_errors[] = {
  NdbErrorInsert("Error9999", 9999)
};

const int NdbRestarts::m_NoOfErrors = sizeof(m_errors) / sizeof(NdbErrorInsert);

NdbRestarts::NdbRestart::NdbRestart(const char* _name,
                                    NdbRestartType _type,
                                    restartFunc* _func,
                                    int _requiredNodes,
                                    int _arg1){
  m_name = _name;
  m_type = _type;
  m_restartFunc = _func;
  m_numRequiredNodes = _requiredNodes;
  //  m_arg1 = _arg1;
}


int NdbRestarts::getNumRestarts(){
  return m_NoOfRestarts;
}

const NdbRestarts::NdbRestart* NdbRestarts::getRestart(int _num){
  if (_num >= m_NoOfRestarts)
    return NULL;

  return &m_restarts[_num];
}

const NdbRestarts::NdbRestart* NdbRestarts::getRestart(const char* _name){
  for(int i = 0; i < m_NoOfRestarts; i++){
    if (strcmp(m_restarts[i].m_name, _name) == 0){
      return &m_restarts[i];
    }
  }
  g_err << "The restart \"" << _name << "\" was not found in NdbRestarts" << endl;
  return NULL;
}


int NdbRestarts::executeRestart(NDBT_Context* ctx,
                                const NdbRestarts::NdbRestart* _restart,
                                unsigned int _timeout){
  // Check that there are enough nodes in the cluster
  // for this test
  NdbRestarter restarter;
  if (_restart->m_numRequiredNodes > restarter.getNumDbNodes()){
    g_err << "This test requires " << _restart->m_numRequiredNodes << " nodes, "
          << "but there are only " << restarter.getNumDbNodes()
          << " nodes in the cluster" << endl;
    return NDBT_OK;
  }
  if (restarter.waitClusterStarted(120) != 0){
    // If the cluster is not started when we shall perform the restart,
    // the restart cannot be executed and the test fails
    return NDBT_FAILED;
  }

  int res = _restart->m_restartFunc(ctx, restarter, _restart);

  // Sleep a little, waiting for the nodes to react to the command
  NdbSleep_SecSleep(2);

  if (_timeout == 0){
    // If timeout == 0, wait forever
    while(restarter.waitClusterStarted(60) != 0)
      g_err << "Cluster is not started after restart. Waiting 60s more..."
            << endl;
  } else {
    if (restarter.waitClusterStarted(_timeout) != 0){
      g_err << "Cluster failed to start" << endl;
      res = NDBT_FAILED;
    }
  }

  return res;
}

int NdbRestarts::executeRestart(NDBT_Context* ctx,
                                int _num,
                                unsigned int _timeout){
  const NdbRestarts::NdbRestart* r = getRestart(_num);
  if (r == NULL)
    return NDBT_FAILED;

  int res = executeRestart(ctx, r, _timeout);
  return res;
}

int NdbRestarts::executeRestart(NDBT_Context* ctx,
                                const char* _name,
                                unsigned int _timeout){
  const NdbRestarts::NdbRestart* r = getRestart(_name);
  if (r == NULL)
    return NDBT_FAILED;

  int res = executeRestart(ctx, r, _timeout);
  return res;
}
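
/*
  Typical usage from an NDBT test step (a sketch; the restart name and
  timeout shown here are only examples):

    NdbRestarts restarts;
    if (restarts.executeRestart(ctx, "RestartRandomNodeAbort", 300) != 0)
      return NDBT_FAILED;
*/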

void NdbRestarts::listRestarts(NdbRestartType _type){
  for(int i = 0; i < m_NoOfRestarts; i++){
    if (m_restarts[i].m_type == _type)
      ndbout << " " << m_restarts[i].m_name << ", min "
             << m_restarts[i].m_numRequiredNodes
             << " nodes" << endl;
  }
}

void NdbRestarts::listRestarts(){
  ndbout << "NODE RESTARTS" << endl;
  listRestarts(NODE_RESTART);
  ndbout << "MULTIPLE NODE RESTARTS" << endl;
  listRestarts(MULTIPLE_NODE_RESTART);
  ndbout << "SYSTEM RESTARTS" << endl;
  listRestarts(SYSTEM_RESTART);
}

NdbRestarts::NdbErrorInsert::NdbErrorInsert(const char* _name,
                                            int _errorNo){

  m_name = _name;
  m_errorNo = _errorNo;
}

int NdbRestarts::getNumErrorInserts(){
  return m_NoOfErrors;
}

const NdbRestarts::NdbErrorInsert* NdbRestarts::getError(int _num){
  if (_num >= m_NoOfErrors)
    return NULL;
  return &m_errors[_num];
}

const NdbRestarts::NdbErrorInsert* NdbRestarts::getRandomError(){
  int randomId = myRandom48(m_NoOfErrors);
  return &m_errors[randomId];
}



/**
 *
 * IMPLEMENTATION OF THE DIFFERENT RESTARTS
 * Each function should perform its action
 * and return NDBT_OK or NDBT_FAILED
 *
 */

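/*
  Helper used by the restart implementations below: evaluates the condition,
  logs the message together with file and line on failure, and makes the
  enclosing function return NDBT_FAILED.
*/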
#define CHECK(b, m) { int _xx = b; if (!(_xx)) { \
  ndbout << "ERR: " << m \
         << "   " << "File: " << __FILE__ \
         << " (Line: " << __LINE__ << ")" << "- " << _xx << endl; \
  return NDBT_FAILED; } }



int restartRandomNodeGraceful(F_ARGS){

  myRandom48Init((long)NdbTick_CurrentMillisecond());
  int randomId = myRandom48(_restarter.getNumDbNodes());
  int nodeId = _restarter.getDbNodeId(randomId);

  g_info << _restart->m_name << ": node = " << nodeId << endl;

  CHECK(_restarter.restartOneDbNode(nodeId) == 0,
        "Could not restart node " << nodeId);

  return NDBT_OK;
}

int restartRandomNodeAbort(F_ARGS){

  myRandom48Init((long)NdbTick_CurrentMillisecond());
  int randomId = myRandom48(_restarter.getNumDbNodes());
  int nodeId = _restarter.getDbNodeId(randomId);

  g_info << _restart->m_name << ": node = " << nodeId << endl;

  CHECK(_restarter.restartOneDbNode(nodeId, false, false, true) == 0,
        "Could not restart node " << nodeId);

  return NDBT_OK;
}

int restartRandomNodeError(F_ARGS){

  myRandom48Init((long)NdbTick_CurrentMillisecond());
  int randomId = myRandom48(_restarter.getNumDbNodes());
  int nodeId = _restarter.getDbNodeId(randomId);

  ndbout << _restart->m_name << ": node = " << nodeId << endl;

  CHECK(_restarter.insertErrorInNode(nodeId, 9999) == 0,
        "Could not restart node " << nodeId);

  return NDBT_OK;
}

int restartMasterNodeError(F_ARGS){

  int nodeId = _restarter.getDbNodeId(0);

  g_info << _restart->m_name << ": node = " << nodeId << endl;

  CHECK(_restarter.insertErrorInNode(nodeId, 39999) == 0,
        "Could not restart node " << nodeId);

  return NDBT_OK;
}

int restartRandomNodeInitial(F_ARGS){

  myRandom48Init((long)NdbTick_CurrentMillisecond());
  int randomId = myRandom48(_restarter.getNumDbNodes());
  int nodeId = _restarter.getDbNodeId(randomId);

  g_info << _restart->m_name << ": node = " << nodeId << endl;

  CHECK(_restarter.restartOneDbNode(nodeId, true) == 0,
        "Could not restart node " << nodeId);

  return NDBT_OK;
}

int twoNodeFailure(F_ARGS){

  myRandom48Init((long)NdbTick_CurrentMillisecond());
  int randomId = myRandom48(_restarter.getNumDbNodes());
  int n[2];
  n[0] = _restarter.getDbNodeId(randomId);
  n[1] = _restarter.getRandomNodeOtherNodeGroup(n[0], rand());
  g_info << _restart->m_name << ": node = " << n[0] << endl;

  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
  CHECK(_restarter.dumpStateOneNode(n[0], val2, 2) == 0,
        "Failed to dump");
  CHECK(_restarter.dumpStateOneNode(n[1], val2, 2) == 0,
        "Failed to dump");

  CHECK(_restarter.insertErrorInNode(n[0], 9999) == 0,
        "Could not restart node " << n[0]);

  // Create random value, max 3 secs
  int max = 3000;
  int ms = (myRandom48(max)) + 1;
  g_info << "Waiting for " << ms << "(" << max
         << ") ms " << endl;
  NdbSleep_MilliSleep(ms);

  g_info << _restart->m_name << ": node = " << n[1] << endl;
  CHECK(_restarter.insertErrorInNode(n[1], 9999) == 0,
        "Could not restart node " << n[1]);

  CHECK(_restarter.waitNodesNoStart(n, 2) == 0,
        "Failed to wait nostart");

  _restarter.startNodes(n, 2);

  return NDBT_OK;
}

int twoMasterNodeFailure(F_ARGS){

  int n[2];
  n[0] = _restarter.getMasterNodeId();
  n[1] = n[0];
  do {
    n[1] = _restarter.getNextMasterNodeId(n[1]);
  } while(_restarter.getNodeGroup(n[0]) == _restarter.getNodeGroup(n[1]));

  g_info << _restart->m_name << ": ";
  g_info << "node0 = " << n[0] << "(" << _restarter.getNodeGroup(n[0]) << ") ";
  g_info << "node1 = " << n[1] << "(" << _restarter.getNodeGroup(n[1]) << ") ";
  g_info << endl;

  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
  CHECK(_restarter.dumpStateOneNode(n[0], val2, 2) == 0,
        "Failed to dump");
  CHECK(_restarter.dumpStateOneNode(n[1], val2, 2) == 0,
        "Failed to dump");

  CHECK(_restarter.insertErrorInNode(n[0], 9999) == 0,
        "Could not restart node " << n[0]);

  // Create random value, max 3 secs
  int max = 3000;
  int ms = (myRandom48(max)) + 1;
  g_info << "Waiting for " << ms << "(" << max
         << ") ms " << endl;
  NdbSleep_MilliSleep(ms);

  g_info << _restart->m_name << ": node = " << n[1] << endl;

  CHECK(_restarter.insertErrorInNode(n[1], 9999) == 0,
        "Could not restart node " << n[1]);

  CHECK(_restarter.waitNodesNoStart(n, 2) == 0,
        "Failed to wait nostart");

  _restarter.startNodes(n, 2);

  return NDBT_OK;
}

int get50PercentOfNodes(NdbRestarter& restarter,
                        int * _nodes){
  // For now, simply return every second node in the node list
  // TODO Check nodegroup and return one node from each

  int num50Percent = restarter.getNumDbNodes() / 2;
  assert(num50Percent <= MAX_NDB_NODES);

  // Calculate which nodes to stop, select every other node
  for (int i = 0; i < num50Percent; i++){
    _nodes[i] = restarter.getDbNodeId(i*2);
  }
  return num50Percent;
}

int fiftyPercentFail(F_ARGS){

  int nodes[MAX_NDB_NODES];

  int numNodes = get50PercentOfNodes(_restarter, nodes);

  // Stop the nodes, with nostart and abort
  for (int i = 0; i < numNodes; i++){
    g_info << "Stopping node " << nodes[i] << endl;
    int res = _restarter.restartOneDbNode(nodes[i], false, true, true);
    CHECK(res == 0, "Could not stop node: " << nodes[i]);
  }

  CHECK(_restarter.waitNodesNoStart(nodes, numNodes) == 0,
        "waitNodesNoStart");

  // Order all nodes to start
  ndbout << "Starting all nodes" << endl;
  CHECK(_restarter.startAll() == 0,
        "Could not start all nodes");

  return NDBT_OK;
}


int restartAllNodesGraceful(F_ARGS){

  g_info << _restart->m_name << endl;

  // Restart graceful
  CHECK(_restarter.restartAll() == 0,
        "Could not restart all nodes");

  return NDBT_OK;

}

int restartAllNodesAbort(F_ARGS){

  g_info << _restart->m_name << endl;

  // Restart abort
  CHECK(_restarter.restartAll(false, false, true) == 0,
        "Could not restart all nodes");

  return NDBT_OK;
}

int restartAllNodesError9999(F_ARGS){

  g_info << _restart->m_name << endl;

  int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
  CHECK(_restarter.dumpStateAllNodes(val, 2) == 0,
        "failed to set RestartOnErrorInsert");

  CHECK(_restarter.insertErrorInAllNodes(932) == 0,
        "Failed to set error 932 (auto-restart on arbit error)");

  // Restart with error insert
  CHECK(_restarter.insertErrorInAllNodes(9999) == 0,
        "Could not restart all nodes");

  CHECK(_restarter.waitClusterNoStart() == 0,
        "Failed to wait not started");

  _restarter.startAll();

  return NDBT_OK;
}

int fiftyPercentStopAndWait(F_ARGS){

  int nodes[MAX_NDB_NODES];
  int numNodes = get50PercentOfNodes(_restarter, nodes);

  // Stop the nodes, with nostart and abort
  for (int i = 0; i < numNodes; i++){
    g_info << "Stopping node " << nodes[i] << endl;
    int res = _restarter.restartOneDbNode(nodes[i], false, true, true);
    CHECK(res == 0, "Could not stop node: " << nodes[i]);
  }

  CHECK(_restarter.waitNodesNoStart(nodes, numNodes) == 0,
        "waitNodesNoStart");

  // Create random value, max 120 secs
  int max = 120;
  int seconds = (myRandom48(max)) + 1;
  g_info << "Waiting for " << seconds << "(" << max
         << ") secs " << endl;
  NdbSleep_SecSleep(seconds);


  // Restart graceful
  CHECK(_restarter.restartAll() == 0,
        "Could not restart all nodes");

  g_info << _restart->m_name << endl;

  return NDBT_OK;
}

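/*
  Error insert codes used by restartNFDuringNR to provoke a node failure at
  various points during node recovery; the trailing group (see the LCP marker
  below) triggers the crash during local checkpoint handling.
*/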
int
NFDuringNR_codes[] = {
  7121,
  5027,
  7172,
  6000,
  6001,
  7171,
  7130,
  7133,
  7138,
  7154,
  7144,
  5026,
  7139,
  7132,
  5045,

  7195, 7196, 7197, 7198, 7199,


  // LCP
  8000,
  8001,
  5010,
  7022,
  7024,
  7016,
  7017,
  5002
};

int restartNFDuringNR(F_ARGS){

  myRandom48Init((long)NdbTick_CurrentMillisecond());
  int i;
  const int sz = sizeof(NFDuringNR_codes)/sizeof(NFDuringNR_codes[0]);
  for(i = 0; i<sz; i++){
    int randomId = myRandom48(_restarter.getNumDbNodes());
    int nodeId = _restarter.getDbNodeId(randomId);
    int error = NFDuringNR_codes[i];

    g_err << _restart->m_name << ": node = " << nodeId
          << " error code = " << error << endl;

    CHECK(_restarter.restartOneDbNode(nodeId, false, true, true) == 0,
          "Could not restart node " << nodeId);

    CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0,
          "waitNodesNoStart failed");

    int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
    CHECK(_restarter.dumpStateOneNode(nodeId, val, 2) == 0,
          "failed to set RestartOnErrorInsert");

    CHECK(_restarter.insertErrorInNode(nodeId, error) == 0,
          "failed to set error insert");

    CHECK(_restarter.startNodes(&nodeId, 1) == 0,
          "failed to start node");

    NdbSleep_SecSleep(3);

    CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0,
          "waitNodesNoStart failed");

    CHECK(_restarter.startNodes(&nodeId, 1) == 0,
          "failed to start node");

    CHECK(_restarter.waitNodesStarted(&nodeId, 1) == 0,
          "waitNodesStarted failed");
  }

  return NDBT_OK;

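  /*
    NOTE: the code below is unreachable because of the unconditional return
    above. It is a disabled variant that also crashes a second node (in
    another node group) while the first node is restarting.
  */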
  if(_restarter.getNumDbNodes() < 4)
    return NDBT_OK;

  char buf[256];
  if(NdbEnv_GetEnv("USER", buf, 256) == 0 || strcmp(buf, "ejonore") != 0)
    return NDBT_OK;

  for(i = 0; i<sz && !ctx->isTestStopped(); i++){
    const int randomId = myRandom48(_restarter.getNumDbNodes());
    int nodeId = _restarter.getDbNodeId(randomId);
    const int error = NFDuringNR_codes[i];

    const int masterNodeId = _restarter.getMasterNodeId();
    CHECK(masterNodeId > 0, "getMasterNodeId failed");
    int crashNodeId = 0;
    do {
      int rand = myRandom48(1000);
      crashNodeId = _restarter.getRandomNodeOtherNodeGroup(nodeId, rand);
    } while(crashNodeId == masterNodeId);

    CHECK(crashNodeId > 0, "getRandomNodeOtherNodeGroup failed");

    g_info << _restart->m_name << " restarting node = " << nodeId
           << " error code = " << error
           << " crash node = " << crashNodeId << endl;

    CHECK(_restarter.restartOneDbNode(nodeId, false, true, true) == 0,
          "Could not restart node " << nodeId);

    CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0,
          "waitNodesNoStart failed");

    int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
    CHECK(_restarter.dumpStateOneNode(crashNodeId, val, 2) == 0,
          "failed to set RestartOnErrorInsert");

    CHECK(_restarter.insertErrorInNode(crashNodeId, error) == 0,
          "failed to set error insert");

    CHECK(_restarter.startNodes(&nodeId, 1) == 0,
          "failed to start node");

    CHECK(_restarter.waitClusterStarted() == 0,
          "waitClusterStarted failed");
  }

  return NDBT_OK;
}

int
NRDuringLCP_Master_codes[] = {
  7009, // Insert system error in master when local checkpoint is idle.
  7010, // Insert system error in master when local checkpoint is in the
        // state clcpStatus = CALCULATE_KEEP_GCI.
  7013, // Insert system error in master when local checkpoint is in the
        // state clcpStatus = COPY_GCI before sending COPY_GCIREQ.
  7014, // Insert system error in master when local checkpoint is in the
        // state clcpStatus = TC_CLOPSIZE before sending TC_CLOPSIZEREQ.
  7015, // Insert system error in master when local checkpoint is in the
        // state clcpStatus = START_LCP_ROUND before sending START_LCP_ROUND.
  7019, // Insert system error in master when local checkpoint is in the
        // state clcpStatus = IDLE before sending CONTINUEB(ZCHECK_TC_COUNTER).
  7075, // Master. Don't send any LCP_FRAG_ORD(last=true)
        // and crash when all have "not" been sent
  7021, // Crash in master when receiving START_LCP_REQ
  7023, // Crash in master when sending START_LCP_CONF
  7025, // Crash in master when receiving LCP_FRAG_REP
  7026, // Crash in master when changing state to LCP_TAB_COMPLETED
  7027  // Crash in master when changing state to LCP_TAB_SAVED
};

int
NRDuringLCP_NonMaster_codes[] = {
  7020, // Insert system error in local checkpoint participant at reception
        // of COPY_GCIREQ.
  8000, // Crash participant when receiving TCGETOPSIZEREQ
  8001, // Crash participant when receiving TC_CLOPSIZEREQ
  5010, // Crash any node when receiving LCP_FRAG_ORD
  7022, // Crash in !master when receiving START_LCP_REQ
  7024, // Crash in !master when sending START_LCP_CONF
  7016, // Crash in !master when receiving LCP_FRAG_REP
  7017, // Crash in !master when changing state to LCP_TAB_COMPLETED
  7018  // Crash in !master when changing state to LCP_TAB_SAVED
};

int restartNodeDuringLCP(F_ARGS) {
  int i;
  // Master
  int val = DumpStateOrd::DihMinTimeBetweenLCP;
  CHECK(_restarter.dumpStateAllNodes(&val, 1) == 0,
        "Failed to set LCP to min value"); // Set LCP interval to min value
  int sz = sizeof(NRDuringLCP_Master_codes)/
           sizeof(NRDuringLCP_Master_codes[0]);
  for(i = 0; i<sz; i++) {

    int error = NRDuringLCP_Master_codes[i];
    int masterNodeId = _restarter.getMasterNodeId();

    CHECK(masterNodeId > 0, "getMasterNodeId failed");

    ndbout << _restart->m_name << " restarting master node = " << masterNodeId
           << " error code = " << error << endl;

    {
      int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
      CHECK(_restarter.dumpStateAllNodes(val, 2) == 0,
            "failed to set RestartOnErrorInsert");
    }

    CHECK(_restarter.insertErrorInNode(masterNodeId, error) == 0,
          "failed to set error insert");

    CHECK(_restarter.waitNodesNoStart(&masterNodeId, 1, 300) == 0,
          "failed to wait no start");

    CHECK(_restarter.startNodes(&masterNodeId, 1) == 0,
          "failed to start node");

    CHECK(_restarter.waitClusterStarted(300) == 0,
          "waitClusterStarted failed");

    {
      int val = DumpStateOrd::DihMinTimeBetweenLCP;
      CHECK(_restarter.dumpStateOneNode(masterNodeId, &val, 1) == 0,
            "failed to set DihMinTimeBetweenLCP");
    }
  }

  // NON-Master
  sz = sizeof(NRDuringLCP_NonMaster_codes)/
       sizeof(NRDuringLCP_NonMaster_codes[0]);
  for(i = 0; i<sz; i++) {

    int error = NRDuringLCP_NonMaster_codes[i];
    int nodeId = getRandomNodeId(_restarter);
    int masterNodeId = _restarter.getMasterNodeId();
    CHECK(masterNodeId > 0, "getMasterNodeId failed");

    while (nodeId == masterNodeId) {
      nodeId = getRandomNodeId(_restarter);
    }

    ndbout << _restart->m_name << " restarting non-master node = " << nodeId
           << " error code = " << error << endl;

    int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
    CHECK(_restarter.dumpStateAllNodes(val, 2) == 0,
          "failed to set RestartOnErrorInsert");

    CHECK(_restarter.insertErrorInNode(nodeId, error) == 0,
          "failed to set error insert");

    CHECK(_restarter.waitNodesNoStart(&nodeId, 1, 300) == 0,
          "failed to wait no start");

    CHECK(_restarter.startNodes(&nodeId, 1) == 0,
          "failed to start node");

    CHECK(_restarter.waitClusterStarted(300) == 0,
          "waitClusterStarted failed");

    {
      int val = DumpStateOrd::DihMinTimeBetweenLCP;
      CHECK(_restarter.dumpStateOneNode(nodeId, &val, 1) == 0,
            "failed to set DihMinTimeBetweenLCP");
    }
  }

  return NDBT_OK;
}

int stopOnError(F_ARGS){

  myRandom48Init((long)NdbTick_CurrentMillisecond());

  int randomId = myRandom48(_restarter.getNumDbNodes());
  int nodeId = _restarter.getDbNodeId(randomId);

  do {
    g_info << _restart->m_name << ": node = " << nodeId
           << endl;

    CHECK(_restarter.waitClusterStarted(300) == 0,
          "waitClusterStarted failed");

    int val = DumpStateOrd::NdbcntrTestStopOnError;
    CHECK(_restarter.dumpStateOneNode(nodeId, &val, 1) == 0,
          "failed to set NdbcntrTestStopOnError");

    NdbSleep_SecSleep(3);

    CHECK(_restarter.waitClusterStarted(300) == 0,
          "waitClusterStarted failed");
  } while (false);

  return NDBT_OK;
}

int getRandomNodeId(NdbRestarter& _restarter) {
  myRandom48Init((long)NdbTick_CurrentMillisecond());
  int randomId = myRandom48(_restarter.getNumDbNodes());
  int nodeId = _restarter.getDbNodeId(randomId);

  return nodeId;
}