1 /*
2 Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License, version 2.0,
6 as published by the Free Software Foundation.
7
8 This program is also distributed with certain software (including
9 but not limited to OpenSSL) that is licensed under separate terms,
10 as designated in a particular file or component or in included license
11 documentation. The authors of MySQL hereby grant you an additional
12 permission to link the program and your derivative works with the
13 separately licensed software that they have included with MySQL.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License, version 2.0, for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 #include <NdbRestarts.hpp>
26 #include <NDBT.hpp>
27 #include <string.h>
28 #include <NdbSleep.h>
29 #include <kernel/ndb_limits.h>
30 #include <signaldata/DumpStateOrd.hpp>
31 #include <NdbEnv.h>
32 #include <NDBT_Test.hpp>
33
34 #define F_ARGS NDBT_Context* ctx, NdbRestarter& _restarter, const NdbRestarts::NdbRestart* _restart
35
36 int restartRandomNodeGraceful(F_ARGS);
37 int restartRandomNodeAbort(F_ARGS);
38 int restartRandomNodeError(F_ARGS);
39 int restartRandomNodeInitial(F_ARGS);
40 int restartNFDuringNR(F_ARGS);
41 int restartMasterNodeError(F_ARGS);
42 int twoNodeFailure(F_ARGS);
43 int fiftyPercentFail(F_ARGS);
44 int twoMasterNodeFailure(F_ARGS);
45 int restartAllNodesGracfeul(F_ARGS);
46 int restartAllNodesAbort(F_ARGS);
47 int restartAllNodesError9999(F_ARGS);
48 int fiftyPercentStopAndWait(F_ARGS);
49 int restartNodeDuringLCP(F_ARGS);
50 int stopOnError(F_ARGS);
51 int getRandomNodeId(NdbRestarter& _restarter);
52
53 /**
54 * Define list of restarts
55 * - name of restart
56 * - function perfoming the restart
57 * - required number of nodes
58 * - ...
59 * - arg1, used depending of restart
60 * - arg2, used depending of restart
61 */
62
63 const NdbRestarts::NdbRestart NdbRestarts::m_restarts[] = {
64 /*********************************************************
65 *
66 * NODE RESTARTS with 1 node restarted
67 *
68 *********************************************************/
69 /**
70 * Restart a randomly selected node
71 * with graceful shutdown
72 */
73 NdbRestart("RestartRandomNode",
74 NODE_RESTART,
75 restartRandomNodeGraceful,
76 2),
77 /**
78 * Restart a randomly selected node
79 * with immediate(abort) shutdown
80 */
81 NdbRestart("RestartRandomNodeAbort",
82 NODE_RESTART,
83 restartRandomNodeAbort,
84 2),
85 /**
86 * Restart a randomly selected node
87 * with error insert
88 *
89 */
90 NdbRestart("RestartRandomNodeError",
91 NODE_RESTART,
92 restartRandomNodeError,
93 2),
94 /**
95 * Restart the master node
96 * with error insert
97 */
98 NdbRestart("RestartMasterNodeError",
99 NODE_RESTART,
100 restartMasterNodeError,
101 2),
102 /**
103 * Restart a randomly selected node without fileystem
104 *
105 */
106 NdbRestart("RestartRandomNodeInitial",
107 NODE_RESTART,
108 restartRandomNodeInitial,
109 2),
110 /**
111 * Restart a randomly selected node and then
112 * crash it while restarting
113 *
114 */
115 NdbRestart("RestartNFDuringNR",
116 NODE_RESTART,
117 restartNFDuringNR,
118 2),
119
120 /**
121 * Set StopOnError and crash the node by sending
122 * SYSTEM_ERROR to it
123 *
124 */
125 NdbRestart("StopOnError",
126 NODE_RESTART,
127 stopOnError,
128 1),
129
130 /*********************************************************
131 *
132 * MULTIPLE NODE RESTARTS with more than 1 node
133 *
134 *********************************************************/
135 /**
136 * 2 nodes restart, select nodes to restart randomly and restart
137 * with a small random delay between restarts
138 */
139 NdbRestart("TwoNodeFailure",
140 MULTIPLE_NODE_RESTART,
141 twoNodeFailure,
142 4),
143 /**
144 * 2 nodes restart, select master nodes and restart with
145 * a small random delay between restarts
146 */
147
148 NdbRestart("TwoMasterNodeFailure",
149 MULTIPLE_NODE_RESTART,
150 twoMasterNodeFailure,
151 4),
152
153 NdbRestart("FiftyPercentFail",
154 MULTIPLE_NODE_RESTART,
155 fiftyPercentFail,
156 2),
157
158 /*********************************************************
159 *
160 * SYSTEM RESTARTS
161 *
162 *********************************************************/
163 /**
164 * Restart all nodes with graceful shutdown
165 *
166 */
167
168 NdbRestart("RestartAllNodes",
169 SYSTEM_RESTART,
170 restartAllNodesGracfeul,
171 1),
172 /**
173 * Restart all nodes immediately without
174 * graful shutdown
175 */
176 NdbRestart("RestartAllNodesAbort",
177 SYSTEM_RESTART,
178 restartAllNodesAbort,
179 1),
180 /**
181 * Restart all nodes with error insert 9999
182 * TODO! We can later add more errors like 9998, 9997 etc.
183 */
184 NdbRestart("RestartAllNodesError9999",
185 SYSTEM_RESTART,
186 restartAllNodesError9999,
187 1),
188 /**
189 * Stop 50% of all nodes with error insert 9999
190 * Wait for a random number of minutes
191 * Stop the rest of the nodes and then start all again
192 */
193 NdbRestart("FiftyPercentStopAndWait",
194 SYSTEM_RESTART,
195 fiftyPercentStopAndWait,
196 2),
197 /**
198 * Restart a master node during LCP with error inserts.
199 */
200 NdbRestart("RestartNodeDuringLCP",
201 NODE_RESTART,
202 restartNodeDuringLCP,
203 2),
204 };
205
206 const int NdbRestarts::m_NoOfRestarts = sizeof(m_restarts) / sizeof(NdbRestart);
207
208
209 const NdbRestarts::NdbErrorInsert NdbRestarts::m_errors[] = {
210 NdbErrorInsert("Error9999", 9999)
211 };
212
213 const int NdbRestarts::m_NoOfErrors = sizeof(m_errors) / sizeof(NdbErrorInsert);
214
NdbRestart(const char * _name,NdbRestartType _type,restartFunc * _func,int _requiredNodes,int _arg1)215 NdbRestarts::NdbRestart::NdbRestart(const char* _name,
216 NdbRestartType _type,
217 restartFunc* _func,
218 int _requiredNodes,
219 int _arg1){
220 m_name = _name;
221 m_type = _type;
222 m_restartFunc = _func;
223 m_numRequiredNodes = _requiredNodes;
224 // m_arg1 = arg1;
225 }
226
227
getNumRestarts()228 int NdbRestarts::getNumRestarts(){
229 return m_NoOfRestarts;
230 }
231
getRestart(int _num)232 const NdbRestarts::NdbRestart* NdbRestarts::getRestart(int _num){
233 if (_num >= m_NoOfRestarts)
234 return NULL;
235
236 return &m_restarts[_num];
237 }
238
getRestart(const char * _name)239 const NdbRestarts::NdbRestart* NdbRestarts::getRestart(const char* _name){
240 for(int i = 0; i < m_NoOfRestarts; i++){
241 if (strcmp(m_restarts[i].m_name, _name) == 0){
242 return &m_restarts[i];
243 }
244 }
245 g_err << "The restart \""<< _name << "\" not found in NdbRestarts" << endl;
246 return NULL;
247 }
248
249
executeRestart(NDBT_Context * ctx,const NdbRestarts::NdbRestart * _restart,unsigned int _timeout)250 int NdbRestarts::executeRestart(NDBT_Context* ctx,
251 const NdbRestarts::NdbRestart* _restart,
252 unsigned int _timeout){
253 // Check that there are enough nodes in the cluster
254 // for this test
255 NdbRestarter restarter;
256 if (_restart->m_numRequiredNodes > restarter.getNumDbNodes()){
257 g_err << "This test requires " << _restart->m_numRequiredNodes << " nodes "
258 << "there are only "<< restarter.getNumDbNodes() <<" nodes in cluster"
259 << endl;
260 return NDBT_OK;
261 }
262 if (restarter.waitClusterStarted(120) != 0){
263 // If cluster is not started when we shall peform restart
264 // the restart can not be executed and the test fails
265 return NDBT_FAILED;
266 }
267
268 int res = _restart->m_restartFunc(ctx, restarter, _restart);
269
270 // Sleep a little waiting for nodes to react to command
271 NdbSleep_SecSleep(2);
272
273 if (_timeout == 0){
274 // If timeout == 0 wait for ever
275 while(restarter.waitClusterStarted(60) != 0)
276 g_err << "Cluster is not started after restart. Waiting 60s more..."
277 << endl;
278 } else {
279 if (restarter.waitClusterStarted(_timeout) != 0){
280 g_err<<"Cluster failed to start" << endl;
281 res = NDBT_FAILED;
282 }
283 }
284
285 return res;
286 }
287
executeRestart(NDBT_Context * ctx,int _num,unsigned int _timeout)288 int NdbRestarts::executeRestart(NDBT_Context* ctx,
289 int _num,
290 unsigned int _timeout){
291 const NdbRestarts::NdbRestart* r = getRestart(_num);
292 if (r == NULL)
293 return NDBT_FAILED;
294
295 int res = executeRestart(ctx, r, _timeout);
296 return res;
297 }
298
executeRestart(NDBT_Context * ctx,const char * _name,unsigned int _timeout)299 int NdbRestarts::executeRestart(NDBT_Context* ctx,
300 const char* _name,
301 unsigned int _timeout){
302 const NdbRestarts::NdbRestart* r = getRestart(_name);
303 if (r == NULL)
304 return NDBT_FAILED;
305
306 int res = executeRestart(ctx, r, _timeout);
307 return res;
308 }
309
listRestarts(NdbRestartType _type)310 void NdbRestarts::listRestarts(NdbRestartType _type){
311 for(int i = 0; i < m_NoOfRestarts; i++){
312 if (m_restarts[i].m_type == _type)
313 ndbout << " " << m_restarts[i].m_name << ", min "
314 << m_restarts[i].m_numRequiredNodes
315 << " nodes"<< endl;
316 }
317 }
318
listRestarts()319 void NdbRestarts::listRestarts(){
320 ndbout << "NODE RESTARTS" << endl;
321 listRestarts(NODE_RESTART);
322 ndbout << "MULTIPLE NODE RESTARTS" << endl;
323 listRestarts(MULTIPLE_NODE_RESTART);
324 ndbout << "SYSTEM RESTARTS" << endl;
325 listRestarts(SYSTEM_RESTART);
326 }
327
NdbErrorInsert(const char * _name,int _errorNo)328 NdbRestarts::NdbErrorInsert::NdbErrorInsert(const char* _name,
329 int _errorNo){
330
331 m_name = _name;
332 m_errorNo = _errorNo;
333 }
334
getNumErrorInserts()335 int NdbRestarts::getNumErrorInserts(){
336 return m_NoOfErrors;
337 }
338
getError(int _num)339 const NdbRestarts::NdbErrorInsert* NdbRestarts::getError(int _num){
340 if (_num >= m_NoOfErrors)
341 return NULL;
342 return &m_errors[_num];
343 }
344
getRandomError()345 const NdbRestarts::NdbErrorInsert* NdbRestarts::getRandomError(){
346 int randomId = myRandom48(m_NoOfErrors);
347 return &m_errors[randomId];
348 }
349
350
351
/**
 *
 * IMPLEMENTATION OF THE DIFFERENT RESTARTS
 * Each function should perform its action
 * and then return NDBT_OK or NDBT_FAILED
 *
 */
359
360
/**
 * Evaluate the condition b once. If it is false, print the message m
 * together with file, line and the evaluated value to ndbout and
 * return NDBT_FAILED from the function using the macro.
 *
 * m is pasted into a stream expression, so it may itself contain
 * "<<" operators. The body is wrapped in do { } while (0) so that
 * "CHECK(...);" is a single statement, also in an unbraced if/else,
 * and b is parenthesised so that any expression can be passed.
 */
#define CHECK(b, m) do { \
  const int _xx = (b); \
  if (!(_xx)) { \
    ndbout << "ERR: "<< m \
           << " " << "File: " << __FILE__ \
           << " (Line: " << __LINE__ << ")" << "- " << _xx << endl; \
    return NDBT_FAILED; \
  } \
} while (0)
366
367
368
restartRandomNodeGraceful(F_ARGS)369 int restartRandomNodeGraceful(F_ARGS){
370
371 myRandom48Init((long)NdbTick_CurrentMillisecond());
372 int randomId = myRandom48(_restarter.getNumDbNodes());
373 int nodeId = _restarter.getDbNodeId(randomId);
374
375 g_info << _restart->m_name << ": node = "<<nodeId << endl;
376
377 CHECK(_restarter.restartOneDbNode(nodeId) == 0,
378 "Could not restart node "<<nodeId);
379
380 return NDBT_OK;
381 }
382
restartRandomNodeAbort(F_ARGS)383 int restartRandomNodeAbort(F_ARGS){
384
385 myRandom48Init((long)NdbTick_CurrentMillisecond());
386 int randomId = myRandom48(_restarter.getNumDbNodes());
387 int nodeId = _restarter.getDbNodeId(randomId);
388
389 g_info << _restart->m_name << ": node = "<<nodeId << endl;
390
391 CHECK(_restarter.restartOneDbNode(nodeId, false, false, true) == 0,
392 "Could not restart node "<<nodeId);
393
394 return NDBT_OK;
395 }
396
restartRandomNodeError(F_ARGS)397 int restartRandomNodeError(F_ARGS){
398
399 myRandom48Init((long)NdbTick_CurrentMillisecond());
400 int randomId = myRandom48(_restarter.getNumDbNodes());
401 int nodeId = _restarter.getDbNodeId(randomId);
402
403 ndbout << _restart->m_name << ": node = "<<nodeId << endl;
404
405 CHECK(_restarter.insertErrorInNode(nodeId, 9999) == 0,
406 "Could not restart node "<<nodeId);
407
408 return NDBT_OK;
409 }
410
restartMasterNodeError(F_ARGS)411 int restartMasterNodeError(F_ARGS){
412
413 int nodeId = _restarter.getDbNodeId(0);
414
415 g_info << _restart->m_name << ": node = "<<nodeId << endl;
416
417 CHECK(_restarter.insertErrorInNode(nodeId, 39999) == 0,
418 "Could not restart node "<<nodeId);
419
420 return NDBT_OK;
421 }
422
restartRandomNodeInitial(F_ARGS)423 int restartRandomNodeInitial(F_ARGS){
424
425 myRandom48Init((long)NdbTick_CurrentMillisecond());
426 int randomId = myRandom48(_restarter.getNumDbNodes());
427 int nodeId = _restarter.getDbNodeId(randomId);
428
429 g_info << _restart->m_name << ": node = "<<nodeId << endl;
430
431 CHECK(_restarter.restartOneDbNode(nodeId, true) == 0,
432 "Could not restart node "<<nodeId);
433
434 return NDBT_OK;
435 }
436
twoNodeFailure(F_ARGS)437 int twoNodeFailure(F_ARGS){
438
439 myRandom48Init((long)NdbTick_CurrentMillisecond());
440 int randomId = myRandom48(_restarter.getNumDbNodes());
441 int n[2];
442 n[0] = _restarter.getDbNodeId(randomId);
443 n[1] = _restarter.getRandomNodeOtherNodeGroup(n[0], rand());
444 g_info << _restart->m_name << ": node = "<< n[0] << endl;
445
446 int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
447 CHECK(_restarter.dumpStateOneNode(n[0], val2, 2) == 0,
448 "Failed to dump");
449 CHECK(_restarter.dumpStateOneNode(n[1], val2, 2) == 0,
450 "Failed to dump");
451
452 CHECK(_restarter.insertErrorInNode(n[0], 9999) == 0,
453 "Could not restart node "<< n[0]);
454
455 // Create random value, max 3 secs
456 int max = 3000;
457 int ms = (myRandom48(max)) + 1;
458 g_info << "Waiting for " << ms << "(" << max
459 << ") ms " << endl;
460 NdbSleep_MilliSleep(ms);
461
462 g_info << _restart->m_name << ": node = "<< n[1] << endl;
463 CHECK(_restarter.insertErrorInNode(n[1], 9999) == 0,
464 "Could not restart node "<< n[1]);
465
466 CHECK(_restarter.waitNodesNoStart(n, 2) == 0,
467 "Failed to wait nostart");
468
469 _restarter.startNodes(n, 2);
470
471 return NDBT_OK;
472 }
473
twoMasterNodeFailure(F_ARGS)474 int twoMasterNodeFailure(F_ARGS){
475
476 int n[2];
477 n[0] = _restarter.getMasterNodeId();
478 n[1] = n[0];
479 do {
480 n[1] = _restarter.getNextMasterNodeId(n[1]);
481 } while(_restarter.getNodeGroup(n[0]) == _restarter.getNodeGroup(n[1]));
482
483 g_info << _restart->m_name << ": ";
484 g_info << "node0 = "<< n[0] << "(" << _restarter.getNodeGroup(n[0]) << ") ";
485 g_info << "node1 = "<< n[1] << "(" << _restarter.getNodeGroup(n[1]) << ") ";
486 g_info << endl;
487
488 int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
489 CHECK(_restarter.dumpStateOneNode(n[0], val2, 2) == 0,
490 "Failed to dump");
491 CHECK(_restarter.dumpStateOneNode(n[1], val2, 2) == 0,
492 "Failed to dump");
493
494 CHECK(_restarter.insertErrorInNode(n[0], 9999) == 0,
495 "Could not restart node "<< n[0]);
496
497 // Create random value, max 3 secs
498 int max = 3000;
499 int ms = (myRandom48(max)) + 1;
500 g_info << "Waiting for " << ms << "(" << max
501 << ") ms " << endl;
502 NdbSleep_MilliSleep(ms);
503
504 g_info << _restart->m_name << ": node = "<< n[1] << endl;
505
506 CHECK(_restarter.insertErrorInNode(n[1], 9999) == 0,
507 "Could not restart node "<< n[1]);
508
509 CHECK(_restarter.waitNodesNoStart(n, 2) == 0,
510 "Failed to wait nostart");
511
512 _restarter.startNodes(n, 2);
513
514 return NDBT_OK;
515 }
516
get50PercentOfNodes(NdbRestarter & restarter,int * _nodes)517 int get50PercentOfNodes(NdbRestarter& restarter,
518 int * _nodes){
519 // For now simply return all nodes with even node id
520 // TODO Check nodegroup and return one node from each
521
522 int num50Percent = restarter.getNumDbNodes() / 2;
523 assert(num50Percent <= MAX_NDB_NODES);
524
525 // Calculate which nodes to stop, select all even nodes
526 for (int i = 0; i < num50Percent; i++){
527 _nodes[i] = restarter.getDbNodeId(i*2);
528 }
529 return num50Percent;
530 }
531
fiftyPercentFail(F_ARGS)532 int fiftyPercentFail(F_ARGS){
533
534
535 int nodes[MAX_NDB_NODES];
536
537 int numNodes = get50PercentOfNodes(_restarter, nodes);
538
539 // Stop the nodes, with nostart and abort
540 for (int i = 0; i < numNodes; i++){
541 g_info << "Stopping node "<< nodes[i] << endl;
542 int res = _restarter.restartOneDbNode(nodes[i], false, true, true);
543 CHECK(res == 0, "Could not stop node: "<< nodes[i]);
544 }
545
546 CHECK(_restarter.waitNodesNoStart(nodes, numNodes) == 0,
547 "waitNodesNoStart");
548
549 // Order all nodes to start
550 ndbout << "Starting all nodes" << endl;
551 CHECK(_restarter.startAll() == 0,
552 "Could not start all nodes");
553
554 return NDBT_OK;
555 }
556
557
restartAllNodesGracfeul(F_ARGS)558 int restartAllNodesGracfeul(F_ARGS){
559
560 g_info << _restart->m_name << endl;
561
562 // Restart graceful
563 CHECK(_restarter.restartAll() == 0,
564 "Could not restart all nodes");
565
566 return NDBT_OK;
567
568 }
569
restartAllNodesAbort(F_ARGS)570 int restartAllNodesAbort(F_ARGS){
571
572 g_info << _restart->m_name << endl;
573
574 // Restart abort
575 CHECK(_restarter.restartAll(false, false, true) == 0,
576 "Could not restart all nodes");
577
578 return NDBT_OK;
579 }
580
restartAllNodesError9999(F_ARGS)581 int restartAllNodesError9999(F_ARGS){
582
583 g_info << _restart->m_name << endl;
584
585 int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 } ;
586 CHECK(_restarter.dumpStateAllNodes(val, 2) == 0,
587 "failed to set RestartOnErrorInsert");
588
589 CHECK(_restarter.insertErrorInAllNodes(932) == 0,
590 "Failed to set error 932 (auto-restart on arbit error)");
591
592 // Restart with error insert
593 CHECK(_restarter.insertErrorInAllNodes(9999) == 0,
594 "Could not restart all nodes ");
595
596 CHECK(_restarter.waitClusterNoStart() == 0,
597 "Failed to wait not started");
598
599 _restarter.startAll();
600
601 return NDBT_OK;
602 }
603
fiftyPercentStopAndWait(F_ARGS)604 int fiftyPercentStopAndWait(F_ARGS){
605
606 int nodes[MAX_NDB_NODES];
607 int numNodes = get50PercentOfNodes(_restarter, nodes);
608
609 // Stop the nodes, with nostart and abort
610 for (int i = 0; i < numNodes; i++){
611 g_info << "Stopping node "<<nodes[i] << endl;
612 int res = _restarter.restartOneDbNode(nodes[i], false, true, true);
613 CHECK(res == 0, "Could not stop node: "<< nodes[i]);
614 }
615
616 CHECK(_restarter.waitNodesNoStart(nodes, numNodes) == 0,
617 "waitNodesNoStart");
618
619 // Create random value, max 120 secs
620 int max = 120;
621 int seconds = (myRandom48(max)) + 1;
622 g_info << "Waiting for " << seconds << "(" << max
623 << ") secs " << endl;
624 NdbSleep_SecSleep(seconds);
625
626
627 // Restart graceful
628 CHECK(_restarter.restartAll() == 0,
629 "Could not restart all nodes");
630
631 g_info << _restart->m_name << endl;
632
633 return NDBT_OK;
634 }
635
/**
 * Error codes inserted, one after the other, by restartNFDuringNR
 */
int NFDuringNR_codes[] = {
  7121, 5027, 7172, 6000, 6001,
  7171, 7130, 7133, 7138, 7154,
  7144, 5026, 7139, 7132, 5045,

  7195, 7196, 7197, 7198, 7199,

  // LCP
  8000, 8001, 5010, 7022,
  7024, 7016, 7017, 5002
};
667
restartNFDuringNR(F_ARGS)668 int restartNFDuringNR(F_ARGS){
669
670 myRandom48Init((long)NdbTick_CurrentMillisecond());
671 int i;
672 const int sz = sizeof(NFDuringNR_codes)/sizeof(NFDuringNR_codes[0]);
673 for(i = 0; i<sz; i++){
674 int randomId = myRandom48(_restarter.getNumDbNodes());
675 int nodeId = _restarter.getDbNodeId(randomId);
676 int error = NFDuringNR_codes[i];
677
678 g_err << _restart->m_name << ": node = " << nodeId
679 << " error code = " << error << endl;
680
681 CHECK(_restarter.restartOneDbNode(nodeId, false, true, true) == 0,
682 "Could not restart node "<< nodeId);
683
684 CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0,
685 "waitNodesNoStart failed");
686
687 int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 } ;
688 CHECK(_restarter.dumpStateOneNode(nodeId, val, 2) == 0,
689 "failed to set RestartOnErrorInsert");
690
691 CHECK(_restarter.insertErrorInNode(nodeId, error) == 0,
692 "failed to set error insert");
693
694 CHECK(_restarter.startNodes(&nodeId, 1) == 0,
695 "failed to start node");
696
697 NdbSleep_SecSleep(3);
698
699 CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0,
700 "waitNodesNoStart failed");
701
702 CHECK(_restarter.startNodes(&nodeId, 1) == 0,
703 "failed to start node");
704
705 CHECK(_restarter.waitNodesStarted(&nodeId, 1) == 0,
706 "waitNodesStarted failed");
707 }
708
709 return NDBT_OK;
710
711 if(_restarter.getNumDbNodes() < 4)
712 return NDBT_OK;
713
714 char buf[256];
715 if(NdbEnv_GetEnv("USER", buf, 256) == 0 || strcmp(buf, "ejonore") != 0)
716 return NDBT_OK;
717
718 for(i = 0; i<sz && !ctx->isTestStopped(); i++){
719 const int randomId = myRandom48(_restarter.getNumDbNodes());
720 int nodeId = _restarter.getDbNodeId(randomId);
721 const int error = NFDuringNR_codes[i];
722
723 const int masterNodeId = _restarter.getMasterNodeId();
724 CHECK(masterNodeId > 0, "getMasterNodeId failed");
725 int crashNodeId = 0;
726 do {
727 int rand = myRandom48(1000);
728 crashNodeId = _restarter.getRandomNodeOtherNodeGroup(nodeId, rand);
729 } while(crashNodeId == masterNodeId);
730
731 CHECK(crashNodeId > 0, "getMasterNodeId failed");
732
733 g_info << _restart->m_name << " restarting node = " << nodeId
734 << " error code = " << error
735 << " crash node = " << crashNodeId << endl;
736
737 CHECK(_restarter.restartOneDbNode(nodeId, false, true, true) == 0,
738 "Could not restart node "<< nodeId);
739
740 CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0,
741 "waitNodesNoStart failed");
742
743 int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
744 CHECK(_restarter.dumpStateOneNode(crashNodeId, val, 2) == 0,
745 "failed to set RestartOnErrorInsert");
746
747 CHECK(_restarter.insertErrorInNode(crashNodeId, error) == 0,
748 "failed to set error insert");
749
750 CHECK(_restarter.startNodes(&nodeId, 1) == 0,
751 "failed to start node");
752
753 CHECK(_restarter.waitClusterStarted() == 0,
754 "waitClusterStarted failed");
755 }
756
757 return NDBT_OK;
758 }
759
/**
 * Error codes inserted in the master node by restartNodeDuringLCP
 */
int NRDuringLCP_Master_codes[] = {
  // Insert system error in master when local checkpoint is idle.
  7009,
  // Insert system error in master when local checkpoint is in the
  // state clcpStatus = CALCULATE_KEEP_GCI.
  7010,
  // Insert system error in master when local checkpoint is in the
  // state clcpStatus = COPY_GCI before sending COPY_GCIREQ.
  7013,
  // Insert system error in master when local checkpoint is in the
  // state clcpStatus = TC_CLOPSIZE before sending TC_CLOPSIZEREQ.
  7014,
  // Insert system error in master when local checkpoint is in the
  // state clcpStatus = START_LCP_ROUND before sending START_LCP_ROUND.
  7015,
  // Insert system error in master when local checkpoint is in the
  // state clcpStatus = IDLE before sending CONTINUEB(ZCHECK_TC_COUNTER).
  7019,
  // Master. Don't send any LCP_FRAG_ORD(last=true)
  // And crash when all have "not" been sent
  7075,
  // Crash in master when receiving START_LCP_REQ
  7021,
  // Crash in master when sending START_LCP_CONF
  7023,
  // Crash in master when receiving LCP_FRAG_REP
  7025,
  // Crash in master when changing state to LCP_TAB_COMPLETED
  7026,
  // Crash in master when changing state to LCP_TAB_SAVED
  7027
};
781
/**
 * Error codes inserted in a non-master node by restartNodeDuringLCP
 */
int NRDuringLCP_NonMaster_codes[] = {
  // Insert system error in local checkpoint participant at reception
  // of COPY_GCIREQ.
  7020,
  // Crash participant when receiving TCGETOPSIZEREQ
  8000,
  // Crash participant when receiving TC_CLOPSIZEREQ
  8001,
  // Crash any when receiving LCP_FRAGORD
  5010,
  // Crash in !master when receiving START_LCP_REQ
  7022,
  // Crash in !master when sending START_LCP_CONF
  7024,
  // Crash in !master when receiving LCP_FRAG_REP
  7016,
  // Crash in !master when changing state to LCP_TAB_COMPLETED
  7017,
  // Crash in !master when changing state to LCP_TAB_SAVED
  7018
};
795
restartNodeDuringLCP(F_ARGS)796 int restartNodeDuringLCP(F_ARGS) {
797 int i;
798 // Master
799 int val = DumpStateOrd::DihMinTimeBetweenLCP;
800 CHECK(_restarter.dumpStateAllNodes(&val, 1) == 0,
801 "Failed to set LCP to min value"); // Set LCP to min val
802 int sz = sizeof(NRDuringLCP_Master_codes)/
803 sizeof(NRDuringLCP_Master_codes[0]);
804 for(i = 0; i<sz; i++) {
805
806 int error = NRDuringLCP_Master_codes[i];
807 int masterNodeId = _restarter.getMasterNodeId();
808
809 CHECK(masterNodeId > 0, "getMasterNodeId failed");
810
811 ndbout << _restart->m_name << " restarting master node = " << masterNodeId
812 << " error code = " << error << endl;
813
814 {
815 int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
816 CHECK(_restarter.dumpStateAllNodes(val, 2) == 0,
817 "failed to set RestartOnErrorInsert");
818 }
819
820 CHECK(_restarter.insertErrorInNode(masterNodeId, error) == 0,
821 "failed to set error insert");
822
823 CHECK(_restarter.waitNodesNoStart(&masterNodeId, 1, 300) == 0,
824 "failed to wait no start");
825
826 CHECK(_restarter.startNodes(&masterNodeId, 1) == 0,
827 "failed to start node");
828
829 CHECK(_restarter.waitClusterStarted(300) == 0,
830 "waitClusterStarted failed");
831
832 {
833 int val = DumpStateOrd::DihMinTimeBetweenLCP;
834 CHECK(_restarter.dumpStateOneNode(masterNodeId, &val, 1) == 0,
835 "failed to set error insert");
836 }
837 }
838
839 // NON-Master
840 sz = sizeof(NRDuringLCP_NonMaster_codes)/
841 sizeof(NRDuringLCP_NonMaster_codes[0]);
842 for(i = 0; i<sz; i++) {
843
844 int error = NRDuringLCP_NonMaster_codes[i];
845 int nodeId = getRandomNodeId(_restarter);
846 int masterNodeId = _restarter.getMasterNodeId();
847 CHECK(masterNodeId > 0, "getMasterNodeId failed");
848
849 while (nodeId == masterNodeId) {
850 nodeId = getRandomNodeId(_restarter);
851 }
852
853 ndbout << _restart->m_name << " restarting non-master node = " << nodeId
854 << " error code = " << error << endl;
855
856 int val[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
857 CHECK(_restarter.dumpStateAllNodes(val, 2) == 0,
858 "failed to set RestartOnErrorInsert");
859
860 CHECK(_restarter.insertErrorInNode(nodeId, error) == 0,
861 "failed to set error insert");
862
863 CHECK(_restarter.waitNodesNoStart(&nodeId, 1, 300) == 0,
864 "failed to wait no start");
865
866 CHECK(_restarter.startNodes(&nodeId, 1) == 0,
867 "failed to start node");
868
869 CHECK(_restarter.waitClusterStarted(300) == 0,
870 "waitClusterStarted failed");
871
872 {
873 int val = DumpStateOrd::DihMinTimeBetweenLCP;
874 CHECK(_restarter.dumpStateOneNode(nodeId, &val, 1) == 0,
875 "failed to set error insert");
876 }
877 }
878
879 return NDBT_OK;
880 }
881
stopOnError(F_ARGS)882 int stopOnError(F_ARGS){
883
884 myRandom48Init((long)NdbTick_CurrentMillisecond());
885
886 int randomId = myRandom48(_restarter.getNumDbNodes());
887 int nodeId = _restarter.getDbNodeId(randomId);
888
889 do {
890 g_info << _restart->m_name << ": node = " << nodeId
891 << endl;
892
893 CHECK(_restarter.waitClusterStarted(300) == 0,
894 "waitClusterStarted failed");
895
896 int val = DumpStateOrd::NdbcntrTestStopOnError;
897 CHECK(_restarter.dumpStateOneNode(nodeId, &val, 1) == 0,
898 "failed to set NdbcntrTestStopOnError");
899
900 NdbSleep_SecSleep(3);
901
902 CHECK(_restarter.waitClusterStarted(300) == 0,
903 "waitClusterStarted failed");
904 } while (false);
905
906 return NDBT_OK;
907 }
908
getRandomNodeId(NdbRestarter & _restarter)909 int getRandomNodeId(NdbRestarter& _restarter) {
910 myRandom48Init((long)NdbTick_CurrentMillisecond());
911 int randomId = myRandom48(_restarter.getNumDbNodes());
912 int nodeId = _restarter.getDbNodeId(randomId);
913
914 return nodeId;
915 }
916