1 /* Copyright (c) 2003-2007 MySQL AB
2 Use is subject to license terms
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA */
16
17 #include <NdbRestarter.hpp>
18 #include <NdbOut.hpp>
19 #include <NdbSleep.h>
20 #include <NdbTick.h>
21 #include <mgmapi_debug.h>
22 #include <NDBT_Output.hpp>
23 #include <random.h>
24 #include <kernel/ndb_limits.h>
25 #include <ndb_version.h>
26
/**
 * Print the latest error recorded on management handle h
 * (error code, line, message and description) to ndbout.
 *
 * The body is wrapped in do { } while (0) and carries no trailing
 * semicolon, so the macro expands to exactly one statement and is
 * safe inside unbraced if/else bodies. Call sites supply the ';'.
 */
#define MGMERR(h) \
  do { \
    ndbout << "latest_error="<<ndb_mgm_get_latest_error(h) \
           << ", line="<<ndb_mgm_get_latest_error_line(h) \
           << ", mesg="<<ndb_mgm_get_latest_error_msg(h) \
           << ", desc="<<ndb_mgm_get_latest_error_desc(h) \
           << endl; \
  } while (0)
33
34
NdbRestarter(const char * _addr)35 NdbRestarter::NdbRestarter(const char* _addr):
36 connected(false),
37 handle(NULL),
38 m_config(0)
39 {
40 if (_addr == NULL){
41 addr.assign("");
42 } else {
43 addr.assign(_addr);
44 }
45 }
46
~NdbRestarter()47 NdbRestarter::~NdbRestarter(){
48 disconnect();
49 }
50
getDbNodeId(int _i)51 int NdbRestarter::getDbNodeId(int _i){
52 if (!isConnected())
53 return -1;
54
55 if (getStatus() != 0)
56 return -1;
57
58 for(size_t i = 0; i < ndbNodes.size(); i++){
59 if (i == (unsigned)_i){
60 return ndbNodes[i].node_id;
61 }
62 }
63 return -1;
64 }
65
66
67 int
restartOneDbNode(int _nodeId,bool inital,bool nostart,bool abort)68 NdbRestarter::restartOneDbNode(int _nodeId,
69 bool inital,
70 bool nostart,
71 bool abort){
72 if (!isConnected())
73 return -1;
74
75 int ret = 0;
76
77 if ((ret = ndb_mgm_restart2(handle, 1, &_nodeId,
78 inital, nostart, abort)) <= 0) {
79 /**
80 * ndb_mgm_restart2 returned error, one reason could
81 * be that the node have not stopped fast enough!
82 * Check status of the node to see if it's on the
83 * way down. If that's the case ignore the error
84 */
85
86 if (getStatus() != 0)
87 return -1;
88
89 g_info << "ndb_mgm_restart2 returned with error, checking node state" << endl;
90
91 for(size_t i = 0; i < ndbNodes.size(); i++){
92 if(ndbNodes[i].node_id == _nodeId){
93 g_info <<_nodeId<<": status="<<ndbNodes[i].node_status<<endl;
94 /* Node found check state */
95 switch(ndbNodes[i].node_status){
96 case NDB_MGM_NODE_STATUS_RESTARTING:
97 case NDB_MGM_NODE_STATUS_SHUTTING_DOWN:
98 return 0;
99 default:
100 break;
101 }
102 }
103 }
104
105 MGMERR(handle);
106 g_err << "Could not stop node with id = "<< _nodeId << endl;
107 return -1;
108 }
109
110 return 0;
111 }
112
113 int
getMasterNodeId()114 NdbRestarter::getMasterNodeId(){
115 if (!isConnected())
116 return -1;
117
118 if (getStatus() != 0)
119 return -1;
120
121 int min = 0;
122 int node = -1;
123 for(size_t i = 0; i < ndbNodes.size(); i++){
124 if(min == 0 || ndbNodes[i].dynamic_id < min){
125 min = ndbNodes[i].dynamic_id;
126 node = ndbNodes[i].node_id;
127 }
128 }
129
130 return node;
131 }
132
133 int
getNodeGroup(int nodeId)134 NdbRestarter::getNodeGroup(int nodeId){
135 if (!isConnected())
136 return -1;
137
138 if (getStatus() != 0)
139 return -1;
140
141 for(size_t i = 0; i < ndbNodes.size(); i++)
142 {
143 if(ndbNodes[i].node_id == nodeId)
144 {
145 return ndbNodes[i].node_group;
146 }
147 }
148
149 return -1;
150 }
151
152 int
getNextMasterNodeId(int nodeId)153 NdbRestarter::getNextMasterNodeId(int nodeId){
154 if (!isConnected())
155 return -1;
156
157 if (getStatus() != 0)
158 return -1;
159
160 size_t i;
161 for(i = 0; i < ndbNodes.size(); i++)
162 {
163 if(ndbNodes[i].node_id == nodeId)
164 {
165 break;
166 }
167 }
168 assert(i < ndbNodes.size());
169 if (i == ndbNodes.size())
170 return -1;
171
172 int dynid = ndbNodes[i].dynamic_id;
173 int minid = dynid;
174 for (i = 0; i<ndbNodes.size(); i++)
175 if (ndbNodes[i].dynamic_id > minid)
176 minid = ndbNodes[i].dynamic_id;
177
178 for (i = 0; i<ndbNodes.size(); i++)
179 if (ndbNodes[i].dynamic_id > dynid &&
180 ndbNodes[i].dynamic_id < minid)
181 {
182 minid = ndbNodes[i].dynamic_id;
183 }
184
185 if (minid != ~0)
186 {
187 for (i = 0; i<ndbNodes.size(); i++)
188 if (ndbNodes[i].dynamic_id == minid)
189 return ndbNodes[i].node_id;
190 }
191
192 return getMasterNodeId();
193 }
194
195 int
getRandomNotMasterNodeId(int rand)196 NdbRestarter::getRandomNotMasterNodeId(int rand){
197 int master = getMasterNodeId();
198 if(master == -1)
199 return -1;
200
201 Uint32 counter = 0;
202 rand = rand % ndbNodes.size();
203 while(counter++ < ndbNodes.size() && ndbNodes[rand].node_id == master)
204 rand = (rand + 1) % ndbNodes.size();
205
206 if(ndbNodes[rand].node_id != master)
207 return ndbNodes[rand].node_id;
208 return -1;
209 }
210
211 int
getRandomNodeOtherNodeGroup(int nodeId,int rand)212 NdbRestarter::getRandomNodeOtherNodeGroup(int nodeId, int rand){
213 if (!isConnected())
214 return -1;
215
216 if (getStatus() != 0)
217 return -1;
218
219 int node_group = -1;
220 for(size_t i = 0; i < ndbNodes.size(); i++){
221 if(ndbNodes[i].node_id == nodeId){
222 node_group = ndbNodes[i].node_group;
223 break;
224 }
225 }
226 if(node_group == -1){
227 return -1;
228 }
229
230 Uint32 counter = 0;
231 rand = rand % ndbNodes.size();
232 while(counter++ < ndbNodes.size() && ndbNodes[rand].node_group == node_group)
233 rand = (rand + 1) % ndbNodes.size();
234
235 if(ndbNodes[rand].node_group != node_group)
236 return ndbNodes[rand].node_id;
237
238 return -1;
239 }
240
241 int
getRandomNodeSameNodeGroup(int nodeId,int rand)242 NdbRestarter::getRandomNodeSameNodeGroup(int nodeId, int rand){
243 if (!isConnected())
244 return -1;
245
246 if (getStatus() != 0)
247 return -1;
248
249 int node_group = -1;
250 for(size_t i = 0; i < ndbNodes.size(); i++){
251 if(ndbNodes[i].node_id == nodeId){
252 node_group = ndbNodes[i].node_group;
253 break;
254 }
255 }
256 if(node_group == -1){
257 return -1;
258 }
259
260 Uint32 counter = 0;
261 rand = rand % ndbNodes.size();
262 while(counter++ < ndbNodes.size() &&
263 (ndbNodes[rand].node_id == nodeId ||
264 ndbNodes[rand].node_group != node_group))
265 rand = (rand + 1) % ndbNodes.size();
266
267 if(ndbNodes[rand].node_group == node_group &&
268 ndbNodes[rand].node_id != nodeId)
269 return ndbNodes[rand].node_id;
270
271 return -1;
272 }
273
274 int
waitClusterStarted(unsigned int _timeout)275 NdbRestarter::waitClusterStarted(unsigned int _timeout){
276 return waitClusterState(NDB_MGM_NODE_STATUS_STARTED, _timeout);
277 }
278
279 int
waitClusterStartPhase(int _startphase,unsigned int _timeout)280 NdbRestarter::waitClusterStartPhase(int _startphase, unsigned int _timeout){
281 return waitClusterState(NDB_MGM_NODE_STATUS_STARTING, _timeout, _startphase);
282 }
283
284 int
waitClusterSingleUser(unsigned int _timeout)285 NdbRestarter::waitClusterSingleUser(unsigned int _timeout){
286 return waitClusterState(NDB_MGM_NODE_STATUS_SINGLEUSER, _timeout);
287 }
288
289 int
waitClusterNoStart(unsigned int _timeout)290 NdbRestarter::waitClusterNoStart(unsigned int _timeout){
291 return waitClusterState(NDB_MGM_NODE_STATUS_NOT_STARTED, _timeout);
292 }
293
294 int
waitClusterState(ndb_mgm_node_status _status,unsigned int _timeout,int _startphase)295 NdbRestarter::waitClusterState(ndb_mgm_node_status _status,
296 unsigned int _timeout,
297 int _startphase){
298
299 int nodes[MAX_NDB_NODES];
300 int numNodes = 0;
301
302 if (getStatus() != 0)
303 return -1;
304
305 // Collect all nodes into nodes
306 for (size_t i = 0; i < ndbNodes.size(); i++){
307 nodes[i] = ndbNodes[i].node_id;
308 numNodes++;
309 }
310
311 return waitNodesState(nodes, numNodes, _status, _timeout, _startphase);
312 }
313
314
315 int
waitNodesState(const int * _nodes,int _num_nodes,ndb_mgm_node_status _status,unsigned int _timeout,int _startphase)316 NdbRestarter::waitNodesState(const int * _nodes, int _num_nodes,
317 ndb_mgm_node_status _status,
318 unsigned int _timeout,
319 int _startphase){
320
321 if (!isConnected()){
322 g_err << "!isConnected"<<endl;
323 return -1;
324 }
325
326 unsigned int attempts = 0;
327 unsigned int resetAttempts = 0;
328 const unsigned int MAX_RESET_ATTEMPTS = 10;
329 bool allInState = false;
330 while (allInState == false){
331 if (_timeout > 0 && attempts > _timeout){
332 /**
333 * Timeout has expired waiting for the nodes to enter
334 * the state we want
335 */
336 bool waitMore = false;
337 /**
338 * Make special check if we are waiting for
339 * cluster to become started
340 */
341 if(_status == NDB_MGM_NODE_STATUS_STARTED){
342 waitMore = true;
343 /**
344 * First check if any node is not starting
345 * then it's no idea to wait anymore
346 */
347 for (size_t n = 0; n < ndbNodes.size(); n++){
348 if (ndbNodes[n].node_status != NDB_MGM_NODE_STATUS_STARTED &&
349 ndbNodes[n].node_status != NDB_MGM_NODE_STATUS_STARTING)
350 waitMore = false;
351
352 }
353 }
354
355 if (!waitMore || resetAttempts > MAX_RESET_ATTEMPTS){
356 g_err << "waitNodeState("
357 << ndb_mgm_get_node_status_string(_status)
358 <<", "<<_startphase<<")"
359 << " timeout after " << attempts <<" attemps" << endl;
360 return -1;
361 }
362
363 g_err << "waitNodeState("
364 << ndb_mgm_get_node_status_string(_status)
365 <<", "<<_startphase<<")"
366 << " resetting number of attempts "
367 << resetAttempts << endl;
368 attempts = 0;
369 resetAttempts++;
370
371 }
372
373 allInState = true;
374 if (getStatus() != 0){
375 g_err << "getStatus != 0" << endl;
376 return -1;
377 }
378
379 // ndbout << "waitNodeState; _num_nodes = " << _num_nodes << endl;
380 // for (int i = 0; i < _num_nodes; i++)
381 // ndbout << " node["<<i<<"] =" <<_nodes[i] << endl;
382
383 for (int i = 0; i < _num_nodes; i++){
384 ndb_mgm_node_state* ndbNode = NULL;
385 for (size_t n = 0; n < ndbNodes.size(); n++){
386 if (ndbNodes[n].node_id == _nodes[i])
387 ndbNode = &ndbNodes[n];
388 }
389
390 if(ndbNode == NULL){
391 allInState = false;
392 continue;
393 }
394
395 g_info << "State node " << ndbNode->node_id << " "
396 << ndb_mgm_get_node_status_string(ndbNode->node_status);
397 if (ndbNode->node_status == NDB_MGM_NODE_STATUS_STARTING)
398 g_info<< ", start_phase=" << ndbNode->start_phase;
399 g_info << endl;
400
401 assert(ndbNode != NULL);
402
403 if(_status == NDB_MGM_NODE_STATUS_STARTING &&
404 ((ndbNode->node_status == NDB_MGM_NODE_STATUS_STARTING &&
405 ndbNode->start_phase >= _startphase) ||
406 (ndbNode->node_status == NDB_MGM_NODE_STATUS_STARTED)))
407 continue;
408
409 if (_status == NDB_MGM_NODE_STATUS_STARTING){
410 g_info << "status = "
411 << ndb_mgm_get_node_status_string(ndbNode->node_status)
412 <<", start_phase="<<ndbNode->start_phase<<endl;
413 if (ndbNode->node_status != _status) {
414 if (ndbNode->node_status < _status)
415 allInState = false;
416 else
417 g_info << "node_status(" << ndbNode->node_status
418 <<") != _status("<<_status<<")"<<endl;
419 } else if (ndbNode->start_phase < _startphase)
420 allInState = false;
421 } else {
422 if (ndbNode->node_status != _status)
423 allInState = false;
424 }
425 }
426 g_info << "Waiting for cluster enter state"
427 << ndb_mgm_get_node_status_string(_status)<< endl;
428 NdbSleep_SecSleep(1);
429 attempts++;
430 }
431 return 0;
432 }
433
waitNodesStarted(const int * _nodes,int _num_nodes,unsigned int _timeout)434 int NdbRestarter::waitNodesStarted(const int * _nodes, int _num_nodes,
435 unsigned int _timeout){
436 return waitNodesState(_nodes, _num_nodes,
437 NDB_MGM_NODE_STATUS_STARTED, _timeout);
438 }
439
waitNodesStartPhase(const int * _nodes,int _num_nodes,int _startphase,unsigned int _timeout)440 int NdbRestarter::waitNodesStartPhase(const int * _nodes, int _num_nodes,
441 int _startphase, unsigned int _timeout){
442 return waitNodesState(_nodes, _num_nodes,
443 NDB_MGM_NODE_STATUS_STARTING, _timeout,
444 _startphase);
445 }
446
waitNodesNoStart(const int * _nodes,int _num_nodes,unsigned int _timeout)447 int NdbRestarter::waitNodesNoStart(const int * _nodes, int _num_nodes,
448 unsigned int _timeout){
449 return waitNodesState(_nodes, _num_nodes,
450 NDB_MGM_NODE_STATUS_NOT_STARTED, _timeout);
451 }
452
453 bool
isConnected()454 NdbRestarter::isConnected(){
455 if (connected == true)
456 return true;
457 return connect() == 0;
458 }
459
460 int
connect()461 NdbRestarter::connect(){
462 disconnect();
463 handle = ndb_mgm_create_handle();
464 if (handle == NULL){
465 g_err << "handle == NULL" << endl;
466 return -1;
467 }
468 g_info << "Connecting to mgmsrv at " << addr.c_str() << endl;
469 if (ndb_mgm_set_connectstring(handle,addr.c_str()))
470 {
471 MGMERR(handle);
472 g_err << "Connection to " << addr.c_str() << " failed" << endl;
473 return -1;
474 }
475
476 if (ndb_mgm_connect(handle, 0, 0, 0) == -1)
477 {
478 MGMERR(handle);
479 g_err << "Connection to " << addr.c_str() << " failed" << endl;
480 return -1;
481 }
482
483 connected = true;
484 return 0;
485 }
486
487 void
disconnect()488 NdbRestarter::disconnect(){
489 if (handle != NULL){
490 ndb_mgm_disconnect(handle);
491 ndb_mgm_destroy_handle(&handle);
492 }
493 connected = false;
494 }
495
496 int
getStatus()497 NdbRestarter::getStatus(){
498 int retries = 0;
499 struct ndb_mgm_cluster_state * status;
500 struct ndb_mgm_node_state * node;
501
502 ndbNodes.clear();
503 mgmNodes.clear();
504 apiNodes.clear();
505
506 if (!isConnected())
507 return -1;
508
509 while(retries < 10){
510 status = ndb_mgm_get_status(handle);
511 if (status == NULL){
512 ndbout << "status==NULL, retries="<<retries<<endl;
513 MGMERR(handle);
514 retries++;
515 continue;
516 }
517 for (int i = 0; i < status->no_of_nodes; i++){
518 node = &status->node_states[i];
519 switch(node->node_type){
520 case NDB_MGM_NODE_TYPE_NDB:
521 ndbNodes.push_back(*node);
522 break;
523 case NDB_MGM_NODE_TYPE_MGM:
524 mgmNodes.push_back(*node);
525 break;
526 case NDB_MGM_NODE_TYPE_API:
527 apiNodes.push_back(*node);
528 break;
529 default:
530 if(node->node_status == NDB_MGM_NODE_STATUS_UNKNOWN ||
531 node->node_status == NDB_MGM_NODE_STATUS_NO_CONTACT){
532 retries++;
533 ndbNodes.clear();
534 mgmNodes.clear();
535 apiNodes.clear();
536 free(status);
537 status = NULL;
538 i = status->no_of_nodes;
539
540 ndbout << "kalle"<< endl;
541 break;
542 }
543 abort();
544 break;
545 }
546 }
547 if(status == 0){
548 ndbout << "status == 0" << endl;
549 continue;
550 }
551 free(status);
552 return 0;
553 }
554
555 g_err << "getStatus failed" << endl;
556 return -1;
557 }
558
559
getNumDbNodes()560 int NdbRestarter::getNumDbNodes(){
561 if (!isConnected())
562 return -1;
563
564 if (getStatus() != 0)
565 return -1;
566
567 return ndbNodes.size();
568 }
569
restartAll(bool initial,bool nostart,bool abort)570 int NdbRestarter::restartAll(bool initial,
571 bool nostart,
572 bool abort){
573
574 if (!isConnected())
575 return -1;
576
577 if (ndb_mgm_restart2(handle, 0, NULL, initial, 1, abort) == -1) {
578 MGMERR(handle);
579 g_err << "Could not restart(stop) all nodes " << endl;
580 // return -1; Continue anyway - Magnus
581 }
582
583 if (waitClusterNoStart(60) != 0){
584 g_err << "Cluster didnt enter STATUS_NOT_STARTED within 60s" << endl;
585 return -1;
586 }
587
588 if(nostart){
589 g_debug << "restartAll: nostart == true" << endl;
590 return 0;
591 }
592
593 if (ndb_mgm_start(handle, 0, NULL) == -1) {
594 MGMERR(handle);
595 g_err << "Could not restart(start) all nodes " << endl;
596 return -1;
597 }
598
599 return 0;
600 }
601
startAll()602 int NdbRestarter::startAll(){
603 if (!isConnected())
604 return -1;
605
606 if (ndb_mgm_start(handle, 0, NULL) == -1) {
607 MGMERR(handle);
608 g_err << "Could not start all nodes " << endl;
609 return -1;
610 }
611
612 return 0;
613
614 }
615
startNodes(const int * nodes,int num_nodes)616 int NdbRestarter::startNodes(const int * nodes, int num_nodes){
617 if (!isConnected())
618 return -1;
619
620 if (ndb_mgm_start(handle, num_nodes, nodes) != num_nodes) {
621 MGMERR(handle);
622 g_err << "Could not start all nodes " << endl;
623 return -1;
624 }
625
626 return 0;
627 }
628
insertErrorInNode(int _nodeId,int _error)629 int NdbRestarter::insertErrorInNode(int _nodeId, int _error){
630 if (!isConnected())
631 return -1;
632
633 ndb_mgm_reply reply;
634 reply.return_code = 0;
635
636 if (ndb_mgm_insert_error(handle, _nodeId, _error, &reply) == -1){
637 MGMERR(handle);
638 g_err << "Could not insert error in node with id = "<< _nodeId << endl;
639 }
640 if(reply.return_code != 0){
641 g_err << "Error: " << reply.message << endl;
642 }
643 return 0;
644 }
645
insertErrorInAllNodes(int _error)646 int NdbRestarter::insertErrorInAllNodes(int _error){
647 if (!isConnected())
648 return -1;
649
650 if (getStatus() != 0)
651 return -1;
652
653 int result = 0;
654
655 for(size_t i = 0; i < ndbNodes.size(); i++){
656 g_debug << "inserting error in node " << ndbNodes[i].node_id << endl;
657 if (insertErrorInNode(ndbNodes[i].node_id, _error) == -1)
658 result = -1;
659 }
660 return result;
661
662 }
663
664
665
dumpStateOneNode(int _nodeId,const int * _args,int _num_args)666 int NdbRestarter::dumpStateOneNode(int _nodeId, const int * _args, int _num_args){
667 if (!isConnected())
668 return -1;
669
670 ndb_mgm_reply reply;
671 reply.return_code = 0;
672
673 if (ndb_mgm_dump_state(handle, _nodeId, _args, _num_args, &reply) == -1){
674 MGMERR(handle);
675 g_err << "Could not dump state in node with id = "<< _nodeId << endl;
676 }
677
678 if(reply.return_code != 0){
679 g_err << "Error: " << reply.message << endl;
680 }
681 return reply.return_code;
682 }
683
dumpStateAllNodes(const int * _args,int _num_args)684 int NdbRestarter::dumpStateAllNodes(const int * _args, int _num_args){
685 if (!isConnected())
686 return -1;
687
688 if (getStatus() != 0)
689 return -1;
690
691 int result = 0;
692
693 for(size_t i = 0; i < ndbNodes.size(); i++){
694 g_debug << "dumping state in node " << ndbNodes[i].node_id << endl;
695 if (dumpStateOneNode(ndbNodes[i].node_id, _args, _num_args) == -1)
696 result = -1;
697 }
698 return result;
699
700 }
701
702
enterSingleUserMode(int _nodeId)703 int NdbRestarter::enterSingleUserMode(int _nodeId){
704 if (!isConnected())
705 return -1;
706
707 ndb_mgm_reply reply;
708 reply.return_code = 0;
709
710 if (ndb_mgm_enter_single_user(handle, _nodeId, &reply) == -1){
711 MGMERR(handle);
712 g_err << "Could not enter single user mode api node = "<< _nodeId << endl;
713 }
714
715 if(reply.return_code != 0){
716 g_err << "Error: " << reply.message << endl;
717 }
718
719 return reply.return_code;
720 }
721
722
exitSingleUserMode()723 int NdbRestarter::exitSingleUserMode(){
724 if (!isConnected())
725 return -1;
726
727 ndb_mgm_reply reply;
728 reply.return_code = 0;
729
730 if (ndb_mgm_exit_single_user(handle, &reply) == -1){
731 MGMERR(handle);
732 g_err << "Could not exit single user mode " << endl;
733 }
734
735 if(reply.return_code != 0){
736 g_err << "Error: " << reply.message << endl;
737 }
738 return reply.return_code;
739 }
740
741 ndb_mgm_configuration*
getConfig()742 NdbRestarter::getConfig(){
743 if(m_config) return m_config;
744
745 if (!isConnected())
746 return 0;
747 m_config = ndb_mgm_get_configuration(handle, 0);
748 return m_config;
749 }
750
751 template class Vector<ndb_mgm_node_state>;
752