1 /*
2    Copyright (c) 2003, 2021, Oracle and/or its affiliates.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License, version 2.0,
6    as published by the Free Software Foundation.
7 
8    This program is also distributed with certain software (including
9    but not limited to OpenSSL) that is licensed under separate terms,
10    as designated in a particular file or component or in included license
11    documentation.  The authors of MySQL hereby grant you an additional
12    permission to link the program and your derivative works with the
13    separately licensed software that they have included with MySQL.
14 
15    This program is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License, version 2.0, for more details.
19 
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
23 */
24 
25 #define NDBCNTR_C
26 #include "Ndbcntr.hpp"
27 
28 #include <ndb_limits.h>
29 #include <ndb_version.h>
30 #include <SimpleProperties.hpp>
31 #include <signaldata/NodeRecoveryStatusRep.hpp>
32 #include <signaldata/DictTabInfo.hpp>
33 #include <signaldata/SchemaTrans.hpp>
34 #include <signaldata/CreateTable.hpp>
35 #include <signaldata/CreateHashMap.hpp>
36 #include <signaldata/ReadNodesConf.hpp>
37 #include <signaldata/NodeFailRep.hpp>
38 #include <signaldata/TcKeyReq.hpp>
39 #include <signaldata/TcKeyConf.hpp>
40 #include <signaldata/EventReport.hpp>
41 #include <signaldata/NodeStateSignalData.hpp>
42 #include <signaldata/StopPerm.hpp>
43 #include <signaldata/StopMe.hpp>
44 #include <signaldata/WaitGCP.hpp>
45 #include <signaldata/CheckNodeGroups.hpp>
46 #include <signaldata/StartOrd.hpp>
47 #include <signaldata/AbortAll.hpp>
48 #include <signaldata/SystemError.hpp>
49 #include <signaldata/NdbSttor.hpp>
50 #include <signaldata/CntrStart.hpp>
51 #include <signaldata/DumpStateOrd.hpp>
52 
53 #include <signaldata/FsRemoveReq.hpp>
54 #include <signaldata/ReadConfig.hpp>
55 
56 #include <signaldata/FailRep.hpp>
57 
58 #include <AttributeHeader.hpp>
59 #include <Configuration.hpp>
60 #include <DebuggerNames.hpp>
61 #include <signaldata/DihRestart.hpp>
62 
63 #include <NdbOut.hpp>
64 #include <NdbTick.h>
65 
66 #include <signaldata/TakeOver.hpp>
67 #include <signaldata/CreateNodegroupImpl.hpp>
68 #include <signaldata/DropNodegroupImpl.hpp>
69 #include <signaldata/CreateFilegroup.hpp>
70 
71 #include <EventLogger.hpp>
72 
73 #define JAM_FILE_ID 458
74 
75 
76 extern EventLogger * g_eventLogger;
77 
78 // used during shutdown for reporting current startphase
79 // accessed from Emulator.cpp, NdbShutdown()
80 Uint32 g_currentStartPhase = 0;
81 
82 /**
83  * ALL_BLOCKS Used during start phases and while changing node state
84  *
85  * NDBFS_REF Has to be before NDBCNTR_REF (due to "ndb -i" stuff)
86  */
87 struct BlockInfo {
  BlockReference Ref;       // Block reference
  Uint32 NextSP;            // Next start phase
  Uint32 ErrorInsertStart;  // First error insert number in the block's range
  Uint32 ErrorInsertStop;   // Last error insert number in the block's range
92 };
93 
94 static BlockInfo ALL_BLOCKS[] = {
95   { NDBFS_REF,   0 ,  2000,  2999 },
96   { DBTC_REF,    1 ,  8000,  8035 },
97   { DBDIH_REF,   1 ,  7000,  7173 },
98   { DBLQH_REF,   1 ,  5000,  5030 },
99   { DBACC_REF,   1 ,  3000,  3999 },
100   { DBTUP_REF,   1 ,  4000,  4007 },
101   { DBDICT_REF,  1 ,  6000,  6003 },
102   { NDBCNTR_REF, 0 ,  1000,  1999 },
103   { CMVMI_REF,   1 ,  9000,  9999 }, // before QMGR
104   { QMGR_REF,    1 ,     1,   999 },
105   { TRIX_REF,    1 ,     0,     0 },
106   { BACKUP_REF,  1 , 10000, 10999 },
107   { DBUTIL_REF,  1 , 11000, 11999 },
108   { SUMA_REF,    1 , 13000, 13999 },
109   { DBTUX_REF,   1 , 12000, 12999 }
110   ,{ TSMAN_REF,  1 ,     0,     0 }
111   ,{ LGMAN_REF,  1 ,     0,     0 }
112   ,{ PGMAN_REF,  1 ,     0,     0 }
113   ,{ RESTORE_REF,1 ,     0,     0 }
114   ,{ DBINFO_REF, 1 ,     0,     0 }
115   ,{ DBSPJ_REF,  1 ,     0,     0 }
116   ,{ THRMAN_REF, 1 ,     0,     0 }
117 };
118 
119 static const Uint32 ALL_BLOCKS_SZ = sizeof(ALL_BLOCKS)/sizeof(BlockInfo);
120 
121 static BlockReference readConfigOrder[ALL_BLOCKS_SZ] = {
122   CMVMI_REF,
123   NDBFS_REF,
124   DBINFO_REF,
125   DBTUP_REF,
126   DBACC_REF,
127   DBTC_REF,
128   DBLQH_REF,
129   DBTUX_REF,
130   DBDICT_REF,
131   DBDIH_REF,
132   NDBCNTR_REF,
133   QMGR_REF,
134   TRIX_REF,
135   BACKUP_REF,
136   DBUTIL_REF,
137   SUMA_REF,
138   TSMAN_REF,
139   LGMAN_REF,
140   PGMAN_REF,
141   RESTORE_REF,
142   DBSPJ_REF,
143   THRMAN_REF
144 };
145 
146 /*******************************/
147 /*  CONTINUEB                  */
148 /*******************************/
void Ndbcntr::execCONTINUEB(Signal* signal)
150 {
151   jamEntry();
152   UintR Ttemp1 = signal->theData[0];
153   switch (Ttemp1) {
154   case ZSTARTUP:{
155     if(getNodeState().startLevel == NodeState::SL_STARTED){
156       jam();
157       return;
158     }
159 
160     if(cmasterNodeId == getOwnNodeId() && c_start.m_starting.isclear()){
161       jam();
162       trySystemRestart(signal);
163       // Fall-through
164     }
165 
166     const Uint64 elapsed = NdbTick_Elapsed(
167                               c_start.m_startTime,
168                               NdbTick_getCurrentTicks()).milliSec();
169 
170     if (elapsed > c_start.m_startFailureTimeout)
171     {
172       jam();
173       Uint32 to_3= 0;
174       const ndb_mgm_configuration_iterator * p =
175 	m_ctx.m_config.getOwnConfigIterator();
176       ndb_mgm_get_int_parameter(p, CFG_DB_START_FAILURE_TIMEOUT, &to_3);
177       BaseString tmp;
      tmp.append("Shutting down node as total restart time exceeds "
                 "StartFailureTimeout as set in config file ");
      if (to_3 == 0)
        tmp.append(" 0 (infinite)");
      else
        tmp.appfmt(" %d", to_3);
184 
185       progError(__LINE__, NDBD_EXIT_RESTART_TIMEOUT, tmp.c_str());
186     }
187 
188     signal->theData[0] = ZSTARTUP;
189     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 1000, 1);
190     break;
191   }
192   case ZSHUTDOWN:
193     jam();
194     c_stopRec.checkTimeout(signal);
195     break;
196   case ZBLOCK_STTOR:
197     if (ERROR_INSERTED(1002))
198     {
199       signal->theData[0] = ZBLOCK_STTOR;
200       sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 1);
201       return;
202     }
203     else
204     {
205       c_missra.sendNextSTTOR(signal);
206     }
207     return;
208   default:
209     jam();
210     systemErrorLab(signal, __LINE__);
211     return;
212     break;
213   }//switch
214 }//Ndbcntr::execCONTINUEB()
215 
216 void
Ndbcntr::execAPI_START_REP(Signal* signal)
218 {
219   if(refToBlock(signal->getSendersBlockRef()) == QMGR)
220   {
221     for(Uint32 i = 0; i<ALL_BLOCKS_SZ; i++){
222       sendSignal(ALL_BLOCKS[i].Ref, GSN_API_START_REP, signal, 1, JBB);
223     }
224   }
225 }
226 /*******************************/
227 /*  SYSTEM_ERROR               */
228 /*******************************/
void Ndbcntr::execSYSTEM_ERROR(Signal* signal)
230 {
231   const SystemError * const sysErr = (SystemError *)signal->getDataPtr();
232   char buf[100];
233   int killingNode = refToNode(sysErr->errorRef);
234   Uint32 data1 = sysErr->data[0];
235 
236   jamEntry();
237   switch (sysErr->errorCode){
238   case SystemError::GCPStopDetected:
239   {
240     BaseString::snprintf(buf, sizeof(buf),
241 	     "Node %d killed this node because "
242 	     "GCP stop was detected",
243 	     killingNode);
244     signal->theData[0] = 7025;
245     EXECUTE_DIRECT(DBDIH, GSN_DUMP_STATE_ORD, signal, 1);
246     jamEntry();
247 
248     {
249       signal->theData[0] = 12002;
250       EXECUTE_DIRECT(LGMAN, GSN_DUMP_STATE_ORD, signal, 1, 0);
251     }
252 
253     jamEntry();
254 
255     if (ERROR_INSERTED(1004))
256     {
257       jam();
258       g_eventLogger->info("NDBCNTR not shutting down due to GCP stop");
259       return;
260     }
261     CRASH_INSERTION(1005);
262 
263     break;
264   }
265   case SystemError::CopyFragRefError:
266     CRASH_INSERTION(1000);
267     BaseString::snprintf(buf, sizeof(buf),
268 			 "Killed by node %d as "
269 			 "copyfrag failed, error: %u",
270 			 killingNode, data1);
271     break;
272 
273   case SystemError::StartFragRefError:
274     BaseString::snprintf(buf, sizeof(buf),
275 			 "Node %d killed this node because "
276 			 "it replied StartFragRef error code: %u.",
277 			 killingNode, data1);
278     break;
279 
280   case SystemError::CopySubscriptionRef:
281     CRASH_INSERTION(1003);
282     BaseString::snprintf(buf, sizeof(buf),
283 	     "Node %d killed this node because "
284 	     "it could not copy a subscription during node restart. "
285 	     "Copy subscription error code: %u.",
286 	     killingNode, data1);
287     break;
288   case SystemError::CopySubscriberRef:
289     BaseString::snprintf(buf, sizeof(buf),
290 	     "Node %d killed this node because "
291 	     "it could not start a subscriber during node restart. "
292 	     "Copy subscription error code: %u.",
293 	     killingNode, data1);
294     break;
295   default:
296     BaseString::snprintf(buf, sizeof(buf), "System error %d, "
297 	     " this node was killed by node %d",
298 	     sysErr->errorCode, killingNode);
299     break;
300   }
301 
302   progError(__LINE__, NDBD_EXIT_SYSTEM_ERROR, buf);
303   return;
304 }//Ndbcntr::execSYSTEM_ERROR()
305 
306 
307 struct ddentry
308 {
309   Uint32 type;
310   const char * name;
311   Uint64 size;
312 };
313 
314 /**
315  * f_dd[] = {
316  * { DictTabInfo::LogfileGroup, "DEFAULT-LG", 32*1024*1024 },
317  * { DictTabInfo::Undofile, "undofile.dat", 64*1024*1024 },
318  * { DictTabInfo::Tablespace, "DEFAULT-TS", 1024*1024 },
319  * { DictTabInfo::Datafile, "datafile.dat", 64*1024*1024 },
320  * { ~0, 0, 0 }
321  * };
322  */
323 Vector<ddentry> f_dd;
324 
325 static
326 Uint64
parse_size(const char * src)
328 {
329   Uint64 num = 0;
330   char * endptr = 0;
331   num = my_strtoll(src, &endptr, 10);
332 
333   if (endptr)
334   {
335     switch(* endptr){
336     case 'k':
337     case 'K':
338       num *= 1024;
339       break;
340     case 'm':
341     case 'M':
342       num *= 1024;
343       num *= 1024;
344       break;
345     case 'g':
346     case 'G':
347       num *= 1024;
348       num *= 1024;
349       num *= 1024;
350       break;
351     }
352   }
353   return num;
354 }
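
/**
 * Illustrative only (not called anywhere): given the suffix handling in
 * parse_size() above,
 *
 *   parse_size("64")  == 64
 *   parse_size("64k") == 64 * 1024
 *   parse_size("32M") == 32 * 1024 * 1024
 *   parse_size("1G")  == 1024ULL * 1024 * 1024
 */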
355 
356 static
357 int
parse_spec(Vector<ddentry> & dst,
359            const char * src,
360            Uint32 type)
361 {
362   const char * key;
363   Uint32 filetype;
364 
365   struct ddentry group;
366   if (type == DictTabInfo::LogfileGroup)
367   {
368     key = "undo_buffer_size=";
369     group.size = 64*1024*1024;
370     group.name = "DEFAULT-LG";
371     group.type = type;
372     filetype = DictTabInfo::Undofile;
373   }
374   else
375   {
376     key = "extent_size=";
377     group.size = 1024*1024;
378     group.name = "DEFAULT-TS";
379     group.type = type;
380     filetype = DictTabInfo::Datafile;
381   }
382   size_t keylen = strlen(key);
383 
384   BaseString arg(src);
385   Vector<BaseString> list;
386   arg.split(list, ";");
387 
388   bool first = true;
389   for (Uint32 i = 0; i<list.size(); i++)
390   {
391     list[i].trim();
392     if (native_strncasecmp(list[i].c_str(), "name=", sizeof("name=")-1) == 0)
393     {
394       group.name= strdup(list[i].c_str() + sizeof("name=")-1);
395     }
396     else if (native_strncasecmp(list[i].c_str(), key, keylen) == 0)
397     {
398       group.size = parse_size(list[i].c_str() + keylen);
399     }
400     else if (strlen(list[i].c_str()) == 0 && (i + 1) == list.size())
401     {
402       /**
403        * ignore stray ";"
404        */
405     }
406     else
407     {
408       /**
409        * interpret as filespec
410        */
411       struct ddentry entry;
412       const char * path = list[i].c_str();
413       char * sizeptr = const_cast<char*>(strchr(path, ':'));
414       if (sizeptr == 0)
415       {
416         return -1;
417       }
418       * sizeptr = 0;
419 
420       entry.name = strdup(path);
421       entry.size = parse_size(sizeptr + 1);
422       entry.type = filetype;
423 
424       if (first)
425       {
426         /**
         * push group as well
428          */
429         first = false;
430         dst.push_back(group);
431       }
432       dst.push_back(entry);
433     }
434   }
435   return 0;
436 }
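
/**
 * Illustrative example only (the spec string below is hypothetical, it is
 * assumed to come from configuration such as InitialLogfileGroup):
 *
 *   Vector<ddentry> entries;
 *   parse_spec(entries,
 *              "name=LG1;undo_buffer_size=32M;undo1.log:256M;undo2.log:256M",
 *              DictTabInfo::LogfileGroup);
 *
 * yields one LogfileGroup entry ("LG1", 32M) followed by two Undofile
 * entries ("undo1.log" and "undo2.log", 256M each) in the vector.
 */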
437 
438 /**
439 Restart Phases in MySQL Cluster
440 -------------------------------
In MySQL Cluster the restart of a node is processed and driven in a set of
phases. In addition a node restart is also synchronised with already started
nodes and with other nodes that are starting up in parallel with our node.
This comment describes the various phases used.
445 
446 The first step in starting a node is to create the data node run-time
environment. The data node process normally runs under an angel process; the
angel ensures that the data node is automatically restarted in case of
failures. So the only reason to start the data node process manually again is
after an OS crash, after a shutdown by an operator, or as part of a software
upgrade.
451 
452 When starting up the data node, the data node needs a node id, this is either
453 assigned through setting the parameter --ndb-nodeid when starting the data
454 node, or it is assigned by the management server when retrieving the
455 configuration. The angel process will ensure that the assigned node id will be
456 the same for all restarts of the data node.
457 
458 After forking the data node process, the starting process stays as the angel
459 process and the new process becomes the actual data node process. The actual
460 data node process starts by retrieving the configuration from the management
461 server.
462 
463 At this stage we have read the options, we have allocated a node id, we have
464 the configuration loaded from the management server. We will print some
465 important information to the data node log about our thread configuration and
466 some other things. To ensure that we find the correct files and create files
467 in the correct place we set the datadir of our data node process.
468 
469 Next we have to start the watch-dog thread since we are now starting to do
470 activities where we want to ensure that we don't get stuck due to some
471 software error.
472 
473 Next we will allocate the memory of the global memory pools, this is where
474 most memory is allocated, we still have a fair amount of memory allocated as
475 part of the initialisation of the various software modules in the NDB kernel,
476 but step by step we're moving towards usage of the global memory pools.
477 
478 Allocating memory can be a fairly time-consuming process where the OS can
479 require up to one second for each GByte of memory allocated (naturally OS
dependent and will change over time). What consumes the time here is that we
also touch each page to ensure that the allocated memory is mapped to real
physical memory, avoiding page misses while we're running the process. To
speed up this process we have made the touching of memory
484 multi-threaded.
485 
The point at which most memory is allocated is actually configurable: the
configuration variable LateAlloc can be used to delay the allocation of most
memory to the early phases of the restart.
489 
The only memory that must be allocated in the early phase is the job
491 buffer, memory for sending messages over the network and finally memory for
492 messages to and from the file system threads. So allocation of e.g.
493 DataMemory, IndexMemory and DiskPageBufferMemory can be delayed until the
494 early start phases.
495 
496 After allocating the global memory pool we initialise all data used by the
497 run-time environment. This ensures that we're ready to send and receive data
498 between the threads used in the data node process as soon as they are started.
499 
500 At this point we've only started the watch-dog process and the thread started
501 as part of creating the process (this thread will later be converted to the
502 first receive thread if we're running ndbmtd and the only execution thread if
503 we are running ndbd). Next step is to load all software modules and initialise
504 those to ensure they're properly set-up when the messages start arriving for
505 execution.
506 
507 Before we start the run-time environment we also need to activate the send
508 and receive services. This involves creating a socket client thread that
509 attempts to connect to socket server parts of other nodes in the cluster and
a thread that listens on the socket server used for those data nodes towards
which we act as the socket server.
512 
513 The default behaviour is that the node with the lowest nodeid is the socket
514 server in the communication setup. This can be changed in the data node
515 configuration.
516 
517 Before we proceed and start the data node environment we will place the start
518 signals of the run-time environment in its proper job buffer. Actually to
519 start the system one needs to place two equal signals in the job buffer. The
520 first start signal starts the communication to other nodes and sets the state
521 to wait for the next signal to actually start the system. The second one will
522 start running the start phases.
523 
524 Finally we start all the threads of the run-time environment. These can
525 currently include a main thread, a rep thread, a number of tc threads,
526 a number of send threads, a number of receive threads and a number of
527 ldm threads. Given that communication buffers for all threads have been
preallocated, we can start sending signals immediately as those threads start
up. The receiving thread will start to take care of its received signals as
soon as it has come to that point in its thread startup code.
531 
532 There are two identical start signals, the first starts a recurring signal
533 that is sent on a regular basis to keep track of time in the data node.
534 Only the second one starts performing the various start phases.
535 
536 A startup of a data node is handled in a set of phases. The first phase is
537 to send the signal READ_CONFIG_REQ to all software modules in the kernel,
538 then STTOR is similarly sent to all software modules in 256 phases numbered
from 0 to 255. We don't use all of those phases, but the code is flexible
such that any of those phases could be used now or sometime in the future.
542 
543 In addition we have 6 modules that are involved in one more set of start
phases. The signal sent in these phases is called NDB_STTOR. The original
545 idea was to view this message as the local start of the NDB subsystem.
546 These signals are sent and handled by NDBCNTR and sent as part of the STTOR
547 handling in NDBCNTR. This means that it becomes a sequential part of the
548 startup phases.
549 
550 Before starting the phases we ensure that any management node can connect
to our node, that all other nodes are disconnected and that they can only
552 send messages to the QMGR module. The management server receives reports
553 about various events in the data node and the QMGR module is taking care of
554 the inclusion of the data node into the cluster. Before we're included in
555 the cluster we cannot communicate with other nodes in any manner.
556 
The start always begins in the main thread, where every software module is
present at least through the proxy module that all multithreaded modules
contain. The proxy module makes it easy to send and receive messages to a
set of module instances of the same type using one message and one reply.
561 
562 The READ_CONFIG_REQ signals are always sent in the same order. It starts by
sending to CMVMI; this is the block that receives the start order and it
provides a number of functions through which the software modules can affect
the run-time environment. It normally allocates most memory of the process and
566 touches all of this memory. It is part of the main thread.
567 
568 The next module receiving READ_CONFIG_REQ is NDBFS, this is the module that
569 controls the file system threads, this module is found in the main thread.
570 
571 Next module is DBINFO, this module supports the ndbinfo database used to get
572 information about the data node internals in table format, this module is
573 found in the main thread.
574 
575 Next is DBTUP, this is the module where the actual data is stored. Next DBACC,
576 the module where primary key and unique key hash indexes are stored and where
577 we control row locks from. Both those blocks are contained in the ldm threads.
578 
579 Next is DBTC, the module where transaction coordination is managed from,
580 this module is part of the tc thread. Next is DBLQH, the module that controls
581 the actions on data through key operations and scans and also handles the
582 REDO logs. This is the main module of the ldm thread.
583 
Next is DBTUX that operates ordered indexes, reusing pages used to store rows
585 in DBTUP, also part of the ldm thread. Next is DBDICT, the dictionary module
586 used to store and handle all metadata information about tables and columns,
587 tablespaces, log files and so forth. DICT is part of the main thread.
588 
589 Next is DBDIH, the module to store and handle distribution information about
590 all tables, the table partitions and all replicas of each partition. It
591 controls the local checkpoint process, the global checkpoint process and
592 controls a major part of the restart processing. The DIH module is a part of
593 the main thread.
594 
595 Next is NDBCNTR that controls the restart phases, it's part of the main
596 thread. Next is QMGR which takes care of the heartbeat protocol and inclusion
597 and exclusion of nodes in the cluster. It's part of the main thread.
598 
599 Next is TRIX that performs a few services related to ordered indexes and other
600 trigger-based services. It's part of the tc thread. Next is BACKUP, this is
601 used for backups and local checkpoints and is part of the ldm thread.
602 
603 Next is DBUTIL that provides a number of services such as performing key
604 operations on behalf of code in the modules. It's part of the main thread.
605 Next is the SUMA module that takes care of replication events, this is the
606 module handled by the rep thread.
607 
608 Next is TSMAN, then LGMAN, and then PGMAN that are all part of the disk data
609 handling taking care of tablespace, UNDO logging and page management. They
610 are all part of the ldm thread.
611 
612 RESTORE is a module used to restore local checkpoints as part of a startup.
613 This module is also part of the ldm thread.
614 
615 Finally we have the DBSPJ module that takes care of join queries pushed down
616 to the data node, it executes as part of the tc thread.
617 
DBTUP, DBACC, DBLQH, DBTUX, BACKUP, TSMAN, LGMAN, PGMAN and RESTORE are all
tightly integrated modules that take care of the data and indexes locally in
each node. This set of modules forms an LDM instance, each node can have
621 multiple LDM instances and these can be spread over a set of threads.
622 Each LDM instance owns its own partition of the data.
623 
We also have two modules that are not part of the restart handling. The first
is the TRPMAN module that performs a number of transport-related functions
626 (communication with other nodes). It executes in the receive threads. Finally
627 we have the THRMAN that executes in every thread and does some thread
628 management functionality.
629 
630 All modules receive READ_CONFIG_REQ, all modules also receive STTOR for
phase 0 and phase 1. In phase 1 they report back which further start phases
they want to be informed about.
633 
634 During the READ_CONFIG_REQ the threads can execute for a very long time in
635 a module since we can be allocating and touching memory of large sizes. This
means that our watchdog thread has a special timeout for this phase to
637 ensure that we don't crash the process simply due to a long time of
638 initialising our memory. In normal operations each signal should execute only
639 for a small number of microseconds.
640 
641 The start phases are synchronized by sending the message STTOR to all modules,
642 logically each module gets this signal for each start phase from 0 to 255.
643 However the response message STTORRY contains the list of start phases the
644 module really is interested in.
645 
646 The NDBCNTR module that handles the start phase signals can optimise away
647 any signals not needed. The order in which modules receive the STTOR message
648 is the same for all phases:
649 
650 1) NDBFS
651 2) DBTC
652 3) DBDIH
653 4) DBLQH
654 5) DBACC
655 6) DBTUP
656 7) DBDICT
657 8) NDBCNTR
658 9) CMVMI
659 10)QMGR
660 11)TRIX
661 12)BACKUP
662 13)DBUTIL
663 14)SUMA
664 15)DBTUX
665 16)TSMAN
666 17)LGMAN
667 18)PGMAN
668 19)RESTORE
669 20)DBINFO
670 21)DBSPJ
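
As an illustration only (the helper names below are invented for this sketch,
the real driver is the Missra submodule of NDBCNTR together with the
ALL_BLOCKS table at the top of this file), the driving loop conceptually
looks like:

  for (Uint32 phase = 0; phase <= 255; phase++)
  {
    for (Uint32 i = 0; i < ALL_BLOCKS_SZ; i++)
    {
      // The STTORRY reply from phase 1 listed the phases the block wants,
      // so phases the block is not interested in are skipped.
      if (block_wants_phase(ALL_BLOCKS[i].Ref, phase))
      {
        send_STTOR(ALL_BLOCKS[i].Ref, phase);
        wait_for_STTORRY(ALL_BLOCKS[i].Ref);
      }
    }
  }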
671 
672 In addition there is a special start phase handling controlled by NDBCNTR,
673 so when NDBCNTR receives its own STTOR message it starts a local start phase
handling involving the modules DBLQH, DBDICT, DBTUP, DBACC, DBTC and DBDIH.
675 
676 This happens for phases 2 through 8. The messages sent in these start phases
677 are NDB_STTOR and NDB_STTORRY, they are handled in a similar manner to STTOR
and STTORRY. The modules also receive those start phases in the same order
for all phases, and this order is:
680 
681 1) DBLQH
682 2) DBDICT
683 3) DBTUP
684 4) DBACC
685 5) DBTC
686 6) DBDIH
687 
688 For those modules that are multithreaded, the STTOR and NDB_STTOR messages
are always received by the Proxy module that executes in the main thread.
690 The Proxy module will then send the STTOR and NDB_STTOR messages to each
691 individual instance of the module (the number of instances is normally the
692 same as the number of threads, but could sometimes be different). It does
693 so in parallel, so all instances execute STTOR in parallel.
694 
695 So effectively each instance of a module will logically first receive
696 READ_CONFIG_REQ, then a set of STTOR messages for each start phase and some
697 modules will also receive NDB_STTOR in a certain order. All these messages
698 are sent in a specific order and sequentially. So this means that we have the
699 ability to control when things are done by performing it in the correct start
700 phase.
701 
702 Next we will describe step-by-step what happens in a node restart (or a node
703 start as part of a cluster start/restart). The startup is currently a
704 sequential process except where it is stated that it happens in parallel.
The description below thus reflects the order in which things actually happen
currently.
707 
708 READ_CONFIG_REQ
709 ---------------
710 The READ_CONFIG_REQ does more or less the same for all software modules. It
711 allocates the memory required by the software module and initialises the
memory (creates various free lists and so forth). It also reads the various
configuration parameters which are of interest to the module (these often
affect the size of the memory we allocate).
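
As a rough sketch (the block name and the chosen parameter are only examples,
the real handlers differ per block), a READ_CONFIG_REQ handler typically
follows this pattern:

  void SomeBlock::execREAD_CONFIG_REQ(Signal* signal)
  {
    const ndb_mgm_configuration_iterator * p =
      m_ctx.m_config.getOwnConfigIterator();
    Uint32 noOfOperations = 0;
    ndb_mgm_get_int_parameter(p, CFG_DB_NO_OPS, &noOfOperations);
    // Size the block's pools from the parameter, allocate them and build
    // the free lists, then reply with READ_CONFIG_CONF.
  }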
715 
716 It starts in CMVMI that allocates most of the global memory pool, next we
717 have NDBFS that creates the necessary file directories for disk data, it
718 also creates the bound IO threads that can be used by one file at a time
(initial number of threads configurable through InitialNoOfOpenFiles), then it
720 creates a number of free threads (number of them configurable through
721 IOThreadPool) used by disk data files (all files used to handle disk data),
722 each such thread can be used to open/read/write/close a disk data file.
723 Finally NDBFS also creates the communication channel from the file system
724 threads back to the other threads.
725 
All other modules follow the same standard: they calculate a number of sizes
based on hard coded defines or on configuration variables, they allocate
memory accordingly, and finally they initialise those allocated memory
structures.
730 
731 STTOR Phase 0
732 -------------
The first STTOR phase executed is STTOR phase 0. The only modules doing
anything in this phase are NDBCNTR, which clears the file system if the start
is an initial start, and CMVMI, which creates the file system directory.
736 
737 STTOR Phase 1
738 -------------
739 Next phase executed is STTOR phase 1, in this phase most modules initialise
some more data, and references to neighbour modules are set up if necessary.
In addition DBDIH creates some special mutexes that ensure that only one
process is involved in certain parts of the code at a time.
743 
744 NDBCNTR initialises some data related to running NDB_STTOR starting in
745 phase 2. CMVMI locks memory if configured to do so, after this it installs the
746 normal watchdog timeout since now all large memory allocations are performed.
747 CMVMI also starts regular memory reporting.
748 
749 QMGR is the most active module in this phase. It initialises some data, it
750 gets the restart type (initial start or normal start) from DBDIH, it opens
communication to all nodes in the cluster, and it starts the node failure
checking used by the node inclusion handling. Finally it runs the protocol to
753 include the new node into the heartbeat protocol. This could take a while
754 since the node inclusion process can only bring in one node at a time and
755 the protocol contains some delays.
756 
757 The BACKUP module then starts the disk speed check loop which will run as
758 long as the node is up and running.
759 
760 STTOR Phase 2
761 -------------
762 Next step is to execute STTOR phase 2. The only module that does anything in
STTOR phase 2 is NDBCNTR: it asks DIH for the restart type, it reads the nodes
from the configuration, and it initialises the partial timeout variables that
control how long to wait before we perform a partial start.
766 
767 NDBCNTR sends the signal CNTR_START_REQ to the NDBCNTR in the current master
768 node, this signal enables the master node to delay the start of this node if
769 necessary due to other starting nodes or some other condition. For cluster
770 starts/restarts it also gives the master node the chance to ensure we wait
771 for enough nodes to start up before we start the nodes.
772 
773 The master only accepts one node at a time that has received CNTR_START_CONF,
774 the next node can only receive CNTR_START_CONF after the previous starting
node has completed copying the metadata and released the metadata locks and
locks on DIH info; that happens below in STTOR phase 5.
777 
778 So in a rolling restart it is quite common that the first node will get
779 CNTR_START_CONF and then instead get blocked on the DICT lock waiting for
780 an LCP to complete. The other nodes starting up in parallel will instead
781 wait on CNTR_START_CONF since only one node at a time can pass this.
782 
783 After receiving CNTR_START_CONF, NDBCNTR continues by running NDB_STTOR
784 phase 1. Here DBLQH initialises the node records, it starts a reporting
785 service. It does also initialise the data about the REDO log, this also
786 includes initialising the REDO log on disk for all types of initial start
787 (can be quite time consuming).
788 
789 DBDICT initialises the schema file (contains the tables that have been created
790 in the cluster and other metadata objects). DBTUP initialises a default value
fragment, and DBTC and DBDIH initialise some data variables. After completing
792 the NDB_STTOR phase in NDBCNTR there is no more work done in STTOR phase 2.
793 
794 STTOR Phase 3
795 -------------
796 Next step is to run the STTOR phase 3. Most modules that need the list of
797 nodes in the cluster reads this in this phase. DBDIH reads the nodes in this
798 phase, DBDICT sets the restart type. Next NDBCNTR receives this phase and
799 starts NDB_STTOR phase 2. In this phase DBLQH sets up connections from its
800 operation records to the operation records in DBACC and DBTUP. This is done
801 in parallel for all DBLQH module instances.
802 
803 DBDIH now prepares the node restart process by locking the meta data. This
804 means that we will wait until any ongoing meta data operation is completed
805 and when it is completed we will lock the meta data such that no meta data
806 changes can be done until we're done with the phase where we are copying the
metadata information.
808 
809 The reason for locking is that all meta data and distribution info is fully
810 replicated. So we need to lock this information while we are copying the data
811 from the master node to the starting node. While we retain this lock we cannot
812 change meta data through meta data transactions. Before copying the meta data
813 later we also need to ensure no local checkpoint is running since this also
814 updates the distribution information.
815 
816 After locking this we need to request permission to start the node from the
817 master node. The request for permission to start the node is handled by the
818 starting node sending START_PERMREQ to the master node. This could receive a
819 negative reply if another node is already processing a node restart, it could
820 fail if an initial start is required. If another node is already starting we
will wait 3 seconds and try again. This is executed in DBDIH as part of
822 NDB_STTOR phase 2.
823 
824 After completing the NDB_STTOR phase 2 the STTOR phase 3 continues by the
825 CMVMI module activating the checks of send packed data which is used by scan
826 and key operations.
827 
828 Next the BACKUP module reads the configured nodes. Next the SUMA module sets
829 the reference to the Page Pool such that it can reuse pages from this global
830 memory pool, next DBTUX sets the restart type. Finally PGMAN starts a stats
831 loop and a cleanup loop that will run as long as the node is up and running.
832 
833 We could crash the node if our node is still involved in some processes
834 ongoing in the master node. This is fairly normal and will simply trigger a
835 crash followed by a normal new start up by the angel process. The request
836 for permission is handled by the master sending the information to all nodes.
837 
838 For initial starts the request for permission can be quite time consuming
839 since we have to invalidate all local checkpoints from all tables in the
840 meta data on all nodes. There is no parallelisation of this invalidation
841 process currently, so it will invalidate one table at a time.
842 
843 STTOR Phase 4
844 -------------
845 After completing STTOR phase 3 we move onto STTOR phase 4. This phase starts
846 by DBLQH acquiring a backup record in the BACKUP module that will be used
847 for local checkpoint processing.
848 
849 Next NDBCNTR starts NDB_STTOR phase 3. This starts also in DBLQH where we
850 read the configured nodes. Then we start reading the REDO log to get it
851 set-up (we will set this up in the background, it will be synchronised by
another part of cluster restart/node restart later described). For all types
of initial starts we will wait until the initialisation of the REDO log has
been completed before reporting this phase as completed.
855 
856 Next DBDICT will read the configured nodes whereafter also DBTC reads the
857 configured nodes and starts transaction counters reporting. Next in
858 NDB_STTOR phase 3 is that DBDIH initialises restart data for initial starts.
859 
860 Before completing its work in STTOR phase 4, NDBCNTR will set-up a waiting
861 point such that all starting nodes have reached this point before
862 proceeding. This is only done for cluster starts/restarts, so not for node
863 restarts.
864 
865 The master node controls this waitpoint and will send the signal
866 NDB_STARTREQ to DBDIH when all nodes of the cluster restart have reached
867 this point. More on this signal later.
868 
869 The final thing happening in STTOR phase 4 is that DBSPJ reads the configured
870 nodes.
871 
872 STTOR Phase 5
873 -------------
874 We now move onto STTOR phase 5. The first thing done here is to run NDB_STTOR
875 phase 4. Only DBDIH does some work here and it only does something in node
876 restarts. In this case it asks the current master node to start it up by
877 sending the START_MEREQ signal to it.
878 
879 START_MEREQ works by copying distribution information from master DBDIH node
880 and then also meta data information from master DBDICT. It copies one table
881 of distribution information at a time which makes the process a bit slow
882 since it includes writing the table to disk in the starting node.
883 
The only way to trace this event is through the writing of the table
distribution information per table in DBDIH in the starting node. We can also
trace the reception of DICTSTARTREQ in the starting node's DBDICT.
887 
888 When DBDIH and DBDICT information is copied then we need to block the global
889 checkpoint in order to include the new node in all changes of meta data and
890 distribution information from now on. This is performed by sending
891 INCL_NODEREQ to all nodes. After this we can release the meta data lock that
892 was set by DBDIH already in STTOR phase 2.
893 
894 After completing NDB_STTOR phase 4, NDBCNTR synchronises the start again in
895 the following manner:
896 
1) If initial cluster start and master, then create the system tables.
2) If cluster start/restart, then wait for all nodes to reach this point.
3) After waiting for nodes in a cluster start/restart, run NDB_STTOR phase 5
   in the master node (only sent to DBDIH).
4) If node restart, then run NDB_STTOR phase 5 (only sent to DBDIH).
902 
903 NDB_STTOR phase 5 in DBDIH is waiting for completion of a local checkpoint
904 if it is a master and we are running a cluster start/restart. For node
905 restarts we send the signal START_COPYREQ to the starting node to ask for
906 copying of data to our node.
907 
908   START OF DATABASE RECOVERY
909 
910 We start with explaining a number of terms used.
911 ------------------------------------------------
912 LCP: Local checkpoint, in NDB this means that all data in main memory is
913 written to disk and we also write changed disk pages to disk to ensure
that all changes before a certain point are available on disk.
915 Execute REDO log: This means that we're reading the REDO log one REDO log
916 record at a time and executing the action if needed that is found in the
917 REDO log record.
918 Apply the REDO log: Synonym of execute the REDO log.
919 Prepare REDO log record: This is a REDO log record that contains the
920 information about a change in the database (insert/delete/update/write).
921 COMMIT REDO log record: This is a REDO log record that specifies that a
922 Prepare REDO log record is to be actually executed. The COMMIT REDO log
923 record contains a back reference to the Prepare REDO log record.
924 ABORT REDO log record: Similarly to the COMMIT REDO log record but here
925 the transaction was aborted so there is no need to apply the REDO log
926 record.
927 Database: Means in this context all the data residing in the cluster or
928 in the node when there is a node restart.
929 Off-line Database: Means that our database in our node is not on-line
930 and thus cannot be used for reading. This is the state of the database
931 after restoring a LCP, but before applying the REDO log.
932 Off-line Consistent database: This is a database state which is not
933 up-to-date with the most recent changes, but it represents an old state
934 in the database that existed previously. This state is achieved after
935 restoring an LCP and executing the REDO log.
936 On-line Database: This is a database state which is up-to-date, any node
that can be used to read data has its database on-line (actually
938 fragments are brought on-line one by one).
939 On-line Recoverable Database: This is an on-line database that is also
940 recoverable. In a node restart we reach the state on-line database first,
941 but we need to run an LCP before the database can also be recovered to
942 its current state. A recoverable database is also durable so this means
943 that we're adding the D in ACID to the database when we reach this state.
944 Node: There are API nodes, data nodes and management server nodes. A data
945 node is a ndbd/ndbmtd process that runs all the database logic and
946 contains the database data. The management server node is a process that
947 runs ndb_mgmd that contains the cluster configuration and also performs
948 a number of management services. API nodes are part of application processes
and of mysqld processes. There can be more than one API node per application
950 process. Each API node is connected through a socket (or other
951 communication media) to each of the data nodes and management server nodes.
952 When one refers to nodes in this text it's mostly implied that we're
953 talking about a data node.
954 Node Group: A set of data nodes that all contain the same data. The number
955 of nodes in a node group is equal to the number of replicas we use in the
956 cluster.
957 Fragment: A part of a table that is fully stored on one node group.
958 Partition: Synonym of fragment.
959 Fragment replica: This is one fragment in one node. There can be up
to 4 replicas of a fragment (thus a node group can have up to
961 4 nodes in it).
962 Distribution information: This is information about the partitions
963 (synonym of fragments) of the tables and on which nodes they reside
964 and information about LCPs that have been executed on each fragment
965 replica.
966 Metadata: This is the information about tables, indexes, triggers,
967 foreign keys, hash maps, files, log file groups, table spaces.
968 Dictionary information: Synonym to metadata.
LDM: Stands for Local Data Manager, these are the blocks that execute
the code handling the data within one data node. It contains blocks
that handle the tuple storage, the hash index, the T-tree index,
the page buffer manager, the tablespace manager, a block that writes
LCPs, a block that restores LCPs and a log manager for disk data.
974 
975 ------------------------------------------------------------------------------
| What happens as part of START_COPYREQ is the real database restore         |
977 | process. Here most of the important database recovery algorithms are       |
978 | executed to bring the database online again. The earlier phases were still |
979 | needed to restore the metadata and setup communication, setup memory and   |
980 | bringing in the starting node as a full citizen in the cluster of data     |
981 | nodes.                                                                     |
982 ------------------------------------------------------------------------------
983 
984 START_COPYREQ goes through all distribution information and sends
985 START_FRAGREQ to the owning DBLQH module instance for each fragment replica
986 to be restored on the node. DBLQH will start immediately to restore those
987 fragment replicas, it will queue the fragment replicas and restore one at a
time. This happens in two phases: first, all fragment replicas that require
restore of a local checkpoint start to do that.
990 
991 After all fragment replicas to restore have been sent and we have restored all
fragments from a local checkpoint stored on our disk (or sometimes by getting
993 the entire fragment from an alive node) then it is time to run the disk data
994 UNDO log. Finally after running this UNDO log we're ready to get the fragment
995 replicas restored to latest disk-durable state by applying the REDO log.
996 
997 DBDIH will send all required information for all fragment replicas to DBLQH
998 whereafter it sends START_RECREQ to DBLQH to indicate all fragment replica
information has now been sent.
1000 
1001 START_RECREQ is sent through the DBLQH proxy module and this part is
1002 parallelised such that all LDM instances are performing the below parts in
1003 parallel.
1004 
If we're doing an initial node restart we don't need to restore any local
1006 checkpoints since initial node restart means that we start without a file
1007 system. So this means that we have to restore all data from other nodes in
the node group. In this case we start copying the fragment replicas
immediately in DBLQH when we receive START_FRAGREQ, and we don't need to run
any UNDO or REDO log since there is no local checkpoint from which to restore
the fragment.
1012 
1013 When this is completed and DBDIH has reported that all fragment replicas to
1014 start have been sent by sending START_RECREQ to DBLQH we will send
1015 START_RECREQ to TSMAN whereafter we are done with the restore of the data.
1016 
1017 We will specify all fragment replicas to restore as part of REDO log
1018 execution. This is done through the signal EXEC_FRAGREQ. When all such signals
1019 have been sent we send EXEC_SRREQ to indicate we have prepared for the next
1020 phase of REDO log execution in DBLQH.
1021 
1022 When all such signals are sent we have completed what is termed as phase 2
1023 of DBLQH, the phase 1 in DBLQH is what started in NDB_STTOR phase 3 to prepare
1024 the REDO log for reading it. So when both those phases are complete we're ready
1025 to start what is termed phase 3 in DBLQH.
1026 
1027 These DBLQH phases are not related to the start phases, these are internal
1028 stages of startup in the DBLQH module.
1029 
1030 Phase 3 in DBLQH is the reading of the REDO log and applying it on fragment
1031 replicas restored from the local checkpoint. This is required to create a
1032 database state which is synchronised on a specific global checkpoint. So we
1033 first install a local checkpoint for all fragments, next we apply the REDO
1034 log to synchronise the fragment replica with a certain global checkpoint.
1035 
1036 Before executing the REDO log we need to calculate the start GCI and the last
1037 GCI to apply in the REDO log by checking the limits on all fragment replicas
1038 we will restore to the desired global checkpoint.
1039 
1040 DBDIH has stored information about each local checkpoint of a fragment
1041 replica which global checkpoint ranges that are required to run from the REDO
1042 log in order to bring it to the state of a certain global checkpoint. This
1043 information was sent in the START_FRAGREQ signal. DBLQH will merge all of
1044 those limits per fragment replica to a global range of global checkpoints to
1045 run for this LDM instance. So each fragment replica has its own GCP id range
1046 to execute and this means that the minimum of all those start ranges and
1047 maximum of all the end ranges is the global range of GCP ids that we need
1048 to execute in the REDO log to bring the cluster on-line again.
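
As a small illustration (the variable names are invented for this sketch),
the merge over the fragment replicas handled by one LDM instance amounts to:

  Uint32 startGci = UINT32_MAX;  // lowest GCI any replica needs
  Uint32 stopGci  = 0;           // highest GCI any replica must reach
  for (Uint32 i = 0; i < replicas.size(); i++)
  {
    startGci = MIN(startGci, replicas[i].startGci);
    stopGci  = MAX(stopGci,  replicas[i].lastGci);
  }
  // [startGci, stopGci] is the range of global checkpoints this LDM
  // instance executes from its REDO log parts.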
1049 
1050 The next step is to calculate the start and stop megabyte in the REDO log for
1051 each log part by using the start and stop global checkpoint id. All the
1052 information required to calculate this is already in memory, so it's a pure
1053 calculation.
1054 
1055 When we execute the REDO log we actually only apply the COMMIT records in the
1056 correct global checkpoint range. The COMMIT record and the actual change
1057 records are in different places in the REDO log, so for each Megabyte of
1058 REDO log we record how far back in the REDO log we have to go to find the
1059 change records.
1060 
1061 While running the REDO log we maintain a fairly large cache of the REDO log
to avoid having to do disk reads in those cases where the transaction
1063 ran for a long time.
1064 
1065 This means that long-running and large transactions can have a negative effect
1066 on restart times.
1067 
1068 After all log parts have completed this calculation we're now ready to start
1069 executing the REDO log. After executing the REDO log to completion we also
write information into the REDO log to indicate that anything beyond
the point we used here won't be used at any later time.
1072 
1073 We now need to wait for all other log parts to also complete execution of
1074 their parts of the REDO log. The REDO log execution is designed such that we
1075 can execute the REDO log in more than one phase, this is intended for cases
1076 where we can rebuild a node from more than one live node. Currently this code
1077 should never be used.
1078 
1079 So the next step is to check for the new head and tail of the REDO log parts.
1080 This is done through the same code that uses start and stop global
1081 checkpoints to calculate this number. This phase of the code also prepares
1082 the REDO log parts for writing new REDO log records by ensuring that the
1083 proper REDO log files are open. It also involves some rather tricky code to
1084 ensure that pages that have been made dirty are properly handled.
1085 
1086   COMPLETED RESTORING OFF-LINE CONSISTENT DATABASE
1087 ------------------------------------------------------------------------------
1088 | After completing restoring fragment replicas to a consistent global        |
1089 | checkpoint, we will now start rebuilding the ordered indexes based on the  |
1090 | data restored. After rebuilding the ordered indexes we are ready to send   |
1091 | START_RECCONF to the starting DBDIH. START_RECCONF is sent through the     |
1092 | DBLQH proxy, so it won't be passed onto DBDIH until all DBLQH instances    |
1093 | have completed this phase and responded with START_RECCONF.                |
1094 ------------------------------------------------------------------------------
1095 
1096 At this point in the DBLQH instances we have restored a consistent but old
1097 variant of all data in the node. There are still no ordered indexes and there
1098 is still much work remaining to get the node synchronised with the other nodes
1099 again. For cluster restarts it might be that the node is fully ready to go,
it's however likely that some nodes still require being synchronised with
1101 nodes that have restored a more recent global checkpoint.
1102 
1103 The DBDIH of the starting node will then start the take over process now
1104 that the starting node has consistent fragment replicas. We will prepare the
1105 starting node's DBLQH for the copying phase by sending PREPARE_COPY_FRAG_REQ
1106 for each fragment replica we will copy over. This is a sequential process that
1107 could be parallelised a bit.
1108 
1109 The process to take over a fragment replica is quite involved. It starts by
sending PREPARE_COPY_FRAG_REQ/CONF to the starting DBLQH, then we send
1111 UPDATE_TOREQ/CONF to the master DBDIH to ensure we lock the fragment
1112 information before the take over starts. After receiving confirmation of this
fragment lock, the starting node sends UPDATE_FRAG_STATEREQ/CONF to all nodes to
1114 include the new node into all operations on the fragment.
1115 
1116 After completing this we again send UPDATE_TOREQ/CONF to the master node to
1117 inform of the new status and unlock the lock on the fragment information. Then
1118 we're ready to perform the actual copying of the fragment. This is done by
1119 sending COPY_FRAGREQ/CONF to the node that will copy the data. When this
1120 copying is done we send COPY_ACTIVEREQ/CONF to the starting node to activate
1121 the fragment replica.
1122 
Next we again send UPDATE_TOREQ/CONF to the master, informing it that we're
about to commit the take over of the new fragment replica. Next we
1125 commit the new fragment replica by sending UPDATE_FRAG_STATEREQ/CONF to all
1126 nodes informing them about completion of the copying of the fragment replica.
1127 Finally we send another update to the master node with UPDATE_TOREQ/CONF.
1128 Now we're finally complete with copying of this fragment.
1129 
1130 The idea with this scheme is that the first UPDATE_FRAG_STATEREQ ensures that
1131 we're a part of all transactions on the fragment. After doing the COPY_FRAGREQ
1132 that synchronises the starting node's fragment replica with the alive node's
1133 fragment replica on a row by row basis, we're sure that the two fragment
1134 replicas are entirely synchronised and we can do a new UPDATE_FRAG_STATEREQ to
1135 ensure all nodes know that we're done with the synchronisation.
1136 
1137   COMPLETED RESTORING ON-LINE NOT RECOVERABLE DATABASE
1138 ------------------------------------------------------------------------------
1139 | At this point we have restored an online variant of the database by        |
1140 | bringing one fragment at a time online. The database is still not          |
1141 | recoverable since we haven't enabled logging yet and there is no local     |
1142 | checkpoint of the data in the starting node.                               |
1143 ------------------------------------------------------------------------------
1144 
1145 Next step is to enable logging on all fragments, after completing this step
1146 we will send END_TOREQ to the master DBDIH. At this point we will wait until a
local checkpoint is completed where this node has been involved. Finally when
the local checkpoint has been completed we will send END_TOCONF to the
1149 starting node and then we will send START_COPYCONF and that will complete
1150 this phase of the restart.
1151 
1152   COMPLETED RESTORING ON-LINE RECOVERABLE DATABASE
1153 ------------------------------------------------------------------------------
| At this point we have managed to restore all data and we have brought it   |
| online and now we have also executed a local checkpoint after enabling     |
1156 | logging and so now data in the starting node is also recoverable. So this  |
1157 | means that the database is now fully online again.                         |
1158 ------------------------------------------------------------------------------
1159 
After completing NDB_STTOR phase 5, all nodes that have been synchronised
1161 in a waitpoint here are started again and NDBCNTR continues by running
1162 phase 6 of NDB_STTOR.
1163 
In this phase DBLQH, DBDICT and DBTC set some status variables indicating
that the start has now completed (it's not fully completed yet, but all
services required for those modules to operate are available). DBDIH also
starts the global checkpoint protocol for cluster start/restarts where it has
1168 become the master node.
1169 
1170 Yet one more waiting point for all nodes is now done in the case of a cluster
1171 start/restart.
1172 
1173 The final step in STTOR phase 5 is SUMA that reads the configured nodes,
gets the node group members, and if this is a node restart it asks another
node to recreate subscriptions for it.
1176 
1177 STTOR Phase 6
1178 -------------
1179 We now move onto STTOR phase 6. In this phase NDBCNTR gets the node group of
1180 the node, DBUTIL gets the systable id, prepares a set of operations for later
1181 use and connects to TC to enable it to run key operations on behalf of other
1182 modules later on.
1183 
1184 STTOR Phase 7
1185 -------------
1186 Next we move onto STTOR phase 7. DBDICT now starts the index statistics loop
1187 that will run as long as the node lives.
1188 
1189 QMGR will start arbitration handling to handle a case where we are at risk of
1190 network partitioning.
1191 
1192 BACKUP will update the disk checkpoint speed (there is one config variable
1193 for speed during restarts and one for normal operation, here we install the
normal operation speed). If this is an initial start, BACKUP will also create
a backup sequence through DBUTIL.
1196 
1197 SUMA will create a sequence if it's running in a master node and it's an
1198 initial start. SUMA will also always calculate which buckets it is
responsible for handling. Finally DBTUX will start monitoring of ordered
indexes.
1200 
1201 STTOR Phase 8
1202 -------------
1203 We then move onto STTOR phase 8. First thing here is to run phase 7 of
1204 NDB_STTOR in which DBDICT enables foreign keys. Next NDBCNTR will also wait
1205 for all nodes to come here if we're doing a cluster start/restart.
1206 
1207 Next CMVMI will set state to STARTED and QMGR will enable communication to
1208 all API nodes.
1209 
1210 STTOR Phase 101
1211 ---------------
1212 After this phase the only remaining phase is STTOR phase 101 in which SUMA
1213 takes over responsibility of the buckets it is responsible for in the
1214 asynchronous replication handling.
1215 
1216 Major potential consumers of time so far:
1217 
1218 All steps in the memory allocation (all steps of the READ_CONFIG_REQ).
1219 CMVMI STTOR phase 1 that could lock memory. QMGR phase 1 that runs the
1220 node inclusion protocol.
1221 
1222 NDBCNTR STTOR phase 2 that waits for CNTR_START_REQ, DBLQH REDO log
1223 initialisation for initial start types that happens in STTOR phase 2.
1224 Given that only one node can be in this phase at a time, this can be
1225 stalled by a local checkpoint wait of another node starting. So this
1226 wait can be fairly long.
1227 
1228 DBLQH sets up connections to DBACC and DBTUP, this is NDB_STTOR phase 2.
1229 DBDIH in NDB_STTOR phase 2 also can wait for the meta data to be locked
1230 and it can wait for response to START_PERMREQ.
1231 
1232 For initial starts there is the wait for DBLQH to complete NDB_STTOR phase 3
1233 where it initialises the set-up of the REDO logs. NDBCNTR for cluster
1234 starts/restarts in STTOR phase 4, after completing NDB_STTOR phase 3, has to
1235 wait for all nodes to reach this point and then it has to wait for
1236 NDB_STARTREQ to complete.
1237 
1238 For node restarts we have delays in waiting for responses to the START_MEREQ
1239 and START_COPYREQ signals; this is actually where most of the real work of
1240 the restart is done. SUMA STTOR phase 5, where subscriptions are recreated,
1241 is another potential time consumer.
1242 
1243 All waitpoints are obvious potential consumers of time. Those are mainly
1244 located in NDBCNTR (waitpoints 5.2, 5.1 and 6).
1245 
1246 Historical anecdotes:
1247 1) The NDB kernel run-time environment was originally designed for an
1248 AXE virtual machine. In AXE the starts were using the module MISSRA to
1249 drive the STTOR/STTORRY signals for the various startup phases.
1250 The MISSRA was later merged into NDBCNTR and is a submodule of NDBCNTR
1251 nowadays. The names STTOR and STTORRY have some basis in the AXE system's
1252 way of naming signals in the early days, but the exact meaning has been
1253 forgotten. At least the ST had something to do with Start/Restart.
1254 
1255 2) The reason for introducing NDB_STTOR was that we envisioned a system
1256 where the NDB kernel was just one subsystem within the run-time environment.
1257 We therefore introduced separate start phases for the NDB subsystem.
1258 Over time the need for such subsystem startup phases has disappeared,
1259 but the software is already engineered for this and thus it has been kept in
1260 this manner.
1261 
1262 3) Also the responsibility for the distributed parts of the database start
1263 is divided. QMGR is responsible for discovering when nodes are up and down.
1264 NDBCNTR maintains the protocols for failure handling and other changes of the
1265 node configuration. Finally DBDIH is responsible for the distributed start of
1266 the database parts. It interacts a lot with DBLQH, which has the local
1267 responsibility of starting one node's database part as directed by DBDIH.
1268 
1269 Local checkpoint processing in MySQL Cluster
1270 --------------------------------------------
1271 
1272 This comment attempts to describe the processing of checkpoints as it happens
1273 in MySQL Cluster. It also clarifies where potential bottlenecks are. This
1274 comment is mainly intended as internal documentation of the open source code
1275 of MySQL Cluster.
1276 
1277 The reason for local checkpoints in MySQL Cluster is to ensure that we have a
1278 copy of the data on disk against which the REDO log can be run to restore
1279 the data in MySQL Cluster after a crash.
1280 
1281 We start by introducing the different restart variants in MySQL Cluster. The
1282 first variant is a normal node restart; this means that the node has been
1283 missing for a short time, but is now back online again. We start by installing
1284 a checkpointed version of all tables (including executing the proper parts of
1285 the REDO log against it). The next step is to use the replicas which are still
1286 online to make the checkpointed version up to date. Replicas are always
1287 organised in node groups; the most common size of a node group is two nodes.
1288 So when a node starts up, it uses the other node in the same node group to
1289 bring an online version of the tables back. In a normal node restart we have
1290 first restored a somewhat old version of all tables before using the other
1291 node to synchronize them. This means that we only need to ship the latest
1292 version of the rows that have been updated since the node failed before the
1293 node restart. We also have the case of initial node restarts where all data
1294 has to be restored from the other node since the checkpoint in the starting
1295 node is either too old to be reused or not there at all when a completely
1296 new node is started up.
1297 
1298 The third variant of restart is a so-called system restart; this means that
1299 the entire cluster is starting up after a cluster crash or after a controlled
1300 stop of the cluster. In this restart type we first restore a checkpoint on all
1301 nodes before running the REDO log to get the system in a consistent and
1302 up-to-date state. If any node was restored to an older global checkpoint than
1303 the one to restart from, then it is necessary to use the same code used in
1304 node restarts to bring those nodes to an online state.
1305 
1306 The system restart will restore a so-called global checkpoint. A set of
1307 transactions are grouped together into a global checkpoint; when this global
1308 checkpoint has been completed the transactions belonging to it are safe and
1309 will survive a cluster crash. Global checkpoints are run on the order of
1310 seconds, whereas a local checkpoint writes the entire data set to disk and
1311 is a longer process taking at least minutes.
1312 
1313 Before a starting node can be declared as fully restored it has to participate
1314 in a local checkpoint. The crashing node misses a set of REDO log records
1315 needed to restore the cluster, thus the node isn't fully restored until it can
1316 be used to restore all data it owns in a system restart.
1317 
1318 So when performing a rolling node restart where all nodes in the cluster are
1319 restarted (e.g. to upgrade the software in MySQL Cluster), it makes sense to
1320 restart a set of nodes at a time since we can only have one set of nodes
1321 restarted at a time.
1322 
1323 This was a bit of a prerequisite for understanding the need for local checkpoints.
1324 We now move to the description of how a local checkpoint is processed.
1325 
1326 The local checkpoint is a distributed process. It is controlled by a
1327 software module called DBDIH (or DIH for short, DIstribution Handler).
1328 DIH contains all the information about where various replicas of each fragment
1329 (a synonym for partition) are placed, and various data on these replicas.
1330 DIH stores distribution information in one file per table. This file is
1331 actually two files; this is to ensure that we can do careful writing of the
1332 file. We first write file 0 and, when this is completed, we write file 1;
1333 in this manner we can easily handle any crashes while writing the table
1334 description.
1335 
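As an illustration only (a hedged sketch using plain POSIX-style I/O, not the
actual DBDIH/NDBFS code), the alternating-file careful write can be expressed
roughly as follows; the point is that file 1 is only touched once file 0 is
complete, so at least one intact copy always exists:

  // Sketch under simple assumptions: the real code goes through NDBFS and
  // its asynchronous file operations rather than stdio.
  #include <cstdio>
  #include <unistd.h>   // fsync

  bool carefulWriteTableDesc(const char *file0, const char *file1,
                             const void *desc, size_t len)
  {
    const char *files[2] = { file0, file1 };
    for (int i = 0; i < 2; i++)
    {
      FILE *f = fopen(files[i], "wb");
      if (f == nullptr)
        return false;
      // Write and sync the first file completely before starting the second.
      bool ok = (fwrite(desc, 1, len, f) == len) &&
                (fflush(f) == 0) && (fsync(fileno(f)) == 0);
      fclose(f);
      if (!ok)
        return false;
    }
    return true;
  }
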
1336 When a local checkpoint has been completed, DIH immediately starts the
1337 process to start the next checkpoint. At least one global checkpoint has
1338 to be completed since the start of the previous local checkpoint before we
1339 will start a new local checkpoint.
1340 
1341 The first step in the next local checkpoint is to check if we're ready to
1342 run it yet. This is performed by sending the message TCGETOPSIZEREQ to all
1343 TC's in the cluster. This will report back the amount of REDO log information
1344 generated by checking the information received in TC for all write
1345 transactions. The message will be sent by the master DIH. The role of the
1346 master is assigned to the oldest surviving data node, which makes it easy to
1347 select a new master whenever the data node currently acting as master dies.
1348 All nodes agree on the order of nodes entering the cluster, so the age of
1349 a node is consistent in all nodes in the cluster.
1350 
1351 When all messages have returned the REDO log write size to the master
1352 DIH, we will compare it to the config variable TimeBetweenLocalCheckpoints
1353 (this variable is set as a base-2 logarithm of the size, so e.g. 25 means we
1354 wait until 2^25 words of REDO log have been created in the cluster, which is
1355 128 MByte of REDO log info).
1356 
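As a sketch of the arithmetic only (TimeBetweenLocalCheckpoints is the real
config variable, the helper function is hypothetical):

  // The config value is a base-2 logarithm of a word count, and a REDO log
  // word is 4 bytes, so e.g. 25 -> 2^25 words -> 128 MByte.
  Uint64 lcpTriggerBytes(Uint32 timeBetweenLocalCheckpoints)
  {
    const Uint64 words = Uint64(1) << timeBetweenLocalCheckpoints;
    return words * 4;
  }
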
1357 When a sufficient amount of REDO log has been generated, we start the next
1358 local checkpoint. The first step is to clear all TC counters, which is done
1359 by sending TC_CLOPSIZEREQ to all TC's in the cluster.
1360 
1361 The next step is to calculate the keep GCI (this is the oldest global
1362 checkpoint id that needs to be retained in the REDO log). This number is very
1363 important since it's the point where we can move the tail of the REDO log
1364 forward. If we run out of REDO log space we will not be able to run any
1365 writing transactions until we have started the next local checkpoint and
1366 thereby moved the REDO log tail forward.
1367 
1368 We calculate this number by checking, for each fragment, what GCI it needs
1369 in order to be restored. We currently keep two old local checkpoints still
1370 valid, so we won't move the GCI past the point that would invalidate those
1371 two local checkpoints of a fragment. The GCI that will be restorable after
1372 completing this calculation is the minimum GCI found over all fragments.
1373 
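Expressed as a sketch (the per-fragment array is a hypothetical stand-in; the
real code loops over DIH's fragment and replica records):

  // The keep GCI is simply the minimum GCI that any fragment needs in order
  // to stay restorable from one of its retained local checkpoints.
  Uint32 calcKeepGci(const Uint32 *fragmentRestoreGci, Uint32 fragmentCount)
  {
    Uint32 keepGci = ~Uint32(0);
    for (Uint32 i = 0; i < fragmentCount; i++)
    {
      if (fragmentRestoreGci[i] < keepGci)
        keepGci = fragmentRestoreGci[i];
    }
    return keepGci;
  }
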
1374 Next we write this number and the new local checkpoint id and some other
1375 information in the Sysfile of all nodes in the cluster. This Sysfile is the
1376 first thing we look at when starting a restore of the cluster in a system
1377 restart, so it's important to have this type of information correct in this
1378 file.
1379 
1380 When this is done we will calculate which nodes will participate in the
1381 local checkpoint (nodes currently performing the early parts of a restart are
1382 not part of the local checkpoint, and obviously neither are dead nodes).
1383 
1384 We send the information about the starting local checkpoint to all other DIH's
1385 in the system. We must keep all other DIH's up-to-date all the time to ensure
1386 it is easy to continue the local checkpoint even when the master DIH crashes
1387 or is stopped in the middle of the local checkpoint process. Each DIH records
1388 the set of nodes participating in the local checkpoint. They also set a flag
1389 on each replica record indicating that a local checkpoint is ongoing, and on
1390 each fragment record we also set the number of replicas that are part of this
1391 local checkpoint.
1392 
1393 Now that we have completed the preparations for the local checkpoint, it is
1394 time to start the actual checkpoint writing of the data. The
1395 master DIH controls this process by sending off a LCP_FRAG_ORD for each
1396 fragment replica that should be checkpointed. DIH can currently have 2 such
1397 LCP_FRAG_ORD outstanding per node and 2 fragment replicas queued. Each LDM
1398 thread can process writing of one fragment replica at a time and it can
1399 have one request for the next fragment replica queued. It's fairly
1400 straightforward to extend this number such that more fragment replicas can
1401 be written in parallel and more can be queued.
1402 
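As a sketch of this flow control (the struct and helpers are hypothetical; the
limits of 2 outstanding and 2 queued per node are the ones described above):

  // Per-node bookkeeping for LCP_FRAG_ORD flow control.
  struct LcpFragOrdState
  {
    Uint32 outstanding;  // LCP_FRAG_ORD sent, LCP_FRAG_REP not yet received
    Uint32 queued;       // fragment replicas queued behind the outstanding ones
  };

  bool canSendLcpFragOrd(const LcpFragOrdState &s)
  {
    return s.outstanding < 2;   // at most 2 outstanding per node
  }

  bool canQueueLcpFragOrd(const LcpFragOrdState &s)
  {
    return s.queued < 2;        // at most 2 more queued per node
  }
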
1403 LCP_FRAG_REP is sent to all DIH's when the local checkpoint for a fragment
1404 replica is completed. When a DIH discovers that all fragment replicas of a
1405 table have completed the local checkpoint, then it's time to write the table
1406 description to the file system. This will record the interesting local
1407 checkpoint information for all of the fragment replicas. There are two things
1408 that can cause this to wait. First, writing and reading of the entire table
1409 description is something that can only happen one at a time; this mainly
1410 becomes an issue when there is some node failure handling ongoing while the
1411 local checkpoint is being processed.
1412 
1413 The second thing that can block the writing of a table description is that
1414 currently a maximum of 4 table descriptions can be written in parallel. This
1415 could easily become a bottleneck since each write of a file can take on the
1416 order of fifty milliseconds. So this means we can currently only write about
1417 80 such table descriptions per second. In a system with many tables and little
1418 data this could become a bottleneck, although not a difficult one to address.
1419 
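(Worked out: with at most 4 table descriptions written in parallel and roughly
fifty milliseconds per file write, the ceiling is 4 / 0.05 s = 80 table
description writes per second, which is where the figure above comes from.)
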
1420 When the master DIH has sent all requests to checkpoint all fragment replicas
1421 it will send a special LCP_FRAG_ORD to all nodes indicating that no more
1422 fragment replicas will be sent out.
1423 */
1424 
1425 void
1426 Ndbcntr::execREAD_CONFIG_REQ(Signal* signal)
1427 {
1428   jamEntry();
1429 
1430   const ReadConfigReq * req = (ReadConfigReq*)signal->getDataPtr();
1431 
1432   Uint32 ref = req->senderRef;
1433   Uint32 senderData = req->senderData;
1434 
1435   const ndb_mgm_configuration_iterator * p =
1436     m_ctx.m_config.getOwnConfigIterator();
1437   ndbrequire(p != 0);
1438 
1439   Uint32 dl = 0;
1440   ndb_mgm_get_int_parameter(p, CFG_DB_DISCLESS, &dl);
1441   if (dl == 0)
1442   {
1443     const char * lgspec = 0;
1444     char buf[1024];
1445     if (!ndb_mgm_get_string_parameter(p, CFG_DB_DD_LOGFILEGROUP_SPEC, &lgspec))
1446     {
1447       jam();
1448 
1449       if (parse_spec(f_dd, lgspec, DictTabInfo::LogfileGroup))
1450       {
1451         BaseString::snprintf(buf, sizeof(buf),
1452                              "Unable to parse InitialLogfileGroup: %s", lgspec);
1453         progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
1454       }
1455     }
1456 
1457     const char * tsspec = 0;
1458     if (!ndb_mgm_get_string_parameter(p, CFG_DB_DD_TABLEPACE_SPEC, &tsspec))
1459     {
1460       if (f_dd.size() == 0)
1461       {
1462         warningEvent("InitialTablespace specified, "
1463                      "but InitialLogfileGroup is not!");
1464         warningEvent("Ignoring InitialTablespace: %s",
1465                      tsspec);
1466       }
1467       else
1468       {
1469         if (parse_spec(f_dd, tsspec, DictTabInfo::Tablespace))
1470         {
1471           BaseString::snprintf(buf, sizeof(buf),
1472                                "Unable to parse InitialTablespace: %s", tsspec);
1473           progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
1474         }
1475       }
1476     }
1477   }
1478 
1479   struct ddentry empty;
1480   empty.type = ~0;
1481   f_dd.push_back(empty);
1482 
1483   if (true)
1484   {
1485     // TODO: add config parameter
1486     // remove ATTRIBUTE_MASK2
1487     g_sysTable_NDBEVENTS_0.columnCount--;
1488   }
1489 
1490   ReadConfigConf * conf = (ReadConfigConf*)signal->getDataPtrSend();
1491   conf->senderRef = reference();
1492   conf->senderData = senderData;
1493   sendSignal(ref, GSN_READ_CONFIG_CONF, signal,
1494 	     ReadConfigConf::SignalLength, JBB);
1495 }
1496 
1497 void Ndbcntr::execSTTOR(Signal* signal)
1498 {
1499   jamEntry();
1500   cstartPhase = signal->theData[1];
1501 
1502   cndbBlocksCount = 0;
1503   cinternalStartphase = cstartPhase - 1;
1504 
1505   switch (cstartPhase) {
1506   case 0:
1507     if (m_ctx.m_config.getInitialStart())
1508     {
1509       jam();
1510       g_eventLogger->info("Clearing filesystem in initial start");
1511       c_fsRemoveCount = 0;
1512       clearFilesystem(signal);
1513       return;
1514     }
1515     sendSttorry(signal);
1516     break;
1517   case ZSTART_PHASE_1:
1518     jam();
1519     startPhase1Lab(signal);
1520     break;
1521   case ZSTART_PHASE_2:
1522     jam();
1523     startPhase2Lab(signal);
1524     break;
1525   case ZSTART_PHASE_3:
1526     jam();
1527     startPhase3Lab(signal);
1528     break;
1529   case ZSTART_PHASE_4:
1530     jam();
1531     startPhase4Lab(signal);
1532     break;
1533   case ZSTART_PHASE_5:
1534     jam();
1535     startPhase5Lab(signal);
1536     break;
1537   case 6:
1538     jam();
1539     getNodeGroup(signal);
1540     sendSttorry(signal);
1541     break;
1542   case ZSTART_PHASE_8:
1543     jam();
1544     startPhase8Lab(signal);
1545     break;
1546   case ZSTART_PHASE_9:
1547     jam();
1548     startPhase9Lab(signal);
1549     break;
1550   default:
1551     jam();
1552     sendSttorry(signal);
1553     break;
1554   }//switch
1555 }//Ndbcntr::execSTTOR()
1556 
1557 void
1558 Ndbcntr::getNodeGroup(Signal* signal){
1559   jam();
1560   CheckNodeGroups * sd = (CheckNodeGroups*)signal->getDataPtrSend();
1561   sd->requestType = CheckNodeGroups::Direct | CheckNodeGroups::GetNodeGroup;
1562   EXECUTE_DIRECT(DBDIH, GSN_CHECKNODEGROUPSREQ, signal,
1563 		 CheckNodeGroups::SignalLength);
1564   jamEntry();
1565   c_nodeGroup = sd->output;
1566 }
1567 
1568 /*******************************/
1569 /*  NDB_STTORRY                */
1570 /*******************************/
1571 void Ndbcntr::execNDB_STTORRY(Signal* signal)
1572 {
1573   jamEntry();
1574   switch (cstartPhase) {
1575   case ZSTART_PHASE_2:
1576     jam();
1577     ph2GLab(signal);
1578     return;
1579     break;
1580   case ZSTART_PHASE_3:
1581     jam();
1582     ph3ALab(signal);
1583     return;
1584     break;
1585   case ZSTART_PHASE_4:
1586     jam();
1587     ph4BLab(signal);
1588     return;
1589     break;
1590   case ZSTART_PHASE_5:
1591     jam();
1592     ph5ALab(signal);
1593     return;
1594     break;
1595   case ZSTART_PHASE_6:
1596     jam();
1597     ph6ALab(signal);
1598     return;
1599     break;
1600   case ZSTART_PHASE_7:
1601     jam();
1602     ph6BLab(signal);
1603     return;
1604     break;
1605   case ZSTART_PHASE_8:
1606     jam();
1607     ph7ALab(signal);
1608     return;
1609     break;
1610   case ZSTART_PHASE_9:
1611     jam();
1612     g_eventLogger->info("NDB start phase 8 completed");
1613     ph8ALab(signal);
1614     return;
1615     break;
1616   default:
1617     jam();
1618     systemErrorLab(signal, __LINE__);
1619     return;
1620     break;
1621   }//switch
1622 }//Ndbcntr::execNDB_STTORRY()
1623 
1624 void Ndbcntr::startPhase1Lab(Signal* signal)
1625 {
1626   jamEntry();
1627 
1628   initData(signal);
1629 
1630   cdynamicNodeId = 0;
1631 
1632   NdbBlocksRecPtr ndbBlocksPtr;
1633   ndbBlocksPtr.i = 0;
1634   ptrAss(ndbBlocksPtr, ndbBlocksRec);
1635   ndbBlocksPtr.p->blockref = DBLQH_REF;
1636   ndbBlocksPtr.i = 1;
1637   ptrAss(ndbBlocksPtr, ndbBlocksRec);
1638   ndbBlocksPtr.p->blockref = DBDICT_REF;
1639   ndbBlocksPtr.i = 2;
1640   ptrAss(ndbBlocksPtr, ndbBlocksRec);
1641   ndbBlocksPtr.p->blockref = DBTUP_REF;
1642   ndbBlocksPtr.i = 3;
1643   ptrAss(ndbBlocksPtr, ndbBlocksRec);
1644   ndbBlocksPtr.p->blockref = DBACC_REF;
1645   ndbBlocksPtr.i = 4;
1646   ptrAss(ndbBlocksPtr, ndbBlocksRec);
1647   ndbBlocksPtr.p->blockref = DBTC_REF;
1648   ndbBlocksPtr.i = 5;
1649   ptrAss(ndbBlocksPtr, ndbBlocksRec);
1650   ndbBlocksPtr.p->blockref = DBDIH_REF;
1651   sendSttorry(signal);
1652   return;
1653 }
1654 
1655 void Ndbcntr::execREAD_NODESREF(Signal* signal)
1656 {
1657   jamEntry();
1658   systemErrorLab(signal, __LINE__);
1659   return;
1660 }//Ndbcntr::execREAD_NODESREF()
1661 
1662 
1663 /*******************************/
1664 /*  NDB_STARTREF               */
1665 /*******************************/
1666 void Ndbcntr::execNDB_STARTREF(Signal* signal)
1667 {
1668   jamEntry();
1669   systemErrorLab(signal, __LINE__);
1670   return;
1671 }//Ndbcntr::execNDB_STARTREF()
1672 
1673 /*******************************/
1674 /*  STTOR                      */
1675 /*******************************/
1676 void Ndbcntr::startPhase2Lab(Signal* signal)
1677 {
1678   c_start.m_lastGci = 0;
1679   c_start.m_lastGciNodeId = getOwnNodeId();
1680 
1681   DihRestartReq * req = CAST_PTR(DihRestartReq, signal->getDataPtrSend());
1682   req->senderRef = reference();
1683   if (ERROR_INSERTED(1021))
1684   {
1685     CLEAR_ERROR_INSERT_VALUE;
1686     sendSignalWithDelay(DBDIH_REF, GSN_DIH_RESTARTREQ, signal,
1687                         30000, DihRestartReq::SignalLength);
1688   }
1689   else
1690   {
1691     sendSignal(DBDIH_REF, GSN_DIH_RESTARTREQ, signal,
1692                DihRestartReq::SignalLength, JBB);
1693   }
1694   return;
1695 }//Ndbcntr::startPhase2Lab()
1696 
1697 /*******************************/
1698 /*  DIH_RESTARTCONF            */
1699 /*******************************/
1700 void Ndbcntr::execDIH_RESTARTCONF(Signal* signal)
1701 {
1702   jamEntry();
1703 
1704   const DihRestartConf * conf = CAST_CONSTPTR(DihRestartConf,
1705                                               signal->getDataPtrSend());
1706   c_start.m_lastGci = conf->latest_gci;
1707   ctypeOfStart = NodeState::ST_SYSTEM_RESTART;
1708   cdihStartType = ctypeOfStart;
1709   ph2ALab(signal);
1710   return;
1711 }//Ndbcntr::execDIH_RESTARTCONF()
1712 
1713 /*******************************/
1714 /*  DIH_RESTARTREF             */
1715 /*******************************/
1716 void Ndbcntr::execDIH_RESTARTREF(Signal* signal)
1717 {
1718   jamEntry();
1719   ctypeOfStart = NodeState::ST_INITIAL_START;
1720   cdihStartType = ctypeOfStart;
1721   ph2ALab(signal);
1722   return;
1723 }//Ndbcntr::execDIH_RESTARTREF()
1724 
1725 void Ndbcntr::ph2ALab(Signal* signal)
1726 {
1727   /******************************/
1728   /* request configured nodes   */
1729   /* from QMGR                  */
1730   /*  READ_NODESREQ             */
1731   /******************************/
1732   signal->theData[0] = reference();
1733   sendSignal(QMGR_REF, GSN_READ_NODESREQ, signal, 1, JBB);
1734   return;
1735 }//Ndbcntr::ph2ALab()
1736 
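// A configured timeout of 0 means "no timeout"; map it to the largest Uint64
// so that later comparisons against elapsed time never expire.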
1737 inline
1738 Uint64
1739 setTimeout(Uint32 timeoutValue){
1740   return (timeoutValue != 0) ? timeoutValue : ~(Uint64)0;
1741 }
1742 
1743 /*******************************/
1744 /*  READ_NODESCONF             */
1745 /*******************************/
1746 void Ndbcntr::execREAD_NODESCONF(Signal* signal)
1747 {
1748   jamEntry();
1749   const ReadNodesConf * readNodes = (ReadNodesConf *)&signal->theData[0];
1750 
1751   cmasterNodeId = readNodes->masterNodeId;
1752   cdynamicNodeId = readNodes->ndynamicId;
1753 
1754   /**
1755    * All defined nodes...
1756    */
1757   c_allDefinedNodes.assign(NdbNodeBitmask::Size, readNodes->allNodes);
1758   c_clusterNodes.assign(NdbNodeBitmask::Size, readNodes->clusterNodes);
1759 
1760   Uint32 to_1 = 30000;
1761   Uint32 to_2 = 0;
1762   Uint32 to_3 = 0;
1763 
1764   const ndb_mgm_configuration_iterator * p =
1765     m_ctx.m_config.getOwnConfigIterator();
1766 
1767   ndbrequire(p != 0);
1768   ndb_mgm_get_int_parameter(p, CFG_DB_START_PARTIAL_TIMEOUT, &to_1);
1769   ndb_mgm_get_int_parameter(p, CFG_DB_START_PARTITION_TIMEOUT, &to_2);
1770   ndb_mgm_get_int_parameter(p, CFG_DB_START_FAILURE_TIMEOUT, &to_3);
1771 
1772   c_start.m_startTime = NdbTick_getCurrentTicks();
1773   c_start.m_startPartialTimeout = setTimeout(to_1);
1774   c_start.m_startPartitionedTimeout = setTimeout(to_2);
1775   c_start.m_startFailureTimeout = setTimeout(to_3);
1776 
1777   sendCntrStartReq(signal);
1778 
1779   signal->theData[0] = ZSTARTUP;
1780   sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 1000, 1);
1781 
1782   return;
1783 }
1784 
1785 void
1786 Ndbcntr::execCM_ADD_REP(Signal* signal)
1787 {
1788   jamEntry();
1789   c_clusterNodes.set(signal->theData[0]);
1790 }
1791 
1792 void
1793 Ndbcntr::sendCntrStartReq(Signal * signal)
1794 {
1795   jamEntry();
1796 
1797   if (getOwnNodeId() == cmasterNodeId)
1798   {
1799     g_eventLogger->info("Asking master node to accept our start "
1800                         "(we are master, GCI = %u)",
1801                         c_start.m_lastGci);
1802   }
1803   else
1804   {
1805     g_eventLogger->info("Asking master node to accept our start "
1806                         "(nodeId = %u is master), GCI = %u",
1807                         cmasterNodeId,
1808                         c_start.m_lastGci);
1809   }
1810 
1811   CntrStartReq * req = (CntrStartReq*)signal->getDataPtrSend();
1812   req->startType = ctypeOfStart;
1813   req->lastGci = c_start.m_lastGci;
1814   req->nodeId = getOwnNodeId();
1815   sendSignal(calcNdbCntrBlockRef(cmasterNodeId), GSN_CNTR_START_REQ,
1816 	     signal, CntrStartReq::SignalLength, JBB);
1817 }
1818 
1819 void
1820 Ndbcntr::execCNTR_START_REF(Signal * signal){
1821   jamEntry();
1822   const CntrStartRef * ref = (CntrStartRef*)signal->getDataPtr();
1823 
1824   switch(ref->errorCode){
1825   case CntrStartRef::NotMaster:
1826     jam();
1827     cmasterNodeId = ref->masterNodeId;
1828     sendCntrStartReq(signal);
1829     return;
1830   case CntrStartRef::StopInProgress:
1831     jam();
1832     progError(__LINE__, NDBD_EXIT_RESTART_DURING_SHUTDOWN);
1833   }
1834   ndbrequire(false);
1835 }
1836 
1837 void
1838 Ndbcntr::StartRecord::reset(){
1839   m_starting.clear();
1840   m_waiting.clear();
1841   m_withLog.clear();
1842   m_withoutLog.clear();
1843   m_waitTO.clear();
1844   m_lastGci = m_lastGciNodeId = 0;
1845   m_startPartialTimeout = ~0;
1846   m_startPartitionedTimeout = ~0;
1847   m_startFailureTimeout = ~0;
1848 
1849   m_logNodesCount = 0;
1850   bzero(m_wait_sp, sizeof(m_wait_sp));
1851 }
1852 
1853 void
1854 Ndbcntr::execCNTR_START_CONF(Signal * signal){
1855   jamEntry();
1856   const CntrStartConf * conf = (CntrStartConf*)signal->getDataPtr();
1857 
1858   cnoStartNodes = conf->noStartNodes;
1859   ctypeOfStart = (NodeState::StartType)conf->startType;
1860   cdihStartType = ctypeOfStart;
1861   c_start.m_lastGci = conf->startGci;
1862   cmasterNodeId = conf->masterNodeId;
1863   NdbNodeBitmask tmp;
1864   tmp.assign(NdbNodeBitmask::Size, conf->startedNodes);
1865   c_startedNodes.bitOR(tmp);
1866   c_start.m_starting.assign(NdbNodeBitmask::Size, conf->startingNodes);
1867   m_cntr_start_conf = true;
1868   g_eventLogger->info("NDBCNTR master accepted us into cluster,"
1869                       " start NDB start phase 1");
1870   switch (ctypeOfStart)
1871   {
1872     case NodeState::ST_INITIAL_START:
1873     {
1874       g_eventLogger->info("We are performing initial start of cluster");
1875       break;
1876     }
1877     case NodeState::ST_INITIAL_NODE_RESTART:
1878     {
1879       g_eventLogger->info("We are performing initial node restart");
1880       break;
1881     }
1882     case NodeState::ST_NODE_RESTART:
1883     {
1884       g_eventLogger->info("We are performing a node restart");
1885       break;
1886     }
1887     case NodeState::ST_SYSTEM_RESTART:
1888     {
1889       g_eventLogger->info("We are performing a restart of the cluster");
1890       break;
1891     }
1892     default:
1893     {
1894       ndbrequire(false);
1895       break;
1896     }
1897   }
1898   ph2GLab(signal);
1899 }
1900 
1901 /**
1902  * Tried with parallell nr, but it crashed in DIH
1903  * so I turned it off, as I don't want to debug DIH now...
1904  * Jonas 19/11-03
1905  *
1906  * After trying for 2 hours, I gave up.
1907  * DIH is not designed to support it, and
1908  * it requires quite a lot of changes to
1909  * make it work
1910  * Jonas 5/12-03
1911  */
1912 #define PARALLELL_NR 0
1913 
1914 #if PARALLELL_NR
1915 const bool parallellNR = true;
1916 #else
1917 const bool parallellNR = false;
1918 #endif
1919 
1920 void
1921 Ndbcntr::execCNTR_START_REP(Signal* signal){
1922   jamEntry();
1923   Uint32 nodeId = signal->theData[0];
1924 
1925   c_startedNodes.set(nodeId);
1926   c_start.m_starting.clear(nodeId);
1927 
1928   /**
1929    * Inform all interested blocks that node has started
1930    */
1931   for(Uint32 i = 0; i<ALL_BLOCKS_SZ; i++){
1932     sendSignal(ALL_BLOCKS[i].Ref, GSN_NODE_START_REP, signal, 1, JBB);
1933   }
1934 
1935   signal->theData[0] = nodeId;
1936   execSTART_PERMREP(signal);
1937 }
1938 
1939 void
1940 Ndbcntr::execSTART_PERMREP(Signal* signal)
1941 {
1942   Uint32 nodeId = signal->theData[0];
1943   c_startedNodes.set(nodeId);
1944   c_start.m_starting.clear(nodeId);
1945 
1946   if(!c_start.m_starting.isclear()){
1947     jam();
1948     return;
1949   }
1950 
1951   if(cmasterNodeId != getOwnNodeId()){
1952     jam();
1953     c_start.reset();
1954     return;
1955   }
1956 
1957   if(c_start.m_waiting.isclear()){
1958     jam();
1959     c_start.reset();
1960     return;
1961   }
1962 
1963   startWaitingNodes(signal);
1964 }
1965 
1966 void
1967 Ndbcntr::execCNTR_START_REQ(Signal * signal){
1968   jamEntry();
1969   const CntrStartReq * req = (CntrStartReq*)signal->getDataPtr();
1970 
1971   const Uint32 nodeId = req->nodeId;
1972   const Uint32 lastGci = req->lastGci;
1973   const NodeState::StartType st = (NodeState::StartType)req->startType;
1974 
1975   if(cmasterNodeId == 0){
1976     jam();
1977     // Has not completed READNODES yet
1978     sendSignalWithDelay(reference(), GSN_CNTR_START_REQ, signal, 100,
1979 			signal->getLength());
1980     return;
1981   }
1982 
1983   if(cmasterNodeId != getOwnNodeId()){
1984     jam();
1985     sendCntrStartRef(signal, nodeId, CntrStartRef::NotMaster);
1986     return;
1987   }
1988 
1989   const NodeState & nodeState = getNodeState();
1990   switch(nodeState.startLevel){
1991   case NodeState::SL_NOTHING:
1992   case NodeState::SL_CMVMI:
1993     jam();
1994     ndbrequire(false);
1995   case NodeState::SL_STARTING:
1996   case NodeState::SL_STARTED:
1997     jam();
1998     break;
1999 
2000   case NodeState::SL_STOPPING_1:
2001   case NodeState::SL_STOPPING_2:
2002   case NodeState::SL_STOPPING_3:
2003   case NodeState::SL_STOPPING_4:
2004     jam();
2005     sendCntrStartRef(signal, nodeId, CntrStartRef::StopInProgress);
2006     return;
2007   }
2008 
2009   /**
2010    * Am I starting (or started)
2011    */
2012   const bool starting = (nodeState.startLevel != NodeState::SL_STARTED);
2013 
2014   c_start.m_waiting.set(nodeId);
2015   switch(st){
2016   case NodeState::ST_INITIAL_START:
2017     jam();
2018     c_start.m_withoutLog.set(nodeId);
2019     break;
2020   case NodeState::ST_SYSTEM_RESTART:
2021     jam();
2022     c_start.m_withLog.set(nodeId);
2023     if(starting && lastGci > c_start.m_lastGci){
2024       jam();
2025       CntrStartRef * ref = (CntrStartRef*)signal->getDataPtrSend();
2026       ref->errorCode = CntrStartRef::NotMaster;
2027       ref->masterNodeId = nodeId;
2028       NodeReceiverGroup rg (NDBCNTR, c_start.m_waiting);
2029       sendSignal(rg, GSN_CNTR_START_REF, signal,
2030 		 CntrStartRef::SignalLength, JBB);
2031       return;
2032     }
2033     if(starting){
2034       jam();
2035       Uint32 i = c_start.m_logNodesCount++;
2036       c_start.m_logNodes[i].m_nodeId = nodeId;
2037       c_start.m_logNodes[i].m_lastGci = req->lastGci;
2038     }
2039     break;
2040   case NodeState::ST_NODE_RESTART:
2041   case NodeState::ST_INITIAL_NODE_RESTART:
2042   case NodeState::ST_ILLEGAL_TYPE:
2043     ndbrequire(false);
2044   }
2045 
2046   const bool startInProgress = !c_start.m_starting.isclear();
2047 
2048   if ((starting && startInProgress) || (startInProgress && !parallellNR))
2049   {
2050     jam();
2051     /**
2052      * We're already starting together with a bunch of nodes
2053      * Let this node wait...
2054      *
2055      * We will report the wait to DBDIH to keep track of waiting times in
2056      * the restart. We only report when a node restart is ongoing (that is
2057      * we are not starting ourselves).
2058      */
2059     if (!starting)
2060     {
2061       NdbcntrStartWaitRep *rep = (NdbcntrStartWaitRep*)signal->getDataPtrSend();
2062       rep->nodeId = nodeId;
2063       EXECUTE_DIRECT(DBDIH, GSN_NDBCNTR_START_WAIT_REP, signal,
2064                      NdbcntrStartWaitRep::SignalLength);
2065       return;
2066     }
2067   }
2068 
2069   if(starting){
2070     jam();
2071     trySystemRestart(signal);
2072   } else {
2073     jam();
2074     startWaitingNodes(signal);
2075   }
2076   return;
2077 }
2078 
2079 void
2080 Ndbcntr::startWaitingNodes(Signal * signal){
2081 
2082 #if ! PARALLELL_NR
2083   if (!c_start.m_waitTO.isclear())
2084   {
2085     jam();
2086 
2087     {
2088       char buf[100];
2089       ndbout_c("starting (TO) %s", c_start.m_waitTO.getText(buf));
2090     }
2091 
2092     /**
2093      * TO during SR
2094      *   this can run in parallel (nowadays :-)
2095      */
2096     NodeReceiverGroup rg(NDBCNTR, c_start.m_waitTO);
2097     c_start.m_starting.bitOR(c_start.m_waitTO);
2098     c_start.m_waiting.bitANDC(c_start.m_waitTO);
2099     c_start.m_waitTO.clear();
2100 
2101     /**
2102      * They are stuck in CntrWaitRep::ZWAITPOINT_4_1
2103      *   have all meta data ok...but needs START_COPYREQ
2104      */
2105     CntrWaitRep* rep = (CntrWaitRep*)signal->getDataPtrSend();
2106     rep->nodeId = getOwnNodeId();
2107     rep->waitPoint = CntrWaitRep::ZWAITPOINT_4_2_TO;
2108     sendSignal(rg, GSN_CNTR_WAITREP, signal, 2, JBB);
2109     return;
2110   }
2111 
2112   const Uint32 nodeId = c_start.m_waiting.find(0);
2113   const Uint32 Tref = calcNdbCntrBlockRef(nodeId);
2114   ndbrequire(nodeId != c_start.m_waiting.NotFound);
2115 
2116   NodeState::StartType nrType = NodeState::ST_NODE_RESTART;
2117   const char *start_type_str = "node restart";
2118   if(c_start.m_withoutLog.get(nodeId))
2119   {
2120     jam();
2121     nrType = NodeState::ST_INITIAL_NODE_RESTART;
2122     start_type_str = "initial node restart";
2123   }
2124 
2125   /**
2126    * Let node perform restart
2127    */
2128   infoEvent("Start node: %u using %s as part of system restart",
2129             nodeId, start_type_str);
2130 
2131   CntrStartConf * conf = (CntrStartConf*)signal->getDataPtrSend();
2132   conf->noStartNodes = 1;
2133   conf->startType = nrType;
2134   conf->startGci = ~0; // Not used
2135   conf->masterNodeId = getOwnNodeId();
2136   BitmaskImpl::clear(NdbNodeBitmask::Size, conf->startingNodes);
2137   BitmaskImpl::set(NdbNodeBitmask::Size, conf->startingNodes, nodeId);
2138   c_startedNodes.copyto(NdbNodeBitmask::Size, conf->startedNodes);
2139   sendSignal(Tref, GSN_CNTR_START_CONF, signal,
2140 	     CntrStartConf::SignalLength, JBB);
2141 
2142   /**
2143    * A node restart is ongoing where we are master and we just accepted this
2144    * node to proceed with its node restart. Inform DBDIH about this event in
2145    * the node restart.
2146    */
2147   NdbcntrStartedRep *rep = (NdbcntrStartedRep*)signal->getDataPtrSend();
2148   rep->nodeId = nodeId;
2149   EXECUTE_DIRECT(DBDIH, GSN_NDBCNTR_STARTED_REP, signal,
2150                  NdbcntrStartedRep::SignalLength);
2151 
2152   c_start.m_waiting.clear(nodeId);
2153   c_start.m_withLog.clear(nodeId);
2154   c_start.m_withoutLog.clear(nodeId);
2155   c_start.m_starting.set(nodeId);
2156 #else
2157   // Parallell nr
2158 
2159   c_start.m_starting = c_start.m_waiting;
2160   c_start.m_waiting.clear();
2161 
2162   CntrStartConf * conf = (CntrStartConf*)signal->getDataPtrSend();
2163   conf->noStartNodes = 1;
2164   conf->startGci = ~0; // Not used
2165   conf->masterNodeId = getOwnNodeId();
2166   c_start.m_starting.copyto(NdbNodeBitmask::Size, conf->startingNodes);
2167   c_startedNodes.copyto(NdbNodeBitmask::Size, conf->startedNodes);
2168 
2169   char buf[100];
2170   if(!c_start.m_withLog.isclear()){
2171     jam();
2172     ndbout_c("Starting nodes w/ log: %s", c_start.m_withLog.getText(buf));
2173 
2174     NodeReceiverGroup rg(NDBCNTR, c_start.m_withLog);
2175     conf->startType = NodeState::ST_NODE_RESTART;
2176 
2177     sendSignal(rg, GSN_CNTR_START_CONF, signal,
2178 	       CntrStartConf::SignalLength, JBB);
2179   }
2180 
2181   if(!c_start.m_withoutLog.isclear()){
2182     jam();
2183     ndbout_c("Starting nodes wo/ log: %s", c_start.m_withoutLog.getText(buf));
2184     NodeReceiverGroup rg(NDBCNTR, c_start.m_withoutLog);
2185     conf->startType = NodeState::ST_INITIAL_NODE_RESTART;
2186 
2187     sendSignal(rg, GSN_CNTR_START_CONF, signal,
2188 	       CntrStartConf::SignalLength, JBB);
2189   }
2190 
2191   c_start.m_waiting.clear();
2192   c_start.m_withLog.clear();
2193   c_start.m_withoutLog.clear();
2194 #endif
2195 }
2196 
2197 void
2198 Ndbcntr::sendCntrStartRef(Signal * signal,
2199 			  Uint32 nodeId, CntrStartRef::ErrorCode code){
2200   CntrStartRef * ref = (CntrStartRef*)signal->getDataPtrSend();
2201   ref->errorCode = code;
2202   ref->masterNodeId = cmasterNodeId;
2203   sendSignal(calcNdbCntrBlockRef(nodeId), GSN_CNTR_START_REF, signal,
2204 	     CntrStartRef::SignalLength, JBB);
2205 }
2206 
2207 CheckNodeGroups::Output
2208 Ndbcntr::checkNodeGroups(Signal* signal, const NdbNodeBitmask & mask){
2209   CheckNodeGroups* sd = (CheckNodeGroups*)&signal->theData[0];
2210   sd->blockRef = reference();
2211   sd->requestType = CheckNodeGroups::Direct | CheckNodeGroups::ArbitCheck;
2212   sd->mask = mask;
2213   EXECUTE_DIRECT(DBDIH, GSN_CHECKNODEGROUPSREQ, signal,
2214 		 CheckNodeGroups::SignalLength);
2215   jamEntry();
2216   return (CheckNodeGroups::Output)sd->output;
2217 }
2218 
2219 bool
2220 Ndbcntr::trySystemRestart(Signal* signal){
2221   /**
2222    * System restart something
2223    */
2224   const bool allNodes = c_start.m_waiting.equal(c_allDefinedNodes);
2225   const bool allClusterNodes = c_start.m_waiting.equal(c_clusterNodes);
2226 
2227   if(!allClusterNodes){
2228     jam();
2229     return false;
2230   }
2231 
2232   NodeState::StartType srType = NodeState::ST_SYSTEM_RESTART;
2233   if(c_start.m_waiting.equal(c_start.m_withoutLog))
2234   {
2235     jam();
2236     srType = NodeState::ST_INITIAL_START;
2237     c_start.m_starting = c_start.m_withoutLog; // Used for starting...
2238     c_start.m_withoutLog.clear();
2239   } else {
2240 
2241     CheckNodeGroups::Output wLog = checkNodeGroups(signal, c_start.m_withLog);
2242 
2243     switch (wLog) {
2244     case CheckNodeGroups::Win:
2245       jam();
2246       break;
2247     case CheckNodeGroups::Lose:
2248       jam();
2249       // If we lose with all nodes, then we're in trouble
2250       ndbrequire(!allNodes);
2251       return false;
2252     case CheckNodeGroups::Partitioning:
2253       jam();
2254       bool allowPartition = (c_start.m_startPartitionedTimeout != (Uint64)~0);
2255 
2256       if(allNodes){
2257 	if(allowPartition){
2258 	  jam();
2259 	  break;
2260 	}
2261 	ndbrequire(false); // All nodes -> partitioning, which is not allowed
2262       }
2263 
2264       break;
2265     }
2266 
2267     // For now only with the "logged"-ones.
2268     // Let the others do node restart afterwards...
2269     c_start.m_starting = c_start.m_withLog;
2270     c_start.m_withLog.clear();
2271   }
2272 
2273   /**
2274    * Okidoki, we try to start
2275    */
2276   CntrStartConf * conf = (CntrStartConf*)signal->getDataPtr();
2277   conf->noStartNodes = c_start.m_starting.count();
2278   conf->startType = srType;
2279   conf->startGci = c_start.m_lastGci;
2280   conf->masterNodeId = c_start.m_lastGciNodeId;
2281   c_start.m_starting.copyto(NdbNodeBitmask::Size, conf->startingNodes);
2282   c_startedNodes.copyto(NdbNodeBitmask::Size, conf->startedNodes);
2283 
2284   ndbrequire(c_start.m_lastGciNodeId == getOwnNodeId());
2285 
2286   infoEvent("System Restart: master node: %u, num starting: %u, gci: %u",
2287             conf->masterNodeId,
2288             conf->noStartNodes,
2289             conf->startGci);
2290   char buf[100];
2291   infoEvent("CNTR_START_CONF: started: %s", c_startedNodes.getText(buf));
2292   infoEvent("CNTR_START_CONF: starting: %s", c_start.m_starting.getText(buf));
2293 
2294   NodeReceiverGroup rg(NDBCNTR, c_start.m_starting);
2295   sendSignal(rg, GSN_CNTR_START_CONF, signal, CntrStartConf::SignalLength,JBB);
2296 
2297   c_start.m_waiting.bitANDC(c_start.m_starting);
2298 
2299   return true;
2300 }
2301 
2302 void Ndbcntr::ph2GLab(Signal* signal)
2303 {
2304   if (cndbBlocksCount < ZNO_NDB_BLOCKS)
2305   {
2306     jam();
2307     sendNdbSttor(signal);
2308     return;
2309   }//if
2310   g_eventLogger->info("NDB start phase 1 completed");
2311   sendSttorry(signal);
2312   return;
2313 }//Ndbcntr::ph2GLab()
2314 
2315 /*
2316 4.4  START PHASE 3 */
2317 /*###########################################################################*/
2318 // SEND SIGNAL NDBSTTOR TO ALL BLOCKS, ACC, DICT, DIH, LQH, TC AND TUP
2319 // WHEN ALL BLOCKS HAVE RETURNED THEIR NDB_STTORRY, ALL BLOCKS HAVE FINISHED
2320 // THEIR LOCAL CONNECTIONS SUCCESSFULLY
2321 // AND THEN WE CAN SEND APPL_STARTREG TO INFORM QMGR THAT WE ARE READY TO
2322 // SET UP DISTRIBUTED CONNECTIONS.
2323 /*--------------------------------------------------------------*/
2324 // THIS IS NDB START PHASE 3.
2325 /*--------------------------------------------------------------*/
2326 /*******************************/
2327 /*  STTOR                      */
2328 /*******************************/
2329 void Ndbcntr::startPhase3Lab(Signal* signal)
2330 {
2331   g_eventLogger->info("Start NDB start phase 2");
2332   ph3ALab(signal);
2333   return;
2334 }//Ndbcntr::startPhase3Lab()
2335 
2336 /*******************************/
2337 /*  NDB_STTORRY                */
2338 /*******************************/
2339 void Ndbcntr::ph3ALab(Signal* signal)
2340 {
2341   if (cndbBlocksCount < ZNO_NDB_BLOCKS)
2342   {
2343     jam();
2344     sendNdbSttor(signal);
2345     return;
2346   }//if
2347   g_eventLogger->info("NDB start phase 2 completed");
2348   sendSttorry(signal);
2349   return;
2350 }//Ndbcntr::ph3ALab()
2351 
2352 /*
2353 4.5  START PHASE 4      */
2354 /*###########################################################################*/
2355 // WAIT FOR ALL NODES IN CLUSTER TO CHANGE STATE INTO ZSTART ,
2356 // APPL_CHANGEREP IS ALWAYS SENT WHEN SOMEONE HAS
2357 // CHANGED THEIR STATE. APPL_STARTCONF INDICATES THAT ALL NODES ARE IN START
2358 // STATE SEND NDB_STARTREQ TO DIH AND THEN WAIT FOR NDB_STARTCONF
2359 /*---------------------------------------------------------------------------*/
2360 /*******************************/
2361 /*  STTOR                      */
2362 /*******************************/
2363 void Ndbcntr::startPhase4Lab(Signal* signal)
2364 {
2365   g_eventLogger->info("Start NDB start phase 3");
2366   ph4ALab(signal);
2367 }//Ndbcntr::startPhase4Lab()
2368 
2369 
2370 void Ndbcntr::ph4ALab(Signal* signal)
2371 {
2372   ph4BLab(signal);
2373   return;
2374 }//Ndbcntr::ph4ALab()
2375 
2376 /*******************************/
2377 /*  NDB_STTORRY                */
2378 /*******************************/
2379 void Ndbcntr::ph4BLab(Signal* signal)
2380 {
2381 /*--------------------------------------*/
2382 /* CASE: CSTART_PHASE = ZSTART_PHASE_4  */
2383 /*--------------------------------------*/
2384   if (cndbBlocksCount < ZNO_NDB_BLOCKS)
2385   {
2386     jam();
2387     sendNdbSttor(signal);
2388     return;
2389   }//if
2390   if (ERROR_INSERTED(1010))
2391   {
2392     /* Just delay things for 10 seconds */
2393     CLEAR_ERROR_INSERT_VALUE;
2394     sendSignalWithDelay(reference(), GSN_NDB_STTORRY, signal,
2395                         10000, 1);
2396     return;
2397   }
2398   g_eventLogger->info("NDB start phase 3 completed");
2399   if ((ctypeOfStart == NodeState::ST_NODE_RESTART) ||
2400       (ctypeOfStart == NodeState::ST_INITIAL_NODE_RESTART))
2401   {
2402     jam();
2403     sendSttorry(signal);
2404     return;
2405   }//if
2406   waitpoint41Lab(signal);
2407   return;
2408 }//Ndbcntr::ph4BLab()
2409 
2410 void Ndbcntr::waitpoint41Lab(Signal* signal)
2411 {
2412   if (getOwnNodeId() == cmasterNodeId) {
2413     jam();
2414 /*--------------------------------------*/
2415 /* MASTER WAITS UNTIL ALL SLAVES HAVE   */
2416 /* SENT THE REPORTS                     */
2417 /*--------------------------------------*/
2418     cnoWaitrep++;
2419     if (cnoWaitrep == cnoStartNodes) {
2420       jam();
2421       cnoWaitrep = 0;
2422 /*---------------------------------------------------------------------------*/
2423 // NDB_STARTREQ STARTS UP ALL SET UP OF DISTRIBUTION INFORMATION IN DIH AND
2424 // DICT. AFTER SETTING UP THIS
2425 // DATA IT USES THAT DATA TO SET UP WHICH FRAGMENTS THAT ARE TO START AND
2426 // WHERE THEY ARE TO START. THEN
2427 // IT SETS UP THE FRAGMENTS AND RECOVERS THEM BY:
2428 //  1) READING A LOCAL CHECKPOINT FROM DISK.
2429 //  2) EXECUTING THE UNDO LOG ON INDEX AND DATA.
2430 //  3) EXECUTING THE FRAGMENT REDO LOG FROM ONE OR SEVERAL NODES TO
2431 //     RESTORE THE RESTART CONFIGURATION OF DATA IN NDB CLUSTER.
2432 /*---------------------------------------------------------------------------*/
2433       signal->theData[0] = reference();
2434       signal->theData[1] = ctypeOfStart;
2435       sendSignal(DBDIH_REF, GSN_NDB_STARTREQ, signal, 2, JBB);
2436     }//if
2437   } else {
2438     jam();
2439 /*--------------------------------------*/
2440 /* SLAVE NODES WILL PASS HERE ONCE AND  */
2441 /* SEND A WAITPOINT REPORT TO MASTER.   */
2442 /* SLAVES WONT DO ANYTHING UNTIL THEY   */
2443 /* RECEIVE A WAIT REPORT FROM THE MASTER*/
2444 /*--------------------------------------*/
2445     signal->theData[0] = getOwnNodeId();
2446     signal->theData[1] = CntrWaitRep::ZWAITPOINT_4_1;
2447     sendSignal(calcNdbCntrBlockRef(cmasterNodeId),
2448 	       GSN_CNTR_WAITREP, signal, 2, JBB);
2449   }//if
2450   return;
2451 }//Ndbcntr::waitpoint41Lab()
2452 
2453 void
2454 Ndbcntr::waitpoint42To(Signal* signal)
2455 {
2456   jam();
2457 
2458   /**
2459    * This is an ugly hack
2460    * To "easily" enable TO during SR
2461    *   a better solution would be to move "all" start handling
2462    *   from DIH to cntr...which knows what's going on
2463    */
2464   cdihStartType = NodeState::ST_SYSTEM_RESTART;
2465   ctypeOfStart = NodeState::ST_NODE_RESTART;
2466 
2467   /**
2468    * This is immensely ugly...but makes TUX work (yuck)
2469    */
2470   {
2471     NodeStateRep* rep = (NodeStateRep*)signal->getDataPtrSend();
2472     rep->nodeState = getNodeState();
2473     rep->nodeState.masterNodeId = cmasterNodeId;
2474     rep->nodeState.setNodeGroup(c_nodeGroup);
2475     rep->nodeState.starting.restartType = NodeState::ST_NODE_RESTART;
2476 
2477     sendSignal(DBTUX_REF, GSN_NODE_STATE_REP, signal,
2478                NodeStateRep::SignalLength, JBB);
2479   }
2480 
2481   /**
2482    * We were forced to perform TO
2483    */
2484   StartCopyReq* req = (StartCopyReq*)signal->getDataPtrSend();
2485   req->senderRef = reference();
2486   req->senderData = RNIL;
2487   req->flags = StartCopyReq::WAIT_LCP;
2488   req->startingNodeId = getOwnNodeId();
2489   sendSignal(DBDIH_REF, GSN_START_COPYREQ, signal,
2490              StartCopyReq::SignalLength, JBB);
2491 }
2492 
2493 void
2494 Ndbcntr::execSTART_COPYREF(Signal* signal)
2495 {
2496 
2497 }
2498 
2499 void
2500 Ndbcntr::execSTART_COPYCONF(Signal* signal)
2501 {
2502   sendSttorry(signal);
2503 }
2504 
2505 
2506 /*******************************/
2507 /*  NDB_STARTCONF              */
2508 /*******************************/
2509 void Ndbcntr::execNDB_STARTCONF(Signal* signal)
2510 {
2511   jamEntry();
2512 
2513   NdbNodeBitmask tmp;
2514   if (signal->getLength() >= 1 + NdbNodeBitmask::Size)
2515   {
2516     jam();
2517     tmp.assign(NdbNodeBitmask::Size, signal->theData+1);
2518     if (!c_start.m_starting.equal(tmp))
2519     {
2520       /**
2521        * Some nodes have been "excluded" from SR
2522        */
2523       char buf0[100], buf1[100];
2524       g_eventLogger->info("execNDB_STARTCONF: changing from %s to %s",
2525                           c_start.m_starting.getText(buf0),
2526                           tmp.getText(buf1));
2527 
2528       NdbNodeBitmask waiting = c_start.m_starting;
2529       waiting.bitANDC(tmp);
2530 
2531       c_start.m_waiting.bitOR(waiting);
2532       c_start.m_waitTO.bitOR(waiting);
2533 
2534       c_start.m_starting.assign(tmp);
2535       cnoStartNodes = c_start.m_starting.count();
2536     }
2537   }
2538 
2539   NodeReceiverGroup rg(NDBCNTR, c_start.m_starting);
2540   signal->theData[0] = getOwnNodeId();
2541   signal->theData[1] = CntrWaitRep::ZWAITPOINT_4_2;
2542   c_start.m_starting.copyto(NdbNodeBitmask::Size, signal->theData+2);
2543   sendSignal(rg, GSN_CNTR_WAITREP, signal, 2 + NdbNodeBitmask::Size,
2544              JBB);
2545   return;
2546 }//Ndbcntr::execNDB_STARTCONF()
2547 
2548 /*
2549 4.6  START PHASE 5      */
2550 /*###########################################################################*/
2551 // SEND APPL_RUN TO THE QMGR IN THIS BLOCK
2552 // SEND NDB_STTOR ALL BLOCKS ACC, DICT, DIH, LQH, TC AND TUP THEN WAIT FOR
2553 // THEIR NDB_STTORRY
2554 /*---------------------------------------------------------------------------*/
2555 /*******************************/
2556 /*  STTOR                      */
2557 /*******************************/
2558 void Ndbcntr::startPhase5Lab(Signal* signal)
2559 {
2560   ph5ALab(signal);
2561   return;
2562 }//Ndbcntr::startPhase5Lab()
2563 
2564 /*******************************/
2565 /*  NDB_STTORRY                */
2566 /*******************************/
2567 /*---------------------------------------------------------------------------*/
2568 // THIS IS NDB START PHASE 5.
2569 /*---------------------------------------------------------------------------*/
2570 // IN THIS START PHASE TUP INITIALISES DISK FILES FOR DISK STORAGE IF INITIAL
2571 // START. DIH WILL START UP
2572 // THE GLOBAL CHECKPOINT PROTOCOL AND WILL CONCLUDE ANY UNFINISHED TAKE OVERS
2573 // THAT STARTED BEFORE THE SYSTEM CRASH.
2574 /*---------------------------------------------------------------------------*/
2575 void Ndbcntr::ph5ALab(Signal* signal)
2576 {
2577   if (cndbBlocksCount < ZNO_NDB_BLOCKS)
2578   {
2579     jam();
2580     sendNdbSttor(signal);
2581     return;
2582   }//if
2583   g_eventLogger->info("NDB start phase 4 completed");
2584 
2585   cstartPhase = cstartPhase + 1;
2586   cinternalStartphase = cstartPhase - 1;
2587   if (getOwnNodeId() == cmasterNodeId) {
2588     switch(ctypeOfStart){
2589     case NodeState::ST_INITIAL_START:
2590       jam();
2591       /*--------------------------------------*/
2592       /* MASTER CNTR IS RESPONSIBLE FOR       */
2593       /* CREATING SYSTEM TABLES               */
2594       /*--------------------------------------*/
2595       g_eventLogger->info("Creating System Tables Starting"
2596                           " as part of initial start");
2597       beginSchemaTransLab(signal);
2598       return;
2599     case NodeState::ST_SYSTEM_RESTART:
2600       jam();
2601       g_eventLogger->info("As master we will wait for other nodes to reach"
2602                           " the state waitpoint52 as well");
2603       waitpoint52Lab(signal);
2604       return;
2605     case NodeState::ST_NODE_RESTART:
2606     case NodeState::ST_INITIAL_NODE_RESTART:
2607       jam();
2608       break;
2609     case NodeState::ST_ILLEGAL_TYPE:
2610       jam();
2611       break;
2612     }
2613     ndbrequire(false);
2614   }
2615 
2616   /**
2617    * Not master
2618    */
2619   NdbSttor * const req = (NdbSttor*)signal->getDataPtrSend();
2620   switch(ctypeOfStart){
2621   case NodeState::ST_NODE_RESTART:
2622   case NodeState::ST_INITIAL_NODE_RESTART:
2623     jam();
2624     /*----------------------------------------------------------------------*/
2625     // SEND NDB START PHASE 5 IN NODE RESTARTS TO COPY DATA TO THE NEWLY
2626     // STARTED NODE.
2627     /*----------------------------------------------------------------------*/
2628     req->senderRef = reference();
2629     req->nodeId = getOwnNodeId();
2630     req->internalStartPhase = cinternalStartphase;
2631     req->typeOfStart = cdihStartType;
2632     req->masterNodeId = cmasterNodeId;
2633 
2634     g_eventLogger->info("Start NDB start phase 5 (only to DBDIH)");
2635     //#define TRACE_STTOR
2636 #ifdef TRACE_STTOR
2637     ndbout_c("sending NDB_STTOR(%d) to DIH", cinternalStartphase);
2638 #endif
2639     sendSignal(DBDIH_REF, GSN_NDB_STTOR, signal,
2640 	       NdbSttor::SignalLength, JBB);
2641     return;
2642   case NodeState::ST_INITIAL_START:
2643   case NodeState::ST_SYSTEM_RESTART:
2644     jam();
2645     /*--------------------------------------*/
2646     /* DURING SYSTEMRESTART AND INITIALSTART*/
2647     /* SLAVE NODES WILL PASS HERE ONCE AND  */
2648     /* SEND A WAITPOINT REPORT TO MASTER.   */
2649     /* SLAVES WONT DO ANYTHING UNTIL THEY   */
2650     /* RECEIVE A WAIT REPORT FROM THE MASTER*/
2651     /* WHEN THE MASTER HAS FINISHED HIS WORK*/
2652     /*--------------------------------------*/
2653     g_eventLogger->info("During cluster start/restart only master runs"
2654                         " phase 5 of NDB start phases");
2655     g_eventLogger->info("Report to master node our state and wait for master");
2656 
2657     signal->theData[0] = getOwnNodeId();
2658     signal->theData[1] = CntrWaitRep::ZWAITPOINT_5_2;
2659     sendSignal(calcNdbCntrBlockRef(cmasterNodeId),
2660 	       GSN_CNTR_WAITREP, signal, 2, JBB);
2661     return;
2662   default:
2663     ndbrequire(false);
2664   }
2665 }//Ndbcntr::ph5ALab()
2666 
2667 void Ndbcntr::waitpoint52Lab(Signal* signal)
2668 {
2669   cnoWaitrep = cnoWaitrep + 1;
2670 /*---------------------------------------------------------------------------*/
2671 // THIS WAITING POINT IS ONLY USED BY A MASTER NODE. WE WILL EXECUTE NDB START
2672 // PHASE 5 FOR DIH IN THE
2673 // MASTER. THIS WILL START UP LOCAL CHECKPOINTS AND WILL ALSO CONCLUDE ANY
2674 // UNFINISHED LOCAL CHECKPOINTS
2675 // BEFORE THE SYSTEM CRASH. THIS WILL ENSURE THAT WE ALWAYS RESTART FROM A
2676 // WELL KNOWN STATE.
2677 /*---------------------------------------------------------------------------*/
2678 /*--------------------------------------*/
2679 /* MASTER WAITS UNTIL HE RECEIVED WAIT  */
2680 /* REPORTS FROM ALL SLAVE CNTR          */
2681 /*--------------------------------------*/
2682   if (cnoWaitrep == cnoStartNodes) {
2683     jam();
2684     cnoWaitrep = 0;
2685 
2686     g_eventLogger->info("Start NDB start phase 5 (only to DBDIH)");
2687     NdbSttor * const req = (NdbSttor*)signal->getDataPtrSend();
2688     req->senderRef = reference();
2689     req->nodeId = getOwnNodeId();
2690     req->internalStartPhase = cinternalStartphase;
2691     req->typeOfStart = cdihStartType;
2692     req->masterNodeId = cmasterNodeId;
2693 #ifdef TRACE_STTOR
2694     ndbout_c("sending NDB_STTOR(%d) to DIH", cinternalStartphase);
2695 #endif
2696     sendSignal(DBDIH_REF, GSN_NDB_STTOR, signal,
2697 	       NdbSttor::SignalLength, JBB);
2698   }//if
2699   return;
2700 }//Ndbcntr::waitpoint52Lab()
2701 
2702 /*******************************/
2703 /*  NDB_STTORRY                */
2704 /*******************************/
2705 void Ndbcntr::ph6ALab(Signal* signal)
2706 {
2707   g_eventLogger->info("NDB start phase 5 completed");
2708   if ((ctypeOfStart == NodeState::ST_NODE_RESTART) ||
2709       (ctypeOfStart == NodeState::ST_INITIAL_NODE_RESTART))
2710   {
2711     jam();
2712     waitpoint51Lab(signal);
2713     return;
2714   }//if
2715 
2716   NodeReceiverGroup rg(NDBCNTR, c_start.m_starting);
2717   rg.m_nodes.clear(getOwnNodeId());
2718   signal->theData[0] = getOwnNodeId();
2719   signal->theData[1] = CntrWaitRep::ZWAITPOINT_5_1;
2720   sendSignal(rg, GSN_CNTR_WAITREP, signal, 2, JBB);
2721 
2722   waitpoint51Lab(signal);
2723   return;
2724 }//Ndbcntr::ph6ALab()
2725 
2726 void Ndbcntr::waitpoint51Lab(Signal* signal)
2727 {
2728   cstartPhase = cstartPhase + 1;
2729 /*---------------------------------------------------------------------------*/
2730 // A FINAL STEP IS NOW TO SEND NDB_STTOR TO TC. THIS MAKES IT POSSIBLE TO
2731 // CONNECT TO TC FOR APPLICATIONS.
2732 // THIS IS NDB START PHASE 6 WHICH IS FOR ALL BLOCKS IN ALL NODES.
2733 /*---------------------------------------------------------------------------*/
2734   g_eventLogger->info("Start NDB start phase 6");
2735   cinternalStartphase = cstartPhase - 1;
2736   cndbBlocksCount = 0;
2737   ph6BLab(signal);
2738   return;
2739 }//Ndbcntr::waitpoint51Lab()
2740 
2741 void Ndbcntr::ph6BLab(Signal* signal)
2742 {
2743   // c_missra.currentStartPhase - cstartPhase - cinternalStartphase =
2744   // 5 - 7 - 6
2745   if (cndbBlocksCount < ZNO_NDB_BLOCKS)
2746   {
2747     jam();
2748     sendNdbSttor(signal);
2749     return;
2750   }//if
2751   g_eventLogger->info("NDB start phase 6 completed");
2752   if ((ctypeOfStart == NodeState::ST_NODE_RESTART) ||
2753       (ctypeOfStart == NodeState::ST_INITIAL_NODE_RESTART))
2754   {
2755     jam();
2756     sendSttorry(signal);
2757     return;
2758   }
2759   waitpoint61Lab(signal);
2760 }
2761 
2762 void Ndbcntr::waitpoint61Lab(Signal* signal)
2763 {
2764   if (getOwnNodeId() == cmasterNodeId) {
2765     jam();
2766     cnoWaitrep6++;
2767     if (cnoWaitrep6 == cnoStartNodes) {
2768       jam();
2769       NodeReceiverGroup rg(NDBCNTR, c_start.m_starting);
2770       rg.m_nodes.clear(getOwnNodeId());
2771       signal->theData[0] = getOwnNodeId();
2772       signal->theData[1] = CntrWaitRep::ZWAITPOINT_6_2;
2773       sendSignal(rg, GSN_CNTR_WAITREP, signal, 2, JBB);
2774       sendSttorry(signal);
2775     }
2776   } else {
2777     jam();
2778     signal->theData[0] = getOwnNodeId();
2779     signal->theData[1] = CntrWaitRep::ZWAITPOINT_6_1;
2780     sendSignal(calcNdbCntrBlockRef(cmasterNodeId), GSN_CNTR_WAITREP, signal, 2, JBB);
2781   }
2782 }
2783 
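// Note on numbering: the STTOR start phase handled here (cstartPhase) is one
// ahead of the internal NDB start phase that is sent to the blocks, i.e.
// start phase 8 runs internal phase 7 (cinternalStartphase = cstartPhase - 1).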
2784 // Start phase 8 (internal 7)
2785 void Ndbcntr::startPhase8Lab(Signal* signal)
2786 {
2787   g_eventLogger->info("Start NDB start phase 7");
2788   cinternalStartphase = cstartPhase - 1;
2789   cndbBlocksCount = 0;
2790   ph7ALab(signal);
2791 }
2792 
2793 void Ndbcntr::ph7ALab(Signal* signal)
2794 {
2795   while (cndbBlocksCount < ZNO_NDB_BLOCKS)
2796   {
2797     jam();
2798     sendNdbSttor(signal);
2799     return;
2800   }
2801   g_eventLogger->info("NDB start phase 7 completed");
2802   if ((ctypeOfStart == NodeState::ST_NODE_RESTART) ||
2803       (ctypeOfStart == NodeState::ST_INITIAL_NODE_RESTART))
2804   {
2805     jam();
2806     sendSttorry(signal);
2807     return;
2808   }
2809   waitpoint71Lab(signal);
2810 }
2811 
2812 void Ndbcntr::waitpoint71Lab(Signal* signal)
2813 {
2814   if (getOwnNodeId() == cmasterNodeId) {
2815     jam();
2816     cnoWaitrep7++;
2817     if (cnoWaitrep7 == cnoStartNodes) {
2818       jam();
2819       NodeReceiverGroup rg(NDBCNTR, c_start.m_starting);
2820       rg.m_nodes.clear(getOwnNodeId());
2821       signal->theData[0] = getOwnNodeId();
2822       signal->theData[1] = CntrWaitRep::ZWAITPOINT_7_2;
2823       sendSignal(rg, GSN_CNTR_WAITREP, signal, 2, JBB);
2824       sendSttorry(signal);
2825     }
2826   } else {
2827     jam();
2828     signal->theData[0] = getOwnNodeId();
2829     signal->theData[1] = CntrWaitRep::ZWAITPOINT_7_1;
2830     sendSignal(calcNdbCntrBlockRef(cmasterNodeId), GSN_CNTR_WAITREP, signal, 2, JBB);
2831   }
2832 }
2833 
2834 // Start phase 9 (internal 8)
2835 void Ndbcntr::startPhase9Lab(Signal* signal)
2836 {
2837   cinternalStartphase = cstartPhase - 1;
2838   cndbBlocksCount = 0;
2839   ph8ALab(signal);
2840 }
2841 
2842 void Ndbcntr::ph8ALab(Signal* signal)
2843 {
2844   sendSttorry(signal);
2845   resetStartVariables(signal);
2846   return;
2847 }//Ndbcntr::ph8ALab()
2848 
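/**
 * wait_sp() - called before entering start phase 'sp' on a starting node.
 * During a system restart or initial start it reports the phase this node
 * wants to enter to the master (CNTR_WAITREP with waitPoint = RNIL and
 * request = WaitFor) and returns true, i.e. the caller must wait for a
 * Grant from the master before proceeding.  For other start types, for
 * phases <= 2, or when the master version does not support this protocol
 * (the ndb_wait_sp check), it returns false and the node proceeds at once.
 */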
2849 bool
2850 Ndbcntr::wait_sp(Signal* signal, Uint32 sp)
2851 {
2852   if (sp <= 2)
2853     return false;
2854 
2855   switch(ctypeOfStart){
2856   case NodeState::ST_SYSTEM_RESTART:
2857   case NodeState::ST_INITIAL_START:
2858     /**
2859      * synchronized...
2860      */
2861     break;
2862   default:
2863     return false;
2864   }
2865 
2866   if (!ndb_wait_sp(getNodeInfo(cmasterNodeId).m_version))
2867     return false;
2868 
2869   CntrWaitRep* rep = (CntrWaitRep*)signal->getDataPtrSend();
2870   rep->nodeId = getOwnNodeId();
2871   rep->waitPoint = RNIL;
2872   rep->request = CntrWaitRep::WaitFor;
2873   rep->sp = sp;
2874 
2875   sendSignal(calcNdbCntrBlockRef(cmasterNodeId),
2876              GSN_CNTR_WAITREP, signal, CntrWaitRep::SignalLength, JBB);
2877 
2878   return true; // wait
2879 }
2880 
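/**
 * wait_sp_rep() - master side of the start phase synchronization above.
 * Each starting node's requested phase is recorded in c_start.m_wait_sp.
 * Once every protocol-capable starting node has reported (no entry left at
 * zero), the master grants the minimum requested phase to exactly the nodes
 * waiting for that phase, so the starting nodes advance in lock-step.
 */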
2881 void
2882 Ndbcntr::wait_sp_rep(Signal* signal)
2883 {
2884   CntrWaitRep rep = *(CntrWaitRep*)signal->getDataPtrSend();
2885   switch(rep.request){
2886   case CntrWaitRep::WaitFor:
2887     jam();
2888     ndbrequire(cmasterNodeId == getOwnNodeId());
2889     break;
2890   case CntrWaitRep::Grant:
2891     jam();
2892     /**
2893      * We're allowed to proceed
2894      */
2895     c_missra.sendNextSTTOR(signal);
2896     return;
2897   }
2898 
2899   c_start.m_wait_sp[rep.nodeId] = rep.sp;
2900 
2901   /**
2902    * Check if we should allow someone to start...
2903    */
2904   Uint32 node = c_start.m_starting.find(0);
2905   ndbrequire(node < NDB_ARRAY_SIZE(c_start.m_wait_sp));
2906   Uint32 min = c_start.m_wait_sp[node];
2907   for (; node != NdbNodeBitmask::NotFound;
2908        node = c_start.m_starting.find(node + 1))
2909   {
2910     if (!ndb_wait_sp(getNodeInfo(node).m_version))
2911       continue;
2912 
2913     if (c_start.m_wait_sp[node] < min)
2914     {
2915       min = c_start.m_wait_sp[node];
2916     }
2917   }
2918 
2919   if (min == 0)
2920   {
2921     /**
2922      * wait for more
2923      */
2924     return;
2925   }
2926 
2927   NdbNodeBitmask grantnodes;
2928   node = c_start.m_starting.find(0);
2929   for (; node != NdbNodeBitmask::NotFound;
2930        node = c_start.m_starting.find(node + 1))
2931   {
2932     if (!ndb_wait_sp(getNodeInfo(node).m_version))
2933       continue;
2934 
2935     if (c_start.m_wait_sp[node] == min)
2936     {
2937       grantnodes.set(node);
2938       c_start.m_wait_sp[node] = 0;
2939     }
2940   }
2941 
2942   char buf[100];
2943   g_eventLogger->info("Grant nodes to start phase: %u, nodes: %s",
2944                       min,
2945                       grantnodes.getText(buf));
2946 
2947   NodeReceiverGroup rg(NDBCNTR, grantnodes);
2948   CntrWaitRep * conf = (CntrWaitRep*)signal->getDataPtrSend();
2949   conf->nodeId = getOwnNodeId();
2950   conf->waitPoint = RNIL;
2951   conf->request = CntrWaitRep::Grant;
2952   conf->sp = min;
2953   sendSignal(rg, GSN_CNTR_WAITREP, signal, CntrWaitRep::SignalLength, JBB);
2954 }
2955 
2956 /*******************************/
2957 /*  CNTR_WAITREP               */
2958 /*******************************/
2959 void Ndbcntr::execCNTR_WAITREP(Signal* signal)
2960 {
2961   jamEntry();
2962   CntrWaitRep* rep = (CntrWaitRep*)signal->getDataPtr();
2963 
2964   Uint32 twaitPoint = rep->waitPoint;
2965   switch (twaitPoint) {
2966   case CntrWaitRep::ZWAITPOINT_4_1:
2967     jam();
2968     waitpoint41Lab(signal);
2969     break;
2970   case CntrWaitRep::ZWAITPOINT_4_2:
2971     jam();
2972     c_start.m_starting.assign(NdbNodeBitmask::Size, signal->theData + 2);
2973     sendSttorry(signal);
2974     break;
2975   case CntrWaitRep::ZWAITPOINT_5_1:
2976     jam();
2977     g_eventLogger->info("Master node %u has reached completion of NDB start"
2978                         " phase 5",
2979                         signal->theData[0]);
2980     waitpoint51Lab(signal);
2981     break;
2982   case CntrWaitRep::ZWAITPOINT_5_2:
2983     jam();
2984     g_eventLogger->info("Node %u has reached completion of NDB start"
2985                         " phase 4",
2986                         signal->theData[0]);
2987     waitpoint52Lab(signal);
2988     break;
2989   case CntrWaitRep::ZWAITPOINT_6_1:
2990     jam();
2991     waitpoint61Lab(signal);
2992     break;
2993   case CntrWaitRep::ZWAITPOINT_6_2:
2994     jam();
2995     sendSttorry(signal);
2996     break;
2997   case CntrWaitRep::ZWAITPOINT_7_1:
2998     jam();
2999     waitpoint71Lab(signal);
3000     break;
3001   case CntrWaitRep::ZWAITPOINT_7_2:
3002     jam();
3003     sendSttorry(signal);
3004     break;
3005   case CntrWaitRep::ZWAITPOINT_4_2_TO:
3006     jam();
3007     waitpoint42To(signal);
3008     break;
3009   case RNIL:
3010     ndbrequire(signal->getLength() >= CntrWaitRep::SignalLength);
3011     wait_sp_rep(signal);
3012     return;
3013   default:
3014     jam();
3015     systemErrorLab(signal, __LINE__);
3016     break;
3017   }//switch
3018 }//Ndbcntr::execCNTR_WAITREP()
3019 
3020 /*******************************/
3021 /*  NODE_FAILREP               */
3022 /*******************************/
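/**
 * Handle failure of one or more data nodes:
 *  - if the master failed, adopt the new master chosen by QMGR (the report's
 *    masterNodeId),
 *  - clear the failed nodes from all start/cluster bitmasks,
 *  - if this node is itself still starting, an unhandled failure of the
 *    master or of another participating node aborts the restart,
 *  - otherwise forward NODE_FAILREP to the protocol blocks and, if a stop
 *    request is pending, re-evaluate it against the reduced node set.
 */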
3023 void Ndbcntr::execNODE_FAILREP(Signal* signal)
3024 {
3025   jamEntry();
3026 
3027   if (ERROR_INSERTED(1001))
3028   {
3029     sendSignalWithDelay(reference(), GSN_NODE_FAILREP, signal, 100,
3030                         signal->getLength());
3031     return;
3032   }
3033 
3034   const NodeFailRep * nodeFail = (NodeFailRep *)&signal->theData[0];
3035   NdbNodeBitmask allFailed;
3036   allFailed.assign(NdbNodeBitmask::Size, nodeFail->theNodes);
3037 
3038   NdbNodeBitmask failedStarted = c_startedNodes;
3039   NdbNodeBitmask failedStarting = c_start.m_starting;
3040   NdbNodeBitmask failedWaiting = c_start.m_waiting;
3041 
3042   failedStarted.bitAND(allFailed);
3043   failedStarting.bitAND(allFailed);
3044   failedWaiting.bitAND(allFailed);
3045 
3046   const bool tMasterFailed = allFailed.get(cmasterNodeId);
3047   const bool tStarted = !failedStarted.isclear();
3048   const bool tStarting = !failedStarting.isclear();
3049 
3050   if (tMasterFailed)
3051   {
3052     jam();
3053     /**
3054      * If master has failed choose qmgr president as master
3055      */
3056     cmasterNodeId = nodeFail->masterNodeId;
3057   }
3058 
3059   /**
3060    * Clear node bitmasks from failed nodes
3061    */
3062   c_start.m_starting.bitANDC(allFailed);
3063   c_start.m_waiting.bitANDC(allFailed);
3064   c_start.m_withLog.bitANDC(allFailed);
3065   c_start.m_withoutLog.bitANDC(allFailed);
3066   c_start.m_waitTO.bitANDC(allFailed);
3067   c_clusterNodes.bitANDC(allFailed);
3068   c_startedNodes.bitANDC(allFailed);
3069 
3070   const NodeState & st = getNodeState();
3071   if (st.startLevel == st.SL_STARTING)
3072   {
3073     jam();
3074 
3075     const Uint32 phase = st.starting.startPhase;
3076 
3077     const bool tStartConf = (phase > 2) || (phase == 2 && cndbBlocksCount > 0);
3078 
3079     if (tMasterFailed)
3080     {
3081       progError(__LINE__, NDBD_EXIT_SR_OTHERNODEFAILED,
3082 		"Unhandled node failure during restart");
3083     }
3084 
3085     if (tStartConf && tStarting)
3086     {
3087       // One of other starting nodes has crashed...
3088       progError(__LINE__, NDBD_EXIT_SR_OTHERNODEFAILED,
3089 		"Unhandled node failure of starting node during restart");
3090     }
3091 
3092     if (tStartConf && tStarted)
3093     {
3094       // One of other started nodes has crashed...
3095       progError(__LINE__, NDBD_EXIT_SR_OTHERNODEFAILED,
3096 		"Unhandled node failure of started node during restart");
3097     }
3098 
3099     Uint32 nodeId = 0;
3100     while(!allFailed.isclear()){
3101       nodeId = allFailed.find(nodeId + 1);
3102       allFailed.clear(nodeId);
3103       signal->theData[0] = nodeId;
3104       sendSignal(QMGR_REF, GSN_NDB_FAILCONF, signal, 1, JBB);
3105     }//for
3106 
3107     return;
3108   }
3109 
3110   ndbrequire(!allFailed.get(getOwnNodeId()));
3111 
3112   NodeFailRep * rep = (NodeFailRep *)&signal->theData[0];
3113   rep->masterNodeId = cmasterNodeId;
3114 
3115   sendSignal(DBTC_REF, GSN_NODE_FAILREP, signal,
3116              NodeFailRep::SignalLength, JBB);
3117 
3118   sendSignal(DBLQH_REF, GSN_NODE_FAILREP, signal,
3119              NodeFailRep::SignalLength, JBB);
3120 
3121   sendSignal(DBDIH_REF, GSN_NODE_FAILREP, signal,
3122              NodeFailRep::SignalLength, JBB);
3123 
3124   sendSignal(DBDICT_REF, GSN_NODE_FAILREP, signal,
3125              NodeFailRep::SignalLength, JBB);
3126 
3127   sendSignal(BACKUP_REF, GSN_NODE_FAILREP, signal,
3128              NodeFailRep::SignalLength, JBB);
3129 
3130   sendSignal(SUMA_REF, GSN_NODE_FAILREP, signal,
3131              NodeFailRep::SignalLength, JBB);
3132 
3133   sendSignal(QMGR_REF, GSN_NODE_FAILREP, signal,
3134              NodeFailRep::SignalLength, JBB);
3135 
3136   sendSignal(DBUTIL_REF, GSN_NODE_FAILREP, signal,
3137              NodeFailRep::SignalLength, JBB);
3138 
3139   sendSignal(DBTUP_REF, GSN_NODE_FAILREP, signal,
3140              NodeFailRep::SignalLength, JBB);
3141 
3142   sendSignal(TSMAN_REF, GSN_NODE_FAILREP, signal,
3143              NodeFailRep::SignalLength, JBB);
3144 
3145   sendSignal(LGMAN_REF, GSN_NODE_FAILREP, signal,
3146              NodeFailRep::SignalLength, JBB);
3147 
3148   sendSignal(DBSPJ_REF, GSN_NODE_FAILREP, signal,
3149              NodeFailRep::SignalLength, JBB);
3150 
3151   if (c_stopRec.stopReq.senderRef)
3152   {
3153     jam();
3154     switch(c_stopRec.m_state){
3155     case StopRecord::SR_WAIT_NODE_FAILURES:
3156     {
3157       jam();
3158       NdbNodeBitmask tmp;
3159       tmp.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
3160       tmp.bitANDC(allFailed);
3161       tmp.copyto(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
3162 
3163       if (tmp.isclear())
3164       {
3165 	jam();
3166 	if (c_stopRec.stopReq.senderRef != RNIL)
3167 	{
3168 	  jam();
3169 	  StopConf * const stopConf = (StopConf *)&signal->theData[0];
3170 	  stopConf->senderData = c_stopRec.stopReq.senderData;
3171 	  stopConf->nodeState  = (Uint32) NodeState::SL_SINGLEUSER;
3172 	  sendSignal(c_stopRec.stopReq.senderRef, GSN_STOP_CONF, signal,
3173 		     StopConf::SignalLength, JBB);
3174 	}
3175 
3176 	c_stopRec.stopReq.senderRef = 0;
3177 	WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
3178 	req->senderRef = reference();
3179 	req->senderData = StopRecord::SR_UNBLOCK_GCP_START_GCP;
3180 	req->requestType = WaitGCPReq::UnblockStartGcp;
3181 	sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
3182 		   WaitGCPReq::SignalLength, JBA);
3183       }
3184       break;
3185     }
3186     case StopRecord::SR_QMGR_STOP_REQ:
3187     {
3188       NdbNodeBitmask tmp;
3189       tmp.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
3190       tmp.bitANDC(allFailed);
3191 
3192       if (tmp.isclear())
3193       {
3194 	Uint32 nodeId = allFailed.find(0);
3195 	tmp.set(nodeId);
3196 
3197 	StopConf* conf = (StopConf*)signal->getDataPtrSend();
3198 	conf->senderData = c_stopRec.stopReq.senderData;
3199 	conf->nodeId = nodeId;
3200 	sendSignal(reference(),
3201 		   GSN_STOP_CONF, signal, StopConf::SignalLength, JBB);
3202       }
3203 
3204       tmp.copyto(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
3205 
3206       break;
3207     }
3208     case StopRecord::SR_BLOCK_GCP_START_GCP:
3209     case StopRecord::SR_WAIT_COMPLETE_GCP:
3210     case StopRecord::SR_UNBLOCK_GCP_START_GCP:
3211     case StopRecord::SR_CLUSTER_SHUTDOWN:
3212       break;
3213     }
3214   }
3215 
3216   signal->theData[0] = NDB_LE_NODE_FAILREP;
3217   signal->theData[2] = 0;
3218 
3219   Uint32 nodeId = 0;
3220   while(!allFailed.isclear()){
3221     nodeId = allFailed.find(nodeId + 1);
3222     allFailed.clear(nodeId);
3223     signal->theData[1] = nodeId;
3224     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
3225   }//for
3226 
3227   return;
3228 }//Ndbcntr::execNODE_FAILREP()
3229 
3230 /*******************************/
3231 /*  READ_NODESREQ              */
3232 /*******************************/
3233 void Ndbcntr::execREAD_NODESREQ(Signal* signal)
3234 {
3235   jamEntry();
3236 
3237   /*----------------------------------------------------------------------*/
3238   // ANY BLOCK MAY SEND A REQUEST ABOUT NDB NODES AND VERSIONS IN THE
3239   // SYSTEM. THIS REQUEST CAN ONLY BE HANDLED IN
3240   // ABSOLUTE STARTPHASE 3 OR LATER
3241   /*----------------------------------------------------------------------*/
3242   BlockReference TuserBlockref = signal->theData[0];
3243   ReadNodesConf * const readNodes = (ReadNodesConf *)&signal->theData[0];
3244 
3245   /**
3246    * Prepare inactiveNodes bitmask.
3247    * The concept as such is, by the way, pretty useless.
3248    * It makes parallel starts more or less impossible...
3249    */
3250   NdbNodeBitmask tmp1;
3251   tmp1.bitOR(c_startedNodes);
3252   if(!getNodeState().getNodeRestartInProgress()){
3253     tmp1.bitOR(c_start.m_starting);
3254   } else {
3255     tmp1.set(getOwnNodeId());
3256   }
3257 
3258   NdbNodeBitmask tmp2;
3259   tmp2.bitOR(c_allDefinedNodes);
3260   tmp2.bitANDC(tmp1);
3261   /**
3262    * Fill in return signal
3263    */
3264   tmp2.copyto(NdbNodeBitmask::Size, readNodes->inactiveNodes);
3265   c_allDefinedNodes.copyto(NdbNodeBitmask::Size, readNodes->allNodes);
3266   c_clusterNodes.copyto(NdbNodeBitmask::Size, readNodes->clusterNodes);
3267   c_startedNodes.copyto(NdbNodeBitmask::Size, readNodes->startedNodes);
3268   c_start.m_starting.copyto(NdbNodeBitmask::Size, readNodes->startingNodes);
3269 
3270   readNodes->noOfNodes = c_allDefinedNodes.count();
3271   readNodes->masterNodeId = cmasterNodeId;
3272   readNodes->ndynamicId = cdynamicNodeId;
3273   if (m_cntr_start_conf)
3274   {
3275     jam();
3276     sendSignal(TuserBlockref, GSN_READ_NODESCONF, signal,
3277 	       ReadNodesConf::SignalLength, JBB);
3278 
3279   } else {
3280     jam();
3281     signal->theData[0] = ZNOT_AVAILABLE;
3282     sendSignal(TuserBlockref, GSN_READ_NODESREF, signal, 1, JBB);
3283   }//if
3284 }//Ndbcntr::execREAD_NODESREQ()
3285 
3286 /*----------------------------------------------------------------------*/
3287 // SENDS APPL_ERROR TO QMGR AND THEN SET A POINTER OUT OF BOUNDS
3288 /*----------------------------------------------------------------------*/
3289 void Ndbcntr::systemErrorLab(Signal* signal, int line)
3290 {
3291   progError(line, NDBD_EXIT_NDBREQUIRE); /* BUG INSERTION */
3292   return;
3293 }//Ndbcntr::systemErrorLab()
3294 
3295 /*###########################################################################*/
3296 /* CNTR MASTER CREATES AND INITIALIZES A SYSTEMTABLE AT INITIALSTART         */
3297 /*       |-2048| # 1 00000001    |                                           */
3298 /*       |  :  |   :             |                                           */
3299 /*       | -1  | # 1 00000001    |                                           */
3300 /*       |  1  |   0             | tupleid sequence now created on first use */
3301 /*       |  :  |   :             |                   v                       */
3302 /*       | 2048|   0             |                   v                       */
3303 /*---------------------------------------------------------------------------*/
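/**
 * Initial-start schema setup is driven as one DICT schema transaction:
 * SCHEMA_TRANS_BEGIN -> create default hash map -> create the system tables
 * (createSystableLab) -> create disk data objects (createDDObjects) ->
 * SCHEMA_TRANS_END, after which the SYSTAB_0 rows are inserted via DBTC
 * (startInsertTransactions).
 */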
3304 void Ndbcntr::beginSchemaTransLab(Signal* signal)
3305 {
3306   c_schemaTransId = reference();
3307 
3308   SchemaTransBeginReq* req =
3309     (SchemaTransBeginReq*)signal->getDataPtrSend();
3310   req->clientRef = reference();
3311   req->transId = c_schemaTransId;
3312   req->requestInfo = 0;
3313   sendSignal(DBDICT_REF, GSN_SCHEMA_TRANS_BEGIN_REQ, signal,
3314       SchemaTransBeginReq::SignalLength, JBB);
3315 }
3316 
3317 void Ndbcntr::execSCHEMA_TRANS_BEGIN_CONF(Signal* signal)
3318 {
3319   const SchemaTransBeginConf* conf =
3320     (SchemaTransBeginConf*)signal->getDataPtr();
3321   ndbrequire(conf->transId == c_schemaTransId);
3322   c_schemaTransKey = conf->transKey;
3323 
3324   createHashMap(signal, 0);
3325 }
3326 
3327 void Ndbcntr::execSCHEMA_TRANS_BEGIN_REF(Signal* signal)
3328 {
3329   ndbrequire(false);
3330 }
3331 
3332 void
3333 Ndbcntr::createHashMap(Signal* signal, Uint32 idx)
3334 {
3335   CreateHashMapReq* const req = (CreateHashMapReq*)signal->getDataPtrSend();
3336   req->clientRef = reference();
3337   req->clientData = idx;
3338   req->requestInfo = CreateHashMapReq::CreateDefault;
3339   req->transId = c_schemaTransId;
3340   req->transKey = c_schemaTransKey;
3341   req->buckets = 0;
3342   req->fragments = 0;
3343   sendSignal(DBDICT_REF, GSN_CREATE_HASH_MAP_REQ, signal,
3344 	     CreateHashMapReq::SignalLength, JBB);
3345 }
3346 
3347 void
3348 Ndbcntr::execCREATE_HASH_MAP_REF(Signal* signal)
3349 {
3350   jamEntry();
3351 
3352   ndbrequire(false);
3353 }
3354 
3355 void
3356 Ndbcntr::execCREATE_HASH_MAP_CONF(Signal* signal)
3357 {
3358   jamEntry();
3359   CreateHashMapConf* conf = (CreateHashMapConf*)signal->getDataPtrSend();
3360 
3361   if (conf->senderData == 0)
3362   {
3363     jam();
3364     c_objectId = conf->objectId;
3365     c_objectVersion = conf->objectVersion;
3366   }
3367 
3368   createSystableLab(signal, 0);
3369 }
3370 
3371 void Ndbcntr::endSchemaTransLab(Signal* signal)
3372 {
3373   SchemaTransEndReq* req =
3374     (SchemaTransEndReq*)signal->getDataPtrSend();
3375   req->clientRef = reference();
3376   req->transId = c_schemaTransId;
3377   req->requestInfo = 0;
3378   req->transKey = c_schemaTransKey;
3379   req->flags = 0;
3380   sendSignal(DBDICT_REF, GSN_SCHEMA_TRANS_END_REQ, signal,
3381       SchemaTransEndReq::SignalLength, JBB);
3382 }
3383 
3384 void Ndbcntr::execSCHEMA_TRANS_END_CONF(Signal* signal)
3385 {
3386   c_schemaTransId = 0;
3387   c_schemaTransKey = RNIL;
3388   startInsertTransactions(signal);
3389 }
3390 
3391 void Ndbcntr::execSCHEMA_TRANS_END_REF(Signal* signal)
3392 {
3393   jamEntry();
3394   SchemaTransEndRef * ref = (SchemaTransEndRef*)signal->getDataPtr();
3395   char buf[256];
3396   BaseString::snprintf(buf, sizeof(buf),
3397                        "Failed to commit schema trans, err: %u",
3398                        ref->errorCode);
3399   progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
3400   ndbrequire(false);
3401 }
3402 
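/**
 * Create the disk data objects (logfile group, tablespace, undo files and
 * data files) listed in f_dd[], one entry per invocation.  Each entry is
 * packed with SimpleProperties and sent to DBDICT as CREATE_FILEGROUP_REQ or
 * CREATE_FILE_REQ; the CONF handlers call back here with index + 1.  When
 * the list is exhausted the schema transaction is committed via
 * endSchemaTransLab().
 */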
3403 void
3404 Ndbcntr::createDDObjects(Signal * signal, unsigned index)
3405 {
3406   const ndb_mgm_configuration_iterator * p =
3407     m_ctx.m_config.getOwnConfigIterator();
3408   ndbrequire(p != 0);
3409 
3410   Uint32 propPage[256];
3411   LinearWriter w(propPage, 256);
3412 
3413   const ddentry* entry = &f_dd[index];
3414 
3415   switch(entry->type){
3416   case DictTabInfo::LogfileGroup:
3417   case DictTabInfo::Tablespace:
3418   {
3419     jam();
3420 
3421     DictFilegroupInfo::Filegroup fg; fg.init();
3422     BaseString::snprintf(fg.FilegroupName, sizeof(fg.FilegroupName),
3423                          "%s", entry->name);
3424     fg.FilegroupType = entry->type;
3425     if (entry->type == DictTabInfo::LogfileGroup)
3426     {
3427       jam();
3428       fg.LF_UndoBufferSize = Uint32(entry->size);
3429     }
3430     else
3431     {
3432       jam();
3433       fg.TS_ExtentSize = Uint32(entry->size);
3434       fg.TS_LogfileGroupId = c_objectId;
3435       fg.TS_LogfileGroupVersion = c_objectVersion;
3436     }
3437 
3438     SimpleProperties::UnpackStatus s;
3439     s = SimpleProperties::pack(w,
3440                                &fg,
3441                                DictFilegroupInfo::Mapping,
3442                                DictFilegroupInfo::MappingSize, true);
3443 
3444 
3445     Uint32 length = w.getWordsUsed();
3446     LinearSectionPtr ptr[3];
3447     ptr[0].p = &propPage[0];
3448     ptr[0].sz = length;
3449 
3450     CreateFilegroupReq * req = (CreateFilegroupReq*)signal->getDataPtrSend();
3451     req->senderRef = reference();
3452     req->senderData = index;
3453     req->objType = entry->type;
3454     req->transId = c_schemaTransId;
3455     req->transKey = c_schemaTransKey;
3456     req->requestInfo = 0;
3457     sendSignal(DBDICT_REF, GSN_CREATE_FILEGROUP_REQ, signal,
3458                CreateFilegroupReq::SignalLength, JBB, ptr, 1);
3459     return;
3460   }
3461   case DictTabInfo::Undofile:
3462   case DictTabInfo::Datafile:
3463   {
3464     jam();
3465     Uint32 propPage[256];
3466     LinearWriter w(propPage, 256);
3467     DictFilegroupInfo::File f; f.init();
3468     BaseString::snprintf(f.FileName, sizeof(f.FileName), "%s", entry->name);
3469     f.FileType = entry->type;
3470     f.FilegroupId = c_objectId;
3471     f.FilegroupVersion = c_objectVersion;
3472     f.FileSizeHi = Uint32(entry->size >> 32);
3473     f.FileSizeLo = Uint32(entry->size);
3474 
3475     SimpleProperties::UnpackStatus s;
3476     s = SimpleProperties::pack(w,
3477                                &f,
3478                                DictFilegroupInfo::FileMapping,
3479                                DictFilegroupInfo::FileMappingSize, true);
3480 
3481     Uint32 length = w.getWordsUsed();
3482     LinearSectionPtr ptr[3];
3483     ptr[0].p = &propPage[0];
3484     ptr[0].sz = length;
3485 
3486     CreateFileReq * req = (CreateFileReq*)signal->getDataPtrSend();
3487     req->senderRef = reference();
3488     req->senderData = index;
3489     req->objType = entry->type;
3490     req->transId = c_schemaTransId;
3491     req->transKey = c_schemaTransKey;
3492     req->requestInfo = CreateFileReq::ForceCreateFile;
3493     sendSignal(DBDICT_REF, GSN_CREATE_FILE_REQ, signal,
3494                CreateFileReq::SignalLength, JBB, ptr, 1);
3495     return;
3496   }
3497   default:
3498     break;
3499   }
3500 
3501   endSchemaTransLab(signal);
3502 }
3503 
3504 void
3505 Ndbcntr::execCREATE_FILEGROUP_REF(Signal* signal)
3506 {
3507   jamEntry();
3508   CreateFilegroupRef* ref = (CreateFilegroupRef*)signal->getDataPtr();
3509   char buf[1024];
3510 
3511   const ddentry* entry = &f_dd[ref->senderData];
3512 
3513   if (entry->type == DictTabInfo::LogfileGroup)
3514   {
3515     BaseString::snprintf(buf, sizeof(buf), "create logfilegroup err %u",
3516                          ref->errorCode);
3517   }
3518   else if (entry->type == DictTabInfo::Tablespace)
3519   {
3520     BaseString::snprintf(buf, sizeof(buf), "create tablespace err %u",
3521                          ref->errorCode);
3522   }
3523   progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
3524 }
3525 
3526 void
3527 Ndbcntr::execCREATE_FILEGROUP_CONF(Signal* signal)
3528 {
3529   jamEntry();
3530   CreateFilegroupConf* conf = (CreateFilegroupConf*)signal->getDataPtr();
3531   c_objectId = conf->filegroupId;
3532   c_objectVersion = conf->filegroupVersion;
3533   createDDObjects(signal, conf->senderData + 1);
3534 }
3535 
3536 void
3537 Ndbcntr::execCREATE_FILE_REF(Signal* signal)
3538 {
3539   jamEntry();
3540   CreateFileRef* ref = (CreateFileRef*)signal->getDataPtr();
3541   char buf[1024];
3542 
3543   const ddentry* entry = &f_dd[ref->senderData];
3544 
3545   if (entry->type == DictTabInfo::Undofile)
3546   {
3547     BaseString::snprintf(buf, sizeof(buf), "create undofile %s err %u",
3548                          entry->name,
3549                          ref->errorCode);
3550   }
3551   else if (entry->type == DictTabInfo::Datafile)
3552   {
3553     BaseString::snprintf(buf, sizeof(buf), "create datafile %s err %u",
3554                          entry->name,
3555                          ref->errorCode);
3556   }
3557   progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
3558 }
3559 
3560 void
3561 Ndbcntr::execCREATE_FILE_CONF(Signal* signal)
3562 {
3563   jamEntry();
3564   CreateFileConf* conf = (CreateFileConf*)signal->getDataPtr();
3565   createDDObjects(signal, conf->senderData + 1);
3566 }
3567 
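/**
 * Create the next system table from g_sysTableList.  The table and column
 * definitions are packed as DictTabInfo properties (using the hash map
 * created earlier) and sent to DBDICT; execCREATE_TABLE_CONF continues with
 * the next index, and after the last table createDDObjects(0) takes over.
 */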
3568 void Ndbcntr::createSystableLab(Signal* signal, unsigned index)
3569 {
3570   if (index >= g_sysTableCount) {
3571     ndbassert(index == g_sysTableCount);
3572     createDDObjects(signal, 0);
3573     return;
3574   }
3575   const SysTable& table = *g_sysTableList[index];
3576   Uint32 propPage[256];
3577   LinearWriter w(propPage, 256);
3578 
3579   // XXX remove commented-out lines later
3580 
3581   w.first();
3582   w.add(DictTabInfo::TableName, table.name);
3583   w.add(DictTabInfo::TableLoggedFlag, table.tableLoggedFlag);
3584   //w.add(DictTabInfo::TableKValue, 6);
3585   //w.add(DictTabInfo::MinLoadFactor, 70);
3586   //w.add(DictTabInfo::MaxLoadFactor, 80);
3587   w.add(DictTabInfo::FragmentTypeVal, (Uint32)table.fragmentType);
3588   //w.add(DictTabInfo::NoOfKeyAttr, 1);
3589   w.add(DictTabInfo::NoOfAttributes, (Uint32)table.columnCount);
3590   //w.add(DictTabInfo::NoOfNullable, (Uint32)0);
3591   //w.add(DictTabInfo::NoOfVariable, (Uint32)0);
3592   //w.add(DictTabInfo::KeyLength, 1);
3593   w.add(DictTabInfo::TableTypeVal, (Uint32)table.tableType);
3594   w.add(DictTabInfo::SingleUserMode, (Uint32)NDB_SUM_READ_WRITE);
3595   w.add(DictTabInfo::HashMapObjectId, c_objectId);
3596   w.add(DictTabInfo::HashMapVersion, c_objectVersion);
3597 
3598   for (unsigned i = 0; i < table.columnCount; i++) {
3599     const SysColumn& column = table.columnList[i];
3600     ndbassert(column.pos == i);
3601     w.add(DictTabInfo::AttributeName, column.name);
3602     w.add(DictTabInfo::AttributeId, (Uint32)i);
3603     w.add(DictTabInfo::AttributeKeyFlag, (Uint32)column.keyFlag);
3604     w.add(DictTabInfo::AttributeStorageType,
3605 	  (Uint32)NDB_STORAGETYPE_MEMORY);
3606     switch(column.type){
3607     case DictTabInfo::ExtVarbinary:
3608       jam();
3609       w.add(DictTabInfo::AttributeArrayType,
3610             (Uint32)NDB_ARRAYTYPE_SHORT_VAR);
3611       break;
3612     case DictTabInfo::ExtLongvarbinary:
3613       jam();
3614       w.add(DictTabInfo::AttributeArrayType,
3615             (Uint32)NDB_ARRAYTYPE_MEDIUM_VAR);
3616       break;
3617     default:
3618       jam();
3619       w.add(DictTabInfo::AttributeArrayType,
3620             (Uint32)NDB_ARRAYTYPE_FIXED);
3621       break;
3622     }
3623     w.add(DictTabInfo::AttributeNullableFlag, (Uint32)column.nullable);
3624     w.add(DictTabInfo::AttributeExtType, (Uint32)column.type);
3625     w.add(DictTabInfo::AttributeExtLength, (Uint32)column.length);
3626     w.add(DictTabInfo::AttributeEnd, (Uint32)true);
3627   }
3628   w.add(DictTabInfo::TableEnd, (Uint32)true);
3629 
3630   Uint32 length = w.getWordsUsed();
3631   LinearSectionPtr ptr[3];
3632   ptr[0].p = &propPage[0];
3633   ptr[0].sz = length;
3634 
3635   CreateTableReq* const req = (CreateTableReq*)signal->getDataPtrSend();
3636   req->clientRef = reference();
3637   req->clientData = index;
3638   req->requestInfo = 0;
3639   req->transId = c_schemaTransId;
3640   req->transKey = c_schemaTransKey;
3641   sendSignal(DBDICT_REF, GSN_CREATE_TABLE_REQ, signal,
3642 	     CreateTableReq::SignalLength, JBB, ptr, 1);
3643   return;
3644 }//Ndbcntr::createSystableLab()
3645 
3646 void Ndbcntr::execCREATE_TABLE_REF(Signal* signal)
3647 {
3648   jamEntry();
3649   progError(__LINE__,NDBD_EXIT_NDBREQUIRE, "CREATE_TABLE_REF");
3650   return;
3651 }//Ndbcntr::execCREATE_TABLE_REF()
3652 
3653 void Ndbcntr::execCREATE_TABLE_CONF(Signal* signal)
3654 {
3655   jamEntry();
3656   const CreateTableConf* conf = (const CreateTableConf*)signal->getDataPtr();
3657   //csystabId = conf->tableId;
3658   ndbrequire(conf->transId == c_schemaTransId);
3659   ndbrequire(conf->senderData < g_sysTableCount);
3660   const SysTable& table = *g_sysTableList[conf->senderData];
3661   table.tableId = conf->tableId;
3662   table.tableVersion = conf->tableVersion;
3663   createSystableLab(signal, conf->senderData + 1);
3664   //startInsertTransactions(signal);
3665   return;
3666 }//Ndbcntr::execCREATE_TABLE_CONF()
3667 
3668 /*******************************/
3669 /*  DICTRELEASECONF            */
3670 /*******************************/
3671 void Ndbcntr::startInsertTransactions(Signal* signal)
3672 {
3673   jamEntry();
3674 
3675   ckey = 1;
3676   ctransidPhase = ZTRUE;
3677   signal->theData[0] = 0;
3678   signal->theData[1] = reference();
3679   sendSignal(DBTC_REF, GSN_TCSEIZEREQ, signal, 2, JBB);
3680   return;
3681 }//Ndbcntr::startInsertTransactions()
3682 
3683 /*******************************/
3684 /*  TCSEIZECONF                */
3685 /*******************************/
3686 void Ndbcntr::execTCSEIZECONF(Signal* signal)
3687 {
3688   jamEntry();
3689   ctcConnectionP = signal->theData[1];
3690   ctcReference = signal->theData[2];
3691   crSystab7Lab(signal);
3692   return;
3693 }//Ndbcntr::execTCSEIZECONF()
3694 
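/**
 * Populate SYSTAB_0: each call to crSystab7Lab() sends RowsPerCommit
 * TCKEYREQs as one transaction (start flag on the first, commit + execute
 * flags on the last), with one key word and five ATTRINFO words per row.
 * During the first pass (ctransidPhase == ZTRUE) the keys are negated; the
 * second pass is skipped since tuple-id sequences are now created on first
 * use (see crSystab8Lab).
 */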
3695 const unsigned int RowsPerCommit = 16;
3696 void Ndbcntr::crSystab7Lab(Signal* signal)
3697 {
3698   UintR tkey;
3699   UintR Tmp;
3700 
3701   TcKeyReq * const tcKeyReq = (TcKeyReq *)&signal->theData[0];
3702 
3703   UintR reqInfo_Start = 0;
3704   tcKeyReq->setOperationType(reqInfo_Start, ZINSERT); // Insert
3705   tcKeyReq->setKeyLength    (reqInfo_Start, 1);
3706   tcKeyReq->setAIInTcKeyReq (reqInfo_Start, 5);
3707   tcKeyReq->setAbortOption  (reqInfo_Start, TcKeyReq::AbortOnError);
3708 
3709 /* KEY LENGTH = 1, ATTRINFO LENGTH IN TCKEYREQ = 5 */
3710   cresponses = 0;
3711   const UintR guard0 = ckey + (RowsPerCommit - 1);
3712   for (Tmp = ckey; Tmp <= guard0; Tmp++) {
3713     UintR reqInfo = reqInfo_Start;
3714     if (Tmp == ckey) { // First iteration, Set start flag
3715       jam();
3716       tcKeyReq->setStartFlag(reqInfo, 1);
3717     } //if
3718     if (Tmp == guard0) { // Last iteration, Set commit flag
3719       jam();
3720       tcKeyReq->setCommitFlag(reqInfo, 1);
3721       tcKeyReq->setExecuteFlag(reqInfo, 1);
3722     } //if
3723     if (ctransidPhase == ZTRUE) {
3724       jam();
3725       tkey = 0;
3726       tkey = tkey - Tmp;
3727     } else {
3728       jam();
3729       tkey = Tmp;
3730     }//if
3731 
3732     tcKeyReq->apiConnectPtr      = ctcConnectionP;
3733     tcKeyReq->attrLen            = 5;
3734     tcKeyReq->tableId            = g_sysTable_SYSTAB_0.tableId;
3735     tcKeyReq->requestInfo        = reqInfo;
3736     tcKeyReq->tableSchemaVersion = g_sysTable_SYSTAB_0.tableVersion;
3737     tcKeyReq->transId1           = 0;
3738     tcKeyReq->transId2           = ckey;
3739 
3740 //-------------------------------------------------------------
3741 // There is no optional part in this TCKEYREQ. There is one
3742 // key word and five ATTRINFO words.
3743 //-------------------------------------------------------------
3744     Uint32* tKeyDataPtr          = &tcKeyReq->scanInfo;
3745     Uint32* tAIDataPtr           = &tKeyDataPtr[1];
3746 
3747     tKeyDataPtr[0]               = tkey;
3748 
3749     AttributeHeader::init(&tAIDataPtr[0], 0, 1 << 2);
3750     tAIDataPtr[1]                = tkey;
3751     AttributeHeader::init(&tAIDataPtr[2], 1, 2 << 2);
3752     tAIDataPtr[3]                = (tkey << 16);
3753     tAIDataPtr[4]                = 1;
3754     sendSignal(ctcReference, GSN_TCKEYREQ, signal,
3755 	       TcKeyReq::StaticLength + 6, JBB);
3756   }//for
3757   ckey = ckey + RowsPerCommit;
3758   return;
3759 }//Ndbcntr::crSystab7Lab()
3760 
3761 /*******************************/
3762 /*  TCKEYCONF09                */
3763 /*******************************/
3764 void Ndbcntr::execTCKEYCONF(Signal* signal)
3765 {
3766   const TcKeyConf * const keyConf = (TcKeyConf *)&signal->theData[0];
3767 
3768   jamEntry();
3769   cgciSystab = keyConf->gci_hi;
3770   UintR confInfo = keyConf->confInfo;
3771 
3772   if (TcKeyConf::getMarkerFlag(confInfo)){
3773     Uint32 transId1 = keyConf->transId1;
3774     Uint32 transId2 = keyConf->transId2;
3775     signal->theData[0] = transId1;
3776     signal->theData[1] = transId2;
3777     sendSignal(ctcReference, GSN_TC_COMMIT_ACK, signal, 2, JBB);
3778   }//if
3779 
3780   cresponses = cresponses + TcKeyConf::getNoOfOperations(confInfo);
3781   if (TcKeyConf::getCommitFlag(confInfo)){
3782     jam();
3783     ndbrequire(cresponses == RowsPerCommit);
3784 
3785     crSystab8Lab(signal);
3786     return;
3787   }
3788   return;
3789 }//Ndbcntr::execTCKEYCONF()
3790 
3791 void Ndbcntr::crSystab8Lab(Signal* signal)
3792 {
3793   if (ckey < ZSIZE_SYSTAB) {
3794     jam();
3795     crSystab7Lab(signal);
3796     return;
3797   } else if (ctransidPhase == ZTRUE) {
3798     jam();
3799     ckey = 1;
3800     ctransidPhase = ZFALSE;
3801     // skip 2nd loop - tupleid sequence now created on first use
3802   }//if
3803   signal->theData[0] = ctcConnectionP;
3804   signal->theData[1] = reference();
3805   signal->theData[2] = 0;
3806   sendSignal(ctcReference, GSN_TCRELEASEREQ, signal, 2, JBB);
3807   return;
3808 }//Ndbcntr::crSystab8Lab()
3809 
3810 /*******************************/
3811 /*  TCRELEASECONF              */
3812 /*******************************/
3813 void Ndbcntr::execTCRELEASECONF(Signal* signal)
3814 {
3815   jamEntry();
3816   g_eventLogger->info("Creation of System Tables Completed");
3817   waitpoint52Lab(signal);
3818   return;
3819 }//Ndbcntr::execTCRELEASECONF()
3820 
3821 void Ndbcntr::crSystab9Lab(Signal* signal)
3822 {
3823   signal->theData[0] = 0; // user ptr
3824   signal->theData[1] = reference();
3825   signal->theData[2] = 0;
3826   sendSignalWithDelay(DBDIH_REF, GSN_GETGCIREQ, signal, 100, 3);
3827   return;
3828 }//Ndbcntr::crSystab9Lab()
3829 
3830 /*******************************/
3831 /*  GETGCICONF                 */
3832 /*******************************/
3833 void Ndbcntr::execGETGCICONF(Signal* signal)
3834 {
3835   jamEntry();
3836 
3837 #ifndef NO_GCP
3838   if (signal->theData[1] < cgciSystab) {
3839     jam();
3840 /*--------------------------------------*/
3841 /* MAKE SURE THAT THE SYSTABLE IS       */
3842 /* NOW SAFE ON DISK                     */
3843 /*--------------------------------------*/
3844     crSystab9Lab(signal);
3845     return;
3846   }//if
3847 #endif
3848   waitpoint52Lab(signal);
3849   return;
3850 }//Ndbcntr::execGETGCICONF()
3851 
3852 void Ndbcntr::execTCKEYREF(Signal* signal)
3853 {
3854   jamEntry();
3855   systemErrorLab(signal, __LINE__);
3856   return;
3857 }//Ndbcntr::execTCKEYREF()
3858 
3859 void Ndbcntr::execTCROLLBACKREP(Signal* signal)
3860 {
3861   jamEntry();
3862   systemErrorLab(signal, __LINE__);
3863   return;
3864 }//Ndbcntr::execTCROLLBACKREP()
3865 
3866 void Ndbcntr::execTCRELEASEREF(Signal* signal)
3867 {
3868   jamEntry();
3869   systemErrorLab(signal, __LINE__);
3870   return;
3871 }//Ndbcntr::execTCRELEASEREF()
3872 
3873 void Ndbcntr::execTCSEIZEREF(Signal* signal)
3874 {
3875   jamEntry();
3876   systemErrorLab(signal, __LINE__);
3877   return;
3878 }//Ndbcntr::execTCSEIZEREF()
3879 
3880 
3881 /*---------------------------------------------------------------------------*/
3882 /*INITIALIZE VARIABLES AND RECORDS                                           */
3883 /*---------------------------------------------------------------------------*/
3884 void Ndbcntr::initData(Signal* signal)
3885 {
3886   c_start.reset();
3887   cmasterNodeId = 0;
3888   cnoStartNodes = 0;
3889   cnoWaitrep = 0;
3890 }//Ndbcntr::initData()
3891 
3892 
3893 /*---------------------------------------------------------------------------*/
3894 /*RESET VARIABLES USED DURING THE START                                      */
3895 /*---------------------------------------------------------------------------*/
3896 void Ndbcntr::resetStartVariables(Signal* signal)
3897 {
3898   cnoStartNodes = 0;
3899   cnoWaitrep6 = cnoWaitrep7 = 0;
3900 }//Ndbcntr::resetStartVariables()
3901 
3902 
3903 /*---------------------------------------------------------------------------*/
3904 // SEND THE SIGNAL
3905 // INPUT                  CNDB_BLOCKS_COUNT
3906 /*---------------------------------------------------------------------------*/
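// Sends NDB_STTOR for the current internal start phase to the next block in
// ndbBlocksRec (indexed by cndbBlocksCount, which is then incremented).
// DIH is given cdihStartType as start type instead of ctypeOfStart.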
3907 void Ndbcntr::sendNdbSttor(Signal* signal)
3908 {
3909   NdbBlocksRecPtr ndbBlocksPtr;
3910 
3911   ndbBlocksPtr.i = cndbBlocksCount;
3912   ptrCheckGuard(ndbBlocksPtr, ZSIZE_NDB_BLOCKS_REC, ndbBlocksRec);
3913 
3914   NdbSttor * const req = (NdbSttor*)signal->getDataPtrSend();
3915   req->senderRef = reference();
3916   req->nodeId = getOwnNodeId();
3917   req->internalStartPhase = cinternalStartphase;
3918   req->typeOfStart = ctypeOfStart;
3919   req->masterNodeId = cmasterNodeId;
3920 
3921   for (int i = 0; i < 16; i++) {
3922     // Garbage
3923     req->config[i] = 0x88776655;
3924   }
3925 
3926   //#define MAX_STARTPHASE 2
3927 #ifdef TRACE_STTOR
3928   ndbout_c("sending NDB_STTOR(%d) to %s",
3929 	   cinternalStartphase,
3930 	   getBlockName( refToBlock(ndbBlocksPtr.p->blockref)));
3931 #endif
3932   if (refToBlock(ndbBlocksPtr.p->blockref) == DBDIH)
3933     req->typeOfStart = cdihStartType;
3934   sendSignal(ndbBlocksPtr.p->blockref, GSN_NDB_STTOR, signal, 22, JBB);
3935   cndbBlocksCount++;
3936 }//Ndbcntr::sendNdbSttor()
3937 
3938 /*---------------------------------------------------------------------------*/
3939 // JUST SEND THE SIGNAL
3940 /*---------------------------------------------------------------------------*/
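// Reports STTORRY back to NDBCNTR itself with the list of start phases in
// which this block wants to be invoked again; phase 7 is intentionally
// skipped, see the comment below.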
3941 void Ndbcntr::sendSttorry(Signal* signal, Uint32 delayed)
3942 {
3943   signal->theData[3] = ZSTART_PHASE_1;
3944   signal->theData[4] = ZSTART_PHASE_2;
3945   signal->theData[5] = ZSTART_PHASE_3;
3946   signal->theData[6] = ZSTART_PHASE_4;
3947   signal->theData[7] = ZSTART_PHASE_5;
3948   signal->theData[8] = ZSTART_PHASE_6;
3949   // skip simulated phase 7
3950   signal->theData[9] = ZSTART_PHASE_8;
3951   signal->theData[10] = ZSTART_PHASE_9;
3952   signal->theData[11] = ZSTART_PHASE_END;
3953   if (delayed == 0)
3954   {
3955     sendSignal(NDBCNTR_REF, GSN_STTORRY, signal, 12, JBB);
3956     return;
3957   }
3958   sendSignalWithDelay(NDBCNTR_REF, GSN_STTORRY, signal, delayed, 12);
3959 }//Ndbcntr::sendSttorry()
3960 
3961 void
3962 Ndbcntr::execDUMP_STATE_ORD(Signal* signal)
3963 {
3964   DumpStateOrd * const & dumpState = (DumpStateOrd *)&signal->theData[0];
3965   Uint32 arg = dumpState->args[0];
3966 
3967   if(arg == 13){
3968     infoEvent("Cntr: cstartPhase = %d, cinternalStartphase = %d, block = %d",
3969 	      cstartPhase,
3970               cinternalStartphase,
3971               cndbBlocksCount);
3972     infoEvent("Cntr: cmasterNodeId = %d", cmasterNodeId);
3973   }
3974 
3975   if (arg == DumpStateOrd::NdbcntrTestStopOnError){
3976     if (m_ctx.m_config.stopOnError() == true)
3977       ((Configuration&)m_ctx.m_config).stopOnError(false);
3978 
3979     const BlockReference tblockref = calcNdbCntrBlockRef(getOwnNodeId());
3980 
3981     SystemError * const sysErr = (SystemError*)&signal->theData[0];
3982     sysErr->errorCode = SystemError::TestStopOnError;
3983     sysErr->errorRef = reference();
3984     sendSignal(tblockref, GSN_SYSTEM_ERROR, signal,
3985 	       SystemError::SignalLength, JBA);
3986   }
3987 
3988   if (arg == DumpStateOrd::NdbcntrStopNodes)
3989   {
3990     NdbNodeBitmask mask;
3991     for(Uint32 i = 1; i<signal->getLength(); i++)
3992       mask.set(signal->theData[i]);
3993 
3994     StopReq* req = (StopReq*)signal->getDataPtrSend();
3995     req->senderRef = RNIL;
3996     req->senderData = 123;
3997     req->requestInfo = 0;
3998     req->singleuser = 0;
3999     req->singleUserApi = 0;
4000     mask.copyto(NdbNodeBitmask::Size, req->nodes);
4001     StopReq::setPerformRestart(req->requestInfo, 1);
4002     StopReq::setNoStart(req->requestInfo, 1);
4003     StopReq::setStopNodes(req->requestInfo, 1);
4004     StopReq::setStopAbort(req->requestInfo, 1);
4005 
4006     sendSignal(reference(), GSN_STOP_REQ, signal,
4007 	       StopReq::SignalLength, JBB);
4008     return;
4009   }
4010 
4011   if (arg == 71)
4012   {
4013 #ifdef ERROR_INSERT
4014     if (signal->getLength() == 2)
4015     {
4016       c_error_insert_extra = signal->theData[1];
4017       SET_ERROR_INSERT_VALUE(1002);
4018     }
4019     else if (ERROR_INSERTED(1002))
4020     {
4021       CLEAR_ERROR_INSERT_VALUE;
4022     }
4023 #endif
4024   }
4025 
4026 }//Ndbcntr::execDUMP_STATE_ORD()
4027 
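// Broadcasts the new node state (with master node id and node group filled
// in) as NODE_STATE_REP to every block in ALL_BLOCKS.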
4028 void Ndbcntr::updateNodeState(Signal* signal, const NodeState& newState) const{
4029   NodeStateRep * const stateRep = (NodeStateRep *)&signal->theData[0];
4030 
4031   if (newState.startLevel == NodeState::SL_STARTED)
4032   {
4033     CRASH_INSERTION(1000);
4034   }
4035 
4036   stateRep->nodeState = newState;
4037   stateRep->nodeState.masterNodeId = cmasterNodeId;
4038   stateRep->nodeState.setNodeGroup(c_nodeGroup);
4039 
4040   for(Uint32 i = 0; i<ALL_BLOCKS_SZ; i++){
4041     sendSignal(ALL_BLOCKS[i].Ref, GSN_NODE_STATE_REP, signal,
4042 	       NodeStateRep::SignalLength, JBB);
4043   }
4044 }
4045 
4046 void
4047 Ndbcntr::execRESUME_REQ(Signal* signal){
4048   //ResumeReq * const req = (ResumeReq *)&signal->theData[0];
4049   //ResumeRef * const ref = (ResumeRef *)&signal->theData[0];
4050 
4051   jamEntry();
4052 
4053   signal->theData[0] = NDB_LE_SingleUser;
4054   signal->theData[1] = 2;
4055   sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
4056 
4057   //Uint32 senderData = req->senderData;
4058   //BlockReference senderRef = req->senderRef;
4059   NodeState newState(NodeState::SL_STARTED);
4060   updateNodeState(signal, newState);
4061   c_stopRec.stopReq.senderRef=0;
4062   send_node_started_rep(signal);
4063 }
4064 
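/**
 * STOP_REQ handling, in the order of the checks below:
 *  - a node that is not yet started, or any abortive stop that is not a
 *    StopNodes request, is stopped or restarted immediately via CMVMI
 *    (unless entering single user mode),
 *  - if a stop is already in progress, or this master is still starting
 *    other nodes, the request is rejected or, for a system stop, retried,
 *  - StopNodes requests must be abortive and must arrive at the master,
 *  - otherwise the stop record is armed and the node walks through the
 *    SL_STOPPING_* levels driven by the ZSHUTDOWN CONTINUEB timer.
 */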
4065 void
4066 Ndbcntr::execSTOP_REQ(Signal* signal){
4067   StopReq * const req = (StopReq *)&signal->theData[0];
4068   StopRef * const ref = (StopRef *)&signal->theData[0];
4069   Uint32 singleuser  = req->singleuser;
4070   jamEntry();
4071   Uint32 senderData = req->senderData;
4072   BlockReference senderRef = req->senderRef;
4073   bool abort = StopReq::getStopAbort(req->requestInfo);
4074   bool stopnodes = StopReq::getStopNodes(req->requestInfo);
4075 
4076   if(!singleuser &&
4077      (getNodeState().startLevel < NodeState::SL_STARTED ||
4078       (abort && !stopnodes)))
4079   {
4080     /**
4081      * Node is not started yet
4082      *
4083      * So stop it quickly
4084      */
4085     jam();
4086     const Uint32 reqInfo = req->requestInfo;
4087     if(StopReq::getPerformRestart(reqInfo)){
4088       jam();
4089       StartOrd * startOrd = (StartOrd *)&signal->theData[0];
4090       startOrd->restartInfo = reqInfo;
4091       sendSignal(CMVMI_REF, GSN_START_ORD, signal, 1, JBA);
4092     } else {
4093       jam();
4094       sendSignal(CMVMI_REF, GSN_STOP_ORD, signal, 1, JBA);
4095     }
4096     return;
4097   }
4098 
4099   if(c_stopRec.stopReq.senderRef != 0 ||
4100      (cmasterNodeId == getOwnNodeId() && !c_start.m_starting.isclear()))
4101   {
4102     /**
4103      * Requested a system shutdown
4104      */
4105     if(!singleuser && StopReq::getSystemStop(req->requestInfo)){
4106       jam();
4107       sendSignalWithDelay(reference(), GSN_STOP_REQ, signal, 100,
4108 			  StopReq::SignalLength);
4109       return;
4110     }
4111 
4112     /**
4113      * Requested a node shutdown
4114      */
4115     if(c_stopRec.stopReq.senderRef &&
4116        StopReq::getSystemStop(c_stopRec.stopReq.requestInfo))
4117       ref->errorCode = StopRef::SystemShutdownInProgress;
4118     else
4119       ref->errorCode = StopRef::NodeShutdownInProgress;
4120     ref->senderData = senderData;
4121     ref->masterNodeId = cmasterNodeId;
4122 
4123     if (senderRef != RNIL)
4124       sendSignal(senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
4125     return;
4126   }
4127 
4128   if (stopnodes && !abort)
4129   {
4130     jam();
4131     ref->errorCode = StopRef::UnsupportedNodeShutdown;
4132     ref->senderData = senderData;
4133     ref->masterNodeId = cmasterNodeId;
4134     if (senderRef != RNIL)
4135       sendSignal(senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
4136     return;
4137   }
4138 
4139   if (stopnodes && cmasterNodeId != getOwnNodeId())
4140   {
4141     jam();
4142     ref->errorCode = StopRef::MultiNodeShutdownNotMaster;
4143     ref->senderData = senderData;
4144     ref->masterNodeId = cmasterNodeId;
4145     if (senderRef != RNIL)
4146       sendSignal(senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
4147     return;
4148   }
4149 
4150   c_stopRec.stopReq = * req;
4151   c_stopRec.stopInitiatedTime = NdbTick_getCurrentTicks();
4152 
4153   if (stopnodes)
4154   {
4155     jam();
4156 
4157     if(!c_stopRec.checkNodeFail(signal))
4158     {
4159       jam();
4160       return;
4161     }
4162 
4163     char buf[100];
4164     NdbNodeBitmask mask;
4165     mask.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
4166     infoEvent("Initiating shutdown abort of %s", mask.getText(buf));
4167     ndbout_c("Initiating shutdown abort of %s", mask.getText(buf));
4168 
4169     WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
4170     req->senderRef = reference();
4171     req->senderData = StopRecord::SR_BLOCK_GCP_START_GCP;
4172     req->requestType = WaitGCPReq::BlockStartGcp;
4173     sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
4174 	       WaitGCPReq::SignalLength, JBB);
4175     return;
4176   }
4177   else if(!singleuser)
4178   {
4179     if(StopReq::getSystemStop(c_stopRec.stopReq.requestInfo))
4180     {
4181       jam();
4182       if(StopReq::getPerformRestart(c_stopRec.stopReq.requestInfo))
4183       {
4184 	((Configuration&)m_ctx.m_config).stopOnError(false);
4185       }
4186     }
4187     if(!c_stopRec.checkNodeFail(signal))
4188     {
4189       jam();
4190       return;
4191     }
4192     signal->theData[0] = NDB_LE_NDBStopStarted;
4193     signal->theData[1] = StopReq::getSystemStop(c_stopRec.stopReq.requestInfo) ? 1 : 0;
4194     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
4195   }
4196   else
4197   {
4198     signal->theData[0] = NDB_LE_SingleUser;
4199     signal->theData[1] = 0;
4200     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
4201   }
4202 
4203   NodeState newState(NodeState::SL_STOPPING_1,
4204 		     StopReq::getSystemStop(c_stopRec.stopReq.requestInfo));
4205 
4206    if(singleuser) {
4207      newState.setSingleUser(true);
4208      newState.setSingleUserApi(c_stopRec.stopReq.singleUserApi);
4209    }
4210   updateNodeState(signal, newState);
4211   signal->theData[0] = ZSHUTDOWN;
4212   sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 1);
4213 }
4214 
4215 void
4216 Ndbcntr::StopRecord::checkTimeout(Signal* signal){
4217   jamEntry();
4218 
4219   if(!cntr.getNodeState().getSingleUserMode())
4220     if(!checkNodeFail(signal)){
4221       jam();
4222       return;
4223     }
4224 
4225   switch(cntr.getNodeState().startLevel){
4226   case NodeState::SL_STOPPING_1:
4227     checkApiTimeout(signal);
4228     break;
4229   case NodeState::SL_STOPPING_2:
4230     checkTcTimeout(signal);
4231     break;
4232   case NodeState::SL_STOPPING_3:
4233     checkLqhTimeout_1(signal);
4234     break;
4235   case NodeState::SL_STOPPING_4:
4236     checkLqhTimeout_2(signal);
4237     break;
4238   case NodeState::SL_SINGLEUSER:
4239     break;
4240   default:
4241     ndbrequire(false);
4242   }
4243 }
4244 
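/**
 * checkNodeFail() - decide whether the requested stop is safe.  The set of
 * nodes that would remain is checked with a direct CHECKNODEGROUPSREQ
 * (ArbitCheck) to DBDIH; unless the survivors still cover the node groups
 * (Win or Partitioning), the stop is refused with
 * NodeShutdownWouldCauseSystemCrash and the node state is restored.  For a
 * StopNodes request that no longer names any started node, STOP_CONF is
 * sent immediately instead.
 */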
4245 bool
4246 Ndbcntr::StopRecord::checkNodeFail(Signal* signal){
4247   jam();
4248   if(StopReq::getSystemStop(stopReq.requestInfo)){
4249     jam();
4250     return true;
4251   }
4252 
4253   /**
4254    * Check whether the cluster can survive this node stopping
4255    */
4256   NdbNodeBitmask ndbMask;
4257   ndbMask.assign(cntr.c_startedNodes);
4258 
4259   if (StopReq::getStopNodes(stopReq.requestInfo))
4260   {
4261     NdbNodeBitmask tmp;
4262     tmp.assign(NdbNodeBitmask::Size, stopReq.nodes);
4263 
4264     NdbNodeBitmask ndbStopNodes;
4265     ndbStopNodes.assign(NdbNodeBitmask::Size, stopReq.nodes);
4266     ndbStopNodes.bitAND(ndbMask);
4267     ndbStopNodes.copyto(NdbNodeBitmask::Size, stopReq.nodes);
4268 
4269     ndbMask.bitANDC(tmp);
4270 
4271     bool allNodesStopped = true;
4272     int i ;
4273     for( i = 0; i < (int) NdbNodeBitmask::Size; i++ ){
4274       if ( stopReq.nodes[i] != 0 ){
4275         allNodesStopped = false;
4276         break;
4277       }
4278     }
4279 
4280     if ( allNodesStopped ) {
4281       StopConf * const stopConf = (StopConf *)&signal->theData[0];
4282       stopConf->senderData = stopReq.senderData;
4283       stopConf->nodeState  = (Uint32) NodeState::SL_NOTHING;
4284       cntr.sendSignal(stopReq.senderRef, GSN_STOP_CONF, signal,
4285                        StopConf::SignalLength, JBB);
4286       stopReq.senderRef = 0;
4287       return false;
4288     }
4289 
4290   }
4291   else
4292   {
4293     ndbMask.clear(cntr.getOwnNodeId());
4294   }
4295 
4296   CheckNodeGroups* sd = (CheckNodeGroups*)&signal->theData[0];
4297   sd->blockRef = cntr.reference();
4298   sd->requestType = CheckNodeGroups::Direct | CheckNodeGroups::ArbitCheck;
4299   sd->mask = ndbMask;
4300   cntr.EXECUTE_DIRECT(DBDIH, GSN_CHECKNODEGROUPSREQ, signal,
4301 		      CheckNodeGroups::SignalLength);
4302   jamEntry();
4303   switch (sd->output) {
4304   case CheckNodeGroups::Win:
4305   case CheckNodeGroups::Partitioning:
4306     return true;
4307     break;
4308   }
4309 
4310   StopRef * const ref = (StopRef *)&signal->theData[0];
4311 
4312   ref->senderData = stopReq.senderData;
4313   ref->errorCode = StopRef::NodeShutdownWouldCauseSystemCrash;
4314   ref->masterNodeId = cntr.cmasterNodeId;
4315 
4316   const BlockReference bref = stopReq.senderRef;
4317   if (bref != RNIL)
4318     cntr.sendSignal(bref, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
4319 
4320   stopReq.senderRef = 0;
4321 
4322   if (cntr.getNodeState().startLevel != NodeState::SL_SINGLEUSER)
4323   {
4324     NodeState newState(NodeState::SL_STARTED);
4325     cntr.updateNodeState(signal, newState);
4326     cntr.send_node_started_rep(signal);
4327   }
4328 
4329   signal->theData[0] = NDB_LE_NDBStopAborted;
4330   cntr.sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 1, JBB);
4331 
4332   return false;
4333 }
4334 
4335 void
4336 Ndbcntr::StopRecord::checkApiTimeout(Signal* signal){
4337   const Int32 timeout = stopReq.apiTimeout;
4338   const NDB_TICKS now = NdbTick_getCurrentTicks();
4339   if(timeout >= 0 &&
4340      NdbTick_Elapsed(stopInitiatedTime, now).milliSec() >= (Uint64)timeout){
4341     // || checkWithApiInSomeMagicWay)
4342     jam();
4343     NodeState newState(NodeState::SL_STOPPING_2,
4344 		       StopReq::getSystemStop(stopReq.requestInfo));
4345     if(stopReq.singleuser) {
4346       newState.setSingleUser(true);
4347       newState.setSingleUserApi(stopReq.singleUserApi);
4348     }
4349     cntr.updateNodeState(signal, newState);
4350 
4351     stopInitiatedTime = now;
4352   }
4353 
4354   signal->theData[0] = ZSHUTDOWN;
4355   cntr.sendSignalWithDelay(cntr.reference(), GSN_CONTINUEB, signal, 100, 1);
4356 }
4357 
4358 void
4359 Ndbcntr::StopRecord::checkTcTimeout(Signal* signal){
4360   const Int32 timeout = stopReq.transactionTimeout;
4361   const NDB_TICKS now = NdbTick_getCurrentTicks();
4362   if(timeout >= 0 &&
4363      NdbTick_Elapsed(stopInitiatedTime, now).milliSec() >= (Uint64)timeout){
4364     // || checkWithTcInSomeMagicWay)
4365     jam();
4366     if(StopReq::getSystemStop(stopReq.requestInfo) || stopReq.singleuser){
4367       jam();
4368       if(stopReq.singleuser)
4369       {
4370 	jam();
4371 	AbortAllReq * req = (AbortAllReq*)&signal->theData[0];
4372 	req->senderRef = cntr.reference();
4373 	req->senderData = 12;
4374 	cntr.sendSignal(DBTC_REF, GSN_ABORT_ALL_REQ, signal,
4375 			AbortAllReq::SignalLength, JBB);
4376       }
4377       else
4378       {
4379 	WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
4380 	req->senderRef = cntr.reference();
4381 	req->senderData = StopRecord::SR_CLUSTER_SHUTDOWN;
4382 	req->requestType = WaitGCPReq::CompleteForceStart;
4383 	cntr.sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
4384 			WaitGCPReq::SignalLength, JBB);
4385       }
4386     } else {
4387       jam();
4388       StopPermReq * req = (StopPermReq*)&signal->theData[0];
4389       req->senderRef = cntr.reference();
4390       req->senderData = 12;
4391       cntr.sendSignal(DBDIH_REF, GSN_STOP_PERM_REQ, signal,
4392 		      StopPermReq::SignalLength, JBB);
4393     }
4394     return;
4395   }
4396   signal->theData[0] = ZSHUTDOWN;
4397   cntr.sendSignalWithDelay(cntr.reference(), GSN_CONTINUEB, signal, 100, 1);
4398 }
4399 
4400 void Ndbcntr::execSTOP_PERM_REF(Signal* signal){
4401   //StopPermRef* const ref = (StopPermRef*)&signal->theData[0];
4402 
4403   jamEntry();
4404 
4405   signal->theData[0] = ZSHUTDOWN;
4406   sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 1);
4407 }
4408 
4409 void Ndbcntr::execSTOP_PERM_CONF(Signal* signal){
4410   jamEntry();
4411 
4412   AbortAllReq * req = (AbortAllReq*)&signal->theData[0];
4413   req->senderRef = reference();
4414   req->senderData = 12;
4415   sendSignal(DBTC_REF, GSN_ABORT_ALL_REQ, signal,
4416 	     AbortAllReq::SignalLength, JBB);
4417 }
4418 
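/**
 * All transactions have been aborted by TC. A single user request now
 * enters SL_SINGLEUSER and confirms the stop to the sender; a node or
 * system stop moves on to SL_STOPPING_3 and continues the shutdown loop.
 */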
4419 void Ndbcntr::execABORT_ALL_CONF(Signal* signal){
4420   jamEntry();
4421   if(c_stopRec.stopReq.singleuser) {
4422     jam();
4423 
4424     NodeState newState(NodeState::SL_SINGLEUSER);
4425     newState.setSingleUser(true);
4426     newState.setSingleUserApi(c_stopRec.stopReq.singleUserApi);
4427     updateNodeState(signal, newState);
4428     c_stopRec.stopInitiatedTime = NdbTick_getCurrentTicks();
4429 
4430     StopConf * const stopConf = (StopConf *)&signal->theData[0];
4431     stopConf->senderData = c_stopRec.stopReq.senderData;
4432     stopConf->nodeState  = (Uint32) NodeState::SL_SINGLEUSER;
4433     sendSignal(c_stopRec.stopReq.senderRef, GSN_STOP_CONF, signal, StopConf::SignalLength, JBB);
4434 
4435     c_stopRec.stopReq.senderRef = 0; // the command is done
4436 
4437     signal->theData[0] = NDB_LE_SingleUser;
4438     signal->theData[1] = 1;
4439     signal->theData[2] = c_stopRec.stopReq.singleUserApi;
4440     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
4441   }
4442   else
4443     {
4444       jam();
4445       NodeState newState(NodeState::SL_STOPPING_3,
4446 			 StopReq::getSystemStop(c_stopRec.stopReq.requestInfo));
4447       updateNodeState(signal, newState);
4448 
4449       c_stopRec.stopInitiatedTime = NdbTick_getCurrentTicks();
4450 
4451       signal->theData[0] = ZSHUTDOWN;
4452       sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 1);
4453     }
4454 }
4455 
4456 void Ndbcntr::execABORT_ALL_REF(Signal* signal){
4457   jamEntry();
4458 
4459   StopRef * const stopRef = (StopRef *)&signal->theData[0];
4460   stopRef->senderData = c_stopRec.stopReq.senderData;
4461   stopRef->errorCode = StopRef::TransactionAbortFailed;
4462   stopRef->masterNodeId = cmasterNodeId;
4463   sendSignal(c_stopRec.stopReq.senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
4464 }
4465 
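/**
 * Once stopReq.readOperationTimeout milliseconds have elapsed, ask LQH
 * (CHANGE_NODE_STATE_REQ) to move the node to SL_STOPPING_4. Until then
 * the check is re-armed via a delayed CONTINUEB(ZSHUTDOWN).
 */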
4466 void
4467 Ndbcntr::StopRecord::checkLqhTimeout_1(Signal* signal){
4468   const Int32 timeout = stopReq.readOperationTimeout;
4469   const NDB_TICKS now = NdbTick_getCurrentTicks();
4470 
4471   if(timeout >= 0 &&
4472      NdbTick_Elapsed(stopInitiatedTime, now).milliSec() >= (Uint64)timeout){
4473     // || checkWithLqhInSomeMagicWay)
4474     jam();
4475 
4476     ChangeNodeStateReq * req = (ChangeNodeStateReq*)&signal->theData[0];
4477 
4478     NodeState newState(NodeState::SL_STOPPING_4,
4479 		       StopReq::getSystemStop(stopReq.requestInfo));
4480     req->nodeState = newState;
4481     req->senderRef = cntr.reference();
4482     req->senderData = 12;
4483     cntr.sendSignal(DBLQH_REF, GSN_CHANGE_NODE_STATE_REQ, signal,
4484                     ChangeNodeStateReq::SignalLength, JBB);
4485     return;
4486   }
4487   signal->theData[0] = ZSHUTDOWN;
4488   cntr.sendSignalWithDelay(cntr.reference(), GSN_CONTINUEB, signal, 100, 1);
4489 }
4490 
4491 void
4492 Ndbcntr::execCHANGE_NODE_STATE_CONF(Signal* signal)
4493 {
4494   jamEntry();
4495 
4496   /**
4497    * stop replication stream
4498    */
4499   signal->theData[0] = reference();
4500   signal->theData[1] = 12;
4501   sendSignal(SUMA_REF, GSN_STOP_ME_REQ, signal, 2, JBB);
4502 }
4503 
4504 void Ndbcntr::execSTOP_ME_REF(Signal* signal){
4505   jamEntry();
4506   ndbrequire(false);
4507 }
4508 
4509 
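/**
 * STOP_ME_CONF arrives twice: first from SUMA (senderData == 12), after
 * which DIH is asked to remove the node from transactions (senderData == 13),
 * and then from DIH, after which the node enters SL_STOPPING_4 and the
 * shutdown loop continues.
 */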
4510 void Ndbcntr::execSTOP_ME_CONF(Signal* signal){
4511   jamEntry();
4512 
4513   const StopMeConf * conf = CAST_CONSTPTR(StopMeConf, signal->getDataPtr());
4514   if (conf->senderData == 12)
4515   {
4516     /**
4517      * Remove node from transactions
4518      */
4519     signal->theData[0] = reference();
4520     signal->theData[1] = 13;
4521     sendSignal(DBDIH_REF, GSN_STOP_ME_REQ, signal, 2, JBB);
4522     return;
4523   }
4524 
4525   NodeState newState(NodeState::SL_STOPPING_4,
4526 		     StopReq::getSystemStop(c_stopRec.stopReq.requestInfo));
4527   updateNodeState(signal, newState);
4528 
4529   c_stopRec.stopInitiatedTime = NdbTick_getCurrentTicks();
4530   signal->theData[0] = ZSHUTDOWN;
4531   sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 1);
4532 }
4533 
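/**
 * Once stopReq.operationTimeout milliseconds have elapsed, order CMVMI to
 * either restart the node (START_ORD) or stop it (STOP_ORD), depending on
 * the perform-restart flag of the stop request.
 */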
4534 void
4535 Ndbcntr::StopRecord::checkLqhTimeout_2(Signal* signal){
4536   const Int32 timeout = stopReq.operationTimeout;
4537   const NDB_TICKS now = NdbTick_getCurrentTicks();
4538 
4539   if(timeout >= 0 &&
4540      NdbTick_Elapsed(stopInitiatedTime, now).milliSec() >= (Uint64)timeout){
4541     // || checkWithLqhInSomeMagicWay)
4542     jam();
4543     if(StopReq::getPerformRestart(stopReq.requestInfo)){
4544       jam();
4545       StartOrd * startOrd = (StartOrd *)&signal->theData[0];
4546       startOrd->restartInfo = stopReq.requestInfo;
4547       cntr.sendSignal(CMVMI_REF, GSN_START_ORD, signal, 2, JBA);
4548     } else {
4549       jam();
4550       cntr.sendSignal(CMVMI_REF, GSN_STOP_ORD, signal, 1, JBA);
4551     }
4552     return;
4553   }
4554   signal->theData[0] = ZSHUTDOWN;
4555   cntr.sendSignalWithDelay(cntr.reference(), GSN_CONTINUEB, signal, 100, 1);
4556 }
4557 
4558 void Ndbcntr::execWAIT_GCP_REF(Signal* signal){
4559   jamEntry();
4560 
4561   //WaitGCPRef* const ref = (WaitGCPRef*)&signal->theData[0];
4562 
4563   WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
4564   req->senderRef = reference();
4565   req->senderData = StopRecord::SR_CLUSTER_SHUTDOWN;
4566   req->requestType = WaitGCPReq::CompleteForceStart;
4567   sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
4568 	     WaitGCPReq::SignalLength, JBB);
4569 }
4570 
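/**
 * WAIT_GCP_CONF drives two flows, selected by senderData:
 *  - multi-node shutdown: once GCP start is blocked the stopping nodes are
 *    re-checked, the running GCP is completed and STOP_REQ is sent to QMGR
 *    on each stopping node (SR_QMGR_STOP_REQ). If the re-check fails the
 *    GCP start block is released again (unblock).
 *  - cluster shutdown (SR_CLUSTER_SHUTDOWN): QMGR is informed of
 *    SL_STOPPING_3 so the arbitrator will not kill us, then CMVMI is
 *    ordered to restart or stop the node.
 */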
4571 void Ndbcntr::execWAIT_GCP_CONF(Signal* signal){
4572   jamEntry();
4573 
4574   WaitGCPConf* conf = (WaitGCPConf*)signal->getDataPtr();
4575 
4576   switch(conf->senderData){
4577   case StopRecord::SR_BLOCK_GCP_START_GCP:
4578   {
4579     jam();
4580     /**
4581      * Start of GCP is now blocked; re-check the nodes to stop before waiting for the running GCP to complete.
4582      */
4583     if(!c_stopRec.checkNodeFail(signal))
4584     {
4585       jam();
4586       goto unblock;
4587     }
4588 
4589     WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
4590     req->senderRef = reference();
4591     req->senderData = StopRecord::SR_WAIT_COMPLETE_GCP;
4592     req->requestType = WaitGCPReq::CompleteIfRunning;
4593 
4594     sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
4595 	       WaitGCPReq::SignalLength, JBB);
4596     return;
4597   }
4598   case StopRecord::SR_UNBLOCK_GCP_START_GCP:
4599   {
4600     jam();
4601     return;
4602   }
4603   case StopRecord::SR_WAIT_COMPLETE_GCP:
4604   {
4605     jam();
4606     if(!c_stopRec.checkNodeFail(signal))
4607     {
4608       jam();
4609       goto unblock;
4610     }
4611 
4612     NdbNodeBitmask tmp;
4613     tmp.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
4614     c_stopRec.m_stop_req_counter = tmp;
4615     NodeReceiverGroup rg(QMGR, tmp);
4616     StopReq * stopReq = (StopReq *)&signal->theData[0];
4617     * stopReq = c_stopRec.stopReq;
4618     stopReq->senderRef = reference();
4619     sendSignal(rg, GSN_STOP_REQ, signal, StopReq::SignalLength, JBA);
4620     c_stopRec.m_state = StopRecord::SR_QMGR_STOP_REQ;
4621     return;
4622   }
4623   case StopRecord::SR_CLUSTER_SHUTDOWN:
4624   {
4625     jam();
4626     break;
4627   }
4628   }
4629 
4630   {
4631     ndbrequire(StopReq::getSystemStop(c_stopRec.stopReq.requestInfo));
4632     NodeState newState(NodeState::SL_STOPPING_3, true);
4633 
4634     /**
4635      * Inform QMGR so that arbitrator won't kill us
4636      */
4637     NodeStateRep * rep = (NodeStateRep *)&signal->theData[0];
4638     rep->nodeState = newState;
4639     rep->nodeState.masterNodeId = cmasterNodeId;
4640     rep->nodeState.setNodeGroup(c_nodeGroup);
4641     EXECUTE_DIRECT(QMGR, GSN_NODE_STATE_REP, signal,
4642 		   NodeStateRep::SignalLength);
4643 
4644     if(StopReq::getPerformRestart(c_stopRec.stopReq.requestInfo)){
4645       jam();
4646       StartOrd * startOrd = (StartOrd *)&signal->theData[0];
4647       startOrd->restartInfo = c_stopRec.stopReq.requestInfo;
4648       sendSignalWithDelay(CMVMI_REF, GSN_START_ORD, signal, 500,
4649 			  StartOrd::SignalLength);
4650     } else {
4651       jam();
4652       sendSignalWithDelay(CMVMI_REF, GSN_STOP_ORD, signal, 500, 1);
4653     }
4654     return;
4655   }
4656 
4657 unblock:
4658   WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
4659   req->senderRef = reference();
4660   req->senderData = StopRecord::SR_UNBLOCK_GCP_START_GCP;
4661   req->requestType = WaitGCPReq::UnblockStartGcp;
4662   sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
4663 	     WaitGCPReq::SignalLength, JBB);
4664 }
4665 
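/**
 * Collect STOP_CONF from QMGR on each node of a multi-node shutdown. When
 * all have answered, report the stopping nodes and inject FAIL_REP
 * (ZMULTI_NODE_SHUTDOWN) for each of them, then wait for the node failures
 * to be handled (SR_WAIT_NODE_FAILURES).
 */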
4666 void
4667 Ndbcntr::execSTOP_CONF(Signal* signal)
4668 {
4669   jamEntry();
4670   StopConf *conf = (StopConf*)signal->getDataPtr();
4671   ndbrequire(c_stopRec.m_state == StopRecord::SR_QMGR_STOP_REQ);
4672   c_stopRec.m_stop_req_counter.clearWaitingFor(conf->nodeId);
4673   if (c_stopRec.m_stop_req_counter.done())
4674   {
4675     char buf[100];
4676     NdbNodeBitmask mask;
4677     mask.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
4678     infoEvent("Stopping of %s", mask.getText(buf));
4679     ndbout_c("Stopping of %s", mask.getText(buf));
4680 
4681     /**
4682      * Kill any node...
4683      */
4684     FailRep * const failRep = (FailRep *)&signal->theData[0];
4685     failRep->failCause = FailRep::ZMULTI_NODE_SHUTDOWN;
4686     failRep->failSourceNodeId = getOwnNodeId();
4687     NodeReceiverGroup rg(QMGR, c_clusterNodes);
4688     Uint32 nodeId = 0;
4689     while ((nodeId = NdbNodeBitmask::find(c_stopRec.stopReq.nodes, nodeId+1))
4690 	   != NdbNodeBitmask::NotFound)
4691     {
4692       failRep->failNodeId = nodeId;
4693       sendSignal(rg, GSN_FAIL_REP, signal, FailRep::SignalLength, JBA);
4694     }
4695     c_stopRec.m_state = StopRecord::SR_WAIT_NODE_FAILURES;
4696     return;
4697   }
4698 }
4699 
4700 void Ndbcntr::execSTTORRY(Signal* signal){
4701   jamEntry();
4702   c_missra.execSTTORRY(signal);
4703 }
4704 
4705 void Ndbcntr::execREAD_CONFIG_CONF(Signal* signal){
4706   jamEntry();
4707   c_missra.execREAD_CONFIG_CONF(signal);
4708 }
4709 
4710 void Ndbcntr::execSTART_ORD(Signal* signal){
4711   jamEntry();
4712   c_missra.execSTART_ORD(signal);
4713 }
4714 
4715 #define CLEAR_DX 13
4716 #define CLEAR_LCP 3
4717 #define CLEAR_DD 2
4718 // FileSystemPathDataFiles FileSystemPathUndoFiles
4719 
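/**
 * clearFilesystem() removes one directory per call, driven by
 * c_fsRemoveCount:
 *   0  .. 12   version 3 directories, one per disk        (CLEAR_DX)
 *   13 .. 15   version 5 LCP directories 0-2              (CLEAR_LCP)
 *   16 .. 17   version 6 disk data file/undo file paths   (CLEAR_DD)
 * execFSREMOVECONF() keeps calling it until all of them are removed.
 */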
4720 void
4721 Ndbcntr::clearFilesystem(Signal* signal)
4722 {
4723   jam();
4724   FsRemoveReq * req  = (FsRemoveReq *)signal->getDataPtrSend();
4725   req->userReference = reference();
4726   req->userPointer   = 0;
4727   req->directory     = 1;
4728   req->ownDirectory  = 1;
4729 
4730   const Uint32 DX = CLEAR_DX;
4731   const Uint32 LCP = CLEAR_DX + CLEAR_LCP;
4732   const Uint32 DD = CLEAR_DX + CLEAR_LCP + CLEAR_DD;
4733 
4734   if (c_fsRemoveCount < DX)
4735   {
4736     FsOpenReq::setVersion(req->fileNumber, 3);
4737     FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_CTL); // Can be any...
4738     FsOpenReq::v1_setDisk(req->fileNumber, c_fsRemoveCount);
4739   }
4740   else if (c_fsRemoveCount < LCP)
4741   {
4742     FsOpenReq::setVersion(req->fileNumber, 5);
4743     FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_DATA);
4744     FsOpenReq::v5_setLcpNo(req->fileNumber, c_fsRemoveCount - CLEAR_DX);
4745     FsOpenReq::v5_setTableId(req->fileNumber, 0);
4746     FsOpenReq::v5_setFragmentId(req->fileNumber, 0);
4747   }
4748   else if (c_fsRemoveCount < DD)
4749   {
4750     req->ownDirectory  = 0;
4751     FsOpenReq::setVersion(req->fileNumber, 6);
4752     FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_DATA);
4753     FsOpenReq::v5_setLcpNo(req->fileNumber,
4754                            FsOpenReq::BP_DD_DF + c_fsRemoveCount - LCP);
4755   }
4756   else
4757   {
4758     ndbrequire(false);
4759   }
4760 
4761   sendSignal(NDBFS_REF, GSN_FSREMOVEREQ, signal,
4762              FsRemoveReq::SignalLength, JBA);
4763   c_fsRemoveCount++;
4764 }
4765 
4766 void
4767 Ndbcntr::execFSREMOVECONF(Signal* signal){
4768   jamEntry();
4769   if(c_fsRemoveCount == CLEAR_DX + CLEAR_LCP + CLEAR_DD){
4770     jam();
4771     sendSttorry(signal);
4772   } else {
4773     jam();
4774     ndbrequire(c_fsRemoveCount < CLEAR_DX + CLEAR_LCP + CLEAR_DD);
4775     clearFilesystem(signal);
4776   }//if
4777 }
4778 
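/**
 * Missra sequences the node start on behalf of NDBCNTR: READ_CONFIG_REQ is
 * sent to every block in readConfigOrder, one block at a time, and then
 * STTOR is driven start phase by start phase via sendNextSTTOR().
 */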
4779 void Ndbcntr::Missra::execSTART_ORD(Signal* signal){
4780   signal->theData[0] = NDB_LE_NDBStartStarted;
4781   signal->theData[1] = NDB_VERSION;
4782   signal->theData[2] = NDB_MYSQL_VERSION_D;
4783   cntr.sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
4784 
4785   currentBlockIndex = 0;
4786   sendNextREAD_CONFIG_REQ(signal);
4787 }
4788 
4789 void Ndbcntr::Missra::sendNextREAD_CONFIG_REQ(Signal* signal){
4790 
4791   if(currentBlockIndex < ALL_BLOCKS_SZ){
4792     jam();
4793 
4794     ReadConfigReq * req = (ReadConfigReq*)signal->getDataPtrSend();
4795     req->senderData = 0;
4796     req->senderRef = cntr.reference();
4797     req->noOfParameters = 0;
4798 
4799     const BlockReference ref = readConfigOrder[currentBlockIndex];
4800 
4801     g_eventLogger->info("Sending READ_CONFIG_REQ to index = %d, name = %s",
4802                         currentBlockIndex,
4803                         getBlockName(refToBlock(ref)));
4804 
4805     /**
4806      * send delayed so that alloc gets "time-sliced"
4807      */
4808     cntr.sendSignalWithDelay(ref, GSN_READ_CONFIG_REQ, signal,
4809                              1, ReadConfigReq::SignalLength);
4810     return;
4811   }
4812 
4813   g_eventLogger->info("READ_CONFIG_REQ phase completed, this phase is"
4814                       " used to read configuration and to calculate"
4815                       " various sizes and allocate almost all memory"
4816                       " needed by the data node in its lifetime");
4817   /**
4818    * Finished...
4819    */
4820   currentStartPhase = 0;
4821   currentBlockIndex = 0;
4822   sendNextSTTOR(signal);
4823 }
4824 
4825 void Ndbcntr::Missra::execREAD_CONFIG_CONF(Signal* signal)
4826 {
4827   const ReadConfigConf * conf = (ReadConfigConf*)signal->getDataPtr();
4828 
4829   const Uint32 ref = conf->senderRef;
4830   ndbrequire(refToBlock(readConfigOrder[currentBlockIndex])
4831 	     == refToBlock(ref));
4832 
4833   currentBlockIndex++;
4834   sendNextREAD_CONFIG_REQ(signal);
4835 }
4836 
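/**
 * STTORRY carries the block's requested start phases from word 3 onwards;
 * record the first one above the current phase as the block's next start
 * phase and continue with the next block.
 */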
4837 void Ndbcntr::Missra::execSTTORRY(Signal* signal){
4838   const BlockReference ref = signal->senderBlockRef();
4839   ndbrequire(refToBlock(ref) == refToBlock(ALL_BLOCKS[currentBlockIndex].Ref));
4840 
4841   /**
4842    * Update next start phase
4843    */
4844   for (Uint32 i = 3; i < 25; i++){
4845     jam();
4846     if (signal->theData[i] > currentStartPhase){
4847       jam();
4848       ALL_BLOCKS[currentBlockIndex].NextSP = signal->theData[i];
4849       break;
4850     }
4851   }
4852 
4853   currentBlockIndex++;
4854   sendNextSTTOR(signal);
4855 }
4856 
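/**
 * Walk the start phases in order. For each phase, STTOR is sent to every
 * block whose next start phase matches; once all blocks have completed the
 * phase it is logged and reported (NDB_LE_StartPhaseCompleted), possibly
 * waiting for other starting nodes (wait_sp) before moving on. When all
 * phases are done the node is reported as started.
 */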
4857 void Ndbcntr::Missra::sendNextSTTOR(Signal* signal){
4858 
4859   for(; currentStartPhase < 255 ;
4860       currentStartPhase++, g_currentStartPhase = currentStartPhase){
4861     jam();
4862 
4863 #ifdef ERROR_INSERT
4864     if (cntr.cerrorInsert == 1002 &&
4865         cntr.c_error_insert_extra == currentStartPhase)
4866     {
4867       signal->theData[0] = ZBLOCK_STTOR;
4868       cntr.sendSignalWithDelay(cntr.reference(), GSN_CONTINUEB, signal, 100, 1);
4869       return;
4870     }
4871 #endif
4872 
4873     const Uint32 start = currentBlockIndex;
4874     for(; currentBlockIndex < ALL_BLOCKS_SZ; currentBlockIndex++){
4875       jam();
4876       if(ALL_BLOCKS[currentBlockIndex].NextSP == currentStartPhase){
4877 	jam();
4878 	signal->theData[0] = 0;
4879 	signal->theData[1] = currentStartPhase;
4880 	signal->theData[2] = 0;
4881 	signal->theData[3] = 0;
4882 	signal->theData[4] = 0;
4883 	signal->theData[5] = 0;
4884 	signal->theData[6] = 0;
4885 	signal->theData[7] = cntr.ctypeOfStart;
4886 
4887 	const BlockReference ref = ALL_BLOCKS[currentBlockIndex].Ref;
4888 
4889 #ifdef MAX_STARTPHASE
4890 	ndbrequire(currentStartPhase <= MAX_STARTPHASE);
4891 #endif
4892 
4893 #ifdef TRACE_STTOR
4894 	ndbout_c("sending STTOR(%d) to %s(ref=%x index=%d)",
4895 		 currentStartPhase,
4896 		 getBlockName( refToBlock(ref)),
4897 		 ref,
4898 		 currentBlockIndex);
4899 #endif
4900         if (refToBlock(ref) == DBDIH)
4901           signal->theData[7] = cntr.cdihStartType;
4902 
4903 	cntr.sendSignal(ref, GSN_STTOR, signal, 8, JBB);
4904 
4905 	return;
4906       }
4907     }
4908 
4909     currentBlockIndex = 0;
4910 
4911     NodeState newState(NodeState::SL_STARTING, currentStartPhase,
4912 		       (NodeState::StartType)cntr.ctypeOfStart);
4913     cntr.updateNodeState(signal, newState);
4914 
4915     if(start != 0)
4916     {
4917       /**
4918        * At least one wanted this start phase, record & report it
4919        */
4920       jam();
4921       g_eventLogger->info("Start phase %u completed", currentStartPhase);
4922       switch (currentStartPhase)
4923       {
4924         case 0:
4925           g_eventLogger->info("Phase 0 has made some file system"
4926                               " initialisations");
4927           break;
4928         case 1:
4929           g_eventLogger->info("Phase 1 initialised some variables and"
4930                               " included node in cluster, locked memory"
4931                               " if configured to do so");
4932           break;
4933         case 2:
4934           switch (cntr.ctypeOfStart)
4935           {
4936             case NodeState::ST_INITIAL_START:
4937             case NodeState::ST_INITIAL_NODE_RESTART:
4938               g_eventLogger->info("Phase 2 did more initialisations, master"
4939                                   " accepted our start, we initialised the"
4940                                   " REDO log");
4941               break;
4942             case NodeState::ST_SYSTEM_RESTART:
4943             case NodeState::ST_NODE_RESTART:
4944               g_eventLogger->info("Phase 2 did more initialisations, master"
4945                                   " accepted our start, we started REDO log"
4946                                   " initialisations");
4947               break;
4948             default:
4949               break;
4950           }
4951           break;
4952         case 3:
4953           switch (cntr.ctypeOfStart)
4954           {
4955             case NodeState::ST_INITIAL_START:
4956             case NodeState::ST_SYSTEM_RESTART:
4957               g_eventLogger->info("Phase 3 performed local connection setups");
4958               break;
4959             case NodeState::ST_INITIAL_NODE_RESTART:
4960             case NodeState::ST_NODE_RESTART:
4961               g_eventLogger->info("Phase 3 locked the data dictionary, "
4962                                   "performed local connection setups, we "
4963                                   "asked for permission to start our node");
4964               break;
4965             default:
4966               break;
4967           }
4968           break;
4969         case 4:
4970           switch (cntr.ctypeOfStart)
4971           {
4972             case NodeState::ST_SYSTEM_RESTART:
4973               g_eventLogger->info("Phase 4 restored all fragments from local"
4974                                   " disk up to a consistent global checkpoint"
4975                                   " id");
4976               break;
4977             case NodeState::ST_NODE_RESTART:
4978             case NodeState::ST_INITIAL_START:
4979             case NodeState::ST_INITIAL_NODE_RESTART:
4980               g_eventLogger->info("Phase 4 continued preparations of the REDO"
4981                                   " log");
4982               break;
4983             default:
4984               break;
4985           }
4986           break;
4987         case 5:
4988           switch (cntr.ctypeOfStart)
4989           {
4990             case NodeState::ST_INITIAL_NODE_RESTART:
4991             case NodeState::ST_NODE_RESTART:
4992               g_eventLogger->info("Phase 5 restored local fragments in its"
4993                                   " first NDB phase, then copied metadata to"
4994                                   " our node, and"
4995                                   " then actual data was copied over to our"
4996                                   " node, and finally we waited for a local"
4997                                   " checkpoint to complete");
4998               break;
4999             case NodeState::ST_INITIAL_START:
5000               g_eventLogger->info("Phase 5 Created the System Table");
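              // Fallthrough: the checkpoint message below also applies to an initial start.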
5001             case NodeState::ST_SYSTEM_RESTART:
5002               g_eventLogger->info("Phase 5 waited for local checkpoint to"
5003                                   " complete");
5004               break;
5005             default:
5006               break;
5007           }
5008           break;
5009         case 6:
5010           g_eventLogger->info("Phase 6 updated the blocks that we've now"
5011                               " reached the started state.");
5012           break;
5013         case 7:
5014           g_eventLogger->info("Phase 7 mainly activated the asynchronous"
5015                               " change events process, and some other"
5016                               " background processes");
5017           break;
5018         case 8:
5019           switch (cntr.ctypeOfStart)
5020           {
5021             case NodeState::ST_INITIAL_START:
5022             case NodeState::ST_SYSTEM_RESTART:
5023             {
5024               g_eventLogger->info("Phase 8 enabled foreign keys and waited for"
5025                                   " all nodes to complete start up to this point");
5026               break;
5027             }
5028             default:
5029               break;
5030           }
5031           break;
5032         case 9:
5033           g_eventLogger->info("Phase 9 enabled APIs to start connecting");
5034           break;
5035         case 101:
5036           g_eventLogger->info("Phase 101 was used by SUMA to take over"
5037                               " responsibility for sending some of the"
5038                               " asynchronous change events");
5039           break;
5040         default:
5041           break;
5042       }
5043 
5044       signal->theData[0] = NDB_LE_StartPhaseCompleted;
5045       signal->theData[1] = currentStartPhase;
5046       signal->theData[2] = cntr.ctypeOfStart;
5047       cntr.sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
5048 
5049       /**
5050        * Check if we should wait before proceeding with
5051        *   next startphase
5052        *
5053        * New code guarantees that before starting X
5054        *   that all other nodes (in system restart/initial start)
5055        *   want to start a startphase >= X
5056        */
5057       if (cntr.wait_sp(signal, currentStartPhase + 1))
5058       {
5059         jam();
5060         currentStartPhase++;
5061         g_currentStartPhase = currentStartPhase;
5062         return;
5063       }
5064     }
5065   }
5066 
5067   g_eventLogger->info("Node started");
5068 
5069   signal->theData[0] = NDB_LE_NDBStartCompleted;
5070   signal->theData[1] = NDB_VERSION;
5071   signal->theData[2] = NDB_MYSQL_VERSION_D;
5072   cntr.sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
5073 
5074   NodeState newState(NodeState::SL_STARTED);
5075   cntr.updateNodeState(signal, newState);
5076   cntr.send_node_started_rep(signal);
5077 
5078   NodeReceiverGroup rg(NDBCNTR, cntr.c_clusterNodes);
5079   signal->theData[0] = cntr.getOwnNodeId();
5080   cntr.sendSignal(rg, GSN_CNTR_START_REP, signal, 1, JBB);
5081 }
5082 
5083 void
5084 Ndbcntr::send_node_started_rep(Signal *signal)
5085 {
5086   signal->theData[0] = getOwnNodeId();
5087   sendSignal(QMGR_REF, GSN_NODE_STARTED_REP, signal, 1, JBB);
5088 }
5089 
5090 void
5091 Ndbcntr::execCREATE_NODEGROUP_IMPL_REQ(Signal* signal)
5092 {
5093   jamEntry();
5094 
5095   CreateNodegroupImplReq reqCopy = *(CreateNodegroupImplReq*)signal->getDataPtr();
5096   CreateNodegroupImplReq *req = &reqCopy;
5097 
5098   if (req->requestType == CreateNodegroupImplReq::RT_COMMIT)
5099   {
5100     jam();
5101     Uint32 save = c_nodeGroup;
5102     getNodeGroup(signal);
5103     if (save != c_nodeGroup)
5104     {
5105       jam();
5106       updateNodeState(signal, getNodeState());
5107     }
5108   }
5109 
5110   {
5111     CreateNodegroupImplConf* conf = (CreateNodegroupImplConf*)signal->getDataPtrSend();
5112     conf->senderRef = reference();
5113     conf->senderData = req->senderData;
5114     sendSignal(req->senderRef, GSN_CREATE_NODEGROUP_IMPL_CONF, signal,
5115                CreateNodegroupImplConf::SignalLength, JBB);
5116   }
5117 }
5118 
5119 void
5120 Ndbcntr::execDROP_NODEGROUP_IMPL_REQ(Signal* signal)
5121 {
5122   jamEntry();
5123 
5124   DropNodegroupImplReq reqCopy = *(DropNodegroupImplReq*)signal->getDataPtr();
5125   DropNodegroupImplReq *req = &reqCopy;
5126 
5127   if (req->requestType == DropNodegroupImplReq::RT_COMPLETE)
5128   {
5129     jam();
5130     Uint32 save = c_nodeGroup;
5131     getNodeGroup(signal);
5132 
5133     if (save != c_nodeGroup)
5134     {
5135       jam();
5136       updateNodeState(signal, getNodeState());
5137     }
5138   }
5139 
5140   {
5141     DropNodegroupImplConf* conf = (DropNodegroupImplConf*)signal->getDataPtrSend();
5142     conf->senderRef = reference();
5143     conf->senderData = req->senderData;
5144     sendSignal(req->senderRef, GSN_DROP_NODEGROUP_IMPL_CONF, signal,
5145                DropNodegroupImplConf::SignalLength, JBB);
5146   }
5147 }
5148 
5149 template class Vector<ddentry>;
5150