1 /*
2 Copyright (c) 2003, 2021, Oracle and/or its affiliates.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License, version 2.0,
6 as published by the Free Software Foundation.
7
8 This program is also distributed with certain software (including
9 but not limited to OpenSSL) that is licensed under separate terms,
10 as designated in a particular file or component or in included license
11 documentation. The authors of MySQL hereby grant you an additional
12 permission to link the program and your derivative works with the
13 separately licensed software that they have included with MySQL.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License, version 2.0, for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 #define NDBCNTR_C
26 #include "Ndbcntr.hpp"
27
28 #include <ndb_limits.h>
29 #include <ndb_version.h>
30 #include <SimpleProperties.hpp>
31 #include <signaldata/NodeRecoveryStatusRep.hpp>
32 #include <signaldata/DictTabInfo.hpp>
33 #include <signaldata/SchemaTrans.hpp>
34 #include <signaldata/CreateTable.hpp>
35 #include <signaldata/CreateHashMap.hpp>
36 #include <signaldata/ReadNodesConf.hpp>
37 #include <signaldata/NodeFailRep.hpp>
38 #include <signaldata/TcKeyReq.hpp>
39 #include <signaldata/TcKeyConf.hpp>
40 #include <signaldata/EventReport.hpp>
41 #include <signaldata/NodeStateSignalData.hpp>
42 #include <signaldata/StopPerm.hpp>
43 #include <signaldata/StopMe.hpp>
44 #include <signaldata/WaitGCP.hpp>
45 #include <signaldata/CheckNodeGroups.hpp>
46 #include <signaldata/StartOrd.hpp>
47 #include <signaldata/AbortAll.hpp>
48 #include <signaldata/SystemError.hpp>
49 #include <signaldata/NdbSttor.hpp>
50 #include <signaldata/CntrStart.hpp>
51 #include <signaldata/DumpStateOrd.hpp>
52
53 #include <signaldata/FsRemoveReq.hpp>
54 #include <signaldata/ReadConfig.hpp>
55
56 #include <signaldata/FailRep.hpp>
57
58 #include <AttributeHeader.hpp>
59 #include <Configuration.hpp>
60 #include <DebuggerNames.hpp>
61 #include <signaldata/DihRestart.hpp>
62
63 #include <NdbOut.hpp>
64 #include <NdbTick.h>
65
66 #include <signaldata/TakeOver.hpp>
67 #include <signaldata/CreateNodegroupImpl.hpp>
68 #include <signaldata/DropNodegroupImpl.hpp>
69 #include <signaldata/CreateFilegroup.hpp>
70
71 #include <EventLogger.hpp>
72
73 #define JAM_FILE_ID 458
74
75
76 extern EventLogger * g_eventLogger;
77
78 // used during shutdown for reporting current startphase
79 // accessed from Emulator.cpp, NdbShutdown()
80 Uint32 g_currentStartPhase = 0;
81
82 /**
83 * ALL_BLOCKS Used during start phases and while changing node state
84 *
85 * NDBFS_REF Has to be before NDBCNTR_REF (due to "ndb -i" stuff)
86 */
87 struct BlockInfo {
88 BlockReference Ref; // BlockReference
89 Uint32 NextSP; // Next start phase
90 Uint32 ErrorInsertStart;
91 Uint32 ErrorInsertStop;
92 };
93
94 static BlockInfo ALL_BLOCKS[] = {
95 { NDBFS_REF, 0 , 2000, 2999 },
96 { DBTC_REF, 1 , 8000, 8035 },
97 { DBDIH_REF, 1 , 7000, 7173 },
98 { DBLQH_REF, 1 , 5000, 5030 },
99 { DBACC_REF, 1 , 3000, 3999 },
100 { DBTUP_REF, 1 , 4000, 4007 },
101 { DBDICT_REF, 1 , 6000, 6003 },
102 { NDBCNTR_REF, 0 , 1000, 1999 },
103 { CMVMI_REF, 1 , 9000, 9999 }, // before QMGR
104 { QMGR_REF, 1 , 1, 999 },
105 { TRIX_REF, 1 , 0, 0 },
106 { BACKUP_REF, 1 , 10000, 10999 },
107 { DBUTIL_REF, 1 , 11000, 11999 },
108 { SUMA_REF, 1 , 13000, 13999 },
109 { DBTUX_REF, 1 , 12000, 12999 }
110 ,{ TSMAN_REF, 1 , 0, 0 }
111 ,{ LGMAN_REF, 1 , 0, 0 }
112 ,{ PGMAN_REF, 1 , 0, 0 }
113 ,{ RESTORE_REF,1 , 0, 0 }
114 ,{ DBINFO_REF, 1 , 0, 0 }
115 ,{ DBSPJ_REF, 1 , 0, 0 }
116 ,{ THRMAN_REF, 1 , 0, 0 }
117 };
118
119 static const Uint32 ALL_BLOCKS_SZ = sizeof(ALL_BLOCKS)/sizeof(BlockInfo);
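// Note: the order of ALL_BLOCKS above is also the order in which the blocks
// receive STTOR in every start phase (see the restart phase comment further
// down), while readConfigOrder below gives the different order used when
// sending READ_CONFIG_REQ.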
120
121 static BlockReference readConfigOrder[ALL_BLOCKS_SZ] = {
122 CMVMI_REF,
123 NDBFS_REF,
124 DBINFO_REF,
125 DBTUP_REF,
126 DBACC_REF,
127 DBTC_REF,
128 DBLQH_REF,
129 DBTUX_REF,
130 DBDICT_REF,
131 DBDIH_REF,
132 NDBCNTR_REF,
133 QMGR_REF,
134 TRIX_REF,
135 BACKUP_REF,
136 DBUTIL_REF,
137 SUMA_REF,
138 TSMAN_REF,
139 LGMAN_REF,
140 PGMAN_REF,
141 RESTORE_REF,
142 DBSPJ_REF,
143 THRMAN_REF
144 };
145
146 /*******************************/
147 /* CONTINUEB */
148 /*******************************/
149 void Ndbcntr::execCONTINUEB(Signal* signal)
150 {
151 jamEntry();
152 UintR Ttemp1 = signal->theData[0];
153 switch (Ttemp1) {
154 case ZSTARTUP:{
155 if(getNodeState().startLevel == NodeState::SL_STARTED){
156 jam();
157 return;
158 }
159
160 if(cmasterNodeId == getOwnNodeId() && c_start.m_starting.isclear()){
161 jam();
162 trySystemRestart(signal);
163 // Fall-through
164 }
165
166 const Uint64 elapsed = NdbTick_Elapsed(
167 c_start.m_startTime,
168 NdbTick_getCurrentTicks()).milliSec();
169
170 if (elapsed > c_start.m_startFailureTimeout)
171 {
172 jam();
173 Uint32 to_3= 0;
174 const ndb_mgm_configuration_iterator * p =
175 m_ctx.m_config.getOwnConfigIterator();
176 ndb_mgm_get_int_parameter(p, CFG_DB_START_FAILURE_TIMEOUT, &to_3);
177 BaseString tmp;
178 tmp.append("Shutting down node as total restart time exceeds "
179 " StartFailureTimeout as set in config file ");
180 if(to_3 == 0)
181 tmp.append(" 0 (infinite)");
182 else
183 tmp.appfmt(" %d", to_3);
184
185 progError(__LINE__, NDBD_EXIT_RESTART_TIMEOUT, tmp.c_str());
186 }
187
188 signal->theData[0] = ZSTARTUP;
189 sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 1000, 1);
190 break;
191 }
192 case ZSHUTDOWN:
193 jam();
194 c_stopRec.checkTimeout(signal);
195 break;
196 case ZBLOCK_STTOR:
197 if (ERROR_INSERTED(1002))
198 {
199 signal->theData[0] = ZBLOCK_STTOR;
200 sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 1);
201 return;
202 }
203 else
204 {
205 c_missra.sendNextSTTOR(signal);
206 }
207 return;
208 default:
209 jam();
210 systemErrorLab(signal, __LINE__);
211 return;
212 break;
213 }//switch
214 }//Ndbcntr::execCONTINUEB()
215
216 void
217 Ndbcntr::execAPI_START_REP(Signal* signal)
218 {
219 if(refToBlock(signal->getSendersBlockRef()) == QMGR)
220 {
221 for(Uint32 i = 0; i<ALL_BLOCKS_SZ; i++){
222 sendSignal(ALL_BLOCKS[i].Ref, GSN_API_START_REP, signal, 1, JBB);
223 }
224 }
225 }
226 /*******************************/
227 /* SYSTEM_ERROR */
228 /*******************************/
229 void Ndbcntr::execSYSTEM_ERROR(Signal* signal)
230 {
231 const SystemError * const sysErr = (SystemError *)signal->getDataPtr();
232 char buf[100];
233 int killingNode = refToNode(sysErr->errorRef);
234 Uint32 data1 = sysErr->data[0];
235
236 jamEntry();
237 switch (sysErr->errorCode){
238 case SystemError::GCPStopDetected:
239 {
240 BaseString::snprintf(buf, sizeof(buf),
241 "Node %d killed this node because "
242 "GCP stop was detected",
243 killingNode);
244 signal->theData[0] = 7025;
245 EXECUTE_DIRECT(DBDIH, GSN_DUMP_STATE_ORD, signal, 1);
246 jamEntry();
247
248 {
249 signal->theData[0] = 12002;
250 EXECUTE_DIRECT(LGMAN, GSN_DUMP_STATE_ORD, signal, 1, 0);
251 }
252
253 jamEntry();
254
255 if (ERROR_INSERTED(1004))
256 {
257 jam();
258 g_eventLogger->info("NDBCNTR not shutting down due to GCP stop");
259 return;
260 }
261 CRASH_INSERTION(1005);
262
263 break;
264 }
265 case SystemError::CopyFragRefError:
266 CRASH_INSERTION(1000);
267 BaseString::snprintf(buf, sizeof(buf),
268 "Killed by node %d as "
269 "copyfrag failed, error: %u",
270 killingNode, data1);
271 break;
272
273 case SystemError::StartFragRefError:
274 BaseString::snprintf(buf, sizeof(buf),
275 "Node %d killed this node because "
276 "it replied StartFragRef error code: %u.",
277 killingNode, data1);
278 break;
279
280 case SystemError::CopySubscriptionRef:
281 CRASH_INSERTION(1003);
282 BaseString::snprintf(buf, sizeof(buf),
283 "Node %d killed this node because "
284 "it could not copy a subscription during node restart. "
285 "Copy subscription error code: %u.",
286 killingNode, data1);
287 break;
288 case SystemError::CopySubscriberRef:
289 BaseString::snprintf(buf, sizeof(buf),
290 "Node %d killed this node because "
291 "it could not start a subscriber during node restart. "
292 "Copy subscription error code: %u.",
293 killingNode, data1);
294 break;
295 default:
296 BaseString::snprintf(buf, sizeof(buf), "System error %d, "
297 " this node was killed by node %d",
298 sysErr->errorCode, killingNode);
299 break;
300 }
301
302 progError(__LINE__, NDBD_EXIT_SYSTEM_ERROR, buf);
303 return;
304 }//Ndbcntr::execSYSTEM_ERROR()
305
306
307 struct ddentry
308 {
309 Uint32 type;
310 const char * name;
311 Uint64 size;
312 };
313
314 /**
315 * f_dd[] = {
316 * { DictTabInfo::LogfileGroup, "DEFAULT-LG", 32*1024*1024 },
317 * { DictTabInfo::Undofile, "undofile.dat", 64*1024*1024 },
318 * { DictTabInfo::Tablespace, "DEFAULT-TS", 1024*1024 },
319 * { DictTabInfo::Datafile, "datafile.dat", 64*1024*1024 },
320 * { ~0, 0, 0 }
321 * };
322 */
323 Vector<ddentry> f_dd;
324
325 static
326 Uint64
327 parse_size(const char * src)
328 {
329 Uint64 num = 0;
330 char * endptr = 0;
331 num = my_strtoll(src, &endptr, 10);
332
333 if (endptr)
334 {
335 switch(* endptr){
336 case 'k':
337 case 'K':
338 num *= 1024;
339 break;
340 case 'm':
341 case 'M':
342 num *= 1024;
343 num *= 1024;
344 break;
345 case 'g':
346 case 'G':
347 num *= 1024;
348 num *= 1024;
349 num *= 1024;
350 break;
351 }
352 }
353 return num;
354 }
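/*
  Example (illustrative values): parse_size("64M") yields 64*1024*1024,
  parse_size("1G") yields 1024*1024*1024, and a value without a recognised
  suffix such as parse_size("4096") is returned unscaled as 4096.
*/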
355
356 static
357 int
358 parse_spec(Vector<ddentry> & dst,
359 const char * src,
360 Uint32 type)
361 {
362 const char * key;
363 Uint32 filetype;
364
365 struct ddentry group;
366 if (type == DictTabInfo::LogfileGroup)
367 {
368 key = "undo_buffer_size=";
369 group.size = 64*1024*1024;
370 group.name = "DEFAULT-LG";
371 group.type = type;
372 filetype = DictTabInfo::Undofile;
373 }
374 else
375 {
376 key = "extent_size=";
377 group.size = 1024*1024;
378 group.name = "DEFAULT-TS";
379 group.type = type;
380 filetype = DictTabInfo::Datafile;
381 }
382 size_t keylen = strlen(key);
383
384 BaseString arg(src);
385 Vector<BaseString> list;
386 arg.split(list, ";");
387
388 bool first = true;
389 for (Uint32 i = 0; i<list.size(); i++)
390 {
391 list[i].trim();
392 if (native_strncasecmp(list[i].c_str(), "name=", sizeof("name=")-1) == 0)
393 {
394 group.name= strdup(list[i].c_str() + sizeof("name=")-1);
395 }
396 else if (native_strncasecmp(list[i].c_str(), key, keylen) == 0)
397 {
398 group.size = parse_size(list[i].c_str() + keylen);
399 }
400 else if (strlen(list[i].c_str()) == 0 && (i + 1) == list.size())
401 {
402 /**
403 * ignore stray ";"
404 */
405 }
406 else
407 {
408 /**
409 * interpret as filespec
410 */
411 struct ddentry entry;
412 const char * path = list[i].c_str();
413 char * sizeptr = const_cast<char*>(strchr(path, ':'));
414 if (sizeptr == 0)
415 {
416 return -1;
417 }
418 * sizeptr = 0;
419
420 entry.name = strdup(path);
421 entry.size = parse_size(sizeptr + 1);
422 entry.type = filetype;
423
424 if (first)
425 {
426 /**
427 * push group aswell
428 */
429 first = false;
430 dst.push_back(group);
431 }
432 dst.push_back(entry);
433 }
434 }
435 return 0;
436 }
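/*
  Illustrative sketch of the input format accepted above (the string below is
  a made-up example, not a default): for a logfile group spec such as

    "undo_buffer_size=32M;undofile01.dat:256M;undofile02.dat:256M"

  parse_spec() pushes one DEFAULT-LG group entry (size 32M) followed by two
  Undofile entries of 256M each. A file specification lacking the ':' size
  separator makes parse_spec() return -1.
*/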
437
438 /**
439 Restart Phases in MySQL Cluster
440 -------------------------------
441 In MySQL Cluster the restart of a node is driven by a set of phases. In
442 addition a node restart is also synchronised with already started nodes and
443 other nodes that are starting up in parallel with our node. This comment
444 describes the various phases used.
445
446 The first step in starting a node is to create the data node run-time
447 environment. The data node process is normally running with an angel process,
448 this angel process ensures that the data node is automatically restarted in
449 cases of failures. So the only reasons to start the data node again are after
450 an OS crash, after a shutdown by an operator, or as part of a software upgrade.
451
452 When starting up the data node, the data node needs a node id, this is either
453 assigned through setting the parameter --ndb-nodeid when starting the data
454 node, or it is assigned by the management server when retrieving the
455 configuration. The angel process will ensure that the assigned node id will be
456 the same for all restarts of the data node.
457
458 After forking the data node process, the starting process stays as the angel
459 process and the new process becomes the actual data node process. The actual
460 data node process starts by retrieving the configuration from the management
461 server.
462
463 At this stage we have read the options, we have allocated a node id, we have
464 the configuration loaded from the management server. We will print some
465 important information to the data node log about our thread configuration and
466 some other things. To ensure that we find the correct files and create files
467 in the correct place we set the datadir of our data node process.
468
469 Next we have to start the watch-dog thread since we are now starting to do
470 activities where we want to ensure that we don't get stuck due to some
471 software error.
472
473 Next we will allocate the memory of the global memory pools, this is where
474 most memory is allocated, we still have a fair amount of memory allocated as
475 part of the initialisation of the various software modules in the NDB kernel,
476 but step by step we're moving towards usage of the global memory pools.
477
478 Allocating memory can be a fairly time-consuming process where the OS can
479 require up to one second for each GByte of memory allocated (naturally OS
480 dependent and will change over time). What actually consumes the time here
481 is that we also touch each page to ensure that the allocated memory is
482 also mapped to real physical memory to avoid page misses while we're running
483 the process. To speed up this process we have made the touching of memory
484 multi-threaded.
485
486 Where most memory is actually allocated is configurable: the configuration
487 variable LateAlloc can be used to delay the allocation of most memory to the
488 early phases of the restart.
489
490 The only memory that is required to allocate in the early phase is the job
491 buffer, memory for sending messages over the network and finally memory for
492 messages to and from the file system threads. So allocation of e.g.
493 DataMemory, IndexMemory and DiskPageBufferMemory can be delayed until the
494 early start phases.
495
496 After allocating the global memory pool we initialise all data used by the
497 run-time environment. This ensures that we're ready to send and receive data
498 between the threads used in the data node process as soon as they are started.
499
500 At this point we've only started the watch-dog process and the thread started
501 as part of creating the process (this thread will later be converted to the
502 first receive thread if we're running ndbmtd and the only execution thread if
503 we are running ndbd). Next step is to load all software modules and initialise
504 those to ensure they're properly set-up when the messages start arriving for
505 execution.
506
507 Before we start the run-time environment we also need to activate the send
508 and receive services. This involves creating a socket client thread that
509 attempts to connect to socket server parts of other nodes in the cluster and
510 a thread to listen on the socket server used for those data nodes towards
511 which we act as the socket server.
512
513 The default behaviour is that the node with the lowest nodeid is the socket
514 server in the communication setup. This can be changed in the data node
515 configuration.
516
517 Before we proceed and start the data node environment we will place the start
518 signals of the run-time environment in its proper job buffer. Actually to
519 start the system one needs to place two equal signals in the job buffer. The
520 first start signal starts the communication to other nodes and sets the state
521 to wait for the next signal to actually start the system. The second one will
522 start running the start phases.
523
524 Finally we start all the threads of the run-time environment. These can
525 currently include a main thread, a rep thread, a number of tc threads,
526 a number of send threads, a number of receive threads and a number of
527 ldm threads. Given that communication buffers for all threads have been
528 preallocated, we can start sending signals immediately as those threads
529 startup. The receiving thread will start to take care of its received signals
530 as soon as it has come to that point in its thread startup code.
531
532 There are two identical start signals, the first starts a recurring signal
533 that is sent on a regular basis to keep track of time in the data node.
534 Only the second one starts performing the various start phases.
535
536 A startup of a data node is handled in a set of phases. The first phase is
537 to send the signal READ_CONFIG_REQ to all software modules in the kernel,
538 then STTOR is similarly sent to all software modules in 256 phases numbered
539 from 0 to 255. We don't use all of those phases, but the code is flexible
540 such that any of those phases could be used now or sometime in the
541 future.
542
543 In addition we have 6 modules that are involved in one more set of start
544 phases. The signal sent in these phases is called NDB_STTOR. The original
545 idea was to view this message as the local start of the NDB subsystem.
546 These signals are sent and handled by NDBCNTR and sent as part of the STTOR
547 handling in NDBCNTR. This means that it becomes a sequential part of the
548 startup phases.
549
550 Before starting the phases we ensure that any management node can connect
551 to our node and that all other nodes are disconnected and that they can only
552 send messages to the QMGR module. The management server receives reports
553 about various events in the data node and the QMGR module is taking care of
554 the inclusion of the data node into the cluster. Before we're included in
555 the cluster we cannot communicate with other nodes in any manner.
556
557 The start always starts in the main thread where each software module is
558 represented by at least a proxy module that all multithreaded modules contain.
559 The proxy module makes it possible to easily send and receive messages to a
560 set of modules of the same type using one message and one reply.
561
562 The READ_CONFIG_REQ signals are always sent in the same order. It starts by
563 sending to CMVMI, this is the block that receives the start order and it
564 performs a number of functions from where the software modules can affect the
565 run-time environment. It normally allocates most memory of the process and
566 touches all of this memory. It is part of the main thread.
567
568 The next module receiving READ_CONFIG_REQ is NDBFS, this is the module that
569 controls the file system threads, this module is found in the main thread.
570
571 Next module is DBINFO, this module supports the ndbinfo database used to get
572 information about the data node internals in table format, this module is
573 found in the main thread.
574
575 Next is DBTUP, this is the module where the actual data is stored. Next DBACC,
576 the module where primary key and unique key hash indexes are stored and where
577 we control row locks from. Both those blocks are contained in the ldm threads.
578
579 Next is DBTC, the module where transaction coordination is managed from,
580 this module is part of the tc thread. Next is DBLQH, the module that controls
581 the actions on data through key operations and scans and also handles the
582 REDO logs. This is the main module of the ldm thread.
583
584 Next is DBTUX that operates ordered indexes, reusing pages used to store rows
585 in DBTUP, also part of the ldm thread. Next is DBDICT, the dictionary module
586 used to store and handle all metadata information about tables and columns,
587 tablespaces, log files and so forth. DICT is part of the main thread.
588
589 Next is DBDIH, the module to store and handle distribution information about
590 all tables, the table partitions and all replicas of each partition. It
591 controls the local checkpoint process, the global checkpoint process and
592 controls a major part of the restart processing. The DIH module is a part of
593 the main thread.
594
595 Next is NDBCNTR that controls the restart phases, it's part of the main
596 thread. Next is QMGR which takes care of the heartbeat protocol and inclusion
597 and exclusion of nodes in the cluster. It's part of the main thread.
598
599 Next is TRIX that performs a few services related to ordered indexes and other
600 trigger-based services. It's part of the tc thread. Next is BACKUP, this is
601 used for backups and local checkpoints and is part of the ldm thread.
602
603 Next is DBUTIL that provides a number of services such as performing key
604 operations on behalf of code in the modules. It's part of the main thread.
605 Next is the SUMA module that takes care of replication events, this is the
606 module handled by the rep thread.
607
608 Next is TSMAN, then LGMAN, and then PGMAN that are all part of the disk data
609 handling taking care of tablespace, UNDO logging and page management. They
610 are all part of the ldm thread.
611
612 RESTORE is a module used to restore local checkpoints as part of a startup.
613 This module is also part of the ldm thread.
614
615 Finally we have the DBSPJ module that takes care of join queries pushed down
616 to the data node, it executes as part of the tc thread.
617
618 The DBTUP, DBACC, DBLQH, DBTUX, BACKUP, TSMAN, LGMAN, PGMAN, RESTORE are all
619 tightly integrated modules that take care of the data and indexes locally in
620 each node. This set of modules form an LDM instance, each node can have
621 multiple LDM instances and these can be spread over a set of threads.
622 Each LDM instance owns its own partition of the data.
623
624 We also have two modules that are not a part of restart handling. The first is
625 the TRPMAN module that performs a number of transport-related functions
626 (communication with other nodes). It executes in the receive threads. The second
627 is THRMAN, which executes in every thread and does some thread
628 management functionality.
629
630 All modules receive READ_CONFIG_REQ, and all modules also receive STTOR for
631 phase 0 and phase 1. In phase 1 they report back which further start phases
632 they want to be informed about.
633
634 During the READ_CONFIG_REQ the threads can execute for a very long time in
635 a module since we can be allocating and touching memory of large sizes. This
636 means that our watchdog thread has a special timeout for this phase to
637 ensure that we don't crash the process simply because initialising our
638 memory takes a long time. In normal operations each signal should execute only
639 for a small number of microseconds.
640
641 The start phases are synchronized by sending the message STTOR to all modules,
642 logically each module gets this signal for each start phase from 0 to 255.
643 However the response message STTORRY contains the list of start phases the
644 module really is interested in.
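As an illustration only (this is a hedged sketch, not the handler of any
particular block, and the helper names are made up), a typical block-level
STTOR/STTORRY exchange looks roughly like this, assuming the start phase
arrives in theData[1] and the STTORRY reply carries the list of phases the
block wants to see:

  void SomeBlock::execSTTOR(Signal* signal)
  {
    jamEntry();
    const Uint32 startPhase = signal->theData[1];
    switch (startPhase) {
    case 1:
      setupNeighbourRefs();      // hypothetical phase 1 work
      break;
    case 3:
      readNodesFromConfig();     // hypothetical phase 3 work
      break;
    default:
      break;                     // phases this block does not care about
    }
    sendSTTORRY(signal);         // reply lists the phases of interest
  }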
645
646 The NDBCNTR module that handles the start phase signals can optimise away
647 any signals not needed. The order in which modules receive the STTOR message
648 is the same for all phases:
649
650 1) NDBFS
651 2) DBTC
652 3) DBDIH
653 4) DBLQH
654 5) DBACC
655 6) DBTUP
656 7) DBDICT
657 8) NDBCNTR
658 9) CMVMI
659 10)QMGR
660 11)TRIX
661 12)BACKUP
662 13)DBUTIL
663 14)SUMA
664 15)DBTUX
665 16)TSMAN
666 17)LGMAN
667 18)PGMAN
668 19)RESTORE
669 20)DBINFO
670 21)DBSPJ
671
672 In addition there is a special start phase handling controlled by NDBCNTR,
673 so when NDBCNTR receives its own STTOR message it starts a local start phase
674 handling involving the modules DBLQH, DBDICT, DBTUP, DBACC, DBTC and DBDIH.
675
676 This happens for phases 2 through 8. The messages sent in these start phases
677 are NDB_STTOR and NDB_STTORRY, they are handled in a similar manner to STTOR
678 and STTORRY. The modules also receive those start phases in the same order
679 for all phases and this order is:
680
681 1) DBLQH
682 2) DBDICT
683 3) DBTUP
684 4) DBACC
685 5) DBTC
686 6) DBDIH
687
688 For those modules that are multithreaded, the STTOR and NDB_STTOR messages
689 are always received by the Proxy module that executes in the main thread.
690 The Proxy module will then send the STTOR and NDB_STTOR messages to each
691 individual instance of the module (the number of instances is normally the
692 same as the number of threads, but could sometimes be different). It does
693 so in parallel, so all instances execute STTOR in parallel.
694
695 So effectively each instance of a module will logically first receive
696 READ_CONFIG_REQ, then a set of STTOR messages for each start phase and some
697 modules will also receive NDB_STTOR in a certain order. All these messages
698 are sent in a specific order and sequentially. So this means that we have the
699 ability to control when things are done by performing it in the correct start
700 phase.
701
702 Next we will describe step-by-step what happens in a node restart (or a node
703 start as part of a cluster start/restart). The startup is currently a
704 sequential process except where it is stated that it happens in parallel.
705 The description below thus gives the order in which things actually happen
706 currently.
707
708 READ_CONFIG_REQ
709 ---------------
710 The READ_CONFIG_REQ does more or less the same for all software modules. It
711 allocates the memory required by the software module and initialises the
712 memory (creates various free lists and so forth). It also reads the various
713 configuration parameters which are of interest to the module (these often
714 affect the size of the memory we allocate).
715
716 It starts in CMVMI that allocates most of the global memory pool, next we
717 have NDBFS that creates the necessary file directories for disk data, it
718 also creates the bound IO threads that can be used by one file at a time
719 (initial number of threads configurable through InitialNoOfOpenFiles), then it
720 creates a number of free threads (number of them configurable through
721 IOThreadPool) used by disk data files (all files used to handle disk data),
722 each such thread can be used to open/read/write/close a disk data file.
723 Finally NDBFS also creates the communication channel from the file system
724 threads back to the other threads.
725
726 All other modules follow the same standard, they calculate a number of sizes
727 based on hard coded defines or through configuration variables, they allocate
728 memory for those variables, finally they initialise those allocated memory
729 structures.
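As a hedged sketch of that standard pattern (the block name and the
configuration parameter are illustrative, not taken from any specific block),
a typical READ_CONFIG_REQ handler does roughly the following:

  void SomeBlock::execREAD_CONFIG_REQ(Signal* signal)
  {
    jamEntry();
    const ReadConfigReq * req = (ReadConfigReq*)signal->getDataPtr();
    const Uint32 senderRef  = req->senderRef;
    const Uint32 senderData = req->senderData;

    const ndb_mgm_configuration_iterator * p =
      m_ctx.m_config.getOwnConfigIterator();
    Uint32 noOfRecords = 0;
    ndb_mgm_get_int_parameter(p, CFG_SOME_PARAMETER, &noOfRecords);

    // ... size and initialise the block's pools from noOfRecords ...

    ReadConfigConf * conf = (ReadConfigConf*)signal->getDataPtrSend();
    conf->senderRef = reference();
    conf->senderData = senderData;
    sendSignal(senderRef, GSN_READ_CONFIG_CONF, signal,
               ReadConfigConf::SignalLength, JBB);
  }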
730
731 STTOR Phase 0
732 -------------
733 The first STTOR phase executed is STTOR phase 0. The only modules doing
734 anything in this phase are NDBCNTR, which clears the file system if the start
735 is an initial start, and CMVMI, which creates the file system directory.
736
737 STTOR Phase 1
738 -------------
739 The next phase executed is STTOR phase 1. In this phase most modules initialise
740 some more data, and references to neighbour modules are set up if necessary. In
741 addition DBDIH creates some special mutexes that ensure that only one process
742 is involved in certain parts of the code at a time.
743
744 NDBCNTR initialises some data related to running NDB_STTOR starting in
745 phase 2. CMVMI locks memory if configured to do so; after this it installs the
746 normal watchdog timeout since all large memory allocations have now been performed.
747 CMVMI also starts regular memory reporting.
748
749 QMGR is the most active module in this phase. It initialises some data, it
750 gets the restart type (initial start or normal start) from DBDIH, it opens
751 communication to all nodes in the cluster, and it starts checking for node
752 failures as part of the node inclusion handling. Finally it runs the protocol to
753 include the new node into the heartbeat protocol. This could take a while
754 since the node inclusion process can only bring in one node at a time and
755 the protocol contains some delays.
756
757 The BACKUP module then starts the disk speed check loop which will run as
758 long as the node is up and running.
759
760 STTOR Phase 2
761 -------------
762 Next step is to execute STTOR phase 2. The only module that does anything in
763 STTOR phase 2 is NDBCNTR: it asks DIH for the restart type, it reads the nodes
764 from the configuration, and it initialises the partial timeout variables that
765 control for how long to wait before we perform a partial start.
766
767 NDBCNTR sends the signal CNTR_START_REQ to the NDBCNTR in the current master
768 node, this signal enables the master node to delay the start of this node if
769 necessary due to other starting nodes or some other condition. For cluster
770 starts/restarts it also gives the master node the chance to ensure we wait
771 for enough nodes to start up before we start the nodes.
772
773 The master only accepts one node at a time that has received CNTR_START_CONF;
774 the next node can only receive CNTR_START_CONF after the previous starting
775 node has completed copying the metadata and has released the metadata locks and
776 locks on DIH info, which happens below in STTOR phase 5.
777
778 So in a rolling restart it is quite common that the first node will get
779 CNTR_START_CONF and then instead get blocked on the DICT lock waiting for
780 an LCP to complete. The other nodes starting up in parallel will instead
781 wait on CNTR_START_CONF since only one node at a time can pass this.
782
783 After receiving CNTR_START_CONF, NDBCNTR continues by running NDB_STTOR
784 phase 1. Here DBLQH initialises the node records, it starts a reporting
785 service. It also initialises the data about the REDO log; this also
786 includes initialising the REDO log on disk for all types of initial start
787 (can be quite time consuming).
788
789 DBDICT initialises the schema file (contains the tables that have been created
790 in the cluster and other metadata objects). DBTUP initialises a default value
791 fragment, and DBTC and DBDIH initialise some data variables. After completing
792 the NDB_STTOR phase in NDBCNTR there is no more work done in STTOR phase 2.
793
794 STTOR Phase 3
795 -------------
796 The next step is to run STTOR phase 3. Most modules that need the list of
797 nodes in the cluster read it in this phase. DBDIH reads the nodes in this
798 phase, DBDICT sets the restart type. Next NDBCNTR receives this phase and
799 starts NDB_STTOR phase 2. In this phase DBLQH sets up connections from its
800 operation records to the operation records in DBACC and DBTUP. This is done
801 in parallel for all DBLQH module instances.
802
803 DBDIH now prepares the node restart process by locking the meta data. This
804 means that we will wait until any ongoing meta data operation is completed
805 and when it is completed we will lock the meta data such that no meta data
806 changes can be done until we're done with the phase where we are copying the
807 metadata information.
808
809 The reason for locking is that all meta data and distribution info is fully
810 replicated. So we need to lock this information while we are copying the data
811 from the master node to the starting node. While we retain this lock we cannot
812 change meta data through meta data transactions. Before copying the meta data
813 later we also need to ensure no local checkpoint is running since this also
814 updates the distribution information.
815
816 After locking this we need to request permission to start the node from the
817 master node. The request for permission to start the node is handled by the
818 starting node sending START_PERMREQ to the master node. This could receive a
819 negative reply if another node is already processing a node restart, it could
820 fail if an initial start is required. If another node is already starting we
821 will wait 3 seconds and try again. This is executed in DBDIH as part of
822 NDB_STTOR phase 2.
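A hedged sketch of that retry (the constant name is made up and the real DBDIH
code differs in detail): on a "try again later" style refusal of START_PERMREQ
the starting node typically re-issues the request via a delayed CONTINUEB:

  signal->theData[0] = ZSTART_PERMREQ_AGAIN;   // hypothetical CONTINUEB tag
  sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 3000, 1);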
823
824 After completing the NDB_STTOR phase 2 the STTOR phase 3 continues by the
825 CMVMI module activating the checks of send packed data which is used by scan
826 and key operations.
827
828 Next the BACKUP module reads the configured nodes. Next the SUMA module sets
829 the reference to the Page Pool such that it can reuse pages from this global
830 memory pool, next DBTUX sets the restart type. Finally PGMAN starts a stats
831 loop and a cleanup loop that will run as long as the node is up and running.
832
833 We could crash the node if our node is still involved in some processes
834 ongoing in the master node. This is fairly normal and will simply trigger a
835 crash followed by a normal new start up by the angel process. The request
836 for permission is handled by the master sending the information to all nodes.
837
838 For initial starts the request for permission can be quite time consuming
839 since we have to invalidate all local checkpoints from all tables in the
840 meta data on all nodes. There is no parallelisation of this invalidation
841 process currently, so it will invalidate one table at a time.
842
843 STTOR Phase 4
844 -------------
845 After completing STTOR phase 3 we move onto STTOR phase 4. This phase starts
846 by DBLQH acquiring a backup record in the BACKUP module that will be used
847 for local checkpoint processing.
848
849 Next NDBCNTR starts NDB_STTOR phase 3. This starts also in DBLQH where we
850 read the configured nodes. Then we start reading the REDO log to get it
851 set-up (we will set this up in the background, it will be synchronised by
852 another part of cluster restart/node restart later described), for all types
853 of initial starts we will wait until the initialisation of the REDO log has
854 been completed before reporting this phase as completed.
855
856 Next DBDICT will read the configured nodes whereafter also DBTC reads the
857 configured nodes and starts transaction counters reporting. Next in
858 NDB_STTOR phase 3 is that DBDIH initialises restart data for initial starts.
859
860 Before completing its work in STTOR phase 4, NDBCNTR will set up a waiting
861 point to ensure that all starting nodes have reached this point before
862 proceeding. This is only done for cluster starts/restarts, so not for node
863 restarts.
864
865 The master node controls this waitpoint and will send the signal
866 NDB_STARTREQ to DBDIH when all nodes of the cluster restart have reached
867 this point. More on this signal later.
868
869 The final thing happening in STTOR phase 4 is that DBSPJ reads the configured
870 nodes.
871
872 STTOR Phase 5
873 -------------
874 We now move onto STTOR phase 5. The first thing done here is to run NDB_STTOR
875 phase 4. Only DBDIH does some work here and it only does something in node
876 restarts. In this case it asks the current master node to start it up by
877 sending the START_MEREQ signal to it.
878
879 START_MEREQ works by copying distribution information from master DBDIH node
880 and then also meta data information from master DBDICT. It copies one table
881 of distribution information at a time which makes the process a bit slow
882 since it includes writing the table to disk in the starting node.
883
884 The only way to trace this activity is through the writing of the table
885 distribution information per table in DBDIH in the starting node. We can trace
886 the reception of DICTSTARTREQ that is received in the starting node's DBDICT.
887
888 When DBDIH and DBDICT information is copied then we need to block the global
889 checkpoint in order to include the new node in all changes of meta data and
890 distribution information from now on. This is performed by sending
891 INCL_NODEREQ to all nodes. After this we can release the meta data lock that
892 was set by DBDIH already in STTOR phase 2.
893
894 After completing NDB_STTOR phase 4, NDBCNTR synchronises the start again in
895 the following manner:
896
897 If initial cluster start and master then create system tables
898 If cluster start/restart then wait for all nodes to reach this point.
899 After waiting for nodes in a cluster start/restart then run NDB_STTOR
900 phase 5 in master node (only sent to DBDIH).
901 If node restart then run NDB_STTOR phase 5 (only sent to DBDIH).
902
903 NDB_STTOR phase 5 in DBDIH is waiting for completion of a local checkpoint
904 if it is a master and we are running a cluster start/restart. For node
905 restarts we send the signal START_COPYREQ to the starting node to ask for
906 copying of data to our node.
907
908 START OF DATABASE RECOVERY
909
910 We start with explaining a number of terms used.
911 ------------------------------------------------
912 LCP: Local checkpoint, in NDB this means that all data in main memory is
913 written to disk and we also write changed disk pages to disk to ensure
914 that all changes before a certain point are available on disk.
915 Execute REDO log: This means that we're reading the REDO log one REDO log
916 record at a time and executing the action if needed that is found in the
917 REDO log record.
918 Apply the REDO log: Synonym of execute the REDO log.
919 Prepare REDO log record: This is a REDO log record that contains the
920 information about a change in the database (insert/delete/update/write).
921 COMMIT REDO log record: This is a REDO log record that specifies that a
922 Prepare REDO log record is to be actually executed. The COMMIT REDO log
923 record contains a back reference to the Prepare REDO log record.
924 ABORT REDO log record: Similarly to the COMMIT REDO log record but here
925 the transaction was aborted so there is no need to apply the REDO log
926 record.
927 Database: Means in this context all the data residing in the cluster or
928 in the node when there is a node restart.
929 Off-line Database: Means that our database in our node is not on-line
930 and thus cannot be used for reading. This is the state of the database
931 after restoring a LCP, but before applying the REDO log.
932 Off-line Consistent database: This is a database state which is not
933 up-to-date with the most recent changes, but it represents an old state
934 in the database that existed previously. This state is achieved after
935 restoring an LCP and executing the REDO log.
936 On-line Database: This is a database state which is up-to-date, any node
937 that can be used to read data has its database on-line (actually
938 fragments are brought on-line one by one).
939 On-line Recoverable Database: This is an on-line database that is also
940 recoverable. In a node restart we reach the state on-line database first,
941 but we need to run an LCP before the database can also be recovered to
942 its current state. A recoverable database is also durable so this means
943 that we're adding the D in ACID to the database when we reach this state.
944 Node: There are API nodes, data nodes and management server nodes. A data
945 node is a ndbd/ndbmtd process that runs all the database logic and
946 contains the database data. The management server node is a process that
947 runs ndb_mgmd that contains the cluster configuration and also performs
948 a number of management services. API nodes are part of application processes
949 and of mysqld processes. There can be more than one API node per application
950 process. Each API node is connected through a socket (or other
951 communication media) to each of the data nodes and management server nodes.
952 When one refers to nodes in this text it's mostly implied that we're
953 talking about a data node.
954 Node Group: A set of data nodes that all contain the same data. The number
955 of nodes in a node group is equal to the number of replicas we use in the
956 cluster.
957 Fragment: A part of a table that is fully stored on one node group.
958 Partition: Synonym of fragment.
959 Fragment replica: This is one fragment in one node. There can be up
960 to 4 replicas of a fragment (so thus a node group can have up to
961 4 nodes in it).
962 Distribution information: This is information about the partitions
963 (synonym of fragments) of the tables and on which nodes they reside
964 and information about LCPs that have been executed on each fragment
965 replica.
966 Metadata: This is the information about tables, indexes, triggers,
967 foreign keys, hash maps, files, log file groups, table spaces.
968 Dictionary information: Synonym to metadata.
969 LDM: Stands for Local Data Manager, these are the blocks that execute
970 the code that handles the data stored within one data node. It contains
971 blocks that handle the tuple storage, the hash index, the T-tree index,
972 the page buffer manager, the tablespace manager, a block that writes
973 LCPs, a block that restores LCPs and a log manager for disk data.
974
975 ------------------------------------------------------------------------------
976 | What happens as part of START_COPYREQ is the real database restore |
977 | process. Here most of the important database recovery algorithms are |
978 | executed to bring the database online again. The earlier phases were still |
979 | needed to restore the metadata and setup communication, setup memory and |
980 | bringing in the starting node as a full citizen in the cluster of data |
981 | nodes. |
982 ------------------------------------------------------------------------------
983
984 START_COPYREQ goes through all distribution information and sends
985 START_FRAGREQ to the owning DBLQH module instance for each fragment replica
986 to be restored on the node. DBLQH will start immediately to restore those
987 fragment replicas, it will queue the fragment replicas and restore one at a
988 time. This happens in two phases: first all fragment replicas that require
989 restore of a local checkpoint start to do that.
990
991 After all fragment replicas to restore have been sent and we have restored all
992 fragments from a local checkpoint stored on our disk (or sometimes by getting
993 the entire fragment from an alive node) then it is time to run the disk data
994 UNDO log. Finally after running this UNDO log we're ready to get the fragment
995 replicas restored to latest disk-durable state by applying the REDO log.
996
997 DBDIH will send all required information for all fragment replicas to DBLQH
998 whereafter it sends START_RECREQ to DBLQH to indicate all fragment replica
999 information has now been sent.
1000
1001 START_RECREQ is sent through the DBLQH proxy module and this part is
1002 parallelised such that all LDM instances are performing the below parts in
1003 parallel.
1004
1005 If we're doing an initial node restart we don't need to restore any local
1006 checkpoints since initial node restart means that we start without a file
1007 system. So this means that we have to restore all data from other nodes in
1008 the node group. In this case we start applying the copying of fragment
1009 replicas immediately in DBLQH when we receive START_FRAGREQ. In this case
1010 we don't need to run any UNDO or REDO log since there is no local checkpoint
1011 to restore the fragment.
1012
1013 When this is completed and DBDIH has reported that all fragment replicas to
1014 start have been sent by sending START_RECREQ to DBLQH we will send
1015 START_RECREQ to TSMAN whereafter we are done with the restore of the data.
1016
1017 We will specify all fragment replicas to restore as part of REDO log
1018 execution. This is done through the signal EXEC_FRAGREQ. When all such signals
1019 have been sent we send EXEC_SRREQ to indicate we have prepared for the next
1020 phase of REDO log execution in DBLQH.
1021
1022 When all such signals are sent we have completed what is termed as phase 2
1023 of DBLQH, the phase 1 in DBLQH is what started in NDB_STTOR phase 3 to prepare
1024 the REDO log for reading it. So when both those phases are complete we're ready
1025 to start what is termed phase 3 in DBLQH.
1026
1027 These DBLQH phases are not related to the start phases, these are internal
1028 stages of startup in the DBLQH module.
1029
1030 Phase 3 in DBLQH is the reading of the REDO log and applying it on fragment
1031 replicas restored from the local checkpoint. This is required to create a
1032 database state which is synchronised on a specific global checkpoint. So we
1033 first install a local checkpoint for all fragments, next we apply the REDO
1034 log to synchronise the fragment replica with a certain global checkpoint.
1035
1036 Before executing the REDO log we need to calculate the start GCI and the last
1037 GCI to apply in the REDO log by checking the limits on all fragment replicas
1038 we will restore to the desired global checkpoint.
1039
1040 DBDIH has stored, for each local checkpoint of a fragment replica, which
1041 global checkpoint ranges are required to be run from the REDO
1042 log in order to bring it to the state of a certain global checkpoint. This
1043 information was sent in the START_FRAGREQ signal. DBLQH will merge all of
1044 those limits per fragment replica to a global range of global checkpoints to
1045 run for this LDM instance. So each fragment replica has its own GCP id range
1046 to execute and this means that the minimum of all those start ranges and
1047 maximum of all the end ranges is the global range of GCP ids that we need
1048 to execute in the REDO log to bring the cluster on-line again.
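A hedged sketch of that merge (the variable and struct names are illustrative,
not the actual DBLQH data structures):

  Uint32 startGci = ~Uint32(0);
  Uint32 lastGci  = 0;
  for (Uint32 i = 0; i < fragmentsToRestore.size(); i++)
  {
    // each fragment replica carries its own required GCI range
    startGci = MIN(startGci, fragmentsToRestore[i].startGci);
    lastGci  = MAX(lastGci,  fragmentsToRestore[i].lastGci);
  }
  // The REDO log of this LDM instance is then executed for [startGci, lastGci].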
1049
1050 The next step is to calculate the start and stop megabyte in the REDO log for
1051 each log part by using the start and stop global checkpoint id. All the
1052 information required to calculate this is already in memory, so it's a pure
1053 calculation.
1054
1055 When we execute the REDO log we actually only apply the COMMIT records in the
1056 correct global checkpoint range. The COMMIT record and the actual change
1057 records are in different places in the REDO log, so for each Megabyte of
1058 REDO log we record how far back in the REDO log we have to go to find the
1059 change records.
1060
1061 While running the REDO log we maintain a fairly large cache of the REDO log
1062 to avoid having to do disk reads in those cases where the transaction
1063 ran for a long time.
1064
1065 This means that long-running and large transactions can have a negative effect
1066 on restart times.
1067
1068 After all log parts have completed this calculation we're now ready to start
1069 executing the REDO log. After executing the REDO log to completion we also
1070 write a marker into the REDO log to indicate that any information beyond
1071 what we used here won't be used at any later time.
1072
1073 We now need to wait for all other log parts to also complete execution of
1074 their parts of the REDO log. The REDO log execution is designed such that we
1075 can execute the REDO log in more than one phase, this is intended for cases
1076 where we can rebuild a node from more than one live node. Currently this code
1077 should never be used.
1078
1079 So the next step is to check for the new head and tail of the REDO log parts.
1080 This is done through the same code that uses start and stop global
1081 checkpoints to calculate this number. This phase of the code also prepares
1082 the REDO log parts for writing new REDO log records by ensuring that the
1083 proper REDO log files are open. It also involves some rather tricky code to
1084 ensure that pages that have been made dirty are properly handled.
1085
1086 COMPLETED RESTORING OFF-LINE CONSISTENT DATABASE
1087 ------------------------------------------------------------------------------
1088 | After completing restoring fragment replicas to a consistent global |
1089 | checkpoint, we will now start rebuilding the ordered indexes based on the |
1090 | data restored. After rebuilding the ordered indexes we are ready to send |
1091 | START_RECCONF to the starting DBDIH. START_RECCONF is sent through the |
1092 | DBLQH proxy, so it won't be passed onto DBDIH until all DBLQH instances |
1093 | have completed this phase and responded with START_RECCONF. |
1094 ------------------------------------------------------------------------------
1095
1096 At this point in the DBLQH instances we have restored a consistent but old
1097 variant of all data in the node. There are still no ordered indexes and there
1098 is still much work remaining to get the node synchronised with the other nodes
1099 again. For cluster restarts it might be that the node is fully ready to go;
1100 it's however likely that some nodes still require being synchronised with
1101 nodes that have restored a more recent global checkpoint.
1102
1103 The DBDIH of the starting node will then start the take over process now
1104 that the starting node has consistent fragment replicas. We will prepare the
1105 starting node's DBLQH for the copying phase by sending PREPARE_COPY_FRAG_REQ
1106 for each fragment replica we will copy over. This is a sequential process that
1107 could be parallelised a bit.
1108
1109 The process to take over a fragment replica is quite involved. It starts by
1110 sending PREPARE_COPY_FRAG_REQ/CONF to the starting DBLQH, then we send
1111 UPDATE_TOREQ/CONF to the master DBDIH to ensure we lock the fragment
1112 information before the take over starts. After receiving confirmation of this
1113 fragment lock, the starting node sends UPDATE_FRAG_STATEREQ/CONF to all nodes to
1114 include the new node into all operations on the fragment.
1115
1116 After completing this we again send UPDATE_TOREQ/CONF to the master node to
1117 inform of the new status and unlock the lock on the fragment information. Then
1118 we're ready to perform the actual copying of the fragment. This is done by
1119 sending COPY_FRAGREQ/CONF to the node that will copy the data. When this
1120 copying is done we send COPY_ACTIVEREQ/CONF to the starting node to activate
1121 the fragment replica.
1122
1123 Next we again send UPDATE_TOREQ/CONF to the master, informing it that we're
1124 about to commit the take over of the new fragment replica. Next we
1125 commit the new fragment replica by sending UPDATE_FRAG_STATEREQ/CONF to all
1126 nodes informing them about completion of the copying of the fragment replica.
1127 Finally we send another update to the master node with UPDATE_TOREQ/CONF.
1128 Now we're finally complete with copying of this fragment.
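Condensed recap of the per-fragment take over sequence just described (this
is only a summary of the prose above, in order):

  1. PREPARE_COPY_FRAG_REQ/CONF  to the starting DBLQH
  2. UPDATE_TOREQ/CONF           to the master DBDIH (lock fragment info)
  3. UPDATE_FRAG_STATEREQ/CONF   to all nodes (include new node in operations)
  4. UPDATE_TOREQ/CONF           to the master (report status, unlock)
  5. COPY_FRAGREQ/CONF           to the node that copies the data
  6. COPY_ACTIVEREQ/CONF         to the starting node (activate replica)
  7. UPDATE_TOREQ/CONF           to the master (about to commit take over)
  8. UPDATE_FRAG_STATEREQ/CONF   to all nodes (commit new fragment replica)
  9. UPDATE_TOREQ/CONF           to the master (final update)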
1129
1130 The idea with this scheme is that the first UPDATE_FRAG_STATEREQ ensures that
1131 we're a part of all transactions on the fragment. After doing the COPY_FRAGREQ
1132 that synchronises the starting node's fragment replica with the alive node's
1133 fragment replica on a row by row basis, we're sure that the two fragment
1134 replicas are entirely synchronised and we can do a new UPDATE_FRAG_STATEREQ to
1135 ensure all nodes know that we're done with the synchronisation.
1136
1137 COMPLETED RESTORING ON-LINE NOT RECOVERABLE DATABASE
1138 ------------------------------------------------------------------------------
1139 | At this point we have restored an online variant of the database by |
1140 | bringing one fragment at a time online. The database is still not |
1141 | recoverable since we haven't enabled logging yet and there is no local |
1142 | checkpoint of the data in the starting node. |
1143 ------------------------------------------------------------------------------
1144
1145 Next step is to enable logging on all fragments, after completing this step
1146 we will send END_TOREQ to the master DBDIH. At this point we will wait until a
1147 local checkpoint is completed where this node has been involved. Finally when
1148 the local checkpoint has been completed we will send END_TOCONF to the
1149 starting node and then we will send START_COPYCONF and that will complete
1150 this phase of the restart.
1151
1152 COMPLETED RESTORING ON-LINE RECOVERABLE DATABASE
1153 ------------------------------------------------------------------------------
1154 | At this point we have managed to restore all data and we have brought it |
1155 | online and now we have also executed a local checkpoint after enabling |
1156 | logging and so now data in the starting node is also recoverable. So this |
1157 | means that the database is now fully online again. |
1158 ------------------------------------------------------------------------------
1159
1160 After completing NDB_STTOR phase 5 then all nodes that have been synchronised
1161 in a waitpoint here are started again and NDBCNTR continues by running
1162 phase 6 of NDB_STTOR.
1163
1164 In this phase DBLQH, DBDICT and DBTC set some status variables indicating
1165 that the start has now completed (it's not fully completed yet, but all
1166 services required for those modules to operate are completed). DBDIH also
1167 starts the global checkpoint protocol for cluster start/restarts where it has
1168 become the master node.
1169
1170 Yet one more waiting point for all nodes is now done in the case of a cluster
1171 start/restart.
1172
1173 The final step in STTOR phase 5 is SUMA that reads the configured nodes,
1174 gets the node group members and, if it is a node restart, asks another
1175 node to recreate subscriptions for it.
1176
1177 STTOR Phase 6
1178 -------------
1179 We now move onto STTOR phase 6. In this phase NDBCNTR gets the node group of
1180 the node, DBUTIL gets the systable id, prepares a set of operations for later
1181 use and connects to TC to enable it to run key operations on behalf of other
1182 modules later on.
1183
1184 STTOR Phase 7
1185 -------------
1186 Next we move onto STTOR phase 7. DBDICT now starts the index statistics loop
1187 that will run as long as the node lives.
1188
1189 QMGR will start arbitration handling to handle a case where we are at risk of
1190 network partitioning.
1191
1192 BACKUP will update the disk checkpoint speed (there is one config variable
1193 for speed during restarts and one for normal operation, here we install the
1194 normal operation speed). If initial start BACKUP will also create a backup
1195 sequence through DBUTIL.
1196
1197 SUMA will create a sequence if it's running in a master node and it's an
1198 initial start. SUMA will also always calculate which buckets it is
1199 responsible for handling. Finally DBTUX will start monitoring of ordered indexes.
1200
1201 STTOR Phase 8
1202 -------------
1203 We then move onto STTOR phase 8. First thing here is to run phase 7 of
1204 NDB_STTOR in which DBDICT enables foreign keys. Next NDBCNTR will also wait
1205 for all nodes to come here if we're doing a cluster start/restart.
1206
1207 Next CMVMI will set state to STARTED and QMGR will enable communication to
1208 all API nodes.
1209
1210 STTOR Phase 101
1211 ---------------
1212 After this phase the only remaining phase is STTOR phase 101 in which SUMA
1213 takes over responsibility of the buckets it is responsible for in the
1214 asynchronous replication handling.
1215
1216 Major potential consumers of time so far:
1217
1218 All steps in the memory allocation (all steps of the READ_CONFIG_REQ).
1219 CMVMI STTOR phase 1 that could lock memory. QMGR phase 1 that runs the
1220 node inclusion protocol.
1221
1222 NDBCNTR STTOR phase 2 that waits for CNTR_START_REQ, DBLQH REDO log
1223 initialisation for initial start types that happens in STTOR phase 2.
1224 Given that only one node can be in this phase at a time, this can be
1225 stalled by a local checkpoint wait of another node starting. So this
1226 wait can be fairly long.
1227
1228 DBLQH sets up connections to DBACC and DBTUP, this is NDB_STTOR phase 2.
1229 DBDIH in NDB_STTOR phase 2 also can wait for the meta data to be locked
1230 and it can wait for response to START_PERMREQ.
1231
1232 For initial starts waiting for DBLQH to complete NDB_STTOR phase 3 where
1233 it initialises set-up of the REDO logs. NDBCNTR for cluster start/restarts
1234 in STTOR phase 4 after completing NDB_STTOR phase 3 has to wait for all
1235 nodes to reach this point and then it has to wait for NDB_STARTREQ to
1236 complete.
1237
1238 For node restarts we have delays while waiting for the responses to the
1239 START_MEREQ and START_COPYREQ signals; this is actually where most of the real
1240 work of the restart is done. SUMA STTOR phase 5 where subscriptions are
1241 recreated is another potential time consumer.
1242
1243 All waitpoints are obvious potential consumers of time. Those are mainly
1244 located in NDBCNTR (waitpoints 5.2, 5.1 and 6).
1245
1246 Historical anecdotes:
1247 1) The NDB kernel run-time environment was originally designed for an
1248 AXE virtual machine. In AXE the starts used the module MISSRA to
1249 drive the STTOR/STTORRY signals for the various startup phases.
1250 MISSRA was later merged into NDBCNTR and is nowadays a submodule of
1251 NDBCNTR. The names STTOR and STTORRY have some basis in the AXE system's
1252 way of naming signals in the early days, but the meaning has been forgotten
1253 by now. At least the ST had something to do with Start/Restart.
1254
1255 2) The reason for introducing NDB_STTOR was that we envisioned a system
1256 where the NDB kernel was just one subsystem within the run-time environment.
1257 We therefore introduced separate start phases for the NDB subsystem.
1258 Over time the need for such subsystem startup phases has disappeared,
1259 but the software is already engineered for this and thus it's been kept in
1260 this manner.
1261
1262 3) Also the responsibility for the distributed parts of the database start
1263 is divided. QMGR is responsible for discovering when nodes are up and down.
1264 NDBCNTR maintains the protocols for failure handling and other changes of the
1265 node configuration. Finally DBDIH is responsible for the distributed start of
1266 the database parts. It interacts a lot with DBLQH, which has the local
1267 responsibility of starting one node's database part as directed by DBDIH.
1268
1269 Local checkpoint processing in MySQL Cluster
1270 --------------------------------------------
1271
1272 This comment attempts to describe the processing of checkpoints as it happens
1273 in MySQL Cluster. It also clarifies where potential bottlenecks are. This
1274 comment is mainly intended as internal documentation of the open source code
1275 of MySQL Cluster.
1276
1277 The reason for local checkpoints in MySQL Cluster is to ensure that we have a
1278 copy of the data on disk which the REDO log can be run against to restore
1279 the data in MySQL Cluster after a crash.
1280
1281 We start by introducing the different restart variants in MySQL Cluster. The
1282 first variant is a normal node restart; this means that the node has been
1283 missing for a short time, but is now back online again. We start by installing
1284 a checkpointed version of all tables (including executing the proper parts of
1285 the REDO log against it). The next step is to use the replicas which are still
1286 online to bring the checkpointed version up to date. Replicas are always
1287 organised in node groups; the most common size of a node group is two nodes.
1288 So when a node starts up, it uses the other node in the same node group to get
1289 an online version of its tables back online. In a normal node restart we have
1290 first restored a somewhat old version of all tables before using the other
1291 node to synchronize it. This means that we only need to ship the latest
1292 version of the rows that have been updated since the node failed before the
1293 node restart. We also have the case of initial node restarts where all data
1294 has to be restored from the other node, since the checkpoint in the starting
1295 node is either too old to be reused or not there at all when a completely
1296 new node is started up.
1297
1298 The third variant of restart is a so called system restart; this means that
1299 the entire cluster is starting up after a cluster crash or after a controlled
1300 stop of the cluster. In this restart type we first restore a checkpoint on all
1301 nodes before running the REDO log to get the system into a consistent and
1302 up-to-date state. If any node was restored to an older global checkpoint than
1303 the one to restart from, then it is necessary to use the same code used in
1304 node restarts to bring those nodes to an online state.
1305
1306 The system restart will restore a so called global checkpoint. A set of
1307 transactions are grouped together into a global checkpoint; when this global
1308 checkpoint has been completed the transactions belonging to it are safe and
1309 will survive a cluster crash. Global checkpoints are run on the level of
1310 seconds, whereas local checkpoints write the entire data set to disk and are
1311 a longer process taking at least minutes.
1312
1313 Before a starting node can be declared as fully restored it has to participate
1314 in a local checkpoint. The crashed node is missing a set of REDO log records
1315 needed to restore the cluster; thus the node isn't fully restored until it can
1316 be used to restore all data it owns in a system restart.
1317
1318 So when performing a rolling node restart where all nodes in the cluster are
1319 restarted (e.g. to upgrade the software in MySQL Cluster), it makes sense to
1320 restart a set of nodes at a time since we can only have one set of nodes
1321 restarted at a time.
1322
1323 This was a bit of a prerequisite to understand the need for local checkpoints.
1324 We now move on to the description of how a local checkpoint is processed.
1325
1326 The local checkpoint is a distributed process. It is controlled by a
1327 software module called DBDIH (or DIH for short, DIstribution Handler).
1328 DIH contains all the information about where the various replicas of each
1329 fragment (synonymous with partition) are placed and various data on these
1330 replicas. DIH stores the distribution information in one file per table. This
1331 file is actually two files; this is to ensure that we can do careful writing
1332 of the file. We first write file 0 and, when this is completed, we write file
1333 1; in this manner we can easily handle any crash while writing the table
1334 description.
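
As a rough sketch of the careful-write idea (this is not the actual DIH file
code; the helpers and names below are made up for illustration, and real code
would also fsync), the point is that one complete copy is always on disk while
the other copy is being rewritten:

  #include <cstddef>
  #include <cstdio>
  #include <string>

  // Write one copy completely and flush it before touching the other copy,
  // so a crash during either write still leaves one usable copy on disk.
  static bool writeCopy(const std::string & name,
                        const void * data, std::size_t len)
  {
    FILE * f = fopen(name.c_str(), "wb");
    if (f == nullptr)
      return false;
    const bool ok = fwrite(data, 1, len, f) == len && fflush(f) == 0;
    fclose(f);
    return ok;
  }

  static bool carefulWrite(const std::string & base,
                           const void * data, std::size_t len)
  {
    return writeCopy(base + ".0", data, len) &&  // file 0 first
           writeCopy(base + ".1", data, len);    // then file 1
  }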
1335
1336 When a local checkpoint has been completed, DIH immediately starts the
1337 process to start the next checkpoint. At least one global checkpoint has
1338 to have been completed since the start of the previous local checkpoint
1339 before we will start a new local checkpoint.
1340
1341 The first step in the next local checkpoint is to check whether we're ready to
1342 run it yet. This is performed by sending the message TCGETOPSIZEREQ to all
1343 TC's in the cluster. This reports back the amount of REDO log information
1344 generated, based on the information recorded in TC for all write
1345 transactions. The message is sent by the master DIH. The role of the
1346 master is assigned to the oldest surviving data node; this makes it easy to
1347 select a new master whenever the data node currently acting as master dies.
1348 All nodes agree on the order in which nodes entered the cluster, so the age of
1349 a node is consistent in all nodes in the cluster.
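
A minimal sketch of that master selection rule (illustrative only; the real
logic lives in the node handling code, and the names below are invented):

  #include <cstdint>
  #include <vector>

  // nodesByEntryOrder holds node ids in the order all nodes agree the nodes
  // entered the cluster; alive[nodeId] says whether a node is still up.
  // The master is simply the oldest surviving node in that order.
  static uint32_t selectMaster(const std::vector<uint32_t> & nodesByEntryOrder,
                               const std::vector<bool> & alive)
  {
    for (uint32_t nodeId : nodesByEntryOrder)
      if (nodeId < alive.size() && alive[nodeId])
        return nodeId;
    return 0;  // no survivor found; 0 is used here as "no master"
  }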
1350
1351 When all messages have returned the REDO log write size to the master
1352 DIH we compare it to the config variable TimeBetweenLocalCheckpoints
1353 (this variable is set as a logarithm of the size, so e.g. 25 means we wait
1354 until 2^25 words of REDO log have been created in the cluster, which is
1355 128 MByte of REDO log info).
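
As a worked example of that threshold arithmetic (a sketch only; the function
and parameter names are made up, the real check is done in DIH):

  #include <cstdint>

  // TimeBetweenLocalCheckpoints is a base-2 logarithm of the REDO volume in
  // 4-byte words: 25 means 2^25 words = 33554432 words = 134217728 bytes,
  // i.e. 128 MByte of REDO log information.
  static uint64_t lcpTriggerBytes(uint32_t timeBetweenLocalCheckpoints)
  {
    const uint64_t words = uint64_t(1) << timeBetweenLocalCheckpoints;
    return words * 4;
  }

  static bool shouldStartLcp(uint64_t redoWordsSinceLastLcp,
                             uint32_t timeBetweenLocalCheckpoints)
  {
    return redoWordsSinceLastLcp * 4 >=
           lcpTriggerBytes(timeBetweenLocalCheckpoints);
  }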
1356
1357 When a sufficient amount of REDO log has been generated, we start the next
1358 local checkpoint. The first step is to clear all TC counters; this is done by
1359 sending TC_CLOPSIZEREQ to all TC's in the cluster.
1360
1361 The next step is to calculate the keep GCI (this is the oldest global
1362 checkpoint id that needs to be retained in the REDO log). This number is very
1363 important since it's the point where we can move the tail of the REDO log
1364 forward. If we run out of REDO log space we will not be able to run any
1365 writing transactions until we have started the next local checkpoint and
1366 thereby moved the REDO log tail forward.
1367
1368 We calculate this number by checking, for each fragment, what GCI it needs in
1369 order to be restored. We currently keep two old local checkpoints still valid,
1370 so we won't move the keep GCI in a way that would invalidate those two local
1371 checkpoints for any fragment. The GCI that will be restorable after completing
1372 this calculation is the minimum GCI found over all fragments when looping over them.
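
A minimal sketch of the final step of that calculation (illustrative only;
the real code walks DIH's fragment records):

  #include <cstdint>
  #include <vector>

  // fragKeepGci[i] is the oldest GCI fragment i needs in order to restore
  // its retained local checkpoints. The cluster-wide keep GCI is the minimum
  // over all fragments; the REDO log tail may only move forward up to it.
  static uint32_t calcKeepGci(const std::vector<uint32_t> & fragKeepGci)
  {
    uint32_t keepGci = UINT32_MAX;
    for (uint32_t gci : fragKeepGci)
      if (gci < keepGci)
        keepGci = gci;
    return keepGci;
  }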
1373
1374 Next we write this number and the new local checkpoint id and some other
1375 information in the Sysfile of all nodes in the cluster. This Sysfile is the
1376 first thing we look at when starting a restore of the cluster in a system
1377 restart, so it's important to have this type of information correct in this
1378 file.
1379
1380 When this is done we calculate which nodes will participate in the
1381 local checkpoint (nodes currently performing the early parts of a restart are
1382 not part of the local checkpoint, and obviously neither are dead nodes).
1383
1384 We send the information about the starting local checkpoint to all other DIH's
1385 in the system. We must keep all other DIH's up-to-date all the time to ensure
1386 it is easy to continue the local checkpoint even when the master DIH crashes
1387 or is stopped in the middle of the local checkpoint process. Each DIH records
1388 the set of nodes participating in the local checkpoint. They also set a flag
1389 on each replica record indicating that a local checkpoint is ongoing, and on
1390 each fragment record we also set the number of replicas that are part of this
1391 local checkpoint.
1392
1393 Now we have completed the preparations for the local checkpoint, and it is
1394 time to start the actual checkpoint writing of the data. The
1395 master DIH controls this process by sending off an LCP_FRAG_ORD for each
1396 fragment replica that should be checkpointed. DIH can currently have 2 such
1397 LCP_FRAG_ORDs outstanding per node and 2 fragment replicas queued. Each LDM
1398 thread can write one fragment replica at a time and it can
1399 have one request for the next fragment replica queued. It's fairly
1400 straightforward to extend these numbers such that more fragment replicas can
1401 be written in parallel and more can be queued.
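
The bookkeeping needed for that throttling can be sketched as follows (the
struct and limits below are illustrative, not the actual DIH data structures):

  #include <cstdint>

  struct LcpNodeState
  {
    uint32_t outstanding = 0;  // fragment replicas currently being written
    uint32_t queued = 0;       // fragment replicas queued behind them
  };

  static bool canSendLcpFragOrd(const LcpNodeState & node)
  {
    // A new LCP_FRAG_ORD either starts immediately (fewer than 2 are being
    // written) or is queued (fewer than 2 are already queued for the node).
    return node.outstanding < 2 || node.queued < 2;
  }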
1402
1403 LCP_FRAG_REP is sent to all DIH's when the local checkpoint for a fragment
1404 replica is completed. When a DIH discovers that all fragment replicas of a
1405 table have completed the local checkpoint, then it's time to write the table
1406 description to the file system. This records the interesting local
1407 checkpoint information for all of the fragment replicas. There are two things
1408 that can cause this to wait. First, writing and reading of an entire table
1409 description can only happen one at a time; this mainly becomes a factor when
1410 there is some node failure handling ongoing while the local
1411 checkpoint is being processed.
1412
1413 The second thing that can block the writing of a table description is that
1414 currently a maximum of 4 table descriptions can be written in parallel. This
1415 could easily become a bottleneck since each write of a file can take on the
1416 order of fifty milliseconds. This means we can currently only write about 80
1417 such tables per second (4 parallel writes / 0.05 s per write). In a system with
1418 many tables and little data this could become a bottleneck, but not a severe one.
1419
1420 When the master DIH has sent all requests to checkpoint all fragment replicas
1421 it will send a special LCP_FRAG_ORD to all nodes indicating that no more
1422 fragment replicas will be sent out.
1423 */
1424
1425 void
1426 Ndbcntr::execREAD_CONFIG_REQ(Signal* signal)
1427 {
1428 jamEntry();
1429
1430 const ReadConfigReq * req = (ReadConfigReq*)signal->getDataPtr();
1431
1432 Uint32 ref = req->senderRef;
1433 Uint32 senderData = req->senderData;
1434
1435 const ndb_mgm_configuration_iterator * p =
1436 m_ctx.m_config.getOwnConfigIterator();
1437 ndbrequire(p != 0);
1438
1439 Uint32 dl = 0;
1440 ndb_mgm_get_int_parameter(p, CFG_DB_DISCLESS, &dl);
1441 if (dl == 0)
1442 {
1443 const char * lgspec = 0;
1444 char buf[1024];
1445 if (!ndb_mgm_get_string_parameter(p, CFG_DB_DD_LOGFILEGROUP_SPEC, &lgspec))
1446 {
1447 jam();
1448
1449 if (parse_spec(f_dd, lgspec, DictTabInfo::LogfileGroup))
1450 {
1451 BaseString::snprintf(buf, sizeof(buf),
1452 "Unable to parse InitialLogfileGroup: %s", lgspec);
1453 progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
1454 }
1455 }
1456
1457 const char * tsspec = 0;
1458 if (!ndb_mgm_get_string_parameter(p, CFG_DB_DD_TABLEPACE_SPEC, &tsspec))
1459 {
1460 if (f_dd.size() == 0)
1461 {
1462 warningEvent("InitialTablespace specified, "
1463 "but InitialLogfileGroup is not!");
1464 warningEvent("Ignoring InitialTablespace: %s",
1465 tsspec);
1466 }
1467 else
1468 {
1469 if (parse_spec(f_dd, tsspec, DictTabInfo::Tablespace))
1470 {
1471 BaseString::snprintf(buf, sizeof(buf),
1472 "Unable to parse InitialTablespace: %s", tsspec);
1473 progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
1474 }
1475 }
1476 }
1477 }
1478
1479 struct ddentry empty;
1480 empty.type = ~0;
1481 f_dd.push_back(empty);
1482
1483 if (true)
1484 {
1485 // TODO: add config parameter
1486 // remove ATTRIBUTE_MASK2
1487 g_sysTable_NDBEVENTS_0.columnCount--;
1488 }
1489
1490 ReadConfigConf * conf = (ReadConfigConf*)signal->getDataPtrSend();
1491 conf->senderRef = reference();
1492 conf->senderData = senderData;
1493 sendSignal(ref, GSN_READ_CONFIG_CONF, signal,
1494 ReadConfigConf::SignalLength, JBB);
1495 }
1496
1497 void Ndbcntr::execSTTOR(Signal* signal)
1498 {
1499 jamEntry();
1500 cstartPhase = signal->theData[1];
1501
1502 cndbBlocksCount = 0;
1503 cinternalStartphase = cstartPhase - 1;
1504
1505 switch (cstartPhase) {
1506 case 0:
1507 if (m_ctx.m_config.getInitialStart())
1508 {
1509 jam();
1510 g_eventLogger->info("Clearing filesystem in initial start");
1511 c_fsRemoveCount = 0;
1512 clearFilesystem(signal);
1513 return;
1514 }
1515 sendSttorry(signal);
1516 break;
1517 case ZSTART_PHASE_1:
1518 jam();
1519 startPhase1Lab(signal);
1520 break;
1521 case ZSTART_PHASE_2:
1522 jam();
1523 startPhase2Lab(signal);
1524 break;
1525 case ZSTART_PHASE_3:
1526 jam();
1527 startPhase3Lab(signal);
1528 break;
1529 case ZSTART_PHASE_4:
1530 jam();
1531 startPhase4Lab(signal);
1532 break;
1533 case ZSTART_PHASE_5:
1534 jam();
1535 startPhase5Lab(signal);
1536 break;
1537 case 6:
1538 jam();
1539 getNodeGroup(signal);
1540 sendSttorry(signal);
1541 break;
1542 case ZSTART_PHASE_8:
1543 jam();
1544 startPhase8Lab(signal);
1545 break;
1546 case ZSTART_PHASE_9:
1547 jam();
1548 startPhase9Lab(signal);
1549 break;
1550 default:
1551 jam();
1552 sendSttorry(signal);
1553 break;
1554 }//switch
1555 }//Ndbcntr::execSTTOR()
1556
1557 void
1558 Ndbcntr::getNodeGroup(Signal* signal){
1559 jam();
1560 CheckNodeGroups * sd = (CheckNodeGroups*)signal->getDataPtrSend();
1561 sd->requestType = CheckNodeGroups::Direct | CheckNodeGroups::GetNodeGroup;
1562 EXECUTE_DIRECT(DBDIH, GSN_CHECKNODEGROUPSREQ, signal,
1563 CheckNodeGroups::SignalLength);
1564 jamEntry();
1565 c_nodeGroup = sd->output;
1566 }
1567
1568 /*******************************/
1569 /* NDB_STTORRY */
1570 /*******************************/
1571 void Ndbcntr::execNDB_STTORRY(Signal* signal)
1572 {
1573 jamEntry();
1574 switch (cstartPhase) {
1575 case ZSTART_PHASE_2:
1576 jam();
1577 ph2GLab(signal);
1578 return;
1579 break;
1580 case ZSTART_PHASE_3:
1581 jam();
1582 ph3ALab(signal);
1583 return;
1584 break;
1585 case ZSTART_PHASE_4:
1586 jam();
1587 ph4BLab(signal);
1588 return;
1589 break;
1590 case ZSTART_PHASE_5:
1591 jam();
1592 ph5ALab(signal);
1593 return;
1594 break;
1595 case ZSTART_PHASE_6:
1596 jam();
1597 ph6ALab(signal);
1598 return;
1599 break;
1600 case ZSTART_PHASE_7:
1601 jam();
1602 ph6BLab(signal);
1603 return;
1604 break;
1605 case ZSTART_PHASE_8:
1606 jam();
1607 ph7ALab(signal);
1608 return;
1609 break;
1610 case ZSTART_PHASE_9:
1611 jam();
1612 g_eventLogger->info("NDB start phase 8 completed");
1613 ph8ALab(signal);
1614 return;
1615 break;
1616 default:
1617 jam();
1618 systemErrorLab(signal, __LINE__);
1619 return;
1620 break;
1621 }//switch
1622 }//Ndbcntr::execNDB_STTORRY()
1623
1624 void Ndbcntr::startPhase1Lab(Signal* signal)
1625 {
1626 jamEntry();
1627
1628 initData(signal);
1629
1630 cdynamicNodeId = 0;
1631
1632 NdbBlocksRecPtr ndbBlocksPtr;
1633 ndbBlocksPtr.i = 0;
1634 ptrAss(ndbBlocksPtr, ndbBlocksRec);
1635 ndbBlocksPtr.p->blockref = DBLQH_REF;
1636 ndbBlocksPtr.i = 1;
1637 ptrAss(ndbBlocksPtr, ndbBlocksRec);
1638 ndbBlocksPtr.p->blockref = DBDICT_REF;
1639 ndbBlocksPtr.i = 2;
1640 ptrAss(ndbBlocksPtr, ndbBlocksRec);
1641 ndbBlocksPtr.p->blockref = DBTUP_REF;
1642 ndbBlocksPtr.i = 3;
1643 ptrAss(ndbBlocksPtr, ndbBlocksRec);
1644 ndbBlocksPtr.p->blockref = DBACC_REF;
1645 ndbBlocksPtr.i = 4;
1646 ptrAss(ndbBlocksPtr, ndbBlocksRec);
1647 ndbBlocksPtr.p->blockref = DBTC_REF;
1648 ndbBlocksPtr.i = 5;
1649 ptrAss(ndbBlocksPtr, ndbBlocksRec);
1650 ndbBlocksPtr.p->blockref = DBDIH_REF;
1651 sendSttorry(signal);
1652 return;
1653 }
1654
1655 void Ndbcntr::execREAD_NODESREF(Signal* signal)
1656 {
1657 jamEntry();
1658 systemErrorLab(signal, __LINE__);
1659 return;
1660 }//Ndbcntr::execREAD_NODESREF()
1661
1662
1663 /*******************************/
1664 /* NDB_STARTREF */
1665 /*******************************/
1666 void Ndbcntr::execNDB_STARTREF(Signal* signal)
1667 {
1668 jamEntry();
1669 systemErrorLab(signal, __LINE__);
1670 return;
1671 }//Ndbcntr::execNDB_STARTREF()
1672
1673 /*******************************/
1674 /* STTOR */
1675 /*******************************/
1676 void Ndbcntr::startPhase2Lab(Signal* signal)
1677 {
1678 c_start.m_lastGci = 0;
1679 c_start.m_lastGciNodeId = getOwnNodeId();
1680
1681 DihRestartReq * req = CAST_PTR(DihRestartReq, signal->getDataPtrSend());
1682 req->senderRef = reference();
1683 if (ERROR_INSERTED(1021))
1684 {
1685 CLEAR_ERROR_INSERT_VALUE;
1686 sendSignalWithDelay(DBDIH_REF, GSN_DIH_RESTARTREQ, signal,
1687 30000, DihRestartReq::SignalLength);
1688 }
1689 else
1690 {
1691 sendSignal(DBDIH_REF, GSN_DIH_RESTARTREQ, signal,
1692 DihRestartReq::SignalLength, JBB);
1693 }
1694 return;
1695 }//Ndbcntr::startPhase2Lab()
1696
1697 /*******************************/
1698 /* DIH_RESTARTCONF */
1699 /*******************************/
1700 void Ndbcntr::execDIH_RESTARTCONF(Signal* signal)
1701 {
1702 jamEntry();
1703
1704 const DihRestartConf * conf = CAST_CONSTPTR(DihRestartConf,
1705 signal->getDataPtrSend());
1706 c_start.m_lastGci = conf->latest_gci;
1707 ctypeOfStart = NodeState::ST_SYSTEM_RESTART;
1708 cdihStartType = ctypeOfStart;
1709 ph2ALab(signal);
1710 return;
1711 }//Ndbcntr::execDIH_RESTARTCONF()
1712
1713 /*******************************/
1714 /* DIH_RESTARTREF */
1715 /*******************************/
1716 void Ndbcntr::execDIH_RESTARTREF(Signal* signal)
1717 {
1718 jamEntry();
1719 ctypeOfStart = NodeState::ST_INITIAL_START;
1720 cdihStartType = ctypeOfStart;
1721 ph2ALab(signal);
1722 return;
1723 }//Ndbcntr::execDIH_RESTARTREF()
1724
1725 void Ndbcntr::ph2ALab(Signal* signal)
1726 {
1727 /******************************/
1728 /* request configured nodes */
1729 /* from QMGR */
1730 /* READ_NODESREQ */
1731 /******************************/
1732 signal->theData[0] = reference();
1733 sendSignal(QMGR_REF, GSN_READ_NODESREQ, signal, 1, JBB);
1734 return;
1735 }//Ndbcntr::ph2ALab()
1736
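/**
 * Helper for the start timeouts read below: a configured value of 0 means
 * "no timeout" and is mapped to the maximum Uint64, so comparisons against
 * elapsed time will never trigger.
 */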
1737 inline
1738 Uint64
1739 setTimeout(Uint32 timeoutValue){
1740 return (timeoutValue != 0) ? timeoutValue : ~(Uint64)0;
1741 }
1742
1743 /*******************************/
1744 /* READ_NODESCONF */
1745 /*******************************/
1746 void Ndbcntr::execREAD_NODESCONF(Signal* signal)
1747 {
1748 jamEntry();
1749 const ReadNodesConf * readNodes = (ReadNodesConf *)&signal->theData[0];
1750
1751 cmasterNodeId = readNodes->masterNodeId;
1752 cdynamicNodeId = readNodes->ndynamicId;
1753
1754 /**
1755 * All defined nodes...
1756 */
1757 c_allDefinedNodes.assign(NdbNodeBitmask::Size, readNodes->allNodes);
1758 c_clusterNodes.assign(NdbNodeBitmask::Size, readNodes->clusterNodes);
1759
1760 Uint32 to_1 = 30000;
1761 Uint32 to_2 = 0;
1762 Uint32 to_3 = 0;
1763
1764 const ndb_mgm_configuration_iterator * p =
1765 m_ctx.m_config.getOwnConfigIterator();
1766
1767 ndbrequire(p != 0);
1768 ndb_mgm_get_int_parameter(p, CFG_DB_START_PARTIAL_TIMEOUT, &to_1);
1769 ndb_mgm_get_int_parameter(p, CFG_DB_START_PARTITION_TIMEOUT, &to_2);
1770 ndb_mgm_get_int_parameter(p, CFG_DB_START_FAILURE_TIMEOUT, &to_3);
1771
1772 c_start.m_startTime = NdbTick_getCurrentTicks();
1773 c_start.m_startPartialTimeout = setTimeout(to_1);
1774 c_start.m_startPartitionedTimeout = setTimeout(to_2);
1775 c_start.m_startFailureTimeout = setTimeout(to_3);
1776
1777 sendCntrStartReq(signal);
1778
1779 signal->theData[0] = ZSTARTUP;
1780 sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 1000, 1);
1781
1782 return;
1783 }
1784
1785 void
1786 Ndbcntr::execCM_ADD_REP(Signal* signal)
1787 {
1788 jamEntry();
1789 c_clusterNodes.set(signal->theData[0]);
1790 }
1791
1792 void
1793 Ndbcntr::sendCntrStartReq(Signal * signal)
1794 {
1795 jamEntry();
1796
1797 if (getOwnNodeId() == cmasterNodeId)
1798 {
1799 g_eventLogger->info("Asking master node to accept our start "
1800 "(we are master, GCI = %u)",
1801 c_start.m_lastGci);
1802 }
1803 else
1804 {
1805 g_eventLogger->info("Asking master node to accept our start "
1806 "(nodeId = %u is master), GCI = %u",
1807 cmasterNodeId,
1808 c_start.m_lastGci);
1809 }
1810
1811 CntrStartReq * req = (CntrStartReq*)signal->getDataPtrSend();
1812 req->startType = ctypeOfStart;
1813 req->lastGci = c_start.m_lastGci;
1814 req->nodeId = getOwnNodeId();
1815 sendSignal(calcNdbCntrBlockRef(cmasterNodeId), GSN_CNTR_START_REQ,
1816 signal, CntrStartReq::SignalLength, JBB);
1817 }
1818
1819 void
1820 Ndbcntr::execCNTR_START_REF(Signal * signal){
1821 jamEntry();
1822 const CntrStartRef * ref = (CntrStartRef*)signal->getDataPtr();
1823
1824 switch(ref->errorCode){
1825 case CntrStartRef::NotMaster:
1826 jam();
1827 cmasterNodeId = ref->masterNodeId;
1828 sendCntrStartReq(signal);
1829 return;
1830 case CntrStartRef::StopInProgress:
1831 jam();
1832 progError(__LINE__, NDBD_EXIT_RESTART_DURING_SHUTDOWN);
1833 }
1834 ndbrequire(false);
1835 }
1836
1837 void
1838 Ndbcntr::StartRecord::reset(){
1839 m_starting.clear();
1840 m_waiting.clear();
1841 m_withLog.clear();
1842 m_withoutLog.clear();
1843 m_waitTO.clear();
1844 m_lastGci = m_lastGciNodeId = 0;
1845 m_startPartialTimeout = ~0;
1846 m_startPartitionedTimeout = ~0;
1847 m_startFailureTimeout = ~0;
1848
1849 m_logNodesCount = 0;
1850 bzero(m_wait_sp, sizeof(m_wait_sp));
1851 }
1852
1853 void
1854 Ndbcntr::execCNTR_START_CONF(Signal * signal){
1855 jamEntry();
1856 const CntrStartConf * conf = (CntrStartConf*)signal->getDataPtr();
1857
1858 cnoStartNodes = conf->noStartNodes;
1859 ctypeOfStart = (NodeState::StartType)conf->startType;
1860 cdihStartType = ctypeOfStart;
1861 c_start.m_lastGci = conf->startGci;
1862 cmasterNodeId = conf->masterNodeId;
1863 NdbNodeBitmask tmp;
1864 tmp.assign(NdbNodeBitmask::Size, conf->startedNodes);
1865 c_startedNodes.bitOR(tmp);
1866 c_start.m_starting.assign(NdbNodeBitmask::Size, conf->startingNodes);
1867 m_cntr_start_conf = true;
1868 g_eventLogger->info("NDBCNTR master accepted us into cluster,"
1869 " start NDB start phase 1");
1870 switch (ctypeOfStart)
1871 {
1872 case NodeState::ST_INITIAL_START:
1873 {
1874 g_eventLogger->info("We are performing initial start of cluster");
1875 break;
1876 }
1877 case NodeState::ST_INITIAL_NODE_RESTART:
1878 {
1879 g_eventLogger->info("We are performing initial node restart");
1880 break;
1881 }
1882 case NodeState::ST_NODE_RESTART:
1883 {
1884 g_eventLogger->info("We are performing a node restart");
1885 break;
1886 }
1887 case NodeState::ST_SYSTEM_RESTART:
1888 {
1889 g_eventLogger->info("We are performing a restart of the cluster");
1890 break;
1891 }
1892 default:
1893 {
1894 ndbrequire(false);
1895 break;
1896 }
1897 }
1898 ph2GLab(signal);
1899 }
1900
1901 /**
1902 * Tried with parallell nr, but it crashed in DIH
1903 * so I turned it off, as I don't want to debug DIH now...
1904 * Jonas 19/11-03
1905 *
1906 * After trying for 2 hours, I gave up.
1907 * DIH is not designed to support it, and
1908  * it requires quite a lot of changes to
1909 * make it work
1910 * Jonas 5/12-03
1911 */
1912 #define PARALLELL_NR 0
1913
1914 #if PARALLELL_NR
1915 const bool parallellNR = true;
1916 #else
1917 const bool parallellNR = false;
1918 #endif
1919
1920 void
1921 Ndbcntr::execCNTR_START_REP(Signal* signal){
1922 jamEntry();
1923 Uint32 nodeId = signal->theData[0];
1924
1925 c_startedNodes.set(nodeId);
1926 c_start.m_starting.clear(nodeId);
1927
1928 /**
1929 * Inform all interested blocks that node has started
1930 */
1931 for(Uint32 i = 0; i<ALL_BLOCKS_SZ; i++){
1932 sendSignal(ALL_BLOCKS[i].Ref, GSN_NODE_START_REP, signal, 1, JBB);
1933 }
1934
1935 signal->theData[0] = nodeId;
1936 execSTART_PERMREP(signal);
1937 }
1938
1939 void
1940 Ndbcntr::execSTART_PERMREP(Signal* signal)
1941 {
1942 Uint32 nodeId = signal->theData[0];
1943 c_startedNodes.set(nodeId);
1944 c_start.m_starting.clear(nodeId);
1945
1946 if(!c_start.m_starting.isclear()){
1947 jam();
1948 return;
1949 }
1950
1951 if(cmasterNodeId != getOwnNodeId()){
1952 jam();
1953 c_start.reset();
1954 return;
1955 }
1956
1957 if(c_start.m_waiting.isclear()){
1958 jam();
1959 c_start.reset();
1960 return;
1961 }
1962
1963 startWaitingNodes(signal);
1964 }
1965
1966 void
1967 Ndbcntr::execCNTR_START_REQ(Signal * signal){
1968 jamEntry();
1969 const CntrStartReq * req = (CntrStartReq*)signal->getDataPtr();
1970
1971 const Uint32 nodeId = req->nodeId;
1972 const Uint32 lastGci = req->lastGci;
1973 const NodeState::StartType st = (NodeState::StartType)req->startType;
1974
1975 if(cmasterNodeId == 0){
1976 jam();
1977 // Has not completed READNODES yet
1978 sendSignalWithDelay(reference(), GSN_CNTR_START_REQ, signal, 100,
1979 signal->getLength());
1980 return;
1981 }
1982
1983 if(cmasterNodeId != getOwnNodeId()){
1984 jam();
1985 sendCntrStartRef(signal, nodeId, CntrStartRef::NotMaster);
1986 return;
1987 }
1988
1989 const NodeState & nodeState = getNodeState();
1990 switch(nodeState.startLevel){
1991 case NodeState::SL_NOTHING:
1992 case NodeState::SL_CMVMI:
1993 jam();
1994 ndbrequire(false);
1995 case NodeState::SL_STARTING:
1996 case NodeState::SL_STARTED:
1997 jam();
1998 break;
1999
2000 case NodeState::SL_STOPPING_1:
2001 case NodeState::SL_STOPPING_2:
2002 case NodeState::SL_STOPPING_3:
2003 case NodeState::SL_STOPPING_4:
2004 jam();
2005 sendCntrStartRef(signal, nodeId, CntrStartRef::StopInProgress);
2006 return;
2007 }
2008
2009 /**
2010 * Am I starting (or started)
2011 */
2012 const bool starting = (nodeState.startLevel != NodeState::SL_STARTED);
2013
2014 c_start.m_waiting.set(nodeId);
2015 switch(st){
2016 case NodeState::ST_INITIAL_START:
2017 jam();
2018 c_start.m_withoutLog.set(nodeId);
2019 break;
2020 case NodeState::ST_SYSTEM_RESTART:
2021 jam();
2022 c_start.m_withLog.set(nodeId);
2023 if(starting && lastGci > c_start.m_lastGci){
2024 jam();
2025 CntrStartRef * ref = (CntrStartRef*)signal->getDataPtrSend();
2026 ref->errorCode = CntrStartRef::NotMaster;
2027 ref->masterNodeId = nodeId;
2028 NodeReceiverGroup rg (NDBCNTR, c_start.m_waiting);
2029 sendSignal(rg, GSN_CNTR_START_REF, signal,
2030 CntrStartRef::SignalLength, JBB);
2031 return;
2032 }
2033 if(starting){
2034 jam();
2035 Uint32 i = c_start.m_logNodesCount++;
2036 c_start.m_logNodes[i].m_nodeId = nodeId;
2037 c_start.m_logNodes[i].m_lastGci = req->lastGci;
2038 }
2039 break;
2040 case NodeState::ST_NODE_RESTART:
2041 case NodeState::ST_INITIAL_NODE_RESTART:
2042 case NodeState::ST_ILLEGAL_TYPE:
2043 ndbrequire(false);
2044 }
2045
2046 const bool startInProgress = !c_start.m_starting.isclear();
2047
2048 if ((starting && startInProgress) || (startInProgress && !parallellNR))
2049 {
2050 jam();
2051 /**
2052 * We're already starting together with a bunch of nodes
2053 * Let this node wait...
2054 *
2055 * We will report the wait to DBDIH to keep track of waiting times in
2056 * the restart. We only report when a node restart is ongoing (that is
2057 * we are not starting ourselves).
2058 */
2059 if (!starting)
2060 {
2061 NdbcntrStartWaitRep *rep = (NdbcntrStartWaitRep*)signal->getDataPtrSend();
2062 rep->nodeId = nodeId;
2063 EXECUTE_DIRECT(DBDIH, GSN_NDBCNTR_START_WAIT_REP, signal,
2064 NdbcntrStartWaitRep::SignalLength);
2065 return;
2066 }
2067 }
2068
2069 if(starting){
2070 jam();
2071 trySystemRestart(signal);
2072 } else {
2073 jam();
2074 startWaitingNodes(signal);
2075 }
2076 return;
2077 }
2078
2079 void
2080 Ndbcntr::startWaitingNodes(Signal * signal){
2081
2082 #if ! PARALLELL_NR
2083 if (!c_start.m_waitTO.isclear())
2084 {
2085 jam();
2086
2087 {
2088 char buf[100];
2089 ndbout_c("starting (TO) %s", c_start.m_waitTO.getText(buf));
2090 }
2091
2092 /**
2093 * TO during SR
2094 * this can run in parallel (nowadays :-)
2095 */
2096 NodeReceiverGroup rg(NDBCNTR, c_start.m_waitTO);
2097 c_start.m_starting.bitOR(c_start.m_waitTO);
2098 c_start.m_waiting.bitANDC(c_start.m_waitTO);
2099 c_start.m_waitTO.clear();
2100
2101 /**
2102 * They are stuck in CntrWaitRep::ZWAITPOINT_4_1
2103 * have all meta data ok...but needs START_COPYREQ
2104 */
2105 CntrWaitRep* rep = (CntrWaitRep*)signal->getDataPtrSend();
2106 rep->nodeId = getOwnNodeId();
2107 rep->waitPoint = CntrWaitRep::ZWAITPOINT_4_2_TO;
2108 sendSignal(rg, GSN_CNTR_WAITREP, signal, 2, JBB);
2109 return;
2110 }
2111
2112 const Uint32 nodeId = c_start.m_waiting.find(0);
2113 const Uint32 Tref = calcNdbCntrBlockRef(nodeId);
2114 ndbrequire(nodeId != c_start.m_waiting.NotFound);
2115
2116 NodeState::StartType nrType = NodeState::ST_NODE_RESTART;
2117 const char *start_type_str = "node restart";
2118 if(c_start.m_withoutLog.get(nodeId))
2119 {
2120 jam();
2121 nrType = NodeState::ST_INITIAL_NODE_RESTART;
2122 start_type_str = "initial node restart";
2123 }
2124
2125 /**
2126 * Let node perform restart
2127 */
2128 infoEvent("Start node: %u using %s as part of system restart",
2129 nodeId, start_type_str);
2130
2131 CntrStartConf * conf = (CntrStartConf*)signal->getDataPtrSend();
2132 conf->noStartNodes = 1;
2133 conf->startType = nrType;
2134 conf->startGci = ~0; // Not used
2135 conf->masterNodeId = getOwnNodeId();
2136 BitmaskImpl::clear(NdbNodeBitmask::Size, conf->startingNodes);
2137 BitmaskImpl::set(NdbNodeBitmask::Size, conf->startingNodes, nodeId);
2138 c_startedNodes.copyto(NdbNodeBitmask::Size, conf->startedNodes);
2139 sendSignal(Tref, GSN_CNTR_START_CONF, signal,
2140 CntrStartConf::SignalLength, JBB);
2141
2142 /**
2143 * A node restart is ongoing where we are master and we just accepted this
2144 * node to proceed with his node restart. Inform DBDIH about this event in
2145 * the node restart.
2146 */
2147 NdbcntrStartedRep *rep = (NdbcntrStartedRep*)signal->getDataPtrSend();
2148 rep->nodeId = nodeId;
2149 EXECUTE_DIRECT(DBDIH, GSN_NDBCNTR_STARTED_REP, signal,
2150 NdbcntrStartedRep::SignalLength);
2151
2152 c_start.m_waiting.clear(nodeId);
2153 c_start.m_withLog.clear(nodeId);
2154 c_start.m_withoutLog.clear(nodeId);
2155 c_start.m_starting.set(nodeId);
2156 #else
2157 // Parallell nr
2158
2159 c_start.m_starting = c_start.m_waiting;
2160 c_start.m_waiting.clear();
2161
2162 CntrStartConf * conf = (CntrStartConf*)signal->getDataPtrSend();
2163 conf->noStartNodes = 1;
2164 conf->startGci = ~0; // Not used
2165 conf->masterNodeId = getOwnNodeId();
2166 c_start.m_starting.copyto(NdbNodeBitmask::Size, conf->startingNodes);
2167 c_startedNodes.copyto(NdbNodeBitmask::Size, conf->startedNodes);
2168
2169 char buf[100];
2170 if(!c_start.m_withLog.isclear()){
2171 jam();
2172 ndbout_c("Starting nodes w/ log: %s", c_start.m_withLog.getText(buf));
2173
2174 NodeReceiverGroup rg(NDBCNTR, c_start.m_withLog);
2175 conf->startType = NodeState::ST_NODE_RESTART;
2176
2177 sendSignal(rg, GSN_CNTR_START_CONF, signal,
2178 CntrStartConf::SignalLength, JBB);
2179 }
2180
2181 if(!c_start.m_withoutLog.isclear()){
2182 jam();
2183 ndbout_c("Starting nodes wo/ log: %s", c_start.m_withoutLog.getText(buf));
2184 NodeReceiverGroup rg(NDBCNTR, c_start.m_withoutLog);
2185 conf->startType = NodeState::ST_INITIAL_NODE_RESTART;
2186
2187 sendSignal(rg, GSN_CNTR_START_CONF, signal,
2188 CntrStartConf::SignalLength, JBB);
2189 }
2190
2191 c_start.m_waiting.clear();
2192 c_start.m_withLog.clear();
2193 c_start.m_withoutLog.clear();
2194 #endif
2195 }
2196
2197 void
2198 Ndbcntr::sendCntrStartRef(Signal * signal,
2199 Uint32 nodeId, CntrStartRef::ErrorCode code){
2200 CntrStartRef * ref = (CntrStartRef*)signal->getDataPtrSend();
2201 ref->errorCode = code;
2202 ref->masterNodeId = cmasterNodeId;
2203 sendSignal(calcNdbCntrBlockRef(nodeId), GSN_CNTR_START_REF, signal,
2204 CntrStartRef::SignalLength, JBB);
2205 }
2206
2207 CheckNodeGroups::Output
2208 Ndbcntr::checkNodeGroups(Signal* signal, const NdbNodeBitmask & mask){
2209 CheckNodeGroups* sd = (CheckNodeGroups*)&signal->theData[0];
2210 sd->blockRef = reference();
2211 sd->requestType = CheckNodeGroups::Direct | CheckNodeGroups::ArbitCheck;
2212 sd->mask = mask;
2213 EXECUTE_DIRECT(DBDIH, GSN_CHECKNODEGROUPSREQ, signal,
2214 CheckNodeGroups::SignalLength);
2215 jamEntry();
2216 return (CheckNodeGroups::Output)sd->output;
2217 }
2218
2219 bool
2220 Ndbcntr::trySystemRestart(Signal* signal){
2221 /**
2222 * System restart something
2223 */
2224 const bool allNodes = c_start.m_waiting.equal(c_allDefinedNodes);
2225 const bool allClusterNodes = c_start.m_waiting.equal(c_clusterNodes);
2226
2227 if(!allClusterNodes){
2228 jam();
2229 return false;
2230 }
2231
2232 NodeState::StartType srType = NodeState::ST_SYSTEM_RESTART;
2233 if(c_start.m_waiting.equal(c_start.m_withoutLog))
2234 {
2235 jam();
2236 srType = NodeState::ST_INITIAL_START;
2237 c_start.m_starting = c_start.m_withoutLog; // Used for starting...
2238 c_start.m_withoutLog.clear();
2239 } else {
2240
2241 CheckNodeGroups::Output wLog = checkNodeGroups(signal, c_start.m_withLog);
2242
2243 switch (wLog) {
2244 case CheckNodeGroups::Win:
2245 jam();
2246 break;
2247 case CheckNodeGroups::Lose:
2248 jam();
2249 // If we lose with all nodes, then we're in trouble
2250 ndbrequire(!allNodes);
2251 return false;
2252 case CheckNodeGroups::Partitioning:
2253 jam();
2254 bool allowPartition = (c_start.m_startPartitionedTimeout != (Uint64)~0);
2255
2256 if(allNodes){
2257 if(allowPartition){
2258 jam();
2259 break;
2260 }
2261 ndbrequire(false); // All nodes -> partitioning, which is not allowed
2262 }
2263
2264 break;
2265 }
2266
2267 // For now only with the "logged"-ones.
2268 // Let the others do node restart afterwards...
2269 c_start.m_starting = c_start.m_withLog;
2270 c_start.m_withLog.clear();
2271 }
2272
2273 /**
2274 * Okidoki, we try to start
2275 */
2276 CntrStartConf * conf = (CntrStartConf*)signal->getDataPtr();
2277 conf->noStartNodes = c_start.m_starting.count();
2278 conf->startType = srType;
2279 conf->startGci = c_start.m_lastGci;
2280 conf->masterNodeId = c_start.m_lastGciNodeId;
2281 c_start.m_starting.copyto(NdbNodeBitmask::Size, conf->startingNodes);
2282 c_startedNodes.copyto(NdbNodeBitmask::Size, conf->startedNodes);
2283
2284 ndbrequire(c_start.m_lastGciNodeId == getOwnNodeId());
2285
2286   infoEvent("System Restart: master node: %u, num starting: %u, gci: %u",
2287             conf->masterNodeId,
2288             conf->noStartNodes,
2289             conf->startGci);
2290 char buf[100];
2291 infoEvent("CNTR_START_CONF: started: %s", c_startedNodes.getText(buf));
2292 infoEvent("CNTR_START_CONF: starting: %s", c_start.m_starting.getText(buf));
2293
2294 NodeReceiverGroup rg(NDBCNTR, c_start.m_starting);
2295 sendSignal(rg, GSN_CNTR_START_CONF, signal, CntrStartConf::SignalLength,JBB);
2296
2297 c_start.m_waiting.bitANDC(c_start.m_starting);
2298
2299 return true;
2300 }
2301
2302 void Ndbcntr::ph2GLab(Signal* signal)
2303 {
2304 if (cndbBlocksCount < ZNO_NDB_BLOCKS)
2305 {
2306 jam();
2307 sendNdbSttor(signal);
2308 return;
2309 }//if
2310 g_eventLogger->info("NDB start phase 1 completed");
2311 sendSttorry(signal);
2312 return;
2313 }//Ndbcntr::ph2GLab()
2314
2315 /*
2316 4.4 START PHASE 3 */
2317 /*###########################################################################*/
2318 // SEND SIGNAL NDBSTTOR TO ALL BLOCKS, ACC, DICT, DIH, LQH, TC AND TUP
2319 // WHEN ALL BLOCKS HAVE RETURNED THEIR NDB_STTORRY ALL BLOCKS HAVE FINISHED
2320 // THEIR LOCAL CONNECTIONS SUCCESSFULLY
2321 // AND THEN WE CAN SEND APPL_STARTREG TO INFORM QMGR THAT WE ARE READY TO
2322 // SET UP DISTRIBUTED CONNECTIONS.
2323 /*--------------------------------------------------------------*/
2324 // THIS IS NDB START PHASE 3.
2325 /*--------------------------------------------------------------*/
2326 /*******************************/
2327 /* STTOR */
2328 /*******************************/
2329 void Ndbcntr::startPhase3Lab(Signal* signal)
2330 {
2331 g_eventLogger->info("Start NDB start phase 2");
2332 ph3ALab(signal);
2333 return;
2334 }//Ndbcntr::startPhase3Lab()
2335
2336 /*******************************/
2337 /* NDB_STTORRY */
2338 /*******************************/
2339 void Ndbcntr::ph3ALab(Signal* signal)
2340 {
2341 if (cndbBlocksCount < ZNO_NDB_BLOCKS)
2342 {
2343 jam();
2344 sendNdbSttor(signal);
2345 return;
2346 }//if
2347 g_eventLogger->info("NDB start phase 2 completed");
2348 sendSttorry(signal);
2349 return;
2350 }//Ndbcntr::ph3ALab()
2351
2352 /*
2353 4.5 START PHASE 4 */
2354 /*###########################################################################*/
2355 // WAIT FOR ALL NODES IN CLUSTER TO CHANGE STATE INTO ZSTART ,
2356 // APPL_CHANGEREP IS ALWAYS SENT WHEN SOMEONE HAS
2357 // CHANGED THEIR STATE. APPL_STARTCONF INDICATES THAT ALL NODES ARE IN START
2358 // STATE SEND NDB_STARTREQ TO DIH AND THEN WAIT FOR NDB_STARTCONF
2359 /*---------------------------------------------------------------------------*/
2360 /*******************************/
2361 /* STTOR */
2362 /*******************************/
2363 void Ndbcntr::startPhase4Lab(Signal* signal)
2364 {
2365 g_eventLogger->info("Start NDB start phase 3");
2366 ph4ALab(signal);
2367 }//Ndbcntr::startPhase4Lab()
2368
2369
2370 void Ndbcntr::ph4ALab(Signal* signal)
2371 {
2372 ph4BLab(signal);
2373 return;
2374 }//Ndbcntr::ph4ALab()
2375
2376 /*******************************/
2377 /* NDB_STTORRY */
2378 /*******************************/
2379 void Ndbcntr::ph4BLab(Signal* signal)
2380 {
2381 /*--------------------------------------*/
2382 /* CASE: CSTART_PHASE = ZSTART_PHASE_4 */
2383 /*--------------------------------------*/
2384 if (cndbBlocksCount < ZNO_NDB_BLOCKS)
2385 {
2386 jam();
2387 sendNdbSttor(signal);
2388 return;
2389 }//if
2390 if (ERROR_INSERTED(1010))
2391 {
2392 /* Just delay things for 10 seconds */
2393 CLEAR_ERROR_INSERT_VALUE;
2394 sendSignalWithDelay(reference(), GSN_NDB_STTORRY, signal,
2395 10000, 1);
2396 return;
2397 }
2398 g_eventLogger->info("NDB start phase 3 completed");
2399 if ((ctypeOfStart == NodeState::ST_NODE_RESTART) ||
2400 (ctypeOfStart == NodeState::ST_INITIAL_NODE_RESTART))
2401 {
2402 jam();
2403 sendSttorry(signal);
2404 return;
2405 }//if
2406 waitpoint41Lab(signal);
2407 return;
2408 }//Ndbcntr::ph4BLab()
2409
2410 void Ndbcntr::waitpoint41Lab(Signal* signal)
2411 {
2412 if (getOwnNodeId() == cmasterNodeId) {
2413 jam();
2414 /*--------------------------------------*/
2415 /* MASTER WAITS UNTIL ALL SLAVES HAS */
2416 /* SENT THE REPORTS */
2417 /*--------------------------------------*/
2418 cnoWaitrep++;
2419 if (cnoWaitrep == cnoStartNodes) {
2420 jam();
2421 cnoWaitrep = 0;
2422 /*---------------------------------------------------------------------------*/
2423 // NDB_STARTREQ STARTS UP ALL SET UP OF DISTRIBUTION INFORMATION IN DIH AND
2424 // DICT. AFTER SETTING UP THIS
2425 // DATA IT USES THAT DATA TO SET UP WHICH FRAGMENTS THAT ARE TO START AND
2426 // WHERE THEY ARE TO START. THEN
2427 // IT SETS UP THE FRAGMENTS AND RECOVERS THEM BY:
2428 // 1) READING A LOCAL CHECKPOINT FROM DISK.
2429 // 2) EXECUTING THE UNDO LOG ON INDEX AND DATA.
2430 // 3) EXECUTING THE FRAGMENT REDO LOG FROM ONE OR SEVERAL NODES TO
2431 // RESTORE THE RESTART CONFIGURATION OF DATA IN NDB CLUSTER.
2432 /*---------------------------------------------------------------------------*/
2433 signal->theData[0] = reference();
2434 signal->theData[1] = ctypeOfStart;
2435 sendSignal(DBDIH_REF, GSN_NDB_STARTREQ, signal, 2, JBB);
2436 }//if
2437 } else {
2438 jam();
2439 /*--------------------------------------*/
2440 /* SLAVE NODES WILL PASS HERE ONCE AND */
2441 /* SEND A WAITPOINT REPORT TO MASTER. */
2442 /* SLAVES WONT DO ANYTHING UNTIL THEY */
2443 /* RECEIVE A WAIT REPORT FROM THE MASTER*/
2444 /*--------------------------------------*/
2445 signal->theData[0] = getOwnNodeId();
2446 signal->theData[1] = CntrWaitRep::ZWAITPOINT_4_1;
2447 sendSignal(calcNdbCntrBlockRef(cmasterNodeId),
2448 GSN_CNTR_WAITREP, signal, 2, JBB);
2449 }//if
2450 return;
2451 }//Ndbcntr::waitpoint41Lab()
2452
2453 void
2454 Ndbcntr::waitpoint42To(Signal* signal)
2455 {
2456 jam();
2457
2458 /**
2459    * This is an ugly hack
2460    * to "easily" enable TO during SR
2461 * a better solution would be to move "all" start handling
2462 * from DIH to cntr...which knows what's going on
2463 */
2464 cdihStartType = NodeState::ST_SYSTEM_RESTART;
2465 ctypeOfStart = NodeState::ST_NODE_RESTART;
2466
2467 /**
2468 * This is immensely ugly...but makes TUX work (yuck)
2469 */
2470 {
2471 NodeStateRep* rep = (NodeStateRep*)signal->getDataPtrSend();
2472 rep->nodeState = getNodeState();
2473 rep->nodeState.masterNodeId = cmasterNodeId;
2474 rep->nodeState.setNodeGroup(c_nodeGroup);
2475 rep->nodeState.starting.restartType = NodeState::ST_NODE_RESTART;
2476
2477 sendSignal(DBTUX_REF, GSN_NODE_STATE_REP, signal,
2478 NodeStateRep::SignalLength, JBB);
2479 }
2480
2481 /**
2482 * We were forced to perform TO
2483 */
2484 StartCopyReq* req = (StartCopyReq*)signal->getDataPtrSend();
2485 req->senderRef = reference();
2486 req->senderData = RNIL;
2487 req->flags = StartCopyReq::WAIT_LCP;
2488 req->startingNodeId = getOwnNodeId();
2489 sendSignal(DBDIH_REF, GSN_START_COPYREQ, signal,
2490 StartCopyReq::SignalLength, JBB);
2491 }
2492
2493 void
2494 Ndbcntr::execSTART_COPYREF(Signal* signal)
2495 {
2496
2497 }
2498
2499 void
2500 Ndbcntr::execSTART_COPYCONF(Signal* signal)
2501 {
2502 sendSttorry(signal);
2503 }
2504
2505
2506 /*******************************/
2507 /* NDB_STARTCONF */
2508 /*******************************/
2509 void Ndbcntr::execNDB_STARTCONF(Signal* signal)
2510 {
2511 jamEntry();
2512
2513 NdbNodeBitmask tmp;
2514 if (signal->getLength() >= 1 + NdbNodeBitmask::Size)
2515 {
2516 jam();
2517 tmp.assign(NdbNodeBitmask::Size, signal->theData+1);
2518 if (!c_start.m_starting.equal(tmp))
2519 {
2520 /**
2521        * Some nodes have been "excluded" from SR
2522 */
2523 char buf0[100], buf1[100];
2524 g_eventLogger->info("execNDB_STARTCONF: changing from %s to %s",
2525 c_start.m_starting.getText(buf0),
2526 tmp.getText(buf1));
2527
2528 NdbNodeBitmask waiting = c_start.m_starting;
2529 waiting.bitANDC(tmp);
2530
2531 c_start.m_waiting.bitOR(waiting);
2532 c_start.m_waitTO.bitOR(waiting);
2533
2534 c_start.m_starting.assign(tmp);
2535 cnoStartNodes = c_start.m_starting.count();
2536 }
2537 }
2538
2539 NodeReceiverGroup rg(NDBCNTR, c_start.m_starting);
2540 signal->theData[0] = getOwnNodeId();
2541 signal->theData[1] = CntrWaitRep::ZWAITPOINT_4_2;
2542 c_start.m_starting.copyto(NdbNodeBitmask::Size, signal->theData+2);
2543 sendSignal(rg, GSN_CNTR_WAITREP, signal, 2 + NdbNodeBitmask::Size,
2544 JBB);
2545 return;
2546 }//Ndbcntr::execNDB_STARTCONF()
2547
2548 /*
2549 4.6 START PHASE 5 */
2550 /*###########################################################################*/
2551 // SEND APPL_RUN TO THE QMGR IN THIS BLOCK
2552 // SEND NDB_STTOR ALL BLOCKS ACC, DICT, DIH, LQH, TC AND TUP THEN WAIT FOR
2553 // THEIR NDB_STTORRY
2554 /*---------------------------------------------------------------------------*/
2555 /*******************************/
2556 /* STTOR */
2557 /*******************************/
2558 void Ndbcntr::startPhase5Lab(Signal* signal)
2559 {
2560 ph5ALab(signal);
2561 return;
2562 }//Ndbcntr::startPhase5Lab()
2563
2564 /*******************************/
2565 /* NDB_STTORRY */
2566 /*******************************/
2567 /*---------------------------------------------------------------------------*/
2568 // THIS IS NDB START PHASE 5.
2569 /*---------------------------------------------------------------------------*/
2570 // IN THIS START PHASE TUP INITIALISES DISK FILES FOR DISK STORAGE IF INITIAL
2571 // START. DIH WILL START UP
2572 // THE GLOBAL CHECKPOINT PROTOCOL AND WILL CONCLUDE ANY UNFINISHED TAKE OVERS
2573 // THAT STARTED BEFORE THE SYSTEM CRASH.
2574 /*---------------------------------------------------------------------------*/
2575 void Ndbcntr::ph5ALab(Signal* signal)
2576 {
2577 if (cndbBlocksCount < ZNO_NDB_BLOCKS)
2578 {
2579 jam();
2580 sendNdbSttor(signal);
2581 return;
2582 }//if
2583 g_eventLogger->info("NDB start phase 4 completed");
2584
2585 cstartPhase = cstartPhase + 1;
2586 cinternalStartphase = cstartPhase - 1;
2587 if (getOwnNodeId() == cmasterNodeId) {
2588 switch(ctypeOfStart){
2589 case NodeState::ST_INITIAL_START:
2590 jam();
2591 /*--------------------------------------*/
2592 /* MASTER CNTR IS RESPONSIBLE FOR */
2593 /* CREATING SYSTEM TABLES */
2594 /*--------------------------------------*/
2595 g_eventLogger->info("Creating System Tables Starting"
2596 " as part of initial start");
2597 beginSchemaTransLab(signal);
2598 return;
2599 case NodeState::ST_SYSTEM_RESTART:
2600 jam();
2601 g_eventLogger->info("As master we will wait for other nodes to reach"
2602 " the state waitpoint52 as well");
2603 waitpoint52Lab(signal);
2604 return;
2605 case NodeState::ST_NODE_RESTART:
2606 case NodeState::ST_INITIAL_NODE_RESTART:
2607 jam();
2608 break;
2609 case NodeState::ST_ILLEGAL_TYPE:
2610 jam();
2611 break;
2612 }
2613 ndbrequire(false);
2614 }
2615
2616 /**
2617 * Not master
2618 */
2619 NdbSttor * const req = (NdbSttor*)signal->getDataPtrSend();
2620 switch(ctypeOfStart){
2621 case NodeState::ST_NODE_RESTART:
2622 case NodeState::ST_INITIAL_NODE_RESTART:
2623 jam();
2624 /*----------------------------------------------------------------------*/
2625 // SEND NDB START PHASE 5 IN NODE RESTARTS TO COPY DATA TO THE NEWLY
2626 // STARTED NODE.
2627 /*----------------------------------------------------------------------*/
2628 req->senderRef = reference();
2629 req->nodeId = getOwnNodeId();
2630 req->internalStartPhase = cinternalStartphase;
2631 req->typeOfStart = cdihStartType;
2632 req->masterNodeId = cmasterNodeId;
2633
2634 g_eventLogger->info("Start NDB start phase 5 (only to DBDIH)");
2635 //#define TRACE_STTOR
2636 #ifdef TRACE_STTOR
2637 ndbout_c("sending NDB_STTOR(%d) to DIH", cinternalStartphase);
2638 #endif
2639 sendSignal(DBDIH_REF, GSN_NDB_STTOR, signal,
2640 NdbSttor::SignalLength, JBB);
2641 return;
2642 case NodeState::ST_INITIAL_START:
2643 case NodeState::ST_SYSTEM_RESTART:
2644 jam();
2645 /*--------------------------------------*/
2646 /* DURING SYSTEMRESTART AND INITALSTART:*/
2647 /* SLAVE NODES WILL PASS HERE ONCE AND */
2648 /* SEND A WAITPOINT REPORT TO MASTER. */
2649 /* SLAVES WONT DO ANYTHING UNTIL THEY */
2650 /* RECEIVE A WAIT REPORT FROM THE MASTER*/
2651 /* WHEN THE MASTER HAS FINISHED HIS WORK*/
2652 /*--------------------------------------*/
2653 g_eventLogger->info("During cluster start/restart only master runs"
2654 " phase 5 of NDB start phases");
2655 g_eventLogger->info("Report to master node our state and wait for master");
2656
2657 signal->theData[0] = getOwnNodeId();
2658 signal->theData[1] = CntrWaitRep::ZWAITPOINT_5_2;
2659 sendSignal(calcNdbCntrBlockRef(cmasterNodeId),
2660 GSN_CNTR_WAITREP, signal, 2, JBB);
2661 return;
2662 default:
2663 ndbrequire(false);
2664 }
2665 }//Ndbcntr::ph5ALab()
2666
2667 void Ndbcntr::waitpoint52Lab(Signal* signal)
2668 {
2669 cnoWaitrep = cnoWaitrep + 1;
2670 /*---------------------------------------------------------------------------*/
2671 // THIS WAITING POINT IS ONLY USED BY A MASTER NODE. WE WILL EXECUTE NDB START
2672 // PHASE 5 FOR DIH IN THE
2673 // MASTER. THIS WILL START UP LOCAL CHECKPOINTS AND WILL ALSO CONCLUDE ANY
2674 // UNFINISHED LOCAL CHECKPOINTS
2675 // BEFORE THE SYSTEM CRASH. THIS WILL ENSURE THAT WE ALWAYS RESTART FROM A
2676 // WELL KNOWN STATE.
2677 /*---------------------------------------------------------------------------*/
2678 /*--------------------------------------*/
2679 /* MASTER WAITS UNTIL HE RECEIVED WAIT */
2680 /* REPORTS FROM ALL SLAVE CNTR */
2681 /*--------------------------------------*/
2682 if (cnoWaitrep == cnoStartNodes) {
2683 jam();
2684 cnoWaitrep = 0;
2685
2686 g_eventLogger->info("Start NDB start phase 5 (only to DBDIH)");
2687 NdbSttor * const req = (NdbSttor*)signal->getDataPtrSend();
2688 req->senderRef = reference();
2689 req->nodeId = getOwnNodeId();
2690 req->internalStartPhase = cinternalStartphase;
2691 req->typeOfStart = cdihStartType;
2692 req->masterNodeId = cmasterNodeId;
2693 #ifdef TRACE_STTOR
2694 ndbout_c("sending NDB_STTOR(%d) to DIH", cinternalStartphase);
2695 #endif
2696 sendSignal(DBDIH_REF, GSN_NDB_STTOR, signal,
2697 NdbSttor::SignalLength, JBB);
2698 }//if
2699 return;
2700 }//Ndbcntr::waitpoint52Lab()
2701
2702 /*******************************/
2703 /* NDB_STTORRY */
2704 /*******************************/
2705 void Ndbcntr::ph6ALab(Signal* signal)
2706 {
2707 g_eventLogger->info("NDB start phase 5 completed");
2708 if ((ctypeOfStart == NodeState::ST_NODE_RESTART) ||
2709 (ctypeOfStart == NodeState::ST_INITIAL_NODE_RESTART))
2710 {
2711 jam();
2712 waitpoint51Lab(signal);
2713 return;
2714 }//if
2715
2716 NodeReceiverGroup rg(NDBCNTR, c_start.m_starting);
2717 rg.m_nodes.clear(getOwnNodeId());
2718 signal->theData[0] = getOwnNodeId();
2719 signal->theData[1] = CntrWaitRep::ZWAITPOINT_5_1;
2720 sendSignal(rg, GSN_CNTR_WAITREP, signal, 2, JBB);
2721
2722 waitpoint51Lab(signal);
2723 return;
2724 }//Ndbcntr::ph6ALab()
2725
2726 void Ndbcntr::waitpoint51Lab(Signal* signal)
2727 {
2728 cstartPhase = cstartPhase + 1;
2729 /*---------------------------------------------------------------------------*/
2730 // A FINAL STEP IS NOW TO SEND NDB_STTOR TO TC. THIS MAKES IT POSSIBLE TO
2731 // CONNECT TO TC FOR APPLICATIONS.
2732 // THIS IS NDB START PHASE 6 WHICH IS FOR ALL BLOCKS IN ALL NODES.
2733 /*---------------------------------------------------------------------------*/
2734 g_eventLogger->info("Start NDB start phase 6");
2735 cinternalStartphase = cstartPhase - 1;
2736 cndbBlocksCount = 0;
2737 ph6BLab(signal);
2738 return;
2739 }//Ndbcntr::waitpoint51Lab()
2740
2741 void Ndbcntr::ph6BLab(Signal* signal)
2742 {
2743 // c_missra.currentStartPhase - cstartPhase - cinternalStartphase =
2744 // 5 - 7 - 6
2745 if (cndbBlocksCount < ZNO_NDB_BLOCKS)
2746 {
2747 jam();
2748 sendNdbSttor(signal);
2749 return;
2750 }//if
2751 g_eventLogger->info("NDB start phase 6 completed");
2752 if ((ctypeOfStart == NodeState::ST_NODE_RESTART) ||
2753 (ctypeOfStart == NodeState::ST_INITIAL_NODE_RESTART))
2754 {
2755 jam();
2756 sendSttorry(signal);
2757 return;
2758 }
2759 waitpoint61Lab(signal);
2760 }
2761
2762 void Ndbcntr::waitpoint61Lab(Signal* signal)
2763 {
2764 if (getOwnNodeId() == cmasterNodeId) {
2765 jam();
2766 cnoWaitrep6++;
2767 if (cnoWaitrep6 == cnoStartNodes) {
2768 jam();
2769 NodeReceiverGroup rg(NDBCNTR, c_start.m_starting);
2770 rg.m_nodes.clear(getOwnNodeId());
2771 signal->theData[0] = getOwnNodeId();
2772 signal->theData[1] = CntrWaitRep::ZWAITPOINT_6_2;
2773 sendSignal(rg, GSN_CNTR_WAITREP, signal, 2, JBB);
2774 sendSttorry(signal);
2775 }
2776 } else {
2777 jam();
2778 signal->theData[0] = getOwnNodeId();
2779 signal->theData[1] = CntrWaitRep::ZWAITPOINT_6_1;
2780 sendSignal(calcNdbCntrBlockRef(cmasterNodeId), GSN_CNTR_WAITREP, signal, 2, JBB);
2781 }
2782 }
2783
2784 // Start phase 8 (internal 7)
2785 void Ndbcntr::startPhase8Lab(Signal* signal)
2786 {
2787 g_eventLogger->info("Start NDB start phase 7");
2788 cinternalStartphase = cstartPhase - 1;
2789 cndbBlocksCount = 0;
2790 ph7ALab(signal);
2791 }
2792
2793 void Ndbcntr::ph7ALab(Signal* signal)
2794 {
2795 while (cndbBlocksCount < ZNO_NDB_BLOCKS)
2796 {
2797 jam();
2798 sendNdbSttor(signal);
2799 return;
2800 }
2801 g_eventLogger->info("NDB start phase 7 completed");
2802 if ((ctypeOfStart == NodeState::ST_NODE_RESTART) ||
2803 (ctypeOfStart == NodeState::ST_INITIAL_NODE_RESTART))
2804 {
2805 jam();
2806 sendSttorry(signal);
2807 return;
2808 }
2809 waitpoint71Lab(signal);
2810 }
2811
2812 void Ndbcntr::waitpoint71Lab(Signal* signal)
2813 {
2814 if (getOwnNodeId() == cmasterNodeId) {
2815 jam();
2816 cnoWaitrep7++;
2817 if (cnoWaitrep7 == cnoStartNodes) {
2818 jam();
2819 NodeReceiverGroup rg(NDBCNTR, c_start.m_starting);
2820 rg.m_nodes.clear(getOwnNodeId());
2821 signal->theData[0] = getOwnNodeId();
2822 signal->theData[1] = CntrWaitRep::ZWAITPOINT_7_2;
2823 sendSignal(rg, GSN_CNTR_WAITREP, signal, 2, JBB);
2824 sendSttorry(signal);
2825 }
2826 } else {
2827 jam();
2828 signal->theData[0] = getOwnNodeId();
2829 signal->theData[1] = CntrWaitRep::ZWAITPOINT_7_1;
2830 sendSignal(calcNdbCntrBlockRef(cmasterNodeId), GSN_CNTR_WAITREP, signal, 2, JBB);
2831 }
2832 }
2833
2834 // Start phase 9 (internal 8)
2835 void Ndbcntr::startPhase9Lab(Signal* signal)
2836 {
2837 cinternalStartphase = cstartPhase - 1;
2838 cndbBlocksCount = 0;
2839 ph8ALab(signal);
2840 }
2841
2842 void Ndbcntr::ph8ALab(Signal* signal)
2843 {
2844 sendSttorry(signal);
2845 resetStartVariables(signal);
2846 return;
2847 }//Ndbcntr::ph8ALab()
2848
2849 bool
2850 Ndbcntr::wait_sp(Signal* signal, Uint32 sp)
2851 {
2852 if (sp <= 2)
2853 return false;
2854
2855 switch(ctypeOfStart){
2856 case NodeState::ST_SYSTEM_RESTART:
2857 case NodeState::ST_INITIAL_START:
2858 /**
2859 * synchronized...
2860 */
2861 break;
2862 default:
2863 return false;
2864 }
2865
2866 if (!ndb_wait_sp(getNodeInfo(cmasterNodeId).m_version))
2867 return false;
2868
2869 CntrWaitRep* rep = (CntrWaitRep*)signal->getDataPtrSend();
2870 rep->nodeId = getOwnNodeId();
2871 rep->waitPoint = RNIL;
2872 rep->request = CntrWaitRep::WaitFor;
2873 rep->sp = sp;
2874
2875 sendSignal(calcNdbCntrBlockRef(cmasterNodeId),
2876 GSN_CNTR_WAITREP, signal, CntrWaitRep::SignalLength, JBB);
2877
2878 return true; // wait
2879 }
2880
2881 void
2882 Ndbcntr::wait_sp_rep(Signal* signal)
2883 {
2884 CntrWaitRep rep = *(CntrWaitRep*)signal->getDataPtrSend();
2885 switch(rep.request){
2886 case CntrWaitRep::WaitFor:
2887 jam();
2888 ndbrequire(cmasterNodeId == getOwnNodeId());
2889 break;
2890 case CntrWaitRep::Grant:
2891 jam();
2892 /**
2893 * We're allowed to proceed
2894 */
2895 c_missra.sendNextSTTOR(signal);
2896 return;
2897 }
2898
2899 c_start.m_wait_sp[rep.nodeId] = rep.sp;
2900
2901 /**
2902 * Check if we should allow someone to start...
2903 */
2904 Uint32 node = c_start.m_starting.find(0);
2905 ndbrequire(node < NDB_ARRAY_SIZE(c_start.m_wait_sp));
2906 Uint32 min = c_start.m_wait_sp[node];
2907 for (; node != NdbNodeBitmask::NotFound;
2908 node = c_start.m_starting.find(node + 1))
2909 {
2910 if (!ndb_wait_sp(getNodeInfo(node).m_version))
2911 continue;
2912
2913 if (c_start.m_wait_sp[node] < min)
2914 {
2915 min = c_start.m_wait_sp[node];
2916 }
2917 }
2918
2919 if (min == 0)
2920 {
2921 /**
2922 * wait for more
2923 */
2924 return;
2925 }
2926
2927 NdbNodeBitmask grantnodes;
2928 node = c_start.m_starting.find(0);
2929 for (; node != NdbNodeBitmask::NotFound;
2930 node = c_start.m_starting.find(node + 1))
2931 {
2932 if (!ndb_wait_sp(getNodeInfo(node).m_version))
2933 continue;
2934
2935 if (c_start.m_wait_sp[node] == min)
2936 {
2937 grantnodes.set(node);
2938 c_start.m_wait_sp[node] = 0;
2939 }
2940 }
2941
2942 char buf[100];
2943 g_eventLogger->info("Grant nodes to start phase: %u, nodes: %s",
2944 min,
2945 grantnodes.getText(buf));
2946
2947 NodeReceiverGroup rg(NDBCNTR, grantnodes);
2948 CntrWaitRep * conf = (CntrWaitRep*)signal->getDataPtrSend();
2949 conf->nodeId = getOwnNodeId();
2950 conf->waitPoint = RNIL;
2951 conf->request = CntrWaitRep::Grant;
2952 conf->sp = min;
2953 sendSignal(rg, GSN_CNTR_WAITREP, signal, CntrWaitRep::SignalLength, JBB);
2954 }
2955
2956 /*******************************/
2957 /* CNTR_WAITREP */
2958 /*******************************/
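/**
 * Dispatch on the waitpoint carried in CNTR_WAITREP. The numbered
 * waitpoints (4.1 .. 7.2) drive the per-phase master/participant
 * handshakes above, while waitPoint == RNIL carries the generic start
 * phase synchronisation handled by wait_sp_rep().
 */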
2959 void Ndbcntr::execCNTR_WAITREP(Signal* signal)
2960 {
2961 jamEntry();
2962 CntrWaitRep* rep = (CntrWaitRep*)signal->getDataPtr();
2963
2964 Uint32 twaitPoint = rep->waitPoint;
2965 switch (twaitPoint) {
2966 case CntrWaitRep::ZWAITPOINT_4_1:
2967 jam();
2968 waitpoint41Lab(signal);
2969 break;
2970 case CntrWaitRep::ZWAITPOINT_4_2:
2971 jam();
2972 c_start.m_starting.assign(NdbNodeBitmask::Size, signal->theData + 2);
2973 sendSttorry(signal);
2974 break;
2975 case CntrWaitRep::ZWAITPOINT_5_1:
2976 jam();
2977     g_eventLogger->info("Master node %u has reached completion of NDB start"
2978 " phase 5",
2979 signal->theData[0]);
2980 waitpoint51Lab(signal);
2981 break;
2982 case CntrWaitRep::ZWAITPOINT_5_2:
2983 jam();
2984     g_eventLogger->info("Node %u has reached completion of NDB start"
2985 " phase 4",
2986 signal->theData[0]);
2987 waitpoint52Lab(signal);
2988 break;
2989 case CntrWaitRep::ZWAITPOINT_6_1:
2990 jam();
2991 waitpoint61Lab(signal);
2992 break;
2993 case CntrWaitRep::ZWAITPOINT_6_2:
2994 jam();
2995 sendSttorry(signal);
2996 break;
2997 case CntrWaitRep::ZWAITPOINT_7_1:
2998 jam();
2999 waitpoint71Lab(signal);
3000 break;
3001 case CntrWaitRep::ZWAITPOINT_7_2:
3002 jam();
3003 sendSttorry(signal);
3004 break;
3005 case CntrWaitRep::ZWAITPOINT_4_2_TO:
3006 jam();
3007 waitpoint42To(signal);
3008 break;
3009 case RNIL:
3010 ndbrequire(signal->getLength() >= CntrWaitRep::SignalLength);
3011 wait_sp_rep(signal);
3012 return;
3013 default:
3014 jam();
3015 systemErrorLab(signal, __LINE__);
3016 break;
3017 }//switch
3018 }//Ndbcntr::execCNTR_WAITREP()
3019
3020 /*******************************/
3021 /* NODE_FAILREP */
3022 /*******************************/
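/**
 * Node failure handling in NDBCNTR. The failed nodes are removed from
 * all start/cluster bitmasks and, if the master failed, the QMGR
 * president announced in the report becomes the new master. A node
 * that is itself still starting treats most failures as fatal
 * (NDBD_EXIT_SR_OTHERNODEFAILED); a started node instead forwards
 * NODE_FAILREP to the interested blocks, updates any ongoing STOP_REQ
 * bookkeeping and reports the failures to the event log.
 */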
3023 void Ndbcntr::execNODE_FAILREP(Signal* signal)
3024 {
3025 jamEntry();
3026
3027 if (ERROR_INSERTED(1001))
3028 {
3029 sendSignalWithDelay(reference(), GSN_NODE_FAILREP, signal, 100,
3030 signal->getLength());
3031 return;
3032 }
3033
3034 const NodeFailRep * nodeFail = (NodeFailRep *)&signal->theData[0];
3035 NdbNodeBitmask allFailed;
3036 allFailed.assign(NdbNodeBitmask::Size, nodeFail->theNodes);
3037
3038 NdbNodeBitmask failedStarted = c_startedNodes;
3039 NdbNodeBitmask failedStarting = c_start.m_starting;
3040 NdbNodeBitmask failedWaiting = c_start.m_waiting;
3041
3042 failedStarted.bitAND(allFailed);
3043 failedStarting.bitAND(allFailed);
3044 failedWaiting.bitAND(allFailed);
3045
3046 const bool tMasterFailed = allFailed.get(cmasterNodeId);
3047 const bool tStarted = !failedStarted.isclear();
3048 const bool tStarting = !failedStarting.isclear();
3049
3050 if (tMasterFailed)
3051 {
3052 jam();
3053 /**
3054    * If the master has failed, choose the QMGR president as the new master
3055 */
3056 cmasterNodeId = nodeFail->masterNodeId;
3057 }
3058
3059 /**
3060 * Clear node bitmasks from failed nodes
3061 */
3062 c_start.m_starting.bitANDC(allFailed);
3063 c_start.m_waiting.bitANDC(allFailed);
3064 c_start.m_withLog.bitANDC(allFailed);
3065 c_start.m_withoutLog.bitANDC(allFailed);
3066 c_start.m_waitTO.bitANDC(allFailed);
3067 c_clusterNodes.bitANDC(allFailed);
3068 c_startedNodes.bitANDC(allFailed);
3069
3070 const NodeState & st = getNodeState();
3071 if (st.startLevel == st.SL_STARTING)
3072 {
3073 jam();
3074
3075 const Uint32 phase = st.starting.startPhase;
3076
3077 const bool tStartConf = (phase > 2) || (phase == 2 && cndbBlocksCount > 0);
3078
3079 if (tMasterFailed)
3080 {
3081 progError(__LINE__, NDBD_EXIT_SR_OTHERNODEFAILED,
3082 "Unhandled node failure during restart");
3083 }
3084
3085 if (tStartConf && tStarting)
3086 {
3087 // One of other starting nodes has crashed...
3088 progError(__LINE__, NDBD_EXIT_SR_OTHERNODEFAILED,
3089 "Unhandled node failure of starting node during restart");
3090 }
3091
3092 if (tStartConf && tStarted)
3093 {
3094 // One of other started nodes has crashed...
3095 progError(__LINE__, NDBD_EXIT_SR_OTHERNODEFAILED,
3096 "Unhandled node failure of started node during restart");
3097 }
3098
3099 Uint32 nodeId = 0;
3100 while(!allFailed.isclear()){
3101 nodeId = allFailed.find(nodeId + 1);
3102 allFailed.clear(nodeId);
3103 signal->theData[0] = nodeId;
3104 sendSignal(QMGR_REF, GSN_NDB_FAILCONF, signal, 1, JBB);
3105     }//while
3106
3107 return;
3108 }
3109
3110 ndbrequire(!allFailed.get(getOwnNodeId()));
3111
3112 NodeFailRep * rep = (NodeFailRep *)&signal->theData[0];
3113 rep->masterNodeId = cmasterNodeId;
3114
3115 sendSignal(DBTC_REF, GSN_NODE_FAILREP, signal,
3116 NodeFailRep::SignalLength, JBB);
3117
3118 sendSignal(DBLQH_REF, GSN_NODE_FAILREP, signal,
3119 NodeFailRep::SignalLength, JBB);
3120
3121 sendSignal(DBDIH_REF, GSN_NODE_FAILREP, signal,
3122 NodeFailRep::SignalLength, JBB);
3123
3124 sendSignal(DBDICT_REF, GSN_NODE_FAILREP, signal,
3125 NodeFailRep::SignalLength, JBB);
3126
3127 sendSignal(BACKUP_REF, GSN_NODE_FAILREP, signal,
3128 NodeFailRep::SignalLength, JBB);
3129
3130 sendSignal(SUMA_REF, GSN_NODE_FAILREP, signal,
3131 NodeFailRep::SignalLength, JBB);
3132
3133 sendSignal(QMGR_REF, GSN_NODE_FAILREP, signal,
3134 NodeFailRep::SignalLength, JBB);
3135
3136 sendSignal(DBUTIL_REF, GSN_NODE_FAILREP, signal,
3137 NodeFailRep::SignalLength, JBB);
3138
3139 sendSignal(DBTUP_REF, GSN_NODE_FAILREP, signal,
3140 NodeFailRep::SignalLength, JBB);
3141
3142 sendSignal(TSMAN_REF, GSN_NODE_FAILREP, signal,
3143 NodeFailRep::SignalLength, JBB);
3144
3145 sendSignal(LGMAN_REF, GSN_NODE_FAILREP, signal,
3146 NodeFailRep::SignalLength, JBB);
3147
3148 sendSignal(DBSPJ_REF, GSN_NODE_FAILREP, signal,
3149 NodeFailRep::SignalLength, JBB);
3150
3151 if (c_stopRec.stopReq.senderRef)
3152 {
3153 jam();
3154 switch(c_stopRec.m_state){
3155 case StopRecord::SR_WAIT_NODE_FAILURES:
3156 {
3157 jam();
3158 NdbNodeBitmask tmp;
3159 tmp.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
3160 tmp.bitANDC(allFailed);
3161 tmp.copyto(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
3162
3163 if (tmp.isclear())
3164 {
3165 jam();
3166 if (c_stopRec.stopReq.senderRef != RNIL)
3167 {
3168 jam();
3169 StopConf * const stopConf = (StopConf *)&signal->theData[0];
3170 stopConf->senderData = c_stopRec.stopReq.senderData;
3171 stopConf->nodeState = (Uint32) NodeState::SL_SINGLEUSER;
3172 sendSignal(c_stopRec.stopReq.senderRef, GSN_STOP_CONF, signal,
3173 StopConf::SignalLength, JBB);
3174 }
3175
3176 c_stopRec.stopReq.senderRef = 0;
3177 WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
3178 req->senderRef = reference();
3179 req->senderData = StopRecord::SR_UNBLOCK_GCP_START_GCP;
3180 req->requestType = WaitGCPReq::UnblockStartGcp;
3181 sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
3182 WaitGCPReq::SignalLength, JBA);
3183 }
3184 break;
3185 }
3186 case StopRecord::SR_QMGR_STOP_REQ:
3187 {
3188 NdbNodeBitmask tmp;
3189 tmp.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
3190 tmp.bitANDC(allFailed);
3191
3192 if (tmp.isclear())
3193 {
3194 Uint32 nodeId = allFailed.find(0);
3195 tmp.set(nodeId);
3196
3197 StopConf* conf = (StopConf*)signal->getDataPtrSend();
3198 conf->senderData = c_stopRec.stopReq.senderData;
3199 conf->nodeId = nodeId;
3200 sendSignal(reference(),
3201 GSN_STOP_CONF, signal, StopConf::SignalLength, JBB);
3202 }
3203
3204 tmp.copyto(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
3205
3206 break;
3207 }
3208 case StopRecord::SR_BLOCK_GCP_START_GCP:
3209 case StopRecord::SR_WAIT_COMPLETE_GCP:
3210 case StopRecord::SR_UNBLOCK_GCP_START_GCP:
3211 case StopRecord::SR_CLUSTER_SHUTDOWN:
3212 break;
3213 }
3214 }
3215
3216 signal->theData[0] = NDB_LE_NODE_FAILREP;
3217 signal->theData[2] = 0;
3218
3219 Uint32 nodeId = 0;
3220 while(!allFailed.isclear()){
3221 nodeId = allFailed.find(nodeId + 1);
3222 allFailed.clear(nodeId);
3223 signal->theData[1] = nodeId;
3224 sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
3225   }//while
3226
3227 return;
3228 }//Ndbcntr::execNODE_FAILREP()
3229
3230 /*******************************/
3231 /* READ_NODESREQ */
3232 /*******************************/
3233 void Ndbcntr::execREAD_NODESREQ(Signal* signal)
3234 {
3235 jamEntry();
3236
3237 /*----------------------------------------------------------------------*/
3238 // ANY BLOCK MAY SEND A REQUEST ABOUT NDB NODES AND VERSIONS IN THE
3239 // SYSTEM. THIS REQUEST CAN ONLY BE HANDLED IN
3240 // ABSOLUTE STARTPHASE 3 OR LATER
3241 /*----------------------------------------------------------------------*/
3242 BlockReference TuserBlockref = signal->theData[0];
3243 ReadNodesConf * const readNodes = (ReadNodesConf *)&signal->theData[0];
3244
3245 /**
3246 * Prepare inactiveNodes bitmask.
3247    * The concept as such is of limited value, since it makes
3248    * parallel starts more or less impossible...
3249 */
3250 NdbNodeBitmask tmp1;
3251 tmp1.bitOR(c_startedNodes);
3252 if(!getNodeState().getNodeRestartInProgress()){
3253 tmp1.bitOR(c_start.m_starting);
3254 } else {
3255 tmp1.set(getOwnNodeId());
3256 }
3257
3258 NdbNodeBitmask tmp2;
3259 tmp2.bitOR(c_allDefinedNodes);
3260 tmp2.bitANDC(tmp1);
3261 /**
3262 * Fill in return signal
3263 */
3264 tmp2.copyto(NdbNodeBitmask::Size, readNodes->inactiveNodes);
3265 c_allDefinedNodes.copyto(NdbNodeBitmask::Size, readNodes->allNodes);
3266 c_clusterNodes.copyto(NdbNodeBitmask::Size, readNodes->clusterNodes);
3267 c_startedNodes.copyto(NdbNodeBitmask::Size, readNodes->startedNodes);
3268 c_start.m_starting.copyto(NdbNodeBitmask::Size, readNodes->startingNodes);
3269
3270 readNodes->noOfNodes = c_allDefinedNodes.count();
3271 readNodes->masterNodeId = cmasterNodeId;
3272 readNodes->ndynamicId = cdynamicNodeId;
3273 if (m_cntr_start_conf)
3274 {
3275 jam();
3276 sendSignal(TuserBlockref, GSN_READ_NODESCONF, signal,
3277 ReadNodesConf::SignalLength, JBB);
3278
3279 } else {
3280 jam();
3281 signal->theData[0] = ZNOT_AVAILABLE;
3282 sendSignal(TuserBlockref, GSN_READ_NODESREF, signal, 1, JBB);
3283 }//if
3284 }//Ndbcntr::execREAD_NODESREQ()
3285
3286 /*----------------------------------------------------------------------*/
3287 // REPORTS A SYSTEM ERROR AND CRASHES THE NODE VIA progError
3288 /*----------------------------------------------------------------------*/
3289 void Ndbcntr::systemErrorLab(Signal* signal, int line)
3290 {
3291 progError(line, NDBD_EXIT_NDBREQUIRE); /* BUG INSERTION */
3292 return;
3293 }//Ndbcntr::systemErrorLab()
3294
3295 /*###########################################################################*/
3296 /* CNTR MASTER CREATES AND INITIALIZES A SYSTEMTABLE AT INITIALSTART */
3297 /* |-2048| # 1 00000001 | */
3298 /* | : | : | */
3299 /* | -1 | # 1 00000001 | */
3300 /* | 1 | 0 | tupleid sequence now created on first use */
3301 /* | : | : | v */
3302 /* | 2048| 0 | v */
3303 /*---------------------------------------------------------------------------*/
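/**
 * Sequence run by the CNTR master at initial start:
 *
 *   beginSchemaTransLab()       SCHEMA_TRANS_BEGIN_REQ to DICT
 *   createHashMap()             default hash map
 *   createSystableLab(0..n)     one CREATE_TABLE_REQ per system table
 *   createDDObjects(0..m)       filegroups/files described by f_dd[]
 *   endSchemaTransLab()         SCHEMA_TRANS_END_REQ (commit)
 *   startInsertTransactions()   populate SYSTAB_0 via DBTC
 *
 * Each step advances on the corresponding CONF; any REF is fatal.
 */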
3304 void Ndbcntr::beginSchemaTransLab(Signal* signal)
3305 {
3306 c_schemaTransId = reference();
3307
3308 SchemaTransBeginReq* req =
3309 (SchemaTransBeginReq*)signal->getDataPtrSend();
3310 req->clientRef = reference();
3311 req->transId = c_schemaTransId;
3312 req->requestInfo = 0;
3313 sendSignal(DBDICT_REF, GSN_SCHEMA_TRANS_BEGIN_REQ, signal,
3314 SchemaTransBeginReq::SignalLength, JBB);
3315 }
3316
3317 void Ndbcntr::execSCHEMA_TRANS_BEGIN_CONF(Signal* signal)
3318 {
3319 const SchemaTransBeginConf* conf =
3320 (SchemaTransBeginConf*)signal->getDataPtr();
3321 ndbrequire(conf->transId == c_schemaTransId);
3322 c_schemaTransKey = conf->transKey;
3323
3324 createHashMap(signal, 0);
3325 }
3326
3327 void Ndbcntr::execSCHEMA_TRANS_BEGIN_REF(Signal* signal)
3328 {
3329 ndbrequire(false);
3330 }
3331
3332 void
3333 Ndbcntr::createHashMap(Signal* signal, Uint32 idx)
3334 {
3335 CreateHashMapReq* const req = (CreateHashMapReq*)signal->getDataPtrSend();
3336 req->clientRef = reference();
3337 req->clientData = idx;
3338 req->requestInfo = CreateHashMapReq::CreateDefault;
3339 req->transId = c_schemaTransId;
3340 req->transKey = c_schemaTransKey;
3341 req->buckets = 0;
3342 req->fragments = 0;
3343 sendSignal(DBDICT_REF, GSN_CREATE_HASH_MAP_REQ, signal,
3344 CreateHashMapReq::SignalLength, JBB);
3345 }
3346
3347 void
3348 Ndbcntr::execCREATE_HASH_MAP_REF(Signal* signal)
3349 {
3350 jamEntry();
3351
3352 ndbrequire(false);
3353 }
3354
3355 void
3356 Ndbcntr::execCREATE_HASH_MAP_CONF(Signal* signal)
3357 {
3358 jamEntry();
3359 CreateHashMapConf* conf = (CreateHashMapConf*)signal->getDataPtrSend();
3360
3361 if (conf->senderData == 0)
3362 {
3363 jam();
3364 c_objectId = conf->objectId;
3365 c_objectVersion = conf->objectVersion;
3366 }
3367
3368 createSystableLab(signal, 0);
3369 }
3370
3371 void Ndbcntr::endSchemaTransLab(Signal* signal)
3372 {
3373 SchemaTransEndReq* req =
3374 (SchemaTransEndReq*)signal->getDataPtrSend();
3375 req->clientRef = reference();
3376 req->transId = c_schemaTransId;
3377 req->requestInfo = 0;
3378 req->transKey = c_schemaTransKey;
3379 req->flags = 0;
3380 sendSignal(DBDICT_REF, GSN_SCHEMA_TRANS_END_REQ, signal,
3381 SchemaTransEndReq::SignalLength, JBB);
3382 }
3383
3384 void Ndbcntr::execSCHEMA_TRANS_END_CONF(Signal* signal)
3385 {
3386 c_schemaTransId = 0;
3387 c_schemaTransKey = RNIL;
3388 startInsertTransactions(signal);
3389 }
3390
3391 void Ndbcntr::execSCHEMA_TRANS_END_REF(Signal* signal)
3392 {
3393 jamEntry();
3394 SchemaTransEndRef * ref = (SchemaTransEndRef*)signal->getDataPtr();
3395 char buf[256];
3396 BaseString::snprintf(buf, sizeof(buf),
3397 "Failed to commit schema trans, err: %u",
3398 ref->errorCode);
3399 progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
3400 ndbrequire(false);
3401 }
3402
3403 void
3404 Ndbcntr::createDDObjects(Signal * signal, unsigned index)
3405 {
3406 const ndb_mgm_configuration_iterator * p =
3407 m_ctx.m_config.getOwnConfigIterator();
3408 ndbrequire(p != 0);
3409
3410 Uint32 propPage[256];
3411 LinearWriter w(propPage, 256);
3412
3413 const ddentry* entry = &f_dd[index];
3414
3415 switch(entry->type){
3416 case DictTabInfo::LogfileGroup:
3417 case DictTabInfo::Tablespace:
3418 {
3419 jam();
3420
3421 DictFilegroupInfo::Filegroup fg; fg.init();
3422 BaseString::snprintf(fg.FilegroupName, sizeof(fg.FilegroupName),
3423 "%s", entry->name);
3424 fg.FilegroupType = entry->type;
3425 if (entry->type == DictTabInfo::LogfileGroup)
3426 {
3427 jam();
3428 fg.LF_UndoBufferSize = Uint32(entry->size);
3429 }
3430 else
3431 {
3432 jam();
3433 fg.TS_ExtentSize = Uint32(entry->size);
3434 fg.TS_LogfileGroupId = c_objectId;
3435 fg.TS_LogfileGroupVersion = c_objectVersion;
3436 }
3437
3438 SimpleProperties::UnpackStatus s;
3439 s = SimpleProperties::pack(w,
3440 &fg,
3441 DictFilegroupInfo::Mapping,
3442 DictFilegroupInfo::MappingSize, true);
3443
3444
3445 Uint32 length = w.getWordsUsed();
3446 LinearSectionPtr ptr[3];
3447 ptr[0].p = &propPage[0];
3448 ptr[0].sz = length;
3449
3450 CreateFilegroupReq * req = (CreateFilegroupReq*)signal->getDataPtrSend();
3451 req->senderRef = reference();
3452 req->senderData = index;
3453 req->objType = entry->type;
3454 req->transId = c_schemaTransId;
3455 req->transKey = c_schemaTransKey;
3456 req->requestInfo = 0;
3457 sendSignal(DBDICT_REF, GSN_CREATE_FILEGROUP_REQ, signal,
3458 CreateFilegroupReq::SignalLength, JBB, ptr, 1);
3459 return;
3460 }
3461 case DictTabInfo::Undofile:
3462 case DictTabInfo::Datafile:
3463 {
3464 jam();
3465 Uint32 propPage[256];
3466 LinearWriter w(propPage, 256);
3467 DictFilegroupInfo::File f; f.init();
3468 BaseString::snprintf(f.FileName, sizeof(f.FileName), "%s", entry->name);
3469 f.FileType = entry->type;
3470 f.FilegroupId = c_objectId;
3471 f.FilegroupVersion = c_objectVersion;
3472 f.FileSizeHi = Uint32(entry->size >> 32);
3473 f.FileSizeLo = Uint32(entry->size);
3474
3475 SimpleProperties::UnpackStatus s;
3476 s = SimpleProperties::pack(w,
3477 &f,
3478 DictFilegroupInfo::FileMapping,
3479 DictFilegroupInfo::FileMappingSize, true);
3480
3481 Uint32 length = w.getWordsUsed();
3482 LinearSectionPtr ptr[3];
3483 ptr[0].p = &propPage[0];
3484 ptr[0].sz = length;
3485
3486 CreateFileReq * req = (CreateFileReq*)signal->getDataPtrSend();
3487 req->senderRef = reference();
3488 req->senderData = index;
3489 req->objType = entry->type;
3490 req->transId = c_schemaTransId;
3491 req->transKey = c_schemaTransKey;
3492 req->requestInfo = CreateFileReq::ForceCreateFile;
3493 sendSignal(DBDICT_REF, GSN_CREATE_FILE_REQ, signal,
3494 CreateFileReq::SignalLength, JBB, ptr, 1);
3495 return;
3496 }
3497 default:
3498 break;
3499 }
3500
3501 endSchemaTransLab(signal);
3502 }
3503
3504 void
3505 Ndbcntr::execCREATE_FILEGROUP_REF(Signal* signal)
3506 {
3507 jamEntry();
3508 CreateFilegroupRef* ref = (CreateFilegroupRef*)signal->getDataPtr();
3509 char buf[1024];
3510
3511 const ddentry* entry = &f_dd[ref->senderData];
3512
3513 if (entry->type == DictTabInfo::LogfileGroup)
3514 {
3515 BaseString::snprintf(buf, sizeof(buf), "create logfilegroup err %u",
3516 ref->errorCode);
3517 }
3518 else if (entry->type == DictTabInfo::Tablespace)
3519 {
3520 BaseString::snprintf(buf, sizeof(buf), "create tablespace err %u",
3521 ref->errorCode);
3522 }
3523 progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
3524 }
3525
3526 void
3527 Ndbcntr::execCREATE_FILEGROUP_CONF(Signal* signal)
3528 {
3529 jamEntry();
3530 CreateFilegroupConf* conf = (CreateFilegroupConf*)signal->getDataPtr();
3531 c_objectId = conf->filegroupId;
3532 c_objectVersion = conf->filegroupVersion;
3533 createDDObjects(signal, conf->senderData + 1);
3534 }
3535
3536 void
3537 Ndbcntr::execCREATE_FILE_REF(Signal* signal)
3538 {
3539 jamEntry();
3540 CreateFileRef* ref = (CreateFileRef*)signal->getDataPtr();
3541 char buf[1024];
3542
3543 const ddentry* entry = &f_dd[ref->senderData];
3544
3545 if (entry->type == DictTabInfo::Undofile)
3546 {
3547 BaseString::snprintf(buf, sizeof(buf), "create undofile %s err %u",
3548 entry->name,
3549 ref->errorCode);
3550 }
3551 else if (entry->type == DictTabInfo::Datafile)
3552 {
3553 BaseString::snprintf(buf, sizeof(buf), "create datafile %s err %u",
3554 entry->name,
3555 ref->errorCode);
3556 }
3557 progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
3558 }
3559
3560 void
3561 Ndbcntr::execCREATE_FILE_CONF(Signal* signal)
3562 {
3563 jamEntry();
3564 CreateFileConf* conf = (CreateFileConf*)signal->getDataPtr();
3565 createDDObjects(signal, conf->senderData + 1);
3566 }
3567
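/**
 * Pack a DictTabInfo description of system table 'index' using
 * LinearWriter and send it to DICT as CREATE_TABLE_REQ. The recursion
 * continues from execCREATE_TABLE_CONF until all g_sysTableCount
 * tables are created, after which the disk data objects are created.
 */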
3568 void Ndbcntr::createSystableLab(Signal* signal, unsigned index)
3569 {
3570 if (index >= g_sysTableCount) {
3571 ndbassert(index == g_sysTableCount);
3572 createDDObjects(signal, 0);
3573 return;
3574 }
3575 const SysTable& table = *g_sysTableList[index];
3576 Uint32 propPage[256];
3577 LinearWriter w(propPage, 256);
3578
3579 // XXX remove commented-out lines later
3580
3581 w.first();
3582 w.add(DictTabInfo::TableName, table.name);
3583 w.add(DictTabInfo::TableLoggedFlag, table.tableLoggedFlag);
3584 //w.add(DictTabInfo::TableKValue, 6);
3585 //w.add(DictTabInfo::MinLoadFactor, 70);
3586 //w.add(DictTabInfo::MaxLoadFactor, 80);
3587 w.add(DictTabInfo::FragmentTypeVal, (Uint32)table.fragmentType);
3588 //w.add(DictTabInfo::NoOfKeyAttr, 1);
3589 w.add(DictTabInfo::NoOfAttributes, (Uint32)table.columnCount);
3590 //w.add(DictTabInfo::NoOfNullable, (Uint32)0);
3591 //w.add(DictTabInfo::NoOfVariable, (Uint32)0);
3592 //w.add(DictTabInfo::KeyLength, 1);
3593 w.add(DictTabInfo::TableTypeVal, (Uint32)table.tableType);
3594 w.add(DictTabInfo::SingleUserMode, (Uint32)NDB_SUM_READ_WRITE);
3595 w.add(DictTabInfo::HashMapObjectId, c_objectId);
3596 w.add(DictTabInfo::HashMapVersion, c_objectVersion);
3597
3598 for (unsigned i = 0; i < table.columnCount; i++) {
3599 const SysColumn& column = table.columnList[i];
3600 ndbassert(column.pos == i);
3601 w.add(DictTabInfo::AttributeName, column.name);
3602 w.add(DictTabInfo::AttributeId, (Uint32)i);
3603 w.add(DictTabInfo::AttributeKeyFlag, (Uint32)column.keyFlag);
3604 w.add(DictTabInfo::AttributeStorageType,
3605 (Uint32)NDB_STORAGETYPE_MEMORY);
3606 switch(column.type){
3607 case DictTabInfo::ExtVarbinary:
3608 jam();
3609 w.add(DictTabInfo::AttributeArrayType,
3610 (Uint32)NDB_ARRAYTYPE_SHORT_VAR);
3611 break;
3612 case DictTabInfo::ExtLongvarbinary:
3613 jam();
3614 w.add(DictTabInfo::AttributeArrayType,
3615 (Uint32)NDB_ARRAYTYPE_MEDIUM_VAR);
3616 break;
3617 default:
3618 jam();
3619 w.add(DictTabInfo::AttributeArrayType,
3620 (Uint32)NDB_ARRAYTYPE_FIXED);
3621 break;
3622 }
3623 w.add(DictTabInfo::AttributeNullableFlag, (Uint32)column.nullable);
3624 w.add(DictTabInfo::AttributeExtType, (Uint32)column.type);
3625 w.add(DictTabInfo::AttributeExtLength, (Uint32)column.length);
3626 w.add(DictTabInfo::AttributeEnd, (Uint32)true);
3627 }
3628 w.add(DictTabInfo::TableEnd, (Uint32)true);
3629
3630 Uint32 length = w.getWordsUsed();
3631 LinearSectionPtr ptr[3];
3632 ptr[0].p = &propPage[0];
3633 ptr[0].sz = length;
3634
3635 CreateTableReq* const req = (CreateTableReq*)signal->getDataPtrSend();
3636 req->clientRef = reference();
3637 req->clientData = index;
3638 req->requestInfo = 0;
3639 req->transId = c_schemaTransId;
3640 req->transKey = c_schemaTransKey;
3641 sendSignal(DBDICT_REF, GSN_CREATE_TABLE_REQ, signal,
3642 CreateTableReq::SignalLength, JBB, ptr, 1);
3643 return;
3644 }//Ndbcntr::createSystableLab()
3645
3646 void Ndbcntr::execCREATE_TABLE_REF(Signal* signal)
3647 {
3648 jamEntry();
3649 progError(__LINE__,NDBD_EXIT_NDBREQUIRE, "CREATE_TABLE_REF");
3650 return;
3651 }//Ndbcntr::execCREATE_TABLE_REF()
3652
3653 void Ndbcntr::execCREATE_TABLE_CONF(Signal* signal)
3654 {
3655 jamEntry();
3656 const CreateTableConf* conf = (const CreateTableConf*)signal->getDataPtr();
3657 //csystabId = conf->tableId;
3658 ndbrequire(conf->transId == c_schemaTransId);
3659 ndbrequire(conf->senderData < g_sysTableCount);
3660 const SysTable& table = *g_sysTableList[conf->senderData];
3661 table.tableId = conf->tableId;
3662 table.tableVersion = conf->tableVersion;
3663 createSystableLab(signal, conf->senderData + 1);
3664 //startInsertTransactions(signal);
3665 return;
3666 }//Ndbcntr::execCREATE_TABLE_CONF()
3667
3668 /*******************************/
3669 /* DICTRELEASECONF */
3670 /*******************************/
3671 void Ndbcntr::startInsertTransactions(Signal* signal)
3672 {
3673 jamEntry();
3674
3675 ckey = 1;
3676 ctransidPhase = ZTRUE;
3677 signal->theData[0] = 0;
3678 signal->theData[1] = reference();
3679 sendSignal(DBTC_REF, GSN_TCSEIZEREQ, signal, 2, JBB);
3680 return;
3681 }//Ndbcntr::startInsertTransactions()
3682
3683 /*******************************/
3684 /* TCSEIZECONF */
3685 /*******************************/
3686 void Ndbcntr::execTCSEIZECONF(Signal* signal)
3687 {
3688 jamEntry();
3689 ctcConnectionP = signal->theData[1];
3690 ctcReference = signal->theData[2];
3691 crSystab7Lab(signal);
3692 return;
3693 }//Ndbcntr::execTCSEIZECONF()
3694
3695 const unsigned int RowsPerCommit = 16;
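/**
 * Insert a batch of RowsPerCommit rows into SYSTAB_0 within one
 * transaction: the first TCKEYREQ carries the start flag, the last one
 * the commit+execute flags. Each request is sent with one key word and
 * five inline ATTRINFO words (the key and an initial counter value).
 */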
3696 void Ndbcntr::crSystab7Lab(Signal* signal)
3697 {
3698 UintR tkey;
3699 UintR Tmp;
3700
3701 TcKeyReq * const tcKeyReq = (TcKeyReq *)&signal->theData[0];
3702
3703 UintR reqInfo_Start = 0;
3704 tcKeyReq->setOperationType(reqInfo_Start, ZINSERT); // Insert
3705 tcKeyReq->setKeyLength (reqInfo_Start, 1);
3706 tcKeyReq->setAIInTcKeyReq (reqInfo_Start, 5);
3707 tcKeyReq->setAbortOption (reqInfo_Start, TcKeyReq::AbortOnError);
3708
3709 /* KEY LENGTH = 1, ATTRINFO LENGTH IN TCKEYREQ = 5 */
3710 cresponses = 0;
3711 const UintR guard0 = ckey + (RowsPerCommit - 1);
3712 for (Tmp = ckey; Tmp <= guard0; Tmp++) {
3713 UintR reqInfo = reqInfo_Start;
3714 if (Tmp == ckey) { // First iteration, Set start flag
3715 jam();
3716 tcKeyReq->setStartFlag(reqInfo, 1);
3717 } //if
3718 if (Tmp == guard0) { // Last iteration, Set commit flag
3719 jam();
3720 tcKeyReq->setCommitFlag(reqInfo, 1);
3721 tcKeyReq->setExecuteFlag(reqInfo, 1);
3722 } //if
3723 if (ctransidPhase == ZTRUE) {
3724 jam();
3725 tkey = 0;
3726 tkey = tkey - Tmp;
3727 } else {
3728 jam();
3729 tkey = Tmp;
3730 }//if
3731
3732 tcKeyReq->apiConnectPtr = ctcConnectionP;
3733 tcKeyReq->attrLen = 5;
3734 tcKeyReq->tableId = g_sysTable_SYSTAB_0.tableId;
3735 tcKeyReq->requestInfo = reqInfo;
3736 tcKeyReq->tableSchemaVersion = g_sysTable_SYSTAB_0.tableVersion;
3737 tcKeyReq->transId1 = 0;
3738 tcKeyReq->transId2 = ckey;
3739
3740 //-------------------------------------------------------------
3741 // There is no optional part in this TCKEYREQ. There is one
3742 // key word and five ATTRINFO words.
3743 //-------------------------------------------------------------
3744 Uint32* tKeyDataPtr = &tcKeyReq->scanInfo;
3745 Uint32* tAIDataPtr = &tKeyDataPtr[1];
3746
3747 tKeyDataPtr[0] = tkey;
3748
3749 AttributeHeader::init(&tAIDataPtr[0], 0, 1 << 2);
3750 tAIDataPtr[1] = tkey;
3751 AttributeHeader::init(&tAIDataPtr[2], 1, 2 << 2);
3752 tAIDataPtr[3] = (tkey << 16);
3753 tAIDataPtr[4] = 1;
3754 sendSignal(ctcReference, GSN_TCKEYREQ, signal,
3755 TcKeyReq::StaticLength + 6, JBB);
3756 }//for
3757 ckey = ckey + RowsPerCommit;
3758 return;
3759 }//Ndbcntr::crSystab7Lab()
3760
3761 /*******************************/
3762 /* TCKEYCONF09 */
3763 /*******************************/
3764 void Ndbcntr::execTCKEYCONF(Signal* signal)
3765 {
3766 const TcKeyConf * const keyConf = (TcKeyConf *)&signal->theData[0];
3767
3768 jamEntry();
3769 cgciSystab = keyConf->gci_hi;
3770 UintR confInfo = keyConf->confInfo;
3771
3772 if (TcKeyConf::getMarkerFlag(confInfo)){
3773 Uint32 transId1 = keyConf->transId1;
3774 Uint32 transId2 = keyConf->transId2;
3775 signal->theData[0] = transId1;
3776 signal->theData[1] = transId2;
3777 sendSignal(ctcReference, GSN_TC_COMMIT_ACK, signal, 2, JBB);
3778 }//if
3779
3780 cresponses = cresponses + TcKeyConf::getNoOfOperations(confInfo);
3781 if (TcKeyConf::getCommitFlag(confInfo)){
3782 jam();
3783 ndbrequire(cresponses == RowsPerCommit);
3784
3785 crSystab8Lab(signal);
3786 return;
3787 }
3788 return;
3789 }//Ndbcntr::execTCKEYCONF()
3790
3791 void Ndbcntr::crSystab8Lab(Signal* signal)
3792 {
3793 if (ckey < ZSIZE_SYSTAB) {
3794 jam();
3795 crSystab7Lab(signal);
3796 return;
3797 } else if (ctransidPhase == ZTRUE) {
3798 jam();
3799 ckey = 1;
3800 ctransidPhase = ZFALSE;
3801 // skip 2nd loop - tupleid sequence now created on first use
3802 }//if
3803 signal->theData[0] = ctcConnectionP;
3804 signal->theData[1] = reference();
3805 signal->theData[2] = 0;
3806 sendSignal(ctcReference, GSN_TCRELEASEREQ, signal, 2, JBB);
3807 return;
3808 }//Ndbcntr::crSystab8Lab()
3809
3810 /*******************************/
3811 /* TCRELEASECONF */
3812 /*******************************/
3813 void Ndbcntr::execTCRELEASECONF(Signal* signal)
3814 {
3815 jamEntry();
3816 g_eventLogger->info("Creation of System Tables Completed");
3817 waitpoint52Lab(signal);
3818 return;
3819 }//Ndbcntr::execTCRELEASECONF()
3820
3821 void Ndbcntr::crSystab9Lab(Signal* signal)
3822 {
3823 signal->theData[0] = 0; // user ptr
3824 signal->theData[1] = reference();
3825 signal->theData[2] = 0;
3826 sendSignalWithDelay(DBDIH_REF, GSN_GETGCIREQ, signal, 100, 3);
3827 return;
3828 }//Ndbcntr::crSystab9Lab()
3829
3830 /*******************************/
3831 /* GETGCICONF */
3832 /*******************************/
3833 void Ndbcntr::execGETGCICONF(Signal* signal)
3834 {
3835 jamEntry();
3836
3837 #ifndef NO_GCP
3838 if (signal->theData[1] < cgciSystab) {
3839 jam();
3840 /*--------------------------------------*/
3841 /* MAKE SURE THAT THE SYSTABLE IS */
3842 /* NOW SAFE ON DISK */
3843 /*--------------------------------------*/
3844 crSystab9Lab(signal);
3845 return;
3846 }//if
3847 #endif
3848 waitpoint52Lab(signal);
3849 return;
3850 }//Ndbcntr::execGETGCICONF()
3851
3852 void Ndbcntr::execTCKEYREF(Signal* signal)
3853 {
3854 jamEntry();
3855 systemErrorLab(signal, __LINE__);
3856 return;
3857 }//Ndbcntr::execTCKEYREF()
3858
3859 void Ndbcntr::execTCROLLBACKREP(Signal* signal)
3860 {
3861 jamEntry();
3862 systemErrorLab(signal, __LINE__);
3863 return;
3864 }//Ndbcntr::execTCROLLBACKREP()
3865
3866 void Ndbcntr::execTCRELEASEREF(Signal* signal)
3867 {
3868 jamEntry();
3869 systemErrorLab(signal, __LINE__);
3870 return;
3871 }//Ndbcntr::execTCRELEASEREF()
3872
3873 void Ndbcntr::execTCSEIZEREF(Signal* signal)
3874 {
3875 jamEntry();
3876 systemErrorLab(signal, __LINE__);
3877 return;
3878 }//Ndbcntr::execTCSEIZEREF()
3879
3880
3881 /*---------------------------------------------------------------------------*/
3882 /*INITIALIZE VARIABLES AND RECORDS */
3883 /*---------------------------------------------------------------------------*/
3884 void Ndbcntr::initData(Signal* signal)
3885 {
3886 c_start.reset();
3887 cmasterNodeId = 0;
3888 cnoStartNodes = 0;
3889 cnoWaitrep = 0;
3890 }//Ndbcntr::initData()
3891
3892
3893 /*---------------------------------------------------------------------------*/
3894 /*RESET VARIABLES USED DURING THE START */
3895 /*---------------------------------------------------------------------------*/
3896 void Ndbcntr::resetStartVariables(Signal* signal)
3897 {
3898 cnoStartNodes = 0;
3899 cnoWaitrep6 = cnoWaitrep7 = 0;
3900 }//Ndbcntr::resetStartVariables()
3901
3902
3903 /*---------------------------------------------------------------------------*/
3904 // SEND THE SIGNAL
3905 // INPUT CNDB_BLOCKS_COUNT
3906 /*---------------------------------------------------------------------------*/
3907 void Ndbcntr::sendNdbSttor(Signal* signal)
3908 {
3909 NdbBlocksRecPtr ndbBlocksPtr;
3910
3911 ndbBlocksPtr.i = cndbBlocksCount;
3912 ptrCheckGuard(ndbBlocksPtr, ZSIZE_NDB_BLOCKS_REC, ndbBlocksRec);
3913
3914 NdbSttor * const req = (NdbSttor*)signal->getDataPtrSend();
3915 req->senderRef = reference();
3916 req->nodeId = getOwnNodeId();
3917 req->internalStartPhase = cinternalStartphase;
3918 req->typeOfStart = ctypeOfStart;
3919 req->masterNodeId = cmasterNodeId;
3920
3921 for (int i = 0; i < 16; i++) {
3922 // Garbage
3923 req->config[i] = 0x88776655;
3924 }
3925
3926 //#define MAX_STARTPHASE 2
3927 #ifdef TRACE_STTOR
3928 ndbout_c("sending NDB_STTOR(%d) to %s",
3929 cinternalStartphase,
3930 getBlockName( refToBlock(ndbBlocksPtr.p->blockref)));
3931 #endif
3932 if (refToBlock(ndbBlocksPtr.p->blockref) == DBDIH)
3933 req->typeOfStart = cdihStartType;
3934 sendSignal(ndbBlocksPtr.p->blockref, GSN_NDB_STTOR, signal, 22, JBB);
3935 cndbBlocksCount++;
3936 }//Ndbcntr::sendNdbSttor()
3937
3938 /*---------------------------------------------------------------------------*/
3939 // JUST SEND THE SIGNAL
3940 /*---------------------------------------------------------------------------*/
3941 void Ndbcntr::sendSttorry(Signal* signal, Uint32 delayed)
3942 {
3943 signal->theData[3] = ZSTART_PHASE_1;
3944 signal->theData[4] = ZSTART_PHASE_2;
3945 signal->theData[5] = ZSTART_PHASE_3;
3946 signal->theData[6] = ZSTART_PHASE_4;
3947 signal->theData[7] = ZSTART_PHASE_5;
3948 signal->theData[8] = ZSTART_PHASE_6;
3949 // skip simulated phase 7
3950 signal->theData[9] = ZSTART_PHASE_8;
3951 signal->theData[10] = ZSTART_PHASE_9;
3952 signal->theData[11] = ZSTART_PHASE_END;
3953 if (delayed == 0)
3954 {
3955 sendSignal(NDBCNTR_REF, GSN_STTORRY, signal, 12, JBB);
3956 return;
3957 }
3958 sendSignalWithDelay(NDBCNTR_REF, GSN_STTORRY, signal, delayed, 12);
3959 }//Ndbcntr::sendSttorry()
3960
3961 void
3962 Ndbcntr::execDUMP_STATE_ORD(Signal* signal)
3963 {
3964 DumpStateOrd * const & dumpState = (DumpStateOrd *)&signal->theData[0];
3965 Uint32 arg = dumpState->args[0];
3966
3967 if(arg == 13){
3968 infoEvent("Cntr: cstartPhase = %d, cinternalStartphase = %d, block = %d",
3969 cstartPhase,
3970 cinternalStartphase,
3971 cndbBlocksCount);
3972 infoEvent("Cntr: cmasterNodeId = %d", cmasterNodeId);
3973 }
3974
3975 if (arg == DumpStateOrd::NdbcntrTestStopOnError){
3976 if (m_ctx.m_config.stopOnError() == true)
3977 ((Configuration&)m_ctx.m_config).stopOnError(false);
3978
3979 const BlockReference tblockref = calcNdbCntrBlockRef(getOwnNodeId());
3980
3981 SystemError * const sysErr = (SystemError*)&signal->theData[0];
3982 sysErr->errorCode = SystemError::TestStopOnError;
3983 sysErr->errorRef = reference();
3984 sendSignal(tblockref, GSN_SYSTEM_ERROR, signal,
3985 SystemError::SignalLength, JBA);
3986 }
3987
3988 if (arg == DumpStateOrd::NdbcntrStopNodes)
3989 {
3990 NdbNodeBitmask mask;
3991 for(Uint32 i = 1; i<signal->getLength(); i++)
3992 mask.set(signal->theData[i]);
3993
3994 StopReq* req = (StopReq*)signal->getDataPtrSend();
3995 req->senderRef = RNIL;
3996 req->senderData = 123;
3997 req->requestInfo = 0;
3998 req->singleuser = 0;
3999 req->singleUserApi = 0;
4000 mask.copyto(NdbNodeBitmask::Size, req->nodes);
4001 StopReq::setPerformRestart(req->requestInfo, 1);
4002 StopReq::setNoStart(req->requestInfo, 1);
4003 StopReq::setStopNodes(req->requestInfo, 1);
4004 StopReq::setStopAbort(req->requestInfo, 1);
4005
4006 sendSignal(reference(), GSN_STOP_REQ, signal,
4007 StopReq::SignalLength, JBB);
4008 return;
4009 }
4010
4011 if (arg == 71)
4012 {
4013 #ifdef ERROR_INSERT
4014 if (signal->getLength() == 2)
4015 {
4016 c_error_insert_extra = signal->theData[1];
4017 SET_ERROR_INSERT_VALUE(1002);
4018 }
4019 else if (ERROR_INSERTED(1002))
4020 {
4021 CLEAR_ERROR_INSERT_VALUE;
4022 }
4023 #endif
4024 }
4025
4026 }//Ndbcntr::execDUMP_STATE_ORD()
4027
4028 void Ndbcntr::updateNodeState(Signal* signal, const NodeState& newState) const{
4029 NodeStateRep * const stateRep = (NodeStateRep *)&signal->theData[0];
4030
4031 if (newState.startLevel == NodeState::SL_STARTED)
4032 {
4033 CRASH_INSERTION(1000);
4034 }
4035
4036 stateRep->nodeState = newState;
4037 stateRep->nodeState.masterNodeId = cmasterNodeId;
4038 stateRep->nodeState.setNodeGroup(c_nodeGroup);
4039
4040 for(Uint32 i = 0; i<ALL_BLOCKS_SZ; i++){
4041 sendSignal(ALL_BLOCKS[i].Ref, GSN_NODE_STATE_REP, signal,
4042 NodeStateRep::SignalLength, JBB);
4043 }
4044 }
4045
4046 void
4047 Ndbcntr::execRESUME_REQ(Signal* signal){
4048 //ResumeReq * const req = (ResumeReq *)&signal->theData[0];
4049 //ResumeRef * const ref = (ResumeRef *)&signal->theData[0];
4050
4051 jamEntry();
4052
4053 signal->theData[0] = NDB_LE_SingleUser;
4054 signal->theData[1] = 2;
4055 sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
4056
4057 //Uint32 senderData = req->senderData;
4058 //BlockReference senderRef = req->senderRef;
4059 NodeState newState(NodeState::SL_STARTED);
4060 updateNodeState(signal, newState);
4061 c_stopRec.stopReq.senderRef=0;
4062 send_node_started_rep(signal);
4063 }
4064
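/**
 * Entry point for node and system shutdown. Roughly:
 *  - a node that is not yet started (or an abort that is not a
 *    multi-node stop) is stopped or restarted immediately via CMVMI,
 *  - a STOP_REQ arriving while another stop is in progress is either
 *    retried later (system stop) or refused with STOP_REF,
 *  - a multi-node stop is only supported as an abort and only on the
 *    master,
 *  - otherwise the graceful shutdown state machine starts at
 *    SL_STOPPING_1 and is driven by the ZSHUTDOWN CONTINUEB timer
 *    (StopRecord::checkTimeout).
 */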
4065 void
4066 Ndbcntr::execSTOP_REQ(Signal* signal){
4067 StopReq * const req = (StopReq *)&signal->theData[0];
4068 StopRef * const ref = (StopRef *)&signal->theData[0];
4069 Uint32 singleuser = req->singleuser;
4070 jamEntry();
4071 Uint32 senderData = req->senderData;
4072 BlockReference senderRef = req->senderRef;
4073 bool abort = StopReq::getStopAbort(req->requestInfo);
4074 bool stopnodes = StopReq::getStopNodes(req->requestInfo);
4075
4076 if(!singleuser &&
4077 (getNodeState().startLevel < NodeState::SL_STARTED ||
4078 (abort && !stopnodes)))
4079 {
4080 /**
4081 * Node is not started yet
4082 *
4083 * So stop it quickly
4084 */
4085 jam();
4086 const Uint32 reqInfo = req->requestInfo;
4087 if(StopReq::getPerformRestart(reqInfo)){
4088 jam();
4089 StartOrd * startOrd = (StartOrd *)&signal->theData[0];
4090 startOrd->restartInfo = reqInfo;
4091 sendSignal(CMVMI_REF, GSN_START_ORD, signal, 1, JBA);
4092 } else {
4093 jam();
4094 sendSignal(CMVMI_REF, GSN_STOP_ORD, signal, 1, JBA);
4095 }
4096 return;
4097 }
4098
4099 if(c_stopRec.stopReq.senderRef != 0 ||
4100 (cmasterNodeId == getOwnNodeId() && !c_start.m_starting.isclear()))
4101 {
4102 /**
4103 * Requested a system shutdown
4104 */
4105 if(!singleuser && StopReq::getSystemStop(req->requestInfo)){
4106 jam();
4107 sendSignalWithDelay(reference(), GSN_STOP_REQ, signal, 100,
4108 StopReq::SignalLength);
4109 return;
4110 }
4111
4112 /**
4113 * Requested a node shutdown
4114 */
4115 if(c_stopRec.stopReq.senderRef &&
4116 StopReq::getSystemStop(c_stopRec.stopReq.requestInfo))
4117 ref->errorCode = StopRef::SystemShutdownInProgress;
4118 else
4119 ref->errorCode = StopRef::NodeShutdownInProgress;
4120 ref->senderData = senderData;
4121 ref->masterNodeId = cmasterNodeId;
4122
4123 if (senderRef != RNIL)
4124 sendSignal(senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
4125 return;
4126 }
4127
4128 if (stopnodes && !abort)
4129 {
4130 jam();
4131 ref->errorCode = StopRef::UnsupportedNodeShutdown;
4132 ref->senderData = senderData;
4133 ref->masterNodeId = cmasterNodeId;
4134 if (senderRef != RNIL)
4135 sendSignal(senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
4136 return;
4137 }
4138
4139 if (stopnodes && cmasterNodeId != getOwnNodeId())
4140 {
4141 jam();
4142 ref->errorCode = StopRef::MultiNodeShutdownNotMaster;
4143 ref->senderData = senderData;
4144 ref->masterNodeId = cmasterNodeId;
4145 if (senderRef != RNIL)
4146 sendSignal(senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
4147 return;
4148 }
4149
4150 c_stopRec.stopReq = * req;
4151 c_stopRec.stopInitiatedTime = NdbTick_getCurrentTicks();
4152
4153 if (stopnodes)
4154 {
4155 jam();
4156
4157 if(!c_stopRec.checkNodeFail(signal))
4158 {
4159 jam();
4160 return;
4161 }
4162
4163 char buf[100];
4164 NdbNodeBitmask mask;
4165 mask.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
4166 infoEvent("Initiating shutdown abort of %s", mask.getText(buf));
4167 ndbout_c("Initiating shutdown abort of %s", mask.getText(buf));
4168
4169 WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
4170 req->senderRef = reference();
4171 req->senderData = StopRecord::SR_BLOCK_GCP_START_GCP;
4172 req->requestType = WaitGCPReq::BlockStartGcp;
4173 sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
4174 WaitGCPReq::SignalLength, JBB);
4175 return;
4176 }
4177 else if(!singleuser)
4178 {
4179 if(StopReq::getSystemStop(c_stopRec.stopReq.requestInfo))
4180 {
4181 jam();
4182 if(StopReq::getPerformRestart(c_stopRec.stopReq.requestInfo))
4183 {
4184 ((Configuration&)m_ctx.m_config).stopOnError(false);
4185 }
4186 }
4187 if(!c_stopRec.checkNodeFail(signal))
4188 {
4189 jam();
4190 return;
4191 }
4192 signal->theData[0] = NDB_LE_NDBStopStarted;
4193 signal->theData[1] = StopReq::getSystemStop(c_stopRec.stopReq.requestInfo) ? 1 : 0;
4194 sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
4195 }
4196 else
4197 {
4198 signal->theData[0] = NDB_LE_SingleUser;
4199 signal->theData[1] = 0;
4200 sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
4201 }
4202
4203 NodeState newState(NodeState::SL_STOPPING_1,
4204 StopReq::getSystemStop(c_stopRec.stopReq.requestInfo));
4205
4206 if(singleuser) {
4207 newState.setSingleUser(true);
4208 newState.setSingleUserApi(c_stopRec.stopReq.singleUserApi);
4209 }
4210 updateNodeState(signal, newState);
4211 signal->theData[0] = ZSHUTDOWN;
4212 sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 1);
4213 }
4214
4215 void
4216 Ndbcntr::StopRecord::checkTimeout(Signal* signal){
4217 jamEntry();
4218
4219 if(!cntr.getNodeState().getSingleUserMode())
4220 if(!checkNodeFail(signal)){
4221 jam();
4222 return;
4223 }
4224
4225 switch(cntr.getNodeState().startLevel){
4226 case NodeState::SL_STOPPING_1:
4227 checkApiTimeout(signal);
4228 break;
4229 case NodeState::SL_STOPPING_2:
4230 checkTcTimeout(signal);
4231 break;
4232 case NodeState::SL_STOPPING_3:
4233 checkLqhTimeout_1(signal);
4234 break;
4235 case NodeState::SL_STOPPING_4:
4236 checkLqhTimeout_2(signal);
4237 break;
4238 case NodeState::SL_SINGLEUSER:
4239 break;
4240 default:
4241 ndbrequire(false);
4242 }
4243 }
4244
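/**
 * Decide whether the requested stop may proceed. A system stop is
 * always allowed. For a node stop the remaining started nodes are
 * checked via a direct CHECKNODEGROUPSREQ (ArbitCheck) to DBDIH; if
 * losing the node(s) would leave a non-viable cluster the request is
 * refused with StopRef::NodeShutdownWouldCauseSystemCrash and the node
 * state is restored to SL_STARTED. If all requested nodes are already
 * gone, STOP_CONF is sent directly.
 */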
4245 bool
4246 Ndbcntr::StopRecord::checkNodeFail(Signal* signal){
4247 jam();
4248 if(StopReq::getSystemStop(stopReq.requestInfo)){
4249 jam();
4250 return true;
4251 }
4252
4253 /**
4254    * Check whether the cluster can survive stopping this node (or the requested nodes)
4255 */
4256 NdbNodeBitmask ndbMask;
4257 ndbMask.assign(cntr.c_startedNodes);
4258
4259 if (StopReq::getStopNodes(stopReq.requestInfo))
4260 {
4261 NdbNodeBitmask tmp;
4262 tmp.assign(NdbNodeBitmask::Size, stopReq.nodes);
4263
4264 NdbNodeBitmask ndbStopNodes;
4265 ndbStopNodes.assign(NdbNodeBitmask::Size, stopReq.nodes);
4266 ndbStopNodes.bitAND(ndbMask);
4267 ndbStopNodes.copyto(NdbNodeBitmask::Size, stopReq.nodes);
4268
4269 ndbMask.bitANDC(tmp);
4270
4271 bool allNodesStopped = true;
4272 int i ;
4273 for( i = 0; i < (int) NdbNodeBitmask::Size; i++ ){
4274 if ( stopReq.nodes[i] != 0 ){
4275 allNodesStopped = false;
4276 break;
4277 }
4278 }
4279
4280 if ( allNodesStopped ) {
4281 StopConf * const stopConf = (StopConf *)&signal->theData[0];
4282 stopConf->senderData = stopReq.senderData;
4283 stopConf->nodeState = (Uint32) NodeState::SL_NOTHING;
4284 cntr.sendSignal(stopReq.senderRef, GSN_STOP_CONF, signal,
4285 StopConf::SignalLength, JBB);
4286 stopReq.senderRef = 0;
4287 return false;
4288 }
4289
4290 }
4291 else
4292 {
4293 ndbMask.clear(cntr.getOwnNodeId());
4294 }
4295
4296 CheckNodeGroups* sd = (CheckNodeGroups*)&signal->theData[0];
4297 sd->blockRef = cntr.reference();
4298 sd->requestType = CheckNodeGroups::Direct | CheckNodeGroups::ArbitCheck;
4299 sd->mask = ndbMask;
4300 cntr.EXECUTE_DIRECT(DBDIH, GSN_CHECKNODEGROUPSREQ, signal,
4301 CheckNodeGroups::SignalLength);
4302 jamEntry();
4303 switch (sd->output) {
4304 case CheckNodeGroups::Win:
4305 case CheckNodeGroups::Partitioning:
4306 return true;
4307 break;
4308 }
4309
4310 StopRef * const ref = (StopRef *)&signal->theData[0];
4311
4312 ref->senderData = stopReq.senderData;
4313 ref->errorCode = StopRef::NodeShutdownWouldCauseSystemCrash;
4314 ref->masterNodeId = cntr.cmasterNodeId;
4315
4316 const BlockReference bref = stopReq.senderRef;
4317 if (bref != RNIL)
4318 cntr.sendSignal(bref, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
4319
4320 stopReq.senderRef = 0;
4321
4322 if (cntr.getNodeState().startLevel != NodeState::SL_SINGLEUSER)
4323 {
4324 NodeState newState(NodeState::SL_STARTED);
4325 cntr.updateNodeState(signal, newState);
4326 cntr.send_node_started_rep(signal);
4327 }
4328
4329 signal->theData[0] = NDB_LE_NDBStopAborted;
4330 cntr.sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 1, JBB);
4331
4332 return false;
4333 }
4334
4335 void
4336 Ndbcntr::StopRecord::checkApiTimeout(Signal* signal){
4337 const Int32 timeout = stopReq.apiTimeout;
4338 const NDB_TICKS now = NdbTick_getCurrentTicks();
4339 if(timeout >= 0 &&
4340 NdbTick_Elapsed(stopInitiatedTime, now).milliSec() >= (Uint64)timeout){
4341 // || checkWithApiInSomeMagicWay)
4342 jam();
4343 NodeState newState(NodeState::SL_STOPPING_2,
4344 StopReq::getSystemStop(stopReq.requestInfo));
4345 if(stopReq.singleuser) {
4346 newState.setSingleUser(true);
4347 newState.setSingleUserApi(stopReq.singleUserApi);
4348 }
4349 cntr.updateNodeState(signal, newState);
4350
4351 stopInitiatedTime = now;
4352 }
4353
4354 signal->theData[0] = ZSHUTDOWN;
4355 cntr.sendSignalWithDelay(cntr.reference(), GSN_CONTINUEB, signal, 100, 1);
4356 }
4357
4358 void
4359 Ndbcntr::StopRecord::checkTcTimeout(Signal* signal){
4360 const Int32 timeout = stopReq.transactionTimeout;
4361 const NDB_TICKS now = NdbTick_getCurrentTicks();
4362 if(timeout >= 0 &&
4363 NdbTick_Elapsed(stopInitiatedTime, now).milliSec() >= (Uint64)timeout){
4364 // || checkWithTcInSomeMagicWay)
4365 jam();
4366 if(stopReq.getSystemStop(stopReq.requestInfo) || stopReq.singleuser){
4367 jam();
4368 if(stopReq.singleuser)
4369 {
4370 jam();
4371 AbortAllReq * req = (AbortAllReq*)&signal->theData[0];
4372 req->senderRef = cntr.reference();
4373 req->senderData = 12;
4374 cntr.sendSignal(DBTC_REF, GSN_ABORT_ALL_REQ, signal,
4375 AbortAllReq::SignalLength, JBB);
4376 }
4377 else
4378 {
4379 WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
4380 req->senderRef = cntr.reference();
4381 req->senderData = StopRecord::SR_CLUSTER_SHUTDOWN;
4382 req->requestType = WaitGCPReq::CompleteForceStart;
4383 cntr.sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
4384 WaitGCPReq::SignalLength, JBB);
4385 }
4386 } else {
4387 jam();
4388 StopPermReq * req = (StopPermReq*)&signal->theData[0];
4389 req->senderRef = cntr.reference();
4390 req->senderData = 12;
4391 cntr.sendSignal(DBDIH_REF, GSN_STOP_PERM_REQ, signal,
4392 StopPermReq::SignalLength, JBB);
4393 }
4394 return;
4395 }
4396 signal->theData[0] = ZSHUTDOWN;
4397 cntr.sendSignalWithDelay(cntr.reference(), GSN_CONTINUEB, signal, 100, 1);
4398 }
4399
4400 void Ndbcntr::execSTOP_PERM_REF(Signal* signal){
4401 //StopPermRef* const ref = (StopPermRef*)&signal->theData[0];
4402
4403 jamEntry();
4404
4405 signal->theData[0] = ZSHUTDOWN;
4406 sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 1);
4407 }
4408
4409 void Ndbcntr::execSTOP_PERM_CONF(Signal* signal){
4410 jamEntry();
4411
4412 AbortAllReq * req = (AbortAllReq*)&signal->theData[0];
4413 req->senderRef = reference();
4414 req->senderData = 12;
4415 sendSignal(DBTC_REF, GSN_ABORT_ALL_REQ, signal,
4416 AbortAllReq::SignalLength, JBB);
4417 }
4418
4419 void Ndbcntr::execABORT_ALL_CONF(Signal* signal){
4420 jamEntry();
4421 if(c_stopRec.stopReq.singleuser) {
4422 jam();
4423
4424 NodeState newState(NodeState::SL_SINGLEUSER);
4425 newState.setSingleUser(true);
4426 newState.setSingleUserApi(c_stopRec.stopReq.singleUserApi);
4427 updateNodeState(signal, newState);
4428 c_stopRec.stopInitiatedTime = NdbTick_getCurrentTicks();
4429
4430 StopConf * const stopConf = (StopConf *)&signal->theData[0];
4431 stopConf->senderData = c_stopRec.stopReq.senderData;
4432 stopConf->nodeState = (Uint32) NodeState::SL_SINGLEUSER;
4433 sendSignal(c_stopRec.stopReq.senderRef, GSN_STOP_CONF, signal, StopConf::SignalLength, JBB);
4434
4435 c_stopRec.stopReq.senderRef = 0; // the command is done
4436
4437 signal->theData[0] = NDB_LE_SingleUser;
4438 signal->theData[1] = 1;
4439 signal->theData[2] = c_stopRec.stopReq.singleUserApi;
4440 sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
4441 }
4442 else
4443 {
4444 jam();
4445 NodeState newState(NodeState::SL_STOPPING_3,
4446 StopReq::getSystemStop(c_stopRec.stopReq.requestInfo));
4447 updateNodeState(signal, newState);
4448
4449 c_stopRec.stopInitiatedTime = NdbTick_getCurrentTicks();
4450
4451 signal->theData[0] = ZSHUTDOWN;
4452 sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 1);
4453 }
4454 }
4455
4456 void Ndbcntr::execABORT_ALL_REF(Signal* signal){
4457 jamEntry();
4458
4459 StopRef * const stopRef = (StopRef *)&signal->theData[0];
4460 stopRef->senderData = c_stopRec.stopReq.senderData;
4461 stopRef->errorCode = StopRef::TransactionAbortFailed;
4462 stopRef->masterNodeId = cmasterNodeId;
4463 sendSignal(c_stopRec.stopReq.senderRef, GSN_STOP_REF, signal, StopRef::SignalLength, JBB);
4464 }
4465
4466 void
4467 Ndbcntr::StopRecord::checkLqhTimeout_1(Signal* signal){
4468 const Int32 timeout = stopReq.readOperationTimeout;
4469 const NDB_TICKS now = NdbTick_getCurrentTicks();
4470
4471 if(timeout >= 0 &&
4472 NdbTick_Elapsed(stopInitiatedTime, now).milliSec() >= (Uint64)timeout){
4473 // || checkWithLqhInSomeMagicWay)
4474 jam();
4475
4476 ChangeNodeStateReq * req = (ChangeNodeStateReq*)&signal->theData[0];
4477
4478 NodeState newState(NodeState::SL_STOPPING_4,
4479 StopReq::getSystemStop(stopReq.requestInfo));
4480 req->nodeState = newState;
4481 req->senderRef = cntr.reference();
4482 req->senderData = 12;
4483 cntr.sendSignal(DBLQH_REF, GSN_CHANGE_NODE_STATE_REQ, signal,
4484 ChangeNodeStateReq::SignalLength, JBB);
4485 return;
4486 }
4487 signal->theData[0] = ZSHUTDOWN;
4488 cntr.sendSignalWithDelay(cntr.reference(), GSN_CONTINUEB, signal, 100, 1);
4489 }
4490
4491 void
4492 Ndbcntr::execCHANGE_NODE_STATE_CONF(Signal* signal)
4493 {
4494 jamEntry();
4495
4496 /**
4497 * stop replication stream
4498 */
4499 signal->theData[0] = reference();
4500 signal->theData[1] = 12;
4501 sendSignal(SUMA_REF, GSN_STOP_ME_REQ, signal, 2, JBB);
4502 }
4503
4504 void Ndbcntr::execSTOP_ME_REF(Signal* signal){
4505 jamEntry();
4506 ndbrequire(false);
4507 }
4508
4509
4510 void Ndbcntr::execSTOP_ME_CONF(Signal* signal){
4511 jamEntry();
4512
4513 const StopMeConf * conf = CAST_CONSTPTR(StopMeConf, signal->getDataPtr());
4514 if (conf->senderData == 12)
4515 {
4516 /**
4517 * Remove node from transactions
4518 */
4519 signal->theData[0] = reference();
4520 signal->theData[1] = 13;
4521 sendSignal(DBDIH_REF, GSN_STOP_ME_REQ, signal, 2, JBB);
4522 return;
4523 }
4524
4525 NodeState newState(NodeState::SL_STOPPING_4,
4526 StopReq::getSystemStop(c_stopRec.stopReq.requestInfo));
4527 updateNodeState(signal, newState);
4528
4529 c_stopRec.stopInitiatedTime = NdbTick_getCurrentTicks();
4530 signal->theData[0] = ZSHUTDOWN;
4531 sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 1);
4532 }
4533
4534 void
4535 Ndbcntr::StopRecord::checkLqhTimeout_2(Signal* signal){
4536 const Int32 timeout = stopReq.operationTimeout;
4537 const NDB_TICKS now = NdbTick_getCurrentTicks();
4538
4539 if(timeout >= 0 &&
4540 NdbTick_Elapsed(stopInitiatedTime, now).milliSec() >= (Uint64)timeout){
4541 // || checkWithLqhInSomeMagicWay)
4542 jam();
4543 if(StopReq::getPerformRestart(stopReq.requestInfo)){
4544 jam();
4545 StartOrd * startOrd = (StartOrd *)&signal->theData[0];
4546 startOrd->restartInfo = stopReq.requestInfo;
4547 cntr.sendSignal(CMVMI_REF, GSN_START_ORD, signal, 2, JBA);
4548 } else {
4549 jam();
4550 cntr.sendSignal(CMVMI_REF, GSN_STOP_ORD, signal, 1, JBA);
4551 }
4552 return;
4553 }
4554 signal->theData[0] = ZSHUTDOWN;
4555 cntr.sendSignalWithDelay(cntr.reference(), GSN_CONTINUEB, signal, 100, 1);
4556 }
4557
4558 void Ndbcntr::execWAIT_GCP_REF(Signal* signal){
4559 jamEntry();
4560
4561 //WaitGCPRef* const ref = (WaitGCPRef*)&signal->theData[0];
4562
4563 WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
4564 req->senderRef = reference();
4565 req->senderData = StopRecord::SR_CLUSTER_SHUTDOWN;
4566 req->requestType = WaitGCPReq::CompleteForceStart;
4567 sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
4568 WaitGCPReq::SignalLength, JBB);
4569 }
4570
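/**
 * WAIT_GCP_CONF drives two shutdown paths, selected by senderData:
 *
 *  SR_BLOCK_GCP_START_GCP  -> GCP start is blocked, now wait for any
 *                             running GCP (SR_WAIT_COMPLETE_GCP)
 *  SR_WAIT_COMPLETE_GCP    -> send STOP_REQ to QMGR on the requested
 *                             nodes (SR_QMGR_STOP_REQ)
 *  SR_CLUSTER_SHUTDOWN     -> GCP complete, inform QMGR and issue the
 *                             final START_ORD / STOP_ORD
 *
 * If the stop becomes non-viable, the blocked GCP start is unblocked
 * again (SR_UNBLOCK_GCP_START_GCP).
 */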
4571 void Ndbcntr::execWAIT_GCP_CONF(Signal* signal){
4572 jamEntry();
4573
4574 WaitGCPConf* conf = (WaitGCPConf*)signal->getDataPtr();
4575
4576 switch(conf->senderData){
4577 case StopRecord::SR_BLOCK_GCP_START_GCP:
4578 {
4579 jam();
4580 /**
4581      * GCP start is now blocked; wait for any running GCP to complete.
4582 */
4583 if(!c_stopRec.checkNodeFail(signal))
4584 {
4585 jam();
4586 goto unblock;
4587 }
4588
4589 WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
4590 req->senderRef = reference();
4591 req->senderData = StopRecord::SR_WAIT_COMPLETE_GCP;
4592 req->requestType = WaitGCPReq::CompleteIfRunning;
4593
4594 sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
4595 WaitGCPReq::SignalLength, JBB);
4596 return;
4597 }
4598 case StopRecord::SR_UNBLOCK_GCP_START_GCP:
4599 {
4600 jam();
4601 return;
4602 }
4603 case StopRecord::SR_WAIT_COMPLETE_GCP:
4604 {
4605 jam();
4606 if(!c_stopRec.checkNodeFail(signal))
4607 {
4608 jam();
4609 goto unblock;
4610 }
4611
4612 NdbNodeBitmask tmp;
4613 tmp.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
4614 c_stopRec.m_stop_req_counter = tmp;
4615 NodeReceiverGroup rg(QMGR, tmp);
4616 StopReq * stopReq = (StopReq *)&signal->theData[0];
4617 * stopReq = c_stopRec.stopReq;
4618 stopReq->senderRef = reference();
4619 sendSignal(rg, GSN_STOP_REQ, signal, StopReq::SignalLength, JBA);
4620 c_stopRec.m_state = StopRecord::SR_QMGR_STOP_REQ;
4621 return;
4622 }
4623 case StopRecord::SR_CLUSTER_SHUTDOWN:
4624 {
4625 jam();
4626 break;
4627 }
4628 }
4629
4630 {
4631 ndbrequire(StopReq::getSystemStop(c_stopRec.stopReq.requestInfo));
4632 NodeState newState(NodeState::SL_STOPPING_3, true);
4633
4634 /**
4635 * Inform QMGR so that arbitrator won't kill us
4636 */
4637 NodeStateRep * rep = (NodeStateRep *)&signal->theData[0];
4638 rep->nodeState = newState;
4639 rep->nodeState.masterNodeId = cmasterNodeId;
4640 rep->nodeState.setNodeGroup(c_nodeGroup);
4641 EXECUTE_DIRECT(QMGR, GSN_NODE_STATE_REP, signal,
4642 NodeStateRep::SignalLength);
4643
4644 if(StopReq::getPerformRestart(c_stopRec.stopReq.requestInfo)){
4645 jam();
4646 StartOrd * startOrd = (StartOrd *)&signal->theData[0];
4647 startOrd->restartInfo = c_stopRec.stopReq.requestInfo;
4648 sendSignalWithDelay(CMVMI_REF, GSN_START_ORD, signal, 500,
4649 StartOrd::SignalLength);
4650 } else {
4651 jam();
4652 sendSignalWithDelay(CMVMI_REF, GSN_STOP_ORD, signal, 500, 1);
4653 }
4654 return;
4655 }
4656
4657 unblock:
4658 WaitGCPReq * req = (WaitGCPReq*)&signal->theData[0];
4659 req->senderRef = reference();
4660 req->senderData = StopRecord::SR_UNBLOCK_GCP_START_GCP;
4661 req->requestType = WaitGCPReq::UnblockStartGcp;
4662 sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
4663 WaitGCPReq::SignalLength, JBB);
4664 }
4665
4666 void
4667 Ndbcntr::execSTOP_CONF(Signal* signal)
4668 {
4669 jamEntry();
4670 StopConf *conf = (StopConf*)signal->getDataPtr();
4671 ndbrequire(c_stopRec.m_state == StopRecord::SR_QMGR_STOP_REQ);
4672 c_stopRec.m_stop_req_counter.clearWaitingFor(conf->nodeId);
4673 if (c_stopRec.m_stop_req_counter.done())
4674 {
4675 char buf[100];
4676 NdbNodeBitmask mask;
4677 mask.assign(NdbNodeBitmask::Size, c_stopRec.stopReq.nodes);
4678 infoEvent("Stopping of %s", mask.getText(buf));
4679 ndbout_c("Stopping of %s", mask.getText(buf));
4680
4681 /**
4682      * Kill the requested nodes by reporting them as failed to QMGR...
4683 */
4684 FailRep * const failRep = (FailRep *)&signal->theData[0];
4685 failRep->failCause = FailRep::ZMULTI_NODE_SHUTDOWN;
4686 failRep->failSourceNodeId = getOwnNodeId();
4687 NodeReceiverGroup rg(QMGR, c_clusterNodes);
4688 Uint32 nodeId = 0;
4689 while ((nodeId = NdbNodeBitmask::find(c_stopRec.stopReq.nodes, nodeId+1))
4690 != NdbNodeBitmask::NotFound)
4691 {
4692 failRep->failNodeId = nodeId;
4693 sendSignal(rg, GSN_FAIL_REP, signal, FailRep::SignalLength, JBA);
4694 }
4695 c_stopRec.m_state = StopRecord::SR_WAIT_NODE_FAILURES;
4696 return;
4697 }
4698 }
4699
4700 void Ndbcntr::execSTTORRY(Signal* signal){
4701 jamEntry();
4702 c_missra.execSTTORRY(signal);
4703 }
4704
4705 void Ndbcntr::execREAD_CONFIG_CONF(Signal* signal){
4706 jamEntry();
4707 c_missra.execREAD_CONFIG_CONF(signal);
4708 }
4709
4710 void Ndbcntr::execSTART_ORD(Signal* signal){
4711 jamEntry();
4712 c_missra.execSTART_ORD(signal);
4713 }
4714
4715 #define CLEAR_DX 13
4716 #define CLEAR_LCP 3
4717 #define CLEAR_DD 2
4718 // FileSystemPathDataFiles FileSystemPathUndoFiles
4719
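/**
 * Remove the node's on-disk filesystem during an initial start.
 * clearFilesystem() issues one FSREMOVEREQ at a time; execFSREMOVECONF()
 * sends the next request until CLEAR_DX + CLEAR_LCP + CLEAR_DD directories
 * have been removed, and then replies with STTORRY.
 * Removal order, driven by c_fsRemoveCount:
 *   - CLEAR_DX  table data/ctl directories (one per "disk")
 *   - CLEAR_LCP local checkpoint directories
 *   - CLEAR_DD  disk data directories (data files and undo files)
 */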
4720 void
4721 Ndbcntr::clearFilesystem(Signal* signal)
4722 {
4723 jam();
4724 FsRemoveReq * req = (FsRemoveReq *)signal->getDataPtrSend();
4725 req->userReference = reference();
4726 req->userPointer = 0;
4727 req->directory = 1;
4728 req->ownDirectory = 1;
4729
4730 const Uint32 DX = CLEAR_DX;
4731 const Uint32 LCP = CLEAR_DX + CLEAR_LCP;
4732 const Uint32 DD = CLEAR_DX + CLEAR_LCP + CLEAR_DD;
4733
4734 if (c_fsRemoveCount < DX)
4735 {
4736 FsOpenReq::setVersion(req->fileNumber, 3);
4737 FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_CTL); // Can be any...
4738 FsOpenReq::v1_setDisk(req->fileNumber, c_fsRemoveCount);
4739 }
4740 else if (c_fsRemoveCount < LCP)
4741 {
4742 FsOpenReq::setVersion(req->fileNumber, 5);
4743 FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_DATA);
4744 FsOpenReq::v5_setLcpNo(req->fileNumber, c_fsRemoveCount - CLEAR_DX);
4745 FsOpenReq::v5_setTableId(req->fileNumber, 0);
4746 FsOpenReq::v5_setFragmentId(req->fileNumber, 0);
4747 }
4748 else if (c_fsRemoveCount < DD)
4749 {
4750 req->ownDirectory = 0;
4751 FsOpenReq::setVersion(req->fileNumber, 6);
4752 FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_DATA);
4753 FsOpenReq::v5_setLcpNo(req->fileNumber,
4754 FsOpenReq::BP_DD_DF + c_fsRemoveCount - LCP);
4755 }
4756 else
4757 {
4758 ndbrequire(false);
4759 }
4760
4761 sendSignal(NDBFS_REF, GSN_FSREMOVEREQ, signal,
4762 FsRemoveReq::SignalLength, JBA);
4763 c_fsRemoveCount++;
4764 }
4765
4766 void
4767 Ndbcntr::execFSREMOVECONF(Signal* signal){
4768 jamEntry();
4769 if(c_fsRemoveCount == CLEAR_DX + CLEAR_LCP + CLEAR_DD){
4770 jam();
4771 sendSttorry(signal);
4772 } else {
4773 jam();
4774 ndbrequire(c_fsRemoveCount < CLEAR_DX + CLEAR_LCP + CLEAR_DD);
4775 clearFilesystem(signal);
4776 }//if
4777 }
4778
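/**
 * Missra drives the node start: first READ_CONFIG_REQ is sent to every
 * block in readConfigOrder, then STTOR is sent block by block for each
 * start phase until all blocks have completed all of their phases.
 */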
4779 void Ndbcntr::Missra::execSTART_ORD(Signal* signal){
4780 signal->theData[0] = NDB_LE_NDBStartStarted;
4781 signal->theData[1] = NDB_VERSION;
4782 signal->theData[2] = NDB_MYSQL_VERSION_D;
4783 cntr.sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
4784
4785 currentBlockIndex = 0;
4786 sendNextREAD_CONFIG_REQ(signal);
4787 }
4788
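/**
 * Send READ_CONFIG_REQ to the next block in readConfigOrder.
 * The request is sent with a short delay, and the next one is issued
 * from execREAD_CONFIG_CONF() once the previous block has answered;
 * when all blocks are configured the STTOR sequence is started.
 */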
4789 void Ndbcntr::Missra::sendNextREAD_CONFIG_REQ(Signal* signal){
4790
4791 if(currentBlockIndex < ALL_BLOCKS_SZ){
4792 jam();
4793
4794 ReadConfigReq * req = (ReadConfigReq*)signal->getDataPtrSend();
4795 req->senderData = 0;
4796 req->senderRef = cntr.reference();
4797 req->noOfParameters = 0;
4798
4799 const BlockReference ref = readConfigOrder[currentBlockIndex];
4800
4801 g_eventLogger->info("Sending READ_CONFIG_REQ to index = %d, name = %s",
4802 currentBlockIndex,
4803 getBlockName(refToBlock(ref)));
4804
4805 /**
4806 * send delayed so that alloc gets "time-sliced"
4807 */
4808 cntr.sendSignalWithDelay(ref, GSN_READ_CONFIG_REQ, signal,
4809 1, ReadConfigReq::SignalLength);
4810 return;
4811 }
4812
4813 g_eventLogger->info("READ_CONFIG_REQ phase completed; this phase"
4814 " read the configuration, calculated"
4815 " various sizes and allocated almost all memory"
4816 " needed by the data node in its lifetime");
4817 /**
4818 * Finished...
4819 */
4820 currentStartPhase = 0;
4821 currentBlockIndex = 0;
4822 sendNextSTTOR(signal);
4823 }
4824
4825 void Ndbcntr::Missra::execREAD_CONFIG_CONF(Signal* signal)
4826 {
4827 const ReadConfigConf * conf = (ReadConfigConf*)signal->getDataPtr();
4828
4829 const Uint32 ref = conf->senderRef;
4830 ndbrequire(refToBlock(readConfigOrder[currentBlockIndex])
4831 == refToBlock(ref));
4832
4833 currentBlockIndex++;
4834 sendNextREAD_CONFIG_REQ(signal);
4835 }
4836
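/**
 * STTORRY from the current block. The reply lists the block's wanted
 * start phases in theData[3..24] in increasing order, terminated by 255,
 * e.g. (hypothetical block):
 *   theData[3] = 1, theData[4] = 3, theData[5] = 255
 * Record the first listed phase above the current one as the block's
 * NextSP and move on to the next block.
 */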
4837 void Ndbcntr::Missra::execSTTORRY(Signal* signal){
4838 const BlockReference ref = signal->senderBlockRef();
4839 ndbrequire(refToBlock(ref) == refToBlock(ALL_BLOCKS[currentBlockIndex].Ref));
4840
4841 /**
4842 * Update next start phase
4843 */
4844 for (Uint32 i = 3; i < 25; i++){
4845 jam();
4846 if (signal->theData[i] > currentStartPhase){
4847 jam();
4848 ALL_BLOCKS[currentBlockIndex].NextSP = signal->theData[i];
4849 break;
4850 }
4851 }
4852
4853 currentBlockIndex++;
4854 sendNextSTTOR(signal);
4855 }
4856
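/**
 * Drive the start phases. The outer loop iterates over the phases and the
 * inner loop over ALL_BLOCKS; a block receives STTOR for the current phase
 * when its recorded NextSP matches. When no block remains for a phase, the
 * node state is updated, the completed phase is reported, and we may wait
 * for the other starting nodes before advancing (see wait_sp()).
 */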
4857 void Ndbcntr::Missra::sendNextSTTOR(Signal* signal){
4858
4859 for(; currentStartPhase < 255 ;
4860 currentStartPhase++, g_currentStartPhase = currentStartPhase){
4861 jam();
4862
4863 #ifdef ERROR_INSERT
4864 if (cntr.cerrorInsert == 1002 &&
4865 cntr.c_error_insert_extra == currentStartPhase)
4866 {
4867 signal->theData[0] = ZBLOCK_STTOR;
4868 cntr.sendSignalWithDelay(cntr.reference(), GSN_CONTINUEB, signal, 100, 1);
4869 return;
4870 }
4871 #endif
4872
4873 const Uint32 start = currentBlockIndex;
4874 for(; currentBlockIndex < ALL_BLOCKS_SZ; currentBlockIndex++){
4875 jam();
4876 if(ALL_BLOCKS[currentBlockIndex].NextSP == currentStartPhase){
4877 jam();
4878 signal->theData[0] = 0;
4879 signal->theData[1] = currentStartPhase;
4880 signal->theData[2] = 0;
4881 signal->theData[3] = 0;
4882 signal->theData[4] = 0;
4883 signal->theData[5] = 0;
4884 signal->theData[6] = 0;
4885 signal->theData[7] = cntr.ctypeOfStart;
4886
4887 const BlockReference ref = ALL_BLOCKS[currentBlockIndex].Ref;
4888
4889 #ifdef MAX_STARTPHASE
4890 ndbrequire(currentStartPhase <= MAX_STARTPHASE);
4891 #endif
4892
4893 #ifdef TRACE_STTOR
4894 ndbout_c("sending STTOR(%d) to %s(ref=%x index=%d)",
4895 currentStartPhase,
4896 getBlockName( refToBlock(ref)),
4897 ref,
4898 currentBlockIndex);
4899 #endif
4900 if (refToBlock(ref) == DBDIH)
4901 signal->theData[7] = cntr.cdihStartType;
4902
4903 cntr.sendSignal(ref, GSN_STTOR, signal, 8, JBB);
4904
4905 return;
4906 }
4907 }
4908
4909 currentBlockIndex = 0;
4910
4911 NodeState newState(NodeState::SL_STARTING, currentStartPhase,
4912 (NodeState::StartType)cntr.ctypeOfStart);
4913 cntr.updateNodeState(signal, newState);
4914
4915 if(start != 0)
4916 {
4917 /**
4918 * At least one block wanted this start phase, record & report it
4919 */
4920 jam();
4921 g_eventLogger->info("Start phase %u completed", currentStartPhase);
4922 switch (currentStartPhase)
4923 {
4924 case 0:
4925 g_eventLogger->info("Phase 0 has made some file system"
4926 " initialisations");
4927 break;
4928 case 1:
4929 g_eventLogger->info("Phase 1 initialised some variables and"
4930 " included node in cluster, locked memory"
4931 " if configured to do so");
4932 break;
4933 case 2:
4934 switch (cntr.ctypeOfStart)
4935 {
4936 case NodeState::ST_INITIAL_START:
4937 case NodeState::ST_INITIAL_NODE_RESTART:
4938 g_eventLogger->info("Phase 2 did more initialisations, master"
4939 " accepted our start, we initialised the"
4940 " REDO log");
4941 break;
4942 case NodeState::ST_SYSTEM_RESTART:
4943 case NodeState::ST_NODE_RESTART:
4944 g_eventLogger->info("Phase 2 did more initialisations, master"
4945 " accepted our start, we started REDO log"
4946 " initialisations");
4947 break;
4948 default:
4949 break;
4950 }
4951 break;
4952 case 3:
4953 switch (cntr.ctypeOfStart)
4954 {
4955 case NodeState::ST_INITIAL_START:
4956 case NodeState::ST_SYSTEM_RESTART:
4957 g_eventLogger->info("Phase 3 performed local connection setups");
4958 break;
4959 case NodeState::ST_INITIAL_NODE_RESTART:
4960 case NodeState::ST_NODE_RESTART:
4961 g_eventLogger->info("Phase 3 locked the data dictionary, "
4962 "performed local connection setups, and"
4963 " asked for permission to start our node");
4964 break;
4965 default:
4966 break;
4967 }
4968 break;
4969 case 4:
4970 switch (cntr.ctypeOfStart)
4971 {
4972 case NodeState::ST_SYSTEM_RESTART:
4973 g_eventLogger->info("Phase 4 restored all fragments from local"
4974 " disk up to a consistent global checkpoint"
4975 " id");
4976 break;
4977 case NodeState::ST_NODE_RESTART:
4978 case NodeState::ST_INITIAL_START:
4979 case NodeState::ST_INITIAL_NODE_RESTART:
4980 g_eventLogger->info("Phase 4 continued preparations of the REDO"
4981 " log");
4982 break;
4983 default:
4984 break;
4985 }
4986 break;
4987 case 5:
4988 switch (cntr.ctypeOfStart)
4989 {
4990 case NodeState::ST_INITIAL_NODE_RESTART:
4991 case NodeState::ST_NODE_RESTART:
4992 g_eventLogger->info("Phase 5 restored local fragments in its"
4993 " first NDB phase, then copied metadata to"
4994 " our node, then copied the actual data"
4995 " over to our node, and finally waited"
4996 " for a local checkpoint to"
4997 " complete");
4998 break;
4999 case NodeState::ST_INITIAL_START:
5000 g_eventLogger->info("Phase 5 Created the System Table");
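        // Intentional fall through (assumed): an initial start also waits
        // for the local checkpoint reported below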
5001 case NodeState::ST_SYSTEM_RESTART:
5002 g_eventLogger->info("Phase 5 waited for local checkpoint to"
5003 " complete");
5004 break;
5005 default:
5006 break;
5007 }
5008 break;
5009 case 6:
5010 g_eventLogger->info("Phase 6 informed all blocks that we have"
5011 " now reached the started state.");
5012 break;
5013 case 7:
5014 g_eventLogger->info("Phase 7 mainly activated the asynchronous"
5015 " change events process, and some other"
5016 " background processes");
5017 break;
5018 case 8:
5019 switch (cntr.ctypeOfStart)
5020 {
5021 case NodeState::ST_INITIAL_START:
5022 case NodeState::ST_SYSTEM_RESTART:
5023 {
5024 g_eventLogger->info("Phase 8 enabled foreign keys and waited for"
5025 " all nodes to complete start up to this point");
5026 break;
5027 }
5028 default:
5029 break;
5030 }
5031 break;
5032 case 9:
5033 g_eventLogger->info("Phase 9 enabled APIs to start connecting");
5034 break;
5035 case 101:
5036 g_eventLogger->info("Phase 101 was used by SUMA to take over"
5037 " responsibility for sending some of the"
5038 " asynchronous change events");
5039 break;
5040 default:
5041 break;
5042 }
5043
5044 signal->theData[0] = NDB_LE_StartPhaseCompleted;
5045 signal->theData[1] = currentStartPhase;
5046 signal->theData[2] = cntr.ctypeOfStart;
5047 cntr.sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
5048
5049 /**
5050 * Check if we should wait before proceeding with the
5051 * next start phase.
5052 *
5053 * The wait_sp mechanism guarantees that, before phase X is
5054 * started, all other nodes (in a system restart/initial
5055 * start) also want to start a start phase >= X
5056 */
5057 if (cntr.wait_sp(signal, currentStartPhase + 1))
5058 {
5059 jam();
5060 currentStartPhase++;
5061 g_currentStartPhase = currentStartPhase;
5062 return;
5063 }
5064 }
5065 }
5066
5067 g_eventLogger->info("Node started");
5068
5069 signal->theData[0] = NDB_LE_NDBStartCompleted;
5070 signal->theData[1] = NDB_VERSION;
5071 signal->theData[2] = NDB_MYSQL_VERSION_D;
5072 cntr.sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
5073
5074 NodeState newState(NodeState::SL_STARTED);
5075 cntr.updateNodeState(signal, newState);
5076 cntr.send_node_started_rep(signal);
5077
5078 NodeReceiverGroup rg(NDBCNTR, cntr.c_clusterNodes);
5079 signal->theData[0] = cntr.getOwnNodeId();
5080 cntr.sendSignal(rg, GSN_CNTR_START_REP, signal, 1, JBB);
5081 }
5082
5083 void
5084 Ndbcntr::send_node_started_rep(Signal *signal)
5085 {
5086 signal->theData[0] = getOwnNodeId();
5087 sendSignal(QMGR_REF, GSN_NODE_STARTED_REP, signal, 1, JBB);
5088 }
5089
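/**
 * CREATE_NODEGROUP_IMPL_REQ: at RT_COMMIT the nodegroup configuration has
 * changed, so re-read our own node group and republish the node state if
 * it changed; always answer with CREATE_NODEGROUP_IMPL_CONF.
 */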
5090 void
5091 Ndbcntr::execCREATE_NODEGROUP_IMPL_REQ(Signal* signal)
5092 {
5093 jamEntry();
5094
5095 CreateNodegroupImplReq reqCopy = *(CreateNodegroupImplReq*)signal->getDataPtr();
5096 CreateNodegroupImplReq *req = &reqCopy;
5097
5098 if (req->requestType == CreateNodegroupImplReq::RT_COMMIT)
5099 {
5100 jam();
5101 Uint32 save = c_nodeGroup;
5102 getNodeGroup(signal);
5103 if (save != c_nodeGroup)
5104 {
5105 jam();
5106 updateNodeState(signal, getNodeState());
5107 }
5108 }
5109
5110 {
5111 CreateNodegroupImplConf* conf = (CreateNodegroupImplConf*)signal->getDataPtrSend();
5112 conf->senderRef = reference();
5113 conf->senderData = req->senderData;
5114 sendSignal(req->senderRef, GSN_CREATE_NODEGROUP_IMPL_CONF, signal,
5115 CreateNodegroupImplConf::SignalLength, JBB);
5116 }
5117 }
5118
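/**
 * DROP_NODEGROUP_IMPL_REQ: mirrors the create case above, but the node
 * group is re-read at RT_COMPLETE instead of RT_COMMIT.
 */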
5119 void
5120 Ndbcntr::execDROP_NODEGROUP_IMPL_REQ(Signal* signal)
5121 {
5122 jamEntry();
5123
5124 DropNodegroupImplReq reqCopy = *(DropNodegroupImplReq*)signal->getDataPtr();
5125 DropNodegroupImplReq *req = &reqCopy;
5126
5127 if (req->requestType == DropNodegroupImplReq::RT_COMPLETE)
5128 {
5129 jam();
5130 Uint32 save = c_nodeGroup;
5131 getNodeGroup(signal);
5132
5133 if (save != c_nodeGroup)
5134 {
5135 jam();
5136 updateNodeState(signal, getNodeState());
5137 }
5138 }
5139
5140 {
5141 DropNodegroupImplConf* conf = (DropNodegroupImplConf*)signal->getDataPtrSend();
5142 conf->senderRef = reference();
5143 conf->senderData = req->senderData;
5144 sendSignal(req->senderRef, GSN_DROP_NODEGROUP_IMPL_CONF, signal,
5145 DropNodegroupImplConf::SignalLength, JBB);
5146 }
5147 }
5148
5149 template class Vector<ddentry>;
5150