1 /*
2    Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License, version 2.0,
6    as published by the Free Software Foundation.
7 
8    This program is also distributed with certain software (including
9    but not limited to OpenSSL) that is licensed under separate terms,
10    as designated in a particular file or component or in included license
11    documentation.  The authors of MySQL hereby grant you an additional
12    permission to link the program and your derivative works with the
13    separately licensed software that they have included with MySQL.
14 
15    This program is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License, version 2.0, for more details.
19 
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
23 */
24 
25 #ifndef DBDIH_H
26 #define DBDIH_H
27 
28 #include <ndb_limits.h>
29 #include <pc.hpp>
30 #include <SimulatedBlock.hpp>
31 #include "Sysfile.hpp"
32 #include <SignalCounter.hpp>
33 
34 #include <signaldata/RedoStateRep.hpp>
35 #include <signaldata/MasterLCP.hpp>
36 #include <signaldata/CopyGCIReq.hpp>
37 #include <blocks/mutexes.hpp>
38 #include <signaldata/LCP.hpp>
39 #include <NdbSeqLock.hpp>
40 #include <CountingSemaphore.hpp>
41 #include <Mutex.hpp>
42 
43 #define JAM_FILE_ID 356
44 
45 
46 #ifdef DBDIH_C
47 
48 /*###################*/
49 /* FILE SYSTEM FLAGS */
50 /*###################*/
51 #define ZLIST_OF_PAIRS 0
52 #define ZLIST_OF_PAIRS_SYNCH 16
53 #define ZOPEN_READ_WRITE 2
54 #define ZCREATE_READ_WRITE 0x302
55 #define ZCLOSE_NO_DELETE 0
56 #define ZCLOSE_DELETE 1
57 
58 /*###############*/
59 /* NODE STATES   */
60 /*###############*/
61 #define ZIDLE 0
62 #define ZACTIVE 1
63 
64 /*#########*/
65 /* GENERAL */
66 /*#########*/
67 #define ZVAR_NO_WORD 0
68 #define ZVAR_NO_CRESTART_INFO 1
69 #define ZVAR_NO_CRESTART_INFO_TO_FILE 2
70 #define ZVALID 1
71 #define ZINVALID 2
72 
73 /*###############*/
74 /*  ERROR CODES  */
75 /*###############*/
76 // ------------------------------------------
77 // Error Codes for Transactions (None sofar)
78 // ------------------------------------------
79 #define ZUNDEFINED_FRAGMENT_ERROR 311
80 
81 // --------------------------------------
82 // Error Codes for Add Table
83 // --------------------------------------
84 #define ZREPLERROR1 306
85 #define ZREPLERROR2 307
86 
87 // --------------------------------------
88 // Other DIH error codes
89 // --------------------------------------
90 #define ZLONG_MESSAGE_ERROR 312
91 
92 // --------------------------------------
93 // Crash Codes
94 // --------------------------------------
95 #define ZCOULD_NOT_OCCUR_ERROR 300
96 #define ZNOT_MASTER_ERROR 301
97 #define ZWRONG_FAILURE_NUMBER_ERROR 302
98 #define ZWRONG_START_NODE_ERROR 303
99 #define ZNO_REPLICA_FOUND_ERROR 304
100 
101 /*#########*/
102 /* PHASES  */
103 /*#########*/
104 #define ZNDB_SPH1 1
105 #define ZNDB_SPH2 2
106 #define ZNDB_SPH3 3
107 #define ZNDB_SPH4 4
108 #define ZNDB_SPH5 5
109 #define ZNDB_SPH6 6
110 #define ZNDB_SPH7 7
111 #define ZNDB_SPH8 8
112 /*#########*/
113 /* SIZES   */
114 /*#########*/
115 /*
116  * Pages are used for flushing table definitions during LCP,
117  * and for other operations such as metadata changes etc
118  *
119  */
120 #define MAX_CONCURRENT_LCP_TAB_DEF_FLUSHES 4
121 #define MAX_CONCURRENT_DIH_TAB_DEF_OPS (MAX_CONCURRENT_LCP_TAB_DEF_FLUSHES + 2)
122 #define ZPAGEREC (MAX_CONCURRENT_DIH_TAB_DEF_OPS * PACK_TABLE_PAGES)
123 #define ZCREATE_REPLICA_FILE_SIZE 4
124 #define ZPROXY_MASTER_FILE_SIZE (MAX_NDB_NODES + 1)
125 
126 /*MaxConcurrent proxied WaitGcpReq.  Set to 10 as safety margin on 1.*/
127 #define ZPROXY_FILE_SIZE 10
128 #endif
129 
130 /*
131  * Pack table into pages.
132  * See use of writePageWord() in
133  * packTableIntoPagesLab() and helper
134  * functions to determine the constants
135  * below.
136  */
137 #define MAX_CRASHED_REPLICAS 8
138 #define PACK_REPLICAS_WORDS (4 + 4 * MAX_LCP_STORED + 2 * MAX_CRASHED_REPLICAS)
139 #define PACK_FRAGMENT_WORDS (6 + 2 * MAX_REPLICAS * PACK_REPLICAS_WORDS)
140 #define PACK_TABLE_WORDS (10 + MAX_NDB_PARTITIONS * PACK_FRAGMENT_WORDS)
141 #define PACK_TABLE_PAGE_WORDS (2048 - 32)
142 #define PACK_TABLE_PAGES ((PACK_TABLE_WORDS + PACK_TABLE_PAGE_WORDS - 1) / PACK_TABLE_PAGE_WORDS)
143 
144 #define MAX_QUEUED_FRAG_CHECKPOINTS_PER_NODE 32
145 #define MAX_STARTED_FRAG_CHECKPOINTS_PER_NODE 32
146 
147 class Dbdih: public SimulatedBlock {
148 #ifdef ERROR_INSERT
149   typedef void (Dbdih::* SendFunction)(Signal*, Uint32, Uint32);
150 #endif
151 public:
152 
153   // Records
154 
155   /*############## CONNECT_RECORD ##############*/
156   /**
157    * THE CONNECT RECORD IS CREATED WHEN A TRANSACTION HAS TO START. IT KEEPS
158    * ALL INTERMEDIATE INFORMATION NECESSARY FOR THE TRANSACTION FROM THE
159    * DISTRIBUTED MANAGER. THE RECORD KEEPS INFORMATION ABOUT THE
160    * OPERATIONS THAT HAVE TO BE CARRIED OUT BY THE TRANSACTION AND
161    * ALSO THE TRAIL OF NODES FOR EACH OPERATION IN THE THE
162    * TRANSACTION.
163    */
  struct ConnectRecord {
    enum ConnectState {
      INUSE = 0,
      FREE = 1,
      STARTED = 2,
      ALTER_TABLE = 3,
      ALTER_TABLE_ABORT = 4, // "local" abort
      ALTER_TABLE_REVERT = 5,
      GET_TABINFO = 6
    };
    // Operation-specific data. Only one union member is meaningful at a
    // time; which one is implied by connectState.
    union {
      Uint32 nodes[MAX_REPLICAS];
      struct {
        Uint32 m_changeMask;
        Uint32 m_totalfragments;
        Uint32 m_partitionCount;
        Uint32 m_org_totalfragments;
        Uint32 m_new_map_ptr_i;
      } m_alter;                 // state for ALTER_TABLE* operations
      struct {
        Uint32 m_map_ptr_i;
      } m_create;                // state for table creation
      struct {
        Uint32 m_requestInfo;
      } m_get_tabinfo;           // state for GET_TABINFO
    };
    ConnectState connectState;   // current state of this connect record
    Uint32 nextPool;             // link for the record pool free list
    Uint32 table;                // table this connect record operates on
    Uint32 userpointer;          // caller-supplied pointer (presumably echoed
                                 // back in replies -- verify against callers)
    BlockReference userblockref; // caller's block reference
    Callback m_callback;         // callback invoked on operation completion
  };
197   typedef Ptr<ConnectRecord> ConnectRecordPtr;
198 
199   /**
200    *       THESE RECORDS ARE USED WHEN CREATING REPLICAS DURING SYSTEM
201    *       RESTART. I NEED A COMPLEX DATA STRUCTURE DESCRIBING THE REPLICAS
202    *       I WILL TRY TO CREATE FOR EACH FRAGMENT.
203    *
204    *       I STORE A REFERENCE TO THE FOUR POSSIBLE CREATE REPLICA RECORDS
205    *       IN A COMMON STORED VARIABLE. I ALLOW A MAXIMUM OF 4 REPLICAS TO
206    *       BE RESTARTED PER FRAGMENT.
207    */
  struct CreateReplicaRecord {
    // Per log-execution node: GCI interval of REDO log to execute.
    // Entries [0 .. noLogNodes-1] are valid.
    Uint32 logStartGci[MAX_LOG_EXEC];
    Uint32 logStopGci[MAX_LOG_EXEC];
    Uint16 logNodeId[MAX_LOG_EXEC]; // node holding each REDO log interval
    Uint32 createLcpId;             // LCP id the replica is restored from

    Uint32 replicaRec;              // index of the associated replica record
    Uint16 dataNodeId;              // node where the replica is being created
    Uint16 lcpNo;                   // which stored LCP to restore from
    Uint16 noLogNodes;              // number of valid log node entries above
  };
219   typedef Ptr<CreateReplicaRecord> CreateReplicaRecordPtr;
220 
221   /**
222    *       THIS RECORD CONTAINS A FILE DESCRIPTION. THERE ARE TWO
223    *       FILES PER TABLE TO RAISE SECURITY LEVEL AGAINST DISK CRASHES.
224    */
  struct FileRecord {
    enum FileStatus {
      CLOSED = 0,
      CRASHED = 1,
      OPEN = 2
    };
    enum FileType {
      TABLE_FILE = 0,
      GCP_FILE = 1
    };
    // The request currently outstanding on this file; selects the
    // continuation taken when the file system confirms the operation.
    enum ReqStatus {
      IDLE = 0,
      CREATING_GCP = 1,
      OPENING_GCP = 2,
      OPENING_COPY_GCI = 3,
      WRITING_COPY_GCI = 4,
      CREATING_COPY_GCI = 5,
      OPENING_TABLE = 6,
      READING_GCP = 7,
      READING_TABLE = 8,
      WRITE_INIT_GCP = 9,
      TABLE_CREATE = 10,
      TABLE_WRITE = 11,
      TABLE_CLOSE = 12,
      CLOSING_GCP = 13,
      CLOSING_TABLE_CRASH = 14,
      CLOSING_TABLE_SR = 15,
      CLOSING_GCP_CRASH = 16,
      TABLE_OPEN_FOR_DELETE = 17,
      TABLE_CLOSE_DELETE = 18
    };
    Uint32 fileName[4];     // file name encoded as four words
    Uint32 fileRef;         // file handle (presumably from NDBFS, valid
                            // while OPEN -- verify against open handling)
    FileStatus fileStatus;
    FileType fileType;
    Uint32 nextFile;        // next file record (free list or table's pair)
    ReqStatus reqStatus;
    Uint32 tabRef;          // owning table (for TABLE_FILE files)
  };
264   typedef Ptr<FileRecord> FileRecordPtr;
265 
266   /**
267    * THIS RECORD KEEPS THE STORAGE AND DECISIONS INFORMATION OF A FRAGMENT
268    * AND ITS REPLICAS. IF FRAGMENT HAS MORE THAN ONE BACK UP
269    * REPLICA THEN A LIST OF MORE NODES IS ATTACHED TO THIS RECORD.
270    * EACH RECORD IN MORE LIST HAS INFORMATION ABOUT ONE BACKUP. THIS RECORD
271    * ALSO HAVE THE STATUS OF THE FRAGMENT.
272    */
  struct Fragmentstore {
    Uint16 activeNodes[MAX_REPLICAS]; // nodes holding an active replica;
                                      // activeNodes[0] is the primary
    Uint32 preferredPrimary;          // node preferred as primary replica

    // Heads of replica lists (presumably ReplicaRecord indexes -- verify).
    Uint32 oldStoredReplicas;    /* "DEAD" STORED REPLICAS */
    Uint32 storedReplicas;       /* "ALIVE" STORED REPLICAS */
    Uint32 nextFragmentChunk;    // next chunk in the fragment chunk list

    Uint32 m_log_part_id;        // REDO log part this fragment maps to

    /**
     * Used by Fully replicated tables to find the main fragment and to
     * find local fragments.
     */
    Uint32 fragId;
    Uint32 partition_id;
    Uint32 nextCopyFragment;

    Uint8 distributionKey;
    Uint8 fragReplicas;         // number of replicas of this fragment
    Uint8 noOldStoredReplicas;  /* NUMBER OF "DEAD" STORED REPLICAS */
    Uint8 noStoredReplicas;     /* NUMBER OF "ALIVE" STORED REPLICAS*/
    Uint8 noLcpReplicas;        ///< No of replicas remaining to be LCP:ed
  };
297   typedef Ptr<Fragmentstore> FragmentstorePtr;
298 
299   /*########### PAGE RECORD ############*/
300   /**
301    *       THIS RECORD KEEPS INFORMATION ABOUT NODE GROUPS.
302    */
  struct NodeGroupRecord {
    Uint32 nodesInGroup[MAX_REPLICAS + 1]; // node ids in this node group
    Uint32 nextReplicaNode;     // rotating index into nodesInGroup
                                // (presumably for primary distribution --
                                // verify against usage)
    Uint32 nodeCount;           // number of valid entries in nodesInGroup
    Uint32 activeTakeOver; // Which node...
    Uint32 activeTakeOverCount;
    Uint32 m_next_log_part;     // next log part to hand out
    Uint32 nodegroupIndex;
    Uint32 m_ref_count;
  };
313   typedef Ptr<NodeGroupRecord> NodeGroupRecordPtr;
314   /**
315    *       THIS RECORD KEEPS INFORMATION ABOUT NODES.
316    *
317    *       RECORD ALIGNED TO BE 64 BYTES.
318    */
319   enum NodefailHandlingStep {
320     NF_REMOVE_NODE_FROM_TABLE = 1,
321     NF_GCP_TAKE_OVER = 2,
322     NF_LCP_TAKE_OVER = 4
323   };
324 
325   /**
326    * useInTransactions is used in DIGETNODES to assert that we give
327    * DBTC a node view which is correct. To ensure we provide a view
328    * which is correct we use an RCU mechanism when executing
329    * DIGETNODES. It's not a crashing problem, but it ensures that
330    * we avoid getting into unnecessary extra wait states at node
331    * failures and also that we avoid unnecessary abortions.
332    *
333    * We update this view any time any node is changing the value of
334    * useInTransactions and DBTC could be actively executing
335    * transactions.
336    */
337   NdbSeqLock m_node_view_lock;
338 
339   struct NodeRecord
340   {
NodeRecordDbdih::NodeRecord341     NodeRecord() { }
342     /**
343      * Removed the constructor method and replaced it with the method
344      * initNodeRecord. The problem with the constructor method is that
345      * in debug compiled code it will initialise the entire object to
346      * zero. This didn't play well at all with the node recovery status
347      * which is used from the start of the node until it dies, so it
348      * should not be initialised when DIH finds it appropriate to
349      * initialise it. One could also long-term separate the two functions
350      * into two separate objects.
351      */
352     enum NodeStatus {
353       NOT_IN_CLUSTER = 0,
354       ALIVE = 1,
355       STARTING = 2,
356       DIED_NOW = 3,
357       DYING = 4,
358       DEAD = 5
359     };
360 
361     /**
362      * The NodeRecoveryStatus variable and all the timers connected to this
363      * status is used for two purposes. The first purpose is for a NDBINFO
364      * table that the master node will use to be able to specify the times
365      * a node restart has spent in the various node restart phases.
366      *
367      * This will help both the users and the developers to understand where
368      * the node restart is spending time.
369      *
370      * In addition the timers are also used to estimate how much more time
371      * the node will need before reaching the next wait for local checkpoint
372      * (LCP). Starting LCPs with good timing is crucial to shorten the waits
373      * for LCPs by the starting nodes. We want to wait with starting LCPs
374      * to ensure that as many nodes as possible are handled in between
375      * LCPs as possible. At the same time we cannot block LCP execution for
376      * any extended period since it will jeopardize the future stability of
377      * the cluster.
378      */
379     enum NodeRecoveryStatus
380     {
381       /* No valid state or node not defined in cluster */
382       NOT_DEFINED_IN_CLUSTER = 0,
383 
384       /* There is state for no information about restarts. */
385       NODE_NOT_RESTARTED_YET = 1,
386 
387       /* Node failure states are used in all nodes. */
388       NODE_FAILED = 2,
389       NODE_FAILURE_COMPLETED = 3,
390 
391       /* The first set of states are only used in master nodes. */
392       ALLOCATED_NODE_ID = 4,
393       INCLUDED_IN_HB_PROTOCOL = 5,
394       NDBCNTR_START_WAIT = 6,
395       NDBCNTR_STARTED = 7,
396       START_PERMITTED = 8,
397       WAIT_LCP_TO_COPY_DICT = 9,
398       COPY_DICT_TO_STARTING_NODE = 10,
399       INCLUDE_NODE_IN_LCP_AND_GCP = 11,
400       LOCAL_RECOVERY_STARTED = 12,
401       RESTORE_FRAG_COMPLETED = 13,
402       UNDO_DD_COMPLETED = 14,
403       EXECUTE_REDO_LOG_COMPLETED = 15,
404       COPY_FRAGMENTS_STARTED = 16,
405       WAIT_LCP_FOR_RESTART = 17,
406       WAIT_SUMA_HANDOVER = 18,
407       RESTART_COMPLETED = 19,
408 
409       /* There is a set of states used in non-master nodes as well. */
410       NODE_GETTING_PERMIT = 20,
411       NODE_GETTING_INCLUDED = 21,
412       NODE_GETTING_SYNCHED = 22,
413       NODE_IN_LCP_WAIT_STATE = 23,
414       NODE_ACTIVE = 24
415     };
416 
417     /**
418      * We need to ensure that we don't pause the node when the master node
419      * asks for it in case the node is already dead. We check this by
420      * by verifying that the node is in the state NODE_GETTING_PERMIT in
421      * in the non-master nodes. Since we do not yet maintain the
422      * nodeRecoveryStatus in all restart situations we temporarily
423      * put this into a separate variable that we maintain separately.
424      * TODO: We should use nodeRecoveryStatus when we maintain this
425      * state in all types of starts.
426      */
427     bool is_pausable;
428     NodeRecoveryStatus nodeRecoveryStatus;
429     NDB_TICKS nodeFailTime;
430     NDB_TICKS nodeFailCompletedTime;
431     NDB_TICKS allocatedNodeIdTime;
432     NDB_TICKS includedInHBProtocolTime;
433     NDB_TICKS ndbcntrStartWaitTime;
434     NDB_TICKS ndbcntrStartedTime;
435     NDB_TICKS startPermittedTime;
436     NDB_TICKS waitLCPToCopyDictTime;
437     NDB_TICKS copyDictToStartingNodeTime;
438     NDB_TICKS includeNodeInLCPAndGCPTime;
439     NDB_TICKS startDatabaseRecoveryTime;
440     NDB_TICKS startUndoDDTime;
441     NDB_TICKS startExecREDOLogTime;
442     NDB_TICKS startBuildIndexTime;
443     NDB_TICKS copyFragmentsStartedTime;
444     NDB_TICKS waitLCPForRestartTime;
445     NDB_TICKS waitSumaHandoverTime;
446     NDB_TICKS restartCompletedTime;
447 
448     NDB_TICKS nodeGettingPermitTime;
449     NDB_TICKS nodeGettingIncludedTime;
450     NDB_TICKS nodeGettingSynchedTime;
451     NDB_TICKS nodeInLCPWaitStateTime;
452     NDB_TICKS nodeActiveTime;
453 
454     struct FragmentCheckpointInfo {
455       Uint32 tableId;
456       Uint32 fragId;
457       Uint32 replicaPtr;
458     };
459 
460     Sysfile::ActiveStatus activeStatus;
461 
462     bool useInTransactions;
463 
464     NodeStatus nodeStatus;
465     bool allowNodeStart;
466     bool m_inclDihLcp;
467     Uint8 copyCompleted; // 0 = NO :-), 1 = YES, 2 = yes, first WAITING
468 
469     /**
470      * Used by master as part of running LCPs to keep track of fragments
471      * that have started checkpoints and fragments that have been queued
472      * for LCP execution.
473      */
474     FragmentCheckpointInfo startedChkpt[MAX_STARTED_FRAG_CHECKPOINTS_PER_NODE];
475     FragmentCheckpointInfo queuedChkpt[MAX_QUEUED_FRAG_CHECKPOINTS_PER_NODE];
476 
477     Bitmask<1> m_nodefailSteps;
478     Uint32 activeTabptr;
479     Uint32 nextNode;
480     Uint32 nodeGroup;
481 
482     SignalCounter m_NF_COMPLETE_REP;
483 
484     Uint8 dbtcFailCompleted;
485     Uint8 dblqhFailCompleted;
486     Uint8 dbdihFailCompleted;
487     Uint8 dbdictFailCompleted;
488     Uint8 recNODE_FAILREP;
489 
490     Uint8 noOfQueuedChkpt;
491     Uint8 noOfStartedChkpt;
492 
493     MasterLCPConf::State lcpStateAtTakeOver;
494     Uint32 m_remove_node_from_table_lcp_id;
495   };
496   typedef Ptr<NodeRecord> NodeRecordPtr;
497   /**********************************************************************/
498   /* THIS RECORD KEEPS THE INFORMATION ABOUT A TABLE AND ITS FRAGMENTS  */
499   /**********************************************************************/
  struct PageRecord {
    Uint32 word[2048];   // page payload: 2048 x 32-bit words
    /* 8 KBYTE PAGE*/
    Uint32 nextfreepage; // next page in the free list
  };
505   typedef Ptr<PageRecord> PageRecordPtr;
506 
507   /************ REPLICA RECORD *************/
508   /**********************************************************************/
509   /* THIS RECORD KEEPS THE INFORMATION ABOUT A REPLICA OF A FRAGMENT    */
510   /**********************************************************************/
511   struct ReplicaRecord {
512     /* -------------------------------------------------------------------- */
513     /* THE GLOBAL CHECKPOINT IDENTITY WHEN THIS REPLICA WAS CREATED.        */
514     /* THERE IS ONE INDEX PER REPLICA. A REPLICA INDEX IS CREATED WHEN ANODE*/
515     /* CRASH OCCURS.                                                        */
516     /* -------------------------------------------------------------------- */
517     Uint32 createGci[8];
518     /* -------------------------------------------------------------------- */
519     /* THE LAST GLOBAL CHECKPOINT IDENTITY WHICH HAS BEEN SAVED ON DISK.    */
520     /* THIS VARIABLE IS ONLY VALID FOR REPLICAS WHICH HAVE "DIED". A REPLICA*/
521     /* "DIES" EITHER WHEN THE NODE CRASHES THAT KEPT THE REPLICA OR BY BEING*/
522     /* STOPPED IN A CONTROLLED MANNER.                                      */
523     /* THERE IS ONE INDEX PER REPLICA. A REPLICA INDEX IS CREATED WHEN ANODE*/
524     /* CRASH OCCURS.                                                        */
525     /* -------------------------------------------------------------------- */
526     Uint32 replicaLastGci[8];
527     /* -------------------------------------------------------------------- */
528     /* THE LOCAL CHECKPOINT IDENTITY OF A LOCAL CHECKPOINT.                 */
529     /* -------------------------------------------------------------------- */
530     Uint32 lcpId[MAX_LCP_STORED];
531     /* -------------------------------------------------------------------- */
532     /* THIS VARIABLE KEEPS TRACK OF THE MAXIMUM GLOBAL CHECKPOINT COMPLETED */
533     /* FOR EACH OF THE LOCAL CHECKPOINTS IN THIS FRAGMENT REPLICA.          */
534     /* -------------------------------------------------------------------- */
535     Uint32 maxGciCompleted[MAX_LCP_STORED];
536     /* -------------------------------------------------------------------- */
537     /* THIS VARIABLE KEEPS TRACK OF THE MINIMUM GLOBAL CHECKPOINT STARTEDFOR*/
538     /* EACH OF THE LOCAL CHECKPOINTS IN THIS FRAGMENT REPLICA.              */
539     /* -------------------------------------------------------------------- */
540     Uint32 maxGciStarted[MAX_LCP_STORED];
541     /* -------------------------------------------------------------------- */
542     /* THE GLOBAL CHECKPOINT IDENTITY WHEN THE TABLE WAS CREATED.           */
543     /* -------------------------------------------------------------------- */
544     Uint32 initialGci;
545 
546     /* -------------------------------------------------------------------- */
547     /* THE REFERENCE TO THE NEXT REPLICA. EITHER IT REFERS TO THE NEXT IN   */
548     /* THE FREE LIST OR IT REFERS TO THE NEXT IN A LIST OF REPLICAS ON A    */
549     /* FRAGMENT.                                                            */
550     /* -------------------------------------------------------------------- */
551     Uint32 nextPool;
552 
553     /* -------------------------------------------------------------------- */
554     /*       THE NODE ID WHERE THIS REPLICA IS STORED.                      */
555     /* -------------------------------------------------------------------- */
556     Uint16 procNode;
557 
558     /* -------------------------------------------------------------------- */
559     /*    The last local checkpoint id started or queued on this replica.   */
560     /* -------------------------------------------------------------------- */
561     union {
562       Uint32 lcpIdStarted;   // Started or queued
563       Uint32 m_restorable_gci;
564     };
565 
566     /**
567      * Information needed to put the LCP_FRAG_REP into a queue and avoid
568      * sending the information onwards to all the other nodes in the
569      * cluster. We use a doubly linked list to support removal from
570      * queue due to drop table.
571      *
572      * By queueing in the local DIH we can make it appear as if the LCP
573      * is paused from the point of view of all the DIH blocks in the cluster.
574      *
575      * In the DBLQH the LCP is continuing unabated as long as there are
576      * fragments queued to execute LCPs on. The purpose of this pause support
577      * is to be able to copy the meta data without having to wait for the
578      * current LCP to be fully completed. Instead we can copy it while we are
579      * pausing the LCP reporting. This gives a possibility to provide
580      * new node with a snapshot of the metadata from the master node
581      * without having to stop the progress with the LCP execution.
582      */
583     Uint32 nextList;
584     Uint32 prevList;
585     Uint32 repMaxGciStarted;
586     Uint32 repMaxGciCompleted;
587     Uint32 fragId;
588     Uint32 tableId;
589     /* lcpNo == nextLcp, checked at queueing */
590     /* nodeId == procNode */
591 
592     /* -------------------------------------------------------------------- */
593     /* THIS VARIABLE SPECIFIES WHAT THE STATUS OF THE LOCAL CHECKPOINT IS.IT*/
594     /* CAN EITHER BE VALID OR INVALID. AT CREATION OF A FRAGMENT REPLICA ALL*/
595     /* LCP'S ARE INVALID. ALSO IF IF INDEX >= NO_LCP THEN THELOCALCHECKPOINT*/
596     /* IS ALWAYS INVALID. IF THE LCP BEFORE THE NEXT_LCP HAS LCP_ID THAT    */
597     /* DIFFERS FROM THE LATEST LCP_ID STARTED THEN THE NEXT_LCP IS ALSO     */
598     /* INVALID */
599     /* -------------------------------------------------------------------- */
600     Uint8 lcpStatus[MAX_LCP_STORED];
601 
602     /* -------------------------------------------------------------------- */
603     /*       THE NEXT LOCAL CHECKPOINT TO EXECUTE IN THIS FRAGMENT REPLICA. */
604     /* -------------------------------------------------------------------- */
605     Uint8 nextLcp;
606 
607     /* -------------------------------------------------------------------- */
608     /*       THE NUMBER OF CRASHED REPLICAS IN THIS REPLICAS SO FAR.        */
609     /* -------------------------------------------------------------------- */
610     Uint8 noCrashedReplicas;
611 
612     /**
613      * Is a LCP currently ongoing on fragment
614      */
615     Uint8 lcpOngoingFlag;
616   };
617   typedef Ptr<ReplicaRecord> ReplicaRecordPtr;
618   typedef ArrayPool<ReplicaRecord> ReplicaRecord_pool;
619   typedef DLFifoList<ReplicaRecord_pool> ReplicaRecord_fifo;
620 
621   ReplicaRecord_pool c_replicaRecordPool;
622   ReplicaRecord_fifo c_queued_lcp_frag_rep;
623 
624   /*************************************************************************
625    * TAB_DESCRIPTOR IS A DESCRIPTOR OF THE LOCATION OF THE FRAGMENTS BELONGING
626    * TO THE TABLE.THE INFORMATION ABOUT FRAGMENTS OF A TABLE ARE STORED IN
627    * CHUNKS OF FRAGMENTSTORE RECORDS.
628    * THIS RECORD ALSO HAS THE NECESSARY INFORMATION TO LOCATE A FRAGMENT AND
629    * TO LOCATE A FRAGMENT AND TO TRANSLATE A KEY OF A TUPLE TO THE FRAGMENT IT
630    * BELONGS
631    */
632   struct TabRecord
633   {
TabRecordDbdih::TabRecord634     TabRecord() { m_flags = 0; }
635 
636     /**
637      * State for copying table description into pages
638      */
639     enum CopyStatus {
640       CS_IDLE = 0,
641       CS_SR_PHASE1_READ_PAGES = 1,
642       CS_SR_PHASE2_READ_TABLE = 2,
643       CS_SR_PHASE3_COPY_TABLE = 3,
644       CS_REMOVE_NODE = 4,
645       CS_LCP_READ_TABLE = 5,
646       CS_COPY_TAB_REQ = 6,
647       CS_COPY_NODE_STATE = 7,
648       CS_ADD_TABLE_MASTER = 8,
649       CS_ADD_TABLE_SLAVE = 9,
650       CS_INVALIDATE_NODE_LCP = 10,
651       CS_ALTER_TABLE = 11,
652       CS_COPY_TO_SAVE = 12
653       ,CS_GET_TABINFO = 13
654     };
655     /**
656      * State for copying pages to disk
657      */
658     enum UpdateState {
659       US_IDLE = 0,
660       US_LOCAL_CHECKPOINT = 1,
661       US_LOCAL_CHECKPOINT_QUEUED = 2,
662       US_REMOVE_NODE = 3,
663       US_COPY_TAB_REQ = 4,
664       US_ADD_TABLE_MASTER = 5,
665       US_ADD_TABLE_SLAVE = 6,
666       US_INVALIDATE_NODE_LCP = 7,
667       US_CALLBACK = 8
668     };
669     enum TabLcpStatus {
670       TLS_ACTIVE = 1,
671       TLS_WRITING_TO_FILE = 2,
672       TLS_COMPLETED = 3
673     };
674     enum TabStatus {
675       TS_IDLE = 0,
676       TS_ACTIVE = 1,
677       TS_CREATING = 2,
678       TS_DROPPING = 3
679     };
680     enum Method {
681       LINEAR_HASH = 0,
682       NOTDEFINED = 1,
683       NORMAL_HASH = 2,
684       USER_DEFINED = 3,
685       HASH_MAP = 4
686     };
687     enum Storage {
688       ST_NOLOGGING = 0,         // Table is not logged, but survives SR
689       ST_NORMAL = 1,            // Normal table, logged and durable
690       ST_TEMPORARY = 2          // Table is lost after SR, not logged
691     };
692     enum TableFlags
693     {
694       TF_FULLY_REPLICATED = 1
695     };
696 
697     /**
698      * rw-lock that protects multiple parallel DIGETNODES (readers) from
699      *   updates to fragmenation changes (e.g UPDATE_FRAG_STATEREQ)...
700      *   search for DIH_TAB_WRITE_LOCK
701      */
702     NdbSeqLock m_lock;
703 
704     /**
705      * tabStatus, schemaTransId, m_map_ptr_i, totalfragments, noOfBackups
706      * and m_scan_reorg_flag are read concurrently from many TC threads in
707      * the execDIH_SCAN_TAB_REQ so we place these close to each other.
708      */
709     TabStatus tabStatus;
710     Uint32 schemaTransId;
711     Uint32 totalfragments;
712     /**
713      * partitionCount differs from totalfragments for fully replicated
714      * tables.
715      */
716     Uint32 partitionCount;
717     union {
718       Uint32 mask;
719       Uint32 m_map_ptr_i;
720     };
721     Uint32 m_scan_reorg_flag;
722     Uint32 m_flags;
723 
724     Uint8 noOfBackups;
725     Uint8 kvalue;
726     Uint16 primaryTableId;
727 
728     Uint16 noPages;
729     Uint16 tableType;
730 
731     Uint32 schemaVersion;
732     union {
733       Uint32 hashpointer;
734       Uint32 m_new_map_ptr_i;
735     };
736     Method method;
737 
738 
739 
740 //-----------------------------------------------------------------------------
741 // Each entry in this array contains a reference to 16 fragment records in a
742 // row. Thus finding the correct record is very quick provided the fragment id.
743 //-----------------------------------------------------------------------------
744     Uint32 startFid[(MAX_NDB_PARTITIONS - 1) / NO_OF_FRAGS_PER_CHUNK + 1];
745 
746     CopyStatus tabCopyStatus;
747     UpdateState tabUpdateState;
748     TabLcpStatus tabLcpStatus;
749     Storage tabStorage;
750 
751     Uint32 tabFile[2];
752     Uint32 noOfWords;
753     Uint32 tabRemoveNode;
754     Uint32 noOfFragChunks;
755     Uint32 tabActiveLcpFragments;
756 
757     struct {
758       Uint32 tabUserRef;
759       Uint32 tabUserPtr;
760     } m_dropTab;
761     Uint32 connectrec;
762 
763     // set in local protocol during prepare until commit
764     /**
765      * m_scan_count is heavily updated by all TC threads as they start and
766      * stop scans. This is always updated when also grabbing the mutex,
767      * so we place it close to the declaration of the mutex to avoid
768      * contaminating too many CPU cache lines.
769      */
770     Uint32 m_scan_count[2];
771 
772     /**
773      * This mutex protects the changes to m_scan_count to ensure that we
774      * complete old scans relying on old meta data before removing the
775      * metadata parts. It also protects the combination of tabStatus
776      * schemaTransId checked for in execDIH_SCAN_TAB_REQ(...).
777      *
778      * Given that DIH_SCAN_TAB_REQ also reads totalfragments, partitionCount
779      * m_map_ptr_i, noOfBackups, m_scan_reorg_flag we protect those variables
780      * as well with this mutex. These variables are also protected by the
781      * above NdbSeqLock to ensure that execDIGETNODESREQ can execute
782      * concurrently from many TC threads simultaneously.
783      *
784      * DIH_SCAN_TAB_REQ and DIH_SCAN_TAB_COMPLETE_REP are called once per
785      * scan at start and end. These will both grab a mutex on the table
786      * object. This should support in the order of a few million scans
787      * per table per data node. This should suffice. The need for a mutex
788      * comes from the fact that we need to keep track of number of scans.
789      * Thus we need to update from many different threads.
790      *
791      * DIGETNODESREQ is called once per primary key operation and once
792      * per fragment scanned in a scan operation. This means that it can
793      * be called many millions of times per second in a data node. Thus
794      * a mutex per table is not sufficient. The data read in DIGETNODESREQ
795      * is updated very seldomly. So we use the RCU mechanism, we read
796      * the value of the NdbSeqLock before reading the variables, we then
797      * read the variables protected by this mechanism whereafter we verify
798      * that the NdbSeqLock haven't changed it's value.
799      *
800      * It is noteworthy that using RCU requires reading the lock variable
801      * before and after in both the successful case as well as in the
802      * error case. We cannot deduce an error until we have verified that
803      * we have read consistent data.
804      *
805      * So with this mechanism DIGETNODESREQ can scale to almost any number
806      * of key operations and fragment scans per second with minor glitches
807      * while still performing online schema changes.
808      *
809      * We put the mutex surrounded by variables that are not used in normal
810      * operation to minimize the bad effects of CPU cache misses.
811      */
812     NdbMutex theMutex;
813 
814     Uint32 pageRef[PACK_TABLE_PAGES]; // TODO: makedynamic
815   };
816   typedef Ptr<TabRecord> TabRecordPtr;
817 
818   /***************************************************************************/
819   /* THIS RECORD IS USED TO KEEP TRACK OF TAKE OVER AND STARTING A NODE.    */
820   /* WE KEEP IT IN A RECORD TO ENABLE IT TO BE PARALLELISED IN THE FUTURE.  */
821   /**************************************************************************/
822   struct TakeOverRecord {
823 
TakeOverRecordDbdih::TakeOverRecord824     TakeOverRecord() {}
825 
826     /**
827      * States possible on slave (starting node)
828      */
829     enum ToSlaveStatus {
830       TO_SLAVE_IDLE = 0
831       ,TO_START_FRAGMENTS = 1      // Finding LCP for each fragment
832       ,TO_RUN_REDO = 2             // Waiting for local LQH to run REDO
833       ,TO_START_TO = 3             // Waiting for master (START_TOREQ)
834       ,TO_SELECTING_NEXT = 4       // Selecting next fragment to copy
835       ,TO_PREPARE_COPY = 5         // Waiting for local LQH (PREPARE_COPYREQ)
836       ,TO_UPDATE_BEFORE_STORED = 6 // Waiting on master (UPDATE_TOREQ)
837       ,TO_UPDATE_FRAG_STATE_STORED = 7
838                         // Waiting for all UPDATE_FRAG_STATEREQ stored
839       ,TO_UPDATE_AFTER_STORED = 8  // Waiting for master (UPDATE_TOREQ)
840       ,TO_COPY_FRAG = 9            // Waiting for copy node (COPY_FRAGREQ)
841       ,TO_COPY_ACTIVE = 10         // Waiting for local LQH (COPY_ACTIVEREQ)
842       ,TO_UPDATE_BEFORE_COMMIT = 11// Waiting for master (UPDATE_TOREQ)
843       ,TO_UPDATE_FRAG_STATE_COMMIT = 12
844                             // Waiting for all (UPDATE_FRAG_STATEREQ commit)
845       ,TO_UPDATE_AFTER_COMMIT = 13 // Waiting for master (UPDATE_TOREQ)
846 
847       ,TO_START_LOGGING = 14        // Enabling logging on all fragments
848       ,TO_SL_COPY_ACTIVE = 15       // Start logging: Copy active (local)
849       ,TO_SL_UPDATE_FRAG_STATE = 16 // Start logging: Create Frag (dist)
850       ,TO_END_TO = 17               // Waiting for master (END_TOREQ)
851       ,TO_QUEUED_UPDATE_BEFORE_STORED = 18 //Queued
852       ,TO_QUEUED_UPDATE_BEFORE_COMMIT = 19  //Queued
853       ,TO_QUEUED_SL_UPDATE_FRAG_STATE = 20  //Queued
854     };
855 
856     /**
857      * States possible on master
858      */
859     enum ToMasterStatus {
860       TO_MASTER_IDLE = 0
861       ,TO_MUTEX_BEFORE_STORED = 1  // Waiting for lock
862       ,TO_MUTEX_BEFORE_LOCKED = 2  // Lock held
863       ,TO_AFTER_STORED = 3         // No lock, but NGPtr reservation
864       ,TO_MUTEX_BEFORE_COMMIT = 4  // Waiting for lock
865       ,TO_MUTEX_BEFORE_SWITCH_REPLICA = 5 // Waiting for switch replica lock
866       ,TO_MUTEX_AFTER_SWITCH_REPLICA = 6
867       ,TO_WAIT_LCP = 7             // No locks, waiting for LCP
868     };
869     /**
870      * For node restarts we use a number of parallel take over records
871      * such that we can copy fragments from several LDM instances in
872      * parallel. Each thread will take care of a subset of LDM
873      * instances provided by knowing the number of instances and
874      * our thread id. For each replica we will then check if
875      * replica_instance_id % m_number_of_copy_threads == m_copy_thread_id.
876      */
877     Uint32 m_copy_thread_id;
878     Uint32 m_number_of_copy_threads;
879     Uint32 m_copy_threads_completed;
880 
881     Uint32 m_flags;       //
882     Uint32 m_senderRef;   // Who requested START_COPYREQ
883     Uint32 m_senderData;  // Data of sender
884 
885     Uint32 restorableGci; // Which GCI can be restore "locally" by node
886     Uint32 startGci;
887     Uint32 maxPage;
888     Uint32 toCopyNode;
889     Uint32 toCurrentFragid;
890     Uint32 toCurrentReplica;
891     Uint32 toCurrentTabref;
892     Uint32 toFailedNode;
893     Uint32 toStartingNode;
894     NDB_TICKS toStartTime;
895     ToSlaveStatus toSlaveStatus;
896     ToMasterStatus toMasterStatus;
897 
898     MutexHandle2<DIH_SWITCH_PRIMARY_MUTEX> m_switchPrimaryMutexHandle;
899     MutexHandle2<DIH_FRAGMENT_INFO> m_fragmentInfoMutex;
900 
901     Uint32 nextList;
902     union {
903       Uint32 prevList;
904       Uint32 nextPool;
905     };
906   };
  // Handle and container aliases for TakeOverRecord: pointer handle,
  // backing array pool, doubly-linked list and single-linked FIFO views.
  typedef Ptr<TakeOverRecord> TakeOverRecordPtr;
  typedef ArrayPool<TakeOverRecord> TakeOverRecord_pool;
  typedef DLList<TakeOverRecord_pool> TakeOverRecord_list;
  typedef SLFifoList<TakeOverRecord_pool> TakeOverRecord_fifo;
911 
912 
getParam(const char * param,Uint32 * retVal)913   virtual bool getParam(const char * param, Uint32 * retVal) {
914     if (param && strcmp(param, "ActiveMutexes") == 0)
915     {
916       if (retVal)
917       {
918         * retVal = 5 + MAX_NDB_NODES;
919       }
920       return true;
921     }
922     return false;
923   }
924 
925 public:
926   Dbdih(Block_context& ctx);
927   virtual ~Dbdih();
928 
  // Cursor state for reading/writing fragment metadata from/to table
  // pages (see readFragment/writeFragment, readPageWord/writePageWord).
  struct RWFragment {
    Uint32 pageIndex;        // Current page within the table's page list
    Uint32 wordIndex;        // Current word offset within the page
    Uint32 fragId;           // Fragment currently being processed
    TabRecordPtr rwfTabPtr;  // Table whose fragment data is processed
    PageRecordPtr rwfPageptr; // Page currently being read/written
    Uint32 totalfragments;   // Total number of fragments in the table
  };
  // Cursor state for copying packed table description data to another
  // node (see sendCopyTable / COPY_TABREQ handling).
  struct CopyTableNode {
    Uint32 pageIndex;        // Current page within the table's page list
    Uint32 wordIndex;        // Current word offset within the page
    Uint32 noOfWords;        // Number of words remaining/handled in the copy
    TabRecordPtr ctnTabPtr;  // Table being copied
    PageRecordPtr ctnPageptr; // Page currently being copied
  };
944 
private:
  // SimulatedBlock needs access to private internals for the common
  // block infrastructure.
  friend class SimulatedBlock;
  // Standard block boilerplate macro from SimulatedBlock.hpp
  // (expansion not visible in this file).
  BLOCK_DEFINES(Dbdih);
948 
949   /**
950    * Methods used in Node Recovery Status module
951    * -------------------------------------------
952    */
953   void execDBINFO_SCANREQ(Signal *);
954   void execALLOC_NODEID_REP(Signal *);
955   void execINCL_NODE_HB_PROTOCOL_REP(Signal *);
956   void execNDBCNTR_START_WAIT_REP(Signal *);
957   void execNDBCNTR_STARTED_REP(Signal *);
958   void execSUMA_HANDOVER_COMPLETE_REP(Signal *);
959   void execEND_TOREP(Signal *signal);
960   void execLOCAL_RECOVERY_COMP_REP(Signal *signal);
961 
962   void sendEND_TOREP(Signal *signal, Uint32 startNodeId);
963   bool check_stall_lcp_start(void);
964   void check_node_not_restarted_yet(NodeRecordPtr nodePtr);
965   void setNodeRecoveryStatus(Uint32 nodeId,
966                              NodeRecord::NodeRecoveryStatus new_status);
967   void setNodeRecoveryStatusInitial(NodeRecordPtr nodePtr);
968   void initNodeRecoveryTimers(NodeRecordPtr nodePtr);
969   void initNodeRecoveryStatus();
970   void initNodeRecord(NodeRecordPtr);
971   bool check_for_too_long_wait(Uint64 &lcp_max_wait_time,
972                                Uint64 &lcp_stall_time,
973                                NDB_TICKS now);
974   void check_all_node_recovery_timers(void);
975   bool check_node_recovery_timers(Uint32 nodeId);
976   void calculate_time_remaining(Uint32 nodeId,
977                                 NDB_TICKS state_start_time,
978                                 NDB_TICKS now,
979                                 NodeRecord::NodeRecoveryStatus state,
980                                 Uint32 *node_waited_for,
981                                 Uint64 *time_since_state_start,
982                                 NodeRecord::NodeRecoveryStatus *max_status);
983   void calculate_most_recent_node(Uint32 nodeId,
984                           NDB_TICKS state_start_time,
985                           NodeRecord::NodeRecoveryStatus state,
986                           Uint32 *most_recent_node,
987                           NDB_TICKS *most_recent_start_time,
988                           NodeRecord::NodeRecoveryStatus *most_recent_state);
989   const char* get_status_str(NodeRecord::NodeRecoveryStatus status);
990   void fill_row_with_node_restart_status(NodeRecordPtr nodePtr,
991                                          Ndbinfo::Row &row);
992   void write_zero_columns(Ndbinfo::Row &row, Uint32 num_rows);
993   void handle_before_master(NodeRecordPtr nodePtr, Ndbinfo::Row &row);
994   /* End methods for Node Recovery Status module */
995 
996   void execDUMP_STATE_ORD(Signal *);
997   void execNDB_TAMPER(Signal *);
998   void execDEBUG_SIG(Signal *);
999   void execMASTER_GCPREF(Signal *);
1000   void execMASTER_GCPREQ(Signal *);
1001   void execMASTER_GCPCONF(Signal *);
1002   void execMASTER_LCPREF(Signal *);
1003   void execMASTER_LCPREQ(Signal *);
1004   void execMASTER_LCPCONF(Signal *);
1005   void execNF_COMPLETEREP(Signal *);
1006   void execSTART_PERMREQ(Signal *);
1007   void execSTART_PERMCONF(Signal *);
1008   void execSTART_PERMREF(Signal *);
1009   void execINCL_NODEREQ(Signal *);
1010   void execINCL_NODECONF(Signal *);
1011 
1012   void execSTART_TOREQ(Signal *);
1013   void execSTART_TOREF(Signal *);
1014   void execSTART_TOCONF(Signal*);
1015 
1016   void execEND_TOREQ(Signal *);
1017   void execEND_TOREF(Signal *);
1018   void execEND_TOCONF(Signal*);
1019 
1020   void execUPDATE_TOREQ(Signal* signal);
1021   void execUPDATE_TOREF(Signal* signal);
1022   void execUPDATE_TOCONF(Signal* signal);
1023 
1024   void execSTART_MEREQ(Signal *);
1025   void execSTART_MECONF(Signal *);
1026   void execSTART_MEREF(Signal *);
1027   void execSTART_COPYREQ(Signal *);
1028   void execSTART_COPYCONF(Signal *);
1029   void execSTART_COPYREF(Signal *);
1030   void execUPDATE_FRAG_STATEREQ(Signal *);
1031   void execUPDATE_FRAG_STATECONF(Signal *);
1032   void execDIVERIFYREQ(Signal *);
1033   void execGCP_SAVEREQ(Signal *);
1034   void execGCP_SAVECONF(Signal *);
1035   void execGCP_PREPARECONF(Signal *);
1036   void execGCP_PREPARE(Signal *);
1037   void execGCP_NODEFINISH(Signal *);
1038   void execGCP_COMMIT(Signal *);
1039   void execSUB_GCP_COMPLETE_REP(Signal *);
1040   void execSUB_GCP_COMPLETE_ACK(Signal *);
1041   void execDIHNDBTAMPER(Signal *);
1042   void execCONTINUEB(Signal *);
1043   void execCOPY_GCIREQ(Signal *);
1044   void execCOPY_GCICONF(Signal *);
1045   void execCOPY_TABREQ(Signal *);
1046   void execCOPY_TABCONF(Signal *);
1047   void execTCGETOPSIZECONF(Signal *);
1048   void execTC_CLOPSIZECONF(Signal *);
1049   void execCHECK_LCP_IDLE_ORD(Signal *);
1050 
1051   void execDIH_GET_TABINFO_REQ(Signal*);
1052   void execSET_UP_MULTI_TRP_CONF(Signal*);
1053 
1054   /**
1055    * A number of functions used to find out if any node is currently is
1056    * restarting.
1057    */
1058   void execCHECK_NODE_RESTARTREQ(Signal*);
1059   void check_node_in_restart(Signal*, BlockReference, Uint32);
1060   void sendCHECK_NODE_RESTARTCONF(Signal*, BlockReference, Uint32);
1061 
1062   int handle_invalid_lcp_no(const struct LcpFragRep*, ReplicaRecordPtr);
1063   void execLCP_FRAG_REP(Signal *);
1064   void execLCP_COMPLETE_REP(Signal *);
1065   void execSTART_LCP_REQ(Signal *);
1066   void execSTART_LCP_CONF(Signal *);
1067   MutexHandle2<DIH_START_LCP_MUTEX> c_startLcpMutexHandle;
1068   void startLcpMutex_locked(Signal* signal, Uint32, Uint32);
1069   void startLcpMutex_unlocked(Signal* signal, Uint32, Uint32);
1070   void lcpFragmentMutex_locked(Signal* signal, Uint32, Uint32);
1071   void master_lcp_fragmentMutex_locked(Signal* signal, Uint32, Uint32);
1072 
1073   void switch_primary_stop_node(Signal* signal, Uint32, Uint32);
1074 
1075   MutexHandle2<DIH_SWITCH_PRIMARY_MUTEX> c_switchPrimaryMutexHandle;
1076   MutexHandle2<DIH_FRAGMENT_INFO> c_fragmentInfoMutex_lcp;
1077 
1078   /* LCP Pausing module start */
1079   void execFLUSH_LCP_REP_REQ(Signal*);
1080   void execFLUSH_LCP_REP_CONF(Signal*);
1081   void execPAUSE_LCP_REQ(Signal*);
1082   void execPAUSE_LCP_CONF(Signal*);
1083 
1084   void sendPAUSE_LCP_REQ(Signal*, bool pause);
1085   bool check_if_lcp_idle(void);
1086   void pause_lcp(Signal *signal,
1087                  Uint32 startNode,
1088                  BlockReference sender_ref);
1089   void unpause_lcp(Signal *signal,
1090                    Uint32 startNode,
1091                    BlockReference sender_ref,
1092                    PauseLcpReq::PauseAction pauseAction);
1093   void check_for_pause_action(Signal *signal,
1094                               StartLcpReq::PauseStart pauseStart);
1095   void end_pause(Signal *signal, PauseLcpReq::PauseAction pauseAction);
1096   void stop_pause(Signal *signal);
1097   void handle_node_failure_in_pause(Signal *signal);
1098   void dequeue_lcp_rep(Signal*);
1099   void start_copy_meta_data(Signal*);
1100   void start_lcp(Signal*);
1101   void start_lcp_before_mutex(Signal*);
1102   void queue_lcp_frag_rep(Signal *signal, LcpFragRep *lcpReport);
1103   void queue_lcp_complete_rep(Signal *signal, Uint32 lcpId);
1104   void init_lcp_pausing_module(void);
1105   bool check_pause_state_sanity(void);
1106   void check_pause_state_lcp_idle(void);
1107 
1108   /**
1109    * This is only true when an LCP is running and it is running with
1110    * support for PAUSE LCP (all DIH nodes support it). Actually this
1111    * is set when we have passed the START_LCP_REQ step. After this
1112    * step we release the fragment info mutex if we can use the pause
1113    * lcp protocol with all nodes.
1114    */
1115   bool c_lcp_runs_with_pause_support; /* Master state */
1116 
1117   /**
1118    * This is the state in the master that keeps track of where the master is
1119    * in the PAUSE LCP process. We can follow two different tracks in the
1120    * state traversal.
1121    *
1122    * 1) When the starting node is included into the LCP as part of PAUSE LCP
1123    *    handling. This is the expected outcome after pausing. The LCP didn't
1124    *    complete while we were pausing. We need to be included into the LCP
1125    *    here to ensure that the LCP state in the starting node is kept up to
1126    *    date during the rest of the LCP.
1127    *
1128    * PAUSE_LCP_IDLE -> PAUSE_LCP_REQUESTED
1129    * PAUSE_LCP_REQUESTED -> PAUSE_START_LCP_INCLUSION
1130    * PAUSE_START_LCP_INCLUSION -> PAUSE_IN_LCP_COPY_META_DATA
1131    * PAUSE_IN_LCP_COPY_META_DATA -> PAUSE_COMPLETE_LCP_INCLUSION
1132    * PAUSE_COMPLETE_LCP_INCLUSION -> PAUSE_IN_LCP_UNPAUSE
1133    * PAUSE_IN_LCP_UNPAUSE -> PAUSE_LCP_IDLE
1134    *
1135    * 2) When the starting node isn't included into the LCP as part of PAUSE
1136    *    LCP handling. While we were pausing the LCP completed. Thus no need
1137    *    to include the new node into the LCP since no more updates of the
1138    *    LCP state will happen after the pause.
1139    *
1140    * PAUSE_LCP_IDLE -> PAUSE_LCP_REQUESTED
1141    * PAUSE_LCP_REQUESTED -> PAUSE_NOT_IN_LCP_COPY_META_DATA
1142    * PAUSE_NOT_IN_LCP_COPY_META_DATA -> PAUSE_NOT_IN_LCP_UNPAUSE
1143    * PAUSE_NOT_IN_LCP_UNPAUSE -> PAUSE_LCP_IDLE
1144    */
1145   enum PauseLCPState
1146   {
1147     PAUSE_LCP_IDLE = 0,
1148     PAUSE_LCP_REQUESTED = 1,
1149     /* States to handle inclusion in LCP. */
1150     PAUSE_START_LCP_INCLUSION = 2,
1151     PAUSE_IN_LCP_COPY_META_DATA = 3,
1152     PAUSE_COMPLETE_LCP_INCLUSION = 4,
1153     PAUSE_IN_LCP_UNPAUSE = 5,
1154     /* States to handle not included in LCP */
1155     PAUSE_NOT_IN_LCP_COPY_META_DATA = 6,
1156     PAUSE_NOT_IN_LCP_UNPAUSE = 7
1157   };
1158   PauseLCPState c_pause_lcp_master_state;
1159 
1160   /**
1161    * Bitmask of nodes that we're expecting a PAUSE_LCP_CONF response from.
1162    * This bitmask is cleared if the starting node dies (or for that matter
1163    * if any node dies since this will cause the starting node to also fail).
1164    * The PAUSE_LCP_REQ_Counter is only used in the master node.
1165    */
1166   SignalCounter c_PAUSE_LCP_REQ_Counter; /* Master state */
1167 
1168   /**
1169    * We need to keep track of the LQH nodes that participated in the PAUSE
1170    * LCP request to ensure that we unpause the same set of nodes in the
   * unpause request. If the LCP completes during the pause
   * request phase, then the m_participatingLQH bitmap will be cleared and
1173    * we need this bitmap also to unpause the participants even if the
1174    * LCP has completed to ensure that the pause state is reset. This variable
1175    * is used to make sure that we retain this bitmap independent of what
1176    * happens with the LCP.
1177    */
1178   NdbNodeBitmask c_pause_participants;
1179 
1180   /**
1181    * This variable states which is the node starting up that requires a
1182    * pause of the LCP to copy the meta data during an ongoing LCP.
1183    * If the node fails this variable is set to RNIL to indicate we no
1184    * longer need to worry about signals handling this pause.
1185    *
1186    * This is also the state variable that says that pause lcp is ongoing
1187    * in this participant.
1188    */
1189   Uint32 c_pause_lcp_start_node;
1190 
is_pause_for_this_node(Uint32 node)1191   bool is_pause_for_this_node(Uint32 node)
1192   {
1193     return (node == c_pause_lcp_start_node);
1194   }
1195 
1196   /**
1197    * When is_lcp_paused is true then c_dequeue_lcp_rep_ongoing is false.
1198    * When is_lcp_paused is false then c_dequeue_lcp_rep_ongoing is true
1199    * until we have dequeued all queued requests. Requests will be
1200    * queued as long as either of them are true to ensure that we keep
1201    * the order of signals.
1202    */
is_lcp_paused()1203   bool is_lcp_paused()
1204   {
1205     return (c_pause_lcp_start_node != RNIL);
1206   }
1207   bool c_dequeue_lcp_rep_ongoing;
1208 
1209   /**
1210    * Last LCP id we heard LCP_COMPLETE_REP from local LQH. We record this
1211    * to ensure we only get one LCP_COMPLETE_REP per LCP from our local
1212    * LQH.
1213    */
1214   Uint32 c_last_id_lcp_complete_rep;
1215   bool c_queued_lcp_complete_rep;
1216 
1217   /**
1218    * As soon as we have some LCP_FRAG_REP or LCP_COMPLETE_REP queued, this
1219    * variable gives us the lcp Id of the paused LCP.
1220    */
1221   Uint32 c_lcp_id_paused;
1222 
1223   /**
1224    * We set the LCP Id when receiving COPY_TABREQ to be used in the
1225    * updateLcpInfo routine.
1226    */
1227   Uint32 c_lcp_id_while_copy_meta_data; /* State in starting node */
1228 
1229   /**
1230    * A bitmap for outstanding FLUSH_LCP_REP_REQ messages to know
1231    * when all nodes have sent their reply. This bitmap is used in all nodes
1232    * that receive the PAUSE_LCP_REQ request.
1233    */
1234   SignalCounter c_FLUSH_LCP_REP_REQ_Counter;
1235   /* LCP Pausing module end   */
1236 
1237   void execBLOCK_COMMIT_ORD(Signal *);
1238   void execUNBLOCK_COMMIT_ORD(Signal *);
1239 
1240   void execDIH_SWITCH_REPLICA_REQ(Signal *);
1241   void execDIH_SWITCH_REPLICA_REF(Signal *);
1242   void execDIH_SWITCH_REPLICA_CONF(Signal *);
1243 
1244   void execSTOP_PERM_REQ(Signal *);
1245   void execSTOP_PERM_REF(Signal *);
1246   void execSTOP_PERM_CONF(Signal *);
1247 
1248   void execSTOP_ME_REQ(Signal *);
1249   void execSTOP_ME_REF(Signal *);
1250   void execSTOP_ME_CONF(Signal *);
1251 
1252   void execREAD_CONFIG_REQ(Signal *);
1253   void execUNBLO_DICTCONF(Signal *);
1254   void execCOPY_ACTIVECONF(Signal *);
1255   void execTAB_COMMITREQ(Signal *);
1256   void execNODE_FAILREP(Signal *);
1257   void execCOPY_FRAGCONF(Signal *);
1258   void execCOPY_FRAGREF(Signal *);
1259   void execPREPARE_COPY_FRAG_REF(Signal*);
1260   void execPREPARE_COPY_FRAG_CONF(Signal*);
1261   void execDIADDTABREQ(Signal *);
1262   void execDIGETNODESREQ(Signal *);
1263   void execSTTOR(Signal *);
1264   void execDIH_SCAN_TAB_REQ(Signal *);
1265   void execDIH_SCAN_TAB_COMPLETE_REP(Signal*);
1266   void execGCP_SAVEREF(Signal *);
1267   void execGCP_TCFINISHED(Signal *);
1268   void execGCP_TCFINISHED_sync_conf(Signal* signal, Uint32 cb, Uint32 err);
1269   void execREAD_NODESCONF(Signal *);
1270   void execNDB_STTOR(Signal *);
1271   void execDICTSTARTCONF(Signal *);
1272   void execNDB_STARTREQ(Signal *);
1273   void execGETGCIREQ(Signal *);
1274   void execGET_LATEST_GCI_REQ(Signal*);
1275   void execSET_LATEST_LCP_ID(Signal*);
1276   void execDIH_RESTARTREQ(Signal *);
1277   void execSTART_RECCONF(Signal *);
1278   void execSTART_FRAGREF(Signal *);
1279   void execSTART_FRAGCONF(Signal *);
1280   void execADD_FRAGCONF(Signal *);
1281   void execADD_FRAGREF(Signal *);
1282   void execDROP_FRAG_REF(Signal *);
1283   void execDROP_FRAG_CONF(Signal *);
1284   void execFSOPENCONF(Signal *);
1285   void execFSOPENREF(Signal *);
1286   void execFSCLOSECONF(Signal *);
1287   void execFSCLOSEREF(Signal *);
1288   void execFSREADCONF(Signal *);
1289   void execFSREADREF(Signal *);
1290   void execFSWRITECONF(Signal *);
1291   void execFSWRITEREF(Signal *);
1292   void execCHECKNODEGROUPSREQ(Signal *);
1293   void execSTART_INFOREQ(Signal*);
1294   void execSTART_INFOREF(Signal*);
1295   void execSTART_INFOCONF(Signal*);
1296   void execWAIT_GCP_REQ(Signal* signal);
1297   void execWAIT_GCP_REF(Signal* signal);
1298   void execWAIT_GCP_CONF(Signal* signal);
1299   void execREDO_STATE_REP(Signal* signal);
1300 
1301   void execPREP_DROP_TAB_REQ(Signal* signal);
1302   void execDROP_TAB_REQ(Signal* signal);
1303 
1304   void execALTER_TAB_REQ(Signal* signal);
1305 
1306   void execCREATE_FRAGMENTATION_REQ(Signal*);
1307   bool verify_fragmentation(Uint16* fragments,
1308                             Uint32 partition_count,
1309                             Uint32 partition_balance,
1310                             Uint32 ldm_count) const;
1311 
1312   void waitDropTabWritingToFile(Signal *, TabRecordPtr tabPtr);
1313   void checkDropTabComplete(Signal *, TabRecordPtr tabPtr);
1314 
1315   void execDICT_LOCK_CONF(Signal* signal);
1316   void execDICT_LOCK_REF(Signal* signal);
1317 
1318   void execUPGRADE_PROTOCOL_ORD(Signal* signal);
1319 
1320   void execCREATE_NODEGROUP_IMPL_REQ(Signal*);
1321   void execDROP_NODEGROUP_IMPL_REQ(Signal*);
1322 
1323   void execSTART_NODE_LCP_CONF(Signal *signal);
1324   void handleStartLcpReq(Signal*, StartLcpReq*);
1325   StartLcpReq c_save_startLcpReq;
1326   bool c_start_node_lcp_req_outstanding;
1327 
1328   // Statement blocks
1329 //------------------------------------
1330 // Methods that send signals
1331 //------------------------------------
1332   void nullRoutine(Signal *, Uint32 nodeId, Uint32);
1333   void sendCOPY_GCIREQ(Signal *, Uint32 nodeId, Uint32);
1334   void sendDIH_SWITCH_REPLICA_REQ(Signal *, Uint32 nodeId, Uint32);
1335   void sendEND_TOREQ(Signal *, Uint32 nodeId, Uint32);
1336   void sendGCP_COMMIT(Signal *, Uint32 nodeId, Uint32);
1337   void sendGCP_PREPARE(Signal *, Uint32 nodeId, Uint32);
1338   void sendGCP_SAVEREQ(Signal *, Uint32 nodeId, Uint32);
1339   void sendSUB_GCP_COMPLETE_REP(Signal*, Uint32 nodeId, Uint32);
1340   void sendINCL_NODEREQ(Signal *, Uint32 nodeId, Uint32);
1341   void sendMASTER_GCPREQ(Signal *, Uint32 nodeId, Uint32);
1342   void sendMASTER_LCPREQ(Signal *, Uint32 nodeId, Uint32);
1343   void sendMASTER_LCPCONF(Signal * signal, Uint32 fromLine);
1344   void sendSTART_RECREQ(Signal *, Uint32 nodeId, Uint32);
1345   void sendSTART_INFOREQ(Signal *, Uint32 nodeId, Uint32);
1346   void sendSTOP_ME_REQ(Signal *, Uint32 nodeId, Uint32);
1347   void sendTC_CLOPSIZEREQ(Signal *, Uint32 nodeId, Uint32);
1348   void sendTCGETOPSIZEREQ(Signal *, Uint32 nodeId, Uint32);
1349   void sendUPDATE_TOREQ(Signal *, Uint32 nodeId, Uint32);
1350   void sendSTART_LCP_REQ(Signal *, Uint32 nodeId, Uint32);
1351 
1352   void sendLCP_FRAG_ORD(Signal*, NodeRecord::FragmentCheckpointInfo info);
1353   void sendLastLCP_FRAG_ORD(Signal *);
1354 
1355   void sendCopyTable(Signal *, CopyTableNode* ctn,
1356                      BlockReference ref, Uint32 reqinfo);
1357   void sendDihfragreq(Signal *,
1358                       TabRecordPtr regTabPtr,
1359                       Uint32 fragId);
1360 
1361   void sendStartFragreq(Signal *,
1362                         TabRecordPtr regTabPtr,
1363                         Uint32 fragId);
1364 
1365   void sendAddFragreq(Signal*,
1366                       ConnectRecordPtr,
1367                       TabRecordPtr,
1368                       Uint32 fragId,
1369                       bool rcu_lock_held);
1370   void addTable_closeConf(Signal* signal, Uint32 tabPtrI);
1371   void resetReplicaSr(TabRecordPtr tabPtr);
1372   void resetReplicaLcp(ReplicaRecord * replicaP, Uint32 stopGci);
1373   void resetReplica(Ptr<ReplicaRecord>);
1374 
1375 /**
1376  * Methods part of Transaction Handling module
1377  */
1378   void start_scan_on_table(TabRecordPtr, Signal*, Uint32, EmulatedJamBuffer*);
1379   void complete_scan_on_table(TabRecordPtr tabPtr, Uint32, EmulatedJamBuffer*);
1380 
1381   bool prepare_add_table(TabRecordPtr, ConnectRecordPtr, Signal*);
1382   void commit_new_table(TabRecordPtr);
1383 
1384   void make_node_usable(NodeRecord *nodePtr);
1385   void make_node_not_usable(NodeRecord *nodePtr);
1386 
1387   void start_add_fragments_in_new_table(TabRecordPtr,
1388                                         ConnectRecordPtr,
1389                                         const Uint16 buf[],
1390                                         Signal *signal);
1391   void make_new_table_writeable(TabRecordPtr, ConnectRecordPtr, bool);
1392   void make_new_table_read_and_writeable(TabRecordPtr,
1393                                          ConnectRecordPtr,
1394                                          Signal*);
1395   bool make_old_table_non_writeable(TabRecordPtr, ConnectRecordPtr);
1396   void make_table_use_new_replica(TabRecordPtr,
1397                                   FragmentstorePtr fragPtr,
1398                                   ReplicaRecordPtr,
1399                                   Uint32 replicaType,
1400                                   Uint32 destNodeId);
1401   void make_table_use_new_node_order(TabRecordPtr,
1402                                      FragmentstorePtr,
1403                                      Uint32,
1404                                      Uint32*);
1405   void make_new_table_non_writeable(TabRecordPtr);
1406   void drop_fragments_from_new_table_view(TabRecordPtr, ConnectRecordPtr);
1407 
1408 //------------------------------------
1409 // Methods for LCP functionality
1410 //------------------------------------
1411   void checkKeepGci(TabRecordPtr, Uint32, Fragmentstore*, Uint32);
1412   void checkLcpStart(Signal *, Uint32 lineNo, Uint32 delay);
1413   bool checkStartMoreLcp(Signal *, Uint32 nodeId, bool startNext);
1414   bool reportLcpCompletion(const struct LcpFragRep *);
1415   void sendLCP_COMPLETE_REP(Signal *);
1416 
1417 //------------------------------------
1418 // Methods for Delete Table Files
1419 //------------------------------------
1420   void startDeleteFile(Signal* signal, TabRecordPtr tabPtr);
1421   void openTableFileForDelete(Signal* signal, Uint32 fileIndex);
1422   void tableOpenLab(Signal* signal, FileRecordPtr regFilePtr);
1423   void tableDeleteLab(Signal* signal, FileRecordPtr regFilePtr);
1424 
1425 //------------------------------------
1426 // File Record specific methods
1427 //------------------------------------
1428   void closeFile(Signal *, FileRecordPtr regFilePtr);
1429   void closeFileDelete(Signal *, FileRecordPtr regFilePtr);
1430   void createFileRw(Signal *, FileRecordPtr regFilePtr);
1431   void openFileRw(Signal *, FileRecordPtr regFilePtr);
1432   void openFileRo(Signal *, FileRecordPtr regFilePtr);
1433   void seizeFile(FileRecordPtr& regFilePtr);
1434   void releaseFile(Uint32 fileIndex);
1435 
1436 //------------------------------------
1437 // Methods called when completing file
1438 // operation.
1439 //------------------------------------
1440   void creatingGcpLab(Signal *, FileRecordPtr regFilePtr);
1441   void openingGcpLab(Signal *, FileRecordPtr regFilePtr);
1442   void openingTableLab(Signal *, FileRecordPtr regFilePtr);
1443   void tableCreateLab(Signal *, FileRecordPtr regFilePtr);
1444   void creatingGcpErrorLab(Signal *, FileRecordPtr regFilePtr);
1445   void openingCopyGciErrorLab(Signal *, FileRecordPtr regFilePtr);
1446   void creatingCopyGciErrorLab(Signal *, FileRecordPtr regFilePtr);
1447   void openingGcpErrorLab(Signal *, FileRecordPtr regFilePtr);
1448   void openingTableErrorLab(Signal *, FileRecordPtr regFilePtr);
1449   void tableCreateErrorLab(Signal *, FileRecordPtr regFilePtr);
1450   void closingGcpLab(Signal *, FileRecordPtr regFilePtr);
1451   void closingGcpCrashLab(Signal *, FileRecordPtr regFilePtr);
1452   void closingTableCrashLab(Signal *, FileRecordPtr regFilePtr);
1453   void closingTableSrLab(Signal *, FileRecordPtr regFilePtr);
1454   void tableCloseLab(Signal *, FileRecordPtr regFilePtr);
1455   void tableCloseErrorLab(FileRecordPtr regFilePtr);
1456   void readingGcpLab(Signal *, FileRecordPtr regFilePtr);
1457   void readingTableLab(Signal *, FileRecordPtr regFilePtr);
1458   void readingGcpErrorLab(Signal *, FileRecordPtr regFilePtr);
1459   void readingTableErrorLab(Signal *, FileRecordPtr regFilePtr);
1460   void writingCopyGciLab(Signal *, FileRecordPtr regFilePtr);
1461   void writeInitGcpLab(Signal *, FileRecordPtr regFilePtr);
1462   void tableWriteLab(Signal *, FileRecordPtr regFilePtr);
1463   void writeInitGcpErrorLab(Signal *, FileRecordPtr regFilePtr);
1464 
1465 
1466   void checkEscalation();
1467   void clearRestartInfoBits(Signal *);
1468   void invalidateLcpInfoAfterSr(Signal*);
1469 
1470   bool isMaster();
1471   bool isActiveMaster();
1472 
1473   void handleGcpStateInMaster(Signal *, NodeRecordPtr failedNodeptr);
1474   void initRestartInfo(Signal*);
1475   void initRestorableGciFiles();
1476   void makeNodeGroups(Uint32 nodeArray[]);
1477   void add_nodegroup(NodeGroupRecordPtr);
1478   void inc_ng_refcount(Uint32 ng);
1479   void dec_ng_refcount(Uint32 ng);
1480 
1481   void makePrnList(class ReadNodesConf * readNodes, Uint32 nodeArray[]);
1482   void nodeResetStart(Signal* signal);
1483   void releaseTabPages(Uint32 tableId);
1484   void replication(Uint32 noOfReplicas,
1485                    NodeGroupRecordPtr NGPtr,
1486                    FragmentstorePtr regFragptr);
1487   void sendDihRestartRef(Signal*);
1488   void unpack_sysfile_format_v1(bool set_max_node_id);
1489   void pack_sysfile_format_v1();
1490   void unpack_sysfile_format_v2(bool set_max_node_id);
1491   void pack_sysfile_format_v2();
1492   void send_COPY_GCIREQ_data_v1(Signal*, Uint32);
1493   void send_COPY_GCIREQ_data_v2(Signal*, Uint32);
1494   void send_START_MECONF_data_v1(Signal*, Uint32);
1495   void send_START_MECONF_data_v2(Signal*, Uint32);
1496   void selectMasterCandidateAndSend(Signal *);
1497   void setLcpActiveStatusEnd(Signal*);
1498   void setLcpActiveStatusStart(Signal *);
1499   void setNodeActiveStatus();
1500   void setNodeGroups();
1501   void setNodeInfo(Signal *);
1502   void setNodeLcpActiveStatus();
1503   void setNodeRestartInfoBits(Signal*);
1504   void startGcp(Signal *);
1505   void startGcpMonitor(Signal*);
1506 
1507   void readFragment(RWFragment* rf, FragmentstorePtr regFragptr);
1508   Uint32 readPageWord(RWFragment* rf);
1509   void readReplica(RWFragment* rf, ReplicaRecordPtr readReplicaPtr);
1510   void readReplicas(RWFragment* rf,
1511                     TabRecord *regTabPtr,
1512                     FragmentstorePtr regFragptr);
1513   void updateLcpInfo(TabRecord *regTabPtr,
1514                      Fragmentstore *regFragPtr,
1515                      ReplicaRecord *regReplicaPtr);
1516   void readRestorableGci(Signal *, FileRecordPtr regFilePtr);
1517   void readTabfile(Signal *, TabRecord* tab, FileRecordPtr regFilePtr);
1518   void writeFragment(RWFragment* wf, FragmentstorePtr regFragptr);
1519   void writePageWord(RWFragment* wf, Uint32 dataWord);
1520   void writeReplicas(RWFragment* wf, Uint32 replicaStartIndex);
1521   void writeRestorableGci(Signal *, FileRecordPtr regFilePtr);
1522   void writeTabfile(Signal *, TabRecord* tab, FileRecordPtr regFilePtr);
1523   void copyTabReq_complete(Signal* signal, TabRecordPtr tabPtr);
1524 
1525   void gcpcommitreqLab(Signal *);
1526   void copyGciLab(Signal *, CopyGCIReq::CopyReason reason);
1527   void storeNewLcpIdLab(Signal *);
1528   void startLcpRoundLoopLab(Signal *, Uint32 startTableId, Uint32 startFragId);
1529 
1530   void nodeFailCompletedCheckLab(Signal*, NodeRecordPtr failedNodePtr);
1531 
1532   /**
1533    *
1534    */
1535   void setLocalNodefailHandling(Signal*, Uint32 failedNodeId,
1536 				NodefailHandlingStep step);
1537   void checkLocalNodefailComplete(Signal*, Uint32 failedNodeId,
1538 				  NodefailHandlingStep step);
1539 
1540   Callback m_sendSTTORRY;
1541   void sendSTTORRY(Signal*, Uint32 senderData = 0, Uint32 retVal = 0);
1542   void ndbsttorry10Lab(Signal *, Uint32 _line);
1543   void createMutexes(Signal* signal, Uint32 no);
1544   void createMutex_done(Signal* signal, Uint32 no, Uint32 retVal);
1545   void dumpGcpStop();
1546   void crashSystemAtGcpStop(Signal *, bool);
1547   void sendFirstDictfragsreq(Signal *, TabRecordPtr regTabPtr);
1548   void addtabrefuseLab(Signal *, ConnectRecordPtr regConnectPtr, Uint32 errorCode);
1549   void GCP_SAVEhandling(Signal *, Uint32 nodeId);
1550   void packTableIntoPagesLab(Signal *, Uint32 tableId);
1551   void readPagesIntoTableLab(Signal *, Uint32 tableId);
1552   void readPagesIntoFragLab(Signal *, RWFragment* rf);
1553   void readTabDescriptionLab(Signal *, Uint32 tableId);
1554   void copyTableLab(Signal *, Uint32 tableId);
1555   void breakCopyTableLab(Signal *,
1556                          TabRecordPtr regTabPtr,
1557                          Uint32 nodeId);
1558   void checkAddfragCompletedLab(Signal *,
1559                                 TabRecordPtr regTabPtr,
1560                                 Uint32 fragId);
1561   void completeRestartLab(Signal *);
1562   void readTableFromPagesLab(Signal *, TabRecordPtr regTabPtr);
1563   void srPhase2ReadTableLab(Signal *, TabRecordPtr regTabPtr);
1564   void checkTcCounterLab(Signal *);
1565   void calculateKeepGciLab(Signal *, Uint32 tableId, Uint32 fragId);
1566   void tableUpdateLab(Signal *, TabRecordPtr regTabPtr);
1567   void checkLcpCompletedLab(Signal *);
1568   void initLcpLab(Signal *, Uint32 masterRef, Uint32 tableId);
1569   void startGcpLab(Signal *);
1570   void checkGcpStopLab(Signal *);
1571   void MASTER_GCPhandling(Signal *, Uint32 failedNodeId);
1572   void MASTER_LCPhandling(Signal *, Uint32 failedNodeId);
1573   void rnfTableNotReadyLab(Signal *, TabRecordPtr regTabPtr, Uint32 removeNodeId);
1574   void startLcpTakeOverLab(Signal *, Uint32 failedNodeId);
1575 
1576   void startLcpMasterTakeOver(Signal *, Uint32 failedNodeId);
1577   void startGcpMasterTakeOver(Signal *, Uint32 failedNodeId);
1578   void checkGcpOutstanding(Signal*, Uint32 failedNodeId);
1579 
1580   void checkEmptyLcpComplete(Signal *);
1581   void lcpBlockedLab(Signal *, Uint32, Uint32);
1582   void breakCheckTabCompletedLab(Signal *, TabRecordPtr regTabptr);
1583   void readGciFileLab(Signal *);
1584   void openingCopyGciSkipInitLab(Signal *, FileRecordPtr regFilePtr);
1585   void startLcpRoundLab(Signal *);
1586   void gcpBlockedLab(Signal *);
1587   void allNodesLcpCompletedLab(Signal *);
1588   void nodeRestartPh2Lab(Signal *);
1589   void nodeRestartPh2Lab2(Signal *);
1590   void initGciFilesLab(Signal *);
1591   void dictStartConfLab(Signal *);
1592   void nodeDictStartConfLab(Signal *, Uint32 nodeId);
1593   void ndbStartReqLab(Signal *, BlockReference ref);
1594   void nodeRestartStartRecConfLab(Signal *);
1595   void dihCopyCompletedLab(Signal *);
1596   void release_connect(ConnectRecordPtr ptr);
1597   void copyTableNode(Signal *,
1598                      CopyTableNode* ctn,
1599                      NodeRecordPtr regNodePtr);
1600   void startFragment(Signal *, Uint32 tableId, Uint32 fragId);
1601   bool checkLcpAllTablesDoneInLqh(Uint32 from);
1602 
1603   void lcpStateAtNodeFailureLab(Signal *, Uint32 nodeId);
1604   void copyNodeLab(Signal *, Uint32 tableId);
1605   void copyGciReqLab(Signal *);
1606   void allLab(Signal *,
1607               ConnectRecordPtr regConnectPtr,
1608               TabRecordPtr regTabPtr);
1609   void tableCopyNodeLab(Signal *, TabRecordPtr regTabPtr);
1610 
1611   void removeNodeFromTables(Signal *, Uint32 tableId, Uint32 nodeId);
1612   void removeNodeFromTable(Signal *, Uint32 tableId, TabRecordPtr tabPtr);
1613   void removeNodeFromTablesComplete(Signal* signal, Uint32 nodeId);
1614 
1615   void packFragIntoPagesLab(Signal *, RWFragment* wf);
1616   void startNextChkpt(Signal *);
1617   void failedNodeLcpHandling(Signal*, NodeRecordPtr failedNodePtr, bool &);
1618   void failedNodeSynchHandling(Signal *, NodeRecordPtr failedNodePtr);
1619   void checkCopyTab(Signal*, NodeRecordPtr failedNodePtr);
1620 
1621   Uint32 compute_max_failure_time();
1622   void setGCPStopTimeouts(Signal*,
1623                           bool set_gcp_save_max_lag = true,
1624                           bool set_micro_gcp_max_lag = true);
1625   void sendINFO_GCP_STOP_TIMER(Signal*);
1626   void initCommonData();
1627   void initialiseRecordsLab(Signal *, Uint32 stepNo, Uint32, Uint32);
1628 
1629   void findReplica(ReplicaRecordPtr& regReplicaPtr,
1630                    Fragmentstore* fragPtrP,
1631 		   Uint32 nodeId,
1632 		   bool oldStoredReplicas = false);
1633 //------------------------------------
1634 // Node failure handling methods
1635 //------------------------------------
1636   void startRemoveFailedNode(Signal *, NodeRecordPtr failedNodePtr);
1637   void handleGcpTakeOver(Signal *, NodeRecordPtr failedNodePtr);
1638   void handleLcpTakeOver(Signal *, NodeRecordPtr failedNodePtr);
1639   void handleTakeOver(Signal*, Ptr<TakeOverRecord>);
1640   void handleLcpMasterTakeOver(Signal *, Uint32 nodeId);
1641 
1642 //------------------------------------
1643 // Replica record specific methods
1644 //------------------------------------
1645   Uint32 findLogInterval(ConstPtr<ReplicaRecord> regReplicaPtr,
1646 			 Uint32 startGci);
1647   void findMinGci(ReplicaRecordPtr fmgReplicaPtr,
1648                   Uint32& keeGci,
1649                   Uint32& oldestRestorableGci);
1650   bool findStartGci(Ptr<ReplicaRecord> fstReplicaPtr,
1651                     Uint32 tfstStopGci,
1652                     Uint32& tfstStartGci,
1653                     Uint32& tfstLcp);
1654   void newCrashedReplica(ReplicaRecordPtr ncrReplicaPtr);
1655   void packCrashedReplicas(ReplicaRecordPtr pcrReplicaPtr);
1656   void releaseReplicas(Uint32 * replicaPtr);
1657   void removeOldCrashedReplicas(Uint32, Uint32, ReplicaRecordPtr rocReplicaPtr);
1658   void removeTooNewCrashedReplicas(ReplicaRecordPtr rtnReplicaPtr, Uint32 lastCompletedGCI);
1659   void mergeCrashedReplicas(ReplicaRecordPtr pcrReplicaPtr);
1660   void seizeReplicaRec(ReplicaRecordPtr& replicaPtr);
1661 
1662 //------------------------------------
1663 // Methods operating on a fragment and
1664 // its connected replicas and nodes.
1665 //------------------------------------
1666   void insertCopyFragmentList(TabRecord *tabPtr,
1667                               Fragmentstore *fragPtr,
1668                               Uint32 my_fragid);
1669   void allocStoredReplica(FragmentstorePtr regFragptr,
1670                           ReplicaRecordPtr& newReplicaPtr,
1671                           Uint32 nodeId,
1672                           Uint32 fragId,
1673                           Uint32 tableId);
1674   Uint32 extractNodeInfo(EmulatedJamBuffer *jambuf,
1675                          const Fragmentstore * fragPtr,
1676                          Uint32 nodes[]);
1677   Uint32 findLocalFragment(const TabRecord *,
1678                            Ptr<Fragmentstore> & fragPtr,
1679                            EmulatedJamBuffer *jambuf);
1680   Uint32 findPartitionOrder(const TabRecord *tabPtrP,
1681                             FragmentstorePtr fragPtr);
1682   Uint32 findFirstNewFragment(const TabRecord *,
1683                               Ptr<Fragmentstore> & fragPtr,
1684                               Uint32 fragId,
1685                               EmulatedJamBuffer *jambuf);
1686   bool check_if_local_fragment(EmulatedJamBuffer *jambuf,
1687                                const Fragmentstore *fragPtr);
1688   bool findBestLogNode(CreateReplicaRecord* createReplica,
1689                        FragmentstorePtr regFragptr,
1690                        Uint32 startGci,
1691                        Uint32 stopGci,
1692                        Uint32 logNode,
1693                        Uint32& fblStopGci);
1694   bool findLogNodes(CreateReplicaRecord* createReplica,
1695                     FragmentstorePtr regFragptr,
1696                     Uint32 startGci,
1697                     Uint32 stopGci);
1698   void initFragstore(FragmentstorePtr regFragptr, Uint32 fragId);
1699   void insertfraginfo(FragmentstorePtr regFragptr,
1700                       Uint32 noOfBackups,
1701                       Uint32* nodeArray);
1702   void linkOldStoredReplica(FragmentstorePtr regFragptr,
1703                             ReplicaRecordPtr replicaPtr);
1704   void linkStoredReplica(FragmentstorePtr regFragptr,
1705                          ReplicaRecordPtr replicaPtr);
1706   void prepareReplicas(FragmentstorePtr regFragptr);
1707   void removeNodeFromStored(Uint32 nodeId,
1708                             FragmentstorePtr regFragptr,
1709                             ReplicaRecordPtr replicaPtr,
1710 			    bool temporary);
1711   void removeOldStoredReplica(FragmentstorePtr regFragptr,
1712                               ReplicaRecordPtr replicaPtr);
1713   void removeStoredReplica(FragmentstorePtr regFragptr,
1714                            ReplicaRecordPtr replicaPtr);
1715   void searchStoredReplicas(FragmentstorePtr regFragptr);
1716   bool setup_create_replica(FragmentstorePtr, CreateReplicaRecord*,
1717 			    Ptr<ReplicaRecord>);
1718   void updateNodeInfo(FragmentstorePtr regFragptr);
1719 
1720 //------------------------------------
1721 // Fragment allocation, deallocation and
1722 // find methods
1723 //------------------------------------
1724   void allocFragments(Uint32 noOfFragments, TabRecordPtr regTabPtr);
1725   void releaseFragments(TabRecordPtr regTabPtr);
1726   void getFragstore(const TabRecord *, Uint32 fragNo, FragmentstorePtr & ptr);
1727   void getFragstoreCanFail(const TabRecord *,
1728                            Uint32 fragNo,
1729                            FragmentstorePtr & ptr);
1730   void initialiseFragstore();
1731 
1732   void wait_old_scan(Signal*);
1733   Uint32 add_fragments_to_table(Ptr<TabRecord>, const Uint16 buf[]);
1734   Uint32 add_fragment_to_table(Ptr<TabRecord>, Uint32, Ptr<Fragmentstore>&);
1735 
1736   void drop_fragments(Signal*, ConnectRecordPtr, Uint32 last);
1737   void release_fragment_from_table(Ptr<TabRecord>, Uint32 fragId);
1738   void send_alter_tab_ref(Signal*, Ptr<TabRecord>,Ptr<ConnectRecord>, Uint32);
1739   void send_alter_tab_conf(Signal*, Ptr<ConnectRecord>);
1740   void alter_table_writeTable_conf(Signal* signal, Uint32 ptrI, Uint32 err);
1741   void saveTableFile(Signal*, Ptr<ConnectRecord>, Ptr<TabRecord>,
1742                      TabRecord::CopyStatus, Callback&);
1743 
1744 //------------------------------------
1745 // Page Record specific methods
1746 //------------------------------------
1747   void allocpage(PageRecordPtr& regPagePtr);
1748   void releasePage(Uint32 pageIndex);
1749 
1750 //------------------------------------
1751 // Table Record specific methods
1752 //------------------------------------
1753   void initTable(TabRecordPtr regTabPtr);
1754   void initTableFile(TabRecordPtr regTabPtr);
1755   void releaseTable(TabRecordPtr tabPtr);
1756 
1757   void handleTakeOverMaster(Signal *, Uint32 takeOverPtr);
1758   void handleTakeOverNewMaster(Signal *, Uint32 takeOverPtr);
1759 
1760 //------------------------------------
1761 // Node Record specific methods
1762 //------------------------------------
1763   void checkStartTakeOver(Signal *);
1764   void insertAlive(NodeRecordPtr newNodePtr);
1765   void insertDeadNode(NodeRecordPtr removeNodePtr);
1766   void removeAlive(NodeRecordPtr removeNodePtr);
1767   void removeDeadNode(NodeRecordPtr removeNodePtr);
1768 
1769   NodeRecord::NodeStatus getNodeStatus(Uint32 nodeId);
1770   void setNodeStatus(Uint32 nodeId, NodeRecord::NodeStatus);
1771   Sysfile::ActiveStatus getNodeActiveStatus(Uint32 nodeId);
1772   void setNodeActiveStatus(Uint32 nodeId, Sysfile::ActiveStatus newStatus);
1773   void setNodeLcpActiveStatus(Uint32 nodeId, bool newState);
1774   bool getNodeLcpActiveStatus(Uint32 nodeId);
1775   bool getAllowNodeStart(Uint32 nodeId);
1776   void setAllowNodeStart(Uint32 nodeId, bool newState);
1777   bool getNodeCopyCompleted(Uint32 nodeId);
1778   void setNodeCopyCompleted(Uint32 nodeId, bool newState);
1779   Uint32 getNodeGroup(Uint32 nodeId) const;
1780   bool checkNodeAlive(Uint32 nodeId);
1781 
1782   void getTabInfo(Signal*);
1783   void getTabInfo_send(Signal*, TabRecordPtr);
1784   void getTabInfo_sendComplete(Signal*, Uint32, Uint32);
1785   int getTabInfo_copyTableToSection(SegmentedSectionPtr & ptr, CopyTableNode);
1786   int getTabInfo_copySectionToPages(TabRecordPtr, SegmentedSectionPtr);
1787 
1788   // Initialisation
1789   void initData();
1790   void initRecords();
1791 
1792   // Variables to support record structures and their free lists
1793 
1794   ConnectRecord *connectRecord;
1795   Uint32 cfirstconnect;
1796   Uint32 cconnectFileSize;
1797 
1798   CreateReplicaRecord *createReplicaRecord;
1799   Uint32 cnoOfCreateReplicas;
1800 
1801   FileRecord *fileRecord;
1802   Uint32 cfirstfreeFile;
1803   Uint32 cfileFileSize;
1804 
1805   Fragmentstore *fragmentstore;
1806   Uint32 cfirstfragstore;
1807   Uint32 cfragstoreFileSize;
1808   RSS_OP_SNAPSHOT(cremainingfrags);
1809 
1810   NodeGroupRecord *nodeGroupRecord;
1811   RSS_OP_SNAPSHOT(cnghash);
1812 
1813   Uint32 c_nextNodeGroup;
1814   Uint16 c_next_replica_node[MAX_NDB_NODE_GROUPS][NDBMT_MAX_WORKER_INSTANCES];
1815 
1816   /**
1817    * Temporary variables used by CREATE_FRAGMENTATION_REQ
1818    */
1819   Uint16
1820     tmp_next_replica_node[MAX_NDB_NODE_GROUPS][NDBMT_MAX_WORKER_INSTANCES];
1821   Uint8
1822     tmp_next_replica_node_set[MAX_NDB_NODE_GROUPS][NDBMT_MAX_WORKER_INSTANCES];
1823   Uint16 tmp_node_group_id[MAX_NDB_PARTITIONS];
1824   Uint16 tmp_fragments_per_ldm[MAX_NDB_NODES][NDBMT_MAX_WORKER_INSTANCES];
1825   Uint16 tmp_fragments_per_node[MAX_NDB_NODES];
1826   void init_next_replica_node(
1827     Uint16
1828      (*next_replica_node)[MAX_NDB_NODE_GROUPS][NDBMT_MAX_WORKER_INSTANCES],
1829      Uint32 noOfReplicas);
1830 
1831   NodeRecord *nodeRecord;
1832 
1833   PageRecord *pageRecord;
1834   Uint32 cfirstfreepage;
1835   Uint32 cpageFileSize;
1836 
1837   Uint32 cnoFreeReplicaRec;
1838   Uint32 creplicaFileSize;
1839   RSS_OP_SNAPSHOT(cnoFreeReplicaRec);
1840 
1841   TabRecord *tabRecord;
1842   Uint32 ctabFileSize;
1843 
1844   /**
1845    * Methods and variables used to control the node restart phase where a
1846    * node gets the data back from an alive node. This has two parts, one
1847    * part in the master node which controls that certain critical data is
1848    * only updated one at a time. The other part is in the starting node
1849    * where there is one thread for each parallel fragment copy process.
1850    *
1851    * There is also a set of signals used for the take over processes.
1852    *
1853    * START_FRAGREQ
1854    * Before performing the actual copy phase the starting node needs
   * information about all fragments to start. This signal is sent from
   * the starting node's DBDIH to the starting node's DBLQH and to the
   * actual instance that will handle the fragment replica.
1858    *
1859    * START_RECREQ/CONF:
1860    * This is sent from the starting node to all LDM instances to tell them
1861    * that they have now received all START_FRAGREQ, no more will be sent. After
1862    * receiving this signal the LDM instances can start reading the fragments
1863    * from disk and applying the REDO log to get them as up to date as possible
1864    * before we start the copy phase. One could also rebuild the ordered
1865    * indexes here.
1866    *
1867    * START_TOREQ/CONF/REF:
1868    * This is sent from the starting node to allocate a take over record in the
1869    * master node. This is sent once at the start of the take over processing.
1870    *
1871    * UPDATE_TOREQ/CONF/REF:
1872    * This is sent from a starting node to inform the master of a step forward
1873    * in the copy process. In some of those phases it means acquiring the global
1874    * cluster mutex on updating fragment state, in some phases it means
1875    * releasing the same mutex. Also the global switch primary replica mutex
1876    * can be acquired and released in certain phases.
1877    *
1878    * This is sent once before UPDATE_FRAGSTATEREQ/CONF and once after for each
1879    * fragment replica that the starting node will take over.
1880    *
1881    * UPDATE_FRAGSTATEREQ/CONF/REF:
1882    * This signal is sent to all nodes from starting node informing them of a
1883    * new replica entering a certain fragment. After the CONF has been received
1884    * we're sure that all transactions will involve this new node when updating
1885    * this fragment. We have a distribution key that can be used to verify if a
   * particular transaction has included the node in its transaction.
1887    *
1888    * This is sent once per fragment replica the starting node is taking over.
1889    *
1890    * PREPARE_COPY_FRAGREQ/CONF/REF:
1891    * This is sent from starting node to the LDM instance in starting node
1892    * asking for the maxPage value. Once per fragment replica to take over.
1893    *
1894    * COPY_FRAGREQ/CONF/REF:
1895    * This is sent to the copying node with the maxPage value. This will start
1896    * a scan in the copying node and copying over all records that have a newer
1897    * GCI than the one already restored from an LCP (the maxPage is also
1898    * somehow involved in this decision).
1899    * This signal relates to copying one fragment and is done after updating the
1900    * fragment state to ensure that all future transactions will involve the
1901    * node as well. There is another fragment state update performed after this
1902    * copy is completed.
1903    *
1904    * Sent once per fragment replica the starting node is taking over.
1905    *
1906    * COPY_ACTIVEREQ/CONF/REF:
1907    * This tells the starting node that the fragment replica is now copied over
1908    * and is in an active state.
1909    *
1910    * Sent per fragment replica the starting node is taking over.
1911    *
1912    * END_TOREQ/CONF/REF:
1913    * This is sent from the starting node to the master node. The response can
1914    * take a long time since it involves waiting for the proper LCP to complete
1915    * to ensure that the node is fully recoverable even on its own without other
1916    * nodes to assist it. For this to happen the node requires a complete
1917    * LCP to happen which started after we completed the copying of all
1918    * fragments and where the new node was part of the LCP.
1919    *
1920    * This is sent only once at the end of the take over process.
1921    * Multiple nodes can be in the take over process at the same time.
1922    *
1923    * CONTINUEB:
1924    * This signal is used a lot to control execution in the local DIH block.
1925    * It is used to start parallel threads and to ensure that we don't
1926    * execute for too long without giving other threads a chance to execute
1927    * or other signals to the DIH block.
1928    *
1929    * Variable descriptions
1930    * ---------------------
1931    *
1932    * We have a pool of take over records used by the master for
1933    * handling parallel node recoveries. We also use the same pool
1934    * in starting nodes to keep one main take over record and then
1935    * one record for each parallel thread that we can copy from in
1936    * parallel.
1937    *
1938    * Then for each thread that takes over we keep one record.
1939    * These records are always in one list.
1940    *
1941    * All threads are scanning fragments to find a fragment replica that needs
1942    * take over. When they discover one they try to update the fragment replica
1943    * state on the master (start takeover), which requires that they
1944    * temporarily become the activeThread. If this succeeds they are placed in
1945    * the activeThread variable. If it isn't successful they are placed into the
1946    * c_queued_for_start_takeover_list. When the global fragment replica state
1947    * update is completed, the list is checked to see if a queued thread should
1948    * become the activeThread. Then COPY_FRAGREQ is sent and the thread is
   * placed on the c_active_copy_threads_list. When the start take over
   * phase is completed one starts the next take over from the list and sends
   * off COPY_FRAGREQ whereafter it is placed in the c_active_copy_threads_list.
1952    *
1953    * When the copy phase is completed the take over record is removed
   * from the c_active_copy_threads_list and one tries to become
   * the active thread. If it isn't successful the take over record
   * is placed into the c_queued_for_commit_takeover_list. When the
1957    * active thread is done it gets a new record from either the
1958    * c_queued_for_start_takeover_list or from
1959    * c_queued_for_commit_takeover_list. c_queued_for_commit_takeover_list has
1960    * higher priority. Finally when there is no more fragments to find
1961    * for a certain thread after ending the takeover of a fragment
1962    * the record is placed into the c_completed_copy_thread_list.
1963    * When all threads are placed into this list then all threads are
1964    * done with the copy phase.
1965    *
1966    * Finally we start up the phase where we activate the REDO log.
1967    * During this phase the records are placed into the
   * c_active_copy_threads_list. When a thread is completed with
1969    * this phase the take over record is released. When all threads
1970    * are completed we are done with this parallelisation phase and the
1971    * node copying phase is completed whereafter we can also release the
1972    * main take over record.
1973    *
1974    * c_takeOverPool:
1975    * This is the pool of records used by both master and starting
1976    * node.
1977    *
1978    * c_mainTakeOverPtr:
1979    * This is the main record used by the starting node.
1980    *
1981    * c_queued_for_start_takeover_list:
1982    * A takeover thread is ready to copy a fragment, but has to wait until
1983    * another thread is ready with its master communication before
1984    * proceeding.
1985    *
1986    * c_queued_for_commit_takeover_list:
1987    * A takeover thread is ready to complete the copy of a fragment, it has to
1988    * wait a while since there is another thread currently communicating with
1989    * the master node.
1990    *
1991    * These two are queues, so we implement them as a Single Linked List,
1992    * FIFO queue, this means a SLFifoList.
1993    *
   * c_max_takeover_copy_threads:
   * This is the limit on the number of threads to use. Effectively the
1996    * parallelisation can never be higher than the number of LDM instances
1997    * that are used in the cluster.
1998    *
1999    * c_active_copy_threads_list:
2000    * Takeover threads are placed into this list while they are actively
2001    * copying a fragment at this point in time. We need to take things out
2002    * of this list in any order, so we need Double Linked List.
2003    *
2004    * c_activeTakeOverList:
   * While scanning fragments to find a fragment that our thread is
2006    * responsible for, we are placed into this list. This list handling
2007    * is on the starting node.
2008    *
2009    * This list is also used on the master node to keep track of node
2010    * take overs.
2011    *
2012    * c_completed_copy_threads_list:
   * This is a list where a thread is placed after completing the first
2014    * phase of scanning for fragments to copy. Some threads will be done
2015    * with this very quickly if we have more threads scanning than we have
2016    * LDM instances in the cluster. After completing the second phase where
2017    * we change state of ongoing transactions we release the thread.
2018    *
2019    * c_activeThreadTakeOverPtr:
2020    * This is the pointer to the currently active thread using the master
2021    * node to update the fragment state.
2022    *
2023    */
2024 #define ZTAKE_OVER_THREADS 16
2025 #define ZMAX_TAKE_OVER_THREADS 64
2026   Uint32 c_max_takeover_copy_threads;
2027 
2028   TakeOverRecord_pool c_takeOverPool;
2029   TakeOverRecord_list c_activeTakeOverList;
2030   TakeOverRecord_fifo c_queued_for_start_takeover_list;
2031   TakeOverRecord_fifo c_queued_for_commit_takeover_list;
2032   TakeOverRecord_list c_active_copy_threads_list;
2033   TakeOverRecord_list c_completed_copy_threads_list;
2034   TakeOverRecordPtr c_mainTakeOverPtr;
2035   TakeOverRecordPtr c_activeThreadTakeOverPtr;
2036 
2037   /* List used in takeover handling in master part. */
2038   TakeOverRecord_list c_masterActiveTakeOverList;
2039 
2040 
2041 //-----------------------------------------------------
2042 // TakeOver Record specific methods, starting node part
2043 //-----------------------------------------------------
2044   void startTakeOver(Signal *,
2045                      Uint32 startNode,
2046                      Uint32 toNode,
2047                      const struct StartCopyReq*);
2048 
2049   void startNextCopyFragment(Signal *, Uint32 takeOverPtr);
2050   void toCopyFragLab(Signal *, Uint32 takeOverPtr);
2051   void toStartCopyFrag(Signal *, TakeOverRecordPtr);
2052   void toCopyCompletedLab(Signal *, TakeOverRecordPtr regTakeOverptr);
2053 
2054   void nr_start_fragments(Signal*, TakeOverRecordPtr);
2055   void nr_start_fragment(Signal*, TakeOverRecordPtr, ReplicaRecordPtr);
2056   void nr_run_redo(Signal*, TakeOverRecordPtr);
2057   void nr_start_logging(Signal*, TakeOverRecordPtr);
2058 
2059   bool check_takeover_thread(TakeOverRecordPtr takeOverPtr,
2060                              FragmentstorePtr fragPtr,
2061                              Uint32 fragmentReplicaInstanceKey);
2062   void send_continueb_start_next_copy(Signal *signal,
2063                                       TakeOverRecordPtr takeOverPtr);
2064   void init_takeover_thread(TakeOverRecordPtr takeOverPtr,
2065                             TakeOverRecordPtr mainTakeOverPtr,
2066                             Uint32 number_of_threads,
2067                             Uint32 thread_id);
2068   void start_next_takeover_thread(Signal *signal);
2069   void start_thread_takeover_logging(Signal *signal);
2070   void send_continueb_nr_start_logging(Signal *signal,
2071                                        TakeOverRecordPtr takeOverPtr);
2072   bool thread_takeover_completed(Signal *signal,
2073                                  TakeOverRecordPtr takeOverPtr);
2074   bool thread_takeover_copy_completed(Signal *signal,
2075                                       TakeOverRecordPtr takeOverPtr);
2076   void release_take_over_threads(void);
2077   void check_take_over_completed_correctly(void);
2078 
2079   void sendStartTo(Signal* signal, TakeOverRecordPtr);
2080   void sendUpdateTo(Signal* signal, TakeOverRecordPtr);
2081   void sendUpdateFragStateReq(Signal *,
2082                               Uint32 startGci,
2083                               Uint32 storedType,
2084                               TakeOverRecordPtr takeOverPtr);
2085 
2086   void releaseTakeOver(TakeOverRecordPtr takeOverPtr,
2087                        bool from_master,
2088                        bool skip_check = false);
2089 
2090 //-------------------------------------------------
2091 // Methods for take over functionality, master part
2092 //-------------------------------------------------
2093   void switchPrimaryMutex_locked(Signal* signal, Uint32, Uint32);
2094   void switchPrimaryMutex_unlocked(Signal* signal, Uint32, Uint32);
2095   void check_force_lcp(Ptr<TakeOverRecord> takeOverPtr);
2096   void abortTakeOver(Signal*, TakeOverRecordPtr);
2097   void updateToReq_fragmentMutex_locked(Signal*, Uint32, Uint32);
2098   bool findTakeOver(Ptr<TakeOverRecord> & ptr, Uint32 failedNodeId);
2099   void insertBackup(FragmentstorePtr regFragptr, Uint32 nodeId);
2100 
2101   /*
2102     2.4  C O M M O N    S T O R E D    V A R I A B L E S
2103     ----------------------------------------------------
2104   */
2105   bool c_performed_copy_phase;
2106 
2107   struct DIVERIFY_queue
2108   {
DIVERIFY_queueDbdih::DIVERIFY_queue2109     DIVERIFY_queue() {
2110       m_ref = 0;
2111       cfirstVerifyQueue = clastVerifyQueue = 0;
2112       m_empty_done = 1;
2113     }
2114     Uint32 cfirstVerifyQueue;
2115     Uint32 clastVerifyQueue;
2116     Uint32 m_empty_done;
2117     Uint32 m_ref;
2118     char pad[NDB_CL_PADSZ(sizeof(void*) + 4 * sizeof(Uint32))];
2119   };
2120 
2121   bool isEmpty(const DIVERIFY_queue&);
2122   void enqueue(DIVERIFY_queue&);
2123   void dequeue(DIVERIFY_queue&);
2124   void emptyverificbuffer(Signal *, Uint32 q, bool aContintueB);
2125   void emptyverificbuffer_check(Signal*, Uint32, Uint32);
2126 
2127   DIVERIFY_queue c_diverify_queue[MAX_NDBMT_TC_THREADS];
2128   Uint32 c_diverify_queue_cnt;
2129 
2130   /*------------------------------------------------------------------------*/
2131   /*       THIS VARIABLE KEEPS THE REFERENCES TO FILE RECORDS THAT DESCRIBE */
2132   /*       THE TWO FILES THAT ARE USED TO STORE THE VARIABLE CRESTART_INFO  */
2133   /*       ON DISK.                                                         */
2134   /*------------------------------------------------------------------------*/
2135   Uint32 crestartInfoFile[2];
2136 
2137   bool cgckptflag;    /* A FLAG WHICH IS SET WHILE A NEW GLOBAL CHECK
2138                            POINT IS BEING CREATED. NO VERIFICATION IS ALLOWED
2139                            IF THE FLAG IS SET*/
2140   Uint32 cgcpOrderBlocked;
2141 
2142   /**
2143    * This structure describes
2144    *   the GCP Save protocol
2145    */
  struct GcpSave
  {
    Uint32 m_gci;        // GCI this GCP Save round refers to
    Uint32 m_master_ref; // Block reference of the protocol master
    enum State {
      GCP_SAVE_IDLE     = 0, // Idle
      GCP_SAVE_REQ      = 1, // REQ received
      GCP_SAVE_CONF     = 2, // REF/CONF sent
      GCP_SAVE_COPY_GCI = 3  // Copying restart info (COPY_GCIREQ phase)
    } m_state;

    // State kept only by the node acting as master of the protocol.
    struct {
      State m_state;
      Uint32 m_new_gci;            // presumably next GCI to save — confirm
      Uint32 m_time_between_gcp;   /* Delay between global checkpoints */
      NDB_TICKS m_start_time;      // When the current round was started
    } m_master;
  } m_gcp_save;
2164 
2165   /**
2166    * This structure describes the MicroGCP protocol
2167    */
2168   struct MicroGcp
2169   {
MicroGcpDbdih::MicroGcp2170     MicroGcp() { }
2171     bool m_enabled;
2172     Uint32 m_master_ref;
2173 
2174     /**
2175      * rw-lock that protects multiple parallel DIVERIFY (readers) from
2176      *   updates to gcp-state (e.g GCP_PREPARE, GCP_COMMIT)
2177      */
2178     NdbSeqLock m_lock;
2179     Uint64 m_old_gci;
2180     // To avoid double send of SUB_GCP_COMPLETE_REP to SUMA via DBLQH.
2181     Uint64 m_last_sent_gci;
2182     Uint64 m_current_gci; // Currently active
2183     Uint64 m_new_gci;     // Currently being prepared...
2184     enum State {
2185       M_GCP_IDLE      = 0,
2186       M_GCP_PREPARE   = 1,
2187       M_GCP_COMMIT    = 2,
2188       M_GCP_COMMITTED = 3,
2189       M_GCP_COMPLETE  = 4
2190     } m_state;
2191 
2192     struct {
2193       State m_state;
2194       Uint32 m_time_between_gcp;
2195       Uint64 m_new_gci;
2196       NDB_TICKS m_start_time;
2197     } m_master;
2198   } m_micro_gcp;
2199 
  /**
   * Monitoring state used to detect a stalled GCP protocol — one part
   * for the GCP_SAVE protocol and one for the micro GCP protocol
   * (see checkGcpStopLab()/crashSystemAtGcpStop()).
   */
  struct GcpMonitor
  {
    struct
    {
      Uint32 m_gci;        // GCI seen at last check; used to detect progress
      Uint32 m_elapsed_ms; //MilliSec since last GCP_SAVEed
      Uint32 m_max_lag_ms; //Max allowed lag(ms) before 'crashSystem'
      bool m_need_max_lag_recalc; // Whether max lag need to be recalculated
#ifdef ERROR_INSERT
      bool test_set_max_lag; // Testing
#endif
    } m_gcp_save;

    struct
    {
      Uint64 m_gci;        // GCI seen at last check; used to detect progress
      Uint32 m_elapsed_ms; //MilliSec since last GCP_COMMITed
      Uint32 m_max_lag_ms; //Max allowed lag(ms) before 'crashSystem'
      bool m_need_max_lag_recalc; // Whether max lag need to be recalculated
#ifdef ERROR_INSERT
      bool test_set_max_lag; // Testing
#endif
    } m_micro_gcp;

    NDB_TICKS m_last_check; //Time GCP monitor last checked

#ifdef ERROR_INSERT
    Uint32 m_savedMaxCommitLag;  // Testing
#endif
  } m_gcp_monitor;
2230 
2231   /*------------------------------------------------------------------------*/
2232   /*       THIS VARIABLE KEEPS TRACK OF THE STATE OF THIS NODE AS MASTER.   */
2233   /*------------------------------------------------------------------------*/
2234   enum MasterState {
2235     MASTER_IDLE = 0,
2236     MASTER_ACTIVE = 1,
2237     MASTER_TAKE_OVER_GCP = 2
2238   };
2239   MasterState cmasterState;
2240   Uint16      cmasterTakeOverNode;
2241   /* NODE IS NOT MASTER            */
2242   /* NODE IS ACTIVE AS MASTER      */
2243   /* NODE IS TAKING OVER AS MASTER */
2244 
2245   struct CopyGCIMaster {
CopyGCIMasterDbdih::CopyGCIMaster2246     CopyGCIMaster(){
2247       m_copyReason = CopyGCIReq::IDLE;
2248       for (Uint32 i = 0; i<WAIT_CNT; i++)
2249         m_waiting[i] = CopyGCIReq::IDLE;
2250     }
2251     /*------------------------------------------------------------------------*/
2252     /*       THIS STATE VARIABLE IS USED TO INDICATE IF COPYING OF RESTART    */
2253     /*       INFO WAS STARTED BY A LOCAL CHECKPOINT OR AS PART OF A SYSTEM    */
2254     /*       RESTART.                                                         */
2255     /*------------------------------------------------------------------------*/
2256     CopyGCIReq::CopyReason m_copyReason;
2257 
2258     /*------------------------------------------------------------------------*/
2259     /*       COPYING RESTART INFO CAN BE STARTED BY LOCAL CHECKPOINTS AND BY  */
2260     /*       GLOBAL CHECKPOINTS. WE CAN HOWEVER ONLY HANDLE TWO SUCH COPY AT  */
2261     /*       THE TIME. THUS WE HAVE TO KEEP WAIT INFORMATION IN THIS VARIABLE.*/
2262     /*------------------------------------------------------------------------*/
2263     STATIC_CONST( WAIT_CNT = 2 );
2264     CopyGCIReq::CopyReason m_waiting[WAIT_CNT];
2265   } c_copyGCIMaster;
2266 
2267   struct CopyGCISlave {
CopyGCISlaveDbdih::CopyGCISlave2268     CopyGCISlave(){ m_copyReason = CopyGCIReq::IDLE; m_expectedNextWord = 0;}
2269     /*------------------------------------------------------------------------*/
2270     /*       THIS STATE VARIABLE IS USED TO INDICATE IF COPYING OF RESTART    */
2271     /*       INFO WAS STARTED BY A LOCAL CHECKPOINT OR AS PART OF A SYSTEM    */
2272     /*       RESTART. THIS VARIABLE IS USED BY THE NODE THAT RECEIVES         */
2273     /*       COPY_GCI_REQ.                                                    */
2274     /*------------------------------------------------------------------------*/
2275     Uint32 m_senderData;
2276     BlockReference m_senderRef;
2277     CopyGCIReq::CopyReason m_copyReason;
2278 
2279     Uint32 m_expectedNextWord;
2280   } c_copyGCISlave;
2281 
2282   /*------------------------------------------------------------------------*/
2283   /*       THIS VARIABLE IS USED TO KEEP TRACK OF THE STATE OF LOCAL        */
2284   /*       CHECKPOINTS.                                                     */
2285   /*------------------------------------------------------------------------*/
2286 public:
  // Phases of the local checkpoint protocol; values marked "Only master"
  // occur only on the node coordinating the LCP.
  enum LcpStatus {
    LCP_STATUS_IDLE        = 0,
    LCP_TCGET              = 1,  // Only master
    LCP_STATUS_ACTIVE      = 2,
    LCP_WAIT_MUTEX         = 3,  // Only master
    LCP_CALCULATE_KEEP_GCI = 4,  // Only master
    LCP_COPY_GCI           = 5,
    LCP_INIT_TABLES        = 6,
    LCP_TC_CLOPSIZE        = 7,  // Only master
    LCP_START_LCP_ROUND    = 8,
    LCP_TAB_COMPLETED      = 9,
    LCP_TAB_SAVED          = 10
  };
2300 private:
2301 
2302   struct LcpState {
LcpStateDbdih::LcpState2303     LcpState() {}
2304     LcpStatus lcpStatus;
2305     Uint32 lcpStatusUpdatedPlace;
2306 
2307     struct Save {
2308       LcpStatus m_status;
2309       Uint32 m_place;
2310     } m_saveState[10];
2311 
setLcpStatusDbdih::LcpState2312     void setLcpStatus(LcpStatus status, Uint32 line){
2313       for (Uint32 i = 9; i > 0; i--)
2314         m_saveState[i] = m_saveState[i-1];
2315       m_saveState[0].m_status = lcpStatus;
2316       m_saveState[0].m_place = lcpStatusUpdatedPlace;
2317 
2318       lcpStatus = status;
2319       lcpStatusUpdatedPlace = line;
2320     }
2321 
2322     /**
2323      * State of stalling LCPs for node restarts
2324      */
2325     Uint32 lcpStallStart;  /* Has started stalling lcp start */
2326     NDB_TICKS lastLogTime; /* Last time we logged state of stall */
2327     NDB_TICKS m_start_lcp_check_time; /* Time of stalling started */
2328     Uint32 stall_node_waiting_for; /* The node we've logged about waiting for */
2329 
2330     Uint32 lcpStart;
2331     Uint32 lcpStopGcp;
2332     Uint32 keepGci;      /* USED TO CALCULATE THE GCI TO KEEP AFTER A LCP  */
2333     Uint32 oldestRestorableGci;
2334 
2335     bool lcpManualStallStart; /* User requested stall of start (testing only) */
2336 
2337     NDB_TICKS m_start_time; // When last LCP was started
2338     Uint64    m_lcp_time;   // How long last LCP took
2339     Uint32    m_lcp_trylock_timeout;
2340 
2341     struct CurrentFragment {
2342       Uint32 tableId;
2343       Uint32 fragmentId;
2344     } currentFragment;
2345 
2346     Uint32 noOfLcpFragRepOutstanding;
2347 
2348     /*------------------------------------------------------------------------*/
2349     /*       USED TO ENSURE THAT LCP'S ARE EXECUTED WITH CERTAIN TIMEINTERVALS*/
2350     /*       EVEN WHEN SYSTEM IS NOT DOING ANYTHING.                          */
2351     /*------------------------------------------------------------------------*/
2352     Uint32 ctimer;
2353     Uint32 ctcCounter;
2354     Uint32 clcpDelay;            /* MAX. 2^(CLCP_DELAY - 2) SEC BETWEEN LCP'S */
2355 
2356     /*------------------------------------------------------------------------*/
2357     /*       THIS STATE IS USED TO TELL IF THE FIRST LCP AFTER START/RESTART  */
2358     /*       HAS BEEN RUN.  AFTER A NODE RESTART THE NODE DOES NOT ENTER      */
2359     /*       STARTED STATE BEFORE THIS IS DONE.                               */
2360     /*------------------------------------------------------------------------*/
2361     bool immediateLcpStart;
2362     bool m_LCP_COMPLETE_REP_From_Master_Received;
2363     SignalCounter m_LCP_COMPLETE_REP_Counter_DIH;
2364     SignalCounter m_LCP_COMPLETE_REP_Counter_LQH;
2365     SignalCounter m_LAST_LCP_FRAG_ORD;
2366     NdbNodeBitmask m_participatingLQH;
2367     NdbNodeBitmask m_participatingDIH;
2368     NdbNodeBitmask m_allReplicasQueuedLQH;
2369 
2370     Uint32 m_masterLcpDihRef;
2371     bool   m_MASTER_LCPREQ_Received;
2372     Uint32 m_MASTER_LCPREQ_FailedNodeId;
2373 
2374     Uint32 m_lastLCP_COMPLETE_REP_id;
2375     Uint32 m_lastLCP_COMPLETE_REP_ref;
2376 
2377     // Whether the 'lcp' is already completed under the
2378     // coordination of the failed master
already_completed_lcpDbdih::LcpState2379     bool already_completed_lcp(Uint32 lcp, Uint32 current_master) const
2380     {
2381       const Uint32 last_completed_master_node =
2382         refToNode(m_lastLCP_COMPLETE_REP_ref);
2383       if (m_lastLCP_COMPLETE_REP_id == lcp &&
2384           last_completed_master_node != current_master &&
2385           last_completed_master_node == m_MASTER_LCPREQ_FailedNodeId)
2386       {
2387         return true;
2388       }
2389       return false;
2390     }
2391 
2392   } c_lcpState;
2393 
2394   /*------------------------------------------------------------------------*/
2395   /*       THIS VARIABLE KEEPS TRACK OF HOW MANY TABLES ARE ACTIVATED WHEN  */
2396   /*       STARTING A LOCAL CHECKPOINT WE SHOULD AVOID STARTING A CHECKPOINT*/
2397   /*       WHEN NO TABLES ARE ACTIVATED.                                    */
2398   /*------------------------------------------------------------------------*/
2399   Uint32 cnoOfActiveTables;
2400 
2401   BlockReference cdictblockref;          /* DICTIONARY BLOCK REFERENCE */
2402   Uint32 cfailurenr;              /* EVERY TIME WHEN A NODE FAILURE IS REPORTED
2403                                     THIS NUMBER IS INCREMENTED. AT THE START OF
2404                                     THE SYSTEM THIS NUMBER MUST BE INITIATED TO
2405                                     ZERO */
2406   Uint32 cMinTcFailNo;            /* Minimum TC handled failNo allowed to close GCP */
2407 
2408   BlockReference clocallqhblockref;
2409   BlockReference clocaltcblockref;
2410   BlockReference cmasterdihref;
2411   Uint16 cownNodeId;
2412   BlockReference cndbStartReqBlockref;
2413   BlockReference cntrlblockref;
2414   Uint32 con_lineNodes;
2415   Uint32 creceivedfrag;
2416   Uint32 cremainingfrags;
2417   Uint32 cstarttype;
2418   Uint32 csystemnodes;
2419   Uint32 c_newest_restorable_gci;
2420   Uint32 c_set_initial_start_flag;
2421   NDB_TICKS c_current_time; // Updated approx. every 10ms
2422 
2423   /* Limit the number of concurrent table definition writes during LCP
2424    * This avoids exhausting the DIH page pool
2425    */
2426   CountingSemaphore c_lcpTabDefWritesControl;
2427 
2428 public:
2429   enum LcpMasterTakeOverState {
2430     LMTOS_IDLE = 0,
2431     LMTOS_WAIT_LCP_FRAG_REP = 2,// Currently waiting for outst. LCP_FRAG_REP
2432     LMTOS_INITIAL = 3,
2433     LMTOS_ALL_IDLE = 4,
2434     LMTOS_ALL_ACTIVE = 5,
2435     LMTOS_LCP_CONCLUDING = 6,
2436     LMTOS_COPY_ONGOING = 7
2437   };
2438 private:
2439   class MasterTakeOverState {
2440   public:
MasterTakeOverState()2441     MasterTakeOverState() {}
set(LcpMasterTakeOverState s,Uint32 line)2442     void set(LcpMasterTakeOverState s, Uint32 line) {
2443       state = s; updatePlace = line;
2444     }
2445 
2446     LcpMasterTakeOverState state;
2447     Uint32 updatePlace;
2448 
2449     Uint32 minTableId;
2450     Uint32 minFragId;
2451     Uint32 failedNodeId;
2452   } c_lcpMasterTakeOverState;
2453 
2454   Uint16 cmasterNodeId;
2455 
2456   struct NodeStartMasterRecord {
NodeStartMasterRecordDbdih::NodeStartMasterRecord2457     NodeStartMasterRecord() {}
2458     Uint32 startNode;
2459     Uint32 wait;
2460     Uint32 failNr;
2461     bool activeState;
2462     Uint32 blockGcp; // 0, 1=ordered, 2=effective
2463     Uint32 startInfoErrorCode;
2464     Uint32 m_outstandingGsn;
2465     MutexHandle2<DIH_FRAGMENT_INFO> m_fragmentInfoMutex;
2466   };
2467   NodeStartMasterRecord c_nodeStartMaster;
2468 
2469   struct NodeStartSlaveRecord {
NodeStartSlaveRecordDbdih::NodeStartSlaveRecord2470     NodeStartSlaveRecord() { nodeId = 0;}
2471 
2472     Uint32 nodeId;
2473   };
2474   NodeStartSlaveRecord c_nodeStartSlave;
2475 
2476   Uint32 cfirstAliveNode;
2477   Uint32 cfirstDeadNode;
2478   Uint32 cstartPhase;
2479   Uint32 cnoReplicas;
2480 
2481   bool cwaitLcpSr;
2482 
2483   /**
2484    * After a node failure we want to increase the disk checkpoint speed until
2485    * we have completed the current ongoing node failure. We also increase the
2486    * checkpoint speed when we know that a node restart is ongoing.
2487    */
2488   bool c_increase_lcp_speed_after_nf;
2489   /**
2490    * Available nodegroups (ids) (length == cnoOfNodeGroups)
   *   used to support nodegroups 2,4,6 (not just consecutive nodegroup ids)
2492    */
2493   Uint32 c_node_groups[MAX_NDB_NODE_GROUPS];
2494   Uint32 cnoOfNodeGroups;
2495   Uint32 crestartGci;      /* VALUE OF GCI WHEN SYSTEM RESTARTED OR STARTED */
2496 
2497   /**
2498    * Counter variables keeping track of the number of outstanding signals
2499    * for particular signals in various protocols.
2500    */
2501   SignalCounter c_COPY_GCIREQ_Counter;
2502   SignalCounter c_COPY_TABREQ_Counter;
2503   SignalCounter c_UPDATE_FRAG_STATEREQ_Counter;
2504   SignalCounter c_DIH_SWITCH_REPLICA_REQ_Counter;
2505   SignalCounter c_GCP_COMMIT_Counter;
2506   SignalCounter c_GCP_PREPARE_Counter;
2507   SignalCounter c_GCP_SAVEREQ_Counter;
2508   SignalCounter c_SUB_GCP_COMPLETE_REP_Counter;
2509   SignalCounter c_INCL_NODEREQ_Counter;
2510   SignalCounter c_MASTER_GCPREQ_Counter;
2511   SignalCounter c_MASTER_LCPREQ_Counter;
2512   SignalCounter c_START_INFOREQ_Counter;
2513   SignalCounter c_START_RECREQ_Counter;
2514   SignalCounter c_STOP_ME_REQ_Counter;
2515   SignalCounter c_TC_CLOPSIZEREQ_Counter;
2516   SignalCounter c_TCGETOPSIZEREQ_Counter;
2517   SignalCounter c_START_LCP_REQ_Counter;
2518 
2519   bool   c_blockCommit;
2520   Uint32 c_blockCommitNo;
2521 
getBlockCommit() const2522   bool getBlockCommit() const {
2523     return c_blockCommit || cgckptflag;
2524   }
2525 
2526   /**
2527    * SwitchReplicaRecord - Should only be used by master
2528    */
2529   struct SwitchReplicaRecord {
SwitchReplicaRecordDbdih::SwitchReplicaRecord2530     SwitchReplicaRecord() {}
clearDbdih::SwitchReplicaRecord2531     void clear(){}
2532 
2533     Uint32 nodeId;
2534     Uint32 tableId;
2535     Uint32 fragNo;
2536   };
2537   SwitchReplicaRecord c_switchReplicas;
2538 
2539   struct StopPermProxyRecord {
StopPermProxyRecordDbdih::StopPermProxyRecord2540     StopPermProxyRecord() { clientRef = 0; }
2541 
2542     Uint32 clientData;
2543     BlockReference clientRef;
2544     BlockReference masterRef;
2545   };
2546 
2547   struct StopPermMasterRecord {
StopPermMasterRecordDbdih::StopPermMasterRecord2548     StopPermMasterRecord() { clientRef = 0;}
2549 
2550     Uint32 returnValue;
2551 
2552     Uint32 clientData;
2553     BlockReference clientRef;
2554   };
2555 
2556   StopPermProxyRecord c_stopPermProxy;
2557   StopPermMasterRecord c_stopPermMaster;
2558 
2559   void checkStopPermProxy(Signal*, NodeId failedNodeId);
2560   void checkStopPermMaster(Signal*, NodeRecordPtr failedNodePtr);
2561 
2562   void switchReplica(Signal*,
2563 		     Uint32 nodeId,
2564 		     Uint32 tableId,
2565 		     Uint32 fragNo);
2566 
2567   void switchReplicaReply(Signal*, NodeId nodeId);
2568 
2569   /**
2570    * Wait GCP (proxy)
2571    */
2572   struct WaitGCPProxyRecord {
WaitGCPProxyRecordDbdih::WaitGCPProxyRecord2573     WaitGCPProxyRecord() { clientRef = 0;}
2574 
2575     Uint32 clientData;
2576     BlockReference clientRef;
2577     BlockReference masterRef;
2578 
2579     union { Uint32 nextPool; Uint32 nextList; };
2580     Uint32 prevList;
2581   };
2582   typedef Ptr<WaitGCPProxyRecord> WaitGCPProxyPtr;
2583   typedef ArrayPool<WaitGCPProxyRecord> WaitGCPProxyRecord_pool;
2584   typedef DLList<WaitGCPProxyRecord_pool> WaitGCPProxyRecord_list;
2585   /**
2586    * Wait GCP (master)
2587    */
2588   struct WaitGCPMasterRecord {
WaitGCPMasterRecordDbdih::WaitGCPMasterRecord2589     WaitGCPMasterRecord() { clientRef = 0;}
2590     Uint32 clientData;
2591     BlockReference clientRef;
2592     /**
2593      * GCI which must be completed before CONF sent
2594      * For WaitEpoch, it is not used, the next
2595      * completing epoch sends a CONF.
2596      */
2597     Uint32 waitGCI;
2598 
2599     /**
2600      * Special value indicating a request for shutdown sync
2601      */
2602     static const Uint32 ShutdownSyncGci = 0xffffffff;
2603 
2604     union { Uint32 nextPool; Uint32 nextList; };
2605     Uint32 prevList;
2606   };
2607   typedef Ptr<WaitGCPMasterRecord> WaitGCPMasterPtr;
2608   typedef ArrayPool<WaitGCPMasterRecord> WaitGCPMasterRecord_pool;
2609 
2610   /**
2611    * Pool/list of WaitGCPProxyRecord record
2612    */
2613   WaitGCPProxyRecord_pool waitGCPProxyPool;
2614   WaitGCPProxyRecord_list c_waitGCPProxyList;
2615 
2616   /**
2617    * Pool/list of WaitGCPMasterRecord record
2618    */
2619   WaitGCPMasterRecord_pool waitGCPMasterPool;
2620   typedef DLList<WaitGCPMasterRecord_pool> WaitGCPList;
2621   WaitGCPList c_waitGCPMasterList;
2622   WaitGCPList c_waitEpochMasterList;
2623 
2624   void checkWaitGCPProxy(Signal*, NodeId failedNodeId);
2625   void checkWaitGCPMaster(Signal*, NodeId failedNodeId);
2626   void checkShutdownSync();
2627   void emptyWaitGCPMasterQueue(Signal*, Uint64, WaitGCPList&);
2628 
2629   void getNodeBitmap(NdbNodeBitmask& map,
2630                      Uint32 listHead,
2631                      int (*versionFunction) (Uint32));
2632 
2633   /**
2634    * Stop me
2635    */
2636   struct StopMeRecord {
StopMeRecordDbdih::StopMeRecord2637     StopMeRecord() { clientRef = 0;}
2638 
2639     BlockReference clientRef;
2640     Uint32 clientData;
2641   };
2642   StopMeRecord c_stopMe;
2643 
2644   void checkStopMe(Signal *, NodeRecordPtr failedNodePtr);
2645 
2646 #define DIH_CDATA_SIZE _SYSFILE_FILE_SIZE
2647   /**
   * This variable must be at least the size of Sysfile::SYSFILE_SIZE32_v2
2649    */
2650   Uint32 cdata_size_in_words;
2651   Uint32 cdata[DIH_CDATA_SIZE];       /* TEMPORARY ARRAY VARIABLE */
2652 
2653   /**
2654    * Sys file data
2655    */
2656   Uint32 sysfileData[DIH_CDATA_SIZE];
2657   Uint32 sysfileDataToFile[DIH_CDATA_SIZE];
2658 
2659   /**
2660    * When a node comes up without filesystem
2661    *   we have to clear all LCP for that node
2662    */
2663   void handle_send_continueb_invalidate_node_lcp(Signal *signal);
2664   void invalidateNodeLCP(Signal *, Uint32 nodeId, Uint32 tableId);
2665   void invalidateNodeLCP(Signal *, Uint32 nodeId, TabRecordPtr);
2666 
2667   /**
2668    * Reply from nodeId
2669    */
2670   void startInfoReply(Signal *, Uint32 nodeId);
2671 
2672   void dump_replica_info();
2673   void dump_replica_info(const Fragmentstore*);
2674 
2675   // DIH specifics for execNODE_START_REP (sendDictUnlockOrd)
2676   void execNODE_START_REP(Signal* signal);
2677 
2678   /*
2679    * Lock master DICT.  Only current use is by starting node
2680    * during NR.  A pool of slave records is convenient anyway.
2681    */
2682   struct DictLockSlaveRecord {
2683     Uint32 lockPtr;
2684     Uint32 lockType;
2685     bool locked;
2686     Callback callback;
2687     Uint32 nextPool;
2688   };
2689 
2690   typedef Ptr<DictLockSlaveRecord> DictLockSlavePtr;
2691   typedef ArrayPool<DictLockSlaveRecord> DictLockSlaveRecord_pool;
2692   DictLockSlaveRecord_pool c_dictLockSlavePool;
2693 
2694   // slave
2695   void sendDictLockReq(Signal* signal, Uint32 lockType, Callback c);
2696   void recvDictLockConf(Signal* signal);
2697   void sendDictUnlockOrd(Signal* signal, Uint32 lockSlavePtrI);
2698 
2699   // NR
2700   Uint32 c_dictLockSlavePtrI_nodeRestart; // userPtr for NR
2701   void recvDictLockConf_nodeRestart(Signal* signal, Uint32 data, Uint32 ret);
2702 
2703   Uint32 c_error_7181_ref;
2704 
2705 #ifdef ERROR_INSERT
2706   void sendToRandomNodes(const char*, Signal*, SignalCounter*,
2707                          SendFunction,
2708                          Uint32 extra = RNIL,
2709                          Uint32 block = 0, Uint32 gsn = 0, Uint32 len = 0,
2710                          JobBufferLevel = JBB);
2711 #endif
2712 
2713   bool check_enable_micro_gcp(Signal* signal, bool broadcast);
2714 
2715   bool c_sr_wait_to;
2716   NdbNodeBitmask m_sr_nodes;
2717   NdbNodeBitmask m_to_nodes;
2718 
2719   void startme_copygci_conf(Signal*);
2720 
2721   /**
2722    * Local LCP state
2723    *   This struct is more or less a copy of lcp-state
2724    *   Reason for duplicating it is that
2725    *   - not to mess with current code
2726    *   - this one is "distributed", i.e maintained by *all* nodes,
2727    *     not like c_lcpState which mixed master/slave state in a "unnatural"
2728    *     way
2729    */
2730   struct LocalLCPState
2731   {
2732     enum State {
2733       LS_INITIAL = 0,
2734       LS_RUNNING = 1,
2735       LS_COMPLETE = 2,
2736       LS_RUNNING_MTO_TAB_SAVED = 3
2737     } m_state;
2738 
2739     StartLcpReq m_start_lcp_req;
2740     Uint32 m_keep_gci; // Min GCI is needed to restore LCP
2741     Uint32 m_stop_gci; // This GCI needs to be complete before LCP is restorable
2742 
LocalLCPStateDbdih::LocalLCPState2743     LocalLCPState() { reset();}
2744 
2745     void reset();
2746     void init(const StartLcpReq*);
2747     void init_master_take_over_idle_to_tab_saved();
2748     void lcp_frag_rep(const LcpFragRep*);
2749     void lcp_complete_rep(Uint32 gci);
2750 
2751     /**
2752      * @param gci - current GCI being made restorable (COPY_GCI)
2753      */
2754     bool check_cut_log_tail(Uint32 gci) const;
2755   } m_local_lcp_state;
2756 
2757   // MT LQH
2758   Uint32 c_fragments_per_node_;
2759   Uint32 getFragmentsPerNode();
2760   Uint32 getFragmentCount(Uint32 partitionBalance,
2761                           Uint32 numOfNodeGroups,
2762                           Uint32 numOfReplicas,
2763                           Uint32 numOfLDMs) const;
2764   /**
2765    * dihGetInstanceKey
2766    *
2767    * This method maps a fragment to a block instance key
2768    * This is the LDM instance which manages the fragment
2769    * on this node.
2770    * The range of an instance key is 1 to
2771    * NDBMT_MAX_WORKER_INSTANCES inclusive.
2772    * 0 is the proxy block instance.
2773    */
dihGetInstanceKey(FragmentstorePtr tFragPtr)2774   Uint32 dihGetInstanceKey(FragmentstorePtr tFragPtr) {
2775     ndbrequire(!tFragPtr.isNull());
2776     Uint32 log_part_id = tFragPtr.p->m_log_part_id;
2777     ndbrequire(log_part_id < NDBMT_MAX_WORKER_INSTANCES);
2778     return 1 + log_part_id;
2779   }
2780   Uint32 dihGetInstanceKey(Uint32 tabId, Uint32 fragId);
2781   Uint32 dihGetInstanceKeyCanFail(Uint32 tabId, Uint32 fragId);
2782 
2783   void log_setNoSend();
2784   /**
2785    * Get minimum version of nodes in alive-list
2786    */
2787   Uint32 getMinVersion() const;
2788 
2789   bool c_2pass_inr;
2790 
2791   /* Max LCP parallelism is node (version) specific */
2792   Uint8 getMaxStartedFragCheckpointsForNode(Uint32 nodeId) const;
2793 
2794   void isolateNodes(Signal* signal,
2795                     Uint32 delayMillis,
2796                     const NdbNodeBitmask& victims);
2797 
2798   NodeId c_handled_master_take_over_copy_gci;
2799 
2800   bool handle_master_take_over_copy_gci(Signal *signal,
2801                                         NodeId newMasterNodeId);
2802 
2803   RedoStateRep::RedoAlertState m_node_redo_alert_state[MAX_NDB_NODES];
2804   RedoStateRep::RedoAlertState m_global_redo_alert_state;
2805   RedoStateRep::RedoAlertState get_global_redo_alert_state();
2806   void sendREDO_STATE_REP_to_all(Signal*, Uint32 block, bool send_to_all);
2807   bool m_master_lcp_req_lcp_already_completed;
2808 
2809   void complete_restart_nr(Signal*);
2810 
2811   /* The highest data node id in the cluster. */
2812   Uint32 m_max_node_id;
2813   bool m_set_up_multi_trp_in_node_restart;
2814 public:
is_master()2815   bool is_master() { return isMaster(); }
2816 
2817   NdbNodeBitmask c_shutdownReqNodes;
2818 };
2819 
2820 #if (DIH_CDATA_SIZE < _SYSFILE_SIZE32_v2)
#error "cdata is too small compared to Sysfile size"
2822 #endif
2823 
2824 
2825 #undef JAM_FILE_ID
2826 
2827 #endif
2828 
2829