1 /* 2 Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved. 3 4 This program is free software; you can redistribute it and/or modify 5 it under the terms of the GNU General Public License, version 2.0, 6 as published by the Free Software Foundation. 7 8 This program is also distributed with certain software (including 9 but not limited to OpenSSL) that is licensed under separate terms, 10 as designated in a particular file or component or in included license 11 documentation. The authors of MySQL hereby grant you an additional 12 permission to link the program and your derivative works with the 13 separately licensed software that they have included with MySQL. 14 15 This program is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU General Public License, version 2.0, for more details. 19 20 You should have received a copy of the GNU General Public License 21 along with this program; if not, write to the Free Software 22 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 23 */ 24 25 #ifndef DBDIH_H 26 #define DBDIH_H 27 28 #include <ndb_limits.h> 29 #include <pc.hpp> 30 #include <SimulatedBlock.hpp> 31 #include "Sysfile.hpp" 32 #include <SignalCounter.hpp> 33 34 #include <signaldata/RedoStateRep.hpp> 35 #include <signaldata/MasterLCP.hpp> 36 #include <signaldata/CopyGCIReq.hpp> 37 #include <blocks/mutexes.hpp> 38 #include <signaldata/LCP.hpp> 39 #include <NdbSeqLock.hpp> 40 #include <CountingSemaphore.hpp> 41 #include <Mutex.hpp> 42 43 #define JAM_FILE_ID 356 44 45 46 #ifdef DBDIH_C 47 48 /*###################*/ 49 /* FILE SYSTEM FLAGS */ 50 /*###################*/ 51 #define ZLIST_OF_PAIRS 0 52 #define ZLIST_OF_PAIRS_SYNCH 16 53 #define ZOPEN_READ_WRITE 2 54 #define ZCREATE_READ_WRITE 0x302 55 #define ZCLOSE_NO_DELETE 0 56 #define ZCLOSE_DELETE 1 57 58 /*###############*/ 59 /* NODE 
STATES */ 60 /*###############*/ 61 #define ZIDLE 0 62 #define ZACTIVE 1 63 64 /*#########*/ 65 /* GENERAL */ 66 /*#########*/ 67 #define ZVAR_NO_WORD 0 68 #define ZVAR_NO_CRESTART_INFO 1 69 #define ZVAR_NO_CRESTART_INFO_TO_FILE 2 70 #define ZVALID 1 71 #define ZINVALID 2 72 73 /*###############*/ 74 /* ERROR CODES */ 75 /*###############*/ 76 // ------------------------------------------ 77 // Error Codes for Transactions (none so far) 78 // ------------------------------------------ 79 #define ZUNDEFINED_FRAGMENT_ERROR 311 80 81 // -------------------------------------- 82 // Error Codes for Add Table 83 // -------------------------------------- 84 #define ZREPLERROR1 306 85 #define ZREPLERROR2 307 86 87 // -------------------------------------- 88 // Other DIH error codes 89 // -------------------------------------- 90 #define ZLONG_MESSAGE_ERROR 312 91 92 // -------------------------------------- 93 // Crash Codes 94 // -------------------------------------- 95 #define ZCOULD_NOT_OCCUR_ERROR 300 96 #define ZNOT_MASTER_ERROR 301 97 #define ZWRONG_FAILURE_NUMBER_ERROR 302 98 #define ZWRONG_START_NODE_ERROR 303 99 #define ZNO_REPLICA_FOUND_ERROR 304 100 101 /*#########*/ 102 /* PHASES */ 103 /*#########*/ 104 #define ZNDB_SPH1 1 105 #define ZNDB_SPH2 2 106 #define ZNDB_SPH3 3 107 #define ZNDB_SPH4 4 108 #define ZNDB_SPH5 5 109 #define ZNDB_SPH6 6 110 #define ZNDB_SPH7 7 111 #define ZNDB_SPH8 8 112 /*#########*/ 113 /* SIZES */ 114 /*#########*/ 115 /* 116 * Pages are used for flushing table definitions during LCP, 117 * and for other operations such as metadata changes etc 118 * 119 */ 120 #define MAX_CONCURRENT_LCP_TAB_DEF_FLUSHES 4 121 #define MAX_CONCURRENT_DIH_TAB_DEF_OPS (MAX_CONCURRENT_LCP_TAB_DEF_FLUSHES + 2) 122 #define ZPAGEREC (MAX_CONCURRENT_DIH_TAB_DEF_OPS * PACK_TABLE_PAGES) 123 #define ZCREATE_REPLICA_FILE_SIZE 4 124 #define ZPROXY_MASTER_FILE_SIZE (MAX_NDB_NODES + 1) 125 126 /*MaxConcurrent proxied WaitGcpReq. 
Set to 10 as safety margin on 1.*/ 127 #define ZPROXY_FILE_SIZE 10 128 #endif 129 130 /* 131 * Pack table into pages. 132 * See use of writePageWord() in 133 * packTableIntoPagesLab() and helper 134 * functions to determine the constants 135 * below. 136 */ 137 #define MAX_CRASHED_REPLICAS 8 138 #define PACK_REPLICAS_WORDS (4 + 4 * MAX_LCP_STORED + 2 * MAX_CRASHED_REPLICAS) 139 #define PACK_FRAGMENT_WORDS (6 + 2 * MAX_REPLICAS * PACK_REPLICAS_WORDS) 140 #define PACK_TABLE_WORDS (10 + MAX_NDB_PARTITIONS * PACK_FRAGMENT_WORDS) 141 #define PACK_TABLE_PAGE_WORDS (2048 - 32) 142 #define PACK_TABLE_PAGES ((PACK_TABLE_WORDS + PACK_TABLE_PAGE_WORDS - 1) / PACK_TABLE_PAGE_WORDS) 143 144 #define MAX_QUEUED_FRAG_CHECKPOINTS_PER_NODE 32 145 #define MAX_STARTED_FRAG_CHECKPOINTS_PER_NODE 32 146 147 class Dbdih: public SimulatedBlock { 148 #ifdef ERROR_INSERT 149 typedef void (Dbdih::* SendFunction)(Signal*, Uint32, Uint32); 150 #endif 151 public: 152 153 // Records 154 155 /*############## CONNECT_RECORD ##############*/ 156 /** 157 * THE CONNECT RECORD IS CREATED WHEN A TRANSACTION HAS TO START. IT KEEPS 158 * ALL INTERMEDIATE INFORMATION NECESSARY FOR THE TRANSACTION FROM THE 159 * DISTRIBUTED MANAGER. THE RECORD KEEPS INFORMATION ABOUT THE 160 * OPERATIONS THAT HAVE TO BE CARRIED OUT BY THE TRANSACTION AND 161 * ALSO THE TRAIL OF NODES FOR EACH OPERATION IN THE THE 162 * TRANSACTION. 
163 */ 164 struct ConnectRecord { 165 enum ConnectState { 166 INUSE = 0, 167 FREE = 1, 168 STARTED = 2, 169 ALTER_TABLE = 3, 170 ALTER_TABLE_ABORT = 4, // "local" abort 171 ALTER_TABLE_REVERT = 5, 172 GET_TABINFO = 6 173 }; 174 union { 175 Uint32 nodes[MAX_REPLICAS]; 176 struct { 177 Uint32 m_changeMask; 178 Uint32 m_totalfragments; 179 Uint32 m_partitionCount; 180 Uint32 m_org_totalfragments; 181 Uint32 m_new_map_ptr_i; 182 } m_alter; 183 struct { 184 Uint32 m_map_ptr_i; 185 } m_create; 186 struct { 187 Uint32 m_requestInfo; 188 } m_get_tabinfo; 189 }; 190 ConnectState connectState; 191 Uint32 nextPool; 192 Uint32 table; 193 Uint32 userpointer; 194 BlockReference userblockref; 195 Callback m_callback; 196 }; 197 typedef Ptr<ConnectRecord> ConnectRecordPtr; 198 199 /** 200 * THESE RECORDS ARE USED WHEN CREATING REPLICAS DURING SYSTEM 201 * RESTART. I NEED A COMPLEX DATA STRUCTURE DESCRIBING THE REPLICAS 202 * I WILL TRY TO CREATE FOR EACH FRAGMENT. 203 * 204 * I STORE A REFERENCE TO THE FOUR POSSIBLE CREATE REPLICA RECORDS 205 * IN A COMMON STORED VARIABLE. I ALLOW A MAXIMUM OF 4 REPLICAS TO 206 * BE RESTARTED PER FRAGMENT. 207 */ 208 struct CreateReplicaRecord { 209 Uint32 logStartGci[MAX_LOG_EXEC]; 210 Uint32 logStopGci[MAX_LOG_EXEC]; 211 Uint16 logNodeId[MAX_LOG_EXEC]; 212 Uint32 createLcpId; 213 214 Uint32 replicaRec; 215 Uint16 dataNodeId; 216 Uint16 lcpNo; 217 Uint16 noLogNodes; 218 }; 219 typedef Ptr<CreateReplicaRecord> CreateReplicaRecordPtr; 220 221 /** 222 * THIS RECORD CONTAINS A FILE DESCRIPTION. THERE ARE TWO 223 * FILES PER TABLE TO RAISE SECURITY LEVEL AGAINST DISK CRASHES. 
224 */ 225 struct FileRecord { 226 enum FileStatus { 227 CLOSED = 0, 228 CRASHED = 1, 229 OPEN = 2 230 }; 231 enum FileType { 232 TABLE_FILE = 0, 233 GCP_FILE = 1 234 }; 235 enum ReqStatus { 236 IDLE = 0, 237 CREATING_GCP = 1, 238 OPENING_GCP = 2, 239 OPENING_COPY_GCI = 3, 240 WRITING_COPY_GCI = 4, 241 CREATING_COPY_GCI = 5, 242 OPENING_TABLE = 6, 243 READING_GCP = 7, 244 READING_TABLE = 8, 245 WRITE_INIT_GCP = 9, 246 TABLE_CREATE = 10, 247 TABLE_WRITE = 11, 248 TABLE_CLOSE = 12, 249 CLOSING_GCP = 13, 250 CLOSING_TABLE_CRASH = 14, 251 CLOSING_TABLE_SR = 15, 252 CLOSING_GCP_CRASH = 16, 253 TABLE_OPEN_FOR_DELETE = 17, 254 TABLE_CLOSE_DELETE = 18 255 }; 256 Uint32 fileName[4]; 257 Uint32 fileRef; 258 FileStatus fileStatus; 259 FileType fileType; 260 Uint32 nextFile; 261 ReqStatus reqStatus; 262 Uint32 tabRef; 263 }; 264 typedef Ptr<FileRecord> FileRecordPtr; 265 266 /** 267 * THIS RECORD KEEPS THE STORAGE AND DECISIONS INFORMATION OF A FRAGMENT 268 * AND ITS REPLICAS. IF FRAGMENT HAS MORE THAN ONE BACK UP 269 * REPLICA THEN A LIST OF MORE NODES IS ATTACHED TO THIS RECORD. 270 * EACH RECORD IN MORE LIST HAS INFORMATION ABOUT ONE BACKUP. THIS RECORD 271 * ALSO HAVE THE STATUS OF THE FRAGMENT. 272 */ 273 struct Fragmentstore { 274 Uint16 activeNodes[MAX_REPLICAS]; 275 Uint32 preferredPrimary; 276 277 Uint32 oldStoredReplicas; /* "DEAD" STORED REPLICAS */ 278 Uint32 storedReplicas; /* "ALIVE" STORED REPLICAS */ 279 Uint32 nextFragmentChunk; 280 281 Uint32 m_log_part_id; 282 283 /** 284 * Used by Fully replicated tables to find the main fragment and to 285 * find local fragments. 
286 */ 287 Uint32 fragId; 288 Uint32 partition_id; 289 Uint32 nextCopyFragment; 290 291 Uint8 distributionKey; 292 Uint8 fragReplicas; 293 Uint8 noOldStoredReplicas; /* NUMBER OF "DEAD" STORED REPLICAS */ 294 Uint8 noStoredReplicas; /* NUMBER OF "ALIVE" STORED REPLICAS*/ 295 Uint8 noLcpReplicas; ///< No of replicas remaining to be LCP:ed 296 }; 297 typedef Ptr<Fragmentstore> FragmentstorePtr; 298 299 /*########### PAGE RECORD ############*/ 300 /** 301 * THIS RECORD KEEPS INFORMATION ABOUT NODE GROUPS. 302 */ 303 struct NodeGroupRecord { 304 Uint32 nodesInGroup[MAX_REPLICAS + 1]; 305 Uint32 nextReplicaNode; 306 Uint32 nodeCount; 307 Uint32 activeTakeOver; // Which node... 308 Uint32 activeTakeOverCount; 309 Uint32 m_next_log_part; 310 Uint32 nodegroupIndex; 311 Uint32 m_ref_count; 312 }; 313 typedef Ptr<NodeGroupRecord> NodeGroupRecordPtr; 314 /** 315 * THIS RECORD KEEPS INFORMATION ABOUT NODES. 316 * 317 * RECORD ALIGNED TO BE 64 BYTES. 318 */ 319 enum NodefailHandlingStep { 320 NF_REMOVE_NODE_FROM_TABLE = 1, 321 NF_GCP_TAKE_OVER = 2, 322 NF_LCP_TAKE_OVER = 4 323 }; 324 325 /** 326 * useInTransactions is used in DIGETNODES to assert that we give 327 * DBTC a node view which is correct. To ensure we provide a view 328 * which is correct we use an RCU mechanism when executing 329 * DIGETNODES. It's not a crashing problem, but it ensures that 330 * we avoid getting into unnecessary extra wait states at node 331 * failures and also that we avoid unnecessary abortions. 332 * 333 * We update this view any time any node is changing the value of 334 * useInTransactions and DBTC could be actively executing 335 * transactions. 336 */ 337 NdbSeqLock m_node_view_lock; 338 339 struct NodeRecord 340 { NodeRecordDbdih::NodeRecord341 NodeRecord() { } 342 /** 343 * Removed the constructor method and replaced it with the method 344 * initNodeRecord. The problem with the constructor method is that 345 * in debug compiled code it will initialise the entire object to 346 * zero. 
This didn't play well at all with the node recovery status 347 * which is used from the start of the node until it dies, so it 348 * should not be initialised when DIH finds it appropriate to 349 * initialise it. One could also long-term separate the two functions 350 * into two separate objects. 351 */ 352 enum NodeStatus { 353 NOT_IN_CLUSTER = 0, 354 ALIVE = 1, 355 STARTING = 2, 356 DIED_NOW = 3, 357 DYING = 4, 358 DEAD = 5 359 }; 360 361 /** 362 * The NodeRecoveryStatus variable and all the timers connected to this 363 * status is used for two purposes. The first purpose is for a NDBINFO 364 * table that the master node will use to be able to specify the times 365 * a node restart has spent in the various node restart phases. 366 * 367 * This will help both the users and the developers to understand where 368 * the node restart is spending time. 369 * 370 * In addition the timers are also used to estimate how much more time 371 * the node will need before reaching the next wait for local checkpoint 372 * (LCP). Starting LCPs with good timing is crucial to shorten the waits 373 * for LCPs by the starting nodes. We want to wait with starting LCPs 374 * to ensure that as many nodes as possible are handled in between 375 * LCPs as possible. At the same time we cannot block LCP execution for 376 * any extended period since it will jeopardize the future stability of 377 * the cluster. 378 */ 379 enum NodeRecoveryStatus 380 { 381 /* No valid state or node not defined in cluster */ 382 NOT_DEFINED_IN_CLUSTER = 0, 383 384 /* There is state for no information about restarts. */ 385 NODE_NOT_RESTARTED_YET = 1, 386 387 /* Node failure states are used in all nodes. */ 388 NODE_FAILED = 2, 389 NODE_FAILURE_COMPLETED = 3, 390 391 /* The first set of states are only used in master nodes. 
*/ 392 ALLOCATED_NODE_ID = 4, 393 INCLUDED_IN_HB_PROTOCOL = 5, 394 NDBCNTR_START_WAIT = 6, 395 NDBCNTR_STARTED = 7, 396 START_PERMITTED = 8, 397 WAIT_LCP_TO_COPY_DICT = 9, 398 COPY_DICT_TO_STARTING_NODE = 10, 399 INCLUDE_NODE_IN_LCP_AND_GCP = 11, 400 LOCAL_RECOVERY_STARTED = 12, 401 RESTORE_FRAG_COMPLETED = 13, 402 UNDO_DD_COMPLETED = 14, 403 EXECUTE_REDO_LOG_COMPLETED = 15, 404 COPY_FRAGMENTS_STARTED = 16, 405 WAIT_LCP_FOR_RESTART = 17, 406 WAIT_SUMA_HANDOVER = 18, 407 RESTART_COMPLETED = 19, 408 409 /* There is a set of states used in non-master nodes as well. */ 410 NODE_GETTING_PERMIT = 20, 411 NODE_GETTING_INCLUDED = 21, 412 NODE_GETTING_SYNCHED = 22, 413 NODE_IN_LCP_WAIT_STATE = 23, 414 NODE_ACTIVE = 24 415 }; 416 417 /** 418 * We need to ensure that we don't pause the node when the master node 419 * asks for it in case the node is already dead. We check this by 420 * by verifying that the node is in the state NODE_GETTING_PERMIT in 421 * in the non-master nodes. Since we do not yet maintain the 422 * nodeRecoveryStatus in all restart situations we temporarily 423 * put this into a separate variable that we maintain separately. 424 * TODO: We should use nodeRecoveryStatus when we maintain this 425 * state in all types of starts. 
426 */ 427 bool is_pausable; 428 NodeRecoveryStatus nodeRecoveryStatus; 429 NDB_TICKS nodeFailTime; 430 NDB_TICKS nodeFailCompletedTime; 431 NDB_TICKS allocatedNodeIdTime; 432 NDB_TICKS includedInHBProtocolTime; 433 NDB_TICKS ndbcntrStartWaitTime; 434 NDB_TICKS ndbcntrStartedTime; 435 NDB_TICKS startPermittedTime; 436 NDB_TICKS waitLCPToCopyDictTime; 437 NDB_TICKS copyDictToStartingNodeTime; 438 NDB_TICKS includeNodeInLCPAndGCPTime; 439 NDB_TICKS startDatabaseRecoveryTime; 440 NDB_TICKS startUndoDDTime; 441 NDB_TICKS startExecREDOLogTime; 442 NDB_TICKS startBuildIndexTime; 443 NDB_TICKS copyFragmentsStartedTime; 444 NDB_TICKS waitLCPForRestartTime; 445 NDB_TICKS waitSumaHandoverTime; 446 NDB_TICKS restartCompletedTime; 447 448 NDB_TICKS nodeGettingPermitTime; 449 NDB_TICKS nodeGettingIncludedTime; 450 NDB_TICKS nodeGettingSynchedTime; 451 NDB_TICKS nodeInLCPWaitStateTime; 452 NDB_TICKS nodeActiveTime; 453 454 struct FragmentCheckpointInfo { 455 Uint32 tableId; 456 Uint32 fragId; 457 Uint32 replicaPtr; 458 }; 459 460 Sysfile::ActiveStatus activeStatus; 461 462 bool useInTransactions; 463 464 NodeStatus nodeStatus; 465 bool allowNodeStart; 466 bool m_inclDihLcp; 467 Uint8 copyCompleted; // 0 = NO :-), 1 = YES, 2 = yes, first WAITING 468 469 /** 470 * Used by master as part of running LCPs to keep track of fragments 471 * that have started checkpoints and fragments that have been queued 472 * for LCP execution. 
473 */ 474 FragmentCheckpointInfo startedChkpt[MAX_STARTED_FRAG_CHECKPOINTS_PER_NODE]; 475 FragmentCheckpointInfo queuedChkpt[MAX_QUEUED_FRAG_CHECKPOINTS_PER_NODE]; 476 477 Bitmask<1> m_nodefailSteps; 478 Uint32 activeTabptr; 479 Uint32 nextNode; 480 Uint32 nodeGroup; 481 482 SignalCounter m_NF_COMPLETE_REP; 483 484 Uint8 dbtcFailCompleted; 485 Uint8 dblqhFailCompleted; 486 Uint8 dbdihFailCompleted; 487 Uint8 dbdictFailCompleted; 488 Uint8 recNODE_FAILREP; 489 490 Uint8 noOfQueuedChkpt; 491 Uint8 noOfStartedChkpt; 492 493 MasterLCPConf::State lcpStateAtTakeOver; 494 Uint32 m_remove_node_from_table_lcp_id; 495 }; 496 typedef Ptr<NodeRecord> NodeRecordPtr; 497 /**********************************************************************/ 498 /* THIS RECORD KEEPS THE INFORMATION ABOUT A TABLE AND ITS FRAGMENTS */ 499 /**********************************************************************/ 500 struct PageRecord { 501 Uint32 word[2048]; 502 /* 8 KBYTE PAGE*/ 503 Uint32 nextfreepage; 504 }; 505 typedef Ptr<PageRecord> PageRecordPtr; 506 507 /************ REPLICA RECORD *************/ 508 /**********************************************************************/ 509 /* THIS RECORD KEEPS THE INFORMATION ABOUT A REPLICA OF A FRAGMENT */ 510 /**********************************************************************/ 511 struct ReplicaRecord { 512 /* -------------------------------------------------------------------- */ 513 /* THE GLOBAL CHECKPOINT IDENTITY WHEN THIS REPLICA WAS CREATED. */ 514 /* THERE IS ONE INDEX PER REPLICA. A REPLICA INDEX IS CREATED WHEN ANODE*/ 515 /* CRASH OCCURS. */ 516 /* -------------------------------------------------------------------- */ 517 Uint32 createGci[8]; 518 /* -------------------------------------------------------------------- */ 519 /* THE LAST GLOBAL CHECKPOINT IDENTITY WHICH HAS BEEN SAVED ON DISK. */ 520 /* THIS VARIABLE IS ONLY VALID FOR REPLICAS WHICH HAVE "DIED". 
A REPLICA*/ 521 /* "DIES" EITHER WHEN THE NODE CRASHES THAT KEPT THE REPLICA OR BY BEING*/ 522 /* STOPPED IN A CONTROLLED MANNER. */ 523 /* THERE IS ONE INDEX PER REPLICA. A REPLICA INDEX IS CREATED WHEN ANODE*/ 524 /* CRASH OCCURS. */ 525 /* -------------------------------------------------------------------- */ 526 Uint32 replicaLastGci[8]; 527 /* -------------------------------------------------------------------- */ 528 /* THE LOCAL CHECKPOINT IDENTITY OF A LOCAL CHECKPOINT. */ 529 /* -------------------------------------------------------------------- */ 530 Uint32 lcpId[MAX_LCP_STORED]; 531 /* -------------------------------------------------------------------- */ 532 /* THIS VARIABLE KEEPS TRACK OF THE MAXIMUM GLOBAL CHECKPOINT COMPLETED */ 533 /* FOR EACH OF THE LOCAL CHECKPOINTS IN THIS FRAGMENT REPLICA. */ 534 /* -------------------------------------------------------------------- */ 535 Uint32 maxGciCompleted[MAX_LCP_STORED]; 536 /* -------------------------------------------------------------------- */ 537 /* THIS VARIABLE KEEPS TRACK OF THE MINIMUM GLOBAL CHECKPOINT STARTEDFOR*/ 538 /* EACH OF THE LOCAL CHECKPOINTS IN THIS FRAGMENT REPLICA. */ 539 /* -------------------------------------------------------------------- */ 540 Uint32 maxGciStarted[MAX_LCP_STORED]; 541 /* -------------------------------------------------------------------- */ 542 /* THE GLOBAL CHECKPOINT IDENTITY WHEN THE TABLE WAS CREATED. */ 543 /* -------------------------------------------------------------------- */ 544 Uint32 initialGci; 545 546 /* -------------------------------------------------------------------- */ 547 /* THE REFERENCE TO THE NEXT REPLICA. EITHER IT REFERS TO THE NEXT IN */ 548 /* THE FREE LIST OR IT REFERS TO THE NEXT IN A LIST OF REPLICAS ON A */ 549 /* FRAGMENT. 
*/ 550 /* -------------------------------------------------------------------- */ 551 Uint32 nextPool; 552 553 /* -------------------------------------------------------------------- */ 554 /* THE NODE ID WHERE THIS REPLICA IS STORED. */ 555 /* -------------------------------------------------------------------- */ 556 Uint16 procNode; 557 558 /* -------------------------------------------------------------------- */ 559 /* The last local checkpoint id started or queued on this replica. */ 560 /* -------------------------------------------------------------------- */ 561 union { 562 Uint32 lcpIdStarted; // Started or queued 563 Uint32 m_restorable_gci; 564 }; 565 566 /** 567 * Information needed to put the LCP_FRAG_REP into a queue and avoid 568 * sending the information onwards to all the other nodes in the 569 * cluster. We use a doubly linked list to support removal from 570 * queue due to drop table. 571 * 572 * By queueing in the local DIH we can make it appear as if the LCP 573 * is paused from the point of view of all the DIH blocks in the cluster. 574 * 575 * In the DBLQH the LCP is continuing unabated as long as there are 576 * fragments queued to execute LCPs on. The purpose of this pause support 577 * is to be able to copy the meta data without having to wait for the 578 * current LCP to be fully completed. Instead we can copy it while we are 579 * pausing the LCP reporting. This gives a possibility to provide 580 * new node with a snapshot of the metadata from the master node 581 * without having to stop the progress with the LCP execution. 
582 */ 583 Uint32 nextList; 584 Uint32 prevList; 585 Uint32 repMaxGciStarted; 586 Uint32 repMaxGciCompleted; 587 Uint32 fragId; 588 Uint32 tableId; 589 /* lcpNo == nextLcp, checked at queueing */ 590 /* nodeId == procNode */ 591 592 /* -------------------------------------------------------------------- */ 593 /* THIS VARIABLE SPECIFIES WHAT THE STATUS OF THE LOCAL CHECKPOINT IS.IT*/ 594 /* CAN EITHER BE VALID OR INVALID. AT CREATION OF A FRAGMENT REPLICA ALL*/ 595 /* LCP'S ARE INVALID. ALSO IF IF INDEX >= NO_LCP THEN THELOCALCHECKPOINT*/ 596 /* IS ALWAYS INVALID. IF THE LCP BEFORE THE NEXT_LCP HAS LCP_ID THAT */ 597 /* DIFFERS FROM THE LATEST LCP_ID STARTED THEN THE NEXT_LCP IS ALSO */ 598 /* INVALID */ 599 /* -------------------------------------------------------------------- */ 600 Uint8 lcpStatus[MAX_LCP_STORED]; 601 602 /* -------------------------------------------------------------------- */ 603 /* THE NEXT LOCAL CHECKPOINT TO EXECUTE IN THIS FRAGMENT REPLICA. */ 604 /* -------------------------------------------------------------------- */ 605 Uint8 nextLcp; 606 607 /* -------------------------------------------------------------------- */ 608 /* THE NUMBER OF CRASHED REPLICAS IN THIS REPLICAS SO FAR. */ 609 /* -------------------------------------------------------------------- */ 610 Uint8 noCrashedReplicas; 611 612 /** 613 * Is a LCP currently ongoing on fragment 614 */ 615 Uint8 lcpOngoingFlag; 616 }; 617 typedef Ptr<ReplicaRecord> ReplicaRecordPtr; 618 typedef ArrayPool<ReplicaRecord> ReplicaRecord_pool; 619 typedef DLFifoList<ReplicaRecord_pool> ReplicaRecord_fifo; 620 621 ReplicaRecord_pool c_replicaRecordPool; 622 ReplicaRecord_fifo c_queued_lcp_frag_rep; 623 624 /************************************************************************* 625 * TAB_DESCRIPTOR IS A DESCRIPTOR OF THE LOCATION OF THE FRAGMENTS BELONGING 626 * TO THE TABLE.THE INFORMATION ABOUT FRAGMENTS OF A TABLE ARE STORED IN 627 * CHUNKS OF FRAGMENTSTORE RECORDS. 
628 * THIS RECORD ALSO HAS THE NECESSARY INFORMATION TO LOCATE A FRAGMENT AND 629 * TO LOCATE A FRAGMENT AND TO TRANSLATE A KEY OF A TUPLE TO THE FRAGMENT IT 630 * BELONGS 631 */ 632 struct TabRecord 633 { TabRecordDbdih::TabRecord634 TabRecord() { m_flags = 0; } 635 636 /** 637 * State for copying table description into pages 638 */ 639 enum CopyStatus { 640 CS_IDLE = 0, 641 CS_SR_PHASE1_READ_PAGES = 1, 642 CS_SR_PHASE2_READ_TABLE = 2, 643 CS_SR_PHASE3_COPY_TABLE = 3, 644 CS_REMOVE_NODE = 4, 645 CS_LCP_READ_TABLE = 5, 646 CS_COPY_TAB_REQ = 6, 647 CS_COPY_NODE_STATE = 7, 648 CS_ADD_TABLE_MASTER = 8, 649 CS_ADD_TABLE_SLAVE = 9, 650 CS_INVALIDATE_NODE_LCP = 10, 651 CS_ALTER_TABLE = 11, 652 CS_COPY_TO_SAVE = 12 653 ,CS_GET_TABINFO = 13 654 }; 655 /** 656 * State for copying pages to disk 657 */ 658 enum UpdateState { 659 US_IDLE = 0, 660 US_LOCAL_CHECKPOINT = 1, 661 US_LOCAL_CHECKPOINT_QUEUED = 2, 662 US_REMOVE_NODE = 3, 663 US_COPY_TAB_REQ = 4, 664 US_ADD_TABLE_MASTER = 5, 665 US_ADD_TABLE_SLAVE = 6, 666 US_INVALIDATE_NODE_LCP = 7, 667 US_CALLBACK = 8 668 }; 669 enum TabLcpStatus { 670 TLS_ACTIVE = 1, 671 TLS_WRITING_TO_FILE = 2, 672 TLS_COMPLETED = 3 673 }; 674 enum TabStatus { 675 TS_IDLE = 0, 676 TS_ACTIVE = 1, 677 TS_CREATING = 2, 678 TS_DROPPING = 3 679 }; 680 enum Method { 681 LINEAR_HASH = 0, 682 NOTDEFINED = 1, 683 NORMAL_HASH = 2, 684 USER_DEFINED = 3, 685 HASH_MAP = 4 686 }; 687 enum Storage { 688 ST_NOLOGGING = 0, // Table is not logged, but survives SR 689 ST_NORMAL = 1, // Normal table, logged and durable 690 ST_TEMPORARY = 2 // Table is lost after SR, not logged 691 }; 692 enum TableFlags 693 { 694 TF_FULLY_REPLICATED = 1 695 }; 696 697 /** 698 * rw-lock that protects multiple parallel DIGETNODES (readers) from 699 * updates to fragmenation changes (e.g UPDATE_FRAG_STATEREQ)... 
700 * search for DIH_TAB_WRITE_LOCK 701 */ 702 NdbSeqLock m_lock; 703 704 /** 705 * tabStatus, schemaTransId, m_map_ptr_i, totalfragments, noOfBackups 706 * and m_scan_reorg_flag are read concurrently from many TC threads in 707 * the execDIH_SCAN_TAB_REQ so we place these close to each other. 708 */ 709 TabStatus tabStatus; 710 Uint32 schemaTransId; 711 Uint32 totalfragments; 712 /** 713 * partitionCount differs from totalfragments for fully replicated 714 * tables. 715 */ 716 Uint32 partitionCount; 717 union { 718 Uint32 mask; 719 Uint32 m_map_ptr_i; 720 }; 721 Uint32 m_scan_reorg_flag; 722 Uint32 m_flags; 723 724 Uint8 noOfBackups; 725 Uint8 kvalue; 726 Uint16 primaryTableId; 727 728 Uint16 noPages; 729 Uint16 tableType; 730 731 Uint32 schemaVersion; 732 union { 733 Uint32 hashpointer; 734 Uint32 m_new_map_ptr_i; 735 }; 736 Method method; 737 738 739 740 //----------------------------------------------------------------------------- 741 // Each entry in this array contains a reference to 16 fragment records in a 742 // row. Thus finding the correct record is very quick provided the fragment id. 743 //----------------------------------------------------------------------------- 744 Uint32 startFid[(MAX_NDB_PARTITIONS - 1) / NO_OF_FRAGS_PER_CHUNK + 1]; 745 746 CopyStatus tabCopyStatus; 747 UpdateState tabUpdateState; 748 TabLcpStatus tabLcpStatus; 749 Storage tabStorage; 750 751 Uint32 tabFile[2]; 752 Uint32 noOfWords; 753 Uint32 tabRemoveNode; 754 Uint32 noOfFragChunks; 755 Uint32 tabActiveLcpFragments; 756 757 struct { 758 Uint32 tabUserRef; 759 Uint32 tabUserPtr; 760 } m_dropTab; 761 Uint32 connectrec; 762 763 // set in local protocol during prepare until commit 764 /** 765 * m_scan_count is heavily updated by all TC threads as they start and 766 * stop scans. This is always updated when also grabbing the mutex, 767 * so we place it close to the declaration of the mutex to avoid 768 * contaminating too many CPU cache lines. 
769 */ 770 Uint32 m_scan_count[2]; 771 772 /** 773 * This mutex protects the changes to m_scan_count to ensure that we 774 * complete old scans relying on old meta data before removing the 775 * metadata parts. It also protects the combination of tabStatus 776 * schemaTransId checked for in execDIH_SCAN_TAB_REQ(...). 777 * 778 * Given that DIH_SCAN_TAB_REQ also reads totalfragments, partitionCount 779 * m_map_ptr_i, noOfBackups, m_scan_reorg_flag we protect those variables 780 * as well with this mutex. These variables are also protected by the 781 * above NdbSeqLock to ensure that execDIGETNODESREQ can execute 782 * concurrently from many TC threads simultaneously. 783 * 784 * DIH_SCAN_TAB_REQ and DIH_SCAN_TAB_COMPLETE_REP are called once per 785 * scan at start and end. These will both grab a mutex on the table 786 * object. This should support in the order of a few million scans 787 * per table per data node. This should suffice. The need for a mutex 788 * comes from the fact that we need to keep track of number of scans. 789 * Thus we need to update from many different threads. 790 * 791 * DIGETNODESREQ is called once per primary key operation and once 792 * per fragment scanned in a scan operation. This means that it can 793 * be called many millions of times per second in a data node. Thus 794 * a mutex per table is not sufficient. The data read in DIGETNODESREQ 795 * is updated very seldom. So we use the RCU mechanism, we read 796 * the value of the NdbSeqLock before reading the variables, we then 797 * read the variables protected by this mechanism whereafter we verify 798 * that the NdbSeqLock hasn't changed its value. 799 * 800 * It is noteworthy that using RCU requires reading the lock variable 801 * before and after in both the successful case as well as in the 802 * error case. We cannot deduce an error until we have verified that 803 * we have read consistent data. 
804 * 805 * So with this mechanism DIGETNODESREQ can scale to almost any number 806 * of key operations and fragment scans per second with minor glitches 807 * while still performing online schema changes. 808 * 809 * We put the mutex surrounded by variables that are not used in normal 810 * operation to minimize the bad effects of CPU cache misses. 811 */ 812 NdbMutex theMutex; 813 814 Uint32 pageRef[PACK_TABLE_PAGES]; // TODO: makedynamic 815 }; 816 typedef Ptr<TabRecord> TabRecordPtr; 817 818 /***************************************************************************/ 819 /* THIS RECORD IS USED TO KEEP TRACK OF TAKE OVER AND STARTING A NODE. */ 820 /* WE KEEP IT IN A RECORD TO ENABLE IT TO BE PARALLELISED IN THE FUTURE. */ 821 /**************************************************************************/ 822 struct TakeOverRecord { 823 TakeOverRecordDbdih::TakeOverRecord824 TakeOverRecord() {} 825 826 /** 827 * States possible on slave (starting node) 828 */ 829 enum ToSlaveStatus { 830 TO_SLAVE_IDLE = 0 831 ,TO_START_FRAGMENTS = 1 // Finding LCP for each fragment 832 ,TO_RUN_REDO = 2 // Waiting for local LQH to run REDO 833 ,TO_START_TO = 3 // Waiting for master (START_TOREQ) 834 ,TO_SELECTING_NEXT = 4 // Selecting next fragment to copy 835 ,TO_PREPARE_COPY = 5 // Waiting for local LQH (PREPARE_COPYREQ) 836 ,TO_UPDATE_BEFORE_STORED = 6 // Waiting on master (UPDATE_TOREQ) 837 ,TO_UPDATE_FRAG_STATE_STORED = 7 838 // Waiting for all UPDATE_FRAG_STATEREQ stored 839 ,TO_UPDATE_AFTER_STORED = 8 // Waiting for master (UPDATE_TOREQ) 840 ,TO_COPY_FRAG = 9 // Waiting for copy node (COPY_FRAGREQ) 841 ,TO_COPY_ACTIVE = 10 // Waiting for local LQH (COPY_ACTIVEREQ) 842 ,TO_UPDATE_BEFORE_COMMIT = 11// Waiting for master (UPDATE_TOREQ) 843 ,TO_UPDATE_FRAG_STATE_COMMIT = 12 844 // Waiting for all (UPDATE_FRAG_STATEREQ commit) 845 ,TO_UPDATE_AFTER_COMMIT = 13 // Waiting for master (UPDATE_TOREQ) 846 847 ,TO_START_LOGGING = 14 // Enabling logging on all fragments 848 
,TO_SL_COPY_ACTIVE = 15 // Start logging: Copy active (local) 849 ,TO_SL_UPDATE_FRAG_STATE = 16 // Start logging: Create Frag (dist) 850 ,TO_END_TO = 17 // Waiting for master (END_TOREQ) 851 ,TO_QUEUED_UPDATE_BEFORE_STORED = 18 //Queued 852 ,TO_QUEUED_UPDATE_BEFORE_COMMIT = 19 //Queued 853 ,TO_QUEUED_SL_UPDATE_FRAG_STATE = 20 //Queued 854 }; 855 856 /** 857 * States possible on master 858 */ 859 enum ToMasterStatus { 860 TO_MASTER_IDLE = 0 861 ,TO_MUTEX_BEFORE_STORED = 1 // Waiting for lock 862 ,TO_MUTEX_BEFORE_LOCKED = 2 // Lock held 863 ,TO_AFTER_STORED = 3 // No lock, but NGPtr reservation 864 ,TO_MUTEX_BEFORE_COMMIT = 4 // Waiting for lock 865 ,TO_MUTEX_BEFORE_SWITCH_REPLICA = 5 // Waiting for switch replica lock 866 ,TO_MUTEX_AFTER_SWITCH_REPLICA = 6 867 ,TO_WAIT_LCP = 7 // No locks, waiting for LCP 868 }; 869 /** 870 * For node restarts we use a number of parallel take over records 871 * such that we can copy fragments from several LDM instances in 872 * parallel. Each thread will take care of a subset of LDM 873 * instances provided by knowing the number of instances and 874 * our thread id. For each replica we will then check if 875 * replica_instance_id % m_number_of_copy_threads == m_copy_thread_id. 
876 */ 877 Uint32 m_copy_thread_id; 878 Uint32 m_number_of_copy_threads; 879 Uint32 m_copy_threads_completed; 880 881 Uint32 m_flags; // 882 Uint32 m_senderRef; // Who requested START_COPYREQ 883 Uint32 m_senderData; // Data of sender 884 885 Uint32 restorableGci; // Which GCI can be restore "locally" by node 886 Uint32 startGci; 887 Uint32 maxPage; 888 Uint32 toCopyNode; 889 Uint32 toCurrentFragid; 890 Uint32 toCurrentReplica; 891 Uint32 toCurrentTabref; 892 Uint32 toFailedNode; 893 Uint32 toStartingNode; 894 NDB_TICKS toStartTime; 895 ToSlaveStatus toSlaveStatus; 896 ToMasterStatus toMasterStatus; 897 898 MutexHandle2<DIH_SWITCH_PRIMARY_MUTEX> m_switchPrimaryMutexHandle; 899 MutexHandle2<DIH_FRAGMENT_INFO> m_fragmentInfoMutex; 900 901 Uint32 nextList; 902 union { 903 Uint32 prevList; 904 Uint32 nextPool; 905 }; 906 }; 907 typedef Ptr<TakeOverRecord> TakeOverRecordPtr; 908 typedef ArrayPool<TakeOverRecord> TakeOverRecord_pool; 909 typedef DLList<TakeOverRecord_pool> TakeOverRecord_list; 910 typedef SLFifoList<TakeOverRecord_pool> TakeOverRecord_fifo; 911 912 getParam(const char * param,Uint32 * retVal)913 virtual bool getParam(const char * param, Uint32 * retVal) { 914 if (param && strcmp(param, "ActiveMutexes") == 0) 915 { 916 if (retVal) 917 { 918 * retVal = 5 + MAX_NDB_NODES; 919 } 920 return true; 921 } 922 return false; 923 } 924 925 public: 926 Dbdih(Block_context& ctx); 927 virtual ~Dbdih(); 928 929 struct RWFragment { 930 Uint32 pageIndex; 931 Uint32 wordIndex; 932 Uint32 fragId; 933 TabRecordPtr rwfTabPtr; 934 PageRecordPtr rwfPageptr; 935 Uint32 totalfragments; 936 }; 937 struct CopyTableNode { 938 Uint32 pageIndex; 939 Uint32 wordIndex; 940 Uint32 noOfWords; 941 TabRecordPtr ctnTabPtr; 942 PageRecordPtr ctnPageptr; 943 }; 944 945 private: 946 friend class SimulatedBlock; 947 BLOCK_DEFINES(Dbdih); 948 949 /** 950 * Methods used in Node Recovery Status module 951 * ------------------------------------------- 952 */ 953 void execDBINFO_SCANREQ(Signal *); 
954 void execALLOC_NODEID_REP(Signal *); 955 void execINCL_NODE_HB_PROTOCOL_REP(Signal *); 956 void execNDBCNTR_START_WAIT_REP(Signal *); 957 void execNDBCNTR_STARTED_REP(Signal *); 958 void execSUMA_HANDOVER_COMPLETE_REP(Signal *); 959 void execEND_TOREP(Signal *signal); 960 void execLOCAL_RECOVERY_COMP_REP(Signal *signal); 961 962 void sendEND_TOREP(Signal *signal, Uint32 startNodeId); 963 bool check_stall_lcp_start(void); 964 void check_node_not_restarted_yet(NodeRecordPtr nodePtr); 965 void setNodeRecoveryStatus(Uint32 nodeId, 966 NodeRecord::NodeRecoveryStatus new_status); 967 void setNodeRecoveryStatusInitial(NodeRecordPtr nodePtr); 968 void initNodeRecoveryTimers(NodeRecordPtr nodePtr); 969 void initNodeRecoveryStatus(); 970 void initNodeRecord(NodeRecordPtr); 971 bool check_for_too_long_wait(Uint64 &lcp_max_wait_time, 972 Uint64 &lcp_stall_time, 973 NDB_TICKS now); 974 void check_all_node_recovery_timers(void); 975 bool check_node_recovery_timers(Uint32 nodeId); 976 void calculate_time_remaining(Uint32 nodeId, 977 NDB_TICKS state_start_time, 978 NDB_TICKS now, 979 NodeRecord::NodeRecoveryStatus state, 980 Uint32 *node_waited_for, 981 Uint64 *time_since_state_start, 982 NodeRecord::NodeRecoveryStatus *max_status); 983 void calculate_most_recent_node(Uint32 nodeId, 984 NDB_TICKS state_start_time, 985 NodeRecord::NodeRecoveryStatus state, 986 Uint32 *most_recent_node, 987 NDB_TICKS *most_recent_start_time, 988 NodeRecord::NodeRecoveryStatus *most_recent_state); 989 const char* get_status_str(NodeRecord::NodeRecoveryStatus status); 990 void fill_row_with_node_restart_status(NodeRecordPtr nodePtr, 991 Ndbinfo::Row &row); 992 void write_zero_columns(Ndbinfo::Row &row, Uint32 num_rows); 993 void handle_before_master(NodeRecordPtr nodePtr, Ndbinfo::Row &row); 994 /* End methods for Node Recovery Status module */ 995 996 void execDUMP_STATE_ORD(Signal *); 997 void execNDB_TAMPER(Signal *); 998 void execDEBUG_SIG(Signal *); 999 void execMASTER_GCPREF(Signal *); 1000 
void execMASTER_GCPREQ(Signal *);
void execMASTER_GCPCONF(Signal *);
void execMASTER_LCPREF(Signal *);
void execMASTER_LCPREQ(Signal *);
void execMASTER_LCPCONF(Signal *);
void execNF_COMPLETEREP(Signal *);
void execSTART_PERMREQ(Signal *);
void execSTART_PERMCONF(Signal *);
void execSTART_PERMREF(Signal *);
void execINCL_NODEREQ(Signal *);
void execINCL_NODECONF(Signal *);

void execSTART_TOREQ(Signal *);
void execSTART_TOREF(Signal *);
void execSTART_TOCONF(Signal*);

void execEND_TOREQ(Signal *);
void execEND_TOREF(Signal *);
void execEND_TOCONF(Signal*);

void execUPDATE_TOREQ(Signal* signal);
void execUPDATE_TOREF(Signal* signal);
void execUPDATE_TOCONF(Signal* signal);

void execSTART_MEREQ(Signal *);
void execSTART_MECONF(Signal *);
void execSTART_MEREF(Signal *);
void execSTART_COPYREQ(Signal *);
void execSTART_COPYCONF(Signal *);
void execSTART_COPYREF(Signal *);
void execUPDATE_FRAG_STATEREQ(Signal *);
void execUPDATE_FRAG_STATECONF(Signal *);
void execDIVERIFYREQ(Signal *);
void execGCP_SAVEREQ(Signal *);
void execGCP_SAVECONF(Signal *);
void execGCP_PREPARECONF(Signal *);
void execGCP_PREPARE(Signal *);
void execGCP_NODEFINISH(Signal *);
void execGCP_COMMIT(Signal *);
void execSUB_GCP_COMPLETE_REP(Signal *);
void execSUB_GCP_COMPLETE_ACK(Signal *);
void execDIHNDBTAMPER(Signal *);
void execCONTINUEB(Signal *);
void execCOPY_GCIREQ(Signal *);
void execCOPY_GCICONF(Signal *);
void execCOPY_TABREQ(Signal *);
void execCOPY_TABCONF(Signal *);
void execTCGETOPSIZECONF(Signal *);
void execTC_CLOPSIZECONF(Signal *);
void execCHECK_LCP_IDLE_ORD(Signal *);

void execDIH_GET_TABINFO_REQ(Signal*);
void execSET_UP_MULTI_TRP_CONF(Signal*);

/**
 * A number of functions used to find out if any node is
 * currently restarting.
 */
void execCHECK_NODE_RESTARTREQ(Signal*);
void check_node_in_restart(Signal*, BlockReference, Uint32);
void sendCHECK_NODE_RESTARTCONF(Signal*, BlockReference, Uint32);

int handle_invalid_lcp_no(const struct LcpFragRep*, ReplicaRecordPtr);
void execLCP_FRAG_REP(Signal *);
void execLCP_COMPLETE_REP(Signal *);
void execSTART_LCP_REQ(Signal *);
void execSTART_LCP_CONF(Signal *);
MutexHandle2<DIH_START_LCP_MUTEX> c_startLcpMutexHandle;
void startLcpMutex_locked(Signal* signal, Uint32, Uint32);
void startLcpMutex_unlocked(Signal* signal, Uint32, Uint32);
void lcpFragmentMutex_locked(Signal* signal, Uint32, Uint32);
void master_lcp_fragmentMutex_locked(Signal* signal, Uint32, Uint32);

void switch_primary_stop_node(Signal* signal, Uint32, Uint32);

MutexHandle2<DIH_SWITCH_PRIMARY_MUTEX> c_switchPrimaryMutexHandle;
MutexHandle2<DIH_FRAGMENT_INFO> c_fragmentInfoMutex_lcp;

/* LCP Pausing module start */
void execFLUSH_LCP_REP_REQ(Signal*);
void execFLUSH_LCP_REP_CONF(Signal*);
void execPAUSE_LCP_REQ(Signal*);
void execPAUSE_LCP_CONF(Signal*);

void sendPAUSE_LCP_REQ(Signal*, bool pause);
bool check_if_lcp_idle(void);
void pause_lcp(Signal *signal,
               Uint32 startNode,
               BlockReference sender_ref);
void unpause_lcp(Signal *signal,
                 Uint32 startNode,
                 BlockReference sender_ref,
                 PauseLcpReq::PauseAction pauseAction);
void check_for_pause_action(Signal *signal,
                            StartLcpReq::PauseStart pauseStart);
void end_pause(Signal *signal, PauseLcpReq::PauseAction pauseAction);
void stop_pause(Signal *signal);
void handle_node_failure_in_pause(Signal *signal);
void dequeue_lcp_rep(Signal*);
void start_copy_meta_data(Signal*);
void start_lcp(Signal*);
void start_lcp_before_mutex(Signal*);
void queue_lcp_frag_rep(Signal *signal, LcpFragRep *lcpReport);
void queue_lcp_complete_rep(Signal *signal, Uint32 lcpId);
void init_lcp_pausing_module(void);
bool check_pause_state_sanity(void);
void check_pause_state_lcp_idle(void);

/**
 * This is only true when an LCP is running and it is running with
 * support for PAUSE LCP (all DIH nodes support it). Actually this
 * is set when we have passed the START_LCP_REQ step. After this
 * step we release the fragment info mutex if we can use the pause
 * lcp protocol with all nodes.
 */
bool c_lcp_runs_with_pause_support; /* Master state */

/**
 * This is the state in the master that keeps track of where the master is
 * in the PAUSE LCP process. We can follow two different tracks in the
 * state traversal.
 *
 * 1) When the starting node is included into the LCP as part of PAUSE LCP
 *    handling. This is the expected outcome after pausing. The LCP didn't
 *    complete while we were pausing. We need to be included into the LCP
 *    here to ensure that the LCP state in the starting node is kept up to
 *    date during the rest of the LCP.
 *
 *    PAUSE_LCP_IDLE -> PAUSE_LCP_REQUESTED
 *    PAUSE_LCP_REQUESTED -> PAUSE_START_LCP_INCLUSION
 *    PAUSE_START_LCP_INCLUSION -> PAUSE_IN_LCP_COPY_META_DATA
 *    PAUSE_IN_LCP_COPY_META_DATA -> PAUSE_COMPLETE_LCP_INCLUSION
 *    PAUSE_COMPLETE_LCP_INCLUSION -> PAUSE_IN_LCP_UNPAUSE
 *    PAUSE_IN_LCP_UNPAUSE -> PAUSE_LCP_IDLE
 *
 * 2) When the starting node isn't included into the LCP as part of PAUSE
 *    LCP handling. While we were pausing the LCP completed. Thus no need
 *    to include the new node into the LCP since no more updates of the
 *    LCP state will happen after the pause.
 *
 *    PAUSE_LCP_IDLE -> PAUSE_LCP_REQUESTED
 *    PAUSE_LCP_REQUESTED -> PAUSE_NOT_IN_LCP_COPY_META_DATA
 *    PAUSE_NOT_IN_LCP_COPY_META_DATA -> PAUSE_NOT_IN_LCP_UNPAUSE
 *    PAUSE_NOT_IN_LCP_UNPAUSE -> PAUSE_LCP_IDLE
 */
enum PauseLCPState
{
  PAUSE_LCP_IDLE = 0,
  PAUSE_LCP_REQUESTED = 1,
  /* States to handle inclusion in LCP. */
  PAUSE_START_LCP_INCLUSION = 2,
  PAUSE_IN_LCP_COPY_META_DATA = 3,
  PAUSE_COMPLETE_LCP_INCLUSION = 4,
  PAUSE_IN_LCP_UNPAUSE = 5,
  /* States to handle not included in LCP */
  PAUSE_NOT_IN_LCP_COPY_META_DATA = 6,
  PAUSE_NOT_IN_LCP_UNPAUSE = 7
};
PauseLCPState c_pause_lcp_master_state;

/**
 * Bitmask of nodes that we're expecting a PAUSE_LCP_CONF response from.
 * This bitmask is cleared if the starting node dies (or for that matter
 * if any node dies since this will cause the starting node to also fail).
 * The PAUSE_LCP_REQ_Counter is only used in the master node.
 */
SignalCounter c_PAUSE_LCP_REQ_Counter; /* Master state */

/**
 * We need to keep track of the LQH nodes that participated in the PAUSE
 * LCP request to ensure that we unpause the same set of nodes in the
 * unpause request. If the LCP completes while the pause request phase is
 * in progress, then the m_participatingLQH bitmap will be cleared and
 * we need this bitmap also to unpause the participants even if the
 * LCP has completed to ensure that the pause state is reset. This variable
 * is used to make sure that we retain this bitmap independent of what
 * happens with the LCP.
 */
NdbNodeBitmask c_pause_participants;

/**
 * This variable states which is the node starting up that requires a
 * pause of the LCP to copy the meta data during an ongoing LCP.
 * If the node fails this variable is set to RNIL to indicate we no
 * longer need to worry about signals handling this pause.
 *
 * This is also the state variable that says that pause lcp is ongoing
 * in this participant.
 */
Uint32 c_pause_lcp_start_node;

/* True when the given node is the one we are pausing the LCP for. */
bool is_pause_for_this_node(Uint32 node)
{
  return (node == c_pause_lcp_start_node);
}

/**
 * When is_lcp_paused is true then c_dequeue_lcp_rep_ongoing is false.
 * When is_lcp_paused is false then c_dequeue_lcp_rep_ongoing is true
 * until we have dequeued all queued requests. Requests will be
 * queued as long as either of them are true to ensure that we keep
 * the order of signals.
 */
bool is_lcp_paused()
{
  return (c_pause_lcp_start_node != RNIL);
}
bool c_dequeue_lcp_rep_ongoing;

/**
 * Last LCP id we heard LCP_COMPLETE_REP from local LQH. We record this
 * to ensure we only get one LCP_COMPLETE_REP per LCP from our local
 * LQH.
 */
Uint32 c_last_id_lcp_complete_rep;
bool c_queued_lcp_complete_rep;

/**
 * As soon as we have some LCP_FRAG_REP or LCP_COMPLETE_REP queued, this
 * variable gives us the lcp Id of the paused LCP.
 */
Uint32 c_lcp_id_paused;

/**
 * We set the LCP Id when receiving COPY_TABREQ to be used in the
 * updateLcpInfo routine.
 */
Uint32 c_lcp_id_while_copy_meta_data; /* State in starting node */

/**
 * A bitmap for outstanding FLUSH_LCP_REP_REQ messages to know
 * when all nodes have sent their reply. This bitmap is used in all nodes
 * that receive the PAUSE_LCP_REQ request.
 */
SignalCounter c_FLUSH_LCP_REP_REQ_Counter;
/* LCP Pausing module end */

void execBLOCK_COMMIT_ORD(Signal *);
void execUNBLOCK_COMMIT_ORD(Signal *);

void execDIH_SWITCH_REPLICA_REQ(Signal *);
void execDIH_SWITCH_REPLICA_REF(Signal *);
void execDIH_SWITCH_REPLICA_CONF(Signal *);

void execSTOP_PERM_REQ(Signal *);
void execSTOP_PERM_REF(Signal *);
void execSTOP_PERM_CONF(Signal *);

void execSTOP_ME_REQ(Signal *);
void execSTOP_ME_REF(Signal *);
void execSTOP_ME_CONF(Signal *);

void execREAD_CONFIG_REQ(Signal *);
void execUNBLO_DICTCONF(Signal *);
void execCOPY_ACTIVECONF(Signal *);
void execTAB_COMMITREQ(Signal *);
void execNODE_FAILREP(Signal *);
void execCOPY_FRAGCONF(Signal *);
void execCOPY_FRAGREF(Signal *);
void execPREPARE_COPY_FRAG_REF(Signal*);
void execPREPARE_COPY_FRAG_CONF(Signal*);
void execDIADDTABREQ(Signal *);
void execDIGETNODESREQ(Signal *);
void execSTTOR(Signal *);
void execDIH_SCAN_TAB_REQ(Signal *);
void execDIH_SCAN_TAB_COMPLETE_REP(Signal*);
void execGCP_SAVEREF(Signal *);
void execGCP_TCFINISHED(Signal *);
void execGCP_TCFINISHED_sync_conf(Signal* signal, Uint32 cb, Uint32 err);
void execREAD_NODESCONF(Signal *);
void execNDB_STTOR(Signal *);
void execDICTSTARTCONF(Signal *);
void execNDB_STARTREQ(Signal *);
void execGETGCIREQ(Signal *);
void execGET_LATEST_GCI_REQ(Signal*);
void execSET_LATEST_LCP_ID(Signal*);
void execDIH_RESTARTREQ(Signal *);
void execSTART_RECCONF(Signal *);
void execSTART_FRAGREF(Signal *);
void execSTART_FRAGCONF(Signal *);
void execADD_FRAGCONF(Signal *);
void execADD_FRAGREF(Signal *);
void execDROP_FRAG_REF(Signal *);
void execDROP_FRAG_CONF(Signal *);
void execFSOPENCONF(Signal *);
void execFSOPENREF(Signal *);
void execFSCLOSECONF(Signal *);
void execFSCLOSEREF(Signal *);
void execFSREADCONF(Signal *);
void execFSREADREF(Signal *);
void execFSWRITECONF(Signal *);
void execFSWRITEREF(Signal *);
void execCHECKNODEGROUPSREQ(Signal *);
void execSTART_INFOREQ(Signal*);
void execSTART_INFOREF(Signal*);
void execSTART_INFOCONF(Signal*);
void execWAIT_GCP_REQ(Signal* signal);
void execWAIT_GCP_REF(Signal* signal);
void execWAIT_GCP_CONF(Signal* signal);
void execREDO_STATE_REP(Signal* signal);

void execPREP_DROP_TAB_REQ(Signal* signal);
void execDROP_TAB_REQ(Signal* signal);

void execALTER_TAB_REQ(Signal* signal);

void execCREATE_FRAGMENTATION_REQ(Signal*);
/* Sanity-check a fragmentation array produced for a table. */
bool verify_fragmentation(Uint16* fragments,
                          Uint32 partition_count,
                          Uint32 partition_balance,
                          Uint32 ldm_count) const;

void waitDropTabWritingToFile(Signal *, TabRecordPtr tabPtr);
void checkDropTabComplete(Signal *, TabRecordPtr tabPtr);

void execDICT_LOCK_CONF(Signal* signal);
void execDICT_LOCK_REF(Signal* signal);

void execUPGRADE_PROTOCOL_ORD(Signal* signal);

void execCREATE_NODEGROUP_IMPL_REQ(Signal*);
void execDROP_NODEGROUP_IMPL_REQ(Signal*);

void execSTART_NODE_LCP_CONF(Signal *signal);
void handleStartLcpReq(Signal*, StartLcpReq*);
/* Saved copy of the request being processed by handleStartLcpReq. */
StartLcpReq c_save_startLcpReq;
bool c_start_node_lcp_req_outstanding;

// Statement blocks
//------------------------------------
// Methods that send signals
//------------------------------------
void nullRoutine(Signal *, Uint32 nodeId, Uint32);
void sendCOPY_GCIREQ(Signal *, Uint32 nodeId, Uint32);
void sendDIH_SWITCH_REPLICA_REQ(Signal *, Uint32 nodeId, Uint32);
void sendEND_TOREQ(Signal *, Uint32 nodeId, Uint32);
void sendGCP_COMMIT(Signal *, Uint32 nodeId, Uint32);
void sendGCP_PREPARE(Signal *, Uint32 nodeId, Uint32);
void sendGCP_SAVEREQ(Signal *, Uint32 nodeId, Uint32);
void sendSUB_GCP_COMPLETE_REP(Signal*, Uint32 nodeId, Uint32);
void sendINCL_NODEREQ(Signal *, Uint32 nodeId, Uint32);
void sendMASTER_GCPREQ(Signal *, Uint32 nodeId, Uint32);
void sendMASTER_LCPREQ(Signal *, Uint32 nodeId, Uint32);
void sendMASTER_LCPCONF(Signal * signal, Uint32 fromLine);
void sendSTART_RECREQ(Signal *, Uint32 nodeId, Uint32);
void sendSTART_INFOREQ(Signal *, Uint32 nodeId, Uint32);
void sendSTOP_ME_REQ(Signal *, Uint32 nodeId, Uint32);
void sendTC_CLOPSIZEREQ(Signal *, Uint32 nodeId, Uint32);
void sendTCGETOPSIZEREQ(Signal *, Uint32 nodeId, Uint32);
void sendUPDATE_TOREQ(Signal *, Uint32 nodeId, Uint32);
void sendSTART_LCP_REQ(Signal *, Uint32 nodeId, Uint32);

void sendLCP_FRAG_ORD(Signal*, NodeRecord::FragmentCheckpointInfo info);
void sendLastLCP_FRAG_ORD(Signal *);

void sendCopyTable(Signal *, CopyTableNode* ctn,
                   BlockReference ref, Uint32 reqinfo);
void sendDihfragreq(Signal *,
                    TabRecordPtr regTabPtr,
                    Uint32 fragId);

void sendStartFragreq(Signal *,
                      TabRecordPtr regTabPtr,
                      Uint32 fragId);

void sendAddFragreq(Signal*,
                    ConnectRecordPtr,
                    TabRecordPtr,
                    Uint32 fragId,
                    bool rcu_lock_held);
void addTable_closeConf(Signal* signal, Uint32 tabPtrI);
void resetReplicaSr(TabRecordPtr tabPtr);
void resetReplicaLcp(ReplicaRecord * replicaP, Uint32 stopGci);
void resetReplica(Ptr<ReplicaRecord>);

/**
 * Methods part of Transaction Handling module
 */
void start_scan_on_table(TabRecordPtr, Signal*, Uint32, EmulatedJamBuffer*);
void complete_scan_on_table(TabRecordPtr tabPtr, Uint32, EmulatedJamBuffer*);

bool prepare_add_table(TabRecordPtr, ConnectRecordPtr, Signal*);
void commit_new_table(TabRecordPtr);

void make_node_usable(NodeRecord *nodePtr);
void make_node_not_usable(NodeRecord *nodePtr);

void start_add_fragments_in_new_table(TabRecordPtr,
                                      ConnectRecordPtr,
                                      const Uint16 buf[],
                                      Signal *signal);
void make_new_table_writeable(TabRecordPtr, ConnectRecordPtr, bool);
void make_new_table_read_and_writeable(TabRecordPtr,
                                       ConnectRecordPtr,
                                       Signal*);
bool make_old_table_non_writeable(TabRecordPtr, ConnectRecordPtr);
void make_table_use_new_replica(TabRecordPtr,
                                FragmentstorePtr fragPtr,
                                ReplicaRecordPtr,
                                Uint32 replicaType,
                                Uint32 destNodeId);
void make_table_use_new_node_order(TabRecordPtr,
                                   FragmentstorePtr,
                                   Uint32,
                                   Uint32*);
void make_new_table_non_writeable(TabRecordPtr);
void drop_fragments_from_new_table_view(TabRecordPtr, ConnectRecordPtr);

//------------------------------------
// Methods for LCP functionality
//------------------------------------
void checkKeepGci(TabRecordPtr, Uint32, Fragmentstore*, Uint32);
void checkLcpStart(Signal *, Uint32 lineNo, Uint32 delay);
bool checkStartMoreLcp(Signal *, Uint32 nodeId, bool startNext);
bool reportLcpCompletion(const struct LcpFragRep *);
void sendLCP_COMPLETE_REP(Signal *);

//------------------------------------
// Methods for Delete Table Files
//------------------------------------
void startDeleteFile(Signal* signal, TabRecordPtr tabPtr);
void openTableFileForDelete(Signal* signal, Uint32 fileIndex);
void tableOpenLab(Signal* signal, FileRecordPtr regFilePtr);
void tableDeleteLab(Signal* signal, FileRecordPtr regFilePtr);

//------------------------------------
// File Record specific methods
//------------------------------------
void closeFile(Signal *, FileRecordPtr regFilePtr);
void closeFileDelete(Signal *, FileRecordPtr regFilePtr);
void createFileRw(Signal *, FileRecordPtr regFilePtr);
void openFileRw(Signal *, FileRecordPtr regFilePtr);
void openFileRo(Signal *, FileRecordPtr regFilePtr);
void seizeFile(FileRecordPtr& regFilePtr);
void releaseFile(Uint32 fileIndex);

//------------------------------------
// Methods called when completing file
// operation.
//------------------------------------
void creatingGcpLab(Signal *, FileRecordPtr regFilePtr);
void openingGcpLab(Signal *, FileRecordPtr regFilePtr);
void openingTableLab(Signal *, FileRecordPtr regFilePtr);
void tableCreateLab(Signal *, FileRecordPtr regFilePtr);
void creatingGcpErrorLab(Signal *, FileRecordPtr regFilePtr);
void openingCopyGciErrorLab(Signal *, FileRecordPtr regFilePtr);
void creatingCopyGciErrorLab(Signal *, FileRecordPtr regFilePtr);
void openingGcpErrorLab(Signal *, FileRecordPtr regFilePtr);
void openingTableErrorLab(Signal *, FileRecordPtr regFilePtr);
void tableCreateErrorLab(Signal *, FileRecordPtr regFilePtr);
void closingGcpLab(Signal *, FileRecordPtr regFilePtr);
void closingGcpCrashLab(Signal *, FileRecordPtr regFilePtr);
void closingTableCrashLab(Signal *, FileRecordPtr regFilePtr);
void closingTableSrLab(Signal *, FileRecordPtr regFilePtr);
void tableCloseLab(Signal *, FileRecordPtr regFilePtr);
void tableCloseErrorLab(FileRecordPtr regFilePtr);
void readingGcpLab(Signal *, FileRecordPtr regFilePtr);
void readingTableLab(Signal *, FileRecordPtr regFilePtr);
void readingGcpErrorLab(Signal *, FileRecordPtr regFilePtr);
void readingTableErrorLab(Signal *, FileRecordPtr regFilePtr);
void writingCopyGciLab(Signal *, FileRecordPtr regFilePtr);
void writeInitGcpLab(Signal *, FileRecordPtr regFilePtr);
void tableWriteLab(Signal *, FileRecordPtr regFilePtr);
void writeInitGcpErrorLab(Signal *, FileRecordPtr regFilePtr);


void checkEscalation();
void clearRestartInfoBits(Signal *);
void invalidateLcpInfoAfterSr(Signal*);

bool isMaster();
bool isActiveMaster();

void handleGcpStateInMaster(Signal *, NodeRecordPtr failedNodeptr); 1474 void initRestartInfo(Signal*); 1475 void initRestorableGciFiles(); 1476 void makeNodeGroups(Uint32 nodeArray[]); 1477 void add_nodegroup(NodeGroupRecordPtr); 1478 void inc_ng_refcount(Uint32 ng); 1479 void dec_ng_refcount(Uint32 ng); 1480 1481 void makePrnList(class ReadNodesConf * readNodes, Uint32 nodeArray[]); 1482 void nodeResetStart(Signal* signal); 1483 void releaseTabPages(Uint32 tableId); 1484 void replication(Uint32 noOfReplicas, 1485 NodeGroupRecordPtr NGPtr, 1486 FragmentstorePtr regFragptr); 1487 void sendDihRestartRef(Signal*); 1488 void unpack_sysfile_format_v1(bool set_max_node_id); 1489 void pack_sysfile_format_v1(); 1490 void unpack_sysfile_format_v2(bool set_max_node_id); 1491 void pack_sysfile_format_v2(); 1492 void send_COPY_GCIREQ_data_v1(Signal*, Uint32); 1493 void send_COPY_GCIREQ_data_v2(Signal*, Uint32); 1494 void send_START_MECONF_data_v1(Signal*, Uint32); 1495 void send_START_MECONF_data_v2(Signal*, Uint32); 1496 void selectMasterCandidateAndSend(Signal *); 1497 void setLcpActiveStatusEnd(Signal*); 1498 void setLcpActiveStatusStart(Signal *); 1499 void setNodeActiveStatus(); 1500 void setNodeGroups(); 1501 void setNodeInfo(Signal *); 1502 void setNodeLcpActiveStatus(); 1503 void setNodeRestartInfoBits(Signal*); 1504 void startGcp(Signal *); 1505 void startGcpMonitor(Signal*); 1506 1507 void readFragment(RWFragment* rf, FragmentstorePtr regFragptr); 1508 Uint32 readPageWord(RWFragment* rf); 1509 void readReplica(RWFragment* rf, ReplicaRecordPtr readReplicaPtr); 1510 void readReplicas(RWFragment* rf, 1511 TabRecord *regTabPtr, 1512 FragmentstorePtr regFragptr); 1513 void updateLcpInfo(TabRecord *regTabPtr, 1514 Fragmentstore *regFragPtr, 1515 ReplicaRecord *regReplicaPtr); 1516 void readRestorableGci(Signal *, FileRecordPtr regFilePtr); 1517 void readTabfile(Signal *, TabRecord* tab, FileRecordPtr regFilePtr); 1518 void writeFragment(RWFragment* wf, FragmentstorePtr 
regFragptr); 1519 void writePageWord(RWFragment* wf, Uint32 dataWord); 1520 void writeReplicas(RWFragment* wf, Uint32 replicaStartIndex); 1521 void writeRestorableGci(Signal *, FileRecordPtr regFilePtr); 1522 void writeTabfile(Signal *, TabRecord* tab, FileRecordPtr regFilePtr); 1523 void copyTabReq_complete(Signal* signal, TabRecordPtr tabPtr); 1524 1525 void gcpcommitreqLab(Signal *); 1526 void copyGciLab(Signal *, CopyGCIReq::CopyReason reason); 1527 void storeNewLcpIdLab(Signal *); 1528 void startLcpRoundLoopLab(Signal *, Uint32 startTableId, Uint32 startFragId); 1529 1530 void nodeFailCompletedCheckLab(Signal*, NodeRecordPtr failedNodePtr); 1531 1532 /** 1533 * 1534 */ 1535 void setLocalNodefailHandling(Signal*, Uint32 failedNodeId, 1536 NodefailHandlingStep step); 1537 void checkLocalNodefailComplete(Signal*, Uint32 failedNodeId, 1538 NodefailHandlingStep step); 1539 1540 Callback m_sendSTTORRY; 1541 void sendSTTORRY(Signal*, Uint32 senderData = 0, Uint32 retVal = 0); 1542 void ndbsttorry10Lab(Signal *, Uint32 _line); 1543 void createMutexes(Signal* signal, Uint32 no); 1544 void createMutex_done(Signal* signal, Uint32 no, Uint32 retVal); 1545 void dumpGcpStop(); 1546 void crashSystemAtGcpStop(Signal *, bool); 1547 void sendFirstDictfragsreq(Signal *, TabRecordPtr regTabPtr); 1548 void addtabrefuseLab(Signal *, ConnectRecordPtr regConnectPtr, Uint32 errorCode); 1549 void GCP_SAVEhandling(Signal *, Uint32 nodeId); 1550 void packTableIntoPagesLab(Signal *, Uint32 tableId); 1551 void readPagesIntoTableLab(Signal *, Uint32 tableId); 1552 void readPagesIntoFragLab(Signal *, RWFragment* rf); 1553 void readTabDescriptionLab(Signal *, Uint32 tableId); 1554 void copyTableLab(Signal *, Uint32 tableId); 1555 void breakCopyTableLab(Signal *, 1556 TabRecordPtr regTabPtr, 1557 Uint32 nodeId); 1558 void checkAddfragCompletedLab(Signal *, 1559 TabRecordPtr regTabPtr, 1560 Uint32 fragId); 1561 void completeRestartLab(Signal *); 1562 void readTableFromPagesLab(Signal *, 
TabRecordPtr regTabPtr); 1563 void srPhase2ReadTableLab(Signal *, TabRecordPtr regTabPtr); 1564 void checkTcCounterLab(Signal *); 1565 void calculateKeepGciLab(Signal *, Uint32 tableId, Uint32 fragId); 1566 void tableUpdateLab(Signal *, TabRecordPtr regTabPtr); 1567 void checkLcpCompletedLab(Signal *); 1568 void initLcpLab(Signal *, Uint32 masterRef, Uint32 tableId); 1569 void startGcpLab(Signal *); 1570 void checkGcpStopLab(Signal *); 1571 void MASTER_GCPhandling(Signal *, Uint32 failedNodeId); 1572 void MASTER_LCPhandling(Signal *, Uint32 failedNodeId); 1573 void rnfTableNotReadyLab(Signal *, TabRecordPtr regTabPtr, Uint32 removeNodeId); 1574 void startLcpTakeOverLab(Signal *, Uint32 failedNodeId); 1575 1576 void startLcpMasterTakeOver(Signal *, Uint32 failedNodeId); 1577 void startGcpMasterTakeOver(Signal *, Uint32 failedNodeId); 1578 void checkGcpOutstanding(Signal*, Uint32 failedNodeId); 1579 1580 void checkEmptyLcpComplete(Signal *); 1581 void lcpBlockedLab(Signal *, Uint32, Uint32); 1582 void breakCheckTabCompletedLab(Signal *, TabRecordPtr regTabptr); 1583 void readGciFileLab(Signal *); 1584 void openingCopyGciSkipInitLab(Signal *, FileRecordPtr regFilePtr); 1585 void startLcpRoundLab(Signal *); 1586 void gcpBlockedLab(Signal *); 1587 void allNodesLcpCompletedLab(Signal *); 1588 void nodeRestartPh2Lab(Signal *); 1589 void nodeRestartPh2Lab2(Signal *); 1590 void initGciFilesLab(Signal *); 1591 void dictStartConfLab(Signal *); 1592 void nodeDictStartConfLab(Signal *, Uint32 nodeId); 1593 void ndbStartReqLab(Signal *, BlockReference ref); 1594 void nodeRestartStartRecConfLab(Signal *); 1595 void dihCopyCompletedLab(Signal *); 1596 void release_connect(ConnectRecordPtr ptr); 1597 void copyTableNode(Signal *, 1598 CopyTableNode* ctn, 1599 NodeRecordPtr regNodePtr); 1600 void startFragment(Signal *, Uint32 tableId, Uint32 fragId); 1601 bool checkLcpAllTablesDoneInLqh(Uint32 from); 1602 1603 void lcpStateAtNodeFailureLab(Signal *, Uint32 nodeId); 1604 void 
copyNodeLab(Signal *, Uint32 tableId); 1605 void copyGciReqLab(Signal *); 1606 void allLab(Signal *, 1607 ConnectRecordPtr regConnectPtr, 1608 TabRecordPtr regTabPtr); 1609 void tableCopyNodeLab(Signal *, TabRecordPtr regTabPtr); 1610 1611 void removeNodeFromTables(Signal *, Uint32 tableId, Uint32 nodeId); 1612 void removeNodeFromTable(Signal *, Uint32 tableId, TabRecordPtr tabPtr); 1613 void removeNodeFromTablesComplete(Signal* signal, Uint32 nodeId); 1614 1615 void packFragIntoPagesLab(Signal *, RWFragment* wf); 1616 void startNextChkpt(Signal *); 1617 void failedNodeLcpHandling(Signal*, NodeRecordPtr failedNodePtr, bool &); 1618 void failedNodeSynchHandling(Signal *, NodeRecordPtr failedNodePtr); 1619 void checkCopyTab(Signal*, NodeRecordPtr failedNodePtr); 1620 1621 Uint32 compute_max_failure_time(); 1622 void setGCPStopTimeouts(Signal*, 1623 bool set_gcp_save_max_lag = true, 1624 bool set_micro_gcp_max_lag = true); 1625 void sendINFO_GCP_STOP_TIMER(Signal*); 1626 void initCommonData(); 1627 void initialiseRecordsLab(Signal *, Uint32 stepNo, Uint32, Uint32); 1628 1629 void findReplica(ReplicaRecordPtr& regReplicaPtr, 1630 Fragmentstore* fragPtrP, 1631 Uint32 nodeId, 1632 bool oldStoredReplicas = false); 1633 //------------------------------------ 1634 // Node failure handling methods 1635 //------------------------------------ 1636 void startRemoveFailedNode(Signal *, NodeRecordPtr failedNodePtr); 1637 void handleGcpTakeOver(Signal *, NodeRecordPtr failedNodePtr); 1638 void handleLcpTakeOver(Signal *, NodeRecordPtr failedNodePtr); 1639 void handleTakeOver(Signal*, Ptr<TakeOverRecord>); 1640 void handleLcpMasterTakeOver(Signal *, Uint32 nodeId); 1641 1642 //------------------------------------ 1643 // Replica record specific methods 1644 //------------------------------------ 1645 Uint32 findLogInterval(ConstPtr<ReplicaRecord> regReplicaPtr, 1646 Uint32 startGci); 1647 void findMinGci(ReplicaRecordPtr fmgReplicaPtr, 1648 Uint32& keeGci, 1649 Uint32& 
oldestRestorableGci); 1650 bool findStartGci(Ptr<ReplicaRecord> fstReplicaPtr, 1651 Uint32 tfstStopGci, 1652 Uint32& tfstStartGci, 1653 Uint32& tfstLcp); 1654 void newCrashedReplica(ReplicaRecordPtr ncrReplicaPtr); 1655 void packCrashedReplicas(ReplicaRecordPtr pcrReplicaPtr); 1656 void releaseReplicas(Uint32 * replicaPtr); 1657 void removeOldCrashedReplicas(Uint32, Uint32, ReplicaRecordPtr rocReplicaPtr); 1658 void removeTooNewCrashedReplicas(ReplicaRecordPtr rtnReplicaPtr, Uint32 lastCompletedGCI); 1659 void mergeCrashedReplicas(ReplicaRecordPtr pcrReplicaPtr); 1660 void seizeReplicaRec(ReplicaRecordPtr& replicaPtr); 1661 1662 //------------------------------------ 1663 // Methods operating on a fragment and 1664 // its connected replicas and nodes. 1665 //------------------------------------ 1666 void insertCopyFragmentList(TabRecord *tabPtr, 1667 Fragmentstore *fragPtr, 1668 Uint32 my_fragid); 1669 void allocStoredReplica(FragmentstorePtr regFragptr, 1670 ReplicaRecordPtr& newReplicaPtr, 1671 Uint32 nodeId, 1672 Uint32 fragId, 1673 Uint32 tableId); 1674 Uint32 extractNodeInfo(EmulatedJamBuffer *jambuf, 1675 const Fragmentstore * fragPtr, 1676 Uint32 nodes[]); 1677 Uint32 findLocalFragment(const TabRecord *, 1678 Ptr<Fragmentstore> & fragPtr, 1679 EmulatedJamBuffer *jambuf); 1680 Uint32 findPartitionOrder(const TabRecord *tabPtrP, 1681 FragmentstorePtr fragPtr); 1682 Uint32 findFirstNewFragment(const TabRecord *, 1683 Ptr<Fragmentstore> & fragPtr, 1684 Uint32 fragId, 1685 EmulatedJamBuffer *jambuf); 1686 bool check_if_local_fragment(EmulatedJamBuffer *jambuf, 1687 const Fragmentstore *fragPtr); 1688 bool findBestLogNode(CreateReplicaRecord* createReplica, 1689 FragmentstorePtr regFragptr, 1690 Uint32 startGci, 1691 Uint32 stopGci, 1692 Uint32 logNode, 1693 Uint32& fblStopGci); 1694 bool findLogNodes(CreateReplicaRecord* createReplica, 1695 FragmentstorePtr regFragptr, 1696 Uint32 startGci, 1697 Uint32 stopGci); 1698 void initFragstore(FragmentstorePtr 
regFragptr, Uint32 fragId);
  //------------------------------------
  // Maintenance of the stored / old-stored replica lists
  // hanging off a Fragmentstore entry.
  //------------------------------------
  void insertfraginfo(FragmentstorePtr regFragptr,
                      Uint32 noOfBackups,
                      Uint32* nodeArray);
  void linkOldStoredReplica(FragmentstorePtr regFragptr,
                            ReplicaRecordPtr replicaPtr);
  void linkStoredReplica(FragmentstorePtr regFragptr,
                         ReplicaRecordPtr replicaPtr);
  void prepareReplicas(FragmentstorePtr regFragptr);
  void removeNodeFromStored(Uint32 nodeId,
                            FragmentstorePtr regFragptr,
                            ReplicaRecordPtr replicaPtr,
                            bool temporary);
  void removeOldStoredReplica(FragmentstorePtr regFragptr,
                              ReplicaRecordPtr replicaPtr);
  void removeStoredReplica(FragmentstorePtr regFragptr,
                           ReplicaRecordPtr replicaPtr);
  void searchStoredReplicas(FragmentstorePtr regFragptr);
  bool setup_create_replica(FragmentstorePtr, CreateReplicaRecord*,
                            Ptr<ReplicaRecord>);
  void updateNodeInfo(FragmentstorePtr regFragptr);

  //------------------------------------
  // Fragment allocation, deallocation and
  // find methods
  //------------------------------------
  void allocFragments(Uint32 noOfFragments, TabRecordPtr regTabPtr);
  void releaseFragments(TabRecordPtr regTabPtr);
  void getFragstore(const TabRecord *, Uint32 fragNo, FragmentstorePtr & ptr);
  void getFragstoreCanFail(const TabRecord *,
                           Uint32 fragNo,
                           FragmentstorePtr & ptr);
  void initialiseFragstore();

  void wait_old_scan(Signal*);
  Uint32 add_fragments_to_table(Ptr<TabRecord>, const Uint16 buf[]);
  Uint32 add_fragment_to_table(Ptr<TabRecord>, Uint32, Ptr<Fragmentstore>&);

  void drop_fragments(Signal*, ConnectRecordPtr, Uint32 last);
  void release_fragment_from_table(Ptr<TabRecord>, Uint32 fragId);
  void send_alter_tab_ref(Signal*, Ptr<TabRecord>,Ptr<ConnectRecord>, Uint32);
  void send_alter_tab_conf(Signal*, Ptr<ConnectRecord>);
  void alter_table_writeTable_conf(Signal* signal, Uint32 ptrI, Uint32 err);
  void saveTableFile(Signal*, Ptr<ConnectRecord>, Ptr<TabRecord>,
                     TabRecord::CopyStatus, Callback&);

  //------------------------------------
  // Page Record specific methods
  //------------------------------------
  void allocpage(PageRecordPtr& regPagePtr);
  void releasePage(Uint32 pageIndex);

  //------------------------------------
  // Table Record specific methods
  //------------------------------------
  void initTable(TabRecordPtr regTabPtr);
  void initTableFile(TabRecordPtr regTabPtr);
  void releaseTable(TabRecordPtr tabPtr);

  void handleTakeOverMaster(Signal *, Uint32 takeOverPtr);
  void handleTakeOverNewMaster(Signal *, Uint32 takeOverPtr);

  //------------------------------------
  // Node Record specific methods
  //------------------------------------
  void checkStartTakeOver(Signal *);
  void insertAlive(NodeRecordPtr newNodePtr);
  void insertDeadNode(NodeRecordPtr removeNodePtr);
  void removeAlive(NodeRecordPtr removeNodePtr);
  void removeDeadNode(NodeRecordPtr removeNodePtr);

  // Per-node status accessors (node status, active status, LCP activity,
  // start permission and copy-completion flags).
  NodeRecord::NodeStatus getNodeStatus(Uint32 nodeId);
  void setNodeStatus(Uint32 nodeId, NodeRecord::NodeStatus);
  Sysfile::ActiveStatus getNodeActiveStatus(Uint32 nodeId);
  void setNodeActiveStatus(Uint32 nodeId, Sysfile::ActiveStatus newStatus);
  void setNodeLcpActiveStatus(Uint32 nodeId, bool newState);
  bool getNodeLcpActiveStatus(Uint32 nodeId);
  bool getAllowNodeStart(Uint32 nodeId);
  void setAllowNodeStart(Uint32 nodeId, bool newState);
  bool getNodeCopyCompleted(Uint32 nodeId);
  void setNodeCopyCompleted(Uint32 nodeId, bool newState);
  Uint32 getNodeGroup(Uint32 nodeId) const;
  bool checkNodeAlive(Uint32 nodeId);

  // DIGETTABINFO handling: send table info, section <-> page copying.
  void getTabInfo(Signal*);
  void getTabInfo_send(Signal*, TabRecordPtr);
  void getTabInfo_sendComplete(Signal*, Uint32, Uint32);
  int getTabInfo_copyTableToSection(SegmentedSectionPtr & ptr, CopyTableNode);
  int getTabInfo_copySectionToPages(TabRecordPtr, SegmentedSectionPtr);

  // Initialisation
  void initData();
  void initRecords();

  // Variables to support record structures and their free lists
  // (each pool is a plain array; the cfirst* variable heads its free list).

  ConnectRecord *connectRecord;
  Uint32 cfirstconnect;
  Uint32 cconnectFileSize;

  CreateReplicaRecord *createReplicaRecord;
  Uint32 cnoOfCreateReplicas;

  FileRecord *fileRecord;
  Uint32 cfirstfreeFile;
  Uint32 cfileFileSize;

  Fragmentstore *fragmentstore;
  Uint32 cfirstfragstore;
  Uint32 cfragstoreFileSize;
  RSS_OP_SNAPSHOT(cremainingfrags);

  NodeGroupRecord *nodeGroupRecord;
  RSS_OP_SNAPSHOT(cnghash);

  Uint32 c_nextNodeGroup;
  Uint16 c_next_replica_node[MAX_NDB_NODE_GROUPS][NDBMT_MAX_WORKER_INSTANCES];

  /**
   * Temporary variables used by CREATE_FRAGMENTATION_REQ
   */
  Uint16
    tmp_next_replica_node[MAX_NDB_NODE_GROUPS][NDBMT_MAX_WORKER_INSTANCES];
  Uint8
    tmp_next_replica_node_set[MAX_NDB_NODE_GROUPS][NDBMT_MAX_WORKER_INSTANCES];
  Uint16 tmp_node_group_id[MAX_NDB_PARTITIONS];
  Uint16 tmp_fragments_per_ldm[MAX_NDB_NODES][NDBMT_MAX_WORKER_INSTANCES];
  Uint16 tmp_fragments_per_node[MAX_NDB_NODES];
  void init_next_replica_node(
    Uint16
      (*next_replica_node)[MAX_NDB_NODE_GROUPS][NDBMT_MAX_WORKER_INSTANCES],
    Uint32 noOfReplicas);

  NodeRecord *nodeRecord;

  PageRecord *pageRecord;
  Uint32 cfirstfreepage;
  Uint32 cpageFileSize;

  Uint32 cnoFreeReplicaRec;
  Uint32 creplicaFileSize;
  RSS_OP_SNAPSHOT(cnoFreeReplicaRec);

  TabRecord *tabRecord;
  Uint32 ctabFileSize;

  /**
   * Methods and variables used to control the node restart phase where a
   * node gets the data back from an alive node. This has two parts, one
   * part in the master node which controls that certain critical data is
   * only updated one at a time.
The other part is in the starting node
 * where there is one thread for each parallel fragment copy process.
 *
 * There is also a set of signals used for the take over processes.
 *
 * START_FRAGREQ
 * Before performing the actual copy phase the starting node needs
 * information about all fragments to start. This signal is sent from
 * the starting node's DBDIH to the starting node's DBLQH and to the
 * actual instance that will handle the fragment replica.
 *
 * START_RECREQ/CONF:
 * This is sent from the starting node to all LDM instances to tell them
 * that they have now received all START_FRAGREQ, no more will be sent. After
 * receiving this signal the LDM instances can start reading the fragments
 * from disk and applying the REDO log to get them as up to date as possible
 * before we start the copy phase. One could also rebuild the ordered
 * indexes here.
 *
 * START_TOREQ/CONF/REF:
 * This is sent from the starting node to allocate a take over record in the
 * master node. This is sent once at the start of the take over processing.
 *
 * UPDATE_TOREQ/CONF/REF:
 * This is sent from a starting node to inform the master of a step forward
 * in the copy process. In some of those phases it means acquiring the global
 * cluster mutex on updating fragment state, in some phases it means
 * releasing the same mutex. Also the global switch primary replica mutex
 * can be acquired and released in certain phases.
 *
 * This is sent once before UPDATE_FRAGSTATEREQ/CONF and once after for each
 * fragment replica that the starting node will take over.
 *
 * UPDATE_FRAGSTATEREQ/CONF/REF:
 * This signal is sent to all nodes from starting node informing them of a
 * new replica entering a certain fragment.
After the CONF has been received 1884 * we're sure that all transactions will involve this new node when updating 1885 * this fragment. We have a distribution key that can be used to verify if a 1886 * particular transaction have included the node in its transaction. 1887 * 1888 * This is sent once per fragment replica the starting node is taking over. 1889 * 1890 * PREPARE_COPY_FRAGREQ/CONF/REF: 1891 * This is sent from starting node to the LDM instance in starting node 1892 * asking for the maxPage value. Once per fragment replica to take over. 1893 * 1894 * COPY_FRAGREQ/CONF/REF: 1895 * This is sent to the copying node with the maxPage value. This will start 1896 * a scan in the copying node and copying over all records that have a newer 1897 * GCI than the one already restored from an LCP (the maxPage is also 1898 * somehow involved in this decision). 1899 * This signal relates to copying one fragment and is done after updating the 1900 * fragment state to ensure that all future transactions will involve the 1901 * node as well. There is another fragment state update performed after this 1902 * copy is completed. 1903 * 1904 * Sent once per fragment replica the starting node is taking over. 1905 * 1906 * COPY_ACTIVEREQ/CONF/REF: 1907 * This tells the starting node that the fragment replica is now copied over 1908 * and is in an active state. 1909 * 1910 * Sent per fragment replica the starting node is taking over. 1911 * 1912 * END_TOREQ/CONF/REF: 1913 * This is sent from the starting node to the master node. The response can 1914 * take a long time since it involves waiting for the proper LCP to complete 1915 * to ensure that the node is fully recoverable even on its own without other 1916 * nodes to assist it. For this to happen the node requires a complete 1917 * LCP to happen which started after we completed the copying of all 1918 * fragments and where the new node was part of the LCP. 
1919 * 1920 * This is sent only once at the end of the take over process. 1921 * Multiple nodes can be in the take over process at the same time. 1922 * 1923 * CONTINUEB: 1924 * This signal is used a lot to control execution in the local DIH block. 1925 * It is used to start parallel threads and to ensure that we don't 1926 * execute for too long without giving other threads a chance to execute 1927 * or other signals to the DIH block. 1928 * 1929 * Variable descriptions 1930 * --------------------- 1931 * 1932 * We have a pool of take over records used by the master for 1933 * handling parallel node recoveries. We also use the same pool 1934 * in starting nodes to keep one main take over record and then 1935 * one record for each parallel thread that we can copy from in 1936 * parallel. 1937 * 1938 * Then for each thread that takes over we keep one record. 1939 * These records are always in one list. 1940 * 1941 * All threads are scanning fragments to find a fragment replica that needs 1942 * take over. When they discover one they try to update the fragment replica 1943 * state on the master (start takeover), which requires that they 1944 * temporarily become the activeThread. If this succeeds they are placed in 1945 * the activeThread variable. If it isn't successful they are placed into the 1946 * c_queued_for_start_takeover_list. When the global fragment replica state 1947 * update is completed, the list is checked to see if a queued thread should 1948 * become the activeThread. Then COPY_FRAGREQ is sent and the thread is 1949 * placed on the c_active_copy_instance_list. When start take over phase is 1950 * completed one starts the next take over from the list and sends off 1951 * COPY_FRAGREQ whereafter it is placed in the c_active_copy_thread_list. 1952 * 1953 * When the copy phase is completed the take over record is removed 1954 * from the c_active_copy_thread_list and one tries to become 1955 * the active thread. 
If it isn't successful the take over record
 * is placed into the c_queued_for_end_takeover_list. When the
 * active thread is done it gets a new record from either the
 * c_queued_for_start_takeover_list or from
 * c_queued_for_commit_takeover_list. c_queued_for_commit_takeover_list has
 * higher priority. Finally when there are no more fragments to find
 * for a certain thread after ending the takeover of a fragment
 * the record is placed into the c_completed_copy_thread_list.
 * When all threads are placed into this list then all threads are
 * done with the copy phase.
 *
 * Finally we start up the phase where we activate the REDO log.
 * During this phase the records are placed into the
 * c_active_copy_thread_list. When a thread is completed with
 * this phase the take over record is released. When all threads
 * are completed we are done with this parallelisation phase and the
 * node copying phase is completed whereafter we can also release the
 * main take over record.
 *
 * c_takeOverPool:
 * This is the pool of records used by both master and starting
 * node.
 *
 * c_mainTakeOverPtr:
 * This is the main record used by the starting node.
 *
 * c_queued_for_start_takeover_list:
 * A takeover thread is ready to copy a fragment, but has to wait until
 * another thread is ready with its master communication before
 * proceeding.
 *
 * c_queued_for_commit_takeover_list:
 * A takeover thread is ready to complete the copy of a fragment, it has to
 * wait a while since there is another thread currently communicating with
 * the master node.
 *
 * These two are queues, so we implement them as a Single Linked List,
 * FIFO queue, this means a SLFifoList.
 *
 * c_max_takeover_copy_threads:
 * This is the limit on the number of threads to use.
Effectively the
 * parallelisation can never be higher than the number of LDM instances
 * that are used in the cluster.
 *
 * c_active_copy_threads_list:
 * Takeover threads are placed into this list while they are actively
 * copying a fragment at this point in time. We need to take things out
 * of this list in any order, so we need Double Linked List.
 *
 * c_activeTakeOverList:
 * While scanning fragments to find a fragment that our thread is
 * responsible for, we are placed into this list. This list handling
 * is on the starting node.
 *
 * This list is also used on the master node to keep track of node
 * take overs.
 *
 * c_completed_copy_threads_list:
 * This is a list where a thread is placed after completing the first
 * phase of scanning for fragments to copy. Some threads will be done
 * with this very quickly if we have more threads scanning than we have
 * LDM instances in the cluster. After completing the second phase where
 * we change state of ongoing transactions we release the thread.
 *
 * c_activeThreadTakeOverPtr:
 * This is the pointer to the currently active thread using the master
 * node to update the fragment state.
 */
#define ZTAKE_OVER_THREADS 16
#define ZMAX_TAKE_OVER_THREADS 64
  Uint32 c_max_takeover_copy_threads;

  // Pool and the lists/pointers described in the comment above.
  TakeOverRecord_pool c_takeOverPool;
  TakeOverRecord_list c_activeTakeOverList;
  TakeOverRecord_fifo c_queued_for_start_takeover_list;
  TakeOverRecord_fifo c_queued_for_commit_takeover_list;
  TakeOverRecord_list c_active_copy_threads_list;
  TakeOverRecord_list c_completed_copy_threads_list;
  TakeOverRecordPtr c_mainTakeOverPtr;
  TakeOverRecordPtr c_activeThreadTakeOverPtr;

  /* List used in takeover handling in master part. */
  TakeOverRecord_list c_masterActiveTakeOverList;


  //-----------------------------------------------------
  // TakeOver Record specific methods, starting node part
  //-----------------------------------------------------
  void startTakeOver(Signal *,
                     Uint32 startNode,
                     Uint32 toNode,
                     const struct StartCopyReq*);

  void startNextCopyFragment(Signal *, Uint32 takeOverPtr);
  void toCopyFragLab(Signal *, Uint32 takeOverPtr);
  void toStartCopyFrag(Signal *, TakeOverRecordPtr);
  void toCopyCompletedLab(Signal *, TakeOverRecordPtr regTakeOverptr);

  // Node-restart (nr) helpers: start fragments, run REDO, enable logging.
  void nr_start_fragments(Signal*, TakeOverRecordPtr);
  void nr_start_fragment(Signal*, TakeOverRecordPtr, ReplicaRecordPtr);
  void nr_run_redo(Signal*, TakeOverRecordPtr);
  void nr_start_logging(Signal*, TakeOverRecordPtr);

  // Parallel takeover-thread management (see list descriptions above).
  bool check_takeover_thread(TakeOverRecordPtr takeOverPtr,
                             FragmentstorePtr fragPtr,
                             Uint32 fragmentReplicaInstanceKey);
  void send_continueb_start_next_copy(Signal *signal,
                                      TakeOverRecordPtr takeOverPtr);
  void init_takeover_thread(TakeOverRecordPtr takeOverPtr,
                            TakeOverRecordPtr mainTakeOverPtr,
                            Uint32 number_of_threads,
                            Uint32 thread_id);
  void start_next_takeover_thread(Signal *signal);
  void start_thread_takeover_logging(Signal *signal);
  void send_continueb_nr_start_logging(Signal *signal,
                                       TakeOverRecordPtr takeOverPtr);
  bool thread_takeover_completed(Signal *signal,
                                 TakeOverRecordPtr takeOverPtr);
  bool thread_takeover_copy_completed(Signal *signal,
                                      TakeOverRecordPtr takeOverPtr);
  void release_take_over_threads(void);
  void check_take_over_completed_correctly(void);

  void sendStartTo(Signal* signal, TakeOverRecordPtr);
  void sendUpdateTo(Signal* signal, TakeOverRecordPtr);
  void sendUpdateFragStateReq(Signal *,
                              Uint32 startGci,
                              Uint32 storedType,
                              TakeOverRecordPtr takeOverPtr);

  void releaseTakeOver(TakeOverRecordPtr takeOverPtr,
                       bool from_master,
                       bool skip_check = false);

  //-------------------------------------------------
  // Methods for take over functionality, master part
  //-------------------------------------------------
  void switchPrimaryMutex_locked(Signal* signal, Uint32, Uint32);
  void switchPrimaryMutex_unlocked(Signal* signal, Uint32, Uint32);
  void check_force_lcp(Ptr<TakeOverRecord> takeOverPtr);
  void abortTakeOver(Signal*, TakeOverRecordPtr);
  void updateToReq_fragmentMutex_locked(Signal*, Uint32, Uint32);
  bool findTakeOver(Ptr<TakeOverRecord> & ptr, Uint32 failedNodeId);
  void insertBackup(FragmentstorePtr regFragptr, Uint32 nodeId);

  /*
    2.4  C O M M O N    S T O R E D    V A R I A B L E S
    ----------------------------------------------------
  */
  bool c_performed_copy_phase;

  /**
   * Per-TC-thread DIVERIFY queue. The trailing pad keeps each queue on its
   * own cache line(s) (NDB_CL_PADSZ of the preceding members' size).
   */
  struct DIVERIFY_queue
  {
    DIVERIFY_queue() {
      m_ref = 0;
      cfirstVerifyQueue = clastVerifyQueue = 0;
      m_empty_done = 1;
    }
    Uint32 cfirstVerifyQueue;
    Uint32 clastVerifyQueue;
    Uint32 m_empty_done;
    Uint32 m_ref;
    char pad[NDB_CL_PADSZ(sizeof(void*) + 4 * sizeof(Uint32))];
  };

  bool isEmpty(const DIVERIFY_queue&);
  void enqueue(DIVERIFY_queue&);
  void dequeue(DIVERIFY_queue&);
  void emptyverificbuffer(Signal *, Uint32 q, bool aContintueB);
  void emptyverificbuffer_check(Signal*, Uint32, Uint32);

  DIVERIFY_queue c_diverify_queue[MAX_NDBMT_TC_THREADS];
  Uint32 c_diverify_queue_cnt;

  /*------------------------------------------------------------------------*/
  /* THIS VARIABLE KEEPS THE REFERENCES TO FILE RECORDS THAT DESCRIBE       */
  /* THE TWO FILES THAT ARE USED TO STORE THE VARIABLE CRESTART_INFO        */
  /* ON DISK.
*/
  /*------------------------------------------------------------------------*/
  Uint32 crestartInfoFile[2];

  bool cgckptflag;    /* A FLAG WHICH IS SET WHILE A NEW GLOBAL CHECK
                         POINT IS BEING CREATED. NO VERIFICATION IS ALLOWED
                         IF THE FLAG IS SET*/
  Uint32 cgcpOrderBlocked;

  /**
   * This structure describes
   * the GCP Save protocol
   */
  struct GcpSave
  {
    Uint32 m_gci;
    Uint32 m_master_ref;
    enum State {
      GCP_SAVE_IDLE     = 0, // Idle
      GCP_SAVE_REQ      = 1, // REQ received
      GCP_SAVE_CONF     = 2, // REF/CONF sent
      GCP_SAVE_COPY_GCI = 3
    } m_state;

    // Extra state kept on the node acting as GCP-save master.
    struct {
      State m_state;
      Uint32 m_new_gci;
      Uint32 m_time_between_gcp; /* Delay between global checkpoints */
      NDB_TICKS m_start_time;
    } m_master;
  } m_gcp_save;

  /**
   * This structure describes the MicroGCP protocol
   */
  struct MicroGcp
  {
    MicroGcp() { }
    bool m_enabled;
    Uint32 m_master_ref;

    /**
     * rw-lock that protects multiple parallel DIVERIFY (readers) from
     * updates to gcp-state (e.g GCP_PREPARE, GCP_COMMIT)
     */
    NdbSeqLock m_lock;
    Uint64 m_old_gci;
    // To avoid double send of SUB_GCP_COMPLETE_REP to SUMA via DBLQH.
    Uint64 m_last_sent_gci;
    Uint64 m_current_gci; // Currently active
    Uint64 m_new_gci;     // Currently being prepared...
    enum State {
      M_GCP_IDLE      = 0,
      M_GCP_PREPARE   = 1,
      M_GCP_COMMIT    = 2,
      M_GCP_COMMITTED = 3,
      M_GCP_COMPLETE  = 4
    } m_state;

    // Extra state kept on the node acting as micro-GCP master.
    struct {
      State m_state;
      Uint32 m_time_between_gcp;
      Uint64 m_new_gci;
      NDB_TICKS m_start_time;
    } m_master;
  } m_micro_gcp;

  /**
   * Lag monitoring for both GCP protocols; crashes the system when the
   * configured max lag is exceeded (see m_max_lag_ms fields).
   */
  struct GcpMonitor
  {
    struct
    {
      Uint32 m_gci;
      Uint32 m_elapsed_ms; //MilliSec since last GCP_SAVEed
      Uint32 m_max_lag_ms; //Max allowed lag(ms) before 'crashSystem'
      bool m_need_max_lag_recalc; // Whether max lag need to be recalculated
#ifdef ERROR_INSERT
      bool test_set_max_lag; // Testing
#endif
    } m_gcp_save;

    struct
    {
      Uint64 m_gci;
      Uint32 m_elapsed_ms; //MilliSec since last GCP_COMMITed
      Uint32 m_max_lag_ms; //Max allowed lag(ms) before 'crashSystem'
      bool m_need_max_lag_recalc; // Whether max lag need to be recalculated
#ifdef ERROR_INSERT
      bool test_set_max_lag; // Testing
#endif
    } m_micro_gcp;

    NDB_TICKS m_last_check; //Time GCP monitor last checked

#ifdef ERROR_INSERT
    Uint32 m_savedMaxCommitLag; // Testing
#endif
  } m_gcp_monitor;

  /*------------------------------------------------------------------------*/
  /* THIS VARIABLE KEEPS TRACK OF THE STATE OF THIS NODE AS MASTER.
*/
  /*------------------------------------------------------------------------*/
  enum MasterState {
    MASTER_IDLE = 0,
    MASTER_ACTIVE = 1,
    MASTER_TAKE_OVER_GCP = 2
  };
  MasterState cmasterState;
  Uint16 cmasterTakeOverNode;
  /* NODE IS NOT MASTER            */
  /* NODE IS ACTIVE AS MASTER      */
  /* NODE IS TAKING OVER AS MASTER */

  struct CopyGCIMaster {
    CopyGCIMaster(){
      m_copyReason = CopyGCIReq::IDLE;
      for (Uint32 i = 0; i<WAIT_CNT; i++)
        m_waiting[i] = CopyGCIReq::IDLE;
    }
    /*----------------------------------------------------------------------*/
    /* THIS STATE VARIABLE IS USED TO INDICATE IF COPYING OF RESTART        */
    /* INFO WAS STARTED BY A LOCAL CHECKPOINT OR AS PART OF A SYSTEM        */
    /* RESTART.                                                             */
    /*----------------------------------------------------------------------*/
    CopyGCIReq::CopyReason m_copyReason;

    /*----------------------------------------------------------------------*/
    /* COPYING RESTART INFO CAN BE STARTED BY LOCAL CHECKPOINTS AND BY      */
    /* GLOBAL CHECKPOINTS. WE CAN HOWEVER ONLY HANDLE TWO SUCH COPY AT      */
    /* THE TIME. THUS WE HAVE TO KEEP WAIT INFORMATION IN THIS VARIABLE.    */
    /*----------------------------------------------------------------------*/
    STATIC_CONST( WAIT_CNT = 2 );
    CopyGCIReq::CopyReason m_waiting[WAIT_CNT];
  } c_copyGCIMaster;

  struct CopyGCISlave {
    CopyGCISlave(){ m_copyReason = CopyGCIReq::IDLE; m_expectedNextWord = 0;}
    /*----------------------------------------------------------------------*/
    /* THIS STATE VARIABLE IS USED TO INDICATE IF COPYING OF RESTART        */
    /* INFO WAS STARTED BY A LOCAL CHECKPOINT OR AS PART OF A SYSTEM        */
    /* RESTART. THIS VARIABLE IS USED BY THE NODE THAT RECEIVES             */
    /* COPY_GCI_REQ.                                                        */
    /*----------------------------------------------------------------------*/
    Uint32 m_senderData;
    BlockReference m_senderRef;
    CopyGCIReq::CopyReason m_copyReason;

    Uint32 m_expectedNextWord;
  } c_copyGCISlave;

  /*------------------------------------------------------------------------*/
  /* THIS VARIABLE IS USED TO KEEP TRACK OF THE STATE OF LOCAL              */
  /* CHECKPOINTS.                                                           */
  /*------------------------------------------------------------------------*/
public:
  enum LcpStatus {
    LCP_STATUS_IDLE        = 0,
    LCP_TCGET              = 1, // Only master
    LCP_STATUS_ACTIVE      = 2,
    LCP_WAIT_MUTEX         = 3, // Only master
    LCP_CALCULATE_KEEP_GCI = 4, // Only master
    LCP_COPY_GCI           = 5,
    LCP_INIT_TABLES        = 6,
    LCP_TC_CLOPSIZE        = 7, // Only master
    LCP_START_LCP_ROUND    = 8,
    LCP_TAB_COMPLETED      = 9,
    LCP_TAB_SAVED          = 10
  };
private:

  struct LcpState {
    LcpState() {}
    LcpStatus lcpStatus;
    Uint32 lcpStatusUpdatedPlace;

    // Ring of the 10 most recent (status, place) pairs, newest first,
    // kept for debugging of LCP state transitions.
    struct Save {
      LcpStatus m_status;
      Uint32 m_place;
    } m_saveState[10];

    // Record the previous status/place in m_saveState, then set the new
    // status and the source line ('line') that made the change.
    void setLcpStatus(LcpStatus status, Uint32 line){
      for (Uint32 i = 9; i > 0; i--)
        m_saveState[i] = m_saveState[i-1];
      m_saveState[0].m_status = lcpStatus;
      m_saveState[0].m_place = lcpStatusUpdatedPlace;

      lcpStatus = status;
      lcpStatusUpdatedPlace = line;
    }

    /**
     * State of stalling LCPs for node restarts
     */
    Uint32 lcpStallStart; /* Has started stalling lcp start */
    NDB_TICKS lastLogTime; /* Last time we logged state of stall */
    NDB_TICKS m_start_lcp_check_time; /* Time of stalling started */
    Uint32 stall_node_waiting_for; /* The node we've logged about waiting for */

    Uint32 lcpStart;
    Uint32 lcpStopGcp;
    Uint32 keepGci;      /* USED TO CALCULATE THE GCI TO KEEP AFTER A LCP */
    Uint32 oldestRestorableGci;

    bool lcpManualStallStart; /* User requested stall of start (testing only) */

    NDB_TICKS m_start_time; // When last LCP was started
    Uint64 m_lcp_time;      // How long last LCP took
    Uint32 m_lcp_trylock_timeout;

    struct CurrentFragment {
      Uint32 tableId;
      Uint32 fragmentId;
    } currentFragment;

    Uint32 noOfLcpFragRepOutstanding;

    /*----------------------------------------------------------------------*/
    /* USED TO ENSURE THAT LCP'S ARE EXECUTED WITH CERTAIN TIMEINTERVALS    */
    /* EVEN WHEN SYSTEM IS NOT DOING ANYTHING.                              */
    /*----------------------------------------------------------------------*/
    Uint32 ctimer;
    Uint32 ctcCounter;
    Uint32 clcpDelay; /* MAX. 2^(CLCP_DELAY - 2) SEC BETWEEN LCP'S */

    /*----------------------------------------------------------------------*/
    /* THIS STATE IS USED TO TELL IF THE FIRST LCP AFTER START/RESTART      */
    /* HAS BEEN RUN. AFTER A NODE RESTART THE NODE DOES NOT ENTER           */
    /* STARTED STATE BEFORE THIS IS DONE.                                   */
    /*----------------------------------------------------------------------*/
    bool immediateLcpStart;
    bool m_LCP_COMPLETE_REP_From_Master_Received;
    SignalCounter m_LCP_COMPLETE_REP_Counter_DIH;
    SignalCounter m_LCP_COMPLETE_REP_Counter_LQH;
    SignalCounter m_LAST_LCP_FRAG_ORD;
    NdbNodeBitmask m_participatingLQH;
    NdbNodeBitmask m_participatingDIH;
    NdbNodeBitmask m_allReplicasQueuedLQH;

    Uint32 m_masterLcpDihRef;
    bool   m_MASTER_LCPREQ_Received;
    Uint32 m_MASTER_LCPREQ_FailedNodeId;

    Uint32 m_lastLCP_COMPLETE_REP_id;
    Uint32 m_lastLCP_COMPLETE_REP_ref;

    // Whether the 'lcp' is already completed under the
    // coordination of the failed master
    bool already_completed_lcp(Uint32 lcp, Uint32 current_master) const
    {
      const Uint32 last_completed_master_node =
        refToNode(m_lastLCP_COMPLETE_REP_ref);
      if (m_lastLCP_COMPLETE_REP_id == lcp &&
          last_completed_master_node != current_master &&
          last_completed_master_node == m_MASTER_LCPREQ_FailedNodeId)
      {
        return true;
      }
      return false;
    }

  } c_lcpState;

  /*------------------------------------------------------------------------*/
  /* THIS VARIABLE KEEPS TRACK OF HOW MANY TABLES ARE ACTIVATED WHEN        */
  /* STARTING A LOCAL CHECKPOINT WE SHOULD AVOID STARTING A CHECKPOINT      */
  /* WHEN NO TABLES ARE ACTIVATED.                                          */
  /*------------------------------------------------------------------------*/
  Uint32 cnoOfActiveTables;

  BlockReference cdictblockref; /* DICTIONARY BLOCK REFERENCE */
  Uint32 cfailurenr;   /* EVERY TIME WHEN A NODE FAILURE IS REPORTED
                          THIS NUMBER IS INCREMENTED.
AT THE START OF
                          THE SYSTEM THIS NUMBER MUST BE INITIATED TO
                          ZERO */
  Uint32 cMinTcFailNo; /* Minimum TC handled failNo allowed to close GCP */

  // Cached block references and basic cluster/start parameters.
  BlockReference clocallqhblockref;
  BlockReference clocaltcblockref;
  BlockReference cmasterdihref;
  Uint16 cownNodeId;
  BlockReference cndbStartReqBlockref;
  BlockReference cntrlblockref;
  Uint32 con_lineNodes;
  Uint32 creceivedfrag;
  Uint32 cremainingfrags;
  Uint32 cstarttype;
  Uint32 csystemnodes;
  Uint32 c_newest_restorable_gci;
  Uint32 c_set_initial_start_flag;
  NDB_TICKS c_current_time; // Updated approx. every 10ms

  /* Limit the number of concurrent table definition writes during LCP
   * This avoids exhausting the DIH page pool
   */
  CountingSemaphore c_lcpTabDefWritesControl;

public:
  enum LcpMasterTakeOverState {
    LMTOS_IDLE = 0,
    LMTOS_WAIT_LCP_FRAG_REP = 2,// Currently waiting for outst. LCP_FRAG_REP
    LMTOS_INITIAL = 3,
    LMTOS_ALL_IDLE = 4,
    LMTOS_ALL_ACTIVE = 5,
    LMTOS_LCP_CONCLUDING = 6,
    LMTOS_COPY_ONGOING = 7
  };
private:
  class MasterTakeOverState {
  public:
    MasterTakeOverState() {}
    // Set new state and remember the source line that changed it.
    void set(LcpMasterTakeOverState s, Uint32 line) {
      state = s; updatePlace = line;
    }

    LcpMasterTakeOverState state;
    Uint32 updatePlace;

    Uint32 minTableId;
    Uint32 minFragId;
    Uint32 failedNodeId;
  } c_lcpMasterTakeOverState;

  Uint16 cmasterNodeId;

  struct NodeStartMasterRecord {
    NodeStartMasterRecord() {}
    Uint32 startNode;
    Uint32 wait;
    Uint32 failNr;
    bool activeState;
    Uint32 blockGcp; // 0, 1=ordered, 2=effective
    Uint32 startInfoErrorCode;
    Uint32 m_outstandingGsn;
    MutexHandle2<DIH_FRAGMENT_INFO> m_fragmentInfoMutex;
  };
  NodeStartMasterRecord c_nodeStartMaster;

  struct NodeStartSlaveRecord {
    NodeStartSlaveRecord() { nodeId = 0;}

    Uint32 nodeId;
  };
  NodeStartSlaveRecord c_nodeStartSlave;

  Uint32 cfirstAliveNode;
  Uint32 cfirstDeadNode;
  Uint32 cstartPhase;
  Uint32 cnoReplicas;

  bool cwaitLcpSr;

  /**
   * After a node failure we want to increase the disk checkpoint speed until
   * we have completed the current ongoing node failure. We also increase the
   * checkpoint speed when we know that a node restart is ongoing.
   */
  bool c_increase_lcp_speed_after_nf;
  /**
   * Available nodegroups (ids) (length == cnoOfNodeGroups)
   *   use to support nodegroups 2,4,6 (not just consequtive nodegroup ids)
   */
  Uint32 c_node_groups[MAX_NDB_NODE_GROUPS];
  Uint32 cnoOfNodeGroups;
  Uint32 crestartGci; /* VALUE OF GCI WHEN SYSTEM RESTARTED OR STARTED */

  /**
   * Counter variables keeping track of the number of outstanding signals
   * for particular signals in various protocols.
   */
  SignalCounter c_COPY_GCIREQ_Counter;
  SignalCounter c_COPY_TABREQ_Counter;
  SignalCounter c_UPDATE_FRAG_STATEREQ_Counter;
  SignalCounter c_DIH_SWITCH_REPLICA_REQ_Counter;
  SignalCounter c_GCP_COMMIT_Counter;
  SignalCounter c_GCP_PREPARE_Counter;
  SignalCounter c_GCP_SAVEREQ_Counter;
  SignalCounter c_SUB_GCP_COMPLETE_REP_Counter;
  SignalCounter c_INCL_NODEREQ_Counter;
  SignalCounter c_MASTER_GCPREQ_Counter;
  SignalCounter c_MASTER_LCPREQ_Counter;
  SignalCounter c_START_INFOREQ_Counter;
  SignalCounter c_START_RECREQ_Counter;
  SignalCounter c_STOP_ME_REQ_Counter;
  SignalCounter c_TC_CLOPSIZEREQ_Counter;
  SignalCounter c_TCGETOPSIZEREQ_Counter;
  SignalCounter c_START_LCP_REQ_Counter;

  bool   c_blockCommit;
  Uint32 c_blockCommitNo;

  // Commit is blocked either explicitly (c_blockCommit) or while a new
  // global checkpoint is being created (cgckptflag).
  bool getBlockCommit() const {
    return c_blockCommit || cgckptflag;
  }

  /**
   * SwitchReplicaRecord - Should only be used by master
   */
  struct SwitchReplicaRecord {
    SwitchReplicaRecord() {}
    void clear(){}

    Uint32 nodeId;
    Uint32 tableId;
    Uint32 fragNo;
  };
  SwitchReplicaRecord c_switchReplicas;

  struct StopPermProxyRecord {
    StopPermProxyRecord() { clientRef = 0; }

    Uint32 clientData;
    BlockReference clientRef;
    BlockReference masterRef;
  };

  struct StopPermMasterRecord {
    StopPermMasterRecord() { clientRef = 0;}

    Uint32 returnValue;

    Uint32 clientData;
    BlockReference clientRef;
  };

  StopPermProxyRecord c_stopPermProxy;
  StopPermMasterRecord c_stopPermMaster;

  void checkStopPermProxy(Signal*, NodeId failedNodeId);
  void checkStopPermMaster(Signal*, NodeRecordPtr failedNodePtr);

  void
switchReplica(Signal*,
                Uint32 nodeId,
                Uint32 tableId,
                Uint32 fragNo);

  void switchReplicaReply(Signal*, NodeId nodeId);

  /**
   * Wait GCP (proxy)
   */
  struct WaitGCPProxyRecord {
    WaitGCPProxyRecord() { clientRef = 0;}

    Uint32 clientData;
    BlockReference clientRef;
    BlockReference masterRef;

    union { Uint32 nextPool; Uint32 nextList; };
    Uint32 prevList;
  };
  typedef Ptr<WaitGCPProxyRecord> WaitGCPProxyPtr;
  typedef ArrayPool<WaitGCPProxyRecord> WaitGCPProxyRecord_pool;
  typedef DLList<WaitGCPProxyRecord_pool> WaitGCPProxyRecord_list;
  /**
   * Wait GCP (master)
   */
  struct WaitGCPMasterRecord {
    WaitGCPMasterRecord() { clientRef = 0;}
    Uint32 clientData;
    BlockReference clientRef;
    /**
     * GCI which must be completed before CONF sent
     * For WaitEpoch, it is not used, the next
     * completing epoch sends a CONF.
     */
    Uint32 waitGCI;

    /**
     * Special value indicating a request for shutdown sync
     */
    static const Uint32 ShutdownSyncGci = 0xffffffff;

    union { Uint32 nextPool; Uint32 nextList; };
    Uint32 prevList;
  };
  typedef Ptr<WaitGCPMasterRecord> WaitGCPMasterPtr;
  typedef ArrayPool<WaitGCPMasterRecord> WaitGCPMasterRecord_pool;

  /**
   * Pool/list of WaitGCPProxyRecord record
   */
  WaitGCPProxyRecord_pool waitGCPProxyPool;
  WaitGCPProxyRecord_list c_waitGCPProxyList;

  /**
   * Pool/list of WaitGCPMasterRecord record
   */
  WaitGCPMasterRecord_pool waitGCPMasterPool;
  typedef DLList<WaitGCPMasterRecord_pool> WaitGCPList;
  WaitGCPList c_waitGCPMasterList;
  WaitGCPList c_waitEpochMasterList;

  void checkWaitGCPProxy(Signal*, NodeId failedNodeId);
  void checkWaitGCPMaster(Signal*, NodeId failedNodeId);
  void checkShutdownSync();
  void emptyWaitGCPMasterQueue(Signal*, Uint64, WaitGCPList&);

  void getNodeBitmap(NdbNodeBitmask& map,
                     Uint32 listHead,
                     int (*versionFunction) (Uint32));

  /**
   * Stop me
   */
  struct StopMeRecord {
    StopMeRecord() { clientRef = 0;}

    BlockReference clientRef;
    Uint32 clientData;
  };
  StopMeRecord c_stopMe;

  void checkStopMe(Signal *, NodeRecordPtr failedNodePtr);

#define DIH_CDATA_SIZE _SYSFILE_FILE_SIZE
  /**
   * This variable must be atleast the size of Sysfile::SYSFILE_SIZE32_v2
   */
  Uint32 cdata_size_in_words;
  Uint32 cdata[DIH_CDATA_SIZE];       /* TEMPORARY ARRAY VARIABLE */

  /**
   * Sys file data
   */
  Uint32 sysfileData[DIH_CDATA_SIZE];
  Uint32 sysfileDataToFile[DIH_CDATA_SIZE];

  /**
   * When a node comes up without filesystem
   *   we have to clear all LCP for that node
   */
  void handle_send_continueb_invalidate_node_lcp(Signal *signal);
  void invalidateNodeLCP(Signal *, Uint32 nodeId, Uint32 tableId);
  void invalidateNodeLCP(Signal *, Uint32 nodeId, TabRecordPtr);

  /**
   * Reply from nodeId
   */
  void startInfoReply(Signal *, Uint32 nodeId);

  void dump_replica_info();
  void dump_replica_info(const Fragmentstore*);

  // DIH specifics for execNODE_START_REP (sendDictUnlockOrd)
  void execNODE_START_REP(Signal* signal);

  /*
   * Lock master DICT. Only current use is by starting node
   * during NR. A pool of slave records is convenient anyway.
   */
  struct DictLockSlaveRecord {
    Uint32 lockPtr;
    Uint32 lockType;
    bool locked;
    Callback callback;
    Uint32 nextPool;
  };

  typedef Ptr<DictLockSlaveRecord> DictLockSlavePtr;
  typedef ArrayPool<DictLockSlaveRecord> DictLockSlaveRecord_pool;
  DictLockSlaveRecord_pool c_dictLockSlavePool;

  // slave
  void sendDictLockReq(Signal* signal, Uint32 lockType, Callback c);
  void recvDictLockConf(Signal* signal);
  void sendDictUnlockOrd(Signal* signal, Uint32 lockSlavePtrI);

  // NR
  Uint32 c_dictLockSlavePtrI_nodeRestart; // userPtr for NR
  void recvDictLockConf_nodeRestart(Signal* signal, Uint32 data, Uint32 ret);

  Uint32 c_error_7181_ref;

#ifdef ERROR_INSERT
  void sendToRandomNodes(const char*, Signal*, SignalCounter*,
                         SendFunction,
                         Uint32 extra = RNIL,
                         Uint32 block = 0, Uint32 gsn = 0, Uint32 len = 0,
                         JobBufferLevel = JBB);
#endif

  bool check_enable_micro_gcp(Signal* signal, bool broadcast);

  bool c_sr_wait_to;
  NdbNodeBitmask m_sr_nodes;
  NdbNodeBitmask m_to_nodes;

  void startme_copygci_conf(Signal*);

  /**
   * Local LCP state
   *   This struct is more or less a copy of lcp-state
   *   Reason for duplicating it is that
   *   - not to mess with current code
   *   - this one is "distributed", i.e maintained by *all* nodes,
   *     not
like c_lcpState which mixed master/slave state in a "unnatural" 2728 * way 2729 */ 2730 struct LocalLCPState 2731 { 2732 enum State { 2733 LS_INITIAL = 0, 2734 LS_RUNNING = 1, 2735 LS_COMPLETE = 2, 2736 LS_RUNNING_MTO_TAB_SAVED = 3 2737 } m_state; 2738 2739 StartLcpReq m_start_lcp_req; 2740 Uint32 m_keep_gci; // Min GCI is needed to restore LCP 2741 Uint32 m_stop_gci; // This GCI needs to be complete before LCP is restorable 2742 LocalLCPStateDbdih::LocalLCPState2743 LocalLCPState() { reset();} 2744 2745 void reset(); 2746 void init(const StartLcpReq*); 2747 void init_master_take_over_idle_to_tab_saved(); 2748 void lcp_frag_rep(const LcpFragRep*); 2749 void lcp_complete_rep(Uint32 gci); 2750 2751 /** 2752 * @param gci - current GCI being made restorable (COPY_GCI) 2753 */ 2754 bool check_cut_log_tail(Uint32 gci) const; 2755 } m_local_lcp_state; 2756 2757 // MT LQH 2758 Uint32 c_fragments_per_node_; 2759 Uint32 getFragmentsPerNode(); 2760 Uint32 getFragmentCount(Uint32 partitionBalance, 2761 Uint32 numOfNodeGroups, 2762 Uint32 numOfReplicas, 2763 Uint32 numOfLDMs) const; 2764 /** 2765 * dihGetInstanceKey 2766 * 2767 * This method maps a fragment to a block instance key 2768 * This is the LDM instance which manages the fragment 2769 * on this node. 2770 * The range of an instance key is 1 to 2771 * NDBMT_MAX_WORKER_INSTANCES inclusive. 2772 * 0 is the proxy block instance. 
2773 */ dihGetInstanceKey(FragmentstorePtr tFragPtr)2774 Uint32 dihGetInstanceKey(FragmentstorePtr tFragPtr) { 2775 ndbrequire(!tFragPtr.isNull()); 2776 Uint32 log_part_id = tFragPtr.p->m_log_part_id; 2777 ndbrequire(log_part_id < NDBMT_MAX_WORKER_INSTANCES); 2778 return 1 + log_part_id; 2779 } 2780 Uint32 dihGetInstanceKey(Uint32 tabId, Uint32 fragId); 2781 Uint32 dihGetInstanceKeyCanFail(Uint32 tabId, Uint32 fragId); 2782 2783 void log_setNoSend(); 2784 /** 2785 * Get minimum version of nodes in alive-list 2786 */ 2787 Uint32 getMinVersion() const; 2788 2789 bool c_2pass_inr; 2790 2791 /* Max LCP parallelism is node (version) specific */ 2792 Uint8 getMaxStartedFragCheckpointsForNode(Uint32 nodeId) const; 2793 2794 void isolateNodes(Signal* signal, 2795 Uint32 delayMillis, 2796 const NdbNodeBitmask& victims); 2797 2798 NodeId c_handled_master_take_over_copy_gci; 2799 2800 bool handle_master_take_over_copy_gci(Signal *signal, 2801 NodeId newMasterNodeId); 2802 2803 RedoStateRep::RedoAlertState m_node_redo_alert_state[MAX_NDB_NODES]; 2804 RedoStateRep::RedoAlertState m_global_redo_alert_state; 2805 RedoStateRep::RedoAlertState get_global_redo_alert_state(); 2806 void sendREDO_STATE_REP_to_all(Signal*, Uint32 block, bool send_to_all); 2807 bool m_master_lcp_req_lcp_already_completed; 2808 2809 void complete_restart_nr(Signal*); 2810 2811 /* The highest data node id in the cluster. */ 2812 Uint32 m_max_node_id; 2813 bool m_set_up_multi_trp_in_node_restart; 2814 public: is_master()2815 bool is_master() { return isMaster(); } 2816 2817 NdbNodeBitmask c_shutdownReqNodes; 2818 }; 2819 2820 #if (DIH_CDATA_SIZE < _SYSFILE_SIZE32_v2) 2821 #error "cdata is to small compared to Sysfile size" 2822 #endif 2823 2824 2825 #undef JAM_FILE_ID 2826 2827 #endif 2828 2829