1 /*
2    Copyright (c) 2003, 2021, Oracle and/or its affiliates.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License, version 2.0,
6    as published by the Free Software Foundation.
7 
8    This program is also distributed with certain software (including
9    but not limited to OpenSSL) that is licensed under separate terms,
10    as designated in a particular file or component or in included license
11    documentation.  The authors of MySQL hereby grant you an additional
12    permission to link the program and your derivative works with the
13    separately licensed software that they have included with MySQL.
14 
15    This program is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License, version 2.0, for more details.
19 
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
23 */
24 
25 #define DBDIH_C
26 #include <ndb_global.h>
27 #include <ndb_limits.h>
28 #include <ndb_version.h>
29 #include <NdbOut.hpp>
30 
31 #include "Dbdih.hpp"
32 #include "Configuration.hpp"
33 
34 #include <signaldata/CopyTab.hpp>
35 #include <signaldata/DbinfoScan.hpp>
36 #include <signaldata/AllocNodeId.hpp>
37 #include <signaldata/NodeRecoveryStatusRep.hpp>
38 #include <signaldata/BlockCommitOrd.hpp>
39 #include <signaldata/CheckNodeGroups.hpp>
40 #include <signaldata/CopyActive.hpp>
41 #include <signaldata/CopyFrag.hpp>
42 #include <signaldata/CopyGCIReq.hpp>
43 #include <signaldata/DiAddTab.hpp>
44 #include <signaldata/DictStart.hpp>
45 #include <signaldata/DiGetNodes.hpp>
46 #include <signaldata/DihContinueB.hpp>
47 #include <signaldata/DihSwitchReplica.hpp>
48 #include <signaldata/DumpStateOrd.hpp>
49 #include <signaldata/EmptyLcp.hpp>
50 #include <signaldata/EventReport.hpp>
51 #include <signaldata/GCP.hpp>
52 #include <signaldata/HotSpareRep.hpp>
53 #include <signaldata/MasterGCP.hpp>
54 #include <signaldata/MasterLCP.hpp>
55 #include <signaldata/NFCompleteRep.hpp>
56 #include <signaldata/NodeFailRep.hpp>
57 #include <signaldata/ReadNodesConf.hpp>
58 #include <signaldata/StartFragReq.hpp>
59 #include <signaldata/StartInfo.hpp>
60 #include <signaldata/StartMe.hpp>
61 #include <signaldata/StartPerm.hpp>
62 #include <signaldata/StartRec.hpp>
63 #include <signaldata/StopPerm.hpp>
64 #include <signaldata/StopMe.hpp>
65 #include <signaldata/TestOrd.hpp>
66 #include <signaldata/WaitGCP.hpp>
67 #include <signaldata/DihStartTab.hpp>
68 #include <signaldata/LCP.hpp>
69 #include <signaldata/SystemError.hpp>
70 
71 #include <signaldata/TakeOver.hpp>
72 
73 #include <signaldata/DropTab.hpp>
74 #include <signaldata/AlterTab.hpp>
75 #include <signaldata/AlterTable.hpp>
76 #include <signaldata/PrepDropTab.hpp>
77 #include <signaldata/SumaImpl.hpp>
78 #include <signaldata/DictTabInfo.hpp>
79 #include <signaldata/CreateFragmentation.hpp>
80 #include <signaldata/LqhFrag.hpp>
81 #include <signaldata/FsOpenReq.hpp>
82 #include <signaldata/DihScanTab.hpp>
83 #include <signaldata/DictLock.hpp>
84 #include <DebuggerNames.hpp>
85 #include <signaldata/Upgrade.hpp>
86 #include <NdbEnv.h>
87 #include <signaldata/CreateNodegroup.hpp>
88 #include <signaldata/CreateNodegroupImpl.hpp>
89 #include <signaldata/DropNodegroup.hpp>
90 #include <signaldata/DropNodegroupImpl.hpp>
91 #include <signaldata/DihGetTabInfo.hpp>
92 #include <SectionReader.hpp>
93 #include <signaldata/DihRestart.hpp>
94 #include <signaldata/IsolateOrd.hpp>
95 
96 #include <EventLogger.hpp>
97 
98 #define JAM_FILE_ID 354
99 
/* Poll interval in milliseconds used when waiting for a table state change. */
static const Uint32 WaitTableStateChangeMillis = 10;

extern EventLogger * g_eventLogger;

/* View the raw sysfile word array through the structured Sysfile record. */
#define SYSFILE ((Sysfile *)&sysfileData[0])
/* Initialisation value (0) used for create-GCI fields. */
#define ZINIT_CREATE_GCI Uint32(0)
/* Initialisation value (all ones) used for replica last-GCI fields. */
#define ZINIT_REPLICA_LAST_GCI Uint32(-1)

/* Return from the calling routine if checkNodeAlive(node) is false. */
#define RETURN_IF_NODE_NOT_ALIVE(node) \
  if (!checkNodeAlive((node))) { \
    jam(); \
    return; \
  } \

/* Record that receiveNodeId has replied to the broadcast sigName;
 * return from the caller unless all awaited nodes have now replied
 * (i.e. the per-signal counter is done). */
#define receiveLoopMacro(sigName, receiveNodeId)\
{                                                \
  c_##sigName##_Counter.clearWaitingFor(receiveNodeId); \
  if(c_##sigName##_Counter.done() == false){     \
     jam();                                      \
     return;                                     \
  }                                              \
}

/* Broadcast sigName via signalRoutine to every node on the alive-node
 * list (starting at cfirstAliveNode, linked through nextNode), marking
 * each node as awaited in the matching per-signal counter.
 * NOTE(review): assumes cfirstAliveNode != RNIL when invoked — the
 * do/while dereferences the first entry unconditionally. */
#define sendLoopMacro(sigName, signalRoutine, extra)                    \
{                                                                       \
  c_##sigName##_Counter.clearWaitingFor();                              \
  NodeRecordPtr specNodePtr;                                            \
  specNodePtr.i = cfirstAliveNode;                                      \
  do {                                                                  \
    jam();                                                              \
    ptrCheckGuard(specNodePtr, MAX_NDB_NODES, nodeRecord);              \
    c_##sigName##_Counter.setWaitingFor(specNodePtr.i);                 \
    signalRoutine(signal, specNodePtr.i, extra);                        \
    specNodePtr.i = specNodePtr.p->nextNode;                            \
  } while (specNodePtr.i != RNIL);                                      \
}
136 
137 static
138 Uint32
prevLcpNo(Uint32 lcpNo)139 prevLcpNo(Uint32 lcpNo){
140   if(lcpNo == 0)
141     return MAX_LCP_USED - 1;
142   return lcpNo - 1;
143 }
144 
145 static
146 Uint32
nextLcpNo(Uint32 lcpNo)147 nextLcpNo(Uint32 lcpNo){
148   lcpNo++;
149   if(lcpNo >= MAX_LCP_USED)
150     return 0;
151   return lcpNo;
152 }
153 
/* Intentionally empty: no-op routine usable where a per-node signal
 * routine is required (e.g. as the signalRoutine of sendLoopMacro)
 * but nothing should actually be sent. */
void Dbdih::nullRoutine(Signal* signal, Uint32 nodeId, Uint32 extra)
{
}//Dbdih::nullRoutine()
157 
sendCOPY_GCIREQ(Signal * signal,Uint32 nodeId,Uint32 extra)158 void Dbdih::sendCOPY_GCIREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
159 {
160   ndbrequire(c_copyGCIMaster.m_copyReason != CopyGCIReq::IDLE);
161 
162   const BlockReference ref = calcDihBlockRef(nodeId);
163   const Uint32 wordPerSignal = CopyGCIReq::DATA_SIZE;
164   const Uint32 noOfSignals = ((Sysfile::SYSFILE_SIZE32 + (wordPerSignal - 1)) /
165 			      wordPerSignal);
166 
167   CopyGCIReq * const copyGCI = (CopyGCIReq *)&signal->theData[0];
168   copyGCI->anyData = nodeId;
169   copyGCI->copyReason = c_copyGCIMaster.m_copyReason;
170   copyGCI->startWord = 0;
171 
172   for(Uint32 i = 0; i < noOfSignals; i++) {
173     jam();
174     { // Do copy
175       const int startWord = copyGCI->startWord;
176       for(Uint32 j = 0; j < wordPerSignal; j++) {
177         copyGCI->data[j] = sysfileData[j+startWord];
178       }//for
179     }
180     sendSignal(ref, GSN_COPY_GCIREQ, signal, 25, JBB);
181     copyGCI->startWord += wordPerSignal;
182   }//for
183 }//Dbdih::sendCOPY_GCIREQ()
184 
185 
sendDIH_SWITCH_REPLICA_REQ(Signal * signal,Uint32 nodeId,Uint32 extra)186 void Dbdih::sendDIH_SWITCH_REPLICA_REQ(Signal* signal, Uint32 nodeId,
187                                        Uint32 extra)
188 {
189   const BlockReference ref    = calcDihBlockRef(nodeId);
190   sendSignal(ref, GSN_DIH_SWITCH_REPLICA_REQ, signal,
191              DihSwitchReplicaReq::SignalLength, JBB);
192 }//Dbdih::sendDIH_SWITCH_REPLICA_REQ()
193 
sendEMPTY_LCP_REQ(Signal * signal,Uint32 nodeId,Uint32 extra)194 void Dbdih::sendEMPTY_LCP_REQ(Signal* signal, Uint32 nodeId, Uint32 extra)
195 {
196   BlockReference ref = calcLqhBlockRef(nodeId);
197   sendSignal(ref, GSN_EMPTY_LCP_REQ, signal, EmptyLcpReq::SignalLength, JBB);
198 }//Dbdih::sendEMPTY_LCPREQ()
199 
sendGCP_COMMIT(Signal * signal,Uint32 nodeId,Uint32 extra)200 void Dbdih::sendGCP_COMMIT(Signal* signal, Uint32 nodeId, Uint32 extra)
201 {
202   BlockReference ref = calcDihBlockRef(nodeId);
203   GCPCommit *req = (GCPCommit*)signal->getDataPtrSend();
204   req->nodeId = cownNodeId;
205   req->gci_hi = Uint32(m_micro_gcp.m_master.m_new_gci >> 32);
206   req->gci_lo = Uint32(m_micro_gcp.m_master.m_new_gci);
207   sendSignal(ref, GSN_GCP_COMMIT, signal, GCPCommit::SignalLength, JBA);
208 
209   ndbassert(m_micro_gcp.m_enabled || Uint32(m_micro_gcp.m_new_gci) == 0);
210 }//Dbdih::sendGCP_COMMIT()
211 
/**
 * Send GCP_PREPARE carrying the master's new GCI to the DIH block on
 * nodeId.  Normally sent at A-level priority; two error inserts alter
 * delivery for testing:
 *   7201 - send at B-level priority instead of A-level
 *   7202 - delay the prepare to our own node by 2 seconds
 */
void Dbdih::sendGCP_PREPARE(Signal* signal, Uint32 nodeId, Uint32 extra)
{
  BlockReference ref = calcDihBlockRef(nodeId);
  GCPPrepare *req = (GCPPrepare*)signal->getDataPtrSend();
  req->nodeId = cownNodeId;
  req->gci_hi = Uint32(m_micro_gcp.m_master.m_new_gci >> 32);
  req->gci_lo = Uint32(m_micro_gcp.m_master.m_new_gci);

  if (! (ERROR_INSERTED(7201) || ERROR_INSERTED(7202)))
  {
    // Normal path: A-level job buffer.
    sendSignal(ref, GSN_GCP_PREPARE, signal, GCPPrepare::SignalLength, JBA);
  }
  else if (ERROR_INSERTED(7201))
  {
    // Error insert: demote to B-level priority.
    sendSignal(ref, GSN_GCP_PREPARE, signal, GCPPrepare::SignalLength, JBB);
  }
  else if (ERROR_INSERTED(7202))
  {
    // Error insert: only valid for our own node; delay delivery 2s.
    ndbrequire(nodeId == getOwnNodeId());
    sendSignalWithDelay(ref, GSN_GCP_PREPARE, signal, 2000,
                        GCPPrepare::SignalLength);
  }
  else
  {
    ndbrequire(false); // should be dead code #ifndef ERROR_INSERT
  }

  ndbassert(m_micro_gcp.m_enabled || Uint32(m_micro_gcp.m_new_gci) == 0);
}//Dbdih::sendGCP_PREPARE()
241 
/**
 * Send SUB_GCP_COMPLETE_REP to the DIH block on nodeId.  Nodes running
 * a version too old to acknowledge this signal (per
 * ndbd_dih_sub_gcp_complete_ack) are cleared from the reply counter up
 * front, since no ack will ever arrive from them.
 */
void
Dbdih::sendSUB_GCP_COMPLETE_REP(Signal* signal, Uint32 nodeId, Uint32 extra)
{
  ndbassert(m_micro_gcp.m_enabled || Uint32(m_micro_gcp.m_new_gci) == 0);
  if (!ndbd_dih_sub_gcp_complete_ack(getNodeInfo(nodeId).m_version))
  {
    jam();
    // Old-version node: don't wait for an ack it cannot send.
    c_SUB_GCP_COMPLETE_REP_Counter.clearWaitingFor(nodeId);
  }
  BlockReference ref = calcDihBlockRef(nodeId);
  sendSignal(ref, GSN_SUB_GCP_COMPLETE_REP, signal,
             SubGcpCompleteRep::SignalLength, JBA);
}
255 
sendGCP_SAVEREQ(Signal * signal,Uint32 nodeId,Uint32 extra)256 void Dbdih::sendGCP_SAVEREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
257 {
258   GCPSaveReq * const saveReq = (GCPSaveReq*)&signal->theData[0];
259   BlockReference ref = calcDihBlockRef(nodeId);
260   saveReq->dihBlockRef = reference();
261   saveReq->dihPtr = nodeId;
262   saveReq->gci = m_gcp_save.m_master.m_new_gci;
263   sendSignal(ref, GSN_GCP_SAVEREQ, signal, GCPSaveReq::SignalLength, JBB);
264 }//Dbdih::sendGCP_SAVEREQ()
265 
/**
 * Ask the DIH block on nodeId to include the currently starting node.
 * theData layout:
 *   [0] our block reference (reply destination)
 *   [1] the node being started (c_nodeStartMaster.startNode)
 *   [2] failure number recorded for the node start
 *   [3] unused (0)
 *   [4] current GCI, high word
 *   [5] current GCI, low word
 */
void Dbdih::sendINCL_NODEREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
{
  BlockReference nodeDihRef = calcDihBlockRef(nodeId);
  signal->theData[0] = reference();
  signal->theData[1] = c_nodeStartMaster.startNode;
  signal->theData[2] = c_nodeStartMaster.failNr;
  signal->theData[3] = 0;
  signal->theData[4] = (Uint32)(m_micro_gcp.m_current_gci >> 32);
  signal->theData[5] = (Uint32)(m_micro_gcp.m_current_gci & 0xFFFFFFFF);
  sendSignal(nodeDihRef, GSN_INCL_NODEREQ, signal, 6, JBA);
}//Dbdih::sendINCL_NODEREQ()
277 
sendMASTER_GCPREQ(Signal * signal,Uint32 nodeId,Uint32 extra)278 void Dbdih::sendMASTER_GCPREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
279 {
280   BlockReference ref = calcDihBlockRef(nodeId);
281   sendSignal(ref, GSN_MASTER_GCPREQ, signal, MasterGCPReq::SignalLength, JBB);
282 }//Dbdih::sendMASTER_GCPREQ()
283 
sendMASTER_LCPREQ(Signal * signal,Uint32 nodeId,Uint32 extra)284 void Dbdih::sendMASTER_LCPREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
285 {
286   BlockReference ref = calcDihBlockRef(nodeId);
287   sendSignal(ref, GSN_MASTER_LCPREQ, signal, MasterLCPReq::SignalLength, JBB);
288 }//Dbdih::sendMASTER_LCPREQ()
289 
sendSTART_INFOREQ(Signal * signal,Uint32 nodeId,Uint32 extra)290 void Dbdih::sendSTART_INFOREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
291 {
292   const BlockReference ref = calcDihBlockRef(nodeId);
293   sendSignal(ref, GSN_START_INFOREQ, signal, StartInfoReq::SignalLength, JBB);
294 }//sendSTART_INFOREQ()
295 
/**
 * Ask LQH on nodeId to start executing its REDO log as part of the
 * system restart.  Nodes that are not in m_sr_nodes take no part and
 * are immediately cleared from the reply counter.  Also emits a
 * StartREDOLog event report describing the GCI range used.
 */
void Dbdih::sendSTART_RECREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
{
  if (!m_sr_nodes.get(nodeId))
  {
    jam();
    // Node not participating in the system restart: no reply expected.
    c_START_RECREQ_Counter.clearWaitingFor(nodeId);
    return;
  }

  // Never ask a node to keep a GCI newer than what it last completed.
  Uint32 keepGCI = SYSFILE->keepGCI;
  Uint32 lastCompletedGCI = SYSFILE->lastCompletedGCI[nodeId];
  if (keepGCI > lastCompletedGCI)
  {
    jam();
    keepGCI = lastCompletedGCI;
  }

  StartRecReq * const req = (StartRecReq*)&signal->theData[0];
  BlockReference ref = calcLqhBlockRef(nodeId);
  req->receivingNodeId = nodeId;
  req->senderRef = reference();
  req->keepGci = keepGCI;
  req->lastCompletedGci = lastCompletedGCI;
  req->newestGci = SYSFILE->newestRestorableGCI;
  req->senderData = extra;
  // Tell LQH which nodes participate in the system restart.
  m_sr_nodes.copyto(NdbNodeBitmask::Size, req->sr_nodes);
  sendSignal(ref, GSN_START_RECREQ, signal, StartRecReq::SignalLength, JBB);

  // Event report: REDO log execution started on nodeId with these GCIs.
  signal->theData[0] = NDB_LE_StartREDOLog;
  signal->theData[1] = nodeId;
  signal->theData[2] = keepGCI;
  signal->theData[3] = lastCompletedGCI;
  signal->theData[4] = SYSFILE->newestRestorableGCI;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 5, JBB);
}//Dbdih::sendSTART_RECREQ()
331 
sendSTOP_ME_REQ(Signal * signal,Uint32 nodeId,Uint32 extra)332 void Dbdih::sendSTOP_ME_REQ(Signal* signal, Uint32 nodeId, Uint32 extra)
333 {
334   if (nodeId != getOwnNodeId()) {
335     jam();
336     const BlockReference ref = calcDihBlockRef(nodeId);
337     sendSignal(ref, GSN_STOP_ME_REQ, signal, StopMeReq::SignalLength, JBB);
338   }//if
339 }//Dbdih::sendSTOP_ME_REQ()
340 
sendTC_CLOPSIZEREQ(Signal * signal,Uint32 nodeId,Uint32 extra)341 void Dbdih::sendTC_CLOPSIZEREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
342 {
343   BlockReference ref = calcTcBlockRef(nodeId);
344   signal->theData[0] = nodeId;
345   signal->theData[1] = reference();
346   sendSignal(ref, GSN_TC_CLOPSIZEREQ, signal, 2, JBB);
347 }//Dbdih::sendTC_CLOPSIZEREQ()
348 
sendTCGETOPSIZEREQ(Signal * signal,Uint32 nodeId,Uint32 extra)349 void Dbdih::sendTCGETOPSIZEREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
350 {
351   BlockReference ref = calcTcBlockRef(nodeId);
352   signal->theData[0] = nodeId;
353   signal->theData[1] = reference();
354   sendSignal(ref, GSN_TCGETOPSIZEREQ, signal, 2, JBB);
355 }//Dbdih::sendTCGETOPSIZEREQ()
356 
sendUPDATE_TOREQ(Signal * signal,Uint32 nodeId,Uint32 extra)357 void Dbdih::sendUPDATE_TOREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
358 {
359   const BlockReference ref = calcDihBlockRef(nodeId);
360   sendSignal(ref, GSN_UPDATE_TOREQ, signal, UpdateToReq::SignalLength, JBB);
361 }//sendUPDATE_TOREQ()
362 
execCONTINUEB(Signal * signal)363 void Dbdih::execCONTINUEB(Signal* signal)
364 {
365   jamEntry();
366   switch ((DihContinueB::Type)signal->theData[0]) {
367   case DihContinueB::ZPACK_TABLE_INTO_PAGES:
368     {
369       jam();
370       Uint32 tableId = signal->theData[1];
371       packTableIntoPagesLab(signal, tableId);
372       return;
373       break;
374     }
375   case DihContinueB::ZPACK_FRAG_INTO_PAGES:
376     {
377       RWFragment wf;
378       jam();
379       wf.rwfTabPtr.i = signal->theData[1];
380       ptrCheckGuard(wf.rwfTabPtr, ctabFileSize, tabRecord);
381       wf.fragId = signal->theData[2];
382       wf.pageIndex = signal->theData[3];
383       wf.wordIndex = signal->theData[4];
384       wf.totalfragments = signal->theData[5];
385       packFragIntoPagesLab(signal, &wf);
386       return;
387       break;
388     }
389   case DihContinueB::ZREAD_PAGES_INTO_TABLE:
390     {
391       jam();
392       Uint32 tableId = signal->theData[1];
393       readPagesIntoTableLab(signal, tableId);
394       return;
395       break;
396     }
397   case DihContinueB::ZREAD_PAGES_INTO_FRAG:
398     {
399       RWFragment rf;
400       jam();
401       rf.rwfTabPtr.i = signal->theData[1];
402       ptrCheckGuard(rf.rwfTabPtr, ctabFileSize, tabRecord);
403       rf.fragId = signal->theData[2];
404       rf.pageIndex = signal->theData[3];
405       rf.wordIndex = signal->theData[4];
406       readPagesIntoFragLab(signal, &rf);
407       return;
408       break;
409     }
410   case DihContinueB::ZCOPY_TABLE:
411     {
412       jam();
413       Uint32 tableId = signal->theData[1];
414       copyTableLab(signal, tableId);
415       return;
416     }
417   case DihContinueB::ZCOPY_TABLE_NODE:
418     {
419       NodeRecordPtr nodePtr;
420       CopyTableNode ctn;
421       jam();
422       ctn.ctnTabPtr.i = signal->theData[1];
423       ptrCheckGuard(ctn.ctnTabPtr, ctabFileSize, tabRecord);
424       nodePtr.i = signal->theData[2];
425       ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
426       ctn.pageIndex = signal->theData[3];
427       ctn.wordIndex = signal->theData[4];
428       ctn.noOfWords = signal->theData[5];
429       copyTableNode(signal, &ctn, nodePtr);
430       return;
431     }
432   case DihContinueB::ZSTART_FRAGMENT:
433     {
434       jam();
435       Uint32 tableId = signal->theData[1];
436       Uint32 fragId = signal->theData[2];
437       startFragment(signal, tableId, fragId);
438       return;
439     }
440   case DihContinueB::ZCOMPLETE_RESTART:
441     jam();
442     completeRestartLab(signal);
443     return;
444   case DihContinueB::ZREAD_TABLE_FROM_PAGES:
445     {
446       TabRecordPtr tabPtr;
447       jam();
448       tabPtr.i = signal->theData[1];
449       ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
450       readTableFromPagesLab(signal, tabPtr);
451       return;
452     }
453   case DihContinueB::ZSR_PHASE2_READ_TABLE:
454     {
455       TabRecordPtr tabPtr;
456       jam();
457       tabPtr.i = signal->theData[1];
458       ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
459       srPhase2ReadTableLab(signal, tabPtr);
460       return;
461     }
462   case DihContinueB::ZCHECK_TC_COUNTER:
463     jam();
464 #ifndef NO_LCP
465     checkTcCounterLab(signal);
466 #endif
467     return;
468   case DihContinueB::ZCALCULATE_KEEP_GCI:
469     {
470       jam();
471       Uint32 tableId = signal->theData[1];
472       Uint32 fragId = signal->theData[2];
473       calculateKeepGciLab(signal, tableId, fragId);
474       return;
475     }
476   case DihContinueB::ZSTORE_NEW_LCP_ID:
477     jam();
478     storeNewLcpIdLab(signal);
479     return;
480   case DihContinueB::ZTABLE_UPDATE:
481     {
482       TabRecordPtr tabPtr;
483       jam();
484       tabPtr.i = signal->theData[1];
485       ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
486       tableUpdateLab(signal, tabPtr);
487       return;
488     }
489   case DihContinueB::ZCHECK_LCP_COMPLETED:
490     {
491       jam();
492       checkLcpCompletedLab(signal);
493       return;
494     }
495   case DihContinueB::ZINIT_LCP:
496     {
497       jam();
498       Uint32 senderRef = signal->theData[1];
499       Uint32 tableId = signal->theData[2];
500       initLcpLab(signal, senderRef, tableId);
501       return;
502     }
503   case DihContinueB::ZADD_TABLE_MASTER_PAGES:
504     {
505       TabRecordPtr tabPtr;
506       jam();
507       tabPtr.i = signal->theData[1];
508       ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
509       tabPtr.p->tabUpdateState = TabRecord::US_ADD_TABLE_MASTER;
510       tableUpdateLab(signal, tabPtr);
511       return;
512       break;
513     }
514   case DihContinueB::ZDIH_ADD_TABLE_MASTER:
515     {
516       jam();
517       addTable_closeConf(signal, signal->theData[1]);
518       return;
519     }
520   case DihContinueB::ZADD_TABLE_SLAVE_PAGES:
521     {
522       TabRecordPtr tabPtr;
523       jam();
524       tabPtr.i = signal->theData[1];
525       ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
526       tabPtr.p->tabUpdateState = TabRecord::US_ADD_TABLE_SLAVE;
527       tableUpdateLab(signal, tabPtr);
528       return;
529     }
530   case DihContinueB::ZDIH_ADD_TABLE_SLAVE:
531     {
532       ndbrequire(false);
533       return;
534     }
535   case DihContinueB::ZSTART_GCP:
536     jam();
537 #ifndef NO_GCP
538     startGcpLab(signal);
539 #endif
540     return;
541     break;
542   case DihContinueB::ZCOPY_GCI:{
543     jam();
544     CopyGCIReq::CopyReason reason = (CopyGCIReq::CopyReason)signal->theData[1];
545     ndbrequire(c_copyGCIMaster.m_copyReason == reason);
546 
547     // set to idle, to be able to reuse method
548     c_copyGCIMaster.m_copyReason = CopyGCIReq::IDLE;
549     copyGciLab(signal, reason);
550     return;
551   }
552     break;
553   case DihContinueB::ZEMPTY_VERIFY_QUEUE:
554     jam();
555     emptyverificbuffer(signal, signal->theData[1], true);
556     return;
557     break;
558   case DihContinueB::ZCHECK_GCP_STOP:
559     jam();
560 #ifndef NO_GCP
561     checkGcpStopLab(signal);
562 #endif
563     return;
564     break;
565   case DihContinueB::ZREMOVE_NODE_FROM_TABLE:
566     {
567       jam();
568       Uint32 nodeId = signal->theData[1];
569       Uint32 tableId = signal->theData[2];
570       removeNodeFromTables(signal, nodeId, tableId);
571       return;
572     }
573   case DihContinueB::ZCOPY_NODE:
574     {
575       jam();
576       Uint32 tableId = signal->theData[1];
577       copyNodeLab(signal, tableId);
578       return;
579     }
580   case DihContinueB::ZTO_START_COPY_FRAG:
581     {
582       jam();
583       Uint32 takeOverPtrI = signal->theData[1];
584       startNextCopyFragment(signal, takeOverPtrI);
585       return;
586     }
587   case DihContinueB::ZINVALIDATE_NODE_LCP:
588     {
589       jam();
590       const Uint32 nodeId = signal->theData[1];
591       const Uint32 tableId = signal->theData[2];
592       invalidateNodeLCP(signal, nodeId, tableId);
593       return;
594     }
595   case DihContinueB::ZINITIALISE_RECORDS:
596     jam();
597     initialiseRecordsLab(signal,
598 			 signal->theData[1],
599 			 signal->theData[2],
600 			 signal->theData[3]);
601     return;
602     break;
603   case DihContinueB::ZSTART_PERMREQ_AGAIN:
604     jam();
605     nodeRestartPh2Lab2(signal);
606     return;
607     break;
608   case DihContinueB::SwitchReplica:
609     {
610       jam();
611       const Uint32 nodeId = signal->theData[1];
612       const Uint32 tableId = signal->theData[2];
613       const Uint32 fragNo = signal->theData[3];
614       switchReplica(signal, nodeId, tableId, fragNo);
615       return;
616     }
617   case DihContinueB::ZSEND_ADD_FRAG:
618     {
619       jam();
620       Uint32 takeOverPtrI = signal->theData[1];
621       toCopyFragLab(signal, takeOverPtrI);
622       return;
623     }
624   case DihContinueB::ZSEND_START_TO:
625     {
626       jam();
627       Ptr<TakeOverRecord> takeOverPtr;
628       c_takeOverPool.getPtr(takeOverPtr, signal->theData[1]);
629       sendStartTo(signal, takeOverPtr);
630       return;
631     }
632   case DihContinueB::ZSEND_UPDATE_TO:
633     {
634       jam();
635       Ptr<TakeOverRecord> takeOverPtr;
636       c_takeOverPool.getPtr(takeOverPtr, signal->theData[1]);
637       sendUpdateTo(signal, takeOverPtr);
638       return;
639     }
640   case DihContinueB::WAIT_DROP_TAB_WRITING_TO_FILE:{
641     jam();
642     TabRecordPtr tabPtr;
643     tabPtr.i = signal->theData[1];
644     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
645     waitDropTabWritingToFile(signal, tabPtr);
646     return;
647   }
648   case DihContinueB::ZTO_START_FRAGMENTS:
649   {
650     TakeOverRecordPtr takeOverPtr;
651     c_takeOverPool.getPtr(takeOverPtr, signal->theData[1]);
652     nr_start_fragments(signal, takeOverPtr);
653     return;
654   }
655   case DihContinueB::ZWAIT_OLD_SCAN:
656   {
657     jam();
658     wait_old_scan(signal);
659     return;
660   }
661   case DihContinueB::ZLCP_TRY_LOCK:
662   {
663     jam();
664     Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
665     Callback c = { safe_cast(&Dbdih::lcpFragmentMutex_locked),
666                    signal->theData[1] };
667     ndbrequire(mutex.trylock(c, false));
668     return;
669   }
670   case DihContinueB::ZTO_START_LOGGING:
671   {
672     jam();
673     TakeOverRecordPtr takeOverPtr;
674     c_takeOverPool.getPtr(takeOverPtr, signal->theData[1]);
675     nr_start_logging(signal, takeOverPtr);
676     return;
677   }
678   case DihContinueB::ZGET_TABINFO:
679   {
680     jam();
681     getTabInfo(signal);
682     return;
683   }
684   case DihContinueB::ZGET_TABINFO_SEND:
685   {
686     jam();
687     TabRecordPtr tabPtr;
688     tabPtr.i = signal->theData[1];
689     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
690     getTabInfo_send(signal, tabPtr);
691     return;
692   }
693   case DihContinueB::ZDEQUEUE_LCP_REP:
694   {
695     jam();
696     dequeue_lcp_rep(signal);
697     return;
698   }
699   }
700 
701   ndbrequire(false);
702   return;
703 }//Dbdih::execCONTINUEB()
704 
/**
 * COPY_GCIREQ: receive one chunk of the master's sysfile (see
 * sendCOPY_GCIREQ for the chunking protocol).  Chunks accumulate into
 * cdata; on the final chunk the sysfile is adopted (non-master nodes),
 * reason-specific state updates are performed, and the write of the
 * restorable-GCI file is initiated.  Several error inserts (7241,
 * 7177, 7185) alter delivery/completion for testing.
 */
void Dbdih::execCOPY_GCIREQ(Signal* signal)
{
  CopyGCIReq * const copyGCI = (CopyGCIReq *)&signal->theData[0];
  jamEntry();
  if (ERROR_INSERTED(7241))
  {
    jam();
    // Error insert: re-deliver the signal to ourselves after 5s.
    g_eventLogger->info("Delayed COPY_GCIREQ 5s");
    sendSignalWithDelay(reference(), GSN_COPY_GCIREQ,
                        signal, 5000,
                        signal->getLength());
    return;
  }

  CopyGCIReq::CopyReason reason = (CopyGCIReq::CopyReason)copyGCI->copyReason;
  const Uint32 tstart = copyGCI->startWord;

  /* Only the master may copy the sysfile to us; chunks must arrive in
   * order; a new round may only begin when we are idle (except that
   * GLOBAL_CHECKPOINT chunks may continue an ongoing GCP round). */
  ndbrequire(cmasterdihref == signal->senderBlockRef()) ;
  ndbrequire((reason == CopyGCIReq::GLOBAL_CHECKPOINT &&
              c_copyGCISlave.m_copyReason == CopyGCIReq::GLOBAL_CHECKPOINT) ||
             c_copyGCISlave.m_copyReason == CopyGCIReq::IDLE);
  ndbrequire(c_copyGCISlave.m_expectedNextWord == tstart);
  ndbrequire(reason != CopyGCIReq::IDLE);
  // True when this chunk is the last one of the sysfile.
  bool isdone = (tstart + CopyGCIReq::DATA_SIZE) >= Sysfile::SYSFILE_SIZE32;

  if (ERROR_INSERTED(7177))
  {
    jam();

    // Length-3 marker signal (sent below): skip the data copy.
    if (signal->getLength() == 3)
    {
      jam();
      goto done;
    }
  }

  // Accumulate this chunk into the staging buffer cdata.
  arrGuard(tstart + CopyGCIReq::DATA_SIZE, sizeof(sysfileData)/4);
  for(Uint32 i = 0; i<CopyGCIReq::DATA_SIZE; i++)
    cdata[tstart+i] = copyGCI->data[i];

  if (ERROR_INSERTED(7177) && isMaster() && isdone)
  {
    // Error insert: delay completion on the master by re-sending a
    // length-3 marker to ourselves.
    sendSignalWithDelay(reference(), GSN_COPY_GCIREQ, signal, 1000, 3);
    return;
  }

done:
  if (isdone)
  {
    jam();
    c_copyGCISlave.m_expectedNextWord = 0;
  }
  else
  {
    jam();
    // More chunks to come: just advance the expected offset.
    c_copyGCISlave.m_expectedNextWord += CopyGCIReq::DATA_SIZE;
    return;
  }

  if (cmasterdihref != reference())
  {
    jam();
    /* Non-master: adopt the master's sysfile, but keep our own restart
     * sequence number. */
    Uint32 tmp= SYSFILE->m_restart_seq;
    memcpy(sysfileData, cdata, sizeof(sysfileData));
    SYSFILE->m_restart_seq = tmp;

    if (c_set_initial_start_flag)
    {
      jam();
      Sysfile::setInitialStartOngoing(SYSFILE->systemRestartBits);
    }
  }

  c_copyGCISlave.m_copyReason = reason;
  c_copyGCISlave.m_senderRef  = signal->senderBlockRef();
  c_copyGCISlave.m_senderData = copyGCI->anyData;

  CRASH_INSERTION2(7020, reason==CopyGCIReq::LOCAL_CHECKPOINT);
  CRASH_INSERTION2(7008, reason==CopyGCIReq::GLOBAL_CHECKPOINT);

  if (m_local_lcp_state.check_cut_log_tail(c_newest_restorable_gci))
  {
    jam();

#if NOT_YET
    LcpCompleteRep* rep = (LcpCompleteRep*)signal->getDataPtrSend();
    rep->nodeId = getOwnNodeId();
    rep->blockNo = 0;
    rep->lcpId = m_local_lcp_state.m_start_lcp_req.lcpId;
    rep->keepGci = m_local_lcp_state.m_keep_gci;
    sendSignal(DBLQH_REF, GSN_LCP_COMPLETE_REP, signal,
               LcpCompleteRep::SignalLength, JBB);

    warningEvent("CUT LOG TAIL: reason: %u lcp: %u m_keep_gci: %u stop: %u",
                 reason,
                 m_local_lcp_state.m_start_lcp_req.lcpId,
                 m_local_lcp_state.m_keep_gci,
                 m_local_lcp_state.m_stop_gci);
#endif
    m_local_lcp_state.reset();
  }

  /* -------------------------------------------------------------------------*/
  /*     WE SET THE REQUESTER OF THE COPY GCI TO THE CURRENT MASTER. IF THE   */
  /*     CURRENT MASTER WE DO NOT WANT THE NEW MASTER TO RECEIVE CONFIRM OF   */
  /*     SOMETHING HE HAS NOT SENT. THE TAKE OVER MUST BE CAREFUL.            */
  /* -------------------------------------------------------------------------*/
  // Reason-specific state transitions; `ok` guards against unhandled enums.
  bool ok = false;
  switch(reason){
  case CopyGCIReq::IDLE:
    ok = true;
    jam();
    ndbrequire(false);  // excluded by the ndbrequire above
    break;
  case CopyGCIReq::LOCAL_CHECKPOINT: {
    ok = true;
    jam();
    c_lcpState.setLcpStatus(LCP_COPY_GCI, __LINE__);
    c_lcpState.m_masterLcpDihRef = cmasterdihref;
    setNodeActiveStatus();
    break;
  }
  case CopyGCIReq::RESTART: {
    ok = true;
    jam();
    // Adopt the restored GCI as the restart point and current GCI base.
    Uint32 newest = SYSFILE->newestRestorableGCI;
    m_micro_gcp.m_old_gci = Uint64(newest) << 32;
    crestartGci = newest;
    c_newest_restorable_gci = newest;
    Sysfile::setRestartOngoing(SYSFILE->systemRestartBits);
    m_micro_gcp.m_current_gci = Uint64(newest + 1) << 32;
    setNodeActiveStatus();
    setNodeGroups();
    if ((Sysfile::getLCPOngoing(SYSFILE->systemRestartBits))) {
      jam();
      /* -------------------------------------------------------------------- */
      //  IF THERE WAS A LOCAL CHECKPOINT ONGOING AT THE CRASH MOMENT WE WILL
      //    INVALIDATE THAT LOCAL CHECKPOINT.
      /* -------------------------------------------------------------------- */
      invalidateLcpInfoAfterSr(signal);
    }//if

    if (m_micro_gcp.m_enabled == false &&
        m_micro_gcp.m_master.m_time_between_gcp)
    {
      /**
       * Micro GCP is disabled...but configured...
       */
      jam();
      m_micro_gcp.m_enabled = true;
      UpgradeProtocolOrd * ord = (UpgradeProtocolOrd*)signal->getDataPtrSend();
      ord->type = UpgradeProtocolOrd::UPO_ENABLE_MICRO_GCP;
      EXECUTE_DIRECT(QMGR,GSN_UPGRADE_PROTOCOL_ORD,signal,signal->getLength());
    }
    break;
  }
  case CopyGCIReq::GLOBAL_CHECKPOINT: {
    ok = true;
    jam();

    if (m_gcp_save.m_state == GcpSave::GCP_SAVE_COPY_GCI)
    {
      jam();
      /**
       * This must be master take over...and it already running...
       */
      ndbrequire(c_newest_restorable_gci == SYSFILE->newestRestorableGCI);
      m_gcp_save.m_master_ref = c_copyGCISlave.m_senderRef;
      return;
    }

    if (c_newest_restorable_gci == SYSFILE->newestRestorableGCI)
    {
      jam();

      /**
       * This must be master take over...and it already complete...
       */
      m_gcp_save.m_master_ref = c_copyGCISlave.m_senderRef;
      c_copyGCISlave.m_copyReason = CopyGCIReq::IDLE;
      signal->theData[0] = c_copyGCISlave.m_senderData;
      sendSignal(m_gcp_save.m_master_ref, GSN_COPY_GCICONF, signal, 1, JBB);
      return;
    }

    ndbrequire(m_gcp_save.m_state == GcpSave::GCP_SAVE_CONF);
    m_gcp_save.m_state = GcpSave::GCP_SAVE_COPY_GCI;
    m_gcp_save.m_master_ref = c_copyGCISlave.m_senderRef;
    c_newest_restorable_gci = SYSFILE->newestRestorableGCI;
    setNodeActiveStatus();
    break;
  }//if
  case CopyGCIReq::INITIAL_START_COMPLETED:
    ok = true;
    jam();
    break;
  case CopyGCIReq::RESTART_NR:
    jam();
    setNodeGroups();
    /**
     * We dont really need to make anything durable here...skip it
     */
    c_copyGCISlave.m_copyReason = CopyGCIReq::IDLE;
    signal->theData[0] = c_copyGCISlave.m_senderData;
    sendSignal(c_copyGCISlave.m_senderRef, GSN_COPY_GCICONF, signal, 1, JBB);
    return;
  }
  ndbrequire(ok);

  CRASH_INSERTION(7183);

  if (ERROR_INSERTED(7185) && reason==CopyGCIReq::GLOBAL_CHECKPOINT)
  {
    jam();
    // Error insert: drop the GCP copy on the floor (no file write/CONF).
    return;
  }
#ifdef GCP_TIMER_HACK
  if (reason == CopyGCIReq::GLOBAL_CHECKPOINT) {
    jam();
    globalData.gcp_timer_copygci[0] = NdbTick_getCurrentTicks();
  }
#endif

  /* ----------------------------------------------------------------------- */
  /*     WE START BY TRYING TO OPEN THE FIRST RESTORABLE GCI FILE.           */
  /* ----------------------------------------------------------------------- */
  FileRecordPtr filePtr;
  filePtr.i = crestartInfoFile[0];
  ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
  if (filePtr.p->fileStatus == FileRecord::OPEN) {
    jam();
    // File already open: skip the open phase and write directly.
    openingCopyGciSkipInitLab(signal, filePtr);
    return;
  }//if
  openFileRw(signal, filePtr);
  filePtr.p->reqStatus = FileRecord::OPENING_COPY_GCI;
  return;
}//Dbdih::execCOPY_GCIREQ()
943 
execDICTSTARTCONF(Signal * signal)944 void Dbdih::execDICTSTARTCONF(Signal* signal)
945 {
946   jamEntry();
947   Uint32 nodeId = refToNode(signal->getSendersBlockRef());
948   if (nodeId != getOwnNodeId()) {
949     jam();
950     nodeDictStartConfLab(signal, nodeId);
951   } else {
952     jam();
953     dictStartConfLab(signal);
954   }//if
955 }//Dbdih::execDICTSTARTCONF()
956 
execFSCLOSECONF(Signal * signal)957 void Dbdih::execFSCLOSECONF(Signal* signal)
958 {
959   FileRecordPtr filePtr;
960   jamEntry();
961   filePtr.i = signal->theData[0];
962   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
963   filePtr.p->fileStatus = FileRecord::CLOSED;
964   FileRecord::ReqStatus status = filePtr.p->reqStatus;
965   filePtr.p->reqStatus = FileRecord::IDLE;
966   switch (status) {
967   case FileRecord::CLOSING_GCP:
968     jam();
969     closingGcpLab(signal, filePtr);
970     break;
971   case FileRecord::CLOSING_GCP_CRASH:
972     jam();
973     closingGcpCrashLab(signal, filePtr);
974     break;
975   case FileRecord::CLOSING_TABLE_CRASH:
976     jam();
977     closingTableCrashLab(signal, filePtr);
978     break;
979   case FileRecord::CLOSING_TABLE_SR:
980     jam();
981     closingTableSrLab(signal, filePtr);
982     break;
983   case FileRecord::TABLE_CLOSE:
984     jam();
985     tableCloseLab(signal, filePtr);
986     break;
987   case FileRecord::TABLE_CLOSE_DELETE:
988     jam();
989     tableDeleteLab(signal, filePtr);
990     break;
991   default:
992     ndbrequire(false);
993     break;
994   }//switch
995   return;
996 }//Dbdih::execFSCLOSECONF()
997 
execFSCLOSEREF(Signal * signal)998 void Dbdih::execFSCLOSEREF(Signal* signal)
999 {
1000   FileRecordPtr filePtr;
1001   jamEntry();
1002   filePtr.i = signal->theData[0];
1003   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
1004   FileRecord::ReqStatus status = filePtr.p->reqStatus;
1005   filePtr.p->reqStatus = FileRecord::IDLE;
1006   switch (status) {
1007   case FileRecord::CLOSING_GCP:
1008     jam();
1009     break;
1010   case FileRecord::CLOSING_GCP_CRASH:
1011     jam();
1012     closingGcpCrashLab(signal, filePtr);
1013     return;
1014   case FileRecord::CLOSING_TABLE_CRASH:
1015     jam();
1016     closingTableCrashLab(signal, filePtr);
1017     return;
1018   case FileRecord::CLOSING_TABLE_SR:
1019     jam();
1020     break;
1021   case FileRecord::TABLE_CLOSE:
1022     jam();
1023     break;
1024   case FileRecord::TABLE_CLOSE_DELETE:
1025     jam();
1026     break;
1027   default:
1028     jam();
1029     break;
1030 
1031   }//switch
1032   {
1033     char msg[100];
1034     sprintf(msg, "File system close failed during FileRecord status %d", (Uint32)status);
1035     fsRefError(signal,__LINE__,msg);
1036   }
1037   return;
1038 }//Dbdih::execFSCLOSEREF()
1039 
/**
 * FSOPENCONF: NDBFS reports that a file open succeeded.  Store the file
 * handle in the file record, then resume the state machine that
 * requested the open, identified by the record's reqStatus.
 */
void Dbdih::execFSOPENCONF(Signal* signal)
{
  FileRecordPtr filePtr;
  jamEntry();
  filePtr.i = signal->theData[0];
  ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
  filePtr.p->fileRef = signal->theData[1];  // NDBFS handle used for later read/write/close
  filePtr.p->fileStatus = FileRecord::OPEN;
  FileRecord::ReqStatus status = filePtr.p->reqStatus;
  filePtr.p->reqStatus = FileRecord::IDLE;
  switch (status) {
  case FileRecord::CREATING_GCP:
    jam();
    creatingGcpLab(signal, filePtr);
    break;
  // Both copy-GCI paths (open of an existing file and creation of a new
  // one) continue in the same lab.
  case FileRecord::OPENING_COPY_GCI:
    jam();
    openingCopyGciSkipInitLab(signal, filePtr);
    break;
  case FileRecord::CREATING_COPY_GCI:
    jam();
    openingCopyGciSkipInitLab(signal, filePtr);
    break;
  case FileRecord::OPENING_GCP:
    jam();
    openingGcpLab(signal, filePtr);
    break;
  case FileRecord::OPENING_TABLE:
    jam();
    openingTableLab(signal, filePtr);
    break;
  case FileRecord::TABLE_CREATE:
    jam();
    tableCreateLab(signal, filePtr);
    break;
  case FileRecord::TABLE_OPEN_FOR_DELETE:
    jam();
    tableOpenLab(signal, filePtr);
    break;
  default:
    // No other request type has an open outstanding.
    ndbrequire(false);
    break;
  }//switch
  return;
}//Dbdih::execFSOPENCONF()
1085 
execFSOPENREF(Signal * signal)1086 void Dbdih::execFSOPENREF(Signal* signal)
1087 {
1088   FileRecordPtr filePtr;
1089   jamEntry();
1090   filePtr.i = signal->theData[0];
1091   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
1092   FileRecord::ReqStatus status = filePtr.p->reqStatus;
1093   filePtr.p->reqStatus = FileRecord::IDLE;
1094   switch (status) {
1095   case FileRecord::CREATING_GCP:
1096     /* --------------------------------------------------------------------- */
1097     /*   WE DID NOT MANAGE TO CREATE A GLOBAL CHECKPOINT FILE. SERIOUS ERROR */
1098     /*   WHICH CAUSES A SYSTEM RESTART.                                      */
1099     /* --------------------------------------------------------------------- */
1100     jam();
1101     break;
1102   case FileRecord::OPENING_COPY_GCI:
1103     jam();
1104     openingCopyGciErrorLab(signal, filePtr);
1105     return;
1106   case FileRecord::CREATING_COPY_GCI:
1107     jam();
1108     break;
1109   case FileRecord::OPENING_GCP:
1110     jam();
1111     openingGcpErrorLab(signal, filePtr);
1112     return;
1113   case FileRecord::OPENING_TABLE:
1114     jam();
1115     openingTableErrorLab(signal, filePtr);
1116     return;
1117   case FileRecord::TABLE_CREATE:
1118     jam();
1119     break;
1120   case FileRecord::TABLE_OPEN_FOR_DELETE:
1121     jam();
1122     tableDeleteLab(signal, filePtr);
1123     return;
1124   default:
1125     jam();
1126     break;
1127   }//switch
1128   {
1129     char msg[100];
1130     sprintf(msg, "File system open failed during FileRecord status %d", (Uint32)status);
1131     fsRefError(signal,__LINE__,msg);
1132   }
1133   return;
1134 }//Dbdih::execFSOPENREF()
1135 
execFSREADCONF(Signal * signal)1136 void Dbdih::execFSREADCONF(Signal* signal)
1137 {
1138   FileRecordPtr filePtr;
1139   jamEntry();
1140   filePtr.i = signal->theData[0];
1141   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
1142   FileRecord::ReqStatus status = filePtr.p->reqStatus;
1143   filePtr.p->reqStatus = FileRecord::IDLE;
1144   switch (status) {
1145   case FileRecord::READING_GCP:
1146     jam();
1147     readingGcpLab(signal, filePtr);
1148     break;
1149   case FileRecord::READING_TABLE:
1150     jam();
1151     readingTableLab(signal, filePtr);
1152     break;
1153   default:
1154     ndbrequire(false);
1155     break;
1156   }//switch
1157   return;
1158 }//Dbdih::execFSREADCONF()
1159 
execFSREADREF(Signal * signal)1160 void Dbdih::execFSREADREF(Signal* signal)
1161 {
1162   FileRecordPtr filePtr;
1163   jamEntry();
1164   filePtr.i = signal->theData[0];
1165   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
1166   FileRecord::ReqStatus status = filePtr.p->reqStatus;
1167   filePtr.p->reqStatus = FileRecord::IDLE;
1168   switch (status) {
1169   case FileRecord::READING_GCP:
1170     jam();
1171     readingGcpErrorLab(signal, filePtr);
1172     return;
1173   case FileRecord::READING_TABLE:
1174     jam();
1175     readingTableErrorLab(signal, filePtr);
1176     return;
1177   default:
1178     break;
1179   }//switch
1180   {
1181     char msg[100];
1182     sprintf(msg, "File system read failed during FileRecord status %d", (Uint32)status);
1183     fsRefError(signal,__LINE__,msg);
1184   }
1185 }//Dbdih::execFSREADREF()
1186 
/**
 * FSWRITECONF: NDBFS reports a successful file write.  Resume the state
 * machine that issued the write, identified by the record's reqStatus.
 */
void Dbdih::execFSWRITECONF(Signal* signal)
{
  FileRecordPtr filePtr;
  jamEntry();
  filePtr.i = signal->theData[0];
  ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
  FileRecord::ReqStatus status = filePtr.p->reqStatus;
  filePtr.p->reqStatus = FileRecord::IDLE;
  switch (status) {
  case FileRecord::WRITING_COPY_GCI:
    jam();
    writingCopyGciLab(signal, filePtr);
    break;
  case FileRecord::WRITE_INIT_GCP:
    jam();
    writeInitGcpLab(signal, filePtr);
    break;
  case FileRecord::TABLE_WRITE:
    jam();
    if (ERROR_INSERTED(7235))
    {
      jam();
      // Error insert 7235: stall table-file writes by restoring the
      // reqStatus and re-delivering this FSWRITECONF after 1000 ms.
      filePtr.p->reqStatus = status;
      /* Suspend processing of WRITECONFs */
      sendSignalWithDelay(reference(), GSN_FSWRITECONF, signal, 1000, signal->getLength());
      return;
    }
    tableWriteLab(signal, filePtr);
    break;
  default:
    // No other request type has a write outstanding.
    ndbrequire(false);
    break;
  }//switch
  return;
}//Dbdih::execFSWRITECONF()
1222 
execFSWRITEREF(Signal * signal)1223 void Dbdih::execFSWRITEREF(Signal* signal)
1224 {
1225   FileRecordPtr filePtr;
1226   jamEntry();
1227   filePtr.i = signal->theData[0];
1228   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
1229   FileRecord::ReqStatus status = filePtr.p->reqStatus;
1230   filePtr.p->reqStatus = FileRecord::IDLE;
1231   switch (status) {
1232   case FileRecord::WRITING_COPY_GCI:
1233     /* --------------------------------------------------------------------- */
1234     /*  EVEN CREATING THE FILE DID NOT WORK. WE WILL THEN CRASH.             */
1235     /*  ERROR IN WRITING FILE. WE WILL NOT CONTINUE FROM HERE.               */
1236     /* --------------------------------------------------------------------- */
1237     jam();
1238     break;
1239   case FileRecord::WRITE_INIT_GCP:
1240     /* --------------------------------------------------------------------- */
1241     /*   AN ERROR OCCURRED IN WRITING A GCI FILE WHICH IS A SERIOUS ERROR    */
1242     /*   THAT CAUSE A SYSTEM RESTART.                                        */
1243     /* --------------------------------------------------------------------- */
1244     jam();
1245     break;
1246   case FileRecord::TABLE_WRITE:
1247     jam();
1248     break;
1249   default:
1250     jam();
1251     break;
1252   }//switch
1253   {
1254     char msg[100];
1255     sprintf(msg, "File system write failed during FileRecord status %d", (Uint32)status);
1256     fsRefError(signal,__LINE__,msg);
1257   }
1258   return;
1259 }//Dbdih::execFSWRITEREF()
1260 
execGETGCIREQ(Signal * signal)1261 void Dbdih::execGETGCIREQ(Signal* signal)
1262 {
1263 
1264   jamEntry();
1265   Uint32 userPtr = signal->theData[0];
1266   BlockReference userRef = signal->theData[1];
1267   Uint32 type = signal->theData[2];
1268 
1269   Uint32 gci_hi = 0;
1270   Uint32 gci_lo = 0;
1271   switch(type){
1272   case 0:
1273     jam();
1274     gci_hi = SYSFILE->newestRestorableGCI;
1275     break;
1276   case 1:
1277     jam();
1278     gci_hi = Uint32(m_micro_gcp.m_current_gci >> 32);
1279     gci_lo = Uint32(m_micro_gcp.m_current_gci);
1280     break;
1281   }
1282 
1283   signal->theData[0] = userPtr;
1284   signal->theData[1] = gci_hi;
1285   signal->theData[2] = gci_lo;
1286 
1287   if (userRef)
1288   {
1289     jam();
1290     sendSignal(userRef, GSN_GETGCICONF, signal, 3, JBB);
1291   }
1292   else
1293   {
1294     jam();
1295     // Execute direct
1296   }
1297 }//Dbdih::execGETGCIREQ()
1298 
/**
 * READ_CONFIG_REQ: read this block's configuration parameters, size and
 * initialise all DIH records, and set up per-node records including any
 * configured node groups from the cluster configuration.  The reply is
 * produced asynchronously by initialiseRecordsLab.
 */
void Dbdih::execREAD_CONFIG_REQ(Signal* signal)
{
  const ReadConfigReq * req = (ReadConfigReq*)signal->getDataPtr();
  Uint32 ref = req->senderRef;
  Uint32 senderData = req->senderData;
  ndbrequire(req->noOfParameters == 0);

  jamEntry();

  const ndb_mgm_configuration_iterator * p =
    m_ctx.m_config.getOwnConfigIterator();
  ndbrequireErr(p != 0, NDBD_EXIT_INVALID_CONFIG);

  initData();

  cconnectFileSize = 256; // Only used for DDL

  // Pool sizes below are mandatory config parameters; a missing one is
  // an invalid configuration and terminates the node.
  ndbrequireErr(!ndb_mgm_get_int_parameter(p, CFG_DIH_API_CONNECT,
					   &capiConnectFileSize),
		NDBD_EXIT_INVALID_CONFIG);
  capiConnectFileSize++; // Increase by 1...so that srsw queue never gets full

  ndbrequireErr(!ndb_mgm_get_int_parameter(p, CFG_DIH_FRAG_CONNECT,
					   &cfragstoreFileSize),
		NDBD_EXIT_INVALID_CONFIG);
  ndbrequireErr(!ndb_mgm_get_int_parameter(p, CFG_DIH_REPLICAS,
					   &creplicaFileSize),
		NDBD_EXIT_INVALID_CONFIG);
  ndbrequireErr(!ndb_mgm_get_int_parameter(p, CFG_DIH_TABLE, &ctabFileSize),
		NDBD_EXIT_INVALID_CONFIG);

  if (isNdbMtLqh())
  {
    jam();
    c_fragments_per_node_ = 0;
    // try to get some LQH workers which initially handle no fragments
    if (ERROR_INSERTED(7215)) {
      c_fragments_per_node_ = 1;
      ndbout_c("Using %u fragments per node", c_fragments_per_node_);
    }
  }
  // Optional parameter: leaves the default in place when absent.
  ndb_mgm_get_int_parameter(p, CFG_DB_LCP_TRY_LOCK_TIMEOUT,
                            &c_lcpState.m_lcp_trylock_timeout);

  // Two file slots per table plus two extra (used for the restart-info
  // files, see crestartInfoFile usage elsewhere in this block).
  cfileFileSize = (2 * ctabFileSize) + 2;
  initRecords();
  initialiseRecordsLab(signal, 0, ref, senderData);

  {
    // Two-pass initial node restart support (optional flag).
    Uint32 val = 0;
    ndb_mgm_get_int_parameter(p, CFG_DB_2PASS_INR,
                              &val);
    c_2pass_inr = val ? true : false;
  }

  /**
   * Set API assigned nodegroup(s)
   */
  {
    // First reset every node record; node group defaults to "none".
    NodeRecordPtr nodePtr;
    for (nodePtr.i = 0; nodePtr.i < MAX_NDB_NODES; nodePtr.i++)
    {
      ptrAss(nodePtr, nodeRecord);
      initNodeRecord(nodePtr);
      nodePtr.p->nodeGroup = RNIL;
    }
    initNodeRecoveryStatus();

    // Then walk the cluster configuration and pick up the configured
    // node group for each DB node (if one was assigned).
    ndb_mgm_configuration_iterator * iter =
      m_ctx.m_config.getClusterConfigIterator();
    for(ndb_mgm_first(iter); ndb_mgm_valid(iter); ndb_mgm_next(iter))
    {
      jam();
      Uint32 nodeId;
      Uint32 nodeType;

      ndbrequire(!ndb_mgm_get_int_parameter(iter,CFG_NODE_ID, &nodeId));
      ndbrequire(!ndb_mgm_get_int_parameter(iter,CFG_TYPE_OF_SECTION,
                                            &nodeType));

      if (nodeType == NodeInfo::DB)
      {
        jam();
        Uint32 ng;
        nodePtr.i = nodeId;
        ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
        setNodeRecoveryStatusInitial(nodePtr);
        if (ndb_mgm_get_int_parameter(iter, CFG_DB_NODEGROUP, &ng) == 0)
        {
          jam();
          nodePtr.p->nodeGroup = ng;
        }
        else
        {
          jam();
          nodePtr.p->nodeGroup = RNIL;
        }
      }
    }
  }
  return;
}
1401 
execSTART_COPYREF(Signal * signal)1402 void Dbdih::execSTART_COPYREF(Signal* signal)
1403 {
1404   jamEntry();
1405   ndbrequire(false);
1406 }//Dbdih::execSTART_COPYREF()
1407 
execSTART_FRAGCONF(Signal * signal)1408 void Dbdih::execSTART_FRAGCONF(Signal* signal)
1409 {
1410   (void)signal;  // Don't want compiler warning
1411   /* ********************************************************************* */
1412   /*  If anyone wants to add functionality in this method, be aware that   */
1413   /*  for temporary tables no START_FRAGREQ is sent and therefore no       */
1414   /*  START_FRAGCONF signal will be received for those tables!!            */
1415   /* ********************************************************************* */
1416   jamEntry();
1417   return;
1418 }//Dbdih::execSTART_FRAGCONF()
1419 
/**
 * START_FRAGREF: starting a fragment failed on the starting node.
 * Order NDBCNTR on that node to shut the node down via SYSTEM_ERROR,
 * carrying the original error code.
 */
void Dbdih::execSTART_FRAGREF(Signal* signal)
{
  jamEntry();

  /**
   * Kill starting node
   */
  Uint32 errCode = signal->theData[1];
  Uint32 nodeId = signal->theData[2];

  // Read errCode/nodeId out first: the SystemError is built in place on
  // top of theData[], starting at word 0.
  SystemError * const sysErr = (SystemError*)&signal->theData[0];
  sysErr->errorCode = SystemError::StartFragRefError;
  sysErr->errorRef = reference();
  sysErr->data[0] = errCode;
  sysErr->data[1] = 0;
  sendSignal(calcNdbCntrBlockRef(nodeId), GSN_SYSTEM_ERROR, signal,
	     SystemError::SignalLength, JBB);
  return;
}//Dbdih::execSTART_FRAGREF()
1439 
execSTART_MEREF(Signal * signal)1440 void Dbdih::execSTART_MEREF(Signal* signal)
1441 {
1442   jamEntry();
1443   ndbrequire(false);
1444 }//Dbdih::execSTART_MEREF()
1445 
execTAB_COMMITREQ(Signal * signal)1446 void Dbdih::execTAB_COMMITREQ(Signal* signal)
1447 {
1448   TabRecordPtr tabPtr;
1449   jamEntry();
1450   Uint32 tdictPtr = signal->theData[0];
1451   BlockReference tdictBlockref = signal->theData[1];
1452   tabPtr.i = signal->theData[2];
1453   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
1454 
1455   ndbrequire(tabPtr.p->tabStatus == TabRecord::TS_CREATING);
1456   tabPtr.p->tabStatus = TabRecord::TS_ACTIVE;
1457   tabPtr.p->schemaTransId = 0;
1458   signal->theData[0] = tdictPtr;
1459   signal->theData[1] = cownNodeId;
1460   signal->theData[2] = tabPtr.i;
1461   sendSignal(tdictBlockref, GSN_TAB_COMMITCONF, signal, 3, JBB);
1462   return;
1463 }//Dbdih::execTAB_COMMITREQ()
1464 
1465 /*
1466   3.2   S T A N D A R D   S U B P R O G R A M S   I N   P L E X
1467   *************************************************************
1468   */
1469 /*
1470   3.2.1   S T A R T /  R E S T A R T
1471   **********************************
1472   */
1473 /*****************************************************************************/
1474 /* **********     START / RESTART MODULE                         *************/
1475 /*****************************************************************************/
1476 /*
1477   3.2.1.1    LOADING   O W N   B L O C K  R E F E R E N C E (ABSOLUTE PHASE 1)
1478   *****************************************************************************
1479   */
/**
 * DIH_RESTARTREQ serves two purposes:
 *  - senderRef != 0: NDBCNTR asks this node to start its restart
 *    handling; an initial start answers DIH_RESTARTREF, otherwise we
 *    proceed to read the GCI file.
 *  - senderRef == 0: executed direct; checks that all node groups can
 *    restore to the same GCI.  The answer is returned in theData[0]:
 *    MAX_NDB_NODES when consistent, otherwise the id of the first
 *    node group whose restorable GCI differs.
 */
void Dbdih::execDIH_RESTARTREQ(Signal* signal)
{
  jamEntry();
  const DihRestartReq* req = CAST_CONSTPTR(DihRestartReq,
                                           signal->getDataPtr());
  if (req->senderRef != 0)
  {
    jam();
    cntrlblockref = req->senderRef;
    if(m_ctx.m_config.getInitialStart())
    {
      sendDihRestartRef(signal);
    } else {
      readGciFileLab(signal);
    }
  }
  else
  {
    /**
     * Precondition, (not checked)
     *   atleast 1 node in each node group
     */
    Uint32 i;
    NdbNodeBitmask mask;
    mask.assign(NdbNodeBitmask::Size, req->nodemask);
    const Uint32 *node_gcis = req->node_gcis;
    // One entry per node group; the extra slot keeps the read below
    // in-bounds even when every entry stays zero.
    Uint32 node_group_gcis[MAX_NDB_NODES+1];
    memset(node_group_gcis, 0, sizeof(node_group_gcis));
    // Compute, per node group, the highest GCI any of its nodes can
    // restore to.
    for (i = 0; i<MAX_NDB_NODES; i++)
    {
      if (mask.get(i))
      {
	jam();
	Uint32 ng = Sysfile::getNodeGroup(i, SYSFILE->nodeGroups);
        if (ng != NO_NODE_GROUP_ID)
        {
          ndbrequire(ng < MAX_NDB_NODES);
          Uint32 gci = node_gcis[i];
          if (gci < SYSFILE->lastCompletedGCI[i])
          {
            jam();
            /**
             * Handle case, where *I* know that node complete GCI
             *   but node does not...bug#29167
             *   i.e node died before it wrote own sysfile
             */
            gci = SYSFILE->lastCompletedGCI[i];
          }

          if (gci > node_group_gcis[ng])
          {
            jam();
            node_group_gcis[ng] = gci;
          }
        }
      }
    }
    // Skip node groups with no reported GCI, then compare the rest
    // against the first populated one.
    for (i = 0; i<MAX_NDB_NODES && node_group_gcis[i] == 0; i++);

    Uint32 gci = node_group_gcis[i];
    for (i++ ; i<MAX_NDB_NODES; i++)
    {
      jam();
      if (node_group_gcis[i] && node_group_gcis[i] != gci)
      {
	jam();
	signal->theData[0] = i;  // first node group with a differing GCI
	return;
      }
    }
    signal->theData[0] = MAX_NDB_NODES;  // all node groups agree
    return;
  }
  return;
}//Dbdih::execDIH_RESTARTREQ()
1555 
execSTTOR(Signal * signal)1556 void Dbdih::execSTTOR(Signal* signal)
1557 {
1558   jamEntry();
1559 
1560   Callback c = { safe_cast(&Dbdih::sendSTTORRY), 0 };
1561   m_sendSTTORRY = c;
1562 
1563   switch(signal->theData[1]){
1564   case 1:
1565     createMutexes(signal, 0);
1566     init_lcp_pausing_module();
1567     return;
1568   case 3:
1569     signal->theData[0] = reference();
1570     sendSignal(NDBCNTR_REF, GSN_READ_NODESREQ, signal, 1, JBB);
1571     return;
1572   }
1573 
1574   sendSTTORRY(signal);
1575 }//Dbdih::execSTTOR()
1576 
1577 void
sendSTTORRY(Signal * signal,Uint32 senderData,Uint32 retVal)1578 Dbdih::sendSTTORRY(Signal* signal, Uint32 senderData, Uint32 retVal)
1579 {
1580   signal->theData[0] = 0;
1581   signal->theData[1] = 0;
1582   signal->theData[2] = 0;
1583   signal->theData[3] = 1;   // Next start phase
1584   signal->theData[4] = 3;
1585   signal->theData[5] = 255; // Next start phase
1586   sendSignal(NDBCNTR_REF, GSN_STTORRY, signal, 6, JBB);
1587   return;
1588 }
1589 
1590 /*
1591  * ***************************************************************************
1592  * S E N D I N G   R E P L Y  T O  S T A R T /  R E S T A R T   R E Q U E S T S
1593  * ****************************************************************************
1594  */
ndbsttorry10Lab(Signal * signal,Uint32 _line)1595 void Dbdih::ndbsttorry10Lab(Signal* signal, Uint32 _line)
1596 {
1597   /*-------------------------------------------------------------------------*/
1598   // AN NDB START PHASE HAS BEEN COMPLETED. WHEN START PHASE 6 IS COMPLETED WE
1599   // RECORD THAT THE SYSTEM IS RUNNING.
1600   /*-------------------------------------------------------------------------*/
1601   signal->theData[0] = reference();
1602   sendSignal(cntrlblockref, GSN_NDB_STTORRY, signal, 1, JBB);
1603   return;
1604 }//Dbdih::ndbsttorry10Lab()
1605 
1606 /*
1607 ****************************************
1608 I N T E R N A L  P H A S E S
1609 ****************************************
1610 */
1611 /*---------------------------------------------------------------------------*/
1612 /*NDB_STTOR                              START SIGNAL AT START/RESTART       */
1613 /*---------------------------------------------------------------------------*/
/**
 * NDB_STTOR: internal NDB start phase driver for DIH.  Each phase either
 * completes synchronously (ndbsttorry10Lab) or hands off to an
 * asynchronous protocol that acknowledges later.  The behaviour of most
 * phases depends on the start type (initial start, system restart, node
 * restart, initial node restart).
 */
void Dbdih::execNDB_STTOR(Signal* signal)
{
  jamEntry();
  BlockReference cntrRef = signal->theData[0];    /* SENDERS BLOCK REFERENCE */
  Uint32 ownNodeId = signal->theData[1];          /* OWN PROCESSOR ID*/
  Uint32 phase = signal->theData[2];              /* INTERNAL START PHASE*/
  Uint32 typestart = signal->theData[3];

  cstarttype = typestart;
  cstartPhase = phase;

  switch (phase){
  case ZNDB_SPH1:
    jam();
    /*-----------------------------------------------------------------------*/
    // Compute all static block references in this node as part of
    // ndb start phase 1.
    /*-----------------------------------------------------------------------*/
    cownNodeId = ownNodeId;
    cntrlblockref = cntrRef;
    clocaltcblockref = calcTcBlockRef(ownNodeId);
    clocallqhblockref = calcLqhBlockRef(ownNodeId);
    cdictblockref = calcDictBlockRef(ownNodeId);
    c_lcpState.lcpStallStart = 0;
    NdbTick_Invalidate(&c_lcpState.m_start_lcp_check_time);
    ndbsttorry10Lab(signal, __LINE__);
    break;

  case ZNDB_SPH2:
    jam();
    /*-----------------------------------------------------------------------*/
    // For node restarts we will also add a request for permission
    // to continue the system restart.
    // The permission is given by the master node in the alive set.
    /*-----------------------------------------------------------------------*/
    if (cstarttype == NodeState::ST_INITIAL_NODE_RESTART)
    {
      jam();
      c_set_initial_start_flag = TRUE; // In sysfile...
    }

    if (cstarttype == NodeState::ST_INITIAL_START) {
      jam();
      // setInitialActiveStatus is moved into makeNodeGroups
    } else if (cstarttype == NodeState::ST_SYSTEM_RESTART) {
      jam();
      /*empty*/;
    } else if ((cstarttype == NodeState::ST_NODE_RESTART) ||
               (cstarttype == NodeState::ST_INITIAL_NODE_RESTART)) {
      jam();
      // Node restarts acknowledge later, from the restart protocol.
      nodeRestartPh2Lab(signal);
      return;
    } else {
      ndbrequire(false);
    }//if
    ndbsttorry10Lab(signal, __LINE__);
    return;

  case ZNDB_SPH3:
    jam();
    /*-----------------------------------------------------------------------*/
    // Non-master nodes performing an initial start will execute
    // the start request here since the
    // initial start do not synchronise so much from the master.
    // In the master nodes the start
    // request will be sent directly to dih (in ndb_startreq) when all
    // nodes have completed phase 3 of the start.
    /*-----------------------------------------------------------------------*/
    cmasterState = MASTER_IDLE;
    if(cstarttype == NodeState::ST_INITIAL_START ||
       cstarttype == NodeState::ST_SYSTEM_RESTART){
      jam();
      cmasterState = isMaster() ? MASTER_ACTIVE : MASTER_IDLE;
    }
    if (!isMaster() && cstarttype == NodeState::ST_INITIAL_START) {
      jam();
      ndbStartReqLab(signal, cntrRef);
      return;
    }//if
    ndbsttorry10Lab(signal, __LINE__);
    break;

  case ZNDB_SPH4:
    jam();
    c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
    cmasterTakeOverNode = ZNIL;
    switch(typestart){
    case NodeState::ST_INITIAL_START:
      jam();
      ndbsttorry10Lab(signal, __LINE__);
      return;
    case NodeState::ST_SYSTEM_RESTART:
      jam();
      ndbsttorry10Lab(signal, __LINE__);
      return;
    case NodeState::ST_INITIAL_NODE_RESTART:
    case NodeState::ST_NODE_RESTART:
      jam();

      /***********************************************************************
       * When starting nodes while system is operational we must be controlled
       * by the master. There can be multiple node restarts ongoing, but this
       * phase only allows for one node at a time. So it has to be controlled
       * from the master node.
       *
       * When this signal is confirmed the master has also copied the
       * dictionary and the distribution information.
       */

      g_eventLogger->info("Request copying of distribution and dictionary"
                          " information from master Starting");

      StartMeReq * req = (StartMeReq*)&signal->theData[0];
      req->startingRef = reference();
      req->startingVersion = 0; // Obsolete
      sendSignal(cmasterdihref, GSN_START_MEREQ, signal,
                 StartMeReq::SignalLength, JBB);
      return;
    }
    ndbrequire(false);
    break;
  case ZNDB_SPH5:
    jam();
    switch(typestart){
    case NodeState::ST_INITIAL_START:
    case NodeState::ST_SYSTEM_RESTART:
      jam();
      /*---------------------------------------------------------------------*/
      // WE EXECUTE A LOCAL CHECKPOINT AS A PART OF A SYSTEM RESTART.
      // THE IDEA IS THAT WE NEED TO
      // ENSURE THAT WE CAN RECOVER FROM PROBLEMS CAUSED BY MANY NODE
      // CRASHES THAT CAUSES THE LOG
      // TO GROW AND THE NUMBER OF LOG ROUNDS TO EXECUTE TO GROW.
      // THIS CAN OTHERWISE GET US INTO
      // A SITUATION WHICH IS UNREPAIRABLE. THUS WE EXECUTE A CHECKPOINT
      // BEFORE ALLOWING ANY TRANSACTIONS TO START.
      /*---------------------------------------------------------------------*/
      if (!isMaster()) {
	jam();
	ndbsttorry10Lab(signal, __LINE__);
	return;
      }//if

      infoEvent("Make On-line Database recoverable by waiting for LCP"
                " Starting, LCP id = %u",
                SYSFILE->latestLCP_ID + 1);

      c_lcpState.immediateLcpStart = true;
      cwaitLcpSr = true;
      checkLcpStart(signal, __LINE__, 0);
      return;
    case NodeState::ST_NODE_RESTART:
    case NodeState::ST_INITIAL_NODE_RESTART:
      jam();
      {
        // Start copying fragment data to this node; an old master
        // without parallel node recovery support must drive the copy
        // itself.
        StartCopyReq* req = (StartCopyReq*)signal->getDataPtrSend();
        req->senderRef = reference();
        req->senderData = RNIL;
        req->flags = StartCopyReq::WAIT_LCP;
        req->startingNodeId = getOwnNodeId();
        if (!ndb_pnr(getNodeInfo(refToNode(cmasterdihref)).m_version))
        {
          jam();
          infoEvent("Detecting upgrade: Master(%u) does not support parallel"
                    " node recovery",
                    refToNode(cmasterdihref));
          sendSignal(cmasterdihref, GSN_START_COPYREQ, signal,
                     StartCopyReq::SignalLength, JBB);
        }
        else
        {
          sendSignal(reference(), GSN_START_COPYREQ, signal,
                     StartCopyReq::SignalLength, JBB);
        }
      }
      return;
    }
    ndbrequire(false);
  case ZNDB_SPH6:
    jam();
    switch(typestart){
    case NodeState::ST_INITIAL_START:
    case NodeState::ST_SYSTEM_RESTART:
      jam();
      // Only the master starts the global checkpoint protocol.
      if(isMaster()){
	jam();
	startGcp(signal);
      }
      ndbsttorry10Lab(signal, __LINE__);
      return;
    case NodeState::ST_NODE_RESTART:
    case NodeState::ST_INITIAL_NODE_RESTART:
      ndbsttorry10Lab(signal, __LINE__);
      return;
    }
    ndbrequire(false);
    break;
  default:
    jam();
    // Remaining phases require no DIH work.
    ndbsttorry10Lab(signal, __LINE__);
    break;
  }//switch
}//Dbdih::execNDB_STTOR()
1817 
1818 void
execNODE_START_REP(Signal * signal)1819 Dbdih::execNODE_START_REP(Signal* signal)
1820 {
1821   /*
1822    * Send DICT_UNLOCK_ORD when this node is SL_STARTED.
1823    *
1824    * Sending it before (sp 7) conflicts with code which assumes
1825    * SL_STARTING means we are in copy phase of NR.
1826    *
1827    * NodeState::starting.restartType is not supposed to be used
1828    * when SL_STARTED.  Also it seems NODE_START_REP can arrive twice.
1829    *
1830    * For these reasons there are no consistency checks and
1831    * we rely on c_dictLockSlavePtrI_nodeRestart alone.
1832    */
1833   if (signal->theData[0] == getOwnNodeId())
1834   {
1835     /**
1836      * With parallel node restart, only unlock self, if it's self that has
1837      *   started
1838      */
1839     jam();
1840     if (c_dictLockSlavePtrI_nodeRestart != RNIL) {
1841       sendDictUnlockOrd(signal, c_dictLockSlavePtrI_nodeRestart);
1842       c_dictLockSlavePtrI_nodeRestart = RNIL;
1843     }
1844   }
1845   setGCPStopTimeouts();
1846 }
1847 
1848 void
createMutexes(Signal * signal,Uint32 count)1849 Dbdih::createMutexes(Signal * signal, Uint32 count){
1850   Callback c = { safe_cast(&Dbdih::createMutex_done), count };
1851 
1852   switch(count){
1853   case 0:{
1854     Mutex mutex(signal, c_mutexMgr, c_startLcpMutexHandle);
1855     mutex.create(c);
1856     return;
1857   }
1858   case 1:{
1859     Mutex mutex(signal, c_mutexMgr, c_switchPrimaryMutexHandle);
1860     mutex.create(c);
1861     return;
1862   }
1863   case 2:{
1864     Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
1865     mutex.create(c);
1866     return;
1867   }
1868   }
1869 
1870   execute(signal, m_sendSTTORRY, 0);
1871 }
1872 
1873 void
createMutex_done(Signal * signal,Uint32 senderData,Uint32 retVal)1874 Dbdih::createMutex_done(Signal* signal, Uint32 senderData, Uint32 retVal){
1875   jamEntry();
1876   ndbrequire(retVal == 0);
1877 
1878   switch(senderData){
1879   case 0:{
1880     Mutex mutex(signal, c_mutexMgr, c_startLcpMutexHandle);
1881     mutex.release();
1882     break;
1883   }
1884   case 1:{
1885     Mutex mutex(signal, c_mutexMgr, c_switchPrimaryMutexHandle);
1886     mutex.release();
1887     break;
1888   }
1889   case 2:{
1890     Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
1891     mutex.release();
1892     break;
1893   }
1894   }
1895 
1896   createMutexes(signal, senderData + 1);
1897 }
1898 
1899 /*****************************************************************************/
1900 /* ------------------------------------------------------------------------- */
1901 /*       WE HAVE BEEN REQUESTED BY NDBCNTR TO PERFORM A RESTART OF THE       */
1902 /*       DATABASE TABLES.                                                    */
1903 /*       THIS SIGNAL IS SENT AFTER COMPLETING PHASE 3 IN ALL BLOCKS IN A     */
1904 /*       SYSTEM RESTART. WE WILL ALSO JUMP TO THIS LABEL FROM PHASE 3 IN AN  */
1905 /*       INITIAL START.                                                      */
1906 /* ------------------------------------------------------------------------- */
1907 /*****************************************************************************/
execNDB_STARTREQ(Signal * signal)1908 void Dbdih::execNDB_STARTREQ(Signal* signal)
1909 {
1910   jamEntry();
1911   BlockReference ref = signal->theData[0];
1912   cstarttype = signal->theData[1];
1913   ndbStartReqLab(signal, ref);
1914 }//Dbdih::execNDB_STARTREQ()
1915 
ndbStartReqLab(Signal * signal,BlockReference ref)1916 void Dbdih::ndbStartReqLab(Signal* signal, BlockReference ref)
1917 {
1918   cndbStartReqBlockref = ref;
1919   if (cstarttype == NodeState::ST_INITIAL_START) {
1920     jam();
1921     initRestartInfo(signal);
1922     initGciFilesLab(signal);
1923     return;
1924   }
1925 
1926   NodeRecordPtr nodePtr;
1927   Uint32 gci = SYSFILE->lastCompletedGCI[getOwnNodeId()];
1928   for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++)
1929   {
1930     jam();
1931     ptrAss(nodePtr, nodeRecord);
1932     if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci)
1933     {
1934       jam();
1935       /**
1936        * Since we're starting(is master) and there
1937        *   there are other nodes with higher GCI...
1938        *   their gci's must be invalidated...
1939        *   and they _must_ do an initial start
1940        *   indicate this by setting lastCompletedGCI = 0
1941        */
1942       SYSFILE->lastCompletedGCI[nodePtr.i] = 0;
1943       ndbrequire(nodePtr.p->nodeStatus != NodeRecord::ALIVE);
1944       warningEvent("Making filesystem for node %d unusable (need --initial)",
1945 		   nodePtr.i);
1946     }
1947     else if (nodePtr.p->nodeStatus == NodeRecord::ALIVE &&
1948 	     SYSFILE->lastCompletedGCI[nodePtr.i] == 0)
1949     {
1950       jam();
1951       CRASH_INSERTION(7170);
1952       char buf[255];
1953       BaseString::snprintf(buf, sizeof(buf),
1954 			   "Cluster requires this node to be started "
1955 			   " with --initial as partial start has been performed"
1956 			   " and this filesystem is unusable");
1957       progError(__LINE__,
1958 		NDBD_EXIT_SR_RESTARTCONFLICT,
1959 		buf);
1960       ndbrequire(false);
1961     }
1962   }
1963 
1964   /**
1965    * This set which GCI we will try to restart to
1966    */
1967   SYSFILE->newestRestorableGCI = gci;
1968   infoEvent("Restarting cluster to GCI: %u", gci);
1969 
1970   ndbrequire(isMaster());
1971   copyGciLab(signal, CopyGCIReq::RESTART); // We have already read the file!
1972 }//Dbdih::ndbStartReqLab()
1973 
void Dbdih::execREAD_NODESCONF(Signal* signal)
{
  /**
   * Reply carrying the set of configured nodes, which of them are
   * currently inactive, and the master node id. Builds the local node
   * array, decides whether 2-pass initial node restart copying can be
   * used, cross-checks the node set against the Sysfile for restarts,
   * and creates node groups on initial start.
   */
  unsigned i;
  ReadNodesConf * const readNodes = (ReadNodesConf *)&signal->theData[0];
  jamEntry();
  Uint32 nodeArray[MAX_NDB_NODES+1];

  csystemnodes  = readNodes->noOfNodes;
  cmasterNodeId = readNodes->masterNodeId;
  unsigned index = 0;
  NdbNodeBitmask tmp; tmp.assign(2, readNodes->allNodes);
  // Collect all defined node ids; count nodes that are currently on-line.
  for (i = 1; i < MAX_NDB_NODES; i++){
    jam();
    if(tmp.get(i)){
      jam();
      nodeArray[index] = i;
      if(NdbNodeBitmask::get(readNodes->inactiveNodes, i) == false){
        jam();
        con_lineNodes++;
      }//if
      index++;
    }//if
  }//for
  nodeArray[index] = RNIL; // terminate

  if (c_2pass_inr)
  {
    jam();
    /**
     * 2-pass INR requires every active node to support non-transactional
     * COPY_FRAG, and all nodes to agree on the LQH worker count when
     * this node runs more than one worker.
     */
    Uint32 workers = getNodeInfo(getOwnNodeId()).m_lqh_workers;
#ifdef VM_TRACE
    printf("Checking 2-pass initial node restart: ");
#endif
    for (i = 0; i<index; i++)
    {
      if (NdbNodeBitmask::get(readNodes->inactiveNodes, nodeArray[i]))
        continue;

      if (!ndbd_non_trans_copy_frag_req(getNodeInfo(nodeArray[i]).m_version))
      {
        jam();
        c_2pass_inr = false;
#ifdef VM_TRACE
        printf("not ok (version node %u) => disabled\n", nodeArray[i]);
#endif
        break;
      }

      if (workers > 1 &&
          workers != getNodeInfo(nodeArray[i]).m_lqh_workers)
      {
        c_2pass_inr = false;
#ifdef VM_TRACE
        printf("not ok (different worker cnt node %u) => disabled\n",
               nodeArray[i]);
#endif
        break;
      }
    }
    if (c_2pass_inr)
    {
#ifdef VM_TRACE
      ndbout_c("ok");
#endif
    }

    /**
     * Note: In theory it would be ok for just nodes that we plan to copy from
     *   supported this...but in e.g a 3/4-replica scenario,
     *      if one of the nodes does, and the other doesn't, we don't
     *      have enough infrastructure to easily check this...
     *      therefore we require all nodes to support it.
     */
  }

  if(cstarttype == NodeState::ST_SYSTEM_RESTART ||
     cstarttype == NodeState::ST_NODE_RESTART)
  {
    // Cross-check the configured node set against the node status stored
    // in the Sysfile; only benign combinations may continue.
    for(i = 1; i<MAX_NDB_NODES; i++){
      const Uint32 stat = Sysfile::getNodeStatus(i, SYSFILE->nodeStatus);
      if(stat == Sysfile::NS_NotDefined && !tmp.get(i))
      {
	jam();
	continue;
      }

      if(tmp.get(i) && stat != Sysfile::NS_NotDefined)
      {
	jam();
	continue;
      }

      if (stat == Sysfile::NS_NotDefined && tmp.get(i))
      {
        jam();
        infoEvent("Discovered new node %u", i);
        continue;
      }

      if (stat == Sysfile::NS_Configured && !tmp.get(i))
      {
        jam();
        infoEvent("Configured node %u not present, ignoring",
                  i);
        continue;
      }

      // Remaining case: a node known to the Sysfile (with a node group)
      // has been removed from the configuration — refuse to start.
      char buf[255];
      BaseString::snprintf(buf, sizeof(buf),
                           "Illegal configuration change."
                           " Initial start needs to be performed "
                           " when removing nodes with nodegroup (node %d)", i);
      progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
    }
  }

  ndbrequire(csystemnodes >= 1 && csystemnodes < MAX_NDB_NODES);

  cmasterdihref = calcDihBlockRef(cmasterNodeId);
  /*-------------------------------------------------------------------------*/
  /* MAKE THE LIST OF PRN-RECORD WHICH IS ONE OF THE NODES-LIST IN THIS BLOCK*/
  /*-------------------------------------------------------------------------*/
  makePrnList(readNodes, nodeArray);
  if (cstarttype == NodeState::ST_INITIAL_START) {
    jam();
    /**----------------------------------------------------------------------
     * WHEN WE INITIALLY START A DATABASE WE WILL CREATE NODE GROUPS.
     * ALL NODES ARE PUT INTO NODE GROUPS ALTHOUGH HOT SPARE NODES ARE PUT
     * INTO A SPECIAL NODE GROUP. IN EACH NODE GROUP WE HAVE THE SAME AMOUNT
     * OF NODES AS THERE ARE NUMBER OF REPLICAS.
     * ONE POSSIBLE USAGE OF NODE GROUPS ARE TO MAKE A NODE GROUP A COMPLETE
     * FRAGMENT OF THE DATABASE. THIS MEANS THAT ALL REPLICAS WILL BE STORED
     * IN THE NODE GROUP.
     *-----------------------------------------------------------------------*/
    makeNodeGroups(nodeArray);
  }//if
  ndbrequire(checkNodeAlive(cmasterNodeId));

  /**
   * Keep bitmap of nodes that can be restored...
   *   and nodes that need take-over
   *
   */
  m_sr_nodes.clear();
  m_to_nodes.clear();

  // Start with assumption that all can restore
  {
    NodeRecordPtr specNodePtr;
    specNodePtr.i = cfirstAliveNode;
    do {
      jam();
      m_sr_nodes.set(specNodePtr.i);
      ptrCheckGuard(specNodePtr, MAX_NDB_NODES, nodeRecord);
      specNodePtr.i = specNodePtr.p->nextNode;
    } while (specNodePtr.i != RNIL);
  }

  execute(signal, m_sendSTTORRY, 0);
}//Dbdih::execREAD_NODESCONF()
2134 
2135 /*---------------------------------------------------------------------------*/
2136 /*                    START NODE LOGIC FOR NODE RESTART                      */
2137 /*---------------------------------------------------------------------------*/
nodeRestartPh2Lab(Signal * signal)2138 void Dbdih::nodeRestartPh2Lab(Signal* signal)
2139 {
2140   /*
2141    * Lock master DICT to avoid metadata operations during INR/NR.
2142    * Done just before START_PERMREQ.
2143    *
2144    * It would be more elegant to do this just before START_MEREQ.
2145    * The problem is, on INR we end up in massive invalidateNodeLCP
2146    * which is not fully protected against metadata ops.
2147    */
2148   ndbrequire(c_dictLockSlavePtrI_nodeRestart == RNIL);
2149 
2150   // check that we are not yet taking part in schema ops
2151   CRASH_INSERTION(7174);
2152 
2153   Uint32 lockType = DictLockReq::NodeRestartLock;
2154   Callback c = { safe_cast(&Dbdih::recvDictLockConf_nodeRestart), 0 };
2155   sendDictLockReq(signal, lockType, c);
2156 }
2157 
recvDictLockConf_nodeRestart(Signal * signal,Uint32 data,Uint32 ret)2158 void Dbdih::recvDictLockConf_nodeRestart(Signal* signal, Uint32 data, Uint32 ret)
2159 {
2160   ndbrequire(c_dictLockSlavePtrI_nodeRestart == RNIL);
2161   ndbrequire(data != RNIL);
2162   c_dictLockSlavePtrI_nodeRestart = data;
2163 
2164   nodeRestartPh2Lab2(signal);
2165 }
2166 
nodeRestartPh2Lab2(Signal * signal)2167 void Dbdih::nodeRestartPh2Lab2(Signal* signal)
2168 {
2169   /*------------------------------------------------------------------------*/
2170   // REQUEST FOR PERMISSION FROM MASTER TO START A NODE IN AN ALREADY
2171   // RUNNING SYSTEM.
2172   /*------------------------------------------------------------------------*/
2173 
2174   g_eventLogger->info("Request permission to start our node from master Starting");
2175 
2176   StartPermReq * const req = (StartPermReq *)&signal->theData[0];
2177 
2178   req->blockRef  = reference();
2179   req->nodeId    = cownNodeId;
2180   req->startType = cstarttype;
2181   sendSignal(cmasterdihref, GSN_START_PERMREQ, signal, 3, JBB);
2182 
2183   if (ERROR_INSERTED(7203))
2184   {
2185     signal->theData[0] = 9999;
2186     sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 200, 1);
2187   }
2188 }
2189 
void Dbdih::execSTART_PERMCONF(Signal* signal)
{
  /**
   * Master granted permission to perform our node restart.
   * theData[0] = starting node id (must be us), theData[1] = failure
   * number, theData[2] = micro GCP enabled flag (absent from older
   * masters that send a shorter signal).
   */
  jamEntry();
  CRASH_INSERTION(7121);
  Uint32 nodeId = signal->theData[0];
  cfailurenr = signal->theData[1];

  bool microGCP = signal->theData[2];
  if (signal->getLength() < StartPermConf::SignalLength)
  {
    // Older master without the microGCP word: treat as disabled.
    microGCP = false;
  }
  m_micro_gcp.m_enabled = microGCP;
  ndbrequire(nodeId == cownNodeId);
  ndbsttorry10Lab(signal, __LINE__);

  if (m_micro_gcp.m_enabled)
  {
    jam();
    // Tell QMGR to enable the micro GCP protocol variant.
    UpgradeProtocolOrd * ord = (UpgradeProtocolOrd*)signal->getDataPtrSend();
    ord->type = UpgradeProtocolOrd::UPO_ENABLE_MICRO_GCP;
    EXECUTE_DIRECT(QMGR,GSN_UPGRADE_PROTOCOL_ORD,signal,signal->getLength());
  }
  else if(isMultiThreaded())
  {
    /**
     * Prevent this start, as there is some non-thread-safe upgrade code for
     * this case in LQH.
     */
    progError(__LINE__, NDBD_EXIT_SR_RESTARTCONFLICT,
              "Cluster requires that all old data nodes are upgraded "
              "while running single-threaded ndbd before starting "
              "multi-threaded ndbmtd data nodes.");
  }

  g_eventLogger->info("Request permission to start our node from master Completed");

}//Dbdih::execSTART_PERMCONF()
2228 
execSTART_PERMREF(Signal * signal)2229 void Dbdih::execSTART_PERMREF(Signal* signal)
2230 {
2231   jamEntry();
2232   Uint32 errorCode = signal->theData[1];
2233   if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR ||
2234       errorCode == StartPermRef::ZNODE_START_DISALLOWED_ERROR) {
2235     jam();
2236     /*-----------------------------------------------------------------------*/
2237     // The master was busy adding another node. We will wait for a few
2238     // seconds and try again.
2239     /*-----------------------------------------------------------------------*/
2240     g_eventLogger->info("Did not get permission to start (%u) retry in 3s",
2241                         errorCode);
2242     signal->theData[0] = DihContinueB::ZSTART_PERMREQ_AGAIN;
2243     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 3000, 1);
2244     return;
2245   }//if
2246 
2247   if (errorCode == StartPermRef::InitialStartRequired)
2248   {
2249     CRASH_INSERTION(7170);
2250     char buf[255];
2251     BaseString::snprintf(buf, sizeof(buf),
2252 			 "Cluster requires this node to be started "
2253 			 " with --initial as partial start has been performed"
2254 			 " and this filesystem is unusable");
2255     progError(__LINE__,
2256 	      NDBD_EXIT_SR_RESTARTCONFLICT,
2257 	      buf);
2258     ndbrequire(false);
2259   }
2260 
2261   /*------------------------------------------------------------------------*/
2262   // Some node process in another node involving our node was still active. We
2263   // will recover from this by crashing here.
2264   // This is controlled restart using the
2265   // already existing features of node crashes. It is not a bug getting here.
2266   /*-------------------------------------------------------------------------*/
2267   ndbrequire(false);
2268   return;
2269 }//Dbdih::execSTART_PERMREF()
2270 
2271 /*---------------------------------------------------------------------------*/
2272 /*       THIS SIGNAL IS RECEIVED IN THE STARTING NODE WHEN THE START_MEREQ   */
2273 /*       HAS BEEN EXECUTED IN THE MASTER NODE.                               */
2274 /*---------------------------------------------------------------------------*/
void Dbdih::execSTART_MECONF(Signal* signal)
{
  /**
   * One chunk of the master's Sysfile image for our node restart.
   * The data arrives in fixed-size pieces (StartMeConf::DATA_SIZE words
   * at offset startWord); we accumulate into cdata[] until all
   * SYSFILE_SIZE32 words have been received, then install the image.
   */
  jamEntry();
  StartMeConf * const startMe = (StartMeConf *)&signal->theData[0];
  Uint32 nodeId = startMe->startingNodeId;
  const Uint32 startWord = startMe->startWord;
  Uint32 i;

  CRASH_INSERTION(7130);
  ndbrequire(nodeId == cownNodeId);
  arrGuard(startWord + StartMeConf::DATA_SIZE, sizeof(cdata)/4);
  for(i = 0; i < StartMeConf::DATA_SIZE; i++)
    cdata[startWord+i] = startMe->data[i];

  if(startWord + StartMeConf::DATA_SIZE < Sysfile::SYSFILE_SIZE32){
    jam();
    /**
     * We are still waiting for data
     */
    return;
  }
  jam();

  /**
   * Copy into sysfile
   *
   * But dont copy lastCompletedGCI:s
   */
  // Save our own restart sequence and per-node lastCompletedGCIs so they
  // survive the wholesale overwrite from the master's image below.
  Uint32 key = SYSFILE->m_restart_seq;
  Uint32 tempGCP[MAX_NDB_NODES];
  for (i = 0; i < MAX_NDB_NODES; i++)
    tempGCP[i] = SYSFILE->lastCompletedGCI[i];

  for (i = 0; i < Sysfile::SYSFILE_SIZE32; i++)
    sysfileData[i] = cdata[i];

  SYSFILE->m_restart_seq = key;
  for (i = 0; i < MAX_NDB_NODES; i++)
    SYSFILE->lastCompletedGCI[i] = tempGCP[i];

  // Rebuild node active status and node groups from the new Sysfile.
  setNodeActiveStatus();
  setNodeGroups();

  g_eventLogger->info("Request copying of distribution and dictionary"
                      " information from master Completed");

  ndbsttorry10Lab(signal, __LINE__);

  if (getNodeActiveStatus(getOwnNodeId()) == Sysfile::NS_Configured)
  {
    jam();
    // We are a fully configured node: no initial-start flag needed.
    c_set_initial_start_flag = FALSE;
  }
}//Dbdih::execSTART_MECONF()
2329 
execSTART_COPYCONF(Signal * signal)2330 void Dbdih::execSTART_COPYCONF(Signal* signal)
2331 {
2332   jamEntry();
2333 
2334   StartCopyConf* conf = (StartCopyConf*)signal->getDataPtr();
2335   Uint32 nodeId = conf->startingNodeId;
2336   Uint32 senderData = conf->senderData;
2337 
2338   if (!ndb_pnr(getNodeInfo(refToNode(signal->getSendersBlockRef())).m_version))
2339   {
2340     jam();
2341     senderData = RNIL;
2342   }
2343 
2344   if (senderData == RNIL)
2345   {
2346     /**
2347      * This is NR
2348      */
2349     jam();
2350 
2351     g_eventLogger->info("Make On-line Database recoverable by waiting for"
2352                         " LCP Completed, LCP id = %u",
2353                         SYSFILE->latestLCP_ID);
2354 
2355     ndbrequire(nodeId == cownNodeId);
2356     CRASH_INSERTION(7132);
2357     ndbsttorry10Lab(signal, __LINE__);
2358   }
2359   else
2360   {
2361     /**
2362      * This is TO during SR...waiting for all nodes
2363      */
2364     infoEvent("Make On-line Database recoverable by waiting for LCP Completed"
2365               " on node %u, LCP id = %u",
2366               nodeId,
2367               SYSFILE->latestLCP_ID);
2368 
2369     ndbrequire(senderData == getOwnNodeId());
2370     ndbrequire(m_to_nodes.get(nodeId));
2371     m_to_nodes.clear(nodeId);
2372     m_sr_nodes.set(nodeId);
2373     if (!m_to_nodes.isclear())
2374     {
2375       jam();
2376       return;
2377     }
2378 
2379     infoEvent("Restore Database from disk Completed");
2380 
2381     signal->theData[0] = reference();
2382     m_sr_nodes.copyto(NdbNodeBitmask::Size, signal->theData+1);
2383     sendSignal(cntrlblockref, GSN_NDB_STARTCONF, signal,
2384                1 + NdbNodeBitmask::Size, JBB);
2385     return;
2386   }
2387   return;
2388 }//Dbdih::execSTART_COPYCONF()
2389 
2390 /*---------------------------------------------------------------------------*/
2391 /*                    MASTER LOGIC FOR NODE RESTART                          */
2392 /*---------------------------------------------------------------------------*/
2393 /*                    NODE RESTART PERMISSION REQUEST                        */
2394 /*---------------------------------------------------------------------------*/
2395 // A REQUEST FROM A STARTING NODE TO PERFORM A NODE RESTART. IF NO OTHER NODE
2396 // IS ACTIVE IN PERFORMING A NODE RESTART AND THERE ARE NO ACTIVE PROCESSES IN
2397 // THIS NODE INVOLVING THE STARTING NODE  THIS REQUEST WILL BE GRANTED.
2398 /*---------------------------------------------------------------------------*/
void Dbdih::execSTART_PERMREQ(Signal* signal)
{
  /**
   * Master-side handling of a starting node's request for permission to
   * perform a node restart. Granted only when no LCP master takeover is
   * in progress, no other node restart is active, the node is allowed
   * to start, is DEAD, and its filesystem is usable (or it performs an
   * initial node restart). On success the inclusion procedure starts
   * with START_INFOREQ to all participants.
   */
  StartPermReq * const req = (StartPermReq*)&signal->theData[0];
  jamEntry();
  const BlockReference retRef = req->blockRef;
  const Uint32 nodeId   = req->nodeId;
  const Uint32 typeStart = req->startType;
  CRASH_INSERTION(7122);
  ndbrequire(isMaster());
  ndbrequire(refToNode(retRef) == nodeId);
  // Deny while an LCP master takeover is in progress.
  if (c_lcpMasterTakeOverState.state != LMTOS_IDLE)
  {
    jam();
    infoEvent("DIH : Denied request for start permission from %u "
              "while LCP Master takeover in progress.",
              nodeId);
    g_eventLogger->info("DIH : Denied request for start permission from %u "
                        "while LCP Master takeover in progress.",
                        nodeId);
    signal->theData[0] = nodeId;
    signal->theData[1] = StartPermRef::ZNODE_START_DISALLOWED_ERROR;
    sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
    return;
  }
  // Deny while another node restart is already being handled.
  if ((c_nodeStartMaster.activeState) ||
      (c_nodeStartMaster.wait != ZFALSE) ||
      ERROR_INSERTED_CLEAR(7175)) {
    jam();
    signal->theData[0] = nodeId;
    signal->theData[1] = StartPermRef::ZNODE_ALREADY_STARTING_ERROR;
    sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
    return;
  }//if

  if (!getAllowNodeStart(nodeId))
  {
    jam();
    g_eventLogger->info("Rejecting attempt to start node %u", nodeId);
ref:
    // Shared refusal path (also reached via goto from the DEAD check).
    signal->theData[0] = nodeId;
    signal->theData[1] = StartPermRef::ZNODE_START_DISALLOWED_ERROR;
    sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
    return;
  }
  if (getNodeStatus(nodeId) != NodeRecord::DEAD)
  {
    jam();
    g_eventLogger->error("nodeStatus in START_PERMREQ = %u",
                         (Uint32) getNodeStatus(nodeId));
    goto ref;
  }//if

  // A zero lastCompletedGCI means the node's filesystem was invalidated:
  // it must perform an initial node restart.
  if (SYSFILE->lastCompletedGCI[nodeId] == 0 &&
      typeStart != NodeState::ST_INITIAL_NODE_RESTART)
  {
    jam();
    signal->theData[0] = nodeId;
    signal->theData[1] = StartPermRef::InitialStartRequired;
    sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
    return;
  }

  /*----------------------------------------------------------------------
   * WE START THE INCLUSION PROCEDURE
   * ---------------------------------------------------------------------*/
  c_nodeStartMaster.failNr   = cfailurenr;
  c_nodeStartMaster.wait     = ZFALSE;
  c_nodeStartMaster.startInfoErrorCode = 0;
  c_nodeStartMaster.startNode = nodeId;
  c_nodeStartMaster.activeState = true;
  c_nodeStartMaster.m_outstandingGsn =  GSN_START_INFOREQ;

  setNodeStatus(nodeId, NodeRecord::STARTING);
  /**
   * But if it's a NodeState::ST_INITIAL_NODE_RESTART
   *
   * We first have to clear LCP's
   * For normal node restart we simply ensure that all nodes
   * are informed of the node restart
   */
  StartInfoReq *const r =(StartInfoReq*)&signal->theData[0];
  r->startingNodeId = nodeId;
  r->typeStart = typeStart;
  r->systemFailureNo = cfailurenr;
  sendLoopMacro(START_INFOREQ, sendSTART_INFOREQ, RNIL);
}//Dbdih::execSTART_PERMREQ()
2485 
execSTART_INFOREF(Signal * signal)2486 void Dbdih::execSTART_INFOREF(Signal* signal)
2487 {
2488   StartInfoRef * ref = (StartInfoRef*)&signal->theData[0];
2489   if (getNodeStatus(ref->startingNodeId) != NodeRecord::STARTING) {
2490     jam();
2491     return;
2492   }//if
2493   ndbrequire(c_nodeStartMaster.startNode == ref->startingNodeId);
2494   c_nodeStartMaster.startInfoErrorCode = ref->errorCode;
2495   startInfoReply(signal, ref->sendingNodeId);
2496 }//Dbdih::execSTART_INFOREF()
2497 
execSTART_INFOCONF(Signal * signal)2498 void Dbdih::execSTART_INFOCONF(Signal* signal)
2499 {
2500   jamEntry();
2501   StartInfoConf * conf = (StartInfoConf*)&signal->theData[0];
2502   if (getNodeStatus(conf->startingNodeId) != NodeRecord::STARTING) {
2503     jam();
2504     return;
2505   }//if
2506   ndbrequire(c_nodeStartMaster.startNode == conf->startingNodeId);
2507   startInfoReply(signal, conf->sendingNodeId);
2508 }//Dbdih::execSTART_INFOCONF()
2509 
void Dbdih::startInfoReply(Signal* signal, Uint32 nodeId)
{
  /**
   * Account for one START_INFO reply (CONF or REF) from nodeId.
   * NOTE(review): receiveLoopMacro presumably returns from this function
   * while replies are still outstanding — confirm against the macro
   * definition; the code below runs only when all replies have arrived.
   */
  receiveLoopMacro(START_INFOREQ, nodeId);
  /**
   * We're finished with the START_INFOREQ's
   */
  if (c_nodeStartMaster.startInfoErrorCode == 0)
  {
    jam();
    /**
     * Everything has been a success so far
     *
     * Update node recovery status that we now have received permission to
     * perform node restart from all live nodes. This code only executes
     * in the master node.
     */
    setNodeRecoveryStatus(c_nodeStartMaster.startNode,
                          NodeRecord::START_PERMITTED);

    // Grant: send START_PERMCONF to the starting node.
    StartPermConf * conf = (StartPermConf*)&signal->theData[0];
    conf->startingNodeId = c_nodeStartMaster.startNode;
    conf->systemFailureNo = cfailurenr;
    conf->microGCP = m_micro_gcp.m_enabled;
    sendSignal(calcDihBlockRef(c_nodeStartMaster.startNode),
               GSN_START_PERMCONF, signal, StartPermConf::SignalLength, JBB);
    c_nodeStartMaster.m_outstandingGsn = GSN_START_PERMCONF;
  }
  else
  {
    /**
     * Failure of START_INFO protocol, another node wasn't ready to
     * start this node, some part of handling a previous node failure
     * hadn't completed yet. The node will have to wait a bit more.
     * We need to restore the state such that the retry is possible.
     */
    jam();
    StartPermRef * ref = (StartPermRef*)&signal->theData[0];
    ref->startingNodeId = c_nodeStartMaster.startNode;
    ref->errorCode = c_nodeStartMaster.startInfoErrorCode;
    sendSignal(calcDihBlockRef(c_nodeStartMaster.startNode),
	       GSN_START_PERMREF, signal, StartPermRef::SignalLength, JBB);
    setNodeStatus(c_nodeStartMaster.startNode, NodeRecord::DEAD);
    nodeResetStart(signal);
  }//if
}//Dbdih::startInfoReply()
2555 
2556 /**
2557  *---------------------------------------------------------------------------
2558  * LCP Pausing module
2559  * ------------------
2560  *
2561  * This module contains code that executes for the purpose of pausing
2562  * LCP reporting to our meta data for a short time while we are copying the
2563  * meta data to a new starting node.
2564  *
2565  * In order to better understand the handling of the LCP protocol we will
2566  * describe the LCP protocol, this includes both the old and the new protocol.
2567  *
2568  * The LCP protocol is controlled by the DIH in the master node.
2569  * When an LCP has been completed we will immediately start checking for
2570  * the need for a new LCP to be started.
2571  *
2572  * The first step here is to ensure that we have had sufficient activity in
2573  * the cluster to necessitate an LCP to be executed again.
2574  *
2575  * To check this we send TCGETOPSIZEREQ to all DBTCs in the cluster. This
2576  * will gather in an estimate of how much writes we've had in the cluster
2577  * since the last LCP was started. There are also various ways to ensure
2578  * that we start an LCP immediately if so needed.
2579  *
2580  * If the activity was sufficient we will start the LCP.
2581  * Before starting the LCP we will calculate a number of GCI values that
2582  * are important, oldest restorable GCI and so forth.
2583  * Next we will send TC_CLOPSIZEREQ to all DBTCs in the cluster to clear
2584  * the activity counter in DBTC as preparation for the next LCP start.
2585  *
2586  * In the old way we will then grab a mutex on the fragment info, this
2587  * mutex will be held until the LCP is completed. The mutex is held in
2588  * the master node, in a master takeover the mutex needs to be taken
2589  * also in the new master node. Since all LCPs goes through the master
2590  * node this has the same effect as a distributed mutex on the fragment
2591  * info.
2592  *
2593  * In the new way we will start the LCP immediately here without grabbing
2594  * the mutex.
2595  *
2596  * The first step in starting is to calculate the set of LQHs involved in
2597  * the LCP and the set of DIHs involved in the LCP. A node is involved in
2598  * the LCP in DIH if it has had the meta data copied to it. It will
2599  * participate in an LCP in LQH if the data has been restored and we're
2600  * ready to perform a full LCP.
2601  *
2602  * Next we update to the new LCP id of the new LCP.
2603  *
2604  * The next step is performed in the master node by walking through all
2605  * fragment replicas of all active tables to see how much of the REDO log
2606  * we can cut away when starting the new LCP. At the first order of a
2607  * LCP of a fragment in an LDM instance we will set the new log tail in
2608  * that LDM instance.
2609  *
2610  * After calculating the new GCI values and setting the LCP id we will
2611  * synchronize this information with all other nodes in the cluster.
2612  * This information will also be synchronized to the file system in
2613  * the Sysfile. This file is where all restarts start by looking at
2614  * the state of the our database on files.
2615  * The COPY_GCIREQ signal is used to distribute this message.
2616  *
2617  * When all nodes have synchronized this information to disk and confirmed
2618  * this to the master then we are ready to start sending orders to perform
2619  * the individual checkpoints of the fragment replicas.
2620  *
2621  * The next step is that we want to set the tables to be involved in the
2622  * LCP. At this point we want to ensure that the same set of tables is
2623  * calculated in all nodes. To ensure this we grab the mutex that ensures
2624  * no tables are able to commit their CREATE TABLE statements until we are
2625  * done with this step.
2626  * This is started by the signal START_LCP_REQ. This signal also contains
2627  * list of nodes involved in the LCP both for LQH and DIH.
2628  *
 * CREATE TABLE can create new tables prior to this point, which we will
2630  * include, and that's ok as they cannot possibly affect the new redo tail
2631  * position. DROP TABLE can drop tables prior to this point, which could
2632  * remove the need to maintain some old redo, but that will be handled in
2633  * the following LCP.
2634  *
2635  * Each table to execute the LCP on is marked with a proper state in the
2636  * variable tabLcpStatus. Also each fragment replica to execute the LCP
2637  * on is marked with true in the lcpOngoingFlag and we set the number of
2638  * replicas to perform LCP on per fragment as well.
2639  *
2640  * These preparatory steps are done in a synchronized manner, so all nodes
2641  * have received information about the COPY_GCIREQ and now all nodes have
2642  * heard the START_LCP_REQ signals. So in a master takeover we can ask all
2643  * nodes about their LCP state and we can derive if we sent the COPY_GCIREQ
2644  * to all nodes and similarly we can derive if we sent and completed the
2645  * START_LCP_REQ step. To derive this requires all nodes to have heard of
2646  * those signals, not just one of them since a crash can occur in the
2647  * middle of signal sending.
2648  *
2649  * In a master takeover if we haven't completed the COPY_GCIREQ step then
2650  * we can start the next LCP from the beginning again. If COPY_GCIREQ has
2651  * been completed but not the START_LCP_REQ, then we can restart the
2652  * START_LCP_REQ step. Finally if the START_LCP_REQ has been completed
2653  * then we know that the execution of checkpoints on individual fragment
2654  * replicas is ongoing. Obviously in a master take over we should ensure
2655  * that the processing of START_LCP_REQ is completed before we report
2656  * back our state to the master node to ensure that we make the master
2657  * takeover handling as simple as possible.
2658  *
2659  * So now that we know exactly which tables and fragment replicas to checkpoint
2660  * it is time to start the actual checkpoint phase.
2661  *
2662  * The master node will send LCP_FRAG_ORD to DBLQH for each of the fragment
2663  * replicas to execute the LCP on.
2664  *
2665  * In the old way there was a queue of such LCP_FRAG_ORD with limited size in
2666  * DBDIH (queue size was 2 in 7.3 and earlier and 128 in early 7.4 versions).
2667  * Also DBLQH had a queue for LCP_FRAG_ORDs, in 7.3 this was 2 in size and
2668  * in early versions of 7.4 it was 64.
2669  *
2670  * In the new version we can send LCP_FRAG_ORD to LQH as before, LQH has an
2671  * infinite queue size (it simply stores the LCP_FRAG_ORD on the fragment
2672  * record, so there is no limit to the queue size since all fragments can
2673  * be in the queue). In addition at master takeover we also support receiving
2674  * the same order two or more times. By ensuring that we keep track of that
2675  * we already received a LCP_FRAG_ORD on a fragment we can also easily discard
2676  * LCP_FRAG_ORDs that we already received.
2677  *
2678  * These features mean that LQH can process a Local Checkpoint without much
2679  * interaction with DIH / DIH Master, which enables simplifications at DIH
2680  * and DIH Master in later versions. In principle we could send off all
2681  * LCP_FRAG_ORDs immediately if we like and more or less turn the LDM
2682  * instances into independent LCP execution engines. This is a step in the
2683  * direction of more local control in LQH over LCP execution.
2684  *
 * When all LCP_FRAG_ORDs have been sent, a special LCP_FRAG_ORD is sent to
 * all participating LQH nodes. This signal has the flag lastFragmentFlag set,
2687  * it doesn't contain any fragment to checkpoint, it is only a flag that
2688  * indicates that we've sent the last LCP_FRAG_ORD.
2689  *
2690  * LQH will execute orders to execute LCP on a fragment in the order they are
2691  * received. As a fragment is completing its LCP it will generate a new message
2692  * LCP_FRAG_REP. This message is broadcasted to all participating DIHs. First
2693  * the message is sent from DBLQH to the local DIH. Finally the local DIH will
2694  * broadcast it to all participating DIHs.
2695  *
2696  * This new Pausing LCP module is involved here by being able to queue also
2697  * LCP_FRAG_REP before they are broadcast to the participating DIHs. They are
2698  * queued on the fragment replica records in the local DIH and thus we have
2699  * no limits on the queue size.
2700  *
2701  * This allows the DIH Master state to be stabilised as necessary during an
2702  * LCP, removing the need in some cases to wait for an LCP to complete before
2703  * performing some other activity.
2704  *
2705  * When LQH have executed all the LCP_FRAG_ORDs and have received the
2706  * last fragment flag, then the LDM will perform a number of activities to
2707  * complete the local checkpoint. These activities is mostly used by the
2708  * disk data tables.
2709  *
2710  * After all these activities have completed the LQH will send
2711  * LCP_COMPLETE_REP to the local DIH. The local DIH will broadcast it to all
2712  * participating DIHs.
2713  *
2714  * When all LQHs have sent all LCP_FRAG_REP and it has also sent the
2715  * LCP_COMPLETE_REP, then the LCP is completed. So a node that has seen
2716  * LCP_COMPLETE_REP from all nodes participating in the LCP knows that
2717  * it has received all the LCP_FRAG_REP for the LCP.
2718  *
2719  * In a master takeover in the old way we could not resend the LCP_FRAG_ORD
2720  * to the LQH again. To avoid this we used an extra master takeover
2721  * protocol EMPTY_LCP_REQ. This protocol ensures that all LQHs have completed
2722  * the queues and that all LCP_FRAG_REPs have been sent to all participating
2723  * DIHs and likewise with the LCP_COMPLETE_REP such that the new master has
2724  * a precise view of which fragment replicas have completed the LCP execution
2725  * so far.
2726  *
2727  * Thus when the master takeover is completed we know that each DIH has all
2728  * the LCP_FRAG_REP for which an LCP_FRAG_ORD have been sent and also all
2729  * LCP_COMPLETE_REP that have been produced. This means that we are now
2730  * ready to restart the process of sending LCP_FRAG_ORD again.
2731  *
 * The problem with this approach is that it can consume a very long time to
2733  * execute the entire LCP fragment queue in LQH if the queue size increases
2734  * (increased from 2 to 64 going from 7.3 to 7.4) and the size of the
2735  * fragments also increase. So the master takeover can take a substantial
2736  * time in this case.
2737  *
2738  * So the new manner is to allow for the LQH to get LCP_FRAG_ORD and also
2739  * the special last LCP_FRAG_ORD several times with the same LCP id and
2740  * discard those that it receives for a second time. In this manner we can
2741  * simply restart sending the LCP_FRAG_ORD from the beginning. When we are
2742  * done with this we can start checking for completion of the LCP in the
2743  * normal way.
2744  *
2745  * When the master has sent the last special LCP_FRAG_ORD and these have been
2746  * received by the receiving nodes, then the master will actually itself not
2747  * do anything more to execute the LCP. The non-master nodes will however send
2748  * LCP_COMPLETE_REP to the master node. So this means that a new LCP won't
2749  * start until all participating DIHs have completed the processing of the
2750  * last LCP.
2751  *
2752  * So effectively taking over as master in this phase doesn't really require
2753  * any specific work other than redirecting the LCP_COMPLETE_REP from the
2754  * non-masters to the new master. If it has already been sent it should be
2755  * seen in the response to the MASTER_LCPREQ from the node. So after
2756  * receiving the last MASTER_LCPCONF we have information enough about whether
2757  * we need to send more LCP_FRAG_ORDs or not.
2758  *
2759  * We can still optimise the sending of LCP_FRAG_ORD a little bit by avoiding
2760  * to send LCP_FRAG_ORD to a fragment replica where we have already received
2761  * a LCP_FRAG_REP for it. It would be possible to avoid sending extra
2762  * LCP_FRAG_ORDs in various ways, but it doesn't really cost much, LCP_FRAG_ORD
2763  * is a small signal and the number of signals sent is limited to the number
2764  * of fragment replicas. So this would make sense if we have to support
2765  * extremely large clusters and extremely many tables in combination.
2766  *
2767  * As this description shows some interesting places to test master failures
2768  * are:
2769  * 1) Master failure while clearing TC counters (TC_CLOPSIZEREQ).
2770  * 2) Master failure while distributing COPY_GCIREQ.
2771  * 3) Master failure while distributing START_LCP_REQ
2772  * 4) Master failure while processing the LCP and sending LCP_FRAG_ORDs
2773  * 4.1) Before any LCP_FRAG_REP received
2774  * 4.2) After receiving many LCP_FRAG_REPs, but not all
2775  * 4.3) After receiving all LCP_FRAG_REPs, but not all LCP_COMPLETE_REPs
2776  * 4.4) After receiving all LCP_FRAG_REPs, and all LCP_COMPLETE_REPs.
2777  *
2778  * While distributing above can be interpreted as one test case of before
2779  * distributing, one in the middle of distributing and one when all
2780  * responses have been received.
2781  *
2782  * It is also important to similarly test PAUSE_LCP_REQ handling in all of
2783  * the above states. This can be handled by inserting an ERROR_INSERT that
2784  * effectively stops the process to copy meta data at some point and then
2785  * setting some variable that triggers the copying of meta data to continue
2786  * at a state that we wanted to accomplish.
2787  *---------------------------------------------------------------------------*/
/**
 * Initialisation routine, called once at startup of the node.
 * Resets all state variables of the PAUSE LCP module to describe an idle
 * pause protocol: no pause requested, nothing queued, no node being
 * copied to.
 */
void Dbdih::init_lcp_pausing_module(void)
{
  /* Master state variables */
  c_pause_lcp_master_state = PAUSE_LCP_IDLE;
  c_lcp_runs_with_pause_support = false;
  c_old_node_waiting_for_lcp_end = false;

  /* Pause participant state variables */
  c_dequeue_lcp_rep_ongoing = false;
  c_queued_lcp_complete_rep = false;
  c_lcp_id_paused = RNIL;           /* RNIL == no LCP id remembered */
  c_pause_lcp_start_node = RNIL;    /* RNIL == no pause for any node */
  c_last_id_lcp_complete_rep = RNIL;

  /* Starting node state variable */
  c_lcp_id_while_copy_meta_data = RNIL;
}
2806 
void Dbdih::check_pause_state_lcp_idle(void)
{
  /**
   * Sanity check used when the LCP is considered idle:
   * we should not be able to complete an LCP while still having
   * queued LCP_COMPLETE_REP and LCP_FRAG_REP. Both queues used by the
   * pause protocol must therefore be empty here.
   */
  ndbrequire(c_queued_lcp_frag_rep.isEmpty());
  ndbrequire(!c_queued_lcp_complete_rep);
}
2816 
/* Support function only called within ndbassert */
bool Dbdih::check_pause_state_sanity(void)
{
  /**
   * Invariants of the PAUSE LCP protocol state:
   * 1) While the LCP is paused we must not simultaneously be dequeuing
   *    the queued reports.
   * 2) A remembered paused LCP id (c_lcp_id_paused) is only valid while
   *    the LCP is paused or while we are still emptying the queues.
   * 3) An old-version node can only be waiting for the LCP to end if the
   *    current LCP actually runs with pause support.
   * Always returns true so that it can be wrapped in ndbassert().
   */
  if (is_lcp_paused())
  {
    ndbrequire(!c_dequeue_lcp_rep_ongoing);
  }
  ndbrequire(c_lcp_id_paused == RNIL ||
             is_lcp_paused() ||
             c_dequeue_lcp_rep_ongoing);
  ndbrequire(!c_old_node_waiting_for_lcp_end ||
             c_lcp_runs_with_pause_support);
  return true;
}
2831 
/* Support function for execLCP_FRAG_REP */
void Dbdih::queue_lcp_frag_rep(Signal *signal, LcpFragRep *lcpReport)
{
  Uint32 tableId = lcpReport->tableId;
  Uint32 fragId = lcpReport->fragId;

  TabRecordPtr tabPtr;
  tabPtr.i = tableId;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

  /**
   * A report for a table that is being dropped, or is already idle, is
   * simply discarded; the table is no longer part of this LCP.
   */
  if (tabPtr.p->tabStatus == TabRecord::TS_DROPPING ||
      tabPtr.p->tabStatus == TabRecord::TS_IDLE)
  {
    jam();
    return;
  }

  FragmentstorePtr fragPtr;
  getFragstore(tabPtr.p, fragId, fragPtr);

  /**
   * Queue the report on the replica record itself — this gives an
   * unbounded queue since each replica record carries its own entry.
   * Verify that the replica found matches the report exactly and that
   * all queued reports belong to one and the same LCP id.
   */
  ReplicaRecordPtr replicaPtr;
  findReplica(replicaPtr, fragPtr.p, lcpReport->nodeId);
  c_queued_lcp_frag_rep.addLast(replicaPtr);
  ndbrequire(replicaPtr.p->nextLcp == lcpReport->lcpNo);
  ndbrequire(replicaPtr.p->fragId == fragId);
  ndbrequire(replicaPtr.p->tableId == tableId);
  ndbrequire(replicaPtr.p->procNode == lcpReport->nodeId);
  ndbrequire(c_lcp_id_paused == RNIL ||
             c_lcp_id_paused == lcpReport->lcpId);
  c_lcp_id_paused = lcpReport->lcpId;
  /* Save the GCI bounds from the report for later replay at unpause. */
  replicaPtr.p->repMaxGciStarted = lcpReport->maxGciStarted;
  replicaPtr.p->repMaxGciCompleted = lcpReport->maxGciCompleted;
  ndbassert(check_pause_state_sanity());
}
2866 
/* Support function for execLCP_COMPLETE_REP */
void Dbdih::queue_lcp_complete_rep(Signal *signal, Uint32 lcpId)
{
  /**
   * Remember that an LCP_COMPLETE_REP arrived while the LCP was paused.
   * At most one such report can be pending at a time, and it must belong
   * to the same LCP id as any LCP_FRAG_REPs already queued.
   */
  ndbrequire(!c_queued_lcp_complete_rep);
  c_queued_lcp_complete_rep = true;
  ndbrequire(c_lcp_id_paused == RNIL ||
             c_lcp_id_paused == lcpId);
  c_lcp_id_paused = lcpId;
  ndbassert(check_pause_state_sanity());
}
2877 
/* Support function to start copying of meta data */
void Dbdih::start_copy_meta_data(Signal *signal)
{
  /**
   * Now that we have locked both the DICT lock and the LCPs are locked from
   * starting we are ready to copy both the distribution information and the
   * dictionary information. We update the node recovery status indicating
   * this. This code only executes in the master node.
   */
  setNodeRecoveryStatus(c_nodeStartMaster.startNode,
                        NodeRecord::COPY_DICT_TO_STARTING_NODE);

  /**
   * NOTE(review): wait = 10 looks like a protocol phase marker rather than
   * a time value — confirm against the c_nodeStartMaster declaration.
   */
  c_nodeStartMaster.wait = 10;
  /* Kick off the copy via a CONTINUEB round-trip to ourselves. */
  signal->theData[0] = DihContinueB::ZCOPY_NODE;
  signal->theData[1] = 0;
  sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
  c_nodeStartMaster.m_outstandingGsn = GSN_COPY_TABREQ;
}
2896 
2897 /**---------------------------------------------------------------
2898  * MASTER FUNCTIONALITY
2899  **--------------------------------------------------------------*/
2900 /**
2901  * If all nodes that are currently running the LCP can support PAUSE of an
2902  * LCP then we can use this function to find this out. We compute this
2903  * variable at a point where we start the LCP. We can still not get an old
2904  * node up and running until we get to a natural pause between two LCPs.
2905  *
2906  * If an old node comes around then it will block until the LCP is done,
2907  * this will also ensure that no other nodes will try to become part of
2908  * this LCP. However we could have new node already being included in
2909  * this LCP and then have more new nodes arriving that want to be included
2910  * and we can also have an old node arriving while we are including a new
2911  * node. But only one node at a time will be in the copy meta data phase
2912  * so this will work fine.
2913  */
check_if_pause_lcp_possible(void)2914 bool Dbdih::check_if_pause_lcp_possible(void)
2915 {
2916   NodeRecordPtr nodePtr;
2917   ndbrequire(isMaster());
2918   for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++)
2919   {
2920     ptrAss(nodePtr, nodeRecord);
2921     if (nodePtr.p->nodeStatus == NodeRecord::ALIVE)
2922     {
2923       if (getNodeInfo(nodePtr.i).m_version < NDBD_SUPPORT_PAUSE_LCP)
2924       {
2925         jam();
2926         return false;
2927       }
2928     }
2929   }
2930   return true;
2931 }
2932 
/* Support function to check if LCP is still running */
bool Dbdih::check_if_lcp_idle(void)
{
  /**
   * Returns true when the LCP is in a state where a starting node need
   * not be included into the currently running LCP protocol.
   * Master-only code.
   */
  ndbrequire(isMaster());
  switch (c_lcpState.lcpStatus)
  {
  case LCP_STATUS_IDLE:
  case LCP_TCGET:
  case LCP_TC_CLOPSIZE:
  case LCP_WAIT_MUTEX:
    jam();
    /* LCP has not started distributing work yet; queues must be empty. */
    check_pause_state_lcp_idle();
    return true;
  case LCP_STATUS_ACTIVE:
    jam();
    return false;
  case LCP_TAB_COMPLETED:
    jam();
    /* Fall through */
  case LCP_TAB_SAVED:
    jam();
  /**
   * For LCP_TAB_COMPLETED and LCP_TAB_SAVED we have already received
   * all the table information and thus there is no need to get the new
   * node into the LCP, there won't be any updates to the LCP data until
   * the next LCP happens.
   */
    return true;
  default:
    jam();
    return false;
  }
}
2965 
/**
 * Send PAUSE_LCP_REQ to pause or to unpause, master code.
 *
 * @param pause  true to request the pause of the LCP, false to unpause
 *               after the copy of the meta data has completed.
 */
void Dbdih::sendPAUSE_LCP_REQ(Signal *signal, bool pause)
{
  PauseLcpReq *req = (PauseLcpReq*)signal->getDataPtrSend();

  /**
   * Send to all DIHs that participate in the LCP, including ourselves.
   * We will set up waiting for all those signals such that we can also
   * handle node failures in the middle of the pause process.
   */
  ndbrequire(isMaster());
  if (pause)
  {
    jam();
    ndbrequire(c_pause_lcp_master_state == PAUSE_LCP_IDLE);
    c_pause_lcp_master_state = PAUSE_LCP_REQUESTED;
    req->pauseAction = PauseLcpReq::Pause;
    /* Snapshot the LQH participants; new nodes join the next LCP only. */
    c_pause_participants = c_lcpState.m_participatingLQH;
    infoEvent("PAUSE LCP for starting node %u", c_nodeStartMaster.startNode);
  }
  else
  {
    /**
     * We are unpausing the LCP again after completing the copy of the meta
     * data, slightly different dependent on whether the starting node was
     * included into the LCP or not.
     */
    if (c_pause_lcp_master_state == PAUSE_COMPLETE_LCP_INCLUSION)
    {
      jam();
      ndbrequire(!check_if_lcp_idle());
      c_pause_lcp_master_state = PAUSE_IN_LCP_UNPAUSE;
      req->pauseAction = PauseLcpReq::UnPauseIncludedInLcp;
      infoEvent("UNPAUSE LCP for starting node %u, included in LCP",
                c_nodeStartMaster.startNode);
    }
    else if (c_pause_lcp_master_state == PAUSE_NOT_IN_LCP_COPY_META_DATA)
    {
      jam();
      ndbrequire(check_if_lcp_idle());
      c_pause_lcp_master_state = PAUSE_NOT_IN_LCP_UNPAUSE;
      req->pauseAction = PauseLcpReq::UnPauseNotIncludedInLcp;
      infoEvent("UNPAUSE LCP for starting node %u, not included in LCP",
                c_nodeStartMaster.startNode);
    }
    else
    {
      /* No other master state is legal when unpausing */
      ndbrequire(false);
    }
  }
  /**
   * The blocks that do the pausing is the local DIH in the nodes that
   * generate LCP_FRAG_REPs and LCP_COMPLETE_REPs. These are the
   * m_participatingLQH nodes. This set is untouched by new starting
   * nodes for this LCP. New nodes can be added to the next LCP, but
   * not to this one.
   *
   * As part of the pause protocol the starting node must also participate
   * in the LCP completion protocol, so the pause also includes taking the
   * starting node into the DIH node set that participates in the LCP.
   * We do however wait including the node until we reach the UnPause
   * action. The reason is that it is possible that the LCP is completed
   * in the process of pausing. In this case we will continue
   * completing the pause in the normal manner, but we will not send
   * START_LCP_REQ to the new node and we will not include the new in the
   * m_participatingDIH bitmap in the DIH nodes already participating
   * in the LCP.
   *
   * For those nodes that existed previously in the m_participatingDIH
   * bitmap, but not in the m_participatingLQH bitmap we need not
   * worry since they won't make use of the m_participatingDIH bitmap.
   * So there is no need to add the starting node into those. The
   * m_participatingDIH bitmap is used by those nodes that generate
   * LCP_FRAG_REPs and LCP_COMPLETE_REPs, and these nodes are exactly
   * the nodes found in the m_participatingLQH bitmap.
   */

  req->senderRef = reference();
  req->startNodeId = c_nodeStartMaster.startNode;
  if (req->pauseAction == PauseLcpReq::UnPauseIncludedInLcp)
  {
    jam();
    /* The starting node now also takes part in LCP completion for DIH. */
    c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.setWaitingFor(
      c_nodeStartMaster.startNode);
  }
  c_PAUSE_LCP_REQ_Counter.setWaitingFor(c_pause_participants);
  NodeReceiverGroup rg(DBDIH, c_pause_participants);
  rg.m_nodes.clear(getOwnNodeId());
  sendSignal(rg, GSN_PAUSE_LCP_REQ, signal,
             PauseLcpReq::SignalLength, JBB);
  /**
   * We execute the signal to ourself immediately, the reason is to
   * avoid having to add a specific state variable to detect when the
   * starting node have failed between now and receiving this signal.
   */
  execPAUSE_LCP_REQ(signal);
  ndbassert(check_pause_state_sanity());
}
3064 
/* Master code, other node has completed PAUSE_LCP_REQ */
void Dbdih::execPAUSE_LCP_CONF(Signal *signal)
{
  PauseLcpConf *conf = (PauseLcpConf*)&signal->theData[0];
  Uint32 nodeId = refToNode(conf->senderRef);
  Uint32 startNode = conf->startNodeId;

  ndbrequire(isMaster());

  if (!is_pause_for_this_node(startNode))
  {
    /* Ignore, node died in the process */
    jam();
    return;
  }
  ndbassert(check_pause_state_sanity());
  /**
   * NOTE(review): receiveLoopMacro is assumed to return from this function
   * until the last outstanding PAUSE_LCP_CONF has been received — confirm
   * against the macro definition.
   */
  receiveLoopMacro(PAUSE_LCP_REQ, nodeId);

  if (c_pause_lcp_master_state == PAUSE_LCP_REQUESTED)
  {
    jam();
    /**
     * We have paused the reporting of LCPs, we are now ready to process the
     * copying of meta data. At this point in time we have sent PAUSE_LCP_REQ
     * to all LQH nodes participating in the LCP. Those in turn have sent
     * FLUSH_LCP_REP_REQ to all DIH participants and received a response
     * back from all nodes. This means that we have ensured that we have
     * absolutely no LCP_FRAG_REP and LCP_COMPLETE_REP signals in transit
     * in the entire cluster since we have sent a signal through every
     * link that could carry such a signal. We use the FIFO queue mechanism
     * of signals between two DIHs here as an important part of the protocol.
     *
     * This means that all DIHs now have the same view on the
     * LCP_FRAG_REPs they have seen and similarly for LCP_COMPLETE_REPs.
     * The LCP_COMPLETE_REPs could however still be sent back to ourselves
     * through a delayed signal since we don't want to process those
     * signals concurrently with pausing the LCP.
     *
     * We could end up in a situation where the LCP have completed here, but
     * this isn't a problem, we still hold the fragment info mutex, so no
     * new LCP can start until we are done with the copying and release the
     * fragment info mutex.
     */
    ndbassert(check_pause_state_sanity());
    check_for_pause_action(signal, StartLcpReq::PauseLcpStartFirst);
    return;
  }
  /**
   * UnPause
   * ------
   * This is the normal path for unpausing. At this point we have sent
   * PAUSE_LCP_REQ to all LQH nodes participating in the LCP. These nodes
   * have now started sending the LCP_FRAG_REPs and LCP_COMPLETE_REPs
   * again. The copying of meta data have been completed and we have
   * been included in the LCP handling. So we are now ready to proceed
   * with the node restart again. We will also perform the unpause
   * on the master node here to avoid interesting states between
   * stop pause and receiving the last PAUSE_LCP_CONF.
   */
  jam();
  ndbrequire(c_pause_lcp_master_state == PAUSE_NOT_IN_LCP_UNPAUSE ||
             c_pause_lcp_master_state == PAUSE_IN_LCP_UNPAUSE);
  if (c_pause_lcp_master_state == PAUSE_NOT_IN_LCP_UNPAUSE)
  {
    jam();
    end_pause(signal, PauseLcpReq::UnPauseNotIncludedInLcp);
  }
  else if (c_pause_lcp_master_state == PAUSE_IN_LCP_UNPAUSE)
  {
    jam();
    end_pause(signal, PauseLcpReq::UnPauseIncludedInLcp);
  }
  else
  {
    /* Unreachable: guarded by the ndbrequire above */
    ndbrequire(false);
  }
  /* Resume the node restart: meta data copy phase is now complete. */
  dihCopyCompletedLab(signal);
}
3143 
3144 /**-------------------------------------------------------------------
3145   FUNCTIONS USED IN ALL NODES
3146 --------------------------------------------------------------------*/
3147 /**
3148  * PAUSE_LCP_REQ
3149  * -------------
3150  * This signal is sent from the master node to all DIHs to block distribution
3151  * of LCP_FRAG_REP signals. When we receive this signal we will queue all
3152  * signals that we receive from DBLQH about completed LCP fragments. The same
3153  * signal is also sent to stop the pause. The pauseAction is 0 for pause and
3154  * 1 for stop pause.
3155  *
3156  * After pausing locally in our own DBDIH, we will send a FLUSH_LCP_REP_REQ
3157  * to all nodes participating in the LCP. This ensures that any LCP_FRAG_REP
3158  * we have sent out has been received by the receiving node since we are
3159  * sending it on the same path and we have a guarantee that signals using
3160  * the same path won't race each other.
3161  */
execPAUSE_LCP_REQ(Signal * signal)3162 void Dbdih::execPAUSE_LCP_REQ(Signal *signal)
3163 {
3164   PauseLcpReq *req = (PauseLcpReq*) &signal->theData[0];
3165   PauseLcpReq::PauseAction pauseAction =
3166     (PauseLcpReq::PauseAction)req->pauseAction;
3167   Uint32 startNode = req->startNodeId;
3168 
3169   ndbrequire(req->senderRef == cmasterdihref);
3170   ndbassert(check_pause_state_sanity());
3171 
3172   /* TODO: Insert check that startNode is still alive here */
3173   if (pauseAction == PauseLcpReq::Pause)
3174   {
3175     jam();
3176     pause_lcp(signal, startNode, req->senderRef);
3177   }
3178   else
3179   {
3180     jam();
3181     unpause_lcp(signal,
3182                 startNode,
3183                 req->senderRef,
3184                 pauseAction);
3185   }
3186   return;
3187 }
3188 
/**
 * Pause the local distribution of LCP progress reports on behalf of the
 * starting node. After recording the pause locally we ask every DIH
 * participating in the LCP to flush any in-flight reports.
 */
void Dbdih::pause_lcp(Signal *signal,
                      Uint32 startNode,
                      BlockReference sender_ref)
{
  /**
   * Since the message comes from the master on behalf of the starting
   * node we need to ensure that the starting node hasn't failed already.
   * We handle stopping of pause at node failure, but if this arrives
   * after we already received NODE_FAILREP we need to ensure that we
   * don't proceed since this will cause havoc.
   */
  if (!isMaster())
  {
    /**
     * We should come here after getting permit to start node, but before
     * we the node is included into the LCP and GCP protocol, this happens
     * immediately after we copied the meta data which the PAUSE LCP
     * protocol is part of handling.
     */
    NodeRecordPtr nodePtr;
    nodePtr.i = startNode;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    if (!nodePtr.p->is_pausable)
    {
      jam();
      /* Ignore, node already died */
      return;
    }
    /**
     * Verify that the master isn't starting PAUSE protocol for old nodes
     * that doesn't support the PAUSE LCP protocol. We make it an assert mostly
     * to find bugs early on, a proper handling would probably be to shoot
     * down the master node.
     */
    ndbassert(getNodeInfo(startNode).m_version >= NDBD_SUPPORT_PAUSE_LCP);
  }

  ndbrequire(sender_ref == cmasterdihref);
  if (c_dequeue_lcp_rep_ongoing)
  {
    jam();
    /**
     * Stop unpause mechanism as we are starting a new pause action.
     */
    c_dequeue_lcp_rep_ongoing = false;
  }
  c_pause_lcp_start_node = startNode;

  /**
   * Send flush signal to all nodes participating in LCP.
   * We need not send to ourselves since we don't send LCP_FRAG_REP
   * to ourselves. We need to keep track of which nodes that have
   * replied to the message.
   */
  FlushLcpRepReq *req = (FlushLcpRepReq*) signal->getDataPtrSend();
  req->senderRef = reference();
  req->startNodeId = startNode;
  c_FLUSH_LCP_REP_REQ_Counter.setWaitingFor(c_lcpState.m_participatingDIH);
  NodeReceiverGroup rg(DBDIH, c_lcpState.m_participatingDIH);

  sendSignal(rg, GSN_FLUSH_LCP_REP_REQ, signal,
             FlushLcpRepReq::SignalLength, JBB);

  ndbassert(check_pause_state_sanity());
}
3254 
/**
 * Decide the next step of the pause protocol after a pause phase has
 * completed on all participants (master-only code).
 *
 * If an LCP is still active the starting node must be prepared for
 * inclusion into the running LCP via START_LCP_REQ; otherwise we can go
 * straight to copying the meta data (first phase) or to unpausing
 * (second phase).
 *
 * @param pauseStart  PauseLcpStartFirst after the initial pause round,
 *                    PauseLcpStartSecond after the meta data copy.
 */
void Dbdih::check_for_pause_action(Signal *signal,
                                   StartLcpReq::PauseStart pauseStart)
{
  ndbrequire(is_lcp_paused());
  if (!check_if_lcp_idle())
  {
    jam();
    /**
     * A next step when we have paused the LCP execution is to get the
     * starting node active in the LCP handling. This means we need to send
     * START_LCP_REQ to the node. We won't track the reply here since a
     * missing reply is due to a crashed node and then the node failure
     * handling will ensure that the LCP is restarted and that the pause of
     * the LCP is unpaused.
     * (A test case for this is needed).
     *
     * At this point in time we have stalled all activity in the LCP.
     * This means that the bit maps on participating LQHs and DIHs is
     * stable, it also means that the bit maps for which LQHs and DIHs
     * that have completed is also stable (we have stopped LCP_COMPLETE_REP
     * to pass through in all nodes). There might be LQHs and DIHs that
     * have already completed and we need this information to also be
     * transferred to the starting node for it to be able to complete
     * the LCP processing properly.
     *
     * This means we actually have to send two signals with all four
     * bitmaps. After these signals have been sent over we will
     * be ready to copy the meta data and after that to unpause and
     * complete this LCP with the starting node as a new participant.
     *
     * It is vital to send this information before we copy the meta
     * data since the m_participatingLQH bitmap is needed to set
     * the lcpOngoing flag on the replicas set correctly.
     */
    StartLcpReq* req = (StartLcpReq*)signal->getDataPtrSend();
    BlockReference ref = calcDihBlockRef(c_nodeStartMaster.startNode);
    req->senderRef = reference();
    req->lcpId = SYSFILE->latestLCP_ID;
    req->pauseStart = pauseStart;
    if (pauseStart == StartLcpReq::PauseLcpStartFirst)
    {
      jam();
      /* First round: ship the full participant bitmaps. */
      ndbrequire(c_pause_lcp_master_state == PAUSE_LCP_REQUESTED);
      c_pause_lcp_master_state = PAUSE_START_LCP_INCLUSION;
      req->participatingLQH = c_lcpState.m_participatingLQH;
      req->participatingDIH = c_lcpState.m_participatingDIH;
      sendSignal(ref, GSN_START_LCP_REQ, signal,
                 StartLcpReq::SignalLength, JBB);
    }
    else
    {
      /* Second round: ship the set of LQHs that have not yet completed. */
      bool found = false;
      ndbrequire(pauseStart == StartLcpReq::PauseLcpStartSecond);
      ndbrequire(c_pause_lcp_master_state == PAUSE_IN_LCP_COPY_META_DATA);
      c_pause_lcp_master_state = PAUSE_COMPLETE_LCP_INCLUSION;
      req->participatingLQH.clear();
      for (Uint32 nodeId = 1; nodeId < MAX_NDB_NODES; nodeId++)
      {
        if (c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId))
        {
          jamLine(nodeId);
          req->participatingLQH.set(nodeId);
          found = true;
        }
      }
      /**
       * We should not be able to have all LQH sent completed, but not all
       * LCP_FRAG_REP yet received.
       */
      ndbrequire(found);
      sendSignal(ref, GSN_START_LCP_REQ, signal,
                 StartLcpReq::SignalLength, JBB);
      return;
    }
  }
  else
  {
    if (pauseStart == StartLcpReq::PauseLcpStartFirst)
    {
      jam();
      /**
       * The LCP completed while we paused, no need to prepare the starting
       * node for inclusion into the LCP protocol since we will continue
       * with the node restart immediately after completing the copy of the
       * meta data and the unpause action.
       */
      ndbrequire(c_pause_lcp_master_state == PAUSE_LCP_REQUESTED);
      c_pause_lcp_master_state = PAUSE_NOT_IN_LCP_COPY_META_DATA;
      start_copy_meta_data(signal);
    }
    else
    {
      jam();
      /**
       * The LCP completed while we paused and we have now copied the meta
       * data over. We are ready to unpause and need not include the new
       * node into the LCP protocol this time.
       */
      ndbrequire(pauseStart == StartLcpReq::PauseLcpStartSecond);
      ndbrequire(c_pause_lcp_master_state == PAUSE_NOT_IN_LCP_COPY_META_DATA);
      sendPAUSE_LCP_REQ(signal, false);
    }
    return;
  }
}
3360 
unpause_lcp(Signal * signal,Uint32 startNode,BlockReference sender_ref,PauseLcpReq::PauseAction pauseAction)3361 void Dbdih::unpause_lcp(Signal *signal,
3362                         Uint32 startNode,
3363                         BlockReference sender_ref,
3364                         PauseLcpReq::PauseAction pauseAction)
3365 {
3366   if (!is_pause_for_this_node(startNode))
3367   {
3368     jam();
3369     /* Ignore, node already died */
3370     return;
3371   }
3372   /**
3373    * When we stop pausing we will set the dequeue flag, LCP_FRAG_REPs and
3374    * LCP_COMPLETE_REPs will continue to be queued while any of those two
3375    * flags are set to ensure that we keep the order of LCP_FRAG_REP. This
3376    * order isn't absolutely necessary, but it makes it easier to debug
3377    * the system.
3378    */
3379   PauseLcpConf *conf = (PauseLcpConf*)signal->getDataPtrSend();
3380   conf->senderRef = reference();
3381   conf->startNodeId = startNode;
3382   sendSignal(cmasterdihref, GSN_PAUSE_LCP_CONF, signal,
3383              PauseLcpConf::SignalLength, JBB);
3384 
3385   if (isMaster())
3386   {
3387     jam();
3388     /**
3389      * We complete the Pause LCP protocol in master when all nodes
3390      * have returned. Too early here.
3391      */
3392     return;
3393   }
3394   end_pause(signal, pauseAction);
3395 }
3396 
end_pause(Signal * signal,PauseLcpReq::PauseAction pauseAction)3397 void Dbdih::end_pause(Signal *signal,
3398                       PauseLcpReq::PauseAction pauseAction)
3399 {
3400   if (pauseAction == PauseLcpReq::UnPauseIncludedInLcp)
3401   {
3402     jam();
3403     c_lcpState.m_participatingDIH.set(c_pause_lcp_start_node);
3404   }
3405   stop_pause(signal);
3406 }
3407 
stop_pause(Signal * signal)3408 void Dbdih::stop_pause(Signal *signal)
3409 {
3410   if (isMaster())
3411   {
3412     jam();
3413     c_pause_participants.clear();
3414     c_pause_lcp_master_state = PAUSE_LCP_IDLE;
3415   }
3416   c_pause_lcp_start_node = RNIL;
3417   ndbrequire(!c_dequeue_lcp_rep_ongoing);
3418   c_dequeue_lcp_rep_ongoing = true;
3419   ndbassert(check_pause_state_sanity());
3420   dequeue_lcp_rep(signal);
3421 }
3422 
3423 /**
 * All node failures while being in LCP pause state lead to an immediate
 * unpause, based on the assumption that all node failures will also
 * automatically lead
3426  * to failures of any starting nodes while we are still in the starting
3427  * state.
3428  *
3429  * This means we need no code to handle unpausing at node failures.
3430  */
handle_node_failure_in_pause(Signal * signal)3431 void Dbdih::handle_node_failure_in_pause(Signal *signal)
3432 {
3433   c_FLUSH_LCP_REP_REQ_Counter.clearWaitingFor();
3434   c_PAUSE_LCP_REQ_Counter.clearWaitingFor();
3435   stop_pause(signal);
3436   ndbassert(check_pause_state_sanity());
3437 }
3438 
3439 /**
3440  * We have stopped pausing and we are working through the queue of blocked
3441  * LCP reports. When we reach the end of it we will unset the dequeue flag
 * so that we no longer need to queue the LCP reports.
3443  *
3444  * We will dequeue one LCP report per signal and continue sending CONTINUEB
3445  * to ourselves until we're through the LCP reports that have blocked while
3446  * we paused.
3447  *
3448  * NOTE: The queue might be empty for a short while we are waiting for a
3449  * CONTINUEB to arrive. We don't check for emptiness before sending
3450  * CONTINUEB. So if one wants to add asserts on queue not empty while
3451  * flag is set, then this needs to be checked before CONTINUEB is sent.
3452  */
void Dbdih::dequeue_lcp_rep(Signal *signal)
{
  ReplicaRecordPtr replicaPtr;
  bool empty;
  bool lcp_frag_rep_empty = c_queued_lcp_frag_rep.isEmpty();
  bool lcp_complete_rep_empty = !c_queued_lcp_complete_rep;
  if (!c_dequeue_lcp_rep_ongoing)
  {
    jam();
    ndbassert(check_pause_state_sanity());
    /**
     * We got a new pause signal before finishing off the queue, we will
     * stop dequeuing, the pause flag is already set and should continue
     * to be so.
     */
    return;
  }
  empty = lcp_frag_rep_empty && lcp_complete_rep_empty;
  /* Perform dequeueing of one LCP report */
  if (!empty)
  {
    if (!lcp_frag_rep_empty)
    {
      jam();
      /**
       * 1) Remove from queue
       * 2) Set up signal
       * 3) Send to all LCP DIH participants
       * 4) Send CONTINUEB for handling next in queue
       *
       * We also need to send to ourselves which is a bit different from
       * the normal LCP_FRAG_REP where we handle ourselves through a fall
       * through method. Here we come from a different place and we cannot
       * use the broadcast method since the dequeue flag is still set.
       * So we send the signals from here to all nodes in the DIH set
       * (including the starting node).
       */
      LcpFragRep *lcpFragRep = (LcpFragRep*)signal->getDataPtrSend();

      c_queued_lcp_frag_rep.first(replicaPtr);
      ndbrequire(replicaPtr.p != NULL);
      c_queued_lcp_frag_rep.removeFirst(replicaPtr);

      /* Rebuild the LCP_FRAG_REP from the data saved on the queued
       * replica record. */
      lcpFragRep->nodeId = getOwnNodeId();
      lcpFragRep->lcpId = c_lcp_id_paused;
      lcpFragRep->lcpNo = replicaPtr.p->nextLcp;
      lcpFragRep->tableId = replicaPtr.p->tableId;
      lcpFragRep->fragId = replicaPtr.p->fragId;
      lcpFragRep->maxGciCompleted = replicaPtr.p->repMaxGciCompleted;
      lcpFragRep->maxGciStarted = replicaPtr.p->repMaxGciStarted;

      NodeReceiverGroup rg(DBDIH, c_lcpState.m_participatingDIH);
      sendSignal(rg, GSN_LCP_FRAG_REP, signal,
                 LcpFragRep::SignalLength, JBB);

      /* One report per invocation: schedule handling of the next queued
       * report via CONTINUEB. */
      signal->theData[0] = DihContinueB::ZDEQUEUE_LCP_REP;
      sendSignal(reference(), GSN_CONTINUEB, signal,
               1, JBB);
      return;
    }
    else
    {
      /**
       * 1) Reset c_queued_lcp_complete_rep
       * 2) Set up LCP_COMPLETE_REP signal
       * 3) Send signals to all LCP DIH participants
       * 4) Fall through to end queue removal
       */
      ndbassert(c_queued_lcp_complete_rep);
      LcpCompleteRep *lcpCompleteRep =
        (LcpCompleteRep*)signal->getDataPtrSend();

      c_queued_lcp_complete_rep = false;

      lcpCompleteRep->nodeId = getOwnNodeId();
      lcpCompleteRep->lcpId = c_lcp_id_paused;
      lcpCompleteRep->blockNo = DBLQH;

      NodeReceiverGroup rg(DBDIH, c_lcpState.m_participatingDIH);
      sendSignal(rg, GSN_LCP_COMPLETE_REP, signal,
                 LcpCompleteRep::SignalLength, JBB);
    }
  }
  jam();
  /**
   * We have completed dequeueing all queued LCP reports. This means we can
   * reset the dequeue flag and resume normal operation of LCP reporting.
   */
  c_dequeue_lcp_rep_ongoing = false;
  c_lcp_id_paused = RNIL;
  ndbassert(check_pause_state_sanity());
}
3545 
3546 /**
3547  * FLUSH_LCP_REP_CONF
3548  * ------------------
3549  * When we have received this signal from all nodes that participates in the
3550  * LCP, then we can send the PAUSE_LCP_CONF reply to the requester of the
 * pause (always requested by the master, we can only handle one pause at
 * a time). We do however send along the starting node id in the signal
3553  * to ensure that we don't have to wait with the next start in the case of
3554  * a crash in the middle of the pausing.
3555  *
3556  * We will not be able to reach this point with the same node again and
3557  * still receive a signal from the previous time the node was alive since
3558  * the node start contains a number of messages from the master to all
3559  * nodes and thus ensuring that no outstanding messages are from a previous
3560  * node instance with the same node id. The same applies to a number of
3561  * similar scenarios in the NDB code.
3562  */
void Dbdih::execFLUSH_LCP_REP_CONF(Signal *signal)
{
  FlushLcpRepConf *conf = (FlushLcpRepConf*)&signal->theData[0];
  jamEntry();

  Uint32 nodeId = refToNode(conf->senderRef);
  Uint32 startNode = conf->startNodeId;

  if (!is_pause_for_this_node(startNode))
  {
    /* Ignore, node died in the process */
    jam();
    return;
  }

  /**
   * receiveLoopMacro tracks the outstanding FLUSH_LCP_REP_CONF replies;
   * the block below only executes once every waited-for node has replied
   * (otherwise the macro returns early — see the macro definition).
   */
  receiveLoopMacro(FLUSH_LCP_REP_REQ, nodeId);
  {
    jam();
   /* Normal path, master is still alive */
    /* NOTE(review): this inner 'conf' intentionally shadows the incoming
     * FlushLcpRepConf pointer above, which is no longer needed here. */
    PauseLcpConf *conf = (PauseLcpConf*)signal->getDataPtrSend();
    conf->senderRef = reference();
    conf->startNodeId = startNode;
    sendSignal(cmasterdihref, GSN_PAUSE_LCP_CONF, signal,
               PauseLcpConf::SignalLength, JBB);
  }
  ndbassert(check_pause_state_sanity());
}
3590 
3591 /**
3592  * FLUSH_LCP_REP_REQ
3593  * -----------------
3594  * The only purpose of this signal is to ensure that we don't have any
3595  * outstanding LCP_FRAG_REP signals or other LCP signals. These signals
3596  * are sent from the node producing them to all other nodes. This means that
3597  * potentially they could be stuck for a long time in various send buffers
3598  * in the system. So a simple manner to ensure all of those signals have
3599  * reached their destination is to send FLUSH_LCP_REP_REQ from each node to
3600  * all other nodes. This gives a safe condition that we don't have any
3601  * outstanding LCP_FRAG_REP signals in the cluster. So there is no logic to
3602  * execute when receiving this signal other than to send it back to the sender.
3603  *
3604  * It is quite ok to receive this signal in a node before the PAUSE_LCP_REQ
3605  * has arrived here. This signal doesn't cause any interaction with the
 * pause handling in this node, actually it doesn't do anything. Its only
3607  * purpose is to ensure that the signal links are flushed such that we know
3608  * that we don't have any outstanding LCP_FRAG_REPs and LCP_COMPLETE_REPs.
3609  */
execFLUSH_LCP_REP_REQ(Signal * signal)3610 void Dbdih::execFLUSH_LCP_REP_REQ(Signal *signal)
3611 {
3612   FlushLcpRepReq *req = (FlushLcpRepReq*)&signal->theData[0];
3613   FlushLcpRepConf *conf = (FlushLcpRepConf*)signal->getDataPtrSend();
3614   jamEntry();
3615   ndbassert(check_pause_state_sanity());
3616 
3617   BlockReference sender_ref = req->senderRef;
3618   Uint32 startNode = req->startNodeId;
3619   conf->senderRef = reference();
3620   conf->startNodeId = startNode;
3621   sendSignal(sender_ref, GSN_FLUSH_LCP_REP_CONF, signal,
3622              FlushLcpRepConf::SignalLength, JBB);
3623 }
3624 /*---------------------------------------------------------------------------*/
3625 /* END Pausing LCP Module */
3626 /*---------------------------------------------------------------------------*/
3627 
3628 
3629 /*---------------------------------------------------------------------------*/
3630 /*                    NODE RESTART CONTINUE REQUEST                          */
3631 /*---------------------------------------------------------------------------*/
3632 // THIS SIGNAL AND THE CODE BELOW IS EXECUTED BY THE MASTER WHEN IT HAS BEEN
3633 // REQUESTED TO START UP A NEW NODE. The master instructs the starting node
3634 // how to set up its log for continued execution.
3635 /*---------------------------------------------------------------------------*/
execSTART_MEREQ(Signal * signal)3636 void Dbdih::execSTART_MEREQ(Signal* signal)
3637 {
3638   StartMeReq * req = (StartMeReq*)&signal->theData[0];
3639   jamEntry();
3640   const BlockReference Tblockref = req->startingRef;
3641   const Uint32 Tnodeid = refToNode(Tblockref);
3642 
3643   ndbrequire(isMaster());
3644   ndbrequire(c_nodeStartMaster.startNode == Tnodeid);
3645   ndbrequire(getNodeStatus(Tnodeid) == NodeRecord::STARTING);
3646 
3647   if (getNodeInfo(Tnodeid).m_version >= NDBD_COPY_GCI_RESTART_NR)
3648   {
3649     jam();
3650     /**
3651      * COPY sysfile to starting node here directly
3652      *   so that it gets nodegroups early on
3653      */
3654 
3655     /**
3656      * Note: only one node can be starting now, so we can use
3657      *       c_nodeStartMaster.startNode for determining where to send
3658      */
3659     c_nodeStartMaster.m_outstandingGsn = GSN_COPY_GCIREQ;
3660     copyGciLab(signal, CopyGCIReq::RESTART_NR);
3661   }
3662   else
3663   {
3664     jam();
3665     startme_copygci_conf(signal);
3666   }
3667 }
3668 
3669 /**
3670  * We have come to a point in the node restart where we need to copy
3671  * the meta data to the starting node.
3672  *
3673  * In older versions we did this by acquiring a mutex that is held by
3674  * the following actions:
3675  * 1) Execution of LCP. The mutex is held for the entire time we are
3676  *   executing an LCP. This could be all the way up to hours.
3677  *
3678  * 2) Take over a fragment. This action happens in the phase where we
3679  *   are synchronizing the starting node with the alive nodes. In order
3680  *   to do so we need to lock the meta data in DBDIH to ensure that we
3681  *   can change it by adding one more alive replica.
3682  *
3683  * The new version still requires that no one is updating the meta data
3684  * while we are copying it. So this means that we still need to grab this
3685  * mutex to copy the meta data. But to synchronize our copying towards
3686  * the execution of LCPs we will use a pausing mechanism instead of
3687  * the mutex. This means that we can avoid the long wait for an LCP to
3688  * complete before we can copy the meta data.
3689  *
3690  * The take over of a fragment only updates the set of active replicas,
3691  * this will not be a problem to do in parallel with updating it with
3692  * regard to LCPs. So these need not be protected against each other.
3693  *
3694  * There are 3 processes that need protection for each other.
3695  * 1) The start of an LCP.
3696  * 2) The copying of meta data
3697  * 3) The synchronization of a node for a fragment
3698  *
3699  * 1) and 2) cannot run concurrently since we want to ensure that the
3700  * start of an LCP has a clear point in connection to the meta data
3701  * status.
3702  * 1) and 3) can run concurrently without any problems.
3703  *
3704  * 2) and 3) cannot run concurrently, but it would be possible to
3705  * have more fine-grained mutexes. The reason is that 3) changes
3706  * a replica from being an old stored replica to being a stored
3707  * replica. This change is part of the copying of meta data.
3708  *
3709  * 3) and 3) for different fragments could run concurrently, but this
 * would require changes of the protocol to synchronize the nodes
 * to ensure that the master can handle several parallel changes of
3712  * replica status.
3713  *
3714  * 2) and 2) can run concurrently to some extent, but this would
3715  * require changes to the pause lcp protocol.
3716  *
3717  * The current implementation makes it possible to only run 1 out of
3718  * 1), 2) and 3) at a time.
3719  *
3720  * Another improvement possible is to speed up the copy meta data by
3721  * allowing the master to send more than one table at a time. This
3722  * would remove the wait state where we wait for the starting node
3723  * to receive a table and synchronize it to disk.
3724  *
3725  * One could also consider doing less synch's to disk if somehow the
3726  * different tables could be synched at the same time. This might
3727  * require changing the table layout on disk for DIH and DICT tables.
3728  */
3729 void
startme_copygci_conf(Signal * signal)3730 Dbdih::startme_copygci_conf(Signal* signal)
3731 {
3732   jam();
3733 
3734   /**
3735    * We update the node recovery status to indicate we are now waiting to
3736    * complete a local checkpoint such that we can keep track of node restart
3737    * status to control the start of local checkpoints in a proper manner.
3738    * This code is only executed in master nodes.
3739    */
3740   setNodeRecoveryStatus(c_nodeStartMaster.startNode,
3741                         NodeRecord::WAIT_LCP_TO_COPY_DICT);
3742 
3743   Callback c = { safe_cast(&Dbdih::lcpBlockedLab),
3744                  c_nodeStartMaster.startNode };
3745   Mutex mutex(signal, c_mutexMgr, c_nodeStartMaster.m_fragmentInfoMutex);
3746   mutex.lock(c, true, true);
3747 }
3748 
void Dbdih::lcpBlockedLab(Signal* signal, Uint32 nodeId, Uint32 retVal)
{
  jamEntry();
  /* nodeId is the starting node captured when the fragment-info mutex
   * lock was requested; retVal is the lock result (0 = granted). */
  if (c_nodeStartMaster.startNode != nodeId)
  {
    jam();
    /**
     * The node this lock was requested for is no longer the starting
     * node. Release the mutex if the lock was granted or is still queued;
     * otherwise there is nothing to release.
     */
    if (retVal == 0 || retVal == UtilLockRef::InLockQueue)
    {
      infoEvent("Releasing table/fragment info lock for node %u", nodeId);

      Mutex mutex(signal, c_mutexMgr, c_nodeStartMaster.m_fragmentInfoMutex);
      mutex.unlock();
      return;
    }
    return;
  }

  if (retVal == UtilLockRef::InLockQueue)
  {
    jam();
    /* Lock not granted yet, we are queued behind another holder;
     * just log and wait. */
    infoEvent("Node %u enqueued is waiting to copy table/fragment info",
              c_nodeStartMaster.startNode);
    return;
  }

  ndbrequire(retVal == 0); // Mutex error
  ndbrequire(getNodeStatus(c_nodeStartMaster.startNode)==NodeRecord::STARTING);

  if (c_lcp_runs_with_pause_support)
  {
    if (getNodeInfo(c_nodeStartMaster.startNode).m_version >=
        NDBD_SUPPORT_PAUSE_LCP)
    {
      /**
       * All nodes running the LCP supports the PAUSE LCP protocol. Also the
       * new node support it.
       * This means we don't have to wait for the LCP to complete, we can
       * pause the LCP while we are copying the meta data.
       */
      jam();
      sendPAUSE_LCP_REQ(signal, true);
      return;
    }
    else
    {
      jam();
      /**
       * We can only come here trying to start an old version with a master of
       * a new version. In this case we cannot use the PAUSE LCP protocol since
       * the new node can only handle copying of meta data outside the LCP
       * protocol.
       *
       * We come here holding the Fragment Info mutex. We will keep this mutex
       * and this means that a new LCP cannot start. We also set an indicator
       * to ensure that the LCP finish will know that we're waiting to copy
       * the data.
       */
      ndbrequire(!c_old_node_waiting_for_lcp_end);
      c_old_node_waiting_for_lcp_end = true;
      return;
    }
  }
  /**
   * Either we don't support the PAUSE protocol or some other node doesn't. We
   * can also arrive here simply because no LCP is ongoing. In this case we
   * can be sure that no LCP is ongoing in both cases. So we ensure that no
   * LCP starts up until we have completed the copying of meta data by keeping
   * the Fragment Info mutex until we have completed the copying of meta data.
   */
  start_copy_meta_data(signal);
}//Dbdih::lcpBlockedLab()
3820 
nodeDictStartConfLab(Signal * signal,Uint32 nodeId)3821 void Dbdih::nodeDictStartConfLab(Signal* signal, Uint32 nodeId)
3822 {
3823   /*-----------------------------------------------------------------*/
3824   // Report that node restart has completed copy of dictionary.
3825   /*-----------------------------------------------------------------*/
3826   signal->theData[0] = NDB_LE_NR_CopyDict;
3827   signal->theData[1] = nodeId;
3828   sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
3829 
3830   /*-------------------------------------------------------------------------
3831    * NOW WE HAVE COPIED BOTH DIH AND DICT INFORMATION. WE ARE NOW READY TO
3832    * INTEGRATE THE NODE INTO THE LCP AND GCP PROTOCOLS AND TO ALLOW UPDATES OF
3833    * THE DICTIONARY AGAIN.
3834    *
3835    * We can release the PAUSE on LCP now since we are ready to update the
3836    * meta data again.
3837    *
3838    * We update the node recovery status with this information to be able to
3839    * track node restart status. This code only executes in the master node.
3840    */
3841   /*-------------------------------------------------------------------------*/
3842   setNodeRecoveryStatus(c_nodeStartMaster.startNode,
3843                         NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP);
3844 
3845   c_nodeStartMaster.wait = ZFALSE;
3846   c_nodeStartMaster.blockGcp = 1;
3847 
3848   return;
3849 }//Dbdih::nodeDictStartConfLab()
3850 
dihCopyCompletedLab(Signal * signal)3851 void Dbdih::dihCopyCompletedLab(Signal* signal)
3852 {
3853   signal->theData[0] = NDB_LE_NR_CopyDistr;
3854   signal->theData[1] = c_nodeStartMaster.startNode;
3855   sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
3856 
3857   BlockReference ref = calcDictBlockRef(c_nodeStartMaster.startNode);
3858   DictStartReq * req = (DictStartReq*)&signal->theData[0];
3859   req->restartGci = (Uint32)(m_micro_gcp.m_new_gci >> 32);
3860   req->senderRef = reference();
3861   sendSignal(ref, GSN_DICTSTARTREQ,
3862              signal, DictStartReq::SignalLength, JBB);
3863   c_nodeStartMaster.m_outstandingGsn = GSN_DICTSTARTREQ;
3864   c_nodeStartMaster.wait = 0;
3865 }//Dbdih::dihCopyCompletedLab()
3866 
void Dbdih::gcpBlockedLab(Signal* signal)
{
  /**
   * The node DIH will be part of LCP
   */
  NodeRecordPtr nodePtr;
  nodePtr.i = c_nodeStartMaster.startNode;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
  nodePtr.p->m_inclDihLcp = true;

  /**
   * If node is new...this is the place to do things,
   *   gcp+lcp is blocked
   */
  if (getNodeActiveStatus(nodePtr.i) == Sysfile::NS_NotDefined)
  {
    jam();
    /* First start of a node not yet in the sysfile: record it as
     * NS_Configured with no node group assigned yet. */
    infoEvent("Adding node %d to sysfile, NS_Configured",
              nodePtr.i);
    setNodeActiveStatus(nodePtr.i, Sysfile::NS_Configured);
    Sysfile::setNodeGroup(nodePtr.i, SYSFILE->nodeGroups,
                          NO_NODE_GROUP_ID);
    Sysfile::setNodeStatus(nodePtr.i,
                           SYSFILE->nodeStatus, Sysfile::NS_Configured);
  }

  /*-------------------------------------------------------------------------*/
  // NOW IT IS TIME TO INFORM ALL OTHER NODES IN THE CLUSTER OF THE STARTED
  // NODE SUCH THAT THEY ALSO INCLUDE THE NODE IN THE NODE LISTS AND SO FORTH.
  /*------------------------------------------------------------------------*/
  sendLoopMacro(INCL_NODEREQ, sendINCL_NODEREQ, RNIL);
  /*-------------------------------------------------------------------------*/
  // We also need to send to the starting node to ensure he is aware of the
  // global checkpoint id and the correct state. We do not wait for any reply
  // since the starting node will not send any.
  /*-------------------------------------------------------------------------*/
  Uint32 startVersion = getNodeInfo(c_nodeStartMaster.startNode).m_version;

  /* Sufficiently new starting nodes do reply with INCL_NODECONF, so for
   * those we also wait for the starting node itself. */
  if ((getMajor(startVersion) == 4 &&
       startVersion >= NDBD_INCL_NODECONF_VERSION_4) ||
      (getMajor(startVersion) == 5 &&
       startVersion >= NDBD_INCL_NODECONF_VERSION_5) ||
      (getMajor(startVersion) > 5))
  {
    c_INCL_NODEREQ_Counter.setWaitingFor(c_nodeStartMaster.startNode);
  }

  sendINCL_NODEREQ(signal, c_nodeStartMaster.startNode, RNIL);
}//Dbdih::gcpBlockedLab()
3916 
3917 /*---------------------------------------------------------------------------*/
3918 // THIS SIGNAL IS EXECUTED IN BOTH SLAVES AND IN THE MASTER
3919 /*---------------------------------------------------------------------------*/
void Dbdih::execINCL_NODECONF(Signal* signal)
{
  jamEntry();
  Uint32 TstartNode = signal->theData[0];
  Uint32 TsendNodeId_or_blockref = signal->theData[1];

  /**
   * INCL_NODECONF arrives either from one of our own local blocks (listed
   * below) while this node includes the starting node locally, or — on the
   * master — from a participant DIH. We distinguish the two cases by
   * checking the sender against the local block reference list.
   */
  Uint32 blocklist[7];
  blocklist[0] = clocallqhblockref;
  blocklist[1] = clocaltcblockref;
  blocklist[2] = cdictblockref;
  blocklist[3] = numberToRef(BACKUP, getOwnNodeId());
  blocklist[4] = numberToRef(SUMA, getOwnNodeId());
  blocklist[5] = numberToRef(DBSPJ, getOwnNodeId());
  blocklist[6] = 0;

  for (Uint32 i = 0; blocklist[i] != 0; i++)
  {
    if (TsendNodeId_or_blockref == blocklist[i])
    {
      jam();

      if (TstartNode != c_nodeStartSlave.nodeId)
      {
        jam();
        /* Stale CONF for a node that is no longer the one starting. */
        warningEvent("Received INCL_NODECONF for %u from %s"
                     " while %u is starting",
                     TstartNode,
                     getBlockName(refToBlock(TsendNodeId_or_blockref)),
                     c_nodeStartSlave.nodeId);
        return;
      }

      if (getNodeStatus(c_nodeStartSlave.nodeId) == NodeRecord::ALIVE &&
	  blocklist[i+1] != 0)
      {
	/**
	 * Send to next in block list
	 */
	jam();
	signal->theData[0] = reference();
	signal->theData[1] = c_nodeStartSlave.nodeId;
	sendSignal(blocklist[i+1], GSN_INCL_NODEREQ, signal, 2, JBB);
	return;
      }
      else
      {
	/**
	 * All done, reply to master
	 */
	jam();
        if (!isMaster())
        {
          jam();
          setNodeRecoveryStatus(c_nodeStartSlave.nodeId,
                                NodeRecord::NODE_GETTING_INCLUDED);
        }
	signal->theData[0] = c_nodeStartSlave.nodeId;
	signal->theData[1] = cownNodeId;
	sendSignal(cmasterdihref, GSN_INCL_NODECONF, signal, 2, JBB);

	c_nodeStartSlave.nodeId = 0;
	return;
      }
    }
  }

  /* Sender is not a local block: master path, CONF from a participant. */
  if (c_nodeStartMaster.startNode != TstartNode)
  {
    jam();
    warningEvent("Received INCL_NODECONF for %u from %u"
                 " while %u is starting",
                 TstartNode,
                 TsendNodeId_or_blockref,
                 c_nodeStartMaster.startNode);
    return;
  }

  ndbrequire(reference() == cmasterdihref);
  /* Falls through only when all waited-for participants have replied;
   * otherwise the macro returns early (see macro definition). */
  receiveLoopMacro(INCL_NODEREQ, TsendNodeId_or_blockref);

  CRASH_INSERTION(7128);
  /*-------------------------------------------------------------------------*/
  // Now that we have included the starting node in the node lists in the
  // various blocks we are ready to start the global checkpoint protocol
  /*------------------------------------------------------------------------*/
  c_nodeStartMaster.wait = 11;
  c_nodeStartMaster.blockGcp = 0;

  /**
   * Restart GCP
   */
  signal->theData[0] = reference();
  sendSignal(reference(), GSN_UNBLO_DICTCONF, signal, 1, JBB);

  signal->theData[0] = DihContinueB::ZSTART_GCP;
  sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);

  /* Release the fragment-info mutex so LCPs and takeovers may resume. */
  Mutex mutex(signal, c_mutexMgr, c_nodeStartMaster.m_fragmentInfoMutex);
  mutex.unlock();
}//Dbdih::execINCL_NODECONF()
4020 
void Dbdih::execUNBLO_DICTCONF(Signal* signal)
{
  jamEntry();
  c_nodeStartMaster.wait = ZFALSE;
  if (!c_nodeStartMaster.activeState) {
    jam();
    return;
  }//if

  CRASH_INSERTION(7129);
  /**-----------------------------------------------------------------------
   * WE HAVE NOW PREPARED IT FOR INCLUSION IN THE LCP PROTOCOL.
   * WE CAN NOW START THE LCP PROTOCOL AGAIN.
   * WE HAVE ALSO MADE THIS FOR THE GCP PROTOCOL.
   * WE ARE READY TO START THE PROTOCOLS AND RESPOND TO THE START REQUEST
   * FROM THE STARTING NODE.
   *------------------------------------------------------------------------*/

  StartMeConf * const startMe = (StartMeConf *)&signal->theData[0];

  /* The sysfile is larger than one signal: send it to the starting node
   * in DATA_SIZE-word chunks (ceiling division gives the signal count). */
  const Uint32 wordPerSignal = StartMeConf::DATA_SIZE;
  const int noOfSignals = ((Sysfile::SYSFILE_SIZE32 + (wordPerSignal - 1)) /
                           wordPerSignal);

  Uint32 nodeId = startMe->startingNodeId = c_nodeStartMaster.startNode;
  startMe->startWord = 0;

  const Uint32 ref = calcDihBlockRef(c_nodeStartMaster.startNode);
  for(int i = 0; i < noOfSignals; i++){
    jam();
    { // Do copy
      const int startWord = startMe->startWord;
      for(Uint32 j = 0; j < wordPerSignal; j++){
        startMe->data[j] = sysfileData[j+startWord];
      }
    }
    sendSignal(ref, GSN_START_MECONF, signal, StartMeConf::SignalLength, JBB);
    startMe->startWord += wordPerSignal;
  }//for
  nodeResetStart(signal);

  /**
   * At this point the master knows that the starting node will start executing
   * the Database Recovery. This can take a fair amount of time. At the end of
   * the recovery the starting node need to be part of a LCP. In order to
   * synchronize for several nodes restarting at the same time we need to keep
   * track of start times.
   *
   * We expect that in most parallel node restarts the nodes are restarted
   * immediately after a crash or as part of a rolling restart. In this case
   * the node restart times will be very similar. So we should be able to
   * roughly estimate when the node restart will reach the point where it
   * is ready to wait for an LCP.
   *
   * When the first node reaches this point and also later nodes reach this
   * phase, then they will be able to estimate whether it is worth it to
   * hold the LCP until the next node arrives to this phase.
   *
   * The similitude of a flight or a train waiting for passengers arriving
   * on other flights or trains can be used here. It is useful to wait for
   * some time since there is a high cost for passengers to miss the train.
   * At the same time it isn't worthwhile to hold it for a very long time
   * since then all other passengers will suffer greatly. In this case the
   * other nodes waiting will suffer, but also we will risk running out of
   * REDO log space if we wait for too long time.
   *
   * Given that we don't wait for more than a short time to synchronize
   * means that the case of heterogenous nodes will also work ok in this
   * context although we will optimize for the homogenous case.
   *
   * To get even better estimates of where we are and to give users even
   * better understanding of what takes time in node restarts we have also
   * added that the LDMs report when they have completed the 3 local phases
   * of local recovery. These are completion of restore fragments,
   * completion of UNDO Disk data, completion of execution of REDO log and
   * the final phase executed in LDMs are the ordered index rebuilds which is
   * completed when the local recovery is completed.
   */
  setNodeRecoveryStatus(nodeId, NodeRecord::LOCAL_RECOVERY_STARTED);

  /**
   * Allow next node to start...
   */
  signal->theData[0] = nodeId;
  sendSignal(NDBCNTR_REF, GSN_START_PERMREP, signal, 1, JBB);
}//Dbdih::execUNBLO_DICTCONF()
4107 
4108 /*---------------------------------------------------------------------------*/
4109 /*                    NODE RESTART COPY REQUEST                              */
4110 /*---------------------------------------------------------------------------*/
4111 // A NODE RESTART HAS REACHED ITS FINAL PHASE WHEN THE DATA IS TO BE COPIED
4112 // TO THE NODE. START_COPYREQ IS EXECUTED BY THE STARTING NODE.
4113 /*---------------------------------------------------------------------------*/
/**
 * START_COPYREQ arrives when a starting node has reached the final phase
 * of its node restart and its data must be brought up to date by copying
 * fragments from the live nodes. This handler is only executed in the
 * master node (see comment below). Based on the starting node's saved
 * active status in the Sysfile we decide which node's fragment set it
 * will take over - normally its own.
 */
void Dbdih::execSTART_COPYREQ(Signal* signal)
{
  jamEntry();
  StartCopyReq req = *(StartCopyReq*)signal->getDataPtr();

  Uint32 startNodeId = req.startingNodeId;

  /*-------------------------------------------------------------------------*/
  /*
   * REPORT Copy process of node restart is now about to start up.
   *
   * We will report this both in an internal state that can be used to
   * report progress in NDBINFO tables as well as being used to keep track of
   * node restart status to make correct decisions on when to start LCPs.
   * We also report it to cluster log and internal node log.
   *
   * This code is only executed in master node.
   */
  /*-------------------------------------------------------------------------*/
  signal->theData[0] = NDB_LE_NR_CopyFragsStarted;
  signal->theData[1] = req.startingNodeId;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);

  g_eventLogger->info("Restore Database Off-line Starting");
  infoEvent("Restore Database Off-line Starting on node %u",
            startNodeId);

  CRASH_INSERTION(7131);

  /* Choose the take-over source based on the node's last recorded state. */
  switch (getNodeActiveStatus(startNodeId)) {
  case Sysfile::NS_Active:
  case Sysfile::NS_ActiveMissed_1:
  case Sysfile::NS_ActiveMissed_2:
  case Sysfile::NS_NotActive_NotTakenOver:
  case Sysfile::NS_Configured:
    jam();
    /*-----------------------------------------------------------------------*/
    // AN ACTIVE NODE HAS BEEN STARTED. THE ACTIVE NODE MUST THEN GET ALL DATA
    // IT HAD BEFORE ITS CRASH. WE START THE TAKE OVER IMMEDIATELY.
    // SINCE WE ARE AN ACTIVE NODE WE WILL TAKE OVER OUR OWN NODE THAT
    // PREVIOUSLY CRASHED.
    /*-----------------------------------------------------------------------*/
    startTakeOver(signal, startNodeId, startNodeId, &req);
    break;
  case Sysfile::NS_TakeOver:{
    jam();
    /*--------------------------------------------------------------------
     * We were in the process of taking over but it was not completed.
     * We will complete it now instead.
     *--------------------------------------------------------------------*/
    Uint32 takeOverNode = Sysfile::getTakeOverNode(startNodeId,
						   SYSFILE->takeOver);
    if(takeOverNode == 0){
      jam();
      /* Inconsistent take-over information in the Sysfile; fall back to
       * taking over our own node and flag the anomaly in the log. */
      warningEvent("Bug in take-over code restarting");
      takeOverNode = startNodeId;
    }

    startTakeOver(signal, startNodeId, takeOverNode, &req);
    break;
  }
  default:
    /* No other active status is legal when START_COPYREQ arrives. */
    ndbrequire(false);
    break;
  }//switch
}//Dbdih::execSTART_COPYREQ()
4180 
4181 /*---------------------------------------------------------------------------*/
4182 /*                    SLAVE LOGIC FOR NODE RESTART                           */
4183 /*---------------------------------------------------------------------------*/
/**
 * START_INFOREQ is broadcast by the master when a node wants to (re)start.
 * Each receiving node verifies that it can accept the starting node right
 * now and answers with START_INFOCONF, or with START_INFOREF when the
 * start must be refused for the moment (node start blocked, the slave
 * INCL_NODEREQ protocol still busy, or error insert 7124).
 */
void Dbdih::execSTART_INFOREQ(Signal* signal)
{
  jamEntry();
  StartInfoReq *const req =(StartInfoReq*)&signal->theData[0];
  Uint32 startNode = req->startingNodeId;
  if (cfailurenr != req->systemFailureNo) {
    jam();
    //---------------------------------------------------------------
    // A failure occurred since master sent this request. We will ignore
    // this request since the node is already dead that is starting.
    //---------------------------------------------------------------
    return;
  }//if
  CRASH_INSERTION(7123);
  if (isMaster()) {
    jam();
    ndbrequire(getNodeStatus(startNode) == NodeRecord::STARTING);
  } else {
    jam();
    if (getNodeStatus(startNode) == NodeRecord::STARTING)
    {
      /**
       * The master is sending out a new START_INFOREQ, obviously some
       * other node wasn't ready to start it yet, we are still ready.
       * We will report this fact without any additional state changes.
       */
      jam();
      NodeRecordPtr nodePtr;
      nodePtr.i = startNode;
      ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::NODE_GETTING_PERMIT);
      ndbrequire(getAllowNodeStart(startNode));

      StartInfoConf * c = (StartInfoConf*)&signal->theData[0];
      c->sendingNodeId = cownNodeId;
      c->startingNodeId = startNode;
      sendSignal(cmasterdihref, GSN_START_INFOCONF, signal,
	         StartInfoConf::SignalLength, JBB);
      return;
    }
    else
    {
      jam();
      ndbrequire(getNodeStatus(startNode) == NodeRecord::DEAD);
    }
  }//if
  /* Refuse the start if this node cannot accept it right now; the
   * individual log messages below identify which condition triggered. */
  if ((!getAllowNodeStart(startNode)) ||
      (c_nodeStartSlave.nodeId != 0) ||
      (ERROR_INSERTED(7124))) {
    jam();
    if (!getAllowNodeStart(startNode))
    {
      jam();
      g_eventLogger->info("Not allowed to start now for node %u", startNode);
    }
    else if (c_nodeStartSlave.nodeId != 0)
    {
      jam();
      g_eventLogger->info("INCL_NODEREQ protocol still ongoing node = %u"
                          " c_nodeStartSlave.nodeId = %u",
                          startNode,
                          c_nodeStartSlave.nodeId);
    }
    else
    {
      jam();
      g_eventLogger->info("ERROR INSERT 7124");
    }
    StartInfoRef *const ref =(StartInfoRef*)&signal->theData[0];
    ref->startingNodeId = startNode;
    ref->sendingNodeId = cownNodeId;
    ref->errorCode = StartPermRef::ZNODE_START_DISALLOWED_ERROR;
    sendSignal(cmasterdihref, GSN_START_INFOREF, signal,
	       StartInfoRef::SignalLength, JBB);
    return;
  }//if
  setNodeStatus(startNode, NodeRecord::STARTING);
  if (req->typeStart == NodeState::ST_INITIAL_NODE_RESTART) {
    jam();
    /* Initial node restart: the node's old LCP information must be
     * invalidated first. No START_INFOCONF is sent here; presumably it is
     * sent when the invalidation completes - verify in invalidateNodeLCP. */
    g_eventLogger->info("Started invalidation of node %u", startNode);
    setAllowNodeStart(startNode, false);
    invalidateNodeLCP(signal, startNode, 0);
  } else {
    jam();
    if (!isMaster())
    {
      jam();
      setNodeRecoveryStatus(startNode, NodeRecord::NODE_GETTING_PERMIT);
    }
    StartInfoConf * c = (StartInfoConf*)&signal->theData[0];
    c->sendingNodeId = cownNodeId;
    c->startingNodeId = startNode;
    sendSignal(cmasterdihref, GSN_START_INFOCONF, signal,
	       StartInfoConf::SignalLength, JBB);
    return;
  }//if
}//Dbdih::execSTART_INFOREQ()
4282 
/**
 * INCL_NODEREQ - include a started node in the distributed protocols.
 *
 * Signal layout (as read below):
 *   theData[0] - sender reference (required to be the master DIH)
 *   theData[1] - node id being included
 *   theData[2] - failure number when the request was sent
 *   theData[4] - current GCI, high word
 *   theData[5] - current GCI, low word (absent from older, shorter signals)
 *
 * The starting node itself only records the GCI and confirms; other nodes
 * resurrect the node record and forward the inclusion to the local LQH.
 */
void Dbdih::execINCL_NODEREQ(Signal* signal)
{
  jamEntry();
  Uint32 retRef = signal->theData[0];
  Uint32 nodeId = signal->theData[1];
  if (nodeId == getOwnNodeId() && ERROR_INSERTED(7165))
  {
    /* Error insert: delay our own inclusion by 5 seconds. */
    CLEAR_ERROR_INSERT_VALUE;
    sendSignalWithDelay(reference(), GSN_INCL_NODEREQ, signal, 5000,
                        signal->getLength());
    return;
  }

  Uint32 tnodeStartFailNr = signal->theData[2];
  Uint32 gci_hi = signal->theData[4];
  Uint32 gci_lo = signal->theData[5];
  if (unlikely(signal->getLength() < 6))
  {
    jam();
    /* Sender predates the 64-bit (micro) GCI extension. */
    gci_lo = 0;
  }

  Uint64 gci = gci_lo | (Uint64(gci_hi) << 32);
  CRASH_INSERTION(7127);
  m_micro_gcp.m_current_gci = gci;
  m_micro_gcp.m_old_gci = gci - 1;

  /*-------------------------------------------------------------------------*/
  // When a node is restarted we must ensure that a lcp will be run
  // as soon as possible and the reset the delay according to the original
  // configuration.
  // Without an initial local checkpoint the new node will not be available.
  /*-------------------------------------------------------------------------*/
  if (getOwnNodeId() == nodeId) {
    jam();
    /*-----------------------------------------------------------------------*/
    // We are the starting node. We came here only to set the global checkpoint
    // id's and the lcp status.
    /*-----------------------------------------------------------------------*/
    CRASH_INSERTION(7171);
    Uint32 masterVersion = getNodeInfo(refToNode(cmasterdihref)).m_version;

    /* Only masters recent enough to understand INCL_NODECONF get one. */
    if ((NDB_VERSION_MAJOR == 4 &&
	 masterVersion >= NDBD_INCL_NODECONF_VERSION_4) ||
	(NDB_VERSION_MAJOR == 5 &&
	 masterVersion >= NDBD_INCL_NODECONF_VERSION_5) ||
	(NDB_VERSION_MAJOR > 5))
    {
      signal->theData[0] = getOwnNodeId();
      signal->theData[1] = getOwnNodeId();
      sendSignal(cmasterdihref, GSN_INCL_NODECONF, signal, 2, JBB);
    }
    return;
  }//if
  if (getNodeStatus(nodeId) != NodeRecord::STARTING) {
    jam();
    return;
  }//if
  /* A failure since the request was sent would invalidate it. */
  ndbrequire(cfailurenr == tnodeStartFailNr);
  /* Only one slave inclusion protocol may run at a time. */
  ndbrequire (c_nodeStartSlave.nodeId == 0);
  c_nodeStartSlave.nodeId = nodeId;

  ndbrequire (retRef == cmasterdihref);

  NodeRecordPtr nodePtr;
  nodePtr.i = nodeId;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);

  /* Re-initialise the node record but keep its group and active status. */
  Sysfile::ActiveStatus TsaveState = nodePtr.p->activeStatus;
  Uint32 TnodeGroup = nodePtr.p->nodeGroup;

  initNodeRecord(nodePtr);
  nodePtr.p->nodeGroup = TnodeGroup;
  nodePtr.p->activeStatus = TsaveState;
  nodePtr.p->nodeStatus = NodeRecord::ALIVE;
  nodePtr.p->useInTransactions = true;
  nodePtr.p->m_inclDihLcp = true;

  removeDeadNode(nodePtr);
  insertAlive(nodePtr);
  con_lineNodes++;

  /*-------------------------------------------------------------------------*/
  //      WE WILL ALSO SEND THE INCLUDE NODE REQUEST TO THE LOCAL LQH BLOCK.
  /*-------------------------------------------------------------------------*/
  signal->theData[0] = reference();
  signal->theData[1] = nodeId;
  signal->theData[2] = Uint32(m_micro_gcp.m_current_gci >> 32);
  sendSignal(clocallqhblockref, GSN_INCL_NODEREQ, signal, 3, JBB);
}//Dbdih::execINCL_NODEREQ()
4373 
4374 /* ------------------------------------------------------------------------- */
4375 // execINCL_NODECONF() is found in the master logic part since it is used by
4376 // both the master and the slaves.
4377 /* ------------------------------------------------------------------------- */
4378 
4379 /******************************************************************************
4380  *
4381  * Node takeover functionality
4382  * MASTER part
4383  *****************************************************************************/
execSTART_TOREQ(Signal * signal)4384 void Dbdih::execSTART_TOREQ(Signal* signal)
4385 {
4386   jamEntry();
4387   StartToReq req = *(StartToReq *)&signal->theData[0];
4388 
4389 
4390   if (ndb_pnr(getNodeInfo(refToNode(req.senderRef)).m_version))
4391   {
4392     jam();
4393     TakeOverRecordPtr takeOverPtr;
4394 
4395     c_takeOverPool.seize(takeOverPtr);
4396     c_masterActiveTakeOverList.addFirst(takeOverPtr);
4397     takeOverPtr.p->toStartingNode = req.startingNodeId;
4398     takeOverPtr.p->m_senderRef = req.senderRef;
4399     takeOverPtr.p->m_senderData = req.senderData;
4400     takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MASTER_IDLE;
4401     takeOverPtr.p->toStartTime = c_current_time;
4402   }
4403 
4404   setNodeRecoveryStatus(req.startingNodeId,
4405                         NodeRecord::COPY_FRAGMENTS_STARTED);
4406 
4407   StartToConf * conf = (StartToConf *)&signal->theData[0];
4408   conf->senderData = req.senderData;
4409   conf->sendingNodeId = cownNodeId;
4410   conf->startingNodeId = req.startingNodeId;
4411   sendSignal(req.senderRef, GSN_START_TOCONF,
4412              signal, StartToConf::SignalLength, JBB);
4413 }//Dbdih::execSTART_TOREQ()
4414 
/**
 * UPDATE_TOREQ drives the copy of one fragment through its phases:
 *   BEFORE_STORED        - claim the node group for this take over and
 *                          acquire the fragment-info mutex (async; conf
 *                          is sent from the mutex-locked callback)
 *   AFTER_STORED         - release the mutex, confirm immediately
 *   BEFORE_COMMIT_STORED - re-acquire the mutex (async, as above)
 *   AFTER_COMMIT_STORED  - release both mutexes, take over done for this
 *                          fragment, confirm immediately
 * On error (unknown take over, node group busy with another take over)
 * UPDATE_TOREF is returned with an error code.
 */
void Dbdih::execUPDATE_TOREQ(Signal* signal)
{
  jamEntry();
  UpdateToReq req = *(UpdateToReq *)&signal->theData[0];

  Uint32 errCode;
  Uint32 extra;
  g_eventLogger->debug("Received UPDATE_TOREQ for startnode: %u, copynode:%u",
                       req.startingNodeId, req.copyNodeId);
  if (ndb_pnr(getNodeInfo(refToNode(req.senderRef)).m_version))
  {
    jam();
    /**
     *
     */
    TakeOverRecordPtr takeOverPtr;
    if (findTakeOver(takeOverPtr, req.startingNodeId) == false)
    {
      g_eventLogger->info("Unknown takeOver node: %u", req.startingNodeId);
      errCode = UpdateToRef::UnknownTakeOver;
      extra = RNIL;
      goto ref;
    }

    CRASH_INSERTION(7141);

    takeOverPtr.p->toCopyNode = req.copyNodeId;
    takeOverPtr.p->toCurrentTabref = req.tableId;
    takeOverPtr.p->toCurrentFragid = req.fragmentNo;

    /* Locate the node group of the copy node; a node group allows only
     * one active take over at a time. */
    NodeRecordPtr nodePtr;
    NodeGroupRecordPtr NGPtr;
    nodePtr.i = req.copyNodeId;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    NGPtr.i = nodePtr.p->nodeGroup;
    ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);

    Mutex mutex(signal, c_mutexMgr, takeOverPtr.p->m_fragmentInfoMutex);
    Callback c = { safe_cast(&Dbdih::updateToReq_fragmentMutex_locked),
                   takeOverPtr.i };

    switch(req.requestType){
    case UpdateToReq::BEFORE_STORED:
      jam();

      if (NGPtr.p->activeTakeOver == 0)
      {
        jam();
        /* First copy thread for this node group: claim it. */
        NGPtr.p->activeTakeOver = req.startingNodeId;
        NGPtr.p->activeTakeOverCount = 1;
      }
      else if (NGPtr.p->activeTakeOver == req.startingNodeId)
      {
        /* Another copy thread of the same take over: count it. */
        NGPtr.p->activeTakeOverCount++;
      }
      else
      {
        jam();
        /* Node group busy with a different node's take over. */
        errCode = UpdateToRef::CopyFragInProgress;
        extra = NGPtr.p->activeTakeOver;
        g_eventLogger->info("takeOver node in progress: %u",
                            NGPtr.p->activeTakeOver);
        goto ref;
      }

      takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MUTEX_BEFORE_STORED;
      mutex.lock(c, false, true);
      return;
    case UpdateToReq::AFTER_STORED:
    {
      jam();
      mutex.unlock();
      takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_AFTER_STORED;
      // Send conf
      break;
    }
    case UpdateToReq::BEFORE_COMMIT_STORED:
      jam();
      takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MUTEX_BEFORE_COMMIT;
      mutex.lock(c, false, true);
      return;
    case UpdateToReq::AFTER_COMMIT_STORED:
    {
      jam();
      mutex.unlock();

      Mutex mutex2(signal, c_mutexMgr,
                   takeOverPtr.p->m_switchPrimaryMutexHandle);
      mutex2.unlock();
      takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MASTER_IDLE;
      break; // send conf
    }
    }
  }
  else
  {
    /* Pre-pnr sender: no master-side take over bookkeeping needed. */
    CRASH_INSERTION(7154);
    RETURN_IF_NODE_NOT_ALIVE(req.startingNodeId);
  }

  {
    UpdateToConf * conf = (UpdateToConf *)&signal->theData[0];
    conf->senderData = req.senderData;
    conf->sendingNodeId = cownNodeId;
    conf->startingNodeId = req.startingNodeId;
    sendSignal(req.senderRef, GSN_UPDATE_TOCONF, signal,
               UpdateToConf::SignalLength, JBB);
  }
  return;

ref:
  UpdateToRef* ref = (UpdateToRef*)signal->getDataPtrSend();
  ref->senderData = req.senderData;
  ref->senderRef = reference();
  ref->errorCode = errCode;
  ref->extra = extra;
  sendSignal(req.senderRef, GSN_UPDATE_TOREF, signal,
             UpdateToRef::SignalLength, JBB);
}
4534 
/**
 * Callback invoked when the fragment-info mutex lock requested by
 * execUPDATE_TOREQ completes (or is queued). While queued we just log
 * and keep waiting; once granted we either confirm the BEFORE_STORED
 * phase, or - for BEFORE_COMMIT - release this take over's claim on the
 * node group and go on to acquire the switch-primary mutex. If the
 * starting node died meanwhile the whole take over is aborted.
 */
void
Dbdih::updateToReq_fragmentMutex_locked(Signal * signal,
                                        Uint32 toPtrI, Uint32 retVal)
{
  jamEntry();
  TakeOverRecordPtr takeOverPtr;
  c_takeOverPool.getPtr(takeOverPtr, toPtrI);

  Uint32 nodeId = takeOverPtr.p->toStartingNode;

  if (retVal == UtilLockRef::InLockQueue)
  {
    jam();
    /* Still waiting for the mutex; the callback fires again when it is
     * finally granted. */
    infoEvent("Node %u waiting to continue copying table %u fragment: %u (%s)",
              nodeId,
              takeOverPtr.p->toCurrentTabref,
              takeOverPtr.p->toCurrentFragid,
              takeOverPtr.p->toMasterStatus ==
                TakeOverRecord::TO_MUTEX_BEFORE_STORED ? "STORED" : "COMMIT");
    return;
  }

  Uint32 errCode;
  Uint32 extra;

  NodeRecordPtr nodePtr;
  nodePtr.i = nodeId;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
  if (unlikely(nodePtr.p->nodeStatus != NodeRecord::ALIVE))
  {
    jam();
    /**
     * Node died while we waited for lock...
     */
    abortTakeOver(signal, takeOverPtr);
    return;
  }

  switch(takeOverPtr.p->toMasterStatus){
  case TakeOverRecord::TO_MUTEX_BEFORE_STORED:
  {
    jam();
    // send conf
    takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MUTEX_BEFORE_LOCKED;
    break;
  }
  case TakeOverRecord::TO_MUTEX_BEFORE_COMMIT:
  {
    jam();

    NodeRecordPtr nodePtr;
    NodeGroupRecordPtr NGPtr;
    nodePtr.i = takeOverPtr.p->toCopyNode;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    NGPtr.i = nodePtr.p->nodeGroup;
    ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);

    if (NGPtr.p->activeTakeOver != nodeId)
    {
      /* The node group's take over claim must belong to us here. */
      ndbassert(false);
      errCode = UpdateToRef::InvalidRequest;
      extra = NGPtr.p->activeTakeOver;
      goto ref;
    }
    ndbrequire(NGPtr.p->activeTakeOverCount > 0);
    NGPtr.p->activeTakeOverCount--;
    if (NGPtr.p->activeTakeOverCount == 0)
    {
      /**
       * Last active copy thread, give up activeTakeOver for now
       */
      jam();
      NGPtr.p->activeTakeOver = 0;
    }
    takeOverPtr.p->toCopyNode = RNIL;
    /* Next step: lock the switch-primary mutex; the conf is sent from
     * switchPrimaryMutex_locked once the lock is granted. */
    Mutex mutex(signal, c_mutexMgr,
                takeOverPtr.p->m_switchPrimaryMutexHandle);
    Callback c = { safe_cast(&Dbdih::switchPrimaryMutex_locked),
                   takeOverPtr.i };
    ndbrequire(mutex.lock(c));
    takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MUTEX_BEFORE_SWITCH_REPLICA;
    return;
    break;
  }
  default:
    jamLine(takeOverPtr.p->toMasterStatus);
    ndbrequire(false);
  }

  {
    UpdateToConf * conf = (UpdateToConf *)&signal->theData[0];
    conf->senderData = takeOverPtr.p->m_senderData;
    conf->sendingNodeId = cownNodeId;
    conf->startingNodeId = takeOverPtr.p->toStartingNode;
    sendSignal(takeOverPtr.p->m_senderRef, GSN_UPDATE_TOCONF, signal,
               UpdateToConf::SignalLength, JBB);
  }
  return;

ref:
  {
    /* Error path: release the fragment-info mutex before replying. */
    Mutex mutex(signal, c_mutexMgr, takeOverPtr.p->m_fragmentInfoMutex);
    mutex.unlock();

    UpdateToRef* ref = (UpdateToRef*)signal->getDataPtrSend();
    ref->senderData = takeOverPtr.p->m_senderData;
    ref->senderRef = reference();
    ref->errorCode = errCode;
    ref->extra = extra;
    sendSignal(takeOverPtr.p->m_senderRef, GSN_UPDATE_TOREF, signal,
               UpdateToRef::SignalLength, JBB);
    return;
  }
}
4649 
/**
 * Callback invoked when the switch-primary mutex has been acquired
 * (requested from updateToReq_fragmentMutex_locked). Aborts the take
 * over if the starting node died while waiting; otherwise advances the
 * take over state and confirms the pending UPDATE_TOREQ.
 */
void
Dbdih::switchPrimaryMutex_locked(Signal* signal, Uint32 toPtrI, Uint32 retVal)
{
  jamEntry();
  ndbrequire(retVal == 0);

  TakeOverRecordPtr takeOverPtr;
  c_takeOverPool.getPtr(takeOverPtr, toPtrI);

  Uint32 nodeId = takeOverPtr.p->toStartingNode;
  NodeRecordPtr nodePtr;
  nodePtr.i = nodeId;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);

  if (unlikely(nodePtr.p->nodeStatus != NodeRecord::ALIVE))
  {
    jam();
    /**
     * Node died while we waited for lock...
     */
    abortTakeOver(signal, takeOverPtr);
    return;
  }

  takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MUTEX_AFTER_SWITCH_REPLICA;

  UpdateToConf * conf = (UpdateToConf *)&signal->theData[0];
  conf->senderData = takeOverPtr.p->m_senderData;
  conf->sendingNodeId = cownNodeId;
  conf->startingNodeId = takeOverPtr.p->toStartingNode;
  sendSignal(takeOverPtr.p->m_senderRef, GSN_UPDATE_TOCONF, signal,
             UpdateToConf::SignalLength, JBB);
}
4683 
4684 void
switchPrimaryMutex_unlocked(Signal * signal,Uint32 toPtrI,Uint32 retVal)4685 Dbdih::switchPrimaryMutex_unlocked(Signal* signal, Uint32 toPtrI, Uint32 retVal)
4686 {
4687   jamEntry();
4688   ndbrequire(retVal == 0);
4689 
4690   TakeOverRecordPtr takeOverPtr;
4691   c_takeOverPool.getPtr(takeOverPtr, toPtrI);
4692 
4693   UpdateToConf * conf = (UpdateToConf *)&signal->theData[0];
4694   conf->senderData = takeOverPtr.p->m_senderData;
4695   conf->sendingNodeId = cownNodeId;
4696   conf->startingNodeId = takeOverPtr.p->toStartingNode;
4697   sendSignal(takeOverPtr.p->m_senderRef, GSN_UPDATE_TOCONF, signal,
4698              UpdateToConf::SignalLength, JBB);
4699 }
4700 
/**
 * Abort an ongoing take over (typically because the starting node died).
 * Releases both take-over mutexes if held, gives back the node group's
 * activeTakeOver claim if it belongs to this take over, and finally
 * releases the TakeOverRecord itself.
 */
void
Dbdih::abortTakeOver(Signal* signal, TakeOverRecordPtr takeOverPtr)
{
  if (!takeOverPtr.p->m_switchPrimaryMutexHandle.isNull())
  {
    jam();
    Mutex mutex(signal, c_mutexMgr,
                takeOverPtr.p->m_switchPrimaryMutexHandle);
    mutex.unlock();

  }

  if (!takeOverPtr.p->m_fragmentInfoMutex.isNull())
  {
    jam();
    Mutex mutex(signal, c_mutexMgr,
                takeOverPtr.p->m_fragmentInfoMutex);
    mutex.unlock();
  }

  /* toCopyNode is RNIL when no copy was active; only a live claim on the
   * copy node's group needs to be reset. */
  NodeRecordPtr nodePtr;
  nodePtr.i = takeOverPtr.p->toCopyNode;
  if (nodePtr.i != RNIL)
  {
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    NodeGroupRecordPtr NGPtr;
    NGPtr.i = nodePtr.p->nodeGroup;
    ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
    if (NGPtr.p->activeTakeOver == takeOverPtr.p->toStartingNode)
    {
      jam();
      NGPtr.p->activeTakeOver = 0;
      NGPtr.p->activeTakeOverCount = 0;
    }
  }

  releaseTakeOver(takeOverPtr, true);
}
4739 
/**
 * Saturating add for a 32-bit LCP activity counter: the sum is clamped
 * at 0xFFFFFFFF instead of wrapping around.
 */
static
void
add_lcp_counter(Uint32 * counter, Uint32 add)
{
  const Uint64 sum = Uint64(*counter) + Uint64(add);
  *counter = (sum > Uint64(0xFFFFFFFF)) ? Uint32(0xFFFFFFFF) : Uint32(sum);
}
4750 
execEND_TOREQ(Signal * signal)4751 void Dbdih::execEND_TOREQ(Signal* signal)
4752 {
4753   jamEntry();
4754   EndToReq req = *(EndToReq *)&signal->theData[0];
4755 
4756   Uint32 nodeId = refToNode(req.senderRef);
4757   TakeOverRecordPtr takeOverPtr;
4758 
4759   if (ndb_pnr(getNodeInfo(nodeId).m_version))
4760   {
4761     jam();
4762     /**
4763      *
4764      */
4765     ndbrequire(findTakeOver(takeOverPtr, nodeId));
4766     NodeRecordPtr nodePtr;
4767     nodePtr.i = nodeId;
4768     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
4769 
4770     if (req.flags & StartCopyReq::WAIT_LCP)
4771     {
4772       /**
4773        * Wait for LCP
4774        */
4775       Uint32 latestLCP_ID = SYSFILE->latestLCP_ID;
4776       switch (c_lcpState.lcpStatus)
4777       {
4778         case LCP_STATUS_IDLE:
4779         case LCP_WAIT_MUTEX:
4780         case LCP_TCGET:
4781         case LCP_TC_CLOPSIZE:
4782           /**
4783            * We haven't started the next LCP yet, we haven't assigned the
4784            * nodes to participate in this LCP, so we will wait for the next
4785            * LCP started.
4786            */
4787          jam();
4788          latestLCP_ID++;
4789          break;
4790        default:
4791          /**
4792           * All the remaining status codes means that the LCP has been started
4793           * and that the participating nodes have been set. So if our node is
4794           * part of the participating nodes we will wait for this LCP,
4795           * otherwise we will wait for the next LCP to start.
4796           */
4797          jam();
4798          if (!c_lcpState.m_participatingLQH.get(nodeId))
4799          {
4800            jam();
4801            latestLCP_ID++;
4802          }
4803          break;
4804       }
4805       infoEvent("Make On-line Database recoverable by waiting"
4806                 " for LCP Starting on node %u, LCP id %u",
4807                 nodeId,
4808                 latestLCP_ID);
4809 
4810       nodePtr.p->copyCompleted = 2;
4811       takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_WAIT_LCP;
4812 
4813       /**
4814        * Make sure that node also participated in one GCP
4815        *   before running it's first LCP, so that GCI variables
4816        *   in LQH are set properly
4817        */
4818       c_lcpState.lcpStopGcp = c_newest_restorable_gci;
4819 
4820       /**
4821        * We want to keep track of how long time we wait for LCP to be able
4822        * to present it in an ndbinfo table. This information is also used
4823        * in deciding when to start LCPs.
4824        *
4825        * We ensure that we will not stall any LCPs in this state due to not
4826        * having had enough activity. We can still stall due to waiting for
4827        * other nodes to reach this state.
4828        */
4829       add_lcp_counter(&c_lcpState.ctimer, (1 << 31));
4830       setNodeRecoveryStatus(nodePtr.i, NodeRecord::WAIT_LCP_FOR_RESTART);
4831       return;
4832     }
4833     nodePtr.p->copyCompleted = 1;
4834     releaseTakeOver(takeOverPtr, true);
4835   }
4836 
4837   EndToConf * conf = (EndToConf *)&signal->theData[0];
4838   conf->senderData = req.senderData;
4839   conf->sendingNodeId = cownNodeId;
4840   conf->startingNodeId = req.startingNodeId;
4841   sendSignal(req.senderRef, GSN_END_TOCONF, signal,
4842              EndToConf::SignalLength, JBB);
4843 }//Dbdih::execEND_TOREQ()
4844 
4845 #define DIH_TAB_WRITE_LOCK(tabPtrP) \
4846   do { assertOwnThread(); tabPtrP->m_lock.write_lock(); } while (0)
4847 
4848 #define DIH_TAB_WRITE_UNLOCK(tabPtrP) \
4849   do { assertOwnThread(); tabPtrP->m_lock.write_unlock(); } while (0)
4850 
4851 /* --------------------------------------------------------------------------*/
4852 /*       AN ORDER TO START OR COMMIT THE REPLICA CREATION ARRIVED FROM THE   */
4853 /*       MASTER.                                                             */
4854 /* --------------------------------------------------------------------------*/
/**
 * UPDATE_FRAG_STATEREQ - an order from the master to start or commit the
 * creation of a replica on the starting node for one fragment.
 *
 * STORED:        insert the new node as a backup replica and bump the
 *                fragment's distribution key.
 * COMMIT_STORED: move the replica into the stored list in correct
 *                primary order, now that it holds all data.
 * START_LOGGING: no fragment-table change, only the per-replica GCI
 *                bookkeeping below.
 * The fragment table updates are done under the table write lock.
 */
void Dbdih::execUPDATE_FRAG_STATEREQ(Signal* signal)
{
  jamEntry();
  UpdateFragStateReq * const req = (UpdateFragStateReq *)&signal->theData[0];

  Uint32 senderData = req->senderData;
  Uint32 senderRef = req->senderRef;

  TabRecordPtr tabPtr;
  tabPtr.i = req->tableId;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

  Uint32 fragId = req->fragId;
  Uint32 tdestNodeid = req->startingNodeId;
  //Uint32 tsourceNodeid = req->copyNodeId;
  Uint32 startGci = req->startGci;
  Uint32 replicaType = req->replicaType;
  Uint32 tFailedNodeId = req->failedNodeId;

  if (!ndb_pnr(getNodeInfo(refToNode(senderRef)).m_version))
  {
    jam();
    /* Pre-pnr senders do not fill in failedNodeId; the starting node is
     * always taking over itself in that case. */
    tFailedNodeId = tdestNodeid;
  }

  FragmentstorePtr fragPtr;
  getFragstore(tabPtr.p, fragId, fragPtr);
  RETURN_IF_NODE_NOT_ALIVE(tdestNodeid);
  /* For START_LOGGING search the stored replicas, otherwise the old
   * (crashed) replica list. */
  ReplicaRecordPtr frReplicaPtr;
  findReplica(frReplicaPtr, fragPtr.p, tFailedNodeId,
              replicaType == UpdateFragStateReq::START_LOGGING ? false : true);
  if (frReplicaPtr.i == RNIL)
  {
    /* Dump diagnostics before the unconditional failure below. */
    dump_replica_info(fragPtr.p);
  }
  ndbrequire(frReplicaPtr.i != RNIL);

  DIH_TAB_WRITE_LOCK(tabPtr.p);
  switch (replicaType) {
  case UpdateFragStateReq::STORED:
    jam();
    CRASH_INSERTION(7138);
    /* ----------------------------------------------------------------------*/
    /*  HERE WE ARE INSERTING THE NEW BACKUP NODE IN THE EXECUTION OF ALL    */
    /*  OPERATIONS. FROM HERE ON ALL OPERATIONS ON THIS FRAGMENT WILL INCLUDE*/
    /*  USE OF THE NEW REPLICA.                                              */
    /* --------------------------------------------------------------------- */
    insertBackup(fragPtr, tdestNodeid);

    fragPtr.p->distributionKey++;
    fragPtr.p->distributionKey &= 255;
    break;
  case UpdateFragStateReq::COMMIT_STORED:
    jam();
    CRASH_INSERTION(7139);
    /* ----------------------------------------------------------------------*/
    /*  HERE WE ARE MOVING THE REPLICA TO THE STORED SECTION SINCE IT IS NOW */
    /*  FULLY LOADED WITH ALL DATA NEEDED.                                   */
    // We also update the order of the replicas here so that if the new
    // replica is the desired primary we insert it as primary.
    /* ----------------------------------------------------------------------*/
    removeOldStoredReplica(fragPtr, frReplicaPtr);
    linkStoredReplica(fragPtr, frReplicaPtr);
    updateNodeInfo(fragPtr);
    break;
  case UpdateFragStateReq::START_LOGGING:
    jam();
    break;
  default:
    ndbrequire(false);
    break;
  }//switch
  DIH_TAB_WRITE_UNLOCK(tabPtr.p);

  /* ------------------------------------------------------------------------*/
  /*       THE NEW NODE OF THIS REPLICA IS THE STARTING NODE.                */
  /* ------------------------------------------------------------------------*/
  if (tFailedNodeId != tdestNodeid)
  {
    jam();
    /**
     * This is a Hot-spare or move partition
     */

    /*  IF WE ARE STARTING A TAKE OVER NODE WE MUST INVALIDATE ALL LCP'S.   */
    /*  OTHERWISE WE WILL TRY TO START LCP'S THAT DO NOT EXIST.             */
    /* ---------------------------------------------------------------------*/
    frReplicaPtr.p->procNode = tdestNodeid;
    frReplicaPtr.p->noCrashedReplicas = 0;
    frReplicaPtr.p->createGci[0] = startGci;
    frReplicaPtr.p->replicaLastGci[0] = (Uint32)-1;
    for (Uint32 i = 0; i < MAX_LCP_STORED; i++)
    {
      frReplicaPtr.p->lcpStatus[i] = ZINVALID;
    }
  }
  else
  {
    jam();
    /* Node takes over itself: open a new create interval after the last
     * crash; replicaLastGci stays at "infinity" until the next failure. */
    const Uint32 noCrashed = frReplicaPtr.p->noCrashedReplicas;
    arrGuard(noCrashed, 8);
    frReplicaPtr.p->createGci[noCrashed] = startGci;
    frReplicaPtr.p->replicaLastGci[noCrashed] = (Uint32)-1;
  }

  if (!isMaster())
  {
    jam();
    NodeRecordPtr nodePtr;
    nodePtr.i = tdestNodeid;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    if (nodePtr.p->nodeRecoveryStatus != NodeRecord::NODE_GETTING_SYNCHED)
    {
      jam();
      /**
       * We come here many times, we will call the state transition
       * code only the first time.
       */
      setNodeRecoveryStatus(tdestNodeid, NodeRecord::NODE_GETTING_SYNCHED);
    }
  }
  UpdateFragStateConf * const conf =
    (UpdateFragStateConf *)&signal->theData[0];
  conf->senderData = senderData;
  conf->tableId = tabPtr.i;
  conf->fragId = fragId;
  conf->sendingNodeId = cownNodeId;
  conf->startingNodeId = tdestNodeid;
  conf->failedNodeId = tFailedNodeId;
  sendSignal(senderRef, GSN_UPDATE_FRAG_STATECONF, signal,
             UpdateFragStateConf::SignalLength, JBB);
}//Dbdih::execUPDATE_FRAG_STATEREQ()
4987 
4988 /**
4989  * Node Recovery Status Module
4990  * ---------------------------
4991  * This module is used to keep track of the restart progress in the master node
4992  * and also to report it to the user through a NDBINFO table. The module is
4993  * also used to estimate when a restart reaches certain critical checkpoints
4994  * in the restart execution. This is used to ensure that we hold up start of
4995  * those critical parts (e.g. LCPs) if there is a good chance that we will
 * reach there in reasonable time. Same principle as holding a train waiting
4997  * for a batch of important customers. One can wait for a while, but not
4998  * for too long time since this will affect many others as well.
4999  *
5000  * The only actions that are reported here happen in the master node. The only
5001  * exception to this is the node failure and node failure completed events
5002  * that happens in all nodes. Since the master node is the node that was
5003  * started first of all nodes, this means that the master node will contain
5004  * information about the node restarts of all nodes except those that
5005  * was started at the same time as the master node.
5006  */
5007 
5008 /* Debug Node Recovery Status module */
5009 #define DBG_NRS(a)
5010 //#define DBG_NRS(a) ndbout << a << endl
5011 
initNodeRecoveryStatus()5012 void Dbdih::initNodeRecoveryStatus()
5013 {
5014   NodeRecordPtr nodePtr;
5015 
5016   jam();
5017   for (nodePtr.i = 0; nodePtr.i < MAX_NDB_NODES; nodePtr.i++)
5018   {
5019     ptrAss(nodePtr, nodeRecord);
5020     nodePtr.p->nodeRecoveryStatus = NodeRecord::NOT_DEFINED_IN_CLUSTER;
5021     nodePtr.p->is_pausable = false;
5022     initNodeRecoveryTimers(nodePtr);
5023   }
5024 }
5025 
initNodeRecoveryTimers(NodeRecordPtr nodePtr)5026 void Dbdih::initNodeRecoveryTimers(NodeRecordPtr nodePtr)
5027 {
5028   jam();
5029   NdbTick_Invalidate(&nodePtr.p->nodeFailTime);
5030   NdbTick_Invalidate(&nodePtr.p->nodeFailCompletedTime);
5031   NdbTick_Invalidate(&nodePtr.p->allocatedNodeIdTime);
5032   NdbTick_Invalidate(&nodePtr.p->includedInHBProtocolTime);
5033   NdbTick_Invalidate(&nodePtr.p->ndbcntrStartWaitTime);
5034   NdbTick_Invalidate(&nodePtr.p->ndbcntrStartedTime);
5035   NdbTick_Invalidate(&nodePtr.p->startPermittedTime);
5036   NdbTick_Invalidate(&nodePtr.p->waitLCPToCopyDictTime);
5037   NdbTick_Invalidate(&nodePtr.p->copyDictToStartingNodeTime);
5038   NdbTick_Invalidate(&nodePtr.p->includeNodeInLCPAndGCPTime);
5039   NdbTick_Invalidate(&nodePtr.p->startDatabaseRecoveryTime);
5040   NdbTick_Invalidate(&nodePtr.p->startUndoDDTime);
5041   NdbTick_Invalidate(&nodePtr.p->startExecREDOLogTime);
5042   NdbTick_Invalidate(&nodePtr.p->startBuildIndexTime);
5043   NdbTick_Invalidate(&nodePtr.p->copyFragmentsStartedTime);
5044   NdbTick_Invalidate(&nodePtr.p->waitLCPForRestartTime);
5045   NdbTick_Invalidate(&nodePtr.p->waitSumaHandoverTime);
5046   NdbTick_Invalidate(&nodePtr.p->restartCompletedTime);
5047   NdbTick_Invalidate(&nodePtr.p->nodeGettingPermitTime);
5048   NdbTick_Invalidate(&nodePtr.p->nodeGettingIncludedTime);
5049   NdbTick_Invalidate(&nodePtr.p->nodeGettingSynchedTime);
5050   NdbTick_Invalidate(&nodePtr.p->nodeInLCPWaitStateTime);
5051   NdbTick_Invalidate(&nodePtr.p->nodeActiveTime);
5052 }
5053 
5054 /**
5055  * A node has allocated a node id, this happens even before the angel starts
5056  * a new ndbd/ndbmtd process or in a very early phase of ndbd/ndbmtd startup.
5057  */
execALLOC_NODEID_REP(Signal * signal)5058 void Dbdih::execALLOC_NODEID_REP(Signal *signal)
5059 {
5060   NodeRecordPtr nodePtr;
5061   AllocNodeIdRep *rep = (AllocNodeIdRep*)&signal->theData[0];
5062 
5063   jamEntry();
5064   if (rep->nodeId >= MAX_NDB_NODES)
5065   {
5066     jam();
5067     return;
5068   }
5069   nodePtr.i = rep->nodeId;
5070   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
5071   if (nodePtr.p->nodeStatus == NodeRecord::NOT_IN_CLUSTER)
5072   {
5073     jam();
5074     return;
5075   }
5076   setNodeRecoveryStatus(rep->nodeId, NodeRecord::ALLOCATED_NODE_ID);
5077 }
5078 
5079 /**
5080  * A node have been included in the heartbeat protocol. This happens very early
5081  * on in the restart, from here the node need to act as a real-time engine and
5082  * thus has to avoid extremely time consuming activities that block execution.
5083  */
execINCL_NODE_HB_PROTOCOL_REP(Signal * signal)5084 void Dbdih::execINCL_NODE_HB_PROTOCOL_REP(Signal *signal)
5085 {
5086   InclNodeHBProtocolRep *rep = (InclNodeHBProtocolRep*)&signal->theData[0];
5087   jamEntry();
5088 
5089   setNodeRecoveryStatus(rep->nodeId, NodeRecord::INCLUDED_IN_HB_PROTOCOL);
5090 }
5091 
5092 /**
5093  * The node is blocked to continue in its node restart handling since another
5094  * node is currently going through the stages to among other things copy the
5095  * meta data.
5096  */
execNDBCNTR_START_WAIT_REP(Signal * signal)5097 void Dbdih::execNDBCNTR_START_WAIT_REP(Signal *signal)
5098 {
5099   NdbcntrStartWaitRep *rep = (NdbcntrStartWaitRep*)&signal->theData[0];
5100   jamEntry();
5101 
5102   setNodeRecoveryStatus(rep->nodeId, NodeRecord::NDBCNTR_START_WAIT);
5103 }
5104 
5105 /**
5106  * The node wasn't blocked by another node restart anymore, we can now
5107  * continue processing the restart and soon go on to copy the meta data.
5108  */
execNDBCNTR_STARTED_REP(Signal * signal)5109 void Dbdih::execNDBCNTR_STARTED_REP(Signal *signal)
5110 {
5111   NdbcntrStartedRep *rep = (NdbcntrStartedRep*)&signal->theData[0];
5112   jamEntry();
5113 
5114   setNodeRecoveryStatus(rep->nodeId, NodeRecord::NDBCNTR_STARTED);
5115 }
5116 
5117 /**
5118  * SUMA handover for the node has completed, this is the very final step
5119  * of the node restart after which the node is fully up and running.
5120  */
execSUMA_HANDOVER_COMPLETE_REP(Signal * signal)5121 void Dbdih::execSUMA_HANDOVER_COMPLETE_REP(Signal *signal)
5122 {
5123   SumaHandoverCompleteRep *rep = (SumaHandoverCompleteRep*)&signal->theData[0];
5124   jamEntry();
5125 
5126   setNodeRecoveryStatus(rep->nodeId, NodeRecord::RESTART_COMPLETED);
5127 }
5128 
execLOCAL_RECOVERY_COMP_REP(Signal * signal)5129 void Dbdih::execLOCAL_RECOVERY_COMP_REP(Signal *signal)
5130 {
5131   jamEntry();
5132   if (reference() != cmasterdihref)
5133   {
5134     jam();
5135     if (likely(getNodeInfo(refToNode(cmasterdihref)).m_version >=
5136                NDBD_NODE_RECOVERY_STATUS_VERSION))
5137     {
5138       jam();
5139       sendSignal(cmasterdihref, GSN_LOCAL_RECOVERY_COMP_REP, signal,
5140                  LocalRecoveryCompleteRep::SignalLengthMaster, JBB);
5141     }
5142     else
5143     {
5144       jam();
5145     }
5146     return;
5147   }
5148   LocalRecoveryCompleteRep *rep =
5149     (LocalRecoveryCompleteRep*)&signal->theData[0];
5150   LocalRecoveryCompleteRep::PhaseIds phaseId =
5151     (LocalRecoveryCompleteRep::PhaseIds)rep->phaseId;
5152   Uint32 nodeId = rep->nodeId;
5153 
5154   switch (phaseId)
5155   {
5156   case LocalRecoveryCompleteRep::RESTORE_FRAG_COMPLETED:
5157     jam();
5158     setNodeRecoveryStatus(nodeId, NodeRecord::RESTORE_FRAG_COMPLETED);
5159     break;
5160   case LocalRecoveryCompleteRep::UNDO_DD_COMPLETED:
5161     jam();
5162     setNodeRecoveryStatus(nodeId, NodeRecord::UNDO_DD_COMPLETED);
5163     break;
5164   case LocalRecoveryCompleteRep::EXECUTE_REDO_LOG_COMPLETED:
5165     jam();
5166     setNodeRecoveryStatus(nodeId, NodeRecord::EXECUTE_REDO_LOG_COMPLETED);
5167     break;
5168   default:
5169     ndbrequire(false);
5170   }
5171 }
5172 
5173 /**
5174  * Called by starting nodes to provide non-master nodes with an estimate of how
5175  * long time it takes to synchronize the starting node with the alive nodes.
5176  */
sendEND_TOREP(Signal * signal,Uint32 startingNodeId)5177 void Dbdih::sendEND_TOREP(Signal *signal, Uint32 startingNodeId)
5178 {
5179   EndToRep *rep = (EndToRep*)signal->getDataPtrSend();
5180   NodeRecordPtr nodePtr;
5181   nodePtr.i = cfirstAliveNode;
5182   rep->nodeId = startingNodeId;
5183 
5184   do
5185   {
5186     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
5187     if (likely(getNodeInfo(nodePtr.i).m_version >=
5188                NDBD_NODE_RECOVERY_STATUS_VERSION))
5189     {
5190       /**
5191        * Don't send to nodes with earlier versions that don't have support
5192        * for this code.
5193        */
5194       jamLine(nodePtr.i);
5195       BlockReference ref = calcDihBlockRef(nodePtr.i);
5196       if (ref != cmasterdihref)
5197       {
5198         jam();
5199         sendSignal(ref, GSN_END_TOREP, signal,
5200 	           EndToRep::SignalLength, JBB);
5201       }
5202     }
5203     nodePtr.i = nodePtr.p->nextNode;
5204   } while (nodePtr.i != RNIL);
5205 }
5206 
5207 /**
5208  * Received in non-master nodes, to ensure we get estimate on synch time
5209  * between starting node and alive nodes.
5210  */
execEND_TOREP(Signal * signal)5211 void Dbdih::execEND_TOREP(Signal *signal)
5212 {
5213   EndToRep *rep = (EndToRep*)&signal->theData[0];
5214   jamEntry();
5215   if (isMaster())
5216   {
5217     jam();
5218     return;
5219   }
5220   setNodeRecoveryStatus(rep->nodeId, NodeRecord::NODE_IN_LCP_WAIT_STATE);
5221 }
5222 
5223 /**
5224  * Called when setting state to ALLOCATED_NODE_ID or
5225  * INCLUDE_IN_HB_PROTOCOL since a node can be dead for a long time
5226  * while we've been master and potentially could even have allocated
5227  * its node id before we became master.
5228  */
check_node_not_restarted_yet(NodeRecordPtr nodePtr)5229 void Dbdih::check_node_not_restarted_yet(NodeRecordPtr nodePtr)
5230 {
5231   if (nodePtr.p->nodeRecoveryStatus ==
5232       NodeRecord::NODE_NOT_RESTARTED_YET)
5233   {
5234     jam();
5235     /**
5236      * A node which has been dead since we started is restarted.
5237      * We set node failure time and node failure completed time
5238      * to now in this case to initialise those unknown values, we
5239      * rather report zero time than an uninitialised time.
5240      */
5241     nodePtr.p->nodeFailTime = c_current_time;
5242     nodePtr.p->nodeFailCompletedTime = c_current_time;
5243   }
5244 }
5245 
/**
 * Record a node recovery status transition for nodeId.
 *
 * Validates that the transition from the node's current recovery status
 * to new_status is legal (the legal predecessors differ between master
 * and non-master nodes), stamps the timer that corresponds to new_status
 * with the current time, logs the transition, and finally installs
 * new_status on the node record.
 *
 * @param nodeId      node whose recovery status changes
 * @param new_status  the recovery status being entered
 */
void Dbdih::setNodeRecoveryStatus(Uint32 nodeId,
                                  NodeRecord::NodeRecoveryStatus new_status)
{
  NodeRecordPtr nodePtr;
  NDB_TICKS current_time;

  // Refresh the cached wall-clock time; it is also read by
  // check_node_not_restarted_yet() during this call.
  c_current_time = NdbTick_getCurrentTicks();
  current_time = c_current_time;

  nodePtr.i = nodeId;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
  jam();
  jamLine(nodePtr.p->nodeRecoveryStatus);

  /**
   * We maintain the state NODE_GETTING_PERMIT in the
   * variable is_pausable independent of when it is
   * received since it is needed to be able to handle
   * PAUSE protocol properly. The node recovery status
   * isn't sufficiently developed to handle this using
   * the state variable alone yet since we cannot handle
   * all restart types yet.
   */
  if (new_status == NodeRecord::NODE_GETTING_PERMIT)
  {
    jam();
    nodePtr.p->is_pausable = true;
  }
  else
  {
    jam();
    nodePtr.p->is_pausable = false;
  }

  if (getNodeState().startLevel != NodeState::SL_STARTED)
  {
    jam();
    /**
     * We will ignore all state transitions until we are started ourselves
     * before we even attempt to record state transitions. This means we
     * currently have no view into system restarts and initial starts. We
     * only worry about node restarts for now.
     */
    return;
  }
  if (new_status != NodeRecord::NODE_FAILED &&
      new_status != NodeRecord::NODE_FAILURE_COMPLETED)
  {
    jam();
    /**
     * Given that QMGR, NDBCNTR, DBDICT and DBDIH executes in the same thread
     * the possibility of jumping over a state doesn't exist. If we split out
     * any of those into separate threads in the future it is important to
     * check that the ndbrequire's in this function still holds.
     */
    if (!isMaster())
    {
      if (getNodeInfo(nodePtr.i).m_version <
          NDBD_NODE_RECOVERY_STATUS_VERSION)
      {
        jam();
        /**
         * We ignore state changes for non-master nodes that are from
         * too old versions to support all state transitions.
         */
        return;
      }
      if (nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_NOT_RESTARTED_YET &&
          new_status != NodeRecord::NODE_GETTING_PERMIT)
      {
        jam();
        /**
         * We're getting into the game too late, we will ignore state changes
         * for this node restart since it won't provide any useful info
         * anyways.
         */
        return;
      }
    }
    else if (nodePtr.p->nodeRecoveryStatus ==
             NodeRecord::NODE_NOT_RESTARTED_YET)
    {
      jam();
      switch (new_status)
      {
        case NodeRecord::ALLOCATED_NODE_ID:
          jam();
          /* Fall through */
        case NodeRecord::INCLUDED_IN_HB_PROTOCOL:
          jam();
          /**
           * These are the normal states to hear about as first states after
           * we completed our own start. We can either first hear a node
           * failure and then we are sure we will follow the right path
           * since we heard about the node failure after being started.
           * If we weren't there for the node failure we are also ok with
           * starting all the way from allocated node id and included in
           * heartbeat protocol.
           */
          break;
        default:
          jam();
          jamLine(new_status);
          /**
           * This was due to a partial system restart, we haven't gotten
           * around to supporting this yet. This requires more work
           * before we can support it, this would mean that we come into
           * the action midway, so this will be solved when we handle
           * system restarts properly, but this is more work needed and
           * not done yet. So for now we ignore those states and will
           * handle the next time the node starts up instead.
           * TODO
           */
          return;
      }
    }
  }
  /**
   * Verify the transition is legal given the current status and record
   * the timestamp belonging to the new status.
   */
  switch (new_status)
  {
    case NodeRecord::NODE_FAILED:
    /* State generated in DBDIH */
      jam();
      /**
       * A node failure can happen at any time and from any state as long as
       * it is defined in the cluster.
       *
       * This state change will be reported in all nodes at all times.
       *
       * We will clear all timers when a node fails since we want to ensure
       * that we only have valid timers backwards in time to avoid reading
       * old timers.
       */
      ndbrequire((nodePtr.p->nodeRecoveryStatus !=
                  NodeRecord::NOT_DEFINED_IN_CLUSTER));
      initNodeRecoveryTimers(nodePtr);
      nodePtr.p->nodeFailTime = current_time;
      break;
    case NodeRecord::NODE_FAILURE_COMPLETED:
    /* State generated in DBDIH */
      jam();
      /* This state change will be reported in all nodes at all times */
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::NODE_FAILED);
      nodePtr.p->nodeFailCompletedTime = current_time;
      break;
    case NodeRecord::ALLOCATED_NODE_ID:
    /* State generated in QMGR */
      jam();
      ndbrequire(isMaster());
      ndbrequire((nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_FAILURE_COMPLETED) ||
                 (nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::ALLOCATED_NODE_ID) ||
                 (nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_NOT_RESTARTED_YET));
      check_node_not_restarted_yet(nodePtr);
      if (nodePtr.p->nodeRecoveryStatus == NodeRecord::ALLOCATED_NODE_ID)
      {
        jam();
        /**
         * If a node first allocates a node id and then comes back again to
         * allocate it again, then start counting time from node failed
         * as from now since a long time might have passed since we actually
         * failed.
         */
        nodePtr.p->nodeFailTime = current_time;
        nodePtr.p->nodeFailCompletedTime = current_time;
      }
      nodePtr.p->allocatedNodeIdTime = current_time;
      break;
    case NodeRecord::INCLUDED_IN_HB_PROTOCOL:
    /* State generated in QMGR */
      jam();
      /**
       * We can come here from ALLOCATED_NODE_ID obviously,
       * but it seems that we should also be able to get
       * here from a state where the node has been able to
       * allocate a node id with an old master, now it is
       * using this old allocated node id to be included in
       * the heartbeat protocol. So the node could be in
       * node not restarted yet or node failure completed.
       */
      ndbrequire(isMaster());
      ndbrequire((nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::ALLOCATED_NODE_ID) ||
                 (nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_NOT_RESTARTED_YET) ||
                 (nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_FAILURE_COMPLETED));
      check_node_not_restarted_yet(nodePtr);
      if (nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_FAILURE_COMPLETED)
      {
        jam();
        // Node id allocation was not observed; use now as its timestamp.
        nodePtr.p->allocatedNodeIdTime = current_time;
      }
      nodePtr.p->includedInHBProtocolTime = current_time;
      break;
    case NodeRecord::NDBCNTR_START_WAIT:
    /* State generated in NDBCNTR */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::INCLUDED_IN_HB_PROTOCOL);
      nodePtr.p->ndbcntrStartWaitTime = current_time;
      break;
    case NodeRecord::NDBCNTR_STARTED:
    /* State generated in NDBCNTR */
      jam();
      ndbrequire(isMaster());
      ndbrequire((nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NDBCNTR_START_WAIT) ||
                 (nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::INCLUDED_IN_HB_PROTOCOL));

      if (nodePtr.p->nodeRecoveryStatus ==
          NodeRecord::INCLUDED_IN_HB_PROTOCOL)
      {
        jam();
        // The wait state was skipped; give it the same timestamp.
        nodePtr.p->ndbcntrStartWaitTime = current_time;
      }
      nodePtr.p->ndbcntrStartedTime = current_time;
      break;
    case NodeRecord::START_PERMITTED:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::NDBCNTR_STARTED);
      nodePtr.p->startPermittedTime = current_time;
      break;
    case NodeRecord::WAIT_LCP_TO_COPY_DICT:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::START_PERMITTED);
      nodePtr.p->waitLCPToCopyDictTime = current_time;
      break;
    case NodeRecord::COPY_DICT_TO_STARTING_NODE:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::WAIT_LCP_TO_COPY_DICT);
      nodePtr.p->copyDictToStartingNodeTime = current_time;
      break;
    case NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::COPY_DICT_TO_STARTING_NODE);
      nodePtr.p->includeNodeInLCPAndGCPTime = current_time;
      break;
    case NodeRecord::LOCAL_RECOVERY_STARTED:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP);
      nodePtr.p->startDatabaseRecoveryTime = current_time;
      break;
    case NodeRecord::RESTORE_FRAG_COMPLETED:
    /* State generated in DBLQH in starting node */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::LOCAL_RECOVERY_STARTED);
      nodePtr.p->startUndoDDTime = current_time;
      break;
    case NodeRecord::UNDO_DD_COMPLETED:
    /* State generated in DBLQH in starting node */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::RESTORE_FRAG_COMPLETED);
      nodePtr.p->startExecREDOLogTime = current_time;
      break;
    case NodeRecord::EXECUTE_REDO_LOG_COMPLETED:
    /* State generated in DBLQH in starting node */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::UNDO_DD_COMPLETED);
      nodePtr.p->startBuildIndexTime = current_time;
      break;
    case NodeRecord::COPY_FRAGMENTS_STARTED:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      /**
       * If the starting node doesn't support reporting its
       * local recovery status, then we come here from
       * LOCAL_RECOVERY_STARTED, in the normal case with a
       * new version of the starting node we come here rather from
       * EXECUTE_REDO_LOG_COMPLETED.
       */
      ndbrequire((nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::EXECUTE_REDO_LOG_COMPLETED) ||
                 ((nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::LOCAL_RECOVERY_STARTED) &&
                  (getNodeInfo(nodePtr.i).m_version <
                  NDBD_NODE_RECOVERY_STATUS_VERSION)));
      if (nodePtr.p->nodeRecoveryStatus ==
          NodeRecord::LOCAL_RECOVERY_STARTED)
      {
        /**
         * We handle this state transition even for old versions since
         * it still gives all the information we need to make the right
         * decision about the LCP start.
         */
        NDB_TICKS start_time = nodePtr.p->startDatabaseRecoveryTime;
        jam();
        /* Set all local times to 0 if node doesn't support sending those */
        nodePtr.p->startUndoDDTime = start_time;
        nodePtr.p->startExecREDOLogTime = start_time;
        nodePtr.p->startBuildIndexTime = start_time;
      }
      nodePtr.p->copyFragmentsStartedTime = current_time;
      break;
    case NodeRecord::WAIT_LCP_FOR_RESTART:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::COPY_FRAGMENTS_STARTED);
      nodePtr.p->waitLCPForRestartTime = current_time;
      break;
    case NodeRecord::WAIT_SUMA_HANDOVER:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::WAIT_LCP_FOR_RESTART);
      nodePtr.p->waitSumaHandoverTime = current_time;
      break;
    case NodeRecord::RESTART_COMPLETED:
    /* State generated in DBDICT */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::WAIT_SUMA_HANDOVER);
      nodePtr.p->restartCompletedTime = current_time;
      break;

    /* Non-master states */
    case NodeRecord::NODE_GETTING_PERMIT:
    {
      jam();
      ndbrequire(!isMaster());
      /**
       * NODE_GETTING_PERMIT is the first state a non-master node sees.
       * So we can come here from seeing node failure state or node
       * failure completed state.
       *
       * For a non-master node we can always come to any state from the
       * state NODE_NOT_RESTARTED_YET since we don't record any states
       * until we have completed our own restart and at that time there
       * can be other nodes restarting in any state.
       *
       * In addition we won't even record states for a starting node if
       * we only seen the final phases of the restart. So the state
       * NODE_NOT_RESTARTED_YET can be there through a major part of
       * a node restart.
       */
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::NODE_FAILURE_COMPLETED ||
                 nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::NODE_NOT_RESTARTED_YET);
      if (nodePtr.p->nodeRecoveryStatus ==
          NodeRecord::NODE_NOT_RESTARTED_YET)
      {
        jam();
        // Unknown failure times; initialise both to now (zero elapsed).
        nodePtr.p->nodeFailTime = current_time;
        nodePtr.p->nodeFailCompletedTime = current_time;
      }
      nodePtr.p->nodeGettingPermitTime = current_time;
      break;
    }
    case NodeRecord::NODE_GETTING_INCLUDED:
    {
      jam();
      ndbrequire(!isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_GETTING_PERMIT);
      nodePtr.p->nodeGettingIncludedTime = current_time;
      break;
    }
    case NodeRecord::NODE_GETTING_SYNCHED:
    {
      jam();
      ndbrequire(!isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_GETTING_INCLUDED);
      nodePtr.p->nodeGettingSynchedTime = current_time;
      break;
    }
    case NodeRecord::NODE_IN_LCP_WAIT_STATE:
    {
      jam();
      ndbrequire(!isMaster());
      /**
       * A weird case for coming to here with NODE_GETTING_INCLUDED is if
       * there are no tables that require being synched. This is an
       * unusual case, but still possible.
       */
      ndbrequire((nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_GETTING_INCLUDED) ||
                 (nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_GETTING_SYNCHED));
      if (nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_GETTING_INCLUDED)
      {
        jam();
        /* No fragment updates, set time to 0 for synch */
        nodePtr.p->nodeGettingSynchedTime = nodePtr.p->nodeGettingIncludedTime;
      }
      nodePtr.p->nodeInLCPWaitStateTime = current_time;
      break;
    }
    case NodeRecord::NODE_ACTIVE:
      jam();
      ndbrequire(!isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::NODE_IN_LCP_WAIT_STATE);
      nodePtr.p->nodeActiveTime = current_time;
      break;
    default:
      ndbrequire(false);
  }

  // Log the transition both as a cluster event and to the node log.
  infoEvent("NR Status: node=%u,OLD=%s,NEW=%s",
            nodeId,
            get_status_str(nodePtr.p->nodeRecoveryStatus),
            get_status_str(new_status));

  g_eventLogger->info("NR Status: node=%u,OLD=%s,NEW=%s",
                      nodeId,
                      get_status_str(nodePtr.p->nodeRecoveryStatus),
                      get_status_str(new_status));

  nodePtr.p->nodeRecoveryStatus = new_status;
  ndbassert(check_node_recovery_timers(nodePtr.i));
}
5688 
setNodeRecoveryStatusInitial(NodeRecordPtr nodePtr)5689 void Dbdih::setNodeRecoveryStatusInitial(NodeRecordPtr nodePtr)
5690 {
5691   DBG_NRS("setNodeRecoveryStatusInitial: node= " << nodePtr.i << "state= " <<
5692           (Uint32)NodeRecord::NODE_NOT_RESTARTED_YET);
5693   nodePtr.p->nodeRecoveryStatus = NodeRecord::NODE_NOT_RESTARTED_YET;
5694 }
5695 
5696 /**
5697  * Define heuristic constants
5698  * --------------------------
5699  *
5700  * The base for the maximum wait is the time the last LCP execution took.
5701  * We will never wait for more than 35% of this time. We will check this
5702  * even before attempting to wait any further. We will also cap the wait
5703  * to never exceed an hour.
5704  *
5705  * Next we will adjust the maximum wait time down to 85% of this value
5706  * when we are calculating the estimate based on node states. This means
5707  * that if we estimate that we will wait for more than around 30% of an
5708  * LCP execution time, then we will start the LCP.
5709  *
5710  * If the node we are waiting for is in the early start phases then we
5711  * even less inclined to wait and will decrease the time by another
5712  * 50% dropping it to around 15% of an LCP execution time.
5713  *
5714  * If we have no node with a proper estimate, then we will drop the
5715  * wait time even more to 25% of the previous value, so 7-8% for
5716  * nodes in later start phases and only 3-4% in early start phases.
5717  */
/* Absolute cap on LCP stall time: one hour, in milliseconds. */
#define STALL_MAX_ONE_HOUR (60 * 60 * 1000)
/* Never stall for more than this share of the last LCP execution time. */
#define MAX_PERCENTAGE_OF_LCP_TIME_WE_STALL 35
/* Scale the cap down when estimating the wait from node states. */
#define MAX_PERCENTAGE_ADJUSTMENT_FOR_ESTIMATE 85
/* Halve the cap again for nodes still in early start phases. */
#define MAX_PERCENTAGE_ADJUSTMENT_FOR_EARLY_START_PHASES 50
/* Quarter the cap when no node provides a proper estimate. */
#define MAX_PERCENTAGE_ADJUSTMENT_FOR_NO_ESTIMATE 25
5723 
check_for_too_long_wait(Uint64 & lcp_max_wait_time,Uint64 & lcp_stall_time,NDB_TICKS now)5724 bool Dbdih::check_for_too_long_wait(Uint64 &lcp_max_wait_time,
5725                                     Uint64 &lcp_stall_time,
5726                                     NDB_TICKS now)
5727 {
5728   /**
5729    * We first get the time of the latest LCP execution. We want to stall
5730    * execution of LCPs, but never for so long that we get into other
5731    * problems such as out of REDO log.
5732    */
5733   Uint64 lcp_proc_time;
5734   Uint64 lcp_time = c_lcpState.m_lcp_time;
5735   Uint32 lcp_start = c_lcpState.lcpStallStart;
5736   if (lcp_start == 0)
5737   {
5738     jam();
5739     lcp_stall_time = 0;
5740   }
5741   else
5742   {
5743     jam();
5744     lcp_stall_time = NdbTick_Elapsed(c_lcpState.m_start_lcp_check_time,
5745                                      now).milliSec();
5746   }
5747 
5748   /**
5749    * We never wait for more than 1 hour and at most 35% of the time it
5750    * takes to execute an LCP. We calculate the maximum stall time here
5751    * based on those two inputs.
5752    */
5753   lcp_proc_time = MAX_PERCENTAGE_OF_LCP_TIME_WE_STALL * lcp_time;
5754   lcp_proc_time /= 100;
5755   lcp_max_wait_time = STALL_MAX_ONE_HOUR;
5756   if (lcp_max_wait_time > lcp_proc_time)
5757   {
5758     jam();
5759     lcp_max_wait_time = lcp_proc_time;
5760   }
5761 
5762   DBG_NRS("lcp_stall_time is = " << lcp_stall_time
5763            << " lcp_max_wait_time is = " << lcp_max_wait_time);
5764   /**
5765    * If we have already stalled for longer time than the maximum wait we
5766    * will allow, then we need not check the states of node restarts, we
5767    * will start the LCP anyways.
5768    */
5769   if (lcp_stall_time > lcp_max_wait_time)
5770   {
5771     jam();
5772     return true;
5773   }
5774 
5775   /**
5776    * In the calculated delay we will allow for a slightly shorter calculated
5777    * delay than the maximum actual delay we will wait. This is to avoid that
5778    * we wait for a long time only to stop waiting right before the wait is
5779    * over.
5780    */
5781   lcp_max_wait_time *= MAX_PERCENTAGE_ADJUSTMENT_FOR_ESTIMATE;
5782   lcp_max_wait_time /= 100; /* Decrease max time by 15% */
5783   lcp_max_wait_time -= lcp_stall_time; /* Decrease by time we already waited */
5784   return false;
5785 }
5786 
calculate_time_remaining(Uint32 nodeId,NDB_TICKS state_start_time,NDB_TICKS now,NodeRecord::NodeRecoveryStatus state,Uint32 * node_waited_for,Uint64 * time_since_state_start,NodeRecord::NodeRecoveryStatus * max_status)5787 void Dbdih::calculate_time_remaining(
5788                                 Uint32 nodeId,
5789                                 NDB_TICKS state_start_time,
5790                                 NDB_TICKS now,
5791                                 NodeRecord::NodeRecoveryStatus state,
5792                                 Uint32 *node_waited_for,
5793                                 Uint64 *time_since_state_start,
5794                                 NodeRecord::NodeRecoveryStatus *max_status)
5795 {
5796   ndbassert(NdbTick_IsValid(now));
5797   ndbassert(NdbTick_IsValid(state_start_time));
5798 
5799   if (state > (*max_status))
5800   {
5801     jam();
5802     (*time_since_state_start) =
5803       NdbTick_Elapsed(state_start_time, now).milliSec();
5804     (*max_status) = state;
5805     (*node_waited_for) = nodeId;
5806   }
5807   else if (state == (*max_status))
5808   {
5809     jam();
5810     Uint64 loc_time_since_state_start;
5811     loc_time_since_state_start =
5812       NdbTick_Elapsed(state_start_time, now).milliSec();
5813     if (loc_time_since_state_start > (*time_since_state_start))
5814     {
5815       jam();
5816       (*time_since_state_start) = loc_time_since_state_start;
5817       (*node_waited_for) = nodeId;
5818     }
5819   }
5820 }
5821 
calculate_most_recent_node(Uint32 nodeId,NDB_TICKS state_start_time,NodeRecord::NodeRecoveryStatus state,Uint32 * most_recent_node,NDB_TICKS * most_recent_start_time,NodeRecord::NodeRecoveryStatus * most_recent_state)5822 void Dbdih::calculate_most_recent_node(
5823                         Uint32 nodeId,
5824                         NDB_TICKS state_start_time,
5825                         NodeRecord::NodeRecoveryStatus state,
5826                         Uint32 *most_recent_node,
5827                         NDB_TICKS *most_recent_start_time,
5828                         NodeRecord::NodeRecoveryStatus *most_recent_state)
5829 {
5830   ndbassert(NdbTick_IsValid(state_start_time));
5831   if ((*most_recent_node) == 0)
5832   {
5833     /* No state set, set this as state */
5834     jam();
5835   }
5836   else if ((*most_recent_state) == state)
5837   {
5838     jam();
5839     /* Same state as before, use most recent */
5840     if (NdbTick_Compare((*most_recent_start_time),
5841                         state_start_time) > 0)
5842     {
5843       jam();
5844       return;
5845     }
5846     jam();
5847   }
5848   else if ((*most_recent_state) == NodeRecord::NODE_ACTIVE)
5849   {
5850     /* Old state from non-master, new from master, use this one */
5851     jam();
5852   }
5853   else if ((*most_recent_state) > state)
5854   {
5855     /**
5856      * Two master states, use the latest (this one)
5857      * Latest is the one with the lowest state since
5858      * the older one has progressed longer.
5859      */
5860     jam();
5861   }
5862   else
5863   {
5864     /* Ignore this state, we already have a better one */
5865     jam();
5866     return;
5867   }
5868   (*most_recent_state) = state;
5869   (*most_recent_start_time) = state_start_time;
5870   (*most_recent_node) = nodeId;
5871   return;
5872 }
5873 
5874 #if 0
5875 /* Useful debug function when trying to find overwrite of node record */
5876 void Dbdih::check_all_node_recovery_timers(void)
5877 {
5878   Uint32 nodeId;
5879   for (nodeId = 1; nodeId < MAX_NDB_NODES; nodeId++)
5880   {
5881     ndbassert(check_node_recovery_timers(nodeId));
5882   }
5883 }
5884 #endif
5885 
/**
 * Sanity-check that every timestamp implied by the node's current recovery
 * status has been set to a valid tick value.
 *
 * The switch relies on DELIBERATE case fall-through: a node that has
 * reached a given status must have passed through all earlier statuses in
 * that chain, so all of their timestamps must be valid as well. Each chain
 * ends with an explicit break.
 *
 * Always returns true so it can be used directly inside ndbassert().
 *
 * @param nodeId  Node whose recovery timers are verified.
 * @return true (assertion failures fire inside, in debug builds).
 */
bool Dbdih::check_node_recovery_timers(Uint32 nodeId)
{
  NodeRecordPtr nodePtr;
  nodePtr.i = nodeId;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);

  switch (nodePtr.p->nodeRecoveryStatus)
  {
  /* Master-side restart status chain (latest status first). */
  case NodeRecord::RESTART_COMPLETED:
    ndbassert(NdbTick_IsValid(nodePtr.p->restartCompletedTime));
    /* Fall through */
  case NodeRecord::WAIT_SUMA_HANDOVER:
    ndbassert(NdbTick_IsValid(nodePtr.p->waitSumaHandoverTime));
    /* Fall through */
  case NodeRecord::WAIT_LCP_FOR_RESTART:
    ndbassert(NdbTick_IsValid(nodePtr.p->waitLCPForRestartTime));
    /* Fall through */
  case NodeRecord::COPY_FRAGMENTS_STARTED:
    ndbassert(NdbTick_IsValid(nodePtr.p->copyFragmentsStartedTime));
    /* Fall through */
  case NodeRecord::EXECUTE_REDO_LOG_COMPLETED:
    ndbassert(NdbTick_IsValid(nodePtr.p->startBuildIndexTime));
    /* Fall through */
  case NodeRecord::UNDO_DD_COMPLETED:
    ndbassert(NdbTick_IsValid(nodePtr.p->startExecREDOLogTime));
    /* Fall through */
  case NodeRecord::RESTORE_FRAG_COMPLETED:
    ndbassert(NdbTick_IsValid(nodePtr.p->startUndoDDTime));
    /* Fall through */
  case NodeRecord::LOCAL_RECOVERY_STARTED:
    ndbassert(NdbTick_IsValid(nodePtr.p->startDatabaseRecoveryTime));
    /* Fall through */
  case NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP:
    ndbassert(NdbTick_IsValid(nodePtr.p->includeNodeInLCPAndGCPTime));
    /* Fall through */
  case NodeRecord::COPY_DICT_TO_STARTING_NODE:
    ndbassert(NdbTick_IsValid(nodePtr.p->copyDictToStartingNodeTime));
    /* Fall through */
  case NodeRecord::WAIT_LCP_TO_COPY_DICT:
    ndbassert(NdbTick_IsValid(nodePtr.p->waitLCPToCopyDictTime));
    /* Fall through */
  case NodeRecord::START_PERMITTED:
    ndbassert(NdbTick_IsValid(nodePtr.p->startPermittedTime));
    /* Fall through */
  case NodeRecord::NDBCNTR_STARTED:
    ndbassert(NdbTick_IsValid(nodePtr.p->ndbcntrStartedTime));
    /* Fall through */
  case NodeRecord::NDBCNTR_START_WAIT:
    ndbassert(NdbTick_IsValid(nodePtr.p->ndbcntrStartWaitTime));
    /* Fall through */
  case NodeRecord::INCLUDED_IN_HB_PROTOCOL:
    ndbassert(NdbTick_IsValid(nodePtr.p->includedInHBProtocolTime));
    /* Fall through */
  case NodeRecord::ALLOCATED_NODE_ID:
    ndbassert(NdbTick_IsValid(nodePtr.p->allocatedNodeIdTime));
    ndbassert(NdbTick_IsValid(nodePtr.p->nodeFailCompletedTime));
    ndbassert(NdbTick_IsValid(nodePtr.p->nodeFailTime));
    break;
  /* Non-master restart status chain. */
  case NodeRecord::NODE_ACTIVE:
    ndbassert(NdbTick_IsValid(nodePtr.p->nodeActiveTime));
    /* Fall through */
  case NodeRecord::NODE_IN_LCP_WAIT_STATE:
    ndbassert(NdbTick_IsValid(nodePtr.p->nodeInLCPWaitStateTime));
    /* Fall through */
  case NodeRecord::NODE_GETTING_SYNCHED:
    ndbassert(NdbTick_IsValid(nodePtr.p->nodeGettingSynchedTime));
    /* Fall through */
  case NodeRecord::NODE_GETTING_INCLUDED:
    ndbassert(NdbTick_IsValid(nodePtr.p->nodeGettingIncludedTime));
    /* Fall through */
  case NodeRecord::NODE_GETTING_PERMIT:
    ndbassert(NdbTick_IsValid(nodePtr.p->nodeGettingPermitTime));
    ndbassert(NdbTick_IsValid(nodePtr.p->nodeFailCompletedTime));
    ndbassert(NdbTick_IsValid(nodePtr.p->nodeFailTime));
    break;
  /* Node-failure chain, common to master and non-master. */
  case NodeRecord::NODE_FAILURE_COMPLETED:
    ndbassert(NdbTick_IsValid(nodePtr.p->nodeFailCompletedTime));
    /* Fall through */
  case NodeRecord::NODE_FAILED:
    ndbassert(NdbTick_IsValid(nodePtr.p->nodeFailTime));
    break;
  default:
    /* Statuses with no timer requirements (e.g. not restarted yet). */
    jam();
    break;
  }
  return true;
}
5953 
5954 /**
5955  * We want to stall the LCP start if any node is encountering the place where
5956  * we need to participate in an LCP to complete our restart. If any node is
5957  * close to reaching this state we want to block the LCP until it has reached
5958  * this state.
5959  */
check_stall_lcp_start(void)5960 bool Dbdih::check_stall_lcp_start(void)
5961 {
5962   const NDB_TICKS now = c_current_time = NdbTick_getCurrentTicks();
5963   /**
5964    * The following variables are calculated to measure the node closest to
5965    * reaching the WAIT_LCP_FOR_RESTART state.
5966    */
5967   NodeRecord::NodeRecoveryStatus max_status = NodeRecord::NOT_DEFINED_IN_CLUSTER;
5968   Uint64 time_since_state_start = 0;
5969   Uint32 node_waited_for = 0;
5970   NDB_TICKS state_start_time;
5971 
5972   /**
5973    * This is the node we will use to estimate the time remaining. If no such
5974    * node exists, then we have no measurements to use and we will have to
5975    * fall back to heuristics. We also store the state and time of this variable
5976    * to get the most recent estimate.
5977    */
5978   NodeRecord::NodeRecoveryStatus most_recent_node_status =
5979     NodeRecord::ALLOCATED_NODE_ID;
5980   Uint32 most_recent_node = 0;
5981   NDB_TICKS most_recent_node_start_time;
5982 
5983   /**
5984    * If the estimated time until we reach the WAIT_LCP_FOR_RESTART state is
5985    * higher than the below value, then we won't wait at all, we will start
5986    * the LCP immediately in this case.
5987    */
5988   Uint64 lcp_max_wait_time = 0;
5989   Uint64 lcp_stall_time = 0;
5990 
5991   /**
5992    * If we don't find any most recent node, then should we fall back to
5993    * heuristics?. We fall back to heuristics when we have nodes in early
5994    * stages of node restart that could potentially move through those
5995    * stages rapidly.
5996    */
5997   NodeRecordPtr nodePtr;
5998 
5999   Uint64 time_remaining;
6000   Uint64 estimated_time;
6001 
6002   NdbTick_Invalidate(&most_recent_node_start_time);
6003   NdbTick_Invalidate(&state_start_time);
6004 
6005   if (check_for_too_long_wait(lcp_max_wait_time,
6006                               lcp_stall_time,
6007                               now))
6008   {
6009     jam();
6010     goto immediate_start_label;
6011   }
6012 
6013   /**
6014    * It is ok to wait before starting the new LCP, we will go through the
6015    * data nodes and see if we have reasons to wait.
6016    */
6017   for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++)
6018   {
6019     ptrAss(nodePtr, nodeRecord);
6020     switch (nodePtr.p->nodeRecoveryStatus)
6021     {
6022       case NodeRecord::NOT_DEFINED_IN_CLUSTER:
6023       case NodeRecord::NODE_NOT_RESTARTED_YET:
6024       {
6025         jam();
6026         /**
6027          * We have no useful information about estimated time remaining
6028          * and we're not restarting this node currently. Simply continue.
6029          */
6030         break;
6031       }
6032       /**
6033        * The states NODE_ACTIVE, RESTART_COMPLETED, WAIT_LCP_FOR_RESTART and
6034        * WAIT_SUMA_HANDOVER can all be used to estimate the time remaining
6035        * for the node restarts still running. We use the most recent estimate,
6036        * the WAIT_LCP_FOR_RESTART being most recent, then WAIT_SUMA_HANDOVER,
6037        * then RESTART_COMPLETED and finally NODE_ACTIVE.
6038        */
6039       case NodeRecord::NODE_ACTIVE:
6040       {
6041         jam();
6042         state_start_time = nodePtr.p->nodeActiveTime;
6043         calculate_most_recent_node(nodePtr.i,
6044                                    state_start_time,
6045                                    nodePtr.p->nodeRecoveryStatus,
6046                                    &most_recent_node,
6047                                    &most_recent_node_start_time,
6048                                    &most_recent_node_status);
6049         break;
6050       }
6051       case NodeRecord::RESTART_COMPLETED:
6052       {
6053         jam();
6054         state_start_time = nodePtr.p->restartCompletedTime;
6055         calculate_most_recent_node(nodePtr.i,
6056                                    state_start_time,
6057                                    nodePtr.p->nodeRecoveryStatus,
6058                                    &most_recent_node,
6059                                    &most_recent_node_start_time,
6060                                    &most_recent_node_status);
6061         break;
6062       }
6063       case NodeRecord::WAIT_SUMA_HANDOVER:
6064       {
6065         jam();
6066         state_start_time = nodePtr.p->waitSumaHandoverTime;
6067         calculate_most_recent_node(nodePtr.i,
6068                                    state_start_time,
6069                                    nodePtr.p->nodeRecoveryStatus,
6070                                    &most_recent_node,
6071                                    &most_recent_node_start_time,
6072                                    &most_recent_node_status);
6073         break;
6074       }
6075       case NodeRecord::WAIT_LCP_FOR_RESTART:
6076       {
6077         jam();
6078         state_start_time = nodePtr.p->waitLCPForRestartTime;
6079         ndbassert(NdbTick_IsValid(nodePtr.p->includeNodeInLCPAndGCPTime));
6080         ndbassert(NdbTick_IsValid(nodePtr.p->copyDictToStartingNodeTime));
6081         calculate_most_recent_node(nodePtr.i,
6082                                    state_start_time,
6083                                    nodePtr.p->nodeRecoveryStatus,
6084                                    &most_recent_node,
6085                                    &most_recent_node_start_time,
6086                                    &most_recent_node_status);
6087         break;
6088       }
6089       /**
6090        * The following are states where we expect a node restart to either
6091        * be ongoing or to very soon start up.
6092        *
6093        * The states ranging from NDBCNTR_STARTED to COPY_FRAGMENTS_STARTED
6094        * are states that can be used to estimate the time remaining until
6095        * someone reaches the WAIT_LCP_FOR_RESTART state. We get the state
6096        * and time in this state for the node that has proceeded the
6097        * furthest in the restart. The other states are less good for
6098        * estimating the time remaining but will still be used with some
6099        * extra heuristics.
6100        */
6101       case NodeRecord::NODE_FAILED:
6102       {
6103         jam();
6104         state_start_time = nodePtr.p->nodeFailTime;
6105         calculate_time_remaining(nodePtr.i,
6106                                  state_start_time,
6107                                  now,
6108                                  nodePtr.p->nodeRecoveryStatus,
6109                                  &node_waited_for,
6110                                  &time_since_state_start,
6111                                  &max_status);
6112         break;
6113       }
6114       case NodeRecord::NODE_FAILURE_COMPLETED:
6115       {
6116         jam();
6117         state_start_time = nodePtr.p->nodeFailCompletedTime;
6118         calculate_time_remaining(nodePtr.i,
6119                                  state_start_time,
6120                                  now,
6121                                  nodePtr.p->nodeRecoveryStatus,
6122                                  &node_waited_for,
6123                                  &time_since_state_start,
6124                                  &max_status);
6125         break;
6126       }
6127       case NodeRecord::ALLOCATED_NODE_ID:
6128       {
6129         jam();
6130         state_start_time = nodePtr.p->allocatedNodeIdTime;
6131         calculate_time_remaining(nodePtr.i,
6132                                  state_start_time,
6133                                  now,
6134                                  nodePtr.p->nodeRecoveryStatus,
6135                                  &node_waited_for,
6136                                  &time_since_state_start,
6137                                  &max_status);
6138         break;
6139       }
6140       case NodeRecord::INCLUDED_IN_HB_PROTOCOL:
6141       {
6142         jam();
6143         state_start_time = nodePtr.p->includedInHBProtocolTime;
6144         calculate_time_remaining(nodePtr.i,
6145                                  state_start_time,
6146                                  now,
6147                                  nodePtr.p->nodeRecoveryStatus,
6148                                  &node_waited_for,
6149                                  &time_since_state_start,
6150                                  &max_status);
6151         break;
6152       }
6153       case NodeRecord::NDBCNTR_START_WAIT:
6154       {
6155         jam();
6156         state_start_time = nodePtr.p->ndbcntrStartWaitTime;
6157         calculate_time_remaining(nodePtr.i,
6158                                  state_start_time,
6159                                  now,
6160                                  nodePtr.p->nodeRecoveryStatus,
6161                                  &node_waited_for,
6162                                  &time_since_state_start,
6163                                  &max_status);
6164         break;
6165       }
6166       case NodeRecord::NDBCNTR_STARTED:
6167       {
6168         jam();
6169         state_start_time = nodePtr.p->ndbcntrStartedTime;
6170         calculate_time_remaining(nodePtr.i,
6171                                  state_start_time,
6172                                  now,
6173                                  nodePtr.p->nodeRecoveryStatus,
6174                                  &node_waited_for,
6175                                  &time_since_state_start,
6176                                  &max_status);
6177         break;
6178       }
6179       case NodeRecord::START_PERMITTED:
6180       {
6181         jam();
6182         state_start_time = nodePtr.p->startPermittedTime;
6183         calculate_time_remaining(nodePtr.i,
6184                                  state_start_time,
6185                                  now,
6186                                  nodePtr.p->nodeRecoveryStatus,
6187                                  &node_waited_for,
6188                                  &time_since_state_start,
6189                                  &max_status);
6190         break;
6191       }
6192       case NodeRecord::WAIT_LCP_TO_COPY_DICT:
6193       {
6194         jam();
6195         state_start_time = nodePtr.p->waitLCPToCopyDictTime;
6196         calculate_time_remaining(nodePtr.i,
6197                                  state_start_time,
6198                                  now,
6199                                  nodePtr.p->nodeRecoveryStatus,
6200                                  &node_waited_for,
6201                                  &time_since_state_start,
6202                                  &max_status);
6203         break;
6204       }
6205       case NodeRecord::COPY_DICT_TO_STARTING_NODE:
6206       {
6207         jam();
6208         state_start_time = nodePtr.p->copyDictToStartingNodeTime;
6209         calculate_time_remaining(nodePtr.i,
6210                                  state_start_time,
6211                                  now,
6212                                  nodePtr.p->nodeRecoveryStatus,
6213                                  &node_waited_for,
6214                                  &time_since_state_start,
6215                                  &max_status);
6216         break;
6217       }
6218       case NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP:
6219       {
6220         jam();
6221         state_start_time = nodePtr.p->includeNodeInLCPAndGCPTime;
6222         calculate_time_remaining(nodePtr.i,
6223                                  state_start_time,
6224                                  now,
6225                                  nodePtr.p->nodeRecoveryStatus,
6226                                  &node_waited_for,
6227                                  &time_since_state_start,
6228                                  &max_status);
6229         break;
6230       }
6231       case NodeRecord::LOCAL_RECOVERY_STARTED:
6232       {
6233         jam();
6234         state_start_time = nodePtr.p->startDatabaseRecoveryTime;
6235         calculate_time_remaining(nodePtr.i,
6236                                  state_start_time,
6237                                  now,
6238                                  nodePtr.p->nodeRecoveryStatus,
6239                                  &node_waited_for,
6240                                  &time_since_state_start,
6241                                  &max_status);
6242         break;
6243       }
6244       case NodeRecord::RESTORE_FRAG_COMPLETED:
6245       {
6246         jam();
6247         state_start_time = nodePtr.p->startUndoDDTime;
6248         calculate_time_remaining(nodePtr.i,
6249                                  state_start_time,
6250                                  now,
6251                                  nodePtr.p->nodeRecoveryStatus,
6252                                  &node_waited_for,
6253                                  &time_since_state_start,
6254                                  &max_status);
6255         break;
6256       }
6257       case NodeRecord::UNDO_DD_COMPLETED:
6258       {
6259         jam();
6260         state_start_time = nodePtr.p->startExecREDOLogTime;
6261         calculate_time_remaining(nodePtr.i,
6262                                  state_start_time,
6263                                  now,
6264                                  nodePtr.p->nodeRecoveryStatus,
6265                                  &node_waited_for,
6266                                  &time_since_state_start,
6267                                  &max_status);
6268         break;
6269       }
6270       case NodeRecord::EXECUTE_REDO_LOG_COMPLETED:
6271       {
6272         jam();
6273         state_start_time = nodePtr.p->startBuildIndexTime;
6274         calculate_time_remaining(nodePtr.i,
6275                                  state_start_time,
6276                                  now,
6277                                  nodePtr.p->nodeRecoveryStatus,
6278                                  &node_waited_for,
6279                                  &time_since_state_start,
6280                                  &max_status);
6281         break;
6282       }
6283       case NodeRecord::COPY_FRAGMENTS_STARTED:
6284       {
6285         jam();
6286         state_start_time = nodePtr.p->copyFragmentsStartedTime;
6287         calculate_time_remaining(nodePtr.i,
6288                                  state_start_time,
6289                                  now,
6290                                  nodePtr.p->nodeRecoveryStatus,
6291                                  &node_waited_for,
6292                                  &time_since_state_start,
6293                                  &max_status);
6294         break;
6295       }
6296       default:
6297       {
6298         jamLine(nodePtr.p->nodeRecoveryStatus);
6299         /* The states only used on non-masters should never occur here */
6300         ndbrequire(false);
6301       }
6302     }
6303   }
6304   if (node_waited_for == 0)
6305   {
6306     jam();
6307     /* No restart is ongoing, we can safely proceed with starting the LCP. */
6308     goto immediate_start_label;
6309   }
6310   if (most_recent_node == 0)
6311   {
6312     jam();
6313     /**
6314      * We have restarts ongoing, but we have no node that can be used to
6315      * estimate the remaining time. In this case we use a heuristic which
6316      * means we're willing to wait for 25% of the max wait time (about
6317      * 7% of the time to execute an LCP). If this wait is sufficient for a
6318      * node to reach WAIT_LCP_FOR_RESTART we immediately get more recent
6319      * estimate and can make more intelligent estimates at that time.
6320      */
6321     lcp_max_wait_time *= MAX_PERCENTAGE_ADJUSTMENT_FOR_NO_ESTIMATE;
6322     lcp_max_wait_time /= 100;
6323     if (lcp_stall_time > lcp_max_wait_time)
6324     {
6325       jam();
6326       goto immediate_start_label;
6327     }
6328     else
6329     {
6330       jam();
6331       goto wait_label;
6332     }
6333   }
6334 
6335   /**
6336    * A node exists which has estimates on times to execute the node restart.
6337    * A node restart exists as well. We will estimate whether it makes sense
6338    * to delay the LCP for a while more at this time.
6339    */
6340   nodePtr.i = most_recent_node;
6341   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
6342   jamLine(most_recent_node);
6343   jamLine(node_waited_for);
6344 
6345   if (nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_ACTIVE)
6346   {
6347     /**
6348      * We have only access to a node where we gathered measurements during
6349      * the time we were non-master node. We transfer times from non-master
6350      * timers to master timers as best estimates to use below in our
6351      * calculations. We also change the max_status to ensure that we read
6352      * the correct timer when doing the calculations.
6353      *
6354      * Also we don't measure any time since state start since our calculations
6355      * very rough and it would take a lot of logic to get a good estimate of
6356      * time since the state start according the stats gathered as non-master.
6357      *
6358      * Also given that our estimates are less accurate we will decrease the
6359      * maximum wait time by 50%.
6360      */
6361     if (max_status < NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP)
6362     {
6363       jam();
6364       max_status = NodeRecord::NDBCNTR_STARTED;
6365       nodePtr.p->ndbcntrStartedTime = nodePtr.p->nodeGettingPermitTime;
6366     }
6367     else if (max_status < NodeRecord::COPY_FRAGMENTS_STARTED)
6368     {
6369       jam();
6370       max_status = NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP;
6371       nodePtr.p->includeNodeInLCPAndGCPTime =
6372         nodePtr.p->nodeGettingIncludedTime;
6373     }
6374     else
6375     {
6376       jam();
6377       max_status = NodeRecord::COPY_FRAGMENTS_STARTED;
6378       nodePtr.p->copyFragmentsStartedTime = nodePtr.p->nodeGettingSynchedTime;
6379     }
6380     nodePtr.p->waitLCPForRestartTime = nodePtr.p->nodeInLCPWaitStateTime;
6381     time_since_state_start = 0;
6382     lcp_max_wait_time *= MAX_PERCENTAGE_ADJUSTMENT_FOR_EARLY_START_PHASES;
6383     lcp_max_wait_time /= 100;
6384   }
6385 
6386   /**
6387    * Calculate estimated time remaining from start of the max state we've seen.
6388    */
6389   switch (max_status)
6390   {
6391     case NodeRecord::NODE_FAILED:
6392     case NodeRecord::NODE_FAILURE_COMPLETED:
6393     case NodeRecord::ALLOCATED_NODE_ID:
6394     case NodeRecord::INCLUDED_IN_HB_PROTOCOL:
6395     case NodeRecord::NDBCNTR_START_WAIT:
6396     {
6397       jam();
6398       /**
6399        * Estimate a complete restart, these states have wait states that are
6400        * hard to estimate impact of. So here we simply want a measurement
6401        * whether it pays off to wait, we also decrease the maximum wait time
6402        * to decrease likelihood we will actually wait.
6403        */
6404       lcp_max_wait_time *= 50;
6405       lcp_max_wait_time /= 100;
6406       estimated_time = NdbTick_Elapsed(nodePtr.p->ndbcntrStartedTime,
6407                               nodePtr.p->waitLCPForRestartTime).milliSec();
6408       break;
6409     }
6410     case NodeRecord::NDBCNTR_STARTED:
6411     {
6412       jam();
6413       estimated_time = NdbTick_Elapsed(nodePtr.p->ndbcntrStartedTime,
6414                               nodePtr.p->waitLCPForRestartTime).milliSec();
6415       break;
6416     }
6417     case NodeRecord::START_PERMITTED:
6418     {
6419       jam();
6420       estimated_time = NdbTick_Elapsed(nodePtr.p->startPermittedTime,
6421                               nodePtr.p->waitLCPForRestartTime).milliSec();
6422       break;
6423     }
6424     case NodeRecord::WAIT_LCP_TO_COPY_DICT:
6425     {
6426       jam();
6427       estimated_time = NdbTick_Elapsed(nodePtr.p->waitLCPToCopyDictTime,
6428                               nodePtr.p->waitLCPForRestartTime).milliSec();
6429       break;
6430     }
6431     case NodeRecord::COPY_DICT_TO_STARTING_NODE:
6432     {
6433       jam();
6434       estimated_time = NdbTick_Elapsed(nodePtr.p->copyDictToStartingNodeTime,
6435                               nodePtr.p->waitLCPForRestartTime).milliSec();
6436       break;
6437     }
6438     case NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP:
6439     {
6440       jam();
6441       estimated_time = NdbTick_Elapsed(nodePtr.p->includeNodeInLCPAndGCPTime,
6442                               nodePtr.p->waitLCPForRestartTime).milliSec();
6443       break;
6444     }
6445     case NodeRecord::LOCAL_RECOVERY_STARTED:
6446     {
6447       jam();
6448       estimated_time = NdbTick_Elapsed(nodePtr.p->startDatabaseRecoveryTime,
6449                               nodePtr.p->waitLCPForRestartTime).milliSec();
6450       break;
6451     }
6452     case NodeRecord::RESTORE_FRAG_COMPLETED:
6453     {
6454       jam();
6455       estimated_time = NdbTick_Elapsed(nodePtr.p->startUndoDDTime,
6456                               nodePtr.p->waitLCPForRestartTime).milliSec();
6457       break;
6458     }
6459     case NodeRecord::UNDO_DD_COMPLETED:
6460     {
6461       jam();
6462       estimated_time = NdbTick_Elapsed(nodePtr.p->startExecREDOLogTime,
6463                               nodePtr.p->waitLCPForRestartTime).milliSec();
6464       break;
6465     }
6466     case NodeRecord::EXECUTE_REDO_LOG_COMPLETED:
6467     {
6468       jam();
6469       estimated_time = NdbTick_Elapsed(nodePtr.p->startBuildIndexTime,
6470                               nodePtr.p->waitLCPForRestartTime).milliSec();
6471       break;
6472     }
6473     case NodeRecord::COPY_FRAGMENTS_STARTED:
6474     {
6475       jam();
6476       estimated_time = NdbTick_Elapsed(nodePtr.p->copyFragmentsStartedTime,
6477                               nodePtr.p->waitLCPForRestartTime).milliSec();
6478       break;
6479     }
6480     default:
6481     {
6482       jamLine(max_status);
6483       ndbrequire(false);
6484     }
6485   }
6486 
6487   if (estimated_time < time_since_state_start)
6488   {
6489     jam();
6490     time_remaining = 0;
6491   }
6492   else
6493   {
6494     jam();
6495     time_remaining = estimated_time - time_since_state_start;
6496   }
6497   if (time_remaining > lcp_max_wait_time)
6498   {
6499     jam();
6500     goto immediate_start_label;
6501   }
6502 
6503 wait_label:
6504   /**
6505    * We exit from the routine to check for stalling LCPs with a decision
6506    * to stall or continue stalling. We ensure that we output proper logs
6507    * about this decision every now and then and that we record the proper
6508    * information about the stalling decisions.
6509    */
6510   jam();
6511   if (c_lcpState.lcpStallStart == 0)
6512   {
6513     jam();
6514     c_lcpState.m_start_lcp_check_time = now;
6515   }
6516   if (c_lcpState.lcpStallStart == 0 ||
6517       node_waited_for != c_lcpState.stall_node_waiting_for ||
6518       NdbTick_Elapsed(c_lcpState.lastLogTime, now).milliSec() >
6519       Uint64(1200000))
6520   {
6521     /**
6522      * Output a log message every time we start stalling
6523      * and every time we change node waiting for and every
6524      * time we have stalled for 2 mins.
6525      */
6526     jam();
6527     c_lcpState.lastLogTime = now;
6528     infoEvent("Stall LCP, LCP time = %u secs,"
6529               " wait for Node%u, state %s",
6530               Uint32(c_lcpState.m_lcp_time / 1000),
6531               node_waited_for,
6532               get_status_str(max_status));
6533     infoEvent("Stall LCP: current stall time: %u secs,"
6534               " max wait time:%u secs",
6535               Uint32(lcp_stall_time/1000),
6536               Uint32(lcp_max_wait_time/1000));
6537   }
6538   c_lcpState.lcpStallStart = 1;
6539   c_lcpState.stall_node_waiting_for = node_waited_for;
6540   return true;
6541 
6542 immediate_start_label:
6543   /**
6544    * We quit waiting for starting the LCP, we will start immediately.
6545    * This will be recorded as a start LCP, so no need for special
6546    * logging message for this. Simply reset the stall state.
6547    */
6548   c_lcpState.lcpStallStart = 0;
6549   return false;
6550 }
6551 
6552 const char*
get_status_str(NodeRecord::NodeRecoveryStatus status)6553 Dbdih::get_status_str(NodeRecord::NodeRecoveryStatus status)
6554 {
6555   const char *status_str;
6556   switch (status)
6557   {
6558   case NodeRecord::ALLOCATED_NODE_ID:
6559     status_str="Allocated node id";
6560     break;
6561   case NodeRecord::INCLUDED_IN_HB_PROTOCOL:
6562     status_str="Included in heartbeat protocol";
6563     break;
6564   case NodeRecord::NDBCNTR_START_WAIT:
6565     status_str="Wait for NDBCNTR master permit";
6566     break;
6567   case NodeRecord::NDBCNTR_STARTED:
6568     status_str="NDBCNTR master permitted us";
6569     break;
6570   case NodeRecord::NODE_GETTING_PERMIT:
6571   case NodeRecord::START_PERMITTED:
6572     status_str="All nodes permitted us";
6573     break;
6574   case NodeRecord::WAIT_LCP_TO_COPY_DICT:
6575     status_str="Wait for LCP complete to copy meta data";
6576     break;
6577   case NodeRecord::COPY_DICT_TO_STARTING_NODE:
6578     status_str="Copy meta data to start node";
6579     break;
6580   case NodeRecord::NODE_GETTING_INCLUDED:
6581   case NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP:
6582     status_str="Include node in LCP/GCP protocols";
6583     break;
6584   case NodeRecord::LOCAL_RECOVERY_STARTED:
6585     status_str="Restore fragments ongoing";
6586     break;
6587   case NodeRecord::RESTORE_FRAG_COMPLETED:
6588     status_str="Undo Disk data ongoing";
6589     break;
6590   case NodeRecord::UNDO_DD_COMPLETED:
6591     status_str="Execute REDO logs ongoing";
6592     break;
6593   case NodeRecord::EXECUTE_REDO_LOG_COMPLETED:
6594     status_str="Build indexes ongoing";
6595     break;
6596   case NodeRecord::NODE_GETTING_SYNCHED:
6597   case NodeRecord::COPY_FRAGMENTS_STARTED:
6598     status_str="Synchronize start node with live nodes";
6599     break;
6600   case NodeRecord::NODE_IN_LCP_WAIT_STATE:
6601   case NodeRecord::WAIT_LCP_FOR_RESTART:
6602     status_str="Wait LCP to ensure durability";
6603     break;
6604   case NodeRecord::WAIT_SUMA_HANDOVER:
6605     status_str="Wait handover of subscriptions";
6606     break;
6607   case NodeRecord::NODE_ACTIVE:
6608   case NodeRecord::RESTART_COMPLETED:
6609     status_str="Restart completed";
6610     break;
6611   case NodeRecord::NODE_FAILED:
6612     status_str="Node failed, fail handling ongoing";
6613     break;
6614   case NodeRecord::NODE_FAILURE_COMPLETED:
6615     status_str="Node failure handling complete";
6616     break;
6617   case NodeRecord::NODE_NOT_RESTARTED_YET:
6618     status_str="Initial state";
6619     break;
6620   default:
6621     jamLine(status);
6622     ndbrequire(false);
6623     break;
6624   }
6625   return status_str;
6626 }
6627 
6628 /**
6629  * Fill the table with the following data:
6630  * All the times are reported in seconds.
6631  *
6632  * NodeRestartStatus: This is a string which is derived from the
6633  *  nodeRecoveryStatus.
6634  *
6635  * CompleteFailTime: Time to complete the node failure.
6636  * AllocatedNodeIdTime: Time from completing node failure until we have
6637  *   allocated a node id again.
6638  * IncludeHeartbeatProtocolTime: Time from allocating node id until we
6639  *   have been included in the heartbeat protocol.
6640  * NdbcntrStartWaitTime: Time from being included in the heartbeat
6641  *   protocol until we have been set to wait for NDBCNTR master to
6642  *   allow us to continue starting.
6643  * NdbcntrStartedTime: Time from we start waiting for NDBCNTR master
6644  *   to accept us into the cluster until we are accepted into the cluster.
6645  * StartPermittedTime: Time from we are accepted by NDBCNTR master to
6646  *   start until we have received Start permit from all nodes.
6647  * WaitLCPToCopyDictTime: Time from all nodes permit us to start until we
6648  *   have finished waiting for LCP to complete before we copy the meta
6649  *   data in the cluster.
 * CopyDictToStartingNodeTime: Time from we have been allowed to start
 *   copying meta data until we have completed this.
6652  * IncludeNodeInLCPAndGCPTime: Time from we have copied the meta data
6653  *   until we have stopped the GCP protocol and have been included into
6654  *   the LCP and GCP protocol by all nodes.
6655  * LocalRecoveryTime: Time from being included until we have fully completed
6656  *   the Local Recovery in a node.
6657  * RestoreFragmentTime:
6658  * Time to restore all fragments from local files generated by the LCPs.
6659  * UndoDDTime:
6660  * Time to run Disk Data UNDO log on all restored fragments.
6661  * ExecREDOLogTime:
6662  * Time to execute the REDO log on all restored fragments.
6663  * BuildIndexTime:
6664  * Time to rebuild indexes on all restored fragments.
6665  * CopyFragmentsTime: Time from completing Local Recovery until all recent data
6666  *   have been copied from alive nodes to starting node.
6667  * WaitSumaHandoverTime: Time from being fully up-to-date until we have
6668  *   completed the handover of replication subscriptions.
6669  * Total recovery time:
6670  * Total time from node failure completed until we are started again.
6671  *
6672  * For nodes that have states set when we were not yet master we will only
6673  * report a few times:
6674  * StartPermittedTime: Time from node completed the node failure until our
6675  *   node permitted the node to start.
6676  * IncludeNodeInLCPAndGCPTime: Time from we permitted the node to start until
6677  *   we completed including the node in the LCP and GCP protocol.
6678  * LocalRecoveryTime: Time from we were included in the LCP and GCP protocol until
6679  *   we started copying the fragments.
6680  * CopyFragmentsTime: Time from we started synchronizing the starting node
6681  *   until we completed the node restart.
6682  *
6683  * Any time not happened yet will be reported as 0.
6684  */
write_zero_columns(Ndbinfo::Row & row,Uint32 num_rows)6685 void Dbdih::write_zero_columns(Ndbinfo::Row &row, Uint32 num_rows)
6686 {
6687   for (Uint32 i = 0; i < num_rows; i++)
6688   {
6689     jam();
6690     row.write_uint32(Uint32(0));
6691   }
6692   return;
6693 }
6694 
/**
 * Fill one ndbinfo restart_info row for a node, as seen by the master.
 * Columns: node id, status string, numeric status, then 19 time columns
 * (in whole seconds, see the column description comment above).
 *
 * The function walks the restart phases in order; as soon as the node's
 * current status matches the phase just written, the remaining columns
 * are padded with zeros via write_zero_columns(). The zero count shrinks
 * by one for each time column already written (19, 18, 17, ...), so the
 * row always carries the same total number of columns.
 */
void Dbdih::fill_row_with_node_restart_status(NodeRecordPtr nodePtr,
                                              Ndbinfo::Row &row)
{
  Uint64 elapsed;
  NodeRecord::NodeRecoveryStatus status = nodePtr.p->nodeRecoveryStatus;
  row.write_uint32(nodePtr.i);
  const char *status_str = get_status_str(status);
  row.write_string(status_str);
  row.write_uint32(Uint32(nodePtr.p->nodeRecoveryStatus));

  if (status == NodeRecord::NODE_ACTIVE)
  {
    /* Restart observed (partly) before we became master: reduced view. */
    handle_before_master(nodePtr, row);
    return;
  }
  if (status == NodeRecord::NODE_FAILED)
  {
    /* Failure handling still ongoing: no time columns available yet. */
    write_zero_columns(row, 19);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->nodeFailTime,
                            nodePtr.p->nodeFailCompletedTime).milliSec();
  elapsed/= 1000;
  /* Time to complete node failure */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::NODE_FAILURE_COMPLETED)
  {
    write_zero_columns(row, 18);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->nodeFailCompletedTime,
                            nodePtr.p->allocatedNodeIdTime).milliSec();
  elapsed/= 1000;
  /* Time to allocate node id */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::ALLOCATED_NODE_ID)
  {
    write_zero_columns(row, 17);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->allocatedNodeIdTime,
                            nodePtr.p->includedInHBProtocolTime).milliSec();
  elapsed/= 1000;
  /* Time to include in HB Protocol */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::INCLUDED_IN_HB_PROTOCOL)
  {
    write_zero_columns(row, 16);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->includedInHBProtocolTime,
                            nodePtr.p->ndbcntrStartWaitTime).milliSec();
  elapsed/= 1000;
  /* Time until wait for ndbcntr master */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::NDBCNTR_START_WAIT)
  {
    write_zero_columns(row, 15);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->ndbcntrStartWaitTime,
                            nodePtr.p->ndbcntrStartedTime).milliSec();
  elapsed/= 1000;
  /* Time wait for NDBCNTR master */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::NDBCNTR_STARTED)
  {
    write_zero_columns(row, 14);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->ndbcntrStartedTime,
                            nodePtr.p->startPermittedTime).milliSec();
  elapsed/= 1000;
  /* Time to get start permitted */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::START_PERMITTED)
  {
    write_zero_columns(row, 13);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->startPermittedTime,
                            nodePtr.p->waitLCPToCopyDictTime).milliSec();
  elapsed/= 1000;
  /* Time to wait for LCP to copy meta data */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::WAIT_LCP_TO_COPY_DICT)
  {
    write_zero_columns(row, 12);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->waitLCPToCopyDictTime,
                            nodePtr.p->copyDictToStartingNodeTime).milliSec();
  elapsed/= 1000;
  /* Time to copy meta data */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::COPY_DICT_TO_STARTING_NODE)
  {
    write_zero_columns(row, 11);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->copyDictToStartingNodeTime,
                            nodePtr.p->includeNodeInLCPAndGCPTime).milliSec();
  elapsed/= 1000;
  /* Time to include node in GCP+LCP protocols */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP)
  {
    write_zero_columns(row, 10);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->includeNodeInLCPAndGCPTime,
                            nodePtr.p->startDatabaseRecoveryTime).milliSec();
  elapsed/= 1000;
  /* Time for starting node to request local recovery */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::LOCAL_RECOVERY_STARTED)
  {
    write_zero_columns(row, 9);
    return;
  }

  /**
   * Total time of local recovery. This column is a summary that spans
   * the restore/undo/redo/build-index phases, so it is only known once
   * we have reached COPY_FRAGMENTS_STARTED; before that it reads 0
   * while the individual phase columns below are filled in as far as
   * the restart has progressed.
   */
  if (status < NodeRecord::COPY_FRAGMENTS_STARTED)
  {
    row.write_uint32(Uint32(0));
  }
  else
  {
    elapsed = NdbTick_Elapsed(nodePtr.p->startDatabaseRecoveryTime,
                              nodePtr.p->copyFragmentsStartedTime).milliSec();
    elapsed/= 1000;
    row.write_uint32(Uint32(elapsed));
  }

  elapsed = NdbTick_Elapsed(nodePtr.p->startDatabaseRecoveryTime,
                            nodePtr.p->startUndoDDTime).milliSec();
  elapsed/= 1000;
  /* Time to restore fragments */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::RESTORE_FRAG_COMPLETED)
  {
    write_zero_columns(row, 7);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->startUndoDDTime,
                            nodePtr.p->startExecREDOLogTime).milliSec();
  elapsed/= 1000;
  /* Time to UNDO disk data parts */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::UNDO_DD_COMPLETED)
  {
    write_zero_columns(row, 6);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->startExecREDOLogTime,
                            nodePtr.p->startBuildIndexTime).milliSec();
  elapsed/= 1000;
  /* Time to execute REDO logs */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::EXECUTE_REDO_LOG_COMPLETED)
  {
    write_zero_columns(row, 5);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->startBuildIndexTime,
                            nodePtr.p->copyFragmentsStartedTime).milliSec();
  elapsed/= 1000;
  /* Time to build indexes */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::COPY_FRAGMENTS_STARTED)
  {
    write_zero_columns(row, 4);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->copyFragmentsStartedTime,
                            nodePtr.p->waitLCPForRestartTime).milliSec();
  elapsed/= 1000;
  /* Time to synchronize starting node with alive nodes */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::WAIT_LCP_FOR_RESTART)
  {
    write_zero_columns(row, 3);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->waitLCPForRestartTime,
                            nodePtr.p->waitSumaHandoverTime).milliSec();
  elapsed/= 1000;
  /* Time to wait for completion of LCPs */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::WAIT_SUMA_HANDOVER)
  {
    write_zero_columns(row, 2);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->waitSumaHandoverTime,
                            nodePtr.p->restartCompletedTime).milliSec();
  elapsed/= 1000;
  /* Time to handover subscriptions to starting node */
  row.write_uint32(Uint32(elapsed));

  elapsed = NdbTick_Elapsed(nodePtr.p->nodeFailTime,
                            nodePtr.p->restartCompletedTime).milliSec();
  elapsed/= 1000;
  /* Total recovery time */
  row.write_uint32(Uint32(elapsed));

  return;
}
6919 
handle_before_master(NodeRecordPtr nodePtr,Ndbinfo::Row & row)6920 void Dbdih::handle_before_master(NodeRecordPtr nodePtr,
6921                                  Ndbinfo::Row &row)
6922 {
6923   Uint64 elapsed;
6924 
6925   /* Time to complete node failure */
6926   elapsed = NdbTick_Elapsed(nodePtr.p->nodeFailTime,
6927                             nodePtr.p->nodeFailCompletedTime).milliSec();
6928   elapsed/= 1000;
6929   row.write_uint32(Uint32(elapsed));
6930 
6931   /**
6932    * No report on
6933    * 1) Allocate node id
6934    * 2) Include in heartbeat protocol
6935    * 3) Wait for NDBCNTR master
6936    * 4) Time until ok from NDBCNTR master
6937    */
6938   row.write_uint32(Uint32(0));
6939   row.write_uint32(Uint32(0));
6940   row.write_uint32(Uint32(0));
6941   row.write_uint32(Uint32(0));
6942 
6943   /* Time to get from failure to start permitted */
6944   elapsed = NdbTick_Elapsed(nodePtr.p->nodeFailTime,
6945                             nodePtr.p->nodeGettingPermitTime).milliSec();
6946   elapsed/= 1000;
6947   row.write_uint32(Uint32(elapsed));
6948 
6949   /**
6950    * No report on
6951    * 1) Time to wait for LCP to copy meta data
6952    * 2) Time to copy meta data
6953    */
6954   row.write_uint32(Uint32(0));
6955   row.write_uint32(Uint32(0));
6956 
6957   /* Time from getting start permitted to getting included */
6958   elapsed = NdbTick_Elapsed(nodePtr.p->nodeGettingPermitTime,
6959                             nodePtr.p->nodeGettingIncludedTime).milliSec();
6960   elapsed/= 1000;
6961   row.write_uint32(Uint32(elapsed));
6962 
6963   /**
6964    * No report on
6965    * 1) Time for starting node to request local recovery
6966    */
6967   row.write_uint32(Uint32(0));
6968 
6969   /* Time for local recovery */
6970   elapsed = NdbTick_Elapsed(nodePtr.p->nodeGettingIncludedTime,
6971                             nodePtr.p->nodeGettingSynchedTime).milliSec();
6972   elapsed/= 1000;
6973   row.write_uint32(Uint32(elapsed));
6974 
6975   /**
6976    * No report on
6977    * 1) Restore fragment time
6978    * 2) UNDO DD time
6979    * 3) Execute REDO log time
6980    * 4) Build index time
6981    */
6982   row.write_uint32(Uint32(0));
6983   row.write_uint32(Uint32(0));
6984   row.write_uint32(Uint32(0));
6985   row.write_uint32(Uint32(0));
6986 
6987   /* Time to synchronize starting node with alive nodes */
6988   elapsed = NdbTick_Elapsed(nodePtr.p->nodeGettingSynchedTime,
6989                             nodePtr.p->nodeInLCPWaitStateTime).milliSec();
6990   elapsed/= 1000;
6991   row.write_uint32(Uint32(elapsed));
6992 
6993   /**
6994    * No report on
6995    * 1) Time to wait for LCP to be restorable as a node
6996    * 2) Time to handover subscriptions
6997    */
6998   row.write_uint32(Uint32(0));
6999   row.write_uint32(Uint32(0));
7000 
7001   /* Total time from node failure to node restarted */
7002   elapsed = NdbTick_Elapsed(nodePtr.p->nodeFailTime,
7003                             nodePtr.p->nodeActiveTime).milliSec();
7004   elapsed/= 1000;
7005   row.write_uint32(Uint32(elapsed));
7006 
7007   return;
7008 }
7009 
execDBINFO_SCANREQ(Signal * signal)7010 void Dbdih::execDBINFO_SCANREQ(Signal *signal)
7011 {
7012   DbinfoScanReq req = *(DbinfoScanReq*)signal->theData;
7013   const Ndbinfo::ScanCursor *cursor =
7014     CAST_CONSTPTR(Ndbinfo::ScanCursor, DbinfoScan::getCursorPtr(&req));
7015   Ndbinfo::Ratelimit rl;
7016   bool sent_any = false;
7017   jamEntry();
7018 
7019   switch (req.tableId)
7020   {
7021   case Ndbinfo::RESTART_INFO_TABLEID:
7022   {
7023     if (isMaster() == false)
7024     {
7025       /* Only report from master node's view on restarts */
7026       break;
7027     }
7028     if (getNodeState().startLevel != NodeState::SL_STARTED)
7029     {
7030       jam();
7031       /* Ignore when we are starting up or shutting down */
7032       break;
7033     }
7034 
7035     NodeRecordPtr nodePtr;
7036     jam();
7037     nodePtr.i = cursor->data[0];
7038     if (nodePtr.i == 0)
7039     {
7040       nodePtr.i = 1; /* Ignore node 0 */
7041     }
7042     else if (nodePtr.i >= MAX_NDB_NODES)
7043     {
7044       break;
7045     }
7046     for (; nodePtr.i < MAX_NDB_NODES; nodePtr.i++)
7047     {
7048       ptrAss(nodePtr, nodeRecord);
7049       if (nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_NOT_RESTARTED_YET ||
7050           nodePtr.p->nodeRecoveryStatus == NodeRecord::NOT_DEFINED_IN_CLUSTER)
7051         continue;
7052       jamLine(nodePtr.i);
7053       sent_any = true;
7054       Ndbinfo::Row row(signal, req);
7055       fill_row_with_node_restart_status(nodePtr, row);
7056       ndbinfo_send_row(signal, req, row, rl);
7057       if (rl.need_break(req))
7058       {
7059         jam();
7060         ndbinfo_send_scan_break(signal, req, rl, nodePtr.i + 1);
7061         return;
7062       }
7063     }
7064     if (cursor->data[0] == 0 && !sent_any)
7065     {
7066       /* No nodes had any node restart data to report */
7067       jam();
7068       break;
7069     }
7070     break;
7071   }
7072   default:
7073     break;
7074   }
7075   ndbinfo_send_scan_conf(signal, req, rl);
7076 }
7077 /* END Node Recovery Status Module */
7078 
7079 /*****************************************************************************/
7080 /***********     NODE ADDING  MODULE                             *************/
7081 /***********     CODE TO HANDLE TAKE OVER                        *************/
7082 /*****************************************************************************/
7083 // A take over can be initiated by a number of things:
7084 // 1) A node restart, usually the node takes over itself but can also take
7085 //    over somebody else if its own data was already taken over
7086 // 2) At system restart it is necessary to use the take over code to recover
7087 //    nodes which had too old checkpoints to be restorable by the usual
7088 //    restoration from disk.
7089 // 3) When a node has missed too many local checkpoints and is decided by the
7090 //    master to be taken over by a hot spare node that sits around waiting
7091 //    for this to happen. (This is not yet implemented).
7092 //
7093 // To support multiple node failures efficiently the code is written such that
7094 // only one take over can handle transitions in state but during a copy
7095 // fragment other take over's can perform state transitions.
7096 /*****************************************************************************/
startTakeOver(Signal * signal,Uint32 startNode,Uint32 nodeTakenOver,const StartCopyReq * req)7097 void Dbdih::startTakeOver(Signal* signal,
7098                           Uint32 startNode,
7099                           Uint32 nodeTakenOver,
7100                           const StartCopyReq* req)
7101 {
7102   jam();
7103 
7104   TakeOverRecordPtr takeOverPtr;
7105 
7106   ndbrequire(c_takeOverPool.seize(takeOverPtr));
7107   takeOverPtr.p->startGci = SYSFILE->lastCompletedGCI[startNode];
7108   takeOverPtr.p->restorableGci = SYSFILE->lastCompletedGCI[startNode];
7109   takeOverPtr.p->toStartingNode = startNode;
7110   takeOverPtr.p->toFailedNode = nodeTakenOver;
7111   takeOverPtr.p->toCurrentTabref = 0;
7112   takeOverPtr.p->toCurrentFragid = 0;
7113 
7114   ndbrequire(req != NULL);
7115   takeOverPtr.p->m_flags = req->flags;
7116   takeOverPtr.p->m_senderData = req->senderData;
7117   takeOverPtr.p->m_senderRef = req->senderRef;
7118 
7119   takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_START_FRAGMENTS;
7120   nr_start_fragments(signal, takeOverPtr);
7121 }//Dbdih::startTakeOver()
7122 
7123 void
nr_start_fragments(Signal * signal,TakeOverRecordPtr takeOverPtr)7124 Dbdih::nr_start_fragments(Signal* signal,
7125 			  TakeOverRecordPtr takeOverPtr)
7126 {
7127   Uint32 loopCount = 0 ;
7128   TabRecordPtr tabPtr;
7129   while (loopCount++ < 100) {
7130     tabPtr.i = takeOverPtr.p->toCurrentTabref;
7131     if (tabPtr.i >= ctabFileSize) {
7132       jam();
7133       nr_run_redo(signal, takeOverPtr);
7134       return;
7135     }//if
7136     ptrAss(tabPtr, tabRecord);
7137     if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE ||
7138 	tabPtr.p->tabStorage != TabRecord::ST_NORMAL)
7139     {
7140       jam();
7141       takeOverPtr.p->toCurrentFragid = 0;
7142       takeOverPtr.p->toCurrentTabref++;
7143       continue;
7144     }//if
7145     Uint32 fragId = takeOverPtr.p->toCurrentFragid;
7146     if (fragId >= tabPtr.p->totalfragments) {
7147       jam();
7148       takeOverPtr.p->toCurrentFragid = 0;
7149       takeOverPtr.p->toCurrentTabref++;
7150       continue;
7151     }//if
7152     FragmentstorePtr fragPtr;
7153     getFragstore(tabPtr.p, fragId, fragPtr);
7154     ReplicaRecordPtr loopReplicaPtr;
7155     loopReplicaPtr.i = fragPtr.p->oldStoredReplicas;
7156     while (loopReplicaPtr.i != RNIL) {
7157       c_replicaRecordPool.getPtr(loopReplicaPtr);
7158       if (loopReplicaPtr.p->procNode == takeOverPtr.p->toStartingNode) {
7159         jam();
7160 	nr_start_fragment(signal, takeOverPtr, loopReplicaPtr);
7161 	break;
7162       } else {
7163         jam();
7164         loopReplicaPtr.i = loopReplicaPtr.p->nextPool;
7165       }//if
7166     }//while
7167     takeOverPtr.p->toCurrentFragid++;
7168   }//while
7169   signal->theData[0] = DihContinueB::ZTO_START_FRAGMENTS;
7170   signal->theData[1] = takeOverPtr.i;
7171   sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
7172 }
7173 
7174 void
nr_start_fragment(Signal * signal,TakeOverRecordPtr takeOverPtr,ReplicaRecordPtr replicaPtr)7175 Dbdih::nr_start_fragment(Signal* signal,
7176 			 TakeOverRecordPtr takeOverPtr,
7177 			 ReplicaRecordPtr replicaPtr)
7178 {
7179   Uint32 i;
7180   Uint32 maxLcpId = 0;
7181   Uint32 maxLcpIndex = ~0;
7182 
7183   Uint32 gci = 0;
7184   Uint32 restorableGCI = takeOverPtr.p->restorableGci;
7185 
7186 #if defined VM_TRACE || defined ERROR_INSERT
7187   ndbout_c("tab: %d frag: %d replicaP->nextLcp: %d",
7188 	   takeOverPtr.p->toCurrentTabref,
7189 	   takeOverPtr.p->toCurrentFragid,
7190 	   replicaPtr.p->nextLcp);
7191 #endif
7192 
7193   Int32 j = replicaPtr.p->noCrashedReplicas - 1;
7194   Uint32 idx = prevLcpNo(replicaPtr.p->nextLcp);
7195   for(i = 0; i<MAX_LCP_USED; i++, idx = prevLcpNo(idx))
7196   {
7197 #if defined VM_TRACE || defined ERROR_INSERT
7198     ndbout_c("scanning idx: %d lcpId: %d crashed replicas: %u %s",
7199              idx, replicaPtr.p->lcpId[idx],
7200              replicaPtr.p->noCrashedReplicas,
7201              replicaPtr.p->lcpStatus[idx] == ZVALID ? "VALID" : "NOT VALID");
7202 #endif
7203     if (replicaPtr.p->lcpStatus[idx] == ZVALID)
7204     {
7205       Uint32 startGci = replicaPtr.p->maxGciCompleted[idx] + 1;
7206       Uint32 stopGci = replicaPtr.p->maxGciStarted[idx];
7207 #if defined VM_TRACE || defined ERROR_INSERT
7208       ndbout_c(" maxGciCompleted: %u maxGciStarted: %u", startGci - 1, stopGci);
7209 #endif
7210       for (; j>= 0; j--)
7211       {
7212 #if defined VM_TRACE || defined ERROR_INSERT
7213 	ndbout_c("crashed replica: %d(%d) replica(createGci: %u lastGci: %d )",
7214 		 j,
7215 		 replicaPtr.p->noCrashedReplicas,
7216                  replicaPtr.p->createGci[j],
7217 		 replicaPtr.p->replicaLastGci[j]);
7218 #endif
7219 	if (replicaPtr.p->createGci[j] <= startGci &&
7220             replicaPtr.p->replicaLastGci[j] >= stopGci)
7221 	{
7222 	  maxLcpId = replicaPtr.p->lcpId[idx];
7223 	  maxLcpIndex = idx;
7224           gci = replicaPtr.p->replicaLastGci[j];
7225 	  goto done;
7226 	}
7227       }
7228     }
7229     else
7230     {
7231 #if defined VM_TRACE || defined ERROR_INSERT
7232       ndbout_c(" ");
7233 #endif
7234     }
7235   }
7236 
7237   idx = 2; // backward compat code
7238 #if defined VM_TRACE || defined ERROR_INSERT
7239   ndbout_c("- scanning idx: %d lcpId: %d", idx, replicaPtr.p->lcpId[idx]);
7240 #endif
7241   if (replicaPtr.p->lcpStatus[idx] == ZVALID)
7242   {
7243     Uint32 startGci = replicaPtr.p->maxGciCompleted[idx] + 1;
7244     Uint32 stopGci = replicaPtr.p->maxGciStarted[idx];
7245     for (;j >= 0; j--)
7246     {
7247 #if defined VM_TRACE || defined ERROR_INSERT
7248       ndbout_c("crashed replica: %d(%d) replica(createGci: %u lastGci: %d )",
7249                j,
7250                replicaPtr.p->noCrashedReplicas,
7251                replicaPtr.p->createGci[j],
7252                replicaPtr.p->replicaLastGci[j]);
7253 #endif
7254       if (replicaPtr.p->createGci[j] <= startGci &&
7255           replicaPtr.p->replicaLastGci[j] >= stopGci)
7256       {
7257         maxLcpId = replicaPtr.p->lcpId[idx];
7258         maxLcpIndex = idx;
7259         gci = replicaPtr.p->replicaLastGci[j];
7260         goto done;
7261       }
7262     }
7263   }
7264 
7265 done:
7266 
7267   StartFragReq *req = (StartFragReq *)signal->getDataPtrSend();
7268   req->requestInfo = StartFragReq::SFR_RESTORE_LCP;
7269   if (maxLcpIndex == ~ (Uint32) 0)
7270   {
7271     /**
7272      * we didn't find a local LCP that we can restore
7273      */
7274     jam();
7275     ndbassert(gci == 0);
7276     replicaPtr.p->m_restorable_gci = gci;
7277 
7278     req->userPtr = 0;
7279     req->userRef = reference();
7280     req->lcpNo = ZNIL;
7281     req->lcpId = 0;
7282     req->tableId = takeOverPtr.p->toCurrentTabref;
7283     req->fragId = takeOverPtr.p->toCurrentFragid;
7284     req->noOfLogNodes = 0;
7285 
7286     if (c_2pass_inr && cstarttype == NodeState::ST_INITIAL_NODE_RESTART)
7287     {
7288       /**
7289        * Check if we can make 2-phase copy
7290        *   1) non-transaction, (after we rebuild indexes)
7291        *   2) transaction (maintaining indexes during rebuild)
       *      where the transactional copies everything >= startGci
7293        *
7294        * NOTE: c_2pass_inr is only set if all nodes in cluster currently
7295        *       supports this
7296        */
7297 
7298       if (takeOverPtr.p->startGci == 0)
7299       {
7300         jam();
7301         /**
7302          * Set a startGci to currently lastCompletedGCI of master
7303          *   any value will do...as long as subsequent transactional copy
7304          *   will be using it (scanning >= this value)
7305          */
7306         takeOverPtr.p->startGci = SYSFILE->lastCompletedGCI[cmasterNodeId];
7307       }
7308 
7309       TabRecordPtr tabPtr;
7310       tabPtr.i = takeOverPtr.p->toCurrentTabref;
7311       ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
7312 
7313       FragmentstorePtr fragPtr;
7314       getFragstore(tabPtr.p, takeOverPtr.p->toCurrentFragid, fragPtr);
7315       Uint32 nodes[MAX_REPLICAS];
7316       extractNodeInfo(jamBuffer(), fragPtr.p, nodes);
7317 
7318       req->lqhLogNode[0] = nodes[0]; // Source
7319       req->requestInfo = StartFragReq::SFR_COPY_FRAG;
7320       replicaPtr.p->m_restorable_gci = takeOverPtr.p->startGci;
7321     }
7322 
7323     if (req->requestInfo == StartFragReq::SFR_RESTORE_LCP)
7324     {
7325       g_eventLogger->debug("node: %d tab: %d frag: %d no lcp to restore",
7326                            takeOverPtr.p->toStartingNode,
7327                            takeOverPtr.p->toCurrentTabref,
7328                            takeOverPtr.p->toCurrentFragid);
7329     }
7330     else
7331     {
7332       g_eventLogger->debug("node: %d tab: %d frag: %d copying data from %u"
7333                            " (gci: %u)",
7334                            takeOverPtr.p->toStartingNode,
7335                            takeOverPtr.p->toCurrentTabref,
7336                            takeOverPtr.p->toCurrentFragid,
7337                            req->lqhLogNode[0],
7338                            takeOverPtr.p->startGci);
7339     }
7340 
7341     BlockReference ref = numberToRef(DBLQH, takeOverPtr.p->toStartingNode);
7342     sendSignal(ref, GSN_START_FRAGREQ, signal,
7343 	       StartFragReq::SignalLength, JBB);
7344   }
7345   else
7346   {
7347     jam();
7348     if (gci != restorableGCI)
7349     {
7350       Ptr<TabRecord> tabPtr;
7351       tabPtr.i = takeOverPtr.p->toCurrentTabref;
7352       ptrAss(tabPtr, tabRecord);
7353 
7354       FragmentstorePtr fragPtr;
7355       getFragstore(tabPtr.p, takeOverPtr.p->toCurrentFragid, fragPtr);
7356       dump_replica_info(fragPtr.p);
7357     }
7358     ndbassert(gci == restorableGCI);
7359     replicaPtr.p->m_restorable_gci = gci;
7360     Uint32 startGci = replicaPtr.p->maxGciCompleted[maxLcpIndex] + 1;
7361     if (startGci > gci)
7362       startGci = gci;
7363     g_eventLogger->debug("Requesting start of fragment: "
7364              "node: %d tab: %d frag: %d restore lcp: %u(idx: %u)"
7365              " maxGciStarted: %u maxGciCompleted: %u (restorable:"
7366              " %u(%u) newestRestorableGCI: %u)",
7367              takeOverPtr.p->toStartingNode,
7368              takeOverPtr.p->toCurrentTabref,
7369              takeOverPtr.p->toCurrentFragid,
7370 	     maxLcpId,
7371              maxLcpIndex,
7372 	     replicaPtr.p->maxGciStarted[maxLcpIndex],
7373 	     replicaPtr.p->maxGciCompleted[maxLcpIndex],
7374 	     restorableGCI,
7375 	     SYSFILE->lastCompletedGCI[takeOverPtr.p->toStartingNode],
7376 	     SYSFILE->newestRestorableGCI);
7377 
7378     StartFragReq *req = (StartFragReq *)signal->getDataPtrSend();
7379     req->userPtr = 0;
7380     req->userRef = reference();
7381     req->lcpNo = maxLcpIndex;
7382     req->lcpId = maxLcpId;
7383     req->tableId = takeOverPtr.p->toCurrentTabref;
7384     req->fragId = takeOverPtr.p->toCurrentFragid;
7385     req->noOfLogNodes = 1;
7386     req->lqhLogNode[0] = takeOverPtr.p->toStartingNode;
7387     req->startGci[0] = startGci;
7388     req->lastGci[0] = gci;
7389 
7390     BlockReference ref = numberToRef(DBLQH, takeOverPtr.p->toStartingNode);
7391     sendSignal(ref, GSN_START_FRAGREQ, signal,
7392 	       StartFragReq::SignalLength, JBB);
7393 
7394     if (startGci < takeOverPtr.p->startGci)
7395     {
7396       jam();
7397       takeOverPtr.p->startGci = startGci;
7398     }
7399   }
7400 }
7401 
7402 void
nr_run_redo(Signal * signal,TakeOverRecordPtr takeOverPtr)7403 Dbdih::nr_run_redo(Signal* signal, TakeOverRecordPtr takeOverPtr)
7404 {
7405   /**
7406    * sendSTART_RECREQ uses m_sr_nodes
7407    *   and for TO during SR, we don't want to modify it
7408    *   so save/restore it
7409    */
7410   NdbNodeBitmask save = m_sr_nodes;
7411   m_sr_nodes.clear();
7412   m_sr_nodes.set(takeOverPtr.p->toStartingNode);
7413 
7414   Uint32 save_keepGCI = SYSFILE->keepGCI;
7415   if (takeOverPtr.p->startGci < SYSFILE->keepGCI)
7416   {
7417     jam();
7418     SYSFILE->keepGCI = takeOverPtr.p->startGci;
7419     g_eventLogger->info("GSN_START_RECREQ keepGci: %u (%u)",
7420                         takeOverPtr.p->startGci, save_keepGCI);
7421   }
7422 
7423   g_eventLogger->info("All start fragments sent, requesting LDM to restore"
7424                       " all fragments and to execute the REDO log to bring"
7425                       " the database to an off-line but consistent state");
7426 
7427   takeOverPtr.p->toCurrentTabref = 0;
7428   takeOverPtr.p->toCurrentFragid = 0;
7429   takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_RUN_REDO;
7430   sendSTART_RECREQ(signal, takeOverPtr.p->toStartingNode, takeOverPtr.i);
7431 
7432   m_sr_nodes = save; // restore
7433   SYSFILE->keepGCI = save_keepGCI;
7434 }
7435 
/**
 * Scan the tables/fragments assigned to this take over thread and send
 * COPY_ACTIVEREQ (flags == 0) for every stored replica residing on the
 * starting node, one fragment at a time (we return and wait for
 * COPY_ACTIVECONF after each send). The scan is batched: after 100 loop
 * iterations a CONTINUEB is sent so we do not execute too long in one
 * signal. When the scan is exhausted and all copy threads are done, the
 * main take over record sends END_TOREQ to the master.
 */
void
Dbdih::nr_start_logging(Signal* signal, TakeOverRecordPtr takeOverPtr)
{
  Uint32 loopCount = 0 ;
  TabRecordPtr tabPtr;
  while (loopCount++ < 100)
  {
    tabPtr.i = takeOverPtr.p->toCurrentTabref;
    if (tabPtr.i >= ctabFileSize)
    {
      jam();
      /* All tables scanned: this copy thread has completed its work. */
      g_eventLogger->debug("Copy thread %u complete",
                          takeOverPtr.p->m_copy_thread_id);
      if (!thread_takeover_completed(signal, takeOverPtr))
      {
        jam();
        /* Other copy threads are still active; the last one finishes up. */
        return;
      }
      check_take_over_completed_correctly();
      g_eventLogger->info("Make On-line Database recoverable by waiting"
                          " for LCP Starting, all parallel threads have"
                          " now ceased their activity and we have a single"
                          " wait state here");

      /* All threads done: continue using the main take over record only. */
      takeOverPtr = c_mainTakeOverPtr;

      takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_END_TO;
      EndToReq* req = (EndToReq*)signal->getDataPtrSend();
      req->senderData = takeOverPtr.i;
      req->senderRef = reference();
      req->flags = takeOverPtr.p->m_flags;
      sendSignal(cmasterdihref, GSN_END_TOREQ,
                 signal, EndToReq::SignalLength, JBB);
      sendEND_TOREP(signal, takeOverPtr.p->toStartingNode);
      return;
    }
    ptrAss(tabPtr, tabRecord);
    /* Only active, normally-stored tables take part in this phase. */
    if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE ||
	tabPtr.p->tabStorage != TabRecord::ST_NORMAL)
    {
      jam();
      takeOverPtr.p->toCurrentFragid = 0;
      takeOverPtr.p->toCurrentTabref++;
      continue;
    }

    Uint32 fragId = takeOverPtr.p->toCurrentFragid;
    if (fragId >= tabPtr.p->totalfragments)
    {
      jam();
      /* Table exhausted: move to the first fragment of the next table. */
      takeOverPtr.p->toCurrentFragid = 0;
      takeOverPtr.p->toCurrentTabref++;
      continue;
    }
    FragmentstorePtr fragPtr;
    getFragstore(tabPtr.p, fragId, fragPtr);

    Uint32 instanceKey = dihGetInstanceKey(fragPtr);
    if (!check_takeover_thread(takeOverPtr,
                               fragPtr,
                               instanceKey))
    {
      jam();
      /**
       * We are scanning for fragment replicas to take over, but this replica
       * was not ours to take over, it will be handled by another take over
       * thread.
       */
      takeOverPtr.p->toCurrentFragid++;
      continue;
    }

    /* Look for a stored replica of this fragment on the starting node. */
    ReplicaRecordPtr loopReplicaPtr;
    loopReplicaPtr.i = fragPtr.p->storedReplicas;
    while (loopReplicaPtr.i != RNIL)
    {
      c_replicaRecordPool.getPtr(loopReplicaPtr);
      if (loopReplicaPtr.p->procNode == takeOverPtr.p->toStartingNode)
      {
        jam();
        /* In this phase the starting node is ourselves. */
        ndbrequire(loopReplicaPtr.p->procNode == getOwnNodeId());
        takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_SL_COPY_ACTIVE;

        BlockReference lqhRef = numberToRef(DBLQH, instanceKey,
                                            takeOverPtr.p->toStartingNode);

        /* flags == 0; presumably plain "activate" semantics — see
         * CopyActiveReq definition for the flag meanings. */
        CopyActiveReq * const req = (CopyActiveReq *)&signal->theData[0];
        req->userPtr = takeOverPtr.i;
        req->userRef = reference();
        req->tableId = takeOverPtr.p->toCurrentTabref;
        req->fragId = takeOverPtr.p->toCurrentFragid;
        req->distributionKey = fragPtr.p->distributionKey;
        req->flags = 0;
        sendSignal(lqhRef, GSN_COPY_ACTIVEREQ, signal,
                   CopyActiveReq::SignalLength, JBB);
        /* Wait for COPY_ACTIVECONF before processing the next fragment. */
        return;
      }
      else
      {
        jam();
        loopReplicaPtr.i = loopReplicaPtr.p->nextPool;
      }
    }
    takeOverPtr.p->toCurrentFragid++;
  }
  /* Batch limit reached: resume the scan from a CONTINUEB. */
  send_continueb_nr_start_logging(signal, takeOverPtr);
}
7543 
7544 /**
7545  * Instance takeover uses a number of queues and variables to keep track of
7546  * the takeover threads.
7547  *
7548  * We start by sending START_TOREQ to the master. This is done by the
7549  * main takeover record. This is always placed in the variable
7550  * c_mainTakeOverPtr.
7551  *
7552  * After this we create a number of parallel threads. A record is created
7553  * and put into the queue:
7554  * c_activeTakeOverList
7555  * It stays there while we're scanning for fragments to take over in our
7556  * takeover thread.
7557  *
7558  * When we find an instance to take over we have two possibilities.
7559  * We can either be put into the active thread which is the variable:
7560  * c_activeThreadTakeOverPtr
7561  * If the active thread is already busy, then we are placed into the
7562  * queue:
7563  * c_queued_for_start_takeover_list
7564  * When we're taken out of the queue we are placed into the active thread.
7565  *
7566  * We are taken out of the active thread when we're sending COPY_FRAGREQ.
7567  * At this point our takeover thread is placed in the list
7568  * c_active_copy_threads_list
7569  * It stays in this list until we're done with the copying when we have
7570  * received COPY_ACTIVECONF back from the LDM instance in the starting node.
7571  *
7572  * At this point we need to update the fragment state again and we need to
7573  * become active thread again which is controlled by:
7574  * c_activeThreadTakeOverPtr
7575  * If the active thread is already busy then we use the queue
7576  * c_queued_for_commit_takeover_list
7577  * This queue has higher priority than the
7578  * c_queued_for_start_takeover_list
7579  *
7580  * After completing the update of the fragment state we are removed as active
7581  * thread and placed back in the list
7582  * c_activeTakeOverList
7583  *
7584  * We proceed with the next fragment until we're out of fragments to handle
7585  * for this thread.
7586  *
7587  * At this point we are removed from
7588  * c_activeTakeOverList
7589  * and placed into
7590  * c_completed_copy_threads_list
7591  *
7592  * If this was a system restart we will then remove all threads from the
7593  * c_completed_copy_threads_list
7594  * and only the
7595  * c_mainTakeOverPtr
7596  * record still remains.
7597  *
7598  * For normal node recovery we start a process of activating the node. We
7599  * start this process by removing the takeover thread from
7600  * c_completed_copy_threads_list
7601  * and placing the takeover thread into the list
7602  * c_active_copy_threads_list
7603  * instead.
7604  *
7605  * At every point when we need to update the fragment state we remove the
7606  * takeover record from the
7607  * c_active_copy_threads_list
7608  * and place it as the active thread record. If the active thread is
7609  * already busy then we place the record in the list
7610  * c_queued_for_commit_takeover_list
7611  *
7612  * After completing the update of the fragment state we place the record
7613  * back into the list
7614  * c_active_copy_threads_list
7615  *
7616  * When we are finally done with activating the node instance in this final
7617  * process, then we're removing the record from the
7618  * c_active_copy_threads_list
7619  * and releasing the takeover thread record to the take over pool.
7620  *
7621  * When all node instances are completed then all lists should be empty and
7622  * no thread should be active and only the main record should remain.
7623  */
7624 
7625 
7626 void
sendStartTo(Signal * signal,TakeOverRecordPtr takeOverPtr)7627 Dbdih::sendStartTo(Signal* signal, TakeOverRecordPtr takeOverPtr)
7628 {
7629   takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_START_TO;
7630 
7631   StartToReq* req = (StartToReq*)signal->getDataPtrSend();
7632   req->senderData = takeOverPtr.i;
7633   req->senderRef = reference();
7634   req->startingNodeId = takeOverPtr.p->toStartingNode;
7635   sendSignal(cmasterdihref, GSN_START_TOREQ,
7636              signal, StartToReq::SignalLength, JBB);
7637 }
7638 
7639 void
execSTART_TOREF(Signal * signal)7640 Dbdih::execSTART_TOREF(Signal* signal)
7641 {
7642   jamEntry();
7643 
7644   StartToRef* ref = (StartToRef*)signal->getDataPtr();
7645   Uint32 errCode = ref->errorCode;
7646   (void)errCode; // TODO check for "valid" error
7647 
7648   TakeOverRecordPtr takeOverPtr;
7649   c_takeOverPool.getPtr(takeOverPtr, ref->senderData);
7650 
7651   signal->theData[0] = DihContinueB::ZSEND_START_TO;
7652   signal->theData[1] = takeOverPtr.i;
7653 
7654   sendSignalWithDelay(reference(), GSN_CONTINUEB,
7655                       signal, 5000, 2);
7656 }
7657 
7658 /**
7659  * We have completed one thread's communication with the master and we're
7660  * ready to start off another which have been queued.
7661  */
/**
 * Dequeue the next take over thread waiting for master communication and
 * make it the active thread (c_activeThreadTakeOverPtr). The commit queue
 * has priority over the start queue. The dequeued thread is resumed from
 * the queued state it was parked in.
 */
void
Dbdih::start_next_takeover_thread(Signal *signal)
{
  TakeOverRecordPtr takeOverPtr;
  bool dequeued_from_commit_take_over = true;
  bool dequeued_from_start_take_over = false;

  if (!c_queued_for_commit_takeover_list.removeFirst(takeOverPtr))
  {
    dequeued_from_commit_take_over = false;
    if (!c_queued_for_start_takeover_list.removeFirst(takeOverPtr))
    {
      jam();
      /**
       * No threads are queued up for master communication, so we can
       * set active to RNIL and wait for the next thread to be completed
       * with another step.
       */
      g_eventLogger->debug("No threads queued up");
      c_activeThreadTakeOverPtr.i = RNIL;
      return;
    }
    dequeued_from_start_take_over = true;
    jam();
  }
  c_activeThreadTakeOverPtr = takeOverPtr;
  g_eventLogger->debug("New active takeover thread: %u, state: %u",
                      takeOverPtr.i,
                      takeOverPtr.p->toSlaveStatus);
  if (takeOverPtr.p->toSlaveStatus ==
        TakeOverRecord::TO_QUEUED_UPDATE_BEFORE_STORED)
  {
    jam();
    /* Only the start queue may contain threads in this state. */
    ndbrequire(dequeued_from_start_take_over);
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_BEFORE_STORED;
    sendUpdateTo(signal, takeOverPtr);
  }
  else if (takeOverPtr.p->toSlaveStatus ==
             TakeOverRecord::TO_QUEUED_UPDATE_BEFORE_COMMIT)
  {
    jam();
    /* Only the commit queue may contain threads in this state. */
    ndbrequire(dequeued_from_commit_take_over);
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_BEFORE_COMMIT;
    sendUpdateTo(signal, takeOverPtr);
  }
  else if (takeOverPtr.p->toSlaveStatus ==
             TakeOverRecord::TO_QUEUED_SL_UPDATE_FRAG_STATE)
  {
    jam();
    /* Only the commit queue may contain threads in this state. */
    ndbrequire(dequeued_from_commit_take_over);
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_SL_UPDATE_FRAG_STATE;
    sendUpdateFragStateReq(signal,
                           takeOverPtr.p->startGci,
                           UpdateFragStateReq::START_LOGGING,
                           takeOverPtr);
    return;
  }
  else
  {
    /* A queued thread must be in one of the three queued states above. */
    ndbrequire(false);
  }
  return;
}
7725 
7726 void
init_takeover_thread(TakeOverRecordPtr takeOverPtr,TakeOverRecordPtr mainTakeOverPtr,Uint32 number_of_copy_threads,Uint32 thread_id)7727 Dbdih::init_takeover_thread(TakeOverRecordPtr takeOverPtr,
7728                             TakeOverRecordPtr mainTakeOverPtr,
7729                             Uint32 number_of_copy_threads,
7730                             Uint32 thread_id)
7731 {
7732   c_activeTakeOverList.addFirst(takeOverPtr);
7733   takeOverPtr.p->m_copy_thread_id = thread_id;
7734   takeOverPtr.p->m_number_of_copy_threads = number_of_copy_threads;
7735 
7736   takeOverPtr.p->m_flags = mainTakeOverPtr.p->m_flags;
7737   takeOverPtr.p->m_senderData = mainTakeOverPtr.p->m_senderData;
7738   takeOverPtr.p->m_senderRef = mainTakeOverPtr.p->m_senderRef;
7739 
7740   takeOverPtr.p->startGci = mainTakeOverPtr.p->startGci;
7741   takeOverPtr.p->restorableGci = mainTakeOverPtr.p->restorableGci;
7742   /* maxPage is received in PREPARE_COPY_FRAGCONF */
7743 
7744   takeOverPtr.p->toCopyNode = mainTakeOverPtr.p->toCopyNode;
7745   takeOverPtr.p->toFailedNode = mainTakeOverPtr.p->toFailedNode;
7746   takeOverPtr.p->toStartingNode = mainTakeOverPtr.p->toStartingNode;
7747 
7748   takeOverPtr.p->toStartTime = mainTakeOverPtr.p->toStartTime;
7749   takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_SELECTING_NEXT;
7750   takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MASTER_IDLE;
7751 
7752   takeOverPtr.p->toCurrentTabref = 0;
7753   takeOverPtr.p->toCurrentFragid = 0;
7754   takeOverPtr.p->toCurrentReplica = RNIL;
7755 }
7756 
7757 void
send_continueb_start_next_copy(Signal * signal,TakeOverRecordPtr takeOverPtr)7758 Dbdih::send_continueb_start_next_copy(Signal *signal,
7759                                       TakeOverRecordPtr takeOverPtr)
7760 {
7761   signal->theData[0] = DihContinueB::ZTO_START_COPY_FRAG;
7762   signal->theData[1] = takeOverPtr.i;
7763   sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
7764 }
7765 
7766 void
execSTART_TOCONF(Signal * signal)7767 Dbdih::execSTART_TOCONF(Signal* signal)
7768 {
7769   jamEntry();
7770   StartToConf * conf = (StartToConf*)signal->getDataPtr();
7771 
7772   TakeOverRecordPtr takeOverPtr;
7773   c_takeOverPool.getPtr(takeOverPtr, conf->senderData);
7774 
7775   CRASH_INSERTION(7133);
7776 
7777   /**
7778    * We are now allowed to start copying
7779    *
7780    * It is time to start the parallelisation phase where we have a number
7781    * of take over threads where each take over thread takes care of
7782    * a set of LDM instances. This means that each take over thread can
7783    * execute in parallel towards DBLQH, but we have to serialise access
7784    * towards the master which is designed to handle one take over thread
7785    * request per node at a time. So we handle multiple take overs internally
7786    * and towards the LDM instances, but towards the master we appear as there
7787    * is only one take over thread.
7788    *
7789    * This means that we need no master specific take over code to parallelize
7790    * copying over several LDM instances. The take over can be made parallel as
7791    * soon as a version with this code is started as long as the master can
7792    * handle parallel node recovery in general.
7793    */
7794 
7795   c_mainTakeOverPtr = takeOverPtr;
7796   c_mainTakeOverPtr.p->m_number_of_copy_threads =
7797     c_max_takeover_copy_threads;
7798   c_mainTakeOverPtr.p->m_copy_threads_completed = 0;
7799   c_activeThreadTakeOverPtr.i = RNIL;
7800   check_take_over_completed_correctly();
7801 
7802   for (Uint32 i = 0; i < c_max_takeover_copy_threads; i++)
7803   {
7804     /**
7805      * We will break the rule of not starting more than 4 signals from one
7806      * signal here. The reason is that we know that eventually we will start
7807      * the same number of parallel threads anyways and also there won't be
7808      * anymore parallelisation after that internally in this thread. There
7809      * could potentially be further parallelisation in DBLQH, but this is
7810      * in a number of parallel threads and thus not DIH's concern to handle.
7811      */
7812     jam();
7813     ndbrequire(c_takeOverPool.seize(takeOverPtr));
7814     init_takeover_thread(takeOverPtr,
7815                          c_mainTakeOverPtr,
7816                          c_max_takeover_copy_threads,
7817                          i);
7818     send_continueb_start_next_copy(signal, takeOverPtr);
7819   }
7820 }
7821 
7822 bool
check_takeover_thread(TakeOverRecordPtr takeOverPtr,FragmentstorePtr fragPtr,Uint32 fragmentReplicaInstanceKey)7823 Dbdih::check_takeover_thread(TakeOverRecordPtr takeOverPtr,
7824                              FragmentstorePtr fragPtr,
7825                              Uint32 fragmentReplicaInstanceKey)
7826 {
7827   ndbassert(fragmentReplicaInstanceKey != 0);
7828   fragmentReplicaInstanceKey--;
7829   /**
7830    * The instance key is in reality the log part id. The log part id
7831    * is often in ndbmtd the same as the instance id. But in ndbd and
7832    * in ndbmtd with 2 LDM instances there is a difference. The
7833    * instance id is mapped in the receiving node modulo the number
7834    * of LDM instances. So we take the instance key modulo the number
7835    * of LDM instances to get the thread id to handle this takeover
7836    * thread.
7837    *
7838    * For safety we will never run more parallelism than we have in the
7839    * minimum node of the starting node and the copying node.
7840    */
7841   Uint32 nodes[MAX_REPLICAS];
7842   extractNodeInfo(jamBuffer(), fragPtr.p, nodes);
7843   Uint32 lqhWorkers = getNodeInfo(takeOverPtr.p->toStartingNode).m_lqh_workers;
7844   lqhWorkers = MIN(lqhWorkers,
7845                    getNodeInfo(nodes[0]).m_lqh_workers);
7846   lqhWorkers = MAX(lqhWorkers, 1);
7847   Uint32 instanceId = fragmentReplicaInstanceKey % lqhWorkers;
7848 
7849   if (getNodeInfo(refToNode(cmasterdihref)).m_version <
7850       NDBD_SUPPORT_PARALLEL_SYNCH)
7851   {
7852     jam();
7853     /**
7854      * The master node has no support to receive multiple requests to copy a
7855      * fragment on the same node group. We fix this by ensuring that we only
7856      * use one thread in the parallel copy scheme.
7857      */
7858     instanceId = 0;
7859   }
7860   if ((instanceId % takeOverPtr.p->m_number_of_copy_threads) ==
7861       takeOverPtr.p->m_copy_thread_id)
7862   {
7863     jam();
7864     return true;
7865   }
7866   else
7867   {
7868     jam();
7869     return false;
7870   }
7871 }
7872 
/**
 * Scan tables/fragments for the next replica this take over thread shall
 * copy, and start the copy via toCopyFragLab(). The scan is batched (100
 * iterations per signal) and resumed via CONTINUEB. When all tables have
 * been scanned the thread reports completion via toCopyCompletedLab().
 */
void Dbdih::startNextCopyFragment(Signal* signal, Uint32 takeOverPtrI)
{
  TabRecordPtr tabPtr;
  TakeOverRecordPtr takeOverPtr;
  c_takeOverPool.getPtr(takeOverPtr, takeOverPtrI);

  Uint32 loopCount;
  loopCount = 0;
  if (ERROR_INSERTED(7159)) {
    /* Test hook: force an immediate CONTINUEB round-trip. */
    loopCount = 100;
  }//if
  while (loopCount++ < 100) {
    tabPtr.i = takeOverPtr.p->toCurrentTabref;
    if (tabPtr.i >= ctabFileSize) {
      jam();
      /* All tables scanned: this thread's copy phase is done. */
      CRASH_INSERTION(7136);
      toCopyCompletedLab(signal, takeOverPtr);
      return;
    }//if
    ptrAss(tabPtr, tabRecord);
    if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE){
      jam();
      takeOverPtr.p->toCurrentFragid = 0;
      takeOverPtr.p->toCurrentTabref++;
      continue;
    }//if
    Uint32 fragId = takeOverPtr.p->toCurrentFragid;
    if (fragId >= tabPtr.p->totalfragments) {
      jam();
      /* Table exhausted: move to the first fragment of the next table. */
      takeOverPtr.p->toCurrentFragid = 0;
      takeOverPtr.p->toCurrentTabref++;
      if (ERROR_INSERTED(7135)) {
        if (takeOverPtr.p->toCurrentTabref == 1) {
          ndbrequire(false);
        }//if
      }//if
      continue;
    }//if
    FragmentstorePtr fragPtr;
    getFragstore(tabPtr.p, fragId, fragPtr);

    Uint32 instanceKey = dihGetInstanceKey(fragPtr);
    if (!check_takeover_thread(takeOverPtr,
                               fragPtr,
                               instanceKey))
    {
      /**
       * We are scanning for fragment replicas to take over, but this replica
       * was not ours to take over, it will be handled by another take over
       * thread.
       */
      jam();
      takeOverPtr.p->toCurrentFragid++;
      continue;
    }
    jam();

    /* Search the old stored replicas for one that needs to be taken over. */
    ReplicaRecordPtr loopReplicaPtr;
    loopReplicaPtr.i = fragPtr.p->oldStoredReplicas;
    while (loopReplicaPtr.i != RNIL) {
      c_replicaRecordPool.getPtr(loopReplicaPtr);
      if (loopReplicaPtr.p->procNode == takeOverPtr.p->toFailedNode) {
        jam();
	/* ----------------------------------------------------------------- */
	/* WE HAVE FOUND A REPLICA THAT BELONGED THE FAILED NODE THAT NEEDS  */
	/* TAKE OVER. WE TAKE OVER THIS REPLICA TO THE NEW NODE.             */
	/* ----------------------------------------------------------------- */
        takeOverPtr.p->toCurrentReplica = loopReplicaPtr.i;
        toCopyFragLab(signal, takeOverPtr.i);
        return;
      } else if (loopReplicaPtr.p->procNode == takeOverPtr.p->toStartingNode) {
        jam();
	/* ----------------------------------------------------------------- */
	/* WE HAVE OBVIOUSLY STARTED TAKING OVER THIS WITHOUT COMPLETING IT. */
	/* WE NEED TO COMPLETE THE TAKE OVER OF THIS REPLICA.                */
	/* ----------------------------------------------------------------- */
        takeOverPtr.p->toCurrentReplica = loopReplicaPtr.i;
        toCopyFragLab(signal, takeOverPtr.i);
        return;
      } else {
        jam();
        loopReplicaPtr.i = loopReplicaPtr.p->nextPool;
      }//if
    }//while
    takeOverPtr.p->toCurrentFragid++;
  }//while
  /* Batch limit reached: resume the scan from a CONTINUEB. */
  send_continueb_start_next_copy(signal, takeOverPtr);
}//Dbdih::startNextCopyFragment()
7961 
toCopyFragLab(Signal * signal,Uint32 takeOverPtrI)7962 void Dbdih::toCopyFragLab(Signal* signal, Uint32 takeOverPtrI)
7963 {
7964   TakeOverRecordPtr takeOverPtr;
7965   c_takeOverPool.getPtr(takeOverPtr, takeOverPtrI);
7966 
7967   /**
7968    * Inform starting node that TakeOver is about to start
7969    */
7970   g_eventLogger->debug("PREPARE_COPY_FRAGREQ: tab: %u, frag: %u, thread: %u",
7971     takeOverPtr.p->toCurrentTabref,
7972     takeOverPtr.p->toCurrentFragid,
7973     takeOverPtr.i);
7974   TabRecordPtr tabPtr;
7975   tabPtr.i = takeOverPtr.p->toCurrentTabref;
7976   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
7977 
7978   FragmentstorePtr fragPtr;
7979   getFragstore(tabPtr.p, takeOverPtr.p->toCurrentFragid, fragPtr);
7980   Uint32 nodes[MAX_REPLICAS];
7981   extractNodeInfo(jamBuffer(), fragPtr.p, nodes);
7982   takeOverPtr.p->toCopyNode = nodes[0];
7983 
7984   PrepareCopyFragReq* req= (PrepareCopyFragReq*)signal->getDataPtrSend();
7985   req->senderRef = reference();
7986   req->senderData = takeOverPtrI;
7987   req->tableId = takeOverPtr.p->toCurrentTabref;
7988   req->fragId = takeOverPtr.p->toCurrentFragid;
7989   req->copyNodeId = takeOverPtr.p->toCopyNode;
7990   req->startingNodeId = takeOverPtr.p->toStartingNode; // Dst
7991 
7992   Uint32 instanceKey = dihGetInstanceKey(req->tableId, req->fragId);
7993   Uint32 ref = numberToRef(DBLQH, instanceKey, takeOverPtr.p->toStartingNode);
7994 
7995   sendSignal(ref, GSN_PREPARE_COPY_FRAG_REQ, signal,
7996              PrepareCopyFragReq::SignalLength, JBB);
7997 
7998   takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_PREPARE_COPY;
7999 }
8000 
/**
 * The starting node failed to prepare the copy of the fragment.
 * Rewrite the REF as a COPY_FRAGREF and reuse the common error
 * handling in execCOPY_FRAGREF().
 */
void
Dbdih::execPREPARE_COPY_FRAG_REF(Signal* signal)
{
  jamEntry();
  /* Copy the REF out of the signal buffer by value: the same buffer is
   * overwritten below when building the CopyFragRef. */
  PrepareCopyFragRef ref = *(PrepareCopyFragRef*)signal->getDataPtr();

  TakeOverRecordPtr takeOverPtr;
  c_takeOverPool.getPtr(takeOverPtr, ref.senderData);

  ndbrequire(takeOverPtr.p->toSlaveStatus == TakeOverRecord::TO_PREPARE_COPY);

  /**
   * Treat this as copy frag ref
   */
  CopyFragRef * cfref = (CopyFragRef*)signal->getDataPtrSend();
  cfref->userPtr = ref.senderData;
  cfref->startingNodeId = ref.startingNodeId;
  cfref->errorCode = ref.errorCode;
  cfref->tableId = ref.tableId;
  cfref->fragId = ref.fragId;
  cfref->sendingNodeId = ref.copyNodeId;
  /* execCOPY_FRAGREF expects the record in TO_COPY_FRAG state. */
  takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_COPY_FRAG;
  execCOPY_FRAGREF(signal);
}
8025 
/**
 * The starting node has prepared the copy of the fragment. Save the
 * maxPage reported by LQH and proceed to update the fragment state
 * through the master, either immediately (becoming the active thread)
 * or after queueing if another thread currently talks to the master.
 */
void
Dbdih::execPREPARE_COPY_FRAG_CONF(Signal* signal)
{
  jamEntry();
  PrepareCopyFragConf conf = *(PrepareCopyFragConf*)signal->getDataPtr();

  TakeOverRecordPtr takeOverPtr;
  c_takeOverPool.getPtr(takeOverPtr, conf.senderData);

  /* All supported versions report maxPageNo in the CONF. */
  Uint32 version = getNodeInfo(refToNode(conf.senderRef)).m_version;
  ndbrequire(ndb_check_prep_copy_frag_version(version) >= 2);
  takeOverPtr.p->maxPage = conf.maxPageNo;

  c_activeTakeOverList.remove(takeOverPtr);

  if (c_activeThreadTakeOverPtr.i != RNIL)
  {
    /**
     * There is already an active take over thread that is performing an
     * update of its fragment replica state through the master. We will
     * put ourselves in the c_queued_for_start_takeover_list and be
     * started as soon as possible.
     */
    jam();
    g_eventLogger->debug("QUEUED_UPDATE_BEFORE_STORED, inst: %u",
                         takeOverPtr.i);
    takeOverPtr.p->toSlaveStatus =
      TakeOverRecord::TO_QUEUED_UPDATE_BEFORE_STORED;
    c_queued_for_start_takeover_list.addLast(takeOverPtr);
    return;
  }
  /* Mark master busy before proceeding */
  c_activeThreadTakeOverPtr = takeOverPtr;

  /**
   * We need to lock fragment info...in order to later run
   * UPDATE_FRAG_STATEREQ. We will mark ourselves as the active thread
   * such that other threads will be queued up until we are ready with
   * updating the fragment state.
   */
  takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_BEFORE_STORED;
  g_eventLogger->debug("PREPARE_COPY_FRAG_CONF: thread: %u", takeOverPtr.i);
  sendUpdateTo(signal, takeOverPtr);
}
8070 
8071 void
sendUpdateTo(Signal * signal,TakeOverRecordPtr takeOverPtr)8072 Dbdih::sendUpdateTo(Signal* signal, TakeOverRecordPtr takeOverPtr)
8073 {
8074   /**
8075    * We must refer to the main takeover thread towards the master node,
8076    * but we take the data from the thread which is currently active.
8077    */
8078   g_eventLogger->debug("UPDATE_TOREQ: tab:%u, frag:%u, thread:%u, state:%u",
8079     takeOverPtr.p->toCurrentTabref,
8080     takeOverPtr.p->toCurrentFragid,
8081     takeOverPtr.i,
8082     takeOverPtr.p->toSlaveStatus);
8083   UpdateToReq* req = (UpdateToReq*)signal->getDataPtrSend();
8084   req->senderData = c_mainTakeOverPtr.i;
8085   req->senderRef = reference();
8086   req->startingNodeId = takeOverPtr.p->toStartingNode;
8087   req->copyNodeId = takeOverPtr.p->toCopyNode;
8088   req->tableId = takeOverPtr.p->toCurrentTabref;
8089   req->fragmentNo = takeOverPtr.p->toCurrentFragid;
8090   switch(takeOverPtr.p->toSlaveStatus){
8091   case TakeOverRecord::TO_UPDATE_BEFORE_STORED:
8092     jam();
8093     req->requestType = UpdateToReq::BEFORE_STORED;
8094     break;
8095   case TakeOverRecord::TO_UPDATE_AFTER_STORED:
8096     req->requestType = UpdateToReq::AFTER_STORED;
8097     break;
8098   case TakeOverRecord::TO_UPDATE_BEFORE_COMMIT:
8099     jam();
8100     req->requestType = UpdateToReq::BEFORE_COMMIT_STORED;
8101     break;
8102   case TakeOverRecord::TO_UPDATE_AFTER_COMMIT:
8103     jam();
8104     req->requestType = UpdateToReq::AFTER_COMMIT_STORED;
8105     break;
8106   default:
8107     jamLine(takeOverPtr.p->toSlaveStatus);
8108     ndbrequire(false);
8109   }
8110   sendSignal(cmasterdihref, GSN_UPDATE_TOREQ,
8111              signal, UpdateToReq::SignalLength, JBB);
8112 }
8113 
8114 void
execUPDATE_TOREF(Signal * signal)8115 Dbdih::execUPDATE_TOREF(Signal* signal)
8116 {
8117   jamEntry();
8118   UpdateToRef* ref = (UpdateToRef*)signal->getDataPtr();
8119   Uint32 errCode = ref->errorCode;
8120   (void)errCode; // TODO check for "valid" error
8121 
8122   TakeOverRecordPtr takeOverPtr;
8123 
8124   ndbrequire(ref->senderData == c_mainTakeOverPtr.i);
8125   ndbrequire(c_activeThreadTakeOverPtr.i != RNIL);
8126 
8127   c_takeOverPool.getPtr(takeOverPtr, c_activeThreadTakeOverPtr.i);
8128 
8129   g_eventLogger->info("UPDATE_TOREF: thread: %u, state:%u",
8130                       takeOverPtr.i,
8131                       takeOverPtr.p->toSlaveStatus);
8132   signal->theData[0] = DihContinueB::ZSEND_UPDATE_TO;
8133   signal->theData[1] = takeOverPtr.i;
8134 
8135   sendSignalWithDelay(reference(), GSN_CONTINUEB,
8136                       signal, 5000, 2);
8137 }
8138 
8139 void
execUPDATE_TOCONF(Signal * signal)8140 Dbdih::execUPDATE_TOCONF(Signal* signal)
8141 {
8142   jamEntry();
8143 
8144   UpdateToConf* conf = (UpdateToConf*)signal->getDataPtr();
8145 
8146   TakeOverRecordPtr takeOverPtr;
8147 
8148   /**
8149    * We operate towards the master using the main takeover thread.
8150    * The CONF is however intended for the current active takeover
8151    * thread.
8152    */
8153   ndbrequire(conf->senderData == c_mainTakeOverPtr.i);
8154   ndbrequire(c_activeThreadTakeOverPtr.i != RNIL);
8155 
8156   c_takeOverPool.getPtr(takeOverPtr, c_activeThreadTakeOverPtr.i);
8157 
8158   g_eventLogger->debug("UPDATE_TOCONF: thread: %u, state:%u",
8159                        takeOverPtr.i,
8160                        takeOverPtr.p->toSlaveStatus);
8161   switch(takeOverPtr.p->toSlaveStatus){
8162   case TakeOverRecord::TO_UPDATE_BEFORE_STORED:
8163     jam();
8164 
8165     CRASH_INSERTION(7154);
8166 
8167     takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_FRAG_STATE_STORED;
8168     sendUpdateFragStateReq(signal,
8169                            ZINIT_CREATE_GCI,
8170                            UpdateFragStateReq::STORED,
8171                            takeOverPtr);
8172     return;
8173   case TakeOverRecord::TO_UPDATE_AFTER_STORED:
8174     jam();
8175 
8176     CRASH_INSERTION(7195);
8177 
8178     takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_COPY_FRAG;
8179     toStartCopyFrag(signal, takeOverPtr);
8180     return;
8181   case TakeOverRecord::TO_UPDATE_BEFORE_COMMIT:
8182     jam();
8183 
8184     CRASH_INSERTION(7196);
8185 
8186     takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_FRAG_STATE_COMMIT;
8187     sendUpdateFragStateReq(signal,
8188                            takeOverPtr.p->startGci,
8189                            UpdateFragStateReq::COMMIT_STORED,
8190                            takeOverPtr);
8191     return;
8192   case TakeOverRecord::TO_UPDATE_AFTER_COMMIT:
8193     jam();
8194 
8195     CRASH_INSERTION(7197);
8196 
8197     start_next_takeover_thread(signal);
8198     c_activeTakeOverList.addFirst(takeOverPtr);
8199     takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_SELECTING_NEXT;
8200     startNextCopyFragment(signal, takeOverPtr.i);
8201     return;
8202   default:
8203     ndbrequire(false);
8204   }
8205 }
8206 
/**
 * Start copying the takeover thread's current fragment.
 *
 * Builds and sends COPY_FRAGREQ to the DBLQH instance on the copy node
 * (takeOverPtr.p->toCopyNode), asking it to copy the fragment to the
 * starting node.  Afterwards the master is released to the next queued
 * takeover thread and this record is parked on the active-copy list
 * until COPY_FRAGCONF/REF arrives.
 */
void
Dbdih::toStartCopyFrag(Signal* signal, TakeOverRecordPtr takeOverPtr)
{
  TabRecordPtr tabPtr;
  tabPtr.i = takeOverPtr.p->toCurrentTabref;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

  Uint32 fragId = takeOverPtr.p->toCurrentFragid;

  FragmentstorePtr fragPtr;
  getFragstore(tabPtr.p, fragId, fragPtr);

  /* Locate our own node's replica of this fragment (the 'true' flag
   * presumably selects the stored/old replica list - TODO confirm). */
  ReplicaRecordPtr replicaPtr;
  findReplica(replicaPtr, fragPtr.p, getOwnNodeId(), true);

  /* Pass the replica's restorable GCI to LQH; the field must be
   * cleared here since it shares storage with other data. */
  Uint32 gci = replicaPtr.p->m_restorable_gci;
  replicaPtr.p->m_restorable_gci = 0; // used in union...

  Uint32 instanceKey = dihGetInstanceKey(tabPtr.i, fragId);
  BlockReference ref = numberToRef(DBLQH, instanceKey,
                                   takeOverPtr.p->toCopyNode);
  CopyFragReq * const copyFragReq = (CopyFragReq *)&signal->theData[0];
  copyFragReq->userPtr = takeOverPtr.i;
  copyFragReq->userRef = reference();
  copyFragReq->tableId = tabPtr.i;
  copyFragReq->fragId = fragId;
  copyFragReq->nodeId = takeOverPtr.p->toStartingNode;
  copyFragReq->schemaVersion = tabPtr.p->schemaVersion;
  copyFragReq->distributionKey = fragPtr.p->distributionKey;
  copyFragReq->gci = gci;
  /* Append the replica node list, then two extra words: maxPage and
   * the transactional-copy flag. */
  Uint32 len = copyFragReq->nodeCount =
    extractNodeInfo(jamBuffer(), fragPtr.p,
                    copyFragReq->nodeList);
  copyFragReq->nodeList[len] = takeOverPtr.p->maxPage;
  copyFragReq->nodeList[len+1] = CopyFragReq::CFR_TRANSACTIONAL;
  sendSignal(ref, GSN_COPY_FRAGREQ, signal,
             CopyFragReq::SignalLength + len, JBB);
  g_eventLogger->debug("COPY_FRAGREQ: thread: %u, tab: %u, frag: %u",
    takeOverPtr.i,
    takeOverPtr.p->toCurrentTabref,
    takeOverPtr.p->toCurrentFragid);
  /* The copy now runs in LQH; let another thread use the master. */
  start_next_takeover_thread(signal);
  c_active_copy_threads_list.addFirst(takeOverPtr);
}//Dbdih::toStartCopy()
8251 
/**
 * Broadcast UPDATE_FRAG_STATEREQ for the takeover thread's current
 * fragment to the DIH block of every alive node.
 *
 * @param startGci    GCI from which the new replica state applies.
 * @param replicaType UpdateFragStateReq::{STORED,COMMIT_STORED,
 *                    START_LOGGING} - which phase of the state change.
 *
 * execUPDATE_FRAG_STATECONF collects one CONF per node
 * (receiveLoopMacro) before the takeover advances.
 */
void Dbdih::sendUpdateFragStateReq(Signal* signal,
                                   Uint32 startGci,
                                   Uint32 replicaType,
                                   TakeOverRecordPtr takeOverPtr)
{
  /* nullRoutine: the macro presumably only records which nodes we
   * expect CONFs from; the actual send is done in the loop below. */
  sendLoopMacro(UPDATE_FRAG_STATEREQ, nullRoutine, RNIL);

  g_eventLogger->debug("Update frag state for inst:%u,tab:%u,frag:%u",
                       takeOverPtr.i,
                       takeOverPtr.p->toCurrentTabref,
                       takeOverPtr.p->toCurrentFragid);
  UpdateFragStateReq * const req = (UpdateFragStateReq *)&signal->theData[0];
  req->senderData = takeOverPtr.i;
  req->senderRef = reference();
  req->tableId = takeOverPtr.p->toCurrentTabref;
  req->fragId = takeOverPtr.p->toCurrentFragid;
  req->startingNodeId = takeOverPtr.p->toStartingNode;
  req->copyNodeId = takeOverPtr.p->toCopyNode;
  req->failedNodeId = takeOverPtr.p->toFailedNode;
  req->startGci = startGci;
  req->replicaType = replicaType;

  /* Walk the alive-node linked list and send to each node's DIH. */
  NodeRecordPtr nodePtr;
  nodePtr.i = cfirstAliveNode;
  do {
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    BlockReference ref = calcDihBlockRef(nodePtr.i);
    sendSignal(ref, GSN_UPDATE_FRAG_STATEREQ, signal,
	       UpdateFragStateReq::SignalLength, JBB);
    nodePtr.i = nodePtr.p->nextNode;
  } while (nodePtr.i != RNIL);
}//Dbdih::sendUpdateFragStateReq()
8284 
/**
 * One node has confirmed UPDATE_FRAG_STATEREQ.  receiveLoopMacro
 * tracks the replies; execution only continues past it once every
 * expected node has answered.  Then advance the thread's state:
 * the STORED/COMMIT phases go back to the master via sendUpdateTo,
 * while the start-logging phase steps to the next fragment directly.
 */
void Dbdih::execUPDATE_FRAG_STATECONF(Signal* signal)
{
  jamEntry();
  CRASH_INSERTION(7148);
  UpdateFragStateConf * conf = (UpdateFragStateConf *)&signal->theData[0];

  TakeOverRecordPtr takeOverPtr;

  c_takeOverPool.getPtr(takeOverPtr, conf->senderData);

  g_eventLogger->debug("Updated frag state for inst:%u,tab:%u,frag:%u,state:%u",
                       takeOverPtr.i,
                       takeOverPtr.p->toCurrentTabref,
                       takeOverPtr.p->toCurrentFragid,
                       takeOverPtr.p->toSlaveStatus);
  receiveLoopMacro(UPDATE_FRAG_STATEREQ, conf->sendingNodeId);

  switch(takeOverPtr.p->toSlaveStatus){
  case TakeOverRecord::TO_UPDATE_FRAG_STATE_STORED:
    jam();
    CRASH_INSERTION(7198);
    /* All nodes know the STORED state; report back to the master. */
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_AFTER_STORED;
    break;
  case TakeOverRecord::TO_UPDATE_FRAG_STATE_COMMIT:
    jam();
    CRASH_INSERTION(7199);
    /* All nodes committed the state; report back to the master. */
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_AFTER_COMMIT;
    break;
  case TakeOverRecord::TO_SL_UPDATE_FRAG_STATE:
    jam();
    //CRASH_INSERTION(
    /* Start-logging phase: release the master to the next queued
     * thread and continue with this thread's next fragment. */
    start_next_takeover_thread(signal);
    c_active_copy_threads_list.addFirst(takeOverPtr);
    g_eventLogger->debug("UPDATE_FRAG_STATE completed: thread: %u",
      takeOverPtr.i);
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_START_LOGGING;
    takeOverPtr.p->toCurrentFragid++;
    signal->theData[0] = DihContinueB::ZTO_START_LOGGING;
    signal->theData[1] = takeOverPtr.i;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    return;
  default:
    jamLine(takeOverPtr.p->toSlaveStatus);
    ndbrequire(false);
  }
  sendUpdateTo(signal, takeOverPtr);
}//Dbdih::execUPDATE_FRAG_STATECONF()
8332 
execCOPY_FRAGREF(Signal * signal)8333 void Dbdih::execCOPY_FRAGREF(Signal* signal)
8334 {
8335   const CopyFragRef * const ref = (CopyFragRef *)&signal->theData[0];
8336   jamEntry();
8337   Uint32 takeOverPtrI = ref->userPtr;
8338   Uint32 startingNodeId = ref->startingNodeId;
8339   Uint32 errorCode = ref->errorCode;
8340 
8341   TakeOverRecordPtr takeOverPtr;
8342   c_takeOverPool.getPtr(takeOverPtr, takeOverPtrI);
8343   ndbrequire(ref->tableId == takeOverPtr.p->toCurrentTabref);
8344   ndbrequire(ref->fragId == takeOverPtr.p->toCurrentFragid);
8345   ndbrequire(ref->startingNodeId == takeOverPtr.p->toStartingNode);
8346   ndbrequire(ref->sendingNodeId == takeOverPtr.p->toCopyNode);
8347   ndbrequire(takeOverPtr.p->toSlaveStatus == TakeOverRecord::TO_COPY_FRAG);
8348 
8349   //--------------------------------------------------------------------------
8350   // For some reason we did not succeed in copying a fragment. We treat this
8351   // as a serious failure and crash the starting node.
8352   //--------------------------------------------------------------------------
8353   BlockReference cntrRef = calcNdbCntrBlockRef(startingNodeId);
8354   SystemError * const sysErr = (SystemError*)&signal->theData[0];
8355   sysErr->errorCode = SystemError::CopyFragRefError;
8356   sysErr->errorRef = reference();
8357   sysErr->data[0] = errorCode;
8358   sysErr->data[1] = 0;
8359   sendSignal(cntrRef, GSN_SYSTEM_ERROR, signal,
8360 	     SystemError::SignalLength, JBB);
8361   return;
8362 }//Dbdih::execCOPY_FRAGREF()
8363 
/**
 * LQH on the copy node reports the fragment data fully copied to the
 * starting node.  Validate the CONF against the thread's current
 * state, then send COPY_ACTIVEREQ to the starting node's LQH to
 * activate the fragment, and report progress via EVENT_REP.
 */
void Dbdih::execCOPY_FRAGCONF(Signal* signal)
{
  const CopyFragConf * const conf = (CopyFragConf *)&signal->theData[0];
  jamEntry();
  CRASH_INSERTION(7142);

  TakeOverRecordPtr takeOverPtr;
  c_takeOverPool.getPtr(takeOverPtr, conf->userPtr);

  /* Save the copy statistics now; theData is overwritten below. */
  Uint32 rows_lo = conf->rows_lo;
  Uint32 bytes_lo = conf->bytes_lo;

  ndbrequire(conf->tableId == takeOverPtr.p->toCurrentTabref);
  ndbrequire(conf->fragId == takeOverPtr.p->toCurrentFragid);
  ndbrequire(conf->startingNodeId == takeOverPtr.p->toStartingNode);
  ndbrequire(conf->sendingNodeId == takeOverPtr.p->toCopyNode);
  ndbrequire(takeOverPtr.p->toSlaveStatus == TakeOverRecord::TO_COPY_FRAG);

  g_eventLogger->debug("COPY_FRAGCONF: thread: %u, tab: %u, frag: %u",
    takeOverPtr.i,
    takeOverPtr.p->toCurrentTabref,
    takeOverPtr.p->toCurrentFragid);

  TabRecordPtr tabPtr;
  tabPtr.i = takeOverPtr.p->toCurrentTabref;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

  /* COPY_ACTIVEREQ is addressed to the LQH instance on the
   * *starting* node (the copy went copy node -> starting node). */
  FragmentstorePtr fragPtr;
  getFragstore(tabPtr.p, takeOverPtr.p->toCurrentFragid, fragPtr);
  Uint32 instanceKey = dihGetInstanceKey(fragPtr);
  BlockReference lqhRef = numberToRef(DBLQH, instanceKey,
                                      takeOverPtr.p->toStartingNode);
  CopyActiveReq * const req = (CopyActiveReq *)&signal->theData[0];
  req->userPtr = takeOverPtr.i;
  req->userRef = reference();
  req->tableId = takeOverPtr.p->toCurrentTabref;
  req->fragId = takeOverPtr.p->toCurrentFragid;
  req->distributionKey = fragPtr.p->distributionKey;
  req->flags = 0;

  /* Only use the delayed-logging optimization when every DB node in
   * the cluster runs a version that supports it. */
  Uint32 min_version = getNodeVersionInfo().m_type[NodeInfo::DB].m_min_version;
  if (ndb_delayed_copy_active_req(min_version))
  {
    jam();
    /**
     * Bug48474 - Don't start logging an fragment
     *            until all fragments has been copied
     *            Else it's easy to run out of REDO
     */
    req->flags |= CopyActiveReq::CAR_NO_WAIT | CopyActiveReq::CAR_NO_LOGGING;
  }

  sendSignal(lqhRef, GSN_COPY_ACTIVEREQ, signal,
             CopyActiveReq::SignalLength, JBB);
  g_eventLogger->debug("COPY_ACTIVEREQ: thread: %u, tab: %u, frag: %u",
    takeOverPtr.i,
    takeOverPtr.p->toCurrentTabref,
    takeOverPtr.p->toCurrentFragid);

  takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_COPY_ACTIVE;

  /* Report "copy fragment done" with row/byte counts (low words
   * only; the high words are sent as 0). */
  signal->theData[0] = NDB_LE_NR_CopyFragDone;
  signal->theData[1] = getOwnNodeId();
  signal->theData[2] = takeOverPtr.p->toCurrentTabref;
  signal->theData[3] = takeOverPtr.p->toCurrentFragid;
  signal->theData[4] = rows_lo;
  signal->theData[5] = 0;
  signal->theData[6] = bytes_lo;
  signal->theData[7] = 0;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 8, JBB);
}//Dbdih::execCOPY_FRAGCONF()
8435 
/**
 * The starting node's LQH confirms the fragment is active.  Two cases:
 *
 *  TO_COPY_ACTIVE    - normal copy phase: next step is the
 *                      BEFORE_COMMIT update through the master.
 *  TO_SL_COPY_ACTIVE - start-logging phase: next step is a
 *                      START_LOGGING fragment-state broadcast.
 *
 * Only one takeover thread may talk to the master at a time
 * (c_activeThreadTakeOverPtr); if the master is busy the thread is
 * queued on c_queued_for_commit_takeover_list and resumed later.
 */
void Dbdih::execCOPY_ACTIVECONF(Signal* signal)
{
  const CopyActiveConf * const conf = (CopyActiveConf *)&signal->theData[0];
  jamEntry();
  CRASH_INSERTION(7143);

  TakeOverRecordPtr takeOverPtr;
  c_takeOverPool.getPtr(takeOverPtr, conf->userPtr);

  ndbrequire(conf->tableId == takeOverPtr.p->toCurrentTabref);
  ndbrequire(conf->fragId == takeOverPtr.p->toCurrentFragid);
  ndbrequire(checkNodeAlive(conf->startingNodeId));

  g_eventLogger->debug("COPY_ACTIVECONF: thread: %u, tab: %u, frag: %u",
    takeOverPtr.i,
    takeOverPtr.p->toCurrentTabref,
    takeOverPtr.p->toCurrentFragid);

  /* Remember the GCI from which the new replica is usable. */
  takeOverPtr.p->startGci = conf->startGci;

  c_active_copy_threads_list.remove(takeOverPtr);

  if (takeOverPtr.p->toSlaveStatus == TakeOverRecord::TO_COPY_ACTIVE)
  {
    if (c_activeThreadTakeOverPtr.i != RNIL)
    {
      /**
       * There is already an active take over thread that is performing an
       * update of its fragment replica state through the master. We will
       * put ourselves in the c_queued_for_commit_take_over_list and be
       * started as soon as possible.
       */
      g_eventLogger->debug("QUEUED_UPDATE_BEFORE_COMMIT, inst: %u",
                          takeOverPtr.i);
      jam();
      takeOverPtr.p->toSlaveStatus =
        TakeOverRecord::TO_QUEUED_UPDATE_BEFORE_COMMIT;
      c_queued_for_commit_takeover_list.addLast(takeOverPtr);
      return;
    }
    g_eventLogger->debug("Copy frag active: tab:%u,frag:%u,inst:%u",
      takeOverPtr.p->toCurrentTabref,
      takeOverPtr.p->toCurrentFragid,
      takeOverPtr.i);
    jam();
    c_activeThreadTakeOverPtr = takeOverPtr; /* Mark master busy */
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_BEFORE_COMMIT;
    sendUpdateTo(signal, takeOverPtr);
  }
  else
  {
    jam();
    ndbrequire(takeOverPtr.p->toSlaveStatus==
               TakeOverRecord::TO_SL_COPY_ACTIVE);

    /* Start-logging phase: same queueing discipline as above. */
    if (c_activeThreadTakeOverPtr.i != RNIL)
    {
      jam();
      g_eventLogger->debug("QUEUED_SL_UPDATE_FRAG_STATE, inst: %u",
                           takeOverPtr.i);
      takeOverPtr.p->toSlaveStatus =
        TakeOverRecord::TO_QUEUED_SL_UPDATE_FRAG_STATE;
      c_queued_for_commit_takeover_list.addLast(takeOverPtr);
      return;
    }
    c_activeThreadTakeOverPtr = takeOverPtr; /* Mark master busy */
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_SL_UPDATE_FRAG_STATE;
    g_eventLogger->debug("Update frag state:inst:%u,tab:%u,frag:%u,state:%u",
                         takeOverPtr.i,
                         takeOverPtr.p->toCurrentTabref,
                         takeOverPtr.p->toCurrentFragid,
                         takeOverPtr.p->toSlaveStatus);
    sendUpdateFragStateReq(signal,
                           takeOverPtr.p->startGci,
                           UpdateFragStateReq::START_LOGGING,
                           takeOverPtr);
  }
}//Dbdih::execCOPY_ACTIVECONF()
8514 
/**
 * Sanity check that a finished takeover has fully wound down: every
 * takeover list must be empty, no thread may hold the master, and the
 * main takeover record must still exist.  Crashes (ndbrequire) on any
 * violation.
 */
void
Dbdih::check_take_over_completed_correctly()
{
  ndbrequire(c_completed_copy_threads_list.isEmpty());
  ndbrequire(c_activeTakeOverList.isEmpty());
  ndbrequire(c_queued_for_start_takeover_list.isEmpty());
  ndbrequire(c_queued_for_commit_takeover_list.isEmpty());
  ndbrequire(c_active_copy_threads_list.isEmpty());
  ndbrequire(c_activeThreadTakeOverPtr.i == RNIL);
  ndbrequire(c_mainTakeOverPtr.i != RNIL);
  /**
   * We could be master in system restart where we had to
   * restart with aid of another node and thus perform
   * synchronize with this other node. In this case we
   * have 2 take over records, one for master part and
   * one for start copy part.
   */
  ndbrequire((c_takeOverPool.getUsed() == 1) ||
             (cmasterdihref == reference() &&
              c_takeOverPool.getUsed() == 2));
}
8536 
8537 void
release_take_over_threads(void)8538 Dbdih::release_take_over_threads(void)
8539 {
8540   TakeOverRecordPtr takeOverPtr;
8541   do
8542   {
8543     jam();
8544     if (!c_completed_copy_threads_list.removeFirst(takeOverPtr))
8545     {
8546       jam();
8547       break;
8548     }
8549     releaseTakeOver(takeOverPtr, false);
8550   } while (1);
8551   check_take_over_completed_correctly();
8552 }
8553 
8554 bool
thread_takeover_copy_completed(Signal * signal,TakeOverRecordPtr takeOverPtr)8555 Dbdih::thread_takeover_copy_completed(Signal *signal,
8556                                         TakeOverRecordPtr takeOverPtr)
8557 {
8558   c_activeTakeOverList.remove(takeOverPtr);
8559   c_completed_copy_threads_list.addFirst(takeOverPtr);
8560   c_mainTakeOverPtr.p->m_copy_threads_completed++;
8561   if (c_mainTakeOverPtr.p->m_copy_threads_completed ==
8562       c_mainTakeOverPtr.p->m_number_of_copy_threads)
8563   {
8564     /* No more to do, just wait for more threads to complete */
8565     return true;
8566   }
8567   return false;
8568 }
8569 
/**
 * A takeover thread has finished copying all its fragments.  Wait
 * until every thread has completed; the last one then reports the
 * copy phase done and either starts the REDO-logging phase (when all
 * DB nodes support delayed COPY_ACTIVEREQ) or ends the takeover via
 * END_TOREQ to the master.
 */
void Dbdih::toCopyCompletedLab(Signal * signal, TakeOverRecordPtr takeOverPtr)
{
  /**
   * One take over thread has completed its work. We will have to wait for
   * all of the threads to complete here before we can proceed.
   */
  g_eventLogger->debug("Thread %u copy completed", takeOverPtr.i);
  if (!thread_takeover_copy_completed(signal, takeOverPtr))
  {
    jam();
    return;
  }
  jam();
  /* Last thread: reset counter for the next phase. */
  c_mainTakeOverPtr.p->m_copy_threads_completed = 0;

  signal->theData[0] = NDB_LE_NR_CopyFragsCompleted;
  signal->theData[1] = takeOverPtr.p->toStartingNode;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);

  /* Ask LQH to dump CopyFrag stage statistics */
  signal->theData[0] = DumpStateOrd::LqhReportCopyInfo;
  sendSignal(DBLQH_REF, GSN_DUMP_STATE_ORD, signal, 1, JBB);

  g_eventLogger->info("Bring Database On-line Completed");
  infoEvent("Bring Database On-line Completed on node %u",
            takeOverPtr.p->toStartingNode);

  Uint32 min_version = getNodeVersionInfo().m_type[NodeInfo::DB].m_min_version;
  if (ndb_delayed_copy_active_req(min_version))
  {
    jam();
    /* Logging was deferred (CAR_NO_LOGGING); start it now. */
    g_eventLogger->info("Starting REDO logging");
    infoEvent("Starting REDO logging on node %u",
              takeOverPtr.p->toStartingNode);
    start_thread_takeover_logging(signal);
    return;
  }
  else
  {
    jam();

    /**
     * We won't need the threads anymore so we remove them from the
     * completed list and release them to the pool.
     */
    release_take_over_threads();
    g_eventLogger->info("Make On-line Database recoverable by waiting"
                        " for LCP Starting");
    infoEvent("Make On-line Database recoverable by waiting"
              " for LCP Starting on node %u",
              takeOverPtr.p->toStartingNode);

    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_END_TO;

    EndToReq* req = (EndToReq*)signal->getDataPtrSend();
    req->senderData = takeOverPtr.i;
    req->senderRef = reference();
    req->flags = takeOverPtr.p->m_flags;
    sendSignal(cmasterdihref, GSN_END_TOREQ,
               signal, EndToReq::SignalLength, JBB);
    sendEND_TOREP(signal, takeOverPtr.p->toStartingNode);
    return;
  }
}//Dbdih::toCopyCompletedLab()
8634 
8635 void
send_continueb_nr_start_logging(Signal * signal,TakeOverRecordPtr takeOverPtr)8636 Dbdih::send_continueb_nr_start_logging(Signal *signal,
8637                                        TakeOverRecordPtr takeOverPtr)
8638 {
8639   signal->theData[0] = DihContinueB::ZTO_START_LOGGING;
8640   signal->theData[1] = takeOverPtr.i;
8641   sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
8642 }
8643 
8644 void
start_thread_takeover_logging(Signal * signal)8645 Dbdih::start_thread_takeover_logging(Signal *signal)
8646 {
8647   /**
8648    * Ensure no active thread, all thread takeover records are
8649    * placed into the c_completed_copy_threads_list and that
8650    * we have a main takeover thread and that all other lists are
8651    * empty at this point.
8652    */
8653   ndbrequire(c_activeThreadTakeOverPtr.i == RNIL);
8654   ndbrequire(c_activeTakeOverList.isEmpty());
8655   ndbrequire(c_queued_for_start_takeover_list.isEmpty());
8656   ndbrequire(c_queued_for_commit_takeover_list.isEmpty());
8657   ndbrequire(c_active_copy_threads_list.isEmpty());
8658   ndbrequire(c_mainTakeOverPtr.i != RNIL);
8659   ndbrequire(!c_completed_copy_threads_list.isEmpty());
8660   TakeOverRecordPtr takeOverPtr;
8661   do
8662   {
8663     jam();
8664     if (!c_completed_copy_threads_list.removeFirst(takeOverPtr))
8665     {
8666       jam();
8667       break;
8668     }
8669     c_active_copy_threads_list.addFirst(takeOverPtr);
8670     takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_START_LOGGING;
8671     takeOverPtr.p->toCurrentTabref = 0;
8672     takeOverPtr.p->toCurrentFragid = 0;
8673     takeOverPtr.p->toCurrentReplica = RNIL;
8674     send_continueb_nr_start_logging(signal, takeOverPtr);
8675   } while (1);
8676 }
8677 
8678 bool
thread_takeover_completed(Signal * signal,TakeOverRecordPtr takeOverPtr)8679 Dbdih::thread_takeover_completed(Signal *signal,
8680                                    TakeOverRecordPtr takeOverPtr)
8681 {
8682   c_active_copy_threads_list.remove(takeOverPtr);
8683   releaseTakeOver(takeOverPtr, false);
8684   c_mainTakeOverPtr.p->m_copy_threads_completed++;
8685   if (c_mainTakeOverPtr.p->m_copy_threads_completed ==
8686       c_mainTakeOverPtr.p->m_number_of_copy_threads)
8687   {
8688     return true;
8689   }
8690   return false;
8691 }
8692 
/**
 * The master rejected our END_TOREQ.  There is no recovery path:
 * validate that the takeover record exists (getPtr crashes on a bad
 * index) and then crash the node.
 */
void
Dbdih::execEND_TOREF(Signal* signal)
{
  jamEntry();
  EndToRef* ref = (EndToRef*)signal->getDataPtr();

  TakeOverRecordPtr takeOverPtr;
  c_takeOverPool.getPtr(takeOverPtr, ref->senderData);

  ndbrequire(false);
}
8704 
/**
 * The master confirmed the end of the takeover.  Release the takeover
 * record, clear the main takeover pointer and answer the original
 * START_COPYREQ requester with START_COPYCONF.
 */
void
Dbdih::execEND_TOCONF(Signal* signal)
{
  jamEntry();
  EndToConf* conf = (EndToConf*)signal->getDataPtr();

  CRASH_INSERTION(7144);

  TakeOverRecordPtr takeOverPtr;
  c_takeOverPool.getPtr(takeOverPtr, conf->senderData);

  /* Copy out what we need before the record is released. */
  Uint32 senderData = takeOverPtr.p->m_senderData;
  Uint32 senderRef = takeOverPtr.p->m_senderRef;
  Uint32 nodeId = takeOverPtr.p->toStartingNode;

  releaseTakeOver(takeOverPtr, false);
  c_mainTakeOverPtr.i = RNIL;
  c_mainTakeOverPtr.p = NULL;

  StartCopyConf* ret = (StartCopyConf*)signal->getDataPtrSend();
  ret->startingNodeId = nodeId;
  ret->senderData = senderData;
  ret->senderRef = reference();
  sendSignal(senderRef, GSN_START_COPYCONF, signal,
             StartCopyConf::SignalLength, JBB);
}
8731 
/**
 * Reset a takeover record to its idle state and return it to the pool.
 *
 * @param from_master  when true the record is also removed from the
 *                     master's active takeover list before release.
 */
void Dbdih::releaseTakeOver(TakeOverRecordPtr takeOverPtr,
                            bool from_master)
{
  takeOverPtr.p->m_copy_threads_completed = 0;
  takeOverPtr.p->m_number_of_copy_threads = (Uint32)-1;
  takeOverPtr.p->m_copy_thread_id = (Uint32)-1;

  /* Invalidate all per-takeover node/table/fragment references. */
  takeOverPtr.p->toCopyNode = RNIL;
  takeOverPtr.p->toCurrentFragid = RNIL;
  takeOverPtr.p->toCurrentReplica = RNIL;
  takeOverPtr.p->toCurrentTabref = RNIL;
  takeOverPtr.p->toFailedNode = RNIL;
  takeOverPtr.p->toStartingNode = RNIL;
  NdbTick_Invalidate(&takeOverPtr.p->toStartTime);
  takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_SLAVE_IDLE;
  takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MASTER_IDLE;

  if (from_master)
  {
    c_masterActiveTakeOverList.remove(takeOverPtr);
  }
  c_takeOverPool.release(takeOverPtr);
}//Dbdih::releaseTakeOver()
8755 
8756 /*****************************************************************************/
8757 /* ------------------------------------------------------------------------- */
8758 /*       WE HAVE BEEN REQUESTED TO PERFORM A SYSTEM RESTART. WE START BY     */
8759 /*       READING THE GCI FILES. THIS REQUEST WILL ONLY BE SENT TO THE MASTER */
8760 /*       DIH. THAT MEANS WE HAVE TO REPLICATE THE INFORMATION WE READ FROM   */
8761 /*       OUR FILES TO ENSURE THAT ALL NODES HAVE THE SAME DISTRIBUTION       */
8762 /*       INFORMATION.                                                        */
8763 /* ------------------------------------------------------------------------- */
8764 /*****************************************************************************/
readGciFileLab(Signal * signal)8765 void Dbdih::readGciFileLab(Signal* signal)
8766 {
8767   FileRecordPtr filePtr;
8768   filePtr.i = crestartInfoFile[0];
8769   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
8770   filePtr.p->reqStatus = FileRecord::OPENING_GCP;
8771 
8772   openFileRo(signal, filePtr);
8773 }//Dbdih::readGciFileLab()
8774 
/**
 * A restart-info (GCI) file was opened successfully; issue the read of
 * the restorable GCI data.  reqStatus READING_GCP routes the read's
 * completion to readingGcpLab / readingGcpErrorLab.
 */
void Dbdih::openingGcpLab(Signal* signal, FileRecordPtr filePtr)
{
  /* ----------------------------------------------------------------------- */
  /*     WE HAVE SUCCESSFULLY OPENED A FILE CONTAINING INFORMATION ABOUT     */
  /*     THE GLOBAL CHECKPOINTS THAT ARE POSSIBLE TO RESTART.                */
  /* ----------------------------------------------------------------------- */
  readRestorableGci(signal, filePtr);
  filePtr.p->reqStatus = FileRecord::READING_GCP;
}//Dbdih::openingGcpLab()
8784 
/**
 * The restart-info file was read successfully.  Bump the restart
 * sequence number (persisted in the sysfile and mirrored into
 * globalData) and close the file; reqStatus CLOSING_GCP routes the
 * close's completion to closingGcpLab.
 */
void Dbdih::readingGcpLab(Signal* signal, FileRecordPtr filePtr)
{
  /* ----------------------------------------------------------------------- */
  /*     WE HAVE NOW SUCCESSFULLY MANAGED TO READ IN THE GLOBAL CHECKPOINT   */
  /*     INFORMATION FROM FILE. LATER WE WILL ADD SOME FUNCTIONALITY THAT    */
  /*     CHECKS THE RESTART TIMERS TO DEDUCE FROM WHERE TO RESTART.          */
  /*     NOW WE WILL SIMPLY RESTART FROM THE NEWEST GLOBAL CHECKPOINT        */
  /*     POSSIBLE TO RESTORE.                                                */
  /*                                                                         */
  /*     BEFORE WE INVOKE DICT WE NEED TO COPY CRESTART_INFO TO ALL NODES.   */
  /*     WE ALSO COPY TO OUR OWN NODE. TO ENABLE US TO DO THIS PROPERLY WE   */
  /*     START BY CLOSING THIS FILE.                                         */
  /* ----------------------------------------------------------------------- */
  globalData.m_restart_seq = ++SYSFILE->m_restart_seq;
  closeFile(signal, filePtr);
  filePtr.p->reqStatus = FileRecord::CLOSING_GCP;
}//Dbdih::readingGcpLab()
8802 
closingGcpLab(Signal * signal,FileRecordPtr filePtr)8803 void Dbdih::closingGcpLab(Signal* signal, FileRecordPtr filePtr)
8804 {
8805   if (Sysfile::getInitialStartOngoing(SYSFILE->systemRestartBits) == false){
8806     jam();
8807     selectMasterCandidateAndSend(signal);
8808     return;
8809   } else {
8810     jam();
8811     sendDihRestartRef(signal);
8812     return;
8813   }//if
8814 }//Dbdih::closingGcpLab()
8815 
/**
 * Tell NDBCNTR that we cannot take part in a system restart
 * (DIH_RESTARTREF).  Since the sysfile could not be used, the set of
 * nodes configured without a nodegroup is computed from the cluster
 * configuration instead and shipped in the REF.
 */
void
Dbdih::sendDihRestartRef(Signal* signal)
{
  jam();

  /**
   * We couldn't read P0.Sysfile...
   *   so compute no_nodegroup_mask from configuration
   */
  NdbNodeBitmask no_nodegroup_mask;

  ndb_mgm_configuration_iterator * iter =
    m_ctx.m_config.getClusterConfigIterator();
  for(ndb_mgm_first(iter); ndb_mgm_valid(iter); ndb_mgm_next(iter))
  {
    jam();
    Uint32 nodeId;
    Uint32 nodeType;

    ndbrequire(!ndb_mgm_get_int_parameter(iter,CFG_NODE_ID, &nodeId));
    ndbrequire(!ndb_mgm_get_int_parameter(iter,CFG_TYPE_OF_SECTION,
                                          &nodeType));

    /* Only DB nodes can belong to a nodegroup. */
    if (nodeType == NodeInfo::DB)
    {
      jam();
      Uint32 ng;
      if (ndb_mgm_get_int_parameter(iter, CFG_DB_NODEGROUP, &ng) == 0)
      {
        jam();
        if (ng == NDB_NO_NODEGROUP)
        {
          no_nodegroup_mask.set(nodeId);
        }
      }
    }
  }
  DihRestartRef * ref = CAST_PTR(DihRestartRef, signal->getDataPtrSend());
  no_nodegroup_mask.copyto(NdbNodeBitmask::Size, ref->no_nodegroup_mask);
  sendSignal(cntrlblockref, GSN_DIH_RESTARTREF, signal,
             DihRestartRef::SignalLength, JBB);
}
8858 
8859 /* ------------------------------------------------------------------------- */
8860 /*       SELECT THE MASTER CANDIDATE TO BE USED IN SYSTEM RESTARTS.          */
8861 /* ------------------------------------------------------------------------- */
/**
 * Answer NDBCNTR with DIH_RESTARTCONF carrying our latest completed
 * GCI and the mask of nodes without a nodegroup, both derived from
 * the sysfile just read.  Afterwards, validate that the number of
 * nodes in each nodegroup still matches the configured number of
 * replicas; a mismatch is an illegal configuration change and stops
 * the node with progError.
 */
void Dbdih::selectMasterCandidateAndSend(Signal* signal)
{
  setNodeGroups();

  /* Count defined nodes per nodegroup according to the sysfile. */
  NodeRecordPtr nodePtr;
  Uint32 node_groups[MAX_NDB_NODES];
  memset(node_groups, 0, sizeof(node_groups));
  NdbNodeBitmask no_nodegroup_mask;
  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
    jam();
    if (Sysfile::getNodeStatus(nodePtr.i, SYSFILE->nodeStatus) == Sysfile::NS_NotDefined)
    {
      jam();
      continue;
    }
    const Uint32 ng = Sysfile::getNodeGroup(nodePtr.i, SYSFILE->nodeGroups);
    if(ng != NO_NODE_GROUP_ID)
    {
      ndbrequire(ng < MAX_NDB_NODES);
      node_groups[ng]++;
    }
    else
    {
      no_nodegroup_mask.set(nodePtr.i);
    }
  }

  DihRestartConf * conf = CAST_PTR(DihRestartConf, signal->getDataPtrSend());
  conf->unused = getOwnNodeId();
  conf->latest_gci = SYSFILE->lastCompletedGCI[getOwnNodeId()];
  no_nodegroup_mask.copyto(NdbNodeBitmask::Size, conf->no_nodegroup_mask);
  sendSignal(cntrlblockref, GSN_DIH_RESTARTCONF, signal,
             DihRestartConf::SignalLength, JBB);

  /* Each non-empty nodegroup must contain exactly cnoReplicas nodes;
   * changing the replica count requires an initial start. */
  for (nodePtr.i = 0; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
    jam();
    Uint32 count = node_groups[nodePtr.i];
    if(count != 0 && count != cnoReplicas){
      char buf[255];
      BaseString::snprintf(buf, sizeof(buf),
			   "Illegal configuration change."
			   " Initial start needs to be performed "
			   " when changing no of replicas (%d != %d)",
			   node_groups[nodePtr.i], cnoReplicas);
      progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
    }
  }
}//Dbdih::selectMasterCandidate()
8910 
8911 /* ------------------------------------------------------------------------- */
8912 /*       ERROR HANDLING DURING READING RESTORABLE GCI FROM FILE.             */
8913 /* ------------------------------------------------------------------------- */
/**
 * Error label: opening one of the two GCI restart-info files failed.
 * The failed file is marked CRASHED.  If it was the first copy we retry
 * with the second copy (opened read-only); if the second copy also
 * failed, both copies are unusable and the restart is refused with
 * DIH_RESTARTREF (sendDihRestartRef).
 */
void Dbdih::openingGcpErrorLab(Signal* signal, FileRecordPtr filePtr)
{
  filePtr.p->fileStatus = FileRecord::CRASHED;
  filePtr.p->reqStatus = FileRecord::IDLE;
  if (crestartInfoFile[0] == filePtr.i) {
    jam();
    /* --------------------------------------------------------------------- */
    /*   THE FIRST FILE WAS NOT ABLE TO BE OPENED. SET STATUS TO CRASHED AND */
    /*   TRY OPEN THE NEXT FILE.                                             */
    /* --------------------------------------------------------------------- */
    filePtr.i = crestartInfoFile[1];
    ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
    openFileRo(signal, filePtr);
    filePtr.p->reqStatus = FileRecord::OPENING_GCP;
  } else {
    jam();
    /* --------------------------------------------------------------------- */
    /*   WE FAILED IN OPENING THE SECOND FILE. BOTH FILES WERE CORRUPTED. WE */
    /*   CANNOT CONTINUE THE RESTART IN THIS CASE. TELL NDBCNTR OF OUR       */
    /*   FAILURE.                                                            */
    /*---------------------------------------------------------------------- */
    sendDihRestartRef(signal);
    return;
  }//if
}//Dbdih::openingGcpErrorLab()
8939 
/**
 * Error label: reading a GCI restart-info file failed after it was
 * successfully opened.  Mark the file CRASHED and close it; further
 * handling continues in closingGcpCrashLab via the CLOSING_GCP_CRASH
 * request state.
 */
void Dbdih::readingGcpErrorLab(Signal* signal, FileRecordPtr filePtr)
{
  filePtr.p->fileStatus = FileRecord::CRASHED;
  /* ----------------------------------------------------------------------- */
  /*     WE FAILED IN READING THE FILE AS WELL. WE WILL CLOSE THIS FILE.     */
  /* ----------------------------------------------------------------------- */
  closeFile(signal, filePtr);
  filePtr.p->reqStatus = FileRecord::CLOSING_GCP_CRASH;
}//Dbdih::readingGcpErrorLab()
8949 
/**
 * A crashed GCI restart-info file has been closed.  If it was the first
 * copy, fall back to the second copy (opened read-write here); if the
 * second copy has also failed, refuse the restart with DIH_RESTARTREF.
 */
void Dbdih::closingGcpCrashLab(Signal* signal, FileRecordPtr filePtr)
{
  if (crestartInfoFile[0] == filePtr.i) {
    jam();
    /* --------------------------------------------------------------------- */
    /*   ERROR IN FIRST FILE, TRY THE SECOND FILE.                           */
    /* --------------------------------------------------------------------- */
    filePtr.i = crestartInfoFile[1];
    ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
    openFileRw(signal, filePtr);
    filePtr.p->reqStatus = FileRecord::OPENING_GCP;
    return;
  }//if
  /* ----------------------------------------------------------------------- */
  /*     WE DISCOVERED A FAILURE WITH THE SECOND FILE AS WELL. THIS IS A     */
  /*     SERIOUS PROBLEM. REPORT FAILURE TO NDBCNTR.                         */
  /* ----------------------------------------------------------------------- */
  sendDihRestartRef(signal);
}//Dbdih::closingGcpCrashLab()
8969 
8970 /*****************************************************************************/
8971 /* ------------------------------------------------------------------------- */
8972 /*       THIS IS AN INITIAL RESTART. WE WILL CREATE THE TWO FILES DESCRIBING */
8973 /*       THE GLOBAL CHECKPOINTS THAT ARE RESTORABLE.                         */
8974 /* ------------------------------------------------------------------------- */
8975 /*****************************************************************************/
/**
 * Initial restart: start creating the first of the two GCI restart-info
 * files.  Completion is tracked via the CREATING_GCP request state and
 * continues in creatingGcpLab, which creates the second file.
 */
void Dbdih::initGciFilesLab(Signal* signal)
{
  FileRecordPtr filePtr;
  filePtr.i = crestartInfoFile[0];
  ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
  createFileRw(signal, filePtr);
  filePtr.p->reqStatus = FileRecord::CREATING_GCP;
}//Dbdih::initGciFilesLab()
8984 
8985 /* ------------------------------------------------------------------------- */
8986 /*       GLOBAL CHECKPOINT FILE HAVE BEEN SUCCESSFULLY CREATED.              */
8987 /* ------------------------------------------------------------------------- */
/**
 * A GCI restart-info file has been created.  After the first file is
 * created, create the second one; once both exist, start writing the
 * initial restorable-GCI data to the first file (continues in
 * writeInitGcpLab via the WRITE_INIT_GCP request state).
 */
void Dbdih::creatingGcpLab(Signal* signal, FileRecordPtr filePtr)
{
  if (filePtr.i == crestartInfoFile[0]) {
    jam();
    /* --------------------------------------------------------------------- */
    /*   IF CREATED FIRST THEN ALSO CREATE THE SECOND FILE.                  */
    /* --------------------------------------------------------------------- */
    filePtr.i = crestartInfoFile[1];
    ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
    createFileRw(signal, filePtr);
    filePtr.p->reqStatus = FileRecord::CREATING_GCP;
  } else {
    jam();
    /* --------------------------------------------------------------------- */
    /*   BOTH FILES HAVE BEEN CREATED. NOW WRITE THE INITIAL DATA TO BOTH    */
    /*   OF THE FILES.                                                       */
    /* --------------------------------------------------------------------- */
    filePtr.i = crestartInfoFile[0];
    ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
    writeRestorableGci(signal, filePtr);
    filePtr.p->reqStatus = FileRecord::WRITE_INIT_GCP;
  }//if
}//Dbdih::creatingGcpLab()
9011 
9012 /* ------------------------------------------------------------------------- */
9013 /*       WE HAVE SUCCESSFULLY WRITTEN A GCI FILE.                            */
9014 /* ------------------------------------------------------------------------- */
/**
 * The initial restorable-GCI data has been written to one of the two
 * files.  After the first file, write the second; after both, leave the
 * files open and confirm our part of the initial start: the master
 * replies NDB_STARTCONF to cndbStartReqBlockref, non-masters continue
 * with ndbsttorry10Lab.
 */
void Dbdih::writeInitGcpLab(Signal* signal, FileRecordPtr filePtr)
{
  filePtr.p->reqStatus = FileRecord::IDLE;
  if (filePtr.i == crestartInfoFile[0]) {
    jam();
    /* --------------------------------------------------------------------- */
    /*   WE HAVE WRITTEN THE FIRST FILE NOW ALSO WRITE THE SECOND FILE.      */
    /* --------------------------------------------------------------------- */
    filePtr.i = crestartInfoFile[1];
    ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
    writeRestorableGci(signal, filePtr);
    filePtr.p->reqStatus = FileRecord::WRITE_INIT_GCP;
  } else {
    /* --------------------------------------------------------------------- */
    /*   WE HAVE WRITTEN BOTH FILES. LEAVE BOTH FILES OPEN AND CONFIRM OUR   */
    /*   PART OF THE INITIAL START.                                          */
    /* --------------------------------------------------------------------- */
    if (isMaster()) {
      jam();
      /*---------------------------------------------------------------------*/
      // IN MASTER NODES THE START REQUEST IS RECEIVED FROM NDBCNTR AND WE MUST
      // RESPOND WHEN COMPLETED.
      /*---------------------------------------------------------------------*/
      signal->theData[0] = reference();
      sendSignal(cndbStartReqBlockref, GSN_NDB_STARTCONF, signal, 1, JBB);
    } else {
      jam();
      ndbsttorry10Lab(signal, __LINE__);
      return;
    }//if
  }//if
}//Dbdih::writeInitGcpLab()
9047 
9048 /*****************************************************************************/
9049 /* **********     NODES DELETION MODULE                          *************/
9050 /*****************************************************************************/
9051 /*---------------------------------------------------------------------------*/
9052 /*                    LOGIC FOR NODE FAILURE                                 */
9053 /*---------------------------------------------------------------------------*/
/**
 * NODE_FAILREP: one or more data nodes have failed.
 *
 * The signal carries the cluster failure number, the id of the (possibly
 * new) master and a bitmask of failed nodes.  Processing:
 *
 *  1) Convert the bitmask into an array of failed node ids.
 *  2) Mark each failed node: recovery status NODE_FAILED, no longer
 *     usable in transactions, moved from the alive list to the dead
 *     list (decrementing con_lineNodes).
 *  3) checkEscalation() -- does not return if the cluster cannot survive.
 *  4) Abort any paused LCP (the pausing node must itself have failed).
 *  5) Install the new master reference (cmasterdihref/cmasterNodeId).
 *  6) Per failed node: run master-only handling (copy-table, stop-perm,
 *     wait-GCP, take-over, outstanding GCP) or proxy handling for
 *     non-masters, followed by handling common to all nodes.
 *     failedNodeSynchHandling() is deliberately last as it modifies
 *     nodeStatus.
 *  7) On master take-over, start the LCP and GCP master take-over
 *     protocols; a node-restart-in-progress here is fatal
 *     (NDBD_EXIT_MASTER_FAILURE_DURING_NR).
 *  8) Master updates node restart info bits; everyone recomputes the
 *     GCP stop timeouts.
 */
void Dbdih::execNODE_FAILREP(Signal* signal)
{
  Uint32 i;
  Uint32 failedNodes[MAX_NDB_NODES];
  jamEntry();
  NodeFailRep * const nodeFail = (NodeFailRep *)&signal->theData[0];

  cfailurenr = nodeFail->failNo;
  Uint32 newMasterId = nodeFail->masterNodeId;
  const Uint32 noOfFailedNodes = nodeFail->noOfNodes;

  if (ERROR_INSERTED(7179) || ERROR_INSERTED(7217))
  {
    CLEAR_ERROR_INSERT_VALUE;
  }

  if (ERROR_INSERTED(7184))
  {
    SET_ERROR_INSERT_VALUE(7000);
  }

  c_increase_lcp_speed_after_nf = true;

  /*-------------------------------------------------------------------------*/
  // The first step is to convert from a bit mask to an array of failed nodes.
  /*-------------------------------------------------------------------------*/
  Uint32 index = 0;
  for (i = 1; i < MAX_NDB_NODES; i++) {
    if(NdbNodeBitmask::get(nodeFail->theNodes, i)){
      jamLine(i);
      failedNodes[index] = i;
      index++;
    }//if
  }//for
  ndbrequire(noOfFailedNodes == index);
  // Uint32 arithmetic: 0 - 1 wraps to a huge value, so this also
  // requires noOfFailedNodes >= 1.
  ndbrequire(noOfFailedNodes - 1 < MAX_NDB_NODES);

  /*-------------------------------------------------------------------------*/
  // The second step is to update the node status of the failed nodes, remove
  // them from the alive node list and put them into the dead node list. Also
  // update the number of nodes on-line.
  // We also set certain state variables ensuring that the node no longer is
  // used in transactions and also mark that we received this signal.
  /*-------------------------------------------------------------------------*/
  for (i = 0; i < noOfFailedNodes; i++) {
    jam();
    NodeRecordPtr TNodePtr;
    TNodePtr.i = failedNodes[i];
    ptrCheckGuard(TNodePtr, MAX_NDB_NODES, nodeRecord);
    setNodeRecoveryStatus(TNodePtr.i, NodeRecord::NODE_FAILED);
    TNodePtr.p->useInTransactions = false;
    TNodePtr.p->m_inclDihLcp = false;
    TNodePtr.p->recNODE_FAILREP = ZTRUE;
    if (TNodePtr.p->nodeStatus == NodeRecord::ALIVE) {
      jam();
      con_lineNodes--;
      TNodePtr.p->nodeStatus = NodeRecord::DIED_NOW;
      removeAlive(TNodePtr);
      insertDeadNode(TNodePtr);
    }//if
  }//for

  /*-------------------------------------------------------------------------*/
  // Verify that we can continue to operate the cluster. If we cannot we will
  // not return from checkEscalation.
  /*-------------------------------------------------------------------------*/
  checkEscalation();

  /*------------------------------------------------------------------------*/
  // Verify that a starting node has also crashed. Reset the node start record.
  /*-------------------------------------------------------------------------*/
#if 0
  /**
   * Node will crash by itself...
   *   nodeRestart is run then...
   */
  if (false && c_nodeStartMaster.startNode != RNIL && getNodeStatus(c_nodeStartMaster.startNode) == NodeRecord::ALIVE)
  {
    BlockReference cntrRef = calcNdbCntrBlockRef(c_nodeStartMaster.startNode);
    SystemError * const sysErr = (SystemError*)&signal->theData[0];
    sysErr->errorCode = SystemError::StartInProgressError;
    sysErr->errorRef = reference();
    sysErr->data[0]= 0;
    sysErr->data[1]= __LINE__;
    sendSignal(cntrRef, GSN_SYSTEM_ERROR, signal,  SystemError::SignalLength, JBA);
    nodeResetStart(signal);
  }//if
#endif

  if (is_lcp_paused())
  {
    /**
     * Stop any LCP pausing, a node has crashed, this implies that also the
     * node that caused us to pause the LCP has crashed.
     */
    jam();
    handle_node_failure_in_pause(signal);
  }
  /*--------------------------------------------------*/
  /*                                                  */
  /*       WE CHANGE THE REFERENCE TO MASTER DIH      */
  /*       BLOCK AND POINTER AT THIS PLACE IN THE CODE*/
  /*--------------------------------------------------*/
  Uint32 oldMasterId = cmasterNodeId;
  BlockReference oldMasterRef = cmasterdihref;
  cmasterdihref = calcDihBlockRef(newMasterId);
  cmasterNodeId = newMasterId;

  // True when these failures caused a change of master.
  const bool masterTakeOver = (oldMasterId != newMasterId);

  for(i = 0; i < noOfFailedNodes; i++) {
    NodeRecordPtr failedNodePtr;
    failedNodePtr.i = failedNodes[i];
    ptrCheckGuard(failedNodePtr, MAX_NDB_NODES, nodeRecord);
    // Note: the master/non-master split is decided on the OLD master
    // reference, i.e. our role before these failures were applied.
    if (oldMasterRef == reference()) {
      /*-------------------------------------------------------*/
      // Functions that need to be called only for master nodes.
      /*-------------------------------------------------------*/
      checkCopyTab(signal, failedNodePtr);
      checkStopPermMaster(signal, failedNodePtr);
      checkWaitGCPMaster(signal, failedNodes[i]);

      {
        Ptr<TakeOverRecord> takeOverPtr;
        if (findTakeOver(takeOverPtr, failedNodePtr.i))
        {
          handleTakeOver(signal, takeOverPtr);
        }
      }
      checkGcpOutstanding(signal, failedNodePtr.i);
    } else {
      jam();
      /*-----------------------------------------------------------*/
      // Functions that need to be called only for nodes that were
      // not master before these failures.
      /*-----------------------------------------------------------*/
      checkStopPermProxy(signal, failedNodes[i]);
      checkWaitGCPProxy(signal, failedNodes[i]);
    }//if
    /*--------------------------------------------------*/
    // Functions that need to be called for all nodes.
    /*--------------------------------------------------*/
    checkStopMe(signal, failedNodePtr);
    failedNodeLcpHandling(signal, failedNodePtr);
    startRemoveFailedNode(signal, failedNodePtr);

    /**
     * This is the last function called
     *   It modifies failedNodePtr.p->nodeStatus
     */
    failedNodeSynchHandling(signal, failedNodePtr);
  }//for

  if(masterTakeOver){
    jam();
    startLcpMasterTakeOver(signal, oldMasterId);
    startGcpMasterTakeOver(signal, oldMasterId);

    if(getNodeState().getNodeRestartInProgress()){
      jam();
      progError(__LINE__, NDBD_EXIT_MASTER_FAILURE_DURING_NR);
    }
  }


  if (isMaster()) {
    jam();
    setNodeRestartInfoBits(signal);
  }//if

  setGCPStopTimeouts();
}//Dbdih::execNODE_FAILREP()
9226 
/**
 * Master-only node-failure handling: clean up an in-progress node start
 * if the failed node is the node currently being started
 * (c_nodeStartMaster.startNode).
 *
 * Depending on which start-phase signal is outstanding towards the
 * starting node, release the table-copy pages and clear the
 * COPY_TABREQ wait; for the other known phases no extra cleanup is
 * needed.  Any unexpected outstanding GSN is a fatal error.  Finally the
 * fragment-info mutex (if held) is unlocked and the start record reset.
 */
void Dbdih::checkCopyTab(Signal* signal, NodeRecordPtr failedNodePtr)
{
  jam();

  // Not the node we are currently starting -- nothing to do.
  if(c_nodeStartMaster.startNode != failedNodePtr.i){
    jam();
    return;
  }

  switch(c_nodeStartMaster.m_outstandingGsn){
  case GSN_COPY_TABREQ:
    jam();
    releaseTabPages(failedNodePtr.p->activeTabptr);
    if (c_COPY_TABREQ_Counter.isWaitingFor(failedNodePtr.i))
    {
      jam();
      c_COPY_TABREQ_Counter.clearWaitingFor(failedNodePtr.i);
    }
    c_nodeStartMaster.wait = ZFALSE;
    break;
  case GSN_START_INFOREQ:
  case GSN_START_PERMCONF:
  case GSN_DICTSTARTREQ:
  case GSN_COPY_GCIREQ:
    jam();
    break;
  default:
    g_eventLogger->error("outstanding gsn: %s(%d)",
                         getSignalName(c_nodeStartMaster.m_outstandingGsn),
                         c_nodeStartMaster.m_outstandingGsn);
    ndbrequire(false);
  }

  if (!c_nodeStartMaster.m_fragmentInfoMutex.isNull())
  {
    jam();
    Mutex mutex(signal, c_mutexMgr, c_nodeStartMaster.m_fragmentInfoMutex);
    mutex.unlock();
  }

  nodeResetStart(signal);
}//Dbdih::checkCopyTab()
9269 
/**
 * Node-failure handling for an outstanding STOP_ME protocol: if we were
 * waiting for STOP_ME_CONF from the failed node, fabricate that
 * confirmation on its behalf (sent to ourselves, with the failed node's
 * DIH block reference as sender) so the protocol can complete.
 */
void Dbdih::checkStopMe(Signal* signal, NodeRecordPtr failedNodePtr)
{
  jam();
  if (c_STOP_ME_REQ_Counter.isWaitingFor(failedNodePtr.i)){
    jam();
    ndbrequire(c_stopMe.clientRef != 0);
    StopMeConf * const stopMeConf = (StopMeConf *)&signal->theData[0];
    stopMeConf->senderRef = calcDihBlockRef(failedNodePtr.i);
    stopMeConf->senderData = c_stopMe.clientData;
    sendSignal(reference(), GSN_STOP_ME_CONF, signal,
               StopMeConf::SignalLength, JBB);
  }//if
}//Dbdih::checkStopMe()
9283 
/**
 * Master-side node-failure handling of the stop-permission protocol:
 * if we were waiting for DIH_SWITCH_REPLICA from the failed node,
 * fabricate a DIH_SWITCH_REPLICA_REF to ourselves with error
 * NF_CausedAbortOfStopProcedure so the stop procedure is aborted.
 */
void Dbdih::checkStopPermMaster(Signal* signal, NodeRecordPtr failedNodePtr)
{
  DihSwitchReplicaRef* const ref = (DihSwitchReplicaRef*)&signal->theData[0];
  jam();
  if (c_DIH_SWITCH_REPLICA_REQ_Counter.isWaitingFor(failedNodePtr.i)){
    jam();
    ndbrequire(c_stopPermMaster.clientRef != 0);
    ref->senderNode = failedNodePtr.i;
    ref->errorCode = StopPermRef::NF_CausedAbortOfStopProcedure;
    sendSignal(reference(), GSN_DIH_SWITCH_REPLICA_REF, signal,
               DihSwitchReplicaRef::SignalLength, JBB);
    return;
  }//if
}//Dbdih::checkStopPermMaster()
9298 
/**
 * Proxy-side (non-master) node-failure handling of the stop-permission
 * protocol: if we have an active stop-perm client and the failed node
 * is the master we forwarded the request to, report
 * NF_CausedAbortOfStopProcedure back to the client and clear the proxy
 * state.
 */
void Dbdih::checkStopPermProxy(Signal* signal, NodeId failedNodeId)
{
  jam();
  if(c_stopPermProxy.clientRef != 0 &&
     refToNode(c_stopPermProxy.masterRef) == failedNodeId){

    /**
     * The master has failed report to proxy-client
     */
    jam();
    StopPermRef* const ref = (StopPermRef*)&signal->theData[0];

    ref->senderData = c_stopPermProxy.clientData;
    ref->errorCode  = StopPermRef::NF_CausedAbortOfStopProcedure;
    sendSignal(c_stopPermProxy.clientRef, GSN_STOP_PERM_REF, signal, 2, JBB);
    c_stopPermProxy.clientRef = 0;
  }//if
}//Dbdih::checkStopPermProxy()
9317 
/**
 * Master-side handling of an active take-over record whose starting
 * node has failed.  Action depends on the take-over state machine
 * (toMasterStatus):
 *  - idle / waiting-for-LCP / after-stored: release the record (and,
 *    where held, the node group's activeTakeOver reservation or the
 *    node's copyCompleted flag);
 *  - holding a mutex: abort the take-over;
 *  - merely queued on a mutex: do nothing -- the failure is detected
 *    when the lock is acquired.
 * Any other state is a fatal error.
 */
void
Dbdih::handleTakeOver(Signal* signal, TakeOverRecordPtr takeOverPtr)
{
  jam();
  switch(takeOverPtr.p->toMasterStatus){
  case TakeOverRecord::TO_MASTER_IDLE:
    jam();
    releaseTakeOver(takeOverPtr, true);
    return;
  case TakeOverRecord::TO_MUTEX_BEFORE_STORED:
    jam();
    /**
     * Waiting for lock...
     *   do nothing...will be detected when lock is acquired
     */
    return;
  case TakeOverRecord::TO_MUTEX_BEFORE_LOCKED:
    jam();
    /**
     * Has lock...and NGPtr reservation...
     */
    abortTakeOver(signal, takeOverPtr);
    return;
  case TakeOverRecord::TO_AFTER_STORED:{
    jam();
    /**
     * No lock...but NGPtr reservation...remove NGPtr reservation
     */
    NodeRecordPtr nodePtr;
    NodeGroupRecordPtr NGPtr;
    nodePtr.i = takeOverPtr.p->toCopyNode;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    NGPtr.i = nodePtr.p->nodeGroup;
    ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);

    // Debug builds assert the reservation is ours; release builds only
    // clear it when it actually matches the starting node.
    ndbassert(NGPtr.p->activeTakeOver == takeOverPtr.p->toStartingNode);
    if (NGPtr.p->activeTakeOver == takeOverPtr.p->toStartingNode)
    {
      jam();
      NGPtr.p->activeTakeOver = 0;
    }
    releaseTakeOver(takeOverPtr, true);
    return;
  }
  case TakeOverRecord::TO_MUTEX_BEFORE_COMMIT:
    jam();
    /**
     * Waiting for lock...
     *   do nothing...will be detected when lock is acquired
     */
    return;
  case TakeOverRecord::TO_MUTEX_BEFORE_SWITCH_REPLICA:
    jam();
    /**
     * Waiting for lock...
     *   do nothing...will be detected when lock is acquired
     */
    return;
  case TakeOverRecord::TO_MUTEX_AFTER_SWITCH_REPLICA:
    jam();
    abortTakeOver(signal, takeOverPtr);
    return;
  case TakeOverRecord::TO_WAIT_LCP:{
    jam();
    /**
     * Waiting for LCP
     */
    NodeRecordPtr nodePtr;
    nodePtr.i = takeOverPtr.p->toStartingNode;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    nodePtr.p->copyCompleted = 0;
    releaseTakeOver(takeOverPtr, true);
    return;
  }
  default:
    jamLine(takeOverPtr.p->toMasterStatus);
    ndbrequire(false);
  }
}
9397 
/**
 * Initialise tracking of when the failure of failedNodePtr is fully
 * handled by all blocks and all alive nodes.
 *
 * For every alive node we set a wait bit in the failed node's
 * m_NF_COMPLETE_REP mask.  For nodes that are themselves DYING and were
 * still expected to report completion for this failed node, we
 * fabricate the NF_COMPLETEREP on their behalf.  Finally the failed
 * node's status is advanced: DIED_NOW -> DYING, otherwise -> DEAD (the
 * node never participated in the failure protocols, so DIH's part is
 * reported complete immediately).
 */
void Dbdih::failedNodeSynchHandling(Signal* signal,
                                    NodeRecordPtr failedNodePtr)
{
  jam();
  /*----------------------------------------------------*/
  /*       INITIALISE THE VARIABLES THAT KEEP TRACK OF  */
  /*       WHEN A NODE FAILURE IS COMPLETED.            */
  /*----------------------------------------------------*/
  failedNodePtr.p->dbdictFailCompleted = ZFALSE;
  failedNodePtr.p->dbtcFailCompleted = ZFALSE;
  failedNodePtr.p->dbdihFailCompleted = ZFALSE;
  failedNodePtr.p->dblqhFailCompleted = ZFALSE;

  failedNodePtr.p->m_NF_COMPLETE_REP.clearWaitingFor();

  NodeRecordPtr nodePtr;
  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
    ptrAss(nodePtr, nodeRecord);
    if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
      jam();
      /**
       * We'r waiting for nodePtr.i to complete
       * handling of failedNodePtr.i's death
       */

      failedNodePtr.p->m_NF_COMPLETE_REP.setWaitingFor(nodePtr.i);
    } else {
      jam();
      if ((nodePtr.p->nodeStatus == NodeRecord::DYING) &&
          (nodePtr.p->m_NF_COMPLETE_REP.isWaitingFor(failedNodePtr.i))){
        jam();
        /*----------------------------------------------------*/
        /*       THE NODE FAILED BEFORE REPORTING THE FAILURE */
        /*       HANDLING COMPLETED ON THIS FAILED NODE.      */
        /*       REPORT THAT NODE FAILURE HANDLING WAS        */
        /*       COMPLETED ON THE NEW FAILED NODE FOR THIS    */
        /*       PARTICULAR OLD FAILED NODE.                  */
        /*----------------------------------------------------*/
        NFCompleteRep * const nf = (NFCompleteRep *)&signal->theData[0];
        nf->blockNo = 0;
        nf->nodeId  = failedNodePtr.i;
        nf->failedNodeId = nodePtr.i;
        nf->from    = __LINE__;
        sendSignal(reference(), GSN_NF_COMPLETEREP, signal,
                   NFCompleteRep::SignalLength, JBB);
      }//if
    }//if
  }//for
  if (failedNodePtr.p->nodeStatus == NodeRecord::DIED_NOW) {
    jam();
    failedNodePtr.p->nodeStatus = NodeRecord::DYING;
  } else {
    jam();
    /*----------------------------------------------------*/
    // No more processing needed when node not even started
    // yet. We give the node status to DEAD since we do not
    // care whether all nodes complete the node failure
    // handling. The node have not been included in the
    // node failure protocols.
    /*----------------------------------------------------*/
    failedNodePtr.p->nodeStatus = NodeRecord::DEAD;
    /**-----------------------------------------------------------------------
     * WE HAVE COMPLETED HANDLING THE NODE FAILURE IN DIH. WE CAN REPORT THIS
     * TO DIH THAT WAIT FOR THE OTHER BLOCKS TO BE CONCLUDED AS WELL.
     *-----------------------------------------------------------------------*/
    NFCompleteRep * const nf = (NFCompleteRep *)&signal->theData[0];
    nf->blockNo      = DBDIH;
    nf->nodeId       = cownNodeId;
    nf->failedNodeId = failedNodePtr.i;
    nf->from         = __LINE__;
    sendSignal(reference(), GSN_NF_COMPLETEREP, signal,
               NFCompleteRep::SignalLength, JBB);
  }//if
}//Dbdih::failedNodeSynchHandling()
9472 
/**
 * Search the master's active take-over list for a record whose starting
 * node is failedNodeId.
 *
 * @param ptr  out: the matching record, or set to null when not found
 * @return true when a matching record exists
 */
bool
Dbdih::findTakeOver(Ptr<TakeOverRecord> & ptr, Uint32 failedNodeId)
{
  for (c_masterActiveTakeOverList.first(ptr); !ptr.isNull();
       c_masterActiveTakeOverList.next(ptr))
  {
    jam();
    if (ptr.p->toStartingNode == failedNodeId)
    {
      jam();
      return true;
    }
  }
  ptr.setNull();
  return false;
}//Dbdih::findTakeOver()
9489 
/**
 * LCP-related node-failure handling for one failed node.
 *
 * On the master, downgrade the failed node's activeStatus to record
 * that it missed a local checkpoint (this status is copied to other
 * nodes, so only the master may do it -- see Bug#28717).  Then remove
 * the node from the LCP participant sets and fabricate, on the failed
 * node's behalf, every LCP/TC-related reply this node may still be
 * waiting for: LCP_COMPLETE_REP (DIH and LQH variants), TCGETOPSIZECONF,
 * TC_CLOPSIZECONF, START_LCP_CONF, EMPTY_LCP_CONF and MASTER_LCPREF.
 */
void Dbdih::failedNodeLcpHandling(Signal* signal, NodeRecordPtr failedNodePtr)
{
  jam();
  const Uint32 nodeId = failedNodePtr.i;

  if (isMaster() && c_lcpState.m_participatingLQH.get(failedNodePtr.i))
  {
    /*----------------------------------------------------*/
    /*  THE NODE WAS INVOLVED IN A LOCAL CHECKPOINT. WE   */
    /* MUST UPDATE THE ACTIVE STATUS TO INDICATE THAT     */
    /* THE NODE HAVE MISSED A LOCAL CHECKPOINT.           */
    /*----------------------------------------------------*/

    /**
     * Bug#28717, Only master should do this, as this status is copied
     *   to other nodes
     */
    switch (failedNodePtr.p->activeStatus) {
    case Sysfile::NS_Active:
      jam();
      failedNodePtr.p->activeStatus = Sysfile::NS_ActiveMissed_1;
      break;
    case Sysfile::NS_ActiveMissed_1:
      jam();
      // NOTE(review): maps Missed_1 -> Missed_1 (not Missed_2); verify
      // this is the intended progression of the missed-LCP states.
      failedNodePtr.p->activeStatus = Sysfile::NS_ActiveMissed_1;
      break;
    case Sysfile::NS_ActiveMissed_2:
      jam();
      failedNodePtr.p->activeStatus = Sysfile::NS_NotActive_NotTakenOver;
      break;
    case Sysfile::NS_TakeOver:
      jam();
      failedNodePtr.p->activeStatus = Sysfile::NS_NotActive_NotTakenOver;
      break;
    case Sysfile::NS_Configured:
      jam();
      break;
    default:
      g_eventLogger->error("activeStatus = %u "
                           "at failure after NODE_FAILREP of node = %u",
                           (Uint32) failedNodePtr.p->activeStatus,
                           failedNodePtr.i);
      ndbrequire(false);
      break;
    }//switch
  }//if

  c_lcpState.m_participatingDIH.clear(failedNodePtr.i);
  c_lcpState.m_participatingLQH.clear(failedNodePtr.i);

  // Captured BEFORE any fabricated signals below: were we waiting for
  // MASTER_LCPREQ from the failed node?
  bool wf = c_MASTER_LCPREQ_Counter.isWaitingFor(failedNodePtr.i);

  if(c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(failedNodePtr.i))
  {
    jam();
    /**
     * Mark the signal as a special signal to distinguish it from a signal
     * that arrives from time queue for a dead node that should not be
     * handled. The marking here makes it known to the LCP_COMPLETE_REP
     * that this is a special node failure handling signal which should
     * be allowed to pass through although the node is dead.
     */
    LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
    rep->nodeId = failedNodePtr.i;
    rep->lcpId = SYSFILE->latestLCP_ID;
    rep->blockNo = DBDIH;
    rep->fromTQ = 0;
    sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal,
               LcpCompleteRep::SignalLengthTQ, JBB);
  }

  bool lcp_complete_rep = false;
  if (!wf)
  {
    jam();

    /**
     * Check if we're waiting for the failed node's LQH to complete
     *
     * Note that this is ran "before" LCP master take over
     */
    if(c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId)){
      jam();

      lcp_complete_rep = true;
      LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
      rep->nodeId  = nodeId;
      rep->lcpId   = SYSFILE->latestLCP_ID;
      rep->blockNo = DBLQH;
      rep->fromTQ = 0;
      sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal,
                 LcpCompleteRep::SignalLengthTQ, JBB);

      if(c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodeId)){
        jam();
        /**
         * Make sure we're ready to accept it
         */
        c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(nodeId);
      }
    }
  }

  if (c_TCGETOPSIZEREQ_Counter.isWaitingFor(failedNodePtr.i)) {
    jam();
    signal->theData[0] = failedNodePtr.i;
    signal->theData[1] = 0;
    sendSignal(reference(), GSN_TCGETOPSIZECONF, signal, 2, JBB);
  }//if

  if (c_TC_CLOPSIZEREQ_Counter.isWaitingFor(failedNodePtr.i)) {
    jam();
    signal->theData[0] = failedNodePtr.i;
    sendSignal(reference(), GSN_TC_CLOPSIZECONF, signal, 1, JBB);
  }//if

  if (c_START_LCP_REQ_Counter.isWaitingFor(failedNodePtr.i)) {
    jam();
    StartLcpConf * conf = (StartLcpConf*)signal->getDataPtrSend();
    conf->senderRef = numberToRef(DBLQH, failedNodePtr.i);
    conf->lcpId = SYSFILE->latestLCP_ID;
    sendSignal(reference(), GSN_START_LCP_CONF, signal,
               StartLcpConf::SignalLength, JBB);
  }//if

  // The goto below re-enters this point after setting the wait bit, so
  // the fabricated EMPTY_LCP_CONF is emitted exactly once.
dosend:
  if (c_EMPTY_LCP_REQ_Counter.isWaitingFor(failedNodePtr.i))
  {
    jam();
    EmptyLcpConf * const rep = (EmptyLcpConf *)&signal->theData[0];
    rep->senderNodeId = failedNodePtr.i;
    rep->tableId = ~0;
    rep->fragmentId = ~0;
    rep->lcpNo = 0;
    rep->lcpId = SYSFILE->latestLCP_ID;
    rep->idle = true;
    sendSignal(reference(), GSN_EMPTY_LCP_CONF, signal,
               EmptyLcpConf::SignalLength, JBB);
  }
  else if (!c_EMPTY_LCP_REQ_Counter.done() && lcp_complete_rep)
  {
    jam();
    c_EMPTY_LCP_REQ_Counter.setWaitingFor(failedNodePtr.i);
    goto dosend;
  }

  if (c_MASTER_LCPREQ_Counter.isWaitingFor(failedNodePtr.i)) {
    jam();
    MasterLCPRef * const ref = (MasterLCPRef *)&signal->theData[0];
    ref->senderNodeId = failedNodePtr.i;
    ref->failedNodeId = cmasterTakeOverNode;
    sendSignal(reference(), GSN_MASTER_LCPREF, signal,
               MasterLCPRef::SignalLength, JBB);
  }//if

}//Dbdih::failedNodeLcpHandling()
9646 
/**
 * A node has failed while GCP protocol rounds may still be waiting for
 * replies from it.  For every GCP-related signal counter that is waiting
 * for the failed node, fabricate the reply the node would have sent
 * (CONF/REF/ACK sent to ourselves) so that the protocol round can
 * complete without the dead node.
 *
 * @param signal        signal object to reuse for the faked replies
 * @param failedNodeId  node id of the failed node
 */
void Dbdih::checkGcpOutstanding(Signal* signal, Uint32 failedNodeId){
  if (c_GCP_PREPARE_Counter.isWaitingFor(failedNodeId)){
    jam();
    /* Fake a GCP_PREPARECONF from the failed node for the GCI the
     * master is currently preparing. */
    GCPPrepareConf* conf = (GCPPrepareConf*)signal->getDataPtrSend();
    conf->nodeId = failedNodeId;
    conf->gci_hi = Uint32(m_micro_gcp.m_master.m_new_gci >> 32);
    conf->gci_lo = Uint32(m_micro_gcp.m_master.m_new_gci);
    sendSignal(reference(), GSN_GCP_PREPARECONF, signal,
               GCPPrepareConf::SignalLength, JBB);
  }//if

  if (c_GCP_COMMIT_Counter.isWaitingFor(failedNodeId))
  {
    jam();
    /* Record minimum failure number, will cause re-send of
     * GCP_NOMORETRANS if local GCP_NODEFINISH arrives before
     * TC has handled the failure.
     */
    cMinTcFailNo = cfailurenr;

    /**
     * Waiting for GSN_GCP_NODEFINISH
     *   TC-take-over can generate new transactions
     *   that will be in this epoch
     *   re-run GCP_NOMORETRANS to master-TC (self) that will run
     *   take-over
     */
    c_GCP_COMMIT_Counter.clearWaitingFor(failedNodeId);

    /* Check to see whether we have already received GCP_NODEFINISH
     * from the local (Master) TC instance
     */
    if (!c_GCP_COMMIT_Counter.isWaitingFor(getOwnNodeId()))
    {
      jam();
      /* Already received GCP_NODEFINISH for this GCI, must
       * resend GCP_NOMORETRANS request now.
       * Otherwise we will re-send it when GCP_NODEFINISH
       * arrives.
       */
      c_GCP_COMMIT_Counter.setWaitingFor(getOwnNodeId());
      /* Reset DIH GCP state */
      m_micro_gcp.m_state = MicroGcp::M_GCP_COMMIT;

      GCPNoMoreTrans* req = (GCPNoMoreTrans*)signal->getDataPtrSend();
      req->senderRef = reference();
      req->senderData = m_micro_gcp.m_master_ref;
      req->gci_hi = Uint32(m_micro_gcp.m_old_gci >> 32);
      req->gci_lo = Uint32(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
      sendSignal(clocaltcblockref, GSN_GCP_NOMORETRANS, signal,
                 GCPNoMoreTrans::SignalLength, JBB);
    }
  }

  if (c_GCP_SAVEREQ_Counter.isWaitingFor(failedNodeId)) {
    jam();
    /* Fake a GCP_SAVEREF; the error code marks it as synthesized
     * due to node failure. */
    GCPSaveRef * const saveRef = (GCPSaveRef*)&signal->theData[0];
    saveRef->dihPtr = failedNodeId;
    saveRef->nodeId = failedNodeId;
    saveRef->gci    = m_gcp_save.m_master.m_new_gci;
    saveRef->errorCode = GCPSaveRef::FakedSignalDueToNodeFailure;
    sendSignal(reference(), GSN_GCP_SAVEREF, signal,
               GCPSaveRef::SignalLength, JBB);
  }//if

  if (c_COPY_GCIREQ_Counter.isWaitingFor(failedNodeId)) {
    jam();
    /* Fake a COPY_GCICONF on behalf of the failed node. */
    signal->theData[0] = failedNodeId;
    sendSignal(reference(), GSN_COPY_GCICONF, signal, 1, JBB);
  }//if

  if (c_MASTER_GCPREQ_Counter.isWaitingFor(failedNodeId)){
    jam();
    /* Fake a MASTER_GCPREF so the GCP master take-over round
     * does not stall on the failed node. */
    MasterGCPRef * const ref = (MasterGCPRef *)&signal->theData[0];
    ref->senderNodeId = failedNodeId;
    ref->failedNodeId = cmasterTakeOverNode;
    sendSignal(reference(), GSN_MASTER_GCPREF, signal,
               MasterGCPRef::SignalLength, JBB);
  }//if

  if (c_SUB_GCP_COMPLETE_REP_Counter.isWaitingFor(failedNodeId))
  {
    jam();
    /* Fake the SUB_GCP_COMPLETE_ACK the failed node's DBDIH
     * would have returned. */
    SubGcpCompleteAck* ack = CAST_PTR(SubGcpCompleteAck,
                                      signal->getDataPtrSend());
    ack->rep.senderRef = numberToRef(DBDIH, failedNodeId);
    sendSignal(reference(), GSN_SUB_GCP_COMPLETE_ACK, signal,
               SubGcpCompleteAck::SignalLength, JBB);
  }
}
9737 
9738 /**
9739  * This function checks if any node is started that doesn't support the
9740  * functionality to remove the need of the EMPTY_LCP_REQ protocol.
9741  */
check_if_empty_lcp_needed(void)9742 bool Dbdih::check_if_empty_lcp_needed(void)
9743 {
9744   NodeRecordPtr specNodePtr;
9745   specNodePtr.i = cfirstAliveNode;
9746   do
9747   {
9748     jam();
9749     if (getNodeInfo(specNodePtr.i).m_version < NDBD_EMPTY_LCP_NOT_NEEDED)
9750     {
9751       jam();
9752       return true;
9753     }
9754     ptrCheckGuard(specNodePtr, MAX_NDB_NODES, nodeRecord);
9755     specNodePtr.i = specNodePtr.p->nextNode;
9756   } while (specNodePtr.i != RNIL);
9757   return false;
9758 }
9759 
/**
 * Take over the LCP protocol as new master after the previous master
 * (nodeId) failed.  Two paths exist:
 *  - new protocol (all nodes >= NDBD_EMPTY_LCP_NOT_NEEDED): restart
 *    LCP_FRAG_ORD distribution from table 0 / fragment 0 and wait for
 *    LCP_FRAG_REPs;
 *  - old protocol: run the EMPTY_LCP_REQ round towards all alive nodes
 *    first to drain outstanding LCP_FRAG_ORDs.
 */
void
Dbdih::startLcpMasterTakeOver(Signal* signal, Uint32 nodeId)
{
  jam();

  if (ERROR_INSERTED(7230))
  {
    /* Test hook: suppress LCP master take-over entirely. */
    return;
  }

  Uint32 oldNode = c_lcpMasterTakeOverState.failedNodeId;

  /* If a previously failed node still has its NF_LCP_TAKE_OVER node-fail
   * step pending, complete that step before recording the new failure. */
  NodeRecordPtr nodePtr;
  nodePtr.i = oldNode;
  if (oldNode > 0 && oldNode < MAX_NDB_NODES)
  {
    jam();
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    if (nodePtr.p->m_nodefailSteps.get(NF_LCP_TAKE_OVER))
    {
      jam();
      checkLocalNodefailComplete(signal, oldNode, NF_LCP_TAKE_OVER);
    }
  }

  c_lcpMasterTakeOverState.use_empty_lcp = check_if_empty_lcp_needed();
  if (!c_lcpMasterTakeOverState.use_empty_lcp)
  {
    jam();
    /**
     * As of NDBD_EMPTY_LCP_PROTOCOL_NOT_NEEDED version this is the
     * normal path through the code.
     *
     * We now ensures that LQH keeps track of which LCP_FRAG_ORD it has
     * received. So this means that we can be a bit more sloppy in master
     * take over. We need not worry if we resend LCP_FRAG_ORD since LQH will
     * simply drop it.
     *
     * So when we are done with the master take over we will simply start from
     * scratch from the first table and fragment. We have sufficient
     * information locally in the new master to skip resending all fragment
     * replicas where we already received LCP_FRAG_REP. For those where we sent
     * LCP_FRAG_ORD but not received LCP_FRAG_REP we simply send it again. If
     * it was sent before then LQH will discover it and drop it.
     *
     * We also don't need to worry about sending too many LCP_FRAG_ORDs to LQH
     * since we can send it for all fragment replicas given that we use the
     * fragment record as the queueing record. So in practice the queue is
     * always large enough.
     *
     * For old nodes we still have to run the EMPTY_LCP_REQ protocol to
     * ensure that all outstanding LCP_FRAG_ORD have come back to all
     * DBDIHs as LCP_FRAG_REPs to ensure that every DBDIH has a complete
     * understanding of the LCP state and can take it over. What we do here
     * is that if one node is old, then we run the old take over protocol
     * for all nodes to not mess the code up too much. Theoretically it
     * would suffice to send EMPTY_LCP_REQ to only old nodes, but we won't
     * handle this, we will simply run the old code as it was.
     */
    c_lcpMasterTakeOverState.minTableId = 0;
    c_lcpMasterTakeOverState.minFragId = 0;
    c_lcpMasterTakeOverState.failedNodeId = nodeId;
    c_lcpMasterTakeOverState.set(LMTOS_WAIT_LCP_FRAG_REP, __LINE__);
    setLocalNodefailHandling(signal, nodeId, NF_LCP_TAKE_OVER);
    checkEmptyLcpComplete(signal);
    return;
  }

  /* Old protocol: start a full EMPTY_LCP_REQ round.  minTableId/minFragId
   * are set to ~0 so any reported table/fragment will lower them. */
  c_lcpMasterTakeOverState.minTableId = ~0;
  c_lcpMasterTakeOverState.minFragId = ~0;
  c_lcpMasterTakeOverState.failedNodeId = nodeId;
  c_lcpMasterTakeOverState.set(LMTOS_WAIT_EMPTY_LCP, __LINE__);

  EmptyLcpReq* req = (EmptyLcpReq*)signal->getDataPtrSend();
  req->senderRef = reference();
  {
    /* Send EMPTY_LCP_REQ to every alive node not already accounted for
     * in the counter. */
    NodeRecordPtr specNodePtr;
    specNodePtr.i = cfirstAliveNode;
    do {
      jam();
      ptrCheckGuard(specNodePtr, MAX_NDB_NODES, nodeRecord);
      if (!c_EMPTY_LCP_REQ_Counter.isWaitingFor(specNodePtr.i))
      {
        jam();
        c_EMPTY_LCP_REQ_Counter.setWaitingFor(specNodePtr.i);
        sendEMPTY_LCP_REQ(signal, specNodePtr.i, 0);
        if (c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(specNodePtr.i))
        {
          jam();
          c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor();
        }
      }
      specNodePtr.i = specNodePtr.p->nextNode;
    } while (specNodePtr.i != RNIL);
  }
  setLocalNodefailHandling(signal, nodeId, NF_LCP_TAKE_OVER);
}
9857 
/**
 * Start GCP master take-over after the old master (oldMasterId) failed
 * and this node was elected new master: query all nodes for their GCP
 * protocol state via MASTER_GCPREQ.
 */
void Dbdih::startGcpMasterTakeOver(Signal* signal, Uint32 oldMasterId){
  jam();
  /*--------------------------------------------------*/
  /*                                                  */
  /*       THE MASTER HAVE FAILED AND WE WERE ELECTED */
  /*       TO BE THE NEW MASTER NODE. WE NEED TO QUERY*/
  /*       ALL THE OTHER NODES ABOUT THEIR STATUS IN  */
  /*       ORDER TO BE ABLE TO TAKE OVER CONTROL OF   */
  /*       THE GLOBAL CHECKPOINT PROTOCOL AND THE     */
  /*       LOCAL CHECKPOINT PROTOCOL.                 */
  /*--------------------------------------------------*/
  if(!isMaster()){
    jam();
    /* Only the elected master runs the take-over. */
    return;
  }
  cmasterState = MASTER_TAKE_OVER_GCP;
  cmasterTakeOverNode = oldMasterId;
  MasterGCPReq * const req = (MasterGCPReq *)&signal->theData[0];
  req->masterRef = reference();
  req->failedNodeId = oldMasterId;
  /* Broadcast MASTER_GCPREQ to all participating nodes. */
  sendLoopMacro(MASTER_GCPREQ, sendMASTER_GCPREQ, RNIL);

  signal->theData[0] = NDB_LE_GCP_TakeoverStarted;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 1, JBB);

  /**
   * save own value...
   *   to be able to check values returned in MASTER_GCPCONF
   */
  m_gcp_save.m_master.m_new_gci = m_gcp_save.m_gci;

  setLocalNodefailHandling(signal, oldMasterId, NF_GCP_TAKE_OVER);
}//Dbdih::startGcpMasterTakeOver()
9891 
/**
 * Start removing a freshly failed node's replicas from all table
 * descriptions, by kicking off the ZREMOVE_NODE_FROM_TABLE CONTINUEB
 * loop (unless suppressed by error-insert test hooks).
 */
void Dbdih::startRemoveFailedNode(Signal* signal, NodeRecordPtr failedNodePtr)
{
  Uint32 nodeId = failedNodePtr.i;
  if(failedNodePtr.p->nodeStatus != NodeRecord::DIED_NOW){
    jam();
    /**
     * The node wasn't alive. It can't be part of an LCP.
     */
    ndbrequire(!c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId));

    /**
     * And there is no point in removing any replicas
     *   It's dead...
     */
    return;
  }

  /**
   * If node has not completed the LCP
   *   we need to remove it as undo might not be complete
   *   bug#31257
   */
  failedNodePtr.p->m_remove_node_from_table_lcp_id = RNIL;
  if (c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(failedNodePtr.i))
  {
    jam();
    /* Node died mid-LCP: remember the LCP id so its partial LCP
     * results can be invalidated during removal. */
    failedNodePtr.p->m_remove_node_from_table_lcp_id = SYSFILE->latestLCP_ID;
  }

  jam();

  if (!ERROR_INSERTED(7194) && !ERROR_INSERTED(7221))
  {
    /* Normal path: start (or, with 7233, delay) the table scan that
     * removes the node from each table record. */
    signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
    signal->theData[1] = failedNodePtr.i;
    signal->theData[2] = 0; // Tab id
    if (!ERROR_INSERTED(7233))
      sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
    else
      sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 300, 3);
  }
  else
  {
    /* Test hooks 7194/7221: deliberately skip starting the removal. */
    if (ERROR_INSERTED(7194))
    {
      ndbout_c("7194 Not starting ZREMOVE_NODE_FROM_TABLE");
    }
    else if (ERROR_INSERTED(7221))
    {
      ndbout_c("7221 Not starting ZREMOVE_NODE_FROM_TABLE");
    }
  }

  setLocalNodefailHandling(signal, failedNodePtr.i, NF_REMOVE_NODE_FROM_TABLE);
}//Dbdih::startRemoveFailedNode()
9947 
/**
 * Called while processing MASTER_GCPREQ: make sure any COPY_GCIREQ
 * activity started by the failed master has completed before the new
 * master is allowed to restart the GCP protocols.
 *
 * @return true if MASTER_GCPREQ was re-queued with delay (caller must
 *         return), false if it is safe to proceed.
 */
bool Dbdih::handle_master_take_over_copy_gci(Signal *signal, NodeId new_master_node_id)
{
  if (c_copyGCISlave.m_expectedNextWord != 0)
  {
    jam();
    /* A fragmented COPY_GCIREQ from the failed master was only partially
     * received; discard the partial state. */
    c_copyGCISlave.m_expectedNextWord = 0;
    c_copyGCISlave.m_copyReason = CopyGCIReq::IDLE;
  }

  if (c_copyGCISlave.m_copyReason != CopyGCIReq::IDLE)
  {
    /**
     * Before we allow the new master to start up the new GCP protocols
     * we need to ensure that the activity started by the previous
     * failed master is completed before we process the master takeover.
     * By enforcing this in MASTER_GCPREQ and MASTER_LCPREQ we are
     * certain that the master takeover is ready to start up the new
     * COPY_GCIREQ protocols.
     */
    sendSignalWithDelay(reference(), GSN_MASTER_GCPREQ,
                        signal, 10, MasterGCPReq::SignalLength);
    return true;
  }
  /* Record which new master we completed this handling for. */
  c_handled_master_take_over_copy_gci = new_master_node_id;
  return false;
}
9974 
9975 /*--------------------------------------------------*/
9976 /*       THE MASTER HAS FAILED AND THE NEW MASTER IS*/
9977 /*       QUERYING THIS NODE ABOUT THE STATE OF THE  */
9978 /*       GLOBAL CHECKPOINT PROTOCOL                 */
9979 /*--------------------------------------------------*/
/**
 * Handle MASTER_GCPREQ from the new master: report this node's view of
 * the micro-GCP and GCP-save protocol state in a MASTER_GCPCONF, after
 * ensuring the local node-failure processing and any COPY_GCIREQ from
 * the failed master have completed.
 */
void Dbdih::execMASTER_GCPREQ(Signal* signal)
{
  NodeRecordPtr failedNodePtr;
  NodeRecordPtr newMasterNodePtr;
  MasterGCPReq * const masterGCPReq = (MasterGCPReq *)&signal->theData[0];
  jamEntry();
  const BlockReference newMasterBlockref = masterGCPReq->masterRef;
  const Uint32 failedNodeId = masterGCPReq->failedNodeId;

  failedNodePtr.i = failedNodeId;
  ptrCheckGuard(failedNodePtr, MAX_NDB_NODES, nodeRecord);
  newMasterNodePtr.i = refToNode(newMasterBlockref);
  ptrCheckGuard(newMasterNodePtr, MAX_NDB_NODES, nodeRecord);

  if (newMasterNodePtr.p->nodeStatus != NodeRecord::ALIVE)
  {
    /**
     * We delayed the MASTER_GCPREQ signal and now it arrived after
     * the new master already died. We ignore this signal.
     */
#ifdef VM_TRACE
    g_eventLogger->info("Dropped MASTER_GCPREQ from node %u",
                        newMasterNodePtr.i);
#endif
    jam();
    return;
  }

  if (failedNodePtr.p->nodeStatus == NodeRecord::ALIVE) {
    jam();
    /*--------------------------------------------------*/
    /*       ENSURE THAT WE HAVE PROCESSED THE SIGNAL   */
    /*       NODE_FAILURE BEFORE WE PROCESS THIS REQUEST*/
    /*       FROM THE NEW MASTER. THIS ENSURES THAT WE  */
    /*       HAVE REMOVED THE FAILED NODE FROM THE LIST */
    /*       OF ACTIVE NODES AND SO FORTH.              */
    /*--------------------------------------------------*/
    sendSignalWithDelay(reference(), GSN_MASTER_GCPREQ,
                        signal, 10, MasterGCPReq::SignalLength);
    return;
  } else {
    ndbrequire(failedNodePtr.p->nodeStatus == NodeRecord::DYING);
  }//if

  /* May re-queue this MASTER_GCPREQ with delay until any COPY_GCIREQ
   * started by the failed master has finished. */
  if (handle_master_take_over_copy_gci(signal, newMasterNodePtr.i))
  {
    return;
  }
#ifdef VM_TRACE
  g_eventLogger->info("Handle MASTER_GCPREQ from node %u",
                      newMasterNodePtr.i);
#endif
  if (ERROR_INSERTED(7181))
  {
    /* Test hook: simulate GCP_TCFINISHED arriving before the CONF
     * is composed. */
    ndbout_c("execGCP_TCFINISHED in MASTER_GCPREQ");
    CLEAR_ERROR_INSERT_VALUE;
    signal->theData[0] = c_error_7181_ref;
    signal->theData[1] = (Uint32)(m_micro_gcp.m_old_gci >> 32);
    signal->theData[2] = (Uint32)(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
    signal->theData[3] = cfailurenr;
    execGCP_TCFINISHED(signal);
  }

  /* Map the local micro-GCP state onto the MASTER_GCPCONF encoding. */
  MasterGCPConf::State gcpState;
  switch(m_micro_gcp.m_state){
  case MicroGcp::M_GCP_IDLE:
    jam();
    gcpState = MasterGCPConf::GCP_READY;
    break;
  case MicroGcp::M_GCP_PREPARE:
    jam();
    gcpState = MasterGCPConf::GCP_PREPARE_RECEIVED;
    break;
  case MicroGcp::M_GCP_COMMIT:
    jam();
    gcpState = MasterGCPConf::GCP_COMMIT_RECEIVED;
    break;
  case MicroGcp::M_GCP_COMMITTED:
    jam();
    gcpState = MasterGCPConf::GCP_COMMITTED;

    /**
     * Change state to GCP_COMMIT_RECEIVED and rerun GSN_GCP_NOMORETRANS
     * towards the local TC, so the new master sees a state from which
     * the commit round can be restarted.
     */
    gcpState = MasterGCPConf::GCP_COMMIT_RECEIVED;
    m_micro_gcp.m_state = MicroGcp::M_GCP_COMMIT;

    {
      GCPNoMoreTrans* req2 = (GCPNoMoreTrans*)signal->getDataPtrSend();
      req2->senderRef = reference();
      req2->senderData = m_micro_gcp.m_master_ref;
      req2->gci_hi = (Uint32)(m_micro_gcp.m_old_gci >> 32);
      req2->gci_lo = (Uint32)(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
      sendSignal(clocaltcblockref, GSN_GCP_NOMORETRANS, signal,
                 GCPNoMoreTrans::SignalLength, JBB);
    }
    break;
  case MicroGcp::M_GCP_COMPLETE:
    /**
     * This is a master only state...
     */
    ndbrequire(false);
  }

  /* Map the local GCP-save state onto the MASTER_GCPCONF encoding. */
  MasterGCPConf::SaveState saveState;
  switch(m_gcp_save.m_state){
  case GcpSave::GCP_SAVE_IDLE:
    jam();
    saveState = MasterGCPConf::GCP_SAVE_IDLE;
    break;
  case GcpSave::GCP_SAVE_REQ:
    jam();
    saveState = MasterGCPConf::GCP_SAVE_REQ;
    break;
  case GcpSave::GCP_SAVE_CONF:
    jam();
    saveState = MasterGCPConf::GCP_SAVE_CONF;
    break;
  case GcpSave::GCP_SAVE_COPY_GCI:
    jam();
    saveState = MasterGCPConf::GCP_SAVE_COPY_GCI;
    break;
  }

  /* Compose MASTER_GCPCONF with state, GCIs, LCP id and sysfile info. */
  MasterGCPConf * const masterGCPConf = (MasterGCPConf *)&signal->theData[0];
  masterGCPConf->gcpState  = gcpState;
  masterGCPConf->senderNodeId = cownNodeId;
  masterGCPConf->failedNodeId = failedNodeId;
  masterGCPConf->newGCP_hi = (Uint32)(m_micro_gcp.m_new_gci >> 32);
  masterGCPConf->latestLCP = SYSFILE->latestLCP_ID;
  masterGCPConf->oldestRestorableGCI = SYSFILE->oldestRestorableGCI;
  masterGCPConf->keepGCI = SYSFILE->keepGCI;
  masterGCPConf->newGCP_lo = Uint32(m_micro_gcp.m_new_gci);
  masterGCPConf->saveState = saveState;
  masterGCPConf->saveGCI = m_gcp_save.m_gci;
  for(Uint32 i = 0; i < NdbNodeBitmask::Size; i++)
    masterGCPConf->lcpActive[i] = SYSFILE->lcpActive[i];

  if (ERROR_INSERTED(7225))
  {
    /* Test hook: delay our own CONF (only valid when we are the new
     * master ourselves). */
    CLEAR_ERROR_INSERT_VALUE;
    ndbrequire(refToNode(newMasterBlockref) == getOwnNodeId());
    sendSignalWithDelay(newMasterBlockref, GSN_MASTER_GCPCONF, signal,
                        500, MasterGCPConf::SignalLength);
  }
  else
  {
    sendSignal(newMasterBlockref, GSN_MASTER_GCPCONF, signal,
               MasterGCPConf::SignalLength, JBB);
  }

  if (ERROR_INSERTED(7182))
  {
    /* Test hook: simulate GCP_TCFINISHED arriving just after the CONF
     * was sent. */
    ndbout_c("execGCP_TCFINISHED in MASTER_GCPREQ");
    CLEAR_ERROR_INSERT_VALUE;
    signal->theData[0] = c_error_7181_ref;
    signal->theData[1] = (Uint32)(m_micro_gcp.m_old_gci >> 32);
    signal->theData[2] = (Uint32)(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
    signal->theData[3] = cfailurenr;
    execGCP_TCFINISHED(signal);
  }
}//Dbdih::execMASTER_GCPREQ()
10142 
/**
 * New master: collect a MASTER_GCPCONF reply and merge the sender's
 * GCP/GCP-save state into the master's own protocol state.  When all
 * replies are in (receiveLoopMacro), resume the GCP protocols via
 * MASTER_GCPhandling().
 */
void Dbdih::execMASTER_GCPCONF(Signal* signal)
{
  NodeRecordPtr senderNodePtr;
  MasterGCPConf * const masterGCPConf = (MasterGCPConf *)&signal->theData[0];
  jamEntry();
  senderNodePtr.i = masterGCPConf->senderNodeId;
  ptrCheckGuard(senderNodePtr, MAX_NDB_NODES, nodeRecord);

#ifdef VM_TRACE
  g_eventLogger->info("MASTER_GCPCONF from node %u", senderNodePtr.i);
#endif

  /* Unpack the sender's reported state. */
  MasterGCPConf::State gcpState = (MasterGCPConf::State)masterGCPConf->gcpState;
  MasterGCPConf::SaveState saveState =
    (MasterGCPConf::SaveState)masterGCPConf->saveState;
  const Uint32 failedNodeId = masterGCPConf->failedNodeId;
  const Uint32 newGcp_hi = masterGCPConf->newGCP_hi;
  const Uint32 newGcp_lo = masterGCPConf->newGCP_lo;
  Uint64 newGCI = newGcp_lo | (Uint64(newGcp_hi) << 32);
  const Uint32 latestLcpId = masterGCPConf->latestLCP;
  const Uint32 oldestRestorableGci = masterGCPConf->oldestRestorableGCI;
  const Uint32 oldestKeepGci = masterGCPConf->keepGCI;
  const Uint32 saveGCI = masterGCPConf->saveGCI;

  if (latestLcpId > SYSFILE->latestLCP_ID) {
    jam();
    /* Sender knows about a newer LCP: adopt its sysfile GCI/lcpActive
     * info.  NOTE: updating latestLCP_ID itself is deliberately
     * disabled (the #if 0 block below). */
#if 0
    g_eventLogger->info("Dbdih: Setting SYSFILE->latestLCP_ID to %d",
                        latestLcpId);
    SYSFILE->latestLCP_ID = latestLcpId;
#endif
    SYSFILE->keepGCI = oldestKeepGci;
    SYSFILE->oldestRestorableGCI = oldestRestorableGci;
    for(Uint32 i = 0; i < NdbNodeBitmask::Size; i++)
      SYSFILE->lcpActive[i] = masterGCPConf->lcpActive[i];
  }//if

  /* Merge the sender's micro-GCP state into the master state. */
  bool ok = false;
  switch (gcpState) {
  case MasterGCPConf::GCP_READY:
    jam();
    ok = true;
    // Either not started or complete...
    break;
  case MasterGCPConf::GCP_PREPARE_RECEIVED:
    jam();
    ok = true;
    if (m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_IDLE)
    {
      jam();
      m_micro_gcp.m_master.m_state = MicroGcp::M_GCP_PREPARE;
      m_micro_gcp.m_master.m_new_gci = newGCI;
    }
    else
    {
      jam();
      /* Some other node already reported this GCI; they must agree. */
      ndbrequire(m_micro_gcp.m_master.m_new_gci == newGCI);
    }
    break;
  case MasterGCPConf::GCP_COMMIT_RECEIVED:
    jam();
    // Intentional fallthrough: COMMIT_RECEIVED and COMMITTED are
    // handled identically (both force a re-run of the commit round).
  case MasterGCPConf::GCP_COMMITTED:
    jam();
    ok = true;
    if (m_micro_gcp.m_master.m_state != MicroGcp::M_GCP_IDLE)
    {
      ndbrequire(m_micro_gcp.m_master.m_new_gci == newGCI);
    }
    m_micro_gcp.m_master.m_new_gci = newGCI;
    m_micro_gcp.m_master.m_state = MicroGcp::M_GCP_COMMIT;
    break;
#ifndef VM_TRACE
  default:
    jamLine(gcpState);
    ndbrequire(false);
#endif
  }
  ndbassert(ok); // Unhandled case...

  /* NOTE(review): 'ok' is reset here but never set in the saveState
   * switch below, and the matching ndbassert is commented out — the
   * flag is effectively unused for the save-state merge. */
  ok = false;
  /**
   * GCI should differ with atmost one
   */
  ndbrequire(saveGCI == m_gcp_save.m_gci ||
             saveGCI == m_gcp_save.m_gci + 1 ||
             saveGCI + 1 == m_gcp_save.m_gci);
  if (saveGCI > m_gcp_save.m_master.m_new_gci)
  {
    jam();
    m_gcp_save.m_master.m_new_gci = saveGCI;
  }
  /* Merge the sender's GCP-save state: any node still in the save
   * round pulls the master state out of IDLE. */
  switch(saveState){
  case MasterGCPConf::GCP_SAVE_IDLE:
    jam();
    break;
  case MasterGCPConf::GCP_SAVE_REQ:
    jam();
    if (m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_IDLE)
    {
      jam();
      m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_REQ;
    }
    break;
  case MasterGCPConf::GCP_SAVE_CONF:
    jam();
    if (m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_IDLE)
    {
      jam();
      m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_REQ;
    }
    break;
  case MasterGCPConf::GCP_SAVE_COPY_GCI:
    jam();
    if (m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_IDLE)
    {
      jam();
      m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_COPY_GCI;
    }
    break;
#ifndef VM_TRACE
  default:
    jamLine(saveState);
    ndbrequire(false);
#endif
  }
  //ndbassert(ok); // Unhandled case

  receiveLoopMacro(MASTER_GCPREQ, senderNodePtr.i);
  /*-------------------------------------------------------------------------*/
  // We have now received all responses and are ready to take over the GCP
  // protocol as master.
  /*-------------------------------------------------------------------------*/
  MASTER_GCPhandling(signal, failedNodeId);

  return;
}//Dbdih::execMASTER_GCPCONF()
10279 
/**
 * A node refused (or was faked to refuse, see checkGcpOutstanding) the
 * MASTER_GCPREQ.  Just account for its reply; when all replies are in,
 * resume the GCP protocols via MASTER_GCPhandling().
 */
void Dbdih::execMASTER_GCPREF(Signal* signal)
{
  const MasterGCPRef * const ref = (MasterGCPRef *)&signal->theData[0];
  jamEntry();
  receiveLoopMacro(MASTER_GCPREQ, ref->senderNodeId);
  /*-------------------------------------------------------------------------*/
  // We have now received all responses and are ready to take over the GCP
  // protocol as master.
  /*-------------------------------------------------------------------------*/
  MASTER_GCPhandling(signal, ref->failedNodeId);
}//Dbdih::execMASTER_GCPREF()
10291 
/**
 * All MASTER_GCPCONF/REF replies are in: become active master and
 * restart the micro-GCP and GCP-save protocols from the merged state,
 * then complete the NF_GCP_TAKE_OVER node-failure step and restart the
 * GCP monitor.
 */
void Dbdih::MASTER_GCPhandling(Signal* signal, Uint32 failedNodeId)
{
  cmasterState = MASTER_ACTIVE;

  /* Restart lag measurement from scratch under the new master. */
  NdbTick_Invalidate(&m_micro_gcp.m_master.m_start_time);
  NdbTick_Invalidate(&m_gcp_save.m_master.m_start_time);
  if (m_gcp_monitor.m_micro_gcp.m_max_lag_ms > 0)
  {
    infoEvent("GCP Monitor: Computed max GCP_SAVE lag to %u seconds",
              m_gcp_monitor.m_gcp_save.m_max_lag_ms / 1000);
    infoEvent("GCP Monitor: Computed max GCP_COMMIT lag to %u seconds",
              m_gcp_monitor.m_micro_gcp.m_max_lag_ms / 1000);
  }
  else
  {
    infoEvent("GCP Monitor: unlimited lags allowed");
  }

  /* Resume the micro-GCP protocol from the merged state. */
  bool ok = false;
  switch(m_micro_gcp.m_master.m_state){
  case MicroGcp::M_GCP_IDLE:
    jam();
    ok = true;
    /* Nothing in flight: simply start the next GCP. */
    signal->theData[0] = DihContinueB::ZSTART_GCP;
    sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
    break;
  case MicroGcp::M_GCP_PREPARE:
  {
    jam();
    ok = true;

    /**
     * Restart GCP_PREPARE
     */
    sendLoopMacro(GCP_PREPARE, sendGCP_PREPARE, RNIL);
    break;
  }
  case MicroGcp::M_GCP_COMMIT:
  {
    jam();
    ok = true;

    /**
     * Restart GCP_COMMIT
     */
    sendLoopMacro(GCP_COMMIT, sendGCP_COMMIT, RNIL);
    break;
  }
  case MicroGcp::M_GCP_COMMITTED:
    jam();
    /* Never reported by execMASTER_GCPCONF (mapped to M_GCP_COMMIT). */
    ndbrequire(false);
  case MicroGcp::M_GCP_COMPLETE:
    jam();
    /* Master-only state, impossible after a merge from slaves. */
    ndbrequire(false);
#ifndef VM_TRACE
  default:
    jamLine(m_micro_gcp.m_master.m_state);
    ndbrequire(false);
#endif
  }
  ndbassert(ok);

  /* Resume the GCP-save protocol (only when micro-GCP is enabled). */
  if (m_micro_gcp.m_enabled == false)
  {
    jam();
    m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_IDLE;
  }
  else
  {
    ok = false;
    switch(m_gcp_save.m_master.m_state){
    case GcpSave::GCP_SAVE_IDLE:
      jam();
      ok = true;
      break;
    case GcpSave::GCP_SAVE_REQ:
    {
      jam();
      ok = true;

      /**
       * Restart GCP_SAVE_REQ
       */
      sendLoopMacro(GCP_SAVEREQ, sendGCP_SAVEREQ, RNIL);
      break;
    }
    case GcpSave::GCP_SAVE_CONF:
      jam();
      /* Intentional fallthrough: SAVE_CONF restarts from COPY_GCI too. */
    case GcpSave::GCP_SAVE_COPY_GCI:
      jam();
      ok = true;
      copyGciLab(signal, CopyGCIReq::GLOBAL_CHECKPOINT);
      m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_COPY_GCI;
      break;
#ifndef VM_TRACE
    default:
      jamLine(m_gcp_save.m_master.m_state);
      ndbrequire(false);
#endif
    }
    ndbrequire(ok);
  }

  /* NOTE(review): theData[1]/[2] are filled in below but the signal is
   * sent with length 1, so the two state words are not transmitted —
   * confirm whether the length should be 3. */
  signal->theData[0] = NDB_LE_GCP_TakeoverCompleted;
  signal->theData[1] = m_micro_gcp.m_master.m_state;
  signal->theData[2] = m_gcp_save.m_master.m_state;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 1, JBB);

  infoEvent("kk: %u/%u %u %u",
            Uint32(m_micro_gcp.m_current_gci >> 32),
            Uint32(m_micro_gcp.m_current_gci),
            m_micro_gcp.m_master.m_state,
            m_gcp_save.m_master.m_state);

  /*--------------------------------------------------*/
  /*       WE SEPARATE HANDLING OF GLOBAL CHECKPOINTS */
  /*       AND LOCAL CHECKPOINTS HERE. LCP'S HAVE TO  */
  /*       REMOVE ALL FAILED FRAGMENTS BEFORE WE CAN  */
  /*       HANDLE THE LCP PROTOCOL.                   */
  /*--------------------------------------------------*/
  checkLocalNodefailComplete(signal, failedNodeId, NF_GCP_TAKE_OVER);

  startGcpMonitor(signal);

  return;
}//Dbdih::MASTER_GCPhandling()
10418 
10419 void
handle_send_continueb_invalidate_node_lcp(Signal * signal)10420 Dbdih::handle_send_continueb_invalidate_node_lcp(Signal *signal)
10421 {
10422   if (ERROR_INSERTED(7204))
10423   {
10424     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 2000, 3);
10425   }
10426   else if (ERROR_INSERTED(7245))
10427   {
10428     if (isMaster())
10429     {
10430       sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 2000, 3);
10431     }
10432     else
10433     {
10434       sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 3000, 3);
10435     }
10436   }
10437   else if (ERROR_INSERTED(7246))
10438   {
10439     /**
10440      * This error injection supports a special test case where we
10441      * delay node 1 and 2 more than other nodes to ensure that we
10442      * get some nodes that reply with START_INFOCONF and some that
10443      * reply with START_INFOREF to get the code tested for the case
10444      * some nodes reply with START_INFOREF and some with START_INFOCONF.
10445      */
10446     if (isMaster())
10447     {
10448       sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 2000, 3);
10449     }
10450     else if (cownNodeId == Uint32(1) ||
10451              (refToNode(cmasterdihref) == Uint32(1) &&
10452               cownNodeId == Uint32(2)))
10453     {
10454       sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 5000, 3);
10455     }
10456     else
10457     {
10458       sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 8000, 3);
10459     }
10460   }
10461   else
10462   {
10463     sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
10464   }
10465 }
10466 
/**
 * Invalidate all LCP information for nodeId, scanning tables starting
 * at tableId.  Processes at most RT_BREAK table slots per invocation,
 * then re-schedules itself via CONTINUEB (real-time break).  When an
 * active table is found, control transfers to the per-table overload.
 * When the whole table file has been scanned, the node is allowed to
 * start and (if it is STARTING) START_INFOCONF is sent to the master.
 */
void
Dbdih::invalidateNodeLCP(Signal* signal, Uint32 nodeId, Uint32 tableId)
{
  jamEntry();
  TabRecordPtr tabPtr;
  tabPtr.i = tableId;
  const Uint32 RT_BREAK = 64;
  if (ERROR_INSERTED(7125)) {
    /* Test hook: abandon the invalidation scan. */
    return;
  }//if
  for (Uint32 i = 0; i<RT_BREAK; i++) {
    jam();
    if (tabPtr.i >= ctabFileSize){
      jam();
      /**
       * Ready with entire loop
       * Return to master
       */
      if (ERROR_INSERTED(7204) ||
          ERROR_INSERTED(7245) ||
          ERROR_INSERTED(7246))
      {
        /* The delay-injection hooks are one-shot: clear them once the
         * scan completes. */
        CLEAR_ERROR_INSERT_VALUE;
      }
      setAllowNodeStart(nodeId, true);
      g_eventLogger->info("Completed invalidation of node %u", nodeId);
      if (getNodeStatus(nodeId) == NodeRecord::STARTING) {
        jam();
        if (!isMaster())
        {
          jam();
          /* Non-master nodes track the starting node's recovery phase. */
          setNodeRecoveryStatus(nodeId, NodeRecord::NODE_GETTING_PERMIT);
        }
        StartInfoConf * conf = (StartInfoConf*)&signal->theData[0];
        conf->sendingNodeId = cownNodeId;
        conf->startingNodeId = nodeId;
        sendSignal(cmasterdihref, GSN_START_INFOCONF, signal,
                   StartInfoConf::SignalLength, JBB);
      }//if
      return;
    }//if
    ptrAss(tabPtr, tabRecord);
    if (tabPtr.p->tabStatus == TabRecord::TS_ACTIVE) {
      jam();
      /* Active table found: handle it in the per-table overload, which
       * continues the scan afterwards. */
      invalidateNodeLCP(signal, nodeId, tabPtr);
      return;
    }//if
    tabPtr.i++;
  }//for
  /* RT break: continue the scan from tabPtr.i in a new signal. */
  signal->theData[0] = DihContinueB::ZINVALIDATE_NODE_LCP;
  signal->theData[1] = nodeId;
  signal->theData[2] = tabPtr.i;
  sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
}//Dbdih::invalidateNodeLCP()
10521 
void
Dbdih::invalidateNodeLCP(Signal* signal, Uint32 nodeId, TabRecordPtr tabPtr)
{
  /**
   * Invalidate all stored LCP information for nodeId in this table's
   * old stored replicas.  If anything was modified, the table
   * description is flushed to disk before moving on to the next table.
   *
   * Check so that no one else is using the tab descriptor
   */
  if (tabPtr.p->tabCopyStatus != TabRecord::CS_IDLE) {
    jam();
    // Table record busy: retry the same table after a short delay.
    signal->theData[0] = DihContinueB::ZINVALIDATE_NODE_LCP;
    signal->theData[1] = nodeId;
    signal->theData[2] = tabPtr.i;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                        WaitTableStateChangeMillis, 3);
    return;
  }//if

  /**
   * For each fragment
   */
  bool modified = false;
  FragmentstorePtr fragPtr;
  for(Uint32 fragNo = 0; fragNo < tabPtr.p->totalfragments; fragNo++){
    jam();
    getFragstore(tabPtr.p, fragNo, fragPtr);
    /**
     * For each of replica record
     */
    ReplicaRecordPtr replicaPtr;
    for(replicaPtr.i = fragPtr.p->oldStoredReplicas; replicaPtr.i != RNIL;
        replicaPtr.i = replicaPtr.p->nextPool) {
      jam();
      c_replicaRecordPool.getPtr(replicaPtr);
      if(replicaPtr.p->procNode == nodeId){
        jam();
        /**
         * Found one with correct node id
         */
        /**
         * Invalidate all LCP's
         */
        modified = true;
        for(int i = 0; i < MAX_LCP_STORED; i++) {
          replicaPtr.p->lcpStatus[i] = ZINVALID;
        }//for
        /**
         * And reset nextLcp
         */
        replicaPtr.p->nextLcp = 0;
        replicaPtr.p->noCrashedReplicas = 0;
      }//if
    }//for
  }//for

  if (modified) {
    jam();
    /**
     * Save table description to disk
     */
    // The US_INVALIDATE_NODE_LCP update state causes the scan to resume
    // with the next table once the table file write completes.
    tabPtr.p->tabCopyStatus  = TabRecord::CS_INVALIDATE_NODE_LCP;
    tabPtr.p->tabUpdateState = TabRecord::US_INVALIDATE_NODE_LCP;
    tabPtr.p->tabRemoveNode  = nodeId;
    signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
    signal->theData[1] = tabPtr.i;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    return;
  }

  jam();
  /**
   * Move to next table
   */
  tabPtr.i++;
  signal->theData[0] = DihContinueB::ZINVALIDATE_NODE_LCP;
  signal->theData[1] = nodeId;
  signal->theData[2] = tabPtr.i;

  // Helper decides how the CONTINUEB is actually sent — presumably it can
  // delay it under error insertion; confirm against its definition.
  handle_send_continueb_invalidate_node_lcp(signal);

  return;
}//Dbdih::invalidateNodeLCP()
10602 
/*------------------------------------------------*/
/*       Remove a failed node from all tables.    */
/*       INPUT:  nodeId  - the failed node        */
/*               tableId - first table to scan    */
/*------------------------------------------------*/
void Dbdih::removeNodeFromTables(Signal* signal,
				 Uint32 nodeId, Uint32 tableId)
{
  /**
   * Scan table records from tableId and remove nodeId from the first
   * active table found; the per-table handler re-enters this scan.
   * Resumed via CONTINUEB after an RT break of 64 slots.
   */
  jamEntry();
  TabRecordPtr tabPtr;
  tabPtr.i = tableId;
  const Uint32 RT_BREAK = 64;
  for (Uint32 i = 0; i<RT_BREAK; i++) {
    jam();
    if (tabPtr.i >= ctabFileSize){
      jam();
      if (ERROR_INSERTED(7233))
      {
        CLEAR_ERROR_INSERT_VALUE;
      }

      // All tables scanned: conclude node-fail handling for this step.
      removeNodeFromTablesComplete(signal, nodeId);
      return;
    }//if

    ptrAss(tabPtr, tabRecord);
    if (tabPtr.p->tabStatus == TabRecord::TS_ACTIVE) {
      jam();
      removeNodeFromTable(signal, nodeId, tabPtr);
      return;
    }//if
    tabPtr.i++;
  }//for
  // RT break reached: continue from tabPtr.i (error insert 7233 slows
  // the scan down to widen race windows in testing).
  signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
  signal->theData[1] = nodeId;
  signal->theData[2] = tabPtr.i;
  if (!ERROR_INSERTED(7233))
    sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
  else
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 300, 3);
}
10643 
void Dbdih::removeNodeFromTable(Signal* signal,
				Uint32 nodeId, TabRecordPtr tabPtr){

  /**
   * Remove nodeId's stored replicas from every fragment of this table,
   * maintain the table's LCP bookkeeping, and schedule a write of the
   * updated table description to disk when replicas were removed.
   *
   * Check so that no one else is using the tab descriptor
   */
  if (tabPtr.p->tabCopyStatus != TabRecord::CS_IDLE) {
    jam();
    // Table record busy: retry the same table after a short delay.
    signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
    signal->theData[1] = nodeId;
    signal->theData[2] = tabPtr.i;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                        WaitTableStateChangeMillis, 3);
    return;
  }//if

  NodeRecordPtr nodePtr;
  nodePtr.i = nodeId;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
  // LCP id used below to invalidate the node's most recent checkpoint;
  // RNIL means no such invalidation is requested.
  const Uint32 lcpId = nodePtr.p->m_remove_node_from_table_lcp_id;

  /**
   * For each fragment
   */
  Uint32 noOfRemovedReplicas = 0;     // No of replicas removed
  Uint32 noOfRemovedLcpReplicas = 0;  // No of replicas in LCP removed
  Uint32 noOfRemainingLcpReplicas = 0;// No of replicas in LCP remaining

  const bool lcpOngoingFlag = (tabPtr.p->tabLcpStatus== TabRecord::TLS_ACTIVE);
  // Unlogged tables (non-ST_NORMAL storage) are handled differently by
  // removeNodeFromStored.
  const bool unlogged = (tabPtr.p->tabStorage != TabRecord::ST_NORMAL);

  FragmentstorePtr fragPtr;
  for(Uint32 fragNo = 0; fragNo < tabPtr.p->totalfragments; fragNo++){
    jam();
    getFragstore(tabPtr.p, fragNo, fragPtr);

    /**
     * For each of replica record
     */
    // NOTE(review): 'found' is set below but never read afterwards.
    bool found = false;
    ReplicaRecordPtr replicaPtr;
    for(replicaPtr.i = fragPtr.p->storedReplicas; replicaPtr.i != RNIL;
        replicaPtr.i = replicaPtr.p->nextPool) {
      jam();

      c_replicaRecordPool.getPtr(replicaPtr);
      if(replicaPtr.p->procNode == nodeId){
        jam();
	found = true;
	noOfRemovedReplicas++;
	removeNodeFromStored(nodeId, fragPtr, replicaPtr, unlogged);
	if(replicaPtr.p->lcpOngoingFlag){
	  jam();
	  /**
	   * This replica is currently LCP:ed
	   */
	  ndbrequire(fragPtr.p->noLcpReplicas > 0);
	  fragPtr.p->noLcpReplicas --;

	  noOfRemovedLcpReplicas ++;
	  replicaPtr.p->lcpOngoingFlag = false;
	}

        if (lcpId != RNIL)
        {
          jam();
          // Invalidate the replica's latest LCP if it matches the
          // requested lcpId, and rewind nextLcp to reuse that slot.
          Uint32 lcpNo = prevLcpNo(replicaPtr.p->nextLcp);
          if (replicaPtr.p->lcpStatus[lcpNo] == ZVALID &&
              replicaPtr.p->lcpId[lcpNo] == lcpId)
          {
            jam();
            replicaPtr.p->lcpStatus[lcpNo] = ZINVALID;
            replicaPtr.p->lcpId[lcpNo] = 0;
            replicaPtr.p->nextLcp = lcpNo;
            g_eventLogger->debug("REMOVING lcp: %u from table: %u frag:"
                                 " %u node: %u",
                                 SYSFILE->latestLCP_ID,
                                 tabPtr.i,
                                 fragNo,
                                 nodeId);
          }
        }
      }
    }

    /**
     * Run updateNodeInfo to remove any dead nodes from list of activeNodes
     *  see bug#15587
     */
    updateNodeInfo(fragPtr);
    noOfRemainingLcpReplicas += fragPtr.p->noLcpReplicas;
  }

  if (noOfRemovedReplicas == 0)
  {
    jam();
    /**
     * The table had no replica on the failed node
     *   continue with next table
     */
    tabPtr.i++;
    signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
    signal->theData[1] = nodeId;
    signal->theData[2] = tabPtr.i;
    if (!ERROR_INSERTED(7233))
      sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
    else
      sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 300, 3);
    return;
  }

  /**
   * We did remove at least one replica
   */
  bool ok = false;
  switch(tabPtr.p->tabLcpStatus){
  case TabRecord::TLS_COMPLETED:
    ok = true;
    jam();
    /**
     * WE WILL WRITE THE TABLE DESCRIPTION TO DISK AT THIS TIME
     * INDEPENDENT OF WHAT THE LOCAL CHECKPOINT NEEDED.
     * THIS IS TO ENSURE THAT THE FAILED NODES ARE ALSO UPDATED ON DISK
     * IN THE DIH DATA STRUCTURES BEFORE WE COMPLETE HANDLING OF THE
     * NODE FAILURE.
     */
    ndbrequire(noOfRemovedLcpReplicas == 0);

    tabPtr.p->tabCopyStatus = TabRecord::CS_REMOVE_NODE;
    tabPtr.p->tabUpdateState = TabRecord::US_REMOVE_NODE;
    tabPtr.p->tabRemoveNode = nodeId;
    signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
    signal->theData[1] = tabPtr.i;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    return;
    break;
  case TabRecord::TLS_ACTIVE:
    ok = true;
    jam();
    /**
     * The table is participating in an LCP currently
     */
    // Fall through
    break;
  case TabRecord::TLS_WRITING_TO_FILE:
    ok = true;
    jam();
    /**
     * This should never happen since we in the beginning of this function
     * checks the tabCopyStatus
     */
    ndbrequire(lcpOngoingFlag);
    ndbrequire(false);
    break;
  }
  ndbrequire(ok);

  /**
   * The table is participating in an LCP currently
   *   and we removed some replicas that should have been checkpointed
   */
  ndbrequire(tabPtr.p->tabLcpStatus == TabRecord::TLS_ACTIVE);

  // Persist the updated table description; US_REMOVE_NODE resumes the
  // table scan when the write is finished.
  tabPtr.p->tabCopyStatus = TabRecord::CS_REMOVE_NODE;
  tabPtr.p->tabUpdateState = TabRecord::US_REMOVE_NODE;
  tabPtr.p->tabRemoveNode = nodeId;
  signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
  signal->theData[1] = tabPtr.i;
  sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);

  if (noOfRemainingLcpReplicas == 0)
  {
    jam();
    /**
     * Check if the removal on the failed node made the LCP complete
     */
    tabPtr.p->tabLcpStatus = TabRecord::TLS_WRITING_TO_FILE;
    checkLcpAllTablesDoneInLqh(__LINE__);
  }
}
10824 
void
Dbdih::removeNodeFromTablesComplete(Signal* signal, Uint32 nodeId){
  // All tables have been scanned for the failed node; run the two
  // follow-up checks in order (LCP completion first, then the
  // node-failure step accounting).
  jam();

  /**
   * Check if we "accidently" completed a LCP
   */
  checkLcpCompletedLab(signal);

  /**
   * Check if we (DIH) are finished with node fail handling
   */
  checkLocalNodefailComplete(signal, nodeId, NF_REMOVE_NODE_FROM_TABLE);
}
10839 
void
Dbdih::checkLocalNodefailComplete(Signal* signal, Uint32 failedNodeId,
				  NodefailHandlingStep step){
  /**
   * Mark one node-failure handling step as done for failedNodeId.
   * When the last outstanding step is cleared, broadcast
   * NF_COMPLETEREP on behalf of DBDIH.
   */
  jam();

  NodeRecordPtr nodePtr;
  nodePtr.i = failedNodeId;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);

  // The step must have been registered via setLocalNodefailHandling.
  ndbrequire(nodePtr.p->m_nodefailSteps.get(step));
  nodePtr.p->m_nodefailSteps.clear(step);

  if(nodePtr.p->m_nodefailSteps.count() > 0){
    jam();
    // Other steps still outstanding: nothing more to do yet.
    return;
  }

  if (ERROR_INSERTED(7030))
  {
    g_eventLogger->info("Reenable GCP_PREPARE");
    CLEAR_ERROR_INSERT_VALUE;
  }

  NFCompleteRep * const nf = (NFCompleteRep *)&signal->theData[0];
  nf->blockNo = DBDIH;
  nf->nodeId = cownNodeId;
  nf->failedNodeId = failedNodeId;
  nf->from = __LINE__;
  sendSignal(reference(), GSN_NF_COMPLETEREP, signal,
             NFCompleteRep::SignalLength, JBB);
}
10871 
10872 
10873 void
setLocalNodefailHandling(Signal * signal,Uint32 failedNodeId,NodefailHandlingStep step)10874 Dbdih::setLocalNodefailHandling(Signal* signal, Uint32 failedNodeId,
10875 				NodefailHandlingStep step){
10876   jam();
10877 
10878   NodeRecordPtr nodePtr;
10879   nodePtr.i = failedNodeId;
10880   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
10881 
10882   ndbrequire(!nodePtr.p->m_nodefailSteps.get(step));
10883   nodePtr.p->m_nodefailSteps.set(step);
10884 }
10885 
void Dbdih::startLcpTakeOverLab(Signal* signal, Uint32 failedNodeId)
{
  /*--------------------------------------------------------------------*/
  // Start LCP master take over process. Consists of the following steps.
  // 1) Ensure that all LQH's have reported all fragments they have been
  // told to checkpoint. Can be a fairly long step time-wise.
  // 2) Query all nodes about their LCP status.
  // During the query process we do not want our own state to change.
  // This can change due to delayed reception of LCP_REPORT, completed
  // save of table on disk or reception of DIH_LCPCOMPLETE from other
  // node.
  /*--------------------------------------------------------------------*/
  // NOTE(review): the body is intentionally empty here; the take-over
  // appears to be driven from checkEmptyLcpComplete()/execMASTER_LCPCONF()
  // instead — confirm against the callers of this label.
}//Dbdih::startLcpTakeOver()
10899 
void
Dbdih::execEMPTY_LCP_REP(Signal* signal)
{
  /**
   * Relay handler: the signal carries an EmptyLcpRep header followed by
   * an embedded EMPTY_LCP_CONF payload.  Strip the header and forward
   * the payload to the DIH instances named in the receiver group.
   */
  jamEntry();
  EmptyLcpRep* rep = (EmptyLcpRep*)signal->getDataPtr();

  Uint32 len = signal->getLength();
  ndbrequire(len > EmptyLcpRep::SignalLength);
  // len is now the embedded payload length in words.
  len -= EmptyLcpRep::SignalLength;

  NdbNodeBitmask nodes;
  nodes.assign(NdbNodeBitmask::Size, rep->receiverGroup);
  NodeReceiverGroup rg (DBDIH, nodes);
  // memmove, not memcpy: source and destination overlap inside the same
  // signal buffer (4*len converts words to bytes).
  memmove(signal->getDataPtrSend(),
          signal->getDataPtr()+EmptyLcpRep::SignalLength, 4*len);

  sendSignal(rg, GSN_EMPTY_LCP_CONF, signal, len, JBB);
}
10918 
void Dbdih::execEMPTY_LCP_CONF(Signal* signal)
{
  /**
   * One participant has answered the EMPTY_LCP_REQ sent during LCP
   * master take over.  Track the lowest (table, fragment) any node is
   * still checkpointing; when all participants have answered, advance
   * the take-over state machine.
   */
  jamEntry();

  ndbrequire(c_lcpMasterTakeOverState.state == LMTOS_WAIT_EMPTY_LCP);

  const EmptyLcpConf * const conf = (EmptyLcpConf *)&signal->theData[0];
  Uint32 nodeId = conf->senderNodeId;

  CRASH_INSERTION(7206);


  if(!conf->idle){
    jam();
    // Sender still has LCP work outstanding: keep the minimum of the
    // reported (tableId, fragmentId) pairs across all senders.
    if (conf->tableId < c_lcpMasterTakeOverState.minTableId) {
      jam();
      c_lcpMasterTakeOverState.minTableId = conf->tableId;
      c_lcpMasterTakeOverState.minFragId = conf->fragmentId;
    } else if (conf->tableId == c_lcpMasterTakeOverState.minTableId &&
	       conf->fragmentId < c_lcpMasterTakeOverState.minFragId) {
      jam();
      c_lcpMasterTakeOverState.minFragId = conf->fragmentId;
    }//if
    if(isMaster()){
      jam();
      // As new master we must later send the final LCP_FRAG_ORD to this
      // node before the LCP can complete.
      c_lcpState.m_LAST_LCP_FRAG_ORD.setWaitingFor(nodeId);
    }
  }

  // Falls through only when every EMPTY_LCP_CONF has arrived.
  receiveLoopMacro(EMPTY_LCP_REQ, nodeId);
  /*--------------------------------------------------------------------*/
  // Received all EMPTY_LCPCONF. We can continue with next phase of the
  // take over LCP master process.
  /*--------------------------------------------------------------------*/
  c_lcpMasterTakeOverState.set(LMTOS_WAIT_LCP_FRAG_REP, __LINE__);
  checkEmptyLcpComplete(signal);
  return;
}//Dbdih::execEMPTY_LCPCONF()
10957 
void
Dbdih::checkEmptyLcpComplete(Signal *signal)
{
  /**
   * Second phase of LCP master take over: once no LCP_FRAG_REPs are
   * outstanding (or the legacy EMPTY_LCP protocol is not in use), the
   * new master queries all nodes with MASTER_LCPREQ; a non-master
   * participant answers with MASTER_LCPCONF instead.
   */

  ndbrequire(c_lcpMasterTakeOverState.state == LMTOS_WAIT_LCP_FRAG_REP);

  if(c_lcpState.noOfLcpFragRepOutstanding > 0 &&
     c_lcpMasterTakeOverState.use_empty_lcp)
  {
    jam();
    /**
     * In the EMPTY_LCP_REQ we need to ensure that we have received
     * LCP_FRAG_REP for all outstanding LCP_FRAG_ORDs. So we need to wait
     * here for all to complete before we are ready to move on.
     *
     * This is not needed when LQH can remove duplicate LCP_FRAG_ORDs, so
     * we can proceed with the master takeover immediately.
     */
    return;
  }

  if(isMaster()){
    jam();

    // Report the take over in the event log.
    signal->theData[0] = NDB_LE_LCP_TakeoverStarted;
    sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 1, JBB);

    // Dump state 7012 for diagnostics.
    signal->theData[0] = 7012;
    execDUMP_STATE_ORD(signal);

    if (ERROR_INSERTED(7194))
    {
      // Error insert: restart node-removal scanning concurrently with
      // the take over to provoke races.
      ndbout_c("7194 starting ZREMOVE_NODE_FROM_TABLE");
      signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
      signal->theData[1] = c_lcpMasterTakeOverState.failedNodeId;
      signal->theData[2] = 0; // Tab id
      sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
    }

    c_lcpMasterTakeOverState.set(LMTOS_INITIAL, __LINE__);
    MasterLCPReq * const req = (MasterLCPReq *)&signal->theData[0];
    req->masterRef = reference();
    req->failedNodeId = c_lcpMasterTakeOverState.failedNodeId;
    sendLoopMacro(MASTER_LCPREQ, sendMASTER_LCPREQ, RNIL);

  }
  else
  {
    jam();
    sendMASTER_LCPCONF(signal, __LINE__);
  }
}
11010 
11011 /*--------------------------------------------------*/
11012 /*       THE MASTER HAS FAILED AND THE NEW MASTER IS*/
11013 /*       QUERYING THIS NODE ABOUT THE STATE OF THE  */
11014 /*       LOCAL CHECKPOINT PROTOCOL.                 */
11015 /*--------------------------------------------------*/
void Dbdih::execMASTER_LCPREQ(Signal* signal)
{
  /**
   * The new master (after the old master failed) asks this node for its
   * local LCP protocol state.  The request is delayed/re-sent until this
   * node has caught up with the master change and the COPY_GCI handling,
   * then answered via sendMASTER_LCPCONF.
   */
  NodeRecordPtr newMasterNodePtr;
  const MasterLCPReq * const req = (MasterLCPReq *)&signal->theData[0];
  jamEntry();
  const BlockReference newMasterBlockref = req->masterRef;

  newMasterNodePtr.i = refToNode(newMasterBlockref);
  ptrCheckGuard(newMasterNodePtr, MAX_NDB_NODES, nodeRecord);

  if (newMasterNodePtr.p->nodeStatus != NodeRecord::ALIVE)
  {
    /**
     * We delayed the MASTER_LCPREQ signal and now it arrived after
     * the new master already died. We ignore this signal.
     */
    jam();
    return;
  }

  CRASH_INSERTION(7205);

  if (ERROR_INSERTED(7207))
  {
    jam();
    // Error insert: delay our answer once, then arm 7208.
    SET_ERROR_INSERT_VALUE(7208);
    sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal,
			500, signal->getLength());
    return;
  }

  if (ERROR_INSERTED(7208))
  {
    jam();
    // Error insert: crash the new master via NDB_TAMPER.
    signal->theData[0] = 9999;
    sendSignal(numberToRef(CMVMI, refToNode(newMasterBlockref)),
               GSN_NDB_TAMPER, signal, 1, JBB);
  }

  if (ERROR_INSERTED(7231))
  {
    CLEAR_ERROR_INSERT_VALUE;
    sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal,
			1500, signal->getLength());
    return;
  }

  if (newMasterBlockref != cmasterdihref)
  {
    /**
     * We haven't processed the NODE_FAILREP signal causing the new master
     * to be selected as the new master by this node.
     */
    jam();
    ndbout_c("resending GSN_MASTER_LCPREQ");
    sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal,
			50, signal->getLength());
    return;
  }

  // NOTE(review): refToNode() is applied to newMasterNodePtr.i, which is
  // already a node id rather than a block reference.  This looks benign
  // (node ids pass through unchanged) but verify the intent.
  if (c_handled_master_take_over_copy_gci != refToNode(newMasterNodePtr.i))
  {
    /**
     * We need to ensure that MASTER_GCPREQ has ensured that the COPY_GCIREQ
     * activity started by old master has been completed before we proceed
     * with handling the take over of the LCP protocol.
     */
    jam();
    sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal,
                        10, signal->getLength());
    return;
  }
  c_handled_master_take_over_copy_gci = 0;

  Uint32 failedNodeId = req->failedNodeId;

  /**
   * There can be no take over with the same master
   */
  ndbrequire(c_lcpState.m_masterLcpDihRef != newMasterBlockref);
  c_lcpState.m_masterLcpDihRef = newMasterBlockref;
  c_lcpState.m_MASTER_LCPREQ_Received = true;
  c_lcpState.m_MASTER_LCPREQ_FailedNodeId = failedNodeId;

  if(newMasterBlockref != cmasterdihref){
    jam();
    // Unreachable: the same condition was already handled above.
    ndbrequire(0);
  }

  if (c_lcpState.lcpStatus == LCP_INIT_TABLES)
  {
    jam();
    // Abort a half-started LCP: fall back to idle before answering.
    c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
  }
  sendMASTER_LCPCONF(signal, __LINE__);
}//Dbdih::execMASTER_LCPREQ()
11112 
void
Dbdih::sendMASTER_LCPCONF(Signal * signal, Uint32 from)
{
  /**
   * Answer an earlier MASTER_LCPREQ with this node's LCP state.
   * May be called from several places ('from' is the caller's __LINE__,
   * used only for tracing); it silently returns until all preconditions
   * hold, and resets m_MASTER_LCPREQ_Received once the answer is sent.
   */
  if (!c_lcpState.m_MASTER_LCPREQ_Received)
  {
    jam();
    /**
     * Has not received MASTER_LCPREQ yet
     */
    return;
  }

#if defined VM_TRACE || defined ERROR_INSERT
  bool info = true;
#else
  bool info = false;
#endif

  if (ERROR_INSERTED(7230))
  {
    // Error insert: crash ourselves (delayed) and answer immediately,
    // skipping the two readiness guards below via the goto.
    signal->theData[0] = 9999;
    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 100, 1);
    goto err7230;
  }

  if (!c_EMPTY_LCP_REQ_Counter.done())
  {
    /**
     * Have not received all EMPTY_LCP_REP
     * dare not answer MASTER_LCP_CONF yet
     */
    jam();
    if (info)
      infoEvent("from: %u : c_EMPTY_LCP_REQ_Counter.done() == false", from);
    return;
  }

  if (c_lcpState.lcpStatus == LCP_INIT_TABLES)
  {
    jam();
    /**
     * Still aborting old initLcpLab
     */
    if (info)
      infoEvent("from: %u : c_lcpState.lcpStatus == LCP_INIT_TABLES", from);
    return;
  }

err7230:
  if (info)
    infoEvent("from: %u : sendMASTER_LCPCONF", from);

  if (c_lcpState.lcpStatus == LCP_COPY_GCI)
  {
    jam();
    /**
     * Restart it
     */
    // Undo the LCP id increment and the on-disk "LCP ongoing" mark so the
    // new master can restart the checkpoint cleanly.
    //Uint32 lcpId = SYSFILE->latestLCP_ID;
    SYSFILE->latestLCP_ID--;
    Sysfile::clearLCPOngoing(SYSFILE->systemRestartBits);
    c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
#if 0
    if(c_copyGCISlave.m_copyReason == CopyGCIReq::LOCAL_CHECKPOINT){
      g_eventLogger->info("Dbdih: Also resetting c_copyGCISlave");
      c_copyGCISlave.m_copyReason = CopyGCIReq::IDLE;
      c_copyGCISlave.m_expectedNextWord = 0;
    }
#endif
  }

  // Map our internal lcpStatus to the MasterLCPConf::State reported to
  // the new master.
  MasterLCPConf::State lcpState;
  switch (c_lcpState.lcpStatus) {
  case LCP_STATUS_IDLE:
    jam();
    /*------------------------------------------------*/
    /*       LOCAL CHECKPOINT IS CURRENTLY NOT ACTIVE */
    /*       SINCE NO COPY OF RESTART INFORMATION HAVE*/
    /*       BEEN RECEIVED YET. ALSO THE PREVIOUS     */
    /*       CHECKPOINT HAVE BEEN FULLY COMPLETED.    */
    /*------------------------------------------------*/
    lcpState = MasterLCPConf::LCP_STATUS_IDLE;
    break;
  case LCP_STATUS_ACTIVE:
    jam();
    /*--------------------------------------------------*/
    /*       COPY OF RESTART INFORMATION HAS BEEN       */
    /*       PERFORMED AND ALSO RESPONSE HAVE BEEN SENT.*/
    /*--------------------------------------------------*/
    lcpState = MasterLCPConf::LCP_STATUS_ACTIVE;
    break;
  case LCP_TAB_COMPLETED:
    jam();
    /*--------------------------------------------------------*/
    /*       ALL LCP_REPORT'S HAVE BEEN COMPLETED FOR         */
    /*       ALL TABLES.     SAVE OF AT LEAST ONE TABLE IS    */
    /*       ONGOING YET.                                     */
    /*--------------------------------------------------------*/
    lcpState = MasterLCPConf::LCP_TAB_COMPLETED;
    break;
  case LCP_TAB_SAVED:
    jam();
    /*--------------------------------------------------------*/
    /*       ALL LCP_REPORT'S HAVE BEEN COMPLETED FOR         */
    /*       ALL TABLES.     ALL TABLES HAVE ALSO BEEN SAVED  */
    /*       ALL OTHER NODES ARE NOT YET FINISHED WITH        */
    /*       THE LOCAL CHECKPOINT.                            */
    /*--------------------------------------------------------*/
    lcpState = MasterLCPConf::LCP_TAB_SAVED;
    break;
  case LCP_TCGET:
  case LCP_CALCULATE_KEEP_GCI:
  case LCP_TC_CLOPSIZE:
  case LCP_WAIT_MUTEX:
  case LCP_START_LCP_ROUND:
    /**
     * These should only exists on the master
     *   but since this is master take over
     *   it not allowed
     */
    ndbrequire(false);
    lcpState= MasterLCPConf::LCP_STATUS_IDLE; // remove warning
    break;
  case LCP_COPY_GCI:
  case LCP_INIT_TABLES:
    /**
     * These two states are handled by if statements above
     */
    ndbrequire(false);
    lcpState= MasterLCPConf::LCP_STATUS_IDLE; // remove warning
    break;
  default:
    ndbrequire(false);
    lcpState= MasterLCPConf::LCP_STATUS_IDLE; // remove warning
  }//switch

  Uint32 failedNodeId = c_lcpState.m_MASTER_LCPREQ_FailedNodeId;
  MasterLCPConf * const conf = (MasterLCPConf *)&signal->theData[0];
  conf->senderNodeId = cownNodeId;
  conf->lcpState = lcpState;
  conf->failedNodeId = failedNodeId;
  sendSignal(c_lcpState.m_masterLcpDihRef, GSN_MASTER_LCPCONF,
             signal, MasterLCPConf::SignalLength, JBB);

  // Answer to MASTER_LCPREQ sent, reset flag so
  // that it's not sent again before another request comes in
  c_lcpState.m_MASTER_LCPREQ_Received = false;

  CRASH_INSERTION(7232);

  if (ERROR_INSERTED(7230))
  {
    return;
  }

  if(c_lcpState.lcpStatus == LCP_TAB_SAVED){
#ifdef VM_TRACE
    g_eventLogger->info("Sending extra GSN_LCP_COMPLETE_REP to new master");
#endif
    // The new master may not know we already finished; re-send the
    // completion report to be safe.
    sendLCP_COMPLETE_REP(signal);
  }

  if(!isMaster())
  {
    // A participant is now done with its part of the LCP take over.
    c_lcpMasterTakeOverState.set(LMTOS_IDLE, __LINE__);
    checkLocalNodefailComplete(signal, failedNodeId, NF_LCP_TAKE_OVER);
  }

  return;
}
11283 
11284 NdbOut&
operator <<(NdbOut & out,const Dbdih::LcpMasterTakeOverState state)11285 operator<<(NdbOut& out, const Dbdih::LcpMasterTakeOverState state){
11286   switch(state){
11287   case Dbdih::LMTOS_IDLE:
11288     out << "LMTOS_IDLE";
11289     break;
11290   case Dbdih::LMTOS_WAIT_EMPTY_LCP:
11291     out << "LMTOS_WAIT_EMPTY_LCP";
11292     break;
11293   case Dbdih::LMTOS_WAIT_LCP_FRAG_REP:
11294     out << "LMTOS_WAIT_EMPTY_LCP";
11295     break;
11296   case Dbdih::LMTOS_INITIAL:
11297     out << "LMTOS_INITIAL";
11298     break;
11299   case Dbdih::LMTOS_ALL_IDLE:
11300     out << "LMTOS_ALL_IDLE";
11301     break;
11302   case Dbdih::LMTOS_ALL_ACTIVE:
11303     out << "LMTOS_ALL_ACTIVE";
11304     break;
11305   case Dbdih::LMTOS_LCP_CONCLUDING:
11306     out << "LMTOS_LCP_CONCLUDING";
11307     break;
11308   case Dbdih::LMTOS_COPY_ONGOING:
11309     out << "LMTOS_COPY_ONGOING";
11310     break;
11311   }
11312   return out;
11313 }
11314 
/**
 * One row of the LCP master take-over transition table: given the new
 * master's current take-over state and one participant's reported state
 * (from MASTER_LCPCONF), the take-over moves to NewState.
 */
struct MASTERLCP_StateTransitions {
  Dbdih::LcpMasterTakeOverState CurrentState;  // take-over state before conf
  MasterLCPConf::State ParticipantState;       // state reported by participant
  Dbdih::LcpMasterTakeOverState NewState;      // resulting take-over state
};
11320 
/**
 * Transition table searched linearly (first match wins) by
 * execMASTER_LCPCONF.  A (CurrentState, ParticipantState) combination
 * absent from this table is illegal and trips ndbrequire(found) there.
 */
static const
MASTERLCP_StateTransitions g_masterLCPTakeoverStateTransitions[] = {
  /**
   * Current = LMTOS_INITIAL
   */
  { Dbdih::LMTOS_INITIAL,
    MasterLCPConf::LCP_STATUS_IDLE,
    Dbdih::LMTOS_ALL_IDLE },

  { Dbdih::LMTOS_INITIAL,
    MasterLCPConf::LCP_STATUS_ACTIVE,
    Dbdih::LMTOS_ALL_ACTIVE },

  { Dbdih::LMTOS_INITIAL,
    MasterLCPConf::LCP_TAB_COMPLETED,
    Dbdih::LMTOS_LCP_CONCLUDING },

  { Dbdih::LMTOS_INITIAL,
    MasterLCPConf::LCP_TAB_SAVED,
    Dbdih::LMTOS_LCP_CONCLUDING },

  /**
   * Current = LMTOS_ALL_IDLE
   */
  { Dbdih::LMTOS_ALL_IDLE,
    MasterLCPConf::LCP_STATUS_IDLE,
    Dbdih::LMTOS_ALL_IDLE },

  { Dbdih::LMTOS_ALL_IDLE,
    MasterLCPConf::LCP_STATUS_ACTIVE,
    Dbdih::LMTOS_COPY_ONGOING },

  { Dbdih::LMTOS_ALL_IDLE,
    MasterLCPConf::LCP_TAB_COMPLETED,
    Dbdih::LMTOS_LCP_CONCLUDING },

  { Dbdih::LMTOS_ALL_IDLE,
    MasterLCPConf::LCP_TAB_SAVED,
    Dbdih::LMTOS_LCP_CONCLUDING },

  /**
   * Current = LMTOS_COPY_ONGOING
   */
  { Dbdih::LMTOS_COPY_ONGOING,
    MasterLCPConf::LCP_STATUS_IDLE,
    Dbdih::LMTOS_COPY_ONGOING },

  { Dbdih::LMTOS_COPY_ONGOING,
    MasterLCPConf::LCP_STATUS_ACTIVE,
    Dbdih::LMTOS_COPY_ONGOING },

  /**
   * Current = LMTOS_ALL_ACTIVE
   */
  { Dbdih::LMTOS_ALL_ACTIVE,
    MasterLCPConf::LCP_STATUS_IDLE,
    Dbdih::LMTOS_COPY_ONGOING },

  { Dbdih::LMTOS_ALL_ACTIVE,
    MasterLCPConf::LCP_STATUS_ACTIVE,
    Dbdih::LMTOS_ALL_ACTIVE },

  { Dbdih::LMTOS_ALL_ACTIVE,
    MasterLCPConf::LCP_TAB_COMPLETED,
    Dbdih::LMTOS_LCP_CONCLUDING },

  { Dbdih::LMTOS_ALL_ACTIVE,
    MasterLCPConf::LCP_TAB_SAVED,
    Dbdih::LMTOS_LCP_CONCLUDING },

  /**
   * Current = LMTOS_LCP_CONCLUDING
   */
  { Dbdih::LMTOS_LCP_CONCLUDING,
    MasterLCPConf::LCP_STATUS_IDLE,
    Dbdih::LMTOS_LCP_CONCLUDING },

  { Dbdih::LMTOS_LCP_CONCLUDING,
    MasterLCPConf::LCP_STATUS_ACTIVE,
    Dbdih::LMTOS_LCP_CONCLUDING },

  { Dbdih::LMTOS_LCP_CONCLUDING,
    MasterLCPConf::LCP_TAB_COMPLETED,
    Dbdih::LMTOS_LCP_CONCLUDING },

  { Dbdih::LMTOS_LCP_CONCLUDING,
    MasterLCPConf::LCP_TAB_SAVED,
    Dbdih::LMTOS_LCP_CONCLUDING }
};
11410 
// Number of rows in the transition table above.
const Uint32 g_masterLCPTakeoverStateTransitionsRows =
sizeof(g_masterLCPTakeoverStateTransitions) / sizeof(struct MASTERLCP_StateTransitions);
11413 
void Dbdih::execMASTER_LCPCONF(Signal* signal)
{
  /**
   * New master receives one participant's answer to MASTER_LCPREQ.
   * Advance the take-over state machine via the transition table and,
   * once all participants have answered, run MASTER_LCPhandling to
   * resume the LCP protocol as the new master.
   */
  const MasterLCPConf * const conf = (MasterLCPConf *)&signal->theData[0];
  jamEntry();

  if (ERROR_INSERTED(7194))
  {
    ndbout_c("delaying MASTER_LCPCONF due to error 7194");
    sendSignalWithDelay(reference(), GSN_MASTER_LCPCONF, signal,
                        300, signal->getLength());
    return;
  }

  if (ERROR_INSERTED(7230) &&
      refToNode(signal->getSendersBlockRef()) != getOwnNodeId())
  {
    // Error insert: delay confs from remote nodes only.
    infoEvent("delaying MASTER_LCPCONF due to error 7230 (from %u)",
              refToNode(signal->getSendersBlockRef()));
    sendSignalWithDelay(reference(), GSN_MASTER_LCPCONF, signal,
                        300, signal->getLength());
    return;
  }

  Uint32 senderNodeId = conf->senderNodeId;
  MasterLCPConf::State lcpState = (MasterLCPConf::State)conf->lcpState;
  const Uint32 failedNodeId = conf->failedNodeId;
  NodeRecordPtr nodePtr;
  nodePtr.i = senderNodeId;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
  // Remember the sender's state for later use in the take-over handling.
  nodePtr.p->lcpStateAtTakeOver = lcpState;

  CRASH_INSERTION(7180);

#ifdef VM_TRACE
  g_eventLogger->info("MASTER_LCPCONF from node %u", senderNodeId);
  printMASTER_LCP_CONF(stdout, &signal->theData[0], 0, 0);
#endif

  // Linear first-match search of the transition table; an unlisted
  // (current, participant) combination is a protocol violation.
  bool found = false;
  for(Uint32 i = 0; i<g_masterLCPTakeoverStateTransitionsRows; i++){
    const struct MASTERLCP_StateTransitions * valid =
      &g_masterLCPTakeoverStateTransitions[i];

    if(valid->CurrentState == c_lcpMasterTakeOverState.state &&
       valid->ParticipantState == lcpState){
      jam();
      found = true;
      c_lcpMasterTakeOverState.set(valid->NewState, __LINE__);
      break;
    }
  }
  ndbrequire(found);

  bool ok = false;
  switch(lcpState){
  case MasterLCPConf::LCP_STATUS_IDLE:
    ok = true;
    break;
  case MasterLCPConf::LCP_STATUS_ACTIVE:
  case MasterLCPConf::LCP_TAB_COMPLETED:
  case MasterLCPConf::LCP_TAB_SAVED:
    ok = true;
    // This node still participates in the ongoing LCP: we must wait for
    // its LCP_COMPLETE_REP before the checkpoint can finish.
    c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.setWaitingFor(nodePtr.i);
    break;
  }
  ndbrequire(ok);

  // Falls through only when every MASTER_LCPCONF has arrived.
  receiveLoopMacro(MASTER_LCPREQ, senderNodeId);
  /*-------------------------------------------------------------------------*/
  // We have now received all responses and are ready to take over the LCP
  // protocol as master.
  /*-------------------------------------------------------------------------*/
  MASTER_LCPhandling(signal, failedNodeId);
}//Dbdih::execMASTER_LCPCONF()
11488 
execMASTER_LCPREF(Signal * signal)11489 void Dbdih::execMASTER_LCPREF(Signal* signal)
11490 {
11491   const MasterLCPRef * const ref = (MasterLCPRef *)&signal->theData[0];
11492   jamEntry();
11493 
11494   Uint32 senderNodeId = ref->senderNodeId;
11495   Uint32 failedNodeId = ref->failedNodeId;
11496 
11497   if (c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(senderNodeId))
11498   {
11499     jam();
11500     c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.clearWaitingFor(senderNodeId);
11501   }
11502 
11503   receiveLoopMacro(MASTER_LCPREQ, senderNodeId);
11504   /*-------------------------------------------------------------------------*/
11505   // We have now received all responses and are ready to take over the LCP
11506   // protocol as master.
11507   /*-------------------------------------------------------------------------*/
11508   MASTER_LCPhandling(signal, failedNodeId);
11509 }//Dbdih::execMASTER_LCPREF()
11510 
/**
 * Conclude the LCP master take-over once every participant has answered
 * MASTER_LCPREQ.  Depending on the aggregated take-over state we either
 * restart checking for LCP start (all idle), restart the LCP start
 * sequence (copy ongoing), resume sending LCP_FRAG_ORD (all active), or
 * simply wait for the running LCP to finish (concluding).
 *
 * @param signal        in/out signal object
 * @param failedNodeId  node whose failure triggered the take-over
 */
void Dbdih::MASTER_LCPhandling(Signal* signal, Uint32 failedNodeId)
{
  /*-------------------------------------------------------------------------
   *
   * WE ARE NOW READY TO CONCLUDE THE TAKE OVER AS MASTER.
   * WE HAVE ENOUGH INFO TO START UP ACTIVITIES IN THE PROPER PLACE.
   * ALSO SET THE PROPER STATE VARIABLES.
   *------------------------------------------------------------------------*/
  /* Resume the fragment scan where the failed master left off. */
  c_lcpState.currentFragment.tableId = c_lcpMasterTakeOverState.minTableId;
  c_lcpState.currentFragment.fragmentId = c_lcpMasterTakeOverState.minFragId;
  c_lcpState.m_LAST_LCP_FRAG_ORD = c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH;

  NodeRecordPtr failedNodePtr;
  failedNodePtr.i = failedNodeId;
  ptrCheckGuard(failedNodePtr, MAX_NDB_NODES, nodeRecord);

  switch (c_lcpMasterTakeOverState.state) {
  case LMTOS_ALL_IDLE:
    jam();
    /* --------------------------------------------------------------------- */
    // All nodes were idle in the LCP protocol. Start checking for start of LCP
    // protocol.
    /* --------------------------------------------------------------------- */
#ifdef VM_TRACE
    g_eventLogger->info("MASTER_LCPhandling:: LMTOS_ALL_IDLE -> checkLcpStart");
#endif
    checkLcpStart(signal, __LINE__, 0);
    break;
  case LMTOS_COPY_ONGOING:
    jam();
    /* --------------------------------------------------------------------- */
    // We were in the starting process of the LCP protocol. We will restart the
    // protocol by calculating the keep gci and storing the new lcp id.
    /* --------------------------------------------------------------------- */
#ifdef VM_TRACE
    g_eventLogger->info("MASTER_LCPhandling:: LMTOS_COPY_ONGOING -> storeNewLcpId");
#endif
    if (c_lcpState.lcpStatus == LCP_STATUS_ACTIVE) {
      jam();
      /*---------------------------------------------------------------------*/
      /*  WE NEED TO DECREASE THE LATEST LCP ID SINCE WE HAVE ALREADY        */
      /*  STARTED THIS */
      /*  LOCAL CHECKPOINT.                                                  */
      /*---------------------------------------------------------------------*/
#ifdef VM_TRACE
      Uint32 lcpId = SYSFILE->latestLCP_ID;
      g_eventLogger->info("Decreasing latestLCP_ID from %d to %d", lcpId, lcpId - 1);
#endif
      /* The restart below will increment the id again for the same LCP. */
      SYSFILE->latestLCP_ID--;
    }//if
    start_lcp_before_mutex(signal);
    break;
  case LMTOS_ALL_ACTIVE:
    {
      jam();
      /* -------------------------------------------------------------------
       * Everybody was in the active phase. We will restart sending
       * LCP_FRAG_ORD to the nodes from the new master.
       * We also need to set dihLcpStatus to ZACTIVE
       * in the master node since the master will wait for all nodes to
       * complete before finalising the LCP process.
       * ------------------------------------------------------------------ */
#ifdef VM_TRACE
      g_eventLogger->info("MASTER_LCPhandling:: LMTOS_ALL_ACTIVE -> "
                          "startLcpRoundLoopLab(table=%u, fragment=%u)",
                          c_lcpMasterTakeOverState.minTableId,
                          c_lcpMasterTakeOverState.minFragId);
#endif

      c_lcpState.keepGci = SYSFILE->keepGCI;

      /**
       * We need not protect against ongoing copy of meta data here since
       * that cannot be ongoing while we are taking over as master. The
       * reason is that a starting node will always fail also if any node
       * fails in the middle of the start process.
       */
      c_lcp_runs_with_pause_support = check_if_pause_lcp_possible();
      if (!c_lcp_runs_with_pause_support)
      {
        jam();
        /**
         * We need to reaquire the mutex...
         */
        /* Callback continues in master_lcp_fragmentMutex_locked(). */
        Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
        Callback c =
          { safe_cast(&Dbdih::master_lcp_fragmentMutex_locked),
            failedNodePtr.i
          };
        ndbrequire(mutex.lock(c, false));
      }
      else
      {
        jam();
        /* No mutex is needed, call callback function immediately */
        master_lcp_fragmentMutex_locked(signal, failedNodePtr.i, 0);
      }
      return;
    }
  case LMTOS_LCP_CONCLUDING:
    {
      jam();
      /* ------------------------------------------------------------------- */
      // The LCP process is in the finalisation phase. We simply wait for it to
      // complete with signals arriving in. We need to check also if we should
      // change state due to table write completion during state
      // collection phase.
      /* ------------------------------------------------------------------- */
      ndbrequire(c_lcpState.lcpStatus != LCP_STATUS_IDLE);

      c_lcp_runs_with_pause_support = check_if_pause_lcp_possible();
      if (!c_lcp_runs_with_pause_support)
      {
        jam();
        /**
         * We need to reaquire the mutex...
         * We have nodes in the cluster without support of pause lcp.
         */
        /* Callback continues in master_lcp_fragmentMutex_locked(). */
        Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
        Callback c =
          { safe_cast(&Dbdih::master_lcp_fragmentMutex_locked),
            failedNodePtr.i
          };
        ndbrequire(mutex.lock(c, false));
      }
      else
      {
        jam();
        /* No mutex is needed, call callback function immediately */
        master_lcp_fragmentMutex_locked(signal, failedNodePtr.i, 0);
      }
      return;
    }
  default:
    ndbrequire(false);
    break;
  }//switch
  /* Reached only from the non-returning cases above (ALL_IDLE/COPY_ONGOING). */
  signal->theData[0] = NDB_LE_LCP_TakeoverCompleted;
  signal->theData[1] = c_lcpMasterTakeOverState.state;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);

  /* Dump LCP state for post-mortem diagnostics (DUMP code 7012). */
  signal->theData[0] = 7012;
  execDUMP_STATE_ORD(signal);

  c_lcpMasterTakeOverState.set(LMTOS_IDLE, __LINE__);

  checkLocalNodefailComplete(signal, failedNodePtr.i, NF_LCP_TAKE_OVER);
}
11659 
11660 /* ------------------------------------------------------------------------- */
11661 /*       A BLOCK OR A NODE HAS COMPLETED THE HANDLING OF THE NODE FAILURE.   */
11662 /* ------------------------------------------------------------------------- */
execNF_COMPLETEREP(Signal * signal)11663 void Dbdih::execNF_COMPLETEREP(Signal* signal)
11664 {
11665   NodeRecordPtr failedNodePtr;
11666   NFCompleteRep * const nfCompleteRep = (NFCompleteRep *)&signal->theData[0];
11667   jamEntry();
11668   const Uint32 blockNo = nfCompleteRep->blockNo;
11669   Uint32 nodeId       = nfCompleteRep->nodeId;
11670   failedNodePtr.i = nfCompleteRep->failedNodeId;
11671 
11672   ptrCheckGuard(failedNodePtr, MAX_NDB_NODES, nodeRecord);
11673   switch (blockNo) {
11674   case DBTC:
11675     jam();
11676     ndbrequire(failedNodePtr.p->dbtcFailCompleted == ZFALSE);
11677     /* -------------------------------------------------------------------- */
11678     // Report the event that DBTC completed node failure handling.
11679     /* -------------------------------------------------------------------- */
11680     signal->theData[0] = NDB_LE_NodeFailCompleted;
11681     signal->theData[1] = DBTC;
11682     signal->theData[2] = failedNodePtr.i;
11683     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
11684 
11685     failedNodePtr.p->dbtcFailCompleted = ZTRUE;
11686     break;
11687   case DBDICT:
11688     jam();
11689     ndbrequire(failedNodePtr.p->dbdictFailCompleted == ZFALSE);
11690     /* --------------------------------------------------------------------- */
11691     // Report the event that DBDICT completed node failure handling.
11692     /* --------------------------------------------------------------------- */
11693     signal->theData[0] = NDB_LE_NodeFailCompleted;
11694     signal->theData[1] = DBDICT;
11695     signal->theData[2] = failedNodePtr.i;
11696     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
11697 
11698     failedNodePtr.p->dbdictFailCompleted = ZTRUE;
11699     break;
11700   case DBDIH:
11701     jam();
11702     ndbrequire(failedNodePtr.p->dbdihFailCompleted == ZFALSE);
11703     /* --------------------------------------------------------------------- */
11704     // Report the event that DBDIH completed node failure handling.
11705     /* --------------------------------------------------------------------- */
11706     signal->theData[0] = NDB_LE_NodeFailCompleted;
11707     signal->theData[1] = DBDIH;
11708     signal->theData[2] = failedNodePtr.i;
11709     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
11710 
11711     failedNodePtr.p->dbdihFailCompleted = ZTRUE;
11712     break;
11713   case DBLQH:
11714     jam();
11715     ndbrequire(failedNodePtr.p->dblqhFailCompleted == ZFALSE);
11716     /* --------------------------------------------------------------------- */
11717     // Report the event that DBDIH completed node failure handling.
11718     /* --------------------------------------------------------------------- */
11719     signal->theData[0] = NDB_LE_NodeFailCompleted;
11720     signal->theData[1] = DBLQH;
11721     signal->theData[2] = failedNodePtr.i;
11722     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
11723 
11724     failedNodePtr.p->dblqhFailCompleted = ZTRUE;
11725     break;
11726   case 0: /* Node has finished */
11727     jam();
11728     ndbrequire(nodeId < MAX_NDB_NODES);
11729 
11730     if (failedNodePtr.p->recNODE_FAILREP == ZFALSE) {
11731       jam();
11732       /* ------------------------------------------------------------------- */
11733       // We received a report about completion of node failure before we
11734       // received the message about the NODE failure ourselves.
11735       // We will send the signal to ourselves with a small delay
11736       // (10 milliseconds).
11737       /* ------------------------------------------------------------------- */
11738       //nf->from = __LINE__;
11739       sendSignalWithDelay(reference(), GSN_NF_COMPLETEREP, signal, 10,
11740 			  signal->length());
11741       return;
11742     }//if
11743 
11744     if (!failedNodePtr.p->m_NF_COMPLETE_REP.isWaitingFor(nodeId)){
11745       jam();
11746       return;
11747     }
11748 
11749     failedNodePtr.p->m_NF_COMPLETE_REP.clearWaitingFor(nodeId);;
11750 
11751     /* -------------------------------------------------------------------- */
11752     // Report the event that nodeId has completed node failure handling.
11753     /* -------------------------------------------------------------------- */
11754     signal->theData[0] = NDB_LE_NodeFailCompleted;
11755     signal->theData[1] = 0;
11756     signal->theData[2] = failedNodePtr.i;
11757     signal->theData[3] = nodeId;
11758     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 4, JBB);
11759 
11760     nodeFailCompletedCheckLab(signal, failedNodePtr);
11761     return;
11762     break;
11763   default:
11764     ndbrequire(false);
11765     return;
11766     break;
11767   }//switch
11768   if (failedNodePtr.p->dbtcFailCompleted == ZFALSE) {
11769     jam();
11770     return;
11771   }//if
11772   if (failedNodePtr.p->dbdictFailCompleted == ZFALSE) {
11773     jam();
11774     return;
11775   }//if
11776   if (failedNodePtr.p->dbdihFailCompleted == ZFALSE) {
11777     jam();
11778     return;
11779   }//if
11780   if (failedNodePtr.p->dblqhFailCompleted == ZFALSE) {
11781     jam();
11782     return;
11783   }//if
11784   /* ----------------------------------------------------------------------- */
11785   /*     ALL BLOCKS IN THIS NODE HAVE COMPLETED THEIR PART OF HANDLING THE   */
11786   /*     NODE FAILURE. WE CAN NOW REPORT THIS COMPLETION TO ALL OTHER NODES. */
11787   /* ----------------------------------------------------------------------- */
11788   NodeRecordPtr nodePtr;
11789   for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
11790     jam();
11791     ptrAss(nodePtr, nodeRecord);
11792     if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
11793       jam();
11794       BlockReference ref = calcDihBlockRef(nodePtr.i);
11795       NFCompleteRep * const nf = (NFCompleteRep *)&signal->theData[0];
11796       nf->blockNo      = 0;
11797       nf->nodeId       = cownNodeId;
11798       nf->failedNodeId = failedNodePtr.i;
11799       nf->from = __LINE__;
11800       sendSignal(ref, GSN_NF_COMPLETEREP, signal,
11801                  NFCompleteRep::SignalLength, JBB);
11802     }//if
11803   }//for
11804   return;
11805 }//Dbdih::execNF_COMPLETEREP()
11806 
nodeFailCompletedCheckLab(Signal * signal,NodeRecordPtr failedNodePtr)11807 void Dbdih::nodeFailCompletedCheckLab(Signal* signal,
11808 				      NodeRecordPtr failedNodePtr)
11809 {
11810   jam();
11811   if (!failedNodePtr.p->m_NF_COMPLETE_REP.done()){
11812     jam();
11813     return;
11814   }//if
11815   /* ---------------------------------------------------------------------- */
11816   /*    ALL BLOCKS IN ALL NODES HAVE NOW REPORTED COMPLETION OF THE NODE    */
11817   /*    FAILURE HANDLING. WE ARE NOW READY TO ACCEPT THAT THIS NODE STARTS  */
11818   /*    AGAIN.                                                              */
11819   /* ---------------------------------------------------------------------- */
11820   jam();
11821   failedNodePtr.p->nodeStatus = NodeRecord::DEAD;
11822   failedNodePtr.p->recNODE_FAILREP = ZFALSE;
11823 
11824   /* ---------------------------------------------------------------------- */
11825   // Report the event that all nodes completed node failure handling.
11826   /* ---------------------------------------------------------------------- */
11827   signal->theData[0] = NDB_LE_NodeFailCompleted;
11828   signal->theData[1] = 0;
11829   signal->theData[2] = failedNodePtr.i;
11830   signal->theData[3] = 0;
11831   sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 4, JBB);
11832 
11833   /* ---------------------------------------------------------------------- */
11834   // Report to QMGR that we have concluded recovery handling of this node.
11835   /* ---------------------------------------------------------------------- */
11836   signal->theData[0] = failedNodePtr.i;
11837   sendSignal(QMGR_REF, GSN_NDB_FAILCONF, signal, 1, JBB);
11838   setNodeRecoveryStatus(failedNodePtr.i, NodeRecord::NODE_FAILURE_COMPLETED);
11839   return;
11840 }//Dbdih::nodeFailCompletedCheckLab()
11841 
11842 /*****************************************************************************/
11843 /* **********     SEIZING / RELEASING MODULE                     *************/
11844 /*****************************************************************************/
11845 /*
11846   3.4   L O C A L  N O D E   S E I Z E
11847   ************************************
11848   */
/*
  3.7   A D D   T A B L E
  ***********************
  */
11853 /*****************************************************************************/
11854 /* **********     TABLE ADDING MODULE                            *************/
11855 /*****************************************************************************/
11856 /*
11857   3.7.1   A D D   T A B L E   M A I N L Y
11858   ***************************************
11859   */
11860 
inc_node_or_group(Uint32 & node,Uint32 max_node)11861 static inline void inc_node_or_group(Uint32 &node, Uint32 max_node)
11862 {
11863   Uint32 next = node + 1;
11864   node = (next == max_node ? 0 : next);
11865 }
11866 
11867 /*
11868   Spread fragments in backwards compatible mode
11869 */
set_default_node_groups(Signal * signal,Uint32 noFrags)11870 static void set_default_node_groups(Signal *signal, Uint32 noFrags)
11871 {
11872   Uint16 *node_group_array = (Uint16*)&signal->theData[25];
11873   Uint32 i;
11874   node_group_array[0] = 0;
11875   for (i = 1; i < noFrags; i++)
11876     node_group_array[i] = NDB_UNDEF_NODEGROUP;
11877 }
11878 
/**
 * Return the index of the first minimum element in array[0..cnt-1].
 */
static Uint32 find_min_index(const Uint32* array, Uint32 cnt)
{
  Uint32 best = 0;
  for (Uint32 i = 1; i < cnt; i++)
  {
    if (array[i] < array[best])
    {
      best = i;
    }
  }
  return best;
}
11893 
11894 Uint32
getFragmentsPerNode()11895 Dbdih::getFragmentsPerNode()
11896 {
11897   jam();
11898   if (c_fragments_per_node_ != 0)
11899   {
11900     return c_fragments_per_node_;
11901   }
11902 
11903   c_fragments_per_node_ = getLqhWorkers();
11904   if (c_fragments_per_node_ == 0)
11905     c_fragments_per_node_ = 1; // ndbd
11906 
11907   NodeRecordPtr nodePtr;
11908   nodePtr.i = cfirstAliveNode;
11909   do
11910   {
11911     jam();
11912     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
11913     Uint32 workers = getNodeInfo(nodePtr.i).m_lqh_workers;
11914     if (workers == 0) // ndbd
11915       workers = 1;
11916 
11917     c_fragments_per_node_ = MIN(workers, c_fragments_per_node_);
11918     nodePtr.i = nodePtr.p->nextNode;
11919   } while (nodePtr.i != RNIL);
11920 
11921   if (c_fragments_per_node_ == 0)
11922   {
11923     ndbassert(false);
11924     c_fragments_per_node_ = 1;
11925   }
11926 #ifdef VM_TRACE
11927   ndbout_c("Using %u fragments per node", c_fragments_per_node_);
11928 #endif
11929   return c_fragments_per_node_;
11930 }
11931 
/**
 * CREATE_FRAGMENTATION_REQ: compute the fragmentation for a table.
 *
 * The result is an array of Uint16 built in signal->theData[25..]:
 *   fragments[0] = noOfReplicas
 *   fragments[1] = noOfFragments
 *   then per fragment: log part id, preferred primary node, followed by
 *   the remaining replica nodes of that fragment.
 *
 * When primaryTableId == RNIL a new fragmentation is derived from the
 * fragmentation type and the node-group array sent in theData[25..].
 * Otherwise the existing primary table's fragmentation is copied
 * (RI_GET_FRAGMENTATION) or extended with new fragments spread over the
 * least-loaded nodes (RI_ADD_PARTITION).
 *
 * On exit theData[0] holds 0 (ACK) or an error code (NACK); when
 * senderRef != 0 a CREATE_FRAGMENTATION_CONF carrying the fragment list
 * as a linear section is also sent.
 */
void Dbdih::execCREATE_FRAGMENTATION_REQ(Signal * signal)
{
  Uint16 node_group_id[MAX_NDB_PARTITIONS];
  jamEntry();
  CreateFragmentationReq * const req =
    (CreateFragmentationReq*)signal->getDataPtr();

  const Uint32 senderRef = req->senderRef;
  const Uint32 senderData = req->senderData;
  Uint32 noOfFragments = req->noOfFragments;
  const Uint32 fragType = req->fragmentationType;
  const Uint32 primaryTableId = req->primaryTableId;
  const Uint32 map_ptr_i = req->map_ptr_i;
  const Uint32 flags = req->requestInfo;

  Uint32 err = 0;
  /* Default: one fragment per LQH worker in every node of every group. */
  const Uint32 defaultFragments =
    getFragmentsPerNode() * cnoOfNodeGroups * cnoReplicas;
  const Uint32 maxFragments = MAX_FRAG_PER_LQH * defaultFragments;

  do {
    NodeGroupRecordPtr NGPtr;
    TabRecordPtr primTabPtr;
    Uint32 count = 2; /* fragments[0..1] are reserved for the two counts */
    Uint16 noOfReplicas = cnoReplicas;
    Uint16 *fragments = (Uint16*)(signal->theData+25);
    if (primaryTableId == RNIL) {
      jam();
      /* New table: derive fragment count and node groups from fragType. */
      switch ((DictTabInfo::FragmentType)fragType){
        /*
          Backward compatibility and for all places in code not changed.
        */
      case DictTabInfo::AllNodesSmallTable:
        jam();
        noOfFragments = defaultFragments;
        set_default_node_groups(signal, noOfFragments);
        break;
      case DictTabInfo::AllNodesMediumTable:
        jam();
        noOfFragments = 2 * defaultFragments;
        if (noOfFragments > maxFragments)
          noOfFragments = maxFragments;
        set_default_node_groups(signal, noOfFragments);
        break;
      case DictTabInfo::AllNodesLargeTable:
        jam();
        noOfFragments = 4 * defaultFragments;
        if (noOfFragments > maxFragments)
          noOfFragments = maxFragments;
        set_default_node_groups(signal, noOfFragments);
        break;
      case DictTabInfo::SingleFragment:
        jam();
        noOfFragments = 1;
        set_default_node_groups(signal, noOfFragments);
        break;
      case DictTabInfo::DistrKeyHash:
        jam();
      case DictTabInfo::DistrKeyLin:
        jam();
        if (noOfFragments == 0)
        {
          jam();
          noOfFragments = defaultFragments;
          set_default_node_groups(signal, noOfFragments);
        }
        break;
      case DictTabInfo::HashMapPartition:
      {
        jam();
        /* Fragment count must agree with the hash map's fragment count. */
        ndbrequire(map_ptr_i != RNIL);
        Ptr<Hash2FragmentMap> ptr;
        g_hash_map.getPtr(ptr, map_ptr_i);
        if (noOfFragments == 0)
        {
          jam();
          noOfFragments = ptr.p->m_fragments;
        }
        else if (noOfFragments != ptr.p->m_fragments)
        {
          jam();
          err = CreateFragmentationRef::InvalidFragmentationType;
          break;
        }
        set_default_node_groups(signal, noOfFragments);
        break;
      }
      default:
        jam();
        if (noOfFragments == 0)
        {
          jam();
          err = CreateFragmentationRef::InvalidFragmentationType;
        }
        break;
      }
      if (err)
        break;
      /*
        When we come here the exact partition is specified
        and there is an array of node groups sent along as well.
      */
      memcpy(&node_group_id[0], &signal->theData[25], 2 * noOfFragments);
      /* Per node group: which replica slot to use as next primary. */
      Uint16 next_replica_node[MAX_NDB_NODES];
      memset(next_replica_node,0,sizeof(next_replica_node));
      Uint32 default_node_group= c_nextNodeGroup;
      for(Uint32 fragNo = 0; fragNo < noOfFragments; fragNo++)
      {
        jam();
        NGPtr.i = node_group_id[fragNo];
        if (NGPtr.i == NDB_UNDEF_NODEGROUP)
        {
          jam();
          /* No explicit group: assign node groups round-robin. */
          NGPtr.i = c_node_groups[default_node_group];
        }
        if (NGPtr.i >= MAX_NDB_NODES)
        {
          jam();
          err = CreateFragmentationRef::InvalidNodeGroup;
          break;
        }
        ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
        if (NGPtr.p->nodegroupIndex == RNIL)
        {
          jam();
          err = CreateFragmentationRef::InvalidNodeGroup;
          break;
        }
        const Uint32 max = NGPtr.p->nodeCount;
        /* Spread fragments over the log parts of the group. */
        const Uint32 logPart = (NGPtr.p->m_next_log_part++ / cnoReplicas) % globalData.ndbLogParts;
        ndbrequire(logPart < NDBMT_MAX_WORKER_INSTANCES);
        fragments[count++] = logPart; // Store logpart first
        /* Rotate primaries within the group so load is balanced. */
        Uint32 tmp= next_replica_node[NGPtr.i];
        for(Uint32 replicaNo = 0; replicaNo < noOfReplicas; replicaNo++)
        {
          jam();
          const Uint16 nodeId = NGPtr.p->nodesInGroup[tmp];
          fragments[count++]= nodeId;
          inc_node_or_group(tmp, max);
        }
        inc_node_or_group(tmp, max);
        next_replica_node[NGPtr.i]= tmp;

        /**
         * Next node group for next fragment
         */
        inc_node_or_group(default_node_group, cnoOfNodeGroups);
      }
      if (err)
      {
        jam();
        break;
      }
      else
      {
        jam();
        /* Remember where round-robin assignment stopped. */
        c_nextNodeGroup = default_node_group;
      }
    } else {
      /* Existing primary table: copy (and possibly extend) its layout. */
      if (primaryTableId >= ctabFileSize) {
        jam();
        err = CreateFragmentationRef::InvalidPrimaryTable;
        break;
      }
      primTabPtr.i = primaryTableId;
      ptrAss(primTabPtr, tabRecord);
      if (primTabPtr.p->tabStatus != TabRecord::TS_ACTIVE) {
        jam();
        err = CreateFragmentationRef::InvalidPrimaryTable;
        break;
      }
      Uint32 fragments_per_node[MAX_NDB_NODES]; // Keep track of no of (primary) fragments per node
      bzero(fragments_per_node, sizeof(fragments_per_node));
      for (Uint32 fragNo = 0; fragNo < primTabPtr.p->totalfragments; fragNo++) {
        jam();
        FragmentstorePtr fragPtr;
        ReplicaRecordPtr replicaPtr;
        getFragstore(primTabPtr.p, fragNo, fragPtr);
        fragments[count++] = fragPtr.p->m_log_part_id;
        fragments[count++] = fragPtr.p->preferredPrimary;
        fragments_per_node[fragPtr.p->preferredPrimary]++;
        /* Append the non-primary replicas: first stored, then old stored. */
        for (replicaPtr.i = fragPtr.p->storedReplicas;
             replicaPtr.i != RNIL;
             replicaPtr.i = replicaPtr.p->nextPool) {
          jam();
          c_replicaRecordPool.getPtr(replicaPtr);
          if (replicaPtr.p->procNode != fragPtr.p->preferredPrimary) {
            jam();
            fragments[count++]= replicaPtr.p->procNode;
          }
        }
        for (replicaPtr.i = fragPtr.p->oldStoredReplicas;
             replicaPtr.i != RNIL;
             replicaPtr.i = replicaPtr.p->nextPool) {
          jam();
          c_replicaRecordPool.getPtr(replicaPtr);
          if (replicaPtr.p->procNode != fragPtr.p->preferredPrimary) {
            jam();
            fragments[count++]= replicaPtr.p->procNode;
          }
        }
      }

      if (flags & CreateFragmentationReq::RI_GET_FRAGMENTATION)
      {
        jam();
        noOfFragments = primTabPtr.p->totalfragments;
      }
      else if (flags & CreateFragmentationReq::RI_ADD_PARTITION)
      {
        jam();
        /**
         * All nodes that dont belong to a nodegroup to ~0 fragments_per_node
         *   so that they dont get any more...
         */
        for (Uint32 i = 0; i<MAX_NDB_NODES; i++)
        {
          if (getNodeStatus(i) == NodeRecord::NOT_IN_CLUSTER ||
              getNodeGroup(i) >= cnoOfNodeGroups) // XXX todo
          {
            jam();
            ndbassert(fragments_per_node[i] == 0);
            fragments_per_node[i] = ~(Uint32)0;
          }
        }
        /* Place each new fragment on the node with fewest primaries. */
        for (Uint32 i = primTabPtr.p->totalfragments; i<noOfFragments; i++)
        {
          jam();
          Uint32 node = find_min_index(fragments_per_node,
                                       NDB_ARRAY_SIZE(fragments_per_node));
          NGPtr.i = getNodeGroup(node);
          ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
          const Uint32 logPart = (NGPtr.p->m_next_log_part++) % globalData.ndbLogParts;
          ndbrequire(logPart < NDBMT_MAX_WORKER_INSTANCES);
          fragments[count++] = logPart;
          fragments[count++] = node;
          fragments_per_node[node]++;
          for (Uint32 r = 0; r<noOfReplicas; r++)
          {
            jam();
            if (NGPtr.p->nodesInGroup[r] != node)
            {
              jam();
              fragments[count++] = NGPtr.p->nodesInGroup[r];
            }
          }
        }
      }
    }
    /* Sanity: the array must hold exactly 1+noOfReplicas entries/fragment. */
    if(count != (2U + (1 + noOfReplicas) * noOfFragments)){
        char buf[255];
        BaseString::snprintf(buf, sizeof(buf),
                           "Illegal configuration change: NoOfReplicas."
                           " Can't be applied online ");
        progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
    }

    CreateFragmentationConf * const conf =
      (CreateFragmentationConf*)signal->getDataPtrSend();
    conf->senderRef = reference();
    conf->senderData = senderData;
    conf->noOfReplicas = (Uint32)noOfReplicas;
    conf->noOfFragments = (Uint32)noOfFragments;

    fragments[0]= noOfReplicas;
    fragments[1]= noOfFragments;

    if(senderRef != 0)
    {
      jam();
      /* The Uint16 array is shipped as one linear section (word-padded). */
      LinearSectionPtr ptr[3];
      ptr[0].p = (Uint32*)&fragments[0];
      ptr[0].sz = (count + 1) / 2;
      sendSignal(senderRef,
		 GSN_CREATE_FRAGMENTATION_CONF,
		 signal,
		 CreateFragmentationConf::SignalLength,
		 JBB,
		 ptr,
		 1);
    }
    // Always ACK/NACK (here ACK)
    signal->theData[0] = 0;
    return;
  } while(false);
  // Always ACK/NACK (here NACK)
  signal->theData[0] = err;
}
12220 
/**
 * DIADDTABREQ: request to add a table to DIH.
 *
 * Seizes a connect record to track the operation and initializes the
 * table record from the request, then takes one of three paths:
 *  - table already TS_ACTIVE: re-enter the add-fragment protocol
 *    directly from fragment 0;
 *  - system restart with a TS_IDLE table (master only): open the stored
 *    table description file; processing resumes on the open-confirm
 *    (FileRecord::OPENING_TABLE);
 *  - otherwise: build the fragment/replica layout from the request's
 *    FRAGMENTATION section and continue via CONTINUEB
 *    (ZPACK_TABLE_INTO_PAGES).
 */
void Dbdih::execDIADDTABREQ(Signal* signal)
{
  Uint32 fragType;
  jamEntry();

  DiAddTabReq * const req = (DiAddTabReq*)signal->getDataPtr();

  // Seize connect record from the head of the free list.
  ndbrequire(cfirstconnect != RNIL);
  ConnectRecordPtr connectPtr;
  connectPtr.i = cfirstconnect;
  ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
  cfirstconnect = connectPtr.p->nextPool;

  const Uint32 userPtr = req->connectPtr;
  const BlockReference userRef = signal->getSendersBlockRef();
  connectPtr.p->nextPool = RNIL;
  connectPtr.p->userpointer = userPtr;
  connectPtr.p->userblockref = userRef;
  connectPtr.p->connectState = ConnectRecord::INUSE;
  connectPtr.p->table = req->tableId;
  connectPtr.p->m_alter.m_changeMask = 0;
  connectPtr.p->m_create.m_map_ptr_i = req->hashMapPtrI;

  // Initialize the table record from the request parameters.
  TabRecordPtr tabPtr;
  tabPtr.i = req->tableId;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  tabPtr.p->connectrec = connectPtr.i;
  tabPtr.p->tableType = req->tableType;
  fragType= req->fragType;
  tabPtr.p->schemaVersion = req->schemaVersion;
  tabPtr.p->primaryTableId = req->primaryTableId;
  tabPtr.p->schemaTransId = req->schemaTransId;
  tabPtr.p->m_scan_count[0] = 0;
  tabPtr.p->m_scan_count[1] = 0;
  tabPtr.p->m_scan_reorg_flag = 0;

  if (tabPtr.p->tabStatus == TabRecord::TS_ACTIVE)
  {
    jam();
    // Table already exists: go straight to sending ADD_FRAGREQ
    // starting at fragment 0.
    tabPtr.p->tabStatus = TabRecord::TS_CREATING;
    connectPtr.p->m_alter.m_totalfragments = tabPtr.p->totalfragments;
    sendAddFragreq(signal, connectPtr, tabPtr, 0);
    return;
  }

  if (getNodeState().getSystemRestartInProgress() &&
     tabPtr.p->tabStatus == TabRecord::TS_IDLE)
  {
    jam();

    // Only the master takes this path during system restart.
    ndbrequire(cmasterNodeId == getOwnNodeId());
    tabPtr.p->tabStatus = TabRecord::TS_CREATING;

    // Open table file 0; the open-confirm continues the protocol.
    initTableFile(tabPtr);
    FileRecordPtr filePtr;
    filePtr.i = tabPtr.p->tabFile[0];
    ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
    openFileRw(signal, filePtr);
    filePtr.p->reqStatus = FileRecord::OPENING_TABLE;
    return;
  }

  /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
  /* AT THE TIME OF INITIATING THE FILE OF TABLE         */
  /* DESCRIPTION IS CREATED FOR APPROPRIATE SIZE. EACH   */
  /* EACH RECORD IN THIS FILE HAS THE INFORMATION ABOUT  */
  /* ONE TABLE. THE POINTER TO THIS RECORD IS THE TABLE  */
  /* REFERENCE. IN THE BEGINNING ALL RECORDS ARE CREATED */
  /* BUT THEY DO NOT HAVE ANY INFORMATION ABOUT ANY TABLE*/
  /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
  tabPtr.p->tabStatus = TabRecord::TS_CREATING;
  // Map the request flags onto the table's storage class.
  if(req->loggedTable)
    tabPtr.p->tabStorage= TabRecord::ST_NORMAL;
  else if(req->temporaryTable)
    tabPtr.p->tabStorage= TabRecord::ST_TEMPORARY;
  else
    tabPtr.p->tabStorage= TabRecord::ST_NOLOGGING;
  tabPtr.p->kvalue = req->kValue;

  // Select the partitioning method from the fragmentation type.
  switch ((DictTabInfo::FragmentType)fragType){
  case DictTabInfo::HashMapPartition:
    tabPtr.p->method = TabRecord::HASH_MAP;
    break;
  case DictTabInfo::AllNodesSmallTable:
  case DictTabInfo::AllNodesMediumTable:
  case DictTabInfo::AllNodesLargeTable:
  case DictTabInfo::SingleFragment:
    jam();
  case DictTabInfo::DistrKeyLin:
    jam();
    tabPtr.p->method = TabRecord::LINEAR_HASH;
    break;
  case DictTabInfo::DistrKeyHash:
    jam();
    tabPtr.p->method = TabRecord::NORMAL_HASH;
    break;
  case DictTabInfo::DistrKeyOrderedIndex:
  {
    // An ordered index inherits method and hash map from its base table.
    TabRecordPtr primTabPtr;
    primTabPtr.i = req->primaryTableId;
    ptrCheckGuard(primTabPtr, ctabFileSize, tabRecord);
    tabPtr.p->method = primTabPtr.p->method;
    req->hashMapPtrI = primTabPtr.p->m_map_ptr_i;
    break;
  }
  case DictTabInfo::UserDefined:
    jam();
    tabPtr.p->method = TabRecord::USER_DEFINED;
    break;
  default:
    ndbrequire(false);
  }

  /**
   * Fetch the fragmentation data from the signal section.  Layout:
   * fragments[0] = noOfReplicas, fragments[1] = noOfFragments, then per
   * fragment: log part id, preferred primary node, remaining replica
   * nodes (noOfReplicas entries including the primary).
   */
  union {
    Uint16 fragments[MAX_FRAGMENT_DATA_ENTRIES];
    Uint32 align;
  };
  (void)align; // kill warning
  SectionHandle handle(this, signal);
  SegmentedSectionPtr fragDataPtr;
  ndbrequire(handle.getSection(fragDataPtr, DiAddTabReq::FRAGMENTATION));
  copy((Uint32*)fragments, fragDataPtr);
  releaseSections(handle);

  const Uint32 noReplicas = fragments[0];
  const Uint32 noFragments = fragments[1];

  tabPtr.p->noOfBackups = noReplicas - 1;
  tabPtr.p->totalfragments = noFragments;
  ndbrequire(noReplicas == cnoReplicas); // Only allowed

  if (ERROR_INSERTED(7173)) {
    CLEAR_ERROR_INSERT_VALUE;
    addtabrefuseLab(signal, connectPtr, ZREPLERROR1);
    return;
  }
  // Refuse if we cannot allocate the replica/fragment records needed.
  if ((noReplicas * noFragments) > cnoFreeReplicaRec) {
    jam();
    addtabrefuseLab(signal, connectPtr, ZREPLERROR1);
    return;
  }//if
  if (noFragments > cremainingfrags) {
    jam();
    addtabrefuseLab(signal, connectPtr, ZREPLERROR2);
    return;
  }//if

  // Compute mask/hashpointer as the largest power of two <= fragment
  // count (used by the linear-hash distribution).
  Uint32 logTotalFragments = 1;
  while (logTotalFragments <= tabPtr.p->totalfragments) {
    jam();
    logTotalFragments <<= 1;
  }
  logTotalFragments >>= 1;
  tabPtr.p->mask = logTotalFragments - 1;
  tabPtr.p->hashpointer = tabPtr.p->totalfragments - logTotalFragments;
  allocFragments(tabPtr.p->totalfragments, tabPtr);

  if (tabPtr.p->method == TabRecord::HASH_MAP)
  {
    jam();
    tabPtr.p->m_map_ptr_i = req->hashMapPtrI;
    tabPtr.p->m_new_map_ptr_i = RNIL;
    Ptr<Hash2FragmentMap> mapPtr;
    g_hash_map.getPtr(mapPtr, tabPtr.p->m_map_ptr_i);
    ndbrequire(tabPtr.p->totalfragments >= mapPtr.p->m_fragments);
  }

  // Populate each fragment from the section data.
  Uint32 index = 2;
  for (Uint32 fragId = 0; fragId < noFragments; fragId++) {
    jam();
    FragmentstorePtr fragPtr;
    Uint32 activeIndex = 0;
    getFragstore(tabPtr.p, fragId, fragPtr);
    fragPtr.p->m_log_part_id = fragments[index++];
    // NOTE: index is not advanced here; the preferred primary is also
    // the first replica read in the loop below.
    fragPtr.p->preferredPrimary = fragments[index];

    ndbrequire(fragPtr.p->m_log_part_id < NDBMT_MAX_WORKER_INSTANCES);

    inc_ng_refcount(getNodeGroup(fragPtr.p->preferredPrimary));

    for (Uint32 i = 0; i<noReplicas; i++) {
      const Uint32 nodeId = fragments[index++];
      ReplicaRecordPtr replicaPtr;
      allocStoredReplica(fragPtr,
                         replicaPtr,
                         nodeId,
                         fragId,
                         tabPtr.i);
      if (getNodeStatus(nodeId) == NodeRecord::ALIVE) {
        jam();
        ndbrequire(activeIndex < MAX_REPLICAS);
        fragPtr.p->activeNodes[activeIndex] = nodeId;
        activeIndex++;
      } else {
        jam();
        // Replica on a dead node: park it on the old-stored list.
        removeStoredReplica(fragPtr, replicaPtr);
        linkOldStoredReplica(fragPtr, replicaPtr);
      }//if
    }//for
    fragPtr.p->fragReplicas = activeIndex;
    ndbrequire(activeIndex > 0 && fragPtr.p->storedReplicas != RNIL);
  }
  // Continue asynchronously: pack the table description into pages.
  initTableFile(tabPtr);
  tabPtr.p->tabCopyStatus = TabRecord::CS_ADD_TABLE_MASTER;
  signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
  signal->theData[1] = tabPtr.i;
  sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
}
12430 
12431 void
addTable_closeConf(Signal * signal,Uint32 tabPtrI)12432 Dbdih::addTable_closeConf(Signal * signal, Uint32 tabPtrI){
12433   TabRecordPtr tabPtr;
12434   tabPtr.i = tabPtrI;
12435   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
12436 
12437   ConnectRecordPtr connectPtr;
12438   connectPtr.i = tabPtr.p->connectrec;
12439   ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
12440   connectPtr.p->m_alter.m_totalfragments = tabPtr.p->totalfragments;
12441 
12442   sendAddFragreq(signal, connectPtr, tabPtr, 0);
12443 }
12444 
/**
 * Scan fragments from fragId upwards for the first one that has a
 * replica (stored or old stored) on this node, and send ADD_FRAGREQ
 * for it to DBDICT.  When no further local replica exists the protocol
 * is complete: either finish the ALTER TABLE path (publish new hash
 * map / save table file / send ALTER_TAB_CONF) or send DIADDTABCONF
 * and release the connect record.
 */
void
Dbdih::sendAddFragreq(Signal* signal, ConnectRecordPtr connectPtr,
                      TabRecordPtr tabPtr, Uint32 fragId){
  jam();
  const Uint32 fragCount = connectPtr.p->m_alter.m_totalfragments;
  ReplicaRecordPtr replicaPtr;
  replicaPtr.i = RNIL;
  FragmentstorePtr fragPtr;
  for(; fragId<fragCount; fragId++){
    jam();
    getFragstore(tabPtr.p, fragId, fragPtr);

    // Look for a replica of this fragment on our own node.
    replicaPtr.i = fragPtr.p->storedReplicas;
    while(replicaPtr.i != RNIL){
      jam();
      c_replicaRecordPool.getPtr(replicaPtr);
      if(replicaPtr.p->procNode == getOwnNodeId()){
        break;
      }
      replicaPtr.i = replicaPtr.p->nextPool;
    }

    if(replicaPtr.i != RNIL){
      jam();
      break;
    }

    // Also check the old (currently inactive) stored replicas.
    replicaPtr.i = fragPtr.p->oldStoredReplicas;
    while(replicaPtr.i != RNIL){
      jam();
      c_replicaRecordPool.getPtr(replicaPtr);
      if(replicaPtr.p->procNode == getOwnNodeId()){
        break;
      }
      replicaPtr.i = replicaPtr.p->nextPool;
    }

    if(replicaPtr.i != RNIL){
      jam();
      break;
    }
  }

  if(replicaPtr.i != RNIL){
    jam();
    // Found a local replica: ask DBDICT to add this fragment.
    ndbrequire(fragId < fragCount);
    ndbrequire(replicaPtr.p->procNode == getOwnNodeId());

    Uint32 requestInfo = 0;
    if(tabPtr.p->tabStorage != TabRecord::ST_NORMAL){
      requestInfo |= LqhFragReq::TemporaryTable;
    }

    if(getNodeState().getNodeRestartInProgress()){
      requestInfo |= LqhFragReq::CreateInRunning;
    }

    AddFragReq* const req = (AddFragReq*)signal->getDataPtr();
    req->dihPtr = connectPtr.i;
    req->senderData = connectPtr.p->userpointer;
    req->fragmentId = fragId;
    req->requestInfo = requestInfo;
    req->tableId = tabPtr.i;
    req->nextLCP = 0;
    req->nodeId = getOwnNodeId();
    req->totalFragments = fragCount;
    req->startGci = SYSFILE->newestRestorableGCI;
    req->logPartId = fragPtr.p->m_log_part_id;
    req->changeMask = 0;

    if (connectPtr.p->connectState == ConnectRecord::ALTER_TABLE)
    {
      jam();
      req->changeMask = connectPtr.p->m_alter.m_changeMask;
    }

    sendSignal(DBDICT_REF, GSN_ADD_FRAGREQ, signal,
               AddFragReq::SignalLength, JBB);
    return;
  }

  // No more fragments with a local replica: the request is complete.
  if (connectPtr.p->connectState == ConnectRecord::ALTER_TABLE)
  {
    jam();
    // Request handled successfully

    if (AlterTableReq::getReorgFragFlag(connectPtr.p->m_alter.m_changeMask))
    {
      jam();
      // Publish the new hash map under the table write lock.
      DIH_TAB_WRITE_LOCK(tabPtr.p);
      tabPtr.p->m_new_map_ptr_i = connectPtr.p->m_alter.m_new_map_ptr_i;
      DIH_TAB_WRITE_UNLOCK(tabPtr.p);
    }

    if (AlterTableReq::getAddFragFlag(connectPtr.p->m_alter.m_changeMask))
    {
      jam();
      // Persist the new table layout before confirming the alter;
      // the callback continues with alter_table_writeTable_conf.
      Callback cb;
      cb.m_callbackData = connectPtr.i;
      cb.m_callbackFunction = safe_cast(&Dbdih::alter_table_writeTable_conf);
      saveTableFile(signal, connectPtr, tabPtr, TabRecord::CS_ALTER_TABLE, cb);
      return;
    }

    send_alter_tab_conf(signal, connectPtr);
  }
  else
  {
    // Done: confirm the add-table request to the user block.
    DiAddTabConf * const conf = (DiAddTabConf*)signal->getDataPtr();
    conf->senderData = connectPtr.p->userpointer;
    sendSignal(connectPtr.p->userblockref, GSN_DIADDTABCONF, signal,
               DiAddTabConf::SignalLength, JBB);


    if (tabPtr.p->method == TabRecord::HASH_MAP)
    {
      Uint32 newValue = RNIL;
      if (DictTabInfo::isOrderedIndex(tabPtr.p->tableType))
      {
        jam();
        // Ordered index uses the hash map of its primary table.
        TabRecordPtr primTabPtr;
        primTabPtr.i = tabPtr.p->primaryTableId;
        ptrCheckGuard(primTabPtr, ctabFileSize, tabRecord);
        newValue = primTabPtr.p->m_map_ptr_i;
      }
      else
      {
        jam();
        newValue = connectPtr.p->m_create.m_map_ptr_i;
      }

      tabPtr.p->m_map_ptr_i = newValue;
    }
    // Release the connect record after detaching it from the table.
    ndbrequire(tabPtr.p->connectrec == connectPtr.i);
    tabPtr.p->connectrec = RNIL;
    release_connect(connectPtr);
  }

}
12586 void
release_connect(ConnectRecordPtr ptr)12587 Dbdih::release_connect(ConnectRecordPtr ptr)
12588 {
12589   TabRecordPtr tabPtr;
12590   tabPtr.i = ptr.p->table;
12591   if (tabPtr.i != RNIL)
12592   {
12593     jam();
12594     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
12595     if (tabPtr.p->connectrec == ptr.i)
12596     {
12597       ndbassert(false); // should be fixed elsewhere
12598       tabPtr.p->connectrec = RNIL;
12599     }
12600   }
12601 
12602   ptr.p->table = RNIL;
12603   ptr.p->userblockref = ZNIL;
12604   ptr.p->userpointer = RNIL;
12605   ptr.p->connectState = ConnectRecord::FREE;
12606   ptr.p->nextPool = cfirstconnect;
12607   cfirstconnect = ptr.i;
12608 }
12609 
12610 void
execADD_FRAGCONF(Signal * signal)12611 Dbdih::execADD_FRAGCONF(Signal* signal){
12612   jamEntry();
12613   AddFragConf * const conf = (AddFragConf*)signal->getDataPtr();
12614 
12615   ConnectRecordPtr connectPtr;
12616   connectPtr.i = conf->dihPtr;
12617   ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
12618 
12619   TabRecordPtr tabPtr;
12620   tabPtr.i = connectPtr.p->table;
12621   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
12622 
12623   sendAddFragreq(signal, connectPtr, tabPtr, conf->fragId + 1);
12624 }
12625 
/**
 * ADD_FRAGREF: a previously sent ADD_FRAGREQ was rejected.
 * For an ALTER TABLE operation the added fragments are dropped and the
 * abort path continues; otherwise DIADDTABREF is returned to the user
 * block and the connect record is released.
 */
void
Dbdih::execADD_FRAGREF(Signal* signal){
  jamEntry();
  AddFragRef * const ref = (AddFragRef*)signal->getDataPtr();

  ConnectRecordPtr connectPtr;
  connectPtr.i = ref->dihPtr;
  ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);

  Ptr<TabRecord> tabPtr;
  tabPtr.i = connectPtr.p->table;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  ndbrequire(tabPtr.p->connectrec == connectPtr.i);

  if (connectPtr.p->connectState == ConnectRecord::ALTER_TABLE)
  {
    jam();

    if (AlterTableReq::getReorgFragFlag(connectPtr.p->m_alter.m_changeMask))
    {
      jam();
      // Undo publication of the new hash map under the write lock.
      DIH_TAB_WRITE_LOCK(tabPtr.p);
      tabPtr.p->m_new_map_ptr_i = RNIL;
      DIH_TAB_WRITE_UNLOCK(tabPtr.p);
    }

    // Abort: drop the fragments added so far, then continue there.
    connectPtr.p->connectState = ConnectRecord::ALTER_TABLE_ABORT;
    drop_fragments(signal, connectPtr, connectPtr.p->m_alter.m_totalfragments);
    return;
  }
  else
  {
    // Plain add-table: report failure to the requester.
    DiAddTabRef * const ref = (DiAddTabRef*)signal->getDataPtr();
    ref->senderData = connectPtr.p->userpointer;
    ref->errorCode = ~0;
    sendSignal(connectPtr.p->userblockref, GSN_DIADDTABREF, signal,
               DiAddTabRef::SignalLength, JBB);

    // Release the connect record after detaching it from the table.
    tabPtr.p->connectrec = RNIL;
    release_connect(connectPtr);
  }
}
12669 
12670 /*
12671   3.7.1.3   R E F U S E
12672   *********************
12673   */
12674 void
addtabrefuseLab(Signal * signal,ConnectRecordPtr connectPtr,Uint32 errorCode)12675 Dbdih::addtabrefuseLab(Signal* signal,
12676                        ConnectRecordPtr connectPtr, Uint32 errorCode)
12677 {
12678   signal->theData[0] = connectPtr.p->userpointer;
12679   signal->theData[1] = errorCode;
12680   sendSignal(connectPtr.p->userblockref, GSN_DIADDTABREF, signal, 2, JBB);
12681 
12682   Ptr<TabRecord> tabPtr;
12683   tabPtr.i = connectPtr.p->table;
12684   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
12685   ndbrequire(tabPtr.p->connectrec == connectPtr.i);
12686   tabPtr.p->connectrec = RNIL;
12687 
12688   release_connect(connectPtr);
12689   return;
12690 }//Dbdih::addtabrefuseLab()
12691 
12692 /*
12693   3.7.2   A D D   T A B L E   D U P L I C A T I O N
12694   *************************************************
12695   */
/*
  3.7.2.1    A D D   T A B L E   D U P L I C A T I O N   R E Q U E S T
  *******************************************************************
  */
12700 
/*
  D E L E T E   T A B L E
  ***********************
  */
12705 /*****************************************************************************/
12706 /***********              DELETE TABLE  MODULE                   *************/
12707 /*****************************************************************************/
12708 void
execDROP_TAB_REQ(Signal * signal)12709 Dbdih::execDROP_TAB_REQ(Signal* signal)
12710 {
12711   jamEntry();
12712   DropTabReq* req = (DropTabReq*)signal->getDataPtr();
12713 
12714   TabRecordPtr tabPtr;
12715   tabPtr.i = req->tableId;
12716   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
12717 
12718   tabPtr.p->m_dropTab.tabUserRef = req->senderRef;
12719   tabPtr.p->m_dropTab.tabUserPtr = req->senderData;
12720 
12721   DropTabReq::RequestType rt = (DropTabReq::RequestType)req->requestType;
12722 
12723   switch(rt){
12724   case DropTabReq::OnlineDropTab:
12725     jam();
12726     ndbrequire(tabPtr.p->tabStatus == TabRecord::TS_DROPPING);
12727     break;
12728   case DropTabReq::CreateTabDrop:
12729     jam();
12730     break;
12731   case DropTabReq::RestartDropTab:
12732     break;
12733   }
12734 
12735   if (isMaster())
12736   {
12737     /**
12738      * Remove from queue
12739      */
12740     NodeRecordPtr nodePtr;
12741     for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++)
12742     {
12743       jam();
12744       ptrAss(nodePtr, nodeRecord);
12745       if (c_lcpState.m_participatingLQH.get(nodePtr.i))
12746       {
12747         Uint32 index = 0;
12748 	Uint32 count = nodePtr.p->noOfQueuedChkpt;
12749 	while (index < count)
12750         {
12751 	  if (nodePtr.p->queuedChkpt[index].tableId == tabPtr.i)
12752           {
12753 	    jam();
12754 	    count--;
12755 	    for (Uint32 i = index; i<count; i++)
12756             {
12757 	      jam();
12758 	      nodePtr.p->queuedChkpt[i] = nodePtr.p->queuedChkpt[i + 1];
12759 	    }
12760 	  }
12761           else
12762           {
12763 	    index++;
12764 	  }
12765 	}
12766 	nodePtr.p->noOfQueuedChkpt = count;
12767       }
12768     }
12769   }
12770 
12771   {
12772     /**
12773      * Check table lcp state
12774      */
12775     bool ok = false;
12776     switch(tabPtr.p->tabLcpStatus){
12777     case TabRecord::TLS_COMPLETED:
12778     case TabRecord::TLS_WRITING_TO_FILE:
12779       ok = true;
12780       jam();
12781       break;
12782       return;
12783     case TabRecord::TLS_ACTIVE:
12784       ok = true;
12785       jam();
12786 
12787       tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED;
12788 
12789       /**
12790        * First check if all fragments are done
12791        */
12792       if (checkLcpAllTablesDoneInLqh(__LINE__))
12793       {
12794 	jam();
12795 
12796         g_eventLogger->info("This is the last table");
12797 
12798 	/**
12799 	 * Then check if saving of tab info is done for all tables
12800 	 */
12801 	LcpStatus a = c_lcpState.lcpStatus;
12802 	checkLcpCompletedLab(signal);
12803 
12804         if(a != c_lcpState.lcpStatus)
12805         {
12806           g_eventLogger->info("And all tables are written to already written disk");
12807         }
12808       }
12809       break;
12810     }
12811     ndbrequire(ok);
12812   }
12813 
12814   waitDropTabWritingToFile(signal, tabPtr);
12815 }
12816 
startDeleteFile(Signal * signal,TabRecordPtr tabPtr)12817 void Dbdih::startDeleteFile(Signal* signal, TabRecordPtr tabPtr)
12818 {
12819   if (tabPtr.p->tabFile[0] == RNIL) {
12820     jam();
12821     initTableFile(tabPtr);
12822   }//if
12823   openTableFileForDelete(signal, tabPtr.p->tabFile[0]);
12824 }//Dbdih::startDeleteFile()
12825 
openTableFileForDelete(Signal * signal,Uint32 fileIndex)12826 void Dbdih::openTableFileForDelete(Signal* signal, Uint32 fileIndex)
12827 {
12828   FileRecordPtr filePtr;
12829   filePtr.i = fileIndex;
12830   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
12831   openFileRw(signal, filePtr);
12832   filePtr.p->reqStatus = FileRecord::TABLE_OPEN_FOR_DELETE;
12833 }//Dbdih::openTableFileForDelete()
12834 
tableOpenLab(Signal * signal,FileRecordPtr filePtr)12835 void Dbdih::tableOpenLab(Signal* signal, FileRecordPtr filePtr)
12836 {
12837   closeFileDelete(signal, filePtr);
12838   filePtr.p->reqStatus = FileRecord::TABLE_CLOSE_DELETE;
12839   return;
12840 }//Dbdih::tableOpenLab()
12841 
/**
 * Continuation after one table file has been removed.  After file 0,
 * delete file 1; after file 1, release both file records, mark the
 * table idle, confirm the drop to the requester, and release the
 * table's fragment/replica resources.
 */
void Dbdih::tableDeleteLab(Signal* signal, FileRecordPtr filePtr)
{
  TabRecordPtr tabPtr;
  tabPtr.i = filePtr.p->tabRef;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  if (filePtr.i == tabPtr.p->tabFile[0]) {
    jam();
    // First copy done; now delete the second copy of the table file.
    openTableFileForDelete(signal, tabPtr.p->tabFile[1]);
    return;
  }//if
  ndbrequire(filePtr.i == tabPtr.p->tabFile[1]);

  releaseFile(tabPtr.p->tabFile[0]);
  releaseFile(tabPtr.p->tabFile[1]);
  tabPtr.p->tabFile[0] = tabPtr.p->tabFile[1] = RNIL;

  tabPtr.p->tabStatus = TabRecord::TS_IDLE;

  // Confirm the drop to the block that requested it.
  DropTabConf * const dropConf = (DropTabConf *)signal->getDataPtrSend();
  dropConf->senderRef = reference();
  dropConf->senderData = tabPtr.p->m_dropTab.tabUserPtr;
  dropConf->tableId = tabPtr.i;
  sendSignal(tabPtr.p->m_dropTab.tabUserRef, GSN_DROP_TAB_CONF,
	     signal, DropTabConf::SignalLength, JBB);

  tabPtr.p->m_dropTab.tabUserPtr = RNIL;
  tabPtr.p->m_dropTab.tabUserRef = 0;
  releaseTable(tabPtr);
}//Dbdih::tableDeleteLab()
12871 
12872 
releaseTable(TabRecordPtr tabPtr)12873 void Dbdih::releaseTable(TabRecordPtr tabPtr)
12874 {
12875   FragmentstorePtr fragPtr;
12876   if (tabPtr.p->noOfFragChunks > 0) {
12877     for (Uint32 fragId = 0; fragId < tabPtr.p->totalfragments; fragId++) {
12878       jam();
12879       getFragstore(tabPtr.p, fragId, fragPtr);
12880       dec_ng_refcount(getNodeGroup(fragPtr.p->preferredPrimary));
12881       releaseReplicas(& fragPtr.p->storedReplicas);
12882       releaseReplicas(& fragPtr.p->oldStoredReplicas);
12883     }//for
12884     releaseFragments(tabPtr);
12885   }
12886   if (tabPtr.p->tabFile[0] != RNIL) {
12887     jam();
12888     releaseFile(tabPtr.p->tabFile[0]);
12889     releaseFile(tabPtr.p->tabFile[1]);
12890     tabPtr.p->tabFile[0] = tabPtr.p->tabFile[1] = RNIL;
12891   }//if
12892 }//Dbdih::releaseTable()
12893 
releaseReplicas(Uint32 * replicaPtrI)12894 void Dbdih::releaseReplicas(Uint32 * replicaPtrI)
12895 {
12896   ReplicaRecordPtr replicaPtr;
12897   replicaPtr.i = * replicaPtrI;
12898   jam();
12899   while (replicaPtr.i != RNIL)
12900   {
12901     jam();
12902     c_replicaRecordPool.getPtr(replicaPtr);
12903     Uint32 tmp = replicaPtr.p->nextPool;
12904     c_replicaRecordPool.release(replicaPtr);
12905     replicaPtr.i = tmp;
12906     cnoFreeReplicaRec++;
12907   }//while
12908 
12909   * replicaPtrI = RNIL;
12910 }//Dbdih::releaseReplicas()
12911 
seizeReplicaRec(ReplicaRecordPtr & replicaPtr)12912 void Dbdih::seizeReplicaRec(ReplicaRecordPtr& replicaPtr)
12913 {
12914   c_replicaRecordPool.seize(replicaPtr);
12915   cnoFreeReplicaRec--;
12916   replicaPtr.p->nextPool = RNIL;
12917 }//Dbdih::seizeReplicaRec()
12918 
releaseFile(Uint32 fileIndex)12919 void Dbdih::releaseFile(Uint32 fileIndex)
12920 {
12921   FileRecordPtr filePtr;
12922   filePtr.i = fileIndex;
12923   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
12924   filePtr.p->nextFile = cfirstfreeFile;
12925   cfirstfreeFile = filePtr.i;
12926 }//Dbdih::releaseFile()
12927 
12928 
/**
 * ALTER_TAB_REQ: multi-phase ALTER TABLE protocol in DIH.
 *
 * Phases (requestType):
 *  - AlterTablePrepare:  seize a connect record and record the change
 *    mask / new hash map; may start adding fragments below.
 *  - AlterTableRevert:   restore the old schema version; drop any
 *    added fragments, or confirm directly.
 *  - AlterTableCommit:   install the new schema version, fragment
 *    count, and (for reorg) swap in the new hash map and bump the
 *    fragments' distribution keys.
 *  - AlterTableComplete: clear the new-map pointer and reorg scan flag.
 *  - AlterTableWaitScan: poll via CONTINUEB (ZWAIT_OLD_SCAN) until old
 *    scans have drained.
 */
void Dbdih::execALTER_TAB_REQ(Signal * signal)
{
  const AlterTabReq* req = (const AlterTabReq*)signal->getDataPtr();
  const Uint32 senderRef = req->senderRef;
  const Uint32 senderData = req->senderData;
  const Uint32 tableId = req->tableId;
  const Uint32 tableVersion = req->tableVersion;
  const Uint32 newTableVersion = req->newTableVersion;
  AlterTabReq::RequestType requestType =
    (AlterTabReq::RequestType) req->requestType;

  TabRecordPtr tabPtr;
  tabPtr.i = tableId;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

  // If a fragment-adding prepare/revert arrives while the table copy
  // machinery is busy, retry the whole signal after a short delay.
  switch(requestType){
  case AlterTabReq::AlterTablePrepare:
    jam();
    // fall through
  case AlterTabReq::AlterTableRevert:
    jam();
    if (AlterTableReq::getAddFragFlag(req->changeMask) &&
        tabPtr.p->tabCopyStatus != TabRecord::CS_IDLE)
    {
      jam();
      SectionHandle handle(this, signal);
      sendSignalWithDelay(reference(), GSN_ALTER_TAB_REQ, signal, 10,
                          signal->getLength(), &handle);
      return;
    }
  case AlterTabReq::AlterTableCommit:
    jam();
  case AlterTabReq::AlterTableComplete:
    jam();
  case AlterTabReq::AlterTableWaitScan:
    jam();
    break;
  default:
    jamLine(requestType);
  }

  ConnectRecordPtr connectPtr;
  connectPtr.i = RNIL;
  switch (requestType) {
  case AlterTabReq::AlterTablePrepare:
    jam();

    // Seize a connect record and snapshot the alter parameters.
    ndbrequire(cfirstconnect != RNIL);
    connectPtr.i = cfirstconnect;
    ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
    cfirstconnect = connectPtr.p->nextPool;

    connectPtr.p->m_alter.m_totalfragments = tabPtr.p->totalfragments;
    connectPtr.p->m_alter.m_org_totalfragments = tabPtr.p->totalfragments;
    connectPtr.p->m_alter.m_changeMask = req->changeMask;
    connectPtr.p->m_alter.m_new_map_ptr_i = req->new_map_ptr_i;
    connectPtr.p->userpointer = senderData;
    connectPtr.p->userblockref = senderRef;
    connectPtr.p->connectState = ConnectRecord::ALTER_TABLE;
    connectPtr.p->table = tabPtr.i;
    tabPtr.p->connectrec = connectPtr.i;
    break;
  case AlterTabReq::AlterTableRevert:
    jam();
    // Restore the pre-alter schema version.
    tabPtr.p->schemaVersion = tableVersion;

    connectPtr.i = req->connectPtr;
    ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);

    ndbrequire(connectPtr.p->connectState == ConnectRecord::ALTER_TABLE);

    connectPtr.p->userpointer = senderData;
    connectPtr.p->userblockref = senderRef;

    if (AlterTableReq::getReorgFragFlag(connectPtr.p->m_alter.m_changeMask))
    {
      jam();
      // Withdraw the not-yet-committed hash map.
      DIH_TAB_WRITE_LOCK(tabPtr.p);
      tabPtr.p->m_new_map_ptr_i = RNIL;
      DIH_TAB_WRITE_UNLOCK(tabPtr.p);
    }

    if (AlterTableReq::getAddFragFlag(req->changeMask))
    {
      jam();
      // Drop the fragments added during prepare; the drop path
      // continues and eventually confirms.
      tabPtr.p->tabCopyStatus = TabRecord::CS_ALTER_TABLE;
      connectPtr.p->connectState = ConnectRecord::ALTER_TABLE_REVERT;
      drop_fragments(signal, connectPtr,
                     connectPtr.p->m_alter.m_totalfragments);
      return;
    }

    send_alter_tab_conf(signal, connectPtr);

    ndbrequire(tabPtr.p->connectrec == connectPtr.i);
    tabPtr.p->connectrec = RNIL;
    release_connect(connectPtr);
    return;
    break;
  case AlterTabReq::AlterTableCommit:
    jam();
    // Install the new schema version and fragment count.
    tabPtr.p->schemaVersion = newTableVersion;

    connectPtr.i = req->connectPtr;
    ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
    connectPtr.p->userpointer = senderData;
    connectPtr.p->userblockref = senderRef;
    ndbrequire(connectPtr.p->connectState == ConnectRecord::ALTER_TABLE);

    tabPtr.p->totalfragments = connectPtr.p->m_alter.m_totalfragments;
    if (AlterTableReq::getReorgFragFlag(connectPtr.p->m_alter.m_changeMask))
    {
      jam();
      // Swap in the new hash map and bump every fragment's
      // distribution key, all under the table write lock.
      DIH_TAB_WRITE_LOCK(tabPtr.p);
      Uint32 save = tabPtr.p->m_map_ptr_i;
      tabPtr.p->m_map_ptr_i = tabPtr.p->m_new_map_ptr_i;
      tabPtr.p->m_new_map_ptr_i = save;

      for (Uint32 i = 0; i<tabPtr.p->totalfragments; i++)
      {
        jam();
        FragmentstorePtr fragPtr;
        getFragstore(tabPtr.p, i, fragPtr);
        fragPtr.p->distributionKey = (fragPtr.p->distributionKey + 1) & 0xFF;
      }
      DIH_TAB_WRITE_UNLOCK(tabPtr.p);

      // Move the scan count to the "old" slot and flag the reorg so
      // ongoing scans can be tracked until they drain.
      ndbassert(tabPtr.p->m_scan_count[1] == 0);
      tabPtr.p->m_scan_count[1] = tabPtr.p->m_scan_count[0];
      tabPtr.p->m_scan_count[0] = 0;
      tabPtr.p->m_scan_reorg_flag = 1;

      send_alter_tab_conf(signal, connectPtr);
      return;
    }

    send_alter_tab_conf(signal, connectPtr);
    ndbrequire(tabPtr.p->connectrec == connectPtr.i);
    tabPtr.p->connectrec = RNIL;
    release_connect(connectPtr);
    return;
  case AlterTabReq::AlterTableComplete:
    jam();
    connectPtr.i = req->connectPtr;
    ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
    connectPtr.p->userpointer = senderData;
    connectPtr.p->userblockref = senderRef;

    send_alter_tab_conf(signal, connectPtr);

    // Reorg fully done: clear the new-map pointer and scan flag.
    DIH_TAB_WRITE_LOCK(tabPtr.p);
    tabPtr.p->m_new_map_ptr_i = RNIL;
    tabPtr.p->m_scan_reorg_flag = 0;
    DIH_TAB_WRITE_UNLOCK(tabPtr.p);

    ndbrequire(tabPtr.p->connectrec == connectPtr.i);
    tabPtr.p->connectrec = RNIL;
    release_connect(connectPtr);
    return;
  case AlterTabReq::AlterTableWaitScan:{
    jam();
    // Poll (via CONTINUEB) until scans started before the reorg have
    // completed; the deadline is carried in the signal.
    const NDB_TICKS now = NdbTick_getCurrentTicks();
    signal->theData[0] = DihContinueB::ZWAIT_OLD_SCAN;
    signal->theData[1] = tabPtr.i;
    signal->theData[2] = senderRef;
    signal->theData[3] = senderData;
    signal->theData[4] = connectPtr.i;
    signal->theData[5] = Uint32(now.getUint64() >> 32);
    signal->theData[6] = Uint32(now.getUint64());
    signal->theData[7] = 3; // Seconds to wait
    sendSignal(reference(), GSN_CONTINUEB, signal, 8, JBB);
    return;
  }
  default:
    ndbrequire(false);
    break;
  }

  // Prepare with new fragments: read the fragmentation data from the
  // signal section and extend the table, then start adding fragments.
  if (AlterTableReq::getAddFragFlag(req->changeMask))
  {
    jam();
    SegmentedSectionPtr ptr;
    SectionHandle handle(this, signal);
    handle.getSection(ptr, 0);
    union {
      Uint16 buf[2+2*MAX_NDB_PARTITIONS];
      Uint32 _align[1];
    };
    copy(_align, ptr);
    releaseSections(handle);
    Uint32 err;
    Uint32 save = tabPtr.p->totalfragments;
    if ((err = add_fragments_to_table(tabPtr, buf)))
    {
      jam();
      // Adding failed: the table must be unchanged; report and clean up.
      ndbrequire(tabPtr.p->totalfragments == save);
      ndbrequire(connectPtr.p->m_alter.m_org_totalfragments == save);
      send_alter_tab_ref(signal, tabPtr, connectPtr, err);

      ndbrequire(tabPtr.p->connectrec == connectPtr.i);
      tabPtr.p->connectrec = RNIL;
      release_connect(connectPtr);
      return;
    }

    tabPtr.p->tabCopyStatus = TabRecord::CS_ALTER_TABLE;
    connectPtr.p->m_alter.m_totalfragments = tabPtr.p->totalfragments;
    tabPtr.p->totalfragments = save; // Dont make the available yet...
    sendAddFragreq(signal, connectPtr, tabPtr,
                   connectPtr.p->m_alter.m_org_totalfragments);
    return;
  }

  send_alter_tab_conf(signal, connectPtr);
}
13144 
13145 Uint32
add_fragments_to_table(Ptr<TabRecord> tabPtr,const Uint16 buf[])13146 Dbdih::add_fragments_to_table(Ptr<TabRecord> tabPtr, const Uint16 buf[])
13147 {
13148   Uint32 replicas = buf[0];
13149   Uint32 cnt = buf[1];
13150 
13151   Uint32 i = 0;
13152   Uint32 err = 0;
13153   Uint32 current = tabPtr.p->totalfragments;
13154   for (i = 0; i<cnt; i++)
13155   {
13156     FragmentstorePtr fragPtr;
13157     if (ERROR_INSERTED(7212) && cnt)
13158     {
13159       err = 1;
13160       CLEAR_ERROR_INSERT_VALUE;
13161       goto error;
13162     }
13163 
13164     if ((err = add_fragment_to_table(tabPtr, current + i, fragPtr)))
13165       goto error;
13166 
13167     fragPtr.p->m_log_part_id = buf[2+(1 + replicas)*i];
13168     ndbrequire(fragPtr.p->m_log_part_id < NDBMT_MAX_WORKER_INSTANCES);
13169     fragPtr.p->preferredPrimary = buf[2+(1 + replicas)*i + 1];
13170 
13171     inc_ng_refcount(getNodeGroup(fragPtr.p->preferredPrimary));
13172 
13173     Uint32 activeIndex = 0;
13174     for (Uint32 j = 0; j<replicas; j++)
13175     {
13176       const Uint32 nodeId = buf[2+(1 + replicas)*i + 1 + j];
13177       ReplicaRecordPtr replicaPtr;
13178       allocStoredReplica(fragPtr,
13179                          replicaPtr,
13180                          nodeId,
13181                          current + i,
13182                          tabPtr.i);
13183       if (getNodeStatus(nodeId) == NodeRecord::ALIVE) {
13184         jam();
13185         ndbrequire(activeIndex < MAX_REPLICAS);
13186         fragPtr.p->activeNodes[activeIndex] = nodeId;
13187         activeIndex++;
13188       } else {
13189         jam();
13190         removeStoredReplica(fragPtr, replicaPtr);
13191         linkOldStoredReplica(fragPtr, replicaPtr);
13192       }
13193     }
13194     fragPtr.p->fragReplicas = activeIndex;
13195   }
13196 
13197   return 0;
13198 error:
13199   for(i = i + current; i != current; i--)
13200   {
13201     release_fragment_from_table(tabPtr, i);
13202   }
13203 
13204   return err;
13205 }
13206 
13207 void
wait_old_scan(Signal * signal)13208 Dbdih::wait_old_scan(Signal* signal)
13209 {
13210   jam();
13211 
13212   TabRecordPtr tabPtr;
13213   tabPtr.i = signal->theData[1];
13214   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
13215 
13216   if (tabPtr.p->m_scan_count[1] == 0)
13217   {
13218     jam();
13219     Uint32 senderRef = signal->theData[2];
13220     Uint32 senderData = signal->theData[3];
13221     Uint32 connectPtrI = signal->theData[4];
13222 
13223     AlterTabConf* conf = (AlterTabConf*)signal->getDataPtrSend();
13224     conf->senderRef = reference();
13225     conf->senderData = senderData;
13226     conf->connectPtr = connectPtrI;
13227     sendSignal(senderRef, GSN_ALTER_TAB_CONF, signal,
13228                AlterTabConf::SignalLength, JBB);
13229     return;
13230   }
13231 
13232   const Uint32 start_hi = signal->theData[5];
13233   const Uint32 start_lo = signal->theData[6];
13234   const Uint32 wait = signal->theData[7];
13235 
13236   const NDB_TICKS start((Uint64(start_hi) << 32) | start_lo);
13237   const NDB_TICKS now  = NdbTick_getCurrentTicks();
13238   const Uint32 elapsed = (Uint32)NdbTick_Elapsed(start,now).seconds();
13239 
13240   if (elapsed > wait)
13241   {
13242     infoEvent("Waiting(%u) for scans(%u) to complete on table %u",
13243               elapsed,
13244               tabPtr.p->m_scan_count[1],
13245               tabPtr.i);
13246 
13247     if (wait == 3)
13248     {
13249       signal->theData[7] = 3 + 7;
13250     }
13251     else
13252     {
13253       signal->theData[7] = 2 * wait;
13254     }
13255   }
13256 
13257   sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 1000, 7);
13258 }
13259 
13260 Uint32
add_fragment_to_table(Ptr<TabRecord> tabPtr,Uint32 fragId,Ptr<Fragmentstore> & fragPtr)13261 Dbdih::add_fragment_to_table(Ptr<TabRecord> tabPtr,
13262                              Uint32 fragId,
13263                              Ptr<Fragmentstore>& fragPtr)
13264 {
13265   Uint32 fragments = tabPtr.p->totalfragments;
13266   Uint32 chunks = tabPtr.p->noOfFragChunks;
13267 
13268   ndbrequire(fragId == fragments); // Only add at the end
13269 
13270   if (ERROR_INSERTED(7211))
13271   {
13272     CLEAR_ERROR_INSERT_VALUE;
13273     return 1;
13274   }
13275 
13276   Uint32 allocated = chunks << LOG_NO_OF_FRAGS_PER_CHUNK;
13277   if (fragId < allocated)
13278   {
13279     jam();
13280     tabPtr.p->totalfragments++;
13281     getFragstore(tabPtr.p, fragId, fragPtr);
13282     return 0;
13283   }
13284 
13285   /**
13286    * Allocate a new chunk
13287    */
13288   fragPtr.i = cfirstfragstore;
13289   if (fragPtr.i == RNIL)
13290   {
13291     jam();
13292     return -1;
13293   }
13294 
13295   ptrCheckGuard(fragPtr, cfragstoreFileSize, fragmentstore);
13296   cfirstfragstore = fragPtr.p->nextFragmentChunk;
13297   ndbrequire(cremainingfrags >= NO_OF_FRAGS_PER_CHUNK);
13298   cremainingfrags -= NO_OF_FRAGS_PER_CHUNK;
13299 
13300   ndbrequire(chunks < NDB_ARRAY_SIZE(tabPtr.p->startFid));
13301   tabPtr.p->startFid[chunks] = fragPtr.i;
13302   for (Uint32 i = 0; i<NO_OF_FRAGS_PER_CHUNK; i++)
13303   {
13304     jam();
13305     Ptr<Fragmentstore> tmp;
13306     tmp.i = fragPtr.i + i;
13307     ptrCheckGuard(tmp, cfragstoreFileSize, fragmentstore);
13308     initFragstore(tmp);
13309   }
13310 
13311   tabPtr.p->totalfragments++;
13312   tabPtr.p->noOfFragChunks++;
13313 
13314   return 0;
13315 }
13316 
13317 void
release_fragment_from_table(Ptr<TabRecord> tabPtr,Uint32 fragId)13318 Dbdih::release_fragment_from_table(Ptr<TabRecord> tabPtr, Uint32 fragId)
13319 {
13320   FragmentstorePtr fragPtr;
13321   Uint32 fragments = tabPtr.p->totalfragments;
13322   Uint32 chunks = tabPtr.p->noOfFragChunks;
13323 
13324   if (fragId >= fragments)
13325   {
13326     jam();
13327     return;
13328   }
13329   ndbrequire(fragId == fragments - 1); // only remove at end
13330   ndbrequire(fragments != 0);
13331 
13332   getFragstore(tabPtr.p, fragId, fragPtr);
13333   dec_ng_refcount(getNodeGroup(fragPtr.p->preferredPrimary));
13334 
13335   releaseReplicas(& fragPtr.p->storedReplicas);
13336   releaseReplicas(& fragPtr.p->oldStoredReplicas);
13337 
13338   if (fragId == ((chunks - 1) << LOG_NO_OF_FRAGS_PER_CHUNK))
13339   {
13340     jam();
13341 
13342     getFragstore(tabPtr.p, fragId, fragPtr);
13343 
13344     fragPtr.p->nextFragmentChunk = cfirstfragstore;
13345     cfirstfragstore = fragPtr.i;
13346     cremainingfrags += NO_OF_FRAGS_PER_CHUNK;
13347     tabPtr.p->noOfFragChunks = chunks - 1;
13348   }
13349 
13350   tabPtr.p->totalfragments--;
13351 }
13352 
13353 void
send_alter_tab_ref(Signal * signal,Ptr<TabRecord> tabPtr,Ptr<ConnectRecord> connectPtr,Uint32 errCode)13354 Dbdih::send_alter_tab_ref(Signal* signal,
13355                           Ptr<TabRecord> tabPtr,
13356                           Ptr<ConnectRecord> connectPtr,
13357                           Uint32 errCode)
13358 {
13359   AlterTabRef* ref = (AlterTabRef*)signal->getDataPtrSend();
13360   ref->senderRef = reference();
13361   ref->senderData = connectPtr.p->userpointer;
13362   ref->errorCode = errCode;
13363   sendSignal(connectPtr.p->userblockref, GSN_ALTER_TAB_REF, signal,
13364              AlterTabRef::SignalLength, JBB);
13365 }
13366 
13367 void
send_alter_tab_conf(Signal * signal,Ptr<ConnectRecord> connectPtr)13368 Dbdih::send_alter_tab_conf(Signal* signal, Ptr<ConnectRecord> connectPtr)
13369 {
13370   AlterTabConf* conf = (AlterTabConf*)signal->getDataPtrSend();
13371   conf->senderRef = reference();
13372   conf->senderData = connectPtr.p->userpointer;
13373   conf->connectPtr = connectPtr.i;
13374   sendSignal(connectPtr.p->userblockref, GSN_ALTER_TAB_CONF, signal,
13375              AlterTabConf::SignalLength, JBB);
13376 }
13377 
13378 void
saveTableFile(Signal * signal,Ptr<ConnectRecord> connectPtr,Ptr<TabRecord> tabPtr,TabRecord::CopyStatus expectedStatus,Callback & cb)13379 Dbdih::saveTableFile(Signal* signal,
13380                      Ptr<ConnectRecord> connectPtr,
13381                      Ptr<TabRecord> tabPtr,
13382                      TabRecord::CopyStatus expectedStatus,
13383                      Callback& cb)
13384 {
13385   ndbrequire(connectPtr.i == cb.m_callbackData);         // required
13386   ndbrequire(tabPtr.p->tabCopyStatus == expectedStatus); // locking
13387   memcpy(&connectPtr.p->m_callback, &cb, sizeof(Callback));
13388 
13389   tabPtr.p->tabCopyStatus = TabRecord::CS_COPY_TO_SAVE;
13390   tabPtr.p->tabUpdateState = TabRecord::US_CALLBACK;
13391   signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
13392   signal->theData[1] = tabPtr.i;
13393   sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
13394 }
13395 
13396 void
alter_table_writeTable_conf(Signal * signal,Uint32 ptrI,Uint32 err)13397 Dbdih::alter_table_writeTable_conf(Signal* signal, Uint32 ptrI, Uint32 err)
13398 {
13399   jamEntry();
13400   ndbrequire(err == 0);
13401 
13402   ConnectRecordPtr connectPtr;
13403   connectPtr.i = ptrI;
13404   ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
13405 
13406   switch(connectPtr.p->connectState){
13407   case ConnectRecord::ALTER_TABLE_REVERT:
13408   {
13409     jam();
13410     send_alter_tab_conf(signal, connectPtr);
13411 
13412     Ptr<TabRecord> tabPtr;
13413     tabPtr.i = connectPtr.p->table;
13414     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
13415     ndbrequire(tabPtr.p->connectrec == connectPtr.i);
13416     tabPtr.p->connectrec = RNIL;
13417     release_connect(connectPtr);
13418     return;
13419   }
13420   case ConnectRecord::ALTER_TABLE:
13421   {
13422     jam();
13423     send_alter_tab_conf(signal, connectPtr);
13424     return;
13425   }
13426   default:
13427     jamLine(connectPtr.p->connectState);
13428     ndbrequire(false);
13429   }
13430 }
13431 
void
Dbdih::drop_fragments(Signal* signal, Ptr<ConnectRecord> connectPtr,
                      Uint32 curr)
{
  // Undo the fragments added by an aborted/reverted ALTER TABLE: ask LQH
  // to drop them one at a time, highest fragment id first.  Re-entered
  // from execDROP_FRAG_CONF with the fragment id just dropped.
  ndbrequire(curr >= connectPtr.p->m_alter.m_org_totalfragments);
  if (curr == connectPtr.p->m_alter.m_org_totalfragments)
  {
    /**
     * done...
     */
    jam();
    Ptr<TabRecord> tabPtr;
    tabPtr.i = connectPtr.p->table;
    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

    // Temporarily expose the new fragment count so that
    // release_fragment_from_table (which only removes at the end) can
    // peel the added fragments off again, one by one.
    // NOTE(review): the loop condition 'i >= org_frags' with unsigned i
    // assumes org_frags > 0 (i would wrap otherwise) - appears to be
    // guaranteed by the ALTER TABLE flow; confirm.
    Uint32 new_frags = connectPtr.p->m_alter.m_totalfragments;
    Uint32 org_frags = connectPtr.p->m_alter.m_org_totalfragments;
    tabPtr.p->totalfragments = new_frags;
    for (Uint32 i = new_frags - 1; i >= org_frags; i--)
    {
      jam();
      release_fragment_from_table(tabPtr, i);
    }
    connectPtr.p->m_alter.m_totalfragments = org_frags;

    switch(connectPtr.p->connectState){
    case ConnectRecord::ALTER_TABLE_ABORT:
    {
      jam();
      // Abort: unlock the table copy status, report failure to the
      // requester, and fall back to the ordinary ALTER_TABLE state.
      ndbrequire(tabPtr.p->tabCopyStatus == TabRecord::CS_ALTER_TABLE);
      tabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
      send_alter_tab_ref(signal, tabPtr, connectPtr, ~0);

      connectPtr.p->connectState = ConnectRecord::ALTER_TABLE;
      return;
    }
    case ConnectRecord::ALTER_TABLE_REVERT:
    {
      jam();
      // Revert: persist the restored table description; the callback
      // (alter_table_writeTable_conf) completes the revert.
      Callback cb;
      cb.m_callbackData = connectPtr.i;
      cb.m_callbackFunction = safe_cast(&Dbdih::alter_table_writeTable_conf);
      saveTableFile(signal, connectPtr, tabPtr, TabRecord::CS_ALTER_TABLE, cb);
      return;
    }
    default:
      jamLine(connectPtr.p->connectState);
      ndbrequire(false);
    }
    return;
  }

  // More fragments to drop: request removal of fragment curr - 1 in LQH;
  // the CONF re-enters this function with that fragment id.
  ndbrequire(curr > 0);
  DropFragReq* req = (DropFragReq*)signal->getDataPtrSend();
  req->senderRef = reference();
  req->senderData = connectPtr.i;
  req->tableId = connectPtr.p->table;
  req->fragId = curr - 1;
  req->requestInfo = DropFragReq::AlterTableAbort;
  sendSignal(DBLQH_REF, GSN_DROP_FRAG_REQ, signal,
             DropFragReq::SignalLength, JBB);
}
13494 
void
Dbdih::execDROP_FRAG_REF(Signal* signal)
{
  // A DROP_FRAG_REQ issued during ALTER TABLE rollback must never fail;
  // treat a REF as a fatal programming error.
  ndbrequire(false);
}
13500 
13501 void
execDROP_FRAG_CONF(Signal * signal)13502 Dbdih::execDROP_FRAG_CONF(Signal* signal)
13503 {
13504   DropFragConf* conf = (DropFragConf*)signal->getDataPtr();
13505 
13506   ConnectRecordPtr connectPtr;
13507   connectPtr.i = conf->senderData;
13508   ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
13509 
13510   drop_fragments(signal, connectPtr, conf->fragId);
13511 }
13512 
13513 /*
13514   G E T   N O D E S
13515   **********************=
13516   */
13517 /*****************************************************************************/
13518 /* **********     TRANSACTION  HANDLING  MODULE                  *************/
13519 /*****************************************************************************/
13520 /*
13521   3.8.1    G E T   N O D E S   R E Q U E S T
13522   ******************************************
13523   Asks what nodes should be part of a transaction.
13524 */
void Dbdih::execDIGETNODESREQ(Signal* signal)
{
  // EXECUTE_DIRECT request (from TC): map (tableId, hashValue) to a
  // fragment id plus the set of nodes currently storing it.  The table's
  // optimistic read lock (m_lock) is taken at 'loop' and validated at the
  // end; on a concurrent writer the whole lookup is redone.
  const DiGetNodesReq * const req = (DiGetNodesReq *)&signal->theData[0];
  FragmentstorePtr fragPtr;
  TabRecordPtr tabPtr;
  tabPtr.i = req->tableId;
  Uint32 hashValue = req->hashValue;
  Uint32 distr_key_indicator = req->distr_key_indicator;
  Uint32 ttabFileSize = ctabFileSize;
  Uint32 fragId, newFragId = RNIL;
  DiGetNodesConf * const conf = (DiGetNodesConf *)&signal->theData[0];
  TabRecord* regTabDesc = tabRecord;
  EmulatedJamBuffer * jambuf = (EmulatedJamBuffer*)req->jamBufferPtr;
  thrjamEntry(jambuf);
  ptrCheckGuard(tabPtr, ttabFileSize, regTabDesc);

  // Ordered indexes are co-located with their base table: use its layout.
  if (DictTabInfo::isOrderedIndex(tabPtr.p->tableType))
  {
    thrjam(jambuf);
    tabPtr.i = tabPtr.p->primaryTableId;
    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  }

loop:
  Uint32 val = tabPtr.p->m_lock.read_lock();
  Uint32 map_ptr_i = tabPtr.p->m_map_ptr_i;
  Uint32 new_map_ptr_i = tabPtr.p->m_new_map_ptr_i;

  /* When distr key indicator is set, regardless
   * of distribution algorithm in use, hashValue
   * IS fragment id.
   */
  if (distr_key_indicator)
  {
    fragId = hashValue;
    if (unlikely(fragId >= tabPtr.p->totalfragments))
    {
      thrjam(jambuf);
      conf->zero= 1; //Indicate error;
      signal->theData[1]= ZUNDEFINED_FRAGMENT_ERROR;
      return;
    }
  }
  else if (tabPtr.p->method == TabRecord::HASH_MAP)
  {
    thrjam(jambuf);
    Ptr<Hash2FragmentMap> ptr;
    g_hash_map.getPtr(ptr, map_ptr_i);
    fragId = ptr.p->m_map[hashValue % ptr.p->m_cnt];

    // During a reorg a new map exists: compute the destination fragment
    // too, and report it (REORG_MOVING) only when the row actually moves.
    if (unlikely(new_map_ptr_i != RNIL))
    {
      thrjam(jambuf);
      g_hash_map.getPtr(ptr, new_map_ptr_i);
      newFragId = ptr.p->m_map[hashValue % ptr.p->m_cnt];
      if (newFragId == fragId)
      {
        thrjam(jambuf);
        newFragId = RNIL;
      }
    }
  }
  else if (tabPtr.p->method == TabRecord::LINEAR_HASH)
  {
    thrjam(jambuf);
    fragId = hashValue & tabPtr.p->mask;
    if (fragId < tabPtr.p->hashpointer) {
      thrjam(jambuf);
      fragId = hashValue & ((tabPtr.p->mask << 1) + 1);
    }//if
  }
  else if (tabPtr.p->method == TabRecord::NORMAL_HASH)
  {
    thrjam(jambuf);
    fragId= hashValue % tabPtr.p->totalfragments;
  }
  else
  {
    thrjam(jambuf);
    ndbassert(tabPtr.p->method == TabRecord::USER_DEFINED);

    /* User defined partitioning, but no distribution key passed */
    conf->zero= 1; //Indicate error;
    signal->theData[1]= ZUNDEFINED_FRAGMENT_ERROR;
    return;
  }
  if (ERROR_INSERTED_CLEAR(7240))
  {
    thrjam(jambuf);
    conf->zero= 1; //Indicate error;
    signal->theData[1]= ZUNDEFINED_FRAGMENT_ERROR;
    return;
  }
  // Pack the reply: node list, then replica count / distribution key /
  // instance key folded into reqinfo.
  getFragstore(tabPtr.p, fragId, fragPtr);
  Uint32 nodeCount = extractNodeInfo(jambuf, fragPtr.p, conf->nodes);
  Uint32 sig2 = (nodeCount - 1) +
    (fragPtr.p->distributionKey << 16) +
    (dihGetInstanceKey(fragPtr) << 24);
  conf->zero = 0;
  conf->reqinfo = sig2;
  conf->fragId = fragId;

  // Reorg in progress and this row moves: append destination fragment
  // info after the primary node list.
  if (unlikely(newFragId != RNIL))
  {
    thrjam(jambuf);
    conf->reqinfo |= DiGetNodesConf::REORG_MOVING;
    getFragstore(tabPtr.p, newFragId, fragPtr);
    nodeCount = extractNodeInfo(jambuf,
                               fragPtr.p,
                               conf->nodes + 2 + MAX_REPLICAS);
    conf->nodes[MAX_REPLICAS] = newFragId;
    conf->nodes[MAX_REPLICAS + 1] = (nodeCount - 1) +
      (fragPtr.p->distributionKey << 16) +
      (dihGetInstanceKey(fragPtr) << 24);
  }

  // Validate the optimistic read; redo the whole lookup on conflict.
  // NOTE(review): the error returns above skip this validation, so a torn
  // read there can only surface as ZUNDEFINED_FRAGMENT_ERROR - confirm
  // that this is the intended behavior.
  if (unlikely(!tabPtr.p->m_lock.read_unlock(val)))
    goto loop;
}//Dbdih::execDIGETNODESREQ()
13644 
extractNodeInfo(EmulatedJamBuffer * jambuf,const Fragmentstore * fragPtr,Uint32 nodes[])13645 Uint32 Dbdih::extractNodeInfo(EmulatedJamBuffer *jambuf,
13646                               const Fragmentstore * fragPtr,
13647                               Uint32 nodes[])
13648 {
13649   Uint32 nodeCount = 0;
13650   nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
13651   for (Uint32 i = 0; i < fragPtr->fragReplicas; i++) {
13652     thrjam(jambuf);
13653     NodeRecordPtr nodePtr;
13654     ndbrequire(i < MAX_REPLICAS);
13655     nodePtr.i = fragPtr->activeNodes[i];
13656     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
13657     if (nodePtr.p->useInTransactions) {
13658       thrjam(jambuf);
13659       nodes[nodeCount] = nodePtr.i;
13660       nodeCount++;
13661     }//if
13662   }//for
13663   ndbrequire(nodeCount > 0);
13664   return nodeCount;
13665 }//Dbdih::extractNodeInfo()
13666 
13667 void
getFragstore(TabRecord * tab,Uint32 fragNo,FragmentstorePtr & fragptr)13668 Dbdih::getFragstore(TabRecord * tab,        //In parameter
13669                     Uint32 fragNo,              //In parameter
13670                     FragmentstorePtr & fragptr) //Out parameter
13671 {
13672   FragmentstorePtr fragPtr;
13673   Uint32 TfragstoreFileSize = cfragstoreFileSize;
13674   Fragmentstore* TfragStore = fragmentstore;
13675   Uint32 chunkNo = fragNo >> LOG_NO_OF_FRAGS_PER_CHUNK;
13676   Uint32 chunkIndex = fragNo & (NO_OF_FRAGS_PER_CHUNK - 1);
13677   fragPtr.i = tab->startFid[chunkNo] + chunkIndex;
13678   if (likely(chunkNo < NDB_ARRAY_SIZE(tab->startFid))) {
13679     ptrCheckGuard(fragPtr, TfragstoreFileSize, TfragStore);
13680     fragptr = fragPtr;
13681     return;
13682   }//if
13683   ndbrequire(false);
13684 }//Dbdih::getFragstore()
13685 
allocFragments(Uint32 noOfFragments,TabRecordPtr tabPtr)13686 void Dbdih::allocFragments(Uint32 noOfFragments, TabRecordPtr tabPtr)
13687 {
13688   FragmentstorePtr fragPtr;
13689   Uint32 noOfChunks = (noOfFragments + (NO_OF_FRAGS_PER_CHUNK - 1)) >> LOG_NO_OF_FRAGS_PER_CHUNK;
13690   ndbrequire(cremainingfrags >= noOfFragments);
13691   for (Uint32 i = 0; i < noOfChunks; i++) {
13692     jam();
13693     Uint32 baseFrag = cfirstfragstore;
13694     ndbrequire(i < NDB_ARRAY_SIZE(tabPtr.p->startFid));
13695     tabPtr.p->startFid[i] = baseFrag;
13696     fragPtr.i = baseFrag;
13697     ptrCheckGuard(fragPtr, cfragstoreFileSize, fragmentstore);
13698     cfirstfragstore = fragPtr.p->nextFragmentChunk;
13699     cremainingfrags -= NO_OF_FRAGS_PER_CHUNK;
13700     for (Uint32 j = 0; j < NO_OF_FRAGS_PER_CHUNK; j++) {
13701       jam();
13702       fragPtr.i = baseFrag + j;
13703       ptrCheckGuard(fragPtr, cfragstoreFileSize, fragmentstore);
13704       initFragstore(fragPtr);
13705     }//if
13706   }//for
13707   tabPtr.p->noOfFragChunks = noOfChunks;
13708 }//Dbdih::allocFragments()
13709 
releaseFragments(TabRecordPtr tabPtr)13710 void Dbdih::releaseFragments(TabRecordPtr tabPtr)
13711 {
13712   FragmentstorePtr fragPtr;
13713   for (Uint32 i = 0; i < tabPtr.p->noOfFragChunks; i++) {
13714     jam();
13715     ndbrequire(i < NDB_ARRAY_SIZE(tabPtr.p->startFid));
13716     Uint32 baseFrag = tabPtr.p->startFid[i];
13717     fragPtr.i = baseFrag;
13718     ptrCheckGuard(fragPtr, cfragstoreFileSize, fragmentstore);
13719     fragPtr.p->nextFragmentChunk = cfirstfragstore;
13720     cfirstfragstore = baseFrag;
13721     tabPtr.p->startFid[i] = RNIL;
13722     cremainingfrags += NO_OF_FRAGS_PER_CHUNK;
13723   }//for
13724   tabPtr.p->noOfFragChunks = 0;
13725 }//Dbdih::releaseFragments()
13726 
initialiseFragstore()13727 void Dbdih::initialiseFragstore()
13728 {
13729   Uint32 i;
13730   FragmentstorePtr fragPtr;
13731   for (i = 0; i < cfragstoreFileSize; i++) {
13732     fragPtr.i = i;
13733     ptrCheckGuard(fragPtr, cfragstoreFileSize, fragmentstore);
13734     initFragstore(fragPtr);
13735   }//for
13736   Uint32 noOfChunks = cfragstoreFileSize >> LOG_NO_OF_FRAGS_PER_CHUNK;
13737   fragPtr.i = 0;
13738   cfirstfragstore = RNIL;
13739   cremainingfrags = 0;
13740   for (i = 0; i < noOfChunks; i++) {
13741     refresh_watch_dog();
13742     ptrCheckGuard(fragPtr, cfragstoreFileSize, fragmentstore);
13743     fragPtr.p->nextFragmentChunk = cfirstfragstore;
13744     cfirstfragstore = fragPtr.i;
13745     fragPtr.i += NO_OF_FRAGS_PER_CHUNK;
13746     cremainingfrags += NO_OF_FRAGS_PER_CHUNK;
13747   }//for
13748 }//Dbdih::initialiseFragstore()
13749 
13750 #ifndef NDB_HAVE_RMB
13751 #define rmb() do { } while (0)
13752 #endif
13753 
13754 #ifndef NDB_HAVE_WMB
13755 #define wmb() do { } while (0)
13756 #endif
13757 
13758 inline
13759 bool
isEmpty(const DIVERIFY_queue & q)13760 Dbdih::isEmpty(const DIVERIFY_queue & q)
13761 {
13762   return q.cfirstVerifyQueue == q.clastVerifyQueue;
13763 }
13764 
inline
void
Dbdih::enqueue(DIVERIFY_queue & q, Uint32 senderData, Uint64 gci)
{
  // Producer side of the verify queue (circular buffer of
  // capiConnectFileSize slots).  The slot is filled first and the tail
  // index published afterwards, separated by a write barrier, so the
  // consumer (dequeue) never observes a published slot with stale data.
#ifndef NDEBUG
  /**
   * - assert only
   * - we must read first *before* "publishing last
   *   or else DIH-thread could already have consumed entry
   *   when we call assert
   */
  Uint32 first = q.cfirstVerifyQueue;
#endif

  Uint32 last = q.clastVerifyQueue;
  ApiConnectRecord * apiConnectRecord = q.apiConnectRecord;

  apiConnectRecord[last].senderData = senderData;
  apiConnectRecord[last].apiGci = gci;
  wmb();
  // Advance the tail with wrap-around at capiConnectFileSize.
  if (last + 1 == capiConnectFileSize)
  {
    q.clastVerifyQueue = 0;
  }
  else
  {
    q.clastVerifyQueue = last + 1;
  }
  // A full queue would be indistinguishable from empty: must never happen.
  assert(q.clastVerifyQueue != first);
}
13795 
inline
void
Dbdih::dequeue(DIVERIFY_queue & q, ApiConnectRecord & conRecord)
{
  // Consumer side of the verify queue: a read barrier before reading the
  // slot contents pairs with the producer's wmb() in enqueue, then the
  // head index is advanced with wrap-around at capiConnectFileSize.
  Uint32 first = q.cfirstVerifyQueue;
  ApiConnectRecord * apiConnectRecord = q.apiConnectRecord;

  rmb();
  conRecord.senderData = apiConnectRecord[first].senderData;
  conRecord.apiGci = apiConnectRecord[first].apiGci;

  if (first + 1 == capiConnectFileSize)
  {
    q.cfirstVerifyQueue = 0;
  }
  else
  {
    q.cfirstVerifyQueue = first + 1;
  }
}
13816 
13817 /*
13818   3.9   V E R I F I C A T I O N
13819   ****************************=
13820   */
13821 /****************************************************************************/
13822 /* **********     VERIFICATION SUB-MODULE                       *************/
13823 /****************************************************************************/
13824 /*
13825   3.9.1     R E C E I V I N G  O F  V E R I F I C A T I O N   R E Q U E S T
13826   *************************************************************************
13827   */
void Dbdih::execDIVERIFYREQ(Signal* signal)
{
  // EXECUTE_DIRECT from TC: request permission to commit a transaction at
  // the current GCI.  Fast path answers immediately; otherwise the
  // request is parked in the per-TC-instance verify queue until the
  // commit block is lifted.  The micro-GCP state is read under an
  // optimistic lock and the read is validated before replying.
  EmulatedJamBuffer * jambuf = * (EmulatedJamBuffer**)(signal->theData+2);
  thrjamEntry(jambuf);
  Uint32 qno = signal->theData[1];
  ndbassert(qno < NDB_ARRAY_SIZE(c_diverify_queue));
  DIVERIFY_queue & q = c_diverify_queue[qno];
loop:
  Uint32 val = m_micro_gcp.m_lock.read_lock();
  Uint32 blocked = getBlockCommit() == true ? 1 : 0;
  if (blocked == 0 && isEmpty(q))
  {
    thrjam(jambuf);
    /*-----------------------------------------------------------------------*/
    // We are not blocked and the verify queue was empty currently so we can
    // simply reply back to TC immediately. The method was called with
    // EXECUTE_DIRECT so we reply back by setting signal data and returning.
    // theData[0] already contains the correct information so
    // we need not touch it.
    /*-----------------------------------------------------------------------*/
    signal->theData[1] = (Uint32)(m_micro_gcp.m_current_gci >> 32);
    signal->theData[2] = (Uint32)(m_micro_gcp.m_current_gci & 0xFFFFFFFF);
    signal->theData[3] = 0;
    // Validate the optimistic read; retry the whole check on conflict.
    if (unlikely(! m_micro_gcp.m_lock.read_unlock(val)))
      goto loop;
    return;
  }//if
  /*-------------------------------------------------------------------------*/
  // Since we are blocked we need to put this operation last in the verify
  // queue to ensure that operation starts up in the correct order.
  /*-------------------------------------------------------------------------*/
  enqueue(q, signal->theData[0], m_micro_gcp.m_new_gci);
  if (blocked == 0 && jambuf == jamBuffer())
  {
    // Not blocked (only queue order forced us in here) and we are running
    // in the DIH thread itself: drain the queue right away.
    emptyverificbuffer(signal, 0, false);
  }
  signal->theData[3] = blocked + 1; // Indicate no immediate return
  return;
}//Dbdih::execDIVERIFYREQ()
13867 
execDIH_SCAN_TAB_REQ(Signal * signal)13868 void Dbdih::execDIH_SCAN_TAB_REQ(Signal* signal)
13869 {
13870   DihScanTabReq * req = (DihScanTabReq*)signal->getDataPtr();
13871   TabRecordPtr tabPtr;
13872   const Uint32 senderData = req->senderData;
13873   const Uint32 senderRef = req->senderRef;
13874   const Uint32 schemaTransId = req->schemaTransId;
13875 
13876   jamEntry();
13877 
13878   tabPtr.i = req->tableId;
13879   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
13880 
13881   if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE)
13882   {
13883     if (! (tabPtr.p->tabStatus == TabRecord::TS_CREATING &&
13884            tabPtr.p->schemaTransId == schemaTransId))
13885     {
13886       jam();
13887       goto error;
13888     }
13889   }
13890 
13891   tabPtr.p->m_scan_count[0]++;
13892   ndbassert(tabPtr.p->m_map_ptr_i != DihScanTabConf::InvalidCookie);
13893   {
13894     DihScanTabConf* conf = (DihScanTabConf*)signal->getDataPtrSend();
13895     conf->tableId = tabPtr.i;
13896     conf->senderData = senderData;
13897     conf->fragmentCount = tabPtr.p->totalfragments;
13898     conf->noOfBackups = tabPtr.p->noOfBackups;
13899     conf->scanCookie = tabPtr.p->m_map_ptr_i;
13900     conf->reorgFlag = tabPtr.p->m_scan_reorg_flag;
13901     sendSignal(senderRef, GSN_DIH_SCAN_TAB_CONF, signal,
13902                DihScanTabConf::SignalLength, JBB);
13903   }
13904   return;
13905 
13906 error:
13907   DihScanTabRef* ref = (DihScanTabRef*)signal->getDataPtrSend();
13908   ref->tableId = tabPtr.i;
13909   ref->senderData = senderData;
13910   ref->error = DihScanTabRef::ErroneousTableState;
13911   ref->tableStatus = tabPtr.p->tabStatus;
13912   ref->schemaTransId = schemaTransId;
13913   sendSignal(senderRef, GSN_DIH_SCAN_TAB_REF, signal,
13914              DihScanTabRef::SignalLength, JBB);
13915   return;
13916 
13917 }//Dbdih::execDIH_SCAN_TAB_REQ()
13918 
void Dbdih::execDIH_SCAN_GET_NODES_REQ(Signal* signal)
{
  // For each requested fragment of a scan, return the nodes storing it
  // plus the LQH instance key.  Multi-fragment requests arrive as a long
  // signal (FragItems in section 0); a single-fragment request may be a
  // short signal with the FragItem inline.
  jamEntry();

  DihScanGetNodesReq* req = (DihScanGetNodesReq*)signal->getDataPtrSend();
  const Uint32 tableId = req->tableId;
  const Uint32 senderRef = req->senderRef;
  const Uint32 fragCnt = req->fragCnt;

  SectionHandle reqHandle(this, signal);
  const bool useLongSignal = (reqHandle.m_cnt > 0);

  DihScanGetNodesReq::FragItem fragReq[DihScanGetNodesReq::MAX_DIH_FRAG_REQS];
  if (useLongSignal)
  {
    // Long signal: Fetch into fragReq[]
    jam();
    SegmentedSectionPtr fragReqSection;
    ndbrequire(reqHandle.getSection(fragReqSection,0));
    ndbassert(fragReqSection.p->m_sz == (fragCnt*DihScanGetNodesReq::FragItem::Length));
    ndbassert(fragCnt <= DihScanGetNodesReq::MAX_DIH_FRAG_REQS);
    copy((Uint32*)fragReq, fragReqSection);
  }
  else // Short signal, with single FragItem
  {
    jam();
    ndbassert(fragCnt == 1);
    ndbassert(signal->getLength()
              == DihScanGetNodesReq::FixedSignalLength + DihScanGetNodesReq::FragItem::Length);
    // 4 bytes per 32-bit word of FragItem.
    memcpy(fragReq, req->fragItem, 4 * DihScanGetNodesReq::FragItem::Length);
  }

  TabRecordPtr tabPtr;
  tabPtr.i = tableId;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  // Ordered indexes share fragmentation with their base table.
  if (DictTabInfo::isOrderedIndex(tabPtr.p->tableType)) {
    jam();
    tabPtr.i = tabPtr.p->primaryTableId;
    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  }

  DihScanGetNodesConf* conf = (DihScanGetNodesConf*)signal->getDataPtrSend();
  conf->tableId = tableId;
  conf->fragCnt = fragCnt;

  // Fill one CONF FragItem per requested fragment.
  for (Uint32 i=0; i < fragCnt; i++)
  {
    jam();
    FragmentstorePtr fragPtr;
    Uint32 nodes[MAX_REPLICAS];

    getFragstore(tabPtr.p, fragReq[i].fragId, fragPtr);
    Uint32 count = extractNodeInfo(jamBuffer(), fragPtr.p, nodes);

    conf->fragItem[i].senderData  = fragReq[i].senderData;
    conf->fragItem[i].fragId      = fragReq[i].fragId;
    conf->fragItem[i].instanceKey = dihGetInstanceKey(fragPtr);
    conf->fragItem[i].count       = count;
    conf->fragItem[i].nodes[0]    = nodes[0];
    conf->fragItem[i].nodes[1]    = nodes[1];
    conf->fragItem[i].nodes[2]    = nodes[2];
    conf->fragItem[i].nodes[3]    = nodes[3];
  }

  if (useLongSignal)
  {
    jam();
    Ptr<SectionSegment> fragConf;
    const Uint32 len = fragCnt*DihScanGetNodesConf::FragItem::Length;

    // Build the CONF section; on allocation failure (or error insert)
    // reply with a REF that reuses the request's own section, so no new
    // allocation is needed on the failure path.
    if (ERROR_INSERTED_CLEAR(7234) ||
        unlikely(!import(fragConf, (Uint32*)conf->fragItem, len)))
    {
      jam();
      DihScanGetNodesRef* ref = (DihScanGetNodesRef*)signal->getDataPtrSend();

      ref->tableId = tableId;
      ref->fragCnt = fragCnt;
      ref->errCode = ZLONG_MESSAGE_ERROR;

      /**
       *  NOTE: DihScanGetNodesRef return the same FragItem list
       *        received as part of the REQuest to avoid possible
       *        malloc failure handling in the REF.
       */
      sendSignal(senderRef, GSN_DIH_SCAN_GET_NODES_REF, signal,
                 DihScanGetNodesRef::FixedSignalLength,
                 JBB, &reqHandle);
      return;
    }
    // Success: the request sections are no longer needed; the CONF takes
    // ownership of the freshly imported section.
    releaseSections(reqHandle);

    SectionHandle confHandle(this, fragConf.i);
    sendSignal(senderRef, GSN_DIH_SCAN_GET_NODES_CONF, signal,
               DihScanGetNodesConf::FixedSignalLength,
               JBB, &confHandle);
  }
  else
  {
    // A short signal is sufficient.
    jam();
    ndbassert(fragCnt == 1);

    if (ERROR_INSERTED_CLEAR(7234))
    {
      jam();
      DihScanGetNodesRef* ref = (DihScanGetNodesRef*)signal->getDataPtrSend();

      ref->tableId = tableId;
      ref->fragCnt = fragCnt;
      ref->errCode = ZLONG_MESSAGE_ERROR;
      ref->fragItem[0] = fragReq[0];

      sendSignal(senderRef, GSN_DIH_SCAN_GET_NODES_REF, signal,
                 DihScanGetNodesRef::FixedSignalLength
                 + DihScanGetNodesRef::FragItem::Length,
                 JBB);
      return;
    }
    sendSignal(senderRef, GSN_DIH_SCAN_GET_NODES_CONF, signal,
               DihScanGetNodesConf::FixedSignalLength
               + DihScanGetNodesConf::FragItem::Length,
               JBB);
  }
}//Dbdih::execDIH_SCAN_GET_NODES_REQ
14044 
14045 void
execDIH_SCAN_TAB_COMPLETE_REP(Signal * signal)14046 Dbdih::execDIH_SCAN_TAB_COMPLETE_REP(Signal* signal)
14047 {
14048   jamEntry();
14049   DihScanTabCompleteRep* rep = (DihScanTabCompleteRep*)signal->getDataPtr();
14050   TabRecordPtr tabPtr;
14051   tabPtr.i = rep->tableId;
14052   Uint32 map_ptr_i = rep->scanCookie;
14053   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
14054 
14055   if (map_ptr_i == tabPtr.p->m_map_ptr_i)
14056   {
14057     jam();
14058     ndbassert(tabPtr.p->m_scan_count[0]);
14059     tabPtr.p->m_scan_count[0]--;
14060   }
14061   else
14062   {
14063     jam();
14064     ndbassert(tabPtr.p->m_scan_count[1]);
14065     tabPtr.p->m_scan_count[1]--;
14066   }
14067 }
14068 
14069 
14070 /****************************************************************************/
14071 /* **********     GLOBAL-CHECK-POINT HANDLING  MODULE           *************/
14072 /****************************************************************************/
14073 /*
14074   3.10   G L O B A L  C H E C K P O I N T ( IN  M A S T E R  R O L E)
14075   *******************************************************************
14076   */
14077 
/**
 * Check whether every data node in the cluster runs a version that
 * supports the micro GCP protocol, and if so enable it locally.
 *
 * @param signal     signal object used for outgoing signals
 * @param broadcast  when true, also order every alive DIH instance
 *                   (and the local QMGR) to switch protocol
 * @return the resulting value of m_micro_gcp.m_enabled
 */
bool
Dbdih::check_enable_micro_gcp(Signal* signal, bool broadcast)
{
  ndbassert(m_micro_gcp.m_enabled == false);
  ndbassert(NodeVersionInfo::DataLength == 6);
  // Compute the minimum version over all node types.
  // A m_min_version of 0 means no node of that type exists - skip it.
  Uint32 min = ~(Uint32)0;
  const NodeVersionInfo& info = getNodeVersionInfo();
  for (Uint32 i = 0; i<3; i++)
  {
    Uint32 tmp = info.m_type[i].m_min_version;
    if (tmp)
    {
      min = (min < tmp) ? min : tmp;
    }
  }

  if (ndb_check_micro_gcp(min))
  {
    jam();
    m_micro_gcp.m_enabled = true;

    infoEvent("Enabling micro GCP");
    if (broadcast)
    {
      jam();
      UpgradeProtocolOrd * ord = (UpgradeProtocolOrd*)signal->getDataPtrSend();
      ord->type = UpgradeProtocolOrd::UPO_ENABLE_MICRO_GCP;

      /**
       * We need to notify all ndbd's or they'll get confused!
       */
      // Walk the alive-node list and send the order to each DIH (JBA so
      // it overtakes queued JBB traffic).
      NodeRecordPtr specNodePtr;
      specNodePtr.i = cfirstAliveNode;
      do {
        jam();
        ptrCheckGuard(specNodePtr, MAX_NDB_NODES, nodeRecord);
        sendSignal(calcDihBlockRef(specNodePtr.i), GSN_UPGRADE_PROTOCOL_ORD,
                   signal, UpgradeProtocolOrd::SignalLength, JBA);
        specNodePtr.i = specNodePtr.p->nextNode;
      } while (specNodePtr.i != RNIL);
      // Inform local QMGR directly (no need to go through the job buffer).
      EXECUTE_DIRECT(QMGR,GSN_UPGRADE_PROTOCOL_ORD,signal,signal->getLength());
    }
  }
  return m_micro_gcp.m_enabled;
}
14123 
14124 void
execUPGRADE_PROTOCOL_ORD(Signal * signal)14125 Dbdih::execUPGRADE_PROTOCOL_ORD(Signal* signal)
14126 {
14127   const UpgradeProtocolOrd* ord = (UpgradeProtocolOrd*)signal->getDataPtr();
14128   switch(ord->type){
14129   case UpgradeProtocolOrd::UPO_ENABLE_MICRO_GCP:
14130     jam();
14131     m_micro_gcp.m_enabled = true;
14132     EXECUTE_DIRECT(QMGR, GSN_UPGRADE_PROTOCOL_ORD,signal, signal->getLength());
14133     return;
14134   }
14135 }
14136 
/**
 * Master: try to start a new GCP round (a micro GCP, and when the save
 * timer has expired also an on-disk GCP save).  If preconditions are
 * not yet met the attempt is re-scheduled via CONTINUEB(ZSTART_GCP).
 * On success, GCP_PREPARE is sent to all participating nodes.
 */
void
Dbdih::startGcpLab(Signal* signal)
{
  if (ERROR_INSERTED(7242))
  {
    jam();
    g_eventLogger->info("Delayed GCP_COMMIT start 5s");
    signal->theData[0] = DihContinueB::ZSTART_GCP;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 5000, 1);
    return;
  }

  // All verification queues must have been drained before a new round.
  for (Uint32 i = 0; i < c_diverify_queue_cnt; i++)
  {
    if (c_diverify_queue[i].m_empty_done == 0)
    {
      // Previous global checkpoint is not yet completed.
      jam();
      signal->theData[0] = DihContinueB::ZSTART_GCP;
      sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 10, 1);
      return;
    }
  }

  emptyWaitGCPMasterQueue(signal,
                          m_micro_gcp.m_current_gci,
                          c_waitEpochMasterList);

  if (c_nodeStartMaster.blockGcp != 0 &&
      m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_IDLE)
  {
    jam();

    /* ------------------------------------------------------------------ */
    /*  A NEW NODE WANTS IN AND WE MUST ALLOW IT TO COME IN NOW SINCE THE */
    /*       GCP IS COMPLETED.                                            */
    /* ------------------------------------------------------------------ */

    if (ERROR_INSERTED(7217))
    {
      jam();

      signal->theData[0] = 9999;
      sendSignal(numberToRef(CMVMI, refToNode(c_nodeStartMaster.startNode)),
                 GSN_NDB_TAMPER, signal, 1, JBB);
      NdbTick_Invalidate(&m_micro_gcp.m_master.m_start_time); // Force start
      // fall through
    }
    else
    {
      jam();
      ndbrequire(c_nodeStartMaster.blockGcp == 1); // Ordered...
      c_nodeStartMaster.blockGcp = 2; // effective
      gcpBlockedLab(signal);
      return;
    }
  }

  // An operator/DUMP order may block GCP start entirely; retry later.
  if (cgcpOrderBlocked)
  {
    jam();
    signal->theData[0] = DihContinueB::ZSTART_GCP;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 10, 1);
    return;
  }

  const NDB_TICKS now = c_current_time = NdbTick_getCurrentTicks();

  /**
   * An invalid micro-GCP 'start_time' is used to force
   * a micro GCP to be started immediately.
   */
  if (NdbTick_IsValid(m_micro_gcp.m_master.m_start_time))
  {
    // When micro GCP is disabled, every round is a full GCP, so the
    // (longer) GCP save period applies instead.
    const Uint32 delayMicro = m_micro_gcp.m_enabled ?
      m_micro_gcp.m_master.m_time_between_gcp :
      m_gcp_save.m_master.m_time_between_gcp;
    const Uint64 elapsed =
      NdbTick_Elapsed(m_micro_gcp.m_master.m_start_time, now).milliSec();

    if (elapsed < delayMicro)
    {
      jam();
      signal->theData[0] = DihContinueB::ZSTART_GCP;
      sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 10, 1);
      return;
    }
  }

  m_micro_gcp.m_master.m_start_time = now;

  if (m_micro_gcp.m_enabled == false &&
      m_micro_gcp.m_master.m_time_between_gcp)
  {
    /**
     * Micro GCP is disabled...but configured...
     */
    // Possibly all nodes have been upgraded by now - re-check.
    jam();
    check_enable_micro_gcp(signal, true);
  }

  /**
   * Check that there has not been more than 2^32 micro GCP wo/ any save
   */
  Uint64 currGCI = m_micro_gcp.m_current_gci;
  ndbrequire(Uint32(currGCI) != ~(Uint32)0);
  m_micro_gcp.m_master.m_new_gci = currGCI + 1;

  const Uint32 delaySave = m_gcp_save.m_master.m_time_between_gcp;
  const NDB_TICKS start  = m_gcp_save.m_master.m_start_time;
  const bool need_gcp_save =
    (!NdbTick_IsValid(start) ||                              //First or forced GCP
     NdbTick_Elapsed(start, now).milliSec() >= delaySave) && //Reached time limit
    (!ERROR_INSERTED(7243));  /* 7243 = no GCP_SAVE initiation */

  if ((m_micro_gcp.m_enabled == false) ||
      (need_gcp_save &&
       m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_IDLE))
  {
    jam();
    /**
     * Time for save...switch gci_hi
     */
    // Bump gci_hi and restart gci_lo from 0 for the new epoch series.
    m_gcp_save.m_master.m_start_time = now;
    m_micro_gcp.m_master.m_new_gci = Uint64((currGCI >> 32) + 1) << 32;

    signal->theData[0] = NDB_LE_GlobalCheckpointStarted; //Event type
    signal->theData[1] = Uint32(currGCI >> 32);
    signal->theData[2] = Uint32(currGCI);
    sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
  }

  ndbassert(m_micro_gcp.m_enabled || Uint32(m_micro_gcp.m_new_gci) == 0);


  /***************************************************************************/
  // Report the event that a global checkpoint has started.
  /***************************************************************************/

  CRASH_INSERTION(7000);
  m_micro_gcp.m_master.m_state = MicroGcp::M_GCP_PREPARE;
  signal->setTrace(TestOrd::TraceGlobalCheckpoint);

#ifdef ERROR_INSERT
  // 7186: send GCP_PREPARE only to a random subset, then crash self.
  if (ERROR_INSERTED(7186))
  {
    sendToRandomNodes("GCP_PREPARE",
                      signal, &c_GCP_PREPARE_Counter, &Dbdih::sendGCP_PREPARE);
    signal->theData[0] = 9999;
    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);
    return;
  }
  // 7200: prepare all nodes, then kill the first remote node found.
  else if (ERROR_INSERTED(7200))
  {
    c_GCP_PREPARE_Counter.clearWaitingFor();
    NodeRecordPtr nodePtr;
    nodePtr.i = cfirstAliveNode;
    do {
      jam();
      ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
      c_GCP_PREPARE_Counter.setWaitingFor(nodePtr.i);
      if (nodePtr.i != getOwnNodeId())
      {
        SET_ERROR_INSERT_VALUE(7201);
        sendGCP_PREPARE(signal, nodePtr.i, RNIL);
      }
      else
      {
        SET_ERROR_INSERT_VALUE(7202);
        sendGCP_PREPARE(signal, nodePtr.i, RNIL);
      }
      nodePtr.i = nodePtr.p->nextNode;
    } while (nodePtr.i != RNIL);

    NodeReceiverGroup rg(CMVMI, c_GCP_PREPARE_Counter);
    rg.m_nodes.clear(getOwnNodeId());
    Uint32 victim = rg.m_nodes.find(0);

    signal->theData[0] = 9999;
    sendSignal(numberToRef(CMVMI, victim),
	       GSN_NDB_TAMPER, signal, 1, JBA);

    CLEAR_ERROR_INSERT_VALUE;
    return;
  }
  // 7227: skip GCP_PREPARE to one specific node, then crash self.
  else if (ERROR_INSERTED(7227))
  {
    ndbout_c("Not sending GCP_PREPARE to %u", c_error_insert_extra);
    c_GCP_PREPARE_Counter.clearWaitingFor();
    NodeRecordPtr nodePtr;
    nodePtr.i = cfirstAliveNode;
    do {
      jam();
      ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
      c_GCP_PREPARE_Counter.setWaitingFor(nodePtr.i);
      if (nodePtr.i != c_error_insert_extra)
      {
        sendGCP_PREPARE(signal, nodePtr.i, RNIL);
      }
      nodePtr.i = nodePtr.p->nextNode;
    } while (nodePtr.i != RNIL);

    signal->theData[0] = 9999;
    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 200, 1);
    return;
  }
#endif

  // Normal path: GCP_PREPARE to every participant, tracked by
  // c_GCP_PREPARE_Counter (replies arrive in execGCP_PREPARECONF).
  sendLoopMacro(GCP_PREPARE, sendGCP_PREPARE, RNIL);
}//Dbdih::startGcpLab()
14347 
execGCP_PREPARECONF(Signal * signal)14348 void Dbdih::execGCP_PREPARECONF(Signal* signal)
14349 {
14350   jamEntry();
14351   Uint32 senderNodeId = signal->theData[0];
14352   Uint32 gci_hi = signal->theData[1];
14353   Uint32 gci_lo = signal->theData[2];
14354 
14355   if (unlikely(signal->getLength() < GCPPrepareConf::SignalLength))
14356   {
14357     gci_lo = 0;
14358     ndbassert(!ndb_check_micro_gcp(getNodeInfo(senderNodeId).m_version));
14359   }
14360 
14361   Uint64 gci = gci_lo | (Uint64(gci_hi) << 32);
14362   ndbrequire(gci == m_micro_gcp.m_master.m_new_gci);
14363   receiveLoopMacro(GCP_PREPARE, senderNodeId);
14364   //-------------------------------------------------------------
14365   // We have now received all replies. We are ready to continue
14366   // with committing the global checkpoint.
14367   //-------------------------------------------------------------
14368   gcpcommitreqLab(signal);
14369 }//Dbdih::execGCP_PREPARECONF()
14370 
/**
 * Master: all nodes have prepared the new GCI - enter the commit phase
 * and send GCP_COMMIT to every participant (replies arrive as
 * GCP_NODEFINISH).
 */
void Dbdih::gcpcommitreqLab(Signal* signal)
{
  CRASH_INSERTION(7001);

  m_micro_gcp.m_master.m_state = MicroGcp::M_GCP_COMMIT;

#ifdef ERROR_INSERT
  // 7187: commit only a random subset of nodes, then crash self.
  if (ERROR_INSERTED(7187))
  {
    sendToRandomNodes("GCP_COMMIT",
                      signal, &c_GCP_COMMIT_Counter, &Dbdih::sendGCP_COMMIT);
    signal->theData[0] = 9999;
    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);
    return;
  }
#endif

  sendLoopMacro(GCP_COMMIT, sendGCP_COMMIT, RNIL);
  return;
}//Dbdih::gcpcommitreqLab()
14391 
/**
 * Master: a participant finished the GCP commit phase.  When all nodes
 * have replied: distribute SUB_GCP_COMPLETE_REP (micro GCP case) and,
 * if gci_hi advanced, start an on-disk GCP save (GCP_SAVEREQ).
 */
void Dbdih::execGCP_NODEFINISH(Signal* signal)
{
  jamEntry();
  const Uint32 senderNodeId = signal->theData[0];
  const Uint32 gci_hi = signal->theData[1];
  const Uint32 tcFailNo = signal->theData[2];
  const Uint32 gci_lo = signal->theData[3];
  const Uint64 gci = gci_lo | (Uint64(gci_hi) << 32);

  /* Check that there has not been a node failure since TC
   * reported this GCP complete...
   */
  if ((senderNodeId == getOwnNodeId()) &&
      (tcFailNo < cMinTcFailNo))
  {
    jam();
    ndbrequire(c_GCP_COMMIT_Counter.isWaitingFor(getOwnNodeId()));

    /* We are master, and the local TC will takeover the transactions
     * of the failed node, which can add to the current GCP, so resend
     * GCP_NOMORETRANS to TC...
     */
    m_micro_gcp.m_state = MicroGcp::M_GCP_COMMIT; /* Reset DIH Slave GCP state */

    GCPNoMoreTrans* req = (GCPNoMoreTrans*)signal->getDataPtrSend();
    req->senderRef = reference();
    req->senderData = m_micro_gcp.m_master_ref;
    req->gci_hi = Uint32(m_micro_gcp.m_old_gci >> 32);
    req->gci_lo = Uint32(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
    sendSignal(clocaltcblockref, GSN_GCP_NOMORETRANS, signal,
               GCPNoMoreTrans::SignalLength, JBB);

    return;
  }
  (void)gci; // TODO validate

  ndbrequire(m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_COMMIT);
  receiveLoopMacro(GCP_COMMIT, senderNodeId);

  // All GCP_NODEFINISH received - the commit phase is complete.
  jam();

  if (m_micro_gcp.m_enabled)
  {
    jam();

    m_micro_gcp.m_master.m_state = MicroGcp::M_GCP_COMPLETE;

    // Announce the completed epoch (the GCI we just switched away from).
    SubGcpCompleteRep * rep = (SubGcpCompleteRep*)signal->getDataPtr();
    rep->senderRef = reference();
    rep->gci_hi = (Uint32)(m_micro_gcp.m_old_gci >> 32);
    rep->gci_lo = (Uint32)(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
    rep->flags = SubGcpCompleteRep::IN_MEMORY;

#ifdef ERROR_INSERT
    // 7190: report completion only to a random subset, then crash self.
    if (ERROR_INSERTED(7190))
    {
      sendToRandomNodes("GCP_COMPLETE_REP", signal,
                        &c_SUB_GCP_COMPLETE_REP_Counter,
                        &Dbdih::sendSUB_GCP_COMPLETE_REP);
      signal->theData[0] = 9999;
      sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);
    }
    // 7226: skip one specific node, then crash self.
    else if (ERROR_INSERTED(7226))
    {
      ndbout_c("Not sending SUB_GCP_COMPLETE_REP to %u", c_error_insert_extra);
      c_SUB_GCP_COMPLETE_REP_Counter.clearWaitingFor();
      NodeRecordPtr nodePtr;
      nodePtr.i = cfirstAliveNode;
      do {
        jam();
        ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
        c_SUB_GCP_COMPLETE_REP_Counter.setWaitingFor(nodePtr.i);
        if (nodePtr.i != c_error_insert_extra)
        {
          sendSignal(calcDihBlockRef(nodePtr.i), GSN_SUB_GCP_COMPLETE_REP,
                     signal, SubGcpCompleteRep::SignalLength, JBA);
        }
        nodePtr.i = nodePtr.p->nextNode;
      } while (nodePtr.i != RNIL);
      SET_ERROR_INSERT_VALUE(7227);

      signal->theData[0] = 9999;
      sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 200, 1);
    }
    else
#endif
    {
      jam();
      // Normal path...
      sendLoopMacro(SUB_GCP_COMPLETE_REP, sendSUB_GCP_COMPLETE_REP, RNIL);
    }
  }

  //-------------------------------------------------------------
  // We have now received all replies. We are ready to continue
  // with saving the global checkpoint to disk.
  //-------------------------------------------------------------
  CRASH_INSERTION(7002);

  Uint32 curr_hi = (Uint32)(m_micro_gcp.m_current_gci >> 32);
  Uint32 old_hi = (Uint32)(m_micro_gcp.m_old_gci >> 32);

  if (m_micro_gcp.m_enabled)
  {
    jam();
  }
  else
  {
    // Without micro GCP every round bumps gci_hi, so they must differ.
    ndbrequire(curr_hi != old_hi);
  }

  // gci_hi unchanged: this was a micro GCP only - no disk save needed.
  if (curr_hi == old_hi)
  {
    jam();
    return;
  }

  /**
   * Start a save
   */
  Uint32 saveGCI = old_hi;
  m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_REQ;
  m_gcp_save.m_master.m_new_gci = saveGCI;

#ifdef ERROR_INSERT
  // 7188: order save on a random subset of nodes, then crash self.
  if (ERROR_INSERTED(7188))
  {
    sendToRandomNodes("GCP_SAVE",
                      signal, &c_GCP_SAVEREQ_Counter, &Dbdih::sendGCP_SAVEREQ);
    signal->theData[0] = 9999;
    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);
    return;
  }
  // 7216: skip one node's save request but still wait for it, then crash.
  else if (ERROR_INSERTED(7216))
  {
    infoEvent("GCP_SAVE all/%u", c_error_insert_extra);
    NodeRecordPtr nodePtr;
    nodePtr.i = c_error_insert_extra;
    ptrAss(nodePtr, nodeRecord);

    removeAlive(nodePtr);
    sendLoopMacro(GCP_SAVEREQ, sendGCP_SAVEREQ, RNIL);
    insertAlive(nodePtr);
    signal->theData[0] = 9999;
    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);
    c_GCP_SAVEREQ_Counter.setWaitingFor(c_error_insert_extra);
    return;
  }
#endif

  sendLoopMacro(GCP_SAVEREQ, sendGCP_SAVEREQ, RNIL);
}
14544 
14545 void
execSUB_GCP_COMPLETE_ACK(Signal * signal)14546 Dbdih::execSUB_GCP_COMPLETE_ACK(Signal* signal)
14547 {
14548   jamEntry();
14549   SubGcpCompleteAck ack = * CAST_CONSTPTR(SubGcpCompleteAck,
14550                                           signal->getDataPtr());
14551   Uint32 senderNodeId = refToNode(ack.rep.senderRef);
14552 
14553   ndbrequire(m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_COMPLETE);
14554   receiveLoopMacro(SUB_GCP_COMPLETE_REP, senderNodeId);
14555 
14556   m_micro_gcp.m_master.m_state = MicroGcp::M_GCP_IDLE;
14557 
14558   if (!ERROR_INSERTED(7190))
14559   {
14560     signal->theData[0] = DihContinueB::ZSTART_GCP;
14561     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 10, 1);
14562   }
14563 }
14564 
/**
 * Participant: the master orders this node to save GCI req->gci to
 * disk.  Handles master takeover: if the same GCI is already being
 * saved (or is already saved) just record the new master reference
 * (and in the latter case reply immediately).  Otherwise forward the
 * request to the local DBLQH.
 */
void
Dbdih::execGCP_SAVEREQ(Signal* signal)
{
  jamEntry();
  GCPSaveReq * req = (GCPSaveReq*)&signal->theData[0];

  if (ERROR_INSERTED(7237))
  {
    jam();
    g_eventLogger->info("Delayed GCP_SAVEREQ 5s");
    sendSignalWithDelay(reference(), GSN_GCP_SAVEREQ,
                        signal, 5000,
                        signal->getLength());
    return;
  }

  if (m_gcp_save.m_state == GcpSave::GCP_SAVE_REQ)
  {
    jam();
    /**
     * This is master take over...
     * and SAVE_REQ is already running
     */
    // Just re-target the reply to the new master; LQH reply will follow.
    ndbrequire(m_gcp_save.m_gci == req->gci);
    m_gcp_save.m_master_ref = req->dihBlockRef;
    return;
  }

  if (m_gcp_save.m_gci == req->gci)
  {
    jam();
    /**
     * This is master take over...
     * and SAVE_REQ is complete...
     */
    // Already saved this GCI: confirm directly to the new master.
    m_gcp_save.m_master_ref = req->dihBlockRef;

    GCPSaveReq save = (* req);
    GCPSaveConf * conf = (GCPSaveConf*)signal->getDataPtrSend();
    conf->dihPtr = save.dihPtr;
    conf->nodeId = getOwnNodeId();
    conf->gci    = save.gci;
    sendSignal(m_gcp_save.m_master_ref, GSN_GCP_SAVECONF, signal,
               GCPSaveConf::SignalLength, JBA);
    return;
  }

  // Normal path: start a new save round and hand it to local DBLQH,
  // with dihBlockRef rewritten so LQH replies to this DIH.
  ndbrequire(m_gcp_save.m_state == GcpSave::GCP_SAVE_IDLE);
  m_gcp_save.m_state = GcpSave::GCP_SAVE_REQ;
  m_gcp_save.m_master_ref = req->dihBlockRef;
  m_gcp_save.m_gci = req->gci;

  req->dihBlockRef = reference();
  sendSignal(DBLQH_REF, GSN_GCP_SAVEREQ, signal, signal->getLength(), JBA);
}
14620 
execGCP_SAVECONF(Signal * signal)14621 void Dbdih::execGCP_SAVECONF(Signal* signal)
14622 {
14623   jamEntry();
14624   GCPSaveConf * saveConf = (GCPSaveConf*)&signal->theData[0];
14625 
14626   if (refToBlock(signal->getSendersBlockRef()) == DBLQH)
14627   {
14628     jam();
14629 
14630     ndbrequire(m_gcp_save.m_state == GcpSave::GCP_SAVE_REQ);
14631     m_gcp_save.m_state = GcpSave::GCP_SAVE_CONF;
14632 
14633     sendSignal(m_gcp_save.m_master_ref,
14634                GSN_GCP_SAVECONF, signal, signal->getLength(), JBA);
14635     return;
14636   }
14637 
14638   ndbrequire(saveConf->gci == m_gcp_save.m_master.m_new_gci);
14639   ndbrequire(saveConf->nodeId == saveConf->dihPtr);
14640   SYSFILE->lastCompletedGCI[saveConf->nodeId] = saveConf->gci;
14641   GCP_SAVEhandling(signal, saveConf->nodeId);
14642 }//Dbdih::execGCP_SAVECONF()
14643 
execGCP_SAVEREF(Signal * signal)14644 void Dbdih::execGCP_SAVEREF(Signal* signal)
14645 {
14646   jamEntry();
14647   GCPSaveRef * const saveRef = (GCPSaveRef*)&signal->theData[0];
14648 
14649   if (refToBlock(signal->getSendersBlockRef()) == DBLQH)
14650   {
14651     jam();
14652 
14653     ndbrequire(m_gcp_save.m_state == GcpSave::GCP_SAVE_REQ);
14654     m_gcp_save.m_state = GcpSave::GCP_SAVE_CONF;
14655 
14656     sendSignal(m_gcp_save.m_master_ref,
14657                GSN_GCP_SAVEREF, signal, signal->getLength(), JBA);
14658     return;
14659   }
14660 
14661   ndbrequire(saveRef->gci == m_gcp_save.m_master.m_new_gci);
14662   ndbrequire(saveRef->nodeId == saveRef->dihPtr);
14663 
14664   /**
14665    * Only allow reason not to save
14666    */
14667   ndbrequire(saveRef->errorCode == GCPSaveRef::NodeShutdownInProgress ||
14668 	     saveRef->errorCode == GCPSaveRef::FakedSignalDueToNodeFailure ||
14669 	     saveRef->errorCode == GCPSaveRef::NodeRestartInProgress);
14670   GCP_SAVEhandling(signal, saveRef->nodeId);
14671 }//Dbdih::execGCP_SAVEREF()
14672 
/**
 * Master: account for one node's GCP_SAVECONF/REF.  When all nodes
 * have replied, record the new restorable GCI in the system file and
 * start distributing it via copyGciLab().
 *
 * @param signal  signal object for outgoing signals
 * @param nodeId  the node whose save reply was received
 */
void Dbdih::GCP_SAVEhandling(Signal* signal, Uint32 nodeId)
{
  ndbrequire(m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_REQ);
  receiveLoopMacro(GCP_SAVEREQ, nodeId);
  /*-------------------------------------------------------------------------*/
  // All nodes have replied. We are ready to update the system file.
  /*-------------------------------------------------------------------------*/

  CRASH_INSERTION(7003);
  /**------------------------------------------------------------------------
   * SET NEW RECOVERABLE GCI. ALSO RESET RESTART COUNTER TO ZERO.
   * THIS INDICATES THAT THE SYSTEM HAS BEEN RECOVERED AND SURVIVED AT
   * LEAST ONE GLOBAL CHECKPOINT PERIOD. WE WILL USE THIS PARAMETER TO
   * SET BACK THE RESTART GCI IF WE ENCOUNTER MORE THAN ONE UNSUCCESSFUL
   * RESTART.
   *------------------------------------------------------------------------*/
  SYSFILE->newestRestorableGCI = m_gcp_save.m_gci;
  // The initial start has survived a full GCP once we reach SL_STARTED;
  // clear the flag so a later crash is treated as a system restart.
  if(Sysfile::getInitialStartOngoing(SYSFILE->systemRestartBits) &&
     getNodeState().startLevel == NodeState::SL_STARTED){
    jam();
#if 0
    g_eventLogger->info("Dbdih: Clearing initial start ongoing");
#endif
    Sysfile::clearInitialStartOngoing(SYSFILE->systemRestartBits);
  }
  copyGciLab(signal, CopyGCIReq::GLOBAL_CHECKPOINT);

  m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_COPY_GCI;

}//Dbdih::GCP_SAVEhandling()
14703 
14704 /*
14705   3.11   G L O B A L  C H E C K P O I N T (N O T - M A S T E R)
14706   *************************************************************
14707   */
/**
 * Participant: the master asks this node to prepare the new GCI.
 * Handles master takeover (the GCI may already be prepared); otherwise
 * records the new GCI under the micro-GCP lock, notifies SUMA, and
 * replies with GCP_PREPARECONF.
 */
void Dbdih::execGCP_PREPARE(Signal* signal)
{
  jamEntry();
  CRASH_INSERTION(7005);

  if (ERROR_INSERTED(7030))
  {
    cgckptflag = true;
  }
  if (ERROR_INSERTED(7030) ||
      ERROR_INSERTED(7238))
  {
    g_eventLogger->info("Delayed GCP_PREPARE 5s");
    sendSignalWithDelay(reference(), GSN_GCP_PREPARE, signal, 5000,
			signal->getLength());
    return;
  }

  GCPPrepare* req = (GCPPrepare*)signal->getDataPtr();
  GCPPrepareConf * conf = (GCPPrepareConf*)signal->getDataPtrSend();
  Uint32 masterNodeId = req->nodeId;
  Uint32 gci_hi = req->gci_hi;
  Uint32 gci_lo = req->gci_lo;
  // Short signal from an old-version master: no gci_lo word present.
  if (unlikely(signal->getLength() < GCPPrepare::SignalLength))
  {
    jam();
    gci_lo = 0;
    ndbassert(!ndb_check_micro_gcp(getNodeInfo(masterNodeId).m_version));
  }
  Uint64 gci = gci_lo | (Uint64(gci_hi) << 32);

  BlockReference retRef = calcDihBlockRef(masterNodeId);

  if (isMaster())
  {
    ndbrequire(m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_PREPARE);
  }

  if (m_micro_gcp.m_state == MicroGcp::M_GCP_PREPARE)
  {
    jam();
    /**
     * This must be master take over
     *   Prepare is already complete
     */
    ndbrequire(m_micro_gcp.m_new_gci == gci);
    m_micro_gcp.m_master_ref = retRef;
    goto reply;
  }

  if (m_micro_gcp.m_new_gci == gci)
  {
    jam();
    /**
     * This GCP has already been prepared...
     *   Must be master takeover
     */
    m_micro_gcp.m_master_ref = retRef;
    goto reply;
  }

  ndbrequire(m_micro_gcp.m_state == MicroGcp::M_GCP_IDLE);

  // Record the new GCI under the write lock so concurrent readers see
  // a consistent (state, new_gci) pair.
  m_micro_gcp.m_lock.write_lock();
  cgckptflag = true;
  m_micro_gcp.m_state = MicroGcp::M_GCP_PREPARE;
  m_micro_gcp.m_new_gci = gci;
  m_micro_gcp.m_master_ref = retRef;
  m_micro_gcp.m_lock.write_unlock();

  if (ERROR_INSERTED(7031))
  {
    g_eventLogger->info("Crashing delayed in GCP_PREPARE 3s");
    signal->theData[0] = 9999;
    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 3000, 1);
    return;
  }
#ifdef GCP_TIMER_HACK
  globalData.gcp_timer_commit[0] = NdbTick_getCurrentTicks();
#endif

reply:
  /**
   * Send the new gci to Suma.
   *
   * To get correct signal order and avoid races, this signal is sent on the
   * same prio as the SUB_GCP_COMPLETE_REP signal sent to SUMA in
   * execSUB_GCP_COMPLETE_REP().
   */
  sendSignal(SUMA_REF, GSN_GCP_PREPARE, signal, signal->length(), JBB);

  /* Send reply. */
  conf->nodeId = cownNodeId;
  conf->gci_hi = gci_hi;
  conf->gci_lo = gci_lo;
  sendSignal(retRef, GSN_GCP_PREPARECONF, signal,
             GCPPrepareConf::SignalLength, JBA);
  return;
}
14807 
/**
 * Participant: the master commits the prepared GCI.  Handles master
 * takeover (commit already ongoing or complete); otherwise switches
 * current/old GCI under the micro-GCP lock, drains the verification
 * queues, and asks local TC to report when no transactions remain in
 * the old GCI (GCP_NOMORETRANS -> GCP_TCFINISHED).
 */
void Dbdih::execGCP_COMMIT(Signal* signal)
{
  jamEntry();
  CRASH_INSERTION(7006);

  if (ERROR_INSERTED(7239))
  {
    g_eventLogger->info("Delayed GCP_COMMIT 5s");
    sendSignalWithDelay(reference(), GSN_GCP_COMMIT, signal, 5000,
                        signal->getLength());
    return;
  }

  GCPCommit * req = (GCPCommit*)signal->getDataPtr();
  Uint32 masterNodeId = req->nodeId;
  Uint32 gci_hi = req->gci_hi;
  Uint32 gci_lo = req->gci_lo;

  // Short signal from an old-version master: no gci_lo word present.
  if (unlikely(signal->getLength() < GCPCommit::SignalLength))
  {
    gci_lo = 0;
    ndbassert(!ndb_check_micro_gcp(getNodeInfo(masterNodeId).m_version));
  }
  Uint64 gci = gci_lo | (Uint64(gci_hi) << 32);

#ifdef ERROR_INSERT
  // 7213: kill a chosen LQH and dump state in a chosen CMVMI instead of
  // processing the commit.
  if (ERROR_INSERTED(7213))
  {
    ndbout_c("err 7213 killing %d", c_error_insert_extra);
    Uint32 save = signal->theData[0];
    signal->theData[0] = 5048;
    sendSignal(numberToRef(DBLQH, c_error_insert_extra),
               GSN_NDB_TAMPER, signal, 1, JBB);
    signal->theData[0] = save;
    CLEAR_ERROR_INSERT_VALUE;

    signal->theData[0] = 9999;
    sendSignal(numberToRef(CMVMI, c_error_insert_extra),
               GSN_DUMP_STATE_ORD, signal, 1, JBB);

    signal->theData[0] = save;
    CLEAR_ERROR_INSERT_VALUE;

    return;
  }
#endif

  Uint32 masterRef = calcDihBlockRef(masterNodeId);
  ndbrequire(masterNodeId == cmasterNodeId);
  if (isMaster())
  {
    ndbrequire(m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_COMMIT);
  }

  if (m_micro_gcp.m_state == MicroGcp::M_GCP_COMMIT)
  {
    jam();
    /**
     * This must be master take over
     *   Commit is already ongoing...
     */
    ndbrequire(m_micro_gcp.m_current_gci == gci);
    m_micro_gcp.m_master_ref = masterRef;
    return;
  }

  if (m_micro_gcp.m_current_gci == gci)
  {
    jam();
    /**
     * This must be master take over
     *   Commit has already completed
     */
    // Reply directly to the new master with the already-finished GCI.
    m_micro_gcp.m_master_ref = masterRef;

    GCPNodeFinished* conf = (GCPNodeFinished*)signal->getDataPtrSend();
    conf->nodeId = cownNodeId;
    conf->gci_hi = (Uint32)(m_micro_gcp.m_old_gci >> 32);
    conf->failno = cfailurenr;
    conf->gci_lo = (Uint32)(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
    sendSignal(masterRef, GSN_GCP_NODEFINISH, signal,
               GCPNodeFinished::SignalLength, JBB);
    return;
  }

  ndbrequire(m_micro_gcp.m_new_gci == gci);
  ndbrequire(m_micro_gcp.m_state == MicroGcp::M_GCP_PREPARE);
  m_micro_gcp.m_state = MicroGcp::M_GCP_COMMIT;
  m_micro_gcp.m_master_ref = calcDihBlockRef(masterNodeId);

  // Switch the current GCI under the write lock so concurrent readers
  // see a consistent (old_gci, current_gci) pair.
  m_micro_gcp.m_lock.write_lock();
  m_micro_gcp.m_old_gci = m_micro_gcp.m_current_gci;
  m_micro_gcp.m_current_gci = gci;
  cgckptflag = false;
  m_micro_gcp.m_lock.write_unlock();

  // Release operations queued while cgckptflag was set.
  for (Uint32 i = 0; i < c_diverify_queue_cnt; i++)
  {
    jam();
    c_diverify_queue[i].m_empty_done = 0;
    emptyverificbuffer(signal, i, true);
  }

  GCPNoMoreTrans* req2 = (GCPNoMoreTrans*)signal->getDataPtrSend();
  req2->senderRef = reference();
  req2->senderData = calcDihBlockRef(masterNodeId);
  req2->gci_hi = (Uint32)(m_micro_gcp.m_old_gci >> 32);
  req2->gci_lo = (Uint32)(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
  sendSignal(clocaltcblockref, GSN_GCP_NOMORETRANS, signal,
             GCPNoMoreTrans::SignalLength, JBB);
  return;
}//Dbdih::execGCP_COMMIT()
14920 
/**
 * GCP_TCFINISHED: local TC reports that all transactions belonging to
 * the old GCI have completed.  Before replying GCP_NODEFINISH to the
 * master, schedule a pass through each LQH and SUMA instance so that
 * they stay in sync wrt SUB_GCP_COMPLETE_REP; the actual reply is sent
 * from the callback execGCP_TCFINISHED_sync_conf.
 */
void Dbdih::execGCP_TCFINISHED(Signal* signal)
{
  jamEntry();
  CRASH_INSERTION(7007);
  GCPTCFinished* conf = (GCPTCFinished*)signal->getDataPtr();
  Uint32 retRef = conf->senderData;  /* only used by the 7181/7182 error-insert path below */
  Uint32 gci_hi = conf->gci_hi;
  Uint32 gci_lo = conf->gci_lo;
  Uint32 tcFailNo = conf->tcFailNo;
  Uint64 gci = gci_lo | (Uint64(gci_hi) << 32);
  /* TC must be reporting exactly the GCI we are currently committing. */
  ndbrequire(gci == m_micro_gcp.m_old_gci);

  if (ERROR_INSERTED(7181) || ERROR_INSERTED(7182))
  {
    /* Test hook: instead of replying, kill the master node via NDB_TAMPER. */
    c_error_7181_ref = retRef; // Save ref
    ndbout_c("killing %d", refToNode(cmasterdihref));
    signal->theData[0] = 9999;
    sendSignal(numberToRef(CMVMI, refToNode(cmasterdihref)),
	       GSN_NDB_TAMPER, signal, 1, JBB);
    return;
  }

#ifdef ERROR_INSERT
  if (ERROR_INSERTED(7214))
  {
    /* Test hook: kill node c_error_insert_extra, then continue normally.
     * theData[0] is saved/restored since the signal buffer is reused. */
    ndbout_c("err 7214 killing %d", c_error_insert_extra);
    Uint32 save = signal->theData[0];
    signal->theData[0] = 9999;
    sendSignal(numberToRef(CMVMI, c_error_insert_extra),
               GSN_NDB_TAMPER, signal, 1, JBB);
    signal->theData[0] = save;
    CLEAR_ERROR_INSERT_VALUE;
  }
#endif

#ifdef GCP_TIMER_HACK
  /* Record end of GCP commit phase for the timing report. */
  globalData.gcp_timer_commit[1] = NdbTick_getCurrentTicks();
#endif

  ndbrequire(m_micro_gcp.m_state == MicroGcp::M_GCP_COMMIT);

  /**
   * Make sure that each LQH gets scheduled, so that they don't get out of sync
   * wrt to SUB_GCP_COMPLETE_REP
   */
  Callback cb;
  cb.m_callbackData = tcFailNo;  /* Pass fail-no triggering TC_FINISHED to callback */
  cb.m_callbackFunction = safe_cast(&Dbdih::execGCP_TCFINISHED_sync_conf);
  Uint32 path[] = { DBLQH, SUMA, 0 };
  synchronize_path(signal, path, cb);
}//Dbdih::execGCP_TCFINISHED()
14972 
14973 void
execGCP_TCFINISHED_sync_conf(Signal * signal,Uint32 cb,Uint32 err)14974 Dbdih::execGCP_TCFINISHED_sync_conf(Signal* signal, Uint32 cb, Uint32 err)
14975 {
14976   ndbrequire(m_micro_gcp.m_state == MicroGcp::M_GCP_COMMIT);
14977 
14978   m_micro_gcp.m_state = MicroGcp::M_GCP_COMMITTED;
14979   Uint32 retRef = m_micro_gcp.m_master_ref;
14980 
14981   GCPNodeFinished* conf2 = (GCPNodeFinished*)signal->getDataPtrSend();
14982   conf2->nodeId = cownNodeId;
14983   conf2->gci_hi = (Uint32)(m_micro_gcp.m_old_gci >> 32);
14984   conf2->failno = cb;  /* tcFailNo */
14985   conf2->gci_lo = (Uint32)(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
14986   sendSignal(retRef, GSN_GCP_NODEFINISH, signal,
14987              GCPNodeFinished::SignalLength, JBB);
14988 }
14989 
/**
 * SUB_GCP_COMPLETE_REP from the master: the micro GCP is complete.
 * Passes the report through the local LQH (so all LDM instances see it
 * in order) and ACKs the master if its version supports the ACK.
 * Handles master takeover where the report arrives while we are
 * already in M_GCP_IDLE.
 */
void
Dbdih::execSUB_GCP_COMPLETE_REP(Signal* signal)
{
  jamEntry();

  CRASH_INSERTION(7228);

  if (ERROR_INSERTED(7244))
  {
    /* Test hook: re-deliver this signal to ourselves after 5 seconds. */
    g_eventLogger->info("Delayed SUB_GCP_COMPLETE_REP 5s");
    sendSignalWithDelay(reference(), GSN_SUB_GCP_COMPLETE_REP, signal, 5000,
                        signal->getLength());
    return;
  }

  /* Copy the report: the signal buffer is reused below for the forward
   * to DBLQH and for the ACK to the master. */
  SubGcpCompleteRep rep = * (SubGcpCompleteRep*)signal->getDataPtr();
  if (isMaster())
  {
    ndbrequire(m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_COMPLETE);
  }

  Uint32 masterRef = rep.senderRef;
  if (m_micro_gcp.m_state == MicroGcp::M_GCP_IDLE)
  {
    jam();
    /**
     * This must be master take over
     *   signal has already arrived
     */
    m_micro_gcp.m_master_ref = masterRef;
    goto reply;
  }

  ndbrequire(m_micro_gcp.m_state == MicroGcp::M_GCP_COMMITTED);
  m_micro_gcp.m_state = MicroGcp::M_GCP_IDLE;

  /**
   * To handle multiple LDM instances, this need to be passed though
   * each LQH...(so that no fire-trig-ord can arrive "too" late)
   */
  sendSignal(DBLQH_REF, GSN_SUB_GCP_COMPLETE_REP, signal,
             signal->length(), JBB);
reply:
  /* Only ACK masters whose version implements SUB_GCP_COMPLETE_ACK. */
  Uint32 nodeId = refToNode(masterRef);
  if (!ndbd_dih_sub_gcp_complete_ack(getNodeInfo(nodeId).m_version))
  {
    jam();
    return;
  }

  SubGcpCompleteAck* ack = CAST_PTR(SubGcpCompleteAck,
                                    signal->getDataPtrSend());
  ack->rep = rep;
  ack->rep.senderRef = reference();
  sendSignal(masterRef, GSN_SUB_GCP_COMPLETE_ACK,
             signal, SubGcpCompleteAck::SignalLength, JBA);
}
15047 
15048 /*****************************************************************************/
15049 //******     RECEIVING   TAMPER   REQUEST   FROM    NDBAPI             ******
15050 /*****************************************************************************/
/**
 * DIHNDBTAMPER: test/tamper request (e.g. from NDBAPI).
 *   theData[0] = action:
 *     1 - set cgcpOrderBlocked (forwarded to master if we are not master)
 *     2 - clear cgcpOrderBlocked (forwarded to master if not master)
 *     3 - invalid, crashes via ndbrequire
 *     4 - reply crestartGci to the given user block reference
 *     5 - (ERROR_INSERT builds only) redirect an error-insert value:
 *         30000-range -> master DIH, 40000-range -> first alive non-master DIH
 *   theData[1] = user pointer, theData[2] = user block reference.
 */
void Dbdih::execDIHNDBTAMPER(Signal* signal)
{
  jamEntry();
  Uint32 tcgcpblocked = signal->theData[0];
  /* ACTION TO BE TAKEN BY DIH */
  Uint32 tuserpointer = signal->theData[1];
  BlockReference tuserblockref = signal->theData[2];
  switch (tcgcpblocked) {
  case 1:
    jam();
    if (isMaster()) {
      jam();
      cgcpOrderBlocked = 1;
    } else {
      jam();
      /* TRANSFER THE REQUEST */
      /* TO MASTER*/
      signal->theData[0] = tcgcpblocked;
      signal->theData[1] = tuserpointer;
      signal->theData[2] = tuserblockref;
      sendSignal(cmasterdihref, GSN_DIHNDBTAMPER, signal, 3, JBB);
    }//if
    break;
  case 2:
    jam();
    if (isMaster()) {
      jam();
      cgcpOrderBlocked = 0;
    } else {
      jam();
      /* TRANSFER THE REQUEST */
      /* TO MASTER*/
      signal->theData[0] = tcgcpblocked;
      signal->theData[1] = tuserpointer;
      signal->theData[2] = tuserblockref;
      sendSignal(cmasterdihref, GSN_DIHNDBTAMPER, signal, 3, JBB);
    }//if
    break;
  case 3:
    /* Action 3 is not supported: deliberate crash. */
    ndbrequire(false);
    return;
    break;
  case 4:
    jam();
    /* Return the restart GCI to the requester. */
    signal->theData[0] = tuserpointer;
    signal->theData[1] = crestartGci;
    sendSignal(tuserblockref, GSN_DIHNDBTAMPER, signal, 2, JBB);
    break;
#ifdef ERROR_INSERT
  case 5:
    jam();
    if (tuserpointer >= 30000 && tuserpointer < 40000) {
      jam();
      /*--------------------------------------------------------------------*/
      // Redirect errors to master DIH in the 30000-range.
      /*--------------------------------------------------------------------*/
      tuserblockref = cmasterdihref;
      tuserpointer -= 30000;
      signal->theData[0] = 5;
      signal->theData[1] = tuserpointer;
      signal->theData[2] = tuserblockref;
      sendSignal(tuserblockref, GSN_DIHNDBTAMPER, signal, 3, JBB);
      return;
    } else if (tuserpointer >= 40000 && tuserpointer < 50000) {
      NodeRecordPtr localNodeptr;
      Uint32 Tfound = 0;
      jam();
      /*--------------------------------------------------------------------*/
      // Redirect errors to non-master DIH in the 40000-range.
      /*--------------------------------------------------------------------*/
      tuserpointer -= 40000;
      /* Pick the first alive node that is not the master. */
      for (localNodeptr.i = 1;
           localNodeptr.i < MAX_NDB_NODES;
           localNodeptr.i++) {
        jam();
        ptrAss(localNodeptr, nodeRecord);
        if ((localNodeptr.p->nodeStatus == NodeRecord::ALIVE) &&
            (localNodeptr.i != cmasterNodeId)) {
          jam();
          tuserblockref = calcDihBlockRef(localNodeptr.i);
          Tfound = 1;
          break;
        }//if
      }//for
      if (Tfound == 0) {
        jam();
        /*-------------------------------------------------------------------*/
        // Ignore since no non-master node existed.
        /*-------------------------------------------------------------------*/
        return;
      }//if
      signal->theData[0] = 5;
      signal->theData[1] = tuserpointer;
      signal->theData[2] = tuserblockref;
      sendSignal(tuserblockref, GSN_DIHNDBTAMPER, signal, 3, JBB);
      return;
    } else {
      jam();
      /* Values outside the redirect ranges are ignored here. */
      return;
    }//if
    break;
#endif
  default:
    /* Unknown action: deliberate crash. */
    ndbrequire(false);
    break;
  }//switch
  return;
}//Dbdih::execDIHNDBTAMPER()
15159 
15160 /*****************************************************************************/
15161 /* **********     FILE HANDLING MODULE                           *************/
15162 /*****************************************************************************/
/**
 * Start a COPY_GCIREQ round for the given reason (distribute updated
 * restart info / GCI).  Only one copy round may be active at a time;
 * up to CopyGCIMaster::WAIT_CNT further reasons are queued in
 * m_waiting[] and popped when the active round completes (see
 * execCOPY_GCICONF).
 */
void Dbdih::copyGciLab(Signal* signal, CopyGCIReq::CopyReason reason)
{
  if(c_copyGCIMaster.m_copyReason != CopyGCIReq::IDLE)
  {
    jam();
    /**
     * A round is already active: queue this reason in the first free
     * waiting slot.  There can currently only be WAIT_CNT waiting.
     */
    for (Uint32 i = 0; i<CopyGCIMaster::WAIT_CNT; i++)
    {
      jam();
      if (c_copyGCIMaster.m_waiting[i] == CopyGCIReq::IDLE)
      {
        jam();
        c_copyGCIMaster.m_waiting[i] = reason;
        return;
      }
    }

    /**
     * Code should *not* request more than WAIT_CNT copy-gci's
     *   so this is an internal error
     */
    ndbrequire(false);
    return;
  }
  c_copyGCIMaster.m_copyReason = reason;

#ifdef ERROR_INSERT
  if (reason == CopyGCIReq::GLOBAL_CHECKPOINT && ERROR_INSERTED(7189))
  {
    /* Test hook: send COPY_GCI to a random subset of nodes, then kill
     * this node after a delay. */
    sendToRandomNodes("COPY_GCI",
                      signal, &c_COPY_GCIREQ_Counter, &Dbdih::sendCOPY_GCIREQ);
    signal->theData[0] = 9999;
    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);
    return;
  }
#endif

  if (reason == CopyGCIReq::RESTART_NR)
  {
    jam();
    if (c_nodeStartMaster.startNode != RNIL)
    {
      jam();
      /* RESTART_NR only needs to reach the currently starting node. */
      c_COPY_GCIREQ_Counter.clearWaitingFor();
      c_COPY_GCIREQ_Counter.setWaitingFor(c_nodeStartMaster.startNode);
      sendCOPY_GCIREQ(signal, c_nodeStartMaster.startNode, RNIL);
      return;
    }
    else
    {
      jam();
      /**
       * No starting node any more: drop this reason and pop the next
       * queued one instead (shift the wait queue one step forward).
       */
      reason = c_copyGCIMaster.m_copyReason = c_copyGCIMaster.m_waiting[0];
      for (Uint32 i = 1; i<CopyGCIMaster::WAIT_CNT; i++)
      {
        jam();
        c_copyGCIMaster.m_waiting[i-1] = c_copyGCIMaster.m_waiting[i];
      }
      c_copyGCIMaster.m_waiting[CopyGCIMaster::WAIT_CNT-1] =
        CopyGCIReq::IDLE;

      if (reason == CopyGCIReq::IDLE)
      {
        jam();
        /* Nothing queued: we are done. */
        return;
      }
      // fall-through: start the popped reason below
    }
  }

  /* Broadcast COPY_GCIREQ to all participating nodes. */
  sendLoopMacro(COPY_GCIREQ, sendCOPY_GCIREQ, RNIL);

}//Dbdih::copyGciLab()
15237 
15238 /* ------------------------------------------------------------------------- */
15239 /* COPY_GCICONF                           RESPONSE TO COPY_GCIREQ            */
15240 /* ------------------------------------------------------------------------- */
/**
 * COPY_GCICONF: a node confirms the outstanding COPY_GCIREQ.  Once all
 * nodes have confirmed, perform the per-reason follow-up action and
 * pop the next queued copy reason, if any (restarted via
 * CONTINUEB/ZCOPY_GCI).
 */
void Dbdih::execCOPY_GCICONF(Signal* signal)
{
  jamEntry();
  NodeRecordPtr senderNodePtr;
  senderNodePtr.i = signal->theData[0];
  /* NOTE(review): receiveLoopMacro appears to return from this function
   * until all nodes have replied -- confirm against the macro definition. */
  receiveLoopMacro(COPY_GCIREQ, senderNodePtr.i);

  CopyGCIReq::CopyReason current = c_copyGCIMaster.m_copyReason;
  c_copyGCIMaster.m_copyReason = CopyGCIReq::IDLE;

  /* ok tracks that the reason was a legal one; IDLE is caught below. */
  bool ok = false;
  switch(current){
  case CopyGCIReq::RESTART:{
    ok = true;
    jam();
    /* System restart: proceed by asking DICT to start with this GCI. */
    DictStartReq * req = (DictStartReq*)&signal->theData[0];
    req->restartGci = SYSFILE->newestRestorableGCI;
    req->senderRef = reference();
    sendSignal(cdictblockref, GSN_DICTSTARTREQ,
               signal, DictStartReq::SignalLength, JBB);
    break;
  }
  case CopyGCIReq::LOCAL_CHECKPOINT:{
    ok = true;
    jam();
    /* Restart info distributed: start the LCP round proper. */
    startLcpRoundLab(signal);
    break;
  }
  case CopyGCIReq::GLOBAL_CHECKPOINT:
  {
    ok = true;
    jam();

    /************************************************************************/
    // Report the event that a global checkpoint has completed.
    /************************************************************************/
    signal->setTrace(0);
    signal->theData[0] = NDB_LE_GlobalCheckpointCompleted; //Event type
    signal->theData[1] = m_gcp_save.m_gci;
    sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);

    c_newest_restorable_gci = m_gcp_save.m_gci;
#ifdef ERROR_INSERT
    if ((ERROR_INSERTED(7222) || ERROR_INSERTED(7223)) &&
        !Sysfile::getLCPOngoing(SYSFILE->systemRestartBits) &&
        c_newest_restorable_gci >= c_lcpState.lcpStopGcp)
    {
      if (ERROR_INSERTED(7222))
      {
        /* Test hook: kill all other COPY_TAB participants, then
         * ourselves (delayed), and tamper QMGR directly. */
        sendLoopMacro(COPY_TABREQ, nullRoutine, 0);
        NodeReceiverGroup rg(CMVMI, c_COPY_TABREQ_Counter);

        rg.m_nodes.clear(getOwnNodeId());
        if (!rg.m_nodes.isclear())
        {
          signal->theData[0] = 9999;
          sendSignal(rg, GSN_NDB_TAMPER, signal, 1, JBA);
        }
        signal->theData[0] = 9999;
        sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);

        signal->theData[0] = 932;
        EXECUTE_DIRECT(QMGR, GSN_NDB_TAMPER, signal, 1);

        return;
      }
      if (ERROR_INSERTED(7223))
      {
        /* Test hook: kill node c_error_insert_extra. */
        CLEAR_ERROR_INSERT_VALUE;
        signal->theData[0] = 9999;
        sendSignal(numberToRef(CMVMI, c_error_insert_extra)
                   , GSN_NDB_TAMPER, signal, 1, JBA);
      }
    }
#endif

    if (m_micro_gcp.m_enabled == false)
    {
      jam();
      /**
       * Running old protocol
       */
      signal->theData[0] = DihContinueB::ZSTART_GCP;
      sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
    }
    m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_IDLE;

    CRASH_INSERTION(7004);
    /* Release any clients waiting for this GCP to become durable. */
    emptyWaitGCPMasterQueue(signal,
                            Uint64(m_gcp_save.m_gci) << 32,
                            c_waitGCPMasterList);
    break;
  }
  case CopyGCIReq::INITIAL_START_COMPLETED:
    ok = true;
    jam();
    /* Nothing further to do for this reason. */
    break;
  case CopyGCIReq::IDLE:
    ok = false;
    jam();
    break;
  case CopyGCIReq::RESTART_NR:
    ok = true;
    jam();
    /* Starting node has received the restart info: continue inclusion. */
    startme_copygci_conf(signal);
    break;
  }
  ndbrequire(ok);


  /* Shift the wait queue: first waiting reason becomes active. */
  c_copyGCIMaster.m_copyReason = c_copyGCIMaster.m_waiting[0];
  for (Uint32 i = 1; i<CopyGCIMaster::WAIT_CNT; i++)
  {
    jam();
    c_copyGCIMaster.m_waiting[i-1] = c_copyGCIMaster.m_waiting[i];
  }
  c_copyGCIMaster.m_waiting[CopyGCIMaster::WAIT_CNT-1] = CopyGCIReq::IDLE;

  /**
   * Pop queue
   */
  if(c_copyGCIMaster.m_copyReason != CopyGCIReq::IDLE)
  {
    jam();

    signal->theData[0] = DihContinueB::ZCOPY_GCI;
    signal->theData[1] = c_copyGCIMaster.m_copyReason;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
  }
}//Dbdih::execCOPY_GCICONF()
15371 
/**
 * Scan the node records starting from nodeId and decide whether any
 * node is currently in the middle of a node restart.  Replies to ref
 * with CHECK_NODE_RESTARTCONF: 1 = a restart is ongoing (LCP should be
 * sped up), 0 = no restart ongoing.
 */
void
Dbdih::check_node_in_restart(Signal *signal,
                             BlockReference ref,
                             Uint32 nodeId)
{
  NodeRecordPtr nodePtr;
  for (nodePtr.i = nodeId; nodePtr.i < MAX_NDB_NODES; nodePtr.i++)
  {
    jam();
    ptrAss(nodePtr, nodeRecord);
    if (nodePtr.p->nodeGroup == RNIL ||
        nodePtr.p->nodeRecoveryStatus == NodeRecord::NOT_DEFINED_IN_CLUSTER ||
        nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_NOT_RESTARTED_YET ||
        nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_FAILED ||
        nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_FAILURE_COMPLETED ||
        nodePtr.p->nodeRecoveryStatus == NodeRecord::ALLOCATED_NODE_ID ||
        nodePtr.p->nodeRecoveryStatus == NodeRecord::RESTART_COMPLETED ||
        nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_ACTIVE)
    {
      /**
       * Nodes that aren't part of a node group won't be part of LCPs,
       * Nodes not defined in Cluster we can ignore
       * Nodes not restarted yet while we were started have no impact
       * on LCP speed, if they restart while we restart doesn't matter
       * since in this case we will run at a speed for starting nodes.
       * Nodes recently failed and even those that completed will speed
       * up LCPs temporarily but using the c_increase_lcp_speed_after_nf
       * variable instead.
       * Nodes that have allocated a node id haven't really started yet.
       * Nodes that have completed their restart also need no speed up.
       */
      continue;
    }
    /**
     * All other states indicate that the node is in some or the other
     * node restart state, so thus it is a good idea to speed up LCP
     * processing.
     */
    jam();
    jamLine(nodePtr.i);
    sendCHECK_NODE_RESTARTCONF(signal, ref, 1);
    return;
  }
  jam();
  /* All nodes are up and running, no restart is ongoing */
  sendCHECK_NODE_RESTARTCONF(signal, ref, 0);
  return;
}
15420 
sendCHECK_NODE_RESTARTCONF(Signal * signal,BlockReference ref,Uint32 node_restart)15421 void Dbdih::sendCHECK_NODE_RESTARTCONF(Signal *signal,
15422                                         BlockReference ref,
15423                                         Uint32 node_restart)
15424 {
15425   signal->theData[0] = node_restart;
15426   sendSignal(ref, GSN_CHECK_NODE_RESTARTCONF, signal, 1, JBB);
15427 }
15428 
execCHECK_NODE_RESTARTREQ(Signal * signal)15429 void Dbdih::execCHECK_NODE_RESTARTREQ(Signal *signal)
15430 {
15431   NodeRecordPtr nodePtr;
15432   Uint32 ref = signal->theData[0];
15433   jamEntry();
15434   /**
15435    * No signal data sent, this signal is sent to
15436    * check if we have any nodes that are currently
15437    * part of a LCP which is not yet been started.
15438    */
15439   if (c_increase_lcp_speed_after_nf == true)
15440   {
15441     /**
15442      * A node recently failed, we will run LCP faster until this LCP
15443      * has completed to ensure that we quickly get to a point where
15444      * we can copy the distribution and dictionary information.
15445      */
15446     jam();
15447     sendCHECK_NODE_RESTARTCONF(signal, ref, 1);
15448     return;
15449   }
15450   Uint32 start_node = 1;
15451   check_node_in_restart(signal, ref, start_node);
15452   return;
15453 }
15454 
/**
 * After a system restart where the latest LCP must be invalidated:
 * step back to the previous LCP id, clear the LCP-ongoing flag, and
 * for each node that did NOT participate in that LCP move its active
 * status one step back (towards NS_Active) so it is not aged too fast.
 * Finally republish the restart info bits in the sysfile.
 */
void Dbdih::invalidateLcpInfoAfterSr(Signal* signal)
{
  NodeRecordPtr nodePtr;
  SYSFILE->latestLCP_ID--;
  Sysfile::clearLCPOngoing(SYSFILE->systemRestartBits);
  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
    jam();
    ptrAss(nodePtr, nodeRecord);
    if (!NdbNodeBitmask::get(SYSFILE->lcpActive, nodePtr.i)){
      jam();
      /* ------------------------------------------------------------------- */
      // The node was not active in the local checkpoint.
      // To avoid that we step the active status too fast to not
      // active we step back one step from Sysfile::NS_ActiveMissed_x.
      /* ------------------------------------------------------------------- */
      switch (nodePtr.p->activeStatus) {
      case Sysfile::NS_Active:
        /* Already fully active: self-assignment is a no-op (kept for
         * symmetry with the cases below). */
        nodePtr.p->activeStatus = Sysfile::NS_Active;
        break;
      case Sysfile::NS_ActiveMissed_1:
        jam();
        nodePtr.p->activeStatus = Sysfile::NS_Active;
        break;
      case Sysfile::NS_ActiveMissed_2:
        jam();
        nodePtr.p->activeStatus = Sysfile::NS_ActiveMissed_1;
        break;
      default:
        jam();
        /* Other statuses are left unchanged. */
        break;
      }//switch
    }
    else
    {
      jam();
      /* Node took part in the LCP: it is expected to be fully active. */
      ndbassert(nodePtr.p->activeStatus == Sysfile::NS_Active);
    }
  }//for
  setNodeRestartInfoBits(signal);
}//Dbdih::invalidateLcpInfoAfterSr()
15495 
15496 /* ------------------------------------------------------------------------- */
15497 /*       THE NEXT STEP IS TO WRITE THE FILE.                                 */
15498 /* ------------------------------------------------------------------------- */
openingCopyGciSkipInitLab(Signal * signal,FileRecordPtr filePtr)15499 void Dbdih::openingCopyGciSkipInitLab(Signal* signal, FileRecordPtr filePtr)
15500 {
15501   writeRestorableGci(signal, filePtr);
15502   filePtr.p->reqStatus = FileRecord::WRITING_COPY_GCI;
15503   return;
15504 }//Dbdih::openingCopyGciSkipInitLab()
15505 
/**
 * A write of one of the two redundant restart-info ("copy GCI") files
 * has completed.  If this was the first file, write the second one;
 * once both are written, run the per-reason completion actions (for
 * GLOBAL_CHECKPOINT: notify LGMAN/LQH, optional timing report) and
 * report COPY_GCICONF back to the requesting master DIH.
 */
void Dbdih::writingCopyGciLab(Signal* signal, FileRecordPtr filePtr)
{
  /* ----------------------------------------------------------------------- */
  /*     WE HAVE NOW WRITTEN THIS FILE. WRITE ALSO NEXT FILE IF THIS IS NOT  */
  /*     ALREADY THE LAST.                                                   */
  /* ----------------------------------------------------------------------- */
  CRASH_INSERTION(7219);

  filePtr.p->reqStatus = FileRecord::IDLE;
  if (filePtr.i == crestartInfoFile[0]) {
    jam();
    /* First file done: continue with the second restart-info file,
     * opening it first if it is not already open. */
    filePtr.i = crestartInfoFile[1];
    ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
    if (filePtr.p->fileStatus == FileRecord::OPEN) {
      jam();
      openingCopyGciSkipInitLab(signal, filePtr);
      return;
    }//if
    openFileRw(signal, filePtr);
    filePtr.p->reqStatus = FileRecord::OPENING_COPY_GCI;
    return;
  }//if
  /* ----------------------------------------------------------------------- */
  /*     WE HAVE COMPLETED WRITING BOTH FILES SUCCESSFULLY. NOW REPORT OUR   */
  /*     SUCCESS TO THE MASTER DIH. BUT FIRST WE NEED TO RESET A NUMBER OF   */
  /*     VARIABLES USED BY THE LOCAL CHECKPOINT PROCESS (ONLY IF TRIGGERED   */
  /*     BY LOCAL CHECKPOINT PROCESS.                                        */
  /* ----------------------------------------------------------------------- */
  CopyGCIReq::CopyReason reason = c_copyGCISlave.m_copyReason;

  if (reason == CopyGCIReq::GLOBAL_CHECKPOINT) {
    jam();
    m_gcp_save.m_state = GcpSave::GCP_SAVE_IDLE;

    /* Tell LGMAN that this GCI is now durable on disk. */
    SubGcpCompleteRep * const rep = (SubGcpCompleteRep*)signal->getDataPtr();
    rep->gci_hi = SYSFILE->newestRestorableGCI;
    rep->gci_lo = 0;
    rep->flags = SubGcpCompleteRep::ON_DISK;

    sendSignal(LGMAN_REF, GSN_SUB_GCP_COMPLETE_REP, signal,
               SubGcpCompleteRep::SignalLength, JBB);

    /* NOTE(review): jamEntry() directly after sendSignal suggests the
     * signal may be executed directly in the same thread -- confirm. */
    jamEntry();

    if (m_micro_gcp.m_enabled == false)
    {
      jam();
      /* Old (non micro-GCP) protocol: DIH forwards the completion to
       * LQH itself and closes the micro GCP here. */
      sendSignal(DBLQH_REF, GSN_SUB_GCP_COMPLETE_REP, signal,
                 SubGcpCompleteRep::SignalLength, JBB);
      jamEntry();
      ndbrequire(m_micro_gcp.m_state == MicroGcp::M_GCP_COMMITTED);
      m_micro_gcp.m_state = MicroGcp::M_GCP_IDLE;

      CRASH_INSERTION(7190);
    }

#ifdef GCP_TIMER_HACK
    globalData.gcp_timer_copygci[1] = NdbTick_getCurrentTicks();

    // this is last timer point so we send local report here
    {
      const GlobalData& g = globalData;
      const Uint32 ms_commit = NdbTick_Elapsed(
	  g.gcp_timer_commit[0], g.gcp_timer_commit[1]).milliSec();
      const Uint32 ms_save = NdbTick_Elapsed(
          g.gcp_timer_save[0], g.gcp_timer_save[1]).milliSec();
      const Uint32 ms_copygci = NdbTick_Elapsed(
          g.gcp_timer_copygci[0], g.gcp_timer_copygci[1]).milliSec();

      const Uint32 ms_total = ms_commit + ms_save + ms_copygci;

      // random formula to report excessive duration
      bool report =
        g.gcp_timer_limit != 0 ?
          (ms_total > g.gcp_timer_limit) :
          (ms_total > 3000 * (1 + cgcpDelay / 1000));
      if (report)
        infoEvent("GCP %u ms: total:%u commit:%u save:%u copygci:%u",
            coldgcp, ms_total, ms_commit, ms_save, ms_copygci);
    }
#endif
  }

  jam();
  c_copyGCISlave.m_copyReason = CopyGCIReq::IDLE;

  if (reason == CopyGCIReq::GLOBAL_CHECKPOINT)
  {
    jam();
    /* GCP save: reply to the GCP-save master reference. */
    signal->theData[0] = c_copyGCISlave.m_senderData;
    sendSignal(m_gcp_save.m_master_ref, GSN_COPY_GCICONF, signal, 1, JBB);
  }
  else if (c_copyGCISlave.m_senderRef == cmasterdihref)
  {
    jam();
    /**
     * Only if same master
     */
    signal->theData[0] = c_copyGCISlave.m_senderData;
    sendSignal(c_copyGCISlave.m_senderRef, GSN_COPY_GCICONF, signal, 1, JBB);
  }
  return;
}//Dbdih::writingCopyGciLab()
15609 
execSTART_LCP_REQ(Signal * signal)15610 void Dbdih::execSTART_LCP_REQ(Signal* signal)
15611 {
15612   jamEntry();
15613   StartLcpReq * req = (StartLcpReq*)signal->getDataPtr();
15614 
15615   if (getNodeInfo(refToNode(req->senderRef)).m_version >=
15616       NDBD_SUPPORT_PAUSE_LCP)
15617   {
15618     if (req->pauseStart == StartLcpReq::PauseLcpStartFirst)
15619     {
15620       /**
15621        * The message was sent as part of start of LCPs when PAUSE LCP was used.
15622        * We have paused the LCP protocol and we are preparing to copy the
15623        * meta data. Before copying the metadata we need access to the
15624        * m_participatingLQH bitmap of nodes participating in the LCP.
15625        */
15626       jam();
15627       ndbrequire(cmasterdihref == req->senderRef);
15628       m_local_lcp_state.init(req);
15629       c_lcpState.m_participatingDIH = req->participatingDIH;
15630       c_lcpState.m_participatingLQH = req->participatingLQH;
15631       c_lcpState.m_masterLcpDihRef = cmasterdihref;
15632       c_lcpState.setLcpStatus(LCP_STATUS_ACTIVE, __LINE__);
15633       /**
15634        * We need to update the SYSFILE since it can take some time before we
15635        * have this number updated after a COPY_GCIREQ in connection to a
15636        * GCP.
15637        */
15638       SYSFILE->latestLCP_ID = req->lcpId;
15639 
15640       {
15641         char buf[100];
15642         g_eventLogger->info("c_lcpState.m_participatingLQH bitmap= %s",
15643             c_lcpState.m_participatingLQH.getText(buf));
15644         g_eventLogger->info("c_lcpState.m_participatingDIH bitmap= %s",
15645             c_lcpState.m_participatingDIH.getText(buf));
15646       }
15647 
15648       ndbrequire(!req->participatingDIH.get(getOwnNodeId()));
15649       c_lcpState.m_participatingDIH.set(getOwnNodeId());
15650 
15651       StartLcpConf * conf = (StartLcpConf*)signal->getDataPtrSend();
15652       conf->senderRef = reference();
15653       sendSignal(c_lcpState.m_masterLcpDihRef, GSN_START_LCP_CONF, signal,
15654                  StartLcpConf::SignalLength, JBB);
15655       return;
15656     }
15657     if (req->pauseStart == StartLcpReq::PauseLcpStartSecond)
15658     {
15659       /**
15660        * We get the set of already completed LQHs from the master node.
15661        * No need to know anything about completed DIHs since only the
15662        * master keeps this information.
15663        *
15664        * This signal arrives after copying the meta data. Since we are
15665        * included into the LCP we verify that there is at least one
15666        * fragment replica that still hasn't arrived being ready with
15667        * the LCP execution.
15668        */
15669       jam();
15670       ndbrequire(c_lcpState.lcpStatus == LCP_STATUS_ACTIVE);
15671       ndbrequire(cmasterdihref == req->senderRef);
15672       ndbrequire(c_lcpState.m_masterLcpDihRef == cmasterdihref);
15673       c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH = req->participatingLQH;
15674       c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.clearWaitingFor();
15675       c_lcpState.m_LCP_COMPLETE_REP_From_Master_Received = false;
15676 
15677       c_current_time = NdbTick_getCurrentTicks();
15678       c_lcpState.m_start_time = c_current_time;
15679 
15680       g_eventLogger->info("Our node now in LCP execution after pausing LCP");
15681       g_eventLogger->info("LCP_COMPLETE_REP_Counter_LQH bitmap= %s",
15682           c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.getText());
15683 
15684       ndbrequire(!checkLcpAllTablesDoneInLqh(__LINE__));
15685 
15686       StartLcpConf * conf = (StartLcpConf*)signal->getDataPtrSend();
15687       conf->senderRef = reference();
15688       sendSignal(c_lcpState.m_masterLcpDihRef, GSN_START_LCP_CONF, signal,
15689                  StartLcpConf::SignalLength, JBB);
15690       return;
15691     }
15692     ndbrequire(req->pauseStart == StartLcpReq::NormalLcpStart);
15693   }
15694   /**
15695    * Init m_local_lcp_state
15696    */
15697   m_local_lcp_state.init(req);
15698 
15699   if (!isMaster())
15700   {
15701     jam();
15702     c_current_time = NdbTick_getCurrentTicks();
15703     c_lcpState.m_start_time = c_current_time;
15704   }
15705 
15706   CRASH_INSERTION2(7021, isMaster());
15707   CRASH_INSERTION2(7022, !isMaster());
15708 
15709   for (Uint32 nodeId = 1; nodeId < MAX_NDB_NODES; nodeId++)
15710   {
15711     /**
15712      * We could have a race here, a node could die while the START_LCP_REQ
15713      * is in flight. We need remove the node from the set of nodes
15714      * participating in this case. Not removing it here could lead to a
15715      * potential LCP deadlock.
15716      *
15717      * For the PAUSE LCP code where we are included in the LCP we don't need
15718      * to worry about this. If any node fails in the state of me being
15719      * started, I will fail as well.
15720      */
15721     NodeRecordPtr nodePtr;
15722     if (req->participatingDIH.get(nodeId) ||
15723         req->participatingLQH.get(nodeId))
15724     {
15725       nodePtr.i = nodeId;
15726       ptrAss(nodePtr, nodeRecord);
15727       if (nodePtr.p->nodeStatus != NodeRecord::ALIVE)
15728       {
15729         jam();
15730         jamLine(nodeId);
15731         req->participatingDIH.clear(nodeId);
15732         req->participatingLQH.clear(nodeId);
15733       }
15734     }
15735   }
15736   c_lcpState.m_participatingDIH = req->participatingDIH;
15737   c_lcpState.m_participatingLQH = req->participatingLQH;
15738 
15739   for (Uint32 nodeId = 1; nodeId < MAX_NDB_NODES; nodeId++)
15740   {
15741     /**
15742      * We could have a race here, a node could die while the START_LCP_REQ
15743      * is in flight. We need remove the node from the set of nodes
15744      * participating in this case. Not removing it here could lead to a
15745      * potential LCP deadlock.
15746      *
15747      * For the PAUSE LCP code where we are included in the LCP we don't need
15748      * to worry about this. If any node fails in the state of me being
15749      * started, I will fail as well.
15750      */
15751     NodeRecordPtr nodePtr;
15752     if (req->participatingDIH.get(nodeId) ||
15753         req->participatingLQH.get(nodeId))
15754     {
15755       nodePtr.i = nodeId;
15756       ptrAss(nodePtr, nodeRecord);
15757       if (nodePtr.p->nodeStatus != NodeRecord::ALIVE)
15758       {
15759         jam();
15760         jamLine(nodeId);
15761         req->participatingDIH.clear(nodeId);
15762         req->participatingLQH.clear(nodeId);
15763       }
15764     }
15765   }
15766 
15767   c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH = req->participatingLQH;
15768   if(isMaster())
15769   {
15770     jam();
15771     c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH = req->participatingDIH;
15772   }
15773   else
15774   {
15775     jam();
15776     c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.clearWaitingFor();
15777   }
15778 
15779   c_lcpState.m_LCP_COMPLETE_REP_From_Master_Received = false;
15780 
15781   c_lcpState.setLcpStatus(LCP_INIT_TABLES, __LINE__);
15782 
15783   ndbrequire(c_lcpState.m_masterLcpDihRef == req->senderRef);
15784 
15785   signal->theData[0] = DihContinueB::ZINIT_LCP;
15786   signal->theData[1] = c_lcpState.m_masterLcpDihRef;
15787   signal->theData[2] = 0;
15788   if (ERROR_INSERTED(7021))
15789   {
15790     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 3);
15791   }
15792   else
15793   {
15794     sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
15795   }
15796 }
15797 
15798 void
reset()15799 Dbdih::LocalLCPState::reset()
15800 {
15801   m_state = LS_INITIAL;
15802   m_keep_gci = RNIL;
15803   m_stop_gci = RNIL;
15804 }
15805 
15806 void
init(const StartLcpReq * req)15807 Dbdih::LocalLCPState::init(const StartLcpReq * req)
15808 {
15809   m_state = LS_RUNNING;
15810   m_start_lcp_req = *req;
15811   m_keep_gci = ~(Uint32)0;
15812   m_stop_gci = 0;
15813 }
15814 
15815 void
lcp_frag_rep(const LcpFragRep * rep)15816 Dbdih::LocalLCPState::lcp_frag_rep(const LcpFragRep * rep)
15817 {
15818   assert(m_state == LS_RUNNING);
15819   if (rep->maxGciCompleted < m_keep_gci)
15820   {
15821     m_keep_gci = rep->maxGciCompleted;
15822   }
15823 
15824   if (rep->maxGciStarted > m_stop_gci)
15825   {
15826     m_stop_gci = rep->maxGciStarted;
15827   }
15828 }
15829 
15830 void
lcp_complete_rep(Uint32 gci)15831 Dbdih::LocalLCPState::lcp_complete_rep(Uint32 gci)
15832 {
15833   assert(m_state == LS_RUNNING);
15834   m_state = LS_COMPLETE;
15835   if (gci > m_stop_gci)
15836     m_stop_gci = gci;
15837 }
15838 
15839 bool
check_cut_log_tail(Uint32 gci) const15840 Dbdih::LocalLCPState::check_cut_log_tail(Uint32 gci) const
15841 {
15842   if (m_state == LS_COMPLETE)
15843   {
15844     if (gci >= m_stop_gci)
15845       return true;
15846   }
15847   return false;
15848 }
15849 
/**
 * CONTINUEB(ZINIT_LCP): prepare tables for the starting LCP, one table
 * per invocation (continuing via CONTINUEB to avoid hogging the
 * scheduler). For each active, logged table, replicas hosted by LQHs
 * participating in the LCP get lcpOngoingFlag set and are counted into
 * the fragment's noLcpReplicas. When the last table is done the LCP
 * status is set ACTIVE and START_LCP_CONF is sent to the master.
 *
 * senderRef must still be the master's DIH reference; if a master
 * take-over happened since the round was scheduled, the round is
 * silently aborted.
 */
void Dbdih::initLcpLab(Signal* signal, Uint32 senderRef, Uint32 tableId)
{
  TabRecordPtr tabPtr;
  tabPtr.i = tableId;

  if (c_lcpState.m_masterLcpDihRef != senderRef ||
      c_lcpState.m_masterLcpDihRef != cmasterdihref)
  {
    /**
     * This is LCP master takeover...abort
     */
    jam();
    return;
  }

  //const Uint32 lcpId = SYSFILE->latestLCP_ID;

  for(; tabPtr.i < ctabFileSize; tabPtr.i++){

    ptrAss(tabPtr, tabRecord);

    // Tables that are not active take no part in the LCP.
    if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE)
    {
      jam();
      tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED;
      continue;
    }

    if (tabPtr.p->tabStorage != TabRecord::ST_NORMAL) {
      /**
       * Table is not logged
       */
      jam();
      tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED;
      continue;
    }

    if (tabPtr.p->tabCopyStatus != TabRecord::CS_IDLE) {
      /* ----------------------------------------------------------------- */
      // We protect the updates of table data structures by this variable.
      /* ----------------------------------------------------------------- */
      // Table busy elsewhere: retry this same table after a short delay.
      jam();
      signal->theData[0] = DihContinueB::ZINIT_LCP;
      signal->theData[1] = senderRef;
      signal->theData[2] = tabPtr.i;
      sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                          WaitTableStateChangeMillis, 3);
      return;
    }//if

    /**
     * Found a table
     */
    tabPtr.p->tabLcpStatus = TabRecord::TLS_ACTIVE;

    /**
     * For each fragment
     */
    for (Uint32 fragId = 0; fragId < tabPtr.p->totalfragments; fragId++) {
      jam();
      FragmentstorePtr fragPtr;
      getFragstore(tabPtr.p, fragId, fragPtr);

      /**
       * For each of replica record
       */
      Uint32 replicaCount = 0;
      ReplicaRecordPtr replicaPtr;
      for(replicaPtr.i = fragPtr.p->storedReplicas; replicaPtr.i != RNIL;
          replicaPtr.i = replicaPtr.p->nextPool) {
        jam();

        c_replicaRecordPool.getPtr(replicaPtr);
        Uint32 nodeId = replicaPtr.p->procNode;
        if(c_lcpState.m_participatingLQH.get(nodeId)){
          jam();
          replicaCount++;
          replicaPtr.p->lcpOngoingFlag = true;
        }
        else if (replicaPtr.p->lcpOngoingFlag)
        {
          // Replica's node no longer participates: clear any stale flag.
          jam();
          replicaPtr.p->lcpOngoingFlag = false;
        }
      }

      fragPtr.p->noLcpReplicas = replicaCount;
    }//for

    // Continue with the next table in a fresh CONTINUEB round.
    signal->theData[0] = DihContinueB::ZINIT_LCP;
    signal->theData[1] = senderRef;
    signal->theData[2] = tabPtr.i + 1;
    if (ERROR_INSERTED(7021))
    {
      sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 3);
    }
    else
    {
      sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
    }
    return;
  }

  /**
   * No more tables
   */
  jam();
  if (ERROR_INSERTED(7236))
  {
    // delay 20s before completing last CONTINUEB(ZINIT_LCP)
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 20000, 3);
    CLEAR_ERROR_INSERT_VALUE;
    return;
  }

  c_lcpState.setLcpStatus(LCP_STATUS_ACTIVE, __LINE__);

  CRASH_INSERTION2(7023, isMaster());
  CRASH_INSERTION2(7024, !isMaster());

  // All tables initialized: confirm LCP start to the master DIH.
  StartLcpConf * conf = (StartLcpConf*)signal->getDataPtrSend();
  conf->senderRef = reference();
  sendSignal(c_lcpState.m_masterLcpDihRef, GSN_START_LCP_CONF, signal,
             StartLcpConf::SignalLength, JBB);
}//Dbdih::initLcpLab()
15975 
15976 /* ------------------------------------------------------------------------- */
15977 /*       ERROR HANDLING FOR COPY RESTORABLE GCI FILE.                        */
15978 /* ------------------------------------------------------------------------- */
/**
 * Opening an existing copy-restorable-GCI file failed: fall back to
 * creating the file from scratch, and track the outstanding request as
 * CREATING_COPY_GCI so the file-system completion is routed correctly.
 */
void Dbdih::openingCopyGciErrorLab(Signal* signal, FileRecordPtr filePtr)
{
  createFileRw(signal, filePtr);
  /* ------------------------------------------------------------------------- */
  /*       ERROR IN OPENING FILE. WE WILL TRY BY CREATING FILE INSTEAD.        */
  /* ------------------------------------------------------------------------- */
  filePtr.p->reqStatus = FileRecord::CREATING_COPY_GCI;
  return;
}//Dbdih::openingCopyGciErrorLab()
15988 
15989 /* ------------------------------------------------------------------------- */
15990 /*       ENTER DICTSTARTCONF WITH                                            */
15991 /*         TBLOCKREF                                                         */
15992 /* ------------------------------------------------------------------------- */
/**
 * DICTSTARTCONF: DICT has delivered all table definitions needed for
 * the restart. Kick off fragment restoration with a
 * CONTINUEB(ZSTART_FRAGMENT) loop starting at table 0, fragment 0.
 */
void Dbdih::dictStartConfLab(Signal* signal)
{
  infoEvent("Restore Database from disk Starting");
  /* ----------------------------------------------------------------------- */
  /*     WE HAVE NOW RECEIVED ALL THE TABLES TO RESTART.                     */
  /* ----------------------------------------------------------------------- */
  signal->theData[0] = DihContinueB::ZSTART_FRAGMENT;
  signal->theData[1] = 0;  /* START WITH TABLE 0    */
  signal->theData[2] = 0;  /* AND FRAGMENT 0        */
  sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
  return;
}//Dbdih::dictStartConfLab()
16005 
16006 
/**
 * A DIH table-definition file has been opened successfully during
 * restart: allocate the first page and start reading. Page 0 tells how
 * many pages the file holds; the remaining pages are allocated in
 * readingTableLab() when the first read completes.
 */
void Dbdih::openingTableLab(Signal* signal, FileRecordPtr filePtr)
{
  /* ---------------------------------------------------------------------- */
  /*    SUCCESSFULLY OPENED A FILE. READ THE FIRST PAGE OF THIS FILE.       */
  /* ---------------------------------------------------------------------- */
  TabRecordPtr tabPtr;
  PageRecordPtr pagePtr;

  tabPtr.i = filePtr.p->tabRef;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  tabPtr.p->noPages = 1;
  allocpage(pagePtr);
  tabPtr.p->pageRef[0] = pagePtr.i;
  readTabfile(signal, tabPtr.p, filePtr);
  // Read completion is dispatched on the READING_TABLE request status.
  filePtr.p->reqStatus = FileRecord::READING_TABLE;
  return;
}//Dbdih::openingTableLab()
16024 
openingTableErrorLab(Signal * signal,FileRecordPtr filePtr)16025 void Dbdih::openingTableErrorLab(Signal* signal, FileRecordPtr filePtr)
16026 {
16027   TabRecordPtr tabPtr;
16028   tabPtr.i = filePtr.p->tabRef;
16029   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
16030   /* ---------------------------------------------------------------------- */
16031   /*    WE FAILED IN OPENING A FILE. IF THE FIRST FILE THEN TRY WITH THE    */
16032   /*    DUPLICATE FILE, OTHERWISE WE REPORT AN ERROR IN THE SYSTEM RESTART. */
16033   /* ---------------------------------------------------------------------- */
16034   if (filePtr.i == tabPtr.p->tabFile[0])
16035   {
16036     filePtr.i = tabPtr.p->tabFile[1];
16037     ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
16038     openFileRw(signal, filePtr);
16039     filePtr.p->reqStatus = FileRecord::OPENING_TABLE;
16040   }
16041   else
16042   {
16043     char buf[256];
16044     BaseString::snprintf(buf, sizeof(buf),
16045 			 "Error opening DIH schema files for table: %d",
16046 			 tabPtr.i);
16047     progError(__LINE__, NDBD_EXIT_AFS_NO_SUCH_FILE, buf);
16048   }
16049 }//Dbdih::openingTableErrorLab()
16050 
/**
 * A read of table-file pages completed. Word 33 of page 0 holds the
 * total number of pages stored in the file: if more pages remain,
 * allocate them and continue reading; otherwise schedule
 * CONTINUEB(ZREAD_PAGES_INTO_TABLE) to unpack the pages into the
 * table/fragment data structures.
 */
void Dbdih::readingTableLab(Signal* signal, FileRecordPtr filePtr)
{
  TabRecordPtr tabPtr;
  PageRecordPtr pagePtr;
  /* ---------------------------------------------------------------------- */
  /*    WE HAVE SUCCESSFULLY READ A NUMBER OF PAGES IN THE TABLE FILE. IF   */
  /*    MORE PAGES EXIST IN THE FILE THEN READ ALL PAGES IN THE FILE.       */
  /* ---------------------------------------------------------------------- */
  filePtr.p->reqStatus = FileRecord::IDLE;
  tabPtr.i = filePtr.p->tabRef;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  pagePtr.i = tabPtr.p->pageRef[0];
  ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
  // Page 0, word 33 = number of pages the file contains.
  Uint32 noOfStoredPages = pagePtr.p->word[33];
  if (tabPtr.p->noPages < noOfStoredPages) {
    jam();
    ndbrequire(noOfStoredPages <= NDB_ARRAY_SIZE(tabPtr.p->pageRef));
    // Allocate the pages we have not yet read and restart the read.
    for (Uint32 i = tabPtr.p->noPages; i < noOfStoredPages; i++) {
      jam();
      allocpage(pagePtr);
      tabPtr.p->pageRef[i] = pagePtr.i;
    }//for
    tabPtr.p->noPages = noOfStoredPages;
    readTabfile(signal, tabPtr.p, filePtr);
    filePtr.p->reqStatus = FileRecord::READING_TABLE;
  } else {
    ndbrequire(tabPtr.p->noPages == pagePtr.p->word[33]);
    // tabCopyStatus guards the table data structures; it must be idle
    // before we start unpacking (claimed by the CONTINUEB job below).
    ndbrequire(tabPtr.p->tabCopyStatus == TabRecord::CS_IDLE);
    jam();
    /* --------------------------------------------------------------------- */
    /*   WE HAVE READ ALL PAGES. NOW READ FROM PAGES INTO TABLE AND FRAGMENT */
    /*   DATA STRUCTURES.                                                    */
    /* --------------------------------------------------------------------- */
    tabPtr.p->tabCopyStatus = TabRecord::CS_SR_PHASE1_READ_PAGES;
    signal->theData[0] = DihContinueB::ZREAD_PAGES_INTO_TABLE;
    signal->theData[1] = tabPtr.i;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    return;
  }//if
  return;
}//Dbdih::readingTableLab()
16092 
readTableFromPagesLab(Signal * signal,TabRecordPtr tabPtr)16093 void Dbdih::readTableFromPagesLab(Signal* signal, TabRecordPtr tabPtr)
16094 {
16095   FileRecordPtr filePtr;
16096   filePtr.i = tabPtr.p->tabFile[0];
16097   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
16098   /* ---------------------------------------------------------------------- */
16099   /*    WE HAVE NOW COPIED TO OUR NODE. WE HAVE NOW COMPLETED RESTORING     */
16100   /*    THIS TABLE. CONTINUE WITH THE NEXT TABLE.                           */
16101   /*    WE ALSO NEED TO CLOSE THE TABLE FILE.                               */
16102   /* ---------------------------------------------------------------------- */
16103   if (filePtr.p->fileStatus != FileRecord::OPEN) {
16104     jam();
16105     filePtr.i = tabPtr.p->tabFile[1];
16106     ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
16107   }//if
16108   closeFile(signal, filePtr);
16109   filePtr.p->reqStatus = FileRecord::CLOSING_TABLE_SR;
16110   return;
16111 }//Dbdih::readTableFromPagesLab()
16112 
/**
 * A table file was closed after its contents were read during system
 * restart: rebuild the table's replica lists from the on-disk state
 * (resetReplicaSr) and continue via CONTINUEB(ZCOPY_TABLE).
 */
void Dbdih::closingTableSrLab(Signal* signal, FileRecordPtr filePtr)
{
  /**
   * Update table/fragment info
   */
  TabRecordPtr tabPtr;
  tabPtr.i = filePtr.p->tabRef;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  resetReplicaSr(tabPtr);

  signal->theData[0] = DihContinueB::ZCOPY_TABLE;
  signal->theData[1] = filePtr.p->tabRef;
  sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);

  return;
}//Dbdih::closingTableSrLab()
16129 
/**
 * DIH_GET_TABINFO_REQ: a client requests the packed DIH description of
 * req.tableId. Requests are queued per table on a singly linked list of
 * connect records rooted at tabPtr.p->connectrec; the first request in
 * the queue starts the packing job via CONTINUEB(ZGET_TABINFO).
 * On failure a DIH_GET_TABINFO_REF with the error code is returned
 * immediately.
 */
void
Dbdih::execDIH_GET_TABINFO_REQ(Signal* signal)
{
  jamEntry();

  DihGetTabInfoReq req = * (DihGetTabInfoReq*)signal->getDataPtr();

  Uint32 err = 0;
  do
  {
    TabRecordPtr tabPtr;
    tabPtr.i = req.tableId;
    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

    if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE)
    {
      jam();
      err = DihGetTabInfoRef::TableNotDefined;
      break;
    }

    if (cfirstconnect == RNIL)
    {
      jam();
      err = DihGetTabInfoRef::OutOfConnectionRecords;
      break;
    }

    if (tabPtr.p->connectrec != RNIL)
    {
      jam();

      // The table already has connect records queued; they must all be
      // GET_TABINFO waiters, otherwise the table is busy elsewhere.
      ConnectRecordPtr connectPtr;
      connectPtr.i = tabPtr.p->connectrec;
      ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);

      if (connectPtr.p->connectState != ConnectRecord::GET_TABINFO)
      {
        jam();
        err = DihGetTabInfoRef::TableBusy;
        break;
      }
    }

    // Take a connect record off the free list...
    ConnectRecordPtr connectPtr;
    connectPtr.i = cfirstconnect;
    ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
    cfirstconnect = connectPtr.p->nextPool;

    // ...and push it onto the table's queue of GET_TABINFO waiters.
    connectPtr.p->nextPool = tabPtr.p->connectrec;
    tabPtr.p->connectrec = connectPtr.i;

    connectPtr.p->m_get_tabinfo.m_requestInfo = req.requestInfo;
    connectPtr.p->userpointer = req.senderData;
    connectPtr.p->userblockref = req.senderRef;
    connectPtr.p->connectState = ConnectRecord::GET_TABINFO;
    connectPtr.p->table = tabPtr.i;

    if (connectPtr.p->nextPool == RNIL)
    {
      jam();

      /**
       * we're the first...start packing...
       */
      signal->theData[0] = DihContinueB::ZGET_TABINFO;
      signal->theData[1] = tabPtr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    }

    return;
  } while (0);

  DihGetTabInfoRef * ref = (DihGetTabInfoRef*)signal->getDataPtrSend();
  ref->senderData = req.senderData;
  ref->senderRef = reference();
  ref->errorCode = err;
  sendSignal(req.senderRef, GSN_DIH_GET_TABINFO_REF, signal,
             DihGetTabInfoRef::SignalLength, JBB);
}
16210 
/**
 * CONTINUEB(ZGET_TABINFO): begin serving the queued DIH_GET_TABINFO_REQ
 * clients for a table. Retries with a delay until the table's copy
 * status is idle, then claims it (CS_GET_TABINFO) and schedules
 * CONTINUEB(ZPACK_TABLE_INTO_PAGES) to produce the packed description.
 */
void
Dbdih::getTabInfo(Signal* signal)
{
  TabRecordPtr tabPtr;
  tabPtr.i = signal->theData[1];
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

  if (tabPtr.p->tabCopyStatus != TabRecord::CS_IDLE)
  {
    jam();
    // Table data structures busy; retry this job after a short delay.
    signal->theData[0] = DihContinueB::ZGET_TABINFO;
    signal->theData[1] = tabPtr.i;
    sendSignalWithDelay(reference(),
                        GSN_CONTINUEB,
                        signal,
                        WaitTableStateChangeMillis,
                        signal->length());
    return;
  }

  tabPtr.p->tabCopyStatus  = TabRecord::CS_GET_TABINFO;

  signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
  signal->theData[1] = tabPtr.i;
  sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
}
16237 
16238 int
getTabInfo_copyTableToSection(SegmentedSectionPtr & ptr,CopyTableNode ctn)16239 Dbdih::getTabInfo_copyTableToSection(SegmentedSectionPtr & ptr,
16240                                      CopyTableNode ctn)
16241 {
16242   PageRecordPtr pagePtr;
16243   pagePtr.i = ctn.ctnTabPtr.p->pageRef[0];
16244   ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
16245 
16246   while (ctn.noOfWords > 2048)
16247   {
16248     jam();
16249     ndbrequire(import(ptr, pagePtr.p->word, 2048));
16250     ctn.noOfWords -= 2048;
16251 
16252     ctn.pageIndex++;
16253     pagePtr.i = ctn.ctnTabPtr.p->pageRef[ctn.pageIndex];
16254     ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
16255   }
16256 
16257   ndbrequire(import(ptr, pagePtr.p->word, ctn.noOfWords));
16258   return 0;
16259 }
16260 
16261 int
getTabInfo_copySectionToPages(TabRecordPtr tabPtr,SegmentedSectionPtr ptr)16262 Dbdih::getTabInfo_copySectionToPages(TabRecordPtr tabPtr,
16263                                      SegmentedSectionPtr ptr)
16264 {
16265   jam();
16266   Uint32 sz = ptr.sz;
16267   SectionReader reader(ptr, getSectionSegmentPool());
16268 
16269   while (sz)
16270   {
16271     jam();
16272     PageRecordPtr pagePtr;
16273     allocpage(pagePtr);
16274     tabPtr.p->pageRef[tabPtr.p->noPages] = pagePtr.i;
16275     tabPtr.p->noPages++;
16276 
16277     Uint32 len = sz > 2048 ? 2048 : sz;
16278     ndbrequire(reader.getWords(pagePtr.p->word, len));
16279     sz -= len;
16280   }
16281   return 0;
16282 }
16283 
/**
 * Serve the next queued GET_TABINFO client: copy the packed table
 * description (already produced into the table pages) into a segmented
 * section and ship it as a fragmented DIH_GET_TABINFO_CONF. When the
 * queue is empty, release the table (CS_IDLE). The send-complete
 * callback (getTabInfo_sendComplete) drives the next round.
 */
void
Dbdih::getTabInfo_send(Signal* signal,
                       TabRecordPtr tabPtr)
{
  ndbrequire(tabPtr.p->tabCopyStatus == TabRecord::CS_GET_TABINFO);

  ConnectRecordPtr connectPtr;
  connectPtr.i = tabPtr.p->connectrec;

  /**
   * Done
   */
  if (connectPtr.i == RNIL)
  {
    jam();
    tabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
    return;
  }

  ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);

  ndbrequire(connectPtr.p->connectState == ConnectRecord::GET_TABINFO);
  ndbrequire(connectPtr.p->table == tabPtr.i);

  /**
   * Copy into segmented sections here...
   * NOTE: A GenericSectionIterator would be nice inside kernel too
   *  or having a pack-method that writes directly into SegmentedSection
   */
  PageRecordPtr pagePtr;
  pagePtr.i = tabPtr.p->pageRef[0];
  ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
  // Word 34 of page 0 holds the packed description length in words.
  Uint32 words = pagePtr.p->word[34];

  CopyTableNode ctn;
  ctn.ctnTabPtr = tabPtr;
  ctn.pageIndex = 0;
  ctn.wordIndex = 0;
  ctn.noOfWords = words;

  SegmentedSectionPtr ptr;
  ndbrequire(getTabInfo_copyTableToSection(ptr, ctn) == 0);

  // getTabInfo_sendComplete runs once the fragmented send has finished.
  Callback cb = { safe_cast(&Dbdih::getTabInfo_sendComplete), connectPtr.i };

  SectionHandle handle(this, signal);
  handle.m_ptr[0] = ptr;
  handle.m_cnt = 1;

  DihGetTabInfoConf* conf = (DihGetTabInfoConf*)signal->getDataPtrSend();
  conf->senderData = connectPtr.p->userpointer;
  conf->senderRef = reference();
  sendFragmentedSignal(connectPtr.p->userblockref, GSN_DIH_GET_TABINFO_CONF, signal,
                       DihGetTabInfoConf::SignalLength, JBB, &handle, cb);
}
16339 
/**
 * Callback invoked when the fragmented DIH_GET_TABINFO_CONF has been
 * fully sent: pop the served connect record from the table's queue,
 * schedule CONTINUEB(ZGET_TABINFO_SEND) for the next waiter (the
 * empty-queue case is handled in getTabInfo_send) and release the
 * connect record. retVal is required to be 0 (successful send).
 */
void
Dbdih::getTabInfo_sendComplete(Signal * signal,
                               Uint32 senderData,
                               Uint32 retVal)
{
  ndbrequire(retVal == 0);

  ConnectRecordPtr connectPtr;
  connectPtr.i = senderData;
  ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);

  ndbrequire(connectPtr.p->connectState == ConnectRecord::GET_TABINFO);

  TabRecordPtr tabPtr;
  tabPtr.i = connectPtr.p->table;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  // Unlink the served record; the next waiter becomes the queue head.
  tabPtr.p->connectrec = connectPtr.p->nextPool;

  signal->theData[0] = DihContinueB::ZGET_TABINFO_SEND;
  signal->theData[1] = tabPtr.i;
  sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);

  release_connect(connectPtr);
}
16364 
/**
 * System restart: rebuild the stored-replica list for every fragment of
 * the table. All replicas are first moved to oldStoredReplicas
 * (prepareReplicas); then each replica whose node is ALIVE and active
 * gets its unusable LCPs invalidated (resetReplica / resetReplicaLcp),
 * a fresh crashed-replica redo interval opened at
 * newestRestorableGCI + 1, and - provided usable REDO exists
 * (setup_create_replica) - is linked back into storedReplicas.
 * A replica without sufficient REDO instead forces a take-over of its
 * node. Finally updateNodeInfo() refreshes the fragment's node list.
 */
void
Dbdih::resetReplicaSr(TabRecordPtr tabPtr){

  const Uint32 newestRestorableGCI = SYSFILE->newestRestorableGCI;

  for(Uint32 i = 0; i<tabPtr.p->totalfragments; i++)
  {
    FragmentstorePtr fragPtr;
    getFragstore(tabPtr.p, i, fragPtr);

    /**
     * During SR restart distributionKey from 0
     */
    fragPtr.p->distributionKey = 0;

    /**
     * 1) Start by moving all replicas into oldStoredReplicas
     */
    prepareReplicas(fragPtr);

    /**
     * 2) Move all "alive" replicas into storedReplicas
     *    + update noCrashedReplicas...
     */
    ReplicaRecordPtr replicaPtr;
    replicaPtr.i = fragPtr.p->oldStoredReplicas;
    while (replicaPtr.i != RNIL)
    {
      jam();
      c_replicaRecordPool.getPtr(replicaPtr);

      /**
       * invalidate LCP's not usable
       */
      resetReplica(replicaPtr);

      // Save the successor now: the replica may be relinked below.
      const Uint32 nextReplicaPtrI = replicaPtr.p->nextPool;

      NodeRecordPtr nodePtr;
      nodePtr.i = replicaPtr.p->procNode;
      ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);

      const Uint32 noCrashedReplicas = replicaPtr.p->noCrashedReplicas;

      if (nodePtr.p->nodeStatus == NodeRecord::ALIVE)
      {
        jam();
        switch (nodePtr.p->activeStatus) {
        case Sysfile::NS_Active:
        case Sysfile::NS_ActiveMissed_1:
        case Sysfile::NS_ActiveMissed_2:{
          jam();
          /* --------------------------------------------------------------- */
          /* THE NODE IS ALIVE AND KICKING AND ACTIVE, LET'S USE IT.         */
          /* --------------------------------------------------------------- */
          arrGuardErr(noCrashedReplicas, MAX_CRASHED_REPLICAS, NDBD_EXIT_MAX_CRASHED_REPLICAS);

          // Create new crashed replica
          newCrashedReplica(replicaPtr);

          // Create a new redo-interval
          Uint32 nextCrashed = replicaPtr.p->noCrashedReplicas;
          replicaPtr.p->createGci[nextCrashed] = newestRestorableGCI + 1;
          replicaPtr.p->replicaLastGci[nextCrashed] = ZINIT_REPLICA_LAST_GCI;

          // merge
          mergeCrashedReplicas(replicaPtr);

          resetReplicaLcp(replicaPtr.p, newestRestorableGCI);

          /**
           * Make sure we can also find REDO for restoring replica...
           */
          {
            CreateReplicaRecord createReplica;
            ConstPtr<ReplicaRecord> constReplicaPtr;
            constReplicaPtr.i = replicaPtr.i;
            constReplicaPtr.p = replicaPtr.p;
            if (tabPtr.p->tabStorage != TabRecord::ST_NORMAL ||
                setup_create_replica(fragPtr,
                                     &createReplica, constReplicaPtr))
            {
              jam();
              removeOldStoredReplica(fragPtr, replicaPtr);
              linkStoredReplica(fragPtr, replicaPtr);
            }
            else
            {
              jam();
              infoEvent("Forcing take-over of node %d due to unsufficient REDO"
                        " for table %d fragment: %d",
                        nodePtr.i, tabPtr.i, i);

              m_sr_nodes.clear(nodePtr.i);
              m_to_nodes.set(nodePtr.i);
              setNodeActiveStatus(nodePtr.i,
                                  Sysfile::NS_NotActive_NotTakenOver);
            }
          }
        }
        // NOTE(review): the case above falls through into default (no
        // break) - benign, since default only jams and breaks.
        default:
          jam();
          /*empty*/;
          break;
        }
      }
      replicaPtr.i = nextReplicaPtrI;
    }//while
    updateNodeInfo(fragPtr);
  }
}
16476 
/**
 * Invalidate, for a single replica, every stored LCP that cannot be
 * used in the current restart: LCPs with an id newer than the latest
 * completed LCP id (SYSFILE->latestLCP_ID) and LCPs whose maxGciStarted
 * lies beyond the newest restorable GCI. Also removes crashed-replica
 * entries made invalid by moving the restart GCI backwards.
 */
void
Dbdih::resetReplica(ReplicaRecordPtr readReplicaPtr)
{
  Uint32 i;
  /* ---------------------------------------------------------------------- */
  /*       IF THE LAST COMPLETED LOCAL CHECKPOINT IS VALID AND LARGER THAN  */
  /*       THE LAST COMPLETED CHECKPOINT THEN WE WILL INVALIDATE THIS LOCAL */
  /*       CHECKPOINT FOR THIS REPLICA.                                     */
  /* ---------------------------------------------------------------------- */
  for (i = 0; i < MAX_LCP_STORED; i++)
  {
    jam();
    if (readReplicaPtr.p->lcpStatus[i] == ZVALID &&
        readReplicaPtr.p->lcpId[i] > SYSFILE->latestLCP_ID)
    {
      jam();
      readReplicaPtr.p->lcpStatus[i] = ZINVALID;
    }
  }

  /* ---------------------------------------------------------------------- */
  /*       WE ALSO HAVE TO INVALIDATE ANY LOCAL CHECKPOINTS THAT HAVE BEEN  */
  /*       INVALIDATED BY MOVING BACK THE RESTART GCI.                      */
  /* ---------------------------------------------------------------------- */
  Uint32 lastCompletedGCI = SYSFILE->newestRestorableGCI;
  for (i = 0; i < MAX_LCP_STORED; i++)
  {
    jam();
    if (readReplicaPtr.p->lcpStatus[i] == ZVALID &&
        readReplicaPtr.p->maxGciStarted[i] > lastCompletedGCI)
    {
      jam();
      readReplicaPtr.p->lcpStatus[i] = ZINVALID;
    }
  }

  /* ---------------------------------------------------------------------- */
  /*       WE WILL REMOVE ANY OCCURRENCES OF REPLICAS THAT HAVE CRASHED     */
  /*       THAT ARE NO LONGER VALID DUE TO MOVING RESTART GCI BACKWARDS.    */
  /* ---------------------------------------------------------------------- */
  removeTooNewCrashedReplicas(readReplicaPtr, lastCompletedGCI);

  /**
   * Don't remove crashed replicas here,
   *   as 1) this will disable optimized NR
   *         if oldestRestorableGCI > GCI needed for local LCP's
   *      2) This is anyway done during LCP, which will be run during SR
   */
  //removeOldCrashedReplicas(readReplicaPtr);

  /* ---------------------------------------------------------------------- */
  /*       FIND PROCESSOR RECORD                                            */
  /* ---------------------------------------------------------------------- */
}
16531 
/**
 * Walk backwards from the replica's most recent LCP slot and invalidate
 * every LCP that started after stopGci (i.e. cannot be restored with
 * the redo available up to stopGci). Stops at the first usable valid
 * LCP, leaving nextLcp pointing just past it; if no LCP is usable the
 * whole history is invalidated and nextLcp restarts at slot 0.
 */
void
Dbdih::resetReplicaLcp(ReplicaRecord * replicaP, Uint32 stopGci){

  Uint32 lcpNo = replicaP->nextLcp;
  const Uint32 startLcpNo = lcpNo;
  do {
    lcpNo = prevLcpNo(lcpNo);
    ndbrequire(lcpNo < MAX_LCP_STORED);
    if (replicaP->lcpStatus[lcpNo] == ZVALID)
    {
      if (replicaP->maxGciStarted[lcpNo] <= stopGci)
      {
        jam();
        /* ----------------------------------------------------------------- */
        /*   WE HAVE FOUND A USEFUL LOCAL CHECKPOINT THAT CAN BE USED FOR    */
        /*   RESTARTING THIS FRAGMENT REPLICA.                               */
        /* ----------------------------------------------------------------- */
        return ;
      }//if
    }//if

    /**
     * WE COULD  NOT USE THIS LOCAL CHECKPOINT. IT WAS TOO
     * RECENT OR SIMPLY NOT A VALID CHECKPOINT.
     * WE SHOULD THUS REMOVE THIS LOCAL CHECKPOINT SINCE IT WILL NEVER
     * AGAIN BE USED. SET LCP_STATUS TO INVALID.
     */
    replicaP->nextLcp = lcpNo;
    replicaP->lcpId[lcpNo] = 0;
    replicaP->lcpStatus[lcpNo] = ZINVALID;
  } while (lcpNo != startLcpNo);

  replicaP->nextLcp = 0;
}
16566 
readingTableErrorLab(Signal * signal,FileRecordPtr filePtr)16567 void Dbdih::readingTableErrorLab(Signal* signal, FileRecordPtr filePtr)
16568 {
16569   TabRecordPtr tabPtr;
16570   tabPtr.i = filePtr.p->tabRef;
16571   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
16572   /* ---------------------------------------------------------------------- */
16573   /*    READING THIS FILE FAILED. CLOSE IT AFTER RELEASING ALL PAGES.       */
16574   /* ---------------------------------------------------------------------- */
16575   ndbrequire(tabPtr.p->noPages <= NDB_ARRAY_SIZE(tabPtr.p->pageRef));
16576   for (Uint32 i = 0; i < tabPtr.p->noPages; i++) {
16577     jam();
16578     releasePage(tabPtr.p->pageRef[i]);
16579   }//for
16580   closeFile(signal, filePtr);
16581   filePtr.p->reqStatus = FileRecord::CLOSING_TABLE_CRASH;
16582   return;
16583 }//Dbdih::readingTableErrorLab()
16584 
closingTableCrashLab(Signal * signal,FileRecordPtr filePtr)16585 void Dbdih::closingTableCrashLab(Signal* signal, FileRecordPtr filePtr)
16586 {
16587   TabRecordPtr tabPtr;
16588   /* ---------------------------------------------------------------------- */
16589   /*    WE HAVE NOW CLOSED A FILE WHICH WE HAD A READ ERROR WITH. PROCEED   */
16590   /*    WITH NEXT FILE IF NOT THE LAST OTHERWISE REPORT ERROR.              */
16591   /* ---------------------------------------------------------------------- */
16592   tabPtr.i = filePtr.p->tabRef;
16593   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
16594   ndbrequire(filePtr.i == tabPtr.p->tabFile[0]);
16595   filePtr.i = tabPtr.p->tabFile[1];
16596   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
16597   openFileRw(signal, filePtr);
16598   filePtr.p->reqStatus = FileRecord::OPENING_TABLE;
16599 }//Dbdih::closingTableCrashLab()
16600 
16601 /*****************************************************************************/
16602 /* **********     COPY TABLE MODULE                              *************/
16603 /*****************************************************************************/
execCOPY_TABREQ(Signal * signal)16604 void Dbdih::execCOPY_TABREQ(Signal* signal)
16605 {
16606   CopyTabReq *req = (CopyTabReq*) &signal->theData[0];
16607   CRASH_INSERTION(7172);
16608 
16609   TabRecordPtr tabPtr;
16610   PageRecordPtr pagePtr;
16611   jamEntry();
16612   BlockReference ref = req->senderRef;
16613   Uint32 reqinfo = req->reqinfo;
16614   tabPtr.i = req->tableId;
16615   Uint32 schemaVersion = req->tableSchemaVersion;
16616   Uint32 noOfWords = req->noOfWords;
16617   ndbrequire(ref == cmasterdihref);
16618   ndbrequire(!isMaster());
16619   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
16620   if (reqinfo == 1)
16621   {
16622     jam();
16623     tabPtr.p->schemaVersion = schemaVersion;
16624     initTableFile(tabPtr);
16625 
16626     /**
16627      * We need to set up the state of whether the table is actively writing
16628      * an LCP still. We can derive the state on replicas and fragments for
16629      * the LCP with the information that we get in the table by knowing the
16630      * currently executing LCP id. We also get the current LCP id fromt the
16631      * master here to ensure that we're up to date with this value.
16632      */
16633     c_lcp_id_while_copy_meta_data = req->currentLcpId;
16634     Uint32 masterNodeId = refToNode(ref);
16635     if (getNodeInfo(masterNodeId).m_version >= NDBD_SUPPORT_PAUSE_LCP)
16636     {
16637       if (req->tabLcpStatus == CopyTabReq::LcpCompleted)
16638       {
16639         jam();
16640         tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED;
16641       }
16642       else
16643       {
16644         jam();
16645         ndbrequire(req->tabLcpStatus == CopyTabReq::LcpActive);
16646         tabPtr.p->tabLcpStatus = TabRecord::TLS_ACTIVE;
16647       }
16648     }
16649     else
16650     {
16651       jam();
16652       tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED;
16653     }
16654   }//if
16655   ndbrequire(tabPtr.p->noPages < NDB_ARRAY_SIZE(tabPtr.p->pageRef));
16656   if (tabPtr.p->noOfWords == 0) {
16657     jam();
16658     allocpage(pagePtr);
16659     tabPtr.p->pageRef[tabPtr.p->noPages] = pagePtr.i;
16660     tabPtr.p->noPages++;
16661   } else {
16662     jam();
16663     pagePtr.i = tabPtr.p->pageRef[tabPtr.p->noPages - 1];
16664     ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
16665   }//if
16666   ndbrequire(tabPtr.p->noOfWords + 15 < 2048);
16667   ndbrequire(tabPtr.p->noOfWords < 2048);
16668   MEMCOPY_NO_WORDS(&pagePtr.p->word[tabPtr.p->noOfWords], &signal->theData[5], 16);
16669   tabPtr.p->noOfWords += 16;
16670   if (tabPtr.p->noOfWords == 2048) {
16671     jam();
16672     tabPtr.p->noOfWords = 0;
16673   }//if
16674   if (noOfWords > 16) {
16675     jam();
16676     return;
16677   }//if
16678   tabPtr.p->noOfWords = 0;
16679   ndbrequire(tabPtr.p->tabCopyStatus == TabRecord::CS_IDLE);
16680   tabPtr.p->tabCopyStatus = TabRecord::CS_COPY_TAB_REQ;
16681   signal->theData[0] = DihContinueB::ZREAD_PAGES_INTO_TABLE;
16682   signal->theData[1] = tabPtr.i;
16683   sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
16684 }//Dbdih::execCOPY_TABREQ()
16685 
16686 void
copyTabReq_complete(Signal * signal,TabRecordPtr tabPtr)16687 Dbdih::copyTabReq_complete(Signal* signal, TabRecordPtr tabPtr){
16688   if (!isMaster()) {
16689     jam();
16690     //----------------------------------------------------------------------------
16691     // In this particular case we do not release table pages if we are master. The
16692     // reason is that the master could still be sending the table info to another
16693     // node.
16694     //----------------------------------------------------------------------------
16695     releaseTabPages(tabPtr.i);
16696     tabPtr.p->tabStatus = TabRecord::TS_ACTIVE;
16697     for (Uint32 fragId = 0; fragId < tabPtr.p->totalfragments; fragId++) {
16698       jam();
16699       FragmentstorePtr fragPtr;
16700       getFragstore(tabPtr.p, fragId, fragPtr);
16701       updateNodeInfo(fragPtr);
16702     }//for
16703   }//if
16704   c_lcp_id_while_copy_meta_data = RNIL;
16705   CopyTabConf *conf = (CopyTabConf*) signal->getDataPtrSend();
16706   conf->nodeId = getOwnNodeId();
16707   conf->tableId = tabPtr.i;
16708   sendSignal(cmasterdihref, GSN_COPY_TABCONF, signal,
16709              CopyTabConf::SignalLength, JBB);
16710 }
16711 
16712 /*****************************************************************************/
16713 /* ******  READ FROM A NUMBER OF PAGES INTO THE TABLE DATA STRUCTURES ********/
16714 /*****************************************************************************/
readPagesIntoTableLab(Signal * signal,Uint32 tableId)16715 void Dbdih::readPagesIntoTableLab(Signal* signal, Uint32 tableId)
16716 {
16717   RWFragment rf;
16718   rf.wordIndex = 35;
16719   rf.pageIndex = 0;
16720   rf.rwfTabPtr.i = tableId;
16721   ptrCheckGuard(rf.rwfTabPtr, ctabFileSize, tabRecord);
16722   rf.rwfPageptr.i = rf.rwfTabPtr.p->pageRef[0];
16723   ptrCheckGuard(rf.rwfPageptr, cpageFileSize, pageRecord);
16724   rf.rwfTabPtr.p->totalfragments = readPageWord(&rf);
16725   rf.rwfTabPtr.p->noOfBackups = readPageWord(&rf);
16726   rf.rwfTabPtr.p->hashpointer = readPageWord(&rf);
16727   rf.rwfTabPtr.p->kvalue = readPageWord(&rf);
16728   rf.rwfTabPtr.p->mask = readPageWord(&rf);
16729   rf.rwfTabPtr.p->method = (TabRecord::Method)readPageWord(&rf);
16730   /* ------------- */
16731   /* Type of table */
16732   /* ------------- */
16733   rf.rwfTabPtr.p->tabStorage = (TabRecord::Storage)(readPageWord(&rf));
16734 
16735   Uint32 noOfFrags = rf.rwfTabPtr.p->totalfragments;
16736   ndbrequire(noOfFrags > 0);
16737   ndbrequire((noOfFrags * (rf.rwfTabPtr.p->noOfBackups + 1)) <= cnoFreeReplicaRec);
16738   allocFragments(noOfFrags, rf.rwfTabPtr);
16739 
16740   signal->theData[0] = DihContinueB::ZREAD_PAGES_INTO_FRAG;
16741   signal->theData[1] = rf.rwfTabPtr.i;
16742   signal->theData[2] = 0;
16743   signal->theData[3] = rf.pageIndex;
16744   signal->theData[4] = rf.wordIndex;
16745   sendSignal(reference(), GSN_CONTINUEB, signal, 5, JBB);
16746   return;
16747 }//Dbdih::readPagesIntoTableLab()
16748 
readPagesIntoFragLab(Signal * signal,RWFragment * rf)16749 void Dbdih::readPagesIntoFragLab(Signal* signal, RWFragment* rf)
16750 {
16751   ndbrequire(rf->pageIndex < NDB_ARRAY_SIZE(rf->rwfTabPtr.p->pageRef));
16752   rf->rwfPageptr.i = rf->rwfTabPtr.p->pageRef[rf->pageIndex];
16753   ptrCheckGuard(rf->rwfPageptr, cpageFileSize, pageRecord);
16754   FragmentstorePtr fragPtr;
16755   getFragstore(rf->rwfTabPtr.p, rf->fragId, fragPtr);
16756   readFragment(rf, fragPtr);
16757   readReplicas(rf, rf->rwfTabPtr.p, fragPtr);
16758   rf->fragId++;
16759   if (rf->fragId == rf->rwfTabPtr.p->totalfragments) {
16760     jam();
16761     switch (rf->rwfTabPtr.p->tabCopyStatus) {
16762     case TabRecord::CS_SR_PHASE1_READ_PAGES:
16763       jam();
16764       releaseTabPages(rf->rwfTabPtr.i);
16765       rf->rwfTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
16766       signal->theData[0] = DihContinueB::ZREAD_TABLE_FROM_PAGES;
16767       signal->theData[1] = rf->rwfTabPtr.i;
16768       sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
16769       return;
16770       break;
16771     case TabRecord::CS_COPY_TAB_REQ:
16772       jam();
16773       rf->rwfTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
16774       if (getNodeState().getSystemRestartInProgress() &&
16775           rf->rwfTabPtr.p->tabStorage == TabRecord::ST_NORMAL)
16776       {
16777         /**
16778          * avoid overwriting own table-definition...
16779          *   but this is not possible for no-logging tables
16780          */
16781 	jam();
16782 	copyTabReq_complete(signal, rf->rwfTabPtr);
16783 	return;
16784       }
16785       rf->rwfTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
16786       rf->rwfTabPtr.p->tabUpdateState = TabRecord::US_COPY_TAB_REQ;
16787       signal->theData[0] = DihContinueB::ZTABLE_UPDATE;
16788       signal->theData[1] = rf->rwfTabPtr.i;
16789       sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
16790       return;
16791       break;
16792     default:
16793       ndbrequire(false);
16794       return;
16795       break;
16796     }//switch
16797   } else {
16798     jam();
16799     signal->theData[0] = DihContinueB::ZREAD_PAGES_INTO_FRAG;
16800     signal->theData[1] = rf->rwfTabPtr.i;
16801     signal->theData[2] = rf->fragId;
16802     signal->theData[3] = rf->pageIndex;
16803     signal->theData[4] = rf->wordIndex;
16804     sendSignal(reference(), GSN_CONTINUEB, signal, 5, JBB);
16805   }//if
16806   return;
16807 }//Dbdih::readPagesIntoFragLab()
16808 
16809 /*****************************************************************************/
16810 /*****   WRITING FROM TABLE DATA STRUCTURES INTO A SET OF PAGES         ******/
16811 // execCONTINUEB(ZPACK_TABLE_INTO_PAGES)
16812 /*****************************************************************************/
packTableIntoPagesLab(Signal * signal,Uint32 tableId)16813 void Dbdih::packTableIntoPagesLab(Signal* signal, Uint32 tableId)
16814 {
16815   RWFragment wf;
16816   TabRecordPtr tabPtr;
16817   allocpage(wf.rwfPageptr);
16818   tabPtr.i = tableId;
16819   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
16820   tabPtr.p->pageRef[0] = wf.rwfPageptr.i;
16821   tabPtr.p->noPages = 1;
16822   wf.wordIndex = 35;
16823   wf.pageIndex = 0;
16824   Uint32 totalfragments = tabPtr.p->totalfragments;
16825   if (tabPtr.p->connectrec != RNIL)
16826   {
16827     jam();
16828     Ptr<ConnectRecord> connectPtr;
16829     connectPtr.i = tabPtr.p->connectrec;
16830     ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
16831     ndbrequire(connectPtr.p->table == tabPtr.i);
16832     if (connectPtr.p->connectState == ConnectRecord::ALTER_TABLE)
16833     {
16834       jam();
16835       totalfragments = connectPtr.p->m_alter.m_totalfragments;
16836     }
16837   }
16838 
16839   writePageWord(&wf, totalfragments);
16840   writePageWord(&wf, tabPtr.p->noOfBackups);
16841   writePageWord(&wf, tabPtr.p->hashpointer);
16842   writePageWord(&wf, tabPtr.p->kvalue);
16843   writePageWord(&wf, tabPtr.p->mask);
16844   writePageWord(&wf, tabPtr.p->method);
16845   writePageWord(&wf, tabPtr.p->tabStorage);
16846 
16847   signal->theData[0] = DihContinueB::ZPACK_FRAG_INTO_PAGES;
16848   signal->theData[1] = tabPtr.i;
16849   signal->theData[2] = 0;
16850   signal->theData[3] = wf.pageIndex;
16851   signal->theData[4] = wf.wordIndex;
16852   signal->theData[5] = totalfragments;
16853   sendSignal(reference(), GSN_CONTINUEB, signal, 6, JBB);
16854 }//Dbdih::packTableIntoPagesLab()
16855 
16856 /*****************************************************************************/
16857 // execCONTINUEB(ZPACK_FRAG_INTO_PAGES)
16858 /*****************************************************************************/
packFragIntoPagesLab(Signal * signal,RWFragment * wf)16859 void Dbdih::packFragIntoPagesLab(Signal* signal, RWFragment* wf)
16860 {
16861   ndbrequire(wf->pageIndex < NDB_ARRAY_SIZE(wf->rwfTabPtr.p->pageRef));
16862   wf->rwfPageptr.i = wf->rwfTabPtr.p->pageRef[wf->pageIndex];
16863   ptrCheckGuard(wf->rwfPageptr, cpageFileSize, pageRecord);
16864   FragmentstorePtr fragPtr;
16865   getFragstore(wf->rwfTabPtr.p, wf->fragId, fragPtr);
16866   writeFragment(wf, fragPtr);
16867   writeReplicas(wf, fragPtr.p->storedReplicas);
16868   writeReplicas(wf, fragPtr.p->oldStoredReplicas);
16869   wf->fragId++;
16870   if (wf->fragId == wf->totalfragments) {
16871     jam();
16872     PageRecordPtr pagePtr;
16873     pagePtr.i = wf->rwfTabPtr.p->pageRef[0];
16874     ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
16875     pagePtr.p->word[33] = wf->rwfTabPtr.p->noPages;
16876     pagePtr.p->word[34] = ((wf->rwfTabPtr.p->noPages - 1) * 2048) + wf->wordIndex;
16877     switch (wf->rwfTabPtr.p->tabCopyStatus) {
16878     case TabRecord::CS_SR_PHASE2_READ_TABLE:
16879       /* -------------------------------------------------------------------*/
16880       // We are performing a system restart and we are now ready to copy the
16881       // table from this node (the master) to all other nodes.
16882       /* -------------------------------------------------------------------*/
16883       jam();
16884       wf->rwfTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
16885       signal->theData[0] = DihContinueB::ZSR_PHASE2_READ_TABLE;
16886       signal->theData[1] = wf->rwfTabPtr.i;
16887       sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
16888       return;
16889       break;
16890     case TabRecord::CS_COPY_NODE_STATE:
16891       jam();
16892       tableCopyNodeLab(signal, wf->rwfTabPtr);
16893       return;
16894       break;
16895     case TabRecord::CS_LCP_READ_TABLE:
16896       jam();
16897       signal->theData[0] = DihContinueB::ZTABLE_UPDATE;
16898       signal->theData[1] = wf->rwfTabPtr.i;
16899       sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
16900       return;
16901       break;
16902     case TabRecord::CS_REMOVE_NODE:
16903     case TabRecord::CS_INVALIDATE_NODE_LCP:
16904       jam();
16905       signal->theData[0] = DihContinueB::ZTABLE_UPDATE;
16906       signal->theData[1] = wf->rwfTabPtr.i;
16907       sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
16908       return;
16909       break;
16910     case TabRecord::CS_ADD_TABLE_MASTER:
16911       jam();
16912       wf->rwfTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
16913       signal->theData[0] = DihContinueB::ZADD_TABLE_MASTER_PAGES;
16914       signal->theData[1] = wf->rwfTabPtr.i;
16915       sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
16916       return;
16917       break;
16918     case TabRecord::CS_ADD_TABLE_SLAVE:
16919       jam();
16920       wf->rwfTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
16921       signal->theData[0] = DihContinueB::ZADD_TABLE_SLAVE_PAGES;
16922       signal->theData[1] = wf->rwfTabPtr.i;
16923       sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
16924       return;
16925     case TabRecord::CS_COPY_TO_SAVE:
16926       signal->theData[0] = DihContinueB::ZTABLE_UPDATE;
16927       signal->theData[1] = wf->rwfTabPtr.i;
16928       sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
16929       return;
16930     case TabRecord::CS_GET_TABINFO:
16931       jam();
16932       signal->theData[0] = DihContinueB::ZGET_TABINFO_SEND;
16933       signal->theData[1] = wf->rwfTabPtr.i;
16934       sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
16935       return;
16936     default:
16937       ndbrequire(false);
16938       return;
16939       break;
16940     }//switch
16941   } else {
16942     jam();
16943     signal->theData[0] = DihContinueB::ZPACK_FRAG_INTO_PAGES;
16944     signal->theData[1] = wf->rwfTabPtr.i;
16945     signal->theData[2] = wf->fragId;
16946     signal->theData[3] = wf->pageIndex;
16947     signal->theData[4] = wf->wordIndex;
16948     signal->theData[5] = wf->totalfragments;
16949     sendSignal(reference(), GSN_CONTINUEB, signal, 6, JBB);
16950   }//if
16951   return;
16952 }//Dbdih::packFragIntoPagesLab()
16953 
16954 /*****************************************************************************/
16955 /* **********     START FRAGMENT MODULE                          *************/
16956 /*****************************************************************************/
16957 void
dump_replica_info()16958 Dbdih::dump_replica_info()
16959 {
16960   TabRecordPtr tabPtr;
16961   FragmentstorePtr fragPtr;
16962 
16963   for(tabPtr.i = 0; tabPtr.i < ctabFileSize; tabPtr.i++)
16964   {
16965     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
16966     if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE)
16967       continue;
16968 
16969     for(Uint32 fid = 0; fid<tabPtr.p->totalfragments; fid++)
16970     {
16971       getFragstore(tabPtr.p, fid, fragPtr);
16972       ndbout_c("tab: %d frag: %d gci: %d\n",
16973 	       tabPtr.i, fid, SYSFILE->newestRestorableGCI);
16974 
16975       dump_replica_info(fragPtr.p);
16976     }
16977   }
16978 }
16979 
16980 void
dump_replica_info(const Fragmentstore * fragPtrP)16981 Dbdih::dump_replica_info(const Fragmentstore* fragPtrP)
16982 {
16983   ndbout_c("  -- storedReplicas: ");
16984   Uint32 i;
16985   ReplicaRecordPtr replicaPtr;
16986   replicaPtr.i = fragPtrP->storedReplicas;
16987   for(; replicaPtr.i != RNIL; replicaPtr.i = replicaPtr.p->nextPool)
16988   {
16989     c_replicaRecordPool.getPtr(replicaPtr);
16990     ndbout_c("  node: %d initialGci: %d nextLcp: %d noCrashedReplicas: %d",
16991              replicaPtr.p->procNode,
16992              replicaPtr.p->initialGci,
16993              replicaPtr.p->nextLcp,
16994              replicaPtr.p->noCrashedReplicas);
16995     for(i = 0; i<MAX_LCP_STORED; i++)
16996     {
16997       ndbout_c("    i: %d %s : lcpId: %d maxGci Completed: %d Started: %d",
16998                i,
16999                (replicaPtr.p->lcpStatus[i] == ZVALID ?"VALID":"INVALID"),
17000                replicaPtr.p->lcpId[i],
17001                replicaPtr.p->maxGciCompleted[i],
17002                replicaPtr.p->maxGciStarted[i]);
17003     }
17004 
17005     for (i = 0; i < 8; i++)
17006     {
17007       ndbout_c("    crashed replica: %d replicaLastGci: %d createGci: %d",
17008                i,
17009                replicaPtr.p->replicaLastGci[i],
17010                replicaPtr.p->createGci[i]);
17011     }
17012   }
17013   ndbout_c("  -- oldStoredReplicas");
17014   replicaPtr.i = fragPtrP->oldStoredReplicas;
17015   for(; replicaPtr.i != RNIL; replicaPtr.i = replicaPtr.p->nextPool)
17016   {
17017     c_replicaRecordPool.getPtr(replicaPtr);
17018     ndbout_c("  node: %d initialGci: %d nextLcp: %d noCrashedReplicas: %d",
17019              replicaPtr.p->procNode,
17020              replicaPtr.p->initialGci,
17021              replicaPtr.p->nextLcp,
17022              replicaPtr.p->noCrashedReplicas);
17023     for(i = 0; i<MAX_LCP_STORED; i++)
17024     {
17025       ndbout_c("    i: %d %s : lcpId: %d maxGci Completed: %d Started: %d",
17026                i,
17027                (replicaPtr.p->lcpStatus[i] == ZVALID ?"VALID":"INVALID"),
17028                replicaPtr.p->lcpId[i],
17029                replicaPtr.p->maxGciCompleted[i],
17030                replicaPtr.p->maxGciStarted[i]);
17031     }
17032 
17033     for (i = 0; i < 8; i++)
17034     {
17035       ndbout_c("    crashed replica: %d replicaLastGci: %d createGci: %d",
17036                i,
17037                replicaPtr.p->replicaLastGci[i],
17038                replicaPtr.p->createGci[i]);
17039     }
17040   }
17041 }
17042 
startFragment(Signal * signal,Uint32 tableId,Uint32 fragId)17043 void Dbdih::startFragment(Signal* signal, Uint32 tableId, Uint32 fragId)
17044 {
17045   Uint32 TloopCount = 0;
17046   TabRecordPtr tabPtr;
17047   while (true) {
17048     if (TloopCount > 100) {
17049       jam();
17050       signal->theData[0] = DihContinueB::ZSTART_FRAGMENT;
17051       signal->theData[1] = tableId;
17052       signal->theData[2] = 0;
17053       sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
17054       return;
17055     }
17056 
17057     if (tableId >= ctabFileSize) {
17058       jam();
17059       signal->theData[0] = DihContinueB::ZCOMPLETE_RESTART;
17060       sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
17061       return;
17062     }//if
17063 
17064     tabPtr.i = tableId;
17065     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
17066     if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE){
17067       jam();
17068       TloopCount++;
17069       tableId++;
17070       fragId = 0;
17071       continue;
17072     }
17073 
17074     if(tabPtr.p->tabStorage != TabRecord::ST_NORMAL){
17075       jam();
17076       TloopCount++;
17077       tableId++;
17078       fragId = 0;
17079       continue;
17080     }
17081 
17082     jam();
17083     break;
17084   }//while
17085 
17086   FragmentstorePtr fragPtr;
17087   getFragstore(tabPtr.p, fragId, fragPtr);
17088   /* ----------------------------------------------------------------------- */
17089   /*     WE NEED TO RESET THE REPLICA DATA STRUCTURES. THIS MEANS THAT WE    */
17090   /*     MUST REMOVE REPLICAS THAT WAS NOT STARTED AT THE GCI TO RESTORE. WE */
17091   /*     NEED TO PUT ALL STORED REPLICAS ON THE LIST OF OLD STORED REPLICAS  */
17092   /*     RESET THE NUMBER OF REPLICAS TO CREATE.                             */
17093   /* ----------------------------------------------------------------------- */
17094   cnoOfCreateReplicas = 0;
17095   /* ----------------------------------------------------------------------- */
17096   /*     WE WILL NEVER START MORE THAN FOUR FRAGMENT REPLICAS WHATEVER THE   */
17097   /*     DESIRED REPLICATION IS.                                             */
17098   /* ----------------------------------------------------------------------- */
17099   ndbrequire(tabPtr.p->noOfBackups < MAX_REPLICAS);
17100   /* ----------------------------------------------------------------------- */
17101   /*     SEARCH FOR STORED REPLICAS THAT CAN BE USED TO RESTART THE SYSTEM.  */
17102   /* ----------------------------------------------------------------------- */
17103   searchStoredReplicas(fragPtr);
17104 
17105   if (cnoOfCreateReplicas == 0) {
17106     /* --------------------------------------------------------------------- */
17107     /*   THERE WERE NO STORED REPLICAS AVAILABLE THAT CAN SERVE AS REPLICA TO*/
17108     /*   RESTART THE SYSTEM FROM. IN A LATER RELEASE WE WILL ADD             */
17109     /*   FUNCTIONALITY TO CHECK IF THERE ARE ANY STANDBY NODES THAT COULD DO */
17110     /*   THIS TASK INSTEAD IN THIS IMPLEMENTATION WE SIMPLY CRASH THE SYSTEM.*/
17111     /*   THIS WILL DECREASE THE GCI TO RESTORE WHICH HOPEFULLY WILL MAKE IT  */
17112     /*   POSSIBLE TO RESTORE THE SYSTEM.                                     */
17113     /* --------------------------------------------------------------------- */
17114     char buf[64];
17115     BaseString::snprintf(buf, sizeof(buf), "table: %d fragment: %d gci: %d",
17116 			 tableId, fragId, SYSFILE->newestRestorableGCI);
17117 
17118     ndbout_c("%s", buf);
17119     dump_replica_info();
17120 
17121     progError(__LINE__, NDBD_EXIT_NO_RESTORABLE_REPLICA, buf);
17122     ndbrequire(false);
17123     return;
17124   }//if
17125 
17126   /* ----------------------------------------------------------------------- */
17127   /*     WE HAVE CHANGED THE NODE TO BE PRIMARY REPLICA AND THE NODES TO BE  */
17128   /*     BACKUP NODES. WE MUST UPDATE THIS NODES DATA STRUCTURE SINCE WE     */
17129   /*     WILL NOT COPY THE TABLE DATA TO OURSELF.                            */
17130   /* ----------------------------------------------------------------------- */
17131   updateNodeInfo(fragPtr);
17132   /* ----------------------------------------------------------------------- */
17133   /*     NOW WE HAVE COLLECTED ALL THE REPLICAS WE COULD GET. WE WILL NOW    */
17134   /*     RESTART THE FRAGMENT REPLICAS WE HAVE FOUND IRRESPECTIVE OF IF THERE*/
17135   /*     ARE ENOUGH ACCORDING TO THE DESIRED REPLICATION.                    */
17136   /* ----------------------------------------------------------------------- */
17137   /*     WE START BY SENDING ADD_FRAGREQ FOR THOSE REPLICAS THAT NEED IT.    */
17138   /* ----------------------------------------------------------------------- */
17139   CreateReplicaRecordPtr createReplicaPtr;
17140   for (createReplicaPtr.i = 0;
17141        createReplicaPtr.i < cnoOfCreateReplicas;
17142        createReplicaPtr.i++) {
17143     jam();
17144     ptrCheckGuard(createReplicaPtr, 4, createReplicaRecord);
17145   }//for
17146 
17147   sendStartFragreq(signal, tabPtr, fragId);
17148 
17149   /**
17150    * Don't wait for START_FRAGCONF
17151    */
17152   fragId++;
17153   if (fragId >= tabPtr.p->totalfragments) {
17154     jam();
17155     tabPtr.i++;
17156     fragId = 0;
17157   }//if
17158   signal->theData[0] = DihContinueB::ZSTART_FRAGMENT;
17159   signal->theData[1] = tabPtr.i;
17160   signal->theData[2] = fragId;
17161   sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
17162 
17163   return;
17164 }//Dbdih::startFragmentLab()
17165 
17166 
17167 /*****************************************************************************/
17168 /* **********     COMPLETE RESTART MODULE                        *************/
17169 /*****************************************************************************/
completeRestartLab(Signal * signal)17170 void Dbdih::completeRestartLab(Signal* signal)
17171 {
17172   sendLoopMacro(START_RECREQ, sendSTART_RECREQ, RNIL);
17173 }//completeRestartLab()
17174 
17175 /* ------------------------------------------------------------------------- */
17176 //       SYSTEM RESTART:
17177 /*         A NODE HAS COMPLETED RESTORING ALL DATABASE FRAGMENTS.            */
17178 //       NODE RESTART:
17179 //         THE STARTING NODE HAS PREPARED ITS LOG FILES TO ENABLE EXECUTION
17180 //         OF TRANSACTIONS.
17181 // Precondition:
17182 //   This signal is received by the master node for the system restart.
17183 //   This signal is received by the starting node for node restart.
17184 /* ------------------------------------------------------------------------- */
execSTART_RECCONF(Signal * signal)17185 void Dbdih::execSTART_RECCONF(Signal* signal)
17186 {
17187   jamEntry();
17188   Uint32 senderNodeId = signal->theData[0];
17189   Uint32 senderData = signal->theData[1];
17190 
17191   if (senderData != RNIL)
17192   {
17193     jam();
17194     /**
17195      * This is normally a node restart, but it could also be second
17196      * phase of a system restart where a node is restored from a more
17197      * alive node, in this case we could even be the master node although
17198      * we arrive here.
17199      */
17200     g_eventLogger->info("Restore Database Off-line Completed");
17201     infoEvent("Restore Database Off-line Completed on node %u",
17202               senderNodeId);
17203 
17204     g_eventLogger->info("Bring Database On-line Starting");
17205     infoEvent("Bring Database On-line Starting on node %u",
17206               senderNodeId);
17207 
17208     /**
17209      * This is node restart
17210      */
17211     Ptr<TakeOverRecord> takeOverPtr;
17212     c_takeOverPool.getPtr(takeOverPtr, senderData);
17213     sendStartTo(signal, takeOverPtr);
17214     return;
17215   }
17216   infoEvent("Restore Database from disk Completed on node %u",
17217             senderNodeId);
17218 
17219   /* No take over record in the system restart case here */
17220   ndbrequire(senderData == RNIL);
17221   /* --------------------------------------------------------------------- */
17222   // This was the system restart case. We set the state indicating that the
17223   // node has completed restoration of all fragments.
17224   /* --------------------------------------------------------------------- */
17225   receiveLoopMacro(START_RECREQ, senderNodeId);
17226 
17227   /**
17228    * Remove each node that has to TO from LCP/LQH
17229    */
17230   Uint32 i = 0;
17231   while ((i = m_to_nodes.find(i + 1)) != NdbNodeBitmask::NotFound)
17232   {
17233     jam();
17234     NodeRecordPtr nodePtr;
17235     nodePtr.i = i;
17236     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
17237     nodePtr.p->copyCompleted = 0;
17238   }
17239 
17240   if (m_to_nodes.get(getOwnNodeId()))
17241   {
17242     /**
17243      * We (master) needs take-over
17244      *   run this directly to avoid strange confusion
17245      */
17246     jam();
17247     c_sr_wait_to = true;
17248   }
17249 
17250   if (!m_to_nodes.isclear() && c_sr_wait_to)
17251   {
17252     jam();
17253 
17254     StartCopyReq* req = (StartCopyReq*)signal->getDataPtrSend();
17255     req->senderRef = reference();
17256     req->senderData = getOwnNodeId();
17257     req->flags = 0; // Note dont wait for LCP
17258 
17259     i = 0;
17260     while ((i = m_to_nodes.find(i + 1)) != NdbNodeBitmask::NotFound)
17261     {
17262       jam();
17263       req->startingNodeId = i;
17264       sendSignal(calcDihBlockRef(i), GSN_START_COPYREQ, signal,
17265                  StartCopyReq::SignalLength, JBB);
17266     }
17267 
17268     char buf[100];
17269     infoEvent("Starting take-over of %s", m_to_nodes.getText(buf));
17270     return;
17271   }
17272 
17273   infoEvent("Restore Database from disk Completed");
17274 
17275   signal->theData[0] = reference();
17276   m_sr_nodes.copyto(NdbNodeBitmask::Size, signal->theData+1);
17277   sendSignal(cntrlblockref, GSN_NDB_STARTCONF, signal,
17278              1 + NdbNodeBitmask::Size, JBB);
17279 }//Dbdih::execSTART_RECCONF()
17280 
/**
 * Master-side driver for copying all table descriptions to a starting
 * node during a node restart.  Scans tables from tableId upwards; each
 * table in state TS_ACTIVE is packed into pages via
 * CONTINUEB(ZPACK_TABLE_INTO_PAGES) and then shipped to the starting
 * node with COPY_TABREQ signals, after which control returns here (via
 * CONTINUEB(ZCOPY_NODE)) to continue with the next table.  When the
 * scan has passed the last table, either the PAUSE-LCP handshake is
 * continued or copy completion is reported via dihCopyCompletedLab().
 *
 * @param signal   signal object reused for CONTINUEB self-signals
 * @param tableId  table id at which to (re)start the scan
 */
void Dbdih::copyNodeLab(Signal* signal, Uint32 tableId)
{
  /* ----------------------------------------------------------------------- */
  // This code is executed by the master to assist a node restart in receiving
  // the data in the master.
  /* ----------------------------------------------------------------------- */
  Uint32 TloopCount = 0;   // tables skipped since the last real-time break

  if (!c_nodeStartMaster.activeState) {
    jam();
    /* --------------------------------------------------------------------- */
    // Obviously the node crashed in the middle of its node restart. We will
    // stop this process simply by returning after resetting the wait indicator.
    // We also need to handle the pausing of LCPs if it was active.
    /* ---------------------------------------------------------------------- */
    c_nodeStartMaster.wait = ZFALSE;
    return;
  }//if
  TabRecordPtr tabPtr;
  tabPtr.i = tableId;
  while (tabPtr.i < ctabFileSize) {
    ptrAss(tabPtr, tabRecord);
    if (tabPtr.p->tabStatus == TabRecord::TS_ACTIVE)
    {
      /* -------------------------------------------------------------------- */
      // The table is defined. We will start by packing the table into pages.
      // The tabCopyStatus indicates to the CONTINUEB(ZPACK_TABLE_INTO_PAGES)
      // who called it. After packing the table into page(s) it will be sent to
      // the starting node by COPY_TABREQ signals. After returning from the
      // starting node we will return to this subroutine and continue
      // with the next table.
      /* -------------------------------------------------------------------- */
      if (! (tabPtr.p->tabCopyStatus == TabRecord::CS_IDLE))
      {
        jam();
        // Another pack/copy operation is in flight for this table; retry
        // the same table after a short delay instead of interleaving.
        signal->theData[0] = DihContinueB::ZCOPY_NODE;
        signal->theData[1] = tabPtr.i;
        sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                            WaitTableStateChangeMillis, 2);
        return;
      }
      tabPtr.p->tabCopyStatus = TabRecord::CS_COPY_NODE_STATE;
      signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
      signal->theData[1] = tabPtr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
      return;
    } else {
      jam();
      if (TloopCount > 100) {
	/* ------------------------------------------------------------------ */
	// Introduce real-time break after looping through 100 not copied tables
	/* ----------------------------------------------------------------- */
        jam();
        signal->theData[0] = DihContinueB::ZCOPY_NODE;
        signal->theData[1] = tabPtr.i + 1;
        sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
        return;
      } else {
        jam();
        TloopCount++;
        tabPtr.i++;
      }//if
    }//if
  }//while
  jam();
  if (is_lcp_paused())
  {
    jam();
    /**
     * Copying is done, we now need to tell the starting node about the
     * already completed LQHs and to ensure that the starting node
     * verifies that the copy was correct.
     */
    check_for_pause_action(signal, StartLcpReq::PauseLcpStartSecond);
    return;
  }
  else
  {
    jam();
    dihCopyCompletedLab(signal);
    return;
  }
}//Dbdih::copyNodeLab()
17364 
/**
 * Start sending one table's packed description pages to the starting
 * node (node restart).  If the starting node has failed in the
 * meantime, the packed pages are released and the wait flag cleared so
 * new node restarts may begin.  The actual transfer is driven by
 * CONTINUEB(ZCOPY_TABLE_NODE) -> copyTableNode().
 */
void Dbdih::tableCopyNodeLab(Signal* signal, TabRecordPtr tabPtr)
{
  /* ----------------------------------------------------------------------- */
  /*       COPY PAGES READ TO STARTING NODE.                                 */
  /* ----------------------------------------------------------------------- */
  if (!c_nodeStartMaster.activeState) {
    jam();
    // Starting node crashed: drop the packed pages and stop this copy.
    releaseTabPages(tabPtr.i);
    c_nodeStartMaster.wait = ZFALSE;
    return;
  }//if
  NodeRecordPtr copyNodePtr;
  PageRecordPtr pagePtr;
  copyNodePtr.i = c_nodeStartMaster.startNode;
  ptrCheckGuard(copyNodePtr, MAX_NDB_NODES, nodeRecord);

  copyNodePtr.p->activeTabptr = tabPtr.i;
  pagePtr.i = tabPtr.p->pageRef[0];
  ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);

  // theData[3]/[4] = start page index / word index (both 0).
  // theData[5] = total words to copy; word[34] of the first page
  // presumably holds the packed size — TODO confirm against the
  // table-packing routine (not visible here).
  signal->theData[0] = DihContinueB::ZCOPY_TABLE_NODE;
  signal->theData[1] = tabPtr.i;
  signal->theData[2] = copyNodePtr.i;
  signal->theData[3] = 0;
  signal->theData[4] = 0;
  signal->theData[5] = pagePtr.p->word[34];
  sendSignal(reference(), GSN_CONTINUEB, signal, 6, JBB);
}//Dbdih::tableCopyNodeLab()
17393 
17394 /* ------------------------------------------------------------------------- */
17395 // execCONTINUEB(ZCOPY_TABLE)
17396 // This routine is used to copy the table descriptions from the master to
17397 // other nodes. It is used in the system restart to copy from master to all
17398 // starting nodes.
17399 /* ------------------------------------------------------------------------- */
copyTableLab(Signal * signal,Uint32 tableId)17400 void Dbdih::copyTableLab(Signal* signal, Uint32 tableId)
17401 {
17402   TabRecordPtr tabPtr;
17403   tabPtr.i = tableId;
17404   ptrAss(tabPtr, tabRecord);
17405 
17406   ndbrequire(tabPtr.p->tabCopyStatus == TabRecord::CS_IDLE);
17407   tabPtr.p->tabCopyStatus = TabRecord::CS_SR_PHASE2_READ_TABLE;
17408   signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
17409   signal->theData[1] = tabPtr.i;
17410   sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
17411   return;
17412 }//Dbdih::copyTableLab()
17413 
17414 /* ------------------------------------------------------------------------- */
17415 // execCONTINUEB(ZSR_PHASE2_READ_TABLE)
17416 /* ------------------------------------------------------------------------- */
srPhase2ReadTableLab(Signal * signal,TabRecordPtr tabPtr)17417 void Dbdih::srPhase2ReadTableLab(Signal* signal, TabRecordPtr tabPtr)
17418 {
17419   /* ----------------------------------------------------------------------- */
17420   // We set the sendCOPY_TABREQState to ZACTIVE for all nodes since it is a long
17421   // process to send off all table descriptions. Thus we ensure that we do
17422   // not encounter race conditions where one node is completed before the
17423   // sending process is completed. This could lead to that we start off the
17424   // system before we actually finished all copying of table descriptions
17425   // and could lead to strange errors.
17426   /* ----------------------------------------------------------------------- */
17427 
17428   //sendLoopMacro(COPY_TABREQ, nullRoutine);
17429 
17430   breakCopyTableLab(signal, tabPtr, cfirstAliveNode);
17431   return;
17432 }//Dbdih::srPhase2ReadTableLab()
17433 
17434 /* ------------------------------------------------------------------------- */
17435 /*       COPY PAGES READ TO ALL NODES.                                       */
17436 /* ------------------------------------------------------------------------- */
/**
 * Serially send one table's packed description to each alive node
 * during system restart.  The node list is walked via nextNode starting
 * at 'nodeId'.  For our own node only a disk write of the table is
 * scheduled (ZTABLE_UPDATE) and the walk continues; for a remote node
 * the COPY_TABREQ transfer is started (ZCOPY_TABLE_NODE) and this
 * function returns — copyTableNode() calls back here with the next node
 * once the transfer completes.
 */
void Dbdih::breakCopyTableLab(Signal* signal, TabRecordPtr tabPtr, Uint32 nodeId)
{
  NodeRecordPtr nodePtr;
  nodePtr.i = nodeId;
  while (nodePtr.i != RNIL) {
    jam();
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    if (nodePtr.i == getOwnNodeId()){
      jam();
      /* ------------------------------------------------------------------- */
      /* NOT NECESSARY TO COPY TO MY OWN NODE. I ALREADY HAVE THE PAGES.     */
      /* I DO HOWEVER NEED TO STORE THE TABLE DESCRIPTION ONTO DISK.         */
      /* ------------------------------------------------------------------- */
      /* IF WE ARE MASTER WE ONLY NEED TO SAVE THE TABLE ON DISK. WE ALREADY */
      /* HAVE THE TABLE DESCRIPTION IN THE DATA STRUCTURES.                  */
      // AFTER COMPLETING THE WRITE TO DISK THE MASTER WILL ALSO SEND
      // COPY_TABCONF AS ALL THE OTHER NODES.
      /* ------------------------------------------------------------------- */
      c_COPY_TABREQ_Counter.setWaitingFor(nodePtr.i);
      tabPtr.p->tabUpdateState = TabRecord::US_COPY_TAB_REQ;
      signal->theData[0] = DihContinueB::ZTABLE_UPDATE;
      signal->theData[1] = tabPtr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
      nodePtr.i = nodePtr.p->nextNode;
    } else {
      PageRecordPtr pagePtr;
      /* -------------------------------------------------------------------- */
      // RATHER THAN SENDING ALL COPY_TABREQ IN PARALLEL WE WILL SERIALISE THIS
      // ACTIVITY AND WILL THUS CALL breakCopyTableLab AGAIN WHEN COMPLETED THE
      // SENDING OF COPY_TABREQ'S.
      /* -------------------------------------------------------------------- */
      jam();
      tabPtr.p->tabCopyStatus = TabRecord::CS_SR_PHASE3_COPY_TABLE;
      pagePtr.i = tabPtr.p->pageRef[0];
      ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
      // theData[3]/[4]: start at page 0, word 0.  theData[5]: total word
      // count; word[34] of the first page presumably holds the packed
      // size — TODO confirm against the table-packing routine.
      signal->theData[0] = DihContinueB::ZCOPY_TABLE_NODE;
      signal->theData[1] = tabPtr.i;
      signal->theData[2] = nodePtr.i;
      signal->theData[3] = 0;
      signal->theData[4] = 0;
      signal->theData[5] = pagePtr.p->word[34];
      sendSignal(reference(), GSN_CONTINUEB, signal, 6, JBB);
      return;
    }//if
  }//while
  /* ----------------------------------------------------------------------- */
  /*    WE HAVE NOW SENT THE TABLE PAGES TO ALL NODES. EXIT AND WAIT FOR ALL */
  /*    REPLIES.                                                             */
  /* ----------------------------------------------------------------------- */
  return;
}//Dbdih::breakCopyTableLab()
17488 
17489 /* ------------------------------------------------------------------------- */
17490 // execCONTINUEB(ZCOPY_TABLE_NODE)
17491 /* ------------------------------------------------------------------------- */
/**
 * CONTINUEB(ZCOPY_TABLE_NODE): transfer the packed table description in
 * *ctn to node nodePtr.  At most 16 COPY_TABREQ signals of 16 words
 * each are sent per invocation; then a real-time break is taken by
 * re-sending ZCOPY_TABLE_NODE with the updated page/word position.
 * When the final words have been sent, the follow-up depends on
 * tabCopyStatus: CS_SR_PHASE3_COPY_TABLE (system restart) continues
 * with the next node in the list, CS_COPY_NODE_STATE (node restart) is
 * simply done.
 */
void Dbdih::copyTableNode(Signal* signal,
			  CopyTableNode* ctn, NodeRecordPtr nodePtr)
{
  if (getNodeState().startLevel >= NodeState::SL_STARTED){
    /* --------------------------------------------------------------------- */
    // We are in the process of performing a node restart and are copying a
    // table description to a starting node. We will check that no nodes have
    // crashed in this process.
    /* --------------------------------------------------------------------- */
    if (!c_nodeStartMaster.activeState) {
      jam();
      /** ------------------------------------------------------------------
       * The starting node crashed. We will release table pages and stop this
       * copy process and allow new node restarts to start.
       * ------------------------------------------------------------------ */
      releaseTabPages(ctn->ctnTabPtr.i);
      c_nodeStartMaster.wait = ZFALSE;
      return;
    }//if
  }//if
  ndbrequire(ctn->pageIndex < NDB_ARRAY_SIZE(ctn->ctnTabPtr.p->pageRef));
  ctn->ctnPageptr.i = ctn->ctnTabPtr.p->pageRef[ctn->pageIndex];
  ptrCheckGuard(ctn->ctnPageptr, cpageFileSize, pageRecord);
  /**
   * If first page & firstWord reqinfo = 1 (first signal)
   */
  Uint32 reqinfo = (ctn->pageIndex == 0) && (ctn->wordIndex == 0);
  if(reqinfo == 1){
    // Very first COPY_TABREQ for this table: register the receiver so
    // the COPY_TABCONF bookkeeping knows we are waiting for this node.
    c_COPY_TABREQ_Counter.setWaitingFor(nodePtr.i);
  }

  // Send at most 16 signals before taking a real-time break.
  for (Uint32 i = 0; i < 16; i++) {
    jam();
    sendCopyTable(signal, ctn, calcDihBlockRef(nodePtr.i), reqinfo);
    reqinfo = 0;
    if (ctn->noOfWords <= 16) {
      jam();
      // That was the last chunk of this table's description.
      switch (ctn->ctnTabPtr.p->tabCopyStatus) {
      case TabRecord::CS_SR_PHASE3_COPY_TABLE:
	/* ------------------------------------------------------------------ */
	// We have copied the table description to this node.
	// We will now proceed
	// with sending the table description to the next node in the node list.
	/* ------------------------------------------------------------------ */
        jam();
        ctn->ctnTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
        breakCopyTableLab(signal, ctn->ctnTabPtr, nodePtr.p->nextNode);
        return;
        break;
      case TabRecord::CS_COPY_NODE_STATE:
        jam();
        ctn->ctnTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
        return;
        break;
      default:
        ndbrequire(false);
        break;
      }//switch
    } else {
      jam();
      // Advance 16 words; a page holds 2048 data words, so move to the
      // next page when the end of the current one is reached.
      ctn->wordIndex += 16;
      if (ctn->wordIndex == 2048) {
        jam();
        ctn->wordIndex = 0;
        ctn->pageIndex++;
        ndbrequire(ctn->pageIndex < NDB_ARRAY_SIZE(ctn->ctnTabPtr.p->pageRef));
        ctn->ctnPageptr.i = ctn->ctnTabPtr.p->pageRef[ctn->pageIndex];
        ptrCheckGuard(ctn->ctnPageptr, cpageFileSize, pageRecord);
      }//if
      ctn->noOfWords -= 16;
    }//if
  }//for
  // Real-time break: resume the transfer from the current position.
  signal->theData[0] = DihContinueB::ZCOPY_TABLE_NODE;
  signal->theData[1] = ctn->ctnTabPtr.i;
  signal->theData[2] = nodePtr.i;
  signal->theData[3] = ctn->pageIndex;
  signal->theData[4] = ctn->wordIndex;
  signal->theData[5] = ctn->noOfWords;
  sendSignal(reference(), GSN_CONTINUEB, signal, 6, JBB);
}//Dbdih::copyTableNode()
17572 
/**
 * Build and send one COPY_TABREQ to 'ref' carrying the next 16 words of
 * the packed table description (from the current page at
 * ctn->wordIndex).  The first signal of a table (reqinfo == 1) uses the
 * extended signal length and additionally carries the table's LCP
 * status and the current LCP id.
 */
void Dbdih::sendCopyTable(Signal* signal, CopyTableNode* ctn,
                          BlockReference ref, Uint32 reqinfo)
{
  CopyTabReq *req = (CopyTabReq*) signal->getDataPtrSend();
  req->senderRef = reference();
  req->reqinfo = reqinfo;
  req->tableId = ctn->ctnTabPtr.i;
  req->tableSchemaVersion = ctn->ctnTabPtr.p->schemaVersion;
  req->noOfWords = ctn->noOfWords;
  // The 16-word copy must stay within the 2048-word page.
  ndbrequire(ctn->wordIndex + 15 < 2048);
  MEMCOPY_NO_WORDS(&req->tableWords[0],
                   &ctn->ctnPageptr.p->word[ctn->wordIndex],
                   16);
  Uint32 sig_len = CopyTabReq::SignalLength;
  if (reqinfo == 1)
  {
    if (ctn->ctnTabPtr.p->tabLcpStatus == TabRecord::TLS_ACTIVE)
    {
      jam();
      req->tabLcpStatus = CopyTabReq::LcpActive;
    }
    else
    {
      jam();
      /**
       * The state TLS_WRITING_TO_FILE means that the LCP is completed from the
       * viewpoint of the new starting node since it will start by writing the
       * table description to disk.
       */
      req->tabLcpStatus = CopyTabReq::LcpCompleted;
    }
    req->currentLcpId = SYSFILE->latestLCP_ID;
    sig_len = CopyTabReq::SignalLengthExtra;
  }
  sendSignal(ref, GSN_COPY_TABREQ, signal, sig_len, JBB);
}//Dbdih::sendCopyTable()
17609 
/**
 * COPY_TABCONF: a node acknowledges receipt (and disk storage) of one
 * table description.
 *
 * Node restart (this node is started): only the starting node is copied
 * to; release the packed pages and continue with the next table via
 * CONTINUEB(ZCOPY_NODE).
 *
 * System restart: wait until every node registered in the COPY_TABREQ
 * counter has replied, then release the pages and continue by adding
 * the table's fragments.
 */
void Dbdih::execCOPY_TABCONF(Signal* signal)
{
  CopyTabConf *conf = (CopyTabConf*) &signal->theData[0];
  jamEntry();
  Uint32 nodeId = conf->nodeId;
  Uint32 tableId = conf->tableId;
  if (getNodeState().startLevel >= NodeState::SL_STARTED){
    /* --------------------------------------------------------------------- */
    // We are in the process of performing a node restart. Continue by copying
    // the next table to the starting node.
    /* --------------------------------------------------------------------- */
    jam();
    ndbrequire(nodeId == c_nodeStartMaster.startNode);
    c_COPY_TABREQ_Counter.clearWaitingFor(nodeId);

    releaseTabPages(tableId);
    signal->theData[0] = DihContinueB::ZCOPY_NODE;
    signal->theData[1] = tableId + 1;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    return;
  } else {
    /* --------------------------------------------------------------------- */
    // We are in the process of performing a system restart. Check if all nodes
    // have saved the new table description to file and then continue with the
    // next table.
    /* --------------------------------------------------------------------- */
    // NOTE(review): receiveLoopMacro presumably returns from this
    // function until all waited-for nodes have replied — macro defined
    // elsewhere; behavior inferred from usage here.
    receiveLoopMacro(COPY_TABREQ, nodeId);
    /* --------------------------------------------------------------------- */
    /*   WE HAVE NOW COPIED TO ALL NODES. WE HAVE NOW COMPLETED RESTORING    */
    /*   THIS TABLE. CONTINUE WITH THE NEXT TABLE.                           */
    /*   WE NEED TO RELEASE THE PAGES IN THE TABLE IN THIS NODE HERE.        */
    /*   WE ALSO NEED TO CLOSE THE TABLE FILE.                               */
    /* --------------------------------------------------------------------- */
    releaseTabPages(tableId);

    TabRecordPtr tabPtr;
    tabPtr.i = tableId;
    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

    ConnectRecordPtr connectPtr;
    connectPtr.i = tabPtr.p->connectrec;
    ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);

    connectPtr.p->m_alter.m_totalfragments = tabPtr.p->totalfragments;
    sendAddFragreq(signal, connectPtr, tabPtr, 0);
    return;
  }//if
}//Dbdih::execCOPY_TABCONF()
17658 
17659 /*
17660   3.13   L O C A L   C H E C K P O I N T  (M A S T E R)
17661   ****************************************************
17662   */
17663 /*****************************************************************************/
17664 /* **********     LOCAL-CHECK-POINT-HANDLING MODULE              *************/
17665 /*****************************************************************************/
17666 /* ------------------------------------------------------------------------- */
17667 /*       IT IS TIME TO CHECK IF IT IS TIME TO START A LOCAL CHECKPOINT.      */
17668 /*       WE WILL EITHER START AFTER 1 MILLION WORDS HAVE ARRIVED OR WE WILL  */
17669 /*       EXECUTE AFTER ABOUT 16 MINUTES HAVE PASSED BY.                      */
17670 /* ------------------------------------------------------------------------- */
checkTcCounterLab(Signal * signal)17671 void Dbdih::checkTcCounterLab(Signal* signal)
17672 {
17673   CRASH_INSERTION(7009);
17674   if (c_lcpState.lcpStatus != LCP_STATUS_IDLE) {
17675     g_eventLogger->error("lcpStatus = %u"
17676                          "lcpStatusUpdatedPlace = %d",
17677                          (Uint32) c_lcpState.lcpStatus,
17678                          c_lcpState.lcpStatusUpdatedPlace);
17679     ndbrequire(false);
17680     return;
17681   }//if
17682   add_lcp_counter(&c_lcpState.ctimer, 32);
17683   if (c_lcpState.lcpStopGcp >= c_newest_restorable_gci) {
17684     jam();
17685     /* --------------------------------------------------------------------- */
17686     // We block LCP start if we have not completed one global checkpoints
17687     // before starting another local checkpoint.
17688     /* --------------------------------------------------------------------- */
17689     c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
17690     checkLcpStart(signal, __LINE__, 100);
17691     return;
17692   }//if
17693   c_lcpState.setLcpStatus(LCP_TCGET, __LINE__);
17694 
17695   c_lcpState.ctcCounter = c_lcpState.ctimer;
17696   sendLoopMacro(TCGETOPSIZEREQ, sendTCGETOPSIZEREQ, RNIL);
17697 }//Dbdih::checkTcCounterLab()
17698 
checkLcpStart(Signal * signal,Uint32 lineNo,Uint32 delay)17699 void Dbdih::checkLcpStart(Signal* signal, Uint32 lineNo, Uint32 delay)
17700 {
17701   /* ----------------------------------------------------------------------- */
17702   // Verify that we are not attempting to start another instance of the LCP
17703   // when it is not alright to do so.
17704   /* ----------------------------------------------------------------------- */
17705   c_lcpState.lcpStart = ZACTIVE;
17706   signal->theData[0] = DihContinueB::ZCHECK_TC_COUNTER;
17707   signal->theData[1] = lineNo;
17708   if (delay == 0)
17709   {
17710     jam();
17711     sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
17712   }
17713   else
17714   {
17715     jam();
17716     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, delay, 2);
17717   }
17718 }//Dbdih::checkLcpStart()
17719 
17720 /* ------------------------------------------------------------------------- */
17721 /*TCGETOPSIZECONF          HOW MUCH OPERATION SIZE HAVE BEEN EXECUTED BY TC  */
17722 /* ------------------------------------------------------------------------- */
/**
 * TCGETOPSIZECONF: one TC reports how many words of operations it has
 * executed.  Accumulates the value into ctcCounter and proceeds only
 * once every TC has replied.  Unless an immediate LCP start was
 * requested, the LCP is deferred while too little work has been done
 * (ctcCounter below 2^clcpDelay) or while outstanding node restarts
 * make stalling preferable.  Otherwise the LCP start continues by
 * asking all TCs to clear their operation size counters.
 */
void Dbdih::execTCGETOPSIZECONF(Signal* signal)
{
  jamEntry();
  Uint32 senderNodeId = signal->theData[0];
  add_lcp_counter(&c_lcpState.ctcCounter, signal->theData[1]);

  // NOTE(review): receiveLoopMacro presumably returns from this
  // function until all waited-for TCs have replied — macro defined
  // elsewhere; behavior inferred from usage.
  receiveLoopMacro(TCGETOPSIZEREQ, senderNodeId);

  ndbrequire(c_lcpState.lcpStatus == LCP_TCGET);
  ndbrequire(c_lcpState.lcpStart == ZACTIVE);
  /* ----------------------------------------------------------------------- */
  // We are not actively starting another LCP, still we receive this signal.
  // This is not ok.
  /* ---------------------------------------------------------------------- */
  /*    ALL TC'S HAVE RESPONDED NOW. NOW WE WILL CHECK IF ENOUGH OPERATIONS */
  /*    HAVE EXECUTED TO ENABLE US TO START A NEW LOCAL CHECKPOINT.         */
  /*    WHILE COPYING DICTIONARY AND DISTRIBUTION INFO TO A STARTING NODE   */
  /*    WE WILL ALSO NOT ALLOW THE LOCAL CHECKPOINT TO PROCEED.             */
  /*----------------------------------------------------------------------- */
  if (c_lcpState.immediateLcpStart == false)
  {
    // Start only when the executed word count reaches 2^clcpDelay.
    Uint64 cnt = Uint64(c_lcpState.ctcCounter);
    Uint64 limit = Uint64(1) << c_lcpState.clcpDelay;
    bool dostart = cnt >= limit;
    if (dostart == false)
    {
      jam();
      c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
      checkLcpStart(signal, __LINE__, 1000);
      return;
    }//if

    /**
     * Check if we have reason to stall the start of the LCP due to
     * outstanding node restarts that are reasonably close to
     * need a LCP to complete or to need a point in time where there
     * are no LCPs ongoing.
     */
    if (check_stall_lcp_start())
    {
      c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
      checkLcpStart(signal, __LINE__, 3000);
      return;
    }
  }
  c_lcpState.lcpStart = ZIDLE;
  c_lcpState.immediateLcpStart = false;
  /* -----------------------------------------------------------------------
   * Now the initial lcp is started,
   * we can reset the delay to its original value
   * --------------------------------------------------------------------- */
  CRASH_INSERTION(7010);
  /* ----------------------------------------------------------------------- */
  /*     IF MORE THAN 1 MILLION WORDS PASSED THROUGH THE TC'S THEN WE WILL   */
  /*     START A NEW LOCAL CHECKPOINT. CLEAR CTIMER. START CHECKPOINT        */
  /*     ACTIVITY BY CALCULATING THE KEEP GLOBAL CHECKPOINT.                 */
  // Also remember the current global checkpoint to ensure that we run at least
  // one global checkpoints between each local checkpoint that we start up.
  /* ----------------------------------------------------------------------- */
  c_lcpState.ctimer = 0;
  c_lcpState.keepGci = (Uint32)(m_micro_gcp.m_old_gci >> 32);
  c_lcpState.oldestRestorableGci = SYSFILE->oldestRestorableGCI;

  CRASH_INSERTION(7014);
  c_lcpState.setLcpStatus(LCP_TC_CLOPSIZE, __LINE__);
  sendLoopMacro(TC_CLOPSIZEREQ, sendTC_CLOPSIZEREQ, RNIL);
}
17790 
/**
 * TC_CLOPSIZECONF: a TC confirms that its operation size counter has
 * been cleared.  Once all TCs have replied we move on to acquire the
 * fragment-info mutex and start the LCP proper.  Error insert 7011
 * stops the LCP here (for testing) after emitting an event.
 */
void Dbdih::execTC_CLOPSIZECONF(Signal* signal)
{
  jamEntry();
  Uint32 senderNodeId = signal->theData[0];
  // Returns from this function until every TC has replied (macro
  // defined elsewhere; behavior inferred from usage).
  receiveLoopMacro(TC_CLOPSIZEREQ, senderNodeId);

  ndbrequire(c_lcpState.lcpStatus == LCP_TC_CLOPSIZE);

  /* ----------------------------------------------------------------------- */
  /*       UPDATE THE NEW LATEST LOCAL CHECKPOINT ID.                        */
  /* ----------------------------------------------------------------------- */
  cnoOfActiveTables = 0;
  c_lcpState.setLcpStatus(LCP_WAIT_MUTEX, __LINE__);
  // Sanity: the oldest restorable GCI must be a positive signed value.
  ndbrequire(((int)c_lcpState.oldestRestorableGci) > 0);

  if (ERROR_INSERTED(7011)) {
    signal->theData[0] = NDB_LE_LCPStoppedInCalcKeepGci;
    signal->theData[1] = 0;
    sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
    return;
  }//if
  start_lcp_before_mutex(signal);
}
17814 
start_lcp_before_mutex(Signal * signal)17815 void Dbdih::start_lcp_before_mutex(Signal *signal)
17816 {
17817   /**
17818    * We lock the Fragment Info for at least a short time. This ensures
17819    * that we don't start an LCP while we are copying meta data. If we
17820    * support PAUSE LCP protocol we can later release the mutex early
17821    * on.
17822    */
17823   jam();
17824   Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
17825   Callback c = { safe_cast(&Dbdih::lcpFragmentMutex_locked), 0 };
17826   ndbrequire(mutex.trylock(c, false));
17827 }
17828 
/**
 * Callback for the fragment-info mutex trylock issued in
 * start_lcp_before_mutex().  If the mutex is already held (meta data is
 * being copied for a node restart), our handle is released and the LCP
 * start is retried every 500 ms via CONTINUEB(ZLCP_TRY_LOCK);
 * 'senderData' counts the retries (0 on the first attempt, which also
 * logs an event).  After 2 * m_lcp_trylock_timeout retries — the factor
 * 2 converts the configured seconds into 500 ms ticks — polling stops
 * and we queue on the mutex instead.  On successful lock (retVal == 0)
 * the LCP is started.
 */
void
Dbdih::lcpFragmentMutex_locked(Signal* signal,
                               Uint32 senderData,
                               Uint32 retVal)
{
  jamEntry();

  if (retVal == UtilLockRef::LockAlreadyHeld)
  {
    jam();
    Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
    mutex.release();

    if (senderData == 0)
    {
      jam();
      infoEvent("Local checkpoint blocked waiting for node-restart");
    }
    // 2* is as parameter is in seconds, and we sendSignalWithDelay 500ms
    if (senderData >= 2*c_lcpState.m_lcp_trylock_timeout)
    {
      jam();
      // Give up polling: block on the mutex until it is granted.
      Callback c = { safe_cast(&Dbdih::lcpFragmentMutex_locked), 0 };
      ndbrequire(mutex.lock(c, false));
      return;
    }
    signal->theData[0] = DihContinueB::ZLCP_TRY_LOCK;
    signal->theData[1] = senderData + 1;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 500, 2);
    return;
  }

  ndbrequire(retVal == 0);
  start_lcp(signal);
}
17864 
/**
 * Begin the LCP proper, with the fragment-info mutex held.  Records the
 * LCP start time, marks the nodes' LCP active status, moves the state
 * machine to LCP_CALCULATE_KEEP_GCI, snapshots the GCI that must stay
 * restorable, bumps the LCP id and starts the keep-GCI scan at
 * table 0 / fragment 0.
 */
void Dbdih::start_lcp(Signal *signal)
{
  c_lcpState.m_start_time = c_current_time = NdbTick_getCurrentTicks();

  setLcpActiveStatusStart(signal);

  c_lcpState.setLcpStatus(LCP_CALCULATE_KEEP_GCI, __LINE__);
  // Upper 32 bits of the old micro-GCP GCI = the GCI to keep.
  c_lcpState.keepGci = m_micro_gcp.m_old_gci >> 32;
  c_lcpState.oldestRestorableGci = SYSFILE->oldestRestorableGCI;
  SYSFILE->latestLCP_ID++;

  signal->theData[0] = DihContinueB::ZCALCULATE_KEEP_GCI;
  signal->theData[1] = 0;  /* TABLE ID = 0          */
  signal->theData[2] = 0;  /* FRAGMENT ID = 0       */
  sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
  return;
}
17882 
17883 /* ------------------------------------------------------------------------- */
17884 /*       WE NEED TO CALCULATE THE OLDEST GLOBAL CHECKPOINT THAT WILL BE      */
17885 /*       COMPLETELY RESTORABLE AFTER EXECUTING THIS LOCAL CHECKPOINT.        */
17886 /* ------------------------------------------------------------------------- */
/**
 * CONTINUEB(ZCALCULATE_KEEP_GCI): walk every fragment of every table
 * that participates in checkpoints (TS_ACTIVE with ST_NORMAL storage)
 * and let checkKeepGci() compute the oldest GCI that remains restorable
 * after this LCP.  Real-time breaks are taken after 100 skipped tables
 * and after each processed fragment.  When the scan is past the last
 * table, continue with ZSTORE_NEW_LCP_ID — or stop the LCP if no
 * checkpointable table exists at all.
 */
void Dbdih::calculateKeepGciLab(Signal* signal, Uint32 tableId, Uint32 fragId)
{
  TabRecordPtr tabPtr;
  Uint32 TloopCount = 1;
  tabPtr.i = tableId;
  do {
    if (tabPtr.i >= ctabFileSize) {
      if (cnoOfActiveTables > 0) {
        jam();
        signal->theData[0] = DihContinueB::ZSTORE_NEW_LCP_ID;
        sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
        return;
      } else {
        jam();
	/* ------------------------------------------------------------------ */
	/* THERE ARE NO TABLES TO CHECKPOINT. WE STOP THE CHECKPOINT ALREADY  */
	/* HERE TO AVOID STRANGE PROBLEMS LATER.                              */
	/* ------------------------------------------------------------------ */
        c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
        checkLcpStart(signal, __LINE__, 1000);
        return;
      }//if
    }//if
    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
    if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE ||
	tabPtr.p->tabStorage != TabRecord::ST_NORMAL) {
      // Table does not take part in checkpoints: skip it, with a
      // real-time break every 100 skipped tables.
      if (TloopCount > 100) {
        jam();
        signal->theData[0] = DihContinueB::ZCALCULATE_KEEP_GCI;
        signal->theData[1] = tabPtr.i + 1;
        signal->theData[2] = 0;
        sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
        return;
      } else {
        jam();
        TloopCount++;
        tabPtr.i++;
      }//if
    } else {
      jam();
      // Found a checkpointable table: zero exits the scan loop below.
      TloopCount = 0;
    }//if
  } while (TloopCount != 0);
  cnoOfActiveTables++;
  FragmentstorePtr fragPtr;
  getFragstore(tabPtr.p, fragId, fragPtr);
  // Check both current and old stored replicas of this fragment.
  checkKeepGci(tabPtr, fragId, fragPtr.p, fragPtr.p->storedReplicas);
  checkKeepGci(tabPtr, fragId, fragPtr.p, fragPtr.p->oldStoredReplicas);
  fragId++;
  if (fragId >= tabPtr.p->totalfragments) {
    jam();
    tabPtr.i++;
    fragId = 0;
  }//if
  // Real-time break after every fragment; resume at (table, fragment).
  signal->theData[0] = DihContinueB::ZCALCULATE_KEEP_GCI;
  signal->theData[1] = tabPtr.i;
  signal->theData[2] = fragId;
  sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
  return;
}//Dbdih::calculateKeepGciLab()
17947 
17948 /* ------------------------------------------------------------------------- */
17949 /*       WE NEED TO STORE ON DISK THE FACT THAT WE ARE STARTING THIS LOCAL   */
17950 /*       CHECKPOINT ROUND. THIS WILL INVALIDATE ALL THE LOCAL CHECKPOINTS    */
17951 /*       THAT WILL EVENTUALLY BE OVERWRITTEN AS PART OF THIS LOCAL CHECKPOINT*/
17952 /* ------------------------------------------------------------------------- */
/**
 * Persist the start of a new LCP round.  Reports the LCP-started event,
 * writes keepGci / oldestRestorableGci into the system file, sets the
 * LCP-ongoing bit (invalidating the checkpoints about to be
 * overwritten), updates node restart info and finally triggers the
 * COPY_GCI protocol for LOCAL_CHECKPOINT, which distributes the new
 * system file contents.
 */
void Dbdih::storeNewLcpIdLab(Signal* signal)
{
  signal->theData[0] = NDB_LE_LocalCheckpointStarted; //Event type
  signal->theData[1] = SYSFILE->latestLCP_ID;
  signal->theData[2] = c_lcpState.keepGci;
  signal->theData[3] = c_lcpState.oldestRestorableGci;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 4, JBB);

  /***************************************************************************/
  // Report the event that a local checkpoint has started.
  /***************************************************************************/

  signal->setTrace(TestOrd::TraceLocalCheckpoint);

  CRASH_INSERTION(7013);
  SYSFILE->keepGCI = c_lcpState.keepGci;
  SYSFILE->oldestRestorableGCI = c_lcpState.oldestRestorableGci;

  const Uint32 oldestRestorableGCI = SYSFILE->oldestRestorableGCI;

  // Sanity: the GCI must still be a positive value when read as signed.
  Int32 val = oldestRestorableGCI;
  ndbrequire(val > 0);

  /* ----------------------------------------------------------------------- */
  /* SET BIT INDICATING THAT LOCAL CHECKPOINT IS ONGOING. THIS IS CLEARED    */
  /* AT THE END OF A LOCAL CHECKPOINT.                                       */
  /* ----------------------------------------------------------------------- */
  SYSFILE->setLCPOngoing(SYSFILE->systemRestartBits);
  /* ---------------------------------------------------------------------- */
  /*    CHECK IF ANY NODE MUST BE TAKEN OUT OF SERVICE AND REFILLED WITH    */
  /*    NEW FRESH DATA FROM AN ACTIVE NODE.                                 */
  /* ---------------------------------------------------------------------- */

  /**
   * This used be done in setLcpActiveStatusStart
   *   but this function has been move "up" in the flow
   *   to just before calcKeepGci
   */
  setNodeRestartInfoBits(signal);

  c_lcpState.setLcpStatus(LCP_COPY_GCI, __LINE__);
  //#ifdef VM_TRACE
  //  infoEvent("LocalCheckpoint %d started", SYSFILE->latestLCP_ID);
  //  signal->theData[0] = 7012;
  //  execDUMP_STATE_ORD(signal);
  //#endif

  copyGciLab(signal, CopyGCIReq::LOCAL_CHECKPOINT);
}//Dbdih::storeNewLcpIdLab()
18002 
startLcpRoundLab(Signal * signal)18003 void Dbdih::startLcpRoundLab(Signal* signal)
18004 {
18005   jam();
18006 
18007   CRASH_INSERTION(7218);
18008 
18009   /**
18010    * Next step in starting up a local checkpoint is to define which
18011    * tables that should participate in the local checkpoint, while
18012    * we are performing this step we don't want to have committing
18013    * schema transactions in the middle of this, this mutex ensures
18014    * that we will wait for a schema transaction to commit before we
18015    * proceed and once we acquired the mutex, then schema transaction
18016    * commits will block waiting for this LCP phase to complete.
18017    *
18018    * The reason we need this mutex is to ensure that all nodes that
18019    * participate in the LCP have the same view on the tables involved
18020    * in the LCP. This makes it possible for a node to easily take
18021    * over the master role in executing a LCP if the master node that
18022    * controls the LCP fails.
18023    */
18024   Mutex mutex(signal, c_mutexMgr, c_startLcpMutexHandle);
18025   Callback c = { safe_cast(&Dbdih::startLcpMutex_locked), 0 };
18026   ndbrequire(mutex.lock(c));
18027 }
18028 
18029 void
startLcpMutex_locked(Signal * signal,Uint32 senderData,Uint32 retVal)18030 Dbdih::startLcpMutex_locked(Signal* signal, Uint32 senderData, Uint32 retVal){
18031   jamEntry();
18032   ndbrequire(retVal == 0);
18033 
18034   StartLcpReq* req = (StartLcpReq*)signal->getDataPtrSend();
18035   req->senderRef = reference();
18036   req->lcpId = SYSFILE->latestLCP_ID;
18037   req->participatingLQH = c_lcpState.m_participatingLQH;
18038   req->participatingDIH = c_lcpState.m_participatingDIH;
18039   req->pauseStart = StartLcpReq::NormalLcpStart; /* Normal LCP start */
18040   sendLoopMacro(START_LCP_REQ, sendSTART_LCP_REQ, RNIL);
18041 }
18042 
18043 void
sendSTART_LCP_REQ(Signal * signal,Uint32 nodeId,Uint32 extra)18044 Dbdih::sendSTART_LCP_REQ(Signal* signal, Uint32 nodeId, Uint32 extra)
18045 {
18046   BlockReference ref = calcDihBlockRef(nodeId);
18047   if (ERROR_INSERTED(7021) && nodeId == getOwnNodeId())
18048   {
18049     sendSignalWithDelay(ref, GSN_START_LCP_REQ, signal, 500,
18050                         StartLcpReq::SignalLength);
18051     return;
18052   }
18053   else if (ERROR_INSERTED(7021) && ((rand() % 10) > 4))
18054   {
18055     infoEvent("Don't send START_LCP_REQ to %u", nodeId);
18056     return;
18057   }
18058   sendSignal(ref, GSN_START_LCP_REQ, signal, StartLcpReq::SignalLength, JBB);
18059 }
18060 
/**
 * START_LCP_CONF from a node's DIH block.
 *
 * Two distinct flows arrive here:
 *  - While an LCP is paused (master only): the confirmation concerns
 *    including a starting node into the LCP protocol, and we advance
 *    the PAUSE-LCP state machine instead of the normal start flow.
 *  - Normal LCP start: we collect confirmations from all participants
 *    (receiveLoopMacro) and, once all have answered, unlock the
 *    start-LCP mutex.
 */
void
Dbdih::execSTART_LCP_CONF(Signal* signal)
{
  StartLcpConf * conf = (StartLcpConf*)signal->getDataPtr();

  Uint32 nodeId = refToNode(conf->senderRef);

  if (is_lcp_paused())
  {
    /* Only the master drives the PAUSE-LCP protocol. */
    ndbrequire(isMaster());
    if (c_pause_lcp_master_state == PAUSE_START_LCP_INCLUSION)
    {
      jam();
      /**
       * We have completed including the starting node into the LCP.
       * We now need to copy the meta data.
       *
       * We come here as part of starting up a new starting node, so
       * we don't come here as part of a normal LCP start. So the
       * bitmap for outstanding signals we should not use since we
       * haven't set it up in this case.
       */
      c_pause_lcp_master_state = PAUSE_IN_LCP_COPY_META_DATA;
      start_copy_meta_data(signal);
      return;
    }
    else
    {
      jam();
      ndbrequire(c_pause_lcp_master_state == PAUSE_COMPLETE_LCP_INCLUSION);
      /**
       * We have completed copying the meta data and now we have also
       * completed the inclusion of the new node into the LCP protocol.
       * We are now ready to continue to the next stage of the node
       * restart handling for the starting node.
       */
      sendPAUSE_LCP_REQ(signal, false);
      return;
    }
  }
  /* Normal flow: wait until every participant has confirmed. */
  receiveLoopMacro(START_LCP_REQ, nodeId);

  /* All confirmations received: release the start-LCP mutex. */
  Mutex mutex(signal, c_mutexMgr, c_startLcpMutexHandle);
  Callback c = { safe_cast(&Dbdih::startLcpMutex_unlocked), 0 };
  mutex.unlock(c);
}
18107 
18108 void
startLcpMutex_unlocked(Signal * signal,Uint32 data,Uint32 retVal)18109 Dbdih::startLcpMutex_unlocked(Signal* signal, Uint32 data, Uint32 retVal){
18110   jamEntry();
18111   ndbrequire(retVal == 0);
18112 
18113   Mutex mutex(signal, c_mutexMgr, c_startLcpMutexHandle);
18114   mutex.release();
18115 
18116   /* ----------------------------------------------------------------------- */
18117   /*     NOW PROCEED BY STARTING THE LOCAL CHECKPOINT IN EACH LQH.           */
18118   /* ----------------------------------------------------------------------- */
18119   c_lcpState.m_LAST_LCP_FRAG_ORD = c_lcpState.m_participatingLQH;
18120 
18121   c_lcp_runs_with_pause_support = check_if_pause_lcp_possible();
18122   if (c_lcp_runs_with_pause_support)
18123   {
18124     jam();
18125     /**
18126      * We can release the mutex now that we have started the LCP. Since we
18127      * hold the mutex we know that currently no copy of meta data is ongoing.
18128      * We have setup everything for the LCP to start we reach this call, so it
18129      * is safe to release the mutex and rely on the PAUSE LCP protocol to
18130      * handle the rest.
18131      *
18132      * We have held the fragment info mutex long enough to ensure that we have
18133      * copied the m_participatingDIH bitmap to all participants in the LCP.
18134      * This means that when we reach the participant nodes we can safely add
18135      * the starting node to m_participatingDIH to ensure that the starting
18136      * node also gets all the rest of the updates to the LCP data in DIH
18137      * while the LCP is completing. This phase of the LCP is fairly quick, so
18138      * the cost of holding the mutex here should be fairly small. The part of
18139      * the LCP that consumes most time is when we start performing the real
18140      * checkpointing on the m_participatingLQH nodes.
18141      */
18142     Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
18143     mutex.unlock();
18144   }
18145   CRASH_INSERTION(7015);
18146   c_lcpState.setLcpStatus(LCP_START_LCP_ROUND, __LINE__);
18147   startLcpRoundLoopLab(signal, 0, 0);
18148 }
18149 
/**
 * Callback invoked when the new master has acquired the fragment
 * mutex during an LCP master take-over.
 *
 * Reports the completed take-over, resets the take-over state machine
 * to idle, finishes the LCP part of the local node-failure handling
 * for the failed node, and resumes the LCP fragment scheduling loop
 * from the beginning.
 */
void
Dbdih::master_lcp_fragmentMutex_locked(Signal* signal,
                                       Uint32 failedNodePtrI, Uint32 retVal)
{
  jamEntry();
  ndbrequire(retVal == 0);

  /* Report that the LCP master take-over is complete. */
  signal->theData[0] = NDB_LE_LCP_TakeoverCompleted;
  signal->theData[1] = c_lcpMasterTakeOverState.state;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);

  /* Dump LCP state (DUMP code 7012) for diagnostics. */
  signal->theData[0] = 7012;
  execDUMP_STATE_ORD(signal);

  c_lcpMasterTakeOverState.set(LMTOS_IDLE, __LINE__);

  /* LCP take-over part of the node failure handling is now done. */
  checkLocalNodefailComplete(signal, failedNodePtrI, NF_LCP_TAKE_OVER);

  /* Restart the fragment checkpoint scheduling from table 0, frag 0. */
  startLcpRoundLoopLab(signal, 0, 0);
}
18170 
startLcpRoundLoopLab(Signal * signal,Uint32 startTableId,Uint32 startFragId)18171 void Dbdih::startLcpRoundLoopLab(Signal* signal,
18172 				 Uint32 startTableId, Uint32 startFragId)
18173 {
18174   NodeRecordPtr nodePtr;
18175   for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
18176     ptrAss(nodePtr, nodeRecord);
18177     if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
18178       ndbrequire(nodePtr.p->noOfStartedChkpt == 0);
18179       ndbrequire(nodePtr.p->noOfQueuedChkpt == 0);
18180     }//if
18181   }//if
18182   c_lcpState.currentFragment.tableId = startTableId;
18183   c_lcpState.currentFragment.fragmentId = startFragId;
18184   startNextChkpt(signal);
18185 }//Dbdih::startLcpRoundLoopLab()
18186 
/**
 * Master scheduling loop for fragment checkpoints.
 *
 * Walks tables/fragments from c_lcpState.currentFragment onwards and,
 * for each stored replica on a participating LQH node whose LCP has
 * not yet been started in this round, either:
 *  - sends LCP_FRAG_ORD immediately (if the node is below its limit of
 *    concurrently started fragment checkpoints),
 *  - queues the order on the node (if the queue has room), or
 *  - marks the node busy; when ALL participating nodes are busy the
 *    loop returns early and resumes later from the first "full"
 *    position (saved in c_lcpState.currentFragment).
 *
 * When the scan reaches the end of all tables, the concluding
 * LCP_FRAG_ORD (lastFragmentFlag) is sent to idle nodes.
 */
void Dbdih::startNextChkpt(Signal* signal)
{
  Uint32 lcpId = SYSFILE->latestLCP_ID;

  NdbNodeBitmask busyNodes;
  busyNodes.clear();
  const Uint32 lcpNodes = c_lcpState.m_participatingLQH.count();

  /* save == true until we hit the first node that was "full"; at that
   * point the scan position is recorded so we can resume there. */
  bool save = true;
  LcpState::CurrentFragment curr = c_lcpState.currentFragment;

  while (curr.tableId < ctabFileSize) {
    TabRecordPtr tabPtr;
    tabPtr.i = curr.tableId;
    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
    if ((tabPtr.p->tabStatus != TabRecord::TS_ACTIVE) ||
        (tabPtr.p->tabLcpStatus != TabRecord::TLS_ACTIVE)) {
      /* Table not participating in this LCP: skip to the next table. */
      curr.tableId++;
      curr.fragmentId = 0;
      continue;
    }//if

    FragmentstorePtr fragPtr;
    getFragstore(tabPtr.p, curr.fragmentId, fragPtr);

    /* Consider every stored replica of this fragment. */
    ReplicaRecordPtr replicaPtr;
    for(replicaPtr.i = fragPtr.p->storedReplicas;
        replicaPtr.i != RNIL ;
        replicaPtr.i = replicaPtr.p->nextPool){

      jam();
      c_replicaRecordPool.getPtr(replicaPtr);

      NodeRecordPtr nodePtr;
      nodePtr.i = replicaPtr.p->procNode;
      ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);

      if (c_lcpState.m_participatingLQH.get(nodePtr.i))
      {
        if (replicaPtr.p->lcpOngoingFlag &&
            replicaPtr.p->lcpIdStarted < lcpId)
        {
          jam();
          //-------------------------------------------------------------------
          // We have found a replica on a node that performs local checkpoint
          // that is alive and that have not yet been started.
          //-------------------------------------------------------------------

          if (nodePtr.p->noOfStartedChkpt <
              getMaxStartedFragCheckpointsForNode(nodePtr.i))
          {
            jam();
            /**
             * Send LCP_FRAG_ORD to LQH
             */

            /**
             * Mark the replica as started in this LCP round
             * (lcpIdStarted = lcpId).
             */
            replicaPtr.p->lcpIdStarted = lcpId;

            /* Record the started checkpoint in the node's table. */
            Uint32 i = nodePtr.p->noOfStartedChkpt;
            nodePtr.p->startedChkpt[i].tableId = tabPtr.i;
            nodePtr.p->startedChkpt[i].fragId = curr.fragmentId;
            nodePtr.p->startedChkpt[i].replicaPtr = replicaPtr.i;
            nodePtr.p->noOfStartedChkpt = i + 1;

            sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[i]);
          }
          else if (nodePtr.p->noOfQueuedChkpt <
                   MAX_QUEUED_FRAG_CHECKPOINTS_PER_NODE)
          {
            jam();
            /**
             * Put LCP_FRAG_ORD "in queue"
             */

            /**
             * Mark the replica as started in this LCP round
             * (lcpIdStarted = lcpId).
             */
            replicaPtr.p->lcpIdStarted = lcpId;

            /* Queue the order; it is sent when a started one completes. */
            Uint32 i = nodePtr.p->noOfQueuedChkpt;
            nodePtr.p->queuedChkpt[i].tableId = tabPtr.i;
            nodePtr.p->queuedChkpt[i].fragId = curr.fragmentId;
            nodePtr.p->queuedChkpt[i].replicaPtr = replicaPtr.i;
            nodePtr.p->noOfQueuedChkpt = i + 1;
          }
          else
          {
            jam();

            if(save)
            {
              /**
               * Stop increasing value on first that was "full"
               */
              c_lcpState.currentFragment = curr;
              save = false;
            }

            busyNodes.set(nodePtr.i);
            if(busyNodes.count() == lcpNodes)
            {
              /**
               * There were no possibility to start the local checkpoint
               * and it was not possible to queue it up. In this case we
               * stop the start of local checkpoints until the nodes with a
               * backlog have performed more checkpoints. We will return and
               * will not continue the process of starting any more checkpoints.
               */
              return;
            }//if
          }//if
        }
      }//if participating
    }//for replicas
    curr.fragmentId++;
    if (curr.fragmentId >= tabPtr.p->totalfragments) {
      jam();
      curr.fragmentId = 0;
      curr.tableId++;
    }//if
  }//while

  /* Whole scan completed: conclude the LCP on idle nodes. */
  sendLastLCP_FRAG_ORD(signal);
}//Dbdih::startNextChkpt()
18314 
sendLastLCP_FRAG_ORD(Signal * signal)18315 void Dbdih::sendLastLCP_FRAG_ORD(Signal* signal)
18316 {
18317   LcpFragOrd * const lcpFragOrd = (LcpFragOrd *)&signal->theData[0];
18318   lcpFragOrd->tableId = RNIL;
18319   lcpFragOrd->fragmentId = 0;
18320   lcpFragOrd->lcpId = SYSFILE->latestLCP_ID;
18321   lcpFragOrd->lcpNo = 0;
18322   lcpFragOrd->keepGci = c_lcpState.keepGci;
18323   lcpFragOrd->lastFragmentFlag = true;
18324 
18325   NodeRecordPtr nodePtr;
18326   for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
18327     jam();
18328     ptrAss(nodePtr, nodeRecord);
18329 
18330     if(nodePtr.p->noOfQueuedChkpt == 0 &&
18331        nodePtr.p->noOfStartedChkpt == 0 &&
18332        c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodePtr.i)){
18333       jam();
18334 
18335       CRASH_INSERTION(7028);
18336 
18337       /**
18338        * Nothing queued or started <=> Complete on that node
18339        *
18340        */
18341       c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(nodePtr.i);
18342       if(ERROR_INSERTED(7075)){
18343 	continue;
18344       }
18345 
18346       CRASH_INSERTION(7193);
18347       BlockReference ref = calcLqhBlockRef(nodePtr.i);
18348       sendSignal(ref, GSN_LCP_FRAG_ORD, signal,LcpFragOrd::SignalLength, JBB);
18349     }
18350   }
18351   if(ERROR_INSERTED(7075))
18352   {
18353     if(c_lcpState.m_LAST_LCP_FRAG_ORD.done())
18354     {
18355       CRASH_INSERTION(7075);
18356     }
18357   }
18358 }//Dbdih::sendLastLCP_FRAGORD()
18359 
18360 /* ------------------------------------------------------------------------- */
18361 /*       A FRAGMENT REPLICA HAS COMPLETED EXECUTING ITS LOCAL CHECKPOINT.    */
18362 /*       CHECK IF ALL REPLICAS IN THE TABLE HAVE COMPLETED. IF SO STORE THE  */
18363 /*       THE TABLE DISTRIBUTION ON DISK. ALSO SEND LCP_REPORT TO ALL OTHER   */
18364 /*       NODES SO THAT THEY CAN STORE THE TABLE ONTO DISK AS WELL.           */
18365 /* ------------------------------------------------------------------------- */
/**
 * LCP_FRAG_REP: a fragment replica has completed its part of the LCP.
 *
 * Handles:
 *  - BROADCAST_REQ from the local LQH: fan the report out to all other
 *    participating DIHs (or queue it while the LCP is paused), then
 *    process it locally.
 *  - Reports re-sent via the time queue (fromTQ) while a table was
 *    being copied to disk; the sending node may have died meanwhile.
 *  - Bookkeeping: record the completion on the replica, possibly write
 *    the table description to file when a whole table is done, and (on
 *    the master) remove the fragment from the node's started list and
 *    start more checkpoints.
 */
void Dbdih::execLCP_FRAG_REP(Signal* signal)
{
  jamEntry();

  LcpFragRep * lcpReport = (LcpFragRep *)&signal->theData[0];

  /**
   * Proxying LCP_FRAG_REP
   */
  const bool broadcast_req = lcpReport->nodeId == LcpFragRep::BROADCAST_REQ;
  if (broadcast_req)
  {
    jam();
    /* Broadcast requests may only come from our own node's LQH. */
    ndbrequire(refToNode(signal->getSendersBlockRef()) == getOwnNodeId());

    /**
     * Set correct nodeId
     */
    lcpReport->nodeId = getOwnNodeId();

    if (is_lcp_paused() || c_dequeue_lcp_rep_ongoing)
    {
      jam();
      /**
       * We are currently pausing sending all information about LCP_FRAG_REP
       * from this node and also pausing any local processing of signals
       * received from LQH. We can still handle messages from other DIH
       * nodes. These will eventually stop due to pausing and we will wait
       * until we know that all those signals have arrived at their
       * destination.
       *
       * We won't send anything until we have completed the
       * PAUSE_LCP_REQ protocol which means until the starting node have
       * received all the meta data from the master node.
       */
      queue_lcp_frag_rep(signal, lcpReport);
      return;
    }
    /* Fan out to all other participating DIH nodes. */
    NodeReceiverGroup rg(DBDIH, c_lcpState.m_participatingDIH);
    rg.m_nodes.clear(getOwnNodeId());
    sendSignal(rg, GSN_LCP_FRAG_REP, signal, signal->getLength(), JBB);

    /**
     * and continue processing
     */
  }

  Uint32 nodeId = lcpReport->nodeId;
  Uint32 tableId = lcpReport->tableId;
  Uint32 fragId = lcpReport->fragId;

  /**
   * We can receive LCP_FRAG_REP in 2 different situations:
   * 1) signal->length() == SignalLength
   * A normal report of completion of a LCP on a specific fragment. This
   * cannot arrive when the node is down, the sending must be in
   * the m_participatingLQH set, in addition the node must be alive
   * in the DIH sense which means that it has passed the state where it
   * is included in all the LCP protocols and GCP protocols.
   *
   * 2) signal->length() == SignalLengthTQ && lcpReport->fromTQ == 1
   * This signal is sent when the table is in copy state when a signal
   * in 1) is received. In this case the node could die before we
   * arrive here. We check this by simply checking if the node is still
   * alive. If this happens we can simply drop the signal.
   */
  if (!checkNodeAlive(nodeId))
  {
    jam();
    ndbrequire(signal->length() == LcpFragRep::SignalLengthTQ &&
               lcpReport->fromTQ == Uint32(1));
    /**
     * Given that we can delay this signal during a table copy situation,
     * we can actually receive this signal when the node is already dead. If
     * the node is dead then we drop the signal as soon as possible, the node
     * failure handling will ensure that the node is properly handled anyways.
     */
    return;
  }

  /* A report can only arrive while an LCP is in progress. */
  ndbrequire(c_lcpState.lcpStatus != LCP_STATUS_IDLE);

#if 0
  printLCP_FRAG_REP(stdout,
		    signal->getDataPtr(),
		    signal->length(), number());
#endif

  jamEntry();

  /* Error insert 7178: kill a node in our own node group that sent a
   * report, to exercise LCP handling of node failure. */
  if (ERROR_INSERTED(7178) && nodeId != getOwnNodeId())
  {
    jam();
    Uint32 owng =Sysfile::getNodeGroup(getOwnNodeId(), SYSFILE->nodeGroups);
    Uint32 nodeg = Sysfile::getNodeGroup(nodeId, SYSFILE->nodeGroups);
    if (owng == nodeg)
    {
      jam();
      ndbout_c("throwing away LCP_FRAG_REP from  (and killing) %d", nodeId);
      SET_ERROR_INSERT_VALUE(7179);
      signal->theData[0] = 9999;
      sendSignal(numberToRef(CMVMI, nodeId),
		 GSN_NDB_TAMPER, signal, 1, JBA);
      return;
    }
  }

  /* Error insert 7179: silently drop reports from our node group. */
  if (ERROR_INSERTED(7179) && nodeId != getOwnNodeId())
  {
    jam();
    Uint32 owng =Sysfile::getNodeGroup(getOwnNodeId(), SYSFILE->nodeGroups);
    Uint32 nodeg = Sysfile::getNodeGroup(nodeId, SYSFILE->nodeGroups);
    if (owng == nodeg)
    {
      jam();
      ndbout_c("throwing away LCP_FRAG_REP from %d", nodeId);
      return;
    }
  }

  CRASH_INSERTION2(7025, isMaster());
  CRASH_INSERTION2(7016, !isMaster());
  CRASH_INSERTION2(7191, (!isMaster() && tableId));

  /* True when this is a re-delivery of a report we deferred below
   * because the table was being copied to disk. */
  bool fromTimeQueue = (signal->length() == LcpFragRep::SignalLengthTQ &&
                        lcpReport->fromTQ == Uint32(1) &&
                        !broadcast_req);

  TabRecordPtr tabPtr;
  tabPtr.i = tableId;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  if(tabPtr.p->tabCopyStatus != TabRecord::CS_IDLE) {
    jam();
    /*-----------------------------------------------------------------------*/
    // If the table is currently copied to disk we also
    // stop already here to avoid strange half-way updates
    // of the table data structures.
    /*-----------------------------------------------------------------------*/
    /*
      We need to send this signal without a delay since we have discovered
      that we have run out of space in the short time queue. This problem
      is very unlikely to happen but it has and it results in a node crash.
      This should be considered a "quick fix" and not a permanent solution.
      A cleaner/better way would be to check the time queue if it is full or
      not before sending this signal.
    */
    lcpReport->fromTQ = Uint32(1);
    sendSignal(reference(), GSN_LCP_FRAG_REP, signal,
               LcpFragRep::SignalLengthTQ, JBB);
    /* Kept here for reference
       sendSignalWithDelay(reference(), GSN_LCP_FRAG_REP,
       signal, 20, signal->length());
    */

    /* Count the deferred report as outstanding exactly once. */
    if(!fromTimeQueue){
      c_lcpState.noOfLcpFragRepOutstanding++;
    }

    return;
  }//if

  if(fromTimeQueue)
  {
    jam();
    /* A previously deferred report is now being processed. */
    ndbrequire(c_lcpState.noOfLcpFragRepOutstanding > 0);
    c_lcpState.noOfLcpFragRepOutstanding--;
  }

  /* Record the completion; true when the whole table is checkpointed. */
  bool tableDone = reportLcpCompletion(lcpReport);

  Uint32 started = lcpReport->maxGciStarted;
#ifdef VM_TRACE
  Uint32 completed = lcpReport->maxGciCompleted;
#endif

  /* Track the highest GCI started by any checkpointed fragment. */
  if (started > c_lcpState.lcpStopGcp)
  {
    jam();
    c_lcpState.lcpStopGcp = started;
  }

  /**
   * Update m_local_lcp_state
   *
   * we could only look fragments that we have locally...
   *   but for now we look at all fragments
   */
  m_local_lcp_state.lcp_frag_rep(lcpReport);

  if (tableDone)
  {
    jam();

    if (tabPtr.p->tabStatus == TabRecord::TS_IDLE ||
        tabPtr.p->tabStatus == TabRecord::TS_DROPPING)
    {
      jam();
      /* Table is being (or has been) dropped: no need to save it. */
      g_eventLogger->info("TS_DROPPING - Neglecting to save Table: %d Frag: %d - ",
                          tableId, fragId);
    }
    else
    {
      jam();
      /**
       * Write table description to file
       */
      tabPtr.p->tabLcpStatus = TabRecord::TLS_WRITING_TO_FILE;
      tabPtr.p->tabCopyStatus = TabRecord::CS_LCP_READ_TABLE;

      /**
       * Check whether we should write immediately, or queue...
       */
      if (c_lcpTabDefWritesControl.requestMustQueue())
      {
        jam();
        //ndbout_c("DIH : Queueing tab def flush op on table %u", tabPtr.i);
        /* Mark as queued - will be started when an already running op completes */
        tabPtr.p->tabUpdateState = TabRecord::US_LOCAL_CHECKPOINT_QUEUED;
      }
      else
      {
        /* Run immediately */
        jam();
        tabPtr.p->tabUpdateState = TabRecord::US_LOCAL_CHECKPOINT;
        signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
        signal->theData[1] = tabPtr.i;
        sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
      }

      bool ret = checkLcpAllTablesDoneInLqh(__LINE__);
      if (ret && ERROR_INSERTED(7209))
      {
        jam();
        /* Error insert 7209: kill the master when the last table is done. */
        CLEAR_ERROR_INSERT_VALUE;
        signal->theData[0] = 9999;
        sendSignal(numberToRef(CMVMI, cmasterNodeId),
                   GSN_NDB_TAMPER, signal, 1, JBB);
      }
    }
  }

#ifdef VM_TRACE
  /* --------------------------------------------------------------------- */
  // REPORT that local checkpoint have completed this fragment.
  /* --------------------------------------------------------------------- */
  signal->theData[0] = NDB_LE_LCPFragmentCompleted;
  signal->theData[1] = nodeId;
  signal->theData[2] = tableId;
  signal->theData[3] = fragId;
  signal->theData[4] = started;
  signal->theData[5] = completed;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 6, JBB);
#endif

  /* During a master take-over only the LMTOS_IDLE state continues with
   * the normal master bookkeeping below. */
  bool ok = false;
  switch(c_lcpMasterTakeOverState.state){
  case LMTOS_IDLE:
    ok = true;
    jam();
    /**
     * Fall through
     */
    break;
  case LMTOS_WAIT_EMPTY_LCP: // LCP Take over waiting for EMPTY_LCPCONF
    jam();
    return;
  case LMTOS_WAIT_LCP_FRAG_REP:
    jam();
    checkEmptyLcpComplete(signal);
    return;
  case LMTOS_INITIAL:
  case LMTOS_ALL_IDLE:
  case LMTOS_ALL_ACTIVE:
  case LMTOS_LCP_CONCLUDING:
  case LMTOS_COPY_ONGOING:
    /**
     * In the old code we ensured that all outstanding LCP_FRAG_REPs
     * were handled before entering those states. So receiving an
     * LCP_FRAG_REP is ok in new code, even in new code will block
     * LCP_COMPLETE_REP such that we don't complete an LCP while
     * processing a master take over. But we can still receive
     * LCP_FRAG_REP while processing a master takeover.
     *
     * In old code we were blocked from coming here for LCP_FRAG_REPs since
     * we enusred that we don't proceed here until all nodes have sent
     * their EMPTY_LCP_CONF to us. So we keep ndbrequire to ensure that
     * we come here only when running the new master take over code.
     */
    ndbrequire(!c_lcpMasterTakeOverState.use_empty_lcp);
    return;
  }
  ndbrequire(ok);

  /* ----------------------------------------------------------------------- */
  // Check if there are more LCP's to start up.
  /* ----------------------------------------------------------------------- */
  if(isMaster())
  {
    jam();

    /**
     * Remove from "running" array
     */
    NodeRecordPtr nodePtr;
    nodePtr.i = nodeId;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);

    const Uint32 outstanding = nodePtr.p->noOfStartedChkpt;
    ndbrequire(outstanding > 0);
    bool found = false;
    /* Find the matching entry and compact the array by shifting the
     * remaining entries one step down. */
    for (Uint32 i = 0; i < outstanding; i++)
    {
      if (found)
      {
        jam();
        nodePtr.p->startedChkpt[i - 1] = nodePtr.p->startedChkpt[i];
        continue;
      }
      if(nodePtr.p->startedChkpt[i].tableId != tableId ||
         nodePtr.p->startedChkpt[i].fragId != fragId)
      {
        jam();
        continue;
      }
      jam();
      found = true;
    }
    ndbrequire(found);
    nodePtr.p->noOfStartedChkpt--;
    /* The node has a free slot: start or dequeue more checkpoints. */
    checkStartMoreLcp(signal, nodeId);
  }
}
18698 
18699 bool
checkLcpAllTablesDoneInLqh(Uint32 line)18700 Dbdih::checkLcpAllTablesDoneInLqh(Uint32 line){
18701   TabRecordPtr tabPtr;
18702 
18703   /**
18704    * Check if finished with all tables
18705    */
18706   for (tabPtr.i = 0; tabPtr.i < ctabFileSize; tabPtr.i++) {
18707     //jam(); Removed as it flushed all other jam traces.
18708     ptrAss(tabPtr, tabRecord);
18709     if ((tabPtr.p->tabStatus == TabRecord::TS_ACTIVE) &&
18710         (tabPtr.p->tabLcpStatus == TabRecord::TLS_ACTIVE))
18711     {
18712       jam();
18713       /**
18714        * Nope, not finished with all tables
18715        */
18716       return false;
18717     }//if
18718   }//for
18719 
18720   CRASH_INSERTION2(7026, isMaster());
18721   CRASH_INSERTION2(7017, !isMaster());
18722 
18723   c_lcpState.setLcpStatus(LCP_TAB_COMPLETED, line);
18724 
18725   if (ERROR_INSERTED(7194))
18726   {
18727     ndbout_c("CLEARING 7194");
18728     CLEAR_ERROR_INSERT_VALUE;
18729   }
18730 
18731   return true;
18732 }
18733 
findReplica(ReplicaRecordPtr & replicaPtr,Fragmentstore * fragPtrP,Uint32 nodeId,bool old)18734 void Dbdih::findReplica(ReplicaRecordPtr& replicaPtr,
18735 			Fragmentstore* fragPtrP,
18736 			Uint32 nodeId,
18737 			bool old)
18738 {
18739   replicaPtr.i = old ? fragPtrP->oldStoredReplicas : fragPtrP->storedReplicas;
18740   while(replicaPtr.i != RNIL){
18741     c_replicaRecordPool.getPtr(replicaPtr);
18742     if (replicaPtr.p->procNode == nodeId) {
18743       jam();
18744       return;
18745     } else {
18746       jam();
18747       replicaPtr.i = replicaPtr.p->nextPool;
18748     }//if
18749   };
18750 
18751 #ifdef VM_TRACE
18752   g_eventLogger->info("Fragment Replica(node=%d) not found", nodeId);
18753   replicaPtr.i = fragPtrP->oldStoredReplicas;
18754   while(replicaPtr.i != RNIL){
18755     c_replicaRecordPool.getPtr(replicaPtr);
18756     if (replicaPtr.p->procNode == nodeId) {
18757       jam();
18758       break;
18759     } else {
18760       jam();
18761       replicaPtr.i = replicaPtr.p->nextPool;
18762     }//if
18763   };
18764   if(replicaPtr.i != RNIL){
18765     g_eventLogger->info("...But was found in oldStoredReplicas");
18766   } else {
18767     g_eventLogger->info("...And wasn't found in oldStoredReplicas");
18768   }
18769 #endif
18770   ndbrequire(false);
18771 }//Dbdih::findReplica()
18772 
18773 
18774 int
handle_invalid_lcp_no(const LcpFragRep * rep,ReplicaRecordPtr replicaPtr)18775 Dbdih::handle_invalid_lcp_no(const LcpFragRep* rep,
18776 			     ReplicaRecordPtr replicaPtr)
18777 {
18778   ndbrequire(!isMaster());
18779   Uint32 lcpNo = rep->lcpNo;
18780   Uint32 lcpId = rep->lcpId;
18781 
18782   if (!ndb_pnr(getNodeInfo(refToNode(cmasterdihref)).m_version))
18783   {
18784   }
18785   else
18786   {
18787     warningEvent("Detected previous node failure of %d during lcp",
18788                  rep->nodeId);
18789   }
18790 
18791   replicaPtr.p->nextLcp = lcpNo;
18792   replicaPtr.p->lcpId[lcpNo] = 0;
18793   replicaPtr.p->lcpStatus[lcpNo] = ZINVALID;
18794 
18795   for (Uint32 i = lcpNo; i != lcpNo; i = nextLcpNo(i))
18796   {
18797     jam();
18798     if (replicaPtr.p->lcpStatus[i] == ZVALID &&
18799 	replicaPtr.p->lcpId[i] >= lcpId)
18800     {
18801       ndbout_c("i: %d lcpId: %d", i, replicaPtr.p->lcpId[i]);
18802       ndbrequire(false);
18803     }
18804   }
18805 
18806   return 0;
18807 }
18808 
18809 /**
18810  * Return true  if table is all fragment replicas have been checkpointed
18811  *                 to disk (in all LQHs)
18812  *        false otherwise
18813  */
18814 bool
reportLcpCompletion(const LcpFragRep * lcpReport)18815 Dbdih::reportLcpCompletion(const LcpFragRep* lcpReport)
18816 {
18817   Uint32 lcpNo = lcpReport->lcpNo;
18818   Uint32 lcpId = lcpReport->lcpId;
18819   Uint32 maxGciStarted = lcpReport->maxGciStarted;
18820   Uint32 maxGciCompleted = lcpReport->maxGciCompleted;
18821   Uint32 tableId = lcpReport->tableId;
18822   Uint32 fragId = lcpReport->fragId;
18823   Uint32 nodeId = lcpReport->nodeId;
18824 
18825   TabRecordPtr tabPtr;
18826   tabPtr.i = tableId;
18827   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
18828 
18829   if (tabPtr.p->tabStatus == TabRecord::TS_DROPPING ||
18830       tabPtr.p->tabStatus == TabRecord::TS_IDLE)
18831   {
18832     jam();
18833     return true;
18834   }
18835 
18836   FragmentstorePtr fragPtr;
18837   getFragstore(tabPtr.p, fragId, fragPtr);
18838 
18839   ReplicaRecordPtr replicaPtr;
18840   findReplica(replicaPtr, fragPtr.p, nodeId);
18841 
18842   ndbrequire(replicaPtr.p->lcpOngoingFlag == true);
18843   if(lcpNo != replicaPtr.p->nextLcp){
18844     if (handle_invalid_lcp_no(lcpReport, replicaPtr))
18845     {
18846       g_eventLogger->error("lcpNo = %d replicaPtr.p->nextLcp = %d",
18847                            lcpNo, replicaPtr.p->nextLcp);
18848       ndbrequire(false);
18849     }
18850   }
18851   ndbrequire(lcpNo == replicaPtr.p->nextLcp);
18852   ndbrequire(lcpNo < MAX_LCP_STORED);
18853   ndbrequire(replicaPtr.p->lcpId[lcpNo] != lcpId);
18854 
18855   replicaPtr.p->lcpIdStarted = lcpId;
18856   replicaPtr.p->lcpOngoingFlag = false;
18857 
18858   removeOldCrashedReplicas(tableId, fragId, replicaPtr);
18859   replicaPtr.p->lcpId[lcpNo] = lcpId;
18860   replicaPtr.p->lcpStatus[lcpNo] = ZVALID;
18861   replicaPtr.p->maxGciStarted[lcpNo] = maxGciStarted;
18862   replicaPtr.p->maxGciCompleted[lcpNo] = maxGciCompleted;
18863   replicaPtr.p->nextLcp = nextLcpNo(replicaPtr.p->nextLcp);
18864   ndbrequire(fragPtr.p->noLcpReplicas > 0);
18865   fragPtr.p->noLcpReplicas --;
18866 
18867   if(fragPtr.p->noLcpReplicas > 0){
18868     jam();
18869     return false;
18870   }
18871 
18872   for (Uint32 fid = 0; fid < tabPtr.p->totalfragments; fid++) {
18873     jam();
18874     getFragstore(tabPtr.p, fid, fragPtr);
18875     if (fragPtr.p->noLcpReplicas > 0){
18876       jam();
18877       /* ----------------------------------------------------------------- */
18878       // Not all fragments in table have been checkpointed.
18879       /* ----------------------------------------------------------------- */
18880       if(0)
18881         g_eventLogger->info("reportLcpCompletion: fragment %d not ready", fid);
18882       return false;
18883     }//if
18884   }//for
18885   return true;
18886 }//Dbdih::reportLcpCompletion()
18887 
checkStartMoreLcp(Signal * signal,Uint32 nodeId)18888 void Dbdih::checkStartMoreLcp(Signal* signal, Uint32 nodeId)
18889 {
18890   ndbrequire(isMaster());
18891 
18892   NodeRecordPtr nodePtr;
18893   nodePtr.i = nodeId;
18894   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
18895 
18896   ndbrequire(nodePtr.p->noOfStartedChkpt <
18897              getMaxStartedFragCheckpointsForNode(nodePtr.i));
18898 
18899   if (nodePtr.p->noOfQueuedChkpt > 0) {
18900     jam();
18901     Uint32 startIndex = nodePtr.p->noOfStartedChkpt;
18902     nodePtr.p->startedChkpt[startIndex] = nodePtr.p->queuedChkpt[0];
18903     for (Uint32 i = 1; i < nodePtr.p->noOfQueuedChkpt; i++)
18904     {
18905       nodePtr.p->queuedChkpt[i - 1] = nodePtr.p->queuedChkpt[i];
18906     }
18907     nodePtr.p->noOfQueuedChkpt--;
18908     nodePtr.p->noOfStartedChkpt++;
18909     //-------------------------------------------------------------------
18910     // We can send a LCP_FRAG_ORD to the node ordering it to perform a
18911     // local checkpoint on this fragment replica.
18912     //-------------------------------------------------------------------
18913 
18914     sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[startIndex]);
18915   }
18916 
18917   /* ----------------------------------------------------------------------- */
18918   // When there are no more outstanding LCP reports and there are no one queued
18919   // in at least one node, then we are ready to make sure all nodes have at
18920   // least two outstanding LCP requests per node and at least two queued for
18921   // sending.
18922   /* ----------------------------------------------------------------------- */
18923   startNextChkpt(signal);
18924 }//Dbdih::checkStartMoreLcp()
18925 
18926 void
sendLCP_FRAG_ORD(Signal * signal,NodeRecord::FragmentCheckpointInfo info)18927 Dbdih::sendLCP_FRAG_ORD(Signal* signal,
18928 			NodeRecord::FragmentCheckpointInfo info){
18929 
18930   ReplicaRecordPtr replicaPtr;
18931   replicaPtr.i = info.replicaPtr;
18932   c_replicaRecordPool.getPtr(replicaPtr);
18933 
18934   // MT LQH goes via proxy for DD reasons
18935   BlockReference ref = calcLqhBlockRef(replicaPtr.p->procNode);
18936 
18937   if (ERROR_INSERTED(7193) && replicaPtr.p->procNode == getOwnNodeId())
18938   {
18939     return;
18940   }
18941 
18942   if (replicaPtr.p->nextLcp >= MAX_LCP_USED)
18943   {
18944     jam();
18945     infoEvent("Updating nextLcp from %u to %u tab: %u",
18946               replicaPtr.p->nextLcp, 0,
18947               info.tableId);
18948     replicaPtr.p->nextLcp = 0;
18949   }
18950 
18951   Uint32 keepGci = c_lcpState.keepGci;
18952   if (keepGci > SYSFILE->lastCompletedGCI[replicaPtr.p->procNode])
18953   {
18954     jam();
18955     keepGci = SYSFILE->lastCompletedGCI[replicaPtr.p->procNode];
18956   }
18957 
18958   LcpFragOrd * const lcpFragOrd = (LcpFragOrd *)&signal->theData[0];
18959   lcpFragOrd->tableId    = info.tableId;
18960   lcpFragOrd->fragmentId = info.fragId;
18961   lcpFragOrd->lcpId      = SYSFILE->latestLCP_ID;
18962   lcpFragOrd->lcpNo      = replicaPtr.p->nextLcp;
18963   lcpFragOrd->keepGci    = keepGci;
18964   lcpFragOrd->lastFragmentFlag = false;
18965   sendSignal(ref, GSN_LCP_FRAG_ORD, signal, LcpFragOrd::SignalLength, JBB);
18966 }
18967 
checkLcpCompletedLab(Signal * signal)18968 void Dbdih::checkLcpCompletedLab(Signal* signal)
18969 {
18970   if(c_lcpState.lcpStatus < LCP_TAB_COMPLETED)
18971   {
18972     jam();
18973     return;
18974   }
18975 
18976   TabRecordPtr tabPtr;
18977   for (tabPtr.i = 0; tabPtr.i < ctabFileSize; tabPtr.i++) {
18978     //jam(); Removed as it flushed all other jam traces.
18979     ptrAss(tabPtr, tabRecord);
18980     if (tabPtr.p->tabLcpStatus != TabRecord::TLS_COMPLETED)
18981     {
18982       jam();
18983       return;
18984     }
18985   }
18986 
18987   CRASH_INSERTION2(7027, isMaster());
18988   CRASH_INSERTION2(7018, !isMaster());
18989 
18990   if(c_lcpState.lcpStatus == LCP_TAB_COMPLETED)
18991   {
18992     /**
18993      * We'r done
18994      */
18995 
18996     c_lcpState.setLcpStatus(LCP_TAB_SAVED, __LINE__);
18997     sendLCP_COMPLETE_REP(signal);
18998     return;
18999   }
19000 
19001   ndbrequire(c_lcpState.lcpStatus == LCP_TAB_SAVED);
19002   allNodesLcpCompletedLab(signal);
19003   return;
19004 }//Dbdih::checkLcpCompletedLab()
19005 
19006 void
sendLCP_COMPLETE_REP(Signal * signal)19007 Dbdih::sendLCP_COMPLETE_REP(Signal* signal){
19008   jam();
19009 
19010   /**
19011    * Quick and dirty fix for bug#36276 dont save
19012    * LCP_COMPLETE_REP to same node same LCP twice
19013    */
19014   bool alreadysent =
19015     c_lcpState.m_lastLCP_COMPLETE_REP_id == SYSFILE->latestLCP_ID &&
19016     c_lcpState.m_lastLCP_COMPLETE_REP_ref == c_lcpState.m_masterLcpDihRef;
19017 
19018   if (!alreadysent)
19019   {
19020     LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
19021     rep->nodeId = getOwnNodeId();
19022     rep->lcpId = SYSFILE->latestLCP_ID;
19023     rep->blockNo = DBDIH;
19024 
19025     sendSignal(c_lcpState.m_masterLcpDihRef, GSN_LCP_COMPLETE_REP, signal,
19026                LcpCompleteRep::SignalLength, JBB);
19027 
19028     c_lcpState.m_lastLCP_COMPLETE_REP_id = SYSFILE->latestLCP_ID;
19029     c_lcpState.m_lastLCP_COMPLETE_REP_ref = c_lcpState.m_masterLcpDihRef;
19030   }
19031 
19032   /**
19033    * Say that an initial node restart does not need to be redone
19034    *   once node has been part of first LCP
19035    */
19036   if (c_set_initial_start_flag &&
19037       c_lcpState.m_participatingLQH.get(getOwnNodeId()))
19038   {
19039     jam();
19040     c_set_initial_start_flag = FALSE;
19041   }
19042 }
19043 
19044 /*-------------------------------------------------------------------------- */
19045 /* COMP_LCP_ROUND                   A LQH HAS COMPLETED A LOCAL CHECKPOINT  */
19046 /*------------------------------------------------------------------------- */
/**
 * LCP_COMPLETE_REP arrives from the local DBLQH (as a BROADCAST_REQ that
 * we re-broadcast to all participating DIHs), from other DIH nodes
 * (blockNo == DBDIH), or from the master (blockNo == 0).  The long case
 * analysis below describes which arrivals are processed, which are
 * delayed during master takeover, and which are dropped as duplicates.
 */
void Dbdih::execLCP_COMPLETE_REP(Signal* signal)
{
  jamEntry();

  CRASH_INSERTION(7191);

#if 0
  g_eventLogger->info("LCP_COMPLETE_REP");
  printLCP_COMPLETE_REP(stdout,
			signal->getDataPtr(),
			signal->length(), number());
#endif

  LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtr();

  if (rep->nodeId == LcpFragRep::BROADCAST_REQ)
  {
    jam();
    // A BROADCAST_REQ must come from a block on this very node.
    ndbrequire(refToNode(signal->getSendersBlockRef()) == getOwnNodeId());

    /**
     * Set correct nodeId
     */
    rep->nodeId = getOwnNodeId();

    /**
     * We want to ensure that we don't receive multiple LCP_COMPLETE_REP
     * from our LQH for the same LCP id. This wouldn't fly with the
     * PAUSE LCP protocol handling.
     */
    ndbrequire(rep->blockNo == DBLQH);
    ndbrequire(c_last_id_lcp_complete_rep != rep->lcpId ||
               c_last_id_lcp_complete_rep == RNIL);
    c_last_id_lcp_complete_rep = rep->lcpId;
    if (is_lcp_paused() || c_dequeue_lcp_rep_ongoing)
    {
      jam();
      /**
       * Also the LCP_COMPLETE_REP are queued when we pause the LCP reporting.
       */
      queue_lcp_complete_rep(signal, rep->lcpId);
      return;
    }
    // Forward the report to every other participating DIH node.
    NodeReceiverGroup rg(DBDIH, c_lcpState.m_participatingDIH);
    rg.m_nodes.clear(getOwnNodeId());
    sendSignal(rg, GSN_LCP_COMPLETE_REP, signal, signal->getLength(), JBB);

    /**
     * and continue processing
     */
  }

  Uint32 lcpId = rep->lcpId;
  Uint32 nodeId = rep->nodeId;
  Uint32 blockNo = rep->blockNo;

  /**
   * We can arrive here in the following cases:
   * 1) blockNo == DBLQH and signal->length() == SignalLength
   *
   * This is a normal message from a node in the m_participatingLQH
   * bitmap. It indicates that the node has completed everything of
   * its processing in DBLQH, both sending all LCP_FRAG_REP and
   * handling the UNDO log. The sender must be in the set of
   * c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH waited for.
   *
   * There is an exception for this during master takeover, another node
   * might send LCP_COMPLETE_REP after receiving MASTER_LCPREQ and finalising
   * its part of the master takeover protocol. This signal might arrive
   * before we have completed the master takeover protocol. In this case
   * the signal must be delayed until the master takeover handling is
   * completed. One reason for this is that we haven't finalised setting
   * up the master bitmaps yet.
   *
   * We know in this case that the node is alive by assumption that
   * we don't receive messages from dead nodes.
   *
   * 2) blockNo == DBLQH and signal->length() == SignalLengthTQ and
   *    rep->fromTQ == 0
   *
   * This signal is sent from NODE_FAILREP. It should be allowed to
   * pass through although the node is already declared dead and
   * no longer part of the m_participatingLQH set. It is a vital part
   * of the node failure handling. It should also not be blocked by
   * an early starting master takeover. It should however be dropped
   * if it isn't part of the set waited for (can happen if 3) arrives
   * after NODE_FAILREP but before this signal).
   *
   * This signal cannot be delayed by a master takeover. We know that
   * the master takeover state should not be possible to go beyond
   * LMTOS_INITIAL.
   *
   * 3) blockNo == DBLQH and signal->length() == SignalLengthTQ and
   *    rep->fromTQ == 1
   *
   * This signal is sent as a delayed signal when signal 1) above is
   * received in the middle of processing a master take over.
   * If it is received when the node is already dead (removed from
   * the m_participatingLQH set), then we should simply ignore it
   * and drop the signal since the node failure handling already
   * has handled it. We find this out by checking if the node is
   * part of the c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH set or
   * not.
   *
   * This signal can be delayed by a master takeover if it is not
   * to be dropped.
   *
   * 4) blockNo == DBDIH and signal->length() == SignalLength
   *
   * This is a normal signal sent from one of the nodes when it has
   * received LCP_COMPLETE_REP from all participating LQHs. It is
   * received from a node in the set of
   * c_lcpState.m_LCP_COMPLETE_REP_DIH_Counter. This set ensures that we
   * only receive one of these. We should never receive this signal if
   * the node isn't in the above set. The duplication of this signal
   * happens as part of executing NODE_FAILREP, but here we set
   * signal->length() to SignalLengthTQ and fromTQ = 0, so only that
   * signal can be arriving with the node not being part of this set.
   * The sending node can both be an alive node and a starting node
   * which hasn't been set to alive yet.
   *
   * The same principle applies as in 1) here, the signal could arrive
   * during master takeover when we haven't yet formed the correct
   * c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH set. In this case we need
   * to delay the signal until the master takeover is completed.
   *
   * 5) blockNo == DBDIH and signal->length() == SignalLengthTQ and
   *    rep->fromTQ == 0
   *
   * This is sent from node failure processing when the node has died.
   * The same logic as in 6) applies, the signal can be dropped if the
   * node isn't part of the c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH set.
   * Otherwise it should be allowed to pass through.
   *
   * This signal cannot be delayed by the master takeover.
   *
   * 6) blockNo == DBDIH and signal->length() == SignalLengthTQ and
   *    rep->fromTQ == 1
   *
   * This is a signal sent as delayed after receiving 4) above in a master
   * takeover situation, if it arrives when the node is no
   * longer part of the c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH set,
   * then we know that the signal is a duplicate and has already been
   * processed and we can safely ignore it.
   *
   * This signal can be delayed by a master takeover if it is not
   * to be dropped.
   *
   * 7) blockNo == 0 and signal->length() == SignalLength
   * This is a signal from the master indicating that the LCP is completely
   * done. It should not be possible to receive it during a master takeover
   * and thus should never be allowed to be delayed since if the master
   * takeover is being processed, then this signal cannot arrive from the
   * dead master and it is too early to receive it from the new master.
   */

  if (blockNo == DBLQH &&
      signal->length() == LcpCompleteRep::SignalLengthTQ &&
      rep->fromTQ == Uint32(0))
  {
    /* Handle case 2) above */
    ndbrequire(c_lcpMasterTakeOverState.state <= LMTOS_INITIAL);
    if (!c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId))
    {
      jam();
      // Node failure handling already accounted for this node: drop.
      return;
    }
    jam();
  }
  else if (blockNo == DBDIH &&
           signal->length() == LcpCompleteRep::SignalLengthTQ &&
           rep->fromTQ == Uint32(0))
  {
    /* Handle case 5) above */
    ndbrequire(c_lcpMasterTakeOverState.state <= LMTOS_INITIAL);
    if (!c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(nodeId))
    {
      jam();
      // Node failure handling already accounted for this node: drop.
      return;
    }
    jam();
  }
  else if (blockNo == 0)
  {
    /* Handle case 7) above) */
    jam();
    ndbrequire(signal->length() == LcpCompleteRep::SignalLength);
    /**
     * Always allowed free pass through for signals from master that LCP is
     * completed.
     * These signals should not be blocked by master takeover since the
     * master is the last node to complete master takeover and the master
     * is sending this signal.
     */
  }
  else
  {
    /* Handle case 1), case 3), case 4) and case 6) above */
    jam();
    ndbrequire(blockNo == DBDIH || blockNo == DBLQH);
    if(c_lcpMasterTakeOverState.state > LMTOS_WAIT_LCP_FRAG_REP)
    {
      jam();
      /**
       * Don't allow LCP_COMPLETE_REP to arrive during
       * LCP master take over. We haven't yet formed the set of
       * expected signals and we don't want the master state to go to
       * completed while we are forming the state.
       *
       * We keep this even when removing the need to use the EMPTY_LCP_REQ
       * protocol. The reason is that we don't want to handle code to
       * process LCP completion as part of master take over as a
       * simplification. It is perfectly doable but we opted for keeping
       * this variant.
       */
      ndbrequire(isMaster());
      // Re-send to ourselves with a 100 ms delay, marked fromTQ = 1.
      rep->fromTQ = Uint32(1);
      sendSignalWithDelay(reference(), GSN_LCP_COMPLETE_REP, signal, 100,
                          LcpCompleteRep::SignalLengthTQ);
      return;
    }
    /**
     * We are not in a master takeover situation, so we should have the
     * signal expected by the sets, however this could have been handled
     * by the signal sent from NODE_FAILREP already. So we need to verify
     * we really are in those sets. Not being in those states when a master
     * takeover isn't ongoing should only happen for delayed signals.
     */
    if (blockNo == DBLQH &&
        !c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId))
    {
      /* Can happen in case 3) above */
      jam();
      ndbrequire(signal->length() == LcpCompleteRep::SignalLengthTQ &&
                 rep->fromTQ == Uint32(1));
      return;
    }
    if (blockNo == DBDIH &&
        !c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(nodeId))
    {
      /* Can happen in case 6) above */
      jam();
      ndbrequire(signal->length() == LcpCompleteRep::SignalLengthTQ &&
                 rep->fromTQ == Uint32(1));
      return;
    }
  }

  ndbrequire(c_lcpState.lcpStatus != LCP_STATUS_IDLE);

  // Bookkeeping: clear the sender from the relevant wait-set, then see
  // whether the whole LCP is now complete.
  switch(blockNo){
  case DBLQH:
    jam();
    c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.clearWaitingFor(nodeId);
    ndbrequire(!c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodeId));
    break;
  case DBDIH:
    jam();
    ndbrequire(isMaster());
    c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.clearWaitingFor(nodeId);
    break;
  case 0:
    jam();
    // Master's completion report: only ever received on non-master nodes.
    ndbrequire(!isMaster());
    ndbrequire(c_lcpState.m_LCP_COMPLETE_REP_From_Master_Received == false);
    c_lcpState.m_LCP_COMPLETE_REP_From_Master_Received = true;
    break;
  default:
    ndbrequire(false);
  }
  // All accepted paths must refer to the LCP currently in the Sysfile.
  ndbrequire(lcpId == SYSFILE->latestLCP_ID);

  allNodesLcpCompletedLab(signal);
  return;
}
19322 
/**
 * Called whenever one of the conditions required for full LCP completion
 * may have become true.  Returns early unless ALL of them hold: local
 * tables saved, LQH and DIH completion counters done, master's completion
 * report received (on non-masters) and no LCP master takeover in
 * progress.  Once all hold it finalises the LCP: resets LCP state,
 * (master only) completes waiting take-overs and broadcasts the
 * completion, clears the LCP-ongoing bit and possibly starts the next LCP.
 */
void Dbdih::allNodesLcpCompletedLab(Signal* signal)
{
  jam();

  if (c_lcpState.lcpStatus != LCP_TAB_SAVED) {
    jam();
    /**
     * We have not sent LCP_COMPLETE_REP to master DIH yet
     */
    return;
  }//if

  if (!c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.done()){
    jam();
    return;
  }

  if (!c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.done()){
    jam();
    return;
  }

  if (!isMaster() &&
      c_lcpState.m_LCP_COMPLETE_REP_From_Master_Received == false){
    jam();
    /**
     * Wait until master DIH has signalled lcp is complete
     */
    return;
  }

  if(c_lcpMasterTakeOverState.state != LMTOS_IDLE){
    jam();
#ifdef VM_TRACE
    g_eventLogger->info("Exiting from allNodesLcpCompletedLab");
#endif
    return;
  }

  /*------------------------------------------------------------------------ */
  /*     WE HAVE NOW COMPLETED A LOCAL CHECKPOINT. WE ARE NOW READY TO WAIT  */
  /*     FOR THE NEXT LOCAL CHECKPOINT. SEND WITHOUT TIME-OUT SINCE IT MIGHT */
  /*     BE TIME TO START THE NEXT LOCAL CHECKPOINT IMMEDIATELY.             */
  /*     CLEAR BIT 3 OF SYSTEM RESTART BITS TO INDICATE THAT THERE IS NO     */
  /*     LOCAL CHECKPOINT ONGOING. THIS WILL BE WRITTEN AT SOME LATER TIME   */
  /*     DURING A GLOBAL CHECKPOINT. IT IS NOT NECESSARY TO WRITE IT         */
  /*     IMMEDIATELY. WE WILL ALSO CLEAR BIT 2 OF SYSTEM RESTART BITS IF ALL */
  /*     CURRENTLY ACTIVE NODES COMPLETED THE LOCAL CHECKPOINT.              */
  /*------------------------------------------------------------------------ */
  CRASH_INSERTION(7019);
  signal->setTrace(0);

  /* Check pause states */
  check_pause_state_lcp_idle();
  c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
  c_increase_lcp_speed_after_nf = false;

  /**
   * Update m_local_lcp_state
   */
  m_local_lcp_state.lcp_complete_rep(c_newest_restorable_gci);

  if (isMaster())
  {
    /**
     * Check for any "completed" TO
     */
    TakeOverRecordPtr takeOverPtr;
    for (c_masterActiveTakeOverList.first(takeOverPtr); !takeOverPtr.isNull();)
    {
      jam();

      // move to next, since takeOverPtr might be release below
      TakeOverRecordPtr nextPtr = takeOverPtr;
      c_masterActiveTakeOverList.next(nextPtr);

      Ptr<NodeRecord> nodePtr;
      nodePtr.i = takeOverPtr.p->toStartingNode;
      if (takeOverPtr.p->toMasterStatus == TakeOverRecord::TO_WAIT_LCP)
      {
        jam();
        if (c_lcpState.m_participatingLQH.get(nodePtr.i))
        {
          jam();
          ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
          ndbrequire(nodePtr.p->copyCompleted == 2);

          /**
           * We have completed the node restart for this node. We set the
           * node recovery status to completed. This is used also in
           * estimating times for other nodes to complete their restarts.
           * It is also used to build NDBINFO table about node restart
           * status.
           *
           * This code is only executed in master node.
           */
          setNodeRecoveryStatus(nodePtr.i, NodeRecord::WAIT_SUMA_HANDOVER);

          // Confirm the take-over to its requester and release the record.
          EndToConf * conf = (EndToConf *)signal->getDataPtrSend();
          conf->senderData = takeOverPtr.p->m_senderData;
          conf->sendingNodeId = cownNodeId;
          conf->startingNodeId = nodePtr.i;
          sendSignal(takeOverPtr.p->m_senderRef, GSN_END_TOCONF, signal,
                     EndToConf::SignalLength, JBB);

          releaseTakeOver(takeOverPtr, true);
        }
      }

      takeOverPtr = nextPtr;
    }
    /**
     * We send the LCP_COMPLETE_REP from the master node to all nodes
     * that participated in the LCP in DIH, we could have alive nodes
     * here that didn't participate in the LCP because they became
     * alive so recently that they didn't need to participate in the
     * LCP since it was already closing when they entered through the
     * PAUSE LCP protocol. Sending to those nodes is not a good idea
     * since they are not at all set up to receive a LCP_COMPLETE_REP
     * message.
     */
    LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
    rep->nodeId = getOwnNodeId();
    rep->lcpId = SYSFILE->latestLCP_ID;
    rep->blockNo = 0; // 0 = Sent from master
    NodeReceiverGroup rg(DBDIH, c_lcpState.m_participatingDIH);
    rg.m_nodes.clear(getOwnNodeId());
    sendSignal(rg, GSN_LCP_COMPLETE_REP, signal,
               LcpCompleteRep::SignalLength, JBB);

    jam();
  }

  Sysfile::clearLCPOngoing(SYSFILE->systemRestartBits);
  setLcpActiveStatusEnd(signal);

  /**
   * We calculate LCP time also in non-master although it's only used by
   * master nodes. The idea is to have an estimate of LCP execution time
   * already when the master node is running it's first LCP.
   */
  c_lcpState.m_lcp_time =
    NdbTick_Elapsed(c_lcpState.m_start_time, c_current_time).milliSec();

  if(!isMaster()){
    jam();
    /**
     * We're not master, be content
     */
    return;
  }

  /***************************************************************************/
  // Report the event that a local checkpoint has completed.
  /***************************************************************************/
  signal->theData[0] = NDB_LE_LocalCheckpointCompleted; //Event type
  signal->theData[1] = SYSFILE->latestLCP_ID;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);

  // Error inserts 7222/7223 deliberately keep lcpStopGcp unchanged.
  if (c_newest_restorable_gci > c_lcpState.lcpStopGcp &&
      !(ERROR_INSERTED(7222) || ERROR_INSERTED(7223)))
  {
    jam();
    c_lcpState.lcpStopGcp = c_newest_restorable_gci;
  }

  /**
   * Start checking for next LCP
   */
  checkLcpStart(signal, __LINE__, 0);

  ndbassert(check_pause_state_sanity());
  if (!c_lcp_runs_with_pause_support)
  {
    jam();
    // LCP ran under the fragment-info mutex; release it now.
    Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
    mutex.unlock();
  }
  else if (c_old_node_waiting_for_lcp_end)
  {
    jam();
    // An old-version node was waiting for this LCP: resume its copy.
    c_old_node_waiting_for_lcp_end = false;
    start_copy_meta_data(signal);
  }

  c_lcp_runs_with_pause_support = false;
  ndbassert(check_pause_state_sanity());
  c_current_time = NdbTick_getCurrentTicks();

  if (cwaitLcpSr == true) {
    jam();

    infoEvent("Make On-line Database recoverable by waiting for LCP"
              " Completed, LCP id = %u",
              SYSFILE->latestLCP_ID);

    cwaitLcpSr = false;
    ndbsttorry10Lab(signal, __LINE__);
    return;
  }//if
  return;
}//Dbdih::allNodesLcpCompletedLab()
19525 
19526 /******************************************************************************/
19527 /* **********     TABLE UPDATE MODULE                             *************/
19528 /* ****************************************************************************/
19529 /* ------------------------------------------------------------------------- */
19530 /*       THIS MODULE IS USED TO UPDATE THE TABLE DESCRIPTION. IT STARTS BY   */
19531 /*       CREATING THE FIRST TABLE FILE, THEN UPDATES THIS FILE AND CLOSES IT.*/
19532 /*       AFTER THAT THE SAME HAPPENS WITH THE SECOND FILE. AFTER THAT THE    */
19533 /*       TABLE DISTRIBUTION HAS BEEN UPDATED.                                */
19534 /*                                                                           */
19535 /*       THE REASON FOR CREATING THE FILE AND NOT OPENING IT IS TO ENSURE    */
19536 /*       THAT WE DO NOT GET A MIX OF OLD AND NEW INFORMATION IN THE FILE IN  */
19537 /*       ERROR SITUATIONS.                                                   */
19538 /* ------------------------------------------------------------------------- */
tableUpdateLab(Signal * signal,TabRecordPtr tabPtr)19539 void Dbdih::tableUpdateLab(Signal* signal, TabRecordPtr tabPtr) {
19540   FileRecordPtr filePtr;
19541   if (tabPtr.p->tabStorage == TabRecord::ST_TEMPORARY)
19542   {
19543     // For temporary tables we do not write to disk. Mark both copies 0 and 1
19544     // as done, and go straight to the after-close code.
19545     filePtr.i = tabPtr.p->tabFile[1];
19546     ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
19547     tableCloseLab(signal, filePtr);
19548     return;
19549   }
19550   filePtr.i = tabPtr.p->tabFile[0];
19551   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
19552   createFileRw(signal, filePtr);
19553   filePtr.p->reqStatus = FileRecord::TABLE_CREATE;
19554   return;
19555 }//Dbdih::tableUpdateLab()
19556 
tableCreateLab(Signal * signal,FileRecordPtr filePtr)19557 void Dbdih::tableCreateLab(Signal* signal, FileRecordPtr filePtr)
19558 {
19559   TabRecordPtr tabPtr;
19560   tabPtr.i = filePtr.p->tabRef;
19561   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
19562   writeTabfile(signal, tabPtr.p, filePtr);
19563   filePtr.p->reqStatus = FileRecord::TABLE_WRITE;
19564   return;
19565 }//Dbdih::tableCreateLab()
19566 
tableWriteLab(Signal * signal,FileRecordPtr filePtr)19567 void Dbdih::tableWriteLab(Signal* signal, FileRecordPtr filePtr)
19568 {
19569   closeFile(signal, filePtr);
19570   filePtr.p->reqStatus = FileRecord::TABLE_CLOSE;
19571   return;
19572 }//Dbdih::tableWriteLab()
19573 
tableCloseLab(Signal * signal,FileRecordPtr filePtr)19574 void Dbdih::tableCloseLab(Signal* signal, FileRecordPtr filePtr)
19575 {
19576   TabRecordPtr tabPtr;
19577   tabPtr.i = filePtr.p->tabRef;
19578   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
19579   if (filePtr.i == tabPtr.p->tabFile[0]) {
19580     jam();
19581     filePtr.i = tabPtr.p->tabFile[1];
19582     ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
19583     createFileRw(signal, filePtr);
19584     filePtr.p->reqStatus = FileRecord::TABLE_CREATE;
19585     return;
19586   }//if
19587   switch (tabPtr.p->tabUpdateState) {
19588   case TabRecord::US_LOCAL_CHECKPOINT:
19589     jam();
19590     releaseTabPages(tabPtr.i);
19591 
19592     tabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
19593     tabPtr.p->tabUpdateState = TabRecord::US_IDLE;
19594     tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED;
19595 
19596     /* Check whether there's some queued table definition flush op to start */
19597     if (c_lcpTabDefWritesControl.releaseMustStartQueued())
19598     {
19599       jam();
19600       /* Some table write is queued - let's kick it off */
19601       /* First find it...
19602        *   By using the tabUpdateState to 'queue' operations, we lose
19603        *   the original flush request order, which shouldn't matter.
19604        *   In any case, the checkpoint proceeds by table id, as does this
19605        *   search, so a similar order should result
19606        */
19607       TabRecordPtr tabPtr;
19608       for (tabPtr.i = 0; tabPtr.i < ctabFileSize; tabPtr.i++)
19609       {
19610         ptrAss(tabPtr, tabRecord);
19611         if (tabPtr.p->tabUpdateState == TabRecord::US_LOCAL_CHECKPOINT_QUEUED)
19612         {
19613           jam();
19614           //ndbout_c("DIH : Starting queued table def flush op on table %u", tabPtr.i);
19615           tabPtr.p->tabUpdateState = TabRecord::US_LOCAL_CHECKPOINT;
19616           signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
19617           signal->theData[1] = tabPtr.i;
19618           sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
19619           return;
19620         }
19621       }
19622       /* No queued table write found - error */
19623       g_eventLogger->warning("DIH : Error in queued table writes : inUse %u"
19624                              " queued %u total %u",
19625                              c_lcpTabDefWritesControl.inUse,
19626                              c_lcpTabDefWritesControl.queuedRequests,
19627                              c_lcpTabDefWritesControl.totalResources);
19628       ndbrequire(false);
19629     }
19630     jam();
19631     signal->theData[0] = DihContinueB::ZCHECK_LCP_COMPLETED;
19632     sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
19633 
19634     return;
19635     break;
19636   case TabRecord::US_REMOVE_NODE:
19637     jam();
19638     releaseTabPages(tabPtr.i);
19639     tabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
19640     tabPtr.p->tabUpdateState = TabRecord::US_IDLE;
19641     if (tabPtr.p->tabLcpStatus == TabRecord::TLS_WRITING_TO_FILE) {
19642       jam();
19643       tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED;
19644       signal->theData[0] = DihContinueB::ZCHECK_LCP_COMPLETED;
19645       sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
19646     }//if
19647     signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
19648     signal->theData[1] = tabPtr.p->tabRemoveNode;
19649     signal->theData[2] = tabPtr.i + 1;
19650     if (!ERROR_INSERTED(7233))
19651       sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
19652     else
19653       sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 300, 3);
19654     return;
19655     break;
19656   case TabRecord::US_INVALIDATE_NODE_LCP:
19657     jam();
19658     releaseTabPages(tabPtr.i);
19659     tabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
19660     tabPtr.p->tabUpdateState = TabRecord::US_IDLE;
19661 
19662     signal->theData[0] = DihContinueB::ZINVALIDATE_NODE_LCP;
19663     signal->theData[1] = tabPtr.p->tabRemoveNode;
19664     signal->theData[2] = tabPtr.i + 1;
19665 
19666     handle_send_continueb_invalidate_node_lcp(signal);
19667     return;
19668   case TabRecord::US_COPY_TAB_REQ:
19669     jam();
19670     tabPtr.p->tabUpdateState = TabRecord::US_IDLE;
19671     copyTabReq_complete(signal, tabPtr);
19672     return;
19673     break;
19674   case TabRecord::US_ADD_TABLE_MASTER:
19675     jam();
19676     releaseTabPages(tabPtr.i);
19677     tabPtr.p->tabUpdateState = TabRecord::US_IDLE;
19678     signal->theData[0] = DihContinueB::ZDIH_ADD_TABLE_MASTER;
19679     signal->theData[1] = tabPtr.i;
19680     sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
19681     return;
19682     break;
19683   case TabRecord::US_ADD_TABLE_SLAVE:
19684     jam();
19685     releaseTabPages(tabPtr.i);
19686     tabPtr.p->tabUpdateState = TabRecord::US_IDLE;
19687     signal->theData[0] = DihContinueB::ZDIH_ADD_TABLE_SLAVE;
19688     signal->theData[1] = tabPtr.i;
19689     sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
19690     return;
19691     break;
19692   case TabRecord::US_CALLBACK:
19693   {
19694     jam();
19695     releaseTabPages(tabPtr.i);
19696     tabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
19697     tabPtr.p->tabUpdateState = TabRecord::US_IDLE;
19698 
19699     Ptr<ConnectRecord> connectPtr;
19700     connectPtr.i = tabPtr.p->connectrec;
19701     ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
19702     execute(signal, connectPtr.p->m_callback, 0);
19703     return;
19704   }
19705   default:
19706     ndbrequire(false);
19707     return;
19708     break;
19709   }//switch
19710 }//Dbdih::tableCloseLab()
19711 
/**
 * Periodic GCP liveness monitor, driven by CONTINUEB(ZCHECK_GCP_STOP)
 * which this function re-sends to itself every ~GCPCheckPeriodMillis.
 *
 * For each of the two GCP protocols (GCP_SAVE and the micro/commit GCP)
 * it accumulates how long the protocol has been stuck at its current GCI.
 * When a configured max lag is exceeded, escalation is delegated to
 * crashSystemAtGcpStop(); warnings are emitted at regular lag intervals.
 */
void Dbdih::checkGcpStopLab(Signal* signal)
{
  static const Uint32 GCPCheckPeriodMillis = 100;

  // Calculate real time elapsed since last check
  const NDB_TICKS now = NdbTick_getCurrentTicks();
  const NDB_TICKS last = m_gcp_monitor.m_last_check;
  m_gcp_monitor.m_last_check = now;

  /**
   * Avoid false GCP failures if timers misbehaves,
   * (timer is non-monotonic, or OS/VM bugs which there are some of)
   * or we have scheduler problems due to being CPU starved:
   *
   * - If we overslept 'GCPCheckPeriodMillis', (CPU starved?) or
   *   timer leapt forward for other reasons (Adjusted, or OS-bug)
   *   we never calculate an elapsed periode of more than
   *   the requested sleep 'GCPCheckPeriodMillis'
   * - Else we add the real measured elapsed time to total.
   *   (Timers may fire prior to requested 'GCPCheckPeriodMillis')
   *
   * Note: If timer for some reason ticked backwards such that
   *       'now < last', NdbTick_Elapsed() will return '0' such
   *       that this is 'absorbed'
   */
  Uint32 elapsed_ms = (Uint32)NdbTick_Elapsed(last,now).milliSec();
  if (elapsed_ms > GCPCheckPeriodMillis)
    elapsed_ms = GCPCheckPeriodMillis;

  // Accumulated stall time of each protocol at its current GCI.
  const Uint32 lag0 = (m_gcp_monitor.m_gcp_save.m_elapsed_ms  += elapsed_ms);
  const Uint32 lag1 = (m_gcp_monitor.m_micro_gcp.m_elapsed_ms += elapsed_ms);

  if (ERROR_INSERTED(7145))
  {
    static bool done = false;
    /*
      Recalculate the timeouts to get the low values that the test
      needs.  This was initially done at startup, and at that point,
      the ERROR_INSERT was not set yet.
    */
    if (!done)
    {
      setGCPStopTimeouts();
      done = true;
    }
  }

  // GCP_SAVE protocol: GCI unchanged since last check => still stalled.
  if (m_gcp_monitor.m_gcp_save.m_gci == m_gcp_save.m_gci)
  {
    jam();
    // m_max_lag_ms == 0 disables enforcement (see "(no max lag)" below)
    if (m_gcp_monitor.m_gcp_save.m_max_lag_ms &&
        lag0 >= m_gcp_monitor.m_gcp_save.m_max_lag_ms)
    {
      crashSystemAtGcpStop(signal, false);
      /* Continue monitoring */
    }

    /**
     * Will report a warning every time lag crosses
     * a multiple of 'report_period_ms'
     */
    const Uint32 report_period_ms = 60*1000; // 60 seconds
    if (lag0 > 0 && (lag0 % report_period_ms) < elapsed_ms)
    {
      if (m_gcp_monitor.m_gcp_save.m_max_lag_ms)
      {
        warningEvent("GCP Monitor: GCP_SAVE lag %u seconds"
                     " (max lag: %us)",
                     lag0/1000, m_gcp_monitor.m_gcp_save.m_max_lag_ms/1000);
      }
      else
      {
        warningEvent("GCP Monitor: GCP_SAVE lag %u seconds"
                     " (no max lag)",
                     lag0/1000);
      }
    }
  }
  else
  {
    jam();
    // GCI advanced: restart lag measurement from the new GCI.
    m_gcp_monitor.m_gcp_save.m_gci = m_gcp_save.m_gci;
    m_gcp_monitor.m_gcp_save.m_elapsed_ms = 0;
  }

  // Micro (commit) GCP protocol: same scheme as above.
  if (m_gcp_monitor.m_micro_gcp.m_gci == m_micro_gcp.m_current_gci)
  {
    jam();
    // When micro GCP is not enabled, fall back to the GCP_SAVE limit.
    const Uint32 cmp = m_micro_gcp.m_enabled ?
      m_gcp_monitor.m_micro_gcp.m_max_lag_ms :
      m_gcp_monitor.m_gcp_save.m_max_lag_ms;

    if (cmp && lag1 >= cmp)
    {
      crashSystemAtGcpStop(signal, false);
      /* Continue monitoring */
    }

    /**
     * Will report a warning every time lag crosses
     * a multiple of 'report_period_ms'
     */
    const Uint32 report_period_ms = 10*1000; // 10 seconds
    if (lag1 > 0 && (lag1 % report_period_ms) < elapsed_ms)
    {
      if (m_gcp_monitor.m_micro_gcp.m_max_lag_ms)
      {
        warningEvent("GCP Monitor: GCP_COMMIT lag %u seconds"
                     " (max lag: %u)",
                     lag1/1000, m_gcp_monitor.m_micro_gcp.m_max_lag_ms/1000);
      }
      else
      {
        warningEvent("GCP Monitor: GCP_COMMIT lag %u seconds"
                     " (no max lag)",
                     lag1/1000);
      }
    }
  }
  else
  {
    jam();
    m_gcp_monitor.m_micro_gcp.m_elapsed_ms = 0;
    m_gcp_monitor.m_micro_gcp.m_gci = m_micro_gcp.m_current_gci;
  }

  // Re-arm the monitor for the next period.
  signal->theData[0] = DihContinueB::ZCHECK_GCP_STOP;
  sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                      GCPCheckPeriodMillis, 1);
  return;
}//Dbdih::checkGcpStopLab()
19843 
/**
 * Dump GCP-related state to the node log for post-mortem analysis of a
 * GCP stop: monitor lag counters, protocol state machines, and all
 * outstanding signal counters that a stalled GCP round could be
 * waiting on.  Called from crashSystemAtGcpStop() before escalation.
 */
void
Dbdih::dumpGcpStop()
{
  ndbout_c("c_nodeStartMaster.blockGcp: %u %u",
           c_nodeStartMaster.blockGcp,
           c_nodeStartMaster.startNode);
  ndbout_c("m_gcp_save.m_elapsed: %u(ms) m_gcp_save.m_max_lag: %u(ms)",
           m_gcp_monitor.m_gcp_save.m_elapsed_ms,
           m_gcp_monitor.m_gcp_save.m_max_lag_ms);
  ndbout_c("m_micro_gcp.m_elapsed: %u(ms) m_micro_gcp.m_max_lag: %u(ms)",
           m_gcp_monitor.m_micro_gcp.m_elapsed_ms,
           m_gcp_monitor.m_micro_gcp.m_max_lag_ms);


  ndbout_c("m_gcp_save.m_state: %u", m_gcp_save.m_state);
  ndbout_c("m_gcp_save.m_master.m_state: %u", m_gcp_save.m_master.m_state);
  ndbout_c("m_micro_gcp.m_state: %u", m_micro_gcp.m_state);
  ndbout_c("m_micro_gcp.m_master.m_state: %u", m_micro_gcp.m_master.m_state);

  // Outstanding-signal counters: show which nodes have not yet replied.
  ndbout_c("c_COPY_GCIREQ_Counter = %s", c_COPY_GCIREQ_Counter.getText());
  ndbout_c("c_COPY_TABREQ_Counter = %s", c_COPY_TABREQ_Counter.getText());
  ndbout_c("c_UPDATE_FRAG_STATEREQ_Counter = %s",
            c_UPDATE_FRAG_STATEREQ_Counter.getText());
  ndbout_c("c_DIH_SWITCH_REPLICA_REQ_Counter = %s",
	   c_DIH_SWITCH_REPLICA_REQ_Counter.getText());
  ndbout_c("c_EMPTY_LCP_REQ_Counter = %s",c_EMPTY_LCP_REQ_Counter.getText());
  ndbout_c("c_GCP_COMMIT_Counter = %s", c_GCP_COMMIT_Counter.getText());
  ndbout_c("c_GCP_PREPARE_Counter = %s", c_GCP_PREPARE_Counter.getText());
  ndbout_c("c_GCP_SAVEREQ_Counter = %s", c_GCP_SAVEREQ_Counter.getText());
  ndbout_c("c_SUB_GCP_COMPLETE_REP_Counter = %s",
           c_SUB_GCP_COMPLETE_REP_Counter.getText());
  ndbout_c("c_INCL_NODEREQ_Counter = %s", c_INCL_NODEREQ_Counter.getText());
  ndbout_c("c_MASTER_GCPREQ_Counter = %s", c_MASTER_GCPREQ_Counter.getText());
  ndbout_c("c_MASTER_LCPREQ_Counter = %s", c_MASTER_LCPREQ_Counter.getText());
  ndbout_c("c_START_INFOREQ_Counter = %s", c_START_INFOREQ_Counter.getText());
  ndbout_c("c_START_RECREQ_Counter = %s", c_START_RECREQ_Counter.getText());
  ndbout_c("c_STOP_ME_REQ_Counter = %s", c_STOP_ME_REQ_Counter.getText());
  ndbout_c("c_TC_CLOPSIZEREQ_Counter = %s", c_TC_CLOPSIZEREQ_Counter.getText());
  ndbout_c("c_TCGETOPSIZEREQ_Counter = %s", c_TCGETOPSIZEREQ_Counter.getText());

  ndbout_c("m_copyReason: %d m_waiting: %u %u",
           c_copyGCIMaster.m_copyReason,
           c_copyGCIMaster.m_waiting[0],
           c_copyGCIMaster.m_waiting[1]);

  ndbout_c("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d",
	   c_copyGCISlave.m_senderData,
	   c_copyGCISlave.m_senderRef,
	   c_copyGCISlave.m_copyReason,
	   c_copyGCISlave.m_expectedNextWord);
}
19895 
19896 /**
19897  * GCP stop detected,
19898  * local == true means we must shutdown
19899  * local == false means we (GCP Master) are deciding what to
19900  *  do - may involve requesting shut down of other nodes and/or
19901  *  ourself.
19902  *
19903  * The action to take is generally :
19904  *   1.  Send 'Please log debug info + shutdown' signals to
19905  *       stalled nodes
 *   2.  Send ISOLATE_ORD with delay of X millis to *all*
 *       nodes (including self)
19908  *
19909  * Part 1 should result in a clean shutdown with debug
19910  * information and a clear cause
19911  * Part 2 ensures that if part 1 fails (as it might if the
19912  * nodes are 'ill'), the live nodes quickly exclude the
19913  * ill node and get on with their lives.
19914  *
19915  * Part 1 is implemented by various DUMP_STATE_ORD signals
19916  * and SYSTEM_ERROR
19917  * Part 2 is implemented using ISOLATE_ORD.
19918 */
/**
 * Escalation path when the GCP monitor has detected a stall.
 *
 * @param signal  signal object, reused for the outgoing orders
 * @param local   true  => this node must shut itself down (jump straight
 *                         to the 'dolocal' tail);
 *                false => we are the GCP master deciding whom to kill,
 *                         based on which protocol phase is stuck and
 *                         which nodes have not yet replied.
 *
 * Each master branch (1) dumps state on the stalled nodes, (2) sends
 * SYSTEM_ERROR(GCPStopDetected) to kill them cleanly, and (3) arms
 * ISOLATE_ORD as a backstop in case the clean kill does not happen.
 * The monitor's lag counters are reset up-front so monitoring can
 * continue after a non-fatal escalation.
 */
void Dbdih::crashSystemAtGcpStop(Signal* signal, bool local)
{
  dumpGcpStop();
  // Snapshot lags, then reset the counters so the monitor keeps running.
  const Uint32 save_elapsed = m_gcp_monitor.m_gcp_save.m_elapsed_ms;
  const Uint32 micro_elapsed = m_gcp_monitor.m_micro_gcp.m_elapsed_ms;
  m_gcp_monitor.m_gcp_save.m_elapsed_ms = 0;
  m_gcp_monitor.m_micro_gcp.m_elapsed_ms = 0;

  const Uint32 NodeIsolationTimeoutMillis = 100;

  if (local)
    goto dolocal;

  if (c_nodeStartMaster.blockGcp == 2)
  {
    jam();
    /**
     * Starting node...is delaying GCP too long...
     *   kill it
     */
    SystemError * const sysErr = (SystemError*)&signal->theData[0];
    sysErr->errorCode = SystemError::GCPStopDetected;
    sysErr->errorRef = reference();
    sysErr->data[0] = m_gcp_save.m_master.m_state;
    sysErr->data[1] = cgcpOrderBlocked;
    sysErr->data[2] = m_micro_gcp.m_master.m_state;
    sendSignal(calcNdbCntrBlockRef(c_nodeStartMaster.startNode),
               GSN_SYSTEM_ERROR, signal, SystemError::SignalLength, JBA);

    {
      /* Isolate, just in case */
      NdbNodeBitmask victims;
      victims.set(c_nodeStartMaster.startNode);

      isolateNodes(signal,
                   NodeIsolationTimeoutMillis,
                   victims);
    }
    return;
  }

  // GCP_SAVE protocol stalled past its limit: act per master state.
  if (save_elapsed >= m_gcp_monitor.m_gcp_save.m_max_lag_ms)
  {
    switch(m_gcp_save.m_master.m_state){
    case GcpSave::GCP_SAVE_IDLE:
    {
      /**
       * No switch for looong time...and we're idle...it is *our* fault
       */
      /* Ask others to isolate me, just in case */
      {
        NdbNodeBitmask victims;
        victims.set(cownNodeId);

        isolateNodes(signal,
                     NodeIsolationTimeoutMillis,
                     victims);
      }
      local = true;  // fall through to local shutdown below
      break;
    }
    case GcpSave::GCP_SAVE_REQ:
    {
      jam();
      // Waiting for GCP_SAVECONF: kill the LQH nodes that have not replied.
      NodeReceiverGroup rg(DBLQH, c_GCP_SAVEREQ_Counter);
      signal->theData[0] = 2305;
      sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBB);

      isolateNodes(signal,
                   NodeIsolationTimeoutMillis,
                   c_GCP_SAVEREQ_Counter.getNodeBitmask());

      warningEvent("Detected GCP stop(%d)...sending kill to %s",
                m_gcp_save.m_master.m_state, c_GCP_SAVEREQ_Counter.getText());
      ndbout_c("Detected GCP stop(%d)...sending kill to %s",
               m_gcp_save.m_master.m_state, c_GCP_SAVEREQ_Counter.getText());
      ndbrequire(!c_GCP_SAVEREQ_Counter.done());
      return;
    }
    case GcpSave::GCP_SAVE_COPY_GCI:
    {
      /**
       * We're waiting for a COPY_GCICONF
       */
      warningEvent("Detected GCP stop(%d)...sending kill to %s",
                m_gcp_save.m_master.m_state, c_COPY_GCIREQ_Counter.getText());
      ndbout_c("Detected GCP stop(%d)...sending kill to %s",
               m_gcp_save.m_master.m_state, c_COPY_GCIREQ_Counter.getText());

      {
        NodeReceiverGroup rg(DBDIH, c_COPY_GCIREQ_Counter);
        signal->theData[0] = 7022;
        sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
      }

      {
        NodeReceiverGroup rg(NDBCNTR, c_COPY_GCIREQ_Counter);
        SystemError * const sysErr = (SystemError*)&signal->theData[0];
        sysErr->errorCode = SystemError::GCPStopDetected;
        sysErr->errorRef = reference();
        sysErr->data[0] = m_gcp_save.m_master.m_state;
        sysErr->data[1] = cgcpOrderBlocked;
        sysErr->data[2] = m_micro_gcp.m_master.m_state;
        sendSignal(rg, GSN_SYSTEM_ERROR, signal,
                   SystemError::SignalLength, JBA);
      }

      isolateNodes(signal,
                   NodeIsolationTimeoutMillis,
                   c_COPY_GCIREQ_Counter.getNodeBitmask());

      ndbrequire(!c_COPY_GCIREQ_Counter.done());
      return;
    }
    case GcpSave::GCP_SAVE_CONF:
      /**
       * This *should* not happen (not a master state)
       */
      local = true;
      break;
    }
  }

  // Micro (commit) GCP protocol stalled past its limit.
  if (micro_elapsed >= m_gcp_monitor.m_micro_gcp.m_max_lag_ms)
  {
    switch(m_micro_gcp.m_master.m_state){
    case MicroGcp::M_GCP_IDLE:
    {
      /**
       * No switch for looong time...and we're idle...it is *our* fault
       */
      /* Ask others to isolate me, just in case */
      {
        NdbNodeBitmask victims;
        victims.set(cownNodeId);

        isolateNodes(signal,
                     NodeIsolationTimeoutMillis,
                     victims);
      }
      local = true;  // fall through to local shutdown below
      break;
    }
    case MicroGcp::M_GCP_PREPARE:
    {
    /**
     * We're waiting for a GCP PREPARE CONF
     */
      warningEvent("Detected GCP stop(%d)...sending kill to %s",
                m_micro_gcp.m_state, c_GCP_PREPARE_Counter.getText());
      ndbout_c("Detected GCP stop(%d)...sending kill to %s",
               m_micro_gcp.m_state, c_GCP_PREPARE_Counter.getText());

      {
        NodeReceiverGroup rg(DBDIH, c_GCP_PREPARE_Counter);
        signal->theData[0] = 7022;
        sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
      }

      {
        NodeReceiverGroup rg(NDBCNTR, c_GCP_PREPARE_Counter);
        SystemError * const sysErr = (SystemError*)&signal->theData[0];
        sysErr->errorCode = SystemError::GCPStopDetected;
        sysErr->errorRef = reference();
        sysErr->data[0] = m_gcp_save.m_master.m_state;
        sysErr->data[1] = cgcpOrderBlocked;
        sysErr->data[2] = m_micro_gcp.m_master.m_state;
        sendSignal(rg, GSN_SYSTEM_ERROR, signal,
                   SystemError::SignalLength, JBA);
      }

      isolateNodes(signal,
                   NodeIsolationTimeoutMillis,
                   c_GCP_PREPARE_Counter.getNodeBitmask());

      ndbrequire(!c_GCP_PREPARE_Counter.done());
      return;
    }
    case MicroGcp::M_GCP_COMMIT:
    {
      // Waiting for GCP_NODEFINISH from the nodes still in the counter.
      warningEvent("Detected GCP stop(%d)...sending kill to %s",
                m_micro_gcp.m_state, c_GCP_COMMIT_Counter.getText());
      ndbout_c("Detected GCP stop(%d)...sending kill to %s",
               m_micro_gcp.m_state, c_GCP_COMMIT_Counter.getText());

      {
        NodeReceiverGroup rg(DBDIH, c_GCP_COMMIT_Counter);
        signal->theData[0] = 7022;
        sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
      }

      {
        NodeReceiverGroup rg(NDBCNTR, c_GCP_COMMIT_Counter);
        SystemError * const sysErr = (SystemError*)&signal->theData[0];
        sysErr->errorCode = SystemError::GCPStopDetected;
        sysErr->errorRef = reference();
        sysErr->data[0] = m_gcp_save.m_master.m_state;
        sysErr->data[1] = cgcpOrderBlocked;
        sysErr->data[2] = m_micro_gcp.m_master.m_state;
        sendSignal(rg, GSN_SYSTEM_ERROR, signal,
                   SystemError::SignalLength, JBA);
      }

      isolateNodes(signal,
                   NodeIsolationTimeoutMillis,
                   c_GCP_COMMIT_Counter.getNodeBitmask());

      ndbrequire(!c_GCP_COMMIT_Counter.done());
      return;
    }
    case MicroGcp::M_GCP_COMMITTED:
      /**
       * This *should* not happen (not a master state)
       */
      local = true;
      break;
    case MicroGcp::M_GCP_COMPLETE:
      // Waiting for SUB_GCP_COMPLETE_ACK from the nodes in the counter.
      infoEvent("Detected GCP stop(%d)...sending kill to %s",
                m_micro_gcp.m_state, c_SUB_GCP_COMPLETE_REP_Counter.getText());
      ndbout_c("Detected GCP stop(%d)...sending kill to %s",
               m_micro_gcp.m_state, c_SUB_GCP_COMPLETE_REP_Counter.getText());

      {
        NodeReceiverGroup rg(DBDIH, c_SUB_GCP_COMPLETE_REP_Counter);
        signal->theData[0] = 7022;
        sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
      }

      {
        NodeReceiverGroup rg(NDBCNTR, c_SUB_GCP_COMPLETE_REP_Counter);
        SystemError * const sysErr = (SystemError*)&signal->theData[0];
        sysErr->errorCode = SystemError::GCPStopDetected;
        sysErr->errorRef = reference();
        sysErr->data[0] = m_gcp_save.m_master.m_state;
        sysErr->data[1] = cgcpOrderBlocked;
        sysErr->data[2] = m_micro_gcp.m_master.m_state;
        sendSignal(rg, GSN_SYSTEM_ERROR, signal,
                   SystemError::SignalLength, JBA);
      }

      isolateNodes(signal,
                   NodeIsolationTimeoutMillis,
                   c_SUB_GCP_COMPLETE_REP_Counter.getNodeBitmask());

      ndbrequire(!c_SUB_GCP_COMPLETE_REP_Counter.done());
      return;
    }
  }

dolocal:
  // Local shutdown: dump restart-info file state before dying.
  FileRecordPtr file0Ptr;
  file0Ptr.i = crestartInfoFile[0];
  ptrCheckGuard(file0Ptr, cfileFileSize, fileRecord);
  FileRecordPtr file1Ptr;
  file1Ptr.i = crestartInfoFile[1];
  ptrCheckGuard(file1Ptr, cfileFileSize, fileRecord);

  ndbout_c("file[0] status: %d type: %d reqStatus: %d file1: %d %d %d",
	   file0Ptr.p->fileStatus, file0Ptr.p->fileType, file0Ptr.p->reqStatus,
	   file1Ptr.p->fileStatus, file1Ptr.p->fileType, file1Ptr.p->reqStatus
	   );

  signal->theData[0] = 404;
  signal->theData[1] = file0Ptr.p->fileRef;
  EXECUTE_DIRECT(NDBFS, GSN_DUMP_STATE_ORD, signal, 2);

  signal->theData[0] = 404;
  signal->theData[1] = file1Ptr.p->fileRef;
  EXECUTE_DIRECT(NDBFS, GSN_DUMP_STATE_ORD, signal, 2);

  /* Various GCP_STOP error insert codes */
  if (ERROR_INSERTED(7238) ||
      ERROR_INSERTED(7239) ||
      ERROR_INSERTED(7244) ||
      ERROR_INSERTED(7237) ||
      ERROR_INSERTED(7241) ||
      ERROR_INSERTED(7242) ||
      ERROR_INSERTED(7243))
  {
    jam();
    if (ERROR_INSERT_EXTRA == 1)
    {
      /* Testing GCP STOP handling via node isolation */
      jam();
      g_eventLogger->info("Not killing local due to GCP stop");
      return;
    }
    /* Otherwise fall through to SYSTEM_ERROR  */
  }

  jam();
  // Kill ourselves via NDBCNTR; EXECUTE_DIRECT is not expected to return.
  SystemError * const sysErr = (SystemError*)&signal->theData[0];
  sysErr->errorCode = SystemError::GCPStopDetected;
  sysErr->errorRef = reference();
  sysErr->data[0] = m_gcp_save.m_master.m_state;
  sysErr->data[1] = cgcpOrderBlocked;
  sysErr->data[2] = m_micro_gcp.m_master.m_state;
  EXECUTE_DIRECT(NDBCNTR, GSN_SYSTEM_ERROR,
                 signal, SystemError::SignalLength);
  ndbrequire(false);
  return;
}//Dbdih::crashSystemAtGcpStop()
20221 
20222 /*************************************************************************/
20223 /*                                                                       */
20224 /*       MODULE: ALLOCPAGE                                               */
20225 /*       DESCRIPTION: THE SUBROUTINE IS CALLED WITH POINTER TO PAGE      */
20226 /*                    RECORD. A PAGE  RECORD IS TAKEN FROM               */
20227 /*                    THE FREE PAGE  LIST                                */
20228 /*************************************************************************/
/**
 * Pop the first page record off the free list (headed by cfirstfreepage)
 * and hand it to the caller, unlinked.  Requires at least one free page;
 * exhaustion is a fatal condition (ndbrequire).
 */
void Dbdih::allocpage(PageRecordPtr& pagePtr)
{
  ndbrequire(cfirstfreepage != RNIL);
  pagePtr.i = cfirstfreepage;
  ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
  cfirstfreepage = pagePtr.p->nextfreepage;  // advance free-list head
  pagePtr.p->nextfreepage = RNIL;            // detach the returned page
}//Dbdih::allocpage()
20237 
20238 /*************************************************************************/
20239 /*                                                                       */
20240 /*       MODULE: ALLOC_STORED_REPLICA                                    */
20241 /*       DESCRIPTION: THE SUBROUTINE IS CALLED TO GET A REPLICA RECORD,  */
20242 /*                    TO INITIALISE IT AND TO LINK IT INTO THE FRAGMENT  */
20243 /*                    STORE RECORD. USED FOR STORED REPLICAS.            */
20244 /*************************************************************************/
allocStoredReplica(FragmentstorePtr fragPtr,ReplicaRecordPtr & newReplicaPtr,Uint32 nodeId,Uint32 fragId,Uint32 tableId)20245 void Dbdih::allocStoredReplica(FragmentstorePtr fragPtr,
20246                                ReplicaRecordPtr& newReplicaPtr,
20247                                Uint32 nodeId,
20248                                Uint32 fragId,
20249                                Uint32 tableId)
20250 {
20251   Uint32 i;
20252   ReplicaRecordPtr arrReplicaPtr;
20253   ReplicaRecordPtr arrPrevReplicaPtr;
20254 
20255   seizeReplicaRec(newReplicaPtr);
20256   for (i = 0; i < MAX_LCP_STORED; i++) {
20257     newReplicaPtr.p->maxGciCompleted[i] = 0;
20258     newReplicaPtr.p->maxGciStarted[i] = 0;
20259     newReplicaPtr.p->lcpId[i] = 0;
20260     newReplicaPtr.p->lcpStatus[i] = ZINVALID;
20261   }//for
20262   newReplicaPtr.p->fragId = fragId;
20263   newReplicaPtr.p->tableId = tableId;
20264   newReplicaPtr.p->noCrashedReplicas = 0;
20265   newReplicaPtr.p->initialGci = (Uint32)(m_micro_gcp.m_current_gci >> 32);
20266   for (i = 0; i < MAX_CRASHED_REPLICAS; i++) {
20267     newReplicaPtr.p->replicaLastGci[i] = ZINIT_REPLICA_LAST_GCI;
20268     newReplicaPtr.p->createGci[i] = ZINIT_CREATE_GCI;
20269   }//for
20270   newReplicaPtr.p->createGci[0] = (Uint32)(m_micro_gcp.m_current_gci >> 32);
20271   newReplicaPtr.p->nextLcp = 0;
20272   newReplicaPtr.p->procNode = nodeId;
20273   newReplicaPtr.p->lcpOngoingFlag = false;
20274   newReplicaPtr.p->lcpIdStarted = 0;
20275 
20276   arrPrevReplicaPtr.i = RNIL;
20277   arrReplicaPtr.i = fragPtr.p->storedReplicas;
20278   while (arrReplicaPtr.i != RNIL) {
20279     jam();
20280     c_replicaRecordPool.getPtr(arrReplicaPtr);
20281     arrPrevReplicaPtr = arrReplicaPtr;
20282     arrReplicaPtr.i = arrReplicaPtr.p->nextPool;
20283   }//while
20284   if (arrPrevReplicaPtr.i == RNIL) {
20285     jam();
20286     fragPtr.p->storedReplicas = newReplicaPtr.i;
20287   } else {
20288     jam();
20289     arrPrevReplicaPtr.p->nextPool = newReplicaPtr.i;
20290   }//if
20291   fragPtr.p->noStoredReplicas++;
20292 }//Dbdih::allocStoredReplica()
20293 
20294 /*************************************************************************/
20295 /* CHECK IF THE NODE CRASH IS TO ESCALATE INTO A SYSTEM CRASH. WE COULD  */
20296 /* DO THIS BECAUSE ALL REPLICAS OF SOME FRAGMENT ARE LOST. WE COULD ALSO */
20297 /* DO IT AFTER MANY NODE FAILURES THAT MAKE IT VERY DIFFICULT TO RESTORE */
20298 /* DATABASE AFTER A SYSTEM CRASH. IT MIGHT EVEN BE IMPOSSIBLE AND THIS   */
20299 /* MUST BE AVOIDED EVEN MORE THAN AVOIDING SYSTEM CRASHES.               */
20300 /*************************************************************************/
checkEscalation()20301 void Dbdih::checkEscalation()
20302 {
20303   Uint32 TnodeGroup[MAX_NDB_NODES];
20304   NodeRecordPtr nodePtr;
20305   Uint32 i;
20306   for (i = 0; i < cnoOfNodeGroups; i++) {
20307     TnodeGroup[i] = ZFALSE;
20308   }//for
20309   for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
20310     jam();
20311     ptrAss(nodePtr, nodeRecord);
20312     if (nodePtr.p->nodeStatus == NodeRecord::ALIVE &&
20313 	nodePtr.p->activeStatus == Sysfile::NS_Active){
20314       ndbrequire(nodePtr.p->nodeGroup < MAX_NDB_NODES);
20315       TnodeGroup[nodePtr.p->nodeGroup] = ZTRUE;
20316     }
20317   }
20318   for (i = 0; i < cnoOfNodeGroups; i++) {
20319     jam();
20320     if (TnodeGroup[c_node_groups[i]] == ZFALSE) {
20321       jam();
20322       progError(__LINE__, NDBD_EXIT_LOST_NODE_GROUP, "Lost node group");
20323     }//if
20324   }//for
20325 }//Dbdih::checkEscalation()
20326 
20327 /*************************************************************************/
20328 /*                                                                       */
20329 /*       MODULE: CHECK_KEEP_GCI                                          */
20330 /*       DESCRIPTION: CHECK FOR MINIMUM GCI RESTORABLE WITH NEW LOCAL    */
20331 /*                    CHECKPOINT.                                        */
20332 /*************************************************************************/
checkKeepGci(TabRecordPtr tabPtr,Uint32 fragId,Fragmentstore *,Uint32 replicaStartIndex)20333 void Dbdih::checkKeepGci(TabRecordPtr tabPtr, Uint32 fragId, Fragmentstore*,
20334 			 Uint32 replicaStartIndex)
20335 {
20336   ReplicaRecordPtr ckgReplicaPtr;
20337   ckgReplicaPtr.i = replicaStartIndex;
20338   while (ckgReplicaPtr.i != RNIL) {
20339     jam();
20340     c_replicaRecordPool.getPtr(ckgReplicaPtr);
20341     if (c_lcpState.m_participatingLQH.get(ckgReplicaPtr.p->procNode))
20342     {
20343       Uint32 keepGci;
20344       Uint32 oldestRestorableGci;
20345       findMinGci(ckgReplicaPtr, keepGci, oldestRestorableGci);
20346       if (keepGci < c_lcpState.keepGci) {
20347         jam();
20348         /* ----------------------------------------------------------------- */
20349         /* WE MUST KEEP LOG RECORDS SO THAT WE CAN USE ALL LOCAL CHECKPOINTS */
20350         /* THAT ARE AVAILABLE. THUS WE NEED TO CALCULATE THE MINIMUM OVER ALL*/
20351         /* FRAGMENTS.                                                        */
20352         /* ----------------------------------------------------------------- */
20353         c_lcpState.keepGci = keepGci;
20354       }//if
20355       if (oldestRestorableGci > c_lcpState.oldestRestorableGci) {
20356         jam();
20357         c_lcpState.oldestRestorableGci = oldestRestorableGci;
20358       }//if
20359     }
20360     ckgReplicaPtr.i = ckgReplicaPtr.p->nextPool;
20361   }//while
20362 }//Dbdih::checkKeepGci()
20363 
/**
 * Ask NDBFS to close the file (keeping it on disk, ZCLOSE_NO_DELETE).
 * FSCLOSEREQ layout: [0]=file handle, [1]=our block ref for the reply,
 * [2]=filePtr.i as user data echoed back, [3]=delete flag.
 */
void Dbdih::closeFile(Signal* signal, FileRecordPtr filePtr)
{
  signal->theData[0] = filePtr.p->fileRef;
  signal->theData[1] = reference();
  signal->theData[2] = filePtr.i;
  signal->theData[3] = ZCLOSE_NO_DELETE;
  sendSignal(NDBFS_REF, GSN_FSCLOSEREQ, signal, 4, JBA);
}//Dbdih::closeFile()
20372 
/**
 * Ask NDBFS to close the file AND remove it from disk (ZCLOSE_DELETE).
 * Same FSCLOSEREQ layout as closeFile(); only the delete flag differs.
 */
void Dbdih::closeFileDelete(Signal* signal, FileRecordPtr filePtr)
{
  signal->theData[0] = filePtr.p->fileRef;
  signal->theData[1] = reference();
  signal->theData[2] = filePtr.i;
  signal->theData[3] = ZCLOSE_DELETE;
  sendSignal(NDBFS_REF, GSN_FSCLOSEREQ, signal, 4, JBA);
}//Dbdih::closeFileDelete()
20381 
/**
 * Ask NDBFS to open/create the file for read-write access.
 * FSOPENREQ layout: [0]=our block ref, [1]=filePtr.i echoed back in the
 * reply, [2..5]=the four-word encoded file name, [6]=open mode.
 * Completion is routed via filePtr.p->reqStatus, set by the caller.
 */
void Dbdih::createFileRw(Signal* signal, FileRecordPtr filePtr)
{
  signal->theData[0] = reference();
  signal->theData[1] = filePtr.i;
  signal->theData[2] = filePtr.p->fileName[0];
  signal->theData[3] = filePtr.p->fileName[1];
  signal->theData[4] = filePtr.p->fileName[2];
  signal->theData[5] = filePtr.p->fileName[3];
  signal->theData[6] = ZCREATE_READ_WRITE;
  sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, 7, JBA);
}//Dbdih::createFileRw()
20393 
/**
 * Drain one entry from the DIVERIFY queue with index q.
 *
 * Entries are queued while GCP commit is blocked.  If commit is not
 * blocked, one queued request is answered with DIVERIFYCONF carrying
 * the current GCI.  When invoked from the CONTINUEB drain loop
 * (aContinueB == true) the routine re-schedules itself so the queue is
 * eventually emptied, and when the queue looks empty it synchronizes
 * with the DBTC threads to make sure no concurrently-added entry is
 * missed before declaring the drain done.
 *
 * @param signal     signal object used for DIVERIFYCONF / CONTINUEB
 * @param q          index of the verification queue to drain
 * @param aContinueB true when called as part of the CONTINUEB drain loop
 */
void
Dbdih::emptyverificbuffer(Signal* signal, Uint32 q, bool aContinueB)
{
  if(unlikely(getBlockCommit() == true))
  {
    jam();
    /* Commit is blocked: leave entries queued until it is unblocked. */
    return;
  }

  if (!isEmpty(c_diverify_queue[q]))
  {
    jam();

    /* Take one queued request and confirm it with the current GCI. */
    ApiConnectRecord localApiConnect;
    dequeue(c_diverify_queue[q], localApiConnect);
    /* A queued request can never refer to a GCI beyond the current one. */
    ndbrequire(localApiConnect.apiGci <= m_micro_gcp.m_current_gci);
    signal->theData[0] = localApiConnect.senderData;
    signal->theData[1] = (Uint32)(m_micro_gcp.m_current_gci >> 32);
    signal->theData[2] = (Uint32)(m_micro_gcp.m_current_gci & 0xFFFFFFFF);
    signal->theData[3] = 0;
    sendSignal(c_diverify_queue[q].m_ref, GSN_DIVERIFYCONF, signal, 4, JBB);
  }
  else if (aContinueB == true)
  {
    jam();
    /**
     * Make sure that we don't miss any pending transactions
     *   (transactions that are added to list by other thread
     *    while we execute this code)
     */
    Uint32 blocks[] = { DBTC, 0 };
    Callback c = { safe_cast(&Dbdih::emptyverificbuffer_check), q };
    synchronize_threads_for_blocks(signal, blocks, c);
    return;
  }

  if (aContinueB == true)
  {
    jam();
    //-----------------------------------------------------------------------
    // This emptying happened as part of a take-out process by continueb signals
    // This ensures that we will empty the queue eventually. We will also empty
    // one item every time we insert one item to ensure that the list doesn't
    // grow when it is not blocked.
    //-----------------------------------------------------------------------
    signal->theData[0] = DihContinueB::ZEMPTY_VERIFY_QUEUE;
    signal->theData[1] = q;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
  }//if

  return;
}//Dbdih::emptyverificbuffer()
20446 
20447 void
emptyverificbuffer_check(Signal * signal,Uint32 q,Uint32 retVal)20448 Dbdih::emptyverificbuffer_check(Signal* signal, Uint32 q, Uint32 retVal)
20449 {
20450   ndbrequire(retVal == 0);
20451   if (!isEmpty(c_diverify_queue[q]))
20452   {
20453     jam();
20454     signal->theData[0] = DihContinueB::ZEMPTY_VERIFY_QUEUE;
20455     signal->theData[1] = q;
20456     sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
20457   }
20458   else
20459   {
20460     /**
20461      * Done with emptyverificbuffer
20462      */
20463     c_diverify_queue[q].m_empty_done = 1;
20464   }
20465 }
20466 
20467 /*************************************************************************/
20468 /*       FIND THE NODES FROM WHICH WE CAN EXECUTE THE LOG TO RESTORE THE */
20469 /*       DATA NODE IN A SYSTEM RESTART.                                  */
20470 /*************************************************************************/
/**
 * Determine from which node(s) the REDO log should be executed to
 * restore one replica in a system restart, covering [startGci, stopGci].
 *
 * First checks whether the replica's own node can cover the whole
 * interval by itself (the preferred case); otherwise the interval is
 * pieced together from the best log nodes found among the nodes taking
 * part in the system restart, up to MAX_LOG_EXEC segments.
 *
 * @param createReplica  record filled in with log node ids and GCI ranges
 * @param fragPtr        fragment whose replicas are candidate log sources
 * @param startGci       first GCI that must be covered by the log
 * @param stopGci        last GCI that must be covered by the log
 * @return true if log nodes covering the whole interval were found
 */
bool Dbdih::findLogNodes(CreateReplicaRecord* createReplica,
                         FragmentstorePtr fragPtr,
                         Uint32 startGci,
                         Uint32 stopGci,
                         Uint32 stopGci)
{
  ConstPtr<ReplicaRecord> flnReplicaPtr;
  flnReplicaPtr.i = createReplica->replicaRec;
  c_replicaRecordPool.getPtr(flnReplicaPtr);
  /* --------------------------------------------------------------------- */
  /*       WE START BY CHECKING IF THE DATA NODE CAN HANDLE THE LOG ALL BY */
  /*       ITSELF. THIS IS THE DESIRED BEHAVIOUR. IF THIS IS NOT POSSIBLE  */
  /*       THEN WE SEARCH FOR THE BEST POSSIBLE NODES AMONG THE NODES THAT */
  /*       ARE PART OF THIS SYSTEM RESTART.                                */
  /*       THIS CAN ONLY BE HANDLED BY THE LAST CRASHED REPLICA.           */
  /*       The condition is that the replica was created before or at the  */
  /*       time of the starting gci, in addition it must have been alive   */
  /*       at the time of the stopping gci. This is checked by two         */
  /*       conditions, the first checks replicaLastGci and the second      */
  /*       checks that it is also smaller than the last gci the node was   */
  /*       involved in. This is necessary to check since createGci is set  */
  /*       Last + 1 and sometimes startGci = stopGci + 1 and in that case  */
  /*       it could happen that replicaLastGci is set to -1 with CreateGci */
  /*       set to LastGci + 1.                                             */
  /* --------------------------------------------------------------------- */
  arrGuard(flnReplicaPtr.p->noCrashedReplicas, MAX_CRASHED_REPLICAS);
  const Uint32 noCrashed = flnReplicaPtr.p->noCrashedReplicas;

  /* Error inserts 7073/7074 force the multi-node search path below. */
  if (!(ERROR_INSERTED(7073) || ERROR_INSERTED(7074))&&
      (startGci >= flnReplicaPtr.p->createGci[noCrashed]) &&
      (stopGci <= flnReplicaPtr.p->replicaLastGci[noCrashed]) &&
      (stopGci <= SYSFILE->lastCompletedGCI[flnReplicaPtr.p->procNode])) {
    jam();
    /* --------------------------------------------------------------------- */
    /*       WE FOUND ALL THE LOG RECORDS NEEDED IN THE DATA NODE. WE WILL   */
    /*       USE THOSE.                                                      */
    /* --------------------------------------------------------------------- */
    createReplica->noLogNodes = 1;
    createReplica->logStartGci[0] = startGci;
    createReplica->logStopGci[0] = stopGci;
    createReplica->logNodeId[0] = flnReplicaPtr.p->procNode;
    return true;
  }//if
  /* Piece the interval together, segment by segment. */
  Uint32 logNode = 0;
  do {
    Uint32 fblStopGci;
    jam();
    if(!findBestLogNode(createReplica,
                        fragPtr,
                        startGci,
                        stopGci,
                        logNode,
                        fblStopGci)){
      jam();
      return false;
    }

    logNode++;
    if (fblStopGci >= stopGci) {
      jam();
      createReplica->noLogNodes = logNode;
      return true;
    }//if
    /* Continue the search from just after the covered prefix. */
    startGci = fblStopGci + 1;
    if (logNode >= MAX_LOG_EXEC)
    {
      jam();
      break;
    }//if
  } while (1);
  /* --------------------------------------------------------------------- */
  /*       IT WAS NOT POSSIBLE TO RESTORE THE REPLICA. THIS CAN EITHER BE  */
  /*       BECAUSE OF LACKING NODES OR BECAUSE OF A REALLY SERIOUS PROBLEM.*/
  /* --------------------------------------------------------------------- */
  return false;
}//Dbdih::findLogNodes()
20546 
20547 /*************************************************************************/
20548 /*       FIND THE BEST POSSIBLE LOG NODE TO EXECUTE THE LOG AS SPECIFIED */
20549 /*       BY THE INPUT PARAMETERS. WE SCAN THROUGH ALL ALIVE REPLICAS.    */
20550 /*       THIS MEANS STORED, OLD_STORED                                   */
20551 /*************************************************************************/
20552 bool
findBestLogNode(CreateReplicaRecord * createReplica,FragmentstorePtr fragPtr,Uint32 startGci,Uint32 stopGci,Uint32 logNode,Uint32 & fblStopGci)20553 Dbdih::findBestLogNode(CreateReplicaRecord* createReplica,
20554 		       FragmentstorePtr fragPtr,
20555 		       Uint32 startGci,
20556 		       Uint32 stopGci,
20557 		       Uint32 logNode,
20558 		       Uint32& fblStopGci)
20559 {
20560   ConstPtr<ReplicaRecord> fblFoundReplicaPtr;
20561   ConstPtr<ReplicaRecord> fblReplicaPtr;
20562 
20563   /* --------------------------------------------------------------------- */
20564   /*       WE START WITH ZERO AS FOUND TO ENSURE THAT FIRST HIT WILL BE    */
20565   /*       BETTER.                                                         */
20566   /* --------------------------------------------------------------------- */
20567   fblStopGci = 0;
20568   fblReplicaPtr.i = fragPtr.p->storedReplicas;
20569   while (fblReplicaPtr.i != RNIL) {
20570     jam();
20571     c_replicaRecordPool.getPtr(fblReplicaPtr);
20572     if (m_sr_nodes.get(fblReplicaPtr.p->procNode))
20573     {
20574       jam();
20575       Uint32 fliStopGci = findLogInterval(fblReplicaPtr, startGci);
20576       if (fliStopGci > fblStopGci)
20577       {
20578         jam();
20579         fblStopGci = fliStopGci;
20580         fblFoundReplicaPtr = fblReplicaPtr;
20581       }//if
20582     }//if
20583     fblReplicaPtr.i = fblReplicaPtr.p->nextPool;
20584   }//while
20585   fblReplicaPtr.i = fragPtr.p->oldStoredReplicas;
20586   while (fblReplicaPtr.i != RNIL) {
20587     jam();
20588     c_replicaRecordPool.getPtr(fblReplicaPtr);
20589     if (m_sr_nodes.get(fblReplicaPtr.p->procNode))
20590     {
20591       jam();
20592       Uint32 fliStopGci = findLogInterval(fblReplicaPtr, startGci);
20593       if (fliStopGci > fblStopGci)
20594       {
20595         jam();
20596         fblStopGci = fliStopGci;
20597         fblFoundReplicaPtr = fblReplicaPtr;
20598       }//if
20599     }//if
20600     fblReplicaPtr.i = fblReplicaPtr.p->nextPool;
20601   }//while
20602   if (fblStopGci != 0) {
20603     jam();
20604     ndbrequire(logNode < MAX_LOG_EXEC);
20605     createReplica->logNodeId[logNode] = fblFoundReplicaPtr.p->procNode;
20606     createReplica->logStartGci[logNode] = startGci;
20607     if (fblStopGci >= stopGci) {
20608       jam();
20609       createReplica->logStopGci[logNode] = stopGci;
20610     } else {
20611       jam();
20612       createReplica->logStopGci[logNode] = fblStopGci;
20613     }//if
20614   }//if
20615 
20616   return fblStopGci != 0;
20617 }//Dbdih::findBestLogNode()
20618 
findLogInterval(ConstPtr<ReplicaRecord> replicaPtr,Uint32 startGci)20619 Uint32 Dbdih::findLogInterval(ConstPtr<ReplicaRecord> replicaPtr,
20620 			      Uint32 startGci)
20621 {
20622   ndbrequire(replicaPtr.p->noCrashedReplicas <= MAX_CRASHED_REPLICAS);
20623   Uint32 loopLimit = replicaPtr.p->noCrashedReplicas + 1;
20624   for (Uint32 i = 0; i < loopLimit; i++) {
20625     jam();
20626     if (replicaPtr.p->createGci[i] <= startGci) {
20627       if (replicaPtr.p->replicaLastGci[i] >= startGci) {
20628         jam();
20629         return replicaPtr.p->replicaLastGci[i];
20630       }//if
20631     }//if
20632   }//for
20633   return 0;
20634 }//Dbdih::findLogInterval()
20635 
20636 /*************************************************************************/
20637 /*                                                                       */
20638 /*       MODULE: FIND THE MINIMUM GCI THAT THIS NODE HAS LOG RECORDS FOR.*/
20639 /*************************************************************************/
/**
 * Compute, for one replica, the GCI from which REDO log records must
 * be kept (keepGci) and the newest GCI the latest valid LCP can
 * restore up to (oldestRestorableGci).  Only the latest valid LCP is
 * considered for cutting the REDO log.  As a side effect, LCPs that
 * are about to be overwritten are invalidated.
 *
 * @param fmgReplicaPtr        replica whose LCPs are examined
 * @param keepGci            [out] min GCI whose log must be kept
 *                                 (Uint32(-1) when no LCP qualifies)
 * @param oldestRestorableGci [out] maxGciStarted of the latest valid
 *                                 LCP (0 when none)
 */
void Dbdih::findMinGci(ReplicaRecordPtr fmgReplicaPtr,
                       Uint32& keepGci,
                       Uint32& oldestRestorableGci)
{
  keepGci = (Uint32)-1;
  oldestRestorableGci = 0;

  Uint32 maxLcpId = 0;              // LcpId of latest valid LCP
  Uint32 maxLcpNo = MAX_LCP_STORED; // Index of latest valid LCP
  for (Uint32 i = 0; i < MAX_LCP_STORED; i++)
  {
    jam();
    if (fmgReplicaPtr.p->lcpStatus[i] == ZVALID)
    {
      if ((fmgReplicaPtr.p->lcpId[i] + MAX_LCP_STORED) <= SYSFILE->latestLCP_ID)
      {
        jam();
        /*-----------------------------------------------------------------*/
        // Invalidate an LCP whose id lags latestLCP_ID by MAX_LCP_STORED
        // or more: its slot is about to be reused by a new checkpoint.
        // NOTE(review): an older comment here spoke of comparing with
        // "lcpId + 1", while the code compares lcpId + MAX_LCP_STORED —
        // confirm the intended invalidation window.
        /*-----------------------------------------------------------------*/
        fmgReplicaPtr.p->lcpStatus[i] = ZINVALID;
      }
      else if (fmgReplicaPtr.p->lcpId[i] > maxLcpId)
      {
        jam();
        // Track the valid LCP with the highest id.
        maxLcpId = fmgReplicaPtr.p->lcpId[i];
        maxLcpNo = i;
      }
    }
  }

  if (maxLcpNo < MAX_LCP_STORED)
  {
    /**
     * Only consider latest LCP (wrt to how to cut REDO)
     */
    jam();
    keepGci = fmgReplicaPtr.p->maxGciCompleted[maxLcpNo];
    oldestRestorableGci = fmgReplicaPtr.p->maxGciStarted[maxLcpNo];
  }

  if (oldestRestorableGci == 0 && keepGci == Uint32(-1))
  {
    jam();
    // No valid LCP at all: if the replica has existed since its
    // initial GCI, log must be kept from its creation.
    if (fmgReplicaPtr.p->createGci[0] == fmgReplicaPtr.p->initialGci)
    {
      keepGci = fmgReplicaPtr.p->createGci[0];
      // XXX Jonas
      //oldestRestorableGci = fmgReplicaPtr.p->createGci[0];
    }
  }
  else
  {
    ndbassert(oldestRestorableGci <= c_newest_restorable_gci);
  }
  return;
}//Dbdih::findMinGci()
20699 
findStartGci(ConstPtr<ReplicaRecord> replicaPtr,Uint32 stopGci,Uint32 & startGci,Uint32 & lcpNo)20700 bool Dbdih::findStartGci(ConstPtr<ReplicaRecord> replicaPtr,
20701                          Uint32 stopGci,
20702                          Uint32& startGci,
20703                          Uint32& lcpNo)
20704 {
20705   Uint32 cnt = 0;
20706   Uint32 tmp[MAX_LCP_STORED];
20707   for (Uint32 i = 0; i<MAX_LCP_STORED; i++)
20708   {
20709     jam();
20710     if (replicaPtr.p->lcpStatus[i] == ZVALID &&
20711         replicaPtr.p->maxGciStarted[i] <= stopGci)
20712     {
20713       /**
20714        * In order to use LCP
20715        *   we must be able to run REDO atleast up until maxGciStarted
20716        *   which is that highest GCI that
20717        */
20718       jam();
20719       tmp[cnt] = i;
20720       cnt++;
20721     }
20722   }
20723 
20724   if (cnt)
20725   {
20726     jam();
20727     /**
20728      * We found atleast one...get the highest
20729      */
20730     lcpNo = tmp[0];
20731     Uint32 lcpId = replicaPtr.p->lcpId[lcpNo];
20732     for (Uint32 i = 1; i<cnt; i++)
20733     {
20734       jam();
20735       if (replicaPtr.p->lcpId[tmp[i]] > lcpId)
20736       {
20737         jam();
20738         lcpNo = tmp[i];
20739         lcpId = replicaPtr.p->lcpId[lcpNo];
20740       }
20741     }
20742     startGci = replicaPtr.p->maxGciCompleted[lcpNo] + 1;
20743     return true;
20744   }
20745 
20746   /* --------------------------------------------------------------------- */
20747   /*       NO VALID LOCAL CHECKPOINT WAS AVAILABLE. WE WILL ADD THE        */
20748   /*       FRAGMENT. THUS THE NEXT LCP MUST BE SET TO ZERO.                */
20749   /*       WE MUST EXECUTE THE LOG FROM THE INITIAL GLOBAL CHECKPOINT WHEN */
20750   /*       THE TABLE WAS CREATED.                                          */
20751   /* --------------------------------------------------------------------- */
20752   startGci = replicaPtr.p->initialGci;
20753   ndbrequire(replicaPtr.p->nextLcp == 0);
20754   return false;
20755 }//Dbdih::findStartGci()
20756 
20757 /**
20758  * Compute max time it can take to "resolve" cascading node-failures
20759  *   given hb-interval, arbit timeout and #db-nodes.
20760  */
20761 Uint32
compute_max_failure_time()20762 Dbdih::compute_max_failure_time()
20763 {
20764   jam();
20765   Uint32 no_of_live_db_nodes = 0;
20766 
20767   // Count the number of live data nodes.
20768   NodeRecordPtr nodePtr(NULL, cfirstAliveNode);
20769   while (nodePtr.i != RNIL)
20770   {
20771     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
20772 
20773     ndbassert(nodePtr.p->nodeStatus == NodeRecord::ALIVE);
20774 
20775     no_of_live_db_nodes++;
20776     nodePtr.i = nodePtr.p->nextNode;
20777   }
20778 
20779   const ndb_mgm_configuration_iterator* cfgIter =
20780     m_ctx.m_config.getOwnConfigIterator();
20781 
20782   Uint32 hbDBDB = 5000;
20783   ndb_mgm_get_int_parameter(cfgIter, CFG_DB_HEARTBEAT_INTERVAL, &hbDBDB);
20784 
20785   Uint32 arbit_timeout = 7500;
20786   ndb_mgm_get_int_parameter(cfgIter, CFG_DB_ARBIT_TIMEOUT, &arbit_timeout);
20787 
20788   /*
20789     A node is presumed dead if it is silent for four missed heartbeats,
20790     meaning that the worst case is five heartbeat intervals.
20791   */
20792   const Uint32 heartbeat_fail_time = hbDBDB * 5;
20793 
20794   /*
20795     The worst case failure scenario works as follows:
20796 
20797     1) All data nodes are running.
20798 
20799     2) One in each node group fail. Detecting this takes:
20800     no_of_node_groups * heartbeat_fail_time
20801 
20802     3) Arbitration is started, as the failed nodes could have formed an
20803     independent cluster. Arbitration make take up to arbit_timeout to
20804     complete.
20805 
20806     4) Just before arbitration completes, all remaining nodes except
20807     for the master fail. The remain node *could* have shut itself down
20808     as soon as the first of these failures are detected, but as it
20809     waits for outstanding PREP_FAILCONF messages before checking of
20810     the cluster is viable, it does not do so until all the failures
20811     have been detected. Detecting these failures thus takes:
20812     (no_of_nodes - no_of_node_groups - 1) * heartbeat_fail_time
20813 
20814     Combining these figure we get a total failure time of:
20815     (no_of_nodes - 1) * heartbeat_fail_time + arbit_timeout
20816 
20817     (For NoOfReplicas>2 there could be cases of nodes failing sequentially
20818     that would require more than one round of arbitration. These have not
20819     been considered here.)
20820   */
20821 
20822   return (MAX(no_of_live_db_nodes, 1) - 1) * heartbeat_fail_time
20823     + arbit_timeout;
20824 }
20825 
20826 /*
20827   Calculate timeouts for detecting GCP stops. These must be set such that
20828   node failures are not falsely interpreted as GCP stops.
20829 */
void Dbdih::setGCPStopTimeouts()
{

  const ndb_mgm_configuration_iterator* cfgIter =
    m_ctx.m_config.getOwnConfigIterator();

  /* Worst-case time to resolve cascading node failures; added to both
     lag limits so node failures are not mistaken for GCP stops. */
  const Uint32 max_failure_time = compute_max_failure_time();

  // Set time-between epochs timeout
  Uint32 micro_GCP_timeout = 4000;
  ndb_mgm_get_int_parameter(cfgIter, CFG_DB_MICRO_GCP_TIMEOUT,
                            &micro_GCP_timeout);

  /*
    Set minimum value for time-between global checkpoint timeout.
    By default, this is 2 minutes.
  */
  Uint32 gcp_timeout = 120000;
  ndb_mgm_get_int_parameter(cfgIter, CFG_DB_GCP_TIMEOUT, &gcp_timeout);

  // Remember old limits so changes can be logged below.
  const Uint32 old_micro_GCP_max_lag = m_gcp_monitor.m_micro_gcp.m_max_lag_ms;
  const Uint32 old_GCP_save_max_lag = m_gcp_monitor.m_gcp_save.m_max_lag_ms;

  if (micro_GCP_timeout != 0)
  {
    jam();
    if (ERROR_INSERTED(7145))
    {
      /*
        We drop these lower limits in certain tests, to verify that the
        calculated value for max_failure_time is sufficient.
       */
      ndbout << "Dbdih::setGCPStopTimeouts() setting minimal GCP timout values"
             << " for test purposes."  << endl;
      micro_GCP_timeout = 0;
      gcp_timeout = 0;
    }

    m_gcp_monitor.m_micro_gcp.m_max_lag_ms =
      m_micro_gcp.m_master.m_time_between_gcp + micro_GCP_timeout
      + max_failure_time;

    m_gcp_monitor.m_gcp_save.m_max_lag_ms =
      m_gcp_save.m_master.m_time_between_gcp +
      // Ensure that GCP-commit times out before GCP-save if both stops.
      MAX(gcp_timeout, micro_GCP_timeout) +
      max_failure_time;
  }
  else
  {
    jam();
    // TimeBetweenEpochsTimeout == 0 disables GCP stop detection.
    m_gcp_monitor.m_gcp_save.m_max_lag_ms = 0;
    m_gcp_monitor.m_micro_gcp.m_max_lag_ms = 0;
  }

  // If timeouts have changed, log it.
  if (old_micro_GCP_max_lag != m_gcp_monitor.m_micro_gcp.m_max_lag_ms ||
      old_GCP_save_max_lag != m_gcp_monitor.m_gcp_save.m_max_lag_ms)
  {
    if (m_gcp_monitor.m_micro_gcp.m_max_lag_ms > 0)
    {
      jam();
      if (isMaster())
      {
        jam();
        // Log to mgmd.
        infoEvent("GCP Monitor: Computed max GCP_COMMIT lag to %u seconds",
                  m_gcp_monitor.m_micro_gcp.m_max_lag_ms / 1000);
        infoEvent("GCP Monitor: Computed max GCP_SAVE lag to %u seconds",
                  m_gcp_monitor.m_gcp_save.m_max_lag_ms / 1000);
      }
      // Log locally.
      g_eventLogger->info("GCP Monitor: Computed max GCP_COMMIT lag to %u"
                          " seconds",
                          m_gcp_monitor.m_micro_gcp.m_max_lag_ms / 1000);
      g_eventLogger->info("GCP Monitor: Computed max GCP_SAVE lag to %u"
                          " seconds",
                          m_gcp_monitor.m_gcp_save.m_max_lag_ms / 1000);
    }
    else
    {
      jam();
      if (isMaster())
      {
        jam();
        infoEvent("GCP Monitor: unlimited lags allowed");
      }
      g_eventLogger->info("GCP Monitor: unlimited lags allowed");
    }
  }
} // setGCPStopTimeouts()
20921 
/**
 * Initialise common DIH state at block start-up.
 *
 * Resets master/LCP/GCP bookkeeping to idle defaults, clears the
 * in-memory sysfile image, and reads the configuration parameters that
 * govern LCP/GCP timing, replica count and take-over parallelism.
 */
void Dbdih::initCommonData()
{
  c_blockCommit = false;
  c_blockCommitNo = 0;
  cfailurenr = 1;
  cMinTcFailNo = 0; /* 0 as TC inits to 0 */
  cfirstAliveNode = RNIL;
  cfirstDeadNode = RNIL;
  cgckptflag = false;
  cgcpOrderBlocked = 0;

  c_lcpMasterTakeOverState.set(LMTOS_IDLE, __LINE__);

  // Local checkpoint state starts out idle.
  c_lcpState.clcpDelay = 0;
  c_lcpState.lcpStart = ZIDLE;
  c_lcpState.lcpStopGcp = 0;
  c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
  c_lcpState.currentFragment.tableId = 0;
  c_lcpState.currentFragment.fragmentId = 0;
  c_lcpState.noOfLcpFragRepOutstanding = 0;
  c_lcpState.keepGci = 0;
  c_lcpState.oldestRestorableGci = 0;
  c_lcpState.ctcCounter = 0;
  c_lcpState.ctimer = 0;
  c_lcpState.immediateLcpStart = false;
  c_lcpState.m_MASTER_LCPREQ_Received = false;
  c_lcpState.m_lastLCP_COMPLETE_REP_ref = 0;
  // Master bookkeeping: no master known yet.
  cmasterdihref = 0;
  cmasterNodeId = 0;
  cmasterState = MASTER_IDLE;
  cmasterTakeOverNode = 0;
  cnoOfActiveTables = 0;
  cnoOfNodeGroups = 0;
  c_nextNodeGroup = 0;
  cnoReplicas = 0;
  con_lineNodes = 0;
  creceivedfrag = 0;
  crestartGci = 0;
  crestartInfoFile[0] = RNIL;
  crestartInfoFile[1] = RNIL;
  cstartPhase = 0;
  cstarttype = (Uint32)-1;
  csystemnodes = 0;
  c_newest_restorable_gci = 0;
  cwaitLcpSr = false;
  c_nodeStartMaster.blockGcp = 0;

  nodeResetStart(0);
  c_nodeStartMaster.wait = ZFALSE;

  // Clear the in-memory sysfile image.
  memset(&sysfileData[0], 0, sizeof(sysfileData));

  const ndb_mgm_configuration_iterator * p =
    m_ctx.m_config.getOwnConfigIterator();
  ndbrequire(p != 0);

  // Default LCP delay; may be overridden by CFG_DB_LCP_INTERVAL below.
  c_lcpState.clcpDelay = 20;

  /**
   * Get the configuration value for how many parallel fragment copy scans we
   * are going to do in parallel when we are requested to handle a node
   * recovery. If 0 set it to default value.
   */
  c_max_takeover_copy_threads = 0;
  ndb_mgm_get_int_parameter(p,
                            CFG_DB_PARALLEL_COPY_THREADS,
                            &c_max_takeover_copy_threads);
  if (c_max_takeover_copy_threads == 0)
  {
    jam();
    c_max_takeover_copy_threads = ZTAKE_OVER_THREADS;
  }

  // Clamp the configured LCP delay to the supported maximum of 31.
  ndb_mgm_get_int_parameter(p, CFG_DB_LCP_INTERVAL, &c_lcpState.clcpDelay);
  c_lcpState.clcpDelay = c_lcpState.clcpDelay > 31 ? 31 : c_lcpState.clcpDelay;

  //ndb_mgm_get_int_parameter(p, CFG_DB_MIN_HOT_SPARES, &cminHotSpareNodes);

  cnoReplicas = 1;
  ndb_mgm_get_int_parameter(p, CFG_DB_NO_REPLICAS, &cnoReplicas);
  if (cnoReplicas > MAX_REPLICAS)
  {
    progError(__LINE__, NDBD_EXIT_INVALID_CONFIG,
	      "Only up to four replicas are supported. Check NoOfReplicas.");
  }

  // Reset GCP protocol state; start times become valid on first GCP.
  bzero(&m_gcp_save, sizeof(m_gcp_save));
  bzero(&m_micro_gcp, sizeof(m_micro_gcp));
  NdbTick_Invalidate(&m_gcp_save.m_master.m_start_time);
  NdbTick_Invalidate(&m_micro_gcp.m_master.m_start_time);
  {
    { // Set time-between global checkpoint
      Uint32 tmp = 2000;
      ndb_mgm_get_int_parameter(p, CFG_DB_GCP_INTERVAL, &tmp);
      // Clamp to [10, 60000] ms.
      tmp = tmp > 60000 ? 60000 : (tmp < 10 ? 10 : tmp);
      m_gcp_save.m_master.m_time_between_gcp = tmp;
    }

    Uint32 tmp = 0;
    if (ndb_mgm_get_int_parameter(p, CFG_DB_MICRO_GCP_INTERVAL, &tmp) == 0 &&
        tmp)
    {
      /**
       * Set time-between epochs
       */
      // Epoch interval may never exceed the GCP interval, min 10 ms.
      if (tmp > m_gcp_save.m_master.m_time_between_gcp)
        tmp = m_gcp_save.m_master.m_time_between_gcp;
      if (tmp < 10)
        tmp = 10;
      m_micro_gcp.m_master.m_time_between_gcp = tmp;
    }

    // These will be set when nodes reach state 'started'.
    m_gcp_monitor.m_micro_gcp.m_max_lag_ms = 0;
    m_gcp_monitor.m_gcp_save.m_max_lag_ms = 0;
  }
}//Dbdih::initCommonData()
21039 
initFragstore(FragmentstorePtr fragPtr)21040 void Dbdih::initFragstore(FragmentstorePtr fragPtr)
21041 {
21042   fragPtr.p->storedReplicas = RNIL;
21043   fragPtr.p->oldStoredReplicas = RNIL;
21044 
21045   fragPtr.p->noStoredReplicas = 0;
21046   fragPtr.p->noOldStoredReplicas = 0;
21047   fragPtr.p->fragReplicas = 0;
21048   fragPtr.p->preferredPrimary = 0;
21049 
21050   for (Uint32 i = 0; i < MAX_REPLICAS; i++)
21051     fragPtr.p->activeNodes[i] = 0;
21052 
21053   fragPtr.p->noLcpReplicas = 0;
21054   fragPtr.p->distributionKey = 0;
21055 }//Dbdih::initFragstore()
21056 
21057 /*************************************************************************/
21058 /*                                                                       */
21059 /*       MODULE: INIT_RESTART_INFO                                       */
21060 /*       DESCRIPTION: INITIATE RESTART INFO VARIABLE AND VARIABLES FOR   */
21061 /*                    GLOBAL CHECKPOINTS.                                */
21062 /*************************************************************************/
initRestartInfo(Signal * signal)21063 void Dbdih::initRestartInfo(Signal* signal)
21064 {
21065   Uint32 i;
21066   for (i = 0; i < MAX_NDB_NODES; i++) {
21067     SYSFILE->lastCompletedGCI[i] = 0;
21068   }//for
21069   NodeRecordPtr nodePtr;
21070   nodePtr.i = cfirstAliveNode;
21071   do {
21072     jam();
21073     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
21074     SYSFILE->lastCompletedGCI[nodePtr.i] = 1;
21075     /* FIRST GCP = 1 ALREADY SET BY LQH */
21076     nodePtr.i = nodePtr.p->nextNode;
21077   } while (nodePtr.i != RNIL);
21078 
21079   Uint32 startGci = 1;
21080 #ifndef NDEBUG
21081 #ifdef NDB_USE_GET_ENV
21082   {
21083     char envBuf[256];
21084     const char* v = NdbEnv_GetEnv("NDB_START_GCI",
21085                                   envBuf,
21086                                   256);
21087     if (v && *v != 0)
21088     {
21089       startGci = my_strtoull(v, NULL, 0);
21090 
21091       ndbout_c("DbDih : Using value of %u from NDB_START_GCI",
21092                startGci);
21093     }
21094   }
21095 #endif
21096 #endif
21097 
21098   m_micro_gcp.m_old_gci = Uint64(startGci) << 32;
21099   m_micro_gcp.m_current_gci = Uint64(startGci + 1) << 32;
21100   crestartGci = startGci;
21101   c_newest_restorable_gci = startGci;
21102 
21103   SYSFILE->keepGCI             = startGci;
21104   SYSFILE->oldestRestorableGCI = startGci;
21105   SYSFILE->newestRestorableGCI = startGci;
21106   SYSFILE->systemRestartBits   = 0;
21107   for (i = 0; i < NdbNodeBitmask::Size; i++) {
21108     SYSFILE->lcpActive[0]        = 0;
21109   }//for
21110   for (i = 0; i < Sysfile::TAKE_OVER_SIZE; i++) {
21111     SYSFILE->takeOver[i] = 0;
21112   }//for
21113   Sysfile::setInitialStartOngoing(SYSFILE->systemRestartBits);
21114   srand((unsigned int)time(0));
21115   globalData.m_restart_seq = SYSFILE->m_restart_seq = 0;
21116 
21117   if (m_micro_gcp.m_enabled == false &&
21118       m_micro_gcp.m_master.m_time_between_gcp)
21119   {
21120     /**
21121      * Micro GCP is disabled...but configured...
21122      */
21123     jam();
21124     m_micro_gcp.m_enabled = true;
21125     UpgradeProtocolOrd * ord = (UpgradeProtocolOrd*)signal->getDataPtrSend();
21126     ord->type = UpgradeProtocolOrd::UPO_ENABLE_MICRO_GCP;
21127     EXECUTE_DIRECT(QMGR,GSN_UPGRADE_PROTOCOL_ORD,signal,signal->getLength());
21128   }
21129 }//Dbdih::initRestartInfo()
21130 
21131 /*--------------------------------------------------------------------*/
21132 /*       NODE GROUP BITS ARE INITIALISED BEFORE THIS.                 */
21133 /*       NODE ACTIVE BITS ARE INITIALISED BEFORE THIS.                */
21134 /*--------------------------------------------------------------------*/
21135 /*************************************************************************/
21136 /*                                                                       */
21137 /*       MODULE: INIT_RESTORABLE_GCI_FILES                               */
21138 /*       DESCRIPTION: THE SUBROUTINE SETS UP THE FILES THAT REFERS TO THE*/
21139 /*       FILES THAT KEEP THE VARIABLE CRESTART_INFO                      */
21140 /*************************************************************************/
void Dbdih::initRestorableGciFiles()
{
  Uint32 tirgTmp;
  FileRecordPtr filePtr;
  /* Seize the first of two redundant copies of the restart info file. */
  seizeFile(filePtr);
  filePtr.p->tabRef = RNIL;
  filePtr.p->fileType = FileRecord::GCP_FILE;
  filePtr.p->reqStatus = FileRecord::IDLE;
  filePtr.p->fileStatus = FileRecord::CLOSED;
  crestartInfoFile[0] = filePtr.i;
  filePtr.p->fileName[0] = (Uint32)-1;  /* T DIRECTORY NOT USED  */
  filePtr.p->fileName[1] = (Uint32)-1;  /* F DIRECTORY NOT USED  */
  filePtr.p->fileName[2] = (Uint32)-1;  /* S PART IGNORED        */
  /* Pack version / suffix / directory / file number into one word,
   * one byte each, most significant byte first. */
  tirgTmp = 1;  /* FILE NAME VERSION 1   */
  tirgTmp = (tirgTmp << 8) + 6; /* .SYSFILE              */
  tirgTmp = (tirgTmp << 8) + 1; /* D1 DIRECTORY          */
  tirgTmp = (tirgTmp << 8) + 0; /* P0 FILE NAME          */
  filePtr.p->fileName[3] = tirgTmp;
  /* --------------------------------------------------------------------- */
  /*       THE NAME BECOMES /D1/DBDICT/P0.SYSFILE                          */
  /* --------------------------------------------------------------------- */
  /* Seize the second copy, identical except it lives in directory D2, so */
  /* a corrupt write of one copy cannot lose the restart information.     */
  seizeFile(filePtr);
  filePtr.p->tabRef = RNIL;
  filePtr.p->fileType = FileRecord::GCP_FILE;
  filePtr.p->reqStatus = FileRecord::IDLE;
  filePtr.p->fileStatus = FileRecord::CLOSED;
  crestartInfoFile[1] = filePtr.i;
  filePtr.p->fileName[0] = (Uint32)-1;  /* T DIRECTORY NOT USED  */
  filePtr.p->fileName[1] = (Uint32)-1;  /* F DIRECTORY NOT USED  */
  filePtr.p->fileName[2] = (Uint32)-1;  /* S PART IGNORED        */
  tirgTmp = 1;  /* FILE NAME VERSION 1   */
  tirgTmp = (tirgTmp << 8) + 6; /* .SYSFILE              */
  tirgTmp = (tirgTmp << 8) + 2; /* D2 DIRECTORY          */
  tirgTmp = (tirgTmp << 8) + 0; /* P0 FILE NAME          */
  filePtr.p->fileName[3] = tirgTmp;
  /* --------------------------------------------------------------------- */
  /*       THE NAME BECOMES /D2/DBDICT/P0.SYSFILE                          */
  /* --------------------------------------------------------------------- */
}//Dbdih::initRestorableGciFiles()
21180 
initTable(TabRecordPtr tabPtr)21181 void Dbdih::initTable(TabRecordPtr tabPtr)
21182 {
21183   new (tabPtr.p) TabRecord();
21184   tabPtr.p->noOfFragChunks = 0;
21185   tabPtr.p->method = TabRecord::NOTDEFINED;
21186   tabPtr.p->tabStatus = TabRecord::TS_IDLE;
21187   tabPtr.p->noOfWords = 0;
21188   tabPtr.p->noPages = 0;
21189   tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED;
21190   tabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
21191   tabPtr.p->tabUpdateState = TabRecord::US_IDLE;
21192   tabPtr.p->noOfBackups = 0;
21193   tabPtr.p->kvalue = 0;
21194   tabPtr.p->hashpointer = (Uint32)-1;
21195   tabPtr.p->mask = 0;
21196   tabPtr.p->tabStorage = TabRecord::ST_NORMAL;
21197   tabPtr.p->tabErrorCode = 0;
21198   tabPtr.p->schemaVersion = (Uint32)-1;
21199   tabPtr.p->tabRemoveNode = RNIL;
21200   tabPtr.p->totalfragments = (Uint32)-1;
21201   tabPtr.p->connectrec = RNIL;
21202   tabPtr.p->tabFile[0] = RNIL;
21203   tabPtr.p->tabFile[1] = RNIL;
21204   tabPtr.p->m_dropTab.tabUserRef = 0;
21205   tabPtr.p->m_dropTab.tabUserPtr = RNIL;
21206   Uint32 i;
21207   for (i = 0; i < NDB_ARRAY_SIZE(tabPtr.p->startFid); i++) {
21208     tabPtr.p->startFid[i] = RNIL;
21209   }//for
21210   for (i = 0; i < NDB_ARRAY_SIZE(tabPtr.p->pageRef); i++) {
21211     tabPtr.p->pageRef[i] = RNIL;
21212   }//for
21213   tabPtr.p->tableType = DictTabInfo::UndefTableType;
21214   tabPtr.p->schemaTransId = 0;
21215 }//Dbdih::initTable()
21216 
/*************************************************************************/
/*                                                                       */
/*       MODULE: INIT_TABLE_FILES                                        */
/*       DESCRIPTION: THE SUBROUTINE SETS UP THE FILE RECORDS THAT REFER */
/*       TO THE FILES THAT KEEP THE TABLE FRAGMENTATION DESCRIPTION.     */
/*************************************************************************/
void Dbdih::initTableFile(TabRecordPtr tabPtr)
{
  Uint32 titfTmp;
  FileRecordPtr filePtr;
  /* Seize the first of two redundant fragmentation-description files. */
  seizeFile(filePtr);
  filePtr.p->tabRef = tabPtr.i;
  filePtr.p->fileType = FileRecord::TABLE_FILE;
  filePtr.p->reqStatus = FileRecord::IDLE;
  filePtr.p->fileStatus = FileRecord::CLOSED;
  tabPtr.p->tabFile[0] = filePtr.i;
  filePtr.p->fileName[0] = (Uint32)-1;  /* T DIRECTORY NOT USED  */
  filePtr.p->fileName[1] = (Uint32)-1;  /* F DIRECTORY NOT USED  */
  filePtr.p->fileName[2] = tabPtr.i;    /* Stid FILE NAME        */
  /* Pack version / suffix / directory / file number into one word,
   * one byte each, most significant byte first. */
  titfTmp = 1;  /* FILE NAME VERSION 1   */
  titfTmp = (titfTmp << 8) + 3; /* .FRAGLIST             */
  titfTmp = (titfTmp << 8) + 1; /* D1 DIRECTORY          */
  titfTmp = (titfTmp << 8) + 255;       /* P PART IGNORED        */
  filePtr.p->fileName[3] = titfTmp;
  /* --------------------------------------------------------------------- */
  /*       THE NAME BECOMES /D1/DBDICT/Stid.FRAGLIST                       */
  /* --------------------------------------------------------------------- */
  /* Seize the second copy; identical except it lives in directory D2.    */
  seizeFile(filePtr);
  filePtr.p->tabRef = tabPtr.i;
  filePtr.p->fileType = FileRecord::TABLE_FILE;
  filePtr.p->reqStatus = FileRecord::IDLE;
  filePtr.p->fileStatus = FileRecord::CLOSED;
  tabPtr.p->tabFile[1] = filePtr.i;
  filePtr.p->fileName[0] = (Uint32)-1;  /* T DIRECTORY NOT USED  */
  filePtr.p->fileName[1] = (Uint32)-1;  /* F DIRECTORY NOT USED  */
  filePtr.p->fileName[2] = tabPtr.i;    /* Stid FILE NAME        */
  titfTmp = 1;  /* FILE NAME VERSION 1   */
  titfTmp = (titfTmp << 8) + 3; /* .FRAGLIST             */
  titfTmp = (titfTmp << 8) + 2; /* D2 DIRECTORY          */
  titfTmp = (titfTmp << 8) + 255;       /* P PART IGNORED        */
  filePtr.p->fileName[3] = titfTmp;
  /* --------------------------------------------------------------------- */
  /*       THE NAME BECOMES /D2/DBDICT/Stid.FRAGLIST                       */
  /* --------------------------------------------------------------------- */
}//Dbdih::initTableFile()
21262 
/**
 * Initialise DIH's record pools in a sequence of steps, one step per
 * invocation.  After each step a CONTINUEB (ZINITIALISE_RECORDS) signal is
 * sent to ourselves with stepNo + 1, giving a real-time break between the
 * potentially long initialisation loops.  When the final step (9) is
 * reached, READ_CONFIG_CONF is sent to retRef/retData and the sequence ends.
 *
 * @param signal  signal object, reused for the CONTINUEB / conf send
 * @param stepNo  which initialisation step to execute (0..9)
 * @param retRef  block reference to send READ_CONFIG_CONF to when done
 * @param retData senderData to return in READ_CONFIG_CONF
 */
void Dbdih::initialiseRecordsLab(Signal* signal,
				 Uint32 stepNo, Uint32 retRef, Uint32 retData)
{
  switch (stepNo) {
  case 0:
    jam();
    initCommonData();
    break;
  case 1:{
    ApiConnectRecordPtr apiConnectptr;
    jam();
    /* Default: all verification requests go to the local DBTC proxy.   */
    c_diverify_queue[0].m_ref = calcTcBlockRef(getOwnNodeId());
    for (Uint32 i = 0; i < c_diverify_queue_cnt; i++)
    {
      if (globalData.ndbMtTcThreads > 0)
      {
        /* Multi-threaded TC: one queue per TC instance (instances are
         * 1-based, hence i + 1). */
        c_diverify_queue[i].m_ref = numberToRef(DBTC, i + 1, 0);
      }
      /******** INTIALIZING API CONNECT RECORDS ********/
      for (apiConnectptr.i = 0;
           apiConnectptr.i < capiConnectFileSize; apiConnectptr.i++)
      {
        refresh_watch_dog();
        ptrAss(apiConnectptr, c_diverify_queue[i].apiConnectRecord);
        apiConnectptr.p->senderData = RNIL;
        apiConnectptr.p->apiGci = ~(Uint64)0;
      }//for
    }
    jam();
    break;
  }
  case 2:{
    ConnectRecordPtr connectPtr;
    jam();
    /****** CONNECT ******/
    /* Link every connect record into one free list headed by cfirstconnect. */
    for (connectPtr.i = 0; connectPtr.i < cconnectFileSize; connectPtr.i++) {
      refresh_watch_dog();
      ptrAss(connectPtr, connectRecord);
      connectPtr.p->userpointer = RNIL;
      connectPtr.p->userblockref = ZNIL;
      connectPtr.p->connectState = ConnectRecord::FREE;
      connectPtr.p->table = RNIL;
      connectPtr.p->nextPool = connectPtr.i + 1;
      bzero(connectPtr.p->nodes, sizeof(connectPtr.p->nodes));
    }//for
    /* Terminate the free list at the last record. */
    connectPtr.i = cconnectFileSize - 1;
    ptrAss(connectPtr, connectRecord);
    connectPtr.p->nextPool = RNIL;
    cfirstconnect = 0;
    break;
  }
  case 3:
    {
      FileRecordPtr filePtr;
      jam();
      /******** INTIALIZING FILE RECORDS ********/
      /* Link every file record into one free list headed by cfirstfreeFile. */
      for (filePtr.i = 0; filePtr.i < cfileFileSize; filePtr.i++) {
	ptrAss(filePtr, fileRecord);
	filePtr.p->nextFile = filePtr.i + 1;
	filePtr.p->fileStatus = FileRecord::CLOSED;
	filePtr.p->reqStatus = FileRecord::IDLE;
      }//for
      filePtr.i = cfileFileSize - 1;
      ptrAss(filePtr, fileRecord);
      filePtr.p->nextFile = RNIL;
      cfirstfreeFile = 0;
      /* The two restart-info (sysfile) file records are seized here,
       * immediately after the free list is built. */
      initRestorableGciFiles();
      break;
    }
  case 4:
    jam();
    initialiseFragstore();
    break;
  case 5:
    {
      jam();
      /******* NODE GROUP RECORD ******/
      /******* NODE RECORD       ******/
      NodeGroupRecordPtr loopNGPtr;
      for (loopNGPtr.i = 0; loopNGPtr.i < MAX_NDB_NODES; loopNGPtr.i++) {
	ptrAss(loopNGPtr, nodeGroupRecord);
        loopNGPtr.p->nodesInGroup[0] = RNIL;
        loopNGPtr.p->nodesInGroup[1] = RNIL;
        loopNGPtr.p->nodesInGroup[2] = RNIL;
        loopNGPtr.p->nodesInGroup[3] = RNIL;
        loopNGPtr.p->nextReplicaNode = 0;
        loopNGPtr.p->nodeCount = 0;
        loopNGPtr.p->activeTakeOver = false;
        loopNGPtr.p->nodegroupIndex = RNIL;
        loopNGPtr.p->m_ref_count = 0;
        loopNGPtr.p->m_next_log_part = 0;
      }//for
      break;
    }
  case 6:
    {
      PageRecordPtr pagePtr;
      jam();
      /******* PAGE RECORD ******/
      /* Link every page record into one free list headed by cfirstfreepage. */
      for (pagePtr.i = 0; pagePtr.i < cpageFileSize; pagePtr.i++) {
        refresh_watch_dog();
	ptrAss(pagePtr, pageRecord);
	pagePtr.p->nextfreepage = pagePtr.i + 1;
      }//for
      pagePtr.i = cpageFileSize - 1;
      ptrAss(pagePtr, pageRecord);
      pagePtr.p->nextfreepage = RNIL;
      cfirstfreepage = 0;
      break;
    }
  case 7:
    {
      ReplicaRecordPtr initReplicaPtr;
      jam();
      /******* REPLICA RECORD ******/
      /* Seize each replica record by id, set its defaults, and release it
       * again so the pool ends up with all records initialised and free. */
      for (initReplicaPtr.i = 0; initReplicaPtr.i < creplicaFileSize;
	   initReplicaPtr.i++) {
        refresh_watch_dog();
        c_replicaRecordPool.seizeId(initReplicaPtr, initReplicaPtr.i);
	initReplicaPtr.p->lcpIdStarted = 0;
	initReplicaPtr.p->lcpOngoingFlag = false;
        c_replicaRecordPool.releaseLast(initReplicaPtr);
      }//for
      cnoFreeReplicaRec = creplicaFileSize;
      break;
    }
  case 8:
    {
      TabRecordPtr loopTabptr;
      jam();
      /********* TAB-DESCRIPTOR ********/
      for (loopTabptr.i = 0; loopTabptr.i < ctabFileSize; loopTabptr.i++) {
	ptrAss(loopTabptr, tabRecord);
        refresh_watch_dog();
	initTable(loopTabptr);
      }//for
      break;
    }
  case 9:
    {
      jam();
      /* All steps done: report back to the requester and stop the chain. */
      ReadConfigConf * conf = (ReadConfigConf*)signal->getDataPtrSend();
      conf->senderRef = reference();
      conf->senderData = retData;
      sendSignal(retRef, GSN_READ_CONFIG_CONF, signal,
		 ReadConfigConf::SignalLength, JBB);
      return;
      break;
    }
  default:
    ndbrequire(false);
    break;
  }//switch
  jam();
  /* ---------------------------------------------------------------------- */
  /* SEND REAL-TIME BREAK DURING INIT OF VARIABLES DURING SYSTEM RESTART.   */
  /* ---------------------------------------------------------------------- */
  signal->theData[0] = DihContinueB::ZINITIALISE_RECORDS;
  signal->theData[1] = stepNo + 1;
  signal->theData[2] = retRef;
  signal->theData[3] = retData;
  sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
}//Dbdih::initialiseRecordsLab()
21426 
21427 /*************************************************************************/
21428 /*       INSERT THE NODE INTO THE LINKED LIST OF NODES INVOLVED ALL      */
21429 /*       DISTRIBUTED PROTOCOLS (EXCEPT GCP PROTOCOL THAT USES THE DIH    */
21430 /*       LINKED LIST INSTEAD).                                           */
21431 /*************************************************************************/
insertAlive(NodeRecordPtr newNodePtr)21432 void Dbdih::insertAlive(NodeRecordPtr newNodePtr)
21433 {
21434   NodeRecordPtr nodePtr;
21435 
21436   nodePtr.i = cfirstAliveNode;
21437   if (nodePtr.i == RNIL) {
21438     jam();
21439     cfirstAliveNode = newNodePtr.i;
21440   } else {
21441     do {
21442       ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
21443       if (nodePtr.p->nextNode == RNIL) {
21444         jam();
21445         nodePtr.p->nextNode = newNodePtr.i;
21446         break;
21447       } else {
21448         jam();
21449         nodePtr.i = nodePtr.p->nextNode;
21450       }//if
21451     } while (1);
21452   }//if
21453   newNodePtr.p->nextNode = RNIL;
21454 }//Dbdih::insertAlive()
21455 
insertBackup(FragmentstorePtr fragPtr,Uint32 nodeId)21456 void Dbdih::insertBackup(FragmentstorePtr fragPtr, Uint32 nodeId)
21457 {
21458   for (Uint32 i = fragPtr.p->fragReplicas; i > 1; i--) {
21459     jam();
21460     ndbrequire(i < MAX_REPLICAS && i > 0);
21461     fragPtr.p->activeNodes[i] = fragPtr.p->activeNodes[i - 1];
21462   }//for
21463   fragPtr.p->activeNodes[1] = nodeId;
21464   fragPtr.p->fragReplicas++;
21465 }//Dbdih::insertBackup()
21466 
insertDeadNode(NodeRecordPtr newNodePtr)21467 void Dbdih::insertDeadNode(NodeRecordPtr newNodePtr)
21468 {
21469   NodeRecordPtr nodePtr;
21470 
21471   nodePtr.i = cfirstDeadNode;
21472   if (nodePtr.i == RNIL) {
21473     jam();
21474     cfirstDeadNode = newNodePtr.i;
21475   } else {
21476     do {
21477       jam();
21478       ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
21479       if (nodePtr.p->nextNode == RNIL) {
21480         jam();
21481         nodePtr.p->nextNode = newNodePtr.i;
21482         break;
21483       } else {
21484         jam();
21485         nodePtr.i = nodePtr.p->nextNode;
21486       }//if
21487     } while (1);
21488   }//if
21489   newNodePtr.p->nextNode = RNIL;
21490 }//Dbdih::insertDeadNode()
21491 
linkOldStoredReplica(FragmentstorePtr fragPtr,ReplicaRecordPtr replicatePtr)21492 void Dbdih::linkOldStoredReplica(FragmentstorePtr fragPtr,
21493                                  ReplicaRecordPtr replicatePtr)
21494 {
21495   ReplicaRecordPtr losReplicaPtr;
21496 
21497   replicatePtr.p->nextPool = RNIL;
21498   fragPtr.p->noOldStoredReplicas++;
21499   losReplicaPtr.i = fragPtr.p->oldStoredReplicas;
21500   if (losReplicaPtr.i == RNIL) {
21501     jam();
21502     fragPtr.p->oldStoredReplicas = replicatePtr.i;
21503     return;
21504   }//if
21505   c_replicaRecordPool.getPtr(losReplicaPtr);
21506   while (losReplicaPtr.p->nextPool != RNIL) {
21507     jam();
21508     losReplicaPtr.i = losReplicaPtr.p->nextPool;
21509     c_replicaRecordPool.getPtr(losReplicaPtr);
21510   }//if
21511   losReplicaPtr.p->nextPool = replicatePtr.i;
21512 }//Dbdih::linkOldStoredReplica()
21513 
linkStoredReplica(FragmentstorePtr fragPtr,ReplicaRecordPtr replicatePtr)21514 void Dbdih::linkStoredReplica(FragmentstorePtr fragPtr,
21515                               ReplicaRecordPtr replicatePtr)
21516 {
21517   ReplicaRecordPtr lsrReplicaPtr;
21518 
21519   fragPtr.p->noStoredReplicas++;
21520   replicatePtr.p->nextPool = RNIL;
21521   lsrReplicaPtr.i = fragPtr.p->storedReplicas;
21522   if (fragPtr.p->storedReplicas == RNIL) {
21523     jam();
21524     fragPtr.p->storedReplicas = replicatePtr.i;
21525     return;
21526   }//if
21527   c_replicaRecordPool.getPtr(lsrReplicaPtr);
21528   while (lsrReplicaPtr.p->nextPool != RNIL) {
21529     jam();
21530     lsrReplicaPtr.i = lsrReplicaPtr.p->nextPool;
21531     c_replicaRecordPool.getPtr(lsrReplicaPtr);
21532   }//if
21533   lsrReplicaPtr.p->nextPool = replicatePtr.i;
21534 }//Dbdih::linkStoredReplica()
21535 
21536 /*************************************************************************/
21537 /*        MAKE NODE GROUPS BASED ON THE LIST OF NODES RECEIVED FROM CNTR */
21538 /*************************************************************************/
21539 void
add_nodegroup(NodeGroupRecordPtr NGPtr)21540 Dbdih::add_nodegroup(NodeGroupRecordPtr NGPtr)
21541 {
21542   if (NGPtr.p->nodegroupIndex == RNIL)
21543   {
21544     jam();
21545     NGPtr.p->nodegroupIndex = cnoOfNodeGroups;
21546     c_node_groups[cnoOfNodeGroups++] = NGPtr.i;
21547   }
21548 }
21549 
21550 void
inc_ng_refcount(Uint32 i)21551 Dbdih::inc_ng_refcount(Uint32 i)
21552 {
21553   NodeGroupRecordPtr NGPtr;
21554   NGPtr.i = i;
21555   ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
21556   NGPtr.p->m_ref_count++;
21557 }
21558 
21559 void
dec_ng_refcount(Uint32 i)21560 Dbdih::dec_ng_refcount(Uint32 i)
21561 {
21562   NodeGroupRecordPtr NGPtr;
21563   NGPtr.i = i;
21564   ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
21565   ndbrequire(NGPtr.p->m_ref_count);
21566   NGPtr.p->m_ref_count--;
21567 }
21568 
/**
 * Assign all nodes in nodeArray (RNIL-terminated) to node groups and
 * record the result both in the node/node-group records and in SYSFILE.
 *
 * Nodes with an explicitly configured group keep it; nodes configured
 * with NDB_NO_NODEGROUP get group ZNIL (no group); remaining nodes are
 * filled into the first groups that still have fewer than cnoReplicas
 * members.  Finally the routine verifies that every non-empty group has
 * exactly cnoReplicas members and at least one alive node.
 */
void Dbdih::makeNodeGroups(Uint32 nodeArray[])
{
  NodeGroupRecordPtr NGPtr;
  NodeRecordPtr mngNodeptr;
  Uint32 j;

  /**-----------------------------------------------------------------------
   * ASSIGN ALL ACTIVE NODES INTO NODE GROUPS. HOT SPARE NODES ARE ASSIGNED
   * TO NODE GROUP ZNIL
   *-----------------------------------------------------------------------*/
  cnoOfNodeGroups = 0;
  /* Pass 1: honour explicitly configured node groups. */
  for (Uint32 i = 0; nodeArray[i] != RNIL; i++)
  {
    jam();
    mngNodeptr.i = nodeArray[i];
    ptrCheckGuard(mngNodeptr, MAX_NDB_NODES, nodeRecord);
    if (mngNodeptr.p->nodeGroup == NDB_NO_NODEGROUP)
    {
      jam();
      /* Node configured without a group: mark as group-less (ZNIL). */
      mngNodeptr.p->nodeGroup = ZNIL;
      g_eventLogger->info("setting nodeGroup = ZNIL for node %u",
                          mngNodeptr.i);
    }
    else if (mngNodeptr.p->nodeGroup != RNIL)
    {
      jam();
      /* Explicit group from configuration: add the node to it. */
      NGPtr.i = mngNodeptr.p->nodeGroup;
      ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
      arrGuard(NGPtr.p->nodeCount, MAX_REPLICAS);
      NGPtr.p->nodesInGroup[NGPtr.p->nodeCount++] = mngNodeptr.i;

      add_nodegroup(NGPtr);
    }
    /* nodeGroup == RNIL: left for pass 2 below. */
  }
  /* Find the first group that still has room (< cnoReplicas members). */
  NGPtr.i = 0;
  for (; NGPtr.i < MAX_NDB_NODES; NGPtr.i++)
  {
    jam();
    ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
    if (NGPtr.p->nodeCount < cnoReplicas)
      break;
  }

  /* Pass 2: distribute the remaining (group-less, RNIL) nodes, filling
   * each group up to cnoReplicas before moving to the next free group. */
  for (Uint32 i = 0; nodeArray[i] != RNIL; i++)
  {
    jam();
    mngNodeptr.i = nodeArray[i];
    ptrCheckGuard(mngNodeptr, MAX_NDB_NODES, nodeRecord);
    if (mngNodeptr.p->nodeGroup == RNIL)
    {
      mngNodeptr.p->nodeGroup = NGPtr.i;
      NGPtr.p->nodesInGroup[NGPtr.p->nodeCount++] = mngNodeptr.i;

      add_nodegroup(NGPtr);

      if (NGPtr.p->nodeCount == cnoReplicas)
      {
        jam();
        /* Current group full: advance to the next group with room. */
        for (; NGPtr.i < MAX_NDB_NODES; NGPtr.i++)
        {
          jam();
          ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
          if (NGPtr.p->nodeCount < cnoReplicas)
            break;
        }
      }
    }
  }

  /* Sanity check: every registered group is either empty or has exactly
   * cnoReplicas members.  NOTE(review): maxNG is computed here but not
   * used afterwards in this routine. */
  Uint32 maxNG = 0;
  for (Uint32 i = 0; i<cnoOfNodeGroups; i++)
  {
    jam();
    NGPtr.i = c_node_groups[i];
    ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
    if (NGPtr.p->nodeCount == 0)
    {
      jam();
    }
    else if (NGPtr.p->nodeCount != cnoReplicas)
    {
      ndbrequire(false);
    }
    else
    {
      if (NGPtr.i > maxNG)
      {
        maxNG = NGPtr.i;
      }
    }
  }

  ndbrequire(csystemnodes < MAX_NDB_NODES);

  /**
   * Init sysfile
   */
  /* Start from a clean slate: every node undefined and group-less. */
  for(Uint32 i = 0; i < MAX_NDB_NODES; i++)
  {
    jam();
    Sysfile::setNodeGroup(i, SYSFILE->nodeGroups, NO_NODE_GROUP_ID);
    Sysfile::setNodeStatus(i, SYSFILE->nodeStatus,Sysfile::NS_NotDefined);
  }

  /* Record the assignment for each node in SYSFILE together with its
   * active status (derived from whether the node is currently alive). */
  for (Uint32 i = 0; nodeArray[i] != RNIL; i++)
  {
    jam();
    Uint32 nodeId = mngNodeptr.i = nodeArray[i];
    ptrCheckGuard(mngNodeptr, MAX_NDB_NODES, nodeRecord);

    if (mngNodeptr.p->nodeGroup != ZNIL)
    {
      jam();
      Sysfile::setNodeGroup(nodeId, SYSFILE->nodeGroups,
                            mngNodeptr.p->nodeGroup);

      if (mngNodeptr.p->nodeStatus == NodeRecord::ALIVE)
      {
        jam();
        mngNodeptr.p->activeStatus = Sysfile::NS_Active;
      }
      else
      {
        jam();
        mngNodeptr.p->activeStatus = Sysfile::NS_NotActive_NotTakenOver;
      }
    }
    else
    {
      jam();
      /* Group-less node: configured but takes no part in any group. */
      Sysfile::setNodeGroup(mngNodeptr.i, SYSFILE->nodeGroups,
                            NO_NODE_GROUP_ID);
      mngNodeptr.p->activeStatus = Sysfile::NS_Configured;
    }
    Sysfile::setNodeStatus(nodeId, SYSFILE->nodeStatus,
                           mngNodeptr.p->activeStatus);
  }

  /* An initial start requires at least one alive node per node group;
   * otherwise the data of that group cannot be created. */
  for (Uint32 i = 0; i<cnoOfNodeGroups; i++)
  {
    jam();
    bool alive = false;
    NodeGroupRecordPtr NGPtr;
    NGPtr.i = c_node_groups[i];
    ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
    for (j = 0; j<NGPtr.p->nodeCount; j++)
    {
      jam();
      mngNodeptr.i = NGPtr.p->nodesInGroup[j];
      ptrCheckGuard(mngNodeptr, MAX_NDB_NODES, nodeRecord);
      if (checkNodeAlive(NGPtr.p->nodesInGroup[j]))
      {
	alive = true;
	break;
      }
    }

    if (!alive)
    {
      char buf[255];
      BaseString::snprintf
        (buf, sizeof(buf),
         "Illegal initial start, no alive node in nodegroup %u", i);
      progError(__LINE__,
                NDBD_EXIT_INSUFFICENT_NODES,
                buf);
    }
  }
}//Dbdih::makeNodeGroups()
21738 
21739 /**
21740  * On node failure QMGR asks DIH about node groups.  This is
21741  * a direct signal (function call in same process).  Input is
21742  * bitmask of surviving nodes.  The routine is not concerned
21743  * about node count.  Reply is one of:
21744  * 1) win - we can survive, and nobody else can
21745  * 2) lose - we cannot survive
21746  * 3) partition - we can survive but there could be others
21747  */
void Dbdih::execCHECKNODEGROUPSREQ(Signal* signal)
{
  jamNoBlock();
  CheckNodeGroups* sd = (CheckNodeGroups*)&signal->theData[0];

  /* Direct means executed as a function call from another block in the
   * same process; then no CONF signal is sent back. */
  bool direct = (sd->requestType & CheckNodeGroups::Direct);
  bool ok = false;
  switch(sd->requestType & ~CheckNodeGroups::Direct){
  case CheckNodeGroups::ArbitCheck:{
    /* Decide whether the surviving-node set in sd->mask can form a
     * viable cluster:
     *   Lose         - some node group has no survivor: cannot survive.
     *   Win          - every node group is complete: nobody else can
     *                  form a competing cluster.
     *   Partitioning - we can survive, but so could another partition;
     *                  arbitration is needed. */
    ok = true;
    jamNoBlock();
    unsigned missall = 0;   /* #groups with no surviving member   */
    unsigned haveall = 0;   /* #groups with all members surviving */
    for (Uint32 i = 0; i < cnoOfNodeGroups; i++) {
      jamNoBlock();
      NodeGroupRecordPtr ngPtr;
      ngPtr.i = c_node_groups[i];
      ptrAss(ngPtr, nodeGroupRecord);
      Uint32 count = 0;
      for (Uint32 j = 0; j < ngPtr.p->nodeCount; j++) {
	jamNoBlock();
	Uint32 nodeId = ngPtr.p->nodesInGroup[j];
	if (sd->mask.get(nodeId)) {
	  jamNoBlock();
	  count++;
	}//if
      }//for
      if (count == 0) {
	jamNoBlock();
	missall++;
      }//if
      if (count == ngPtr.p->nodeCount) {
	haveall++;
      }//if
    }//for

    if (missall) {
      jamNoBlock();
      sd->output = CheckNodeGroups::Lose;
    } else if (haveall) {
      jamNoBlock();
      sd->output = CheckNodeGroups::Win;
    } else {
      jamNoBlock();
      sd->output = CheckNodeGroups::Partitioning;
    }//if
  }
    break;
  case CheckNodeGroups::GetNodeGroup:{
    /* Return our own node group id (RNIL if we have none). */
    ok = true;
    Uint32 ng = Sysfile::getNodeGroup(getOwnNodeId(), SYSFILE->nodeGroups);
    if (ng == NO_NODE_GROUP_ID)
      ng = RNIL;
    sd->output = ng;
    break;
  }
  case CheckNodeGroups::GetNodeGroupMembers: {
    /* Return the node group of sd->nodeId and, in sd->mask, the set of
     * nodes belonging to that group. */
    ok = true;
    Uint32 ng = Sysfile::getNodeGroup(sd->nodeId, SYSFILE->nodeGroups);
    if (ng == NO_NODE_GROUP_ID)
      ng = RNIL;

    sd->output = ng;
    sd->mask.clear();

    NodeGroupRecordPtr ngPtr;
    ngPtr.i = ng;
    if (ngPtr.i != RNIL)
    {
      jamNoBlock();
      ptrAss(ngPtr, nodeGroupRecord);
      for (Uint32 j = 0; j < ngPtr.p->nodeCount; j++) {
        jamNoBlock();
        sd->mask.set(ngPtr.p->nodesInGroup[j]);
      }
    }
    break;
  }
  case CheckNodeGroups::GetDefaultFragments:
    /* Default fragment count over existing plus sd->extraNodeGroups
     * additional node groups. */
    jamNoBlock();
    ok = true;
    sd->output = (cnoOfNodeGroups + sd->extraNodeGroups)
      * getFragmentsPerNode() * cnoReplicas;
    break;
  }
  ndbrequire(ok);

  if (!direct)
    sendSignal(sd->blockRef, GSN_CHECKNODEGROUPSCONF, signal,
	       CheckNodeGroups::SignalLength, JBB);
}//Dbdih::execCHECKNODEGROUPSREQ()
21839 
/**
 * Initialise the node records for all nodes in nodeArray and partition
 * them into the alive-node and dead-node lists based on the inactive-node
 * bitmask carried in READ_NODES_CONF.
 */
void
Dbdih::makePrnList(ReadNodesConf * readNodes, Uint32 nodeArray[])
{
  cfirstAliveNode = RNIL;
  ndbrequire(con_lineNodes > 0);
  ndbrequire(csystemnodes < MAX_NDB_NODES);
  for (Uint32 i = 0; i < csystemnodes; i++) {
    NodeRecordPtr nodePtr;
    jam();
    nodePtr.i = nodeArray[i];
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    initNodeRecord(nodePtr);
    if (NdbNodeBitmask::get(readNodes->inactiveNodes, nodePtr.i) == false){
      jam();
      /* Node is active: mark it alive and usable for transactions. */
      nodePtr.p->nodeStatus = NodeRecord::ALIVE;
      nodePtr.p->useInTransactions = true;
      nodePtr.p->copyCompleted = 1;
      nodePtr.p->m_inclDihLcp = true;
      insertAlive(nodePtr);
    } else {
      jam();
      nodePtr.p->nodeStatus = NodeRecord::DEAD;
      insertDeadNode(nodePtr);
    }//if
  }//for
}//Dbdih::makePrnList()
21866 
21867 /*************************************************************************/
21868 /*       A NEW CRASHED REPLICA IS ADDED BY A NODE FAILURE.               */
21869 /*************************************************************************/
void Dbdih::newCrashedReplica(ReplicaRecordPtr ncrReplicaPtr)
{
  /*----------------------------------------------------------------------*/
  /*       SET THE REPLICA_LAST_GCI OF THE CRASHED REPLICA TO LAST GCI    */
  /*       EXECUTED BY THE FAILED NODE.                                   */
  /*----------------------------------------------------------------------*/
  /*       WE HAVE A NEW CRASHED REPLICA. INITIATE CREATE GCI TO INDICATE */
  /*       THAT THE NEW REPLICA IS NOT STARTED YET AND REPLICA_LAST_GCI IS*/
  /*       SET TO -1 TO INDICATE THAT IT IS NOT DEAD YET.                 */
  /*----------------------------------------------------------------------*/
  Uint32 nodeId = ncrReplicaPtr.p->procNode;
  Uint32 lastGCI = SYSFILE->lastCompletedGCI[nodeId];
  if (ncrReplicaPtr.p->noCrashedReplicas + 1 == MAX_CRASHED_REPLICAS)
  {
    jam();
    /* Interval arrays are about to fill up: drop the oldest crash
     * interval to make room for the new one. */
    packCrashedReplicas(ncrReplicaPtr);
  }

  Uint32 noCrashedReplicas = ncrReplicaPtr.p->noCrashedReplicas;
  arrGuardErr(ncrReplicaPtr.p->noCrashedReplicas + 1, MAX_CRASHED_REPLICAS,
              NDBD_EXIT_MAX_CRASHED_REPLICAS);

  if (noCrashedReplicas > 0 &&
      ncrReplicaPtr.p->replicaLastGci[noCrashedReplicas - 1] == lastGCI)
  {
    jam();
    /**
     * Don't add another redo-interval, that already exist
     *  instead initalize new
     */
    ncrReplicaPtr.p->createGci[ncrReplicaPtr.p->noCrashedReplicas] =
      ZINIT_CREATE_GCI;
    ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] =
      ZINIT_REPLICA_LAST_GCI;
  }
  else if (ncrReplicaPtr.p->createGci[noCrashedReplicas] <= lastGCI)
  {
    jam();
    /* Normal case: close the current interval at lastGCI and open a
     * fresh (not-yet-started, not-yet-dead) interval after it. */
    ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] =
      lastGCI;
    ncrReplicaPtr.p->noCrashedReplicas = ncrReplicaPtr.p->noCrashedReplicas + 1;
    ncrReplicaPtr.p->createGci[ncrReplicaPtr.p->noCrashedReplicas] =
      ZINIT_CREATE_GCI;
    ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] =
      ZINIT_REPLICA_LAST_GCI;
  }
  else
  {
    /**
     * This can happen if createGci is set
     *   (during sendUpdateFragStateReq(COMMIT_STORED))
     *   but SYSFILE->lastCompletedGCI[nodeId] has not been updated
     *   as node has not yet completed it's first LCP, causing it to return
     *   GCP_SAVEREF (which makes SYSFILE->lastCompletedGCI[nodeId] be left
     *   untouched)
     *
     * I.e crash during node-restart
     */
    ncrReplicaPtr.p->createGci[noCrashedReplicas] = ZINIT_CREATE_GCI;
  }

}//Dbdih::newCrashedReplica()
21932 
21933 /*************************************************************************/
21934 /*       AT NODE FAILURE DURING START OF A NEW NODE WE NEED TO RESET A   */
21935 /*       SET OF VARIABLES CONTROLLING THE START AND INDICATING ONGOING   */
21936 /*       START OF A NEW NODE.                                            */
21937 /*************************************************************************/
void Dbdih::nodeResetStart(Signal *signal)
{
  jam();
  /* Remember whether the GCP was actually blocked before we reset state. */
  Uint32 startGCP = c_nodeStartMaster.blockGcp;

  /* Clear all state tracking the (now aborted) start of a new node. */
  c_nodeStartSlave.nodeId = 0;
  c_nodeStartMaster.startNode = RNIL;
  c_nodeStartMaster.failNr = cfailurenr;
  c_nodeStartMaster.activeState = false;
  c_nodeStartMaster.blockGcp = 0;
  c_nodeStartMaster.m_outstandingGsn = 0;

  /* blockGcp == 2 appears to mean the block had taken effect (GCP held);
   * TODO confirm the encoding of blockGcp values.  In that case we must
   * restart the GCP protocol ourselves, which only the master may do. */
  if (startGCP == 2) // effective
  {
    jam();
    ndbrequire(isMaster());
    ndbrequire(m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_IDLE);
    signal->theData[0] = DihContinueB::ZSTART_GCP;
    sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
  }
}//Dbdih::nodeResetStart()
21959 
openFileRw(Signal * signal,FileRecordPtr filePtr)21960 void Dbdih::openFileRw(Signal* signal, FileRecordPtr filePtr)
21961 {
21962   signal->theData[0] = reference();
21963   signal->theData[1] = filePtr.i;
21964   signal->theData[2] = filePtr.p->fileName[0];
21965   signal->theData[3] = filePtr.p->fileName[1];
21966   signal->theData[4] = filePtr.p->fileName[2];
21967   signal->theData[5] = filePtr.p->fileName[3];
21968   signal->theData[6] = FsOpenReq::OM_READWRITE;
21969   sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, 7, JBA);
21970 }//Dbdih::openFileRw()
21971 
openFileRo(Signal * signal,FileRecordPtr filePtr)21972 void Dbdih::openFileRo(Signal* signal, FileRecordPtr filePtr)
21973 {
21974   signal->theData[0] = reference();
21975   signal->theData[1] = filePtr.i;
21976   signal->theData[2] = filePtr.p->fileName[0];
21977   signal->theData[3] = filePtr.p->fileName[1];
21978   signal->theData[4] = filePtr.p->fileName[2];
21979   signal->theData[5] = filePtr.p->fileName[3];
21980   signal->theData[6] = FsOpenReq::OM_READONLY;
21981   sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, 7, JBA);
21982 }//Dbdih::openFileRw()
21983 
21984 /*************************************************************************/
21985 /*       REMOVE A CRASHED REPLICA BY PACKING THE ARRAY OF CREATED GCI AND*/
21986 /*       THE LAST GCI OF THE CRASHED REPLICA.                            */
21987 /*************************************************************************/
void Dbdih::packCrashedReplicas(ReplicaRecordPtr replicaPtr)
{
  /* Drop the OLDEST crash interval (index 0) by shifting every entry of
   * createGci[] / replicaLastGci[] one slot down, then re-initialise the
   * slot just above the new count so the sentinel region stays clean. */
  ndbrequire(replicaPtr.p->noCrashedReplicas > 0);
  ndbrequire(replicaPtr.p->noCrashedReplicas <= MAX_CRASHED_REPLICAS);
  for (Uint32 i = 0; i < replicaPtr.p->noCrashedReplicas; i++) {
    jam();
    replicaPtr.p->createGci[i] = replicaPtr.p->createGci[i + 1];
    replicaPtr.p->replicaLastGci[i] = replicaPtr.p->replicaLastGci[i + 1];
  }//for
  replicaPtr.p->noCrashedReplicas--;
  /* NOTE(review): after the decrement this resets index (old count), not
   * (old count - 1); the latter slot was already overwritten with the
   * sentinel values copied down by the loop above — confirm the array
   * above noCrashedReplicas always holds ZINIT sentinels. */
  replicaPtr.p->createGci[replicaPtr.p->noCrashedReplicas + 1] =
    ZINIT_CREATE_GCI;
  replicaPtr.p->replicaLastGci[replicaPtr.p->noCrashedReplicas + 1] =
    ZINIT_REPLICA_LAST_GCI;
}//Dbdih::packCrashedReplicas()
22003 
22004 void
mergeCrashedReplicas(ReplicaRecordPtr replicaPtr)22005 Dbdih::mergeCrashedReplicas(ReplicaRecordPtr replicaPtr)
22006 {
22007   /**
22008    * merge adjacent redo-intervals
22009    */
22010   for (Uint32 i = replicaPtr.p->noCrashedReplicas; i > 0; i--)
22011   {
22012     jam();
22013     if (replicaPtr.p->createGci[i] == 1 + replicaPtr.p->replicaLastGci[i-1])
22014     {
22015       jam();
22016       replicaPtr.p->replicaLastGci[i-1] = replicaPtr.p->replicaLastGci[i];
22017       replicaPtr.p->createGci[i] = ZINIT_CREATE_GCI;
22018       replicaPtr.p->replicaLastGci[i] = ZINIT_REPLICA_LAST_GCI;
22019       replicaPtr.p->noCrashedReplicas--;
22020     }
22021     else
22022     {
22023       jam();
22024       break;
22025     }
22026   }
22027 }
22028 
prepareReplicas(FragmentstorePtr fragPtr)22029 void Dbdih::prepareReplicas(FragmentstorePtr fragPtr)
22030 {
22031   ReplicaRecordPtr prReplicaPtr;
22032   Uint32 prevReplica = RNIL;
22033 
22034   /* --------------------------------------------------------------------- */
22035   /*       BEGIN BY LINKING ALL REPLICA RECORDS ONTO THE OLD STORED REPLICA*/
22036   /*       LIST.                                                           */
22037   /*       AT A SYSTEM RESTART OBVIOUSLY ALL NODES ARE OLD.                */
22038   /* --------------------------------------------------------------------- */
22039   prReplicaPtr.i = fragPtr.p->storedReplicas;
22040   while (prReplicaPtr.i != RNIL) {
22041     jam();
22042     prevReplica = prReplicaPtr.i;
22043     c_replicaRecordPool.getPtr(prReplicaPtr);
22044     prReplicaPtr.i = prReplicaPtr.p->nextPool;
22045   }//while
22046   /* --------------------------------------------------------------------- */
22047   /*       LIST OF STORED REPLICAS WILL BE EMPTY NOW.                      */
22048   /* --------------------------------------------------------------------- */
22049   if (prevReplica != RNIL) {
22050     prReplicaPtr.i = prevReplica;
22051     c_replicaRecordPool.getPtr(prReplicaPtr);
22052     prReplicaPtr.p->nextPool = fragPtr.p->oldStoredReplicas;
22053     fragPtr.p->oldStoredReplicas = fragPtr.p->storedReplicas;
22054     fragPtr.p->storedReplicas = RNIL;
22055     fragPtr.p->noOldStoredReplicas += fragPtr.p->noStoredReplicas;
22056     fragPtr.p->noStoredReplicas = 0;
22057   }//if
22058 }//Dbdih::prepareReplicas()
22059 
/**
 * Deserialize one fragment header from the table file. Words must be
 * consumed in exactly the order they were written by the corresponding
 * write routine.
 */
void Dbdih::readFragment(RWFragment* rf, FragmentstorePtr fragPtr)
{
  Uint32 TreadFid = readPageWord(rf);
  fragPtr.p->preferredPrimary = readPageWord(rf);
  fragPtr.p->noStoredReplicas = readPageWord(rf);
  fragPtr.p->noOldStoredReplicas = readPageWord(rf);
  Uint32 TdistKey = readPageWord(rf);

  /* Sanity checks: fragment id must match expectation and the
   * distribution key must fit in a byte. */
  ndbrequire(fragPtr.p->noStoredReplicas > 0);
  ndbrequire(TreadFid == rf->fragId);
  ndbrequire(TdistKey < 256);
  fragPtr.p->distributionKey = TdistKey;

  fragPtr.p->m_log_part_id = readPageWord(rf);
  if (!ndbd_128_instances_address(getMinVersion()))
  {
    jam();
    /**
     * Limit log-part to 0-3 as older version didn't handle
     *   getting requests to instances > 4
     *   (in reality 7 i think...but that is useless as log-part dividor anyway)
     */
    fragPtr.p->m_log_part_id %= 4;
  }

  /* Older nodes stored unlimited log part ids in the fragment definition,
   * now we constrain them to a valid range of actual values for this node.
   * Here we ensure that unlimited log part ids fit in the value range for
   * this node.
   */
  ndbrequire(globalData.ndbLogParts <= NDBMT_MAX_WORKER_INSTANCES);

  fragPtr.p->m_log_part_id %= globalData.ndbLogParts;

  ndbrequire(fragPtr.p->m_log_part_id < NDBMT_MAX_WORKER_INSTANCES);

  /* Account this fragment in the node group of its preferred primary. */
  inc_ng_refcount(getNodeGroup(fragPtr.p->preferredPrimary));
}//Dbdih::readFragment()
22098 
/**
 * Return the next 32-bit word of the table file being read, transparently
 * advancing to the next page when the current one is exhausted.
 */
Uint32 Dbdih::readPageWord(RWFragment* rf)
{
  if (rf->wordIndex >= 2048) {
    jam();
    /* 2048 words == one page; the cursor must land exactly on the
     * boundary, never past it. */
    ndbrequire(rf->wordIndex == 2048);
    rf->pageIndex++;
    ndbrequire(rf->pageIndex < NDB_ARRAY_SIZE(rf->rwfTabPtr.p->pageRef));
    rf->rwfPageptr.i = rf->rwfTabPtr.p->pageRef[rf->pageIndex];
    ptrCheckGuard(rf->rwfPageptr, cpageFileSize, pageRecord);
    /* Payload resumes at word 32 on the new page; words 0-31 are
     * presumably a page header -- confirm against the writer side. */
    rf->wordIndex = 32;
  }//if
  Uint32 dataWord = rf->rwfPageptr.p->word[rf->wordIndex];
  rf->wordIndex++;
  return dataWord;
}//Dbdih::readPageWord()
22114 
readReplica(RWFragment * rf,ReplicaRecordPtr readReplicaPtr)22115 void Dbdih::readReplica(RWFragment* rf, ReplicaRecordPtr readReplicaPtr)
22116 {
22117   Uint32 i;
22118   readReplicaPtr.p->procNode = readPageWord(rf);
22119   readReplicaPtr.p->initialGci = readPageWord(rf);
22120   readReplicaPtr.p->noCrashedReplicas = readPageWord(rf);
22121   readReplicaPtr.p->nextLcp = readPageWord(rf);
22122 
22123   /**
22124    * Initialise LCP inclusion data, this is to enable us to be included
22125    * in an LCP during a node restart.
22126    */
22127   readReplicaPtr.p->fragId = rf->fragId;
22128   readReplicaPtr.p->tableId = rf->rwfTabPtr.i;
22129   readReplicaPtr.p->lcpOngoingFlag = false;
22130 
22131   for (i = 0; i < MAX_LCP_STORED; i++) {
22132     readReplicaPtr.p->maxGciCompleted[i] = readPageWord(rf);
22133     readReplicaPtr.p->maxGciStarted[i] = readPageWord(rf);
22134     readReplicaPtr.p->lcpId[i] = readPageWord(rf);
22135     readReplicaPtr.p->lcpStatus[i] = readPageWord(rf);
22136   }//for
22137   const Uint32 noCrashedReplicas = readReplicaPtr.p->noCrashedReplicas;
22138   ndbrequire(noCrashedReplicas < MAX_CRASHED_REPLICAS);
22139   for (i = 0; i < noCrashedReplicas; i++) {
22140     readReplicaPtr.p->createGci[i] = readPageWord(rf);
22141     readReplicaPtr.p->replicaLastGci[i] = readPageWord(rf);
22142   }//for
22143   for(i = noCrashedReplicas; i<MAX_CRASHED_REPLICAS; i++){
22144     readReplicaPtr.p->createGci[i] = readPageWord(rf);
22145     readReplicaPtr.p->replicaLastGci[i] = readPageWord(rf);
22146   }
22147 }//Dbdih::readReplica()
22148 
22149 /**
22150  * This method is useful when we read the table distribution information from
22151  * the master node. In this case with the new PAUSE LCP protocol we need to
22152  * perform the functionality of the initLcpLab while copying the table to
22153  * ensure that we're a full DIH participant in the LCP when the copying of
22154  * the meta data has been completed.
22155  *
22156  * For all other cases the tabLcpStatus is TLS_COMPLETED and thus the method
22157  * will be ignored.
22158  */
void Dbdih::updateLcpInfo(TabRecord *regTabPtr,
                          Fragmentstore *regFragPtr,
                          ReplicaRecord *regReplicaPtr)
{
  /* Only relevant while copying meta data under the PAUSE LCP protocol;
   * for tables whose tabLcpStatus is not TLS_ACTIVE this is a no-op. */
  if (regTabPtr->tabLcpStatus == TabRecord::TLS_ACTIVE)
  {
    jam();
    /* Slot index of the most recently used LCP for this replica. */
    Uint32 lastLcpNo = prevLcpNo(regReplicaPtr->nextLcp);
    if (c_lcp_id_while_copy_meta_data != RNIL &&
        regReplicaPtr->lcpId[lastLcpNo] < c_lcp_id_while_copy_meta_data &&
        c_lcpState.m_participatingLQH.get(regReplicaPtr->procNode))
    {
      /**
       * If the copy table indicating that the table is participating in
       * an LCP, if the fragment replica hasn't performed this LCP yet,
       * and the replica node is participating in the LCP at hand now.
       *
       * This code executes in the starting node after the LCP being
       * paused and we are included into the LCP protocol immediately
       * after copying the meta data. We received the bitmap of
       * participating LCP nodes just before the copying of meta
       * data started.
       */
      jam();
      regReplicaPtr->lcpOngoingFlag = true;
      regFragPtr->noLcpReplicas++;
#if 0
      g_eventLogger->info("LCP Ongoing: TableId: %u, fragId: %u, node: %u"
                          " lastLcpNo: %u, lastLcpId: %u, lcpId: %u",
      regReplicaPtr->tableId,
      regReplicaPtr->fragId,
      regReplicaPtr->procNode,
      lastLcpNo,
      regReplicaPtr->lcpId[lastLcpNo],
      c_lcp_id_while_copy_meta_data);
#endif
    }
  }
}
22198 
/**
 * Read all stored and old-stored replica records of one fragment from the
 * table file and relink them onto the fragment's replica lists.
 */
void Dbdih::readReplicas(RWFragment* rf,
                         TabRecord *regTabPtr,
                         FragmentstorePtr fragPtr)
{
  Uint32 i;
  ReplicaRecordPtr newReplicaPtr;
  Uint32 noStoredReplicas = fragPtr.p->noStoredReplicas;
  Uint32 noOldStoredReplicas = fragPtr.p->noOldStoredReplicas;
  /* ----------------------------------------------------------------------- */
  /*      WE CLEAR THE NUMBER OF STORED REPLICAS SINCE IT WILL BE CALCULATED */
  /*      BY THE LINKING SUBROUTINES.                                        */
  /* ----------------------------------------------------------------------- */
  fragPtr.p->noStoredReplicas = 0;
  fragPtr.p->noOldStoredReplicas = 0;
  fragPtr.p->noLcpReplicas = 0;
  Uint32 replicaIndex = 0;
  ndbrequire(noStoredReplicas + noOldStoredReplicas <= MAX_REPLICAS);
  for (i = 0; i < noStoredReplicas; i++)
  {
    seizeReplicaRec(newReplicaPtr);
    readReplica(rf, newReplicaPtr);
    ndbrequire(replicaIndex < MAX_REPLICAS);
    fragPtr.p->activeNodes[replicaIndex] = newReplicaPtr.p->procNode;
    replicaIndex++;
    linkStoredReplica(fragPtr, newReplicaPtr);
    /* May mark the replica as part of an ongoing LCP (PAUSE LCP
     * handling during node restart); see updateLcpInfo. */
    updateLcpInfo(regTabPtr, fragPtr.p, newReplicaPtr.p);
  }//for
  fragPtr.p->fragReplicas = noStoredReplicas;
  for (i = 0; i < noOldStoredReplicas; i++) {
    jam();
    seizeReplicaRec(newReplicaPtr);
    readReplica(rf, newReplicaPtr);
    linkOldStoredReplica(fragPtr, newReplicaPtr);
  }//for
}//Dbdih::readReplicas()
22234 
/**
 * Issue an FSREADREQ that reads the restart-info page of the given file
 * into the ZVAR_NO_CRESTART_INFO variable. Word layout follows the NDBFS
 * list-of-pairs read format; theData[5..7] give the page-pair description.
 */
void Dbdih::readRestorableGci(Signal* signal, FileRecordPtr filePtr)
{
  signal->theData[0] = filePtr.p->fileRef;
  signal->theData[1] = reference();
  signal->theData[2] = filePtr.i;
  signal->theData[3] = ZLIST_OF_PAIRS;
  signal->theData[4] = ZVAR_NO_CRESTART_INFO;
  signal->theData[5] = 1;
  signal->theData[6] = 0;
  signal->theData[7] = 0;
  sendSignal(NDBFS_REF, GSN_FSREADREQ, signal, 8, JBA);
}//Dbdih::readRestorableGci()
22247 
/**
 * Read all pages of a table description file with a single FSREADREQ.
 * A linear section lists (memory page id, file page number) pairs, one
 * pair per page of the table record.
 */
void Dbdih::readTabfile(Signal* signal, TabRecord* tab, FileRecordPtr filePtr)
{
  signal->theData[0] = filePtr.p->fileRef;
  signal->theData[1] = reference();
  signal->theData[2] = filePtr.i;
  signal->theData[3] = ZLIST_OF_PAIRS;
  signal->theData[4] = ZVAR_NO_WORD;
  signal->theData[5] = tab->noPages;
  Uint32 section[2 * NDB_ARRAY_SIZE(tab->pageRef)];
  for (Uint32 i = 0; i < tab->noPages; i++)
  {
    section[(2 * i) + 0] = tab->pageRef[i]; /* memory page id */
    section[(2 * i) + 1] = i;               /* file page number */
  }
  LinearSectionPtr ptr[3];
  ptr[0].p = section;
  ptr[0].sz = 2 * tab->noPages;
  sendSignal(NDBFS_REF, GSN_FSREADREQ, signal, 6, JBA, ptr, 1);
}//Dbdih::readTabfile()
22267 
releasePage(Uint32 pageIndex)22268 void Dbdih::releasePage(Uint32 pageIndex)
22269 {
22270   PageRecordPtr pagePtr;
22271   pagePtr.i = pageIndex;
22272   ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
22273   pagePtr.p->nextfreepage = cfirstfreepage;
22274   cfirstfreepage = pagePtr.i;
22275 }//Dbdih::releasePage()
22276 
releaseTabPages(Uint32 tableId)22277 void Dbdih::releaseTabPages(Uint32 tableId)
22278 {
22279   TabRecordPtr tabPtr;
22280   tabPtr.i = tableId;
22281   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
22282   ndbrequire(tabPtr.p->noPages <= NDB_ARRAY_SIZE(tabPtr.p->pageRef));
22283   for (Uint32 i = 0; i < tabPtr.p->noPages; i++) {
22284     jam();
22285     releasePage(tabPtr.p->pageRef[i]);
22286   }//for
22287   tabPtr.p->noPages = 0;
22288 }//Dbdih::releaseTabPages()
22289 
22290 /*************************************************************************/
22291 /*       REMOVE NODE FROM SET OF ALIVE NODES.                            */
22292 /*************************************************************************/
removeAlive(NodeRecordPtr removeNodePtr)22293 void Dbdih::removeAlive(NodeRecordPtr removeNodePtr)
22294 {
22295   NodeRecordPtr nodePtr;
22296 
22297   nodePtr.i = cfirstAliveNode;
22298   if (nodePtr.i == removeNodePtr.i) {
22299     jam();
22300     cfirstAliveNode = removeNodePtr.p->nextNode;
22301     return;
22302   }//if
22303   do {
22304     jam();
22305     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
22306     if (nodePtr.p->nextNode == removeNodePtr.i) {
22307       jam();
22308       nodePtr.p->nextNode = removeNodePtr.p->nextNode;
22309       break;
22310     } else {
22311       jam();
22312       nodePtr.i = nodePtr.p->nextNode;
22313     }//if
22314   } while (1);
22315 }//Dbdih::removeAlive()
22316 
22317 /*************************************************************************/
22318 /*       REMOVE NODE FROM SET OF DEAD NODES.                             */
22319 /*************************************************************************/
removeDeadNode(NodeRecordPtr removeNodePtr)22320 void Dbdih::removeDeadNode(NodeRecordPtr removeNodePtr)
22321 {
22322   NodeRecordPtr nodePtr;
22323 
22324   nodePtr.i = cfirstDeadNode;
22325   if (nodePtr.i == removeNodePtr.i) {
22326     jam();
22327     cfirstDeadNode = removeNodePtr.p->nextNode;
22328     return;
22329   }//if
22330   do {
22331     jam();
22332     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
22333     if (nodePtr.p->nextNode == removeNodePtr.i) {
22334       jam();
22335       nodePtr.p->nextNode = removeNodePtr.p->nextNode;
22336       break;
22337     } else {
22338       jam();
22339       nodePtr.i = nodePtr.p->nextNode;
22340     }//if
22341   } while (1);
22342 }//Dbdih::removeDeadNode()
22343 
22344 /*---------------------------------------------------------------*/
22345 /*       REMOVE REPLICAS OF A FAILED NODE FROM LIST OF STORED    */
22346 /*       REPLICAS AND MOVE IT TO THE LIST OF OLD STORED REPLICAS.*/
22347 /*       ALSO UPDATE THE CRASHED REPLICA INFORMATION.            */
22348 /*---------------------------------------------------------------*/
removeNodeFromStored(Uint32 nodeId,FragmentstorePtr fragPtr,ReplicaRecordPtr replicatePtr,bool temporary)22349 void Dbdih::removeNodeFromStored(Uint32 nodeId,
22350                                  FragmentstorePtr fragPtr,
22351                                  ReplicaRecordPtr replicatePtr,
22352 				 bool temporary)
22353 {
22354   if (!temporary)
22355   {
22356     jam();
22357     newCrashedReplica(replicatePtr);
22358   }
22359   else
22360   {
22361     jam();
22362   }
22363   removeStoredReplica(fragPtr, replicatePtr);
22364   linkOldStoredReplica(fragPtr, replicatePtr);
22365   ndbrequire(fragPtr.p->storedReplicas != RNIL);
22366 }//Dbdih::removeNodeFromStored()
22367 
22368 /*************************************************************************/
22369 /*       REMOVE ANY OLD CRASHED REPLICAS THAT ARE NOT RESTORABLE ANY MORE*/
22370 /*************************************************************************/
/**
 * Prune crashed-replica redo intervals that are no longer restorable.
 * (tab and frag identify the fragment the replica belongs to; they are
 * not read inside this routine.)
 */
void Dbdih::removeOldCrashedReplicas(Uint32 tab, Uint32 frag,
                                     ReplicaRecordPtr rocReplicaPtr)
{
  mergeCrashedReplicas(rocReplicaPtr);
  while (rocReplicaPtr.p->noCrashedReplicas > 0) {
    jam();
    /* --------------------------------------------------------------------- */
    /*       ONLY IF THERE IS AT LEAST ONE REPLICA THEN CAN WE REMOVE ANY.   */
    /* --------------------------------------------------------------------- */
    if (rocReplicaPtr.p->replicaLastGci[0] < SYSFILE->oldestRestorableGCI){
      jam();
      /* ------------------------------------------------------------------- */
      /*     THIS CRASHED REPLICA HAS BECOME EXTINCT AND MUST BE REMOVED TO  */
      /*     GIVE SPACE FOR NEW CRASHED REPLICAS.                            */
      /* ------------------------------------------------------------------- */
      packCrashedReplicas(rocReplicaPtr);
    } else {
      break;
    }//if
  }//while

  while (rocReplicaPtr.p->createGci[0] < SYSFILE->keepGCI)
  {
    jam();
    /* --------------------------------------------------------------------- */
    /*       MOVE FORWARD THE CREATE GCI TO A GCI THAT CAN BE USED. WE HAVE  */
    /*       NO CERTAINTY IN FINDING ANY LOG RECORDS FROM OLDER GCI'S.       */
    /* --------------------------------------------------------------------- */
    rocReplicaPtr.p->createGci[0] = SYSFILE->keepGCI;

    if (rocReplicaPtr.p->noCrashedReplicas)
    {
      /**
       * A redo interval whose start now lies beyond its end (e.g. from
       * GCI 78 to GCI 14) is not useful but rather harmful; remove it.
       */
      if (rocReplicaPtr.p->createGci[0] > rocReplicaPtr.p->replicaLastGci[0])
      {
        jam();
        packCrashedReplicas(rocReplicaPtr);
      }
    }
  }
}
22415 
/**
 * Unlink replicatePtr from the fragment's old-stored-replica list.
 * The replica must be present in the list: the walk below has no RNIL
 * guard and relies on finding the record before the list ends.
 */
void Dbdih::removeOldStoredReplica(FragmentstorePtr fragPtr,
                                   ReplicaRecordPtr replicatePtr)
{
  ReplicaRecordPtr rosTmpReplicaPtr;
  ReplicaRecordPtr rosPrevReplicaPtr;

  fragPtr.p->noOldStoredReplicas--;
  if (fragPtr.p->oldStoredReplicas == replicatePtr.i) {
    jam();
    /* Head of the list: just advance the head pointer. */
    fragPtr.p->oldStoredReplicas = replicatePtr.p->nextPool;
  } else {
    /* Walk with a trailing "previous" pointer until the record is found,
     * then bridge previous over the removed record. */
    rosPrevReplicaPtr.i = fragPtr.p->oldStoredReplicas;
    c_replicaRecordPool.getPtr(rosPrevReplicaPtr);
    rosTmpReplicaPtr.i = rosPrevReplicaPtr.p->nextPool;
    while (rosTmpReplicaPtr.i != replicatePtr.i) {
      jam();
      c_replicaRecordPool.getPtr(rosTmpReplicaPtr);
      rosPrevReplicaPtr = rosTmpReplicaPtr;
      rosTmpReplicaPtr.i = rosTmpReplicaPtr.p->nextPool;
    }//if
    rosPrevReplicaPtr.p->nextPool = replicatePtr.p->nextPool;
  }//if
}//Dbdih::removeOldStoredReplica()
22439 
/**
 * Unlink replicatePtr from the fragment's stored-replica list.
 * The replica must be present in the list: the walk below has no RNIL
 * guard and relies on finding the record before the list ends.
 */
void Dbdih::removeStoredReplica(FragmentstorePtr fragPtr,
                                ReplicaRecordPtr replicatePtr)
{
  ReplicaRecordPtr rsrTmpReplicaPtr;
  ReplicaRecordPtr rsrPrevReplicaPtr;

  fragPtr.p->noStoredReplicas--;
  if (fragPtr.p->storedReplicas == replicatePtr.i) {
    jam();
    /* Head of the list: just advance the head pointer. */
    fragPtr.p->storedReplicas = replicatePtr.p->nextPool;
  } else {
    jam();
    /* Walk the list keeping the index (.i) of the previous record; its
     * pointer (.p) is only materialised once, after the loop, when the
     * record to bridge over has been found. */
    rsrPrevReplicaPtr.i = fragPtr.p->storedReplicas;
    rsrTmpReplicaPtr.i = fragPtr.p->storedReplicas;
    c_replicaRecordPool.getPtr(rsrTmpReplicaPtr);
    rsrTmpReplicaPtr.i = rsrTmpReplicaPtr.p->nextPool;
    while (rsrTmpReplicaPtr.i != replicatePtr.i) {
      jam();
      rsrPrevReplicaPtr.i = rsrTmpReplicaPtr.i;
      c_replicaRecordPool.getPtr(rsrTmpReplicaPtr);
      rsrTmpReplicaPtr.i = rsrTmpReplicaPtr.p->nextPool;
    }//while
    c_replicaRecordPool.getPtr(rsrPrevReplicaPtr);
    rsrPrevReplicaPtr.p->nextPool = replicatePtr.p->nextPool;
  }//if
}//Dbdih::removeStoredReplica()
22466 
22467 /*************************************************************************/
22468 /*       REMOVE ALL TOO NEW CRASHED REPLICAS THAT IS IN THIS REPLICA.    */
22469 /*************************************************************************/
/**
 * Discard crashed-replica intervals whose createGci lies after
 * lastCompletedGCI, newest first, stopping at the first interval that is
 * old enough to keep.
 */
void Dbdih::removeTooNewCrashedReplicas(ReplicaRecordPtr rtnReplicaPtr, Uint32 lastCompletedGCI)
{
  while (rtnReplicaPtr.p->noCrashedReplicas > 0) {
    jam();
    /* --------------------------------------------------------------------- */
    /*       REMOVE ALL REPLICAS THAT ONLY LIVED IN A PERIOD THAT HAVE BEEN  */
    /*       REMOVED FROM THE RESTART INFORMATION SINCE THE RESTART FAILED   */
    /*       TOO MANY TIMES.                                                 */
    /* --------------------------------------------------------------------- */
    arrGuard(rtnReplicaPtr.p->noCrashedReplicas - 1, MAX_CRASHED_REPLICAS);
    if (rtnReplicaPtr.p->createGci[rtnReplicaPtr.p->noCrashedReplicas - 1] > lastCompletedGCI)
    {
      jam();
      /* Re-initialise the discarded slot so stale GCIs cannot be reused. */
      rtnReplicaPtr.p->createGci[rtnReplicaPtr.p->noCrashedReplicas - 1] =
	ZINIT_CREATE_GCI;
      rtnReplicaPtr.p->replicaLastGci[rtnReplicaPtr.p->noCrashedReplicas - 1] =
	ZINIT_REPLICA_LAST_GCI;
      rtnReplicaPtr.p->noCrashedReplicas--;
    } else {
      break;
    }//if
  }//while
}//Dbdih::removeTooNewCrashedReplicas()
22493 
22494 /*************************************************************************/
22495 /*                                                                       */
22496 /*       MODULE: SEARCH FOR POSSIBLE REPLICAS THAT CAN HANDLE THE GLOBAL */
22497 /*               CHECKPOINT WITHOUT NEEDING ANY EXTRA LOGGING FACILITIES.*/
22498 /*               A MAXIMUM OF FOUR NODES IS RETRIEVED.                   */
22499 /*************************************************************************/
/**
 * Prepare a create-replica record for a system-restart START_FRAGREQ:
 * pick the LCP to restore from (ZNIL if none is usable) and the set of
 * redo-log intervals to execute. Returns false if no valid combination
 * of log nodes can be found (see findLogNodes).
 */
bool
Dbdih::setup_create_replica(FragmentstorePtr fragPtr,
			    CreateReplicaRecord* createReplicaPtrP,
			    ConstPtr<ReplicaRecord> replicaPtr)
{
  createReplicaPtrP->dataNodeId = replicaPtr.p->procNode;
  createReplicaPtrP->replicaRec = replicaPtr.i;

  /* ----------------------------------------------------------------- */
  /*   WE NEED TO SEARCH FOR A PROPER LOCAL CHECKPOINT TO USE FOR THE  */
  /*   SYSTEM RESTART.                                                 */
  /* ----------------------------------------------------------------- */
  Uint32 startGci;
  Uint32 startLcpNo;
  Uint32 stopGci = SYSFILE->newestRestorableGCI;
  bool result = findStartGci(replicaPtr,
			     stopGci,
			     startGci,
			     startLcpNo);
  if (!result)
  {
    jam();
    /* --------------------------------------------------------------- */
    /* WE COULD NOT FIND ANY LOCAL CHECKPOINT. THE FRAGMENT THUS DO NOT*/
    /* CONTAIN ANY VALID LOCAL CHECKPOINT. IT DOES HOWEVER CONTAIN A   */
    /* VALID FRAGMENT LOG. THUS BY FIRST CREATING THE FRAGMENT AND THEN*/
    /* EXECUTING THE FRAGMENT LOG WE CAN CREATE THE FRAGMENT AS        */
    /* DESIRED. THIS SHOULD ONLY OCCUR AFTER CREATING A FRAGMENT.      */
    /*                                                                 */
    /* TO INDICATE THAT NO LOCAL CHECKPOINT IS TO BE USED WE SET THE   */
    /* LOCAL CHECKPOINT TO ZNIL.                                       */
    /* --------------------------------------------------------------- */
    createReplicaPtrP->lcpNo = ZNIL;
  }
  else
  {
    jam();
    /* --------------------------------------------------------------- */
    /* WE FOUND A PROPER LOCAL CHECKPOINT TO RESTART FROM.             */
    /* SET LOCAL CHECKPOINT ID AND LOCAL CHECKPOINT NUMBER.            */
    /* --------------------------------------------------------------- */
    createReplicaPtrP->lcpNo = startLcpNo;
    arrGuard(startLcpNo, MAX_LCP_STORED);
    createReplicaPtrP->createLcpId = replicaPtr.p->lcpId[startLcpNo];
  }//if


  /* ----------------------------------------------------------------- */
  /*   WE HAVE EITHER FOUND A LOCAL CHECKPOINT OR WE ARE PLANNING TO   */
  /*   EXECUTE THE LOG FROM THE INITIAL CREATION OF THE TABLE. IN BOTH */
  /*   CASES WE NEED TO FIND A SET OF LOGS THAT CAN EXECUTE SUCH THAT  */
  /*   WE RECOVER TO THE SYSTEM RESTART GLOBAL CHECKPOINT.             */
  /* ----------------------------------------------------------------- */
  return findLogNodes(createReplicaPtrP, fragPtr, startGci, stopGci);
}
22555 
/**
 * For every stored replica of the fragment hosted on an alive node with
 * a suitable active status, seize a create-replica record and fill it in
 * via setup_create_replica (used during system restart).
 */
void Dbdih::searchStoredReplicas(FragmentstorePtr fragPtr)
{
  Uint32 nextReplicaPtrI;
  Ptr<ReplicaRecord> replicaPtr;

  replicaPtr.i = fragPtr.p->storedReplicas;
  while (replicaPtr.i != RNIL) {
    jam();
    c_replicaRecordPool.getPtr(replicaPtr);
    nextReplicaPtrI = replicaPtr.p->nextPool;
    ConstPtr<ReplicaRecord> constReplicaPtr;
    constReplicaPtr.i = replicaPtr.i;
    constReplicaPtr.p = replicaPtr.p;
    NodeRecordPtr nodePtr;
    nodePtr.i = replicaPtr.p->procNode;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
      jam();
      switch (nodePtr.p->activeStatus) {
      case Sysfile::NS_Active:
      case Sysfile::NS_ActiveMissed_1:
      case Sysfile::NS_ActiveMissed_2:{
	/* ----------------------------------------------------------------- */
	/*   INITIALISE THE CREATE REPLICA STRUCTURE THAT IS USED FOR SENDING*/
	/*   TO LQH START_FRAGREQ.                                           */
	/*   SET THE DATA NODE WHERE THE LOCAL CHECKPOINT IS FOUND. ALSO     */
	/*   SET A REFERENCE TO THE REPLICA POINTER OF THAT.                 */
	/* ----------------------------------------------------------------- */
	CreateReplicaRecordPtr createReplicaPtr;
	createReplicaPtr.i = cnoOfCreateReplicas;
	/* 4 is presumably the size of the createReplicaRecord array
	 * (max replicas per fragment) -- confirm against declaration. */
	ptrCheckGuard(createReplicaPtr, 4, createReplicaRecord);
	cnoOfCreateReplicas++;

	/**
	 * Should have been checked in resetReplicaSr
	 */
	ndbrequire(setup_create_replica(fragPtr,
					createReplicaPtr.p,
					constReplicaPtr));
	break;
      }
      default:
        jam();
        /*empty*/;
        break;
      }//switch
    }
    replicaPtr.i = nextReplicaPtrI;
  }//while
}//Dbdih::searchStoredReplicas()
22606 
22607 /*************************************************************************/
22608 /*                                                                       */
22609 /*       MODULE: SEIZE_FILE                                              */
22610 /*       DESCRIPTION: THE SUBROUTINE SEIZES A FILE RECORD FROM THE       */
22611 /*                    FREE LIST.                                         */
22612 /*************************************************************************/
seizeFile(FileRecordPtr & filePtr)22613 void Dbdih::seizeFile(FileRecordPtr& filePtr)
22614 {
22615   filePtr.i = cfirstfreeFile;
22616   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
22617   cfirstfreeFile = filePtr.p->nextFile;
22618   filePtr.p->nextFile = RNIL;
22619 }//Dbdih::seizeFile()
22620 
22621 /*************************************************************************/
22622 /*       SEND UPDATE_FRAG_STATEREQ TO ALL NODES IN THE NDB CLUSTER.      */
22623 /*************************************************************************/
22624 /*************************************************************************/
22625 /*                                                                       */
22626 /*       MODULE: FIND THE START GCI AND LOCAL CHECKPOINT TO USE.         */
22627 /*************************************************************************/
/**
 * Send one START_FRAGREQ per prepared create-replica record, asking the
 * replica's LQH to restore the selected LCP (lcpNo == ZNIL means none)
 * and then execute the selected redo-log intervals.
 */
void Dbdih::sendStartFragreq(Signal* signal,
			     TabRecordPtr tabPtr, Uint32 fragId)
{
  CreateReplicaRecordPtr replicaPtr;
  for (replicaPtr.i = 0; replicaPtr.i < cnoOfCreateReplicas; replicaPtr.i++) {
    jam();
    ptrAss(replicaPtr, createReplicaRecord);

    BlockReference ref = numberToRef(DBLQH, replicaPtr.p->dataNodeId);

    StartFragReq * const startFragReq = (StartFragReq *)&signal->theData[0];
    startFragReq->userPtr = replicaPtr.p->replicaRec;
    startFragReq->userRef = reference();
    startFragReq->lcpNo = replicaPtr.p->lcpNo;
    startFragReq->lcpId = replicaPtr.p->createLcpId;
    startFragReq->tableId = tabPtr.i;
    startFragReq->fragId = fragId;
    startFragReq->requestInfo = StartFragReq::SFR_RESTORE_LCP;

    if(ERROR_INSERTED(7072) || ERROR_INSERTED(7074)){
      jam();
      /* Error insert: artificially split the last log interval into many
       * single-GCI intervals on the same node, to exercise execution of
       * multiple log intervals (up to MAX_LOG_EXEC). */
      const Uint32 noNodes = replicaPtr.p->noLogNodes;
      Uint32 start = replicaPtr.p->logStartGci[noNodes - 1];
      const Uint32 stop  = replicaPtr.p->logStopGci[noNodes - 1];

      for(Uint32 i = noNodes; i < MAX_LOG_EXEC && (stop - start) > 0; i++){
	replicaPtr.p->noLogNodes++;
	replicaPtr.p->logStopGci[i - 1] = start;

	replicaPtr.p->logNodeId[i] = replicaPtr.p->logNodeId[i-1];
	replicaPtr.p->logStartGci[i] = start + 1;
	replicaPtr.p->logStopGci[i] = stop;
	start += 1;
      }
    }

    startFragReq->noOfLogNodes = replicaPtr.p->noLogNodes;

    /* Copy all MAX_LOG_EXEC slots; entries beyond noOfLogNodes are
     * simply unused by the receiver. */
    for (Uint32 i = 0; i < MAX_LOG_EXEC ; i++) {
      startFragReq->lqhLogNode[i] = replicaPtr.p->logNodeId[i];
      startFragReq->startGci[i] = replicaPtr.p->logStartGci[i];
      startFragReq->lastGci[i] = replicaPtr.p->logStopGci[i];
    }//for

    sendSignal(ref, GSN_START_FRAGREQ, signal,
	       StartFragReq::SignalLength, JBB);
  }//for
}//Dbdih::sendStartFragreq()
22676 
22677 /*************************************************************************/
22678 /*       SET LCP ACTIVE STATUS BEFORE STARTING A LOCAL CHECKPOINT.       */
22679 /*************************************************************************/
/**
 * Recompute the participant bitmaps for the LCP about to start:
 * m_participatingDIH gets every alive node with m_inclDihLcp set, and
 * m_participatingLQH every alive node with copyCompleted set. Nodes that
 * cannot participate (and are neither NS_Configured nor NS_NotDefined)
 * have their activeStatus downgraded to NS_ActiveMissed_1.
 */
void Dbdih::setLcpActiveStatusStart(Signal* signal)
{
  NodeRecordPtr nodePtr;

  c_lcpState.m_participatingLQH.clear();
  c_lcpState.m_participatingDIH.clear();

  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
    ptrAss(nodePtr, nodeRecord);
#if 0
    if(nodePtr.p->nodeStatus != NodeRecord::NOT_IN_CLUSTER){
      infoEvent("Node %d nodeStatus=%d activeStatus=%d copyCompleted=%d lcp=%d",
		nodePtr.i,
		nodePtr.p->nodeStatus,
		nodePtr.p->activeStatus,
		nodePtr.p->copyCompleted,
		nodePtr.p->m_inclDihLcp);
    }
#endif
    if(nodePtr.p->nodeStatus == NodeRecord::ALIVE)
    {
      jam();
      if (nodePtr.p->m_inclDihLcp)
      {
        jam();
        c_lcpState.m_participatingDIH.set(nodePtr.i);
      }

      if (nodePtr.p->copyCompleted)
      {
        jam();
	c_lcpState.m_participatingLQH.set(nodePtr.i);
      }
      else if (nodePtr.p->activeStatus == Sysfile::NS_Configured)
      {
        jam();
        continue;
      }
      else
      {
        jam();
        nodePtr.p->activeStatus = Sysfile::NS_ActiveMissed_1;
      }
    }
    else if (nodePtr.p->activeStatus == Sysfile::NS_Configured)
    {
      jam();
      continue;
    }
    else if (nodePtr.p->activeStatus != Sysfile::NS_NotDefined)
    {
      jam();
      nodePtr.p->activeStatus = Sysfile::NS_ActiveMissed_1;
    }
  }
}//Dbdih::setLcpActiveStatusStart()
22736 
22737 /*************************************************************************/
22738 /*       SET LCP ACTIVE STATUS AT THE END OF A LOCAL CHECKPOINT.        */
22739 /*************************************************************************/
/**
 * Update node active status after a local checkpoint has completed.
 *
 * Every node that participated in the LQH part of the LCP now holds a
 * restorable copy of the data: copyCompleted is set and its activeStatus
 * is promoted to NS_Active (NS_Configured nodes are left as they are).
 * Defined nodes that did not participate are downgraded to
 * NS_ActiveMissed_1.
 *
 * The participant bitmasks are then cleared, and on the master node the
 * restart info bits destined for the sysfile are refreshed.
 */
void Dbdih::setLcpActiveStatusEnd(Signal* signal)
{
  NodeRecordPtr nodePtr;

  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
    jam();
    ptrAss(nodePtr, nodeRecord);
    if (c_lcpState.m_participatingLQH.get(nodePtr.i))
    {
      jam();
      nodePtr.p->copyCompleted = 1;
      if (! (nodePtr.p->activeStatus == Sysfile::NS_Configured))
      {
        jam();
        nodePtr.p->activeStatus = Sysfile::NS_Active;
      }
      else
      {
        jam();
        // Do nothing: a configured-only node keeps its status.
      }
      if (nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_IN_LCP_WAIT_STATE)
      {
        jam();
        /**
         * This is a non-master node and this is the first time we heard this
         * node is alive and active. We set the node recovery status, this
         * status is only used in printouts if this node later becomes master
         * and the node is still alive and kicking. This means we have no
         * detailed information about its restart status.
         */
        setNodeRecoveryStatus(nodePtr.i, NodeRecord::NODE_ACTIVE);
      }
    }
    else if (nodePtr.p->activeStatus == Sysfile::NS_Configured)
    {
      jam();
      continue;
    }
    else if (nodePtr.p->activeStatus != Sysfile::NS_NotDefined)
    {
      jam();
      /* Defined node that missed this LCP. */
      nodePtr.p->activeStatus = Sysfile::NS_ActiveMissed_1;
    }
  }

  c_lcpState.m_participatingDIH.clear();
  c_lcpState.m_participatingLQH.clear();

  if (isMaster()) {
    jam();
    /* Only the master writes the new restart info into the sysfile image. */
    setNodeRestartInfoBits(signal);
  }//if
}//Dbdih::setLcpActiveStatusEnd()
22794 
22795 /*************************************************************************/
22796 /* SET NODE ACTIVE STATUS AT SYSTEM RESTART AND WHEN UPDATED BY MASTER   */
22797 /*************************************************************************/
/**
 * Copy each node's persisted active status from the sysfile (SYSFILE)
 * into its in-memory node record.
 *
 * The switch enumerates every legal Sysfile::ActiveStatus value even
 * though each case performs the same kind of assignment: this gives a
 * distinct jam() trace point per status and lets ndbrequire trap any
 * corrupt/unknown value read from the sysfile.
 */
void Dbdih::setNodeActiveStatus()
{
  NodeRecordPtr snaNodeptr;

  for (snaNodeptr.i = 1; snaNodeptr.i < MAX_NDB_NODES; snaNodeptr.i++)
  {
    ptrAss(snaNodeptr, nodeRecord);
    const Uint32 tsnaNodeBits = Sysfile::getNodeStatus(snaNodeptr.i,
                                                       SYSFILE->nodeStatus);
    switch (tsnaNodeBits) {
    case Sysfile::NS_Active:
      jam();
      snaNodeptr.p->activeStatus = Sysfile::NS_Active;
      break;
    case Sysfile::NS_ActiveMissed_1:
      jam();
      snaNodeptr.p->activeStatus = Sysfile::NS_ActiveMissed_1;
      break;
    case Sysfile::NS_ActiveMissed_2:
      jam();
      snaNodeptr.p->activeStatus = Sysfile::NS_ActiveMissed_2;
      break;
    case Sysfile::NS_TakeOver:
      jam();
      snaNodeptr.p->activeStatus = Sysfile::NS_TakeOver;
      break;
    case Sysfile::NS_NotActive_NotTakenOver:
      jam();
      snaNodeptr.p->activeStatus = Sysfile::NS_NotActive_NotTakenOver;
      break;
    case Sysfile::NS_NotDefined:
      jam();
      snaNodeptr.p->activeStatus = Sysfile::NS_NotDefined;
      break;
    case Sysfile::NS_Configured:
      jam();
      snaNodeptr.p->activeStatus = Sysfile::NS_Configured;
      break;
    default:
      /* Corrupt sysfile entry: crash rather than continue with bad state. */
      ndbrequire(false);
      break;
    }//switch
  }//for
}//Dbdih::setNodeActiveStatus()
22842 
22843 /***************************************************************************/
22844 /* SET THE NODE GROUP BASED ON THE RESTART INFORMATION OR AS SET BY MASTER */
22845 /***************************************************************************/
/**
 * Rebuild the node group records from the sysfile restart information
 * (or from data distributed by the master).
 *
 * First every currently known node group record is reset (nodeCount = 0,
 * removed from the group index) and cnoOfNodeGroups is cleared.  Then
 * each defined node is re-inserted into the node group recorded for it
 * in SYSFILE->nodeGroups; nodes that are NS_NotDefined/NS_Configured get
 * nodeGroup = ZNIL (no group).
 */
void Dbdih::setNodeGroups()
{
  NodeGroupRecordPtr NGPtr;
  NodeRecordPtr sngNodeptr;
  Uint32 Ti;

  for (Ti = 0; Ti < cnoOfNodeGroups; Ti++) {
    NGPtr.i = c_node_groups[Ti];
    ptrAss(NGPtr, nodeGroupRecord);
    NGPtr.p->nodeCount = 0;
    NGPtr.p->nodegroupIndex = RNIL;
  }//for
  cnoOfNodeGroups = 0;
  for (sngNodeptr.i = 1; sngNodeptr.i < MAX_NDB_NODES; sngNodeptr.i++) {
    ptrAss(sngNodeptr, nodeRecord);
    Sysfile::ActiveStatus s =
      (Sysfile::ActiveStatus)Sysfile::getNodeStatus(sngNodeptr.i,
						    SYSFILE->nodeStatus);
    switch (s){
    case Sysfile::NS_Active:
    case Sysfile::NS_ActiveMissed_1:
    case Sysfile::NS_ActiveMissed_2:
    case Sysfile::NS_NotActive_NotTakenOver:
    case Sysfile::NS_TakeOver:
      jam();
      /* Defined node: place it back into its persisted node group. */
      sngNodeptr.p->nodeGroup = Sysfile::getNodeGroup(sngNodeptr.i,
                                                      SYSFILE->nodeGroups);
      NGPtr.i = sngNodeptr.p->nodeGroup;
      ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
      NGPtr.p->nodesInGroup[NGPtr.p->nodeCount] = sngNodeptr.i;
      NGPtr.p->nodeCount++;
      add_nodegroup(NGPtr);
      break;
    case Sysfile::NS_NotDefined:
    case Sysfile::NS_Configured:
      jam();
      sngNodeptr.p->nodeGroup = ZNIL;
      break;
    default:
      /* Corrupt status value; ndbrequire aborts, return is unreachable. */
      ndbrequire(false);
      return;
      break;
    }//switch
  }//for
}//Dbdih::setNodeGroups()
22891 
22892 /*************************************************************************/
22893 /* SET THE RESTART INFO BITS BASED ON THE NODES ACTIVE STATUS.           */
22894 /*************************************************************************/
/**
 * Encode the in-memory node state (active status, node group, LCP
 * participation) into the sysfile image (SYSFILE) so it can be made
 * durable for system restart.
 *
 * The nodeStatus entries are first reset to NS_Active and the nodeGroups
 * words zeroed, then each node record overwrites its own slots.  The
 * lcpActive bitmask is rebuilt from m_participatingLQH.
 *
 * Under ERROR_INSERT 7220 an NDB_TAMPER signal is broadcast to all alive
 * DIH instances for nodes that were NS_Active but outside the ongoing
 * LCP (test hook).
 *
 * @param signal  used only by the ERROR_INSERT broadcast path.
 */
void Dbdih::setNodeRestartInfoBits(Signal * signal)
{
  NodeRecordPtr nodePtr;
  Uint32 tsnrNodeGroup;
  Uint32 tsnrNodeActiveStatus;
  Uint32 i;
  for(i = 1; i < MAX_NDB_NODES; i++){
    Sysfile::setNodeStatus(i, SYSFILE->nodeStatus, Sysfile::NS_Active);
  }//for
  // NOTE(review): this clearing loop starts at word index 1, so word 0 of
  // nodeGroups is never zeroed here; presumably its bits are fully
  // overwritten by the setNodeGroup calls below (node ids start at 1) —
  // confirm before relying on word 0 being clean.
  for(i = 1; i < Sysfile::NODE_GROUPS_SIZE; i++){
    SYSFILE->nodeGroups[i] = 0;
  }//for
  NdbNodeBitmask::clear(SYSFILE->lcpActive);

#ifdef ERROR_INSERT
  NdbNodeBitmask tmp;
#endif

  for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
    ptrAss(nodePtr, nodeRecord);
    /* One case per legal status: distinct jam() points + trap on corruption. */
    switch (nodePtr.p->activeStatus) {
    case Sysfile::NS_Active:
      jam();
      tsnrNodeActiveStatus = Sysfile::NS_Active;
      break;
    case Sysfile::NS_ActiveMissed_1:
      jam();
      tsnrNodeActiveStatus = Sysfile::NS_ActiveMissed_1;
      break;
    case Sysfile::NS_ActiveMissed_2:
      jam();
      tsnrNodeActiveStatus = Sysfile::NS_ActiveMissed_2;
      break;
    case Sysfile::NS_TakeOver:
      jam();
      tsnrNodeActiveStatus = Sysfile::NS_TakeOver;
      break;
    case Sysfile::NS_NotActive_NotTakenOver:
      jam();
      tsnrNodeActiveStatus = Sysfile::NS_NotActive_NotTakenOver;
      break;
    case Sysfile::NS_NotDefined:
      jam();
      tsnrNodeActiveStatus = Sysfile::NS_NotDefined;
      break;
    case Sysfile::NS_Configured:
      jam();
      tsnrNodeActiveStatus = Sysfile::NS_Configured;
      break;
    default:
      ndbrequire(false);
      tsnrNodeActiveStatus = Sysfile::NS_NotDefined; // remove warning
      break;
    }//switch
    Sysfile::setNodeStatus(nodePtr.i, SYSFILE->nodeStatus,
                           tsnrNodeActiveStatus);
    if (nodePtr.p->nodeGroup == ZNIL) {
      jam();
      tsnrNodeGroup = NO_NODE_GROUP_ID;
    } else {
      jam();
      tsnrNodeGroup = nodePtr.p->nodeGroup;
    }//if
    Sysfile::setNodeGroup(nodePtr.i, SYSFILE->nodeGroups, tsnrNodeGroup);
    if (c_lcpState.m_participatingLQH.get(nodePtr.i))
    {
      jam();
      NdbNodeBitmask::set(SYSFILE->lcpActive, nodePtr.i);
    }//if
#ifdef ERROR_INSERT
    else if (Sysfile::getLCPOngoing(SYSFILE->systemRestartBits))
    {
      jam();
      /* Active node outside the ongoing LCP: remember it for the 7220 hook. */
      if (nodePtr.p->activeStatus == Sysfile::NS_Active)
        tmp.set(nodePtr.i);
    }
#endif
  }//for

#ifdef ERROR_INSERT
  if (ERROR_INSERTED(7220) && !tmp.isclear())
  {
    jam();

    /* Broadcast NDB_TAMPER 7219 to the DIH of every alive node. */
    NdbNodeBitmask all;
    nodePtr.i = cfirstAliveNode;
    do {
      jam();
      ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
      all.set(nodePtr.i);
      nodePtr.i = nodePtr.p->nextNode;
    } while (nodePtr.i != RNIL);


    NodeReceiverGroup rg(DBDIH, all);
    signal->theData[0] = 7219;
    sendSignal(rg, GSN_NDB_TAMPER, signal,  1, JBA);
  }
#endif
}//Dbdih::setNodeRestartInfoBits()
22995 
22996 /*************************************************************************/
22997 /*       START THE GLOBAL CHECKPOINT PROTOCOL IN MASTER AT START-UP      */
22998 /*************************************************************************/
/**
 * Kick off the global checkpoint (GCP) protocol on the master at
 * start-up: schedule the ZSTART_GCP continuation to ourselves and start
 * the GCP stop/lag monitor.  Note the signal buffer is reused by
 * startGcpMonitor after the first sendSignal, so the order matters.
 */
void Dbdih::startGcp(Signal* signal)
{
  signal->theData[0] = DihContinueB::ZSTART_GCP;
  sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);

  startGcpMonitor(signal);
}//Dbdih::startGcp()
23006 
/**
 * Initialize the GCP monitor: snapshot the current GCP-save and
 * micro-GCP GCIs with zeroed elapsed time, record the current tick, and
 * schedule the first ZCHECK_GCP_STOP check in 100 ms.  The check
 * re-arms itself, giving periodic detection of a stalled GCP.
 */
void
Dbdih::startGcpMonitor(Signal* signal)
{
  jam();
  m_gcp_monitor.m_gcp_save.m_gci = m_gcp_save.m_gci;
  m_gcp_monitor.m_gcp_save.m_elapsed_ms = 0;
  m_gcp_monitor.m_micro_gcp.m_gci = m_micro_gcp.m_current_gci;
  m_gcp_monitor.m_micro_gcp.m_elapsed_ms = 0;
  m_gcp_monitor.m_last_check = NdbTick_getCurrentTicks();

  signal->theData[0] = DihContinueB::ZCHECK_GCP_STOP;
  sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 1);
}
23020 
updateNodeInfo(FragmentstorePtr fragPtr)23021 void Dbdih::updateNodeInfo(FragmentstorePtr fragPtr)
23022 {
23023   ReplicaRecordPtr replicatePtr;
23024   Uint32 index = 0;
23025   replicatePtr.i = fragPtr.p->storedReplicas;
23026   do {
23027     jam();
23028     c_replicaRecordPool.getPtr(replicatePtr);
23029     ndbrequire(index < MAX_REPLICAS);
23030     fragPtr.p->activeNodes[index] = replicatePtr.p->procNode;
23031     index++;
23032     replicatePtr.i = replicatePtr.p->nextPool;
23033   } while (replicatePtr.i != RNIL);
23034   fragPtr.p->fragReplicas = index;
23035 
23036   /* ----------------------------------------------------------------------- */
23037   // We switch primary to the preferred primary if the preferred primary is
23038   // in the list.
23039   /* ----------------------------------------------------------------------- */
23040   const Uint32 prefPrim = fragPtr.p->preferredPrimary;
23041   for (Uint32 i = 1; i < index; i++) {
23042     jam();
23043     ndbrequire(i < MAX_REPLICAS);
23044     if (fragPtr.p->activeNodes[i] == prefPrim){
23045       jam();
23046       Uint32 switchNode = fragPtr.p->activeNodes[0];
23047       fragPtr.p->activeNodes[0] = prefPrim;
23048       fragPtr.p->activeNodes[i] = switchNode;
23049       break;
23050     }//if
23051   }//for
23052 }//Dbdih::updateNodeInfo()
23053 
writeFragment(RWFragment * wf,FragmentstorePtr fragPtr)23054 void Dbdih::writeFragment(RWFragment* wf, FragmentstorePtr fragPtr)
23055 {
23056   writePageWord(wf, wf->fragId);
23057   writePageWord(wf, fragPtr.p->preferredPrimary);
23058   writePageWord(wf, fragPtr.p->noStoredReplicas);
23059   writePageWord(wf, fragPtr.p->noOldStoredReplicas);
23060   writePageWord(wf, fragPtr.p->distributionKey);
23061   writePageWord(wf, fragPtr.p->m_log_part_id);
23062 }//Dbdih::writeFragment()
23063 
/**
 * Append one 32-bit word to the table-file write stream.
 *
 * When the current page is full (wordIndex reaches 2048, the page size
 * in words), a fresh page is allocated and registered on the table
 * record; writing then resumes at word 32, leaving the first 32 words
 * of every page reserved (page header area).
 */
void Dbdih::writePageWord(RWFragment* wf, Uint32 dataWord)
{
  if (wf->wordIndex >= 2048) {
    jam();
    /* Must land exactly on the boundary; anything else is corruption. */
    ndbrequire(wf->wordIndex == 2048);
    allocpage(wf->rwfPageptr);
    wf->wordIndex = 32;
    wf->pageIndex++;
    ndbrequire(wf->pageIndex < NDB_ARRAY_SIZE(wf->rwfTabPtr.p->pageRef));
    wf->rwfTabPtr.p->pageRef[wf->pageIndex] = wf->rwfPageptr.i;
    wf->rwfTabPtr.p->noPages++;
  }//if
  wf->rwfPageptr.p->word[wf->wordIndex] = dataWord;
  wf->wordIndex++;
}//Dbdih::writePageWord()
23079 
/**
 * Serialize a chain of replica records (starting at replicaStartIndex,
 * terminated by RNIL) into the table file.
 *
 * Per replica the layout is: procNode, initialGci, noCrashedReplicas,
 * nextLcp, then per stored LCP (MAX_LCP_STORED entries) the
 * maxGciCompleted/maxGciStarted/lcpId/lcpStatus quadruple, then per
 * crash slot (MAX_CRASHED_REPLICAS entries) the createGci/replicaLastGci
 * pair.  This order defines the on-disk format and must match the reader.
 */
void Dbdih::writeReplicas(RWFragment* wf, Uint32 replicaStartIndex)
{
  ReplicaRecordPtr wfReplicaPtr;
  wfReplicaPtr.i = replicaStartIndex;
  while (wfReplicaPtr.i != RNIL) {
    jam();
    c_replicaRecordPool.getPtr(wfReplicaPtr);
    writePageWord(wf, wfReplicaPtr.p->procNode);
    writePageWord(wf, wfReplicaPtr.p->initialGci);
    writePageWord(wf, wfReplicaPtr.p->noCrashedReplicas);
    writePageWord(wf, wfReplicaPtr.p->nextLcp);
    Uint32 i;
    for (i = 0; i < MAX_LCP_STORED; i++) {
      writePageWord(wf, wfReplicaPtr.p->maxGciCompleted[i]);
      writePageWord(wf, wfReplicaPtr.p->maxGciStarted[i]);
      writePageWord(wf, wfReplicaPtr.p->lcpId[i]);
      writePageWord(wf, wfReplicaPtr.p->lcpStatus[i]);
    }//for
    for (i = 0; i < MAX_CRASHED_REPLICAS; i++) {
      writePageWord(wf, wfReplicaPtr.p->createGci[i]);
      writePageWord(wf, wfReplicaPtr.p->replicaLastGci[i]);
    }//for

    wfReplicaPtr.i = wfReplicaPtr.p->nextPool;
  }//while
}//Dbdih::writeReplicas()
23106 
/**
 * Write the restart information (sysfile) to one of the sysfile copies.
 *
 * The current sysfile image is snapshotted into sysfileDataToFile, and a
 * synchronous FSWRITEREQ of one page is sent to NDBFS for the file in
 * filePtr.
 *
 * ERROR_INSERT 7224 (second sysfile copy only): delay the write by two
 * seconds and send NDB_TAMPER 9999 to the master's CMVMI, to provoke a
 * race with COPY_GCIREQ in test runs.
 */
void Dbdih::writeRestorableGci(Signal* signal, FileRecordPtr filePtr)
{
  for (Uint32 i = 0; i < Sysfile::SYSFILE_SIZE32; i++) {
    sysfileDataToFile[i] = sysfileData[i];
  }//for
  signal->theData[0] = filePtr.p->fileRef;
  signal->theData[1] = reference();
  signal->theData[2] = filePtr.i;
  signal->theData[3] = ZLIST_OF_PAIRS_SYNCH;
  signal->theData[4] = ZVAR_NO_CRESTART_INFO_TO_FILE;
  signal->theData[5] = 1; /* AMOUNT OF PAGES */
  signal->theData[6] = 0; /* MEMORY PAGE = 0 SINCE COMMON STORED VARIABLE  */
  signal->theData[7] = 0;

  if (ERROR_INSERTED(7224) && filePtr.i == crestartInfoFile[1])
  {
    jam();
    SET_ERROR_INSERT_VALUE(7225);
    sendSignalWithDelay(NDBFS_REF, GSN_FSWRITEREQ, signal, 2000, 8);

    signal->theData[0] = 9999;
    sendSignal(numberToRef(CMVMI, refToNode(cmasterdihref)),
	       GSN_NDB_TAMPER, signal, 1, JBB);
    g_eventLogger->info("FS_WRITEREQ delay 2 second for COPY_GCIREQ");
    return;
  }
  sendSignal(NDBFS_REF, GSN_FSWRITEREQ, signal, 8, JBA);
}//Dbdih::writeRestorableGci()
23135 
/**
 * Write a table's description pages to its table file.
 *
 * Builds a synchronous FSWRITEREQ for all of the table's pages; the page
 * list is passed as a linear section of (pageRef, filePageIndex) pairs,
 * one pair per page, so NDBFS writes page i of the table to file page i.
 */
void Dbdih::writeTabfile(Signal* signal, TabRecord* tab, FileRecordPtr filePtr)
{
  signal->theData[0] = filePtr.p->fileRef;
  signal->theData[1] = reference();
  signal->theData[2] = filePtr.i;
  signal->theData[3] = ZLIST_OF_PAIRS_SYNCH;
  signal->theData[4] = ZVAR_NO_WORD;
  signal->theData[5] = tab->noPages;

  /* The whole page list must fit into one FSWRITEREQ. */
  NDB_STATIC_ASSERT(NDB_ARRAY_SIZE(tab->pageRef) <= NDB_FS_RW_PAGES);
  Uint32 section[2 * NDB_ARRAY_SIZE(tab->pageRef)];
  for (Uint32 i = 0; i < tab->noPages; i++)
  {
    section[(2 * i) + 0] = tab->pageRef[i];
    section[(2 * i) + 1] = i;
  }
  LinearSectionPtr ptr[3];
  ptr[0].p = section;
  ptr[0].sz = 2 * tab->noPages;
  sendSignal(NDBFS_REF, GSN_FSWRITEREQ, signal, 6, JBA, ptr, 1);
}//Dbdih::writeTabfile()
23157 
execDEBUG_SIG(Signal * signal)23158 void Dbdih::execDEBUG_SIG(Signal* signal)
23159 {
23160   (void)signal; //Avoid compiler warnings
23161 }//Dbdih::execDEBUG_SIG()
23162 
23163 void
execDUMP_STATE_ORD(Signal * signal)23164 Dbdih::execDUMP_STATE_ORD(Signal* signal)
23165 {
23166   DumpStateOrd * const & dumpState = (DumpStateOrd *)&signal->theData[0];
23167   Uint32 arg = dumpState->args[0];
23168 
23169   if (arg == DumpStateOrd::DihFragmentsPerNode)
23170   {
23171     infoEvent("Fragments per node = %u", getFragmentsPerNode());
23172   }
23173   if (arg == DumpStateOrd::DihDumpNodeRestartInfo) {
23174     infoEvent("c_nodeStartMaster.blockGcp = %d, c_nodeStartMaster.wait = %d",
23175 	      c_nodeStartMaster.blockGcp, c_nodeStartMaster.wait);
23176     for (Uint32 i = 0; i < c_diverify_queue_cnt; i++)
23177     {
23178       infoEvent("[ %u : cfirstVerifyQueue = %u clastVerifyQueue = %u sz: %u]",
23179                 i,
23180                 c_diverify_queue[i].cfirstVerifyQueue,
23181                 c_diverify_queue[i].clastVerifyQueue,
23182                 capiConnectFileSize);
23183     }
23184     infoEvent("cgcpOrderBlocked = %d",
23185               cgcpOrderBlocked);
23186   }//if
23187   if (arg == DumpStateOrd::DihDumpNodeStatusInfo) {
23188     NodeRecordPtr localNodePtr;
23189     infoEvent("Printing nodeStatus of all nodes");
23190     for (localNodePtr.i = 1; localNodePtr.i < MAX_NDB_NODES; localNodePtr.i++) {
23191       ptrAss(localNodePtr, nodeRecord);
23192       if (localNodePtr.p->nodeStatus != NodeRecord::NOT_IN_CLUSTER) {
23193         infoEvent("Node = %d has status = %d",
23194 		  localNodePtr.i, localNodePtr.p->nodeStatus);
23195       }//if
23196     }//for
23197   }//if
23198 
23199   if (arg == DumpStateOrd::DihPrintFragmentation)
23200   {
23201     Uint32 tableid = 0;
23202     Uint32 fragid = 0;
23203     if (signal->getLength() == 1)
23204     {
23205       infoEvent("Printing nodegroups --");
23206       for (Uint32 i = 0; i<cnoOfNodeGroups; i++)
23207       {
23208         jam();
23209         NodeGroupRecordPtr NGPtr;
23210         NGPtr.i = c_node_groups[i];
23211         ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
23212 
23213         infoEvent("NG %u(%u) ref: %u [ cnt: %u : %u %u %u %u ]",
23214                   NGPtr.i, NGPtr.p->nodegroupIndex, NGPtr.p->m_ref_count,
23215                   NGPtr.p->nodeCount,
23216                   NGPtr.p->nodesInGroup[0], NGPtr.p->nodesInGroup[1],
23217                   NGPtr.p->nodesInGroup[2], NGPtr.p->nodesInGroup[3]);
23218       }
23219       infoEvent("Printing fragmentation of all tables --");
23220     }
23221     else if (signal->getLength() == 3)
23222     {
23223       jam();
23224       tableid = dumpState->args[1];
23225       fragid = dumpState->args[2];
23226     }
23227     else
23228     {
23229       return;
23230     }
23231 
23232     if (tableid >= ctabFileSize)
23233     {
23234       return;
23235     }
23236 
23237     TabRecordPtr tabPtr;
23238     tabPtr.i = tableid;
23239     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
23240 
23241     if (tabPtr.p->tabStatus == TabRecord::TS_ACTIVE &&
23242         fragid < tabPtr.p->totalfragments)
23243     {
23244       dumpState->args[0] = DumpStateOrd::DihPrintOneFragmentation;
23245       dumpState->args[1] = tableid;
23246       dumpState->args[2] = fragid;
23247       execDUMP_STATE_ORD(signal);
23248     }
23249 
23250     if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE ||
23251         ++fragid >= tabPtr.p->totalfragments)
23252     {
23253         tableid++;
23254         fragid = 0;
23255     }
23256 
23257     if (tableid < ctabFileSize)
23258     {
23259       dumpState->args[0] = DumpStateOrd::DihPrintFragmentation;
23260       dumpState->args[1] = tableid;
23261       dumpState->args[2] = fragid;
23262       sendSignal(reference(), GSN_DUMP_STATE_ORD, signal, 3, JBB);
23263     }
23264   }
23265 
23266   if (arg == DumpStateOrd::DihPrintOneFragmentation)
23267   {
23268     Uint32 tableid = RNIL;
23269     Uint32 fragid = RNIL;
23270 
23271     if (signal->getLength() == 3)
23272     {
23273       jam();
23274       tableid = dumpState->args[1];
23275       fragid = dumpState->args[2];
23276     }
23277     else
23278     {
23279       return;
23280     }
23281 
23282     if (tableid >= ctabFileSize)
23283     {
23284       return;
23285     }
23286 
23287     TabRecordPtr tabPtr;
23288     tabPtr.i = tableid;
23289     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
23290 
23291     if (fragid >= tabPtr.p->totalfragments)
23292     {
23293       return;
23294     }
23295 
23296     FragmentstorePtr fragPtr;
23297     getFragstore(tabPtr.p, fragid, fragPtr);
23298 
23299     Uint32 nodeOrder[MAX_REPLICAS];
23300     const Uint32 noOfReplicas = extractNodeInfo(jamBuffer(),
23301                                                 fragPtr.p,
23302                                                 nodeOrder);
23303     char buf[100];
23304     BaseString::snprintf(buf, sizeof(buf),
23305                          " Table %d Fragment %d(%u) LP: %u - ",
23306                          tabPtr.i, fragid, dihGetInstanceKey(fragPtr),
23307                          fragPtr.p->m_log_part_id);
23308 
23309     for(Uint32 k = 0; k < noOfReplicas; k++)
23310     {
23311       char tmp[100];
23312       BaseString::snprintf(tmp, sizeof(tmp), "%d ", nodeOrder[k]);
23313       strcat(buf, tmp);
23314     }
23315     infoEvent("%s", buf);
23316   }
23317 
23318   if (signal->theData[0] == 7000) {
23319     infoEvent("ctimer = %d",
23320               c_lcpState.ctimer);
23321     infoEvent("cmasterState = %d", cmasterState);
23322     infoEvent("cmasterTakeOverNode = %d, ctcCounter = %d",
23323               cmasterTakeOverNode, c_lcpState.ctcCounter);
23324   }//if
23325   if (signal->theData[0] == 7001) {
23326     infoEvent("c_lcpState.keepGci = %d",
23327               c_lcpState.keepGci);
23328     infoEvent("c_lcpState.lcpStatus = %d, clcpStopGcp = %d",
23329               c_lcpState.lcpStatus,
23330 	      c_lcpState.lcpStopGcp);
23331     infoEvent("cimmediateLcpStart = %d",
23332               c_lcpState.immediateLcpStart);
23333   }//if
23334   if (signal->theData[0] == 7002) {
23335     infoEvent("cnoOfActiveTables = %d",
23336               cnoOfActiveTables);
23337     infoEvent("cdictblockref = %d, cfailurenr = %d",
23338               cdictblockref, cfailurenr);
23339     infoEvent("con_lineNodes = %d, reference() = %d, creceivedfrag = %d",
23340               con_lineNodes, reference(), creceivedfrag);
23341   }//if
23342   if (signal->theData[0] == 7003) {
23343     infoEvent("cfirstAliveNode = %d, cgckptflag = %d",
23344               cfirstAliveNode, cgckptflag);
23345     infoEvent("clocallqhblockref = %d, clocaltcblockref = %d, cgcpOrderBlocked = %d",
23346               clocallqhblockref, clocaltcblockref, cgcpOrderBlocked);
23347     infoEvent("cstarttype = %d, csystemnodes = %d",
23348               cstarttype, csystemnodes);
23349   }//if
23350   if (signal->theData[0] == 7004) {
23351     infoEvent("cmasterdihref = %d, cownNodeId = %d",
23352               cmasterdihref, cownNodeId);
23353     infoEvent("cndbStartReqBlockref = %d, cremainingfrags = %d",
23354               cndbStartReqBlockref, cremainingfrags);
23355   }//if
23356   if (signal->theData[0] == 7005) {
23357     infoEvent("crestartGci = %d",
23358               crestartGci);
23359   }//if
23360   if (signal->theData[0] == 7006) {
23361     infoEvent("clcpDelay = %d",
23362               c_lcpState.clcpDelay);
23363     infoEvent("cmasterNodeId = %d", cmasterNodeId);
23364     infoEvent("c_nodeStartMaster.startNode = %d, c_nodeStartMaster.wait = %d",
23365               c_nodeStartMaster.startNode, c_nodeStartMaster.wait);
23366   }//if
23367   if (signal->theData[0] == 7007) {
23368     infoEvent("c_nodeStartMaster.failNr = %d", c_nodeStartMaster.failNr);
23369     infoEvent("c_nodeStartMaster.startInfoErrorCode = %d",
23370               c_nodeStartMaster.startInfoErrorCode);
23371     infoEvent("c_nodeStartMaster.blockGcp = %d",
23372               c_nodeStartMaster.blockGcp);
23373   }//if
23374   if (signal->theData[0] == 7008) {
23375     infoEvent("cfirstDeadNode = %d, cstartPhase = %d, cnoReplicas = %d",
23376               cfirstDeadNode, cstartPhase, cnoReplicas);
23377     infoEvent("cwaitLcpSr = %d",cwaitLcpSr);
23378   }//if
23379   if (signal->theData[0] == 7009) {
23380     infoEvent("ccalcOldestRestorableGci = %d, cnoOfNodeGroups = %d",
23381               c_lcpState.oldestRestorableGci, cnoOfNodeGroups);
23382     infoEvent("crestartGci = %d",
23383               crestartGci);
23384   }//if
23385   if (signal->theData[0] == 7010) {
23386     infoEvent("c_lcpState.lcpStatusUpdatedPlace = %d, cLcpStart = %d",
23387               c_lcpState.lcpStatusUpdatedPlace, c_lcpState.lcpStart);
23388     infoEvent("c_blockCommit = %d, c_blockCommitNo = %d",
23389               c_blockCommit, c_blockCommitNo);
23390   }//if
23391   if (signal->theData[0] == 7011){
23392     infoEvent("c_COPY_GCIREQ_Counter = %s",
23393 	      c_COPY_GCIREQ_Counter.getText());
23394     infoEvent("c_COPY_TABREQ_Counter = %s",
23395 	      c_COPY_TABREQ_Counter.getText());
23396     infoEvent("c_UPDATE_FRAG_STATEREQ_Counter = %s",
23397 	      c_UPDATE_FRAG_STATEREQ_Counter.getText());
23398     infoEvent("c_DIH_SWITCH_REPLICA_REQ_Counter = %s",
23399 	      c_DIH_SWITCH_REPLICA_REQ_Counter.getText());
23400     infoEvent("c_EMPTY_LCP_REQ_Counter = %s",c_EMPTY_LCP_REQ_Counter.getText());
23401     infoEvent("c_GCP_COMMIT_Counter = %s", c_GCP_COMMIT_Counter.getText());
23402     infoEvent("c_GCP_PREPARE_Counter = %s", c_GCP_PREPARE_Counter.getText());
23403     infoEvent("c_GCP_SAVEREQ_Counter = %s", c_GCP_SAVEREQ_Counter.getText());
23404     infoEvent("c_SUB_GCP_COMPLETE_REP_Counter = %s",
23405               c_SUB_GCP_COMPLETE_REP_Counter.getText());
23406     infoEvent("c_INCL_NODEREQ_Counter = %s", c_INCL_NODEREQ_Counter.getText());
23407     infoEvent("c_MASTER_GCPREQ_Counter = %s",
23408 	      c_MASTER_GCPREQ_Counter.getText());
23409     infoEvent("c_MASTER_LCPREQ_Counter = %s",
23410 	      c_MASTER_LCPREQ_Counter.getText());
23411     infoEvent("c_START_INFOREQ_Counter = %s",
23412 	      c_START_INFOREQ_Counter.getText());
23413     infoEvent("c_START_RECREQ_Counter = %s", c_START_RECREQ_Counter.getText());
23414     infoEvent("c_STOP_ME_REQ_Counter = %s", c_STOP_ME_REQ_Counter.getText());
23415     infoEvent("c_TC_CLOPSIZEREQ_Counter = %s",
23416 	      c_TC_CLOPSIZEREQ_Counter.getText());
23417     infoEvent("c_TCGETOPSIZEREQ_Counter = %s",
23418 	      c_TCGETOPSIZEREQ_Counter.getText());
23419   }
23420 
23421   if(signal->theData[0] == 7012){
23422     char buf[8*_NDB_NODE_BITMASK_SIZE+1];
23423     infoEvent("ParticipatingDIH = %s", c_lcpState.m_participatingDIH.getText(buf));
23424     infoEvent("ParticipatingLQH = %s", c_lcpState.m_participatingLQH.getText(buf));
23425     infoEvent("m_LCP_COMPLETE_REP_Counter_DIH = %s",
23426 	      c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.getText());
23427     infoEvent("m_LCP_COMPLETE_REP_Counter_LQH = %s",
23428 	      c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.getText());
23429     infoEvent("m_LAST_LCP_FRAG_ORD = %s",
23430 	      c_lcpState.m_LAST_LCP_FRAG_ORD.getText());
23431     infoEvent("m_LCP_COMPLETE_REP_From_Master_Received = %d",
23432 	      c_lcpState.m_LCP_COMPLETE_REP_From_Master_Received);
23433 
23434     NodeRecordPtr nodePtr;
23435     for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
23436       jam();
23437       ptrAss(nodePtr, nodeRecord);
23438       if(nodePtr.p->nodeStatus == NodeRecord::ALIVE){
23439         Uint32 i;
23440 	for(i = 0; i<nodePtr.p->noOfStartedChkpt; i++){
23441 	  infoEvent("Node %d: started: table=%d fragment=%d replica=%d",
23442 		    nodePtr.i,
23443 		    nodePtr.p->startedChkpt[i].tableId,
23444 		    nodePtr.p->startedChkpt[i].fragId,
23445 		    nodePtr.p->startedChkpt[i].replicaPtr);
23446 	}
23447 
23448 	for(i = 0; i<nodePtr.p->noOfQueuedChkpt; i++){
23449 	  infoEvent("Node %d: queued: table=%d fragment=%d replica=%d",
23450 		    nodePtr.i,
23451 		    nodePtr.p->queuedChkpt[i].tableId,
23452 		    nodePtr.p->queuedChkpt[i].fragId,
23453 		    nodePtr.p->queuedChkpt[i].replicaPtr);
23454 	}
23455       }
23456     }
23457   }
23458 
23459   if(arg == DumpStateOrd::DihTcSumaNodeFailCompleted &&
23460      signal->getLength() == 2 &&
23461      signal->theData[1] < MAX_NDB_NODES)
23462   {
23463     jam();
23464     char buf2[8+1];
23465     NodeRecordPtr nodePtr;
23466     nodePtr.i = signal->theData[1];
23467     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
23468     infoEvent("NF Node %d tc: %d lqh: %d dih: %d dict: %d recNODE_FAILREP: %d",
23469 	      nodePtr.i,
23470 	      nodePtr.p->dbtcFailCompleted,
23471 	      nodePtr.p->dblqhFailCompleted,
23472 	      nodePtr.p->dbdihFailCompleted,
23473 	      nodePtr.p->dbdictFailCompleted,
23474 	      nodePtr.p->recNODE_FAILREP);
23475     infoEvent(" m_NF_COMPLETE_REP: %s m_nodefailSteps: %s",
23476 	      nodePtr.p->m_NF_COMPLETE_REP.getText(),
23477 	      nodePtr.p->m_nodefailSteps.getText(buf2));
23478   }
23479 
23480   if(arg == 7020 && signal->getLength() > 3)
23481   {
23482     Uint32 gsn= signal->theData[1];
23483     Uint32 block= signal->theData[2];
23484     Uint32 length= signal->length() - 3;
23485     memmove(signal->theData, signal->theData+3, 4*length);
23486     sendSignal(numberToRef(block, getOwnNodeId()), gsn, signal, length, JBB);
23487 
23488     warningEvent("-- SENDING CUSTOM SIGNAL --");
23489     char buf[100], buf2[100];
23490     buf2[0]= 0;
23491     for(Uint32 i = 0; i<length; i++)
23492     {
23493       BaseString::snprintf(buf, 100, "%s %.8x", buf2, signal->theData[i]);
23494       BaseString::snprintf(buf2, 100, "%s", buf);
23495     }
23496     warningEvent("gsn: %d block: %s, length: %d theData: %s",
23497 		 gsn, getBlockName(block, "UNKNOWN"), length, buf);
23498 
23499     g_eventLogger->warning("-- SENDING CUSTOM SIGNAL --");
23500     g_eventLogger->warning("gsn: %d block: %s, length: %d theData: %s",
23501                            gsn, getBlockName(block, "UNKNOWN"), length, buf);
23502   }
23503 
23504   if(arg == DumpStateOrd::DihDumpLCPState){
23505     infoEvent("-- Node %d LCP STATE --", getOwnNodeId());
23506     infoEvent("lcpStatus = %d (update place = %d) ",
23507 	      c_lcpState.lcpStatus, c_lcpState.lcpStatusUpdatedPlace);
23508     infoEvent
23509       ("lcpStart = %d lcpStopGcp = %d keepGci = %d oldestRestorable = %d",
23510        c_lcpState.lcpStart, c_lcpState.lcpStopGcp,
23511        c_lcpState.keepGci, c_lcpState.oldestRestorableGci);
23512 
23513     infoEvent
23514       ("immediateLcpStart = %d masterLcpNodeId = %d",
23515        c_lcpState.immediateLcpStart,
23516        refToNode(c_lcpState.m_masterLcpDihRef));
23517 
23518     for (Uint32 i = 0; i<10; i++)
23519     {
23520       infoEvent("%u : status: %u place: %u", i,
23521                 c_lcpState.m_saveState[i].m_status,
23522                 c_lcpState.m_saveState[i].m_place);
23523     }
23524 
23525     infoEvent("-- Node %d LCP STATE --", getOwnNodeId());
23526   }
23527 
23528   if(arg == DumpStateOrd::DihDumpLCPMasterTakeOver){
23529     infoEvent("-- Node %d LCP MASTER TAKE OVER STATE --", getOwnNodeId());
23530     infoEvent
23531       ("c_lcpMasterTakeOverState.state = %d updatePlace = %d failedNodeId = %d",
23532        c_lcpMasterTakeOverState.state,
23533        c_lcpMasterTakeOverState.updatePlace,
23534        c_lcpMasterTakeOverState.failedNodeId);
23535 
23536     infoEvent("c_lcpMasterTakeOverState.minTableId = %u minFragId = %u",
23537 	      c_lcpMasterTakeOverState.minTableId,
23538 	      c_lcpMasterTakeOverState.minFragId);
23539 
23540     infoEvent("-- Node %d LCP MASTER TAKE OVER STATE --", getOwnNodeId());
23541   }
23542 
23543   if (signal->theData[0] == 7015)
23544   {
23545     if (signal->getLength() == 1)
23546     {
23547       signal->theData[1] = 0;
23548     }
23549 
23550     Uint32 tableId = signal->theData[1];
23551     if (tableId < ctabFileSize)
23552     {
23553       signal->theData[0] = 7021;
23554       execDUMP_STATE_ORD(signal);
23555       signal->theData[0] = 7015;
23556       signal->theData[1] = tableId + 1;
23557       sendSignal(reference(), GSN_DUMP_STATE_ORD, signal, 2, JBB);
23558     }
23559   }
23560 
23561   if(arg == DumpStateOrd::EnableUndoDelayDataWrite){
23562     g_eventLogger->info("Dbdih:: delay write of datapages for table = %d",
23563                         dumpState->args[1]);
23564     // Send this dump to ACC and TUP
23565     sendSignal(DBACC_REF, GSN_DUMP_STATE_ORD, signal, 2, JBB);
23566     sendSignal(DBTUP_REF, GSN_DUMP_STATE_ORD, signal, 2, JBB);
23567 
23568     // Start immediate LCP
23569     add_lcp_counter(&c_lcpState.ctimer, (1 << 31));
23570     if (cmasterNodeId == getOwnNodeId())
23571     {
23572       jam();
23573       c_lcpState.immediateLcpStart = true;
23574     }
23575     return;
23576   }
23577 
23578   if (signal->theData[0] == DumpStateOrd::DihAllAllowNodeStart) {
23579     for (Uint32 i = 1; i < MAX_NDB_NODES; i++)
23580       setAllowNodeStart(i, true);
23581     return;
23582   }//if
23583   if (signal->theData[0] == DumpStateOrd::DihMinTimeBetweenLCP) {
23584     // Set time between LCP to min value
23585     if (signal->getLength() == 2)
23586     {
23587       Uint32 tmp;
23588       const ndb_mgm_configuration_iterator * p =
23589 	m_ctx.m_config.getOwnConfigIterator();
23590       ndbrequire(p != 0);
23591       ndb_mgm_get_int_parameter(p, CFG_DB_LCP_INTERVAL, &tmp);
23592       g_eventLogger->info("Reset time between LCP to %u", tmp);
23593       c_lcpState.clcpDelay = tmp;
23594     }
23595     else
23596     {
23597       g_eventLogger->info("Set time between LCP to min value");
23598       c_lcpState.clcpDelay = 0; // TimeBetweenLocalCheckpoints.min
23599     }
23600     return;
23601   }
23602   if (signal->theData[0] == DumpStateOrd::DihMaxTimeBetweenLCP) {
23603     // Set time between LCP to max value
23604     g_eventLogger->info("Set time between LCP to max value");
23605     c_lcpState.clcpDelay = 31; // TimeBetweenLocalCheckpoints.max
23606     return;
23607   }
23608 
23609   if(arg == 7098){
23610     if(signal->length() == 3){
23611       jam();
23612       infoEvent("startLcpRoundLoopLab(tabel=%d, fragment=%d)",
23613 		signal->theData[1], signal->theData[2]);
23614       startLcpRoundLoopLab(signal, signal->theData[1], signal->theData[2]);
23615       return;
23616     } else {
23617       infoEvent("Invalid no of arguments to 7098 - startLcpRoundLoopLab -"
23618 		" expected 2 (tableId, fragmentId)");
23619     }
23620   }
23621 
23622   if (arg == DumpStateOrd::DihStartLcpImmediately)
23623   {
23624     jam();
23625     if (cmasterNodeId == getOwnNodeId())
23626     {
23627       jam();
23628       c_lcpState.immediateLcpStart = true;
23629       return;
23630     }
23631 
23632     add_lcp_counter(&c_lcpState.ctimer, (1 << 31));
23633     /**
23634      * If sent from local LQH, forward to master
23635      */
23636     if (refToMain(signal->getSendersBlockRef()) == DBLQH)
23637     {
23638       jam();
23639       sendSignal(cmasterdihref, GSN_DUMP_STATE_ORD, signal, 1, JBB);
23640     }
23641     return;
23642   }
23643 
23644   if (arg == DumpStateOrd::DihSetTimeBetweenGcp)
23645   {
23646     Uint32 tmp = 0;
23647     if (signal->getLength() == 1)
23648     {
23649       const ndb_mgm_configuration_iterator * p =
23650 	m_ctx.m_config.getOwnConfigIterator();
23651       ndbrequire(p != 0);
23652       ndb_mgm_get_int_parameter(p, CFG_DB_GCP_INTERVAL, &tmp);
23653     }
23654     else
23655     {
23656       tmp = signal->theData[1];
23657     }
23658     m_gcp_save.m_master.m_time_between_gcp = tmp;
23659     g_eventLogger->info("Setting time between gcp : %d", tmp);
23660   }
23661 
23662   if (arg == 7021 && signal->getLength() == 2)
23663   {
23664     TabRecordPtr tabPtr;
23665     tabPtr.i = signal->theData[1];
23666     if (tabPtr.i >= ctabFileSize)
23667       return;
23668 
23669     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
23670 
23671     if(tabPtr.p->tabStatus != TabRecord::TS_ACTIVE)
23672       return;
23673 
23674     infoEvent
23675       ("Table %d: TabCopyStatus: %d TabUpdateStatus: %d TabLcpStatus: %d",
23676        tabPtr.i,
23677        tabPtr.p->tabCopyStatus,
23678        tabPtr.p->tabUpdateState,
23679        tabPtr.p->tabLcpStatus);
23680 
23681     FragmentstorePtr fragPtr;
23682     for (Uint32 fid = 0; fid < tabPtr.p->totalfragments; fid++) {
23683       jam();
23684       getFragstore(tabPtr.p, fid, fragPtr);
23685 
23686       char buf[100], buf2[100];
23687       BaseString::snprintf(buf, sizeof(buf), " Fragment %d: noLcpReplicas==%d ",
23688 			   fid, fragPtr.p->noLcpReplicas);
23689 
23690       Uint32 num=0;
23691       ReplicaRecordPtr replicaPtr;
23692       replicaPtr.i = fragPtr.p->storedReplicas;
23693       do {
23694         c_replicaRecordPool.getPtr(replicaPtr);
23695 	BaseString::snprintf(buf2, sizeof(buf2), "%s %d(on %d)=%d(%s)",
23696 			     buf, num,
23697 			     replicaPtr.p->procNode,
23698 			     replicaPtr.p->lcpIdStarted,
23699 			     replicaPtr.p->lcpOngoingFlag ? "Ongoing" : "Idle");
23700 	BaseString::snprintf(buf, sizeof(buf), "%s", buf2);
23701 
23702 	num++;
23703 	replicaPtr.i = replicaPtr.p->nextPool;
23704       } while (replicaPtr.i != RNIL);
23705       infoEvent("%s", buf);
23706     }
23707   }
23708 
23709   if (arg == 7022)
23710   {
23711     jam();
23712     crashSystemAtGcpStop(signal, true);
23713   }
23714 
23715   if (arg == 7025)
23716   {
23717     jam();
23718     dumpGcpStop();
23719     return;
23720   }
23721 
23722 #ifdef GCP_TIMER_HACK
23723   if (signal->theData[0] == 7901)
23724     globalData.gcp_timer_limit = signal->theData[1];
23725 #endif
23726   if (arg == 7023)
23727   {
23728     /**
23729      * Dump all active TakeOver
23730      */
23731     Ptr<TakeOverRecord> ptr;
23732     ptr.i = signal->theData[1];
23733     if (signal->getLength() == 1)
23734     {
23735       infoEvent("Starting dump all active take-over");
23736       c_masterActiveTakeOverList.first(ptr);
23737     }
23738 
23739     if (ptr.i == RNIL)
23740     {
23741       infoEvent("Dump all active take-over done");
23742       return;
23743     }
23744 
23745     c_masterActiveTakeOverList.getPtr(ptr);
23746     infoEvent("TakeOverPtr(%u) starting: %u flags: 0x%x ref: 0x%x, data: %u",
23747               ptr.i,
23748               ptr.p->toStartingNode,
23749               ptr.p->m_flags,
23750               ptr.p->m_senderRef,
23751               ptr.p->m_senderData);
23752     infoEvent("slaveState: %u masterState: %u",
23753               ptr.p->toSlaveStatus, ptr.p->toMasterStatus);
23754     infoEvent("restorableGci: %u startGci: %u tab: %u frag: %u src: %u max: %u",
23755               ptr.p->restorableGci, ptr.p->startGci,
23756               ptr.p->toCurrentTabref, ptr.p->toCurrentFragid,
23757               ptr.p->toCopyNode, ptr.p->maxPage);
23758 
23759     c_masterActiveTakeOverList.next(ptr);
23760     signal->theData[0] = arg;
23761     signal->theData[1] = ptr.i;
23762   }
23763 
23764   if (arg == DumpStateOrd::DihDumpPageRecInfo)
23765   {
23766     jam();
23767     ndbout_c("MAX_CONCURRENT_LCP_TAB_DEF_FLUSHES %u", MAX_CONCURRENT_LCP_TAB_DEF_FLUSHES);
23768     ndbout_c("MAX_CONCURRENT_DIH_TAB_DEF_OPS %u", MAX_CONCURRENT_DIH_TAB_DEF_OPS);
23769     ndbout_c("MAX_CRASHED_REPLICAS %u", MAX_CRASHED_REPLICAS);
23770     ndbout_c("MAX_LCP_STORED %u", MAX_LCP_STORED);
23771     ndbout_c("MAX_REPLICAS %u", MAX_REPLICAS);
23772     ndbout_c("MAX_NDB_PARTITIONS %u", MAX_NDB_PARTITIONS);
23773     ndbout_c("PACK_REPLICAS_WORDS %u", PACK_REPLICAS_WORDS);
23774     ndbout_c("PACK_FRAGMENT_WORDS %u", PACK_FRAGMENT_WORDS);
23775     ndbout_c("PACK_TABLE_WORDS %u", PACK_TABLE_WORDS);
23776     ndbout_c("PACK_TABLE_PAGE_WORDS %u", PACK_TABLE_PAGE_WORDS);
23777     ndbout_c("PACK_TABLE_PAGES %u", PACK_TABLE_PAGES);
23778     ndbout_c("ZPAGEREC %u", ZPAGEREC);
23779     ndbout_c("Total bytes : %lu", ZPAGEREC * sizeof(PageRecord));
23780     ndbout_c("LCP Tab def write ops inUse %u queued %u",
23781              c_lcpTabDefWritesControl.inUse,
23782              c_lcpTabDefWritesControl.queuedRequests);
23783 
23784     if (getNodeState().startLevel < NodeState::SL_STARTING)
23785       return ;
23786 
23787     Uint32 freeCount = 0;
23788     PageRecordPtr tmp;
23789     tmp.i = cfirstfreepage;
23790     while (tmp.i != RNIL)
23791     {
23792       jam();
23793       ptrCheckGuard(tmp, cpageFileSize, pageRecord);
23794       freeCount++;
23795       tmp.i = tmp.p->nextfreepage;
23796     };
23797     ndbout_c("Pages in use %u/%u", cpageFileSize - freeCount, cpageFileSize);
23798     return;
23799   }
23800 
23801   if (arg == DumpStateOrd::SchemaResourceSnapshot)
23802   {
23803     RSS_OP_SNAPSHOT_SAVE(cremainingfrags);
23804     RSS_OP_SNAPSHOT_SAVE(cnoFreeReplicaRec);
23805 
23806     {
23807       Uint32 cnghash = 0;
23808       NodeGroupRecordPtr NGPtr;
23809       for (Uint32 i = 0; i<cnoOfNodeGroups; i++)
23810       {
23811         NGPtr.i = c_node_groups[i];
23812         ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
23813         cnghash = (cnghash * 33) + NGPtr.p->m_ref_count;
23814       }
23815       RSS_OP_SNAPSHOT_SAVE(cnghash);
23816     }
23817     return;
23818   }
23819 
23820   if (arg == DumpStateOrd::SchemaResourceCheckLeak)
23821   {
23822     RSS_OP_SNAPSHOT_CHECK(cremainingfrags);
23823     RSS_OP_SNAPSHOT_SAVE(cnoFreeReplicaRec);
23824 
23825     {
23826       Uint32 cnghash = 0;
23827       NodeGroupRecordPtr NGPtr;
23828       for (Uint32 i = 0; i<cnoOfNodeGroups; i++)
23829       {
23830         NGPtr.i = c_node_groups[i];
23831         ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
23832         cnghash = (cnghash * 33) + NGPtr.p->m_ref_count;
23833       }
23834       RSS_OP_SNAPSHOT_CHECK(cnghash);
23835     }
23836   }
23837 
23838   /* Checks whether add frag failure was cleaned up.
23839    * Should NOT be used while commands involving addFragReq
23840    * are being performed.
23841    */
23842   if (arg == DumpStateOrd::DihAddFragFailCleanedUp && signal->length() == 2)
23843   {
23844     jam();
23845     TabRecordPtr tabPtr;
23846     tabPtr.i = signal->theData[1];
23847     if (tabPtr.i >= ctabFileSize)
23848       return;
23849 
23850     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
23851 
23852     if (tabPtr.p->m_new_map_ptr_i == RNIL)
23853     {
23854       jam();
23855       infoEvent("DIH : Add frag fail clean up ok for table %u", tabPtr.i);
23856     }
23857     else
23858     {
23859       jam();
23860       warningEvent("new_map_ptr_i to table id %d is not NIL", tabPtr.i);
23861       /*
23862         This ndbrequire is needed by the runFailAddPartition() test case.
23863         This dump code is *not* intended for interactive usage, as the node
23864         is likely to crash.
23865       */
23866       ndbrequire(false);
23867     }
23868   }
23869   if (arg == DumpStateOrd::DihDisplayPauseState)
23870   {
23871     infoEvent("Pause LCP ref: %x, is_lcp_paused %u,"
23872               " c_dequeue_lcp_rep_ongoing %u",
23873               cmasterdihref,
23874               is_lcp_paused(),
23875               c_dequeue_lcp_rep_ongoing);
23876     infoEvent("c_pause_lcp_master_state: %u,"
23877               " c_old_node_waiting_for_lcp_end: %u",
23878               Uint32(c_pause_lcp_master_state),
23879               c_old_node_waiting_for_lcp_end);
23880     infoEvent("c_queued_lcp_complete_rep: %u,"
23881               " c_lcp_id_paused: %u",
23882               c_queued_lcp_complete_rep,
23883               c_lcp_id_paused);
23884     infoEvent("c_last_id_lcp_complete_rep: %u"
23885               " c_lcp_runs_with_pause_support: %u",
23886               c_last_id_lcp_complete_rep,
23887               c_lcp_runs_with_pause_support);
23888     infoEvent("c_lcp_id_while_copy_meta_data: %u, c_pause_lcp_start_node: %u",
23889               c_lcp_id_while_copy_meta_data,
23890               c_pause_lcp_start_node);
23891     infoEvent("c_PAUSE_LCP_REQ_Counter: %s",
23892               c_PAUSE_LCP_REQ_Counter.getText());
23893     infoEvent("c_FLUSH_LCP_REP_REQ_Counter: %s",
23894               c_FLUSH_LCP_REP_REQ_Counter.getText());
23895     if (isMaster())
23896     {
23897       char buf[100];
23898       infoEvent("c_lcpState.m_participatingLQH: %s",
23899                 c_lcpState.m_participatingLQH.getText(buf));
23900       infoEvent("c_pause_participants: %s",
23901                 c_pause_participants.getText(buf));
23902     }
23903   }
23904 
23905   DECLARE_DUMP0(DBDIH, 7213, "Set error 7213 with extra arg")
23906   {
23907     SET_ERROR_INSERT_VALUE2(7213, signal->theData[1]);
23908     return;
23909   }
23910   DECLARE_DUMP0(DBDIH, 7214, "Set error 7214 with extra arg")
23911   {
23912     SET_ERROR_INSERT_VALUE2(7214, signal->theData[1]);
23913     return;
23914   }
23915 
23916   DECLARE_DUMP0(DBDIH, 7216, "Set error 7216 with extra arg")
23917   {
23918     SET_ERROR_INSERT_VALUE2(7216, signal->theData[1]);
23919     return;
23920   }
23921   DECLARE_DUMP0(DBDIH, 6099, "Start microgcp")
23922   {
23923     if (isMaster())
23924     {
23925       jam();
23926       // Invalidating timestamp will force an immediate microGCP
23927       NdbTick_Invalidate(&m_micro_gcp.m_master.m_start_time);
23928     }
23929     else
23930     {
23931       jam();
23932       sendSignal(cmasterdihref, GSN_DUMP_STATE_ORD, signal, 1, JBB);
23933     }
23934     return;
23935   }
23936   DECLARE_DUMP0(DBDIH, 7999, "Set error code with extra arg")
23937   {
23938     SET_ERROR_INSERT_VALUE2(signal->theData[1],
23939                             signal->theData[2]);
23940   }
23941 
23942   if (arg == DumpStateOrd::DihSetGcpStopVals)
23943   {
23944     jam();
23945     if (signal->getLength() != 3)
23946     {
23947       jam();
23948       return;
23949     }
23950     if (signal->theData[1] == 0)
23951     {
23952       g_eventLogger->info("Changing GCP_COMMIT max_lag_millis from %u to %u",
23953                           m_gcp_monitor.m_micro_gcp.m_max_lag_ms,
23954                           signal->theData[2]);
23955       m_gcp_monitor.m_micro_gcp.m_max_lag_ms = signal->theData[2];
23956     }
23957     else
23958     {
23959       g_eventLogger->info("Changing GCP_SAVE max_lag_millis from %u to %u",
23960                           m_gcp_monitor.m_gcp_save.m_max_lag_ms,
23961                           signal->theData[2]);
23962       m_gcp_monitor.m_gcp_save.m_max_lag_ms = signal->theData[2];
23963     }
23964   }
23965 
23966 
23967 }//Dbdih::execDUMP_STATE_ORD()
23968 
23969 void
execPREP_DROP_TAB_REQ(Signal * signal)23970 Dbdih::execPREP_DROP_TAB_REQ(Signal* signal){
23971   jamEntry();
23972 
23973   PrepDropTabReq* req = (PrepDropTabReq*)signal->getDataPtr();
23974 
23975   TabRecordPtr tabPtr;
23976   tabPtr.i = req->tableId;
23977   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
23978 
23979   Uint32 senderRef = req->senderRef;
23980   Uint32 senderData = req->senderData;
23981 
23982   PrepDropTabRef::ErrorCode err = PrepDropTabRef::OK;
23983   { /**
23984      * Check table state
23985      */
23986     bool ok = false;
23987     switch(tabPtr.p->tabStatus){
23988     case TabRecord::TS_IDLE:
23989       ok = true;
23990       jam();
23991       err = PrepDropTabRef::NoSuchTable;
23992       break;
23993     case TabRecord::TS_DROPPING:
23994       ok = true;
23995       jam();
23996       err = PrepDropTabRef::PrepDropInProgress;
23997       break;
23998     case TabRecord::TS_CREATING:
23999       jam();
24000       ok = true;
24001       break;
24002     case TabRecord::TS_ACTIVE:
24003       ok = true;
24004       jam();
24005       break;
24006     }
24007     ndbrequire(ok);
24008   }
24009 
24010   if(err != PrepDropTabRef::OK)
24011   {
24012     jam();
24013     PrepDropTabRef* ref = (PrepDropTabRef*)signal->getDataPtrSend();
24014     ref->senderRef = reference();
24015     ref->senderData = senderData;
24016     ref->tableId = tabPtr.i;
24017     ref->errorCode = err;
24018     sendSignal(senderRef, GSN_PREP_DROP_TAB_REF, signal,
24019 	       PrepDropTabRef::SignalLength, JBB);
24020     return;
24021   }
24022 
24023   tabPtr.p->tabStatus = TabRecord::TS_DROPPING;
24024   PrepDropTabConf* conf = (PrepDropTabConf*)signal->getDataPtrSend();
24025   conf->tableId = tabPtr.i;
24026   conf->senderRef = reference();
24027   conf->senderData = senderData;
24028   sendSignal(senderRef, GSN_PREP_DROP_TAB_CONF,
24029              signal, PrepDropTabConf::SignalLength, JBB);
24030 }
24031 
24032 void
waitDropTabWritingToFile(Signal * signal,TabRecordPtr tabPtr)24033 Dbdih::waitDropTabWritingToFile(Signal* signal, TabRecordPtr tabPtr){
24034 
24035   if (tabPtr.p->tabLcpStatus == TabRecord::TLS_WRITING_TO_FILE)
24036   {
24037     jam();
24038     signal->theData[0] = DihContinueB::WAIT_DROP_TAB_WRITING_TO_FILE;
24039     signal->theData[1] = tabPtr.i;
24040     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
24041                         WaitTableStateChangeMillis, 2);
24042     return;
24043   }
24044 
24045   if (tabPtr.p->tabUpdateState != TabRecord::US_IDLE)
24046   {
24047     jam();
24048     signal->theData[0] = DihContinueB::WAIT_DROP_TAB_WRITING_TO_FILE;
24049     signal->theData[1] = tabPtr.i;
24050     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
24051                         WaitTableStateChangeMillis, 2);
24052     return;
24053   }
24054 
24055   ndbrequire(tabPtr.p->tabLcpStatus ==  TabRecord::TLS_COMPLETED);
24056   checkDropTabComplete(signal, tabPtr);
24057 }
24058 
void
Dbdih::checkDropTabComplete(Signal* signal, TabRecordPtr tabPtr)
{
  // Preconditions for the drop are satisfied (no LCP write or
  // meta-data update in progress); start deleting the table file(s).
  startDeleteFile(signal, tabPtr);
}
24064 
24065 void
execNDB_TAMPER(Signal * signal)24066 Dbdih::execNDB_TAMPER(Signal* signal)
24067 {
24068   if ((ERROR_INSERTED(7011)) &&
24069       (signal->theData[0] == 7012)) {
24070     CLEAR_ERROR_INSERT_VALUE;
24071     calculateKeepGciLab(signal, 0, 0);
24072     return;
24073   }//if
24074   if (signal->getLength() == 1)
24075   {
24076     SET_ERROR_INSERT_VALUE2(signal->theData[0],
24077                             0);
24078   }
24079   else
24080   {
24081     SET_ERROR_INSERT_VALUE2(signal->theData[0],
24082                             signal->theData[1]);
24083   }
24084   return;
24085 }//Dbdih::execNDB_TAMPER()
24086 
// Block transaction commits until UNBLOCK_COMMIT_ORD arrives; the
// stored failNo identifies the node-failure round being processed.
void Dbdih::execBLOCK_COMMIT_ORD(Signal* signal){
  BlockCommitOrd* const block = (BlockCommitOrd *)&signal->theData[0];

  jamEntry();

  c_blockCommit = true;
  c_blockCommitNo = block->failNo;
}
24095 
execUNBLOCK_COMMIT_ORD(Signal * signal)24096 void Dbdih::execUNBLOCK_COMMIT_ORD(Signal* signal){
24097   UnblockCommitOrd* const unblock = (UnblockCommitOrd *)&signal->theData[0];
24098   (void)unblock;
24099 
24100   jamEntry();
24101 
24102   if(c_blockCommit == true)
24103   {
24104     jam();
24105 
24106     c_blockCommit = false;
24107     for (Uint32 i = 0; i<c_diverify_queue_cnt; i++)
24108     {
24109       c_diverify_queue[i].m_empty_done = 0;
24110       emptyverificbuffer(signal, i, true);
24111     }
24112   }
24113 }
24114 
// Handle a request for permission to stop a node.  On the master this
// serializes node stops via c_stopPermMaster and the switch-primary
// mutex; on non-masters it merely proxies the request to the master.
void Dbdih::execSTOP_PERM_REQ(Signal* signal){

  jamEntry();

  // req and ref alias the same signal buffer; req fields are read
  // before any ref field is written.
  StopPermReq* const req = (StopPermReq*)&signal->theData[0];
  StopPermRef* const ref = (StopPermRef*)&signal->theData[0];

  const Uint32 senderData = req->senderData;
  const BlockReference senderRef = req->senderRef;
  const NodeId nodeId = refToNode(senderRef);

  if (isMaster()) {
    /**
     * Master
     */
    jam();
    CRASH_INSERTION(7065);
    if (c_stopPermMaster.clientRef != 0) {
      jam();

      // Another node shutdown already holds the permission slot.
      ref->senderData = senderData;
      ref->errorCode  = StopPermRef::NodeShutdownInProgress;
      sendSignal(senderRef, GSN_STOP_PERM_REF, signal,
                 StopPermRef::SignalLength, JBB);
      return;
    }//if

    if (c_nodeStartMaster.activeState) {
      jam();
      // A node start is in progress; stop and start are mutually
      // exclusive on the master.
      ref->senderData = senderData;
      ref->errorCode  = StopPermRef::NodeStartInProgress;
      sendSignal(senderRef, GSN_STOP_PERM_REF, signal,
                 StopPermRef::SignalLength, JBB);
      return;
    }//if

    /**
     * Lock
     */
    c_nodeStartMaster.activeState = true;
    c_stopPermMaster.clientRef = senderRef;

    c_stopPermMaster.clientData = senderData;
    c_stopPermMaster.returnValue = 0;
    c_switchReplicas.clear();

    // Take the switch-primary mutex; once granted, the callback starts
    // moving primary replicas away from the stopping node.
    Mutex mutex(signal, c_mutexMgr, c_switchPrimaryMutexHandle);
    Callback c = { safe_cast(&Dbdih::switch_primary_stop_node), nodeId };
    ndbrequire(mutex.lock(c));
  } else {
    /**
     * Proxy part
     */
    jam();
    CRASH_INSERTION(7066);
    if(c_stopPermProxy.clientRef != 0){
      jam();
      // Only one proxied stop request may be outstanding at a time.
      ref->senderData = senderData;
      ref->errorCode = StopPermRef::NodeShutdownInProgress;
      sendSignal(senderRef, GSN_STOP_PERM_REF, signal, 2, JBB);
      return;
    }//if

    // Remember the client so the master's reply can be forwarded back.
    c_stopPermProxy.clientRef = senderRef;
    c_stopPermProxy.masterRef = cmasterdihref;
    c_stopPermProxy.clientData = senderData;

    req->senderRef = reference();
    req->senderData = senderData;
    sendSignal(cmasterdihref, GSN_STOP_PERM_REQ, signal,
	       StopPermReq::SignalLength, JBB);
  }//if
}//Dbdih::execSTOP_PERM_REQ()
24188 
24189 void
switch_primary_stop_node(Signal * signal,Uint32 node_id,Uint32 ret_val)24190 Dbdih::switch_primary_stop_node(Signal* signal, Uint32 node_id, Uint32 ret_val)
24191 {
24192   ndbrequire(ret_val == 0);
24193   signal->theData[0] = DihContinueB::SwitchReplica;
24194   signal->theData[1] = node_id;
24195   signal->theData[2] = 0; // table id
24196   signal->theData[3] = 0; // fragment id
24197   sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
24198 }
24199 
// Master refused our proxied STOP_PERM_REQ: forward the refusal to the
// original client and free the proxy slot.
void Dbdih::execSTOP_PERM_REF(Signal* signal)
{
  jamEntry();
  ndbrequire(c_stopPermProxy.clientRef != 0);
  ndbrequire(c_stopPermProxy.masterRef == signal->senderBlockRef());
  sendSignal(c_stopPermProxy.clientRef, GSN_STOP_PERM_REF, signal, 2, JBB);
  c_stopPermProxy.clientRef = 0;
}//Dbdih::execSTOP_PERM_REF()
24208 
// Master granted our proxied STOP_PERM_REQ: forward the confirmation
// to the original client and free the proxy slot.
void Dbdih::execSTOP_PERM_CONF(Signal* signal)
{
  jamEntry();
  ndbrequire(c_stopPermProxy.clientRef != 0);
  ndbrequire(c_stopPermProxy.masterRef == signal->senderBlockRef());
  sendSignal(c_stopPermProxy.clientRef, GSN_STOP_PERM_CONF, signal, 1, JBB);
  c_stopPermProxy.clientRef = 0;
}//Dbdih::execSTOP_PERM_CONF()
24217 
execDIH_SWITCH_REPLICA_REQ(Signal * signal)24218 void Dbdih::execDIH_SWITCH_REPLICA_REQ(Signal* signal)
24219 {
24220   jamEntry();
24221   DihSwitchReplicaReq* const req = (DihSwitchReplicaReq*)&signal->theData[0];
24222   const Uint32 tableId = req->tableId;
24223   const Uint32 fragNo = req->fragNo;
24224   const BlockReference senderRef = req->senderRef;
24225 
24226   CRASH_INSERTION(7067);
24227   TabRecordPtr tabPtr;
24228   tabPtr.i = tableId;
24229   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
24230 
24231   ndbrequire(tabPtr.p->tabStatus == TabRecord::TS_ACTIVE);
24232   if (tabPtr.p->tabCopyStatus != TabRecord::CS_IDLE) {
24233     jam();
24234     sendSignal(reference(), GSN_DIH_SWITCH_REPLICA_REQ, signal,
24235 	       DihSwitchReplicaReq::SignalLength, JBB);
24236     return;
24237   }//if
24238   FragmentstorePtr fragPtr;
24239   getFragstore(tabPtr.p, fragNo, fragPtr);
24240 
24241   /**
24242    * Do funky stuff
24243    */
24244   Uint32 oldOrder[MAX_REPLICAS];
24245   const Uint32 noOfReplicas = extractNodeInfo(jamBuffer(),
24246                                               fragPtr.p,
24247                                               oldOrder);
24248 
24249   if (noOfReplicas < req->noOfReplicas) {
24250     jam();
24251     //---------------------------------------------------------------------
24252     // A crash occurred in the middle of our switch handling.
24253     //---------------------------------------------------------------------
24254     DihSwitchReplicaRef* const ref = (DihSwitchReplicaRef*)&signal->theData[0];
24255     ref->senderNode = cownNodeId;
24256     ref->errorCode = StopPermRef::NF_CausedAbortOfStopProcedure;
24257     sendSignal(senderRef, GSN_DIH_SWITCH_REPLICA_REF, signal,
24258                DihSwitchReplicaRef::SignalLength, JBB);
24259   }//if
24260 
24261   DIH_TAB_WRITE_LOCK(tabPtr.p);
24262   for (Uint32 i = 0; i < noOfReplicas; i++) {
24263     jam();
24264     ndbrequire(i < MAX_REPLICAS);
24265     fragPtr.p->activeNodes[i] = req->newNodeOrder[i];
24266   }//for
24267   DIH_TAB_WRITE_UNLOCK(tabPtr.p);
24268 
24269   /**
24270    * Reply
24271    */
24272   DihSwitchReplicaConf* const conf = (DihSwitchReplicaConf*)&signal->theData[0];
24273   conf->senderNode = cownNodeId;
24274   sendSignal(senderRef, GSN_DIH_SWITCH_REPLICA_CONF, signal,
24275              DihSwitchReplicaConf::SignalLength, JBB);
24276 }//Dbdih::execDIH_SWITCH_REPLICA_REQ()
24277 
// A participant confirmed the replica switch for the current fragment;
// count it towards completion of the round (master side).
void Dbdih::execDIH_SWITCH_REPLICA_CONF(Signal* signal)
{
  jamEntry();
  /**
   * Response to master
   */
  CRASH_INSERTION(7068);
  DihSwitchReplicaConf* const conf = (DihSwitchReplicaConf*)&signal->theData[0];
  switchReplicaReply(signal, conf->senderNode);
}//Dbdih::execDIH_SWITCH_REPLICA_CONF()
24288 
execDIH_SWITCH_REPLICA_REF(Signal * signal)24289 void Dbdih::execDIH_SWITCH_REPLICA_REF(Signal* signal)
24290 {
24291   jamEntry();
24292   DihSwitchReplicaRef* const ref = (DihSwitchReplicaRef*)&signal->theData[0];
24293   if(c_stopPermMaster.returnValue == 0){
24294     jam();
24295     c_stopPermMaster.returnValue = ref->errorCode;
24296   }//if
24297   switchReplicaReply(signal, ref->senderNode);
24298 }//Dbdih::execDIH_SWITCH_REPLICA_REF()
24299 
// Collect one CONF/REF for the current DIH_SWITCH_REPLICA round.
// Only when replies from all participating nodes have arrived does
// execution continue past receiveLoopMacro (the macro returns early
// otherwise); then the scan advances to the next fragment.
void Dbdih::switchReplicaReply(Signal* signal,
			       NodeId nodeId){
  jam();
  receiveLoopMacro(DIH_SWITCH_REPLICA_REQ, nodeId);
  //------------------------------------------------------
  // We have received all responses from the nodes. Thus
  // we have completed switching replica roles. Continue
  // with the next fragment.
  //------------------------------------------------------
  if(c_stopPermMaster.returnValue != 0){
    jam();
    // An error occurred: push tableId past the end so the scan in
    // switchReplica() terminates and reports the error.
    c_switchReplicas.tableId = ctabFileSize + 1;
  }//if
  c_switchReplicas.fragNo++;

  signal->theData[0] = DihContinueB::SwitchReplica;
  signal->theData[1] = c_switchReplicas.nodeId;
  signal->theData[2] = c_switchReplicas.tableId;
  signal->theData[3] = c_switchReplicas.fragNo;
  sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
}//Dbdih::switchReplicaReply()
24321 
// Scan tables/fragments for fragments whose primary replica lives on
// the stopping node (nodeId), and for each such fragment ask all nodes
// to rotate the replica order so another node becomes primary.  Scans
// at most RT_BREAK fragments per invocation, then reschedules itself
// via CONTINUEB (real-time break).  When all tables are done, replies
// STOP_PERM_CONF/REF to the client and releases the lock and mutex.
void
Dbdih::switchReplica(Signal* signal,
		     Uint32 nodeId,
		     Uint32 tableId,
		     Uint32 fragNo){
  jam();
  DihSwitchReplicaReq* const req = (DihSwitchReplicaReq*)&signal->theData[0];

  // Max fragments examined per invocation before yielding.
  const Uint32 RT_BREAK = 64;

  for (Uint32 i = 0; i < RT_BREAK; i++) {
    jam();
    if (tableId >= ctabFileSize) {
      jam();
      // conf and ref alias the same signal buffer; only one is used.
      StopPermConf* const conf = (StopPermConf*)&signal->theData[0];
      StopPermRef*  const ref  = (StopPermRef*)&signal->theData[0];
      /**
       * Finished with all tables
       */
      if(c_stopPermMaster.returnValue == 0) {
	jam();
	conf->senderData = c_stopPermMaster.clientData;
	sendSignal(c_stopPermMaster.clientRef, GSN_STOP_PERM_CONF,
		   signal, 1, JBB);
      } else {
        jam();
        ref->senderData = c_stopPermMaster.clientData;
        ref->errorCode  = c_stopPermMaster.returnValue;
        sendSignal(c_stopPermMaster.clientRef, GSN_STOP_PERM_REF, signal, 2,JBB);
      }//if

      /**
       * UnLock
       */
      c_nodeStartMaster.activeState = false;
      c_stopPermMaster.clientRef = 0;
      c_stopPermMaster.clientData = 0;
      c_stopPermMaster.returnValue = 0;
      Mutex mutex(signal, c_mutexMgr, c_switchPrimaryMutexHandle);
      mutex.unlock(); // ignore result
      return;
    }//if

    TabRecordPtr tabPtr;
    tabPtr.i = tableId;
    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

    // Skip non-active tables entirely.
    if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE) {
      jam();
      tableId++;
      fragNo = 0;
      continue;
    }//if
    // Done with this table's fragments; move to the next table.
    if (fragNo >= tabPtr.p->totalfragments) {
      jam();
      tableId++;
      fragNo = 0;
      continue;
    }//if
    FragmentstorePtr fragPtr;
    getFragstore(tabPtr.p, fragNo, fragPtr);

    Uint32 oldOrder[MAX_REPLICAS];
    const Uint32 noOfReplicas = extractNodeInfo(jamBuffer(),
                                                fragPtr.p,
                                                oldOrder);

    // Only fragments whose current primary is the stopping node need
    // a switch.
    if(oldOrder[0] != nodeId) {
      jam();
      fragNo++;
      continue;
    }//if
    // Rotate the order: shift replicas up one slot, demote the
    // stopping node to the last position.
    req->tableId = tableId;
    req->fragNo = fragNo;
    req->noOfReplicas = noOfReplicas;
    for (Uint32 i = 0; i < (noOfReplicas - 1); i++) {
      req->newNodeOrder[i] = oldOrder[i+1];
    }//for
    req->newNodeOrder[noOfReplicas-1] = nodeId;
    req->senderRef = reference();

    /**
     * Initialize struct
     */
    c_switchReplicas.tableId = tableId;
    c_switchReplicas.fragNo = fragNo;
    c_switchReplicas.nodeId = nodeId;

    // Broadcast the request to all participating nodes; replies are
    // collected in switchReplicaReply().
    sendLoopMacro(DIH_SWITCH_REPLICA_REQ, sendDIH_SWITCH_REPLICA_REQ, RNIL);
    return;
  }//for

  // Real-time break: continue the scan from (tableId, fragNo) later.
  signal->theData[0] = DihContinueB::SwitchReplica;
  signal->theData[1] = nodeId;
  signal->theData[2] = tableId;
  signal->theData[3] = fragNo;
  sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
}//Dbdih::switchReplica()
24420 
/**
 * STOP_ME_REQ: take a node out of transaction handling prior to a
 * graceful stop.  For a remote requester we only mark its node record
 * unusable and confirm; for a local request we additionally broadcast
 * STOP_ME_REQ to all other DIH instances and confirm to ourselves.
 */
void Dbdih::execSTOP_ME_REQ(Signal* signal)
{
  jamEntry();
  StopMeReq* const req = (StopMeReq*)&signal->theData[0];
  const BlockReference senderRef = req->senderRef;
  const Uint32 senderData = req->senderData;
  const Uint32 nodeId = refToNode(senderRef);
  {
    /**
     * Set node dead (remove from operations)
     */
    NodeRecordPtr nodePtr;
    nodePtr.i = nodeId;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    nodePtr.p->useInTransactions = false;
  }
  if (nodeId != getOwnNodeId()) {
    jam();
    // Remote requester: just confirm straight back.
    StopMeConf * const stopMeConf = (StopMeConf *)&signal->theData[0];
    stopMeConf->senderData = senderData;
    stopMeConf->senderRef  = reference();
    sendSignal(senderRef, GSN_STOP_ME_CONF, signal,
	       StopMeConf::SignalLength, JBB);
    return;
  }//if

  /**
   * Local signal
   */
  jam();
  // Only one local STOP_ME_REQ may be outstanding at a time.
  ndbrequire(c_stopMe.clientRef == 0);

  c_stopMe.clientData  = senderData;
  c_stopMe.clientRef   = senderRef;

  req->senderData = senderData;
  req->senderRef  = reference();

  // Broadcast to all nodes tracked by the STOP_ME_REQ signal counter.
  sendLoopMacro(STOP_ME_REQ, sendSTOP_ME_REQ, RNIL);

  /**
   * Send conf to self
   */
  StopMeConf * const stopMeConf = (StopMeConf *)&signal->theData[0];
  stopMeConf->senderData = senderData;
  stopMeConf->senderRef  = reference();
  sendSignal(reference(), GSN_STOP_ME_CONF, signal,
	     StopMeConf::SignalLength, JBB);
}//Dbdih::execSTOP_ME_REQ()
24470 
/**
 * STOP_ME_REF is never expected: the STOP_ME_REQ handler above always
 * answers with STOP_ME_CONF, so a REF indicates a protocol error.
 */
void Dbdih::execSTOP_ME_REF(Signal* signal)
{
  ndbrequire(false);
}
24475 
/**
 * STOP_ME_CONF: one node acknowledged our broadcast STOP_ME_REQ.
 * receiveLoopMacro returns from this function until the last
 * outstanding node has answered; then we confirm to the original
 * client and release the c_stopMe record.
 */
void Dbdih::execSTOP_ME_CONF(Signal* signal)
{
  jamEntry();
  StopMeConf * const stopMeConf = (StopMeConf *)&signal->theData[0];

  const Uint32 senderRef  = stopMeConf->senderRef;
  const Uint32 senderData = stopMeConf->senderData;
  const Uint32 nodeId     = refToNode(senderRef);

  // A local STOP_ME_REQ must be in progress and match this reply.
  ndbrequire(c_stopMe.clientRef != 0);
  ndbrequire(c_stopMe.clientData == senderData);

  receiveLoopMacro(STOP_ME_REQ, nodeId);
  //---------------------------------------------------------
  // All STOP_ME_REQ have been received. We will send the
  // confirmation back to the requesting block.
  //---------------------------------------------------------

  stopMeConf->senderRef = reference();
  stopMeConf->senderData = c_stopMe.clientData;
  sendSignal(c_stopMe.clientRef, GSN_STOP_ME_CONF, signal,
	     StopMeConf::SignalLength, JBB);
  c_stopMe.clientRef = 0;  // unlock for the next local STOP_ME_REQ
}//Dbdih::execSTOP_ME_CONF()
24500 
execWAIT_GCP_REQ(Signal * signal)24501 void Dbdih::execWAIT_GCP_REQ(Signal* signal)
24502 {
24503   jamEntry();
24504   WaitGCPReq* const req = (WaitGCPReq*)&signal->theData[0];
24505   WaitGCPRef* const ref = (WaitGCPRef*)&signal->theData[0];
24506   WaitGCPConf* const conf = (WaitGCPConf*)&signal->theData[0];
24507   const Uint32 senderData = req->senderData;
24508   const BlockReference senderRef = req->senderRef;
24509   const Uint32 requestType = req->requestType;
24510   Uint32 errorCode = 0;
24511 
24512   if(requestType == WaitGCPReq::CurrentGCI)
24513   {
24514     jam();
24515     conf->senderData = senderData;
24516     conf->gci_hi = Uint32(m_micro_gcp.m_current_gci >> 32);
24517     conf->gci_lo = Uint32(m_micro_gcp.m_current_gci);
24518     conf->blockStatus = cgcpOrderBlocked;
24519     sendSignal(senderRef, GSN_WAIT_GCP_CONF, signal,
24520 	       WaitGCPConf::SignalLength, JBB);
24521     return;
24522   }//if
24523 
24524   if(requestType == WaitGCPReq::RestartGCI)
24525   {
24526     jam();
24527     conf->senderData = senderData;
24528     conf->gci_hi = Uint32(crestartGci);
24529     conf->gci_lo = 0;
24530     conf->blockStatus = cgcpOrderBlocked;
24531     sendSignal(senderRef, GSN_WAIT_GCP_CONF, signal,
24532 	       WaitGCPConf::SignalLength, JBB);
24533     return;
24534   }//if
24535 
24536   if (requestType == WaitGCPReq::BlockStartGcp)
24537   {
24538     jam();
24539     conf->senderData = senderData;
24540     conf->gci_hi = Uint32(m_micro_gcp.m_current_gci >> 32);
24541     conf->gci_lo = Uint32(m_micro_gcp.m_current_gci);
24542     conf->blockStatus = cgcpOrderBlocked;
24543     sendSignal(senderRef, GSN_WAIT_GCP_CONF, signal,
24544 	       WaitGCPConf::SignalLength, JBB);
24545     cgcpOrderBlocked = 1;
24546     return;
24547   }
24548 
24549   if (requestType == WaitGCPReq::UnblockStartGcp)
24550   {
24551     jam();
24552     conf->senderData = senderData;
24553     conf->gci_hi = Uint32(m_micro_gcp.m_current_gci >> 32);
24554     conf->gci_lo = Uint32(m_micro_gcp.m_current_gci);
24555     conf->blockStatus = cgcpOrderBlocked;
24556     sendSignal(senderRef, GSN_WAIT_GCP_CONF, signal,
24557 	       WaitGCPConf::SignalLength, JBB);
24558     cgcpOrderBlocked = 0;
24559     return;
24560   }
24561 
24562   if(isMaster())
24563   {
24564     /**
24565      * Master
24566      */
24567 
24568     if (!isActiveMaster())
24569     {
24570       ndbassert(cmasterState == MASTER_TAKE_OVER_GCP);
24571       errorCode = WaitGCPRef::NF_MasterTakeOverInProgress;
24572       goto error;
24573     }
24574 
24575     if((requestType == WaitGCPReq::CompleteIfRunning) &&
24576        (m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_IDLE))
24577     {
24578       jam();
24579       conf->senderData = senderData;
24580       conf->gci_hi = Uint32(m_micro_gcp.m_old_gci >> 32);
24581       conf->gci_lo = Uint32(m_micro_gcp.m_old_gci);
24582       conf->blockStatus = cgcpOrderBlocked;
24583       sendSignal(senderRef, GSN_WAIT_GCP_CONF, signal,
24584 		 WaitGCPConf::SignalLength, JBB);
24585       return;
24586     }//if
24587 
24588     WaitGCPMasterPtr ptr;
24589     WaitGCPList * list = &c_waitGCPMasterList;
24590     if (requestType == WaitGCPReq::WaitEpoch)
24591     {
24592       jam();
24593       list = &c_waitEpochMasterList;
24594     }
24595 
24596     if (list->seizeFirst(ptr) == false)
24597     {
24598       jam();
24599       errorCode = WaitGCPRef::NoWaitGCPRecords;
24600       goto error;
24601       return;
24602     }
24603 
24604     ptr.p->clientRef = senderRef;
24605     ptr.p->clientData = senderData;
24606 
24607     if((requestType == WaitGCPReq::CompleteForceStart) &&
24608        (m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_IDLE))
24609     {
24610       jam();
24611       // Invalidating GCP timestamp will force an immediate GCP
24612       NdbTick_Invalidate(&m_micro_gcp.m_master.m_start_time);
24613       NdbTick_Invalidate(&m_gcp_save.m_master.m_start_time);
24614     }//if
24615     return;
24616   }
24617   else
24618   {
24619     /**
24620      * Proxy part
24621      */
24622     jam();
24623     WaitGCPProxyPtr ptr;
24624     if (c_waitGCPProxyList.seizeFirst(ptr) == false)
24625     {
24626       jam();
24627       errorCode = WaitGCPRef::NoWaitGCPRecords;
24628       goto error;
24629     }//if
24630     ptr.p->clientRef = senderRef;
24631     ptr.p->clientData = senderData;
24632     ptr.p->masterRef = cmasterdihref;
24633 
24634     req->senderData = ptr.i;
24635     req->senderRef = reference();
24636     req->requestType = requestType;
24637 
24638     sendSignal(cmasterdihref, GSN_WAIT_GCP_REQ, signal,
24639 	       WaitGCPReq::SignalLength, JBB);
24640     return;
24641   }//if
24642 
24643 error:
24644   ref->senderData = senderData;
24645   ref->errorCode = errorCode;
24646   sendSignal(senderRef, GSN_WAIT_GCP_REF, signal,
24647              WaitGCPRef::SignalLength, JBB);
24648 }//Dbdih::execWAIT_GCP_REQ()
24649 
execWAIT_GCP_REF(Signal * signal)24650 void Dbdih::execWAIT_GCP_REF(Signal* signal)
24651 {
24652   jamEntry();
24653   ndbrequire(!isMaster());
24654   WaitGCPRef* const ref = (WaitGCPRef*)&signal->theData[0];
24655 
24656   const Uint32 proxyPtr = ref->senderData;
24657   const Uint32 errorCode = ref->errorCode;
24658 
24659   WaitGCPProxyPtr ptr;
24660   ptr.i = proxyPtr;
24661   c_waitGCPProxyList.getPtr(ptr);
24662 
24663   ref->senderData = ptr.p->clientData;
24664   ref->errorCode = errorCode;
24665   sendSignal(ptr.p->clientRef, GSN_WAIT_GCP_REF, signal,
24666 	     WaitGCPRef::SignalLength, JBB);
24667 
24668   c_waitGCPProxyList.release(ptr);
24669 }//Dbdih::execWAIT_GCP_REF()
24670 
execWAIT_GCP_CONF(Signal * signal)24671 void Dbdih::execWAIT_GCP_CONF(Signal* signal)
24672 {
24673   jamEntry();
24674   ndbrequire(!isMaster());
24675   WaitGCPConf* const conf = (WaitGCPConf*)&signal->theData[0];
24676   const Uint32 proxyPtr = conf->senderData;
24677   const Uint32 gci_hi = conf->gci_hi;
24678   const Uint32 gci_lo = conf->gci_lo;
24679   WaitGCPProxyPtr ptr;
24680 
24681   ptr.i = proxyPtr;
24682   c_waitGCPProxyList.getPtr(ptr);
24683 
24684   conf->senderData = ptr.p->clientData;
24685   conf->gci_hi = gci_hi;
24686   conf->gci_lo = gci_lo;
24687   conf->blockStatus = cgcpOrderBlocked;
24688   sendSignal(ptr.p->clientRef, GSN_WAIT_GCP_CONF, signal,
24689 	     WaitGCPConf::SignalLength, JBB);
24690 
24691   c_waitGCPProxyList.release(ptr);
24692 }//Dbdih::execWAIT_GCP_CONF()
24693 
checkWaitGCPProxy(Signal * signal,NodeId failedNodeId)24694 void Dbdih::checkWaitGCPProxy(Signal* signal, NodeId failedNodeId)
24695 {
24696   jam();
24697   WaitGCPRef* const ref = (WaitGCPRef*)&signal->theData[0];
24698   ref->errorCode = WaitGCPRef::NF_CausedAbortOfProcedure;
24699 
24700   WaitGCPProxyPtr ptr;
24701   c_waitGCPProxyList.first(ptr);
24702   while(ptr.i != RNIL) {
24703     jam();
24704     const Uint32 i = ptr.i;
24705     const Uint32 clientData = ptr.p->clientData;
24706     const BlockReference clientRef = ptr.p->clientRef;
24707     const BlockReference masterRef = ptr.p->masterRef;
24708 
24709     c_waitGCPProxyList.next(ptr);
24710     if(refToNode(masterRef) == failedNodeId) {
24711       jam();
24712       c_waitGCPProxyList.release(i);
24713       ref->senderData = clientData;
24714       sendSignal(clientRef, GSN_WAIT_GCP_REF, signal,
24715 		 WaitGCPRef::SignalLength, JBB);
24716     }//if
24717   }//while
24718 }//Dbdih::checkWaitGCPProxy()
24719 
checkWaitGCPMaster(Signal * signal,NodeId failedNodeId)24720 void Dbdih::checkWaitGCPMaster(Signal* signal, NodeId failedNodeId)
24721 {
24722   jam();
24723   WaitGCPMasterPtr ptr;
24724   c_waitGCPMasterList.first(ptr);
24725 
24726   while (ptr.i != RNIL) {
24727     jam();
24728     const Uint32 i = ptr.i;
24729     const NodeId nodeId = refToNode(ptr.p->clientRef);
24730 
24731     c_waitGCPMasterList.next(ptr);
24732     if (nodeId == failedNodeId) {
24733       jam();
24734       c_waitGCPMasterList.release(i);
24735     }//if
24736   }//while
24737 }//Dbdih::checkWaitGCPMaster()
24738 
emptyWaitGCPMasterQueue(Signal * signal,Uint64 gci,WaitGCPList & list)24739 void Dbdih::emptyWaitGCPMasterQueue(Signal* signal,
24740                                     Uint64 gci,
24741                                     WaitGCPList & list)
24742 {
24743   jam();
24744   WaitGCPConf* const conf = (WaitGCPConf*)&signal->theData[0];
24745   conf->gci_hi = Uint32(gci >> 32);
24746   conf->gci_lo = Uint32(gci);
24747 
24748   WaitGCPMasterPtr ptr;
24749   list.first(ptr);
24750   while(ptr.i != RNIL) {
24751     jam();
24752     const Uint32 i = ptr.i;
24753     const Uint32 clientData = ptr.p->clientData;
24754     const BlockReference clientRef = ptr.p->clientRef;
24755 
24756     c_waitGCPMasterList.next(ptr);
24757     conf->senderData = clientData;
24758     conf->blockStatus = cgcpOrderBlocked;
24759     sendSignal(clientRef, GSN_WAIT_GCP_CONF, signal,
24760 	       WaitGCPConf::SignalLength, JBB);
24761 
24762     list.release(i);
24763   }//while
24764 }//Dbdih::emptyWaitGCPMasterQueue()
24765 
setNodeStatus(Uint32 nodeId,NodeRecord::NodeStatus newStatus)24766 void Dbdih::setNodeStatus(Uint32 nodeId, NodeRecord::NodeStatus newStatus)
24767 {
24768   NodeRecordPtr nodePtr;
24769   nodePtr.i = nodeId;
24770   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
24771   nodePtr.p->nodeStatus = newStatus;
24772 }//Dbdih::setNodeStatus()
24773 
getNodeStatus(Uint32 nodeId)24774 Dbdih::NodeRecord::NodeStatus Dbdih::getNodeStatus(Uint32 nodeId)
24775 {
24776   NodeRecordPtr nodePtr;
24777   nodePtr.i = nodeId;
24778   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
24779   return nodePtr.p->nodeStatus;
24780 }//Dbdih::getNodeStatus()
24781 
24782 Sysfile::ActiveStatus
getNodeActiveStatus(Uint32 nodeId)24783 Dbdih::getNodeActiveStatus(Uint32 nodeId)
24784 {
24785   NodeRecordPtr nodePtr;
24786   nodePtr.i = nodeId;
24787   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
24788   return nodePtr.p->activeStatus;
24789 }//Dbdih::getNodeActiveStatus()
24790 
24791 
24792 void
setNodeActiveStatus(Uint32 nodeId,Sysfile::ActiveStatus newStatus)24793 Dbdih::setNodeActiveStatus(Uint32 nodeId, Sysfile::ActiveStatus newStatus)
24794 {
24795   NodeRecordPtr nodePtr;
24796   nodePtr.i = nodeId;
24797   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
24798   nodePtr.p->activeStatus = newStatus;
24799 }//Dbdih::setNodeActiveStatus()
24800 
setAllowNodeStart(Uint32 nodeId,bool newState)24801 void Dbdih::setAllowNodeStart(Uint32 nodeId, bool newState)
24802 {
24803   NodeRecordPtr nodePtr;
24804   nodePtr.i = nodeId;
24805   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
24806   nodePtr.p->allowNodeStart = newState;
24807 }//Dbdih::setAllowNodeStart()
24808 
getAllowNodeStart(Uint32 nodeId)24809 bool Dbdih::getAllowNodeStart(Uint32 nodeId)
24810 {
24811   NodeRecordPtr nodePtr;
24812   nodePtr.i = nodeId;
24813   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
24814   return nodePtr.p->allowNodeStart;
24815 }//Dbdih::getAllowNodeStart()
24816 
24817 Uint32
getNodeGroup(Uint32 nodeId) const24818 Dbdih::getNodeGroup(Uint32 nodeId) const
24819 {
24820   NodeRecordPtr nodePtr;
24821   nodePtr.i = nodeId;
24822   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
24823   return nodePtr.p->nodeGroup;
24824 }
24825 
checkNodeAlive(Uint32 nodeId)24826 bool Dbdih::checkNodeAlive(Uint32 nodeId)
24827 {
24828   NodeRecordPtr nodePtr;
24829   nodePtr.i = nodeId;
24830   ndbrequire(nodeId > 0);
24831   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
24832   if (nodePtr.p->nodeStatus != NodeRecord::ALIVE) {
24833     return false;
24834   } else {
24835     return true;
24836   }//if
24837 }//Dbdih::checkNodeAlive()
24838 
/** @return true iff this node's DIH is the current master DIH. */
bool Dbdih::isMaster()
{
  return (reference() == cmasterdihref);
}//Dbdih::isMaster()
24843 
/**
 * @return true iff this node is master AND fully active (i.e. no
 * master takeover is in progress).
 */
bool Dbdih::isActiveMaster()
{
  return ((reference() == cmasterdihref) && (cmasterState == MASTER_ACTIVE));
}//Dbdih::isActiveMaster()
24848 
/**
 * Reset a NodeRecord to its initial "not in cluster" state.
 * Does not touch the node's position in any node list.
 */
void Dbdih::initNodeRecord(NodeRecordPtr nodePtr)
{
  nodePtr.p->m_nodefailSteps.clear();

  nodePtr.p->activeStatus = Sysfile::NS_NotDefined;
  nodePtr.p->recNODE_FAILREP = ZFALSE;
  // Mark per-block node-failure handling as already completed.
  nodePtr.p->dbtcFailCompleted = ZTRUE;
  nodePtr.p->dbdictFailCompleted = ZTRUE;
  nodePtr.p->dbdihFailCompleted = ZTRUE;
  nodePtr.p->dblqhFailCompleted = ZTRUE;
  nodePtr.p->noOfStartedChkpt = 0;
  nodePtr.p->noOfQueuedChkpt = 0;
  // 255 is an out-of-range sentinel: no LCP state seen at takeover.
  nodePtr.p->lcpStateAtTakeOver = (MasterLCPConf::State)255;

  nodePtr.p->activeTabptr = RNIL;
  nodePtr.p->nodeStatus = NodeRecord::NOT_IN_CLUSTER;
  nodePtr.p->useInTransactions = false;
  nodePtr.p->copyCompleted = 0;
  nodePtr.p->allowNodeStart = true;
}
24869 // DICT lock slave
24870 
/**
 * Seize a DictLockSlave record and request a DICT lock of the given
 * type from the master DICT block.  The callback runs (via
 * recvDictLockConf) once the lock is granted.  For old masters that
 * predate the DICT lock protocol we fake an immediate DICT_LOCK_CONF
 * to ourselves instead.
 */
void
Dbdih::sendDictLockReq(Signal* signal, Uint32 lockType, Callback c)
{
  DictLockReq* req = (DictLockReq*)&signal->theData[0];
  DictLockSlavePtr lockPtr;

  c_dictLockSlavePool.seize(lockPtr);
  ndbrequire(lockPtr.i != RNIL);

  req->userPtr = lockPtr.i;
  req->lockType = lockType;
  req->userRef = reference();

  lockPtr.p->lockPtr = RNIL;
  lockPtr.p->lockType = lockType;
  lockPtr.p->locked = false;
  lockPtr.p->callback = c;

  // handle rolling upgrade
  {
    Uint32 masterVersion = getNodeInfo(cmasterNodeId).m_version;

    const unsigned int get_major = getMajor(masterVersion);
    const unsigned int get_minor = getMinor(masterVersion);
    const unsigned int get_build = getBuild(masterVersion);
    ndbrequire(get_major >= 4);

    // Masters older than the DICT-lock-capable versions (or when
    // error insert 7176 is active) get a locally generated CONF.
    if (masterVersion < NDBD_DICT_LOCK_VERSION_5 ||
        (masterVersion < NDBD_DICT_LOCK_VERSION_5_1 &&
         get_major == 5 && get_minor == 1) ||
        ERROR_INSERTED(7176)) {
      jam();

      infoEvent("DIH: detect upgrade: master node %u old version %u.%u.%u",
                (unsigned int)cmasterNodeId, get_major, get_minor, get_build);

      DictLockConf* conf = (DictLockConf*)&signal->theData[0];
      conf->userPtr = lockPtr.i;
      conf->lockType = lockType;
      conf->lockPtr = ZNIL;  // ZNIL marks "no real lock held on master"

      sendSignal(reference(), GSN_DICT_LOCK_CONF, signal,
                 DictLockConf::SignalLength, JBB);
      return;
    }
  }

  BlockReference dictMasterRef = calcDictBlockRef(cmasterNodeId);
  sendSignal(dictMasterRef, GSN_DICT_LOCK_REQ, signal,
      DictLockReq::SignalLength, JBB);
}
24922 
/** A DICT lock was granted; delegate to recvDictLockConf(). */
void
Dbdih::execDICT_LOCK_CONF(Signal* signal)
{
  jamEntry();
  recvDictLockConf(signal);
}
24929 
/** DIH never expects its DICT lock requests to be refused. */
void
Dbdih::execDICT_LOCK_REF(Signal* signal)
{
  jamEntry();
  ndbrequire(false);
}
24936 
/**
 * A requested DICT lock has been granted (possibly faked locally for
 * old masters, see sendDictLockReq).  Record the lock handle and run
 * the registered callback.
 */
void
Dbdih::recvDictLockConf(Signal* signal)
{
  const DictLockConf* conf = (const DictLockConf*)&signal->theData[0];

  DictLockSlavePtr lockPtr;
  c_dictLockSlavePool.getPtr(lockPtr, conf->userPtr);

  lockPtr.p->lockPtr = conf->lockPtr;
  // Sanity: the CONF must match the pending request exactly.
  ndbrequire(lockPtr.p->lockType == conf->lockType);
  ndbrequire(lockPtr.p->locked == false);
  lockPtr.p->locked = true;

  lockPtr.p->callback.m_callbackData = lockPtr.i;
  execute(signal, lockPtr.p->callback, 0);
}
24953 
/**
 * Release a previously granted DICT lock and free the slave record.
 * For old (pre-protocol) masters the lock was never really taken, so
 * nothing is sent to the master.
 */
void
Dbdih::sendDictUnlockOrd(Signal* signal, Uint32 lockSlavePtrI)
{
  DictUnlockOrd* ord = (DictUnlockOrd*)&signal->theData[0];

  DictLockSlavePtr lockPtr;
  c_dictLockSlavePool.getPtr(lockPtr, lockSlavePtrI);

  ord->lockPtr = lockPtr.p->lockPtr;
  ord->lockType = lockPtr.p->lockType;
  ord->senderData = lockPtr.i;
  ord->senderRef = reference();

  c_dictLockSlavePool.release(lockPtr);

  // handle rolling upgrade
  {
    Uint32 masterVersion = getNodeInfo(cmasterNodeId).m_version;

    const unsigned int get_major = getMajor(masterVersion);
    const unsigned int get_minor = getMinor(masterVersion);
    ndbrequire(get_major >= 4);

    // Same version gate as sendDictLockReq: the lock was faked
    // locally, so there is nothing to release on the master.
    if (masterVersion < NDBD_DICT_LOCK_VERSION_5 ||
        (masterVersion < NDBD_DICT_LOCK_VERSION_5_1 &&
         get_major == 5 && get_minor == 1) ||
        ERROR_INSERTED(7176)) {
      return;
    }
  }

  // Pre-6.3.0 masters only understand the short (2-word) format.
  Uint32 len = DictUnlockOrd::SignalLength;
  if (unlikely(getNodeInfo(cmasterNodeId).m_version < NDB_MAKE_VERSION(6,3,0)))
  {
    jam();
    len = 2;
  }

  BlockReference dictMasterRef = calcDictBlockRef(cmasterNodeId);
  sendSignal(dictMasterRef, GSN_DICT_UNLOCK_ORD, signal, len, JBB);
}
24995 
24996 #ifdef ERROR_INSERT
24997 void
sendToRandomNodes(const char * msg,Signal * signal,SignalCounter * counter,SendFunction fun,Uint32 extra,Uint32 block,Uint32 gsn,Uint32 len,JobBufferLevel level)24998 Dbdih::sendToRandomNodes(const char * msg,
24999                          Signal* signal,
25000                          SignalCounter* counter,
25001                          SendFunction fun,
25002                          Uint32 extra,
25003                          Uint32 block,
25004                          Uint32 gsn,
25005                          Uint32 len,
25006                          JobBufferLevel level)
25007 {
25008 
25009   if (counter)
25010     counter->clearWaitingFor();
25011 
25012   Vector<Uint32> nodes;
25013   NodeRecordPtr nodePtr;
25014   nodePtr.i = cfirstAliveNode;
25015   do {
25016     jam();
25017     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
25018     if (nodePtr.i != getOwnNodeId())
25019     {
25020       nodes.push_back(nodePtr.i);
25021     }
25022     nodePtr.i = nodePtr.p->nextNode;
25023   } while (nodePtr.i != RNIL);
25024 
25025 
25026   NdbNodeBitmask masked;
25027   Uint32 cnt = nodes.size();
25028   if (cnt <= 1)
25029   {
25030     goto do_send;
25031   }
25032 
25033   {
25034     Uint32 remove = (rand() % cnt);
25035     if (remove == 0)
25036       remove = 1;
25037 
25038     for (Uint32 i = 0; i<remove; i++)
25039     {
25040       Uint32 rand_node = rand() % nodes.size();
25041       masked.set(nodes[rand_node]);
25042       nodes.erase(rand_node);
25043     }
25044   }
25045 
25046 do_send:
25047   char bufpos = 0;
25048   char buf[256];
25049 
25050   nodePtr.i = cfirstAliveNode;
25051   do {
25052     jam();
25053     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
25054     if (counter)
25055       counter->setWaitingFor(nodePtr.i);
25056     if (!masked.get(nodePtr.i))
25057     {
25058       if (fun)
25059       {
25060         (this->*fun)(signal, nodePtr.i, extra);
25061       }
25062       else
25063       {
25064         Uint32 ref = numberToRef(block, nodePtr.i);
25065         sendSignal(ref, gsn, signal, len, level);
25066       }
25067       BaseString::snprintf(buf+bufpos, sizeof(buf)-bufpos, "%u ", nodePtr.i);
25068     }
25069     else
25070     {
25071       BaseString::snprintf(buf+bufpos, sizeof(buf)-bufpos, "[%u] ", nodePtr.i);
25072     }
25073     bufpos = strlen(buf);
25074     nodePtr.i = nodePtr.p->nextNode;
25075   } while (nodePtr.i != RNIL);
25076   infoEvent("%s %s", msg, buf);
25077 }
25078 
25079 #endif
25080 
25081 // MT LQH
25082 
25083 Uint32
dihGetInstanceKey(Uint32 tabId,Uint32 fragId)25084 Dbdih::dihGetInstanceKey(Uint32 tabId, Uint32 fragId)
25085 {
25086   TabRecordPtr tTabPtr;
25087   tTabPtr.i = tabId;
25088   ptrCheckGuard(tTabPtr, ctabFileSize, tabRecord);
25089   FragmentstorePtr tFragPtr;
25090   getFragstore(tTabPtr.p, fragId, tFragPtr);
25091   Uint32 instanceKey = dihGetInstanceKey(tFragPtr);
25092   return instanceKey;
25093 }
25094 
25095 /**
25096  *
25097  */
/**
 * CREATE_NODEGROUP_IMPL_REQ: one phase (parse / prepare / commit /
 * complete / abort) of creating a new nodegroup.
 *
 * RT_PARSE answers inline in signal->theData (theData[0] = error code
 * or 0, theData[1] = chosen nodegroup id); the other phases reply
 * with CREATE_NODEGROUP_IMPL_CONF/REF.
 */
void
Dbdih::execCREATE_NODEGROUP_IMPL_REQ(Signal* signal)
{
  jamEntry();
  CreateNodegroupImplReq reqCopy = *(CreateNodegroupImplReq*)signal->getDataPtr();
  CreateNodegroupImplReq *req = &reqCopy;

  Uint32 err = 0;
  Uint32 rt = req->requestType;
  Uint64 gci = 0;
  switch(rt){
  case CreateNodegroupImplReq::RT_ABORT:
    jam(); // do nothing
    break;
  case CreateNodegroupImplReq::RT_PARSE:
  case CreateNodegroupImplReq::RT_PREPARE:
  case CreateNodegroupImplReq::RT_COMMIT:
  {
    // All listed nodes must be configured but not yet in a nodegroup.
    Uint32 cnt = 0;
    for (Uint32 i = 0; i<NDB_ARRAY_SIZE(req->nodes) && req->nodes[i] ; i++)
    {
      cnt++;
      if (getNodeActiveStatus(req->nodes[i]) != Sysfile::NS_Configured)
      {
        jam();
        err = CreateNodegroupRef::NodeAlreadyInNodegroup;
        goto error;
      }
    }

    if (cnt != cnoReplicas)
    {
      jam();
      err = CreateNodegroupRef::InvalidNoOfNodesInNodegroup;
      goto error;
    }

    // tmp = set of nodegroup ids currently NOT in use.
    Uint32 ng = req->nodegroupId;
    NdbNodeBitmask tmp;
    tmp.set();
    for (Uint32 i = 0; i<cnoOfNodeGroups; i++)
    {
      tmp.clear(c_node_groups[i]);
    }

    if (ng == RNIL && rt == CreateNodegroupImplReq::RT_PARSE)
    {
      jam();
      // Caller did not choose an id: pick the first free one.
      ng = tmp.find(0);
    }

    // NOTE(review): this bound accepts ng == MAX_NDB_NODES even
    // though valid ids look like 0..MAX_NDB_NODES-1 — confirm whether
    // '>=' was intended before changing.
    if (ng > MAX_NDB_NODES)
    {
      jam();
      err = CreateNodegroupRef::InvalidNodegroupId;
      goto error;
    }

    if (tmp.get(ng) == false)
    {
      jam();
      err = CreateNodegroupRef::NodegroupInUse;
      goto error;
    }

    if (rt == CreateNodegroupImplReq::RT_PARSE || rt == CreateNodegroupImplReq::RT_PREPARE)
    {
      /**
       * Check that atleast one of the nodes are alive
       */
      bool alive = false;
      for (Uint32 i = 0; i<cnoReplicas; i++)
      {
        jam();
        Uint32 nodeId = req->nodes[i];
        if (getNodeStatus(nodeId) == NodeRecord::ALIVE)
        {
          jam();
          alive = true;
          break;
        }
      }

      jam();
      if (alive == false)
      {
        jam();
        err = CreateNodegroupRef::NoNodeAlive;
        goto error;
      }
    }

    if (rt == CreateNodegroupImplReq::RT_PARSE)
    {
      jam();
      // Parse phase replies inline: theData[0] = 0 (ok), theData[1] = id.
      signal->theData[0] = 0;
      signal->theData[1] = ng;
      return;
    }

    if (rt == CreateNodegroupImplReq::RT_PREPARE)
    {
      jam(); // do nothing
      break;
    }

    // Commit: record the nodegroup membership in the sysfile.
    ndbrequire(rt == CreateNodegroupImplReq::RT_COMMIT);
    for (Uint32 i = 0; i<cnoReplicas; i++)
    {
      Uint32 nodeId = req->nodes[i];
      Sysfile::setNodeGroup(nodeId, SYSFILE->nodeGroups, req->nodegroupId);
      if (getNodeStatus(nodeId) == NodeRecord::ALIVE)
      {
        jam();
        Sysfile::setNodeStatus(nodeId, SYSFILE->nodeStatus, Sysfile::NS_Active);
      }
      else
      {
        jam();
        Sysfile::setNodeStatus(nodeId, SYSFILE->nodeStatus, Sysfile::NS_ActiveMissed_1);
      }
      // Refresh cached node state from the updated sysfile.
      setNodeActiveStatus();
      setNodeGroups();
    }
    break;
  }
  case CreateNodegroupImplReq::RT_COMPLETE:
    jam();
    gci = m_micro_gcp.m_current_gci;
    break;
  }

  {
    CreateNodegroupImplConf* conf = (CreateNodegroupImplConf*)signal->getDataPtrSend();
    conf->senderRef = reference();
    conf->senderData = req->senderData;
    conf->gci_hi = Uint32(gci >> 32);
    conf->gci_lo = Uint32(gci);
    sendSignal(req->senderRef, GSN_CREATE_NODEGROUP_IMPL_CONF, signal,
               CreateNodegroupImplConf::SignalLength, JBB);
  }
  return;

error:
  if (rt == CreateNodegroupImplReq::RT_PARSE)
  {
    jam();
    // Parse errors are returned inline in theData[0].
    signal->theData[0] = err;
    return;
  }

  if (rt == CreateNodegroupImplReq::RT_PREPARE)
  {
    jam();
    CreateNodegroupImplRef * ref = (CreateNodegroupImplRef*)signal->getDataPtrSend();
    ref->senderRef = reference();
    ref->senderData = req->senderData;
    ref->errorCode = err;
    sendSignal(req->senderRef, GSN_CREATE_NODEGROUP_IMPL_REF, signal,
               CreateNodegroupImplRef::SignalLength, JBB);
    return;
  }

  // Errors are only expected during parse/prepare phases.
  jamLine(err);
  ndbrequire(false);
}
25264 
25265 /**
25266  *
25267  */
/**
 * DROP_NODEGROUP_IMPL_REQ: one phase (parse / prepare / commit /
 * complete / abort) of dropping a nodegroup.  Parse/prepare validate
 * that the nodegroup exists and is unused; complete returns its member
 * nodes to "configured" state in the sysfile.  Replies with
 * DROP_NODEGROUP_IMPL_CONF/REF.
 */
void
Dbdih::execDROP_NODEGROUP_IMPL_REQ(Signal* signal)
{
  jamEntry();
  DropNodegroupImplReq reqCopy = *(DropNodegroupImplReq*)signal->getDataPtr();
  DropNodegroupImplReq *req = &reqCopy;

  NodeGroupRecordPtr NGPtr;

  Uint32 err = 0;
  Uint32 rt = req->requestType;
  Uint64 gci = 0;
  switch(rt){
  case DropNodegroupImplReq::RT_ABORT:
    jam(); // do nothing
    break;
  case DropNodegroupImplReq::RT_PARSE:
  case DropNodegroupImplReq::RT_PREPARE:
    jam();
    NGPtr.i = req->nodegroupId;
    if (NGPtr.i >= MAX_NDB_NODES)
    {
      jam();
      err = DropNodegroupRef::NoSuchNodegroup;
      goto error;
    }
    ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);

    // RNIL index means the nodegroup record is not in use.
    if (NGPtr.p->nodegroupIndex == RNIL)
    {
      jam();
      err = DropNodegroupRef::NoSuchNodegroup;
      goto error;
    }

    // Tables still reference this nodegroup.
    if (NGPtr.p->m_ref_count)
    {
      jam();
      err = DropNodegroupRef::NodegroupInUse;
      goto error;
    }
    break;
  case DropNodegroupImplReq::RT_COMMIT:
  {
    jam();
    gci = m_micro_gcp.m_current_gci;
    break;
  }
  case DropNodegroupImplReq::RT_COMPLETE:
  {
    // Return member nodes to "configured, no nodegroup" state.
    NGPtr.i = req->nodegroupId;
    ptrCheckGuard(NGPtr, MAX_NDB_NODES, nodeGroupRecord);
    for (Uint32 i = 0; i<NGPtr.p->nodeCount; i++)
    {
      jam();
      Uint32 nodeId = NGPtr.p->nodesInGroup[i];
      Sysfile::setNodeGroup(nodeId, SYSFILE->nodeGroups, NO_NODE_GROUP_ID);
      Sysfile::setNodeStatus(nodeId, SYSFILE->nodeStatus, Sysfile::NS_Configured);
    }
    // Refresh cached node state from the updated sysfile.
    setNodeActiveStatus();
    setNodeGroups();
    break;
  }
  }

  {
    DropNodegroupImplConf* conf = (DropNodegroupImplConf*)signal->getDataPtrSend();
    conf->senderRef = reference();
    conf->senderData = req->senderData;
    conf->gci_hi = Uint32(gci >> 32);
    conf->gci_lo = Uint32(gci);
    sendSignal(req->senderRef, GSN_DROP_NODEGROUP_IMPL_CONF, signal,
               DropNodegroupImplConf::SignalLength, JBB);
  }
  return;

error:
  DropNodegroupImplRef * ref = (DropNodegroupImplRef*)signal->getDataPtrSend();
  ref->senderRef = reference();
  ref->senderData = req->senderData;
  ref->errorCode = err;
  sendSignal(req->senderRef, GSN_DROP_NODEGROUP_IMPL_REF, signal,
             DropNodegroupImplRef::SignalLength, JBB);
}
25352 
25353 Uint32
getMinVersion() const25354 Dbdih::getMinVersion() const
25355 {
25356   Uint32 ver = getNodeInfo(getOwnNodeId()).m_version;
25357   NodeRecordPtr specNodePtr;
25358   specNodePtr.i = cfirstAliveNode;
25359   do
25360   {
25361     jam();
25362     ptrCheckGuard(specNodePtr, MAX_NDB_NODES, nodeRecord);
25363     Uint32 v = getNodeInfo(specNodePtr.i).m_version;
25364     if (v < ver)
25365     {
25366       jam();
25367       ver = v;
25368     }
25369     specNodePtr.i = specNodePtr.p->nextNode;
25370   } while (specNodePtr.i != RNIL);
25371 
25372   return ver;
25373 }
25374 
25375 Uint8
getMaxStartedFragCheckpointsForNode(Uint32 nodeId) const25376 Dbdih::getMaxStartedFragCheckpointsForNode(Uint32 nodeId) const
25377 {
25378   if (likely(getNodeInfo(nodeId).m_version >= NDBD_EXTRA_PARALLEL_FRAG_LCP))
25379   {
25380     return MAX_STARTED_FRAG_CHECKPOINTS_PER_NODE;
25381   }
25382   else
25383   {
25384     /* Older node - only 2 parallel frag checkpoints supported */
25385     return 2;
25386   }
25387 }
25388 
25389 
25390 /**
25391  * isolateNodes
25392  *
25393  * Get all live nodes to disconnect the set of victims
25394  * in minDelayMillis.
25395  *
25396  * The signals are sent to live nodes immediately, and
25397  * those nodes perform the delay, to reduce the chance
25398  * of lag on this node causing problems
25399  */
25400 void
isolateNodes(Signal * signal,Uint32 delayMillis,const NdbNodeBitmask & victims)25401 Dbdih::isolateNodes(Signal* signal,
25402                     Uint32 delayMillis,
25403                     const NdbNodeBitmask& victims)
25404 {
25405   jam();
25406 
25407   IsolateOrd* ord = (IsolateOrd*) signal->theData;
25408 
25409   ord->senderRef          = reference();
25410   ord->isolateStep        = IsolateOrd::IS_REQ;
25411   ord->delayMillis        = delayMillis;
25412 
25413   victims.copyto(NdbNodeBitmask::Size, ord->nodesToIsolate);
25414 
25415   /* QMGR handles this */
25416   sendSignal(QMGR_REF,
25417              GSN_ISOLATE_ORD,
25418              signal,
25419              IsolateOrd::SignalLength,
25420              JBA);
25421 }
25422