1 /*
2    Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License, version 2.0,
6    as published by the Free Software Foundation.
7 
8    This program is also distributed with certain software (including
9    but not limited to OpenSSL) that is licensed under separate terms,
10    as designated in a particular file or component or in included license
11    documentation.  The authors of MySQL hereby grant you an additional
12    permission to link the program and your derivative works with the
13    separately licensed software that they have included with MySQL.
14 
15    This program is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License, version 2.0, for more details.
19 
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software
22    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
23 */
24 
25 #include <cstring>
26 
27 #define DBDIH_C
28 #include <ndb_global.h>
29 #include <ndb_limits.h>
30 #include <ndb_version.h>
31 #include <NdbOut.hpp>
32 
33 #include "Dbdih.hpp"
34 #include "Configuration.hpp"
35 
36 #include <signaldata/CopyTab.hpp>
37 #include <signaldata/DbinfoScan.hpp>
38 #include <signaldata/AllocNodeId.hpp>
39 #include <signaldata/NodeRecoveryStatusRep.hpp>
40 #include <signaldata/BlockCommitOrd.hpp>
41 #include <signaldata/CheckNodeGroups.hpp>
42 #include <signaldata/CopyActive.hpp>
43 #include <signaldata/CopyFrag.hpp>
44 #include <signaldata/CopyGCIReq.hpp>
45 #include <signaldata/DiAddTab.hpp>
46 #include <signaldata/DictStart.hpp>
47 #include <signaldata/DiGetNodes.hpp>
48 #include <signaldata/DihContinueB.hpp>
49 #include <signaldata/DihSwitchReplica.hpp>
50 #include <signaldata/DumpStateOrd.hpp>
51 #include <signaldata/EventReport.hpp>
52 #include <signaldata/FsReadWriteReq.hpp>
53 #include <signaldata/GCP.hpp>
54 #include <signaldata/MasterGCP.hpp>
55 #include <signaldata/MasterLCP.hpp>
56 #include <signaldata/NFCompleteRep.hpp>
57 #include <signaldata/NodeFailRep.hpp>
58 #include <signaldata/ReadNodesConf.hpp>
59 #include <signaldata/StartFragReq.hpp>
60 #include <signaldata/StartInfo.hpp>
61 #include <signaldata/StartMe.hpp>
62 #include <signaldata/StartPerm.hpp>
63 #include <signaldata/StartRec.hpp>
64 #include <signaldata/StopPerm.hpp>
65 #include <signaldata/StopMe.hpp>
66 #include <signaldata/TestOrd.hpp>
67 #include <signaldata/WaitGCP.hpp>
68 #include <signaldata/DihStartTab.hpp>
69 #include <signaldata/LCP.hpp>
70 #include <signaldata/SystemError.hpp>
71 
72 #include <signaldata/TakeOver.hpp>
73 
74 #include <signaldata/DropTab.hpp>
75 #include <signaldata/AlterTab.hpp>
76 #include <signaldata/AlterTable.hpp>
77 #include <signaldata/PrepDropTab.hpp>
78 #include <signaldata/SumaImpl.hpp>
79 #include <signaldata/DictTabInfo.hpp>
80 #include <signaldata/CreateFragmentation.hpp>
81 #include <signaldata/LqhFrag.hpp>
82 #include <signaldata/FsOpenReq.hpp>
83 #include <signaldata/DihScanTab.hpp>
84 #include <signaldata/DictLock.hpp>
85 #include <DebuggerNames.hpp>
86 #include <signaldata/Upgrade.hpp>
87 #include <NdbEnv.h>
88 #include <signaldata/CreateNodegroup.hpp>
89 #include <signaldata/CreateNodegroupImpl.hpp>
90 #include <signaldata/DropNodegroup.hpp>
91 #include <signaldata/DropNodegroupImpl.hpp>
92 #include <signaldata/DihGetTabInfo.hpp>
93 #include <SectionReader.hpp>
94 #include <signaldata/DihRestart.hpp>
95 #include <signaldata/IsolateOrd.hpp>
96 #include <ndb_constants.h>
97 
98 #include <EventLogger.hpp>
99 
100 #define JAM_FILE_ID 354
101 
/* Delay (ms) between re-checks while waiting for a table state change */
static const Uint32 WaitTableStateChangeMillis = 10;

extern EventLogger * g_eventLogger;

/*
 * Per-area debug logging switches, only compiled in VM_TRACE or
 * ERROR_INSERT builds. NOTE(review): DEBUG_LCP_COMP is enabled by
 * default in such builds.
 */
#if (defined(VM_TRACE) || defined(ERROR_INSERT))
//#define DEBUG_MULTI_TRP 1
//#define DEBUG_NODE_STOP 1
//#define DEBUG_REDO_CONTROL 1
//#define DEBUG_LCP 1
#define DEBUG_LCP_COMP 1
#endif

#ifdef DEBUG_MULTI_TRP
#define DEB_MULTI_TRP(arglist) do { g_eventLogger->info arglist ; } while (0)
#else
#define DEB_MULTI_TRP(arglist) do { } while (0)
#endif

#ifdef DEBUG_NODE_STOP
#define DEB_NODE_STOP(arglist) do { g_eventLogger->info arglist ; } while (0)
#else
#define DEB_NODE_STOP(arglist) do { } while (0)
#endif

#ifdef DEBUG_REDO_CONTROL
#define DEB_REDO_CONTROL(arglist) do { g_eventLogger->info arglist ; } while (0)
#else
#define DEB_REDO_CONTROL(arglist) do { } while (0)
#endif

#ifdef DEBUG_LCP
#define DEB_LCP(arglist) do { g_eventLogger->info arglist ; } while (0)
#else
#define DEB_LCP(arglist) do { } while (0)
#endif

#ifdef DEBUG_LCP_COMP
#define DEB_LCP_COMP(arglist) do { g_eventLogger->info arglist ; } while (0)
#else
#define DEB_LCP_COMP(arglist) do { } while (0)
#endif

/* View the raw sysfileData word array as a Sysfile structure */
#define SYSFILE ((Sysfile *)&sysfileData[0])
/* Initial value for a replica's create GCI */
#define ZINIT_CREATE_GCI Uint32(0)
/* Initial ("infinite") value for a replica's last GCI */
#define ZINIT_REPLICA_LAST_GCI Uint32(-1)

/* Return early from the calling function if the node is not alive */
#define RETURN_IF_NODE_NOT_ALIVE(node) \
  if (!checkNodeAlive((node))) { \
    jam(); \
    return; \
  } \

/*
 * Clear receiveNodeId from the c_<sigName>_Counter wait-set and return
 * from the calling function unless all awaited nodes have now replied.
 */
#define receiveLoopMacro(sigName, receiveNodeId)\
{                                                \
  c_##sigName##_Counter.clearWaitingFor(receiveNodeId); \
  if(c_##sigName##_Counter.done() == false){     \
     jam();                                      \
     return;                                     \
  }                                              \
}

/*
 * Invoke signalRoutine for every alive node (following the alive-node
 * list from cfirstAliveNode) and mark each node as waited-for in the
 * c_<sigName>_Counter.
 */
#define sendLoopMacro(sigName, signalRoutine, extra)                    \
{                                                                       \
  c_##sigName##_Counter.clearWaitingFor();                              \
  NodeRecordPtr specNodePtr;                                            \
  specNodePtr.i = cfirstAliveNode;                                      \
  do {                                                                  \
    jam();                                                              \
    ptrCheckGuard(specNodePtr, MAX_NDB_NODES, nodeRecord);              \
    c_##sigName##_Counter.setWaitingFor(specNodePtr.i);                 \
    signalRoutine(signal, specNodePtr.i, extra);                        \
    specNodePtr.i = specNodePtr.p->nextNode;                            \
  } while (specNodePtr.i != RNIL);                                      \
}
176 
177 static
178 Uint32
prevLcpNo(Uint32 lcpNo)179 prevLcpNo(Uint32 lcpNo){
180   if(lcpNo == 0)
181     return MAX_LCP_USED - 1;
182   return lcpNo - 1;
183 }
184 
185 static
186 Uint32
nextLcpNo(Uint32 lcpNo)187 nextLcpNo(Uint32 lcpNo){
188   lcpNo++;
189   if(lcpNo >= MAX_LCP_USED)
190     return 0;
191   return lcpNo;
192 }
193 
/**
 * Intentionally empty per-node callback, used with sendLoopMacro when a
 * round over the alive nodes should not actually send anything.
 */
void Dbdih::nullRoutine(Signal* signal, Uint32 nodeId, Uint32 extra)
{
}//Dbdih::nullRoutine()
197 
sendCOPY_GCIREQ(Signal * signal,Uint32 nodeId,Uint32 extra)198 void Dbdih::sendCOPY_GCIREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
199 {
200   ndbrequire(c_copyGCIMaster.m_copyReason != CopyGCIReq::IDLE);
201   CopyGCIReq * const copyGCI = (CopyGCIReq *)&signal->theData[0];
202   copyGCI->anyData = nodeId;
203   copyGCI->copyReason = c_copyGCIMaster.m_copyReason;
204   copyGCI->startWord = 0;
205 
206   const BlockReference ref = calcDihBlockRef(nodeId);
207   if (ndbd_send_node_bitmask_in_section(getNodeInfo(nodeId).m_version))
208   {
209     jam();
210     pack_sysfile_format_v2();
211     send_COPY_GCIREQ_data_v2(signal, ref);
212   }
213   else
214   {
215     jam();
216     pack_sysfile_format_v1();
217     send_COPY_GCIREQ_data_v1(signal, ref);
218   }
219 }
220 
send_COPY_GCIREQ_data_v2(Signal * signal,BlockReference ref)221 void Dbdih::send_COPY_GCIREQ_data_v2(Signal *signal, BlockReference ref)
222 {
223   LinearSectionPtr lsptr[3];
224   lsptr[0].p = &cdata[0];
225   lsptr[0].sz = cdata_size_in_words;
226 #if 0
227   for (Uint32 i = 0; i < cdata_size_in_words; i++)
228   {
229     ndbout_c("cdata[%u] = %x", i, cdata[i]);
230   }
231   ndbout_c("ref = %x", ref);
232 #endif
233   sendSignal(ref,
234              GSN_COPY_GCIREQ,
235              signal,
236              CopyGCIReq::SignalLength,
237              JBB,
238              lsptr,
239              1);
240 }
241 
send_START_MECONF_data_v2(Signal * signal,BlockReference ref)242 void Dbdih::send_START_MECONF_data_v2(Signal *signal, BlockReference ref)
243 {
244   LinearSectionPtr lsptr[3];
245   lsptr[0].p = &cdata[0];
246   lsptr[0].sz = cdata_size_in_words;
247   sendSignal(ref,
248              GSN_START_MECONF,
249              signal,
250              StartMeConf::SignalLength_v2,
251              JBB,
252              lsptr,
253              1);
254 }
255 
send_COPY_GCIREQ_data_v1(Signal * signal,BlockReference ref)256 void Dbdih::send_COPY_GCIREQ_data_v1(Signal *signal, BlockReference ref)
257 {
258   const Uint32 wordPerSignal = CopyGCIReq::DATA_SIZE;
259   const Uint32 noOfSignals = ((Sysfile::SYSFILE_SIZE32_v1 +
260                               (wordPerSignal - 1)) /
261 			       wordPerSignal);
262 
263   CopyGCIReq * const copyGCI = (CopyGCIReq *)&signal->theData[0];
264   for(Uint32 i = 0; i < noOfSignals; i++)
265   {
266     const int startWord = copyGCI->startWord;
267     for(Uint32 j = 0; j < wordPerSignal; j++)
268     {
269       copyGCI->data[j] = cdata[j+startWord];
270     }
271     sendSignal(ref, GSN_COPY_GCIREQ, signal, 25, JBB);
272     copyGCI->startWord += wordPerSignal;
273   }
274 }
275 
send_START_MECONF_data_v1(Signal * signal,BlockReference ref)276 void Dbdih::send_START_MECONF_data_v1(Signal *signal, BlockReference ref)
277 {
278   const int wordPerSignal = StartMeConf::DATA_SIZE;
279   const int noOfSignals = ((Sysfile::SYSFILE_SIZE32_v1 +
280                             (wordPerSignal - 1)) /
281                               wordPerSignal);
282   StartMeConf * const startMe = (StartMeConf *)&signal->theData[0];
283   for(int i = 0; i < noOfSignals; i++)
284   {
285     const int startWord = startMe->startWord;
286     for(int j = 0; j < wordPerSignal; j++)
287     {
288       startMe->data[j] = cdata[j+startWord];
289     }
290     sendSignal(ref,
291                GSN_START_MECONF,
292                signal,
293                StartMeConf::SignalLength_v1,
294                JBB);
295     startMe->startWord += wordPerSignal;
296   }
297 }
298 
sendDIH_SWITCH_REPLICA_REQ(Signal * signal,Uint32 nodeId,Uint32 extra)299 void Dbdih::sendDIH_SWITCH_REPLICA_REQ(Signal* signal, Uint32 nodeId,
300                                        Uint32 extra)
301 {
302   const BlockReference ref    = calcDihBlockRef(nodeId);
303   sendSignal(ref, GSN_DIH_SWITCH_REPLICA_REQ, signal,
304              DihSwitchReplicaReq::SignalLength, JBB);
305 }//Dbdih::sendDIH_SWITCH_REPLICA_REQ()
306 
sendGCP_COMMIT(Signal * signal,Uint32 nodeId,Uint32 extra)307 void Dbdih::sendGCP_COMMIT(Signal* signal, Uint32 nodeId, Uint32 extra)
308 {
309   BlockReference ref = calcDihBlockRef(nodeId);
310   GCPCommit *req = (GCPCommit*)signal->getDataPtrSend();
311   req->nodeId = cownNodeId;
312   req->gci_hi = Uint32(m_micro_gcp.m_master.m_new_gci >> 32);
313   req->gci_lo = Uint32(m_micro_gcp.m_master.m_new_gci);
314   DEB_NODE_STOP(("Send GCP_COMMIT(%u,%u) to %u",
315                  req->gci_hi, req->gci_lo, nodeId));
316   sendSignal(ref, GSN_GCP_COMMIT, signal, GCPCommit::SignalLength, JBA);
317 
318   ndbassert(m_micro_gcp.m_enabled || Uint32(m_micro_gcp.m_new_gci) == 0);
319 }//Dbdih::sendGCP_COMMIT()
320 
sendGCP_PREPARE(Signal * signal,Uint32 nodeId,Uint32 extra)321 void Dbdih::sendGCP_PREPARE(Signal* signal, Uint32 nodeId, Uint32 extra)
322 {
323   BlockReference ref = calcDihBlockRef(nodeId);
324   GCPPrepare *req = (GCPPrepare*)signal->getDataPtrSend();
325   req->nodeId = cownNodeId;
326   req->gci_hi = Uint32(m_micro_gcp.m_master.m_new_gci >> 32);
327   req->gci_lo = Uint32(m_micro_gcp.m_master.m_new_gci);
328 
329   DEB_NODE_STOP(("Send GCP_PREPARE(%u,%u) to %u",
330                  req->gci_hi, req->gci_lo, nodeId));
331 
332   if (! (ERROR_INSERTED(7201) || ERROR_INSERTED(7202)))
333   {
334     sendSignal(ref, GSN_GCP_PREPARE, signal, GCPPrepare::SignalLength, JBA);
335   }
336   else if (ERROR_INSERTED(7201))
337   {
338     sendSignal(ref, GSN_GCP_PREPARE, signal, GCPPrepare::SignalLength, JBB);
339   }
340   else if (ERROR_INSERTED(7202))
341   {
342     ndbrequire(nodeId == getOwnNodeId());
343     sendSignalWithDelay(ref, GSN_GCP_PREPARE, signal, 2000,
344                         GCPPrepare::SignalLength);
345   }
346   else
347   {
348     ndbabort(); // should be dead code #ifndef ERROR_INSERT
349   }
350 
351   ndbassert(m_micro_gcp.m_enabled || Uint32(m_micro_gcp.m_new_gci) == 0);
352 }//Dbdih::sendGCP_PREPARE()
353 
354 void
sendSUB_GCP_COMPLETE_REP(Signal * signal,Uint32 nodeId,Uint32 extra)355 Dbdih::sendSUB_GCP_COMPLETE_REP(Signal* signal, Uint32 nodeId, Uint32 extra)
356 {
357   ndbassert(m_micro_gcp.m_enabled || Uint32(m_micro_gcp.m_new_gci) == 0);
358   BlockReference ref = calcDihBlockRef(nodeId);
359   sendSignal(ref, GSN_SUB_GCP_COMPLETE_REP, signal,
360              SubGcpCompleteRep::SignalLength, JBA);
361 }
362 
sendGCP_SAVEREQ(Signal * signal,Uint32 nodeId,Uint32 extra)363 void Dbdih::sendGCP_SAVEREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
364 {
365   GCPSaveReq * const saveReq = (GCPSaveReq*)&signal->theData[0];
366   BlockReference ref = calcDihBlockRef(nodeId);
367   saveReq->dihBlockRef = reference();
368   saveReq->dihPtr = nodeId;
369   saveReq->gci = m_gcp_save.m_master.m_new_gci;
370   sendSignal(ref, GSN_GCP_SAVEREQ, signal, GCPSaveReq::SignalLength, JBB);
371 }//Dbdih::sendGCP_SAVEREQ()
372 
sendINCL_NODEREQ(Signal * signal,Uint32 nodeId,Uint32 extra)373 void Dbdih::sendINCL_NODEREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
374 {
375   BlockReference nodeDihRef = calcDihBlockRef(nodeId);
376   signal->theData[0] = reference();
377   signal->theData[1] = c_nodeStartMaster.startNode;
378   signal->theData[2] = c_nodeStartMaster.failNr;
379   signal->theData[3] = 0;
380   signal->theData[4] = (Uint32)(m_micro_gcp.m_current_gci >> 32);
381   signal->theData[5] = (Uint32)(m_micro_gcp.m_current_gci & 0xFFFFFFFF);
382   sendSignal(nodeDihRef, GSN_INCL_NODEREQ, signal, 6, JBA);
383 }//Dbdih::sendINCL_NODEREQ()
384 
sendMASTER_GCPREQ(Signal * signal,Uint32 nodeId,Uint32 extra)385 void Dbdih::sendMASTER_GCPREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
386 {
387   BlockReference ref = calcDihBlockRef(nodeId);
388   sendSignal(ref, GSN_MASTER_GCPREQ, signal, MasterGCPReq::SignalLength, JBB);
389 }//Dbdih::sendMASTER_GCPREQ()
390 
sendMASTER_LCPREQ(Signal * signal,Uint32 nodeId,Uint32 extra)391 void Dbdih::sendMASTER_LCPREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
392 {
393   BlockReference ref = calcDihBlockRef(nodeId);
394   sendSignal(ref, GSN_MASTER_LCPREQ, signal, MasterLCPReq::SignalLength, JBB);
395 }//Dbdih::sendMASTER_LCPREQ()
396 
sendSTART_INFOREQ(Signal * signal,Uint32 nodeId,Uint32 extra)397 void Dbdih::sendSTART_INFOREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
398 {
399   const BlockReference ref = calcDihBlockRef(nodeId);
400   sendSignal(ref, GSN_START_INFOREQ, signal, StartInfoReq::SignalLength, JBB);
401 }//sendSTART_INFOREQ()
402 
sendSTART_RECREQ(Signal * signal,Uint32 nodeId,Uint32 extra)403 void Dbdih::sendSTART_RECREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
404 {
405   if (!m_sr_nodes.get(nodeId))
406   {
407     jam();
408     c_START_RECREQ_Counter.clearWaitingFor(nodeId);
409     return;
410   }
411 
412   Uint32 keepGCI = SYSFILE->keepGCI;
413   Uint32 lastCompletedGCI = SYSFILE->lastCompletedGCI[nodeId];
414   if (keepGCI > lastCompletedGCI)
415   {
416     jam();
417     keepGCI = lastCompletedGCI;
418   }
419 
420   StartRecReq * const req = (StartRecReq*)&signal->theData[0];
421   BlockReference ref = calcLqhBlockRef(nodeId);
422   req->receivingNodeId = nodeId;
423   req->senderRef = reference();
424   req->keepGci = keepGCI;
425   req->lastCompletedGci = lastCompletedGCI;
426   req->newestGci = SYSFILE->newestRestorableGCI;
427   req->senderData = extra;
428   m_sr_nodes.copyto(NdbNodeBitmask::Size, req->sr_nodes);
429   Uint32 packed_length = m_sr_nodes.getPackedLengthInWords();
430 
431   if (ndbd_send_node_bitmask_in_section(getNodeInfo(refToNode(ref)).m_version))
432   {
433     jam();
434     LinearSectionPtr lsptr[3];
435     lsptr[0].p = req->sr_nodes;
436     lsptr[0].sz = NdbNodeBitmask::getPackedLengthInWords(req->sr_nodes);
437     sendSignal(ref, GSN_START_RECREQ, signal, StartRecReq::SignalLength, JBB,
438                lsptr, 1);
439   }
440   else if (packed_length <= NdbNodeBitmask48::Size)
441   {
442     sendSignal(ref, GSN_START_RECREQ, signal, StartRecReq::SignalLength_v1, JBB);
443   }
444   else
445   {
446     ndbabort();
447   }
448 
449   signal->theData[0] = NDB_LE_StartREDOLog;
450   signal->theData[1] = nodeId;
451   signal->theData[2] = keepGCI;
452   signal->theData[3] = lastCompletedGCI;
453   signal->theData[4] = SYSFILE->newestRestorableGCI;
454   sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 5, JBB);
455 }//Dbdih::sendSTART_RECREQ()
456 
sendSTOP_ME_REQ(Signal * signal,Uint32 nodeId,Uint32 extra)457 void Dbdih::sendSTOP_ME_REQ(Signal* signal, Uint32 nodeId, Uint32 extra)
458 {
459   if (nodeId != getOwnNodeId()) {
460     jam();
461     const BlockReference ref = calcDihBlockRef(nodeId);
462     sendSignal(ref, GSN_STOP_ME_REQ, signal, StopMeReq::SignalLength, JBB);
463   }//if
464 }//Dbdih::sendSTOP_ME_REQ()
465 
sendTC_CLOPSIZEREQ(Signal * signal,Uint32 nodeId,Uint32 extra)466 void Dbdih::sendTC_CLOPSIZEREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
467 {
468   BlockReference ref = calcTcBlockRef(nodeId);
469   signal->theData[0] = nodeId;
470   signal->theData[1] = reference();
471   sendSignal(ref, GSN_TC_CLOPSIZEREQ, signal, 2, JBB);
472 }//Dbdih::sendTC_CLOPSIZEREQ()
473 
sendTCGETOPSIZEREQ(Signal * signal,Uint32 nodeId,Uint32 extra)474 void Dbdih::sendTCGETOPSIZEREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
475 {
476   BlockReference ref = calcTcBlockRef(nodeId);
477   signal->theData[0] = nodeId;
478   signal->theData[1] = reference();
479   sendSignal(ref, GSN_TCGETOPSIZEREQ, signal, 2, JBB);
480 }//Dbdih::sendTCGETOPSIZEREQ()
481 
sendUPDATE_TOREQ(Signal * signal,Uint32 nodeId,Uint32 extra)482 void Dbdih::sendUPDATE_TOREQ(Signal* signal, Uint32 nodeId, Uint32 extra)
483 {
484   const BlockReference ref = calcDihBlockRef(nodeId);
485   sendSignal(ref, GSN_UPDATE_TOREQ, signal, UpdateToReq::SignalLength, JBB);
486 }//sendUPDATE_TOREQ()
487 
execCONTINUEB(Signal * signal)488 void Dbdih::execCONTINUEB(Signal* signal)
489 {
490   jamEntry();
491   switch ((DihContinueB::Type)signal->theData[0]) {
492   case DihContinueB::ZPACK_TABLE_INTO_PAGES:
493     {
494       jam();
495       Uint32 tableId = signal->theData[1];
496       packTableIntoPagesLab(signal, tableId);
497       return;
498       break;
499     }
500   case DihContinueB::ZPACK_FRAG_INTO_PAGES:
501     {
502       RWFragment wf;
503       jam();
504       wf.rwfTabPtr.i = signal->theData[1];
505       ptrCheckGuard(wf.rwfTabPtr, ctabFileSize, tabRecord);
506       wf.fragId = signal->theData[2];
507       wf.pageIndex = signal->theData[3];
508       wf.wordIndex = signal->theData[4];
509       wf.totalfragments = signal->theData[5];
510       packFragIntoPagesLab(signal, &wf);
511       return;
512       break;
513     }
514   case DihContinueB::ZREAD_PAGES_INTO_TABLE:
515     {
516       jam();
517       Uint32 tableId = signal->theData[1];
518       readPagesIntoTableLab(signal, tableId);
519       return;
520       break;
521     }
522   case DihContinueB::ZREAD_PAGES_INTO_FRAG:
523     {
524       RWFragment rf;
525       jam();
526       rf.rwfTabPtr.i = signal->theData[1];
527       ptrCheckGuard(rf.rwfTabPtr, ctabFileSize, tabRecord);
528       rf.fragId = signal->theData[2];
529       rf.pageIndex = signal->theData[3];
530       rf.wordIndex = signal->theData[4];
531       readPagesIntoFragLab(signal, &rf);
532       return;
533       break;
534     }
535   case DihContinueB::ZCOPY_TABLE:
536     {
537       jam();
538       Uint32 tableId = signal->theData[1];
539       copyTableLab(signal, tableId);
540       return;
541     }
542   case DihContinueB::ZCOPY_TABLE_NODE:
543     {
544       NodeRecordPtr nodePtr;
545       CopyTableNode ctn;
546       jam();
547       ctn.ctnTabPtr.i = signal->theData[1];
548       ptrCheckGuard(ctn.ctnTabPtr, ctabFileSize, tabRecord);
549       nodePtr.i = signal->theData[2];
550       ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
551       ctn.pageIndex = signal->theData[3];
552       ctn.wordIndex = signal->theData[4];
553       ctn.noOfWords = signal->theData[5];
554       copyTableNode(signal, &ctn, nodePtr);
555       return;
556     }
557   case DihContinueB::ZSTART_FRAGMENT:
558     {
559       jam();
560       Uint32 tableId = signal->theData[1];
561       Uint32 fragId = signal->theData[2];
562       startFragment(signal, tableId, fragId);
563       return;
564     }
565   case DihContinueB::ZCOMPLETE_RESTART:
566     jam();
567     completeRestartLab(signal);
568     return;
569   case DihContinueB::ZREAD_TABLE_FROM_PAGES:
570     {
571       TabRecordPtr tabPtr;
572       jam();
573       tabPtr.i = signal->theData[1];
574       ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
575       readTableFromPagesLab(signal, tabPtr);
576       return;
577     }
578   case DihContinueB::ZSR_PHASE2_READ_TABLE:
579     {
580       TabRecordPtr tabPtr;
581       jam();
582       tabPtr.i = signal->theData[1];
583       ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
584       srPhase2ReadTableLab(signal, tabPtr);
585       return;
586     }
587   case DihContinueB::ZCHECK_TC_COUNTER:
588     jam();
589 #ifndef NO_LCP
590     checkTcCounterLab(signal);
591 #endif
592     return;
593   case DihContinueB::ZCALCULATE_KEEP_GCI:
594     {
595       jam();
596       Uint32 tableId = signal->theData[1];
597       Uint32 fragId = signal->theData[2];
598       calculateKeepGciLab(signal, tableId, fragId);
599       return;
600     }
601   case DihContinueB::ZSTORE_NEW_LCP_ID:
602     jam();
603     storeNewLcpIdLab(signal);
604     return;
605   case DihContinueB::ZTABLE_UPDATE:
606     {
607       TabRecordPtr tabPtr;
608       jam();
609       tabPtr.i = signal->theData[1];
610       ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
611       tableUpdateLab(signal, tabPtr);
612       return;
613     }
614   case DihContinueB::ZCHECK_LCP_COMPLETED:
615     {
616       jam();
617       checkLcpCompletedLab(signal);
618       return;
619     }
620   case DihContinueB::ZINIT_LCP:
621     {
622       jam();
623       Uint32 senderRef = signal->theData[1];
624       Uint32 tableId = signal->theData[2];
625       initLcpLab(signal, senderRef, tableId);
626       return;
627     }
628   case DihContinueB::ZADD_TABLE_MASTER_PAGES:
629     {
630       TabRecordPtr tabPtr;
631       jam();
632       tabPtr.i = signal->theData[1];
633       ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
634       tabPtr.p->tabUpdateState = TabRecord::US_ADD_TABLE_MASTER;
635       tableUpdateLab(signal, tabPtr);
636       return;
637       break;
638     }
639   case DihContinueB::ZDIH_ADD_TABLE_MASTER:
640     {
641       jam();
642       addTable_closeConf(signal, signal->theData[1]);
643       return;
644     }
645   case DihContinueB::ZADD_TABLE_SLAVE_PAGES:
646     {
647       TabRecordPtr tabPtr;
648       jam();
649       tabPtr.i = signal->theData[1];
650       ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
651       tabPtr.p->tabUpdateState = TabRecord::US_ADD_TABLE_SLAVE;
652       tableUpdateLab(signal, tabPtr);
653       return;
654     }
655   case DihContinueB::ZDIH_ADD_TABLE_SLAVE:
656     {
657       ndbabort();
658     }
659   case DihContinueB::ZSTART_GCP:
660     jam();
661 #ifndef NO_GCP
662     startGcpLab(signal);
663 #endif
664     return;
665     break;
666   case DihContinueB::ZCOPY_GCI:{
667     jam();
668     CopyGCIReq::CopyReason reason = (CopyGCIReq::CopyReason)signal->theData[1];
669     ndbrequire(c_copyGCIMaster.m_copyReason == reason);
670 
671     // set to idle, to be able to reuse method
672     c_copyGCIMaster.m_copyReason = CopyGCIReq::IDLE;
673     copyGciLab(signal, reason);
674     return;
675   }
676     break;
677   case DihContinueB::ZEMPTY_VERIFY_QUEUE:
678     jam();
679     emptyverificbuffer(signal, signal->theData[1], true);
680     return;
681     break;
682   case DihContinueB::ZCHECK_GCP_STOP:
683     jam();
684 #ifndef NO_GCP
685     checkGcpStopLab(signal);
686 #endif
687     return;
688     break;
689   case DihContinueB::ZREMOVE_NODE_FROM_TABLE:
690     {
691       jam();
692       Uint32 nodeId = signal->theData[1];
693       Uint32 tableId = signal->theData[2];
694       removeNodeFromTables(signal, nodeId, tableId);
695       return;
696     }
697   case DihContinueB::ZCOPY_NODE:
698     {
699       jam();
700       Uint32 tableId = signal->theData[1];
701       copyNodeLab(signal, tableId);
702       return;
703     }
704   case DihContinueB::ZTO_START_COPY_FRAG:
705     {
706       jam();
707       Uint32 takeOverPtrI = signal->theData[1];
708       startNextCopyFragment(signal, takeOverPtrI);
709       return;
710     }
711   case DihContinueB::ZINVALIDATE_NODE_LCP:
712     {
713       jam();
714       const Uint32 nodeId = signal->theData[1];
715       const Uint32 tableId = signal->theData[2];
716       invalidateNodeLCP(signal, nodeId, tableId);
717       return;
718     }
719   case DihContinueB::ZINITIALISE_RECORDS:
720     jam();
721     initialiseRecordsLab(signal,
722 			 signal->theData[1],
723 			 signal->theData[2],
724 			 signal->theData[3]);
725     return;
726     break;
727   case DihContinueB::ZSTART_PERMREQ_AGAIN:
728     jam();
729     nodeRestartPh2Lab2(signal);
730     return;
731     break;
732   case DihContinueB::SwitchReplica:
733     {
734       jam();
735       const Uint32 nodeId = signal->theData[1];
736       const Uint32 tableId = signal->theData[2];
737       const Uint32 fragNo = signal->theData[3];
738       switchReplica(signal, nodeId, tableId, fragNo);
739       return;
740     }
741   case DihContinueB::ZSEND_ADD_FRAG:
742     {
743       jam();
744       Uint32 takeOverPtrI = signal->theData[1];
745       toCopyFragLab(signal, takeOverPtrI);
746       return;
747     }
748   case DihContinueB::ZSEND_START_TO:
749     {
750       jam();
751       Ptr<TakeOverRecord> takeOverPtr;
752       c_takeOverPool.getPtr(takeOverPtr, signal->theData[1]);
753       sendStartTo(signal, takeOverPtr);
754       return;
755     }
756   case DihContinueB::ZSEND_UPDATE_TO:
757     {
758       jam();
759       Ptr<TakeOverRecord> takeOverPtr;
760       c_takeOverPool.getPtr(takeOverPtr, signal->theData[1]);
761       sendUpdateTo(signal, takeOverPtr);
762       return;
763     }
764   case DihContinueB::WAIT_DROP_TAB_WRITING_TO_FILE:{
765     jam();
766     TabRecordPtr tabPtr;
767     tabPtr.i = signal->theData[1];
768     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
769     waitDropTabWritingToFile(signal, tabPtr);
770     return;
771   }
772   case DihContinueB::ZTO_START_FRAGMENTS:
773   {
774     TakeOverRecordPtr takeOverPtr;
775     c_takeOverPool.getPtr(takeOverPtr, signal->theData[1]);
776     nr_start_fragments(signal, takeOverPtr);
777     return;
778   }
779   case DihContinueB::ZWAIT_OLD_SCAN:
780   {
781     jam();
782     wait_old_scan(signal);
783     return;
784   }
785   case DihContinueB::ZLCP_TRY_LOCK:
786   {
787     jam();
788     Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
789     Callback c = { safe_cast(&Dbdih::lcpFragmentMutex_locked),
790                    signal->theData[1] };
791     ndbrequire(mutex.trylock(c, false));
792     return;
793   }
794   case DihContinueB::ZTO_START_LOGGING:
795   {
796     jam();
797     TakeOverRecordPtr takeOverPtr;
798     c_takeOverPool.getPtr(takeOverPtr, signal->theData[1]);
799     nr_start_logging(signal, takeOverPtr);
800     return;
801   }
802   case DihContinueB::ZGET_TABINFO:
803   {
804     jam();
805     getTabInfo(signal);
806     return;
807   }
808   case DihContinueB::ZGET_TABINFO_SEND:
809   {
810     jam();
811     TabRecordPtr tabPtr;
812     tabPtr.i = signal->theData[1];
813     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
814     getTabInfo_send(signal, tabPtr);
815     return;
816   }
817   case DihContinueB::ZDEQUEUE_LCP_REP:
818   {
819     jam();
820     dequeue_lcp_rep(signal);
821     return;
822   }
823   }
824 
825   ndbabort();
826 }//Dbdih::execCONTINUEB()
827 
/**
 * Input to unpack functions is the v1 or v2 format stored in the cdata
 * array of integers.
 * Output of pack functions is the v1 or v2 format stored in the cdata
 * array of integers.
 */

/**
 * Per-node status codes used by the packed (v2) Sysfile format: stored
 * in 3 of the 4 bits kept per node (the 4th bit flags that the node's
 * GCI follows in non-packed form). See pack_sysfile_format_v2.
 */
enum DataNodeStatusPacked
{
  NODE_ACTIVE = 0,
  NODE_ACTIVE_NODE_DOWN = 1,
  NODE_CONFIGURED = 2,
  NODE_UNDEFINED = 3
};
842 
843 void
pack_sysfile_format_v2(void)844 Dbdih::pack_sysfile_format_v2(void)
845 {
846   /**
847    * Format for COPY_GCIREQ v2:
848    * --------------------------
849    * 1) MAGIC_v2
850    * 2) m_max_node_id
851    * 3) Total size in words of packed format
852    * 4) numGCIs (number of GCIs in non-packed form)
853    * 5) numNodeGroups (node groups in non-packed form)
854    * 6) Number of replicas
855    * 7) systemRestartBits
856    * 8) m_restart_seq
857    * 9) keepGCI
858    * 10) oldestRestorableGCI
859    * 11) newestRestorabeGCI
860    * 12) latestLCP_ID
861    * 13) lcpActive bits (m_max_node_id bits)
862    * 14) nodeStatus 4 bits * m_max_node_id
863    *     3 bits is DataNodeStatusPacked
864    *     1 bit is set if GCI is non-packed form
865    * 15) GCIs in non-packed form
866    * 16) Node group bit (m_max_node_id bits)
867    * 17) Node groups in non-packed form (16 bits per node group)
868    */
869 #ifdef VM_TRACE
870   memset(SYSFILE->takeOver, 0, sizeof(SYSFILE->takeOver));
871   for (Uint32 i = 0; i < MAX_NDB_NODES; i++)
872   {
873     Sysfile::ActiveStatus active_status = (Sysfile::ActiveStatus)
874       SYSFILE->getNodeStatus(i, SYSFILE->nodeStatus);
875     ndbrequire(active_status != Sysfile::NS_ActiveMissed_2);
876     ndbrequire(active_status != Sysfile::NS_ActiveMissed_3);
877     ndbrequire(active_status != Sysfile::NS_TakeOver);
878     ndbrequire(active_status != Sysfile::NS_NotActive_TakenOver);
879   }
880 #endif
881   Uint32 index = 0;
882 
883   std::memcpy(&cdata[index], Sysfile::MAGIC_v2, Sysfile::MAGIC_SIZE_v2);
884   static_assert(Sysfile::MAGIC_SIZE_v2 % sizeof(Uint32) == 0, "");
885   index += Sysfile::MAGIC_SIZE_v2 / sizeof(Uint32);
886 
887   ndbrequire(index == 2);
888   cdata[index] = m_max_node_id;
889   index++;
890 
891   ndbrequire(index == 3);
892   const Uint32 index_cdata_size_in_words = index;
893   index++;
894 
895   ndbrequire(index == 4);
896   const Uint32 index_numGCIs = index;
897   index++;
898 
899   ndbrequire(index == 5);
900   const Uint32 index_numNodeGroups = index;
901   index++;
902 
903   ndbrequire(index == 6);
904   const Uint32 index_num_replicas = index;
905   index++;
906 
907   ndbrequire(index == 7);
908   for (Uint32 i = 0; i < 6; i++)
909   {
910     cdata[index] = sysfileData[i];
911     index++;
912   }
913 
914   Uint32 lcp_active_words = ((m_max_node_id) + 31) / 32;
915   ndbrequire(index == 13);
916   for (Uint32 i = 0; i < lcp_active_words; i++)
917   {
918     cdata[index] = SYSFILE->lcpActive[i];
919     index++;
920   }
921   Uint32 data = 0;
922   Uint32 start_bit = 0;
923   const Uint32 index_node_bit_words = index;
924   Uint32 numGCIs = 0;
925   Uint32 node_bit_words = ((m_max_node_id * 4) + 31) / 32;
926   Uint32 indexGCI = index_node_bit_words + node_bit_words;
927   Uint32 expectedGCI = SYSFILE->newestRestorableGCI;
928   for (Uint32 i = 1; i <= m_max_node_id; i++)
929   {
930     Sysfile::ActiveStatus active_status = (Sysfile::ActiveStatus)
931       SYSFILE->getNodeStatus(i, SYSFILE->nodeStatus);
932     Uint32 bits = 0;
933     Uint32 diff = 0;
934     Uint32 nodeGCI = SYSFILE->lastCompletedGCI[i];
935     switch (active_status)
936     {
937       case Sysfile::NS_Active:
938       {
939         jamDebug();
940         bits = NODE_ACTIVE;
941         if (nodeGCI != expectedGCI)
942         {
943           jamDebug();
944           diff = 1;
945         }
946         break;
947       }
948       case Sysfile::NS_ActiveMissed_1:
949       case Sysfile::NS_NotActive_NotTakenOver:
950       {
951         jamDebug();
952         bits = NODE_ACTIVE_NODE_DOWN;
953         diff = 1;
954         break;
955       }
956       case Sysfile::NS_ActiveMissed_2:
957       case Sysfile::NS_ActiveMissed_3:
958       case Sysfile::NS_NotActive_TakenOver:
959       case Sysfile::NS_TakeOver:
960       {
961         ndbout_c("active_status = %u", active_status);
962         ndbassert(false);
963         jamDebug();
964         bits = NODE_ACTIVE_NODE_DOWN;
965         diff = 1;
966         break;
967       }
968       case Sysfile::NS_NotDefined:
969       {
970         jamDebug();
971         bits = NODE_UNDEFINED;
972         if (nodeGCI != 0)
973         {
974           jamDebug();
975           diff = 1;
976         }
977         break;
978       }
979       case Sysfile::NS_Configured:
980       {
981         jamDebug();
982         bits = NODE_CONFIGURED;
983         if (nodeGCI != expectedGCI)
984         {
985           jamDebug();
986           diff = 1;
987         }
988         break;
989       }
990       default:
991       {
992         ndbout_c("active_status = %u", active_status);
993         ndbabort();
994       }
995     }
996     if (diff != 0)
997     {
998       numGCIs++;
999       bits += 8;
1000       cdata[indexGCI] = nodeGCI;
1001       indexGCI++;
1002     }
1003     data += (bits << start_bit);
1004     ndbrequire(bits < 16);
1005     start_bit += 4;
1006     if (start_bit == 32)
1007     {
1008       jamDebug();
1009       cdata[index] = data;
1010       data = 0;
1011       start_bit = 0;
1012       index++;
1013     }
1014   }
1015   if (start_bit != 0)
1016   {
1017     jamDebug();
1018     cdata[index] = data;
1019     index++;
1020   }
1021   ndbrequire((index + numGCIs) == indexGCI);
1022   Uint32 numNodeGroups = 0;
1023   Uint32 num_replicas = cnoReplicas;
1024   Uint32 replica_index = 0;
1025   index = indexGCI;
1026   Uint32 node_group_bit_words = lcp_active_words;
1027   const Uint32 index_ng = index + node_group_bit_words;
1028   data = 0;
1029   start_bit = 0;
1030   Uint16 *ng_area = (Uint16*)&cdata[index_ng];
1031   Uint32 predicted_ng = 0;
1032   for (Uint32 i = 1; i <= m_max_node_id; i++)
1033   {
1034     Sysfile::ActiveStatus active_status = (Sysfile::ActiveStatus)
1035       SYSFILE->getNodeStatus(i, SYSFILE->nodeStatus);
1036     Uint32 diff = 0;
1037     Uint32 nodeGroup;
1038     switch (active_status)
1039     {
1040       case Sysfile::NS_Active:
1041       case Sysfile::NS_ActiveMissed_1:
1042       case Sysfile::NS_NotActive_NotTakenOver:
1043       case Sysfile::NS_ActiveMissed_2:
1044       case Sysfile::NS_ActiveMissed_3:
1045       case Sysfile::NS_NotActive_TakenOver:
1046       case Sysfile::NS_TakeOver:
1047       {
1048         jamDebug();
1049         nodeGroup = Sysfile::getNodeGroup(i, SYSFILE->nodeGroups);
1050         if (nodeGroup != predicted_ng)
1051         {
1052           jamDebug();
1053           diff = 1;
1054         }
1055         replica_index++;
1056         if (replica_index == num_replicas)
1057         {
1058           jamDebug();
1059           replica_index = 0;
1060           predicted_ng++;
1061         }
1062         break;
1063       }
1064       case Sysfile::NS_NotDefined:
1065       {
1066         jamDebug();
1067         /* If a node is not configured the node group will never be used.
1068          * Still the node group is expected to be NO_NODE_GROUP_ID.
1069          * Sometimes it seems that node group is wrongly set to zero.
1070          * While this is not critical, it should be examined why.
1071          */
1072         nodeGroup = Sysfile::getNodeGroup(i, SYSFILE->nodeGroups);
1073         ndbrequire(nodeGroup == NO_NODE_GROUP_ID ||
1074                    nodeGroup == 0);
1075         break;
1076       }
1077       case Sysfile::NS_Configured:
1078       {
1079         jamDebug();
1080         nodeGroup = Sysfile::getNodeGroup(i, SYSFILE->nodeGroups);
1081         if (nodeGroup != NO_NODE_GROUP_ID)
1082         {
1083           jamDebug();
1084           ndbabort();
1085           diff = 1;
1086         }
1087         break;
1088       }
1089       default:
1090       {
1091         ndbabort();
1092       }
1093     }
1094     if (diff != 0)
1095     {
1096       jamDebug();
1097       ng_area[numNodeGroups] = nodeGroup;
1098       numNodeGroups++;
1099       data += (1 << start_bit);
1100     }
1101     start_bit++;
1102     if (start_bit == 32)
1103     {
1104       jamDebug();
1105       cdata[index] = data;
1106       start_bit = 0;
1107       index++;
1108     }
1109   }
1110   if (start_bit != 0)
1111   {
1112     jamDebug();
1113     cdata[index] = data;
1114     index++;
1115   }
1116   ndbrequire(index == index_ng);
1117   cdata_size_in_words = index_ng + ((numNodeGroups + 1)/2);
1118   cdata[index_cdata_size_in_words] = cdata_size_in_words;
1119   cdata[index_numGCIs] = numGCIs;
1120   cdata[index_numNodeGroups] = numNodeGroups;
1121   cdata[index_num_replicas] = num_replicas;
1122 }
1123 
/**
 * Pack the Sysfile (SYSFILE) into cdata[] using the legacy v1 layout.
 * The v1 format is fixed-size and supports at most 48 data nodes:
 *   cdata[0..5]   : sysfileData header words
 *   cdata[6..54]  : lastCompletedGCI, one word per node id 0..48
 *   cdata[55..61] : nodeStatus packed bit array (7 words)
 *   cdata[62..74] : node group per node, 8 bits each (52 bytes)
 *   cdata[75..87] : take-over node per node, 8 bits each (52 bytes)
 *   cdata[88..89] : lcpActive node bitmask (2 words)
 */
void
Dbdih::pack_sysfile_format_v1(void)
{
  /* The v1 layout cannot represent node ids above 48. */
  ndbrequire(m_max_node_id <= 48);
  for (Uint32 i = 0; i < 6; i++)
    cdata[i] = sysfileData[i];
  for (Uint32 i = 0; i < 49; i++)
    cdata[6 + i] = SYSFILE->lastCompletedGCI[i];
  for (Uint32 i = 0; i < 7; i++)
    cdata[55 + i] = SYSFILE->nodeStatus[i];

  /* Node groups: clear the 52-byte area, then one byte per node.
   * NO_NODE_GROUP_ID is truncated to a Uint8 here (255);
   * unpack_sysfile_format_v1 widens 255 back to NO_NODE_GROUP_ID. */
  memset(&cdata[62], 0, 52);
  for (Uint32 i = 1; i <= 48; i++)
  {
    NodeId ng = Sysfile::getNodeGroup(i, SYSFILE->nodeGroups);
    Sysfile::setNodeGroup_v1(i, &cdata[62], Uint8(ng));
  }

  /* Take-over nodes: same 52-byte, one-byte-per-node encoding. */
  memset(&cdata[75], 0, 52);
  for (Uint32 i = 1; i <= 48; i++)
  {
    NodeId nodeId = Sysfile::getTakeOverNode(i, SYSFILE->takeOver);
    ndbrequire(nodeId <= 48);
    Sysfile::setTakeOverNode_v1(i, &cdata[75], Uint8(nodeId));
  }

  for (Uint32 i = 0; i < 2; i++)
    cdata[88 + i] = SYSFILE->lcpActive[i];
}
1153 
/**
 * Unpack cdata[] in v2 format into SYSFILE.  Mirror image of
 * pack_sysfile_format_v2: after the magic, max node id and total size
 * come the counts (numGCIs, numNodeGroups, num_replicas), 6 header
 * words, the lcpActive bitmask, then 4 packed bits per node
 * (3 state bits + 1 "GCI stored explicitly" bit), the non-packed GCIs,
 * one node-group diff bit per node and finally the non-packed node
 * groups (16 bits each).
 *
 * @param set_max_node_id  when true, adopt the max node id stored in
 *                         the packed data as m_max_node_id.
 */
void
Dbdih::unpack_sysfile_format_v2(bool set_max_node_id)
{
  Uint32 index = 0;
  ndbrequire(std::memcmp(&cdata[index],
                         Sysfile::MAGIC_v2,
                         Sysfile::MAGIC_SIZE_v2) == 0);
  index += Sysfile::MAGIC_SIZE_v2 / sizeof(Uint32);

  Uint32 max_node_id = cdata[index];
  index++;

  cdata_size_in_words = cdata[index];
  index++;

  if (set_max_node_id)
  {
    jam();
    m_max_node_id = max_node_id;
  }

  /* Sizes of the variable parts at the end of the packed data. */
  Uint32 numGCIs = cdata[index];
  index++;

  Uint32 numNodeGroups = cdata[index];
  index++;

  Uint32 num_replicas = cdata[index];
  index++;

  for (Uint32 i = 0; i < 6; i++)
  {
    sysfileData[i] = cdata[index];
    index++;
  }
  Uint32 lcp_active_words = ((max_node_id) + 31) / 32;
  for (Uint32 i = 0; i < lcp_active_words; i++)
  {
    SYSFILE->lcpActive[i] = cdata[index];
    index++;
  }
  Uint32 node_bit_words = ((max_node_id * 4) + 31) / 32;
  Uint32 node_group_words = lcp_active_words;

  /**
   * First pass: 4 bits per node.  The low 3 bits are the packed node
   * state; bit 3 says this node's GCI is stored explicitly in the GCI
   * area that starts right after the node bit words.
   */
  const Uint32 index_node_bit_words = index;
  Uint32 indexGCI = index_node_bit_words + node_bit_words;
  Uint32 start_bit = 0;
  Uint32 newestGCI = SYSFILE->newestRestorableGCI;
  for (Uint32 i = 1; i <= max_node_id; i++)
  {
    Uint32 data = cdata[index];
    Uint32 bits = (data >> start_bit) & 0xF;
    Uint32 gci_bit = bits >> 3;
    Uint32 state_bits = bits & 0x7;
    switch (state_bits)
    {
      case NODE_ACTIVE:
      {
        /* No explicit GCI means the node is at the newest restorable GCI. */
        if (gci_bit != 0)
        {
          jamDebug();
          SYSFILE->lastCompletedGCI[i] = cdata[indexGCI];
          indexGCI++;
        }
        else
        {
          jamDebug();
          SYSFILE->lastCompletedGCI[i] = newestGCI;
        }
        SYSFILE->setNodeStatus(i,
                               SYSFILE->nodeStatus,
                               Sysfile::NS_Active);
        break;
      }
      case NODE_ACTIVE_NODE_DOWN:
      {
        /* Down nodes always carry an explicit GCI (pack sets diff = 1).
         * The packer collapses all "down" states into this one, so it
         * unpacks to NS_ActiveMissed_1. */
        jamDebug();
        ndbrequire(gci_bit != 0);
        SYSFILE->lastCompletedGCI[i] = cdata[indexGCI];
        indexGCI++;
        SYSFILE->setNodeStatus(i,
                               SYSFILE->nodeStatus,
                               Sysfile::NS_ActiveMissed_1);
        break;
      }
      case NODE_CONFIGURED:
      {
        if (gci_bit != 0)
        {
          jamDebug();
          SYSFILE->lastCompletedGCI[i] = cdata[indexGCI];
          indexGCI++;
        }
        else
        {
          jamDebug();
          SYSFILE->lastCompletedGCI[i] = newestGCI;
        }
        SYSFILE->setNodeStatus(i,
                               SYSFILE->nodeStatus,
                               Sysfile::NS_Configured);
        break;
      }
      case NODE_UNDEFINED:
      {
        /* Undefined nodes default to GCI 0, not the newest GCI. */
        if (gci_bit != 0)
        {
          jamDebug();
          SYSFILE->lastCompletedGCI[i] = cdata[indexGCI];
          indexGCI++;
        }
        else
        {
          jamDebug();
          SYSFILE->lastCompletedGCI[i] = 0;
        }
        SYSFILE->setNodeStatus(i,
                               SYSFILE->nodeStatus,
                               Sysfile::NS_NotDefined);
        break;
      }
      default:
      {
        ndbabort();
      }
    }
    start_bit += 4;
    if (start_bit == 32)
    {
      index++;
      start_bit = 0;
    }
  }
  if (start_bit != 0)
  {
    index++;
  }
  /* Verify we consumed exactly the advertised node-bit and GCI areas. */
  ndbrequire(index == (index_node_bit_words + node_bit_words));
  ndbrequire((index + numGCIs) == indexGCI);
  index = indexGCI;

  /**
   * Second pass: one diff bit per node.  Bit clear means the node's
   * group follows the prediction (replicas fill node groups in node id
   * order); bit set means the group is read from the explicit 16-bit
   * entries after the diff-bit words.
   */
  const Uint32 index_ng = index + node_group_words;
  Uint16* ng_array = (Uint16*)&cdata[index_ng];
  start_bit = 0;
  Uint32 replica_index = 0;
  Uint32 ng_index = 0;
  Uint32 current_ng = 0;
  for (Uint32 i = 1; i <= max_node_id; i++)
  {
    Sysfile::ActiveStatus active_status = (Sysfile::ActiveStatus)
      SYSFILE->getNodeStatus(i, SYSFILE->nodeStatus);
    Uint32 data = cdata[index];
    Uint32 ng_bit = (data >> start_bit) & 0x1;
    Uint32 nodeGroup = NO_NODE_GROUP_ID;
    switch (active_status)
    {
      case Sysfile::NS_Active:
      case Sysfile::NS_ActiveMissed_1:
      {
        if (ng_bit == 0)
        {
          jamDebug();
          nodeGroup = current_ng;
          replica_index++;
          if (replica_index == num_replicas)
          {
            jamDebug();
            replica_index = 0;
            current_ng++;
          }
        }
        else
        {
          jamDebug();
          nodeGroup = (Uint32) ng_array[ng_index];
          ng_index++;
        }
        break;
      }
      case Sysfile::NS_NotDefined:
      case Sysfile::NS_Configured:
      {
        /* Nodes without an assigned group get NO_NODE_GROUP_ID. */
        jamDebug();
        nodeGroup = NO_NODE_GROUP_ID;
        break;
      }
      default:
      {
        ndbabort();
      }
    }
    SYSFILE->setNodeGroup(i,
                          SYSFILE->nodeGroups,
                          nodeGroup);
    start_bit++;
    if (start_bit == 32)
    {
      jamDebug();
      index++;
      start_bit = 0;
    }
  }
  if (start_bit != 0)
  {
    jamDebug();
    index++;
  }
  /* Final consistency checks against the counts stored in the header. */
  ndbrequire(index == index_ng);
  ndbrequire(ng_index == numNodeGroups);
  index = index_ng + ((ng_index + 1)/2);
  ndbrequire(cdata_size_in_words == index);
}
1365 
/**
 * Unpack cdata[] in the legacy v1 layout (max 48 nodes) into SYSFILE.
 * The layout is the mirror of pack_sysfile_format_v1; see that function
 * for the exact word offsets.
 *
 * @param set_max_node_id  when true, derive m_max_node_id as the highest
 *                         node id whose status is not NS_NotDefined.
 */
void
Dbdih::unpack_sysfile_format_v1(bool set_max_node_id)
{
  jam();
  for (Uint32 i = 0; i < 6; i++)
  {
    sysfileData[i] = cdata[i];
  }
  for (Uint32 i = 0; i < 49; i++)
  {
    SYSFILE->lastCompletedGCI[i] = cdata[6 + i];
  }
  for (Uint32 i = 0; i < 7; i++)
  {
    SYSFILE->nodeStatus[i] = cdata[55 + i];
  }

  memset(SYSFILE->nodeGroups, 0, sizeof(SYSFILE->nodeGroups));
  for (NodeId i = 1; i <= 48; i++)
  {
    NodeId ng = Sysfile::getNodeGroup_v1(i, &cdata[62]);
    /* v1 stores NO_NODE_GROUP_ID truncated to a byte (255); widen back. */
    if (ng == 255)
      ng = NO_NODE_GROUP_ID;
    Sysfile::setNodeGroup(i, SYSFILE->nodeGroups, ng);
  }

  memset(SYSFILE->takeOver, 0, sizeof(SYSFILE->takeOver));
  for (NodeId i = 1; i <= 48; i++)
  {
    NodeId nodeId = Sysfile::getTakeOverNode_v1(i, &cdata[75]);
    /* NOTE(review): the take-over node is stored via setNodeGroup rather
     * than setTakeOverNode (the packer uses getTakeOverNode for the reverse
     * direction) — presumably both fields share the same packed encoding;
     * confirm against Sysfile.hpp. */
    Sysfile::setNodeGroup(i, SYSFILE->takeOver, nodeId);
  }

  for (Uint32 i = 0; i < 2; i++)
  {
    SYSFILE->lcpActive[i] = cdata[88 + i];
  }
  if (set_max_node_id)
  {
    /* Highest defined node id wins; the loop leaves the last match. */
    for (Uint32 i = 1; i <= 48; i++)
    {
      if (Sysfile::getNodeStatus(i, SYSFILE->nodeStatus) !=
          Sysfile::NS_NotDefined)
      {
        jamLine((Uint16)i);
        m_max_node_id = i;
      }
    }
  }
  ndbrequire(m_max_node_id <= 48);
}
1417 
/**
 * COPY_GCIREQ: the master distributes its Sysfile to all nodes.
 * The payload arrives either as one long-signal section (v2 format,
 * newer versions) or as a sequence of fixed-size fragments carried in
 * the signal data (v1 format).  Once the complete Sysfile is received,
 * non-master nodes unpack it (preserving their own restart sequence),
 * local LCP/GCP/restart state is updated according to the copy reason,
 * and the restart info is written back to disk via the copy-GCI files.
 */
void Dbdih::execCOPY_GCIREQ(Signal* signal)
{
  CopyGCIReq * const copyGCI = (CopyGCIReq *)&signal->theData[0];
  jamEntry();
  if (ERROR_INSERTED(7241))
  {
    jam();
    g_eventLogger->info("Delayed COPY_GCIREQ 5s");
    /* Re-deliver to ourselves after 5s, keeping any section attached. */
    if (ndbd_send_node_bitmask_in_section(getNodeInfo(cmasterNodeId).m_version))
    {
      SectionHandle handle(this, signal);
      sendSignalWithDelay(reference(), GSN_COPY_GCIREQ,
                          signal, 5000,
                          signal->getLength(),
                          &handle);
    }
    else
    {
      sendSignalWithDelay(reference(), GSN_COPY_GCIREQ,
                          signal, 5000,
                          signal->getLength());
    }
    return;
  }

  /* Only a GLOBAL_CHECKPOINT copy may overlap an ongoing one; otherwise
   * the slave state must be idle. */
  CopyGCIReq::CopyReason reason = (CopyGCIReq::CopyReason)copyGCI->copyReason;
  ndbrequire((reason == CopyGCIReq::GLOBAL_CHECKPOINT &&
              c_copyGCISlave.m_copyReason == CopyGCIReq::GLOBAL_CHECKPOINT) ||
             c_copyGCISlave.m_copyReason == CopyGCIReq::IDLE);
  ndbrequire(reason != CopyGCIReq::IDLE);
  bool isdone = true;
  bool v2_format = true;

  Uint32 num_sections = signal->getNoOfSections();
  SectionHandle handle(this, signal);
  if (num_sections > 0)
  {
    /* v2: whole Sysfile arrives in a single section. */
    jam();
    ndbrequire(ndbd_send_node_bitmask_in_section(
      getNodeInfo(cmasterNodeId).m_version));
    SegmentedSectionPtr ptr;
    ndbrequire(num_sections == 1);
    handle.getSection(ptr, 0);
    ndbrequire(ptr.sz <= (sizeof(cdata)/4));
    copy(cdata, ptr);
    cdata_size_in_words = ptr.sz;
    releaseSections(handle);
  }
  else
  {
    /* v1: accumulate DATA_SIZE-word fragments in cdata[] until the
     * whole fixed-size Sysfile has been received. */
    jam();
    const Uint32 tstart = copyGCI->startWord;
    v2_format = false;
    ndbrequire(cmasterdihref == signal->senderBlockRef()) ;
    ndbrequire(c_copyGCISlave.m_expectedNextWord == tstart);
    isdone = (tstart + CopyGCIReq::DATA_SIZE) >= Sysfile::SYSFILE_SIZE32_v1;

    arrGuard(tstart + CopyGCIReq::DATA_SIZE, sizeof(sysfileData)/4);
    for(Uint32 i = 0; i<CopyGCIReq::DATA_SIZE; i++)
      cdata[tstart+i] = copyGCI->data[i];
  }
  if (isdone)
  {
    jam();
    c_copyGCISlave.m_expectedNextWord = 0;
  }
  else
  {
    /* More fragments to come; wait for the next one. */
    jam();
    c_copyGCISlave.m_expectedNextWord += CopyGCIReq::DATA_SIZE;
    return;
  }
  if (cmasterdihref != reference())
  {
    /* Non-master: adopt the master's Sysfile but keep our own restart
     * sequence number. */
    Uint32 tmp= SYSFILE->m_restart_seq;
    if (v2_format)
    {
      jam();
      unpack_sysfile_format_v2(false);
    }
    else
    {
      jam();
      unpack_sysfile_format_v1(false);
    }
    SYSFILE->m_restart_seq = tmp;
    if (c_set_initial_start_flag)
    {
      jam();
      Sysfile::setInitialStartOngoing(SYSFILE->systemRestartBits);
    }
  }

  c_copyGCISlave.m_copyReason = reason;
  c_copyGCISlave.m_senderRef  = signal->senderBlockRef();
  c_copyGCISlave.m_senderData = copyGCI->anyData;

  CRASH_INSERTION2(7020, reason==CopyGCIReq::LOCAL_CHECKPOINT);
  CRASH_INSERTION2(7008, reason==CopyGCIReq::GLOBAL_CHECKPOINT);

  /* A new restorable GCI may allow the local redo log tail to be cut. */
  if (m_local_lcp_state.check_cut_log_tail(c_newest_restorable_gci))
  {
    jam();

#ifdef NOT_YET
    LcpCompleteRep* rep = (LcpCompleteRep*)signal->getDataPtrSend();
    rep->nodeId = getOwnNodeId();
    rep->blockNo = 0;
    rep->lcpId = m_local_lcp_state.m_start_lcp_req.lcpId;
    rep->keepGci = m_local_lcp_state.m_keep_gci;
    sendSignal(DBLQH_REF, GSN_LCP_COMPLETE_REP, signal,
               LcpCompleteRep::SignalLength, JBB);

    warningEvent("CUT LOG TAIL: reason: %u lcp: %u m_keep_gci: %u stop: %u",
                 reason,
                 m_local_lcp_state.m_start_lcp_req.lcpId,
                 m_local_lcp_state.m_keep_gci,
                 m_local_lcp_state.m_stop_gci);
#endif
    m_local_lcp_state.reset();
  }

  /* -------------------------------------------------------------------------*/
  /*     WE SET THE REQUESTER OF THE COPY GCI TO THE CURRENT MASTER. IF THE   */
  /*     CURRENT MASTER WE DO NOT WANT THE NEW MASTER TO RECEIVE CONFIRM OF   */
  /*     SOMETHING HE HAS NOT SENT. THE TAKE OVER MUST BE CAREFUL.            */
  /* -------------------------------------------------------------------------*/
  /* Per-reason state updates; ok guards against unhandled enum values. */
  bool ok = false;
  switch(reason){
  case CopyGCIReq::IDLE:
    ok = true;
    jam();
    ndbabort();
  case CopyGCIReq::LOCAL_CHECKPOINT: {
    ok = true;
    jam();
    c_lcpState.setLcpStatus(LCP_COPY_GCI, __LINE__);
    c_lcpState.m_masterLcpDihRef = cmasterdihref;
    setNodeActiveStatus();
    break;
  }
  case CopyGCIReq::RESTART: {
    /* System restart: initialize GCI state from the restored Sysfile. */
    ok = true;
    jam();
    Uint32 newest = SYSFILE->newestRestorableGCI;
    m_micro_gcp.m_old_gci = Uint64(newest) << 32;
    crestartGci = newest;
    c_newest_restorable_gci = newest;
    Sysfile::setRestartOngoing(SYSFILE->systemRestartBits);
    m_micro_gcp.m_current_gci = Uint64(newest + 1) << 32;
    setNodeActiveStatus();
    setNodeGroups();
    if ((Sysfile::getLCPOngoing(SYSFILE->systemRestartBits))) {
      jam();
      /* -------------------------------------------------------------------- */
      //  IF THERE WAS A LOCAL CHECKPOINT ONGOING AT THE CRASH MOMENT WE WILL
      //    INVALIDATE THAT LOCAL CHECKPOINT.
      /* -------------------------------------------------------------------- */
      invalidateLcpInfoAfterSr(signal);
    }//if

    if (m_micro_gcp.m_enabled == false &&
        m_micro_gcp.m_master.m_time_between_gcp)
    {
      /**
       * Micro GCP is disabled...but configured...
       */
      jam();
      m_micro_gcp.m_enabled = true;
      UpgradeProtocolOrd * ord = (UpgradeProtocolOrd*)signal->getDataPtrSend();
      ord->type = UpgradeProtocolOrd::UPO_ENABLE_MICRO_GCP;
      EXECUTE_DIRECT(QMGR,GSN_UPGRADE_PROTOCOL_ORD,signal,signal->getLength());
    }
    break;
  }
  case CopyGCIReq::GLOBAL_CHECKPOINT: {
    ok = true;
    jam();

    if (m_gcp_save.m_state == GcpSave::GCP_SAVE_COPY_GCI)
    {
      jam();
      /**
       * This must be master take over...and it already running...
       */
      ndbrequire(c_newest_restorable_gci == SYSFILE->newestRestorableGCI);
      m_gcp_save.m_master_ref = c_copyGCISlave.m_senderRef;
      return;
    }

    if (c_newest_restorable_gci == SYSFILE->newestRestorableGCI)
    {
      jam();

      /**
       * This must be master take over...and it already complete...
       */
      m_gcp_save.m_master_ref = c_copyGCISlave.m_senderRef;
      c_copyGCISlave.m_copyReason = CopyGCIReq::IDLE;
      signal->theData[0] = c_copyGCISlave.m_senderData;
      sendSignal(m_gcp_save.m_master_ref, GSN_COPY_GCICONF, signal, 1, JBB);
      return;
    }

    ndbrequire(m_gcp_save.m_state == GcpSave::GCP_SAVE_CONF);
    m_gcp_save.m_state = GcpSave::GCP_SAVE_COPY_GCI;
    m_gcp_save.m_master_ref = c_copyGCISlave.m_senderRef;
    c_newest_restorable_gci = SYSFILE->newestRestorableGCI;
    setNodeActiveStatus();
    break;
  }//if
  case CopyGCIReq::INITIAL_START_COMPLETED:
    ok = true;
    jam();
    break;
  case CopyGCIReq::RESTART_NR:
    jam();
    setNodeGroups();
    /**
     * We dont really need to make anything durable here...skip it
     *
     * We have received the current setting of node groups from the
     * master node, we are thus ready to setup multi sockets to our
     * neighbour nodes in the same node group.
     *
     * We should only reach here in the context of node restarts,
     * initial and normal ones.
     */
    ndbrequire(cstarttype == NodeState::ST_INITIAL_NODE_RESTART ||
               cstarttype == NodeState::ST_NODE_RESTART);
    jam();
    m_set_up_multi_trp_in_node_restart = true;
    signal->theData[0] = reference();
    sendSignal(QMGR_REF, GSN_SET_UP_MULTI_TRP_REQ, signal, 1, JBB);
    return;
  }
  ndbrequire(ok);

  CRASH_INSERTION(7183);

  if (ERROR_INSERTED(7185) && reason==CopyGCIReq::GLOBAL_CHECKPOINT)
  {
    jam();
    return;
  }
#ifdef GCP_TIMER_HACK
  if (reason == CopyGCIReq::GLOBAL_CHECKPOINT) {
    jam();
    globalData.gcp_timer_copygci[0] = NdbTick_getCurrentTicks();
  }
#endif

  /* ----------------------------------------------------------------------- */
  /*     WE START BY TRYING TO OPEN THE FIRST RESTORABLE GCI FILE.           */
  /* ----------------------------------------------------------------------- */
  FileRecordPtr filePtr;
  filePtr.i = crestartInfoFile[0];
  ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
  if (filePtr.p->fileStatus == FileRecord::OPEN) {
    jam();
    openingCopyGciSkipInitLab(signal, filePtr);
    return;
  }//if
  openFileRw(signal, filePtr);
  filePtr.p->reqStatus = FileRecord::OPENING_COPY_GCI;
  return;
}//Dbdih::execCOPY_GCIREQ()
1685 
1686 void
execSET_UP_MULTI_TRP_CONF(Signal * signal)1687 Dbdih::execSET_UP_MULTI_TRP_CONF(Signal *signal)
1688 {
1689   if (m_set_up_multi_trp_in_node_restart)
1690   {
1691     jam();
1692     g_eventLogger->info("Completed setting up multiple transporters to nodes"
1693                         " in the same node group");
1694     complete_restart_nr(signal);
1695   }
1696   else
1697   {
1698     jam();
1699     /**
1700      * Newly created multi sockets between nodes in a new nodegroup is now
1701      * created. No need to do anything more here.
1702      */
1703   }
1704 }
1705 
1706 void
complete_restart_nr(Signal * signal)1707 Dbdih::complete_restart_nr(Signal* signal)
1708 {
1709   jam();
1710   c_copyGCISlave.m_copyReason = CopyGCIReq::IDLE;
1711   signal->theData[0] = c_copyGCISlave.m_senderData;
1712   sendSignal(c_copyGCISlave.m_senderRef, GSN_COPY_GCICONF, signal, 1, JBB);
1713 }
1714 
execDICTSTARTCONF(Signal * signal)1715 void Dbdih::execDICTSTARTCONF(Signal* signal)
1716 {
1717   jamEntry();
1718   Uint32 nodeId = refToNode(signal->getSendersBlockRef());
1719   if (nodeId != getOwnNodeId()) {
1720     jam();
1721     nodeDictStartConfLab(signal, nodeId);
1722   } else {
1723     jam();
1724     dictStartConfLab(signal);
1725   }//if
1726 }//Dbdih::execDICTSTARTCONF()
1727 
/**
 * NDBFS completed a file close.  Mark the file record closed/idle and
 * route to the continuation matching the state that requested the close.
 */
void Dbdih::execFSCLOSECONF(Signal* signal)
{
  FileRecordPtr filePtr;
  jamEntry();
  filePtr.i = signal->theData[0];
  ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
  filePtr.p->fileStatus = FileRecord::CLOSED;
  /* Consume the pending request state before dispatching. */
  FileRecord::ReqStatus status = filePtr.p->reqStatus;
  filePtr.p->reqStatus = FileRecord::IDLE;
  switch (status) {
  case FileRecord::CLOSING_GCP:
    jam();
    closingGcpLab(signal, filePtr);
    break;
  case FileRecord::CLOSING_GCP_CRASH:
    jam();
    closingGcpCrashLab(signal, filePtr);
    break;
  case FileRecord::CLOSING_TABLE_CRASH:
    jam();
    closingTableCrashLab(signal, filePtr);
    break;
  case FileRecord::CLOSING_TABLE_SR:
    jam();
    closingTableSrLab(signal, filePtr);
    break;
  case FileRecord::TABLE_CLOSE:
    jam();
    tableCloseLab(signal, filePtr);
    break;
  case FileRecord::TABLE_CLOSE_DELETE:
    jam();
    tableDeleteLab(signal, filePtr);
    break;
  default:
    ndbabort();
  }//switch
  return;
}//Dbdih::execFSCLOSECONF()
1767 
execFSCLOSEREF(Signal * signal)1768 void Dbdih::execFSCLOSEREF(Signal* signal)
1769 {
1770   FileRecordPtr filePtr;
1771   jamEntry();
1772   filePtr.i = signal->theData[0];
1773   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
1774   FileRecord::ReqStatus status = filePtr.p->reqStatus;
1775   filePtr.p->reqStatus = FileRecord::IDLE;
1776   switch (status) {
1777   case FileRecord::CLOSING_GCP:
1778     jam();
1779     break;
1780   case FileRecord::CLOSING_GCP_CRASH:
1781     jam();
1782     closingGcpCrashLab(signal, filePtr);
1783     return;
1784   case FileRecord::CLOSING_TABLE_CRASH:
1785     jam();
1786     closingTableCrashLab(signal, filePtr);
1787     return;
1788   case FileRecord::CLOSING_TABLE_SR:
1789     jam();
1790     break;
1791   case FileRecord::TABLE_CLOSE:
1792     jam();
1793     break;
1794   case FileRecord::TABLE_CLOSE_DELETE:
1795     jam();
1796     break;
1797   default:
1798     jam();
1799     break;
1800 
1801   }//switch
1802   {
1803     char msg[100];
1804     sprintf(msg, "File system close failed during FileRecord status %d", (Uint32)status);
1805     fsRefError(signal,__LINE__,msg);
1806   }
1807   return;
1808 }//Dbdih::execFSCLOSEREF()
1809 
/**
 * NDBFS opened a file.  Record the file handle, mark the record open,
 * and continue the state machine that requested the open.
 */
void Dbdih::execFSOPENCONF(Signal* signal)
{
  FileRecordPtr filePtr;
  jamEntry();
  filePtr.i = signal->theData[0];
  ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
  /* theData[1] carries the NDBFS file handle for subsequent requests. */
  filePtr.p->fileRef = signal->theData[1];
  filePtr.p->fileStatus = FileRecord::OPEN;
  FileRecord::ReqStatus status = filePtr.p->reqStatus;
  filePtr.p->reqStatus = FileRecord::IDLE;
  switch (status) {
  case FileRecord::CREATING_GCP:
    jam();
    creatingGcpLab(signal, filePtr);
    break;
  case FileRecord::OPENING_COPY_GCI:
    jam();
    openingCopyGciSkipInitLab(signal, filePtr);
    break;
  case FileRecord::CREATING_COPY_GCI:
    jam();
    openingCopyGciSkipInitLab(signal, filePtr);
    break;
  case FileRecord::OPENING_GCP:
    jam();
    openingGcpLab(signal, filePtr);
    break;
  case FileRecord::OPENING_TABLE:
    jam();
    openingTableLab(signal, filePtr);
    break;
  case FileRecord::TABLE_CREATE:
    jam();
    tableCreateLab(signal, filePtr);
    break;
  case FileRecord::TABLE_OPEN_FOR_DELETE:
    jam();
    tableOpenLab(signal, filePtr);
    break;
  default:
    ndbabort();
  }//switch
  return;
}//Dbdih::execFSOPENCONF()
1854 
execFSOPENREF(Signal * signal)1855 void Dbdih::execFSOPENREF(Signal* signal)
1856 {
1857   FileRecordPtr filePtr;
1858   jamEntry();
1859   filePtr.i = signal->theData[0];
1860   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
1861   FileRecord::ReqStatus status = filePtr.p->reqStatus;
1862   filePtr.p->reqStatus = FileRecord::IDLE;
1863   switch (status) {
1864   case FileRecord::CREATING_GCP:
1865     /* --------------------------------------------------------------------- */
1866     /*   WE DID NOT MANAGE TO CREATE A GLOBAL CHECKPOINT FILE. SERIOUS ERROR */
1867     /*   WHICH CAUSES A SYSTEM RESTART.                                      */
1868     /* --------------------------------------------------------------------- */
1869     jam();
1870     break;
1871   case FileRecord::OPENING_COPY_GCI:
1872     jam();
1873     openingCopyGciErrorLab(signal, filePtr);
1874     return;
1875   case FileRecord::CREATING_COPY_GCI:
1876     jam();
1877     break;
1878   case FileRecord::OPENING_GCP:
1879     jam();
1880     openingGcpErrorLab(signal, filePtr);
1881     return;
1882   case FileRecord::OPENING_TABLE:
1883     jam();
1884     openingTableErrorLab(signal, filePtr);
1885     return;
1886   case FileRecord::TABLE_CREATE:
1887     jam();
1888     break;
1889   case FileRecord::TABLE_OPEN_FOR_DELETE:
1890     jam();
1891     tableDeleteLab(signal, filePtr);
1892     return;
1893   default:
1894     jam();
1895     break;
1896   }//switch
1897   {
1898     char msg[100];
1899     sprintf(msg, "File system open failed during FileRecord status %d", (Uint32)status);
1900     fsRefError(signal,__LINE__,msg);
1901   }
1902   return;
1903 }//Dbdih::execFSOPENREF()
1904 
execFSREADCONF(Signal * signal)1905 void Dbdih::execFSREADCONF(Signal* signal)
1906 {
1907   FileRecordPtr filePtr;
1908   jamEntry();
1909   filePtr.i = signal->theData[0];
1910   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
1911   FileRecord::ReqStatus status = filePtr.p->reqStatus;
1912   filePtr.p->reqStatus = FileRecord::IDLE;
1913   switch (status) {
1914   case FileRecord::READING_GCP:
1915     jam();
1916     readingGcpLab(signal, filePtr);
1917     break;
1918   case FileRecord::READING_TABLE:
1919     jam();
1920     readingTableLab(signal, filePtr);
1921     break;
1922   default:
1923     ndbabort();
1924   }//switch
1925   return;
1926 }//Dbdih::execFSREADCONF()
1927 
execFSREADREF(Signal * signal)1928 void Dbdih::execFSREADREF(Signal* signal)
1929 {
1930   FileRecordPtr filePtr;
1931   jamEntry();
1932   filePtr.i = signal->theData[0];
1933   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
1934   FileRecord::ReqStatus status = filePtr.p->reqStatus;
1935   filePtr.p->reqStatus = FileRecord::IDLE;
1936   switch (status) {
1937   case FileRecord::READING_GCP:
1938     jam();
1939     readingGcpErrorLab(signal, filePtr);
1940     return;
1941   case FileRecord::READING_TABLE:
1942     jam();
1943     readingTableErrorLab(signal, filePtr);
1944     return;
1945   default:
1946     break;
1947   }//switch
1948   {
1949     char msg[100];
1950     sprintf(msg, "File system read failed during FileRecord status %d", (Uint32)status);
1951     fsRefError(signal,__LINE__,msg);
1952   }
1953 }//Dbdih::execFSREADREF()
1954 
execFSWRITECONF(Signal * signal)1955 void Dbdih::execFSWRITECONF(Signal* signal)
1956 {
1957   FileRecordPtr filePtr;
1958   jamEntry();
1959   filePtr.i = signal->theData[0];
1960   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
1961   FileRecord::ReqStatus status = filePtr.p->reqStatus;
1962   filePtr.p->reqStatus = FileRecord::IDLE;
1963   switch (status) {
1964   case FileRecord::WRITING_COPY_GCI:
1965     jam();
1966     writingCopyGciLab(signal, filePtr);
1967     break;
1968   case FileRecord::WRITE_INIT_GCP:
1969     jam();
1970     writeInitGcpLab(signal, filePtr);
1971     break;
1972   case FileRecord::TABLE_WRITE:
1973     jam();
1974     if (ERROR_INSERTED(7235))
1975     {
1976       jam();
1977       filePtr.p->reqStatus = status;
1978       /* Suspend processing of WRITECONFs */
1979       sendSignalWithDelay(reference(), GSN_FSWRITECONF, signal, 1000, signal->getLength());
1980       return;
1981     }
1982     tableWriteLab(signal, filePtr);
1983     break;
1984   default:
1985     ndbabort();
1986   }//switch
1987   return;
1988 }//Dbdih::execFSWRITECONF()
1989 
execFSWRITEREF(Signal * signal)1990 void Dbdih::execFSWRITEREF(Signal* signal)
1991 {
1992   FileRecordPtr filePtr;
1993   jamEntry();
1994   filePtr.i = signal->theData[0];
1995   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
1996   FileRecord::ReqStatus status = filePtr.p->reqStatus;
1997   filePtr.p->reqStatus = FileRecord::IDLE;
1998   switch (status) {
1999   case FileRecord::WRITING_COPY_GCI:
2000     /* --------------------------------------------------------------------- */
2001     /*  EVEN CREATING THE FILE DID NOT WORK. WE WILL THEN CRASH.             */
2002     /*  ERROR IN WRITING FILE. WE WILL NOT CONTINUE FROM HERE.               */
2003     /* --------------------------------------------------------------------- */
2004     jam();
2005     break;
2006   case FileRecord::WRITE_INIT_GCP:
2007     /* --------------------------------------------------------------------- */
2008     /*   AN ERROR OCCURRED IN WRITING A GCI FILE WHICH IS A SERIOUS ERROR    */
2009     /*   THAT CAUSE A SYSTEM RESTART.                                        */
2010     /* --------------------------------------------------------------------- */
2011     jam();
2012     break;
2013   case FileRecord::TABLE_WRITE:
2014     jam();
2015     break;
2016   default:
2017     jam();
2018     break;
2019   }//switch
2020   {
2021     char msg[100];
2022     sprintf(msg, "File system write failed during FileRecord status %d", (Uint32)status);
2023     fsRefError(signal,__LINE__,msg);
2024   }
2025   return;
2026 }//Dbdih::execFSWRITEREF()
2027 
execGETGCIREQ(Signal * signal)2028 void Dbdih::execGETGCIREQ(Signal* signal)
2029 {
2030 
2031   jamEntry();
2032   Uint32 userPtr = signal->theData[0];
2033   BlockReference userRef = signal->theData[1];
2034   Uint32 type = signal->theData[2];
2035 
2036   Uint32 gci_hi = 0;
2037   Uint32 gci_lo = 0;
2038   switch(type){
2039   case 0:
2040     jam();
2041     gci_hi = SYSFILE->newestRestorableGCI;
2042     break;
2043   case 1:
2044     jam();
2045     gci_hi = Uint32(m_micro_gcp.m_current_gci >> 32);
2046     gci_lo = Uint32(m_micro_gcp.m_current_gci);
2047     break;
2048   }
2049 
2050   signal->theData[0] = userPtr;
2051   signal->theData[1] = gci_hi;
2052   signal->theData[2] = gci_lo;
2053 
2054   if (userRef)
2055   {
2056     jam();
2057     sendSignal(userRef, GSN_GETGCICONF, signal, 3, JBB);
2058   }
2059   else
2060   {
2061     jam();
2062     // Execute direct
2063   }
2064 }//Dbdih::execGETGCIREQ()
2065 
/**
 * READ_CONFIG_REQ: read DIH-related configuration parameters, size and
 * initialise the block's record pools, and prime per-node records
 * (including API-assigned node groups from the cluster configuration).
 * The reply is produced by the continuation started in
 * initialiseRecordsLab() using the sender's ref/data captured here.
 */
void Dbdih::execREAD_CONFIG_REQ(Signal* signal)
{
  const ReadConfigReq * req = (ReadConfigReq*)signal->getDataPtr();
  Uint32 ref = req->senderRef;
  Uint32 senderData = req->senderData;
  ndbrequire(req->noOfParameters == 0);

  jamEntry();

  const ndb_mgm_configuration_iterator * p =
    m_ctx.m_config.getOwnConfigIterator();
  ndbrequireErr(p != 0, NDBD_EXIT_INVALID_CONFIG);

  initData();

  cconnectFileSize = 256; // Only used for DDL

  // Pool sizes come from configuration; a missing parameter is a fatal
  // configuration error.
  ndbrequireErr(!ndb_mgm_get_int_parameter(p, CFG_DIH_FRAG_CONNECT,
					   &cfragstoreFileSize),
		NDBD_EXIT_INVALID_CONFIG);
  ndbrequireErr(!ndb_mgm_get_int_parameter(p, CFG_DIH_REPLICAS,
					   &creplicaFileSize),
		NDBD_EXIT_INVALID_CONFIG);
  ndbrequireErr(!ndb_mgm_get_int_parameter(p, CFG_DIH_TABLE, &ctabFileSize),
		NDBD_EXIT_INVALID_CONFIG);

  if (isNdbMtLqh())
  {
    jam();
    c_fragments_per_node_ = 0;
    // try to get some LQH workers which initially handle no fragments
    if (ERROR_INSERTED(7215)) {
      c_fragments_per_node_ = 1;
      ndbout_c("Using %u fragments per node", c_fragments_per_node_);
    }
  }
  // Optional parameter: silently keeps its default when absent.
  ndb_mgm_get_int_parameter(p, CFG_DB_LCP_TRY_LOCK_TIMEOUT,
                            &c_lcpState.m_lcp_trylock_timeout);

  // Size the file-record pool: two file slots per table plus 2 extra.
  cfileFileSize = (2 * ctabFileSize) + 2;
  initRecords();
  initialiseRecordsLab(signal, 0, ref, senderData);

  {
    // CFG_DB_2PASS_INR toggles the two-pass initial node restart scheme.
    Uint32 val = 0;
    ndb_mgm_get_int_parameter(p, CFG_DB_2PASS_INR,
                              &val);
    c_2pass_inr = val ? true : false;
  }

  /**
   * Set API assigned nodegroup(s)
   */
  {
    // Reset every node record first; nodes without a configured node
    // group keep ZNIL.
    NodeRecordPtr nodePtr;
    for (nodePtr.i = 0; nodePtr.i < MAX_NDB_NODES; nodePtr.i++)
    {
      ptrAss(nodePtr, nodeRecord);
      initNodeRecord(nodePtr);
      nodePtr.p->nodeGroup = ZNIL;
    }
    initNodeRecoveryStatus();

    // Walk the full cluster configuration and pick up the node group
    // assigned to each DB node, if any.
    ndb_mgm_configuration_iterator * iter =
      m_ctx.m_config.getClusterConfigIterator();
    for(ndb_mgm_first(iter); ndb_mgm_valid(iter); ndb_mgm_next(iter))
    {
      jam();
      Uint32 nodeId;
      Uint32 nodeType;

      ndbrequire(!ndb_mgm_get_int_parameter(iter,CFG_NODE_ID, &nodeId));
      ndbrequire(!ndb_mgm_get_int_parameter(iter,CFG_TYPE_OF_SECTION,
                                            &nodeType));

      if (nodeType == NodeInfo::DB)
      {
        jam();
        Uint32 ng;
        nodePtr.i = nodeId;
        ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
        setNodeRecoveryStatusInitial(nodePtr);
        if (ndb_mgm_get_int_parameter(iter, CFG_DB_NODEGROUP, &ng) == 0)
        {
          jam();
          nodePtr.p->nodeGroup = ng;
        }
        else
        {
          jam();
          nodePtr.p->nodeGroup = ZNIL;
        }
      }
    }
  }
  return;
}
2163 
execSTART_COPYREF(Signal * signal)2164 void Dbdih::execSTART_COPYREF(Signal* signal)
2165 {
2166   jamEntry();
2167   ndbabort();
2168 }//Dbdih::execSTART_COPYREF()
2169 
execSTART_FRAGCONF(Signal * signal)2170 void Dbdih::execSTART_FRAGCONF(Signal* signal)
2171 {
2172   (void)signal;  // Don't want compiler warning
2173   /* ********************************************************************* */
2174   /*  If anyone wants to add functionality in this method, be aware that   */
2175   /*  for temporary tables no START_FRAGREQ is sent and therefore no       */
2176   /*  START_FRAGCONF signal will be received for those tables!!            */
2177   /* ********************************************************************* */
2178   jamEntry();
2179   return;
2180 }//Dbdih::execSTART_FRAGCONF()
2181 
/**
 * START_FRAGREF: a starting node failed to restore a fragment during its
 * restart.  The starting node cannot continue, so order NDBCNTR on that
 * node to bring it down with a StartFragRefError system error.
 */
void Dbdih::execSTART_FRAGREF(Signal* signal)
{
  jamEntry();

  /**
   * Kill starting node
   */
  Uint32 errCode = signal->theData[1];
  Uint32 nodeId = signal->theData[2];

  // Reuse the signal buffer to build the SYSTEM_ERROR request.
  SystemError * const sysErr = (SystemError*)&signal->theData[0];
  sysErr->errorCode = SystemError::StartFragRefError;
  sysErr->errorRef = reference();
  sysErr->data[0] = errCode;
  sysErr->data[1] = 0;
  sendSignal(calcNdbCntrBlockRef(nodeId), GSN_SYSTEM_ERROR, signal,
	     SystemError::SignalLength, JBB);
  return;
}//Dbdih::execSTART_FRAGREF()
2201 
execSTART_MEREF(Signal * signal)2202 void Dbdih::execSTART_MEREF(Signal* signal)
2203 {
2204   jamEntry();
2205   ndbabort();
2206 }//Dbdih::execSTART_MEREF()
2207 
execTAB_COMMITREQ(Signal * signal)2208 void Dbdih::execTAB_COMMITREQ(Signal* signal)
2209 {
2210   TabRecordPtr tabPtr;
2211   jamEntry();
2212   Uint32 tdictPtr = signal->theData[0];
2213   BlockReference tdictBlockref = signal->theData[1];
2214   tabPtr.i = signal->theData[2];
2215   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
2216 
2217   ndbrequire(tabPtr.p->tabStatus == TabRecord::TS_CREATING);
2218 
2219   commit_new_table(tabPtr);
2220 
2221   signal->theData[0] = tdictPtr;
2222   signal->theData[1] = cownNodeId;
2223   signal->theData[2] = tabPtr.i;
2224   sendSignal(tdictBlockref, GSN_TAB_COMMITCONF, signal, 3, JBB);
2225   return;
2226 }//Dbdih::execTAB_COMMITREQ()
2227 
2228 /*
2229   3.2   S T A N D A R D   S U B P R O G R A M S   I N   P L E X
2230   *************************************************************
2231   */
2232 /*
2233   3.2.1   S T A R T /  R E S T A R T
2234   **********************************
2235   */
2236 /*****************************************************************************/
2237 /* **********     START / RESTART MODULE                         *************/
2238 /*****************************************************************************/
2239 /*
2240   3.2.1.1    LOADING   O W N   B L O C K  R E F E R E N C E (ABSOLUTE PHASE 1)
2241   *****************************************************************************
2242   */
/**
 * DIH_RESTARTREQ is used in two modes:
 *
 *  - senderRef != 0 (sent signal): NDBCNTR asks whether this node can
 *    restart from its filesystem.  We answer DIH_RESTARTREF for an
 *    initial start, otherwise continue by reading the GCI file.
 *
 *  - senderRef == 0 (executed direct): check that all node groups among
 *    the nodes in req->nodemask can be restored to one common GCI.
 *    The answer is returned in signal->theData[0]: MAX_NDB_NODES when
 *    consistent, otherwise the index of an offending node group.
 */
void Dbdih::execDIH_RESTARTREQ(Signal* signal)
{
  jamEntry();
  const DihRestartReq* req = CAST_CONSTPTR(DihRestartReq,
                                           signal->getDataPtr());
  if (req->senderRef != 0)
  {
    jam();
    cntrlblockref = req->senderRef;
    if(m_ctx.m_config.getInitialStart())
    {
      sendDihRestartRef(signal);
    } else {
      readGciFileLab(signal);
    }
  }
  else
  {
    /**
     * Precondition, (not checked)
     *   atleast 1 node in each node group
     * Sent as direct signals, so no need to handle bitmasks
     * and arrays of GCIs.
     */
    Uint32 i;
    NdbNodeBitmask mask;
    mask.assign(NdbNodeBitmask::Size, req->nodemask);
    const Uint32 *node_gcis = req->node_gcis;
    // Per-node-group maximum restorable GCI (0 = group not seen).
    Uint32 node_group_gcis[MAX_NDB_NODES+1];
    memset(node_group_gcis, 0, sizeof(node_group_gcis));
    for (i = 0; i<MAX_NDB_NODES; i++)
    {
      if (mask.get(i))
      {
	jam();
	Uint32 ng = Sysfile::getNodeGroup(i, SYSFILE->nodeGroups);
        if (ng != NO_NODE_GROUP_ID)
        {
          ndbrequire(ng < MAX_NDB_NODE_GROUPS);
          Uint32 gci = node_gcis[i];
          if (gci > ZUNDEFINED_GCI_LIMIT &&
              gci + 1 == SYSFILE->lastCompletedGCI[i])
          {
            jam();
            /**
             * Handle case, where *I* know that node complete GCI
             *   but node does not...bug#29167
             *   i.e node died before it wrote own sysfile
             *   and node is only one gci behind
             */
            gci = SYSFILE->lastCompletedGCI[i];
          }

          if (gci > node_group_gcis[ng])
          {
            jam();
            node_group_gcis[ng] = gci;
          }
        }
      }
    }
    // Find the first node group with a recorded GCI.
    for (i = 0; i<MAX_NDB_NODES && node_group_gcis[i] == 0; i++);

    Uint32 gci = node_group_gcis[i];
    if (gci == ZUNDEFINED_GCI_LIMIT)
    {
      jam();
      signal->theData[0] = i;
      return;
    }
    // All remaining node groups must agree on the same GCI.
    for (i++ ; i<MAX_NDB_NODES; i++)
    {
      jam();
      if (node_group_gcis[i] && node_group_gcis[i] != gci)
      {
	jam();
	signal->theData[0] = i;
	return;
      }
    }
    signal->theData[0] = MAX_NDB_NODES;
    return;
  }
  return;
}//Dbdih::execDIH_RESTARTREQ()
2328 
execSET_LATEST_LCP_ID(Signal * signal)2329 void Dbdih::execSET_LATEST_LCP_ID(Signal *signal)
2330 {
2331   Uint32 nodeId = signal->theData[0];
2332   Uint32 latestLcpId = signal->theData[1];
2333   if (latestLcpId > SYSFILE->latestLCP_ID)
2334   {
2335     jam();
2336     g_eventLogger->info("Node %u saw more recent LCP id = %u, previously = %u",
2337                         nodeId,
2338                         latestLcpId,
2339                         SYSFILE->latestLCP_ID);
2340     ndbrequire(latestLcpId == (SYSFILE->latestLCP_ID + 1));
2341     SYSFILE->latestLCP_ID = latestLcpId;
2342   }
2343 }
2344 
execGET_LATEST_GCI_REQ(Signal * signal)2345 void Dbdih::execGET_LATEST_GCI_REQ(Signal *signal)
2346 {
2347   Uint32 nodeId = signal->theData[0];
2348   Uint32 latestGci = SYSFILE->lastCompletedGCI[nodeId];
2349   signal->theData[0] = latestGci;
2350 }
2351 
execSTTOR(Signal * signal)2352 void Dbdih::execSTTOR(Signal* signal)
2353 {
2354   jamEntry();
2355 
2356   Callback c = { safe_cast(&Dbdih::sendSTTORRY), 0 };
2357   m_sendSTTORRY = c;
2358 
2359   switch(signal->theData[1]){
2360   case 1:
2361     jam();
2362     createMutexes(signal, 0);
2363     init_lcp_pausing_module();
2364     return;
2365   case 3:
2366     jam();
2367     signal->theData[0] = reference();
2368     sendSignal(NDBCNTR_REF, GSN_READ_NODESREQ, signal, 1, JBB);
2369     return;
2370   }
2371 
2372   sendSTTORRY(signal);
2373 }//Dbdih::execSTTOR()
2374 
/**
 * Reply STTORRY to NDBCNTR.  The trailing words list the start phases in
 * which this block wants STTOR again, terminated by 255.
 * senderData/retVal are unused; the signature matches the Callback type
 * so this can be invoked from the mutex-creation callback chain.
 */
void
Dbdih::sendSTTORRY(Signal* signal, Uint32 senderData, Uint32 retVal)
{
  signal->theData[0] = 0;
  signal->theData[1] = 0;
  signal->theData[2] = 0;
  signal->theData[3] = 1;   // First wanted start phase
  signal->theData[4] = 3;   // Next wanted start phase
  signal->theData[5] = 255; // Terminator: no further start phases
  sendSignal(NDBCNTR_REF, GSN_STTORRY, signal, 6, JBB);
  return;
}
2387 
2388 /*
2389  * ***************************************************************************
2390  * S E N D I N G   R E P L Y  T O  S T A R T /  R E S T A R T   R E Q U E S T S
2391  * ****************************************************************************
2392  */
ndbsttorry10Lab(Signal * signal,Uint32 _line)2393 void Dbdih::ndbsttorry10Lab(Signal* signal, Uint32 _line)
2394 {
2395   /*-------------------------------------------------------------------------*/
2396   // AN NDB START PHASE HAS BEEN COMPLETED. WHEN START PHASE 6 IS COMPLETED WE
2397   // RECORD THAT THE SYSTEM IS RUNNING.
2398   /*-------------------------------------------------------------------------*/
2399   signal->theData[0] = reference();
2400   sendSignal(cntrlblockref, GSN_NDB_STTORRY, signal, 1, JBB);
2401   return;
2402 }//Dbdih::ndbsttorry10Lab()
2403 
2404 /*
2405 ****************************************
2406 I N T E R N A L  P H A S E S
2407 ****************************************
2408 */
2409 /*---------------------------------------------------------------------------*/
2410 /*NDB_STTOR                              START SIGNAL AT START/RESTART       */
2411 /*---------------------------------------------------------------------------*/
/**
 * NDB_STTOR: internal start-phase driver for DIH.
 * Each phase either completes synchronously (acknowledged via
 * ndbsttorry10Lab) or kicks off an asynchronous protocol whose
 * completion sends the acknowledgement later.
 */
void Dbdih::execNDB_STTOR(Signal* signal)
{
  jamEntry();
  BlockReference cntrRef = signal->theData[0];    /* SENDERS BLOCK REFERENCE */
  Uint32 ownNodeId = signal->theData[1];          /* OWN PROCESSOR ID*/
  Uint32 phase = signal->theData[2];              /* INTERNAL START PHASE*/
  Uint32 typestart = signal->theData[3];

  cstarttype = typestart;
  cstartPhase = phase;

  switch (phase){
  case ZNDB_SPH1:
    jam();
    /*-----------------------------------------------------------------------*/
    // Compute all static block references in this node as part of
    // ndb start phase 1.
    /*-----------------------------------------------------------------------*/
    cownNodeId = ownNodeId;
    cntrlblockref = cntrRef;
    clocaltcblockref = calcTcBlockRef(ownNodeId);
    clocallqhblockref = calcLqhBlockRef(ownNodeId);
    cdictblockref = calcDictBlockRef(ownNodeId);
    c_lcpState.lcpStallStart = 0;
    c_lcpState.lcpManualStallStart = false;
    NdbTick_Invalidate(&c_lcpState.m_start_lcp_check_time);
    ndbsttorry10Lab(signal, __LINE__);
    break;

  case ZNDB_SPH2:
    jam();
    /*-----------------------------------------------------------------------*/
    // For node restarts we will also add a request for permission
    // to continue the system restart.
    // The permission is given by the master node in the alive set.
    /*-----------------------------------------------------------------------*/
    if (cstarttype == NodeState::ST_INITIAL_NODE_RESTART)
    {
      jam();
      // An initial node restart begins a fresh restart sequence.
      globalData.m_restart_seq = SYSFILE->m_restart_seq = 1;
      g_eventLogger->info("Starting with m_restart_seq set to 1");
      c_set_initial_start_flag = TRUE; // In sysfile...
    }

    if (cstarttype == NodeState::ST_INITIAL_START) {
      jam();
      // setInitialActiveStatus is moved into makeNodeGroups
    } else if (cstarttype == NodeState::ST_SYSTEM_RESTART) {
      jam();
      /*empty*/;
    } else if ((cstarttype == NodeState::ST_NODE_RESTART) ||
               (cstarttype == NodeState::ST_INITIAL_NODE_RESTART)) {
      jam();
      // Asynchronous: STTORRY is sent once permission has been granted.
      nodeRestartPh2Lab(signal);
      return;
    } else {
      ndbabort();
    }//if
    ndbsttorry10Lab(signal, __LINE__);
    return;

  case ZNDB_SPH3:
    jam();
    /*-----------------------------------------------------------------------*/
    // Non-master nodes performing an initial start will execute
    // the start request here since the
    // initial start do not synchronise so much from the master.
    // In the master nodes the start
    // request will be sent directly to dih (in ndb_startreq) when all
    // nodes have completed phase 3 of the start.
    /*-----------------------------------------------------------------------*/
    cmasterState = MASTER_IDLE;
    if(cstarttype == NodeState::ST_INITIAL_START ||
       cstarttype == NodeState::ST_SYSTEM_RESTART){
      jam();
      cmasterState = isMaster() ? MASTER_ACTIVE : MASTER_IDLE;
    }
    if (!isMaster() && cstarttype == NodeState::ST_INITIAL_START) {
      jam();
      ndbStartReqLab(signal, cntrRef);
      return;
    }//if
    ndbsttorry10Lab(signal, __LINE__);
    break;

  case ZNDB_SPH4:
    jam();
    cmasterTakeOverNode = ZNIL;
    switch(typestart){
    case NodeState::ST_INITIAL_START:
      jam();
      ndbassert(c_lcpState.lcpStatus == LCP_STATUS_IDLE);
      c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
      ndbsttorry10Lab(signal, __LINE__);
      return;
    case NodeState::ST_SYSTEM_RESTART:
      jam();
      if (!c_performed_copy_phase)
      {
        jam();
        /**
         * We are not performing the copy phase, it is a normal
         * system restart, we initialise the LCP status to IDLE.
         *
         * When copy phase is performed the LCP processing have
         * already started when we arrive here.
         */
        ndbassert(c_lcpState.lcpStatus == LCP_STATUS_IDLE);
        c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
      }
      ndbsttorry10Lab(signal, __LINE__);
      return;
    case NodeState::ST_INITIAL_NODE_RESTART:
    case NodeState::ST_NODE_RESTART:
      jam();

      /***********************************************************************
       * When starting nodes while system is operational we must be controlled
       * by the master. There can be multiple node restarts ongoing, but this
       * phase only allows for one node at a time. So it has to be controlled
       * from the master node.
       *
       * When this signal is confirmed the master has also copied the
       * dictionary and the distribution information.
       */
      ndbassert(c_lcpState.lcpStatus == LCP_STATUS_IDLE);
      c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
      g_eventLogger->info("Request copying of distribution and dictionary"
                          " information from master Starting");

      // Asynchronous: continue when START_MECONF arrives from the master.
      StartMeReq * req = (StartMeReq*)&signal->theData[0];
      req->startingRef = reference();
      req->startingVersion = 0; // Obsolete
      sendSignal(cmasterdihref, GSN_START_MEREQ, signal,
                 StartMeReq::SignalLength, JBB);
      return;
    }
    ndbabort();
  case ZNDB_SPH5:
    jam();
    switch(typestart){
    case NodeState::ST_INITIAL_START:
    case NodeState::ST_SYSTEM_RESTART:
      jam();
      /*---------------------------------------------------------------------*/
      // WE EXECUTE A LOCAL CHECKPOINT AS A PART OF A SYSTEM RESTART.
      // THE IDEA IS THAT WE NEED TO
      // ENSURE THAT WE CAN RECOVER FROM PROBLEMS CAUSED BY MANY NODE
      // CRASHES THAT CAUSES THE LOG
      // TO GROW AND THE NUMBER OF LOG ROUNDS TO EXECUTE TO GROW.
      // THIS CAN OTHERWISE GET US INTO
      // A SITUATION WHICH IS UNREPAIRABLE. THUS WE EXECUTE A CHECKPOINT
      // BEFORE ALLOWING ANY TRANSACTIONS TO START.
      /*---------------------------------------------------------------------*/
      if (!isMaster()) {
	jam();
	ndbsttorry10Lab(signal, __LINE__);
	return;
      }//if

      infoEvent("Make On-line Database recoverable by waiting for LCP"
                " Starting, LCP id = %u",
                SYSFILE->latestLCP_ID + 1);

      // Asynchronous: STTORRY is sent when the LCP completes (cwaitLcpSr).
      c_lcpState.immediateLcpStart = true;
      cwaitLcpSr = true;
      checkLcpStart(signal, __LINE__, 0);
      return;
    case NodeState::ST_NODE_RESTART:
    case NodeState::ST_INITIAL_NODE_RESTART:
      jam();
      {
        // Asynchronous: copy fragment data to ourselves, waiting for LCP.
        StartCopyReq* req = (StartCopyReq*)signal->getDataPtrSend();
        req->senderRef = reference();
        req->senderData = RNIL;
        req->flags = StartCopyReq::WAIT_LCP;
        req->startingNodeId = getOwnNodeId();
        sendSignal(reference(), GSN_START_COPYREQ, signal,
                   StartCopyReq::SignalLength, JBB);
      }
      return;
    }
    ndbabort();
  case ZNDB_SPH6:
    jam();
    switch(typestart){
    case NodeState::ST_INITIAL_START:
    case NodeState::ST_SYSTEM_RESTART:
      jam();
      if(isMaster()){
	jam();
        if (typestart == NodeState::ST_INITIAL_START)
        {
          /**
           * Skip GCI 1 at initial start, has special meaning
           * in CM_REGREQ protocol. Means node isn't restartable
           * on its own. Setting it to 2 such that we will
           * start preparing GCI 3 immediately.
           *
           * Only required to avoid restarting from GCI = 1.
           */
          jam();
          m_micro_gcp.m_current_gci = ((Uint64(ZUNDEFINED_GCI_LIMIT + 1)) << 32);
        }
	startGcp(signal);
      }
      ndbsttorry10Lab(signal, __LINE__);
      return;
    case NodeState::ST_NODE_RESTART:
    case NodeState::ST_INITIAL_NODE_RESTART:
      ndbsttorry10Lab(signal, __LINE__);
      return;
    }
    ndbabort();
  default:
    jam();
    ndbsttorry10Lab(signal, __LINE__);
    break;
  }//switch
}//Dbdih::execNDB_STTOR()
2632 
2633 void
execNODE_START_REP(Signal * signal)2634 Dbdih::execNODE_START_REP(Signal* signal)
2635 {
2636   /*
2637    * Send DICT_UNLOCK_ORD when this node is SL_STARTED.
2638    *
2639    * Sending it before (sp 7) conflicts with code which assumes
2640    * SL_STARTING means we are in copy phase of NR.
2641    *
2642    * NodeState::starting.restartType is not supposed to be used
2643    * when SL_STARTED.  Also it seems NODE_START_REP can arrive twice.
2644    *
2645    * For these reasons there are no consistency checks and
2646    * we rely on c_dictLockSlavePtrI_nodeRestart alone.
2647    */
2648   if (signal->theData[0] == getOwnNodeId())
2649   {
2650     /**
2651      * With parallel node restart, only unlock self, if it's self that has
2652      *   started
2653      */
2654     jam();
2655     if (c_dictLockSlavePtrI_nodeRestart != RNIL) {
2656       sendDictUnlockOrd(signal, c_dictLockSlavePtrI_nodeRestart);
2657       c_dictLockSlavePtrI_nodeRestart = RNIL;
2658     }
2659   }
2660   // Request max lag recalculation to reflect new cluster scale
2661   // after a node start
2662   m_gcp_monitor.m_gcp_save.m_need_max_lag_recalc = true;
2663   m_gcp_monitor.m_micro_gcp.m_need_max_lag_recalc = true;
2664 }
2665 
2666 void
createMutexes(Signal * signal,Uint32 count)2667 Dbdih::createMutexes(Signal * signal, Uint32 count){
2668   Callback c = { safe_cast(&Dbdih::createMutex_done), count };
2669 
2670   switch(count){
2671   case 0:{
2672     Mutex mutex(signal, c_mutexMgr, c_startLcpMutexHandle);
2673     mutex.create(c);
2674     return;
2675   }
2676   case 1:{
2677     Mutex mutex(signal, c_mutexMgr, c_switchPrimaryMutexHandle);
2678     mutex.create(c);
2679     return;
2680   }
2681   case 2:{
2682     Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
2683     mutex.create(c);
2684     return;
2685   }
2686   }
2687 
2688   execute(signal, m_sendSTTORRY, 0);
2689 }
2690 
2691 void
createMutex_done(Signal * signal,Uint32 senderData,Uint32 retVal)2692 Dbdih::createMutex_done(Signal* signal, Uint32 senderData, Uint32 retVal){
2693   jamEntry();
2694   ndbrequire(retVal == 0);
2695 
2696   switch(senderData){
2697   case 0:{
2698     Mutex mutex(signal, c_mutexMgr, c_startLcpMutexHandle);
2699     mutex.release();
2700     break;
2701   }
2702   case 1:{
2703     Mutex mutex(signal, c_mutexMgr, c_switchPrimaryMutexHandle);
2704     mutex.release();
2705     break;
2706   }
2707   case 2:{
2708     Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
2709     mutex.release();
2710     break;
2711   }
2712   }
2713 
2714   createMutexes(signal, senderData + 1);
2715 }
2716 
2717 /*****************************************************************************/
2718 /* ------------------------------------------------------------------------- */
2719 /*       WE HAVE BEEN REQUESTED BY NDBCNTR TO PERFORM A RESTART OF THE       */
2720 /*       DATABASE TABLES.                                                    */
2721 /*       THIS SIGNAL IS SENT AFTER COMPLETING PHASE 3 IN ALL BLOCKS IN A     */
2722 /*       SYSTEM RESTART. WE WILL ALSO JUMP TO THIS LABEL FROM PHASE 3 IN AN  */
2723 /*       INITIAL START.                                                      */
2724 /* ------------------------------------------------------------------------- */
2725 /*****************************************************************************/
execNDB_STARTREQ(Signal * signal)2726 void Dbdih::execNDB_STARTREQ(Signal* signal)
2727 {
2728   jamEntry();
2729   BlockReference ref = signal->theData[0];
2730   cstarttype = signal->theData[1];
2731   ndbStartReqLab(signal, ref);
2732 }//Dbdih::execNDB_STARTREQ()
2733 
/**
 * Start/restart the database tables (from NDB_STARTREQ, or directly in
 * phase 3 of an initial start on non-master nodes).
 *
 * Initial start: initialise restart info and create the GCI files.
 * System restart (master only): reconcile per-node GCIs in the sysfile
 * against our own last completed GCI, invalidate the filesystem of any
 * node that is ahead of the chosen restart GCI, and continue via
 * copyGciLab().
 */
void Dbdih::ndbStartReqLab(Signal* signal, BlockReference ref)
{
  cndbStartReqBlockref = ref;
  if (cstarttype == NodeState::ST_INITIAL_START) {
    jam();
    initRestartInfo(signal);
    initGciFilesLab(signal);
    return;
  }

  NodeRecordPtr nodePtr;
  // Our own last completed GCI becomes the candidate restart GCI.
  Uint32 gci = SYSFILE->lastCompletedGCI[getOwnNodeId()];
  for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++)
  {
    jam();
    ptrAss(nodePtr, nodeRecord);
    if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci)
    {
      jam();
      /**
       * Since we're starting(is master) and there
       *   there are other nodes with higher GCI...
       *   their gci's must be invalidated...
       *   and they _must_ do an initial start
       *   indicate this by setting lastCompletedGCI = 0
       */
      SYSFILE->lastCompletedGCI[nodePtr.i] = 0;
      ndbrequire(nodePtr.p->nodeStatus != NodeRecord::ALIVE);
      warningEvent("Making filesystem for node %d unusable (need --initial)",
		   nodePtr.i);
    }
    else if (nodePtr.p->nodeStatus == NodeRecord::ALIVE &&
	     SYSFILE->lastCompletedGCI[nodePtr.i] == 0)
    {
      jam();
      // An alive node whose GCI was invalidated earlier must itself be
      // restarted with --initial; refusing here avoids data divergence.
      CRASH_INSERTION(7170);
      char buf[255];
      BaseString::snprintf(buf, sizeof(buf),
			   "Cluster requires this node to be started "
			   " with --initial as partial start has been performed"
			   " and this filesystem is unusable");
      progError(__LINE__,
		NDBD_EXIT_SR_RESTARTCONFLICT,
		buf);
    }
  }

  /**
   * This set which GCI we will try to restart to
   */
  SYSFILE->newestRestorableGCI = gci;
  infoEvent("Restarting cluster to GCI: %u", gci);

  ndbrequire(isMaster());
  copyGciLab(signal, CopyGCIReq::RESTART); // We have already read the file!
}//Dbdih::ndbStartReqLab()
2790 
/**
 * READ_NODESCONF
 * --------------
 * Reply carrying the set of nodes defined in the configuration, the
 * current master node id and which defined nodes are inactive. The
 * node bitmasks are transported in a long signal section (section 0).
 *
 * This routine records the node set, verifies Sysfile/configuration
 * consistency on SR/NR, builds the node record list, creates node
 * groups on initial start, and initialises the bitmaps of nodes to
 * restore (m_sr_nodes) and nodes needing take-over (m_to_nodes).
 */
void Dbdih::execREAD_NODESCONF(Signal* signal)
{
  unsigned i;
  ReadNodesConf * const readNodes = (ReadNodesConf *)&signal->theData[0];
  jamEntry();
  // List of defined node ids in ascending order, terminated by RNIL.
  Uint32 nodeArray[MAX_NDB_NODES+1];

  {
    // Copy the five node bitmasks from section 0 into the fixed part
    // of ReadNodesConf.
    ndbrequire(signal->getNoOfSections() == 1);
    SegmentedSectionPtr ptr;
    SectionHandle handle(this, signal);
    handle.getSection(ptr, 0);
    ndbrequire(ptr.sz == 5 * NdbNodeBitmask::Size);
    copy((Uint32*)&readNodes->definedNodes.rep.data, ptr);
    releaseSections(handle);
  }

  csystemnodes  = readNodes->noOfNodes;
  cmasterNodeId = readNodes->masterNodeId;
  unsigned index = 0;
  NdbNodeBitmask tmp = readNodes->definedNodes;
  m_max_node_id = 0;
  // Build nodeArray from the defined-nodes bitmask. Track the highest
  // defined node id in m_max_node_id and count nodes that are not in
  // the inactive set in con_lineNodes.
  for (i = 1; i < MAX_NDB_NODES; i++){
    jam();
    if(tmp.get(i)){
      jam();
      m_max_node_id = i;
      nodeArray[index] = i;
      if (readNodes->inactiveNodes.get(i) == false)
      {
        jam();
        con_lineNodes++;
      }//if
      index++;
    }//if
  }//for
  nodeArray[index] = RNIL; // terminate

  if (cmasterNodeId == getOwnNodeId() &&
      con_lineNodes >= 16)
  {
    /**
     * In large clusters the main thread can be quite busy, ensure it
     * doesn't assist the send thread in this scenario.
     */
    log_setNoSend();
    setNoSend();
  }
  if (c_2pass_inr)
  {
    jam();
    // 2-pass initial node restart is only kept enabled if every active
    // node runs with the same LQH worker count as this node.
    Uint32 workers = getNodeInfo(getOwnNodeId()).m_lqh_workers;
#ifdef VM_TRACE
    printf("Checking 2-pass initial node restart: ");
#endif
    for (i = 0; i<index; i++)
    {
      if (readNodes->inactiveNodes.get(nodeArray[i]))
        continue;

      if (workers > 1 &&
          workers != getNodeInfo(nodeArray[i]).m_lqh_workers)
      {
        c_2pass_inr = false;
#ifdef VM_TRACE
        printf("not ok (different worker cnt node %u) => disabled\n",
               nodeArray[i]);
#endif
        break;
      }
    }
    if (c_2pass_inr)
    {
#ifdef VM_TRACE
      ndbout_c("ok");
#endif
    }

    /**
     * Note: In theory it would be ok if just the nodes that we plan to
     *   copy from supported this...but in e.g a 3/4-replica scenario,
     *      if one of the nodes does, and the other doesn't, we don't
     *      have enough infrastructure to easily check this...
     *      therefore we require all nodes to support it.
     */
  }

  if(cstarttype == NodeState::ST_SYSTEM_RESTART ||
     cstarttype == NodeState::ST_NODE_RESTART)
  {
    // Cross-check the configured node set against the node status stored
    // in the Sysfile. Newly added nodes and absent configured nodes are
    // tolerated; any other mismatch requires an initial start.
    for(i = 1; i <= m_max_node_id; i++)
    {
      const Uint32 stat = Sysfile::getNodeStatus(i, SYSFILE->nodeStatus);
      if(stat == Sysfile::NS_NotDefined && !tmp.get(i))
      {
        jam();
        // Unknown in both Sysfile and configuration: nothing to check.
        continue;
      }

      if(tmp.get(i) && stat != Sysfile::NS_NotDefined)
      {
        jam();
        // Known in both: consistent.
        continue;
      }

      if (stat == Sysfile::NS_NotDefined && tmp.get(i))
      {
        jam();
        infoEvent("Discovered new node %u", i);
        continue;
      }

      if (stat == Sysfile::NS_Configured && !tmp.get(i))
      {
        jam();
        infoEvent("Configured node %u not present, ignoring",
                  i);
        continue;
      }

      // Node known in the Sysfile with a node group but removed from the
      // configuration: refuse the restart.
      char buf[255];
      BaseString::snprintf(buf, sizeof(buf),
                           "Illegal configuration change."
                           " Initial start needs to be performed "
                           " when removing nodes with nodegroup (node %d)", i);
      progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
    }
  }

  ndbrequire(csystemnodes >= 1 && csystemnodes < MAX_NDB_NODES);

  cmasterdihref = calcDihBlockRef(cmasterNodeId);
  /*-------------------------------------------------------------------------*/
  /* MAKE THE LIST OF PRN-RECORD WHICH IS ONE OF THE NODES-LIST IN THIS BLOCK*/
  /*-------------------------------------------------------------------------*/
  makePrnList(readNodes, nodeArray);
  if (cstarttype == NodeState::ST_INITIAL_START) {
    jam();
    /**----------------------------------------------------------------------
     * WHEN WE INITIALLY START A DATABASE WE WILL CREATE NODE GROUPS.
     * ALL NODES ARE PUT INTO NODE GROUPS ALTHOUGH HOT SPARE NODES ARE PUT
     * INTO A SPECIAL NODE GROUP. IN EACH NODE GROUP WE HAVE THE SAME AMOUNT
     * OF NODES AS THERE ARE NUMBER OF REPLICAS.
     * ONE POSSIBLE USAGE OF NODE GROUPS ARE TO MAKE A NODE GROUP A COMPLETE
     * FRAGMENT OF THE DATABASE. THIS MEANS THAT ALL REPLICAS WILL BE STORED
     * IN THE NODE GROUP.
     *-----------------------------------------------------------------------*/
    makeNodeGroups(nodeArray);
  }//if
  ndbrequire(checkNodeAlive(cmasterNodeId));

  /**
   * Keep bitmap of nodes that can be restored...
   *   and nodes that need take-over
   *
   */
  m_sr_nodes.clear();
  m_to_nodes.clear();

  // Start with assumption that all can restore
  {
    // Walk the linked list of alive node records starting at
    // cfirstAliveNode and mark each node as restorable.
    NodeRecordPtr specNodePtr;
    specNodePtr.i = cfirstAliveNode;
    do {
      jam();
      m_sr_nodes.set(specNodePtr.i);
      ptrCheckGuard(specNodePtr, MAX_NDB_NODES, nodeRecord);
      specNodePtr.i = specNodePtr.p->nextNode;
    } while (specNodePtr.i != RNIL);
  }

  execute(signal, m_sendSTTORRY, 0);
}//Dbdih::execREAD_NODESCONF()
2965 
2966 /*---------------------------------------------------------------------------*/
2967 /*                    START NODE LOGIC FOR NODE RESTART                      */
2968 /*---------------------------------------------------------------------------*/
nodeRestartPh2Lab(Signal * signal)2969 void Dbdih::nodeRestartPh2Lab(Signal* signal)
2970 {
2971   /*
2972    * Lock master DICT to avoid metadata operations during INR/NR.
2973    * Done just before START_PERMREQ.
2974    *
2975    * It would be more elegant to do this just before START_MEREQ.
2976    * The problem is, on INR we end up in massive invalidateNodeLCP
2977    * which is not fully protected against metadata ops.
2978    */
2979   ndbrequire(c_dictLockSlavePtrI_nodeRestart == RNIL);
2980 
2981   // check that we are not yet taking part in schema ops
2982   CRASH_INSERTION(7174);
2983 
2984   Uint32 lockType = DictLockReq::NodeRestartLock;
2985   Callback c = { safe_cast(&Dbdih::recvDictLockConf_nodeRestart), 0 };
2986   sendDictLockReq(signal, lockType, c);
2987 }
2988 
/**
 * Callback invoked by the DICT lock machinery once the NodeRestartLock
 * requested in nodeRestartPh2Lab has been granted.
 *
 * @param data  lock handle to store in c_dictLockSlavePtrI_nodeRestart
 * @param ret   unused here
 */
void Dbdih::recvDictLockConf_nodeRestart(Signal* signal, Uint32 data, Uint32 ret)
{
  ndbrequire(c_dictLockSlavePtrI_nodeRestart == RNIL);
  ndbrequire(data != RNIL);
  // Remember the granted lock handle for later release.
  c_dictLockSlavePtrI_nodeRestart = data;

  // Continue the restart sequence by requesting start permission.
  nodeRestartPh2Lab2(signal);
}
2997 
nodeRestartPh2Lab2(Signal * signal)2998 void Dbdih::nodeRestartPh2Lab2(Signal* signal)
2999 {
3000   /*------------------------------------------------------------------------*/
3001   // REQUEST FOR PERMISSION FROM MASTER TO START A NODE IN AN ALREADY
3002   // RUNNING SYSTEM.
3003   /*------------------------------------------------------------------------*/
3004 
3005   g_eventLogger->info("Request permission to start our node from master Starting");
3006 
3007   StartPermReq * const req = (StartPermReq *)&signal->theData[0];
3008 
3009   req->blockRef  = reference();
3010   req->nodeId    = cownNodeId;
3011   req->startType = cstarttype;
3012   sendSignal(cmasterdihref, GSN_START_PERMREQ, signal, 3, JBB);
3013 
3014   if (ERROR_INSERTED(7203))
3015   {
3016     signal->theData[0] = 9999;
3017     sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 200, 1);
3018   }
3019 }
3020 
/**
 * START_PERMCONF
 * --------------
 * The master granted our START_PERMREQ. Records the system failure
 * number, decides whether the micro GCP protocol is enabled, continues
 * the start phase and, when micro GCP is enabled, orders QMGR to switch
 * protocol.
 */
void Dbdih::execSTART_PERMCONF(Signal* signal)
{
  jamEntry();
  CRASH_INSERTION(7121);
  Uint32 nodeId = signal->theData[0];
  cfailurenr = signal->theData[1];

  // A signal shorter than the current SignalLength lacks the microGCP
  // word (presumably an older master) — treat micro GCP as disabled.
  bool microGCP = signal->theData[2];
  if (signal->getLength() < StartPermConf::SignalLength)
  {
    microGCP = false;
  }
  m_micro_gcp.m_enabled = microGCP;
  ndbrequire(nodeId == cownNodeId);
  ndbsttorry10Lab(signal, __LINE__);

  if (m_micro_gcp.m_enabled)
  {
    jam();
    // Order QMGR (same-thread direct execution) to enable the micro GCP
    // upgrade protocol.
    UpgradeProtocolOrd * ord = (UpgradeProtocolOrd*)signal->getDataPtrSend();
    ord->type = UpgradeProtocolOrd::UPO_ENABLE_MICRO_GCP;
    EXECUTE_DIRECT(QMGR,GSN_UPGRADE_PROTOCOL_ORD,signal,signal->getLength());
  }
  else if(isMultiThreaded())
  {
    /**
     * Prevent this start, as there is some non-thread-safe upgrade code for
     * this case in LQH.
     */
    progError(__LINE__, NDBD_EXIT_SR_RESTARTCONFLICT,
              "Cluster requires that all old data nodes are upgraded "
              "while running single-threaded ndbd before starting "
              "multi-threaded ndbmtd data nodes.");
  }

  g_eventLogger->info("Request permission to start our node from master Completed");

}//Dbdih::execSTART_PERMCONF()
3059 
execSTART_PERMREF(Signal * signal)3060 void Dbdih::execSTART_PERMREF(Signal* signal)
3061 {
3062   jamEntry();
3063   Uint32 errorCode = signal->theData[1];
3064   if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR ||
3065       errorCode == StartPermRef::ZNODE_START_DISALLOWED_ERROR) {
3066     jam();
3067     /*-----------------------------------------------------------------------*/
3068     // The master was busy adding another node. We will wait for a few
3069     // seconds and try again.
3070     /*-----------------------------------------------------------------------*/
3071     g_eventLogger->info("Did not get permission to start (%u) retry in 3s",
3072                         errorCode);
3073     signal->theData[0] = DihContinueB::ZSTART_PERMREQ_AGAIN;
3074     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 3000, 1);
3075     return;
3076   }//if
3077 
3078   if (errorCode == StartPermRef::InitialStartRequired)
3079   {
3080     CRASH_INSERTION(7170);
3081     char buf[255];
3082     BaseString::snprintf(buf, sizeof(buf),
3083 			 "Cluster requires this node to be started "
3084 			 " with --initial as partial start has been performed"
3085 			 " and this filesystem is unusable");
3086     progError(__LINE__,
3087 	      NDBD_EXIT_SR_RESTARTCONFLICT,
3088 	      buf);
3089   }
3090 
3091   /*------------------------------------------------------------------------*/
3092   // Some node process in another node involving our node was still active. We
3093   // will recover from this by crashing here.
3094   // This is controlled restart using the
3095   // already existing features of node crashes. It is not a bug getting here.
3096   /*-------------------------------------------------------------------------*/
3097   ndbabort();
3098 }//Dbdih::execSTART_PERMREF()
3099 
3100 /*---------------------------------------------------------------------------*/
3101 /*       THIS SIGNAL IS RECEIVED IN THE STARTING NODE WHEN THE START_MEREQ   */
3102 /*       HAS BEEN EXECUTED IN THE MASTER NODE.                               */
3103 /*---------------------------------------------------------------------------*/
/**
 * START_MECONF
 * ------------
 * Received in the starting node when START_MEREQ has been executed in
 * the master. Carries a copy of the master's Sysfile, either in a long
 * signal section (v2 format) or spread over several fixed-size signals
 * (v1 format).
 *
 * The Sysfile is unpacked, but our own restart sequence number and the
 * lastCompletedGCI values are preserved across the unpack.
 */
void Dbdih::execSTART_MECONF(Signal* signal)
{
  jamEntry();
  StartMeConf * const startMe = (StartMeConf *)&signal->theData[0];
  Uint32 nodeId = startMe->startingNodeId;
  const Uint32 startWord = startMe->startWord;

  CRASH_INSERTION(7130);
  ndbrequire(nodeId == cownNodeId);
  bool v2_format = true;
  if (ndbd_send_node_bitmask_in_section(getNodeInfo(cmasterNodeId).m_version))
  {
    jam();
    // v2: the whole Sysfile arrives in a single long signal section.
    ndbrequire(signal->getNoOfSections() == 1);
    SegmentedSectionPtr ptr;
    SectionHandle handle(this, signal);
    handle.getSection(ptr, 0);
    ndbrequire(ptr.sz <= (sizeof(cdata)/4));
    copy(cdata, ptr);
    cdata_size_in_words = ptr.sz;
    releaseSections(handle);
  }
  else
  {
    jam();
    // v1: each START_MECONF carries DATA_SIZE words at offset startWord;
    // keep collecting until the whole v1 Sysfile has been received.
    v2_format = false;
    arrGuard(startWord + StartMeConf::DATA_SIZE, sizeof(cdata)/4);
    for(Uint32 i = 0; i < StartMeConf::DATA_SIZE; i++)
    {
      cdata[startWord+i] = startMe->data[i];
    }
    if(startWord + StartMeConf::DATA_SIZE < Sysfile::SYSFILE_SIZE32_v1)
    {
      jam();
      /**
       * We are still waiting for data
       */
      return;
    }
  }

  /**
   * Copy into sysfile
   *
   * But don't copy lastCompletedGCI:s
   */
  // Save our restart sequence number and per-node GCIs so they survive
  // the unpack of the master's Sysfile copy.
  Uint32 key = SYSFILE->m_restart_seq;
  Uint32 tempGCP[MAX_NDB_NODES];
  for (Uint32 i = 1; i <= m_max_node_id; i++)
  {
    tempGCP[i] = SYSFILE->lastCompletedGCI[i];
  }

  if (v2_format)
  {
    jam();
    unpack_sysfile_format_v2(false);
  }
  else
  {
    jam();
    unpack_sysfile_format_v1(false);
  }
  // Restore the locally-owned values saved above.
  SYSFILE->m_restart_seq = key;
  for (Uint32 i = 1; i <= m_max_node_id; i++)
  {
    SYSFILE->lastCompletedGCI[i] = tempGCP[i];
  }
  // Recompute node active status and node groups from the new Sysfile.
  setNodeActiveStatus();
  setNodeGroups();

  g_eventLogger->info("Request copying of distribution and dictionary"
                      " information from master Completed");

  ndbsttorry10Lab(signal, __LINE__);

  if (getNodeActiveStatus(getOwnNodeId()) == Sysfile::NS_Configured)
  {
    jam();
    // Our node is in Configured state: clear the initial-start flag.
    c_set_initial_start_flag = FALSE;
  }
}//Dbdih::execSTART_MECONF()
3186 
/**
 * START_COPYCONF
 * --------------
 * Copying of data for a starting node has completed.
 *
 * senderData == RNIL: node restart (NR) — we are the starting node and
 * simply continue our start phase.
 *
 * Otherwise: take-over during system restart (SR) — move the node from
 * the take-over set (m_to_nodes) to the restored set (m_sr_nodes), and
 * when the take-over set becomes empty send NDB_STARTCONF with the
 * bitmap of restored nodes to cntrlblockref.
 */
void Dbdih::execSTART_COPYCONF(Signal* signal)
{
  jamEntry();

  StartCopyConf* conf = (StartCopyConf*)signal->getDataPtr();
  Uint32 nodeId = conf->startingNodeId;
  Uint32 senderData = conf->senderData;

  if (senderData == RNIL)
  {
    /**
     * This is NR
     */
    jam();

    g_eventLogger->info("Make On-line Database recoverable by waiting for"
                        " LCP Completed, LCP id = %u",
                        SYSFILE->latestLCP_ID);

    ndbrequire(nodeId == cownNodeId);
    CRASH_INSERTION(7132);
    ndbsttorry10Lab(signal, __LINE__);
  }
  else
  {
    /**
     * This is TO during SR...waiting for all nodes
     */
    infoEvent("Make On-line Database recoverable by waiting for LCP Completed"
              " on node %u, LCP id = %u",
              nodeId,
              SYSFILE->latestLCP_ID);

    ndbrequire(senderData == getOwnNodeId());
    ndbrequire(m_to_nodes.get(nodeId));
    m_to_nodes.clear(nodeId);
    m_sr_nodes.set(nodeId);
    if (!m_to_nodes.isclear())
    {
      jam();
      // More nodes still being taken over; wait for their confirmations.
      return;
    }

    infoEvent("Restore Database from disk Completed");

    signal->theData[0] = reference();
    m_sr_nodes.copyto(NdbNodeBitmask::Size, signal->theData+1);

    // Send the restored-nodes bitmask in a long signal section when the
    // receiver's version supports it; otherwise fall back to the fixed
    // 48-node bitmask in the signal body (abort if it does not fit).
    Uint32 packed_length = m_sr_nodes.getPackedLengthInWords();
    if (ndbd_send_node_bitmask_in_section(
        getNodeInfo(refToNode(cntrlblockref)).m_version))
    {
      LinearSectionPtr lsptr[3];
      lsptr[0].p = signal->theData + 1;
      lsptr[0].sz = m_sr_nodes.getPackedLengthInWords();
      sendSignal(cntrlblockref, GSN_NDB_STARTCONF, signal,
                     1, JBB, lsptr, 1);
    }
    else if (packed_length <= NdbNodeBitmask48::Size)
    {
      sendSignal(cntrlblockref, GSN_NDB_STARTCONF, signal,
                 1 + NdbNodeBitmask48::Size, JBB);
    }
    else
    {
      ndbabort();
    }
    return;
  }
  return;
}//Dbdih::execSTART_COPYCONF()
3258 
3259 /*---------------------------------------------------------------------------*/
3260 /*                    MASTER LOGIC FOR NODE RESTART                          */
3261 /*---------------------------------------------------------------------------*/
3262 /*                    NODE RESTART PERMISSION REQUEST                        */
3263 /*---------------------------------------------------------------------------*/
3264 // A REQUEST FROM A STARTING NODE TO PERFORM A NODE RESTART. IF NO OTHER NODE
3265 // IS ACTIVE IN PERFORMING A NODE RESTART AND THERE ARE NO ACTIVE PROCESSES IN
3266 // THIS NODE INVOLVING THE STARTING NODE  THIS REQUEST WILL BE GRANTED.
3267 /*---------------------------------------------------------------------------*/
/**
 * START_PERMREQ (executed in the master)
 * --------------------------------------
 * A starting node asks for permission to perform a node restart.
 * The request is refused while an LCP master takeover is in progress,
 * while another node restart is active, when the node is not allowed
 * to start or is not DEAD, or when its file system is unusable and it
 * did not ask for an initial node restart. Otherwise the inclusion
 * procedure is started by broadcasting START_INFOREQ.
 */
void Dbdih::execSTART_PERMREQ(Signal* signal)
{
  StartPermReq * const req = (StartPermReq*)&signal->theData[0];
  jamEntry();
  const BlockReference retRef = req->blockRef;
  const Uint32 nodeId   = req->nodeId;
  const Uint32 typeStart = req->startType;
  CRASH_INSERTION(7122);
  // Only the master grants start permission, and the request must come
  // from the starting node itself.
  ndbrequire(isMaster());
  ndbrequire(refToNode(retRef) == nodeId);
  if (c_lcpMasterTakeOverState.state != LMTOS_IDLE)
  {
    jam();
    infoEvent("DIH : Denied request for start permission from %u "
              "while LCP Master takeover in progress.",
              nodeId);
    g_eventLogger->info("DIH : Denied request for start permission from %u "
                        "while LCP Master takeover in progress.",
                        nodeId);
    signal->theData[0] = nodeId;
    signal->theData[1] = StartPermRef::ZNODE_START_DISALLOWED_ERROR;
    sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
    return;
  }
  if ((c_nodeStartMaster.activeState) ||
      (c_nodeStartMaster.wait != ZFALSE) ||
      ERROR_INSERTED_CLEAR(7175)) {
    jam();
    // Another node restart is already in progress; the starting node
    // will retry after a delay (see execSTART_PERMREF in the starter).
    signal->theData[0] = nodeId;
    signal->theData[1] = StartPermRef::ZNODE_ALREADY_STARTING_ERROR;
    sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
    return;
  }//if

  if (!getAllowNodeStart(nodeId))
  {
    jam();
    g_eventLogger->info("Rejecting attempt to start node %u", nodeId);
// Shared reject path; also reached via goto from the DEAD-status check
// below.
ref:
    signal->theData[0] = nodeId;
    signal->theData[1] = StartPermRef::ZNODE_START_DISALLOWED_ERROR;
    sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
    return;
  }
  if (getNodeStatus(nodeId) != NodeRecord::DEAD)
  {
    jam();
    g_eventLogger->error("nodeStatus in START_PERMREQ = %u",
                         (Uint32) getNodeStatus(nodeId));
    goto ref;
  }//if

  // A last completed GCI of 0 marks the node's file system as unusable:
  // only an initial node restart is acceptable then.
  if (SYSFILE->lastCompletedGCI[nodeId] == 0 &&
      typeStart != NodeState::ST_INITIAL_NODE_RESTART)
  {
    jam();
    signal->theData[0] = nodeId;
    signal->theData[1] = StartPermRef::InitialStartRequired;
    sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
    return;
  }

  /*----------------------------------------------------------------------
   * WE START THE INCLUSION PROCEDURE
   * ---------------------------------------------------------------------*/
  c_nodeStartMaster.failNr   = cfailurenr;
  c_nodeStartMaster.wait     = ZFALSE;
  c_nodeStartMaster.startInfoErrorCode = 0;
  c_nodeStartMaster.startNode = nodeId;
  c_nodeStartMaster.activeState = true;
  c_nodeStartMaster.m_outstandingGsn =  GSN_START_INFOREQ;

  setNodeStatus(nodeId, NodeRecord::STARTING);
  /**
   * But if it's a NodeState::ST_INITIAL_NODE_RESTART
   *
   * We first have to clear LCP's
   * For normal node restart we simply ensure that all nodes
   * are informed of the node restart
   */
  // Broadcast START_INFOREQ (via sendLoopMacro); replies are gathered in
  // execSTART_INFOCONF / execSTART_INFOREF.
  StartInfoReq *const r =(StartInfoReq*)&signal->theData[0];
  r->startingNodeId = nodeId;
  r->typeStart = typeStart;
  r->systemFailureNo = cfailurenr;
  sendLoopMacro(START_INFOREQ, sendSTART_INFOREQ, RNIL);
}//Dbdih::execSTART_PERMREQ()
3354 
execSTART_INFOREF(Signal * signal)3355 void Dbdih::execSTART_INFOREF(Signal* signal)
3356 {
3357   StartInfoRef * ref = (StartInfoRef*)&signal->theData[0];
3358   if (getNodeStatus(ref->startingNodeId) != NodeRecord::STARTING) {
3359     jam();
3360     return;
3361   }//if
3362   ndbrequire(c_nodeStartMaster.startNode == ref->startingNodeId);
3363   c_nodeStartMaster.startInfoErrorCode = ref->errorCode;
3364   startInfoReply(signal, ref->sendingNodeId);
3365 }//Dbdih::execSTART_INFOREF()
3366 
execSTART_INFOCONF(Signal * signal)3367 void Dbdih::execSTART_INFOCONF(Signal* signal)
3368 {
3369   jamEntry();
3370   StartInfoConf * conf = (StartInfoConf*)&signal->theData[0];
3371   if (getNodeStatus(conf->startingNodeId) != NodeRecord::STARTING) {
3372     jam();
3373     return;
3374   }//if
3375   ndbrequire(c_nodeStartMaster.startNode == conf->startingNodeId);
3376   startInfoReply(signal, conf->sendingNodeId);
3377 }//Dbdih::execSTART_INFOCONF()
3378 
/**
 * Called in the master for each START_INFOCONF/START_INFOREF reply.
 *
 * NOTE(review): receiveLoopMacro appears to return from this function
 * until replies from all participants have arrived — confirm against
 * the macro definition. Once the loop completes: on success send
 * START_PERMCONF to the starting node; on any recorded error send
 * START_PERMREF and reset the start state so a retry is possible.
 */
void Dbdih::startInfoReply(Signal* signal, Uint32 nodeId)
{
  receiveLoopMacro(START_INFOREQ, nodeId);
  /**
   * We're finished with the START_INFOREQ's
   */
  if (c_nodeStartMaster.startInfoErrorCode == 0)
  {
    jam();
    /**
     * Everything has been a success so far
     *
     * Update node recovery status that we now have received permission to
     * perform node restart from all live nodes. This code only executes
     * in the master node.
     */
    setNodeRecoveryStatus(c_nodeStartMaster.startNode,
                          NodeRecord::START_PERMITTED);

    StartPermConf * conf = (StartPermConf*)&signal->theData[0];
    conf->startingNodeId = c_nodeStartMaster.startNode;
    conf->systemFailureNo = cfailurenr;
    conf->microGCP = m_micro_gcp.m_enabled;
    sendSignal(calcDihBlockRef(c_nodeStartMaster.startNode),
               GSN_START_PERMCONF, signal, StartPermConf::SignalLength, JBB);
    c_nodeStartMaster.m_outstandingGsn = GSN_START_PERMCONF;
  }
  else
  {
    /**
     * Failure of START_INFO protocol, another node wasn't ready to
     * start this node, some part of handling a previous node failure
     * hadn't completed yet. The node will have to wait a bit more.
     * We need to restore the state such that the retry is possible.
     */
    jam();
    StartPermRef * ref = (StartPermRef*)&signal->theData[0];
    ref->startingNodeId = c_nodeStartMaster.startNode;
    ref->errorCode = c_nodeStartMaster.startInfoErrorCode;
    sendSignal(calcDihBlockRef(c_nodeStartMaster.startNode),
               GSN_START_PERMREF, signal, StartPermRef::SignalLength, JBB);
    // Mark the node DEAD again and clear the master's start state so the
    // next START_PERMREQ can be accepted.
    setNodeStatus(c_nodeStartMaster.startNode, NodeRecord::DEAD);
    nodeResetStart(signal);
  }//if
}//Dbdih::startInfoReply()
3424 
3425 /**
3426  *---------------------------------------------------------------------------
3427  * LCP Pausing module
3428  * ------------------
3429  *
3430  * This module contains code that executes for the purpose of pausing
3431  * LCP reporting to our meta data for a short time while we are copying the
3432  * meta data to a new starting node.
3433  *
3434  * In order to better understand the handling of the LCP protocol we will
3435  * describe the LCP protocol, this includes both the old and the new protocol.
3436  *
3437  * The LCP protocol is controlled by the DIH in the master node.
3438  * When an LCP has been completed we will immediately start checking for
3439  * the need for a new LCP to be started.
3440  *
3441  * The first step here is to ensure that we have had sufficient activity in
3442  * the cluster to necessitate an LCP to be executed again.
3443  *
3444  * To check this we send TCGETOPSIZEREQ to all DBTCs in the cluster. This
3445  * will gather in an estimate of how much writes we've had in the cluster
3446  * since the last LCP was started. There are also various ways to ensure
3447  * that we start an LCP immediately if so needed.
3448  *
3449  * If the activity was sufficient we will start the LCP.
3450  * Before starting the LCP we will calculate a number of GCI values that
3451  * are important, oldest restorable GCI and so forth.
3452  * Next we will send TC_CLOPSIZEREQ to all DBTCs in the cluster to clear
3453  * the activity counter in DBTC as preparation for the next LCP start.
3454  *
3455  * In the old way we will then grab a mutex on the fragment info, this
3456  * mutex will be held until the LCP is completed. The mutex is held in
3457  * the master node, in a master takeover the mutex needs to be taken
3458  * also in the new master node. Since all LCPs goes through the master
3459  * node this has the same effect as a distributed mutex on the fragment
3460  * info.
3461  *
3462  * In the new way we will start the LCP immediately here without grabbing
3463  * the mutex.
3464  *
3465  * The first step in starting is to calculate the set of LQHs involved in
3466  * the LCP and the set of DIHs involved in the LCP. A node is involved in
3467  * the LCP in DIH if it has had the meta data copied to it. It will
3468  * participate in an LCP in LQH if the data has been restored and we're
3469  * ready to perform a full LCP.
3470  *
3471  * Next we update to the new LCP id of the new LCP.
3472  *
3473  * The next step is performed in the master node by walking through all
3474  * fragment replicas of all active tables to see how much of the REDO log
3475  * we can cut away when starting the new LCP. At the first order of a
3476  * LCP of a fragment in an LDM instance we will set the new log tail in
3477  * that LDM instance.
3478  *
3479  * After calculating the new GCI values and setting the LCP id we will
3480  * synchronize this information with all other nodes in the cluster.
3481  * This information will also be synchronized to the file system in
3482  * the Sysfile. This file is where all restarts start by looking at
3483  * the state of the our database on files.
3484  * The COPY_GCIREQ signal is used to distribute this message.
3485  *
3486  * When all nodes have synchronized this information to disk and confirmed
3487  * this to the master then we are ready to start sending orders to perform
3488  * the individual checkpoints of the fragment replicas.
3489  *
3490  * The next step is that we want to set the tables to be involved in the
3491  * LCP. At this point we want to ensure that the same set of tables is
3492  * calculated in all nodes. To ensure this we grab the mutex that ensures
3493  * no tables are able to commit their CREATE TABLE statements until we are
3494  * done with this step.
3495  * This is started by the signal START_LCP_REQ. This signal also contains
3496  * list of nodes involved in the LCP both for LQH and DIH.
3497  *
3498  * CREATE TABLE can create new tables prior to this point  which we will
3499  * include, and that's ok as they cannot possibly affect the new redo tail
3500  * position. DROP TABLE can drop tables prior to this point, which could
3501  * remove the need to maintain some old redo, but that will be handled in
3502  * the following LCP.
3503  *
3504  * Each table to execute the LCP on is marked with a proper state in the
3505  * variable tabLcpStatus. Also each fragment replica to execute the LCP
3506  * on is marked with true in the lcpOngoingFlag and we set the number of
3507  * replicas to perform LCP on per fragment as well.
3508  *
3509  * These preparatory steps are done in a synchronized manner, so all nodes
3510  * have received information about the COPY_GCIREQ and now all nodes have
3511  * heard the START_LCP_REQ signals. So in a master takeover we can ask all
3512  * nodes about their LCP state and we can derive if we sent the COPY_GCIREQ
3513  * to all nodes and similarly we can derive if we sent and completed the
3514  * START_LCP_REQ step. To derive this requires all nodes to have heard of
3515  * those signals, not just one of them since a crash can occur in the
3516  * middle of signal sending.
3517  *
3518  * In a master takeover if we haven't completed the COPY_GCIREQ step then
3519  * we can start the next LCP from the beginning again. If COPY_GCIREQ has
3520  * been completed but not the START_LCP_REQ, then we can restart the
3521  * START_LCP_REQ step. Finally if the START_LCP_REQ has been completed
3522  * then we know that the execution of checkpoints on individual fragment
3523  * replicas is ongoing. Obviously in a master take over we should ensure
3524  * that the processing of START_LCP_REQ is completed before we report
3525  * back our state to the master node to ensure that we make the master
3526  * takeover handling as simple as possible.
3527  *
3528  * So now that we know exactly which tables and fragment replicas to checkpoint
3529  * it is time to start the actual checkpoint phase.
3530  *
3531  * The master node will send LCP_FRAG_ORD to DBLQH for each of the fragment
3532  * replicas to execute the LCP on.
3533  *
3534  * In the old way there was a queue of such LCP_FRAG_ORD with limited size in
3535  * DBDIH (queue size was 2 in 7.3 and earlier and 128 in early 7.4 versions).
3536  * Also DBLQH had a queue for LCP_FRAG_ORDs, in 7.3 this was 2 in size and
3537  * in early versions of 7.4 it was 64.
3538  *
3539  * In the new version we can send LCP_FRAG_ORD to LQH as before, LQH has an
3540  * infinite queue size (it simply stores the LCP_FRAG_ORD on the fragment
3541  * record, so there is no limit to the queue size since all fragments can
3542  * be in the queue). In addition at master takeover we also support receiving
3543  * the same order two or more times. By ensuring that we keep track of that
3544  * we already received a LCP_FRAG_ORD on a fragment we can also easily discard
3545  * LCP_FRAG_ORDs that we already received.
3546  *
3547  * These features mean that LQH can process a Local Checkpoint without much
3548  * interaction with DIH / DIH Master, which enables simplifications at DIH
3549  * and DIH Master in later versions. In principle we could send off all
3550  * LCP_FRAG_ORDs immediately if we like and more or less turn the LDM
3551  * instances into independent LCP execution engines. This is a step in the
3552  * direction of more local control in LQH over LCP execution.
3553  *
 * When all LCP_FRAG_ORD have been sent, a special LCP_FRAG_ORD is sent to all
 * participating LQH nodes. This signal has the flag lastFragmentFlag set,
3556  * it doesn't contain any fragment to checkpoint, it is only a flag that
3557  * indicates that we've sent the last LCP_FRAG_ORD.
3558  *
3559  * LQH will execute orders to execute LCP on a fragment in the order they are
3560  * received. As a fragment is completing its LCP it will generate a new message
3561  * LCP_FRAG_REP. This message is broadcasted to all participating DIHs. First
3562  * the message is sent from DBLQH to the local DIH. Finally the local DIH will
3563  * broadcast it to all participating DIHs.
3564  *
3565  * This new Pausing LCP module is involved here by being able to queue also
3566  * LCP_FRAG_REP before they are broadcast to the participating DIHs. They are
3567  * queued on the fragment replica records in the local DIH and thus we have
3568  * no limits on the queue size.
3569  *
3570  * This allows the DIH Master state to be stabilised as necessary during an
3571  * LCP, removing the need in some cases to wait for an LCP to complete before
3572  * performing some other activity.
3573  *
3574  * When LQH have executed all the LCP_FRAG_ORDs and have received the
3575  * last fragment flag, then the LDM will perform a number of activities to
 * complete the local checkpoint. These activities are mostly used by the
3577  * disk data tables.
3578  *
3579  * After all these activities have completed the LQH will send
3580  * LCP_COMPLETE_REP to the local DIH. The local DIH will broadcast it to all
3581  * participating DIHs.
3582  *
 * When all LQHs have sent all LCP_FRAG_REP and have also sent the
3584  * LCP_COMPLETE_REP, then the LCP is completed. So a node that has seen
3585  * LCP_COMPLETE_REP from all nodes participating in the LCP knows that
3586  * it has received all the LCP_FRAG_REP for the LCP.
3587  *
3588  * In a master takeover in the old way we could not resend the LCP_FRAG_ORD
3589  * to the LQH again. To avoid this we used an extra master takeover
3590  * protocol EMPTY_LCP_REQ. This protocol ensures that all LQHs have completed
3591  * the queues and that all LCP_FRAG_REPs have been sent to all participating
3592  * DIHs and likewise with the LCP_COMPLETE_REP such that the new master has
3593  * a precise view of which fragment replicas have completed the LCP execution
3594  * so far.
3595  *
3596  * Thus when the master takeover is completed we know that each DIH has all
3597  * the LCP_FRAG_REP for which an LCP_FRAG_ORD have been sent and also all
3598  * LCP_COMPLETE_REP that have been produced. This means that we are now
3599  * ready to restart the process of sending LCP_FRAG_ORD again.
3600  *
 * The problem with this approach is that it can consume a very long time
 * to execute the entire LCP fragment queue in LQH if the queue size
 * increases (increased from 2 to 64 going from 7.3 to 7.4) and the size
 * of the fragments also increases. So the master takeover can take a
 * substantial time in this case.
3606  *
3607  * So the new manner is to allow for the LQH to get LCP_FRAG_ORD and also
3608  * the special last LCP_FRAG_ORD several times with the same LCP id and
3609  * discard those that it receives for a second time. In this manner we can
3610  * simply restart sending the LCP_FRAG_ORD from the beginning. When we are
3611  * done with this we can start checking for completion of the LCP in the
3612  * normal way.
3613  *
3614  * When the master has sent the last special LCP_FRAG_ORD and these have been
3615  * received by the receiving nodes, then the master will actually itself not
3616  * do anything more to execute the LCP. The non-master nodes will however send
3617  * LCP_COMPLETE_REP to the master node. So this means that a new LCP won't
3618  * start until all participating DIHs have completed the processing of the
3619  * last LCP.
3620  *
3621  * So effectively taking over as master in this phase doesn't really require
3622  * any specific work other than redirecting the LCP_COMPLETE_REP from the
3623  * non-masters to the new master. If it has already been sent it should be
3624  * seen in the response to the MASTER_LCPREQ from the node. So after
3625  * receiving the last MASTER_LCPCONF we have information enough about whether
3626  * we need to send more LCP_FRAG_ORDs or not.
3627  *
3628  * We can still optimise the sending of LCP_FRAG_ORD a little bit by avoiding
3629  * to send LCP_FRAG_ORD to a fragment replica where we have already received
3630  * a LCP_FRAG_REP for it. It would be possible to avoid sending extra
3631  * LCP_FRAG_ORDs in various ways, but it doesn't really cost much, LCP_FRAG_ORD
3632  * is a small signal and the number of signals sent is limited to the number
3633  * of fragment replicas. So this would make sense if we have to support
3634  * extremely large clusters and extremely many tables in combination.
3635  *
3636  * As this description shows some interesting places to test master failures
3637  * are:
3638  * 1) Master failure while clearing TC counters (TC_CLOPSIZEREQ).
3639  * 2) Master failure while distributing COPY_GCIREQ.
3640  * 3) Master failure while distributing START_LCP_REQ
3641  * 4) Master failure while processing the LCP and sending LCP_FRAG_ORDs
3642  * 4.1) Before any LCP_FRAG_REP received
3643  * 4.2) After receiving many LCP_FRAG_REPs, but not all
3644  * 4.3) After receiving all LCP_FRAG_REPs, but not all LCP_COMPLETE_REPs
3645  * 4.4) After receiving all LCP_FRAG_REPs, and all LCP_COMPLETE_REPs.
3646  *
3647  * While distributing above can be interpreted as one test case of before
3648  * distributing, one in the middle of distributing and one when all
3649  * responses have been received.
3650  *
3651  * It is also important to similarly test PAUSE_LCP_REQ handling in all of
3652  * the above states. This can be handled by inserting an ERROR_INSERT that
3653  * effectively stops the process to copy meta data at some point and then
3654  * setting some variable that triggers the copying of meta data to continue
3655  * at a state that we wanted to accomplish.
3656  *---------------------------------------------------------------------------*/
3657 /* Initialisation routine, called once at startup of the node */
init_lcp_pausing_module(void)3658 void Dbdih::init_lcp_pausing_module(void)
3659 {
3660   /* Master state variables */
3661   c_pause_lcp_master_state = PAUSE_LCP_IDLE;
3662   c_lcp_runs_with_pause_support = false;
3663 
3664   /* Pause participant state variables */
3665   c_dequeue_lcp_rep_ongoing = false;
3666   c_queued_lcp_complete_rep = false;
3667   c_lcp_id_paused = RNIL;
3668   c_pause_lcp_start_node = RNIL;
3669   c_last_id_lcp_complete_rep = RNIL;
3670 
3671   /* Starting node state variable */
3672   c_lcp_id_while_copy_meta_data = RNIL;
3673 }
3674 
check_pause_state_lcp_idle(void)3675 void Dbdih::check_pause_state_lcp_idle(void)
3676 {
3677   /**
3678    * We should not be able to complete an LCP while still having
3679    * queued LCP_COMPLETE_REP and LCP_FRAG_REP.
3680    */
3681   ndbrequire(c_queued_lcp_frag_rep.isEmpty());
3682   ndbrequire(!c_queued_lcp_complete_rep);
3683 }
3684 
3685 /* Support function only called within ndbassert */
check_pause_state_sanity(void)3686 bool Dbdih::check_pause_state_sanity(void)
3687 {
3688   if (is_lcp_paused())
3689   {
3690     ndbrequire(!c_dequeue_lcp_rep_ongoing);
3691   }
3692   ndbrequire(c_lcp_id_paused == RNIL ||
3693              is_lcp_paused() ||
3694              c_dequeue_lcp_rep_ongoing);
3695   return true;
3696 }
3697 
3698 /* Support function for execLCP_FRAG_REP */
queue_lcp_frag_rep(Signal * signal,LcpFragRep * lcpReport)3699 void Dbdih::queue_lcp_frag_rep(Signal *signal, LcpFragRep *lcpReport)
3700 {
3701   Uint32 tableId = lcpReport->tableId;
3702   Uint32 fragId = lcpReport->fragId;
3703 
3704   TabRecordPtr tabPtr;
3705   tabPtr.i = tableId;
3706   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
3707 
3708   if (tabPtr.p->tabStatus == TabRecord::TS_DROPPING ||
3709       tabPtr.p->tabStatus == TabRecord::TS_IDLE)
3710   {
3711     jam();
3712     return;
3713   }
3714 
3715   FragmentstorePtr fragPtr;
3716   getFragstore(tabPtr.p, fragId, fragPtr);
3717 
3718   ReplicaRecordPtr replicaPtr;
3719   findReplica(replicaPtr, fragPtr.p, lcpReport->nodeId);
3720   c_queued_lcp_frag_rep.addLast(replicaPtr);
3721   ndbrequire(replicaPtr.p->nextLcp == lcpReport->lcpNo);
3722   ndbrequire(replicaPtr.p->fragId == fragId);
3723   ndbrequire(replicaPtr.p->tableId == tableId);
3724   ndbrequire(replicaPtr.p->procNode == lcpReport->nodeId);
3725   ndbrequire(c_lcp_id_paused == RNIL ||
3726              c_lcp_id_paused == lcpReport->lcpId);
3727   c_lcp_id_paused = lcpReport->lcpId;
3728   replicaPtr.p->repMaxGciStarted = lcpReport->maxGciStarted;
3729   replicaPtr.p->repMaxGciCompleted = lcpReport->maxGciCompleted;
3730   ndbassert(check_pause_state_sanity());
3731 }
3732 
3733 /* Support function for execLCP_COMPLETE_REP */
queue_lcp_complete_rep(Signal * signal,Uint32 lcpId)3734 void Dbdih::queue_lcp_complete_rep(Signal *signal, Uint32 lcpId)
3735 {
3736   ndbrequire(!c_queued_lcp_complete_rep);
3737   c_queued_lcp_complete_rep = true;
3738   ndbrequire(c_lcp_id_paused == RNIL ||
3739              c_lcp_id_paused == lcpId);
3740   c_lcp_id_paused = lcpId;
3741   ndbassert(check_pause_state_sanity());
3742 }
3743 
3744 /* Support function to start copying of meta data */
start_copy_meta_data(Signal * signal)3745 void Dbdih::start_copy_meta_data(Signal *signal)
3746 {
3747   /**
3748    * Now that we have locked both the DICT lock and the LCPs are locked from
3749    * starting we are ready to copy both the distribution information and the
3750    * dictionary information. We update the node recovery status indicating
3751    * this. This code only executes in the master node.
3752    */
3753   setNodeRecoveryStatus(c_nodeStartMaster.startNode,
3754                         NodeRecord::COPY_DICT_TO_STARTING_NODE);
3755 
3756   c_nodeStartMaster.wait = 10;
3757   signal->theData[0] = DihContinueB::ZCOPY_NODE;
3758   signal->theData[1] = 0;
3759   sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
3760   c_nodeStartMaster.m_outstandingGsn = GSN_COPY_TABREQ;
3761 }
3762 
3763 /**---------------------------------------------------------------
3764  * MASTER FUNCTIONALITY
3765  **--------------------------------------------------------------*/
/* Support function to check if LCP is still running */
bool Dbdih::check_if_lcp_idle(void)
{
  /**
   * Master-only check of whether the LCP can be treated as idle, i.e.
   * whether a starting node still needs to be brought into the running
   * LCP. Returns true when no LCP is active or the LCP has passed the
   * point where new participants matter.
   */
  ndbrequire(isMaster());
  switch (c_lcpState.lcpStatus)
  {
  case LCP_STATUS_IDLE:
  case LCP_TCGET:
  case LCP_TC_CLOPSIZE:
  case LCP_WAIT_MUTEX:
    jam();
    /* LCP not yet started: the pause queues must be empty here. */
    check_pause_state_lcp_idle();
    return true;
  case LCP_STATUS_ACTIVE:
    jam();
    return false;
  case LCP_TAB_COMPLETED:
    jam();
    // Fall through
  case LCP_TAB_SAVED:
    jam();
    /**
     * For LCP_TAB_COMPLETED and LCP_TAB_SAVED we have already received
     * all the table information and thus there is no need to get the new
     * node into the LCP, there won't be any updates to the LCP data until
     * the next LCP happens.
     */
    return true;
  default:
    jam();
    return false;
  }
}
3799 
3800 /* Send PAUSE_LCP_REQ to pause or to unpause, master code */
void Dbdih::sendPAUSE_LCP_REQ(Signal *signal, bool pause)
{
  /**
   * Master-only entry point of the PAUSE LCP protocol.
   *
   * pause == true : request all LCP participants to stop distributing
   *                 LCP_FRAG_REP/LCP_COMPLETE_REP (start of pause).
   * pause == false: request all participants to resume again; the exact
   *                 unpause action depends on whether the starting node
   *                 was included into the running LCP or not (derived
   *                 from c_pause_lcp_master_state).
   */
  PauseLcpReq *req = (PauseLcpReq*)signal->getDataPtrSend();

  /**
   * Send to all DIHs that participate in the LCP, including ourselves.
   * We will set up waiting for all those signals such that we can also
   * handle node failures in the middle of the pause process.
   */
  ndbrequire(isMaster());
  if (pause)
  {
    jam();
    ndbrequire(c_pause_lcp_master_state == PAUSE_LCP_IDLE);
    c_pause_lcp_master_state = PAUSE_LCP_REQUESTED;
    req->pauseAction = PauseLcpReq::Pause;
    /* Snapshot the participant set; it stays fixed for this pause. */
    c_pause_participants = c_lcpState.m_participatingLQH;
    infoEvent("PAUSE LCP for starting node %u", c_nodeStartMaster.startNode);
  }
  else
  {
    /**
     * We are unpausing the LCP again after completing the copy of the meta
     * data, slightly different dependent on whether the starting node was
     * included into the LCP or not.
     */
    if (c_pause_lcp_master_state == PAUSE_COMPLETE_LCP_INCLUSION)
    {
      jam();
      ndbrequire(!check_if_lcp_idle());
      c_pause_lcp_master_state = PAUSE_IN_LCP_UNPAUSE;
      req->pauseAction = PauseLcpReq::UnPauseIncludedInLcp;
      infoEvent("UNPAUSE LCP for starting node %u, included in LCP",
                c_nodeStartMaster.startNode);
    }
    else if (c_pause_lcp_master_state == PAUSE_NOT_IN_LCP_COPY_META_DATA)
    {
      jam();
      ndbrequire(check_if_lcp_idle());
      c_pause_lcp_master_state = PAUSE_NOT_IN_LCP_UNPAUSE;
      req->pauseAction = PauseLcpReq::UnPauseNotIncludedInLcp;
      infoEvent("UNPAUSE LCP for starting node %u, not included in LCP",
                c_nodeStartMaster.startNode);
    }
    else
    {
      ndbabort();
    }
  }
  /**
   * The blocks that do the pausing is the local DIH in the nodes that
   * generate LCP_FRAG_REPs and LCP_COMPLETE_REPs. These are the
   * m_participatingLQH nodes. This set is untouched by new starting
   * nodes for this LCP. New nodes can be added to the next LCP, but
   * not to this one.
   *
   * As part of the pause protocol the starting node must also participate
   * in the LCP completion protocol, so the pause also includes taking the
   * starting node into the DIH node set that participates in the LCP.
   * We do however wait including the node until we reach the UnPause
   * action. The reason is that it is possible that the LCP is completed
   * in the process of pausing. In this case we will continue
   * completing the pause in the normal manner, but we will not send
   * START_LCP_REQ to the new node and we will not include the new in the
   * m_participatingDIH bitmap in the DIH nodes already participating
   * in the LCP.
   *
   * For those nodes that existed previously in the m_participatingDIH
   * bitmap, but not in the m_participatingLQH bitmap we need not
   * worry since they won't make use of the m_participatingDIH bitmap.
   * So there is no need to add the starting node into those. The
   * m_participatingDIH bitmap is used by those nodes that generate
   * LCP_FRAG_REPs and LCP_COMPLETE_REPs, and these nodes are exactly
   * the nodes found in the m_participatingLQH bitmap.
   */

  req->senderRef = reference();
  req->startNodeId = c_nodeStartMaster.startNode;
  if (req->pauseAction == PauseLcpReq::UnPauseIncludedInLcp)
  {
    jam();
    /**
     * The starting node is being included into the LCP completion
     * protocol, so the master must also wait for its LCP_COMPLETE_REP.
     */
    c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.setWaitingFor(
      c_nodeStartMaster.startNode);
  }
  /* Track one PAUSE_LCP_CONF per participant (see execPAUSE_LCP_CONF). */
  c_PAUSE_LCP_REQ_Counter.setWaitingFor(c_pause_participants);
  NodeReceiverGroup rg(DBDIH, c_pause_participants);
  rg.m_nodes.clear(getOwnNodeId());
  sendSignal(rg, GSN_PAUSE_LCP_REQ, signal,
             PauseLcpReq::SignalLength, JBB);
  /**
   * We execute the signal to ourself immediately, the reason is to
   * avoid having to add a specific state variable to detect when the
   * starting node have failed between now and receiving this signal.
   */
  execPAUSE_LCP_REQ(signal);
  ndbassert(check_pause_state_sanity());
}
3898 
3899 /* Master code, other node has completed PAUSE_LCP_REQ */
void Dbdih::execPAUSE_LCP_CONF(Signal *signal)
{
  /**
   * Master-only: one participant has completed its part of a
   * PAUSE_LCP_REQ (either the pause or the unpause action). We collect
   * one CONF per participant; receiveLoopMacro appears to return from
   * this function until the final outstanding participant has replied
   * (NOTE(review): standard NDB counter-macro behavior — confirm against
   * the macro definition).
   */
  PauseLcpConf *conf = (PauseLcpConf*)&signal->theData[0];
  Uint32 nodeId = refToNode(conf->senderRef);
  Uint32 startNode = conf->startNodeId;

  ndbrequire(isMaster());

  if (!is_pause_for_this_node(startNode))
  {
    /* Ignore, node died in the process */
    jam();
    return;
  }
  ndbassert(check_pause_state_sanity());
  receiveLoopMacro(PAUSE_LCP_REQ, nodeId);

  /* All participants have now replied. */
  if (c_pause_lcp_master_state == PAUSE_LCP_REQUESTED)
  {
    jam();
    /**
     * We have paused the reporting of LCPs, we are now ready to process the
     * copying of meta data. At this point in time we have sent PAUSE_LCP_REQ
     * to all LQH nodes participating in the LCP. Those in turn have sent
     * FLUSH_LCP_REP_REQ to all DIH participants and received a response
     * back from all nodes. This means that we have ensured that we have
     * absolutely no LCP_FRAG_REP and LCP_COMPLETE_REP signals in transit
     * in the entire cluster since we have sent a signal through every
     * link that could carry such a signal. We use the FIFO queue mechanism
     * of signals between two DIHs here as an important part of the protocol.
     *
     * This means that all DIHs now have the same view on the
     * LCP_FRAG_REPs they have seen and similarly for LCP_COMPLETE_REPs.
     * The LCP_COMPLETE_REPs could however still be sent back to ourselves
     * through a delayed signal since we don't want to process those
     * signals concurrently with pausing the LCP.
     *
     * We could end up in a situation where the LCP have completed here, but
     * this isn't a problem, we still hold the fragment info mutex, so no
     * new LCP can start until we are done with the copying and release the
     * fragment info mutex.
     */
    ndbassert(check_pause_state_sanity());
    check_for_pause_action(signal, StartLcpReq::PauseLcpStartFirst);
    return;
  }
  /**
   * UnPause
   * ------
   * This is the normal path for unpausing. At this point we have sent
   * PAUSE_LCP_REQ to all LQH nodes participating in the LCP. These nodes
   * have now started sending the LCP_FRAG_REPs and LCP_COMPLETE_REPs
   * again. The copying of meta data have been completed and we have
   * been included in the LCP handling. So we are now ready to proceed
   * with the node restart again. We will also perform the unpause
   * on the master node here to avoid interesting states between
   * stop pause and receiving the last PAUSE_LCP_CONF.
   */
  jam();
  ndbrequire(c_pause_lcp_master_state == PAUSE_NOT_IN_LCP_UNPAUSE ||
             c_pause_lcp_master_state == PAUSE_IN_LCP_UNPAUSE);
  if (c_pause_lcp_master_state == PAUSE_NOT_IN_LCP_UNPAUSE)
  {
    jam();
    end_pause(signal, PauseLcpReq::UnPauseNotIncludedInLcp);
  }
  else if (c_pause_lcp_master_state == PAUSE_IN_LCP_UNPAUSE)
  {
    jam();
    end_pause(signal, PauseLcpReq::UnPauseIncludedInLcp);
  }
  else
  {
    ndbabort();
  }
  /* Meta data copy is done and the pause is released; resume restart. */
  dihCopyCompletedLab(signal);
}
3977 
3978 /**-------------------------------------------------------------------
3979   FUNCTIONS USED IN ALL NODES
3980 --------------------------------------------------------------------*/
3981 /**
3982  * PAUSE_LCP_REQ
3983  * -------------
3984  * This signal is sent from the master node to all DIHs to block distribution
3985  * of LCP_FRAG_REP signals. When we receive this signal we will queue all
3986  * signals that we receive from DBLQH about completed LCP fragments. The same
3987  * signal is also sent to stop the pause. The pauseAction is 0 for pause and
3988  * 1 for stop pause.
3989  *
3990  * After pausing locally in our own DBDIH, we will send a FLUSH_LCP_REP_REQ
3991  * to all nodes participating in the LCP. This ensures that any LCP_FRAG_REP
3992  * we have sent out has been received by the receiving node since we are
3993  * sending it on the same path and we have a guarantee that signals using
3994  * the same path won't race each other.
3995  */
execPAUSE_LCP_REQ(Signal * signal)3996 void Dbdih::execPAUSE_LCP_REQ(Signal *signal)
3997 {
3998   PauseLcpReq *req = (PauseLcpReq*) &signal->theData[0];
3999   PauseLcpReq::PauseAction pauseAction =
4000     (PauseLcpReq::PauseAction)req->pauseAction;
4001   Uint32 startNode = req->startNodeId;
4002 
4003   ndbrequire(req->senderRef == cmasterdihref);
4004   ndbassert(check_pause_state_sanity());
4005 
4006   /* TODO: Insert check that startNode is still alive here */
4007   if (pauseAction == PauseLcpReq::Pause)
4008   {
4009     jam();
4010     pause_lcp(signal, startNode, req->senderRef);
4011   }
4012   else
4013   {
4014     jam();
4015     unpause_lcp(signal,
4016                 startNode,
4017                 req->senderRef,
4018                 pauseAction);
4019   }
4020   return;
4021 }
4022 
void Dbdih::pause_lcp(Signal *signal,
                      Uint32 startNode,
                      BlockReference sender_ref)
{
  /**
   * Participant-side handling of PAUSE_LCP_REQ with pauseAction == Pause.
   * Records the pause state for startNode and flushes the signal links so
   * that no LCP report can be in transit anywhere in the cluster.
   */
  /**
   * Since the message comes from the master on behalf of the starting
   * node we need to ensure that the starting node hasn't failed already.
   * We handle stopping of pause at node failure, but if this arrives
   * after we already received NODE_FAILREP we need to ensure that we
   * don't proceed since this will cause havoc.
   */
  if (!isMaster())
  {
    /**
     * We should come here after getting permit to start node, but before
     * we the node is included into the LCP and GCP protocol, this happens
     * immediately after we copied the meta data which the PAUSE LCP
     * protocol is part of handling.
     */
    NodeRecordPtr nodePtr;
    nodePtr.i = startNode;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    if (!nodePtr.p->is_pausable)
    {
      jam();
      /* Ignore, node already died */
      return;
    }
  }

  ndbrequire(sender_ref == cmasterdihref);
  if (c_dequeue_lcp_rep_ongoing)
  {
    jam();
    /**
     * Stop unpause mechanism as we are starting a new pause action.
     */
    c_dequeue_lcp_rep_ongoing = false;
  }
  /* From here on incoming LCP reports are queued for this pause. */
  c_pause_lcp_start_node = startNode;

  /**
   * Send flush signal to all nodes participating in LCP.
   * We need not send to ourselves since we don't send LCP_FRAG_REP
   * to ourselves. We need to keep track of which nodes that have
   * replied to the message.
   */
  FlushLcpRepReq *req = (FlushLcpRepReq*) signal->getDataPtrSend();
  req->senderRef = reference();
  req->startNodeId = startNode;
  c_FLUSH_LCP_REP_REQ_Counter.setWaitingFor(c_lcpState.m_participatingDIH);
  NodeReceiverGroup rg(DBDIH, c_lcpState.m_participatingDIH);

  sendSignal(rg, GSN_FLUSH_LCP_REP_REQ, signal,
             FlushLcpRepReq::SignalLength, JBB);

  ndbassert(check_pause_state_sanity());
}
4081 
void Dbdih::check_for_pause_action(Signal *signal,
                                   StartLcpReq::PauseStart pauseStart)
{
  /**
   * Master-only continuation of the pause protocol, called twice:
   * - PauseLcpStartFirst : after all PAUSE_LCP_CONFs, before the copy of
   *   meta data starts.
   * - PauseLcpStartSecond: after the meta data copy has completed.
   * If the LCP is still active the starting node is prepared for
   * inclusion via START_LCP_REQ; if the LCP completed while paused we
   * skip the inclusion and proceed with copy/unpause directly.
   */
  ndbrequire(is_lcp_paused());
  if (!check_if_lcp_idle())
  {
    jam();
    /**
     * A next step when we have paused the LCP execution is to get the
     * starting node active in the LCP handling. This means we need to send
     * START_LCP_REQ to the node. We won't track the reply here since a
     * missing reply is due to a crashed node and then the node failure
     * handling will ensure that the LCP is restarted and that the pause of
     * the LCP is unpaused.
     * (A test case for this is needed).
     *
     * At this point in time we have stalled all activity in the LCP.
     * This means that the bit maps on participating LQHs and DIHs is
     * stable, it also means that the bit maps for which LQHs and DIHs
     * that have completed is also stable (we have stopped LCP_COMPLETE_REP
     * to pass through in all nodes). There might be LQHs and DIHs that
     * have already completed and we need this information to also be
     * transferred to the starting node for it to be able to complete
     * the LCP processing properly.
     *
     * This means we actually have to send two signals with all four
     * bitmaps. After these signals have been sent over we will
     * be ready to copy the meta data and after that to unpause and
     * complete this LCP with the starting node as a new participant.
     *
     * It is vital to send this information before we copy the meta
     * data since the m_participatingLQH bitmap is needed to set
     * the lcpOngoing flag on the replicas set correctly.
     */
    StartLcpReq* req = (StartLcpReq*)signal->getDataPtrSend();
    BlockReference ref = calcDihBlockRef(c_nodeStartMaster.startNode);
    req->senderRef = reference();
    req->lcpId = SYSFILE->latestLCP_ID;
    req->pauseStart = pauseStart;
    /* The receiver's version decides how the node bitmasks are sent. */
    Uint32 rec_node_version =
        getNodeInfo(c_nodeStartMaster.startNode).m_version;

    if (pauseStart == StartLcpReq::PauseLcpStartFirst)
    {
      jam();
      ndbrequire(c_pause_lcp_master_state == PAUSE_LCP_REQUESTED);
      c_pause_lcp_master_state = PAUSE_START_LCP_INCLUSION;
      Uint32 packed_length1 = c_lcpState.m_participatingLQH.getPackedLengthInWords();
      Uint32 packed_length2 = c_lcpState.m_participatingDIH.getPackedLengthInWords();

      if (ndbd_send_node_bitmask_in_section(rec_node_version))
      {
        jam();
        /* Newer receivers: ship both bitmasks as signal sections. */
        Uint32 participatingLQH[NdbNodeBitmask::Size];
        Uint32 participatingDIH[NdbNodeBitmask::Size];
        c_lcpState.m_participatingLQH.copyto(NdbNodeBitmask::Size, participatingLQH);
        c_lcpState.m_participatingDIH.copyto(NdbNodeBitmask::Size, participatingDIH);
        LinearSectionPtr lsptr[3];
        lsptr[0].p = participatingLQH;
        lsptr[0].sz = packed_length1;
        lsptr[1].p = participatingDIH;
        lsptr[1].sz = packed_length2;
        req->participatingLQH_v1.clear();
        req->participatingDIH_v1.clear();

        sendSignal(ref, GSN_START_LCP_REQ, signal,
                           StartLcpReq::SignalLength, JBB, lsptr, 2);
      }
      else if ((packed_length1 <= NdbNodeBitmask48::Size) &&
               (packed_length2 <= NdbNodeBitmask48::Size))
      {
        jam();
        /* Old receivers: bitmasks must fit in the fixed v1 fields. */
        req->participatingLQH_v1 = c_lcpState.m_participatingLQH;
        req->participatingDIH_v1 = c_lcpState.m_participatingDIH;
        sendSignal(ref, GSN_START_LCP_REQ, signal,
                   StartLcpReq::SignalLength, JBB);
      }
      else
      {
        ndbabort();
      }
    }
    else
    {
      bool found = false;
      bool found_high_node_id = false;
      NdbNodeBitmask participatingLQH;
      ndbrequire(pauseStart == StartLcpReq::PauseLcpStartSecond);
      ndbrequire(c_pause_lcp_master_state == PAUSE_IN_LCP_COPY_META_DATA);
      c_pause_lcp_master_state = PAUSE_COMPLETE_LCP_INCLUSION;
      req->participatingLQH_v1.clear();
      /* Collect the LQH nodes whose LCP_COMPLETE_REP is still awaited. */
      for (Uint32 nodeId = 1; nodeId <= m_max_node_id; nodeId++)
      {
        if (c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId))
        {
          jamLine(nodeId);
          participatingLQH.set(nodeId);
          found = true;
          if (nodeId >= MAX_NDB_NODES_v1)
            found_high_node_id = true;
        }
      }
      /**
       * We should not be able to have all LQH sent completed, but not all
       * LCP_FRAG_REP yet received.
       */
      ndbrequire(found);

      if (ndbd_send_node_bitmask_in_section(rec_node_version))
      {
        jam();
        LinearSectionPtr lsptr[3];
        lsptr[0].p = participatingLQH.rep.data;
        lsptr[0].sz = participatingLQH.getPackedLengthInWords();
        req->participatingLQH_v1.clear();
        req->participatingDIH_v1.clear();
        sendSignal(ref, GSN_START_LCP_REQ, signal,
                   StartLcpReq::SignalLength, JBB, lsptr, 1);
      }
      else if (participatingLQH.getPackedLengthInWords() <= NdbNodeBitmask48::Size)
      {
        jam();
        req->participatingLQH_v1 = participatingLQH;
        req->participatingDIH_v1.clear();
        /* A high node id cannot be represented in the v1 bitmask. */
        ndbrequire(!found_high_node_id);
        sendSignal(ref, GSN_START_LCP_REQ, signal,
                   StartLcpReq::SignalLength, JBB);
      }
      else
      {
        ndbabort();
      }
      return;
    }
  }
  else
  {
    if (pauseStart == StartLcpReq::PauseLcpStartFirst)
    {
      jam();
      /**
       * The LCP completed while we paused, no need to prepare the starting
       * node for inclusion into the LCP protocol since we will continue
       * with the node restart immediately after completing the copy of the
       * meta data and the unpause action.
       */
      ndbrequire(c_pause_lcp_master_state == PAUSE_LCP_REQUESTED);
      c_pause_lcp_master_state = PAUSE_NOT_IN_LCP_COPY_META_DATA;
      start_copy_meta_data(signal);
    }
    else
    {
      jam();
      /**
       * The LCP completed while we paused and we have now copied the meta
       * data over. We are ready to unpause and need not include the new
       * node into the LCP protocol this time.
       */
      ndbrequire(pauseStart == StartLcpReq::PauseLcpStartSecond);
      ndbrequire(c_pause_lcp_master_state == PAUSE_NOT_IN_LCP_COPY_META_DATA);
      sendPAUSE_LCP_REQ(signal, false);
    }
    return;
  }
}
4247 
unpause_lcp(Signal * signal,Uint32 startNode,BlockReference sender_ref,PauseLcpReq::PauseAction pauseAction)4248 void Dbdih::unpause_lcp(Signal *signal,
4249                         Uint32 startNode,
4250                         BlockReference sender_ref,
4251                         PauseLcpReq::PauseAction pauseAction)
4252 {
4253   if (!is_pause_for_this_node(startNode))
4254   {
4255     jam();
4256     /* Ignore, node already died */
4257     return;
4258   }
4259   /**
4260    * When we stop pausing we will set the dequeue flag, LCP_FRAG_REPs and
4261    * LCP_COMPLETE_REPs will continue to be queued while any of those two
4262    * flags are set to ensure that we keep the order of LCP_FRAG_REP. This
4263    * order isn't absolutely necessary, but it makes it easier to debug
4264    * the system.
4265    */
4266   PauseLcpConf *conf = (PauseLcpConf*)signal->getDataPtrSend();
4267   conf->senderRef = reference();
4268   conf->startNodeId = startNode;
4269   sendSignal(cmasterdihref, GSN_PAUSE_LCP_CONF, signal,
4270              PauseLcpConf::SignalLength, JBB);
4271 
4272   if (isMaster())
4273   {
4274     jam();
4275     /**
4276      * We complete the Pause LCP protocol in master when all nodes
4277      * have returned. Too early here.
4278      */
4279     return;
4280   }
4281   end_pause(signal, pauseAction);
4282 }
4283 
end_pause(Signal * signal,PauseLcpReq::PauseAction pauseAction)4284 void Dbdih::end_pause(Signal *signal,
4285                       PauseLcpReq::PauseAction pauseAction)
4286 {
4287   if (pauseAction == PauseLcpReq::UnPauseIncludedInLcp)
4288   {
4289     jam();
4290     c_lcpState.m_participatingDIH.set(c_pause_lcp_start_node);
4291   }
4292   stop_pause(signal);
4293 }
4294 
stop_pause(Signal * signal)4295 void Dbdih::stop_pause(Signal *signal)
4296 {
4297   if (isMaster())
4298   {
4299     jam();
4300     c_pause_participants.clear();
4301     c_pause_lcp_master_state = PAUSE_LCP_IDLE;
4302   }
4303   c_pause_lcp_start_node = RNIL;
4304   ndbrequire(!c_dequeue_lcp_rep_ongoing);
4305   c_dequeue_lcp_rep_ongoing = true;
4306   ndbassert(check_pause_state_sanity());
4307   dequeue_lcp_rep(signal);
4308 
4309   checkLcpCompletedLab(signal);
4310 }
4311 
/**
 * All node failures while being in LCP pause state lead to an immediate
 * stop of the pause, based on the assumption that all node failures will
 * also automatically lead to failures of any starting nodes while we are
 * still in the starting state.
 *
 * This means we need no code to handle unpausing at node failures.
 */
handle_node_failure_in_pause(Signal * signal)4320 void Dbdih::handle_node_failure_in_pause(Signal *signal)
4321 {
4322   c_FLUSH_LCP_REP_REQ_Counter.clearWaitingFor();
4323   c_PAUSE_LCP_REQ_Counter.clearWaitingFor();
4324   stop_pause(signal);
4325   ndbassert(check_pause_state_sanity());
4326 }
4327 
4328 /**
4329  * We have stopped pausing and we are working through the queue of blocked
4330  * LCP reports. When we reach the end of it we will unset the dequeue flag
4331  * such that we need no more queue the LCP reports.
4332  *
4333  * We will dequeue one LCP report per signal and continue sending CONTINUEB
4334  * to ourselves until we're through the LCP reports that have blocked while
4335  * we paused.
4336  *
4337  * NOTE: The queue might be empty for a short while we are waiting for a
4338  * CONTINUEB to arrive. We don't check for emptiness before sending
4339  * CONTINUEB. So if one wants to add asserts on queue not empty while
4340  * flag is set, then this needs to be checked before CONTINUEB is sent.
4341  */
void Dbdih::dequeue_lcp_rep(Signal *signal)
{
  /**
   * Drain one queued LCP report (LCP_FRAG_REP first, then the single
   * LCP_COMPLETE_REP) per invocation, re-invoking ourselves via a delayed
   * CONTINUEB until the queue is empty. Invoked from stop_pause() and
   * from the CONTINUEB handler.
   */
  ReplicaRecordPtr replicaPtr;
  bool empty;
  bool lcp_frag_rep_empty = c_queued_lcp_frag_rep.isEmpty();
  bool lcp_complete_rep_empty = !c_queued_lcp_complete_rep;
  if (!c_dequeue_lcp_rep_ongoing)
  {
    jam();
    ndbassert(check_pause_state_sanity());
    /**
     * We got a new pause signal before finishing off the queue, we will
     * stop dequeuing, the pause flag is already set and should continue
     * to be so.
     */
    return;
  }
  empty = lcp_frag_rep_empty && lcp_complete_rep_empty;
  /* Perform dequeueing of one LCP report */
  if (!empty)
  {
    if (!lcp_frag_rep_empty)
    {
      jam();
      /**
       * 1) Remove from queue
       * 2) Set up signal
       * 3) Send to all LCP DIH participants
       * 4) Send CONTINUEB for handling next in queue
       *
       * We also need to send to ourselves which is a bit different from
       * the normal LCP_FRAG_REP where we handle ourselves through a fall
       * through method. Here we come from a different place and we cannot
       * use the broadcast method since the dequeue flag is still set.
       * So we send the signals from here to all nodes in the DIH set
       * (including the starting node).
       */
      LcpFragRep *lcpFragRep = (LcpFragRep*)signal->getDataPtrSend();

      c_queued_lcp_frag_rep.first(replicaPtr);
      ndbrequire(replicaPtr.p != NULL);
      c_queued_lcp_frag_rep.removeFirst(replicaPtr);

      /* Rebuild the LCP_FRAG_REP from the state saved on the replica
       * record when the report was queued. */
      lcpFragRep->nodeId = getOwnNodeId();
      lcpFragRep->lcpId = c_lcp_id_paused;
      lcpFragRep->lcpNo = replicaPtr.p->nextLcp;
      lcpFragRep->tableId = replicaPtr.p->tableId;
      lcpFragRep->fragId = replicaPtr.p->fragId;
      lcpFragRep->maxGciCompleted = replicaPtr.p->repMaxGciCompleted;
      lcpFragRep->maxGciStarted = replicaPtr.p->repMaxGciStarted;

      NodeReceiverGroup rg(DBDIH, c_lcpState.m_participatingDIH);
      sendSignal(rg, GSN_LCP_FRAG_REP, signal,
                 LcpFragRep::SignalLength, JBB);

      /**
       * Send signal as delayed signals to avoid overloading ourself
       * and other nodes with too many dequeued LCP requests in a too
       * short time. The dequeue should not be time critical.
       */
      signal->theData[0] = DihContinueB::ZDEQUEUE_LCP_REP;
      sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                          1, 1);
      return;
    }
    else
    {
      /**
       * 1) Reset c_queued_lcp_complete_rep
       * 2) Set up LCP_COMPLETE_REP signal
       * 3) Send signals to all LCP DIH participants
       * 4) Fall through to end queue removal
       */
      ndbassert(c_queued_lcp_complete_rep);
      LcpCompleteRep *lcpCompleteRep =
        (LcpCompleteRep*)signal->getDataPtrSend();

      c_queued_lcp_complete_rep = false;

      lcpCompleteRep->nodeId = getOwnNodeId();
      lcpCompleteRep->lcpId = c_lcp_id_paused;
      lcpCompleteRep->blockNo = DBLQH;

      NodeReceiverGroup rg(DBDIH, c_lcpState.m_participatingDIH);
      sendSignal(rg, GSN_LCP_COMPLETE_REP, signal,
                 LcpCompleteRep::SignalLength, JBB);
    }
  }
  jam();
  /**
   * We have completed dequeueing all queued LCP reports. This means we can
   * reset the dequeue flag and resume normal operation of LCP reporting.
   */
  c_dequeue_lcp_rep_ongoing = false;
  c_lcp_id_paused = RNIL;
  ndbassert(check_pause_state_sanity());
}
4439 
4440 /**
4441  * FLUSH_LCP_REP_CONF
4442  * ------------------
4443  * When we have received this signal from all nodes that participates in the
4444  * LCP, then we can send the PAUSE_LCP_CONF reply to the requester of the
 * pause (always requested by the master; we can only handle one pause at
 * a time). We do however send along the starting node id in the signal
4447  * to ensure that we don't have to wait with the next start in the case of
4448  * a crash in the middle of the pausing.
4449  *
4450  * We will not be able to reach this point with the same node again and
4451  * still receive a signal from the previous time the node was alive since
4452  * the node start contains a number of messages from the master to all
4453  * nodes and thus ensuring that no outstanding messages are from a previous
4454  * node instance with the same node id. The same applies to a number of
4455  * similar scenarios in the NDB code.
4456  */
execFLUSH_LCP_REP_CONF(Signal * signal)4457 void Dbdih::execFLUSH_LCP_REP_CONF(Signal *signal)
4458 {
4459   FlushLcpRepConf *conf = (FlushLcpRepConf*)&signal->theData[0];
4460   jamEntry();
4461 
4462   Uint32 nodeId = refToNode(conf->senderRef);
4463   Uint32 startNode = conf->startNodeId;
4464 
4465   if (!is_pause_for_this_node(startNode))
4466   {
4467     /* Ignore, node died in the process */
4468     jam();
4469     return;
4470   }
4471 
4472   receiveLoopMacro(FLUSH_LCP_REP_REQ, nodeId);
4473   {
4474     jam();
4475    /* Normal path, master is still alive */
4476     PauseLcpConf *conf = (PauseLcpConf*)signal->getDataPtrSend();
4477     conf->senderRef = reference();
4478     conf->startNodeId = startNode;
4479     sendSignal(cmasterdihref, GSN_PAUSE_LCP_CONF, signal,
4480                PauseLcpConf::SignalLength, JBB);
4481   }
4482   ndbassert(check_pause_state_sanity());
4483 }
4484 
4485 /**
4486  * FLUSH_LCP_REP_REQ
4487  * -----------------
4488  * The only purpose of this signal is to ensure that we don't have any
4489  * outstanding LCP_FRAG_REP signals or other LCP signals. These signals
4490  * are sent from the node producing them to all other nodes. This means that
4491  * potentially they could be stuck for a long time in various send buffers
4492  * in the system. So a simple manner to ensure all of those signals have
4493  * reached their destination is to send FLUSH_LCP_REP_REQ from each node to
4494  * all other nodes. This gives a safe condition that we don't have any
4495  * outstanding LCP_FRAG_REP signals in the cluster. So there is no logic to
4496  * execute when receiving this signal other than to send it back to the sender.
4497  *
4498  * It is quite ok to receive this signal in a node before the PAUSE_LCP_REQ
4499  * has arrived here. This signal doesn't cause any interaction with the
4500  * pause handling in this node, actually it doesn't do anything. It's only
4501  * purpose is to ensure that the signal links are flushed such that we know
4502  * that we don't have any outstanding LCP_FRAG_REPs and LCP_COMPLETE_REPs.
4503  */
execFLUSH_LCP_REP_REQ(Signal * signal)4504 void Dbdih::execFLUSH_LCP_REP_REQ(Signal *signal)
4505 {
4506   FlushLcpRepReq *req = (FlushLcpRepReq*)&signal->theData[0];
4507   FlushLcpRepConf *conf = (FlushLcpRepConf*)signal->getDataPtrSend();
4508   jamEntry();
4509   ndbassert(check_pause_state_sanity());
4510 
4511   BlockReference sender_ref = req->senderRef;
4512   Uint32 startNode = req->startNodeId;
4513   conf->senderRef = reference();
4514   conf->startNodeId = startNode;
4515   sendSignal(sender_ref, GSN_FLUSH_LCP_REP_CONF, signal,
4516              FlushLcpRepConf::SignalLength, JBB);
4517 }
4518 /*---------------------------------------------------------------------------*/
4519 /* END Pausing LCP Module */
4520 /*---------------------------------------------------------------------------*/
4521 
4522 
4523 /*---------------------------------------------------------------------------*/
4524 /*                    NODE RESTART CONTINUE REQUEST                          */
4525 /*---------------------------------------------------------------------------*/
4526 // THIS SIGNAL AND THE CODE BELOW IS EXECUTED BY THE MASTER WHEN IT HAS BEEN
4527 // REQUESTED TO START UP A NEW NODE. The master instructs the starting node
4528 // how to set up its log for continued execution.
4529 /*---------------------------------------------------------------------------*/
void Dbdih::execSTART_MEREQ(Signal* signal)
{
  /**
   * START_MEREQ: executed only in the master when the starting node asks
   * to be started. The master begins by copying the sysfile (distribution
   * info) to the starting node via COPY_GCIREQ.
   */
  StartMeReq * req = (StartMeReq*)&signal->theData[0];
  jamEntry();
  const BlockReference Tblockref = req->startingRef;
  const Uint32 Tnodeid = refToNode(Tblockref);

  /* Only one node restart is handled at a time; the sender must be the
   * node currently registered as starting. */
  ndbrequire(isMaster());
  ndbrequire(c_nodeStartMaster.startNode == Tnodeid);
  ndbrequire(getNodeStatus(Tnodeid) == NodeRecord::STARTING);

  DEB_MULTI_TRP(("START_MEREQ"));
  {
    jam();
    /**
     * COPY sysfile to starting node here directly
     *   so that it gets nodegroups early on
     */

    /**
     * Note: only one node can be starting now, so we can use
     *       c_nodeStartMaster.startNode for determining where to send
     */
    c_nodeStartMaster.m_outstandingGsn = GSN_COPY_GCIREQ;
    copyGciLab(signal, CopyGCIReq::RESTART_NR);
  }
}
4557 
4558 /**
4559  * We have come to a point in the node restart where we need to copy
4560  * the meta data to the starting node.
4561  *
4562  * In older versions we did this by acquiring a mutex that is held by
4563  * the following actions:
4564  * 1) Execution of LCP. The mutex is held for the entire time we are
4565  *   executing an LCP. This could be all the way up to hours.
4566  *
4567  * 2) Take over a fragment. This action happens in the phase where we
4568  *   are synchronizing the starting node with the alive nodes. In order
4569  *   to do so we need to lock the meta data in DBDIH to ensure that we
4570  *   can change it by adding one more alive replica.
4571  *
4572  * The new version still requires that no one is updating the meta data
4573  * while we are copying it. So this means that we still need to grab this
4574  * mutex to copy the meta data. But to synchronize our copying towards
4575  * the execution of LCPs we will use a pausing mechanism instead of
4576  * the mutex. This means that we can avoid the long wait for an LCP to
4577  * complete before we can copy the meta data.
4578  *
4579  * The take over of a fragment only updates the set of active replicas,
4580  * this will not be a problem to do in parallel with updating it with
4581  * regard to LCPs. So these need not be protected against each other.
4582  *
4583  * There are 3 processes that need protection for each other.
4584  * 1) The start of an LCP.
4585  * 2) The copying of meta data
4586  * 3) The synchronization of a node for a fragment
4587  *
4588  * 1) and 2) cannot run concurrently since we want to ensure that the
4589  * start of an LCP has a clear point in connection to the meta data
4590  * status.
4591  * 1) and 3) can run concurrently without any problems.
4592  *
4593  * 2) and 3) cannot run concurrently, but it would be possible to
4594  * have more fine-grained mutexes. The reason is that 3) changes
4595  * a replica from being an old stored replica to being a stored
4596  * replica. This change is part of the copying of meta data.
4597  *
4598  * 3) and 3) for different fragments could run concurrently, but this
4599  * would require changes of the protocol to synchronize the nodes to
4600  * to ensure that the master can handle several parallel changes of
4601  * replica status.
4602  *
4603  * 2) and 2) can run concurrently to some extent, but this would
4604  * require changes to the pause lcp protocol.
4605  *
4606  * The current implementation makes it possible to only run 1 out of
4607  * 1), 2) and 3) at a time.
4608  *
4609  * Another improvement possible is to speed up the copy meta data by
4610  * allowing the master to send more than one table at a time. This
4611  * would remove the wait state where we wait for the starting node
4612  * to receive a table and synchronize it to disk.
4613  *
4614  * One could also consider doing less synch's to disk if somehow the
4615  * different tables could be synched at the same time. This might
4616  * require changing the table layout on disk for DIH and DICT tables.
4617  */
void
Dbdih::startme_copygci_conf(Signal* signal)
{
  jam();

  /**
   * We update the node recovery status to indicate we are now waiting to
   * complete a local checkpoint such that we can keep track of node restart
   * status to control the start of local checkpoints in a proper manner.
   * This code is only executed in master nodes.
   */
  if (c_nodeStartMaster.startNode != RNIL)
  {
    setNodeRecoveryStatus(c_nodeStartMaster.startNode,
                          NodeRecord::WAIT_LCP_TO_COPY_DICT);

    /* Request the fragment info mutex; lcpBlockedLab is the callback
     * invoked when the lock attempt completes or is queued. */
    Callback c = { safe_cast(&Dbdih::lcpBlockedLab),
                   c_nodeStartMaster.startNode };
    Mutex mutex(signal, c_mutexMgr, c_nodeStartMaster.m_fragmentInfoMutex);
    mutex.lock(c, true, true);
  }
}
4640 
void Dbdih::lcpBlockedLab(Signal* signal, Uint32 nodeId, Uint32 retVal)
{
  /**
   * Callback from the fragment info mutex lock request made in
   * startme_copygci_conf. retVal is either 0 (lock granted),
   * UtilLockRef::InLockQueue (queued), or an error code.
   */
  jamEntry();
  if (c_nodeStartMaster.startNode != nodeId)
  {
    jam();
    /* The node we locked on behalf of is no longer the starting node;
     * release the mutex if we hold it or are queued for it. */
    if (retVal == 0 || retVal == UtilLockRef::InLockQueue)
    {
      infoEvent("Releasing table/fragment info lock for node %u", nodeId);

      Mutex mutex(signal, c_mutexMgr, c_nodeStartMaster.m_fragmentInfoMutex);
      mutex.unlock();
      return;
    }
    return;
  }

  if (retVal == UtilLockRef::InLockQueue)
  {
    jam();
    /* Still waiting for the lock; the callback fires again when granted. */
    infoEvent("Node %u enqueued is waiting to copy table/fragment info",
              c_nodeStartMaster.startNode);
    return;
  }

  ndbrequire(retVal == 0); // Mutex error
  ndbrequire(getNodeStatus(c_nodeStartMaster.startNode)==NodeRecord::STARTING);

  if (c_lcp_runs_with_pause_support)
  {
    {
      /**
       * All nodes running the LCP supports the PAUSE LCP protocol. Also the
       * new node support it.
       * This means we don't have to wait for the LCP to complete, we can
       * pause the LCP while we are copying the meta data.
       */
      jam();
      sendPAUSE_LCP_REQ(signal, true);
      return;
    }
  }
  /**
   * Either we don't support the PAUSE protocol or some other node doesn't. We
   * can also arrive here simply because no LCP is ongoing. In this case we
   * can be sure that no LCP is ongoing in both cases. So we ensure that no
   * LCP starts up until we have completed the copying of meta data by keeping
   * the Fragment Info mutex until we have completed the copying of meta data.
   */
  start_copy_meta_data(signal);
}//Dbdih::lcpBlockedLab()
4692 
nodeDictStartConfLab(Signal * signal,Uint32 nodeId)4693 void Dbdih::nodeDictStartConfLab(Signal* signal, Uint32 nodeId)
4694 {
4695   /*-----------------------------------------------------------------*/
4696   // Report that node restart has completed copy of dictionary.
4697   /*-----------------------------------------------------------------*/
4698   signal->theData[0] = NDB_LE_NR_CopyDict;
4699   signal->theData[1] = nodeId;
4700   sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
4701 
4702   /*-------------------------------------------------------------------------
4703    * NOW WE HAVE COPIED BOTH DIH AND DICT INFORMATION. WE ARE NOW READY TO
4704    * INTEGRATE THE NODE INTO THE LCP AND GCP PROTOCOLS AND TO ALLOW UPDATES OF
4705    * THE DICTIONARY AGAIN.
4706    *
4707    * We can release the PAUSE on LCP now since we are ready to update the
4708    * meta data again.
4709    *
4710    * We update the node recovery status with this information to be able to
4711    * track node restart status. This code only executes in the master node.
4712    */
4713   /*-------------------------------------------------------------------------*/
4714   setNodeRecoveryStatus(c_nodeStartMaster.startNode,
4715                         NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP);
4716 
4717   c_nodeStartMaster.wait = ZFALSE;
4718   c_nodeStartMaster.blockGcp = 1;
4719 
4720   return;
4721 }//Dbdih::nodeDictStartConfLab()
4722 
dihCopyCompletedLab(Signal * signal)4723 void Dbdih::dihCopyCompletedLab(Signal* signal)
4724 {
4725   signal->theData[0] = NDB_LE_NR_CopyDistr;
4726   signal->theData[1] = c_nodeStartMaster.startNode;
4727   sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
4728 
4729   BlockReference ref = calcDictBlockRef(c_nodeStartMaster.startNode);
4730   DictStartReq * req = (DictStartReq*)&signal->theData[0];
4731   req->restartGci = (Uint32)(m_micro_gcp.m_new_gci >> 32);
4732   req->senderRef = reference();
4733   sendSignal(ref, GSN_DICTSTARTREQ,
4734              signal, DictStartReq::SignalLength, JBB);
4735   c_nodeStartMaster.m_outstandingGsn = GSN_DICTSTARTREQ;
4736   c_nodeStartMaster.wait = 0;
4737 }//Dbdih::dihCopyCompletedLab()
4738 
void Dbdih::gcpBlockedLab(Signal* signal)
{
  /**
   * The node DIH will be part of LCP
   */
  NodeRecordPtr nodePtr;
  nodePtr.i = c_nodeStartMaster.startNode;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
  nodePtr.p->m_inclDihLcp = true;

  /**
   * If node is new...this is the place to do things,
   *   gcp+lcp is blocked
   */
  if (getNodeActiveStatus(nodePtr.i) == Sysfile::NS_NotDefined)
  {
    jam();
    /* First start of a newly configured node: record it in the sysfile
     * as configured with no node group assigned yet. */
    infoEvent("Adding node %d to sysfile, NS_Configured",
              nodePtr.i);
    setNodeActiveStatus(nodePtr.i, Sysfile::NS_Configured);
    Sysfile::setNodeGroup(nodePtr.i, SYSFILE->nodeGroups,
                          NO_NODE_GROUP_ID);
    Sysfile::setNodeStatus(nodePtr.i,
                           SYSFILE->nodeStatus, Sysfile::NS_Configured);
  }

  /*-------------------------------------------------------------------------*/
  // NOW IT IS TIME TO INFORM ALL OTHER NODES IN THE CLUSTER OF THE STARTED
  // NODE SUCH THAT THEY ALSO INCLUDE THE NODE IN THE NODE LISTS AND SO FORTH.
  /*------------------------------------------------------------------------*/
  sendLoopMacro(INCL_NODEREQ, sendINCL_NODEREQ, RNIL);
  /*-------------------------------------------------------------------------*/
  // We also need to send to the starting node to ensure he is aware of the
  // global checkpoint id and the correct state. We do not wait for any reply
  // since the starting node will not send any.
  /*-------------------------------------------------------------------------*/
  c_INCL_NODEREQ_Counter.setWaitingFor(c_nodeStartMaster.startNode);
  sendINCL_NODEREQ(signal, c_nodeStartMaster.startNode, RNIL);
}//Dbdih::gcpBlockedLab()
4778 
4779 /*---------------------------------------------------------------------------*/
4780 // THIS SIGNAL IS EXECUTED IN BOTH SLAVES AND IN THE MASTER
4781 /*---------------------------------------------------------------------------*/
void Dbdih::execINCL_NODECONF(Signal* signal)
{
  /**
   * Executed in both slaves and the master. In a slave, INCL_NODECONF
   * comes from a local block (second word is a block reference) and we
   * forward INCL_NODEREQ down the block list, replying to the master when
   * the list is exhausted. In the master, INCL_NODECONF comes from another
   * DIH (second word is a node id) and we count replies until all nodes
   * have included the starting node.
   */
  jamEntry();
  Uint32 TstartNode = signal->theData[0];
  Uint32 TsendNodeId_or_blockref = signal->theData[1];

  /* Ordered chain of local blocks the node-inclusion walks through;
   * 0-terminated. */
  Uint32 blocklist[7];
  blocklist[0] = clocallqhblockref;
  blocklist[1] = clocaltcblockref;
  blocklist[2] = cdictblockref;
  blocklist[3] = numberToRef(BACKUP, getOwnNodeId());
  blocklist[4] = numberToRef(SUMA, getOwnNodeId());
  blocklist[5] = numberToRef(DBSPJ, getOwnNodeId());
  blocklist[6] = 0;

  for (Uint32 i = 0; blocklist[i] != 0; i++)
  {
    if (TsendNodeId_or_blockref == blocklist[i])
    {
      jam();

      /* Slave path: the CONF came from one of our local blocks. */
      if (TstartNode != c_nodeStartSlave.nodeId)
      {
        jam();
        warningEvent("Received INCL_NODECONF for %u from %s"
                     " while %u is starting",
                     TstartNode,
                     getBlockName(refToBlock(TsendNodeId_or_blockref)),
                     c_nodeStartSlave.nodeId);
        return;
      }

      if (getNodeStatus(c_nodeStartSlave.nodeId) == NodeRecord::ALIVE &&
          blocklist[i+1] != 0)
      {
        /**
         * Send to next in block list
         */
        jam();
        signal->theData[0] = reference();
        signal->theData[1] = c_nodeStartSlave.nodeId;
        sendSignal(blocklist[i+1], GSN_INCL_NODEREQ, signal, 2, JBB);
        return;
      }
      else
      {
        /**
         * All done, reply to master if node is still up.
         */
        jam();
        if (getNodeStatus(c_nodeStartSlave.nodeId) == NodeRecord::ALIVE)
        {
          jam();
          if (!isMaster())
          {
            jam();
            setNodeRecoveryStatus(c_nodeStartSlave.nodeId,
                                  NodeRecord::NODE_GETTING_INCLUDED);
          }
          signal->theData[0] = c_nodeStartSlave.nodeId;
          signal->theData[1] = cownNodeId;
          sendSignal(cmasterdihref, GSN_INCL_NODECONF, signal, 2, JBB);
        }
        c_nodeStartSlave.nodeId = 0;
        return;
      }
    }
  }

  /* Master path: the CONF came from another node's DIH. */
  if (c_nodeStartMaster.startNode != TstartNode)
  {
    jam();
    warningEvent("Received INCL_NODECONF for %u from %u"
                 " while %u is starting",
                 TstartNode,
                 TsendNodeId_or_blockref,
                 c_nodeStartMaster.startNode);
    return;
  }

  ndbrequire(reference() == cmasterdihref);
  /* Returns from this function unless all waited-for nodes have replied. */
  receiveLoopMacro(INCL_NODEREQ, TsendNodeId_or_blockref);

  CRASH_INSERTION(7128);
  /*-------------------------------------------------------------------------*/
  // Now that we have included the starting node in the node lists in the
  // various blocks we are ready to start the global checkpoint protocol
  /*------------------------------------------------------------------------*/
  c_nodeStartMaster.wait = 11;
  c_nodeStartMaster.blockGcp = 0;

  /**
   * Restart GCP
   */
  signal->theData[0] = reference();
  sendSignal(reference(), GSN_UNBLO_DICTCONF, signal, 1, JBB);

  signal->theData[0] = DihContinueB::ZSTART_GCP;
  sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);

  /* Copying of meta data is done; release the fragment info mutex taken
   * in startme_copygci_conf. */
  Mutex mutex(signal, c_mutexMgr, c_nodeStartMaster.m_fragmentInfoMutex);
  mutex.unlock();
}//Dbdih::execINCL_NODECONF()
4885 
void Dbdih::execUNBLO_DICTCONF(Signal* signal)
{
  jamEntry();
  c_nodeStartMaster.wait = ZFALSE;
  /* If the node start was aborted meanwhile there is nothing to do. */
  if (!c_nodeStartMaster.activeState) {
    jam();
    return;
  }//if

  CRASH_INSERTION(7129);
  /**-----------------------------------------------------------------------
   * WE HAVE NOW PREPARED IT FOR INCLUSION IN THE LCP PROTOCOL.
   * WE CAN NOW START THE LCP PROTOCOL AGAIN.
   * WE HAVE ALSO MADE THIS FOR THE GCP PROTOCOL.
   * WE ARE READY TO START THE PROTOCOLS AND RESPOND TO THE START REQUEST
   * FROM THE STARTING NODE.
   *------------------------------------------------------------------------*/

  StartMeConf * const startMe = (StartMeConf *)&signal->theData[0];


  Uint32 nodeId = startMe->startingNodeId = c_nodeStartMaster.startNode;
  startMe->startWord = 0;

  /* Pick sysfile packing format based on the starting node's version:
   * newer nodes receive the bitmask in a signal section (v2). */
  const Uint32 ref = calcDihBlockRef(c_nodeStartMaster.startNode);
  Uint32 node_version = getNodeInfo(c_nodeStartMaster.startNode).m_version;
  if (ndbd_send_node_bitmask_in_section(node_version))
  {
    jam();
    pack_sysfile_format_v2();
    send_START_MECONF_data_v2(signal, ref);
  }
  else
  {
    jam();
    pack_sysfile_format_v1();
    send_START_MECONF_data_v1(signal, ref);
  }
  nodeResetStart(signal);

  /**
   * At this point the master knows that the starting node will start executing
   * the Database Recovery. This can take a fair amount of time. At the end of
   * the recovery the starting node need to be part of a LCP. In order to
   * synchronize for several nodes restarting at the same time we need to keep
   * track of start times.
   *
   * We expect that in most parallel node restarts the nodes are restarted
   * immediately after a crash or as part of a rolling restart. In this case
   * the node restart times will be very similar. So we should be able to
   * roughly estimate when the node restart will reach the point where it
   * is ready to wait for an LCP.
   *
   * When the first node reaches this point and also later nodes reach this
   * phase, then they will be able to estimate whether it is worth it to
   * hold the LCP until the next node arrives to this phase.
   *
   * The similitude of a flight or a train waiting for passengers arriving
   * on other flights or trains can be used here. It is useful to wait for
   * some time since there is a high cost for passengers to miss the train.
   * At the same time it isn't worthwhile to hold it for a very long time
   * since then all other passengers will suffer greatly. In this case the
   * other nodes waiting will suffer, but also we will risk running out of
   * REDO log space if we wait for too long time.
   *
   * Given that we don't wait for more than a short time to synchronize
   * means that the case of heterogenous nodes will also work ok in this
   * context although we will optimize for the homogenous case.
   *
   * To get even better estimates of where we are and to give users even
   * better understanding of what takes time in node restarts we have also
   * added that the LDMs report when they have completed the 3 local phases
   * of local recovery. These are completion of restore fragments,
   * completion of UNDO Disk data, completion of execution of REDO log and
   * the final phase executed in LDMs are the ordered index rebuilds which is
   * completed when the local recovery is completed.
   */
  setNodeRecoveryStatus(nodeId, NodeRecord::LOCAL_RECOVERY_STARTED);

  /**
   * Allow next node to start...
   */
  StartPermRep *rep = (StartPermRep*)signal->getDataPtrSend();
  rep->startNodeId = nodeId;
  rep->reason = StartPermRep::PermissionToStart;
  sendSignal(NDBCNTR_REF, GSN_START_PERMREP, signal,
             StartPermRep::SignalLength, JBB);
}//Dbdih::execUNBLO_DICTCONF()
4974 
4975 /*---------------------------------------------------------------------------*/
4976 /*                    NODE RESTART COPY REQUEST                              */
4977 /*---------------------------------------------------------------------------*/
4978 // A NODE RESTART HAS REACHED ITS FINAL PHASE WHEN THE DATA IS TO BE COPIED
4979 // TO THE NODE. START_COPYREQ IS EXECUTED BY THE STARTING NODE.
4980 /*---------------------------------------------------------------------------*/
void Dbdih::execSTART_COPYREQ(Signal* signal)
{
  /**
   * Final phase of a node restart: data is to be copied to the starting
   * node. Starts the take-over process based on the node's recorded
   * active status in the sysfile.
   */
  jamEntry();
  /* Copy the request by value; signal->theData is reused below. */
  StartCopyReq req = *(StartCopyReq*)signal->getDataPtr();

  Uint32 startNodeId = req.startingNodeId;

  /*-------------------------------------------------------------------------*/
  /*
   * REPORT Copy process of node restart is now about to start up.
   *
   * We will report this both in an internal state that can be used to
   * report progress in NDBINFO tables as well as being used to keep track of
   * node restart status to make correct decisions on when to start LCPs.
   * We also report it to cluster log and internal node log.
   *
   * This code is only executed in master node.
   */
  /*-------------------------------------------------------------------------*/
  signal->theData[0] = NDB_LE_NR_CopyFragsStarted;
  signal->theData[1] = req.startingNodeId;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);

  g_eventLogger->info("Restore Database Off-line Starting");
  infoEvent("Restore Database Off-line Starting on node %u",
            startNodeId);

  CRASH_INSERTION(7131);

  switch (getNodeActiveStatus(startNodeId)) {
  case Sysfile::NS_Active:
  case Sysfile::NS_ActiveMissed_1:
  case Sysfile::NS_ActiveMissed_2:
  case Sysfile::NS_NotActive_NotTakenOver:
  case Sysfile::NS_Configured:
    jam();
    /*-----------------------------------------------------------------------*/
    // AN ACTIVE NODE HAS BEEN STARTED. THE ACTIVE NODE MUST THEN GET ALL DATA
    // IT HAD BEFORE ITS CRASH. WE START THE TAKE OVER IMMEDIATELY.
    // SINCE WE ARE AN ACTIVE NODE WE WILL TAKE OVER OUR OWN NODE THAT
    // PREVIOUSLY CRASHED.
    /*-----------------------------------------------------------------------*/
    startTakeOver(signal, startNodeId, startNodeId, &req);
    break;
  case Sysfile::NS_TakeOver:{
    jam();
    /*--------------------------------------------------------------------
     * We were in the process of taking over but it was not completed.
     * We will complete it now instead.
     *--------------------------------------------------------------------*/
    Uint32 takeOverNode = Sysfile::getTakeOverNode(startNodeId,
                                                   SYSFILE->takeOver);
    if(takeOverNode == 0){
      jam();
      /* Inconsistent sysfile take-over record; fall back to taking over
       * the starting node itself. */
      warningEvent("Bug in take-over code restarting");
      takeOverNode = startNodeId;
    }

    startTakeOver(signal, startNodeId, takeOverNode, &req);
    break;
  }
  default:
    ndbabort();
  }//switch
}//Dbdih::execSTART_COPYREQ()
5046 
5047 /*---------------------------------------------------------------------------*/
5048 /*                    SLAVE LOGIC FOR NODE RESTART                           */
5049 /*---------------------------------------------------------------------------*/
/**
 * START_INFOREQ is sent from the master DIH when a node has asked for
 * permission to (re)start. Each receiver verifies that it can accept the
 * starting node right now and replies to the master with START_INFOCONF
 * or START_INFOREF.
 */
void Dbdih::execSTART_INFOREQ(Signal* signal)
{
  jamEntry();
  StartInfoReq *const req =(StartInfoReq*)&signal->theData[0];
  Uint32 startNode = req->startingNodeId;
  if (cfailurenr != req->systemFailureNo) {
    jam();
    //---------------------------------------------------------------
    // A failure occurred since master sent this request. We will ignore
    // this request since the node is already dead that is starting.
    //---------------------------------------------------------------
    return;
  }//if
  CRASH_INSERTION(7123);
  if (isMaster()) {
    jam();
    // On the master the starting node must already be marked as STARTING.
    ndbrequire(getNodeStatus(startNode) == NodeRecord::STARTING);
  } else {
    jam();
    if (getNodeStatus(startNode) == NodeRecord::STARTING)
    {
      /**
       * The master is sending out a new START_INFOREQ, obviously some
       * other node wasn't ready to start it yet, we are still ready.
       * We will report this fact without any additional state changes.
       */
      jam();
      NodeRecordPtr nodePtr;
      nodePtr.i = startNode;
      ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::NODE_GETTING_PERMIT);
      ndbrequire(getAllowNodeStart(startNode));

      StartInfoConf * c = (StartInfoConf*)&signal->theData[0];
      c->sendingNodeId = cownNodeId;
      c->startingNodeId = startNode;
      sendSignal(cmasterdihref, GSN_START_INFOCONF, signal,
                 StartInfoConf::SignalLength, JBB);
      return;
    }
    else
    {
      jam();
      // First START_INFOREQ for this node: it must be seen as DEAD here.
      ndbrequire(getNodeStatus(startNode) == NodeRecord::DEAD);
    }
  }//if
  /**
   * Refuse the start if this node does not allow the start right now, if
   * an INCL_NODEREQ protocol round is still in progress, or if error
   * insert 7124 forces a refusal for test purposes.
   */
  if ((!getAllowNodeStart(startNode)) ||
      (c_nodeStartSlave.nodeId != 0) ||
      (ERROR_INSERTED(7124))) {
    jam();
    if (!getAllowNodeStart(startNode))
    {
      jam();
      g_eventLogger->info("Not allowed to start now for node %u", startNode);
    }
    else if (c_nodeStartSlave.nodeId != 0)
    {
      jam();
      g_eventLogger->info("INCL_NODEREQ protocol still ongoing node = %u"
                          " c_nodeStartSlave.nodeId = %u",
                          startNode,
                          c_nodeStartSlave.nodeId);
    }
    else
    {
      jam();
      g_eventLogger->info("ERROR INSERT 7124");
    }
    StartInfoRef *const ref =(StartInfoRef*)&signal->theData[0];
    ref->startingNodeId = startNode;
    ref->sendingNodeId = cownNodeId;
    ref->errorCode = StartPermRef::ZNODE_START_DISALLOWED_ERROR;
    sendSignal(cmasterdihref, GSN_START_INFOREF, signal,
               StartInfoRef::SignalLength, JBB);
    return;
  }//if
  setNodeStatus(startNode, NodeRecord::STARTING);
  if (req->typeStart == NodeState::ST_INITIAL_NODE_RESTART) {
    jam();
    /**
     * Initial node restart: the node's old LCP information must be
     * invalidated first. No START_INFOCONF is sent from this path --
     * presumably the reply is produced once invalidateNodeLCP() has
     * completed (verify against the LCP invalidation code).
     */
    g_eventLogger->info("Started invalidation of node %u", startNode);
    setAllowNodeStart(startNode, false);
    invalidateNodeLCP(signal, startNode, 0);
  } else {
    jam();
    if (!isMaster())
    {
      jam();
      setNodeRecoveryStatus(startNode, NodeRecord::NODE_GETTING_PERMIT);
    }
    StartInfoConf * c = (StartInfoConf*)&signal->theData[0];
    c->sendingNodeId = cownNodeId;
    c->startingNodeId = startNode;
    sendSignal(cmasterdihref, GSN_START_INFOCONF, signal,
               StartInfoConf::SignalLength, JBB);
    return;
  }//if
}//Dbdih::execSTART_INFOREQ()
5148 
/**
 * INCL_NODEREQ: include a starting node among the alive nodes.
 * theData layout: [0] = sender reference, [1] = starting node id,
 * [2] = failure number when the start began, [4]/[5] = current GCI as
 * hi/lo 32-bit words (word [5] is absent in short signals from older
 * senders).
 */
void Dbdih::execINCL_NODEREQ(Signal* signal)
{
  jamEntry();
  Uint32 retRef = signal->theData[0];
  Uint32 nodeId = signal->theData[1];
  if (nodeId == getOwnNodeId() && ERROR_INSERTED(7165))
  {
    // Error insert 7165: delay our own inclusion by 5 seconds.
    CLEAR_ERROR_INSERT_VALUE;
    sendSignalWithDelay(reference(), GSN_INCL_NODEREQ, signal, 5000,
                        signal->getLength());
    return;
  }

  Uint32 tnodeStartFailNr = signal->theData[2];
  Uint32 gci_hi = signal->theData[4];
  Uint32 gci_lo = signal->theData[5];
  if (unlikely(signal->getLength() < 6))
  {
    jam();
    // Short signal (older sender): no low GCI word available.
    gci_lo = 0;
  }

  Uint64 gci = gci_lo | (Uint64(gci_hi) << 32);
  CRASH_INSERTION(7127);
  // Adopt the GCI supplied by the sender as our current GCI.
  m_micro_gcp.m_current_gci = gci;
  m_micro_gcp.m_old_gci = gci - 1;

  /*-------------------------------------------------------------------------*/
  // When a node is restarted we must ensure that a lcp will be run
  // as soon as possible and then reset the delay according to the original
  // configuration.
  // Without an initial local checkpoint the new node will not be available.
  /*-------------------------------------------------------------------------*/
  if (getOwnNodeId() == nodeId) {
    jam();
    /*-----------------------------------------------------------------------*/
    // We are the starting node. We came here only to set the global checkpoint
    // id's and the lcp status.
    /*-----------------------------------------------------------------------*/
    CRASH_INSERTION(7171);
    signal->theData[0] = getOwnNodeId();
    signal->theData[1] = getOwnNodeId();
    sendSignal(cmasterdihref, GSN_INCL_NODECONF, signal, 2, JBB);
    return;
  }//if
  if (getNodeStatus(nodeId) != NodeRecord::STARTING) {
    jam();
    // The node is no longer starting; silently ignore the request.
    return;
  }//if
  ndbrequire(cfailurenr == tnodeStartFailNr);
  // Only one INCL_NODEREQ protocol round may be active at a time.
  ndbrequire (c_nodeStartSlave.nodeId == 0);
  c_nodeStartSlave.nodeId = nodeId;

  ndbrequire (retRef == cmasterdihref);

  NodeRecordPtr nodePtr;
  nodePtr.i = nodeId;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);

  // Re-initialise the node record while preserving its node group and
  // active status, then move it from the dead list to the alive list.
  Sysfile::ActiveStatus TsaveState = nodePtr.p->activeStatus;
  Uint32 TnodeGroup = nodePtr.p->nodeGroup;

  initNodeRecord(nodePtr);
  nodePtr.p->nodeGroup = TnodeGroup;
  nodePtr.p->activeStatus = TsaveState;
  nodePtr.p->nodeStatus = NodeRecord::ALIVE;
  nodePtr.p->m_inclDihLcp = true;
  make_node_usable(nodePtr.p);
  removeDeadNode(nodePtr);
  insertAlive(nodePtr);
  con_lineNodes++;
  if (cmasterNodeId == getOwnNodeId() &&
      con_lineNodes >= 16)
  {
    // NOTE(review): at 16+ nodes on-line the master switches send mode via
    // setNoSend() -- exact semantics are defined elsewhere; confirm there.
    log_setNoSend();
    setNoSend();
  }

  /*-------------------------------------------------------------------------*/
  //      WE WILL ALSO SEND THE INCLUDE NODE REQUEST TO THE LOCAL LQH BLOCK.
  /*-------------------------------------------------------------------------*/
  signal->theData[0] = reference();
  signal->theData[1] = nodeId;
  signal->theData[2] = Uint32(m_micro_gcp.m_current_gci >> 32);
  sendSignal(clocallqhblockref, GSN_INCL_NODEREQ, signal, 3, JBB);
}//Dbdih::execINCL_NODEREQ()
5235 
5236 /* ------------------------------------------------------------------------- */
5237 // execINCL_NODECONF() is found in the master logic part since it is used by
5238 // both the master and the slaves.
5239 /* ------------------------------------------------------------------------- */
5240 
5241 /******************************************************************************
5242  *
5243  * Node takeover functionality
5244  * MASTER part
5245  *****************************************************************************/
execSTART_TOREQ(Signal * signal)5246 void Dbdih::execSTART_TOREQ(Signal* signal)
5247 {
5248   jamEntry();
5249   StartToReq req = *(StartToReq *)&signal->theData[0];
5250 
5251 
5252   {
5253     jam();
5254     TakeOverRecordPtr takeOverPtr;
5255 
5256     c_takeOverPool.seize(takeOverPtr);
5257     c_masterActiveTakeOverList.addFirst(takeOverPtr);
5258     takeOverPtr.p->toStartingNode = req.startingNodeId;
5259     takeOverPtr.p->m_senderRef = req.senderRef;
5260     takeOverPtr.p->m_senderData = req.senderData;
5261     takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MASTER_IDLE;
5262     takeOverPtr.p->toStartTime = c_current_time;
5263   }
5264 
5265   setNodeRecoveryStatus(req.startingNodeId,
5266                         NodeRecord::COPY_FRAGMENTS_STARTED);
5267 
5268   StartToConf * conf = (StartToConf *)&signal->theData[0];
5269   conf->senderData = req.senderData;
5270   conf->sendingNodeId = cownNodeId;
5271   conf->startingNodeId = req.startingNodeId;
5272   sendSignal(req.senderRef, GSN_START_TOCONF,
5273              signal, StartToConf::SignalLength, JBB);
5274 }//Dbdih::execSTART_TOREQ()
5275 
/**
 * UPDATE_TOREQ: master-side state transitions during fragment copy in a
 * take-over. Depending on requestType this acquires/releases the
 * fragment-info mutex (BEFORE_STORED / AFTER_STORED) and the
 * switch-primary mutex (BEFORE_COMMIT_STORED / AFTER_COMMIT_STORED).
 * For the BEFORE_* types the CONF is sent from the mutex callback;
 * for the AFTER_* types it is sent directly below.
 */
void Dbdih::execUPDATE_TOREQ(Signal* signal)
{
  jamEntry();
  UpdateToReq req = *(UpdateToReq *)&signal->theData[0];

  Uint32 errCode;
  Uint32 extra;
  g_eventLogger->debug("Received UPDATE_TOREQ for startnode: %u, copynode:%u",
                       req.startingNodeId, req.copyNodeId);
  {
    jam();
    /**
     * Look up the take-over record for the starting node; the master must
     * already know about this take-over (registered in execSTART_TOREQ).
     */
    TakeOverRecordPtr takeOverPtr;
    if (findTakeOver(takeOverPtr, req.startingNodeId) == false)
    {
      g_eventLogger->info("Unknown takeOver node: %u", req.startingNodeId);
      errCode = UpdateToRef::UnknownTakeOver;
      extra = RNIL;
      goto ref;
    }

    CRASH_INSERTION(7141);

    takeOverPtr.p->toCopyNode = req.copyNodeId;
    takeOverPtr.p->toCurrentTabref = req.tableId;
    takeOverPtr.p->toCurrentFragid = req.fragmentNo;

    // Locate the node group of the copy node; active take-overs are
    // tracked per node group.
    NodeRecordPtr nodePtr;
    NodeGroupRecordPtr NGPtr;
    nodePtr.i = req.copyNodeId;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    NGPtr.i = nodePtr.p->nodeGroup;
    ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);

    Mutex mutex(signal, c_mutexMgr, takeOverPtr.p->m_fragmentInfoMutex);
    Callback c = { safe_cast(&Dbdih::updateToReq_fragmentMutex_locked),
                   takeOverPtr.i };

    switch(req.requestType){
    case UpdateToReq::BEFORE_STORED:
      jam();

      /**
       * Only one starting node at a time may be active within a node
       * group: register this starting node, count an additional copy
       * thread for it, or reject if another node is already active.
       */
      if (NGPtr.p->activeTakeOver == 0)
      {
        jam();
        NGPtr.p->activeTakeOver = req.startingNodeId;
        NGPtr.p->activeTakeOverCount = 1;
      }
      else if (NGPtr.p->activeTakeOver == req.startingNodeId)
      {
        NGPtr.p->activeTakeOverCount++;
      }
      else
      {
        jam();
        errCode = UpdateToRef::CopyFragInProgress;
        extra = NGPtr.p->activeTakeOver;
        g_eventLogger->info("takeOver node in progress: %u",
                            NGPtr.p->activeTakeOver);
        goto ref;
      }

      // CONF is sent from updateToReq_fragmentMutex_locked().
      takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MUTEX_BEFORE_STORED;
      mutex.lock(c, false, true);
      return;
    case UpdateToReq::AFTER_STORED:
    {
      jam();
      mutex.unlock();
      takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_AFTER_STORED;
      // Send conf
      break;
    }
    case UpdateToReq::BEFORE_COMMIT_STORED:
      jam();
      // CONF is sent from updateToReq_fragmentMutex_locked().
      takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MUTEX_BEFORE_COMMIT;
      mutex.lock(c, false, true);
      return;
    case UpdateToReq::AFTER_COMMIT_STORED:
    {
      jam();
      mutex.unlock();

      // Also release the switch-primary mutex taken during the commit
      // phase; the take-over returns to idle on the master.
      Mutex mutex2(signal, c_mutexMgr,
                   takeOverPtr.p->m_switchPrimaryMutexHandle);
      mutex2.unlock();
      takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MASTER_IDLE;
      break; // send conf
    }
    }
    // NOTE(review): no default case -- an out-of-range requestType falls
    // through to the CONF below; confirm all senders use valid types.
  }
  {
    UpdateToConf * conf = (UpdateToConf *)&signal->theData[0];
    conf->senderData = req.senderData;
    conf->sendingNodeId = cownNodeId;
    conf->startingNodeId = req.startingNodeId;
    sendSignal(req.senderRef, GSN_UPDATE_TOCONF, signal,
               UpdateToConf::SignalLength, JBB);
  }
  return;

ref:
  UpdateToRef* ref = (UpdateToRef*)signal->getDataPtrSend();
  ref->senderData = req.senderData;
  ref->senderRef = reference();
  ref->errorCode = errCode;
  ref->extra = extra;
  sendSignal(req.senderRef, GSN_UPDATE_TOREF, signal,
             UpdateToRef::SignalLength, JBB);
}
5388 
/**
 * Callback invoked when the lock request on the fragment-info mutex
 * (issued from execUPDATE_TOREQ) completes or is queued.
 *
 * @param toPtrI  index of the TakeOverRecord that requested the lock
 * @param retVal  mutex-manager result; UtilLockRef::InLockQueue means we
 *                are still waiting for the lock
 */
void
Dbdih::updateToReq_fragmentMutex_locked(Signal * signal,
                                        Uint32 toPtrI, Uint32 retVal)
{
  jamEntry();
  TakeOverRecordPtr takeOverPtr;
  c_takeOverPool.getPtr(takeOverPtr, toPtrI);

  Uint32 nodeId = takeOverPtr.p->toStartingNode;

  if (retVal == UtilLockRef::InLockQueue)
  {
    jam();
    // Still queued on the mutex: report and wait for the real grant.
    infoEvent("Node %u waiting to continue copying table %u fragment: %u (%s)",
              nodeId,
              takeOverPtr.p->toCurrentTabref,
              takeOverPtr.p->toCurrentFragid,
              takeOverPtr.p->toMasterStatus ==
                TakeOverRecord::TO_MUTEX_BEFORE_STORED ? "STORED" : "COMMIT");
    return;
  }

  Uint32 errCode;
  Uint32 extra;

  NodeRecordPtr nodePtr;
  nodePtr.i = nodeId;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
  if (unlikely(nodePtr.p->nodeStatus != NodeRecord::ALIVE))
  {
    jam();
    /**
     * Node died while we waited for lock...
     */
    abortTakeOver(signal, takeOverPtr);
    return;
  }

  switch(takeOverPtr.p->toMasterStatus){
  case TakeOverRecord::TO_MUTEX_BEFORE_STORED:
  {
    jam();
    // send conf
    takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MUTEX_BEFORE_LOCKED;
    break;
  }
  case TakeOverRecord::TO_MUTEX_BEFORE_COMMIT:
  {
    jam();

    // Locate the node group of the copy node to update its take-over
    // bookkeeping.
    NodeRecordPtr nodePtr;
    NodeGroupRecordPtr NGPtr;
    nodePtr.i = takeOverPtr.p->toCopyNode;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    NGPtr.i = nodePtr.p->nodeGroup;
    ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);

    if (NGPtr.p->activeTakeOver != nodeId)
    {
      // The node group must be registered to this starting node
      // (set in the BEFORE_STORED phase of execUPDATE_TOREQ).
      ndbassert(false);
      errCode = UpdateToRef::InvalidRequest;
      extra = NGPtr.p->activeTakeOver;
      goto ref;
    }
    ndbrequire(NGPtr.p->activeTakeOverCount > 0);
    NGPtr.p->activeTakeOverCount--;
    if (NGPtr.p->activeTakeOverCount == 0)
    {
      /**
       * Last active copy thread, give up activeTakeOver for now
       */
      jam();
      NGPtr.p->activeTakeOver = 0;
      NGPtr.p->activeTakeOverCount = 0;
    }
    takeOverPtr.p->toCopyNode = RNIL;
    // Acquire the switch-primary mutex; the CONF is sent from
    // switchPrimaryMutex_locked() once the lock is granted.
    Mutex mutex(signal, c_mutexMgr,
                takeOverPtr.p->m_switchPrimaryMutexHandle);
    Callback c = { safe_cast(&Dbdih::switchPrimaryMutex_locked),
                   takeOverPtr.i };
    ndbrequire(mutex.lock(c));
    takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MUTEX_BEFORE_SWITCH_REPLICA;
    return;
    break;
  }
  default:
    jamLine(takeOverPtr.p->toMasterStatus);
    ndbabort();
  }

  {
    UpdateToConf * conf = (UpdateToConf *)&signal->theData[0];
    conf->senderData = takeOverPtr.p->m_senderData;
    conf->sendingNodeId = cownNodeId;
    conf->startingNodeId = takeOverPtr.p->toStartingNode;
    sendSignal(takeOverPtr.p->m_senderRef, GSN_UPDATE_TOCONF, signal,
               UpdateToConf::SignalLength, JBB);
  }
  return;

ref:
  {
    // On failure, release the fragment-info mutex and answer with a REF.
    Mutex mutex(signal, c_mutexMgr, takeOverPtr.p->m_fragmentInfoMutex);
    mutex.unlock();

    UpdateToRef* ref = (UpdateToRef*)signal->getDataPtrSend();
    ref->senderData = takeOverPtr.p->m_senderData;
    ref->senderRef = reference();
    ref->errorCode = errCode;
    ref->extra = extra;
    sendSignal(takeOverPtr.p->m_senderRef, GSN_UPDATE_TOREF, signal,
               UpdateToRef::SignalLength, JBB);
    return;
  }
}
5504 
5505 void
switchPrimaryMutex_locked(Signal * signal,Uint32 toPtrI,Uint32 retVal)5506 Dbdih::switchPrimaryMutex_locked(Signal* signal, Uint32 toPtrI, Uint32 retVal)
5507 {
5508   jamEntry();
5509   ndbrequire(retVal == 0);
5510 
5511   TakeOverRecordPtr takeOverPtr;
5512   c_takeOverPool.getPtr(takeOverPtr, toPtrI);
5513 
5514   Uint32 nodeId = takeOverPtr.p->toStartingNode;
5515   NodeRecordPtr nodePtr;
5516   nodePtr.i = nodeId;
5517   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
5518 
5519   if (unlikely(nodePtr.p->nodeStatus != NodeRecord::ALIVE))
5520   {
5521     jam();
5522     /**
5523      * Node died while we waited for lock...
5524      */
5525     abortTakeOver(signal, takeOverPtr);
5526     return;
5527   }
5528 
5529   takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MUTEX_AFTER_SWITCH_REPLICA;
5530 
5531   UpdateToConf * conf = (UpdateToConf *)&signal->theData[0];
5532   conf->senderData = takeOverPtr.p->m_senderData;
5533   conf->sendingNodeId = cownNodeId;
5534   conf->startingNodeId = takeOverPtr.p->toStartingNode;
5535   sendSignal(takeOverPtr.p->m_senderRef, GSN_UPDATE_TOCONF, signal,
5536              UpdateToConf::SignalLength, JBB);
5537 }
5538 
5539 void
switchPrimaryMutex_unlocked(Signal * signal,Uint32 toPtrI,Uint32 retVal)5540 Dbdih::switchPrimaryMutex_unlocked(Signal* signal, Uint32 toPtrI, Uint32 retVal)
5541 {
5542   jamEntry();
5543   ndbrequire(retVal == 0);
5544 
5545   TakeOverRecordPtr takeOverPtr;
5546   c_takeOverPool.getPtr(takeOverPtr, toPtrI);
5547 
5548   UpdateToConf * conf = (UpdateToConf *)&signal->theData[0];
5549   conf->senderData = takeOverPtr.p->m_senderData;
5550   conf->sendingNodeId = cownNodeId;
5551   conf->startingNodeId = takeOverPtr.p->toStartingNode;
5552   sendSignal(takeOverPtr.p->m_senderRef, GSN_UPDATE_TOCONF, signal,
5553              UpdateToConf::SignalLength, JBB);
5554 }
5555 
5556 void
abortTakeOver(Signal * signal,TakeOverRecordPtr takeOverPtr)5557 Dbdih::abortTakeOver(Signal* signal, TakeOverRecordPtr takeOverPtr)
5558 {
5559   if (!takeOverPtr.p->m_switchPrimaryMutexHandle.isNull())
5560   {
5561     jam();
5562     Mutex mutex(signal, c_mutexMgr,
5563                 takeOverPtr.p->m_switchPrimaryMutexHandle);
5564     mutex.unlock();
5565 
5566   }
5567 
5568   if (!takeOverPtr.p->m_fragmentInfoMutex.isNull())
5569   {
5570     jam();
5571     Mutex mutex(signal, c_mutexMgr,
5572                 takeOverPtr.p->m_fragmentInfoMutex);
5573     mutex.unlock();
5574   }
5575 
5576   NodeRecordPtr nodePtr;
5577   nodePtr.i = takeOverPtr.p->toCopyNode;
5578   if (nodePtr.i != RNIL)
5579   {
5580     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
5581     NodeGroupRecordPtr NGPtr;
5582     NGPtr.i = nodePtr.p->nodeGroup;
5583     ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
5584     if (NGPtr.p->activeTakeOver == takeOverPtr.p->toStartingNode)
5585     {
5586       jam();
5587       NGPtr.p->activeTakeOver = 0;
5588       NGPtr.p->activeTakeOverCount = 0;
5589     }
5590   }
5591 
5592   releaseTakeOver(takeOverPtr, true);
5593 }
5594 
5595 static
5596 void
add_lcp_counter(Uint32 * counter,Uint32 add)5597 add_lcp_counter(Uint32 * counter, Uint32 add)
5598 {
5599   Uint64 tmp = * counter;
5600   tmp += add;
5601   if (tmp > 0xFFFFFFFF)
5602     tmp = 0xFFFFFFFF;
5603   * counter = Uint32(tmp);
5604 }
5605 
/**
 * END_TOREQ is received by the master when a starting node has finished
 * copying fragments. If StartCopyReq::WAIT_LCP is set, the node must
 * also be covered by an LCP before it is recoverable and the CONF is
 * deferred until that LCP completes; otherwise the take-over record is
 * released and END_TOCONF is sent immediately.
 */
void Dbdih::execEND_TOREQ(Signal* signal)
{
  jamEntry();
  EndToReq req = *(EndToReq *)&signal->theData[0];

  Uint32 nodeId = refToNode(req.senderRef);
  TakeOverRecordPtr takeOverPtr;

  {
    jam();
    /**
     * The sender must have a take-over record registered on the master.
     */
    ndbrequire(findTakeOver(takeOverPtr, nodeId));
    NodeRecordPtr nodePtr;
    nodePtr.i = nodeId;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);

    if (req.flags & StartCopyReq::WAIT_LCP)
    {
      /**
       * Wait for LCP. Determine which LCP id the node has to wait for:
       * the current one if it participates in it, otherwise the next.
       */
      Uint32 latestLCP_ID = SYSFILE->latestLCP_ID;
      switch (c_lcpState.lcpStatus)
      {
        case LCP_STATUS_IDLE:
        case LCP_WAIT_MUTEX:
        case LCP_TCGET:
        case LCP_TC_CLOPSIZE:
          /**
           * We haven't started the next LCP yet, we haven't assigned the
           * nodes to participate in this LCP, so we will wait for the next
           * LCP started.
           */
         jam();
         latestLCP_ID++;
         break;
       default:
         /**
          * All the remaining status codes means that the LCP has been started
          * and that the participating nodes have been set. So if our node is
          * part of the participating nodes we will wait for this LCP,
          * otherwise we will wait for the next LCP to start.
          */
         jam();
         if (!c_lcpState.m_participatingLQH.get(nodeId))
         {
           jam();
           latestLCP_ID++;
         }
         break;
      }
      infoEvent("Make On-line Database recoverable by waiting"
                " for LCP Starting on node %u, LCP id %u",
                nodeId,
                latestLCP_ID);

      nodePtr.p->copyCompleted = 2;
      takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_WAIT_LCP;

      /**
       * Make sure that node also participated in one GCP
       *   before running it's first LCP, so that GCI variables
       *   in LQH are set properly
       */
      c_lcpState.lcpStopGcp = c_newest_restorable_gci;

      /**
       * We want to keep track of how long time we wait for LCP to be able
       * to present it in an ndbinfo table. This information is also used
       * in deciding when to start LCPs.
       *
       * We ensure that we will not stall any LCPs in this state due to not
       * having had enough activity. We can still stall due to waiting for
       * other nodes to reach this state.
       */
      add_lcp_counter(&c_lcpState.ctimer, (1 << 31));
      setNodeRecoveryStatus(nodePtr.i, NodeRecord::WAIT_LCP_FOR_RESTART);
      return;
    }
    // No LCP wait requested: the node's copy phase is simply complete.
    nodePtr.p->copyCompleted = 1;
    releaseTakeOver(takeOverPtr, true);
  }

  EndToConf * conf = (EndToConf *)&signal->theData[0];
  conf->senderData = req.senderData;
  conf->sendingNodeId = cownNodeId;
  conf->startingNodeId = req.startingNodeId;
  sendSignal(req.senderRef, GSN_END_TOCONF, signal,
             EndToConf::SignalLength, JBB);
}//Dbdih::execEND_TOREQ()
5698 
5699 /* --------------------------------------------------------------------------*/
5700 /*       AN ORDER TO START OR COMMIT THE REPLICA CREATION ARRIVED FROM THE   */
5701 /*       MASTER.                                                             */
5702 /* --------------------------------------------------------------------------*/
/**
 * UPDATE_FRAG_STATEREQ: order from the master to start or commit the
 * creation of a replica of one fragment on the starting node. The
 * replica record is updated and UPDATE_FRAG_STATECONF is returned.
 */
void Dbdih::execUPDATE_FRAG_STATEREQ(Signal* signal)
{
  jamEntry();
  UpdateFragStateReq * const req = (UpdateFragStateReq *)&signal->theData[0];

  Uint32 senderData = req->senderData;
  Uint32 senderRef = req->senderRef;

  TabRecordPtr tabPtr;
  tabPtr.i = req->tableId;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

  Uint32 fragId = req->fragId;
  Uint32 tdestNodeid = req->startingNodeId;
  //Uint32 tsourceNodeid = req->copyNodeId;
  Uint32 startGci = req->startGci;
  Uint32 replicaType = req->replicaType;
  Uint32 tFailedNodeId = req->failedNodeId;

  FragmentstorePtr fragPtr;
  getFragstore(tabPtr.p, fragId, fragPtr);
  RETURN_IF_NODE_NOT_ALIVE(tdestNodeid);
  // Locate the replica record of the failed node; the boolean selects
  // which replica list findReplica searches (see findReplica).
  ReplicaRecordPtr frReplicaPtr;
  findReplica(frReplicaPtr, fragPtr.p, tFailedNodeId,
              replicaType == UpdateFragStateReq::START_LOGGING ? false : true);
  if (frReplicaPtr.i == RNIL)
  {
    // Dump diagnostics before the unconditional requirement below fires.
    dump_replica_info(fragPtr.p);
  }
  ndbrequire(frReplicaPtr.i != RNIL);

  make_table_use_new_replica(tabPtr,
                             fragPtr,
                             frReplicaPtr,
                             replicaType,
                             tdestNodeid);

  /* ------------------------------------------------------------------------*/
  /*       THE NEW NODE OF THIS REPLICA IS THE STARTING NODE.                */
  /* ------------------------------------------------------------------------*/
  if (tFailedNodeId != tdestNodeid)
  {
    jam();
    /**
     * This is a Hot-spare or move partition
     */

    /*  IF WE ARE STARTING A TAKE OVER NODE WE MUST INVALIDATE ALL LCP'S.   */
    /*  OTHERWISE WE WILL TRY TO START LCP'S THAT DO NOT EXIST.             */
    /* ---------------------------------------------------------------------*/
    frReplicaPtr.p->procNode = tdestNodeid;
    frReplicaPtr.p->noCrashedReplicas = 0;
    frReplicaPtr.p->createGci[0] = startGci;
    frReplicaPtr.p->replicaLastGci[0] = (Uint32)-1;
    for (Uint32 i = 0; i < MAX_LCP_STORED; i++)
    {
      frReplicaPtr.p->lcpStatus[i] = ZINVALID;
    }
  }
  else
  {
    jam();
    // Same node restarts: record the new create GCI in the slot after the
    // crashed replica intervals.
    const Uint32 noCrashed = frReplicaPtr.p->noCrashedReplicas;
    arrGuard(noCrashed, 8);
    frReplicaPtr.p->createGci[noCrashed] = startGci;
    frReplicaPtr.p->replicaLastGci[noCrashed] = (Uint32)-1;
  }

  if (!isMaster())
  {
    jam();
    NodeRecordPtr nodePtr;
    nodePtr.i = tdestNodeid;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    if (nodePtr.p->nodeRecoveryStatus != NodeRecord::NODE_GETTING_SYNCHED)
    {
      jam();
      /**
       * We come here many times, we will call the state transition
       * code only the first time.
       */
      setNodeRecoveryStatus(tdestNodeid, NodeRecord::NODE_GETTING_SYNCHED);
    }
  }
  UpdateFragStateConf * const conf =
    (UpdateFragStateConf *)&signal->theData[0];
  conf->senderData = senderData;
  conf->tableId = tabPtr.i;
  conf->fragId = fragId;
  conf->sendingNodeId = cownNodeId;
  conf->startingNodeId = tdestNodeid;
  conf->failedNodeId = tFailedNodeId;
  sendSignal(senderRef, GSN_UPDATE_FRAG_STATECONF, signal,
             UpdateFragStateConf::SignalLength, JBB);
}//Dbdih::execUPDATE_FRAG_STATEREQ()
5798 
5799 /**
5800  * Node Recovery Status Module
5801  * ---------------------------
5802  * This module is used to keep track of the restart progress in the master node
5803  * and also to report it to the user through a NDBINFO table. The module is
5804  * also used to estimate when a restart reaches certain critical checkpoints
5805  * in the restart execution. This is used to ensure that we hold up start of
5806  * those critical parts (e.g. LCPs) if there is a good chance that we will
 * reach there in reasonable time. Same principle as holding a train waiting
5808  * for a batch of important customers. One can wait for a while, but not
5809  * for too long time since this will affect many others as well.
5810  *
5811  * The only actions that are reported here happen in the master node. The only
5812  * exception to this is the node failure and node failure completed events
5813  * that happens in all nodes. Since the master node is the node that was
5814  * started first of all nodes, this means that the master node will contain
5815  * information about the node restarts of all nodes except those that
 * were started at the same time as the master node.
5817  */
5818 
5819 /* Debug Node Recovery Status module */
5820 #define DBG_NRS(a)
5821 //#define DBG_NRS(a) ndbout << a << endl
5822 
initNodeRecoveryStatus()5823 void Dbdih::initNodeRecoveryStatus()
5824 {
5825   NodeRecordPtr nodePtr;
5826 
5827   jam();
5828   for (nodePtr.i = 0; nodePtr.i < MAX_NDB_NODES; nodePtr.i++)
5829   {
5830     ptrAss(nodePtr, nodeRecord);
5831     nodePtr.p->nodeRecoveryStatus = NodeRecord::NOT_DEFINED_IN_CLUSTER;
5832     nodePtr.p->is_pausable = false;
5833     initNodeRecoveryTimers(nodePtr);
5834   }
5835 }
5836 
/**
 * Invalidate every recovery-phase timestamp of one node record, so that
 * no stale timing information survives into the next restart tracking.
 */
void Dbdih::initNodeRecoveryTimers(NodeRecordPtr nodePtr)
{
  jam();
  NdbTick_Invalidate(&nodePtr.p->nodeFailTime);
  NdbTick_Invalidate(&nodePtr.p->nodeFailCompletedTime);
  NdbTick_Invalidate(&nodePtr.p->allocatedNodeIdTime);
  NdbTick_Invalidate(&nodePtr.p->includedInHBProtocolTime);
  NdbTick_Invalidate(&nodePtr.p->ndbcntrStartWaitTime);
  NdbTick_Invalidate(&nodePtr.p->ndbcntrStartedTime);
  NdbTick_Invalidate(&nodePtr.p->startPermittedTime);
  NdbTick_Invalidate(&nodePtr.p->waitLCPToCopyDictTime);
  NdbTick_Invalidate(&nodePtr.p->copyDictToStartingNodeTime);
  NdbTick_Invalidate(&nodePtr.p->includeNodeInLCPAndGCPTime);
  NdbTick_Invalidate(&nodePtr.p->startDatabaseRecoveryTime);
  NdbTick_Invalidate(&nodePtr.p->startUndoDDTime);
  NdbTick_Invalidate(&nodePtr.p->startExecREDOLogTime);
  NdbTick_Invalidate(&nodePtr.p->startBuildIndexTime);
  NdbTick_Invalidate(&nodePtr.p->copyFragmentsStartedTime);
  NdbTick_Invalidate(&nodePtr.p->waitLCPForRestartTime);
  NdbTick_Invalidate(&nodePtr.p->waitSumaHandoverTime);
  NdbTick_Invalidate(&nodePtr.p->restartCompletedTime);
  NdbTick_Invalidate(&nodePtr.p->nodeGettingPermitTime);
  NdbTick_Invalidate(&nodePtr.p->nodeGettingIncludedTime);
  NdbTick_Invalidate(&nodePtr.p->nodeGettingSynchedTime);
  NdbTick_Invalidate(&nodePtr.p->nodeInLCPWaitStateTime);
  NdbTick_Invalidate(&nodePtr.p->nodeActiveTime);
}
5864 
5865 /**
5866  * A node has allocated a node id, this happens even before the angel starts
5867  * a new ndbd/ndbmtd process or in a very early phase of ndbd/ndbmtd startup.
5868  */
execALLOC_NODEID_REP(Signal * signal)5869 void Dbdih::execALLOC_NODEID_REP(Signal *signal)
5870 {
5871   NodeRecordPtr nodePtr;
5872   AllocNodeIdRep *rep = (AllocNodeIdRep*)&signal->theData[0];
5873 
5874   jamEntry();
5875   if (rep->nodeId >= MAX_NDB_NODES)
5876   {
5877     jam();
5878     return;
5879   }
5880   nodePtr.i = rep->nodeId;
5881   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
5882   if (nodePtr.p->nodeStatus == NodeRecord::NOT_IN_CLUSTER)
5883   {
5884     jam();
5885     return;
5886   }
5887   setNodeRecoveryStatus(rep->nodeId, NodeRecord::ALLOCATED_NODE_ID);
5888 }
5889 
5890 /**
5891  * A node have been included in the heartbeat protocol. This happens very early
5892  * on in the restart, from here the node need to act as a real-time engine and
5893  * thus has to avoid extremely time consuming activities that block execution.
5894  */
execINCL_NODE_HB_PROTOCOL_REP(Signal * signal)5895 void Dbdih::execINCL_NODE_HB_PROTOCOL_REP(Signal *signal)
5896 {
5897   InclNodeHBProtocolRep *rep = (InclNodeHBProtocolRep*)&signal->theData[0];
5898   jamEntry();
5899 
5900   setNodeRecoveryStatus(rep->nodeId, NodeRecord::INCLUDED_IN_HB_PROTOCOL);
5901 }
5902 
5903 /**
5904  * The node is blocked to continue in its node restart handling since another
5905  * node is currently going through the stages to among other things copy the
5906  * meta data.
5907  */
execNDBCNTR_START_WAIT_REP(Signal * signal)5908 void Dbdih::execNDBCNTR_START_WAIT_REP(Signal *signal)
5909 {
5910   NdbcntrStartWaitRep *rep = (NdbcntrStartWaitRep*)&signal->theData[0];
5911   jamEntry();
5912 
5913   setNodeRecoveryStatus(rep->nodeId, NodeRecord::NDBCNTR_START_WAIT);
5914 }
5915 
5916 /**
5917  * The node wasn't blocked by another node restart anymore, we can now
5918  * continue processing the restart and soon go on to copy the meta data.
5919  */
execNDBCNTR_STARTED_REP(Signal * signal)5920 void Dbdih::execNDBCNTR_STARTED_REP(Signal *signal)
5921 {
5922   NdbcntrStartedRep *rep = (NdbcntrStartedRep*)&signal->theData[0];
5923   jamEntry();
5924 
5925   setNodeRecoveryStatus(rep->nodeId, NodeRecord::NDBCNTR_STARTED);
5926 }
5927 
5928 /**
5929  * SUMA handover for the node has completed, this is the very final step
5930  * of the node restart after which the node is fully up and running.
5931  */
execSUMA_HANDOVER_COMPLETE_REP(Signal * signal)5932 void Dbdih::execSUMA_HANDOVER_COMPLETE_REP(Signal *signal)
5933 {
5934   SumaHandoverCompleteRep *rep = (SumaHandoverCompleteRep*)&signal->theData[0];
5935   jamEntry();
5936 
5937   setNodeRecoveryStatus(rep->nodeId, NodeRecord::RESTART_COMPLETED);
5938 }
5939 
execLOCAL_RECOVERY_COMP_REP(Signal * signal)5940 void Dbdih::execLOCAL_RECOVERY_COMP_REP(Signal *signal)
5941 {
5942   jamEntry();
5943   if (reference() != cmasterdihref)
5944   {
5945     jam();
5946     sendSignal(cmasterdihref, GSN_LOCAL_RECOVERY_COMP_REP, signal,
5947                LocalRecoveryCompleteRep::SignalLengthMaster, JBB);
5948     return;
5949   }
5950   LocalRecoveryCompleteRep *rep =
5951     (LocalRecoveryCompleteRep*)&signal->theData[0];
5952   LocalRecoveryCompleteRep::PhaseIds phaseId =
5953     (LocalRecoveryCompleteRep::PhaseIds)rep->phaseId;
5954   Uint32 nodeId = rep->nodeId;
5955 
5956   switch (phaseId)
5957   {
5958   case LocalRecoveryCompleteRep::RESTORE_FRAG_COMPLETED:
5959     jam();
5960     setNodeRecoveryStatus(nodeId, NodeRecord::RESTORE_FRAG_COMPLETED);
5961     break;
5962   case LocalRecoveryCompleteRep::UNDO_DD_COMPLETED:
5963     jam();
5964     setNodeRecoveryStatus(nodeId, NodeRecord::UNDO_DD_COMPLETED);
5965     break;
5966   case LocalRecoveryCompleteRep::EXECUTE_REDO_LOG_COMPLETED:
5967     jam();
5968     setNodeRecoveryStatus(nodeId, NodeRecord::EXECUTE_REDO_LOG_COMPLETED);
5969     break;
5970   default:
5971     ndbabort();
5972   }
5973 }
5974 
5975 /**
5976  * Called by starting nodes to provide non-master nodes with an estimate of how
5977  * long time it takes to synchronize the starting node with the alive nodes.
5978  */
sendEND_TOREP(Signal * signal,Uint32 startingNodeId)5979 void Dbdih::sendEND_TOREP(Signal *signal, Uint32 startingNodeId)
5980 {
5981   EndToRep *rep = (EndToRep*)signal->getDataPtrSend();
5982   NodeRecordPtr nodePtr;
5983   nodePtr.i = cfirstAliveNode;
5984   rep->nodeId = startingNodeId;
5985 
5986   do
5987   {
5988     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
5989     {
5990       jamLine(nodePtr.i);
5991       BlockReference ref = calcDihBlockRef(nodePtr.i);
5992       if (ref != cmasterdihref)
5993       {
5994         jam();
5995         sendSignal(ref, GSN_END_TOREP, signal,
5996 	           EndToRep::SignalLength, JBB);
5997       }
5998     }
5999     nodePtr.i = nodePtr.p->nextNode;
6000   } while (nodePtr.i != RNIL);
6001 }
6002 
6003 /**
6004  * Received in non-master nodes, to ensure we get estimate on synch time
6005  * between starting node and alive nodes.
6006  */
execEND_TOREP(Signal * signal)6007 void Dbdih::execEND_TOREP(Signal *signal)
6008 {
6009   EndToRep *rep = (EndToRep*)&signal->theData[0];
6010   jamEntry();
6011   if (isMaster())
6012   {
6013     jam();
6014     return;
6015   }
6016   setNodeRecoveryStatus(rep->nodeId, NodeRecord::NODE_IN_LCP_WAIT_STATE);
6017 }
6018 
6019 /**
6020  * Called when setting state to ALLOCATED_NODE_ID or
6021  * INCLUDE_IN_HB_PROTOCOL since a node can be dead for a long time
6022  * while we've been master and potentially could even have allocated
6023  * its node id before we became master.
6024  */
check_node_not_restarted_yet(NodeRecordPtr nodePtr)6025 void Dbdih::check_node_not_restarted_yet(NodeRecordPtr nodePtr)
6026 {
6027   if (nodePtr.p->nodeRecoveryStatus ==
6028       NodeRecord::NODE_NOT_RESTARTED_YET)
6029   {
6030     jam();
6031     /**
6032      * A node which has been dead since we started is restarted.
6033      * We set node failure time and node failure completed time
6034      * to now in this case to initialise those unknown values, we
6035      * rather report zero time than an uninitialised time.
6036      */
6037     nodePtr.p->nodeFailTime = c_current_time;
6038     nodePtr.p->nodeFailCompletedTime = c_current_time;
6039   }
6040 }
6041 
/**
 * Record a node recovery status transition for node nodeId and timestamp
 * it with the current time. The per-state timestamps are later used to
 * estimate how long a restarting node needs before it can participate in
 * an LCP, and the transition is reported to the cluster log and node log.
 *
 * The master tracks the full restart state machine (ALLOCATED_NODE_ID ...
 * RESTART_COMPLETED); non-master nodes only track the NODE_GETTING_* /
 * NODE_ACTIVE states. Transitions are ignored until this node itself is
 * fully started. The ndbrequire'd predecessor checks rely on QMGR,
 * NDBCNTR, DBDICT and DBDIH executing in the same thread (see below).
 */
void Dbdih::setNodeRecoveryStatus(Uint32 nodeId,
                                  NodeRecord::NodeRecoveryStatus new_status)
{
  NodeRecordPtr nodePtr;
  NDB_TICKS current_time;

  /* Refresh the cached current time; all timestamps below use it. */
  c_current_time = NdbTick_getCurrentTicks();
  current_time = c_current_time;

  nodePtr.i = nodeId;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
  jam();
  jamLine(nodePtr.p->nodeRecoveryStatus);

  /**
   * We maintain the state NODE_GETTING_PERMIT in the
   * variable is_pausable independent of when it is
   * received since it is needed to be able to handle
   * PAUSE protocol properly. The node recovery status
   * isn't sufficiently developed to handle this using
   * the state variable alone yet since we cannot handle
   * all restart types yet.
   */
  if (new_status == NodeRecord::NODE_GETTING_PERMIT)
  {
    jam();
    nodePtr.p->is_pausable = true;
  }
  else
  {
    jam();
    nodePtr.p->is_pausable = false;
  }

  if (getNodeState().startLevel != NodeState::SL_STARTED)
  {
    jam();
    /**
     * We will ignore all state transitions until we are started ourselves
     * before we even attempt to record state transitions. This means we
     * currently have no view into system restarts and initial starts. We
     * only worry about node restarts for now.
     */
    return;
  }
  if (new_status != NodeRecord::NODE_FAILED &&
      new_status != NodeRecord::NODE_FAILURE_COMPLETED)
  {
    jam();
    /**
     * Given that QMGR, NDBCNTR, DBDICT and DBDIH executes in the same thread
     * the possibility of jumping over a state doesn't exist. If we split out
     * any of those into separate threads in the future it is important to
     * check that the ndbrequire's in this function still holds.
     */
    if (!isMaster())
    {
      if (nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_NOT_RESTARTED_YET &&
          new_status != NodeRecord::NODE_GETTING_PERMIT)
      {
        jam();
        /**
         * We're getting into the game too late, we will ignore state changes
         * for this node restart since it won't provide any useful info
         * anyways.
         */
        return;
      }
    }
    else if (nodePtr.p->nodeRecoveryStatus ==
             NodeRecord::NODE_NOT_RESTARTED_YET)
    {
      jam();
      switch (new_status)
      {
        case NodeRecord::ALLOCATED_NODE_ID:
          jam();
          // Fall through
        case NodeRecord::INCLUDED_IN_HB_PROTOCOL:
          jam();
          /**
           * These are the normal states to hear about as first states after
           * we completed our own start. We can either first hear a node
           * failure and then we are sure we will follow the right path
           * since we heard about the node failure after being started.
           * If we weren't there for the node failure we are also ok with
           * starting all the way from allocated node id and included in
           * heartbeat protocol.
           */
          break;
        default:
          jam();
          jamLine(new_status);
          /**
           * This was due to a partial system restart, we haven't gotten
           * around to supporting this yet. This requires more work
           * before we can support it, this would mean that we come into
           * the action midway, so this will be solved when we handle
           * system restarts properly, but this is more work needed and
           * not done yet. So for now we ignore those states and will
           * handle the next time the node starts up instead.
           * TODO
           */
          return;
      }
    }
  }
  switch (new_status)
  {
    case NodeRecord::NODE_FAILED:
    /* State generated in DBDIH */
      jam();
      /**
       * A node failure can happen at any time and from any state as long as
       * it is defined in the cluster.
       *
       * This state change will be reported in all nodes at all times.
       *
       * We will clear all timers when a node fails since we want to ensure
       * that we only have valid timers backwards in time to avoid reading
       * old timers.
       */
      ndbrequire((nodePtr.p->nodeRecoveryStatus !=
                  NodeRecord::NOT_DEFINED_IN_CLUSTER));
      initNodeRecoveryTimers(nodePtr);
      nodePtr.p->nodeFailTime = current_time;
      break;
    case NodeRecord::NODE_FAILURE_COMPLETED:
    /* State generated in DBDIH */
      jam();
      /* This state change will be reported in all nodes at all times */
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::NODE_FAILED);
      nodePtr.p->nodeFailCompletedTime = current_time;
      break;
    case NodeRecord::ALLOCATED_NODE_ID:
    /* State generated in QMGR */
      jam();
      ndbrequire(isMaster());
      ndbrequire((nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_FAILURE_COMPLETED) ||
                 (nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::ALLOCATED_NODE_ID) ||
                 (nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_NOT_RESTARTED_YET));
      check_node_not_restarted_yet(nodePtr);
      if (nodePtr.p->nodeRecoveryStatus == NodeRecord::ALLOCATED_NODE_ID)
      {
        jam();
        /**
         * If a node first allocates a node id and then comes back again to
         * allocate it again, then start counting time from node failed
         * as from now since a long time might have passed since we actually
         * failed.
         */
        nodePtr.p->nodeFailTime = current_time;
        nodePtr.p->nodeFailCompletedTime = current_time;
      }
      nodePtr.p->allocatedNodeIdTime = current_time;
      break;
    case NodeRecord::INCLUDED_IN_HB_PROTOCOL:
    /* State generated in QMGR */
      jam();
      /**
       * We can come here from ALLOCATED_NODE_ID obviously,
       * but it seems that we should also be able to get
       * here from a state where the node has been able to
       * allocate a node id with an old master, now it is
       * using this old allocated node id to be included in
       * the heartbeat protocol. So the node could be in
       * node not restarted yet or node failure completed.
       */
      ndbrequire(isMaster());
      ndbrequire((nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::ALLOCATED_NODE_ID) ||
                 (nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_NOT_RESTARTED_YET) ||
                 (nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_FAILURE_COMPLETED));
      check_node_not_restarted_yet(nodePtr);
      if (nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_FAILURE_COMPLETED)
      {
        jam();
        /* Node id was allocated under an old master: zero-length phase. */
        nodePtr.p->allocatedNodeIdTime = current_time;
      }
      nodePtr.p->includedInHBProtocolTime = current_time;
      break;
    case NodeRecord::NDBCNTR_START_WAIT:
    /* State generated in NDBCNTR */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::INCLUDED_IN_HB_PROTOCOL);
      nodePtr.p->ndbcntrStartWaitTime = current_time;
      break;
    case NodeRecord::NDBCNTR_STARTED:
    /* State generated in NDBCNTR */
      jam();
      ndbrequire(isMaster());
      ndbrequire((nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NDBCNTR_START_WAIT) ||
                 (nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::INCLUDED_IN_HB_PROTOCOL));

      if (nodePtr.p->nodeRecoveryStatus ==
          NodeRecord::INCLUDED_IN_HB_PROTOCOL)
      {
        jam();
        /* Node never had to wait for another restart: zero-length phase. */
        nodePtr.p->ndbcntrStartWaitTime = current_time;
      }
      nodePtr.p->ndbcntrStartedTime = current_time;
      break;
    case NodeRecord::START_PERMITTED:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::NDBCNTR_STARTED);
      nodePtr.p->startPermittedTime = current_time;
      break;
    case NodeRecord::WAIT_LCP_TO_COPY_DICT:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::START_PERMITTED);
      nodePtr.p->waitLCPToCopyDictTime = current_time;
      break;
    case NodeRecord::COPY_DICT_TO_STARTING_NODE:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::WAIT_LCP_TO_COPY_DICT);
      nodePtr.p->copyDictToStartingNodeTime = current_time;
      break;
    case NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::COPY_DICT_TO_STARTING_NODE);
      nodePtr.p->includeNodeInLCPAndGCPTime = current_time;
      break;
    case NodeRecord::LOCAL_RECOVERY_STARTED:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP);
      nodePtr.p->startDatabaseRecoveryTime = current_time;
      break;
    case NodeRecord::RESTORE_FRAG_COMPLETED:
    /* State generated in DBLQH in starting node */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::LOCAL_RECOVERY_STARTED);
      nodePtr.p->startUndoDDTime = current_time;
      break;
    case NodeRecord::UNDO_DD_COMPLETED:
    /* State generated in DBLQH in starting node */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::RESTORE_FRAG_COMPLETED);
      nodePtr.p->startExecREDOLogTime = current_time;
      break;
    case NodeRecord::EXECUTE_REDO_LOG_COMPLETED:
    /* State generated in DBLQH in starting node */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::UNDO_DD_COMPLETED);
      nodePtr.p->startBuildIndexTime = current_time;
      break;
    case NodeRecord::COPY_FRAGMENTS_STARTED:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      /**
       * If the starting node doesn't support reporting its
       * local recovery status, then we come here from
       * LOCAL_RECOVERY_STARTED, in the normal case with a
       * new version of the starting node we come here rather from
       * EXECUTE_REDO_LOG_COMPLETED.
       *
       * NOTE(review): the ndbrequire below only admits
       * EXECUTE_REDO_LOG_COMPLETED, which makes the
       * LOCAL_RECOVERY_STARTED branch that follows unreachable —
       * confirm whether the ndbrequire should also accept
       * LOCAL_RECOVERY_STARTED for old-version starting nodes, or
       * whether old-version support was deliberately dropped.
       */
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::EXECUTE_REDO_LOG_COMPLETED);
      if (nodePtr.p->nodeRecoveryStatus ==
          NodeRecord::LOCAL_RECOVERY_STARTED)
      {
        /**
         * We handle this state transition even for old versions since
         * it still gives all the information we need to make the right
         * decision about the LCP start.
         */
        NDB_TICKS start_time = nodePtr.p->startDatabaseRecoveryTime;
        jam();
        /**
         * The node doesn't report per-phase local recovery times:
         * collapse them all to the recovery start time so the phases
         * read as zero-length.
         */
        nodePtr.p->startUndoDDTime = start_time;
        nodePtr.p->startExecREDOLogTime = start_time;
        nodePtr.p->startBuildIndexTime = start_time;
      }
      nodePtr.p->copyFragmentsStartedTime = current_time;
      break;
    case NodeRecord::WAIT_LCP_FOR_RESTART:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::COPY_FRAGMENTS_STARTED);
      nodePtr.p->waitLCPForRestartTime = current_time;
      break;
    case NodeRecord::WAIT_SUMA_HANDOVER:
    /* State generated in DBDIH */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::WAIT_LCP_FOR_RESTART);
      nodePtr.p->waitSumaHandoverTime = current_time;
      break;
    case NodeRecord::RESTART_COMPLETED:
    /* State generated in DBDICT */
      jam();
      ndbrequire(isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::WAIT_SUMA_HANDOVER);
      nodePtr.p->restartCompletedTime = current_time;
      break;

    /* Non-master states */
    case NodeRecord::NODE_GETTING_PERMIT:
    {
      jam();
      ndbrequire(!isMaster());
      /**
       * NODE_GETTING_PERMIT is the first state a non-master node sees.
       * So we can come here from seeing node failure state or node
       * failure completed state.
       *
       * For a non-master node we can always come to any state from the
       * state NODE_NOT_RESTARTED_YET since we don't record any states
       * until we have completed our own restart and at that time there
       * can be other nodes restarting in any state.
       *
       * In addition we won't even record states for a starting node if
       * we only seen the final phases of the restart. So the state
       * NODE_NOT_RESTARTED_YET can be there through a major part of
       * a node restart.
       */
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::NODE_FAILURE_COMPLETED ||
                 nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::NODE_NOT_RESTARTED_YET);
      if (nodePtr.p->nodeRecoveryStatus ==
          NodeRecord::NODE_NOT_RESTARTED_YET)
      {
        jam();
        /* Failure happened before we started: record zero-length phases. */
        nodePtr.p->nodeFailTime = current_time;
        nodePtr.p->nodeFailCompletedTime = current_time;
      }
      nodePtr.p->nodeGettingPermitTime = current_time;
      break;
    }
    case NodeRecord::NODE_GETTING_INCLUDED:
    {
      jam();
      ndbrequire(!isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_GETTING_PERMIT);
      nodePtr.p->nodeGettingIncludedTime = current_time;
      break;
    }
    case NodeRecord::NODE_GETTING_SYNCHED:
    {
      jam();
      ndbrequire(!isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_GETTING_INCLUDED);
      nodePtr.p->nodeGettingSynchedTime = current_time;
      break;
    }
    case NodeRecord::NODE_IN_LCP_WAIT_STATE:
    {
      jam();
      ndbrequire(!isMaster());
      /**
       * A weird case for coming to here with NODE_GETTING_INCLUDED is if
       * there are no tables that require being synched. This is an
       * unusual case, but still possible.
       */
      ndbrequire((nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_GETTING_INCLUDED) ||
                 (nodePtr.p->nodeRecoveryStatus ==
                  NodeRecord::NODE_GETTING_SYNCHED));
      if (nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_GETTING_INCLUDED)
      {
        jam();
        /* No fragment updates were needed: record a zero-length synch phase. */
        nodePtr.p->nodeGettingSynchedTime = nodePtr.p->nodeGettingIncludedTime;
      }
      nodePtr.p->nodeInLCPWaitStateTime = current_time;
      break;
    }
    case NodeRecord::NODE_ACTIVE:
      jam();
      ndbrequire(!isMaster());
      ndbrequire(nodePtr.p->nodeRecoveryStatus ==
                 NodeRecord::NODE_IN_LCP_WAIT_STATE);
      nodePtr.p->nodeActiveTime = current_time;
      break;
    default:
      ndbabort();
  }

  /* Report the transition both to the cluster log and the node log. */
  infoEvent("NR Status: node=%u,OLD=%s,NEW=%s",
            nodeId,
            get_status_str(nodePtr.p->nodeRecoveryStatus),
            get_status_str(new_status));

  g_eventLogger->info("NR Status: node=%u,OLD=%s,NEW=%s",
                      nodeId,
                      get_status_str(nodePtr.p->nodeRecoveryStatus),
                      get_status_str(new_status));

  nodePtr.p->nodeRecoveryStatus = new_status;
  ndbassert(check_node_recovery_timers(nodePtr.i));
}
6471 
setNodeRecoveryStatusInitial(NodeRecordPtr nodePtr)6472 void Dbdih::setNodeRecoveryStatusInitial(NodeRecordPtr nodePtr)
6473 {
6474   DBG_NRS("setNodeRecoveryStatusInitial: node= " << nodePtr.i << "state= " <<
6475           (Uint32)NodeRecord::NODE_NOT_RESTARTED_YET);
6476   nodePtr.p->nodeRecoveryStatus = NodeRecord::NODE_NOT_RESTARTED_YET;
6477 }
6478 
6479 /**
6480  * Define heuristic constants
6481  * --------------------------
6482  *
6483  * The base for the maximum wait is the time the last LCP execution took.
6484  * We will never wait for more than 35% of this time. We will check this
6485  * even before attempting to wait any further. We will also cap the wait
6486  * to never exceed an hour.
6487  *
6488  * Next we will adjust the maximum wait time down to 85% of this value
6489  * when we are calculating the estimate based on node states. This means
6490  * that if we estimate that we will wait for more than around 30% of an
6491  * LCP execution time, then we will start the LCP.
6492  *
 * If the node we are waiting for is in the early start phases then we are
 * even less inclined to wait and will decrease the time by another 50%,
 * dropping it to around 15% of an LCP execution time.
6496  *
6497  * If we have no node with a proper estimate, then we will drop the
6498  * wait time even more to 25% of the previous value, so 7-8% for
6499  * nodes in later start phases and only 3-4% in early start phases.
6500  */
6501 #define STALL_MAX_ONE_HOUR (60 * 60 * 1000)
6502 #define MAX_PERCENTAGE_OF_LCP_TIME_WE_STALL 35
6503 #define MAX_PERCENTAGE_ADJUSTMENT_FOR_ESTIMATE 85
6504 #define MAX_PERCENTAGE_ADJUSTMENT_FOR_EARLY_START_PHASES 50
6505 #define MAX_PERCENTAGE_ADJUSTMENT_FOR_NO_ESTIMATE 25
6506 
check_for_too_long_wait(Uint64 & lcp_max_wait_time,Uint64 & lcp_stall_time,NDB_TICKS now)6507 bool Dbdih::check_for_too_long_wait(Uint64 &lcp_max_wait_time,
6508                                     Uint64 &lcp_stall_time,
6509                                     NDB_TICKS now)
6510 {
6511   /**
6512    * We first get the time of the latest LCP execution. We want to stall
6513    * execution of LCPs, but never for so long that we get into other
6514    * problems such as out of REDO log.
6515    */
6516   Uint64 lcp_proc_time;
6517   Uint64 lcp_time = c_lcpState.m_lcp_time;
6518   Uint32 lcp_start = c_lcpState.lcpStallStart;
6519   if (lcp_start == 0)
6520   {
6521     jam();
6522     lcp_stall_time = 0;
6523   }
6524   else
6525   {
6526     jam();
6527     lcp_stall_time = NdbTick_Elapsed(c_lcpState.m_start_lcp_check_time,
6528                                      now).milliSec();
6529   }
6530 
6531   /**
6532    * We never wait for more than 1 hour and at most 35% of the time it
6533    * takes to execute an LCP. We calculate the maximum stall time here
6534    * based on those two inputs.
6535    */
6536   lcp_proc_time = MAX_PERCENTAGE_OF_LCP_TIME_WE_STALL * lcp_time;
6537   lcp_proc_time /= 100;
6538   lcp_max_wait_time = STALL_MAX_ONE_HOUR;
6539   if (lcp_max_wait_time > lcp_proc_time)
6540   {
6541     jam();
6542     lcp_max_wait_time = lcp_proc_time;
6543   }
6544 
6545   DBG_NRS("lcp_stall_time is = " << lcp_stall_time
6546            << " lcp_max_wait_time is = " << lcp_max_wait_time);
6547   /**
6548    * If we have already stalled for longer time than the maximum wait we
6549    * will allow, then we need not check the states of node restarts, we
6550    * will start the LCP anyways.
6551    */
6552   if (lcp_stall_time > lcp_max_wait_time)
6553   {
6554     jam();
6555     return true;
6556   }
6557 
6558   /**
6559    * In the calculated delay we will allow for a slightly shorter calculated
6560    * delay than the maximum actual delay we will wait. This is to avoid that
6561    * we wait for a long time only to stop waiting right before the wait is
6562    * over.
6563    */
6564   lcp_max_wait_time *= MAX_PERCENTAGE_ADJUSTMENT_FOR_ESTIMATE;
6565   lcp_max_wait_time /= 100; /* Decrease max time by 15% */
6566   lcp_max_wait_time -= lcp_stall_time; /* Decrease by time we already waited */
6567   return false;
6568 }
6569 
calculate_time_remaining(Uint32 nodeId,NDB_TICKS state_start_time,NDB_TICKS now,NodeRecord::NodeRecoveryStatus state,Uint32 * node_waited_for,Uint64 * time_since_state_start,NodeRecord::NodeRecoveryStatus * max_status)6570 void Dbdih::calculate_time_remaining(
6571                                 Uint32 nodeId,
6572                                 NDB_TICKS state_start_time,
6573                                 NDB_TICKS now,
6574                                 NodeRecord::NodeRecoveryStatus state,
6575                                 Uint32 *node_waited_for,
6576                                 Uint64 *time_since_state_start,
6577                                 NodeRecord::NodeRecoveryStatus *max_status)
6578 {
6579   ndbassert(NdbTick_IsValid(now));
6580   ndbassert(NdbTick_IsValid(state_start_time));
6581 
6582   if (state > (*max_status))
6583   {
6584     jam();
6585     (*time_since_state_start) =
6586       NdbTick_Elapsed(state_start_time, now).milliSec();
6587     (*max_status) = state;
6588     (*node_waited_for) = nodeId;
6589   }
6590   else if (state == (*max_status))
6591   {
6592     jam();
6593     Uint64 loc_time_since_state_start;
6594     loc_time_since_state_start =
6595       NdbTick_Elapsed(state_start_time, now).milliSec();
6596     if (loc_time_since_state_start > (*time_since_state_start))
6597     {
6598       jam();
6599       (*time_since_state_start) = loc_time_since_state_start;
6600       (*node_waited_for) = nodeId;
6601     }
6602   }
6603 }
6604 
calculate_most_recent_node(Uint32 nodeId,NDB_TICKS state_start_time,NodeRecord::NodeRecoveryStatus state,Uint32 * most_recent_node,NDB_TICKS * most_recent_start_time,NodeRecord::NodeRecoveryStatus * most_recent_state)6605 void Dbdih::calculate_most_recent_node(
6606                         Uint32 nodeId,
6607                         NDB_TICKS state_start_time,
6608                         NodeRecord::NodeRecoveryStatus state,
6609                         Uint32 *most_recent_node,
6610                         NDB_TICKS *most_recent_start_time,
6611                         NodeRecord::NodeRecoveryStatus *most_recent_state)
6612 {
6613   ndbassert(NdbTick_IsValid(state_start_time));
6614   if ((*most_recent_node) == 0)
6615   {
6616     /* No state set, set this as state */
6617     jam();
6618   }
6619   else if ((*most_recent_state) == state)
6620   {
6621     jam();
6622     /* Same state as before, use most recent */
6623     if (NdbTick_Compare((*most_recent_start_time),
6624                         state_start_time) > 0)
6625     {
6626       jam();
6627       return;
6628     }
6629     jam();
6630   }
6631   else if ((*most_recent_state) == NodeRecord::NODE_ACTIVE)
6632   {
6633     /* Old state from non-master, new from master, use this one */
6634     jam();
6635   }
6636   else if ((*most_recent_state) > state)
6637   {
6638     /**
6639      * Two master states, use the latest (this one)
6640      * Latest is the one with the lowest state since
6641      * the older one has progressed longer.
6642      */
6643     jam();
6644   }
6645   else
6646   {
6647     /* Ignore this state, we already have a better one */
6648     jam();
6649     return;
6650   }
6651   (*most_recent_state) = state;
6652   (*most_recent_start_time) = state_start_time;
6653   (*most_recent_node) = nodeId;
6654   return;
6655 }
6656 
#if 0
/* Useful debug function when trying to find overwrite of node record */
/* Disabled: to use it, also declare it in Dbdih.hpp and change #if 0. */
void Dbdih::check_all_node_recovery_timers(void)
{
  Uint32 nodeId;
  /* Node ids are 1-based; id 0 is never a data node. */
  for (nodeId = 1; nodeId <= m_max_node_id; nodeId++)
  {
    ndbassert(check_node_recovery_timers(nodeId));
  }
}
#endif
6668 
/**
 * Debug-build sanity check of a node's recovery timestamps.
 *
 * For the node's current recovery status, verify that every timestamp
 * that must already have been set is valid. The switch deliberately
 * falls through so that each state also checks the timestamps of all of
 * its predecessor states: one cascade covers the master-tracked states
 * (RESTART_COMPLETED down to ALLOCATED_NODE_ID), one covers the
 * non-master NODE_* states, and one covers the failure states.
 *
 * Always returns true so it can be used inside ndbassert(); the actual
 * checking (ndbrequire) is compiled in only for VM_TRACE/ERROR_INSERT
 * builds and crashes the node on violation.
 */
bool Dbdih::check_node_recovery_timers(Uint32 nodeId)
{
  NodeRecordPtr nodePtr;
  nodePtr.i = nodeId;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);

#if defined VM_TRACE || defined ERROR_INSERT
  switch (nodePtr.p->nodeRecoveryStatus)
  {
  /* Master-tracked state cascade: latest state first. */
  case NodeRecord::RESTART_COMPLETED:
    ndbrequire(NdbTick_IsValid(nodePtr.p->restartCompletedTime));
    // Fallthrough
  case NodeRecord::WAIT_SUMA_HANDOVER:
    ndbrequire(NdbTick_IsValid(nodePtr.p->waitSumaHandoverTime));
    // Fallthrough
  case NodeRecord::WAIT_LCP_FOR_RESTART:
    ndbrequire(NdbTick_IsValid(nodePtr.p->waitLCPForRestartTime));
    // Fallthrough
  case NodeRecord::COPY_FRAGMENTS_STARTED:
    ndbrequire(NdbTick_IsValid(nodePtr.p->copyFragmentsStartedTime));
    // Fallthrough
  case NodeRecord::EXECUTE_REDO_LOG_COMPLETED:
    ndbrequire(NdbTick_IsValid(nodePtr.p->startBuildIndexTime));
    // Fallthrough
  case NodeRecord::UNDO_DD_COMPLETED:
    ndbrequire(NdbTick_IsValid(nodePtr.p->startExecREDOLogTime));
    // Fallthrough
  case NodeRecord::RESTORE_FRAG_COMPLETED:
    ndbrequire(NdbTick_IsValid(nodePtr.p->startUndoDDTime));
    // Fallthrough
  case NodeRecord::LOCAL_RECOVERY_STARTED:
    ndbrequire(NdbTick_IsValid(nodePtr.p->startDatabaseRecoveryTime));
    // Fallthrough
  case NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP:
    ndbrequire(NdbTick_IsValid(nodePtr.p->includeNodeInLCPAndGCPTime));
    // Fallthrough
  case NodeRecord::COPY_DICT_TO_STARTING_NODE:
    ndbrequire(NdbTick_IsValid(nodePtr.p->copyDictToStartingNodeTime));
    // Fallthrough
  case NodeRecord::WAIT_LCP_TO_COPY_DICT:
    ndbrequire(NdbTick_IsValid(nodePtr.p->waitLCPToCopyDictTime));
    // Fallthrough
  case NodeRecord::START_PERMITTED:
    ndbrequire(NdbTick_IsValid(nodePtr.p->startPermittedTime));
    // Fallthrough
  case NodeRecord::NDBCNTR_STARTED:
    ndbrequire(NdbTick_IsValid(nodePtr.p->ndbcntrStartedTime));
    // Fallthrough
  case NodeRecord::NDBCNTR_START_WAIT:
    ndbrequire(NdbTick_IsValid(nodePtr.p->ndbcntrStartWaitTime));
    // Fallthrough
  case NodeRecord::INCLUDED_IN_HB_PROTOCOL:
    ndbrequire(NdbTick_IsValid(nodePtr.p->includedInHBProtocolTime));
    // Fallthrough
  case NodeRecord::ALLOCATED_NODE_ID:
    ndbrequire(NdbTick_IsValid(nodePtr.p->allocatedNodeIdTime));
    ndbrequire(NdbTick_IsValid(nodePtr.p->nodeFailCompletedTime));
    ndbrequire(NdbTick_IsValid(nodePtr.p->nodeFailTime));
    break;
  /* Non-master state cascade. */
  case NodeRecord::NODE_ACTIVE:
    ndbrequire(NdbTick_IsValid(nodePtr.p->nodeActiveTime));
    // Fallthrough
  case NodeRecord::NODE_IN_LCP_WAIT_STATE:
    ndbrequire(NdbTick_IsValid(nodePtr.p->nodeInLCPWaitStateTime));
    // Fallthrough
  case NodeRecord::NODE_GETTING_SYNCHED:
    ndbrequire(NdbTick_IsValid(nodePtr.p->nodeGettingSynchedTime));
    // Fallthrough
  case NodeRecord::NODE_GETTING_INCLUDED:
    ndbrequire(NdbTick_IsValid(nodePtr.p->nodeGettingIncludedTime));
    // Fallthrough
  case NodeRecord::NODE_GETTING_PERMIT:
    ndbrequire(NdbTick_IsValid(nodePtr.p->nodeGettingPermitTime));
    ndbrequire(NdbTick_IsValid(nodePtr.p->nodeFailCompletedTime));
    ndbrequire(NdbTick_IsValid(nodePtr.p->nodeFailTime));
    break;
  /* Failure state cascade. */
  case NodeRecord::NODE_FAILURE_COMPLETED:
    ndbrequire(NdbTick_IsValid(nodePtr.p->nodeFailCompletedTime));
    // Fallthrough
  case NodeRecord::NODE_FAILED:
    ndbrequire(NdbTick_IsValid(nodePtr.p->nodeFailTime));
    break;
  default:
    /* States with no timestamp obligations (e.g. NODE_NOT_RESTARTED_YET). */
    jam();
  }
#endif
  return true;
}
6757 
6758 /**
6759  * We want to stall the LCP start if any node is encountering the place where
6760  * we need to participate in an LCP to complete our restart. If any node is
6761  * close to reaching this state we want to block the LCP until it has reached
6762  * this state.
6763  */
bool Dbdih::check_stall_lcp_start(void)
{
  const NDB_TICKS now = c_current_time = NdbTick_getCurrentTicks();
  /**
   * The following variables are calculated to measure the node closest to
   * reaching the WAIT_LCP_FOR_RESTART state.
   */
  NodeRecord::NodeRecoveryStatus max_status = NodeRecord::NOT_DEFINED_IN_CLUSTER;
  Uint64 time_since_state_start = 0;
  Uint32 node_waited_for = 0;
  NDB_TICKS state_start_time;

  /**
   * This is the node we will use to estimate the time remaining. If no such
   * node exists, then we have no measurements to use and we will have to
   * fall back to heuristics. We also store the state and time of this variable
   * to get the most recent estimate.
   */
  NodeRecord::NodeRecoveryStatus most_recent_node_status =
    NodeRecord::ALLOCATED_NODE_ID;
  Uint32 most_recent_node = 0;
  NDB_TICKS most_recent_node_start_time;

  /**
   * If the estimated time until we reach the WAIT_LCP_FOR_RESTART state is
   * higher than the below value, then we won't wait at all, we will start
   * the LCP immediately in this case.
   */
  Uint64 lcp_max_wait_time = 0;
  Uint64 lcp_stall_time = 0;

  /**
   * If we don't find any most recent node, then should we fall back to
   * heuristics? We fall back to heuristics when we have nodes in early
   * stages of node restart that could potentially move through those
   * stages rapidly.
   */
  NodeRecordPtr nodePtr;

  Uint64 time_remaining;
  Uint64 estimated_time;

  NdbTick_Invalidate(&most_recent_node_start_time);
  NdbTick_Invalidate(&state_start_time);

  // Never stall beyond the cap; if we already have, start the LCP now.
  if (check_for_too_long_wait(lcp_max_wait_time,
                              lcp_stall_time,
                              now))
  {
    jam();
    goto immediate_start_label;
  }

  /**
   * It is ok to wait before starting the new LCP, we will go through the
   * data nodes and see if we have reasons to wait.
   */
  for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++)
  {
    ptrAss(nodePtr, nodeRecord);
    switch (nodePtr.p->nodeRecoveryStatus)
    {
      case NodeRecord::NOT_DEFINED_IN_CLUSTER:
      case NodeRecord::NODE_NOT_RESTARTED_YET:
      {
        jam();
        /**
         * We have no useful information about estimated time remaining
         * and we're not restarting this node currently. Simply continue.
         */
        break;
      }
      /**
       * The states NODE_ACTIVE, RESTART_COMPLETED, WAIT_LCP_FOR_RESTART and
       * WAIT_SUMA_HANDOVER can all be used to estimate the time remaining
       * for the node restarts still running. We use the most recent estimate,
       * the WAIT_LCP_FOR_RESTART being most recent, then WAIT_SUMA_HANDOVER,
       * then RESTART_COMPLETED and finally NODE_ACTIVE.
       */
      case NodeRecord::NODE_ACTIVE:
      {
        jam();
        state_start_time = nodePtr.p->nodeActiveTime;
        calculate_most_recent_node(nodePtr.i,
                                   state_start_time,
                                   nodePtr.p->nodeRecoveryStatus,
                                   &most_recent_node,
                                   &most_recent_node_start_time,
                                   &most_recent_node_status);
        break;
      }
      case NodeRecord::RESTART_COMPLETED:
      {
        jam();
        state_start_time = nodePtr.p->restartCompletedTime;
        calculate_most_recent_node(nodePtr.i,
                                   state_start_time,
                                   nodePtr.p->nodeRecoveryStatus,
                                   &most_recent_node,
                                   &most_recent_node_start_time,
                                   &most_recent_node_status);
        break;
      }
      case NodeRecord::WAIT_SUMA_HANDOVER:
      {
        jam();
        state_start_time = nodePtr.p->waitSumaHandoverTime;
        calculate_most_recent_node(nodePtr.i,
                                   state_start_time,
                                   nodePtr.p->nodeRecoveryStatus,
                                   &most_recent_node,
                                   &most_recent_node_start_time,
                                   &most_recent_node_status);
        break;
      }
      case NodeRecord::WAIT_LCP_FOR_RESTART:
      {
        jam();
        state_start_time = nodePtr.p->waitLCPForRestartTime;
        ndbassert(NdbTick_IsValid(nodePtr.p->includeNodeInLCPAndGCPTime));
        ndbassert(NdbTick_IsValid(nodePtr.p->copyDictToStartingNodeTime));
        calculate_most_recent_node(nodePtr.i,
                                   state_start_time,
                                   nodePtr.p->nodeRecoveryStatus,
                                   &most_recent_node,
                                   &most_recent_node_start_time,
                                   &most_recent_node_status);
        break;
      }
      /**
       * The following are states where we expect a node restart to either
       * be ongoing or to very soon start up.
       *
       * The states ranging from NDBCNTR_STARTED to COPY_FRAGMENTS_STARTED
       * are states that can be used to estimate the time remaining until
       * someone reaches the WAIT_LCP_FOR_RESTART state. We get the state
       * and time in this state for the node that has proceeded the
       * furthest in the restart. The other states are less good for
       * estimating the time remaining but will still be used with some
       * extra heuristics.
       */
      case NodeRecord::NODE_FAILED:
      {
        jam();
        state_start_time = nodePtr.p->nodeFailTime;
        calculate_time_remaining(nodePtr.i,
                                 state_start_time,
                                 now,
                                 nodePtr.p->nodeRecoveryStatus,
                                 &node_waited_for,
                                 &time_since_state_start,
                                 &max_status);
        break;
      }
      case NodeRecord::NODE_FAILURE_COMPLETED:
      {
        jam();
        state_start_time = nodePtr.p->nodeFailCompletedTime;
        calculate_time_remaining(nodePtr.i,
                                 state_start_time,
                                 now,
                                 nodePtr.p->nodeRecoveryStatus,
                                 &node_waited_for,
                                 &time_since_state_start,
                                 &max_status);
        break;
      }
      case NodeRecord::ALLOCATED_NODE_ID:
      {
        jam();
        state_start_time = nodePtr.p->allocatedNodeIdTime;
        calculate_time_remaining(nodePtr.i,
                                 state_start_time,
                                 now,
                                 nodePtr.p->nodeRecoveryStatus,
                                 &node_waited_for,
                                 &time_since_state_start,
                                 &max_status);
        break;
      }
      case NodeRecord::INCLUDED_IN_HB_PROTOCOL:
      {
        jam();
        state_start_time = nodePtr.p->includedInHBProtocolTime;
        calculate_time_remaining(nodePtr.i,
                                 state_start_time,
                                 now,
                                 nodePtr.p->nodeRecoveryStatus,
                                 &node_waited_for,
                                 &time_since_state_start,
                                 &max_status);
        break;
      }
      case NodeRecord::NDBCNTR_START_WAIT:
      {
        jam();
        state_start_time = nodePtr.p->ndbcntrStartWaitTime;
        calculate_time_remaining(nodePtr.i,
                                 state_start_time,
                                 now,
                                 nodePtr.p->nodeRecoveryStatus,
                                 &node_waited_for,
                                 &time_since_state_start,
                                 &max_status);
        break;
      }
      case NodeRecord::NDBCNTR_STARTED:
      {
        jam();
        state_start_time = nodePtr.p->ndbcntrStartedTime;
        calculate_time_remaining(nodePtr.i,
                                 state_start_time,
                                 now,
                                 nodePtr.p->nodeRecoveryStatus,
                                 &node_waited_for,
                                 &time_since_state_start,
                                 &max_status);
        break;
      }
      case NodeRecord::START_PERMITTED:
      {
        jam();
        state_start_time = nodePtr.p->startPermittedTime;
        calculate_time_remaining(nodePtr.i,
                                 state_start_time,
                                 now,
                                 nodePtr.p->nodeRecoveryStatus,
                                 &node_waited_for,
                                 &time_since_state_start,
                                 &max_status);
        break;
      }
      case NodeRecord::WAIT_LCP_TO_COPY_DICT:
      {
        jam();
        state_start_time = nodePtr.p->waitLCPToCopyDictTime;
        calculate_time_remaining(nodePtr.i,
                                 state_start_time,
                                 now,
                                 nodePtr.p->nodeRecoveryStatus,
                                 &node_waited_for,
                                 &time_since_state_start,
                                 &max_status);
        break;
      }
      case NodeRecord::COPY_DICT_TO_STARTING_NODE:
      {
        jam();
        state_start_time = nodePtr.p->copyDictToStartingNodeTime;
        calculate_time_remaining(nodePtr.i,
                                 state_start_time,
                                 now,
                                 nodePtr.p->nodeRecoveryStatus,
                                 &node_waited_for,
                                 &time_since_state_start,
                                 &max_status);
        break;
      }
      case NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP:
      {
        jam();
        state_start_time = nodePtr.p->includeNodeInLCPAndGCPTime;
        calculate_time_remaining(nodePtr.i,
                                 state_start_time,
                                 now,
                                 nodePtr.p->nodeRecoveryStatus,
                                 &node_waited_for,
                                 &time_since_state_start,
                                 &max_status);
        break;
      }
      case NodeRecord::LOCAL_RECOVERY_STARTED:
      {
        jam();
        state_start_time = nodePtr.p->startDatabaseRecoveryTime;
        calculate_time_remaining(nodePtr.i,
                                 state_start_time,
                                 now,
                                 nodePtr.p->nodeRecoveryStatus,
                                 &node_waited_for,
                                 &time_since_state_start,
                                 &max_status);
        break;
      }
      case NodeRecord::RESTORE_FRAG_COMPLETED:
      {
        jam();
        state_start_time = nodePtr.p->startUndoDDTime;
        calculate_time_remaining(nodePtr.i,
                                 state_start_time,
                                 now,
                                 nodePtr.p->nodeRecoveryStatus,
                                 &node_waited_for,
                                 &time_since_state_start,
                                 &max_status);
        break;
      }
      case NodeRecord::UNDO_DD_COMPLETED:
      {
        jam();
        state_start_time = nodePtr.p->startExecREDOLogTime;
        calculate_time_remaining(nodePtr.i,
                                 state_start_time,
                                 now,
                                 nodePtr.p->nodeRecoveryStatus,
                                 &node_waited_for,
                                 &time_since_state_start,
                                 &max_status);
        break;
      }
      case NodeRecord::EXECUTE_REDO_LOG_COMPLETED:
      {
        jam();
        state_start_time = nodePtr.p->startBuildIndexTime;
        calculate_time_remaining(nodePtr.i,
                                 state_start_time,
                                 now,
                                 nodePtr.p->nodeRecoveryStatus,
                                 &node_waited_for,
                                 &time_since_state_start,
                                 &max_status);
        break;
      }
      case NodeRecord::COPY_FRAGMENTS_STARTED:
      {
        jam();
        state_start_time = nodePtr.p->copyFragmentsStartedTime;
        calculate_time_remaining(nodePtr.i,
                                 state_start_time,
                                 now,
                                 nodePtr.p->nodeRecoveryStatus,
                                 &node_waited_for,
                                 &time_since_state_start,
                                 &max_status);
        break;
      }
      default:
      {
        jamLine(nodePtr.p->nodeRecoveryStatus);
        /* The states only used on non-masters should never occur here */
        ndbabort();
      }
    }
  }
  if (node_waited_for == 0)
  {
    jam();
    /* No restart is ongoing, we can safely proceed with starting the LCP. */
    goto immediate_start_label;
  }
  if (most_recent_node == 0)
  {
    jam();
    /**
     * We have restarts ongoing, but we have no node that can be used to
     * estimate the remaining time. In this case we use a heuristic which
     * means we're willing to wait for 25% of the max wait time (about
     * 7% of the time to execute an LCP). If this wait is sufficient for a
     * node to reach WAIT_LCP_FOR_RESTART we immediately get more recent
     * estimate and can make more intelligent estimates at that time.
     */
    lcp_max_wait_time *= MAX_PERCENTAGE_ADJUSTMENT_FOR_NO_ESTIMATE;
    lcp_max_wait_time /= 100;
    if (lcp_stall_time > lcp_max_wait_time)
    {
      jam();
      goto immediate_start_label;
    }
    else
    {
      jam();
      goto wait_label;
    }
  }

  /**
   * A node exists which has estimates on times to execute the node restart.
   * A node restart exists as well. We will estimate whether it makes sense
   * to delay the LCP for a while more at this time.
   */
  nodePtr.i = most_recent_node;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
  jamLine(most_recent_node);
  jamLine(node_waited_for);

  if (nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_ACTIVE)
  {
    /**
     * We have only access to a node where we gathered measurements during
     * the time we were non-master node. We transfer times from non-master
     * timers to master timers as best estimates to use below in our
     * calculations. We also change the max_status to ensure that we read
     * the correct timer when doing the calculations.
     *
     * Also we don't measure any time since state start since our
     * calculations are very rough and it would take a lot of logic to get
     * a good estimate of the time since the state start according to the
     * stats gathered as non-master.
     *
     * Also given that our estimates are less accurate we will decrease the
     * maximum wait time by 50%.
     */
    if (max_status < NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP)
    {
      jam();
      max_status = NodeRecord::NDBCNTR_STARTED;
      nodePtr.p->ndbcntrStartedTime = nodePtr.p->nodeGettingPermitTime;
    }
    else if (max_status < NodeRecord::COPY_FRAGMENTS_STARTED)
    {
      jam();
      max_status = NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP;
      nodePtr.p->includeNodeInLCPAndGCPTime =
        nodePtr.p->nodeGettingIncludedTime;
    }
    else
    {
      jam();
      max_status = NodeRecord::COPY_FRAGMENTS_STARTED;
      nodePtr.p->copyFragmentsStartedTime = nodePtr.p->nodeGettingSynchedTime;
    }
    nodePtr.p->waitLCPForRestartTime = nodePtr.p->nodeInLCPWaitStateTime;
    time_since_state_start = 0;
    lcp_max_wait_time *= MAX_PERCENTAGE_ADJUSTMENT_FOR_EARLY_START_PHASES;
    lcp_max_wait_time /= 100;
  }

  /**
   * Calculate estimated time remaining from start of the max state we've
   * seen. Each estimate is the historical elapsed time, on the most recent
   * reference node, from entering that state until it reached
   * WAIT_LCP_FOR_RESTART.
   */
  switch (max_status)
  {
    case NodeRecord::NODE_FAILED:
    case NodeRecord::NODE_FAILURE_COMPLETED:
    case NodeRecord::ALLOCATED_NODE_ID:
    case NodeRecord::INCLUDED_IN_HB_PROTOCOL:
    case NodeRecord::NDBCNTR_START_WAIT:
    {
      jam();
      /**
       * Estimate a complete restart, these states have wait states that are
       * hard to estimate impact of. So here we simply want a measurement
       * whether it pays off to wait, we also decrease the maximum wait time
       * to decrease likelihood we will actually wait.
       */
      lcp_max_wait_time *= 50;
      lcp_max_wait_time /= 100;
      estimated_time = NdbTick_Elapsed(nodePtr.p->ndbcntrStartedTime,
                              nodePtr.p->waitLCPForRestartTime).milliSec();
      break;
    }
    case NodeRecord::NDBCNTR_STARTED:
    {
      jam();
      estimated_time = NdbTick_Elapsed(nodePtr.p->ndbcntrStartedTime,
                              nodePtr.p->waitLCPForRestartTime).milliSec();
      break;
    }
    case NodeRecord::START_PERMITTED:
    {
      jam();
      estimated_time = NdbTick_Elapsed(nodePtr.p->startPermittedTime,
                              nodePtr.p->waitLCPForRestartTime).milliSec();
      break;
    }
    case NodeRecord::WAIT_LCP_TO_COPY_DICT:
    {
      jam();
      estimated_time = NdbTick_Elapsed(nodePtr.p->waitLCPToCopyDictTime,
                              nodePtr.p->waitLCPForRestartTime).milliSec();
      break;
    }
    case NodeRecord::COPY_DICT_TO_STARTING_NODE:
    {
      jam();
      estimated_time = NdbTick_Elapsed(nodePtr.p->copyDictToStartingNodeTime,
                              nodePtr.p->waitLCPForRestartTime).milliSec();
      break;
    }
    case NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP:
    {
      jam();
      estimated_time = NdbTick_Elapsed(nodePtr.p->includeNodeInLCPAndGCPTime,
                              nodePtr.p->waitLCPForRestartTime).milliSec();
      break;
    }
    case NodeRecord::LOCAL_RECOVERY_STARTED:
    {
      jam();
      estimated_time = NdbTick_Elapsed(nodePtr.p->startDatabaseRecoveryTime,
                              nodePtr.p->waitLCPForRestartTime).milliSec();
      break;
    }
    case NodeRecord::RESTORE_FRAG_COMPLETED:
    {
      jam();
      estimated_time = NdbTick_Elapsed(nodePtr.p->startUndoDDTime,
                              nodePtr.p->waitLCPForRestartTime).milliSec();
      break;
    }
    case NodeRecord::UNDO_DD_COMPLETED:
    {
      jam();
      estimated_time = NdbTick_Elapsed(nodePtr.p->startExecREDOLogTime,
                              nodePtr.p->waitLCPForRestartTime).milliSec();
      break;
    }
    case NodeRecord::EXECUTE_REDO_LOG_COMPLETED:
    {
      jam();
      estimated_time = NdbTick_Elapsed(nodePtr.p->startBuildIndexTime,
                              nodePtr.p->waitLCPForRestartTime).milliSec();
      break;
    }
    case NodeRecord::COPY_FRAGMENTS_STARTED:
    {
      jam();
      estimated_time = NdbTick_Elapsed(nodePtr.p->copyFragmentsStartedTime,
                              nodePtr.p->waitLCPForRestartTime).milliSec();
      break;
    }
    default:
    {
      jamLine(max_status);
      ndbabort();
      return true; /* Will never reach here, silence compiler warnings */
    }
  }

  // time_remaining = estimate minus what the waited-for node has already
  // spent in its current state (clamped at zero).
  if (estimated_time < time_since_state_start)
  {
    jam();
    time_remaining = 0;
  }
  else
  {
    jam();
    time_remaining = estimated_time - time_since_state_start;
  }
  if (time_remaining > lcp_max_wait_time)
  {
    jam();
    goto immediate_start_label;
  }

wait_label:
  /**
   * We exit from the routine to check for stalling LCPs with a decision
   * to stall or continue stalling. We ensure that we output proper logs
   * about this decision every now and then and that we record the proper
   * information about the stalling decisions.
   */
  jam();
  if (c_lcpState.lcpStallStart == 0)
  {
    jam();
    c_lcpState.m_start_lcp_check_time = now;
  }
  if (c_lcpState.lcpStallStart == 0 ||
      node_waited_for != c_lcpState.stall_node_waiting_for ||
      NdbTick_Elapsed(c_lcpState.lastLogTime, now).milliSec() >
      Uint64(1200000))
  {
    /**
     * Output a log message every time we start stalling, every time we
     * change node waiting for, and after each further 1200 seconds of
     * stalling (the 1200000 ms constant above).
     * NOTE(review): an earlier comment said "2 mins" which would be
     * 120000 ms — confirm whether the constant or the old comment
     * reflects the intended log interval.
     */
    jam();
    c_lcpState.lastLogTime = now;
    infoEvent("Stall LCP, LCP time = %u secs,"
              " wait for Node%u, state %s",
              Uint32(c_lcpState.m_lcp_time / 1000),
              node_waited_for,
              get_status_str(max_status));
    infoEvent("Stall LCP: current stall time: %u secs,"
              " max wait time:%u secs",
              Uint32(lcp_stall_time/1000),
              Uint32(lcp_max_wait_time/1000));
  }
  c_lcpState.lcpStallStart = 1;
  c_lcpState.stall_node_waiting_for = node_waited_for;
  return true;

immediate_start_label:
  /**
   * We quit waiting for starting the LCP, we will start immediately.
   * This will be recorded as a start LCP, so no need for special
   * logging message for this. Simply reset the stall state.
   */
  c_lcpState.lcpStallStart = 0;
  return false;
}
7356 
7357 const char*
get_status_str(NodeRecord::NodeRecoveryStatus status)7358 Dbdih::get_status_str(NodeRecord::NodeRecoveryStatus status)
7359 {
7360   const char *status_str;
7361   switch (status)
7362   {
7363   case NodeRecord::ALLOCATED_NODE_ID:
7364     status_str="Allocated node id";
7365     break;
7366   case NodeRecord::INCLUDED_IN_HB_PROTOCOL:
7367     status_str="Included in heartbeat protocol";
7368     break;
7369   case NodeRecord::NDBCNTR_START_WAIT:
7370     status_str="Wait for NDBCNTR master permit";
7371     break;
7372   case NodeRecord::NDBCNTR_STARTED:
7373     status_str="NDBCNTR master permitted us";
7374     break;
7375   case NodeRecord::NODE_GETTING_PERMIT:
7376   case NodeRecord::START_PERMITTED:
7377     status_str="All nodes permitted us";
7378     break;
7379   case NodeRecord::WAIT_LCP_TO_COPY_DICT:
7380     status_str="Wait for LCP complete to copy meta data";
7381     break;
7382   case NodeRecord::COPY_DICT_TO_STARTING_NODE:
7383     status_str="Copy meta data to start node";
7384     break;
7385   case NodeRecord::NODE_GETTING_INCLUDED:
7386   case NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP:
7387     status_str="Include node in LCP/GCP protocols";
7388     break;
7389   case NodeRecord::LOCAL_RECOVERY_STARTED:
7390     status_str="Restore fragments ongoing";
7391     break;
7392   case NodeRecord::RESTORE_FRAG_COMPLETED:
7393     status_str="Undo Disk data ongoing";
7394     break;
7395   case NodeRecord::UNDO_DD_COMPLETED:
7396     status_str="Execute REDO logs ongoing";
7397     break;
7398   case NodeRecord::EXECUTE_REDO_LOG_COMPLETED:
7399     status_str="Build indexes ongoing";
7400     break;
7401   case NodeRecord::NODE_GETTING_SYNCHED:
7402   case NodeRecord::COPY_FRAGMENTS_STARTED:
7403     status_str="Synchronize start node with live nodes";
7404     break;
7405   case NodeRecord::NODE_IN_LCP_WAIT_STATE:
7406   case NodeRecord::WAIT_LCP_FOR_RESTART:
7407     status_str="Wait LCP to ensure durability";
7408     break;
7409   case NodeRecord::WAIT_SUMA_HANDOVER:
7410     status_str="Wait handover of subscriptions";
7411     break;
7412   case NodeRecord::NODE_ACTIVE:
7413   case NodeRecord::RESTART_COMPLETED:
7414     status_str="Restart completed";
7415     break;
7416   case NodeRecord::NODE_FAILED:
7417     status_str="Node failed, fail handling ongoing";
7418     break;
7419   case NodeRecord::NODE_FAILURE_COMPLETED:
7420     status_str="Node failure handling complete";
7421     break;
7422   case NodeRecord::NODE_NOT_RESTARTED_YET:
7423     status_str="Initial state";
7424     break;
7425   default:
7426     jamLine(status);
7427     ndbabort();
7428     return NULL; /* Will never reach here, silence compiler warnings */
7429   }
7430   return status_str;
7431 }
7432 
7433 /**
7434  * Fill the table with the following data:
7435  * All the times are reported in seconds.
7436  *
7437  * NodeRestartStatus: This is a string which is derived from the
7438  *  nodeRecoveryStatus.
7439  *
7440  * CompleteFailTime: Time to complete the node failure.
7441  * AllocatedNodeIdTime: Time from completing node failure until we have
7442  *   allocated a node id again.
7443  * IncludeHeartbeatProtocolTime: Time from allocating node id until we
7444  *   have been included in the heartbeat protocol.
7445  * NdbcntrStartWaitTime: Time from being included in the heartbeat
7446  *   protocol until we have been set to wait for NDBCNTR master to
7447  *   allow us to continue starting.
7448  * NdbcntrStartedTime: Time from we start waiting for NDBCNTR master
7449  *   to accept us into the cluster until we are accepted into the cluster.
7450  * StartPermittedTime: Time from we are accepted by NDBCNTR master to
7451  *   start until we have received Start permit from all nodes.
7452  * WaitLCPToCopyDictTime: Time from all nodes permit us to start until we
7453  *   have finished waiting for LCP to complete before we copy the meta
7454  *   data in the cluster.
7455  * CopyToDictStartingNodeTime: Time from we have been allowed to start
7456  *   copying meta data until we have completed this.
7457  * IncludeNodeInLCPAndGCPTime: Time from we have copied the meta data
7458  *   until we have stopped the GCP protocol and have been included into
7459  *   the LCP and GCP protocol by all nodes.
7460  * LocalRecoveryTime: Time from being included until we have fully completed
7461  *   the Local Recovery in a node.
7462  * RestoreFragmentTime:
7463  * Time to restore all fragments from local files generated by the LCPs.
7464  * UndoDDTime:
7465  * Time to run Disk Data UNDO log on all restored fragments.
7466  * ExecREDOLogTime:
7467  * Time to execute the REDO log on all restored fragments.
7468  * BuildIndexTime:
7469  * Time to rebuild indexes on all restored fragments.
7470  * CopyFragmentsTime: Time from completing Local Recovery until all recent data
7471  *   have been copied from alive nodes to starting node.
7472  * WaitSumaHandoverTime: Time from being fully up-to-date until we have
7473  *   completed the handover of replication subscriptions.
7474  * Total recovery time:
7475  * Total time from node failure completed until we are started again.
7476  *
7477  * For nodes that have states set when we were not yet master we will only
7478  * report a few times:
7479  * StartPermittedTime: Time from node completed the node failure until our
7480  *   node permitted the node to start.
7481  * IncludeNodeInLCPAndGCPTime: Time from we permitted the node to start until
7482  *   we completed including the node in the LCP and GCP protocol.
7483  * LocalRecoveryTime: Time from we were included in the LCP and GCP protocol until
7484  *   we started copying the fragments.
7485  * CopyFragmentsTime: Time from we started synchronizing the starting node
7486  *   until we completed the node restart.
7487  *
7488  * Any time not happened yet will be reported as 0.
7489  */
write_zero_columns(Ndbinfo::Row & row,Uint32 num_rows)7490 void Dbdih::write_zero_columns(Ndbinfo::Row &row, Uint32 num_rows)
7491 {
7492   for (Uint32 i = 0; i < num_rows; i++)
7493   {
7494     jam();
7495     row.write_uint32(Uint32(0));
7496   }
7497   return;
7498 }
7499 
/**
 * Fill one row of the ndbinfo restart_info table with the restart timing
 * information recorded for one node.
 *
 * The row consists of the node id, the recovery status (as string and as
 * number) followed by 19 timing columns, each converted from milliseconds
 * to seconds. For every phase the node has not yet reached, the remaining
 * timing columns are written as zero via write_zero_columns; the count
 * passed there shrinks by one for each phase already reported.
 *
 * Nodes in state NODE_ACTIVE had their state transitions recorded before
 * we became master, so only a subset of times is available for them; such
 * rows are produced by handle_before_master instead.
 */
void Dbdih::fill_row_with_node_restart_status(NodeRecordPtr nodePtr,
                                              Ndbinfo::Row &row)
{
  Uint64 elapsed;
  NodeRecord::NodeRecoveryStatus status = nodePtr.p->nodeRecoveryStatus;
  row.write_uint32(nodePtr.i);
  const char *status_str = get_status_str(status);
  row.write_string(status_str);
  row.write_uint32(Uint32(nodePtr.p->nodeRecoveryStatus));

  if (status == NodeRecord::NODE_ACTIVE)
  {
    /* Recorded while we were not yet master: partial data only. */
    handle_before_master(nodePtr, row);
    return;
  }
  if (status == NodeRecord::NODE_FAILED)
  {
    /* Node failure not yet completed: no timing data at all. */
    write_zero_columns(row, 19);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->nodeFailTime,
                            nodePtr.p->nodeFailCompletedTime).milliSec();
  elapsed/= 1000;
  /* Time to complete node failure */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::NODE_FAILURE_COMPLETED)
  {
    write_zero_columns(row, 18);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->nodeFailCompletedTime,
                            nodePtr.p->allocatedNodeIdTime).milliSec();
  elapsed/= 1000;
  /* Time to allocate node id */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::ALLOCATED_NODE_ID)
  {
    write_zero_columns(row, 17);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->allocatedNodeIdTime,
                            nodePtr.p->includedInHBProtocolTime).milliSec();
  elapsed/= 1000;
  /* Time to include in HB Protocol */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::INCLUDED_IN_HB_PROTOCOL)
  {
    write_zero_columns(row, 16);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->includedInHBProtocolTime,
                            nodePtr.p->ndbcntrStartWaitTime).milliSec();
  elapsed/= 1000;
  /* Time until wait for ndbcntr master */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::NDBCNTR_START_WAIT)
  {
    write_zero_columns(row, 15);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->ndbcntrStartWaitTime,
                            nodePtr.p->ndbcntrStartedTime).milliSec();
  elapsed/= 1000;
  /* Time wait for NDBCNTR master */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::NDBCNTR_STARTED)
  {
    write_zero_columns(row, 14);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->ndbcntrStartedTime,
                            nodePtr.p->startPermittedTime).milliSec();
  elapsed/= 1000;
  /* Time to get start permitted */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::START_PERMITTED)
  {
    write_zero_columns(row, 13);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->startPermittedTime,
                            nodePtr.p->waitLCPToCopyDictTime).milliSec();
  elapsed/= 1000;
  /* Time to wait for LCP to copy meta data */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::WAIT_LCP_TO_COPY_DICT)
  {
    write_zero_columns(row, 12);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->waitLCPToCopyDictTime,
                            nodePtr.p->copyDictToStartingNodeTime).milliSec();
  elapsed/= 1000;
  /* Time to copy meta data */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::COPY_DICT_TO_STARTING_NODE)
  {
    write_zero_columns(row, 11);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->copyDictToStartingNodeTime,
                            nodePtr.p->includeNodeInLCPAndGCPTime).milliSec();
  elapsed/= 1000;
  /* Time to include node in GCP+LCP protocols */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::INCLUDE_NODE_IN_LCP_AND_GCP)
  {
    write_zero_columns(row, 10);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->includeNodeInLCPAndGCPTime,
                            nodePtr.p->startDatabaseRecoveryTime).milliSec();
  elapsed/= 1000;
  /* Time for starting node to request local recovery */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::LOCAL_RECOVERY_STARTED)
  {
    write_zero_columns(row, 9);
    return;
  }

  /**
   * Total time of local recovery. This is only known once the node has
   * progressed to copying fragments; until then report 0 but continue
   * with the per-phase columns below.
   */
  if (status < NodeRecord::COPY_FRAGMENTS_STARTED)
  {
    row.write_uint32(Uint32(0));
  }
  else
  {
    elapsed = NdbTick_Elapsed(nodePtr.p->startDatabaseRecoveryTime,
                              nodePtr.p->copyFragmentsStartedTime).milliSec();
    elapsed/= 1000;
    row.write_uint32(Uint32(elapsed));
  }

  /* Restore time is measured from the start of the database recovery. */
  elapsed = NdbTick_Elapsed(nodePtr.p->startDatabaseRecoveryTime,
                            nodePtr.p->startUndoDDTime).milliSec();
  elapsed/= 1000;
  /* Time to restore fragments */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::RESTORE_FRAG_COMPLETED)
  {
    write_zero_columns(row, 7);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->startUndoDDTime,
                            nodePtr.p->startExecREDOLogTime).milliSec();
  elapsed/= 1000;
  /* Time to UNDO disk data parts */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::UNDO_DD_COMPLETED)
  {
    write_zero_columns(row, 6);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->startExecREDOLogTime,
                            nodePtr.p->startBuildIndexTime).milliSec();
  elapsed/= 1000;
  /* Time to execute REDO logs */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::EXECUTE_REDO_LOG_COMPLETED)
  {
    write_zero_columns(row, 5);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->startBuildIndexTime,
                            nodePtr.p->copyFragmentsStartedTime).milliSec();
  elapsed/= 1000;
  /* Time to build indexes */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::COPY_FRAGMENTS_STARTED)
  {
    write_zero_columns(row, 4);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->copyFragmentsStartedTime,
                            nodePtr.p->waitLCPForRestartTime).milliSec();
  elapsed/= 1000;
  /* Time to synchronize starting node with alive nodes */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::WAIT_LCP_FOR_RESTART)
  {
    write_zero_columns(row, 3);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->waitLCPForRestartTime,
                            nodePtr.p->waitSumaHandoverTime).milliSec();
  elapsed/= 1000;
  /* Time to wait for completion of LCPs */
  row.write_uint32(Uint32(elapsed));

  if (status == NodeRecord::WAIT_SUMA_HANDOVER)
  {
    write_zero_columns(row, 2);
    return;
  }
  elapsed = NdbTick_Elapsed(nodePtr.p->waitSumaHandoverTime,
                            nodePtr.p->restartCompletedTime).milliSec();
  elapsed/= 1000;
  /* Time to handover subscriptions to starting node */
  row.write_uint32(Uint32(elapsed));

  /* Final column: full span from node failure to restart completed. */
  elapsed = NdbTick_Elapsed(nodePtr.p->nodeFailTime,
                            nodePtr.p->restartCompletedTime).milliSec();
  elapsed/= 1000;
  /* Total recovery time */
  row.write_uint32(Uint32(elapsed));

  return;
}
7724 
handle_before_master(NodeRecordPtr nodePtr,Ndbinfo::Row & row)7725 void Dbdih::handle_before_master(NodeRecordPtr nodePtr,
7726                                  Ndbinfo::Row &row)
7727 {
7728   Uint64 elapsed;
7729 
7730   /* Time to complete node failure */
7731   elapsed = NdbTick_Elapsed(nodePtr.p->nodeFailTime,
7732                             nodePtr.p->nodeFailCompletedTime).milliSec();
7733   elapsed/= 1000;
7734   row.write_uint32(Uint32(elapsed));
7735 
7736   /**
7737    * No report on
7738    * 1) Allocate node id
7739    * 2) Include in heartbeat protocol
7740    * 3) Wait for NDBCNTR master
7741    * 4) Time until ok from NDBCNTR master
7742    */
7743   row.write_uint32(Uint32(0));
7744   row.write_uint32(Uint32(0));
7745   row.write_uint32(Uint32(0));
7746   row.write_uint32(Uint32(0));
7747 
7748   /* Time to get from failure to start permitted */
7749   elapsed = NdbTick_Elapsed(nodePtr.p->nodeFailTime,
7750                             nodePtr.p->nodeGettingPermitTime).milliSec();
7751   elapsed/= 1000;
7752   row.write_uint32(Uint32(elapsed));
7753 
7754   /**
7755    * No report on
7756    * 1) Time to wait for LCP to copy meta data
7757    * 2) Time to copy meta data
7758    */
7759   row.write_uint32(Uint32(0));
7760   row.write_uint32(Uint32(0));
7761 
7762   /* Time from getting start permitted to getting included */
7763   elapsed = NdbTick_Elapsed(nodePtr.p->nodeGettingPermitTime,
7764                             nodePtr.p->nodeGettingIncludedTime).milliSec();
7765   elapsed/= 1000;
7766   row.write_uint32(Uint32(elapsed));
7767 
7768   /**
7769    * No report on
7770    * 1) Time for starting node to request local recovery
7771    */
7772   row.write_uint32(Uint32(0));
7773 
7774   /* Time for local recovery */
7775   elapsed = NdbTick_Elapsed(nodePtr.p->nodeGettingIncludedTime,
7776                             nodePtr.p->nodeGettingSynchedTime).milliSec();
7777   elapsed/= 1000;
7778   row.write_uint32(Uint32(elapsed));
7779 
7780   /**
7781    * No report on
7782    * 1) Restore fragment time
7783    * 2) UNDO DD time
7784    * 3) Execute REDO log time
7785    * 4) Build index time
7786    */
7787   row.write_uint32(Uint32(0));
7788   row.write_uint32(Uint32(0));
7789   row.write_uint32(Uint32(0));
7790   row.write_uint32(Uint32(0));
7791 
7792   /* Time to synchronize starting node with alive nodes */
7793   elapsed = NdbTick_Elapsed(nodePtr.p->nodeGettingSynchedTime,
7794                             nodePtr.p->nodeInLCPWaitStateTime).milliSec();
7795   elapsed/= 1000;
7796   row.write_uint32(Uint32(elapsed));
7797 
7798   /**
7799    * No report on
7800    * 1) Time to wait for LCP to be restorable as a node
7801    * 2) Time to handover subscriptions
7802    */
7803   row.write_uint32(Uint32(0));
7804   row.write_uint32(Uint32(0));
7805 
7806   /* Total time from node failure to node restarted */
7807   elapsed = NdbTick_Elapsed(nodePtr.p->nodeFailTime,
7808                             nodePtr.p->nodeActiveTime).milliSec();
7809   elapsed/= 1000;
7810   row.write_uint32(Uint32(elapsed));
7811 
7812   return;
7813 }
7814 
/**
 * DBINFO scan handler for the ndbinfo tables served by DBDIH:
 *
 *  - RESTART_INFO: node restart timing; reported only from the master
 *    node's view and only while this node is fully started.
 *  - TABLE_DIST_STATUS / TABLE_DIST_STATUS_ALL: per-table distribution
 *    state; the non-ALL variant is answered only by the master.
 *  - TABLE_FRAGMENTS / TABLE_FRAGMENTS_ALL: per-fragment replica
 *    placement; non-ALL variant answered only by the master.
 *  - TABLE_REPLICAS / TABLE_REPLICAS_ALL: per-replica LCP/GCI details
 *    for both stored and old stored replicas; non-ALL variant answered
 *    only by the master.
 *
 * The scan cursor (cursor->data[0]) holds the position to resume from:
 * a node id for RESTART_INFO, a table id for TABLE_DIST_STATUS, and for
 * the fragment/replica tables the table id packed in the low 16 bits
 * with the fragment id in the high 16 bits. Output volume is bounded by
 * the Ndbinfo::Ratelimit; when the limit is hit a scan break is sent
 * with the next cursor position and the scan resumes later.
 */
void Dbdih::execDBINFO_SCANREQ(Signal *signal)
{
  DbinfoScanReq req = *(DbinfoScanReq*)signal->theData;
  const Ndbinfo::ScanCursor *cursor =
    CAST_CONSTPTR(Ndbinfo::ScanCursor, DbinfoScan::getCursorPtr(&req));
  Ndbinfo::Ratelimit rl;
  bool sent_any = false;
  jamEntry();

  switch (req.tableId)
  {
  case Ndbinfo::RESTART_INFO_TABLEID:
  {
    if (isMaster() == false)
    {
      /* Only report from master node's view on restarts */
      break;
    }
    if (getNodeState().startLevel != NodeState::SL_STARTED)
    {
      jam();
      /* Ignore when we are starting up or shutting down */
      break;
    }

    NodeRecordPtr nodePtr;
    jam();
    nodePtr.i = cursor->data[0];
    if (nodePtr.i == 0)
    {
      nodePtr.i = 1; /* Ignore node 0 */
    }
    else if (nodePtr.i > m_max_node_id)
    {
      break;
    }
    for (; nodePtr.i <= m_max_node_id; nodePtr.i++)
    {
      ptrAss(nodePtr, nodeRecord);
      /* Skip nodes with no restart history to report. */
      if (nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_NOT_RESTARTED_YET ||
          nodePtr.p->nodeRecoveryStatus == NodeRecord::NOT_DEFINED_IN_CLUSTER)
        continue;
      jamLine(nodePtr.i);
      sent_any = true;
      Ndbinfo::Row row(signal, req);
      fill_row_with_node_restart_status(nodePtr, row);
      ndbinfo_send_row(signal, req, row, rl);
      if (rl.need_break(req))
      {
        jam();
        /* Resume from the next node id. */
        ndbinfo_send_scan_break(signal, req, rl, nodePtr.i + 1);
        return;
      }
    }
    if (cursor->data[0] == 0 && !sent_any)
    {
      /* No nodes had any node restart data to report */
      jam();
      break;
    }
    break;
  }
  case Ndbinfo::TABLE_DIST_STATUS_TABLEID:
  case Ndbinfo::TABLE_DIST_STATUS_ALL_TABLEID:
  {
    jam();
    TabRecordPtr tabPtr;
    tabPtr.i = cursor->data[0];
    /* The non-ALL table is only served by the master node. */
    if (!isMaster() && req.tableId == Ndbinfo::TABLE_DIST_STATUS_TABLEID)
    {
      jam();
      break;
    }
    for ( ; tabPtr.i < ctabFileSize ; tabPtr.i++)
    {
      jamLine(tabPtr.i);
      ptrAss(tabPtr, tabRecord);
      if (tabPtr.p->tabStatus != TabRecord::TS_IDLE)
      {
        jam();
        Ndbinfo::Row row(signal, req);
        row.write_uint32(cownNodeId);
        row.write_uint32(tabPtr.i);
        row.write_uint32(tabPtr.p->tabCopyStatus);
        row.write_uint32(tabPtr.p->tabUpdateState);
        row.write_uint32(tabPtr.p->tabLcpStatus);
        row.write_uint32(tabPtr.p->tabStatus);
        row.write_uint32(tabPtr.p->tabStorage);
        row.write_uint32(tabPtr.p->tableType);
        row.write_uint32(tabPtr.p->partitionCount);
        row.write_uint32(tabPtr.p->totalfragments);
        row.write_uint32(tabPtr.p->m_scan_count[0]);
        row.write_uint32(tabPtr.p->m_scan_count[1]);
        row.write_uint32(tabPtr.p->m_scan_reorg_flag);
        ndbinfo_send_row(signal, req, row, rl);
        if (rl.need_break(req))
        {
          jam();
          /* Resume from the next table id. */
          ndbinfo_send_scan_break(signal, req, rl, tabPtr.i + 1);
          return;
        }
      }
    }
    break;
  }
  case Ndbinfo::TABLE_FRAGMENTS_TABLEID:
  case Ndbinfo::TABLE_FRAGMENTS_ALL_TABLEID:
  {
    jam();
    TabRecordPtr tabPtr;
    FragmentstorePtr fragPtr;
    /* Cursor packs table id (low 16 bits) and fragment id (high 16). */
    tabPtr.i = cursor->data[0] & 0xFFFF;
    Uint32 fragId = cursor->data[0] >> 16;
    if (!isMaster() && req.tableId == Ndbinfo::TABLE_FRAGMENTS_TABLEID)
    {
      jam();
      break;
    }
    for ( ; tabPtr.i < ctabFileSize ; tabPtr.i++)
    {
      jamLine(tabPtr.i);
      ptrAss(tabPtr, tabRecord);
      /* Only report real tables and unique indexes that are in use. */
      if (tabPtr.p->tabStatus != TabRecord::TS_IDLE &&
          (DictTabInfo::isTable(tabPtr.p->tableType) ||
           DictTabInfo::isUniqueIndex(tabPtr.p->tableType)))
      {
        for ( ; fragId < tabPtr.p->totalfragments ; fragId++)
        {
          jamLine(fragId);
          getFragstore(tabPtr.p, fragId, fragPtr);
          Ndbinfo::Row row(signal, req);
          row.write_uint32(cownNodeId);
          row.write_uint32(tabPtr.i);
          row.write_uint32(fragPtr.p->partition_id);
          row.write_uint32(fragPtr.p->fragId);
          /* Partition order only meaningful for fully replicated tables. */
          if ((tabPtr.p->m_flags & TabRecord::TF_FULLY_REPLICATED) == 0)
          {
            row.write_uint32(0);
          }
          else
          {
            row.write_uint32(findPartitionOrder(tabPtr.p, fragPtr));
          }

          row.write_uint32(fragPtr.p->m_log_part_id);
          row.write_uint32(fragPtr.p->fragReplicas);
          row.write_uint32(fragPtr.p->activeNodes[0]);
          row.write_uint32(fragPtr.p->preferredPrimary);

          /* Backup replica node columns: 0 when the replica is absent. */
          if (fragPtr.p->noStoredReplicas > 1)
          {
            row.write_uint32(fragPtr.p->activeNodes[1]);
          }
          else
          {
            row.write_uint32(0);
          }

          if (fragPtr.p->noStoredReplicas > 2)
          {
            row.write_uint32(fragPtr.p->activeNodes[2]);
          }
          else
          {
            row.write_uint32(0);
          }

          if (fragPtr.p->noStoredReplicas > 3)
          {
            row.write_uint32(fragPtr.p->activeNodes[3]);
          }
          else
          {
            row.write_uint32(0);
          }

          row.write_uint32(fragPtr.p->noStoredReplicas);
          row.write_uint32(fragPtr.p->noOldStoredReplicas);
          row.write_uint32(fragPtr.p->noLcpReplicas);
          ndbinfo_send_row(signal, req, row, rl);
          if (rl.need_break(req))
          {
            jam();
            /* Pack next position: same table, next fragment. */
            Uint32 new_cursor = tabPtr.i + ((fragId + 1) << 16);
            ndbinfo_send_scan_break(signal, req, rl, new_cursor);
            return;
          }
        }
      }
      /* Restart fragment scan at 0 for the next table. */
      fragId = 0;
    }
    break;
  }
  case Ndbinfo::TABLE_REPLICAS_TABLEID:
  case Ndbinfo::TABLE_REPLICAS_ALL_TABLEID:
  {
    jam();
    TabRecordPtr tabPtr;
    FragmentstorePtr fragPtr;
    ReplicaRecordPtr replicaPtr;
    /* Cursor packs table id (low 16 bits) and fragment id (high 16). */
    tabPtr.i = cursor->data[0] & 0xFFFF;
    Uint32 fragId = cursor->data[0] >> 16;
    if (!isMaster() && req.tableId == Ndbinfo::TABLE_REPLICAS_TABLEID)
    {
      jam();
      break;
    }
    for ( ; tabPtr.i < ctabFileSize ; tabPtr.i++)
    {
      jamLine(tabPtr.i);
      ptrAss(tabPtr, tabRecord);
      if (tabPtr.p->tabStatus != TabRecord::TS_IDLE &&
          (DictTabInfo::isTable(tabPtr.p->tableType) ||
           DictTabInfo::isUniqueIndex(tabPtr.p->tableType)))
      {
        jamLine(fragId);
        jamLine(tabPtr.p->totalfragments);
        jamLine(tabPtr.p->partitionCount);
        for ( ; fragId < tabPtr.p->totalfragments ; fragId++)
        {
          jamLine(fragId);
          getFragstore(tabPtr.p, fragId, fragPtr);
          /* i == 0: stored replicas list, i == 1: old stored replicas. */
          for (Uint32 i = 0; i < 2; i++)
          {
            if (i == 0)
            {
              jam();
              replicaPtr.i = fragPtr.p->storedReplicas;
            }
            else
            {
              jam();
              replicaPtr.i = fragPtr.p->oldStoredReplicas;
            }
            while (replicaPtr.i != RNIL)
            {
              jam();
              Ndbinfo::Row row(signal, req);
              c_replicaRecordPool.getPtr(replicaPtr);
              row.write_uint32(cownNodeId);
              row.write_uint32(tabPtr.i);
              row.write_uint32(fragPtr.p->fragId);
              row.write_uint32(replicaPtr.p->initialGci);
              row.write_uint32(replicaPtr.p->procNode);
              row.write_uint32(replicaPtr.p->lcpOngoingFlag);
              row.write_uint32(replicaPtr.p->noCrashedReplicas);
              /* Find the valid LCP with the highest lcpId for this replica. */
              Uint32 lastId = 0;
              Uint32 maxLcpId = 0;
              for (Uint32 j = 0; j < MAX_LCP_USED; j++)
              {
                jam();
                if (replicaPtr.p->lcpStatus[j] == ZVALID)
                {
                  jam();
                  if (replicaPtr.p->lcpId[j] > maxLcpId)
                  {
                    jam();
                    lastId = j;
                    maxLcpId = replicaPtr.p->lcpId[j];
                  }
                }
              }
              /* Report the latest LCP and the one before it. */
              Uint32 prevId = prevLcpNo(lastId);
              row.write_uint32(replicaPtr.p->maxGciStarted[lastId]);
              row.write_uint32(replicaPtr.p->maxGciCompleted[lastId]);
              row.write_uint32(replicaPtr.p->lcpId[lastId]);
              row.write_uint32(replicaPtr.p->maxGciStarted[prevId]);
              row.write_uint32(replicaPtr.p->maxGciCompleted[prevId]);
              row.write_uint32(replicaPtr.p->lcpId[prevId]);
              Uint32 last_replica_id = replicaPtr.p->noCrashedReplicas;
              row.write_uint32(replicaPtr.p->createGci[last_replica_id]);
              row.write_uint32(replicaPtr.p->replicaLastGci[last_replica_id]);
              /* 1 = current stored replica, 0 = old stored replica. */
              row.write_uint32(i == 0 ? 1 : 0);
              ndbinfo_send_row(signal, req, row, rl);
              replicaPtr.i = replicaPtr.p->nextPool;
            }
          }
          if (rl.need_break(req))
          {
            jam();
            Uint32 new_cursor = tabPtr.i + ((fragId + 1) << 16);
            ndbinfo_send_scan_break(signal, req, rl, new_cursor);
            return;
          }
        }
        fragId = 0;
      }
    }
    break;
  }
  default:
    break;
  }
  ndbinfo_send_scan_conf(signal, req, rl);
}
8110 /* END Node Recovery Status Module */
8111 
8112 /*****************************************************************************/
8113 /***********     NODE ADDING  MODULE                             *************/
8114 /***********     CODE TO HANDLE TAKE OVER                        *************/
8115 /*****************************************************************************/
8116 // A take over can be initiated by a number of things:
8117 // 1) A node restart, usually the node takes over itself but can also take
8118 //    over somebody else if its own data was already taken over
8119 // 2) At system restart it is necessary to use the take over code to recover
8120 //    nodes which had too old checkpoints to be restorable by the usual
8121 //    restoration from disk.
8122 // 3) When a node has missed too many local checkpoints and is decided by the
8123 //    master to be taken over by a hot spare node that sits around waiting
8124 //    for this to happen. (This is not yet implemented).
8125 //
8126 // To support multiple node failures efficiently the code is written such that
8127 // only one take over can handle transitions in state but during a copy
// fragment other take overs can perform state transitions.
8129 /*****************************************************************************/
startTakeOver(Signal * signal,Uint32 startNode,Uint32 nodeTakenOver,const StartCopyReq * req)8130 void Dbdih::startTakeOver(Signal* signal,
8131                           Uint32 startNode,
8132                           Uint32 nodeTakenOver,
8133                           const StartCopyReq* req)
8134 {
8135   jam();
8136 
8137   TakeOverRecordPtr takeOverPtr;
8138 
8139   ndbrequire(c_takeOverPool.seize(takeOverPtr));
8140   takeOverPtr.p->startGci = SYSFILE->lastCompletedGCI[startNode];
8141   takeOverPtr.p->restorableGci = SYSFILE->lastCompletedGCI[startNode];
8142   takeOverPtr.p->toStartingNode = startNode;
8143   takeOverPtr.p->toFailedNode = nodeTakenOver;
8144   takeOverPtr.p->toCurrentTabref = 0;
8145   takeOverPtr.p->toCurrentFragid = 0;
8146 
8147   ndbrequire(req != NULL);
8148   takeOverPtr.p->m_flags = req->flags;
8149   takeOverPtr.p->m_senderData = req->senderData;
8150   takeOverPtr.p->m_senderRef = req->senderRef;
8151 
8152   takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_START_FRAGMENTS;
8153   nr_start_fragments(signal, takeOverPtr);
8154 }//Dbdih::startTakeOver()
8155 
8156 void
nr_start_fragments(Signal * signal,TakeOverRecordPtr takeOverPtr)8157 Dbdih::nr_start_fragments(Signal* signal,
8158 			  TakeOverRecordPtr takeOverPtr)
8159 {
8160   Uint32 loopCount = 0 ;
8161   TabRecordPtr tabPtr;
8162   const Uint32 MaxFragsToSearch = 100;
8163   while (loopCount++ < MaxFragsToSearch) {
8164     tabPtr.i = takeOverPtr.p->toCurrentTabref;
8165     if (tabPtr.i >= ctabFileSize) {
8166       jam();
8167       nr_run_redo(signal, takeOverPtr);
8168       return;
8169     }//if
8170     ptrAss(tabPtr, tabRecord);
8171     if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE ||
8172 	tabPtr.p->tabStorage != TabRecord::ST_NORMAL)
8173     {
8174       jam();
8175       takeOverPtr.p->toCurrentFragid = 0;
8176       takeOverPtr.p->toCurrentTabref++;
8177       continue;
8178     }//if
8179     Uint32 fragId = takeOverPtr.p->toCurrentFragid;
8180     if (fragId >= tabPtr.p->totalfragments) {
8181       jam();
8182       takeOverPtr.p->toCurrentFragid = 0;
8183       takeOverPtr.p->toCurrentTabref++;
8184       continue;
8185     }//if
8186     FragmentstorePtr fragPtr;
8187     getFragstore(tabPtr.p, fragId, fragPtr);
8188     ReplicaRecordPtr loopReplicaPtr;
8189     loopReplicaPtr.i = fragPtr.p->oldStoredReplicas;
8190     while (loopReplicaPtr.i != RNIL) {
8191       c_replicaRecordPool.getPtr(loopReplicaPtr);
8192       if (loopReplicaPtr.p->procNode == takeOverPtr.p->toStartingNode) {
8193         jam();
8194 	nr_start_fragment(signal, takeOverPtr, loopReplicaPtr);
8195         loopCount+= MaxFragsToSearch; /* Take a break */
8196 	break;
8197       } else {
8198         jam();
8199         loopReplicaPtr.i = loopReplicaPtr.p->nextPool;
8200       }//if
8201     }//while
8202     takeOverPtr.p->toCurrentFragid++;
8203   }//while
8204   signal->theData[0] = DihContinueB::ZTO_START_FRAGMENTS;
8205   signal->theData[1] = takeOverPtr.i;
8206   sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
8207 }
8208 
8209 void
nr_start_fragment(Signal * signal,TakeOverRecordPtr takeOverPtr,ReplicaRecordPtr replicaPtr)8210 Dbdih::nr_start_fragment(Signal* signal,
8211 			 TakeOverRecordPtr takeOverPtr,
8212 			 ReplicaRecordPtr replicaPtr)
8213 {
8214   Uint32 i;
8215   Uint32 maxLcpId = 0;
8216   Uint32 maxLcpIndex = ~0;
8217 
8218   Uint32 gci = 0;
8219   Uint32 restorableGCI = takeOverPtr.p->restorableGci;
8220 
8221 #if defined VM_TRACE || defined ERROR_INSERT
8222   ndbout_c("tab: %d frag: %d replicaP->nextLcp: %d",
8223 	   takeOverPtr.p->toCurrentTabref,
8224 	   takeOverPtr.p->toCurrentFragid,
8225 	   replicaPtr.p->nextLcp);
8226 #endif
8227 
8228   /**
8229    * Search for an LCP that can be used to restore.
8230    * For each LCP that is VALID we need to check if
8231    * it is restorable. It is restorable if the
8232    * node has a REDO log interval that can be used
8233    * to restore some GCI. For this to happen we have
8234    * to have a REDO log in the node that starts
8235    * before the last completed GCI in the LCP and that
8236    * goes on until at least until the maximum GCI
8237    * started in the LCP.
8238    *
8239    * We also verify that the local checkpoint was
8240    * performed within the interval that of this node's
8241    * last completed GCI. This ensures that we avoid
8242    * problems in cases of partial system restarts.
8243    */
8244   Uint32 idx = prevLcpNo(replicaPtr.p->nextLcp);
8245   for(i = 0; i<MAX_LCP_USED; i++, idx = prevLcpNo(idx))
8246   {
8247     Int32 j = replicaPtr.p->noCrashedReplicas - 1;
8248 #if defined VM_TRACE || defined ERROR_INSERT
8249     ndbout_c("scanning idx: %d lcpId: %d crashed replicas: %u %s",
8250              idx, replicaPtr.p->lcpId[idx],
8251              replicaPtr.p->noCrashedReplicas,
8252              replicaPtr.p->lcpStatus[idx] == ZVALID ? "VALID" : "NOT VALID");
8253 #endif
8254     if (replicaPtr.p->lcpStatus[idx] == ZVALID)
8255     {
8256       Uint32 startGci = replicaPtr.p->maxGciCompleted[idx] + 1;
8257       Uint32 stopGci = replicaPtr.p->maxGciStarted[idx];
8258 #if defined VM_TRACE || defined ERROR_INSERT
8259       ndbout_c(" maxGciCompleted: %u maxGciStarted: %u", startGci - 1, stopGci);
8260 #endif
8261       /* The following error insert is for Bug #23602217.
8262        * It ensures that the most recent LCP is considered
8263        * non-restorable. This forces the older LCP to be
8264        * restored, which failed to happen previously.
8265        */
8266       if (ERROR_INSERTED(7248))
8267       {
8268         g_eventLogger->info("Inserting error to skip most recent LCP");
8269         if (i == 0)
8270         {
8271           continue;
8272         }
8273       }
8274       for (; j>= 0; j--)
8275       {
8276 #if defined VM_TRACE || defined ERROR_INSERT
8277 	ndbout_c("crashed replica: %d(%d) replica(createGci: %u lastGci: %d )",
8278 		 j,
8279 		 replicaPtr.p->noCrashedReplicas,
8280                  replicaPtr.p->createGci[j],
8281 		 replicaPtr.p->replicaLastGci[j]);
8282 #endif
8283 	if (replicaPtr.p->createGci[j] <= startGci &&
8284             replicaPtr.p->replicaLastGci[j] >= stopGci &&
8285             replicaPtr.p->maxGciCompleted[idx] <=
8286               SYSFILE->lastCompletedGCI[replicaPtr.p->procNode] &&
8287             replicaPtr.p->maxGciStarted[idx] <=
8288               SYSFILE->lastCompletedGCI[replicaPtr.p->procNode])
8289 	{
8290 	  maxLcpId = replicaPtr.p->lcpId[idx];
8291 	  maxLcpIndex = idx;
8292           gci = replicaPtr.p->replicaLastGci[j];
8293 	  goto done;
8294 	}
8295       }
8296     }
8297     else
8298     {
8299 #if defined VM_TRACE || defined ERROR_INSERT
8300       ndbout_c(" ");
8301 #endif
8302     }
8303   }
8304 
8305   idx = 2; // backward compat code
8306 #if defined VM_TRACE || defined ERROR_INSERT
8307   ndbout_c("- scanning idx: %d lcpId: %d", idx, replicaPtr.p->lcpId[idx]);
8308 #endif
8309   if (replicaPtr.p->lcpStatus[idx] == ZVALID)
8310   {
8311     Uint32 startGci = replicaPtr.p->maxGciCompleted[idx] + 1;
8312     Uint32 stopGci = replicaPtr.p->maxGciStarted[idx];
8313     Int32 j = replicaPtr.p->noCrashedReplicas - 1;
8314     for (;j >= 0; j--)
8315     {
8316 #if defined VM_TRACE || defined ERROR_INSERT
8317       ndbout_c("crashed replica: %d(%d) replica(createGci: %u lastGci: %d )",
8318                j,
8319                replicaPtr.p->noCrashedReplicas,
8320                replicaPtr.p->createGci[j],
8321                replicaPtr.p->replicaLastGci[j]);
8322 #endif
8323       if (replicaPtr.p->createGci[j] <= startGci &&
8324           replicaPtr.p->replicaLastGci[j] >= stopGci &&
8325           replicaPtr.p->maxGciCompleted[idx] <=
8326             SYSFILE->lastCompletedGCI[replicaPtr.p->procNode] &&
8327           replicaPtr.p->maxGciStarted[idx] <=
8328             SYSFILE->lastCompletedGCI[replicaPtr.p->procNode])
8329       {
8330         maxLcpId = replicaPtr.p->lcpId[idx];
8331         maxLcpIndex = idx;
8332         gci = replicaPtr.p->replicaLastGci[j];
8333         goto done;
8334       }
8335     }
8336   }
8337 
8338 done:
8339 
8340   StartFragReq *req = (StartFragReq *)signal->getDataPtrSend();
8341   req->requestInfo = StartFragReq::SFR_RESTORE_LCP;
8342   req->nodeRestorableGci = takeOverPtr.p->restorableGci;
8343   if (maxLcpIndex == ~ (Uint32) 0)
8344   {
8345     /**
8346      * we didn't find a local LCP that we can restore
8347      */
8348     jam();
8349     ndbassert(gci == 0);
8350     replicaPtr.p->m_restorable_gci = gci;
8351 
8352     req->userPtr = 0;
8353     req->userRef = reference();
8354     req->lcpNo = ZNIL;
8355     req->lcpId = 0;
8356     req->tableId = takeOverPtr.p->toCurrentTabref;
8357     req->fragId = takeOverPtr.p->toCurrentFragid;
8358     req->noOfLogNodes = 0;
8359 
8360     if (c_2pass_inr && cstarttype == NodeState::ST_INITIAL_NODE_RESTART)
8361     {
8362       /**
8363        * Check if we can make 2-phase copy
8364        *   1) non-transaction, (after we rebuild indexes)
8365        *   2) transaction (maintaining indexes during rebuild)
       *      where the transactional copies everything >= startGci
8367        *
8368        * NOTE: c_2pass_inr is only set if all nodes in cluster currently
8369        *       supports this
8370        */
8371 
8372       if (takeOverPtr.p->startGci == 0)
8373       {
8374         jam();
8375         /**
8376          * Set a startGci to currently lastCompletedGCI of master
8377          *   any value will do...as long as subsequent transactional copy
8378          *   will be using it (scanning >= this value)
8379          */
8380         takeOverPtr.p->startGci = SYSFILE->lastCompletedGCI[cmasterNodeId];
8381       }
8382 
8383       TabRecordPtr tabPtr;
8384       tabPtr.i = takeOverPtr.p->toCurrentTabref;
8385       ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
8386 
8387       FragmentstorePtr fragPtr;
8388       getFragstore(tabPtr.p, takeOverPtr.p->toCurrentFragid, fragPtr);
8389       Uint32 nodes[MAX_REPLICAS];
8390       extractNodeInfo(jamBuffer(), fragPtr.p, nodes);
8391 
8392       req->lqhLogNode[0] = nodes[0]; // Source
8393       req->requestInfo = StartFragReq::SFR_COPY_FRAG;
8394       replicaPtr.p->m_restorable_gci = takeOverPtr.p->startGci;
8395     }
8396 
8397     if (req->requestInfo == StartFragReq::SFR_RESTORE_LCP)
8398     {
8399       g_eventLogger->debug("node: %d tab: %d frag: %d no lcp to restore",
8400                            takeOverPtr.p->toStartingNode,
8401                            takeOverPtr.p->toCurrentTabref,
8402                            takeOverPtr.p->toCurrentFragid);
8403     }
8404     else
8405     {
8406       g_eventLogger->debug("node: %d tab: %d frag: %d copying data from %u"
8407                            " (gci: %u)",
8408                            takeOverPtr.p->toStartingNode,
8409                            takeOverPtr.p->toCurrentTabref,
8410                            takeOverPtr.p->toCurrentFragid,
8411                            req->lqhLogNode[0],
8412                            takeOverPtr.p->startGci);
8413     }
8414 
8415     BlockReference ref = numberToRef(DBLQH, takeOverPtr.p->toStartingNode);
8416     sendSignal(ref, GSN_START_FRAGREQ, signal,
8417 	       StartFragReq::SignalLength, JBB);
8418   }
8419   else
8420   {
8421     jam();
8422     if (gci != restorableGCI)
8423     {
8424       Ptr<TabRecord> tabPtr;
8425       tabPtr.i = takeOverPtr.p->toCurrentTabref;
8426       ptrAss(tabPtr, tabRecord);
8427 
8428       FragmentstorePtr fragPtr;
8429       getFragstore(tabPtr.p, takeOverPtr.p->toCurrentFragid, fragPtr);
8430       dump_replica_info(fragPtr.p);
8431     }
8432     ndbassert(gci == restorableGCI);
8433     replicaPtr.p->m_restorable_gci = gci;
8434     Uint32 startGci = replicaPtr.p->maxGciCompleted[maxLcpIndex] + 1;
8435     if (startGci > gci)
8436       startGci = gci;
8437     g_eventLogger->debug("Requesting start of fragment: "
8438              "node: %d tab: %d frag: %d restore lcp: %u(idx: %u)"
8439              " maxGciStarted: %u maxGciCompleted: %u (restorable:"
8440              " %u(%u) newestRestorableGCI: %u)",
8441              takeOverPtr.p->toStartingNode,
8442              takeOverPtr.p->toCurrentTabref,
8443              takeOverPtr.p->toCurrentFragid,
8444 	     maxLcpId,
8445              maxLcpIndex,
8446 	     replicaPtr.p->maxGciStarted[maxLcpIndex],
8447 	     replicaPtr.p->maxGciCompleted[maxLcpIndex],
8448 	     restorableGCI,
8449 	     SYSFILE->lastCompletedGCI[takeOverPtr.p->toStartingNode],
8450 	     SYSFILE->newestRestorableGCI);
8451 
8452     StartFragReq *req = (StartFragReq *)signal->getDataPtrSend();
8453     req->userPtr = 0;
8454     req->userRef = reference();
8455     req->lcpNo = maxLcpIndex;
8456     req->lcpId = maxLcpId;
8457     req->tableId = takeOverPtr.p->toCurrentTabref;
8458     req->fragId = takeOverPtr.p->toCurrentFragid;
8459     req->noOfLogNodes = 1;
8460     req->lqhLogNode[0] = takeOverPtr.p->toStartingNode;
8461     req->startGci[0] = startGci;
8462     req->lastGci[0] = gci;
8463 
8464     BlockReference ref = numberToRef(DBLQH, takeOverPtr.p->toStartingNode);
8465     sendSignal(ref, GSN_START_FRAGREQ, signal,
8466 	       StartFragReq::SignalLength, JBB);
8467 
8468     if (startGci < takeOverPtr.p->startGci)
8469     {
8470       jam();
8471       takeOverPtr.p->startGci = startGci;
8472     }
8473   }
8474 }
8475 
8476 void
nr_run_redo(Signal * signal,TakeOverRecordPtr takeOverPtr)8477 Dbdih::nr_run_redo(Signal* signal, TakeOverRecordPtr takeOverPtr)
8478 {
8479   /**
8480    * sendSTART_RECREQ uses m_sr_nodes
8481    *   and for TO during SR, we don't want to modify it
8482    *   so save/restore it
8483    */
8484   NdbNodeBitmask save = m_sr_nodes;
8485   m_sr_nodes.clear();
8486   m_sr_nodes.set(takeOverPtr.p->toStartingNode);
8487 
8488   Uint32 save_keepGCI = SYSFILE->keepGCI;
8489   if (takeOverPtr.p->startGci < SYSFILE->keepGCI)
8490   {
8491     jam();
8492     SYSFILE->keepGCI = takeOverPtr.p->startGci;
8493     g_eventLogger->info("GSN_START_RECREQ keepGci: %u (%u)",
8494                         takeOverPtr.p->startGci, save_keepGCI);
8495   }
8496 
8497   g_eventLogger->info("All start fragments sent, requesting LDM to restore"
8498                       " all fragments and to execute the REDO log to bring"
8499                       " the database to an off-line but consistent state");
8500 
8501   takeOverPtr.p->toCurrentTabref = 0;
8502   takeOverPtr.p->toCurrentFragid = 0;
8503   takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_RUN_REDO;
8504   sendSTART_RECREQ(signal, takeOverPtr.p->toStartingNode, takeOverPtr.i);
8505 
8506   m_sr_nodes = save; // restore
8507   SYSFILE->keepGCI = save_keepGCI;
8508 }
8509 
void
Dbdih::nr_start_logging(Signal* signal, TakeOverRecordPtr takeOverPtr)
{
  /**
   * Scan all tables and fragments for stored replicas residing on the
   * starting node (which is this node, see ndbrequire below) and send
   * COPY_ACTIVEREQ to the local DBLQH instance for each one, resuming
   * on COPY_ACTIVECONF. Only fragments belonging to this takeover
   * thread (see check_takeover_thread) are handled here.
   */
  Uint32 loopCount = 0 ;
  TabRecordPtr tabPtr;
  // Bounded loop: after 100 iterations a CONTINUEB resumes the scan
  // (real-time break, see send_continueb_nr_start_logging at the end).
  while (loopCount++ < 100)
  {
    tabPtr.i = takeOverPtr.p->toCurrentTabref;
    if (tabPtr.i >= ctabFileSize)
    {
      // All tables scanned: this copy thread is complete.
      jam();
      g_eventLogger->debug("Copy thread %u complete",
                          takeOverPtr.p->m_copy_thread_id);
      if (!thread_takeover_completed(signal, takeOverPtr))
      {
        // Other copy threads are still active; wait for them to finish.
        jam();
        return;
      }
      check_take_over_completed_correctly();
      g_eventLogger->info("Make On-line Database recoverable by waiting"
                          " for LCP Starting, all parallel threads have"
                          " now ceased their activity and we have a single"
                          " wait state here");

      // All threads done: continue with the main takeover record only and
      // ask the master to end the takeover, then report completion.
      takeOverPtr = c_mainTakeOverPtr;

      takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_END_TO;
      EndToReq* req = (EndToReq*)signal->getDataPtrSend();
      req->senderData = takeOverPtr.i;
      req->senderRef = reference();
      req->flags = takeOverPtr.p->m_flags;
      sendSignal(cmasterdihref, GSN_END_TOREQ,
                 signal, EndToReq::SignalLength, JBB);
      sendEND_TOREP(signal, takeOverPtr.p->toStartingNode);
      return;
    }
    ptrAss(tabPtr, tabRecord);
    // Skip tables that are not active or not using normal (logged) storage.
    if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE ||
	tabPtr.p->tabStorage != TabRecord::ST_NORMAL)
    {
      jam();
      takeOverPtr.p->toCurrentFragid = 0;
      takeOverPtr.p->toCurrentTabref++;
      continue;
    }

    Uint32 fragId = takeOverPtr.p->toCurrentFragid;
    if (fragId >= tabPtr.p->totalfragments)
    {
      // Done with this table; advance to the next one.
      jam();
      takeOverPtr.p->toCurrentFragid = 0;
      takeOverPtr.p->toCurrentTabref++;
      continue;
    }
    FragmentstorePtr fragPtr;
    getFragstore(tabPtr.p, fragId, fragPtr);

    Uint32 instanceKey = dihGetInstanceKey(fragPtr);
    if (!check_takeover_thread(takeOverPtr,
                               fragPtr,
                               instanceKey))
    {
      jam();
      /**
       * We are scanning for fragment replicas to take over, but this replica
       * was not ours to take over, it will be handled by another take over
       * thread.
       */
      takeOverPtr.p->toCurrentFragid++;
      continue;
    }

    // Look for a stored replica of this fragment on the starting node.
    ReplicaRecordPtr loopReplicaPtr;
    loopReplicaPtr.i = fragPtr.p->storedReplicas;
    while (loopReplicaPtr.i != RNIL)
    {
      c_replicaRecordPool.getPtr(loopReplicaPtr);
      if (loopReplicaPtr.p->procNode == takeOverPtr.p->toStartingNode)
      {
        jam();
        ndbrequire(loopReplicaPtr.p->procNode == getOwnNodeId());
        takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_SL_COPY_ACTIVE;

        // Ask the local LQH instance to activate the fragment; the scan
        // resumes when COPY_ACTIVECONF is received.
        BlockReference lqhRef = numberToRef(DBLQH, instanceKey,
                                            takeOverPtr.p->toStartingNode);

        CopyActiveReq * const req = (CopyActiveReq *)&signal->theData[0];
        req->userPtr = takeOverPtr.i;
        req->userRef = reference();
        req->tableId = takeOverPtr.p->toCurrentTabref;
        req->fragId = takeOverPtr.p->toCurrentFragid;
        req->distributionKey = fragPtr.p->distributionKey;
        req->flags = 0;
        sendSignal(lqhRef, GSN_COPY_ACTIVEREQ, signal,
                   CopyActiveReq::SignalLength, JBB);
        return;
      }
      else
      {
        jam();
        loopReplicaPtr.i = loopReplicaPtr.p->nextPool;
      }
    }
    takeOverPtr.p->toCurrentFragid++;
  }
  // Real-time break: continue scanning in a fresh CONTINUEB signal.
  send_continueb_nr_start_logging(signal, takeOverPtr);
}
8617 
8618 /**
8619  * Instance takeover uses a number of queues and variables to keep track of
8620  * the takeover threads.
8621  *
8622  * We start by sending START_TOREQ to the master. This is done by the
8623  * main takeover record. This is always placed in the variable
8624  * c_mainTakeOverPtr.
8625  *
8626  * After this we create a number of parallel threads. A record is created
8627  * and put into the queue:
8628  * c_activeTakeOverList
8629  * It stays there while we're scanning for fragments to take over in our
8630  * takeover thread.
8631  *
8632  * When we find an instance to take over we have two possibilities.
8633  * We can either be put into the active thread which is the variable:
8634  * c_activeThreadTakeOverPtr
8635  * If the active thread is already busy, then we are placed into the
8636  * queue:
8637  * c_queued_for_start_takeover_list
8638  * When we're taken out of the queue we are placed into the active thread.
8639  *
8640  * We are taken out of the active thread when we're sending COPY_FRAGREQ.
8641  * At this point our takeover thread is placed in the list
8642  * c_active_copy_threads_list
8643  * It stays in this list until we're done with the copying when we have
8644  * received COPY_ACTIVECONF back from the LDM instance in the starting node.
8645  *
8646  * At this point we need to update the fragment state again and we need to
8647  * become active thread again which is controlled by:
8648  * c_activeThreadTakeOverPtr
8649  * If the active thread is already busy then we use the queue
8650  * c_queued_for_commit_takeover_list
8651  * This queue has higher priority than the
8652  * c_queued_for_start_takeover_list
8653  *
8654  * After completing the update of the fragment state we are removed as active
8655  * thread and placed back in the list
8656  * c_activeTakeOverList
8657  *
8658  * We proceed with the next fragment until we're out of fragments to handle
8659  * for this thread.
8660  *
8661  * At this point we are removed from
8662  * c_activeTakeOverList
8663  * and placed into
8664  * c_completed_copy_threads_list
8665  *
8666  * If this was a system restart we will then remove all threads from the
8667  * c_completed_copy_threads_list
8668  * and only the
8669  * c_mainTakeOverPtr
8670  * record still remains.
8671  *
8672  * For normal node recovery we start a process of activating the node. We
8673  * start this process by removing the takeover thread from
8674  * c_completed_copy_threads_list
8675  * and placing the takeover thread into the list
8676  * c_active_copy_threads_list
8677  * instead.
8678  *
8679  * At every point when we need to update the fragment state we remove the
8680  * takeover record from the
8681  * c_active_copy_threads_list
8682  * and place it as the active thread record. If the active thread is
8683  * already busy then we place the record in the list
8684  * c_queued_for_commit_takeover_list
8685  *
8686  * After completing the update of the fragment state we place the record
8687  * back into the list
8688  * c_active_copy_threads_list
8689  *
8690  * When we are finally done with activating the node instance in this final
8691  * process, then we're removing the record from the
8692  * c_active_copy_threads_list
8693  * and releasing the takeover thread record to the take over pool.
8694  *
8695  * When all node instances are completed then all lists should be empty and
8696  * no thread should be active and only the main record should remain.
8697  */
8698 
8699 
8700 void
sendStartTo(Signal * signal,TakeOverRecordPtr takeOverPtr)8701 Dbdih::sendStartTo(Signal* signal, TakeOverRecordPtr takeOverPtr)
8702 {
8703   takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_START_TO;
8704 
8705   StartToReq* req = (StartToReq*)signal->getDataPtrSend();
8706   req->senderData = takeOverPtr.i;
8707   req->senderRef = reference();
8708   req->startingNodeId = takeOverPtr.p->toStartingNode;
8709   sendSignal(cmasterdihref, GSN_START_TOREQ,
8710              signal, StartToReq::SignalLength, JBB);
8711 }
8712 
8713 void
execSTART_TOREF(Signal * signal)8714 Dbdih::execSTART_TOREF(Signal* signal)
8715 {
8716   jamEntry();
8717 
8718   StartToRef* ref = (StartToRef*)signal->getDataPtr();
8719   Uint32 errCode = ref->errorCode;
8720   (void)errCode; // TODO check for "valid" error
8721 
8722   TakeOverRecordPtr takeOverPtr;
8723   c_takeOverPool.getPtr(takeOverPtr, ref->senderData);
8724 
8725   signal->theData[0] = DihContinueB::ZSEND_START_TO;
8726   signal->theData[1] = takeOverPtr.i;
8727 
8728   sendSignalWithDelay(reference(), GSN_CONTINUEB,
8729                       signal, 5000, 2);
8730 }
8731 
8732 /**
8733  * We have completed one thread's communication with the master and we're
8734  * ready to start off another which have been queued.
8735  */
8736 void
start_next_takeover_thread(Signal * signal)8737 Dbdih::start_next_takeover_thread(Signal *signal)
8738 {
8739   TakeOverRecordPtr takeOverPtr;
8740   bool dequeued_from_commit_take_over = true;
8741   bool dequeued_from_start_take_over = false;
8742 
8743   if (!c_queued_for_commit_takeover_list.removeFirst(takeOverPtr))
8744   {
8745     dequeued_from_commit_take_over = false;
8746     if (!c_queued_for_start_takeover_list.removeFirst(takeOverPtr))
8747     {
8748       jam();
8749       /**
8750        * No threads are queued up for master communication, so we can
8751        * set active to RNIL and wait for the next thread to be completed
8752        * with another step.
8753        */
8754       g_eventLogger->debug("No threads queued up");
8755       c_activeThreadTakeOverPtr.i = RNIL;
8756       return;
8757     }
8758     dequeued_from_start_take_over = true;
8759     jam();
8760   }
8761   c_activeThreadTakeOverPtr = takeOverPtr;
8762   g_eventLogger->debug("New active takeover thread: %u, state: %u",
8763                       takeOverPtr.i,
8764                       takeOverPtr.p->toSlaveStatus);
8765   if (takeOverPtr.p->toSlaveStatus ==
8766         TakeOverRecord::TO_QUEUED_UPDATE_BEFORE_STORED)
8767   {
8768     jam();
8769     ndbrequire(dequeued_from_start_take_over);
8770     takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_BEFORE_STORED;
8771     sendUpdateTo(signal, takeOverPtr);
8772   }
8773   else if (takeOverPtr.p->toSlaveStatus ==
8774              TakeOverRecord::TO_QUEUED_UPDATE_BEFORE_COMMIT)
8775   {
8776     jam();
8777     ndbrequire(dequeued_from_commit_take_over);
8778     takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_BEFORE_COMMIT;
8779     sendUpdateTo(signal, takeOverPtr);
8780   }
8781   else if (takeOverPtr.p->toSlaveStatus ==
8782              TakeOverRecord::TO_QUEUED_SL_UPDATE_FRAG_STATE)
8783   {
8784     jam();
8785     ndbrequire(dequeued_from_commit_take_over);
8786     takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_SL_UPDATE_FRAG_STATE;
8787     sendUpdateFragStateReq(signal,
8788                            takeOverPtr.p->startGci,
8789                            UpdateFragStateReq::START_LOGGING,
8790                            takeOverPtr);
8791     return;
8792   }
8793   else
8794   {
8795     ndbabort();
8796   }
8797   return;
8798 }
8799 
8800 void
init_takeover_thread(TakeOverRecordPtr takeOverPtr,TakeOverRecordPtr mainTakeOverPtr,Uint32 number_of_copy_threads,Uint32 thread_id)8801 Dbdih::init_takeover_thread(TakeOverRecordPtr takeOverPtr,
8802                             TakeOverRecordPtr mainTakeOverPtr,
8803                             Uint32 number_of_copy_threads,
8804                             Uint32 thread_id)
8805 {
8806   c_activeTakeOverList.addFirst(takeOverPtr);
8807   takeOverPtr.p->m_copy_thread_id = thread_id;
8808   takeOverPtr.p->m_number_of_copy_threads = number_of_copy_threads;
8809 
8810   takeOverPtr.p->m_flags = mainTakeOverPtr.p->m_flags;
8811   takeOverPtr.p->m_senderData = mainTakeOverPtr.p->m_senderData;
8812   takeOverPtr.p->m_senderRef = mainTakeOverPtr.p->m_senderRef;
8813 
8814   takeOverPtr.p->startGci = mainTakeOverPtr.p->startGci;
8815   takeOverPtr.p->restorableGci = mainTakeOverPtr.p->restorableGci;
8816   /* maxPage is received in PREPARE_COPY_FRAGCONF */
8817 
8818   takeOverPtr.p->toCopyNode = mainTakeOverPtr.p->toCopyNode;
8819   takeOverPtr.p->toFailedNode = mainTakeOverPtr.p->toFailedNode;
8820   takeOverPtr.p->toStartingNode = mainTakeOverPtr.p->toStartingNode;
8821 
8822   takeOverPtr.p->toStartTime = mainTakeOverPtr.p->toStartTime;
8823   takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_SELECTING_NEXT;
8824   takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MASTER_IDLE;
8825 
8826   takeOverPtr.p->toCurrentTabref = 0;
8827   takeOverPtr.p->toCurrentFragid = 0;
8828   takeOverPtr.p->toCurrentReplica = RNIL;
8829 }
8830 
8831 void
send_continueb_start_next_copy(Signal * signal,TakeOverRecordPtr takeOverPtr)8832 Dbdih::send_continueb_start_next_copy(Signal *signal,
8833                                       TakeOverRecordPtr takeOverPtr)
8834 {
8835   signal->theData[0] = DihContinueB::ZTO_START_COPY_FRAG;
8836   signal->theData[1] = takeOverPtr.i;
8837   sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
8838 }
8839 
void
Dbdih::execSTART_TOCONF(Signal* signal)
{
  // Master granted START_TOREQ: set up the main takeover record and
  // spawn the parallel copy threads.
  jamEntry();
  StartToConf * conf = (StartToConf*)signal->getDataPtr();

  TakeOverRecordPtr takeOverPtr;
  c_takeOverPool.getPtr(takeOverPtr, conf->senderData);

  CRASH_INSERTION(7133);

  /**
   * We are now allowed to start copying
   *
   * It is time to start the parallelisation phase where we have a number
   * of take over threads where each take over thread takes care of
   * a set of LDM instances. This means that each take over thread can
   * execute in parallel towards DBLQH, but we have to serialise access
   * towards the master which is designed to handle one take over thread
   * request per node at a time. So we handle multiple take overs internally
   * and towards the LDM instances, but towards the master we appear as there
   * is only one take over thread.
   *
   * This means that we need no master specific take over code to parallelize
   * copying over several LDM instances. The take over can be made parallel as
   * soon as a version with this code is started as long as the master can
   * handle parallel node recovery in general.
   */

  // The record that sent START_TOREQ becomes the main takeover record;
  // it represents all copy threads towards the master.
  c_mainTakeOverPtr = takeOverPtr;
  c_mainTakeOverPtr.p->m_number_of_copy_threads =
    c_max_takeover_copy_threads;
  c_mainTakeOverPtr.p->m_copy_threads_completed = 0;
  c_activeThreadTakeOverPtr.i = RNIL;
  check_take_over_completed_correctly();

  // Seize one takeover record per copy thread and start each scan via
  // CONTINUEB.
  for (Uint32 i = 0; i < c_max_takeover_copy_threads; i++)
  {
    /**
     * We will break the rule of not starting more than 4 signals from one
     * signal here. The reason is that we know that eventually we will start
     * the same number of parallel threads anyways and also there won't be
     * anymore parallelisation after that internally in this thread. There
     * could potentially be further parallelisation in DBLQH, but this is
     * in a number of parallel threads and thus not DIH's concern to handle.
     */
    jam();
    ndbrequire(c_takeOverPool.seize(takeOverPtr));
    init_takeover_thread(takeOverPtr,
                         c_mainTakeOverPtr,
                         c_max_takeover_copy_threads,
                         i);
    send_continueb_start_next_copy(signal, takeOverPtr);
  }
}
8895 
8896 bool
check_takeover_thread(TakeOverRecordPtr takeOverPtr,FragmentstorePtr fragPtr,Uint32 fragmentReplicaInstanceKey)8897 Dbdih::check_takeover_thread(TakeOverRecordPtr takeOverPtr,
8898                              FragmentstorePtr fragPtr,
8899                              Uint32 fragmentReplicaInstanceKey)
8900 {
8901   ndbassert(fragmentReplicaInstanceKey != 0);
8902   fragmentReplicaInstanceKey--;
8903   /**
8904    * The instance key is in reality the log part id. The log part id
8905    * is often in ndbmtd the same as the instance id. But in ndbd and
8906    * in ndbmtd with 2 LDM instances there is a difference. The
8907    * instance id is mapped in the receiving node modulo the number
8908    * of LDM instances. So we take the instance key modulo the number
8909    * of LDM instances to get the thread id to handle this takeover
8910    * thread.
8911    *
8912    * For safety we will never run more parallelism than we have in the
8913    * minimum node of the starting node and the copying node.
8914    */
8915   Uint32 nodes[MAX_REPLICAS];
8916   extractNodeInfo(jamBuffer(), fragPtr.p, nodes);
8917   Uint32 lqhWorkers = getNodeInfo(takeOverPtr.p->toStartingNode).m_lqh_workers;
8918   lqhWorkers = MIN(lqhWorkers,
8919                    getNodeInfo(nodes[0]).m_lqh_workers);
8920   lqhWorkers = MAX(lqhWorkers, 1);
8921   Uint32 instanceId = fragmentReplicaInstanceKey % lqhWorkers;
8922 
8923   if ((instanceId % takeOverPtr.p->m_number_of_copy_threads) ==
8924       takeOverPtr.p->m_copy_thread_id)
8925   {
8926     jam();
8927     return true;
8928   }
8929   else
8930   {
8931     jam();
8932     return false;
8933   }
8934 }
8935 
startNextCopyFragment(Signal * signal,Uint32 takeOverPtrI)8936 void Dbdih::startNextCopyFragment(Signal* signal, Uint32 takeOverPtrI)
8937 {
8938   TabRecordPtr tabPtr;
8939   TakeOverRecordPtr takeOverPtr;
8940   c_takeOverPool.getPtr(takeOverPtr, takeOverPtrI);
8941 
8942   Uint32 loopCount;
8943   loopCount = 0;
8944   if (ERROR_INSERTED(7159)) {
8945     loopCount = 100;
8946   }//if
8947   while (loopCount++ < 100) {
8948     tabPtr.i = takeOverPtr.p->toCurrentTabref;
8949     if (tabPtr.i >= ctabFileSize) {
8950       jam();
8951       CRASH_INSERTION(7136);
8952       toCopyCompletedLab(signal, takeOverPtr);
8953       return;
8954     }//if
8955     ptrAss(tabPtr, tabRecord);
8956     if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE){
8957       jam();
8958       takeOverPtr.p->toCurrentFragid = 0;
8959       takeOverPtr.p->toCurrentTabref++;
8960       continue;
8961     }//if
8962     Uint32 fragId = takeOverPtr.p->toCurrentFragid;
8963     if (fragId >= tabPtr.p->totalfragments) {
8964       jam();
8965       takeOverPtr.p->toCurrentFragid = 0;
8966       takeOverPtr.p->toCurrentTabref++;
8967       if (ERROR_INSERTED(7135)) {
8968         if (takeOverPtr.p->toCurrentTabref == 1) {
8969           ndbabort();
8970         }//if
8971       }//if
8972       continue;
8973     }//if
8974     FragmentstorePtr fragPtr;
8975     getFragstore(tabPtr.p, fragId, fragPtr);
8976 
8977     Uint32 instanceKey = dihGetInstanceKey(fragPtr);
8978     if (!check_takeover_thread(takeOverPtr,
8979                                fragPtr,
8980                                instanceKey))
8981     {
8982       /**
8983        * We are scanning for fragment replicas to take over, but this replica
8984        * was not ours to take over, it will be handled by another take over
8985        * thread.
8986        */
8987       jam();
8988       takeOverPtr.p->toCurrentFragid++;
8989       continue;
8990     }
8991     jam();
8992 
8993     ReplicaRecordPtr loopReplicaPtr;
8994     loopReplicaPtr.i = fragPtr.p->oldStoredReplicas;
8995     while (loopReplicaPtr.i != RNIL) {
8996       c_replicaRecordPool.getPtr(loopReplicaPtr);
8997       if (loopReplicaPtr.p->procNode == takeOverPtr.p->toFailedNode) {
8998         jam();
8999 	/* ----------------------------------------------------------------- */
9000 	/* WE HAVE FOUND A REPLICA THAT BELONGED THE FAILED NODE THAT NEEDS  */
9001 	/* TAKE OVER. WE TAKE OVER THIS REPLICA TO THE NEW NODE.             */
9002 	/* ----------------------------------------------------------------- */
9003         takeOverPtr.p->toCurrentReplica = loopReplicaPtr.i;
9004         toCopyFragLab(signal, takeOverPtr.i);
9005         return;
9006       } else if (loopReplicaPtr.p->procNode == takeOverPtr.p->toStartingNode) {
9007         jam();
9008 	/* ----------------------------------------------------------------- */
9009 	/* WE HAVE OBVIOUSLY STARTED TAKING OVER THIS WITHOUT COMPLETING IT. */
9010 	/* WE NEED TO COMPLETE THE TAKE OVER OF THIS REPLICA.                */
9011 	/* ----------------------------------------------------------------- */
9012         takeOverPtr.p->toCurrentReplica = loopReplicaPtr.i;
9013         toCopyFragLab(signal, takeOverPtr.i);
9014         return;
9015       } else {
9016         jam();
9017         loopReplicaPtr.i = loopReplicaPtr.p->nextPool;
9018       }//if
9019     }//while
9020     takeOverPtr.p->toCurrentFragid++;
9021   }//while
9022   send_continueb_start_next_copy(signal, takeOverPtr);
9023 }//Dbdih::startNextCopyFragment()
9024 
toCopyFragLab(Signal * signal,Uint32 takeOverPtrI)9025 void Dbdih::toCopyFragLab(Signal* signal, Uint32 takeOverPtrI)
9026 {
9027   TakeOverRecordPtr takeOverPtr;
9028   c_takeOverPool.getPtr(takeOverPtr, takeOverPtrI);
9029 
9030   /**
9031    * Inform starting node that TakeOver is about to start
9032    */
9033   g_eventLogger->debug("PREPARE_COPY_FRAGREQ: tab: %u, frag: %u, thread: %u",
9034     takeOverPtr.p->toCurrentTabref,
9035     takeOverPtr.p->toCurrentFragid,
9036     takeOverPtr.i);
9037   TabRecordPtr tabPtr;
9038   tabPtr.i = takeOverPtr.p->toCurrentTabref;
9039   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
9040 
9041   FragmentstorePtr fragPtr;
9042   getFragstore(tabPtr.p, takeOverPtr.p->toCurrentFragid, fragPtr);
9043   Uint32 nodes[MAX_REPLICAS];
9044   extractNodeInfo(jamBuffer(), fragPtr.p, nodes);
9045   takeOverPtr.p->toCopyNode = nodes[0];
9046 
9047   PrepareCopyFragReq* req= (PrepareCopyFragReq*)signal->getDataPtrSend();
9048   req->senderRef = reference();
9049   req->senderData = takeOverPtrI;
9050   req->tableId = takeOverPtr.p->toCurrentTabref;
9051   req->fragId = takeOverPtr.p->toCurrentFragid;
9052   req->copyNodeId = takeOverPtr.p->toCopyNode;
9053   req->startingNodeId = takeOverPtr.p->toStartingNode; // Dst
9054 
9055   Uint32 instanceKey = dihGetInstanceKey(req->tableId, req->fragId);
9056   Uint32 ref = numberToRef(DBLQH, instanceKey, takeOverPtr.p->toStartingNode);
9057 
9058   sendSignal(ref, GSN_PREPARE_COPY_FRAG_REQ, signal,
9059              PrepareCopyFragReq::SignalLength, JBB);
9060 
9061   takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_PREPARE_COPY;
9062 }
9063 
9064 void
execPREPARE_COPY_FRAG_REF(Signal * signal)9065 Dbdih::execPREPARE_COPY_FRAG_REF(Signal* signal)
9066 {
9067   jamEntry();
9068   PrepareCopyFragRef ref = *(PrepareCopyFragRef*)signal->getDataPtr();
9069 
9070   TakeOverRecordPtr takeOverPtr;
9071   c_takeOverPool.getPtr(takeOverPtr, ref.senderData);
9072 
9073   ndbrequire(takeOverPtr.p->toSlaveStatus == TakeOverRecord::TO_PREPARE_COPY);
9074 
9075   /**
9076    * Treat this as copy frag ref
9077    */
9078   CopyFragRef * cfref = (CopyFragRef*)signal->getDataPtrSend();
9079   cfref->userPtr = ref.senderData;
9080   cfref->startingNodeId = ref.startingNodeId;
9081   cfref->errorCode = ref.errorCode;
9082   cfref->tableId = ref.tableId;
9083   cfref->fragId = ref.fragId;
9084   cfref->sendingNodeId = ref.copyNodeId;
9085   takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_COPY_FRAG;
9086   execCOPY_FRAGREF(signal);
9087 }
9088 
void
Dbdih::execPREPARE_COPY_FRAG_CONF(Signal* signal)
{
  // Starting node's LQH is prepared for the fragment copy; record what it
  // told us and proceed to update the fragment state through the master.
  jamEntry();
  PrepareCopyFragConf conf = *(PrepareCopyFragConf*)signal->getDataPtr();

  TakeOverRecordPtr takeOverPtr;
  c_takeOverPool.getPtr(takeOverPtr, conf.senderData);

  TabRecordPtr tabPtr;
  FragmentstorePtr fragPtr;
  ReplicaRecordPtr replicaPtr;
  tabPtr.i = takeOverPtr.p->toCurrentTabref;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  getFragstore(tabPtr.p, takeOverPtr.p->toCurrentFragid, fragPtr);
  findReplica(replicaPtr, fragPtr.p, getOwnNodeId(), true);
  if (signal->length() == PrepareCopyFragConf::SignalLength &&
      replicaPtr.p->m_restorable_gci == 0)
  {
    /**
     * DIH had no knowledge about any LCPs, but LQH found a
     * recoverable LCP so let's use that one instead of no one
     * at all.
     */
    jam();
    replicaPtr.p->m_restorable_gci = conf.completedGci;
  }

  // maxPage is needed later for the COPY_FRAGREQ.
  takeOverPtr.p->maxPage = conf.maxPageNo;

  c_activeTakeOverList.remove(takeOverPtr);

  if (c_activeThreadTakeOverPtr.i != RNIL)
  {
    /**
     * There is already an active take over thread that is performing an
     * update of its fragment replica state through the master. We will
     * put ourselves in the c_queued_for_start_takeover_list and be
     * started as soon as possible.
     */
    jam();
    g_eventLogger->debug("QUEUED_UPDATE_BEFORE_STORED, inst: %u",
                         takeOverPtr.i);
    takeOverPtr.p->toSlaveStatus =
      TakeOverRecord::TO_QUEUED_UPDATE_BEFORE_STORED;
    c_queued_for_start_takeover_list.addLast(takeOverPtr);
    return;
  }
  /* Mark master busy before proceeding */
  c_activeThreadTakeOverPtr = takeOverPtr;

  /**
   * We need to lock fragment info...in order to later run
   * UPDATE_FRAG_STATEREQ. We will mark ourselves as the active thread
   * such that other threads will be queued up until we are ready with
   * updating the fragment state.
   */
  takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_BEFORE_STORED;
  g_eventLogger->debug("PREPARE_COPY_FRAG_CONF: thread: %u", takeOverPtr.i);
  sendUpdateTo(signal, takeOverPtr);
}
9150 
9151 void
sendUpdateTo(Signal * signal,TakeOverRecordPtr takeOverPtr)9152 Dbdih::sendUpdateTo(Signal* signal, TakeOverRecordPtr takeOverPtr)
9153 {
9154   /**
9155    * We must refer to the main takeover thread towards the master node,
9156    * but we take the data from the thread which is currently active.
9157    */
9158   g_eventLogger->debug("UPDATE_TOREQ: tab:%u, frag:%u, thread:%u, state:%u",
9159     takeOverPtr.p->toCurrentTabref,
9160     takeOverPtr.p->toCurrentFragid,
9161     takeOverPtr.i,
9162     takeOverPtr.p->toSlaveStatus);
9163   UpdateToReq* req = (UpdateToReq*)signal->getDataPtrSend();
9164   req->senderData = c_mainTakeOverPtr.i;
9165   req->senderRef = reference();
9166   req->startingNodeId = takeOverPtr.p->toStartingNode;
9167   req->copyNodeId = takeOverPtr.p->toCopyNode;
9168   req->tableId = takeOverPtr.p->toCurrentTabref;
9169   req->fragmentNo = takeOverPtr.p->toCurrentFragid;
9170   switch(takeOverPtr.p->toSlaveStatus){
9171   case TakeOverRecord::TO_UPDATE_BEFORE_STORED:
9172     jam();
9173     req->requestType = UpdateToReq::BEFORE_STORED;
9174     break;
9175   case TakeOverRecord::TO_UPDATE_AFTER_STORED:
9176     req->requestType = UpdateToReq::AFTER_STORED;
9177     break;
9178   case TakeOverRecord::TO_UPDATE_BEFORE_COMMIT:
9179     jam();
9180     req->requestType = UpdateToReq::BEFORE_COMMIT_STORED;
9181     break;
9182   case TakeOverRecord::TO_UPDATE_AFTER_COMMIT:
9183     jam();
9184     req->requestType = UpdateToReq::AFTER_COMMIT_STORED;
9185     break;
9186   default:
9187     jamLine(takeOverPtr.p->toSlaveStatus);
9188     ndbabort();
9189   }
9190   sendSignal(cmasterdihref, GSN_UPDATE_TOREQ,
9191              signal, UpdateToReq::SignalLength, JBB);
9192 }
9193 
9194 void
execUPDATE_TOREF(Signal * signal)9195 Dbdih::execUPDATE_TOREF(Signal* signal)
9196 {
9197   jamEntry();
9198   UpdateToRef* ref = (UpdateToRef*)signal->getDataPtr();
9199   Uint32 errCode = ref->errorCode;
9200   Uint32 extra = ref->extra;
9201   (void)errCode; // TODO check for "valid" error
9202 
9203   TakeOverRecordPtr takeOverPtr;
9204 
9205   ndbrequire(ref->senderData == c_mainTakeOverPtr.i);
9206   ndbrequire(c_activeThreadTakeOverPtr.i != RNIL);
9207 
9208   c_takeOverPool.getPtr(takeOverPtr, c_activeThreadTakeOverPtr.i);
9209 
9210   g_eventLogger->info("UPDATE_TOREF: thread: %u, state:%u"
9211                       ", errCode: %u, extra: %u",
9212                       takeOverPtr.i,
9213                       takeOverPtr.p->toSlaveStatus,
9214                       errCode,
9215                       extra);
9216   signal->theData[0] = DihContinueB::ZSEND_UPDATE_TO;
9217   signal->theData[1] = takeOverPtr.i;
9218 
9219   sendSignalWithDelay(reference(), GSN_CONTINUEB,
9220                       signal, 5000, 2);
9221 }
9222 
void
Dbdih::execUPDATE_TOCONF(Signal* signal)
{
  jamEntry();

  UpdateToConf* conf = (UpdateToConf*)signal->getDataPtr();

  TakeOverRecordPtr takeOverPtr;

  /**
   * We operate towards the master using the main takeover thread.
   * The CONF is however intended for the current active takeover
   * thread.
   */
  ndbrequire(conf->senderData == c_mainTakeOverPtr.i);
  ndbrequire(c_activeThreadTakeOverPtr.i != RNIL);

  c_takeOverPool.getPtr(takeOverPtr, c_activeThreadTakeOverPtr.i);

  g_eventLogger->debug("UPDATE_TOCONF: thread: %u, state:%u",
                       takeOverPtr.i,
                       takeOverPtr.p->toSlaveStatus);
  /* Advance the takeover state machine according to which master update
   * just completed. */
  switch(takeOverPtr.p->toSlaveStatus){
  case TakeOverRecord::TO_UPDATE_BEFORE_STORED:
    jam();

    CRASH_INSERTION(7154);

    /* Master accepted BEFORE_STORED: broadcast the STORED fragment state
     * to all alive nodes. */
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_FRAG_STATE_STORED;
    sendUpdateFragStateReq(signal,
                           ZINIT_CREATE_GCI,
                           UpdateFragStateReq::STORED,
                           takeOverPtr);
    return;
  case TakeOverRecord::TO_UPDATE_AFTER_STORED:
    jam();

    CRASH_INSERTION(7195);

    /* Fragment state is stored everywhere: start copying fragment data. */
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_COPY_FRAG;
    toStartCopyFrag(signal, takeOverPtr);
    return;
  case TakeOverRecord::TO_UPDATE_BEFORE_COMMIT:
    jam();

    CRASH_INSERTION(7196);

    /* Master accepted BEFORE_COMMIT: broadcast COMMIT_STORED state. */
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_FRAG_STATE_COMMIT;
    sendUpdateFragStateReq(signal,
                           takeOverPtr.p->startGci,
                           UpdateFragStateReq::COMMIT_STORED,
                           takeOverPtr);
    return;
  case TakeOverRecord::TO_UPDATE_AFTER_COMMIT:
    jam();

    CRASH_INSERTION(7197);

    /* Takeover of this fragment fully committed: release the master slot
     * so a queued thread can run, then continue with our next fragment. */
    start_next_takeover_thread(signal);
    c_activeTakeOverList.addFirst(takeOverPtr);
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_SELECTING_NEXT;
    startNextCopyFragment(signal, takeOverPtr.i);
    return;
  default:
    ndbabort();
  }
}
9290 
void
Dbdih::toStartCopyFrag(Signal* signal, TakeOverRecordPtr takeOverPtr)
{
  /**
   * Start the actual data copy of the current fragment: send COPY_FRAGREQ
   * to the LQH instance on the copy node, then release the active-thread
   * slot so a queued takeover thread can talk to the master.
   */
  TabRecordPtr tabPtr;
  tabPtr.i = takeOverPtr.p->toCurrentTabref;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

  Uint32 fragId = takeOverPtr.p->toCurrentFragid;

  FragmentstorePtr fragPtr;
  getFragstore(tabPtr.p, fragId, fragPtr);

  ReplicaRecordPtr replicaPtr;
  findReplica(replicaPtr, fragPtr.p, getOwnNodeId(), true);

  /* Pick up the GCI to restore from, then clear the field since it is
   * part of a union and will be reused. */
  Uint32 gci = replicaPtr.p->m_restorable_gci;
  replicaPtr.p->m_restorable_gci = 0; // used in union...

  Uint32 instanceKey = dihGetInstanceKey(tabPtr.i, fragId);
  BlockReference ref = numberToRef(DBLQH, instanceKey,
                                   takeOverPtr.p->toCopyNode);
  CopyFragReq * const copyFragReq = (CopyFragReq *)&signal->theData[0];
  copyFragReq->userPtr = takeOverPtr.i;
  copyFragReq->userRef = reference();
  copyFragReq->tableId = tabPtr.i;
  copyFragReq->fragId = fragId;
  copyFragReq->nodeId = takeOverPtr.p->toStartingNode;
  copyFragReq->schemaVersion = tabPtr.p->schemaVersion;
  copyFragReq->distributionKey = fragPtr.p->distributionKey;
  copyFragReq->gci = gci;
  /* The variable-length tail carries the node list followed by maxPage
   * and the transactional-copy flag. */
  Uint32 len = copyFragReq->nodeCount =
    extractNodeInfo(jamBuffer(), fragPtr.p,
                    copyFragReq->nodeList);
  copyFragReq->nodeList[len] = takeOverPtr.p->maxPage;
  copyFragReq->nodeList[len+1] = CopyFragReq::CFR_TRANSACTIONAL;
  sendSignal(ref, GSN_COPY_FRAGREQ, signal,
             CopyFragReq::SignalLength + len, JBB);
  g_eventLogger->debug("COPY_FRAGREQ: thread: %u, tab: %u, frag: %u",
    takeOverPtr.i,
    takeOverPtr.p->toCurrentTabref,
    takeOverPtr.p->toCurrentFragid);
  /* Let a queued thread use the master and track this thread as copying. */
  start_next_takeover_thread(signal);
  c_active_copy_threads_list.addFirst(takeOverPtr);
}//Dbdih::toStartCopy()
9335 
void Dbdih::sendUpdateFragStateReq(Signal* signal,
                                   Uint32 startGci,
                                   Uint32 replicaType,
                                   TakeOverRecordPtr takeOverPtr)
{
  /**
   * Broadcast UPDATE_FRAG_STATEREQ for the current fragment to the DIH
   * block of every alive node.  sendLoopMacro sets up the bookkeeping
   * that execUPDATE_FRAG_STATECONF later checks off with receiveLoopMacro.
   */
  sendLoopMacro(UPDATE_FRAG_STATEREQ, nullRoutine, RNIL);

  g_eventLogger->debug("Update frag state for inst:%u,tab:%u,frag:%u",
                       takeOverPtr.i,
                       takeOverPtr.p->toCurrentTabref,
                       takeOverPtr.p->toCurrentFragid);
  UpdateFragStateReq * const req = (UpdateFragStateReq *)&signal->theData[0];
  req->senderData = takeOverPtr.i;
  req->senderRef = reference();
  req->tableId = takeOverPtr.p->toCurrentTabref;
  req->fragId = takeOverPtr.p->toCurrentFragid;
  req->startingNodeId = takeOverPtr.p->toStartingNode;
  req->copyNodeId = takeOverPtr.p->toCopyNode;
  req->failedNodeId = takeOverPtr.p->toFailedNode;
  req->startGci = startGci;
  req->replicaType = replicaType;

  /* Walk the linked list of alive nodes and send to each one's DIH. */
  NodeRecordPtr nodePtr;
  nodePtr.i = cfirstAliveNode;
  do {
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    BlockReference ref = calcDihBlockRef(nodePtr.i);
    sendSignal(ref, GSN_UPDATE_FRAG_STATEREQ, signal,
               UpdateFragStateReq::SignalLength, JBB);
    nodePtr.i = nodePtr.p->nextNode;
  } while (nodePtr.i != RNIL);
}//Dbdih::sendUpdateFragStateReq()
9368 
void Dbdih::execUPDATE_FRAG_STATECONF(Signal* signal)
{
  /**
   * One node confirmed the fragment state update broadcast by
   * sendUpdateFragStateReq.  receiveLoopMacro checks off the sending
   * node against the outstanding set; the switch below runs only when
   * all nodes have replied.
   */
  jamEntry();
  CRASH_INSERTION(7148);
  UpdateFragStateConf * conf = (UpdateFragStateConf *)&signal->theData[0];

  TakeOverRecordPtr takeOverPtr;

  c_takeOverPool.getPtr(takeOverPtr, conf->senderData);

  g_eventLogger->debug("Updated frag state for inst:%u,tab:%u,frag:%u,state:%u",
                       takeOverPtr.i,
                       takeOverPtr.p->toCurrentTabref,
                       takeOverPtr.p->toCurrentFragid,
                       takeOverPtr.p->toSlaveStatus);
  receiveLoopMacro(UPDATE_FRAG_STATEREQ, conf->sendingNodeId);

  switch(takeOverPtr.p->toSlaveStatus){
  case TakeOverRecord::TO_UPDATE_FRAG_STATE_STORED:
    jam();
    CRASH_INSERTION(7198);
    /* STORED done everywhere; next report AFTER_STORED to the master. */
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_AFTER_STORED;
    break;
  case TakeOverRecord::TO_UPDATE_FRAG_STATE_COMMIT:
    jam();
    CRASH_INSERTION(7199);
    /* COMMIT done everywhere; next report AFTER_COMMIT to the master. */
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_AFTER_COMMIT;
    break;
  case TakeOverRecord::TO_SL_UPDATE_FRAG_STATE:
    jam();
    //CRASH_INSERTION(
    /* Start-logging path: release the master slot, move on to the next
     * fragment via a CONTINUEB, no master report needed here. */
    start_next_takeover_thread(signal);
    c_active_copy_threads_list.addFirst(takeOverPtr);
    g_eventLogger->debug("UPDATE_FRAG_STATE completed: thread: %u",
      takeOverPtr.i);
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_START_LOGGING;
    takeOverPtr.p->toCurrentFragid++;
    signal->theData[0] = DihContinueB::ZTO_START_LOGGING;
    signal->theData[1] = takeOverPtr.i;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    return;
  default:
    jamLine(takeOverPtr.p->toSlaveStatus);
    ndbabort();
  }
  /* STORED/COMMIT paths fall through: report progress to the master. */
  sendUpdateTo(signal, takeOverPtr);
}//Dbdih::execUPDATE_FRAG_STATECONF()
9416 
execCOPY_FRAGREF(Signal * signal)9417 void Dbdih::execCOPY_FRAGREF(Signal* signal)
9418 {
9419   const CopyFragRef * const ref = (CopyFragRef *)&signal->theData[0];
9420   jamEntry();
9421   Uint32 takeOverPtrI = ref->userPtr;
9422   Uint32 startingNodeId = ref->startingNodeId;
9423   Uint32 errorCode = ref->errorCode;
9424 
9425   TakeOverRecordPtr takeOverPtr;
9426   c_takeOverPool.getPtr(takeOverPtr, takeOverPtrI);
9427   ndbrequire(ref->tableId == takeOverPtr.p->toCurrentTabref);
9428   ndbrequire(ref->fragId == takeOverPtr.p->toCurrentFragid);
9429   ndbrequire(ref->startingNodeId == takeOverPtr.p->toStartingNode);
9430   ndbrequire(ref->sendingNodeId == takeOverPtr.p->toCopyNode);
9431   ndbrequire(takeOverPtr.p->toSlaveStatus == TakeOverRecord::TO_COPY_FRAG);
9432 
9433   //--------------------------------------------------------------------------
9434   // For some reason we did not succeed in copying a fragment. We treat this
9435   // as a serious failure and crash the starting node.
9436   //--------------------------------------------------------------------------
9437   BlockReference cntrRef = calcNdbCntrBlockRef(startingNodeId);
9438   SystemError * const sysErr = (SystemError*)&signal->theData[0];
9439   sysErr->errorCode = SystemError::CopyFragRefError;
9440   sysErr->errorRef = reference();
9441   sysErr->data[0] = errorCode;
9442   sysErr->data[1] = 0;
9443   sendSignal(cntrRef, GSN_SYSTEM_ERROR, signal,
9444 	     SystemError::SignalLength, JBB);
9445   return;
9446 }//Dbdih::execCOPY_FRAGREF()
9447 
void Dbdih::execCOPY_FRAGCONF(Signal* signal)
{
  /**
   * LQH on the copy node completed copying a fragment to the starting
   * node.  Validate the CONF against the takeover thread's state, ask
   * LQH on the starting node to activate the fragment (COPY_ACTIVEREQ)
   * and emit an event report with the row/byte counts.
   */
  const CopyFragConf * const conf = (CopyFragConf *)&signal->theData[0];
  jamEntry();
  CRASH_INSERTION(7142);

  TakeOverRecordPtr takeOverPtr;
  c_takeOverPool.getPtr(takeOverPtr, conf->userPtr);

  Uint32 rows_lo = conf->rows_lo;
  Uint32 bytes_lo = conf->bytes_lo;

  ndbrequire(conf->tableId == takeOverPtr.p->toCurrentTabref);
  ndbrequire(conf->fragId == takeOverPtr.p->toCurrentFragid);
  ndbrequire(conf->startingNodeId == takeOverPtr.p->toStartingNode);
  ndbrequire(conf->sendingNodeId == takeOverPtr.p->toCopyNode);
  ndbrequire(takeOverPtr.p->toSlaveStatus == TakeOverRecord::TO_COPY_FRAG);

  g_eventLogger->debug("COPY_FRAGCONF: thread: %u, tab(%u,%u)",
    takeOverPtr.i,
    takeOverPtr.p->toCurrentTabref,
    takeOverPtr.p->toCurrentFragid);

  TabRecordPtr tabPtr;
  tabPtr.i = takeOverPtr.p->toCurrentTabref;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

  FragmentstorePtr fragPtr;
  getFragstore(tabPtr.p, takeOverPtr.p->toCurrentFragid, fragPtr);
  Uint32 instanceKey = dihGetInstanceKey(fragPtr);
  /* COPY_ACTIVEREQ goes to LQH on the starting node (not the copy node). */
  BlockReference lqhRef = numberToRef(DBLQH, instanceKey,
                                      takeOverPtr.p->toStartingNode);
  CopyActiveReq * const req = (CopyActiveReq *)&signal->theData[0];
  req->userPtr = takeOverPtr.i;
  req->userRef = reference();
  req->tableId = takeOverPtr.p->toCurrentTabref;
  req->fragId = takeOverPtr.p->toCurrentFragid;
  req->distributionKey = fragPtr.p->distributionKey;
  req->flags = 0;

  {
    jam();
    /**
     * Bug48474 - Don't start logging an fragment
     *            until all fragments has been copied
     *            Else it's easy to run out of REDO
     */
    req->flags |= CopyActiveReq::CAR_NO_WAIT | CopyActiveReq::CAR_NO_LOGGING;
  }

  sendSignal(lqhRef, GSN_COPY_ACTIVEREQ, signal,
             CopyActiveReq::SignalLength, JBB);
  g_eventLogger->debug("COPY_ACTIVEREQ: thread: %u, tab(%u,%u)",
    takeOverPtr.i,
    takeOverPtr.p->toCurrentTabref,
    takeOverPtr.p->toCurrentFragid);

  takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_COPY_ACTIVE;

  /* Report the completed fragment copy to the event logger. */
  signal->theData[0] = NDB_LE_NR_CopyFragDone;
  signal->theData[1] = getOwnNodeId();
  signal->theData[2] = takeOverPtr.p->toCurrentTabref;
  signal->theData[3] = takeOverPtr.p->toCurrentFragid;
  signal->theData[4] = rows_lo;
  signal->theData[5] = 0;
  signal->theData[6] = bytes_lo;
  signal->theData[7] = 0;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 8, JBB);
  g_eventLogger->debug("DIH:tab(%u,%u), COPY_FRAGCONF: %u rows inserted",
                       takeOverPtr.p->toCurrentTabref,
                       takeOverPtr.p->toCurrentFragid,
                       rows_lo);
}//Dbdih::execCOPY_FRAGCONF()
9521 
void Dbdih::execCOPY_ACTIVECONF(Signal* signal)
{
  /**
   * LQH on the starting node confirmed fragment activation.  Depending
   * on the thread state we are either in the copy phase (TO_COPY_ACTIVE,
   * proceed to commit through the master) or in the start-logging phase
   * (TO_SL_COPY_ACTIVE, proceed to the START_LOGGING fragment state
   * update).  In both cases we may have to queue behind the thread that
   * currently owns the master interaction.
   */
  const CopyActiveConf * const conf = (CopyActiveConf *)&signal->theData[0];
  jamEntry();
  CRASH_INSERTION(7143);

  TakeOverRecordPtr takeOverPtr;
  c_takeOverPool.getPtr(takeOverPtr, conf->userPtr);

  ndbrequire(conf->tableId == takeOverPtr.p->toCurrentTabref);
  ndbrequire(conf->fragId == takeOverPtr.p->toCurrentFragid);
  ndbrequire(checkNodeAlive(conf->startingNodeId));

  g_eventLogger->debug("COPY_ACTIVECONF: thread: %u, tab: %u, frag: %u",
    takeOverPtr.i,
    takeOverPtr.p->toCurrentTabref,
    takeOverPtr.p->toCurrentFragid);

  /* Remember the GCI at which the fragment became active; used later as
   * startGci in the COMMIT_STORED fragment state update. */
  takeOverPtr.p->startGci = conf->startGci;

  c_active_copy_threads_list.remove(takeOverPtr);

  if (takeOverPtr.p->toSlaveStatus == TakeOverRecord::TO_COPY_ACTIVE)
  {
    if (c_activeThreadTakeOverPtr.i != RNIL)
    {
      /**
       * There is already an active take over thread that is performing an
       * update of its fragment replica state through the master. We will
       * put ourselves in the c_queued_for_commit_take_over_list and be
       * started as soon as possible.
       */
      g_eventLogger->debug("QUEUED_UPDATE_BEFORE_COMMIT, inst: %u",
                          takeOverPtr.i);
      jam();
      takeOverPtr.p->toSlaveStatus =
        TakeOverRecord::TO_QUEUED_UPDATE_BEFORE_COMMIT;
      c_queued_for_commit_takeover_list.addLast(takeOverPtr);
      return;
    }
    g_eventLogger->debug("Copy frag active: tab:%u,frag:%u,inst:%u",
      takeOverPtr.p->toCurrentTabref,
      takeOverPtr.p->toCurrentFragid,
      takeOverPtr.i);
    jam();
    c_activeThreadTakeOverPtr = takeOverPtr; /* Mark master busy */
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_UPDATE_BEFORE_COMMIT;
    sendUpdateTo(signal, takeOverPtr);
  }
  else
  {
    jam();
    ndbrequire(takeOverPtr.p->toSlaveStatus==
               TakeOverRecord::TO_SL_COPY_ACTIVE);

    /* Start-logging phase: same queueing discipline as above. */
    if (c_activeThreadTakeOverPtr.i != RNIL)
    {
      jam();
      g_eventLogger->debug("QUEUED_SL_UPDATE_FRAG_STATE, inst: %u",
                           takeOverPtr.i);
      takeOverPtr.p->toSlaveStatus =
        TakeOverRecord::TO_QUEUED_SL_UPDATE_FRAG_STATE;
      c_queued_for_commit_takeover_list.addLast(takeOverPtr);
      return;
    }
    c_activeThreadTakeOverPtr = takeOverPtr; /* Mark master busy */
    takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_SL_UPDATE_FRAG_STATE;
    g_eventLogger->debug("Update frag state:inst:%u,tab:%u,frag:%u,state:%u",
                         takeOverPtr.i,
                         takeOverPtr.p->toCurrentTabref,
                         takeOverPtr.p->toCurrentFragid,
                         takeOverPtr.p->toSlaveStatus);
    sendUpdateFragStateReq(signal,
                           takeOverPtr.p->startGci,
                           UpdateFragStateReq::START_LOGGING,
                           takeOverPtr);
  }
}//Dbdih::execCOPY_ACTIVECONF()
9600 
void
Dbdih::check_take_over_completed_correctly()
{
  /**
   * Invariant check after all takeover threads have been released:
   * every per-thread list must be empty, no thread may still own the
   * master interaction, and the main takeover record must still exist.
   */
  ndbrequire(c_completed_copy_threads_list.isEmpty());
  ndbrequire(c_activeTakeOverList.isEmpty());
  ndbrequire(c_queued_for_start_takeover_list.isEmpty());
  ndbrequire(c_queued_for_commit_takeover_list.isEmpty());
  ndbrequire(c_active_copy_threads_list.isEmpty());
  ndbrequire(c_activeThreadTakeOverPtr.i == RNIL);
  ndbrequire(c_mainTakeOverPtr.i != RNIL);
  /**
   * We could be master in system restart where we had to
   * restart with aid of another node and thus perform
   * synchronize with this other node. In this case we
   * have 2 take over records, one for master part and
   * one for start copy part.
   */
  ndbrequire((c_takeOverPool.getUsed() == 1) ||
             (cmasterdihref == reference() &&
              c_takeOverPool.getUsed() == 2));
}
9622 
9623 void
release_take_over_threads(void)9624 Dbdih::release_take_over_threads(void)
9625 {
9626   TakeOverRecordPtr takeOverPtr;
9627   do
9628   {
9629     jam();
9630     if (!c_completed_copy_threads_list.removeFirst(takeOverPtr))
9631     {
9632       jam();
9633       break;
9634     }
9635     releaseTakeOver(takeOverPtr, false);
9636   } while (1);
9637   check_take_over_completed_correctly();
9638 }
9639 
9640 bool
thread_takeover_copy_completed(Signal * signal,TakeOverRecordPtr takeOverPtr)9641 Dbdih::thread_takeover_copy_completed(Signal *signal,
9642                                         TakeOverRecordPtr takeOverPtr)
9643 {
9644   c_activeTakeOverList.remove(takeOverPtr);
9645   c_completed_copy_threads_list.addFirst(takeOverPtr);
9646   c_mainTakeOverPtr.p->m_copy_threads_completed++;
9647   if (c_mainTakeOverPtr.p->m_copy_threads_completed ==
9648       c_mainTakeOverPtr.p->m_number_of_copy_threads)
9649   {
9650     /* No more to do, just wait for more threads to complete */
9651     return true;
9652   }
9653   return false;
9654 }
9655 
toCopyCompletedLab(Signal * signal,TakeOverRecordPtr takeOverPtr)9656 void Dbdih::toCopyCompletedLab(Signal * signal, TakeOverRecordPtr takeOverPtr)
9657 {
9658   /**
9659    * One take over thread has completed its work. We will have to wait for
9660    * all of the threads to complete here before we can proceed.
9661    */
9662   g_eventLogger->debug("Thread %u copy completed", takeOverPtr.i);
9663   if (!thread_takeover_copy_completed(signal, takeOverPtr))
9664   {
9665     jam();
9666     return;
9667   }
9668   jam();
9669   c_mainTakeOverPtr.p->m_copy_threads_completed = 0;
9670 
9671   signal->theData[0] = NDB_LE_NR_CopyFragsCompleted;
9672   signal->theData[1] = takeOverPtr.p->toStartingNode;
9673   sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
9674 
9675   /* Ask LQH to dump CopyFrag stage statistics */
9676   signal->theData[0] = DumpStateOrd::LqhReportCopyInfo;
9677   sendSignal(DBLQH_REF, GSN_DUMP_STATE_ORD, signal, 1, JBB);
9678 
9679   g_eventLogger->info("Bring Database On-line Completed");
9680   infoEvent("Bring Database On-line Completed on node %u",
9681             takeOverPtr.p->toStartingNode);
9682 
9683   {
9684     jam();
9685     g_eventLogger->info("Starting REDO logging");
9686     infoEvent("Starting REDO logging on node %u",
9687               takeOverPtr.p->toStartingNode);
9688     start_thread_takeover_logging(signal);
9689     return;
9690   }
9691 }//Dbdih::toCopyCompletedLab()
9692 
void
Dbdih::send_continueb_nr_start_logging(Signal *signal,
                                       TakeOverRecordPtr takeOverPtr)
{
  /* Schedule a ZTO_START_LOGGING continuation for this takeover thread. */
  signal->theData[0] = DihContinueB::ZTO_START_LOGGING;
  signal->theData[1] = takeOverPtr.i;
  sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
}
9701 
9702 void
start_thread_takeover_logging(Signal * signal)9703 Dbdih::start_thread_takeover_logging(Signal *signal)
9704 {
9705   /**
9706    * Ensure no active thread, all thread takeover records are
9707    * placed into the c_completed_copy_threads_list and that
9708    * we have a main takeover thread and that all other lists are
9709    * empty at this point.
9710    */
9711   ndbrequire(c_activeThreadTakeOverPtr.i == RNIL);
9712   ndbrequire(c_activeTakeOverList.isEmpty());
9713   ndbrequire(c_queued_for_start_takeover_list.isEmpty());
9714   ndbrequire(c_queued_for_commit_takeover_list.isEmpty());
9715   ndbrequire(c_active_copy_threads_list.isEmpty());
9716   ndbrequire(c_mainTakeOverPtr.i != RNIL);
9717   ndbrequire(!c_completed_copy_threads_list.isEmpty());
9718   TakeOverRecordPtr takeOverPtr;
9719   do
9720   {
9721     jam();
9722     if (!c_completed_copy_threads_list.removeFirst(takeOverPtr))
9723     {
9724       jam();
9725       break;
9726     }
9727     c_active_copy_threads_list.addFirst(takeOverPtr);
9728     takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_START_LOGGING;
9729     takeOverPtr.p->toCurrentTabref = 0;
9730     takeOverPtr.p->toCurrentFragid = 0;
9731     takeOverPtr.p->toCurrentReplica = RNIL;
9732     send_continueb_nr_start_logging(signal, takeOverPtr);
9733   } while (1);
9734 }
9735 
9736 bool
thread_takeover_completed(Signal * signal,TakeOverRecordPtr takeOverPtr)9737 Dbdih::thread_takeover_completed(Signal *signal,
9738                                    TakeOverRecordPtr takeOverPtr)
9739 {
9740   c_active_copy_threads_list.remove(takeOverPtr);
9741   releaseTakeOver(takeOverPtr, false);
9742   c_mainTakeOverPtr.p->m_copy_threads_completed++;
9743   if (c_mainTakeOverPtr.p->m_copy_threads_completed ==
9744       c_mainTakeOverPtr.p->m_number_of_copy_threads)
9745   {
9746     return true;
9747   }
9748   return false;
9749 }
9750 
void
Dbdih::execEND_TOREF(Signal* signal)
{
  jamEntry();
  EndToRef* ref = (EndToRef*)signal->getDataPtr();

  TakeOverRecordPtr takeOverPtr;
  c_takeOverPool.getPtr(takeOverPtr, ref->senderData);

  /* A refused END_TOREQ is not handled; crash to produce a trace. */
  ndbabort();
}
9762 
9763 void
execEND_TOCONF(Signal * signal)9764 Dbdih::execEND_TOCONF(Signal* signal)
9765 {
9766   jamEntry();
9767   EndToConf* conf = (EndToConf*)signal->getDataPtr();
9768 
9769   CRASH_INSERTION(7144);
9770 
9771   TakeOverRecordPtr takeOverPtr;
9772   c_takeOverPool.getPtr(takeOverPtr, conf->senderData);
9773 
9774   Uint32 senderData = takeOverPtr.p->m_senderData;
9775   Uint32 senderRef = takeOverPtr.p->m_senderRef;
9776   Uint32 nodeId = takeOverPtr.p->toStartingNode;
9777 
9778   releaseTakeOver(takeOverPtr, false);
9779   c_mainTakeOverPtr.i = RNIL;
9780   c_mainTakeOverPtr.p = NULL;
9781 
9782   StartCopyConf* ret = (StartCopyConf*)signal->getDataPtrSend();
9783   ret->startingNodeId = nodeId;
9784   ret->senderData = senderData;
9785   ret->senderRef = reference();
9786   sendSignal(senderRef, GSN_START_COPYCONF, signal,
9787              StartCopyConf::SignalLength, JBB);
9788 }
9789 
void Dbdih::releaseTakeOver(TakeOverRecordPtr takeOverPtr,
                            bool from_master,
                            bool skip_check)
{
  /**
   * Reset all per-takeover fields and return the record to the pool.
   * When called on the master (from_master == true) also clear the node
   * group's activeTakeOver bookkeeping (unless skip_check) and unlink
   * the record from the master's active list.
   */
  Uint32 startingNode = takeOverPtr.p->toStartingNode;
  takeOverPtr.p->m_copy_threads_completed = 0;
  takeOverPtr.p->m_number_of_copy_threads = (Uint32)-1;
  takeOverPtr.p->m_copy_thread_id = (Uint32)-1;

  takeOverPtr.p->toCopyNode = RNIL;
  takeOverPtr.p->toCurrentFragid = RNIL;
  takeOverPtr.p->toCurrentReplica = RNIL;
  takeOverPtr.p->toCurrentTabref = RNIL;
  takeOverPtr.p->toFailedNode = RNIL;
  takeOverPtr.p->toStartingNode = RNIL;
  NdbTick_Invalidate(&takeOverPtr.p->toStartTime);
  takeOverPtr.p->toSlaveStatus = TakeOverRecord::TO_SLAVE_IDLE;
  takeOverPtr.p->toMasterStatus = TakeOverRecord::TO_MASTER_IDLE;

  if (from_master)
  {
    jam();
    /**
     * We need to ensure that we don't leave any activeTakeOver
     * lying around since this will block any future restarts in
     * this node group.
     *
     * We could perform take over in parallel within one node
     * group. There is really nothing preventing multiple nodes
     * to copy different fragments to a starting node in case
     * we have more than 2 replicas.
     *
     * Note that the setting of toCopyNode within the master is a
     * bit weird, it is set in all UPDATE_TOREQ, but the release
     * code assumes that it was done only when starting BEFORE_STORED
     * and ended when acquiring the mutex for BEFORE_COMMIT. So this
     * has to be taken into account when making the code handle
     * multiple copy nodes per node group.
     */
    if (!skip_check)
    {
      jam();
      NodeRecordPtr nodePtr;
      NodeGroupRecordPtr NGPtr;
      nodePtr.i = startingNode;
      ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
      NGPtr.i = nodePtr.p->nodeGroup;
      if (NGPtr.i != ZNIL)
      {
        ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);

        if (NGPtr.p->activeTakeOver == 0)
        {
          jam();
          /* No takeover active in the group; the count must agree. */
          ndbrequire(NGPtr.p->activeTakeOverCount == 0);
        }
        else if (NGPtr.p->activeTakeOver == startingNode)
        {
          jam();
          /* Our node owned the group's takeover slot; free it. */
          NGPtr.p->activeTakeOver = 0;
          NGPtr.p->activeTakeOverCount = 0;
        }
        else
        {
          jam();
          /**
           * We arrive here for instance when a takeover is completed
           * after waiting for LCP and a new takeover has already started
           * of another node in the same node group. In this case our
           * node is not the active node being taken over, we are only
           * passively waiting for the LCP to complete before we can
           * proceed with our recovery.
           */
        }
      }
    }
    c_masterActiveTakeOverList.remove(takeOverPtr);

  }
  c_takeOverPool.release(takeOverPtr);
}//Dbdih::releaseTakeOver()
9871 
9872 /*****************************************************************************/
9873 /* ------------------------------------------------------------------------- */
9874 /*       WE HAVE BEEN REQUESTED TO PERFORM A SYSTEM RESTART. WE START BY     */
9875 /*       READING THE GCI FILES. THIS REQUEST WILL ONLY BE SENT TO THE MASTER */
9876 /*       DIH. THAT MEANS WE HAVE TO REPLICATE THE INFORMATION WE READ FROM   */
9877 /*       OUR FILES TO ENSURE THAT ALL NODES HAVE THE SAME DISTRIBUTION       */
9878 /*       INFORMATION.                                                        */
9879 /* ------------------------------------------------------------------------- */
9880 /*****************************************************************************/
void Dbdih::readGciFileLab(Signal* signal)
{
  /* Open the first GCI restart-info file read-only; completion of the
   * open is routed via reqStatus = OPENING_GCP to openingGcpLab. */
  FileRecordPtr filePtr;
  filePtr.i = crestartInfoFile[0];
  ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
  filePtr.p->reqStatus = FileRecord::OPENING_GCP;

  openFileRo(signal, filePtr);
}//Dbdih::readGciFileLab()
9890 
void Dbdih::openingGcpLab(Signal* signal, FileRecordPtr filePtr)
{
  /* ----------------------------------------------------------------------- */
  /*     WE HAVE SUCCESSFULLY OPENED A FILE CONTAINING INFORMATION ABOUT     */
  /*     THE GLOBAL CHECKPOINTS THAT ARE POSSIBLE TO RESTART.                */
  /* ----------------------------------------------------------------------- */
  /* Read completion continues in readingGcpLab via READING_GCP. */
  readRestorableGci(signal, filePtr);
  filePtr.p->reqStatus = FileRecord::READING_GCP;
}//Dbdih::openingGcpLab()
9900 
void Dbdih::readingGcpLab(Signal* signal, FileRecordPtr filePtr)
{
  /* ----------------------------------------------------------------------- */
  /*     WE HAVE NOW SUCCESSFULLY MANAGED TO READ IN THE GLOBAL CHECKPOINT   */
  /*     INFORMATION FROM FILE. LATER WE WILL ADD SOME FUNCTIONALITY THAT    */
  /*     CHECKS THE RESTART TIMERS TO DEDUCE FROM WHERE TO RESTART.          */
  /*     NOW WE WILL SIMPLY RESTART FROM THE NEWEST GLOBAL CHECKPOINT        */
  /*     POSSIBLE TO RESTORE.                                                */
  /*                                                                         */
  /*     BEFORE WE INVOKE DICT WE NEED TO COPY CRESTART_INFO TO ALL NODES.   */
  /*     WE ALSO COPY TO OUR OWN NODE. TO ENABLE US TO DO THIS PROPERLY WE   */
  /*     START BY CLOSING THIS FILE.                                         */
  /* ----------------------------------------------------------------------- */
  /* Detect the on-disk sysfile format by its magic: v2 files carry
   * MAGIC_v2 at the start of the read buffer; anything else is v1. */
  if (std::memcmp(&cdata[0],
                  Sysfile::MAGIC_v2,
                  Sysfile::MAGIC_SIZE_v2) == 0)
  {
    jam();
    unpack_sysfile_format_v2(true);
  }
  else
  {
    jam();
    unpack_sysfile_format_v1(true);
  }
  /**
   * Initialise all nodes beyond m_max_node_id to not defined nodes.
   * Should not be necessary, but a protection.
   */
#if 0
  for (Uint32 i = m_max_node_id + 1; i < MAX_NDB_NODES; i++)
  {
    Sysfile::setNodeStatus(i,
                           SYSFILE->nodeStatus,
                           Sysfile::NS_NotDefined);
  }
#endif
  /* Bump the restart sequence number for this node's new incarnation. */
  globalData.m_restart_seq = ++SYSFILE->m_restart_seq;
  g_eventLogger->info("Starting with m_restart_seq set to %u",
                      globalData.m_restart_seq);
  closeFile(signal, filePtr);
  filePtr.p->reqStatus = FileRecord::CLOSING_GCP;
}//Dbdih::readingGcpLab()
9944 
closingGcpLab(Signal * signal,FileRecordPtr filePtr)9945 void Dbdih::closingGcpLab(Signal* signal, FileRecordPtr filePtr)
9946 {
9947   if (Sysfile::getInitialStartOngoing(SYSFILE->systemRestartBits) == false){
9948     jam();
9949     selectMasterCandidateAndSend(signal);
9950     return;
9951   } else {
9952     jam();
9953     sendDihRestartRef(signal);
9954     return;
9955   }//if
9956 }//Dbdih::closingGcpLab()
9957 
9958 void
sendDihRestartRef(Signal * signal)9959 Dbdih::sendDihRestartRef(Signal* signal)
9960 {
9961   jam();
9962 
9963   /**
9964    * We couldn't read P0.Sysfile...
9965    *   so compute no_nodegroup_mask from configuration
9966    */
9967   NdbNodeBitmask no_nodegroup_mask;
9968 
9969   ndb_mgm_configuration_iterator * iter =
9970     m_ctx.m_config.getClusterConfigIterator();
9971   for(ndb_mgm_first(iter); ndb_mgm_valid(iter); ndb_mgm_next(iter))
9972   {
9973     jam();
9974     Uint32 nodeId;
9975     Uint32 nodeType;
9976 
9977     ndbrequire(!ndb_mgm_get_int_parameter(iter,CFG_NODE_ID, &nodeId));
9978     ndbrequire(!ndb_mgm_get_int_parameter(iter,CFG_TYPE_OF_SECTION,
9979                                           &nodeType));
9980 
9981     if (nodeType == NodeInfo::DB)
9982     {
9983       jam();
9984       Uint32 ng;
9985       if (ndb_mgm_get_int_parameter(iter, CFG_DB_NODEGROUP, &ng) == 0)
9986       {
9987         jam();
9988         if (ng == NDB_NO_NODEGROUP)
9989         {
9990           no_nodegroup_mask.set(nodeId);
9991         }
9992       }
9993     }
9994   }
9995   {
9996     LinearSectionPtr lsptr[3];
9997     lsptr[0].p = no_nodegroup_mask.rep.data;
9998     lsptr[0].sz = no_nodegroup_mask.getPackedLengthInWords();
9999     sendSignal(cntrlblockref,
10000                GSN_DIH_RESTARTREF,
10001                signal,
10002                DihRestartRef::SignalLength,
10003                JBB,
10004                lsptr,
10005                1);
10006   }
10007 }
10008 
10009 /* ------------------------------------------------------------------------- */
10010 /*       SELECT THE MASTER CANDIDATE TO BE USED IN SYSTEM RESTARTS.          */
10011 /* ------------------------------------------------------------------------- */
void Dbdih::selectMasterCandidateAndSend(Signal* signal)
{
  /* Recompute nodegroup membership before inspecting the sysfile. */
  setNodeGroups();

  NodeRecordPtr nodePtr;
  /* Count of defined nodes per nodegroup; index is the nodegroup id. */
  Uint32 node_groups[MAX_NDB_NODES];
  memset(node_groups, 0, sizeof(node_groups));
  /* Nodes configured without a nodegroup, reported in the CONF below. */
  NdbNodeBitmask no_nodegroup_mask;
  for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++) {
    jam();
    if (Sysfile::getNodeStatus(nodePtr.i, SYSFILE->nodeStatus) == Sysfile::NS_NotDefined)
    {
      jam();
      continue;
    }
    const Uint32 ng = Sysfile::getNodeGroup(nodePtr.i, SYSFILE->nodeGroups);
    if(ng != NO_NODE_GROUP_ID)
    {
      jam();
      jamLine(Uint16(ng));
      ndbrequire(ng < MAX_NDB_NODE_GROUPS);
      node_groups[ng]++;
    }
    else
    {
      jam();
      no_nodegroup_mask.set(nodePtr.i);
    }
  }

  /**
   * Report our restart credentials (latest restorable GCI and latest
   * LCP id from the sysfile) to NDBCNTR so it can pick the master
   * candidate for the system restart.
   */
  DihRestartConf * conf = CAST_PTR(DihRestartConf, signal->getDataPtrSend());
  conf->unused = getOwnNodeId();
  conf->latest_gci = SYSFILE->lastCompletedGCI[getOwnNodeId()];
  conf->latest_lcp_id = SYSFILE->latestLCP_ID;
  {
    /* The mask travels both inline in the CONF and as a section. */
    LinearSectionPtr lsptr[3];
    lsptr[0].p = no_nodegroup_mask.rep.data;
    lsptr[0].sz = no_nodegroup_mask.getPackedLengthInWords();
    no_nodegroup_mask.copyto(NdbNodeBitmask::Size, conf->no_nodegroup_mask);
    /* DIH_RESTARTCONF is a local signal to our own NDBCNTR. */
    ndbrequire(getOwnNodeId() == refToNode(cntrlblockref));
    sendSignal(cntrlblockref,
               GSN_DIH_RESTARTCONF,
               signal,
               DihRestartConf::SignalLength,
               JBB,
               lsptr,
               1);
  }
  /**
   * Sanity check: every populated nodegroup must contain exactly
   * cnoReplicas nodes. Changing the number of replicas requires an
   * initial start, so a mismatch is a fatal configuration error.
   */
  for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++)
  {
    jam();
    Uint32 count = node_groups[nodePtr.i];
    if(count != 0 && count != cnoReplicas){
      char buf[255];
      BaseString::snprintf(buf, sizeof(buf),
			   "Illegal configuration change."
			   " Initial start needs to be performed "
			   " when changing no of replicas (%d != %d)",
			   node_groups[nodePtr.i], cnoReplicas);
      progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
    }
  }
}//Dbdih::selectMasterCandidate()
10075 
10076 /* ------------------------------------------------------------------------- */
10077 /*       ERROR HANDLING DURING READING RESTORABLE GCI FROM FILE.             */
10078 /* ------------------------------------------------------------------------- */
openingGcpErrorLab(Signal * signal,FileRecordPtr filePtr)10079 void Dbdih::openingGcpErrorLab(Signal* signal, FileRecordPtr filePtr)
10080 {
10081   filePtr.p->fileStatus = FileRecord::CRASHED;
10082   filePtr.p->reqStatus = FileRecord::IDLE;
10083   if (crestartInfoFile[0] == filePtr.i) {
10084     jam();
10085     /* --------------------------------------------------------------------- */
10086     /*   THE FIRST FILE WAS NOT ABLE TO BE OPENED. SET STATUS TO CRASHED AND */
10087     /*   TRY OPEN THE NEXT FILE.                                             */
10088     /* --------------------------------------------------------------------- */
10089     filePtr.i = crestartInfoFile[1];
10090     ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
10091     openFileRo(signal, filePtr);
10092     filePtr.p->reqStatus = FileRecord::OPENING_GCP;
10093   } else {
10094     jam();
10095     /* --------------------------------------------------------------------- */
10096     /*   WE FAILED IN OPENING THE SECOND FILE. BOTH FILES WERE CORRUPTED. WE */
10097     /*   CANNOT CONTINUE THE RESTART IN THIS CASE. TELL NDBCNTR OF OUR       */
10098     /*   FAILURE.                                                            */
10099     /*---------------------------------------------------------------------- */
10100     sendDihRestartRef(signal);
10101     return;
10102   }//if
10103 }//Dbdih::openingGcpErrorLab()
10104 
readingGcpErrorLab(Signal * signal,FileRecordPtr filePtr)10105 void Dbdih::readingGcpErrorLab(Signal* signal, FileRecordPtr filePtr)
10106 {
10107   filePtr.p->fileStatus = FileRecord::CRASHED;
10108   /* ----------------------------------------------------------------------- */
10109   /*     WE FAILED IN READING THE FILE AS WELL. WE WILL CLOSE THIS FILE.     */
10110   /* ----------------------------------------------------------------------- */
10111   closeFile(signal, filePtr);
10112   filePtr.p->reqStatus = FileRecord::CLOSING_GCP_CRASH;
10113 }//Dbdih::readingGcpErrorLab()
10114 
closingGcpCrashLab(Signal * signal,FileRecordPtr filePtr)10115 void Dbdih::closingGcpCrashLab(Signal* signal, FileRecordPtr filePtr)
10116 {
10117   if (crestartInfoFile[0] == filePtr.i) {
10118     jam();
10119     /* --------------------------------------------------------------------- */
10120     /*   ERROR IN FIRST FILE, TRY THE SECOND FILE.                           */
10121     /* --------------------------------------------------------------------- */
10122     filePtr.i = crestartInfoFile[1];
10123     ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
10124     openFileRw(signal, filePtr);
10125     filePtr.p->reqStatus = FileRecord::OPENING_GCP;
10126     return;
10127   }//if
10128   /* ----------------------------------------------------------------------- */
10129   /*     WE DISCOVERED A FAILURE WITH THE SECOND FILE AS WELL. THIS IS A     */
10130   /*     SERIOUS PROBLEM. REPORT FAILURE TO NDBCNTR.                         */
10131   /* ----------------------------------------------------------------------- */
10132   sendDihRestartRef(signal);
10133 }//Dbdih::closingGcpCrashLab()
10134 
10135 /*****************************************************************************/
10136 /* ------------------------------------------------------------------------- */
10137 /*       THIS IS AN INITIAL RESTART. WE WILL CREATE THE TWO FILES DESCRIBING */
10138 /*       THE GLOBAL CHECKPOINTS THAT ARE RESTORABLE.                         */
10139 /* ------------------------------------------------------------------------- */
10140 /*****************************************************************************/
initGciFilesLab(Signal * signal)10141 void Dbdih::initGciFilesLab(Signal* signal)
10142 {
10143   FileRecordPtr filePtr;
10144   filePtr.i = crestartInfoFile[0];
10145   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
10146   createFileRw(signal, filePtr);
10147   filePtr.p->reqStatus = FileRecord::CREATING_GCP;
10148 }//Dbdih::initGciFilesLab()
10149 
10150 /* ------------------------------------------------------------------------- */
10151 /*       GLOBAL CHECKPOINT FILE HAVE BEEN SUCCESSFULLY CREATED.              */
10152 /* ------------------------------------------------------------------------- */
creatingGcpLab(Signal * signal,FileRecordPtr filePtr)10153 void Dbdih::creatingGcpLab(Signal* signal, FileRecordPtr filePtr)
10154 {
10155   if (filePtr.i == crestartInfoFile[0]) {
10156     jam();
10157     /* --------------------------------------------------------------------- */
10158     /*   IF CREATED FIRST THEN ALSO CREATE THE SECOND FILE.                  */
10159     /* --------------------------------------------------------------------- */
10160     filePtr.i = crestartInfoFile[1];
10161     ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
10162     createFileRw(signal, filePtr);
10163     filePtr.p->reqStatus = FileRecord::CREATING_GCP;
10164   } else {
10165     jam();
10166     /* --------------------------------------------------------------------- */
10167     /*   BOTH FILES HAVE BEEN CREATED. NOW WRITE THE INITIAL DATA TO BOTH    */
10168     /*   OF THE FILES.                                                       */
10169     /* --------------------------------------------------------------------- */
10170     filePtr.i = crestartInfoFile[0];
10171     ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
10172     writeRestorableGci(signal, filePtr);
10173     filePtr.p->reqStatus = FileRecord::WRITE_INIT_GCP;
10174   }//if
10175 }//Dbdih::creatingGcpLab()
10176 
10177 /* ------------------------------------------------------------------------- */
10178 /*       WE HAVE SUCCESSFULLY WRITTEN A GCI FILE.                            */
10179 /* ------------------------------------------------------------------------- */
writeInitGcpLab(Signal * signal,FileRecordPtr filePtr)10180 void Dbdih::writeInitGcpLab(Signal* signal, FileRecordPtr filePtr)
10181 {
10182   filePtr.p->reqStatus = FileRecord::IDLE;
10183   if (filePtr.i == crestartInfoFile[0]) {
10184     jam();
10185     /* --------------------------------------------------------------------- */
10186     /*   WE HAVE WRITTEN THE FIRST FILE NOW ALSO WRITE THE SECOND FILE.      */
10187     /* --------------------------------------------------------------------- */
10188     filePtr.i = crestartInfoFile[1];
10189     ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
10190     writeRestorableGci(signal, filePtr);
10191     filePtr.p->reqStatus = FileRecord::WRITE_INIT_GCP;
10192   } else {
10193     /* --------------------------------------------------------------------- */
10194     /*   WE HAVE WRITTEN BOTH FILES. LEAVE BOTH FILES OPEN AND CONFIRM OUR   */
10195     /*   PART OF THE INITIAL START.                                          */
10196     /* --------------------------------------------------------------------- */
10197     if (isMaster()) {
10198       jam();
10199       /*---------------------------------------------------------------------*/
10200       // IN MASTER NODES THE START REQUEST IS RECEIVED FROM NDBCNTR AND WE MUST
10201       // RESPOND WHEN COMPLETED.
10202       /*---------------------------------------------------------------------*/
10203       signal->theData[0] = reference();
10204       sendSignal(cndbStartReqBlockref, GSN_NDB_STARTCONF, signal, 1, JBB);
10205     } else {
10206       jam();
10207       ndbsttorry10Lab(signal, __LINE__);
10208       return;
10209     }//if
10210   }//if
10211 }//Dbdih::writeInitGcpLab()
10212 
log_setNoSend()10213 void Dbdih::log_setNoSend()
10214 {
10215   g_eventLogger->info("Disable send assistance for main thread in large"
10216                       " clusters");
10217 }
10218 
10219 /*****************************************************************************/
10220 /* **********     NODES DELETION MODULE                          *************/
10221 /*****************************************************************************/
10222 /*---------------------------------------------------------------------------*/
10223 /*                    LOGIC FOR NODE FAILURE                                 */
10224 /*---------------------------------------------------------------------------*/
/**
 * NODE_FAILREP: one or more nodes have failed. Distribute the report
 * to the other blocks, update node state, possibly take over the
 * master role, and abort/complete any protocols the failed nodes were
 * involved in. The per-step ordering below is significant.
 */
void Dbdih::execNODE_FAILREP(Signal* signal)
{
  Uint32 i;
  Uint32 failedNodes[MAX_NDB_NODES];
  jamEntry();
  NodeFailRep * const nodeFail = (NodeFailRep *)&signal->theData[0];
  NdbNodeBitmask allFailed;

  /* The failed-node bitmask arrives either as a section (newer
   * versions, supports the full bitmask size) or inline in the
   * signal (48-node legacy format). */
  if (signal->getNoOfSections() >= 1)
  {
    jam();
    ndbrequire(getNodeInfo(refToNode(signal->getSendersBlockRef())).m_version);
    SegmentedSectionPtr ptr;
    SectionHandle handle(this, signal);
    handle.getSection(ptr, 0);
    ndbrequire(ptr.sz <= NdbNodeBitmask::Size);
    copy(allFailed.rep.data, ptr);
    releaseSections(handle);
  }
  else
  {
    allFailed.assign(NdbNodeBitmask48::Size, nodeFail->theNodes);
  }

  cfailurenr = nodeFail->failNo;
  Uint32 newMasterId = nodeFail->masterNodeId;
  const Uint32 noOfFailedNodes = nodeFail->noOfNodes;

  /* Send NODE_FAILREP to rest of blocks (not NDBCNTR, QMGR, DBDIH).
   * Some of them will respond with NF_COMPLETEREP to DBDIH when they handled
   * the node failure.
   */

  LinearSectionPtr lsptr[1];
  lsptr[0].p = allFailed.rep.data;
  lsptr[0].sz = allFailed.getPackedLengthInWords();

  sendSignal(DBTC_REF, GSN_NODE_FAILREP, signal,
             NodeFailRep::SignalLength, JBB, lsptr, 1);

  sendSignal(DBLQH_REF, GSN_NODE_FAILREP, signal,
             NodeFailRep::SignalLength, JBB, lsptr, 1);

  sendSignal(DBDICT_REF, GSN_NODE_FAILREP, signal,
             NodeFailRep::SignalLength, JBB, lsptr, 1);

  sendSignal(BACKUP_REF, GSN_NODE_FAILREP, signal,
             NodeFailRep::SignalLength, JBB, lsptr, 1);

  sendSignal(SUMA_REF, GSN_NODE_FAILREP, signal,
             NodeFailRep::SignalLength, JBB, lsptr, 1);

  sendSignal(DBUTIL_REF, GSN_NODE_FAILREP, signal,
             NodeFailRep::SignalLength, JBB, lsptr, 1);

  sendSignal(DBTUP_REF, GSN_NODE_FAILREP, signal,
             NodeFailRep::SignalLength, JBB, lsptr, 1);

  sendSignal(TSMAN_REF, GSN_NODE_FAILREP, signal,
             NodeFailRep::SignalLength, JBB, lsptr, 1);

  sendSignal(LGMAN_REF, GSN_NODE_FAILREP, signal,
             NodeFailRep::SignalLength, JBB, lsptr, 1);

  sendSignal(DBSPJ_REF, GSN_NODE_FAILREP, signal,
             NodeFailRep::SignalLength, JBB, lsptr, 1);

  /* Error-insert test hooks for node-failure scenarios. */
  if (ERROR_INSERTED(7179) || ERROR_INSERTED(7217))
  {
    CLEAR_ERROR_INSERT_VALUE;
    CLEAR_ERROR_INSERT_EXTRA;
  }

  if (ERROR_INSERTED(7184))
  {
    SET_ERROR_INSERT_VALUE(7000);
  }

  c_increase_lcp_speed_after_nf = true;

  /*-------------------------------------------------------------------------*/
  // The first step is to convert from a bit mask to an array of failed nodes.
  /*-------------------------------------------------------------------------*/
  Uint32 index = 0;
  for (i = 1; i <= m_max_node_id; i++)
  {
    if (allFailed.get(i))
    {
      jamLine(i);
      failedNodes[index] = i;
      index++;
    }//if
  }//for
  ndbrequire(noOfFailedNodes == index);
  ndbrequire(noOfFailedNodes - 1 < MAX_NDB_NODES);

  /*-------------------------------------------------------------------------*/
  // The second step is to update the node status of the failed nodes, remove
  // them from the alive node list and put them into the dead node list. Also
  // update the number of nodes on-line.
  // We also set certain state variables ensuring that the node no longer is
  // used in transactions and also mark that we received this signal.
  /*-------------------------------------------------------------------------*/
  for (i = 0; i < noOfFailedNodes; i++) {
    jam();
    NodeRecordPtr TNodePtr;
    TNodePtr.i = failedNodes[i];
    ptrCheckGuard(TNodePtr, MAX_NDB_NODES, nodeRecord);
    setNodeRecoveryStatus(TNodePtr.i, NodeRecord::NODE_FAILED);
    make_node_not_usable(TNodePtr.p);
    TNodePtr.p->m_inclDihLcp = false;
    TNodePtr.p->recNODE_FAILREP = ZTRUE;
    if (TNodePtr.p->nodeStatus == NodeRecord::ALIVE) {
      jam();
      con_lineNodes--;
      TNodePtr.p->nodeStatus = NodeRecord::DIED_NOW;
      removeAlive(TNodePtr);
      insertDeadNode(TNodePtr);
    }//if
  }//for

  /*-------------------------------------------------------------------------*/
  // Verify that we can continue to operate the cluster. If we cannot we will
  // not return from checkEscalation.
  /*-------------------------------------------------------------------------*/
  checkEscalation();

  /*------------------------------------------------------------------------*/
  // Verify that a starting node has also crashed. Reset the node start record.
  /*-------------------------------------------------------------------------*/
#if 0
  /**
   * Node will crash by itself...
   *   nodeRestart is run then...
   */
  if (false && c_nodeStartMaster.startNode != RNIL && getNodeStatus(c_nodeStartMaster.startNode) == NodeRecord::ALIVE)
  {
    BlockReference cntrRef = calcNdbCntrBlockRef(c_nodeStartMaster.startNode);
    SystemError * const sysErr = (SystemError*)&signal->theData[0];
    sysErr->errorCode = SystemError::StartInProgressError;
    sysErr->errorRef = reference();
    sysErr->data[0]= 0;
    sysErr->data[1]= __LINE__;
    sendSignal(cntrRef, GSN_SYSTEM_ERROR, signal,  SystemError::SignalLength, JBA);
    nodeResetStart(signal);
  }//if
#endif

  if (is_lcp_paused())
  {
    /**
     * Stop any LCP pausing, a node has crashed, this implies that also the
     * node that caused us to pause the LCP has crashed.
     */
    jam();
    handle_node_failure_in_pause(signal);
  }
  /*--------------------------------------------------*/
  /*                                                  */
  /*       WE CHANGE THE REFERENCE TO MASTER DIH      */
  /*       BLOCK AND POINTER AT THIS PLACE IN THE CODE*/
  /*--------------------------------------------------*/
  Uint32 oldMasterId = cmasterNodeId;
  BlockReference oldMasterRef = cmasterdihref;
  cmasterdihref = calcDihBlockRef(newMasterId);
  cmasterNodeId = newMasterId;

  /* If we just became master of a large cluster, stop the main thread
   * from assisting with sends (see log_setNoSend). */
  if (cmasterNodeId == getOwnNodeId() &&
      con_lineNodes >= 16)
  {
    log_setNoSend();
    setNoSend();
  }
  const bool masterTakeOver = (oldMasterId != newMasterId);
  bool check_more_start_lcp = false;
  for(i = 0; i < noOfFailedNodes; i++) {
    NodeRecordPtr failedNodePtr;
    failedNodePtr.i = failedNodes[i];
    ptrCheckGuard(failedNodePtr, MAX_NDB_NODES, nodeRecord);
    if (oldMasterRef == reference()) {
      /*-------------------------------------------------------*/
      // Functions that need to be called only for master nodes.
      /*-------------------------------------------------------*/
      checkCopyTab(signal, failedNodePtr);
      checkStopPermMaster(signal, failedNodePtr);
      checkWaitGCPMaster(signal, failedNodes[i]);

      {
        Ptr<TakeOverRecord> takeOverPtr;
        if (findTakeOver(takeOverPtr, failedNodePtr.i))
        {
          handleTakeOver(signal, takeOverPtr);
        }
      }
      checkGcpOutstanding(signal, failedNodePtr.i);
    } else {
      jam();
      /*-----------------------------------------------------------*/
      // Functions that need to be called only for nodes that were
      // not master before these failures.
      /*-----------------------------------------------------------*/
      checkStopPermProxy(signal, failedNodes[i]);
      checkWaitGCPProxy(signal, failedNodes[i]);
    }//if
    /*--------------------------------------------------*/
    // Functions that need to be called for all nodes.
    /*--------------------------------------------------*/
    checkStopMe(signal, failedNodePtr);
    failedNodeLcpHandling(signal, failedNodePtr, check_more_start_lcp);
    startRemoveFailedNode(signal, failedNodePtr);

    /**
     * This is the last function called
     *   It modifies failedNodePtr.p->nodeStatus
     */
    failedNodeSynchHandling(signal, failedNodePtr);
  }//for
  if(masterTakeOver){
    jam();
    NodeRecordPtr nodePtr;
    g_eventLogger->info("Master takeover started from %u", oldMasterId);
    /* No alive node may have started or queued checkpoints at this
     * point; the LCP will be restarted by the takeover. */
    for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++)
    {
      ptrAss(nodePtr, nodeRecord);
      if (nodePtr.p->nodeStatus == NodeRecord::ALIVE)
      {
        jamLine(nodePtr.i);
        ndbrequire(nodePtr.p->noOfStartedChkpt == 0);
        ndbrequire(nodePtr.p->noOfQueuedChkpt == 0);
      }
    }
    startLcpMasterTakeOver(signal, oldMasterId);
    startGcpMasterTakeOver(signal, oldMasterId);

    if(getNodeState().getNodeRestartInProgress()){
      jam();
      progError(__LINE__, NDBD_EXIT_MASTER_FAILURE_DURING_NR);
    }
  }

  if (isMaster()) {
    jam();
    setNodeRestartInfoBits(signal);
  }//if

  // Request max lag recalculation to reflect new cluster scale
  // after a node failure
  m_gcp_monitor.m_gcp_save.m_need_max_lag_recalc = true;
  m_gcp_monitor.m_micro_gcp.m_need_max_lag_recalc = true;

  /**
   * Need to check if a node failed that was part of LCP. In this
   * case we need to ensure that we don't get LCP hang by checking
   * for sending of LCP_FRAG_ORD with last fragment flag set.
   *
   * This code cannot be called in master takeover case, in this
   * case we restart the LCP in DIH entirely, so no need to worry
   * here.
   */
  if (check_more_start_lcp &&
      c_lcpMasterTakeOverState.state == LMTOS_IDLE)
  {
    jam();
    ndbrequire(isMaster());
    startNextChkpt(signal);
  }
}//Dbdih::execNODE_FAILREP()
10492 
/**
 * Master-only: the failed node was possibly in the middle of being
 * started by us. Abort the outstanding phase of the node-start
 * protocol and reset the start record so another node can start.
 */
void Dbdih::checkCopyTab(Signal* signal, NodeRecordPtr failedNodePtr)
{
  jam();

  /* Nothing to do unless the failed node is the one we are starting. */
  if(c_nodeStartMaster.startNode != failedNodePtr.i){
    jam();
    return;
  }

  switch(c_nodeStartMaster.m_outstandingGsn){
  case GSN_COPY_TABREQ:
    jam();
    /* Table-copy in progress: free the pages buffered for the copy
     * and stop waiting for the failed node's COPY_TABCONF. */
    releaseTabPages(failedNodePtr.p->activeTabptr);
    if (c_COPY_TABREQ_Counter.isWaitingFor(failedNodePtr.i))
    {
      jam();
      c_COPY_TABREQ_Counter.clearWaitingFor(failedNodePtr.i);
    }
    c_nodeStartMaster.wait = ZFALSE;
    break;
  case GSN_START_INFOREQ:
  case GSN_START_PERMCONF:
  case GSN_DICTSTARTREQ:
  case GSN_COPY_GCIREQ:
    jam();
    /* These phases need no extra cleanup beyond nodeResetStart below. */
    break;
  default:
    g_eventLogger->error("outstanding gsn: %s(%d)",
                         getSignalName(c_nodeStartMaster.m_outstandingGsn),
                         c_nodeStartMaster.m_outstandingGsn);
    ndbabort();
  }

  /* Release the fragment-info mutex if the start held it. */
  if (!c_nodeStartMaster.m_fragmentInfoMutex.isNull())
  {
    jam();
    Mutex mutex(signal, c_mutexMgr, c_nodeStartMaster.m_fragmentInfoMutex);
    mutex.unlock();
  }

  nodeResetStart(signal);
}//Dbdih::checkCopyTab()
10535 
checkStopMe(Signal * signal,NodeRecordPtr failedNodePtr)10536 void Dbdih::checkStopMe(Signal* signal, NodeRecordPtr failedNodePtr)
10537 {
10538   jam();
10539   if (c_STOP_ME_REQ_Counter.isWaitingFor(failedNodePtr.i)){
10540     jam();
10541     ndbrequire(c_stopMe.clientRef != 0);
10542     StopMeConf * const stopMeConf = (StopMeConf *)&signal->theData[0];
10543     stopMeConf->senderRef = calcDihBlockRef(failedNodePtr.i);
10544     stopMeConf->senderData = c_stopMe.clientData;
10545     sendSignal(reference(), GSN_STOP_ME_CONF, signal,
10546 	       StopMeConf::SignalLength, JBB);
10547   }//if
10548 }//Dbdih::checkStopMe()
10549 
checkStopPermMaster(Signal * signal,NodeRecordPtr failedNodePtr)10550 void Dbdih::checkStopPermMaster(Signal* signal, NodeRecordPtr failedNodePtr)
10551 {
10552   DihSwitchReplicaRef* const ref = (DihSwitchReplicaRef*)&signal->theData[0];
10553   jam();
10554   if (c_DIH_SWITCH_REPLICA_REQ_Counter.isWaitingFor(failedNodePtr.i)){
10555     jam();
10556     ndbrequire(c_stopPermMaster.clientRef != 0);
10557     ref->senderNode = failedNodePtr.i;
10558     ref->errorCode = StopPermRef::NF_CausedAbortOfStopProcedure;
10559     sendSignal(reference(), GSN_DIH_SWITCH_REPLICA_REF, signal,
10560                DihSwitchReplicaRef::SignalLength, JBB);
10561     return;
10562   }//if
10563 }//Dbdih::checkStopPermMaster()
10564 
checkStopPermProxy(Signal * signal,NodeId failedNodeId)10565 void Dbdih::checkStopPermProxy(Signal* signal, NodeId failedNodeId)
10566 {
10567   jam();
10568   if(c_stopPermProxy.clientRef != 0 &&
10569      refToNode(c_stopPermProxy.masterRef) == failedNodeId){
10570 
10571     /**
10572      * The master has failed report to proxy-client
10573      */
10574     jam();
10575     StopPermRef* const ref = (StopPermRef*)&signal->theData[0];
10576 
10577     ref->senderData = c_stopPermProxy.clientData;
10578     ref->errorCode  = StopPermRef::NF_CausedAbortOfStopProcedure;
10579     sendSignal(c_stopPermProxy.clientRef, GSN_STOP_PERM_REF, signal, 2, JBB);
10580     c_stopPermProxy.clientRef = 0;
10581   }//if
10582 }//Dbdih::checkStopPermProxy()
10583 
/**
 * Master-only: a node involved in a take-over (copy-fragment process)
 * has failed. Clean up according to how far the take-over had come;
 * the action per state is to release the record, abort the protocol,
 * or do nothing (mutex-wait states are handled when the lock arrives).
 */
void
Dbdih::handleTakeOver(Signal* signal, TakeOverRecordPtr takeOverPtr)
{
  jam();
  switch(takeOverPtr.p->toMasterStatus){
  case TakeOverRecord::TO_MASTER_IDLE:
    jam();
    /* Take-over not yet active: just free the record. */
    releaseTakeOver(takeOverPtr, true);
    return;
  case TakeOverRecord::TO_MUTEX_BEFORE_STORED:
    jam();
    /**
     * Waiting for lock...
     *   do nothing...will be detected when lock is acquired
     */
    return;
  case TakeOverRecord::TO_MUTEX_BEFORE_LOCKED:
    jam();
    /**
     * Has lock...and NGPtr reservation...
     */
    abortTakeOver(signal, takeOverPtr);
    return;
  case TakeOverRecord::TO_AFTER_STORED:{
    jam();
    /**
     * No lock...but NGPtr reservation...remove NGPtr reservation
     */
    NodeRecordPtr nodePtr;
    NodeGroupRecordPtr NGPtr;
    nodePtr.i = takeOverPtr.p->toCopyNode;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    NGPtr.i = nodePtr.p->nodeGroup;
    ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);

    /* Only clear the reservation if it is still held by this
     * starting node (guards against stale records). */
    ndbassert(NGPtr.p->activeTakeOver == takeOverPtr.p->toStartingNode);
    if (NGPtr.p->activeTakeOver == takeOverPtr.p->toStartingNode)
    {
      jam();
      NGPtr.p->activeTakeOver = 0;
      NGPtr.p->activeTakeOverCount = 0;
    }
    releaseTakeOver(takeOverPtr, true);
    return;
  }
  case TakeOverRecord::TO_MUTEX_BEFORE_COMMIT:
    jam();
    /**
     * Waiting for lock...
     *   do nothing...will be detected when lock is acquired
     */
    return;
  case TakeOverRecord::TO_MUTEX_BEFORE_SWITCH_REPLICA:
    jam();
    /**
     * Waiting for lock...
     *   do nothing...will be detected when lock is acquired
     */
    return;
  case TakeOverRecord::TO_MUTEX_AFTER_SWITCH_REPLICA:
    jam();
    abortTakeOver(signal, takeOverPtr);
    return;
  case TakeOverRecord::TO_WAIT_LCP:{
    jam();
    /**
     * Waiting for LCP; the copy is no longer considered complete.
     */
    NodeRecordPtr nodePtr;
    nodePtr.i = takeOverPtr.p->toStartingNode;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    nodePtr.p->copyCompleted = 0;
    releaseTakeOver(takeOverPtr, true);
    return;
  }
  default:
    /* Unexpected state: record it in the jam trace and crash. */
    jamLine(takeOverPtr.p->toMasterStatus);
    ndbabort();
  }
}
10664 
/**
 * Initialise book-keeping for a newly failed node so that we can track
 * when node failure handling completes, both in the local blocks
 * (DICT/TC/DIH/LQH) and on every other still-alive node.
 *
 * If the node never got started, it is marked DEAD immediately and DIH's
 * own completion is reported at once via a locally sent NF_COMPLETEREP.
 *
 * @param signal        signal object, reused to send NF_COMPLETEREP
 * @param failedNodePtr node record of the node that just failed
 */
void Dbdih::failedNodeSynchHandling(Signal* signal,
				    NodeRecordPtr failedNodePtr)
{
  jam();
  /*----------------------------------------------------*/
  /*       INITIALISE THE VARIABLES THAT KEEP TRACK OF  */
  /*       WHEN A NODE FAILURE IS COMPLETED.            */
  /*----------------------------------------------------*/
  // Each local block reports its completion separately.
  failedNodePtr.p->dbdictFailCompleted = ZFALSE;
  failedNodePtr.p->dbtcFailCompleted = ZFALSE;
  failedNodePtr.p->dbdihFailCompleted = ZFALSE;
  failedNodePtr.p->dblqhFailCompleted = ZFALSE;

  failedNodePtr.p->m_NF_COMPLETE_REP.clearWaitingFor();

  // Build the set of nodes we must hear NF_COMPLETEREP from: all nodes
  // that are currently ALIVE participate in the failure handling.
  NodeRecordPtr nodePtr;
  for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++)
  {
    ptrAss(nodePtr, nodeRecord);
    if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
      jam();
      /**
       * We're waiting for nodePtr.i to complete
       * handling of failedNodePtr.i's death
       */

      failedNodePtr.p->m_NF_COMPLETE_REP.setWaitingFor(nodePtr.i);
    } else {
      jam();
      if ((nodePtr.p->nodeStatus == NodeRecord::DYING) &&
          (nodePtr.p->m_NF_COMPLETE_REP.isWaitingFor(failedNodePtr.i))){
        jam();
	/*----------------------------------------------------*/
	/*       THE NODE FAILED BEFORE REPORTING THE FAILURE */
	/*       HANDLING COMPLETED ON THIS FAILED NODE.      */
	/*       REPORT THAT NODE FAILURE HANDLING WAS        */
	/*       COMPLETED ON THE NEW FAILED NODE FOR THIS    */
	/*       PARTICULAR OLD FAILED NODE.                  */
	/*----------------------------------------------------*/
        // Fake the NF_COMPLETEREP that the newly failed node can no
        // longer send for the earlier (still DYING) failed node.
        NFCompleteRep * const nf = (NFCompleteRep *)&signal->theData[0];
        nf->blockNo = 0;
        nf->nodeId  = failedNodePtr.i;
        nf->failedNodeId = nodePtr.i;
	nf->from    = __LINE__;
        sendSignal(reference(), GSN_NF_COMPLETEREP, signal,
                   NFCompleteRep::SignalLength, JBB);
      }//if
    }//if
  }//for
  if (failedNodePtr.p->nodeStatus == NodeRecord::DIED_NOW) {
    jam();
    failedNodePtr.p->nodeStatus = NodeRecord::DYING;
  } else {
    jam();
    /*----------------------------------------------------*/
    // No more processing needed when node not even started
    // yet. We give the node status DEAD since we do not
    // care whether all nodes complete the node failure
    // handling. The node has not been included in the
    // node failure protocols.
    /*----------------------------------------------------*/
    failedNodePtr.p->nodeStatus = NodeRecord::DEAD;
    /**-----------------------------------------------------------------------
     * WE HAVE COMPLETED HANDLING THE NODE FAILURE IN DIH. WE CAN REPORT THIS
     * TO DIH THAT WAIT FOR THE OTHER BLOCKS TO BE CONCLUDED AS WELL.
     *-----------------------------------------------------------------------*/
    NFCompleteRep * const nf = (NFCompleteRep *)&signal->theData[0];
    nf->blockNo      = DBDIH;
    nf->nodeId       = cownNodeId;
    nf->failedNodeId = failedNodePtr.i;
    nf->from         = __LINE__;
    sendSignal(reference(), GSN_NF_COMPLETEREP, signal,
               NFCompleteRep::SignalLength, JBB);
  }//if
}//Dbdih::failedNodeSynchHandling()
10740 
10741 bool
findTakeOver(Ptr<TakeOverRecord> & ptr,Uint32 failedNodeId)10742 Dbdih::findTakeOver(Ptr<TakeOverRecord> & ptr, Uint32 failedNodeId)
10743 {
10744   for (c_masterActiveTakeOverList.first(ptr); !ptr.isNull();
10745        c_masterActiveTakeOverList.next(ptr))
10746   {
10747     jam();
10748     if (ptr.p->toStartingNode == failedNodeId)
10749     {
10750       jam();
10751       return true;
10752     }
10753   }
10754   ptr.setNull();
10755   return false;
10756 }//Dbdih::findTakeOver()
10757 
/**
 * Adjust LCP-related state after a node failure.
 *
 * On the master, if the failed node participated in the ongoing LCP, its
 * Sysfile active status is downgraded (it has now missed an LCP) and
 * check_more_start_lcp is set so the caller can later call
 * checkStartMoreLcp once all failed nodes have been processed.
 *
 * The node is then removed from the LCP participant bitmasks, and for every
 * protocol counter still waiting for the failed node (LCP_COMPLETE_REP from
 * DIH and LQH, TCGETOPSIZE, TC_CLOPSIZE, START_LCP, MASTER_LCPREQ) a reply
 * is faked via a signal to ourselves so the protocols are not blocked.
 *
 * @param signal               signal object used to send the faked replies
 * @param failedNodePtr        node record of the failed node
 * @param check_more_start_lcp out: set true when checkStartMoreLcp should
 *                             be run after all node failures are handled
 */
void Dbdih::failedNodeLcpHandling(Signal* signal,
                                  NodeRecordPtr failedNodePtr,
                                  bool & check_more_start_lcp)
{
  jam();
  const Uint32 nodeId = failedNodePtr.i;

  if (isMaster() && c_lcpState.m_participatingLQH.get(failedNodePtr.i))
  {
    /*----------------------------------------------------*/
    /*  THE NODE WAS INVOLVED IN A LOCAL CHECKPOINT. WE   */
    /* MUST UPDATE THE ACTIVE STATUS TO INDICATE THAT     */
    /* THE NODE HAVE MISSED A LOCAL CHECKPOINT.           */
    /*----------------------------------------------------*/

    /**
     * Bug#28717, Only master should do this, as this status is copied
     *   to other nodes
     */
    switch (failedNodePtr.p->activeStatus) {
    case Sysfile::NS_Active:
      jam();
      failedNodePtr.p->activeStatus = Sysfile::NS_ActiveMissed_1;
      break;
    case Sysfile::NS_ActiveMissed_1:
      jam();
      failedNodePtr.p->activeStatus = Sysfile::NS_ActiveMissed_1;
      break;
    case Sysfile::NS_ActiveMissed_2:
      jam();
      // Missed too many LCPs: node must be taken over before rejoining.
      failedNodePtr.p->activeStatus = Sysfile::NS_NotActive_NotTakenOver;
      break;
    case Sysfile::NS_TakeOver:
      jam();
      // Take-over was in progress; it is aborted by the failure.
      failedNodePtr.p->activeStatus = Sysfile::NS_NotActive_NotTakenOver;
      break;
    case Sysfile::NS_Configured:
      jam();
      break;
    default:
      g_eventLogger->error("activeStatus = %u "
                           "at failure after NODE_FAILREP of node = %u",
                           (Uint32) failedNodePtr.p->activeStatus,
                           failedNodePtr.i);
      ndbabort();
    }//switch
    jam();
    /**
     * It could be that the ongoing LCP is only waiting for our node, so
     * it is important to here call checkStartMoreLcp. We need to go
     * through all nodes first though to ensure that we don't call
     * this and start checkpoints towards nodes already failed.
     */
    failedNodePtr.p->noOfQueuedChkpt = 0;
    failedNodePtr.p->noOfStartedChkpt = 0;
    check_more_start_lcp = true;
  }//if

  c_lcpState.m_participatingDIH.clear(failedNodePtr.i);
  c_lcpState.m_participatingLQH.clear(failedNodePtr.i);

  // NOTE(review): when MASTER_LCPREQ is outstanding for this node, the
  // LQH-side LCP_COMPLETE_REP below is not faked here — presumably the LCP
  // master take-over covers that case; confirm against the takeover code.
  bool wf = c_MASTER_LCPREQ_Counter.isWaitingFor(failedNodePtr.i);

  if(c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(failedNodePtr.i))
  {
    jam();
    /**
     * Mark the signal as a special signal to distinguish it from a signal
     * that arrives from time queue for a dead node that should not be
     * handled. The marking here makes it known to the LCP_COMPLETE_REP
     * that this is a special node failure handling signal which should
     * be allowed to pass through although the node is dead.
     */
    LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
    rep->nodeId = failedNodePtr.i;
    rep->lcpId = SYSFILE->latestLCP_ID;
    rep->blockNo = DBDIH;
    rep->fromTQ = 0;
    sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal,
               LcpCompleteRep::SignalLengthTQ, JBB);
  }

  bool lcp_complete_rep = false;
  if (!wf)
  {
    jam();

    /**
     * Check if we're waiting for the failed node's LQH to complete
     *
     * Note that this is ran "before" LCP master take over
     */
    if(c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId)){
      jam();

      lcp_complete_rep = true;
      LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
      rep->nodeId  = nodeId;
      rep->lcpId   = SYSFILE->latestLCP_ID;
      rep->blockNo = DBLQH;
      rep->fromTQ = 0;
      sendSignal(reference(), GSN_LCP_COMPLETE_REP, signal,
                 LcpCompleteRep::SignalLengthTQ, JBB);

      if(c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodeId)){
        jam();
        /**
         * Make sure we're ready to accept it
         */
        c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(nodeId);
      }
    }
  }

  // Fake TCGETOPSIZECONF from the failed node (op size 0).
  if (c_TCGETOPSIZEREQ_Counter.isWaitingFor(failedNodePtr.i)) {
    jam();
    signal->theData[0] = failedNodePtr.i;
    signal->theData[1] = 0;
    sendSignal(reference(), GSN_TCGETOPSIZECONF, signal, 2, JBB);
  }//if

  // Fake TC_CLOPSIZECONF from the failed node.
  if (c_TC_CLOPSIZEREQ_Counter.isWaitingFor(failedNodePtr.i)) {
    jam();
    signal->theData[0] = failedNodePtr.i;
    sendSignal(reference(), GSN_TC_CLOPSIZECONF, signal, 1, JBB);
  }//if

  // Fake START_LCP_CONF on behalf of the failed node's LQH.
  if (c_START_LCP_REQ_Counter.isWaitingFor(failedNodePtr.i)) {
    jam();
    StartLcpConf * conf = (StartLcpConf*)signal->getDataPtrSend();
    conf->senderRef = numberToRef(DBLQH, failedNodePtr.i);
    conf->lcpId = SYSFILE->latestLCP_ID;
    sendSignal(reference(), GSN_START_LCP_CONF, signal,
               StartLcpConf::SignalLength, JBB);
  }//if

  // Fake MASTER_LCPREF so the LCP master take-over can proceed.
  if (c_MASTER_LCPREQ_Counter.isWaitingFor(failedNodePtr.i)) {
    jam();
    MasterLCPRef * const ref = (MasterLCPRef *)&signal->theData[0];
    ref->senderNodeId = failedNodePtr.i;
    ref->failedNodeId = cmasterTakeOverNode;
    sendSignal(reference(), GSN_MASTER_LCPREF, signal,
               MasterLCPRef::SignalLength, JBB);
  }//if

}//Dbdih::failedNodeLcpHandling()
10904 
/**
 * Unblock the GCP protocols after a node failure: for every GCP-related
 * protocol counter that is still waiting for the failed node
 * (GCP_PREPARE, GCP_COMMIT, GCP_SAVEREQ, COPY_GCIREQ, MASTER_GCPREQ,
 * SUB_GCP_COMPLETE_REP), fake the reply the node can no longer send by
 * sending it to ourselves.
 *
 * @param signal       signal object used to send the faked replies
 * @param failedNodeId id of the failed node
 */
void Dbdih::checkGcpOutstanding(Signal* signal, Uint32 failedNodeId){
  // Fake GCP_PREPARECONF for the epoch currently being prepared.
  if (c_GCP_PREPARE_Counter.isWaitingFor(failedNodeId)){
    jam();
    GCPPrepareConf* conf = (GCPPrepareConf*)signal->getDataPtrSend();
    conf->nodeId = failedNodeId;
    conf->gci_hi = Uint32(m_micro_gcp.m_master.m_new_gci >> 32);
    conf->gci_lo = Uint32(m_micro_gcp.m_master.m_new_gci);
    sendSignal(reference(), GSN_GCP_PREPARECONF, signal,
               GCPPrepareConf::SignalLength, JBB);
  }//if

  if (c_GCP_COMMIT_Counter.isWaitingFor(failedNodeId))
  {
    jam();
    /* Record minimum failure number, will cause re-send of
     * GCP_NOMORETRANS if local GCP_NODEFINISH arrives before
     * TC has handled the failure.
     */
    cMinTcFailNo = cfailurenr;

    /**
     * Waiting for GSN_GCP_NODEFINISH
     *   TC-take-over can generate new transactions
     *   that will be in this epoch
     *   re-run GCP_NOMORETRANS to master-TC (self) that will run
     *   take-over
     */
    c_GCP_COMMIT_Counter.clearWaitingFor(failedNodeId);

    /* Check to see whether we have already received GCP_NODEFINISH
     * from the local (Master) TC instance
     */
    if (!c_GCP_COMMIT_Counter.isWaitingFor(getOwnNodeId()))
    {
      jam();
      /* Already received GCP_NODEFINISH for this GCI, must
       * resend GCP_NOMORETRANS request now.
       * Otherwise we will re-send it when GCP_NODEFINISH
       * arrives.
       */
      c_GCP_COMMIT_Counter.setWaitingFor(getOwnNodeId());
      /* Reset DIH GCP state */
      m_micro_gcp.m_state = MicroGcp::M_GCP_COMMIT;

      GCPNoMoreTrans* req = (GCPNoMoreTrans*)signal->getDataPtrSend();
      req->senderRef = reference();
      req->senderData = m_micro_gcp.m_master_ref;
      req->gci_hi = Uint32(m_micro_gcp.m_old_gci >> 32);
      req->gci_lo = Uint32(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
      sendSignal(clocaltcblockref, GSN_GCP_NOMORETRANS, signal,
                 GCPNoMoreTrans::SignalLength, JBB);
    }
  }

  // Fake GCP_SAVEREF with a dedicated error code so the save protocol
  // can distinguish this from a real refusal.
  if (c_GCP_SAVEREQ_Counter.isWaitingFor(failedNodeId)) {
    jam();
    GCPSaveRef * const saveRef = (GCPSaveRef*)&signal->theData[0];
    saveRef->dihPtr = failedNodeId;
    saveRef->nodeId = failedNodeId;
    saveRef->gci    = m_gcp_save.m_master.m_new_gci;
    saveRef->errorCode = GCPSaveRef::FakedSignalDueToNodeFailure;
    sendSignal(reference(), GSN_GCP_SAVEREF, signal,
               GCPSaveRef::SignalLength, JBB);
  }//if

  // Fake COPY_GCICONF.
  if (c_COPY_GCIREQ_Counter.isWaitingFor(failedNodeId)) {
    jam();
    signal->theData[0] = failedNodeId;
    sendSignal(reference(), GSN_COPY_GCICONF, signal, 1, JBB);
  }//if

  // Fake MASTER_GCPREF for an ongoing GCP master take-over.
  if (c_MASTER_GCPREQ_Counter.isWaitingFor(failedNodeId)){
    jam();
    MasterGCPRef * const ref = (MasterGCPRef *)&signal->theData[0];
    ref->senderNodeId = failedNodeId;
    ref->failedNodeId = cmasterTakeOverNode;
    sendSignal(reference(), GSN_MASTER_GCPREF, signal,
               MasterGCPRef::SignalLength, JBB);
  }//if

  // Fake SUB_GCP_COMPLETE_ACK on behalf of the failed node's DIH.
  if (c_SUB_GCP_COMPLETE_REP_Counter.isWaitingFor(failedNodeId))
  {
    jam();
    SubGcpCompleteAck* ack = CAST_PTR(SubGcpCompleteAck,
                                      signal->getDataPtrSend());
    ack->rep.senderRef = numberToRef(DBDIH, failedNodeId);
    sendSignal(reference(), GSN_SUB_GCP_COMPLETE_ACK, signal,
               SubGcpCompleteAck::SignalLength, JBB);
  }
}
10995 
/**
 * Start LCP master take-over after the failure of node nodeId.
 *
 * If a previous take-over for another failed node is still registered in
 * c_lcpMasterTakeOverState, its NF_LCP_TAKE_OVER node-failure step is
 * completed first. The take-over state is then (re)initialised to restart
 * LCP_FRAG_ORD distribution from table 0 / fragment 0; LQH tracks which
 * LCP_FRAG_ORDs it has already received, so duplicates are simply dropped.
 *
 * @param signal signal object
 * @param nodeId id of the newly failed (old master) node
 */
void
Dbdih::startLcpMasterTakeOver(Signal* signal, Uint32 nodeId)
{
  jam();

  if (ERROR_INSERTED(7230))
  {
    // Error insert: skip LCP master take-over entirely (test hook).
    return;
  }

  Uint32 oldNode = c_lcpMasterTakeOverState.failedNodeId;

  // If a take-over for an earlier failed node is still pending its
  // NF_LCP_TAKE_OVER step, complete that step before overwriting the state.
  NodeRecordPtr nodePtr;
  nodePtr.i = oldNode;
  if (oldNode > 0 && oldNode < MAX_NDB_NODES)
  {
    jam();
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    if (nodePtr.p->m_nodefailSteps.get(NF_LCP_TAKE_OVER))
    {
      jam();
      checkLocalNodefailComplete(signal, oldNode, NF_LCP_TAKE_OVER);
    }
  }

  {
    jam();
    /**
     * As of NDBD_EMPTY_LCP_PROTOCOL_NOT_NEEDED version this is the
     * normal path through the code. In 8.0 we removed upgrade from
     * these older versions, so the support for the old protocol could
     * be removed.
     *
     * We now ensures that LQH keeps track of which LCP_FRAG_ORD it has
     * received. So this means that we can be a bit more sloppy in master
     * take over. We need not worry if we resend LCP_FRAG_ORD since LQH will
     * simply drop it.
     *
     * So when we are done with the master take over we will simply start from
     * scratch from the first table and fragment. We have sufficient
     * information locally in the new master to skip resending all fragment
     * replicas where we already received LCP_FRAG_REP. For those where we sent
     * LCP_FRAG_ORD but not received LCP_FRAG_REP we simply send it again. If
     * it was sent before then LQH will discover it and drop it.
     *
     * We also don't need to worry about sending too many LCP_FRAG_ORDs to LQH
     * since we can send it for all fragment replicas given that we use the
     * fragment record as the queueing record. So in practice the queue is
     * always large enough.
     */
    c_lcpMasterTakeOverState.minTableId = 0;
    c_lcpMasterTakeOverState.minFragId = 0;
    c_lcpMasterTakeOverState.failedNodeId = nodeId;
    c_lcpMasterTakeOverState.set(LMTOS_WAIT_LCP_FRAG_REP, __LINE__);
    setLocalNodefailHandling(signal, nodeId, NF_LCP_TAKE_OVER);
    checkEmptyLcpComplete(signal);
    return;
  }
}
11055 
/**
 * Start GCP master take-over: as the newly elected master, query all nodes
 * (MASTER_GCPREQ) about their GCP protocol state so we can resume the
 * global checkpoint protocols where the old master left off.
 *
 * @param signal      signal object
 * @param oldMasterId node id of the failed old master
 */
void Dbdih::startGcpMasterTakeOver(Signal* signal, Uint32 oldMasterId){
  jam();
  /*--------------------------------------------------*/
  /*                                                  */
  /*       THE MASTER HAS FAILED AND WE WERE ELECTED  */
  /*       TO BE THE NEW MASTER NODE. WE NEED TO QUERY*/
  /*       ALL THE OTHER NODES ABOUT THEIR STATUS IN  */
  /*       ORDER TO BE ABLE TO TAKE OVER CONTROL OF   */
  /*       THE GLOBAL CHECKPOINT PROTOCOL AND THE     */
  /*       LOCAL CHECKPOINT PROTOCOL.                 */
  /*--------------------------------------------------*/
  if(!isMaster()){
    jam();
    return;
  }
  cmasterState = MASTER_TAKE_OVER_GCP;
  cmasterTakeOverNode = oldMasterId;
  // Broadcast MASTER_GCPREQ to all participating nodes (including self).
  MasterGCPReq * const req = (MasterGCPReq *)&signal->theData[0];
  req->masterRef = reference();
  req->failedNodeId = oldMasterId;
  sendLoopMacro(MASTER_GCPREQ, sendMASTER_GCPREQ, RNIL);

  // Report the take-over start to the event logger.
  signal->theData[0] = NDB_LE_GCP_TakeoverStarted;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 1, JBB);

  /**
   * save own value...
   *   to be able to check values returned in MASTER_GCPCONF
   */
  m_gcp_save.m_master.m_new_gci = m_gcp_save.m_gci;

  setLocalNodefailHandling(signal, oldMasterId, NF_GCP_TAKE_OVER);
}
11089 
/**
 * Start removal of a failed node's replicas from the table records.
 *
 * Only nodes that were alive at the moment of failure (DIED_NOW) are
 * processed; a node that was already dead has no replicas to remove.
 * The actual work is done asynchronously via CONTINUEB
 * (ZREMOVE_NODE_FROM_TABLE), one table at a time.
 *
 * @param signal        signal object
 * @param failedNodePtr node record of the failed node
 */
void Dbdih::startRemoveFailedNode(Signal* signal, NodeRecordPtr failedNodePtr)
{
  Uint32 nodeId = failedNodePtr.i;
  if(failedNodePtr.p->nodeStatus != NodeRecord::DIED_NOW){
    jam();
    /**
     * Node wasn't alive at the failure. It can't be part of an LCP.
     */
    ndbrequire(!c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId));

    /**
     * And there is no point in removing any replicas
     *   It's dead...
     */
    return;
  }

  /**
   * If node had not completed the ongoing LCP,
   *   we need to remove it as undo might not be complete
   *   bug#31257
   */
  failedNodePtr.p->m_remove_node_from_table_lcp_id = RNIL;
  if (c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(failedNodePtr.i))
  {
    jam();
    failedNodePtr.p->m_remove_node_from_table_lcp_id = SYSFILE->latestLCP_ID;
  }

  jam();

  // Kick off the table scan unless an error insert suppresses or delays it.
  if (!ERROR_INSERTED(7194) && !ERROR_INSERTED(7221))
  {
    signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
    signal->theData[1] = failedNodePtr.i;
    signal->theData[2] = 0; // Tab id
    if (!ERROR_INSERTED(7233))
      sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
    else
      sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 300, 3);
  }
  else
  {
    if (ERROR_INSERTED(7194))
    {
      ndbout_c("7194 Not starting ZREMOVE_NODE_FROM_TABLE");
    }
    else if (ERROR_INSERTED(7221))
    {
      ndbout_c("7221 Not starting ZREMOVE_NODE_FROM_TABLE");
    }
  }

  setLocalNodefailHandling(signal, failedNodePtr.i, NF_REMOVE_NODE_FROM_TABLE);
}//Dbdih::startRemoveFailedNode()
11145 
handle_master_take_over_copy_gci(Signal * signal,NodeId new_master_node_id)11146 bool Dbdih::handle_master_take_over_copy_gci(Signal *signal, NodeId new_master_node_id)
11147 {
11148   if (c_copyGCISlave.m_expectedNextWord != 0)
11149   {
11150     jam();
11151     c_copyGCISlave.m_expectedNextWord = 0;
11152     c_copyGCISlave.m_copyReason = CopyGCIReq::IDLE;
11153   }
11154 
11155   if (c_copyGCISlave.m_copyReason != CopyGCIReq::IDLE)
11156   {
11157     /**
11158      * Before we allow the new master to start up the new GCP protocols
11159      * we need to ensure that the activity started by the previous
11160      * failed master is completed before we process the master takeover.
11161      * By enforcing this in MASTER_GCPREQ and MASTER_LCPREQ we are
11162      * certain that the master takeover is ready to start up the new
11163      * COPY_GCIREQ protocols.
11164      */
11165     sendSignalWithDelay(reference(), GSN_MASTER_GCPREQ,
11166                         signal, 10, MasterGCPReq::SignalLength);
11167     return true;
11168   }
11169   c_handled_master_take_over_copy_gci = new_master_node_id;
11170   return false;
11171 }
11172 
11173 /*--------------------------------------------------*/
11174 /*       THE MASTER HAS FAILED AND THE NEW MASTER IS*/
11175 /*       QUERYING THIS NODE ABOUT THE STATE OF THE  */
11176 /*       GLOBAL CHECKPOINT PROTOCOL                 */
11177 /*--------------------------------------------------*/
/**
 * Handle MASTER_GCPREQ from the new master: report our local GCP protocol
 * state (micro-GCP state and GCP-save state) back in MASTER_GCPCONF,
 * together with Sysfile data (latest LCP id, restorable/keep GCIs and the
 * lcpActive bitmask).
 *
 * The signal is delayed and retried while the failed node is still marked
 * ALIVE locally, or while a COPY_GCIREQ from the failed master is still
 * in progress. It is dropped if the new master itself has died meanwhile.
 */
void Dbdih::execMASTER_GCPREQ(Signal* signal)
{
  NodeRecordPtr failedNodePtr;
  NodeRecordPtr newMasterNodePtr;
  MasterGCPReq * const masterGCPReq = (MasterGCPReq *)&signal->theData[0];
  jamEntry();
  const BlockReference newMasterBlockref = masterGCPReq->masterRef;
  const Uint32 failedNodeId = masterGCPReq->failedNodeId;

  failedNodePtr.i = failedNodeId;
  ptrCheckGuard(failedNodePtr, MAX_NDB_NODES, nodeRecord);
  newMasterNodePtr.i = refToNode(newMasterBlockref);
  ptrCheckGuard(newMasterNodePtr, MAX_NDB_NODES, nodeRecord);

  if (newMasterNodePtr.p->nodeStatus != NodeRecord::ALIVE)
  {
    /**
     * We delayed the MASTER_GCPREQ signal and now it arrived after
     * the new master already died. We ignore this signal.
     */
#ifdef VM_TRACE
    g_eventLogger->info("Dropped MASTER_GCPREQ from node %u",
                        newMasterNodePtr.i);
#endif
    jam();
    return;
  }

  if (failedNodePtr.p->nodeStatus == NodeRecord::ALIVE) {
    jam();
    /*--------------------------------------------------*/
    /*       ENSURE THAT WE HAVE PROCESSED THE SIGNAL   */
    /*       NODE_FAILURE BEFORE WE PROCESS THIS REQUEST*/
    /*       FROM THE NEW MASTER. THIS ENSURES THAT WE  */
    /*       HAVE REMOVED THE FAILED NODE FROM THE LIST */
    /*       OF ACTIVE NODES AND SO FORTH.              */
    /*--------------------------------------------------*/
    sendSignalWithDelay(reference(), GSN_MASTER_GCPREQ,
                        signal, 10, MasterGCPReq::SignalLength);
    return;
  } else {
    ndbrequire(failedNodePtr.p->nodeStatus == NodeRecord::DYING);
  }//if

  // Wait until any COPY_GCIREQ from the failed master has finished;
  // returns true when the signal was re-queued with delay.
  if (handle_master_take_over_copy_gci(signal, newMasterNodePtr.i))
  {
    return;
  }
#ifdef VM_TRACE
  g_eventLogger->info("Handle MASTER_GCPREQ from node %u",
                      newMasterNodePtr.i);
#endif
  if (ERROR_INSERTED(7181))
  {
    // Error insert: simulate a GCP_TCFINISHED arriving during take-over.
    ndbout_c("execGCP_TCFINISHED in MASTER_GCPREQ");
    CLEAR_ERROR_INSERT_VALUE;
    signal->theData[0] = c_error_7181_ref;
    signal->theData[1] = (Uint32)(m_micro_gcp.m_old_gci >> 32);
    signal->theData[2] = (Uint32)(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
    signal->theData[3] = cfailurenr;
    execGCP_TCFINISHED(signal);
  }

  // Translate local micro-GCP state into the state reported to the master.
  MasterGCPConf::State gcpState;
  switch(m_micro_gcp.m_state){
  case MicroGcp::M_GCP_IDLE:
    jam();
    gcpState = MasterGCPConf::GCP_READY;
    break;
  case MicroGcp::M_GCP_PREPARE:
    jam();
    gcpState = MasterGCPConf::GCP_PREPARE_RECEIVED;
    break;
  case MicroGcp::M_GCP_COMMIT:
    jam();
    gcpState = MasterGCPConf::GCP_COMMIT_RECEIVED;
    break;
  case MicroGcp::M_GCP_COMMITTED:
    jam();
    gcpState = MasterGCPConf::GCP_COMMITTED;

    /**
     * Change state to GCP_COMMIT_RECEIVED and rerun GSN_GCP_NOMORETRANS
     */
    gcpState = MasterGCPConf::GCP_COMMIT_RECEIVED;
    m_micro_gcp.m_state = MicroGcp::M_GCP_COMMIT;

    {
      GCPNoMoreTrans* req2 = (GCPNoMoreTrans*)signal->getDataPtrSend();
      req2->senderRef = reference();
      req2->senderData = m_micro_gcp.m_master_ref;
      req2->gci_hi = (Uint32)(m_micro_gcp.m_old_gci >> 32);
      req2->gci_lo = (Uint32)(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
      sendSignal(clocaltcblockref, GSN_GCP_NOMORETRANS, signal,
                 GCPNoMoreTrans::SignalLength, JBB);
    }
    break;
  case MicroGcp::M_GCP_COMPLETE:
    /**
     * This is a master only state...
     */
    gcpState = MasterGCPConf::GCP_READY; //Compiler keep quiet
    ndbabort();
  }

  // Translate local GCP-save state into the state reported to the master.
  MasterGCPConf::SaveState saveState;
  switch(m_gcp_save.m_state){
  case GcpSave::GCP_SAVE_IDLE:
    jam();
    saveState = MasterGCPConf::GCP_SAVE_IDLE;
    break;
  case GcpSave::GCP_SAVE_REQ:
    jam();
    saveState = MasterGCPConf::GCP_SAVE_REQ;
    break;
  case GcpSave::GCP_SAVE_CONF:
    jam();
    saveState = MasterGCPConf::GCP_SAVE_CONF;
    break;
  case GcpSave::GCP_SAVE_COPY_GCI:
    jam();
    saveState = MasterGCPConf::GCP_SAVE_COPY_GCI;
    break;
  }

  // Fill in MASTER_GCPCONF with our protocol state and Sysfile data.
  MasterGCPConf * const masterGCPConf = (MasterGCPConf *)&signal->theData[0];
  masterGCPConf->gcpState  = gcpState;
  masterGCPConf->senderNodeId = cownNodeId;
  masterGCPConf->failedNodeId = failedNodeId;
  masterGCPConf->newGCP_hi = (Uint32)(m_micro_gcp.m_new_gci >> 32);
  masterGCPConf->latestLCP = SYSFILE->latestLCP_ID;
  masterGCPConf->oldestRestorableGCI = SYSFILE->oldestRestorableGCI;
  masterGCPConf->keepGCI = SYSFILE->keepGCI;
  masterGCPConf->newGCP_lo = Uint32(m_micro_gcp.m_new_gci);
  masterGCPConf->saveState = saveState;
  masterGCPConf->saveGCI = m_gcp_save.m_gci;
  Uint32 packed_length =
  NdbNodeBitmask::getPackedLengthInWords(SYSFILE->lcpActive);

  if (ERROR_INSERTED(7225))
  {
    // Error insert: delay our MASTER_GCPCONF (only valid when we are
    // answering ourselves, i.e. we are the new master).
    CLEAR_ERROR_INSERT_VALUE;
    ndbrequire(refToNode(newMasterBlockref) == getOwnNodeId());
    LinearSectionPtr lsptr[3];
    lsptr[0].p = masterGCPConf->lcpActive_v1;
    lsptr[0].sz = packed_length;
    SectionHandle handle(this);
    import(handle.m_ptr[0], lsptr[0].p, lsptr[0].sz);

    sendSignalWithDelay(newMasterBlockref, GSN_MASTER_GCPCONF, signal,
                        500, MasterGCPConf::SignalLength, &handle);
  }
  else
  {
    // Newer receivers get the lcpActive bitmask in a signal section;
    // older ones get the 48-node inline version.
    Uint32 node_version = getNodeInfo(refToNode(newMasterBlockref)).m_version;
    if (ndbd_send_node_bitmask_in_section(node_version))
    {
      Uint32 lcpActiveCopy[NdbNodeBitmask::Size];
      NdbNodeBitmask::assign(lcpActiveCopy, SYSFILE->lcpActive);
      LinearSectionPtr lsptr[3];
      lsptr[0].p = lcpActiveCopy;
      lsptr[0].sz = packed_length;
      sendSignal(newMasterBlockref, GSN_MASTER_GCPCONF, signal,
                 MasterGCPConf::SignalLength, JBB, lsptr, 1);
    }
    else if (packed_length <= NdbNodeBitmask48::Size)
    {
      for(Uint32 i = 0; i < NdbNodeBitmask48::Size; i++)
        masterGCPConf->lcpActive_v1[i] = SYSFILE->lcpActive[i];

      sendSignal(newMasterBlockref, GSN_MASTER_GCPCONF, signal,
                 MasterGCPConf::SignalLength, JBB);
    }
    else
    {
      // Bitmask too wide for the inline format and receiver cannot take
      // sections: unsupported configuration.
      ndbabort();
    }
  }

  if (ERROR_INSERTED(7182))
  {
    // Error insert: simulate GCP_TCFINISHED arriving after the reply.
    ndbout_c("execGCP_TCFINISHED in MASTER_GCPREQ");
    CLEAR_ERROR_INSERT_VALUE;
    signal->theData[0] = c_error_7181_ref;
    signal->theData[1] = (Uint32)(m_micro_gcp.m_old_gci >> 32);
    signal->theData[2] = (Uint32)(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
    signal->theData[3] = cfailurenr;
    execGCP_TCFINISHED(signal);
  }
}//Dbdih::execMASTER_GCPREQ()
11368 
execMASTER_GCPCONF(Signal * signal)11369 void Dbdih::execMASTER_GCPCONF(Signal* signal)
11370 {
11371   NodeRecordPtr senderNodePtr;
11372   MasterGCPConf * const masterGCPConf = (MasterGCPConf *)&signal->theData[0];
11373   jamEntry();
11374 
11375   Uint32 senderRef = signal->getSendersBlockRef();
11376   Uint32 senderVersion = getNodeInfo(refToNode(senderRef)).m_version;
11377   Uint32* temp_lcpActive = &signal->theData[MasterGCPConf::SignalLength];
11378 
11379   if (signal->getNoOfSections() >= 1)
11380   {
11381     ndbrequire(ndbd_send_node_bitmask_in_section(senderVersion));
11382     SectionHandle handle(this, signal);
11383     SegmentedSectionPtr ptr;
11384     handle.getSection(ptr, 0);
11385 
11386     ndbrequire(ptr.sz <= NdbNodeBitmask::Size);
11387     memset(temp_lcpActive,
11388            0,
11389            NdbNodeBitmask::Size * sizeof(Uint32));
11390     copy(temp_lcpActive, ptr);
11391     releaseSections(handle);
11392   }
11393   else
11394   {
11395     memset(temp_lcpActive + NdbNodeBitmask48::Size,
11396            0,
11397            _NDB_NBM_DIFF_BYTES);
11398   }
11399 
11400   senderNodePtr.i = masterGCPConf->senderNodeId;
11401   ptrCheckGuard(senderNodePtr, MAX_NDB_NODES, nodeRecord);
11402 
11403 #ifdef VM_TRACE
11404   g_eventLogger->info("MASTER_GCPCONF from node %u", senderNodePtr.i);
11405 #endif
11406 
11407   MasterGCPConf::State gcpState = (MasterGCPConf::State)masterGCPConf->gcpState;
11408   MasterGCPConf::SaveState saveState =
11409     (MasterGCPConf::SaveState)masterGCPConf->saveState;
11410   const Uint32 failedNodeId = masterGCPConf->failedNodeId;
11411   const Uint32 newGcp_hi = masterGCPConf->newGCP_hi;
11412   const Uint32 newGcp_lo = masterGCPConf->newGCP_lo;
11413   Uint64 newGCI = newGcp_lo | (Uint64(newGcp_hi) << 32);
11414   const Uint32 latestLcpId = masterGCPConf->latestLCP;
11415   const Uint32 oldestRestorableGci = masterGCPConf->oldestRestorableGCI;
11416   const Uint32 oldestKeepGci = masterGCPConf->keepGCI;
11417   const Uint32 saveGCI = masterGCPConf->saveGCI;
11418 
11419   if (latestLcpId > SYSFILE->latestLCP_ID) {
11420     jam();
11421 #if 0
11422     g_eventLogger->info("Dbdih: Setting SYSFILE->latestLCP_ID to %d",
11423                         latestLcpId);
11424     SYSFILE->latestLCP_ID = latestLcpId;
11425 #endif
11426     SYSFILE->keepGCI = oldestKeepGci;
11427 
11428     DEB_LCP(("Master takeover: Set SYSFILE->keepGCI = %u", SYSFILE->keepGCI));
11429 
11430     SYSFILE->oldestRestorableGCI = oldestRestorableGci;
11431     if (signal->getNoOfSections() >= 1)
11432     {
11433       for (Uint32 i = 0; i < NdbNodeBitmask::Size; i++)
11434         SYSFILE->lcpActive[i] = temp_lcpActive[i];
11435     }
11436     else
11437     {
11438       memset(SYSFILE->lcpActive, 0, sizeof(SYSFILE->lcpActive));
11439       for (Uint32 i = 0; i < NdbNodeBitmask48::Size; i++)
11440         SYSFILE->lcpActive[i] = masterGCPConf->lcpActive_v1[i];
11441     }
11442   }//if
11443 
11444   bool ok = false;
11445   switch (gcpState) {
11446   case MasterGCPConf::GCP_READY:
11447     jam();
11448     ok = true;
11449     // Either not started or complete...
11450     break;
11451   case MasterGCPConf::GCP_PREPARE_RECEIVED:
11452     jam();
11453     ok = true;
11454     if (m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_IDLE)
11455     {
11456       jam();
11457       m_micro_gcp.m_master.m_state = MicroGcp::M_GCP_PREPARE;
11458       m_micro_gcp.m_master.m_new_gci = newGCI;
11459     }
11460     else
11461     {
11462       jam();
11463       ndbrequire(m_micro_gcp.m_master.m_new_gci == newGCI);
11464     }
11465     break;
11466   case MasterGCPConf::GCP_COMMIT_RECEIVED:
11467     jam();
11468     // Fall through
11469   case MasterGCPConf::GCP_COMMITTED:
11470     jam();
11471     ok = true;
11472     if (m_micro_gcp.m_master.m_state != MicroGcp::M_GCP_IDLE)
11473     {
11474       ndbrequire(m_micro_gcp.m_master.m_new_gci == newGCI);
11475     }
11476     m_micro_gcp.m_master.m_new_gci = newGCI;
11477     m_micro_gcp.m_master.m_state = MicroGcp::M_GCP_COMMIT;
11478     break;
11479 #ifndef VM_TRACE
11480   default:
11481     jamLine(gcpState);
11482     ndbabort();
11483 #endif
11484   }
11485   ndbassert(ok); // Unhandled case...
11486 
11487   ok = false;
11488   /**
11489    * GCI should differ with atmost one
11490    */
11491   ndbrequire(saveGCI == m_gcp_save.m_gci ||
11492              saveGCI == m_gcp_save.m_gci + 1 ||
11493              saveGCI + 1 == m_gcp_save.m_gci);
11494   if (saveGCI > m_gcp_save.m_master.m_new_gci)
11495   {
11496     jam();
11497     m_gcp_save.m_master.m_new_gci = saveGCI;
11498   }
11499   switch(saveState){
11500   case MasterGCPConf::GCP_SAVE_IDLE:
11501     jam();
11502     break;
11503   case MasterGCPConf::GCP_SAVE_REQ:
11504     jam();
11505     if (m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_IDLE)
11506     {
11507       jam();
11508       m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_REQ;
11509     }
11510     break;
11511   case MasterGCPConf::GCP_SAVE_CONF:
11512     jam();
11513     if (m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_IDLE)
11514     {
11515       jam();
11516       m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_REQ;
11517     }
11518     break;
11519   case MasterGCPConf::GCP_SAVE_COPY_GCI:
11520     jam();
11521     if (m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_IDLE)
11522     {
11523       jam();
11524       m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_COPY_GCI;
11525     }
11526     break;
11527 #ifndef VM_TRACE
11528   default:
11529     jamLine(saveState);
11530     ndbabort();
11531 #endif
11532   }
11533   //ndbassert(ok); // Unhandled case
11534 
11535   receiveLoopMacro(MASTER_GCPREQ, senderNodePtr.i);
11536   /*-------------------------------------------------------------------------*/
11537   // We have now received all responses and are ready to take over the GCP
11538   // protocol as master.
11539   /*-------------------------------------------------------------------------*/
11540   MASTER_GCPhandling(signal, failedNodeId);
11541 
11542   return;
11543 }//Dbdih::execMASTER_GCPCONF()
11544 
execMASTER_GCPREF(Signal * signal)11545 void Dbdih::execMASTER_GCPREF(Signal* signal)
11546 {
11547   const MasterGCPRef * const ref = (MasterGCPRef *)&signal->theData[0];
11548   jamEntry();
11549   receiveLoopMacro(MASTER_GCPREQ, ref->senderNodeId);
11550   /*-------------------------------------------------------------------------*/
11551   // We have now received all responses and are ready to take over the GCP
11552   // protocol as master.
11553   /*-------------------------------------------------------------------------*/
11554   MASTER_GCPhandling(signal, ref->failedNodeId);
11555 }//Dbdih::execMASTER_GCPREF()
11556 
MASTER_GCPhandling(Signal * signal,Uint32 failedNodeId)11557 void Dbdih::MASTER_GCPhandling(Signal* signal, Uint32 failedNodeId)
11558 {
11559   cmasterState = MASTER_ACTIVE;
11560 
11561   NdbTick_Invalidate(&m_micro_gcp.m_master.m_start_time);
11562   NdbTick_Invalidate(&m_gcp_save.m_master.m_start_time);
11563 
11564   bool ok = false;
11565   switch(m_micro_gcp.m_master.m_state){
11566   case MicroGcp::M_GCP_IDLE:
11567     jam();
11568     ok = true;
11569     signal->theData[0] = DihContinueB::ZSTART_GCP;
11570     sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
11571     break;
11572   case MicroGcp::M_GCP_PREPARE:
11573   {
11574     jam();
11575     ok = true;
11576 
11577     /**
11578      * Restart GCP_PREPARE
11579      */
11580     sendLoopMacro(GCP_PREPARE, sendGCP_PREPARE, RNIL);
11581     break;
11582   }
11583   case MicroGcp::M_GCP_COMMIT:
11584   {
11585     jam();
11586     ok = true;
11587 
11588     /**
11589      * Restart GCP_COMMIT
11590      */
11591     sendLoopMacro(GCP_COMMIT, sendGCP_COMMIT, RNIL);
11592     break;
11593   }
11594   case MicroGcp::M_GCP_COMMITTED:
11595     jam();
11596     ndbabort();
11597   case MicroGcp::M_GCP_COMPLETE:
11598     jam();
11599     ndbabort();
11600 #ifndef VM_TRACE
11601   default:
11602     jamLine(m_micro_gcp.m_master.m_state);
11603     ndbabort();
11604 #endif
11605   }
11606   ndbassert(ok);
11607 
11608   if (m_micro_gcp.m_enabled == false)
11609   {
11610     jam();
11611     m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_IDLE;
11612   }
11613   else
11614   {
11615     ok = false;
11616     switch(m_gcp_save.m_master.m_state){
11617     case GcpSave::GCP_SAVE_IDLE:
11618       jam();
11619       ok = true;
11620       break;
11621     case GcpSave::GCP_SAVE_REQ:
11622     {
11623       jam();
11624       ok = true;
11625 
11626       /**
11627        * Restart GCP_SAVE_REQ
11628        */
11629       sendLoopMacro(GCP_SAVEREQ, sendGCP_SAVEREQ, RNIL);
11630       break;
11631     }
11632     case GcpSave::GCP_SAVE_CONF:
11633       jam();
11634       // Fall through
11635     case GcpSave::GCP_SAVE_COPY_GCI:
11636       jam();
11637       ok = true;
11638       copyGciLab(signal, CopyGCIReq::GLOBAL_CHECKPOINT);
11639       m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_COPY_GCI;
11640       break;
11641 #ifndef VM_TRACE
11642     default:
11643       jamLine(m_gcp_save.m_master.m_state);
11644       ndbabort();
11645 #endif
11646     }
11647     ndbrequire(ok);
11648   }
11649 
11650   signal->theData[0] = NDB_LE_GCP_TakeoverCompleted;
11651   signal->theData[1] = m_micro_gcp.m_master.m_state;
11652   signal->theData[2] = m_gcp_save.m_master.m_state;
11653   sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 1, JBB);
11654 
11655   infoEvent("kk: %u/%u %u %u",
11656             Uint32(m_micro_gcp.m_current_gci >> 32),
11657             Uint32(m_micro_gcp.m_current_gci),
11658             m_micro_gcp.m_master.m_state,
11659             m_gcp_save.m_master.m_state);
11660 
11661   /*--------------------------------------------------*/
11662   /*       WE SEPARATE HANDLING OF GLOBAL CHECKPOINTS */
11663   /*       AND LOCAL CHECKPOINTS HERE. LCP'S HAVE TO  */
11664   /*       REMOVE ALL FAILED FRAGMENTS BEFORE WE CAN  */
11665   /*       HANDLE THE LCP PROTOCOL.                   */
11666   /*--------------------------------------------------*/
11667   checkLocalNodefailComplete(signal, failedNodeId, NF_GCP_TAKE_OVER);
11668 
11669   startGcpMonitor(signal);
11670 
11671   return;
11672 }//Dbdih::masterGcpConfFromFailedLab()
11673 
11674 void
handle_send_continueb_invalidate_node_lcp(Signal * signal)11675 Dbdih::handle_send_continueb_invalidate_node_lcp(Signal *signal)
11676 {
11677   if (ERROR_INSERTED(7204))
11678   {
11679     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 2000, 3);
11680   }
11681   else if (ERROR_INSERTED(7245))
11682   {
11683     if (isMaster())
11684     {
11685       sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 2000, 3);
11686     }
11687     else
11688     {
11689       sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 3000, 3);
11690     }
11691   }
11692   else if (ERROR_INSERTED(7246))
11693   {
11694     /**
11695      * This error injection supports a special test case where we
11696      * delay node 1 and 2 more than other nodes to ensure that we
11697      * get some nodes that reply with START_INFOCONF and some that
11698      * reply with START_INFOREF to get the code tested for the case
11699      * some nodes reply with START_INFOREF and some with START_INFOCONF.
11700      */
11701     if (isMaster())
11702     {
11703       sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 2000, 3);
11704     }
11705     else if (cownNodeId == Uint32(1) ||
11706              (refToNode(cmasterdihref) == Uint32(1) &&
11707               cownNodeId == Uint32(2)))
11708     {
11709       sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 5000, 3);
11710     }
11711     else
11712     {
11713       sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 8000, 3);
11714     }
11715   }
11716   else
11717   {
11718     sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
11719   }
11720 }
11721 
11722 void
invalidateNodeLCP(Signal * signal,Uint32 nodeId,Uint32 tableId)11723 Dbdih::invalidateNodeLCP(Signal* signal, Uint32 nodeId, Uint32 tableId)
11724 {
11725   jamEntry();
11726   TabRecordPtr tabPtr;
11727   tabPtr.i = tableId;
11728   const Uint32 RT_BREAK = 64;
11729   if (ERROR_INSERTED(7125)) {
11730     return;
11731   }//if
11732   for (Uint32 i = 0; i<RT_BREAK; i++) {
11733     jam();
11734     if (tabPtr.i >= ctabFileSize){
11735       jam();
11736       /**
11737        * Ready with entire loop
11738        * Return to master
11739        */
11740       if (ERROR_INSERTED(7204) ||
11741           ERROR_INSERTED(7245) ||
11742           ERROR_INSERTED(7246))
11743       {
11744         CLEAR_ERROR_INSERT_VALUE;
11745       }
11746       setAllowNodeStart(nodeId, true);
11747       g_eventLogger->info("Completed invalidation of node %u", nodeId);
11748       if (getNodeStatus(nodeId) == NodeRecord::STARTING) {
11749         jam();
11750         if (!isMaster())
11751         {
11752           jam();
11753           setNodeRecoveryStatus(nodeId, NodeRecord::NODE_GETTING_PERMIT);
11754         }
11755         StartInfoConf * conf = (StartInfoConf*)&signal->theData[0];
11756         conf->sendingNodeId = cownNodeId;
11757         conf->startingNodeId = nodeId;
11758         sendSignal(cmasterdihref, GSN_START_INFOCONF, signal,
11759                    StartInfoConf::SignalLength, JBB);
11760       }//if
11761       return;
11762     }//if
11763     ptrAss(tabPtr, tabRecord);
11764     if (tabPtr.p->tabStatus == TabRecord::TS_ACTIVE) {
11765       jam();
11766       invalidateNodeLCP(signal, nodeId, tabPtr);
11767       return;
11768     }//if
11769     tabPtr.i++;
11770   }//for
11771   signal->theData[0] = DihContinueB::ZINVALIDATE_NODE_LCP;
11772   signal->theData[1] = nodeId;
11773   signal->theData[2] = tabPtr.i;
11774   sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
11775 }//Dbdih::invalidateNodeLCP()
11776 
11777 void
invalidateNodeLCP(Signal * signal,Uint32 nodeId,TabRecordPtr tabPtr)11778 Dbdih::invalidateNodeLCP(Signal* signal, Uint32 nodeId, TabRecordPtr tabPtr)
11779 {
11780   /**
11781    * Check so that no one else is using the tab descriptior
11782    */
11783   if (tabPtr.p->tabCopyStatus != TabRecord::CS_IDLE) {
11784     jam();
11785     signal->theData[0] = DihContinueB::ZINVALIDATE_NODE_LCP;
11786     signal->theData[1] = nodeId;
11787     signal->theData[2] = tabPtr.i;
11788     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
11789                         WaitTableStateChangeMillis, 3);
11790     return;
11791   }//if
11792 
11793   /**
11794    * For each fragment
11795    */
11796   bool modified = false;
11797   FragmentstorePtr fragPtr;
11798   for(Uint32 fragNo = 0; fragNo < tabPtr.p->totalfragments; fragNo++){
11799     jam();
11800     getFragstore(tabPtr.p, fragNo, fragPtr);
11801     /**
11802      * For each of replica record
11803      */
11804     ReplicaRecordPtr replicaPtr;
11805     for(replicaPtr.i = fragPtr.p->oldStoredReplicas; replicaPtr.i != RNIL;
11806         replicaPtr.i = replicaPtr.p->nextPool) {
11807       jam();
11808       c_replicaRecordPool.getPtr(replicaPtr);
11809       if(replicaPtr.p->procNode == nodeId){
11810         jam();
11811         /**
11812          * Found one with correct node id
11813          */
11814         /**
11815          * Invalidate all LCP's
11816          */
11817         modified = true;
11818         for(int i = 0; i < MAX_LCP_STORED; i++) {
11819           replicaPtr.p->lcpStatus[i] = ZINVALID;
11820         }//if
11821         /**
11822          * And reset nextLcp
11823          */
11824         replicaPtr.p->nextLcp = 0;
11825         replicaPtr.p->noCrashedReplicas = 0;
11826       }//if
11827     }//for
11828   }//for
11829 
11830   if (modified) {
11831     jam();
11832     /**
11833      * Save table description to disk
11834      */
11835     tabPtr.p->tabCopyStatus  = TabRecord::CS_INVALIDATE_NODE_LCP;
11836     tabPtr.p->tabUpdateState = TabRecord::US_INVALIDATE_NODE_LCP;
11837     tabPtr.p->tabRemoveNode  = nodeId;
11838     signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
11839     signal->theData[1] = tabPtr.i;
11840     sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
11841     return;
11842   }
11843 
11844   jam();
11845   /**
11846    * Move to next table
11847    */
11848   tabPtr.i++;
11849   signal->theData[0] = DihContinueB::ZINVALIDATE_NODE_LCP;
11850   signal->theData[1] = nodeId;
11851   signal->theData[2] = tabPtr.i;
11852 
11853   handle_send_continueb_invalidate_node_lcp(signal);
11854 
11855   return;
11856 }//Dbdih::invalidateNodeLCP()
11857 
/*------------------------------------------------*/
/*       INPUT:  nodeId  - failed node to remove  */
/*               tableId - first table to scan    */
/*------------------------------------------------*/
removeNodeFromTables(Signal * signal,Uint32 nodeId,Uint32 tableId)11862 void Dbdih::removeNodeFromTables(Signal* signal,
11863 				 Uint32 nodeId, Uint32 tableId)
11864 {
11865   jamEntry();
11866   TabRecordPtr tabPtr;
11867   tabPtr.i = tableId;
11868   const Uint32 RT_BREAK = 64;
11869   for (Uint32 i = 0; i<RT_BREAK; i++) {
11870     jam();
11871     if (tabPtr.i >= ctabFileSize){
11872       jam();
11873       if (ERROR_INSERTED(7233))
11874       {
11875         CLEAR_ERROR_INSERT_VALUE;
11876       }
11877 
11878       removeNodeFromTablesComplete(signal, nodeId);
11879       return;
11880     }//if
11881 
11882     ptrAss(tabPtr, tabRecord);
11883     if (tabPtr.p->tabStatus == TabRecord::TS_ACTIVE) {
11884       jam();
11885       removeNodeFromTable(signal, nodeId, tabPtr);
11886       return;
11887     }//if
11888     tabPtr.i++;
11889   }//for
11890   signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
11891   signal->theData[1] = nodeId;
11892   signal->theData[2] = tabPtr.i;
11893   if (!ERROR_INSERTED(7233))
11894     sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
11895   else
11896     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 300, 3);
11897 }
11898 
/**
 * Remove all stored replicas located on the failed node from one table.
 *
 * Walks every fragment of tabPtr, removes nodeId's stored replicas
 * (removeNodeFromStored) and keeps the LCP bookkeeping consistent
 * (noLcpReplicas, tabActiveLcpFragments, per-replica lcpStatus).
 * Afterwards the updated table description is written to disk via
 * ZPACK_TABLE_INTO_PAGES before handling continues.  If no replica was
 * found, the scan simply continues with the next table.
 *
 * @param signal  signal object reused for outgoing CONTINUEB signals
 * @param nodeId  id of the failed node whose replicas are removed
 * @param tabPtr  table to process (tabStatus is TS_ACTIVE, see caller)
 */
void Dbdih::removeNodeFromTable(Signal* signal,
                                Uint32 nodeId, TabRecordPtr tabPtr){

  /**
   * Check so that no one else is using the tab descriptior
   */
  if (tabPtr.p->tabCopyStatus != TabRecord::CS_IDLE) {
    jam();
    // Busy: retry the same table after a short delay.
    signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
    signal->theData[1] = nodeId;
    signal->theData[2] = tabPtr.i;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                        WaitTableStateChangeMillis, 3);
    return;
  }//if

  NodeRecordPtr nodePtr;
  nodePtr.i = nodeId;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
  // LCP id whose checkpoint data must be invalidated for this node
  // (RNIL means no LCP invalidation is requested).
  const Uint32 lcpId = nodePtr.p->m_remove_node_from_table_lcp_id;

  /**
   * For each fragment
   */
  Uint32 noOfRemovedReplicas = 0;     // No of replicas removed
  Uint32 noOfRemovedLcpReplicas = 0;  // No of replicas in LCP removed
  Uint32 noOfRemainingLcpReplicas = 0;// No of replicas in LCP remaining

  const bool lcpOngoingFlag = (tabPtr.p->tabLcpStatus== TabRecord::TLS_ACTIVE);
  // Non-logged (temporary) tables get special treatment in removeNodeFromStored.
  const bool unlogged = (tabPtr.p->tabStorage != TabRecord::ST_NORMAL);

  FragmentstorePtr fragPtr;
  for(Uint32 fragNo = 0; fragNo < tabPtr.p->totalfragments; fragNo++){
    jam();
    getFragstore(tabPtr.p, fragNo, fragPtr);

    /**
     * For each of replica record
     */
    bool found = false;
    ReplicaRecordPtr replicaPtr;
    for(replicaPtr.i = fragPtr.p->storedReplicas; replicaPtr.i != RNIL;
        replicaPtr.i = replicaPtr.p->nextPool) {
      jam();

      c_replicaRecordPool.getPtr(replicaPtr);
      if(replicaPtr.p->procNode == nodeId){
        jam();
        found = true;
        noOfRemovedReplicas++;
        removeNodeFromStored(nodeId, fragPtr, replicaPtr, unlogged);
        if(replicaPtr.p->lcpOngoingFlag)
        {
          jam();
          /**
           * This replica is currently LCP:ed
           */
          ndbrequire(fragPtr.p->noLcpReplicas > 0);
          fragPtr.p->noLcpReplicas--;

          noOfRemovedLcpReplicas ++;
          replicaPtr.p->lcpOngoingFlag = false;
          // Last LCP replica of this fragment gone -> one fewer
          // fragment still active in the LCP for this table.
          if (fragPtr.p->noLcpReplicas == 0)
          {
            ndbrequire(tabPtr.p->tabActiveLcpFragments > 0);
            tabPtr.p->tabActiveLcpFragments--;
          }
        }

        if (lcpId != RNIL)
        {
          jam();
          // Invalidate the most recently started LCP on this replica
          // if it matches the requested LCP id.
          Uint32 lcpNo = prevLcpNo(replicaPtr.p->nextLcp);
          if (replicaPtr.p->lcpStatus[lcpNo] == ZVALID &&
              replicaPtr.p->lcpId[lcpNo] == lcpId)
          {
            jam();
            replicaPtr.p->lcpStatus[lcpNo] = ZINVALID;
            replicaPtr.p->lcpId[lcpNo] = 0;
            replicaPtr.p->nextLcp = lcpNo;
            // NOTE(review): this message prints SYSFILE->latestLCP_ID
            // although the id actually invalidated is 'lcpId' —
            // presumably identical here; confirm against callers.
            g_eventLogger->debug("REMOVING lcp: %u from table: %u frag:"
                                 " %u node: %u",
                                 SYSFILE->latestLCP_ID,
                                 tabPtr.i,
                                 fragNo,
                                 nodeId);
          }
        }
      }
    }

    /**
     * Run updateNodeInfo to remove any dead nodes from list of activeNodes
     *  see bug#15587
     */
    updateNodeInfo(fragPtr);
    noOfRemainingLcpReplicas += fragPtr.p->noLcpReplicas;
  }

  if (noOfRemovedReplicas == 0)
  {
    jam();
    /**
     * The table had no replica on the failed node
     *   continue with next table
     */
    tabPtr.i++;
    signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
    signal->theData[1] = nodeId;
    signal->theData[2] = tabPtr.i;
    if (!ERROR_INSERTED(7233))
      sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
    else
      sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 300, 3);
    return;
  }

  /**
   * We did remove at least one replica
   */
  bool ok = false;
  switch(tabPtr.p->tabLcpStatus){
  case TabRecord::TLS_COMPLETED:
    ok = true;
    jam();
    /**
     * WE WILL WRITE THE TABLE DESCRIPTION TO DISK AT THIS TIME
     * INDEPENDENT OF WHAT THE LOCAL CHECKPOINT NEEDED.
     * THIS IS TO ENSURE THAT THE FAILED NODES ARE ALSO UPDATED ON DISK
     * IN THE DIH DATA STRUCTURES BEFORE WE COMPLETE HANDLING OF THE
     * NODE FAILURE.
     */
    ndbrequire(noOfRemovedLcpReplicas == 0);

    tabPtr.p->tabCopyStatus = TabRecord::CS_REMOVE_NODE;
    tabPtr.p->tabUpdateState = TabRecord::US_REMOVE_NODE;
    tabPtr.p->tabRemoveNode = nodeId;
    signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
    signal->theData[1] = tabPtr.i;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    return;
    break;  // unreachable after return; kept for switch symmetry
  case TabRecord::TLS_ACTIVE:
    ok = true;
    jam();
    /**
     * The table is participating in an LCP currently
     */
    break;
  case TabRecord::TLS_WRITING_TO_FILE:
    ok = true;
    jam();
    /**
     * This should never happen since we in the beginning of this function
     * checks the tabCopyStatus
     */
    ndbrequire(lcpOngoingFlag);
    ndbabort();
  }
  ndbrequire(ok);

  /**
   * The table is participating in an LCP currently
   *   and we removed some replicas that should have been checkpointed
   */
  ndbrequire(tabPtr.p->tabLcpStatus == TabRecord::TLS_ACTIVE);

  // Persist the updated table description to disk.
  tabPtr.p->tabCopyStatus = TabRecord::CS_REMOVE_NODE;
  tabPtr.p->tabUpdateState = TabRecord::US_REMOVE_NODE;
  tabPtr.p->tabRemoveNode = nodeId;
  signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
  signal->theData[1] = tabPtr.i;
  sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);

  if (noOfRemainingLcpReplicas == 0)
  {
    jam();
    /**
     * Check if the removal on the failed node made the LCP complete
     */
    tabPtr.p->tabLcpStatus = TabRecord::TLS_WRITING_TO_FILE;
    ndbrequire(tabPtr.p->tabActiveLcpFragments == 0);
    checkLcpAllTablesDoneInLqh(__LINE__);
  }
}
12084 
12085 void
removeNodeFromTablesComplete(Signal * signal,Uint32 nodeId)12086 Dbdih::removeNodeFromTablesComplete(Signal* signal, Uint32 nodeId)
12087 {
12088   jam();
12089 
12090   /**
12091    * Check if we "accidently" completed a LCP
12092    */
12093   checkLcpCompletedLab(signal);
12094 
12095   /**
12096    * Check if we (DIH) are finished with node fail handling
12097    */
12098   checkLocalNodefailComplete(signal, nodeId, NF_REMOVE_NODE_FROM_TABLE);
12099 }
12100 
12101 void
checkLocalNodefailComplete(Signal * signal,Uint32 failedNodeId,NodefailHandlingStep step)12102 Dbdih::checkLocalNodefailComplete(Signal* signal, Uint32 failedNodeId,
12103 				  NodefailHandlingStep step){
12104   jam();
12105 
12106   NodeRecordPtr nodePtr;
12107   nodePtr.i = failedNodeId;
12108   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
12109 
12110   ndbrequire(nodePtr.p->m_nodefailSteps.get(step));
12111   nodePtr.p->m_nodefailSteps.clear(step);
12112 
12113   if(nodePtr.p->m_nodefailSteps.count() > 0){
12114     jam();
12115     return;
12116   }
12117 
12118   if (ERROR_INSERTED(7030))
12119   {
12120     g_eventLogger->info("Reenable GCP_PREPARE");
12121     CLEAR_ERROR_INSERT_VALUE;
12122   }
12123 
12124   NFCompleteRep * const nf = (NFCompleteRep *)&signal->theData[0];
12125   nf->blockNo = DBDIH;
12126   nf->nodeId = cownNodeId;
12127   nf->failedNodeId = failedNodeId;
12128   nf->from = __LINE__;
12129   sendSignal(reference(), GSN_NF_COMPLETEREP, signal,
12130              NFCompleteRep::SignalLength, JBB);
12131 }
12132 
12133 
12134 void
setLocalNodefailHandling(Signal * signal,Uint32 failedNodeId,NodefailHandlingStep step)12135 Dbdih::setLocalNodefailHandling(Signal* signal, Uint32 failedNodeId,
12136 				NodefailHandlingStep step){
12137   jam();
12138 
12139   NodeRecordPtr nodePtr;
12140   nodePtr.i = failedNodeId;
12141   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
12142 
12143   ndbrequire(!nodePtr.p->m_nodefailSteps.get(step));
12144   nodePtr.p->m_nodefailSteps.set(step);
12145 }
12146 
void Dbdih::startLcpTakeOverLab(Signal* signal, Uint32 failedNodeId)
{
  /*--------------------------------------------------------------------*/
  // Start LCP master take over process. Consists of the following steps.
  // 1) Ensure that all LQH's have reported all fragments they have been
  // told to checkpoint. Can be a fairly long step time-wise.
  // 2) Query all nodes about their LCP status.
  // During the query process we do not want our own state to change.
  // This can change due to delayed reception of LCP_REPORT, completed
  // save of table on disk or reception of DIH_LCPCOMPLETE from other
  // node.
  /*--------------------------------------------------------------------*/
  // NOTE(review): body is intentionally empty — the steps described
  // above appear to be driven elsewhere (cannot confirm from this
  // chunk; see checkEmptyLcpComplete for the actual query logic).
}//Dbdih::startLcpTakeOver()
12160 
12161 void
checkEmptyLcpComplete(Signal * signal)12162 Dbdih::checkEmptyLcpComplete(Signal *signal)
12163 {
12164 
12165   ndbrequire(c_lcpMasterTakeOverState.state == LMTOS_WAIT_LCP_FRAG_REP);
12166 
12167   if(isMaster()){
12168     jam();
12169 
12170     signal->theData[0] = NDB_LE_LCP_TakeoverStarted;
12171     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 1, JBB);
12172 
12173     signal->theData[0] = 7012;
12174     execDUMP_STATE_ORD(signal);
12175 
12176     if (ERROR_INSERTED(7194))
12177     {
12178       ndbout_c("7194 starting ZREMOVE_NODE_FROM_TABLE");
12179       signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
12180       signal->theData[1] = c_lcpMasterTakeOverState.failedNodeId;
12181       signal->theData[2] = 0; // Tab id
12182       sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
12183     }
12184 
12185     c_lcpMasterTakeOverState.set(LMTOS_INITIAL, __LINE__);
12186     m_master_lcp_req_lcp_already_completed = false;
12187     MasterLCPReq * const req = (MasterLCPReq *)&signal->theData[0];
12188     req->masterRef = reference();
12189     req->failedNodeId = c_lcpMasterTakeOverState.failedNodeId;
12190     sendLoopMacro(MASTER_LCPREQ, sendMASTER_LCPREQ, RNIL);
12191 
12192   }
12193   else
12194   {
12195     jam();
12196     sendMASTER_LCPCONF(signal, __LINE__);
12197   }
12198 }
12199 
12200 /*--------------------------------------------------*/
12201 /*       THE MASTER HAS FAILED AND THE NEW MASTER IS*/
12202 /*       QUERYING THIS NODE ABOUT THE STATE OF THE  */
12203 /*       LOCAL CHECKPOINT PROTOCOL.                 */
12204 /*--------------------------------------------------*/
execMASTER_LCPREQ(Signal * signal)12205 void Dbdih::execMASTER_LCPREQ(Signal* signal)
12206 {
12207   NodeRecordPtr newMasterNodePtr;
12208   const MasterLCPReq * const req = (MasterLCPReq *)&signal->theData[0];
12209   jamEntry();
12210   const BlockReference newMasterBlockref = req->masterRef;
12211 
12212   newMasterNodePtr.i = refToNode(newMasterBlockref);
12213   ptrCheckGuard(newMasterNodePtr, MAX_NDB_NODES, nodeRecord);
12214 
12215   if (newMasterNodePtr.p->nodeStatus != NodeRecord::ALIVE)
12216   {
12217     /**
12218      * We delayed the MASTER_LCPREQ signal and now it arrived after
12219      * the new master already died. We ignore this signal.
12220      */
12221     jam();
12222     return;
12223   }
12224 
12225   CRASH_INSERTION(7205);
12226 
12227   if (ERROR_INSERTED(7207))
12228   {
12229     jam();
12230     SET_ERROR_INSERT_VALUE(7208);
12231     sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal,
12232 			500, signal->getLength());
12233     return;
12234   }
12235 
12236   if (ERROR_INSERTED(7208))
12237   {
12238     jam();
12239     signal->theData[0] = 9999;
12240     sendSignal(numberToRef(CMVMI, refToNode(newMasterBlockref)),
12241                GSN_NDB_TAMPER, signal, 1, JBB);
12242   }
12243 
12244   if (ERROR_INSERTED(7231))
12245   {
12246     CLEAR_ERROR_INSERT_VALUE;
12247     sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal,
12248 			1500, signal->getLength());
12249     return;
12250   }
12251 
12252   if (newMasterBlockref != cmasterdihref)
12253   {
12254     /**
12255      * We haven't processed the NODE_FAILREP signal causing the new master
12256      * to be selected as the new master by this node.
12257      */
12258     jam();
12259     ndbout_c("resending GSN_MASTER_LCPREQ");
12260     sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal,
12261 			50, signal->getLength());
12262     return;
12263   }
12264 
12265   if (c_handled_master_take_over_copy_gci != refToNode(newMasterNodePtr.i))
12266   {
12267     /**
12268      * We need to ensure that MASTER_GCPREQ has ensured that the COPY_GCIREQ
12269      * activity started by old master has been completed before we proceed
12270      * with handling the take over of the LCP protocol.
12271      */
12272     jam();
12273     sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal,
12274                         10, signal->getLength());
12275     return;
12276   }
12277   c_handled_master_take_over_copy_gci = 0;
12278 
12279   Uint32 failedNodeId = req->failedNodeId;
12280 
12281   /**
12282    * There can be no take over with the same master
12283    */
12284   ndbrequire(c_lcpState.m_masterLcpDihRef != newMasterBlockref);
12285   c_lcpState.m_masterLcpDihRef = newMasterBlockref;
12286   c_lcpState.m_MASTER_LCPREQ_Received = true;
12287   c_lcpState.m_MASTER_LCPREQ_FailedNodeId = failedNodeId;
12288 
12289   if(newMasterBlockref != cmasterdihref){
12290     jam();
12291     ndbabort();
12292   }
12293 
12294   if (c_lcpState.lcpStatus == LCP_INIT_TABLES)
12295   {
12296     jam();
12297     c_lcpState.m_participatingDIH.clear();
12298     c_lcpState.m_participatingLQH.clear();
12299     c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
12300   }
12301   sendMASTER_LCPCONF(signal, __LINE__);
12302 }//Dbdih::execMASTER_LCPREQ()
12303 
12304 void
sendMASTER_LCPCONF(Signal * signal,Uint32 from)12305 Dbdih::sendMASTER_LCPCONF(Signal * signal, Uint32 from)
12306 {
12307   if (!c_lcpState.m_MASTER_LCPREQ_Received)
12308   {
12309     jam();
12310     /**
12311      * Has not received MASTER_LCPREQ yet
12312      */
12313     return;
12314   }
12315 
12316 #if defined VM_TRACE || defined ERROR_INSERT
12317   bool info = true;
12318 #else
12319   bool info = false;
12320 #endif
12321 
12322   if (ERROR_INSERTED(7230))
12323   {
12324     signal->theData[0] = 9999;
12325     sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 100, 1);
12326     goto err7230;
12327   }
12328 
12329   if (c_lcpState.lcpStatus == LCP_INIT_TABLES)
12330   {
12331     jam();
12332     /**
12333      * Still aborting old initLcpLab
12334      */
12335     if (info)
12336       infoEvent("from: %u : c_lcpState.lcpStatus == LCP_INIT_TABLES", from);
12337     return;
12338   }
12339 
12340 err7230:
12341   if (info)
12342     infoEvent("from: %u : sendMASTER_LCPCONF", from);
12343 
12344   if (c_lcpState.lcpStatus == LCP_COPY_GCI)
12345   {
12346     jam();
12347     /**
12348      * Restart it
12349      */
12350     //Uint32 lcpId = SYSFILE->latestLCP_ID;
12351     SYSFILE->latestLCP_ID--;
12352     Sysfile::clearLCPOngoing(SYSFILE->systemRestartBits);
12353     c_lcpState.m_participatingDIH.clear();
12354     c_lcpState.m_participatingLQH.clear();
12355     c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
12356 #if 0
12357     if(c_copyGCISlave.m_copyReason == CopyGCIReq::LOCAL_CHECKPOINT){
12358       g_eventLogger->info("Dbdih: Also resetting c_copyGCISlave");
12359       c_copyGCISlave.m_copyReason = CopyGCIReq::IDLE;
12360       c_copyGCISlave.m_expectedNextWord = 0;
12361     }
12362 #endif
12363   }
12364 
12365   MasterLCPConf::State lcpState;
12366   switch (c_lcpState.lcpStatus) {
12367   case LCP_STATUS_IDLE:
12368     jam();
12369     /*------------------------------------------------*/
12370     /*       LOCAL CHECKPOINT IS CURRENTLY NOT ACTIVE */
12371     /*       SINCE NO COPY OF RESTART INFORMATION HAVE*/
12372     /*       BEEN RECEIVED YET. ALSO THE PREVIOUS     */
12373     /*       CHECKPOINT HAVE BEEN FULLY COMPLETED.    */
12374     /*------------------------------------------------*/
12375     lcpState = MasterLCPConf::LCP_STATUS_IDLE;
12376     break;
12377   case LCP_STATUS_ACTIVE:
12378     jam();
12379     /*--------------------------------------------------*/
12380     /*       COPY OF RESTART INFORMATION HAS BEEN       */
12381     /*       PERFORMED AND ALSO RESPONSE HAVE BEEN SENT.*/
12382     /*--------------------------------------------------*/
12383     lcpState = MasterLCPConf::LCP_STATUS_ACTIVE;
12384     break;
12385   case LCP_TAB_COMPLETED:
12386     jam();
12387     /*--------------------------------------------------------*/
12388     /*       ALL LCP_REPORT'S HAVE BEEN COMPLETED FOR         */
12389     /*       ALL TABLES.     SAVE OF AT LEAST ONE TABLE IS    */
12390     /*       ONGOING YET.                                     */
12391     /*--------------------------------------------------------*/
12392     lcpState = MasterLCPConf::LCP_TAB_COMPLETED;
12393     break;
12394   case LCP_TAB_SAVED:
12395     jam();
12396     /*--------------------------------------------------------*/
12397     /*       ALL LCP_REPORT'S HAVE BEEN COMPLETED FOR         */
12398     /*       ALL TABLES.     ALL TABLES HAVE ALSO BEEN SAVED  */
12399     /*       ALL OTHER NODES ARE NOT YET FINISHED WITH        */
12400     /*       THE LOCAL CHECKPOINT.                            */
12401     /*--------------------------------------------------------*/
12402     lcpState = MasterLCPConf::LCP_TAB_SAVED;
12403     break;
12404   case LCP_TCGET:
12405   case LCP_CALCULATE_KEEP_GCI:
12406   case LCP_TC_CLOPSIZE:
12407   case LCP_WAIT_MUTEX:
12408   case LCP_START_LCP_ROUND:
12409     /**
12410      * These should only exists on the master
12411      *   but since this is master take over
12412      *   it not allowed
12413      */
12414     ndbabort();
12415     lcpState= MasterLCPConf::LCP_STATUS_IDLE; // remove warning
12416     break;
12417   case LCP_COPY_GCI:
12418   case LCP_INIT_TABLES:
12419     /**
12420      * These two states are handled by if statements above
12421      */
12422     ndbabort();
12423     lcpState= MasterLCPConf::LCP_STATUS_IDLE; // remove warning
12424     break;
12425   default:
12426     ndbabort();
12427     lcpState= MasterLCPConf::LCP_STATUS_IDLE; // remove warning
12428   }//switch
12429 
12430   Uint32 failedNodeId = c_lcpState.m_MASTER_LCPREQ_FailedNodeId;
12431   MasterLCPConf * const conf = (MasterLCPConf *)&signal->theData[0];
12432   conf->senderNodeId = cownNodeId;
12433   conf->lcpState = lcpState;
12434   conf->failedNodeId = failedNodeId;
12435   sendSignal(c_lcpState.m_masterLcpDihRef, GSN_MASTER_LCPCONF,
12436              signal, MasterLCPConf::SignalLength, JBB);
12437 
12438   // Answer to MASTER_LCPREQ sent, reset flag so
12439   // that it's not sent again before another request comes in
12440   c_lcpState.m_MASTER_LCPREQ_Received = false;
12441 
12442   CRASH_INSERTION(7232);
12443 
12444   if (ERROR_INSERTED(7230))
12445   {
12446     return;
12447   }
12448 
12449   if(c_lcpState.lcpStatus == LCP_TAB_SAVED){
12450 #ifdef VM_TRACE
12451     g_eventLogger->info("Sending extra GSN_LCP_COMPLETE_REP to new master");
12452 #endif
12453     sendLCP_COMPLETE_REP(signal);
12454   }
12455 
12456   if(!isMaster())
12457   {
12458     c_lcpMasterTakeOverState.set(LMTOS_IDLE, __LINE__);
12459     checkLocalNodefailComplete(signal, failedNodeId, NF_LCP_TAKE_OVER);
12460   }
12461 
12462   return;
12463 }
12464 
12465 NdbOut&
operator <<(NdbOut & out,const Dbdih::LcpMasterTakeOverState state)12466 operator<<(NdbOut& out, const Dbdih::LcpMasterTakeOverState state){
12467   switch(state){
12468   case Dbdih::LMTOS_IDLE:
12469     out << "LMTOS_IDLE";
12470     break;
12471   case Dbdih::LMTOS_WAIT_LCP_FRAG_REP:
12472     out << "LMTOS_WAIT_LCP_FRAG_REP";
12473     break;
12474   case Dbdih::LMTOS_INITIAL:
12475     out << "LMTOS_INITIAL";
12476     break;
12477   case Dbdih::LMTOS_ALL_IDLE:
12478     out << "LMTOS_ALL_IDLE";
12479     break;
12480   case Dbdih::LMTOS_ALL_ACTIVE:
12481     out << "LMTOS_ALL_ACTIVE";
12482     break;
12483   case Dbdih::LMTOS_LCP_CONCLUDING:
12484     out << "LMTOS_LCP_CONCLUDING";
12485     break;
12486   case Dbdih::LMTOS_COPY_ONGOING:
12487     out << "LMTOS_COPY_ONGOING";
12488     break;
12489   }
12490   return out;
12491 }
12492 
/**
 * One row of the LCP master take-over transition table: while the new
 * master is in CurrentState and a participant reports ParticipantState
 * in its MASTER_LCPCONF, the take-over state advances to NewState.
 * Looked up linearly in execMASTER_LCPCONF().
 */
struct MASTERLCP_StateTransitions {
  Dbdih::LcpMasterTakeOverState CurrentState;    // master's current take-over state
  MasterLCPConf::State ParticipantState;         // state reported by the participant
  Dbdih::LcpMasterTakeOverState NewState;        // resulting take-over state
};
12498 
/**
 * Transition table driving the LCP master take-over state machine.
 * execMASTER_LCPCONF() scans this table for the (current take-over
 * state, participant-reported LCP state) pair; a combination missing
 * from the table is illegal and trips an ndbrequire there.
 */
static const
MASTERLCP_StateTransitions g_masterLCPTakeoverStateTransitions[] = {
  /**
   * Current = LMTOS_INITIAL
   */
  { Dbdih::LMTOS_INITIAL,
    MasterLCPConf::LCP_STATUS_IDLE,
    Dbdih::LMTOS_ALL_IDLE },

  { Dbdih::LMTOS_INITIAL,
    MasterLCPConf::LCP_STATUS_ACTIVE,
    Dbdih::LMTOS_ALL_ACTIVE },

  { Dbdih::LMTOS_INITIAL,
    MasterLCPConf::LCP_TAB_COMPLETED,
    Dbdih::LMTOS_LCP_CONCLUDING },

  { Dbdih::LMTOS_INITIAL,
    MasterLCPConf::LCP_TAB_SAVED,
    Dbdih::LMTOS_LCP_CONCLUDING },

  /**
   * Current = LMTOS_ALL_IDLE
   */
  { Dbdih::LMTOS_ALL_IDLE,
    MasterLCPConf::LCP_STATUS_IDLE,
    Dbdih::LMTOS_ALL_IDLE },

  { Dbdih::LMTOS_ALL_IDLE,
    MasterLCPConf::LCP_STATUS_ACTIVE,
    Dbdih::LMTOS_COPY_ONGOING },

  { Dbdih::LMTOS_ALL_IDLE,
    MasterLCPConf::LCP_TAB_COMPLETED,
    Dbdih::LMTOS_LCP_CONCLUDING },

  { Dbdih::LMTOS_ALL_IDLE,
    MasterLCPConf::LCP_TAB_SAVED,
    Dbdih::LMTOS_LCP_CONCLUDING },

  /**
   * Current = LMTOS_COPY_ONGOING
   */
  { Dbdih::LMTOS_COPY_ONGOING,
    MasterLCPConf::LCP_STATUS_IDLE,
    Dbdih::LMTOS_COPY_ONGOING },

  { Dbdih::LMTOS_COPY_ONGOING,
    MasterLCPConf::LCP_STATUS_ACTIVE,
    Dbdih::LMTOS_COPY_ONGOING },

  /**
   * Current = LMTOS_ALL_ACTIVE
   */
  { Dbdih::LMTOS_ALL_ACTIVE,
    MasterLCPConf::LCP_STATUS_IDLE,
    Dbdih::LMTOS_COPY_ONGOING },

  { Dbdih::LMTOS_ALL_ACTIVE,
    MasterLCPConf::LCP_STATUS_ACTIVE,
    Dbdih::LMTOS_ALL_ACTIVE },

  { Dbdih::LMTOS_ALL_ACTIVE,
    MasterLCPConf::LCP_TAB_COMPLETED,
    Dbdih::LMTOS_LCP_CONCLUDING },

  { Dbdih::LMTOS_ALL_ACTIVE,
    MasterLCPConf::LCP_TAB_SAVED,
    Dbdih::LMTOS_LCP_CONCLUDING },

  /**
   * Current = LMTOS_LCP_CONCLUDING
   * (absorbing state: every participant report keeps us concluding)
   */
  { Dbdih::LMTOS_LCP_CONCLUDING,
    MasterLCPConf::LCP_STATUS_IDLE,
    Dbdih::LMTOS_LCP_CONCLUDING },

  { Dbdih::LMTOS_LCP_CONCLUDING,
    MasterLCPConf::LCP_STATUS_ACTIVE,
    Dbdih::LMTOS_LCP_CONCLUDING },

  { Dbdih::LMTOS_LCP_CONCLUDING,
    MasterLCPConf::LCP_TAB_COMPLETED,
    Dbdih::LMTOS_LCP_CONCLUDING },

  { Dbdih::LMTOS_LCP_CONCLUDING,
    MasterLCPConf::LCP_TAB_SAVED,
    Dbdih::LMTOS_LCP_CONCLUDING }
};
12588 
12589 const Uint32 g_masterLCPTakeoverStateTransitionsRows =
12590 sizeof(g_masterLCPTakeoverStateTransitions) / sizeof(struct MASTERLCP_StateTransitions);
12591 
/**
 * MASTER_LCPCONF: a participant's reply to our MASTER_LCPREQ during LCP
 * master take-over. Records the participant's LCP state, advances the
 * take-over state machine via g_masterLCPTakeoverStateTransitions and,
 * once replies from all participants have arrived, continues with
 * MASTER_LCPhandling().
 */
void Dbdih::execMASTER_LCPCONF(Signal* signal)
{
  const MasterLCPConf * const conf = (MasterLCPConf *)&signal->theData[0];
  jamEntry();

  /* Error insert 7194: delay every MASTER_LCPCONF by 300 ms. */
  if (ERROR_INSERTED(7194))
  {
    ndbout_c("delaying MASTER_LCPCONF due to error 7194");
    sendSignalWithDelay(reference(), GSN_MASTER_LCPCONF, signal,
                        300, signal->getLength());
    return;
  }

  /* Error insert 7230: delay MASTER_LCPCONF from remote nodes by 300 ms. */
  if (ERROR_INSERTED(7230) &&
      refToNode(signal->getSendersBlockRef()) != getOwnNodeId())
  {
    infoEvent("delaying MASTER_LCPCONF due to error 7230 (from %u)",
              refToNode(signal->getSendersBlockRef()));
    sendSignalWithDelay(reference(), GSN_MASTER_LCPCONF, signal,
                        300, signal->getLength());
    return;
  }

  Uint32 senderNodeId = conf->senderNodeId;
  MasterLCPConf::State lcpState = (MasterLCPConf::State)conf->lcpState;
  const Uint32 failedNodeId = conf->failedNodeId;
  NodeRecordPtr nodePtr;
  nodePtr.i = senderNodeId;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
  /* Remember what the participant reported at take-over time. */
  nodePtr.p->lcpStateAtTakeOver = lcpState;

  CRASH_INSERTION(7180);

#ifdef VM_TRACE
  g_eventLogger->info("MASTER_LCPCONF from node %u", senderNodeId);
  printMASTER_LCP_CONF(stdout, &signal->theData[0], 0, 0);
#endif

  /* Look up (current take-over state, participant state) in the
   * transition table; a missing combination is illegal. */
  bool found = false;
  for(Uint32 i = 0; i<g_masterLCPTakeoverStateTransitionsRows; i++){
    const struct MASTERLCP_StateTransitions * valid =
      &g_masterLCPTakeoverStateTransitions[i];

    if(valid->CurrentState == c_lcpMasterTakeOverState.state &&
       valid->ParticipantState == lcpState){
      jam();
      found = true;
      c_lcpMasterTakeOverState.set(valid->NewState, __LINE__);
      break;
    }
  }
  ndbrequire(found);

  bool ok = false;
  switch(lcpState){
  case MasterLCPConf::LCP_STATUS_IDLE:
    ok = true;
    /* Participant had already completed the LCP under the old master. */
    m_master_lcp_req_lcp_already_completed = true;
    break;
  case MasterLCPConf::LCP_STATUS_ACTIVE:
  case MasterLCPConf::LCP_TAB_COMPLETED:
  case MasterLCPConf::LCP_TAB_SAVED:
    ok = true;
    /* Participant is still in the LCP; wait for its LCP_COMPLETE_REP. */
    c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.setWaitingFor(nodePtr.i);
    break;
  }
  ndbrequire(ok);

  /* receiveLoopMacro returns from this function until replies from all
   * waiting participants have been collected. */
  receiveLoopMacro(MASTER_LCPREQ, senderNodeId);
  /*-------------------------------------------------------------------------*/
  // We have now received all responses and are ready to take over the LCP
  // protocol as master.
  /*-------------------------------------------------------------------------*/
  MASTER_LCPhandling(signal, failedNodeId);
}//Dbdih::execMASTER_LCPCONF()
12667 
/**
 * MASTER_LCPREF: a participant rejects our MASTER_LCPREQ (it had not
 * completed the current LCP under the old master, see the discussion in
 * MASTER_LCPhandling). Stop waiting for its LQH LCP_COMPLETE_REP and,
 * once all participants have replied, continue with MASTER_LCPhandling().
 */
void Dbdih::execMASTER_LCPREF(Signal* signal)
{
  const MasterLCPRef * const ref = (MasterLCPRef *)&signal->theData[0];
  jamEntry();

  Uint32 senderNodeId = ref->senderNodeId;
  Uint32 failedNodeId = ref->failedNodeId;

  if (c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(senderNodeId))
  {
    jam();
    /* The sender will not complete this LCP; do not wait for it. */
    c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.clearWaitingFor(senderNodeId);
  }

  /* receiveLoopMacro returns from this function until replies from all
   * waiting participants have been collected. */
  receiveLoopMacro(MASTER_LCPREQ, senderNodeId);
  /*-------------------------------------------------------------------------*/
  // We have now received all responses and are ready to take over the LCP
  // protocol as master.
  /*-------------------------------------------------------------------------*/
  MASTER_LCPhandling(signal, failedNodeId);
}//Dbdih::execMASTER_LCPREF()
12689 
/**
 * Conclude the LCP master take-over once all participants have replied
 * (MASTER_LCPCONF/MASTER_LCPREF). Acts on the aggregated take-over state:
 *
 *  - LMTOS_ALL_IDLE:       no LCP was running; check whether to start one.
 *  - LMTOS_COPY_ONGOING:   the LCP was starting; restart the start
 *                          protocol (undoing the latestLCP_ID bump first
 *                          if this node had already started).
 *  - LMTOS_ALL_ACTIVE:     LCP in fragment phase; resume it as new master.
 *  - LMTOS_LCP_CONCLUDING: LCP finalising; drive it to completion.
 *
 * @param signal        signal object, reused for outgoing signals
 * @param failedNodeId  node whose failure triggered the take-over
 */
void Dbdih::MASTER_LCPhandling(Signal* signal, Uint32 failedNodeId)
{
  /* Consume the flag set in execMASTER_LCPCONF when some participant
   * reported LCP_STATUS_IDLE (already completed the LCP). */
  bool lcp_already_completed = m_master_lcp_req_lcp_already_completed;
  m_master_lcp_req_lcp_already_completed = false;
  /*-------------------------------------------------------------------------
   *
   * WE ARE NOW READY TO CONCLUDE THE TAKE OVER AS MASTER.
   * WE HAVE ENOUGH INFO TO START UP ACTIVITIES IN THE PROPER PLACE.
   * ALSO SET THE PROPER STATE VARIABLES.
   *------------------------------------------------------------------------*/
  c_lcpState.currentFragment.tableId = c_lcpMasterTakeOverState.minTableId;
  c_lcpState.currentFragment.fragmentId = c_lcpMasterTakeOverState.minFragId;
  c_lcpState.m_LAST_LCP_FRAG_ORD = c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH;
  DEB_LCP(("MASTER_LCPhandling: m_LAST_LCP_FRAG_ORD = %s",
           c_lcpState.m_LAST_LCP_FRAG_ORD.getText()));

  NodeRecordPtr failedNodePtr;
  failedNodePtr.i = failedNodeId;
  ptrCheckGuard(failedNodePtr, MAX_NDB_NODES, nodeRecord);

  switch (c_lcpMasterTakeOverState.state) {
  case LMTOS_ALL_IDLE:
    jam();
    /* --------------------------------------------------------------------- */
    // All nodes were idle in the LCP protocol. Start checking for start of LCP
    // protocol.
    /* --------------------------------------------------------------------- */
#ifdef VM_TRACE
    g_eventLogger->info("MASTER_LCPhandling:: LMTOS_ALL_IDLE -> checkLcpStart");
#endif
    checkLcpStart(signal, __LINE__, 0);
    break;
  case LMTOS_COPY_ONGOING:
    jam();
    /* --------------------------------------------------------------------- */
    // We were in the starting process of the LCP protocol. We will restart the
    // protocol by calculating the keep gci and storing the new lcp id.
    /* --------------------------------------------------------------------- */
#ifdef VM_TRACE
    g_eventLogger->info("MASTER_LCPhandling:: LMTOS_COPY_ONGOING -> storeNewLcpId");
#endif
    if (c_lcpState.lcpStatus == LCP_STATUS_ACTIVE) {
      jam();
      /*---------------------------------------------------------------------*/
      /*  WE NEED TO DECREASE THE LATEST LCP ID SINCE WE HAVE ALREADY        */
      /*  STARTED THIS */
      /*  LOCAL CHECKPOINT.                                                  */
      /*---------------------------------------------------------------------*/
#ifdef VM_TRACE
      Uint32 lcpId = SYSFILE->latestLCP_ID;
      g_eventLogger->info("Decreasing latestLCP_ID from %d to %d", lcpId, lcpId - 1);
#endif
      SYSFILE->latestLCP_ID--;
    }//if
    start_lcp_before_mutex(signal);
    break;
  case LMTOS_ALL_ACTIVE:
    {
      jam();
      /* -------------------------------------------------------------------
       * Everybody was in the active phase. We will restart sending
       * LCP_FRAG_ORD to the nodes from the new master.
       * We also need to set dihLcpStatus to ZACTIVE
       * in the master node since the master will wait for all nodes to
       * complete before finalising the LCP process.
       * ------------------------------------------------------------------ */
#ifdef VM_TRACE
      g_eventLogger->info("MASTER_LCPhandling:: LMTOS_ALL_ACTIVE -> "
                          "startLcpRoundLoopLab(table=%u, fragment=%u)",
                          c_lcpMasterTakeOverState.minTableId,
                          c_lcpMasterTakeOverState.minFragId);
#endif

      c_lcpState.keepGci = SYSFILE->keepGCI;

      /**
       * We need not protect against ongoing copy of meta data here since
       * that cannot be ongoing while we are taking over as master. The
       * reason is that a starting node will always fail also if any node
       * fails in the middle of the start process.
       */
      c_lcp_runs_with_pause_support = true;
      jam();
      /* No mutex is needed, call callback function immediately */
      master_lcp_fragmentMutex_locked(signal, failedNodePtr.i, 0);
      /* Returns here: the take-over conclusion below is done elsewhere. */
      return;
    }
  case LMTOS_LCP_CONCLUDING:
    {
      jam();
      /* ------------------------------------------------------------------- */
      // The LCP process is in the finalisation phase. We simply wait for it to
      // complete with signals arriving in. We need to check also if we should
      // change state due to table write completion during state
      // collection phase.
      /* ------------------------------------------------------------------- */

      /**
       * During master takeover, some participant nodes could have
       * been in IDLE state since they have already completed the
       * lcpId under the old master before it failed.
       *
       * When I, the new master, take over and send MASTER_LCP_REQ and
       * execute MASTER_LCPCONF from participants, exempt the
       * already-completed participants from the requirement to be
       * "not in IDLE state". Those who sent MASTER_LCPREF had not
       * completed the current LCP under the old master and thus
       * cannot be in IDLE state.
       */
      ndbrequire(c_lcpState.lcpStatus != LCP_STATUS_IDLE ||
                 lcp_already_completed);

      if (c_lcpState.lcpStatus == LCP_STATUS_IDLE)
      {
        jam();
        /**
         * From our point of view the LCP is completed since we heard the old
         * master conclude the LCP. But there are other nodes that still
         * haven't heard about the conclusion of the LCP since not all have
         * reached the IDLE state yet. To handle this in the code for
         * handling LCP_COMPLETE_REP we need to get back to the state
         * LCP_TAB_SAVED and ensure that we send LCP_COMPLETE_REP with
         * block 0 for all nodes that haven't heard of the completed LCP yet.
         *
         * We accomplish this by transferring the bitmap for the wait for
         * LCP_COMPLETE_REP to m_participatingDIH bitmask that is used to
         * send the LCP_COMPLETE_REP for block 0.
         */
        DEB_LCP_COMP(("LCP_IDLE => LCP_TAB_SAVED"));
        c_lcpState.setLcpStatus(LCP_TAB_SAVED, __LINE__);
        m_local_lcp_state.init_master_take_over_idle_to_tab_saved();
        for (Uint32 node = 1; node < MAX_NDB_NODES; node++)
        {
          if (c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(node))
          {
            jam();
            c_lcpState.m_participatingDIH.set(node);
          }
        }
      }

      c_lcp_runs_with_pause_support = true;
      jam();
      /* No mutex is needed, call callback function immediately */
      master_lcp_fragmentMutex_locked(signal, failedNodePtr.i, 0);
      /* Returns here: the take-over conclusion below is done elsewhere. */
      return;
    }
  default:
    ndbabort();
  }//switch
  /* Only reached for LMTOS_ALL_IDLE / LMTOS_COPY_ONGOING: report the
   * completed take-over, dump state (7012) and finish locally. */
  signal->theData[0] = NDB_LE_LCP_TakeoverCompleted;
  signal->theData[1] = c_lcpMasterTakeOverState.state;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);

  signal->theData[0] = 7012;
  execDUMP_STATE_ORD(signal);

  c_lcpMasterTakeOverState.set(LMTOS_IDLE, __LINE__);

  checkLocalNodefailComplete(signal, failedNodePtr.i, NF_LCP_TAKE_OVER);
}
12851 
12852 /* ------------------------------------------------------------------------- */
12853 /*       A BLOCK OR A NODE HAS COMPLETED THE HANDLING OF THE NODE FAILURE.   */
12854 /* ------------------------------------------------------------------------- */
execNF_COMPLETEREP(Signal * signal)12855 void Dbdih::execNF_COMPLETEREP(Signal* signal)
12856 {
12857   NodeRecordPtr failedNodePtr;
12858   NFCompleteRep * const nfCompleteRep = (NFCompleteRep *)&signal->theData[0];
12859   jamEntry();
12860   const Uint32 blockNo = nfCompleteRep->blockNo;
12861   Uint32 nodeId       = nfCompleteRep->nodeId;
12862   failedNodePtr.i = nfCompleteRep->failedNodeId;
12863 
12864   ptrCheckGuard(failedNodePtr, MAX_NDB_NODES, nodeRecord);
12865   switch (blockNo) {
12866   case DBTC:
12867     jam();
12868     ndbrequire(failedNodePtr.p->dbtcFailCompleted == ZFALSE);
12869     /* -------------------------------------------------------------------- */
12870     // Report the event that DBTC completed node failure handling.
12871     /* -------------------------------------------------------------------- */
12872     signal->theData[0] = NDB_LE_NodeFailCompleted;
12873     signal->theData[1] = DBTC;
12874     signal->theData[2] = failedNodePtr.i;
12875     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
12876 
12877     failedNodePtr.p->dbtcFailCompleted = ZTRUE;
12878     break;
12879   case DBDICT:
12880     jam();
12881     ndbrequire(failedNodePtr.p->dbdictFailCompleted == ZFALSE);
12882     /* --------------------------------------------------------------------- */
12883     // Report the event that DBDICT completed node failure handling.
12884     /* --------------------------------------------------------------------- */
12885     signal->theData[0] = NDB_LE_NodeFailCompleted;
12886     signal->theData[1] = DBDICT;
12887     signal->theData[2] = failedNodePtr.i;
12888     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
12889 
12890     failedNodePtr.p->dbdictFailCompleted = ZTRUE;
12891     break;
12892   case DBDIH:
12893     jam();
12894     ndbrequire(failedNodePtr.p->dbdihFailCompleted == ZFALSE);
12895     /* --------------------------------------------------------------------- */
12896     // Report the event that DBDIH completed node failure handling.
12897     /* --------------------------------------------------------------------- */
12898     signal->theData[0] = NDB_LE_NodeFailCompleted;
12899     signal->theData[1] = DBDIH;
12900     signal->theData[2] = failedNodePtr.i;
12901     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
12902 
12903     failedNodePtr.p->dbdihFailCompleted = ZTRUE;
12904     break;
12905   case DBLQH:
12906     jam();
12907     ndbrequire(failedNodePtr.p->dblqhFailCompleted == ZFALSE);
12908     /* --------------------------------------------------------------------- */
12909     // Report the event that DBDIH completed node failure handling.
12910     /* --------------------------------------------------------------------- */
12911     signal->theData[0] = NDB_LE_NodeFailCompleted;
12912     signal->theData[1] = DBLQH;
12913     signal->theData[2] = failedNodePtr.i;
12914     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
12915 
12916     failedNodePtr.p->dblqhFailCompleted = ZTRUE;
12917     break;
12918   case 0: /* Node has finished */
12919     jam();
12920     ndbrequire(nodeId < MAX_NDB_NODES);
12921 
12922     if (failedNodePtr.p->recNODE_FAILREP == ZFALSE) {
12923       jam();
12924       /* ------------------------------------------------------------------- */
12925       // We received a report about completion of node failure before we
12926       // received the message about the NODE failure ourselves.
12927       // We will send the signal to ourselves with a small delay
12928       // (10 milliseconds).
12929       /* ------------------------------------------------------------------- */
12930       //nf->from = __LINE__;
12931       sendSignalWithDelay(reference(), GSN_NF_COMPLETEREP, signal, 10,
12932 			  signal->length());
12933       return;
12934     }//if
12935 
12936     if (!failedNodePtr.p->m_NF_COMPLETE_REP.isWaitingFor(nodeId)){
12937       jam();
12938       return;
12939     }
12940 
12941     failedNodePtr.p->m_NF_COMPLETE_REP.clearWaitingFor(nodeId);;
12942 
12943     /* -------------------------------------------------------------------- */
12944     // Report the event that nodeId has completed node failure handling.
12945     /* -------------------------------------------------------------------- */
12946     signal->theData[0] = NDB_LE_NodeFailCompleted;
12947     signal->theData[1] = 0;
12948     signal->theData[2] = failedNodePtr.i;
12949     signal->theData[3] = nodeId;
12950     sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 4, JBB);
12951 
12952     nodeFailCompletedCheckLab(signal, failedNodePtr);
12953     return;
12954     break;
12955   default:
12956     ndbabort();
12957   }//switch
12958   if (failedNodePtr.p->dbtcFailCompleted == ZFALSE) {
12959     jam();
12960     return;
12961   }//if
12962   if (failedNodePtr.p->dbdictFailCompleted == ZFALSE) {
12963     jam();
12964     return;
12965   }//if
12966   if (failedNodePtr.p->dbdihFailCompleted == ZFALSE) {
12967     jam();
12968     return;
12969   }//if
12970   if (failedNodePtr.p->dblqhFailCompleted == ZFALSE) {
12971     jam();
12972     return;
12973   }//if
12974   /* ----------------------------------------------------------------------- */
12975   /*     ALL BLOCKS IN THIS NODE HAVE COMPLETED THEIR PART OF HANDLING THE   */
12976   /*     NODE FAILURE. WE CAN NOW REPORT THIS COMPLETION TO ALL OTHER NODES. */
12977   /* ----------------------------------------------------------------------- */
12978   NodeRecordPtr nodePtr;
12979   for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++)
12980   {
12981     jam();
12982     ptrAss(nodePtr, nodeRecord);
12983     if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
12984       jam();
12985       BlockReference ref = calcDihBlockRef(nodePtr.i);
12986       NFCompleteRep * const nf = (NFCompleteRep *)&signal->theData[0];
12987       nf->blockNo      = 0;
12988       nf->nodeId       = cownNodeId;
12989       nf->failedNodeId = failedNodePtr.i;
12990       nf->from = __LINE__;
12991       sendSignal(ref, GSN_NF_COMPLETEREP, signal,
12992                  NFCompleteRep::SignalLength, JBB);
12993     }//if
12994   }//for
12995   return;
12996 }//Dbdih::execNF_COMPLETEREP()
12997 
/**
 * Check whether all nodes have sent their node-level completion report
 * (NF_COMPLETEREP with blockNo == 0) for failedNodePtr. If the waiting
 * set is done, mark the node DEAD, report the completion event, inform
 * QMGR via NDB_FAILCONF and update the node's recovery status so it may
 * start again.
 */
void Dbdih::nodeFailCompletedCheckLab(Signal* signal,
                                      NodeRecordPtr failedNodePtr)
{
  jam();
  if (!failedNodePtr.p->m_NF_COMPLETE_REP.done()){
    jam();
    /* Still waiting for node-level completion reports. */
    return;
  }//if
  /* ---------------------------------------------------------------------- */
  /*    ALL BLOCKS IN ALL NODES HAVE NOW REPORTED COMPLETION OF THE NODE    */
  /*    FAILURE HANDLING. WE ARE NOW READY TO ACCEPT THAT THIS NODE STARTS  */
  /*    AGAIN.                                                              */
  /* ---------------------------------------------------------------------- */
  jam();
  failedNodePtr.p->nodeStatus = NodeRecord::DEAD;
  failedNodePtr.p->recNODE_FAILREP = ZFALSE;

  /* ---------------------------------------------------------------------- */
  // Report the event that all nodes completed node failure handling.
  /* ---------------------------------------------------------------------- */
  signal->theData[0] = NDB_LE_NodeFailCompleted;
  signal->theData[1] = 0;
  signal->theData[2] = failedNodePtr.i;
  signal->theData[3] = 0;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 4, JBB);

  /* ---------------------------------------------------------------------- */
  // Report to QMGR that we have concluded recovery handling of this node.
  /* ---------------------------------------------------------------------- */
  signal->theData[0] = failedNodePtr.i;
  sendSignal(QMGR_REF, GSN_NDB_FAILCONF, signal, 1, JBB);
  setNodeRecoveryStatus(failedNodePtr.i, NodeRecord::NODE_FAILURE_COMPLETED);
  return;
}//Dbdih::nodeFailCompletedCheckLab()
13032 
13033 /*****************************************************************************/
13034 /* **********     SEIZING / RELEASING MODULE                     *************/
13035 /*****************************************************************************/
13036 /*
13037   3.4   L O C A L  N O D E   S E I Z E
13038   ************************************
13039   */
/*
  3.7   A D D   T A B L E
  ***********************
  */
13044 /*****************************************************************************/
13045 /* **********     TABLE ADDING MODULE                            *************/
13046 /*****************************************************************************/
13047 /*
13048   3.7.1   A D D   T A B L E   M A I N L Y
13049   ***************************************
13050   */
13051 
inc_node_or_group(Uint32 & node,Uint32 max_node)13052 static inline void inc_node_or_group(Uint32 &node, Uint32 max_node)
13053 {
13054   Uint32 next = node + 1;
13055   node = (next == max_node ? 0 : next);
13056 }
13057 
13058 /*
13059   Spread fragments in backwards compatible mode
13060 */
set_default_node_groups(Signal * signal,Uint32 noFrags)13061 static void set_default_node_groups(Signal *signal, Uint32 noFrags)
13062 {
13063   Uint16 *node_group_array = (Uint16*)&signal->theData[25];
13064   Uint32 i;
13065   for (i = 0; i < noFrags; i++)
13066     node_group_array[i] = NDB_UNDEF_NODEGROUP;
13067 }
13068 
find_min_index(const Uint16 * array,Uint32 cnt,Uint32 start_pos,Uint32 first_pos)13069 static Uint32 find_min_index(const Uint16* array,
13070                              Uint32 cnt,
13071                              Uint32 start_pos,
13072                              Uint32 first_pos)
13073 {
13074   Uint32 m = start_pos;
13075   Uint32 min_value = array[start_pos];
13076 
13077   for (Uint32 i = start_pos + 1; i<cnt; i++)
13078   {
13079     if (array[i] < min_value)
13080     {
13081       m = i;
13082       min_value = array[i];
13083     }
13084   }
13085   for (Uint32 i = first_pos; i < start_pos; i++)
13086   {
13087     if (array[i] < min_value)
13088     {
13089       m = i;
13090       min_value = array[i];
13091     }
13092   }
13093   return m;
13094 }
13095 
/**
 * Return the number of fragments to create per node for default
 * fragmentations: the minimum LQH worker count over this node and all
 * alive data nodes, where an ndbd node (zero workers) counts as one.
 * The value is computed once and cached in c_fragments_per_node_.
 */
Uint32
Dbdih::getFragmentsPerNode()
{
  jam();
  if (c_fragments_per_node_ != 0)
  {
    /* Already computed; return the cached value. */
    return c_fragments_per_node_;
  }

  c_fragments_per_node_ = getLqhWorkers();
  if (c_fragments_per_node_ == 0)
    c_fragments_per_node_ = 1; // ndbd

  /* Walk the alive-node list and take the minimum worker count.
   * NOTE(review): assumes cfirstAliveNode is a valid node id here. */
  NodeRecordPtr nodePtr;
  nodePtr.i = cfirstAliveNode;
  do
  {
    jam();
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    Uint32 workers = getNodeInfo(nodePtr.i).m_lqh_workers;
    if (workers == 0) // ndbd
      workers = 1;

    c_fragments_per_node_ = MIN(workers, c_fragments_per_node_);
    nodePtr.i = nodePtr.p->nextNode;
  } while (nodePtr.i != RNIL);

  if (c_fragments_per_node_ == 0)
  {
    /* Should be unreachable; fall back defensively to one fragment. */
    ndbassert(false);
    c_fragments_per_node_ = 1;
  }
#ifdef VM_TRACE
  ndbout_c("Using %u fragments per node", c_fragments_per_node_);
#endif
  return c_fragments_per_node_;
}
13133 
13134 void
init_next_replica_node(Uint16 (* next_replica_node)[MAX_NDB_NODE_GROUPS][NDBMT_MAX_WORKER_INSTANCES],Uint32 noOfReplicas)13135 Dbdih::init_next_replica_node(
13136   Uint16 (*next_replica_node)[MAX_NDB_NODE_GROUPS][NDBMT_MAX_WORKER_INSTANCES],
13137   Uint32 noOfReplicas)
13138 {
13139   for (Uint32 i = 0; i < MAX_NDB_NODE_GROUPS; i++)
13140   {
13141     for (Uint32 j = 0; j < NDBMT_MAX_WORKER_INSTANCES; j++)
13142     {
13143       (*next_replica_node)[i][j] = (j % noOfReplicas);
13144     }
13145   }
13146 }
13147 
13148 /**
13149  * CREATE_FRAGMENTATION_REQ
13150  *
13151  * CREATE_FRAGMENTATION_REQ returns a FRAGMENTATION structure, a.k.a.
13152  * ReplicaData in Ndbapi.
13153  *
13154  * The FRAGMENTATION structure contains a mapping from fragment id to log part
13155  * id and a node id for each fragment replica, the first node id is for primary
13156  * replica.
13157  *
 * FRAGMENTATION consists of an array of Uint16 values:
13159  *
13160  * 0: #replicas
13161  * 1: #fragments
13162  * 2 + fragmentId*(1 + #replicas) + 0: log part id
13163  * 2 + fragmentId*(1 + #replicas) + 1: primary replica node id
13164  * 2 + fragmentId*(1 + #replicas) + 2: backup replica node id
13165  * ...
13166  *
13167  * CREATE_FRAGMENTATION_REQ supports three request types selected by setting
13168  * requestInfo in signal.
13169  *
13170  * requestInfo             | Description
13171  * ------------------------+----------------------------------------------
13172  * RI_CREATE_FRAGMENTATION | Create a new fragmentation.
13173  * RI_ADD_FRAGMENTS        | Adjust a fragmentation by adding fragments.
13174  * RI_GET_FRAGMENTATION    | Return the current fragmentation for a table.
13175  *
13176  * == Common parameters for all request types ==
13177  *
13178  *   senderRef - Used if response should be sent by signal, only used in old
13179  *       versions before and including 5.0.96, otherwise it must be zero.  New
13180  *       uses of GSN_CREATE_FRAGMENTATION_REQ must be executed using
13181  *       EXECUTE_DIRECT.
13182  *
13183  *   senderData - Used if senderRef is non-zero.
13184  *
13185  *   Fragmentation is returned in theData[25..] and caller must ensure theData
13186  *   is big enough for storing the fragmentation.
13187  *
13188  * == Values for unused parameters ==
13189  *
13190  *   senderRef         = 0
13191  *   senderData        = RNIL
13192  *   requestInfo  Must be set!
13193  *   fragmentationType = 0
13194  *   partitionBalance = 0
13195  *   primaryTableId    = RNIL
13196  *   noOfFragments     = 0
13197  *   partitionCount    = 0
13198  *   map_ptr_i         = RNIL
13199  *
13200  * == Create fragmentation (requestInfo RI_CREATE_FRAGMENTATION) ==
13201  *
13202  *   noOfFragments - Used by some fragmentation types, see fragmentationType
13203  *       below.
13204  *
13205  *   partitionCount - Must be same as noOfFragments, unless fragmentation is
13206  *       for a fully replicated table.  For fully replicated tables
13207  *       noOfFragments must be a multiple of partitionCount.
13208  *
13209  *   fragmentationType - Specifies how table is partitioned into fragments.
13210  *       Since MySQL Cluster 7.0 server only uses UserDefined and
13211  *       HashMapPartition.  Other types can occur from restoring old Ndb
13212  *       backups, or using Ndbapi directly.
13213  *
13214  *         AllNodesSmallTable - noOfFragments is set to 1 per LDM.
13215  *
13216  *         AllNodesMediumTable - noOfFragments is set to 2 per LDM.
13217  *
13218  *         AllNodesLargeTable - noOfFragments is set to 4 per LDM.
13219  *
13220  *         SingleFragment - noOfFragments is set to one.
13221  *
13222  *         DistrKeyHash
13223  *         DistrKeyLin
13224  *           If noOfFragments is zero, noOfFragments is set to 1 per LDM.
13225  *           FragmentData from theData[25..] is used if noOfFragments from
13226  *           signal is non-zero.
13227  *
13228  *         UserDefined - noOfFragment must be non zero.  FragmentData from
13229  *             theData[25..] is used.
13230  *
13231  *         HashMapPartition - Hashmap to use is given by map_ptr_i which must
13232  *             be set (not RNIL).  Both noOfFragments and partitionCount must
13233  *             be set.  Further more partitionCount must be equal to hashmaps
13234  *             partition count (m_fragments).
13235  *             For fully replicated tables, noOfFragments should be a multiple
13236  *             of partitionCount.
13237  *
13238  *   partitionBalance - Determines how the number of fragments depends on
13239  *       cluster configuration such as number of replicas, number of
13240  *       nodegroups, and, number of LDM per node.  The parameter is only used
13241  *       for HashMapPartition.
13242  *
13243  *   FragmentData theData[25..] - An array of Uint16 mapping each fragment to
13244  *       a nodegroup.  NDB_UNDEF_NODEGROUP is used to mark that no specific
13245  *       nodegroup is wanted for fragment.
13246  *
13247  * == Adjust fragmentation by adding fragments (requestInfo RI_ADD_FRAGMENTS) ==
13248  *
13249  *   primaryTableId - Id of table fragmentation to adjust, must not be RNIL.
13250  *
13251  *   noOfFragments - New fragment count must be set (non zero).  Old fragment
13252  *       count is taken from old fragmentation for table.
13253  *
13254  *   partitionCount - New partition count.  For non fully replicated tables
13255  *       partitionCount must be same as noOfFragments.  For fully replicated
13256  *       tables partitionCount must be the same as the old partitionCount.
13257  *
13258  *   map_ptr_i - Is not used from signal but taken from old fragmentation.
13259  *
13260  *   fragmentationType - Must be HashMapPartition or DistrKeyOrderedIndex.
13261  *
13262  * == Get fragmentation (requestInfo RI_GET_FRAGMENTATION) ==
13263  *
13264  *   primaryTableId - Id of table whose fragmentation to return, must not be RNIL.
13265  *
13266  * No other parameters are used from signal (except for the common parameters).
13267  *
13268  */
execCREATE_FRAGMENTATION_REQ(Signal * signal)13269 void Dbdih::execCREATE_FRAGMENTATION_REQ(Signal * signal)
13270 {
13271   jamEntry();
13272   CreateFragmentationReq * const req =
13273     (CreateFragmentationReq*)signal->getDataPtr();
13274 
13275   const Uint32 senderRef = req->senderRef;
13276   const Uint32 senderData = req->senderData;
13277   Uint32 noOfFragments = req->noOfFragments;
13278   const Uint32 fragType = req->fragmentationType;
13279   const Uint32 primaryTableId = req->primaryTableId;
13280   const Uint32 map_ptr_i = req->map_ptr_i;
13281   const Uint32 flags = req->requestInfo;
13282   const Uint32 partitionBalance = req->partitionBalance;
13283   Uint32 partitionCount = req->partitionCount;
13284   Uint32 err = 0;
13285   bool use_specific_fragment_count = false;
13286   const Uint32 defaultFragments =
13287     getFragmentsPerNode() * cnoOfNodeGroups * cnoReplicas;
13288   const Uint32 maxFragments =
13289     MAX_FRAG_PER_LQH * getFragmentsPerNode() * cnoOfNodeGroups;
13290 
13291   if (flags != CreateFragmentationReq::RI_GET_FRAGMENTATION)
13292   {
13293     D("CREATE_FRAGMENTATION_REQ: " <<
13294       " primaryTableId: " << primaryTableId <<
13295       " partitionBalance: " <<
13296         getPartitionBalanceString(partitionBalance) <<
13297       " fragType: " << fragType <<
13298       " noOfFragments: " << noOfFragments);
13299   }
13300 
13301   do {
13302     NodeGroupRecordPtr NGPtr;
13303     TabRecordPtr primTabPtr;
13304     Uint32 count = 2;
13305     Uint16 noOfReplicas = cnoReplicas;
13306     Uint16 *fragments = (Uint16*)(signal->theData+25);
13307     if (primaryTableId == RNIL) {
13308       jam();
13309       switch ((DictTabInfo::FragmentType)fragType){
13310         /*
13311           Backward compatability and for all places in code not changed.
13312         */
13313       case DictTabInfo::AllNodesSmallTable:
13314         jam();
13315         noOfFragments = defaultFragments;
13316         partitionCount = noOfFragments;
13317         set_default_node_groups(signal, noOfFragments);
13318         break;
13319       case DictTabInfo::AllNodesMediumTable:
13320         jam();
13321         noOfFragments = 2 * defaultFragments;
13322         if (noOfFragments > maxFragments)
13323           noOfFragments = maxFragments;
13324         partitionCount = noOfFragments;
13325         set_default_node_groups(signal, noOfFragments);
13326         break;
13327       case DictTabInfo::AllNodesLargeTable:
13328         jam();
13329         noOfFragments = 4 * defaultFragments;
13330         if (noOfFragments > maxFragments)
13331           noOfFragments = maxFragments;
13332         partitionCount = noOfFragments;
13333         set_default_node_groups(signal, noOfFragments);
13334         break;
13335       case DictTabInfo::SingleFragment:
13336         jam();
13337         noOfFragments = 1;
13338         partitionCount = noOfFragments;
13339         use_specific_fragment_count = true;
13340         set_default_node_groups(signal, noOfFragments);
13341         break;
13342       case DictTabInfo::DistrKeyHash:
13343         jam();
13344         // Fall through
13345       case DictTabInfo::DistrKeyLin:
13346         jam();
13347         if (noOfFragments == 0)
13348         {
13349           jam();
13350           noOfFragments = defaultFragments;
13351           partitionCount = noOfFragments;
13352           set_default_node_groups(signal, noOfFragments);
13353         }
13354         else
13355         {
13356           jam();
13357           ndbrequire(noOfFragments == partitionCount);
13358           use_specific_fragment_count = true;
13359         }
13360         break;
13361       case DictTabInfo::UserDefined:
13362         jam();
13363         use_specific_fragment_count = true;
13364         if (noOfFragments == 0)
13365         {
13366           jam();
13367           err = CreateFragmentationRef::InvalidFragmentationType;
13368         }
13369         break;
13370       case DictTabInfo::HashMapPartition:
13371       {
13372         jam();
13373         ndbrequire(map_ptr_i != RNIL);
13374         Ptr<Hash2FragmentMap> ptr;
13375         g_hash_map.getPtr(ptr, map_ptr_i);
13376         if (noOfFragments == 0 ||
13377             partitionCount != ptr.p->m_fragments ||
13378             noOfFragments % partitionCount != 0)
13379         {
13380           jam();
13381           err = CreateFragmentationRef::InvalidFragmentationType;
13382           break;
13383         }
13384         set_default_node_groups(signal, noOfFragments);
13385         break;
13386       }
13387       case DictTabInfo::DistrKeyOrderedIndex:
13388         jam();
13389         // Fall through
13390       default:
13391         jam();
13392         err = CreateFragmentationRef::InvalidFragmentationType;
13393       }
13394       if (err)
13395         break;
13396       /*
13397         When we come here the the exact partition is specified
13398         and there is an array of node groups sent along as well.
13399       */
13400       memcpy(&tmp_node_group_id[0], &signal->theData[25], 2 * noOfFragments);
13401       Uint16 (*next_replica_node)[MAX_NDB_NODE_GROUPS][NDBMT_MAX_WORKER_INSTANCES] =
13402         &tmp_next_replica_node;
13403       init_next_replica_node(&tmp_next_replica_node, noOfReplicas);
13404 
13405       Uint32 default_node_group= 0;
13406       Uint32 next_log_part = 0;
13407       if ((DictTabInfo::FragmentType)fragType == DictTabInfo::HashMapPartition)
13408       {
13409         jam();
13410         if (partitionBalance != NDB_PARTITION_BALANCE_FOR_RP_BY_LDM)
13411         {
13412           jam();
13413           /**
13414            * The default partitioned table using FOR_RP_BY_LDM will
13415            * distribute exactly one primary replica to each LDM in each node,
13416            * so no need to use the information from other table creations to
13417            * define the primary replica node mapping. For all other tables
13418            * we will attempt to spread the replicas around by using a variable
13419            * in the master node that contains information about other tables
13420            * and how those have been distributed.
13421            */
13422           next_replica_node = &c_next_replica_node;
13423         }
13424         switch (partitionBalance)
13425         {
13426           case NDB_PARTITION_BALANCE_FOR_RP_BY_NODE:
13427           case NDB_PARTITION_BALANCE_FOR_RA_BY_NODE:
13428           {
13429             /**
13430              * Table will only use one log part, we will try spreading over
13431              * different log parts, however the variable isn't persistent, so
13432              * recommendation is to use only small tables for these
13433              * partition balances.
13434              *
13435              * One per node type will use one LDM per replica since fragment
13436              * count is higher.
13437              */
13438             jam();
13439             use_specific_fragment_count = true;
13440             break;
13441           }
13442           case NDB_PARTITION_BALANCE_FOR_RP_BY_LDM:
13443           case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM:
13444           case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM_X_2:
13445           case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM_X_3:
13446           case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM_X_4:
13447           {
13448             /**
13449              * These tables will spread over all LDMs and over all node
13450              * groups. We will start with LDM 0 by setting next_log_part
13451              * to -1 and when we do ++ on first fragment in node group
13452              * 0 it will be set to 0.
13453              * We won't touch m_next_log_part in this case since it won't
13454              * change its value anyways.
13455              *
13456              * This is the same as the default behaviour except that the
13457              * old behaviour could be affected by previous tables. This
13458              * behaviour is now removed.
13459              */
13460             jam();
13461             next_log_part = (~0);
13462             break;
13463           }
13464           case NDB_PARTITION_BALANCE_SPECIFIC:
13465           {
13466             jam();
13467             use_specific_fragment_count = true;
13468             break;
13469           }
13470           default:
13471           {
13472             ndbabort();
13473           }
13474         }
13475       }
13476       else
13477       {
13478         /**
13479          * The only table type supported is HashMaps, so we can change the
13480          * mapping of non-HashMap tables to a more stringent one. We will
13481          * still always start at LDM 0 except for tables defined to have
13482          * non-standard fragment counts. In this case we will start at
13483          * m_next_log_part to attempt in spreading out the use on the
13484          * LDMs although we won't perform a perfect job.
13485          */
13486         next_replica_node = &c_next_replica_node;
13487         if (!use_specific_fragment_count)
13488         {
13489           jam();
13490           next_log_part = (~0);
13491         }
13492       }
13493       /**
13494        * Fragments are spread out in 3 different dimensions.
13495        * 1) Node group dimension, each fragment belongs to a node group.
13496        * 2) LDM instance dimenstion, each fragment is mapped to one of the
13497        *    LDMs.
13498        * 3) Primary replica dimension, each fragment maps the primary replica
13499        *    to one of the nodes in the node group.
13500        *
13501        * Node group Dimension:
13502        * ---------------------
13503        * Here the fragments are spread out in easy manner by placing the first
13504        * fragment in Node Group 0, the next in Node Group 1 (if there is one).
13505        * When we have mapped a fragment into each node group, then we restart
13506        * from Node Group 0.
13507        *
13508        * LDM dimension:
13509        * --------------
13510        * The default behaviour in 7.4 and earlier was to spread those in the
13511        * same manner as node groups, one started at the next LDM to receive
13512        * a fragment, this is normally LDM 0. The next fragment is mapped to
13513        * next LDM, normally 1 (if it exists). One proceeds like this until
13514        * one reaches the last LDM, then one starts again from LDM 0.
13515        * A variable m_next_log_part is kept for as long as the node lives.
13516        * Thus we cannot really tell on beforehand where fragments will end
13517        * up in this fragmentation scheme.
13518        *
13519        * We have changed the behaviour for normal tables in 7.5. Now we will
13520        * always start from LDM 0, we will use LDM 0 until all node groups
13521        * have received one fragment in LDM 0. Then when we return to Node
13522        * Group 0 we will step to LDM 1. When we reach the last LDM we will
13523        * step back to LDM 0 again.
13524        *
13525        * For tables with specific fragment count we will use the same mapping
13526        * algorithm except that we will start on the next LDM that was saved
13527        * from creating the last table with specific fragment count.
13528        * This means that tables that have a small number of fragments we will
13529        * attempt to spread them and this has precedence before predictable
13530        * fragmentation.
13531        *
13532        * For fully replicated tables that use all LDMs we want the primary
13533        * fragments to be the first ones. Thus we ensure that the first
13534        * fragments are all stored in Node Group 0 with increasing LDM number.
13535        * If we only have one fragment per Node Group then no changes are
13536        * needed for this. We discover fully replicated tables through the
13537        * fact that noOfFragments != partitionCount. This actually only
13538        * differs with fully replicated tables that are created with more
13539        * than one node group. One node group will however work with the
13540        * traditional algorithm since it then becomes the same.
13541        *
13542        * Primary replica dimension:
13543        * --------------------------
13544        * We will start with the first node in each node group in the first
13545        * round of node groups and with LDM 0. In the second turn for LDM 1
13546        * we will use the second node in the node group. In this manner we
13547        * will get a decent spreading of primary replicas on the nodes in the
13548        * node groups. It won't be perfect, but when we support read from
13549        * backup replicas the need to handle primary replica and backup
13550        * replica is much smaller.
13551        *
13552        * We keep information about tables previously created to try to get
13553        * an even distribution of the primary replicas in different tables
13554        * in the cluster.
13555        */
13556 
13557       if (use_specific_fragment_count)
13558       {
13559         jam();
13560         default_node_group = c_nextNodeGroup;
13561       }
13562       for(Uint32 fragNo = 0; fragNo < noOfFragments; fragNo++)
13563       {
13564         jam();
13565         NGPtr.i = tmp_node_group_id[fragNo];
13566         ndbrequire(default_node_group < MAX_NDB_NODE_GROUPS);
13567         if (NGPtr.i == NDB_UNDEF_NODEGROUP)
13568         {
13569           jam();
13570 	  NGPtr.i = c_node_groups[default_node_group];
13571         }
13572         if (NGPtr.i >= MAX_NDB_NODE_GROUPS)
13573         {
13574           jam();
13575           err = CreateFragmentationRef::InvalidNodeGroup;
13576           break;
13577         }
13578         ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
13579         if (NGPtr.p->nodegroupIndex == RNIL)
13580         {
13581           jam();
13582           err = CreateFragmentationRef::InvalidNodeGroup;
13583           break;
13584         }
13585         Uint32 logPart;
13586         if (use_specific_fragment_count)
13587         {
13588           jam();
13589           /**
13590            * Time to increment to next LDM
13591            * Most tables use one fragment per LDM, but if there are
13592            * tables that only use one LDM we make sure in this manner that
13593            * those tables are spread over different LDMs.
13594            *
13595            * This means that the first fragment can end up a bit
13596            * anywhere, but there will still be a good spread of
13597            * the fragments over the LDMs.
13598            */
13599           logPart = NGPtr.p->m_next_log_part++ % globalData.ndbLogParts;
13600         }
13601         else
13602         {
13603           jam();
13604           if (NGPtr.i == 0 ||
13605               (noOfFragments != partitionCount))
13606           {
13607             /** Fully replicated table with one fragment per LDM first
13608              * distributed over all LDMs before moving to the next
13609              * node group.
13610              */
13611             jam();
13612             next_log_part++;
13613           }
13614           logPart = next_log_part % globalData.ndbLogParts;
13615         }
13616         ndbrequire(logPart < NDBMT_MAX_WORKER_INSTANCES);
13617         fragments[count++] = logPart; // Store logpart first
13618 
13619         /* Select primary replica node as next index in double array */
13620         Uint32 node_index = (*next_replica_node)[NGPtr.i][logPart];
13621         ndbrequire(node_index < noOfReplicas);
13622 
13623         for(Uint32 replicaNo = 0; replicaNo < noOfReplicas; replicaNo++)
13624         {
13625           jam();
13626           const Uint16 nodeId = NGPtr.p->nodesInGroup[node_index];
13627           fragments[count++]= nodeId;
13628           inc_node_or_group(node_index, NGPtr.p->nodeCount);
13629           ndbrequire(node_index < noOfReplicas);
13630         }
13631         inc_node_or_group(node_index, NGPtr.p->nodeCount);
13632         ndbrequire(node_index < noOfReplicas);
13633         (*next_replica_node)[NGPtr.i][logPart] = node_index;
13634 
13635         /**
13636          * Next node group for next fragment
13637          */
13638         if (noOfFragments == partitionCount ||
13639             ((fragNo + 1) % partitionCount == 0))
13640         {
13641           /**
13642            * Change to new node group for
13643            * 1) Normal tables
13644            * 2) Tables not stored on all LDMs
13645            * 3) Fully replicated when at last LDM
13646            *
13647            * Thus always except for fully replicated using all LDMs and
13648            * not yet used all LDMs.
13649            */
13650           jam();
13651           inc_node_or_group(default_node_group, cnoOfNodeGroups);
13652         }
13653       }
13654       if (err)
13655       {
13656         jam();
13657         break;
13658       }
13659       if (use_specific_fragment_count)
13660       {
13661         jam();
13662         ndbrequire(default_node_group < MAX_NDB_NODE_GROUPS);
13663         c_nextNodeGroup = default_node_group;
13664       }
13665     } else {
13666       if (primaryTableId >= ctabFileSize) {
13667         jam();
13668         err = CreateFragmentationRef::InvalidPrimaryTable;
13669         break;
13670       }
13671       primTabPtr.i = primaryTableId;
13672       ptrAss(primTabPtr, tabRecord);
13673       if (primTabPtr.p->tabStatus != TabRecord::TS_ACTIVE) {
13674         jam();
13675         err = CreateFragmentationRef::InvalidPrimaryTable;
13676         break;
13677       }
13678       // Keep track of no of (primary) fragments per node
13679       Uint16 (*next_replica_node)[MAX_NDB_NODE_GROUPS][NDBMT_MAX_WORKER_INSTANCES] =
13680         &tmp_next_replica_node;
13681 
13682       memcpy(tmp_next_replica_node,
13683              c_next_replica_node,
13684              sizeof(tmp_next_replica_node));
13685       memset(tmp_next_replica_node_set, 0, sizeof(tmp_next_replica_node_set));
13686       memset(tmp_fragments_per_node, 0, sizeof(tmp_fragments_per_node));
13687       memset(tmp_fragments_per_ldm, 0, sizeof(tmp_fragments_per_ldm));
13688       for (Uint32 fragNo = 0; fragNo < primTabPtr.p->totalfragments; fragNo++) {
13689         jam();
13690         FragmentstorePtr fragPtr;
13691         ReplicaRecordPtr replicaPtr;
13692         getFragstore(primTabPtr.p, fragNo, fragPtr);
13693         Uint32 log_part_id = fragPtr.p->m_log_part_id;
13694         ndbrequire(log_part_id < NDBMT_MAX_WORKER_INSTANCES);
13695 	fragments[count++] = log_part_id;
13696         fragments[count++] = fragPtr.p->preferredPrimary;
13697 
13698         /* Calculate current primary replica node double array */
13699         NGPtr.i = getNodeGroup(fragPtr.p->preferredPrimary);
13700         ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
13701         for(Uint32 replicaNo = 0; replicaNo < noOfReplicas; replicaNo++)
13702         {
13703           jam();
13704           if (fragPtr.p->preferredPrimary ==
13705               NGPtr.p->nodesInGroup[replicaNo])
13706           {
13707             Uint32 node_index = replicaNo;
13708             inc_node_or_group(node_index, NGPtr.p->nodeCount);
13709             ndbrequire(node_index < noOfReplicas);
13710             (*next_replica_node)[NGPtr.i][log_part_id] = node_index;
13711             tmp_next_replica_node_set[NGPtr.i][log_part_id] = TRUE;
13712             break;
13713           }
13714         }
13715         for (replicaPtr.i = fragPtr.p->storedReplicas;
13716              replicaPtr.i != RNIL;
13717              replicaPtr.i = replicaPtr.p->nextPool) {
13718           jam();
13719           c_replicaRecordPool.getPtr(replicaPtr);
13720           tmp_fragments_per_ldm[replicaPtr.p->procNode][log_part_id]++;
13721           tmp_fragments_per_node[replicaPtr.p->procNode]++;
13722           if (replicaPtr.p->procNode != fragPtr.p->preferredPrimary) {
13723             jam();
13724             fragments[count++]= replicaPtr.p->procNode;
13725           }
13726         }
13727         for (replicaPtr.i = fragPtr.p->oldStoredReplicas;
13728              replicaPtr.i != RNIL;
13729              replicaPtr.i = replicaPtr.p->nextPool) {
13730           jam();
13731           c_replicaRecordPool.getPtr(replicaPtr);
13732           tmp_fragments_per_ldm[replicaPtr.p->procNode][log_part_id]++;
13733           tmp_fragments_per_node[replicaPtr.p->procNode]++;
13734           if (replicaPtr.p->procNode != fragPtr.p->preferredPrimary) {
13735             jam();
13736             fragments[count++]= replicaPtr.p->procNode;
13737             tmp_fragments_per_node[replicaPtr.p->procNode]++;
13738           }
13739         }
13740       }
13741       if (flags == CreateFragmentationReq::RI_GET_FRAGMENTATION)
13742       {
13743         jam();
13744         noOfFragments = primTabPtr.p->totalfragments;
13745       }
13746       else if (flags == CreateFragmentationReq::RI_ADD_FRAGMENTS)
13747       {
13748         jam();
13749         ndbrequire(fragType == DictTabInfo::HashMapPartition ||
13750                    fragType == DictTabInfo::DistrKeyOrderedIndex);
13751         /**
13752          * All nodes that don't belong to a nodegroup to ~0
13753          * tmp_fragments_per_node so that they don't get any more...
13754          */
13755         for (Uint32 i = 1; i <= m_max_node_id; i++)
13756         {
13757           if (getNodeStatus(i) == NodeRecord::NOT_IN_CLUSTER ||
13758               getNodeGroup(i) >= cnoOfNodeGroups)
13759           {
13760             jam();
13761             ndbassert(tmp_fragments_per_node[i] == 0);
13762             tmp_fragments_per_node[i] = ~(Uint16)0;
13763           }
13764         }
13765 
13766         /**
13767          * Fragments are also added in 3 dimensions.
13768          * Node group Dimension:
13769          * ---------------------
13770          * When we add fragments the algorithm strives to spread the fragments
13771          * in node group order first. If no new node groups exist to map the
13772          * table into then one will simply start up again at Node Group 0.
13773          *
13774          * So the next fragment always seeks out the most empty node group and
13775          * adds the fragment there. When new node groups exists and we haven't
13776          * changed the partition balance then all new fragments will end up
13777          * in the new node groups. If we change partition balance we will
13778          * also add new fragments to existing node groups.
13779          *
13780          * LDM Dimension:
13781          * --------------
13782          * We will ensure that we have an even distribution on the LDMs in the
13783          * nodes by ensuring that we have knowledge of which LDMs we primarily
13784          * used in the original table. This is necessary to support ALTER TABLE
13785          * from PARTITION_BALANCE_FOR_RP_BY_NODE to
13786          * PARTITION_BALANCE_FOR_RA_BY_NODE e.g. PARTITION_BALANCE_FOR_RP_BY_NODE
13787          * could have used any LDMs. So it is important to ensure that we
13788          * spread evenly over all LDMs also after the ALTER TABLE. We do this
13789          * by always finding the LDM in the node with the minimum number of
13790          * fragments.
13791          *
13792          * At the moment we don't support on-line add partition of for fully
13793          * replicated tables. We do however support adding more node groups.
13794          * In order to support adding partitions for fully replicated tables
13795          * it is necessary to provide a mapping from calculated main fragment
13796          * since they will then no longer be fragment id 0 to number of
13797          * main fragments minus one.
13798          *
13799          * Primary replica Dimension:
13800          * --------------------------
13801          * We make an effort to spread the primary replicas around amongst the
13802          * nodes in each node group and LDM. We need to spread both regarding
13803          * nodes and with regard to LDM. When we use partition balance
13804          * FOR_RP_BY_LDM we will spread on all LDMs in all nodes for
13805          * the table itself, so we don't need to use the DIH copy of the
13806          * next primary replica to use. For all other tables we will start by
13807          * reading what is already in the table, if the table itself has
13808          * already used an LDM in the node group to assign a primary replica,
13809          * then we will simply continue using the local copy. For new
13810          * partitions in a previously unused LDM in a node group we will
13811          * rather use the next based on what other tables have used in
13812          * creating and on-line altering tables.
13813          */
13814 
13815         Uint32 first_new_node = find_min_index(tmp_fragments_per_node,
13816                                                m_max_node_id + 1,
13817                                                1, 1);
13818         Uint32 firstNG = getNodeGroup(first_new_node);
13819         Uint32 next_log_part = 0;
13820         bool use_old_variant = true;
13821 
13822         bool const fully_replicated = (noOfFragments != partitionCount);
13823 
13824         switch(partitionBalance)
13825         {
13826           case NDB_PARTITION_BALANCE_SPECIFIC:
13827           case NDB_PARTITION_BALANCE_FOR_RP_BY_NODE:
13828           case NDB_PARTITION_BALANCE_FOR_RA_BY_NODE:
13829           {
13830             jam();
13831             break;
13832           }
13833           case NDB_PARTITION_BALANCE_FOR_RP_BY_LDM:
13834           case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM:
13835           case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM_X_2:
13836           case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM_X_3:
13837           case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM_X_4:
13838           {
13839             jam();
13840             use_old_variant = false;
13841             next_log_part = (~0);
13842             break;
13843           }
13844           default:
13845           {
13846             ndbabort();
13847           }
13848         }
13849         Uint32 node = 0;
13850         NGPtr.i = RNIL;
13851         for (Uint32 i = primTabPtr.p->totalfragments; i<noOfFragments; i++)
13852         {
13853           jam();
13854           if (!fully_replicated || (i % partitionCount == 0))
13855           {
13856             node = find_min_index(tmp_fragments_per_node,
13857                                   m_max_node_id + 1,
13858                                   1, 1);
13859             NGPtr.i = getNodeGroup(node);
13860           }
13861           ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
13862           Uint32 logPart;
13863           if (use_old_variant)
13864           {
13865             jam();
13866             logPart = (NGPtr.p->m_next_log_part++) % globalData.ndbLogParts;
13867           }
13868           else
13869           {
13870             jam();
13871             if (firstNG == NGPtr.i)
13872             {
13873               jam();
13874               next_log_part++;
13875             }
13876             logPart = next_log_part % globalData.ndbLogParts;
13877           }
13878           ndbrequire(node != 0);
13879           logPart = find_min_index(&tmp_fragments_per_ldm[node][0],
13880                                    globalData.ndbLogParts,
13881                                    logPart, 0);
13882           ndbrequire(logPart < NDBMT_MAX_WORKER_INSTANCES);
13883 
13884           /* Select primary replica node */
13885           Uint32 primary_node;
13886           if (tmp_next_replica_node_set[NGPtr.i][logPart] ||
13887               partitionBalance == NDB_PARTITION_BALANCE_FOR_RP_BY_LDM)
13888           {
13889             jam();
13890             Uint32 node_index = (*next_replica_node)[NGPtr.i][logPart];
13891             primary_node = NGPtr.p->nodesInGroup[node_index];
13892             inc_node_or_group(node_index, NGPtr.p->nodeCount);
13893             ndbrequire(node_index < noOfReplicas);
13894             (*next_replica_node)[NGPtr.i][logPart] = node_index;
13895           }
13896           else
13897           {
13898             jam();
13899             Uint32 node_index = c_next_replica_node[NGPtr.i][logPart];
13900             primary_node = NGPtr.p->nodesInGroup[node_index];
13901             inc_node_or_group(node_index, NGPtr.p->nodeCount);
13902             c_next_replica_node[NGPtr.i][logPart] = node_index;
13903           }
13904           ndbrequire(primary_node < MAX_NDB_NODES);
13905           fragments[count++] = logPart;
13906           fragments[count++] = primary_node;
13907           tmp_fragments_per_ldm[primary_node][logPart]++;
13908           /* Ensure that we don't report this as min immediately again */
13909           tmp_fragments_per_node[primary_node]++;
13910           for (Uint32 r = 0; r < noOfReplicas; r++)
13911           {
13912             jam();
13913             if (NGPtr.p->nodesInGroup[r] != primary_node)
13914             {
13915               jam();
13916               Uint32 replicaNode = NGPtr.p->nodesInGroup[r];
13917               fragments[count++] = replicaNode;
13918               tmp_fragments_per_node[replicaNode]++;
13919               tmp_fragments_per_ldm[replicaNode][logPart]++;
13920             }
13921           }
13922         }
13923       }
13924     }
13925     if(count != (2U + (1 + noOfReplicas) * noOfFragments)){
13926         char buf[255];
13927         BaseString::snprintf(buf, sizeof(buf),
13928                            "Illegal configuration change: NoOfReplicas."
13929                            " Can't be applied online ");
13930         progError(__LINE__, NDBD_EXIT_INVALID_CONFIG, buf);
13931     }
13932 
13933     CreateFragmentationConf * const conf =
13934       (CreateFragmentationConf*)signal->getDataPtrSend();
13935     conf->senderRef = reference();
13936     conf->senderData = senderData;
13937     conf->noOfReplicas = (Uint32)noOfReplicas;
13938     conf->noOfFragments = (Uint32)noOfFragments;
13939 
13940     fragments[0]= noOfReplicas;
13941     fragments[1]= noOfFragments;
13942 
13943     if (flags == CreateFragmentationReq::RI_ADD_FRAGMENTS ||
13944         flags == CreateFragmentationReq::RI_CREATE_FRAGMENTATION)
13945     {
13946       if (!verify_fragmentation(fragments, partitionCount, partitionBalance, getFragmentsPerNode()))
13947       {
13948         err = CreateFragmentationRef::InvalidFragmentationType;
13949         break;
13950       }
13951     }
13952 
13953     if(senderRef != 0)
13954     {
13955       /**
13956        * Only possible serving old client with lower version than 7.0.4
13957        * (WL#3600)
13958        */
13959       jam();
13960       LinearSectionPtr ptr[3];
13961       ptr[0].p = (Uint32*)&fragments[0];
13962       ptr[0].sz = (count + 1) / 2;
13963       sendSignal(senderRef,
13964 		 GSN_CREATE_FRAGMENTATION_CONF,
13965 		 signal,
13966 		 CreateFragmentationConf::SignalLength,
13967 		 JBB,
13968 		 ptr,
13969 		 1);
13970     }
13971     // Always ACK/NACK (here ACK)
13972     signal->theData[0] = 0;
13973     return;
13974   } while(false);
13975   // Always ACK/NACK (here NACK)
13976   signal->theData[0] = err;
13977 }
13978 
verify_fragmentation(Uint16 * fragments,Uint32 partition_count,Uint32 partition_balance,Uint32 ldm_count) const13979 bool Dbdih::verify_fragmentation(Uint16* fragments,
13980                                  Uint32 partition_count,
13981                                  Uint32 partition_balance,
13982                                  Uint32 ldm_count) const
13983 {
13984   jam();
13985   bool fatal = false;
13986   bool suboptimal = false;
13987 
13988   Uint32 const replica_count = fragments[0];
13989   Uint32 const fragment_count = fragments[1];
13990 
13991   Uint16 fragments_per_node[MAX_NDB_NODES];
13992   Uint16 primary_replica_per_node[MAX_NDB_NODES];
13993   Uint16 fragments_per_ldm[MAX_NDB_NODES][NDBMT_MAX_WORKER_INSTANCES];
13994   Uint16 primary_replica_per_ldm[MAX_NDB_NODES][NDBMT_MAX_WORKER_INSTANCES];
13995 
13996   bzero(fragments_per_node, sizeof(fragments_per_node));
13997   bzero(fragments_per_ldm, sizeof(fragments_per_ldm));
13998   bzero(primary_replica_per_node, sizeof(primary_replica_per_node));
13999   bzero(primary_replica_per_ldm, sizeof(primary_replica_per_ldm));
14000 
14001   /**
14002    * For fully replicated tables one partition can have several copy fragments.
14003    * The following conditions must be satisfied:
14004    * 1) No node have two copy fragments for same partition.
14005    * 2) The partition id that a fragment belongs to is calculated as module
14006    *    partition count.
14007    * 3) The main copy fragment of a partition have the same id as the partition.
14008    * 4) Fragments with consequtive id belonging to partition 0 upto partition
14009    *    count - 1, are in this function called a partition set and should have
14010    *    its replicas in one nodegroup.
14011    * 1) must always be satisfied also in future implementations. 2) and 3) may
14012    * be relaxed in future. 4) is not necessary, but as long as 2) and 3) must
14013    * be satisfied ensuring 4) is an easy condition to remember.
14014    */
14015 
14016   /**
14017    * partition_nodes indicates for each partition what nodes have a copy
14018    * fragment.  This is used to detect if two fragments for same partition is
14019    * located on same node, ie breakage of condition 1) above.
14020    * This also depends on condition 2) above.
14021    */
14022   NdbNodeBitmask partition_nodes[MAX_NDB_PARTITIONS];
14023 
14024   /**
14025    * partition_set_for_node keep track what partition_set (as in condition 4)
14026    * above) are located on a node.  Only one partition set per node is allowed.
14027    * This toghether with the fact that all nodes in same nodegroup share
14028    * fragments ensures condition 4) above.
14029    * ~0 are used as a still unset partition set indicator.
14030    */
14031   Uint32 partition_set_for_node[MAX_NDB_NODES];
14032   for (Uint32 node = 1; node <= m_max_node_id; node++)
14033   {
14034     partition_set_for_node[node] = ~Uint32(0);
14035   }
14036 
14037   for(Uint32 fragment_id = 0; fragment_id < fragment_count; fragment_id++)
14038   {
14039     jam();
14040     Uint32 const partition_id = fragment_id % partition_count;
14041     Uint32 const partition_set = fragment_id / partition_count;
14042     Uint32 const log_part_id = fragments[2 + fragment_id * (1 + replica_count)];
14043     Uint32 const ldm = (log_part_id % ldm_count);
14044     for(Uint32 replica_id = 0; replica_id < replica_count; replica_id++)
14045     {
14046       jam();
14047       Uint32 const node =
14048           fragments[2 + fragment_id * (1 + replica_count) + 1 + replica_id];
14049       fragments_per_node[node]++;
14050       fragments_per_ldm[node][ldm]++;
14051       if (replica_id == 0)
14052       {
14053         jam();
14054         primary_replica_per_node[node]++;
14055         primary_replica_per_ldm[node][ldm]++;
14056       }
14057 
14058       if (partition_set_for_node[node] == ~Uint32(0))
14059       {
14060         jam();
14061         partition_set_for_node[node] = partition_set;
14062       }
14063       if (partition_set_for_node[node] != partition_set)
14064       {
14065         jam();
14066         fatal = true;
14067         ndbassert(!"Copy fragments from different partition set on same node");
14068       }
14069 
14070       if (partition_nodes[partition_id].get(node))
14071       {
14072         jam();
14073         fatal = true;
14074         ndbassert(!"Two copy fragments for same partition on same node");
14075       }
14076       partition_nodes[partition_id].set(node);
14077     }
14078   }
14079 
14080   /**
14081    * Below counters for number of fragments (for ra) or primary replicas (for
14082    * rp) there are per ldm or node.
14083    *
14084    * ~0 is used to indicate unset value. 0 is used if there are conflicting
14085    * counts, in other word there is an unbalance.
14086    */
14087 
14088   Uint32 balance_for_ra_by_ldm_count = ~Uint32(0);
14089   Uint32 balance_for_ra_by_node_count = ~Uint32(0);
14090   Uint32 balance_for_rp_by_ldm_count = ~Uint32(0);
14091   Uint32 balance_for_rp_by_node_count = ~Uint32(0);
14092   for (Uint32 node = 1; node <= m_max_node_id; node++)
14093   {
14094     jam();
14095     if (balance_for_ra_by_node_count != 0 &&
14096         fragments_per_node[node] != 0 &&
14097         fragments_per_node[node] != balance_for_ra_by_node_count)
14098     {
14099       if (balance_for_ra_by_node_count == ~Uint32(0))
14100         balance_for_ra_by_node_count = fragments_per_node[node];
14101       else
14102         balance_for_ra_by_node_count = 0;
14103     }
14104     if (balance_for_rp_by_node_count != 0 &&
14105         primary_replica_per_node[node] != 0 &&
14106         primary_replica_per_node[node] != balance_for_rp_by_node_count)
14107     {
14108       if (balance_for_rp_by_node_count == ~Uint32(0))
14109         balance_for_rp_by_node_count = primary_replica_per_node[node];
14110       else
14111         balance_for_rp_by_node_count = 0;
14112     }
14113     for (Uint32 ldm = 0; ldm < NDBMT_MAX_WORKER_INSTANCES; ldm ++)
14114     {
14115       if (balance_for_ra_by_ldm_count != 0 &&
14116           fragments_per_ldm[node][ldm] != 0 &&
14117           fragments_per_ldm[node][ldm] != balance_for_ra_by_ldm_count)
14118       {
14119         if (balance_for_ra_by_ldm_count == ~Uint32(0))
14120           balance_for_ra_by_ldm_count = fragments_per_ldm[node][ldm];
14121         else
14122           balance_for_ra_by_ldm_count = 0;
14123       }
14124       if (balance_for_rp_by_ldm_count != 0 &&
14125           primary_replica_per_ldm[node][ldm] != 0 &&
14126           primary_replica_per_ldm[node][ldm] != balance_for_rp_by_ldm_count)
14127       {
14128         if (balance_for_rp_by_ldm_count == ~Uint32(0))
14129           balance_for_rp_by_ldm_count = primary_replica_per_ldm[node][ldm];
14130         else
14131           balance_for_rp_by_ldm_count = 0;
14132       }
14133     }
14134   }
14135   switch (partition_balance)
14136   {
14137   case NDB_PARTITION_BALANCE_FOR_RA_BY_NODE:
14138     jam();
14139     suboptimal = (balance_for_ra_by_node_count == 0);
14140     break;
14141   case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM:
14142   case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM_X_2:
14143   case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM_X_3:
14144   case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM_X_4:
14145     jam();
14146     suboptimal = (balance_for_ra_by_ldm_count == 0);
14147     break;
14148   case NDB_PARTITION_BALANCE_FOR_RP_BY_NODE:
14149     jam();
14150     suboptimal = (balance_for_rp_by_node_count == 0);
14151     break;
14152   case NDB_PARTITION_BALANCE_FOR_RP_BY_LDM:
14153     jam();
14154     suboptimal = (balance_for_rp_by_ldm_count == 0);
14155     break;
14156   default:
14157     jam();
14158   }
14159   ndbassert(!fatal);
14160   // Allow suboptimal until we have a way to choose to allow it or not
14161   return !fatal;
14162 }
14163 
insertCopyFragmentList(TabRecord * tabPtr,Fragmentstore * fragPtr,Uint32 my_fragid)14164 void Dbdih::insertCopyFragmentList(TabRecord *tabPtr,
14165                                    Fragmentstore *fragPtr,
14166                                    Uint32 my_fragid)
14167 {
14168   Uint32 found_fragid = RNIL;
14169   FragmentstorePtr locFragPtr;
14170   Uint32 partition_id = fragPtr->partition_id;
14171   for (Uint32 i = 0; i < tabPtr->totalfragments; i++)
14172   {
14173     getFragstore(tabPtr, i, locFragPtr);
14174     if (locFragPtr.p->partition_id == partition_id)
14175     {
14176       if (fragPtr == locFragPtr.p)
14177       {
14178         /* We're inserting the main fragment */
14179         fragPtr->nextCopyFragment = RNIL;
14180         D("Inserting fragId " << my_fragid << " as main fragment");
14181         return;
14182       }
14183       jam();
14184       found_fragid = i;
14185       break;
14186     }
14187   }
14188   ndbrequire(found_fragid != RNIL);
14189   /**
14190    * We have now found the main copy fragment for this partition.
14191    * We will add the fragment last in this list. So we search for
14192    * end of list and add it to the list when we reach the end of
14193    * the list.
14194    */
14195   ndbrequire(locFragPtr.p != fragPtr);
14196   while (locFragPtr.p->nextCopyFragment != RNIL)
14197   {
14198     found_fragid = locFragPtr.p->nextCopyFragment;
14199     getFragstore(tabPtr, found_fragid, locFragPtr);
14200   }
14201   /**
14202    * We update in a safe manner here ensuring that the list is
14203    * always seen as a proper list by inserting a memory barrier
14204    * before setting the new nextCopyFragment. It isn't absolutely
14205    * necessary but is future proof given that we use a RCU
14206    * mechanism around this data.
14207    */
14208   fragPtr->nextCopyFragment = RNIL;
14209   mb();
14210   locFragPtr.p->nextCopyFragment = my_fragid;
14211   D("Insert fragId " << my_fragid << " after fragId " << found_fragid);
14212 }
14213 
/**
 * DIADDTABREQ: add a table (or ordered index) to DIH.
 *
 * Seizes a connect record, initializes the table record from the request
 * and its FRAGMENTATION section, allocates fragment/replica records and
 * finally kicks off packing of the table description into pages
 * (ZPACK_TABLE_INTO_PAGES) which continues the add-table protocol.
 * During system restart the master instead reads the stored table
 * definition from file and distributes it via COPY_TABREQ.
 */
void Dbdih::execDIADDTABREQ(Signal* signal)
{
  Uint32 fragType;
  jamEntry();

  DiAddTabReq * const req = (DiAddTabReq*)signal->getDataPtr();

  // Seize connect record
  ndbrequire(cfirstconnect != RNIL);
  ConnectRecordPtr connectPtr;
  connectPtr.i = cfirstconnect;
  ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
  cfirstconnect = connectPtr.p->nextPool;

  const Uint32 userPtr = req->connectPtr;
  const BlockReference userRef = signal->getSendersBlockRef();
  connectPtr.p->nextPool = RNIL;
  connectPtr.p->userpointer = userPtr;
  connectPtr.p->userblockref = userRef;
  connectPtr.p->connectState = ConnectRecord::INUSE;
  connectPtr.p->table = req->tableId;
  connectPtr.p->m_alter.m_changeMask = 0;
  connectPtr.p->m_create.m_map_ptr_i = req->hashMapPtrI;

  TabRecordPtr tabPtr;
  tabPtr.i = req->tableId;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

  D("DIADDTABREQ: tableId = " << tabPtr.i);
  fragType= req->fragType;
  if (prepare_add_table(tabPtr, connectPtr, signal))
  {
    jam();
    // prepare_add_table took over the continuation; nothing more here.
    return;
  }

  /**
   * When we get here the table is under definition and DBTC can still not
   * use the table. So there is no possibility for conflict with DBTC.
   * Thus no need for mutexes and RCU lock calls.
   */

  /* Only the master should read a table definition from disk during SR */
  if (getNodeState().getSystemRestartInProgress() &&
      tabPtr.p->tabStatus == TabRecord::TS_IDLE &&
      cmasterNodeId == getOwnNodeId())
  {
    jam();
    /**
     * We start the process of creating the table in all nodes from
     * here.
     * Step 1)
     *   Read table definition from file system in master node (this
     *   node).
     *   Copy the pages read into the table and fragment records.
     *   Next copy the table to all other nodes in the cluster using
     *   COPY_TABREQ signals which starts by packing the table into
     *   pages again and next sending those in COPY_TABREQ signals.
     *   The non-master nodes will send COPY_TABCONF when completed
     *   the handling of the received table meta data information.
     *   It will also write the pages to the file system in the master
     *   node. After completing this the master will send COPY_TABCONF
     *   to itself.
     * Step 2)
     *   The non-master nodes will later also call DIADDTABREQ, these
     *   nodes will have set state to TS_ACTIVE as a flag to the function
     *   prepare_add_table to reflect that the table already has its
     *   fragmentation data setup and it is ready to create the fragments
     *   in the non-master nodes. It is also set to TS_ACTIVE since we
     *   become included in the LCP protocol immediately after copying
     *   the table meta data information.
     */
    tabPtr.p->tabStatus = TabRecord::TS_CREATING;

    initTableFile(tabPtr);
    FileRecordPtr filePtr;
    filePtr.i = tabPtr.p->tabFile[0];
    ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
    openFileRw(signal, filePtr);
    // Continuation happens via the file-open confirm for OPENING_TABLE.
    filePtr.p->reqStatus = FileRecord::OPENING_TABLE;
    return;
  }

  /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
  /* AT THE TIME OF INITIATING THE FILE OF TABLE         */
  /* DESCRIPTION IS CREATED FOR APPROPRIATE SIZE. EACH   */
  /* EACH RECORD IN THIS FILE HAS THE INFORMATION ABOUT  */
  /* ONE TABLE. THE POINTER TO THIS RECORD IS THE TABLE  */
  /* REFERENCE. IN THE BEGINNING ALL RECORDS ARE CREATED */
  /* BUT THEY DO NOT HAVE ANY INFORMATION ABOUT ANY TABLE*/
  /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
  tabPtr.p->tabStatus = TabRecord::TS_CREATING;
  if(req->loggedTable)
    tabPtr.p->tabStorage= TabRecord::ST_NORMAL;
  else if(req->temporaryTable)
    tabPtr.p->tabStorage= TabRecord::ST_TEMPORARY;
  else
    tabPtr.p->tabStorage= TabRecord::ST_NOLOGGING;
  tabPtr.p->kvalue = req->kValue;

  /* Map the DICT fragment type onto DIH's internal hashing method. */
  switch ((DictTabInfo::FragmentType)fragType){
  case DictTabInfo::HashMapPartition:
    tabPtr.p->method = TabRecord::HASH_MAP;
    break;
  case DictTabInfo::AllNodesSmallTable:
  case DictTabInfo::AllNodesMediumTable:
  case DictTabInfo::AllNodesLargeTable:
  case DictTabInfo::SingleFragment:
    jam();
    // Fall through
  case DictTabInfo::DistrKeyLin:
    jam();
    tabPtr.p->method = TabRecord::LINEAR_HASH;
    break;
  case DictTabInfo::DistrKeyHash:
    jam();
    tabPtr.p->method = TabRecord::NORMAL_HASH;
    break;
  case DictTabInfo::DistrKeyOrderedIndex:
  {
    // An ordered index inherits distribution from its primary table.
    TabRecordPtr primTabPtr;
    primTabPtr.i = req->primaryTableId;
    ptrCheckGuard(primTabPtr, ctabFileSize, tabRecord);
    tabPtr.p->method = primTabPtr.p->method;
    req->hashMapPtrI = primTabPtr.p->m_map_ptr_i;
    break;
  }
  case DictTabInfo::UserDefined:
    jam();
    tabPtr.p->method = TabRecord::USER_DEFINED;
    break;
  default:
    ndbabort();
  }

  /* Union forces Uint32 alignment of the Uint16 fragment array so the
   * section data can be copied into it as Uint32 words. */
  union {
    Uint16 fragments[MAX_FRAGMENT_DATA_ENTRIES];
    Uint32 align;
  };
  (void)align; // kill warning
  SectionHandle handle(this, signal);
  SegmentedSectionPtr fragDataPtr;
  ndbrequire(handle.getSection(fragDataPtr, DiAddTabReq::FRAGMENTATION));
  copy((Uint32*)fragments, fragDataPtr);
  releaseSections(handle);

  // Layout: fragments[0] = replica count, fragments[1] = fragment count.
  const Uint32 noReplicas = fragments[0];
  const Uint32 noFragments = fragments[1];

  if ((tabPtr.p->m_flags & TabRecord::TF_FULLY_REPLICATED) == 0)
  {
    jam();
    D("partitionCount for normal table set to = " << noFragments);
    tabPtr.p->partitionCount = noFragments;
  }
  tabPtr.p->noOfBackups = noReplicas - 1;
  tabPtr.p->totalfragments = noFragments;
  ndbrequire(noReplicas == cnoReplicas); // Only allowed

  if (ERROR_INSERTED(7173)) {
    CLEAR_ERROR_INSERT_VALUE;
    addtabrefuseLab(signal, connectPtr, ZREPLERROR1);
    return;
  }
  // Refuse if the replica or fragment record pools cannot hold the table.
  if ((noReplicas * noFragments) > cnoFreeReplicaRec) {
    jam();
    addtabrefuseLab(signal, connectPtr, ZREPLERROR1);
    return;
  }//if
  if (noFragments > cremainingfrags) {
    jam();
    addtabrefuseLab(signal, connectPtr, ZREPLERROR2);
    return;
  }//if

  // Compute mask/hashpointer for linear hashing: largest power of two
  // <= partitionCount, minus one.
  Uint32 logTotalFragments = 1;
  ndbrequire(tabPtr.p->partitionCount < (1 << 16));
  while (logTotalFragments <= tabPtr.p->partitionCount) {
    jam();
    logTotalFragments <<= 1;
  }
  logTotalFragments >>= 1;
  tabPtr.p->mask = logTotalFragments - 1;
  tabPtr.p->hashpointer = tabPtr.p->partitionCount - logTotalFragments;
  allocFragments(tabPtr.p->totalfragments, tabPtr);

  if (tabPtr.p->method == TabRecord::HASH_MAP)
  {
    jam();
    tabPtr.p->m_map_ptr_i = req->hashMapPtrI;
    tabPtr.p->m_new_map_ptr_i = RNIL;
    Ptr<Hash2FragmentMap> mapPtr;
    g_hash_map.getPtr(mapPtr, tabPtr.p->m_map_ptr_i);
    ndbrequire(tabPtr.p->totalfragments >= mapPtr.p->m_fragments);
  }

  // Per fragment the section holds: log part id, then one node id per
  // replica; the first node id is the preferred primary.
  Uint32 index = 2;
  for (Uint32 fragId = 0; fragId < noFragments; fragId++) {
    jam();
    FragmentstorePtr fragPtr;
    Uint32 activeIndex = 0;
    getFragstore(tabPtr.p, fragId, fragPtr);
    fragPtr.p->m_log_part_id = fragments[index++];
    fragPtr.p->preferredPrimary = fragments[index];
    fragPtr.p->partition_id = fragId % tabPtr.p->partitionCount;

    ndbrequire(fragPtr.p->m_log_part_id < NDBMT_MAX_WORKER_INSTANCES);

    inc_ng_refcount(getNodeGroup(fragPtr.p->preferredPrimary));

    for (Uint32 i = 0; i<noReplicas; i++) {
      const Uint32 nodeId = fragments[index++];
      ReplicaRecordPtr replicaPtr;
      allocStoredReplica(fragPtr,
                         replicaPtr,
                         nodeId,
                         fragId,
                         tabPtr.i);
      if (getNodeStatus(nodeId) == NodeRecord::ALIVE) {
        jam();
        ndbrequire(activeIndex < MAX_REPLICAS);
        fragPtr.p->activeNodes[activeIndex] = nodeId;
        activeIndex++;
      } else {
        jam();
        // Replica on a dead node: keep it on the old-replica list instead.
        removeStoredReplica(fragPtr, replicaPtr);
        linkOldStoredReplica(fragPtr, replicaPtr);
      }//if
    }//for
    fragPtr.p->fragReplicas = activeIndex;
    ndbrequire(activeIndex > 0 && fragPtr.p->storedReplicas != RNIL);
    if ((tabPtr.p->m_flags & TabRecord::TF_FULLY_REPLICATED) != 0)
    {
      jam();
      insertCopyFragmentList(tabPtr.p, fragPtr.p, fragId);
    }
  }
  initTableFile(tabPtr);
  tabPtr.p->tabCopyStatus = TabRecord::CS_ADD_TABLE_MASTER;
  // Continue asynchronously: pack the table description into pages.
  signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
  signal->theData[1] = tabPtr.i;
  sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
}
14457 
14458 void
addTable_closeConf(Signal * signal,Uint32 tabPtrI)14459 Dbdih::addTable_closeConf(Signal * signal, Uint32 tabPtrI){
14460   TabRecordPtr tabPtr;
14461   tabPtr.i = tabPtrI;
14462   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
14463 
14464   ConnectRecordPtr connectPtr;
14465   connectPtr.i = tabPtr.p->connectrec;
14466   ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
14467   connectPtr.p->m_alter.m_totalfragments = tabPtr.p->totalfragments;
14468 
14469   sendAddFragreq(signal, connectPtr, tabPtr, 0, false);
14470 }
14471 
/**
 * Scan fragments from fragId upwards for the next fragment that has a
 * replica (stored or old) on this node, and send ADD_FRAGREQ to DBDICT
 * for it.  When no more local replicas exist, finish the protocol:
 * for ALTER TABLE send ALTER_TAB_CONF (possibly after saving the table
 * file), otherwise send DIADDTABCONF and release the connect record.
 *
 * @param signal        incoming signal buffer, reused for sending
 * @param connectPtr    connect record driving this add/alter operation
 * @param tabPtr        the table being created/altered
 * @param fragId        first fragment id to consider
 * @param rcu_lock_held whether caller already holds the RCU lock
 *                      (passed on to make_new_table_writeable)
 */
void
Dbdih::sendAddFragreq(Signal* signal,
                      ConnectRecordPtr connectPtr,
                      TabRecordPtr tabPtr,
                      Uint32 fragId,
                      bool rcu_lock_held)
{
  jam();
  const Uint32 fragCount = connectPtr.p->m_alter.m_totalfragments;
  ReplicaRecordPtr replicaPtr;
  replicaPtr.i = RNIL;
  FragmentstorePtr fragPtr;
  /* Find next fragment with a replica on this node (stored first,
   * then old stored replicas). */
  for(; fragId<fragCount; fragId++){
    jam();
    getFragstore(tabPtr.p, fragId, fragPtr);

    replicaPtr.i = fragPtr.p->storedReplicas;
    while(replicaPtr.i != RNIL){
      jam();
      c_replicaRecordPool.getPtr(replicaPtr);
      if(replicaPtr.p->procNode == getOwnNodeId()){
	break;
      }
      replicaPtr.i = replicaPtr.p->nextPool;
    }

    if(replicaPtr.i != RNIL){
      jam();
      break;
    }

    replicaPtr.i = fragPtr.p->oldStoredReplicas;
    while(replicaPtr.i != RNIL){
      jam();
      c_replicaRecordPool.getPtr(replicaPtr);
      if(replicaPtr.p->procNode == getOwnNodeId()){
	break;
      }
      replicaPtr.i = replicaPtr.p->nextPool;
    }

    if(replicaPtr.i != RNIL){
      jam();
      break;
    }
  }

  if(replicaPtr.i != RNIL){
    jam();
    // Found a local replica: ask DBDICT to add this fragment locally.
    ndbrequire(fragId < fragCount);
    ndbrequire(replicaPtr.p->procNode == getOwnNodeId());

    Uint32 requestInfo = 0;
    if(tabPtr.p->tabStorage != TabRecord::ST_NORMAL){
      requestInfo |= LqhFragReq::TemporaryTable;
    }

    if(getNodeState().getNodeRestartInProgress()){
      requestInfo |= LqhFragReq::CreateInRunning;
    }

    AddFragReq* const req = (AddFragReq*)signal->getDataPtr();
    req->dihPtr = connectPtr.i;
    req->senderData = connectPtr.p->userpointer;
    req->fragmentId = fragId;
    req->requestInfo = requestInfo;
    req->tableId = tabPtr.i;
    req->nextLCP = 0;
    req->nodeId = getOwnNodeId();
    req->totalFragments = fragCount;
    req->startGci = SYSFILE->newestRestorableGCI;
    req->logPartId = fragPtr.p->m_log_part_id;
    req->createGci = replicaPtr.p->initialGci;

    // The partition id is derived from the table's partition count for
    // CREATE, but from the new (altered) partition count for ALTER.
    if (connectPtr.p->connectState != ConnectRecord::ALTER_TABLE)
    {
      jam();
      req->changeMask = 0;
      req->partitionId = fragId % tabPtr.p->partitionCount;
    }
    else /* connectState == ALTER_TABLE */
    {
      jam();
      req->changeMask = connectPtr.p->m_alter.m_changeMask;
      req->partitionId = fragId % connectPtr.p->m_alter.m_partitionCount;
    }

    sendSignal(DBDICT_REF, GSN_ADD_FRAGREQ, signal,
	       AddFragReq::SignalLength, JBB);
    return;
  }

  // No more fragments with local replicas: finish up.
  if (connectPtr.p->connectState == ConnectRecord::ALTER_TABLE)
  {
    jam();
    // Request handled successfully

    if (AlterTableReq::getReorgFragFlag(connectPtr.p->m_alter.m_changeMask))
    {
      jam();
      make_new_table_writeable(tabPtr, connectPtr, rcu_lock_held);
    }

    if (AlterTableReq::getAddFragFlag(connectPtr.p->m_alter.m_changeMask))
    {
      jam();
      // Persist the new fragmentation before confirming the alter.
      Callback cb;
      cb.m_callbackData = connectPtr.i;
      cb.m_callbackFunction = safe_cast(&Dbdih::alter_table_writeTable_conf);
      saveTableFile(signal, connectPtr, tabPtr, TabRecord::CS_ALTER_TABLE, cb);
      return;
    }

    send_alter_tab_conf(signal, connectPtr);
  }
  else
  {
    // Done

    /**
     * This code is only executed as part of CREATE TABLE, so at this point
     * in time DBTC hasn't been made aware of the table's usability yet, so
     * we rely on signal ordering to protect the data from DBTC here.
     * Naturally it could be executed as part of a CREATE INDEX as well, but
     * the principle is still the same.
     */

    /**
      * Don't expect to be adding tables due to e.g. user action
      * during NR or SR, so we init the CopyFragmentList here
      */
    if (( getNodeState().getSystemRestartInProgress() ||
          getNodeState().getNodeRestartInProgress() ) &&
        (tabPtr.p->m_flags & TabRecord::TF_FULLY_REPLICATED) != 0)
    {
      jam();
      for(Uint32 fragId = 0; fragId < tabPtr.p->totalfragments; fragId++)
      {
        jam();
        FragmentstorePtr fragPtr;
        getFragstore(tabPtr.p, fragId, fragPtr);
        fragPtr.p->partition_id = fragId % tabPtr.p->partitionCount;
        insertCopyFragmentList(tabPtr.p, fragPtr.p, fragId);
      }
    }

    DiAddTabConf * const conf = (DiAddTabConf*)signal->getDataPtr();
    conf->senderData = connectPtr.p->userpointer;
    sendSignal(connectPtr.p->userblockref, GSN_DIADDTABCONF, signal,
               DiAddTabConf::SignalLength, JBB);


    // Ordered indexes share the hash map of their primary table.
    if (tabPtr.p->method == TabRecord::HASH_MAP)
    {
      Uint32 newValue = RNIL;
      if (DictTabInfo::isOrderedIndex(tabPtr.p->tableType))
      {
        jam();
        TabRecordPtr primTabPtr;
        primTabPtr.i = tabPtr.p->primaryTableId;
        ptrCheckGuard(primTabPtr, ctabFileSize, tabRecord);
        newValue = primTabPtr.p->m_map_ptr_i;
      }
      else
      {
        jam();
        newValue = connectPtr.p->m_create.m_map_ptr_i;
      }

      tabPtr.p->m_map_ptr_i = newValue;
    }
    // Release
    ndbrequire(tabPtr.p->connectrec == connectPtr.i);
    tabPtr.p->connectrec = RNIL;
    release_connect(connectPtr);
  }

}
14650 void
release_connect(ConnectRecordPtr ptr)14651 Dbdih::release_connect(ConnectRecordPtr ptr)
14652 {
14653   TabRecordPtr tabPtr;
14654   tabPtr.i = ptr.p->table;
14655   if (tabPtr.i != RNIL)
14656   {
14657     jam();
14658     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
14659     if (tabPtr.p->connectrec == ptr.i)
14660     {
14661       ndbassert(false); // should be fixed elsewhere
14662       tabPtr.p->connectrec = RNIL;
14663     }
14664   }
14665 
14666   ptr.p->table = RNIL;
14667   ptr.p->userblockref = ZNIL;
14668   ptr.p->userpointer = RNIL;
14669   ptr.p->connectState = ConnectRecord::FREE;
14670   ptr.p->nextPool = cfirstconnect;
14671   cfirstconnect = ptr.i;
14672 }
14673 
14674 void
execADD_FRAGCONF(Signal * signal)14675 Dbdih::execADD_FRAGCONF(Signal* signal){
14676   jamEntry();
14677   AddFragConf * const conf = (AddFragConf*)signal->getDataPtr();
14678 
14679   ConnectRecordPtr connectPtr;
14680   connectPtr.i = conf->dihPtr;
14681   ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
14682 
14683   TabRecordPtr tabPtr;
14684   tabPtr.i = connectPtr.p->table;
14685   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
14686 
14687   sendAddFragreq(signal, connectPtr, tabPtr, conf->fragId + 1, false);
14688 }
14689 
14690 void
execADD_FRAGREF(Signal * signal)14691 Dbdih::execADD_FRAGREF(Signal* signal){
14692   jamEntry();
14693   AddFragRef * const ref = (AddFragRef*)signal->getDataPtr();
14694 
14695   ConnectRecordPtr connectPtr;
14696   connectPtr.i = ref->dihPtr;
14697   ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
14698 
14699   Ptr<TabRecord> tabPtr;
14700   tabPtr.i = connectPtr.p->table;
14701   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
14702   ndbrequire(tabPtr.p->connectrec == connectPtr.i);
14703 
14704   if (connectPtr.p->connectState == ConnectRecord::ALTER_TABLE)
14705   {
14706     jam();
14707 
14708     if (AlterTableReq::getReorgFragFlag(connectPtr.p->m_alter.m_changeMask))
14709     {
14710       jam();
14711       make_new_table_non_writeable(tabPtr);
14712     }
14713 
14714     connectPtr.p->connectState = ConnectRecord::ALTER_TABLE_ABORT;
14715     drop_fragments(signal, connectPtr, connectPtr.p->m_alter.m_totalfragments);
14716     return;
14717   }
14718   else
14719   {
14720     DiAddTabRef * const ref = (DiAddTabRef*)signal->getDataPtr();
14721     ref->senderData = connectPtr.p->userpointer;
14722     ref->errorCode = ~0;
14723     sendSignal(connectPtr.p->userblockref, GSN_DIADDTABREF, signal,
14724 	       DiAddTabRef::SignalLength, JBB);
14725 
14726     // Release
14727     tabPtr.p->connectrec = RNIL;
14728     release_connect(connectPtr);
14729   }
14730 }
14731 
14732 /*
14733   3.7.1.3   R E F U S E
14734   *********************
14735   */
14736 void
addtabrefuseLab(Signal * signal,ConnectRecordPtr connectPtr,Uint32 errorCode)14737 Dbdih::addtabrefuseLab(Signal* signal,
14738                        ConnectRecordPtr connectPtr, Uint32 errorCode)
14739 {
14740   signal->theData[0] = connectPtr.p->userpointer;
14741   signal->theData[1] = errorCode;
14742   sendSignal(connectPtr.p->userblockref, GSN_DIADDTABREF, signal, 2, JBB);
14743 
14744   Ptr<TabRecord> tabPtr;
14745   tabPtr.i = connectPtr.p->table;
14746   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
14747   ndbrequire(tabPtr.p->connectrec == connectPtr.i);
14748   tabPtr.p->connectrec = RNIL;
14749 
14750   release_connect(connectPtr);
14751   return;
14752 }//Dbdih::addtabrefuseLab()
14753 
14754 /*
14755   3.7.2   A D D   T A B L E   D U P L I C A T I O N
14756   *************************************************
14757   */
14758 /*
14759   3.7.2.1    A D D   T A B L E   D U P L I C A T I O N   R E Q U E S T
14760   *******************************************************************=
14761   */
14762 
14763 /*
14764   D E L E T E   T A B L E
14765   **********************=
14766   */
14767 /*****************************************************************************/
14768 /***********              DELETE TABLE  MODULE                   *************/
14769 /*****************************************************************************/
14770 void
execDROP_TAB_REQ(Signal * signal)14771 Dbdih::execDROP_TAB_REQ(Signal* signal)
14772 {
14773   jamEntry();
14774   DropTabReq* req = (DropTabReq*)signal->getDataPtr();
14775 
14776   D("DROP_TAB_REQ: " << req->tableId);
14777   CRASH_INSERTION(7248);
14778 
14779   TabRecordPtr tabPtr;
14780   tabPtr.i = req->tableId;
14781   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
14782 
14783   tabPtr.p->m_dropTab.tabUserRef = req->senderRef;
14784   tabPtr.p->m_dropTab.tabUserPtr = req->senderData;
14785 
14786   DropTabReq::RequestType rt = (DropTabReq::RequestType)req->requestType;
14787 
14788   switch(rt){
14789   case DropTabReq::OnlineDropTab:
14790     jam();
14791     ndbrequire(tabPtr.p->tabStatus == TabRecord::TS_DROPPING);
14792     break;
14793   case DropTabReq::CreateTabDrop:
14794     jam();
14795     break;
14796   case DropTabReq::RestartDropTab:
14797     break;
14798   }
14799 
14800   bool startNext = false;
14801   if (isMaster())
14802   {
14803     /**
14804      * Remove from queue
14805      */
14806     NodeRecordPtr nodePtr;
14807     for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++)
14808     {
14809       jam();
14810       ptrAss(nodePtr, nodeRecord);
14811       if (c_lcpState.m_participatingLQH.get(nodePtr.i))
14812       {
14813         /**
14814          * Remove any queued checkpoints for this table. Done in two phase
14815          * approach, first mark the entries with RNIL and next compress the
14816          * table, converts from O(n * m) algorithm to O(n) algorithm.
14817          */
14818         for (Uint32 i = 0; i < nodePtr.p->noOfQueuedChkpt; i++)
14819         {
14820 	  if (nodePtr.p->queuedChkpt[i].tableId == tabPtr.i)
14821           {
14822             nodePtr.p->queuedChkpt[i].tableId = RNIL;
14823           }
14824         }
14825         Uint32 index = 0;
14826         for (Uint32 i = 0; i < nodePtr.p->noOfQueuedChkpt; i++)
14827         {
14828           if (nodePtr.p->queuedChkpt[i].tableId != RNIL)
14829           {
14830             nodePtr.p->queuedChkpt[index] = nodePtr.p->queuedChkpt[i];
14831             index++;
14832           }
14833         }
14834         nodePtr.p->noOfQueuedChkpt = index;
14835         if (nodePtr.p->noOfStartedChkpt == 0)
14836         {
14837           jam();
14838           /**
14839            * Check whether more LCPs can be started for this node, but
14840            * don't check for starting to other nodes at this point in
14841            * time.
14842            */
14843           startNext |= checkStartMoreLcp(signal, nodePtr.i, false);
14844         }
14845         DEB_LCP(("DROP_TAB_REQ: nodePtr(%u)->noOfQueuedChkpt = %u"
14846                  ", nodePtr->noOfStartedChkpt = %u"
14847                  ", tab: %u",
14848                  nodePtr.i,
14849                  nodePtr.p->noOfQueuedChkpt,
14850                  nodePtr.p->noOfStartedChkpt,
14851                  tabPtr.i));
14852       }
14853     }
14854   }
14855   if (startNext)
14856   {
14857     /**
14858      * startNextChkpt is a heavy method, so not good to call it for
14859      * every node, it goes through all nodes, so better to do it once
14860      * if any node needed it.
14861      */
14862     jam();
14863     startNextChkpt(signal);
14864   }
14865 
14866   {
14867     /**
14868      * Check table lcp state
14869      */
14870     bool ok = false;
14871     switch(tabPtr.p->tabLcpStatus){
14872     case TabRecord::TLS_COMPLETED:
14873     case TabRecord::TLS_WRITING_TO_FILE:
14874       ok = true;
14875       jam();
14876       g_eventLogger->info("DROP_TAB_REQ: tab: %u, tabLcpStatus: %u",
14877                           tabPtr.i,
14878                           tabPtr.p->tabLcpStatus);
14879       break;
14880       return;
14881     case TabRecord::TLS_ACTIVE:
14882       ok = true;
14883       jam();
14884 
14885       tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED;
14886 
14887       g_eventLogger->info("DROP_TAB_REQ: tab: %u, tabLcpStatus set to %u",
14888                           tabPtr.i,
14889                           tabPtr.p->tabLcpStatus);
14890       /**
14891        * First check if all fragments are done
14892        */
14893       if (checkLcpAllTablesDoneInLqh(__LINE__))
14894       {
14895 	jam();
14896 
14897         g_eventLogger->info("This is the last table");
14898 
14899 	/**
14900 	 * Then check if saving of tab info is done for all tables
14901 	 */
14902 	LcpStatus a = c_lcpState.lcpStatus;
14903 	checkLcpCompletedLab(signal);
14904 
14905         if(a != c_lcpState.lcpStatus)
14906         {
14907           g_eventLogger->info("And all tables are written to already written disk");
14908         }
14909       }
14910       break;
14911     }
14912     ndbrequire(ok);
14913   }
14914 
14915   waitDropTabWritingToFile(signal, tabPtr);
14916 }
14917 
startDeleteFile(Signal * signal,TabRecordPtr tabPtr)14918 void Dbdih::startDeleteFile(Signal* signal, TabRecordPtr tabPtr)
14919 {
14920   if (tabPtr.p->tabFile[0] == RNIL) {
14921     jam();
14922     initTableFile(tabPtr);
14923   }//if
14924   openTableFileForDelete(signal, tabPtr.p->tabFile[0]);
14925 }//Dbdih::startDeleteFile()
14926 
openTableFileForDelete(Signal * signal,Uint32 fileIndex)14927 void Dbdih::openTableFileForDelete(Signal* signal, Uint32 fileIndex)
14928 {
14929   FileRecordPtr filePtr;
14930   filePtr.i = fileIndex;
14931   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
14932   openFileRw(signal, filePtr);
14933   filePtr.p->reqStatus = FileRecord::TABLE_OPEN_FOR_DELETE;
14934 }//Dbdih::openTableFileForDelete()
14935 
tableOpenLab(Signal * signal,FileRecordPtr filePtr)14936 void Dbdih::tableOpenLab(Signal* signal, FileRecordPtr filePtr)
14937 {
14938   closeFileDelete(signal, filePtr);
14939   filePtr.p->reqStatus = FileRecord::TABLE_CLOSE_DELETE;
14940   return;
14941 }//Dbdih::tableOpenLab()
14942 
void Dbdih::tableDeleteLab(Signal* signal, FileRecordPtr filePtr)
{
  /* One of the table's two description files has been deleted.
   * After the first file, start on the second; after the second,
   * free both file records, confirm the drop and release the table. */
  TabRecordPtr tabPtr;
  tabPtr.i = filePtr.p->tabRef;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  if (filePtr.i == tabPtr.p->tabFile[0]) {
    jam();
    openTableFileForDelete(signal, tabPtr.p->tabFile[1]);
    return;
  }//if
  ndbrequire(filePtr.i == tabPtr.p->tabFile[1]);

  releaseFile(tabPtr.p->tabFile[0]);
  releaseFile(tabPtr.p->tabFile[1]);
  tabPtr.p->tabFile[0] = tabPtr.p->tabFile[1] = RNIL;

  /**
   * Table has already been dropped from DBTC's view a long time
   * ago, we need not protect this change.
   */
  tabPtr.p->tabStatus = TabRecord::TS_IDLE;

  /* Confirm to the requestor recorded in execDROP_TAB_REQ. */
  DropTabConf * const dropConf = (DropTabConf *)signal->getDataPtrSend();
  dropConf->senderRef = reference();
  dropConf->senderData = tabPtr.p->m_dropTab.tabUserPtr;
  dropConf->tableId = tabPtr.i;
  sendSignal(tabPtr.p->m_dropTab.tabUserRef, GSN_DROP_TAB_CONF,
             signal, DropTabConf::SignalLength, JBB);

  tabPtr.p->m_dropTab.tabUserPtr = RNIL;
  tabPtr.p->m_dropTab.tabUserRef = 0;
  releaseTable(tabPtr);
}//Dbdih::tableDeleteLab()
14976 
14977 
releaseTable(TabRecordPtr tabPtr)14978 void Dbdih::releaseTable(TabRecordPtr tabPtr)
14979 {
14980   FragmentstorePtr fragPtr;
14981   if (tabPtr.p->noOfFragChunks > 0) {
14982     for (Uint32 fragId = 0; fragId < tabPtr.p->totalfragments; fragId++) {
14983       jam();
14984       getFragstore(tabPtr.p, fragId, fragPtr);
14985       dec_ng_refcount(getNodeGroup(fragPtr.p->preferredPrimary));
14986       releaseReplicas(& fragPtr.p->storedReplicas);
14987       releaseReplicas(& fragPtr.p->oldStoredReplicas);
14988     }//for
14989     releaseFragments(tabPtr);
14990   }
14991   if (tabPtr.p->tabFile[0] != RNIL) {
14992     jam();
14993     releaseFile(tabPtr.p->tabFile[0]);
14994     releaseFile(tabPtr.p->tabFile[1]);
14995     tabPtr.p->tabFile[0] = tabPtr.p->tabFile[1] = RNIL;
14996   }//if
14997 }//Dbdih::releaseTable()
14998 
releaseReplicas(Uint32 * replicaPtrI)14999 void Dbdih::releaseReplicas(Uint32 * replicaPtrI)
15000 {
15001   ReplicaRecordPtr replicaPtr;
15002   replicaPtr.i = * replicaPtrI;
15003   jam();
15004   while (replicaPtr.i != RNIL)
15005   {
15006     jam();
15007     c_replicaRecordPool.getPtr(replicaPtr);
15008     Uint32 tmp = replicaPtr.p->nextPool;
15009     c_replicaRecordPool.release(replicaPtr);
15010     replicaPtr.i = tmp;
15011     cnoFreeReplicaRec++;
15012   }//while
15013 
15014   * replicaPtrI = RNIL;
15015 }//Dbdih::releaseReplicas()
15016 
seizeReplicaRec(ReplicaRecordPtr & replicaPtr)15017 void Dbdih::seizeReplicaRec(ReplicaRecordPtr& replicaPtr)
15018 {
15019   c_replicaRecordPool.seize(replicaPtr);
15020   cnoFreeReplicaRec--;
15021   replicaPtr.p->nextPool = RNIL;
15022 }//Dbdih::seizeReplicaRec()
15023 
releaseFile(Uint32 fileIndex)15024 void Dbdih::releaseFile(Uint32 fileIndex)
15025 {
15026   FileRecordPtr filePtr;
15027   filePtr.i = fileIndex;
15028   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
15029   filePtr.p->nextFile = cfirstfreeFile;
15030   cfirstfreeFile = filePtr.i;
15031 }//Dbdih::releaseFile()
15032 
15033 
/**
 * ALTER_TAB_REQ: drive the DIH side of an ALTER TABLE through its
 * phases (prepare / commit / complete / revert / wait-scan).
 */
void Dbdih::execALTER_TAB_REQ(Signal * signal)
{
  const AlterTabReq* req = (const AlterTabReq*)signal->getDataPtr();
  const Uint32 senderRef = req->senderRef;
  const Uint32 senderData = req->senderData;
  const Uint32 tableId = req->tableId;
  const Uint32 tableVersion = req->tableVersion;
  const Uint32 newTableVersion = req->newTableVersion;
  AlterTabReq::RequestType requestType =
    (AlterTabReq::RequestType) req->requestType;
  D("ALTER_TAB_REQ(DIH)");

  TabRecordPtr tabPtr;
  tabPtr.i = tableId;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

  /* If a fragment-adding prepare/revert arrives while the table copy
   * machinery is busy, retry the whole request (with sections) later. */
  switch(requestType){
  case AlterTabReq::AlterTablePrepare:
    jam();
    // fall through
  case AlterTabReq::AlterTableRevert:
    jam();
    if (AlterTableReq::getAddFragFlag(req->changeMask) &&
        tabPtr.p->tabCopyStatus != TabRecord::CS_IDLE)
    {
      jam();
      SectionHandle handle(this, signal);
      sendSignalWithDelay(reference(), GSN_ALTER_TAB_REQ, signal, 10,
                          signal->getLength(), &handle);
      return;
    }
    break;
  case AlterTabReq::AlterTableCommit:
    jam();
    break;
  case AlterTabReq::AlterTableComplete:
    jam();
    break;
  case AlterTabReq::AlterTableWaitScan:
    jam();
    break;
  default:
    jamLine(requestType);
  }

  ConnectRecordPtr connectPtr;
  connectPtr.i = RNIL;
  switch (requestType) {
  case AlterTabReq::AlterTablePrepare:
    jam();

    D("AlterTabReq::AlterTablePrepare: tableId: " << tabPtr.i);
    /* Seize a connect record from the free list to track the ALTER. */
    ndbrequire(cfirstconnect != RNIL);
    connectPtr.i = cfirstconnect;
    ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
    cfirstconnect = connectPtr.p->nextPool;

    /* Snapshot the current fragmentation so a revert can restore it. */
    connectPtr.p->m_alter.m_totalfragments = tabPtr.p->totalfragments;
    connectPtr.p->m_alter.m_org_totalfragments = tabPtr.p->totalfragments;
    connectPtr.p->m_alter.m_partitionCount = tabPtr.p->partitionCount;
    connectPtr.p->m_alter.m_changeMask = req->changeMask;
    connectPtr.p->m_alter.m_new_map_ptr_i = req->new_map_ptr_i;
    connectPtr.p->userpointer = senderData;
    connectPtr.p->userblockref = senderRef;
    connectPtr.p->connectState = ConnectRecord::ALTER_TABLE;
    connectPtr.p->table = tabPtr.i;
    tabPtr.p->connectrec = connectPtr.i;
    break;
  case AlterTabReq::AlterTableRevert:
    jam();
    D("AlterTabReq::AlterTableRevert: tableId: " << tabPtr.i);
    /* Roll the schema version back to the pre-ALTER one. */
    tabPtr.p->schemaVersion = tableVersion;

    connectPtr.i = req->connectPtr;
    ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);

    ndbrequire(connectPtr.p->connectState == ConnectRecord::ALTER_TABLE);

    connectPtr.p->userpointer = senderData;
    connectPtr.p->userblockref = senderRef;

    if (AlterTableReq::getReorgFragFlag(connectPtr.p->m_alter.m_changeMask))
    {
      jam();
      make_new_table_non_writeable(tabPtr);
    }

    if (AlterTableReq::getAddFragFlag(req->changeMask))
    {
      jam();
      /* Added fragments must be dropped again before confirming. */
      tabPtr.p->tabCopyStatus = TabRecord::CS_ALTER_TABLE;
      connectPtr.p->connectState = ConnectRecord::ALTER_TABLE_REVERT;
      drop_fragments(signal, connectPtr,
                     connectPtr.p->m_alter.m_totalfragments);
      return;
    }

    send_alter_tab_conf(signal, connectPtr);

    ndbrequire(tabPtr.p->connectrec == connectPtr.i);
    tabPtr.p->connectrec = RNIL;
    release_connect(connectPtr);
    return;
    break;
  case AlterTabReq::AlterTableCommit:
  {
    jam();
    D("AlterTabReq::AlterTableCommit: tableId: " << tabPtr.i);
    tabPtr.p->schemaVersion = newTableVersion;

    connectPtr.i = req->connectPtr;
    ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
    connectPtr.p->userpointer = senderData;
    connectPtr.p->userblockref = senderRef;
    ndbrequire(connectPtr.p->connectState == ConnectRecord::ALTER_TABLE);
    make_new_table_read_and_writeable(tabPtr, connectPtr, signal);
    return;
  }
  case AlterTabReq::AlterTableComplete:
    jam();
    D("AlterTabReq::AlterTableComplete: tableId: " << tabPtr.i);
    connectPtr.i = req->connectPtr;
    ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
    connectPtr.p->userpointer = senderData;
    connectPtr.p->userblockref = senderRef;

    if (!make_old_table_non_writeable(tabPtr, connectPtr))
    {
      jam();
      send_alter_tab_conf(signal, connectPtr);
      return;
    }
    /**
     * This is a table reorg, we want to wait for scans with
     * REORG_NOT_MOVED flag set to ensure that those scans have
     * completed before we start up a new ALTER TABLE REORG in
     * which case these scans might miss to read rows.
     *
     * Fall through to make this happen.
     */
  case AlterTabReq::AlterTableWaitScan:{
    jam();
    /* Poll via CONTINUEB until old scans have drained (wait_old_scan). */
    const NDB_TICKS now = NdbTick_getCurrentTicks();
    signal->theData[0] = DihContinueB::ZWAIT_OLD_SCAN;
    signal->theData[1] = tabPtr.i;
    signal->theData[2] = senderRef;
    signal->theData[3] = senderData;
    signal->theData[4] = connectPtr.i;
    signal->theData[5] = Uint32(now.getUint64() >> 32);
    signal->theData[6] = Uint32(now.getUint64());
    signal->theData[7] = 3; // Seconds to wait
    sendSignal(reference(), GSN_CONTINUEB, signal, 8, JBB);
    return;
  }
  default:
    ndbabort();
  }

  /* Prepare with new fragments: unpack the fragmentation description
   * carried in the first signal section and start adding fragments. */
  if (AlterTableReq::getAddFragFlag(req->changeMask))
  {
    jam();
    SegmentedSectionPtr ptr;
    SectionHandle handle(this, signal);
    handle.getSection(ptr, 0);
    union {
      Uint16 buf[2+2*MAX_NDB_PARTITIONS];
      Uint32 _align[1];
    };
    copy(_align, ptr);
    releaseSections(handle);
    start_add_fragments_in_new_table(tabPtr, connectPtr, buf, signal);
    return;
  }

  send_alter_tab_conf(signal, connectPtr);
}
15210 
/**
 * Add 'cnt' new fragments to the table as described by 'buf'.
 *
 * buf layout: buf[0] = number of replicas, buf[1] = number of new
 * fragments, then per fragment (stride 1 + replicas): the log part id
 * followed by the replica node ids; the first replica node is also the
 * preferred primary.
 *
 * Returns 0 on success, a non-zero error code on failure (after
 * unwinding the fragments added so far).
 */
Uint32
Dbdih::add_fragments_to_table(Ptr<TabRecord> tabPtr, const Uint16 buf[])
{
  Uint32 replicas = buf[0];
  Uint32 cnt = buf[1];

  Uint32 i = 0;
  Uint32 err = 0;
  Uint32 current = tabPtr.p->totalfragments;
  for (i = 0; i<cnt; i++)
  {
    FragmentstorePtr fragPtr;
    Uint32 fragId = current + i;
    if (ERROR_INSERTED(7212) && cnt)
    {
      err = 1;
      CLEAR_ERROR_INSERT_VALUE;
      goto error;
    }

    if ((err = add_fragment_to_table(tabPtr, fragId, fragPtr)))
      goto error;

    fragPtr.p->m_log_part_id = buf[2+(1 + replicas)*i];
    ndbrequire(fragPtr.p->m_log_part_id < NDBMT_MAX_WORKER_INSTANCES);
    fragPtr.p->preferredPrimary = buf[2+(1 + replicas)*i + 1];
    fragPtr.p->partition_id = fragId % tabPtr.p->partitionCount;

    inc_ng_refcount(getNodeGroup(fragPtr.p->preferredPrimary));

    /* Link each replica as active if its node is alive, otherwise park
     * it on the old-stored-replica list. */
    Uint32 activeIndex = 0;
    for (Uint32 j = 0; j<replicas; j++)
    {
      const Uint32 nodeId = buf[2+(1 + replicas)*i + 1 + j];
      ReplicaRecordPtr replicaPtr;
      allocStoredReplica(fragPtr,
                         replicaPtr,
                         nodeId,
                         current + i,
                         tabPtr.i);
      if (getNodeStatus(nodeId) == NodeRecord::ALIVE) {
        jam();
        ndbrequire(activeIndex < MAX_REPLICAS);
        fragPtr.p->activeNodes[activeIndex] = nodeId;
        activeIndex++;
      } else {
        jam();
        removeStoredReplica(fragPtr, replicaPtr);
        linkOldStoredReplica(fragPtr, replicaPtr);
      }
    }
    fragPtr.p->fragReplicas = activeIndex;
  }

  return 0;
error:
  /* NOTE(review): this unwind calls release for fragIds current+i down
   * to current+1. The first call is a no-op (fragId >= totalfragments)
   * and fragId 'current' is apparently never released here — looks like
   * an off-by-one; confirm whether the subsequent revert/abort path
   * (drop_fragments_from_new_table_view) cleans up the remainder. */
  for(i = i + current; i != current; i--)
  {
    release_fragment_from_table(tabPtr, i);
  }

  return err;
}
15274 
15275 void
wait_old_scan(Signal * signal)15276 Dbdih::wait_old_scan(Signal* signal)
15277 {
15278   jam();
15279 
15280   TabRecordPtr tabPtr;
15281   tabPtr.i = signal->theData[1];
15282   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
15283 
15284   if (tabPtr.p->m_scan_count[1] == 0)
15285   {
15286     jam();
15287     Uint32 senderRef = signal->theData[2];
15288     Uint32 senderData = signal->theData[3];
15289     Uint32 connectPtrI = signal->theData[4];
15290 
15291     AlterTabConf* conf = (AlterTabConf*)signal->getDataPtrSend();
15292     conf->senderRef = reference();
15293     conf->senderData = senderData;
15294     conf->connectPtr = connectPtrI;
15295     sendSignal(senderRef, GSN_ALTER_TAB_CONF, signal,
15296                AlterTabConf::SignalLength, JBB);
15297     return;
15298   }
15299 
15300   const Uint32 start_hi = signal->theData[5];
15301   const Uint32 start_lo = signal->theData[6];
15302   const Uint32 wait = signal->theData[7];
15303 
15304   const NDB_TICKS start((Uint64(start_hi) << 32) | start_lo);
15305   const NDB_TICKS now  = NdbTick_getCurrentTicks();
15306   const Uint32 elapsed = (Uint32)NdbTick_Elapsed(start,now).seconds();
15307 
15308   if (elapsed > wait)
15309   {
15310     infoEvent("Waiting(%u) for scans(%u) to complete on table %u",
15311               elapsed,
15312               tabPtr.p->m_scan_count[1],
15313               tabPtr.i);
15314 
15315     if (wait == 3)
15316     {
15317       signal->theData[7] = 3 + 7;
15318     }
15319     else
15320     {
15321       signal->theData[7] = 2 * wait;
15322     }
15323   }
15324 
15325   sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 7);
15326 }
15327 
15328 Uint32
add_fragment_to_table(Ptr<TabRecord> tabPtr,Uint32 fragId,Ptr<Fragmentstore> & fragPtr)15329 Dbdih::add_fragment_to_table(Ptr<TabRecord> tabPtr,
15330                              Uint32 fragId,
15331                              Ptr<Fragmentstore>& fragPtr)
15332 {
15333   Uint32 fragments = tabPtr.p->totalfragments;
15334   Uint32 chunks = tabPtr.p->noOfFragChunks;
15335 
15336   ndbrequire(fragId == fragments); // Only add at the end
15337 
15338   if (ERROR_INSERTED(7211))
15339   {
15340     CLEAR_ERROR_INSERT_VALUE;
15341     return 1;
15342   }
15343 
15344   Uint32 allocated = chunks << LOG_NO_OF_FRAGS_PER_CHUNK;
15345   if (fragId < allocated)
15346   {
15347     jam();
15348     tabPtr.p->totalfragments++;
15349     getFragstore(tabPtr.p, fragId, fragPtr);
15350     return 0;
15351   }
15352 
15353   /**
15354    * Allocate a new chunk
15355    */
15356   fragPtr.i = cfirstfragstore;
15357   if (fragPtr.i == RNIL)
15358   {
15359     jam();
15360     return -1;
15361   }
15362 
15363   ptrCheckGuard(fragPtr, cfragstoreFileSize, fragmentstore);
15364   cfirstfragstore = fragPtr.p->nextFragmentChunk;
15365   ndbrequire(cremainingfrags >= NO_OF_FRAGS_PER_CHUNK);
15366   cremainingfrags -= NO_OF_FRAGS_PER_CHUNK;
15367 
15368   ndbrequire(chunks < NDB_ARRAY_SIZE(tabPtr.p->startFid));
15369   tabPtr.p->startFid[chunks] = fragPtr.i;
15370   Uint32 init_fragid = fragId;
15371   for (Uint32 i = 0; i<NO_OF_FRAGS_PER_CHUNK; i++)
15372   {
15373     jam();
15374     Ptr<Fragmentstore> tmp;
15375     tmp.i = fragPtr.i + i;
15376     ptrCheckGuard(tmp, cfragstoreFileSize, fragmentstore);
15377     initFragstore(tmp, init_fragid);
15378     init_fragid++;
15379   }
15380 
15381   tabPtr.p->totalfragments++;
15382   tabPtr.p->noOfFragChunks++;
15383 
15384   return 0;
15385 }
15386 
15387 /**
15388  * Both table mutex and table RCU lock need be held when calling
15389  * this function.
15390  */
15391 void
release_fragment_from_table(Ptr<TabRecord> tabPtr,Uint32 fragId)15392 Dbdih::release_fragment_from_table(Ptr<TabRecord> tabPtr, Uint32 fragId)
15393 {
15394   FragmentstorePtr fragPtr;
15395   Uint32 fragments = tabPtr.p->totalfragments;
15396   Uint32 chunks = tabPtr.p->noOfFragChunks;
15397 
15398   if (fragId >= fragments)
15399   {
15400     jam();
15401     return;
15402   }
15403   ndbrequire(fragId == fragments - 1); // only remove at end
15404   ndbrequire(fragments != 0);
15405 
15406   getFragstore(tabPtr.p, fragId, fragPtr);
15407   dec_ng_refcount(getNodeGroup(fragPtr.p->preferredPrimary));
15408 
15409   releaseReplicas(& fragPtr.p->storedReplicas);
15410   releaseReplicas(& fragPtr.p->oldStoredReplicas);
15411 
15412   if (fragId == ((chunks - 1) << LOG_NO_OF_FRAGS_PER_CHUNK))
15413   {
15414     jam();
15415 
15416     getFragstore(tabPtr.p, fragId, fragPtr);
15417 
15418     fragPtr.p->nextFragmentChunk = cfirstfragstore;
15419     cfirstfragstore = fragPtr.i;
15420     cremainingfrags += NO_OF_FRAGS_PER_CHUNK;
15421     tabPtr.p->noOfFragChunks = chunks - 1;
15422   }
15423 
15424   tabPtr.p->totalfragments--;
15425 }
15426 
15427 void
send_alter_tab_ref(Signal * signal,Ptr<TabRecord> tabPtr,Ptr<ConnectRecord> connectPtr,Uint32 errCode)15428 Dbdih::send_alter_tab_ref(Signal* signal,
15429                           Ptr<TabRecord> tabPtr,
15430                           Ptr<ConnectRecord> connectPtr,
15431                           Uint32 errCode)
15432 {
15433   AlterTabRef* ref = (AlterTabRef*)signal->getDataPtrSend();
15434   ref->senderRef = reference();
15435   ref->senderData = connectPtr.p->userpointer;
15436   ref->errorCode = errCode;
15437   sendSignal(connectPtr.p->userblockref, GSN_ALTER_TAB_REF, signal,
15438              AlterTabRef::SignalLength, JBB);
15439 }
15440 
15441 void
send_alter_tab_conf(Signal * signal,Ptr<ConnectRecord> connectPtr)15442 Dbdih::send_alter_tab_conf(Signal* signal, Ptr<ConnectRecord> connectPtr)
15443 {
15444   AlterTabConf* conf = (AlterTabConf*)signal->getDataPtrSend();
15445   conf->senderRef = reference();
15446   conf->senderData = connectPtr.p->userpointer;
15447   conf->connectPtr = connectPtr.i;
15448   sendSignal(connectPtr.p->userblockref, GSN_ALTER_TAB_CONF, signal,
15449              AlterTabConf::SignalLength, JBB);
15450 }
15451 
15452 void
saveTableFile(Signal * signal,Ptr<ConnectRecord> connectPtr,Ptr<TabRecord> tabPtr,TabRecord::CopyStatus expectedStatus,Callback & cb)15453 Dbdih::saveTableFile(Signal* signal,
15454                      Ptr<ConnectRecord> connectPtr,
15455                      Ptr<TabRecord> tabPtr,
15456                      TabRecord::CopyStatus expectedStatus,
15457                      Callback& cb)
15458 {
15459   ndbrequire(connectPtr.i == cb.m_callbackData);         // required
15460   ndbrequire(tabPtr.p->tabCopyStatus == expectedStatus); // locking
15461   memcpy(&connectPtr.p->m_callback, &cb, sizeof(Callback));
15462 
15463   tabPtr.p->tabCopyStatus = TabRecord::CS_COPY_TO_SAVE;
15464   tabPtr.p->tabUpdateState = TabRecord::US_CALLBACK;
15465   signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
15466   signal->theData[1] = tabPtr.i;
15467   sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
15468 }
15469 
15470 void
alter_table_writeTable_conf(Signal * signal,Uint32 ptrI,Uint32 err)15471 Dbdih::alter_table_writeTable_conf(Signal* signal, Uint32 ptrI, Uint32 err)
15472 {
15473   jamEntry();
15474   ndbrequire(err == 0);
15475 
15476   ConnectRecordPtr connectPtr;
15477   connectPtr.i = ptrI;
15478   ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
15479 
15480   switch(connectPtr.p->connectState){
15481   case ConnectRecord::ALTER_TABLE_REVERT:
15482   {
15483     jam();
15484     send_alter_tab_conf(signal, connectPtr);
15485 
15486     Ptr<TabRecord> tabPtr;
15487     tabPtr.i = connectPtr.p->table;
15488     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
15489     ndbrequire(tabPtr.p->connectrec == connectPtr.i);
15490     tabPtr.p->connectrec = RNIL;
15491     release_connect(connectPtr);
15492     return;
15493   }
15494   case ConnectRecord::ALTER_TABLE:
15495   {
15496     jam();
15497     send_alter_tab_conf(signal, connectPtr);
15498     return;
15499   }
15500   default:
15501     jamLine(connectPtr.p->connectState);
15502     ndbabort();
15503   }
15504 }
15505 
/**
 * Drop the fragments added by an ALTER TABLE, one at a time, counting
 * 'curr' down towards m_org_totalfragments. Each DROP_FRAG_REQ to LQH
 * is answered by execDROP_FRAG_CONF, which re-enters here with the
 * next lower fragment count. When done, the abort or revert of the
 * ALTER TABLE is completed according to connectState.
 */
void
Dbdih::drop_fragments(Signal* signal, Ptr<ConnectRecord> connectPtr,
                      Uint32 curr)
{
  ndbrequire(curr >= connectPtr.p->m_alter.m_org_totalfragments);
  if (curr == connectPtr.p->m_alter.m_org_totalfragments)
  {
    /**
     * done...
     */
    jam();
    Ptr<TabRecord> tabPtr;
    tabPtr.i = connectPtr.p->table;
    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

    drop_fragments_from_new_table_view(tabPtr, connectPtr);

    switch(connectPtr.p->connectState){
    case ConnectRecord::ALTER_TABLE_ABORT:
    {
      jam();
      /* Prepare failed: refuse the ALTER and keep the connect record
       * around in ALTER_TABLE state for the revert round. */
      ndbrequire(tabPtr.p->tabCopyStatus == TabRecord::CS_ALTER_TABLE);
      tabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
      send_alter_tab_ref(signal, tabPtr, connectPtr, ~0);

      connectPtr.p->connectState = ConnectRecord::ALTER_TABLE;
      return;
    }
    case ConnectRecord::ALTER_TABLE_REVERT:
    {
      jam();
      /* Persist the reverted table description; the callback confirms
       * and releases the connect record when the write completes. */
      Callback cb;
      cb.m_callbackData = connectPtr.i;
      cb.m_callbackFunction = safe_cast(&Dbdih::alter_table_writeTable_conf);
      saveTableFile(signal, connectPtr, tabPtr, TabRecord::CS_ALTER_TABLE, cb);
      return;
    }
    default:
      jamLine(connectPtr.p->connectState);
      ndbabort();
    }
    return;
  }

  ndbrequire(curr > 0);
  /* Ask LQH to drop the highest remaining new fragment. */
  DropFragReq* req = (DropFragReq*)signal->getDataPtrSend();
  req->senderRef = reference();
  req->senderData = connectPtr.i;
  req->tableId = connectPtr.p->table;
  req->fragId = curr - 1;
  req->requestInfo = DropFragReq::AlterTableAbort;
  sendSignal(DBLQH_REF, GSN_DROP_FRAG_REQ, signal,
             DropFragReq::SignalLength, JBB);
}
15560 
void
Dbdih::execDROP_FRAG_REF(Signal* signal)
{
  /* DROP_FRAG_REQ is only sent for fragments we just created while
   * aborting/reverting an ALTER TABLE; LQH must never refuse it. */
  ndbabort();
}
15566 
15567 void
execDROP_FRAG_CONF(Signal * signal)15568 Dbdih::execDROP_FRAG_CONF(Signal* signal)
15569 {
15570   DropFragConf* conf = (DropFragConf*)signal->getDataPtr();
15571 
15572   ConnectRecordPtr connectPtr;
15573   connectPtr.i = conf->senderData;
15574   ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
15575 
15576   drop_fragments(signal, connectPtr, conf->fragId);
15577 }
15578 
15579 /*
15580   G E T   N O D E S
15581   **********************=
15582   */
15583 /*****************************************************************************/
15584 /* **********     TRANSACTION  HANDLING  MODULE                  *************/
15585 /*****************************************************************************/
15586 
15587 /**
15588  * Transaction Handling Module
15589  * ---------------------------
15590  *
15591  * This module can to a great extent be described as the heart of the
15592  * distribution aspects of MySQL Cluster. It is an essential part of key
15593  * operations and scan operations. It will ensure that the TC block will get
15594  * the correct data about table distribution in all operations of the cluster.
15595  *
 * It is absolutely vital for one of the USPs (Unique Selling Points) of MySQL
 * Cluster, which is its high availability and its ability to perform online
15598  * meta data changes while still providing both read and write services
15599  * using the old meta data and even being able to handle both new and old
15600  * meta data at the same time during the switch over phase.
15601  *
15602  * It is absolutely vital for the recovery aspects and this module is the
15603  * reason that we can support failover in a number of milliseconds. The
15604  * longest time is to discover the failure, when that is done it is a
15605  * matter of 2 signals back and forth to all nodes to reconfigure the
15606  * nodes. It has much help in this node failure handling from QMGR and
15607  * NDBCNTR blocks.
15608  *
15609  * As described in database theory a node failure is handled as a transaction
15610  * in itself. This transaction is executed by QMGR and NDBCNTR and when
15611  * the report about a failed node reaches DBDIH it will immediately switch
15612  * the replicas used to read and write using the data controlled by this
15613  * module.
15614  *
15615  * The problems we are facing in this module are the following:
15616  * -----------------------------------------------------------
15617  * 1) We need to quickly remove fragment replicas belonging to nodes that
15618  *    died.
15619  *
15620  * 2) We need to include new fragment replicas to be writeable and later
15621  *    to be both read and writeable. This as part of bringing new nodes
15622  *    up.
15623  *
15624  * 3) To be able to balance up the usage of nodes we need the ability to
15625  *    switch primary replica after completing node recovery.
15626  *
15627  * 4) We need to add new tables with a flexible table distribution.
15628  *
15629  * 5) We need the ability to reorganize a table to make it use new nodes
15630  *    that have been added to the cluster.
15631  *
15632  * 6) We need to handle fully replicated tables that can at times read
15633  *    from any node that contains the table.
15634  *
 * 7) Supporting updates of several fragments when fully replicated using
 *    an iterator over the copy fragments.
15637  *
 * 8) We need to handle long-running scans that need a consistent view of
 *    the table for its entire operation while at the same time
 *    reorganising the table.
15641  *
15642  * 9) We need to support many different variants of table distributions,
15643  *    as an example we can have tables with one fragment per LDM per node,
15644  *    we could have tables with just one fragment per node group and so
15645  *    forth.
15646  *
15647  * 10)We need to support many different fragmentation types. This includes
15648  *    range partitioning, list partitioning, key partitioning, linear key
15649  *    partitioning. These variants are currently only supported when
15650  *    operating only with a MySQL Server, so no direct NDB API access for
15651  *    these tables is allowed. Also these tables have no ability for table
15652  *    reorganisation at this time.
15653  *
15654  *    The most important fragmentation types we currently support is based
15655  *    on the concept of hash maps. So the table is distributed with e.g.
 *    3840 hash parts. When the table has 8 fragments these 3840 parts are
 *    distributed among those 8 fragments. If the table is later
 *    reorganised to have 12 fragments then some of those 3840 hash
15659  *    parts will be moved to the new fragments and a significant number of
15660  *    those parts will stay put and not need any move.
15661  *
15662  * 11)Finally we also have a programmatic problem. The code that changes
15663  *    these data structures is not critical in performance and is handled
15664  *    by a single thread in each data node.
15665  *
15666  *    However reading of those data structures happens in each key operation
15667  *    and several times in a scan operation. There are many readers of this
15668  *    data structure, it is read from all other threads in the data node
15669  *    although mostly from the TC threads.
15670  *
15671  *    Given the rarity of updates to those data structures we opted for an
15672  *    RCU mechanism. So we get a counter before reading, then we read,
15673  *    after reading we check that the counter is still the same, if not
15674  *    we retry. In addition there are a number of memory barriers used to
15675  *    support this properly in a highly parallel environment.
15676  *
15677  *    This mechanism makes for scaling which is almost unlimited. It also
15678  *    means that any updates of these data structures have to be done in
15679  *    a safe manner always avoiding that the user might trap on a pointer
15680  *    or reference which isn't properly set. This requires very careful
15681  *    programming. To support this carefulness we have gathered together
 *    all code performing those functions into one module here in DBDIH.
15683  *
15684  *    To solve 8) in a multithreaded environments we use a mutex such that
15685  *    scans increment a reference counter when they start and decrement it
15686  *    when done. In this manner we can always keep track of any still
15687  *    outstanding scan operations at table reorganisation time.
15688  *
15689  * Distinguish between READs and WRITEs in DIH interface
15690  * -----------------------------------------------------
15691  * DIH will always deliver a list of all nodes that have a replica of the
15692  * data. However some of those nodes could be write-only during node
15693  * recovery and during on-line table reorganisation. However the receiver
15694  * of this data is only allowed to use the list in one of two ways.
15695  *
15696  * 1) Use entire list for write transactions
 * 2) Use any replica for reading in my own node (if the read backup feature
 *    is active on the table AND READ COMMITTED is used to read the data).
15699  *
15700  * The reason that this works is that no one is allowed to use this
15701  * interface to read data while still in node recovery. So this is the manner
15702  * to ensure that we don't read any fragments that are not yet fully
15703  * recovered.
15704  *
15705  * For table reorg of a table we will only report back the fragments that
15706  * are readable. The fragments that are still in the build process will
15707  * be reported as new fragments and will only be used by special
15708  * transactions that perform the copy phase and the delete phase.
15709  *
15710  * Description of key algorithms DBDIH participates in
15711  * ---------------------------------------------------
15712  * One important feature in MySQL Cluster is ALTER TABLE REORG. This makes
15713  * it possible to reorganize the data in a table to make use of a new
15714  * node group that has been added. It also makes it possible to extend
15715  * the number of fragments in a table. It is still not supported to
15716  * decrease the number of fragments in a table.
15717  *
15718  * DBDIH participates in four very crucial points in this table reorg.
15719  * 1) start_add_fragments_in_new_table
15720  *    This phase is about creating new empty fragments and requires insertion
15721  *    of the new fragments into the shared data structures. The fragments are
15722  *    still not to be used, but it is imperative that we insert the data in
15723  *    a controlled manner.
15724  *
15725  * 2) make_new_table_writeable
15726  *    This method is called when all new fragments have been created, all
15727  *    triggers required to perform the copy phase has been installed. It is
15728  *    now time to make the new fragments participate in write transactions
15729  *    in a controlled manner.
15730  *
15731  *    This means that we have 2 hash maps, one for the old table distribution
15732  *    and for the new table distribution. When a write happens we need to
15733  *    keep both in synch if the write goes to different fragments in the two
15734  *    table distributions.
15735  *
15736  *    The data is also used when copying data over from old fragments to the
15737  *    new fragments.
15738  *
15739  *    Fully replicated tables are a bit special, they cannot add new real
15740  *    fragments, but they can add new copy fragments and thus extend the
15741  *    number of replicas of the data. In this phase we have to distinguish
15742  *    between which fragments can be used for reading and which needs to
15743  *    be updated.
15744  *
15745  *    We handle this by always ensuring that new fragments are at the end of
15746  *    list of copy fragments and that we never report any fragments with
15747  *    higher fragment id than the current variable totalfragments states.
15748  *
15749  * 3) make_table_read_and_writeable
15750  *    This is called after the copy phase has been completed. The fragments
15751  *    are now filled with all data and are also available for reading. The
15752  *    old fragments are still kept up to date. So here we need to ensure
15753  *    that all writes goes to both old and new fragment of each row.
15754  *
15755  * 4) make_old_table_non_writeable
15756  *    Now all transactions using old table distribution have completed (a
15757  *    number of scan operations) and we remove the old hash map from the
15758  *    table. We are now ready to start deleting data from old fragments
15759  *    This data isn't required to stay in those fragments any more.
15760  *
15761  * MySQL Cluster also supports schema transactions, this means that schema
15762  * transactions can be rolled back if they fail for some reason. There are
15763  * two functions used to rollback some of the above.
15764  *
15765  * If we have passed 4 it is too late to rollback and thus recovery is about
15766  * ensuring that the schema transaction is completed. Between 3 and 4 we are
15767  * able to both roll backward and roll forward. So it depends on other
 * parts of the schema transaction which path is chosen. If we fail between
15769  * 2 and 3 then we will have to remove the new table as writeable.
15770  * This is performed by make_new_table_non_writeable.
15771  * If a failure happens between 1 and 2 then we have to drop the new
15772  * fragments, this happens in drop_fragments_from_new_table_view. This method
15773  * is called also during revert ALTER TABLE when failure occurred between 2
15774  * and 3.
15775  *
15776  * Description of copy phase of ALTER TABLE REORG
15777  * ----------------------------------------------
15778  * The copy phase of ALTER TABLE REORG involves a great number of blocks.
15779  * The below setup and tear down phase is a description of what happens
15780  * for each table being reorganized.
15781  *
15782  * The below process happens in all nodes in parallel. Each node will
15783  * take care of the fragment replicas for which it is the primary
15784  * replica. This makes most of the communication here be local to
15785  * a node. Only the sending of updates to the new fragments and
15786  * updates to the backup replicas in the same node group will be
15787  * done over the network.
15788  *
15789  * DBDICT    DBDICT    TRIX          SUMA    DBUTIL      DBDIH   DBLQH
15790  * COPY_DATA_REQ
15791  * ------------>
15792  *   COPY_DATA_IMPL_REQ
15793  * --------------------->
15794  *                       UTIL_PREPARE_REQ
15795  *                       ---------------------->
15796  *   GET_TABINFOREQ
15797  * <--------------------------------------------
15798  *   GET_TABINFOCONF
15799  * -------------------------------------------->
15800  *                       UTIL_PREPARE_CONF
15801  *                       <----------------------
15802  *                       SUB_CREATE_REQ
15803  *                       ------------->
15804  *   GET_TABINFOREQ
15805  * <-----------------------------------
15806  *   GET_TABINFOCONF
15807  * ----------------------------------->
15808  *                       SUB_CREATE_CONF
15809  *                       <-------------
15810  *                       SUB_SYNC_REQ
15811  *                       ------------->
15812  *                                     DIH_SCAN_TAB_REQ (immediate)
15813  *                                     ---------------------->
15814  *                                     DIH_SCAN_TAB_CONF
15815  *                                     <----------------------
15816  *                                 Send DIH_SCAN_TAB_CONF to get rt break
15817  *
15818  *                                     DIGETNODESREQ (immediate)
15819  *                                     ---------------------->
15820  *                                     DIGETNODESCONF
15821  *                                     <----------------------
15822  *                         Get distribution data for each fragment
15823  *                         using DIGETNODESREQ possibly with
15824  *                         rt break through CONTINUEB. This builds
15825  *                         a list of fragments to handle.
15826  *
15827  *                                     SCAN_FRAGREQ
15828  *                                     -------------------------------->
15829  *                                     For each row we receive and send:
15830  *                                     TRANSID_AI
15831  *                                     <-------------------------------
15832  *                                     KEYINFO20
15833  *                                     <-------------------------------
15834  *                       SUB_TABLE_DATA
15835  *                       <-------------
15836  *                       UTIL_EXECUTE_REQ
15837  *                       --------------------->
15838  *                       TCKEYREQ to DBTC
15839  *                       ------------------------->
15840  *                       TCKEYCONF from DBTC
15841  *                       <-------------------------
15842  *                       UTIL_EXECUTE_CONF
15843  *                       <---------------------
15844  *
15845  * After 16 rows the scan will return (this will happen for each 16 row
15846  *                                         SCAN_FRAGCONF
15847  *                                      <--------------------------------
15848  *                       SUB_SYNC_CONTINUE_REQ
15849  *                       <--------------
15850  *                       wait for all outstanding transactions to complete
15851  *                       SUB_SYNC_CONTINUE_CONF
15852  *                       -------------->
15853  *                                         SCAN_NEXTREQ
15854  *                                       -------------------------------->
15855  *
15856  * Every now and then a fragment will have its scan completed. Then it will
15857  * receive SCAN_FRAGCONF with close flag set. Then it will send a new
15858  * SCAN_FRAGREQ for the next fragment to copy. When no more fragments is
15859  * available for copying then the copy action is completed.
15860  *
15861  * Copy phase completed after SCAN_FRAGCONF(close) from last fragment =>
15862  *                       SUB_SYNC_CONF
15863  *                       <-------------
15864  *                       WAIT_GCP_REQ
15865  *                       ----------------------------------->
15866  *
15867  *                       ..... wait for highest GCI to complete
15868  *
15869  *                       WAIT_GCP_CONF
15870  *                       <----------------------------------
15871  *                       SUB_REMOVE_REQ
15872  *                       ------------->
15873  *                       SUB_REMOVE_CONF
15874  *                       <-------------
15875  *                       UTIL_RELEASE_REQ
15876  *                       ------------------------>
15877  *                       UTIL_RELEASE_CONF
15878  *                       <------------------------
15879  * COPY_DATA_IMPL_CONF
15880  * <---------------------
15881  *
15882  * As can be seen the TRIX block is working with SUMA and DBUTIL to set up
15883  * the copy phase. The DBUTIL block is the block that performs the actual
15884  * read of the old fragments (through scans) and then copies the data to
15885  * the new fragments using write operations (key operations). Trix isn't
15886  * doing any real work, it is merely acting as a coordinator of the work
15887  * done.
15888  *
15889  * DBUTIL needs to set up generic data structures to enable receiving rows
15890  * from any table and pass them onto to be written from DBTC. There is fair
15891  * amount of code to do this, but it is straightforward code that doesn't
15892  * have much interaction issues, it is a fairly pure data structure problem.
15893  *
15894  * These data structures are released in UTIL_RELEASE_REQ.
15895  *
15896  * SUMA also reads the table metadata through the GET_TABINFO interface to
15897  * DICT, this is however only needed to read the number of attributes and
15898  * table version and verifying that the table exists.
15899  *
15900  * TRIX uses similar interfaces also to build indexes, create foreign keys
15901  * other basic operations. For COPY_DATA_IMPL_REQ TRIX receives the number
15902  * of real fragments from DBDICT. SUB_SYNC_REQ contains fragId == ZNIL which
15903  * means sync all fragments.
15904  *
15905  * Actually the copy phase is an exact replica of the also mentioned delete
15906  * phase. So when reorganising the data one first calls this functionality
15907  * using a few important flags. The first phase uses the flag REORG_COPY.
15908  * The second phase uses the flag called REORG_DELETE.
15909  *
15910  * COPY_DATA_IMPL_REQ always set the RF_WAIT_GCP, this means that when
15911  * TRIX receives SUB_SYNC_CONF we will wait for a GCP to complete to ensure
15912  * that the copy transactions are stable on disk through the REDO log.
15913  *
15914  * The SCAN_FRAGREQ uses TUP order if disk attributes in table. It always
 * scans using exclusive locks. This means that we will temporarily lock
 * each row when performing copy phase for the row, there should be no
 * risk of deadlocks due to this since only one row lock is required. So
 * no deadlock cycles can form due to this. We use parallelism 16 in the
 * scanning.
15920  *
15921  * For each row we receive we get a TRANSID_AI with the attribute information
15922  * and KEYINFO20 with the key information. Based on this information we create
15923  * a SUB_TABLE_DATA signal and pass this to TRIX for execution by DBUTIL.
15924  * We send it to DBUTIL in a UTIL_EXECUTE_REQ signal referring to the prepared
15925  * transaction in DBUTIL. Each row is executed as a separate Scan Take Over
15926  * transaction. When the transaction is completed we get a UTIL_EXECUTE_CONF
15927  * response back. We record the GCI used to ensure we know the highest GCI
15928  * used as part of the Copy phase.
15929  *
15930  * The TCKEYREQ sent to DBTC is a Write operation and thus will either
15931  * overwrite the row or it will insert if it doesn't exist.
15932  *
15933  * There is a lot of logic in DBTC, DBLQH and DBTUP which is used to control
 * the updates on various fragments. During Copy phase and Delete phase all
15935  * fragments have a new reorg trigger installed. This trigger is fired for
15936  * all normal writes on tuples that are currently moving, nothing happens
15937  * for tuples that aren't moving. The trigger fires for moving tuples in
15938  * the old fragments and also in the new fragments when these are set to
15939  * online as having all data. In this phase we will make the new fragments
15940  * readable and also becomes the primary fragment for the tuples and in this
15941  * phase we still need to maintain the data in the old fragments until we
15942  * have completed the scans on those.
15943  *
15944  * This trigger will thus only fire during the time when we have two hash
15945  * maps here in DBDIH. As soon as we set the new hash map to RNIL the
15946  * reorg trigger won't fire anymore for writes going through this DIH.
15947  *
15948  * The copy phase and delete phase both sets the reorg flag in TCKEYREQ.
15949  * For the copy phase this means that the copy is only performed for
15950  * rows that are moving, for rows that aren't moving the action is
15951  * immediately completed. For moving rows the write is performed and will
15952  * either result in the row being inserted or the row being overwritten
15953  * with the same value (this will happen if an insert reorg trigger
15954  * inserted the row already).
15955  *
15956  * During the delete phase a delete action will be performed towards the
15957  * new hash map (which is actually now the old hash map since we have
15958  * switched to the new hash map as the original one and the old one is
15959  * the new one. This means that the delete will be performed only on
15960  * the old fragment and thus removing a row that has already completed
15961  * its move.
15962  *
15963  * When a reorg trigger is fired we only need to write the other fragment
15964  * with the same data as we did in the first fragment. However we have to
15965  * take into account that the fragments might have been swapped since
15966  * the original operation was here and when we come here to handle the
15967  * fired trigger. So the user of this interface have to verify that the
15968  * fragment id to update as new fragment isn't simply the same that the
15969  * trigger fired from, if it is then the other fragment is the one reported
15970  * as the current fragment from DIGETNODESREQ.
15971  *
15972  * How to handle ALTER TABLE REORG for fully replicated tables
15973  * -----------------------------------------------------------
15974  * First some observations. In fully replicated tables no data is moving.
15975  * We only need to copy the data to the new fragments. This means that
15976  * there is no need for reorg triggers. There is also no need for a
15977  * delete phase since no data has moved.
15978  *
15979  * The reorg triggers is avoided simply by never reporting REORG_MOVING
15980  * in the DIH interface. This ensures that no reorg trigger will ever
15981  * fire. Avoiding the delete phase isn't strictly necessary but it is
15982  * an easy optimisation and we can simply send COPY_DATA_IMPL_CONF
15983  * directly from COPY_DATA_IMPL_REQ in the delete phase to avoid it.
15984  *
15985  * The copy phase can be handled by DBTC putting a different meaning to
 * the reorg flag. Normally we would set SOF_REORG_COPY to ensure that
15987  * we only write the new fragment for those copy rows. Here we want to
15988  * perform an update that uses the fully replicated triggers to ensure
15989  * that all copy fragments are updated. One simple manner to do this is
15990  * to simply perform the update and let the fully replicated trigger
15991  * update all other copy fragments. However this means that we are
 * performing lots of unnecessary writes.
15993  *
15994  * A very simple optimisation is to instead perform the write on the
15995  * first new copy fragment. In this case the trigger will fire and
15996  * since the initial fragment is the first new fragment and the
15997  * iterator only goes towards higher fragment ids, thus we thus
15998  * ensures that we won't write the old fragment that already has the
15999  * correct data. So this write becomes a perfectly normal update on
16000  * fully replicated table except that it uses a triggered operation
16001  * on a copy fragment which is normally not done. But triggers are
16002  * installed on also the copy fragments, so this is ok.
16003  *
16004  * This simple optimisation requires a new flag sent in the DIH
16005  * interface since DIH needs to be told to return the first
16006  * new fragment rather than the main fragment.
16007  *
16008  * More details about ALTER TABLE REORG
16009  * ------------------------------------
16010  * DBTUP has a bit in each tuple header called REORG_MOVE. This bit is set on
16011  * the first time that an update/delete/insert happens on the row after
16012  * calling make_new_table_writeable. After make_new_table_writeable has been
16013  * called we will set DiGetNodesConf::REORG_MOVING for rows that are to be
16014  * moved. So the first such a row has a write of it, this flag will be set
16015  * and also the reorg trigger will fire and send the update to the new
16016  * fragment. However the copy phase will copy this row even if this bit is
16017  * set since the bit can be set also by a transaction that is later aborted.
16018  * So there is no safe way of ensuring that a user transaction has actually
16019  * transferred this row. So when SUMA performs the scan in the copy phase it
16020  * will be a normal scan seeing all rows.
16021  *
16022  * When we have completed the copy phase and entered the delete phase then
16023  * we have set the m_scan_reorg_flag on the table and this means that all
16024  * transactions will have to set the flag ScanFragReq::REORG_NOT_MOVED to
16025  * ensure that they don't scan moved rows in both the new and the old
16026  * fragments. When all moved rows have been deleted from the old fragments
16027  * then we can stop reporting this flag to starting scans.
16028  *
16029  * A scan that is using the REORG_NOT_MOVED is safe unless we are moving
16030  * to yet another ALTER TABLE REORG of the same table very quickly. However
16031  * a potential problem could exist if we have a very long-running scan
16032  * and we start a new table reorg and user transactions start setting the
16033  * REORG_MOVE flag again. In that case the scan will actually miss those
16034  * rows. So effectively to close all possible problems we wait also for
16035  * all scans to complete also after completing the REORG_DELETE phase.
16036  * This ensures that we avoid this issue.
16037  */
16038 
16039 /*
16040   3.8.1    G E T   N O D E S   R E Q U E S T
16041   ******************************************
16042   Asks what nodes should be part of a transaction.
16043 */
execDIGETNODESREQ(Signal * signal)16044 void Dbdih::execDIGETNODESREQ(Signal* signal)
16045 {
16046   const DiGetNodesReq * const req = (DiGetNodesReq *)&signal->theData[0];
16047   FragmentstorePtr fragPtr;
16048   TabRecordPtr tabPtr;
16049   tabPtr.i = req->tableId;
16050   Uint32 hashValue = req->hashValue;
16051   Uint32 distr_key_indicator = req->distr_key_indicator;
16052   Uint32 anyNode = req->anyNode;
16053   Uint32 scan_indicator = req->scan_indicator;
16054   Uint32 get_next_fragid_indicator = req->get_next_fragid_indicator;
16055   Uint32 ttabFileSize = ctabFileSize;
16056   Uint32 fragId;
16057   Uint32 newFragId = RNIL;
16058   Uint32 nodeCount;
16059   Uint32 sig2;
16060   Ptr<Hash2FragmentMap> ptr;
16061   DiGetNodesConf * const conf = (DiGetNodesConf *)&signal->theData[0];
16062   TabRecord* regTabDesc = tabRecord;
16063   EmulatedJamBuffer * jambuf = (EmulatedJamBuffer*)req->jamBufferPtr;
16064   thrjamEntryDebug(jambuf);
16065   ptrCheckGuard(tabPtr, ttabFileSize, regTabDesc);
16066 
16067   /**
16068    * This check will be valid for the following reasons:
16069    * 1) If it is primary key operation we will have checked that the table
16070    *    is existing in DBTC before coming here and DBDIH is informed of new
16071    *    tables BEFORE DBTC and informed of dropping tables AFTER DBTC. So
16072    *    it is safe that if DBTC knows that a table exist then for sure we
16073    *    we will as well.
16074    *
16075    * 2) For ordered index scans we keep track of the number of scans working
16076    *    on the ordered index, so we won't be able to drop the index until
16077    *    all scans on the index has completed.
16078    */
16079   if (DictTabInfo::isOrderedIndex(tabPtr.p->tableType))
16080   {
16081     thrjam(jambuf);
16082     tabPtr.i = tabPtr.p->primaryTableId;
16083     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
16084   }
16085 
16086 loop:
16087   /**
16088    * To ensure we operate on a correct view of both table distribution and
16089    * alive nodes, we use an RCU mechanism to protect this call to
16090    * DIGETNODESREQ, this means that any changes in DBDIH will be reflected
16091    * in external DBTCs reading this data as well. These are variables
16092    * updated very seldomly and we only need to read them, thus a RCU is a
16093    * very powerful mechanism to achieve this.
16094    */
16095   Uint32 tab_val = tabPtr.p->m_lock.read_lock();
16096   Uint32 node_val = m_node_view_lock.read_lock();
16097   Uint32 map_ptr_i = tabPtr.p->m_map_ptr_i;
16098   Uint32 new_map_ptr_i = tabPtr.p->m_new_map_ptr_i;
16099 
16100   if (get_next_fragid_indicator != 0)
16101   {
16102     /**
16103      * The requester is interested in getting the next copy fragment.
16104      * This should only happen for Fully replicated tables atm.
16105      */
16106     thrjam(jambuf);
16107     fragId = hashValue;
16108     ndbassert((tabPtr.p->m_flags & TabRecord::TF_FULLY_REPLICATED) != 0);
16109     getFragstore(tabPtr.p, fragId, fragPtr);
16110     conf->fragId = fragPtr.p->nextCopyFragment;
16111     conf->zero = 0;
16112     goto check_exit;
16113   }
16114   /* When distr key indicator is set, regardless
16115    * of distribution algorithm in use, hashValue
16116    * IS fragment id.
16117    */
16118   if (distr_key_indicator)
16119   {
16120     thrjam(jambuf);
16121     fragId = hashValue;
16122     /**
16123      * This check isn't valid for scans, if we ever implement the possibility
16124      * to decrease the number of fragments then this can be true and still
16125      * be ok since we are using the old meta data and thus getFragstore
16126      * is still working even if we are reading a fragId out of range. We
16127      * keep track of such long-running scans to ensure we know when we
16128      * can remove the fragments completely.
16129      *
16130      * For execution of fully replicated triggers we come here with anyNode=3
16131      * In this case we have received the fragmentId from the code above with
16132      * get_next_fragid_indicator and we should also ensure that all writes
16133      * of fully replicated triggers also go to the new fragments.
16134      *
16135      */
16136     if (unlikely((!scan_indicator) &&
16137                  fragId >= tabPtr.p->totalfragments &&
16138                  anyNode != 3))
16139     {
16140       thrjam(jambuf);
16141       conf->zero= 1; //Indicate error;
16142       signal->theData[1]= ZUNDEFINED_FRAGMENT_ERROR;
16143       goto error;
16144     }
16145   }
16146   else if (tabPtr.p->method == TabRecord::HASH_MAP)
16147   {
16148     if ((tabPtr.p->m_flags & TabRecord::TF_FULLY_REPLICATED) == 0)
16149     {
16150       thrjam(jambuf);
16151       g_hash_map.getPtr(ptr, map_ptr_i);
16152       fragId = ptr.p->m_map[hashValue % ptr.p->m_cnt];
16153 
16154       if (unlikely(new_map_ptr_i != RNIL))
16155       {
16156         thrjam(jambuf);
16157         g_hash_map.getPtr(ptr, new_map_ptr_i);
16158         newFragId = ptr.p->m_map[hashValue % ptr.p->m_cnt];
16159         if (newFragId == fragId)
16160         {
16161           thrjam(jambuf);
16162           newFragId = RNIL;
16163         }
16164       }
16165     }
16166     else
16167     {
16168       /**
16169        * Fully replicated table. There are 3 cases:
16170        * anyNode == 0
16171        *   This is a normal read or write. We want the main fragment.
16172        * anyNode == 1
16173        *   This is a committed read. We want any fragment which is readable.
16174        * anyNode == 2
16175        *   This is a write from the copy phase of ALTER TABLE REORG
16176        *   We want the first new fragment.
16177        */
16178       thrjam(jambuf);
16179       g_hash_map.getPtr(ptr, map_ptr_i);
16180       const Uint32 partId = ptr.p->m_map[hashValue % ptr.p->m_cnt];
16181       if (anyNode == 2)
16182       {
16183         thrjam(jambuf);
16184         fragId = findFirstNewFragment(tabPtr.p, fragPtr, partId, jambuf);
16185         if (fragId == RNIL)
16186         {
16187           conf->zero = 0;
16188           conf->fragId = fragId;
16189           conf->nodes[0] = 0;
16190           goto check_exit;
16191         }
16192       }
16193       else fragId = partId;
16194     }
16195   }
16196   else if (tabPtr.p->method == TabRecord::LINEAR_HASH)
16197   {
16198     thrjam(jambuf);
16199     fragId = hashValue & tabPtr.p->mask;
16200     if (fragId < tabPtr.p->hashpointer) {
16201       thrjam(jambuf);
16202       fragId = hashValue & ((tabPtr.p->mask << 1) + 1);
16203     }//if
16204   }
16205   else if (tabPtr.p->method == TabRecord::NORMAL_HASH)
16206   {
16207     thrjam(jambuf);
16208     fragId= hashValue % tabPtr.p->partitionCount;
16209   }
16210   else
16211   {
16212     thrjam(jambuf);
16213     ndbassert(tabPtr.p->method == TabRecord::USER_DEFINED);
16214 
16215     /* User defined partitioning, but no distribution key passed */
16216     conf->zero= 1; //Indicate error;
16217     signal->theData[1]= ZUNDEFINED_FRAGMENT_ERROR;
16218     goto error;
16219   }
16220   if (ERROR_INSERTED_CLEAR(7240))
16221   {
16222     /* Error inject bypass the RCU lock */
16223     thrjam(jambuf);
16224     conf->zero= 1; //Indicate error;
16225     signal->theData[1]= ZUNDEFINED_FRAGMENT_ERROR;
16226     return;
16227   }
16228   if (ERROR_INSERTED_CLEAR(7234))
16229   {
16230     /* Error inject bypass the RCU lock */
16231     thrjam(jambuf);
16232     conf->zero= 1; //Indicate error;
16233     signal->theData[1]= ZLONG_MESSAGE_ERROR;
16234     return;
16235   }
16236   getFragstore(tabPtr.p, fragId, fragPtr);
16237   if (anyNode == 1)
16238   {
16239     thrjam(jambuf);
16240 
16241     /* anyNode is currently only useful for fully replicated tables */
16242     ndbassert((tabPtr.p->m_flags & TabRecord::TF_FULLY_REPLICATED) != 0);
16243 
16244     /**
16245      * search fragments to see if local fragment can be found
16246      *
16247      */
16248     fragId = findLocalFragment(tabPtr.p, fragPtr, jambuf);
16249   }
16250   nodeCount = extractNodeInfo(jambuf, fragPtr.p, conf->nodes);
16251   sig2 = (nodeCount - 1) +
16252     (fragPtr.p->distributionKey << 16) +
16253     (dihGetInstanceKey(fragPtr) << 24);
16254   conf->zero = 0;
16255   conf->reqinfo = sig2;
16256   conf->fragId = fragId;
16257 
16258   if (unlikely(newFragId != RNIL))
16259   {
16260     thrjam(jambuf);
16261     conf->reqinfo |= DiGetNodesConf::REORG_MOVING;
16262     getFragstore(tabPtr.p, newFragId, fragPtr);
16263     nodeCount = extractNodeInfo(jambuf,
16264                                fragPtr.p,
16265                                conf->nodes + 2 + MAX_REPLICAS);
16266     conf->nodes[MAX_REPLICAS] = newFragId;
16267     conf->nodes[MAX_REPLICAS + 1] = (nodeCount - 1) +
16268       (fragPtr.p->distributionKey << 16) +
16269       (dihGetInstanceKey(fragPtr) << 24);
16270   }
16271 
16272 check_exit:
16273   if (unlikely(!tabPtr.p->m_lock.read_unlock(tab_val)))
16274     goto loop;
16275   if (unlikely(!m_node_view_lock.read_unlock(node_val)))
16276     goto loop;
16277 
16278 error:
16279   /**
16280    * Ensure that also error conditions are based on a consistent view of
16281    * the data. In this no need to check node view since it wasn't used.
16282    */
16283   if (unlikely(!tabPtr.p->m_lock.read_unlock(tab_val)))
16284     goto loop;
16285   return;
16286 
16287 }//Dbdih::execDIGETNODESREQ()
16288 
void
Dbdih::make_node_usable(NodeRecord *nodePtr)
{
  /**
   * Called when a node is ready to be used in transactions.
   * This means that the node needs to participate in writes,
   * it isn't necessarily ready for reads yet.
   *
   * useInTransactions is read by the transaction path (see
   * extractNodeInfo) under the m_node_view_lock read side, so the
   * flag must be flipped inside the write lock to publish a
   * consistent node view to concurrently executing TC threads.
   */
  m_node_view_lock.write_lock();
  nodePtr->useInTransactions = true;
  m_node_view_lock.write_unlock();
}
16301 
void
Dbdih::make_node_not_usable(NodeRecord *nodePtr)
{
  /**
   * Node is no longer to be used in neither read nor
   * writes. The node is dead.
   *
   * Mirror of make_node_usable(): the write lock guarantees that
   * readers of useInTransactions (extractNodeInfo in the
   * DIGETNODESREQ path) never see a torn node view.
   */
  m_node_view_lock.write_lock();
  nodePtr->useInTransactions = false;
  m_node_view_lock.write_unlock();
}
16313 
16314 Uint32
findPartitionOrder(const TabRecord * tabPtrP,FragmentstorePtr fragPtr)16315 Dbdih::findPartitionOrder(const TabRecord *tabPtrP,
16316                           FragmentstorePtr fragPtr)
16317 {
16318   Uint32 order = 0;
16319   FragmentstorePtr tempFragPtr;
16320   Uint32 fragId = fragPtr.p->partition_id;
16321   do
16322   {
16323     jam();
16324     getFragstore(tabPtrP, fragId, tempFragPtr);
16325     if (fragPtr.p == tempFragPtr.p)
16326     {
16327       jam();
16328       return order;
16329     }
16330     fragId = tempFragPtr.p->nextCopyFragment;
16331     order++;
16332   } while (fragId != RNIL);
16333   return RNIL;
16334 }
16335 
16336 Uint32
findFirstNewFragment(const TabRecord * tabPtrP,FragmentstorePtr & fragPtr,Uint32 fragId,EmulatedJamBuffer * jambuf)16337 Dbdih::findFirstNewFragment(const  TabRecord * tabPtrP,
16338                             FragmentstorePtr & fragPtr,
16339                             Uint32 fragId,
16340                             EmulatedJamBuffer *jambuf)
16341 {
16342   /**
16343    * Used by fully replicated tables to find the first new fragment
16344    * to copy data to during the copy phase.
16345    */
16346   do
16347   {
16348     getFragstore(tabPtrP, fragId, fragPtr);
16349     if (fragPtr.p->fragId >= tabPtrP->totalfragments)
16350     {
16351       /* Found first new fragment */
16352       break;
16353     }
16354     fragId = fragPtr.p->nextCopyFragment;
16355     if (fragId == RNIL)
16356       return fragId;
16357   } while (1);
16358   return fragPtr.p->fragId;
16359 }
16360 
Uint32
Dbdih::findLocalFragment(const  TabRecord * tabPtrP,
                         FragmentstorePtr & fragPtr,
                         EmulatedJamBuffer *jambuf)
{
  /**
   * We have found the main fragment, but we want to use any of the copy
   * fragments, so we search forward in the list of copy fragments until we
   * find a fragment that has a replica on our node. In rare cases (after
   * adding a node group and not yet reorganised all tables and performing
   * this on one of the new nodes in these new node groups, it could occur).
   *
   * Start searching the main fragment and then proceeding
   * forward until no more exists.
   *
   * fragPtr is in/out: on entry it points at the main fragment, on
   * return it points at the fragment whose id is returned.
   */
  Uint32 fragId = fragPtr.p->fragId;
  do
  {
    thrjam(jambuf);
    if (check_if_local_fragment(jambuf, fragPtr.p))
    {
      thrjam(jambuf);
      /* This fragment has a replica on our own node; use it. */
      return fragId;
    }
    /* Step to next copy fragment. */
    fragId = fragPtr.p->nextCopyFragment;
    /**
     * Stop at end of chain or when leaving the readable id range.
     * NOTE(review): the guard uses fragId > totalfragments whereas
     * findFirstNewFragment classifies fragId >= totalfragments as a
     * new (non-readable) fragment — confirm that following
     * fragId == totalfragments here is intended.
     */
    if (fragId == RNIL || fragId > tabPtrP->totalfragments)
    {
      thrjam(jambuf);
      break;
    }
    getFragstore(tabPtrP, fragId, fragPtr);
  } while (1);
  /**
   * When no local fragment was found, simply use the last
   * copy fragment found, in this manner we avoid using
   * the main fragment during table reorg, this node group
   * has much to do in this phase.
   */
  return fragPtr.p->fragId;
}
16402 
16403 bool
check_if_local_fragment(EmulatedJamBuffer * jambuf,const Fragmentstore * fragPtr)16404 Dbdih::check_if_local_fragment(EmulatedJamBuffer *jambuf,
16405                                const Fragmentstore *fragPtr)
16406 {
16407   for (Uint32 i = 0; i < fragPtr->fragReplicas; i++)
16408   {
16409     thrjam(jambuf);
16410     if (fragPtr->activeNodes[i] == getOwnNodeId())
16411     {
16412       thrjam(jambuf);
16413       return true;
16414     }
16415   }
16416   return false;
16417 }
16418 
extractNodeInfo(EmulatedJamBuffer * jambuf,const Fragmentstore * fragPtr,Uint32 nodes[])16419 Uint32 Dbdih::extractNodeInfo(EmulatedJamBuffer *jambuf,
16420                               const Fragmentstore * fragPtr,
16421                               Uint32 nodes[])
16422 {
16423   Uint32 nodeCount = 0;
16424   nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
16425   for (Uint32 i = 0; i < fragPtr->fragReplicas; i++) {
16426     thrjam(jambuf);
16427     NodeRecordPtr nodePtr;
16428     ndbrequire(i < MAX_REPLICAS);
16429     nodePtr.i = fragPtr->activeNodes[i];
16430     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
16431     if (nodePtr.p->useInTransactions) {
16432       thrjam(jambuf);
16433       nodes[nodeCount] = nodePtr.i;
16434       nodeCount++;
16435     }//if
16436   }//for
16437   ndbrequire(nodeCount > 0);
16438   return nodeCount;
16439 }//Dbdih::extractNodeInfo()
16440 
16441 #define DIH_TAB_WRITE_LOCK(tabPtrP) \
16442   do { assertOwnThread(); tabPtrP->m_lock.write_lock(); } while (0)
16443 
16444 #define DIH_TAB_WRITE_UNLOCK(tabPtrP) \
16445   do { assertOwnThread(); tabPtrP->m_lock.write_unlock(); } while (0)
16446 
16447 void
start_scan_on_table(TabRecordPtr tabPtr,Signal * signal,Uint32 schemaTransId,EmulatedJamBuffer * jambuf)16448 Dbdih::start_scan_on_table(TabRecordPtr tabPtr,
16449                            Signal *signal,
16450                            Uint32 schemaTransId,
16451                            EmulatedJamBuffer *jambuf)
16452 {
16453   /**
16454    * This method is called from start of scans in TC threads. We need to
16455    * protect against calls from multiple threads. The state and the
16456    * m_scan_count is protected by the mutex.
16457    *
16458    * To avoid having to protect this code with both mutex and RCU code
16459    * we ensure that the mutex is also held anytime we update the
16460    * m_map_ptr_i, totalfragments, noOfBackups, m_scan_reorg_flag
16461    * and partitionCount.
16462    */
16463   NdbMutex_Lock(&tabPtr.p->theMutex);
16464 
16465   if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE)
16466   {
16467     if (! (tabPtr.p->tabStatus == TabRecord::TS_CREATING &&
16468            tabPtr.p->schemaTransId == schemaTransId))
16469     {
16470       thrjam(jambuf);
16471       goto error;
16472     }
16473   }
16474 
16475   tabPtr.p->m_scan_count[0]++;
16476   ndbrequire(tabPtr.p->m_map_ptr_i != DihScanTabConf::InvalidCookie);
16477   {
16478     DihScanTabConf* conf = (DihScanTabConf*)signal->getDataPtrSend();
16479     conf->tableId = tabPtr.i;
16480     conf->senderData = 0; /* 0 indicates success */
16481     /**
16482      * For Fully replicated tables the totalfragments means the total
16483      * number of fragments including the copy fragment. Here however
16484      * we should respond with the real fragment count which is either
16485      * 1 or the number of LDMs dependent on which partition balance
16486      * the table was created with.
16487      *
16488      * partitionCount works also for other tables. We always scan
16489      * the real fragments when scanning all fragments and those
16490      * are always the first fragments in the interface to DIH.
16491      */
16492     conf->fragmentCount = tabPtr.p->partitionCount;
16493 
16494     conf->noOfBackups = tabPtr.p->noOfBackups;
16495     conf->scanCookie = tabPtr.p->m_map_ptr_i;
16496     conf->reorgFlag = tabPtr.p->m_scan_reorg_flag;
16497     NdbMutex_Unlock(&tabPtr.p->theMutex);
16498     return;
16499   }
16500 
16501 error:
16502   DihScanTabRef* ref = (DihScanTabRef*)signal->getDataPtrSend();
16503   ref->tableId = tabPtr.i;
16504   ref->senderData = 1; /* 1 indicates failure */
16505   ref->error = DihScanTabRef::ErroneousTableState;
16506   ref->tableStatus = tabPtr.p->tabStatus;
16507   ref->schemaTransId = schemaTransId;
16508   NdbMutex_Unlock(&tabPtr.p->theMutex);
16509   return;
16510 }
16511 
void
Dbdih::complete_scan_on_table(TabRecordPtr tabPtr,
                              Uint32 map_ptr_i,
                              EmulatedJamBuffer *jambuf)
{
  /**
   * This method is called from other TC threads to signal that a
   * scan is completed. We keep track of number of outstanding scans
   * in two variables for old and new metadata (normally there is
   * only new metadata, but during changes we need this to ensure
   * that scans can continue also during schema changes).
   *
   * map_ptr_i is the scan cookie handed out by start_scan_on_table;
   * matching it against the current m_map_ptr_i tells whether this
   * scan used the current (count slot 0) or the previous
   * (count slot 1) metadata.
   */

  Uint32 line;
  NdbMutex_Lock(&tabPtr.p->theMutex);
  if (map_ptr_i == tabPtr.p->m_map_ptr_i)
  {
    /* Scan ran on the current metadata. */
    line = __LINE__;
    ndbassert(tabPtr.p->m_scan_count[0]);
    tabPtr.p->m_scan_count[0]--;
  }
  else
  {
    /* Scan ran on the old metadata (during a schema change). */
    line = __LINE__;
    ndbassert(tabPtr.p->m_scan_count[1]);
    tabPtr.p->m_scan_count[1]--;
  }
  NdbMutex_Unlock(&tabPtr.p->theMutex);
  /* Record which branch was taken; emitted after releasing the
   * mutex to keep the critical section short. */
  thrjamLine(jambuf, line);
}
16542 
bool
Dbdih::prepare_add_table(TabRecordPtr tabPtr,
                         ConnectRecordPtr connectPtr,
                         Signal *signal)
{
  /**
   * Prepare phase of adding a table (DIADDTABREQ): initialise the
   * table record from the request under the table mutex.
   *
   * Returns true when the table is already TS_ACTIVE — the
   * non-master node/system restart path where table and fragment
   * info was copied from the master — in which case fragment
   * creation has already been started via sendAddFragreq.
   * Returns false when the caller must continue the normal
   * create-table path.
   */
  DiAddTabReq * const req = (DiAddTabReq*)signal->getDataPtr();
  D("prepare_add_table tableId = " << tabPtr.i << " primaryTableId: " <<
    req->primaryTableId);

  NdbMutex_Lock(&tabPtr.p->theMutex);
  tabPtr.p->connectrec = connectPtr.i;
  tabPtr.p->tableType = req->tableType;
  tabPtr.p->schemaVersion = req->schemaVersion;
  tabPtr.p->primaryTableId = req->primaryTableId;
  tabPtr.p->schemaTransId = req->schemaTransId;
  tabPtr.p->m_scan_count[0] = 0;
  tabPtr.p->m_scan_count[1] = 0;
  tabPtr.p->m_scan_reorg_flag = 0;
  tabPtr.p->m_flags = 0;

  if (req->fullyReplicated)
  {
    jam();
    /* Fully replicated table: partition count comes from the request. */
    tabPtr.p->m_flags |= TabRecord::TF_FULLY_REPLICATED;
    tabPtr.p->partitionCount = req->partitionCount;
    D("fully replicated, partitionCount = " <<
      tabPtr.p->partitionCount);
  }
  else if (req->primaryTableId != RNIL)
  {
    jam();
    /**
     * Secondary table (e.g. an index): inherit the fully-replicated
     * flag and partition count from its primary table.
     */
    TabRecordPtr primTabPtr;
    primTabPtr.i = req->primaryTableId;
    ptrCheckGuard(primTabPtr, ctabFileSize, tabRecord);
    tabPtr.p->m_flags |= (primTabPtr.p->m_flags&TabRecord::TF_FULLY_REPLICATED);
    tabPtr.p->partitionCount = primTabPtr.p->partitionCount;
    D("Non-primary, m_flags: " << tabPtr.p->m_flags <<
      " partitionCount: " << tabPtr.p->partitionCount);
  }
  else
  {
    jam();
    /* Ordinary table: partition count straight from the request. */
    tabPtr.p->partitionCount = req->partitionCount;
  }
  if (tabPtr.p->tabStatus == TabRecord::TS_ACTIVE)
  {
    /**
     * This code is used for starting non-master nodes in both System Restarts
     * and Node Restarts. The table and fragmentation information have been
     * copied from master node using COPY_TABREQ signals. We are ready to
     * add fragments and continue with creation of the tables.
     *
     * tabLcpActiveFragments is setup as part of reading table and
     * fragment information from disk. So we should not reset it to 0 here.
     */
    jam();
    tabPtr.p->tabStatus = TabRecord::TS_CREATING;
    NdbMutex_Unlock(&tabPtr.p->theMutex);
    /* NOTE(review): totalfragments is read after releasing the mutex
     * here — presumably safe since no concurrent scans exist during
     * this restart phase; confirm. */
    connectPtr.p->m_alter.m_totalfragments = tabPtr.p->totalfragments;
    sendAddFragreq(signal, connectPtr, tabPtr, 0, false);
    return true;
  }
  else
  {
    jam();
    /* New table added, ensure that tabActiveLcpFragments is initialised. */
    tabPtr.p->tabActiveLcpFragments = 0;
  }
  NdbMutex_Unlock(&tabPtr.p->theMutex);
  return false;
}
16614 
void
Dbdih::commit_new_table(TabRecordPtr tabPtr)
{
  /**
   * Commit a prepared table: mark it active and clear the schema
   * transaction id.
   *
   * Normally this signal arrives as part of CREATE TABLE and then
   * DBTC haven't been informed of the table being available yet
   * and no protection is needed. It is however also used for
   * Table reorganisation and in that case the table is fully
   * available to DBTC and we need to protect the change here
   * to ensure that DIH_SCAN_TAB_REQ sees a correct view of
   * these variables.
   */
  D("commit_new_table: tableId = " << tabPtr.i);
  NdbMutex_Lock(&tabPtr.p->theMutex);
  tabPtr.p->tabStatus = TabRecord::TS_ACTIVE;
  tabPtr.p->schemaTransId = 0;
  NdbMutex_Unlock(&tabPtr.p->theMutex);
}
16633 
16634 /**
16635  * start_add_fragments_in_new_table is called during prepare phase of
16636  * an ALTER TABLE reorg. It sets up new data structures for the new
16637  * fragments and starts up the calling of those to actually create
16638  * the new fragments. The only reason this method is protected is
16639  * because it touches some of the data structures used to get table
16640  * distribution.
16641  */
void
Dbdih::start_add_fragments_in_new_table(TabRecordPtr tabPtr,
                                        ConnectRecordPtr connectPtr,
                                        const Uint16 buf[],
                                        Signal *signal)
{
  /**
   * Prepare phase of ALTER TABLE reorg: extend the table's fragment
   * data structures with the new fragments and kick off their
   * creation via sendAddFragreq.
   *
   * We need to protect these changes to the node and fragment view of
   * the table since DBTC can see the table through these changes
   * and thus both the mutex and the RCU mechanism is required here to
   * ensure that DBTC sees a consistent view of the data.
   */
  D("start_add_fragments_in_new_table: tableId = " << tabPtr.i);
  Uint32 err;
  NdbMutex_Lock(&tabPtr.p->theMutex);
  DIH_TAB_WRITE_LOCK(tabPtr.p);

  /* Remember the pre-reorg fragment count so it can be restored
   * (error path) or hidden from readers (success path). */
  Uint32 save = tabPtr.p->totalfragments;
  if ((err = add_fragments_to_table(tabPtr, buf)))
  {
    jam();
    /* Failure: add_fragments_to_table must have left the table
     * unchanged; release locks, send the ref and drop the
     * connect record. */
    DIH_TAB_WRITE_UNLOCK(tabPtr.p);
    NdbMutex_Unlock(&tabPtr.p->theMutex);
    ndbrequire(tabPtr.p->totalfragments == save);
    ndbrequire(connectPtr.p->m_alter.m_org_totalfragments == save);
    send_alter_tab_ref(signal, tabPtr, connectPtr, err);

    ndbrequire(tabPtr.p->connectrec == connectPtr.i);
    tabPtr.p->connectrec = RNIL;
    release_connect(connectPtr);
    return;
  }

  tabPtr.p->tabCopyStatus = TabRecord::CS_ALTER_TABLE;
  connectPtr.p->m_alter.m_totalfragments = tabPtr.p->totalfragments;
  if ((tabPtr.p->m_flags & TabRecord::TF_FULLY_REPLICATED) == 0)
  {
    jam();
    /* For non-fully-replicated tables the partition count follows
     * the (new) total fragment count. */
    connectPtr.p->m_alter.m_partitionCount = tabPtr.p->totalfragments;
  }
  /* Don't make the new fragments available just yet. */
  tabPtr.p->totalfragments = save;
  NdbMutex_Unlock(&tabPtr.p->theMutex);

  /* NOTE(review): the RCU write lock is deliberately (?) held across
   * sendAddFragreq and only released below, while the mutex is
   * already released above — confirm this asymmetry is intended. */
  sendAddFragreq(signal,
                 connectPtr,
                 tabPtr,
                 connectPtr.p->m_alter.m_org_totalfragments,
                 true);

  DIH_TAB_WRITE_UNLOCK(tabPtr.p);
  return;
}
16695 
16696 /**
16697  * make_new_table_writeable starts off the copy phase. From here on the
16698  * copy triggers for reorg is activated. The new hash map is installed.
16699  * The new copy fragments are installed for fully replicated tables to
16700  * ensure that they are replicated to during each update of rows in the
16701  * fully replicated table.
16702  *
16703  * The new fragments are still not readable, they are only writeable. This
16704  * is secured by not changing totalfragments.
16705  */
void
Dbdih::make_new_table_writeable(TabRecordPtr tabPtr,
                                ConnectRecordPtr connectPtr,
                                bool rcu_lock_held)
{
  /**
   * Start of the copy phase of a reorg: make the new fragments
   * writeable (but not yet readable).
   *
   * rcu_lock_held indicates the caller already holds the table's
   * RCU write lock; otherwise it is taken here.
   */
  D("make_new_table_writeable: tableId = " << tabPtr.i);
  if (!rcu_lock_held)
  {
    jam();
    DIH_TAB_WRITE_LOCK(tabPtr.p);
  }
  /**
   * At this point the new table fragments must be updated at proper times.
   * For tables without full replication this simply means setting the
   * value of the new_map_ptr_i referring to the new hash map. This hash
   * map will be used to point to new fragments for some rows.
   *
   * For fully replicated tables we must insert the new fragments into
   * list of copy fragments. These will still not be seen by readers
   * since we never return a fragment id larger than the totalfragments
   * variable.
   */
  if ((tabPtr.p->m_flags & TabRecord::TF_FULLY_REPLICATED) != 0 &&
       tabPtr.p->totalfragments <
       connectPtr.p->m_alter.m_totalfragments)
  {
    /* Link each new fragment into the copy-fragment chain. */
    for (Uint32 i = tabPtr.p->totalfragments;
         i < connectPtr.p->m_alter.m_totalfragments;
         i++)
    {
      jam();
      FragmentstorePtr fragPtr;
      getFragstore(tabPtr.p, i, fragPtr);
      insertCopyFragmentList(tabPtr.p, fragPtr.p, i);
    }
  }
  /* Memory barrier: the chain updates above must be visible before
   * the new hash map pointer is published to readers. */
  mb();
  tabPtr.p->m_new_map_ptr_i = connectPtr.p->m_alter.m_new_map_ptr_i;
  if (!rcu_lock_held)
  {
    DIH_TAB_WRITE_UNLOCK(tabPtr.p);
    jam();
  }
}
16750 
16751 /**
16752  * make_new_table_read_and_writeable
16753  * ---------------------------------
16754  * Here we need to protect both using the table mutex and the RCU
16755  * mechanism. We want DIH_SCAN_TAB_REQ to see a correct combination
16756  * of those variables as protected by the mutex and we want
16757  * DIGETNODESREQ to see a protected and consistent view of its variables.
16758  *
16759  * At this point for an ALTER TABLE reorg we have completed copying the
16760  * data, so the new table distribution is completely ok to use. We thus
16761  * change the totalfragments to make the new fragments available for
16762  * both read and write.
16763  * We swap in the new hash map (so far only hash-map tables have support
16764  * for on-line table reorg), the old still exists for a while more.
16765  *
16766  * At this point we need to start waiting for old scans using the old
16767  * number of fragments to complete.
16768 */
void
Dbdih::make_new_table_read_and_writeable(TabRecordPtr tabPtr,
                                         ConnectRecordPtr connectPtr,
                                         Signal *signal)
{
  /**
   * Make the reorganised table distribution fully visible: the new
   * fragments become available for both reads and writes. Both the
   * table mutex and the RCU write lock are held so DIH_SCAN_TAB_REQ
   * and DIGETNODESREQ each see a consistent view (see the block
   * comment above this function).
   */
  jam();
  D("make_new_table_read_and_writeable tableId: " << tabPtr.i);
  NdbMutex_Lock(&tabPtr.p->theMutex);
  DIH_TAB_WRITE_LOCK(tabPtr.p);
  tabPtr.p->totalfragments = connectPtr.p->m_alter.m_totalfragments;
  tabPtr.p->partitionCount = connectPtr.p->m_alter.m_partitionCount;
  if (AlterTableReq::getReorgFragFlag(connectPtr.p->m_alter.m_changeMask))
  {
    jam();
    /* Swap in the new hash map; keep the old one in m_new_map_ptr_i
     * for scans still using the old distribution. */
    Uint32 save = tabPtr.p->m_map_ptr_i;
    tabPtr.p->m_map_ptr_i = tabPtr.p->m_new_map_ptr_i;
    tabPtr.p->m_new_map_ptr_i = save;

    /* Bump each fragment's distribution key (mod 256) so TC/LQH can
     * detect the changed distribution. */
    for (Uint32 i = 0; i<tabPtr.p->totalfragments; i++)
    {
      jam();
      FragmentstorePtr fragPtr;
      getFragstore(tabPtr.p, i, fragPtr);
      fragPtr.p->distributionKey = (fragPtr.p->distributionKey + 1) & 0xFF;
    }
    DIH_TAB_WRITE_UNLOCK(tabPtr.p);

    /* These variables are only protected by mutex. */
    /* Move all outstanding scans to the "old metadata" slot and flag
     * the reorg so new scans know about it. */
    ndbassert(tabPtr.p->m_scan_count[1] == 0);
    tabPtr.p->m_scan_count[1] = tabPtr.p->m_scan_count[0];
    tabPtr.p->m_scan_count[0] = 0;
    tabPtr.p->m_scan_reorg_flag = 1;
    NdbMutex_Unlock(&tabPtr.p->theMutex);

    /* The connect record is kept in the reorg case; it is released
     * later once waiting for old scans completes (see
     * make_old_table_non_writeable). */
    send_alter_tab_conf(signal, connectPtr);
    return;
  }

  DIH_TAB_WRITE_UNLOCK(tabPtr.p);
  NdbMutex_Unlock(&tabPtr.p->theMutex);
  send_alter_tab_conf(signal, connectPtr);
  ndbrequire(tabPtr.p->connectrec == connectPtr.i);
  tabPtr.p->connectrec = RNIL;
  release_connect(connectPtr);
}
16814 
16815 /**
16816  * We need to ensure that all scans after this signal sees
16817  * the new m_scan_reorg_flag to ensure that we don't have
16818  * races where scans use this flag in an incorrect manner.
16819  * It is protected by mutex, so requires a mutex protecting
16820  * it, m_new_map_ptr_i is only protected by the RCU mechanism
16821  * and not by the mutex.
16822  *
16823  * At this point the ALTER TABLE is completed and any old scans
16824  * using the old table distribution is completed and we can
16825  * drop the old hash map.
16826  */
bool
Dbdih::make_old_table_non_writeable(TabRecordPtr tabPtr,
                                    ConnectRecordPtr connectPtr)
{
  /**
   * Final step of ALTER TABLE reorg: drop the old hash map and clear
   * the reorg flag (see the block comment above this function).
   * Returns true when the caller must wait for outstanding scans
   * (those possibly using REORG_NOT_MOVED) to drain first.
   */
  bool wait_flag = false;
  D("make_old_table_non_writeable: tableId = " << tabPtr.i);
  NdbMutex_Lock(&tabPtr.p->theMutex);
  DIH_TAB_WRITE_LOCK(tabPtr.p);
  tabPtr.p->m_new_map_ptr_i = RNIL;
  tabPtr.p->m_scan_reorg_flag = 0;
  if (AlterTableReq::getReorgFragFlag(connectPtr.p->m_alter.m_changeMask))
  {
    /**
     * To ensure that we don't have any outstanding scans with
     * REORG_NOT_MOVED flag set we also start waiting for those
     * scans to complete here.
     */
    ndbassert(tabPtr.p->m_scan_count[1] == 0);
    tabPtr.p->m_scan_count[1] = tabPtr.p->m_scan_count[0];
    tabPtr.p->m_scan_count[0] = 0;
    wait_flag = true;
  }
  DIH_TAB_WRITE_UNLOCK(tabPtr.p);
  NdbMutex_Unlock(&tabPtr.p->theMutex);

  /* Detach and release the connect record; the reorg is done. */
  ndbrequire(tabPtr.p->connectrec == connectPtr.i);
  tabPtr.p->connectrec = RNIL;
  release_connect(connectPtr);
  return wait_flag;
}
16857 
16858 /**
16859  * During node recovery a replica is first installed as
16860  * a new writeable replica. Then when committing this
16861  * the fragment replica is also readable.
16862  */
void
Dbdih::make_table_use_new_replica(TabRecordPtr tabPtr,
                                  FragmentstorePtr fragPtr,
                                  ReplicaRecordPtr replicaPtr,
                                  Uint32 replicaType,
                                  Uint32 destNodeId)
{
  /**
   * Apply an UpdateFragStateReq state change for one replica of one
   * fragment, under the table's RCU write lock so that readers in
   * TC threads always see a consistent replica set.
   *
   * replicaType is one of UpdateFragStateReq::STORED,
   * COMMIT_STORED or START_LOGGING; destNodeId is the node hosting
   * the new replica (used only for STORED).
   */
  D("make_table_use_new_replica: tableId: " << tabPtr.i <<
    " fragId = " << fragPtr.p->fragId <<
    " replicaType = " << replicaType <<
    " destNodeId = " << destNodeId);

  DIH_TAB_WRITE_LOCK(tabPtr.p);
  switch (replicaType) {
  case UpdateFragStateReq::STORED:
    jam();
    CRASH_INSERTION(7138);
    /* ----------------------------------------------------------------------*/
    /*  HERE WE ARE INSERTING THE NEW BACKUP NODE IN THE EXECUTION OF ALL    */
    /*  OPERATIONS. FROM HERE ON ALL OPERATIONS ON THIS FRAGMENT WILL INCLUDE*/
    /*  USE OF THE NEW REPLICA.                                              */
    /* --------------------------------------------------------------------- */
    insertBackup(fragPtr, destNodeId);

    /* Bump distribution key (mod 256) to signal the changed set. */
    fragPtr.p->distributionKey++;
    fragPtr.p->distributionKey &= 255;
    break;
  case UpdateFragStateReq::COMMIT_STORED:
    jam();
    CRASH_INSERTION(7139);
    /* ----------------------------------------------------------------------*/
    /*  HERE WE ARE MOVING THE REPLICA TO THE STORED SECTION SINCE IT IS NOW */
    /*  FULLY LOADED WITH ALL DATA NEEDED.                                   */
    // We also update the order of the replicas here so that if the new
    // replica is the desired primary we insert it as primary.
    /* ----------------------------------------------------------------------*/
    removeOldStoredReplica(fragPtr, replicaPtr);
    linkStoredReplica(fragPtr, replicaPtr);
    updateNodeInfo(fragPtr);
    break;
  case UpdateFragStateReq::START_LOGGING:
    jam();
    /* No fragment-record change needed for this transition. */
    break;
  default:
    ndbabort();
  }//switch
  DIH_TAB_WRITE_UNLOCK(tabPtr.p);
}
16911 
16912 /**
16913  * Switch in the new primary replica. This is used to ensure that
16914  * the primary replicas are balanced over all nodes.
16915  */
16916 void
make_table_use_new_node_order(TabRecordPtr tabPtr,FragmentstorePtr fragPtr,Uint32 numReplicas,Uint32 * newNodeOrder)16917 Dbdih::make_table_use_new_node_order(TabRecordPtr tabPtr,
16918                                      FragmentstorePtr fragPtr,
16919                                      Uint32 numReplicas,
16920                                      Uint32 *newNodeOrder)
16921 {
16922   D("make_table_use_new_node_order: tableId = " << tabPtr.i <<
16923     " fragId = " << fragPtr.p->fragId);
16924 
16925   DIH_TAB_WRITE_LOCK(tabPtr.p);
16926   for (Uint32 i = 0; i < numReplicas; i++)
16927   {
16928     jam();
16929     ndbrequire(i < MAX_REPLICAS);
16930     fragPtr.p->activeNodes[i] = newNodeOrder[i];
16931   }//for
16932   DIH_TAB_WRITE_UNLOCK(tabPtr.p);
16933 }
16934 
16935 /**
16936  * Remove new hash map during rollback of ALTER TABLE REORG.
16937  */
void
Dbdih::make_new_table_non_writeable(TabRecordPtr tabPtr)
{
  /**
   * Rollback of ALTER TABLE REORG: un-publish the new hash map so
   * no further writes are routed to the new fragments. Protected by
   * the RCU write lock since DIGETNODESREQ reads m_new_map_ptr_i.
   */
  D("make_new_table_non_writeable: tableId = " << tabPtr.i);
  DIH_TAB_WRITE_LOCK(tabPtr.p);
  tabPtr.p->m_new_map_ptr_i = RNIL;
  DIH_TAB_WRITE_UNLOCK(tabPtr.p);
}
16946 
16947 /**
16948  * Drop fragments as part of rollback of ALTER TABLE REORG.
16949  */
16950 void
drop_fragments_from_new_table_view(TabRecordPtr tabPtr,ConnectRecordPtr connectPtr)16951 Dbdih::drop_fragments_from_new_table_view(TabRecordPtr tabPtr,
16952                                           ConnectRecordPtr connectPtr)
16953 {
16954   D("drop_fragments_from_new_table_view: tableId = " << tabPtr.i);
16955   Uint32 new_frags = connectPtr.p->m_alter.m_totalfragments;
16956   Uint32 org_frags = connectPtr.p->m_alter.m_org_totalfragments;
16957 
16958   /**
16959    * We need to manipulate the table distribution and we want to ensure
16960    * DBTC sees a consistent view of these changes. We affect both data
16961    * used by DIGETNODES and DIH_SCAN_TAB_REQ, so both mutex and RCU lock
16962    * need to be held.
16963    */
16964   NdbMutex_Lock(&tabPtr.p->theMutex);
16965   DIH_TAB_WRITE_LOCK(tabPtr.p);
16966 
16967   tabPtr.p->totalfragments = new_frags;
16968   for (Uint32 i = new_frags - 1; i >= org_frags; i--)
16969   {
16970     jam();
16971     release_fragment_from_table(tabPtr, i);
16972   }
16973   NdbMutex_Unlock(&tabPtr.p->theMutex);
16974   DIH_TAB_WRITE_UNLOCK(tabPtr.p);
16975   connectPtr.p->m_alter.m_totalfragments = org_frags;
16976   D("5: totalfragments = " << org_frags);
16977 }
16978 
16979 void
getFragstore(const TabRecord * tab,Uint32 fragNo,FragmentstorePtr & fragptr)16980 Dbdih::getFragstore(const TabRecord * tab,      //In parameter
16981                     Uint32 fragNo,              //In parameter
16982                     FragmentstorePtr & fragptr) //Out parameter
16983 {
16984   FragmentstorePtr fragPtr;
16985   Uint32 TfragstoreFileSize = cfragstoreFileSize;
16986   Fragmentstore* TfragStore = fragmentstore;
16987   Uint32 chunkNo = fragNo >> LOG_NO_OF_FRAGS_PER_CHUNK;
16988   Uint32 chunkIndex = fragNo & (NO_OF_FRAGS_PER_CHUNK - 1);
16989   fragPtr.i = tab->startFid[chunkNo] + chunkIndex;
16990   if (likely(chunkNo < NDB_ARRAY_SIZE(tab->startFid))) {
16991     ptrCheckGuard(fragPtr, TfragstoreFileSize, TfragStore);
16992     fragptr = fragPtr;
16993     return;
16994   }//if
16995   ndbabort();
16996 }//Dbdih::getFragstore()
16997 
16998 void
getFragstoreCanFail(const TabRecord * tab,Uint32 fragNo,FragmentstorePtr & fragptr)16999 Dbdih::getFragstoreCanFail(const TabRecord * tab,      //In parameter
17000                            Uint32 fragNo,              //In parameter
17001                            FragmentstorePtr & fragptr) //Out parameter
17002 {
17003   FragmentstorePtr fragPtr;
17004   Uint32 TfragstoreFileSize = cfragstoreFileSize;
17005   Fragmentstore* TfragStore = fragmentstore;
17006   Uint32 chunkNo = fragNo >> LOG_NO_OF_FRAGS_PER_CHUNK;
17007   Uint32 chunkIndex = fragNo & (NO_OF_FRAGS_PER_CHUNK - 1);
17008   fragPtr.i = tab->startFid[chunkNo] + chunkIndex;
17009   if (likely(chunkNo < NDB_ARRAY_SIZE(tab->startFid)))
17010   {
17011     if (fragPtr.i < TfragstoreFileSize)
17012     {
17013       ptrAss(fragPtr, TfragStore);
17014       fragptr = fragPtr;
17015       return;
17016     }
17017   }//if
17018   fragptr.i = RNIL;
17019   fragptr.p = NULL;
17020 }//Dbdih::getFragstoreCanFail()
17021 
17022 /**
17023  * End of TRANSACTION MODULE
17024  * -------------------------
17025  */
17026 
17027 /**
17028  * When this is called DBTC isn't made aware of the table just yet, so no
17029  * need to protect anything here from DBTC's view.
17030  */
/**
 * Allocate fragment records for a table from the global chunk free list.
 * Chunks of NO_OF_FRAGS_PER_CHUNK records are unlinked from
 * cfirstfragstore; the first record index of each chunk is remembered in
 * tabPtr.p->startFid[] so getFragstore() can translate fragment numbers.
 * Every record in every allocated chunk is (re)initialised, so allocation
 * is rounded up to whole chunks.
 */
void Dbdih::allocFragments(Uint32 noOfFragments, TabRecordPtr tabPtr)
{
  FragmentstorePtr fragPtr;
  // Round up to whole chunks.
  Uint32 noOfChunks = (noOfFragments + (NO_OF_FRAGS_PER_CHUNK - 1)) >> LOG_NO_OF_FRAGS_PER_CHUNK;
  ndbrequire(cremainingfrags >= noOfFragments);
  Uint32 fragId = 0;
  for (Uint32 i = 0; i < noOfChunks; i++) {
    jam();
    // Unlink the head chunk from the free list.
    Uint32 baseFrag = cfirstfragstore;
    ndbrequire(i < NDB_ARRAY_SIZE(tabPtr.p->startFid));
    tabPtr.p->startFid[i] = baseFrag;
    fragPtr.i = baseFrag;
    ptrCheckGuard(fragPtr, cfragstoreFileSize, fragmentstore);
    cfirstfragstore = fragPtr.p->nextFragmentChunk;
    cremainingfrags -= NO_OF_FRAGS_PER_CHUNK;
    // Initialise every record in the chunk with consecutive fragment ids.
    for (Uint32 j = 0; j < NO_OF_FRAGS_PER_CHUNK; j++) {
      jam();
      fragPtr.i = baseFrag + j;
      ptrCheckGuard(fragPtr, cfragstoreFileSize, fragmentstore);
      initFragstore(fragPtr, fragId);
      fragId++;
    }//for
  }//for
  tabPtr.p->noOfFragChunks = noOfChunks;
}//Dbdih::allocFragments()
17056 
17057 /**
17058  * No need to protect anything from DBTC here, table is in last part
17059  * of being dropped and has been removed from DBTC's view long time
17060  * ago.
17061  */
/**
 * Return all fragment chunks owned by a table to the global free list
 * (cfirstfragstore), the inverse of allocFragments(). Each chunk is
 * pushed on the front of the free list and the table's startFid[] entry
 * is cleared to RNIL.
 */
void Dbdih::releaseFragments(TabRecordPtr tabPtr)
{
  FragmentstorePtr fragPtr;
  for (Uint32 i = 0; i < tabPtr.p->noOfFragChunks; i++) {
    jam();
    ndbrequire(i < NDB_ARRAY_SIZE(tabPtr.p->startFid));
    Uint32 baseFrag = tabPtr.p->startFid[i];
    fragPtr.i = baseFrag;
    ptrCheckGuard(fragPtr, cfragstoreFileSize, fragmentstore);
    // Push chunk on the front of the free list.
    fragPtr.p->nextFragmentChunk = cfirstfragstore;
    cfirstfragstore = baseFrag;
    tabPtr.p->startFid[i] = RNIL;
    cremainingfrags += NO_OF_FRAGS_PER_CHUNK;
  }//for
  tabPtr.p->noOfFragChunks = 0;
}//Dbdih::releaseFragments()
17078 
/**
 * One-time initialisation of the fragment record pool: reset every
 * record, then link the pool into a free list of
 * NO_OF_FRAGS_PER_CHUNK-sized chunks headed by cfirstfragstore, with
 * cremainingfrags counting the free records.
 */
void Dbdih::initialiseFragstore()
{
  Uint32 i;
  FragmentstorePtr fragPtr;
  // Reset every record in the pool.
  for (i = 0; i < cfragstoreFileSize; i++) {
    fragPtr.i = i;
    ptrCheckGuard(fragPtr, cfragstoreFileSize, fragmentstore);
    initFragstore(fragPtr, 0);
  }//for
  // Build the chunk free list; each chunk's first record links to the
  // previously-linked chunk.
  Uint32 noOfChunks = cfragstoreFileSize >> LOG_NO_OF_FRAGS_PER_CHUNK;
  fragPtr.i = 0;
  cfirstfragstore = RNIL;
  cremainingfrags = 0;
  for (i = 0; i < noOfChunks; i++) {
    // This may be a long loop at startup; keep the watchdog happy.
    refresh_watch_dog();
    ptrCheckGuard(fragPtr, cfragstoreFileSize, fragmentstore);
    fragPtr.p->nextFragmentChunk = cfirstfragstore;
    cfirstfragstore = fragPtr.i;
    fragPtr.i += NO_OF_FRAGS_PER_CHUNK;
    cremainingfrags += NO_OF_FRAGS_PER_CHUNK;
  }//for
}//Dbdih::initialiseFragstore()
17101 
17102 #ifndef NDB_HAVE_RMB
17103 #define rmb() do { } while (0)
17104 #endif
17105 
17106 #ifndef NDB_HAVE_WMB
17107 #define wmb() do { } while (0)
17108 #endif
17109 
17110 inline
17111 bool
isEmpty(const DIVERIFY_queue & q)17112 Dbdih::isEmpty(const DIVERIFY_queue & q)
17113 {
17114   /* read barrier, not for ordering but to try force fresh read */
17115   rmb();
17116   return q.cfirstVerifyQueue == q.clastVerifyQueue;
17117 }
17118 
/**
 * Producer side of the DIVERIFY queue: publish one more entry by
 * advancing the tail index (a free-running Uint32 counter; emptiness is
 * head == tail, see isEmpty()).
 */
inline
void
Dbdih::enqueue(DIVERIFY_queue & q)
{
#ifndef NDEBUG
  /**
   * - assert only
   * - we must read first *before* "publishing" last
   *   or else DIH-thread could already have consumed entry
   *   when we call assert
   */
  Uint32 first = q.cfirstVerifyQueue;
#endif

  Uint32 last = q.clastVerifyQueue;

  q.clastVerifyQueue = last + 1;

  /* barrier to flush writes */
  wmb();
  // If the new tail equals the head read above, the queue wrapped onto an
  // unconsumed entry, i.e. it overflowed.
  assert(q.clastVerifyQueue != first);
}
17141 
17142 inline
17143 void
dequeue(DIVERIFY_queue & q)17144 Dbdih::dequeue(DIVERIFY_queue & q)
17145 {
17146   Uint32 first = q.cfirstVerifyQueue;
17147 
17148   q.cfirstVerifyQueue = first + 1;
17149 
17150   /* barrier to flush writes */
17151   wmb();
17152 }
17153 
17154 /*
17155   3.9   V E R I F I C A T I O N
17156   ****************************=
17157   */
17158 /****************************************************************************/
17159 /* **********     VERIFICATION SUB-MODULE                       *************/
17160 /****************************************************************************/
17161 /*
17162   3.9.1     R E C E I V I N G  O F  V E R I F I C A T I O N   R E Q U E S T
17163   *************************************************************************
17164   */
/**
 * DIVERIFYREQ from DBTC (invoked via EXECUTE_DIRECT; the caller's jam
 * buffer is passed in theData[2..3]). If commits are not blocked, the
 * current GCI is returned immediately in the signal. If commits are
 * blocked (GCP protocol in a critical phase), the request is queued on
 * the caller's per-queue DIVERIFY_queue and theData[3] != 0 signals
 * "no immediate return" to TC.
 */
void Dbdih::execDIVERIFYREQ(Signal* signal)
{
  EmulatedJamBuffer * jambuf = * (EmulatedJamBuffer**)(signal->theData+2);
  thrjamEntry(jambuf);
  Uint32 qno = signal->theData[1];
  ndbassert(qno < NDB_ARRAY_SIZE(c_diverify_queue));
  DIVERIFY_queue & q = c_diverify_queue[qno];
loop:
  // Seqlock-style read: retry from `loop` if a writer changed
  // m_micro_gcp under us (read_unlock returns false).
  Uint32 val = m_micro_gcp.m_lock.read_lock();
  Uint32 blocked = getBlockCommit() == true ? 1 : 0;
  if (blocked == 0)
  {
    thrjam(jambuf);
    /*-----------------------------------------------------------------------*/
    // We are not blocked so we can simply reply back to TC immediately. The
    // method was called with EXECUTE_DIRECT so we reply back by setting signal
    // data and returning.
    // theData[0] already contains the correct information so
    // we need not touch it.
    /*-----------------------------------------------------------------------*/
    signal->theData[1] = (Uint32)(m_micro_gcp.m_current_gci >> 32);
    signal->theData[2] = (Uint32)(m_micro_gcp.m_current_gci & 0xFFFFFFFF);
    signal->theData[3] = 0;
    if (unlikely(! m_micro_gcp.m_lock.read_unlock(val)))
      goto loop;
    return;
  }//if
  /*-------------------------------------------------------------------------*/
  // Since we are blocked we need to put this operation last in the verify
  // queue to ensure that operation starts up in the correct order.
  /*-------------------------------------------------------------------------*/
  enqueue(q);
  signal->theData[3] = blocked + 1; // Indicate no immediate return
  return;
}//Dbdih::execDIVERIFYREQ()
17200 
execDIH_SCAN_TAB_REQ(Signal * signal)17201 void Dbdih::execDIH_SCAN_TAB_REQ(Signal* signal)
17202 {
17203   DihScanTabReq * req = (DihScanTabReq*)signal->getDataPtr();
17204   EmulatedJamBuffer * jambuf = (EmulatedJamBuffer*)req->jamBufferPtr;
17205 
17206   thrjamEntry(jambuf);
17207 
17208   TabRecordPtr tabPtr;
17209   tabPtr.i = req->tableId;
17210   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
17211 
17212   start_scan_on_table(tabPtr, signal, req->schemaTransId, jambuf);
17213   return;
17214 }//Dbdih::execDIH_SCAN_TAB_REQ()
17215 
17216 void
execDIH_SCAN_TAB_COMPLETE_REP(Signal * signal)17217 Dbdih::execDIH_SCAN_TAB_COMPLETE_REP(Signal* signal)
17218 {
17219   DihScanTabCompleteRep* rep = (DihScanTabCompleteRep*)signal->getDataPtr();
17220   EmulatedJamBuffer * jambuf = (EmulatedJamBuffer*)rep->jamBufferPtr;
17221 
17222   TabRecordPtr tabPtr;
17223   tabPtr.i = rep->tableId;
17224   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
17225 
17226   complete_scan_on_table(tabPtr, rep->scanCookie, jambuf);
17227 }
17228 
17229 
17230 /****************************************************************************/
17231 /* **********     GLOBAL-CHECK-POINT HANDLING  MODULE           *************/
17232 /****************************************************************************/
17233 /*
17234   3.10   G L O B A L  C H E C K P O I N T ( IN  M A S T E R  R O L E)
17235   *******************************************************************
17236   */
17237 
17238 bool
check_enable_micro_gcp(Signal * signal,bool broadcast)17239 Dbdih::check_enable_micro_gcp(Signal* signal, bool broadcast)
17240 {
17241   ndbassert(m_micro_gcp.m_enabled == false);
17242   ndbassert(NodeVersionInfo::DataLength == 6);
17243   Uint32 min = ~(Uint32)0;
17244   const NodeVersionInfo& info = getNodeVersionInfo();
17245   for (Uint32 i = 0; i<3; i++)
17246   {
17247     Uint32 tmp = info.m_type[i].m_min_version;
17248     if (tmp)
17249     {
17250       min = (min < tmp) ? min : tmp;
17251     }
17252   }
17253 
17254   {
17255     jam();
17256     m_micro_gcp.m_enabled = true;
17257 
17258     infoEvent("Enabling micro GCP");
17259     if (broadcast)
17260     {
17261       jam();
17262       UpgradeProtocolOrd * ord = (UpgradeProtocolOrd*)signal->getDataPtrSend();
17263       ord->type = UpgradeProtocolOrd::UPO_ENABLE_MICRO_GCP;
17264 
17265       /**
17266        * We need to notify all ndbd's or they'll get confused!
17267        */
17268       NodeRecordPtr specNodePtr;
17269       specNodePtr.i = cfirstAliveNode;
17270       do {
17271         jam();
17272         ptrCheckGuard(specNodePtr, MAX_NDB_NODES, nodeRecord);
17273         sendSignal(calcDihBlockRef(specNodePtr.i), GSN_UPGRADE_PROTOCOL_ORD,
17274                    signal, UpgradeProtocolOrd::SignalLength, JBA);
17275         specNodePtr.i = specNodePtr.p->nextNode;
17276       } while (specNodePtr.i != RNIL);
17277       EXECUTE_DIRECT(QMGR,GSN_UPGRADE_PROTOCOL_ORD,signal,signal->getLength());
17278     }
17279   }
17280   return m_micro_gcp.m_enabled;
17281 }
17282 
/**
 * Receive a protocol-upgrade order (sent by the master from
 * check_enable_micro_gcp()). Currently the only order is
 * UPO_ENABLE_MICRO_GCP: enable micro GCP locally and forward the order
 * to the local QMGR. Unknown types fall through the switch and are
 * ignored.
 */
void
Dbdih::execUPGRADE_PROTOCOL_ORD(Signal* signal)
{
  const UpgradeProtocolOrd* ord = (UpgradeProtocolOrd*)signal->getDataPtr();
  switch(ord->type){
  case UpgradeProtocolOrd::UPO_ENABLE_MICRO_GCP:
    jam();
    m_micro_gcp.m_enabled = true;
    // Let QMGR see the order too (same signal, executed directly).
    EXECUTE_DIRECT(QMGR, GSN_UPGRADE_PROTOCOL_ORD,signal, signal->getLength());
    return;
  }
}
17295 
/**
 * Master: drive the start of a new (micro) GCP round. Repeatedly
 * re-scheduled via CONTINUEB(ZSTART_GCP) until all preconditions hold:
 * previous round's DIVERIFY queues emptied, no starting node blocking
 * GCP, GCP not order-blocked, and the configured time between GCPs
 * elapsed. Then bumps the new GCI (possibly switching gci_hi when a
 * GCP_SAVE round is due), moves the master state to M_GCP_PREPARE and
 * broadcasts GCP_PREPARE to all alive nodes.
 */
void
Dbdih::startGcpLab(Signal* signal)
{
  if (ERROR_INSERTED(7242))
  {
    jam();
    g_eventLogger->info("Delayed GCP_COMMIT start 5s");
    signal->theData[0] = DihContinueB::ZSTART_GCP;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 5000, 1);
    return;
  }

  // Wait until every DIVERIFY queue has been drained after the previous
  // round before starting a new one.
  for (Uint32 i = 0; i < c_diverify_queue_cnt; i++)
  {
    if (c_diverify_queue[i].m_empty_done == 0)
    {
      // Previous global checkpoint is not yet completed.
      jam();
      signal->theData[0] = DihContinueB::ZSTART_GCP;
      sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 10, 1);
      return;
    }
  }

  // Wake up any WAIT_GCP clients waiting for the current epoch.
  emptyWaitGCPMasterQueue(signal,
                          m_micro_gcp.m_current_gci,
                          c_waitEpochMasterList);

  if (c_nodeStartMaster.blockGcp != 0 &&
      m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_IDLE)
  {
    jam();

    /* ------------------------------------------------------------------ */
    /*  A NEW NODE WANTS IN AND WE MUST ALLOW IT TO COME IN NOW SINCE THE */
    /*       GCP IS COMPLETED.                                            */
    /* ------------------------------------------------------------------ */

    if (ERROR_INSERTED(7217))
    {
      jam();

      signal->theData[0] = 9999;
      sendSignal(numberToRef(CMVMI, refToNode(c_nodeStartMaster.startNode)),
                 GSN_NDB_TAMPER, signal, 1, JBB);
      NdbTick_Invalidate(&m_micro_gcp.m_master.m_start_time); // Force start
      // fall through
    }
    else
    {
      jam();
      ndbrequire(c_nodeStartMaster.blockGcp == 1); // Ordered...
      c_nodeStartMaster.blockGcp = 2; // effective
      gcpBlockedLab(signal);
      return;
    }
  }

  if (cgcpOrderBlocked)
  {
    jam();
    signal->theData[0] = DihContinueB::ZSTART_GCP;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 10, 1);
    return;
  }

  const NDB_TICKS now = c_current_time = NdbTick_getCurrentTicks();

  /**
   * An invalid micro-GCP 'start_time' is used to force
   * a micro GCP to be started immediately.
   */
  if (NdbTick_IsValid(m_micro_gcp.m_master.m_start_time))
  {
    // With micro GCP disabled, the (longer) GCP save period governs the
    // start interval instead.
    const Uint32 delayMicro = m_micro_gcp.m_enabled ?
      m_micro_gcp.m_master.m_time_between_gcp :
      m_gcp_save.m_master.m_time_between_gcp;
    const Uint64 elapsed =
      NdbTick_Elapsed(m_micro_gcp.m_master.m_start_time, now).milliSec();

    if (elapsed < delayMicro)
    {
      jam();
      signal->theData[0] = DihContinueB::ZSTART_GCP;
      sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 10, 1);
      return;
    }
  }

  m_micro_gcp.m_master.m_start_time = now;

  if (m_micro_gcp.m_enabled == false &&
      m_micro_gcp.m_master.m_time_between_gcp)
  {
    /**
     * Micro GCP is disabled...but configured...
     */
    jam();
    check_enable_micro_gcp(signal, true);
  }

  /**
   * Check that there has not been more than 2^32 micro GCP wo/ any save
   */
  Uint64 currGCI = m_micro_gcp.m_current_gci;
  ndbrequire(Uint32(currGCI) != ~(Uint32)0);
  m_micro_gcp.m_master.m_new_gci = currGCI + 1;

  const Uint32 delaySave = m_gcp_save.m_master.m_time_between_gcp;
  const NDB_TICKS start  = m_gcp_save.m_master.m_start_time;
  const bool need_gcp_save =
    (!NdbTick_IsValid(start) ||                              //First or forced GCP
     NdbTick_Elapsed(start, now).milliSec() >= delaySave) && //Reached time limit
    (!ERROR_INSERTED(7243));  /* 7243 = no GCP_SAVE initiation */

  if ((m_micro_gcp.m_enabled == false) ||
      (need_gcp_save &&
       m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_IDLE))
  {
    jam();
    /**
     * Time for save...switch gci_hi
     */
    m_gcp_save.m_master.m_start_time = now;
    m_micro_gcp.m_master.m_new_gci = Uint64((currGCI >> 32) + 1) << 32;

    signal->theData[0] = NDB_LE_GlobalCheckpointStarted; //Event type
    signal->theData[1] = Uint32(currGCI >> 32);
    signal->theData[2] = Uint32(currGCI);
    sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
  }

  ndbassert(m_micro_gcp.m_enabled || Uint32(m_micro_gcp.m_new_gci) == 0);


  /***************************************************************************/
  // Report the event that a global checkpoint has started.
  /***************************************************************************/

  CRASH_INSERTION(7000);
  m_micro_gcp.m_master.m_state = MicroGcp::M_GCP_PREPARE;
  signal->setTrace(TestOrd::TraceGlobalCheckpoint);

#ifdef ERROR_INSERT
  if (ERROR_INSERTED(7186))
  {
    sendToRandomNodes("GCP_PREPARE",
                      signal, &c_GCP_PREPARE_Counter, &Dbdih::sendGCP_PREPARE);
    signal->theData[0] = 9999;
    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);
    return;
  }
  else if (ERROR_INSERTED(7200))
  {
    c_GCP_PREPARE_Counter.clearWaitingFor();
    NodeRecordPtr nodePtr;
    nodePtr.i = cfirstAliveNode;
    do {
      jam();
      ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
      c_GCP_PREPARE_Counter.setWaitingFor(nodePtr.i);
      if (nodePtr.i != getOwnNodeId())
      {
        SET_ERROR_INSERT_VALUE(7201);
        sendGCP_PREPARE(signal, nodePtr.i, RNIL);
      }
      else
      {
        SET_ERROR_INSERT_VALUE(7202);
        sendGCP_PREPARE(signal, nodePtr.i, RNIL);
      }
      nodePtr.i = nodePtr.p->nextNode;
    } while (nodePtr.i != RNIL);

    NodeReceiverGroup rg(CMVMI, c_GCP_PREPARE_Counter);
    rg.m_nodes.clear(getOwnNodeId());
    Uint32 victim = rg.m_nodes.find(0);

    signal->theData[0] = 9999;
    sendSignal(numberToRef(CMVMI, victim),
               GSN_NDB_TAMPER, signal, 1, JBA);

    CLEAR_ERROR_INSERT_VALUE;
    return;
  }
  else if (ERROR_INSERTED(7227))
  {
    ndbout_c("Not sending GCP_PREPARE to %u", c_error_insert_extra);
    c_GCP_PREPARE_Counter.clearWaitingFor();
    NodeRecordPtr nodePtr;
    nodePtr.i = cfirstAliveNode;
    do {
      jam();
      ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
      c_GCP_PREPARE_Counter.setWaitingFor(nodePtr.i);
      if (nodePtr.i != c_error_insert_extra)
      {
        sendGCP_PREPARE(signal, nodePtr.i, RNIL);
      }
      nodePtr.i = nodePtr.p->nextNode;
    } while (nodePtr.i != RNIL);

    signal->theData[0] = 9999;
    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 200, 1);
    return;
  }
#endif

  // Normal path: broadcast GCP_PREPARE to all alive nodes.
  sendLoopMacro(GCP_PREPARE, sendGCP_PREPARE, RNIL);
}//Dbdih::startGcpLab()
17506 
/**
 * Master: collect GCP_PREPARECONF from each participant. The confirmed
 * GCI must match the one we are preparing. When the last participant
 * has answered (receiveLoopMacro falls through), move on to the commit
 * phase.
 */
void Dbdih::execGCP_PREPARECONF(Signal* signal)
{
  jamEntry();
  Uint32 senderNodeId = signal->theData[0];
  Uint32 gci_hi = signal->theData[1];
  Uint32 gci_lo = signal->theData[2];

  DEB_NODE_STOP(("Recv GCP_PREPARECONF(%u,%u) from %u",
                 gci_hi, gci_lo, senderNodeId));

  ndbrequire(signal->getLength() >= GCPPrepareConf::SignalLength);

  Uint64 gci = gci_lo | (Uint64(gci_hi) << 32);
  ndbrequire(gci == m_micro_gcp.m_master.m_new_gci);
  receiveLoopMacro(GCP_PREPARE, senderNodeId);
  //-------------------------------------------------------------
  // We have now received all replies. We are ready to continue
  // with committing the global checkpoint.
  //-------------------------------------------------------------
  gcpcommitreqLab(signal);
}//Dbdih::execGCP_PREPARECONF()
17528 
/**
 * Master: all participants have prepared — enter the commit phase and
 * broadcast GCP_COMMIT to all alive nodes.
 */
void Dbdih::gcpcommitreqLab(Signal* signal)
{
  CRASH_INSERTION(7001);

  m_micro_gcp.m_master.m_state = MicroGcp::M_GCP_COMMIT;

#ifdef ERROR_INSERT
  if (ERROR_INSERTED(7187))
  {
    // Test: send GCP_COMMIT to only a random subset, then crash via
    // delayed NDB_TAMPER.
    sendToRandomNodes("GCP_COMMIT",
                      signal, &c_GCP_COMMIT_Counter, &Dbdih::sendGCP_COMMIT);
    signal->theData[0] = 9999;
    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);
    return;
  }
#endif

  sendLoopMacro(GCP_COMMIT, sendGCP_COMMIT, RNIL);
  return;
}//Dbdih::gcpcommitreqLab()
17549 
/**
 * Master: a participant reports that the committed GCP is finished on
 * its node. Special-case our own report when a TC takeover happened
 * since the report (tcFailNo < cMinTcFailNo): re-issue GCP_NOMORETRANS
 * to the local TC so takeover transactions are included. When all
 * participants have finished: broadcast SUB_GCP_COMPLETE_REP (micro GCP
 * path) and, if gci_hi switched this round, kick off the GCP_SAVE disk
 * phase.
 */
void Dbdih::execGCP_NODEFINISH(Signal* signal)
{
  jamEntry();
  const Uint32 senderNodeId = signal->theData[0];
  const Uint32 gci_hi = signal->theData[1];
  const Uint32 tcFailNo = signal->theData[2];
  const Uint32 gci_lo = signal->theData[3];
  const Uint64 gci = gci_lo | (Uint64(gci_hi) << 32);

  DEB_NODE_STOP(("Recv GCP_NODEFINISH(%u,%u) from %u",
                 gci_hi, gci_lo, senderNodeId));

  /* Check that there has not been a node failure since TC
   * reported this GCP complete...
   */
  if ((senderNodeId == getOwnNodeId()) &&
      (tcFailNo < cMinTcFailNo))
  {
    jam();
    ndbrequire(c_GCP_COMMIT_Counter.isWaitingFor(getOwnNodeId()));

    /* We are master, and the local TC will takeover the transactions
     * of the failed node, which can add to the current GCP, so resend
     * GCP_NOMORETRANS to TC...
     */
    m_micro_gcp.m_state = MicroGcp::M_GCP_COMMIT; /* Reset DIH Slave GCP state */

    GCPNoMoreTrans* req = (GCPNoMoreTrans*)signal->getDataPtrSend();
    req->senderRef = reference();
    req->senderData = m_micro_gcp.m_master_ref;
    req->gci_hi = Uint32(m_micro_gcp.m_old_gci >> 32);
    req->gci_lo = Uint32(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
    sendSignal(clocaltcblockref, GSN_GCP_NOMORETRANS, signal,
               GCPNoMoreTrans::SignalLength, JBB);

    return;
  }
  (void)gci; // TODO validate

  ndbrequire(m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_COMMIT);
  receiveLoopMacro(GCP_COMMIT, senderNodeId);

  jam();

  if (m_micro_gcp.m_enabled)
  {
    jam();

    m_micro_gcp.m_master.m_state = MicroGcp::M_GCP_COMPLETE;

    // Announce completion of the just-finished epoch (m_old_gci) to all
    // participants; they ack with SUB_GCP_COMPLETE_ACK.
    SubGcpCompleteRep * rep = (SubGcpCompleteRep*)signal->getDataPtr();
    rep->senderRef = reference();
    rep->gci_hi = (Uint32)(m_micro_gcp.m_old_gci >> 32);
    rep->gci_lo = (Uint32)(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
    rep->flags = SubGcpCompleteRep::IN_MEMORY;

#ifdef ERROR_INSERT
    if (ERROR_INSERTED(7190))
    {
      sendToRandomNodes("GCP_COMPLETE_REP", signal,
                        &c_SUB_GCP_COMPLETE_REP_Counter,
                        &Dbdih::sendSUB_GCP_COMPLETE_REP);
      signal->theData[0] = 9999;
      sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);
    }
    else if (ERROR_INSERTED(7226))
    {
      ndbout_c("Not sending SUB_GCP_COMPLETE_REP to %u", c_error_insert_extra);
      c_SUB_GCP_COMPLETE_REP_Counter.clearWaitingFor();
      NodeRecordPtr nodePtr;
      nodePtr.i = cfirstAliveNode;
      do {
        jam();
        ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
        c_SUB_GCP_COMPLETE_REP_Counter.setWaitingFor(nodePtr.i);
        if (nodePtr.i != c_error_insert_extra)
        {
          sendSignal(calcDihBlockRef(nodePtr.i), GSN_SUB_GCP_COMPLETE_REP,
                     signal, SubGcpCompleteRep::SignalLength, JBA);
        }
        nodePtr.i = nodePtr.p->nextNode;
      } while (nodePtr.i != RNIL);
      SET_ERROR_INSERT_VALUE(7227);

      signal->theData[0] = 9999;
      sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 200, 1);
    }
    else
#endif
    {
      jam();
      // Normal path...
      sendLoopMacro(SUB_GCP_COMPLETE_REP, sendSUB_GCP_COMPLETE_REP, RNIL);
    }
  }

  //-------------------------------------------------------------
  // We have now received all replies. We are ready to continue
  // with saving the global checkpoint to disk.
  //-------------------------------------------------------------
  CRASH_INSERTION(7002);

  // A GCP_SAVE round is due only when gci_hi changed during this round.
  Uint32 curr_hi = (Uint32)(m_micro_gcp.m_current_gci >> 32);
  Uint32 old_hi = (Uint32)(m_micro_gcp.m_old_gci >> 32);

  if (m_micro_gcp.m_enabled)
  {
    jam();
  }
  else
  {
    // Without micro GCP, every round switches gci_hi and must save.
    ndbrequire(curr_hi != old_hi);
  }

  if (curr_hi == old_hi)
  {
    jam();
    return;
  }

  /**
   * Start a save
   */
  Uint32 saveGCI = old_hi;
  m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_REQ;
  m_gcp_save.m_master.m_new_gci = saveGCI;

#ifdef ERROR_INSERT
  if (ERROR_INSERTED(7188))
  {
    sendToRandomNodes("GCP_SAVE",
                      signal, &c_GCP_SAVEREQ_Counter, &Dbdih::sendGCP_SAVEREQ);
    signal->theData[0] = 9999;
    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);
    return;
  }
  else if (ERROR_INSERTED(7216))
  {
    infoEvent("GCP_SAVE all/%u", c_error_insert_extra);
    NodeRecordPtr nodePtr;
    nodePtr.i = c_error_insert_extra;
    ptrAss(nodePtr, nodeRecord);

    // Temporarily hide one node from the broadcast, then still wait for it.
    removeAlive(nodePtr);
    sendLoopMacro(GCP_SAVEREQ, sendGCP_SAVEREQ, RNIL);
    insertAlive(nodePtr);
    signal->theData[0] = 9999;
    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);
    c_GCP_SAVEREQ_Counter.setWaitingFor(c_error_insert_extra);
    return;
  }
#endif

  sendLoopMacro(GCP_SAVEREQ, sendGCP_SAVEREQ, RNIL);
}
17705 
/**
 * Master: collect SUB_GCP_COMPLETE_ACK from all participants. When the
 * last ack arrives, the round is fully complete: return to M_GCP_IDLE
 * and schedule the next GCP start via CONTINUEB (suppressed under error
 * insert 7190, which deliberately stalls the protocol).
 */
void
Dbdih::execSUB_GCP_COMPLETE_ACK(Signal* signal)
{
  jamEntry();
  SubGcpCompleteAck ack = * CAST_CONSTPTR(SubGcpCompleteAck,
                                          signal->getDataPtr());
  Uint32 senderNodeId = refToNode(ack.rep.senderRef);

  ndbrequire(m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_COMPLETE);
  receiveLoopMacro(SUB_GCP_COMPLETE_REP, senderNodeId);

  m_micro_gcp.m_master.m_state = MicroGcp::M_GCP_IDLE;

  if (!ERROR_INSERTED(7190))
  {
    signal->theData[0] = DihContinueB::ZSTART_GCP;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 10, 1);
  }
}
17725 
/**
 * Participant: GCP_SAVEREQ from the master. Handles duplicate requests
 * arising from master takeover (a new master re-sends for the same GCI)
 * before the normal path, which records the request and forwards it to
 * the local DBLQH with our own reference so LQH's reply comes back here.
 */
void
Dbdih::execGCP_SAVEREQ(Signal* signal)
{
  jamEntry();
  GCPSaveReq * req = (GCPSaveReq*)&signal->theData[0];

  if (ERROR_INSERTED(7237))
  {
    jam();
    g_eventLogger->info("Delayed GCP_SAVEREQ 5s");
    sendSignalWithDelay(reference(), GSN_GCP_SAVEREQ,
                        signal, 5000,
                        signal->getLength());
    return;
  }

  if (m_gcp_save.m_state == GcpSave::GCP_SAVE_REQ)
  {
    jam();
    /**
     * This is master take over...
     * and SAVE_REQ is already running
     */
    // Just re-point at the new master; LQH's reply will go to it.
    ndbrequire(m_gcp_save.m_gci == req->gci);
    m_gcp_save.m_master_ref = req->dihBlockRef;
    return;
  }

  if (m_gcp_save.m_gci == req->gci)
  {
    jam();
    /**
     * This is master take over...
     * and SAVE_REQ is complete...
     */
    m_gcp_save.m_master_ref = req->dihBlockRef;

    // Copy the request first: `conf` below aliases the same signal
    // buffer as `req`.
    GCPSaveReq save = (* req);
    GCPSaveConf * conf = (GCPSaveConf*)signal->getDataPtrSend();
    conf->dihPtr = save.dihPtr;
    conf->nodeId = getOwnNodeId();
    conf->gci    = save.gci;
    sendSignal(m_gcp_save.m_master_ref, GSN_GCP_SAVECONF, signal,
               GCPSaveConf::SignalLength, JBA);
    return;
  }

  // Normal path: record the save round and forward to local LQH.
  ndbrequire(m_gcp_save.m_state == GcpSave::GCP_SAVE_IDLE);
  m_gcp_save.m_state = GcpSave::GCP_SAVE_REQ;
  m_gcp_save.m_master_ref = req->dihBlockRef;
  m_gcp_save.m_gci = req->gci;

  // Replace the master's ref with ours so LQH replies to this block.
  req->dihBlockRef = reference();
  sendSignal(DBLQH_REF, GSN_GCP_SAVEREQ, signal, signal->getLength(), JBA);
}
17781 
/**
 * GCP_SAVECONF arrives in two roles:
 *  - from the local DBLQH (participant role): mark our save done and
 *    forward the conf to the master;
 *  - from a participant DIH (master role): record the node's newest
 *    restorable GCI in the sysfile and continue GCP_SAVE bookkeeping.
 */
void Dbdih::execGCP_SAVECONF(Signal* signal)
{
  jamEntry();
  GCPSaveConf * saveConf = (GCPSaveConf*)&signal->theData[0];

  if (refToBlock(signal->getSendersBlockRef()) == DBLQH)
  {
    jam();

    // Participant role: local LQH finished the save.
    ndbrequire(m_gcp_save.m_state == GcpSave::GCP_SAVE_REQ);
    m_gcp_save.m_state = GcpSave::GCP_SAVE_CONF;

    sendSignal(m_gcp_save.m_master_ref,
               GSN_GCP_SAVECONF, signal, signal->getLength(), JBA);
    return;
  }

  // Master role: a participant reports its save complete.
  ndbrequire(saveConf->gci == m_gcp_save.m_master.m_new_gci);
  // dihPtr carries the sender's node id in this protocol — verified here.
  ndbrequire(saveConf->nodeId == saveConf->dihPtr);
  SYSFILE->lastCompletedGCI[saveConf->nodeId] = saveConf->gci;
  GCP_SAVEhandling(signal, saveConf->nodeId);
}//Dbdih::execGCP_SAVECONF()
17804 
/**
 * GCP_SAVEREF — the save could not be performed on a node. Like
 * GCP_SAVECONF it arrives in two roles (from local DBLQH as
 * participant, from a participant DIH as master). Only benign refusal
 * reasons (shutdown/restart in progress, faked signal on node failure)
 * are accepted; anything else crashes the node.
 */
void Dbdih::execGCP_SAVEREF(Signal* signal)
{
  jamEntry();
  GCPSaveRef * const saveRef = (GCPSaveRef*)&signal->theData[0];

  if (refToBlock(signal->getSendersBlockRef()) == DBLQH)
  {
    jam();

    // Participant role: forward the refusal to the master.
    // NOTE(review): state is set to GCP_SAVE_CONF even on REF —
    // presumably "local save round finished" regardless of outcome;
    // confirm against the state machine's other users.
    ndbrequire(m_gcp_save.m_state == GcpSave::GCP_SAVE_REQ);
    m_gcp_save.m_state = GcpSave::GCP_SAVE_CONF;

    sendSignal(m_gcp_save.m_master_ref,
               GSN_GCP_SAVEREF, signal, signal->getLength(), JBA);
    return;
  }

  // Master role: a participant refused; its sysfile GCI is NOT updated.
  ndbrequire(saveRef->gci == m_gcp_save.m_master.m_new_gci);
  ndbrequire(saveRef->nodeId == saveRef->dihPtr);

  /**
   * Only allow reason not to save
   */
  ndbrequire(saveRef->errorCode == GCPSaveRef::NodeShutdownInProgress ||
             saveRef->errorCode == GCPSaveRef::FakedSignalDueToNodeFailure ||
             saveRef->errorCode == GCPSaveRef::NodeRestartInProgress);
  GCP_SAVEhandling(signal, saveRef->nodeId);
}//Dbdih::execGCP_SAVEREF()
17833 
/**
 * Master: common tail for GCP_SAVECONF/GCP_SAVEREF. Ticks off `nodeId`
 * in the GCP_SAVEREQ counter; when all nodes have answered, records the
 * newly restorable GCI in the sysfile, clears the initial-start flag if
 * applicable, and moves on to distributing the sysfile (COPY_GCI).
 */
void Dbdih::GCP_SAVEhandling(Signal* signal, Uint32 nodeId)
{
  ndbrequire(m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_REQ);
  receiveLoopMacro(GCP_SAVEREQ, nodeId);
  /*-------------------------------------------------------------------------*/
  // All nodes have replied. We are ready to update the system file.
  /*-------------------------------------------------------------------------*/

  CRASH_INSERTION(7003);
  /**------------------------------------------------------------------------
   * SET NEW RECOVERABLE GCI. ALSO RESET RESTART COUNTER TO ZERO.
   * THIS INDICATES THAT THE SYSTEM HAS BEEN RECOVERED AND SURVIVED AT
   * LEAST ONE GLOBAL CHECKPOINT PERIOD. WE WILL USE THIS PARAMETER TO
   * SET BACK THE RESTART GCI IF WE ENCOUNTER MORE THAN ONE UNSUCCESSFUL
   * RESTART.
   *------------------------------------------------------------------------*/
  SYSFILE->newestRestorableGCI = m_gcp_save.m_gci;
  if(Sysfile::getInitialStartOngoing(SYSFILE->systemRestartBits) &&
     getNodeState().startLevel == NodeState::SL_STARTED){
    jam();
#if 0
    g_eventLogger->info("Dbdih: Clearing initial start ongoing");
#endif
    Sysfile::clearInitialStartOngoing(SYSFILE->systemRestartBits);
  }
  // Distribute the updated sysfile to all nodes.
  copyGciLab(signal, CopyGCIReq::GLOBAL_CHECKPOINT);

  m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_COPY_GCI;

}//Dbdih::GCP_SAVEhandling()
17864 
17865 /*
17866   3.11   G L O B A L  C H E C K P O I N T (N O T - M A S T E R)
17867   *************************************************************
17868   */
/**
 * execGCP_PREPARE
 *
 * Participant handling of GCP_PREPARE from the master: prepare the next
 * micro GCP (epoch).  The new GCI is recorded under the micro-GCP write
 * lock, GCP_PREPARE is forwarded to SUMA, and GCP_PREPARECONF is sent
 * back to the master.  Receiving a prepare for a GCI that is already
 * prepared is accepted (master takeover) and only re-sends the reply.
 */
void Dbdih::execGCP_PREPARE(Signal* signal)
{
  jamEntry();
  CRASH_INSERTION(7005);

  if (ERROR_INSERTED(7030))
  {
    cgckptflag = true;
  }
  if (ERROR_INSERTED(7030) ||
      ERROR_INSERTED(7238))
  {
    /* Test hook: stall the prepare by re-sending the signal to ourselves. */
    g_eventLogger->info("Delayed GCP_PREPARE 5s");
    sendSignalWithDelay(reference(), GSN_GCP_PREPARE, signal, 5000,
                        signal->getLength());
    return;
  }

  GCPPrepare* req = (GCPPrepare*)signal->getDataPtr();
  GCPPrepareConf * conf = (GCPPrepareConf*)signal->getDataPtrSend();
  Uint32 masterNodeId = req->nodeId;
  Uint32 gci_hi = req->gci_hi;
  Uint32 gci_lo = req->gci_lo;
  ndbrequire(signal->getLength() >= GCPPrepare::SignalLength);
  Uint64 gci = gci_lo | (Uint64(gci_hi) << 32);

  DEB_NODE_STOP(("Recv GCP_PREPARE(%u,%u) from %u",
                 gci_hi, gci_lo, masterNodeId));

  BlockReference retRef = calcDihBlockRef(masterNodeId);

  if (isMaster())
  {
    /* When we are the master this is our own broadcast coming back;
     * the master-side state machine must be in the prepare phase. */
    ndbrequire(m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_PREPARE);
  }

  if (m_micro_gcp.m_state == MicroGcp::M_GCP_PREPARE)
  {
    jam();
    /**
     * This must be master take over
     *   Prepare is already complete
     */
    ndbrequire(m_micro_gcp.m_new_gci == gci);
    m_micro_gcp.m_master_ref = retRef;
    goto reply;
  }

  if (m_micro_gcp.m_new_gci == gci)
  {
    jam();
    /**
     * This GCP has already been prepared...
     *   Must be master takeover
     */
    m_micro_gcp.m_master_ref = retRef;
    goto reply;
  }

  ndbrequire(m_micro_gcp.m_state == MicroGcp::M_GCP_IDLE);

  /* Publish the new epoch under the write lock so concurrent readers
   * (e.g. DIGETNODES from TC threads) see a consistent view. */
  m_micro_gcp.m_lock.write_lock();
  cgckptflag = true;
  m_micro_gcp.m_state = MicroGcp::M_GCP_PREPARE;
  m_micro_gcp.m_new_gci = gci;
  m_micro_gcp.m_master_ref = retRef;
  m_micro_gcp.m_lock.write_unlock();

  if (ERROR_INSERTED(7031))
  {
    g_eventLogger->info("Crashing delayed in GCP_PREPARE 3s");
    signal->theData[0] = 9999;
    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 3000, 1);
    return;
  }
#ifdef GCP_TIMER_HACK
  globalData.gcp_timer_commit[0] = NdbTick_getCurrentTicks();
#endif

reply:
  /**
   * Send the new gci to Suma.
   *
   * To get correct signal order and avoid races, this signal is sent on the
   * same prio as the SUB_GCP_COMPLETE_REP signal sent to SUMA in
   * execSUB_GCP_COMPLETE_REP().
   */
  sendSignal(SUMA_REF, GSN_GCP_PREPARE, signal, signal->length(), JBB);

  /* Send reply. */
  conf->nodeId = cownNodeId;
  conf->gci_hi = gci_hi;
  conf->gci_lo = gci_lo;
  DEB_NODE_STOP(("Send GCP_PREPARECONF(%u,%u) to %u",
                 req->gci_hi, req->gci_lo, refToNode(retRef)));
  sendSignal(retRef, GSN_GCP_PREPARECONF, signal,
             GCPPrepareConf::SignalLength, JBA);
  return;
}
17968 
/**
 * execGCP_COMMIT
 *
 * Participant handling of GCP_COMMIT from the master: switch to the
 * prepared epoch.  The old/current GCI swap is done under the micro-GCP
 * write lock, queued transactions blocked by the prepare phase are
 * released (emptyverificbuffer), and TC is asked via GCP_NOMORETRANS to
 * report when no transaction belongs to the old epoch anymore.
 * A commit for the already-committed GCI is accepted (master takeover)
 * and answered directly with GCP_NODEFINISH.
 */
void Dbdih::execGCP_COMMIT(Signal* signal)
{
  jamEntry();
  CRASH_INSERTION(7006);

  if (ERROR_INSERTED(7239))
  {
    /* Test hook: stall the commit by re-sending the signal to ourselves. */
    g_eventLogger->info("Delayed GCP_COMMIT 5s");
    sendSignalWithDelay(reference(), GSN_GCP_COMMIT, signal, 5000,
                        signal->getLength());
    return;
  }

  GCPCommit * req = (GCPCommit*)signal->getDataPtr();
  Uint32 masterNodeId = req->nodeId;
  Uint32 gci_hi = req->gci_hi;
  Uint32 gci_lo = req->gci_lo;

  DEB_NODE_STOP(("Recv GCP_COMMIT(%u,%u) from %u",
                 gci_hi, gci_lo, masterNodeId));

  ndbrequire(signal->getLength() >= GCPCommit::SignalLength);
  Uint64 gci = gci_lo | (Uint64(gci_hi) << 32);

#ifdef ERROR_INSERT
  if (ERROR_INSERTED(7213))
  {
    /* Test hook: kill LQH and CMVMI on node c_error_insert_extra. */
    ndbout_c("err 7213 killing %d", c_error_insert_extra);
    Uint32 save = signal->theData[0];
    signal->theData[0] = 5048;
    sendSignal(numberToRef(DBLQH, c_error_insert_extra),
               GSN_NDB_TAMPER, signal, 1, JBB);
    signal->theData[0] = save;
    CLEAR_ERROR_INSERT_VALUE;

    signal->theData[0] = 9999;
    sendSignal(numberToRef(CMVMI, c_error_insert_extra),
               GSN_DUMP_STATE_ORD, signal, 1, JBB);

    signal->theData[0] = save;
    CLEAR_ERROR_INSERT_VALUE;

    return;
  }
#endif

  Uint32 masterRef = calcDihBlockRef(masterNodeId);
  ndbrequire(masterNodeId == cmasterNodeId);
  if (isMaster())
  {
    /* Our own broadcast coming back; master-side state must agree. */
    ndbrequire(m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_COMMIT);
  }

  if (m_micro_gcp.m_state == MicroGcp::M_GCP_COMMIT)
  {
    jam();
    /**
     * This must be master take over
     *   Commit is already ongoing...
     */
    ndbrequire(m_micro_gcp.m_current_gci == gci);
    m_micro_gcp.m_master_ref = masterRef;
    return;
  }

  if (m_micro_gcp.m_current_gci == gci)
  {
    jam();
    /**
     * This must be master take over
     *   Commit has already completed
     */
    m_micro_gcp.m_master_ref = masterRef;

    /* Re-send our finish reply directly to the new master. */
    GCPNodeFinished* conf = (GCPNodeFinished*)signal->getDataPtrSend();
    conf->nodeId = cownNodeId;
    conf->gci_hi = (Uint32)(m_micro_gcp.m_old_gci >> 32);
    conf->failno = cfailurenr;
    conf->gci_lo = (Uint32)(m_micro_gcp.m_old_gci & 0xFFFFFFFF);

    DEB_NODE_STOP(("Send GCP_NODEFINISH(%u,%u) to %u",
                 conf->gci_hi, conf->gci_lo, refToNode(masterRef)));

    sendSignal(masterRef, GSN_GCP_NODEFINISH, signal,
               GCPNodeFinished::SignalLength, JBB);
    return;
  }

  /* Normal path: the prepared GCI becomes the current GCI. */
  ndbrequire(m_micro_gcp.m_new_gci == gci);
  ndbrequire(m_micro_gcp.m_state == MicroGcp::M_GCP_PREPARE);
  m_micro_gcp.m_state = MicroGcp::M_GCP_COMMIT;
  m_micro_gcp.m_master_ref = calcDihBlockRef(masterNodeId);

  /* Swap epochs under the write lock for consistent concurrent reads. */
  m_micro_gcp.m_lock.write_lock();
  m_micro_gcp.m_old_gci = m_micro_gcp.m_current_gci;
  m_micro_gcp.m_current_gci = gci;
  cgckptflag = false;
  m_micro_gcp.m_lock.write_unlock();

  /* Release transactions that were held back during the prepare phase. */
  for (Uint32 i = 0; i < c_diverify_queue_cnt; i++)
  {
    jam();
    c_diverify_queue[i].m_empty_done = 0;
    emptyverificbuffer(signal, i, true);
  }

  /* Ask TC to signal (GCP_TCFINISHED) when the old epoch has drained. */
  GCPNoMoreTrans* req2 = (GCPNoMoreTrans*)signal->getDataPtrSend();
  req2->senderRef = reference();
  req2->senderData = calcDihBlockRef(masterNodeId);
  req2->gci_hi = (Uint32)(m_micro_gcp.m_old_gci >> 32);
  req2->gci_lo = (Uint32)(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
  sendSignal(clocaltcblockref, GSN_GCP_NOMORETRANS, signal,
             GCPNoMoreTrans::SignalLength, JBB);
  return;
}//Dbdih::execGCP_COMMIT()
18084 
/**
 * execGCP_TCFINISHED
 *
 * Reply from TC that no transactions remain in the old epoch.  Before
 * reporting GCP_NODEFINISH to the master we synchronize through the
 * local DBLQH and SUMA instances (synchronize_path) so they cannot get
 * out of step with SUB_GCP_COMPLETE_REP; the master reply is then sent
 * from the callback execGCP_TCFINISHED_sync_conf().
 */
void Dbdih::execGCP_TCFINISHED(Signal* signal)
{
  jamEntry();
  CRASH_INSERTION(7007);
  GCPTCFinished* conf = (GCPTCFinished*)signal->getDataPtr();
  Uint32 retRef = conf->senderData;
  Uint32 gci_hi = conf->gci_hi;
  Uint32 gci_lo = conf->gci_lo;
  Uint32 tcFailNo = conf->tcFailNo;
  Uint64 gci = gci_lo | (Uint64(gci_hi) << 32);
  ndbrequire(gci == m_micro_gcp.m_old_gci);

  if (ERROR_INSERTED(7181) || ERROR_INSERTED(7182))
  {
    /* Test hook: kill the master node instead of completing. */
    c_error_7181_ref = retRef; // Save ref
    ndbout_c("killing %d", refToNode(cmasterdihref));
    signal->theData[0] = 9999;
    sendSignal(numberToRef(CMVMI, refToNode(cmasterdihref)),
               GSN_NDB_TAMPER, signal, 1, JBB);
    return;
  }

#ifdef ERROR_INSERT
  if (ERROR_INSERTED(7214))
  {
    /* Test hook: kill node c_error_insert_extra, then continue normally. */
    ndbout_c("err 7214 killing %d", c_error_insert_extra);
    Uint32 save = signal->theData[0];
    signal->theData[0] = 9999;
    sendSignal(numberToRef(CMVMI, c_error_insert_extra),
               GSN_NDB_TAMPER, signal, 1, JBB);
    signal->theData[0] = save;
    CLEAR_ERROR_INSERT_VALUE;
  }
#endif

#ifdef GCP_TIMER_HACK
  globalData.gcp_timer_commit[1] = NdbTick_getCurrentTicks();
#endif

  ndbrequire(m_micro_gcp.m_state == MicroGcp::M_GCP_COMMIT);

  /**
   * Make sure that each LQH gets scheduled, so that they don't get out of sync
   * wrt to SUB_GCP_COMPLETE_REP
   */
  Callback cb;
  cb.m_callbackData = tcFailNo;  /* Pass fail-no triggering TC_FINISHED to callback */
  cb.m_callbackFunction = safe_cast(&Dbdih::execGCP_TCFINISHED_sync_conf);
  Uint32 path[] = { DBLQH, SUMA, 0 };
  synchronize_path(signal, path, cb);
}//Dbdih::execGCP_TCFINISHED()
18136 
18137 void
execGCP_TCFINISHED_sync_conf(Signal * signal,Uint32 cb,Uint32 err)18138 Dbdih::execGCP_TCFINISHED_sync_conf(Signal* signal, Uint32 cb, Uint32 err)
18139 {
18140   ndbrequire(m_micro_gcp.m_state == MicroGcp::M_GCP_COMMIT);
18141 
18142   m_micro_gcp.m_state = MicroGcp::M_GCP_COMMITTED;
18143   Uint32 retRef = m_micro_gcp.m_master_ref;
18144 
18145   GCPNodeFinished* conf2 = (GCPNodeFinished*)signal->getDataPtrSend();
18146   conf2->nodeId = cownNodeId;
18147   conf2->gci_hi = (Uint32)(m_micro_gcp.m_old_gci >> 32);
18148   conf2->failno = cb;  /* tcFailNo */
18149   conf2->gci_lo = (Uint32)(m_micro_gcp.m_old_gci & 0xFFFFFFFF);
18150 
18151   DEB_NODE_STOP(("2:Send GCP_NODEFINISH(%u,%u) to %u",
18152                  conf2->gci_hi, conf2->gci_lo, refToNode(retRef)));
18153 
18154   sendSignal(retRef, GSN_GCP_NODEFINISH, signal,
18155              GCPNodeFinished::SignalLength, JBB);
18156 }
18157 
/**
 * execSUB_GCP_COMPLETE_REP
 *
 * Epoch-completion report from the master.  Forwards the report once per
 * epoch to DBLQH (guarded by m_last_sent_gci) and acknowledges the
 * master with SUB_GCP_COMPLETE_ACK.  A report arriving while we are
 * already idle is tolerated as a master-takeover duplicate and only
 * acknowledged.
 */
void
Dbdih::execSUB_GCP_COMPLETE_REP(Signal* signal)
{
  jamEntry();

  CRASH_INSERTION(7228);

  if (ERROR_INSERTED(7244))
  {
    /* Test hook: stall the report by re-sending the signal to ourselves. */
    g_eventLogger->info("Delayed SUB_GCP_COMPLETE_REP 5s");
    sendSignalWithDelay(reference(), GSN_SUB_GCP_COMPLETE_REP, signal, 5000,
                        signal->getLength());
    return;
  }

  /* Copy the report: theData is reused for the forward and the ack below. */
  SubGcpCompleteRep rep = * (SubGcpCompleteRep*)signal->getDataPtr();
  if (isMaster())
  {
    ndbrequire(m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_COMPLETE);
  }

  Uint32 masterRef = rep.senderRef;
  const Uint64 gci = (Uint64(rep.gci_hi) << 32) | rep.gci_lo;

  if (m_micro_gcp.m_state == MicroGcp::M_GCP_IDLE)
  {
    jam();
    /**
     * This must be master take over
     *   signal has already arrived
     */
    m_micro_gcp.m_master_ref = masterRef;
    goto reply;
  }

  ndbrequire(m_micro_gcp.m_state == MicroGcp::M_GCP_COMMITTED);
  m_micro_gcp.m_state = MicroGcp::M_GCP_IDLE;

  /**
    Ensure that SUB_GCP_COMPLETE_REP is send once per epoch to Dblqh.
  */
  ndbrequire(gci == m_micro_gcp.m_old_gci);
#if defined(ERROR_INSERT) || defined(VM_TRACE)
  /**
    Detect if some test actually provoke a double send.
    At point of writing no test have failed yet.
  */
  ndbrequire(gci > m_micro_gcp.m_last_sent_gci);
#endif
  if (gci > m_micro_gcp.m_last_sent_gci)
  {
    /**
     * To handle multiple LDM instances, this need to be passed though
     * each LQH...(so that no fire-trig-ord can arrive "too" late)
     */
    sendSignal(DBLQH_REF, GSN_SUB_GCP_COMPLETE_REP, signal,
               signal->length(), JBB);
    m_micro_gcp.m_last_sent_gci = gci;
  }
reply:
  /* Acknowledge the master with our own reference as sender. */
  SubGcpCompleteAck* ack = CAST_PTR(SubGcpCompleteAck,
                                    signal->getDataPtrSend());
  ack->rep = rep;
  ack->rep.senderRef = reference();
  sendSignal(masterRef, GSN_SUB_GCP_COMPLETE_ACK,
             signal, SubGcpCompleteAck::SignalLength, JBA);
}
18225 
18226 /*****************************************************************************/
18227 //******     RECEIVING   TAMPER   REQUEST   FROM    NDBAPI             ******
18228 /*****************************************************************************/
/**
 * execDIHNDBTAMPER
 *
 * Test/tamper request.  theData[0] selects the action:
 *   1/2 - block/unblock GCP ordering on the master (forwarded to the
 *         master when received on a non-master node),
 *   3   - illegal (aborts),
 *   4   - return crestartGci to the requester,
 *   5   - (ERROR_INSERT builds only) redirect an error-insert code to the
 *         master DIH (30000-range) or some alive non-master DIH
 *         (40000-range).
 */
void Dbdih::execDIHNDBTAMPER(Signal* signal)
{
  jamEntry();
  Uint32 tcgcpblocked = signal->theData[0];
  /* ACTION TO BE TAKEN BY DIH */
  Uint32 tuserpointer = signal->theData[1];
  BlockReference tuserblockref = signal->theData[2];
  switch (tcgcpblocked) {
  case 1:
    jam();
    if (isMaster()) {
      jam();
      cgcpOrderBlocked = 1;
    } else {
      jam();
      /* TRANSFER THE REQUEST */
      /* TO MASTER*/
      signal->theData[0] = tcgcpblocked;
      signal->theData[1] = tuserpointer;
      signal->theData[2] = tuserblockref;
      sendSignal(cmasterdihref, GSN_DIHNDBTAMPER, signal, 3, JBB);
    }//if
    break;
  case 2:
    jam();
    if (isMaster()) {
      jam();
      cgcpOrderBlocked = 0;
    } else {
      jam();
      /* TRANSFER THE REQUEST */
      /* TO MASTER*/
      signal->theData[0] = tcgcpblocked;
      signal->theData[1] = tuserpointer;
      signal->theData[2] = tuserblockref;
      sendSignal(cmasterdihref, GSN_DIHNDBTAMPER, signal, 3, JBB);
    }//if
    break;
  case 3:
    ndbabort();
    return;
  case 4:
    jam();
    /* Report the restart GCI back to the requester. */
    signal->theData[0] = tuserpointer;
    signal->theData[1] = crestartGci;
    sendSignal(tuserblockref, GSN_DIHNDBTAMPER, signal, 2, JBB);
    break;
#ifdef ERROR_INSERT
  case 5:
    jam();
    if (tuserpointer >= 30000 && tuserpointer < 40000) {
      jam();
      /*--------------------------------------------------------------------*/
      // Redirect errors to master DIH in the 30000-range.
      /*--------------------------------------------------------------------*/
      tuserblockref = cmasterdihref;
      tuserpointer -= 30000;
      signal->theData[0] = 5;
      signal->theData[1] = tuserpointer;
      signal->theData[2] = tuserblockref;
      sendSignal(tuserblockref, GSN_DIHNDBTAMPER, signal, 3, JBB);
      return;
    } else if (tuserpointer >= 40000 && tuserpointer < 50000) {
      NodeRecordPtr localNodeptr;
      Uint32 Tfound = 0;
      jam();
      /*--------------------------------------------------------------------*/
      // Redirect errors to non-master DIH in the 40000-range.
      /*--------------------------------------------------------------------*/
      tuserpointer -= 40000;
      /* Pick the first alive node that is not the master. */
      for (localNodeptr.i = 1;
           localNodeptr.i <= m_max_node_id;
           localNodeptr.i++) {
        jam();
        ptrAss(localNodeptr, nodeRecord);
        if ((localNodeptr.p->nodeStatus == NodeRecord::ALIVE) &&
            (localNodeptr.i != cmasterNodeId)) {
          jam();
          tuserblockref = calcDihBlockRef(localNodeptr.i);
          Tfound = 1;
          break;
        }//if
      }//for
      if (Tfound == 0) {
        jam();
        /*-------------------------------------------------------------------*/
        // Ignore since no non-master node existed.
        /*-------------------------------------------------------------------*/
        return;
      }//if
      signal->theData[0] = 5;
      signal->theData[1] = tuserpointer;
      signal->theData[2] = tuserblockref;
      sendSignal(tuserblockref, GSN_DIHNDBTAMPER, signal, 3, JBB);
      return;
    } else {
      jam();
      return;
    }//if
    break;
#endif
  default:
    ndbabort();
  }//switch
  return;
}//Dbdih::execDIHNDBTAMPER()
18335 
18336 /*****************************************************************************/
18337 /* **********     FILE HANDLING MODULE                           *************/
18338 /*****************************************************************************/
/**
 * copyGciLab
 *
 * Start distribution of the system file (COPY_GCIREQ) to all nodes for
 * the given reason.  Only one copy round can run at a time; further
 * requests are queued in c_copyGCIMaster.m_waiting (at most WAIT_CNT)
 * and popped by execCOPY_GCICONF() when the current round completes.
 * RESTART_NR rounds target only the currently starting node.
 *
 * @param signal  signal object, reused for outgoing requests
 * @param reason  why the system file is being distributed
 */
void Dbdih::copyGciLab(Signal* signal, CopyGCIReq::CopyReason reason)
{
  if(c_copyGCIMaster.m_copyReason != CopyGCIReq::IDLE)
  {
    jam();
    /**
     * There can currently only be two waiting
     */
    for (Uint32 i = 0; i<CopyGCIMaster::WAIT_CNT; i++)
    {
      jam();
      if (c_copyGCIMaster.m_waiting[i] == CopyGCIReq::IDLE)
      {
        jam();
        c_copyGCIMaster.m_waiting[i] = reason;
        return;
      }
    }

    /**
     * Code should *not* request more than WAIT_CNT copy-gci's
     *   so this is an internal error
     */
    ndbabort();
  }
  c_copyGCIMaster.m_copyReason = reason;

#ifdef ERROR_INSERT
  if (reason == CopyGCIReq::GLOBAL_CHECKPOINT && ERROR_INSERTED(7189))
  {
    /* Test hook: send COPY_GCI to a random subset, then crash ourselves. */
    sendToRandomNodes("COPY_GCI",
                      signal, &c_COPY_GCIREQ_Counter, &Dbdih::sendCOPY_GCIREQ);
    signal->theData[0] = 9999;
    sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);
    return;
  }
#endif

  if (reason == CopyGCIReq::RESTART_NR)
  {
    jam();
    if (c_nodeStartMaster.startNode != RNIL)
    {
      jam();
      /* Node-restart copy goes to the starting node only. */
      c_COPY_GCIREQ_Counter.clearWaitingFor();
      c_COPY_GCIREQ_Counter.setWaitingFor(c_nodeStartMaster.startNode);
      sendCOPY_GCIREQ(signal, c_nodeStartMaster.startNode, RNIL);
      return;
    }
    else
    {
      jam();
      /* Starting node is gone: drop this round and pop the next queued
       * reason instead. */
      reason = c_copyGCIMaster.m_copyReason = c_copyGCIMaster.m_waiting[0];
      for (Uint32 i = 1; i<CopyGCIMaster::WAIT_CNT; i++)
      {
        jam();
        c_copyGCIMaster.m_waiting[i-1] = c_copyGCIMaster.m_waiting[i];
      }
      c_copyGCIMaster.m_waiting[CopyGCIMaster::WAIT_CNT-1] =
        CopyGCIReq::IDLE;

      if (reason == CopyGCIReq::IDLE)
      {
        jam();
        return;
      }
      // fall-through
    }
  }

  /* Broadcast COPY_GCIREQ to all alive nodes. */
  sendLoopMacro(COPY_GCIREQ, sendCOPY_GCIREQ, RNIL);

}//Dbdih::copyGciLab()
18412 
#ifdef ERROR_INSERT
/* Attempt counter for error insert 7222; incremented and reset in
 * execCOPY_GCICONF() so the crash fires after at most 15 attempts. */
static int s_7222_count = 0;
#endif
18416 /* ------------------------------------------------------------------------- */
18417 /* COPY_GCICONF                           RESPONSE TO COPY_GCIREQ            */
18418 /* ------------------------------------------------------------------------- */
/**
 * execCOPY_GCICONF
 *
 * Reply to COPY_GCIREQ.  Once all targeted nodes have confirmed, the
 * action that motivated the copy round is completed (dispatch on the
 * stored copy reason), and the next queued copy reason - if any - is
 * started via CONTINUEB/ZCOPY_GCI.
 */
void Dbdih::execCOPY_GCICONF(Signal* signal)
{
  jamEntry();
  NodeRecordPtr senderNodePtr;
  senderNodePtr.i = signal->theData[0];
  /* Returns from this function unless this was the last awaited reply. */
  receiveLoopMacro(COPY_GCIREQ, senderNodePtr.i);

  CopyGCIReq::CopyReason current = c_copyGCIMaster.m_copyReason;
  c_copyGCIMaster.m_copyReason = CopyGCIReq::IDLE;

  /* ok guards against an unhandled (or IDLE) reason reaching here. */
  bool ok = false;
  switch(current){
  case CopyGCIReq::RESTART:{
    ok = true;
    jam();
    /* System restart: continue by starting the DICT schema restart. */
    DictStartReq * req = (DictStartReq*)&signal->theData[0];
    req->restartGci = SYSFILE->newestRestorableGCI;
    req->senderRef = reference();
    sendSignal(cdictblockref, GSN_DICTSTARTREQ,
               signal, DictStartReq::SignalLength, JBB);
    break;
  }
  case CopyGCIReq::LOCAL_CHECKPOINT:{
    ok = true;
    jam();
    startLcpRoundLab(signal);
    break;
  }
  case CopyGCIReq::GLOBAL_CHECKPOINT:
  {
    ok = true;
    jam();

    /************************************************************************/
    // Report the event that a global checkpoint has completed.
    /************************************************************************/
    signal->setTrace(0);
    signal->theData[0] = NDB_LE_GlobalCheckpointCompleted; //Event type
    signal->theData[1] = m_gcp_save.m_gci;
    sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);

    c_newest_restorable_gci = m_gcp_save.m_gci;

    /* Make sure Backup block knows about GCI restorable ASAP */
    signal->theData[0] = c_newest_restorable_gci;
    sendSignal(BACKUP_REF, GSN_RESTORABLE_GCI_REP, signal, 1, JBB);

#ifdef ERROR_INSERT
    /**
     * With changes in LCP handling it became rare that we come here when
     * a LCP isn't ongoing, so to avoid test cases timing out we crash
     * after 15 attempts even when proper test conditions are not met.
     */
    if (ERROR_INSERTED(7222) &&
        ((!Sysfile::getLCPOngoing(SYSFILE->systemRestartBits) &&
        c_newest_restorable_gci >= c_lcpState.lcpStopGcp) ||
        s_7222_count++ >= 15))
    {
      s_7222_count = 0;
      /* Crash all nodes: remote ones immediately, ourselves after 1s. */
      sendLoopMacro(COPY_TABREQ, nullRoutine, 0);
      NodeReceiverGroup rg(CMVMI, c_COPY_TABREQ_Counter);

      rg.m_nodes.clear(getOwnNodeId());
      if (!rg.m_nodes.isclear())
      {
        signal->theData[0] = 9999;
        sendSignal(rg, GSN_NDB_TAMPER, signal, 1, JBA);
      }
      signal->theData[0] = 9999;
      sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);

      signal->theData[0] = 932;
      EXECUTE_DIRECT(QMGR, GSN_NDB_TAMPER, signal, 1);

      return;
    }
#endif

    if (m_micro_gcp.m_enabled == false)
    {
      jam();
      /**
       * Running old protocol
       */
      signal->theData[0] = DihContinueB::ZSTART_GCP;
      sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
    }
    m_gcp_save.m_master.m_state = GcpSave::GCP_SAVE_IDLE;

    CRASH_INSERTION(7004);
    /* Wake any clients waiting (WAIT_GCP) for this GCI to become durable. */
    emptyWaitGCPMasterQueue(signal,
                            Uint64(m_gcp_save.m_gci) << 32,
                            c_waitGCPMasterList);
    break;
  }
  case CopyGCIReq::INITIAL_START_COMPLETED:
    ok = true;
    jam();
    break;
  case CopyGCIReq::IDLE:
    ok = false;
    jam();
    break;
  case CopyGCIReq::RESTART_NR:
    ok = true;
    jam();
    startme_copygci_conf(signal);
    break;
  }
  ndbrequire(ok);


  /* Shift the wait queue one step down. */
  c_copyGCIMaster.m_copyReason = c_copyGCIMaster.m_waiting[0];
  for (Uint32 i = 1; i<CopyGCIMaster::WAIT_CNT; i++)
  {
    jam();
    c_copyGCIMaster.m_waiting[i-1] = c_copyGCIMaster.m_waiting[i];
  }
  c_copyGCIMaster.m_waiting[CopyGCIMaster::WAIT_CNT-1] = CopyGCIReq::IDLE;

  /**
   * Pop queue
   */
  if(c_copyGCIMaster.m_copyReason != CopyGCIReq::IDLE)
  {
    jam();

    signal->theData[0] = DihContinueB::ZCOPY_GCI;
    signal->theData[1] = c_copyGCIMaster.m_copyReason;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
  }
}//Dbdih::execCOPY_GCICONF()
18551 
/**
 * check_node_in_restart
 *
 * Scan nodes from nodeId upward and answer with
 * CHECK_NODE_RESTARTCONF(1) if any node is in an active restart state
 * (which motivates running LCPs at higher speed), otherwise with
 * CHECK_NODE_RESTARTCONF(0).  m_max_node_id == ~0 means node setup is
 * not complete yet; treat that conservatively as "restart ongoing".
 *
 * @param signal  signal object, reused for the reply
 * @param ref     block reference to answer
 * @param nodeId  first node id to inspect
 */
void
Dbdih::check_node_in_restart(Signal *signal,
                             BlockReference ref,
                             Uint32 nodeId)
{
  NodeRecordPtr nodePtr;
  if (m_max_node_id == Uint32(~0))
  {
    jam();
    sendCHECK_NODE_RESTARTCONF(signal, ref, 1);
    return;
  }
  for (nodePtr.i = nodeId; nodePtr.i <= m_max_node_id; nodePtr.i++)
  {
    jam();
    ptrAss(nodePtr, nodeRecord);
    if (nodePtr.p->nodeGroup == RNIL ||
        nodePtr.p->nodeRecoveryStatus == NodeRecord::NOT_DEFINED_IN_CLUSTER ||
        nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_NOT_RESTARTED_YET ||
        nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_FAILED ||
        nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_FAILURE_COMPLETED ||
        nodePtr.p->nodeRecoveryStatus == NodeRecord::ALLOCATED_NODE_ID ||
        nodePtr.p->nodeRecoveryStatus == NodeRecord::RESTART_COMPLETED ||
        nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_ACTIVE)
    {
      /**
       * Nodes that aren't part of a node group won't be part of LCPs,
       * Nodes not defined in Cluster we can ignore
       * Nodes not restarted yet while we were started have no impact
       * on LCP speed, if they restart while we restart doesn't matter
       * since in this case we will run at a speed for starting nodes.
       * Nodes recently failed and even those that completed will speed
       * up LCPs temporarily but using the c_increase_lcp_speed_after_nf
       * variable instead.
       * Nodes that have allocated a node id haven't really started yet.
       * Nodes that have completed their restart also need no speed up.
       */
      continue;
    }
    /**
     * All other states indicate that the node is in some or the other
     * node restart state, so thus it is a good idea to speed up LCP
     * processing.
     */
    jam();
    jamLine(nodePtr.i);
    sendCHECK_NODE_RESTARTCONF(signal, ref, 1);
    return;
  }
  jam();
  /* All nodes are up and running, no restart is ongoing */
  sendCHECK_NODE_RESTARTCONF(signal, ref, 0);
  return;
}
18606 
sendCHECK_NODE_RESTARTCONF(Signal * signal,BlockReference ref,Uint32 node_restart)18607 void Dbdih::sendCHECK_NODE_RESTARTCONF(Signal *signal,
18608                                         BlockReference ref,
18609                                         Uint32 node_restart)
18610 {
18611   signal->theData[0] = (m_local_lcp_state.m_state == LocalLCPState::LS_RUNNING)? 1 : 0;
18612   signal->theData[1] = node_restart;
18613   sendSignal(ref, GSN_CHECK_NODE_RESTARTCONF, signal, 2, JBB);
18614 }
18615 
execCHECK_NODE_RESTARTREQ(Signal * signal)18616 void Dbdih::execCHECK_NODE_RESTARTREQ(Signal *signal)
18617 {
18618   NodeRecordPtr nodePtr;
18619   Uint32 ref = signal->theData[0];
18620   jamEntry();
18621   /**
18622    * No signal data sent, this signal is sent to
18623    * check if we have any nodes that are currently
18624    * part of a LCP which is not yet been started.
18625    */
18626   if (c_increase_lcp_speed_after_nf == true)
18627   {
18628     /**
18629      * A node recently failed, we will run LCP faster until this LCP
18630      * has completed to ensure that we quickly get to a point where
18631      * we can copy the distribution and dictionary information.
18632      */
18633     jam();
18634     sendCHECK_NODE_RESTARTCONF(signal, ref, 1);
18635     return;
18636   }
18637   Uint32 start_node = 1;
18638   check_node_in_restart(signal, ref, start_node);
18639   return;
18640 }
18641 
/**
 * invalidateLcpInfoAfterSr
 *
 * After a system restart, invalidate the interrupted LCP: step back
 * latestLCP_ID, clear the LCP-ongoing bit, and for every node that did
 * not participate in that LCP step its active status back one
 * "missed" level so it is not demoted too quickly.  Finally refresh the
 * node-restart info bits in the system file.
 */
void Dbdih::invalidateLcpInfoAfterSr(Signal* signal)
{
  NodeRecordPtr nodePtr;
  SYSFILE->latestLCP_ID--;
  Sysfile::clearLCPOngoing(SYSFILE->systemRestartBits);
  for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++)
  {
    jam();
    ptrAss(nodePtr, nodeRecord);
    if (!NdbNodeBitmask::get(SYSFILE->lcpActive, nodePtr.i)){
      jam();
      /* ------------------------------------------------------------------- */
      // The node was not active in the local checkpoint.
      // To avoid that we step the active status too fast to not
      // active we step back one step from Sysfile::NS_ActiveMissed_x.
      /* ------------------------------------------------------------------- */
      switch (nodePtr.p->activeStatus) {
      case Sysfile::NS_Active:
        /* Already fully active: deliberate no-op (nothing to step back). */
        nodePtr.p->activeStatus = Sysfile::NS_Active;
        break;
      case Sysfile::NS_ActiveMissed_1:
        jam();
        nodePtr.p->activeStatus = Sysfile::NS_Active;
        break;
      case Sysfile::NS_ActiveMissed_2:
        jam();
        nodePtr.p->activeStatus = Sysfile::NS_ActiveMissed_1;
        break;
      default:
        jam();
        break;
      }//switch
    }
    else
    {
      jam();
      /**
       * It is possible to get here with a number of different activeStatus
       * since the cluster crash could have occurred while a starting node
       * was participating in an LCP to get the node to the NS_Active state.
       */
    }
  }//for
  setNodeRestartInfoBits(signal);
}//Dbdih::invalidateLcpInfoAfterSr()
18687 
18688 /* ------------------------------------------------------------------------- */
18689 /*       THE NEXT STEP IS TO WRITE THE FILE.                                 */
18690 /* ------------------------------------------------------------------------- */
/**
 * Continue the COPY_GCI file write after the restart-info file was
 * (re)opened: issue the write of the restorable GCI and mark the file
 * record as busy writing; completion continues in writingCopyGciLab().
 */
void Dbdih::openingCopyGciSkipInitLab(Signal* signal, FileRecordPtr filePtr)
{
  writeRestorableGci(signal, filePtr);
  filePtr.p->reqStatus = FileRecord::WRITING_COPY_GCI;
  return;
}//Dbdih::openingCopyGciSkipInitLab()
18697 
/**
 * Completion of a write to one of the two redundant copy-GCI files.
 * After the first file is written the second one is written (opening it
 * first if needed).  When both files are done, GCP/LCP state is updated
 * according to the copy reason and COPY_GCICONF is sent back.
 */
void Dbdih::writingCopyGciLab(Signal* signal, FileRecordPtr filePtr)
{
  /* ----------------------------------------------------------------------- */
  /*     WE HAVE NOW WRITTEN THIS FILE. WRITE ALSO NEXT FILE IF THIS IS NOT  */
  /*     ALREADY THE LAST.                                                   */
  /* ----------------------------------------------------------------------- */
  CRASH_INSERTION(7219);

  filePtr.p->reqStatus = FileRecord::IDLE;
  if (filePtr.i == crestartInfoFile[0]) {
    jam();
    // First of the two redundant files done; switch to the second one.
    filePtr.i = crestartInfoFile[1];
    ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
    if (filePtr.p->fileStatus == FileRecord::OPEN) {
      jam();
      // Second file already open: go straight to writing it.
      openingCopyGciSkipInitLab(signal, filePtr);
      return;
    }//if
    openFileRw(signal, filePtr);
    filePtr.p->reqStatus = FileRecord::OPENING_COPY_GCI;
    return;
  }//if
  /* ----------------------------------------------------------------------- */
  /*     WE HAVE COMPLETED WRITING BOTH FILES SUCCESSFULLY. NOW REPORT OUR   */
  /*     SUCCESS TO THE MASTER DIH. BUT FIRST WE NEED TO RESET A NUMBER OF   */
  /*     VARIABLES USED BY THE LOCAL CHECKPOINT PROCESS (ONLY IF TRIGGERED   */
  /*     BY LOCAL CHECKPOINT PROCESS.                                        */
  /* ----------------------------------------------------------------------- */
  CopyGCIReq::CopyReason reason = c_copyGCISlave.m_copyReason;

  if (reason == CopyGCIReq::GLOBAL_CHECKPOINT) {
    jam();
    m_gcp_save.m_state = GcpSave::GCP_SAVE_IDLE;

    // Tell LGMAN that this GCI is now durable on disk.
    SubGcpCompleteRep * const rep = (SubGcpCompleteRep*)signal->getDataPtr();
    rep->gci_hi = SYSFILE->newestRestorableGCI;
    rep->gci_lo = 0;
    rep->flags = SubGcpCompleteRep::ON_DISK;

    sendSignal(LGMAN_REF, GSN_SUB_GCP_COMPLETE_REP, signal,
               SubGcpCompleteRep::SignalLength, JBB);

    jamEntry();

    if (m_micro_gcp.m_enabled == false)
    {
      jam();
      // Without micro GCPs, DBLQH is informed here and the micro-GCP
      // state machine is moved from COMMITTED back to IDLE.
      sendSignal(DBLQH_REF, GSN_SUB_GCP_COMPLETE_REP, signal,
                 SubGcpCompleteRep::SignalLength, JBB);
      jamEntry();
      ndbrequire(m_micro_gcp.m_state == MicroGcp::M_GCP_COMMITTED);
      m_micro_gcp.m_state = MicroGcp::M_GCP_IDLE;

      CRASH_INSERTION(7190);
    }

#ifdef GCP_TIMER_HACK
    globalData.gcp_timer_copygci[1] = NdbTick_getCurrentTicks();

    // this is last timer point so we send local report here
    {
      const GlobalData& g = globalData;
      const Uint32 ms_commit = NdbTick_Elapsed(
	  g.gcp_timer_commit[0], g.gcp_timer_commit[1]).milliSec();
      const Uint32 ms_save = NdbTick_Elapsed(
          g.gcp_timer_save[0], g.gcp_timer_save[1]).milliSec();
      const Uint32 ms_copygci = NdbTick_Elapsed(
          g.gcp_timer_copygci[0], g.gcp_timer_copygci[1]).milliSec();

      const Uint32 ms_total = ms_commit + ms_save + ms_copygci;

      // random formula to report excessive duration
      bool report =
        g.gcp_timer_limit != 0 ?
          (ms_total > g.gcp_timer_limit) :
          (ms_total > 3000 * (1 + cgcpDelay / 1000));
      if (report)
        infoEvent("GCP %u ms: total:%u commit:%u save:%u copygci:%u",
            coldgcp, ms_total, ms_commit, ms_save, ms_copygci);
    }
#endif
  }

  jam();
  c_copyGCISlave.m_copyReason = CopyGCIReq::IDLE;

  if (reason == CopyGCIReq::GLOBAL_CHECKPOINT)
  {
    jam();
    // GCP save: confirm to the GCP master recorded in m_gcp_save.
    signal->theData[0] = c_copyGCISlave.m_senderData;
    sendSignal(m_gcp_save.m_master_ref, GSN_COPY_GCICONF, signal, 1, JBB);
  }
  else if (c_copyGCISlave.m_senderRef == cmasterdihref)
  {
    jam();
    /**
     * Only if same master
     */
    signal->theData[0] = c_copyGCISlave.m_senderData;
    sendSignal(c_copyGCISlave.m_senderRef, GSN_COPY_GCICONF, signal, 1, JBB);
  }
  return;
}//Dbdih::writingCopyGciLab()
18801 
/**
 * DBLQH has confirmed START_NODE_LCP_REQ (sent from execSTART_LCP_REQ).
 * Resume the deferred START_LCP_REQ processing with the saved request.
 */
void Dbdih::execSTART_NODE_LCP_CONF(Signal *signal)
{
  jamEntry();
  ndbrequire(c_start_node_lcp_req_outstanding);
  c_start_node_lcp_req_outstanding = false;
  handleStartLcpReq(signal, &c_save_startLcpReq);
}
18809 
/**
 * Reception of START_LCP_REQ from the master DIH.
 *
 * Newer senders ship the participating-LQH/DIH node bitmasks in signal
 * sections (1 or 2 sections); older senders use the fixed-size _v1
 * fields in the signal body.  For a normal LCP start where our own node
 * participates in the LQH set, the request is saved and DBLQH is first
 * told via START_NODE_LCP_REQ; processing resumes in
 * execSTART_NODE_LCP_CONF.  Otherwise the request is handled directly.
 */
void Dbdih::execSTART_LCP_REQ(Signal* signal)
{
  jamEntry();
  Uint32 senderRef = signal->getSendersBlockRef();
  Uint32 senderVersion = getNodeInfo(refToNode(senderRef)).m_version;
  StartLcpReq * req = (StartLcpReq*)signal->getDataPtr();
  bool ownNodeIdSet;
  Uint32 noOfSections = signal->getNoOfSections();
  ndbrequire(!c_start_node_lcp_req_outstanding);
  req->participatingLQH.clear();
  req->participatingDIH.clear();

  if (noOfSections >= 1)
  {
    jam();
    // Bitmask-in-section format: section 0 = participating LQH nodes,
    // optional section 1 = participating DIH nodes.
    ndbrequire(ndbd_send_node_bitmask_in_section(senderVersion));
    ndbrequire(signal->getNoOfSections() <= 2);
    SegmentedSectionPtr ptr1;
    SectionHandle handle(this, signal);
    handle.getSection(ptr1, 0);
    ndbrequire(ptr1.sz <= NdbNodeBitmask::Size);
    copy(req->participatingLQH.rep.data, ptr1);
    if (noOfSections == 2)
    {
      jam();
      SegmentedSectionPtr ptr2;
      handle.getSection(ptr2, 1);
      ndbrequire(ptr2.sz <= NdbNodeBitmask::Size);
      copy(req->participatingDIH.rep.data, ptr2);
    }

    ownNodeIdSet = req->participatingLQH.get(cownNodeId);
    releaseSections(handle);
  }
  else
  {
    jam();
    // Legacy format: bitmasks are embedded in the signal body.
    ownNodeIdSet = req->participatingLQH_v1.get(cownNodeId);
    req->participatingLQH = req->participatingLQH_v1;
    req->participatingDIH = req->participatingDIH_v1;
  }

  if ((req->pauseStart == StartLcpReq::NormalLcpStart) &&
        ownNodeIdSet)
  {
    jam();
    // Defer: DBLQH must first learn the current GCI before the LCP
    // request can be processed (see execSTART_NODE_LCP_CONF).
    c_save_startLcpReq = *req;
    c_start_node_lcp_req_outstanding = true;
    signal->theData[0] = (Uint32)(m_micro_gcp.m_current_gci >> 32);
    signal->theData[1] = c_newest_restorable_gci;
    sendSignal(DBLQH_REF, GSN_START_NODE_LCP_REQ, signal, 2, JBB);
    return;
  }
  handleStartLcpReq(signal, req);
}
18865 
/**
 * Process a START_LCP_REQ.
 *
 * Three variants, selected by req->pauseStart:
 *  - PauseLcpStartFirst:  we are being included into an ongoing (paused)
 *    LCP, before the meta data copy.  Record participants and confirm.
 *  - PauseLcpStartSecond: after the meta data copy; receive the set of
 *    LQHs that have not yet completed, and confirm.
 *  - NormalLcpStart:      regular LCP start; prune dead nodes from the
 *    participant sets and kick off table initialisation via
 *    CONTINUEB(ZINIT_LCP).
 */
void Dbdih::handleStartLcpReq(Signal *signal, StartLcpReq *req)
{
  {
    if (req->pauseStart == StartLcpReq::PauseLcpStartFirst)
    {
      /**
       * The message was sent as part of start of LCPs when PAUSE LCP was used.
       * We have paused the LCP protocol and we are preparing to copy the
       * meta data. Before copying the metadata we need access to the
       * m_participatingLQH bitmap of nodes participating in the LCP.
       */
      jam();
      ndbrequire(cmasterdihref == req->senderRef);
      m_local_lcp_state.init(req);
      c_lcpState.m_participatingDIH = req->participatingDIH;
      c_lcpState.m_participatingLQH = req->participatingLQH;
      c_lcpState.m_masterLcpDihRef = cmasterdihref;
      c_lcpState.setLcpStatus(LCP_STATUS_ACTIVE, __LINE__);
      /**
       * We need to update the SYSFILE since it can take some time before we
       * have this number updated after a COPY_GCIREQ in connection to a
       * GCP.
       */
      SYSFILE->latestLCP_ID = req->lcpId;

      {
        char buf[NdbNodeBitmask::TextLength + 1];
        g_eventLogger->info("c_lcpState.m_participatingLQH bitmap= %s",
            c_lcpState.m_participatingLQH.getText(buf));
        g_eventLogger->info("c_lcpState.m_participatingDIH bitmap= %s",
            c_lcpState.m_participatingDIH.getText(buf));
      }

      // The master does not include us in the DIH set; we add ourselves.
      ndbrequire(!req->participatingDIH.get(getOwnNodeId()));
      c_lcpState.m_participatingDIH.set(getOwnNodeId());

      StartLcpConf * conf = (StartLcpConf*)signal->getDataPtrSend();
      conf->senderRef = reference();
      conf->lcpId = SYSFILE->latestLCP_ID;
      sendSignal(c_lcpState.m_masterLcpDihRef, GSN_START_LCP_CONF, signal,
                 StartLcpConf::SignalLength, JBB);
      return;
    }
    if (req->pauseStart == StartLcpReq::PauseLcpStartSecond)
    {
      /**
       * We get the set of already completed LQHs from the master node.
       * No need to know anything about completed DIHs since only the
       * master keeps this information.
       *
       * This signal arrives after copying the meta data. Since we are
       * included into the LCP we verify that there is at least one
       * fragment replica that still hasn't arrived being ready with
       * the LCP execution.
       */
      jam();
      ndbrequire(c_lcpState.lcpStatus == LCP_STATUS_ACTIVE);
      ndbrequire(cmasterdihref == req->senderRef);
      ndbrequire(c_lcpState.m_masterLcpDihRef == cmasterdihref);
      c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH = req->participatingLQH;
      c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.clearWaitingFor();
      c_lcpState.m_LCP_COMPLETE_REP_From_Master_Received = false;

      c_current_time = NdbTick_getCurrentTicks();
      c_lcpState.m_start_time = c_current_time;

      g_eventLogger->info("Our node now in LCP execution after pausing LCP");
      g_eventLogger->info("LCP_COMPLETE_REP_Counter_LQH bitmap= %s",
          c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.getText());

      ndbrequire(!checkLcpAllTablesDoneInLqh(__LINE__));

      StartLcpConf * conf = (StartLcpConf*)signal->getDataPtrSend();
      conf->senderRef = reference();
      conf->lcpId = SYSFILE->latestLCP_ID;
      sendSignal(c_lcpState.m_masterLcpDihRef, GSN_START_LCP_CONF, signal,
                 StartLcpConf::SignalLength, JBB);
      return;
    }
    ndbrequire(req->pauseStart == StartLcpReq::NormalLcpStart);
  }
  /**
   * Init m_local_lcp_state
   */
  m_local_lcp_state.init(req);

  if (!isMaster())
  {
    jam();
    // The master sets its start time elsewhere; non-masters record it here.
    c_current_time = NdbTick_getCurrentTicks();
    c_lcpState.m_start_time = c_current_time;
  }

  CRASH_INSERTION2(7021, isMaster());
  CRASH_INSERTION2(7022, !isMaster());

  for (Uint32 nodeId = 1; nodeId <= m_max_node_id; nodeId++)
  {
    /**
     * We could have a race here, a node could die while the START_LCP_REQ
     * is in flight. We need remove the node from the set of nodes
     * participating in this case. Not removing it here could lead to a
     * potential LCP deadlock.
     *
     * For the PAUSE LCP code where we are included in the LCP we don't need
     * to worry about this. If any node fails in the state of me being
     * started, I will fail as well.
     */
    NodeRecordPtr nodePtr;
    if (req->participatingDIH.get(nodeId) ||
        req->participatingLQH.get(nodeId))
    {
      nodePtr.i = nodeId;
      ptrAss(nodePtr, nodeRecord);
      if (nodePtr.p->nodeStatus != NodeRecord::ALIVE)
      {
        jam();
        jamLine(nodeId);
        req->participatingDIH.clear(nodeId);
        req->participatingLQH.clear(nodeId);
      }
    }
  }
  c_lcpState.m_participatingDIH = req->participatingDIH;
  c_lcpState.m_participatingLQH = req->participatingLQH;

  c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH = req->participatingLQH;
  if(isMaster())
  {
    jam();
    // Only the master tracks LCP_COMPLETE_REP from all DIHs.
    c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH = req->participatingDIH;
  }
  else
  {
    jam();
    c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.clearWaitingFor();
  }

  c_lcpState.m_LCP_COMPLETE_REP_From_Master_Received = false;

  c_lcpState.setLcpStatus(LCP_INIT_TABLES, __LINE__);

  ndbrequire(c_lcpState.m_masterLcpDihRef == req->senderRef);

  // Start initialising tables for the LCP, table 0 first (see initLcpLab).
  signal->theData[0] = DihContinueB::ZINIT_LCP;
  signal->theData[1] = c_lcpState.m_masterLcpDihRef;
  signal->theData[2] = 0;
  if (ERROR_INSERTED(7021))
  {
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 3);
  }
  else
  {
    sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
  }
}
19022 
19023 void
reset()19024 Dbdih::LocalLCPState::reset()
19025 {
19026   m_state = LS_INITIAL;
19027   m_keep_gci = RNIL;
19028   m_stop_gci = RNIL;
19029 }
19030 
19031 void
init(const StartLcpReq * req)19032 Dbdih::LocalLCPState::init(const StartLcpReq * req)
19033 {
19034   m_state = LS_RUNNING;
19035   m_start_lcp_req = *req;
19036   m_keep_gci = ~(Uint32)0;
19037   m_stop_gci = 0;
19038 }
19039 
19040 void
init_master_take_over_idle_to_tab_saved()19041 Dbdih::LocalLCPState::init_master_take_over_idle_to_tab_saved()
19042 {
19043   if (m_state == LS_COMPLETE)
19044   {
19045     m_state = LS_RUNNING;
19046   }
19047   else
19048   {
19049     assert(m_state == LS_INITIAL);
19050     m_state = LS_RUNNING_MTO_TAB_SAVED;
19051   }
19052 }
19053 
19054 void
lcp_frag_rep(const LcpFragRep * rep)19055 Dbdih::LocalLCPState::lcp_frag_rep(const LcpFragRep * rep)
19056 {
19057   assert(m_state == LS_RUNNING);
19058   if (rep->maxGciCompleted < m_keep_gci)
19059   {
19060     m_keep_gci = rep->maxGciCompleted;
19061   }
19062 
19063   if (rep->maxGciStarted > m_stop_gci)
19064   {
19065     m_stop_gci = rep->maxGciStarted;
19066   }
19067 }
19068 
19069 void
lcp_complete_rep(Uint32 gci)19070 Dbdih::LocalLCPState::lcp_complete_rep(Uint32 gci)
19071 {
19072   if (m_state == LS_RUNNING)
19073   {
19074     m_state = LS_COMPLETE;
19075     if (gci > m_stop_gci)
19076       m_stop_gci = gci;
19077   }
19078   else if (m_state == LS_RUNNING_MTO_TAB_SAVED)
19079   {
19080     reset();
19081   }
19082   else
19083   {
19084     require(false);
19085   }
19086 }
19087 
19088 bool
check_cut_log_tail(Uint32 gci) const19089 Dbdih::LocalLCPState::check_cut_log_tail(Uint32 gci) const
19090 {
19091   if (m_state == LS_COMPLETE)
19092   {
19093     if (gci >= m_stop_gci)
19094       return true;
19095   }
19096   return false;
19097 }
19098 
/**
 * CONTINUEB(ZINIT_LCP) handler: initialise LCP state for one table at a
 * time, starting at tableId, then reschedule itself for the next table.
 * Aborts silently on LCP master take-over.  When all tables are done,
 * START_LCP_CONF is sent to the master DIH.
 */
void Dbdih::initLcpLab(Signal* signal, Uint32 senderRef, Uint32 tableId)
{
  TabRecordPtr tabPtr;
  tabPtr.i = tableId;

  if (c_lcpState.m_masterLcpDihRef != senderRef ||
      c_lcpState.m_masterLcpDihRef != cmasterdihref)
  {
    /**
     * This is LCP master takeover...abort
     */
    jam();
    return;
  }

  //const Uint32 lcpId = SYSFILE->latestLCP_ID;

  for(; tabPtr.i < ctabFileSize; tabPtr.i++){

    ptrAss(tabPtr, tabRecord);

    if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE)
    {
      jam();
      // Inactive tables take no part in the LCP.
      tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED;
      continue;
    }

    if (tabPtr.p->tabStorage != TabRecord::ST_NORMAL) {
      /**
       * Table is not logged
       */
      jam();
      tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED;
      continue;
    }

    if (tabPtr.p->tabCopyStatus != TabRecord::CS_IDLE) {
      /* ----------------------------------------------------------------- */
      // We protect the updates of table data structures by this variable.
      /* ----------------------------------------------------------------- */
      // Retry the same table after a short delay.
      jam();
      signal->theData[0] = DihContinueB::ZINIT_LCP;
      signal->theData[1] = senderRef;
      signal->theData[2] = tabPtr.i;
      sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                          WaitTableStateChangeMillis, 3);
      return;
    }//if

    /**
     * Found a table
     */
    tabPtr.p->tabLcpStatus = TabRecord::TLS_ACTIVE;

    /**
     * For each fragment
     */
    tabPtr.p->tabActiveLcpFragments = 0;
    for (Uint32 fragId = 0; fragId < tabPtr.p->totalfragments; fragId++) {
      jam();
      FragmentstorePtr fragPtr;
      getFragstore(tabPtr.p, fragId, fragPtr);

      /**
       * For each of replica record
       */
      // Mark replicas on participating LQH nodes as having an ongoing
      // LCP; clear any stale flag on non-participating replicas.
      Uint32 replicaCount = 0;
      ReplicaRecordPtr replicaPtr;
      for(replicaPtr.i = fragPtr.p->storedReplicas; replicaPtr.i != RNIL;
	  replicaPtr.i = replicaPtr.p->nextPool) {
	jam();

        c_replicaRecordPool.getPtr(replicaPtr);
	Uint32 nodeId = replicaPtr.p->procNode;
	if(c_lcpState.m_participatingLQH.get(nodeId)){
	  jam();
	  replicaCount++;
	  replicaPtr.p->lcpOngoingFlag = true;
	}
        else if (replicaPtr.p->lcpOngoingFlag)
        {
          jam();
          replicaPtr.p->lcpOngoingFlag = false;
        }
      }
      fragPtr.p->noLcpReplicas = replicaCount;
      if (replicaCount > 0)
      {
        tabPtr.p->tabActiveLcpFragments++;
      }
    }//for

    // Continue with the next table in a new signal (real-time break).
    signal->theData[0] = DihContinueB::ZINIT_LCP;
    signal->theData[1] = senderRef;
    signal->theData[2] = tabPtr.i + 1;
    if (ERROR_INSERTED(7021))
    {
      sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 3);
    }
    else
    {
      sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
    }
    return;
  }

  /**
   * No more tables
   */
  jam();
  if (ERROR_INSERTED(7236))
  {
    // delay 20s before completing last CONTINUEB(ZINIT_LCP)
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 20000, 3);
    CLEAR_ERROR_INSERT_VALUE;
    return;
  }

  c_lcpState.setLcpStatus(LCP_STATUS_ACTIVE, __LINE__);

  CRASH_INSERTION2(7023, isMaster());
  CRASH_INSERTION2(7024, !isMaster());

  // All tables initialised: confirm LCP start to the master DIH.
  StartLcpConf * conf = (StartLcpConf*)signal->getDataPtrSend();
  conf->senderRef = reference();
  conf->lcpId = SYSFILE->latestLCP_ID;
  sendSignal(c_lcpState.m_masterLcpDihRef, GSN_START_LCP_CONF, signal,
             StartLcpConf::SignalLength, JBB);
}//Dbdih::initLcpLab()
19229 
19230 /* ------------------------------------------------------------------------- */
19231 /*       ERROR HANDLING FOR COPY RESTORABLE GCI FILE.                        */
19232 /* ------------------------------------------------------------------------- */
/**
 * Opening the copy-GCI file failed (e.g. it does not exist yet):
 * fall back to creating it, then continue the normal write flow.
 */
void Dbdih::openingCopyGciErrorLab(Signal* signal, FileRecordPtr filePtr)
{
  createFileRw(signal, filePtr);
  /* ------------------------------------------------------------------------- */
  /*       ERROR IN OPENING FILE. WE WILL TRY BY CREATING FILE INSTEAD.        */
  /* ------------------------------------------------------------------------- */
  filePtr.p->reqStatus = FileRecord::CREATING_COPY_GCI;
  return;
}//Dbdih::openingCopyGciErrorLab()
19242 
19243 /* ------------------------------------------------------------------------- */
19244 /*       ENTER DICTSTARTCONF WITH                                            */
19245 /*         TBLOCKREF                                                         */
19246 /* ------------------------------------------------------------------------- */
/**
 * DICTSTARTCONF received: DICT has delivered all tables to restart.
 * Kick off fragment restore via CONTINUEB(ZSTART_FRAGMENT) from
 * table 0, fragment 0.
 */
void Dbdih::dictStartConfLab(Signal* signal)
{
  infoEvent("Restore Database from disk Starting");
  /* ----------------------------------------------------------------------- */
  /*     WE HAVE NOW RECEIVED ALL THE TABLES TO RESTART.                     */
  /* ----------------------------------------------------------------------- */
  signal->theData[0] = DihContinueB::ZSTART_FRAGMENT;
  signal->theData[1] = 0;  /* START WITH TABLE 0    */
  signal->theData[2] = 0;  /* AND FRAGMENT 0        */
  sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
  return;
}//Dbdih::dictStartConfLab()
19259 
19260 
/**
 * A DIH table schema file was opened successfully: allocate one page
 * and start reading the file (the first page holds the total page
 * count; see readingTableLab()).
 */
void Dbdih::openingTableLab(Signal* signal, FileRecordPtr filePtr)
{
  /* ---------------------------------------------------------------------- */
  /*    SUCCESSFULLY OPENED A FILE. READ THE FIRST PAGE OF THIS FILE.       */
  /* ---------------------------------------------------------------------- */
  TabRecordPtr tabPtr;
  PageRecordPtr pagePtr;

  tabPtr.i = filePtr.p->tabRef;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  tabPtr.p->noPages = 1;
  allocpage(pagePtr);
  tabPtr.p->pageRef[0] = pagePtr.i;
  readTabfile(signal, tabPtr.p, filePtr);
  filePtr.p->reqStatus = FileRecord::READING_TABLE;
  return;
}//Dbdih::openingTableLab()
19278 
/**
 * Failed to open a table schema file.  Each table has two redundant
 * files; on failure of the first we retry with the duplicate, and a
 * failure of the second is fatal for the system restart.
 */
void Dbdih::openingTableErrorLab(Signal* signal, FileRecordPtr filePtr)
{
  TabRecordPtr tabPtr;
  tabPtr.i = filePtr.p->tabRef;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  /* ---------------------------------------------------------------------- */
  /*    WE FAILED IN OPENING A FILE. IF THE FIRST FILE THEN TRY WITH THE    */
  /*    DUPLICATE FILE, OTHERWISE WE REPORT AN ERROR IN THE SYSTEM RESTART. */
  /* ---------------------------------------------------------------------- */
  if (filePtr.i == tabPtr.p->tabFile[0])
  {
    filePtr.i = tabPtr.p->tabFile[1];
    ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
    openFileRw(signal, filePtr);
    filePtr.p->reqStatus = FileRecord::OPENING_TABLE;
  }
  else
  {
    char buf[256];
    BaseString::snprintf(buf, sizeof(buf),
			 "Error opening DIH schema files for table: %d",
			 tabPtr.i);
    progError(__LINE__, NDBD_EXIT_AFS_NO_SUCH_FILE, buf);
  }
}//Dbdih::openingTableErrorLab()
19304 
/**
 * A read from the table schema file completed.  Word 33 of the first
 * page holds the total number of stored pages: if more pages remain,
 * allocate them and re-issue the read; otherwise continue with
 * unpacking the pages via CONTINUEB(ZREAD_PAGES_INTO_TABLE).
 */
void Dbdih::readingTableLab(Signal* signal, FileRecordPtr filePtr)
{
  TabRecordPtr tabPtr;
  PageRecordPtr pagePtr;
  /* ---------------------------------------------------------------------- */
  /*    WE HAVE SUCCESSFULLY READ A NUMBER OF PAGES IN THE TABLE FILE. IF   */
  /*    MORE PAGES EXIST IN THE FILE THEN READ ALL PAGES IN THE FILE.       */
  /* ---------------------------------------------------------------------- */
  filePtr.p->reqStatus = FileRecord::IDLE;
  tabPtr.i = filePtr.p->tabRef;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  pagePtr.i = tabPtr.p->pageRef[0];
  ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
  Uint32 noOfStoredPages = pagePtr.p->word[33];
  if (tabPtr.p->noPages < noOfStoredPages) {
    jam();
    // Allocate the remaining pages, then read the whole file again.
    ndbrequire(noOfStoredPages <= NDB_ARRAY_SIZE(tabPtr.p->pageRef));
    for (Uint32 i = tabPtr.p->noPages; i < noOfStoredPages; i++) {
      jam();
      allocpage(pagePtr);
      tabPtr.p->pageRef[i] = pagePtr.i;
    }//for
    tabPtr.p->noPages = noOfStoredPages;
    readTabfile(signal, tabPtr.p, filePtr);
    filePtr.p->reqStatus = FileRecord::READING_TABLE;
  } else {
    ndbrequire(tabPtr.p->noPages == pagePtr.p->word[33]);
    ndbrequire(tabPtr.p->tabCopyStatus == TabRecord::CS_IDLE);
    jam();
    /* --------------------------------------------------------------------- */
    /*   WE HAVE READ ALL PAGES. NOW READ FROM PAGES INTO TABLE AND FRAGMENT */
    /*   DATA STRUCTURES.                                                    */
    /* --------------------------------------------------------------------- */
    tabPtr.p->tabCopyStatus = TabRecord::CS_SR_PHASE1_READ_PAGES;
    signal->theData[0] = DihContinueB::ZREAD_PAGES_INTO_TABLE;
    signal->theData[1] = tabPtr.i;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    return;
  }//if
  return;
}//Dbdih::readingTableLab()
19346 
/**
 * Table data has been unpacked from the pages: close whichever of the
 * two table files is currently open and continue in
 * closingTableSrLab() when the close completes.
 */
void Dbdih::readTableFromPagesLab(Signal* signal, TabRecordPtr tabPtr)
{
  FileRecordPtr filePtr;
  filePtr.i = tabPtr.p->tabFile[0];
  ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
  /* ---------------------------------------------------------------------- */
  /*    WE HAVE NOW COPIED TO OUR NODE. WE HAVE NOW COMPLETED RESTORING     */
  /*    THIS TABLE. CONTINUE WITH THE NEXT TABLE.                           */
  /*    WE ALSO NEED TO CLOSE THE TABLE FILE.                               */
  /* ---------------------------------------------------------------------- */
  if (filePtr.p->fileStatus != FileRecord::OPEN) {
    jam();
    // The first file is not the open one: it must be the duplicate.
    filePtr.i = tabPtr.p->tabFile[1];
    ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
  }//if
  closeFile(signal, filePtr);
  filePtr.p->reqStatus = FileRecord::CLOSING_TABLE_SR;
  return;
}//Dbdih::readTableFromPagesLab()
19366 
/**
 * Table file closed during system restart: reset replica state for the
 * table and continue with copying the table via CONTINUEB(ZCOPY_TABLE).
 */
void Dbdih::closingTableSrLab(Signal* signal, FileRecordPtr filePtr)
{
  /**
   * Update table/fragment info
   */
  TabRecordPtr tabPtr;
  tabPtr.i = filePtr.p->tabRef;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  resetReplicaSr(tabPtr);

  signal->theData[0] = DihContinueB::ZCOPY_TABLE;
  signal->theData[1] = filePtr.p->tabRef;
  sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);

  return;
}//Dbdih::closingTableSrLab()
19383 
/**
 * Request for DIH's table info.  Requests are queued per table on a
 * list of connect records (head in tabPtr.p->connectrec); only the
 * first request for a table starts the packing work
 * (CONTINUEB(ZGET_TABINFO)), later ones just join the queue.
 * On error DIH_GET_TABINFO_REF is returned.
 */
void
Dbdih::execDIH_GET_TABINFO_REQ(Signal* signal)
{
  jamEntry();

  DihGetTabInfoReq req = * (DihGetTabInfoReq*)signal->getDataPtr();

  Uint32 err = 0;
  do
  {
    TabRecordPtr tabPtr;
    tabPtr.i = req.tableId;
    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

    if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE)
    {
      jam();
      err = DihGetTabInfoRef::TableNotDefined;
      break;
    }

    if (cfirstconnect == RNIL)
    {
      jam();
      // Free list of connect records is empty.
      err = DihGetTabInfoRef::OutOfConnectionRecords;
      break;
    }

    if (tabPtr.p->connectrec != RNIL)
    {
      jam();

      // The table already has a connect record: only acceptable when
      // it is another queued GET_TABINFO request.
      ConnectRecordPtr connectPtr;
      connectPtr.i = tabPtr.p->connectrec;
      ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);

      if (connectPtr.p->connectState != ConnectRecord::GET_TABINFO)
      {
        jam();
        err = DihGetTabInfoRef::TableBusy;
        break;
      }
    }

    // Take a connect record from the free list and push it onto the
    // table's request queue.
    ConnectRecordPtr connectPtr;
    connectPtr.i = cfirstconnect;
    ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
    cfirstconnect = connectPtr.p->nextPool;

    connectPtr.p->nextPool = tabPtr.p->connectrec;
    tabPtr.p->connectrec = connectPtr.i;

    connectPtr.p->m_get_tabinfo.m_requestInfo = req.requestInfo;
    connectPtr.p->userpointer = req.senderData;
    connectPtr.p->userblockref = req.senderRef;
    connectPtr.p->connectState = ConnectRecord::GET_TABINFO;
    connectPtr.p->table = tabPtr.i;

    if (connectPtr.p->nextPool == RNIL)
    {
      jam();

      /**
       * we're the first...start packing...
       */
      signal->theData[0] = DihContinueB::ZGET_TABINFO;
      signal->theData[1] = tabPtr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    }

    return;
  } while (0);

  DihGetTabInfoRef * ref = (DihGetTabInfoRef*)signal->getDataPtrSend();
  ref->senderData = req.senderData;
  ref->senderRef = reference();
  ref->errorCode = err;
  sendSignal(req.senderRef, GSN_DIH_GET_TABINFO_REF, signal,
             DihGetTabInfoRef::SignalLength, JBB);
}
19464 
/**
 * CONTINUEB(ZGET_TABINFO) handler: once the table's copy status is
 * idle, claim it (CS_GET_TABINFO) and trigger packing of the table
 * description into pages (CONTINUEB(ZPACK_TABLE_INTO_PAGES));
 * otherwise retry after a short delay.
 */
void
Dbdih::getTabInfo(Signal* signal)
{
  TabRecordPtr tabPtr;
  tabPtr.i = signal->theData[1];
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

  if (tabPtr.p->tabCopyStatus != TabRecord::CS_IDLE)
  {
    jam();
    // Table data structures busy: retry the same request later.
    signal->theData[0] = DihContinueB::ZGET_TABINFO;
    signal->theData[1] = tabPtr.i;
    sendSignalWithDelay(reference(),
                        GSN_CONTINUEB,
                        signal,
                        WaitTableStateChangeMillis,
                        signal->length());
    return;
  }

  tabPtr.p->tabCopyStatus  = TabRecord::CS_GET_TABINFO;

  signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
  signal->theData[1] = tabPtr.i;
  sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
}
19491 
/**
 * Copy ctn.noOfWords words of packed table description from the
 * table's page chain into a segmented section, 2048 words (one page's
 * payload) at a time.
 *
 * @param ptr  [out] receives the filled segmented section
 * @param ctn  copy descriptor (table, starting page index, word count)
 * @return 0 (allocation failures trip ndbrequire)
 */
int
Dbdih::getTabInfo_copyTableToSection(SegmentedSectionPtr & ptr,
                                     CopyTableNode ctn)
{
  PageRecordPtr pagePtr;
  pagePtr.i = ctn.ctnTabPtr.p->pageRef[0];
  ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);

  while (ctn.noOfWords > 2048)
  {
    jam();
    // Import one full page and advance to the next one.
    ndbrequire(import(ptr, pagePtr.p->word, 2048));
    ctn.noOfWords -= 2048;

    ctn.pageIndex++;
    pagePtr.i = ctn.ctnTabPtr.p->pageRef[ctn.pageIndex];
    ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
  }

  // Import the final (possibly partial) page.
  ndbrequire(import(ptr, pagePtr.p->word, ctn.noOfWords));
  return 0;
}
19514 
/**
 * Inverse of getTabInfo_copyTableToSection: unpack a segmented section
 * into freshly allocated pages appended to the table's page chain,
 * 2048 words per page.
 *
 * @param tabPtr  table receiving the pages (noPages is advanced)
 * @param ptr     section holding the packed table description
 * @return 0 (read failures trip ndbrequire)
 */
int
Dbdih::getTabInfo_copySectionToPages(TabRecordPtr tabPtr,
                                     SegmentedSectionPtr ptr)
{
  jam();
  Uint32 sz = ptr.sz;
  SectionReader reader(ptr, getSectionSegmentPool());

  while (sz)
  {
    jam();
    PageRecordPtr pagePtr;
    allocpage(pagePtr);
    tabPtr.p->pageRef[tabPtr.p->noPages] = pagePtr.i;
    tabPtr.p->noPages++;

    // Fill at most one page's worth (2048 words) per iteration.
    Uint32 len = sz > 2048 ? 2048 : sz;
    ndbrequire(reader.getWords(pagePtr.p->word, len));
    sz -= len;
  }
  return 0;
}
19537 
/**
 * Serve the next queued DIH_GET_TABINFO request for tabPtr: copy the
 * packed table description (word 34 of page 0 holds its length) into a
 * segmented section and send it as a fragmented
 * DIH_GET_TABINFO_CONF.  getTabInfo_sendComplete() is invoked via
 * callback when the send finishes.  With an empty queue the table's
 * copy status is released back to CS_IDLE.
 */
void
Dbdih::getTabInfo_send(Signal* signal,
                       TabRecordPtr tabPtr)
{
  ndbrequire(tabPtr.p->tabCopyStatus == TabRecord::CS_GET_TABINFO);

  ConnectRecordPtr connectPtr;
  connectPtr.i = tabPtr.p->connectrec;

  /**
   * Done
   */
  if (connectPtr.i == RNIL)
  {
    jam();
    tabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
    return;
  }

  ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);

  ndbrequire(connectPtr.p->connectState == ConnectRecord::GET_TABINFO);
  ndbrequire(connectPtr.p->table == tabPtr.i);

  /**
   * Copy into segmented sections here...
   * NOTE: A GenericSectionIterator would be nice inside kernel too
   *  or having a pack-method that writes directly into SegmentedSection
   */
  PageRecordPtr pagePtr;
  pagePtr.i = tabPtr.p->pageRef[0];
  ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
  Uint32 words = pagePtr.p->word[34];

  CopyTableNode ctn;
  ctn.ctnTabPtr = tabPtr;
  ctn.pageIndex = 0;
  ctn.wordIndex = 0;
  ctn.noOfWords = words;

  SegmentedSectionPtr ptr;
  ndbrequire(getTabInfo_copyTableToSection(ptr, ctn) == 0);

  // Completion callback carries the connect record index.
  Callback cb = { safe_cast(&Dbdih::getTabInfo_sendComplete), connectPtr.i };

  SectionHandle handle(this, signal);
  handle.m_ptr[0] = ptr;
  handle.m_cnt = 1;

  DihGetTabInfoConf* conf = (DihGetTabInfoConf*)signal->getDataPtrSend();
  conf->senderData = connectPtr.p->userpointer;
  conf->senderRef = reference();
  sendFragmentedSignal(connectPtr.p->userblockref, GSN_DIH_GET_TABINFO_CONF, signal,
                       DihGetTabInfoConf::SignalLength, JBB, &handle, cb);
}
19593 
void
Dbdih::getTabInfo_sendComplete(Signal * signal,
                               Uint32 senderData,
                               Uint32 retVal)
{
  /**
   * Callback invoked when the fragmented DIH_GET_TABINFO_CONF sent by
   * getTabInfo_send() has been fully transmitted.
   * senderData is the i-value of the connect record just served.
   */
  ndbrequire(retVal == 0);

  ConnectRecordPtr connectPtr;
  connectPtr.i = senderData;
  ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);

  ndbrequire(connectPtr.p->connectState == ConnectRecord::GET_TABINFO);

  TabRecordPtr tabPtr;
  tabPtr.i = connectPtr.p->table;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  // Unlink the served request from the table. NOTE(review): nextPool
  // appears to chain the queued GET_TABINFO connect records (RNIL when
  // none remain) — confirm against the enqueue side.
  tabPtr.p->connectrec = connectPtr.p->nextPool;

  // Re-trigger getTabInfo_send() for the next queued request (or it
  // will set the table back to CS_IDLE if the queue is empty).
  signal->theData[0] = DihContinueB::ZGET_TABINFO_SEND;
  signal->theData[1] = tabPtr.i;
  sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);

  // Release only after the table's head pointer was moved past this
  // record.
  release_connect(connectPtr);
}
19618 
19619 void
resetReplicaSr(TabRecordPtr tabPtr)19620 Dbdih::resetReplicaSr(TabRecordPtr tabPtr){
19621 
19622   const Uint32 newestRestorableGCI = SYSFILE->newestRestorableGCI;
19623 
19624   for(Uint32 i = 0; i<tabPtr.p->totalfragments; i++)
19625   {
19626     jam();
19627     FragmentstorePtr fragPtr;
19628     getFragstore(tabPtr.p, i, fragPtr);
19629 
19630     /**
19631      * During SR restart distributionKey from 0
19632      */
19633     fragPtr.p->distributionKey = 0;
19634 
19635     /**
19636      * 1) Start by moving all replicas into oldStoredReplicas
19637      */
19638     prepareReplicas(fragPtr);
19639 
19640     /**
19641      * 2) Move all "alive" replicas into storedReplicas
19642      *    + update noCrashedReplicas...
19643      */
19644     ReplicaRecordPtr replicaPtr;
19645     replicaPtr.i = fragPtr.p->oldStoredReplicas;
19646     while (replicaPtr.i != RNIL)
19647     {
19648       jam();
19649       c_replicaRecordPool.getPtr(replicaPtr);
19650 
19651       /**
19652        * invalidate LCP's not usable
19653        */
19654       resetReplica(replicaPtr);
19655 
19656       const Uint32 nextReplicaPtrI = replicaPtr.p->nextPool;
19657 
19658       NodeRecordPtr nodePtr;
19659       nodePtr.i = replicaPtr.p->procNode;
19660       ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
19661 
19662       const Uint32 noCrashedReplicas = replicaPtr.p->noCrashedReplicas;
19663 
19664       if (nodePtr.p->nodeStatus == NodeRecord::ALIVE)
19665       {
19666 	jam();
19667         jamLine(Uint16(nodePtr.i));
19668 	switch (nodePtr.p->activeStatus) {
19669 	case Sysfile::NS_Active:
19670 	case Sysfile::NS_ActiveMissed_1:
19671 	case Sysfile::NS_ActiveMissed_2:{
19672 	  jam();
19673 	  /* --------------------------------------------------------------- */
19674 	  /* THE NODE IS ALIVE AND KICKING AND ACTIVE, LET'S USE IT.         */
19675 	  /* --------------------------------------------------------------- */
19676 	  arrGuardErr(noCrashedReplicas, MAX_CRASHED_REPLICAS, NDBD_EXIT_MAX_CRASHED_REPLICAS);
19677 
19678           // Create new crashed replica
19679           newCrashedReplica(replicaPtr);
19680 
19681           // Create a new redo-interval
19682           Uint32 nextCrashed = replicaPtr.p->noCrashedReplicas;
19683           replicaPtr.p->createGci[nextCrashed] = newestRestorableGCI + 1;
19684           replicaPtr.p->replicaLastGci[nextCrashed] = ZINIT_REPLICA_LAST_GCI;
19685 
19686           // merge
19687           mergeCrashedReplicas(replicaPtr);
19688 
19689 	  resetReplicaLcp(replicaPtr.p, newestRestorableGCI);
19690 
19691 	  /**
19692 	   * Make sure we can also find REDO for restoring replica...
19693 	   */
19694 	  {
19695 	    CreateReplicaRecord createReplica;
19696 	    if (tabPtr.p->tabStorage != TabRecord::ST_NORMAL ||
19697 		setup_create_replica(fragPtr,
19698 				     &createReplica, replicaPtr))
19699 	    {
19700 	      jam();
19701 	      removeOldStoredReplica(fragPtr, replicaPtr);
19702 	      linkStoredReplica(fragPtr, replicaPtr);
19703 	    }
19704 	    else
19705 	    {
19706 	      jam();
19707 	      g_eventLogger->info("Forcing take-over of node %d due to insufficient REDO"
19708 			" for tab(%u,%u)",
19709 			nodePtr.i, tabPtr.i, fragPtr.p->fragId);
19710 	      infoEvent("Forcing take-over of node %d due to insufficient REDO"
19711 			" for tab(%u,%u)",
19712 			nodePtr.i, tabPtr.i, fragPtr.p->fragId);
19713 
19714               m_sr_nodes.clear(nodePtr.i);
19715               m_to_nodes.set(nodePtr.i);
19716 	      setNodeActiveStatus(nodePtr.i,
19717 				  Sysfile::NS_NotActive_NotTakenOver);
19718 	    }
19719 	  }
19720           break;
19721 	}
19722         default:
19723 	  jam();
19724 	  /*empty*/;
19725 	  break;
19726 	}
19727       }
19728       replicaPtr.i = nextReplicaPtrI;
19729     }//while
19730     if (fragPtr.p->storedReplicas == RNIL)
19731     {
19732       // This should have been caught in Dbdih::execDIH_RESTARTREQ
19733 #ifdef ERROR_INSERT
19734       // Extra printouts for debugging
19735       g_eventLogger->info("newestRestorableGCI %u", newestRestorableGCI);
19736       ReplicaRecordPtr replicaPtr;
19737       replicaPtr.i = fragPtr.p->oldStoredReplicas;
19738       while (replicaPtr.i != RNIL)
19739       {
19740         c_replicaRecordPool.getPtr(replicaPtr);
19741         g_eventLogger->info("[1/3] frag %u, replica %u @%p, SYSFILE @%p",
19742           fragPtr.i, replicaPtr.i, replicaPtr.p, SYSFILE);
19743         g_eventLogger->info("[2/3] frag %u, replica %u, node %u, replicaLastGci %u,%u",
19744           fragPtr.i, replicaPtr.i, replicaPtr.p->procNode,
19745           replicaPtr.p->replicaLastGci[0], replicaPtr.p->replicaLastGci[1]);
19746         ndbrequire(replicaPtr.p->procNode < MAX_NDB_NODES)
19747         g_eventLogger->info("[3/3] frag %u, replica %u, node %u, lastCompletedGCI %u",
19748           fragPtr.i, replicaPtr.i, replicaPtr.p->procNode,
19749           SYSFILE->lastCompletedGCI[replicaPtr.p->procNode]);
19750         replicaPtr.i = replicaPtr.p->nextPool;
19751       }
19752 #endif
19753       char buf[255];
19754       BaseString::snprintf
19755         (buf, sizeof(buf),
19756          "Nodegroup %u has not enough data on disk for restart.", i);
19757       progError(__LINE__,
19758                 NDBD_EXIT_INSUFFICENT_NODES,
19759                 buf);
19760     }
19761     updateNodeInfo(fragPtr);
19762   }
19763 }
19764 
19765 void
resetReplica(ReplicaRecordPtr readReplicaPtr)19766 Dbdih::resetReplica(ReplicaRecordPtr readReplicaPtr)
19767 {
19768   Uint32 i;
19769   /* ---------------------------------------------------------------------- */
19770   /*       IF THE LAST COMPLETED LOCAL CHECKPOINT IS VALID AND LARGER THAN  */
19771   /*       THE LAST COMPLETED CHECKPOINT THEN WE WILL INVALIDATE THIS LOCAL */
19772   /*       CHECKPOINT FOR THIS REPLICA.                                     */
19773   /* ---------------------------------------------------------------------- */
19774   for (i = 0; i < MAX_LCP_STORED; i++)
19775   {
19776     jam();
19777     if (readReplicaPtr.p->lcpStatus[i] == ZVALID &&
19778         readReplicaPtr.p->lcpId[i] > SYSFILE->latestLCP_ID)
19779     {
19780       jam();
19781       readReplicaPtr.p->lcpStatus[i] = ZINVALID;
19782     }
19783   }
19784 
19785   /* ---------------------------------------------------------------------- */
19786   /*       WE ALSO HAVE TO INVALIDATE ANY LOCAL CHECKPOINTS THAT HAVE BEEN  */
19787   /*       INVALIDATED BY MOVING BACK THE RESTART GCI.                      */
19788   /* ---------------------------------------------------------------------- */
19789   Uint32 lastCompletedGCI = SYSFILE->newestRestorableGCI;
19790   for (i = 0; i < MAX_LCP_STORED; i++)
19791   {
19792     jam();
19793     if (readReplicaPtr.p->lcpStatus[i] == ZVALID &&
19794         readReplicaPtr.p->maxGciStarted[i] > lastCompletedGCI)
19795     {
19796       jam();
19797       readReplicaPtr.p->lcpStatus[i] = ZINVALID;
19798     }
19799   }
19800 
19801   /* ---------------------------------------------------------------------- */
19802   /*       WE WILL REMOVE ANY OCCURRENCES OF REPLICAS THAT HAVE CRASHED     */
19803   /*       THAT ARE NO LONGER VALID DUE TO MOVING RESTART GCI BACKWARDS.    */
19804   /* ---------------------------------------------------------------------- */
19805   removeTooNewCrashedReplicas(readReplicaPtr, lastCompletedGCI);
19806 
19807   /**
19808    * Don't remove crashed replicas here,
19809    *   as 1) this will disable optimized NR
19810    *         if oldestRestorableGCI > GCI needed for local LCP's
19811    *      2) This is anyway done during LCP, which will be run during SR
19812    */
19813   //removeOldCrashedReplicas(readReplicaPtr);
19814 
19815   /* ---------------------------------------------------------------------- */
19816   /*       FIND PROCESSOR RECORD                                            */
19817   /* ---------------------------------------------------------------------- */
19818 }
19819 
19820 void
resetReplicaLcp(ReplicaRecord * replicaP,Uint32 stopGci)19821 Dbdih::resetReplicaLcp(ReplicaRecord * replicaP, Uint32 stopGci){
19822 
19823   Uint32 lcpNo = replicaP->nextLcp;
19824   const Uint32 startLcpNo = lcpNo;
19825   do {
19826     lcpNo = prevLcpNo(lcpNo);
19827     ndbrequire(lcpNo < MAX_LCP_STORED);
19828     if (replicaP->lcpStatus[lcpNo] == ZVALID)
19829     {
19830       if (replicaP->maxGciStarted[lcpNo] <= stopGci)
19831       {
19832         jam();
19833         jamLine(Uint16(lcpNo));
19834 	/* ----------------------------------------------------------------- */
19835 	/*   WE HAVE FOUND A USEFUL LOCAL CHECKPOINT THAT CAN BE USED FOR    */
19836 	/*   RESTARTING THIS FRAGMENT REPLICA.                               */
19837 	/* ----------------------------------------------------------------- */
19838         return ;
19839       }//if
19840     }//if
19841     jam();
19842     jamLine(Uint16(lcpNo));
19843     /**
19844      * WE COULD  NOT USE THIS LOCAL CHECKPOINT. IT WAS TOO
19845      * RECENT OR SIMPLY NOT A VALID CHECKPOINT.
19846      * WE SHOULD THUS REMOVE THIS LOCAL CHECKPOINT SINCE IT WILL NEVER
19847      * AGAIN BE USED. SET LCP_STATUS TO INVALID.
19848      */
19849     replicaP->nextLcp = lcpNo;
19850     replicaP->lcpId[lcpNo] = 0;
19851     replicaP->lcpStatus[lcpNo] = ZINVALID;
19852   } while (lcpNo != startLcpNo);
19853 
19854   replicaP->nextLcp = 0;
19855 }
19856 
readingTableErrorLab(Signal * signal,FileRecordPtr filePtr)19857 void Dbdih::readingTableErrorLab(Signal* signal, FileRecordPtr filePtr)
19858 {
19859   TabRecordPtr tabPtr;
19860   tabPtr.i = filePtr.p->tabRef;
19861   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
19862   /* ---------------------------------------------------------------------- */
19863   /*    READING THIS FILE FAILED. CLOSE IT AFTER RELEASING ALL PAGES.       */
19864   /* ---------------------------------------------------------------------- */
19865   ndbrequire(tabPtr.p->noPages <= NDB_ARRAY_SIZE(tabPtr.p->pageRef));
19866   for (Uint32 i = 0; i < tabPtr.p->noPages; i++) {
19867     jam();
19868     releasePage(tabPtr.p->pageRef[i]);
19869   }//for
19870   closeFile(signal, filePtr);
19871   filePtr.p->reqStatus = FileRecord::CLOSING_TABLE_CRASH;
19872   return;
19873 }//Dbdih::readingTableErrorLab()
19874 
closingTableCrashLab(Signal * signal,FileRecordPtr filePtr)19875 void Dbdih::closingTableCrashLab(Signal* signal, FileRecordPtr filePtr)
19876 {
19877   TabRecordPtr tabPtr;
19878   /* ---------------------------------------------------------------------- */
19879   /*    WE HAVE NOW CLOSED A FILE WHICH WE HAD A READ ERROR WITH. PROCEED   */
19880   /*    WITH NEXT FILE IF NOT THE LAST OTHERWISE REPORT ERROR.              */
19881   /* ---------------------------------------------------------------------- */
19882   tabPtr.i = filePtr.p->tabRef;
19883   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
19884   ndbrequire(filePtr.i == tabPtr.p->tabFile[0]);
19885   filePtr.i = tabPtr.p->tabFile[1];
19886   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
19887   openFileRw(signal, filePtr);
19888   filePtr.p->reqStatus = FileRecord::OPENING_TABLE;
19889 }//Dbdih::closingTableCrashLab()
19890 
19891 /*****************************************************************************/
19892 /* **********     COPY TABLE MODULE                              *************/
19893 /*****************************************************************************/
void Dbdih::execCOPY_TABREQ(Signal* signal)
{
  /**
   * COPY_TABREQ: the master sends us (a non-master participant) the
   * packed table description, 16 data words per signal (starting at
   * theData[5]). The words are appended to the table's pages; when the
   * last signal arrives (noOfWords <= 16) unpacking is started via
   * CONTINUEB(ZREAD_PAGES_INTO_TABLE).
   */
  CopyTabReq *req = (CopyTabReq*) &signal->theData[0];
  CRASH_INSERTION(7172);

  TabRecordPtr tabPtr;
  PageRecordPtr pagePtr;
  jamEntry();
  BlockReference ref = req->senderRef;
  Uint32 reqinfo = req->reqinfo;
  tabPtr.i = req->tableId;
  Uint32 schemaVersion = req->tableSchemaVersion;
  Uint32 noOfWords = req->noOfWords;
  // Only the master may send this, and we must not be the master.
  ndbrequire(ref == cmasterdihref);
  ndbrequire(!isMaster());
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  // reqinfo == 1 marks the first signal of a table copy: initialize
  // the table state before receiving data.
  if (reqinfo == 1)
  {
    jam();
    tabPtr.p->schemaVersion = schemaVersion;
    initTableFile(tabPtr);

    /**
     * We need to set up the state of whether the table is actively writing
     * an LCP still. We can derive the state on replicas and fragments for
     * the LCP with the information that we get in the table by knowing the
     * currently executing LCP id. We also get the current LCP id from the
     * master here to ensure that we're up to date with this value.
     */
    c_lcp_id_while_copy_meta_data = req->currentLcpId;
    {
      if (req->tabLcpStatus == CopyTabReq::LcpCompleted)
      {
        jam();
        tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED;
      }
      else
      {
        jam();
        ndbrequire(req->tabLcpStatus == CopyTabReq::LcpActive);
        tabPtr.p->tabLcpStatus = TabRecord::TLS_ACTIVE;
      }
    }
  }//if
  ndbrequire(tabPtr.p->noPages < NDB_ARRAY_SIZE(tabPtr.p->pageRef));
  // tabPtr.p->noOfWords is the word offset within the current page.
  // Offset 0 means the previous page was exactly filled (or this is the
  // first data): allocate a new page; otherwise keep filling the last.
  if (tabPtr.p->noOfWords == 0) {
    jam();
    allocpage(pagePtr);
    tabPtr.p->pageRef[tabPtr.p->noPages] = pagePtr.i;
    tabPtr.p->noPages++;
  } else {
    jam();
    pagePtr.i = tabPtr.p->pageRef[tabPtr.p->noPages - 1];
    ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
  }//if
  // The 16 copied words must fit inside the 2048-word page.
  ndbrequire(tabPtr.p->noOfWords + 15 < 2048);
  ndbrequire(tabPtr.p->noOfWords < 2048);
  MEMCOPY_NO_WORDS(&pagePtr.p->word[tabPtr.p->noOfWords], &signal->theData[5], 16);
  tabPtr.p->noOfWords += 16;
  // Page full: the next signal will start a fresh page.
  if (tabPtr.p->noOfWords == 2048) {
    jam();
    tabPtr.p->noOfWords = 0;
  }//if
  // More than 16 words remaining => further COPY_TABREQ signals follow.
  if (noOfWords > 16) {
    jam();
    return;
  }//if
  // Last signal received: start unpacking the table from the pages.
  tabPtr.p->noOfWords = 0;
  ndbrequire(tabPtr.p->tabCopyStatus == TabRecord::CS_IDLE);
  tabPtr.p->tabCopyStatus = TabRecord::CS_COPY_TAB_REQ;
  signal->theData[0] = DihContinueB::ZREAD_PAGES_INTO_TABLE;
  signal->theData[1] = tabPtr.i;
  sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
}//Dbdih::execCOPY_TABREQ()
19968 
19969 void
copyTabReq_complete(Signal * signal,TabRecordPtr tabPtr)19970 Dbdih::copyTabReq_complete(Signal* signal, TabRecordPtr tabPtr){
19971   if (!isMaster()) {
19972     jam();
19973     //----------------------------------------------------------------------------
19974     // In this particular case we do not release table pages if we are master. The
19975     // reason is that the master could still be sending the table info to another
19976     // node.
19977     //----------------------------------------------------------------------------
19978     releaseTabPages(tabPtr.i);
19979 
19980     /**
19981      * No need to protect these changes as they occur while recovery is ongoing
19982      * and DBTC hasn't started using these tables yet.
19983      */
19984     tabPtr.p->tabStatus = TabRecord::TS_ACTIVE;
19985     for (Uint32 fragId = 0; fragId < tabPtr.p->totalfragments; fragId++) {
19986       jam();
19987       FragmentstorePtr fragPtr;
19988       getFragstore(tabPtr.p, fragId, fragPtr);
19989       updateNodeInfo(fragPtr);
19990     }//for
19991   }//if
19992   c_lcp_id_while_copy_meta_data = RNIL;
19993   CopyTabConf *conf = (CopyTabConf*) signal->getDataPtrSend();
19994   conf->nodeId = getOwnNodeId();
19995   conf->tableId = tabPtr.i;
19996   sendSignal(cmasterdihref, GSN_COPY_TABCONF, signal,
19997              CopyTabConf::SignalLength, JBB);
19998 }
19999 
20000 /*****************************************************************************/
20001 /* ******  READ FROM A NUMBER OF PAGES INTO THE TABLE DATA STRUCTURES ********/
20002 /*****************************************************************************/
readPagesIntoTableLab(Signal * signal,Uint32 tableId)20003 void Dbdih::readPagesIntoTableLab(Signal* signal, Uint32 tableId)
20004 {
20005   /**
20006    * No need to protect these changes, they are only occurring during
20007    * recovery when DBTC hasn't accessibility to the table yet.
20008    */
20009   RWFragment rf;
20010   rf.wordIndex = 35;
20011   rf.pageIndex = 0;
20012   rf.rwfTabPtr.i = tableId;
20013   ptrCheckGuard(rf.rwfTabPtr, ctabFileSize, tabRecord);
20014   rf.rwfPageptr.i = rf.rwfTabPtr.p->pageRef[0];
20015   ptrCheckGuard(rf.rwfPageptr, cpageFileSize, pageRecord);
20016   rf.rwfTabPtr.p->totalfragments = readPageWord(&rf);
20017   rf.rwfTabPtr.p->noOfBackups = readPageWord(&rf);
20018   rf.rwfTabPtr.p->hashpointer = readPageWord(&rf);
20019   rf.rwfTabPtr.p->kvalue = readPageWord(&rf);
20020   rf.rwfTabPtr.p->mask = readPageWord(&rf);
20021   rf.rwfTabPtr.p->method = (TabRecord::Method)readPageWord(&rf);
20022   /* ------------- */
20023   /* Type of table */
20024   /* ------------- */
20025   rf.rwfTabPtr.p->tabStorage = (TabRecord::Storage)(readPageWord(&rf));
20026   rf.rwfTabPtr.p->tabActiveLcpFragments = 0;
20027 
20028   Uint32 noOfFrags = rf.rwfTabPtr.p->totalfragments;
20029   ndbrequire(noOfFrags > 0);
20030   ndbrequire((noOfFrags * (rf.rwfTabPtr.p->noOfBackups + 1)) <= cnoFreeReplicaRec);
20031   allocFragments(noOfFrags, rf.rwfTabPtr);
20032 
20033   signal->theData[0] = DihContinueB::ZREAD_PAGES_INTO_FRAG;
20034   signal->theData[1] = rf.rwfTabPtr.i;
20035   signal->theData[2] = 0;
20036   signal->theData[3] = rf.pageIndex;
20037   signal->theData[4] = rf.wordIndex;
20038   sendSignal(reference(), GSN_CONTINUEB, signal, 5, JBB);
20039   return;
20040 }//Dbdih::readPagesIntoTableLab()
20041 
readPagesIntoFragLab(Signal * signal,RWFragment * rf)20042 void Dbdih::readPagesIntoFragLab(Signal* signal, RWFragment* rf)
20043 {
20044   ndbrequire(rf->pageIndex < NDB_ARRAY_SIZE(rf->rwfTabPtr.p->pageRef));
20045   rf->rwfPageptr.i = rf->rwfTabPtr.p->pageRef[rf->pageIndex];
20046   ptrCheckGuard(rf->rwfPageptr, cpageFileSize, pageRecord);
20047   FragmentstorePtr fragPtr;
20048   getFragstore(rf->rwfTabPtr.p, rf->fragId, fragPtr);
20049   readFragment(rf, fragPtr);
20050   readReplicas(rf, rf->rwfTabPtr.p, fragPtr);
20051   rf->fragId++;
20052   if (rf->fragId == rf->rwfTabPtr.p->totalfragments) {
20053     jam();
20054     switch (rf->rwfTabPtr.p->tabCopyStatus) {
20055     case TabRecord::CS_SR_PHASE1_READ_PAGES:
20056       jam();
20057       releaseTabPages(rf->rwfTabPtr.i);
20058       rf->rwfTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
20059       signal->theData[0] = DihContinueB::ZREAD_TABLE_FROM_PAGES;
20060       signal->theData[1] = rf->rwfTabPtr.i;
20061       sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
20062       return;
20063       break;
20064     case TabRecord::CS_COPY_TAB_REQ:
20065       jam();
20066       rf->rwfTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
20067       if (getNodeState().getSystemRestartInProgress() &&
20068           rf->rwfTabPtr.p->tabStorage == TabRecord::ST_NORMAL)
20069       {
20070         /**
20071          * avoid overwriting own table-definition...
20072          *   but this is not possible for no-logging tables
20073          */
20074 	jam();
20075 	copyTabReq_complete(signal, rf->rwfTabPtr);
20076 	return;
20077       }
20078       rf->rwfTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
20079       rf->rwfTabPtr.p->tabUpdateState = TabRecord::US_COPY_TAB_REQ;
20080       signal->theData[0] = DihContinueB::ZTABLE_UPDATE;
20081       signal->theData[1] = rf->rwfTabPtr.i;
20082       sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
20083       return;
20084       break;
20085     default:
20086       ndbabort();
20087       return;
20088     }//switch
20089   } else {
20090     jam();
20091     signal->theData[0] = DihContinueB::ZREAD_PAGES_INTO_FRAG;
20092     signal->theData[1] = rf->rwfTabPtr.i;
20093     signal->theData[2] = rf->fragId;
20094     signal->theData[3] = rf->pageIndex;
20095     signal->theData[4] = rf->wordIndex;
20096     sendSignal(reference(), GSN_CONTINUEB, signal, 5, JBB);
20097   }//if
20098   return;
20099 }//Dbdih::readPagesIntoFragLab()
20100 
20101 /*****************************************************************************/
20102 /*****   WRITING FROM TABLE DATA STRUCTURES INTO A SET OF PAGES         ******/
20103 // execCONTINUEB(ZPACK_TABLE_INTO_PAGES)
20104 /*****************************************************************************/
packTableIntoPagesLab(Signal * signal,Uint32 tableId)20105 void Dbdih::packTableIntoPagesLab(Signal* signal, Uint32 tableId)
20106 {
20107   RWFragment wf;
20108   TabRecordPtr tabPtr;
20109   allocpage(wf.rwfPageptr);
20110   tabPtr.i = tableId;
20111   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
20112   tabPtr.p->pageRef[0] = wf.rwfPageptr.i;
20113   tabPtr.p->noPages = 1;
20114   wf.wordIndex = 35;
20115   wf.pageIndex = 0;
20116   Uint32 totalfragments = tabPtr.p->totalfragments;
20117   if (tabPtr.p->connectrec != RNIL)
20118   {
20119     jam();
20120     Ptr<ConnectRecord> connectPtr;
20121     connectPtr.i = tabPtr.p->connectrec;
20122     ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
20123     ndbrequire(connectPtr.p->table == tabPtr.i);
20124     if (connectPtr.p->connectState == ConnectRecord::ALTER_TABLE)
20125     {
20126       jam();
20127       totalfragments = connectPtr.p->m_alter.m_totalfragments;
20128     }
20129   }
20130 
20131   writePageWord(&wf, totalfragments);
20132   writePageWord(&wf, tabPtr.p->noOfBackups);
20133   writePageWord(&wf, tabPtr.p->hashpointer);
20134   writePageWord(&wf, tabPtr.p->kvalue);
20135   writePageWord(&wf, tabPtr.p->mask);
20136   writePageWord(&wf, tabPtr.p->method);
20137   writePageWord(&wf, tabPtr.p->tabStorage);
20138 
20139   signal->theData[0] = DihContinueB::ZPACK_FRAG_INTO_PAGES;
20140   signal->theData[1] = tabPtr.i;
20141   signal->theData[2] = 0;
20142   signal->theData[3] = wf.pageIndex;
20143   signal->theData[4] = wf.wordIndex;
20144   signal->theData[5] = totalfragments;
20145   sendSignal(reference(), GSN_CONTINUEB, signal, 6, JBB);
20146 }//Dbdih::packTableIntoPagesLab()
20147 
20148 /*****************************************************************************/
20149 // execCONTINUEB(ZPACK_FRAG_INTO_PAGES)
20150 /*****************************************************************************/
void Dbdih::packFragIntoPagesLab(Signal* signal, RWFragment* wf)
{
  /**
   * Pack one fragment and its replica lists (stored + old stored) into
   * the table pages, one fragment per CONTINUEB(ZPACK_FRAG_INTO_PAGES)
   * round. When the last fragment is done, store the page/word totals
   * in the first page and dispatch on why the table was packed
   * (tabCopyStatus).
   */
  ndbrequire(wf->pageIndex < NDB_ARRAY_SIZE(wf->rwfTabPtr.p->pageRef));
  wf->rwfPageptr.i = wf->rwfTabPtr.p->pageRef[wf->pageIndex];
  ptrCheckGuard(wf->rwfPageptr, cpageFileSize, pageRecord);
  FragmentstorePtr fragPtr;
  getFragstore(wf->rwfTabPtr.p, wf->fragId, fragPtr);
  writeFragment(wf, fragPtr);
  writeReplicas(wf, fragPtr.p->storedReplicas);
  writeReplicas(wf, fragPtr.p->oldStoredReplicas);
  wf->fragId++;
  if (wf->fragId == wf->totalfragments) {
    jam();
    // All fragments packed: word 33 = number of pages used,
    // word 34 = total number of packed words.
    PageRecordPtr pagePtr;
    pagePtr.i = wf->rwfTabPtr.p->pageRef[0];
    ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
    pagePtr.p->word[33] = wf->rwfTabPtr.p->noPages;
    pagePtr.p->word[34] = ((wf->rwfTabPtr.p->noPages - 1) * 2048) + wf->wordIndex;
    switch (wf->rwfTabPtr.p->tabCopyStatus) {
    case TabRecord::CS_SR_PHASE2_READ_TABLE:
      /* -------------------------------------------------------------------*/
      // We are performing a system restart and we are now ready to copy the
      // table from this node (the master) to all other nodes.
      /* -------------------------------------------------------------------*/
      jam();
      wf->rwfTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
      signal->theData[0] = DihContinueB::ZSR_PHASE2_READ_TABLE;
      signal->theData[1] = wf->rwfTabPtr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
      return;
      break;
    case TabRecord::CS_COPY_NODE_STATE:
      jam();
      // Continue distributing this table to other nodes.
      tableCopyNodeLab(signal, wf->rwfTabPtr);
      return;
      break;
    case TabRecord::CS_LCP_READ_TABLE:
      jam();
      // Packed as part of LCP handling: write the table to file.
      signal->theData[0] = DihContinueB::ZTABLE_UPDATE;
      signal->theData[1] = wf->rwfTabPtr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
      return;
      break;
    case TabRecord::CS_REMOVE_NODE:
    case TabRecord::CS_INVALIDATE_NODE_LCP:
      jam();
      // Node removal / LCP invalidation: persist the updated table.
      signal->theData[0] = DihContinueB::ZTABLE_UPDATE;
      signal->theData[1] = wf->rwfTabPtr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
      return;
      break;
    case TabRecord::CS_ADD_TABLE_MASTER:
      jam();
      wf->rwfTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
      signal->theData[0] = DihContinueB::ZADD_TABLE_MASTER_PAGES;
      signal->theData[1] = wf->rwfTabPtr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
      return;
      break;
    case TabRecord::CS_ADD_TABLE_SLAVE:
      jam();
      wf->rwfTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
      signal->theData[0] = DihContinueB::ZADD_TABLE_SLAVE_PAGES;
      signal->theData[1] = wf->rwfTabPtr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
      return;
    case TabRecord::CS_COPY_TO_SAVE:
      // NOTE(review): unlike the sibling cases this one has no jam() —
      // presumably an oversight; affects trace output only.
      signal->theData[0] = DihContinueB::ZTABLE_UPDATE;
      signal->theData[1] = wf->rwfTabPtr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
      return;
    case TabRecord::CS_GET_TABINFO:
      jam();
      // Serve queued DIH_GET_TABINFO requests with the packed data.
      signal->theData[0] = DihContinueB::ZGET_TABINFO_SEND;
      signal->theData[1] = wf->rwfTabPtr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
      return;
    default:
      ndbabort();
      return;
    }//switch
  } else {
    jam();
    // More fragments to pack: reschedule with the current position.
    signal->theData[0] = DihContinueB::ZPACK_FRAG_INTO_PAGES;
    signal->theData[1] = wf->rwfTabPtr.i;
    signal->theData[2] = wf->fragId;
    signal->theData[3] = wf->pageIndex;
    signal->theData[4] = wf->wordIndex;
    signal->theData[5] = wf->totalfragments;
    sendSignal(reference(), GSN_CONTINUEB, signal, 6, JBB);
  }//if
  return;
}//Dbdih::packFragIntoPagesLab()
20244 
20245 /*****************************************************************************/
20246 /* **********     START FRAGMENT MODULE                          *************/
20247 /*****************************************************************************/
20248 void
dump_replica_info()20249 Dbdih::dump_replica_info()
20250 {
20251   TabRecordPtr tabPtr;
20252   FragmentstorePtr fragPtr;
20253 
20254   for(tabPtr.i = 0; tabPtr.i < ctabFileSize; tabPtr.i++)
20255   {
20256     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
20257     if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE)
20258       continue;
20259 
20260     for(Uint32 fid = 0; fid<tabPtr.p->totalfragments; fid++)
20261     {
20262       getFragstore(tabPtr.p, fid, fragPtr);
20263       ndbout_c("tab: %d frag: %d gci: %d\n",
20264 	       tabPtr.i, fid, SYSFILE->newestRestorableGCI);
20265 
20266       dump_replica_info(fragPtr.p);
20267     }
20268   }
20269 }
20270 
20271 void
dump_replica_info(const Fragmentstore * fragPtrP)20272 Dbdih::dump_replica_info(const Fragmentstore* fragPtrP)
20273 {
20274   ndbout_c("  -- storedReplicas: ");
20275   Uint32 i;
20276   ReplicaRecordPtr replicaPtr;
20277   replicaPtr.i = fragPtrP->storedReplicas;
20278   for(; replicaPtr.i != RNIL; replicaPtr.i = replicaPtr.p->nextPool)
20279   {
20280     c_replicaRecordPool.getPtr(replicaPtr);
20281     ndbout_c("  node: %d initialGci: %d nextLcp: %d noCrashedReplicas: %d",
20282              replicaPtr.p->procNode,
20283              replicaPtr.p->initialGci,
20284              replicaPtr.p->nextLcp,
20285              replicaPtr.p->noCrashedReplicas);
20286     for(i = 0; i<MAX_LCP_STORED; i++)
20287     {
20288       ndbout_c("    i: %d %s : lcpId: %d maxGci Completed: %d Started: %d",
20289                i,
20290                (replicaPtr.p->lcpStatus[i] == ZVALID ?"VALID":"INVALID"),
20291                replicaPtr.p->lcpId[i],
20292                replicaPtr.p->maxGciCompleted[i],
20293                replicaPtr.p->maxGciStarted[i]);
20294     }
20295 
20296     for (i = 0; i < 8; i++)
20297     {
20298       ndbout_c("    crashed replica: %d replicaLastGci: %d createGci: %d",
20299                i,
20300                replicaPtr.p->replicaLastGci[i],
20301                replicaPtr.p->createGci[i]);
20302     }
20303   }
20304   ndbout_c("  -- oldStoredReplicas");
20305   replicaPtr.i = fragPtrP->oldStoredReplicas;
20306   for(; replicaPtr.i != RNIL; replicaPtr.i = replicaPtr.p->nextPool)
20307   {
20308     c_replicaRecordPool.getPtr(replicaPtr);
20309     ndbout_c("  node: %d initialGci: %d nextLcp: %d noCrashedReplicas: %d",
20310              replicaPtr.p->procNode,
20311              replicaPtr.p->initialGci,
20312              replicaPtr.p->nextLcp,
20313              replicaPtr.p->noCrashedReplicas);
20314     for(i = 0; i<MAX_LCP_STORED; i++)
20315     {
20316       ndbout_c("    i: %d %s : lcpId: %d maxGci Completed: %d Started: %d",
20317                i,
20318                (replicaPtr.p->lcpStatus[i] == ZVALID ?"VALID":"INVALID"),
20319                replicaPtr.p->lcpId[i],
20320                replicaPtr.p->maxGciCompleted[i],
20321                replicaPtr.p->maxGciStarted[i]);
20322     }
20323 
20324     for (i = 0; i < 8; i++)
20325     {
20326       ndbout_c("    crashed replica: %d replicaLastGci: %d createGci: %d",
20327                i,
20328                replicaPtr.p->replicaLastGci[i],
20329                replicaPtr.p->createGci[i]);
20330     }
20331   }
20332 }
20333 
/**
 * System restart: restart one fragment of one table.
 *
 * Scans forward from (tableId, fragId) to the next fragment of an active,
 * ST_NORMAL table, collects its restorable replicas, sends START_FRAGREQ
 * for them, and then schedules itself via CONTINUEB(ZSTART_FRAGMENT) for
 * the next fragment.  When the table id runs past ctabFileSize the whole
 * scan is done and CONTINUEB(ZCOMPLETE_RESTART) is sent instead.
 *
 * @param signal  signal object, reused for outgoing CONTINUEB
 * @param tableId table to start scanning from
 * @param fragId  fragment within tableId to start scanning from
 */
void Dbdih::startFragment(Signal* signal, Uint32 tableId, Uint32 fragId)
{
  Uint32 TloopCount = 0;
  TabRecordPtr tabPtr;
  while (true) {
    if (TloopCount > 100) {
      jam();
      /* Real-time break: resume the table scan from CONTINUEB.
       * Note: fragId restarts at 0 here (theData[2] = 0). */
      signal->theData[0] = DihContinueB::ZSTART_FRAGMENT;
      signal->theData[1] = tableId;
      signal->theData[2] = 0;
      sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
      return;
    }

    if (tableId >= ctabFileSize) {
      jam();
      /* All tables processed: move on to the complete-restart phase. */
      signal->theData[0] = DihContinueB::ZCOMPLETE_RESTART;
      sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
      return;
    }//if

    tabPtr.i = tableId;
    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
    /* Skip tables that are not active ... */
    if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE){
      jam();
      TloopCount++;
      tableId++;
      fragId = 0;
      continue;
    }

    /* ... and tables without normal (checkpointed) storage. */
    if(tabPtr.p->tabStorage != TabRecord::ST_NORMAL){
      jam();
      TloopCount++;
      tableId++;
      fragId = 0;
      continue;
    }

    jam();
    break;
  }//while

  FragmentstorePtr fragPtr;
  getFragstore(tabPtr.p, fragId, fragPtr);
  /* ----------------------------------------------------------------------- */
  /*     WE NEED TO RESET THE REPLICA DATA STRUCTURES. THIS MEANS THAT WE    */
  /*     MUST REMOVE REPLICAS THAT WAS NOT STARTED AT THE GCI TO RESTORE. WE */
  /*     NEED TO PUT ALL STORED REPLICAS ON THE LIST OF OLD STORED REPLICAS  */
  /*     RESET THE NUMBER OF REPLICAS TO CREATE.                             */
  /* ----------------------------------------------------------------------- */
  cnoOfCreateReplicas = 0;
  /* ----------------------------------------------------------------------- */
  /*     WE WILL NEVER START MORE THAN FOUR FRAGMENT REPLICAS WHATEVER THE   */
  /*     DESIRED REPLICATION IS.                                             */
  /* ----------------------------------------------------------------------- */
  ndbrequire(tabPtr.p->noOfBackups < MAX_REPLICAS);
  /* ----------------------------------------------------------------------- */
  /*     SEARCH FOR STORED REPLICAS THAT CAN BE USED TO RESTART THE SYSTEM.  */
  /* ----------------------------------------------------------------------- */
  searchStoredReplicas(fragPtr);

  if (cnoOfCreateReplicas == 0) {
    /* --------------------------------------------------------------------- */
    /*   THERE WERE NO STORED REPLICAS AVAILABLE THAT CAN SERVE AS REPLICA TO*/
    /*   RESTART THE SYSTEM FROM. IN A LATER RELEASE WE WILL ADD             */
    /*   FUNCTIONALITY TO CHECK IF THERE ARE ANY STANDBY NODES THAT COULD DO */
    /*   THIS TASK INSTEAD IN THIS IMPLEMENTATION WE SIMPLY CRASH THE SYSTEM.*/
    /*   THIS WILL DECREASE THE GCI TO RESTORE WHICH HOPEFULLY WILL MAKE IT  */
    /*   POSSIBLE TO RESTORE THE SYSTEM.                                     */
    /* --------------------------------------------------------------------- */
    char buf[64];
    BaseString::snprintf(buf, sizeof(buf), "table: %d fragment: %d gci: %d",
			 tableId, fragId, SYSFILE->newestRestorableGCI);

    ndbout_c("%s", buf);
    /* Zero-argument overload — presumably dumps all fragments' replica
     * info (declared elsewhere; not visible in this chunk). */
    dump_replica_info();

    progError(__LINE__, NDBD_EXIT_NO_RESTORABLE_REPLICA, buf);
    return;
  }//if

  /* ----------------------------------------------------------------------- */
  /*     WE HAVE CHANGED THE NODE TO BE PRIMARY REPLICA AND THE NODES TO BE  */
  /*     BACKUP NODES. WE MUST UPDATE THIS NODES DATA STRUCTURE SINCE WE     */
  /*     WILL NOT COPY THE TABLE DATA TO OURSELF.                            */
  /* ----------------------------------------------------------------------- */
  updateNodeInfo(fragPtr);
  /* ----------------------------------------------------------------------- */
  /*     NOW WE HAVE COLLECTED ALL THE REPLICAS WE COULD GET. WE WILL NOW    */
  /*     RESTART THE FRAGMENT REPLICAS WE HAVE FOUND IRRESPECTIVE OF IF THERE*/
  /*     ARE ENOUGH ACCORDING TO THE DESIRED REPLICATION.                    */
  /* ----------------------------------------------------------------------- */
  /*     WE START BY SENDING ADD_FRAGREQ FOR THOSE REPLICAS THAT NEED IT.    */
  /* ----------------------------------------------------------------------- */
  /* This loop only validates the createReplicaRecord indexes via
   * ptrCheckGuard; the records themselves are consumed by
   * sendStartFragreq below. */
  CreateReplicaRecordPtr createReplicaPtr;
  for (createReplicaPtr.i = 0;
       createReplicaPtr.i < cnoOfCreateReplicas;
       createReplicaPtr.i++) {
    jam();
    ptrCheckGuard(createReplicaPtr, 4, createReplicaRecord);
  }//for

  sendStartFragreq(signal, tabPtr, fragId);

  /**
   * Don't wait for START_FRAGCONF
   */
  fragId++;
  if (fragId >= tabPtr.p->totalfragments) {
    jam();
    /* Last fragment of this table: advance to the next table. */
    tabPtr.i++;
    fragId = 0;
  }//if
  signal->theData[0] = DihContinueB::ZSTART_FRAGMENT;
  signal->theData[1] = tabPtr.i;
  signal->theData[2] = fragId;
  sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);

  return;
}//Dbdih::startFragmentLab()
20455 
20456 
20457 /*****************************************************************************/
20458 /* **********     COMPLETE RESTART MODULE                        *************/
20459 /*****************************************************************************/
/**
 * All fragments have been handled by the start-fragment scan: broadcast
 * START_RECREQ (via sendLoopMacro, which tracks the outstanding replies
 * in the START_RECREQ counter) to begin the recovery-execution phase.
 * The replies arrive in execSTART_RECCONF.
 */
void Dbdih::completeRestartLab(Signal* signal)
{
  sendLoopMacro(START_RECREQ, sendSTART_RECREQ, RNIL);
}//completeRestartLab()
20464 
20465 /* ------------------------------------------------------------------------- */
20466 //       SYSTEM RESTART:
20467 /*         A NODE HAS COMPLETED RESTORING ALL DATABASE FRAGMENTS.            */
20468 //       NODE RESTART:
20469 //         THE STARTING NODE HAS PREPARED ITS LOG FILES TO ENABLE EXECUTION
20470 //         OF TRANSACTIONS.
20471 // Precondition:
20472 //   This signal is received by the master node for the system restart.
20473 //   This signal is received by the starting node for node restart.
20474 /* ------------------------------------------------------------------------- */
execSTART_RECCONF(Signal * signal)20475 void Dbdih::execSTART_RECCONF(Signal* signal)
20476 {
20477   jamEntry();
20478   Uint32 senderNodeId = signal->theData[0];
20479   Uint32 senderData = signal->theData[1];
20480 
20481   if (senderData != RNIL)
20482   {
20483     jam();
20484     c_performed_copy_phase = true;
20485     /**
20486      * This is normally a node restart, but it could also be second
20487      * phase of a system restart where a node is restored from a more
20488      * alive node, in this case we could even be the master node although
20489      * we arrive here.
20490      */
20491     g_eventLogger->info("Restore Database Off-line Completed");
20492     infoEvent("Restore Database Off-line Completed on node %u",
20493               senderNodeId);
20494 
20495     g_eventLogger->info("Bring Database On-line Starting");
20496     infoEvent("Bring Database On-line Starting on node %u",
20497               senderNodeId);
20498 
20499     /**
20500      * This is node restart
20501      */
20502     Ptr<TakeOverRecord> takeOverPtr;
20503     c_takeOverPool.getPtr(takeOverPtr, senderData);
20504     sendStartTo(signal, takeOverPtr);
20505     return;
20506   }
20507   infoEvent("Restore Database from disk Completed on node %u",
20508             senderNodeId);
20509 
20510   /* No take over record in the system restart case here */
20511   ndbrequire(senderData == RNIL);
20512   /* --------------------------------------------------------------------- */
20513   // This was the system restart case. We set the state indicating that the
20514   // node has completed restoration of all fragments.
20515   /* --------------------------------------------------------------------- */
20516   receiveLoopMacro(START_RECREQ, senderNodeId);
20517 
20518   /**
20519    * Remove each node that has to TO from LCP/LQH
20520    */
20521   Uint32 i = 0;
20522   while ((i = m_to_nodes.find(i + 1)) != NdbNodeBitmask::NotFound)
20523   {
20524     jam();
20525     NodeRecordPtr nodePtr;
20526     nodePtr.i = i;
20527     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
20528     nodePtr.p->copyCompleted = 0;
20529   }
20530 
20531   if (m_to_nodes.get(getOwnNodeId()))
20532   {
20533     /**
20534      * We (master) needs take-over
20535      *   run this directly to avoid strange confusion
20536      */
20537     jam();
20538     c_sr_wait_to = true;
20539   }
20540 
20541   if (!m_to_nodes.isclear() && c_sr_wait_to)
20542   {
20543     jam();
20544 
20545     StartCopyReq* req = (StartCopyReq*)signal->getDataPtrSend();
20546     req->senderRef = reference();
20547     req->senderData = getOwnNodeId();
20548     req->flags = 0; // Note dont wait for LCP
20549 
20550     i = 0;
20551     while ((i = m_to_nodes.find(i + 1)) != NdbNodeBitmask::NotFound)
20552     {
20553       jam();
20554       req->startingNodeId = i;
20555       sendSignal(calcDihBlockRef(i), GSN_START_COPYREQ, signal,
20556                  StartCopyReq::SignalLength, JBB);
20557     }
20558 
20559     char buf[NdbNodeBitmask::TextLength + 1];
20560     infoEvent("Starting take-over of %s", m_to_nodes.getText(buf));
20561     return;
20562   }
20563 
20564   infoEvent("Restore Database from disk Completed");
20565 
20566   signal->theData[0] = reference();
20567   m_sr_nodes.copyto(NdbNodeBitmask::Size, signal->theData+1);
20568 
20569   Uint32 packed_length = m_sr_nodes.getPackedLengthInWords();;
20570   if (ndbd_send_node_bitmask_in_section(
20571       getNodeInfo(refToNode(cntrlblockref)).m_version))
20572   {
20573     LinearSectionPtr lsptr[3];
20574     lsptr[0].p = signal->theData + 1;
20575     lsptr[0].sz = packed_length;
20576     sendSignal(cntrlblockref, GSN_NDB_STARTCONF, signal,
20577                1, JBB);
20578   }
20579   else if (packed_length <= NdbNodeBitmask48::Size)
20580   {
20581     sendSignal(cntrlblockref, GSN_NDB_STARTCONF, signal,
20582                1 + NdbNodeBitmask48::Size, JBB);
20583   }
20584   else
20585   {
20586     ndbabort();
20587   }
20588 }//Dbdih::execSTART_RECCONF()
20589 
copyNodeLab(Signal * signal,Uint32 tableId)20590 void Dbdih::copyNodeLab(Signal* signal, Uint32 tableId)
20591 {
20592   /* ----------------------------------------------------------------------- */
20593   // This code is executed by the master to assist a node restart in receiving
20594   // the data in the master.
20595   /* ----------------------------------------------------------------------- */
20596   Uint32 TloopCount = 0;
20597 
20598   if (!c_nodeStartMaster.activeState) {
20599     jam();
20600     /* --------------------------------------------------------------------- */
20601     // Obviously the node crashed in the middle of its node restart. We will
20602     // stop this process simply by returning after resetting the wait indicator.
20603     // We also need to handle the pausing of LCPs if it was active.
20604     /* ---------------------------------------------------------------------- */
20605     c_nodeStartMaster.wait = ZFALSE;
20606     return;
20607   }//if
20608   TabRecordPtr tabPtr;
20609   tabPtr.i = tableId;
20610   while (tabPtr.i < ctabFileSize) {
20611     ptrAss(tabPtr, tabRecord);
20612     if (tabPtr.p->tabStatus == TabRecord::TS_ACTIVE)
20613     {
20614       /* -------------------------------------------------------------------- */
20615       // The table is defined. We will start by packing the table into pages.
20616       // The tabCopyStatus indicates to the CONTINUEB(ZPACK_TABLE_INTO_PAGES)
20617       // who called it. After packing the table into page(s) it will be sent to
20618       // the starting node by COPY_TABREQ signals. After returning from the
20619       // starting node we will return to this subroutine and continue
20620       // with the next table.
20621       /* -------------------------------------------------------------------- */
20622       if (! (tabPtr.p->tabCopyStatus == TabRecord::CS_IDLE))
20623       {
20624         jam();
20625         signal->theData[0] = DihContinueB::ZCOPY_NODE;
20626         signal->theData[1] = tabPtr.i;
20627         sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
20628                             WaitTableStateChangeMillis, 2);
20629         return;
20630       }
20631       tabPtr.p->tabCopyStatus = TabRecord::CS_COPY_NODE_STATE;
20632       signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
20633       signal->theData[1] = tabPtr.i;
20634       sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
20635       return;
20636     } else {
20637       jam();
20638       if (TloopCount > 100) {
20639 	/* ------------------------------------------------------------------ */
20640 	// Introduce real-time break after looping through 100 not copied tables
20641 	/* ----------------------------------------------------------------- */
20642         jam();
20643         signal->theData[0] = DihContinueB::ZCOPY_NODE;
20644         signal->theData[1] = tabPtr.i + 1;
20645         sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
20646         return;
20647       } else {
20648         jam();
20649         TloopCount++;
20650         tabPtr.i++;
20651       }//if
20652     }//if
20653   }//while
20654   jam();
20655   if (is_lcp_paused())
20656   {
20657     jam();
20658     /**
20659      * Copying is done, we now need to tell the starting node about the
20660      * already completed LQHs and to ensure that the starting node
20661      * verifies that the copy was correct.
20662      */
20663     check_for_pause_action(signal, StartLcpReq::PauseLcpStartSecond);
20664     return;
20665   }
20666   else
20667   {
20668     jam();
20669     dihCopyCompletedLab(signal);
20670     return;
20671   }
20672 }//Dbdih::copyNodeLab()
20673 
tableCopyNodeLab(Signal * signal,TabRecordPtr tabPtr)20674 void Dbdih::tableCopyNodeLab(Signal* signal, TabRecordPtr tabPtr)
20675 {
20676   /* ----------------------------------------------------------------------- */
20677   /*       COPY PAGES READ TO STARTING NODE.                                 */
20678   /* ----------------------------------------------------------------------- */
20679   if (!c_nodeStartMaster.activeState) {
20680     jam();
20681     releaseTabPages(tabPtr.i);
20682     c_nodeStartMaster.wait = ZFALSE;
20683     return;
20684   }//if
20685   NodeRecordPtr copyNodePtr;
20686   PageRecordPtr pagePtr;
20687   copyNodePtr.i = c_nodeStartMaster.startNode;
20688   ptrCheckGuard(copyNodePtr, MAX_NDB_NODES, nodeRecord);
20689 
20690   copyNodePtr.p->activeTabptr = tabPtr.i;
20691   pagePtr.i = tabPtr.p->pageRef[0];
20692   ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
20693 
20694   signal->theData[0] = DihContinueB::ZCOPY_TABLE_NODE;
20695   signal->theData[1] = tabPtr.i;
20696   signal->theData[2] = copyNodePtr.i;
20697   signal->theData[3] = 0;
20698   signal->theData[4] = 0;
20699   signal->theData[5] = pagePtr.p->word[34];
20700   sendSignal(reference(), GSN_CONTINUEB, signal, 6, JBB);
20701 }//Dbdih::tableCopyNodeLab()
20702 
20703 /* ------------------------------------------------------------------------- */
20704 // execCONTINUEB(ZCOPY_TABLE)
20705 // This routine is used to copy the table descriptions from the master to
20706 // other nodes. It is used in the system restart to copy from master to all
20707 // starting nodes.
20708 /* ------------------------------------------------------------------------- */
copyTableLab(Signal * signal,Uint32 tableId)20709 void Dbdih::copyTableLab(Signal* signal, Uint32 tableId)
20710 {
20711   TabRecordPtr tabPtr;
20712   tabPtr.i = tableId;
20713   ptrAss(tabPtr, tabRecord);
20714 
20715   ndbrequire(tabPtr.p->tabCopyStatus == TabRecord::CS_IDLE);
20716   tabPtr.p->tabCopyStatus = TabRecord::CS_SR_PHASE2_READ_TABLE;
20717   signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
20718   signal->theData[1] = tabPtr.i;
20719   sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
20720   return;
20721 }//Dbdih::copyTableLab()
20722 
20723 /* ------------------------------------------------------------------------- */
20724 // execCONTINUEB(ZSR_PHASE2_READ_TABLE)
20725 /* ------------------------------------------------------------------------- */
/**
 * execCONTINUEB(ZSR_PHASE2_READ_TABLE): the table description has been
 * packed into pages; start copying it to the alive nodes, beginning with
 * cfirstAliveNode.  The per-node sending is serialised inside
 * breakCopyTableLab.
 */
void Dbdih::srPhase2ReadTableLab(Signal* signal, TabRecordPtr tabPtr)
{
  /* ----------------------------------------------------------------------- */
  // We set the sendCOPY_TABREQState to ZACTIVE for all nodes since it is a long
  // process to send off all table descriptions. Thus we ensure that we do
  // not encounter race conditions where one node is completed before the
  // sending process is completed. This could lead to that we start off the
  // system before we actually finished all copying of table descriptions
  // and could lead to strange errors.
  /* ----------------------------------------------------------------------- */

  // NOTE(review): the macro described above is disabled; the COPY_TABREQ
  // counter is instead armed per node inside breakCopyTableLab/copyTableNode.
  //sendLoopMacro(COPY_TABREQ, nullRoutine);

  breakCopyTableLab(signal, tabPtr, cfirstAliveNode);
  return;
}//Dbdih::srPhase2ReadTableLab()
20742 
20743 /* ------------------------------------------------------------------------- */
20744 /*       COPY PAGES READ TO ALL NODES.                                       */
20745 /* ------------------------------------------------------------------------- */
breakCopyTableLab(Signal * signal,TabRecordPtr tabPtr,Uint32 nodeId)20746 void Dbdih::breakCopyTableLab(Signal* signal, TabRecordPtr tabPtr, Uint32 nodeId)
20747 {
20748   NodeRecordPtr nodePtr;
20749   nodePtr.i = nodeId;
20750   while (nodePtr.i != RNIL) {
20751     jam();
20752     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
20753     if (nodePtr.i == getOwnNodeId()){
20754       jam();
20755       /* ------------------------------------------------------------------- */
20756       /* NOT NECESSARY TO COPY TO MY OWN NODE. I ALREADY HAVE THE PAGES.     */
20757       /* I DO HOWEVER NEED TO STORE THE TABLE DESCRIPTION ONTO DISK.         */
20758       /* ------------------------------------------------------------------- */
20759       /* IF WE ARE MASTER WE ONLY NEED TO SAVE THE TABLE ON DISK. WE ALREADY */
20760       /* HAVE THE TABLE DESCRIPTION IN THE DATA STRUCTURES.                  */
20761       // AFTER COMPLETING THE WRITE TO DISK THE MASTER WILL ALSO SEND
20762       // COPY_TABCONF AS ALL THE OTHER NODES.
20763       /* ------------------------------------------------------------------- */
20764       c_COPY_TABREQ_Counter.setWaitingFor(nodePtr.i);
20765       tabPtr.p->tabUpdateState = TabRecord::US_COPY_TAB_REQ;
20766       signal->theData[0] = DihContinueB::ZTABLE_UPDATE;
20767       signal->theData[1] = tabPtr.i;
20768       sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
20769       nodePtr.i = nodePtr.p->nextNode;
20770     } else {
20771       PageRecordPtr pagePtr;
20772       /* -------------------------------------------------------------------- */
20773       // RATHER THAN SENDING ALL COPY_TABREQ IN PARALLEL WE WILL SERIALISE THIS
20774       // ACTIVITY AND WILL THUS CALL breakCopyTableLab AGAIN WHEN COMPLETED THE
20775       // SENDING OF COPY_TABREQ'S.
20776       /* -------------------------------------------------------------------- */
20777       jam();
20778       tabPtr.p->tabCopyStatus = TabRecord::CS_SR_PHASE3_COPY_TABLE;
20779       pagePtr.i = tabPtr.p->pageRef[0];
20780       ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
20781       signal->theData[0] = DihContinueB::ZCOPY_TABLE_NODE;
20782       signal->theData[1] = tabPtr.i;
20783       signal->theData[2] = nodePtr.i;
20784       signal->theData[3] = 0;
20785       signal->theData[4] = 0;
20786       signal->theData[5] = pagePtr.p->word[34];
20787       sendSignal(reference(), GSN_CONTINUEB, signal, 6, JBB);
20788       return;
20789     }//if
20790   }//while
20791   /* ----------------------------------------------------------------------- */
20792   /*    WE HAVE NOW SENT THE TABLE PAGES TO ALL NODES. EXIT AND WAIT FOR ALL */
20793   /*    REPLIES.                                                             */
20794   /* ----------------------------------------------------------------------- */
20795   return;
20796 }//Dbdih::breakCopyTableLab()
20797 
20798 /* ------------------------------------------------------------------------- */
20799 // execCONTINUEB(ZCOPY_TABLE_NODE)
20800 /* ------------------------------------------------------------------------- */
/**
 * execCONTINUEB(ZCOPY_TABLE_NODE): stream the packed table description to
 * one node, 16 words per COPY_TABREQ, at most 16 signals per invocation
 * (then a CONTINUEB real-time break re-enters with the saved position).
 *
 * @param ctn     copy cursor: table, current page/word index, words left
 * @param nodePtr destination node
 */
void Dbdih::copyTableNode(Signal* signal,
			  CopyTableNode* ctn, NodeRecordPtr nodePtr)
{
  if (getNodeState().startLevel >= NodeState::SL_STARTED){
    /* --------------------------------------------------------------------- */
    // We are in the process of performing a node restart and are copying a
    // table description to a starting node. We will check that no nodes have
    // crashed in this process.
    /* --------------------------------------------------------------------- */
    if (!c_nodeStartMaster.activeState) {
      jam();
      /** ------------------------------------------------------------------
       * The starting node crashed. We will release table pages and stop this
       * copy process and allow new node restarts to start.
       * ------------------------------------------------------------------ */
      releaseTabPages(ctn->ctnTabPtr.i);
      c_nodeStartMaster.wait = ZFALSE;
      return;
    }//if
  }//if
  ndbrequire(ctn->pageIndex < NDB_ARRAY_SIZE(ctn->ctnTabPtr.p->pageRef));
  ctn->ctnPageptr.i = ctn->ctnTabPtr.p->pageRef[ctn->pageIndex];
  ptrCheckGuard(ctn->ctnPageptr, cpageFileSize, pageRecord);
  /**
   * If first page & firstWord reqinfo = 1 (first signal)
   */
  Uint32 reqinfo = (ctn->pageIndex == 0) && (ctn->wordIndex == 0);
  if(reqinfo == 1){
    /* First signal for this node: arm the reply counter. */
    c_COPY_TABREQ_Counter.setWaitingFor(nodePtr.i);
  }

  /* Send up to 16 COPY_TABREQ (16 words each) before taking a break. */
  for (Uint32 i = 0; i < 16; i++) {
    jam();
    sendCopyTable(signal, ctn, calcDihBlockRef(nodePtr.i), reqinfo);
    reqinfo = 0;
    if (ctn->noOfWords <= 16) {
      /* That was the last chunk for this table/node. */
      jam();
      switch (ctn->ctnTabPtr.p->tabCopyStatus) {
      case TabRecord::CS_SR_PHASE3_COPY_TABLE:
	/* ------------------------------------------------------------------ */
	// We have copied the table description to this node.
	// We will now proceed
	// with sending the table description to the next node in the node list.
	/* ------------------------------------------------------------------ */
        jam();
        ctn->ctnTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
        breakCopyTableLab(signal, ctn->ctnTabPtr, nodePtr.p->nextNode);
        return;
        break;
      case TabRecord::CS_COPY_NODE_STATE:
        /* Node-restart copy: done with this table; execCOPY_TABCONF
         * drives the next table. */
        jam();
        ctn->ctnTabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
        return;
        break;
      default:
        ndbabort();
      }//switch
    } else {
      jam();
      /* Advance the cursor; pages hold 2048 words each. */
      ctn->wordIndex += 16;
      if (ctn->wordIndex == 2048) {
        jam();
        ctn->wordIndex = 0;
        ctn->pageIndex++;
        ndbrequire(ctn->pageIndex < NDB_ARRAY_SIZE(ctn->ctnTabPtr.p->pageRef));
        ctn->ctnPageptr.i = ctn->ctnTabPtr.p->pageRef[ctn->pageIndex];
        ptrCheckGuard(ctn->ctnPageptr, cpageFileSize, pageRecord);
      }//if
      ctn->noOfWords -= 16;
    }//if
  }//for
  /* Real-time break: persist the cursor in the CONTINUEB payload. */
  signal->theData[0] = DihContinueB::ZCOPY_TABLE_NODE;
  signal->theData[1] = ctn->ctnTabPtr.i;
  signal->theData[2] = nodePtr.i;
  signal->theData[3] = ctn->pageIndex;
  signal->theData[4] = ctn->wordIndex;
  signal->theData[5] = ctn->noOfWords;
  sendSignal(reference(), GSN_CONTINUEB, signal, 6, JBB);
}//Dbdih::copyTableNode()
20880 
sendCopyTable(Signal * signal,CopyTableNode * ctn,BlockReference ref,Uint32 reqinfo)20881 void Dbdih::sendCopyTable(Signal* signal, CopyTableNode* ctn,
20882                           BlockReference ref, Uint32 reqinfo)
20883 {
20884   CopyTabReq *req = (CopyTabReq*) signal->getDataPtrSend();
20885   req->senderRef = reference();
20886   req->reqinfo = reqinfo;
20887   req->tableId = ctn->ctnTabPtr.i;
20888   req->tableSchemaVersion = ctn->ctnTabPtr.p->schemaVersion;
20889   req->noOfWords = ctn->noOfWords;
20890   ndbrequire(ctn->wordIndex + 15 < 2048);
20891   MEMCOPY_NO_WORDS(&req->tableWords[0],
20892                    &ctn->ctnPageptr.p->word[ctn->wordIndex],
20893                    16);
20894   Uint32 sig_len = CopyTabReq::SignalLength;
20895   if (reqinfo == 1)
20896   {
20897     if (ctn->ctnTabPtr.p->tabLcpStatus == TabRecord::TLS_ACTIVE)
20898     {
20899       jam();
20900       req->tabLcpStatus = CopyTabReq::LcpActive;
20901     }
20902     else
20903     {
20904       jam();
20905       /**
20906        * The state TLS_WRITING_TO_FILE means that the LCP is completed from the
20907        * viewpoint of the new starting node since it will start by writing the
20908        * table description to disk.
20909        */
20910       req->tabLcpStatus = CopyTabReq::LcpCompleted;
20911     }
20912     req->currentLcpId = SYSFILE->latestLCP_ID;
20913     sig_len = CopyTabReq::SignalLengthExtra;
20914   }
20915   sendSignal(ref, GSN_COPY_TABREQ, signal, sig_len, JBB);
20916 }//Dbdih::sendCopyTable()
20917 
/**
 * COPY_TABCONF handler: a node has acknowledged a table description sent
 * via COPY_TABREQ.  Node restart (SL_STARTED): move on to the next table
 * for the single starting node.  System restart: wait until all nodes
 * have confirmed (receiveLoopMacro returns early otherwise), then trigger
 * ADD_FRAGREQ processing for the restored table.
 */
void Dbdih::execCOPY_TABCONF(Signal* signal)
{
  CopyTabConf *conf = (CopyTabConf*) &signal->theData[0];
  jamEntry();
  Uint32 nodeId = conf->nodeId;
  Uint32 tableId = conf->tableId;
  if (getNodeState().startLevel >= NodeState::SL_STARTED){
    /* --------------------------------------------------------------------- */
    // We are in the process of performing a node restart. Continue by copying
    // the next table to the starting node.
    /* --------------------------------------------------------------------- */
    jam();
    ndbrequire(nodeId == c_nodeStartMaster.startNode);
    c_COPY_TABREQ_Counter.clearWaitingFor(nodeId);

    releaseTabPages(tableId);
    signal->theData[0] = DihContinueB::ZCOPY_NODE;
    signal->theData[1] = tableId + 1;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    return;
  } else {
    /* --------------------------------------------------------------------- */
    // We are in the process of performing a system restart. Check if all nodes
    // have saved the new table description to file and then continue with the
    // next table.
    /* --------------------------------------------------------------------- */
    // NOTE: receiveLoopMacro returns from this function unless nodeId was
    // the last outstanding node in the COPY_TABREQ counter.
    receiveLoopMacro(COPY_TABREQ, nodeId);
    /* --------------------------------------------------------------------- */
    /*   WE HAVE NOW COPIED TO ALL NODES. WE HAVE NOW COMPLETED RESTORING    */
    /*   THIS TABLE. CONTINUE WITH THE NEXT TABLE.                           */
    /*   WE NEED TO RELEASE THE PAGES IN THE TABLE IN THIS NODE HERE.        */
    /*   WE ALSO NEED TO CLOSE THE TABLE FILE.                               */
    /* --------------------------------------------------------------------- */
    releaseTabPages(tableId);

    TabRecordPtr tabPtr;
    tabPtr.i = tableId;
    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

    ConnectRecordPtr connectPtr;
    connectPtr.i = tabPtr.p->connectrec;
    ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);

    /**
     * No need to protect this as it happens during recovery when DBTC isn't
     * acting on the tables yet. Also given that fragId is 0 we are sure that
     * this will only result in ADD_FRAGREQ being sent.
     */
    connectPtr.p->m_alter.m_totalfragments = tabPtr.p->totalfragments;
    D("6: totalfragments = " << tabPtr.p->totalfragments);
    sendAddFragreq(signal, connectPtr, tabPtr, 0, false);
    return;
  }//if
}//Dbdih::execCOPY_TABCONF()
20972 
20973 /*
20974   3.13   L O C A L   C H E C K P O I N T  (M A S T E R)
20975   ****************************************************
20976   */
20977 /*****************************************************************************/
20978 /* **********     LOCAL-CHECK-POINT-HANDLING MODULE              *************/
20979 /*****************************************************************************/
20980 /* ------------------------------------------------------------------------- */
20981 /*       IT IS TIME TO CHECK IF IT IS TIME TO START A LOCAL CHECKPOINT.      */
20982 /*       WE WILL EITHER START AFTER 1 MILLION WORDS HAVE ARRIVED OR WE WILL  */
20983 /*       EXECUTE AFTER ABOUT 16 MINUTES HAVE PASSED BY.                      */
20984 /* ------------------------------------------------------------------------- */
checkTcCounterLab(Signal * signal)20985 void Dbdih::checkTcCounterLab(Signal* signal)
20986 {
20987   CRASH_INSERTION(7009);
20988   if (c_lcpState.lcpStatus != LCP_STATUS_IDLE) {
20989     g_eventLogger->error("lcpStatus = %u"
20990                          "lcpStatusUpdatedPlace = %d",
20991                          (Uint32) c_lcpState.lcpStatus,
20992                          c_lcpState.lcpStatusUpdatedPlace);
20993     ndbabort();
20994     return;
20995   }//if
20996   add_lcp_counter(&c_lcpState.ctimer, 32);
20997   if (c_lcpState.lcpStopGcp >= c_newest_restorable_gci) {
20998     jam();
20999     /* --------------------------------------------------------------------- */
21000     // We block LCP start if we have not completed one global checkpoints
21001     // before starting another local checkpoint.
21002     /* --------------------------------------------------------------------- */
21003     c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
21004     checkLcpStart(signal, __LINE__, 100);
21005     return;
21006   }//if
21007   c_lcpState.setLcpStatus(LCP_TCGET, __LINE__);
21008 
21009   c_lcpState.ctcCounter = c_lcpState.ctimer;
21010   sendLoopMacro(TCGETOPSIZEREQ, sendTCGETOPSIZEREQ, RNIL);
21011 }//Dbdih::checkTcCounterLab()
21012 
checkLcpStart(Signal * signal,Uint32 lineNo,Uint32 delay)21013 void Dbdih::checkLcpStart(Signal* signal, Uint32 lineNo, Uint32 delay)
21014 {
21015   /* ----------------------------------------------------------------------- */
21016   // Verify that we are not attempting to start another instance of the LCP
21017   // when it is not alright to do so.
21018   /* ----------------------------------------------------------------------- */
21019   c_lcpState.lcpStart = ZACTIVE;
21020   signal->theData[0] = DihContinueB::ZCHECK_TC_COUNTER;
21021   signal->theData[1] = lineNo;
21022   if (delay == 0)
21023   {
21024     jam();
21025     sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
21026   }
21027   else
21028   {
21029     jam();
21030     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, delay, 2);
21031   }
21032 }//Dbdih::checkLcpStart()
21033 
21034 /**
21035  * It is possible that the previous LCP is fully completed by the master
21036  * node. Still we could be waiting for some delayed LCP_COMPLETE_REP(LQH)
21037  * signals from non-master nodes. To ensure that those signals are not
21038  * arriving when a new LCP has started we delay responding to this signal
21039  * until we have reached the LCP idle state.
21040  */
void Dbdih::execCHECK_LCP_IDLE_ORD(Signal *signal)
{
  jamEntry();
  if (c_lcpState.lcpStatus == LCP_STATUS_IDLE ||
      c_lcpState.lcpStatus == LCP_TCGET)
  {
    jam();
    // LCP is idle (or only gathering TC op sizes): answer the waiter.
    // theData[2] carries the requester's block reference; the first two
    // signal words are passed back unchanged as the CONF payload.
    BlockReference ref = signal->theData[2];
    sendSignal(ref, GSN_TCGETOPSIZECONF, signal, 2, JBB);
    return;
  }
  jam();
  // Previous LCP not fully wound down yet: retry the same 3-word signal
  // to ourselves after 10 ms until the idle state is reached.
  DEB_LCP(("Delay LCP start, state = %u", c_lcpState.lcpStatus));
  sendSignalWithDelay(reference(), GSN_CHECK_LCP_IDLE_ORD, signal, 10, 3);
}
21056 
21057 /* ------------------------------------------------------------------------- */
21058 /*TCGETOPSIZECONF          HOW MUCH OPERATION SIZE HAVE BEEN EXECUTED BY TC  */
21059 /* ------------------------------------------------------------------------- */
void Dbdih::execTCGETOPSIZECONF(Signal* signal)
{
  jamEntry();
  Uint32 senderNodeId = signal->theData[0];
  // Fold this TC's reported operation size into the LCP activity counter.
  add_lcp_counter(&c_lcpState.ctcCounter, signal->theData[1]);

  // NOTE(review): receiveLoopMacro presumably returns early until every
  // participating node has replied; only the final TCGETOPSIZECONF falls
  // through (matches "ALL TC'S HAVE RESPONDED NOW" below) -- confirm
  // against the macro definition.
  receiveLoopMacro(TCGETOPSIZEREQ, senderNodeId);

  ndbrequire(c_lcpState.lcpStatus == LCP_TCGET);
  ndbrequire(c_lcpState.lcpStart == ZACTIVE);
  /* ----------------------------------------------------------------------- */
  // We are not actively starting another LCP, still we receive this signal.
  // This is not ok.
  /* ---------------------------------------------------------------------- */
  /*    ALL TC'S HAVE RESPONDED NOW. NOW WE WILL CHECK IF ENOUGH OPERATIONS */
  /*    HAVE EXECUTED TO ENABLE US TO START A NEW LOCAL CHECKPOINT.         */
  /*    WHILE COPYING DICTIONARY AND DISTRIBUTION INFO TO A STARTING NODE   */
  /*    WE WILL ALSO NOT ALLOW THE LOCAL CHECKPOINT TO PROCEED.             */
  /*----------------------------------------------------------------------- */
  if (c_lcpState.immediateLcpStart == false)
  {
    // Start a new LCP only once at least 2^clcpDelay (the configured
    // threshold exponent) units of activity have accumulated.
    Uint64 cnt = Uint64(c_lcpState.ctcCounter);
    Uint64 limit = Uint64(1) << c_lcpState.clcpDelay;
    bool dostart = cnt >= limit;
    if (dostart == false)
    {
      jam();
      // Not enough activity yet: go back to idle, re-check in 1 second.
      c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
      checkLcpStart(signal, __LINE__, 1000);
      return;
    }//if

    /**
     * Check if we have reason to stall the start of the LCP due to
     * outstanding node restarts that are reasonably close to
     * need a LCP to complete or to need a point in time where there
     * are no LCPs ongoing.
     */
    if (check_stall_lcp_start())
    {
      c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
      checkLcpStart(signal, __LINE__, 3000);
      return;
    }
  }

  if (unlikely(c_lcpState.lcpManualStallStart))
  {
    jam();
    // LCP start was manually stalled (operator action); log and retry
    // in 3 seconds.
    g_eventLogger->warning("LCP start triggered, but manually stalled (Immediate %u, Change %llu / %llu)",
                           c_lcpState.immediateLcpStart,
                           Uint64(c_lcpState.ctcCounter),
                           (Uint64(1) << c_lcpState.clcpDelay));
    c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
    checkLcpStart(signal, __LINE__, 3000);
    return;
  }

  // Committed to starting an LCP now.
  c_lcpState.lcpStart = ZIDLE;
  c_lcpState.immediateLcpStart = false;
  /* -----------------------------------------------------------------------
   * Now the initial lcp is started,
   * we can reset the delay to its original value
   * --------------------------------------------------------------------- */
  CRASH_INSERTION(7010);
  /* ----------------------------------------------------------------------- */
  /*     IF MORE THAN 1 MILLION WORDS PASSED THROUGH THE TC'S THEN WE WILL   */
  /*     START A NEW LOCAL CHECKPOINT. CLEAR CTIMER. START CHECKPOINT        */
  /*     ACTIVITY BY CALCULATING THE KEEP GLOBAL CHECKPOINT.                 */
  // Also remember the current global checkpoint to ensure that we run at least
  // one global checkpoints between each local checkpoint that we start up.
  /* ----------------------------------------------------------------------- */
  c_lcpState.ctimer = 0;
  // Keep GCI is the GCI part (upper 32 bits) of the previous micro-GCP.
  c_lcpState.keepGci = (Uint32)(m_micro_gcp.m_old_gci >> 32);
  c_lcpState.oldestRestorableGci = SYSFILE->oldestRestorableGCI;

  CRASH_INSERTION(7014);
  // Next step: collect the "close operation size" from all TCs.
  c_lcpState.setLcpStatus(LCP_TC_CLOPSIZE, __LINE__);
  sendLoopMacro(TC_CLOPSIZEREQ, sendTC_CLOPSIZEREQ, RNIL);
}
21140 
void Dbdih::execTC_CLOPSIZECONF(Signal* signal)
{
  jamEntry();
  Uint32 senderNodeId = signal->theData[0];
  // NOTE(review): presumably returns early until every participating TC
  // has replied; only the last TC_CLOPSIZECONF continues below.
  receiveLoopMacro(TC_CLOPSIZEREQ, senderNodeId);

  ndbrequire(c_lcpState.lcpStatus == LCP_TC_CLOPSIZE);

  /* ----------------------------------------------------------------------- */
  /*       UPDATE THE NEW LATEST LOCAL CHECKPOINT ID.                        */
  /* ----------------------------------------------------------------------- */
  // Reset the active-table counter for the coming keep-GCI scan and move
  // to waiting for the fragment-info mutex.
  cnoOfActiveTables = 0;
  c_lcpState.setLcpStatus(LCP_WAIT_MUTEX, __LINE__);
  // Sanity: oldestRestorableGci must be positive, also when viewed as a
  // signed 32-bit value.
  ndbrequire(((int)c_lcpState.oldestRestorableGci) > 0);

  if (ERROR_INSERTED(7011)) {
    // Test hook 7011: abandon the LCP here and just emit the event.
    signal->theData[0] = NDB_LE_LCPStoppedInCalcKeepGci;
    signal->theData[1] = 0;
    sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);
    return;
  }//if
  start_lcp_before_mutex(signal);
}
21164 
start_lcp_before_mutex(Signal * signal)21165 void Dbdih::start_lcp_before_mutex(Signal *signal)
21166 {
21167   /**
21168    * We lock the Fragment Info for at least a short time. This ensures
21169    * that we don't start an LCP while we are copying meta data. If we
21170    * support PAUSE LCP protocol we can later release the mutex early
21171    * on.
21172    */
21173   jam();
21174   Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
21175   Callback c = { safe_cast(&Dbdih::lcpFragmentMutex_locked), 0 };
21176   ndbrequire(mutex.trylock(c, false));
21177 }
21178 
void
Dbdih::lcpFragmentMutex_locked(Signal* signal,
                               Uint32 senderData,
                               Uint32 retVal)
{
  jamEntry();

  if (retVal == UtilLockRef::LockAlreadyHeld)
  {
    jam();
    // Someone else holds the fragment-info mutex (e.g. meta data copy
    // for a node restart). Drop our pending request and retry later.
    Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
    mutex.release();

    if (senderData == 0)
    {
      jam();
      // First failed attempt: report once that the LCP is blocked.
      infoEvent("Local checkpoint blocked waiting for node-restart");
    }
    // 2* is as parameter is in seconds, and we sendSignalWithDelay 500ms
    if (senderData >= 2*c_lcpState.m_lcp_trylock_timeout)
    {
      jam();
      // Polled long enough: switch from trylock polling to a blocking
      // lock request that is granted once the holder releases.
      Callback c = { safe_cast(&Dbdih::lcpFragmentMutex_locked), 0 };
      ndbrequire(mutex.lock(c, false));
      return;
    }
    // Retry via CONTINUEB after 500 ms; senderData counts the attempts.
    signal->theData[0] = DihContinueB::ZLCP_TRY_LOCK;
    signal->theData[1] = senderData + 1;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 500, 2);
    return;
  }

  // Mutex acquired: proceed with the LCP proper.
  ndbrequire(retVal == 0);
  start_lcp(signal);
}
21214 
start_lcp(Signal * signal)21215 void Dbdih::start_lcp(Signal *signal)
21216 {
21217   c_lcpState.m_start_time = c_current_time = NdbTick_getCurrentTicks();
21218 
21219   setLcpActiveStatusStart(signal);
21220 
21221   c_lcpState.setLcpStatus(LCP_CALCULATE_KEEP_GCI, __LINE__);
21222   c_lcpState.keepGci = m_micro_gcp.m_old_gci >> 32;
21223   c_lcpState.oldestRestorableGci = SYSFILE->oldestRestorableGCI;
21224   SYSFILE->latestLCP_ID++;
21225 
21226   signal->theData[0] = DihContinueB::ZCALCULATE_KEEP_GCI;
21227   signal->theData[1] = 0;  /* TABLE ID = 0          */
21228   signal->theData[2] = 0;  /* FRAGMENT ID = 0       */
21229   sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
21230   return;
21231 }
21232 
21233 /* ------------------------------------------------------------------------- */
21234 /*       WE NEED TO CALCULATE THE OLDEST GLOBAL CHECKPOINT THAT WILL BE      */
21235 /*       COMPLETELY RESTORABLE AFTER EXECUTING THIS LOCAL CHECKPOINT.        */
21236 /* ------------------------------------------------------------------------- */
void Dbdih::calculateKeepGciLab(Signal* signal, Uint32 tableId, Uint32 fragId)
{
  TabRecordPtr tabPtr;
  Uint32 TloopCount = 1;
  tabPtr.i = tableId;
  // Locate the next active, normally-stored table at or after tableId.
  // The loop exits with TloopCount == 0 when such a table is found; it
  // also bounds the work done per round to keep real-time behaviour.
  do {
    if (tabPtr.i >= ctabFileSize) {
      // Scanned past the last table: the keep-GCI calculation is done.
      if (cnoOfActiveTables > 0) {
        jam();
        signal->theData[0] = DihContinueB::ZSTORE_NEW_LCP_ID;
        sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
        return;
      } else {
        jam();
        /* ------------------------------------------------------------------ */
        /* THERE ARE NO TABLES TO CHECKPOINT. WE STOP THE CHECKPOINT ALREADY  */
        /* HERE TO AVOID STRANGE PROBLEMS LATER.                              */
        /* ------------------------------------------------------------------ */
        c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
        checkLcpStart(signal, __LINE__, 1000);
        return;
      }//if
    }//if
    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
    if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE ||
        tabPtr.p->tabStorage != TabRecord::ST_NORMAL) {
      // Table does not take part in LCPs; skip it. After skipping 100
      // tables in one round, continue via CONTINUEB to avoid hogging
      // the scheduler.
      if (TloopCount > 100) {
        jam();
        signal->theData[0] = DihContinueB::ZCALCULATE_KEEP_GCI;
        signal->theData[1] = tabPtr.i + 1;
        signal->theData[2] = 0;
        sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
        return;
      } else {
        jam();
        TloopCount++;
        tabPtr.i++;
      }//if
    } else {
      jam();
      TloopCount = 0;
    }//if
  } while (TloopCount != 0);
  cnoOfActiveTables++;
  // Fold this fragment's replicas (both current and old stored) into
  // the keep-GCI computation.
  FragmentstorePtr fragPtr;
  getFragstore(tabPtr.p, fragId, fragPtr);
  checkKeepGci(tabPtr, fragId, fragPtr.p, fragPtr.p->storedReplicas);
  checkKeepGci(tabPtr, fragId, fragPtr.p, fragPtr.p->oldStoredReplicas);
  // Advance to the next fragment (or first fragment of the next table)
  // and continue in a fresh CONTINUEB round.
  fragId++;
  if (fragId >= tabPtr.p->totalfragments) {
    jam();
    tabPtr.i++;
    fragId = 0;
  }//if
  signal->theData[0] = DihContinueB::ZCALCULATE_KEEP_GCI;
  signal->theData[1] = tabPtr.i;
  signal->theData[2] = fragId;
  sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
  return;
}//Dbdih::calculateKeepGciLab()
21297 
21298 /* ------------------------------------------------------------------------- */
21299 /*       WE NEED TO STORE ON DISK THE FACT THAT WE ARE STARTING THIS LOCAL   */
21300 /*       CHECKPOINT ROUND. THIS WILL INVALIDATE ALL THE LOCAL CHECKPOINTS    */
21301 /*       THAT WILL EVENTUALLY BE OVERWRITTEN AS PART OF THIS LOCAL CHECKPOINT*/
21302 /* ------------------------------------------------------------------------- */
void Dbdih::storeNewLcpIdLab(Signal* signal)
{
  signal->theData[0] = NDB_LE_LocalCheckpointStarted; //Event type
  signal->theData[1] = SYSFILE->latestLCP_ID;
  signal->theData[2] = c_lcpState.keepGci;
  signal->theData[3] = c_lcpState.oldestRestorableGci;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 4, JBB);

  /***************************************************************************/
  // Report the event that a local checkpoint has started.
  /***************************************************************************/

  signal->setTrace(TestOrd::TraceLocalCheckpoint);

  CRASH_INSERTION(7013);
  // Persist the GCI bounds computed for this LCP into the system file.
  SYSFILE->keepGCI = c_lcpState.keepGci;

  DEB_LCP(("Set SYSFILE->keepGCI = %u", SYSFILE->keepGCI));

  SYSFILE->oldestRestorableGCI = c_lcpState.oldestRestorableGci;

  const Uint32 oldestRestorableGCI = SYSFILE->oldestRestorableGCI;

  // Sanity: the oldest restorable GCI must remain positive, also when
  // interpreted as a signed 32-bit value.
  Int32 val = oldestRestorableGCI;
  ndbrequire(val > 0);

  /* ----------------------------------------------------------------------- */
  /* SET BIT INDICATING THAT LOCAL CHECKPOINT IS ONGOING. THIS IS CLEARED    */
  /* AT THE END OF A LOCAL CHECKPOINT.                                       */
  /* ----------------------------------------------------------------------- */
  SYSFILE->setLCPOngoing(SYSFILE->systemRestartBits);
  /* ---------------------------------------------------------------------- */
  /*    CHECK IF ANY NODE MUST BE TAKEN OUT OF SERVICE AND REFILLED WITH    */
  /*    NEW FRESH DATA FROM AN ACTIVE NODE.                                 */
  /* ---------------------------------------------------------------------- */

  /**
   * This used be done in setLcpActiveStatusStart
   *   but this function has been move "up" in the flow
   *   to just before calcKeepGci
   */
  setNodeRestartInfoBits(signal);

  // Persist the updated system file to all nodes (COPY_GCI with reason
  // LOCAL_CHECKPOINT) before continuing the LCP.
  c_lcpState.setLcpStatus(LCP_COPY_GCI, __LINE__);
  //#ifdef VM_TRACE
  //  infoEvent("LocalCheckpoint %d started", SYSFILE->latestLCP_ID);
  //  signal->theData[0] = 7012;
  //  execDUMP_STATE_ORD(signal);
  //#endif

  copyGciLab(signal, CopyGCIReq::LOCAL_CHECKPOINT);
}//Dbdih::storeNewLcpIdLab()
21355 
startLcpRoundLab(Signal * signal)21356 void Dbdih::startLcpRoundLab(Signal* signal)
21357 {
21358   jam();
21359 
21360   CRASH_INSERTION(7218);
21361 
21362   /**
21363    * Next step in starting up a local checkpoint is to define which
21364    * tables that should participate in the local checkpoint, while
21365    * we are performing this step we don't want to have committing
21366    * schema transactions in the middle of this, this mutex ensures
21367    * that we will wait for a schema transaction to commit before we
21368    * proceed and once we acquired the mutex, then schema transaction
21369    * commits will block waiting for this LCP phase to complete.
21370    *
21371    * The reason we need this mutex is to ensure that all nodes that
21372    * participate in the LCP have the same view on the tables involved
21373    * in the LCP. This makes it possible for a node to easily take
21374    * over the master role in executing a LCP if the master node that
21375    * controls the LCP fails.
21376    */
21377   Mutex mutex(signal, c_mutexMgr, c_startLcpMutexHandle);
21378   Callback c = { safe_cast(&Dbdih::startLcpMutex_locked), 0 };
21379   ndbrequire(mutex.lock(c));
21380 }
21381 
21382 void
startLcpMutex_locked(Signal * signal,Uint32 senderData,Uint32 retVal)21383 Dbdih::startLcpMutex_locked(Signal* signal, Uint32 senderData, Uint32 retVal){
21384   jamEntry();
21385   ndbrequire(retVal == 0);
21386 
21387   StartLcpReq* req = (StartLcpReq*)signal->getDataPtrSend();
21388   req->senderRef = reference();
21389   req->lcpId = SYSFILE->latestLCP_ID;
21390   req->pauseStart = StartLcpReq::NormalLcpStart; /* Normal LCP start */
21391   /**
21392    * Handle bitmasks in send-method below based on who the receiver is.
21393    */
21394   sendLoopMacro(START_LCP_REQ, sendSTART_LCP_REQ, RNIL);
21395 }
21396 
void
Dbdih::sendSTART_LCP_REQ(Signal* signal, Uint32 nodeId, Uint32 extra)
{
  /**
   * Send START_LCP_REQ to one participant. The participating-LQH and
   * participating-DIH bitmasks travel either as signal sections (when
   * the receiver's version supports that) or inline in the legacy
   * 48-bit _v1 fields (only possible when both masks fit).
   */
  BlockReference ref = calcDihBlockRef(nodeId);
  Uint32 packed_length1 = c_lcpState.m_participatingLQH.getPackedLengthInWords();
  Uint32 packed_length2 = c_lcpState.m_participatingDIH.getPackedLengthInWords();
  Uint32 participatingLQH[NdbNodeBitmask::Size];
  Uint32 participatingDIH[NdbNodeBitmask::Size];

  if (ERROR_INSERTED(7021) && nodeId == getOwnNodeId())
  {
    // Error insert 7021: delay the request to ourselves by 500 ms.
    if (ndbd_send_node_bitmask_in_section(getNodeInfo(nodeId).m_version))
    {
      jam();
      SectionHandle handle(this);
      LinearSectionPtr lsptr[3];
      // Copy to local arrays so the section data stays valid for import.
      c_lcpState.m_participatingLQH.copyto(NdbNodeBitmask::Size, participatingLQH);
      c_lcpState.m_participatingDIH.copyto(NdbNodeBitmask::Size, participatingDIH);

      lsptr[0].p = participatingLQH;
      lsptr[0].sz = packed_length1;
      lsptr[1].p = participatingDIH;
      lsptr[1].sz = packed_length2;

      // NOTE(review): only section 0 (the LQH mask) is imported and
      // attached here (m_cnt = 1), while the normal path below sends
      // two sections -- confirm this is intended for the error insert.
      import(handle.m_ptr[0]  , lsptr[0].p, lsptr[0].sz);
      handle.m_cnt = 1;

      sendSignalWithDelay(ref, GSN_START_LCP_REQ, signal, 500,
                        StartLcpReq::SignalLength, &handle);
    }
    else if ((packed_length1 <= NdbNodeBitmask48::Size) &&
             (packed_length2 <= NdbNodeBitmask48::Size))
    {
      jam();
      // Old-version receiver: masks fit the legacy inline fields.
      StartLcpReq* req = (StartLcpReq*)signal->getDataPtrSend();
      req->participatingLQH_v1 = c_lcpState.m_participatingLQH;
      req->participatingDIH_v1 = c_lcpState.m_participatingDIH;
      sendSignalWithDelay(ref, GSN_START_LCP_REQ, signal, 500,
                        StartLcpReq::SignalLength);
    }
    else
    {
      // Old-version receiver but masks too wide: unsupported mix.
      ndbabort();
    }
    return;
  }
  else if (ERROR_INSERTED(7021) && ((rand() % 10) > 4))
  {
    // Error insert 7021: randomly drop the request to other nodes.
    infoEvent("Don't send START_LCP_REQ to %u", nodeId);
    return;
  }

  StartLcpReq* req = (StartLcpReq*)signal->getDataPtrSend();
  if (ndbd_send_node_bitmask_in_section(getNodeInfo((nodeId)).m_version))
  {
    jam();
    // New-version receiver: send both masks as two linear sections and
    // clear the legacy inline fields.
    LinearSectionPtr lsptr[3];
    lsptr[0].p = c_lcpState.m_participatingLQH.rep.data;
    lsptr[0].sz = packed_length1;
    lsptr[1].p = c_lcpState.m_participatingDIH.rep.data;
    lsptr[1].sz = packed_length2;
    req->participatingLQH_v1.clear();
    req->participatingDIH_v1.clear();
    sendSignal(ref, GSN_START_LCP_REQ, signal, StartLcpReq::SignalLength, JBB,
               lsptr, 2);
  }
  else if ((packed_length1 <= NdbNodeBitmask48::Size) &&
           (packed_length2 <= NdbNodeBitmask48::Size))
  {
    jam();
    // Old-version receiver: masks fit the legacy inline fields.
    req->participatingLQH_v1 = c_lcpState.m_participatingLQH;
    req->participatingDIH_v1 = c_lcpState.m_participatingDIH;
    sendSignal(ref, GSN_START_LCP_REQ, signal, StartLcpReq::SignalLength, JBB);
  }
  else
  {
    // Old-version receiver but masks too wide: unsupported mix.
    ndbabort();
  }
}
21476 
void
Dbdih::execSTART_LCP_CONF(Signal* signal)
{
  StartLcpConf * conf = (StartLcpConf*)signal->getDataPtr();

  Uint32 nodeId = refToNode(conf->senderRef);

  if (is_lcp_paused())
  {
    // Paused-LCP path: this CONF is part of including a starting node
    // into an ongoing (paused) LCP, not part of a normal LCP start.
    // Only the master drives the pause protocol.
    ndbrequire(isMaster());
    if (c_pause_lcp_master_state == PAUSE_START_LCP_INCLUSION)
    {
      jam();
      /**
       * We have completed including the starting node into the LCP.
       * We now need to copy the meta data.
       *
       * We come here as part of starting up a new starting node, so
       * we don't come here as part of a normal LCP start. So the
       * bitmap for outstanding signals we should not use since we
       * haven't set it up in this case.
       */
      c_pause_lcp_master_state = PAUSE_IN_LCP_COPY_META_DATA;
      start_copy_meta_data(signal);
      return;
    }
    else
    {
      jam();
      ndbrequire(c_pause_lcp_master_state == PAUSE_COMPLETE_LCP_INCLUSION);
      /**
       * We have completed copying the meta data and now we have also
       * completed the inclusion of the new node into the LCP protocol.
       * We are now ready to continue to the next stage of the node
       * restart handling for the starting node.
       */
      sendPAUSE_LCP_REQ(signal, false);
      return;
    }
  }
  // Normal LCP start: presumably returns early here until every
  // participant has confirmed START_LCP_REQ.
  receiveLoopMacro(START_LCP_REQ, nodeId);

  // All participants confirmed: release the start-LCP mutex and
  // continue in startLcpMutex_unlocked.
  Mutex mutex(signal, c_mutexMgr, c_startLcpMutexHandle);
  Callback c = { safe_cast(&Dbdih::startLcpMutex_unlocked), 0 };
  mutex.unlock(c);
}
21523 
void
Dbdih::startLcpMutex_unlocked(Signal* signal, Uint32 data, Uint32 retVal){
  jamEntry();
  ndbrequire(retVal == 0);

  Mutex mutex(signal, c_mutexMgr, c_startLcpMutexHandle);
  mutex.release();

  /* ----------------------------------------------------------------------- */
  /*     NOW PROCEED BY STARTING THE LOCAL CHECKPOINT IN EACH LQH.           */
  /* ----------------------------------------------------------------------- */
  // Snapshot the participating-LQH set; it determines which nodes will
  // eventually receive the final ("last") LCP_FRAG_ORD.
  c_lcpState.m_LAST_LCP_FRAG_ORD = c_lcpState.m_participatingLQH;
  DEB_LCP(("startLcpMutex_unlocked: m_LAST_LCP_FRAG_ORD = %s",
	   c_lcpState.m_LAST_LCP_FRAG_ORD.getText()));

  // From here on this LCP is treated as running with PAUSE LCP support.
  c_lcp_runs_with_pause_support = true;
  {
    jam();
    /**
     * We can release the mutex now that we have started the LCP. Since we
     * hold the mutex we know that currently no copy of meta data is ongoing.
     * We have setup everything for the LCP to start we reach this call, so it
     * is safe to release the mutex and rely on the PAUSE LCP protocol to
     * handle the rest.
     *
     * We have held the fragment info mutex long enough to ensure that we have
     * copied the m_participatingDIH bitmap to all participants in the LCP.
     * This means that when we reach the participant nodes we can safely add
     * the starting node to m_participatingDIH to ensure that the starting
     * node also gets all the rest of the updates to the LCP data in DIH
     * while the LCP is completing. This phase of the LCP is fairly quick, so
     * the cost of holding the mutex here should be fairly small. The part of
     * the LCP that consumes most time is when we start performing the real
     * checkpointing on the m_participatingLQH nodes.
     */
    Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
    mutex.unlock();
  }
  CRASH_INSERTION(7015);
  c_lcpState.setLcpStatus(LCP_START_LCP_ROUND, __LINE__);
  startLcpRoundLoopLab(signal, 0, 0);
}
21566 
void
Dbdih::master_lcp_fragmentMutex_locked(Signal* signal,
                                       Uint32 failedNodePtrI, Uint32 retVal)
{
  // Callback invoked once the new LCP master (after take-over) owns the
  // fragment-info mutex: report the take-over, leave take-over state,
  // finish the node-failure step, and resume driving the LCP.
  jamEntry();
  ndbrequire(retVal == 0);

  signal->theData[0] = NDB_LE_LCP_TakeoverCompleted;
  signal->theData[1] = c_lcpMasterTakeOverState.state;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);

  // Dump LCP state (DUMP code 7012) for diagnostics.
  signal->theData[0] = 7012;
  execDUMP_STATE_ORD(signal);

  c_lcpMasterTakeOverState.set(LMTOS_IDLE, __LINE__);

  // LCP take-over part of handling this node failure is now complete.
  checkLocalNodefailComplete(signal, failedNodePtrI, NF_LCP_TAKE_OVER);

  startLcpRoundLoopLab(signal, 0, 0);
}
21587 
21588 
//#define DIH_DEBUG_REPLICA_SEARCH
#ifdef DIH_DEBUG_REPLICA_SEARCH
/* Debug-only counters aggregating replica-search statistics across
 * invocations of startNextChkpt(): replicas scheduled (started+queued)
 * and replicas examined. Reset in startLcpRoundLoopLab(). */
static Uint32 totalScheduled;
static Uint32 totalExamined;
#endif
21594 
void Dbdih::startLcpRoundLoopLab(Signal* signal,
				 Uint32 startTableId, Uint32 startFragId)
{
  // Sanity check: at the (re)start of an LCP round no alive node may
  // have fragment checkpoints started or queued.
  NodeRecordPtr nodePtr;
  for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++)
  {
    ptrAss(nodePtr, nodeRecord);
    if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
      jamLine(nodePtr.i);
      ndbrequire(nodePtr.p->noOfStartedChkpt == 0);
      ndbrequire(nodePtr.p->noOfQueuedChkpt == 0);
    }//if
  }//for
  // Position the replica search and clear the set of nodes whose
  // replicas are already fully queued.
  c_lcpState.currentFragment.tableId = startTableId;
  c_lcpState.currentFragment.fragmentId = startFragId;
  c_lcpState.m_allReplicasQueuedLQH.clear();

#ifdef DIH_DEBUG_REPLICA_SEARCH
  totalScheduled = totalExamined = 0;
#endif

  startNextChkpt(signal);
}//Dbdih::startLcpRoundLoopLab()
21618 
void Dbdih::startNextChkpt(Signal* signal)
{
  // Scan fragments from the saved position and schedule fragment
  // checkpoints: started directly on a node (up to its max) or queued
  // (up to the per-node queue limit). The scan is bounded per
  // invocation to preserve real-time behaviour.
  jam();
  const bool allReplicaCheckpointsQueued =
    c_lcpState.m_allReplicasQueuedLQH.
    contains(c_lcpState.m_participatingLQH);

  if (allReplicaCheckpointsQueued)
  {
    jam();

    /**
     * No need to find new checkpoints to start,
     * just waiting for completion
     */

    sendLastLCP_FRAG_ORD(signal);
    return;
  }

  Uint32 lcpId = SYSFILE->latestLCP_ID;

  /* Initialise handledNodes with those already fully queued */
  NdbNodeBitmask handledNodes = c_lcpState.m_allReplicasQueuedLQH;

  /* Remove any that have failed in the interim */
  handledNodes.bitAND(c_lcpState.m_participatingLQH);

  const Uint32 lcpNodes = c_lcpState.m_participatingLQH.count();

  // 'save' stays true until we hit the first replica we could not
  // queue; that position becomes the resume point for the next call.
  bool save = true;
  LcpState::CurrentFragment curr = c_lcpState.currentFragment;

  Uint32 examined = 0;
  Uint32 started = 0;
  Uint32 queued = 0;

  while (curr.tableId < ctabFileSize) {
    TabRecordPtr tabPtr;
    tabPtr.i = curr.tableId;
    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
    if ((tabPtr.p->tabStatus != TabRecord::TS_ACTIVE) ||
        (tabPtr.p->tabLcpStatus != TabRecord::TLS_ACTIVE)) {
      // Table not participating in this LCP: skip all its fragments.
      curr.tableId++;
      curr.fragmentId = 0;
      continue;
    }//if

    FragmentstorePtr fragPtr;
    getFragstore(tabPtr.p, curr.fragmentId, fragPtr);

    ReplicaRecordPtr replicaPtr;
    for(replicaPtr.i = fragPtr.p->storedReplicas;
	replicaPtr.i != RNIL ;
	replicaPtr.i = replicaPtr.p->nextPool){

      jam();
      c_replicaRecordPool.getPtr(replicaPtr);

      examined++;

      NodeRecordPtr nodePtr;
      nodePtr.i = replicaPtr.p->procNode;
      ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);

      if (c_lcpState.m_participatingLQH.get(nodePtr.i))
      {
	if (replicaPtr.p->lcpOngoingFlag &&
	    replicaPtr.p->lcpIdStarted < lcpId)
	{
	  jam();
	  //-------------------------------------------------------------------
	  // We have found a replica on a node that performs local checkpoint
	  // that is alive and that have not yet been started.
	  //-------------------------------------------------------------------

          if (nodePtr.p->noOfStartedChkpt <
              getMaxStartedFragCheckpointsForNode(nodePtr.i))
	  {
	    jam();
	    /**
	     * Send LCP_FRAG_ORD to LQH
	     */

	    /**
	     * Mark the replica as belonging to this LCP (lcpIdStarted)
	     * so it is not scheduled twice.
	     */
	    replicaPtr.p->lcpIdStarted = lcpId;

	    Uint32 i = nodePtr.p->noOfStartedChkpt;
	    nodePtr.p->startedChkpt[i].tableId = tabPtr.i;
	    nodePtr.p->startedChkpt[i].fragId = curr.fragmentId;
	    nodePtr.p->startedChkpt[i].replicaPtr = replicaPtr.i;
	    nodePtr.p->noOfStartedChkpt = i + 1;

	    sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[i]);

            started++;
	  }
          else if (nodePtr.p->noOfQueuedChkpt <
                   MAX_QUEUED_FRAG_CHECKPOINTS_PER_NODE)
	  {
	    jam();
	    /**
	     * Put LCP_FRAG_ORD "in queue"
	     */

	    /**
	     * Mark the replica as belonging to this LCP (lcpIdStarted)
	     * so it is not scheduled twice.
	     */
	    replicaPtr.p->lcpIdStarted = lcpId;

	    Uint32 i = nodePtr.p->noOfQueuedChkpt;
	    nodePtr.p->queuedChkpt[i].tableId = tabPtr.i;
	    nodePtr.p->queuedChkpt[i].fragId = curr.fragmentId;
	    nodePtr.p->queuedChkpt[i].replicaPtr = replicaPtr.i;
	    nodePtr.p->noOfQueuedChkpt = i + 1;
            queued++;
	  }
	  else
	  {
	    jam();
	    // Node's start slots and queue are both full.

	    if(save)
	    {
              /**
               * Stop increasing value on first replica that
               * we could not enqueue, so we don't miss it
               * next time
               */
              c_lcpState.currentFragment = curr;
              save = false;
            }

	    handledNodes.set(nodePtr.i);
	    if (handledNodes.count() == lcpNodes)
	    {
              /**
               * All participating nodes have either
               * - Full queues
               * - All available replica checkpoints queued
               *   (m_allReplicasQueuedLQH)
               *
               * Therefore, exit the search here.
               */
#ifdef DIH_DEBUG_REPLICA_SEARCH
              ndbout_c("Search : All nodes busy.  Examined %u Started %u Queued %u",
                       examined, started, queued);
              totalExamined+= examined;
              totalScheduled += (started + queued);
#endif
	      return;
	    }//if
	  }//if
	}
      }//if
    }
    curr.fragmentId++;
    if (curr.fragmentId >= tabPtr.p->totalfragments) {
      jam();
      curr.fragmentId = 0;
      curr.tableId++;
    }//if
    if (started + queued > 256 ||
        (!save && examined > 128))
    {
      /**
       * This method can take a very long time (around 30ms in a 72-node
       * cluster with 4 LDMs and around a few hundred tables. In this
       * case filling the start and queue can be around 8000 things to
       * start and queue up which is a significant effort. This can lead
       * to problems with heartbeats and other real-time mechanisms, so
       * we stop here after reaching more than 256 items. This should
       * set the limit around 1ms of execution time and give a more
       * stable real-time environment.
       *
       * We also avoid doing any long searches forward when we already
       * found one node queue being full.
       */
      jam();
      if (save)
      {
        jam();
        // No full queue seen yet: resume from the current position.
        c_lcpState.currentFragment = curr;
      }
#ifdef DIH_DEBUG_REPLICA_SEARCH
      ndbout_c("Search : 256 handled.  Examined %u Started %u Queued %u",
               examined, started, queued);
      totalExamined+= examined;
      totalScheduled += (started + queued);
#endif
      return;
    }
  }//while

#ifdef DIH_DEBUG_REPLICA_SEARCH
  ndbout_c("Search : At least one node not busy.  Examined %u Started %u Queued %u",
           examined, started, queued);
  totalExamined+= examined;
  totalScheduled += (started + queued);
#endif

  /**
   * Have examined all replicas and attempted to
   * enqueue as many replica LCPs as possible,
   * without filling all queues.
   * This means that some node(s) have no more
   * replica LCPs to be enqueued.
   * These are the node(s) which are *not* in
   * the handled bitmap on this round.
   * We keep track of these to allow the search
   * to exit early on future invocations.
   */

  /* Invert handled nodes to reveal newly finished nodes */
  handledNodes.bitXOR(c_lcpState.m_participatingLQH);

  /* Add newly finished nodes to the global state */
  c_lcpState.m_allReplicasQueuedLQH.bitOR(handledNodes);

  sendLastLCP_FRAG_ORD(signal);
}//Dbdih::startNextChkpt()
21841 
/**
 * Broadcast the concluding LCP_FRAG_ORD (lastFragmentFlag = true) to every
 * participating LQH node that has finished all of its started and queued
 * fragment checkpoints and is still waited for in m_LAST_LCP_FRAG_ORD.
 * tableId = RNIL together with lastFragmentFlag marks "no more fragments".
 */
void Dbdih::sendLastLCP_FRAG_ORD(Signal* signal)
{
  LcpFragOrd * const lcpFragOrd = (LcpFragOrd *)&signal->theData[0];
  lcpFragOrd->tableId = RNIL;
  lcpFragOrd->fragmentId = 0;
  lcpFragOrd->lcpId = SYSFILE->latestLCP_ID;
  lcpFragOrd->lcpNo = 0;
  lcpFragOrd->keepGci = c_lcpState.keepGci;
  lcpFragOrd->lastFragmentFlag = true;

  NodeRecordPtr nodePtr;
  for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++)
  {
    jam();
    ptrAss(nodePtr, nodeRecord);

    if(nodePtr.p->noOfQueuedChkpt == 0 &&
       nodePtr.p->noOfStartedChkpt == 0 &&
       c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodePtr.i)){
      jam();

      CRASH_INSERTION(7028);

      /**
       * Nothing queued or started <=> Complete on that node
       *
       */
      c_lcpState.m_LAST_LCP_FRAG_ORD.clearWaitingFor(nodePtr.i);
      if(ERROR_INSERTED(7075)){
        /* Error insert: suppress the send; done() is checked below instead */
	continue;
      }

      CRASH_INSERTION(7193);
      DEB_LCP(("Send last LCP_FRAG_ORD to node %u", nodePtr.i));
      BlockReference ref = calcLqhBlockRef(nodePtr.i);
      sendSignal(ref, GSN_LCP_FRAG_ORD, signal,LcpFragOrd::SignalLength, JBB);
    }
    else
    {
#ifdef DEBUG_LCP
      if (c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodePtr.i))
      {
        DEB_LCP(("Still waiting for sending last LCP_FRAG_ORD to node %u,"
                 " queued: %u, started: %u, waiting_for: %u",
                 nodePtr.i,
                 nodePtr.p->noOfQueuedChkpt,
                 nodePtr.p->noOfStartedChkpt,
                 c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodePtr.i)));
      }
#endif
    }
  }
  /* With error insert 7075 active, crash once every node has been handled */
  if(ERROR_INSERTED(7075))
  {
    if(c_lcpState.m_LAST_LCP_FRAG_ORD.done())
    {
      CRASH_INSERTION(7075);
    }
  }
}//Dbdih::sendLastLCP_FRAGORD()
21902 
21903 /* ------------------------------------------------------------------------- */
21904 /*       A FRAGMENT REPLICA HAS COMPLETED EXECUTING ITS LOCAL CHECKPOINT.    */
21905 /*       CHECK IF ALL REPLICAS IN THE TABLE HAVE COMPLETED. IF SO STORE THE  */
21906 /*       THE TABLE DISTRIBUTION ON DISK. ALSO SEND LCP_REPORT TO ALL OTHER   */
21907 /*       NODES SO THAT THEY CAN STORE THE TABLE ONTO DISK AS WELL.           */
21908 /* ------------------------------------------------------------------------- */
/**
 * A fragment replica has completed its local checkpoint in an LQH.
 *
 * Arrival paths:
 * - BROADCAST_REQ from our own LQH: re-stamped with our node id and proxied
 *   to the other participating DIHs (unless LCP reporting is paused, in
 *   which case it is queued), then processed locally as well.
 * - Normal LCP_FRAG_REP (signal->length() == SignalLength) from a
 *   participating, live node.
 * - Delayed resend (SignalLengthTQ with fromTQ == 1), issued below when the
 *   table was busy being copied to disk; the sender may have died meanwhile.
 *
 * Updates the replica/fragment/table LCP bookkeeping, possibly triggers a
 * write of the table description to disk, and on the master removes the
 * replica from the node's started/queued checkpoint lists and tries to
 * start more fragment checkpoints.
 */
void Dbdih::execLCP_FRAG_REP(Signal* signal)
{
  jamEntry();

  LcpFragRep * lcpReport = (LcpFragRep *)&signal->theData[0];

  /**
   * Proxying LCP_FRAG_REP
   */
  const bool broadcast_req = lcpReport->nodeId == LcpFragRep::BROADCAST_REQ;
  if (broadcast_req)
  {
    jam();
    /* A BROADCAST_REQ must originate from our own node's LQH */
    ndbrequire(refToNode(signal->getSendersBlockRef()) == getOwnNodeId());

    /**
     * Set correct nodeId
     */
    lcpReport->nodeId = getOwnNodeId();

    if (is_lcp_paused() || c_dequeue_lcp_rep_ongoing)
    {
      jam();
      /**
       * We are currently pausing sending all information about LCP_FRAG_REP
       * from this node and also pausing any local processing of signals
       * received from LQH. We can still handle messages from other DIH
       * nodes. These will eventually stop due to pausing and we will wait
       * until we know that all those signals have arrived at their
       * destination.
       *
       * We won't send anything until we have completed the
       * PAUSE_LCP_REQ protocol which means until the starting node have
       * received all the meta data from the master node.
       */
      queue_lcp_frag_rep(signal, lcpReport);
      return;
    }
    /* Forward to all other participating DIHs (not ourselves) */
    NodeReceiverGroup rg(DBDIH, c_lcpState.m_participatingDIH);
    rg.m_nodes.clear(getOwnNodeId());
    sendSignal(rg, GSN_LCP_FRAG_REP, signal, signal->getLength(), JBB);

    /**
     * and continue processing
     */
  }

  Uint32 nodeId = lcpReport->nodeId;
  Uint32 tableId = lcpReport->tableId;
  Uint32 fragId = lcpReport->fragId;

  /**
   * We can receive LCP_FRAG_REP in 2 different situations:
   * 1) signal->length() == SignalLength
   * A normal report of completion of a LCP on a specific fragment. This
   * cannot arrive when the node is down, the sending must be in
   * the m_participatingLQH set, in addition the node must be alive
   * in the DIH sense which means that it has passed the state where it
   * is included in all the LCP protocols and GCP protocols.
   *
   * 2) signal->length() == SignalLengthTQ && lcpReport->fromTQ == 1
   * This signal is sent when the table is in copy state when a signal
   * in 1) is received. In this case the node could die before we
   * arrive here. We check this by simply checking if the node is still
   * alive. If this happens we can simply drop the signal.
   */
  if (!checkNodeAlive(nodeId))
  {
    jam();
    /* Only the delayed (fromTQ) variant may arrive from a dead node */
    ndbrequire(signal->length() == LcpFragRep::SignalLengthTQ &&
               lcpReport->fromTQ == Uint32(1));
    /**
     * Given that we can delay this signal during a table copy situation,
     * we can actually receive this signal when the node is already dead. If
     * the node is dead then we drop the signal as soon as possible, the node
     * failure handling will ensure that the node is properly handled anyways.
     */
    return;
  }

  ndbrequire(c_lcpState.lcpStatus != LCP_STATUS_IDLE);

#if 0
  printLCP_FRAG_REP(stdout,
		    signal->getDataPtr(),
		    signal->length(), number());
#endif

  jamEntry();

  /* Error insert 7178: drop the report and kill the reporting node */
  if (ERROR_INSERTED(7178) && (nodeId == ERROR_INSERT_EXTRA))
  {
    jam();
    ndbout_c("throwing away LCP_FRAG_REP from  (and killing) %d", nodeId);
    SET_ERROR_INSERT_VALUE2(7179, nodeId);
    signal->theData[0] = 9999;
    sendSignal(numberToRef(CMVMI, nodeId),
		  GSN_NDB_TAMPER, signal, 1, JBA);
    return;
  }

  /* Error insert 7179: silently drop reports from the chosen node */
  if (ERROR_INSERTED(7179) && (nodeId == ERROR_INSERT_EXTRA))
  {
    jam();
    ndbout_c("throwing away LCP_FRAG_REP from %d", nodeId);
    return;
  }

  CRASH_INSERTION2(7025, isMaster());
  CRASH_INSERTION2(7016, !isMaster());
  CRASH_INSERTION2(7191, (!isMaster() && tableId));

  /* True only for a genuine time-queue resend, not for a broadcast */
  bool fromTimeQueue = (signal->length() == LcpFragRep::SignalLengthTQ &&
                        lcpReport->fromTQ == Uint32(1) &&
                        !broadcast_req);

  TabRecordPtr tabPtr;
  tabPtr.i = tableId;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  if(tabPtr.p->tabCopyStatus != TabRecord::CS_IDLE) {
    jam();
    /*-----------------------------------------------------------------------*/
    // If the table is currently copied to disk we also
    // stop already here to avoid strange half-way updates
    // of the table data structures.
    /*-----------------------------------------------------------------------*/
    /*
      We need to send this signal without a delay since we have discovered
      that we have run out of space in the short time queue. This problem
      is very unlikely to happen but it has and it results in a node crash.
      This should be considered a "quick fix" and not a permanent solution.
      A cleaner/better way would be to check the time queue if it is full or
      not before sending this signal.
    */
    lcpReport->fromTQ = Uint32(1);
    sendSignal(reference(), GSN_LCP_FRAG_REP, signal,
               LcpFragRep::SignalLengthTQ, JBB);
    /* Kept here for reference
       sendSignalWithDelay(reference(), GSN_LCP_FRAG_REP,
       signal, 20, signal->length());
    */

    /* Count the outstanding resend exactly once per original report */
    if(!fromTimeQueue){
      c_lcpState.noOfLcpFragRepOutstanding++;
    }

    return;
  }//if

  if(fromTimeQueue)
  {
    jam();
    /* The resend has now been processed; balance the counter above */
    ndbrequire(c_lcpState.noOfLcpFragRepOutstanding > 0);
    c_lcpState.noOfLcpFragRepOutstanding--;
  }

  /* Record the completed replica LCP; true when the whole table is done */
  bool tableDone = reportLcpCompletion(lcpReport);

  Uint32 started = lcpReport->maxGciStarted;
#ifdef VM_TRACE
  Uint32 completed = lcpReport->maxGciCompleted;
#endif

  /* Track the highest GCI started by any replica in this LCP */
  if (started > c_lcpState.lcpStopGcp)
  {
    jam();
    c_lcpState.lcpStopGcp = started;
  }

  /**
   * Update m_local_lcp_state
   *
   * we could only look fragments that we have locally...
   *   but for now we look at all fragments
   */
  m_local_lcp_state.lcp_frag_rep(lcpReport);

  if (tableDone)
  {
    jam();

    if (tabPtr.p->tabStatus == TabRecord::TS_IDLE ||
        tabPtr.p->tabStatus == TabRecord::TS_DROPPING)
    {
      jam();
      /* Table is being (or has been) dropped - nothing to save */
      g_eventLogger->info("TS_DROPPING - Neglecting to save Table: %d Frag: %d - ",
                          tableId, fragId);
    }
    else
    {
      jam();
      /**
       * Write table description to file
       */
      tabPtr.p->tabLcpStatus = TabRecord::TLS_WRITING_TO_FILE;
      tabPtr.p->tabCopyStatus = TabRecord::CS_LCP_READ_TABLE;

      /**
       * Check whether we should write immediately, or queue...
       */
      if (c_lcpTabDefWritesControl.requestMustQueue())
      {
        jam();
        //ndbout_c("DIH : Queueing tab def flush op on table %u", tabPtr.i);
        /* Mark as queued - will be started when an already running op completes */
        tabPtr.p->tabUpdateState = TabRecord::US_LOCAL_CHECKPOINT_QUEUED;
      }
      else
      {
        /* Run immediately */
        jam();
        tabPtr.p->tabUpdateState = TabRecord::US_LOCAL_CHECKPOINT;
        signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
        signal->theData[1] = tabPtr.i;
        sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
      }

      bool ret = checkLcpAllTablesDoneInLqh(__LINE__);
      if (ret && ERROR_INSERTED(7209))
      {
        jam();
        CLEAR_ERROR_INSERT_VALUE;
        signal->theData[0] = 9999;
        sendSignal(numberToRef(CMVMI, cmasterNodeId),
                   GSN_NDB_TAMPER, signal, 1, JBB);
      }
    }
  }

#ifdef VM_TRACE
  /* --------------------------------------------------------------------- */
  // REPORT that local checkpoint have completed this fragment.
  /* --------------------------------------------------------------------- */
  signal->theData[0] = NDB_LE_LCPFragmentCompleted;
  signal->theData[1] = nodeId;
  signal->theData[2] = tableId;
  signal->theData[3] = fragId;
  signal->theData[4] = started;
  signal->theData[5] = completed;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 6, JBB);
#endif

  /* During a master takeover, defer/redirect further processing */
  bool ok = false;
  switch(c_lcpMasterTakeOverState.state){
  case LMTOS_IDLE:
    ok = true;
    jam();
    break;
  case LMTOS_WAIT_LCP_FRAG_REP:
    jam();
    checkEmptyLcpComplete(signal);
    return;
  case LMTOS_INITIAL:
  case LMTOS_ALL_IDLE:
  case LMTOS_ALL_ACTIVE:
  case LMTOS_LCP_CONCLUDING:
  case LMTOS_COPY_ONGOING:
    /**
     * In the old code we ensured that all outstanding LCP_FRAG_REPs
     * were handled before entering those states. So receiving an
     * LCP_FRAG_REP is ok in new code, even in new code will block
     * LCP_COMPLETE_REP such that we don't complete an LCP while
     * processing a master take over. But we can still receive
     * LCP_FRAG_REP while processing a master takeover.
     */
    return;
  }
  ndbrequire(ok);

  /* ----------------------------------------------------------------------- */
  // Check if there are more LCP's to start up.
  /* ----------------------------------------------------------------------- */
  if(isMaster())
  {
    jam();

    /**
     * Remove from "running" array
     */
    NodeRecordPtr nodePtr;
    nodePtr.i = nodeId;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);

    const Uint32 outstanding = nodePtr.p->noOfStartedChkpt;
    if (outstanding > 0)
    {
      jam();
      bool found = false;
      /* Shift out the matching entry; assumes at most one match exists */
      for (Uint32 i = 0; i < outstanding; i++)
      {
        if(nodePtr.p->startedChkpt[i].tableId != tableId ||
           nodePtr.p->startedChkpt[i].fragId != fragId)
        {
          jam();
          continue;
        }
        jam();
        memmove(&nodePtr.p->startedChkpt[i],
                &nodePtr.p->startedChkpt[i+1],
                (outstanding - (i + 1)) * sizeof(nodePtr.p->startedChkpt[0]));
        found = true;
      }
      if (found)
      {
        jam();
        nodePtr.p->noOfStartedChkpt--;
        checkStartMoreLcp(signal, nodeId, true);
        return;
      }
    }
    /**
     * Not in the started list: during master takeover the old master may
     * have sent LCP_FRAG_ORD for a replica we still have queued.
     */
    const Uint32 outstanding_queued = nodePtr.p->noOfQueuedChkpt;
    if (outstanding_queued > 0)
    {
      jam();
      bool found = false;
      for (Uint32 i = 0; i < outstanding_queued; i++)
      {
        if(nodePtr.p->queuedChkpt[i].tableId != tableId ||
           nodePtr.p->queuedChkpt[i].fragId != fragId)
        {
          jam();
          continue;
        }
        jam();
        memmove(&nodePtr.p->queuedChkpt[i],
                &nodePtr.p->queuedChkpt[i+1],
                (outstanding_queued - (i + 1)) *
                  sizeof(nodePtr.p->queuedChkpt[0]));
        found = true;
      }
      if (found)
      {
        jam();
        nodePtr.p->noOfQueuedChkpt--;
        if (nodePtr.p->noOfStartedChkpt == 0)
        {
          jam();
          checkStartMoreLcp(signal, nodePtr.i, true);
        }
        DEB_LCP(("LCP_FRAG_REP: nodePtr(%u)->noOfQueuedChkpt = %u"
                 ", nodePtr->noOfStartedChkpt = %u"
                 ", tab(%u,%u)",
                 nodePtr.i,
                 nodePtr.p->noOfQueuedChkpt,
                 nodePtr.p->noOfStartedChkpt,
                 tableId,
                 fragId));
        return;
      }
    }
    /**
     * In a master takeover situation we might have the fragment replica
     * placed in the queue as well. It is possible that the old master
     * did send LCP_FRAG_ORD and it is now arriving here.
     *
     * We start by checking the queued list, if it is in neither the
     * queued nor in the started list, then the table is dropped. There
     * is also one more obscure variant when the old master had a deeper
     * queue than we have, in that case we could come here, to handle
     * that we only assert on that the table is dropped.
     */
    ndbassert(tabPtr.p->tabStatus == TabRecord::TS_IDLE ||
              tabPtr.p->tabStatus == TabRecord::TS_DROPPING);
  }
}
22274 
22275 bool
checkLcpAllTablesDoneInLqh(Uint32 line)22276 Dbdih::checkLcpAllTablesDoneInLqh(Uint32 line){
22277   TabRecordPtr tabPtr;
22278 
22279   /**
22280    * Check if finished with all tables
22281    */
22282   for (tabPtr.i = 0; tabPtr.i < ctabFileSize; tabPtr.i++) {
22283     //jam(); Removed as it flushed all other jam traces.
22284     ptrAss(tabPtr, tabRecord);
22285     if ((tabPtr.p->tabStatus == TabRecord::TS_ACTIVE) &&
22286         (tabPtr.p->tabLcpStatus == TabRecord::TLS_ACTIVE))
22287     {
22288       jam();
22289       /**
22290        * Nope, not finished with all tables
22291        */
22292       return false;
22293     }//if
22294   }//for
22295 
22296   CRASH_INSERTION2(7026, isMaster());
22297   CRASH_INSERTION2(7017, !isMaster());
22298 
22299   c_lcpState.setLcpStatus(LCP_TAB_COMPLETED, line);
22300 
22301   if (ERROR_INSERTED(7194))
22302   {
22303     ndbout_c("CLEARING 7194");
22304     CLEAR_ERROR_INSERT_VALUE;
22305   }
22306 
22307 #ifdef DIH_DEBUG_REPLICA_SEARCH
22308   if (totalScheduled == 0)
22309   {
22310     totalScheduled = 1;
22311   }
22312   ndbout_c("LCP complete.  Examined %u replicas, scheduled %u.  Ratio : %u.%u",
22313            totalExamined,
22314            totalScheduled,
22315            totalExamined/totalScheduled,
22316            (10 * (totalExamined -
22317                   ((totalExamined/totalScheduled) *
22318                    totalScheduled)))/
22319            totalScheduled);
22320 #endif
22321 
22322   return true;
22323 }
22324 
findReplica(ReplicaRecordPtr & replicaPtr,Fragmentstore * fragPtrP,Uint32 nodeId,bool old)22325 void Dbdih::findReplica(ReplicaRecordPtr& replicaPtr,
22326 			Fragmentstore* fragPtrP,
22327 			Uint32 nodeId,
22328 			bool old)
22329 {
22330   replicaPtr.i = old ? fragPtrP->oldStoredReplicas : fragPtrP->storedReplicas;
22331   while(replicaPtr.i != RNIL){
22332     c_replicaRecordPool.getPtr(replicaPtr);
22333     if (replicaPtr.p->procNode == nodeId) {
22334       jam();
22335       return;
22336     } else {
22337       jam();
22338       replicaPtr.i = replicaPtr.p->nextPool;
22339     }//if
22340   };
22341 
22342 #ifdef VM_TRACE
22343   g_eventLogger->info("Fragment Replica(node=%d) not found", nodeId);
22344   replicaPtr.i = fragPtrP->oldStoredReplicas;
22345   while(replicaPtr.i != RNIL){
22346     c_replicaRecordPool.getPtr(replicaPtr);
22347     if (replicaPtr.p->procNode == nodeId) {
22348       jam();
22349       break;
22350     } else {
22351       jam();
22352       replicaPtr.i = replicaPtr.p->nextPool;
22353     }//if
22354   };
22355   if(replicaPtr.i != RNIL){
22356     g_eventLogger->info("...But was found in oldStoredReplicas");
22357   } else {
22358     g_eventLogger->info("...And wasn't found in oldStoredReplicas");
22359   }
22360 #endif
22361   ndbabort();
22362 }//Dbdih::findReplica()
22363 
22364 
22365 int
handle_invalid_lcp_no(const LcpFragRep * rep,ReplicaRecordPtr replicaPtr)22366 Dbdih::handle_invalid_lcp_no(const LcpFragRep* rep,
22367 			     ReplicaRecordPtr replicaPtr)
22368 {
22369   ndbrequire(!isMaster());
22370   Uint32 lcpNo = rep->lcpNo;
22371   Uint32 lcpId = rep->lcpId;
22372 
22373   warningEvent("Detected previous node failure of %d during lcp",
22374                rep->nodeId);
22375 
22376   replicaPtr.p->nextLcp = lcpNo;
22377   replicaPtr.p->lcpId[lcpNo] = 0;
22378   replicaPtr.p->lcpStatus[lcpNo] = ZINVALID;
22379 
22380   for (Uint32 i = lcpNo; i != lcpNo; i = nextLcpNo(i))
22381   {
22382     jam();
22383     if (replicaPtr.p->lcpStatus[i] == ZVALID &&
22384 	replicaPtr.p->lcpId[i] >= lcpId)
22385     {
22386       ndbout_c("i: %d lcpId: %d", i, replicaPtr.p->lcpId[i]);
22387       ndbabort();
22388     }
22389   }
22390 
22391   return 0;
22392 }
22393 
22394 /**
22395  * Return true  if table is all fragment replicas have been checkpointed
22396  *                 to disk (in all LQHs)
22397  *        false otherwise
22398  */
/**
 * Record that one fragment replica completed its local checkpoint.
 * Updates the replica's LCP slot bookkeeping and decrements the per-fragment
 * and per-table outstanding-replica counters.
 *
 * @return true when this was the last outstanding replica of the last
 *         outstanding fragment of the table (or the table is being dropped),
 *         false while more replicas/fragments remain.
 */
bool
Dbdih::reportLcpCompletion(const LcpFragRep* lcpReport)
{
  Uint32 lcpNo = lcpReport->lcpNo;
  Uint32 lcpId = lcpReport->lcpId;
  Uint32 maxGciStarted = lcpReport->maxGciStarted;
  Uint32 maxGciCompleted = lcpReport->maxGciCompleted;
  Uint32 tableId = lcpReport->tableId;
  Uint32 fragId = lcpReport->fragId;
  Uint32 nodeId = lcpReport->nodeId;

  TabRecordPtr tabPtr;
  tabPtr.i = tableId;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

  /* A dropped/idle table needs no bookkeeping - treat it as done */
  if (tabPtr.p->tabStatus == TabRecord::TS_DROPPING ||
      tabPtr.p->tabStatus == TabRecord::TS_IDLE)
  {
    jam();
    return true;
  }

  FragmentstorePtr fragPtr;
  getFragstore(tabPtr.p, fragId, fragPtr);

  ReplicaRecordPtr replicaPtr;
  findReplica(replicaPtr, fragPtr.p, nodeId);

  ndbrequire(replicaPtr.p->lcpOngoingFlag == true);
  /* lcpNo mismatch can happen after a previous node failure during LCP */
  if(lcpNo != replicaPtr.p->nextLcp){
    if (handle_invalid_lcp_no(lcpReport, replicaPtr))
    {
      g_eventLogger->error("lcpNo = %d replicaPtr.p->nextLcp = %d",
                           lcpNo, replicaPtr.p->nextLcp);
      ndbabort();
    }
  }
  ndbrequire(lcpNo == replicaPtr.p->nextLcp);
  ndbrequire(lcpNo < MAX_LCP_STORED);
  ndbrequire(replicaPtr.p->lcpId[lcpNo] != lcpId);

  /* Mark this replica's checkpoint slot valid and advance to the next slot */
  replicaPtr.p->lcpIdStarted = lcpId;
  replicaPtr.p->lcpOngoingFlag = false;

  removeOldCrashedReplicas(tableId, fragId, replicaPtr);
  replicaPtr.p->lcpId[lcpNo] = lcpId;
  replicaPtr.p->lcpStatus[lcpNo] = ZVALID;
  replicaPtr.p->maxGciStarted[lcpNo] = maxGciStarted;
  replicaPtr.p->maxGciCompleted[lcpNo] = maxGciCompleted;
  replicaPtr.p->nextLcp = nextLcpNo(replicaPtr.p->nextLcp);
  ndbrequire(fragPtr.p->noLcpReplicas > 0);
  fragPtr.p->noLcpReplicas--;

  /* More replicas of this fragment still checkpointing? */
  if(fragPtr.p->noLcpReplicas > 0)
  {
    jam();
    return false;
  }
  /* Fragment done - more fragments of this table still checkpointing? */
  ndbrequire(tabPtr.p->tabActiveLcpFragments > 0);
  tabPtr.p->tabActiveLcpFragments--;
  if (tabPtr.p->tabActiveLcpFragments > 0)
  {
    jam();
    return false;
  }
  return true;
}//Dbdih::reportLcpCompletion()
22466 
/**
 * Master only: try to start another fragment checkpoint on nodeId.
 * If the node has a queued checkpoint, promote the head of its queue to
 * the started list and send LCP_FRAG_ORD for it. Otherwise, when
 * startNext is set, run the (potentially expensive) startNextChkpt()
 * search to refill the queues across all nodes.
 *
 * @return true  when the startNextChkpt() call was skipped (startNext was
 *               false with an empty queue) and must be made later,
 *         false when no further action is pending from this call.
 */
bool Dbdih::checkStartMoreLcp(Signal* signal, Uint32 nodeId, bool startNext)
{
  ndbrequire(isMaster());

  NodeRecordPtr nodePtr;
  nodePtr.i = nodeId;
  ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);

  /* The caller must have freed a started slot before calling us */
  ndbrequire(nodePtr.p->noOfStartedChkpt <
             getMaxStartedFragCheckpointsForNode(nodePtr.i));

  if (nodePtr.p->noOfQueuedChkpt > 0) {
    jam();
    /* Promote queue head into the started list and shift the queue down */
    Uint32 startIndex = nodePtr.p->noOfStartedChkpt;
    nodePtr.p->startedChkpt[startIndex] = nodePtr.p->queuedChkpt[0];
    nodePtr.p->noOfQueuedChkpt--;
    nodePtr.p->noOfStartedChkpt++;
    memmove(&nodePtr.p->queuedChkpt[0],
            &nodePtr.p->queuedChkpt[1],
            nodePtr.p->noOfQueuedChkpt *
              sizeof(nodePtr.p->queuedChkpt[0]));
    //-------------------------------------------------------------------
    // We can send a LCP_FRAG_ORD to the node ordering it to perform a
    // local checkpoint on this fragment replica.
    //-------------------------------------------------------------------

    sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[startIndex]);
    return false;
  }

  /**
   * If this node has no checkpoints queued up, then attempt to re-fill the
   * queues across all nodes.
   * The search for next replicas can be expensive, so we only do it when
   * the queues are empty and also avoid it if we are in the middle of going
   * through the queues to remove deleted tables.
   */
  if (startNext)
  {
    startNextChkpt(signal);
    return false;
  }
  /**
   * When we didn't want to call startNextChkpt from this method we need to
   * report that we skipped this call to ensure that this call is later made.
   */
  return true;
}//Dbdih::checkStartMoreLcp()
22515 
/**
 * Send LCP_FRAG_ORD to the LQH hosting the given fragment replica,
 * ordering it to checkpoint that fragment as part of the current LCP.
 * keepGci is capped at the node's last completed GCI.
 */
void
Dbdih::sendLCP_FRAG_ORD(Signal* signal,
			NodeRecord::FragmentCheckpointInfo info){

  ReplicaRecordPtr replicaPtr;
  replicaPtr.i = info.replicaPtr;
  c_replicaRecordPool.getPtr(replicaPtr);

  // MT LQH goes via proxy for DD reasons
  BlockReference ref = calcLqhBlockRef(replicaPtr.p->procNode);

  /* Error insert 7193: suppress the order to our own node */
  if (ERROR_INSERTED(7193) && replicaPtr.p->procNode == getOwnNodeId())
  {
    return;
  }

  /* Wrap the circular LCP slot number when the last usable slot is passed */
  if (replicaPtr.p->nextLcp >= MAX_LCP_USED)
  {
    jam();
    infoEvent("Updating nextLcp from %u to %u tab: %u",
              replicaPtr.p->nextLcp, 0,
              info.tableId);
    replicaPtr.p->nextLcp = 0;
  }

  /* keepGci must not exceed what the target node has actually completed */
  Uint32 keepGci = c_lcpState.keepGci;
  if (keepGci > SYSFILE->lastCompletedGCI[replicaPtr.p->procNode])
  {
    jam();
    keepGci = SYSFILE->lastCompletedGCI[replicaPtr.p->procNode];
  }

  LcpFragOrd * const lcpFragOrd = (LcpFragOrd *)&signal->theData[0];
  lcpFragOrd->tableId    = info.tableId;
  lcpFragOrd->fragmentId = info.fragId;
  lcpFragOrd->lcpId      = SYSFILE->latestLCP_ID;
  lcpFragOrd->lcpNo      = replicaPtr.p->nextLcp;
  lcpFragOrd->keepGci    = keepGci;
  lcpFragOrd->lastFragmentFlag = false;
  sendSignal(ref, GSN_LCP_FRAG_ORD, signal, LcpFragOrd::SignalLength, JBB);
}
22557 
checkLcpCompletedLab(Signal * signal)22558 void Dbdih::checkLcpCompletedLab(Signal* signal)
22559 {
22560   if (c_lcp_id_paused != RNIL)
22561   {
22562     jam();
22563     return;
22564   }
22565 
22566   if(c_lcpState.lcpStatus < LCP_TAB_COMPLETED)
22567   {
22568     jam();
22569     return;
22570   }
22571 
22572   /**
22573    * We only wait for completion of tables that are not in a dropping state.
22574    * This is to avoid that LCPs are being blocked by dropped tables. There
22575    * could be bugs in reporting dropped tables properly.
22576    */
22577   TabRecordPtr tabPtr;
22578   for (tabPtr.i = 0; tabPtr.i < ctabFileSize; tabPtr.i++)
22579   {
22580     //jam(); Removed as it flushed all other jam traces.
22581     ptrAss(tabPtr, tabRecord);
22582     if (tabPtr.p->tabLcpStatus != TabRecord::TLS_COMPLETED)
22583     {
22584       jam();
22585       return;
22586     }
22587   }
22588 
22589   CRASH_INSERTION2(7027, isMaster());
22590   CRASH_INSERTION2(7018, !isMaster());
22591 
22592   if(c_lcpState.lcpStatus == LCP_TAB_COMPLETED)
22593   {
22594     /**
22595      * We're done
22596      */
22597 
22598     c_lcpState.setLcpStatus(LCP_TAB_SAVED, __LINE__);
22599     sendLCP_COMPLETE_REP(signal);
22600     return;
22601   }
22602 
22603   ndbrequire(c_lcpState.lcpStatus == LCP_TAB_SAVED);
22604   allNodesLcpCompletedLab(signal);
22605   return;
22606 }//Dbdih::checkLcpCompletedLab()
22607 
22608 void
sendLCP_COMPLETE_REP(Signal * signal)22609 Dbdih::sendLCP_COMPLETE_REP(Signal* signal){
22610   jam();
22611 
22612   /**
22613    * Quick and dirty fix for bug#36276 dont save
22614    * LCP_COMPLETE_REP to same node same LCP twice
22615    */
22616   bool alreadysent =
22617     c_lcpState.m_lastLCP_COMPLETE_REP_id == SYSFILE->latestLCP_ID &&
22618     c_lcpState.m_lastLCP_COMPLETE_REP_ref == c_lcpState.m_masterLcpDihRef;
22619 
22620   if (!alreadysent)
22621   {
22622     LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
22623     rep->nodeId = getOwnNodeId();
22624     rep->lcpId = SYSFILE->latestLCP_ID;
22625     rep->blockNo = DBDIH;
22626 
22627     sendSignal(c_lcpState.m_masterLcpDihRef, GSN_LCP_COMPLETE_REP, signal,
22628                LcpCompleteRep::SignalLength, JBB);
22629 
22630     c_lcpState.m_lastLCP_COMPLETE_REP_id = SYSFILE->latestLCP_ID;
22631     c_lcpState.m_lastLCP_COMPLETE_REP_ref = c_lcpState.m_masterLcpDihRef;
22632   }
22633 
22634   /**
22635    * Say that an initial node restart does not need to be redone
22636    *   once node has been part of first LCP
22637    */
22638   if (c_set_initial_start_flag &&
22639       c_lcpState.m_participatingLQH.get(getOwnNodeId()))
22640   {
22641     jam();
22642     c_set_initial_start_flag = FALSE;
22643   }
22644 }
22645 
22646 /*-------------------------------------------------------------------------- */
22647 /* COMP_LCP_ROUND                   A LQH HAS COMPLETED A LOCAL CHECKPOINT  */
22648 /*------------------------------------------------------------------------- */
/**
 * Handle LCP_COMPLETE_REP.
 *
 * This signal arrives from several different senders (own DBLQH via
 * BROADCAST_REQ, participating DIH nodes, the master DIH, node failure
 * handling, and delayed re-sends during master takeover).  The large
 * case analysis below enumerates all seven arrival variants and how
 * each is filtered, delayed or processed.
 */
void Dbdih::execLCP_COMPLETE_REP(Signal* signal)
{
  jamEntry();

  CRASH_INSERTION(7191);

#if 0
  g_eventLogger->info("LCP_COMPLETE_REP");
  printLCP_COMPLETE_REP(stdout,
			signal->getDataPtr(),
			signal->length(), number());
  fflush(stdout);
#endif

  LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtr();

  if (rep->nodeId == LcpFragRep::BROADCAST_REQ)
  {
    jam();
    // BROADCAST_REQ is only ever sent locally by our own LQH.
    ndbrequire(refToNode(signal->getSendersBlockRef()) == getOwnNodeId());

    /**
     * Set correct nodeId
     */
    rep->nodeId = getOwnNodeId();

    /**
     * We want to ensure that we don't receive multiple LCP_COMPLETE_REP
     * from our LQH for the same LCP id. This wouldn't fly with the
     * PAUSE LCP protocol handling.
     */
    ndbrequire(rep->blockNo == DBLQH);
    ndbrequire(c_last_id_lcp_complete_rep != rep->lcpId ||
               c_last_id_lcp_complete_rep == RNIL);
    c_last_id_lcp_complete_rep = rep->lcpId;
    if (is_lcp_paused() || c_dequeue_lcp_rep_ongoing)
    {
      jam();
      /**
       * Also the LCP_COMPLETE_REP are queued when we pause the LCP reporting.
       */
      queue_lcp_complete_rep(signal, rep->lcpId);
      return;
    }
    // Broadcast our LQH's completion to all other participating DIHs
    // before falling through to local processing of the same report.
    NodeReceiverGroup rg(DBDIH, c_lcpState.m_participatingDIH);
    rg.m_nodes.clear(getOwnNodeId());
    sendSignal(rg, GSN_LCP_COMPLETE_REP, signal, signal->getLength(), JBB);

    /**
     * and continue processing
     */
  }

  Uint32 lcpId = rep->lcpId;
  Uint32 nodeId = rep->nodeId;
  Uint32 blockNo = rep->blockNo;

  /**
   * We can arrive here in the following cases:
   * 1) blockNo == DBLQH and signal->length() == SignalLength
   *
   * This is a normal message from a node in the m_participatingLQH
   * bitmap. It indicates that the node has completed everything of
   * its processing in DBLQH, both sending all LCP_FRAG_REP and
   * handling the UNDO log. The sender must be in the set of
   * c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH waited for.
   *
   * There is an exception for this during master takeover, another node
   * might send LCP_COMPLETE_REP after receiving MASTER_LCPREQ and finalising
   * its part of the master takeover protocol. This signal might arrive
   * before we have completed the master takeover protocol. In this case
   * the signal must be delayed until the master takeover handling is
   * completed. One reason for this is that we haven't finalised setting
   * up the master bitmaps yet.
   *
   * We know in this case that the node is alive by assumption that
   * we don't receive messages from dead nodes.
   *
   * 2) blockNo == DBLQH and signal->length() == SignalLengthTQ and
   *    rep->fromTQ == 0
   *
   * This signal is sent from NODE_FAILREP. It should be allowed to
   * pass through although the node is already declared dead and
   * no longer part of the m_participatingLQH set. It is a vital part
   * of the node failure handling. It should also not be blocked by
   * an early starting master takeover. It should however be dropped
   * if it isn't part of the set waited for (can happen if 3) arrives
   * after NODE_FAILREP but before this signal).
   *
   * This signal cannot be delayed by a master takeover. We know that
   * the master takeover state should not be possible to go beyond
   * LMTOS_INITIAL.
   *
   * 3) blockNo == DBLQH and signal->length() == SignalLengthTQ and
   *    rep->fromTQ == 1
   *
   * This signal is sent as a delayed signal when signal 1) above is
   * received in the middle of processing a master take over.
   * If it is received when the node is already dead (removed from
   * the m_participatingLQH set), then we should simply ignore it
   * and drop the signal since the node failure handling already
   * has handled it. We find this out by checking if the node is
   * part of the c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH set or
   * not.
   *
   * This signal can be delayed by a master takeover if it is not
   * to be dropped.
   *
   * 4) blockNo == DBDIH and signal->length() == SignalLength
   *
   * This is a normal signal sent from one of the nodes when it has
   * received LCP_COMPLETE_REP from all participating LQHs. It is
   * received from a node in the set of
   * c_lcpState.m_LCP_COMPLETE_REP_DIH_Counter. This set ensures that we
   * only receive one of these. We should never receive this signal if
   * the node isn't in the above set. The duplication of this signal
   * happens as part of executing NODE_FAILREP, but here we set
   * signal->length() to SignalLengthTQ and fromTQ = 0, so only that
   * signal can be arriving with the node not being part of this set.
   * The sending node can both be an alive node and a starting node
   * which hasn't been set to alive yet.
   *
   * The same principle applies as in 1) here, the signal could arrive
   * during master takeover when we haven't yet formed the correct
   * c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH set. In this case we need
   * to delay the signal until the master takeover is completed.
   *
   * 5) blockNo == DBDIH and signal->length() == SignalLengthTQ and
   *    rep->fromTQ == 0
   *
   * This is sent from node failure processing when the node has died.
   * The same logic as in 6) applies, the signal can be dropped if the
   * node isn't part of the c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH set.
   * Otherwise it should be allowed to pass through.
   *
   * This signal cannot be delayed by the master takeover.
   *
   * 6) blockNo == DBDIH and signal->length() == SignalLengthTQ and
   *    rep->fromTQ == 1
   *
   * This is a signal sent as delayed after receiving 4) above in a master
   * takeover situation, if it arrives when the node is no
   * longer part of the c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH set,
   * then we know that the signal is a duplicate and has already been
   * processed and we can safely ignore it.
   *
   * This signal can be delayed by a master takeover if it is not
   * to be dropped.
   *
   * 7) blockNo == 0 and signal->length() == SignalLength
   * This is a signal from the master indicating that the LCP is completely
   * done. It should not be possible to receive it during a master takeover
   * and thus should never be allowed to be delayed since if the master
   * takeover is being processed, then this signal cannot arrive from the
   * dead master and it is too early to receive it from the new master.
   */

  if (blockNo == DBLQH &&
      signal->length() == LcpCompleteRep::SignalLengthTQ &&
      rep->fromTQ == Uint32(0))
  {
    /* Handle case 2) above */
    ndbrequire(c_lcpMasterTakeOverState.state <= LMTOS_INITIAL);
    if (!c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId))
    {
      jam();
      // Already handled by an earlier case-3) delayed signal: drop.
      return;
    }
    jam();
  }
  else if (blockNo == DBDIH &&
           signal->length() == LcpCompleteRep::SignalLengthTQ &&
           rep->fromTQ == Uint32(0))
  {
    /* Handle case 5) above */
    ndbrequire(c_lcpMasterTakeOverState.state <= LMTOS_INITIAL);
    if (!c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(nodeId))
    {
      jam();
      // Duplicate already processed: drop.
      return;
    }
    jam();
  }
  else if (blockNo == 0)
  {
    /* Handle case 7) above) */
    jam();
    ndbrequire(signal->length() == LcpCompleteRep::SignalLength);

    /**
     * During master takeover, some participant nodes could have been
     * in IDLE state since they have already completed the lcpId under
     * the old master before it failed. However, they may receive
     * LCP_COMPLETE_REP again for the same lcpId from the new master.
     */
    if (c_lcpState.lcpStatus == LCP_STATUS_IDLE &&
        c_lcpState.already_completed_lcp(lcpId, nodeId))
    {
      return;
    }

    /**
     * Always allowed free pass through for signals from master that LCP is
     * completed.
     * These signals should not be blocked by master takeover since the
     * master is the last node to complete master takeover and the master
     * is sending this signal.
     */
  }
  else
  {
    /* Handle case 1), case 3), case 4) and case 6) above */
    jam();
    ndbrequire(blockNo == DBDIH || blockNo == DBLQH);
    if(c_lcpMasterTakeOverState.state > LMTOS_WAIT_LCP_FRAG_REP)
    {
      jam();
      /**
       * Don't allow LCP_COMPLETE_REP to arrive during
       * LCP master take over. We haven't yet formed the set of
       * expected signals and we don't want the master state to go to
       * completed while we are forming the state.
       *
       * We keep this even when removing the need to use the EMPTY_LCP_REQ
       * protocol. The reason is that we don't want to handle code to
       * process LCP completion as part of master take over as a
       * simplification. It is perfectly doable but we opted for keeping
       * this variant.
       */
      ndbrequire(isMaster());
      // Re-send to ourselves with fromTQ = 1 so the delayed copy is
      // recognised as case 3) or 6) when it arrives.
      rep->fromTQ = Uint32(1);
      sendSignalWithDelay(reference(), GSN_LCP_COMPLETE_REP, signal, 100,
                          LcpCompleteRep::SignalLengthTQ);
      return;
    }
    /**
     * We are not in a master takeover situation, so we should have the
     * signal expected by the sets, however this could have been handled
     * by the signal sent from NODE_FAILREP already. So we need to verify
     * we really are in those sets. Not being in those states when a master
     * takeover isn't ongoing should only happen for delayed signals.
     */
    if (blockNo == DBLQH &&
        !c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.isWaitingFor(nodeId))
    {
      /* Can happen in case 3) above */
      jam();
      ndbrequire(signal->length() == LcpCompleteRep::SignalLengthTQ &&
                 rep->fromTQ == Uint32(1));
      return;
    }
    if (blockNo == DBDIH &&
        !c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.isWaitingFor(nodeId))
    {
      /* Can happen in case 6) above */
      jam();
      ndbrequire(signal->length() == LcpCompleteRep::SignalLengthTQ &&
                 rep->fromTQ == Uint32(1));
      return;
    }
  }

  // All filtering passed: an LCP must actually be in progress here.
  ndbrequire(c_lcpState.lcpStatus != LCP_STATUS_IDLE);

  // Record the completion according to which block/sender reported it.
  switch(blockNo){
  case DBLQH:
    jam();
    c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.clearWaitingFor(nodeId);
    // A node reporting LQH completion must not still owe a LAST LCP_FRAG_ORD.
    ndbrequire(!c_lcpState.m_LAST_LCP_FRAG_ORD.isWaitingFor(nodeId));
    DEB_LCP_COMP(("LCP_COMPLETE_REP(LQH)(%u), LCP: %u",
                   nodeId,
                   lcpId));
    break;
  case DBDIH:
    jam();
    // Only the master collects per-DIH completion reports.
    ndbrequire(isMaster());
    c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.clearWaitingFor(nodeId);
    DEB_LCP_COMP(("LCP_COMPLETE_REP(DIH)(%u), LCP: %u",
                   nodeId,
                   lcpId));
    break;
  case 0:
    jam();
    // blockNo == 0 marks the master's final "LCP done" broadcast;
    // the master itself never receives it, and only once per LCP.
    ndbrequire(!isMaster());
    ndbrequire(c_lcpState.m_LCP_COMPLETE_REP_From_Master_Received == false);
    c_lcpState.m_LCP_COMPLETE_REP_From_Master_Received = true;
    DEB_LCP_COMP(("LCP_COMPLETE_REP(0)(%u), LCP: %u",
                   nodeId,
                   lcpId));
    break;
  default:
    ndbabort();
  }
  // Every accepted report must refer to the LCP recorded in the system file.
  ndbrequire(lcpId == SYSFILE->latestLCP_ID);

  allNodesLcpCompletedLab(signal);
  return;
}
22947 
/**
 * Check whether the current LCP is fully completed and, if so, finish it.
 *
 * Called after every LCP_COMPLETE_REP (and from other completion paths).
 * Returns early unless ALL of the following hold:
 *  - our own table writes are done (lcpStatus == LCP_TAB_SAVED),
 *  - all participating LQHs have reported completion,
 *  - all participating DIHs have reported completion,
 *  - (non-master only) the master's final LCP_COMPLETE_REP was received,
 *  - no LCP master takeover is in progress.
 * When everything is done: the master completes any take-overs waiting
 * for this LCP and broadcasts the final LCP_COMPLETE_REP (blockNo = 0),
 * LCP state is reset to idle, and the next LCP check is scheduled.
 */
void Dbdih::allNodesLcpCompletedLab(Signal* signal)
{
  jam();

  if (c_lcpState.lcpStatus != LCP_TAB_SAVED) {
    jam();
    DEB_LCP_COMP(("LCP_COMPLETE_REQ not complete, LCP_TAB_SAVED"));
    /**
     * We have not sent LCP_COMPLETE_REP to master DIH yet
     */
    return;
  }//if

  if (!c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.done()){
    jam();
    DEB_LCP_COMP(("LCP_COMPLETE_REQ not complete, LQH not done"));
    return;
  }

  if (!c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.done()){
    jam();
    DEB_LCP_COMP(("LCP_COMPLETE_REQ not complete, DIH not done"));
    return;
  }

  if (!isMaster() &&
      c_lcpState.m_LCP_COMPLETE_REP_From_Master_Received == false){
    jam();
    DEB_LCP_COMP(("LCP_COMPLETE_REQ not complete, Master not done"));
    /**
     * Wait until master DIH has signalled lcp is complete
     */
    return;
  }

  if(c_lcpMasterTakeOverState.state != LMTOS_IDLE){
    jam();
#ifdef VM_TRACE
    g_eventLogger->info("Exiting from allNodesLcpCompletedLab");
#endif
    return;
  }

  /*------------------------------------------------------------------------ */
  /*     WE HAVE NOW COMPLETED A LOCAL CHECKPOINT. WE ARE NOW READY TO WAIT  */
  /*     FOR THE NEXT LOCAL CHECKPOINT. SEND WITHOUT TIME-OUT SINCE IT MIGHT */
  /*     BE TIME TO START THE NEXT LOCAL CHECKPOINT IMMEDIATELY.             */
  /*     CLEAR BIT 3 OF SYSTEM RESTART BITS TO INDICATE THAT THERE IS NO     */
  /*     LOCAL CHECKPOINT ONGOING. THIS WILL BE WRITTEN AT SOME LATER TIME   */
  /*     DURING A GLOBAL CHECKPOINT. IT IS NOT NECESSARY TO WRITE IT         */
  /*     IMMEDIATELY. WE WILL ALSO CLEAR BIT 2 OF SYSTEM RESTART BITS IF ALL */
  /*     CURRENTLY ACTIVE NODES COMPLETED THE LOCAL CHECKPOINT.              */
  /*------------------------------------------------------------------------ */
  CRASH_INSERTION(7019);
  signal->setTrace(0);

  /* Check pause states */
  check_pause_state_lcp_idle();
  c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
  c_increase_lcp_speed_after_nf = false;

  DEB_LCP_COMP(("LCP all completed"));
  /**
   * Update m_local_lcp_state
   */
  m_local_lcp_state.lcp_complete_rep(c_newest_restorable_gci);

  if (isMaster())
  {
    /**
     * Check for any "completed" TO
     */
    TakeOverRecordPtr takeOverPtr;
    for (c_masterActiveTakeOverList.first(takeOverPtr); !takeOverPtr.isNull();)
    {
      jam();

      // move to next, since takeOverPtr might be release below
      TakeOverRecordPtr nextPtr = takeOverPtr;
      c_masterActiveTakeOverList.next(nextPtr);

      Ptr<NodeRecord> nodePtr;
      nodePtr.i = takeOverPtr.p->toStartingNode;
      if (takeOverPtr.p->toMasterStatus == TakeOverRecord::TO_WAIT_LCP)
      {
        jam();
        // The take-over was waiting for an LCP; it is only satisfied if
        // the starting node actually participated in this LCP.
        if (c_lcpState.m_participatingLQH.get(nodePtr.i))
        {
          jam();
          ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
          ndbrequire(nodePtr.p->copyCompleted == 2);

          /**
           * We have completed the node restart for this node. We set the
           * node recovery status to completed. This is used also in
           * estimating times for other nodes to complete their restarts.
           * It is also used to build NDBINFO table about node restart
           * status.
           *
           * This code is only executed in master node.
           */
          setNodeRecoveryStatus(nodePtr.i, NodeRecord::WAIT_SUMA_HANDOVER);

          // Confirm end of take-over to the requester.
          EndToConf * conf = (EndToConf *)signal->getDataPtrSend();
          conf->senderData = takeOverPtr.p->m_senderData;
          conf->sendingNodeId = cownNodeId;
          conf->startingNodeId = nodePtr.i;
          sendSignal(takeOverPtr.p->m_senderRef, GSN_END_TOCONF, signal,
                     EndToConf::SignalLength, JBB);

          releaseTakeOver(takeOverPtr, true);
        }
      }

      takeOverPtr = nextPtr;
    }
    /**
     * We send the LCP_COMPLETE_REP from the master node to all nodes
     * that participated in the LCP in DIH, we could have alive nodes
     * here that didn't participate in the LCP because they became
     * alive so recently that they didn't need to participate in the
     * LCP since it was already closing when they entered through the
     * PAUSE LCP protocol. Sending to those nodes is not a good idea
     * since they are not at all set up to receive a LCP_COMPLETE_REP
     * message.
     */
    LcpCompleteRep * rep = (LcpCompleteRep*)signal->getDataPtrSend();
    rep->nodeId = getOwnNodeId();
    rep->lcpId = SYSFILE->latestLCP_ID;
    rep->blockNo = 0; // 0 = Sent from master
    NodeReceiverGroup rg(DBDIH, c_lcpState.m_participatingDIH);
    rg.m_nodes.clear(getOwnNodeId());
    sendSignal(rg, GSN_LCP_COMPLETE_REP, signal,
               LcpCompleteRep::SignalLength, JBB);

    jam();
  }

  Sysfile::clearLCPOngoing(SYSFILE->systemRestartBits);
  setLcpActiveStatusEnd(signal);

  /**
   * We calculate LCP time also in non-master although it's only used by
   * master nodes. The idea is to have an estimate of LCP execution time
   * already when the master node is running it's first LCP.
   */
  c_lcpState.m_lcp_time =
    NdbTick_Elapsed(c_lcpState.m_start_time, c_current_time).milliSec();

  if(!isMaster()){
    jam();
    /**
     * We're not master, be content
     */
    return;
  }

  /***************************************************************************/
  // Report the event that a local checkpoint has completed.
  /***************************************************************************/
  signal->theData[0] = NDB_LE_LocalCheckpointCompleted; //Event type
  signal->theData[1] = SYSFILE->latestLCP_ID;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB);

  // Advance the restorable-GCI watermark (ERROR_INSERT 7222 suppresses
  // this for testing).
  if (c_newest_restorable_gci > c_lcpState.lcpStopGcp &&
      !ERROR_INSERTED(7222))
  {
    jam();
    c_lcpState.lcpStopGcp = c_newest_restorable_gci;
  }

  /**
   * Start checking for next LCP
   */
  checkLcpStart(signal, __LINE__, 0);

  ndbassert(check_pause_state_sanity());
  // If this LCP ran without PAUSE support we held the fragment info
  // mutex for its duration; release it now.
  if (!c_lcp_runs_with_pause_support)
  {
    jam();
    Mutex mutex(signal, c_mutexMgr, c_fragmentInfoMutex_lcp);
    mutex.unlock();
  }

  c_lcp_runs_with_pause_support = false;
  ndbassert(check_pause_state_sanity());
  c_current_time = NdbTick_getCurrentTicks();

  // During system restart we may have been waiting for an LCP to make
  // the database recoverable; report start phase completion now.
  if (cwaitLcpSr == true) {
    jam();

    infoEvent("Make On-line Database recoverable by waiting for LCP"
              " Completed, LCP id = %u",
              SYSFILE->latestLCP_ID);

    cwaitLcpSr = false;
    ndbsttorry10Lab(signal, __LINE__);
    return;
  }//if
  return;
}//Dbdih::allNodesLcpCompletedLab()
23149 
23150 /******************************************************************************/
23151 /* **********     TABLE UPDATE MODULE                             *************/
23152 /* ****************************************************************************/
23153 /* ------------------------------------------------------------------------- */
23154 /*       THIS MODULE IS USED TO UPDATE THE TABLE DESCRIPTION. IT STARTS BY   */
23155 /*       CREATING THE FIRST TABLE FILE, THEN UPDATES THIS FILE AND CLOSES IT.*/
23156 /*       AFTER THAT THE SAME HAPPENS WITH THE SECOND FILE. AFTER THAT THE    */
23157 /*       TABLE DISTRIBUTION HAS BEEN UPDATED.                                */
23158 /*                                                                           */
23159 /*       THE REASON FOR CREATING THE FILE AND NOT OPENING IT IS TO ENSURE    */
23160 /*       THAT WE DO NOT GET A MIX OF OLD AND NEW INFORMATION IN THE FILE IN  */
23161 /*       ERROR SITUATIONS.                                                   */
23162 /* ------------------------------------------------------------------------- */
tableUpdateLab(Signal * signal,TabRecordPtr tabPtr)23163 void Dbdih::tableUpdateLab(Signal* signal, TabRecordPtr tabPtr) {
23164   FileRecordPtr filePtr;
23165   if (tabPtr.p->tabStorage == TabRecord::ST_TEMPORARY)
23166   {
23167     // For temporary tables we do not write to disk. Mark both copies 0 and 1
23168     // as done, and go straight to the after-close code.
23169     filePtr.i = tabPtr.p->tabFile[1];
23170     ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
23171     tableCloseLab(signal, filePtr);
23172     return;
23173   }
23174   filePtr.i = tabPtr.p->tabFile[0];
23175   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
23176   createFileRw(signal, filePtr);
23177   filePtr.p->reqStatus = FileRecord::TABLE_CREATE;
23178   return;
23179 }//Dbdih::tableUpdateLab()
23180 
tableCreateLab(Signal * signal,FileRecordPtr filePtr)23181 void Dbdih::tableCreateLab(Signal* signal, FileRecordPtr filePtr)
23182 {
23183   TabRecordPtr tabPtr;
23184   tabPtr.i = filePtr.p->tabRef;
23185   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
23186   writeTabfile(signal, tabPtr.p, filePtr);
23187   filePtr.p->reqStatus = FileRecord::TABLE_WRITE;
23188   return;
23189 }//Dbdih::tableCreateLab()
23190 
tableWriteLab(Signal * signal,FileRecordPtr filePtr)23191 void Dbdih::tableWriteLab(Signal* signal, FileRecordPtr filePtr)
23192 {
23193   closeFile(signal, filePtr);
23194   filePtr.p->reqStatus = FileRecord::TABLE_CLOSE;
23195   return;
23196 }//Dbdih::tableWriteLab()
23197 
/**
 * Continuation after a table description file has been closed.
 *
 * If the closed file was copy 0, repeat the create/write/close cycle
 * for copy 1. Once both copies are on disk, dispatch on
 * tabUpdateState to resume whichever operation triggered the table
 * update (LCP, node removal, LCP invalidation, copy-table, add-table,
 * or a generic callback).
 */
void Dbdih::tableCloseLab(Signal* signal, FileRecordPtr filePtr)
{
  TabRecordPtr tabPtr;
  tabPtr.i = filePtr.p->tabRef;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
  if (filePtr.i == tabPtr.p->tabFile[0]) {
    jam();
    // Copy 0 is done; start the same create/write/close cycle on copy 1.
    filePtr.i = tabPtr.p->tabFile[1];
    ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
    createFileRw(signal, filePtr);
    filePtr.p->reqStatus = FileRecord::TABLE_CREATE;
    return;
  }//if
  // Both file copies are written: resume the operation that requested
  // the table update.
  switch (tabPtr.p->tabUpdateState) {
  case TabRecord::US_LOCAL_CHECKPOINT:
    jam();
    releaseTabPages(tabPtr.i);

    tabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
    tabPtr.p->tabUpdateState = TabRecord::US_IDLE;
    tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED;

    /* Check whether there's some queued table definition flush op to start */
    if (c_lcpTabDefWritesControl.releaseMustStartQueued())
    {
      jam();
      /* Some table write is queued - let's kick it off */
      /* First find it...
       *   By using the tabUpdateState to 'queue' operations, we lose
       *   the original flush request order, which shouldn't matter.
       *   In any case, the checkpoint proceeds by table id, as does this
       *   search, so a similar order should result
       */
      TabRecordPtr tabPtr;
      for (tabPtr.i = 0; tabPtr.i < ctabFileSize; tabPtr.i++)
      {
        ptrAss(tabPtr, tabRecord);
        if (tabPtr.p->tabUpdateState == TabRecord::US_LOCAL_CHECKPOINT_QUEUED)
        {
          jam();
          //ndbout_c("DIH : Starting queued table def flush op on table %u", tabPtr.i);
          tabPtr.p->tabUpdateState = TabRecord::US_LOCAL_CHECKPOINT;
          signal->theData[0] = DihContinueB::ZPACK_TABLE_INTO_PAGES;
          signal->theData[1] = tabPtr.i;
          sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
          return;
        }
      }
      /* No queued table write found - error */
      g_eventLogger->warning("DIH : Error in queued table writes : inUse %u"
                             " queued %u total %u",
                             c_lcpTabDefWritesControl.inUse,
                             c_lcpTabDefWritesControl.queuedRequests,
                             c_lcpTabDefWritesControl.totalResources);
      ndbabort();
    }
    jam();
    // No queued table writes: check whether the whole LCP is now complete.
    signal->theData[0] = DihContinueB::ZCHECK_LCP_COMPLETED;
    sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);

    return;
    break;
  case TabRecord::US_REMOVE_NODE:
    jam();
    releaseTabPages(tabPtr.i);
    tabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
    tabPtr.p->tabUpdateState = TabRecord::US_IDLE;
    if (tabPtr.p->tabLcpStatus == TabRecord::TLS_WRITING_TO_FILE) {
      jam();
      tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED;
      signal->theData[0] = DihContinueB::ZCHECK_LCP_COMPLETED;
      sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
    }//if
    // Continue removing the node from the next table (tabPtr.i + 1).
    signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
    signal->theData[1] = tabPtr.p->tabRemoveNode;
    signal->theData[2] = tabPtr.i + 1;
    // ERROR_INSERT 7233 slows down the per-table iteration for testing.
    if (!ERROR_INSERTED(7233))
      sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
    else
      sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 300, 3);
    return;
    break;
  case TabRecord::US_INVALIDATE_NODE_LCP:
    jam();
    releaseTabPages(tabPtr.i);
    tabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
    tabPtr.p->tabUpdateState = TabRecord::US_IDLE;

    // Continue invalidating the node's LCP info for the next table.
    signal->theData[0] = DihContinueB::ZINVALIDATE_NODE_LCP;
    signal->theData[1] = tabPtr.p->tabRemoveNode;
    signal->theData[2] = tabPtr.i + 1;

    handle_send_continueb_invalidate_node_lcp(signal);
    return;
  case TabRecord::US_COPY_TAB_REQ:
    jam();
    tabPtr.p->tabUpdateState = TabRecord::US_IDLE;
    copyTabReq_complete(signal, tabPtr);
    return;
    break;
  case TabRecord::US_ADD_TABLE_MASTER:
    jam();
    releaseTabPages(tabPtr.i);
    tabPtr.p->tabUpdateState = TabRecord::US_IDLE;
    signal->theData[0] = DihContinueB::ZDIH_ADD_TABLE_MASTER;
    signal->theData[1] = tabPtr.i;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    return;
    break;
  case TabRecord::US_ADD_TABLE_SLAVE:
    jam();
    releaseTabPages(tabPtr.i);
    tabPtr.p->tabUpdateState = TabRecord::US_IDLE;
    signal->theData[0] = DihContinueB::ZDIH_ADD_TABLE_SLAVE;
    signal->theData[1] = tabPtr.i;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    return;
    break;
  case TabRecord::US_CALLBACK:
  {
    jam();
    releaseTabPages(tabPtr.i);
    tabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
    tabPtr.p->tabUpdateState = TabRecord::US_IDLE;

    // Generic continuation: invoke the callback stored on the
    // connect record associated with this table.
    Ptr<ConnectRecord> connectPtr;
    connectPtr.i = tabPtr.p->connectrec;
    ptrCheckGuard(connectPtr, cconnectFileSize, connectRecord);
    execute(signal, connectPtr.p->m_callback, 0);
    return;
  }
  default:
    ndbabort();
    return;
  }//switch
}//Dbdih::tableCloseLab()
23334 
checkGcpStopLab(Signal * signal)23335 void Dbdih::checkGcpStopLab(Signal* signal)
23336 {
23337   static const Uint32 GCPCheckPeriodMillis = 100;
23338 
23339   // Calculate real time elapsed since last check
23340   const NDB_TICKS now = NdbTick_getCurrentTicks();
23341   const NDB_TICKS last = m_gcp_monitor.m_last_check;
23342   m_gcp_monitor.m_last_check = now;
23343 
23344   /**
23345    * Avoid false GCP failures if timers misbehaves,
23346    * (timer is non-monotonic, or OS/VM bugs which there are some of)
23347    * or we have scheduler problems due to being CPU starved:
23348    *
23349    * - If we overslept 'GCPCheckPeriodMillis', (CPU starved?) or
23350    *   timer leapt forward for other reasons (Adjusted, or OS-bug)
23351    *   we never calculate an elapsed periode of more than
23352    *   the requested sleep 'GCPCheckPeriodMillis'
23353    * - Else we add the real measured elapsed time to total.
23354    *   (Timers may fire prior to requested 'GCPCheckPeriodMillis')
23355    *
23356    * Note: If timer for some reason ticked backwards such that
23357    *       'now < last', NdbTick_Elapsed() will return '0' such
23358    *       that this is 'absorbed'
23359    */
23360   Uint32 elapsed_ms = (Uint32)NdbTick_Elapsed(last,now).milliSec();
23361   if (elapsed_ms > GCPCheckPeriodMillis)
23362     elapsed_ms = GCPCheckPeriodMillis;
23363 
23364   const Uint32 lag0 = (m_gcp_monitor.m_gcp_save.m_elapsed_ms  += elapsed_ms);
23365   const Uint32 lag1 = (m_gcp_monitor.m_micro_gcp.m_elapsed_ms += elapsed_ms);
23366 
23367   if (ERROR_INSERTED(7145))
23368   {
23369     static bool done = false;
23370     /*
23371       Recalculate the timeouts the get the low values that the test
23372       needs.  This was initially done at startup, and at that point,
23373       the ERROR_INSERT was not set yet.
23374     */
23375     if (!done)
23376     {
23377       setGCPStopTimeouts(signal);
23378       done = true;
23379     }
23380   }
23381 
23382   if (m_gcp_monitor.m_gcp_save.m_gci == m_gcp_save.m_gci)
23383   {
23384     jam();
23385     if (m_gcp_monitor.m_gcp_save.m_max_lag_ms &&
23386         lag0 >= m_gcp_monitor.m_gcp_save.m_max_lag_ms)
23387     {
23388       crashSystemAtGcpStop(signal, false);
23389       /* Continue monitoring */
23390     }
23391 
23392     /**
23393      * Will report a warning every time lag crosses
23394      * a multiple of 'report_period_ms'
23395      */
23396     const Uint32 report_period_ms = 60*1000; // 60 seconds
23397     if (lag0 > 0 && (lag0 % report_period_ms) < elapsed_ms)
23398     {
23399       if (m_gcp_monitor.m_gcp_save.m_max_lag_ms)
23400       {
23401         warningEvent("GCP Monitor: GCP_SAVE lag %u sec"
23402                      " (max lag: %us), %s",
23403                      lag0/1000, m_gcp_monitor.m_gcp_save.m_max_lag_ms/1000,
23404                      c_GCP_SAVEREQ_Counter.getText());
23405       }
23406       else
23407       {
23408         warningEvent("GCP Monitor: GCP_SAVE lag %u sec"
23409                      " (no max lag), %s",
23410                      lag0/1000, c_GCP_SAVEREQ_Counter.getText());
23411       }
23412     }
23413   }
23414   else
23415   {
23416     jam();
23417     m_gcp_monitor.m_gcp_save.m_gci = m_gcp_save.m_gci;
23418     m_gcp_monitor.m_gcp_save.m_elapsed_ms = 0;
23419 
23420     /**
23421      * Recalculate gcp_save.m_max_lag.
23422      * Since the maxima for gcp_save and micro_gcp are calculated
23423      * separately at protocol basis, the normal constraint that
23424      * GCP_SAVE max is > GCP_COMMIT max, may not hold in cases where
23425      * GCP_SAVE is stalled.
23426     */
23427     if (m_gcp_monitor.m_gcp_save.m_need_max_lag_recalc)
23428     {
23429       setGCPStopTimeouts(signal, true, false); // true: for gcp_save
23430       m_gcp_monitor.m_gcp_save.m_need_max_lag_recalc = false;
23431     }
23432   }
23433 
23434   if (m_gcp_monitor.m_micro_gcp.m_gci == m_micro_gcp.m_current_gci)
23435   {
23436     jam();
23437     const Uint32 cmp = m_micro_gcp.m_enabled ?
23438       m_gcp_monitor.m_micro_gcp.m_max_lag_ms :
23439       m_gcp_monitor.m_gcp_save.m_max_lag_ms;
23440 
23441     if (cmp && lag1 >= cmp)
23442     {
23443       crashSystemAtGcpStop(signal, false);
23444       /* Continue monitoring */
23445     }
23446 
23447     /**
23448      * Will report a warning every time lag crosses
23449      * a multiple of 'report_period_ms'
23450      */
23451     const Uint32 report_period_ms = 10*1000; // 10 seconds
23452     if (lag1 > 0 && (lag1 % report_period_ms) < elapsed_ms)
23453     {
23454       if (m_gcp_monitor.m_micro_gcp.m_max_lag_ms)
23455       {
23456         warningEvent("GCP Monitor: GCP_COMMIT lag %u sec"
23457                      " (max lag: %us), %s",
23458                      lag1/1000, m_gcp_monitor.m_micro_gcp.m_max_lag_ms/1000,
23459                      c_GCP_COMMIT_Counter.getText());
23460       }
23461       else
23462       {
23463         warningEvent("GCP Monitor: GCP_COMMIT lag %u sec"
23464                      " (no max lag), %s",
23465                      lag1/1000, c_GCP_COMMIT_Counter.getText());
23466       }
23467     }
23468   }
23469   else
23470   {
23471     jam();
23472     m_gcp_monitor.m_micro_gcp.m_elapsed_ms = 0;
23473     m_gcp_monitor.m_micro_gcp.m_gci = m_micro_gcp.m_current_gci;
23474 
23475     /**
23476      * Recalculate micro_gcp.m_max_lag.
23477      * Since the maxima for gcp_save and micro_gcp are calculated
23478      * separately at protocol basis, the normal constraint that
23479      * GCP_SAVE max is > GCP_COMMIT max, may not hold in cases where
23480      * GCP_SAVE is stalled.
23481     */
23482     if (m_gcp_monitor.m_micro_gcp.m_need_max_lag_recalc)
23483     {
23484       setGCPStopTimeouts(signal, false); // set_micro_gcp_max_lag is true by default
23485       m_gcp_monitor.m_micro_gcp.m_need_max_lag_recalc = false;
23486     }
23487   }
23488 
23489   signal->theData[0] = DihContinueB::ZCHECK_GCP_STOP;
23490   sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
23491                       GCPCheckPeriodMillis, 1);
23492   return;
23493 }//Dbdih::checkGcpStopLab()
23494 
/**
 * Dump the complete GCP-protocol state of this node to the node log.
 * Called when a GCP stop is detected (see crashSystemAtGcpStop()) so the
 * log contains the monitor lags, protocol states and all outstanding
 * signal counters needed for post-mortem analysis.
 */
void
Dbdih::dumpGcpStop()
{
  /* Start-permission blocking state: a starting node may block GCP. */
  ndbout_c("c_nodeStartMaster.blockGcp: %u %u",
           c_nodeStartMaster.blockGcp,
           c_nodeStartMaster.startNode);
  /* Monitor lags versus configured maxima for both protocols. */
  ndbout_c("m_gcp_save.m_elapsed: %u(ms) m_gcp_save.m_max_lag: %u(ms)",
           m_gcp_monitor.m_gcp_save.m_elapsed_ms,
           m_gcp_monitor.m_gcp_save.m_max_lag_ms);
  ndbout_c("m_micro_gcp.m_elapsed: %u(ms) m_micro_gcp.m_max_lag: %u(ms)",
           m_gcp_monitor.m_micro_gcp.m_elapsed_ms,
           m_gcp_monitor.m_micro_gcp.m_max_lag_ms);


  /* Local and master-role protocol state machines. */
  ndbout_c("m_gcp_save.m_state: %u", m_gcp_save.m_state);
  ndbout_c("m_gcp_save.m_master.m_state: %u", m_gcp_save.m_master.m_state);
  ndbout_c("m_micro_gcp.m_state: %u", m_micro_gcp.m_state);
  ndbout_c("m_micro_gcp.m_master.m_state: %u", m_micro_gcp.m_master.m_state);

  /* Outstanding-signal counters: which nodes have not yet replied. */
  ndbout_c("c_COPY_GCIREQ_Counter = %s", c_COPY_GCIREQ_Counter.getText());
  ndbout_c("c_COPY_TABREQ_Counter = %s", c_COPY_TABREQ_Counter.getText());
  ndbout_c("c_UPDATE_FRAG_STATEREQ_Counter = %s",
            c_UPDATE_FRAG_STATEREQ_Counter.getText());
  ndbout_c("c_DIH_SWITCH_REPLICA_REQ_Counter = %s",
           c_DIH_SWITCH_REPLICA_REQ_Counter.getText());
  ndbout_c("c_GCP_COMMIT_Counter = %s", c_GCP_COMMIT_Counter.getText());
  ndbout_c("c_GCP_PREPARE_Counter = %s", c_GCP_PREPARE_Counter.getText());
  ndbout_c("c_GCP_SAVEREQ_Counter = %s", c_GCP_SAVEREQ_Counter.getText());
  ndbout_c("c_SUB_GCP_COMPLETE_REP_Counter = %s",
           c_SUB_GCP_COMPLETE_REP_Counter.getText());
  ndbout_c("c_INCL_NODEREQ_Counter = %s", c_INCL_NODEREQ_Counter.getText());
  ndbout_c("c_MASTER_GCPREQ_Counter = %s", c_MASTER_GCPREQ_Counter.getText());
  ndbout_c("c_MASTER_LCPREQ_Counter = %s", c_MASTER_LCPREQ_Counter.getText());
  ndbout_c("c_START_INFOREQ_Counter = %s", c_START_INFOREQ_Counter.getText());
  ndbout_c("c_START_RECREQ_Counter = %s", c_START_RECREQ_Counter.getText());
  ndbout_c("c_STOP_ME_REQ_Counter = %s", c_STOP_ME_REQ_Counter.getText());
  ndbout_c("c_TC_CLOPSIZEREQ_Counter = %s", c_TC_CLOPSIZEREQ_Counter.getText());
  ndbout_c("c_TCGETOPSIZEREQ_Counter = %s", c_TCGETOPSIZEREQ_Counter.getText());

  /* COPY_GCI master-side state. */
  ndbout_c("m_copyReason: %d m_waiting: %u %u",
           c_copyGCIMaster.m_copyReason,
           c_copyGCIMaster.m_waiting[0],
           c_copyGCIMaster.m_waiting[1]);

  /* COPY_GCI slave-side state. */
  ndbout_c("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d",
           c_copyGCISlave.m_senderData,
           c_copyGCISlave.m_senderRef,
           c_copyGCISlave.m_copyReason,
           c_copyGCISlave.m_expectedNextWord);
}
23545 
23546 /**
23547  * GCP stop detected,
23548  * local == true means we must shutdown
23549  * local == false means we (GCP Master) are deciding what to
23550  *  do - may involve requesting shut down of other nodes and/or
23551  *  ourself.
23552  *
23553  * The action to take is generally :
23554  *   1.  Send 'Please log debug info + shutdown' signals to
23555  *       stalled nodes
 *   2.  Send ISOLATE_ORD with delay of X millis to *all*
23557  *       nodes (including self)
23558  *
23559  * Part 1 should result in a clean shutdown with debug
23560  * information and a clear cause
23561  * Part 2 ensures that if part 1 fails (as it might if the
23562  * nodes are 'ill'), the live nodes quickly exclude the
23563  * ill node and get on with their lives.
23564  *
23565  * Part 1 is implemented by various DUMP_STATE_ORD signals
23566  * and SYSTEM_ERROR
23567  * Part 2 is implemented using ISOLATE_ORD.
23568 */
void Dbdih::crashSystemAtGcpStop(Signal* signal, bool local)
{
  /* Always log full state first, so the cause is captured in the node log. */
  dumpGcpStop();
  const Uint32 save_elapsed = m_gcp_monitor.m_gcp_save.m_elapsed_ms;
  const Uint32 micro_elapsed = m_gcp_monitor.m_micro_gcp.m_elapsed_ms;
  /* Reset the monitor counters so we do not re-trigger while acting. */
  m_gcp_monitor.m_gcp_save.m_elapsed_ms = 0;
  m_gcp_monitor.m_micro_gcp.m_elapsed_ms = 0;

  /* Delay before live nodes forcibly exclude the victim(s) (ISOLATE_ORD). */
  const Uint32 NodeIsolationTimeoutMillis = 100;

  if (local)
    goto dolocal;

  if (c_nodeStartMaster.blockGcp == 2)
  {
    jam();
    /**
     * Starting node...is delaying GCP too long...
     *   kill it
     */
    SystemError * const sysErr = (SystemError*)&signal->theData[0];
    sysErr->errorCode = SystemError::GCPStopDetected;
    sysErr->errorRef = reference();
    sysErr->data[0] = m_gcp_save.m_master.m_state;
    sysErr->data[1] = cgcpOrderBlocked;
    sysErr->data[2] = m_micro_gcp.m_master.m_state;
    sendSignal(calcNdbCntrBlockRef(c_nodeStartMaster.startNode),
               GSN_SYSTEM_ERROR, signal, SystemError::SignalLength, JBA);

    {
      /* Isolate, just in case */
      NdbNodeBitmask victims;
      victims.set(c_nodeStartMaster.startNode);

      isolateNodes(signal,
                   NodeIsolationTimeoutMillis,
                   victims);
    }
    return;
  }

  /**
   * GCP_SAVE protocol is the stalled one.
   * NOTE(review): if m_max_lag_ms is 0 ("no max lag") this comparison is
   * trivially true; presumably callers only reach here after a real lag
   * was detected — confirm against checkGcpStopLab().
   */
  if (save_elapsed >= m_gcp_monitor.m_gcp_save.m_max_lag_ms)
  {
    switch(m_gcp_save.m_master.m_state){
    case GcpSave::GCP_SAVE_IDLE:
    {
      jam();
      /**
       * No switch for looong time...and we're idle...it's *our* fault
       */
      /* Ask others to isolate me, just in case */
      {
        NdbNodeBitmask victims;
        victims.set(cownNodeId);

        isolateNodes(signal,
                     NodeIsolationTimeoutMillis,
                     victims);
      }
      local = true;
      break;
    }
    case GcpSave::GCP_SAVE_REQ:
    {
      jam();
      /* Waiting for GCP_SAVECONF from LQH on the listed nodes:
       * dump their state and kill them. */
      NodeReceiverGroup rg(DBLQH, c_GCP_SAVEREQ_Counter);
      signal->theData[0] = 2305;
      sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBB);

      isolateNodes(signal,
                   NodeIsolationTimeoutMillis,
                   c_GCP_SAVEREQ_Counter.getNodeBitmask());

      warningEvent("Detected GCP stop(%d)...sending kill to %s",
                m_gcp_save.m_master.m_state, c_GCP_SAVEREQ_Counter.getText());
      ndbout_c("Detected GCP stop(%d)...sending kill to %s",
               m_gcp_save.m_master.m_state, c_GCP_SAVEREQ_Counter.getText());
      ndbrequire(!c_GCP_SAVEREQ_Counter.done());
      return;
    }
    case GcpSave::GCP_SAVE_COPY_GCI:
    {
      /**
       * We're waiting for a COPY_GCICONF
       */
      jam();
      warningEvent("Detected GCP stop(%d)...sending kill to %s",
                m_gcp_save.m_master.m_state, c_COPY_GCIREQ_Counter.getText());
      ndbout_c("Detected GCP stop(%d)...sending kill to %s",
               m_gcp_save.m_master.m_state, c_COPY_GCIREQ_Counter.getText());

      {
        /* Ask the stalled DIHs to dump their COPY_GCI state. */
        NodeReceiverGroup rg(DBDIH, c_COPY_GCIREQ_Counter);
        signal->theData[0] = 7022;
        sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
      }

      {
        /* Then order a clean shutdown of those nodes via NDBCNTR. */
        NodeReceiverGroup rg(NDBCNTR, c_COPY_GCIREQ_Counter);
        SystemError * const sysErr = (SystemError*)&signal->theData[0];
        sysErr->errorCode = SystemError::GCPStopDetected;
        sysErr->errorRef = reference();
        sysErr->data[0] = m_gcp_save.m_master.m_state;
        sysErr->data[1] = cgcpOrderBlocked;
        sysErr->data[2] = m_micro_gcp.m_master.m_state;
        sendSignal(rg, GSN_SYSTEM_ERROR, signal,
                   SystemError::SignalLength, JBA);
      }

      /* Backstop: isolate them even if the shutdown request is ignored. */
      isolateNodes(signal,
                   NodeIsolationTimeoutMillis,
                   c_COPY_GCIREQ_Counter.getNodeBitmask());

      ndbrequire(!c_COPY_GCIREQ_Counter.done());
      return;
    }
    case GcpSave::GCP_SAVE_CONF:
      /**
       * This *should* not happen (not a master state)
       */
      local = true;
      break;
    }
  }

  /* micro-GCP (commit) protocol is the stalled one. */
  if (micro_elapsed >= m_gcp_monitor.m_micro_gcp.m_max_lag_ms)
  {
    switch(m_micro_gcp.m_master.m_state)
    {
    case MicroGcp::M_GCP_IDLE:
    {
      jam();
      /**
       * No switch for looong time...and we're idle...it's *our* fault
       */
      /* Ask others to isolate me, just in case */
      {
        NdbNodeBitmask victims;
        victims.set(cownNodeId);

        isolateNodes(signal,
                     NodeIsolationTimeoutMillis,
                     victims);
      }
      local = true;
      break;
    }
    case MicroGcp::M_GCP_PREPARE:
    {
    /**
     * We're waiting for a GCP PREPARE CONF
     */
      jam();
      warningEvent("Detected GCP stop(%d)...sending kill to %s",
                m_micro_gcp.m_state, c_GCP_PREPARE_Counter.getText());
      ndbout_c("Detected GCP stop(%d)...sending kill to %s",
               m_micro_gcp.m_state, c_GCP_PREPARE_Counter.getText());

      {
        /* Dump state in the stalled DIHs... */
        NodeReceiverGroup rg(DBDIH, c_GCP_PREPARE_Counter);
        signal->theData[0] = 7022;
        sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
      }

      {
        /* ...and shut them down. */
        NodeReceiverGroup rg(NDBCNTR, c_GCP_PREPARE_Counter);
        SystemError * const sysErr = (SystemError*)&signal->theData[0];
        sysErr->errorCode = SystemError::GCPStopDetected;
        sysErr->errorRef = reference();
        sysErr->data[0] = m_gcp_save.m_master.m_state;
        sysErr->data[1] = cgcpOrderBlocked;
        sysErr->data[2] = m_micro_gcp.m_master.m_state;
        sendSignal(rg, GSN_SYSTEM_ERROR, signal,
                   SystemError::SignalLength, JBA);
      }

      isolateNodes(signal,
                   NodeIsolationTimeoutMillis,
                   c_GCP_PREPARE_Counter.getNodeBitmask());

      ndbrequire(!c_GCP_PREPARE_Counter.done());
      return;
    }
    case MicroGcp::M_GCP_COMMIT:
    {
      jam();
      /* Waiting for GCP_NODEFINISH from the listed nodes. */
      warningEvent("Detected GCP stop(%d)...sending kill to %s",
                m_micro_gcp.m_state, c_GCP_COMMIT_Counter.getText());
      ndbout_c("Detected GCP stop(%d)...sending kill to %s",
               m_micro_gcp.m_state, c_GCP_COMMIT_Counter.getText());

      {
        NodeReceiverGroup rg(DBDIH, c_GCP_COMMIT_Counter);
        signal->theData[0] = 7022;
        sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
      }

      {
        NodeReceiverGroup rg(NDBCNTR, c_GCP_COMMIT_Counter);
        SystemError * const sysErr = (SystemError*)&signal->theData[0];
        sysErr->errorCode = SystemError::GCPStopDetected;
        sysErr->errorRef = reference();
        sysErr->data[0] = m_gcp_save.m_master.m_state;
        sysErr->data[1] = cgcpOrderBlocked;
        sysErr->data[2] = m_micro_gcp.m_master.m_state;
        sendSignal(rg, GSN_SYSTEM_ERROR, signal,
                   SystemError::SignalLength, JBA);
      }

      isolateNodes(signal,
                   NodeIsolationTimeoutMillis,
                   c_GCP_COMMIT_Counter.getNodeBitmask());

      ndbrequire(!c_GCP_COMMIT_Counter.done());
      return;
    }
    case MicroGcp::M_GCP_COMMITTED:
      /**
       * This *should* not happen (not a master state)
       */
      local = true;
      break;
    case MicroGcp::M_GCP_COMPLETE:
    {
      jam();
      /* Waiting for SUB_GCP_COMPLETE_ACK from the listed nodes. */
      infoEvent("Detected GCP stop(%d)...sending kill to %s",
                m_micro_gcp.m_state, c_SUB_GCP_COMPLETE_REP_Counter.getText());
      ndbout_c("Detected GCP stop(%d)...sending kill to %s",
               m_micro_gcp.m_state, c_SUB_GCP_COMPLETE_REP_Counter.getText());

      {
        NodeReceiverGroup rg(DBDIH, c_SUB_GCP_COMPLETE_REP_Counter);
        signal->theData[0] = 7022;
        sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
      }

      {
        NodeReceiverGroup rg(NDBCNTR, c_SUB_GCP_COMPLETE_REP_Counter);
        SystemError * const sysErr = (SystemError*)&signal->theData[0];
        sysErr->errorCode = SystemError::GCPStopDetected;
        sysErr->errorRef = reference();
        sysErr->data[0] = m_gcp_save.m_master.m_state;
        sysErr->data[1] = cgcpOrderBlocked;
        sysErr->data[2] = m_micro_gcp.m_master.m_state;
        sendSignal(rg, GSN_SYSTEM_ERROR, signal,
                   SystemError::SignalLength, JBA);
      }

      isolateNodes(signal,
                   NodeIsolationTimeoutMillis,
                   c_SUB_GCP_COMPLETE_REP_Counter.getNodeBitmask());

      ndbrequire(!c_SUB_GCP_COMPLETE_REP_Counter.done());
      return;
    }
    }
  }

dolocal:
  /* Local shutdown path: dump the restart-info file state before dying. */
  FileRecordPtr file0Ptr;
  file0Ptr.i = crestartInfoFile[0];
  ptrCheckGuard(file0Ptr, cfileFileSize, fileRecord);
  FileRecordPtr file1Ptr;
  file1Ptr.i = crestartInfoFile[1];
  ptrCheckGuard(file1Ptr, cfileFileSize, fileRecord);

  ndbout_c("file[0] status: %d type: %d reqStatus: %d file1: %d %d %d",
           file0Ptr.p->fileStatus, file0Ptr.p->fileType, file0Ptr.p->reqStatus,
           file1Ptr.p->fileStatus, file1Ptr.p->fileType, file1Ptr.p->reqStatus
           );

  /* Ask NDBFS to dump per-file state for both restart-info files. */
  signal->theData[0] = 404;
  signal->theData[1] = file0Ptr.p->fileRef;
  EXECUTE_DIRECT(NDBFS, GSN_DUMP_STATE_ORD, signal, 2);

  signal->theData[0] = 404;
  signal->theData[1] = file1Ptr.p->fileRef;
  EXECUTE_DIRECT(NDBFS, GSN_DUMP_STATE_ORD, signal, 2);

  signal->theData[0] = DumpStateOrd::NdbfsDumpRequests;
  signal->theData[1] = file1Ptr.p->fileRef;
  EXECUTE_DIRECT(NDBFS, GSN_DUMP_STATE_ORD, signal, 2);

  /* Various GCP_STOP error insert codes */
  if (ERROR_INSERTED(7238) ||
      ERROR_INSERTED(7239) ||
      ERROR_INSERTED(7244) ||
      ERROR_INSERTED(7237) ||
      ERROR_INSERTED(7241) ||
      ERROR_INSERTED(7242) ||
      ERROR_INSERTED(7243))
  {
    jam();
    if (ERROR_INSERT_EXTRA == 1)
    {
      /* Testing GCP STOP handling via node isolation */
      jam();
      g_eventLogger->info("Not killing local due to GCP stop");
      return;
    }
    /* Otherwise fall through to SYSTEM_ERROR  */
  }

  /* Shut down this node with a GCPStopDetected system error. */
  jam();
  SystemError * const sysErr = (SystemError*)&signal->theData[0];
  sysErr->errorCode = SystemError::GCPStopDetected;
  sysErr->errorRef = reference();
  sysErr->data[0] = m_gcp_save.m_master.m_state;
  sysErr->data[1] = cgcpOrderBlocked;
  sysErr->data[2] = m_micro_gcp.m_master.m_state;
  EXECUTE_DIRECT(NDBCNTR, GSN_SYSTEM_ERROR,
                 signal, SystemError::SignalLength);
  ndbabort();
}//Dbdih::crashSystemAtGcpStop()
23883 
23884 /*************************************************************************/
23885 /*                                                                       */
23886 /*       MODULE: ALLOCPAGE                                               */
23887 /*       DESCRIPTION: THE SUBROUTINE IS CALLED WITH POINTER TO PAGE      */
23888 /*                    RECORD. A PAGE  RECORD IS TAKEN FROM               */
23889 /*                    THE FREE PAGE  LIST                                */
23890 /*************************************************************************/
/**
 * Pop the first page record off the free-page list (cfirstfreepage)
 * and return it through pagePtr. Running out of page records is fatal.
 */
void Dbdih::allocpage(PageRecordPtr& pagePtr)
{
  ndbrequire(cfirstfreepage != RNIL);  // must never exhaust the free list
  pagePtr.i = cfirstfreepage;
  ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
  /* Unlink from the free list and mark the record as no longer linked. */
  cfirstfreepage = pagePtr.p->nextfreepage;
  pagePtr.p->nextfreepage = RNIL;
}//Dbdih::allocpage()
23899 
23900 /*************************************************************************/
23901 /*                                                                       */
23902 /*       MODULE: ALLOC_STORED_REPLICA                                    */
23903 /*       DESCRIPTION: THE SUBROUTINE IS CALLED TO GET A REPLICA RECORD,  */
23904 /*                    TO INITIALISE IT AND TO LINK IT INTO THE FRAGMENT  */
23905 /*                    STORE RECORD. USED FOR STORED REPLICAS.            */
23906 /*************************************************************************/
allocStoredReplica(FragmentstorePtr fragPtr,ReplicaRecordPtr & newReplicaPtr,Uint32 nodeId,Uint32 fragId,Uint32 tableId)23907 void Dbdih::allocStoredReplica(FragmentstorePtr fragPtr,
23908                                ReplicaRecordPtr& newReplicaPtr,
23909                                Uint32 nodeId,
23910                                Uint32 fragId,
23911                                Uint32 tableId)
23912 {
23913   Uint32 i;
23914   ReplicaRecordPtr arrReplicaPtr;
23915   ReplicaRecordPtr arrPrevReplicaPtr;
23916 
23917   seizeReplicaRec(newReplicaPtr);
23918   for (i = 0; i < MAX_LCP_STORED; i++) {
23919     newReplicaPtr.p->maxGciCompleted[i] = 0;
23920     newReplicaPtr.p->maxGciStarted[i] = 0;
23921     newReplicaPtr.p->lcpId[i] = 0;
23922     newReplicaPtr.p->lcpStatus[i] = ZINVALID;
23923   }//for
23924   newReplicaPtr.p->fragId = fragId;
23925   newReplicaPtr.p->tableId = tableId;
23926   newReplicaPtr.p->noCrashedReplicas = 0;
23927   newReplicaPtr.p->initialGci = (Uint32)(m_micro_gcp.m_current_gci >> 32);
23928   for (i = 0; i < MAX_CRASHED_REPLICAS; i++) {
23929     newReplicaPtr.p->replicaLastGci[i] = ZINIT_REPLICA_LAST_GCI;
23930     newReplicaPtr.p->createGci[i] = ZINIT_CREATE_GCI;
23931   }//for
23932   newReplicaPtr.p->createGci[0] = (Uint32)(m_micro_gcp.m_current_gci >> 32);
23933   newReplicaPtr.p->nextLcp = 0;
23934   newReplicaPtr.p->procNode = nodeId;
23935   newReplicaPtr.p->lcpOngoingFlag = false;
23936   newReplicaPtr.p->lcpIdStarted = 0;
23937 
23938   arrPrevReplicaPtr.i = RNIL;
23939   arrReplicaPtr.i = fragPtr.p->storedReplicas;
23940   while (arrReplicaPtr.i != RNIL) {
23941     jam();
23942     c_replicaRecordPool.getPtr(arrReplicaPtr);
23943     arrPrevReplicaPtr = arrReplicaPtr;
23944     arrReplicaPtr.i = arrReplicaPtr.p->nextPool;
23945   }//while
23946   if (arrPrevReplicaPtr.i == RNIL) {
23947     jam();
23948     fragPtr.p->storedReplicas = newReplicaPtr.i;
23949   } else {
23950     jam();
23951     arrPrevReplicaPtr.p->nextPool = newReplicaPtr.i;
23952   }//if
23953   fragPtr.p->noStoredReplicas++;
23954 }//Dbdih::allocStoredReplica()
23955 
23956 /*************************************************************************/
23957 /* CHECK IF THE NODE CRASH IS TO ESCALATE INTO A SYSTEM CRASH. WE COULD  */
23958 /* DO THIS BECAUSE ALL REPLICAS OF SOME FRAGMENT ARE LOST. WE COULD ALSO */
23959 /* DO IT AFTER MANY NODE FAILURES THAT MAKE IT VERY DIFFICULT TO RESTORE */
23960 /* DATABASE AFTER A SYSTEM CRASH. IT MIGHT EVEN BE IMPOSSIBLE AND THIS   */
23961 /* MUST BE AVOIDED EVEN MORE THAN AVOIDING SYSTEM CRASHES.               */
23962 /*************************************************************************/
checkEscalation()23963 void Dbdih::checkEscalation()
23964 {
23965   Uint32 TnodeGroup[MAX_NDB_NODE_GROUPS];
23966   NodeRecordPtr nodePtr;
23967   Uint32 i;
23968   for (i = 0; i < cnoOfNodeGroups; i++) {
23969     TnodeGroup[i] = ZFALSE;
23970   }//for
23971   for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++)
23972   {
23973     jam();
23974     ptrAss(nodePtr, nodeRecord);
23975     if (nodePtr.p->nodeStatus == NodeRecord::ALIVE &&
23976 	nodePtr.p->activeStatus == Sysfile::NS_Active){
23977       ndbrequire(nodePtr.p->nodeGroup < MAX_NDB_NODE_GROUPS);
23978       TnodeGroup[nodePtr.p->nodeGroup] = ZTRUE;
23979     }
23980   }
23981   for (i = 0; i < cnoOfNodeGroups; i++) {
23982     jam();
23983     ndbrequire(c_node_groups[i] < MAX_NDB_NODE_GROUPS);
23984     if (TnodeGroup[c_node_groups[i]] == ZFALSE) {
23985       jam();
23986       progError(__LINE__, NDBD_EXIT_LOST_NODE_GROUP, "Lost node group");
23987     }//if
23988   }//for
23989 }//Dbdih::checkEscalation()
23990 
23991 /*************************************************************************/
23992 /*                                                                       */
23993 /*       MODULE: CHECK_KEEP_GCI                                          */
23994 /*       DESCRIPTION: CHECK FOR MINIMUM GCI RESTORABLE WITH NEW LOCAL    */
23995 /*                    CHECKPOINT.                                        */
23996 /*************************************************************************/
checkKeepGci(TabRecordPtr tabPtr,Uint32 fragId,Fragmentstore *,Uint32 replicaStartIndex)23997 void Dbdih::checkKeepGci(TabRecordPtr tabPtr, Uint32 fragId, Fragmentstore*,
23998 			 Uint32 replicaStartIndex)
23999 {
24000   ReplicaRecordPtr ckgReplicaPtr;
24001   ckgReplicaPtr.i = replicaStartIndex;
24002   while (ckgReplicaPtr.i != RNIL) {
24003     jam();
24004     c_replicaRecordPool.getPtr(ckgReplicaPtr);
24005     if (c_lcpState.m_participatingLQH.get(ckgReplicaPtr.p->procNode))
24006     {
24007       Uint32 keepGci;
24008       Uint32 oldestRestorableGci;
24009       findMinGci(ckgReplicaPtr, keepGci, oldestRestorableGci);
24010       if (keepGci < c_lcpState.keepGci) {
24011         jam();
24012         /* ----------------------------------------------------------------- */
24013         /* WE MUST KEEP LOG RECORDS SO THAT WE CAN USE ALL LOCAL CHECKPOINTS */
24014         /* THAT ARE AVAILABLE. THUS WE NEED TO CALCULATE THE MINIMUM OVER ALL*/
24015         /* FRAGMENTS.                                                        */
24016         /* ----------------------------------------------------------------- */
24017         c_lcpState.keepGci = keepGci;
24018       }//if
24019       if (oldestRestorableGci > c_lcpState.oldestRestorableGci) {
24020         jam();
24021         c_lcpState.oldestRestorableGci = oldestRestorableGci;
24022       }//if
24023     }
24024     ckgReplicaPtr.i = ckgReplicaPtr.p->nextPool;
24025   }//while
24026 }//Dbdih::checkKeepGci()
24027 
/**
 * Ask NDBFS to close the file without deleting it.
 * Signal layout: [0] file handle, [1] reply address, [2] our file record
 * index (returned in the reply for correlation), [3] close mode.
 */
void Dbdih::closeFile(Signal* signal, FileRecordPtr filePtr)
{
  signal->theData[0] = filePtr.p->fileRef;
  signal->theData[1] = reference();
  signal->theData[2] = filePtr.i;
  signal->theData[3] = ZCLOSE_NO_DELETE;
  sendSignal(NDBFS_REF, GSN_FSCLOSEREQ, signal, 4, JBA);
}//Dbdih::closeFile()
24036 
/**
 * Ask NDBFS to close the file AND remove it from disk.
 * Same signal layout as closeFile(); only the close mode differs
 * (ZCLOSE_DELETE instead of ZCLOSE_NO_DELETE).
 */
void Dbdih::closeFileDelete(Signal* signal, FileRecordPtr filePtr)
{
  signal->theData[0] = filePtr.p->fileRef;
  signal->theData[1] = reference();
  signal->theData[2] = filePtr.i;
  signal->theData[3] = ZCLOSE_DELETE;
  sendSignal(NDBFS_REF, GSN_FSCLOSEREQ, signal, 4, JBA);
}//Dbdih::closeFileDelete()
24045 
createFileRw(Signal * signal,FileRecordPtr filePtr)24046 void Dbdih::createFileRw(Signal* signal, FileRecordPtr filePtr)
24047 {
24048   signal->theData[0] = reference();
24049   signal->theData[1] = filePtr.i;
24050   signal->theData[2] = filePtr.p->fileName[0];
24051   signal->theData[3] = filePtr.p->fileName[1];
24052   signal->theData[4] = filePtr.p->fileName[2];
24053   signal->theData[5] = filePtr.p->fileName[3];
24054   signal->theData[6] = ZCREATE_READ_WRITE;
24055   sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, 7, JBA);
24056 }//Dbdih::createFileRw()
24057 
/**
 * Dequeue at most one pending DIVERIFY request from queue 'q' and answer
 * it with DIVERIFYCONF carrying the current GCI. When aContinueB is true
 * the emptying continues via CONTINUEB until the queue is drained; once
 * drained, threads are synchronized so that no DIVERIFYCONF is still in
 * flight before m_empty_done is set (see emptyverificbuffer_check()).
 * Does nothing while commits are blocked.
 */
void
Dbdih::emptyverificbuffer(Signal* signal, Uint32 q, bool aContinueB)
{
  if(unlikely(getBlockCommit() == true))
  {
    jam();
    /* Commits are blocked: leave the queue untouched until unblocked. */
    return;
  }

  if (!isEmpty(c_diverify_queue[q]))
  {
    jam();

    /* Answer one queued request with the current GCI (hi/lo words). */
    dequeue(c_diverify_queue[q]);
    signal->theData[0] = RNIL;
    signal->theData[1] = (Uint32)(m_micro_gcp.m_current_gci >> 32);
    signal->theData[2] = (Uint32)(m_micro_gcp.m_current_gci & 0xFFFFFFFF);
    signal->theData[3] = 0;
    sendSignal(c_diverify_queue[q].m_ref, GSN_DIVERIFYCONF, signal, 4, JBB);
  }
  else if (aContinueB == true)
  {
    jam();
    /**
     * Make sure that we don't miss any pending transactions
     *   (transactions that are added to list by other thread
     *    while we execute this code)
     */
    Uint32 blocks[] = { DBTC, 0 };
    Callback c = { safe_cast(&Dbdih::emptyverificbuffer_check), q };
    /* Wait until all DIVERIFYCONF sent (from DBDIH) to any DBTC worker have
     * been received.
     * This function is also called from Dbdih::execUNBLOCK_COMMIT_ORD() which
     * can be called from QMGR by EXECUTE_DIRECT (they must share thread to
     * share the relevant state without explicit synchronization).
     * The below synchronization depends on that signals from QMGR to any DBTC
     * worker ends up in same signal queue as signals from DBDIH.
     */
    synchronize_threads_for_blocks(signal, blocks, c);
    return;
  }

  if (aContinueB == true)
  {
    jam();
    //-----------------------------------------------------------------------
    // This emptying happened as part of a take-out process by continueb signals
    // This ensures that we will empty the queue eventually. We will also empty
    // one item every time we insert one item to ensure that the list doesn't
    // grow when it is not blocked.
    //-----------------------------------------------------------------------
    signal->theData[0] = DihContinueB::ZEMPTY_VERIFY_QUEUE;
    signal->theData[1] = q;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
  }//if

  return;
}//Dbdih::emptyverificbuffer()
24116 
24117 void
emptyverificbuffer_check(Signal * signal,Uint32 q,Uint32 retVal)24118 Dbdih::emptyverificbuffer_check(Signal* signal, Uint32 q, Uint32 retVal)
24119 {
24120   ndbrequire(retVal == 0);
24121   if (!isEmpty(c_diverify_queue[q]))
24122   {
24123     jam();
24124     signal->theData[0] = DihContinueB::ZEMPTY_VERIFY_QUEUE;
24125     signal->theData[1] = q;
24126     sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
24127   }
24128   else
24129   {
24130     /**
24131      * Done with emptyverificbuffer
24132      */
24133     c_diverify_queue[q].m_empty_done = 1;
24134   }
24135 }
24136 
24137 /*************************************************************************/
24138 /*       FIND THE NODES FROM WHICH WE CAN EXECUTE THE LOG TO RESTORE THE */
24139 /*       DATA NODE IN A SYSTEM RESTART.                                  */
24140 /*************************************************************************/
findLogNodes(CreateReplicaRecord * createReplica,FragmentstorePtr fragPtr,Uint32 startGci,Uint32 stopGci)24141 bool Dbdih::findLogNodes(CreateReplicaRecord* createReplica,
24142                          FragmentstorePtr fragPtr,
24143                          Uint32 startGci,
24144                          Uint32 stopGci)
24145 {
24146   ConstPtr<ReplicaRecord> flnReplicaPtr;
24147   flnReplicaPtr.i = createReplica->replicaRec;
24148   c_replicaRecordPool.getPtr(flnReplicaPtr);
24149   /* --------------------------------------------------------------------- */
24150   /*       WE START BY CHECKING IF THE DATA NODE CAN HANDLE THE LOG ALL BY */
24151   /*       ITSELF. THIS IS THE DESIRED BEHAVIOUR. IF THIS IS NOT POSSIBLE  */
24152   /*       THEN WE SEARCH FOR THE BEST POSSIBLE NODES AMONG THE NODES THAT */
24153   /*       ARE PART OF THIS SYSTEM RESTART.                                */
24154   /*       THIS CAN ONLY BE HANDLED BY THE LAST CRASHED REPLICA.           */
24155   /*       The condition is that the replica was created before or at the  */
24156   /*       time of the starting gci, in addition it must have been alive   */
24157   /*       at the time of the stopping gci. This is checked by two         */
24158   /*       conditions, the first checks replicaLastGci and the second      */
24159   /*       checks that it is also smaller than the last gci the node was   */
24160   /*       involved in. This is necessary to check since createGci is set  */
24161   /*       Last + 1 and sometimes startGci = stopGci + 1 and in that case  */
24162   /*       it could happen that replicaLastGci is set to -1 with CreateGci */
24163   /*       set to LastGci + 1.                                             */
24164   /* --------------------------------------------------------------------- */
24165   arrGuard(flnReplicaPtr.p->noCrashedReplicas, MAX_CRASHED_REPLICAS);
24166   const Uint32 noCrashed = flnReplicaPtr.p->noCrashedReplicas;
24167 
24168   if ((startGci >= flnReplicaPtr.p->createGci[noCrashed]) &&
24169       (stopGci <= flnReplicaPtr.p->replicaLastGci[noCrashed]) &&
24170       (stopGci <= SYSFILE->lastCompletedGCI[flnReplicaPtr.p->procNode]))
24171   {
24172     jam();
24173     /* --------------------------------------------------------------------- */
24174     /*       WE FOUND ALL THE LOG RECORDS NEEDED IN THE DATA NODE. WE WILL   */
24175     /*       USE THOSE.                                                      */
24176     /* --------------------------------------------------------------------- */
24177     createReplica->noLogNodes = 1;
24178     createReplica->logStartGci[0] = startGci;
24179     createReplica->logStopGci[0] = stopGci;
24180     createReplica->logNodeId[0] = flnReplicaPtr.p->procNode;
24181     return true;
24182   }//if
24183   /* If we reach this code we're in trouble nowadays */
24184   g_eventLogger->info("startGci: %u, stopGci: %u, noCrashed: %u"
24185                       "newestRestorableGci: %u, createGci: %u,"
24186                       " replicaLastGci: %u, lastCompletedGci: %u"
24187                       ", node: %u",
24188                       startGci,
24189                       stopGci,
24190                       noCrashed,
24191                       SYSFILE->newestRestorableGCI,
24192                       flnReplicaPtr.p->createGci[noCrashed],
24193                       flnReplicaPtr.p->replicaLastGci[noCrashed],
24194                       SYSFILE->lastCompletedGCI[flnReplicaPtr.p->procNode],
24195                       flnReplicaPtr.p->procNode);
24196   Uint32 logNode = 0;
24197   do {
24198     Uint32 fblStopGci;
24199     jam();
24200     if(!findBestLogNode(createReplica,
24201 			fragPtr,
24202 			startGci,
24203 			stopGci,
24204 			logNode,
24205 			fblStopGci)){
24206       jam();
24207       return false;
24208     }
24209 
24210     logNode++;
24211     if (fblStopGci >= stopGci) {
24212       jam();
24213       createReplica->noLogNodes = logNode;
24214       return true;
24215     }//if
24216     startGci = fblStopGci + 1;
24217     if (logNode >= MAX_LOG_EXEC)
24218     {
24219       jam();
24220       break;
24221     }//if
24222   } while (1);
24223   /* --------------------------------------------------------------------- */
24224   /*       IT WAS NOT POSSIBLE TO RESTORE THE REPLICA. THIS CAN EITHER BE  */
24225   /*       BECAUSE OF LACKING NODES OR BECAUSE OF A REALLY SERIOUS PROBLEM.*/
24226   /* --------------------------------------------------------------------- */
24227   return false;
24228 }//Dbdih::findLogNodes()
24229 
24230 /*************************************************************************/
24231 /*       FIND THE BEST POSSIBLE LOG NODE TO EXECUTE THE LOG AS SPECIFIED */
24232 /*       BY THE INPUT PARAMETERS. WE SCAN THROUGH ALL ALIVE REPLICAS.    */
24233 /*       THIS MEANS STORED, OLD_STORED                                   */
24234 /*************************************************************************/
24235 bool
findBestLogNode(CreateReplicaRecord * createReplica,FragmentstorePtr fragPtr,Uint32 startGci,Uint32 stopGci,Uint32 logNode,Uint32 & fblStopGci)24236 Dbdih::findBestLogNode(CreateReplicaRecord* createReplica,
24237 		       FragmentstorePtr fragPtr,
24238 		       Uint32 startGci,
24239 		       Uint32 stopGci,
24240 		       Uint32 logNode,
24241 		       Uint32& fblStopGci)
24242 {
24243   ConstPtr<ReplicaRecord> fblFoundReplicaPtr;
24244   ConstPtr<ReplicaRecord> fblReplicaPtr;
24245 
24246   /* --------------------------------------------------------------------- */
24247   /*       WE START WITH ZERO AS FOUND TO ENSURE THAT FIRST HIT WILL BE    */
24248   /*       BETTER.                                                         */
24249   /* --------------------------------------------------------------------- */
24250   fblStopGci = 0;
24251   fblReplicaPtr.i = fragPtr.p->storedReplicas;
24252   while (fblReplicaPtr.i != RNIL) {
24253     jam();
24254     c_replicaRecordPool.getPtr(fblReplicaPtr);
24255     if (m_sr_nodes.get(fblReplicaPtr.p->procNode))
24256     {
24257       jam();
24258       Uint32 fliStopGci = findLogInterval(fblReplicaPtr, startGci);
24259       if (fliStopGci > fblStopGci)
24260       {
24261         jam();
24262         fblStopGci = fliStopGci;
24263         fblFoundReplicaPtr = fblReplicaPtr;
24264       }//if
24265     }//if
24266     fblReplicaPtr.i = fblReplicaPtr.p->nextPool;
24267   }//while
24268   fblReplicaPtr.i = fragPtr.p->oldStoredReplicas;
24269   while (fblReplicaPtr.i != RNIL) {
24270     jam();
24271     c_replicaRecordPool.getPtr(fblReplicaPtr);
24272     if (m_sr_nodes.get(fblReplicaPtr.p->procNode))
24273     {
24274       jam();
24275       Uint32 fliStopGci = findLogInterval(fblReplicaPtr, startGci);
24276       if (fliStopGci > fblStopGci)
24277       {
24278         jam();
24279         fblStopGci = fliStopGci;
24280         fblFoundReplicaPtr = fblReplicaPtr;
24281       }//if
24282     }//if
24283     fblReplicaPtr.i = fblReplicaPtr.p->nextPool;
24284   }//while
24285   if (fblStopGci != 0) {
24286     jam();
24287     ndbrequire(logNode < MAX_LOG_EXEC);
24288     createReplica->logNodeId[logNode] = fblFoundReplicaPtr.p->procNode;
24289     createReplica->logStartGci[logNode] = startGci;
24290     if (fblStopGci >= stopGci) {
24291       jam();
24292       createReplica->logStopGci[logNode] = stopGci;
24293     } else {
24294       jam();
24295       createReplica->logStopGci[logNode] = fblStopGci;
24296     }//if
24297   }//if
24298 
24299   return fblStopGci != 0;
24300 }//Dbdih::findBestLogNode()
24301 
findLogInterval(ConstPtr<ReplicaRecord> replicaPtr,Uint32 startGci)24302 Uint32 Dbdih::findLogInterval(ConstPtr<ReplicaRecord> replicaPtr,
24303 			      Uint32 startGci)
24304 {
24305   ndbrequire(replicaPtr.p->noCrashedReplicas <= MAX_CRASHED_REPLICAS);
24306   Uint32 loopLimit = replicaPtr.p->noCrashedReplicas + 1;
24307   for (Uint32 i = 0; i < loopLimit; i++) {
24308     jam();
24309     if (replicaPtr.p->createGci[i] <= startGci) {
24310       if (replicaPtr.p->replicaLastGci[i] >= startGci) {
24311         jam();
24312         return replicaPtr.p->replicaLastGci[i];
24313       }//if
24314     }//if
24315   }//for
24316   return 0;
24317 }//Dbdih::findLogInterval()
24318 
24319 /*************************************************************************/
24320 /*                                                                       */
24321 /*       MODULE: FIND THE MINIMUM GCI THAT THIS NODE HAS LOG RECORDS FOR.*/
24322 /*************************************************************************/
findMinGci(ReplicaRecordPtr fmgReplicaPtr,Uint32 & keepGci,Uint32 & oldestRestorableGci)24323 void Dbdih::findMinGci(ReplicaRecordPtr fmgReplicaPtr,
24324                        Uint32& keepGci,
24325                        Uint32& oldestRestorableGci)
24326 {
24327   keepGci = (Uint32)-1;
24328   oldestRestorableGci = 0;
24329 
24330   Uint32 maxLcpId = 0;              // LcpId of latest valid LCP
24331   Uint32 maxLcpNo = MAX_LCP_STORED; // Index of latest valid LCP
24332   for (Uint32 i = 0; i < MAX_LCP_STORED; i++)
24333   {
24334     jam();
24335     if (fmgReplicaPtr.p->lcpStatus[i] == ZVALID)
24336     {
24337       if ((fmgReplicaPtr.p->lcpId[i] + MAX_LCP_STORED) <= SYSFILE->latestLCP_ID)
24338       {
24339         jam();
24340         /*-----------------------------------------------------------------*/
24341         // We invalidate the checkpoint we are preparing to overwrite.
24342         // The LCP id is still the old lcp id,
24343         // this is the reason of comparing with lcpId + 1.
24344         /*-----------------------------------------------------------------*/
24345         fmgReplicaPtr.p->lcpStatus[i] = ZINVALID;
24346       }
24347       else if (fmgReplicaPtr.p->lcpId[i] > maxLcpId)
24348       {
24349         jam();
24350         maxLcpId = fmgReplicaPtr.p->lcpId[i];
24351         maxLcpNo = i;
24352       }
24353     }
24354   }
24355 
24356   if (maxLcpNo < MAX_LCP_STORED)
24357   {
24358     /**
24359      * Only consider latest LCP (wrt to how to cut REDO)
24360      */
24361     jam();
24362     keepGci = fmgReplicaPtr.p->maxGciCompleted[maxLcpNo];
24363     oldestRestorableGci = fmgReplicaPtr.p->maxGciStarted[maxLcpNo];
24364   }
24365 
24366   if (oldestRestorableGci == 0 && keepGci == Uint32(-1))
24367   {
24368     jam();
24369     if (fmgReplicaPtr.p->createGci[0] == fmgReplicaPtr.p->initialGci)
24370     {
24371       keepGci = fmgReplicaPtr.p->createGci[0];
24372       // XXX Jonas
24373       //oldestRestorableGci = fmgReplicaPtr.p->createGci[0];
24374     }
24375   }
24376   else
24377   {
24378     ndbassert(oldestRestorableGci <= c_newest_restorable_gci);
24379   }
24380   return;
24381 }//Dbdih::findMinGci()
24382 
findStartGci(Ptr<ReplicaRecord> replicaPtr,Uint32 stopGci,Uint32 & startGci,Uint32 & lcpNo)24383 bool Dbdih::findStartGci(Ptr<ReplicaRecord> replicaPtr,
24384                          Uint32 stopGci,
24385                          Uint32& startGci,
24386                          Uint32& lcpNo)
24387 {
24388   Uint32 cnt = 0;
24389   Uint32 tmp[MAX_LCP_STORED];
24390   for (Uint32 i = 0; i<MAX_LCP_STORED; i++)
24391   {
24392     jam();
24393     if (replicaPtr.p->lcpStatus[i] == ZVALID &&
24394         replicaPtr.p->maxGciStarted[i] <= stopGci)
24395     {
24396       /**
24397        * In order to use LCP
24398        *   we must be able to run REDO atleast up until maxGciStarted
24399        *   which is that highest GCI that
24400        */
24401       jam();
24402       tmp[cnt] = i;
24403       cnt++;
24404     }
24405   }
24406 
24407   if (cnt)
24408   {
24409     jam();
24410     /**
24411      * We found atleast one...get the highest
24412      */
24413     lcpNo = tmp[0];
24414     Uint32 lcpId = replicaPtr.p->lcpId[lcpNo];
24415     for (Uint32 i = 1; i<cnt; i++)
24416     {
24417       jam();
24418       if (replicaPtr.p->lcpId[tmp[i]] > lcpId)
24419       {
24420         jam();
24421         lcpNo = tmp[i];
24422         lcpId = replicaPtr.p->lcpId[lcpNo];
24423       }
24424     }
24425     startGci = replicaPtr.p->maxGciCompleted[lcpNo] + 1;
24426     return true;
24427   }
24428 
24429   /* --------------------------------------------------------------------- */
24430   /*       NO VALID LOCAL CHECKPOINT WAS AVAILABLE. WE WILL ADD THE        */
24431   /*       FRAGMENT. THUS THE NEXT LCP MUST BE SET TO ZERO.                */
24432   /*       WE MUST EXECUTE THE LOG FROM THE INITIAL GLOBAL CHECKPOINT WHEN */
24433   /*       THE TABLE WAS CREATED.                                          */
24434   /* --------------------------------------------------------------------- */
24435   startGci = replicaPtr.p->initialGci;
24436   jam();
24437   /**
24438    * It is possible that we have saved an LCP that from DIH point of view isn't
24439    * completed before the crash, so we set the nextLcp to 0 to start from
24440    * 0 again.
24441    */
24442   replicaPtr.p->nextLcp = 0;
24443   return false;
24444 }//Dbdih::findStartGci()
24445 
24446 /**
24447  * Compute max time it can take to "resolve" cascading node-failures
24448  *   given hb-interval, arbit timeout and #db-nodes.
24449  */
24450 Uint32
compute_max_failure_time()24451 Dbdih::compute_max_failure_time()
24452 {
24453   jam();
24454   Uint32 no_of_live_db_nodes = 0;
24455 
24456   // Count the number of live data nodes.
24457   NodeRecordPtr nodePtr(NULL, cfirstAliveNode);
24458   while (nodePtr.i != RNIL)
24459   {
24460     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
24461 
24462     ndbassert(nodePtr.p->nodeStatus == NodeRecord::ALIVE);
24463 
24464     no_of_live_db_nodes++;
24465     nodePtr.i = nodePtr.p->nextNode;
24466   }
24467 
24468   const ndb_mgm_configuration_iterator* cfgIter =
24469     m_ctx.m_config.getOwnConfigIterator();
24470 
24471   Uint32 hbDBDB = 5000;
24472   ndb_mgm_get_int_parameter(cfgIter, CFG_DB_HEARTBEAT_INTERVAL, &hbDBDB);
24473 
24474   Uint32 arbit_timeout = 7500;
24475   ndb_mgm_get_int_parameter(cfgIter, CFG_DB_ARBIT_TIMEOUT, &arbit_timeout);
24476 
24477   /*
24478     A node is presumed dead if it is silent for four missed heartbeats,
24479     meaning that the worst case is five heartbeat intervals.
24480   */
24481   const Uint32 heartbeat_fail_time = hbDBDB * 5;
24482 
24483   /*
24484     The worst case failure scenario works as follows:
24485 
24486     1) All data nodes are running.
24487 
24488     2) One in each node group fail. Detecting this takes:
24489     no_of_node_groups * heartbeat_fail_time
24490 
24491     3) Arbitration is started, as the failed nodes could have formed an
24492     independent cluster. Arbitration make take up to arbit_timeout to
24493     complete.
24494 
24495     4) Just before arbitration completes, all remaining nodes except
24496     for the master fail. The remain node *could* have shut itself down
24497     as soon as the first of these failures are detected, but as it
24498     waits for outstanding PREP_FAILCONF messages before checking of
24499     the cluster is viable, it does not do so until all the failures
24500     have been detected. Detecting these failures thus takes:
24501     (no_of_nodes - no_of_node_groups - 1) * heartbeat_fail_time
24502 
24503     Combining these figure we get a total failure time of:
24504     (no_of_nodes - 1) * heartbeat_fail_time + arbit_timeout
24505 
24506     (For NoOfReplicas>2 there could be cases of nodes failing sequentially
24507     that would require more than one round of arbitration. These have not
24508     been considered here.)
24509   */
24510 
24511   return (MAX(no_of_live_db_nodes, 1) - 1) * heartbeat_fail_time
24512     + arbit_timeout;
24513 }
24514 
24515 /*
24516   Calculate timeouts for detecting GCP stops. These must be set such that
24517   node failures are not falsely interpreted as GCP stops.
24518 */
/**
 * Recompute the GCP stop detection timeouts (max allowed lag for
 * GCP_COMMIT / micro GCP and GCP_SAVE) from configuration and the
 * worst-case node failure handling time.  Logs any change and forwards
 * the resulting timer to DBLQH via sendINFO_GCP_STOP_TIMER().
 *
 * @param signal                used only to send INFO_GCP_STOP_TIMER
 * @param set_gcp_save_max_lag  if true, recompute the GCP_SAVE max lag
 * @param set_micro_gcp_max_lag if true, recompute the micro GCP max lag
 */
void Dbdih::setGCPStopTimeouts(Signal *signal,
                               bool set_gcp_save_max_lag,
                               bool set_micro_gcp_max_lag)
{

  const ndb_mgm_configuration_iterator* cfgIter =
    m_ctx.m_config.getOwnConfigIterator();

  // Worst-case time to resolve cascading node failures, see
  // compute_max_failure_time().
  const Uint32 max_failure_time = compute_max_failure_time();

  // Set time-between epochs timeout
  Uint32 micro_GCP_timeout = 4000;
  ndb_mgm_get_int_parameter(cfgIter, CFG_DB_MICRO_GCP_TIMEOUT,
                            &micro_GCP_timeout);

  /*
    Set minimum value for time-between global checkpoint timeout.
    By default, this is 2 minutes.
  */
  Uint32 gcp_timeout = 120000;
  ndb_mgm_get_int_parameter(cfgIter, CFG_DB_GCP_TIMEOUT, &gcp_timeout);

  // Saved both to detect changes (for logging below) and to restore the
  // values if a test has pinned them (see the ERROR_INSERT section).
  const Uint32 old_micro_GCP_max_lag = m_gcp_monitor.m_micro_gcp.m_max_lag_ms;
  const Uint32 old_GCP_save_max_lag = m_gcp_monitor.m_gcp_save.m_max_lag_ms;

  if (micro_GCP_timeout != 0)
  {
    jam();
    if (ERROR_INSERTED(7145))
    {
      /*
        We drop these lower limits in certain tests, to verify that the
        calculated value for max_failure_time is sufficient.
       */
      ndbout << "Dbdih::setGCPStopTimeouts() setting minimal GCP timout values"
             << " for test purposes."  << endl;
      micro_GCP_timeout = 0;
      gcp_timeout = 0;
    }

    if (set_micro_gcp_max_lag)
    {
      m_gcp_monitor.m_micro_gcp.m_max_lag_ms =
        m_micro_gcp.m_master.m_time_between_gcp + micro_GCP_timeout
        + max_failure_time;
    }

    if (set_gcp_save_max_lag)
    {
      m_gcp_monitor.m_gcp_save.m_max_lag_ms =
        m_gcp_save.m_master.m_time_between_gcp +
        // Ensure that GCP-commit times out before GCP-save if both stops.
        MAX(gcp_timeout, micro_GCP_timeout) +
        max_failure_time;
    }
  }
  else
  {
    jam();
    // TimeBetweenEpochsTimeout == 0 disables GCP stop detection entirely.
    m_gcp_monitor.m_gcp_save.m_max_lag_ms = 0;
    m_gcp_monitor.m_micro_gcp.m_max_lag_ms = 0;
  }

#ifdef ERROR_INSERT
  // If a test has already set gcp save max lag, don't overwrite it
  if (m_gcp_monitor.m_gcp_save.test_set_max_lag)
  {
    m_gcp_monitor.m_gcp_save.m_max_lag_ms = old_GCP_save_max_lag;
  }

  // If a test has already set gcp commit max lag, don't overwrite it
  if (m_gcp_monitor.m_micro_gcp.test_set_max_lag)
  {
    m_gcp_monitor.m_micro_gcp.m_max_lag_ms = old_micro_GCP_max_lag;
  }
#endif

  // If timeouts have changed, log it for micro_gcp
  if (old_micro_GCP_max_lag != m_gcp_monitor.m_micro_gcp.m_max_lag_ms)
  {
    if (m_gcp_monitor.m_micro_gcp.m_max_lag_ms > 0)
    {
      jam();
      if (isMaster())
      {
        jam();
        // Log to mgmd.
        infoEvent("GCP Monitor: Computed max GCP_COMMIT lag to %u seconds",
                  m_gcp_monitor.m_micro_gcp.m_max_lag_ms / 1000);
      }
      // Log locally.
      g_eventLogger->info("GCP Monitor: Computed max GCP_COMMIT lag to %u"
                          " seconds",
                          m_gcp_monitor.m_micro_gcp.m_max_lag_ms / 1000);
    }
    else
    {
      jam();
      if (isMaster())
      {
        jam();
        infoEvent("GCP Monitor: GCP_COMMIT: unlimited lags allowed");
      }
      g_eventLogger->info("GCP Monitor: GCP_COMMIT: unlimited lags allowed");
    }
  }

  // If timeouts have changed, log it for gcp_save
  if (old_GCP_save_max_lag != m_gcp_monitor.m_gcp_save.m_max_lag_ms)
  {
    if (m_gcp_monitor.m_gcp_save.m_max_lag_ms > 0)
    {
      jam();
      if (isMaster())
      {
        jam();
        // Log to mgmd.
        infoEvent("GCP Monitor: Computed max GCP_SAVE lag to %u seconds",
                  m_gcp_monitor.m_gcp_save.m_max_lag_ms / 1000);
      }
      // Log locally.
      g_eventLogger->info("GCP Monitor: Computed max GCP_SAVE lag to %u"
                          " seconds",
                          m_gcp_monitor.m_gcp_save.m_max_lag_ms / 1000);
    }
    else
    {
      jam();
      if (isMaster())
      {
        jam();
        infoEvent("GCP Monitor: GCP_SAVE: unlimited lags allowed");
      }
      g_eventLogger->info("GCP Monitor: GCP_SAVE: unlimited lags allowed");
    }
  }
  // Tell DBLQH the effective (largest) GCP stop timer.
  sendINFO_GCP_STOP_TIMER(signal);
} // setGCPStopTimeouts()
24657 
sendINFO_GCP_STOP_TIMER(Signal * signal)24658 void Dbdih::sendINFO_GCP_STOP_TIMER(Signal *signal)
24659 {
24660   Uint32 gcp_stop_timer_in_ms = MAX(m_gcp_monitor.m_micro_gcp.m_max_lag_ms,
24661                                     m_gcp_monitor.m_gcp_save.m_max_lag_ms);
24662   signal->theData[0] = gcp_stop_timer_in_ms;
24663   sendSignal(DBLQH_REF, GSN_INFO_GCP_STOP_TIMER, signal, 1, JBB);
24664 }
24665 
/**
 * Initialise all common DIH state variables and read the configuration
 * parameters that govern LCP/GCP behaviour.  Called once during block
 * initialisation, before any restart processing starts.
 */
void Dbdih::initCommonData()
{
  // Block-commit and failure bookkeeping.
  c_blockCommit = false;
  c_blockCommitNo = 0;
  cfailurenr = 1;
  cMinTcFailNo = 0; /* 0 as TC inits to 0 */
  cfirstAliveNode = RNIL;
  cfirstDeadNode = RNIL;
  cgckptflag = false;
  cgcpOrderBlocked = 0;
  c_performed_copy_phase = false;

  c_lcpMasterTakeOverState.set(LMTOS_IDLE, __LINE__);

  // LCP state machine: start out fully idle.
  c_lcpState.clcpDelay = 0;
  c_lcpState.lcpStart = ZIDLE;
  c_lcpState.lcpStopGcp = 0;
  c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__);
  c_lcpState.currentFragment.tableId = 0;
  c_lcpState.currentFragment.fragmentId = 0;
  c_lcpState.noOfLcpFragRepOutstanding = 0;
  c_lcpState.keepGci = 0;
  c_lcpState.oldestRestorableGci = 0;
  c_lcpState.ctcCounter = 0;
  c_lcpState.ctimer = 0;
  c_lcpState.immediateLcpStart = false;
  c_lcpState.m_MASTER_LCPREQ_Received = false;
  c_lcpState.m_lastLCP_COMPLETE_REP_ref = 0;
  // Master/cluster-wide state.
  cmasterdihref = 0;
  cmasterNodeId = 0;
  cmasterState = MASTER_IDLE;
  cmasterTakeOverNode = 0;
  cnoOfActiveTables = 0;
  cnoOfNodeGroups = 0;
  c_nextNodeGroup = 0;
  cnoReplicas = 0;
  con_lineNodes = 0;
  creceivedfrag = 0;
  crestartGci = 0;
  crestartInfoFile[0] = RNIL;
  crestartInfoFile[1] = RNIL;
  cstartPhase = 0;
  cstarttype = (Uint32)-1;
  csystemnodes = 0;
  c_newest_restorable_gci = 0;
  cwaitLcpSr = false;
  c_nodeStartMaster.blockGcp = 0;

  nodeResetStart(0);
  c_nodeStartMaster.wait = ZFALSE;

  // Reset the on-disk restart info image (SYSFILE maps into sysfileData).
  memset(&sysfileData[0], 0, sizeof(sysfileData));
  SYSFILE->initSysFile(SYSFILE->nodeStatus, SYSFILE->nodeGroups);
  SYSFILE->latestLCP_ID = 1; /* Ensure that first LCP id is 1 */

  const ndb_mgm_configuration_iterator * p =
    m_ctx.m_config.getOwnConfigIterator();
  ndbrequire(p != 0);

  c_lcpState.clcpDelay = 20;

  /**
   * Get the configuration value for how many parallel fragment copy scans we
   * are going to do in parallel when we are requested to handle a node
   * recovery. If 0 set it to default value.
   */
  c_max_takeover_copy_threads = 0;
  ndb_mgm_get_int_parameter(p,
                            CFG_DB_PARALLEL_COPY_THREADS,
                            &c_max_takeover_copy_threads);
  if (c_max_takeover_copy_threads == 0)
  {
    jam();
    c_max_takeover_copy_threads = ZTAKE_OVER_THREADS;
  }

  // LCP interval, clamped to the valid range [0, 31].
  ndb_mgm_get_int_parameter(p, CFG_DB_LCP_INTERVAL, &c_lcpState.clcpDelay);
  c_lcpState.clcpDelay = c_lcpState.clcpDelay > 31 ? 31 : c_lcpState.clcpDelay;

  cnoReplicas = 1;
  ndb_mgm_get_int_parameter(p, CFG_DB_NO_REPLICAS, &cnoReplicas);
  if (cnoReplicas > MAX_REPLICAS)
  {
    progError(__LINE__, NDBD_EXIT_INVALID_CONFIG,
	      "Only up to four replicas are supported. Check NoOfReplicas.");
  }

  init_next_replica_node(&c_next_replica_node, cnoReplicas);
  // Reset GCP protocol state; start times are invalidated until first use.
  bzero(&m_gcp_save, sizeof(m_gcp_save));
  bzero(&m_micro_gcp, sizeof(m_micro_gcp));
  NdbTick_Invalidate(&m_gcp_save.m_master.m_start_time);
  NdbTick_Invalidate(&m_micro_gcp.m_master.m_start_time);
  {
    { // Set time-between global checkpoint
      Uint32 tmp = 2000;
      ndb_mgm_get_int_parameter(p, CFG_DB_GCP_INTERVAL, &tmp);
      // Clamp to [10, 60000] ms.
      tmp = tmp > 60000 ? 60000 : (tmp < 10 ? 10 : tmp);
      m_gcp_save.m_master.m_time_between_gcp = tmp;
    }

    Uint32 tmp = 0;
    if (ndb_mgm_get_int_parameter(p, CFG_DB_MICRO_GCP_INTERVAL, &tmp) == 0 &&
        tmp)
    {
      /**
       * Set time-between epochs; clamped to at least 10 ms and at most the
       * time between global checkpoints.
       */
      if (tmp > m_gcp_save.m_master.m_time_between_gcp)
        tmp = m_gcp_save.m_master.m_time_between_gcp;
      if (tmp < 10)
        tmp = 10;
      m_micro_gcp.m_master.m_time_between_gcp = tmp;
    }

    // These will be set when nodes reach state 'started'.
    m_gcp_monitor.m_micro_gcp.m_max_lag_ms = 0;
    m_gcp_monitor.m_gcp_save.m_max_lag_ms = 0;
  }
}//Dbdih::initCommonData()
24785 
initFragstore(FragmentstorePtr fragPtr,Uint32 fragId)24786 void Dbdih::initFragstore(FragmentstorePtr fragPtr, Uint32 fragId)
24787 {
24788   fragPtr.p->fragId = fragId;
24789   fragPtr.p->nextCopyFragment = RNIL;
24790   fragPtr.p->storedReplicas = RNIL;
24791   fragPtr.p->oldStoredReplicas = RNIL;
24792   fragPtr.p->m_log_part_id = RNIL; /* To ensure not used uninited */
24793   fragPtr.p->partition_id = ~Uint32(0); /* To ensure not used uninited */
24794 
24795   fragPtr.p->noStoredReplicas = 0;
24796   fragPtr.p->noOldStoredReplicas = 0;
24797   fragPtr.p->fragReplicas = 0;
24798   fragPtr.p->preferredPrimary = 0;
24799 
24800   for (Uint32 i = 0; i < MAX_REPLICAS; i++)
24801     fragPtr.p->activeNodes[i] = 0;
24802 
24803   fragPtr.p->noLcpReplicas = 0;
24804   fragPtr.p->distributionKey = 0;
24805 }//Dbdih::initFragstore()
24806 
24807 /*************************************************************************/
24808 /*                                                                       */
24809 /*       MODULE: INIT_RESTART_INFO                                       */
24810 /*       DESCRIPTION: INITIATE RESTART INFO VARIABLE AND VARIABLES FOR   */
24811 /*                    GLOBAL CHECKPOINTS.                                */
24812 /*************************************************************************/
initRestartInfo(Signal * signal)24813 void Dbdih::initRestartInfo(Signal* signal)
24814 {
24815   Uint32 i;
24816   for (i = 0; i < MAX_NDB_NODES; i++) {
24817     SYSFILE->lastCompletedGCI[i] = 0;
24818   }//for
24819   NodeRecordPtr nodePtr;
24820   nodePtr.i = cfirstAliveNode;
24821   do {
24822     jam();
24823     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
24824     SYSFILE->lastCompletedGCI[nodePtr.i] = 1;
24825     /* FIRST GCP = 1 ALREADY SET BY LQH */
24826     nodePtr.i = nodePtr.p->nextNode;
24827   } while (nodePtr.i != RNIL);
24828 
24829   Uint32 startGci = 1;
24830 #ifndef DBUG_OFF
24831 #ifdef NDB_USE_GET_ENV
24832   {
24833     char envBuf[256];
24834     const char* v = NdbEnv_GetEnv("NDB_START_GCI",
24835                                   envBuf,
24836                                   256);
24837     if (v && *v != 0)
24838     {
24839       startGci = my_strtoull(v, NULL, 0);
24840 
24841       ndbout_c("DbDih : Using value of %u from NDB_START_GCI",
24842                startGci);
24843     }
24844   }
24845 #endif
24846 #endif
24847 
24848   m_micro_gcp.m_old_gci = Uint64(startGci) << 32;
24849   m_micro_gcp.m_current_gci = Uint64(startGci + 1) << 32;
24850   crestartGci = startGci;
24851   c_newest_restorable_gci = startGci;
24852 
24853   SYSFILE->keepGCI             = startGci;
24854 
24855   DEB_LCP(("Init SYSFILE->keepGCI = %u", SYSFILE->keepGCI));
24856 
24857   SYSFILE->oldestRestorableGCI = startGci;
24858   SYSFILE->newestRestorableGCI = startGci;
24859   SYSFILE->systemRestartBits   = 0;
24860   for (i = 0; i < NdbNodeBitmask::Size; i++) {
24861     SYSFILE->lcpActive[0]        = 0;
24862   }//for
24863   memset(SYSFILE->takeOver, 0, sizeof(SYSFILE->takeOver));
24864   Sysfile::setInitialStartOngoing(SYSFILE->systemRestartBits);
24865   srand((unsigned int)time(0));
24866   globalData.m_restart_seq = SYSFILE->m_restart_seq = 1;
24867   g_eventLogger->info("Starting with m_restart_seq set to %u",
24868                       globalData.m_restart_seq);
24869 
24870   if (m_micro_gcp.m_enabled == false &&
24871       m_micro_gcp.m_master.m_time_between_gcp)
24872   {
24873     /**
24874      * Micro GCP is disabled...but configured...
24875      */
24876     jam();
24877     m_micro_gcp.m_enabled = true;
24878     UpgradeProtocolOrd * ord = (UpgradeProtocolOrd*)signal->getDataPtrSend();
24879     ord->type = UpgradeProtocolOrd::UPO_ENABLE_MICRO_GCP;
24880     EXECUTE_DIRECT(QMGR,GSN_UPGRADE_PROTOCOL_ORD,signal,signal->getLength());
24881   }
24882 }//Dbdih::initRestartInfo()
24883 
24884 /*--------------------------------------------------------------------*/
24885 /*       NODE GROUP BITS ARE INITIALISED BEFORE THIS.                 */
24886 /*       NODE ACTIVE BITS ARE INITIALISED BEFORE THIS.                */
24887 /*--------------------------------------------------------------------*/
24888 /*************************************************************************/
24889 /*                                                                       */
24890 /*       MODULE: INIT_RESTORABLE_GCI_FILES                               */
24891 /*       DESCRIPTION: THE SUBROUTINE SETS UP THE FILES THAT REFERS TO THE*/
24892 /*       FILES THAT KEEP THE VARIABLE CRESTART_INFO                      */
24893 /*************************************************************************/
initRestorableGciFiles()24894 void Dbdih::initRestorableGciFiles()
24895 {
24896   Uint32 tirgTmp;
24897   FileRecordPtr filePtr;
24898   seizeFile(filePtr);
24899   filePtr.p->tabRef = RNIL;
24900   filePtr.p->fileType = FileRecord::GCP_FILE;
24901   filePtr.p->reqStatus = FileRecord::IDLE;
24902   filePtr.p->fileStatus = FileRecord::CLOSED;
24903   crestartInfoFile[0] = filePtr.i;
24904   filePtr.p->fileName[0] = (Uint32)-1;  /* T DIRECTORY NOT USED  */
24905   filePtr.p->fileName[1] = (Uint32)-1;  /* F DIRECTORY NOT USED  */
24906   filePtr.p->fileName[2] = (Uint32)-1;  /* S PART IGNORED        */
24907   tirgTmp = 1;  /* FILE NAME VERSION 1   */
24908   tirgTmp = (tirgTmp << 8) + 6; /* .SYSFILE              */
24909   tirgTmp = (tirgTmp << 8) + 1; /* D1 DIRECTORY          */
24910   tirgTmp = (tirgTmp << 8) + 0; /* P0 FILE NAME          */
24911   filePtr.p->fileName[3] = tirgTmp;
24912   /* --------------------------------------------------------------------- */
24913   /*       THE NAME BECOMES /D1/DBDIH/P0.SYSFILE                          */
24914   /* --------------------------------------------------------------------- */
24915   seizeFile(filePtr);
24916   filePtr.p->tabRef = RNIL;
24917   filePtr.p->fileType = FileRecord::GCP_FILE;
24918   filePtr.p->reqStatus = FileRecord::IDLE;
24919   filePtr.p->fileStatus = FileRecord::CLOSED;
24920   crestartInfoFile[1] = filePtr.i;
24921   filePtr.p->fileName[0] = (Uint32)-1;  /* T DIRECTORY NOT USED  */
24922   filePtr.p->fileName[1] = (Uint32)-1;  /* F DIRECTORY NOT USED  */
24923   filePtr.p->fileName[2] = (Uint32)-1;  /* S PART IGNORED        */
24924   tirgTmp = 1;  /* FILE NAME VERSION 1   */
24925   tirgTmp = (tirgTmp << 8) + 6; /* .SYSFILE              */
24926   tirgTmp = (tirgTmp << 8) + 2; /* D1 DIRECTORY          */
24927   tirgTmp = (tirgTmp << 8) + 0; /* P0 FILE NAME          */
24928   filePtr.p->fileName[3] = tirgTmp;
24929   /* --------------------------------------------------------------------- */
24930   /*       THE NAME BECOMES /D2/DBDIH/P0.SYSFILE                          */
24931   /* --------------------------------------------------------------------- */
24932 }//Dbdih::initRestorableGciFiles()
24933 
initTable(TabRecordPtr tabPtr)24934 void Dbdih::initTable(TabRecordPtr tabPtr)
24935 {
24936   new (tabPtr.p) TabRecord();
24937   NdbMutex_Init(&tabPtr.p->theMutex);
24938   tabPtr.p->noOfFragChunks = 0;
24939   tabPtr.p->method = TabRecord::NOTDEFINED;
24940   tabPtr.p->tabStatus = TabRecord::TS_IDLE;
24941   tabPtr.p->noOfWords = 0;
24942   tabPtr.p->noPages = 0;
24943   tabPtr.p->tabLcpStatus = TabRecord::TLS_COMPLETED;
24944   tabPtr.p->tabCopyStatus = TabRecord::CS_IDLE;
24945   tabPtr.p->tabUpdateState = TabRecord::US_IDLE;
24946   tabPtr.p->noOfBackups = 0;
24947   tabPtr.p->kvalue = 0;
24948   tabPtr.p->hashpointer = (Uint32)-1;
24949   tabPtr.p->mask = 0;
24950   tabPtr.p->tabStorage = TabRecord::ST_NORMAL;
24951   tabPtr.p->schemaVersion = (Uint32)-1;
24952   tabPtr.p->tabRemoveNode = RNIL;
24953   tabPtr.p->totalfragments = (Uint32)-1;
24954   tabPtr.p->partitionCount = (Uint32)-1;
24955   tabPtr.p->connectrec = RNIL;
24956   tabPtr.p->tabFile[0] = RNIL;
24957   tabPtr.p->tabFile[1] = RNIL;
24958   tabPtr.p->m_dropTab.tabUserRef = 0;
24959   tabPtr.p->m_dropTab.tabUserPtr = RNIL;
24960   Uint32 i;
24961   for (i = 0; i < NDB_ARRAY_SIZE(tabPtr.p->startFid); i++) {
24962     tabPtr.p->startFid[i] = RNIL;
24963   }//for
24964   for (i = 0; i < NDB_ARRAY_SIZE(tabPtr.p->pageRef); i++) {
24965     tabPtr.p->pageRef[i] = RNIL;
24966   }//for
24967   tabPtr.p->tableType = DictTabInfo::UndefTableType;
24968   tabPtr.p->schemaTransId = 0;
24969   tabPtr.p->tabActiveLcpFragments = 0;
24970 }//Dbdih::initTable()
24971 
24972 /*************************************************************************/
24973 /*                                                                       */
24974 /*       MODULE: INIT_TABLE_FILES                                        */
24975 /*       DESCRIPTION: THE SUBROUTINE SETS UP THE FILES THAT REFERS TO THE*/
24976 /*       FILES THAT KEEP THE TABLE FRAGMENTATION DESCRIPTION.            */
24977 /*************************************************************************/
initTableFile(TabRecordPtr tabPtr)24978 void Dbdih::initTableFile(TabRecordPtr tabPtr)
24979 {
24980   Uint32 titfTmp;
24981   FileRecordPtr filePtr;
24982   seizeFile(filePtr);
24983   filePtr.p->tabRef = tabPtr.i;
24984   filePtr.p->fileType = FileRecord::TABLE_FILE;
24985   filePtr.p->reqStatus = FileRecord::IDLE;
24986   filePtr.p->fileStatus = FileRecord::CLOSED;
24987   tabPtr.p->tabFile[0] = filePtr.i;
24988   filePtr.p->fileName[0] = (Uint32)-1;  /* T DIRECTORY NOT USED  */
24989   filePtr.p->fileName[1] = (Uint32)-1;  /* F DIRECTORY NOT USED  */
24990   filePtr.p->fileName[2] = tabPtr.i;    /* Stid FILE NAME        */
24991   titfTmp = 1;  /* FILE NAME VERSION 1   */
24992   titfTmp = (titfTmp << 8) + 3; /* .FRAGLIST             */
24993   titfTmp = (titfTmp << 8) + 1; /* D1 DIRECTORY          */
24994   titfTmp = (titfTmp << 8) + 255;       /* P PART IGNORED        */
24995   filePtr.p->fileName[3] = titfTmp;
24996   /* --------------------------------------------------------------------- */
24997   /*       THE NAME BECOMES /D1/DBDICT/Stid.FRAGLIST                       */
24998   /* --------------------------------------------------------------------- */
24999   seizeFile(filePtr);
25000   filePtr.p->tabRef = tabPtr.i;
25001   filePtr.p->fileType = FileRecord::TABLE_FILE;
25002   filePtr.p->reqStatus = FileRecord::IDLE;
25003   filePtr.p->fileStatus = FileRecord::CLOSED;
25004   tabPtr.p->tabFile[1] = filePtr.i;
25005   filePtr.p->fileName[0] = (Uint32)-1;  /* T DIRECTORY NOT USED  */
25006   filePtr.p->fileName[1] = (Uint32)-1;  /* F DIRECTORY NOT USED  */
25007   filePtr.p->fileName[2] = tabPtr.i;    /* Stid FILE NAME        */
25008   titfTmp = 1;  /* FILE NAME VERSION 1   */
25009   titfTmp = (titfTmp << 8) + 3; /* .FRAGLIST             */
25010   titfTmp = (titfTmp << 8) + 2; /* D2 DIRECTORY          */
25011   titfTmp = (titfTmp << 8) + 255;       /* P PART IGNORED        */
25012   filePtr.p->fileName[3] = titfTmp;
25013   /* --------------------------------------------------------------------- */
25014   /*       THE NAME BECOMES /D2/DBDICT/Stid.FRAGLIST                       */
25015   /* --------------------------------------------------------------------- */
25016 }//Dbdih::initTableFile()
25017 
/**
 * Step-wise initialisation of DIH's record arrays and pools.
 *
 * Each invocation performs exactly one step and then sends CONTINUEB to
 * itself with stepNo + 1, giving a real-time break between the long
 * initialisation loops so the scheduler and watchdog are not starved.
 * The final step (9) replies READ_CONFIG_CONF to the requester instead
 * of continuing.
 *
 * @param signal  signal object (reused for CONTINUEB / READ_CONFIG_CONF)
 * @param stepNo  initialisation step to execute (0..9)
 * @param retRef  block reference that receives READ_CONFIG_CONF when done
 * @param retData senderData echoed back in READ_CONFIG_CONF
 */
void Dbdih::initialiseRecordsLab(Signal* signal,
				 Uint32 stepNo, Uint32 retRef, Uint32 retData)
{
  switch (stepNo) {
  case 0:
    jam();
    initCommonData();
    break;
  case 1:{
    jam();
    /* Point the DIVERIFY queue(s) at DBTC.  Queue 0 defaults to this
     * node's TC block reference; with multi-threaded TC every queue is
     * mapped to its own TC instance (instance numbers start at 1). */
    c_diverify_queue[0].m_ref = calcTcBlockRef(getOwnNodeId());
    for (Uint32 i = 0; i < c_diverify_queue_cnt; i++)
    {
      if (globalData.ndbMtTcThreads > 0)
      {
        c_diverify_queue[i].m_ref = numberToRef(DBTC, i + 1, 0);
      }
    }
    jam();
    break;
  }
  case 2:{
    ConnectRecordPtr connectPtr;
    jam();
    /****** CONNECT ******/
    /* Initialise every connect record and chain them into a free list
     * through nextPool, terminated with RNIL and anchored in
     * cfirstconnect. */
    for (connectPtr.i = 0; connectPtr.i < cconnectFileSize; connectPtr.i++) {
      refresh_watch_dog();
      ptrAss(connectPtr, connectRecord);
      connectPtr.p->userpointer = RNIL;
      connectPtr.p->userblockref = ZNIL;
      connectPtr.p->connectState = ConnectRecord::FREE;
      connectPtr.p->table = RNIL;
      connectPtr.p->nextPool = connectPtr.i + 1;
      bzero(connectPtr.p->nodes, sizeof(connectPtr.p->nodes));
    }//for
    connectPtr.i = cconnectFileSize - 1;
    ptrAss(connectPtr, connectRecord);
    connectPtr.p->nextPool = RNIL;
    cfirstconnect = 0;
    break;
  }
  case 3:
    {
      FileRecordPtr filePtr;
      jam();
      /******** INTIALIZING FILE RECORDS ********/
      /* Build the file-record free list (nextFile chain, RNIL
       * terminated), then create the two restorable-GCI system files. */
      for (filePtr.i = 0; filePtr.i < cfileFileSize; filePtr.i++) {
	ptrAss(filePtr, fileRecord);
	filePtr.p->nextFile = filePtr.i + 1;
	filePtr.p->fileStatus = FileRecord::CLOSED;
	filePtr.p->reqStatus = FileRecord::IDLE;
      }//for
      filePtr.i = cfileFileSize - 1;
      ptrAss(filePtr, fileRecord);
      filePtr.p->nextFile = RNIL;
      cfirstfreeFile = 0;
      initRestorableGciFiles();
      break;
    }
  case 4:
    jam();
    /* Fragment store records. */
    initialiseFragstore();
    break;
  case 5:
    {
      jam();
      /******* NODE GROUP RECORD ******/
      /******* NODE RECORD       ******/
      /* Reset every node group record to "empty, unregistered". */
      NodeGroupRecordPtr loopNGPtr;
      for (loopNGPtr.i = 0; loopNGPtr.i < MAX_NDB_NODE_GROUPS; loopNGPtr.i++) {
	ptrAss(loopNGPtr, nodeGroupRecord);
        loopNGPtr.p->nodesInGroup[0] = RNIL;
        loopNGPtr.p->nodesInGroup[1] = RNIL;
        loopNGPtr.p->nodesInGroup[2] = RNIL;
        loopNGPtr.p->nodesInGroup[3] = RNIL;
        loopNGPtr.p->nextReplicaNode = 0;
        loopNGPtr.p->nodeCount = 0;
        loopNGPtr.p->activeTakeOver = 0;
        loopNGPtr.p->activeTakeOverCount = 0;
        loopNGPtr.p->nodegroupIndex = RNIL;
        loopNGPtr.p->m_ref_count = 0;
        loopNGPtr.p->m_next_log_part = 0;
      }//for
      break;
    }
  case 6:
    {
      PageRecordPtr pagePtr;
      jam();
      /******* PAGE RECORD ******/
      /* Build the page-record free list, anchored in cfirstfreepage. */
      for (pagePtr.i = 0; pagePtr.i < cpageFileSize; pagePtr.i++) {
        refresh_watch_dog();
	ptrAss(pagePtr, pageRecord);
	pagePtr.p->nextfreepage = pagePtr.i + 1;
      }//for
      pagePtr.i = cpageFileSize - 1;
      ptrAss(pagePtr, pageRecord);
      pagePtr.p->nextfreepage = RNIL;
      cfirstfreepage = 0;
      break;
    }
  case 7:
    {
      ReplicaRecordPtr initReplicaPtr;
      jam();
      /******* REPLICA RECORD ******/
      /* Seize and immediately release each replica record so that the
       * pool entries are initialised and end up on the pool free list. */
      for (initReplicaPtr.i = 0; initReplicaPtr.i < creplicaFileSize;
	   initReplicaPtr.i++) {
        refresh_watch_dog();
        c_replicaRecordPool.seizeId(initReplicaPtr, initReplicaPtr.i);
	initReplicaPtr.p->lcpIdStarted = 0;
	initReplicaPtr.p->lcpOngoingFlag = false;
        c_replicaRecordPool.releaseLast(initReplicaPtr);
      }//for
      cnoFreeReplicaRec = creplicaFileSize;
      break;
    }
  case 8:
    {
      TabRecordPtr loopTabptr;
      jam();
      /********* TAB-DESCRIPTOR ********/
      /* Reset every table record to its idle state. */
      for (loopTabptr.i = 0; loopTabptr.i < ctabFileSize; loopTabptr.i++) {
	ptrAss(loopTabptr, tabRecord);
        refresh_watch_dog();
	initTable(loopTabptr);
      }//for
      break;
    }
  case 9:
    {
      jam();
      /* All steps done: confirm READ_CONFIG to the requester and stop
       * the CONTINUEB chain. */
      ReadConfigConf * conf = (ReadConfigConf*)signal->getDataPtrSend();
      conf->senderRef = reference();
      conf->senderData = retData;
      sendSignal(retRef, GSN_READ_CONFIG_CONF, signal,
		 ReadConfigConf::SignalLength, JBB);
      return;
      break;
    }
  default:
    ndbabort();
  }//switch
  jam();
  /* ---------------------------------------------------------------------- */
  /* SEND REAL-TIME BREAK DURING INIT OF VARIABLES DURING SYSTEM RESTART.   */
  /* ---------------------------------------------------------------------- */
  signal->theData[0] = DihContinueB::ZINITIALISE_RECORDS;
  signal->theData[1] = stepNo + 1;
  signal->theData[2] = retRef;
  signal->theData[3] = retData;
  sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
}//Dbdih::initialiseRecordsLab()
25171 
25172 /*************************************************************************/
25173 /*       INSERT THE NODE INTO THE LINKED LIST OF NODES INVOLVED ALL      */
25174 /*       DISTRIBUTED PROTOCOLS (EXCEPT GCP PROTOCOL THAT USES THE DIH    */
25175 /*       LINKED LIST INSTEAD).                                           */
25176 /*************************************************************************/
insertAlive(NodeRecordPtr newNodePtr)25177 void Dbdih::insertAlive(NodeRecordPtr newNodePtr)
25178 {
25179   NodeRecordPtr nodePtr;
25180 
25181   nodePtr.i = cfirstAliveNode;
25182   if (nodePtr.i == RNIL) {
25183     jam();
25184     cfirstAliveNode = newNodePtr.i;
25185   } else {
25186     do {
25187       ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
25188       if (nodePtr.p->nextNode == RNIL) {
25189         jam();
25190         nodePtr.p->nextNode = newNodePtr.i;
25191         break;
25192       } else {
25193         jam();
25194         nodePtr.i = nodePtr.p->nextNode;
25195       }//if
25196     } while (1);
25197   }//if
25198   newNodePtr.p->nextNode = RNIL;
25199 }//Dbdih::insertAlive()
25200 
25201 /**
25202  * RCU lock must be held on table while calling this method when
25203  * not in recovery.
25204  */
insertBackup(FragmentstorePtr fragPtr,Uint32 nodeId)25205 void Dbdih::insertBackup(FragmentstorePtr fragPtr, Uint32 nodeId)
25206 {
25207   for (Uint32 i = fragPtr.p->fragReplicas; i > 1; i--) {
25208     jam();
25209     ndbrequire(i < MAX_REPLICAS && i > 0);
25210     fragPtr.p->activeNodes[i] = fragPtr.p->activeNodes[i - 1];
25211   }//for
25212   fragPtr.p->activeNodes[1] = nodeId;
25213   fragPtr.p->fragReplicas++;
25214 }//Dbdih::insertBackup()
25215 
insertDeadNode(NodeRecordPtr newNodePtr)25216 void Dbdih::insertDeadNode(NodeRecordPtr newNodePtr)
25217 {
25218   NodeRecordPtr nodePtr;
25219 
25220   nodePtr.i = cfirstDeadNode;
25221   if (nodePtr.i == RNIL) {
25222     jam();
25223     cfirstDeadNode = newNodePtr.i;
25224   } else {
25225     do {
25226       jam();
25227       ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
25228       if (nodePtr.p->nextNode == RNIL) {
25229         jam();
25230         nodePtr.p->nextNode = newNodePtr.i;
25231         break;
25232       } else {
25233         jam();
25234         nodePtr.i = nodePtr.p->nextNode;
25235       }//if
25236     } while (1);
25237   }//if
25238   newNodePtr.p->nextNode = RNIL;
25239 }//Dbdih::insertDeadNode()
25240 
linkOldStoredReplica(FragmentstorePtr fragPtr,ReplicaRecordPtr replicatePtr)25241 void Dbdih::linkOldStoredReplica(FragmentstorePtr fragPtr,
25242                                  ReplicaRecordPtr replicatePtr)
25243 {
25244   ReplicaRecordPtr losReplicaPtr;
25245 
25246   replicatePtr.p->nextPool = RNIL;
25247   fragPtr.p->noOldStoredReplicas++;
25248   losReplicaPtr.i = fragPtr.p->oldStoredReplicas;
25249   if (losReplicaPtr.i == RNIL) {
25250     jam();
25251     fragPtr.p->oldStoredReplicas = replicatePtr.i;
25252     return;
25253   }//if
25254   c_replicaRecordPool.getPtr(losReplicaPtr);
25255   while (losReplicaPtr.p->nextPool != RNIL) {
25256     jam();
25257     losReplicaPtr.i = losReplicaPtr.p->nextPool;
25258     c_replicaRecordPool.getPtr(losReplicaPtr);
25259   }//if
25260   losReplicaPtr.p->nextPool = replicatePtr.i;
25261 }//Dbdih::linkOldStoredReplica()
25262 
linkStoredReplica(FragmentstorePtr fragPtr,ReplicaRecordPtr replicatePtr)25263 void Dbdih::linkStoredReplica(FragmentstorePtr fragPtr,
25264                               ReplicaRecordPtr replicatePtr)
25265 {
25266   ReplicaRecordPtr lsrReplicaPtr;
25267 
25268   fragPtr.p->noStoredReplicas++;
25269   replicatePtr.p->nextPool = RNIL;
25270   lsrReplicaPtr.i = fragPtr.p->storedReplicas;
25271   if (fragPtr.p->storedReplicas == RNIL) {
25272     jam();
25273     fragPtr.p->storedReplicas = replicatePtr.i;
25274     return;
25275   }//if
25276   c_replicaRecordPool.getPtr(lsrReplicaPtr);
25277   while (lsrReplicaPtr.p->nextPool != RNIL) {
25278     jam();
25279     lsrReplicaPtr.i = lsrReplicaPtr.p->nextPool;
25280     c_replicaRecordPool.getPtr(lsrReplicaPtr);
25281   }//if
25282   lsrReplicaPtr.p->nextPool = replicatePtr.i;
25283 }//Dbdih::linkStoredReplica()
25284 
25285 /*************************************************************************/
25286 /*        MAKE NODE GROUPS BASED ON THE LIST OF NODES RECEIVED FROM CNTR */
25287 /*************************************************************************/
25288 void
add_nodegroup(NodeGroupRecordPtr NGPtr)25289 Dbdih::add_nodegroup(NodeGroupRecordPtr NGPtr)
25290 {
25291   if (NGPtr.p->nodegroupIndex == RNIL)
25292   {
25293     jam();
25294     NGPtr.p->nodegroupIndex = cnoOfNodeGroups;
25295     c_node_groups[cnoOfNodeGroups++] = NGPtr.i;
25296   }
25297 }
25298 
25299 void
inc_ng_refcount(Uint32 i)25300 Dbdih::inc_ng_refcount(Uint32 i)
25301 {
25302   NodeGroupRecordPtr NGPtr;
25303   NGPtr.i = i;
25304   ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
25305   NGPtr.p->m_ref_count++;
25306 }
25307 
25308 void
dec_ng_refcount(Uint32 i)25309 Dbdih::dec_ng_refcount(Uint32 i)
25310 {
25311   NodeGroupRecordPtr NGPtr;
25312   NGPtr.i = i;
25313   ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
25314   ndbrequire(NGPtr.p->m_ref_count);
25315   NGPtr.p->m_ref_count--;
25316 }
25317 
/**
 * Build the node group structures from the RNIL-terminated list of
 * nodes received from NDBCNTR, update SYSFILE node group/status info
 * and verify that every used node group has at least one alive node.
 *
 * Phases:
 *  1) Nodes with a configured node group are placed in that group;
 *     nodes configured with NDB_NO_NODEGROUP get group ZNIL (no group).
 *  2) Remaining nodes (nodeGroup == RNIL) are filled round-robin into
 *     the first node groups that still have fewer than cnoReplicas
 *     members.
 *  3) Sanity check: every registered group has 0 or exactly cnoReplicas
 *     members.
 *  4) SYSFILE->nodeGroups / nodeStatus are initialised per node.
 *  5) Initial-start requirement: each used group must contain at least
 *     one alive node, otherwise the node shuts down.
 *
 * @param nodeArray  RNIL-terminated array of node ids to assign
 */
void Dbdih::makeNodeGroups(Uint32 nodeArray[])
{
  NodeGroupRecordPtr NGPtr;
  NodeRecordPtr mngNodeptr;
  Uint32 j;

  /**-----------------------------------------------------------------------
   * ASSIGN ALL ACTIVE NODES INTO NODE GROUPS. HOT SPARE NODES ARE ASSIGNED
   * TO NODE GROUP ZNIL
   *-----------------------------------------------------------------------*/
  cnoOfNodeGroups = 0;
  /* Phase 1: honour explicitly configured node groups. */
  for (Uint32 i = 0; nodeArray[i] != RNIL; i++)
  {
    jam();
    mngNodeptr.i = nodeArray[i];
    ptrCheckGuard(mngNodeptr, MAX_NDB_NODES, nodeRecord);
    if (mngNodeptr.p->nodeGroup == NDB_NO_NODEGROUP)
    {
      jam();
      /* Node configured to be outside all node groups. */
      mngNodeptr.p->nodeGroup = ZNIL;
      g_eventLogger->info("setting nodeGroup = ZNIL for node %u",
                          mngNodeptr.i);
    }
    else if (mngNodeptr.p->nodeGroup != RNIL)
    {
      jam();
      NGPtr.i = mngNodeptr.p->nodeGroup;
      ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
      arrGuard(NGPtr.p->nodeCount, MAX_REPLICAS);
      NGPtr.p->nodesInGroup[NGPtr.p->nodeCount++] = mngNodeptr.i;

      add_nodegroup(NGPtr);
    }
  }
  /* Find the first node group that is not yet full (< cnoReplicas). */
  NGPtr.i = 0;
  for (; NGPtr.i < MAX_NDB_NODE_GROUPS; NGPtr.i++)
  {
    jam();
    ptrAss(NGPtr, nodeGroupRecord);
    if (NGPtr.p->nodeCount < cnoReplicas)
      break;
  }

  /* Phase 2: place nodes without a configured group into the first
   * non-full group; advance to the next non-full group whenever the
   * current one reaches cnoReplicas members. */
  for (Uint32 i = 0; nodeArray[i] != RNIL; i++)
  {
    jam();
    mngNodeptr.i = nodeArray[i];
    ptrCheckGuard(mngNodeptr, MAX_NDB_NODES, nodeRecord);
    if (mngNodeptr.p->nodeGroup == RNIL)
    {
      mngNodeptr.p->nodeGroup = NGPtr.i;
      NGPtr.p->nodesInGroup[NGPtr.p->nodeCount++] = mngNodeptr.i;

      add_nodegroup(NGPtr);

      if (NGPtr.p->nodeCount == cnoReplicas)
      {
        jam();
        for (; NGPtr.i < MAX_NDB_NODE_GROUPS; NGPtr.i++)
        {
          jam();
          ptrAss(NGPtr, nodeGroupRecord);
          if (NGPtr.p->nodeCount < cnoReplicas)
            break;
        }
      }
    }
  }

  /* Phase 3: each registered group must be empty or exactly full.
   * NOTE(review): maxNG is computed but not used later in this
   * function - looks like a leftover; confirm before removing. */
  Uint32 maxNG = 0;
  for (Uint32 i = 0; i<cnoOfNodeGroups; i++)
  {
    jam();
    NGPtr.i = c_node_groups[i];
    ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
    if (NGPtr.p->nodeCount == 0)
    {
      jam();
    }
    else if (NGPtr.p->nodeCount != cnoReplicas)
    {
      ndbabort();
    }
    else
    {
      if (NGPtr.i > maxNG)
      {
        maxNG = NGPtr.i;
      }
    }
  }

  ndbrequire(csystemnodes < MAX_NDB_NODES);

  /**
   * Init sysfile
   */

  SYSFILE->initSysFile(SYSFILE->nodeStatus, SYSFILE->nodeGroups);

  /* Phase 4: record each node's group and active status in SYSFILE. */
  for (Uint32 i = 0; nodeArray[i] != RNIL; i++)
  {
    jam();
    Uint32 nodeId = mngNodeptr.i = nodeArray[i];
    ptrCheckGuard(mngNodeptr, MAX_NDB_NODES, nodeRecord);

    if (mngNodeptr.p->nodeGroup != ZNIL)
    {
      jam();
      Sysfile::setNodeGroup(nodeId, SYSFILE->nodeGroups,
                            mngNodeptr.p->nodeGroup);

      if (mngNodeptr.p->nodeStatus == NodeRecord::ALIVE)
      {
        jam();
        mngNodeptr.p->activeStatus = Sysfile::NS_Active;
      }
      else
      {
        jam();
        mngNodeptr.p->activeStatus = Sysfile::NS_NotActive_NotTakenOver;
      }
    }
    else
    {
      jam();
      /* Node outside all node groups: configured but not active. */
      Sysfile::setNodeGroup(mngNodeptr.i, SYSFILE->nodeGroups,
                            NO_NODE_GROUP_ID);
      mngNodeptr.p->activeStatus = Sysfile::NS_Configured;
    }
    Sysfile::setNodeStatus(nodeId, SYSFILE->nodeStatus,
                           mngNodeptr.p->activeStatus);
  }

  /* Phase 5: an initial start is illegal unless every used node group
   * contains at least one alive node. */
  for (Uint32 i = 0; i<cnoOfNodeGroups; i++)
  {
    jam();
    bool alive = false;
    NodeGroupRecordPtr NGPtr;
    NGPtr.i = c_node_groups[i];
    ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
    for (j = 0; j<NGPtr.p->nodeCount; j++)
    {
      jam();
      mngNodeptr.i = NGPtr.p->nodesInGroup[j];
      ptrCheckGuard(mngNodeptr, MAX_NDB_NODES, nodeRecord);
      if (checkNodeAlive(NGPtr.p->nodesInGroup[j]))
      {
	alive = true;
	break;
      }
    }

    if (!alive)
    {
      char buf[255];
      BaseString::snprintf
        (buf, sizeof(buf),
         "Illegal initial start, no alive node in nodegroup %u", i);
      progError(__LINE__,
                NDBD_EXIT_INSUFFICENT_NODES,
                buf);
    }
  }
}//Dbdih::makeNodeGroups()
25483 
25484 /**
25485  * On node failure QMGR asks DIH about node groups.  This is
25486  * a direct signal (function call in same process).  Input is
25487  * bitmask of surviving nodes.  The routine is not concerned
25488  * about node count.  Reply is one of:
25489  * 1) win - we can survive, and nobody else can
25490  * 2) lose - we cannot survive
25491  * 3) partition - we can survive but there could be others
25492  */
/**
 * Answer CHECKNODEGROUPSREQ (see banner comment above for the
 * arbitration semantics).  May be invoked as a direct signal
 * (EXECUTE_DIRECT, bitmask inlined in the signal object) or as an
 * ordinary signal where the node bitmask arrives in section 0.
 */
void Dbdih::execCHECKNODEGROUPSREQ(Signal* signal)
{
  jamNoBlock();
  CheckNodeGroups* sd = (CheckNodeGroups*)&signal->theData[0];
  bool direct = (sd->requestType & CheckNodeGroups::Direct);

  if (!direct)
  {
    /**
     * Handle NDB node bitmask now arriving in section to handle
     * very many data nodes. Only necessary to handle this when
     * signal isn't direct. For direct signals the signal object
     * is large enough to contain the entire bitmask.
     */
    jamNoBlock();
    Uint32 *node_bitmask =
      (Uint32*)&signal->theData[CheckNodeGroups::SignalLength];
    ndbrequire(signal->getNoOfSections() == 1);
    SegmentedSectionPtr ptr;
    SectionHandle handle(this, signal);
    handle.getSection(ptr, 0);
    ndbrequire(ptr.sz <= NdbNodeBitmask::Size);
    /* Zero-fill first: the section may be shorter than a full mask. */
    memset(node_bitmask,
           0,
           NdbNodeBitmask::Size * sizeof(Uint32));
    copy(node_bitmask, ptr);
    sd->mask.assign(NdbNodeBitmask::Size, node_bitmask);
    releaseSections(handle);
  }

  bool ok = false;
  switch(sd->requestType & ~CheckNodeGroups::Direct){
  case CheckNodeGroups::ArbitCheck:{
    /* Count, over all node groups, how many groups have no surviving
     * member (missall) and how many are complete (haveall):
     *   any group fully dead  -> Lose
     *   all groups complete   -> Win
     *   otherwise             -> Partitioning (another side may survive)
     */
    ok = true;
    jamNoBlock();
    unsigned missall = 0;
    unsigned haveall = 0;
    for (Uint32 i = 0; i < cnoOfNodeGroups; i++) {
      jamNoBlock();
      NodeGroupRecordPtr ngPtr;
      ngPtr.i = c_node_groups[i];
      ptrCheckGuard(ngPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
      Uint32 count = 0;
      for (Uint32 j = 0; j < ngPtr.p->nodeCount; j++) {
	jamNoBlock();
	Uint32 nodeId = ngPtr.p->nodesInGroup[j];
	if (sd->mask.get(nodeId)) {
	  jamNoBlock();
	  count++;
	}//if
      }//for
      if (count == 0) {
	jamNoBlock();
	missall++;
      }//if
      if (count == ngPtr.p->nodeCount) {
	haveall++;
      }//if
    }//for

    if (missall) {
      jamNoBlock();
      sd->output = CheckNodeGroups::Lose;
    } else if (haveall) {
      jamNoBlock();
      sd->output = CheckNodeGroups::Win;
    } else {
      jamNoBlock();
      sd->output = CheckNodeGroups::Partitioning;
    }//if
  }
    break;
  case CheckNodeGroups::GetNodeGroup:{
    /* Return this node's own node group (RNIL if it has none). */
    ok = true;
    Uint32 ng = Sysfile::getNodeGroup(getOwnNodeId(), SYSFILE->nodeGroups);
    if (ng == NO_NODE_GROUP_ID)
      ng = RNIL;
    sd->output = ng;
    break;
  }
  case CheckNodeGroups::GetNodeGroupMembers: {
    /* Return sd->nodeId's node group and the bitmask of its members. */
    ok = true;
    Uint32 ng = Sysfile::getNodeGroup(sd->nodeId, SYSFILE->nodeGroups);
    DEB_MULTI_TRP(("My node group is %u", ng));
    if (ng == NO_NODE_GROUP_ID)
      ng = RNIL;

    sd->output = ng;
    sd->mask.clear();

    NodeGroupRecordPtr ngPtr;
    ngPtr.i = ng;
    if (ngPtr.i != RNIL)
    {
      jamNoBlock();
      ptrAss(ngPtr, nodeGroupRecord);
      DEB_MULTI_TRP(("%u nodes in node group", ngPtr.p->nodeCount));
      for (Uint32 j = 0; j < ngPtr.p->nodeCount; j++) {
        jamNoBlock();
        DEB_MULTI_TRP(("Node %u is in same node group",
                       ngPtr.p->nodesInGroup[j]));
        sd->mask.set(ngPtr.p->nodesInGroup[j]);
      }
    }
    break;
  }
  case CheckNodeGroups::GetDefaultFragments:
    /* Default fragment count given current node groups (plus any extra
     * node groups the caller anticipates). */
    jamNoBlock();
    ok = true;
    sd->output = getFragmentCount(sd->partitionBalance,
                                  cnoOfNodeGroups + sd->extraNodeGroups,
                                  cnoReplicas,
                                  getFragmentsPerNode());
    break;
  case CheckNodeGroups::GetDefaultFragmentsFullyReplicated:
    /* Fully replicated tables size as if there were one node group. */
    jamNoBlock();
    ok = true;
    sd->output = getFragmentCount(sd->partitionBalance,
                                  1,
                                  cnoReplicas,
                                  getFragmentsPerNode());
    break;
  }
  ndbrequire(ok);

  if (!direct)
  {
    /* Send node bitmask in section for non-direct signals */
    LinearSectionPtr lsptr[3];
    lsptr[0].p = sd->mask.rep.data;
    lsptr[0].sz = sd->mask.getPackedLengthInWords();
    sendSignal(sd->blockRef,
               GSN_CHECKNODEGROUPSCONF,
               signal,
	       CheckNodeGroups::SignalLengthNoBitmask,
               JBB,
               lsptr,
               1);
  }
}//Dbdih::execCHECKNODEGROUPSREQ()
25633 
25634 Uint32
getFragmentCount(Uint32 partitionBalance,Uint32 numOfNodeGroups,Uint32 numOfReplicas,Uint32 numOfLDMs) const25635 Dbdih::getFragmentCount(Uint32 partitionBalance,
25636                         Uint32 numOfNodeGroups,
25637                         Uint32 numOfReplicas,
25638                         Uint32 numOfLDMs) const
25639 {
25640   switch (partitionBalance)
25641   {
25642   case NDB_PARTITION_BALANCE_FOR_RP_BY_LDM:
25643     return numOfNodeGroups * numOfReplicas * numOfLDMs;
25644   case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM:
25645     return numOfNodeGroups * numOfLDMs;
25646   case NDB_PARTITION_BALANCE_FOR_RP_BY_NODE:
25647     return numOfNodeGroups * numOfReplicas;
25648   case NDB_PARTITION_BALANCE_FOR_RA_BY_NODE:
25649     return numOfNodeGroups;
25650   case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM_X_2:
25651     return numOfNodeGroups * numOfLDMs * 2;
25652   case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM_X_3:
25653     return numOfNodeGroups * numOfLDMs * 3;
25654   case NDB_PARTITION_BALANCE_FOR_RA_BY_LDM_X_4:
25655     return numOfNodeGroups * numOfLDMs * 4;
25656 
25657   case NDB_PARTITION_BALANCE_SPECIFIC:
25658   default:
25659     ndbabort();
25660     return 0;
25661   }
25662 }
25663 
25664 void
makePrnList(ReadNodesConf * readNodes,Uint32 nodeArray[])25665 Dbdih::makePrnList(ReadNodesConf * readNodes, Uint32 nodeArray[])
25666 {
25667   cfirstAliveNode = RNIL;
25668   ndbrequire(con_lineNodes > 0);
25669   ndbrequire(csystemnodes < MAX_NDB_NODES);
25670   for (Uint32 i = 0; i < csystemnodes; i++) {
25671     NodeRecordPtr nodePtr;
25672     jam();
25673     nodePtr.i = nodeArray[i];
25674     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
25675     initNodeRecord(nodePtr);
25676     if (readNodes->inactiveNodes.get(nodePtr.i) == false)
25677     {
25678       jam();
25679       nodePtr.p->nodeStatus = NodeRecord::ALIVE;
25680       nodePtr.p->useInTransactions = true;
25681       nodePtr.p->copyCompleted = 1;
25682       nodePtr.p->m_inclDihLcp = true;
25683       insertAlive(nodePtr);
25684     } else {
25685       jam();
25686       nodePtr.p->nodeStatus = NodeRecord::DEAD;
25687       insertDeadNode(nodePtr);
25688     }//if
25689   }//for
25690 }//Dbdih::makePrnList()
25691 
25692 /*************************************************************************/
25693 /*       A NEW CRASHED REPLICA IS ADDED BY A NODE FAILURE.               */
25694 /*************************************************************************/
/**
 * Record a node failure on a replica by closing its current redo
 * interval: replicaLastGci[n] is set to the failed node's last
 * completed GCI and a fresh, open interval is initialised on top.
 *
 * @param ncrReplicaPtr  replica whose owning node has failed
 */
void Dbdih::newCrashedReplica(ReplicaRecordPtr ncrReplicaPtr)
{
  /*----------------------------------------------------------------------*/
  /*       SET THE REPLICA_LAST_GCI OF THE CRASHED REPLICA TO LAST GCI    */
  /*       EXECUTED BY THE FAILED NODE.                                   */
  /*----------------------------------------------------------------------*/
  /*       WE HAVE A NEW CRASHED REPLICA. INITIATE CREATE GCI TO INDICATE */
  /*       THAT THE NEW REPLICA IS NOT STARTED YET AND REPLICA_LAST_GCI IS*/
  /*       SET TO -1 TO INDICATE THAT IT IS NOT DEAD YET.                 */
  /*----------------------------------------------------------------------*/
  Uint32 nodeId = ncrReplicaPtr.p->procNode;
  Uint32 lastGCI = SYSFILE->lastCompletedGCI[nodeId];
  /* If the interval arrays are about to fill up, drop the oldest
   * interval first to make room for the new one. */
  if (ncrReplicaPtr.p->noCrashedReplicas + 1 == MAX_CRASHED_REPLICAS)
  {
    jam();
    packCrashedReplicas(ncrReplicaPtr);
  }

  Uint32 noCrashedReplicas = ncrReplicaPtr.p->noCrashedReplicas;
  arrGuardErr(ncrReplicaPtr.p->noCrashedReplicas + 1, MAX_CRASHED_REPLICAS,
              NDBD_EXIT_MAX_CRASHED_REPLICAS);

  if (noCrashedReplicas > 0 &&
      ncrReplicaPtr.p->replicaLastGci[noCrashedReplicas - 1] == lastGCI)
  {
    jam();
    /**
     * Don't add another redo-interval, that already exist
     *  instead initalize new
     */
    ncrReplicaPtr.p->createGci[ncrReplicaPtr.p->noCrashedReplicas] =
      ZINIT_CREATE_GCI;
    ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] =
      ZINIT_REPLICA_LAST_GCI;
  }
  else if (ncrReplicaPtr.p->createGci[noCrashedReplicas] <= lastGCI)
  {
    jam();
    /* Normal case: close the open interval at lastGCI and open a new,
     * uninitialised interval above it. */
    ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] =
      lastGCI;
    ncrReplicaPtr.p->noCrashedReplicas = ncrReplicaPtr.p->noCrashedReplicas + 1;
    ncrReplicaPtr.p->createGci[ncrReplicaPtr.p->noCrashedReplicas] =
      ZINIT_CREATE_GCI;
    ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] =
      ZINIT_REPLICA_LAST_GCI;
  }
  else
  {
    jam();
    /**
     * This can happen if createGci is set
     *   (during sendUpdateFragStateReq(COMMIT_STORED))
     *   but SYSFILE->lastCompletedGCI[nodeId] has not been updated
     *   as node has not yet completed it's first LCP, causing it to return
     *   GCP_SAVEREF (which makes SYSFILE->lastCompletedGCI[nodeId] be left
     *   untouched)
     *
     * I.e crash during node-restart
     */
    ncrReplicaPtr.p->createGci[noCrashedReplicas] = ZINIT_CREATE_GCI;
  }

}//Dbdih::newCrashedReplica()
25758 
25759 /*************************************************************************/
25760 /*       AT NODE FAILURE DURING START OF A NEW NODE WE NEED TO RESET A   */
25761 /*       SET OF VARIABLES CONTROLLING THE START AND INDICATING ONGOING   */
25762 /*       START OF A NEW NODE.                                            */
25763 /*************************************************************************/
nodeResetStart(Signal * signal)25764 void Dbdih::nodeResetStart(Signal *signal)
25765 {
25766   jam();
25767   Uint32 startGCP = c_nodeStartMaster.blockGcp;
25768 
25769   c_nodeStartSlave.nodeId = 0;
25770   c_nodeStartMaster.startNode = RNIL;
25771   c_nodeStartMaster.failNr = cfailurenr;
25772   c_nodeStartMaster.activeState = false;
25773   c_nodeStartMaster.blockGcp = 0;
25774   c_nodeStartMaster.m_outstandingGsn = 0;
25775 
25776   if (startGCP == 2) // effective
25777   {
25778     jam();
25779     ndbrequire(isMaster());
25780     ndbrequire(m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_IDLE);
25781     signal->theData[0] = DihContinueB::ZSTART_GCP;
25782     sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);
25783   }
25784 }//Dbdih::nodeResetStart()
25785 
openFileRw(Signal * signal,FileRecordPtr filePtr)25786 void Dbdih::openFileRw(Signal* signal, FileRecordPtr filePtr)
25787 {
25788   signal->theData[0] = reference();
25789   signal->theData[1] = filePtr.i;
25790   signal->theData[2] = filePtr.p->fileName[0];
25791   signal->theData[3] = filePtr.p->fileName[1];
25792   signal->theData[4] = filePtr.p->fileName[2];
25793   signal->theData[5] = filePtr.p->fileName[3];
25794   signal->theData[6] = FsOpenReq::OM_READWRITE;
25795   sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, 7, JBA);
25796 }//Dbdih::openFileRw()
25797 
/**
 * Request NDBFS to open the file described by filePtr in read-only
 * mode.  The four fileName words are forwarded to FSOPENREQ as-is and
 * the file record index is passed back as user pointer.
 */
void Dbdih::openFileRo(Signal* signal, FileRecordPtr filePtr)
{
  signal->theData[0] = reference();
  signal->theData[1] = filePtr.i;
  signal->theData[2] = filePtr.p->fileName[0];
  signal->theData[3] = filePtr.p->fileName[1];
  signal->theData[4] = filePtr.p->fileName[2];
  signal->theData[5] = filePtr.p->fileName[3];
  signal->theData[6] = FsOpenReq::OM_READONLY;
  sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, 7, JBA);
}//Dbdih::openFileRo()
25809 
25810 /*************************************************************************/
25811 /*       REMOVE A CRASHED REPLICA BY PACKING THE ARRAY OF CREATED GCI AND*/
25812 /*       THE LAST GCI OF THE CRASHED REPLICA.                            */
25813 /*************************************************************************/
/**
 * Remove the oldest crashed-replica interval (slot 0) by shifting the
 * createGci/replicaLastGci arrays one step down, then re-initialise the
 * slot vacated at the tail.
 */
void Dbdih::packCrashedReplicas(ReplicaRecordPtr replicaPtr)
{
  ndbrequire(replicaPtr.p->noCrashedReplicas > 0);
  ndbrequire(replicaPtr.p->noCrashedReplicas <= MAX_CRASHED_REPLICAS);
  for (Uint32 i = 0; i < replicaPtr.p->noCrashedReplicas; i++) {
    jam();
    /* Shift entry i+1 into slot i.  NOTE(review): on the last iteration
     * this reads index noCrashedReplicas, so the arrays must hold at
     * least MAX_CRASHED_REPLICAS + 1 entries — confirm against the
     * ReplicaRecord declaration. */
    replicaPtr.p->createGci[i] = replicaPtr.p->createGci[i + 1];
    replicaPtr.p->replicaLastGci[i] = replicaPtr.p->replicaLastGci[i + 1];
  }//for
  replicaPtr.p->noCrashedReplicas--;
  /* After the decrement, noCrashedReplicas + 1 is the first slot beyond
   * the shifted data; reset it to the "unused" initial values. */
  replicaPtr.p->createGci[replicaPtr.p->noCrashedReplicas + 1] =
    ZINIT_CREATE_GCI;
  replicaPtr.p->replicaLastGci[replicaPtr.p->noCrashedReplicas + 1] =
    ZINIT_REPLICA_LAST_GCI;
}//Dbdih::packCrashedReplicas()
25829 
void
Dbdih::mergeCrashedReplicas(ReplicaRecordPtr replicaPtr)
{
  /**
   * Merge adjacent redo-intervals: working backwards from the newest
   * crashed-replica entry, if an interval starts exactly one GCI after
   * the previous interval ended, fold it into the previous one and
   * clear the now-unused slot.  Stops at the first gap found.
   */
  jam();
  jamLine(Uint16(replicaPtr.p->noCrashedReplicas));
  for (Uint32 i = replicaPtr.p->noCrashedReplicas; i > 0; i--)
  {
    if (replicaPtr.p->createGci[i] == 1 + replicaPtr.p->replicaLastGci[i-1])
    {
      jam();
      /* Contiguous: extend interval i-1 to cover interval i. */
      replicaPtr.p->replicaLastGci[i-1] = replicaPtr.p->replicaLastGci[i];
      replicaPtr.p->createGci[i] = ZINIT_CREATE_GCI;
      replicaPtr.p->replicaLastGci[i] = ZINIT_REPLICA_LAST_GCI;
      replicaPtr.p->noCrashedReplicas--;
    }
    else
    {
      jam();
      break;
    }
  }
}
25855 
/**
 * Move the whole list of stored replicas of a fragment onto the front
 * of its old-stored-replica list, leaving the stored list empty.
 * Used at (system) restart where all known replicas are initially
 * "old" until proven restorable.
 */
void Dbdih::prepareReplicas(FragmentstorePtr fragPtr)
{
  ReplicaRecordPtr prReplicaPtr;
  Uint32 prevReplica = RNIL;

  /* --------------------------------------------------------------------- */
  /*       BEGIN BY LINKING ALL REPLICA RECORDS ONTO THE OLD STORED REPLICA*/
  /*       LIST.                                                           */
  /*       AT A SYSTEM RESTART OBVIOUSLY ALL NODES ARE OLD.                */
  /* --------------------------------------------------------------------- */
  /* Walk to the tail of the stored-replica list; prevReplica ends up as
   * the last record, or RNIL if the list was empty. */
  prReplicaPtr.i = fragPtr.p->storedReplicas;
  while (prReplicaPtr.i != RNIL) {
    jam();
    prevReplica = prReplicaPtr.i;
    c_replicaRecordPool.getPtr(prReplicaPtr);
    prReplicaPtr.i = prReplicaPtr.p->nextPool;
  }//while
  /* --------------------------------------------------------------------- */
  /*       LIST OF STORED REPLICAS WILL BE EMPTY NOW.                      */
  /* --------------------------------------------------------------------- */
  if (prevReplica != RNIL) {
    /* Splice: tail of stored list points at old list's head, then the
     * whole stored chain becomes the new old list. */
    prReplicaPtr.i = prevReplica;
    c_replicaRecordPool.getPtr(prReplicaPtr);
    prReplicaPtr.p->nextPool = fragPtr.p->oldStoredReplicas;
    fragPtr.p->oldStoredReplicas = fragPtr.p->storedReplicas;
    fragPtr.p->storedReplicas = RNIL;
    fragPtr.p->noOldStoredReplicas += fragPtr.p->noStoredReplicas;
    fragPtr.p->noStoredReplicas = 0;
  }//if
}//Dbdih::prepareReplicas()
25886 
/**
 * Deserialize one fragment definition from the table-description pages
 * (via readPageWord) into fragPtr: preferred primary, replica counts,
 * distribution key and log part id.  Malformed input crashes the node
 * via ndbrequire.
 */
void Dbdih::readFragment(RWFragment* rf, FragmentstorePtr fragPtr)
{
  Uint32 TreadFid = readPageWord(rf);
  fragPtr.p->preferredPrimary = readPageWord(rf);
  fragPtr.p->noStoredReplicas = readPageWord(rf);
  fragPtr.p->noOldStoredReplicas = readPageWord(rf);
  Uint32 TdistKey = readPageWord(rf);

  /* Sanity: the fragment id stored on disk must match the one we are
   * reading, and the distribution key must fit in 8 bits. */
  ndbrequire(fragPtr.p->noStoredReplicas > 0);
  ndbrequire(TreadFid == rf->fragId);
  ndbrequire(TdistKey < 256);
  fragPtr.p->distributionKey = TdistKey;

  fragPtr.p->m_log_part_id = readPageWord(rf);

  /* Older nodes stored unlimited log part ids in the fragment definition,
   * now we constrain them to a valid range of actual values for this node.
   * Here we ensure that unlimited log part ids fit in the value range for
   * this node.
   */
  ndbrequire(globalData.ndbLogParts <= NDBMT_MAX_WORKER_INSTANCES);

  fragPtr.p->m_log_part_id %= globalData.ndbLogParts;

  ndbrequire(fragPtr.p->m_log_part_id < NDBMT_MAX_WORKER_INSTANCES);

  /* Account this fragment against its node group's reference count. */
  inc_ng_refcount(getNodeGroup(fragPtr.p->preferredPrimary));
}//Dbdih::readFragment()
25915 
/**
 * Return the next word of a serialized table description, advancing to
 * the next page when the current one is exhausted (2048 words/page).
 * On a continuation page reading resumes at word 32; words 0-31 are
 * presumably a page header — TODO confirm against the writer side.
 */
Uint32 Dbdih::readPageWord(RWFragment* rf)
{
  if (rf->wordIndex >= 2048) {
    jam();
    /* Must land exactly on the page boundary, never past it. */
    ndbrequire(rf->wordIndex == 2048);
    rf->pageIndex++;
    ndbrequire(rf->pageIndex < NDB_ARRAY_SIZE(rf->rwfTabPtr.p->pageRef));
    rf->rwfPageptr.i = rf->rwfTabPtr.p->pageRef[rf->pageIndex];
    ptrCheckGuard(rf->rwfPageptr, cpageFileSize, pageRecord);
    rf->wordIndex = 32;
  }//if
  Uint32 dataWord = rf->rwfPageptr.p->word[rf->wordIndex];
  rf->wordIndex++;
  return dataWord;
}//Dbdih::readPageWord()
25931 
/**
 * Deserialize one replica record from the table-description pages:
 * owning node, initial GCI, crashed-replica count, LCP history and the
 * crashed-replica GCI intervals.  All MAX_CRASHED_REPLICAS slots are
 * read from disk even beyond noCrashedReplicas, keeping the read
 * position in sync with the fixed-size on-disk layout.
 */
void Dbdih::readReplica(RWFragment* rf, ReplicaRecordPtr readReplicaPtr)
{
  Uint32 i;
  readReplicaPtr.p->procNode = readPageWord(rf);
  readReplicaPtr.p->initialGci = readPageWord(rf);
  readReplicaPtr.p->noCrashedReplicas = readPageWord(rf);
  readReplicaPtr.p->nextLcp = readPageWord(rf);

  /**
   * Initialise LCP inclusion data, this is to enable us to be included
   * in an LCP during a node restart.
   */
  readReplicaPtr.p->fragId = rf->fragId;
  readReplicaPtr.p->tableId = rf->rwfTabPtr.i;
  readReplicaPtr.p->lcpOngoingFlag = false;

  for (i = 0; i < MAX_LCP_STORED; i++) {
    readReplicaPtr.p->maxGciCompleted[i] = readPageWord(rf);
    readReplicaPtr.p->maxGciStarted[i] = readPageWord(rf);
    readReplicaPtr.p->lcpId[i] = readPageWord(rf);
    readReplicaPtr.p->lcpStatus[i] = readPageWord(rf);
  }//for
  const Uint32 noCrashedReplicas = readReplicaPtr.p->noCrashedReplicas;
  /* Strictly less than: the last slot is reserved (see
   * packCrashedReplicas which touches index noCrashedReplicas + 1). */
  ndbrequire(noCrashedReplicas < MAX_CRASHED_REPLICAS);
  for (i = 0; i < noCrashedReplicas; i++) {
    readReplicaPtr.p->createGci[i] = readPageWord(rf);
    readReplicaPtr.p->replicaLastGci[i] = readPageWord(rf);
  }//for
  /* Consume the remaining (unused) slots so the page cursor stays
   * aligned with the fixed-size record format. */
  for(i = noCrashedReplicas; i<MAX_CRASHED_REPLICAS; i++){
    readReplicaPtr.p->createGci[i] = readPageWord(rf);
    readReplicaPtr.p->replicaLastGci[i] = readPageWord(rf);
  }
}//Dbdih::readReplica()
25965 
25966 /**
25967  * This method is useful when we read the table distribution information from
25968  * the master node. In this case with the new PAUSE LCP protocol we need to
25969  * perform the functionality of the initLcpLab while copying the table to
25970  * ensure that we're a full DIH participant in the LCP when the copying of
25971  * the meta data has been completed.
25972  *
25973  * For all other cases the tabLcpStatus is TLS_COMPLETED and thus the method
25974  * will be ignored.
25975  */
/**
 * While copying meta data under the PAUSE LCP protocol, mark a replica
 * as participating in the currently running LCP if it has not already
 * completed it.  No-op unless the table's tabLcpStatus is TLS_ACTIVE.
 */
void Dbdih::updateLcpInfo(TabRecord *regTabPtr,
                          Fragmentstore *regFragPtr,
                          ReplicaRecord *regReplicaPtr)
{
  if (regTabPtr->tabLcpStatus == TabRecord::TLS_ACTIVE)
  {
    jam();
    Uint32 lastLcpNo = prevLcpNo(regReplicaPtr->nextLcp);
    if (c_lcp_id_while_copy_meta_data != RNIL &&
        regReplicaPtr->lcpId[lastLcpNo] < c_lcp_id_while_copy_meta_data &&
        c_lcpState.m_participatingLQH.get(regReplicaPtr->procNode))
    {
      /**
       * If the copy table indicating that the table is participating in
       * an LCP, if the fragment replica hasn't performed this LCP yet,
       * and the replica node is participating in the LCP at hand now.
       *
       * This code executes in the starting node after the LCP being
       * paused and we are included into the LCP protocol immediately
       * after copying the meta data. We received the bitmap of
       * participating LCP nodes just before the copying of meta
       * data started.
       */
      jam();
      regReplicaPtr->lcpOngoingFlag = true;
      /* First replica of this fragment entering the LCP: count the
       * fragment as active in the table's LCP bookkeeping. */
      if (regFragPtr->noLcpReplicas == 0)
      {
        jam();
        regTabPtr->tabActiveLcpFragments++;
      }
      regFragPtr->noLcpReplicas++;
#if 0
      g_eventLogger->info("LCP Ongoing: TableId: %u, fragId: %u, node: %u"
                          " lastLcpNo: %u, lastLcpId: %u, lcpId: %u",
      regReplicaPtr->tableId,
      regReplicaPtr->fragId,
      regReplicaPtr->procNode,
      lastLcpNo,
      regReplicaPtr->lcpId[lastLcpNo],
      c_lcp_id_while_copy_meta_data);
#endif
    }
  }
}
26020 
/**
 * Read all stored and old-stored replica records of one fragment from
 * the table-description pages, seizing a replica record for each and
 * linking it onto the appropriate list.  Also rebuilds activeNodes and
 * updates LCP participation info per stored replica.
 */
void Dbdih::readReplicas(RWFragment* rf,
                         TabRecord *regTabPtr,
                         FragmentstorePtr fragPtr)
{
  Uint32 i;
  ReplicaRecordPtr newReplicaPtr;
  Uint32 noStoredReplicas = fragPtr.p->noStoredReplicas;
  Uint32 noOldStoredReplicas = fragPtr.p->noOldStoredReplicas;
  /* ----------------------------------------------------------------------- */
  /*      WE CLEAR THE NUMBER OF STORED REPLICAS SINCE IT WILL BE CALCULATED */
  /*      BY THE LINKING SUBROUTINES.                                        */
  /* ----------------------------------------------------------------------- */
  fragPtr.p->noStoredReplicas = 0;
  fragPtr.p->noOldStoredReplicas = 0;
  fragPtr.p->noLcpReplicas = 0;
  Uint32 replicaIndex = 0;
  ndbrequire(noStoredReplicas + noOldStoredReplicas <= MAX_REPLICAS);
  for (i = 0; i < noStoredReplicas; i++)
  {
    seizeReplicaRec(newReplicaPtr);
    readReplica(rf, newReplicaPtr);
    ndbrequire(replicaIndex < MAX_REPLICAS);
    fragPtr.p->activeNodes[replicaIndex] = newReplicaPtr.p->procNode;
    replicaIndex++;
    linkStoredReplica(fragPtr, newReplicaPtr);
    updateLcpInfo(regTabPtr, fragPtr.p, newReplicaPtr.p);
  }//for
  fragPtr.p->fragReplicas = noStoredReplicas;
  /* Old stored replicas are kept off the active list and do not count
   * towards activeNodes or LCP participation. */
  for (i = 0; i < noOldStoredReplicas; i++) {
    jam();
    seizeReplicaRec(newReplicaPtr);
    readReplica(rf, newReplicaPtr);
    linkOldStoredReplica(fragPtr, newReplicaPtr);
  }//for
}//Dbdih::readReplicas()
26056 
/**
 * Send an FSREADREQ reading the restart-info sysfile page
 * (ZVAR_NO_CRESTART_INFO) into memory, using the memory-address format
 * with partial reads allowed.  filePtr.i is the user pointer so the
 * FSREADCONF/FSREADREF reply can be matched to the file record.
 */
void Dbdih::readRestorableGci(Signal* signal, FileRecordPtr filePtr)
{
  FsReadWriteReq *req = (FsReadWriteReq*)signal->getDataPtrSend();
  req->filePointer = filePtr.p->fileRef;
  req->userReference = reference();
  req->userPointer = filePtr.i;
  req->operationFlag = 0;
  req->varIndex = ZVAR_NO_CRESTART_INFO;
  req->numberOfPages = 1;
  FsReadWriteReq::setFormatFlag(req->operationFlag,
                                FsReadWriteReq::fsFormatMemAddress);
  FsReadWriteReq::setPartialReadFlag(req->operationFlag, 1);
  req->data.memoryAddress.memoryOffset = 0;
  req->data.memoryAddress.fileOffset = 0;
  req->data.memoryAddress.size = Sysfile::SYSFILE_FILE_SIZE;
  sendSignal(NDBFS_REF,
             GSN_FSREADREQ,
             signal,
             FsReadWriteReq::FixedLength + 3,
             JBA);
}//Dbdih::readRestorableGci()
26078 
/**
 * Send an FSREADREQ that reads a whole table-description file into the
 * table's pages.  The page mapping is passed as a linear section of
 * (pageRef, file-page-number) pairs, one pair per page.
 */
void Dbdih::readTabfile(Signal* signal, TabRecord* tab, FileRecordPtr filePtr)
{
  signal->theData[0] = filePtr.p->fileRef;
  signal->theData[1] = reference();
  signal->theData[2] = filePtr.i;
  signal->theData[3] = ZLIST_OF_PAIRS;
  signal->theData[4] = ZVAR_NO_WORD;
  signal->theData[5] = tab->noPages;
  /* Build the (memory page, file page) pair list for all pages. */
  Uint32 section[2 * NDB_ARRAY_SIZE(tab->pageRef)];
  for (Uint32 i = 0; i < tab->noPages; i++)
  {
    section[(2 * i) + 0] = tab->pageRef[i];
    section[(2 * i) + 1] = i;
  }
  LinearSectionPtr ptr[3];
  ptr[0].p = section;
  ptr[0].sz = 2 * tab->noPages;
  sendSignal(NDBFS_REF, GSN_FSREADREQ, signal, 6, JBA, ptr, 1);
}//Dbdih::readTabfile()
26098 
releasePage(Uint32 pageIndex)26099 void Dbdih::releasePage(Uint32 pageIndex)
26100 {
26101   PageRecordPtr pagePtr;
26102   pagePtr.i = pageIndex;
26103   ptrCheckGuard(pagePtr, cpageFileSize, pageRecord);
26104   pagePtr.p->nextfreepage = cfirstfreepage;
26105   cfirstfreepage = pagePtr.i;
26106 }//Dbdih::releasePage()
26107 
releaseTabPages(Uint32 tableId)26108 void Dbdih::releaseTabPages(Uint32 tableId)
26109 {
26110   TabRecordPtr tabPtr;
26111   tabPtr.i = tableId;
26112   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
26113   ndbrequire(tabPtr.p->noPages <= NDB_ARRAY_SIZE(tabPtr.p->pageRef));
26114   for (Uint32 i = 0; i < tabPtr.p->noPages; i++) {
26115     jam();
26116     releasePage(tabPtr.p->pageRef[i]);
26117   }//for
26118   tabPtr.p->noPages = 0;
26119 }//Dbdih::releaseTabPages()
26120 
26121 /*************************************************************************/
26122 /*       REMOVE NODE FROM SET OF ALIVE NODES.                            */
26123 /*************************************************************************/
removeAlive(NodeRecordPtr removeNodePtr)26124 void Dbdih::removeAlive(NodeRecordPtr removeNodePtr)
26125 {
26126   NodeRecordPtr nodePtr;
26127 
26128   nodePtr.i = cfirstAliveNode;
26129   if (nodePtr.i == removeNodePtr.i) {
26130     jam();
26131     cfirstAliveNode = removeNodePtr.p->nextNode;
26132     return;
26133   }//if
26134   do {
26135     jam();
26136     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
26137     if (nodePtr.p->nextNode == removeNodePtr.i) {
26138       jam();
26139       nodePtr.p->nextNode = removeNodePtr.p->nextNode;
26140       break;
26141     } else {
26142       jam();
26143       nodePtr.i = nodePtr.p->nextNode;
26144     }//if
26145   } while (1);
26146 }//Dbdih::removeAlive()
26147 
26148 /*************************************************************************/
26149 /*       REMOVE NODE FROM SET OF DEAD NODES.                             */
26150 /*************************************************************************/
removeDeadNode(NodeRecordPtr removeNodePtr)26151 void Dbdih::removeDeadNode(NodeRecordPtr removeNodePtr)
26152 {
26153   NodeRecordPtr nodePtr;
26154 
26155   nodePtr.i = cfirstDeadNode;
26156   if (nodePtr.i == removeNodePtr.i) {
26157     jam();
26158     cfirstDeadNode = removeNodePtr.p->nextNode;
26159     return;
26160   }//if
26161   do {
26162     jam();
26163     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
26164     if (nodePtr.p->nextNode == removeNodePtr.i) {
26165       jam();
26166       nodePtr.p->nextNode = removeNodePtr.p->nextNode;
26167       break;
26168     } else {
26169       jam();
26170       nodePtr.i = nodePtr.p->nextNode;
26171     }//if
26172   } while (1);
26173 }//Dbdih::removeDeadNode()
26174 
26175 /*---------------------------------------------------------------*/
26176 /*       REMOVE REPLICAS OF A FAILED NODE FROM LIST OF STORED    */
26177 /*       REPLICAS AND MOVE IT TO THE LIST OF OLD STORED REPLICAS.*/
26178 /*       ALSO UPDATE THE CRASHED REPLICA INFORMATION.            */
26179 /*---------------------------------------------------------------*/
/**
 * Move a failed node's replica from the stored-replica list to the
 * old-stored-replica list of the fragment.  Unless the failure is
 * temporary, also record a new crashed-replica GCI interval first.
 * At least one stored replica must remain afterwards (otherwise the
 * fragment would be unreachable).
 */
void Dbdih::removeNodeFromStored(Uint32 nodeId,
                                 FragmentstorePtr fragPtr,
                                 ReplicaRecordPtr replicatePtr,
				 bool temporary)
{
  if (!temporary)
  {
    jam();
    newCrashedReplica(replicatePtr);
  }
  else
  {
    /* Temporary failure: no crashed-replica interval, but keep a jam
     * trace point so the path taken is visible in trace dumps. */
    jam();
  }
  removeStoredReplica(fragPtr, replicatePtr);
  linkOldStoredReplica(fragPtr, replicatePtr);
  ndbrequire(fragPtr.p->storedReplicas != RNIL);
}//Dbdih::removeNodeFromStored()
26198 
26199 /*************************************************************************/
26200 /*       REMOVE ANY OLD CRASHED REPLICAS THAT ARE NOT RESTORABLE ANY MORE*/
26201 /*************************************************************************/
/**
 * Drop crashed-replica intervals that end before the oldest restorable
 * GCI (they can never be used for recovery), then clamp the remaining
 * oldest interval's start GCI up to keepGCI, removing any interval
 * that becomes inverted (start > end) by the clamping.
 */
void Dbdih::removeOldCrashedReplicas(Uint32 tab, Uint32 frag,
                                     ReplicaRecordPtr rocReplicaPtr)
{
  mergeCrashedReplicas(rocReplicaPtr);
  while (rocReplicaPtr.p->noCrashedReplicas > 0) {
    jam();
    /* --------------------------------------------------------------------- */
    /*       ONLY IF THERE IS AT LEAST ONE REPLICA THEN CAN WE REMOVE ANY.   */
    /* --------------------------------------------------------------------- */
    if (rocReplicaPtr.p->replicaLastGci[0] < SYSFILE->oldestRestorableGCI){
      jam();
      /* ------------------------------------------------------------------- */
      /*     THIS CRASHED REPLICA HAS BECOME EXTINCT AND MUST BE REMOVED TO  */
      /*     GIVE SPACE FOR NEW CRASHED REPLICAS.                            */
      /* ------------------------------------------------------------------- */
      packCrashedReplicas(rocReplicaPtr);
    } else {
      break;
    }//if
  }//while

  /* Loop terminates because the first statement in the body raises
   * createGci[0] to keepGCI. */
  while (rocReplicaPtr.p->createGci[0] < SYSFILE->keepGCI)
  {
    jam();
    /* --------------------------------------------------------------------- */
    /*       MOVE FORWARD THE CREATE GCI TO A GCI THAT CAN BE USED. WE HAVE  */
    /*       NO CERTAINTY IN FINDING ANY LOG RECORDS FROM OLDER GCI'S.       */
    /* --------------------------------------------------------------------- */
    rocReplicaPtr.p->createGci[0] = SYSFILE->keepGCI;

    if (rocReplicaPtr.p->noCrashedReplicas)
    {
      /**
       * A REDO interval whose start GCI is now after its end GCI
       * (e.g. from 78 to 14) is not useful but rather harmful;
       * remove it.
       */
      if (rocReplicaPtr.p->createGci[0] > rocReplicaPtr.p->replicaLastGci[0])
      {
        jam();
        packCrashedReplicas(rocReplicaPtr);
      }
    }
  }
}
26246 
/**
 * Unlink replicatePtr from the fragment's singly linked list of old
 * stored replicas and decrement the count.  The record is assumed to
 * be present in the list; if not, the scan runs off the list and
 * crashes on the pool pointer check.
 */
void Dbdih::removeOldStoredReplica(FragmentstorePtr fragPtr,
                                   ReplicaRecordPtr replicatePtr)
{
  ReplicaRecordPtr rosTmpReplicaPtr;
  ReplicaRecordPtr rosPrevReplicaPtr;

  fragPtr.p->noOldStoredReplicas--;
  if (fragPtr.p->oldStoredReplicas == replicatePtr.i) {
    jam();
    /* Head of list: advance the head pointer. */
    fragPtr.p->oldStoredReplicas = replicatePtr.p->nextPool;
  } else {
    /* Scan for the predecessor and bridge over the removed record. */
    rosPrevReplicaPtr.i = fragPtr.p->oldStoredReplicas;
    c_replicaRecordPool.getPtr(rosPrevReplicaPtr);
    rosTmpReplicaPtr.i = rosPrevReplicaPtr.p->nextPool;
    while (rosTmpReplicaPtr.i != replicatePtr.i) {
      jam();
      c_replicaRecordPool.getPtr(rosTmpReplicaPtr);
      rosPrevReplicaPtr = rosTmpReplicaPtr;
      rosTmpReplicaPtr.i = rosTmpReplicaPtr.p->nextPool;
    }//if
    rosPrevReplicaPtr.p->nextPool = replicatePtr.p->nextPool;
  }//if
}//Dbdih::removeOldStoredReplica()
26270 
/**
 * Unlink replicatePtr from the fragment's singly linked list of stored
 * replicas and decrement the count.  The record is assumed to be
 * present in the list.
 */
void Dbdih::removeStoredReplica(FragmentstorePtr fragPtr,
                                ReplicaRecordPtr replicatePtr)
{
  ReplicaRecordPtr rsrTmpReplicaPtr;
  ReplicaRecordPtr rsrPrevReplicaPtr;

  fragPtr.p->noStoredReplicas--;
  if (fragPtr.p->storedReplicas == replicatePtr.i) {
    jam();
    /* Head of list: advance the head pointer. */
    fragPtr.p->storedReplicas = replicatePtr.p->nextPool;
  } else {
    jam();
    /* Scan for the predecessor (tracked by index only; resolved to a
     * pointer once, after the loop) and bridge over the record. */
    rsrPrevReplicaPtr.i = fragPtr.p->storedReplicas;
    rsrTmpReplicaPtr.i = fragPtr.p->storedReplicas;
    c_replicaRecordPool.getPtr(rsrTmpReplicaPtr);
    rsrTmpReplicaPtr.i = rsrTmpReplicaPtr.p->nextPool;
    while (rsrTmpReplicaPtr.i != replicatePtr.i) {
      jam();
      rsrPrevReplicaPtr.i = rsrTmpReplicaPtr.i;
      c_replicaRecordPool.getPtr(rsrTmpReplicaPtr);
      rsrTmpReplicaPtr.i = rsrTmpReplicaPtr.p->nextPool;
    }//while
    c_replicaRecordPool.getPtr(rsrPrevReplicaPtr);
    rsrPrevReplicaPtr.p->nextPool = replicatePtr.p->nextPool;
  }//if
}//Dbdih::removeStoredReplica()
26297 
26298 /*************************************************************************/
26299 /*       REMOVE ALL TOO NEW CRASHED REPLICAS THAT IS IN THIS REPLICA.    */
26300 /*************************************************************************/
/**
 * Drop crashed-replica intervals, newest first, whose create GCI lies
 * beyond lastCompletedGCI — they belong to a period that was discarded
 * because the restart failed too many times.  Stops at the first
 * interval that is old enough to keep.
 */
void Dbdih::removeTooNewCrashedReplicas(ReplicaRecordPtr rtnReplicaPtr, Uint32 lastCompletedGCI)
{
  while (rtnReplicaPtr.p->noCrashedReplicas > 0) {
    jam();
    /* --------------------------------------------------------------------- */
    /*       REMOVE ALL REPLICAS THAT ONLY LIVED IN A PERIOD THAT HAVE BEEN  */
    /*       REMOVED FROM THE RESTART INFORMATION SINCE THE RESTART FAILED   */
    /*       TOO MANY TIMES.                                                 */
    /* --------------------------------------------------------------------- */
    arrGuard(rtnReplicaPtr.p->noCrashedReplicas - 1, MAX_CRASHED_REPLICAS);
    if (rtnReplicaPtr.p->createGci[rtnReplicaPtr.p->noCrashedReplicas - 1] > lastCompletedGCI)
    {
      jam();
      /* Clear the newest interval and shrink the count. */
      rtnReplicaPtr.p->createGci[rtnReplicaPtr.p->noCrashedReplicas - 1] =
	ZINIT_CREATE_GCI;
      rtnReplicaPtr.p->replicaLastGci[rtnReplicaPtr.p->noCrashedReplicas - 1] =
	ZINIT_REPLICA_LAST_GCI;
      rtnReplicaPtr.p->noCrashedReplicas--;
    } else {
      break;
    }//if
  }//while
}//Dbdih::removeTooNewCrashedReplicas()
26324 
26325 /*************************************************************************/
26326 /*                                                                       */
26327 /*       MODULE: SEARCH FOR POSSIBLE REPLICAS THAT CAN HANDLE THE GLOBAL */
26328 /*               CHECKPOINT WITHOUT NEEDING ANY EXTRA LOGGING FACILITIES.*/
26329 /*               A MAXIMUM OF FOUR NODES IS RETRIEVED.                   */
26330 /*************************************************************************/
/**
 * Fill in a CreateReplicaRecord for restoring one replica during a
 * system restart: pick a local checkpoint (or ZNIL for "create empty
 * and replay the full log") and the set of log nodes/intervals needed
 * to roll forward to the restorable GCI.
 *
 * @return true if a usable set of log execution nodes was found.
 */
bool
Dbdih::setup_create_replica(FragmentstorePtr fragPtr,
			    CreateReplicaRecord* createReplicaPtrP,
			    Ptr<ReplicaRecord> replicaPtr)
{
  createReplicaPtrP->dataNodeId = replicaPtr.p->procNode;
  createReplicaPtrP->replicaRec = replicaPtr.i;

  /**
   * We search for a proper local checkpoint to use for the system restart.
   * This local checkpoint isn't allowed to use any GCIs beyond what is
   * restorable from this node. It is possible that the following has
   * happened if we use a too fresh local checkpoint.
   *
   * Assume we have a simple 2-node cluster with node 1 and 2.
   * 1) Cluster crashes
   * 2) Node 2 performs system restart on its own.
   * 3) Node 2 runs for a few GCIs and then crashes.
   * 4) Node 1 and Node 2 performs system restart.
   *
   * If we come here as part of 4) and we grab a local checkpoint that is
   * newer than our last completed GCI, then we could restore data which
   * was overwritten by the restart performed by the Node 2 on its own
   * and its running afterwards.
   *
   * We cannot distinguish the above case from the following.
   *
   * 1) Node 1 crashes
   * 2) Node 2 crashes and thus cluster has crashed
   * 3) Node 1 and Node 2 are restarted in a system restart
   *
   * In the above case Node 1 sees exactly the same view here as with the
   * case above. In this case it is ok to use a more recent local checkpoint
   * than our last completed GCI since all data we will restore was also
   * committed and saved by Node 2 before crashing. Thus it would be safe
   * to use a more recent local checkpoint in this case.
   *
   * The fact is however that when we come here we have no way of
   * finding out which of those two scenarios that have happened.
   * So the only safe manner of proceeding here is to not use local
   * checkpoints that are too new.
   *
   * Doing so will require a bit more REDO log to be executed, but the
   * recovery will still work perfectly fine.
   */
  Uint32 startGci;
  Uint32 startLcpNo;
  Uint32 nodeStopGci = SYSFILE->lastCompletedGCI[replicaPtr.p->procNode];
  bool result = findStartGci(replicaPtr,
			     nodeStopGci,
			     startGci,
			     startLcpNo);
  if (!result)
  {
    jam();
    /* --------------------------------------------------------------- */
    /* WE COULD NOT FIND ANY LOCAL CHECKPOINT. THE FRAGMENT THUS DO NOT*/
    /* CONTAIN ANY VALID LOCAL CHECKPOINT. IT DOES HOWEVER CONTAIN A   */
    /* VALID FRAGMENT LOG. THUS BY FIRST CREATING THE FRAGMENT AND THEN*/
    /* EXECUTING THE FRAGMENT LOG WE CAN CREATE THE FRAGMENT AS        */
    /* DESIRED. THIS SHOULD ONLY OCCUR AFTER CREATING A FRAGMENT.      */
    /*                                                                 */
    /* TO INDICATE THAT NO LOCAL CHECKPOINT IS TO BE USED WE SET THE   */
    /* LOCAL CHECKPOINT TO ZNIL.                                       */
    /* --------------------------------------------------------------- */
    createReplicaPtrP->lcpNo = ZNIL;
  }
  else
  {
    jam();
    /* --------------------------------------------------------------- */
    /* WE FOUND A PROPER LOCAL CHECKPOINT TO RESTART FROM.             */
    /* SET LOCAL CHECKPOINT ID AND LOCAL CHECKPOINT NUMBER.            */
    /* --------------------------------------------------------------- */
    createReplicaPtrP->lcpNo = startLcpNo;
    arrGuard(startLcpNo, MAX_LCP_STORED);
    createReplicaPtrP->createLcpId = replicaPtr.p->lcpId[startLcpNo];
  }//if


  /* ----------------------------------------------------------------- */
  /*   WE HAVE EITHER FOUND A LOCAL CHECKPOINT OR WE ARE PLANNING TO   */
  /*   EXECUTE THE LOG FROM THE INITIAL CREATION OF THE TABLE. IN BOTH */
  /*   CASES WE NEED TO FIND A SET OF LOGS THAT CAN EXECUTE SUCH THAT  */
  /*   WE RECOVER TO THE SYSTEM RESTART GLOBAL CHECKPOINT.             */
  /* ----------------------------------------------------------------- */
  Uint32 stopGci = SYSFILE->newestRestorableGCI;
  return findLogNodes(createReplicaPtrP, fragPtr, startGci, stopGci);
}
26420 
/**
 * Walk the fragment's stored replicas and, for each replica on an
 * alive node with an active status, seize a create-replica record and
 * set it up for restore (checkpoint + log intervals) via
 * setup_create_replica().
 */
void Dbdih::searchStoredReplicas(FragmentstorePtr fragPtr)
{
  Uint32 nextReplicaPtrI;
  Ptr<ReplicaRecord> replicaPtr;

  replicaPtr.i = fragPtr.p->storedReplicas;
  while (replicaPtr.i != RNIL) {
    jam();
    c_replicaRecordPool.getPtr(replicaPtr);
    nextReplicaPtrI = replicaPtr.p->nextPool;
    NodeRecordPtr nodePtr;
    nodePtr.i = replicaPtr.p->procNode;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
      jam();
      switch (nodePtr.p->activeStatus) {
      case Sysfile::NS_Active:
      case Sysfile::NS_ActiveMissed_1:
      case Sysfile::NS_ActiveMissed_2:{
	/* ----------------------------------------------------------------- */
	/*   INITIALISE THE CREATE REPLICA STRUCTURE THAT IS USED FOR SENDING*/
	/*   TO LQH START_FRAGREQ.                                           */
	/*   SET THE DATA NODE WHERE THE LOCAL CHECKPOINT IS FOUND. ALSO     */
	/*   SET A REFERENCE TO THE REPLICA POINTER OF THAT.                 */
	/* ----------------------------------------------------------------- */
	CreateReplicaRecordPtr createReplicaPtr;
	createReplicaPtr.i = cnoOfCreateReplicas;
	/* NOTE(review): literal 4 is the createReplicaRecord array bound —
	 * presumably MAX_REPLICAS; confirm against the array declaration. */
	ptrCheckGuard(createReplicaPtr, 4, createReplicaRecord);
	cnoOfCreateReplicas++;

	/**
	 * Should have been checked in resetReplicaSr
	 */
	ndbrequire(setup_create_replica(fragPtr,
					createReplicaPtr.p,
					replicaPtr));
	break;
      }
      default:
        jam();
        /*empty*/;
        break;
      }//switch
    }
    replicaPtr.i = nextReplicaPtrI;
  }//while
}//Dbdih::searchStoredReplicas()
26468 
26469 /*************************************************************************/
26470 /*                                                                       */
26471 /*       MODULE: SEIZE_FILE                                              */
26472 /*       DESCRIPTION: THE SUBROUTINE SEIZES A FILE RECORD FROM THE       */
26473 /*                    FREE LIST.                                         */
26474 /*************************************************************************/
seizeFile(FileRecordPtr & filePtr)26475 void Dbdih::seizeFile(FileRecordPtr& filePtr)
26476 {
26477   filePtr.i = cfirstfreeFile;
26478   ptrCheckGuard(filePtr, cfileFileSize, fileRecord);
26479   cfirstfreeFile = filePtr.p->nextFile;
26480   filePtr.p->nextFile = RNIL;
26481 }//Dbdih::seizeFile()
26482 
26483 /*************************************************************************/
26484 /*       SEND UPDATE_FRAG_STATEREQ TO ALL NODES IN THE NDB CLUSTER.      */
26485 /*************************************************************************/
26486 /*************************************************************************/
26487 /*                                                                       */
26488 /*       MODULE: FIND THE START GCI AND LOCAL CHECKPOINT TO USE.         */
26489 /*************************************************************************/
/**
 * Send START_FRAGREQ to the DBLQH of every create-replica prepared for
 * this fragment, carrying the chosen local checkpoint and the log
 * node/GCI intervals to execute.  ERROR_INSERT 7072 artificially
 * splits the last log interval into many one-GCI intervals to stress
 * multi-interval log execution.
 */
void Dbdih::sendStartFragreq(Signal* signal,
			     TabRecordPtr tabPtr, Uint32 fragId)
{
  CreateReplicaRecordPtr replicaPtr;
  for (replicaPtr.i = 0; replicaPtr.i < cnoOfCreateReplicas; replicaPtr.i++) {
    jam();
    ptrAss(replicaPtr, createReplicaRecord);

    BlockReference ref = numberToRef(DBLQH, replicaPtr.p->dataNodeId);

    StartFragReq * const startFragReq = (StartFragReq *)&signal->theData[0];
    startFragReq->userPtr = replicaPtr.p->replicaRec;
    startFragReq->userRef = reference();
    startFragReq->lcpNo = replicaPtr.p->lcpNo;
    startFragReq->lcpId = replicaPtr.p->createLcpId;
    startFragReq->tableId = tabPtr.i;
    startFragReq->fragId = fragId;
    startFragReq->requestInfo = StartFragReq::SFR_RESTORE_LCP;

    if (ERROR_INSERTED(7072))
    {
      jam();
      /* Split the last interval [start, stop] into single-GCI pieces,
       * shortening the existing last interval and appending new ones
       * on the same log node until MAX_LOG_EXEC is reached or the
       * interval is used up. */
      const Uint32 noNodes = replicaPtr.p->noLogNodes;
      Uint32 start = replicaPtr.p->logStartGci[noNodes - 1];
      const Uint32 stop  = replicaPtr.p->logStopGci[noNodes - 1];

      for(Uint32 i = noNodes; i < MAX_LOG_EXEC && (stop - start) > 0; i++){
	replicaPtr.p->noLogNodes++;
	replicaPtr.p->logStopGci[i - 1] = start;

	replicaPtr.p->logNodeId[i] = replicaPtr.p->logNodeId[i-1];
	replicaPtr.p->logStartGci[i] = start + 1;
	replicaPtr.p->logStopGci[i] = stop;
	start += 1;
      }
    }

    startFragReq->noOfLogNodes = replicaPtr.p->noLogNodes;

    /* Copy the full fixed-size log execution arrays; entries beyond
     * noOfLogNodes are ignored by the receiver. */
    for (Uint32 i = 0; i < MAX_LOG_EXEC ; i++) {
      startFragReq->lqhLogNode[i] = replicaPtr.p->logNodeId[i];
      startFragReq->startGci[i] = replicaPtr.p->logStartGci[i];
      startFragReq->lastGci[i] = replicaPtr.p->logStopGci[i];
    }//for

    startFragReq->nodeRestorableGci =
      SYSFILE->lastCompletedGCI[replicaPtr.p->dataNodeId];
    sendSignal(ref, GSN_START_FRAGREQ, signal,
	       StartFragReq::SignalLength, JBB);
  }//for
}//Dbdih::sendStartFragreq()
26541 
26542 /*************************************************************************/
26543 /*       SET LCP ACTIVE STATUS BEFORE STARTING A LOCAL CHECKPOINT.       */
26544 /*************************************************************************/
/**
 * Prepare the LCP participation bitmasks before a local checkpoint starts.
 *
 * Rebuilds c_lcpState.m_participatingDIH (nodes whose DIH takes part in
 * the LCP protocol) and c_lcpState.m_participatingLQH (nodes whose LQH
 * will actually checkpoint data) from the current node records.
 *
 * Nodes that cannot participate and are neither merely configured nor
 * undefined are downgraded to NS_ActiveMissed_1, recording that they
 * missed this LCP.
 *
 * @param signal  unused here; kept for the common subroutine signature
 */
void Dbdih::setLcpActiveStatusStart(Signal* signal)
{
  NodeRecordPtr nodePtr;

  jam();
  c_lcpState.m_participatingLQH.clear();
  c_lcpState.m_participatingDIH.clear();

  for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++)
  {
    ptrAss(nodePtr, nodeRecord);
#if 0
    if(nodePtr.p->nodeStatus != NodeRecord::NOT_IN_CLUSTER){
      infoEvent("Node %d nodeStatus=%d activeStatus=%d copyCompleted=%d lcp=%d",
		nodePtr.i,
		nodePtr.p->nodeStatus,
		nodePtr.p->activeStatus,
		nodePtr.p->copyCompleted,
		nodePtr.p->m_inclDihLcp);
    }
#endif
    if(nodePtr.p->nodeStatus == NodeRecord::ALIVE)
    {
      jam();
      if (nodePtr.p->m_inclDihLcp)
      {
        jam();
        // The node's DIH has been included in the LCP protocol.
        c_lcpState.m_participatingDIH.set(nodePtr.i);
      }

      if (nodePtr.p->copyCompleted)
      {
        jam();
        // Node has all its data; its LQH can checkpoint.
	c_lcpState.m_participatingLQH.set(nodePtr.i);
      }
      else if (nodePtr.p->activeStatus == Sysfile::NS_Configured)
      {
        jam();
        // Configured but never started: leave its status untouched.
        continue;
      }
      else
      {
        jam();
        // Alive but not yet restored: it will miss this LCP.
        nodePtr.p->activeStatus = Sysfile::NS_ActiveMissed_1;
      }
    }
    else if (nodePtr.p->activeStatus == Sysfile::NS_Configured)
    {
      jam();
      continue;
    }
    else if (nodePtr.p->activeStatus != Sysfile::NS_NotDefined)
    {
      jam();
      // Dead but defined node: record that it missed this LCP.
      nodePtr.p->activeStatus = Sysfile::NS_ActiveMissed_1;
    }
  }
}//Dbdih::setLcpActiveStatusStart()
26603 
26604 /*************************************************************************/
26605 /*       SET LCP ACTIVE STATUS AT THE END OF A LOCAL CHECKPOINT.        */
26606 /*************************************************************************/
/**
 * Update node active status when a local checkpoint has completed.
 *
 * Every node whose LQH participated now has a restorable checkpoint:
 * copyCompleted is set and the node is promoted to NS_Active (unless it
 * is only NS_Configured). Defined non-participants are downgraded to
 * NS_ActiveMissed_1. The participation bitmasks are then cleared and, on
 * the master, the restart info bits in the sysfile are refreshed.
 */
void Dbdih::setLcpActiveStatusEnd(Signal* signal)
{
  NodeRecordPtr nodePtr;

  for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++)
  {
    jam();
    ptrAss(nodePtr, nodeRecord);
    if (c_lcpState.m_participatingLQH.get(nodePtr.i))
    {
      jam();
      // The node took part in the LCP, so it now has restorable data.
      nodePtr.p->copyCompleted = 1;
      if (! (nodePtr.p->activeStatus == Sysfile::NS_Configured))
      {
        jam();
        nodePtr.p->activeStatus = Sysfile::NS_Active;
      }
      else
      {
        jam();
        // Do nothing
      }
      if (nodePtr.p->nodeRecoveryStatus == NodeRecord::NODE_IN_LCP_WAIT_STATE)
      {
        jam();
        /**
         * This is a non-master node and this is the first time we heard this
         * node is alive and active. We set the node recovery status, this
         * status is only used in printouts if this node later becomes master
         * and the node is still alive and kicking. This means we have no
         * detailed information about its restart status.
         */
        setNodeRecoveryStatus(nodePtr.i, NodeRecord::NODE_ACTIVE);
      }
    }
    else if (nodePtr.p->activeStatus == Sysfile::NS_Configured)
    {
      jam();
      continue;
    }
    else if (nodePtr.p->activeStatus != Sysfile::NS_NotDefined)
    {
      jam();
      // Defined node that did not checkpoint: it missed this LCP.
      nodePtr.p->activeStatus = Sysfile::NS_ActiveMissed_1;
    }
  }

  c_lcpState.m_participatingDIH.clear();
  c_lcpState.m_participatingLQH.clear();

  if (isMaster()) {
    jam();
    setNodeRestartInfoBits(signal);
  }//if
}//Dbdih::setLcpActiveStatusEnd()
26662 
26663 /*************************************************************************/
26664 /* SET NODE ACTIVE STATUS AT SYSTEM RESTART AND WHEN UPDATED BY MASTER   */
26665 /*************************************************************************/
/**
 * Copy each node's active status from the sysfile (restart information,
 * possibly just distributed by the master) into the in-memory node record.
 *
 * The switch lists every legal Sysfile::ActiveStatus value explicitly —
 * each case carries its own jam() so the trace shows which status was
 * seen — and aborts on any unknown value.
 */
void Dbdih::setNodeActiveStatus()
{
  NodeRecordPtr snaNodeptr;

  for (snaNodeptr.i = 1; snaNodeptr.i <= m_max_node_id; snaNodeptr.i++)
  {
    ptrAss(snaNodeptr, nodeRecord);
    const Uint32 tsnaNodeBits = Sysfile::getNodeStatus(snaNodeptr.i,
                                                       SYSFILE->nodeStatus);
    switch (tsnaNodeBits) {
    case Sysfile::NS_Active:
      jam();
      snaNodeptr.p->activeStatus = Sysfile::NS_Active;
      break;
    case Sysfile::NS_ActiveMissed_1:
      jam();
      snaNodeptr.p->activeStatus = Sysfile::NS_ActiveMissed_1;
      break;
    case Sysfile::NS_ActiveMissed_2:
      jam();
      snaNodeptr.p->activeStatus = Sysfile::NS_ActiveMissed_2;
      break;
    case Sysfile::NS_TakeOver:
      jam();
      snaNodeptr.p->activeStatus = Sysfile::NS_TakeOver;
      break;
    case Sysfile::NS_NotActive_NotTakenOver:
      jam();
      snaNodeptr.p->activeStatus = Sysfile::NS_NotActive_NotTakenOver;
      break;
    case Sysfile::NS_NotDefined:
      jam();
      snaNodeptr.p->activeStatus = Sysfile::NS_NotDefined;
      break;
    case Sysfile::NS_Configured:
      jam();
      snaNodeptr.p->activeStatus = Sysfile::NS_Configured;
      break;
    default:
      // Corrupt or unknown status bits in the sysfile: cannot continue.
      ndbabort();
    }//switch
  }//for
}//Dbdih::setNodeActiveStatus()
26709 
26710 /***************************************************************************/
26711 /* SET THE NODE GROUP BASED ON THE RESTART INFORMATION OR AS SET BY MASTER */
26712 /***************************************************************************/
/**
 * Rebuild all node group records from the sysfile (system restart) or
 * from information distributed by the master.
 *
 * Phase 1: reset every node group record currently in use.
 * Phase 2: assign each defined node to the node group stored in the
 *          sysfile and re-register the group via add_nodegroup().
 * Phase 3: if our own node belongs to a node group with more than one
 *          replica, tell the scheduler which nodes are our neighbours so
 *          communication with them can be optimized.
 */
void Dbdih::setNodeGroups()
{
  DEB_MULTI_TRP(("setNodeGroups"));
  NodeGroupRecordPtr NGPtr;
  NodeRecordPtr sngNodeptr;
  Uint32 Ti;
  // Phase 1: clear the node group records referenced by c_node_groups.
  for (Ti = 0; Ti < cnoOfNodeGroups; Ti++) {
    NGPtr.i = c_node_groups[Ti];
    ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
    NGPtr.p->nodeCount = 0;
    NGPtr.p->nodegroupIndex = RNIL;
  }//for
  cnoOfNodeGroups = 0;
  // Phase 2: place every node according to the sysfile information.
  for (sngNodeptr.i = 1; sngNodeptr.i <= m_max_node_id; sngNodeptr.i++)
  {
    ptrAss(sngNodeptr, nodeRecord);
    Sysfile::ActiveStatus s =
      (Sysfile::ActiveStatus)Sysfile::getNodeStatus(sngNodeptr.i,
						    SYSFILE->nodeStatus);
    switch (s){
    case Sysfile::NS_Active:
    case Sysfile::NS_ActiveMissed_1:
    case Sysfile::NS_ActiveMissed_2:
    case Sysfile::NS_NotActive_NotTakenOver:
    case Sysfile::NS_TakeOver:
      jam();
      sngNodeptr.p->nodeGroup = Sysfile::getNodeGroup(sngNodeptr.i,
                                                      SYSFILE->nodeGroups);
      NGPtr.i = sngNodeptr.p->nodeGroup;
      ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
      NGPtr.p->nodesInGroup[NGPtr.p->nodeCount] = sngNodeptr.i;
      NGPtr.p->nodeCount++;
      ndbrequire(NGPtr.p->nodeCount <= cnoReplicas);
      add_nodegroup(NGPtr);
      DEB_MULTI_TRP(("Node %u into node group %u", sngNodeptr.i, NGPtr.i));
      break;
    case Sysfile::NS_NotDefined:
    case Sysfile::NS_Configured:
      jam();
      // Node exists in configuration only; it belongs to no node group.
      sngNodeptr.p->nodeGroup = ZNIL;
      break;
    default:
      ndbabort();
      return;
    }//switch
  }//for
  // Phase 3: set up neighbour information for our own node group.
  sngNodeptr.i = getOwnNodeId();
  ptrCheckGuard(sngNodeptr, MAX_NDB_NODES, nodeRecord);
  NGPtr.i = sngNodeptr.p->nodeGroup;
  if (NGPtr.i == ZNIL)
  {
    jam();
    // We are not part of any node group; no neighbours to configure.
    return;
  }
  ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
  if (NGPtr.p->nodeCount <= 1)
  {
    /**
     * Only one replica in this node group, so no neighbour.
     * Could also be a node in a new nodegroup, so effectively
     * it is part of no nodegroup and thus has no neighbours
     * in this case either.
     */
    jam();
    return;
  }
  ndbrequire(NGPtr.p->nodeCount <= MAX_REPLICAS);
  /**
   * Inform scheduler of our neighbour node to ensure the best
   * possible communication with this node. If more than two
   * replicas we will still only have one neighbour, so we will
   * have most communication with this neighbour node.
   */
  startChangeNeighbourNode();
  for (Uint32 i = 0; i < NGPtr.p->nodeCount; i++)
  {
    jam();
    Uint32 nodeId = NGPtr.p->nodesInGroup[i];
    if (nodeId != getOwnNodeId())
    {
      jam();
      ndbrequire(nodeId != 0 && nodeId < MAX_NODES);
      setNeighbourNode(nodeId);
    }
  }
  endChangeNeighbourNode();
}//Dbdih::setNodeGroups()
26800 
26801 /*************************************************************************/
26802 /* SET THE RESTART INFO BITS BASED ON THE NODES ACTIVE STATUS.           */
26803 /*************************************************************************/
/**
 * Encode the in-memory node state into the sysfile restart info bits.
 *
 * For each node, the active status and node group are written into
 * SYSFILE->nodeStatus / SYSFILE->nodeGroups, and SYSFILE->lcpActive is
 * set for the nodes whose LQH participates in the current LCP.
 *
 * Under ERROR_INSERT 7220, if an LCP is ongoing, NDB_TAMPER 7219 is
 * broadcast to all alive DIH instances — a test hook exercised together
 * with that error code (exact purpose not visible in this chunk).
 */
void Dbdih::setNodeRestartInfoBits(Signal * signal)
{
  NodeRecordPtr nodePtr;
  Uint32 tsnrNodeGroup;
  Uint32 tsnrNodeActiveStatus;
  Uint32 i;
  // Start from a clean slate: mark all nodes NS_Active, then overwrite
  // each entry below with the node's real status.
  for(i = 1; i <= m_max_node_id; i++)
  {
    Sysfile::setNodeStatus(i, SYSFILE->nodeStatus, Sysfile::NS_Active);
  }//for
  NdbNodeBitmask::clear(SYSFILE->lcpActive);

#ifdef ERROR_INSERT
  NdbNodeBitmask tmp;
#endif

  for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++)
  {
    ptrAss(nodePtr, nodeRecord);
    // Explicit per-value mapping with one jam() per case for trace.
    switch (nodePtr.p->activeStatus) {
    case Sysfile::NS_Active:
      jam();
      tsnrNodeActiveStatus = Sysfile::NS_Active;
      break;
    case Sysfile::NS_ActiveMissed_1:
      jam();
      tsnrNodeActiveStatus = Sysfile::NS_ActiveMissed_1;
      break;
    case Sysfile::NS_ActiveMissed_2:
      jam();
      tsnrNodeActiveStatus = Sysfile::NS_ActiveMissed_2;
      break;
    case Sysfile::NS_TakeOver:
      jam();
      tsnrNodeActiveStatus = Sysfile::NS_TakeOver;
      break;
    case Sysfile::NS_NotActive_NotTakenOver:
      jam();
      tsnrNodeActiveStatus = Sysfile::NS_NotActive_NotTakenOver;
      break;
    case Sysfile::NS_NotDefined:
      jam();
      tsnrNodeActiveStatus = Sysfile::NS_NotDefined;
      break;
    case Sysfile::NS_Configured:
      jam();
      tsnrNodeActiveStatus = Sysfile::NS_Configured;
      break;
    default:
      ndbabort();
      tsnrNodeActiveStatus = Sysfile::NS_NotDefined; // remove warning
    }//switch
    Sysfile::setNodeStatus(nodePtr.i, SYSFILE->nodeStatus,
                           tsnrNodeActiveStatus);
    if (nodePtr.p->nodeGroup == ZNIL) {
      jam();
      // Node belongs to no node group; store the sentinel value.
      tsnrNodeGroup = NO_NODE_GROUP_ID;
    } else {
      jam();
      tsnrNodeGroup = nodePtr.p->nodeGroup;
    }//if
    Sysfile::setNodeGroup(nodePtr.i, SYSFILE->nodeGroups, tsnrNodeGroup);
    if (c_lcpState.m_participatingLQH.get(nodePtr.i))
    {
      jam();
      NdbNodeBitmask::set(SYSFILE->lcpActive, nodePtr.i);
    }//if
#ifdef ERROR_INSERT
    else if (Sysfile::getLCPOngoing(SYSFILE->systemRestartBits))
    {
      jam();
      // Remember active nodes that are NOT LCP participants while an LCP
      // is marked ongoing; used by the 7220 test hook below.
      if (nodePtr.p->activeStatus == Sysfile::NS_Active)
        tmp.set(nodePtr.i);
    }
#endif
  }//for

#ifdef ERROR_INSERT
  if (ERROR_INSERTED(7220) && !tmp.isclear())
  {
    jam();

    // Collect all alive nodes and broadcast NDB_TAMPER 7219 to their DIHs.
    NdbNodeBitmask all;
    nodePtr.i = cfirstAliveNode;
    do {
      jam();
      ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
      all.set(nodePtr.i);
      nodePtr.i = nodePtr.p->nextNode;
    } while (nodePtr.i != RNIL);


    NodeReceiverGroup rg(DBDIH, all);
    signal->theData[0] = 7219;
    sendSignal(rg, GSN_NDB_TAMPER, signal,  1, JBA);
  }
#endif
}//Dbdih::setNodeRestartInfoBits()
26902 
26903 /*************************************************************************/
26904 /*       START THE GLOBAL CHECKPOINT PROTOCOL IN MASTER AT START-UP      */
26905 /*************************************************************************/
/**
 * Start the global checkpoint protocol (called in the master at start-up).
 * Kicks off the first GCP round via a CONTINUEB to ourselves, then arms
 * the GCP stall monitor.
 */
void Dbdih::startGcp(Signal* signal)
{
  signal->theData[0] = DihContinueB::ZSTART_GCP;
  sendSignal(reference(), GSN_CONTINUEB, signal, 1, JBB);

  startGcpMonitor(signal);
}//Dbdih::startGcp()
26913 
/**
 * Initialize the GCP monitor and schedule the periodic check that detects
 * stalled global checkpoints (ZCHECK_GCP_STOP, every 100 ms).
 */
void
Dbdih::startGcpMonitor(Signal* signal)
{
  jam();
  // Baseline for stall detection: remember the current GCP_SAVE and
  // micro-GCP positions and reset the elapsed-time counters. The max-lag
  // limits are recalculated on the next check.
  m_gcp_monitor.m_gcp_save.m_gci = m_gcp_save.m_gci;
  m_gcp_monitor.m_gcp_save.m_elapsed_ms = 0;
  m_gcp_monitor.m_gcp_save.m_need_max_lag_recalc = true;
  m_gcp_monitor.m_micro_gcp.m_gci = m_micro_gcp.m_current_gci;
  m_gcp_monitor.m_micro_gcp.m_elapsed_ms = 0;
  m_gcp_monitor.m_micro_gcp.m_need_max_lag_recalc = true;
  m_gcp_monitor.m_last_check = NdbTick_getCurrentTicks();

#ifdef ERROR_INSERT
  // Reset test-only overrides of the lag limits.
  m_gcp_monitor.m_savedMaxCommitLag = 0;
  m_gcp_monitor.m_gcp_save.test_set_max_lag = false;
  m_gcp_monitor.m_micro_gcp.test_set_max_lag = false;
#endif

  signal->theData[0] = DihContinueB::ZCHECK_GCP_STOP;
  sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 100, 1);
}
26935 
26936 /**
26937  * This changes the table distribution and this can be seen by
26938  * DIGETNODES, so if this is called when we are not in recovery
26939  * we need to hold the table RCU lock.
26940  */
updateNodeInfo(FragmentstorePtr fragPtr)26941 void Dbdih::updateNodeInfo(FragmentstorePtr fragPtr)
26942 {
26943   ReplicaRecordPtr replicatePtr;
26944   Uint32 index = 0;
26945   replicatePtr.i = fragPtr.p->storedReplicas;
26946   do {
26947     jam();
26948     c_replicaRecordPool.getPtr(replicatePtr);
26949     ndbrequire(index < MAX_REPLICAS);
26950     fragPtr.p->activeNodes[index] = replicatePtr.p->procNode;
26951     index++;
26952     replicatePtr.i = replicatePtr.p->nextPool;
26953   } while (replicatePtr.i != RNIL);
26954   fragPtr.p->fragReplicas = index;
26955 
26956   /* ----------------------------------------------------------------------- */
26957   // We switch primary to the preferred primary if the preferred primary is
26958   // in the list.
26959   /* ----------------------------------------------------------------------- */
26960   const Uint32 prefPrim = fragPtr.p->preferredPrimary;
26961   for (Uint32 i = 1; i < index; i++) {
26962     jam();
26963     ndbrequire(i < MAX_REPLICAS);
26964     if (fragPtr.p->activeNodes[i] == prefPrim){
26965       jam();
26966       Uint32 switchNode = fragPtr.p->activeNodes[0];
26967       fragPtr.p->activeNodes[0] = prefPrim;
26968       fragPtr.p->activeNodes[i] = switchNode;
26969       break;
26970     }//if
26971   }//for
26972 }//Dbdih::updateNodeInfo()
26973 
/**
 * Serialize one fragment's metadata into the table-description file
 * buffer. The word order here must stay in sync with the corresponding
 * reader routine (not visible in this chunk — confirm against it before
 * changing).
 */
void Dbdih::writeFragment(RWFragment* wf, FragmentstorePtr fragPtr)
{
  writePageWord(wf, wf->fragId);
  writePageWord(wf, fragPtr.p->preferredPrimary);
  writePageWord(wf, fragPtr.p->noStoredReplicas);
  writePageWord(wf, fragPtr.p->noOldStoredReplicas);
  writePageWord(wf, fragPtr.p->distributionKey);
  writePageWord(wf, fragPtr.p->m_log_part_id);
}//Dbdih::writeFragment()
26983 
/**
 * Append one 32-bit word to the table-description file image.
 *
 * A page holds 2048 words. When the current page is full, a new page is
 * allocated and linked into the table record, and writing resumes at
 * word 32 of the new page — the first 32 words appear to be reserved
 * (presumably a page header; confirm against the reader side).
 */
void Dbdih::writePageWord(RWFragment* wf, Uint32 dataWord)
{
  if (wf->wordIndex >= 2048) {
    jam();
    // wordIndex may only ever land exactly on the page boundary.
    ndbrequire(wf->wordIndex == 2048);
    allocpage(wf->rwfPageptr);
    wf->wordIndex = 32;
    wf->pageIndex++;
    ndbrequire(wf->pageIndex < NDB_ARRAY_SIZE(wf->rwfTabPtr.p->pageRef));
    wf->rwfTabPtr.p->pageRef[wf->pageIndex] = wf->rwfPageptr.i;
    wf->rwfTabPtr.p->noPages++;
  }//if
  wf->rwfPageptr.p->word[wf->wordIndex] = dataWord;
  wf->wordIndex++;
}//Dbdih::writePageWord()
26999 
writeReplicas(RWFragment * wf,Uint32 replicaStartIndex)27000 void Dbdih::writeReplicas(RWFragment* wf, Uint32 replicaStartIndex)
27001 {
27002   ReplicaRecordPtr wfReplicaPtr;
27003   wfReplicaPtr.i = replicaStartIndex;
27004   while (wfReplicaPtr.i != RNIL) {
27005     jam();
27006     c_replicaRecordPool.getPtr(wfReplicaPtr);
27007     writePageWord(wf, wfReplicaPtr.p->procNode);
27008     writePageWord(wf, wfReplicaPtr.p->initialGci);
27009     writePageWord(wf, wfReplicaPtr.p->noCrashedReplicas);
27010     writePageWord(wf, wfReplicaPtr.p->nextLcp);
27011     Uint32 i;
27012     for (i = 0; i < MAX_LCP_STORED; i++) {
27013       writePageWord(wf, wfReplicaPtr.p->maxGciCompleted[i]);
27014       writePageWord(wf, wfReplicaPtr.p->maxGciStarted[i]);
27015       writePageWord(wf, wfReplicaPtr.p->lcpId[i]);
27016       writePageWord(wf, wfReplicaPtr.p->lcpStatus[i]);
27017     }//if
27018     for (i = 0; i < MAX_CRASHED_REPLICAS; i++) {
27019       writePageWord(wf, wfReplicaPtr.p->createGci[i]);
27020       writePageWord(wf, wfReplicaPtr.p->replicaLastGci[i]);
27021     }//if
27022 
27023     wfReplicaPtr.i = wfReplicaPtr.p->nextPool;
27024   }//while
27025 }//Dbdih::writeReplicas()
27026 
/**
 * Write the system file (restart information) to one of the two sysfile
 * copies on disk.
 *
 * The sysfile is packed into cdata in v2 format, copied into the file
 * buffer and written with a memory-address-format FSWRITEREQ (one page,
 * sync flag set, sent on JBA).
 *
 * ERROR_INSERT 7224: delays the write of the second sysfile copy by two
 * seconds and fires NDB_TAMPER 9999 at the master node's CMVMI — a test
 * hook around the COPY_GCIREQ protocol (per the log message below).
 */
void Dbdih::writeRestorableGci(Signal* signal, FileRecordPtr filePtr)
{
  pack_sysfile_format_v2();
  // The on-disk file area must be able to hold the packed v2 image.
  STATIC_ASSERT(Sysfile::SYSFILE_FILE_SIZE >= Sysfile::SYSFILE_SIZE32_v2);
  memcpy(&sysfileDataToFile[0], &cdata[0], 4 * cdata_size_in_words);
  FsReadWriteReq* req = (FsReadWriteReq*)signal->getDataPtrSend();
  req->filePointer = filePtr.p->fileRef;
  req->userReference = reference();
  req->userPointer = filePtr.i;
  req->operationFlag = 0;
  req->varIndex = ZVAR_NO_CRESTART_INFO_TO_FILE;
  req->numberOfPages = 1;
  FsReadWriteReq::setFormatFlag(req->operationFlag,
                                FsReadWriteReq::fsFormatMemAddress);
  FsReadWriteReq::setSyncFlag(req->operationFlag, 1);
  req->data.memoryAddress.memoryOffset = 0;
  req->data.memoryAddress.fileOffset = 0;
  req->data.memoryAddress.size = Sysfile::SYSFILE_FILE_SIZE;
  if (ERROR_INSERTED(7224) && filePtr.i == crestartInfoFile[1])
  {
    jam();
    SET_ERROR_INSERT_VALUE(7225);
    // Deliver the write to NDBFS only after a 2 second delay.
    sendSignalWithDelay(NDBFS_REF,
                        GSN_FSWRITEREQ,
                        signal,
                        2000,
                        FsReadWriteReq::FixedLength + 3);

    signal->theData[0] = 9999;
    sendSignal(numberToRef(CMVMI, refToNode(cmasterdihref)),
	       GSN_NDB_TAMPER, signal, 1, JBB);
    g_eventLogger->info("FS_WRITEREQ delay 2 second for COPY_GCIREQ");
    return;
  }
  sendSignal(NDBFS_REF,
             GSN_FSWRITEREQ,
             signal,
             FsReadWriteReq::FixedLength + 3,
             JBA);
}//Dbdih::writeRestorableGci()
27067 
/**
 * Write a table-description file to disk.
 *
 * Sends a synchronous FSWRITEREQ (list-of-pairs format) with a linear
 * section containing one (pageRef, pageNumber) pair for every page of
 * the in-memory table description.
 */
void Dbdih::writeTabfile(Signal* signal, TabRecord* tab, FileRecordPtr filePtr)
{
  signal->theData[0] = filePtr.p->fileRef;
  signal->theData[1] = reference();
  signal->theData[2] = filePtr.i;
  signal->theData[3] = ZLIST_OF_PAIRS_SYNCH;
  signal->theData[4] = ZVAR_NO_WORD;
  signal->theData[5] = tab->noPages;

  // The section buffer must be able to describe every page of the table.
  NDB_STATIC_ASSERT(NDB_ARRAY_SIZE(tab->pageRef) <= NDB_FS_RW_PAGES);
  Uint32 section[2 * NDB_ARRAY_SIZE(tab->pageRef)];
  for (Uint32 i = 0; i < tab->noPages; i++)
  {
    section[(2 * i) + 0] = tab->pageRef[i];
    section[(2 * i) + 1] = i;
  }
  LinearSectionPtr ptr[3];
  ptr[0].p = section;
  ptr[0].sz = 2 * tab->noPages;
  sendSignal(NDBFS_REF, GSN_FSWRITEREQ, signal, 6, JBA, ptr, 1);
}//Dbdih::writeTabfile()
27089 
execDEBUG_SIG(Signal * signal)27090 void Dbdih::execDEBUG_SIG(Signal* signal)
27091 {
27092   (void)signal; //Avoid compiler warnings
27093 }//Dbdih::execDEBUG_SIG()
27094 
27095 void
execDUMP_STATE_ORD(Signal * signal)27096 Dbdih::execDUMP_STATE_ORD(Signal* signal)
27097 {
27098   DumpStateOrd * const & dumpState = (DumpStateOrd *)&signal->theData[0];
27099   Uint32 arg = dumpState->args[0];
27100 
27101   if (arg == DumpStateOrd::DihFragmentsPerNode)
27102   {
27103     infoEvent("Fragments per node = %u", getFragmentsPerNode());
27104   }
27105   if (arg == DumpStateOrd::DihDumpNodeRestartInfo) {
27106     infoEvent("c_nodeStartMaster.blockGcp = %d, c_nodeStartMaster.wait = %d",
27107 	      c_nodeStartMaster.blockGcp, c_nodeStartMaster.wait);
27108     for (Uint32 i = 0; i < c_diverify_queue_cnt; i++)
27109     {
27110       /* read barrier to try force fresh reads of c_diverify_queue */
27111       rmb();
27112       infoEvent("[ %u : cfirstVerifyQueue = %u clastVerifyQueue = %u]",
27113                 i,
27114                 c_diverify_queue[i].cfirstVerifyQueue,
27115                 c_diverify_queue[i].clastVerifyQueue);
27116     }
27117     infoEvent("cgcpOrderBlocked = %d",
27118               cgcpOrderBlocked);
27119   }//if
27120   if (arg == DumpStateOrd::DihDumpNodeStatusInfo) {
27121     NodeRecordPtr localNodePtr;
27122     infoEvent("Printing nodeStatus of all nodes");
27123     for (localNodePtr.i = 1; localNodePtr.i <= m_max_node_id; localNodePtr.i++)
27124     {
27125       ptrAss(localNodePtr, nodeRecord);
27126       if (localNodePtr.p->nodeStatus != NodeRecord::NOT_IN_CLUSTER) {
27127         infoEvent("Node = %d has status = %d",
27128 		  localNodePtr.i, localNodePtr.p->nodeStatus);
27129       }//if
27130     }//for
27131   }//if
27132 
27133   if (arg == DumpStateOrd::DihPrintFragmentation)
27134   {
27135     Uint32 tableid = 0;
27136     Uint32 fragid = 0;
27137     if (signal->getLength() == 1)
27138     {
27139       infoEvent("Printing nodegroups --");
27140       for (Uint32 i = 0; i<cnoOfNodeGroups; i++)
27141       {
27142         jam();
27143         NodeGroupRecordPtr NGPtr;
27144         NGPtr.i = c_node_groups[i];
27145         ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
27146 
27147         infoEvent("NG %u(%u) ref: %u [ cnt: %u : %u %u %u %u ]",
27148                   NGPtr.i, NGPtr.p->nodegroupIndex, NGPtr.p->m_ref_count,
27149                   NGPtr.p->nodeCount,
27150                   NGPtr.p->nodesInGroup[0], NGPtr.p->nodesInGroup[1],
27151                   NGPtr.p->nodesInGroup[2], NGPtr.p->nodesInGroup[3]);
27152       }
27153       infoEvent("Printing fragmentation of all tables --");
27154     }
27155     else if (signal->getLength() == 3)
27156     {
27157       jam();
27158       tableid = dumpState->args[1];
27159       fragid = dumpState->args[2];
27160     }
27161     else
27162     {
27163       return;
27164     }
27165 
27166     if (tableid >= ctabFileSize)
27167     {
27168       return;
27169     }
27170 
27171     TabRecordPtr tabPtr;
27172     tabPtr.i = tableid;
27173     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
27174 
27175     if (tabPtr.p->tabStatus == TabRecord::TS_ACTIVE &&
27176         fragid < tabPtr.p->totalfragments)
27177     {
27178       dumpState->args[0] = DumpStateOrd::DihPrintOneFragmentation;
27179       dumpState->args[1] = tableid;
27180       dumpState->args[2] = fragid;
27181       execDUMP_STATE_ORD(signal);
27182     }
27183 
27184     if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE ||
27185         ++fragid >= tabPtr.p->totalfragments)
27186     {
27187         tableid++;
27188         fragid = 0;
27189     }
27190 
27191     if (tableid < ctabFileSize)
27192     {
27193       dumpState->args[0] = DumpStateOrd::DihPrintFragmentation;
27194       dumpState->args[1] = tableid;
27195       dumpState->args[2] = fragid;
27196       sendSignal(reference(), GSN_DUMP_STATE_ORD, signal, 3, JBB);
27197     }
27198   }
27199 
27200   if (arg == DumpStateOrd::DihPrintOneFragmentation)
27201   {
27202     Uint32 tableid = RNIL;
27203     Uint32 fragid = RNIL;
27204 
27205     if (signal->getLength() == 3)
27206     {
27207       jam();
27208       tableid = dumpState->args[1];
27209       fragid = dumpState->args[2];
27210     }
27211     else
27212     {
27213       return;
27214     }
27215 
27216     if (tableid >= ctabFileSize)
27217     {
27218       return;
27219     }
27220 
27221     TabRecordPtr tabPtr;
27222     tabPtr.i = tableid;
27223     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
27224 
27225     if (fragid >= tabPtr.p->totalfragments)
27226     {
27227       return;
27228     }
27229 
27230     FragmentstorePtr fragPtr;
27231     getFragstore(tabPtr.p, fragid, fragPtr);
27232 
27233     Uint32 nodeOrder[MAX_REPLICAS];
27234     const Uint32 noOfReplicas = extractNodeInfo(jamBuffer(),
27235                                                 fragPtr.p,
27236                                                 nodeOrder);
27237     char buf[100];
27238     BaseString::snprintf(buf, sizeof(buf),
27239                          " Table %d Fragment %d(%u) LP: %u - ",
27240                          tabPtr.i, fragid, dihGetInstanceKey(fragPtr),
27241                          fragPtr.p->m_log_part_id);
27242 
27243     for(Uint32 k = 0; k < noOfReplicas; k++)
27244     {
27245       char tmp[100];
27246       BaseString::snprintf(tmp, sizeof(tmp), "%d ", nodeOrder[k]);
27247       strcat(buf, tmp);
27248     }
27249     infoEvent("%s", buf);
27250   }
27251 
27252   if (signal->theData[0] == 7000) {
27253     infoEvent("ctimer = %d",
27254               c_lcpState.ctimer);
27255     infoEvent("cmasterState = %d", cmasterState);
27256     infoEvent("cmasterTakeOverNode = %d, ctcCounter = %d",
27257               cmasterTakeOverNode, c_lcpState.ctcCounter);
27258   }//if
27259   if (signal->theData[0] == 7001) {
27260     infoEvent("c_lcpState.keepGci = %d",
27261               c_lcpState.keepGci);
27262     infoEvent("c_lcpState.lcpStatus = %d, clcpStopGcp = %d",
27263               c_lcpState.lcpStatus,
27264 	      c_lcpState.lcpStopGcp);
27265     infoEvent("cimmediateLcpStart = %d",
27266               c_lcpState.immediateLcpStart);
27267   }//if
27268   if (signal->theData[0] == 7002) {
27269     infoEvent("cnoOfActiveTables = %d",
27270               cnoOfActiveTables);
27271     infoEvent("cdictblockref = %d, cfailurenr = %d",
27272               cdictblockref, cfailurenr);
27273     infoEvent("con_lineNodes = %d, reference() = %d, creceivedfrag = %d",
27274               con_lineNodes, reference(), creceivedfrag);
27275   }//if
27276   if (signal->theData[0] == 7003) {
27277     infoEvent("cfirstAliveNode = %d, cgckptflag = %d",
27278               cfirstAliveNode, cgckptflag);
27279     infoEvent("clocallqhblockref = %d, clocaltcblockref = %d, cgcpOrderBlocked = %d",
27280               clocallqhblockref, clocaltcblockref, cgcpOrderBlocked);
27281     infoEvent("cstarttype = %d, csystemnodes = %d",
27282               cstarttype, csystemnodes);
27283   }//if
27284   if (signal->theData[0] == 7004) {
27285     infoEvent("cmasterdihref = %d, cownNodeId = %d",
27286               cmasterdihref, cownNodeId);
27287     infoEvent("cndbStartReqBlockref = %d, cremainingfrags = %d",
27288               cndbStartReqBlockref, cremainingfrags);
27289   }//if
27290   if (signal->theData[0] == 7005) {
27291     infoEvent("crestartGci = %d",
27292               crestartGci);
27293   }//if
27294   if (signal->theData[0] == 7006) {
27295     infoEvent("clcpDelay = %d",
27296               c_lcpState.clcpDelay);
27297     infoEvent("cmasterNodeId = %d", cmasterNodeId);
27298     infoEvent("c_nodeStartMaster.startNode = %d, c_nodeStartMaster.wait = %d",
27299               c_nodeStartMaster.startNode, c_nodeStartMaster.wait);
27300   }//if
27301   if (signal->theData[0] == 7007) {
27302     infoEvent("c_nodeStartMaster.failNr = %d", c_nodeStartMaster.failNr);
27303     infoEvent("c_nodeStartMaster.startInfoErrorCode = %d",
27304               c_nodeStartMaster.startInfoErrorCode);
27305     infoEvent("c_nodeStartMaster.blockGcp = %d",
27306               c_nodeStartMaster.blockGcp);
27307   }//if
27308   if (signal->theData[0] == 7008) {
27309     infoEvent("cfirstDeadNode = %d, cstartPhase = %d, cnoReplicas = %d",
27310               cfirstDeadNode, cstartPhase, cnoReplicas);
27311     infoEvent("cwaitLcpSr = %d",cwaitLcpSr);
27312   }//if
27313   if (signal->theData[0] == 7009) {
27314     infoEvent("ccalcOldestRestorableGci = %d, cnoOfNodeGroups = %d",
27315               c_lcpState.oldestRestorableGci, cnoOfNodeGroups);
27316     infoEvent("crestartGci = %d",
27317               crestartGci);
27318   }//if
27319   if (signal->theData[0] == 7010) {
27320     infoEvent("c_lcpState.lcpStatusUpdatedPlace = %d, cLcpStart = %d",
27321               c_lcpState.lcpStatusUpdatedPlace, c_lcpState.lcpStart);
27322     infoEvent("c_blockCommit = %d, c_blockCommitNo = %d",
27323               c_blockCommit, c_blockCommitNo);
27324   }//if
27325   if (signal->theData[0] == 7011){
27326     infoEvent("c_COPY_GCIREQ_Counter = %s",
27327 	      c_COPY_GCIREQ_Counter.getText());
27328     infoEvent("c_COPY_TABREQ_Counter = %s",
27329 	      c_COPY_TABREQ_Counter.getText());
27330     infoEvent("c_UPDATE_FRAG_STATEREQ_Counter = %s",
27331 	      c_UPDATE_FRAG_STATEREQ_Counter.getText());
27332     infoEvent("c_DIH_SWITCH_REPLICA_REQ_Counter = %s",
27333 	      c_DIH_SWITCH_REPLICA_REQ_Counter.getText());
27334     infoEvent("c_GCP_COMMIT_Counter = %s", c_GCP_COMMIT_Counter.getText());
27335     infoEvent("c_GCP_PREPARE_Counter = %s", c_GCP_PREPARE_Counter.getText());
27336     infoEvent("c_GCP_SAVEREQ_Counter = %s", c_GCP_SAVEREQ_Counter.getText());
27337     infoEvent("c_SUB_GCP_COMPLETE_REP_Counter = %s",
27338               c_SUB_GCP_COMPLETE_REP_Counter.getText());
27339     infoEvent("c_INCL_NODEREQ_Counter = %s", c_INCL_NODEREQ_Counter.getText());
27340     infoEvent("c_MASTER_GCPREQ_Counter = %s",
27341 	      c_MASTER_GCPREQ_Counter.getText());
27342     infoEvent("c_MASTER_LCPREQ_Counter = %s",
27343 	      c_MASTER_LCPREQ_Counter.getText());
27344     infoEvent("c_START_INFOREQ_Counter = %s",
27345 	      c_START_INFOREQ_Counter.getText());
27346     infoEvent("c_START_RECREQ_Counter = %s", c_START_RECREQ_Counter.getText());
27347     infoEvent("c_STOP_ME_REQ_Counter = %s", c_STOP_ME_REQ_Counter.getText());
27348     infoEvent("c_TC_CLOPSIZEREQ_Counter = %s",
27349 	      c_TC_CLOPSIZEREQ_Counter.getText());
27350     infoEvent("c_TCGETOPSIZEREQ_Counter = %s",
27351 	      c_TCGETOPSIZEREQ_Counter.getText());
27352   }
27353 
27354   if(signal->theData[0] == 7012){
27355     char buf[8*_NDB_NODE_BITMASK_SIZE+1];
27356     infoEvent("ParticipatingDIH = %s", c_lcpState.m_participatingDIH.getText(buf));
27357     infoEvent("ParticipatingLQH = %s", c_lcpState.m_participatingLQH.getText(buf));
27358     infoEvent("m_LCP_COMPLETE_REP_Counter_DIH = %s",
27359 	      c_lcpState.m_LCP_COMPLETE_REP_Counter_DIH.getText());
27360     infoEvent("m_LCP_COMPLETE_REP_Counter_LQH = %s",
27361 	      c_lcpState.m_LCP_COMPLETE_REP_Counter_LQH.getText());
27362     infoEvent("m_lastLCP_COMPLETE_REP_id = %u",
27363                c_lcpState.m_lastLCP_COMPLETE_REP_id);
27364     infoEvent("m_lastLCP_COMPLETE_REP_ref = %x",
27365                c_lcpState.m_lastLCP_COMPLETE_REP_ref);
27366     infoEvent("noOfLcpFragRepOutstanding: %u",
27367               c_lcpState.noOfLcpFragRepOutstanding);
27368     infoEvent("m_LAST_LCP_FRAG_ORD = %s",
27369 	      c_lcpState.m_LAST_LCP_FRAG_ORD.getText());
27370     infoEvent("m_LCP_COMPLETE_REP_From_Master_Received = %d",
27371 	      c_lcpState.m_LCP_COMPLETE_REP_From_Master_Received);
27372 
27373     NodeRecordPtr nodePtr;
27374     for (nodePtr.i = 1; nodePtr.i <= m_max_node_id; nodePtr.i++)
27375     {
27376       jam();
27377       ptrAss(nodePtr, nodeRecord);
27378       if(nodePtr.p->nodeStatus == NodeRecord::ALIVE){
27379         Uint32 i;
27380 	for(i = 0; i<nodePtr.p->noOfStartedChkpt; i++){
27381 	  infoEvent("Node %d: started: table=%d fragment=%d replica=%d",
27382 		    nodePtr.i,
27383 		    nodePtr.p->startedChkpt[i].tableId,
27384 		    nodePtr.p->startedChkpt[i].fragId,
27385 		    nodePtr.p->startedChkpt[i].replicaPtr);
27386 	}
27387 
27388 	for(i = 0; i<nodePtr.p->noOfQueuedChkpt; i++){
27389 	  infoEvent("Node %d: queued: table=%d fragment=%d replica=%d",
27390 		    nodePtr.i,
27391 		    nodePtr.p->queuedChkpt[i].tableId,
27392 		    nodePtr.p->queuedChkpt[i].fragId,
27393 		    nodePtr.p->queuedChkpt[i].replicaPtr);
27394 	}
27395       }
27396       else
27397       {
27398 #ifdef DEBUG_LCP
27399         infoEvent("Node(%u)->nodeStatus = %u",
27400                   nodePtr.i,
27401                   nodePtr.p->nodeStatus);
27402 #endif
27403       }
27404     }
27405   }
27406 
27407   if(arg == DumpStateOrd::DihTcSumaNodeFailCompleted &&
27408      signal->getLength() == 2 &&
27409      signal->theData[1] < MAX_NDB_NODES)
27410   {
27411     jam();
27412     char buf2[8+1];
27413     NodeRecordPtr nodePtr;
27414     nodePtr.i = signal->theData[1];
27415     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
27416     infoEvent("NF Node %d tc: %d lqh: %d dih: %d dict: %d recNODE_FAILREP: %d",
27417 	      nodePtr.i,
27418 	      nodePtr.p->dbtcFailCompleted,
27419 	      nodePtr.p->dblqhFailCompleted,
27420 	      nodePtr.p->dbdihFailCompleted,
27421 	      nodePtr.p->dbdictFailCompleted,
27422 	      nodePtr.p->recNODE_FAILREP);
27423     infoEvent(" m_NF_COMPLETE_REP: %s m_nodefailSteps: %s",
27424 	      nodePtr.p->m_NF_COMPLETE_REP.getText(),
27425 	      nodePtr.p->m_nodefailSteps.getText(buf2));
27426   }
27427 
27428   if(arg == 7020 && signal->getLength() > 3)
27429   {
27430     Uint32 gsn= signal->theData[1];
27431     Uint32 block= signal->theData[2];
27432     Uint32 length= signal->length() - 3;
27433     memmove(signal->theData, signal->theData+3, 4*length);
27434     sendSignal(numberToRef(block, getOwnNodeId()), gsn, signal, length, JBB);
27435 
27436     warningEvent("-- SENDING CUSTOM SIGNAL --");
27437     char buf[100], buf2[100];
27438     buf2[0]= 0;
27439     for(Uint32 i = 0; i<length; i++)
27440     {
27441       BaseString::snprintf(buf, 100, "%s %.8x", buf2, signal->theData[i]);
27442       BaseString::snprintf(buf2, 100, "%s", buf);
27443     }
27444     warningEvent("gsn: %d block: %s, length: %d theData: %s",
27445 		 gsn, getBlockName(block, "UNKNOWN"), length, buf);
27446 
27447     g_eventLogger->warning("-- SENDING CUSTOM SIGNAL --");
27448     g_eventLogger->warning("gsn: %d block: %s, length: %d theData: %s",
27449                            gsn, getBlockName(block, "UNKNOWN"), length, buf);
27450   }
27451 
27452   if(arg == DumpStateOrd::DihDumpLCPState){
27453     infoEvent("-- Node %d LCP STATE --", getOwnNodeId());
27454     infoEvent("lcpStatus = %d (update place = %d) ",
27455 	      c_lcpState.lcpStatus, c_lcpState.lcpStatusUpdatedPlace);
27456     infoEvent
27457       ("lcpStart = %d lcpStopGcp = %d keepGci = %d oldestRestorable = %d",
27458        c_lcpState.lcpStart, c_lcpState.lcpStopGcp,
27459        c_lcpState.keepGci, c_lcpState.oldestRestorableGci);
27460 
27461     infoEvent
27462       ("immediateLcpStart = %d masterLcpNodeId = %d",
27463        c_lcpState.immediateLcpStart,
27464        refToNode(c_lcpState.m_masterLcpDihRef));
27465 
27466     for (Uint32 i = 0; i<10; i++)
27467     {
27468       infoEvent("%u : status: %u place: %u", i,
27469                 c_lcpState.m_saveState[i].m_status,
27470                 c_lcpState.m_saveState[i].m_place);
27471     }
27472 
27473     infoEvent("-- Node %d LCP STATE --", getOwnNodeId());
27474     if (refToMain(signal->getSendersBlockRef()) == DBLQH)
27475     {
27476       jam();
27477       signal->theData[0] = 7011;
27478       sendSignal(cmasterdihref, GSN_DUMP_STATE_ORD, signal, 1, JBB);
27479     }
27480   }
27481 
27482   if(arg == DumpStateOrd::DihDumpLCPMasterTakeOver){
27483     infoEvent("-- Node %d LCP MASTER TAKE OVER STATE --", getOwnNodeId());
27484     infoEvent
27485       ("c_lcpMasterTakeOverState.state = %d updatePlace = %d failedNodeId = %d",
27486        c_lcpMasterTakeOverState.state,
27487        c_lcpMasterTakeOverState.updatePlace,
27488        c_lcpMasterTakeOverState.failedNodeId);
27489 
27490     infoEvent("c_lcpMasterTakeOverState.minTableId = %u minFragId = %u",
27491 	      c_lcpMasterTakeOverState.minTableId,
27492 	      c_lcpMasterTakeOverState.minFragId);
27493 
27494     infoEvent("-- Node %d LCP MASTER TAKE OVER STATE --", getOwnNodeId());
27495   }
27496 
27497   if (signal->theData[0] == 7015)
27498   {
27499     if (signal->getLength() == 1)
27500     {
27501       signal->theData[1] = 0;
27502     }
27503 
27504     Uint32 tableId = signal->theData[1];
27505     if (tableId < ctabFileSize)
27506     {
27507       signal->theData[0] = 7021;
27508       execDUMP_STATE_ORD(signal);
27509       signal->theData[0] = 7015;
27510       signal->theData[1] = tableId + 1;
27511       sendSignal(reference(), GSN_DUMP_STATE_ORD, signal, 2, JBB);
27512     }
27513   }
27514 
27515   if(arg == DumpStateOrd::EnableUndoDelayDataWrite){
27516     g_eventLogger->info("Dbdih:: delay write of datapages for table = %d",
27517                         dumpState->args[1]);
27518     // Send this dump to ACC and TUP
27519     sendSignal(DBACC_REF, GSN_DUMP_STATE_ORD, signal, 2, JBB);
27520     sendSignal(DBTUP_REF, GSN_DUMP_STATE_ORD, signal, 2, JBB);
27521 
27522     // Start immediate LCP
27523     add_lcp_counter(&c_lcpState.ctimer, (1 << 31));
27524     if (cmasterNodeId == getOwnNodeId())
27525     {
27526       jam();
27527       c_lcpState.immediateLcpStart = true;
27528     }
27529     return;
27530   }
27531 
27532   if (signal->theData[0] == DumpStateOrd::DihAllAllowNodeStart) {
27533     for (Uint32 i = 1; i <= m_max_node_id; i++)
27534       setAllowNodeStart(i, true);
27535     return;
27536   }//if
27537   if (signal->theData[0] == DumpStateOrd::DihMinTimeBetweenLCP) {
27538     // Set time between LCP to min value
27539     if (signal->getLength() == 2)
27540     {
27541       Uint32 tmp;
27542       const ndb_mgm_configuration_iterator * p =
27543 	m_ctx.m_config.getOwnConfigIterator();
27544       ndbrequire(p != 0);
27545       ndb_mgm_get_int_parameter(p, CFG_DB_LCP_INTERVAL, &tmp);
27546       g_eventLogger->info("Reset time between LCP to %u", tmp);
27547       c_lcpState.clcpDelay = tmp;
27548     }
27549     else
27550     {
27551       g_eventLogger->info("Set time between LCP to min value");
27552       c_lcpState.clcpDelay = 0; // TimeBetweenLocalCheckpoints.min
27553     }
27554     return;
27555   }
27556   if (signal->theData[0] == DumpStateOrd::DihMaxTimeBetweenLCP) {
27557     // Set time between LCP to max value
27558     g_eventLogger->info("Set time between LCP to max value");
27559     c_lcpState.clcpDelay = 31; // TimeBetweenLocalCheckpoints.max
27560     return;
27561   }
27562 
27563   if(arg == 7098){
27564     if(signal->length() == 3){
27565       jam();
27566       infoEvent("startLcpRoundLoopLab(tabel=%d, fragment=%d)",
27567 		signal->theData[1], signal->theData[2]);
27568       startLcpRoundLoopLab(signal, signal->theData[1], signal->theData[2]);
27569       return;
27570     } else {
27571       infoEvent("Invalid no of arguments to 7098 - startLcpRoundLoopLab -"
27572 		" expected 2 (tableId, fragmentId)");
27573     }
27574   }
27575 
27576   if (arg == DumpStateOrd::DihStartLcpImmediately)
27577   {
27578     jam();
27579     if (cmasterNodeId == getOwnNodeId())
27580     {
27581       jam();
27582       c_lcpState.immediateLcpStart = true;
27583       return;
27584     }
27585 
27586     add_lcp_counter(&c_lcpState.ctimer, (1 << 31));
27587     /**
27588      * If sent from local LQH, forward to master
27589      */
27590     if (refToMain(signal->getSendersBlockRef()) == DBLQH)
27591     {
27592       jam();
27593       sendSignal(cmasterdihref, GSN_DUMP_STATE_ORD, signal, 1, JBB);
27594     }
27595     return;
27596   }
27597 
27598   if (arg == DumpStateOrd::DihSetTimeBetweenGcp)
27599   {
27600     Uint32 tmp = 0;
27601     if (signal->getLength() == 1)
27602     {
27603       const ndb_mgm_configuration_iterator * p =
27604 	m_ctx.m_config.getOwnConfigIterator();
27605       ndbrequire(p != 0);
27606       ndb_mgm_get_int_parameter(p, CFG_DB_GCP_INTERVAL, &tmp);
27607     }
27608     else
27609     {
27610       tmp = signal->theData[1];
27611     }
27612     m_gcp_save.m_master.m_time_between_gcp = tmp;
27613     g_eventLogger->info("Setting time between gcp : %d", tmp);
27614   }
27615 
27616   if (arg == 7021 && signal->getLength() == 2)
27617   {
27618     TabRecordPtr tabPtr;
27619     tabPtr.i = signal->theData[1];
27620     if (tabPtr.i >= ctabFileSize)
27621       return;
27622 
27623     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
27624 
27625     if(tabPtr.p->tabStatus != TabRecord::TS_ACTIVE)
27626       return;
27627 
27628     infoEvent
27629       ("Table %d: TabCopyStatus: %d TabUpdateStatus: %d TabLcpStatus: %d",
27630        tabPtr.i,
27631        tabPtr.p->tabCopyStatus,
27632        tabPtr.p->tabUpdateState,
27633        tabPtr.p->tabLcpStatus);
27634 
27635     FragmentstorePtr fragPtr;
27636     for (Uint32 fid = 0; fid < tabPtr.p->totalfragments; fid++) {
27637       jam();
27638       getFragstore(tabPtr.p, fid, fragPtr);
27639 
27640       char buf[100], buf2[100];
27641       BaseString::snprintf(buf, sizeof(buf), " Fragment %d: noLcpReplicas==%d ",
27642 			   fid, fragPtr.p->noLcpReplicas);
27643 
27644       Uint32 num=0;
27645       ReplicaRecordPtr replicaPtr;
27646       replicaPtr.i = fragPtr.p->storedReplicas;
27647       do {
27648         c_replicaRecordPool.getPtr(replicaPtr);
27649 	BaseString::snprintf(buf2, sizeof(buf2), "%s %d(on %d)=%d(%s)",
27650 			     buf, num,
27651 			     replicaPtr.p->procNode,
27652 			     replicaPtr.p->lcpIdStarted,
27653 			     replicaPtr.p->lcpOngoingFlag ? "Ongoing" : "Idle");
27654 	BaseString::snprintf(buf, sizeof(buf), "%s", buf2);
27655 
27656 	num++;
27657 	replicaPtr.i = replicaPtr.p->nextPool;
27658       } while (replicaPtr.i != RNIL);
27659       infoEvent("%s", buf);
27660     }
27661   }
27662 
27663   if (arg == 7022)
27664   {
27665     jam();
27666     crashSystemAtGcpStop(signal, true);
27667   }
27668 
27669   if (arg == 7025)
27670   {
27671     jam();
27672     dumpGcpStop();
27673     return;
27674   }
27675 
27676 #ifdef GCP_TIMER_HACK
27677   if (signal->theData[0] == 7901)
27678     globalData.gcp_timer_limit = signal->theData[1];
27679 #endif
27680   if (arg == 7023)
27681   {
27682     /**
27683      * Dump all active TakeOver
27684      */
27685     Ptr<TakeOverRecord> ptr;
27686     ptr.i = signal->theData[1];
27687     if (signal->getLength() == 1)
27688     {
27689       infoEvent("Starting dump all active take-over");
27690       c_masterActiveTakeOverList.first(ptr);
27691     }
27692 
27693     if (ptr.i == RNIL)
27694     {
27695       infoEvent("Dump all active take-over done");
27696       return;
27697     }
27698 
27699     c_masterActiveTakeOverList.getPtr(ptr);
27700     infoEvent("TakeOverPtr(%u) starting: %u flags: 0x%x ref: 0x%x, data: %u",
27701               ptr.i,
27702               ptr.p->toStartingNode,
27703               ptr.p->m_flags,
27704               ptr.p->m_senderRef,
27705               ptr.p->m_senderData);
27706     infoEvent("slaveState: %u masterState: %u",
27707               ptr.p->toSlaveStatus, ptr.p->toMasterStatus);
27708     infoEvent("restorableGci: %u startGci: %u tab: %u frag: %u src: %u max: %u",
27709               ptr.p->restorableGci, ptr.p->startGci,
27710               ptr.p->toCurrentTabref, ptr.p->toCurrentFragid,
27711               ptr.p->toCopyNode, ptr.p->maxPage);
27712 
27713     c_masterActiveTakeOverList.next(ptr);
27714     signal->theData[0] = arg;
27715     signal->theData[1] = ptr.i;
27716   }
27717 
27718   if (arg == DumpStateOrd::DihDumpPageRecInfo)
27719   {
27720     jam();
27721     ndbout_c("MAX_CONCURRENT_LCP_TAB_DEF_FLUSHES %u", MAX_CONCURRENT_LCP_TAB_DEF_FLUSHES);
27722     ndbout_c("MAX_CONCURRENT_DIH_TAB_DEF_OPS %u", MAX_CONCURRENT_DIH_TAB_DEF_OPS);
27723     ndbout_c("MAX_CRASHED_REPLICAS %u", MAX_CRASHED_REPLICAS);
27724     ndbout_c("MAX_LCP_STORED %u", MAX_LCP_STORED);
27725     ndbout_c("MAX_REPLICAS %u", MAX_REPLICAS);
27726     ndbout_c("MAX_NDB_PARTITIONS %u", MAX_NDB_PARTITIONS);
27727     ndbout_c("PACK_REPLICAS_WORDS %u", PACK_REPLICAS_WORDS);
27728     ndbout_c("PACK_FRAGMENT_WORDS %u", PACK_FRAGMENT_WORDS);
27729     ndbout_c("PACK_TABLE_WORDS %u", PACK_TABLE_WORDS);
27730     ndbout_c("PACK_TABLE_PAGE_WORDS %u", PACK_TABLE_PAGE_WORDS);
27731     ndbout_c("PACK_TABLE_PAGES %u", PACK_TABLE_PAGES);
27732     ndbout_c("ZPAGEREC %u", ZPAGEREC);
27733     ndbout_c("Total bytes : %lu",
27734              (unsigned long) ZPAGEREC * sizeof(PageRecord));
27735     ndbout_c("LCP Tab def write ops inUse %u queued %u",
27736              c_lcpTabDefWritesControl.inUse,
27737              c_lcpTabDefWritesControl.queuedRequests);
27738 
27739     if (getNodeState().startLevel < NodeState::SL_STARTING)
27740       return ;
27741 
27742     Uint32 freeCount = 0;
27743     PageRecordPtr tmp;
27744     tmp.i = cfirstfreepage;
27745     while (tmp.i != RNIL)
27746     {
27747       jam();
27748       ptrCheckGuard(tmp, cpageFileSize, pageRecord);
27749       freeCount++;
27750       tmp.i = tmp.p->nextfreepage;
27751     };
27752     ndbout_c("Pages in use %u/%u", cpageFileSize - freeCount, cpageFileSize);
27753     return;
27754   }
27755 
27756   if (arg == DumpStateOrd::SchemaResourceSnapshot)
27757   {
27758     RSS_OP_SNAPSHOT_SAVE(cremainingfrags);
27759     RSS_OP_SNAPSHOT_SAVE(cnoFreeReplicaRec);
27760 
27761     {
27762       Uint32 cnghash = 0;
27763       NodeGroupRecordPtr NGPtr;
27764       for (Uint32 i = 0; i<cnoOfNodeGroups; i++)
27765       {
27766         NGPtr.i = c_node_groups[i];
27767         ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
27768         cnghash = (cnghash * 33) + NGPtr.p->m_ref_count;
27769       }
27770       RSS_OP_SNAPSHOT_SAVE(cnghash);
27771     }
27772     return;
27773   }
27774 
27775   if (arg == DumpStateOrd::SchemaResourceCheckLeak)
27776   {
27777     RSS_OP_SNAPSHOT_CHECK(cremainingfrags);
27778     RSS_OP_SNAPSHOT_SAVE(cnoFreeReplicaRec);
27779 
27780     {
27781       Uint32 cnghash = 0;
27782       NodeGroupRecordPtr NGPtr;
27783       for (Uint32 i = 0; i<cnoOfNodeGroups; i++)
27784       {
27785         NGPtr.i = c_node_groups[i];
27786         ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
27787         cnghash = (cnghash * 33) + NGPtr.p->m_ref_count;
27788       }
27789       RSS_OP_SNAPSHOT_CHECK(cnghash);
27790     }
27791   }
27792 
27793   /* Checks whether add frag failure was cleaned up.
27794    * Should NOT be used while commands involving addFragReq
27795    * are being performed.
27796    */
27797   if (arg == DumpStateOrd::DihAddFragFailCleanedUp && signal->length() == 2)
27798   {
27799     jam();
27800     TabRecordPtr tabPtr;
27801     tabPtr.i = signal->theData[1];
27802     if (tabPtr.i >= ctabFileSize)
27803       return;
27804 
27805     ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
27806 
27807     if (tabPtr.p->m_new_map_ptr_i == RNIL)
27808     {
27809       jam();
27810       infoEvent("DIH : Add frag fail clean up ok for table %u", tabPtr.i);
27811     }
27812     else
27813     {
27814       jam();
27815       warningEvent("new_map_ptr_i to table id %d is not NIL", tabPtr.i);
27816       /*
27817         This ndbrequire is needed by the runFailAddPartition() test case.
27818         This dump code is *not* intended for interactive usage, as the node
27819         is likely to crash.
27820       */
27821       ndbabort();
27822     }
27823   }
27824   if (arg == DumpStateOrd::DihDisplayPauseState)
27825   {
27826     infoEvent("Pause LCP ref: %x, is_lcp_paused %u,"
27827               " c_dequeue_lcp_rep_ongoing %u",
27828               cmasterdihref,
27829               is_lcp_paused(),
27830               c_dequeue_lcp_rep_ongoing);
27831     infoEvent("c_pause_lcp_master_state: %u,",
27832               Uint32(c_pause_lcp_master_state));
27833     infoEvent("c_queued_lcp_complete_rep: %u,"
27834               " c_lcp_id_paused: %u",
27835               c_queued_lcp_complete_rep,
27836               c_lcp_id_paused);
27837     infoEvent("c_last_id_lcp_complete_rep: %u"
27838               " c_lcp_runs_with_pause_support: %u",
27839               c_last_id_lcp_complete_rep,
27840               c_lcp_runs_with_pause_support);
27841     infoEvent("c_lcp_id_while_copy_meta_data: %u, c_pause_lcp_start_node: %u",
27842               c_lcp_id_while_copy_meta_data,
27843               c_pause_lcp_start_node);
27844     infoEvent("c_PAUSE_LCP_REQ_Counter: %s",
27845               c_PAUSE_LCP_REQ_Counter.getText());
27846     infoEvent("c_FLUSH_LCP_REP_REQ_Counter: %s",
27847               c_FLUSH_LCP_REP_REQ_Counter.getText());
27848     if (isMaster())
27849     {
27850       char buf[100];
27851       infoEvent("c_lcpState.m_participatingLQH: %s",
27852                 c_lcpState.m_participatingLQH.getText(buf));
27853       infoEvent("c_pause_participants: %s",
27854                 c_pause_participants.getText(buf));
27855     }
27856   }
27857 
27858   DECLARE_DUMP0(DBDIH, 7213, "Set error 7213 with extra arg")
27859   {
27860     SET_ERROR_INSERT_VALUE2(7213, signal->theData[1]);
27861     return;
27862   }
27863   DECLARE_DUMP0(DBDIH, 7214, "Set error 7214 with extra arg")
27864   {
27865     SET_ERROR_INSERT_VALUE2(7214, signal->theData[1]);
27866     return;
27867   }
27868 
27869   DECLARE_DUMP0(DBDIH, 7216, "Set error 7216 with extra arg")
27870   {
27871     SET_ERROR_INSERT_VALUE2(7216, signal->theData[1]);
27872     return;
27873   }
27874   DECLARE_DUMP0(DBDIH, 6099, "Start microgcp")
27875   {
27876     if (isMaster())
27877     {
27878       jam();
27879       // Invalidating timestamp will force an immediate microGCP
27880       NdbTick_Invalidate(&m_micro_gcp.m_master.m_start_time);
27881     }
27882     else
27883     {
27884       jam();
27885       sendSignal(cmasterdihref, GSN_DUMP_STATE_ORD, signal, 1, JBB);
27886     }
27887     return;
27888   }
27889   DECLARE_DUMP0(DBDIH, 7999, "Set error code with extra arg")
27890   {
27891     SET_ERROR_INSERT_VALUE2(signal->theData[1],
27892                             signal->theData[2]);
27893   }
27894 
27895   if (arg == DumpStateOrd::DihSetGcpStopVals)
27896   {
27897     jam();
27898     if (signal->getLength() != 3)
27899     {
27900       jam();
27901       return;
27902     }
27903     if (signal->theData[1] == 0)
27904     {
27905       g_eventLogger->info("Changing GCP_COMMIT max_lag_millis from %u to %u",
27906                           m_gcp_monitor.m_micro_gcp.m_max_lag_ms,
27907                           signal->theData[2]);
27908       m_gcp_monitor.m_micro_gcp.m_max_lag_ms = signal->theData[2];
27909 
27910 #ifdef ERROR_INSERT
27911       m_gcp_monitor.m_micro_gcp.test_set_max_lag = true;
27912 #endif
27913     }
27914     else
27915     {
27916       g_eventLogger->info("Changing GCP_SAVE max_lag_millis from %u to %u",
27917                           m_gcp_monitor.m_gcp_save.m_max_lag_ms,
27918                           signal->theData[2]);
27919       m_gcp_monitor.m_gcp_save.m_max_lag_ms = signal->theData[2];
27920 
27921 #ifdef ERROR_INSERT
27922       m_gcp_monitor.m_gcp_save.test_set_max_lag = true;
27923 #endif
27924     }
27925     sendINFO_GCP_STOP_TIMER(signal);
27926   }
27927 
27928   if (arg == DumpStateOrd::DihStallLcpStart)
27929   {
27930     jam();
27931 
27932     if (signal->getLength() != 2)
27933     {
27934       g_eventLogger->warning("Malformed DihStallLcpStart(%u) received, ignoring",
27935                              DumpStateOrd::DihStallLcpStart);
27936       return;
27937     }
27938     const Uint32 key = signal->theData[1];
27939     if (key == 91919191)
27940     {
27941       jam();
27942       g_eventLogger->warning("DihStallLcpStart(%u) received, stalling subsequent LCP starts",
27943                              DumpStateOrd::DihStallLcpStart);
27944       c_lcpState.lcpManualStallStart = true;
27945     }
27946     else
27947     {
27948       jam();
27949       g_eventLogger->warning("DihStallLcpStart(%u) received, clearing LCP stall state (%u)",
27950                              DumpStateOrd::DihStallLcpStart,
27951                              c_lcpState.lcpManualStallStart);
27952       c_lcpState.lcpManualStallStart = false;
27953     }
27954     return;
27955   }
27956 #ifdef ERROR_INSERT
27957   if (arg == DumpStateOrd::DihSaveGcpCommitLag)
27958   {
27959     jam();
27960     m_gcp_monitor.m_savedMaxCommitLag =
27961       m_gcp_monitor.m_micro_gcp.m_max_lag_ms;
27962     g_eventLogger->info("Saving Gcp commit lag %u",
27963                         m_gcp_monitor.m_savedMaxCommitLag);
27964     return;
27965   }
27966   if (arg == DumpStateOrd::DihCheckGcpCommitLag)
27967   {
27968     jam();
27969     g_eventLogger->info("Checking Gcp commit lag (%u) == saved lag (%u)",
27970                         m_gcp_monitor.m_micro_gcp.m_max_lag_ms,
27971                         m_gcp_monitor.m_savedMaxCommitLag);
27972     ndbrequire(m_gcp_monitor.m_micro_gcp.m_max_lag_ms ==
27973                m_gcp_monitor.m_savedMaxCommitLag);
27974     return;
27975   }
27976 #endif
27977 
27978 }//Dbdih::execDUMP_STATE_ORD()
27979 
/**
 * PREP_DROP_TAB_REQ: first phase of dropping a table.
 *
 * Validates the table's current state. If the drop may proceed the
 * table is marked TS_DROPPING and PREP_DROP_TAB_CONF is returned to
 * the sender; otherwise PREP_DROP_TAB_REF carries a matching error
 * code back.
 */
void
Dbdih::execPREP_DROP_TAB_REQ(Signal* signal){
  jamEntry();

  PrepDropTabReq* req = (PrepDropTabReq*)signal->getDataPtr();

  TabRecordPtr tabPtr;
  tabPtr.i = req->tableId;
  ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

  Uint32 senderRef = req->senderRef;
  Uint32 senderData = req->senderData;

  PrepDropTabRef::ErrorCode err = PrepDropTabRef::OK;
  { /**
     * Check table state
     *
     * TS_IDLE                 -> NoSuchTable (nothing to drop)
     * TS_DROPPING             -> PrepDropInProgress (drop already started)
     * TS_CREATING / TS_ACTIVE -> OK, the drop may begin
     * Any other state is a protocol violation -> ndbrequire crashes.
     */
    bool ok = false;
    switch(tabPtr.p->tabStatus){
    case TabRecord::TS_IDLE:
      ok = true;
      jam();
      err = PrepDropTabRef::NoSuchTable;
      break;
    case TabRecord::TS_DROPPING:
      ok = true;
      jam();
      err = PrepDropTabRef::PrepDropInProgress;
      break;
    case TabRecord::TS_CREATING:
      jam();
      ok = true;
      break;
    case TabRecord::TS_ACTIVE:
      ok = true;
      jam();
      break;
    default:
      break;
    }
    ndbrequire(ok);
  }

  if(err != PrepDropTabRef::OK)
  {
    jam();
    /* Reject: echo sender data back with the error code */
    PrepDropTabRef* ref = (PrepDropTabRef*)signal->getDataPtrSend();
    ref->senderRef = reference();
    ref->senderData = senderData;
    ref->tableId = tabPtr.i;
    ref->errorCode = err;
    sendSignal(senderRef, GSN_PREP_DROP_TAB_REF, signal,
               PrepDropTabRef::SignalLength, JBB);
    return;
  }

  /**
   * When we come here DBTC is already aware of the table being dropped,
   * so no requests for the table will arrive after this from DBTC, so
   * no need to protect this variable here, it is protected by the
   * signalling order of drop table signals instead.
   */
  tabPtr.p->tabStatus = TabRecord::TS_DROPPING;

  PrepDropTabConf* conf = (PrepDropTabConf*)signal->getDataPtrSend();
  conf->tableId = tabPtr.i;
  conf->senderRef = reference();
  conf->senderData = senderData;
  sendSignal(senderRef, GSN_PREP_DROP_TAB_CONF,
             signal, PrepDropTabConf::SignalLength, JBB);
}
28051 
28052 void
waitDropTabWritingToFile(Signal * signal,TabRecordPtr tabPtr)28053 Dbdih::waitDropTabWritingToFile(Signal* signal, TabRecordPtr tabPtr){
28054 
28055   if (tabPtr.p->tabLcpStatus == TabRecord::TLS_WRITING_TO_FILE)
28056   {
28057     jam();
28058     signal->theData[0] = DihContinueB::WAIT_DROP_TAB_WRITING_TO_FILE;
28059     signal->theData[1] = tabPtr.i;
28060     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
28061                         WaitTableStateChangeMillis, 2);
28062     return;
28063   }
28064 
28065   if (tabPtr.p->tabUpdateState != TabRecord::US_IDLE)
28066   {
28067     jam();
28068     signal->theData[0] = DihContinueB::WAIT_DROP_TAB_WRITING_TO_FILE;
28069     signal->theData[1] = tabPtr.i;
28070     sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
28071                         WaitTableStateChangeMillis, 2);
28072     return;
28073   }
28074 
28075   ndbrequire(tabPtr.p->tabLcpStatus ==  TabRecord::TLS_COMPLETED);
28076   checkDropTabComplete(signal, tabPtr);
28077 }
28078 
/**
 * All table-state waiting for the drop is done (see
 * waitDropTabWritingToFile); continue by deleting the table file.
 */
void
Dbdih::checkDropTabComplete(Signal* signal, TabRecordPtr tabPtr)
{
  startDeleteFile(signal, tabPtr);
}
28084 
28085 void
execNDB_TAMPER(Signal * signal)28086 Dbdih::execNDB_TAMPER(Signal* signal)
28087 {
28088   if ((ERROR_INSERTED(7011)) &&
28089       (signal->theData[0] == 7012)) {
28090     CLEAR_ERROR_INSERT_VALUE;
28091     calculateKeepGciLab(signal, 0, 0);
28092     return;
28093   }//if
28094   if (signal->getLength() == 1)
28095   {
28096     SET_ERROR_INSERT_VALUE2(signal->theData[0],
28097                             0);
28098   }
28099   else
28100   {
28101     SET_ERROR_INSERT_VALUE2(signal->theData[0],
28102                             signal->theData[1]);
28103   }
28104   return;
28105 }//Dbdih::execNDB_TAMPER()
28106 
execBLOCK_COMMIT_ORD(Signal * signal)28107 void Dbdih::execBLOCK_COMMIT_ORD(Signal* signal){
28108   BlockCommitOrd* const block = (BlockCommitOrd *)&signal->theData[0];
28109 
28110   jamEntry();
28111 
28112   c_blockCommit = true;
28113   c_blockCommitNo = block->failNo;
28114 }
28115 
execUNBLOCK_COMMIT_ORD(Signal * signal)28116 void Dbdih::execUNBLOCK_COMMIT_ORD(Signal* signal){
28117   UnblockCommitOrd* const unblock = (UnblockCommitOrd *)&signal->theData[0];
28118   (void)unblock;
28119 
28120   jamEntry();
28121 
28122   if(c_blockCommit == true)
28123   {
28124     jam();
28125 
28126     c_blockCommit = false;
28127     for (Uint32 i = 0; i<c_diverify_queue_cnt; i++)
28128     {
28129       c_diverify_queue[i].m_empty_done = 0;
28130       emptyverificbuffer(signal, i, true);
28131     }
28132   }
28133 }
28134 
/**
 * STOP_PERM_REQ: request permission for a graceful node shutdown.
 *
 * On a non-master node the request is proxied: the client is recorded
 * in c_stopPermProxy and the request forwarded to the master (replies
 * are relayed back in execSTOP_PERM_REF / execSTOP_PERM_CONF).
 *
 * On the master the shutdown is serialized against other shutdowns
 * and node starts, then the switch-primary mutex is taken; processing
 * continues in switch_primary_stop_node once the lock is held.
 */
void Dbdih::execSTOP_PERM_REQ(Signal* signal){

  jamEntry();

  StopPermReq* const req = (StopPermReq*)&signal->theData[0];
  StopPermRef* const ref = (StopPermRef*)&signal->theData[0];

  const Uint32 senderData = req->senderData;
  const BlockReference senderRef = req->senderRef;
  const NodeId nodeId = refToNode(senderRef);

  if (isMaster()) {
    /**
     * Master
     */
    jam();
    CRASH_INSERTION(7065);
    if (c_stopPermMaster.clientRef != 0) {
      jam();

      /* Another node shutdown is already being handled - reject */
      ref->senderData = senderData;
      ref->errorCode  = StopPermRef::NodeShutdownInProgress;
      sendSignal(senderRef, GSN_STOP_PERM_REF, signal,
                 StopPermRef::SignalLength, JBB);
      return;
    }//if

    if (c_nodeStartMaster.activeState) {
      jam();
      /* A node start is in progress - cannot allow a stop now */
      ref->senderData = senderData;
      ref->errorCode  = StopPermRef::NodeStartInProgress;
      sendSignal(senderRef, GSN_STOP_PERM_REF, signal,
                 StopPermRef::SignalLength, JBB);
      return;
    }//if

    /**
     * Lock
     * (activeState also blocks node starts until this shutdown is done)
     */
    c_nodeStartMaster.activeState = true;
    c_stopPermMaster.clientRef = senderRef;

    c_stopPermMaster.clientData = senderData;
    c_stopPermMaster.returnValue = 0;
    c_switchReplicas.clear();

    /* Continue in switch_primary_stop_node when the mutex is granted */
    Mutex mutex(signal, c_mutexMgr, c_switchPrimaryMutexHandle);
    Callback c = { safe_cast(&Dbdih::switch_primary_stop_node), nodeId };
    ndbrequire(mutex.lock(c));
  } else {
    /**
     * Proxy part
     */
    jam();
    CRASH_INSERTION(7066);
    if(c_stopPermProxy.clientRef != 0){
      jam();
      /* Only one outstanding proxied request is allowed at a time */
      ref->senderData = senderData;
      ref->errorCode = StopPermRef::NodeShutdownInProgress;
      sendSignal(senderRef, GSN_STOP_PERM_REF, signal, 2, JBB);
      return;
    }//if

    /* Remember the client so the master's reply can be relayed back */
    c_stopPermProxy.clientRef = senderRef;
    c_stopPermProxy.masterRef = cmasterdihref;
    c_stopPermProxy.clientData = senderData;

    req->senderRef = reference();
    req->senderData = senderData;
    sendSignal(cmasterdihref, GSN_STOP_PERM_REQ, signal,
               StopPermReq::SignalLength, JBB);
  }//if
}//Dbdih::execSTOP_PERM_REQ()
28208 
28209 void
switch_primary_stop_node(Signal * signal,Uint32 node_id,Uint32 ret_val)28210 Dbdih::switch_primary_stop_node(Signal* signal, Uint32 node_id, Uint32 ret_val)
28211 {
28212   ndbrequire(ret_val == 0);
28213   signal->theData[0] = DihContinueB::SwitchReplica;
28214   signal->theData[1] = node_id;
28215   signal->theData[2] = 0; // table id
28216   signal->theData[3] = 0; // fragment id
28217   sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
28218 }
28219 
execSTOP_PERM_REF(Signal * signal)28220 void Dbdih::execSTOP_PERM_REF(Signal* signal)
28221 {
28222   jamEntry();
28223   ndbrequire(c_stopPermProxy.clientRef != 0);
28224   ndbrequire(c_stopPermProxy.masterRef == signal->senderBlockRef());
28225   sendSignal(c_stopPermProxy.clientRef, GSN_STOP_PERM_REF, signal, 2, JBB);
28226   c_stopPermProxy.clientRef = 0;
28227 }//Dbdih::execSTOP_PERM_REF()
28228 
execSTOP_PERM_CONF(Signal * signal)28229 void Dbdih::execSTOP_PERM_CONF(Signal* signal)
28230 {
28231   jamEntry();
28232   ndbrequire(c_stopPermProxy.clientRef != 0);
28233   ndbrequire(c_stopPermProxy.masterRef == signal->senderBlockRef());
28234   sendSignal(c_stopPermProxy.clientRef, GSN_STOP_PERM_CONF, signal, 1, JBB);
28235   c_stopPermProxy.clientRef = 0;
28236 }//Dbdih::execSTOP_PERM_CONF()
28237 
execDIH_SWITCH_REPLICA_REQ(Signal * signal)28238 void Dbdih::execDIH_SWITCH_REPLICA_REQ(Signal* signal)
28239 {
28240   jamEntry();
28241   DihSwitchReplicaReq* const req = (DihSwitchReplicaReq*)&signal->theData[0];
28242   const Uint32 tableId = req->tableId;
28243   const Uint32 fragNo = req->fragNo;
28244   const BlockReference senderRef = req->senderRef;
28245 
28246   CRASH_INSERTION(7067);
28247   TabRecordPtr tabPtr;
28248   tabPtr.i = tableId;
28249   ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);
28250 
28251   ndbrequire(tabPtr.p->tabStatus == TabRecord::TS_ACTIVE);
28252   if (tabPtr.p->tabCopyStatus != TabRecord::CS_IDLE) {
28253     jam();
28254     sendSignal(reference(), GSN_DIH_SWITCH_REPLICA_REQ, signal,
28255 	       DihSwitchReplicaReq::SignalLength, JBB);
28256     return;
28257   }//if
28258   FragmentstorePtr fragPtr;
28259   getFragstore(tabPtr.p, fragNo, fragPtr);
28260 
28261   /**
28262    * Do funky stuff
28263    */
28264   Uint32 oldOrder[MAX_REPLICAS];
28265   const Uint32 noOfReplicas = extractNodeInfo(jamBuffer(),
28266                                               fragPtr.p,
28267                                               oldOrder);
28268 
28269   if (noOfReplicas < req->noOfReplicas) {
28270     jam();
28271     //---------------------------------------------------------------------
28272     // A crash occurred in the middle of our switch handling.
28273     //---------------------------------------------------------------------
28274     DihSwitchReplicaRef* const ref = (DihSwitchReplicaRef*)&signal->theData[0];
28275     ref->senderNode = cownNodeId;
28276     ref->errorCode = StopPermRef::NF_CausedAbortOfStopProcedure;
28277     sendSignal(senderRef, GSN_DIH_SWITCH_REPLICA_REF, signal,
28278                DihSwitchReplicaRef::SignalLength, JBB);
28279   }//if
28280 
28281   make_table_use_new_node_order(tabPtr,
28282                                 fragPtr,
28283                                 noOfReplicas,
28284                                 &req->newNodeOrder[0]);
28285 
28286   /**
28287    * Reply
28288    */
28289   DihSwitchReplicaConf* const conf = (DihSwitchReplicaConf*)&signal->theData[0];
28290   conf->senderNode = cownNodeId;
28291   sendSignal(senderRef, GSN_DIH_SWITCH_REPLICA_CONF, signal,
28292              DihSwitchReplicaConf::SignalLength, JBB);
28293 }//Dbdih::execDIH_SWITCH_REPLICA_REQ()
28294 
execDIH_SWITCH_REPLICA_CONF(Signal * signal)28295 void Dbdih::execDIH_SWITCH_REPLICA_CONF(Signal* signal)
28296 {
28297   jamEntry();
28298   /**
28299    * Response to master
28300    */
28301   CRASH_INSERTION(7068);
28302   DihSwitchReplicaConf* const conf = (DihSwitchReplicaConf*)&signal->theData[0];
28303   switchReplicaReply(signal, conf->senderNode);
28304 }//Dbdih::execDIH_SWITCH_REPLICA_CONF()
28305 
execDIH_SWITCH_REPLICA_REF(Signal * signal)28306 void Dbdih::execDIH_SWITCH_REPLICA_REF(Signal* signal)
28307 {
28308   jamEntry();
28309   DihSwitchReplicaRef* const ref = (DihSwitchReplicaRef*)&signal->theData[0];
28310   if(c_stopPermMaster.returnValue == 0){
28311     jam();
28312     c_stopPermMaster.returnValue = ref->errorCode;
28313   }//if
28314   switchReplicaReply(signal, ref->senderNode);
28315 }//Dbdih::execDIH_SWITCH_REPLICA_REF()
28316 
/**
 * switchReplicaReply
 *
 * Collect one DIH_SWITCH_REPLICA reply from node `nodeId`.
 * receiveLoopMacro makes this function return early until all
 * participating nodes have replied; once all replies are in, the scan
 * continues with the next fragment via CONTINUEB.
 */
void Dbdih::switchReplicaReply(Signal* signal,
                               NodeId nodeId){
  jam();
  receiveLoopMacro(DIH_SWITCH_REPLICA_REQ, nodeId);
  //------------------------------------------------------
  // We have received all responses from the nodes. Thus
  // we have completed switching replica roles. Continue
  // with the next fragment.
  //------------------------------------------------------
  if(c_stopPermMaster.returnValue != 0){
    jam();
    // A node reported an error: push tableId past the end so that
    // switchReplica() terminates the scan and answers STOP_PERM_REF.
    c_switchReplicas.tableId = ctabFileSize + 1;
  }//if
  c_switchReplicas.fragNo++;

  // Resume the scan at the next fragment position.
  signal->theData[0] = DihContinueB::SwitchReplica;
  signal->theData[1] = c_switchReplicas.nodeId;
  signal->theData[2] = c_switchReplicas.tableId;
  signal->theData[3] = c_switchReplicas.fragNo;
  sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
}//Dbdih::switchReplicaReply()
28338 
/**
 * switchReplica
 *
 * Scan all tables/fragments looking for fragments where the stopping
 * node (`nodeId`) is currently the primary replica; for each such
 * fragment, start a DIH_SWITCH_REPLICA_REQ round that rotates the node
 * to the end of the replica order.  The scan runs in RT_BREAK-sized
 * steps driven by CONTINUEB to avoid hogging the scheduler.  When
 * tableId passes ctabFileSize the procedure is done and STOP_PERM_CONF
 * (or STOP_PERM_REF on error) is sent to the client.
 */
void
Dbdih::switchReplica(Signal* signal,
		     Uint32 nodeId,
		     Uint32 tableId,
		     Uint32 fragNo){
  jam();
  DihSwitchReplicaReq* const req = (DihSwitchReplicaReq*)&signal->theData[0];

  // Max table/fragment positions inspected before taking a real-time break
  const Uint32 RT_BREAK = 64;

  for (Uint32 i = 0; i < RT_BREAK; i++) {
    jam();
    if (tableId >= ctabFileSize) {
      jam();
      StopPermConf* const conf = (StopPermConf*)&signal->theData[0];
      StopPermRef*  const ref  = (StopPermRef*)&signal->theData[0];
      /**
       * Finished with all tables
       */
      if(c_stopPermMaster.returnValue == 0) {
	jam();
	conf->senderData = c_stopPermMaster.clientData;
	sendSignal(c_stopPermMaster.clientRef, GSN_STOP_PERM_CONF,
		   signal, 1, JBB);
      } else {
        jam();
        // An error was recorded by switchReplicaReply: refuse the request
        ref->senderData = c_stopPermMaster.clientData;
        ref->errorCode  = c_stopPermMaster.returnValue;
        sendSignal(c_stopPermMaster.clientRef, GSN_STOP_PERM_REF, signal, 2,JBB);
      }//if

      /**
       * UnLock: release the master state taken in execSTOP_PERM_REQ and
       * the switch-primary mutex.
       */
      c_nodeStartMaster.activeState = false;
      c_stopPermMaster.clientRef = 0;
      c_stopPermMaster.clientData = 0;
      c_stopPermMaster.returnValue = 0;
      Mutex mutex(signal, c_mutexMgr, c_switchPrimaryMutexHandle);
      mutex.unlock(); // ignore result
      return;
    }//if

    TabRecordPtr tabPtr;
    tabPtr.i = tableId;
    ptrCheckGuard(tabPtr, ctabFileSize, tabRecord);

    // Skip tables that are not active
    if (tabPtr.p->tabStatus != TabRecord::TS_ACTIVE) {
      jam();
      tableId++;
      fragNo = 0;
      continue;
    }//if
    // Past the last fragment of this table: move on to the next table
    if (fragNo >= tabPtr.p->totalfragments) {
      jam();
      tableId++;
      fragNo = 0;
      continue;
    }//if
    FragmentstorePtr fragPtr;
    getFragstore(tabPtr.p, fragNo, fragPtr);

    Uint32 oldOrder[MAX_REPLICAS];
    const Uint32 noOfReplicas = extractNodeInfo(jamBuffer(),
                                                fragPtr.p,
                                                oldOrder);

    // Only fragments where the stopping node is primary need switching
    if(oldOrder[0] != nodeId) {
      jam();
      fragNo++;
      continue;
    }//if
    // Rotate the order: old primary goes last, the rest move up one step
    req->tableId = tableId;
    req->fragNo = fragNo;
    req->noOfReplicas = noOfReplicas;
    for (Uint32 i = 0; i < (noOfReplicas - 1); i++) {
      req->newNodeOrder[i] = oldOrder[i+1];
    }//for
    req->newNodeOrder[noOfReplicas-1] = nodeId;
    req->senderRef = reference();

    /**
     * Initialize struct: progress state consumed by switchReplicaReply()
     */
    c_switchReplicas.tableId = tableId;
    c_switchReplicas.fragNo = fragNo;
    c_switchReplicas.nodeId = nodeId;

    sendLoopMacro(DIH_SWITCH_REPLICA_REQ, sendDIH_SWITCH_REPLICA_REQ, RNIL);
    return;
  }//for

  // Real-time break: continue the scan later from (tableId, fragNo)
  signal->theData[0] = DihContinueB::SwitchReplica;
  signal->theData[1] = nodeId;
  signal->theData[2] = tableId;
  signal->theData[3] = fragNo;
  sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
}//Dbdih::switchReplica()
28437 
/**
 * STOP_ME_REQ
 *
 * Part of graceful node shutdown.  Every receiving DIH marks the
 * stopping node as not usable.  On the stopping node itself the request
 * is additionally fanned out to all other nodes, and completion is
 * tracked through STOP_ME_CONF (see execSTOP_ME_CONF).
 */
void Dbdih::execSTOP_ME_REQ(Signal* signal)
{
  jamEntry();
  StopMeReq* const req = (StopMeReq*)&signal->theData[0];
  const BlockReference senderRef = req->senderRef;
  const Uint32 senderData = req->senderData;
  const Uint32 nodeId = refToNode(senderRef);
  {
    /**
     * Set node dead (remove from operations)
     */
    NodeRecordPtr nodePtr;
    nodePtr.i = nodeId;
    ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
    make_node_not_usable(nodePtr.p);
  }
  if (nodeId != getOwnNodeId()) {
    jam();
    // Request originated in another (stopping) node: just acknowledge.
    StopMeConf * const stopMeConf = (StopMeConf *)&signal->theData[0];
    stopMeConf->senderData = senderData;
    stopMeConf->senderRef  = reference();
    sendSignal(senderRef, GSN_STOP_ME_CONF, signal,
	       StopMeConf::SignalLength, JBB);
    return;
  }//if

  /**
   * Local signal: we are the stopping node.  Broadcast STOP_ME_REQ to
   * all nodes and collect their confirmations.
   */
  jam();
  ndbrequire(c_stopMe.clientRef == 0); // only one STOP_ME at a time

  c_stopMe.clientData  = senderData;
  c_stopMe.clientRef   = senderRef;

  req->senderData = senderData;
  req->senderRef  = reference();

  sendLoopMacro(STOP_ME_REQ, sendSTOP_ME_REQ, RNIL);

  /**
   * Send conf to self (we participate in the receive loop as well)
   */
  StopMeConf * const stopMeConf = (StopMeConf *)&signal->theData[0];
  stopMeConf->senderData = senderData;
  stopMeConf->senderRef  = reference();
  sendSignal(reference(), GSN_STOP_ME_CONF, signal,
	     StopMeConf::SignalLength, JBB);
}//Dbdih::execSTOP_ME_REQ()
28487 
// STOP_ME_REQ is always confirmed, never refused; a REF indicates a
// protocol violation, so crash deliberately.
void Dbdih::execSTOP_ME_REF(Signal* signal)
{
  ndbabort();
}
28492 
/**
 * STOP_ME_CONF
 *
 * Collects STOP_ME confirmations from all nodes (including ourselves).
 * receiveLoopMacro makes this function return early until every node
 * has replied; the final confirmation is then forwarded to the block
 * that originally requested STOP_ME.
 */
void Dbdih::execSTOP_ME_CONF(Signal* signal)
{
  jamEntry();
  StopMeConf * const stopMeConf = (StopMeConf *)&signal->theData[0];

  const Uint32 senderRef  = stopMeConf->senderRef;
  const Uint32 senderData = stopMeConf->senderData;
  const Uint32 nodeId     = refToNode(senderRef);

  ndbrequire(c_stopMe.clientRef != 0);
  ndbrequire(c_stopMe.clientData == senderData);

  receiveLoopMacro(STOP_ME_REQ, nodeId);
  //---------------------------------------------------------
  // All STOP_ME_REQ have been received. We will send the
  // confirmation back to the requesting block.
  //---------------------------------------------------------

  stopMeConf->senderRef = reference();
  stopMeConf->senderData = c_stopMe.clientData;
  sendSignal(c_stopMe.clientRef, GSN_STOP_ME_CONF, signal,
	     StopMeConf::SignalLength, JBB);
  c_stopMe.clientRef = 0; // protocol finished, slot free again
}//Dbdih::execSTOP_ME_CONF()
28517 
28518 void
sendREDO_STATE_REP_to_all(Signal * signal,Uint32 block,bool send_to_all)28519 Dbdih::sendREDO_STATE_REP_to_all(Signal *signal,
28520                                  Uint32 block,
28521                                  bool send_to_all)
28522 {
28523   NodeRecordPtr nodePtr;
28524   nodePtr.i = cfirstAliveNode;
28525   jam();
28526   do
28527   {
28528     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
28529     if (nodePtr.p->copyCompleted || send_to_all)
28530     {
28531       BlockReference ref = numberToRef(block, nodePtr.i);
28532       if (ndbd_enable_redo_control(getNodeInfo(nodePtr.i).m_version))
28533       {
28534         jamLine(nodePtr.i);
28535         sendSignal(ref, GSN_REDO_STATE_REP, signal, 2, JBB);
28536       }
28537       else
28538       {
28539         jamLine(nodePtr.i);
28540       }
28541     }
28542     nodePtr.i = nodePtr.p->nextNode;
28543   } while (nodePtr.i != RNIL);
28544 }
28545 
28546 RedoStateRep::RedoAlertState
get_global_redo_alert_state()28547 Dbdih::get_global_redo_alert_state()
28548 {
28549   RedoStateRep::RedoAlertState redo_alert_state = RedoStateRep::NO_REDO_ALERT;
28550   NodeRecordPtr nodePtr;
28551   nodePtr.i = cfirstAliveNode;
28552   jam();
28553   do
28554   {
28555     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
28556     if (m_node_redo_alert_state[nodePtr.i] > redo_alert_state &&
28557         nodePtr.p->copyCompleted)
28558     {
28559       jamLine(nodePtr.i);
28560       redo_alert_state = m_node_redo_alert_state[nodePtr.i];
28561     }
28562     nodePtr.i = nodePtr.p->nextNode;
28563   } while (nodePtr.i != RNIL);
28564   return redo_alert_state;
28565 }
28566 
execREDO_STATE_REP(Signal * signal)28567 void Dbdih::execREDO_STATE_REP(Signal* signal)
28568 {
28569   RedoStateRep* rep = (RedoStateRep*)&signal->theData[0];
28570   if (rep->receiverInfo == RedoStateRep::ToLocalDih)
28571   {
28572     /**
28573      * Our local REDO alert state have changed. We should send
28574      * this information to all alive DIH nodes.
28575      */
28576     jam();
28577     rep->receiverInfo = RedoStateRep::ToAllDih;
28578     sendREDO_STATE_REP_to_all(signal, DBDIH, true);
28579   }
28580   else
28581   {
28582     /**
28583      * We received a new REDO alert state from a node. We record
28584      * this information. Only if we are the master will we
28585      * send this information onward to all the NDBCNTR of the live
28586      * nodes. In addition only send this onwards when the global
28587      * state have changed.
28588      */
28589     jam();
28590     RedoStateRep::RedoAlertState new_global_redo_alert_state;
28591     ndbrequire(rep->receiverInfo == RedoStateRep::ToAllDih);
28592     BlockReference sender = signal->senderBlockRef();
28593     Uint32 node_id_sender = refToNode(sender);
28594     m_node_redo_alert_state[node_id_sender] =
28595       (RedoStateRep::RedoAlertState)rep->redoState;
28596     new_global_redo_alert_state = get_global_redo_alert_state();
28597     if (isMaster() &&
28598         new_global_redo_alert_state != m_global_redo_alert_state)
28599     {
28600       jam();
28601       DEB_REDO_CONTROL(("Send out new REDO alert state: %u",
28602                         (Uint32)new_global_redo_alert_state));
28603       m_global_redo_alert_state = new_global_redo_alert_state;
28604       rep->receiverInfo = RedoStateRep::ToNdbcntr;
28605       rep->redoState = (RedoStateRep::RedoAlertState)m_global_redo_alert_state;
28606       sendREDO_STATE_REP_to_all(signal, NDBCNTR, false);
28607     }
28608   }
28609 }
28610 
execWAIT_GCP_REQ(Signal * signal)28611 void Dbdih::execWAIT_GCP_REQ(Signal* signal)
28612 {
28613   jamEntry();
28614   WaitGCPReq* const req = (WaitGCPReq*)&signal->theData[0];
28615   WaitGCPRef* const ref = (WaitGCPRef*)&signal->theData[0];
28616   WaitGCPConf* const conf = (WaitGCPConf*)&signal->theData[0];
28617   const Uint32 senderData = req->senderData;
28618   const BlockReference senderRef = req->senderRef;
28619   const Uint32 requestType = req->requestType;
28620   Uint32 errorCode = 0;
28621   if(ERROR_INSERTED(7247))
28622   {
28623     ndbout_c("Delaying WAIT_GCP_REQ");
28624     sendSignalWithDelay(reference(), GSN_WAIT_GCP_REQ, signal, 1000,
28625                         signal->getLength());
28626     return;
28627   }
28628 
28629   if(requestType == WaitGCPReq::CurrentGCI)
28630   {
28631     jam();
28632     conf->senderData = senderData;
28633     conf->gci_hi = Uint32(m_micro_gcp.m_current_gci >> 32);
28634     conf->gci_lo = Uint32(m_micro_gcp.m_current_gci);
28635     conf->blockStatus = cgcpOrderBlocked;
28636     sendSignal(senderRef, GSN_WAIT_GCP_CONF, signal,
28637 	       WaitGCPConf::SignalLength, JBB);
28638     return;
28639   }//if
28640 
28641   if(requestType == WaitGCPReq::RestartGCI)
28642   {
28643     jam();
28644     conf->senderData = senderData;
28645     conf->gci_hi = Uint32(crestartGci);
28646     conf->gci_lo = 0;
28647     conf->blockStatus = cgcpOrderBlocked;
28648     sendSignal(senderRef, GSN_WAIT_GCP_CONF, signal,
28649 	       WaitGCPConf::SignalLength, JBB);
28650     return;
28651   }//if
28652 
28653   if (requestType == WaitGCPReq::BlockStartGcp)
28654   {
28655     jam();
28656     conf->senderData = senderData;
28657     conf->gci_hi = Uint32(m_micro_gcp.m_current_gci >> 32);
28658     conf->gci_lo = Uint32(m_micro_gcp.m_current_gci);
28659     conf->blockStatus = cgcpOrderBlocked;
28660     sendSignal(senderRef, GSN_WAIT_GCP_CONF, signal,
28661 	       WaitGCPConf::SignalLength, JBB);
28662     cgcpOrderBlocked = 1;
28663     return;
28664   }
28665 
28666   if (requestType == WaitGCPReq::UnblockStartGcp)
28667   {
28668     jam();
28669     conf->senderData = senderData;
28670     conf->gci_hi = Uint32(m_micro_gcp.m_current_gci >> 32);
28671     conf->gci_lo = Uint32(m_micro_gcp.m_current_gci);
28672     conf->blockStatus = cgcpOrderBlocked;
28673     sendSignal(senderRef, GSN_WAIT_GCP_CONF, signal,
28674 	       WaitGCPConf::SignalLength, JBB);
28675     cgcpOrderBlocked = 0;
28676     return;
28677   }
28678 
28679   ndbassert(requestType == WaitGCPReq::Complete ||
28680             requestType == WaitGCPReq::CompleteForceStart ||
28681             requestType == WaitGCPReq::CompleteIfRunning ||
28682             requestType == WaitGCPReq::WaitEpoch ||
28683             requestType == WaitGCPReq::ShutdownSync);
28684 
28685   /**
28686    * At this point, we wish to wait for some GCP/Epoch related
28687    * event
28688    *
28689    * Complete           : Wait for the next GCI completion,
28690    *                      and return its identity
28691    * CompleteForceStart : Same as complete, but force a GCI to
28692    *                      start ASAP
28693    * CompleteIfRunning  : Wait for any running GCI to complete
28694    *                      Return latest completed GCI
28695    * WaitEpoch          : Wait for the next epoch completion,
28696    *                      and return its identity
28697    * ShutdownSync       : Wait for all running nodes to request
28698    *                      the same, then wait for the next
28699    *                      GCI completion, and return its
28700    *                      identity
28701    *
28702    * Notes
28703    *   For GCIs, the 'next' GCI is generally next GCI to *start*
28704    *   after the WAIT_GCP_REQ is received.
28705    *   This is generally used to ensure that changes prior to
28706    *   WAIT_GCP_REQ are included in the GCI, which requires
28707    *   that any currently open epoch be included in the GCI
28708    *   waited for.
28709    *   Special care is required during epoch transitions.
28710    *
28711    *   Note that epochs are started and completed by the
28712    *   GCP_PREPARE/GCP_COMMIT protocols, but GCIs are completed
28713    *   by the GCP_SAVEREQ et al protocols.
28714    *   GCI completion is triggered as part of GCP_COMMIT processing,
28715    *   but does not stall further GCP_PREPARE/COMMIT rounds.
28716    *
28717    *   CompleteIfRunning waits for any running GCP_SAVEREQ,
28718    *   it is not currently checking GCP_PREPARE/COMMIT status
28719    *
28720    */
28721   if(isMaster())
28722   {
28723     /**
28724      * Master
28725      */
28726 
28727     if (!isActiveMaster())
28728     {
28729       ndbassert(cmasterState == MASTER_TAKE_OVER_GCP);
28730       errorCode = WaitGCPRef::NF_MasterTakeOverInProgress;
28731       goto error;
28732     }
28733 
28734     /**
28735      * Beware here :
28736      *   - GCP_SAVE and GCP_PREPARE/COMMIT can run
28737      *     concurrently
28738      *   - GCP_SAVE can be running concurrently for
28739      *     quite an 'old' epoch
28740      *   - Care must be taken in each use case to
28741      *     understand the significance of the
28742      *     current state ('now')  when WAIT_GCP_REQ
28743      *     reaches the Master
28744      */
28745     if((requestType == WaitGCPReq::CompleteIfRunning) &&
28746        (m_gcp_save.m_master.m_state == GcpSave::GCP_SAVE_IDLE))
28747     {
28748       jam();
28749       /* No GCP_SAVE running, return last durable GCI */
28750       conf->senderData = senderData;
28751       conf->gci_hi = Uint32(m_micro_gcp.m_old_gci >> 32);
28752       conf->gci_lo = Uint32(m_micro_gcp.m_old_gci);
28753       conf->blockStatus = cgcpOrderBlocked;
28754       sendSignal(senderRef, GSN_WAIT_GCP_CONF, signal,
28755 		 WaitGCPConf::SignalLength, JBB);
28756       return;
28757     }//if
28758 
28759     WaitGCPMasterPtr ptr;
28760     WaitGCPList * list = &c_waitGCPMasterList;
28761     if (requestType == WaitGCPReq::WaitEpoch)
28762     {
28763       jam();
28764       list = &c_waitEpochMasterList;
28765     }
28766 
28767     if (list->seizeFirst(ptr) == false)
28768     {
28769       jam();
28770       errorCode = WaitGCPRef::NoWaitGCPRecords;
28771       goto error;
28772       return;
28773     }
28774 
28775     ptr.p->clientRef = senderRef;
28776     ptr.p->clientData = senderData;
28777 
28778     switch (requestType)
28779     {
28780     case WaitGCPReq::WaitEpoch:
28781     {
28782       /* Wait for the next epoch completion (GCP_PREPARE/COMMIT) */
28783       ptr.p->waitGCI = 0;
28784       break;
28785     }
28786     case WaitGCPReq::CompleteIfRunning:
28787     {
28788       ndbrequire(m_gcp_save.m_master.m_state != GcpSave::GCP_SAVE_IDLE);
28789       /* Wait for GCI currently being saved to complete */
28790       ptr.p->waitGCI = m_gcp_save.m_gci;
28791       break;
28792     }
28793     case WaitGCPReq::Complete:
28794     case WaitGCPReq::CompleteForceStart:
28795     {
28796       /**
28797        * We need to block until the highest known epoch
28798        * in the cluster at *this* time has been included
28799        * in a subsequent GCP_SAVE round, then return that
28800        * complete, saved GCI to the requestor.
28801        * If we are not changing epochs then we wait for
28802        * a GCI containing the current epoch.
28803        * If we are changing epochs then we wait for a GCI
28804        * containing the next epoch.
28805        */
28806       ptr.p->waitGCI = Uint32(m_micro_gcp.m_current_gci >> 32);
28807       DEB_NODE_STOP(("waitGCI = %u",
28808                      Uint32(m_micro_gcp.m_current_gci >> 32)));
28809 
28810       if (m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_COMMIT)
28811       {
28812         jam();
28813         /**
28814          * DIHs are currently committing the transition to
28815          * a new epoch.
28816          * Some TCs may have started committing transactions
28817          * in that epoch, so to ensure that all previously
28818          * committed transactions from the point of view of the
28819          * sender of this signal are included, we will use
28820          * the new epoch as the epoch after which to send the
28821          * CONF.
28822          */
28823         ptr.p->waitGCI = Uint32(m_micro_gcp.m_master.m_new_gci >> 32);
28824         DEB_NODE_STOP(("2: waitGCI = %u",
28825           Uint32(m_micro_gcp.m_master.m_new_gci >> 32)));
28826       }
28827 
28828       if (requestType == WaitGCPReq::CompleteForceStart)
28829       {
28830         jam();
28831         // Invalidating timestamps will force GCP_PREPARE/COMMIT
28832         // and GCP_SAVEREQ et al ASAP
28833         NdbTick_Invalidate(&m_micro_gcp.m_master.m_start_time);
28834         NdbTick_Invalidate(&m_gcp_save.m_master.m_start_time);
28835       }//if
28836 
28837       break;
28838     }
28839     case WaitGCPReq::ShutdownSync:
28840     {
28841       jam();
28842 
28843       const Uint32 requestingNode = refToNode(senderRef);
28844       ptr.p->waitGCI = WaitGCPMasterRecord::ShutdownSyncGci;
28845 
28846       ndbrequire(requestingNode <= MAX_DATA_NODE_ID);
28847       ndbrequire(!c_shutdownReqNodes.get(requestingNode));
28848       c_shutdownReqNodes.set(requestingNode);
28849 
28850       checkShutdownSync();
28851 
28852       break;
28853     }
28854     default:
28855       jamLine(requestType);
28856       ndbabort();
28857     }
28858 
28859     return;
28860   }
28861   else
28862   {
28863     /**
28864      * Proxy part
28865      */
28866     jam();
28867     WaitGCPProxyPtr ptr;
28868     if (c_waitGCPProxyList.seizeFirst(ptr) == false)
28869     {
28870       jam();
28871       errorCode = WaitGCPRef::NoWaitGCPRecords;
28872       goto error;
28873     }//if
28874     ptr.p->clientRef = senderRef;
28875     ptr.p->clientData = senderData;
28876     ptr.p->masterRef = cmasterdihref;
28877 
28878     req->senderData = ptr.i;
28879     req->senderRef = reference();
28880     req->requestType = requestType;
28881 
28882     if (requestType == WaitGCPReq::ShutdownSync)
28883     {
28884       jam();
28885       const Uint32 masterVersion = getNodeInfo(refToNode(cmasterdihref)).m_version;
28886       if (!ndbd_support_waitgcp_shutdownsync(masterVersion))
28887       {
28888         jam();
28889         req->requestType = WaitGCPReq::CompleteForceStart;
28890       }
28891     }
28892 
28893     sendSignal(cmasterdihref, GSN_WAIT_GCP_REQ, signal,
28894 	       WaitGCPReq::SignalLength, JBB);
28895     return;
28896   }//if
28897 
28898 error:
28899   ref->senderData = senderData;
28900   ref->errorCode = errorCode;
28901   sendSignal(senderRef, GSN_WAIT_GCP_REF, signal,
28902              WaitGCPRef::SignalLength, JBB);
28903 }//Dbdih::execWAIT_GCP_REQ()
28904 
execWAIT_GCP_REF(Signal * signal)28905 void Dbdih::execWAIT_GCP_REF(Signal* signal)
28906 {
28907   jamEntry();
28908   ndbrequire(!isMaster());
28909   WaitGCPRef* const ref = (WaitGCPRef*)&signal->theData[0];
28910 
28911   const Uint32 proxyPtr = ref->senderData;
28912   const Uint32 errorCode = ref->errorCode;
28913 
28914   WaitGCPProxyPtr ptr;
28915   ptr.i = proxyPtr;
28916   c_waitGCPProxyList.getPtr(ptr);
28917 
28918   ref->senderData = ptr.p->clientData;
28919   ref->errorCode = errorCode;
28920   sendSignal(ptr.p->clientRef, GSN_WAIT_GCP_REF, signal,
28921 	     WaitGCPRef::SignalLength, JBB);
28922 
28923   c_waitGCPProxyList.release(ptr);
28924 }//Dbdih::execWAIT_GCP_REF()
28925 
execWAIT_GCP_CONF(Signal * signal)28926 void Dbdih::execWAIT_GCP_CONF(Signal* signal)
28927 {
28928   jamEntry();
28929   ndbrequire(!isMaster());
28930   WaitGCPConf* const conf = (WaitGCPConf*)&signal->theData[0];
28931   const Uint32 proxyPtr = conf->senderData;
28932   const Uint32 gci_hi = conf->gci_hi;
28933   const Uint32 gci_lo = conf->gci_lo;
28934   WaitGCPProxyPtr ptr;
28935 
28936   ptr.i = proxyPtr;
28937   c_waitGCPProxyList.getPtr(ptr);
28938 
28939   conf->senderData = ptr.p->clientData;
28940   conf->gci_hi = gci_hi;
28941   conf->gci_lo = gci_lo;
28942   conf->blockStatus = cgcpOrderBlocked;
28943   sendSignal(ptr.p->clientRef, GSN_WAIT_GCP_CONF, signal,
28944 	     WaitGCPConf::SignalLength, JBB);
28945 
28946   c_waitGCPProxyList.release(ptr);
28947 }//Dbdih::execWAIT_GCP_CONF()
28948 
checkWaitGCPProxy(Signal * signal,NodeId failedNodeId)28949 void Dbdih::checkWaitGCPProxy(Signal* signal, NodeId failedNodeId)
28950 {
28951   jam();
28952   WaitGCPRef* const ref = (WaitGCPRef*)&signal->theData[0];
28953   ref->errorCode = WaitGCPRef::NF_CausedAbortOfProcedure;
28954 
28955   WaitGCPProxyPtr ptr;
28956   c_waitGCPProxyList.first(ptr);
28957   while(ptr.i != RNIL) {
28958     jam();
28959     const Uint32 i = ptr.i;
28960     const Uint32 clientData = ptr.p->clientData;
28961     const BlockReference clientRef = ptr.p->clientRef;
28962     const BlockReference masterRef = ptr.p->masterRef;
28963 
28964     c_waitGCPProxyList.next(ptr);
28965     if(refToNode(masterRef) == failedNodeId) {
28966       jam();
28967       c_waitGCPProxyList.release(i);
28968       ref->senderData = clientData;
28969       sendSignal(clientRef, GSN_WAIT_GCP_REF, signal,
28970 		 WaitGCPRef::SignalLength, JBB);
28971     }//if
28972   }//while
28973 }//Dbdih::checkWaitGCPProxy()
28974 
checkWaitGCPMaster(Signal * signal,NodeId failedNodeId)28975 void Dbdih::checkWaitGCPMaster(Signal* signal, NodeId failedNodeId)
28976 {
28977   jam();
28978   WaitGCPMasterPtr ptr;
28979   c_waitGCPMasterList.first(ptr);
28980 
28981   while (ptr.i != RNIL) {
28982     jam();
28983     const Uint32 i = ptr.i;
28984     const NodeId nodeId = refToNode(ptr.p->clientRef);
28985 
28986     c_waitGCPMasterList.next(ptr);
28987     if (nodeId == failedNodeId) {
28988       jam();
28989       c_waitGCPMasterList.release(i);
28990     }//if
28991   }//while
28992 
28993   /* Node failure might mean we are now shutdown sync ready */
28994   checkShutdownSync();
28995 }//Dbdih::checkWaitGCPMaster()
28996 
28997 /**
28998  * getNodeBitmap
28999  *
29000  * Function to set a bitmap/mask with a bit set for each
29001  * node currently in the given list [and with a version
29002  * passing the supplied version function test].
29003  *
29004  * e.g. cfirstAliveNode / cfirstDeadNode
29005  *
29006  */
getNodeBitmap(NdbNodeBitmask & map,Uint32 listHead,int (* versionFunction)(Uint32))29007 void Dbdih::getNodeBitmap(NdbNodeBitmask& map,
29008                           Uint32 listHead,
29009                           int (*versionFunction)(Uint32))
29010 {
29011   jam();
29012 
29013   map.clear();
29014 
29015   NodeRecordPtr nodePtr;
29016   nodePtr.i = listHead;
29017   do
29018   {
29019     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
29020     if (versionFunction != NULL)
29021     {
29022       if (versionFunction(getNodeInfo(nodePtr.i).m_version))
29023       {
29024         map.set(nodePtr.i);
29025       }
29026     }
29027     else
29028     {
29029       map.set(nodePtr.i);
29030     }
29031     nodePtr.i = nodePtr.p->nextNode;
29032   } while (nodePtr.i != RNIL);
29033 }
29034 
29035 /**
29036  * checkShutdownSync
29037  *
29038  * Called when a new shutdown sync request or node failure
29039  * occurs.
29040  * If all currently live nodes have requested shutdown sync
29041  * then we choose the next GCI, and update their queued requests
29042  * to complete on that GCI boundary
29043  */
void Dbdih::checkShutdownSync()
{
  jam();

  if (likely(c_shutdownReqNodes.isclear()))
  {
    /* Nothing happening */
    return;
  }

  /* Get bitmap of current live nodes supporting shutdownSync */
  NdbNodeBitmask allDataNodes;
  getNodeBitmap(allDataNodes,
                cfirstAliveNode,
                &ndbd_support_waitgcp_shutdownsync);

  if (c_shutdownReqNodes.contains(allDataNodes))
  {
    /**
     * Now we have all nodes waiting for a GCI
     * boundary, lets choose the next boundary, as is done
     * for WaitGCPReq::CompleteForceStart to ensure that any
     * committed transactions are durable.
     */
    Uint32 safeGCI = Uint32(m_micro_gcp.m_current_gci >> 32);
    if (m_micro_gcp.m_master.m_state == MicroGcp::M_GCP_COMMIT)
    {
      /* A GCP commit is in flight: the new GCI is the first boundary
       * that can cover all currently committed transactions. */
      safeGCI = Uint32(m_micro_gcp.m_master.m_new_gci >> 32);
    }

    g_eventLogger->info("Cluster shutdown durable gci : %u", safeGCI);

    /* Retarget every queued shutdown-sync waiter (marked with the
     * ShutdownSyncGci placeholder) at the chosen concrete GCI. */
    WaitGCPMasterPtr ptr;
    c_waitGCPMasterList.first(ptr);
    while(ptr.i != RNIL) {
      jam();

      if (ptr.p->waitGCI == WaitGCPMasterRecord::ShutdownSyncGci)
      {
        jam();
        ptr.p->waitGCI = safeGCI;
      }
      c_waitGCPMasterList.next(ptr);
    }

    // Invalidating timestamps will force GCP_PREPARE/COMMIT
    // and GCP_SAVEREQ et al ASAP
    NdbTick_Invalidate(&m_micro_gcp.m_master.m_start_time);
    NdbTick_Invalidate(&m_gcp_save.m_master.m_start_time);

    /* All requests are now queued on the GCI boundary; the pending
     * request set is consumed. */
    c_shutdownReqNodes.clear();
  }
}
29097 
29098 
emptyWaitGCPMasterQueue(Signal * signal,Uint64 gci,WaitGCPList & list)29099 void Dbdih::emptyWaitGCPMasterQueue(Signal* signal,
29100                                     Uint64 gci,
29101                                     WaitGCPList & list)
29102 {
29103   jam();
29104   WaitGCPConf* const conf = (WaitGCPConf*)&signal->theData[0];
29105   conf->gci_hi = Uint32(gci >> 32);
29106   conf->gci_lo = Uint32(gci);
29107 
29108   WaitGCPMasterPtr ptr;
29109   list.first(ptr);
29110   while(ptr.i != RNIL) {
29111     jam();
29112     const Uint32 i = ptr.i;
29113     const Uint32 clientData = ptr.p->clientData;
29114     const BlockReference clientRef = ptr.p->clientRef;
29115     const Uint32 waitGCI = ptr.p->waitGCI;
29116 
29117     list.next(ptr);
29118 
29119     if (waitGCI != 0)
29120     {
29121       jam();
29122       /* Waiting for a specific GCI */
29123       const Uint64 completedGci = (gci >> 32);
29124       ndbrequire(completedGci <= waitGCI)
29125 
29126       if (completedGci < waitGCI)
29127       {
29128         jam();
29129         /* Keep waiting */
29130         continue;
29131       }
29132     }
29133 
29134     conf->senderData = clientData;
29135     conf->blockStatus = cgcpOrderBlocked;
29136     sendSignal(clientRef, GSN_WAIT_GCP_CONF, signal,
29137                WaitGCPConf::SignalLength, JBB);
29138 
29139     list.release(i);
29140   }//while
29141 }//Dbdih::emptyWaitGCPMasterQueue()
29142 
setNodeStatus(Uint32 nodeId,NodeRecord::NodeStatus newStatus)29143 void Dbdih::setNodeStatus(Uint32 nodeId, NodeRecord::NodeStatus newStatus)
29144 {
29145   NodeRecordPtr nodePtr;
29146   nodePtr.i = nodeId;
29147   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
29148   nodePtr.p->nodeStatus = newStatus;
29149 }//Dbdih::setNodeStatus()
29150 
getNodeStatus(Uint32 nodeId)29151 Dbdih::NodeRecord::NodeStatus Dbdih::getNodeStatus(Uint32 nodeId)
29152 {
29153   NodeRecordPtr nodePtr;
29154   nodePtr.i = nodeId;
29155   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
29156   return nodePtr.p->nodeStatus;
29157 }//Dbdih::getNodeStatus()
29158 
29159 Sysfile::ActiveStatus
getNodeActiveStatus(Uint32 nodeId)29160 Dbdih::getNodeActiveStatus(Uint32 nodeId)
29161 {
29162   NodeRecordPtr nodePtr;
29163   nodePtr.i = nodeId;
29164   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
29165   return nodePtr.p->activeStatus;
29166 }//Dbdih::getNodeActiveStatus()
29167 
29168 
29169 void
setNodeActiveStatus(Uint32 nodeId,Sysfile::ActiveStatus newStatus)29170 Dbdih::setNodeActiveStatus(Uint32 nodeId, Sysfile::ActiveStatus newStatus)
29171 {
29172   NodeRecordPtr nodePtr;
29173   nodePtr.i = nodeId;
29174   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
29175   nodePtr.p->activeStatus = newStatus;
29176 }//Dbdih::setNodeActiveStatus()
29177 
setAllowNodeStart(Uint32 nodeId,bool newState)29178 void Dbdih::setAllowNodeStart(Uint32 nodeId, bool newState)
29179 {
29180   NodeRecordPtr nodePtr;
29181   nodePtr.i = nodeId;
29182   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
29183   nodePtr.p->allowNodeStart = newState;
29184 }//Dbdih::setAllowNodeStart()
29185 
getAllowNodeStart(Uint32 nodeId)29186 bool Dbdih::getAllowNodeStart(Uint32 nodeId)
29187 {
29188   NodeRecordPtr nodePtr;
29189   nodePtr.i = nodeId;
29190   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
29191   return nodePtr.p->allowNodeStart;
29192 }//Dbdih::getAllowNodeStart()
29193 
29194 Uint32
getNodeGroup(Uint32 nodeId) const29195 Dbdih::getNodeGroup(Uint32 nodeId) const
29196 {
29197   NodeRecordPtr nodePtr;
29198   nodePtr.i = nodeId;
29199   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
29200   return nodePtr.p->nodeGroup;
29201 }
29202 
checkNodeAlive(Uint32 nodeId)29203 bool Dbdih::checkNodeAlive(Uint32 nodeId)
29204 {
29205   NodeRecordPtr nodePtr;
29206   nodePtr.i = nodeId;
29207   ndbrequire(nodeId > 0);
29208   ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
29209   if (nodePtr.p->nodeStatus != NodeRecord::ALIVE) {
29210     return false;
29211   } else {
29212     return true;
29213   }//if
29214 }//Dbdih::checkNodeAlive()
29215 
isMaster()29216 bool Dbdih::isMaster()
29217 {
29218   return (reference() == cmasterdihref);
29219 }//Dbdih::isMaster()
29220 
isActiveMaster()29221 bool Dbdih::isActiveMaster()
29222 {
29223   return ((reference() == cmasterdihref) && (cmasterState == MASTER_ACTIVE));
29224 }//Dbdih::isActiveMaster()
29225 
initNodeRecord(NodeRecordPtr nodePtr)29226 void Dbdih::initNodeRecord(NodeRecordPtr nodePtr)
29227 {
29228   DEB_LCP(("initNodeRecord(%u)", nodePtr.i));
29229   nodePtr.p->m_nodefailSteps.clear();
29230 
29231   nodePtr.p->activeStatus = Sysfile::NS_NotDefined;
29232   nodePtr.p->recNODE_FAILREP = ZFALSE;
29233   nodePtr.p->dbtcFailCompleted = ZTRUE;
29234   nodePtr.p->dbdictFailCompleted = ZTRUE;
29235   nodePtr.p->dbdihFailCompleted = ZTRUE;
29236   nodePtr.p->dblqhFailCompleted = ZTRUE;
29237   nodePtr.p->noOfStartedChkpt = 0;
29238   nodePtr.p->noOfQueuedChkpt = 0;
29239   nodePtr.p->lcpStateAtTakeOver = (MasterLCPConf::State)255;
29240 
29241   nodePtr.p->activeTabptr = RNIL;
29242   nodePtr.p->nodeStatus = NodeRecord::NOT_IN_CLUSTER;
29243   nodePtr.p->useInTransactions = false;
29244   nodePtr.p->copyCompleted = 0;
29245   nodePtr.p->allowNodeStart = true;
29246 }
29247 // DICT lock slave
29248 
29249 void
sendDictLockReq(Signal * signal,Uint32 lockType,Callback c)29250 Dbdih::sendDictLockReq(Signal* signal, Uint32 lockType, Callback c)
29251 {
29252   DictLockReq* req = (DictLockReq*)&signal->theData[0];
29253   DictLockSlavePtr lockPtr;
29254 
29255   c_dictLockSlavePool.seize(lockPtr);
29256   ndbrequire(lockPtr.i != RNIL);
29257 
29258   req->userPtr = lockPtr.i;
29259   req->lockType = lockType;
29260   req->userRef = reference();
29261 
29262   lockPtr.p->lockPtr = RNIL;
29263   lockPtr.p->lockType = lockType;
29264   lockPtr.p->locked = false;
29265   lockPtr.p->callback = c;
29266 
29267   BlockReference dictMasterRef = calcDictBlockRef(cmasterNodeId);
29268   sendSignal(dictMasterRef, GSN_DICT_LOCK_REQ, signal,
29269       DictLockReq::SignalLength, JBB);
29270 }
29271 
void
Dbdih::execDICT_LOCK_CONF(Signal* signal)
{
  jamEntry();
  /* DICT lock granted by the master: continue via the stored callback */
  recvDictLockConf(signal);
}
29278 
void
Dbdih::execDICT_LOCK_REF(Signal* signal)
{
  jamEntry();
  /* DIH's DICT lock requests are never expected to be refused */
  ndbabort();
}
29285 
29286 void
recvDictLockConf(Signal * signal)29287 Dbdih::recvDictLockConf(Signal* signal)
29288 {
29289   const DictLockConf* conf = (const DictLockConf*)&signal->theData[0];
29290 
29291   DictLockSlavePtr lockPtr;
29292   c_dictLockSlavePool.getPtr(lockPtr, conf->userPtr);
29293 
29294   lockPtr.p->lockPtr = conf->lockPtr;
29295   ndbrequire(lockPtr.p->lockType == conf->lockType);
29296   ndbrequire(lockPtr.p->locked == false);
29297   lockPtr.p->locked = true;
29298 
29299   lockPtr.p->callback.m_callbackData = lockPtr.i;
29300   execute(signal, lockPtr.p->callback, 0);
29301 }
29302 
29303 void
sendDictUnlockOrd(Signal * signal,Uint32 lockSlavePtrI)29304 Dbdih::sendDictUnlockOrd(Signal* signal, Uint32 lockSlavePtrI)
29305 {
29306   DictUnlockOrd* ord = (DictUnlockOrd*)&signal->theData[0];
29307 
29308   DictLockSlavePtr lockPtr;
29309   c_dictLockSlavePool.getPtr(lockPtr, lockSlavePtrI);
29310 
29311   ord->lockPtr = lockPtr.p->lockPtr;
29312   ord->lockType = lockPtr.p->lockType;
29313   ord->senderData = lockPtr.i;
29314   ord->senderRef = reference();
29315 
29316   c_dictLockSlavePool.release(lockPtr);
29317 
29318   Uint32 len = DictUnlockOrd::SignalLength;
29319   if (unlikely(getNodeInfo(cmasterNodeId).m_version < NDB_MAKE_VERSION(6,3,0)))
29320   {
29321     jam();
29322     len = 2;
29323   }
29324 
29325   BlockReference dictMasterRef = calcDictBlockRef(cmasterNodeId);
29326   sendSignal(dictMasterRef, GSN_DICT_UNLOCK_ORD, signal, len, JBB);
29327 }
29328 
29329 #ifdef ERROR_INSERT
29330 void
sendToRandomNodes(const char * msg,Signal * signal,SignalCounter * counter,SendFunction fun,Uint32 extra,Uint32 block,Uint32 gsn,Uint32 len,JobBufferLevel level)29331 Dbdih::sendToRandomNodes(const char * msg,
29332                          Signal* signal,
29333                          SignalCounter* counter,
29334                          SendFunction fun,
29335                          Uint32 extra,
29336                          Uint32 block,
29337                          Uint32 gsn,
29338                          Uint32 len,
29339                          JobBufferLevel level)
29340 {
29341 
29342   if (counter)
29343     counter->clearWaitingFor();
29344 
29345   Vector<Uint32> nodes;
29346   NodeRecordPtr nodePtr;
29347   nodePtr.i = cfirstAliveNode;
29348   do {
29349     jam();
29350     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
29351     if (nodePtr.i != getOwnNodeId())
29352     {
29353       nodes.push_back(nodePtr.i);
29354     }
29355     nodePtr.i = nodePtr.p->nextNode;
29356   } while (nodePtr.i != RNIL);
29357 
29358 
29359   NdbNodeBitmask masked;
29360   Uint32 cnt = nodes.size();
29361   if (cnt <= 1)
29362   {
29363     goto do_send;
29364   }
29365 
29366   {
29367     Uint32 remove = (rand() % cnt);
29368     if (remove == 0)
29369       remove = 1;
29370 
29371     for (Uint32 i = 0; i<remove; i++)
29372     {
29373       Uint32 rand_node = rand() % nodes.size();
29374       masked.set(nodes[rand_node]);
29375       nodes.erase(rand_node);
29376     }
29377   }
29378 
29379 do_send:
29380   char bufpos = 0;
29381   char buf[256];
29382 
29383   nodePtr.i = cfirstAliveNode;
29384   do {
29385     jam();
29386     ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
29387     if (counter)
29388       counter->setWaitingFor(nodePtr.i);
29389     if (!masked.get(nodePtr.i))
29390     {
29391       if (fun)
29392       {
29393         (this->*fun)(signal, nodePtr.i, extra);
29394       }
29395       else
29396       {
29397         Uint32 ref = numberToRef(block, nodePtr.i);
29398         sendSignal(ref, gsn, signal, len, level);
29399       }
29400       BaseString::snprintf(buf+bufpos, sizeof(buf)-bufpos, "%u ", nodePtr.i);
29401     }
29402     else
29403     {
29404       BaseString::snprintf(buf+bufpos, sizeof(buf)-bufpos, "[%u] ", nodePtr.i);
29405     }
29406     bufpos = strlen(buf);
29407     nodePtr.i = nodePtr.p->nextNode;
29408   } while (nodePtr.i != RNIL);
29409   infoEvent("%s %s", msg, buf);
29410 }
29411 
29412 #endif
29413 
29414 // MT LQH
29415 
29416 Uint32
dihGetInstanceKey(Uint32 tabId,Uint32 fragId)29417 Dbdih::dihGetInstanceKey(Uint32 tabId, Uint32 fragId)
29418 {
29419   TabRecordPtr tTabPtr;
29420   tTabPtr.i = tabId;
29421   ptrCheckGuard(tTabPtr, ctabFileSize, tabRecord);
29422   FragmentstorePtr tFragPtr;
29423 loop:
29424   Uint32 tab_val = tTabPtr.p->m_lock.read_lock();
29425   getFragstore(tTabPtr.p, fragId, tFragPtr);
29426   Uint32 instanceKey = dihGetInstanceKey(tFragPtr);
29427   if (unlikely(!tTabPtr.p->m_lock.read_unlock(tab_val)))
29428     goto loop;
29429   return instanceKey;
29430 }
29431 
29432 Uint32
dihGetInstanceKeyCanFail(Uint32 tabId,Uint32 fragId)29433 Dbdih::dihGetInstanceKeyCanFail(Uint32 tabId, Uint32 fragId)
29434 {
29435   TabRecordPtr tTabPtr;
29436   tTabPtr.i = tabId;
29437   Uint32 instanceKey;
29438   if (tabId >= ctabFileSize)
29439   {
29440     return Uint32(RNIL);
29441   }
29442   ptrAss(tTabPtr, tabRecord);
29443   if (fragId >= tTabPtr.p->totalfragments)
29444   {
29445     return Uint32(RNIL);
29446   }
29447   FragmentstorePtr tFragPtr;
29448   Uint32 tab_val;
29449   do
29450   {
29451     tab_val = tTabPtr.p->m_lock.read_lock();
29452     getFragstoreCanFail(tTabPtr.p, fragId, tFragPtr);
29453     if (tFragPtr.p == NULL)
29454     {
29455       instanceKey = Uint32(RNIL);
29456     }
29457     else
29458     {
29459       instanceKey = dihGetInstanceKey(tFragPtr);
29460     }
29461   } while ((unlikely(!tTabPtr.p->m_lock.read_unlock(tab_val))));
29462   return instanceKey;
29463 }
29464 
29465 /**
29466  *
29467  */
29468 void
execCREATE_NODEGROUP_IMPL_REQ(Signal * signal)29469 Dbdih::execCREATE_NODEGROUP_IMPL_REQ(Signal* signal)
29470 {
29471   jamEntry();
29472   CreateNodegroupImplReq reqCopy = *(CreateNodegroupImplReq*)signal->getDataPtr();
29473   CreateNodegroupImplReq *req = &reqCopy;
29474 
29475   Uint32 err = 0;
29476   Uint32 rt = req->requestType;
29477   Uint64 gci = 0;
29478   switch(rt){
29479   case CreateNodegroupImplReq::RT_ABORT:
29480     jam(); // do nothing
29481     break;
29482   case CreateNodegroupImplReq::RT_PARSE:
29483   case CreateNodegroupImplReq::RT_PREPARE:
29484   case CreateNodegroupImplReq::RT_COMMIT:
29485   {
29486     Uint32 cnt = 0;
29487     for (Uint32 i = 0; i<NDB_ARRAY_SIZE(req->nodes) && req->nodes[i] ; i++)
29488     {
29489       cnt++;
29490       if(req->nodes[i] >= MAX_NDB_NODES)
29491       {
29492         err = CreateNodegroupRef::NodeNotDefined;
29493         goto error;
29494       }
29495       if (getNodeActiveStatus(req->nodes[i]) != Sysfile::NS_Configured)
29496       {
29497         jam();
29498         err = CreateNodegroupRef::NodeAlreadyInNodegroup;
29499         goto error;
29500       }
29501     }
29502 
29503     if (cnt != cnoReplicas)
29504     {
29505       jam();
29506       err = CreateNodegroupRef::InvalidNoOfNodesInNodegroup;
29507       goto error;
29508     }
29509 
29510     Uint32 ng = req->nodegroupId;
29511     NdbNodeBitmask tmp;
29512     tmp.set();
29513     for (Uint32 i = 0; i<cnoOfNodeGroups; i++)
29514     {
29515       ndbrequire(c_node_groups[i] < MAX_NDB_NODE_GROUPS);
29516       tmp.clear(c_node_groups[i]);
29517     }
29518 
29519     if (ng == RNIL && rt == CreateNodegroupImplReq::RT_PARSE)
29520     {
29521       jam();
29522       ng = tmp.find(0);
29523     }
29524 
29525     if (ng > MAX_NDB_NODE_GROUPS)
29526     {
29527       jam();
29528       err = CreateNodegroupRef::InvalidNodegroupId;
29529       goto error;
29530     }
29531 
29532     if (tmp.get(ng) == false)
29533     {
29534       jam();
29535       err = CreateNodegroupRef::NodegroupInUse;
29536       goto error;
29537     }
29538 
29539     if (rt == CreateNodegroupImplReq::RT_PARSE || rt == CreateNodegroupImplReq::RT_PREPARE)
29540     {
29541       /**
29542        * Check that atleast one of the nodes are alive
29543        */
29544       bool alive = false;
29545       for (Uint32 i = 0; i<cnoReplicas; i++)
29546       {
29547         jam();
29548         Uint32 nodeId = req->nodes[i];
29549         if (getNodeStatus(nodeId) == NodeRecord::ALIVE)
29550         {
29551           jam();
29552           alive = true;
29553           break;
29554         }
29555       }
29556 
29557       jam();
29558       if (alive == false)
29559       {
29560         jam();
29561         err = CreateNodegroupRef::NoNodeAlive;
29562         goto error;
29563       }
29564     }
29565 
29566     if (rt == CreateNodegroupImplReq::RT_PARSE)
29567     {
29568       jam();
29569       signal->theData[0] = 0;
29570       signal->theData[1] = ng;
29571       return;
29572     }
29573 
29574     if (rt == CreateNodegroupImplReq::RT_PREPARE)
29575     {
29576       jam(); // do nothing
29577       break;
29578     }
29579 
29580     ndbrequire(rt == CreateNodegroupImplReq::RT_COMMIT);
29581     bool our_node_in_new_nodegroup = false;
29582     for (Uint32 i = 0; i<cnoReplicas; i++)
29583     {
29584       Uint32 nodeId = req->nodes[i];
29585       Sysfile::setNodeGroup(nodeId, SYSFILE->nodeGroups, req->nodegroupId);
29586       if (getNodeStatus(nodeId) == NodeRecord::ALIVE)
29587       {
29588         jam();
29589         Sysfile::setNodeStatus(nodeId, SYSFILE->nodeStatus, Sysfile::NS_Active);
29590         if (nodeId == getOwnNodeId())
29591         {
29592           jam();
29593           our_node_in_new_nodegroup = true;
29594         }
29595       }
29596       else
29597       {
29598         jam();
29599         Sysfile::setNodeStatus(nodeId, SYSFILE->nodeStatus, Sysfile::NS_ActiveMissed_1);
29600       }
29601       setNodeActiveStatus();
29602       setNodeGroups();
29603     }
29604     if (our_node_in_new_nodegroup)
29605     {
29606       jam();
29607       /**
29608        * We are part of a newly created node group. Thus it is now time to
29609        * setup multi socket transporter to communicate with other nodes in
29610        * the new node group.
29611        */
29612       DEB_MULTI_TRP(("Set up multi transporter after Create nodegroup"));
29613       m_set_up_multi_trp_in_node_restart = false;
29614       signal->theData[0] = reference();
29615       sendSignal(QMGR_REF, GSN_SET_UP_MULTI_TRP_REQ, signal, 1, JBB);
29616     }
29617     break;
29618   }
29619   case CreateNodegroupImplReq::RT_COMPLETE:
29620     jam();
29621     gci = m_micro_gcp.m_current_gci;
29622     break;
29623   }
29624 
29625   {
29626     CreateNodegroupImplConf* conf = (CreateNodegroupImplConf*)signal->getDataPtrSend();
29627     conf->senderRef = reference();
29628     conf->senderData = req->senderData;
29629     conf->gci_hi = Uint32(gci >> 32);
29630     conf->gci_lo = Uint32(gci);
29631     sendSignal(req->senderRef, GSN_CREATE_NODEGROUP_IMPL_CONF, signal,
29632                CreateNodegroupImplConf::SignalLength, JBB);
29633   }
29634   return;
29635 
29636 error:
29637   if (rt == CreateNodegroupImplReq::RT_PARSE)
29638   {
29639     jam();
29640     signal->theData[0] = err;
29641     return;
29642   }
29643 
29644   if (rt == CreateNodegroupImplReq::RT_PREPARE)
29645   {
29646     jam();
29647     CreateNodegroupImplRef * ref = (CreateNodegroupImplRef*)signal->getDataPtrSend();
29648     ref->senderRef = reference();
29649     ref->senderData = req->senderData;
29650     ref->errorCode = err;
29651     sendSignal(req->senderRef, GSN_CREATE_NODEGROUP_IMPL_REF, signal,
29652                CreateNodegroupImplRef::SignalLength, JBB);
29653     return;
29654   }
29655 
29656   jamLine(err);
29657   ndbabort();
29658 }
29659 
29660 /**
29661  *
29662  */
void
Dbdih::execDROP_NODEGROUP_IMPL_REQ(Signal* signal)
{
  jamEntry();
  DropNodegroupImplReq reqCopy = *(DropNodegroupImplReq*)signal->getDataPtr();
  DropNodegroupImplReq *req = &reqCopy;

  NodeGroupRecordPtr NGPtr;

  Uint32 err = 0;
  Uint32 rt = req->requestType;
  Uint64 gci = 0;
  switch(rt){
  case DropNodegroupImplReq::RT_ABORT:
    jam(); // do nothing
    break;
  case DropNodegroupImplReq::RT_PARSE:
  case DropNodegroupImplReq::RT_PREPARE:
    /* Validate that the nodegroup exists and is not in use */
    jam();
    NGPtr.i = req->nodegroupId;
    if (NGPtr.i >= MAX_NDB_NODE_GROUPS)
    {
      jam();
      err = DropNodegroupRef::NoSuchNodegroup;
      goto error;
    }
    ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);

    if (NGPtr.p->nodegroupIndex == RNIL)
    {
      jam();
      /* Nodegroup id in range but not currently defined */
      err = DropNodegroupRef::NoSuchNodegroup;
      goto error;
    }

    if (NGPtr.p->m_ref_count)
    {
      jam();
      /* Still referenced (e.g. by tables with fragments here) */
      err = DropNodegroupRef::NodegroupInUse;
      goto error;
    }
    break;
  case DropNodegroupImplReq::RT_COMMIT:
  {
    jam();
    /* Report the GCI at which the drop takes effect */
    gci = m_micro_gcp.m_current_gci;
    break;
  }
  case DropNodegroupImplReq::RT_COMPLETE:
  {
    /* Detach all member nodes: back to NO_NODE_GROUP_ID and the
     * configured-but-unassigned Sysfile state. */
    NGPtr.i = req->nodegroupId;
    ptrCheckGuard(NGPtr, MAX_NDB_NODE_GROUPS, nodeGroupRecord);
    for (Uint32 i = 0; i<NGPtr.p->nodeCount; i++)
    {
      jam();
      Uint32 nodeId = NGPtr.p->nodesInGroup[i];
      Sysfile::setNodeGroup(nodeId, SYSFILE->nodeGroups, NO_NODE_GROUP_ID);
      Sysfile::setNodeStatus(nodeId, SYSFILE->nodeStatus, Sysfile::NS_Configured);
    }
    /* Rebuild cached node state from the updated Sysfile */
    setNodeActiveStatus();
    setNodeGroups();
    break;
  }
  }

  {
    DropNodegroupImplConf* conf = (DropNodegroupImplConf*)signal->getDataPtrSend();
    conf->senderRef = reference();
    conf->senderData = req->senderData;
    conf->gci_hi = Uint32(gci >> 32);
    conf->gci_lo = Uint32(gci);
    sendSignal(req->senderRef, GSN_DROP_NODEGROUP_IMPL_CONF, signal,
               DropNodegroupImplConf::SignalLength, JBB);
  }
  return;

error:
  /* Only reached from PARSE/PREPARE validation above */
  DropNodegroupImplRef * ref = (DropNodegroupImplRef*)signal->getDataPtrSend();
  ref->senderRef = reference();
  ref->senderData = req->senderData;
  ref->errorCode = err;
  sendSignal(req->senderRef, GSN_DROP_NODEGROUP_IMPL_REF, signal,
             DropNodegroupImplRef::SignalLength, JBB);
}
29747 
29748 Uint32
getMinVersion() const29749 Dbdih::getMinVersion() const
29750 {
29751   Uint32 ver = getNodeInfo(getOwnNodeId()).m_version;
29752   NodeRecordPtr specNodePtr;
29753   specNodePtr.i = cfirstAliveNode;
29754   do
29755   {
29756     jam();
29757     ptrCheckGuard(specNodePtr, MAX_NDB_NODES, nodeRecord);
29758     Uint32 v = getNodeInfo(specNodePtr.i).m_version;
29759     if (v < ver)
29760     {
29761       jam();
29762       ver = v;
29763     }
29764     specNodePtr.i = specNodePtr.p->nextNode;
29765   } while (specNodePtr.i != RNIL);
29766 
29767   return ver;
29768 }
29769 
29770 Uint8
getMaxStartedFragCheckpointsForNode(Uint32 nodeId) const29771 Dbdih::getMaxStartedFragCheckpointsForNode(Uint32 nodeId) const
29772 {
29773   return MAX_STARTED_FRAG_CHECKPOINTS_PER_NODE;
29774 }
29775 
29776 
29777 /**
29778  * isolateNodes
29779  *
29780  * Get all live nodes to disconnect the set of victims
29781  * in minDelayMillis.
29782  *
29783  * The signals are sent to live nodes immediately, and
29784  * those nodes perform the delay, to reduce the chance
29785  * of lag on this node causing problems
29786  */
void
Dbdih::isolateNodes(Signal* signal,
                    Uint32 delayMillis,
                    const NdbNodeBitmask& victims)
{
  jam();

  IsolateOrd* ord = (IsolateOrd*) signal->theData;

  ord->senderRef          = reference();
  ord->isolateStep        = IsolateOrd::IS_REQ;
  ord->delayMillis        = delayMillis;

  /* Ship the victim bitmask as a linear section alongside the
   * fixed-size signal data */
  victims.copyto(NdbNodeBitmask::Size, ord->nodesToIsolate);
  LinearSectionPtr lsptr[3];
  lsptr[0].p = ord->nodesToIsolate;
  lsptr[0].sz = NdbNodeBitmask::Size;
  /* QMGR handles this */
  sendSignal(QMGR_REF,
             GSN_ISOLATE_ORD,
             signal,
             IsolateOrd::SignalLength,
             JBA,
             lsptr,
             1);
}
29813