/*
   Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License, version 2.0,
   as published by the Free Software Foundation.

   This program is also distributed with certain software (including
   but not limited to OpenSSL) that is licensed under separate terms,
   as designated in a particular file or component or in included license
   documentation.  The authors of MySQL hereby grant you an additional
   permission to link the program and your derivative works with the
   separately licensed software that they have included with MySQL.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License, version 2.0, for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA
*/

#include "Backup.hpp"

#include <ndb_version.h>

#include <NdbTCP.h>
#include <Bitmask.hpp>

#include <signaldata/NodeFailRep.hpp>
#include <signaldata/ReadNodesConf.hpp>

#include <signaldata/DihScanTab.hpp>
#include <signaldata/DiGetNodes.hpp>
#include <signaldata/ScanFrag.hpp>

#include <signaldata/GetTabInfo.hpp>
#include <signaldata/DictTabInfo.hpp>
#include <signaldata/ListTables.hpp>

#include <signaldata/FsOpenReq.hpp>
#include <signaldata/FsAppendReq.hpp>
#include <signaldata/FsCloseReq.hpp>
#include <signaldata/FsConf.hpp>
#include <signaldata/FsRef.hpp>
#include <signaldata/FsRemoveReq.hpp>
#include <signaldata/FsReadWriteReq.hpp>

#include <signaldata/BackupImpl.hpp>
#include <signaldata/BackupSignalData.hpp>
#include <signaldata/BackupContinueB.hpp>
#include <signaldata/EventReport.hpp>

#include <signaldata/UtilSequence.hpp>

#include <signaldata/CreateTrigImpl.hpp>
#include <signaldata/DropTrigImpl.hpp>
#include <signaldata/FireTrigOrd.hpp>
#include <signaldata/TrigAttrInfo.hpp>
#include <AttributeHeader.hpp>

#include <signaldata/WaitGCP.hpp>
#include <signaldata/LCP.hpp>
#include <signaldata/BackupLockTab.hpp>
#include <signaldata/DumpStateOrd.hpp>

#include <signaldata/DbinfoScan.hpp>
#include <signaldata/TransIdAI.hpp>

#include <NdbTick.h>
#include <dbtup/Dbtup.hpp>

#include <EventLogger.hpp>
extern EventLogger * g_eventLogger;

#include <math.h>

#define JAM_FILE_ID 475

static const Uint32 WaitDiskBufferCapacityMillis = 1;
static const Uint32 WaitScanTempErrorRetryMillis = 10;

static NDB_TICKS startTime;

#if (defined(VM_TRACE) || defined(ERROR_INSERT))
//#define DEBUG_LCP 1
//#define DEBUG_LCP_ROW 1
//#define DEBUG_LCP_DEL_FILES 1
//#define DEBUG_LCP_DEL 1
//#define DEBUG_EXTRA_LCP 1
//#define DEBUG_REDO_CONTROL 1
//#define DEBUG_REDO_CONTROL_DETAIL 1
//#define DEBUG_LCP_DD 1
//#define DEBUG_LCP_STAT 1
//#define DEBUG_LCP_LAG 1
#endif

#ifdef DEBUG_REDO_CONTROL
#define DEB_REDO_CONTROL(arglist) do { g_eventLogger->info arglist ; } while (0)
#else
#define DEB_REDO_CONTROL(arglist) do { } while (0)
#endif

#ifdef DEBUG_REDO_CONTROL_DETAIL
#define DEB_REDO_CONTROL_DETAIL(arglist) do { g_eventLogger->info arglist ; } while (0)
#else
#define DEB_REDO_CONTROL_DETAIL(arglist) do { } while (0)
#endif

#ifdef DEBUG_LCP
#define DEB_LCP(arglist) do { g_eventLogger->info arglist ; } while (0)
#else
#define DEB_LCP(arglist) do { } while (0)
#endif

#ifdef DEBUG_LCP_DD
#define DEB_LCP_DD(arglist) do { g_eventLogger->info arglist ; } while (0)
#else
#define DEB_LCP_DD(arglist) do { } while (0)
#endif

#ifdef DEBUG_LCP_DEL_FILES
#define DEB_LCP_DEL_FILES(arglist) do { g_eventLogger->info arglist ; } while (0)
#else
#define DEB_LCP_DEL_FILES(arglist) do { } while (0)
#endif

#ifdef DEBUG_LCP_DEL
#define DEB_LCP_DEL(arglist) do { g_eventLogger->info arglist ; } while (0)
#else
#define DEB_LCP_DEL(arglist) do { } while (0)
#endif

#ifdef DEBUG_LCP_STAT
#define DEB_LCP_STAT(arglist) do { g_eventLogger->info arglist ; } while (0)
#else
#define DEB_LCP_STAT(arglist) do { } while (0)
#endif

#ifdef DEBUG_LCP_LAG
#define DEB_LCP_LAG(arglist) do { g_eventLogger->info arglist ; } while (0)
#else
#define DEB_LCP_LAG(arglist) do { } while (0)
#endif

#ifdef DEBUG_EXTRA_LCP
#define DEB_EXTRA_LCP(arglist) do { g_eventLogger->info arglist ; } while (0)
#else
#define DEB_EXTRA_LCP(arglist) do { } while (0)
#endif

#ifdef VM_TRACE
#define DEBUG_OUT(x) ndbout << x << endl
#else
#define DEBUG_OUT(x)
#endif

//#define DEBUG_ABORT
//#define dbg globalSignalLoggers.log

static Uint32 g_TypeOfStart = NodeState::ST_ILLEGAL_TYPE;

#define SEND_BACKUP_STARTED_FLAG(A) (((A) & BackupReq::WAITCOMPLETED) > 0)
#define SEND_BACKUP_COMPLETED_FLAG(A) (((A) & BackupReq::WAITCOMPLETED) > 1)
#define MT_BACKUP_FLAG(A) (((A) & BackupReq::MT_BACKUP) > 0)

/**
 * "Magic" constants used for the adaptive LCP speed algorithm. These
 * constants try to ensure a smooth LCP load that is high enough to keep
 * LCPs from falling so far behind that we run out of REDO log, yet low
 * enough that LCPs do not consume so much CPU that most user
 * transactions are blocked. We also want to avoid destroying real-time
 * characteristics due to LCPs.
 *
 * See the much longer explanation of these values below.
 */
#define HIGH_LOAD_LEVEL 32
#define VERY_HIGH_LOAD_LEVEL 48
#define NUMBER_OF_SIGNALS_PER_SCAN_BATCH 3
#define MAX_RAISE_PRIO_MEMORY 16

void
Backup::execSTTOR(Signal* signal)
{
  jamEntry();

  const Uint32 startphase  = signal->theData[1];
  const Uint32 typeOfStart = signal->theData[7];

  if (startphase == 1)
  {
    ndbrequire((c_lqh = (Dblqh*)globalData.getBlock(DBLQH, instance())) != 0);
    ndbrequire((c_tup = (Dbtup*)globalData.getBlock(DBTUP, instance())) != 0);
    ndbrequire((c_lgman =
                (Lgman*)globalData.getBlock(LGMAN, instance())) != 0);
    ndbrequire((c_pgman =
                (Pgman*)globalData.getBlock(PGMAN, instance())) != 0);

    m_words_written_this_period = 0;
    m_backup_words_written_this_period = 0;
    last_disk_write_speed_report = 0;
    next_disk_write_speed_report = 0;
    m_monitor_words_written = 0;
    m_backup_monitor_words_written = 0;
    m_periods_passed_in_monitor_period = 0;
    m_monitor_snapshot_start = NdbTick_getCurrentTicks();
    m_curr_lcp_id = 0;
    m_curr_disk_write_speed = c_defaults.m_disk_write_speed_max_own_restart;
    m_curr_backup_disk_write_speed =
      c_defaults.m_disk_write_speed_max_own_restart;
    m_overflow_disk_write = 0;
    m_backup_overflow_disk_write = 0;
    slowdowns_due_to_io_lag = 0;
    slowdowns_due_to_high_cpu = 0;
    disk_write_speed_set_to_min = 0;
    m_is_lcp_running = false;
    m_is_backup_running = false;
    m_is_any_node_restarting = false;
    m_node_restart_check_sent = false;
    m_our_node_started = false;
    m_lcp_ptr.i = RNIL;
    m_lcp_ptr.p = 0;
    m_first_lcp_started = false;
    m_newestRestorableGci = 0;
    m_delete_lcp_files_ongoing = false;
    m_reset_disk_speed_time = NdbTick_getCurrentTicks();
    m_reset_delay_used = Backup::DISK_SPEED_CHECK_DELAY;
    c_initial_start_lcp_not_done_yet = false;
    m_redo_alert_factor = 1;
    m_redo_alert_state = RedoStateRep::NO_REDO_ALERT;
    signal->theData[0] = BackupContinueB::RESET_DISK_SPEED_COUNTER;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                        Backup::DISK_SPEED_CHECK_DELAY, 1);
  }
  if (startphase == 3)
  {
    jam();

    g_TypeOfStart = typeOfStart;
    if (g_TypeOfStart == NodeState::ST_INITIAL_START ||
        g_TypeOfStart == NodeState::ST_INITIAL_NODE_RESTART)
    {
      jam();
      c_initial_start_lcp_not_done_yet = true;
    }
    signal->theData[0] = reference();
    sendSignal(NDBCNTR_REF, GSN_READ_NODESREQ, signal, 1, JBB);
    return;
  }//if

  if (startphase == 7)
  {
    m_monitor_words_written = 0;
    m_backup_monitor_words_written = 0;
    m_periods_passed_in_monitor_period = 0;
    m_monitor_snapshot_start = NdbTick_getCurrentTicks();
    m_curr_disk_write_speed = c_defaults.m_disk_write_speed_min;
    m_curr_backup_disk_write_speed = c_defaults.m_disk_write_speed_min;
    m_our_node_started = true;
    c_initial_start_lcp_not_done_yet = false;
  }

  if(startphase == 7 && g_TypeOfStart == NodeState::ST_INITIAL_START &&
     c_masterNodeId == getOwnNodeId() && !isNdbMtLqh()){
    jam();
    createSequence(signal);
    return;
  }//if

  sendSTTORRY(signal);
  return;
}//Backup::execSTTOR()

void
Backup::execREAD_NODESCONF(Signal* signal)
{
  jamEntry();
  ReadNodesConf * conf = (ReadNodesConf *)signal->getDataPtr();

  {
    ndbrequire(signal->getNoOfSections() == 1);
    SegmentedSectionPtr ptr;
    SectionHandle handle(this, signal);
    handle.getSection(ptr, 0);
    ndbrequire(ptr.sz == 5 * NdbNodeBitmask::Size);
    copy((Uint32*)&conf->definedNodes.rep.data, ptr);
    releaseSections(handle);
  }

  c_aliveNodes.clear();

  Uint32 count = 0;
  for (Uint32 i = 0; i<MAX_NDB_NODES; i++) {
    jam();
    if (conf->definedNodes.get(i))
    {
      jam();
      count++;

      NodePtr node;
      ndbrequire(c_nodes.seizeFirst(node));

      node.p->nodeId = i;
      if (conf->inactiveNodes.get(i))
      {
        jam();
        node.p->alive = 0;
      } else {
        jam();
        node.p->alive = 1;
        c_aliveNodes.set(i);
      }//if
    }//if
  }//for
  c_masterNodeId = conf->masterNodeId;
  ndbrequire(count == conf->noOfNodes);
  sendSTTORRY(signal);
}

void
Backup::sendSTTORRY(Signal* signal)
{
  signal->theData[0] = 0;
  signal->theData[3] = 1;
  signal->theData[4] = 3;
  signal->theData[5] = 7;
  signal->theData[6] = 255; // No more start phases from missra
  BlockReference cntrRef = !isNdbMtLqh() ? NDBCNTR_REF : BACKUP_REF;
  sendSignal(cntrRef, GSN_STTORRY, signal, 7, JBB);
}

void
Backup::createSequence(Signal* signal)
{
  UtilSequenceReq * req = (UtilSequenceReq*)signal->getDataPtrSend();

  req->senderData  = RNIL;
  req->sequenceId  = NDB_BACKUP_SEQUENCE;
  req->requestType = UtilSequenceReq::Create;

  sendSignal(DBUTIL_REF, GSN_UTIL_SEQUENCE_REQ,
             signal, UtilSequenceReq::SignalLength, JBB);
}

void
Backup::handle_overflow(Uint64& overflow_disk_write,
                        Uint64& words_written_this_period,
                        Uint64& curr_disk_write_speed)
{
  jam();
  /**
   * If we overflowed in the last period, count it in
   * this new period, potentially overflowing again into
   * future periods...
   *
   * The overflow can only come from the last write we did in this
   * period, but potentially this write is bigger than what we are
   * allowed to write during one period.
   *
   * Calculate the overflow to pass into the new period
   * (overflowThisPeriod). It can never be more than what is
   * allowed to be written during a period.
   *
   * We could rarely end up in the case that the overflow of the
   * last write in the period even overflows the entire next period.
   * If so we put the remainder into remainingOverFlow and pass it on
   * through overflow_disk_write. In this case nothing will be written
   * in this period, so ready_to_write need not worry about this case
   * when setting overflow_disk_write, since overflow_disk_write is
   * otherwise only set by the last write in a period.
   *
   * This routine is called both for the collective LCP plus Backup
   * overflow and for the Backup-only overflow.
   */
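  /**
   * Illustration (hypothetical numbers, not taken from any
   * configuration): with curr_disk_write_speed = 1000 words/period
   * and a carried overflow_disk_write of 2500 words, this call sets
   * overflowThisPeriod = MIN(2500, 1001) = 1001 and carries
   * remainingOverFlow = 1499 into the next period. The two following
   * calls then absorb 1001 and 498 words respectively, after which
   * the overflow is fully accounted for.
   */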
  Uint64 overflowThisPeriod = MIN(overflow_disk_write,
                                  curr_disk_write_speed + 1);

  /* How much overflow remains after this period? */
  Uint64 remainingOverFlow = overflow_disk_write - overflowThisPeriod;

  if (overflowThisPeriod)
  {
    jam();
#ifdef DEBUG_CHECKPOINTSPEED
    ndbout_c("Overflow of %llu bytes (max/period is %llu bytes)",
             overflowThisPeriod * 4, curr_disk_write_speed * 4);
#endif
    if (remainingOverFlow)
    {
      jam();
#ifdef DEBUG_CHECKPOINTSPEED
      ndbout_c("  Extra overflow : %llu bytes, will take %llu further"
               " periods to clear", remainingOverFlow * 4,
               remainingOverFlow / curr_disk_write_speed);
#endif
    }
  }
  if (true || curr_disk_write_speed == m_curr_disk_write_speed)
  {
    DEB_REDO_CONTROL_DETAIL(("(%u)bytes_written_this_period: %llu kB, "
                             " overflowThisPeriod: %llu kB, "
                             " remainingOverFlow: %llu kB, "
                             " curr_disk_write_speed %llu kB",
                             instance(),
                             words_written_this_period / 256,
                             overflowThisPeriod / 256,
                             remainingOverFlow / 256,
                             curr_disk_write_speed / 256));
  }
  words_written_this_period = overflowThisPeriod;
  overflow_disk_write = remainingOverFlow;
}

void
Backup::calculate_next_delay(const NDB_TICKS curr_time)
{
  /**
   * Adjust for up to 10 milliseconds of delay of this signal. Longer
   * delays will not be handled; in that case the system is most
   * likely under too high a load and it won't matter very much that
   * we decrease the speed of checkpoints.
   *
   * We use a technique where we allow an overflow write in one
   * period. This overflow will be removed from the next period
   * such that the load on average is as specified.
   * Calculate the new delay time based on whether we overslept or
   * underslept this time. We never regulate by more than 10 ms; if
   * the oversleep is bigger than that we simply ignore it. We
   * decrease the delay by as much as we overslept or increase it by
   * as much as we underslept.
   */
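  /**
   * Worked example (assuming DISK_SPEED_CHECK_DELAY is 100 ms): if
   * the previous delay was 100 ms and the signal arrived after
   * 104 ms we overslept by 4 ms, so the next delay is 100 - 4 = 96 ms.
   * If it arrived after 95 ms we underslept by 5 ms and the next
   * delay is 105 ms. Arrivals more than 10 ms off simply clamp the
   * next delay to 90 ms or 110 ms respectively.
   */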
  int delay_time = m_reset_delay_used;
  int sig_delay = int(NdbTick_Elapsed(m_reset_disk_speed_time,
                                      curr_time).milliSec());
  if (sig_delay > delay_time + 10)
  {
    delay_time = Backup::DISK_SPEED_CHECK_DELAY - 10;
  }
  else if (sig_delay < delay_time - 10)
  {
    delay_time = Backup::DISK_SPEED_CHECK_DELAY + 10;
  }
  else
  {
    delay_time = Backup::DISK_SPEED_CHECK_DELAY -
                 (sig_delay - delay_time);
  }
  m_periods_passed_in_monitor_period++;
  m_reset_delay_used = delay_time;
  m_reset_disk_speed_time = curr_time;
#if 0
  ndbout << "Signal delay was = " << sig_delay;
  ndbout << " Current time = " << curr_time << endl;
  ndbout << " Delay time will be = " << delay_time << endl << endl;
#endif
}

void
Backup::report_disk_write_speed_report(Uint64 bytes_written_this_period,
                                       Uint64 backup_bytes_written_this_period,
                                       Uint64 millis_passed)
{
  Uint32 report = next_disk_write_speed_report;
  disk_write_speed_rep[report].backup_bytes_written =
    backup_bytes_written_this_period;
  disk_write_speed_rep[report].backup_lcp_bytes_written =
    bytes_written_this_period;
  disk_write_speed_rep[report].millis_passed =
    millis_passed;
  disk_write_speed_rep[report].redo_bytes_written =
    c_lqh->report_redo_written_bytes();
  disk_write_speed_rep[report].target_disk_write_speed =
    m_curr_disk_write_speed * CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS;
  disk_write_speed_rep[report].target_backup_disk_write_speed =
    m_curr_backup_disk_write_speed * CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS;

  next_disk_write_speed_report++;
  if (next_disk_write_speed_report == DISK_WRITE_SPEED_REPORT_SIZE)
  {
    next_disk_write_speed_report = 0;
  }
  if (next_disk_write_speed_report == last_disk_write_speed_report)
  {
    last_disk_write_speed_report++;
    if (last_disk_write_speed_report == DISK_WRITE_SPEED_REPORT_SIZE)
    {
      last_disk_write_speed_report = 0;
    }
  }
}

#define DELETE_RECOVERY_WORK 120
/**
 * This method checks that we haven't been writing faster than we are
 * supposed to during the last interval.
 */
void
Backup::monitor_disk_write_speed(const NDB_TICKS curr_time,
                                 const Uint64 millisPassed)
{
  /**
   * Independent check of DiskCheckpointSpeed.
   * We check every second or so that we are roughly sticking
   * to our diet.
   */
  jam();
  const Uint64 periodsPassed =
    (millisPassed / DISK_SPEED_CHECK_DELAY) + 1;
  const Uint64 quotaWordsPerPeriod = m_curr_disk_write_speed;
  const Uint64 quotaWordsPerPeriodBackup = m_curr_backup_disk_write_speed;
  const Uint64 maxOverFlowWords = c_defaults.m_maxWriteSize / 4;
  const Uint64 maxExpectedWords = (periodsPassed * quotaWordsPerPeriod) +
                                  maxOverFlowWords;
  const Uint64 maxExpectedWordsBackup = (periodsPassed *
                                         quotaWordsPerPeriodBackup) +
                                         maxOverFlowWords;
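  /**
   * Illustration (hypothetical numbers): with millisPassed = 1000,
   * DISK_SPEED_CHECK_DELAY = 100 ms, a quota of 25000 words/period
   * and MaxBackupWriteSize = 256 kB (65536 words), we allow up to
   * (10 + 1) * 25000 + 65536 = 340536 words (roughly 1.3 MB) in the
   * monitoring interval before the warning below is printed.
   */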
522 
523   if (unlikely((m_monitor_words_written > maxExpectedWords) ||
524                (m_backup_monitor_words_written > maxExpectedWordsBackup)))
525   {
526     jam();
527     /**
528      * In the last monitoring interval, we have written more words
529      * than allowed by the quota (DiskCheckpointSpeed), including
530      * transient spikes due to a single MaxBackupWriteSize write
531      */
532     ndbout << "Backup : Excessive Backup/LCP write rate in last"
533            << " monitoring period - recorded = "
534            << (m_monitor_words_written * 4 * 1000) / millisPassed
535            << " bytes/s, "
536            << endl
537            << "Recorded writes to backup: "
538            << (m_backup_monitor_words_written * 4 * 1000) / millisPassed
539            << " bytes/s, "
540            << endl;
541     ndbout << "Current speed is = "
542            << m_curr_disk_write_speed *
543                 CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS
544            << " bytes/s"
545            << endl;
546     ndbout << "Current backup speed is = "
547            << m_curr_backup_disk_write_speed *
548                 CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS
549            << " bytes/s"
550            << endl;
551     ndbout << "Backup : Monitoring period : " << millisPassed
552            << " millis. Bytes written : " << (m_monitor_words_written * 4)
553            << ".  Max allowed : " << (maxExpectedWords * 4) << endl;
554     ndbout << "Backup : Monitoring period : " << millisPassed
555            << " millis. Bytes written : "
556            << (m_backup_monitor_words_written * 4)
557            << ".  Max allowed : " << (maxExpectedWordsBackup * 4) << endl;
558     ndbout << "Actual number of periods in this monitoring interval: ";
559     ndbout << m_periods_passed_in_monitor_period;
560     ndbout << " calculated number was: " << periodsPassed << endl;
561   }
  report_disk_write_speed_report(4 * m_monitor_words_written,
                                 4 * m_backup_monitor_words_written,
                                 millisPassed);
  /**
   * The LCP write rate is subtracted from the calculated LCP change
   * rate to derive the lag (the lag is a positive number; if we are
   * ahead of the calculated rate we report it as a negative number).
   * We keep track of the lag since the start of the LCP and since the
   * start of the previous LCP.
   */
  Int64 lag = m_lcp_change_rate -
              ((4 * m_monitor_words_written) -
               (4 * m_backup_monitor_words_written));
  m_lcp_lag[1] += lag;

  DEB_REDO_CONTROL(("(%u)change_rate: %llu kB, LCP+Backup: %llu kB,"
                    " Backup: %llu kB, lag: %lld kB",
                    instance(),
                    m_lcp_change_rate / 1024,
                    m_monitor_words_written / 256,
                    m_backup_monitor_words_written / 256,
                    lag / 1024));

  m_monitor_words_written = 0;
  m_backup_monitor_words_written = 0;
  m_periods_passed_in_monitor_period = 0;
  m_monitor_snapshot_start = curr_time;
}

void
Backup::debug_report_redo_control(Uint32 cpu_usage)
{
#ifdef DEBUG_REDO_CONTROL
  {
    Uint64 millis_passed;
    Uint64 backup_lcp_bytes_written;
    Uint64 backup_bytes_written;
    Uint64 redo_bytes_written;
    calculate_disk_write_speed_seconds_back(1,
                                            millis_passed,
                                            backup_lcp_bytes_written,
                                            backup_bytes_written,
                                            redo_bytes_written,
                                            true);
    backup_bytes_written *= Uint64(1000);
    backup_bytes_written /= (millis_passed * Uint64(1024));
    backup_lcp_bytes_written *= Uint64(1000);
    backup_lcp_bytes_written /= (millis_passed * Uint64(1024));
    redo_bytes_written *= Uint64(1000);
    redo_bytes_written /= (millis_passed * Uint64(1024));

    /* Report new disk write speed and last second's achievement on disk */
    DEB_REDO_CONTROL(("(%u)Current disk write speed is %llu kB/sec"
                      " and current backup disk write speed is %llu kB/sec"
                      ", last sec REDO write speed %llu kB/sec, "
                      "LCP+Backup write speed %llu kB/sec"
                      ", Backup write speed %llu kB/sec"
                      ", cpu_usage: %u",
                      instance(),
                      ((m_curr_disk_write_speed *
                        CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS) /
                       Uint64(1024)),
                      ((m_curr_backup_disk_write_speed *
                        CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS) /
                       Uint64(1024)),
                      redo_bytes_written,
                      backup_lcp_bytes_written,
                      backup_bytes_written,
                      cpu_usage));
  }
#else
  (void)cpu_usage;
#endif
}

void
Backup::execREDO_STATE_REP(Signal* signal)
{
  RedoStateRep *rep = (RedoStateRep*)signal->getDataPtr();
  ndbrequire(rep->receiverInfo == RedoStateRep::ToBackup);
  m_global_redo_alert_state = (RedoStateRep::RedoAlertState)rep->redoState;
  DEB_REDO_CONTROL(("(%u) New global redo alert state: %u",
                    instance(),
                    m_global_redo_alert_state));
}

/**
 * Initialise the LCP timers when we hear of the first writes to the
 * REDO log. They could also be initialised by the start of the first LCP.
 */
void
Backup::init_lcp_timers(Uint64 redo_written_since_last_call)
{
  if (redo_written_since_last_call > 0)
  {
    if (!NdbTick_IsValid(m_lcp_start_time))
    {
      m_lcp_start_time = getHighResTimer();
      m_prev_lcp_start_time = m_lcp_start_time;
    }
  }
}

void
Backup::lcp_start_point(Signal *signal)
{
  /**
   * A new LCP is starting up, we need to keep track of this to handle
   * REDO control.
   * The start and end points of LCPs currently only come with an
   * accuracy of about 1 second, so if the LCP time is shorter than
   * this we can definitely ignore any REDO alerts.
   */
  if (!NdbTick_IsValid(m_prev_lcp_start_time))
  {
    jam();
    m_prev_lcp_start_time = getHighResTimer();
  }
  else
  {
    m_prev_lcp_start_time = m_lcp_start_time;
  }
  c_pgman->lcp_start_point(signal,
                           m_max_undo_log_level_percentage + 1,
                           m_max_redo_percentage);
  m_max_undo_log_level_percentage = m_undo_log_level_percentage;
  m_max_redo_percentage = m_redo_percentage;
  m_first_lcp_started = true;
  m_lcp_start_time = getHighResTimer();
  ndbrequire(NdbTick_IsValid(m_lcp_start_time));
  m_lcp_current_cut_point = m_prev_lcp_start_time;
  m_update_size_lcp[0] = m_update_size_lcp[1];
  m_update_size_lcp[1] = m_update_size_lcp_last;
  m_insert_size_lcp[0] = m_insert_size_lcp[1];
  m_insert_size_lcp[1] = m_insert_size_lcp_last;
  m_delete_size_lcp[0] = m_delete_size_lcp[1];
  m_delete_size_lcp[1] = m_delete_size_lcp_last;
  DEB_REDO_CONTROL(("(%u)m_insert_size_lcp[0]: %llu MByte, "
                    "m_insert_size_lcp[1]: %llu MByte, "
                    "m_insert_size_lcp_last: %llu MByte",
                    instance(),
                    (m_insert_size_lcp[0] / (1024 * 1024)),
                    (m_insert_size_lcp[1] / (1024 * 1024)),
                    (m_insert_size_lcp_last / (1024 * 1024))));
}

void
Backup::lcp_end_point()
{
  NDB_TICKS current_time = getHighResTimer();
  ndbrequire(NdbTick_IsValid(m_lcp_start_time));
  m_last_lcp_exec_time_in_ms =
    NdbTick_Elapsed(m_lcp_start_time, current_time).milliSec();
  m_lcp_current_cut_point = m_lcp_start_time;

  c_pgman->lcp_end_point(m_last_lcp_exec_time_in_ms);
  reset_lcp_timing_factors();
#ifdef DEBUG_REDO_CONTROL
  Uint64 checkpoint_size = m_insert_size_lcp[1] - m_insert_size_lcp[0];
  Uint64 checkpoint_rate = 0;
  if (m_last_lcp_exec_time_in_ms > 0)
  {
    checkpoint_rate = checkpoint_size / m_last_lcp_exec_time_in_ms;
  }
  DEB_REDO_CONTROL(("(%u)LCP END: m_insert_size_lcp[0]: %llu MByte, "
                    "Remaining lag: %lld MB, "
                    "Removed lag: %lld MB, "
                    "Checkpoint rate in this LCP: %llu kB/sec",
                    instance(),
                    (checkpoint_size / (1024 * 1024)),
                    (m_lcp_lag[1] / (1024 * 1024)),
                    (m_lcp_lag[0] / (1024 * 1024)),
                    checkpoint_rate));
#endif
  m_update_size_lcp[0] = m_update_size_lcp[1];
  m_insert_size_lcp[0] = m_insert_size_lcp[1];
  m_delete_size_lcp[0] = m_delete_size_lcp[1];
  m_lcp_lag[0] = m_lcp_lag[1];
  m_lcp_lag[1] = Int64(0);
}

Uint64
Backup::init_change_size(Uint64 update_size,
                         Uint64 insert_size,
                         Uint64 delete_size,
                         Uint64 total_memory)
{
  /**
   * The initial value for change_size assumes that inserted and
   * deleted rows always count as changes, whereas updates can
   * at times hit the same row more than once. We use an exponential
   * probability distribution to estimate whether a row has been
   * updated or not.
   */
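  /**
   * For example (hypothetical numbers): if update_size equals half of
   * total_memory, f_change_percentage = 0.5 and the fraction of rows
   * estimated to actually have been touched is 1 - e^-0.5, roughly
   * 0.39, so about 39% of the memory is counted as changed rather
   * than 50%.
   */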
  Uint64 change_size = insert_size + delete_size;
  long double f_total_memory = (long double)total_memory;
  long double f_change_size = update_size;
  long double f_change_percentage = f_change_size / f_total_memory;
  long double f_real_change_percentage = ((long double)1) -
                                         exp(-f_change_percentage);
  long double f_real_change_size = f_real_change_percentage *
                                   f_total_memory;
  change_size += (Uint64)f_real_change_size;
  return change_size;
}

Uint64
Backup::modify_change_size(Uint64 update_size,
                           Uint64 insert_size,
                           Uint64 delete_size,
                           Uint64 total_size,
                           Uint64 change_size)
{
  /**
   * Now we have calculated an estimate that is comparable
   * to the row_change_count that we get per fragment before
   * calculating the number of parts to checkpoint.
   *
   * The next step is now to modify this estimate based on
   * the amount of inserts and deletes compared to the updates.
   */
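  /**
   * Illustration (hypothetical sizes, assuming the default recovery
   * work settings of 40 for inserts and DELETE_RECOVERY_WORK = 120
   * for deletes): with update_size = 60, insert_size = 30 and
   * delete_size = 10 units out of total_size = 100, we get
   * updates_percent = 60, inserts_percent = 30 * 40 / 100 = 12 and
   * deletes_percent = 10 * 120 / 100 = 12, so change_size is scaled
   * by (60 + 12 + 12) / 100 = 0.84.
   */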
  Uint64 updates_percent = (update_size * Uint64(1005)) /
                           (Uint64(10) * total_size);
  Uint64 inserts_percent = (insert_size * Uint64(1005)) /
                           (Uint64(10) * total_size);
  Uint64 insert_recovery_work = (Uint64)get_insert_recovery_work();
  inserts_percent *= insert_recovery_work;
  inserts_percent /= Uint64(100);
  Uint64 deletes_percent = (delete_size * Uint64(1005)) /
                           (Uint64(10) * total_size);
  deletes_percent *= Uint64(DELETE_RECOVERY_WORK);
  deletes_percent /= Uint64(100);
  Uint64 change_factor = updates_percent +
                         inserts_percent +
                         deletes_percent;
  change_size *= change_factor;
  change_size /= Uint64(100);
  return change_size;
}

Uint32
Backup::calculate_parts(Uint64 change_size,
                        Uint64 total_memory)
{
  Uint64 part_total_memory = total_memory / Uint64(10);
  Uint32 min_parts = calculate_min_parts(total_memory,
                                         change_size,
                                         part_total_memory,
                                         total_memory);
  return min_parts;
}

void
Backup::calculate_seconds_since_lcp_cut(Uint64& seconds_since_lcp_cut)
{
  NDB_TICKS now = getHighResTimer();
  if (!NdbTick_IsValid(m_lcp_current_cut_point))
  {
    jam();
    seconds_since_lcp_cut = 0;
    return;
  }
  seconds_since_lcp_cut =
    NdbTick_Elapsed(m_lcp_current_cut_point, now).seconds();
}

Uint64
Backup::calculate_change_rate(Uint64 change_size,
                              Uint64& seconds_since_lcp_cut)
831   if (seconds_since_lcp_cut < 2)
832   {
833     jam();
834     /**
835      * We ignore very short LCPs, in this case it is hard to see
836      * how we could run out of REDO log and need more disk write
837      * speed.
838      */
839     return 0;
840   }
841   Uint64 change_size_per_sec = change_size / seconds_since_lcp_cut;
842   return change_size_per_sec;
843 }

Uint64
Backup::calculate_checkpoint_rate(Uint64 update_size,
                                  Uint64 insert_size,
                                  Uint64 delete_size,
                                  Uint64 total_memory,
                                  Uint64& seconds_since_lcp_cut)
{
  Uint64 checkpoint_size = 0;
  Uint32 all_parts = 0;
  Uint64 all_size = 0;
  Uint64 change_size = 0;
  Uint64 mod_change_size = 0;
  Uint64 total_size = update_size + insert_size + delete_size;
  if (total_size != 0)
  {
    if (delete_size > insert_size)
    {
      update_size += insert_size;
      delete_size -= insert_size;
      insert_size = 0;
    }
    else
    {
      update_size += delete_size;
      insert_size -= delete_size;
      delete_size = 0;
    }
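    /**
     * Netting example (hypothetical numbers): with update_size = 10,
     * insert_size = 4 and delete_size = 6 the insert volume cancels
     * against deletes, leaving update_size = 14, insert_size = 0 and
     * delete_size = 2; the overlap is treated as updates.
     */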
    calculate_seconds_since_lcp_cut(seconds_since_lcp_cut);
    change_size = init_change_size(update_size,
                                   insert_size,
                                   delete_size,
                                   total_memory);
    mod_change_size = modify_change_size(update_size,
                                         insert_size,
                                         delete_size,
                                         total_size,
                                         change_size);
    all_parts = calculate_parts(mod_change_size, total_memory);
    all_size = total_memory * Uint64(all_parts);
    all_size /= Uint64(BackupFormat::NDB_MAX_LCP_PARTS);
    change_size = (BackupFormat::NDB_MAX_LCP_PARTS - all_parts) *
                  change_size;
    change_size /= BackupFormat::NDB_MAX_LCP_PARTS;
    checkpoint_size = all_size + change_size;
  }
  Uint64 change_rate = calculate_change_rate(checkpoint_size,
                                             seconds_since_lcp_cut);
  DEB_REDO_CONTROL(("(%u)update_size: %llu MB, insert_size: %llu MB,"
                    " delete_size: %llu MB, checkpoint_size: %llu MB"
                    ", all_parts: %u, total_memory: %llu MB, "
                    "all_size: %llu MB, change_size: %llu MB, "
                    "mod_change_size: %llu MB, "
                    "seconds_since_lcp_cut: %llu",
                    instance(),
                    update_size / (Uint64(1024) * Uint64(1024)),
                    insert_size / (Uint64(1024) * Uint64(1024)),
                    delete_size / (Uint64(1024) * Uint64(1024)),
                    checkpoint_size / (Uint64(1024) * Uint64(1024)),
                    all_parts,
                    total_memory / (Uint64(1024) * Uint64(1024)),
                    all_size / (Uint64(1024) * Uint64(1024)),
                    change_size / (Uint64(1024) * Uint64(1024)),
                    mod_change_size / (Uint64(1024) * Uint64(1024)),
                    seconds_since_lcp_cut));
  return change_rate;
}

void
Backup::calculate_redo_parameters(Uint64 redo_usage,
                                  Uint64 redo_size,
                                  Uint64 redo_written_since_last_call,
                                  Uint64 millis_since_last_call,
                                  Uint64& redo_percentage,
                                  Uint64& max_redo_used_before_cut,
                                  Uint64& mean_redo_used_before_cut,
                                  Uint64& mean_redo_speed_per_sec,
                                  Uint64& current_redo_speed_per_sec,
                                  Uint64& redo_available)
{
  /* redo_size and redo_usage are in MBytes, convert to bytes */
  redo_size *= (Uint64(1024) * Uint64(1024));
  redo_usage *= (Uint64(1024) * Uint64(1024));
  redo_available = redo_size - redo_usage;
  redo_percentage = redo_usage * Uint64(100);
  redo_percentage /= redo_size;
  current_redo_speed_per_sec = redo_written_since_last_call * Uint64(1000);
  current_redo_speed_per_sec /= millis_since_last_call;
  if (current_redo_speed_per_sec > m_max_redo_speed_per_sec)
  {
    jam();
    m_max_redo_speed_per_sec = current_redo_speed_per_sec;
  }
  mean_redo_speed_per_sec = 0;
  Uint64 seconds_since_lcp_cut = 0;
  if (NdbTick_IsValid(m_lcp_current_cut_point))
  {
    jam();
    NDB_TICKS current_time = getHighResTimer();
    seconds_since_lcp_cut =
      NdbTick_Elapsed(m_lcp_current_cut_point, current_time).seconds();
  }
  if (seconds_since_lcp_cut != 0)
  {
    jam();
    mean_redo_speed_per_sec = redo_usage / seconds_since_lcp_cut;
  }
  /**
   * We assume that LCP execution time is exponentially distributed.
   * This means that our mean estimated time stays the same even
   * if the LCP has been ongoing for a while (the exponential
   * distribution is memoryless). It doesn't matter so much if this
   * estimate isn't 100% correct; it will at least not be
   * overoptimistic.
   *
   * Thus we estimate the time to complete the next LCP to be
   * the time of the last LCP.
   */
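  /**
   * Example (hypothetical numbers): with m_max_redo_speed_per_sec at
   * 10 MB/sec and m_last_lcp_exec_time_in_ms = 200000 (a 200 second
   * LCP), max_redo_used_before_cut becomes 2000 MB; with a mean
   * speed of 3 MB/sec, mean_redo_used_before_cut becomes 600 MB.
   */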
  max_redo_used_before_cut = m_max_redo_speed_per_sec *
                             m_last_lcp_exec_time_in_ms;
  max_redo_used_before_cut /= Uint64(1000);

  mean_redo_used_before_cut = mean_redo_speed_per_sec *
                              m_last_lcp_exec_time_in_ms;
  mean_redo_used_before_cut /= Uint64(1000);
}

void
Backup::change_alert_state_redo_percent(Uint64 redo_percentage)
{
  /**
   * If the fill level of the REDO log goes beyond 60% we set the
   * state to critical, independent of any calculations on REDO
   * speed. Similarly, when going beyond 40% we set the high alert
   * state. Using more than 40% of the REDO log is not a desirable
   * state to run in: it is too close to the end to be comfortable
   * and it also extends the time to recover at a restart
   * substantially.
   */
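  /**
   * Thus, taking the larger of the REDO and UNDO fill levels: up to
   * 25% gives NO_REDO_ALERT, above 25% gives REDO_ALERT_LOW, above
   * 40% gives REDO_ALERT_HIGH and above 60% gives
   * REDO_ALERT_CRITICAL.
   */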
  m_redo_alert_state = RedoStateRep::NO_REDO_ALERT;
  if (redo_percentage > Uint64(60) ||
      m_undo_log_level_percentage > 60)
  {
    jam();
    m_redo_alert_state = RedoStateRep::REDO_ALERT_CRITICAL;
  }
  else if (redo_percentage > Uint64(40) ||
           m_undo_log_level_percentage > 40)
  {
    jam();
    m_redo_alert_state = RedoStateRep::REDO_ALERT_HIGH;
  }
  else if (redo_percentage > Uint64(25) ||
           m_undo_log_level_percentage > 25)
  {
    jam();
    m_redo_alert_state = RedoStateRep::REDO_ALERT_LOW;
  }
}

void
Backup::change_alert_state_redo_usage(Uint64 max_redo_used_before_cut,
                                      Uint64 mean_redo_used_before_cut,
                                      Uint64 redo_available)
{
  if (m_redo_alert_state != RedoStateRep::REDO_ALERT_CRITICAL)
  {
    jam();
    /**
     * We have estimated the REDO usage until the next LCP will cut it again.
     * The first estimate is based on the maximum speed we have seen so far.
     * The second estimate is based on the mean speed we have seen since
     * the first current REDO log record was generated.
     *
     * If writing at maximum speed is estimated to run us out of REDO
     * space, we are at a high alert state; if running out requires only
     * 40% of that estimate, we are at a critical state.
     *
     * If running at mean speed can run us out of REDO space we are
     * obviously in a critical state. Even an estimate that fills half
     * of the remaining space puts us in a critical state, and one that
     * fills a third of it puts us in a high alert state.
     *
     * We don't even attempt those checks if we haven't got good measures
     * of the time until the next REDO cut.
     */
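    /**
     * Illustration (hypothetical numbers): if max_redo_used_before_cut
     * is 1000 MB and mean_redo_used_before_cut is 600 MB, we enter the
     * critical state when redo_available drops below 400 MB and the
     * high alert state below 1000 MB; the mean-based limits (300 MB
     * and 200 MB) are subsumed here since they are smaller.
     */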
    Uint64 max_critical_limit = (Uint64(2) * max_redo_used_before_cut) / Uint64(5);
    Uint64 max_high_limit = max_redo_used_before_cut;
    Uint64 mean_critical_limit = mean_redo_used_before_cut / Uint64(2);
    Uint64 mean_high_limit = mean_redo_used_before_cut / Uint64(3);

    if (redo_available < max_critical_limit)
    {
      jam();
      m_redo_alert_state = RedoStateRep::REDO_ALERT_CRITICAL;
    }
    else if (redo_available < mean_critical_limit)
    {
      jam();
      m_redo_alert_state = RedoStateRep::REDO_ALERT_CRITICAL;
    }
    else if (redo_available < max_high_limit)
    {
      jam();
      m_redo_alert_state = RedoStateRep::REDO_ALERT_HIGH;
    }
    else if (redo_available < mean_high_limit)
    {
      jam();
      m_redo_alert_state = RedoStateRep::REDO_ALERT_HIGH;
    }
  }
}

void
Backup::handle_global_alert_state(
  Signal *signal,
  RedoStateRep::RedoAlertState save_redo_alert_state)
{
  m_local_redo_alert_state = m_redo_alert_state;
  if (save_redo_alert_state != m_redo_alert_state)
  {
    jam();
    RedoStateRep *rep = (RedoStateRep*)signal->getDataPtrSend();
    rep->receiverInfo = RedoStateRep::ToNdbcntr;
    rep->redoState = m_redo_alert_state;
    sendSignal(NDBCNTR_REF, GSN_REDO_STATE_REP, signal, 2, JBB);
  }
  if (m_global_redo_alert_state > m_redo_alert_state)
  {
    jam();
    m_redo_alert_state = m_global_redo_alert_state;
  }
}

void
Backup::set_redo_alert_factor(Uint64 redo_percentage)
{
  m_redo_alert_factor = 1;
  if (m_redo_alert_state == RedoStateRep::REDO_ALERT_CRITICAL)
  {
    jam();
    m_redo_alert_factor = 24;
  }
  else if (m_redo_alert_state == RedoStateRep::REDO_ALERT_HIGH)
  {
    jam();
    m_redo_alert_factor = 8;
  }
  else if (m_redo_alert_state == RedoStateRep::REDO_ALERT_LOW)
  {
    jam();
    m_redo_alert_factor = 4;
  }
}

void
Backup::set_lcp_timing_factors(Uint64 seconds_since_lcp_cut)
{
  if (m_last_lcp_exec_time_in_ms == 0)
  {
    return;
  }
  Uint64 lcp_time_in_secs = m_last_lcp_exec_time_in_ms / 1000;

  /**
   * seconds_since_lcp_cut normally grows to a bit more than two
   * times the LCP time. If the LCP time increases by more than
   * 6 seconds we try to increase the disk write speed to handle
   * this. If the time since the last cut increases to even three
   * times the LCP time we increase the factor even more.
   *
   * There is no need to set these factors in a dramatic manner.
   * They are used to keep LCP times low to ensure that recovery
   * times are low. They assist in protecting the REDO log from the
   * head meeting the tail, but that isn't their main purpose; many
   * other mechanisms take care of that.
   */
  Uint64 low_threshold = Uint64(2) * lcp_time_in_secs;
  low_threshold += Uint64(6);
  Uint64 high_threshold = Uint64(3) * lcp_time_in_secs;
  high_threshold += Uint64(6);
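  /**
   * Example (hypothetical numbers): with a 60 second LCP,
   * low_threshold = 126 and high_threshold = 186 seconds. A cut
   * distance above 126 seconds bumps m_lcp_timing_factor to at
   * least 110, and above 186 seconds to at least 120.
   */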
  if (seconds_since_lcp_cut + Uint64(3) < lcp_time_in_secs)
  {
    jam();
    /**
     * Ignore this check for a while after the LCP has just
     * started. First of all we write more at the start due to the
     * lag anyway; second, we give the state time to settle
     * down before acting on it.
     */
    return;
  }
  if (seconds_since_lcp_cut > low_threshold)
  {
    jam();
    m_lcp_timing_counter = 2;
    Uint64 new_timing_factor = Uint64(110);
    if (seconds_since_lcp_cut > high_threshold)
    {
      jam();
      new_timing_factor = Uint64(120);
    }
    if (new_timing_factor > m_lcp_timing_factor)
    {
      jam();
      m_lcp_timing_factor = new_timing_factor;
    }
  }
  /**
   * Ensure that the effects of the REDO Alert Level stick to some
   * level all through the next LCP as well. This will help bring
   * us permanently down in REDO Alert levels.
   */
  if (m_redo_alert_state == RedoStateRep::REDO_ALERT_LOW)
  {
    jam();
    m_lcp_timing_counter = 2;
    Uint64 new_timing_factor = Uint64(110);
    if (new_timing_factor > m_lcp_timing_factor)
    {
      jam();
      m_lcp_timing_factor = new_timing_factor;
    }
  }
  else if (m_redo_alert_state == RedoStateRep::REDO_ALERT_HIGH)
  {
    jam();
    m_lcp_timing_counter = 2;
    Uint64 new_timing_factor = Uint64(120);
    if (new_timing_factor > m_lcp_timing_factor)
    {
      jam();
      m_lcp_timing_factor = new_timing_factor;
    }
  }
  else if (m_redo_alert_state == RedoStateRep::REDO_ALERT_CRITICAL)
  {
    jam();
    m_lcp_timing_counter = 2;
    Uint64 new_timing_factor = Uint64(130);
    if (new_timing_factor > m_lcp_timing_factor)
    {
      jam();
      m_lcp_timing_factor = new_timing_factor;
    }
  }
}

void
Backup::reset_lcp_timing_factors()
{
  if (m_lcp_timing_counter > 0)
  {
    jam();
    m_lcp_timing_counter--;
    if (m_lcp_timing_counter == 0)
    {
      jam();
      m_lcp_timing_factor = Uint64(100);
    }
    else
    {
      jam();
      m_lcp_timing_factor -= Uint64(10);
      ndbrequire(m_lcp_timing_factor >= Uint64(100));
    }
  }
}

void
Backup::set_proposed_disk_write_speed(Uint64 current_redo_speed_per_sec,
                                      Uint64 mean_redo_speed_per_sec,
                                      Uint64 seconds_since_lcp_cut)
{
  /**
   * When LCPs are increasing the time it takes to execute an LCP we try to
   * get it back by increasing the disk write speed until the end of the
   * next LCP. This is controlled by the m_lcp_timing_factor variable. This
   * variable is set to 100 when no such issues are at hand.
   */
  m_proposed_disk_write_speed *= m_lcp_timing_factor;
  m_proposed_disk_write_speed /= Uint64(100);

  /**
   * We save the proposed disk write speed, multiplied by the LCP timing
   * factor, as m_lcp_change_rate; this is the calculated change rate with
   * some long-term factors derived from m_lcp_timing_factor.
   *
   * The short-term proposed disk write speed in addition contains
   * further components to ensure that we actually deliver the calculated
   * LCP change rate.
   */
  m_lcp_change_rate = m_proposed_disk_write_speed;

  /**
   * The proposed disk write speed is not always achieved, and we have some
   * level of slowness in responding to this setting, so we increase the
   * proposed disk write speed by 25% to cater for this.
   *
   * There are many reasons why we won't achieve this speed. A few are:
   * 1) Variable completion of LCP execution in the LDMs in the cluster.
   * 2) High CPU usage when the REDO log alert factor is still not activated.
   * 3) The disk not keeping up temporarily.
   * 4) Setting the proposed disk write speed increases the maximum disk
   *    write speed, so it can take a while before it affects the actual
   *    disk write speed, since that is changed by an adaptive
   *    algorithm.
   */
  m_proposed_disk_write_speed *= Uint64(125);
  m_proposed_disk_write_speed /= Uint64(100);

  Int64 lag = m_lcp_lag[0] + m_lcp_lag[1];
  Int64 lag_per_sec = 0;
  if (seconds_since_lcp_cut > 0)
  {
    lag_per_sec = lag / (Int64)seconds_since_lcp_cut;
  }
  if (current_redo_speed_per_sec > mean_redo_speed_per_sec)
  {
    jam();
    Uint64 factor = current_redo_speed_per_sec * Uint64(100);
    factor /= (mean_redo_speed_per_sec + 1);
    if (factor > Uint64(120))
    {
      jam();
      factor = Uint64(120);
    }
    /**
     * Increase the proposed disk write speed by up to 20% if we currently
     * generate more REDO logging compared to the mean. This is aiming to
     * cater for sudden increases in write activity to ensure that we start
     * acting quickly on those changes. At the same time we cap this
     * change at a 20% increase. This avoids too high fluctuations in the
     * disk write speed.
     */
    m_proposed_disk_write_speed *= factor;
    m_proposed_disk_write_speed /= Uint64(100);
  }
  if (m_redo_alert_state == RedoStateRep::REDO_ALERT_LOW)
  {
    jam();
    /**
     * Add another 10% to the proposed speed if we are at low
     * alert level.
     */
    m_proposed_disk_write_speed *= Uint64(110);
    m_proposed_disk_write_speed /= Uint64(100);
  }
  else if (m_redo_alert_state == RedoStateRep::REDO_ALERT_HIGH)
  {
    jam();
    /**
     * Add another 25% to the proposed speed if we are at high
     * alert level.
     */
    m_proposed_disk_write_speed *= Uint64(125);
    m_proposed_disk_write_speed /= Uint64(100);
  }
  else if (m_redo_alert_state == RedoStateRep::REDO_ALERT_CRITICAL)
  {
    jam();
    /**
     * Add another 50% to the proposed speed if we are at critical
     * alert level.
     */
    m_proposed_disk_write_speed *= Uint64(150);
    m_proposed_disk_write_speed /= Uint64(100);
  }
  if (lag < Int64(0) &&
      m_redo_alert_state < RedoStateRep::REDO_ALERT_HIGH)
  {
    /**
     * The REDO alert level is below HIGH and we are running faster
     * than necessary, so we slow down based on the calculated lag
     * per second (which, when negative, means that we are ahead).
     * We never slow down by more than 20%.
     */
    lag_per_sec = Int64(-1) * lag_per_sec; /* Make number positive */
    Uint64 percentage_decrease = Uint64(lag_per_sec) * Uint64(100);
    percentage_decrease /= (m_proposed_disk_write_speed + 1);
    if (percentage_decrease > Uint64(20))
    {
      jam();
      m_proposed_disk_write_speed *= Uint64(80);
      m_proposed_disk_write_speed /= Uint64(100);
    }
    else
    {
      jam();
      m_proposed_disk_write_speed -= lag_per_sec;
    }
  }
  if (lag > Int64(0))
  {
    /**
     * We don't keep up with the calculated LCP change rate.
     * We will increase the proposed disk write speed by up
     * to 25% to keep up with the LCP change rate.
     *
     * We avoid regaining the lag too fast since it is easy
     * to write too much at the beginning of an LCP otherwise.
     * This would create a too bursty environment, which is
     * undesirable.
     */
    jam();
    Uint64 percentage_increase = lag_per_sec * Uint64(100);
    percentage_increase /= (m_proposed_disk_write_speed + 1);
    DEB_LCP_LAG(("(%u)Lag per second is %lld, percent_increase: %llu",
                 instance(), lag_per_sec, percentage_increase));
    Uint64 max_percentage_increase = Uint64(25);
    if (m_last_lcp_dd_percentage > 85)
    {
      jam();
      max_percentage_increase = Uint64(600);
    }
    else if (m_last_lcp_dd_percentage > 10)
    {
      /**
       * increase = percentage / (100 - percentage)
       * Multiply by 100 to get it in percent.
       */
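      /**
       * Example: with m_last_lcp_dd_percentage = 50 this gives
       * 100 * 50 / (100 - 50) = 100, i.e. up to a 100% increase.
       * At 10% or below the default cap of 25% applies, and above
       * 85% the cap is 600%.
       */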
      jam();
      Uint64 divisor = Uint64(100) - Uint64(m_last_lcp_dd_percentage);
      Uint64 mult = Uint64(100) * Uint64(m_last_lcp_dd_percentage);
      max_percentage_increase = mult / divisor;
    }

    if (percentage_increase > max_percentage_increase)
    {
      jam();
      Uint64 increase_factor = Uint64(100) + max_percentage_increase;
      m_proposed_disk_write_speed *= increase_factor;
      m_proposed_disk_write_speed /= Uint64(100);
    }
    else
    {
      jam();
      m_proposed_disk_write_speed += lag_per_sec;
    }
  }
}

void
Backup::measure_change_speed(Signal *signal, Uint64 millis_since_last_call)
1391 {
1392   /**
1393    * The aim of this function is to calculate the following values:
1394    * 1) m_redo_alert_state
1395    * 2) m_redo_alert_factor
1396    * 3) m_proposed_disk_write_speed
1397    *
1398    * The m_redo_alert_state variable is used to set the m_redo_alert_factor
1399    * that raises the priority of LCP writes towards other operation.
1400    *
1401    * The variable is kept consistent in the cluster to ensure that one
1402    * REDO log that is overloaded will also ensure that all other LDMs in
1403    * the cluster will speed up LCP execution.
1404    *
1405    * Based on this variable we raise the maximum speed based on the
1406    * configured disk write parameters.
1407    * This variable can also change the adaptive algorithm that slows down
1408    * LCP execution due to high CPU load. It ensures that we raise the
1409    * prio on LCP execution by ensuring that all LCP execution signals
1410    * are executed at A-level and we fill the buffers more actively when
1411    * set at alert levels.
1412    * Finally setting this variable to an alert level means that we speed up
1413    * handling of empty LCP fragments.
1414    *
1415    * The m_redo_alert_factor changes the amount of writes we will do in
1416    * one real-time break when executing at A-level.
1417    *
1418    * The proposed disk write speed is used to increase the maximum speed
1419    * used in the adaptive disk write speed algorithm if necessary.
1420    *
1421    * Calculation of the proposed disk write speed is fairly complicated.
1422    * The idea is to use the same mechanics used to decide how much an LCP
1423    * will execute on a fragment basis on a global level.
1424    *
1425    * get_redo_stats
1426    * --------------
1427    * To do this we keep track of the amount of changes we have done since
1428    * the start of the previous LCP. We keep track of this by adding the
1429    * average row size to a global update_size, insert_size and delete_size
1430    * in DBLQH. These variables are requested in the get_redo_stats call to
1431    * DBLQH.
1432    *
1433    * calculate_total_size
1434    * --------------------
1435    * To calculate the change size we use different change factors for
1436    * inserts and deletes. Deletes generate 20% more per byte compared
1437    * to updates and inserts generate less, 40% by default, compared to
1438    * updates. If we have both inserts and deletes we will only use
1439    * the larger of the two and the overlap is treated as updates.
1440    * This is the same mechanism used in the method calculate_row_change_count
1441    * used when deciding the number of parts to checkpoint for a specific
1442    * fragment.
1443    *
1444    * calculate_parts
1445    * ---------------
1446    * Updates can at times hit the same row; we estimate the number of updates
1447    * to the same row by using a Poisson distribution of writes to the rows.
1448    * This means that we can estimate the number of rows not written by using
1449    * an exponential distribution. Thus it is easy to calculate the percent of
1450    * data that has been written. Using this information we use the same
1451    * function (calculate_min_parts) to calculate the parts to checkpoint
1452    * on a global level, this function returns the number of parts with the
1453    * maximum number of parts being the BackupFormat::NDB_MAX_LCP_PARTS.
1454    *
1455    * calculate_change_rate
1456    * ---------------------
1457    * Finally we use the change size, the number of parts and the seconds
1458    * elapsed since the change counters were cut. This gives us a calculated
1459    * proposed disk write speed. For this calculation we retrieve the time
1460    * since the start of the previous LCP.
1461    *
1462    * calculate_redo_parameters
1463    * -------------------------
1464    * We got redo_size, redo_usage and redo_written_since_last_call from the
1465    * call to get_redo_stats. Based on this information we calculate the
1466    * following variables.
1467    * redo_percentage:
1468    * ................
1469    * Percentage of REDO log currently in use. This is used directly to set the
1470    * m_redo_alert_factor.
1471    *
1472    * max_redo_used_before_cut:
1473    * mean_redo_used_before_cut:
1474    * redo_available:
1475    * ..........................
1476    * These three variables together are used to calculate if there is a risk
1477    * that we will run out of REDO log even without a high REDO percentage. If
1478    * so we will set the m_redo_alert_state based on these variables.
1479    * The max_redo_used_before_cut is an estimate of how much REDO log will
1480    * write before the next LCP is completed if maximum REDO write speed is
1481    * used. Similarly for mean_redo_used_before_cut but based on average REDO
1482    * write speed. redo_available is the amount of REDO log still available.
1483    *
1484    * mean_redo_speed_per_sec:
1485    * current_redo_speed_per_sec:
1486    * ...........................
1487    * These are used to see if we are currently very active in writing the
1488    * REDO log. If we are we will increase the proposed disk write speed a bit
1489    * as an effect of this.
1490    *
1491    * change_alert_state_redo_percent
1492    * -------------------------------
1493    * Based on redo_percentage we will set m_redo_alert_state.
1494    *
1495    * change_alert_state_redo_usage
1496    * -----------------------------
1497    * The above calculation based on max_redo_used_before_cut,
1498    * mean_redo_used_before_cut and redo_available is performed here to set
1499    * m_redo_alert_state appropriately.
1500    *
1501    * handle_global_alert_state
1502    * -------------------------
1503    * Ensure that we are synchronised in our REDO alert state with other LDMs
1504    * in the cluster since the LCP protocol is global.
1505    *
1506    * set_redo_alert_factor
1507    * ---------------------
1508    * Set m_redo_alert_factor based on m_redo_alert_state and redo_percentage.
1509    *
1510    * calculate_change_rate
1511    * ---------------------
1512    * Calculate proposed disk write speed based on calculated value and on the
1513    * current activity level as reported in mean_redo_speed_per_sec and
1514    * current_redo_speed_per_sec. We will also increase to cater for some safety
1515    * levels and based on the m_redo_alert_state.
1516    */
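  /**
   * Illustration (editor's sketch): the Poisson-based estimate mentioned
   * in the calculate_parts section above. The names below are local to
   * this example, not members of this block. With w writes spread
   * uniformly over r rows, the probability that a given row is untouched
   * is approximately exp(-w/r), so:
   *
   *   double mean_writes_per_row = double(w) / double(r);
   *   double fraction_written = 1.0 - exp(-mean_writes_per_row);
   *   // e.g. w == r gives 1 - exp(-1), roughly 63% of the rows changed,
   *   // not 100%, since some rows are hit more than once.
   */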
1517   Uint64 redo_usage;
1518   Uint64 redo_size;
1519   Uint64 redo_written_since_last_call;
1520   Uint64 insert_size;
1521   Uint64 delete_size;
1522   Uint64 update_size;
1523   c_lqh->get_redo_stats(redo_usage,
1524                         redo_size,
1525                         redo_written_since_last_call,
1526                         update_size,
1527                         insert_size,
1528                         delete_size);
1529 
1530   if (redo_size == 0)
1531   {
1532     jam();
1533     return;
1534   }
1535   init_lcp_timers(redo_written_since_last_call);
1536 
1537   Uint64 total_memory = get_total_memory();
1538   Uint64 curr_change_rate;
1539   {
1540     /**
1541      * In some cases we might have had an almost idle system for a while.
1542      * In this case it is not so good to base our disk write speed on
1543      * the average change rate; it is better to use the current change
1544      * rate. But we don't want to rely on the current rate too much, so
1545      * we scale it down to 75% to avoid being too strongly impacted by
1546      * sudden spikes in write rates.
1547      */
1548     Uint64 curr_update_size = update_size - m_update_size_lcp_last;
1549     Uint64 curr_insert_size = insert_size - m_insert_size_lcp_last;
1550     Uint64 curr_delete_size = delete_size - m_delete_size_lcp_last;
1551     Uint64 curr_seconds_since_lcp_cut = 0;
1552     curr_change_rate = calculate_checkpoint_rate(curr_update_size,
1553                                                  curr_insert_size,
1554                                                  curr_delete_size,
1555                                                  total_memory,
1556                                                  curr_seconds_since_lcp_cut);
1557     if (curr_change_rate != 0)
1558     {
1559       curr_change_rate *= curr_seconds_since_lcp_cut;
1560     }
1561     curr_change_rate *= Uint64(75);
1562     curr_change_rate /= Uint64(100);
1563   }
1564   m_update_size_lcp_last = update_size;
1565   m_insert_size_lcp_last = insert_size;
1566   m_delete_size_lcp_last = delete_size;
1567 
1568   Uint64 redo_percentage;
1569   Uint64 max_redo_used_before_cut;
1570   Uint64 mean_redo_used_before_cut;
1571   Uint64 mean_redo_speed_per_sec;
1572   Uint64 current_redo_speed_per_sec;
1573   Uint64 redo_available;
1574   calculate_redo_parameters(redo_usage,
1575                             redo_size,
1576                             redo_written_since_last_call,
1577                             millis_since_last_call,
1578                             redo_percentage,
1579                             max_redo_used_before_cut,
1580                             mean_redo_used_before_cut,
1581                             mean_redo_speed_per_sec,
1582                             current_redo_speed_per_sec,
1583                             redo_available);
1584 
1585   update_size -= m_update_size_lcp[0];
1586   insert_size -= m_insert_size_lcp[0];
1587   delete_size -= m_delete_size_lcp[0];
1588   Uint64 seconds_since_lcp_cut = 0;
1589   Uint64 change_rate = calculate_checkpoint_rate(update_size,
1590                                                  insert_size,
1591                                                  delete_size,
1592                                                  get_total_memory(),
1593                                                  seconds_since_lcp_cut);
1594   change_rate = MAX(change_rate, curr_change_rate);
1595 
1596   m_proposed_disk_write_speed = change_rate;
1597 
1598   m_redo_percentage = redo_percentage;
1599   m_max_redo_percentage = MAX(redo_percentage, m_max_redo_percentage);
1600   RedoStateRep::RedoAlertState save_redo_alert_state =
1601     m_local_redo_alert_state;
1602   change_alert_state_redo_percent(redo_percentage);
1603   change_alert_state_redo_usage(max_redo_used_before_cut,
1604                                 mean_redo_used_before_cut,
1605                                 redo_available);
1606   handle_global_alert_state(signal, save_redo_alert_state);
1607   c_pgman->set_redo_alert_state(m_redo_alert_state);
1608   set_redo_alert_factor(redo_percentage);
1609   set_lcp_timing_factors(seconds_since_lcp_cut);
1610   set_proposed_disk_write_speed(current_redo_speed_per_sec,
1611                                 mean_redo_speed_per_sec,
1612                                 seconds_since_lcp_cut);
1613 
1614 #ifdef DEBUG_REDO_CONTROL
1615   Int64 current_lag = m_lcp_lag[0] + m_lcp_lag[1];
1616   DEB_REDO_CONTROL(("(%u)Proposed speed is %llu kB/sec"
1617                     ", current_redo_speed is %llu kB/sec and"
1618                     ", mean_redo_speed is %llu kB/sec"
1619                     ", %s is %llu MB, change_rate is: %llu kB",
1620                     instance(),
1621                     (m_proposed_disk_write_speed / Uint64(1024)),
1622                     (current_redo_speed_per_sec / Uint64(1024)),
1623                     (mean_redo_speed_per_sec / Uint64(1024)),
1624                     (current_lag >= 0) ? "lag" : "ahead",
1625                     (current_lag >= 0) ? (current_lag / (1024 * 1024)) :
1626                                          (-current_lag/ (1024 * 1024)),
1627                     (m_lcp_change_rate / 1024)));
1628   DEB_REDO_CONTROL(("(%u)state: %u, redo_size: %llu MByte, "
1629                     "redo_percent: %llu, last LCP time in ms: %llu"
1630                     ", m_lcp_timing_factor: %llu%%",
1631                     instance(),
1632                     m_redo_alert_state,
1633                     redo_size,
1634                     redo_percentage,
1635                     m_last_lcp_exec_time_in_ms,
1636                     m_lcp_timing_factor));
1637 #endif
1638 }
1639 
1640 Uint64
1641 Backup::calculate_proposed_disk_write_speed()
1642 {
1643   if (!is_partial_lcp_enabled() || !is_redo_control_enabled())
1644   {
1645     jam();
1646     return 0;
1647   }
1648   Uint64 proposed_speed = m_proposed_disk_write_speed;
1649   proposed_speed /= CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS;
1650   return proposed_speed;
1651 }
1652 
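/**
 * Illustration (editor's sketch): the speed variables are kept in
 * per-measurement-period units internally, while the proposed speed is
 * tracked per second. Assuming the conversion factor is 10 (that is,
 * ten 100 millisecond periods per second), the mapping is simply:
 *
 *   Uint64 per_second = per_period * CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS;
 *   Uint64 back_to_period = per_second / CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS;
 *
 * which is why the DEB_REDO_CONTROL printouts below multiply by the
 * factor before converting to kB/sec.
 */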
1653 /**
1654  * Calculate the current max and min write speeds, based on the
1655  * current disk-write demands on this LDM thread
1656  */
1657 void
1658 Backup::calculate_current_speed_bounds(Uint64& max_speed,
1659                                        Uint64& max_backup_speed,
1660                                        Uint64& min_speed)
1661 {
1662   jam();
1663 
1664   max_speed = c_defaults.m_disk_write_speed_max;
1665   max_backup_speed = c_defaults.m_disk_write_speed_max;
1666   min_speed = c_defaults.m_disk_write_speed_min;
1667 
1668   {
1669     /**
1670      * Critical level for REDO means that we need to write checkpoint
1671      * urgently. We set it to maximum configurable level (level at own
1672      * restarts).
1673      *
1674      * High level for REDO means that we need to speed up checkpoints,
1675      * but there is still no urgency. In this we set the maximum
1676      * checkpoint speed equal to the speed when another node is
1677      * performing a node restart.
1678      *
1679      * We calculate proposed speed based on the REDO write speed
1680      * adjusted based on the setting of RecoveryWork. To keep up
1681      * with writing in a large database we need to write about
1682      * CHANGE_SPEED * (1 + (100 / RecoveryWork)). Thus at the default
1683      * setting of RecoveryWork we need to write about 3x the CHANGE_SPEED
1684      * to LCP files to keep the checkpoints short.
1685      *
1686      * We will attempt to keep the checkpoint short, but we will
1687      * only adjust the maximum level for this purpose. We will
1688      * not decrease application writes more than necessary to keep
1689      * this write speed. We will impact application performance
1690      * more when the REDO log level comes closer to critical levels.
1691      *
1692      * We keep track of proposed disk write speed also when no LCP
1693      * is ongoing. Otherwise it will take a long time to speed up
1694      * disk write speed again when a new LCP starts up again.
1695      */
1696     jam();
1697     if (m_redo_alert_state == RedoStateRep::REDO_ALERT_CRITICAL)
1698     {
1699       jam();
1700       max_speed = c_defaults.m_disk_write_speed_max_own_restart;
1701       DEB_REDO_CONTROL(("(%u)Critical REDO level, new max_speed: %llu kB/sec",
1702                         instance(),
1703                         ((max_speed *
1704          Uint64(CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS)) / Uint64(1024))
1705                         ));
1706     }
1707     else if (m_redo_alert_state == RedoStateRep::REDO_ALERT_HIGH)
1708     {
1709       jam();
1710       max_speed = c_defaults.m_disk_write_speed_max_other_node_restart;
1711       DEB_REDO_CONTROL(("(%u)High REDO level, new max_speed: %llu kB/sec",
1712                         instance(),
1713                         ((max_speed *
1714          Uint64(CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS)) / Uint64(1024))
1715                         ));
1716     }
1717     else if (m_is_any_node_restarting)
1718     {
1719       jam();
1720       max_speed = c_defaults.m_disk_write_speed_max_other_node_restart;
1721       DEB_REDO_CONTROL(("(%u)Node restarting, new max_speed: %llu kB/sec",
1722                         instance(),
1723                         ((max_speed *
1724          Uint64(CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS)) / Uint64(1024))
1725                         ));
1726     }
1727     Uint64 proposed_speed = calculate_proposed_disk_write_speed();
1728     if (proposed_speed > max_speed)
1729     {
1730       jam();
1731       max_speed = proposed_speed;
1732       DEB_REDO_CONTROL(("(%u)Proposed speed exceeds max_speed, "
1733                         "new max_speed: %llu kB/sec",
1734                         instance(),
1735                         ((max_speed *
1736          Uint64(CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS)) / Uint64(1024))
1737                         ));
1738     }
1739     DEB_REDO_CONTROL(("(%u)max_speed set to %llu kB/sec",
1740                       instance(),
1741                       ((max_speed *
1742       Uint64(CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS)) / Uint64(1024))));
1743   }
1744 
1745   /**
1746    * Thread balance
1747    *
1748    * As Backup is currently run on one LDM instance, we need to take
1749    * some steps to give it some extra DiskWriteSpeed allowance during
1750    * a Backup.  This becomes more acute with more LDM threads.
1751    * The correct way to handle this is to parallelise backup and
1752    * the backup log.
1753    *
1754    * Until then, we will skew the per-LDM disk write speed bounds
1755    * temporarily during a Backup so that LDM 1 has a large fixed
1756    * portion as well as its usual 1/n share for LCP.
1757    *
1758    * When the Backup completes, balance is restored.
1759    */
1760 
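  /**
   * Illustration (editor's sketch): a worked example of the skew below,
   * assuming 4 LDM threads, a per-LDM max_backup_speed of 10 MB/s and
   * m_backup_disk_write_pct = 50 (an assumed configuration, not a code
   * default asserted here):
   *
   *   node_max_speed           = 10 * 4        = 40 MB/s
   *   node_backup_max_speed    = 40 * 50 / 100 = 20 MB/s
   *   node_lcp_max_speed       = 40 - 20       = 20 MB/s
   *   ldm_thread_lcp_max_speed = 20 / 4        =  5 MB/s
   *   backup_ldm_max_speed     = 20 + 5        = 25 MB/s
   *   other_ldm_max_speed      =  5 MB/s each
   *
   * Total = 25 + 3 * 5 = 40 MB/s, which satisfies the ndbrequire that
   * the per-LDM bounds sum to no more than node_max_speed.
   */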
1761   const Uint32 num_ldm_threads = globalData.ndbMtLqhThreads;
1762 
1763   if (m_is_backup_running && m_skew_disk_speed &&
1764       num_ldm_threads > 1)
1765   {
1766     jam();
1767 
1768     const Uint64 node_max_speed =
1769       max_backup_speed *
1770       num_ldm_threads;
1771 
1772     /* Backup will get a percentage of the node total allowance */
1773     Uint64 node_backup_max_speed =
1774       (node_max_speed * c_defaults.m_backup_disk_write_pct) /
1775       100;
1776 
1777     /* LCP gets the rest */
1778     Uint64 node_lcp_max_speed =
1779       node_max_speed - node_backup_max_speed;
1780 
1781     /* LDM threads get a fair share of the LCP allowance */
1782     Uint64 ldm_thread_lcp_max_speed =
1783       node_lcp_max_speed / num_ldm_threads;
1784 
1785     /* Backup LDM must perform both node Backup + thread LCP */
1786     Uint64 backup_ldm_max_speed =
1787       node_backup_max_speed +
1788       ldm_thread_lcp_max_speed;
1789 
1790     /* Other LDMs just do thread LCP */
1791     Uint64 other_ldm_max_speed =
1792       ldm_thread_lcp_max_speed;
1793 
1794     ndbrequire(backup_ldm_max_speed +
1795                ((num_ldm_threads - 1) *
1796                 other_ldm_max_speed) <=
1797                node_max_speed);
1798 
1799     if (is_backup_worker())
1800     {
1801       jam();
1802       /**
1803        * Min is set to the node backup speed;
1804        * this should quickly increase the thread's
1805        * allowance.
1806        */
1807       max_backup_speed = backup_ldm_max_speed;
1808       min_speed = MAX(min_speed, node_backup_max_speed);
1809       if (!is_redo_control_enabled())
1810       {
1811         jam();
1812         max_speed = MAX(max_speed, max_backup_speed);
1813       }
1814     }
1815     else
1816     {
1817       jam();
1818       /**
1819        * Trim write bandwidth available
1820        * to other LDM threads
1821        */
1822       max_backup_speed = other_ldm_max_speed;
1823       min_speed = MIN(min_speed, max_backup_speed);
1824       if (!is_redo_control_enabled())
1825       {
1826         jam();
1827         max_speed = max_backup_speed;
1828       }
1829     }
1830   }
1831   if (m_is_backup_running &&
1832       is_redo_control_enabled())
1833   {
1834     /**
1835      * Make sure that the total can be the sum while running both a backup
1836      * and an LCP at the same time. The minimum is the same for total and
1837      * for backup. The minimum is always based on the configured value.
1838      */
1839     jam();
1840     max_speed += max_backup_speed;
1841   }
1842   ndbrequire(min_speed <= max_speed);
1843 }
1844 
1845 void
1846 Backup::adjust_disk_write_speed_down(Uint64& curr_disk_write_speed,
1847                                      Uint64& loc_disk_write_speed_set_to_min,
1848                                      Uint64 min_speed,
1849                                      int adjust_speed)
1850 {
1851   if ((Int64)curr_disk_write_speed < (Int64)adjust_speed)
1852   {
1853     loc_disk_write_speed_set_to_min++;
1854     curr_disk_write_speed = min_speed;
1855   }
1856   else
1857   {
1858     curr_disk_write_speed -= adjust_speed;
1859     if (curr_disk_write_speed < min_speed)
1860     {
1861       loc_disk_write_speed_set_to_min++;
1862       curr_disk_write_speed = min_speed;
1863     }
1864   }
1865 }
1866 
1867 void
1868 Backup::adjust_disk_write_speed_up(Uint64& curr_disk_write_speed,
1869                                    Uint64 max_speed,
1870                                    int adjust_speed)
1871 {
1872   curr_disk_write_speed += adjust_speed;
1873   if (curr_disk_write_speed > max_speed)
1874   {
1875     curr_disk_write_speed = max_speed;
1876   }
1877 }
1878 
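/**
 * Illustration (editor's sketch): the adjustment steps fed to the two
 * helpers above are fixed fractions of the configured speed window.
 * Assuming min = 10 MB/s and max = 70 MB/s, the window is 60 MB/s and:
 *
 *   adjust_speed_up          = 60 / 6  = 10 MB/s (about 16% of window)
 *   adjust_speed_up_high     = 60 / 3  = 20 MB/s
 *   adjust_speed_down_high   = 60 / 5  = 12 MB/s
 *   adjust_speed_down_medium = 60 / 8  =  7 MB/s (integer division)
 *   adjust_speed_down_low    = 60 / 12 =  5 MB/s
 *
 * One feedback round therefore never moves the speed by more than a
 * third of the window, which keeps the adaptation from oscillating.
 */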
1879 /**
1880  * Calculate new disk checkpoint write speed based on the new
1881  * multiplication factor, we decrease in steps of 10% per second
1882  */
1883 void
1884 Backup::calculate_disk_write_speed(Signal *signal)
1885 {
1886   if (!m_our_node_started && !m_first_lcp_started)
1887   {
1888     /* No adaptiveness while we're still starting. */
1889     jam();
1890     return;
1891   }
1892   Uint64 max_disk_write_speed;
1893   Uint64 max_backup_disk_write_speed;
1894   Uint64 min_disk_write_speed;
1895   jamEntry();
1896   calculate_current_speed_bounds(max_disk_write_speed,
1897                                  max_backup_disk_write_speed,
1898                                  min_disk_write_speed);
1899 
1900   /* Get CPU usage for this LDM thread. */
1902   EXECUTE_DIRECT_MT(THRMAN, GSN_GET_CPU_USAGE_REQ, signal,
1903                     1,
1904                     getThrmanInstance());
1905   Uint32 cpu_usage = signal->theData[0];
1906 
1907   /**
1908    * It is possible that the limits (max + min) have moved so that
1909    * the current speed is now outside them, if so we immediately
1910    * track to the relevant limit.
1911    * In these cases, the data collected for the last period regarding
1912    * redo log etc will not be relevant here.
1913    */
1914   bool ret_flag = false;
1915   if (m_curr_disk_write_speed < min_disk_write_speed)
1916   {
1917     jam();
1918     m_curr_disk_write_speed = min_disk_write_speed;
1919     DEB_REDO_CONTROL(("(%u)1:Current disk write speed is %llu kB/sec",
1920                       instance(),
1921                       ((m_curr_disk_write_speed *
1922                         CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS) /
1923                        Uint64(1024))
1924                       ));
1925     ret_flag = true;
1926   }
1927   else if (m_curr_disk_write_speed > max_disk_write_speed)
1928   {
1929     jam();
1930     m_curr_disk_write_speed = max_disk_write_speed;
1931     DEB_REDO_CONTROL(("(%u)2:Current disk write speed is %llu kB/sec",
1932                       instance(),
1933                       ((m_curr_disk_write_speed *
1934                         CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS) /
1935                        Uint64(1024))
1936                       ));
1937     ret_flag = true;
1938   }
1939   if (m_curr_backup_disk_write_speed > max_backup_disk_write_speed)
1940   {
1941     jam();
1942     DEB_REDO_CONTROL(("(%u)Current backup disk write speed is %llu kB/sec",
1943                       instance(),
1944                       ((m_curr_backup_disk_write_speed *
1945                         CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS) /
1946                        Uint64(1024))
1947                       ));
1948     m_curr_backup_disk_write_speed = max_backup_disk_write_speed;
1949   }
1950   if (ret_flag)
1951   {
1952     jam();
1953     debug_report_redo_control(cpu_usage);
1954     return;
1955   }
1956 
1957 
1958   /**
1959    * Current speed is within bounds, now consider whether to adjust
1960    * based on feedback.
1961    *
1962    * Calculate the max - min and divide by 6 to get the adjustment parameter
1963    * which is 16% of max - min. We will never adjust faster than this to avoid
1964    * too quick adaptiveness. For adjustments down we will adapt faster for IO
1965    * lags, for CPU speed we will adapt a bit slower dependent on how high
1966    * the CPU load is.
1967    */
1968   int diff_disk_write_speed =
1969     max_disk_write_speed - min_disk_write_speed;
1970 
1971   int adjust_speed_up = diff_disk_write_speed / 6;
1972   int adjust_speed_up_high = diff_disk_write_speed / 3;
1973   int adjust_speed_down_high = diff_disk_write_speed / 5;
1974   int adjust_speed_down_medium = diff_disk_write_speed / 8;
1975   int adjust_speed_down_low = diff_disk_write_speed / 12;
1976 
1977   jam();
1978   if (diff_disk_write_speed <= 0 || adjust_speed_up == 0)
1979   {
1980     jam();
1981     /**
1982      * Either min == max, which gives no room to adapt the LCP speed,
1983      * or the difference is too small to adapt it.
1984      *
1985      * If min == max for total we will treat backup the same way.
1986      */
1987     DEB_REDO_CONTROL(("(%u)3:Current disk write speed is %llu kB/sec",
1988                       instance(),
1989                       ((m_curr_disk_write_speed *
1990                         CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS) /
1991                        Uint64(1024))
1992                       ));
1993     debug_report_redo_control(cpu_usage);
1994     return;
1995   }
1996   if (c_lqh->is_ldm_instance_io_lagging())
1997   {
1998     /**
1999      * With IO lagging behind we will decrease the LCP speed to accommodate
2000      * for more REDO logging bandwidth. The definition of REDO log IO lagging
2001      * is kept in DBLQH, but will be a number of seconds of outstanding REDO
2002      * IO requests that LQH is still waiting for completion of.
2003      * This is a harder condition, so here we will immediately slow down fast.
2004      */
2005     jam();
2006     slowdowns_due_to_io_lag++;
2007     adjust_disk_write_speed_down(m_curr_disk_write_speed,
2008                                  disk_write_speed_set_to_min,
2009                                  min_disk_write_speed,
2010                                  adjust_speed_down_high);
2011     adjust_disk_write_speed_down(m_curr_backup_disk_write_speed,
2012                                  backup_disk_write_speed_set_to_min,
2013                                  min_disk_write_speed,
2014                                  adjust_speed_down_high);
2015   }
2016   else
2017   {
2018     /**
2019      * Get CPU usage of this LDM thread during last second.
2020      * If CPU usage is over or equal to 95% we will decrease the LCP speed
2021      * If CPU usage is below 90% we will increase the LCP speed
2022      * one more step. Otherwise we will keep it where it currently is.
2023      *
2024      * We will not slow down checkpointing due to high CPU when the REDO log
2025      * is close to becoming exhausted. This should protect it from becoming
2026      * full.
2027      *
2028      * The speed of writing backups and LCPs is fairly linear in the
2029      * amount of bytes written. So e.g. writing 10 MByte/second gives
2030      * roughly 10% CPU usage on one CPU. So by writing less we have a
2031      * more or less linear decrease of CPU usage. Naturally the speed of
2032      * writing is very much coupled to the CPU speed. CPUs today have all
2033      * sorts of power save magic, but this algorithm doesn't kick in until
2034      * we're at very high CPU loads where we won't be in power save mode.
2035      * Obviously it also works in the opposite direction that we can easily
2036      * speed up things when the CPU is less used.
2037      *
2038      * One complication of this algorithm is that we only measure the thread
2039      * CPU usage, so we don't really know here the level of CPU usage in total
2040      * of the system. Getting this information is quite complex and can
2041      * quickly change if the user is also using the machine for many other
2042      * things. In this case the algorithm will simply go up to the current
2043      * maximum value. So it will work much the same as before this algorithm
2044      * was put in place with the maximum value as the new DiskCheckpointSpeed
2045      * parameter.
2046      *
2047      * The algorithm will work best in cases where the user has locked the
2048      * thread to one or more CPUs and ensures that the thread can always run
2049      * by not allocating more than one thread per CPU.
2050      *
2051      * The reason we put the CPU usage limits fairly high is that the LDM
2052      * threads become more and more efficient as load goes up. The reason
2053      * for this is that more and more signals are executed in each loop
2054      * before checking for new signals. This means that as load goes up we
2055      * spend more and more time doing useful work. At low loads we spend a
2056      * significant time simply waiting for new signals to arrive and going to
2057      * sleep and waking up. So being at 95% load still means that we have
2058      * a bit more than 5% capacity left and even being at 90% means we
2059      * might have as much as 20% more capacity to use.
2060      */
2061     jam();
2062     bool adjust_disk_speed = true;
2063     bool adjust_backup_disk_speed = true;
2064     if (m_redo_alert_state >= RedoStateRep::REDO_ALERT_LOW)
2065     {
2066       /**
2067        * We are in a critical or high state for our REDO log, we must ensure
2068        * that we step up to use more and more CPU for checkpoints as long as
2069        * we don't oversubscribe the IO subsystem. This is why we check for
2070        * IO lag slowdown before we come here. The IO lag will still slow
2071        * down the checkpoint speed. CPU usage will not slow down checkpoint
2072        * processing.
2073        */
2074       jam();
2075       adjust_disk_speed = false;
2076       adjust_disk_write_speed_up(m_curr_disk_write_speed,
2077                                  max_disk_write_speed,
2078                                  adjust_speed_up_high);
2079     }
2080     else if (!m_our_node_started)
2081     {
2082       adjust_disk_speed = false;
2083       adjust_backup_disk_speed = false;
2084       /**
2085        * We are not in a critical state of the REDO log and we are
2086        * executing a node restart. We will allow for more CPU usage
2087        * in this state, but we will still slow down checkpoints when
2088        * the CPU becomes overloaded.
2089        */
2090       if (cpu_usage < 99)
2091       {
2092         jam();
2093         /* 0-98% load, speed up */
2094         adjust_disk_write_speed_up(m_curr_disk_write_speed,
2095                                    max_disk_write_speed,
2096                                    adjust_speed_up);
2097       }
2098       else if (cpu_usage < 100)
2099       {
2100         jam();
2101         /* 99% load, slow down */
2102         slowdowns_due_to_high_cpu++;
2103         adjust_disk_write_speed_down(m_curr_disk_write_speed,
2104                                      disk_write_speed_set_to_min,
2105                                      min_disk_write_speed,
2106                                      adjust_speed_down_low);
2107       }
2108       else
2109       {
2110         /* 100% load, slow down a bit faster */
2111         jam();
2112         slowdowns_due_to_high_cpu++;
2113         adjust_disk_write_speed_down(m_curr_disk_write_speed,
2114                                      disk_write_speed_set_to_min,
2115                                      min_disk_write_speed,
2116                                      adjust_speed_down_medium);
2117       }
2118     }
2119     if (cpu_usage < 90)
2120     {
2121       jamEntry();
2122       if (adjust_disk_speed)
2123       {
2124         adjust_disk_write_speed_up(m_curr_disk_write_speed,
2125                                    max_disk_write_speed,
2126                                    adjust_speed_up);
2127       }
2128       if (adjust_backup_disk_speed)
2129       {
2130         adjust_disk_write_speed_up(m_curr_backup_disk_write_speed,
2131                                    max_backup_disk_write_speed,
2132                                    adjust_speed_up);
2133       }
2134     }
2135     else if (cpu_usage < 95)
2136     {
2137       jam();
2138     }
2139     else if (cpu_usage < 97)
2140     {
2141       jam();
2142       /* 95-96% load, slightly slow down */
2143       if (adjust_disk_speed)
2144       {
2145         slowdowns_due_to_high_cpu++;
2146         adjust_disk_write_speed_down(m_curr_disk_write_speed,
2147                                      disk_write_speed_set_to_min,
2148                                      min_disk_write_speed,
2149                                      adjust_speed_down_low);
2150       }
2151       if (adjust_backup_disk_speed)
2152       {
2153         slowdown_backups_due_to_high_cpu++;
2154         adjust_disk_write_speed_down(m_curr_backup_disk_write_speed,
2155                                      backup_disk_write_speed_set_to_min,
2156                                      min_disk_write_speed,
2157                                      adjust_speed_down_low);
2158       }
2159     }
2160     else if (cpu_usage < 99)
2161     {
2162       jamEntry();
2163       /* 97-98% load, slow down */
2164       if (adjust_disk_speed)
2165       {
2166         slowdowns_due_to_high_cpu++;
2167         adjust_disk_write_speed_down(m_curr_disk_write_speed,
2168                                      disk_write_speed_set_to_min,
2169                                      min_disk_write_speed,
2170                                      adjust_speed_down_medium);
2171       }
2172       if (adjust_backup_disk_speed)
2173       {
2174         slowdown_backups_due_to_high_cpu++;
2175         adjust_disk_write_speed_down(m_curr_backup_disk_write_speed,
2176                                      backup_disk_write_speed_set_to_min,
2177                                      min_disk_write_speed,
2178                                      adjust_speed_down_medium);
2179       }
2180     }
2181     else
2182     {
2183       jamEntry();
2184       /* 99-100% load, slow down a bit faster */
2185       if (adjust_disk_speed)
2186       {
2187         slowdowns_due_to_high_cpu++;
2188         adjust_disk_write_speed_down(m_curr_disk_write_speed,
2189                                      disk_write_speed_set_to_min,
2190                                      min_disk_write_speed,
2191                                      adjust_speed_down_high);
2192       }
2193       if (adjust_backup_disk_speed)
2194       {
2195         slowdown_backups_due_to_high_cpu++;
2196         adjust_disk_write_speed_down(m_curr_backup_disk_write_speed,
2197                                      backup_disk_write_speed_set_to_min,
2198                                      min_disk_write_speed,
2199                                      adjust_speed_down_high);
2200       }
2201     }
2202   }
2203   debug_report_redo_control(cpu_usage);
2204 }
2205 
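/**
 * Summary (editor's note) of the CPU feedback ladder implemented above
 * for the normal case (node started, no REDO alert, no IO lag):
 *
 *   cpu_usage <  90 : speed up   (adjust_speed_up)
 *   90 - 94         : keep the current speed
 *   95 - 96         : slow down  (adjust_speed_down_low)
 *   97 - 98         : slow down  (adjust_speed_down_medium)
 *   99 - 100        : slow down  (adjust_speed_down_high)
 *
 * IO lag always forces adjust_speed_down_high for both LCP and backup
 * speeds, a REDO alert forces adjust_speed_up_high for the LCP speed
 * regardless of CPU usage, and a node still starting uses the softer
 * ladder coded in the !m_our_node_started branch.
 */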
2206 void
2207 Backup::send_next_reset_disk_speed_counter(Signal *signal)
2208 {
2209   signal->theData[0] = BackupContinueB::RESET_DISK_SPEED_COUNTER;
2210   sendSignalWithDelay(reference(),
2211                       GSN_CONTINUEB,
2212                       signal,
2213                       m_reset_delay_used,
2214                       1);
2215   return;
2216 }
2217 
2218 void
2219 Backup::execCHECK_NODE_RESTARTCONF(Signal *signal)
2220 {
2221   bool old_is_backup_running = m_is_backup_running;
2222   bool old_is_any_node_restarting = m_is_any_node_restarting;
2223   m_is_lcp_running = (signal->theData[0] == 1);
2224   m_is_backup_running = g_is_single_thr_backup_running;  /* Global from backup instance */
2225   m_is_any_node_restarting = (signal->theData[1] == 1);
2226   const char* backup_text=NULL;
2227   const char* restart_text=NULL;
2228 
2229   /* No logging of LCP start/stop w.r.t. Disk Speed */
2230   if (old_is_backup_running != m_is_backup_running)
2231   {
2232     if (old_is_backup_running)
2233     {
2234       backup_text=" Backup completed";
2235     }
2236     else
2237     {
2238       backup_text=" Backup started";
2239     }
2240   }
2241   if (old_is_any_node_restarting != m_is_any_node_restarting)
2242   {
2243     if (old_is_any_node_restarting)
2244     {
2245       restart_text=" Node restart finished";
2246     }
2247     else
2248     {
2249       restart_text=" Node restart ongoing";
2250     }
2251   }
2252 
2253   if (is_backup_worker())
2254   {
2255     /* Just have one LDM log the transition */
2256     if (backup_text || restart_text)
2257     {
2258       g_eventLogger->info("Adjusting disk write speed bounds due to :%s%s",
2259                           (backup_text ? backup_text : ""),
2260                           (restart_text ? restart_text : ""));
2261     }
2262   }
2263 }
2264 
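/**
 * Note (editor's sketch): CHECK_NODE_RESTARTCONF, as read above, carries
 * two words: theData[0] == 1 when an LCP is running and theData[1] == 1
 * when any node restart is ongoing. The m_is_backup_running flag is not
 * taken from the signal; it comes from the block-local global
 * g_is_single_thr_backup_running set by the backup instance.
 */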
2265 void
2266 Backup::execCONTINUEB(Signal* signal)
2267 {
2268   jamEntry();
2269   const Uint32 Tdata0 = signal->theData[0];
2270   const Uint32 Tdata1 = signal->theData[1];
2271   const Uint32 Tdata2 = signal->theData[2];
2272   const Uint32 Tdata3 = signal->theData[3];
2273 
2274   switch(Tdata0) {
2275   case BackupContinueB::RESET_DISK_SPEED_COUNTER:
2276   {
2277     jam();
2278     const NDB_TICKS curr_time = NdbTick_getCurrentTicks();
2279     const Uint64 millisPassed =
2280       NdbTick_Elapsed(m_monitor_snapshot_start,curr_time).milliSec();
2281     if (millisPassed >= 800 && !m_node_restart_check_sent)
2282     {
2283       /**
2284        * Check for node restart ongoing, we will check for it and use
2285        * the cached copy of the node restart state when deciding on the
2286        * disk checkpoint speed. We will start this check a few intervals
2287        * before calculating the new disk checkpoint speed. We will send
2288        * such a check once per interval we are changing disk checkpoint
2289        * speed.
2290        *
2291       * So we call DIH asynchronously here after 800 ms have passed, so
2292       * that when 1000 ms have passed and we check disk speeds, we
2293       * have information about whether a node restart is ongoing or not.
2294        * This information will only affect disk write speed, so it's not
2295        * a problem to rely on up to 200ms old information.
2296        */
2297       jam();
2298       m_node_restart_check_sent = true;
2299       signal->theData[0] = reference();
2300       sendSignal(DBDIH_REF, GSN_CHECK_NODE_RESTARTREQ, signal, 1, JBB);
2301     }
2302     /**
2303      * We check for millis passed larger than 989 to handle the situation
2304      * when we wake up slightly too early. Since we only wake up once every
2305      * 100 millisecond, this should be better than occasionally get intervals
2306      * of 1100 milliseconds. All the calculations take the real interval into
2307      * account, so it should not corrupt any data.
2308      */
2309     if (millisPassed > 989)
2310     {
2311       jam();
2312       m_node_restart_check_sent = false;
2313       monitor_disk_write_speed(curr_time, millisPassed);
2314       measure_change_speed(signal, Uint64(millisPassed));
2315       calculate_disk_write_speed(signal);
2316       c_pgman->set_current_disk_write_speed(m_curr_disk_write_speed *
2317                     Uint64(CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS));
2318     }
2319     handle_overflow(m_overflow_disk_write,
2320                     m_words_written_this_period,
2321                     m_curr_disk_write_speed);
2322     handle_overflow(m_backup_overflow_disk_write,
2323                     m_backup_words_written_this_period,
2324                     m_curr_backup_disk_write_speed);
2325     calculate_next_delay(curr_time);
2326     send_next_reset_disk_speed_counter(signal);
2327     break;
2328   }
2329   case BackupContinueB::BACKUP_FRAGMENT_INFO:
2330   {
2331     jam();
2332     const Uint32 ptr_I = Tdata1;
2333     Uint32 tabPtr_I = Tdata2;
2334     Uint32 fragPtr_I = signal->theData[3];
2335 
2336     BackupRecordPtr ptr;
2337     c_backupPool.getPtr(ptr, ptr_I);
2338     TablePtr tabPtr;
2339     ptr.p->tables.getPtr(tabPtr, tabPtr_I);
2340 
2341     if (fragPtr_I != tabPtr.p->fragments.getSize())
2342     {
2343       jam();
2344       FragmentPtr fragPtr;
2345       tabPtr.p->fragments.getPtr(fragPtr, fragPtr_I);
2346 
2347       BackupFilePtr filePtr;
2348       ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr);
2349 
2350       const Uint32 sz = sizeof(BackupFormat::CtlFile::FragmentInfo) >> 2;
2351       Uint32 * dst;
2352       if (!filePtr.p->operation.dataBuffer.getWritePtr(&dst, sz))
2353       {
2354 	sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
2355                             WaitDiskBufferCapacityMillis, 4);
2356 	return;
2357       }
2358 
2359       BackupFormat::CtlFile::FragmentInfo * fragInfo =
2360 	(BackupFormat::CtlFile::FragmentInfo*)dst;
2361       fragInfo->SectionType = htonl(BackupFormat::FRAGMENT_INFO);
2362       fragInfo->SectionLength = htonl(sz);
2363       fragInfo->TableId = htonl(fragPtr.p->tableId);
2364       fragInfo->FragmentNo = htonl(fragPtr_I);
2365       fragInfo->NoOfRecordsLow = htonl((Uint32)(fragPtr.p->noOfRecords & 0xFFFFFFFF));
2366       fragInfo->NoOfRecordsHigh = htonl((Uint32)(fragPtr.p->noOfRecords >> 32));
2367       fragInfo->FilePosLow = htonl(0);
2368       fragInfo->FilePosHigh = htonl(0);
2369 
2370       filePtr.p->operation.dataBuffer.updateWritePtr(sz);
2371 
2372       fragPtr_I++;
2373     }
2374 
2375     if (fragPtr_I == tabPtr.p->fragments.getSize())
2376     {
2377       BackupLockTab *req = (BackupLockTab *)signal->getDataPtrSend();
2378       req->m_senderRef = reference();
2379       req->m_tableId = tabPtr.p->tableId;
2380       req->m_lock_unlock = BackupLockTab::UNLOCK_TABLE;
2381       req->m_backup_state = BackupLockTab::BACKUP_FRAGMENT_INFO;
2382       req->m_backupRecordPtr_I = ptr_I;
2383       req->m_tablePtr_I = tabPtr_I;
2384       sendSignal(DBDICT_REF, GSN_BACKUP_LOCK_TAB_REQ, signal,
2385                  BackupLockTab::SignalLength, JBB);
2386       return;
2387     }
2388 
2389     signal->theData[0] = BackupContinueB::BACKUP_FRAGMENT_INFO;
2390     signal->theData[1] = ptr_I;
2391     signal->theData[2] = tabPtr_I;
2392     signal->theData[3] = fragPtr_I;
2393     sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
2394     return;
2395   }
2396   case BackupContinueB::START_FILE_THREAD:
2397   case BackupContinueB::BUFFER_UNDERFLOW:
2398   {
2399     jam();
2400     BackupFilePtr filePtr;
2401     c_backupFilePool.getPtr(filePtr, Tdata1);
2402     checkFile(signal, filePtr);
2403     return;
2404   }
2405   case BackupContinueB::BUFFER_FULL_SCAN:
2406   {
2407     jam();
2408     BackupFilePtr filePtr;
2409     BackupRecordPtr ptr;
2410     c_backupFilePool.getPtr(filePtr, Tdata1);
2411     c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
2412     /**
2413      * Given that we've been waiting a few milliseconds for buffers to become
2414      * free, we need to initialise the priority mode algorithm to ensure that
2415      * we select the correct priority mode.
2416      *
2417      * We get the number of jobs waiting at B-level to assess the current
2418      * activity level to get a new starting point of the algorithm.
2419      * We ignore any load level below 16 signals in the buffer; at a
2420      * higher level we provide a value that will ensure that we most likely
2421      * will start at A-level.
2422      */
2423     init_scan_prio_level(signal, ptr);
2424     checkScan(signal, ptr, filePtr, true);
2425     return;
2426   }
2427   break;
2428   case BackupContinueB::BUFFER_FULL_FRAG_COMPLETE:
2429   {
2430     jam();
2431     BackupFilePtr filePtr;
2432     c_backupFilePool.getPtr(filePtr, Tdata1);
2433     fragmentCompleted(signal, filePtr, Tdata2);
2434     return;
2435   }
2436   break;
2437   case BackupContinueB::BUFFER_FULL_META:
2438   {
2439     jam();
2440     BackupRecordPtr ptr;
2441     c_backupPool.getPtr(ptr, Tdata1);
2442 
2443     BackupFilePtr filePtr;
2444 
2445     if (ptr.p->is_lcp())
2446     {
2447       jam();
2448       ptr.p->files.getPtr(filePtr, Tdata3);
2449     }
2450     else
2451     {
2452       jam();
2453       ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr);
2454     }
2455     FsBuffer & buf = filePtr.p->operation.dataBuffer;
2456 
2457     if(buf.getFreeSize() < buf.getMaxWrite()) {
2458       jam();
2459       TablePtr tabPtr;
2460       c_tablePool.getPtr(tabPtr, Tdata2);
2461 
2462       DEBUG_OUT("Backup - Buffer full - "
2463                 << buf.getFreeSize()
2464 		<< " < " << buf.getMaxWrite()
2465                 << " (sz: " << buf.getUsableSize()
2466                 << " getMinRead: " << buf.getMinRead()
2467 		<< ") - tableId = " << tabPtr.p->tableId);
2468 
2469       signal->theData[0] = BackupContinueB::BUFFER_FULL_META;
2470       signal->theData[1] = Tdata1;
2471       signal->theData[2] = Tdata2;
2472       signal->theData[3] = Tdata3;
2473       sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
2474                           WaitDiskBufferCapacityMillis, 4);
2475       return;
2476     }//if
2477 
2478     TablePtr tabPtr;
2479     c_tablePool.getPtr(tabPtr, Tdata2);
2480     GetTabInfoReq * req = (GetTabInfoReq *)signal->getDataPtrSend();
2481     req->senderRef = reference();
2482     req->senderData = filePtr.i;
2483     req->requestType = GetTabInfoReq::RequestById |
2484       GetTabInfoReq::LongSignalConf;
2485     req->tableId = tabPtr.p->tableId;
2486     req->schemaTransId = 0;
2487     sendSignal(DBDICT_REF, GSN_GET_TABINFOREQ, signal,
2488 	       GetTabInfoReq::SignalLength, JBB);
2489     return;
2490   }
2491   case BackupContinueB::ZGET_NEXT_FRAGMENT:
2492   {
2493     BackupRecordPtr backupPtr;
2494     TablePtr tabPtr;
2495     Uint32 fragNo = signal->theData[3];
2496     c_backupPool.getPtr(backupPtr, signal->theData[1]);
2497     ndbrequire(findTable(backupPtr, tabPtr, signal->theData[2]));
2498     getFragmentInfo(signal, backupPtr, tabPtr, fragNo);
2499     return;
2500   }
2501   case BackupContinueB::ZDELETE_LCP_FILE:
2502   {
2503     jam();
2504     delete_lcp_file_processing(signal);
2505     return;
2506   }
2507   default:
2508     ndbabort();
2509   }//switch
2510 }
2511 
2512 void
2513 Backup::execBACKUP_LOCK_TAB_CONF(Signal *signal)
2514 {
2515   jamEntry();
2516   const BackupLockTab *conf = (const BackupLockTab *)signal->getDataPtr();
2517   BackupRecordPtr ptr;
2518   c_backupPool.getPtr(ptr, conf->m_backupRecordPtr_I);
2519   TablePtr tabPtr;
2520   ptr.p->tables.getPtr(tabPtr, conf->m_tablePtr_I);
2521 
2522   switch(conf->m_backup_state) {
2523   case BackupLockTab::BACKUP_FRAGMENT_INFO:
2524   {
2525     jam();
2526     ptr.p->tables.next(tabPtr);
2527     if (tabPtr.i == RNIL)
2528     {
2529       jam();
2530       closeFiles(signal, ptr);
2531       return;
2532     }
2533 
2534     signal->theData[0] = BackupContinueB::BACKUP_FRAGMENT_INFO;
2535     signal->theData[1] = ptr.i;
2536     signal->theData[2] = tabPtr.i;
2537     signal->theData[3] = 0;       // Start from first fragment of next table
2538     sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
2539     return;
2540   }
2541   case BackupLockTab::GET_TABINFO_CONF:
2542   {
2543     jam();
2544     if (conf->errorCode)
2545     {
2546       jam();
2547       defineBackupRef(signal, ptr, conf->errorCode);
2548       return;
2549     }
2550 
2551     ptr.p->tables.next(tabPtr);
2552     afterGetTabinfoLockTab(signal, ptr, tabPtr);
2553     return;
2554   }
2555   case BackupLockTab::CLEANUP:
2556   {
2557     jam();
2558     ptr.p->tables.next(tabPtr);
2559     cleanupNextTable(signal, ptr, tabPtr);
2560     return;
2561   }
2562   default:
2563     ndbabort();
2564   }
2565 }
2566 
2567 void
2568 Backup::execBACKUP_LOCK_TAB_REF(Signal *signal)
2569 {
2570   jamEntry();
2571   ndbabort(); /* Not currently possible. */
2572 }
2573 
2574 Uint64 Backup::get_new_speed_val64(Signal *signal)
2575 {
2576   if (signal->length() == 3)
2577   {
2578     jam();
2579     Uint64 val = Uint64(signal->theData[1]);
2580     val <<= 32;
2581     val += Uint64(signal->theData[2]);
2582     return val;
2583   }
2584   else
2585   {
2586     jam();
2587     return 0;
2588   }
2589 }
2590 
2591 Uint64 Backup::get_new_speed_val32(Signal *signal)
2592 {
2593   if (signal->length() == 2)
2594   {
2595     jam();
2596     return Uint64(signal->theData[1]);
2597   }
2598   else
2599   {
2600     jam();
2601     return 0;
2602   }
2603 }
2604 
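/**
 * Illustration (editor's sketch): the 64-bit DUMP variants receive the
 * new speed as two 32-bit signal words, most significant word first.
 * A client would encode a (hypothetical) value of 6 GB/sec as:
 *
 *   Uint64 val = Uint64(6) * 1024 * 1024 * 1024;
 *   Uint32 msb = Uint32(val >> 32);          // becomes theData[1]
 *   Uint32 lsb = Uint32(val & 0xFFFFFFFF);   // becomes theData[2]
 *
 * get_new_speed_val64() above reassembles val = (msb << 32) + lsb, and
 * an unexpected signal length yields 0, which the handlers below reject
 * via the 1 MB minimum check.
 */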
2605 void
2606 Backup::execDUMP_STATE_ORD(Signal* signal)
2607 {
2608   jamEntry();
2609 
2610   /* Dump commands used in public interfaces */
2611   switch (signal->theData[0]) {
2612   case DumpStateOrd::BackupStatus:
2613   {
2614     /* See code in BackupProxy.cpp as well */
2615     BlockReference result_ref = CMVMI_REF;
2616     if (signal->length() == 2)
2617       result_ref = signal->theData[1];
2618 
2619     BackupRecordPtr ptr;
2620     get_backup_record(ptr);
2621     reportStatus(signal, ptr, result_ref);
2622     return;
2623   }
2624   case DumpStateOrd::BackupMinWriteSpeed32:
2625   {
2626     jam();
2627     Uint64 new_val = get_new_speed_val32(signal);
2628     if (new_val < Uint64(1024*1024))
2629     {
2630       jam();
2631       g_eventLogger->info("Use: DUMP 100001 MinDiskWriteSpeed");
2632       return;
2633     }
2634     restore_disk_write_speed_numbers();
2635     c_defaults.m_disk_write_speed_min = new_val;
2636     calculate_real_disk_write_speed_parameters();
2637     return;
2638   }
2639   case DumpStateOrd::BackupMaxWriteSpeed32:
2640   {
2641     jam();
2642     Uint64 new_val = get_new_speed_val32(signal);
2643     if (new_val < Uint64(1024*1024))
2644     {
2645       jam();
2646       g_eventLogger->info("Use: DUMP 100002 MaxDiskWriteSpeed");
2647       return;
2648     }
2649     restore_disk_write_speed_numbers();
2650     c_defaults.m_disk_write_speed_max = new_val;
2651     calculate_real_disk_write_speed_parameters();
2652     return;
2653   }
2654   case DumpStateOrd::BackupMaxWriteSpeedOtherNodeRestart32:
2655   {
2656     jam();
2657     Uint64 new_val = get_new_speed_val32(signal);
2658     if (new_val < Uint64(1024*1024))
2659     {
2660       jam();
2661       g_eventLogger->info("Use: DUMP 100003 MaxDiskWriteSpeedOtherNodeRestart");
2662       return;
2663     }
2664     restore_disk_write_speed_numbers();
2665     c_defaults.m_disk_write_speed_max_other_node_restart = new_val;
2666     calculate_real_disk_write_speed_parameters();
2667     return;
2668   }
2669   case DumpStateOrd::BackupMinWriteSpeed64:
2670   {
2671     jam();
2672     Uint64 new_val = get_new_speed_val64(signal);
2673     if (new_val < Uint64(1024*1024))
2674     {
2675       jam();
2676       g_eventLogger->info("Use: DUMP 100004 MinDiskWriteSpeed(MSB) "
2677                           "MinDiskWriteSpeed(LSB)");
2678       return;
2679     }
2680     restore_disk_write_speed_numbers();
2681     c_defaults.m_disk_write_speed_min = new_val;
2682     calculate_real_disk_write_speed_parameters();
2683     return;
2684   }
2685   case DumpStateOrd::BackupMaxWriteSpeed64:
2686   {
2687     jam();
2688     Uint64 new_val = get_new_speed_val64(signal);
2689     if (new_val < Uint64(1024*1024))
2690     {
2691       jam();
2692       g_eventLogger->info("Use: DUMP 100005 MaxDiskWriteSpeed(MSB) "
2693                           "MaxDiskWriteSpeed(LSB)");
2694       return;
2695     }
2696     restore_disk_write_speed_numbers();
2697     c_defaults.m_disk_write_speed_max = new_val;
2698     calculate_real_disk_write_speed_parameters();
2699     return;
2700   }
2701   case DumpStateOrd::BackupMaxWriteSpeedOtherNodeRestart64:
2702   {
2703     jam();
2704     Uint64 new_val = get_new_speed_val64(signal);
2705     if (new_val < Uint64(1024*1024))
2706     {
2707       jam();
2708       g_eventLogger->info("Use: DUMP 100006"
2709                           " MaxDiskWriteSpeedOtherNodeRestart(MSB)"
2710                           " MaxDiskWriteSpeedOtherNodeRestart(LSB)");
2711       return;
2712     }
2713     restore_disk_write_speed_numbers();
2714     c_defaults.m_disk_write_speed_max_other_node_restart = new_val;
2715     calculate_real_disk_write_speed_parameters();
2716     return;
2717   }
2718   default:
2719     /* continue to debug section */
2720     break;
2721   }
2722 
2723   /* Debugging or unclassified section */
2724 
2725   if(signal->theData[0] == 20){
2726     if(signal->length() > 1){
2727       c_defaults.m_dataBufferSize = (signal->theData[1] * 1024 * 1024);
2728     }
2729     if(signal->length() > 2){
2730       c_defaults.m_logBufferSize = (signal->theData[2] * 1024 * 1024);
2731     }
2732     if(signal->length() > 3){
2733       c_defaults.m_minWriteSize = signal->theData[3] * 1024;
2734     }
2735     if(signal->length() > 4){
2736       c_defaults.m_maxWriteSize = signal->theData[4] * 1024;
2737     }
2738 
2739     infoEvent("Backup: data: %d log: %d min: %d max: %d",
2740 	      c_defaults.m_dataBufferSize,
2741 	      c_defaults.m_logBufferSize,
2742 	      c_defaults.m_minWriteSize,
2743 	      c_defaults.m_maxWriteSize);
2744     return;
2745   }
2746   if(signal->theData[0] == 21){
2747     BackupReq * req = (BackupReq*)signal->getDataPtrSend();
2748     req->senderData = 23;
2749     req->backupDataLen = 0;
2750     sendSignal(reference(), GSN_BACKUP_REQ,signal,BackupReq::SignalLength, JBB);
2751     startTime = NdbTick_getCurrentTicks();
2752     return;
2753   }
2754 
2755   if(signal->theData[0] == 22){
2756     const Uint32 seq = signal->theData[1];
2757     FsRemoveReq * req = (FsRemoveReq *)signal->getDataPtrSend();
2758     req->userReference = reference();
2759     req->userPointer = 23;
2760     req->directory = 1;
2761     req->ownDirectory = 1;
2762     FsOpenReq::setVersion(req->fileNumber, 2);
2763     FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_CTL);
2764     FsOpenReq::v2_setSequence(req->fileNumber, seq);
2765     FsOpenReq::v2_setNodeId(req->fileNumber, getOwnNodeId());
2766     sendSignal(NDBFS_REF, GSN_FSREMOVEREQ, signal,
2767 	       FsRemoveReq::SignalLength, JBA);
2768     return;
2769   }
2770 
2771   if(signal->theData[0] == 23){
2772     /**
2773      * Print records
2774      */
2775     BackupRecordPtr ptr;
2776     for(c_backups.first(ptr); ptr.i != RNIL; c_backups.next(ptr)){
2777       infoEvent("BackupRecord %d: BackupId: %u MasterRef: %x ClientRef: %x",
2778 		ptr.i, ptr.p->backupId, ptr.p->masterRef, ptr.p->clientRef);
2779       infoEvent(" State: %d", ptr.p->slaveState.getState());
2780       BackupFilePtr filePtr;
2781       for(ptr.p->files.first(filePtr); filePtr.i != RNIL;
2782 	  ptr.p->files.next(filePtr)){
2783 	jam();
2784 	infoEvent(" file %d: type: %d flags: H'%x",
2785 		  filePtr.i, filePtr.p->fileType,
2786 		  filePtr.p->m_flags);
2787       }
2788     }
2789 
2790     const NDB_TICKS now = NdbTick_getCurrentTicks();
2791     const Uint64 resetElapsed = NdbTick_Elapsed(m_reset_disk_speed_time,now).milliSec();
2792     const Uint64 millisPassed = NdbTick_Elapsed(m_monitor_snapshot_start,now).milliSec();
2793     /* Dump measured disk write speed since last RESET_DISK_SPEED */
2794     ndbout_c("m_curr_disk_write_speed: %ukb  m_words_written_this_period:"
2795              " %u kwords  m_overflow_disk_write: %u kb",
2796               Uint32(4 * m_curr_disk_write_speed / 1024),
2797               Uint32(m_words_written_this_period / 1024),
2798               Uint32(m_overflow_disk_write / 1024));
2799     ndbout_c("m_backup_curr_disk_write_speed: %ukb  "
2800              "m_backup_words_written_this_period:"
2801              " %u kwords  m_backup_overflow_disk_write: %u kb",
2802               Uint32(4 * m_curr_backup_disk_write_speed / 1024),
2803               Uint32(m_backup_words_written_this_period / 1024),
2804               Uint32(m_backup_overflow_disk_write / 1024));
2805     ndbout_c("m_reset_delay_used: %u  time since last RESET_DISK_SPEED: %llu millis",
2806              m_reset_delay_used, resetElapsed);
2807     /* Dump measured rate since last snapshot start */
2808     Uint64 byteRate = (4000 * m_monitor_words_written) / (millisPassed + 1);
2809     ndbout_c("m_monitor_words_written : %llu, duration : %llu millis, rate :"
2810              " %llu bytes/s : (%u pct of config)",
2811              m_monitor_words_written, millisPassed,
2812              byteRate,
2813              (Uint32) ((100 * byteRate / (4 * 10)) /
2814                        (m_curr_disk_write_speed + 1)));
2815     byteRate = (4000 * m_backup_monitor_words_written) / (millisPassed + 1);
2816     ndbout_c("m_backup_monitor_words_written : %llu, duration : %llu"
2817              " millis, rate :"
2818              " %llu bytes/s : (%u pct of config)",
2819              m_backup_monitor_words_written, millisPassed,
2820              byteRate,
2821              (Uint32) ((100 * byteRate / (4 * 10)) /
2822                        (m_curr_backup_disk_write_speed + 1)));
2823 
2824     for(c_backups.first(ptr); ptr.i != RNIL; c_backups.next(ptr))
2825     {
2826       ndbout_c("BackupRecord %u:  BackupId: %u  MasterRef: %x  ClientRef: %x",
2827                ptr.i, ptr.p->backupId, ptr.p->masterRef, ptr.p->clientRef);
2828       ndbout_c(" State: %u", ptr.p->slaveState.getState());
2829       ndbout_c(" noOfByte: %llu  noOfRecords: %llu",
2830                ptr.p->noOfBytes, ptr.p->noOfRecords);
2831       ndbout_c(" noOfLogBytes: %llu  noOfLogRecords: %llu",
2832                ptr.p->noOfLogBytes, ptr.p->noOfLogRecords);
2833       ndbout_c(" errorCode: %u", ptr.p->errorCode);
2834       BackupFilePtr filePtr;
2835       for(ptr.p->files.first(filePtr); filePtr.i != RNIL;
2836 	  ptr.p->files.next(filePtr))
2837       {
2838 	ndbout_c(" file %u:  type: %u  flags: H'%x  tableId: %u  fragmentId: %u",
2839                  filePtr.i, filePtr.p->fileType, filePtr.p->m_flags,
2840                  filePtr.p->tableId, filePtr.p->fragmentNo);
2841       }
2842       if (ptr.p->slaveState.getState() == SCANNING && ptr.p->dataFilePtr[0] != RNIL)
2843       {
2844         c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[0]);
2845         OperationRecord & op = filePtr.p->operation;
2846         Uint32 *tmp = NULL;
2847         Uint32 sz = 0;
2848         bool eof = FALSE;
2849         bool ready = op.dataBuffer.getReadPtr(&tmp, &sz, &eof);
2850         ndbout_c("ready: %s  eof: %s", ready ? "TRUE" : "FALSE", eof ? "TRUE" : "FALSE");
2851       }
2852     }
2853     return;
2854   }
2855   if(signal->theData[0] == 24){
2856     /**
2857      * Print size of records etc.
2858      */
2859     infoEvent("Backup - dump pool sizes");
2860     infoEvent("BackupPool: %d BackupFilePool: %d TablePool: %d",
2861 	      c_backupPool.getSize(), c_backupFilePool.getSize(),
2862 	      c_tablePool.getSize());
2863     infoEvent("AttrPool: %d TriggerPool: %d FragmentPool: %d",
2864 	      c_backupPool.getSize(), c_backupFilePool.getSize(),
2865 	      c_tablePool.getSize());
2866     infoEvent("PagePool: %d",
2867 	      c_pagePool.getSize());
2868 
2869 
2870     if(signal->getLength() == 2 && signal->theData[1] == 2424)
2871     {
2872       /**
2873        * Handle LCP
2874        */
2875       BackupRecordPtr lcp;
2876       get_lcp_record(lcp);
2877 
2878       ndbrequire(c_backupPool.getSize() == c_backupPool.getNoOfFree() + 1);
2879       ndbrequire(c_tablePool.getSize() == c_tablePool.getNoOfFree() + 2);
2880       ndbrequire(c_fragmentPool.getSize() == c_fragmentPool.getNoOfFree() + 2);
2881       ndbrequire(c_triggerPool.getSize() == c_triggerPool.getNoOfFree());
2882 
2883       ndbrequire(c_backupFilePool.getSize() == (c_backupFilePool.getNoOfFree() +
2884                            (4 + 2 * BackupFormat::NDB_MAX_FILES_PER_LCP)));
2885 
2886       Uint32 file_pages = 0;
2887       BackupFilePtr lcp_file;
2888 
2889       c_backupFilePool.getPtr(lcp_file, lcp.p->prepareCtlFilePtr[0]);
2890       file_pages += lcp_file.p->pages.getSize();
2891 
2892       c_backupFilePool.getPtr(lcp_file, lcp.p->prepareCtlFilePtr[1]);
2893       file_pages += lcp_file.p->pages.getSize();
2894 
2895       for (Uint32 i = 0; i < BackupFormat::NDB_MAX_FILES_PER_LCP; i++)
2896       {
2897         c_backupFilePool.getPtr(lcp_file, lcp.p->dataFilePtr[i]);
2898         file_pages += lcp_file.p->pages.getSize();
2899 
2900         c_backupFilePool.getPtr(lcp_file, lcp.p->prepareDataFilePtr[i]);
2901         file_pages += lcp_file.p->pages.getSize();
2902       }
2903 
2904       c_backupFilePool.getPtr(lcp_file, lcp.p->ctlFilePtr);
2905       file_pages += lcp_file.p->pages.getSize();
2906 
2907       c_backupFilePool.getPtr(lcp_file, lcp.p->deleteFilePtr);
2908       file_pages += lcp_file.p->pages.getSize();
2909 
2910       ndbrequire(c_pagePool.getSize() ==
2911 		 c_pagePool.getNoOfFree() +
2912                  file_pages);
2913     }
2914   }
2915 
2916   if(signal->theData[0] == DumpStateOrd::DumpBackup)
2917   {
2918     /* Display a bunch of stuff about Backup defaults */
2919     infoEvent("Compressed Backup: %d", c_defaults.m_compressed_backup);
2920     infoEvent("Compressed LCP: %d", c_defaults.m_compressed_lcp);
2921   }
2922 
2923   if(signal->theData[0] == DumpStateOrd::DumpBackupSetCompressed)
2924   {
2925     c_defaults.m_compressed_backup= signal->theData[1];
2926     infoEvent("Compressed Backup: %d", c_defaults.m_compressed_backup);
2927   }
2928 
2929   if(signal->theData[0] == DumpStateOrd::DumpBackupSetCompressedLCP)
2930   {
2931     c_defaults.m_compressed_lcp= signal->theData[1];
2932     infoEvent("Compressed LCP: %d", c_defaults.m_compressed_lcp);
2933   }
2934 
2935   if (signal->theData[0] == DumpStateOrd::BackupErrorInsert)
2936   {
2937     if (signal->getLength() == 1)
2938       ndbout_c("BACKUP: setting error %u", signal->theData[1]);
2939     else
2940       ndbout_c("BACKUP: setting error %u, %u",
2941                signal->theData[1], signal->theData[2]);
2942     SET_ERROR_INSERT_VALUE2(signal->theData[1], signal->theData[2]);
2943   }
2944 }
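
/**
 * Usage note (illustrative, not exercised by the block itself): the dump
 * handlers above are normally reached from the management client, e.g.
 *
 *   ndb_mgm> ALL DUMP 23        -- print backup records and their files
 *   ndb_mgm> ALL DUMP 24        -- print backup pool sizes
 *   ndb_mgm> ALL DUMP 24 2424   -- additionally verify LCP resource usage
 *
 * which arrive here as DUMP_STATE_ORD signals with the arguments placed
 * in signal->theData[].
 */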
2945 
2946 /**
2947  * We are using a circular buffer of measurements; to simplify the code we
2948  * use this routine to quickly derive the disk write record from an index
2949  * (how many seconds back we want to check).
2950  */
2951 Uint32
2952 Backup::get_disk_write_speed_record(Uint32 start_index)
2953 {
2954   ndbassert(start_index < DISK_WRITE_SPEED_REPORT_SIZE);
2955   if (next_disk_write_speed_report == last_disk_write_speed_report)
2956   {
2957     /* No speed reports generated yet */
2958     return DISK_WRITE_SPEED_REPORT_SIZE;
2959   }
2960   if (start_index < next_disk_write_speed_report)
2961   {
2962     return (next_disk_write_speed_report - (start_index + 1));
2963   }
2964   else if (last_disk_write_speed_report == 0)
2965   {
2966     /**
2967      * We might still be in the initial phase when not all records
2968      * have been written yet.
2969      */
2970     return DISK_WRITE_SPEED_REPORT_SIZE;
2971   }
2972   else
2973   {
2974     return (DISK_WRITE_SPEED_REPORT_SIZE -
2975             ((start_index + 1) - next_disk_write_speed_report));
2976   }
2977   ndbassert(false);
2978   return 0;
2979 }
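
/**
 * A standalone sketch (hypothetical values, not used by the block) of the
 * index mapping implemented above. With DISK_WRITE_SPEED_REPORT_SIZE == 4
 * and next_disk_write_speed_report == 2, start_index 0..3 maps to records
 * 1, 0, 3, 2: we walk backwards from the most recently completed report
 * and wrap around the end of the array.
 *
 *   Uint32 sketch_map(Uint32 next, Uint32 start_index, Uint32 size)
 *   {
 *     if (start_index < next)
 *       return next - (start_index + 1);        // no wrap needed
 *     return size - ((start_index + 1) - next); // wrap to end of array
 *   }
 */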
2980 
2981 /**
2982  * Calculates the average speed for a number of seconds back.
2983  * Reports the number of milliseconds that actually passed and the
2984  * number of bytes written in this period.
2985  */
2986 void
2987 Backup::calculate_disk_write_speed_seconds_back(Uint32 seconds_back,
2988                                          Uint64 & millis_passed,
2989                                          Uint64 & backup_lcp_bytes_written,
2990                                          Uint64 & backup_bytes_written,
2991                                          Uint64 & redo_bytes_written,
2992                                          bool at_least_one)
2993 {
2994   Uint64 millis_back = (MILLIS_IN_A_SECOND * seconds_back) -
2995     MILLIS_ADJUST_FOR_EARLY_REPORT;
2996   Uint32 start_index = 0;
2997 
2998   ndbassert(seconds_back > 0);
2999 
3000   millis_passed = 0;
3001   backup_lcp_bytes_written = 0;
3002   backup_bytes_written = 0;
3003   redo_bytes_written = 0;
3004   jam();
3005   while (at_least_one ||
3006          (millis_passed < millis_back &&
3007           start_index < DISK_WRITE_SPEED_REPORT_SIZE))
3008   {
3009     jam();
3010     at_least_one = false;
3011     Uint32 disk_write_speed_record = get_disk_write_speed_record(start_index);
3012     if (disk_write_speed_record == DISK_WRITE_SPEED_REPORT_SIZE)
3013       break;
3014     millis_passed +=
3015       disk_write_speed_rep[disk_write_speed_record].millis_passed;
3016     backup_lcp_bytes_written +=
3017       disk_write_speed_rep[disk_write_speed_record].backup_lcp_bytes_written;
3018     backup_bytes_written +=
3019       disk_write_speed_rep[disk_write_speed_record].backup_bytes_written;
3020     redo_bytes_written +=
3021       disk_write_speed_rep[disk_write_speed_record].redo_bytes_written;
3022     start_index++;
3023   }
3024   /**
3025    * Always report at least one millisecond to avoid risk of division
3026    * by zero later on in the code.
3027    */
3028   jam();
3029   if (millis_passed == 0)
3030   {
3031     jam();
3032     millis_passed = 1;
3033   }
3034   return;
3035 }
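
/**
 * Worked example (hypothetical numbers): with seconds_back = 2 and the two
 * most recent reports holding {millis_passed, redo_bytes_written} =
 * {1000, 4 MB} and {1000, 6 MB}, the loop above sums both reports and
 * returns millis_passed = 2000 and redo_bytes_written = 10 MB, from which
 * callers can derive an average rate of 5 MB per second.
 */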
3036 
3037 void
3038 Backup::calculate_std_disk_write_speed_seconds_back(Uint32 seconds_back,
3039                              Uint64 millis_passed_total,
3040                              Uint64 backup_lcp_bytes_written,
3041                              Uint64 backup_bytes_written,
3042                              Uint64 redo_bytes_written,
3043                              Uint64 & std_dev_backup_lcp_in_bytes_per_sec,
3044                              Uint64 & std_dev_backup_in_bytes_per_sec,
3045                              Uint64 & std_dev_redo_in_bytes_per_sec)
3046 {
3047   Uint32 start_index = 0;
3048   Uint64 millis_passed = 0;
3049   Uint64 millis_back = (MILLIS_IN_A_SECOND * seconds_back) -
3050     MILLIS_ADJUST_FOR_EARLY_REPORT;
3051   Uint64 millis_passed_this_period;
3052 
3053   Uint64 avg_backup_lcp_bytes_per_milli;
3054   Uint64 backup_lcp_bytes_written_this_period;
3055   Uint64 avg_backup_lcp_bytes_per_milli_this_period;
3056   long double backup_lcp_temp_sum;
3057   long double backup_lcp_square_sum;
3058 
3059   Uint64 avg_backup_bytes_per_milli;
3060   Uint64 backup_bytes_written_this_period;
3061   Uint64 avg_backup_bytes_per_milli_this_period;
3062   long double backup_temp_sum;
3063   long double backup_square_sum;
3064 
3065   Uint64 avg_redo_bytes_per_milli;
3066   Uint64 redo_bytes_written_this_period;
3067   Uint64 avg_redo_bytes_per_milli_this_period;
3068   long double redo_temp_sum;
3069   long double redo_square_sum;
3070 
3071   ndbassert(seconds_back > 0);
3072   if (millis_passed_total == 0)
3073   {
3074     jam();
3075     std_dev_backup_lcp_in_bytes_per_sec = 0;
3076     std_dev_backup_in_bytes_per_sec = 0;
3077     std_dev_redo_in_bytes_per_sec = 0;
3078     return;
3079   }
3080   avg_backup_lcp_bytes_per_milli = backup_lcp_bytes_written /
3081                                    millis_passed_total;
3082   avg_backup_bytes_per_milli = backup_bytes_written /
3083                                millis_passed_total;
3084   avg_redo_bytes_per_milli = redo_bytes_written / millis_passed_total;
3085   backup_lcp_square_sum = 0;
3086   backup_square_sum = 0;
3087   redo_square_sum = 0;
3088   jam();
3089   while (millis_passed < millis_back &&
3090          start_index < DISK_WRITE_SPEED_REPORT_SIZE)
3091   {
3092     jam();
3093     Uint32 disk_write_speed_record = get_disk_write_speed_record(start_index);
3094     if (disk_write_speed_record == DISK_WRITE_SPEED_REPORT_SIZE)
3095       break;
3096     millis_passed_this_period =
3097       disk_write_speed_rep[disk_write_speed_record].millis_passed;
3098     backup_lcp_bytes_written_this_period =
3099       disk_write_speed_rep[disk_write_speed_record].backup_lcp_bytes_written;
3100     backup_bytes_written_this_period =
3101       disk_write_speed_rep[disk_write_speed_record].backup_bytes_written;
3102     redo_bytes_written_this_period =
3103       disk_write_speed_rep[disk_write_speed_record].redo_bytes_written;
3104     millis_passed += millis_passed_this_period;
3105 
3106     if (millis_passed_this_period != 0)
3107     {
3108       /**
3109        * We use a calculation of standard deviation that first
3110        * calculates the variance. The variance is calculated as the mean
3111        * of the squared differences. To get comparable intervals we
3112        * compute the average per millisecond and then sum over all
3113        * milliseconds. To simplify the calculation we multiply the
3114        * squared diff per milli by the number of millis passed in a
3115        * particular measurement, and divide by the total number of millis
3116        * passed. We divide first to avoid too big numbers. We use long
3117        * double in all calculations to ensure that we don't overflow.
3118        *
3119        * We also take care to avoid divisions by zero in multiple places
3120        * in the code when this table is queried before the first
3121        * measurement has been logged.
3122        *
3123        * The standard deviation is calculated as:
3124        * the sum of (X(i) - E(X)) squared, where X(i) is the average per
3125        * millisecond in this time period and E(X) is the average over the
3126        * entire period. We divide by the number of periods, but since the
3127        * periods aren't all exactly the same length, we divide by
3128        * total_millis / millis_in_this_period instead. Finally we take
3129        * the square root of the sum of those (X(i) - E(X))^2 / #periods
3130        * terms (strictly, the standard deviation should use #periods - 1
3131        * as the divisor). Lastly we need to convert from standard
3132        * deviation per millisecond to standard deviation per second,
3133        * which we do simply by multiplying the result by 1000.
3134        */
3135       jam();
3136       avg_backup_lcp_bytes_per_milli_this_period =
3137         backup_lcp_bytes_written_this_period / millis_passed_this_period;
3138       backup_lcp_temp_sum = (long double)avg_backup_lcp_bytes_per_milli;
3139       backup_lcp_temp_sum -=
3140         (long double)avg_backup_lcp_bytes_per_milli_this_period;
3141       backup_lcp_temp_sum *= backup_lcp_temp_sum;
3142       backup_lcp_temp_sum /= (long double)millis_passed_total;
3143       backup_lcp_temp_sum *= (long double)millis_passed_this_period;
3144       backup_lcp_square_sum += backup_lcp_temp_sum;
3145 
3146       avg_backup_bytes_per_milli_this_period =
3147         backup_bytes_written_this_period / millis_passed_this_period;
3148       backup_temp_sum = (long double)avg_backup_bytes_per_milli;
3149       backup_temp_sum -=
3150         (long double)avg_backup_bytes_per_milli_this_period;
3151       backup_temp_sum *= backup_temp_sum;
3152       backup_temp_sum /= (long double)millis_passed_total;
3153       backup_temp_sum *= (long double)millis_passed_this_period;
3154       backup_square_sum += backup_temp_sum;
3155 
3156       avg_redo_bytes_per_milli_this_period =
3157         redo_bytes_written_this_period / millis_passed_this_period;
3158       redo_temp_sum = (long double)avg_redo_bytes_per_milli;
3159       redo_temp_sum -= (long double)avg_redo_bytes_per_milli_this_period;
3160       redo_temp_sum *= redo_temp_sum;
3161       redo_temp_sum /= (long double)millis_passed_total;
3162       redo_temp_sum *= (long double)millis_passed_this_period;
3163       redo_square_sum += redo_temp_sum;
3164     }
3165     start_index++;
3166   }
3167   if (millis_passed == 0)
3168   {
3169     jam();
3170     std_dev_backup_lcp_in_bytes_per_sec = 0;
3171     std_dev_backup_in_bytes_per_sec = 0;
3172     std_dev_redo_in_bytes_per_sec = 0;
3173     return;
3174   }
3175   /**
3176    * Calculate standard deviation per millisecond.
3177    * We use long double for the calculation, but we want to report it
3178    * in bytes per second, and that is easiest to do with an unsigned
3179    * integer number. The conversion from long double to Uint64 is a
3180    * real conversion that we leave to the compiler to generate.
3181    */
3182   std_dev_backup_lcp_in_bytes_per_sec = (Uint64)sqrtl(backup_lcp_square_sum);
3183   std_dev_backup_in_bytes_per_sec = (Uint64)sqrtl(backup_square_sum);
3184   std_dev_redo_in_bytes_per_sec = (Uint64)sqrtl(redo_square_sum);
3185 
3186   /**
3187    * Convert to standard deviation per second.
3188    * We calculated it in bytes per millisecond, so a simple
3189    * multiplication by 1000 is sufficient here.
3190    */
3191   std_dev_backup_lcp_in_bytes_per_sec*= (Uint64)1000;
3192   std_dev_backup_in_bytes_per_sec*= (Uint64)1000;
3193   std_dev_redo_in_bytes_per_sec*= (Uint64)1000;
3194 }
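
/**
 * In formula form, a sketch of what the loop above computes: with weights
 * w(i) = millis(i) / total_millis, per-millisecond averages X(i) and the
 * overall per-millisecond average E(X),
 *
 *   std_dev_per_sec = 1000 * sqrt( sum over i of w(i) * (X(i) - E(X))^2 )
 *
 * i.e. a duration-weighted population standard deviation, scaled from
 * per-millisecond to per-second units.
 */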
3195 
3196 Uint64
3197 Backup::calculate_millis_since_finished(Uint32 start_index)
3198 {
3199   Uint64 millis_passed = 0;
3200   jam();
3201   if (start_index == 0)
3202   {
3203     jam();
3204     return 0;
3205   }
3206   for (Uint32 i = 0; i < start_index; i++)
3207   {
3208     Uint32 disk_write_speed_record = get_disk_write_speed_record(i);
3209     millis_passed +=
3210       disk_write_speed_rep[disk_write_speed_record].millis_passed;
3211   }
3212   return millis_passed;
3213 }
3214 
3215 void Backup::execDBINFO_SCANREQ(Signal *signal)
3216 {
3217   jamEntry();
3218   DbinfoScanReq req= *(DbinfoScanReq*)signal->theData;
3219   const Ndbinfo::ScanCursor* cursor =
3220     CAST_CONSTPTR(Ndbinfo::ScanCursor, DbinfoScan::getCursorPtr(&req));
3221 
3222   Ndbinfo::Ratelimit rl;
3223 
3224   switch(req.tableId){
3225   case Ndbinfo::POOLS_TABLEID:
3226   {
3227     Ndbinfo::pool_entry pools[] =
3228     {
3229       { "Backup Record",
3230         c_backupPool.getUsed(),
3231         c_backupPool.getSize(),
3232         c_backupPool.getEntrySize(),
3233         c_backupPool.getUsedHi(),
3234         { CFG_DB_PARALLEL_BACKUPS,0,0,0 },
3235         0},
3236       { "Backup File",
3237         c_backupFilePool.getUsed(),
3238         c_backupFilePool.getSize(),
3239         c_backupFilePool.getEntrySize(),
3240         c_backupFilePool.getUsedHi(),
3241         { CFG_DB_PARALLEL_BACKUPS,0,0,0 },
3242         0},
3243       { "Table",
3244         c_tablePool.getUsed(),
3245         c_tablePool.getSize(),
3246         c_tablePool.getEntrySize(),
3247         c_tablePool.getUsedHi(),
3248         { CFG_DB_PARALLEL_BACKUPS,
3249           CFG_DB_NO_TABLES,
3250           CFG_DB_NO_ORDERED_INDEXES,
3251           CFG_DB_NO_UNIQUE_HASH_INDEXES },
3252         0},
3253       { "Trigger",
3254         c_triggerPool.getUsed(),
3255         c_triggerPool.getSize(),
3256         c_triggerPool.getEntrySize(),
3257         c_triggerPool.getUsedHi(),
3258         { CFG_DB_PARALLEL_BACKUPS,
3259           CFG_DB_NO_TABLES,
3260           CFG_DB_NO_ORDERED_INDEXES,
3261           CFG_DB_NO_UNIQUE_HASH_INDEXES },
3262         0},
3263       { "Fragment",
3264         c_fragmentPool.getUsed(),
3265         c_fragmentPool.getSize(),
3266         c_fragmentPool.getEntrySize(),
3267         c_fragmentPool.getUsedHi(),
3268         { CFG_DB_NO_TABLES,
3269           CFG_DB_NO_ORDERED_INDEXES,
3270           CFG_DB_NO_UNIQUE_HASH_INDEXES,0 },
3271         0},
3272       { "Page",
3273         c_pagePool.getUsed(),
3274         c_pagePool.getSize(),
3275         c_pagePool.getEntrySize(),
3276         c_pagePool.getUsedHi(),
3277         { CFG_DB_BACKUP_MEM,
3278           CFG_DB_BACKUP_DATA_BUFFER_MEM,0,0 },
3279         0},
3280       { NULL, 0,0,0,0, { 0,0,0,0 }, 0}
3281     };
3282 
3283     const size_t num_config_params =
3284       sizeof(pools[0].config_params) / sizeof(pools[0].config_params[0]);
3285     Uint32 pool = cursor->data[0];
3286     BlockNumber bn = blockToMain(number());
3287     while(pools[pool].poolname)
3288     {
3289       jam();
3290       Ndbinfo::Row row(signal, req);
3291       row.write_uint32(getOwnNodeId());
3292       row.write_uint32(bn);           // block number
3293       row.write_uint32(instance());   // block instance
3294       row.write_string(pools[pool].poolname);
3295 
3296       row.write_uint64(pools[pool].used);
3297       row.write_uint64(pools[pool].total);
3298       row.write_uint64(pools[pool].used_hi);
3299       row.write_uint64(pools[pool].entry_size);
3300       for (size_t i = 0; i < num_config_params; i++)
3301         row.write_uint32(pools[pool].config_params[i]);
3302       row.write_uint32(GET_RG(pools[pool].record_type));
3303       row.write_uint32(GET_TID(pools[pool].record_type));
3304       ndbinfo_send_row(signal, req, row, rl);
3305       pool++;
3306       if (rl.need_break(req))
3307       {
3308         jam();
3309         ndbinfo_send_scan_break(signal, req, rl, pool);
3310         return;
3311       }
3312     }
3313     break;
3314   }
3315   case Ndbinfo::DISK_WRITE_SPEED_AGGREGATE_TABLEID:
3316   {
3317 
3318     jam();
3319     Uint64 backup_lcp_bytes_written;
3320     Uint64 backup_bytes_written;
3321     Uint64 redo_bytes_written;
3322     Uint64 std_dev_backup;
3323     Uint64 std_dev_backup_lcp;
3324     Uint64 std_dev_redo;
3325     Uint64 millis_passed;
3326     Ndbinfo::Row row(signal, req);
3327     Uint32 ldm_instance = instance();
3328 
3329     if (ldm_instance > 0)
3330     {
3331       /* Always start counting instances from 0 */
3332       ldm_instance--;
3333     }
3334     row.write_uint32(getOwnNodeId());
3335     row.write_uint32(ldm_instance);
3336 
3337     /* Report last second */
3338     calculate_disk_write_speed_seconds_back(1,
3339                                             millis_passed,
3340                                             backup_lcp_bytes_written,
3341                                             backup_bytes_written,
3342                                             redo_bytes_written);
3343 
3344     row.write_uint64((backup_lcp_bytes_written * 1000) / millis_passed);
3345     row.write_uint64((redo_bytes_written * 1000) / millis_passed);
3346 
3347     /* Report average and std_dev of last 10 seconds */
3348     calculate_disk_write_speed_seconds_back(10,
3349                                             millis_passed,
3350                                             backup_lcp_bytes_written,
3351                                             backup_bytes_written,
3352                                             redo_bytes_written);
3353 
3354     row.write_uint64((backup_lcp_bytes_written * 1000) / millis_passed);
3355     row.write_uint64((redo_bytes_written * 1000) / millis_passed);
3356 
3357     calculate_std_disk_write_speed_seconds_back(10,
3358                                                 millis_passed,
3359                                                 backup_lcp_bytes_written,
3360                                                 backup_bytes_written,
3361                                                 redo_bytes_written,
3362                                                 std_dev_backup_lcp,
3363                                                 std_dev_backup,
3364                                                 std_dev_redo);
3365 
3366     row.write_uint64(std_dev_backup_lcp);
3367     row.write_uint64(std_dev_redo);
3368 
3369     /* Report average and std_dev of last 60 seconds */
3370     calculate_disk_write_speed_seconds_back(60,
3371                                             millis_passed,
3372                                             backup_lcp_bytes_written,
3373                                             backup_bytes_written,
3374                                             redo_bytes_written);
3375 
3376     row.write_uint64((backup_lcp_bytes_written * 1000) / millis_passed);
3377     row.write_uint64((redo_bytes_written * 1000) / millis_passed);
3378 
3379     calculate_std_disk_write_speed_seconds_back(60,
3380                                                 millis_passed,
3381                                                 backup_lcp_bytes_written,
3382                                                 backup_bytes_written,
3383                                                 redo_bytes_written,
3384                                                 std_dev_backup_lcp,
3385                                                 std_dev_backup,
3386                                                 std_dev_redo);
3387 
3388     row.write_uint64(std_dev_backup_lcp);
3389     row.write_uint64(std_dev_redo);
3390 
3391     row.write_uint64(slowdowns_due_to_io_lag);
3392     row.write_uint64(slowdowns_due_to_high_cpu);
3393     row.write_uint64(disk_write_speed_set_to_min);
3394     row.write_uint64(m_curr_disk_write_speed *
3395                      CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS);
3396 
3397     ndbinfo_send_row(signal, req, row, rl);
3398     break;
3399   }
3400   case Ndbinfo::DISK_WRITE_SPEED_BASE_TABLEID:
3401   {
3402     jam();
3403     Uint32 ldm_instance = instance();
3404 
3405     if (ldm_instance > 0)
3406     {
3407       /* Always start counting instances from 0 */
3408       ldm_instance--;
3409     }
3410     Uint32 start_index = cursor->data[0];
3411     for ( ; start_index < DISK_WRITE_SPEED_REPORT_SIZE;)
3412     {
3413       jam();
3414       Ndbinfo::Row row(signal, req);
3415       row.write_uint32(getOwnNodeId());
3416       row.write_uint32(ldm_instance);
3417       Uint32 disk_write_speed_record = get_disk_write_speed_record(start_index);
3418       if (disk_write_speed_record != DISK_WRITE_SPEED_REPORT_SIZE)
3419       {
3420         jam();
3421         Uint64 backup_lcp_bytes_written_this_period =
3422           disk_write_speed_rep[disk_write_speed_record].
3423             backup_lcp_bytes_written;
3424         Uint64 redo_bytes_written_this_period =
3425           disk_write_speed_rep[disk_write_speed_record].
3426             redo_bytes_written;
3427         Uint64 millis_passed_this_period =
3428           disk_write_speed_rep[disk_write_speed_record].millis_passed;
3429         Uint64 millis_since_finished =
3430           calculate_millis_since_finished(start_index);
3431         Uint64 target_disk_write_speed =
3432           disk_write_speed_rep[disk_write_speed_record].target_disk_write_speed;
3433 
3434         row.write_uint64(millis_since_finished);
3435         row.write_uint64(millis_passed_this_period);
3436         row.write_uint64(backup_lcp_bytes_written_this_period);
3437         row.write_uint64(redo_bytes_written_this_period);
3438         row.write_uint64(target_disk_write_speed);
3439       }
3440       else
3441       {
3442         jam();
3443         row.write_uint64((Uint64)0);
3444         row.write_uint64((Uint64)0);
3445         row.write_uint64((Uint64)0);
3446         row.write_uint64((Uint64)0);
3447         row.write_uint64((Uint64)0);
3448       }
3449       ndbinfo_send_row(signal, req, row, rl);
3450       start_index++;
3451       if (rl.need_break(req))
3452       {
3453         jam();
3454         ndbinfo_send_scan_break(signal, req, rl, start_index);
3455         return;
3456       }
3457     }
3458     break;
3459   }
3460   case Ndbinfo::LOGBUFFERS_TABLEID:
3461   {
3462     jam();
3463     BackupRecordPtr ptr;
3464     if (!get_backup_record(ptr))
3465     {
3466       break;
3467     }
3468 
3469     jam();
3470     Uint32 files[2] = { ptr.p->dataFilePtr[0], ptr.p->logFilePtr };
3471     for (Uint32 i=0; i<NDB_ARRAY_SIZE(files); i++)
3472     {
3473       jam();
3474       Uint32 usableBytes, freeLwmBytes, freeSizeBytes;
3475       usableBytes = freeLwmBytes = freeSizeBytes = 0;
3476       Uint32 logtype = Ndbinfo::BACKUP_DATA_BUFFER;
3477 
3478       switch(i){
3479       case 0:
3480         logtype = Ndbinfo::BACKUP_DATA_BUFFER;
3481         usableBytes = c_defaults.m_dataBufferSize;
3482         break;
3483       case 1:
3484         logtype = Ndbinfo::BACKUP_LOG_BUFFER;
3485         usableBytes = c_defaults.m_logBufferSize;
3486         break;
3487       default:
3488         ndbabort();
3489         break;
3490       };
3491 
3492       BackupFilePtr filePtr;
3493       ptr.p->files.getPtr(filePtr, files[i]);
3494       if (ptr.p->logFilePtr != RNIL)
3495       {
3496         freeSizeBytes = filePtr.p->operation.dataBuffer.getFreeSize() << 2;
3497         freeLwmBytes = filePtr.p->operation.dataBuffer.getFreeLwm() << 2;
3498       }
3499       else
3500       {
3501         freeSizeBytes = usableBytes;
3502         freeLwmBytes = usableBytes;
3503       }
3504 
3505       Ndbinfo::Row data_row(signal, req);
3506       data_row.write_uint32(getOwnNodeId());
3507       data_row.write_uint32(logtype);
3508       data_row.write_uint32(0);   // log id, always 0
3509       data_row.write_uint32(instance());     // log part, instance for ndbmtd
3510 
3511       data_row.write_uint64(usableBytes);        // total allocated
3512       data_row.write_uint64(usableBytes - freeSizeBytes); // currently in use
3513       data_row.write_uint64(usableBytes - freeLwmBytes);  // high water mark
3514       // only 2 rows to send in total, so ignore ratelimit
3515       ndbinfo_send_row(signal, req, data_row, rl);
3516     }
3517     break;
3518   }
3519   default:
3520     break;
3521   }
3522 
3523   ndbinfo_send_scan_conf(signal, req, rl);
3524 }
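
/**
 * Usage note: the rows produced above back the ndbinfo interface, so the
 * statistics can be inspected from a SQL node (assuming a standard ndbinfo
 * setup), e.g.:
 *
 *   SELECT * FROM ndbinfo.disk_write_speed_aggregate;
 *   SELECT * FROM ndbinfo.disk_write_speed_base;
 *
 * Each row carries the node id and LDM instance, so per-thread write
 * rates can be compared across the cluster.
 */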
3525 
3526 static const Uint32 MAX_TABLE_MAPS = 2;
3527 bool
3528 Backup::findTable(const BackupRecordPtr & ptr,
3529 		  TablePtr & tabPtr, Uint32 tableId)
3530 {
3531   Uint32 loopCount = 0;
3532   tabPtr.i = c_tableMap[tableId];
3533   while (loopCount++ < MAX_TABLE_MAPS)
3534   {
3535     if (tabPtr.i == RNIL)
3536     {
3537       jam();
3538       return false;
3539     }
3540     c_tablePool.getPtr(tabPtr);
3541     if (tabPtr.p->backupPtrI == ptr.i)
3542     {
3543       jam();
3544       return true;
3545     }
3546     jam();
3547     tabPtr.i = tabPtr.p->nextMapTable;
3548   }
3549   return false;
3550 }
3551 
3552 void
3553 Backup::insertTableMap(TablePtr & tabPtr,
3554                        Uint32 backupPtrI,
3555                        Uint32 tableId)
3556 {
3557   tabPtr.p->backupPtrI = backupPtrI;
3558   tabPtr.p->tableId = tableId;
3559   tabPtr.p->nextMapTable = c_tableMap[tableId];
3560   c_tableMap[tableId] = tabPtr.i;
3561 }
3562 
3563 void
3564 Backup::removeTableMap(TablePtr &tabPtr,
3565                        Uint32 backupPtr,
3566                        Uint32 tableId)
3567 {
3568   TablePtr prevTabPtr;
3569   TablePtr locTabPtr;
3570   Uint32 loopCount = 0;
3571 
3572   prevTabPtr.i = RNIL;
3573   prevTabPtr.p = 0;
3574   locTabPtr.i = c_tableMap[tableId];
3575 
3576   while (loopCount++ < MAX_TABLE_MAPS)
3577   {
3578     jam();
3579     c_tablePool.getPtr(locTabPtr);
3580     ndbrequire(locTabPtr.p->tableId == tableId);
3581     if (locTabPtr.p->backupPtrI == backupPtr)
3582     {
3583       ndbrequire(tabPtr.i == locTabPtr.i);
3584       if (prevTabPtr.i == RNIL)
3585       {
3586         jam();
3587         c_tableMap[tableId] = locTabPtr.p->nextMapTable;
3588       }
3589       else
3590       {
3591         jam();
3592         prevTabPtr.p->nextMapTable = locTabPtr.p->nextMapTable;
3593       }
3594       locTabPtr.p->nextMapTable = RNIL;
3595       locTabPtr.p->tableId = RNIL;
3596       locTabPtr.p->backupPtrI = RNIL;
3597       return;
3598     }
3599     prevTabPtr = locTabPtr;
3600     locTabPtr.i = locTabPtr.p->nextMapTable;
3601   }
3602   ndbabort();
3603 }
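
/**
 * An illustrative sketch of the c_tableMap structure maintained by the
 * three routines above: c_tableMap[tableId] heads a short singly-linked
 * chain of Table records, linked through nextMapTable, so that at most
 * MAX_TABLE_MAPS (2) users - e.g. an LCP and a backup - can reference the
 * same tableId concurrently:
 *
 *   c_tableMap[tableId]
 *     -> Table A (backupPtrI = first user)
 *        -> nextMapTable -> Table B (backupPtrI = second user) -> RNIL
 *
 * findTable() walks the chain matching on backupPtrI, insertTableMap()
 * pushes a new entry at the head, and removeTableMap() unlinks the entry
 * whose backupPtrI matches.
 */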
3604 
3605 static Uint32 xps(Uint64 x, Uint64 ms)
3606 {
3607   float fx = float(x);
3608   float fs = float(ms);
3609 
3610   if(ms == 0 || x == 0) {
3611     jamNoBlock();
3612     return 0;
3613   }//if
3614   jamNoBlock();
3615   return ((Uint32)(1000.0f * (fx + fs/2.1f))) / ((Uint32)fs);
3616 }
3617 
3618 struct Number {
3619   Number(Uint64 r) { val = r;}
3620   Number & operator=(Uint64 r) { val = r; return * this; }
3621   Uint64 val;
3622 };
3623 
3624 NdbOut &
3625 operator<< (NdbOut & out, const Number & val){
3626   char p = 0;
3627   Uint32 loop = 1;
3628   while(val.val > loop){
3629     loop *= 1000;
3630     p += 3;
3631   }
3632   if(loop != 1){
3633     p -= 3;
3634     loop /= 1000;
3635   }
3636 
3637   switch(p){
3638   case 0:
3639     break;
3640   case 3:
3641     p = 'k';
3642     break;
3643   case 6:
3644     p = 'M';
3645     break;
3646   case 9:
3647     p = 'G';
3648     break;
3649   default:
3650     p = 0;
3651   }
3652   char str[2];
3653   str[0] = p;
3654   str[1] = 0;
3655   Uint32 tmp = (Uint32)((val.val + (loop >> 1)) / loop);
3656 #if 1
3657   if(p > 0)
3658     out << tmp << str;
3659   else
3660     out << tmp;
3661 #else
3662   out << val.val;
3663 #endif
3664 
3665   return out;
3666 }
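
/**
 * Example of the two helpers above (hypothetical values): xps(1000000,
 * 2000) yields roughly 500000 units per second (the fs/2.1f term adds a
 * small rounding bias), and streaming Number(1500000) prints "2M", since
 * the value is rounded to the nearest unit prefix (k, M or G).
 */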
3667 
3668 void
3669 Backup::execBACKUP_CONF(Signal* signal)
3670 {
3671   jamEntry();
3672   BackupConf * conf = (BackupConf*)signal->getDataPtr();
3673 
3674   ndbout_c("Backup %u has started", conf->backupId);
3675 }
3676 
3677 void
3678 Backup::execBACKUP_REF(Signal* signal)
3679 {
3680   jamEntry();
3681   BackupRef * ref = (BackupRef*)signal->getDataPtr();
3682 
3683   ndbout_c("Backup (%u) has NOT started %d", ref->senderData, ref->errorCode);
3684 }
3685 
3686 void
3687 Backup::execBACKUP_COMPLETE_REP(Signal* signal)
3688 {
3689   jamEntry();
3690   BackupCompleteRep* rep = (BackupCompleteRep*)signal->getDataPtr();
3691 
3692   const NDB_TICKS now = NdbTick_getCurrentTicks();
3693   const Uint64 elapsed = NdbTick_Elapsed(startTime,now).milliSec();
3694 
3695   ndbout_c("Backup %u has completed", rep->backupId);
3696   const Uint64 bytes =
3697     rep->noOfBytesLow + (((Uint64)rep->noOfBytesHigh) << 32);
3698   const Uint64 records =
3699     rep->noOfRecordsLow + (((Uint64)rep->noOfRecordsHigh) << 32);
3700 
3701   Number rps = xps(records, elapsed);
3702   Number bps = xps(bytes, elapsed);
3703 
3704   ndbout << " Data [ "
3705 	 << Number(records) << " rows "
3706 	 << Number(bytes) << " bytes " << elapsed << " ms ] "
3707 	 << " => "
3708 	 << rps << " row/s & " << bps << "b/s" << endl;
3709 
3710   bps = xps(rep->noOfLogBytes, elapsed);
3711   rps = xps(rep->noOfLogRecords, elapsed);
3712 
3713   ndbout << " Log [ "
3714 	 << Number(rep->noOfLogRecords) << " log records "
3715 	 << Number(rep->noOfLogBytes) << " bytes " << elapsed << " ms ] "
3716 	 << " => "
3717 	 << rps << " records/s & " << bps << "b/s" << endl;
3718 
3719 }
3720 
3721 void
3722 Backup::execBACKUP_ABORT_REP(Signal* signal)
3723 {
3724   jamEntry();
3725   BackupAbortRep* rep = (BackupAbortRep*)signal->getDataPtr();
3726 
3727   ndbout_c("Backup %u has been aborted %d", rep->backupId, rep->reason);
3728 }
3729 
3730 const TriggerEvent::Value triggerEventValues[] = {
3731   TriggerEvent::TE_INSERT,
3732   TriggerEvent::TE_UPDATE,
3733   TriggerEvent::TE_DELETE
3734 };
3735 
3736 const Backup::State
3737 Backup::validSlaveTransitions[] = {
3738   INITIAL,  DEFINING,
3739   DEFINING, DEFINED,
3740   DEFINED,  STARTED,
3741   STARTED,  STARTED, // Several START_BACKUP_REQ is sent
3742   STARTED,  SCANNING,
3743   SCANNING, STARTED,
3744   STARTED,  STOPPING,
3745   STOPPING, CLEANING,
3746   CLEANING, INITIAL,
3747 
3748   INITIAL,  ABORTING, // Node fail
3749   DEFINING, ABORTING,
3750   DEFINED,  ABORTING,
3751   STARTED,  ABORTING,
3752   SCANNING, ABORTING,
3753   STOPPING, ABORTING,
3754   CLEANING, ABORTING, // Node fail w/ master takeover
3755   ABORTING, ABORTING, // Slave who initiates ABORT should have this transition
3756 
3757   ABORTING, INITIAL,
3758   INITIAL,  INITIAL
3759 };
3760 
3761 const Uint32
3762 Backup::validSlaveTransitionsCount =
3763 sizeof(Backup::validSlaveTransitions) / sizeof(Backup::State);
3764 
3765 void
3766 Backup::CompoundState::setState(State newState){
3767   bool found = false;
3768   const State currState = state;
3769   for(unsigned i = 0; i<noOfValidTransitions; i+= 2) {
3770     jam();
3771     if(validTransitions[i]   == currState &&
3772        validTransitions[i+1] == newState){
3773       jam();
3774       found = true;
3775       break;
3776     }
3777   }
3778 
3779   //ndbrequire(found);
3780 
3781   if (newState == INITIAL)
3782     abortState = INITIAL;
3783   if(newState == ABORTING && currState != ABORTING) {
3784     jam();
3785     abortState = currState;
3786   }
3787   state = newState;
3788 #ifdef DEBUG_ABORT
3789   if (newState != currState) {
3790     ndbout_c("%u: Old state = %u, new state = %u, abort state = %u",
3791 	     id, currState, newState, abortState);
3792   }
3793 #endif
3794 }
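
/**
 * The transition table is a flat array of (from, to) pairs, so the loop
 * above validates a transition in a linear scan over the pairs. The same
 * check in isolation (a sketch, not used by the block):
 *
 *   bool valid(const State *t, unsigned n, State from, State to)
 *   {
 *     for (unsigned i = 0; i < n; i += 2)
 *       if (t[i] == from && t[i + 1] == to)
 *         return true;
 *     return false;
 *   }
 *
 * Note that the ndbrequire on the result is commented out above, so an
 * unlisted transition is tolerated at runtime.
 */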
3795 
3796 void
3797 Backup::CompoundState::forceState(State newState)
3798 {
3799   const State currState = state;
3800   if (newState == INITIAL)
3801     abortState = INITIAL;
3802   if(newState == ABORTING && currState != ABORTING) {
3803     jam();
3804     abortState = currState;
3805   }
3806   state = newState;
3807 #ifdef DEBUG_ABORT
3808   if (newState != currState) {
3809     ndbout_c("%u: FORCE: Old state = %u, new state = %u, abort state = %u",
3810 	     id, currState, newState, abortState);
3811   }
3812 #endif
3813 }
3814 
3815 Backup::Table::Table(Fragment_pool & fh)
3816   : fragments(fh)
3817 {
3818   triggerIds[0] = ILLEGAL_TRIGGER_ID;
3819   triggerIds[1] = ILLEGAL_TRIGGER_ID;
3820   triggerIds[2] = ILLEGAL_TRIGGER_ID;
3821   triggerAllocated[0] = false;
3822   triggerAllocated[1] = false;
3823   triggerAllocated[2] = false;
3824 }
3825 
3826 /*****************************************************************************
3827  *
3828  * Node state handling
3829  *
3830  *****************************************************************************/
3831 void
3832 Backup::execNODE_FAILREP(Signal* signal)
3833 {
3834   jamEntry();
3835 
3836   NodeFailRep * rep = (NodeFailRep*)signal->getDataPtr();
3837 
3838   if(signal->getLength() == NodeFailRep::SignalLength)
3839   {
3840     ndbrequire(signal->getNoOfSections() == 1);
3841     ndbrequire(ndbd_send_node_bitmask_in_section(
3842         getNodeInfo(refToNode(signal->getSendersBlockRef())).m_version));
3843     SegmentedSectionPtr ptr;
3844     SectionHandle handle(this, signal);
3845     handle.getSection(ptr, 0);
3846     memset(rep->theNodes, 0, sizeof(rep->theNodes));
3847     copy(rep->theNodes, ptr);
3848     releaseSections(handle);
3849   }
3850   else
3851   {
3852     memset(rep->theNodes + NdbNodeBitmask48::Size,
3853            0,
3854            _NDB_NBM_DIFF_BYTES);
3855   }
3856   bool doStuff = false;
3857   /*
3858   Start by saving important signal data which will be destroyed before the
3859   process is completed.
3860   */
3861   NodeId new_master_node_id = rep->masterNodeId;
3862   Uint32 theFailedNodes[NdbNodeBitmask::Size];
3863   for (Uint32 i = 0; i < NdbNodeBitmask::Size; i++)
3864     theFailedNodes[i] = rep->theNodes[i];
3865 
3866   c_masterNodeId = new_master_node_id;
3867 
3868   NodePtr nodePtr;
3869   for(c_nodes.first(nodePtr); nodePtr.i != RNIL; c_nodes.next(nodePtr)) {
3870     jam();
3871     if(NdbNodeBitmask::get(theFailedNodes, nodePtr.p->nodeId)){
3872       if(nodePtr.p->alive){
3873 	jam();
3874 	ndbrequire(c_aliveNodes.get(nodePtr.p->nodeId));
3875 	doStuff = true;
3876       } else {
3877         jam();
3878 	ndbrequire(!c_aliveNodes.get(nodePtr.p->nodeId));
3879       }//if
3880       nodePtr.p->alive = 0;
3881       c_aliveNodes.clear(nodePtr.p->nodeId);
3882     }//if
3883   }//for
3884 
3885   if(!doStuff){
3886     jam();
3887     return;
3888   }//if
3889 
3890 #ifdef DEBUG_ABORT
3891   ndbout_c("****************** Node fail rep ******************");
3892 #endif
3893 
3894   NodeId newCoordinator = c_masterNodeId;
3895   BackupRecordPtr ptr;
3896   if (get_backup_record(ptr))
3897   {
3898     jam();
3899     checkNodeFail(signal, ptr, newCoordinator, theFailedNodes);
3900   }
3901 
3902   /* Block level cleanup */
3903   for(unsigned i = 1; i < MAX_NDB_NODES; i++) {
3904     jam();
3905     if(NdbNodeBitmask::get(theFailedNodes, i))
3906     {
3907       jam();
3908       Uint32 elementsCleaned = simBlockNodeFailure(signal, i); // No callback
3909       ndbassert(elementsCleaned == 0); // Backup should have no distributed frag signals
3910       (void) elementsCleaned; // Remove compiler warning
3911     }//if
3912   }//for
3913 }
3914 
3915 bool
3916 Backup::verifyNodesAlive(BackupRecordPtr ptr,
3917 			 const NdbNodeBitmask& aNodeBitMask)
3918 {
3919   Uint32 version = getNodeInfo(getOwnNodeId()).m_version;
3920   for (Uint32 i = 0; i < MAX_NDB_NODES; i++) {
3921     jam();
3922     if(aNodeBitMask.get(i)) {
3923       if(!c_aliveNodes.get(i)){
3924         jam();
3925 	ptr.p->setErrorCode(AbortBackupOrd::BackupFailureDueToNodeFail);
3926         return false;
3927       }//if
3928       if(getNodeInfo(i).m_version != version)
3929       {
3930 	jam();
3931 	ptr.p->setErrorCode(AbortBackupOrd::IncompatibleVersions);
3932 	return false;
3933       }
3934     }//if
3935   }//for
3936   return true;
3937 }
3938 
3939 void
3940 Backup::checkNodeFail(Signal* signal,
3941 		      BackupRecordPtr ptr,
3942 		      NodeId newCoord,
3943 		      Uint32 theFailedNodes[NdbNodeBitmask::Size])
3944 {
3945   NdbNodeBitmask mask;
3946   mask.assign(NdbNodeBitmask::Size, theFailedNodes);
3947 
3948   /* Update ptr.p->nodes to be up to date with current alive nodes
3949    */
3950   NodePtr nodePtr;
3951   bool found = false;
3952   for(c_nodes.first(nodePtr); nodePtr.i != RNIL; c_nodes.next(nodePtr)) {
3953     jam();
3954     if(NdbNodeBitmask::get(theFailedNodes, nodePtr.p->nodeId)) {
3955       jam();
3956       if (ptr.p->nodes.get(nodePtr.p->nodeId)) {
3957 	jam();
3958 	ptr.p->nodes.clear(nodePtr.p->nodeId);
3959 	found = true;
3960       }
3961     }//if
3962   }//for
3963 
3964   if(!found) {
3965     jam();
3966     return; // failed node is not part of backup process, safe to continue
3967   }
3968 
3969   if(mask.get(refToNode(ptr.p->masterRef)))
3970   {
3971     /**
3972      * Master died...abort
3973      */
3974     ptr.p->masterRef = reference();
3975     ptr.p->senderRef = reference();
3976     // Each ldm on each node becomes master and sends signals only to self
3977     ptr.p->nodes.clear();
3978     ptr.p->nodes.set(getOwnNodeId());
3979     ptr.p->fragWorkers[getOwnNodeId()].clear();
3980     ptr.p->fragWorkers[getOwnNodeId()].set(instance());
3981     ptr.p->setErrorCode(AbortBackupOrd::BackupFailureDueToNodeFail);
3982     switch(ptr.p->m_gsn){
3983     case GSN_DEFINE_BACKUP_REQ:
3984     case GSN_START_BACKUP_REQ:
3985     case GSN_BACKUP_FRAGMENT_REQ:
3986     case GSN_STOP_BACKUP_REQ:
3987       // I'm currently processing...reply to self and abort...
3988       ptr.p->masterData.gsn = ptr.p->m_gsn;
3989       ptr.p->masterData.sendCounter = ptr.p->nodes;
3990       return;
3991     case GSN_DEFINE_BACKUP_REF:
3992     case GSN_DEFINE_BACKUP_CONF:
3993     case GSN_START_BACKUP_REF:
3994     case GSN_START_BACKUP_CONF:
3995     case GSN_BACKUP_FRAGMENT_REF:
3996     case GSN_BACKUP_FRAGMENT_CONF:
3997     case GSN_STOP_BACKUP_REF:
3998     case GSN_STOP_BACKUP_CONF:
3999       ptr.p->masterData.gsn = GSN_DEFINE_BACKUP_REQ;
4000       masterAbort(signal, ptr);
4001       return;
4002     case GSN_ABORT_BACKUP_ORD:
4003       // Already aborting
4004       return;
4005     }
4006   }
4007   else if (newCoord == getOwnNodeId() &&
4008            instance() == masterInstanceKey(ptr))
4009   {
4010     /**
4011      * I'm master for this backup: LDM1 on master node
4012      */
4013     jam();
4014     CRASH_INSERTION((10001));
4015 #ifdef DEBUG_ABORT
4016     ndbout_c("**** Master: Node failed: Master id = %u",
4017 	     refToNode(ptr.p->masterRef));
4018 #endif
4019 
4020     Uint32 gsn, len, pos;
4021     ptr.p->nodes.bitANDC(mask);
4022     switch(ptr.p->masterData.gsn){
4023     case GSN_DEFINE_BACKUP_REQ:
4024     {
4025       DefineBackupRef * ref = (DefineBackupRef*)signal->getDataPtrSend();
4026       ref->backupPtr = ptr.i;
4027       ref->backupId = ptr.p->backupId;
4028       ref->errorCode = AbortBackupOrd::BackupFailureDueToNodeFail;
4029       gsn= GSN_DEFINE_BACKUP_REF;
4030       len= DefineBackupRef::SignalLength;
4031       pos= Uint32(&ref->nodeId - signal->getDataPtrSend());
4032       break;
4033     }
4034     case GSN_START_BACKUP_REQ:
4035     {
4036       StartBackupRef * ref = (StartBackupRef*)signal->getDataPtrSend();
4037       ref->backupPtr = ptr.i;
4038       ref->backupId = ptr.p->backupId;
4039       ref->errorCode = AbortBackupOrd::BackupFailureDueToNodeFail;
4040       gsn= GSN_START_BACKUP_REF;
4041       len= StartBackupRef::SignalLength;
4042       pos= Uint32(&ref->nodeId - signal->getDataPtrSend());
4043       break;
4044     }
4045     case GSN_BACKUP_FRAGMENT_REQ:
4046     {
4047       BackupFragmentRef * ref = (BackupFragmentRef*)signal->getDataPtrSend();
4048       ref->backupPtr = ptr.i;
4049       ref->backupId = ptr.p->backupId;
4050       ref->errorCode = AbortBackupOrd::BackupFailureDueToNodeFail;
4051       gsn= GSN_BACKUP_FRAGMENT_REF;
4052       len= BackupFragmentRef::SignalLength;
4053       pos= Uint32(&ref->nodeId - signal->getDataPtrSend());
4054       break;
4055     }
4056     case GSN_STOP_BACKUP_REQ:
4057     {
4058       StopBackupRef * ref = (StopBackupRef*)signal->getDataPtrSend();
4059       ref->backupPtr = ptr.i;
4060       ref->backupId = ptr.p->backupId;
4061       ref->errorCode = AbortBackupOrd::BackupFailureDueToNodeFail;
4062       ref->nodeId = getOwnNodeId();
4063       gsn= GSN_STOP_BACKUP_REF;
4064       len= StopBackupRef::SignalLength;
4065       pos= Uint32(&ref->nodeId - signal->getDataPtrSend());
4066       break;
4067     }
4068     case GSN_WAIT_GCP_REQ:
4069     case GSN_DROP_TRIG_IMPL_REQ:
4070     case GSN_CREATE_TRIG_IMPL_REQ:
4071     case GSN_ALTER_TRIG_IMPL_REQ:
4072       ptr.p->setErrorCode(AbortBackupOrd::BackupFailureDueToNodeFail);
4073       return;
4074     case GSN_UTIL_SEQUENCE_REQ:
4075     case GSN_UTIL_LOCK_REQ:
4076       return;
4077     default:
4078       ndbabort();
4079     }
4080 
4081     for(Uint32 i = 0; (i = mask.find(i+1)) != NdbNodeBitmask::NotFound; )
4082     {
4083       signal->theData[pos] = i;
4084       if (gsn == GSN_BACKUP_FRAGMENT_REF)
4085       {
4086         // Handle mt-backup case where all LDMs process BACKUP_FRAGMENT_REQs
4087         // simultaneously. If any node fails, master sends REFs to self on
4088         // behalf of every failed node. Extend handling for BACKUP_FRAGMENT_REQ
4089         // so that master sends BACKUP_FRAGMENT_REFs to self from every LDM
4090         // on every failed node.
4091         Uint32 workers = getNodeInfo(i).m_lqh_workers;
4092         for (Uint32 j=0; j<workers; j++)
4093         {
4094           sendSignal(reference(), gsn, signal, len, JBB);
4095         }
4096       }
4097       else
4098       {
4099         // master sends REQs only to one instance (BackupProxy) on each node
4100         // send only one reply to self per node on behalf of BackupProxy
4101         sendSignal(reference(), gsn, signal, len, JBB);
4102 #ifdef DEBUG_ABORT
4103         ndbout_c("sending %d to self from %d", gsn, i);
4104 #endif
4105       }
4106     }
4107     return;
4108   }//if
4109 
4110   /**
4111    * I abort myself as slave if not master
4112    */
4113   CRASH_INSERTION((10021));
4114 }
4115 
4116 void
4117 Backup::execINCL_NODEREQ(Signal* signal)
4118 {
4119   jamEntry();
4120 
4121   const Uint32 senderRef = signal->theData[0];
4122   const Uint32 inclNode  = signal->theData[1];
4123 
4124   NodePtr node;
4125   for(c_nodes.first(node); node.i != RNIL; c_nodes.next(node)) {
4126     jam();
4127     const Uint32 nodeId = node.p->nodeId;
4128     if(inclNode == nodeId){
4129       jam();
4130 
4131       ndbrequire(node.p->alive == 0);
4132       ndbrequire(!c_aliveNodes.get(nodeId));
4133 
4134       node.p->alive = 1;
4135       c_aliveNodes.set(nodeId);
4136 
4137       break;
4138     }//if
4139   }//for
4140   signal->theData[0] = inclNode;
4141   signal->theData[1] = reference();
4142   sendSignal(senderRef, GSN_INCL_NODECONF, signal, 2, JBB);
4143 }
4144 
4145 /*****************************************************************************
4146  *
4147  * Master functionality - Define backup
4148  *
4149  * Backup master = BACKUP instance 1 (LDM1) on master node.
4150  * Backup master receives BACKUP_REQ and sends control signals to all slaves
4151  * for mt-backup, slaves = all BACKUP instances (all LDMs) on all nodes
4152  * for st-backup, slaves = BACKUP 1 (LDM1) on all nodes
4153  *
4154  * File thread: A file-thread signal train of FSAPPENDREQ/FSAPPENDCONF is
4155  * started for each backup file, i.e. one train each for the ctl, data and log
4156  * file. The file-thread signal trains interleave with BACKUP-related signals
4157  * on each slave thread. The BACKUP-related signals write data to dataBuffers
4158  * as needed, using sendSignalWithDelay loops to wait in case a dataBuffer is
4159  * not accepting writes. Each file-thread signal picks up data from its
4160  * dataBuffer and writes it to the file.
4161  *
4162  * Control signals
4163  * 1) DEFINE_BACKUP_REQ
4164  * - seize BackupRecord, alloc and init file ptrs
4165  * - send LIST_TABLES_REQ to DICT to get table info to create tablemap
4166  * - send FSOPENREQ to open ctl, data and logfiles
4167  * - write file headers for ctl, data and logfiles
4168  * - start ctl file thread and write table list to ctl file
4169  * - get table info for each table, save in thread-local list
4170  * - lock tables
4171  * - get frag counts for each table + frag info for each frag on each table,
4172  *   save in thread-local list
4173  * - reply to sender with DEFINE_BACKUP_CONF
4174  *
4175  * 2) START_BACKUP_REQ
4176  * - start file threads for data and log files
4177  * - tell DBTUP to create triggers for logfile writes
4178  * - reply to sender with START_BACKUP_CONF
4179  *
4180  * 3) BACKUP_FRAGMENT_REQ
4181  * - send SCAN_FRAGREQ to LQH to start scan
4182  * - on receiving SCAN_FRAGCONF, reply to master with BACKUP_FRAGMENT_CONF
4183  *
4184  * 4) STOP_BACKUP_REQ
4185  * - drop all triggers in TUP
4186  * - insert footers in ctl and log files
4187  * - unlock tables
4188  * - close all files
4189  * - reply to sender with STOP_BACKUP_CONF
4190  *
4191  * 5) ABORT_BACKUP_ORD
4192  * - unlock tables
4193  * - release file pages, file ptrs, thread-local lists of frag info, table data
4194  * - release BackupRecord
4195  *
4196  *****************************************************************************/
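
/**
 * A condensed sketch of the happy-path signal flow described above
 * (master on the left, slave BACKUP instances on the right):
 *
 *   BACKUP_REQ ->
 *     UTIL_SEQUENCE_REQ/CONF                                 (get backupId)
 *     DEFINE_BACKUP_REQ   -> slaves -> DEFINE_BACKUP_CONF
 *     START_BACKUP_REQ    -> slaves -> START_BACKUP_CONF
 *     BACKUP_FRAGMENT_REQ -> slaves -> BACKUP_FRAGMENT_CONF  (per fragment)
 *     STOP_BACKUP_REQ     -> slaves -> STOP_BACKUP_CONF
 *   -> BACKUP_COMPLETE_REP to client
 *
 * with ABORT_BACKUP_ORD cutting in at any point on errors or node
 * failures.
 */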
4197 
4198 void
4199 Backup::execBACKUP_REQ(Signal* signal)
4200 {
4201   jamEntry();
4202   BackupReq * req = (BackupReq*)signal->getDataPtr();
4203 
4204   const Uint32 senderData = req->senderData;
4205   const BlockReference senderRef = signal->senderBlockRef();
4206   const Uint32 dataLen32 = req->backupDataLen; // In 32 bit words
4207   const Uint32 flags = signal->getLength() > 2 ? req->flags : 2;
4208   const Uint32 input_backupId = signal->getLength() > 3 ? req->inputBackupId : 0;
4209 
4210   if (getOwnNodeId() != getMasterNodeId())
4211   {
4212     jam();
4213     sendBackupRef(senderRef, flags, signal, senderData,
4214                   BackupRef::IAmNotMaster);
4215     return;
4216   }//if
4217 
4218   if (c_defaults.m_diskless)
4219   {
4220     jam();
4221     sendBackupRef(senderRef, flags, signal, senderData,
4222 		  BackupRef::CannotBackupDiskless);
4223     return;
4224   }
4225 
4226   if (dataLen32 != 0)
4227   {
4228     jam();
4229     sendBackupRef(senderRef, flags, signal, senderData,
4230 		  BackupRef::BackupDefinitionNotImplemented);
4231     return;
4232   }//if
4233 
4234 #ifdef DEBUG_ABORT
4235   dumpUsedResources();
4236 #endif
4237   /**
4238    * Seize a backup record
4239    */
4240   BackupRecordPtr ptr;
4241   c_backups.seizeFirst(ptr);
4242   if (ptr.i == RNIL)
4243   {
4244     jam();
4245     sendBackupRef(senderRef, flags, signal, senderData,
4246                   BackupRef::OutOfBackupRecord);
4247     return;
4248   }//if
4249 
4250   ndbrequire(ptr.p->tables.isEmpty());
4251 
4252   ptr.p->m_gsn = 0;
4253   ptr.p->errorCode = 0;
4254   ptr.p->clientRef = senderRef;
4255   ptr.p->clientData = senderData;
4256   ptr.p->flags = flags;
4257   ptr.p->masterRef = reference();
4258   ptr.p->nodes = c_aliveNodes;
4259 
4260   Uint32 node = ptr.p->nodes.find_first();
4261   Uint32 version = getNodeInfo(getOwnNodeId()).m_version;
4262   ptr.p->idleFragWorkerCount = 0;
4263   while(node != NdbNodeBitmask::NotFound)
4264   {
4265     const NodeInfo nodeInfo = getNodeInfo(node);
4266     // setup fragWorkers[] for master to control BACKUP_FRAGMENT_REQs
4267     ptr.p->fragWorkers[node].clear();
4268     Uint32 ldmCount = nodeInfo.m_lqh_workers;
4269     ldmCount += (nodeInfo.m_lqh_workers == 0); // set LDM1 as worker for ndbd
4270 
4271     for(Uint32 i=0; i<=ldmCount; i++)
4272       ptr.p->fragWorkers[node].set(i);
4273 
4274     ptr.p->idleFragWorkerCount += ldmCount;
4275 
4276     // Only support multithreaded backup if all nodes have multiple LDMs
4277     if (ldmCount <= 1 && (m_cfg_mt_backup > 0))
4278     {
4279      /* The MT_BACKUP flag is set to false in these
4280       * cases:
4281       * - ndbds
4282       * - ndbmtds with only one LDM worker
4283       */
4284       m_cfg_mt_backup = 0;
4285       g_eventLogger->info("Running single-threaded backup since node %u has only one LDM", node);
4286     }
4287     if (getNodeInfo(node).m_version != version)
4288     {
4289       jam();
4290       g_eventLogger->info("Detected incompatible versions, aborting backup");
4291       ptr.p->setErrorCode(AbortBackupOrd::IncompatibleVersions);
4292       sendBackupRef(senderRef, flags, signal, senderData,
4293                     BackupRef::BackupDuringUpgradeUnsupported);
4294       // clean up backup state
4295       ptr.p->m_gsn = 0;
4296       ptr.p->masterData.gsn = 0;
4297       c_backups.release(ptr);
4298       return;
4299     }
4300 
4301     node = ptr.p->nodes.find_next(node+1);
4302   }
4303 
4304   if (m_cfg_mt_backup)
4305   {
4306     /* Exec backup using all LDMs. To perform a backup, a BACKUP
4307      * block must receive all these signals from master:
4308      * 1) DEFINE_BACKUP_REQ, START_BACKUP_REQ, STOP_BACKUP_REQ to set
4309      *   up and clean up filesets, file-write signal 'threads', triggers,
4310      *   table locks, and to fetch metadata and write CTL and LOG files
4311      * 2) BACKUP_FRAGMENT_REQs to write fragments to data file, master must
4312      *    assign frags to LDMs by sending BACKUP_FRAGMENT_REQs
4313      * 3) ABORT_BACKUP_ORD for failure-handling and cleanup
4314      * If all these signals are received by an LDM, that LDM will independently
4315      * execute a backup and write a restorable backup fileset.
4316      *
4317      * With MT_BACKUP enabled, all these signals will be sent to all
4318      * LDMs on each node.
4319      *
4320      * With MT_BACKUP disabled, the node performs a single-threaded backup.
4321      * In a single-threaded backup, all these signals are sent to LDM1 on each
4322      * node. The remaining BACKUP instances do not participate in the backup.
4323      */
4324 
4325     ptr.p->flags |= BackupReq::MT_BACKUP;
4326   }
4327 
4328   if (input_backupId)
4329   {
4330     jam();
4331     ptr.p->backupId = input_backupId;
4332   }
4333   else
4334   {
4335     jam();
4336     ptr.p->backupId = 0;
4337   }
4338   ptr.p->backupKey[0] = 0;
4339   ptr.p->backupKey[1] = 0;
4340   ptr.p->backupDataLen = 0;
4341   ptr.p->masterData.errorCode = 0;
4342 
4343   ptr.p->masterData.sequence.retriesLeft = 3;
4344   sendUtilSequenceReq(signal, ptr);
4345 }
4346 
4347 void
4348 Backup::sendUtilSequenceReq(Signal* signal, BackupRecordPtr ptr, Uint32 delay)
4349 {
4350   jam();
4351 
4352   UtilSequenceReq * utilReq = (UtilSequenceReq*)signal->getDataPtrSend();
4353   ptr.p->masterData.gsn = GSN_UTIL_SEQUENCE_REQ;
4354   utilReq->senderData  = ptr.i;
4355   utilReq->sequenceId  = NDB_BACKUP_SEQUENCE;
4356 
4357   if (ptr.p->backupId)
4358   {
4359     jam();
4360     utilReq->requestType = UtilSequenceReq::SetVal;
4361     utilReq->value = ptr.p->backupId;
4362   }
4363   else
4364   {
4365     jam();
4366     utilReq->requestType = UtilSequenceReq::NextVal;
4367   }
4368 
4369   if (delay == 0)
4370   {
4371     jam();
4372     sendSignal(DBUTIL_REF, GSN_UTIL_SEQUENCE_REQ,
4373                signal, UtilSequenceReq::SignalLength, JBB);
4374   }
4375   else
4376   {
4377     jam();
4378     sendSignalWithDelay(DBUTIL_REF, GSN_UTIL_SEQUENCE_REQ,
4379                         signal, delay, UtilSequenceReq::SignalLength);
4380   }
4381 }

void
Backup::execUTIL_SEQUENCE_REF(Signal* signal)
{
  jamEntry();
  BackupRecordPtr ptr;
  UtilSequenceRef * utilRef = (UtilSequenceRef*)signal->getDataPtr();
  ptr.i = utilRef->senderData;
  c_backupPool.getPtr(ptr);
  ndbrequire(ptr.p->masterData.gsn == GSN_UTIL_SEQUENCE_REQ);

  if (utilRef->errorCode == UtilSequenceRef::TCError)
  {
    jam();
    if (ptr.p->masterData.sequence.retriesLeft > 0)
    {
      jam();
      infoEvent("BACKUP: retrying sequence on error %u",
                utilRef->TCErrorCode);
      ptr.p->masterData.sequence.retriesLeft--;
      sendUtilSequenceReq(signal, ptr, 300);
      return;
    }
  }
  warningEvent("BACKUP: aborting due to sequence error (%u, %u)",
               utilRef->errorCode,
               utilRef->TCErrorCode);

  sendBackupRef(signal, ptr, BackupRef::SequenceFailure);
}//execUTIL_SEQUENCE_REF()

void
Backup::sendBackupRef(Signal* signal, BackupRecordPtr ptr, Uint32 errorCode)
{
  jam();
  sendBackupRef(ptr.p->clientRef, ptr.p->flags, signal,
                ptr.p->clientData, errorCode);
  cleanup(signal, ptr);
}

void
Backup::sendBackupRef(BlockReference senderRef, Uint32 flags, Signal *signal,
                      Uint32 senderData, Uint32 errorCode)
{
  jam();
  if (SEND_BACKUP_STARTED_FLAG(flags))
  {
    jam();
    BackupRef* ref = (BackupRef*)signal->getDataPtrSend();
    ref->senderData = senderData;
    ref->errorCode = errorCode;
    ref->masterRef = numberToRef(BACKUP, getMasterNodeId());
    sendSignal(senderRef, GSN_BACKUP_REF, signal, BackupRef::SignalLength, JBB);
  }

  if (errorCode != BackupRef::IAmNotMaster)
  {
    jam();
    signal->theData[0] = NDB_LE_BackupFailedToStart;
    signal->theData[1] = senderRef;
    signal->theData[2] = errorCode;
    sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
  }
}

void
Backup::execUTIL_SEQUENCE_CONF(Signal* signal)
{
  jamEntry();

  UtilSequenceConf * conf = (UtilSequenceConf*)signal->getDataPtr();

  if (conf->requestType == UtilSequenceReq::Create)
  {
    jam();
    sendSTTORRY(signal); // At startup in NDB
    return;
  }

  BackupRecordPtr ptr;
  ptr.i = conf->senderData;
  c_backupPool.getPtr(ptr);

  ndbrequire(ptr.p->masterData.gsn == GSN_UTIL_SEQUENCE_REQ);

  if (ptr.p->checkError())
  {
    jam();
    sendBackupRef(signal, ptr, ptr.p->errorCode);
    return;
  }//if

  if (ERROR_INSERTED(10023))
  {
    sendBackupRef(signal, ptr, 323);
    return;
  }//if

  if (!ptr.p->backupId && conf->requestType != UtilSequenceReq::SetVal)
  {
    Uint64 backupId;
    memcpy(&backupId, conf->sequenceValue, 8);
    ptr.p->backupId = (Uint32)backupId;
  }

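  /* The backup key is a light fingerprint of this backup instance:
   * word 0 packs the master node id (high 16 bits) with the low 16 bits
   * of the backup id, word 1 is a millisecond timestamp. It is passed to
   * all participants in DEFINE_BACKUP_REQ, presumably so that stray
   * signals from an older backup that reused the same id can be told
   * apart.
   */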
  ptr.p->backupKey[0] = (getOwnNodeId() << 16) | (ptr.p->backupId & 0xFFFF);
  ptr.p->backupKey[1] = Uint32(NdbTick_CurrentMillisecond());

  ptr.p->masterData.gsn = GSN_UTIL_LOCK_REQ;
  Mutex mutex(signal, c_mutexMgr, ptr.p->masterData.m_defineBackupMutex);
  Callback c = { safe_cast(&Backup::defineBackupMutex_locked), ptr.i };
  ndbrequire(mutex.lock(c));

  return;
}

void
Backup::defineBackupMutex_locked(Signal* signal, Uint32 ptrI, Uint32 retVal)
{
  jamEntry();
  ndbrequire(retVal == 0);

  BackupRecordPtr ptr;
  ptr.i = ptrI;
  c_backupPool.getPtr(ptr);

  ndbrequire(ptr.p->masterData.gsn == GSN_UTIL_LOCK_REQ);

  ptr.p->masterData.gsn = GSN_UTIL_LOCK_REQ;
  Mutex mutex(signal, c_mutexMgr, ptr.p->masterData.m_dictCommitTableMutex);
  Callback c = { safe_cast(&Backup::dictCommitTableMutex_locked), ptr.i };
  ndbrequire(mutex.lock(c));
}
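
/* The two global mutexes are always taken in the same order, first the
 * define-backup mutex and then the DICT commit-table mutex, and they are
 * released in reverse order (see the error path below and
 * defineBackupReply()). Keeping a fixed order avoids lock-order
 * deadlocks with other users of the same mutex pair.
 */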

void
Backup::dictCommitTableMutex_locked(Signal* signal, Uint32 ptrI, Uint32 retVal)
{
  jamEntry();
  ndbrequire(retVal == 0);

  /**
   * We now hold both mutexes
   */
  BackupRecordPtr ptr;
  ptr.i = ptrI;
  c_backupPool.getPtr(ptr);

  ndbrequire(ptr.p->masterData.gsn == GSN_UTIL_LOCK_REQ);

  if (ERROR_INSERTED(10031)) {
    ptr.p->setErrorCode(331);
  }//if

  if (ptr.p->checkError())
  {
    jam();

    /**
     * Unlock mutexes
     */
    jam();
    Mutex mutex1(signal, c_mutexMgr, ptr.p->masterData.m_dictCommitTableMutex);
    jam();
    mutex1.unlock(); // ignore response

    jam();
    Mutex mutex2(signal, c_mutexMgr, ptr.p->masterData.m_defineBackupMutex);
    jam();
    mutex2.unlock(); // ignore response

    sendBackupRef(signal, ptr, ptr.p->errorCode);
    return;
  }//if

  sendDefineBackupReq(signal, ptr);
}

/*****************************************************************************
 *
 * Master functionality - Define backup cont'd (from now on all slaves are in)
 *
 *****************************************************************************/

bool
Backup::haveAllSignals(BackupRecordPtr ptr, Uint32 gsn, Uint32 nodeId)
{
  ndbrequire(ptr.p->masterRef == reference());
  ndbrequire(ptr.p->masterData.gsn == gsn);
  ndbrequire(!ptr.p->masterData.sendCounter.done());
  if (ptr.p->masterData.sendCounter.isWaitingFor(nodeId))
  {
    ptr.p->masterData.sendCounter.clearWaitingFor(nodeId);
  }
  else
  {
    ndbrequire(ptr.p->errorCode == AbortBackupOrd::BackupFailureDueToNodeFail);
    if (ERROR_INSERTED(10051) || ERROR_INSERTED(10052) ||
        ERROR_INSERTED(10053))
    {
      ndbout_c("Received duplicate signal from non-master node %u for gsn %u",
               nodeId, gsn);
      CLEAR_ERROR_INSERT_VALUE;
    }
  }
  return ptr.p->masterData.sendCounter.done();
}
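
/* Typical master-side usage, the pattern followed by the *Reply()
 * functions below (a sketch, not new code):
 *
 *   if (!haveAllSignals(ptr, GSN_DEFINE_BACKUP_REQ, nodeId)) {
 *     jam();
 *     return;            // still waiting for more participants
 *   }
 *   // all CONF/REFs collected: advance to the next protocol phase
 */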

void
Backup::sendDefineBackupReq(Signal *signal, BackupRecordPtr ptr)
{
  /**
   * Sending define backup to all participants
   */
  DefineBackupReq * req = (DefineBackupReq*)signal->getDataPtrSend();
  req->backupId = ptr.p->backupId;
  req->clientRef = ptr.p->clientRef;
  req->clientData = ptr.p->clientData;
  req->senderRef = reference();
  req->masterRef = reference();
  req->backupPtr = ptr.i;
  req->backupKey[0] = ptr.p->backupKey[0];
  req->backupKey[1] = ptr.p->backupKey[1];
  req->backupDataLen = ptr.p->backupDataLen;
  req->flags = ptr.p->flags;

  /**
   * If the backup is multithreaded, DEFINE_BACKUP_REQ is sent to the
   * BackupProxy on all nodes. The BackupProxy forwards the REQ to all
   * LDMs, collects the CONF/REFs and replies to the master. N backup
   * filesets are created per node, N = #ldms.
   *
   * If the backup is not multithreaded, DEFINE_BACKUP_REQ is sent only
   * to LDM 1 on all nodes. Only 1 backup fileset is created per node.
   *
   * instanceKey() selects the instance to send the signal to:
   * - for LCP, send to self
   * - for single-threaded backup: only one LDM thread, send to that thread
   * - for multithreaded backup, send to the BackupProxy LDM0, which then
   *   broadcasts the signal to all the LDMs on its node
   *
   * On receiving DEFINE_BACKUP_REQ, the BACKUP block creates a
   * backup fileset, queries DICT+DIH for table info, locks tables,
   * and writes table metadata into the CTL file in its fileset.
   */
  ptr.p->masterData.gsn = GSN_DEFINE_BACKUP_REQ;
  ptr.p->masterData.sendCounter = ptr.p->nodes;
  Uint32 recNode = 0;
  const Uint32 packed_length = ptr.p->nodes.getPackedLengthInWords();

  NdbNodeBitmask nodes = ptr.p->nodes;
  while ((recNode = nodes.find(recNode + 1)) != NdbNodeBitmask::NotFound)
  {
    const Uint32 ref = numberToRef(BACKUP, instanceKey(ptr), recNode);

    // Backup is not allowed for data nodes with mixed versions
    ndbrequire(ndbd_send_node_bitmask_in_section(getNodeInfo(recNode).m_version));

    LinearSectionPtr lsptr[3];
    lsptr[0].p = nodes.rep.data;
    lsptr[0].sz = packed_length;
    sendSignal(ref, GSN_DEFINE_BACKUP_REQ, signal,
        DefineBackupReq::SignalLength_v1, JBB, lsptr, 1);
  }
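
  /* The participant node bitmask no longer fits in the fixed part of the
   * signal, so it travels as linear section 0 (nodes.rep.data above); the
   * ndbrequire on ndbd_send_node_bitmask_in_section() guarantees every
   * receiver is recent enough to unpack it.
   */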

  /**
   * Now send backup data
   */
  const Uint32 len = ptr.p->backupDataLen;
  if (len == 0) {
    /**
     * No data to send
     */
    jam();
    return;
  }//if

  /**
   * Not implemented
   */
  ndbabort();
}

void
Backup::execDEFINE_BACKUP_REF(Signal* signal)
{
  jamEntry();

  DefineBackupRef* ref = (DefineBackupRef*)signal->getDataPtr();

  const Uint32 ptrI = ref->backupPtr;
  //const Uint32 backupId = ref->backupId;
  const Uint32 nodeId = ref->nodeId;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  ptr.p->setErrorCode(ref->errorCode);
  defineBackupReply(signal, ptr, nodeId);
}

void
Backup::execDEFINE_BACKUP_CONF(Signal* signal)
{
  jamEntry();

  DefineBackupConf* conf = (DefineBackupConf*)signal->getDataPtr();
  const Uint32 ptrI = conf->backupPtr;
  //const Uint32 backupId = conf->backupId;
  const Uint32 nodeId = refToNode(signal->senderBlockRef());

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  if (ERROR_INSERTED(10024))
  {
    ptr.p->setErrorCode(324);
  }

  defineBackupReply(signal, ptr, nodeId);
}

void
Backup::defineBackupReply(Signal* signal, BackupRecordPtr ptr, Uint32 nodeId)
{
  if (ERROR_INSERTED(10051))
  {
    if (nodeId == getOwnNodeId())
    {
      jam();
      ndbrequire(ptr.p->errorCode == 0);
      // Delay reply from self so that master waits for DEFINE_BACKUP_REFs
      sendSignalWithDelay(reference(), GSN_DEFINE_BACKUP_CONF, signal,
                          5000, signal->getLength());
      return;
    }
    else
    {
      // Received DEFINE_BACKUP_REF/CONF from node n1, now crash n1. This will
      // trigger node-failure handling where the master sends
      // DEFINE_BACKUP_REF to self on behalf of n1. So the master receives
      // 2 REFs from n1.
      signal->theData[0] = 9999;
      sendSignal(numberToRef(CMVMI, nodeId), GSN_NDB_TAMPER, signal, 1, JBB);
    }
  }
  if (!haveAllSignals(ptr, GSN_DEFINE_BACKUP_REQ, nodeId)) {
    jam();
    return;
  }

  /**
   * Unlock mutexes
   */
  jam();
  Mutex mutex1(signal, c_mutexMgr, ptr.p->masterData.m_dictCommitTableMutex);
  jam();
  mutex1.unlock(); // ignore response

  jam();
  Mutex mutex2(signal, c_mutexMgr, ptr.p->masterData.m_defineBackupMutex);
  jam();
  mutex2.unlock(); // ignore response

  if (ptr.p->checkError())
  {
    jam();
    masterAbort(signal, ptr);
    return;
  }

  CRASH_INSERTION((10034));

  /**
   * We've received GSN_DEFINE_BACKUP_CONF from all participants.
   *
   * Our next step is to send START_BACKUP_REQ to all participants,
   * who will then send CREATE_TRIG_REQ for all tables to their local
   * DBTUP.
   */
  TablePtr tabPtr;
  ptr.p->tables.first(tabPtr);

  sendStartBackup(signal, ptr, tabPtr);
}

/*****************************************************************************
 *
 * Master functionality - Prepare triggers
 *
 *****************************************************************************/
void
Backup::createAttributeMask(TablePtr tabPtr,
                            Bitmask<MAXNROFATTRIBUTESINWORDS> & mask)
{
  mask.clear();
  for (Uint32 i = 0; i < tabPtr.p->noOfAttributes; i++)
    mask.set(i);
}

void
Backup::sendCreateTrig(Signal* signal,
                       BackupRecordPtr ptr, TablePtr tabPtr)
{
  CreateTrigImplReq* req = (CreateTrigImplReq*)signal->getDataPtr();

  /*
   * First, setup the structures
   */
  OperationRecord* operation =
    &ptr.p->files.getPtr(ptr.p->logFilePtr)->operation;
  operation->noOfBytes = 0;
  operation->noOfRecords = 0;

  for (Uint32 j = 0; j < 3; j++) {
    jam();

    TriggerPtr trigPtr;
    if (!ptr.p->triggers.seizeFirst(trigPtr)) {
      jam();
      ptr.p->m_gsn = GSN_START_BACKUP_REF;
      StartBackupRef* ref = (StartBackupRef*)signal->getDataPtrSend();
      ref->backupPtr = ptr.i;
      ref->backupId = ptr.p->backupId;
      ref->errorCode = StartBackupRef::FailedToAllocateTriggerRecord;
      ref->nodeId = getOwnNodeId();
      sendSignal(ptr.p->senderRef, GSN_START_BACKUP_REF, signal,
                 StartBackupRef::SignalLength, JBB);
      return;
    } // if

    const Uint32 triggerId = trigPtr.i;
    tabPtr.p->triggerIds[j] = triggerId;
    tabPtr.p->triggerAllocated[j] = true;
    trigPtr.p->backupPtr = ptr.i;
    trigPtr.p->tableId = tabPtr.p->tableId;
    trigPtr.p->tab_ptr_i = tabPtr.i;
    trigPtr.p->logEntry = 0;
    trigPtr.p->event = j;
    trigPtr.p->operation = operation;
    trigPtr.p->errorCode = 0;
  } // for

  /*
   * Now ask DBTUP to create the triggers
   */
  ptr.p->slaveData.gsn = GSN_CREATE_TRIG_IMPL_REQ;
  ptr.p->slaveData.trigSendCounter = 3;
  ptr.p->slaveData.createTrig.tableId = tabPtr.p->tableId;

  req->senderRef = reference();
  req->receiverRef = reference();
  req->senderData = ptr.i;
  req->requestType = 0;

  Bitmask<MAXNROFATTRIBUTESINWORDS> attrMask;
  createAttributeMask(tabPtr, attrMask);

  req->tableId = tabPtr.p->tableId;
  req->tableVersion = 0;
  req->indexId = RNIL;
  req->indexVersion = 0;

  Uint32 ti = 0;
  /*
   * We always send the PK for all operations and all trigger types.
   * For SUBSCRIPTION_BEFORE
   *   We send the after image for INSERT.
   *   We send the before image for DELETE.
   *   We send before+after images for UPDATE.
   * For SUBSCRIPTION
   *   We send the after image for INSERT.
   *   We send only the PK for DELETE.
   *   We send the after image for UPDATE.
   */
  if (ptr.p->flags & BackupReq::USE_UNDO_LOG)
    TriggerInfo::setTriggerType(ti, TriggerType::SUBSCRIPTION_BEFORE);
  else
    TriggerInfo::setTriggerType(ti, TriggerType::SUBSCRIPTION);
  TriggerInfo::setTriggerActionTime(ti, TriggerActionTime::TA_DETACHED);
  TriggerInfo::setMonitorReplicas(ti, true);
  TriggerInfo::setMonitorAllAttributes(ti, false);

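  /* The packed triggerInfo word is cloned once per event type below:
   * the same base flags with only the trigger event (INSERT, UPDATE or
   * DELETE) varying, giving one CREATE_TRIG_IMPL_REQ per event.
   */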
  for (int i = 0; i < 3; i++) {
    req->triggerId = tabPtr.p->triggerIds[i];

    Uint32 ti2 = ti;
    TriggerInfo::setTriggerEvent(ti2, triggerEventValues[i]);
    req->triggerInfo = ti2;

    LinearSectionPtr attrPtr[3];
    attrPtr[0].p = attrMask.rep.data;
    attrPtr[0].sz = attrMask.getSizeInWords();

    if (MT_BACKUP_FLAG(ptr.p->flags))
    {
      // In mt-backup, the backup log is divided between LDMs. Each
      // BACKUP block writes insert/update/delete logs for the tuples it
      // owns, and sends its CREATE_TRIG_IMPL_REQs only to its local DBTUP.
      // Each DBTUP processes changes on its own fragments and sends
      // FIRE_TRIG_ORDs to its local BACKUP block. Since one DBTUP instance
      // has no knowledge of changes in other DBTUPs, this ensures that a
      // BACKUP block receives FIRE_TRIG_ORDs for all changes on frags owned
      // by its LDM, and for no other frags.
      BlockReference ref = numberToRef(DBTUP, instance(), getOwnNodeId());
      sendSignal(ref, GSN_CREATE_TRIG_IMPL_REQ,
          signal, CreateTrigImplReq::SignalLength, JBB, attrPtr, 1);
    }
    else
    {
      // In single-threaded backup, the BACKUP block on LDM1 sends
      // CREATE_TRIG_IMPL_REQs for insert/update/delete on all tables to the
      // DbtupProxy. The DbtupProxy broadcasts the CREATE_TRIG to all LDMs.
      // So for every insert/update/delete, the DBTUP which owns the modified
      // fragment sends a FIRE_TRIG_ORD to the trigger creator on LDM1. When
      // the BACKUP block receives a FIRE_TRIG_ORD, it extracts the details of
      // the insert/update/delete and writes it to the backup log.
      sendSignal(DBTUP_REF, GSN_CREATE_TRIG_IMPL_REQ,
          signal, CreateTrigImplReq::SignalLength, JBB, attrPtr, 1);
    }
  }
}

void
Backup::execCREATE_TRIG_IMPL_CONF(Signal* signal)
{
  jamEntry();
  const CreateTrigImplConf* conf =
    (const CreateTrigImplConf*)signal->getDataPtr();

  const Uint32 ptrI = conf->senderData;
  const Uint32 tableId = conf->tableId;
  const TriggerEvent::Value type =
    TriggerInfo::getTriggerEvent(conf->triggerInfo);

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  /**
   * Verify that I'm waiting for this conf
   *
   * ptr.p->masterRef != reference()
   * as slaves and masters have triggers now.
   */
  ndbrequire(ptr.p->slaveData.gsn == GSN_CREATE_TRIG_IMPL_REQ);
  ndbrequire(ptr.p->slaveData.trigSendCounter.done() == false);
  ndbrequire(ptr.p->slaveData.createTrig.tableId == tableId);

  TablePtr tabPtr;
  ndbrequire(findTable(ptr, tabPtr, tableId));
  ndbrequire(type < 3); // in case someone decides to change the enums

  createTrigReply(signal, ptr);
}

void
Backup::execCREATE_TRIG_IMPL_REF(Signal* signal)
{
  jamEntry();
  const CreateTrigImplRef* ref =
    (const CreateTrigImplRef*)signal->getDataPtr();

  const Uint32 ptrI = ref->senderData;
  const Uint32 tableId = ref->tableId;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  /**
   * Verify that I'm waiting for this ref
   *
   * ptr.p->masterRef != reference()
   * as slaves and masters have triggers now
   */
  ndbrequire(ptr.p->slaveData.gsn == GSN_CREATE_TRIG_IMPL_REQ);
  ndbrequire(ptr.p->slaveData.trigSendCounter.done() == false);
  ndbrequire(ptr.p->slaveData.createTrig.tableId == tableId);

  ptr.p->setErrorCode(ref->errorCode);

  createTrigReply(signal, ptr);
}

void
Backup::createTrigReply(Signal* signal, BackupRecordPtr ptr)
{
  CRASH_INSERTION(10003);

  /**
   * Check whether we are finished with this table
   */
  ptr.p->slaveData.trigSendCounter--;
  if (ptr.p->slaveData.trigSendCounter.done() == false) {
    jam();
    return;
  }//if

  if (ERROR_INSERTED(10025))
  {
    ptr.p->errorCode = 325;
  }

  if (ptr.p->checkError()) {
    jam();
    ptr.p->m_gsn = GSN_START_BACKUP_REF;
    StartBackupRef* ref = (StartBackupRef*)signal->getDataPtrSend();
    ref->backupPtr = ptr.i;
    ref->backupId = ptr.p->backupId;
    ref->errorCode = ptr.p->errorCode;
    ref->nodeId = getOwnNodeId();
    ndbout_c("Backup::createTrigReply : CREATE_TRIG_IMPL error %d, backup id %u node %d",
             ref->errorCode, ref->backupId, ref->nodeId);
    sendSignal(ptr.p->senderRef, GSN_START_BACKUP_REF, signal,
               StartBackupRef::SignalLength, JBB);
    return;
  }//if

  TablePtr tabPtr;
  ndbrequire(findTable(ptr, tabPtr, ptr.p->slaveData.createTrig.tableId));

  /**
   * Next table
   */
  ptr.p->tables.next(tabPtr);
  if (tabPtr.i != RNIL) {
    jam();
    sendCreateTrig(signal, ptr, tabPtr);
    return;
  }//if

  /**
   * We've finished creating triggers.
   *
   * Send the conf and wait
   */
  ptr.p->m_gsn = GSN_START_BACKUP_CONF;
  StartBackupConf* conf = (StartBackupConf*)signal->getDataPtrSend();
  conf->backupPtr = ptr.i;
  conf->backupId = ptr.p->backupId;
  sendSignal(ptr.p->senderRef, GSN_START_BACKUP_CONF, signal,
             StartBackupConf::SignalLength, JBB);
}

/*****************************************************************************
 *
 * Master functionality - Start backup
 *
 *****************************************************************************/
void
Backup::sendStartBackup(Signal* signal, BackupRecordPtr ptr, TablePtr tabPtr)
{
  ptr.p->masterData.startBackup.tablePtr = tabPtr.i;

  StartBackupReq* req = (StartBackupReq*)signal->getDataPtrSend();
  req->backupId = ptr.p->backupId;
  req->backupPtr = ptr.i;
  req->senderRef = reference();
  /**
   * We use trigger Ids that are unique to BACKUP.
   * These don't interfere with other triggers (e.g. from DBDICT)
   * as there is a special case in DBTUP.
   *
   * Consequently, backups during online upgrade won't work
   */
  ptr.p->masterData.gsn = GSN_START_BACKUP_REQ;
  ptr.p->masterData.sendCounter = ptr.p->nodes;
  BlockNumber backupBlockNo = numberToBlock(BACKUP, instanceKey(ptr));
  NodeReceiverGroup rg(backupBlockNo, ptr.p->nodes);
  sendSignal(rg, GSN_START_BACKUP_REQ, signal,
             StartBackupReq::SignalLength, JBB);
}
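
/* NodeReceiverGroup expands to one copy of the signal per node in
 * ptr.p->nodes, all addressed to the same BACKUP block/instance number,
 * so the single sendSignal() above reaches every participant.
 */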

void
Backup::execSTART_BACKUP_REF(Signal* signal)
{
  jamEntry();

  StartBackupRef* ref = (StartBackupRef*)signal->getDataPtr();
  const Uint32 ptrI = ref->backupPtr;
  //const Uint32 backupId = ref->backupId;
  const Uint32 nodeId = ref->nodeId;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  ptr.p->setErrorCode(ref->errorCode);
  startBackupReply(signal, ptr, nodeId);
}

void
Backup::execSTART_BACKUP_CONF(Signal* signal)
{
  jamEntry();

  StartBackupConf* conf = (StartBackupConf*)signal->getDataPtr();
  const Uint32 ptrI = conf->backupPtr;
  //const Uint32 backupId = conf->backupId;
  const Uint32 nodeId = refToNode(signal->senderBlockRef());

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  startBackupReply(signal, ptr, nodeId);
}

void
Backup::startBackupReply(Signal* signal, BackupRecordPtr ptr, Uint32 nodeId)
{
  if (ERROR_INSERTED(10052))
  {
    if (nodeId == getOwnNodeId())
    {
      jam();
      ndbrequire(ptr.p->errorCode == 0);
      // Delay reply from self so that master waits for START_BACKUP_REFs
      sendSignalWithDelay(reference(), GSN_START_BACKUP_CONF, signal,
                          5000, signal->getLength());
      return;
    }
    else
    {
      // Received START_BACKUP_REF/CONF from node n1, now crash n1. This will
      // trigger node-failure handling where the master sends
      // START_BACKUP_REF to self on behalf of n1. So the master receives
      // 2 REFs from n1.
      signal->theData[0] = 9999;
      sendSignal(numberToRef(CMVMI, nodeId), GSN_NDB_TAMPER, signal, 1, JBB);
    }
  }

  CRASH_INSERTION((10004));

  if (!haveAllSignals(ptr, GSN_START_BACKUP_REQ, nodeId)) {
    jam();
    return;
  }

  if (ERROR_INSERTED(10026))
  {
    ptr.p->errorCode = 326;
  }

  if (ptr.p->checkError()) {
    jam();
    masterAbort(signal, ptr);
    return;
  }

  /*
   * We reply to the client after trigger creation
   */
  if (SEND_BACKUP_STARTED_FLAG(ptr.p->flags))
  {
    BackupConf * conf = (BackupConf*)signal->getDataPtrSend();
    conf->backupId = ptr.p->backupId;
    conf->senderData = ptr.p->clientData;
    sendSignal(ptr.p->clientRef, GSN_BACKUP_CONF, signal,
               BackupConf::SignalLength, JBB);
  }

  signal->theData[0] = NDB_LE_BackupStarted;
  signal->theData[1] = ptr.p->clientRef;
  signal->theData[2] = ptr.p->backupId;
  // Node bitmask is not used at the receiver, so zero it out.
  NdbNodeBitmask::clear(signal->theData + 3, NdbNodeBitmask48::Size);
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3 + NdbNodeBitmask48::Size, JBB);

  /**
   * Wait for a GCP to establish a consistent point at backup start.
   * This point is consistent because backup logging has started while
   * scans have not yet started, and it must be identified by a GCP.
   * Wait till the current GCP has completed and capture its GCI as the
   * startGCP of this backup.
   * This is needed for SNAPSHOTSTART backups, which are restored to a
   * consistent point at backup start by replaying the backup undo logs up
   * till the end of startGCP.
   */
  ptr.p->masterData.gsn = GSN_WAIT_GCP_REQ;
  ptr.p->masterData.waitGCP.startBackup = true;

  WaitGCPReq * waitGCPReq = (WaitGCPReq*)signal->getDataPtrSend();
  waitGCPReq->senderRef = reference();
  waitGCPReq->senderData = ptr.i;
  waitGCPReq->requestType = WaitGCPReq::CompleteForceStart;
  // Delay 10 seconds so that test cases can generate events to be
  // recorded in the UNDO log
  if (ERROR_INSERTED(10041))
  {
    sendSignalWithDelay(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 10*1000,
                        WaitGCPReq::SignalLength);
  }
  else
    sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
               WaitGCPReq::SignalLength, JBB);
}

void
Backup::execWAIT_GCP_REF(Signal* signal)
{
  jamEntry();

  CRASH_INSERTION((10006));

  WaitGCPRef * ref = (WaitGCPRef*)signal->getDataPtr();
  const Uint32 ptrI = ref->senderData;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  ndbrequire(ptr.p->masterRef == reference());
  ndbrequire(ptr.p->masterData.gsn == GSN_WAIT_GCP_REQ);

  WaitGCPReq * req = (WaitGCPReq*)signal->getDataPtrSend();
  req->senderRef = reference();
  req->senderData = ptr.i;
  req->requestType = WaitGCPReq::CompleteForceStart;
  sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
             WaitGCPReq::SignalLength, JBB);
}

void
Backup::execWAIT_GCP_CONF(Signal* signal)
{
  jamEntry();

  CRASH_INSERTION((10007));

  WaitGCPConf * conf = (WaitGCPConf*)signal->getDataPtr();
  const Uint32 ptrI = conf->senderData;
  const Uint32 gcp = conf->gci_hi;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  ndbrequire(ptr.p->masterRef == reference());
  ndbrequire(ptr.p->masterData.gsn == GSN_WAIT_GCP_REQ);

  if (ptr.p->checkError()) {
    jam();
    masterAbort(signal, ptr);
    return;
  }//if

  if (ptr.p->masterData.waitGCP.startBackup) {
    jam();
    CRASH_INSERTION((10008));
    ptr.p->startGCP = gcp;
    ptr.p->masterData.sendCounter = 0;
    ptr.p->masterData.gsn = GSN_BACKUP_FRAGMENT_REQ;
    nextFragment(signal, ptr);
    return;
  } else {
    jam();
    if (gcp >= ptr.p->startGCP + 3)
    {
      CRASH_INSERTION((10009));
      ptr.p->stopGCP = gcp;
      /**
       * Backup is complete - begin cleanup
       * STOP_BACKUP_REQ is sent to the participants.
       * They then drop the local triggers
       */
      sendStopBackup(signal, ptr);
      return;
    }//if

    /**
     * Make sure that we got the entire stopGCP
     */
    WaitGCPReq * req = (WaitGCPReq*)signal->getDataPtrSend();
    req->senderRef = reference();
    req->senderData = ptr.i;
    req->requestType = WaitGCPReq::CompleteForceStart;
    sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
               WaitGCPReq::SignalLength, JBB);
    return;
  }
}
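
/* An illustrative timeline of the GCP arithmetic above: if the backup
 * captured startGCP = 17, the master keeps re-issuing WAIT_GCP_REQ until
 * the confirmed GCI reaches 20 (>= 17 + 3) and records that as stopGCP,
 * so the backup spans complete epochs on both sides of the fragment
 * scans.
 */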

/*****************************************************************************
 *
 * Master functionality - Backup fragment
 *
 *****************************************************************************/
void
Backup::nextFragment(Signal* signal, BackupRecordPtr ptr)
{
  jam();

  BackupFragmentReq* req = (BackupFragmentReq*)signal->getDataPtrSend();
  req->backupPtr = ptr.i;
  req->backupId = ptr.p->backupId;

  TablePtr tabPtr;
  Uint32 unscanned_frag_count = 0;
  ptr.p->tables.first(tabPtr);
  for (; tabPtr.i != RNIL && ptr.p->idleFragWorkerCount > 0;
       ptr.p->tables.next(tabPtr))
  {
    jam();
    FragmentPtr fragPtr;
    Array<Fragment> & frags = tabPtr.p->fragments;
    const Uint32 fragCount = frags.getSize();

    for (Uint32 i = 0; i < fragCount && ptr.p->idleFragWorkerCount > 0; i++)
    {
      jam();
      tabPtr.p->fragments.getPtr(fragPtr, i);
      const Uint32 nodeId = fragPtr.p->node;
      /* Each frag is owned by a specific LDM on a specific node.
       * The master assigns each frag to an LDM on one of the nodes.
       * Frags are always assigned to nodes which own them, but
       * may be assigned to non-owner LDMs on owner nodes.
       * single-threaded backup -> always assign frag to LDM1
       * multithreaded backup -> assign frag to owner LDM
       * mapFragToLdm() detects the backup type and selects the LDM.
       */
      Uint32 ldm = mapFragToLdm(ptr, nodeId, fragPtr.p->lqhInstanceKey);
      req->tableId = tabPtr.p->tableId;
      req->fragmentNo = i;
      req->count = 0;
      req->senderRef = reference();
      if (fragPtr.p->scanned == 0)
        unscanned_frag_count++;

      if ((fragPtr.p->scanned == 0) && (fragPtr.p->scanning == 0) &&
          (ptr.p->fragWorkers[nodeId].get(ldm)))
      {
        ptr.p->fragWorkers[nodeId].clear(ldm);
        fragPtr.p->scanning = 1;
        ptr.p->idleFragWorkerCount--;
        ptr.p->masterData.sendCounter++;
        BlockReference ref = numberToRef(BACKUP, ldm, nodeId);
        sendSignal(ref, GSN_BACKUP_FRAGMENT_REQ, signal,
                   BackupFragmentReq::SignalLength, JBB);
      }//if
    }//for
  }//for

  if (unscanned_frag_count > 0)
  {
    jam();
    return;
  }//if

  /**
   * Finished with all tables
   */
  {
    /**
     * Wait for a GCP to establish a consistent point at backup stop.
     * This point is consistent because backup logging has stopped and
     * scans have completed, and it must be identified by a GCP. Wait
     * till the current GCP has completed and capture its GCI as the
     * stopGCP of this backup.
     * This is needed for SNAPSHOTEND backups, which are restored to a
     * consistent point at backup stop by replaying the backup redo logs
     * up till the end of stopGCP.
     */
    ptr.p->masterData.gsn = GSN_WAIT_GCP_REQ;
    ptr.p->masterData.waitGCP.startBackup = false;

    WaitGCPReq * req = (WaitGCPReq*)signal->getDataPtrSend();
    req->senderRef = reference();
    req->senderData = ptr.i;
    req->requestType = WaitGCPReq::CompleteForceStart;
    sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
               WaitGCPReq::SignalLength, JBB);
  }
}
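
/* Scheduling is pull-based: each BACKUP_FRAGMENT_CONF below returns its
 * LDM worker to the idle set and re-enters nextFragment(), so there is
 * at most one outstanding fragment scan per LDM worker and a slow
 * fragment never blocks the assignment of the remaining ones.
 */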

void
Backup::execBACKUP_FRAGMENT_CONF(Signal* signal)
{
  jamEntry();

  CRASH_INSERTION((10010));

  BackupFragmentConf * conf = (BackupFragmentConf*)signal->getDataPtr();
  const Uint32 ptrI = conf->backupPtr;
  //const Uint32 backupId = conf->backupId;
  const Uint32 tableId = conf->tableId;
  const Uint32 fragmentNo = conf->fragmentNo;
  const Uint32 nodeId = refToNode(signal->senderBlockRef());
  const Uint64 noOfBytes =
    conf->noOfBytesLow + (((Uint64)conf->noOfBytesHigh) << 32);
  const Uint64 noOfRecords =
    conf->noOfRecordsLow + (((Uint64)conf->noOfRecordsHigh) << 32);

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  ptr.p->noOfBytes += noOfBytes;
  ptr.p->noOfRecords += noOfRecords;
  ptr.p->masterData.sendCounter--;

  TablePtr tabPtr;
  ndbrequire(findTable(ptr, tabPtr, tableId));

  tabPtr.p->noOfRecords += noOfRecords;

  FragmentPtr fragPtr;
  tabPtr.p->fragments.getPtr(fragPtr, fragmentNo);

  fragPtr.p->noOfRecords = noOfRecords;

  ndbrequire(fragPtr.p->scanned == 0);
  ndbrequire(fragPtr.p->scanning == 1);
  ndbrequire(fragPtr.p->node == nodeId);

  fragPtr.p->scanned = 1;
  fragPtr.p->scanning = 0;

  if (ERROR_INSERTED(10028))
  {
    ptr.p->errorCode = 328;
  }

  if (ptr.p->checkError())
  {
    jam();
    if (ptr.p->masterData.sendCounter.done())
    {
      jam();
      masterAbort(signal, ptr);
      return;
    }//if
  }
  else
  {
    jam();
    Uint32 ldm = mapFragToLdm(ptr, nodeId, fragPtr.p->lqhInstanceKey);
    ptr.p->fragWorkers[nodeId].set(ldm);
    ptr.p->idleFragWorkerCount++;
    nextFragment(signal, ptr);
  }
}

void
Backup::execBACKUP_FRAGMENT_REF(Signal* signal)
{
  jamEntry();

  CRASH_INSERTION((10011));

  BackupFragmentRef * ref = (BackupFragmentRef*)signal->getDataPtr();
  const Uint32 ptrI = ref->backupPtr;
  //const Uint32 backupId = ref->backupId;
  const Uint32 nodeId = ref->nodeId;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  TablePtr tabPtr;
  ptr.p->tables.first(tabPtr);
  for (; tabPtr.i != RNIL; ptr.p->tables.next(tabPtr)) {
    jam();
    FragmentPtr fragPtr;
    Array<Fragment> & frags = tabPtr.p->fragments;
    const Uint32 fragCount = frags.getSize();

    for (Uint32 i = 0; i < fragCount; i++) {
      jam();
      tabPtr.p->fragments.getPtr(fragPtr, i);
      if (fragPtr.p->scanning != 0 && nodeId == fragPtr.p->node)
      {
        jam();
        ndbrequire(fragPtr.p->scanned == 0);
        fragPtr.p->scanned = 1;
        fragPtr.p->scanning = 0;
        goto done;
      }
    }
  }
  goto err;

done:
  ptr.p->masterData.sendCounter--;
  ptr.p->setErrorCode(ref->errorCode);

  if (ptr.p->masterData.sendCounter.done())
  {
    jam();
    masterAbort(signal, ptr);
    return;
  }//if

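  // err: is reached both via the goto above (no matching fragment found)
  // and by deliberate fall-through from done: when replies are still
  // outstanding; in both cases the AbortBackupOrd is handled inline below.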
err:
  AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
  ord->backupId = ptr.p->backupId;
  ord->backupPtr = ptr.i;
  ord->requestType = AbortBackupOrd::LogBufferFull;
  ord->senderData = ptr.i;
  execABORT_BACKUP_ORD(signal);
}

void
Backup::execBACKUP_FRAGMENT_COMPLETE_REP(Signal* signal)
{
  jamEntry();
  BackupFragmentCompleteRep * rep =
    (BackupFragmentCompleteRep*)signal->getDataPtr();

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, rep->backupPtr);

  TablePtr tabPtr;
  ndbrequire(findTable(ptr, tabPtr, rep->tableId));

  tabPtr.p->noOfRecords =
    rep->noOfTableRowsLow + (((Uint64)rep->noOfTableRowsHigh) << 32);

  FragmentPtr fragPtr;
  tabPtr.p->fragments.getPtr(fragPtr, rep->fragmentNo);

  fragPtr.p->noOfRecords =
    rep->noOfFragmentRowsLow + (((Uint64)rep->noOfFragmentRowsHigh) << 32);
}

/*****************************************************************************
 *
 * Slave functionality - Drop triggers
 *
 *****************************************************************************/

void
Backup::sendDropTrig(Signal* signal, BackupRecordPtr ptr)
{
  TablePtr tabPtr;
  ptr.p->slaveData.gsn = GSN_DROP_TRIG_IMPL_REQ;

  if (ptr.p->slaveData.dropTrig.tableId == RNIL) {
    jam();
    if (ptr.p->tables.getCount())
      ptr.p->tables.first(tabPtr);
    else
    {
      // Early abort, go to close files
      jam();
      closeFiles(signal, ptr);
      return;
    }
  } else {
    jam();
    ndbrequire(findTable(ptr, tabPtr, ptr.p->slaveData.dropTrig.tableId));
    ptr.p->tables.next(tabPtr);
  }//if
  if (tabPtr.i != RNIL) {
    jam();
    sendDropTrig(signal, ptr, tabPtr);
  } else {
    /**
     * Insert footers
     */
    // on backup error we need not insert footers
    if (ptr.p->checkError())
    {
      jam();
      closeFiles(signal, ptr);
      ptr.p->errorCode = 0;
      return;
    }

    {
      BackupFilePtr filePtr;
      ptr.p->files.getPtr(filePtr, ptr.p->logFilePtr);
      Uint32 * dst;
      ndbrequire(filePtr.p->operation.dataBuffer.getWritePtr(&dst, 1));
      * dst = 0;
      filePtr.p->operation.dataBuffer.updateWritePtr(1);
    }

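    /* Append the GCP_ENTRY trailer to the CTL file. All fields are
     * written big-endian (htonl); StopGCP is recorded as stopGCP - 1,
     * which appears to be the last GCI fully contained in the backup.
     */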
    {
      BackupFilePtr filePtr;
      ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr);

      const Uint32 gcpSz = sizeof(BackupFormat::CtlFile::GCPEntry) >> 2;

      Uint32 * dst;
      ndbrequire(filePtr.p->operation.dataBuffer.getWritePtr(&dst, gcpSz));

      BackupFormat::CtlFile::GCPEntry * gcp =
        (BackupFormat::CtlFile::GCPEntry*)dst;

      gcp->SectionType   = htonl(BackupFormat::GCP_ENTRY);
      gcp->SectionLength = htonl(gcpSz);
      gcp->StartGCP      = htonl(ptr.p->startGCP);
      gcp->StopGCP       = htonl(ptr.p->stopGCP - 1);
      filePtr.p->operation.dataBuffer.updateWritePtr(gcpSz);

      {
        TablePtr tabPtr;
        if (ptr.p->tables.first(tabPtr))
        {
          jam();
          signal->theData[0] = BackupContinueB::BACKUP_FRAGMENT_INFO;
          signal->theData[1] = ptr.i;
          signal->theData[2] = tabPtr.i;
          signal->theData[3] = 0;
          sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
        }
        else
        {
          jam();
          closeFiles(signal, ptr);
        }
      }
    }
  }
}

void
Backup::sendDropTrig(Signal* signal, BackupRecordPtr ptr, TablePtr tabPtr)
{
  jam();
  DropTrigImplReq* req = (DropTrigImplReq*)signal->getDataPtrSend();

  ptr.p->slaveData.gsn = GSN_DROP_TRIG_IMPL_REQ;
  ptr.p->slaveData.trigSendCounter = 0;
  req->senderRef = reference(); // Sending to myself
  req->senderData = ptr.i;
  req->requestType = 0;
  req->tableId = tabPtr.p->tableId;
  req->tableVersion = 0;
  req->indexId = RNIL;
  req->indexVersion = 0;
  req->receiverRef = reference();

  // TUP needs some triggerInfo to find the right list
  Uint32 ti = 0;
  if (ptr.p->flags & BackupReq::USE_UNDO_LOG)
    TriggerInfo::setTriggerType(ti, TriggerType::SUBSCRIPTION_BEFORE);
  else
    TriggerInfo::setTriggerType(ti, TriggerType::SUBSCRIPTION);
  TriggerInfo::setTriggerActionTime(ti, TriggerActionTime::TA_DETACHED);
  TriggerInfo::setMonitorReplicas(ti, true);
  TriggerInfo::setMonitorAllAttributes(ti, false);

  ptr.p->slaveData.dropTrig.tableId = tabPtr.p->tableId;
  req->tableId = tabPtr.p->tableId;

  for (int i = 0; i < 3; i++) {
    Uint32 id = tabPtr.p->triggerIds[i];
    req->triggerId = id;

    Uint32 ti2 = ti;
    TriggerInfo::setTriggerEvent(ti2, triggerEventValues[i]);
    req->triggerInfo = ti2;
    if (MT_BACKUP_FLAG(ptr.p->flags))
    {
      BlockReference ref = numberToRef(DBTUP, instance(), getOwnNodeId());
      sendSignal(ref, GSN_DROP_TRIG_IMPL_REQ,
                 signal, DropTrigImplReq::SignalLength, JBB);
    }
    else
    {
      sendSignal(DBTUP_REF, GSN_DROP_TRIG_IMPL_REQ,
                 signal, DropTrigImplReq::SignalLength, JBB);
    }

    ptr.p->slaveData.trigSendCounter++;
  }
}
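
/* Trigger drop mirrors trigger creation: with MT_BACKUP each BACKUP
 * block drops its triggers in its local DBTUP instance only, otherwise
 * the request goes to the DbtupProxy, which fans it out to all LDMs.
 */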

void
Backup::execDROP_TRIG_IMPL_REF(Signal* signal)
{
  jamEntry();

  const DropTrigImplRef* ref = (const DropTrigImplRef*)signal->getDataPtr();
  const Uint32 ptrI = ref->senderData;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  if (ref->triggerId != ~(Uint32) 0)
  {
    ndbout << "ERROR DROPPING TRIGGER: " << ref->triggerId;
    ndbout << " Err: " << ref->errorCode << endl << endl;
  }

  dropTrigReply(signal, ptr);
}

void
Backup::execDROP_TRIG_IMPL_CONF(Signal* signal)
{
  jamEntry();

  const DropTrigImplConf* conf = (const DropTrigImplConf*)signal->getDataPtr();
  const Uint32 ptrI = conf->senderData;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  dropTrigReply(signal, ptr);
}

void
Backup::dropTrigReply(Signal* signal, BackupRecordPtr ptr)
{
  CRASH_INSERTION((10012));

  ndbrequire(ptr.p->slaveData.gsn == GSN_DROP_TRIG_IMPL_REQ);
  ndbrequire(ptr.p->slaveData.trigSendCounter.done() == false);

  // move from .masterData to .slaveData
  ptr.p->slaveData.trigSendCounter--;
  if (ptr.p->slaveData.trigSendCounter.done() == false) {
    jam();
    return;
  }//if

  sendDropTrig(signal, ptr); // recursive next
}

/*****************************************************************************
 *
 * Master functionality - Stop backup
 *
 *****************************************************************************/
void
Backup::execSTOP_BACKUP_REF(Signal* signal)
{
  jamEntry();

  StopBackupRef* ref = (StopBackupRef*)signal->getDataPtr();
  const Uint32 ptrI = ref->backupPtr;
  //const Uint32 backupId = ref->backupId;
  const Uint32 nodeId = ref->nodeId;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  ptr.p->setErrorCode(ref->errorCode);
  stopBackupReply(signal, ptr, nodeId);
}

void
Backup::sendStopBackup(Signal* signal, BackupRecordPtr ptr)
{
  jam();

  StopBackupReq* stop = (StopBackupReq*)signal->getDataPtrSend();
  stop->backupPtr = ptr.i;
  stop->backupId = ptr.p->backupId;
  stop->startGCP = ptr.p->startGCP;
  stop->stopGCP = ptr.p->stopGCP;
  stop->senderRef = reference();

  ptr.p->masterData.gsn = GSN_STOP_BACKUP_REQ;
  ptr.p->masterData.sendCounter = ptr.p->nodes;
  Uint32 receiverInstance = instanceKey(ptr);

  if ((ptr.p->fragWorkers[getOwnNodeId()].count() == 1)
      && (ptr.p->fragWorkers[getOwnNodeId()].find_first() == instance()))
  {
    // All signal-sender functions in the abort protocol detect
    // send-to-self bitmask settings and send signals accordingly.
    ptr.p->senderRef = reference();
    receiverInstance = instance();
  }
  BlockNumber backupBlockNo = numberToBlock(BACKUP, receiverInstance);
  NodeReceiverGroup rg(backupBlockNo, ptr.p->nodes);
  sendSignal(rg, GSN_STOP_BACKUP_REQ, signal,
             StopBackupReq::SignalLength, JBB);
}

void
Backup::execSTOP_BACKUP_CONF(Signal* signal)
{
  jamEntry();

  StopBackupConf* conf = (StopBackupConf*)signal->getDataPtr();
  const Uint32 ptrI = conf->backupPtr;
  //const Uint32 backupId = conf->backupId;
  const Uint32 nodeId = refToNode(signal->senderBlockRef());

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  ptr.p->noOfLogBytes += conf->noOfLogBytes;
  ptr.p->noOfLogRecords += conf->noOfLogRecords;

  stopBackupReply(signal, ptr, nodeId);
}

void
Backup::stopBackupReply(Signal* signal, BackupRecordPtr ptr, Uint32 nodeId)
{
  if (ERROR_INSERTED(10053))
  {
    if (nodeId == getOwnNodeId())
    {
      jam();
      ndbrequire(ptr.p->errorCode == 0);
      // Delay reply from self so that master waits for STOP_BACKUP_REFs
      sendSignalWithDelay(reference(), GSN_STOP_BACKUP_CONF, signal,
                          5000, signal->getLength());
      return;
    }
    else
    {
      // Received STOP_BACKUP_REF/CONF from node n1, now crash n1. This will
      // trigger node-failure handling where the master sends STOP_BACKUP_REF
      // to self on behalf of n1. So the master receives 2 REFs from n1.
      signal->theData[0] = 9999;
      sendSignal(numberToRef(CMVMI, nodeId), GSN_NDB_TAMPER, signal, 1, JBB);
    }
  }
  CRASH_INSERTION((10013));

  if (!haveAllSignals(ptr, GSN_STOP_BACKUP_REQ, nodeId)) {
    jam();
    return;
  }

  sendAbortBackupOrd(signal, ptr, AbortBackupOrd::BackupComplete);

  if (!ptr.p->checkError() && ptr.p->masterData.errorCode == 0)
  {
    if (SEND_BACKUP_COMPLETED_FLAG(ptr.p->flags))
    {
      BackupCompleteRep * rep = (BackupCompleteRep*)signal->getDataPtrSend();
      rep->backupId = ptr.p->backupId;
      rep->senderData = ptr.p->clientData;
      rep->startGCP = ptr.p->startGCP;
      rep->stopGCP = ptr.p->stopGCP;
      rep->noOfBytesLow = (Uint32)(ptr.p->noOfBytes & 0xFFFFFFFF);
      rep->noOfRecordsLow = (Uint32)(ptr.p->noOfRecords & 0xFFFFFFFF);
      rep->noOfBytesHigh = (Uint32)(ptr.p->noOfBytes >> 32);
      rep->noOfRecordsHigh = (Uint32)(ptr.p->noOfRecords >> 32);
      rep->noOfLogBytes = Uint32(ptr.p->noOfLogBytes); // TODO 64-bit log-bytes
      rep->noOfLogRecords = Uint32(ptr.p->noOfLogRecords); // TODO ^^
      sendSignal(ptr.p->clientRef, GSN_BACKUP_COMPLETE_REP, signal,
                 BackupCompleteRep::SignalLength, JBB);
    }

    if (ERROR_INSERTED(10042))
    {
      // Change backup statistics to reflect values > 32 bit
      ptr.p->noOfRecords = INT_MAX64;
      ptr.p->noOfBytes = INT_MAX64;
      ptr.p->noOfLogRecords = INT_MAX64;
      ptr.p->noOfLogBytes = INT_MAX64;
    }

    signal->theData[0] = NDB_LE_BackupCompleted;
    signal->theData[1] = ptr.p->clientRef;
    signal->theData[2] = ptr.p->backupId;
    signal->theData[3] = ptr.p->startGCP;
    signal->theData[4] = ptr.p->stopGCP;
    signal->theData[5] = (Uint32)(ptr.p->noOfBytes & 0xFFFFFFFF);
    signal->theData[6] = (Uint32)(ptr.p->noOfRecords & 0xFFFFFFFF);
    signal->theData[7] = (Uint32)(ptr.p->noOfLogBytes & 0xFFFFFFFF);
    signal->theData[8] = (Uint32)(ptr.p->noOfLogRecords & 0xFFFFFFFF);
    signal->theData[9] = 0; //unused
    signal->theData[10] = 0; //unused
    signal->theData[11] = (Uint32)(ptr.p->noOfBytes >> 32);
    signal->theData[12] = (Uint32)(ptr.p->noOfRecords >> 32);
    signal->theData[13] = (Uint32)(ptr.p->noOfLogBytes >> 32);
    signal->theData[14] = (Uint32)(ptr.p->noOfLogRecords >> 32);
    sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 15, JBB);
  }
  else
  {
    masterAbort(signal, ptr);
  }
}

void
Backup::initReportStatus(Signal *signal, BackupRecordPtr ptr)
{
  ptr.p->m_prev_report = NdbTick_getCurrentTicks();
}

void
Backup::checkReportStatus(Signal *signal, BackupRecordPtr ptr)
{
  if (m_backup_report_frequency == 0)
    return;

  const NDB_TICKS now = NdbTick_getCurrentTicks();
  const Uint64 elapsed = NdbTick_Elapsed(ptr.p->m_prev_report, now).seconds();
  if (elapsed > m_backup_report_frequency)
  {
    reportStatus(signal, ptr);
    ptr.p->m_prev_report = now;
  }
}

void
Backup::reportStatus(Signal* signal, BackupRecordPtr ptr,
                     BlockReference ref)
{
  const int signal_length = 11;

  signal->theData[0] = NDB_LE_BackupStatus;
  for (int i = 1; i < signal_length; i++)
    signal->theData[i] = 0;

  if (ptr.i == RNIL ||
      (ptr.p->m_gsn == 0 &&
       ptr.p->masterData.gsn == 0))
  {
    sendSignal(ref, GSN_EVENT_REP, signal, signal_length, JBB);
    return;
  }
  signal->theData[1] = ptr.p->clientRef;
  signal->theData[2] = ptr.p->backupId;

  if (ptr.p->dataFilePtr[0] == RNIL)
  {
    sendSignal(ref, GSN_EVENT_REP, signal, signal_length, JBB);
    return;
  }

  BackupFilePtr dataFilePtr;
  ptr.p->files.getPtr(dataFilePtr, ptr.p->dataFilePtr[0]);
  signal->theData[3] = (Uint32)(dataFilePtr.p->operation.m_bytes_total & 0xFFFFFFFF);
  signal->theData[4] = (Uint32)(dataFilePtr.p->operation.m_bytes_total >> 32);
  signal->theData[5] = (Uint32)(dataFilePtr.p->operation.m_records_total & 0xFFFFFFFF);
  signal->theData[6] = (Uint32)(dataFilePtr.p->operation.m_records_total >> 32);

  if (ptr.p->logFilePtr == RNIL)
  {
    sendSignal(ref, GSN_EVENT_REP, signal, signal_length, JBB);
    return;
  }

  BackupFilePtr logFilePtr;
  ptr.p->files.getPtr(logFilePtr, ptr.p->logFilePtr);
  signal->theData[7] = (Uint32)(logFilePtr.p->operation.m_bytes_total & 0xFFFFFFFF);
  signal->theData[8] = (Uint32)(logFilePtr.p->operation.m_bytes_total >> 32);
  signal->theData[9] = (Uint32)(logFilePtr.p->operation.m_records_total & 0xFFFFFFFF);
  signal->theData[10] = (Uint32)(logFilePtr.p->operation.m_records_total >> 32);

  sendSignal(ref, GSN_EVENT_REP, signal, signal_length, JBB);
}

/*****************************************************************************
 *
 * Master functionality - Abort backup
 *
 *****************************************************************************/
void
Backup::masterAbort(Signal* signal, BackupRecordPtr ptr)
{
  jam();
#ifdef DEBUG_ABORT
  ndbout_c("************ masterAbort");
#endif

  ndbassert(ptr.p->masterRef == reference());
  if (ptr.p->masterData.errorCode != 0)
  {
    jam();
    return;
  }

  if (SEND_BACKUP_STARTED_FLAG(ptr.p->flags))
  {
    BackupAbortRep* rep = (BackupAbortRep*)signal->getDataPtrSend();
    rep->backupId = ptr.p->backupId;
    rep->senderData = ptr.p->clientData;
    rep->reason = ptr.p->errorCode;
    sendSignal(ptr.p->clientRef, GSN_BACKUP_ABORT_REP, signal,
               BackupAbortRep::SignalLength, JBB);
  }
  signal->theData[0] = NDB_LE_BackupAborted;
  signal->theData[1] = ptr.p->clientRef;
  signal->theData[2] = ptr.p->backupId;
  signal->theData[3] = ptr.p->errorCode;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 4, JBB);

  ndbrequire(ptr.p->errorCode);
  ptr.p->masterData.errorCode = ptr.p->errorCode;

  AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
  ord->backupId = ptr.p->backupId;
  ord->backupPtr = ptr.i;
  ord->senderData = ptr.i;
  Uint32 receiverInstance = instanceKey(ptr); // = BackupProxy for mt-backup

  if ((ptr.p->fragWorkers[getOwnNodeId()].count() == 1)
      && (ptr.p->fragWorkers[getOwnNodeId()].find_first() == instance()))
  {
    // All signal-sender functions in the abort protocol detect
    // send-to-self bitmask settings and send signals accordingly.
    ptr.p->senderRef = reference();
    receiverInstance = instance();
  }

  BlockNumber backupBlockNo = numberToBlock(BACKUP, receiverInstance);
  NodeReceiverGroup rg(backupBlockNo, ptr.p->nodes);

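  /* The abort strategy depends on how far the protocol has advanced:
   * - still defining:           broadcast ABORT_BACKUP_ORD directly
   * - triggers/GCP/scans:       fake an early stop (stopGCP = startGCP + 1)
   *                             so the normal STOP_BACKUP path cleans up
   * - sequence/mutex phase:     masterAbort() is never expected here
   *                             (ndbabort)
   * - already dropping/stopping: nothing more to do
   */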
  switch (ptr.p->masterData.gsn) {
  case GSN_DEFINE_BACKUP_REQ:
    ord->requestType = AbortBackupOrd::BackupFailure;
    sendSignal(rg, GSN_ABORT_BACKUP_ORD, signal,
               AbortBackupOrd::SignalLength, JBB);
    return;
  case GSN_CREATE_TRIG_IMPL_REQ:
  case GSN_START_BACKUP_REQ:
  case GSN_ALTER_TRIG_REQ:
  case GSN_WAIT_GCP_REQ:
  case GSN_BACKUP_FRAGMENT_REQ:
    jam();
    ptr.p->stopGCP = ptr.p->startGCP + 1;
    sendStopBackup(signal, ptr); // dropping due to error
    return;
  case GSN_UTIL_SEQUENCE_REQ:
  case GSN_UTIL_LOCK_REQ:
    ndbabort();
  case GSN_DROP_TRIG_IMPL_REQ:
  case GSN_STOP_BACKUP_REQ:
    return;
  }
}

void
Backup::abort_scan(Signal * signal, BackupRecordPtr ptr)
{
  AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
  ord->backupId = ptr.p->backupId;
  ord->backupPtr = ptr.i;
  ord->senderData = ptr.i;
  ord->requestType = AbortBackupOrd::AbortScan;

  TablePtr tabPtr;
  ptr.p->tables.first(tabPtr);
  for (; tabPtr.i != RNIL; ptr.p->tables.next(tabPtr)) {
    jam();
    FragmentPtr fragPtr;
    Array<Fragment> & frags = tabPtr.p->fragments;
    const Uint32 fragCount = frags.getSize();

    for (Uint32 i = 0; i < fragCount; i++) {
      jam();
      tabPtr.p->fragments.getPtr(fragPtr, i);
      const Uint32 nodeId = fragPtr.p->node;
      if (fragPtr.p->scanning != 0 && ptr.p->nodes.get(nodeId)) {
        jam();
        Uint32 ldm = mapFragToLdm(ptr, nodeId, fragPtr.p->lqhInstanceKey);
        BlockReference ref = numberToRef(BACKUP, ldm, nodeId);
        sendSignal(ref, GSN_ABORT_BACKUP_ORD, signal,
                   AbortBackupOrd::SignalLength, JBB);
      }
    }
  }
}
6033 
6034 /*****************************************************************************
6035  *
6036  * Slave functionallity: Define Backup
6037  *
6038  *****************************************************************************/
6039 void
defineBackupRef(Signal * signal,BackupRecordPtr ptr,Uint32 errCode)6040 Backup::defineBackupRef(Signal* signal, BackupRecordPtr ptr, Uint32 errCode)
6041 {
6042   jam();
6043   if(ptr.p->is_lcp())
6044   {
6045     jam();
6046     ptr.p->setPrepareErrorCode(errCode);
6047     ptr.p->prepareState = PREPARE_ABORTING;
6048     ndbrequire(ptr.p->ctlFilePtr != RNIL);
6049 
6050     /**
6051      * This normally happens when a table has been deleted before we got to
6052      * start the LCP. This is a normal behaviour.
6053      *
6054      * At this point we have both the data file and the control file to use
6055      * open. At this point it is ok to remove both of them since they will
6056      * no longer be needed. This will happen in closeFile since we have set
6057      * the error code here.
6058      */
6059     BackupFilePtr filePtr;
6060     ptr.p->files.getPtr(filePtr, ptr.p->prepareDataFilePtr[0]);
6061     if (filePtr.p->m_flags & BackupFile::BF_OPEN &&
6062         !(filePtr.p->m_flags & BackupFile::BF_CLOSING))
6063     {
6064       jam();
6065       ndbrequire(! (filePtr.p->m_flags & BackupFile::BF_FILE_THREAD));
6066       filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_LCP_META;
6067       closeFile(signal, ptr, filePtr, true);
6068     }
6069     else if (filePtr.p->m_flags & BackupFile::BF_CLOSING)
6070     {
6071       /* Wait for the data file closing */
6072       jam();
6073       return;
6074     }
6075     else
6076     {
6077       jam();
6078       ndbrequire(filePtr.p->m_flags == 0);
6079     }
6080     ptr.p->files.getPtr(filePtr,
6081           ptr.p->prepareCtlFilePtr[ptr.p->prepareNextLcpCtlFileNumber]);
6082     if (filePtr.p->m_flags & BackupFile::BF_OPEN &&
6083         !(filePtr.p->m_flags & BackupFile::BF_CLOSING))
6084     {
6085       jam();
6086       closeFile(signal, ptr, filePtr, true);
6087       return;
6088     }
6089     else if (filePtr.p->m_flags & BackupFile::BF_CLOSING)
6090     {
6091       /* Wait for the control file to close as well. */
6092       jam();
6093       return;
6094     }
6095     else
6096     {
6097       jam();
6098       ndbrequire(filePtr.p->m_flags == 0);
6099     }
6100 
6101     TablePtr tabPtr;
6102     FragmentPtr fragPtr;
6103 
6104     ndbrequire(ptr.p->prepare_table.first(tabPtr));
6105     tabPtr.p->fragments.getPtr(fragPtr, 0);
6106     DEB_LCP(("(%u)LCP_PREPARE_REF", instance()));
6107     LcpPrepareRef* ref= (LcpPrepareRef*)signal->getDataPtrSend();
6108     ref->senderData = ptr.p->clientData;
6109     ref->senderRef = reference();
6110     ref->tableId = tabPtr.p->tableId;
6111     ref->fragmentId = fragPtr.p->fragmentId;
6112     ref->errorCode = ptr.p->prepareErrorCode;
6113     sendSignal(ptr.p->masterRef, GSN_LCP_PREPARE_REF,
6114 	       signal, LcpPrepareRef::SignalLength, JBA);
6115     ptr.p->prepareState = NOT_ACTIVE;
6116     return;
6117   }
6118   ptr.p->setErrorCode(errCode);
6119 
6120   ptr.p->m_gsn = GSN_DEFINE_BACKUP_REF;
6121   ndbrequire(ptr.p->errorCode != 0);
6122 
6123   DefineBackupRef* ref = (DefineBackupRef*)signal->getDataPtrSend();
6124   ref->backupId = ptr.p->backupId;
6125   ref->backupPtr = ptr.i;
6126   ref->errorCode = ptr.p->errorCode;
6127   ref->nodeId = getOwnNodeId();
6128   sendSignal(ptr.p->senderRef, GSN_DEFINE_BACKUP_REF, signal,
6129 	     DefineBackupRef::SignalLength, JBB);
6130 }
6131 
6132 void
init_file(BackupFilePtr filePtr,Uint32 backupPtrI)6133 Backup::init_file(BackupFilePtr filePtr, Uint32 backupPtrI)
6134 {
6135   filePtr.p->tableId = RNIL;
6136   filePtr.p->backupPtr = backupPtrI;
6137   filePtr.p->filePointer = RNIL;
6138   filePtr.p->m_flags = 0;
6139   filePtr.p->errorCode = 0;
6140 }
6141 
6142 void
execDEFINE_BACKUP_REQ(Signal * signal)6143 Backup::execDEFINE_BACKUP_REQ(Signal* signal)
6144 {
6145   jamEntry();
6146 
6147   DefineBackupReq* req = (DefineBackupReq*)signal->getDataPtr();
6148   NdbNodeBitmask nodes;
6149 
6150   const Uint32 senderVersion =
6151       getNodeInfo(refToNode(signal->getSendersBlockRef())).m_version;
6152 
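  // Assumption drawn from the version check below: newer senders ship the
  // node bitmask in a signal section (it may not fit in the fixed signal
  // area), while older senders pass it inline in req->nodes, which is the
  // fallback branch.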
  if (signal->getNoOfSections() >= 1)
  {
    ndbrequire(ndbd_send_node_bitmask_in_section(senderVersion));
    SegmentedSectionPtr ptr;
    SectionHandle handle(this,signal);
    handle.getSection(ptr, 0);
    ndbrequire(ptr.sz <= NdbNodeBitmask::Size);
    copy(nodes.rep.data, ptr);
    releaseSections(handle);
  }
  else
  {
    nodes = req->nodes;
  }

  BackupRecordPtr ptr;
  const Uint32 ptrI = req->backupPtr;
  const Uint32 backupId = req->backupId;

  if(req->masterRef == reference())
  {
    /**
     * Signal sent from myself -> record already seized
     */
    jam();
    c_backupPool.getPtr(ptr, ptrI);
  } else { // from other node
    jam();
#ifdef DEBUG_ABORT
    dumpUsedResources();
#endif
    if (!c_backups.getPool().seizeId(ptr, ptrI)) {
      jam();
      ndbabort(); // If the master has succeeded, the slave should succeed
    }//if
    c_backups.addFirst(ptr);
  }//if

  CRASH_INSERTION((10014));

  if (MT_BACKUP_FLAG(ptr.p->flags))
  {
    // All LDMs participate in backup, not just LDM1.
    // Prevent allotment of extra resources for LDM1.
    m_skew_disk_speed = false;
  }
  else
  {
    // Only LDM1 participates in backup, so allot extra disk speed quota.
    m_skew_disk_speed = true;
  }

  // The masterRef is the BACKUP block which coordinates the backup
  // across all the nodes, i.e. LDM1 on the master node. The senderRef
  // is the BACKUP block which sent the last REQ signal. The masterRef
  // sends signals to the BackupProxies on all the nodes, and each
  // BackupProxy sends the signals to the LDMs. So the LDMs need to reply
  // to the BackupProxy, not the master.
  //
  // - For single-threaded backup: the backup master directly controls
  //   participants on all nodes, so
  //   masterRef = senderRef = LDM1_on_master_node.
  // - For multithreaded backup: the backup master sends control signals to
  //   the BackupProxy on each node, and each BackupProxy controls backup
  //   execution across its LDMs, so:
  //   For all LDMs on node N, senderRef = BackupProxy_on_node_N
  //   For all LDMs on all nodes, masterRef = LDM1_on_master_node.
  //
  // masterRef is passed in DEFINE_BACKUP_REQ so that all participants set a
  // masterRef explicitly specified by the master.
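  // Illustrative consequence of the above (a sketch, not an extra protocol
  // step): replies always go to senderRef, e.g.
  //   sendSignal(ptr.p->senderRef, GSN_DEFINE_BACKUP_CONF, ...);
  // while cluster-wide decisions (start/stop/abort) originate from
  // masterRef.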
  ptr.p->masterRef = req->masterRef;
  ptr.p->senderRef = req->senderRef;
  ptr.p->m_gsn = GSN_DEFINE_BACKUP_REQ;
  ptr.p->slaveState.forceState(INITIAL);
  ptr.p->slaveState.setState(DEFINING);
  ptr.p->prepareState = NOT_ACTIVE;
  ptr.p->slaveData.dropTrig.tableId = RNIL;
  ptr.p->errorCode = 0;
  ptr.p->clientRef = req->clientRef;
  ptr.p->clientData = req->clientData;
  if(req->masterRef == reference())
  {
    ptr.p->flags = req->flags;
  }
  else
  {
    /* Remove the WAITCOMPLETED flag, as a non-master should never reply
     * to the client.
     */
    ptr.p->flags = req->flags & ~((Uint32)BackupReq::WAITCOMPLETED);
  }
  ptr.p->nodes = nodes;
  ptr.p->backupId = backupId;
  ptr.p->backupKey[0] = req->backupKey[0];
  ptr.p->backupKey[1] = req->backupKey[1];
  ptr.p->backupDataLen = req->backupDataLen;
  ptr.p->masterData.errorCode = 0;
  ptr.p->noOfBytes = 0;
  ptr.p->m_bytes_written = 0;
  ptr.p->m_row_scan_counter = 0;
  ptr.p->noOfRecords = 0;
  ptr.p->noOfLogBytes = 0;
  ptr.p->noOfLogRecords = 0;
  ptr.p->currGCP = 0;
  ptr.p->startGCP = 0;
  ptr.p->stopGCP = 0;
  ptr.p->m_prioA_scan_batches_to_execute = 0;
  ptr.p->m_lastSignalId = 0;

  /**
   * Allocate files
   */
  BackupFilePtr files[4 + (2*BackupFormat::NDB_MAX_FILES_PER_LCP)];
  Uint32 noOfPages[] = {
    NO_OF_PAGES_META_FILE,
    2,   // 32k
    0    // 3M
  };

  constexpr Uint32 maxInsert[] =
  {
    MAX_WORDS_META_FILE,                       // control files
    BackupFormat::LogFile::LogEntry::MAX_SIZE, // redo/undo log files
    BACKUP_MIN_BUFF_WORDS                      // data files
  };

  Uint32 minWrite[] = {
    8192,
    8192,
    32768
  };
  Uint32 maxWrite[] = {
    8192,
    8192,
    32768
  };

  minWrite[1] = c_defaults.m_minWriteSize;
  maxWrite[1] = c_defaults.m_maxWriteSize;
  noOfPages[1] = (c_defaults.m_logBufferSize + sizeof(Page32) - 1) /
    sizeof(Page32);
  minWrite[2] = c_defaults.m_minWriteSize;
  maxWrite[2] = c_defaults.m_maxWriteSize;
  noOfPages[2] = (c_defaults.m_dataBufferSize + sizeof(Page32) - 1) /
    sizeof(Page32);
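  // Worked example of the ceiling division above (figures are assumptions,
  // not configured defaults): with m_logBufferSize = 16MB and
  // sizeof(Page32) = 32768 bytes,
  //   noOfPages[1] = (16777216 + 32767) / 32768 = 512 pages.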

  ptr.p->ctlFilePtr = ptr.p->logFilePtr = RNIL;
  for (Uint32 i = 0; i < BackupFormat::NDB_MAX_FILES_PER_LCP; i++)
  {
    ptr.p->dataFilePtr[i] = RNIL;
    ptr.p->prepareDataFilePtr[i] = RNIL;
  }

  if (ptr.p->is_lcp())
  {
    /**
     * Allocate table and fragment objects for the LCP prepare and execute
     * phases once and for all. This means we don't risk running into
     * resource shortages for LCPs.
     */
    jam();
    TablePtr tabPtr;
    m_lcp_ptr = ptr;
    ndbrequire(ptr.p->prepare_table.seizeLast(tabPtr));
    ndbrequire(tabPtr.p->fragments.seize(1));
    ndbrequire(ptr.p->tables.seizeLast(tabPtr));
    ndbrequire(tabPtr.p->fragments.seize(1));

    noOfPages[2] = (c_defaults.m_lcp_buffer_size + sizeof(Page32) - 1) /
      sizeof(Page32);
    for (Uint32 i = 0; i < (4 + (2*BackupFormat::NDB_MAX_FILES_PER_LCP)); i++)
    {
      Uint32 minWriteLcp;
      Uint32 maxWriteLcp;
      Uint32 maxInsertLcp;
      Uint32 noOfPagesLcp;
      ndbrequire(ptr.p->files.seizeFirst(files[i]));
      init_file(files[i], ptr.i);
      switch (i)
      {
        case 0:
        {
          jam();
          minWriteLcp = 1024;
          maxWriteLcp = 32768;
          maxInsertLcp = 8192;
          noOfPagesLcp = 2;
          ptr.p->ctlFilePtr = files[i].i;
          files[i].p->fileType = BackupFormat::CTL_FILE;
          break;
        }
        case 1:
        {
          jam();
          minWriteLcp = 1024;
          maxWriteLcp = 32768;
          maxInsertLcp = 8192;
          noOfPagesLcp = 2;
          ptr.p->prepareCtlFilePtr[0] = files[i].i;
          files[i].p->fileType = BackupFormat::CTL_FILE;
          break;
        }
        case 2:
        {
          jam();
          minWriteLcp = 1024;
          maxWriteLcp = 32768;
          maxInsertLcp = 8192;
          noOfPagesLcp = 2;
          ptr.p->prepareCtlFilePtr[1] = files[i].i;
          files[i].p->fileType = BackupFormat::CTL_FILE;
          break;
        }
        case 3:
        {
          jam();
          minWriteLcp = 1024;
          maxWriteLcp = 32768;
          maxInsertLcp = 8192;
          noOfPagesLcp = 2;
          ptr.p->deleteFilePtr = files[i].i;
          files[i].p->fileType = BackupFormat::DATA_FILE;
          break;
        }
        default:
        {
          if (i < 4 + BackupFormat::NDB_MAX_FILES_PER_LCP)
          {
            jam();
            minWriteLcp = minWrite[2];
            maxWriteLcp = maxWrite[2];
            maxInsertLcp = maxInsert[2];
            noOfPagesLcp = noOfPages[2];
            jam();
            ptr.p->prepareDataFilePtr[i - 4] = files[i].i;
            jam();
            files[i].p->fileType = BackupFormat::DATA_FILE;
            jam();
          }
          else
          {
            jam();
            minWriteLcp = minWrite[2];
            maxWriteLcp = maxWrite[2];
            maxInsertLcp = maxInsert[2];
            noOfPagesLcp = noOfPages[2];
            jam();
            ptr.p->dataFilePtr[i - (4 + BackupFormat::NDB_MAX_FILES_PER_LCP)] =
              files[i].i;
            jam();
            files[i].p->fileType = BackupFormat::DATA_FILE;
            jam();
          }
          break;
        }
      }
      Page32Ptr pagePtr;
      DEB_LCP(("LCP: instance: %u, i: %u, seize %u pages",
               instance(),
               i,
               noOfPagesLcp));
      ndbrequire(files[i].p->pages.seize(noOfPagesLcp));
      files[i].p->pages.getPtr(pagePtr, 0);
      const char * msg = files[i].p->
        operation.dataBuffer.setup((Uint32*)pagePtr.p,
                                   noOfPagesLcp * (sizeof(Page32) >> 2),
                                   128,
                                   minWriteLcp >> 2,
                                   maxWriteLcp >> 2,
                                   maxInsertLcp);
      if (msg != 0)
      {
        ndbout_c("setup msg = %s, i = %u", msg, i);
        ndbabort();
      }
      files[i].p->operation.m_bytes_total = 0;
      files[i].p->operation.m_records_total = 0;
    }
  }
  else
  {
    for (Uint32 i = 0; i < 3; i++)
    {
      jam();
      if (!ptr.p->files.seizeFirst(files[i]))
      {
        jam();
        defineBackupRef(signal, ptr,
                        DefineBackupRef::FailedToAllocateFileRecord);
        return;
      }//if
      init_file(files[i], ptr.i);

      if(ERROR_INSERTED(10035) || files[i].p->pages.seize(noOfPages[i]) == false)
      {
        jam();
        DEBUG_OUT("Failed to seize " << noOfPages[i] << " pages");
        defineBackupRef(signal, ptr, DefineBackupRef::FailedToAllocateBuffers);
        return;
      }//if

      Page32Ptr pagePtr;
      files[i].p->pages.getPtr(pagePtr, 0);

      const char * msg = files[i].p->
        operation.dataBuffer.setup((Uint32*)pagePtr.p,
                                   noOfPages[i] * (sizeof(Page32) >> 2),
                                   128,
                                   minWrite[i] >> 2,
                                   maxWrite[i] >> 2,
                                   maxInsert[i]);
      if(msg != 0) {
        jam();
        defineBackupRef(signal, ptr, DefineBackupRef::FailedToSetupFsBuffers);
        return;
      }//if

      switch(i)
      {
        case 0:
          files[i].p->fileType = BackupFormat::CTL_FILE;
          ptr.p->ctlFilePtr = files[i].i;
          break;
        case 1:
          if(ptr.p->flags & BackupReq::USE_UNDO_LOG)
            files[i].p->fileType = BackupFormat::UNDO_FILE;
          else
            files[i].p->fileType = BackupFormat::LOG_FILE;
          ptr.p->logFilePtr = files[i].i;
          break;
        case 2:
          files[i].p->fileType = BackupFormat::DATA_FILE;
          ptr.p->dataFilePtr[0] = files[i].i;
      }
      files[i].p->operation.m_bytes_total = 0;
      files[i].p->operation.m_records_total = 0;
    }//for
  }

  initReportStatus(signal, ptr);

  if (!verifyNodesAlive(ptr, ptr.p->nodes)) {
    jam();
    defineBackupRef(signal, ptr, DefineBackupRef::Undefined);
    return;
  }//if
  if (ERROR_INSERTED(10027)) {
    jam();
    defineBackupRef(signal, ptr, 327);
    return;
  }//if

  if(ptr.p->is_lcp())
  {
    jam();
    getFragmentInfoDone(signal, ptr);
    return;
  }

  if (ptr.p->backupDataLen == 0)
  {
    jam();
    backupAllData(signal, ptr);
    return;
  }//if

  /**
   * Not implemented
   */
  ndbabort();
}

void
Backup::backupAllData(Signal* signal, BackupRecordPtr ptr)
{
  /**
   * Get all tables from DICT
   */
  ListTablesReq * req = (ListTablesReq*)signal->getDataPtrSend();
  req->init();
  req->senderRef = reference();
  req->senderData = ptr.i;
  req->setTableId(0);
  req->setTableType(0);
  sendSignal(DBDICT_REF, GSN_LIST_TABLES_REQ, signal,
             ListTablesReq::SignalLength, JBB);
}

void
Backup::execLIST_TABLES_CONF(Signal* signal)
{
  jamEntry();
  Uint32 fragInfo = signal->header.m_fragmentInfo;
  ListTablesConf* conf = (ListTablesConf*)signal->getDataPtr();
  Uint32 noOfTables = conf->noOfTables;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, conf->senderData);

  SectionHandle handle (this, signal);
  signal->header.m_fragmentInfo = 0;
  if (noOfTables > 0)
  {
    ListTablesData ltd;
    const Uint32 listTablesDataSizeInWords = (sizeof(ListTablesData) + 3) / 4;
    SegmentedSectionPtr tableDataPtr;
    handle.getSection(tableDataPtr, ListTablesConf::TABLE_DATA);
    SimplePropertiesSectionReader
      tableDataReader(tableDataPtr, getSectionSegmentPool());

    tableDataReader.reset();
    for(unsigned int i = 0; i<noOfTables; i++) {
      jam();
      tableDataReader.getWords((Uint32 *)&ltd, listTablesDataSizeInWords);
      Uint32 tableId = ltd.getTableId();
      Uint32 tableType = ltd.getTableType();
      Uint32 state= ltd.getTableState();
      jamLine(tableId);

      if (! (DictTabInfo::isTable(tableType) ||
             DictTabInfo::isIndex(tableType) ||
             DictTabInfo::isFilegroup(tableType) ||
             DictTabInfo::isFile(tableType)
             || DictTabInfo::isHashMap(tableType)
             || DictTabInfo::isForeignKey(tableType)
             ))
      {
        jam();
        continue;
      }

      if (state != DictTabInfo::StateOnline)
      {
        jam();
        continue;
      }

      TablePtr tabPtr;
      ptr.p->tables.seizeLast(tabPtr);
      if(tabPtr.i == RNIL) {
        jam();
        defineBackupRef(signal, ptr, DefineBackupRef::FailedToAllocateTables);
        releaseSections(handle);
        return;
      }//if
      tabPtr.p->tableType = tableType;
      tabPtr.p->tableId = tableId;
#ifdef VM_TRACE
      TablePtr locTabPtr;
      ndbassert(findTable(ptr, locTabPtr, tabPtr.p->tableId) == false);
#endif
      insertTableMap(tabPtr, ptr.i, tabPtr.p->tableId);
    }//for
  }

  releaseSections(handle);

  /*
   * If this is the first signal, or not the last signal,
   * keep accumulating table data.
   */
  if ((fragInfo == 1) || (fragInfo == 2))
  {
    jam();
    return;
  }
  openFiles(signal, ptr);
}

void
Backup::openFiles(Signal* signal, BackupRecordPtr ptr)
{
  jam();

  BackupFilePtr filePtr;

  FsOpenReq * req = (FsOpenReq *)signal->getDataPtrSend();
  req->userReference = reference();
  req->fileFlags =
    FsOpenReq::OM_WRITEONLY |
    FsOpenReq::OM_CREATE_IF_NONE |
    FsOpenReq::OM_APPEND |
    FsOpenReq::OM_AUTOSYNC;

  if (c_defaults.m_compressed_backup)
    req->fileFlags |= FsOpenReq::OM_GZ;

  FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF);
  req->auto_sync_size = c_defaults.m_disk_synch_size;
  /**
   * Ctl file
   */
  c_backupFilePool.getPtr(filePtr, ptr.p->ctlFilePtr);
  filePtr.p->m_flags |= BackupFile::BF_OPENING;

  req->userPointer = filePtr.i;
  FsOpenReq::setVersion(req->fileNumber, 2);
  FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_CTL);
  FsOpenReq::v2_setSequence(req->fileNumber, ptr.p->backupId);
  FsOpenReq::v2_setNodeId(req->fileNumber, getOwnNodeId());
  /*
   * NDBFS supports 2 backup formats: the single-threaded backup format and
   * the multithreaded backup format.
   *
   * Example of st-backup directory structure in the backup path (backup
   * files present in the BACKUP-<backupID> directory):
   *
   * mysql@psangam-T460:~$ ls data2/BACKUP/
   * BACKUP-1  BACKUP-2
   * mysql@psangam-T460:~$ ls data2/BACKUP/BACKUP-1/
   * BACKUP-1-0.2.Data  BACKUP-1.2.ctl  BACKUP-1.2.log
   *
   * Example of mt-backup directory structure (backup subfolders in
   * BACKUP-<backupID>, subfolders contain backup files):
   *
   * mysql@psangam-T460:~$ ls data2/BACKUP/BACKUP-1/
   * BACKUP-1-PART-1-OF-4  BACKUP-1-PART-2-OF-4  BACKUP-1-PART-3-OF-4  BACKUP-1-PART-4-OF-4
   * mysql@psangam-T460:~$ ls data2/BACKUP/BACKUP-1/BACKUP-1-PART-1-OF-4/
   * BACKUP-1-0.2.Data  BACKUP-1.2.ctl  BACKUP-1.2.log
   * mysql@psangam-T460:~$ ls data2/BACKUP/BACKUP-1/BACKUP-1-PART-2-OF-4/
   * BACKUP-1-0.2.Data  BACKUP-1.2.ctl  BACKUP-1.2.log
   * mysql@psangam-T460:~$ ls data2/BACKUP/BACKUP-1/BACKUP-1-PART-3-OF-4/
   * BACKUP-1-0.2.Data  BACKUP-1.2.ctl  BACKUP-1.2.log
   * mysql@psangam-T460:~$ ls data2/BACKUP/BACKUP-1/BACKUP-1-PART-4-OF-4/
   * BACKUP-1-0.2.Data  BACKUP-1.2.ctl  BACKUP-1.2.log
   *
   * NDBFS is aware of the backup part in the file-open operation, as
   * well as the total number of backup parts. If the backup part number is
   * set to 0, it creates files as per the single-threaded backup directory
   * structure. If a non-zero part number is set, it creates files as per
   * the mt-backup directory structure.
   */
  if (MT_BACKUP_FLAG(ptr.p->flags))
  {
    /*
     * If the MT_BACKUP flag is set, a non-zero backup-part-ID is
     * passed to NDBFS so that the multithreaded backup directory
     * structure is used. If it is false, the old single-threaded
     * backup structure is used.
     */
    FsOpenReq::v2_setPartNum(req->fileNumber, instance());
    FsOpenReq::v2_setTotalParts(req->fileNumber, globalData.ndbMtLqhWorkers);
  }
  else
  {
    FsOpenReq::v2_setPartNum(req->fileNumber, 0);
    FsOpenReq::v2_setTotalParts(req->fileNumber, 0);
  }
  sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);

  /**
   * Log file
   */
  c_backupFilePool.getPtr(filePtr, ptr.p->logFilePtr);
  filePtr.p->m_flags |= BackupFile::BF_OPENING;

  // Write an uncompressed log file when the undo log is enabled, since the
  // log file is read from back to front.
  if(ptr.p->flags & BackupReq::USE_UNDO_LOG)
    req->fileFlags &= ~FsOpenReq::OM_GZ;

  req->userPointer = filePtr.i;
  FsOpenReq::setVersion(req->fileNumber, 2);
  FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_LOG);
  FsOpenReq::v2_setSequence(req->fileNumber, ptr.p->backupId);
  FsOpenReq::v2_setNodeId(req->fileNumber, getOwnNodeId());
  if (MT_BACKUP_FLAG(ptr.p->flags))
  {
    FsOpenReq::v2_setPartNum(req->fileNumber, instance());
    FsOpenReq::v2_setTotalParts(req->fileNumber, globalData.ndbMtLqhWorkers);
  }
  else
  {
    FsOpenReq::v2_setPartNum(req->fileNumber, 0);
    FsOpenReq::v2_setTotalParts(req->fileNumber, 0);
  }
  sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);

  /**
   * Data file
   */
  c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[0]);
  filePtr.p->m_flags |= BackupFile::BF_OPENING;

  if (c_defaults.m_o_direct)
    req->fileFlags |= FsOpenReq::OM_DIRECT;
  if (c_defaults.m_compressed_backup)
    req->fileFlags |= FsOpenReq::OM_GZ;
  req->userPointer = filePtr.i;
  FsOpenReq::setVersion(req->fileNumber, 2);
  FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_DATA);
  FsOpenReq::v2_setSequence(req->fileNumber, ptr.p->backupId);
  FsOpenReq::v2_setNodeId(req->fileNumber, getOwnNodeId());
  if (MT_BACKUP_FLAG(ptr.p->flags))
  {
    FsOpenReq::v2_setPartNum(req->fileNumber, instance());
    FsOpenReq::v2_setTotalParts(req->fileNumber, globalData.ndbMtLqhWorkers);
  }
  else
  {
    FsOpenReq::v2_setPartNum(req->fileNumber, 0);
    FsOpenReq::v2_setTotalParts(req->fileNumber, 0);
  }
  FsOpenReq::v2_setCount(req->fileNumber, 0);
  sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);
}

void
Backup::execFSOPENREF(Signal* signal)
{
  jamEntry();

  FsRef * ref = (FsRef *)signal->getDataPtr();

  const Uint32 userPtr = ref->userPointer;

  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, userPtr);

  ndbrequire(! (filePtr.p->m_flags & BackupFile::BF_OPEN));
  ndbrequire(filePtr.p->m_flags & BackupFile::BF_OPENING);
  filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_OPENING;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  ptr.p->setErrorCode(ref->errorCode);
  if (ptr.p->is_lcp())
  {
    jam();
    openFilesReplyLCP(signal, ptr, filePtr);
    return;
  }
  openFilesReply(signal, ptr, filePtr);
}

void
Backup::execFSOPENCONF(Signal* signal)
{
  jamEntry();

  FsConf * conf = (FsConf *)signal->getDataPtr();

  const Uint32 userPtr = conf->userPointer;
  const Uint32 filePointer = conf->filePointer;

  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, userPtr);
  filePtr.p->filePointer = filePointer;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  /**
   * Mark files as "opened"
   */
  ndbrequire(! (filePtr.p->m_flags & BackupFile::BF_OPEN));
  ndbrequire(filePtr.p->m_flags & BackupFile::BF_OPENING);
  filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_OPENING;
  filePtr.p->m_flags |= BackupFile::BF_OPEN;

  if (ptr.p->is_lcp())
  {
    jam();
    openFilesReplyLCP(signal, ptr, filePtr);
    return;
  }
  openFilesReply(signal, ptr, filePtr);
}

void
Backup::openFilesReply(Signal* signal,
                       BackupRecordPtr ptr, BackupFilePtr filePtr)
{
  jam();
  /**
   * Check if all files have received the open reply
   */
  for(ptr.p->files.first(filePtr); filePtr.i!=RNIL;ptr.p->files.next(filePtr))
  {
    jam();
    if(filePtr.p->m_flags & BackupFile::BF_OPENING) {
      jam();
      return;
    }//if
  }//for

  if (ERROR_INSERTED(10037)) {
    jam();
    /**
     * Don't return FailedForBackupFilesAleadyExist,
     * because that would make NdbBackup auto-retry with a higher number :-)
     */
    ptr.p->errorCode = DefineBackupRef::FailedInsertFileHeader;
    defineBackupRef(signal, ptr);
    return;
  }
  /**
   * Did open succeed for all files?
   */
  if(ptr.p->checkError())
  {
    jam();
    if(ptr.p->errorCode == FsRef::fsErrFileExists)
    {
      jam();
      ptr.p->errorCode = DefineBackupRef::FailedForBackupFilesAleadyExist;
    }
    defineBackupRef(signal, ptr);
    return;
  }//if

  /**
   * Insert file headers
   */
  ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr);
  if(!insertFileHeader(BackupFormat::CTL_FILE, ptr.p, filePtr.p)) {
    jam();
    defineBackupRef(signal, ptr, DefineBackupRef::FailedInsertFileHeader);
    return;
  }//if

  BackupFormat::FileType logfiletype;
  if(ptr.p->flags & BackupReq::USE_UNDO_LOG)
    logfiletype = BackupFormat::UNDO_FILE;
  else
    logfiletype = BackupFormat::LOG_FILE;

  ptr.p->files.getPtr(filePtr, ptr.p->logFilePtr);
  if(!insertFileHeader(logfiletype, ptr.p, filePtr.p)) {
    jam();
    defineBackupRef(signal, ptr, DefineBackupRef::FailedInsertFileHeader);
    return;
  }//if

  ptr.p->files.getPtr(filePtr, ptr.p->dataFilePtr[0]);
  if(!insertFileHeader(BackupFormat::DATA_FILE, ptr.p, filePtr.p)) {
    jam();
    defineBackupRef(signal, ptr, DefineBackupRef::FailedInsertFileHeader);
    return;
  }//if

  /**
   * Start CTL file thread
   */
  ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr);
  filePtr.p->m_flags |= BackupFile::BF_FILE_THREAD;

  signal->theData[0] = BackupContinueB::START_FILE_THREAD;
  signal->theData[1] = filePtr.i;
  sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);

  /**
   * Insert table list in ctl file
   */
  FsBuffer & buf = filePtr.p->operation.dataBuffer;

  const Uint32 sz =
    (sizeof(BackupFormat::CtlFile::TableList) >> 2) +
    ptr.p->tables.getCount() - 1;
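  // Note on the arithmetic above (an assumption about BackupFormat): the
  // TableList struct is declared with a single placeholder word for the
  // table id array, so the size is the fixed header words plus one word
  // per table, minus that placeholder.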

  Uint32 * dst;
  ndbrequire(sz < buf.getMaxWrite());
  if(!buf.getWritePtr(&dst, sz)) {
    jam();
    defineBackupRef(signal, ptr, DefineBackupRef::FailedInsertTableList);
    return;
  }//if

  BackupFormat::CtlFile::TableList* tl =
    (BackupFormat::CtlFile::TableList*)dst;
  tl->SectionType   = htonl(BackupFormat::TABLE_LIST);
  tl->SectionLength = htonl(sz);

  TablePtr tabPtr;
  Uint32 count = 0;
  for(ptr.p->tables.first(tabPtr);
      tabPtr.i != RNIL;
      ptr.p->tables.next(tabPtr)){
    jam();
    tl->TableIds[count] = htonl(tabPtr.p->tableId);
    count++;
  }//for

  buf.updateWritePtr(sz);

  /**
   * Start getting table definition data
   */
  ndbrequire(ptr.p->tables.first(tabPtr));

  signal->theData[0] = BackupContinueB::BUFFER_FULL_META;
  signal->theData[1] = ptr.i;
  signal->theData[2] = tabPtr.i;
  signal->theData[3] = 0;
  sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
  return;
}

bool
Backup::insertFileHeader(BackupFormat::FileType ft,
                         BackupRecord * ptrP,
                         BackupFile * filePtrP){
  FsBuffer & buf = filePtrP->operation.dataBuffer;

  const Uint32 sz = sizeof(BackupFormat::FileHeader) >> 2;

  Uint32 * dst;
  ndbrequire(sz < buf.getMaxWrite());
  if(!buf.getWritePtr(&dst, sz)) {
    jam();
    return false;
  }//if

  BackupFormat::FileHeader* header = (BackupFormat::FileHeader*)dst;
  ndbrequire(sizeof(header->Magic) == sizeof(BACKUP_MAGIC));
  memcpy(header->Magic, BACKUP_MAGIC, sizeof(BACKUP_MAGIC));
  if (ft == BackupFormat::LCP_FILE)
  {
    jam();
    header->BackupVersion = htonl(NDBD_USE_PARTIAL_LCP_v2);
  }
  else
  {
    jam();
    header->BackupVersion = htonl(NDB_BACKUP_VERSION);
  }
  header->SectionType   = htonl(BackupFormat::FILE_HEADER);
  header->SectionLength = htonl(sz - 3);
  header->FileType      = htonl(ft);
  header->BackupId      = htonl(ptrP->backupId);
  header->BackupKey_0   = htonl(ptrP->backupKey[0]);
  header->BackupKey_1   = htonl(ptrP->backupKey[1]);
  header->ByteOrder     = 0x12345678;
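  // ByteOrder is deliberately written natively, unlike the htonl()ed
  // fields above: a reader can presumably compare the stored word against
  // 0x12345678 and byte-swap the file contents if it reads back as
  // 0x78563412.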
  header->NdbVersion    = htonl(NDB_VERSION_D);
  header->MySQLVersion  = htonl(NDB_MYSQL_VERSION_D);

  buf.updateWritePtr(sz);
  return true;
}

void
Backup::execGET_TABINFOREF(Signal* signal)
{
  jamEntry();
  GetTabInfoRef * ref = (GetTabInfoRef*)signal->getDataPtr();
  BackupFilePtr filePtr;

  const Uint32 senderData = ref->senderData;
  BackupRecordPtr ptr;
  c_backupFilePool.getPtr(filePtr, senderData);
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  ndbrequire(filePtr.i == ptr.p->prepareDataFilePtr[0] ||
             !ptr.p->is_lcp());
  defineBackupRef(signal, ptr, ref->errorCode);
}

void
Backup::execGET_TABINFO_CONF(Signal* signal)
{
  jamEntry();

  if(!assembleFragments(signal)) {
    jam();
    return;
  }//if

  BackupFilePtr filePtr;
  GetTabInfoConf * const conf = (GetTabInfoConf*)signal->getDataPtr();
  //const Uint32 senderRef = info->senderRef;
  const Uint32 len = conf->totalLen;
  const Uint32 senderData = conf->senderData;
  const Uint32 tableType = conf->tableType;
  const Uint32 tableId = conf->tableId;

  BackupRecordPtr ptr;
  c_backupFilePool.getPtr(filePtr, senderData);
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  SectionHandle handle(this, signal);
  SegmentedSectionPtr dictTabInfoPtr;
  handle.getSection(dictTabInfoPtr, GetTabInfoConf::DICT_TAB_INFO);
  ndbrequire(dictTabInfoPtr.sz == len);

  TablePtr tabPtr;
  if (ptr.p->is_lcp())
  {
    jam();
    ndbrequire(filePtr.i == ptr.p->prepareDataFilePtr[0]);
    ptr.p->prepare_table.first(tabPtr);
    ndbrequire(tabPtr.p->tableId == tableId);
  }
  else
  {
    jam();
    ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr);
    ndbrequire(findTable(ptr, tabPtr, tableId));
  }

  FsBuffer & buf = filePtr.p->operation.dataBuffer;
  Uint32* dst = 0;
  {
    /**
     * Write into the ctl file for backups.
     *
     * We don't write TABLE_DESCRIPTION into LCP data files. It is not
     * used in the restore process, so it would only use up space on
     * disk for no purpose.
     *
     * An LCP file only has the following sections:
     * 1) File header section
     * 2) Fragment Header section
     * 3) LCP data section that contains records of type:
     *    - INSERT_TYPE (normal records in ALL parts)
     *    - WRITE_TYPE (normal records in CHANGE parts)
     *    - DELETE_BY_ROWID_TYPE (record deleted in CHANGE parts)
     *    - DELETE_BY_PAGEID_TYPE (all records in page deleted in CHANGE part)
     * 4) Fragment Footer section
     *
     * We still need to copy the table description into a linear array,
     * and we solve this by using the FsBuffer also for LCPs. We skip the
     * call to updateWritePtr. This means that we write into the
     * buffer, but the next time we write into the buffer we will
     * overwrite this area.
     */
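    /* A sketch of the FsBuffer contract relied upon here (illustration
     * only):
     *   Uint32* dst;
     *   buf.getWritePtr(&dst, len);  // reserve a linear scratch area
     *   // ... fill dst ...
     *   buf.updateWritePtr(len);     // commit it for the file thread
     * For LCPs the commit is skipped, so the copied table description is
     * scratch data that the next buffer write simply overwrites.
     */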
    Uint32 dstLen = len + 3;
    if(!buf.getWritePtr(&dst, dstLen)) {
      jam();
      ndbabort();
      ptr.p->setErrorCode(DefineBackupRef::FailedAllocateTableMem);
      releaseSections(handle);
      defineBackupRef(signal, ptr);
      return;
    }//if
    if(dst != 0) {
      jam();

      BackupFormat::CtlFile::TableDescription * desc =
        (BackupFormat::CtlFile::TableDescription*)dst;
      desc->SectionType = htonl(BackupFormat::TABLE_DESCRIPTION);
      desc->SectionLength = htonl(len + 3);
      desc->TableType = htonl(tableType);
      dst += 3;

      copy(dst, dictTabInfoPtr);
      if (!ptr.p->is_lcp())
      {
        jam();
        buf.updateWritePtr(dstLen);
      }
    }//if
  }

  releaseSections(handle);

  if(ptr.p->checkError())
  {
    jam();
    ndbrequire(!ptr.p->is_lcp());
    defineBackupRef(signal, ptr);
    return;
  }//if

  if (!DictTabInfo::isTable(tabPtr.p->tableType))
  {
    jam();
    ndbrequire(!ptr.p->is_lcp());
    TablePtr tmp = tabPtr;
    removeTableMap(tmp, ptr.i, tmp.p->tableId);
    ptr.p->tables.next(tabPtr);
    ptr.p->tables.release(tmp);
    jamLine(tmp.p->tableId);
    afterGetTabinfoLockTab(signal, ptr, tabPtr);
    return;
  }

  if (!parseTableDescription(signal, ptr, tabPtr, dst, len))
  {
    jam();
    ndbrequire(!ptr.p->is_lcp());
    defineBackupRef(signal, ptr);
    return;
  }

  if(!ptr.p->is_lcp())
  {
    jam();
    BackupLockTab *req = (BackupLockTab *)signal->getDataPtrSend();
    req->m_senderRef = reference();
    req->m_tableId = tabPtr.p->tableId;
    req->m_lock_unlock = BackupLockTab::LOCK_TABLE;
    req->m_backup_state = BackupLockTab::GET_TABINFO_CONF;
    req->m_backupRecordPtr_I = ptr.i;
    req->m_tablePtr_I = tabPtr.i;
    sendSignal(DBDICT_REF, GSN_BACKUP_LOCK_TAB_REQ, signal,
               BackupLockTab::SignalLength, JBB);
    if (ERROR_INSERTED(10038))
    {
      /* Test */
      AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
      ord->backupId = ptr.p->backupId;
      ord->backupPtr = ptr.i;
      ord->requestType = AbortBackupOrd::ClientAbort;
      ord->senderData= ptr.p->clientData;
      sendSignal(ptr.p->masterRef, GSN_ABORT_BACKUP_ORD, signal,
                 AbortBackupOrd::SignalLength, JBB);
    }
    return;
  }
  else
  {
    jam();
    ndbrequire(filePtr.i == ptr.p->prepareDataFilePtr[0]);
    lcp_open_data_file_done(signal,
                            ptr);
    return;
  }
}

void
Backup::afterGetTabinfoLockTab(Signal *signal,
                               BackupRecordPtr ptr, TablePtr tabPtr)
{
  if(tabPtr.i == RNIL)
  {
    /**
     * Done with all tables...
     */
    jam();

    ndbrequire(ptr.p->tables.first(tabPtr));
    ndbrequire(!ptr.p->is_lcp());
    DihScanTabReq * req = (DihScanTabReq*)signal->getDataPtrSend();
    req->senderRef = reference();
    req->senderData = ptr.i;
    req->tableId = tabPtr.p->tableId;
    req->schemaTransId = 0;
    req->jamBufferPtr = jamBuffer();
    EXECUTE_DIRECT_MT(DBDIH, GSN_DIH_SCAN_TAB_REQ, signal,
               DihScanTabReq::SignalLength, 0);
    DihScanTabConf * conf = (DihScanTabConf*)signal->getDataPtr();
    ndbrequire(conf->senderData == 0);
    conf->senderData = ptr.i;
    execDIH_SCAN_TAB_CONF(signal);
    return;
  }//if

  /**
   * Fetch next table...
   */
  signal->theData[0] = BackupContinueB::BUFFER_FULL_META;
  signal->theData[1] = ptr.i;
  signal->theData[2] = tabPtr.i;
  signal->theData[3] = 0;
  sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
  return;
}

bool
Backup::parseTableDescription(Signal* signal,
                              BackupRecordPtr ptr,
                              TablePtr tabPtr,
                              const Uint32 * tabdescptr,
                              Uint32 len)
{
  SimplePropertiesLinearReader it(tabdescptr, len);

  it.first();

  DictTabInfo::Table tmpTab; tmpTab.init();
  SimpleProperties::UnpackStatus stat;
  stat = SimpleProperties::unpack(it, &tmpTab,
                                  DictTabInfo::TableMapping,
                                  DictTabInfo::TableMappingSize);
  ndbrequire(stat == SimpleProperties::Break);

  bool lcp = ptr.p->is_lcp();

  ndbrequire(tabPtr.p->tableId == tmpTab.TableId);
  ndbrequire(lcp || (tabPtr.p->tableType == tmpTab.TableType));

  /**
   * An LCP should not save disk attributes, only in-memory attributes
   */

  /**
   * Initialize table object
   */
  tabPtr.p->noOfRecords = 0;
  tabPtr.p->schemaVersion = tmpTab.TableVersion;
  tabPtr.p->triggerIds[0] = ILLEGAL_TRIGGER_ID;
  tabPtr.p->triggerIds[1] = ILLEGAL_TRIGGER_ID;
  tabPtr.p->triggerIds[2] = ILLEGAL_TRIGGER_ID;
  tabPtr.p->triggerAllocated[0] = false;
  tabPtr.p->triggerAllocated[1] = false;
  tabPtr.p->triggerAllocated[2] = false;

  tabPtr.p->noOfAttributes = tmpTab.NoOfAttributes;
  tabPtr.p->maxRecordSize = 1; // LEN word
  bzero(tabPtr.p->attrInfo, sizeof(tabPtr.p->attrInfo));

  if (lcp)
  {
    jam();
    AttributeHeader::init(tabPtr.p->attrInfo, AttributeHeader::READ_LCP, 0);
  }
  else
  {
    jam();
    AttributeHeader::init(tabPtr.p->attrInfo, AttributeHeader::READ_ALL,
                          tmpTab.NoOfAttributes);
  }

  Uint32 varsize = 0;
  Uint32 disk = 0;
  Uint32 null = 0;
  for(Uint32 i = 0; i<tmpTab.NoOfAttributes; i++) {
    jam();
    DictTabInfo::Attribute tmp; tmp.init();
    stat = SimpleProperties::unpack(it, &tmp,
                                    DictTabInfo::AttributeMapping,
                                    DictTabInfo::AttributeMappingSize);

    ndbrequire(stat == SimpleProperties::Break);
    it.next(); // Move past EndOfAttribute

    if(lcp && tmp.AttributeStorageType == NDB_STORAGETYPE_DISK)
    {
      disk++;
      continue;
    }

    if (tmp.AttributeArrayType != NDB_ARRAYTYPE_FIXED)
      varsize++;

    if (tmp.AttributeNullableFlag)
      null++;

    if (tmp.AttributeSize == 0)
    {
      tabPtr.p->maxRecordSize += (tmp.AttributeArraySize + 31) >> 5;
    }
    else
    {
      const Uint32 arr = tmp.AttributeArraySize;
      const Uint32 sz = 1 << tmp.AttributeSize;
      const Uint32 sz32 = (sz * arr + 31) >> 5;

      tabPtr.p->maxRecordSize += sz32;
    }
  }

  tabPtr.p->attrInfoLen = 1;

  if (lcp)
  {
    jam();
    Dbtup* tup = (Dbtup*)globalData.getBlock(DBTUP, instance());
    tabPtr.p->maxRecordSize = 1 + tup->get_max_lcp_record_size(tmpTab.TableId);
  }
  else
  {
    // mask
    tabPtr.p->maxRecordSize += 1 + ((tmpTab.NoOfAttributes + null + 31) >> 5);
    tabPtr.p->maxRecordSize += (2 * varsize + 3) / 4;
  }
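  // Worked example of the backup-path sizing above: a table with one
  // nullable 32-bit attribute gives sz32 = (32 * 1 + 31) >> 5 = 1 word, so
  // maxRecordSize = 1 (LEN) + 1 (data) + 1 + ((1 + 1 + 31) >> 5) = 4 words.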

  return true;
}

void
Backup::execDIH_SCAN_TAB_CONF(Signal* signal)
{
  jamEntry();
  DihScanTabConf * conf = (DihScanTabConf*)signal->getDataPtr();
  const Uint32 fragCount = conf->fragmentCount;
  const Uint32 tableId = conf->tableId;
  const Uint32 senderData = conf->senderData;
  const Uint32 scanCookie = conf->scanCookie;
  ndbrequire(conf->reorgFlag == 0); // no backup during table reorg

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, senderData);

  TablePtr tabPtr;
  ndbrequire(findTable(ptr, tabPtr, tableId));

  tabPtr.p->m_scan_cookie = scanCookie;
  ndbrequire(tabPtr.p->fragments.seize(fragCount) != false);
  for(Uint32 i = 0; i<fragCount; i++) {
    jam();
    FragmentPtr fragPtr;
    tabPtr.p->fragments.getPtr(fragPtr, i);
    fragPtr.p->scanned = 0;
    fragPtr.p->scanning = 0;
    fragPtr.p->tableId = tableId;
    fragPtr.p->fragmentId = i;
    fragPtr.p->lqhInstanceKey = 0;
    fragPtr.p->node = 0;
  }//for

  /**
   * Next table
   */
  if(ptr.p->tables.next(tabPtr))
  {
    jam();
    DihScanTabReq * req = (DihScanTabReq*)signal->getDataPtrSend();
    req->senderRef = reference();
    req->senderData = ptr.i;
    req->tableId = tabPtr.p->tableId;
    req->schemaTransId = 0;
    req->jamBufferPtr = jamBuffer();
    EXECUTE_DIRECT_MT(DBDIH, GSN_DIH_SCAN_TAB_REQ, signal,
                      DihScanTabReq::SignalLength, 0);
    jamEntry();
    DihScanTabConf * conf = (DihScanTabConf*)signal->getDataPtr();
    ndbrequire(conf->senderData == 0);
    conf->senderData = ptr.i;
    /* conf is already set up properly to be sent as a signal. */
    /* Real-time break to ensure we don't run for too long in one signal. */
    sendSignal(reference(), GSN_DIH_SCAN_TAB_CONF, signal,
               DihScanTabConf::SignalLength, JBB);
    return;
  }//if

  ptr.p->tables.first(tabPtr);
  getFragmentInfo(signal, ptr, tabPtr, 0);
}

void
Backup::getFragmentInfo(Signal* signal,
                        BackupRecordPtr ptr, TablePtr tabPtr, Uint32 fragNo)
{
  Uint32 loopCount = 0;
  jam();

  for(; tabPtr.i != RNIL; ptr.p->tables.next(tabPtr)) {
    jam();
    const Uint32 fragCount = tabPtr.p->fragments.getSize();
    for(; fragNo < fragCount; fragNo ++) {
      jam();
      FragmentPtr fragPtr;
      tabPtr.p->fragments.getPtr(fragPtr, fragNo);

      if(fragPtr.p->scanned == 0 && fragPtr.p->scanning == 0) {
        jam();
        DiGetNodesReq * const req = (DiGetNodesReq *)&signal->theData[0];
        req->tableId = tabPtr.p->tableId;
        req->hashValue = fragNo;
        req->distr_key_indicator = ZTRUE;
        req->anyNode = 0;
        req->scan_indicator = ZTRUE;
        req->jamBufferPtr = jamBuffer();
        req->get_next_fragid_indicator = 0;
        EXECUTE_DIRECT_MT(DBDIH, GSN_DIGETNODESREQ, signal,
                          DiGetNodesReq::SignalLength, 0);
        jamEntry();
        DiGetNodesConf * conf = (DiGetNodesConf *)&signal->theData[0];
        Uint32 reqinfo = conf->reqinfo;
        Uint32 nodeId = conf->nodes[0];
        /* Require successful read of table fragmentation */
        ndbrequire(conf->zero == 0);
        Uint32 instanceKey = (reqinfo >> 24) & 127;
        fragPtr.p->lqhInstanceKey = instanceKey;
        fragPtr.p->node = nodeId;
        if (++loopCount >= DiGetNodesReq::MAX_DIGETNODESREQS ||
            ERROR_INSERTED(10046))
        {
          jam();
          if (ERROR_INSERTED(10046))
          {
            CLEAR_ERROR_INSERT_VALUE;
          }
          signal->theData[0] = BackupContinueB::ZGET_NEXT_FRAGMENT;
          signal->theData[1] = ptr.i;
          signal->theData[2] = tabPtr.p->tableId;
          signal->theData[3] = fragNo + 1;
          sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
          return;
        }
      }//if
    }//for

    DihScanTabCompleteRep*rep= (DihScanTabCompleteRep*)signal->getDataPtrSend();
    rep->tableId = tabPtr.p->tableId;
    rep->scanCookie = tabPtr.p->m_scan_cookie;
    rep->jamBufferPtr = jamBuffer();
    EXECUTE_DIRECT_MT(DBDIH, GSN_DIH_SCAN_TAB_COMPLETE_REP, signal,
                      DihScanTabCompleteRep::SignalLength, 0);

    fragNo = 0;
  }//for


  getFragmentInfoDone(signal, ptr);
}

void
Backup::getFragmentInfoDone(Signal* signal, BackupRecordPtr ptr)
{
  ptr.p->m_gsn = GSN_DEFINE_BACKUP_CONF;
  ptr.p->slaveState.setState(DEFINED);
  DefineBackupConf * conf = (DefineBackupConf*)signal->getDataPtrSend();
  conf->backupPtr = ptr.i;
  conf->backupId = ptr.p->backupId;
  sendSignal(ptr.p->senderRef, GSN_DEFINE_BACKUP_CONF, signal,
             DefineBackupConf::SignalLength, JBB);
}


/*****************************************************************************
 *
 * Slave functionality: Start backup
 *
 *****************************************************************************/
void
Backup::execSTART_BACKUP_REQ(Signal* signal)
{
  jamEntry();

  CRASH_INSERTION((10015));

  StartBackupReq* req = (StartBackupReq*)signal->getDataPtr();
  const Uint32 ptrI = req->backupPtr;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  ptr.p->slaveState.setState(STARTED);
  ptr.p->m_gsn = GSN_START_BACKUP_REQ;

  /* At this point, we are effectively starting
   * bulk file writes for this backup, so let's
   * record the fact.
   */
  Backup::g_is_single_thr_backup_running = true;

  /**
   * Start file threads...
   */
  BackupFilePtr filePtr;
  for(ptr.p->files.first(filePtr); filePtr.i!=RNIL;ptr.p->files.next(filePtr))
  {
    jam();
    if(! (filePtr.p->m_flags & BackupFile::BF_FILE_THREAD))
    {
      jam();
      filePtr.p->m_flags |= BackupFile::BF_FILE_THREAD;
      signal->theData[0] = BackupContinueB::START_FILE_THREAD;
      signal->theData[1] = filePtr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    }//if
  }//for

  /* A backup needs to be restored to a consistent point, for
   * which it uses a fuzzy scan and a log.
   *
   * The fuzzy scan is restored, and then the log is replayed
   * idempotently up to some consistent point which is after
   * (SNAPSHOTEND) or before (SNAPSHOTSTART) any of the states
   * captured in the scan.
   *
   * This requires that the backup is captured in order:
   * 1) Start recording logs of all committed transactions
   * 2) Choose SNAPSHOTSTART consistent point
   * 3) Perform data scan
   * 4) Choose SNAPSHOTEND consistent point
   * 5) Stop recording logs
   *
   * Tell DBTUP to create triggers to start recording logs
   */
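  /* How restore is assumed to use the two consistency options above
   * (a sketch, not a step performed here): with SNAPSHOTEND the log is
   * replayed forward past the scan state, while with SNAPSHOTSTART an
   * undo-style log is applied backwards to a point before the scan; see
   * the USE_UNDO_LOG handling in openFiles().
   */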
7523   TablePtr tabPtr;
7524   ndbrequire(ptr.p->tables.first(tabPtr));
7525   sendCreateTrig(signal, ptr, tabPtr);
7526 }
7527 
7528 /*****************************************************************************
7529  *
7530  * Slave functionallity: Backup fragment
7531  *
7532  *****************************************************************************/
7533 void
execBACKUP_FRAGMENT_REQ(Signal * signal)7534 Backup::execBACKUP_FRAGMENT_REQ(Signal* signal)
7535 {
7536   jamEntry();
7537   BackupFragmentReq* req = (BackupFragmentReq*)signal->getDataPtr();
7538 
7539   CRASH_INSERTION((10016));
7540 
7541   const Uint32 ptrI = req->backupPtr;
7542   //const Uint32 backupId = req->backupId;
7543   const Uint32 tableId = req->tableId;
7544   const Uint32 fragNo = req->fragmentNo;
7545   const Uint32 count = req->count;
7546 
7547   /**
7548    * Get backup record
7549    */
7550   BackupRecordPtr ptr;
7551   BackupFilePtr filePtr;
7552   TablePtr tabPtr;
7553 
7554   c_backupPool.getPtr(ptr, ptrI);
7555 
7556   if (ptr.p->is_lcp())
7557   {
7558     jam();
7559     start_execute_lcp(signal, ptr, tabPtr, tableId);
7560     if (ptr.p->m_empty_lcp)
7561     {
7562       /**
7563        * No need to start LCP processing in this case, we only
7564        * update LCP control file and this process has already
7565        * been started when we come here.
7566        */
7567       jam();
7568     }
7569     else
7570     {
7571       jam();
7572       start_lcp_scan(signal, ptr, tabPtr, ptrI, fragNo);
7573     }
7574     return;
7575   }
7576   else
7577   {
7578     jam();
7579     /* Backup path */
7580     if (ERROR_INSERTED(10039))
7581     {
7582       sendSignalWithDelay(reference(), GSN_BACKUP_FRAGMENT_REQ, signal,
7583                           300, signal->getLength());
7584       return;
7585     }
7586     /* Get Table */
7587     ndbrequire(findTable(ptr, tabPtr, tableId));
7588   }
7589   c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[0]);
7590 
7591   ptr.p->slaveState.setState(SCANNING);
7592   ptr.p->m_gsn = GSN_BACKUP_FRAGMENT_REQ;
7593 
7594   ndbrequire(filePtr.p->backupPtr == ptrI);
7595 
7596   /* Get fragment */
7597   FragmentPtr fragPtr;
7598   tabPtr.p->fragments.getPtr(fragPtr, fragNo);
7599 
7600   ndbrequire(fragPtr.p->scanned == 0);
7601   ndbrequire(fragPtr.p->scanning == 0 ||
7602 	     refToNode(ptr.p->masterRef) == getOwnNodeId());
7603 
7604   /**
7605    * Init operation
7606    */
7607   if (filePtr.p->tableId != tableId)
7608   {
7609     jam();
7610     DEB_EXTRA_LCP(("(%u)Init new tab(%u): maxRecordSize: %u",
7611                    instance(),
7612                    tableId,
7613                    tabPtr.p->maxRecordSize));
7614     filePtr.p->operation.init(tabPtr);
7615     filePtr.p->tableId = tableId;
7616   }//if
7617 
7618   /**
7619    * Check for space in buffer
7620    */
7621   if(!filePtr.p->operation.newFragment(tableId, fragPtr.p->fragmentId)) {
7622     jam();
7623     ndbrequire(!ptr.p->is_lcp());
7624     req->count = count + 1;
7625     sendSignalWithDelay(reference(), GSN_BACKUP_FRAGMENT_REQ, signal,
7626                         WaitDiskBufferCapacityMillis,
7627 			signal->length());
7628     ptr.p->slaveState.setState(STARTED);
7629     return;
7630   }//if
7631 
7632   /**
7633    * Mark things as "in use"
7634    */
7635   fragPtr.p->scanning = 1;
7636   filePtr.p->fragmentNo = fragPtr.p->fragmentId;
7637   filePtr.p->m_retry_count = 0;
7638 
7639   ndbrequire(filePtr.p->m_flags ==
7640 	     (BackupFile::BF_OPEN | BackupFile::BF_FILE_THREAD));
7641   sendScanFragReq(signal, ptr, filePtr, tabPtr, fragPtr, 0);
7642 }

void
Backup::start_lcp_scan(Signal *signal,
                       BackupRecordPtr ptr,
                       TablePtr tabPtr,
                       Uint32 ptrI,
                       Uint32 fragNo)
{
  BackupFilePtr filePtr;
  FragmentPtr fragPtr;

  DEB_EXTRA_LCP(("(%u)Start lcp scan",
                 instance()));

  ptr.p->slaveState.setState(SCANNING);
  ptr.p->m_gsn = GSN_BACKUP_FRAGMENT_REQ;

  /* Get fragment */
  tabPtr.p->fragments.getPtr(fragPtr, fragNo);

  c_tup->start_lcp_scan(tabPtr.p->tableId,
                        fragPtr.p->fragmentId,
                        ptr.p->m_lcp_max_page_cnt);
  ptr.p->m_is_lcp_scan_active = true;
  ptr.p->m_lcp_current_page_scanned = 0;

  /**
   * Now the LCP has started for this fragment. The following
   * things have to be done in the same real-time break.
   *
   * 1) Write an LCP entry into the UNDO log.
   * 2) Get the number of pages to checkpoint.
   * 3) Inform TUP that the LCP scan has started.
   *
   * It is not absolutely necessary to start the actual LCP scan
   * in the same real-time break. We use this opportunity to open
   * any extra LCP files that this LCP needs. If only one is needed
   * it has already been opened and we can proceed immediately.
   * However, large fragments that have seen a large number of writes
   * since the last LCP can require multiple LCP files. These
   * extra LCP files are opened before we actually start the
   * LCP scan.
   */

  ndbrequire(fragPtr.p->scanned == 0);
  ndbrequire(fragPtr.p->scanning == 0 ||
             refToNode(ptr.p->masterRef) == getOwnNodeId());

  ptr.p->m_last_data_file_number =
    get_file_add(ptr.p->m_first_data_file_number,
                 ptr.p->m_num_lcp_files - 1);

  init_file_for_lcp(signal, 0, ptr, ptrI);
  if (ptr.p->m_num_lcp_files > 1)
  {
    jam();
    for (Uint32 i = 1; i < ptr.p->m_num_lcp_files; i++)
    {
      jam();
      lcp_open_data_file_late(signal, ptr, i);
    }
    return;
  }
  c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[0]);
  Uint32 delay = 0;
  if (ERROR_INSERTED(10047))
  {
    g_eventLogger->info("(%u)Start LCP on tab(%u,%u) 3 seconds delay, max_page: %u",
                        instance(),
                        tabPtr.p->tableId,
                        fragPtr.p->fragmentId,
                        ptr.p->m_lcp_max_page_cnt);

    if (ptr.p->m_lcp_max_page_cnt > 20)
    {
      delay = 9000;
    }
  }
  sendScanFragReq(signal, ptr, filePtr, tabPtr, fragPtr, delay);
}

void
Backup::init_file_for_lcp(Signal *signal,
                          Uint32 index,
                          BackupRecordPtr ptr,
                          Uint32 ptrI)
{
  TablePtr tabPtr;
  FragmentPtr fragPtr;
  BackupFilePtr filePtr;
  ptr.p->tables.first(tabPtr);
  tabPtr.p->fragments.getPtr(fragPtr, 0);

  c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[index]);
  ndbrequire(filePtr.p->backupPtr == ptrI);

  /**
   * Init operation
   */
  DEB_EXTRA_LCP(("(%u)Init new tab(%u): maxRecordSize: %u",
                 instance(),
                 tabPtr.p->tableId,
                 tabPtr.p->maxRecordSize));
  filePtr.p->operation.init(tabPtr);
  filePtr.p->tableId = tabPtr.p->tableId;

  /**
   * Mark things as "in use"
   */
  fragPtr.p->scanning = 1;
  filePtr.p->m_retry_count = 0;
  filePtr.p->m_lcp_inserts = 0;
  filePtr.p->m_lcp_writes = 0;
  filePtr.p->m_lcp_delete_by_rowids = 0;
  filePtr.p->m_lcp_delete_by_pageids = 0;

  filePtr.p->fragmentNo = 0;

  ndbrequire(filePtr.p->operation.newFragment(tabPtr.p->tableId,
                                              fragPtr.p->fragmentId));

  /**
   * Start the file thread now that we will also start writing
   * fragment checkpoint data.
   */
  ndbrequire(filePtr.p->m_flags == BackupFile::BF_OPEN);
  filePtr.p->m_flags |= BackupFile::BF_FILE_THREAD;

  signal->theData[0] = BackupContinueB::START_FILE_THREAD;
  signal->theData[1] = filePtr.i;
  signal->theData[2] = __LINE__;
  sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
}

/**
 * Backups and LCPs are actions that operate on a long time-scale compared to
 * other activities in the cluster. We also have a number of similar
 * activities that operate on a longer time scale. These operations have to
 * continue to operate at some decent level even if user transactions are
 * arriving at extreme rates.
 *
 * Not providing sufficient activity for LCPs might mean that we run out of
 * REDO log; this means that no writing user transactions are allowed until
 * we have completed an LCP. Clearly this is not a desirable user experience.
 * So we need to find a balance between long-term needs and short-term needs
 * in scheduling LCPs and Backups versus normal user transactions.
 *
 * When designing those scheduling algorithms we need to remember the design
 * aim for the NDB storage engine. We want to ensure that NDB can be used in
 * soft real-time applications such as financial applications and telecom
 * applications. We do not aim for hard real-time applications such as
 * controlling power plants where missing a deadline can lead to major
 * catastrophes.
 *
 * Using NDB for a soft real-time application can still be done at different
 * levels of real-time requirements. If the aim is to provide that more or
 * less 100% of the transactions complete in say 100 microseconds then a
 * certain level of control is needed also from the application.
 *
 * Things that will affect scheduling in NDB are:
 * 1) Use of large rows
 *   NDB will schedule at least one row at a time. There are currently very
 *   few places where execution of one row operation contains breaks for
 *   scheduling. Executing a row operation on the maximum row size of
 *   around 14 kBytes means that signals can execute for up to about 20
 *   microseconds as of 2018. Clearly using smaller rows can give a better
 *   response time experience.
 *
 * 2) Using complex conditions per row
 *   NDB supports pushing down conditions on rows in both key operations and
 *   scan operations and even on join operations. Clearly if these pushed
 *   conditions are very complex the time to execute them per row can extend
 *   the time spent in executing one particular signal. Normal conditions
 *   involving one or a number of columns don't present a problem, but
 *   SQL has no specific limits on conditions, so extremely complex
 *   conditions are possible to construct.
 *
 * 3) Metadata operations
 *   Creating tables and indexes can involve some operations that take a bit
 *   longer to execute. However using the multi-threaded data nodes (ndbmtd)
 *   means that most of these signals are executed in threads that are not
 *   used for normal user transactions. So using ndbmtd is here a method to
 *   decrease the response time impact of metadata operations.
 *
 * 4) Use of ndbd vs ndbmtd
 *   ndbd is a single threaded data node; ndbd receives data, operates on
 *   the data and sends the data all in one thread. In low load cases with
 *   very high requirements on response time and strict control of the
 *   application layer the use of ndbd for real-time operation can be
 *   beneficial.
 *
 *   Important here is to understand that the single-threaded nature of ndbd
 *   means that it is limited in throughput. One data node using ndbd is
 *   limited to handling on the order of 100,000 row operations per second
 *   with maintained responsiveness as of 2015. ndbmtd can achieve a few
 *   million row operations in very large configurations with maintained
 *   responsiveness.
 *
 * When looking at maintaining a balance between various operations long-term
 * it is important to consider what types of operations can go in parallel
 * in an NDB data node. These are the activities currently possible.
 *
 * 1) Normal user transactions
 *   These consist of primary key row operations, unique key row operations
 *   (these are implemented as two primary key row operations), scan operations
 *   and finally a bit more complex operations that can have both key
 *   operations and scan operations as part of them. The last category is
 *   created as part of executing SPJ operation trees that are currently used
 *   for executing complex SQL queries.
 *
 * 2) Local checkpoints (LCPs)
 *   These can operate continuously without user interaction. The LCPs are
 *   needed to ensure that we can cut the REDO log. If LCPs execute too slowly
 *   then we won't have sufficient REDO log to store all user transactions that
 *   are writing on logging tables.
 *
 * 3) Backups
 *   These are started by a user; only one backup at a time is allowed. These
 *   can be stored offsite and used by the user to restore NDB to a former
 *   state, either as an emergency fix, to start up a new cluster, or as part
 *   of setting up a slave cluster. A backup consists of a data file per data
 *   node, one log file of changes since the backup started, and a control
 *   file. It is important that the backup maintains a level of speed such
 *   that the system doesn't run out of disk space for the log file.
 *
 * 4) Metadata operations
 *   There are many different types of metadata operations. One can define
 *   new tables, indexes, foreign keys, tablespaces. One can also rearrange
 *   the tables for a new number of nodes as part of adding nodes to the
 *   cluster. There are also operations to analyse tables, optimise tables
 *   and so forth. Most of these are fairly short in duration and usage of
 *   resources. But there are a few of them, such as rearranging tables for
 *   a new set of nodes, that require shuffling data around in the cluster.
 *   This can be a fairly long-running operation.
 *
 * 5) Event operations
 *   To support replication from one MySQL Cluster to another MySQL Cluster
 *   or a different MySQL storage engine we use event operations.
 *   These always operate as part of the normal user transactions, so they
 *   do not constitute anything to consider in the balance between long-term
 *   and short-term needs. In addition, in ndbmtd much of the processing
 *   happens in a special thread for event operations.
 *
 * 6) Node synchronisation during node recovery
 *   Recovery as such normally happens when no user transactions are happening
 *   and thus has no special requirements on maintaining a balance between
 *   short-term needs and long-term needs since recovery is always a long-term
 *   operation that has no competing short-term operations. There is however
 *   one exception to this and that is during node recovery when the starting
 *   node needs to synchronize its data with a live node. In this case the
 *   starting node has recovered an old version of the data node using LCPs
 *   and REDO logs and has rebuilt the indexes. At this point it needs to
 *   synchronize the data in each table with a live node within the same node
 *   group.
 *
 *   This synchronization happens row by row controlled by the live node. The
 *   live node scans its own data and checks each row against the global
 *   checkpoint id (GCI) that the starting node has restored. If the row has
 *   been updated with a more recent GCI then the row needs to be sent over
 *   to the starting node.
 *
 *   Only one node recovery per node group at a time is possible when using
 *   two replicas.
 *
 * So there can be as many as 4 long-term operations running in parallel to
 * the user transactions. These are 1 LCP scan, 1 Backup scan, 1 node recovery
 * scan and finally 1 metadata scan. All of these long-running operations
 * perform scans of table partitions (fragments). LCPs scan a partition and
 * write rows into an LCP file. Backups scan a partition and write the result
 * into a backup file. Node recovery scans search for rows that have been
 * updated since the GCI recovered in the starting node, and each row
 * found is sent over to the starting node. Metadata scans read either
 * all rows or rows matching some condition, and can then use this information
 * to send the row to another node, to build an index, to build a foreign key
 * index, or for some other online operation which is performed in parallel
 * to user transactions.
 *
 * From this analysis it's clear that we don't want any long-running operation
 * to consume any major part of the resources. It's desirable that user
 * transactions can use at least about half of the resources even when running
 * in parallel with all four of those activities. Node recovery is slightly
 * more important than the other activities; this means that our aim should
 * be to ensure that LCPs, Backups and metadata operations can use at least
 * about 10% of the CPU resources and that node recovery operations can use
 * at least about 20% of the CPU resources. Obviously they should be able to
 * use more resources when there are fewer user transactions competing for the
 * resources. But we should try to maintain this level of CPU usage for LCPs
 * and Backups even when the user load is at extreme levels.
 *
 * There is no absolute way of ensuring 10% CPU usage for a certain activity.
 * We use a number of magic numbers controlling the algorithms to ensure this.
 *
 * At first we use the coding rule that one signal should never execute for
 * more than 10 microseconds in the normal case. There are exceptions to this
 * rule as explained above, but they should be outliers that won't affect the
 * long-term rates very much.
 *
 * Second we use the scheduling classes we have access to. The first is B-level
 * signals; these can have an arbitrarily long queue of other jobs waiting
 * before they are executed, so these have no bound on when they execute. We
 * also have special signals that execute with a bounded delay; an individual
 * signal can be delayed more than a B-level signal, but the scheduler ensures
 * that at most 100 B-level signals execute before they are executed. Normally
 * it would even operate with at most 75 B-level signals executed even in high
 * load scenarios and mostly even better than that. We achieve this by calling
 * sendSignalWithDelay with timeout BOUNDED_DELAY.
 *
 * So how fast can an LCP run that is using about 10% of the CPU? In a fairly
 * standard CPU of 2015, not a high-end, but also not at the very low-end,
 * the CPU can produce about 150 MBytes of data for LCPs per second. This is
 * using 100 byte rows. So this constitutes about 1.5M rows per second plus
 * transporting 150 MBytes of data to the write buffers in the Backup block.
 * So we use a formula here where we assume that the fixed cost of scanning
 * a row is about 550 ns and the cost per word of data is 4 ns. The reason we
 * use a different formula for LCP scans compared to the formula we assume in
 * DBLQH for generic scans is that the copy of data is per row for LCPs
 * whereas it is per column for generic scans. Similarly we never use any
 * scan filters for LCPs, we only check for LCP_SKIP bits and FREE bits.
 * This is much more efficient compared to generic scan filters.
 *
 * At very high load we will assume that we have to wait about 50 signals
 * when sending BOUNDED_DELAY signals. Worst case can be up to about 100
 * signals, but the worst case won't happen very often and more commonly
 * it will be much less than that.
 * The mean execution time of signals is about 5 microseconds. This means
 * that by constantly using bounded delay signals we ensure that we get at
 * least around 4000 executions per second. So this means that
 * in extreme overload situations we can allow for execution to go on
 * for up to about 25 microseconds without giving B-level signals access.
 * 25 microseconds times 4000 is 100 milliseconds so about 10% of the
 * CPU usage.
 *
 * LCPs and Backups also operate using conditions on how fast they can write
 * to the disk subsystem. The user can configure these numbers; the LCPs
 * and Backups get a quota per 100 milliseconds. So if the LCPs and Backups
 * run too fast they will pause for a part of those 100 milliseconds. However
 * it is a good idea to set the minimum disk write speed to at least 20%
 * of the possible CPU speed. So this means setting it to 30 MByte per
 * second. In high-load scenarios we might not be able to process more
 * than 15 MByte per second, but as soon as user load and other load
 * goes down we will get back to the higher write speed.
 *
 * Scans operate in the following fashion, which is an important input to
 * the construction of the magic numbers. We start a scan with SCAN_FRAGREQ
 * and here we don't really know the row sizes other than the maximum row
 * size. This SCAN_FRAGREQ will return 16 rows and then it will return
 * SCAN_FRAGCONF. For each row it will return a TRANSID_AI signal.
 * If we haven't used our quota for writing LCPs and Backups AND there is
 * still room in the backup write buffer then we will continue with another
 * set of 16 rows. These will be retrieved using the SCAN_NEXTREQ signal
 * and the response to this signal will be SCAN_FRAGCONF when done with the
 * 16 rows (or all rows scanned).
 *
 * Processing 16 rows takes about 8800 ns on standard HW of 2015, and so even
 * for minimal rows we will use at least 10000 ns if we execute an entire batch
 * of 16 rows without providing access for other B-level signals. So the
 * absolute maximum number of rows that we will ever execute without
 * giving access for B-level signals is 32 rows, so that we don't go beyond
 * the allowed quota of 25 microseconds without giving B-level priority
 * signals access; this means two SCAN_FRAGREQ/SCAN_NEXTREQ executions.
 *
 * Using the formula we derive that we should never start another set of
 * 16 rows if we have passed 1500 words in the previous batch of 16 rows.
 * Even when deciding in the Backup block to send an entire batch of 16
 * rows at A-level we will never allow gathering to continue when we have
 * already gathered more than 4000 words. When we reach this limit we will
 * send another bounded delay signal. The reason is that we've already
 * reached sufficient CPU usage and going further would go beyond 15%.
 *
 * The boundaries 1500 and 4000 are actually based on using 15% of the CPU
 * resources, which is better if not all four activities happen at the
 * same time. When we support rate control on all activities we need to
 * adaptively decrease this limit to ensure that the total rate controlled
 * effort doesn't go beyond 50%.
 *
 * The limit 4000 is ZMAX_WORDS_PER_SCAN_BATCH_HIGH_PRIO set in DblqhMain.cpp.
 * This constant limits the impact of wide rows on responsiveness.
 *
 * When operating in normal mode, we will not continue gathering when we
 * have already gathered at least 500 words. However we will only operate in
 * this mode when we are in a low load scenario, in which case this speed will
 * be quite sufficient. This limit is to ensure that we don't go beyond
 * normal real-time break limits in normal operations. This limits LCP
 * execution during normal load to around 3-4 microseconds.
 *
 * In the following paragraph a high priority of LCPs means that we need to
 * raise LCP priority to maintain the LCP write rate at the expense of user
 * traffic responsiveness. Low priority means that we can get sufficient
 * LCP write rates even with normal responsiveness to user requests.
 *
 * Finally we have to make a decision when we should execute at high priority
 * and when to operate at normal priority. Obviously we should avoid entering
 * high priority mode as much as possible since it will affect response times.
 * At the same time, once we have entered this mode we need to have some
 * memory of it. The reason is that we will have lost some ground while
 * executing at normal priority when the job buffers were long. We will limit
 * the memory to at most 16 executions of 16 rows at high priority. Each
 * time we start a new execution we will see if we need to add to this
 * "memory". We will add one per 48 signals that we had to wait for between
 * executing a set of 16 rows (normally this means execution of 3 bounded
 * delay signals). When the load level is even higher, we will add to
 * the memory such that we operate in high priority mode a bit longer since
 * we are likely to have missed a bit more opportunity to perform LCP scans
 * in this overload situation.
 *
 * The following "magic" constants control these algorithms:
 * 1) ZMAX_SCAN_DIRECT_COUNT set to 5
 * Means that at most 6 rows will be scanned per execute direct, set in
 * Dblqh.hpp. This applies to all scan types, not only to LCP scans.
 *
 * 2) ZMAX_WORDS_PER_SCAN_BATCH_LOW_PRIO set to 1600
 * This controls the maximum number of words that is allowed to be gathered
 * before we decide to do a real-time break when executing at normal
 * priority level. This is defined in Backup.hpp. This will execute for about
 * 2 microseconds.
 *
 * 3) ZMAX_WORDS_PER_SCAN_BATCH_HIGH_PRIO set to 8000
 * This controls the maximum words gathered before we decide to send the
 * next row to be scanned in another bounded delay signal. This is defined in
 * Backup.hpp. In this case the Backup block decided to execute on priority A
 * level due to a high load in the node. This limit is set to execute for about
 * 10 microseconds (around 300 MBytes can be written per second per CPU).
 * LCPs can override this limit with a multiplication factor of
 * m_redo_alert_factor.
 *
 * We will always use the priority A-level when the REDO log limit has been
 * reached to ensure that we execute proper batches already when seeing the
 * first signs of REDO log overload.
 *
 * 4) MAX_LCP_WORDS_PER_BATCH no longer used
 *
 * 5) HIGH_LOAD_LEVEL set to 32
 * Limit on how many signals must have been executed in this LDM thread since
 * starting the last batch of 16 rows in order to enter high priority mode.
 * Defined in this block Backup.cpp.
 *
 * 6) VERY_HIGH_LOAD_LEVEL set to 48
 * For each additional multiple of this we increase the memory. So e.g. with
 * 80 signals executed since the last batch we will increase the memory by
 * two, with 128 we will increase it by three. Thus if #signals >= (32 + 48)
 * => 2, #signals >= (32 + 48 * 2) => 3 and so forth. Memory here means that
 * we will remember the high load until we have compensated for it in a
 * sufficient manner, so we will keep executing at high priority a bit longer
 * to compensate for what we lost during execution at low priority when the
 * load suddenly increased.
 * Defined in this block Backup.cpp.
 *
 * 7) MAX_RAISE_PRIO_MEMORY set to 16
 * Max memory of priority raising, so after the load disappears we will
 * execute at most an additional 16 batches of 16 rows in high priority mode
 * before going back to normal priority mode.
 * Defined in this block Backup.cpp.
 *
 * 8) NUMBER_OF_SIGNALS_PER_SCAN_BATCH set to 3
 * When starting up the algorithm we check how many signals are in the
 * B-level job buffer. Based on this number we set the initial value to
 * high priority or not. This is based on the expectation that a set of 16
 * rows is executed in 3 signals with 6 rows, 6 rows and finally 4 rows.
 * Defined in this block Backup.cpp.
 */
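
/**
 * A worked sketch of the cost model above (illustrative only; the numbers
 * are the assumed 2015-era figures quoted in this comment, and the helper
 * name is hypothetical, not a constant or function read by the code):
 *
 *   // Estimated CPU cost in nanoseconds of scanning one row for LCP:
 *   // 550 ns fixed cost plus 4 ns per word of row data.
 *   static inline Uint32 lcp_row_cost_ns(Uint32 row_words)
 *   {
 *     return 550 + 4 * row_words;
 *   }
 *
 * With minimal rows a batch of 16 rows costs about 16 * 550 = 8800 ns,
 * matching the batch cost quoted above. Two such batches (32 rows) stay
 * within the 25 microsecond budget, and at roughly 4000 bounded-delay
 * executions per second this caps LCP execution at about
 * 4000 * 25 us = 100 ms per second, i.e. the targeted 10% of one CPU.
 */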

/**
 * These routines are more or less our scheduling logic for LCPs. This is
 * how we try to achieve a balanced output from LCPs while still
 * processing normal transactions at a high rate.
 */
void Backup::init_scan_prio_level(Signal *signal, BackupRecordPtr ptr)
{
  Uint32 level = getSignalsInJBB();
  if ((level * NUMBER_OF_SIGNALS_PER_SCAN_BATCH) > HIGH_LOAD_LEVEL)
  {
    /* Ensure we use prio A and only 1 signal at prio A */
    jam();
    level = VERY_HIGH_LOAD_LEVEL;
  }
  ptr.p->m_lastSignalId = signal->getSignalId() - level;
  ptr.p->m_prioA_scan_batches_to_execute = 0;
}
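
/**
 * Worked example of the initialisation above (values from the constants
 * described in the large comment): with NUMBER_OF_SIGNALS_PER_SCAN_BATCH = 3
 * and HIGH_LOAD_LEVEL = 32, a JBB backlog of 11 signals gives 33 > 32, so
 * level is set to VERY_HIGH_LOAD_LEVEL (48). m_lastSignalId is then moved
 * 48 signals back, which makes the first call to check_scan_if_raise_prio
 * see num_signals_executed > HIGH_LOAD_LEVEL and raise the priority.
 */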

bool
Backup::check_scan_if_raise_prio(Signal *signal, BackupRecordPtr ptr)
{
  bool flag = false;
  const Uint32 current_signal_id = signal->getSignalId();
  const Uint32 lastSignalId = ptr.p->m_lastSignalId;
  Uint32 prioA_scan_batches_to_execute =
    ptr.p->m_prioA_scan_batches_to_execute;
  const Uint32 num_signals_executed = current_signal_id - lastSignalId;

  if (num_signals_executed > HIGH_LOAD_LEVEL)
  {
    jam();
    prioA_scan_batches_to_execute +=
      ((num_signals_executed + (VERY_HIGH_LOAD_LEVEL - 1)) /
        VERY_HIGH_LOAD_LEVEL);
    if (prioA_scan_batches_to_execute > MAX_RAISE_PRIO_MEMORY)
    {
      jam();
      prioA_scan_batches_to_execute = MAX_RAISE_PRIO_MEMORY;
    }
  }
  else if (ptr.p->is_lcp() &&
           m_redo_alert_state != RedoStateRep::NO_REDO_ALERT)
  {
    jam();
    prioA_scan_batches_to_execute = 1;
  }
  if (prioA_scan_batches_to_execute > 0)
  {
    jam();
    prioA_scan_batches_to_execute--;
    flag = true;
  }
  ptr.p->m_lastSignalId = current_signal_id;
  ptr.p->m_prioA_scan_batches_to_execute = prioA_scan_batches_to_execute;
  return flag;
}
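
/**
 * Worked example of the raise-prio arithmetic above: with
 * HIGH_LOAD_LEVEL = 32 and VERY_HIGH_LOAD_LEVEL = 48, if 80 signals
 * executed since the last batch we add (80 + 47) / 48 = 2 batches to the
 * priority A "memory", capped at MAX_RAISE_PRIO_MEMORY = 16. Each batch
 * started at priority A then consumes one unit of this memory.
 */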

void
Backup::sendScanFragReq(Signal* signal,
                        Ptr<BackupRecord> ptr,
                        Ptr<BackupFile> filePtr,
                        Ptr<Table> tabPtr,
                        Ptr<Fragment> fragPtr,
                        Uint32 delay)
{
  /**
   * Start scan
   */
  {
    if (!(ptr.p->is_lcp() &&
          ptr.p->m_num_lcp_files > 1))
    {
      jam();
      filePtr.p->m_flags |= BackupFile::BF_SCAN_THREAD;
    }
    else
    {
      jam();
      for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
      {
        BackupFilePtr loopFilePtr;
        c_backupFilePool.getPtr(loopFilePtr, ptr.p->dataFilePtr[i]);
        loopFilePtr.p->m_flags |= BackupFile::BF_SCAN_THREAD;
      }
    }

    Table & table = * tabPtr.p;
    ScanFragReq * req = (ScanFragReq *)signal->getDataPtrSend();
    const Uint32 parallelism = ZRESERVED_SCAN_BATCH_SIZE;

    req->senderData = filePtr.i;
    req->resultRef = reference();
    req->schemaVersion = table.schemaVersion;
    req->fragmentNoKeyLen = fragPtr.p->fragmentId;
    req->requestInfo = 0;
    req->savePointId = 0;
    req->tableId = table.tableId;
    ScanFragReq::setReadCommittedFlag(req->requestInfo, 1);
    ScanFragReq::setLockMode(req->requestInfo, 0);
    ScanFragReq::setHoldLockFlag(req->requestInfo, 0);
    ScanFragReq::setKeyinfoFlag(req->requestInfo, 0);
    ScanFragReq::setTupScanFlag(req->requestInfo, 1);
    ScanFragReq::setNotInterpretedFlag(req->requestInfo, 1);
    if (ptr.p->is_lcp())
    {
      ScanFragReq::setScanPrio(req->requestInfo, 1);
      ScanFragReq::setNoDiskFlag(req->requestInfo, 1);
      ScanFragReq::setLcpScanFlag(req->requestInfo, 1);
      NDB_TICKS now = getHighResTimer();
      ptr.p->m_scan_start_timer = now;
    }
    ptr.p->m_num_scan_req_on_prioa = 0;
    init_scan_prio_level(signal, ptr);
    if (check_scan_if_raise_prio(signal, ptr))
    {
      jam();
      ScanFragReq::setPrioAFlag(req->requestInfo, 1);
      ptr.p->m_num_scan_req_on_prioa = 1;
    }

    req->transId1 = 0;
    req->transId2 = (BACKUP << 20) + (getOwnNodeId() << 8);
    req->clientOpPtr = filePtr.i;
    req->batch_size_rows = parallelism;
    req->batch_size_bytes = 0;
    BlockReference lqhRef = 0;
    bool delay_possible = true;
    if (ptr.p->is_lcp()) {
      lqhRef = calcInstanceBlockRef(DBLQH);
    } else {
      const Uint32 instanceKey = fragPtr.p->lqhInstanceKey;
      ndbrequire(instanceKey != 0);
      lqhRef = numberToRef(DBLQH, instanceKey, getOwnNodeId());
      if (lqhRef != calcInstanceBlockRef(DBLQH))
      {
        /* We can't send delayed signals to other threads. */
        delay_possible = false;
      }
    }

    Uint32 attrInfo[25];
    memcpy(attrInfo, table.attrInfo, 4*table.attrInfoLen);
    LinearSectionPtr ptr[3];
    ptr[0].p = attrInfo;
    ptr[0].sz = table.attrInfoLen;
    if (delay_possible)
    {
      SectionHandle handle(this);
      ndbrequire(import(handle.m_ptr[0], ptr[0].p, ptr[0].sz));
      handle.m_cnt = 1;
      if (delay == 0)
      {
        jam();
        sendSignalWithDelay(lqhRef, GSN_SCAN_FRAGREQ, signal,
                            BOUNDED_DELAY, ScanFragReq::SignalLength, &handle);
      }
      else
      {
        jam();
        sendSignalWithDelay(lqhRef, GSN_SCAN_FRAGREQ, signal,
                            delay, ScanFragReq::SignalLength, &handle);
      }
    }
    else
    {
      /**
       * There is currently no way to send bounded-delay signals over to
       * another thread. So we send at priority B, but the response
       * back to us will arrive at priority A if necessary.
       */
      jam();
      sendSignal(lqhRef,
                 GSN_SCAN_FRAGREQ,
                 signal,
                 ScanFragReq::SignalLength,
                 JBB,
                 ptr,
                 1);
    }
  }
}

void
Backup::execSCAN_HBREP(Signal* signal)
{
  jamEntry();
}

void
Backup::record_deleted_pageid(Uint32 pageNo, Uint32 record_size)
{
  BackupRecordPtr ptr;
  BackupFilePtr zeroFilePtr;
  BackupFilePtr currentFilePtr;
  ptr = m_lcp_ptr;
  c_backupFilePool.getPtr(zeroFilePtr, ptr.p->dataFilePtr[0]);
  c_backupFilePool.getPtr(currentFilePtr, ptr.p->m_working_data_file_ptr);
  OperationRecord & current_op = currentFilePtr.p->operation;
  OperationRecord & zero_op = zeroFilePtr.p->operation;
  ndbrequire(ptr.p->m_num_parts_in_this_lcp != BackupFormat::NDB_MAX_LCP_PARTS);
  Uint32 * dst = current_op.dst;
  Uint32 dataLen = 2;
  Uint32 copy_array[2];
  copy_array[0] = pageNo;
  copy_array[1] = record_size;
  DEB_LCP_DEL(("(%u) DELETE_BY_PAGEID: page(%u)",
                instance(),
                pageNo));
  *dst = htonl(Uint32(dataLen + (BackupFormat::DELETE_BY_PAGEID_TYPE << 16)));
  memcpy(dst + 1, copy_array, dataLen*sizeof(Uint32));
  ndbrequire(dataLen < zero_op.maxRecordSize);
  zeroFilePtr.p->m_lcp_delete_by_pageids++;
  zero_op.finished(dataLen);
  current_op.newRecord(dst + dataLen + 1);
  ptr.p->noOfRecords++;
  ptr.p->noOfBytes += (4*(dataLen + 1));
  ptr.p->m_bytes_written += (4*(dataLen + 1));
  /**
   * LCP keep pages are handled out of order, so we have prepared here,
   * before calling NEXT_SCANCONF, by temporarily changing the current
   * data file used.
   * Since scans use deep call chaining we restore the current data file
   * immediately after each row written into the LCP data file. The same
   * happens also for TRANSID_AI and record_deleted_rowid.
   */
  restore_current_page(ptr);
}
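
/**
 * Sketch of the on-disk record produced above (per this function; the
 * header word goes through htonl):
 *
 *   word 0: (BackupFormat::DELETE_BY_PAGEID_TYPE << 16) | dataLen, dataLen = 2
 *   word 1: pageNo
 *   word 2: record_size
 *
 * record_deleted_rowid below follows the same layout with
 * DELETE_BY_ROWID_TYPE, dataLen = 3 and words pageNo, pageIndex, gci.
 */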

void
Backup::record_deleted_rowid(Uint32 pageNo, Uint32 pageIndex, Uint32 gci)
{
  BackupRecordPtr ptr;
  BackupFilePtr zeroFilePtr;
  BackupFilePtr currentFilePtr;
  ptr = m_lcp_ptr;
  c_backupFilePool.getPtr(zeroFilePtr, ptr.p->dataFilePtr[0]);
  c_backupFilePool.getPtr(currentFilePtr, ptr.p->m_working_data_file_ptr);
  OperationRecord & current_op = currentFilePtr.p->operation;
  OperationRecord & zero_op = zeroFilePtr.p->operation;
  ndbrequire(ptr.p->m_num_parts_in_this_lcp != BackupFormat::NDB_MAX_LCP_PARTS);
  Uint32 * dst = current_op.dst;
  Uint32 dataLen = 3;
  Uint32 copy_array[3];
  copy_array[0] = pageNo;
  copy_array[1] = pageIndex;
  copy_array[2] = gci;
  DEB_LCP_DEL(("(%u) DELETE_BY_ROWID: row(%u,%u)",
                instance(),
                pageNo,
                pageIndex));
  *dst = htonl(Uint32(dataLen + (BackupFormat::DELETE_BY_ROWID_TYPE << 16)));
  memcpy(dst + 1, copy_array, dataLen*sizeof(Uint32));
  ndbrequire(dataLen < zero_op.maxRecordSize);
  zeroFilePtr.p->m_lcp_delete_by_rowids++;
  zero_op.finished(dataLen);
  current_op.newRecord(dst + dataLen + 1);
  ptr.p->noOfRecords++;
  ptr.p->noOfBytes += (4*(dataLen + 1));
  ptr.p->m_bytes_written += (4*(dataLen + 1));
  restore_current_page(ptr);
}

void
Backup::execTRANSID_AI(Signal* signal)
{
  jamEntryDebug();

  const Uint32 filePtrI = signal->theData[0];
  //const Uint32 transId1 = signal->theData[1];
  //const Uint32 transId2 = signal->theData[2];
  Uint32 dataLen = signal->length() - 3;

  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, filePtrI);

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  OperationRecord & op = filePtr.p->operation;
  if (ptr.p->is_lcp())
  {
    BackupFilePtr currentFilePtr;
    c_backupFilePool.getPtr(currentFilePtr, ptr.p->m_working_data_file_ptr);
    OperationRecord & current_op = currentFilePtr.p->operation;
    Uint32 * dst = current_op.dst;
    Uint32 header;
    if (ptr.p->m_working_changed_row_page_flag)
    {
      /* LCP for CHANGED ROWS pages */
      jam();
      header = dataLen + (BackupFormat::WRITE_TYPE << 16);
      filePtr.p->m_lcp_writes++;
    }
    else
    {
      /* LCP for ALL ROWS pages */
      jam();
      header = dataLen + (BackupFormat::INSERT_TYPE << 16);
      filePtr.p->m_lcp_inserts++;
    }
    ptr.p->noOfRecords++;
    ptr.p->noOfBytes += (4*(dataLen + 1));
    ptr.p->m_bytes_written += (4*(dataLen + 1));
#ifdef VM_TRACE
    Uint32 th = signal->theData[4];
    ndbassert(! (th & 0x00400000)); /* Is MM_GROWN set */
#endif
    ndbrequire(signal->getNoOfSections() == 0);
    const Uint32 * src = &signal->theData[3];
    * dst = htonl(header);
    memcpy(dst + 1, src, 4*dataLen);
#ifdef DEBUG_LCP_ROW
    TablePtr debTabPtr;
    FragmentPtr fragPtr;
    ptr.p->tables.first(debTabPtr);
    debTabPtr.p->fragments.getPtr(fragPtr, 0);
    g_eventLogger->info("(%u) tab(%u,%u) Write row(%u,%u) into LCP, bits: %x",
                 instance(),
                 debTabPtr.p->tableId,
                 fragPtr.p->fragmentId,
                 src[0],
                 src[1],
                 src[3]);
#endif
    if (unlikely(dataLen >= op.maxRecordSize))
    {
      g_eventLogger->info("dataLen: %u, op.maxRecordSize = %u, header: %u",
                          dataLen, op.maxRecordSize, header);
      jamLine(dataLen);
      jamLine(op.maxRecordSize);
      ndbabort();
    }
    op.finished(dataLen);
    current_op.newRecord(dst + dataLen + 1);
    restore_current_page(ptr);
  }
  else
  {
    /* Backup handling */
    Uint32 * dst = op.dst;
    Uint32 header = dataLen;
    if (signal->getNoOfSections() == 0)
    {
      jam();
      const Uint32 * src = &signal->theData[3];
      * dst = htonl(header);
      memcpy(dst + 1, src, 4*dataLen);
    }
    else
    {
      jam();
      SectionHandle handle(this, signal);
      SegmentedSectionPtr dataPtr;
      handle.getSection(dataPtr, 0);
      dataLen = dataPtr.sz;

      * dst = htonl(dataLen);
      copy(dst + 1, dataPtr);
      releaseSections(handle);
    }
    ptr.p->m_bytes_written += (4*(dataLen + 1));
    op.finished(dataLen);
    op.newRecord(dst + dataLen + 1);
  }
}

bool
Backup::is_all_rows_page(BackupRecordPtr ptr,
                         Uint32 part_id)
{
  if (check_if_in_page_range(part_id,
         ptr.p->m_scan_info[ptr.p->m_num_lcp_files-1].m_start_change_part,
         ptr.p->m_scan_info[ptr.p->m_num_lcp_files-1].m_num_change_parts))
  {
    jam();
    return false;
  }
  jam();
  return true;
}

void
Backup::set_working_file(BackupRecordPtr ptr,
                         Uint32 part_id,
                         bool is_all_rows_page)
{
  Uint32 index = ptr.p->m_num_lcp_files - 1; //Change pages index
  if (is_all_rows_page)
  {
    bool found = false;
    for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
    {
      if (check_if_in_page_range(part_id,
            ptr.p->m_scan_info[i].m_start_all_part,
            ptr.p->m_scan_info[i].m_num_all_parts))
      {
        jam();
        found = true;
        index = i;
        break;
      }
    }
    ndbrequire(found);
  }
  ptr.p->m_working_data_file_ptr = ptr.p->dataFilePtr[index];
}

bool
Backup::check_if_in_page_range(Uint32 part_id,
                               Uint32 start_part,
                               Uint32 num_parts)
{
  Uint32 end_part;
  if (part_id >= start_part)
  {
    if ((start_part + num_parts) > part_id)
    {
      return true;
    }
  }
  else
  {
    end_part = start_part + num_parts;
    if ((part_id + BackupFormat::NDB_MAX_LCP_PARTS) < end_part)
    {
      return true;
    }
  }
  jam();
  return false;
}
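
/**
 * Example of the wrap-around handling above, assuming the part space of
 * 2048 parts produced by hash_lcp_part below: start_part = 2000 and
 * num_parts = 100 cover parts 2000-2047 and 0-51. For part_id = 30 the
 * else branch applies and 30 + 2048 = 2078 < 2100, so the part is in
 * range, while part_id = 60 gives 2108 and is outside the range.
 */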

Uint32
Backup::hash_lcp_part(Uint32 page_id) const
{
  /**
   * To ensure proper operation also with a small number of pages
   * we make a complete bit reorder of the 11 least significant
   * bits of the page id and return this as the part id to use.
   * This means that for e.g. 8 pages we get the following parts
   * used:
   * 0: 0, 1: 1024, 2: 512, 3: 1536, 4: 256, 5: 1280, 6: 768, 7: 1792
   *
   * This provides a fairly good spread even of a small number of
   * pages into the various parts.
   *
   * We implement this bit reorder by handling 4 sets of 3 bits,
   * except for the highest bits where we only use 2 bits.
   * Each 3 bit set is reversed using a simple static lookup
   * table and then the result of those 4 lookups is put back
   * into the hash value in reverse order.
   *
   * As a final step we remove bit 0 which is always 0 since we
   * only use 11 bits and not 12 bits.
   */
  static Uint32 reverse_3bits_array[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
  const Uint32 lowest_3bits_page_id = page_id & 7;
  const Uint32 low_3bits_page_id = (page_id >> 3) & 7;
  const Uint32 high_3bits_page_id = (page_id >> 6) & 7;
  const Uint32 highest_3bits_page_id = (page_id >> 9) & 3;
  Uint32 part_id =
    reverse_3bits_array[highest_3bits_page_id] +
    (reverse_3bits_array[high_3bits_page_id] << 3) +
    (reverse_3bits_array[low_3bits_page_id] << 6) +
    (reverse_3bits_array[lowest_3bits_page_id] << 9);
  part_id >>= 1;
  return part_id;
}
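
/**
 * Walk-through of the mapping above for page_id = 1 (a sketch of the
 * arithmetic, not extra logic): the lowest 3 bits are 1, which the lookup
 * table reverses to 4; shifted left by 9 this gives 2048, and the final
 * right shift by 1 yields part id 1024, matching the table in the comment.
 * Eleven bits in, eleven bits out: part ids range over 0-2047.
 */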

bool
Backup::is_change_part_state(Uint32 page_id)
{
  BackupRecordPtr ptr;
  jamEntryDebug();
  ptr = m_lcp_ptr;
  Uint32 part_id = hash_lcp_part(page_id);
  bool is_all_part = is_all_rows_page(ptr, part_id);
  return !is_all_part;
}

void
Backup::get_page_info(BackupRecordPtr ptr,
                      Uint32 part_id,
                      Uint32 & scanGCI,
                      bool & changed_row_page_flag)
{
  if (is_all_rows_page(ptr, part_id))
  {
    /**
     * We are within the range of parts where all rows are recorded.
     * Return scanGCI = 0 such that all rows in this page become part
     * of this LCP.
     */
    jam();
    scanGCI = 0;
    changed_row_page_flag = false;
  }
  else
  {
    /**
     * Not all rows are to be recorded, only changed rows on this page.
     */
    jam();
    ndbassert(is_partial_lcp_enabled());
    scanGCI = ptr.p->m_scan_change_gci;
    ndbrequire(scanGCI != 0);
    changed_row_page_flag = true;
  }
}

void
Backup::change_current_page_temp(Uint32 page_no)
{
  BackupRecordPtr ptr;
  jamEntry();
  ptr = m_lcp_ptr;
  Uint32 part_id = hash_lcp_part(page_no);
  ptr.p->m_working_changed_row_page_flag = !(is_all_rows_page(ptr, part_id));
  set_working_file(ptr,
                   part_id,
                   !ptr.p->m_working_changed_row_page_flag);
}

/**
 * After each operation, whether it is INSERT, WRITE or any DELETE variant,
 * we restore the working data file and current page flag. We can change
 * those for one operation (when retrieving a record from the LCP keep list).
 * Since we don't know here whether we retrieved a record from the LCP keep
 * list, we simply always restore. The current values always have the current
 * setting and the working ones are those we're currently using.
 */
void
Backup::restore_current_page(BackupRecordPtr ptr)
{
  ptr.p->m_working_data_file_ptr = ptr.p->m_current_data_file_ptr;
  ptr.p->m_working_changed_row_page_flag =
    ptr.p->m_current_changed_row_page_flag;
}

void
Backup::init_lcp_scan(Uint32 & scanGCI,
                      bool & changed_row_page_flag)
{
  /**
   * Here we come to decide what to do with page 0.
   *
   * The number of pages seen at the start of the LCP scan was set in the
   * method start_lcp_scan. It is of vital importance that this happens
   * synchronised with the insertion of the LCP record in the UNDO log.
   * There cannot be any signal breaks between setting the
   * max page count, initialising the LCP scan variable in TUP and
   * initialising the variables in this block, and finally inserting a
   * start LCP record in the UNDO log, to allow for proper
   * handling of commits after the start of the LCP scan (to ensure that we
   * set LCP_SKIP and LCP_DELETE bits when necessary). It is important
   * that we retain exactly the set of rows committed before the start
   * of the LCP scan (the commit point is when the signal TUP_COMMITREQ
   * returns to DBLQH) and that rows inserted after this point are not
   * part of the LCP; this will guarantee that we get synchronisation
   * between the LCP main memory data and the disk data parts after
   * executing the UNDO log.
   *
   * The number of pages will be stored in the LCP to ensure that we can
   * remove rowids that have been deleted before the next LCP starts.
   * The next LCP will never see any deleted rowids, so those need to be
   * deleted before applying the rest of the LCP. The actual LCP contains
   * DELETE by ROWID for all rowids in the range of pages still existing,
   * but for those removed we need to delete all those rows in one go at
   * the start of restore by using the number of pages that is part of
   * the LCP.
   */
  BackupRecordPtr ptr;
  jamEntry();
  ptr = m_lcp_ptr;
  Uint32 part_id = hash_lcp_part(0);
  get_page_info(ptr,
                part_id,
                scanGCI,
                changed_row_page_flag);
  set_working_file(ptr, part_id, !changed_row_page_flag);
  ptr.p->m_current_data_file_ptr = ptr.p->m_working_data_file_ptr;
  ptr.p->m_working_changed_row_page_flag = changed_row_page_flag;
  ptr.p->m_current_changed_row_page_flag = changed_row_page_flag;

#ifdef DEBUG_EXTRA_LCP
  TablePtr debTabPtr;
  FragmentPtr fragPtr;
  ptr.p->tables.first(debTabPtr);
  debTabPtr.p->fragments.getPtr(fragPtr, 0);
  DEB_EXTRA_LCP(("(%u)LCP scan page tab(%u,%u): %u, part_id: %u,"
                 " round: %u, %s",
          instance(),
          debTabPtr.p->tableId,
          fragPtr.p->fragmentId,
          0,
          part_id,
          0,
          changed_row_page_flag ? "CHANGED ROWS page" : " ALL ROWS page"));
#endif
}

void
Backup::alloc_page_after_lcp_start(Uint32 page_no)
{
  BackupRecordPtr ptr;
  jamEntry();
  ptr = m_lcp_ptr;
  ptr.p->m_any_lcp_page_ops = true;
  if (is_change_part_state(page_no))
    ptr.p->m_change_page_alloc_after_start++;
  else
    ptr.p->m_all_page_alloc_after_start++;
}

void
Backup::alloc_dropped_page_after_lcp_start(bool is_change_page)
{
  BackupRecordPtr ptr;
  jamEntry();
  ptr = m_lcp_ptr;
  ptr.p->m_any_lcp_page_ops = true;
  if (is_change_page)
  {
    ptr.p->m_change_page_alloc_dropped_after_start++;
  }
  else
  {
    ptr.p->m_all_page_alloc_dropped_after_start++;
  }
}

void
Backup::dropped_page_after_lcp_start(bool is_change_page,
                                     bool is_last_lcp_state_A)
{
  BackupRecordPtr ptr;
  jamEntry();
  ptr = m_lcp_ptr;
  ptr.p->m_any_lcp_page_ops = true;
  if (is_last_lcp_state_A)
  {
    if (is_change_page)
      ptr.p->m_change_page_dropped_A_after_start++;
    else
      ptr.p->m_all_page_dropped_A_after_start++;
  }
  else
  {
    if (is_change_page)
      ptr.p->m_change_page_dropped_D_after_start++;
    else
      ptr.p->m_all_page_dropped_D_after_start++;
  }
}

void
Backup::skip_page_lcp_scanned_bit()
{
  BackupRecordPtr ptr;
  jamEntry();
  ptr = m_lcp_ptr;
  ptr.p->m_any_lcp_page_ops = true;
  if (ptr.p->m_working_changed_row_page_flag)
    ptr.p->m_skip_change_page_lcp_scanned_bit++;
  else
    ptr.p->m_skip_all_page_lcp_scanned_bit++;
}

void
Backup::skip_no_change_page()
{
  BackupRecordPtr ptr;
  jamEntryDebug();
  ptr = m_lcp_ptr;
  ptr.p->m_any_lcp_page_ops = true;
  ptr.p->m_skip_change_page_no_change++;
}

void
Backup::skip_empty_page_lcp()
{
  BackupRecordPtr ptr;
  jamEntryDebug();
  ptr = m_lcp_ptr;
  ptr.p->m_any_lcp_page_ops = true;
  if (ptr.p->m_working_changed_row_page_flag)
    ptr.p->m_skip_empty_change_page++;
  else
    ptr.p->m_skip_empty_all_page++;
}

void
Backup::record_dropped_empty_page_lcp()
{
  BackupRecordPtr ptr;
  jamEntry();
  ptr = m_lcp_ptr;
  ndbrequire(ptr.p->m_working_changed_row_page_flag);
  ptr.p->m_any_lcp_page_ops = true;
  ptr.p->m_record_empty_change_page_A++;
}

void
Backup::record_late_alloc_page_lcp()
{
  BackupRecordPtr ptr;
  jamEntry();
  ptr = m_lcp_ptr;
  ndbrequire(ptr.p->m_working_changed_row_page_flag);
  ptr.p->m_any_lcp_page_ops = true;
  ptr.p->m_record_late_alloc_change_page_A++;
}

void
Backup::page_to_skip_lcp(bool is_last_lcp_state_A)
{
  BackupRecordPtr ptr;
  jamEntry();
  ptr = m_lcp_ptr;
  ptr.p->m_any_lcp_page_ops = true;
  if (ptr.p->m_working_changed_row_page_flag)
  {
    ndbrequire(!is_last_lcp_state_A);
    ptr.p->m_skip_late_alloc_change_page_D++;
  }
  else
  {
    if (is_last_lcp_state_A)
      ptr.p->m_skip_late_alloc_all_page_A++;
    else
      ptr.p->m_skip_late_alloc_all_page_D++;
  }
}

void
Backup::lcp_keep_delete_by_page_id()
{
  BackupRecordPtr ptr;
  jamEntry();
  ptr = m_lcp_ptr;
  ptr.p->m_any_lcp_page_ops = true;
  if (ptr.p->m_working_changed_row_page_flag)
    ptr.p->m_lcp_keep_delete_change_pages++;
  else
    ptr.p->m_lcp_keep_delete_all_pages++;
}

void
Backup::lcp_keep_delete_row()
{
  BackupRecordPtr ptr;
  jamEntry();
  ptr = m_lcp_ptr;
  ptr.p->m_any_lcp_page_ops = true;
  if (ptr.p->m_working_changed_row_page_flag)
    ptr.p->m_lcp_keep_delete_row_change_pages++;
  else
    ptr.p->m_lcp_keep_delete_row_all_pages++;
}

void
Backup::lcp_keep_row()
{
  BackupRecordPtr ptr;
  jamEntry();
  ptr = m_lcp_ptr;
  ptr.p->m_any_lcp_page_ops = true;
  if (ptr.p->m_working_changed_row_page_flag)
    ptr.p->m_lcp_keep_row_change_pages++;
  else
    ptr.p->m_lcp_keep_row_all_pages++;
}

void
Backup::print_extended_lcp_stat()
{
  BackupRecordPtr ptr;
  ptr = m_lcp_ptr;
  if (!ptr.p->m_any_lcp_page_ops)
    return;
  g_eventLogger->info("(%u)change_page_alloc_after_start: %u, "
                      "all_page_alloc_after_start: %u, "
                      "change_page_alloc_dropped_after_start: %u, "
                      "all_page_alloc_dropped_after_start: %u",
                      instance(),
                      ptr.p->m_change_page_alloc_after_start,
                      ptr.p->m_all_page_alloc_after_start,
                      ptr.p->m_change_page_alloc_dropped_after_start,
                      ptr.p->m_all_page_alloc_dropped_after_start);
  g_eventLogger->info("(%u)change_page_dropped_A_after_start: %u, "
                      "all_page_dropped_A_after_start: %u, "
                      "change_page_dropped_D_after_start: %u, "
                      "all_page_dropped_D_after_start: %u",
                      instance(),
                      ptr.p->m_change_page_dropped_A_after_start,
                      ptr.p->m_all_page_dropped_A_after_start,
                      ptr.p->m_change_page_dropped_D_after_start,
                      ptr.p->m_all_page_dropped_D_after_start);
  g_eventLogger->info("(%u)skip_change_page_lcp_scanned_bit: %u, "
                      "skip_all_page_lcp_scanned_bit: %u, "
                      "skip_change_page_no_change: %u, "
                      "skip_empty_change_page: %u, "
                      "skip_empty_all_page: %u",
                      instance(),
                      ptr.p->m_skip_change_page_lcp_scanned_bit,
                      ptr.p->m_skip_all_page_lcp_scanned_bit,
                      ptr.p->m_skip_change_page_no_change,
                      ptr.p->m_skip_empty_change_page,
                      ptr.p->m_skip_empty_all_page);
  g_eventLogger->info("(%u)record_empty_change_page_A: %u, "
                      "record_late_alloc_change_page_A: %u, "
                      "skip_late_alloc_change_page_D: %u, "
                      "skip_late_alloc_all_page_A: %u, "
                      "skip_late_alloc_all_page_D: %u",
                      instance(),
                      ptr.p->m_record_empty_change_page_A,
                      ptr.p->m_record_late_alloc_change_page_A,
                      ptr.p->m_skip_late_alloc_change_page_D,
                      ptr.p->m_skip_late_alloc_all_page_A,
                      ptr.p->m_skip_late_alloc_all_page_D);
  g_eventLogger->info("(%u)lcp_keep_row_change_pages: %llu, "
                      "lcp_keep_row_all_pages: %llu, "
                      "lcp_keep_delete_row_change_pages: %llu, "
                      "lcp_keep_delete_row_all_pages: %llu, "
                      "lcp_keep_delete_change_pages: %u, "
                      "lcp_keep_delete_all_pages: %u",
                      instance(),
                      ptr.p->m_lcp_keep_row_change_pages,
                      ptr.p->m_lcp_keep_row_all_pages,
                      ptr.p->m_lcp_keep_delete_row_change_pages,
                      ptr.p->m_lcp_keep_delete_row_all_pages,
                      ptr.p->m_lcp_keep_delete_change_pages,
                      ptr.p->m_lcp_keep_delete_all_pages);
}

void
Backup::init_extended_lcp_stat()
{
  BackupRecordPtr ptr;
  ptr = m_lcp_ptr;
  ptr.p->m_change_page_alloc_after_start = 0;
  ptr.p->m_all_page_alloc_after_start = 0;
  ptr.p->m_change_page_alloc_dropped_after_start = 0;
  ptr.p->m_all_page_alloc_dropped_after_start = 0;
  ptr.p->m_change_page_dropped_A_after_start = 0;
  ptr.p->m_all_page_dropped_A_after_start = 0;
  ptr.p->m_change_page_dropped_D_after_start = 0;
  ptr.p->m_all_page_dropped_D_after_start = 0;
  ptr.p->m_skip_change_page_lcp_scanned_bit = 0;
  ptr.p->m_skip_all_page_lcp_scanned_bit = 0;
  ptr.p->m_skip_change_page_no_change = 0;
  ptr.p->m_skip_empty_change_page = 0;
  ptr.p->m_skip_empty_all_page = 0;
  ptr.p->m_record_empty_change_page_A = 0;
  ptr.p->m_record_late_alloc_change_page_A = 0;
  ptr.p->m_skip_late_alloc_change_page_D = 0;
  ptr.p->m_skip_late_alloc_all_page_A = 0;
  ptr.p->m_skip_late_alloc_all_page_D = 0;
  ptr.p->m_lcp_keep_delete_row_change_pages = 0;
  ptr.p->m_lcp_keep_delete_row_all_pages = 0;
  ptr.p->m_lcp_keep_delete_change_pages = 0;
  ptr.p->m_lcp_keep_delete_all_pages = 0;
  ptr.p->m_lcp_keep_row_change_pages = 0;
  ptr.p->m_lcp_keep_row_all_pages = 0;
  ptr.p->m_any_lcp_page_ops = false;
}

/**
 * Return values:
 * +1 Page has been scanned
 * -1 Page has not been scanned
 *  0 Page is currently being scanned, so the page index needs to be
 *    checked as well.
 */
int
Backup::is_page_lcp_scanned(Uint32 page_id, bool & all_part)
{
  BackupRecordPtr ptr;
  ptr = m_lcp_ptr;
  all_part = false;

  if (page_id >= ptr.p->m_lcp_max_page_cnt)
  {
    jam();
    return +1; /* Page will never be scanned */
  }
  Uint32 part_id = hash_lcp_part(page_id);
  if (is_all_rows_page(ptr, part_id))
  {
    jam();
    all_part = true;
  }
  if (!ptr.p->m_is_lcp_scan_active)
  {
    /**
     * LCP scan is already completed.
     */
    jam();
    return +1;
  }
  if (page_id < ptr.p->m_lcp_current_page_scanned)
  {
    jam();
    return +1; /* Page has been scanned in this LCP scan round */
  }
  else if (page_id > ptr.p->m_lcp_current_page_scanned)
  {
    jam();
    return -1; /* Page to be scanned in this LCP scan round, not done yet */
  }
  else
  {
    jam();
    return 0; /* Page is currently being scanned. Need more info */
  }
}
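
/**
 * A minimal caller-side sketch (not from the original source) of how the
 * tri-state result above is meant to be consumed. The helper name
 * page_covered_by_this_lcp() and the page-index comparison are
 * hypothetical and only illustrate the convention: only a return value
 * of 0 requires looking at the row index within the currently scanned
 * page.
 *
 *   bool page_covered_by_this_lcp(Backup *backup,
 *                                 Uint32 page_id,
 *                                 Uint32 page_index_scanned,
 *                                 Uint32 row_index)
 *   {
 *     bool all_part;
 *     int res = backup->is_page_lcp_scanned(page_id, all_part);
 *     if (res == +1)
 *       return true;   // page already scanned (or will never be)
 *     if (res == -1)
 *       return false;  // page not reached yet in this round
 *     // res == 0: page is being scanned right now, compare indexes
 *     return (row_index < page_index_scanned);
 *   }
 */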

void
Backup::update_lcp_pages_scanned(Signal *signal,
                                 Uint32 filePtrI,
                                 Uint32 scanned_pages,
                                 Uint32 & scanGCI,
                                 bool & changed_row_page_flag)
{
  BackupFilePtr filePtr;
  BackupRecordPtr ptr;
  jamEntry();

  c_backupFilePool.getPtr(filePtr, filePtrI);

  OperationRecord & op = filePtr.p->operation;

  op.set_scanned_pages(scanned_pages);

  /**
   * scanned_pages also contains the page number, which can be used
   * to deduce the part_id for the page.
   */
  ptr = m_lcp_ptr;
  Uint32 part_id = hash_lcp_part(scanned_pages);
  ptr.p->m_lcp_current_page_scanned = scanned_pages;
  get_page_info(ptr,
                part_id,
                scanGCI,
                changed_row_page_flag);
  set_working_file(ptr, part_id, !changed_row_page_flag);
  ptr.p->m_current_data_file_ptr = ptr.p->m_working_data_file_ptr;
  ptr.p->m_working_changed_row_page_flag = changed_row_page_flag;
  ptr.p->m_current_changed_row_page_flag = changed_row_page_flag;
#ifdef DEBUG_EXTRA_LCP
  TablePtr debTabPtr;
  FragmentPtr fragPtr;
  ptr.p->tables.first(debTabPtr);
  debTabPtr.p->fragments.getPtr(fragPtr, 0);
  DEB_EXTRA_LCP(("(%u)LCP scan page tab(%u,%u):%u, part_id: %u, round: %u, %s",
                 instance(),
                 debTabPtr.p->tableId,
                 fragPtr.p->fragmentId,
                 scanned_pages,
                 part_id,
                 0,
                 changed_row_page_flag ?
                     "CHANGED ROWS page" : " ALL ROWS page"));
#endif
}

void
Backup::OperationRecord::init(const TablePtr & tabPtr)
{
  tablePtr = tabPtr.i;
  maxRecordSize = tabPtr.p->maxRecordSize;
  lcpScannedPages = 0;
}

bool
Backup::OperationRecord::newFragment(Uint32 tableId, Uint32 fragNo)
{
  Uint32 * tmp;
  const Uint32 headSz = (sizeof(BackupFormat::DataFile::FragmentHeader) >> 2);
  const Uint32 sz = headSz + ZRESERVED_SCAN_BATCH_SIZE * maxRecordSize;

  ndbrequire(sz < dataBuffer.getMaxWrite());
  if(dataBuffer.getWritePtr(&tmp, sz)) {
    jam();
    BackupFormat::DataFile::FragmentHeader * head =
      (BackupFormat::DataFile::FragmentHeader*)tmp;

    head->SectionType   = htonl(BackupFormat::FRAGMENT_HEADER);
    head->SectionLength = htonl(headSz);
    head->TableId       = htonl(tableId);
    head->FragmentNo    = htonl(fragNo);
    head->ChecksumType  = htonl(0);

    opNoDone = opNoConf = opLen = 0;
    newRecord(tmp + headSz);
    scanStart = tmp;
    scanStop  = (tmp + headSz);

    noOfRecords = 0;
    noOfBytes = 0;
    return true;
  }//if
  return false;
}

bool
Backup::OperationRecord::fragComplete(Uint32 tableId, Uint32 fragNo, bool fill_record)
{
  Uint32 * tmp;
  const Uint32 footSz = sizeof(BackupFormat::DataFile::FragmentFooter) >> 2;
  Uint32 sz = footSz + 1;

  if (fill_record)
  {
    Uint32 * new_tmp;
    if (!dataBuffer.getWritePtr(&tmp, sz))
      return false;
    new_tmp = tmp + sz;

    if ((UintPtr)new_tmp & (sizeof(Page32)-1))
    {
      /* padding is needed to get full write */
      new_tmp += 2 /* to fit empty header minimum 2 words */;
      new_tmp = (Uint32 *)(((UintPtr)new_tmp + sizeof(Page32)-1) &
                            ~(UintPtr)(sizeof(Page32)-1));
      /* new write sz */
      sz = Uint32(new_tmp - tmp);
    }
  }

  if(dataBuffer.getWritePtr(&tmp, sz)) {
    jam();
    * tmp = 0; // Finish record stream
    tmp++;
    BackupFormat::DataFile::FragmentFooter * foot =
      (BackupFormat::DataFile::FragmentFooter*)tmp;
    foot->SectionType   = htonl(BackupFormat::FRAGMENT_FOOTER);
    foot->SectionLength = htonl(footSz);
    foot->TableId       = htonl(tableId);
    foot->FragmentNo    = htonl(fragNo);
    foot->NoOfRecords   = htonl(Uint32(noOfRecords)); // TODO
    foot->Checksum      = htonl(0);

    if (sz != footSz + 1)
    {
      tmp += footSz;
      memset(tmp, 0, (sz - footSz - 1) * 4);
      *tmp = htonl(BackupFormat::EMPTY_ENTRY);
      tmp++;
      *tmp = htonl(sz - footSz - 1);
    }

    dataBuffer.updateWritePtr(sz);
    return true;
  }//if
  return false;
}
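
/**
 * A worked example (not from the original source) of the fill_record
 * padding above, assuming sizeof(Page32) == 32768 bytes: if the footer
 * would end at a write position that is not 32 KiB aligned, two words
 * are first reserved for an EMPTY_ENTRY header and the position is then
 * rounded up to the next 32 KiB boundary, e.g.
 *
 *   new_tmp ends 100 words into a page
 *     -> new_tmp += 2;                 // room for EMPTY_ENTRY header
 *     -> round up to the next 32768-byte boundary
 *     -> sz grows so that the trailing gap is written as a single
 *        EMPTY_ENTRY section of (sz - footSz - 1) data words
 *
 * fill_record is driven by c_defaults.m_o_direct (see
 * check_frag_complete below), so the padding keeps O_DIRECT writes
 * aligned while the file format stays self-describing.
 */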

bool
Backup::OperationRecord::newScan()
{
  Uint32 * tmp;
  ndbrequire(ZRESERVED_SCAN_BATCH_SIZE * maxRecordSize < dataBuffer.getMaxWrite());
  if(dataBuffer.getWritePtr(&tmp, ZRESERVED_SCAN_BATCH_SIZE * maxRecordSize))
  {
    jam();
    opNoDone = opNoConf = opLen = 0;
    newRecord(tmp);
    scanStart = tmp;
    scanStop = tmp;
    return true;
  }//if
  return false;
}

bool
Backup::check_new_scan(BackupRecordPtr ptr,
                       OperationRecord & op,
                       bool after_wait)
{
  bool any_min_buf = false;
  Uint32 tot_size_written = 0;
  if (ptr.p->is_lcp() && ptr.p->m_num_lcp_files > 1)
  {
    for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
    {
      jam();
      BackupFilePtr loopFilePtr;
      c_backupFilePool.getPtr(loopFilePtr, ptr.p->dataFilePtr[i]);
      OperationRecord & loop_op = loopFilePtr.p->operation;
      if (!loop_op.newScan())
      {
        jam();
        return false;
      }
      Uint32 size_written = loop_op.dataBuffer.getSizeUsed();
      if (size_written > BACKUP_DEFAULT_WRITE_SIZE)
      {
        jam();
        any_min_buf = true;
      }
      tot_size_written += size_written;
    }
  }
  else
  {
    jam();
    bool ready = op.newScan();
    if (!ready)
    {
      jam();
      return false;
    }
    tot_size_written = op.dataBuffer.getSizeUsed();
    if (tot_size_written > BACKUP_DEFAULT_WRITE_SIZE)
    {
      jam();
      any_min_buf = true;
    }
  }
  if (after_wait ||
      !any_min_buf ||
      (ptr.p->is_lcp() &&
       (m_redo_alert_state > RedoStateRep::REDO_ALERT_LOW ||
        tot_size_written < MAX_BUFFER_USED_WITHOUT_REDO_ALERT ||
        (m_redo_alert_state == RedoStateRep::REDO_ALERT_LOW &&
         tot_size_written < BACKUP_DEFAULT_BUFFER_SIZE))))
  {
    jam();
    return true;
  }
  /**
   * We have buffer space, but we are also ready to write at least one
   * file, so there is no urgency in continuing the LCP/Backup scan
   * right now; we have already written at least 512 kB into the
   * buffers. At low REDO alert levels we allow writing up to
   * 2 MB into the buffers. At higher alert levels we continue
   * writing until the buffer is full.
   *
   * After sleeping for a while we will always handle at least one
   * batch of scanning if there is buffer space for it (this is
   * signalled through the variable after_wait).
   */
  return false;
}
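
/**
 * A condensed restatement (not from the original source) of the decision
 * above, using the thresholds named in the code (BACKUP_DEFAULT_WRITE_SIZE
 * is the 512 kB mark, BACKUP_DEFAULT_BUFFER_SIZE the 2 MB mark):
 *
 *   continue scanning when
 *     - we just woke up from a wait (after_wait), or
 *     - no data file has more than 512 kB buffered, or
 *     - for LCPs: the REDO alert level is above LOW, or the total
 *       buffered amount is still below the alert-dependent cap
 *       (MAX_BUFFER_USED_WITHOUT_REDO_ALERT, or 2 MB at level LOW)
 *
 *   otherwise pause and let the file threads drain the buffers first.
 */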

bool
Backup::check_frag_complete(BackupRecordPtr ptr, BackupFilePtr filePtr)
{
  if (ptr.p->is_lcp() && ptr.p->m_num_lcp_files > 1)
  {
    for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
    {
      jam();
      BackupFilePtr loopFilePtr;
      c_backupFilePool.getPtr(loopFilePtr, ptr.p->dataFilePtr[i]);
      OperationRecord & op = loopFilePtr.p->operation;
      if (((loopFilePtr.p->m_flags &
            Uint32(BackupFile::BF_SCAN_THREAD)) == 0) ||
            op.fragComplete(filePtr.p->tableId,
                            filePtr.p->fragmentNo,
                            c_defaults.m_o_direct))
      {
        jam();
        loopFilePtr.p->m_flags &= ~(Uint32)BackupFile::BF_SCAN_THREAD;
      }
      else
      {
        jam();
        return false;
      }
    }
    return true;
  }
  else
  {
    OperationRecord & op = filePtr.p->operation;
    if (op.fragComplete(filePtr.p->tableId,
                        filePtr.p->fragmentNo,
                        c_defaults.m_o_direct))
    {
      jam();
      filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_SCAN_THREAD;
      return true;
    }
    return false;
  }
}

bool
Backup::check_min_buf_size(BackupRecordPtr ptr, OperationRecord &op)
{
  bool is_lcp = ptr.p->is_lcp();
  if (is_lcp && m_redo_alert_state != RedoStateRep::NO_REDO_ALERT)
  {
    /**
     * We have reached at least 25% REDO log fill level; we will be more
     * active in filling up the buffers to write to disk for LCPs.
     */
    return false;
  }
  if (ptr.p->is_lcp() && ptr.p->m_num_lcp_files > 1)
  {
    for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
    {
      jam();
      Uint32 *tmp = NULL;
      Uint32 sz = 0;
      bool eof = FALSE;
      BackupFilePtr loopFilePtr;
      c_backupFilePool.getPtr(loopFilePtr, ptr.p->dataFilePtr[i]);
      OperationRecord & loop_op = loopFilePtr.p->operation;
      if (!loop_op.dataBuffer.getReadPtr(&tmp, &sz, &eof))
      {
        return false;
      }
    }
    return true;
  }
  else
  {
    jam();
    Uint32 *tmp = NULL;
    Uint32 sz = 0;
    bool eof = FALSE;
    return op.dataBuffer.getReadPtr(&tmp, &sz, &eof);
  }
}

bool
Backup::check_error(BackupRecordPtr ptr, BackupFilePtr filePtr)
{
  if (ptr.p->checkError())
  {
    jam();
    return true;
  }
  if (ptr.p->is_lcp() && ptr.p->m_num_lcp_files > 1)
  {
    for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
    {
      jam();
      BackupFilePtr loopFilePtr;
      c_backupFilePool.getPtr(loopFilePtr, ptr.p->dataFilePtr[i]);
      if (loopFilePtr.p->errorCode != 0)
      {
        jam();
        return true;
      }
    }
    return false;
  }
  else
  {
    return (filePtr.p->errorCode != 0);
  }
}

void
Backup::OperationRecord::closeScan()
{
  opNoDone = opNoConf = opLen = 0;
}

Uint32
Backup::OperationRecord::publishBufferData()
{
  const Uint32 len = Uint32(scanStop - scanStart);
  ndbrequire(len < dataBuffer.getMaxWrite());
  dataBuffer.updateWritePtr(len);

  /**
   * In case a second SCAN_FRAGCONF with scanCompleted set to 2 follows,
   * without any call to newScan() or newFragment() in between to reset
   * scanStart and scanStop, set scanStart to scanStop to indicate that
   * all buffered data has already been published.
   */
  scanStart = scanStop;
  return len;
}

void
Backup::OperationRecord::scanConf(Uint32 noOfOps, Uint32 total_len, Uint32 len)
{
  const Uint32 done = Uint32(opNoDone-opNoConf);

  ndbrequire(noOfOps == done);
  ndbrequire(opLen == total_len);
  opNoConf = opNoDone;

  noOfBytes += (len << 2);
  m_bytes_total += (len << 2);
  m_records_total += noOfOps;
}

void
Backup::execSCAN_FRAGREF(Signal* signal)
{
  jamEntry();

  ScanFragRef * ref = (ScanFragRef*)signal->getDataPtr();

  const Uint32 filePtrI = ref->senderData;
  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, filePtrI);

  Uint32 errCode = ref->errorCode;
  if (filePtr.p->errorCode == 0)
  {
    // check for transient errors
    switch(errCode){
    case ScanFragRef::ZSCAN_BOOK_ACC_OP_ERROR:
    case ScanFragRef::NO_TC_CONNECT_ERROR:
    case ScanFragRef::ZTOO_MANY_ACTIVE_SCAN_ERROR:
      jam();
      DEB_LCP(("(%u)execSCAN_FRAGREF(temp error: %u)",
               instance(),
               errCode));
      break;
    case ScanFragRef::TABLE_NOT_DEFINED_ERROR:
    case ScanFragRef::DROP_TABLE_IN_PROGRESS_ERROR:
      jam();
      /**
       * The table was dropped either at the start of the LCP scan or in
       * the middle of it. We will complete in the same manner as if we
       * got a SCAN_FRAGCONF with the close flag set. The idea is that
       * the content of the LCP file in this case is not going to
       * be used anyway, so we just ensure that we complete things
       * in an ordered manner and then the higher layers will ensure
       * that the files are dropped and taken care of.
       *
       * This handling ensures that drop table can complete
       * much faster.
       */
      DEB_LCP(("(%u)execSCAN_FRAGREF(DROP_TABLE_IN_PROGRESS)", instance()));
      fragmentCompleted(signal, filePtr, errCode);
      return;
    default:
      jam();
      filePtr.p->errorCode = errCode;
    }
  }

  if (filePtr.p->errorCode == 0)
  {
    jam();
    filePtr.p->m_retry_count++;
    if (filePtr.p->m_retry_count == 10)
    {
      jam();
      filePtr.p->errorCode = errCode;
    }
  }

  if (filePtr.p->errorCode != 0)
  {
    jam();
    filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_SCAN_THREAD;
    DEB_LCP(("(%u)execSCAN_FRAGREF(backupFragmentRef)", instance()));
    backupFragmentRef(signal, filePtr);
  }
  else
  {
    jam();

    // retry

    BackupRecordPtr ptr;
    c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
    TablePtr tabPtr;
    FragmentPtr fragPtr;
    if (ptr.p->is_lcp())
    {
      ptr.p->tables.first(tabPtr);
      ndbrequire(filePtr.p->fragmentNo == 0);
      ndbrequire(filePtr.p->tableId == tabPtr.p->tableId);
      tabPtr.p->fragments.getPtr(fragPtr, 0);
      DEB_LCP(("(%u)execSCAN_FRAGREF", instance()));
    }
    else
    {
      ndbrequire(findTable(ptr, tabPtr, filePtr.p->tableId));
      tabPtr.p->fragments.getPtr(fragPtr, filePtr.p->fragmentNo);
    }
    sendScanFragReq(signal, ptr, filePtr, tabPtr, fragPtr,
                    WaitScanTempErrorRetryMillis);
  }
}

void
Backup::execSCAN_FRAGCONF(Signal* signal)
{
  jamEntry();

  CRASH_INSERTION((10017));

  ScanFragConf conf = *(ScanFragConf*)signal->getDataPtr();

  const Uint32 filePtrI = conf.senderData;
  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, filePtrI);

  OperationRecord & op = filePtr.p->operation;
  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  if (ptr.p->is_lcp() && c_lqh->handleLCPSurfacing(signal))
  {
    jam();
    TablePtr tabPtr;
    ptr.p->tables.first(tabPtr);
    Dbtup* tup = (Dbtup*)globalData.getBlock(DBTUP, instance());
    op.maxRecordSize = tabPtr.p->maxRecordSize =
      1 + tup->get_max_lcp_record_size(tabPtr.p->tableId);
  }
  Uint32 buffer_data_len = op.publishBufferData();
  if (ptr.p->is_lcp() && ptr.p->m_num_lcp_files > 1)
  {
    jam();
    BackupFilePtr loopFilePtr;
    for (Uint32 i = 1; i < ptr.p->m_num_lcp_files; i++)
    {
      c_backupFilePool.getPtr(loopFilePtr, ptr.p->dataFilePtr[i]);
      OperationRecord & loop_op = loopFilePtr.p->operation;
      // The extra LCP files only use operation for the data buffer.
      buffer_data_len += loop_op.publishBufferData();
      // Always update maxRecordSize, op.maxRecordSize may have changed.
      loop_op.maxRecordSize = op.maxRecordSize;
    }
  }
  op.scanConf(conf.completedOps, conf.total_len, buffer_data_len);

  {
    const bool senderIsThreadLocal =
      (signal->senderBlockRef() == calcInstanceBlockRef(DBLQH));
    ndbrequire(senderIsThreadLocal ||
               !MT_BACKUP_FLAG(ptr.p->flags));
  }

  const Uint32 completed = conf.fragmentCompleted;
  if(completed != 2) {
    jam();
    checkScan(signal, ptr, filePtr, false);
    return;
  }//if

  fragmentCompleted(signal, filePtr);
}

void
Backup::fragmentCompleted(Signal* signal,
                          BackupFilePtr filePtr,
                          Uint32 errCode)
{
  jam();

  if(filePtr.p->errorCode != 0)
  {
    jam();
    filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_SCAN_THREAD;
    DEB_LCP(("(%u)fragmentCompleted(backupFragmentRef)", instance()));
    backupFragmentRef(signal, filePtr); // Scan completed
    return;
  }//if

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  if (!check_frag_complete(ptr, filePtr))
  {
    jam();
    signal->theData[0] = BackupContinueB::BUFFER_FULL_FRAG_COMPLETE;
    signal->theData[1] = filePtr.i;
    signal->theData[2] = errCode;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                        WaitDiskBufferCapacityMillis, 2);
    return;
  }//if
  OperationRecord & op = filePtr.p->operation;
  if (ptr.p->is_lcp())
  {
    jam();
    ptr.p->m_is_lcp_scan_active = false;
    for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
    {
      BackupFilePtr loopFilePtr;
      c_backupFilePool.getPtr(loopFilePtr,
                              ptr.p->dataFilePtr[i]);
      loopFilePtr.p->operation.dataBuffer.eof();
    }
    {
      jam();
      TablePtr tabPtr;
      FragmentPtr fragPtr;
      ptr.p->tables.first(tabPtr);
      tabPtr.p->fragments.getPtr(fragPtr, 0);
      DEB_LCP_STAT(("(%u)LCP tab(%u,%u): inserts: %llu, writes: %llu"
                    ", delete_by_row: %llu, delete_by_page: %llu"
                    ", bytes written: %llu, num_files: %u"
                    ", first data file: %u",
               instance(),
               tabPtr.p->tableId,
               fragPtr.p->fragmentId,
               filePtr.p->m_lcp_inserts,
               filePtr.p->m_lcp_writes,
               filePtr.p->m_lcp_delete_by_rowids,
               filePtr.p->m_lcp_delete_by_pageids,
               ptr.p->m_bytes_written,
               ptr.p->m_num_lcp_files,
               ptr.p->m_first_data_file_number));
#ifdef DEBUG_LCP_EXTENDED_STAT
      print_extended_lcp_stat();
#endif
      c_tup->stop_lcp_scan(tabPtr.p->tableId, fragPtr.p->fragmentId);
    }

    /* Save errCode for later checks */
    ptr.p->m_save_error_code = errCode;
    ptr.p->slaveState.setState(STOPPING);

    /**
     * The scan is completed, so we get the newest GCI involved in the
     * LCP. We update both LQH and ourselves with this value.
     */
    c_lqh->lcp_complete_scan(ptr.p->newestGci);

    /**
     * The actual complete processing is started from checkFile, which is
     * called regularly from a CONTINUEB loop. We cannot start the complete
     * processing until all data of the fragment has been sent properly to
     * disk. checkFile is called from CONTINUEB(START_FILE_THREAD).
     *
     * lcp_start_complete_processing will start by sync:ing the UNDO log,
     * sync:ing the page cache and sync:ing the extent pages. When all this
     * is done AND the fragment LCP data files are sync:ed and closed, then
     * the LCP is done.
     */
    lcp_start_complete_processing(signal, ptr);
  }
  else
  {
    jam();
    BackupFragmentConf * conf = (BackupFragmentConf*)signal->getDataPtrSend();
    conf->backupId = ptr.p->backupId;
    conf->backupPtr = ptr.i;
    conf->tableId = filePtr.p->tableId;
    conf->fragmentNo = filePtr.p->fragmentNo;
    conf->noOfRecordsLow = (Uint32)(op.noOfRecords & 0xFFFFFFFF);
    conf->noOfRecordsHigh = (Uint32)(op.noOfRecords >> 32);
    conf->noOfBytesLow = (Uint32)(op.noOfBytes & 0xFFFFFFFF);
    conf->noOfBytesHigh = (Uint32)(op.noOfBytes >> 32);
    sendSignal(ptr.p->masterRef, GSN_BACKUP_FRAGMENT_CONF, signal,
               BackupFragmentConf::SignalLength, JBA);

    ptr.p->m_gsn = GSN_BACKUP_FRAGMENT_CONF;
    ptr.p->slaveState.setState(STARTED);
  }
  return;
}

void
Backup::backupFragmentRef(Signal * signal, BackupFilePtr filePtr)
{
  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  ptr.p->m_gsn = GSN_BACKUP_FRAGMENT_REF;

  CRASH_INSERTION((10044));
  CRASH_INSERTION((10045));

  BackupFragmentRef * ref = (BackupFragmentRef*)signal->getDataPtrSend();
  ref->backupId = ptr.p->backupId;
  ref->backupPtr = ptr.i;
  ref->nodeId = getOwnNodeId();
  ref->errorCode = filePtr.p->errorCode;
  sendSignal(ptr.p->masterRef, GSN_BACKUP_FRAGMENT_REF, signal,
             BackupFragmentRef::SignalLength, JBB);
}

void
Backup::update_pause_lcp_counter(Uint32 loop_count)
{
  /**
   * We keep track of the time we are executing LCP writes on a
   * fairly detailed level to ensure that our real-time properties
   * are ok. In some cases we can loop quite extensively in TUP
   * looking for rows to checkpoint. This involves scanning each
   * row to see if it has changed since the last LCP.
   *
   * We provide a loop count where scanning a row or a page is worth
   * 4 ticks, whereas a quick check of a row to find that it isn't
   * eligible is only worth one tick.
   * The checks of rows in CHANGED
   * pages are optimised since it is such a common case. This scan
   * uses prefetching techniques to ensure that we avoid being
   * hindered by cache misses. In a large database it is very
   * likely that these scans will touch a lot of memory and will
   * thus require prefetching to keep up. We predict that we can
   * scan one row in about 25 nanoseconds. Thus one loop is equal
   * to 100 nanoseconds. We estimate that we will be able to write
   * about 320 bytes per microsecond and thus one loop count is
   * counted equal to 32 bytes. This cost is fairly independent of
   * the table size and table structure since we are only checking
   * the header of the row.
   */
  BackupRecordPtr ptr = m_lcp_ptr;
  ndbassert(ptr.p->is_lcp());
  ptr.p->m_row_scan_counter += (loop_count / 4);
  ptr.p->m_pause_counter += (loop_count * 8);
}
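
/**
 * A worked example (not from the original source) of the accounting
 * above: suppose TUP reports loop_count == 400 for a batch, i.e. roughly
 * 100 fully scanned rows at 4 ticks each. Then
 *
 *   m_row_scan_counter += 400 / 4;   // ~100 rows scanned
 *   m_pause_counter    += 400 * 8;   // 3200 "bytes" of CPU cost,
 *                                    // i.e. 32 bytes per scanned row
 *
 * so a batch of 100 scanned rows is charged as if 3.2 kB had been
 * written; check_pause_lcp_backup() below adds this pause counter to the
 * real bytes written when deciding whether to take a real-time break.
 */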

bool
Backup::check_pause_lcp_backup(BackupRecordPtr ptr,
                               bool is_lcp,
                               bool is_send_scan_next_req)
{
  /**
   * We call this function every time it is necessary to decide if
   * we should issue a real-time break in an LCP scan; we also call
   * it to decide if we are to stay at prio A level for backups.
   *
   * We keep track of the desired write speed. We try to write as
   * much as is necessary to keep up with the desired write speed
   * since the last time we had a real-time break.
   *
   * If we are for some reason lagging behind the desired write speed
   * since the start of the scan, we write a bit more on each real-time
   * break until we have caught up. There could be many reasons why
   * this is necessary, one could be that we had a real-time break
   * that overslept a bit.
   *
   * To avoid problems when we overslept we also maximise the amount
   * of writes we can perform in one real-time break. This maximum
   * is dependent on the ALERT level on the REDO log.
   *
   * To handle these requirements we keep track of the start time
   * of the scan (sending SCAN_FRAGREQ). We keep track of the last
   * time this method decided to issue a real-time break; it could
   * also be decided by higher level methods, in which case they
   * will call the pausing_lcp method to record this timer and
   * the number of bytes written up to that point before entering a
   * real-time break.
   */
  if (!is_lcp)
  {
    jam();
    ndbassert(!ptr.p->is_lcp());
    Uint64 max_bytes_to_write = 4 * ZMAX_WORDS_PER_SCAN_BATCH_HIGH_PRIO;
    if (ptr.p->m_num_scan_req_on_prioa == 0)
    {
      jam();
      return false;
    }
    Uint64 bytes_written_in_last_lcp = ptr.p->m_bytes_written;
    Uint64 last_recorded_bytes_written = ptr.p->m_last_recorded_bytes_written;
    ptr.p->m_last_recorded_bytes_written = bytes_written_in_last_lcp;
    Uint64 bytes_written_since_last_delay =
      bytes_written_in_last_lcp - last_recorded_bytes_written;
    return (bytes_written_since_last_delay >= max_bytes_to_write);
  }
  jam();
  ndbassert(ptr.p->is_lcp());
  Uint64 max_words_to_scan = 4 * (ZMAX_WORDS_PER_SCAN_BATCH_HIGH_PRIO *
                                  m_redo_alert_factor);
  if (ptr.p->m_num_scan_req_on_prioa == 0 &&
      !is_send_scan_next_req)
  {
    jam();
    max_words_to_scan = 4 * ZMAX_WORDS_PER_SCAN_BATCH_LOW_PRIO *
                        m_redo_alert_factor;
  }
  Uint64 bytes_written_in_last_lcp = ptr.p->m_bytes_written;
  Uint64 last_recorded_bytes_written = ptr.p->m_last_recorded_bytes_written;
  Uint64 pause_counter = ptr.p->m_pause_counter;
  Uint64 bytes_written_since_last_delay =
    bytes_written_in_last_lcp - last_recorded_bytes_written;
  bytes_written_since_last_delay += pause_counter;

  /* Calculate if we are behind since start of scan */
  /* Current disk write speed is in per 100 ms */
  Uint64 desired_write_speed =
    Uint64(CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS) *
    m_curr_disk_write_speed;
  NDB_TICKS now = getHighResTimer();
  NDB_TICKS start_scan = ptr.p->m_scan_start_timer;
  Uint64 micros_since_start_scan =
    NdbTick_Elapsed(start_scan, now).microSec();
  Uint64 desired_written_bytes =
    (desired_write_speed * micros_since_start_scan) /
      (Uint64(1000) * Uint64(1000));

  /* Calculate if we are behind since last rt break */
  NDB_TICKS last_delay_timer = ptr.p->m_last_delay_scan_timer;
  Uint64 micros_since_last_delay =
    NdbTick_Elapsed(last_delay_timer, now).microSec();
  Uint64 desired_bytes_in_this_rt_break =
    (micros_since_last_delay * desired_write_speed) /
      (Uint64(1000) * Uint64(1000));

  /* Adjust bytes to write in this rt break if behind since last scan */
  if (bytes_written_in_last_lcp < desired_written_bytes)
  {
    desired_bytes_in_this_rt_break *= Uint64(125);
    desired_bytes_in_this_rt_break /= Uint64(100);
  }
  Uint64 max_bytes_to_write = MIN(desired_bytes_in_this_rt_break,
                                  max_words_to_scan);
  max_bytes_to_write = MAX(max_bytes_to_write,
                           (4 * ZMAX_WORDS_PER_SCAN_BATCH_LOW_PRIO));
#ifdef VM_TRACE
  if (is_send_scan_next_req ||
      (bytes_written_since_last_delay >= max_bytes_to_write))
  {
    m_debug_redo_log_count++;
    if (m_debug_redo_log_count > 1000000)
    {
      if (m_debug_redo_log_count > 1000004)
      {
        m_debug_redo_log_count = 0;
      }
      DEB_REDO_CONTROL(("(%u)check_pause_lcp: bytes_since_last_delay: %llu"
                        ", desired_bytes_in_this_break: %llu"
                        ", max_bytes_to_write: %llu"
                        ", micros_since_last_delay: %llu"
                        ", scan_row_counter: %llu",
                        instance(),
                        bytes_written_since_last_delay,
                        desired_bytes_in_this_rt_break,
                        max_bytes_to_write,
                        micros_since_last_delay,
                        ptr.p->m_row_scan_counter));
    }
  }
#endif
  return (bytes_written_since_last_delay >= max_bytes_to_write);
}
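
/**
 * A numeric sketch (not from the original source) of the rate check
 * above. Assume m_curr_disk_write_speed corresponds to 2,000,000 bytes
 * per second after the conversion factor, and 10 milliseconds have
 * passed since the last real-time break:
 *
 *   desired_bytes_in_this_rt_break = 2000000 * 10000 / 1000000
 *                                  = 20000 bytes
 *
 * If the scan is behind the desired speed measured from the scan start,
 * the quota is raised by 25% (to 25000 bytes). The quota is then clamped
 * from above by the batch limit (scaled by m_redo_alert_factor) and from
 * below by 4 * ZMAX_WORDS_PER_SCAN_BATCH_LOW_PRIO, and we pause once the
 * bytes written plus the pause counter reach the quota.
 */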

void
Backup::pausing_lcp(Uint32 place, Uint32 val)
{
  /* Pause LCP execution, record current time and bytes written */
  BackupRecordPtr ptr = m_lcp_ptr;
  ndbassert(ptr.p->is_lcp());
  Uint64 bytes_written_in_last_lcp =
    ptr.p->m_bytes_written;
  NDB_TICKS now = getHighResTimer();
  ptr.p->m_last_recorded_bytes_written = bytes_written_in_last_lcp;
  ptr.p->m_last_delay_scan_timer = now;
  ptr.p->m_pause_counter = 0;
  ptr.p->m_num_scan_req_on_prioa = 0;
#ifdef VM_TRACE
  if (m_debug_redo_log_count > 1000000)
  {
    DEB_REDO_CONTROL(("(%u)pausing_lcp from place: %u, val: %u",
                      instance(),
                      place,
                      val));
  }
#endif
}

void
Backup::checkScan(Signal* signal,
                  BackupRecordPtr ptr,
                  BackupFilePtr filePtr,
                  bool after_wait)
{
  OperationRecord & op = filePtr.p->operation;
  BlockReference lqhRef = 0;
  if (ptr.p->is_lcp())
  {
    lqhRef = calcInstanceBlockRef(DBLQH);
  }
  else
  {
    TablePtr tabPtr;
    ndbrequire(findTable(ptr, tabPtr, filePtr.p->tableId));
    FragmentPtr fragPtr;
    tabPtr.p->fragments.getPtr(fragPtr, filePtr.p->fragmentNo);
    const Uint32 instanceKey = fragPtr.p->lqhInstanceKey;
    lqhRef = numberToRef(DBLQH, instanceKey, getOwnNodeId());
  }
  if (check_error(ptr, filePtr))
  {
    jam();
    /**
     * Close scan
     */
    if (ptr.p->is_lcp())
    {
      DEB_LCP(("(%u) Close LCP scan after receiving error: %u",
              instance(),
              filePtr.p->errorCode));
    }
    op.closeScan();
    ScanFragNextReq * req = (ScanFragNextReq *)signal->getDataPtrSend();
    req->senderData = filePtr.i;
    req->requestInfo = 0;
    ScanFragNextReq::setCloseFlag(req->requestInfo, 1);
    req->transId1 = 0;
    req->transId2 = (BACKUP << 20) + (getOwnNodeId() << 8);
    sendSignal(lqhRef, GSN_SCAN_NEXTREQ, signal,
               ScanFragNextReq::SignalLength, JBB);
    return;
  }//if
  if (check_new_scan(ptr, op, after_wait))
  {
    jam();

    ScanFragNextReq * req = (ScanFragNextReq *)signal->getDataPtrSend();
    req->senderData = filePtr.i;
    req->requestInfo = 0;
    req->transId1 = 0;
    req->transId2 = (BACKUP << 20) + (getOwnNodeId() << 8);
    req->batch_size_rows= ZRESERVED_SCAN_BATCH_SIZE;
    req->batch_size_bytes= 0;

    if(ERROR_INSERTED(10032))
      sendSignalWithDelay(lqhRef, GSN_SCAN_NEXTREQ, signal,
                          100, ScanFragNextReq::SignalLength);
    else if(ERROR_INSERTED(10033))
    {
      SET_ERROR_INSERT_VALUE(10032);
      sendSignalWithDelay(lqhRef, GSN_SCAN_NEXTREQ, signal,
                          10000, ScanFragNextReq::SignalLength);

      BackupRecordPtr ptr;
      c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
      AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
      ord->backupId = ptr.p->backupId;
      ord->backupPtr = ptr.i;
      ord->requestType = AbortBackupOrd::FileOrScanError;
      ord->senderData= ptr.i;
      sendSignal(ptr.p->masterRef, GSN_ABORT_BACKUP_ORD, signal,
                 AbortBackupOrd::SignalLength, JBB);
    }
#ifdef ERROR_INSERT
    else if (ERROR_INSERTED(10042) && filePtr.p->tableId == c_error_insert_extra)
    {
      sendSignalWithDelay(lqhRef, GSN_SCAN_NEXTREQ, signal,
                          10, ScanFragNextReq::SignalLength);
    }
#endif
    else
    {
      /**
       * We send all interactions with bounded delay, which means that we
       * will wait for at most 128 signals before the signal is put into
       * the A-level job buffer. After this we will execute at A-level
       * until we arrive back with a SCAN_FRAGCONF. After SCAN_FRAGCONF we
       * get back to here again, so this means we will execute at least
       * 16 rows before any B-level signals are allowed again. Thus the
       * LCP will scan at least 16 rows per 128 signals even at complete
       * overload.
       *
       * We will even send yet one more batch of 16 rows at A-priority
       * level per 100 B-level signals if we have difficulties in even
       * meeting the minimum desired checkpoint level.
       */
      JobBufferLevel prio_level = JBB;
      bool file_buf_contains_min_write_size = false;
      if (check_scan_if_raise_prio(signal, ptr))
      {
        OperationRecord & op = filePtr.p->operation;
        file_buf_contains_min_write_size =
          check_min_buf_size(ptr, op);

        ScanFragNextReq::setPrioAFlag(req->requestInfo, 1);
        if (!file_buf_contains_min_write_size &&
            !check_pause_lcp_backup(ptr))
        {
          jam();
          /**
           * There are two reasons why we won't continue executing at
           * prio A level.
           *
           * 1) The last execution at prio A generated more than the max
           *    words per A-level batch, so we get back to a bounded delay
           *    signal.
           *
           * 2) We already have a buffer ready to be sent to the file
           *    system. There is no reason to execute at a very high
           *    priority simply to fill buffers not waiting to be filled.
           *    If it is an LCP and we are reaching some limit we will be
           *    more active in filling up buffers.
           *
           * We will continue a bit more if we have set m_redo_alert_factor
           * higher than 1. We will do this in very critical situations when
           * we want to ensure that LCP writes get higher priority. The redo
           * alert factor is always 1 for backups since there is no urgency
           * in completing backups; it is enough to manage backups properly.
           */
          /* Continue at prio A level 16 more rows */
          ptr.p->m_num_scan_req_on_prioa++;
          prio_level = JBA;
        }
      }
      if (lqhRef == calcInstanceBlockRef(DBLQH) && (prio_level == JBB))
      {
        if (ptr.p->is_lcp())
        {
          pausing_lcp(1,
                      (2*(ScanFragNextReq::getPrioAFlag(req->requestInfo))) +
                      file_buf_contains_min_write_size);
        }
        sendSignalWithDelay(lqhRef, GSN_SCAN_NEXTREQ, signal,
                            BOUNDED_DELAY, ScanFragNextReq::SignalLength);
      }
      else
      {
        /* Cannot send delayed signals to other threads. */
        ndbrequire(!ptr.p->is_lcp() || prio_level == JBA);
        sendSignal(lqhRef,
                   GSN_SCAN_NEXTREQ,
                   signal,
                   ScanFragNextReq::SignalLength,
                   prio_level);
      }
      /*
        check if it is time to report backup status
      */
      if (!ptr.p->is_lcp())
      {
        jam();
        checkReportStatus(signal, ptr);
      }
    }
    return;
  }//if
  if (ptr.p->is_lcp())
  {
    pausing_lcp(2,0);
    DEB_EXTRA_LCP(("(%u)newScan false in checkScan", instance()));
  }
  signal->theData[0] = BackupContinueB::BUFFER_FULL_SCAN;
  signal->theData[1] = filePtr.i;
  sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                      WaitDiskBufferCapacityMillis, 2);
}

void
Backup::execFSAPPENDREF(Signal* signal)
{
  jamEntry();

  FsRef * ref = (FsRef *)signal->getDataPtr();

  const Uint32 filePtrI = ref->userPointer;
  const Uint32 errCode = ref->errorCode;

  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, filePtrI);

  filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_FILE_THREAD;
  filePtr.p->errorCode = errCode;

  CRASH_INSERTION(10044);
  CRASH_INSERTION(10045);
  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
  if (ptr.p->is_lcp())
  {
    /**
     * Log this for LCPs; backups should be able to handle
     * running out of disk space. An LCP could potentially survive for
     * a while, but will eventually crash or hit the
     * infamous 410 condition.
     */
    g_eventLogger->info("LCP got FSAPPENDREF, serious error: error code: %u",
                        errCode);
  }
  checkFile(signal, filePtr);
}

void
Backup::execFSAPPENDCONF(Signal* signal)
{
  jamEntry();

  CRASH_INSERTION((10018));

  //FsConf * conf = (FsConf*)signal->getDataPtr();
  const Uint32 filePtrI = signal->theData[0]; //conf->userPointer;
  const Uint32 bytes = signal->theData[1]; //conf->bytes;

  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, filePtrI);

  OperationRecord & op = filePtr.p->operation;

  op.dataBuffer.updateReadPtr(bytes >> 2);

  checkFile(signal, filePtr);
}

/*
  This routine handles two problems with writing to disk during local
  checkpoints and backups. The first problem is that we need to limit
  the writing to ensure that we don't use too much CPU and disk resources
  for backups and checkpoints. For LCPs we use an adaptive algorithm that
  changes the current disk write speed based on how much checkpointing we
  need to do in order to not run out of REDO log.
  Backup writes are added to the total disk write speed we control, but
  backup writes are also separately controlled to avoid backups taking
  up resources that are needed by the REDO log.

  The second problem is that on Linux we can get severe problems if we
  write very much to disk without syncing. In the worst case we
  can have gigabytes of data in the Linux page cache before we reach
  the limit of how much we can write. If this happens, performance
  will drop significantly when we reach this limit since the Linux flush
  daemon will spend a few minutes writing out the page cache to disk.
  To avoid this we ensure that a file never has more than a certain
  amount of data outstanding before sync. This amount is also
  configurable.
*/
bool
Backup::ready_to_write(bool ready,
                       Uint32 sz,
                       bool eof,
                       BackupFile *fileP,
                       BackupRecord *ptrP)
{
#if 0
  ndbout << "ready_to_write: ready = " << ready << " eof = " << eof;
  ndbout << " sz = " << sz << endl;
  ndbout << "words this period = " << m_words_written_this_period;
  ndbout << "backup words this period = "
         << m_backup_words_written_this_period;
  ndbout << endl << "overflow disk write = " << m_overflow_disk_write;
  ndbout << endl << "backup overflow disk write = "
         << m_backup_overflow_disk_write;
  ndbout << endl << "Current Millisecond is = ";
  ndbout << NdbTick_CurrentMillisecond() << endl;
#endif

  if (ERROR_INSERTED(10043) && eof)
  {
    /* Block indefinitely without closing the file */
    jam();
    return false;
  }

  if ((ready || eof) &&
      m_words_written_this_period <= m_curr_disk_write_speed &&
      (ptrP->is_lcp() ||
       m_backup_words_written_this_period <= m_curr_backup_disk_write_speed))
  {
    /*
      We have a buffer ready to write, or we have reached end of file
      and thus must write the last data before closing the file.
      We have already checked that we are allowed to write at this
      moment. We only worry about the history of the last 100
      milliseconds. What happened before that is of no interest, since
      a disk write that was issued more than 100 milliseconds ago
      should be completed by now.
    */
    jam();
    int overflow;
    m_monitor_words_written+= sz;
    m_words_written_this_period += sz;
    overflow = m_words_written_this_period - m_curr_disk_write_speed;
    if (overflow > 0)
      m_overflow_disk_write = overflow;
    if (!ptrP->is_lcp())
    {
      m_backup_monitor_words_written += sz;
      m_backup_words_written_this_period += sz;
      overflow = m_backup_words_written_this_period -
                 m_curr_backup_disk_write_speed;
      if (overflow > 0)
        m_backup_overflow_disk_write = overflow;
    }
#if 0
    ndbout << "Will write with " << endl;
    ndbout << endl;
#endif
    return true;
  }
  else
  {
#if 0
    ndbout << "Will not write now" << endl << endl;
#endif
    jam();
    return false;
  }
}
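
/**
 * A minimal sketch (not from the original source) of the bookkeeping
 * above, with hypothetical numbers: if m_curr_disk_write_speed is 1000
 * words per 100 ms period and 900 words have been written this period,
 * a ready 200-word buffer is still allowed through (the quota is checked
 * before the addition), after which
 *
 *   m_words_written_this_period = 1100
 *   m_overflow_disk_write       = 1100 - 1000 = 100
 *
 * so the 100-word overshoot is recorded as overflow for the adaptive
 * rate control to account for, rather than being lost.
 */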

void
Backup::checkFile(Signal* signal, BackupFilePtr filePtr)
{

#ifdef DEBUG_ABORT
  //  ndbout_c("---- check file filePtr.i = %u", filePtr.i);
#endif

  OperationRecord & op = filePtr.p->operation;
  Uint32 *tmp = NULL;
  Uint32 sz = 0;
  bool eof = FALSE;
  bool ready = op.dataBuffer.getReadPtr(&tmp, &sz, &eof);

#if 0
  ndbout << "Ptr to data = " << hex << tmp << endl;
#endif
  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  if (ERROR_INSERTED(10036))
  {
    jam();
    filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_FILE_THREAD;
    filePtr.p->errorCode = 2810;
    ptr.p->setErrorCode(2810);

    if(ptr.p->m_gsn == GSN_STOP_BACKUP_REQ)
    {
      jam();
      closeFile(signal, ptr, filePtr);
    }
    return;
  }

  if(filePtr.p->errorCode != 0)
  {
    jam();
    ptr.p->setErrorCode(filePtr.p->errorCode);

    if(ptr.p->m_gsn == GSN_STOP_BACKUP_REQ)
    {
      jam();
      closeFile(signal, ptr, filePtr);
    }

    if (ptr.p->is_lcp())
    {
      jam();
      /* Close file with error - will delete it */
      closeFile(signal, ptr, filePtr);
    }

    return;
  }

  if (!ready_to_write(ready,
                      sz,
                      eof,
                      filePtr.p,
                      ptr.p))
  {
    jam();
    signal->theData[0] = BackupContinueB::BUFFER_UNDERFLOW;
    signal->theData[1] = filePtr.i;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                        WaitDiskBufferCapacityMillis, 2);
    return;
  }
  else if (sz > 0)
  {
    jam();
#ifdef ERROR_INSERT
    /* Test APPENDREF handling */
    if (filePtr.p->fileType == BackupFormat::DATA_FILE)
    {
      if (ERROR_INSERTED(10045))
      {
        ndbout_c("BF_SCAN_THREAD = %u",
                 (filePtr.p->m_flags & BackupFile::BF_SCAN_THREAD));
      }

      if ((ERROR_INSERTED(10044) &&
           !(filePtr.p->m_flags & BackupFile::BF_SCAN_THREAD)) ||
          (ERROR_INSERTED(10045) &&
           (filePtr.p->m_flags & BackupFile::BF_SCAN_THREAD)))
      {
        jam();
        ndbout_c("REFing on append to data file for table %u, fragment %u, "
                 "BF_SCAN_THREAD running : %u",
                 filePtr.p->tableId,
                 filePtr.p->fragmentNo,
                 filePtr.p->m_flags & BackupFile::BF_SCAN_THREAD);
        FsRef* ref = (FsRef *)signal->getDataPtrSend();
        ref->userPointer = filePtr.i;
        ref->errorCode = FsRef::fsErrInvalidParameters;
        ref->osErrorCode = ~0;
        /* EXEC DIRECT to avoid change in BF_SCAN_THREAD state */
        EXECUTE_DIRECT(BACKUP, GSN_FSAPPENDREF, signal,
                       3);
        return;
      }
    }
#endif

    const bool write_to_datafile = (filePtr.i == ptr.p->dataFilePtr[0]);
    /**
     * If O_DIRECT is enabled, the write should be done in 128-word chunks.
     * For O_DIRECT writes of less than 128 words, we skip the writes when
     * we have reached end of file and we are about to abort the backup (and
     * will not be interested in its results). We avoid writing in this case
     * since we don't want to handle errors for O_DIRECT calls.
     * However we only avoid this write for data files since CTL files and
     * LOG files never use O_DIRECT. Also there is no need to avoid the
     * write if we don't use O_DIRECT at all.
     */
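    /*
      Illustration (not from the original source): sz is counted in
      32-bit words, so a multiple of 128 words is a 512-byte O_DIRECT
      unit. A tail write of, say, sz == 1000 words has
      1000 % 128 == 104, so it is not O_DIRECT sized and is skipped
      under the abort conditions checked below.
    */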
    const bool skip_write = (c_defaults.m_o_direct &&  // O_DIRECT write
                       write_to_datafile &&   // to datafile
                       !ptr.p->is_lcp() &&    // during backup
                       eof &&    // last chunk of data to write to file
                       (sz % 128 != 0) &&     // too small for O_DIRECT
                       (ptr.p->slaveState.getState() == STOPPING) &&
                       ptr.p->checkError());  // backup to be aborted

    if(likely(!skip_write))
    {
      jam();
      ndbassert((Uint64(tmp - c_startOfPages) >> 32) == 0); // 4Gb buffers!
      FsAppendReq * req = (FsAppendReq *)signal->getDataPtrSend();
      req->filePointer   = filePtr.p->filePointer;
      req->userPointer   = filePtr.i;
      req->userReference = reference();
      req->varIndex      = 0;
      req->offset        = Uint32(tmp - c_startOfPages); // 4Gb buffers!
      req->size          = sz;
      req->synch_flag    = 0;

      sendSignal(NDBFS_REF, GSN_FSAPPENDREQ, signal,
                 FsAppendReq::SignalLength, JBA);
      return;
    }
  }

  Uint32 flags = filePtr.p->m_flags;
  filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_FILE_THREAD;

  ndbrequire(flags & BackupFile::BF_OPEN);
  ndbrequire(flags & BackupFile::BF_FILE_THREAD);

  if (ptr.p->is_lcp())
  {
    jam();
    closeFile(signal, ptr, filePtr, false, false);
  }
  else
  {
    jam();
    closeFile(signal, ptr, filePtr);
  }
  return;
}


/****************************************************************************
 *
 * Slave functionality: Perform logging
 *
 ****************************************************************************/
void
Backup::execBACKUP_TRIG_REQ(Signal* signal)
{
  /*
  TUP asks if this trigger is to be fired on this node.
  */
  TriggerPtr trigPtr;
  TablePtr tabPtr;
  FragmentPtr fragPtr;
  Uint32 trigger_id = signal->theData[0];
  Uint32 frag_id = signal->theData[1];
  Uint32 result;

  jamEntry();

  c_triggerPool.getPtr(trigPtr, trigger_id);

  c_tablePool.getPtr(tabPtr, trigPtr.p->tab_ptr_i);
  tabPtr.p->fragments.getPtr(fragPtr, frag_id);
  if (fragPtr.p->node != getOwnNodeId()) {

    jam();
    result = ZFALSE;
  } else {
    jam();
    result = ZTRUE;
  }//if
  signal->theData[0] = result;
}

BackupFormat::LogFile::LogEntry *
Backup::get_log_buffer(Signal* signal,
                       TriggerPtr trigPtr, Uint32 sz)
{
  Uint32 * dst;
  if(ERROR_INSERTED(10030))
  {
    jam();
    dst = 0;
  }
  else
  {
    jam();
    FsBuffer & buf = trigPtr.p->operation->dataBuffer;
    ndbrequire(sz <= buf.getMaxWrite());
    if (unlikely(!buf.getWritePtr(&dst, sz)))
    {
      jam();
      dst = 0;
    }
  }

  if (unlikely(dst == 0))
  {
    Uint32 save[TrigAttrInfo::StaticLength];
    memcpy(save, signal->getDataPtr(), 4*TrigAttrInfo::StaticLength);
    BackupRecordPtr ptr;
    c_backupPool.getPtr(ptr, trigPtr.p->backupPtr);
    trigPtr.p->errorCode = AbortBackupOrd::LogBufferFull;
    AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
    ord->backupId = ptr.p->backupId;
    ord->backupPtr = ptr.i;
    ord->requestType = AbortBackupOrd::LogBufferFull;
    ord->senderData= ptr.i;
    sendSignal(ptr.p->masterRef, GSN_ABORT_BACKUP_ORD, signal,
               AbortBackupOrd::SignalLength, JBB);

    memcpy(signal->getDataPtrSend(), save, 4*TrigAttrInfo::StaticLength);
    return 0;
  }//if

  BackupFormat::LogFile::LogEntry * logEntry =
    (BackupFormat::LogFile::LogEntry *)dst;
  logEntry->Length       = 0;
  logEntry->TableId      = htonl(trigPtr.p->tableId);

  if(trigPtr.p->event==0)
    logEntry->TriggerEvent= htonl(TriggerEvent::TE_INSERT);
  else if(trigPtr.p->event==1)
    logEntry->TriggerEvent= htonl(TriggerEvent::TE_UPDATE);
  else if(trigPtr.p->event==2)
    logEntry->TriggerEvent= htonl(TriggerEvent::TE_DELETE);
  else {
    ndbout << "Bad Event: " << trigPtr.p->event << endl;
    ndbabort();
  }

  return logEntry;
}
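
/**
 * A caller-side sketch (not from the original source) of how a log entry
 * obtained above is filled in, mirroring execTRIG_ATTRINFO and
 * execFIRE_TRIG_ORD below: the header fields are already set and Length
 * starts at 0, counting the words appended to Data[].
 *
 *   BackupFormat::LogFile::LogEntry * e =
 *     get_log_buffer(signal, trigPtr, sz);   // sz in words
 *   if (e != 0)
 *   {
 *     // append attribute data after any words already buffered
 *     memcpy(&e->Data[e->Length], src, len << 2);
 *     e->Length += len;   // finalized when the trigger ord arrives
 *   }
 */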

void
Backup::execTRIG_ATTRINFO(Signal* signal) {
  jamEntry();

  CRASH_INSERTION((10019));

  TrigAttrInfo * trg = (TrigAttrInfo*)signal->getDataPtr();

  TriggerPtr trigPtr;
  c_triggerPool.getPtr(trigPtr, trg->getTriggerId());
  ndbrequire(trigPtr.p->event != ILLEGAL_TRIGGER_ID); // Online...

  if(trigPtr.p->errorCode != 0) {
    jam();
    return;
  }//if

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, trigPtr.p->backupPtr);

  if(ptr.p->flags & BackupReq::USE_UNDO_LOG) {
    if(trg->getAttrInfoType() == TrigAttrInfo::AFTER_VALUES) {
      jam();
      /**
       * Backup is doing UNDO logging and doesn't need after values
       */
      return;
    }//if
  }
  else {
    if(trg->getAttrInfoType() == TrigAttrInfo::BEFORE_VALUES) {
      jam();
      /**
       * Backup is doing REDO logging and doesn't need before values
       */
      return;
    }//if
  }

  BackupFormat::LogFile::LogEntry * logEntry = trigPtr.p->logEntry;
  if(logEntry == 0)
  {
    jam();
    logEntry = get_log_buffer(signal,
                              trigPtr,
                              BackupFormat::LogFile::LogEntry::MAX_SIZE);
    trigPtr.p->logEntry = logEntry;
    if (unlikely(logEntry == 0))
    {
      jam();
      return;
    }
  } else {
    ndbrequire(logEntry->TableId == htonl(trigPtr.p->tableId));
//    ndbrequire(logEntry->TriggerEvent == htonl(trigPtr.p->event));
  }//if

  const Uint32 pos = logEntry->Length;
  const Uint32 dataLen = signal->length() - TrigAttrInfo::StaticLength;
  memcpy(&logEntry->Data[pos], trg->getData(), dataLen << 2);

  logEntry->Length = pos + dataLen;
}

void
Backup::execFIRE_TRIG_ORD(Signal* signal)
{
  jamEntry();

  if (!assembleFragments(signal))
  {
    jam();
    return;
  }

  FireTrigOrd* trg = (FireTrigOrd*)signal->getDataPtr();

  const Uint32 gci = trg->getGCI();
  const Uint32 trI = trg->getTriggerId();
  const Uint32 fragId = trg->fragId;

  TriggerPtr trigPtr;
  c_triggerPool.getPtr(trigPtr, trI);

  ndbrequire(trigPtr.p->event != ILLEGAL_TRIGGER_ID);

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, trigPtr.p->backupPtr);

  if(trigPtr.p->errorCode != 0) {
    jam();
    SectionHandle handle(this, signal);
    releaseSections(handle);
    return;
  }//if

  if (isNdbMtLqh())
  {
    jam();
    /* This is the decision point for including
     * this row change in the log file on ndbmtd
     */
    TablePtr tabPtr;
    c_tablePool.getPtr(tabPtr, trigPtr.p->tab_ptr_i);
    FragmentPtr fragPtr;
    tabPtr.p->fragments.getPtr(fragPtr, fragId);
    if (fragPtr.p->node != getOwnNodeId())
    {
      jam();
      trigPtr.p->logEntry = 0;
      SectionHandle handle(this,signal);
      releaseSections(handle);
      return;
    }
  }

  if (signal->getNoOfSections())
  {
    jam();
    SectionHandle handle(this,signal);
    SegmentedSectionPtr dataPtr[3];
    handle.getSection(dataPtr[0], 0);
    handle.getSection(dataPtr[1], 1);
    handle.getSection(dataPtr[2], 2);
    /**
     * dataPtr[0] : Primary key info
     * dataPtr[1] : Before values
     * dataPtr[2] : After values
     */

    // Add one word to get_log_buffer for potential gci info stored at end.
    const Uint32 log_entry_words =
        1 /* length word */ +
        BackupFormat::LogFile::LogEntry::HEADER_LENGTH_WORDS +
        1 /* gci_word */;

    // Backup is doing UNDO logging and needs before values
    if(ptr.p->flags & BackupReq::USE_UNDO_LOG)
    {
      jam();
      // Add one word to get_log_buffer for logEntry length info stored at end.
      trigPtr.p->logEntry = get_log_buffer(signal,
                                           trigPtr,
                                           log_entry_words +
                                             dataPtr[0].sz +
                                             dataPtr[1].sz +
                                             1);
      if (unlikely(trigPtr.p->logEntry == 0))
      {
        jam();
        releaseSections(handle);
        return;
      }
      copy(trigPtr.p->logEntry->Data, dataPtr[0]);
      copy(trigPtr.p->logEntry->Data+dataPtr[0].sz, dataPtr[1]);
      trigPtr.p->logEntry->Length = dataPtr[0].sz + dataPtr[1].sz;
    }
    //  Backup is doing REDO logging and needs after values
    else
    {
      jam();
      trigPtr.p->logEntry = get_log_buffer(signal,
                                           trigPtr,
                                           log_entry_words +
                                             dataPtr[0].sz +
                                             dataPtr[2].sz);
      if (unlikely(trigPtr.p->logEntry == 0))
      {
        jam();
10620         releaseSections(handle);
10621         return;
10622       }
10623       copy(trigPtr.p->logEntry->Data, dataPtr[0]);
10624       copy(trigPtr.p->logEntry->Data+dataPtr[0].sz, dataPtr[2]);
10625       trigPtr.p->logEntry->Length = dataPtr[0].sz + dataPtr[2].sz;
10626     }
10627 
10628     releaseSections(handle);
10629   }
10630 
10631   ndbrequire(trigPtr.p->logEntry != 0);
10632   Uint32 len = trigPtr.p->logEntry->Length;
10633   trigPtr.p->logEntry->FragId = htonl(fragId);
10634 
10635   /* Redo logs are always read from file start to file end, so
10636    * GCI content can be optimised out. If a set of N consecutive
10637    * log entries has the same GCI, the GCI is written only in the
10638    * first log entry of the set, while the remaining entries do
10639    * not contain a GCI. So an entry is written with a GCI only
10640    * when its GCI differs from that of the previous entry.
10641    * This cannot be done for undo logs since undo logs are read in
10642    * reverse, from file end to file start.
10643    */
10644   if ((ptr.p->flags & BackupReq::USE_UNDO_LOG) || (gci != ptr.p->currGCP))
10645   {
10646     jam();
10647     trigPtr.p->logEntry->TriggerEvent|= htonl(0x10000);
10648     trigPtr.p->logEntry->Data[len] = htonl(gci);
10649     len++;
10650     ptr.p->currGCP = gci;
10651   }
10652 
10653   Uint32 datalen = len;
10654   len += BackupFormat::LogFile::LogEntry::HEADER_LENGTH_WORDS;
10655   trigPtr.p->logEntry->Length = htonl(len);
10656 
10657   if(ptr.p->flags & BackupReq::USE_UNDO_LOG)
10658   {
10659     jam();
10660     /* Undo log entries are read backwards, so keep the length both in
10661        the Length word and at the end of logEntry; total length is len + 2
10662     */
10663     trigPtr.p->logEntry->Data[datalen] = htonl(len);
10664   }
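
  /* A sketch of the resulting log entry layout in words, derived from the
   * code above (field names as in BackupFormat::LogFile::LogEntry):
   *
   *   [Length][TableId][TriggerEvent][FragId][Data: attribute info words]
   *   [gci]     -- present only when the GCI word was added above
   *   [Length]  -- undo log only, so the entry can be read backwards
   */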
10665 
10666   Uint32 entryLength = len + 1;
10667   if (ptr.p->flags & BackupReq::USE_UNDO_LOG)
10668   {
10669     jam();
10670     entryLength++;
10671   }
10672 
10673   ndbrequire(entryLength <= trigPtr.p->operation->dataBuffer.getMaxWrite());
10674   trigPtr.p->operation->dataBuffer.updateWritePtr(entryLength);
10675   trigPtr.p->logEntry = 0;
10676 
10677   {
10678     const Uint32 entryByteLength = entryLength << 2;
10679     trigPtr.p->operation->noOfBytes     += entryByteLength;
10680     trigPtr.p->operation->m_bytes_total += entryByteLength;
10681     trigPtr.p->operation->noOfRecords     += 1;
10682     trigPtr.p->operation->m_records_total += 1;
10683   }
10684 }
10685 
10686 void
10687 Backup::sendAbortBackupOrd(Signal* signal, BackupRecordPtr ptr,
10688 			   Uint32 requestType)
10689 {
10690   jam();
10691   AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
10692   ord->backupId = ptr.p->backupId;
10693   ord->backupPtr = ptr.i;
10694   ord->requestType = requestType;
10695   ord->senderData= ptr.i;
10696   NodePtr node;
10697   Uint32 receiverInstance = instanceKey(ptr); // = BackupProxy for mt-backup
10698 
10699   if((ptr.p->fragWorkers[getOwnNodeId()].count() == 1)
10700       && (ptr.p->fragWorkers[getOwnNodeId()].find_first() == instance()))
10701   {
10702     // All signal-sender functions in abort protocol detect
10703     // send-to-self bitmask settings and send signals accordingly.
10704     ptr.p->senderRef = reference();
10705     receiverInstance = instance();
10706   }
10707 
10708   for(c_nodes.first(node); node.i != RNIL; c_nodes.next(node)) {
10709     jam();
10710     const Uint32 nodeId = node.p->nodeId;
10711     if(node.p->alive && ptr.p->nodes.get(nodeId)) {
10712       jam();
10713       BlockReference ref = numberToRef(BACKUP, receiverInstance, nodeId);
10714       sendSignal(ref, GSN_ABORT_BACKUP_ORD, signal,
10715 		 AbortBackupOrd::SignalLength, JBB);
10716     }//if
10717   }//for
10718 }
10719 
10720 /*****************************************************************************
10721  *
10722  * Slave functionality: Stop backup
10723  *
10724  *****************************************************************************/
10725 void
10726 Backup::execSTOP_BACKUP_REQ(Signal* signal)
10727 {
10728   jamEntry();
10729   StopBackupReq * req = (StopBackupReq*)signal->getDataPtr();
10730 
10731   CRASH_INSERTION((10020));
10732 
10733   const Uint32 ptrI = req->backupPtr;
10734   //const Uint32 backupId = req->backupId;
10735   const Uint32 startGCP = req->startGCP;
10736   const Uint32 stopGCP = req->stopGCP;
10737 
10738   /**
10739    * At least one GCP must have passed
10740    */
10741   ndbrequire(stopGCP > startGCP);
10742 
10743   /**
10744    * Get backup record
10745    */
10746   BackupRecordPtr ptr;
10747   c_backupPool.getPtr(ptr, ptrI);
10748 
10749   ptr.p->startGCP= startGCP;
10750   ptr.p->stopGCP= stopGCP;
10751 
10752   if (MT_BACKUP_FLAG(ptr.p->flags))
10753   {
10754     /**
10755      * In multithreaded backup, each Backup Worker sends
10756      * trigger-drop and trigger-firing signals only to its
10757      * local TUP. No sync is needed to ensure ordering of
10758      * trigger signals wrt STOP_BACKUP_REQ, since the
10759      * signals are added in order to the signal queue.
10760      */
10761     Uint32 retVal = 0;
10762     startDropTrig_synced(signal, ptrI, retVal);
10763   }
10764   else
10765   {
10766     /**
10767      * Ensure that any in-flight changes are
10768      * included in the backup log before
10769      * dropping the triggers
10770      *
10771      * This is necessary as the trigger-drop
10772      * signals are routed :
10773      *
10774      *   Backup Worker 1 <-> Proxy <-> TUP Worker 1..n
10775      *
10776      * While the trigger firing signals are
10777      * routed :
10778      *
10779      *   TUP Worker 1..n   -> Backup Worker 1
10780      *
10781      * So the arrival of signal-drop acks
10782      * does not imply that all fired
10783      * triggers have been seen.
10784      *
10785      *  Backup Worker 1
10786      *
10787      *        |             SYNC_PATH_REQ
10788      *        V
10789      *     TUP Proxy
10790      *    |  | ... |
10791      *    V  V     V
10792      *    1  2 ... n        (Workers)
10793      *    |  |     |
10794      *    |  |     |
10795      *
10796      *   Backup Worker 1
10797      */
10798     Uint32 path[] = { DBTUP, 0 };
10799     Callback cb = { safe_cast(&Backup::startDropTrig_synced), ptrI };
10800     synchronize_path(signal,
10801                      path,
10802                      cb);
10803     if (ERROR_INSERTED(10049) && (ptr.p->masterRef == reference()))
10804     {
10805       AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
10806       ord->backupId = ptr.p->backupId;
10807       ord->backupPtr = ptr.i;
10808       ord->requestType = AbortBackupOrd::LogBufferFull;
10809       ord->senderData= ptr.i;
10810       execABORT_BACKUP_ORD(signal);
10811     }
10812   }
10813 }
10814 
10815 void
10816 Backup::startDropTrig_synced(Signal* signal, Uint32 ptrI, Uint32 retVal)
10817 {
10818   jamEntry();
10819   /**
10820    * Get backup record
10821    */
10822   BackupRecordPtr ptr;
10823   c_backupPool.getPtr(ptr, ptrI);
10824 
10825   ptr.p->slaveState.setState(STOPPING);
10826   ptr.p->m_gsn = GSN_STOP_BACKUP_REQ;
10827 
10828   /**
10829    * Now drop the triggers
10830    */
10831   sendDropTrig(signal, ptr);
10832 }
10833 
10834 void
10835 Backup::closeFiles(Signal* sig, BackupRecordPtr ptr)
10836 {
10837   /**
10838    * Close all files
10839    */
10840   BackupFilePtr filePtr;
10841   int openCount = 0;
10842   for(ptr.p->files.first(filePtr); filePtr.i!=RNIL; ptr.p->files.next(filePtr))
10843   {
10844     if(! (filePtr.p->m_flags & BackupFile::BF_OPEN))
10845     {
10846       jam();
10847       continue;
10848     }
10849 
10850     jam();
10851     openCount++;
10852 
10853     if(filePtr.p->m_flags & BackupFile::BF_CLOSING)
10854     {
10855       jam();
10856       continue;
10857     }//if
10858 
10859     filePtr.p->operation.dataBuffer.eof();
10860     if(filePtr.p->m_flags & BackupFile::BF_FILE_THREAD)
10861     {
10862       jam();
10863 #ifdef DEBUG_ABORT
10864       ndbout_c("Close files fileRunning == 1, filePtr.i=%u", filePtr.i);
10865 #endif
10866     }
10867     else
10868     {
10869       jam();
10870       closeFile(sig, ptr, filePtr);
10871     }
10872   }
10873 
10874   if(openCount == 0){
10875     jam();
10876     closeFilesDone(sig, ptr);
10877   }//if
10878 }
10879 
10880 void
10881 Backup::closeFile(Signal* signal,
10882                   BackupRecordPtr ptr,
10883                   BackupFilePtr filePtr,
10884                   bool prepare_phase,
10885                   bool remove_flag)
10886 {
10887   ndbrequire(filePtr.p->m_flags & BackupFile::BF_OPEN);
10888   ndbrequire(! (filePtr.p->m_flags & BackupFile::BF_OPENING));
10889   ndbrequire(! (filePtr.p->m_flags & BackupFile::BF_CLOSING));
10890   filePtr.p->m_flags |= BackupFile::BF_CLOSING;
10891 
10892   FsCloseReq * req = (FsCloseReq *)signal->getDataPtrSend();
10893   req->filePointer = filePtr.p->filePointer;
10894   req->userPointer = filePtr.i;
10895   req->userReference = reference();
10896   req->fileFlag = 0;
10897 
10898   if (prepare_phase)
10899   {
10900     jam();
10901     if (ptr.p->prepareErrorCode)
10902     {
10903       jam();
10904       FsCloseReq::setRemoveFileFlag(req->fileFlag, 1);
10905     }
10906   }
10907   else
10908   {
10909     jam();
10910     if (ptr.p->errorCode)
10911     {
10912       jam();
10913       FsCloseReq::setRemoveFileFlag(req->fileFlag, 1);
10914     }
10915   }
10916   if (remove_flag)
10917   {
10918     jam();
10919     FsCloseReq::setRemoveFileFlag(req->fileFlag, 1);
10920   }
10921 
10922 #ifdef DEBUG_ABORT
10923   ndbout_c("***** a FSCLOSEREQ filePtr.i = %u flags: %x",
10924 	   filePtr.i, filePtr.p->m_flags);
10925 #endif
10926   sendSignal(NDBFS_REF, GSN_FSCLOSEREQ, signal, FsCloseReq::SignalLength, JBA);
10927 
10928 }
10929 
10930 void
10931 Backup::execFSCLOSEREF(Signal* signal)
10932 {
10933   jamEntry();
10934 
10935   FsRef * ref = (FsRef*)signal->getDataPtr();
10936   const Uint32 filePtrI = ref->userPointer;
10937 
10938   BackupFilePtr filePtr;
10939   c_backupFilePool.getPtr(filePtr, filePtrI);
10940 
10941   BackupRecordPtr ptr;
10942   c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
10943 
10944   FsConf * conf = (FsConf*)signal->getDataPtr();
10945   conf->userPointer = filePtrI;
10946 
10947   const char *file_type_str;
10948   const char *op_type_str;
10949 
10950   if (ptr.p->errorCode == 0)
10951   {
10952     ptr.p->errorCode = ref->errorCode;
10953   }
10954   if (filePtr.p->errorCode == 0)
10955   {
10956     filePtr.p->errorCode = ref->errorCode;
10957   }
10958   if (ptr.p->is_lcp())
10959   {
10960     op_type_str = "LCP";
10961     if (ptr.p->prepareCtlFilePtr[0] == filePtrI ||
10962         ptr.p->prepareCtlFilePtr[1] == filePtrI)
10963       file_type_str = "prepare ctl";
10964     else if (ptr.p->prepareDataFilePtr[0] == filePtrI)
10965       file_type_str = "prepare data";
10966     else if (ptr.p->deleteFilePtr == filePtrI)
10967       file_type_str = "delete file";
10968     else if (ptr.p->dataFilePtr[0] == filePtrI)
10969       file_type_str = "data";
10970     else if (ptr.p->ctlFilePtr == filePtrI)
10971       file_type_str = "ctl";
10972     else
10973     {
10974       ndbabort();
10975       file_type_str = NULL;
10976     }
10977   }
10978   else
10979   {
10980     op_type_str = "backup";
10981     if (ptr.p->ctlFilePtr == filePtrI)
10982       file_type_str = "ctl";
10983     else if (ptr.p->dataFilePtr[0] == filePtrI)
10984       file_type_str = "data";
10985     else if (ptr.p->logFilePtr == filePtrI)
10986       file_type_str = "log";
10987     else
10988     {
10989       ndbabort();
10990       file_type_str = NULL;
10991     }
10992   }
10993   g_eventLogger->warning("FSCLOSEREF: errCode: %d, performing %s"
10994                          " for file type %s, ignoring error",
10995                          ref->errorCode,
10996                          op_type_str,
10997                          file_type_str);
10998   execFSCLOSECONF(signal);
10999 }
11000 
11001 void
11002 Backup::execFSCLOSECONF(Signal* signal)
11003 {
11004   jamEntry();
11005 
11006   FsConf * conf = (FsConf*)signal->getDataPtr();
11007   const Uint32 filePtrI = conf->userPointer;
11008 
11009   BackupFilePtr filePtr;
11010   c_backupFilePool.getPtr(filePtr, filePtrI);
11011 
11012 #ifdef DEBUG_ABORT
11013   ndbout_c("***** FSCLOSECONF filePtrI = %u", filePtrI);
11014 #endif
11015 
11016   ndbrequire(filePtr.p->m_flags == (BackupFile::BF_OPEN |
11017 				    BackupFile::BF_CLOSING));
11018 
11019 
11020   const Uint32 usableBytes =
11021     filePtr.p->operation.dataBuffer.getUsableSize() << 2;
11022   const Uint32 freeLwmBytes =
11023     filePtr.p->operation.dataBuffer.getFreeLwm() << 2;
11024 
11025   const BackupFormat::FileType ft = filePtr.p->fileType;
11026 
11027   if (ft == BackupFormat::LOG_FILE ||
11028       ft == BackupFormat::UNDO_FILE)
11029   {
11030     g_eventLogger->info("Backup log buffer report : size %u bytes, "
11031                         "hwm %u bytes (%u pct)",
11032                         usableBytes,
11033                         (usableBytes - freeLwmBytes),
11034                         ((usableBytes - freeLwmBytes) * 100) /
11035                         usableBytes);
11036   }
11037 
11038   filePtr.p->m_flags &= ~(Uint32)(BackupFile::BF_OPEN |BackupFile::BF_CLOSING);
11039   filePtr.p->operation.dataBuffer.reset();
11040 
11041   BackupRecordPtr ptr;
11042   c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
11043 
11044   if (ptr.p->is_lcp())
11045   {
11046     if (ptr.p->prepareDataFilePtr[0] == filePtrI)
11047     {
11048       /* Close of prepare data file, error condition */
11049       jam();
11050       ndbrequire(ptr.p->prepareState == PREPARE_ABORTING);
11051       defineBackupRef(signal, ptr, ptr.p->errorCode);
11052       return;
11053     }
11054     else if (ptr.p->prepareCtlFilePtr[0] == filePtrI ||
11055              ptr.p->prepareCtlFilePtr[1] == filePtrI)
11056     {
11057       jam();
11058       if (ptr.p->prepareState == PREPARE_DROP_CLOSE)
11059       {
11060         jam();
11061         lcp_close_ctl_file_drop_case(signal, ptr);
11062         return;
11063       }
11064       if (ptr.p->prepareState == PREPARE_ABORTING)
11065       {
11066         jam();
11067         defineBackupRef(signal, ptr, ptr.p->errorCode);
11068         return;
11069       }
11070       ndbrequire(ptr.p->prepareState == PREPARE_READ_CTL_FILES);
11071       lcp_close_prepare_ctl_file_done(signal, ptr);
11072       return;
11073     }
11074     else if (ptr.p->ctlFilePtr == filePtrI)
11075     {
11076       jam();
11077       finalize_lcp_processing(signal, ptr);
11078       return;
11079     }
11080     else if (ptr.p->deleteFilePtr == filePtrI)
11081     {
11082       jam();
11083       lcp_close_ctl_file_for_rewrite_done(signal, ptr, filePtr);
11084       return;
11085     }
11086     else if ((ptr.p->dataFilePtr[0] == filePtrI) ||
11087              (ptr.p->dataFilePtr[1] == filePtrI) ||
11088              (ptr.p->dataFilePtr[2] == filePtrI) ||
11089              (ptr.p->dataFilePtr[3] == filePtrI) ||
11090              (ptr.p->dataFilePtr[4] == filePtrI) ||
11091              (ptr.p->dataFilePtr[5] == filePtrI) ||
11092              (ptr.p->dataFilePtr[6] == filePtrI) ||
11093              (ptr.p->dataFilePtr[7] == filePtrI))
11094     {
11095       jam();
11096       ndbrequire(filePtr.p->m_flags == 0);
11097       ndbrequire(ptr.p->m_num_lcp_data_files_open > 0);
11098       ptr.p->m_num_lcp_data_files_open--;
11099       if (ptr.p->m_num_lcp_data_files_open > 0)
11100       {
11101         jam();
11102         DEB_EXTRA_LCP(("(%u) Closed LCP data file, still waiting for %u files",
11103                        instance(),
11104                        ptr.p->m_num_lcp_data_files_open));
11105         return;
11106       }
11107       lcp_close_data_file_conf(signal, ptr);
11108       return;
11109     }
11110     else
11111     {
11112       ndbabort();
11113     }
11114   }
11115   /* Backup closing files */
11116   closeFiles(signal, ptr);
11117 }
11118 
11119 void
11120 Backup::closeFilesDone(Signal* signal, BackupRecordPtr ptr)
11121 {
11122   jam();
11123   /* Record end-of-backup */
11124   //ndbassert(Backup::g_is_single_thr_backup_running); /* !set on error paths */
11125   Backup::g_is_single_thr_backup_running = false;
11126 
11127   // error while inserting the footer or closing the file
11128   if(ptr.p->checkError())
11129   {
11130     StopBackupRef * ref = (StopBackupRef*)signal->getDataPtrSend();
11131     ref->backupPtr = ptr.i;
11132     ref->backupId = ptr.p->backupId;
11133     ref->errorCode = ptr.p->errorCode;
11134     ref->nodeId = getOwnNodeId();
11135     sendSignal(ptr.p->senderRef, GSN_STOP_BACKUP_REF, signal,
11136              StopBackupRef::SignalLength, JBB);
11137 
11138     ptr.p->m_gsn = GSN_STOP_BACKUP_REF;
11139     ptr.p->slaveState.setState(CLEANING);
11140     return;
11141   }
11142 
11143   StopBackupConf* conf = (StopBackupConf*)signal->getDataPtrSend();
11144   conf->backupId = ptr.p->backupId;
11145   conf->backupPtr = ptr.i;
11146 
11147   BackupFilePtr filePtr;
11148   if(ptr.p->logFilePtr != RNIL)
11149   {
11150     ptr.p->files.getPtr(filePtr, ptr.p->logFilePtr);
11151     conf->noOfLogBytes= Uint32(filePtr.p->operation.noOfBytes);     // TODO
11152     conf->noOfLogRecords= Uint32(filePtr.p->operation.noOfRecords); // TODO
11153   }
11154   else
11155   {
11156     conf->noOfLogBytes= 0;
11157     conf->noOfLogRecords= 0;
11158   }
11159 
11160   sendSignal(ptr.p->senderRef, GSN_STOP_BACKUP_CONF, signal,
11161 	     StopBackupConf::SignalLength, JBB);
11162 
11163   ptr.p->m_gsn = GSN_STOP_BACKUP_CONF;
11164   ptr.p->slaveState.setState(CLEANING);
11165 }
11166 
11167 /*****************************************************************************
11168  *
11169  * Slave functionality: Abort backup
11170  *
11171  *****************************************************************************/
11177 void
11178 Backup::execABORT_BACKUP_ORD(Signal* signal)
11179 {
11180   jamEntry();
11181   AbortBackupOrd* ord = (AbortBackupOrd*)signal->getDataPtr();
11182   const Uint32 backupId = ord->backupId;
11183   const AbortBackupOrd::RequestType requestType =
11184     (AbortBackupOrd::RequestType)ord->requestType;
11185   const Uint32 senderData = ord->senderData;
11186 
11187 #ifdef DEBUG_ABORT
11188   ndbout_c("******** ABORT_BACKUP_ORD ********* nodeId = %u",
11189 	   refToNode(signal->getSendersBlockRef()));
11190   ndbout_c("backupId = %u, requestType = %u, senderData = %u, ",
11191 	   backupId, requestType, senderData);
11192   dumpUsedResources();
11193 #endif
11194 
11195   BackupRecordPtr ptr;
11196   if(requestType == AbortBackupOrd::ClientAbort) {
11197     jam();
11198     if ((!get_backup_record(ptr)) ||
11199          ptr.p->backupId != backupId ||
11200          ptr.p->clientData != senderData)
11201     {
11202       jam();
11203       return;
11204     }//if
11205     if (ptr.p->masterRef != reference())
11206     {
11207       jam();
11208       // forward to master
11209 #ifdef DEBUG_ABORT
11210       ndbout_c("---- Forward to master nodeId = %u", getMasterNodeId());
11211 #endif
11212       sendSignal(ptr.p->masterRef, GSN_ABORT_BACKUP_ORD,
11213 		 signal, AbortBackupOrd::SignalLength, JBB);
11214       return;
11215     }
11216   } else {
11217     if (c_backupPool.findId(senderData)) {
11218       jam();
11219       c_backupPool.getPtr(ptr, senderData);
11220     } else {
11221       jam();
11222 #ifdef DEBUG_ABORT
11223       ndbout_c("Backup: abort request type=%u on id=%u,%u not found",
11224 	       requestType, backupId, senderData);
11225 #endif
11226       return;
11227     }
11228   }//if
11229 
11230   ptr.p->m_gsn = GSN_ABORT_BACKUP_ORD;
11231   const bool isCoordinator = (ptr.p->masterRef == reference());
11232 
11233   bool ok = false;
11234   switch(requestType){
11235 
11236     /**
11237      * Requests sent to master
11238      */
11239   case AbortBackupOrd::ClientAbort:
11240     jam();
11241     // fall through
11242   case AbortBackupOrd::LogBufferFull:
11243     jam();
11244     // fall through
11245   case AbortBackupOrd::FileOrScanError:
11246     jam();
11247     ndbrequire(isCoordinator);
11248     ptr.p->setErrorCode(requestType);
11249     if(ptr.p->masterData.gsn == GSN_BACKUP_FRAGMENT_REQ)
11250     {
11251       /**
11252        * Only scans are actively aborted
11253        */
11254       abort_scan(signal, ptr);
11255     }
11256     return;
11257 
11258     /**
11259      * Requests sent to slave
11260      */
11261   case AbortBackupOrd::AbortScan:
11262     jam();
11263     ptr.p->setErrorCode(requestType);
11264     return;
11265 
11266   case AbortBackupOrd::BackupComplete:
11267     jam();
11268     cleanup(signal, ptr);
11269     return;
11270   case AbortBackupOrd::BackupFailure:
11271   case AbortBackupOrd::BackupFailureDueToNodeFail:
11272   case AbortBackupOrd::OkToClean:
11273   case AbortBackupOrd::IncompatibleVersions:
11274 #ifndef VM_TRACE
11275   default:
11276 #endif
11277     ptr.p->setErrorCode(requestType);
11278     ptr.p->masterData.errorCode = requestType;
11279     ok= true;
11280   }
11281   ndbrequire(ok);
11282 
11283   ptr.p->masterRef = reference();
11284   ptr.p->nodes.clear();
11285   ptr.p->nodes.set(getOwnNodeId());
11286 
11287   // Backup aborts on node failure are handled as follows for st-backup:
11288   // - each node declares itself master
11289   // - each node modifies the 'nodes' bitmask of signal receivers
11290   //   to disable sending to any node except self
11291   // For mt-backup:
11292   // - each instance declares itself master
11293   // - each instance modifies the 'nodes' bitmask of signal receivers
11294   //   to disable sending to any node except self
11295   // - each instance modifies the 'fragWorkers' bitmask of signal receivers
11296   //   to disable sending to any LDM on this node except self
11297   ptr.p->fragWorkers[getOwnNodeId()].clear();
11298   ptr.p->fragWorkers[getOwnNodeId()].set(instance());
11299   ptr.p->masterRef = reference();
11300   ptr.p->senderRef = reference();
11301   ptr.p->stopGCP= ptr.p->startGCP + 1;
11302   sendStopBackup(signal, ptr);
11303 }
11304 
11305 
11306 void
11307 Backup::dumpUsedResources()
11308 {
11309   jam();
11310   BackupRecordPtr ptr;
11311 
11312   if (get_backup_record(ptr))
11313   {
11314     ndbout_c("Backup id=%u, slaveState.getState = %u, errorCode=%u",
11315 	     ptr.p->backupId,
11316 	     ptr.p->slaveState.getState(),
11317 	     ptr.p->errorCode);
11318 
11319     TablePtr tabPtr;
11320     for(ptr.p->tables.first(tabPtr);
11321 	tabPtr.i != RNIL;
11322 	ptr.p->tables.next(tabPtr)) {
11323       jam();
11324       for(Uint32 j = 0; j<3; j++) {
11325 	jam();
11326 	TriggerPtr trigPtr;
11327 	if(tabPtr.p->triggerAllocated[j]) {
11328 	  jam();
11329 	  c_triggerPool.getPtr(trigPtr, tabPtr.p->triggerIds[j]);
11330 	  ndbout_c("Allocated[%u] Triggerid = %u, event = %u",
11331 		 j,
11332 		 tabPtr.p->triggerIds[j],
11333 		 trigPtr.p->event);
11334 	}//if
11335       }//for
11336     }//for
11337 
11338     BackupFilePtr filePtr;
11339     for(ptr.p->files.first(filePtr);
11340 	filePtr.i != RNIL;
11341 	ptr.p->files.next(filePtr)) {
11342       jam();
11343       ndbout_c("filePtr.i = %u, flags: H'%x ",
11344 	       filePtr.i, filePtr.p->m_flags);
11345     }//for
11346   }
11347 }
11348 
11349 void
11350 Backup::cleanup(Signal* signal, BackupRecordPtr ptr)
11351 {
11352   TablePtr tabPtr;
11353   ptr.p->tables.first(tabPtr);
11354   cleanupNextTable(signal, ptr, tabPtr);
11355 }
11356 
11357 void
11358 Backup::release_tables(BackupRecordPtr ptr)
11359 {
11360   TablePtr tabPtr;
11361   /* Clear backupPtr before releasing */
11362   for (ptr.p->tables.first(tabPtr);
11363        tabPtr.i != RNIL;
11364        ptr.p->tables.next(tabPtr))
11365   {
11366     jam();
11367     tabPtr.p->fragments.release();
11368     jamLine(tabPtr.p->tableId);
11369     removeTableMap(tabPtr, ptr.i, tabPtr.p->tableId);
11370   }
11371   while (ptr.p->tables.releaseFirst());
11372 }
11373 
11374 void
11375 Backup::cleanupNextTable(Signal *signal, BackupRecordPtr ptr, TablePtr tabPtr)
11376 {
11377   if (tabPtr.i != RNIL)
11378   {
11379     jam();
11380     tabPtr.p->fragments.release();
11381     for(Uint32 j = 0; j<3; j++) {
11382       jam();
11383       TriggerPtr trigPtr;
11384       if(tabPtr.p->triggerAllocated[j]) {
11385         jam();
11386 	c_triggerPool.getPtr(trigPtr, tabPtr.p->triggerIds[j]);
11387 	trigPtr.p->event = ILLEGAL_TRIGGER_ID;
11388         tabPtr.p->triggerAllocated[j] = false;
11389       }//if
11390       tabPtr.p->triggerIds[j] = ILLEGAL_TRIGGER_ID;
11391     }//for
11392     {
11393       BackupLockTab *req = (BackupLockTab *)signal->getDataPtrSend();
11394       req->m_senderRef = reference();
11395       req->m_tableId = tabPtr.p->tableId;
11396       req->m_lock_unlock = BackupLockTab::UNLOCK_TABLE;
11397       req->m_backup_state = BackupLockTab::CLEANUP;
11398       req->m_backupRecordPtr_I = ptr.i;
11399       req->m_tablePtr_I = tabPtr.i;
11400       sendSignal(DBDICT_REF, GSN_BACKUP_LOCK_TAB_REQ, signal,
11401                  BackupLockTab::SignalLength, JBB);
11402       return;
11403     }
11404   }
11405 
11406   BackupFilePtr filePtr;
11407   for(ptr.p->files.first(filePtr);filePtr.i != RNIL;ptr.p->files.next(filePtr))
11408   {
11409     jam();
11410     ndbrequire(filePtr.p->m_flags == 0);
11411     filePtr.p->pages.release();
11412   }//for
11413 
11414   while (ptr.p->files.releaseFirst());
11415   release_tables(ptr);
11416   while (ptr.p->triggers.releaseFirst());
11417   ptr.p->backupId = ~0;
11418 
11419   /*
11420     Backup status reporting uses these variables to keep track
11421     of whether files are in use
11422   */
11423   ptr.p->ctlFilePtr = ptr.p->logFilePtr = ptr.p->dataFilePtr[0] = RNIL;
11424 
11425   if(ptr.p->checkError())
11426     removeBackup(signal, ptr);
11427   else
11428   {
11429     /*
11430       Backup status reporting uses these variables to keep track
11431       of whether a backup is running and of its current state
11432     */
11433     ptr.p->m_gsn = 0;
11434     ptr.p->masterData.gsn = 0;
11435     c_backups.release(ptr);
11436   }
11437 }
11438 
11439 
11440 void
11441 Backup::removeBackup(Signal* signal, BackupRecordPtr ptr)
11442 {
11443   jam();
11444 
11445   FsRemoveReq * req = (FsRemoveReq *)signal->getDataPtrSend();
11446   req->userReference = reference();
11447   req->userPointer = ptr.i;
11448   req->directory = 1;
11449   req->ownDirectory = 1;
11450   FsOpenReq::setVersion(req->fileNumber, 2);
11451   FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_CTL);
11452   FsOpenReq::v2_setSequence(req->fileNumber, ptr.p->backupId);
11453   FsOpenReq::v2_setNodeId(req->fileNumber, getOwnNodeId());
11454   sendSignal(NDBFS_REF, GSN_FSREMOVEREQ, signal,
11455 	     FsRemoveReq::SignalLength, JBA);
11456 }
11457 
11458 void
11459 Backup::execFSREMOVEREF(Signal* signal)
11460 {
11461   jamEntry();
11462   FsRef * ref = (FsRef*)signal->getDataPtr();
11463   const Uint32 ptrI = ref->userPointer;
11464 
11465   FsConf * conf = (FsConf*)signal->getDataPtr();
11466   conf->userPointer = ptrI;
11467   execFSREMOVECONF(signal);
11468 }
11469 
11470 void
11471 Backup::execFSREMOVECONF(Signal* signal)
11472 {
11473   jamEntry();
11474 
11475   FsConf * conf = (FsConf*)signal->getDataPtr();
11476   const Uint32 ptrI = conf->userPointer;
11477 
11478   /**
11479    * Get backup record
11480    */
11481   BackupRecordPtr ptr;
11482   c_backupPool.getPtr(ptr, ptrI);
11483 
11484   if (ptr.p->is_lcp())
11485   {
11486     jam();
11487     lcp_remove_file_conf(signal, ptr);
11488     return;
11489   }
11490   /*
11491     Backup status reporting uses these variables to keep track
11492     of whether a backup is running and of its current state
11493   */
11494   ptr.p->m_gsn = 0;
11495   ptr.p->masterData.gsn = 0;
11496   c_backups.release(ptr);
11497 }
11498 
11499 /**
11500  * Description of how LCP works and its principles
11501  * -----------------------------------------------
11502  *
11503  * Introduction of Partial LCP
11504  * ...........................
11505  * In MySQL Cluster 7.6 partial LCPs were introduced. This means that each
11506  * LCP doesn't record every single row in the system. It only records a subset
11507  * of all rows + all changed rows since the last partial LCP.
11508  *
11509  * This allows partial LCPs to complete more quickly than a full LCP, and
11510  * the REDO log to be trimmed more frequently.
11511  *
11512  * We keep track of changed rows by using the GCI stored on each row. We
11513  * know which GCI was fully part of the previous LCP. Thus if the
11514  * previous LCP contained all changes up to and including GCI = 77, this
11515  * means that the new LCP will only need to record changes from GCI = 78
11516  * and onwards.
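 *
 * As a sketch (the names here are illustrative, not the actual TUP code),
 * the per-row decision for a CHANGE ROWS part is essentially:
 *
 *   // prev_lcp_max_gci: highest GCI fully covered by the previous LCP
 *   bool include_in_change_part(Uint32 row_gci, Uint32 prev_lcp_max_gci)
 *   {
 *     return row_gci > prev_lcp_max_gci; // prev = 77 => record GCI >= 78
 *   }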
11517  *
11518  * There is some complication that comes from deletions here.
11519  * The restore of the system uses a number of partial LCPs back in time.
11520  * For a specific rowid this means that there is a first partial LCP file
11521  * where it is recorded. It can either be restored with an inserted value as
11522  * part of this LCP, or else the rowid will be empty after executing
11523  * this first partial LCP; further partial LCPs might add it.
11524  *
11525  * In the following LCPs this rowid will only be part of the LCP if it has
11526  * changed since the last LCP. This is absolutely no problem if the row
11527  * has been inserted or updated since then the row exists and its value will
11528  * be recorded in the LCP as a changed row.
11529  *
11530  * At start of a partial LCP we decide the number of parts to checkpoint
11531  * fully, currently we have divided the page id range into 2048 different
11532  * parts. We can checkpoint anything between 1 and 2048 parts in one
11533  * partial LCP, driven by the fragment's data size and change percentage.
11534  *
11535  * Definition: The set of parts where we record all rows is called ALL ROWS.
11536  * The set of parts where we record only the changed rows is called
11537  * CHANGE ROWS.
11538  *
11539  * The ALL ROWS parts are the same as used in earlier versions of MySQL
11540  * Cluster, and are a 'state dump' containing INSERT BY ROWID operations.
11541  * Each row existing at start of LCP will be recorded in pages belonging
11542  * to this part.
11543  *
11544  * The CHANGE ROWS parts are a kind of operation log with WRITE BY ROWID
11545  * and DELETE BY ROWID and DELETE BY PAGEID (DELETE by ROWID for all rows in a
11546  * page) operations which must be applied.
11547  *
11548  * For partial LCP we divide up the range of pages into 2048 parts using a hash
11549  * function on page id. For a specific LCP we will have one set of parts that
11550  * are checkpointed in the ALL ROWS part and the rest is checkpointed in the
11551  * CHANGE ROWS part.
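 *
 * A minimal sketch of the part mapping (hypothetical names, showing only
 * the principle of hashing page ids into 2048 parts):
 *
 *   const Uint32 NUM_PARTS = 2048;
 *   Uint32 part_of_page(Uint32 page_id)
 *   {
 *     return hash_page_id(page_id) % NUM_PARTS;
 *   }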
11552  *
11553  * To restore we need to perform the following for each of the 2048 parts.
11554  * 1) Find the last LCP where this part belonged to the ALL ROWS part.
11555  * 2) Restore this part from this LCP.
11556  * 3) For each of the LCP after that up to the LCP we are restoring we will
11557  *    restore the CHANGE ROWS part for this part.
11558  *
11559  * This means that at restore we never need to go further back than the
11560  * oldest remaining restorable ALL ROWS part. This understanding is
11561  * important for knowing when LCP files can be deleted.
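 *
 * A minimal sketch of the restore order described in 1) - 3) above
 * (hypothetical helper names, not the actual restore code):
 *
 *   for (Uint32 part = 0; part < NUM_PARTS; part++)
 *   {
 *     Uint32 lcp = last_lcp_with_part_in_all_rows(part);
 *     restore_all_rows(lcp, part);
 *     for (Uint32 i = lcp + 1; i <= lcp_to_restore; i++)
 *       apply_change_rows(i, part); // WRITE/DELETE by ROWID, DELETE by PAGEID
 *   }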
11562  *
11563  * More definitions
11564  * ----------------
11565  * Rowid: Each row has a rowid (page id and page index) which is a local key
11566  * to the fixed size part of the row. The fixed part of the row has references
11567  * to the variable sized part and it also has a reference to the disk part of
11568  * the row.
11569  *
11570  * Page Map: The page map takes a rowid as input and gives back the page
11571  * pointer to this page. The page map also knows if the page id is empty
11572  * and it is also used to keep some page state after a page has been deleted
11573  * as discussed further below.
11574  *
11575  * Disk reference: This is the reference in the main memory part of the row
11576  * that refers to the disk part of the row. Currently this reference is
11577  * located in the fixed size part of the row and the disk data part is a
11578  * fixed size part.
11579  *
11580  * Row content: This is the actual values of the attributes in this row.
11581  *
11582  * Row structure:
11583  * -------------------------------------------
11584  * | Fixed size part in main memory          |
11585  * | Tuple header + Fixed size attributes +  |
11586  * | disk reference + variable size reference|
11587  * | + NULL bits                             |
11588  * ------------------------------------------
11589  *
11590  * ------------------------------------------
11591  * | Var part in main memory                |
11592  * | Header part + variable sized attributes|
11593  * | + dynamic attributes                   |
11594  * ------------------------------------------
11595  *
11596  * ------------------------------------------
11597  * | Fixed size part on disk page           |
11598  * | Header part + Fix size disk attributes |
11599  * ------------------------------------------
11600  *
11601  * The fixed main memory part header also contains a GCI and a checksum.
11602  * The disk part also contains a GCI and a reference to the main memory part.
11603  *
11604  * Purpose of LCP
11605  * ..............
11606  * The purpose of the LCP (Local checkpoint) is to ensure that we can cut the
11607  * REDO log tail, which would otherwise grow without bound. We do this by
11608  * performing a regular LCP of each fragment.
11609  *
11610  * NDB contains both main memory data and disk data. The disk data part is
11611  * recovered by using a No Steal approach. This means that only committed
11612  * data is ever sent to the pages written to disk. To support this we use an
11613  * UNDO log to ensure that the disk data is possible to restore to the
11614  * exact state it had at the starting point of the LCP.
11615  *
11616  * The main memory part of the row content is stored in the LCP files
11617  * generated by the LCP. The disk part is stored in its position in the
11618  * disk pages by flushing the pages in memory to disk for the disk parts.
11619  *
11620  * Observation 1:
11621  * Only committed rows are written into any LCP for both main memory data and
11622  * disk data. Thus after restoring an LCP we only need to roll forward using
11623  * a REDO log.
11624  *
11625  * Observation 2:
11626  * Given that the LCP maintains the exact row structure at the start of the
11627  * LCP the REDO log can be a logical log (only logging actions (INSERT, DELETE,
11628  * UPDATE) and the values changed).
11629  *
11630  * The REDO log is mainly operating with primary keys, but to ensure that
11631  * we synchronize the rowids on different nodes all INSERTs must also log
11632  * the rowid they are inserted into.
11633  *
11634  * Observation 3:
11635  * Given that the REDO log is a logical log it is location and replication
11636  * independent. This means that we can restore the LCP stored locally
11637  * and then apply a mix of the local REDO log and REDO logs from other
11638  * nodes in the same node group. Using remote REDO logs is a principle we
11639  * have decided to abandon and instead fully rely on the ability to
11640  * synchronize data nodes at node restarts.
11641  *
11642  * An LCP is performed per fragment. A table consists of multiple fragments
11643  * that can be checkpointed in parallel in different LDMs.
11644  *
11645  * Only one LCP per fragment per LDM instance is currently executed. However
11646  * we allow for the prepare phase of the next LCP (opening files and preparing
11647  * the LCP execution) to proceed in parallel to the currently running
11648  * LCP. In addition the deletion of old LCP files is a background process
11649  * running concurrently with both of these processes.
11650  *
11651  *     Need of LCP_SKIP bit for inserts
11652  *     ................................
11653  * Performing a checkpoint for disk pages means simply writing any pages that
11654  * got dirty since the last checkpoint. It is a bit more involved to perform
11655  * checkpoints (LCPs) for main memory data. For main memory data we only
11656  * checkpoint the rows and not pages. This gives us the opportunity to write
11657  * less data in the main memory checkpoints since we don't have to save the
11658  * entire page where the changes were done.
11659  *
11660  * The idea for LCPs is that we need an LCP to contain exactly the rows present
11661  * at the start of the LCP. This means that we set an LCP_SKIP bit on rows
11662  * that are inserted during LCPs to avoid putting those rows into the LCP when
11663  * we pass by them in the LCP scan.
11664  *
11665  * The requirement to have exactly the correct set of rows that existed at
11666  * start of LCP comes from the fact that we need the reference from main
11667  * memory rows to disk rows to be correct. The content of the main memory
11668  * row and disk data row need not be exactly synchronized, but if the row exists
11669  * in main memory the referred disk row must exist in disk pages and
11670  * vice versa.
11671  *
11672  * Tables that don't have disk data don't need this requirement, but we
11673  * treat them the same way.
11674  *
11675  * The row content in both the disk data and the main memory data can be
11676  * newer than the data at the start of the LCP, but not older.
11677  *
11678  * The reason is that the REDO log or other synchronisation efforts will
11679  * ensure that all updates from before the LCP and until the restoration
11680  * point is reapplied, so we will eventually have the correct data in
11681  * each row at the restoration point.
11682  *
11683  *     Need of LCP keep list for deletes
11684  *     .................................
11685  * Similarly we use an LCP keep list to record deleted rows such that we
11686  * record them in the LCP. We use this list to give those recordings a
11687  * higher priority since we will release the rowid immediately after
11688  * committing the row.
11689  *
11690  * These two principles ensure that the LCP will contain exactly the same
11691  * set of rows as we had at the start of the LCP. The row data might
11692  * differ from what it looked at the start of the LCP. This is however
11693  * of no significance since the REDO log will ensure that we will
11694  * after recovery have the correct state of the data.
11695  *
11696  * As an example a row with a certain rowid can be deleted before LCP scans
11697  * it and then the row will be sent to the LCP keep list. Later a new row
11698  * will be inserted while the LCP scan still hasn't arrived at this rowid
11699  * and then the INSERT will set the LCP_SKIP to ensure that the new row
11700  * is ignored in this rowid.
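 *
 * In sketch form (hypothetical names) the LCP scan's handling of these two
 * mechanisms is:
 *
 *   if (row.lcp_skip_bit)
 *   {
 *     row.lcp_skip_bit = 0; // inserted after LCP start, not part of LCP
 *     continue;
 *   }
 *   record_row_in_lcp(row);
 *
 * and at commit of a delete while the LCP scan is ongoing:
 *
 *   if (!lcp_scan_has_passed(rowid))
 *     copy_row_to_lcp_keep_list(row); // row existed at LCP start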
11701  *
11702  * This leads to the following observations.
11703  *
11704  * Observation 1:
11705  * ..............
11706  * In an LCP there will only be one row existing for a specific rowid. There
11707  * will never be two rows with the same rowid in an LCP.
11708  *
11709  * Proof:
11710  * ------
11711  * If two rows existed there must have been a delete followed by an insert
11712  * in the LCP scan. The delete will ensure that the first row with this
11713  * rowid will exist in LCP and the LCP_SKIP bit will ensure that the
11714  * second row with this rowid will not exist in the LCP.
11715  *
11716  * Observation 2:
11717  * ..............
11718  * It isn't allowed for any updates to change the disk reference. The disk
11719  * reference must be stable over a set of LCPs.
11720  *
11721  * Proof:
11722  * ------
11723  * If an update did change the disk reference the restored main memory row
11724  * would refer to the wrong disk data part which would not work.
11725  *
11726  * The above is the essential requirement on any LCP that is used in a
11727  * restore of NDB tables. We formulate it here as a theorem.
11728  *
11729  * Theorem 1:
11730  * ..........
11731  * An LCP used in the recovery of NDB must meet the following requirements.
11732  * 1) All committed rows that are present at start of LCP (defined as
11733  *    the time when we write the marker in the UNDO log of disk data) must
11734  *    all be part of LCP and no other rows may be present in the LCP.
11735  * 2) All links from main memory to disk data and vice versa must be
11736  *    consistent in a checkpoint.
11737  * 3) The row data must be the same as at the time of the start of the LCP
11738  *    OR at a time after the start of the LCP.
11739  *
11740  * Proof:
11741  * ------
11742  * A proof for this is presented in the Ph.D thesis of Mikael Ronström,
11743  * Design and Modelling of a Parallel Data Server for Telecom Applications,
11744  * 1998 in chapter 9.2.1. The bearing principle is that the logical REDO
11745  * log will replay all transactions from a point which is certain to be
11746  * before the start of the LCP, thus all updates, inserts and deletes
11747  * happening after the start of the LCP is certain to be part of the
11748  * REDO log execution.
11749  *
11750  * A paper at VLDB 2005 also presents some of the proof behind this in
11751  * the paper called "Recovery principles in MySQL Cluster 5.1". This paper
11752  * also takes into account the use of disk data parts.
11753  *
11754  * While applying the REDO log the following events can happen to a row that
11755  * existed in LCP. Note that the start of LCP is not known when executing
11756  * the REDO log, so this is a theoretical proof of the validity of the
11757  * algorithm, not how it works.
11758  *
11759  * 1) Delete of row before start of LCP, no problems to execute. There are
11760  *    two variants, the row is not inserted again, in this case the row
11761  *    won't be in the LCP and no REDO log record will reinsert it. In case
11762  *    the row is later reinserted the REDO log record will be executed as
11763  *    part of recovery and the row is thus certain to be part of the
11764  *    restorable state.
11765  *
11766  *    This operation can discover that the row doesn't exist, but this is
11767  *    ok and can only occur before start of LCP.
11768  *
11769  * 2) Delete of row after start of LCP, this is ok since the row will exist
11770  *    before the delete as it existed at start of LCP.
11771  *
11772  * 3) Update before start of LCP. This is ok, it will restore a value to
11773  *    the record that might not be the end state, but if not so there
11774  *    will be more updates recorded in the REDO log. The important principle
11775  *    here is that the REDO log application must be idempotent. Since the
11776  *    REDO log simply restores the values of the attributes it is
11777  *    idempotent. It is possible to construct a REDO log that contains
11778  *    operations also (like add one to column a). This would not work in
11779  *    this algorithm since we don't have exact control how exactly we
11780  *    restore a row state. Our algorithm requires an idempotent REDO log.
11781  *
11782  *    This update might discover that the row doesn't exist, this can only
11783  *    occur before start of LCP so it is safe to ignore the REDO log record.
11784  *
11785  * 4) Update after start of LCP. The value this REDO log entry restores
11786  *    could already be in the LCP since we don't care if the LCP records a
11787  *    newer record than at the start of the LCP.
11788  *
11789  * 5) Insert before start of LCP. The REDO log execution will perform this if
11790  *    the row doesn't exist. If it existed already we are certain that this
11791  *    insert is before start of LCP and it can be safely ignored.
11792  *
11793  * 6) Insert after start of LCP, the row won't be in LCP, so will always work
11794  *    fine.
11795  *
11796  * So what we see here is that the REDO log can sometimes bring us backwards
11797  * in the row history, but it will eventually bring us forward in row history
11798  * to the desired state at a particular GCP (global checkpoint).
11799  *
11800  *     Handling deletes for partial LCPs
11801  *     .................................
11802  * The problematic part is deletes of a row. This could result in 4 different
11803  * scenarios.
11804  *
11805  *     Special handling with reuse of rowids for partial LCPs
11806  *     ......................................................
11807  * 1) A first partial LCP has inserted row A into rowid X, after the LCP the
11808  *    row is deleted and then the delete is followed by a new insert of row B
11809  *    into rowid X. In this case the LCP will attempt to restore a row where
11810  *    a row already exists in this rowid. Here we need to remove the old row
11811  *    first before inserting the new row to ensure that the primary key hash
11812  *    index is correct.
11813  *
11814  *    To handle this case properly we always need to drop the row in the
11815  *    row id position if the primary key has changed from the previous
11816  *    LCP to this LCP. One manner is to always drop it first and then
11817  *    reinsert it even if it is the same row.
11818  *
11819  *     Special case of handling deleted rowids with GCI > 0
11820  *     ....................................................
11821  * 2) A first partial LCP has inserted row A into rowid X, after that the
11822  *    row is deleted. At the next LCP this will be recorded as a DELETE
11823  *    by ROWID. So when applying this partial LCP the rowid X will be
11824  *    set to an empty rowid and the record A will be deleted as part of
11825  *    executing that partial LCP. So after executing that partial LCP the
11826  *    row will not exist.
11827  *
11828  *     Special case of empty rowids (GCI = 0) for newly allocated pages
11829  *     ...............................................................
11830  * 3) The first partial LCP records the rows within page Y, after the LCP
11831  *    but before the new LCP the page is dropped, after the drop it is
11832  *    allocated again. When the LCP starts the page has at least 1 row in
11833  *    it which has been reinserted.
11834  *
11835  *    The remainder of the rows in the page can have GCI = 0, these rows
11836  *    need to have a DELETE by ROWID in the LCP. This DELETE by ROWID might
11837  *    encounter a row that actually didn't exist, so DELETE by ROWID at
11838  *    restore must be able to handle that the row didn't exist when we
11839  *    try to delete it.
11840  *
11841  *    Special case of empty page slot at start of LCP
11842  *    ...............................................
11843  * 4) At start of the LCP the page slot is free, in this case we record
11844  *    the entire page as deleted, we call this DELETE by PAGEID. In this
11845  *    case restore will delete all rows in this position. This only needs
11846  *    to happen if the page exists when restoring, if the page slot is
11847  *    empty when this is reached, then we can ignore the DELETE by PAGEID
11848  *    since it is already handled.
11849  *
11850  *    We only record DELETE by PAGEID for pages that are part of CHANGE
11851  *    ROWS.
11852  *
11853  *    We record this information by setting a flag on the page that says
11854  *    LCP_EMPTY_PAGE_FLAG. This says that the page is now allocated, but
11855  *    at start of the LCP scan it was empty, so when we reach this
11856  *    page we will see this state and record a DELETE by PAGEID.
11857  *    Similarly if we come by an empty page slot that hasn't got the
11858  *    LCP_SCANNED bit set in the page map as described in 5) we will
11859  *    also record this as DELETE by PAGEID.
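 *
 *    In sketch form (hypothetical names), the allocation-time side of this
 *    is essentially:
 *
 *      if (lcp_scan_ongoing(frag) && page_belongs_to_change_part(page_id))
 *        page->set_flag(LCP_EMPTY_PAGE_FLAG); // slot was empty at LCP start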
11860  *
11861  *    Problematic case of Drop page during LCP scan
11862  *    .............................................
11863  * 5) In this case the page exists at start of LCP. For ALL ROWS this is not
11864  *    a problem, the rows that were deleted since the start of the LCP are put
11865  *    into the LCP through the LCP keep list. However for CHANGE ROWS we need to
11866  *    record DELETE by ROWID for each row that has GCI = 0 or GCI > scanGCI
11867  *    for LCP. We cannot drop the page without recording this information
11868  *    since there is no way to recreate this information.
11869  *
11870  *    To solve this issue we use the LCP keep list to enter the information
11871  *    about rowids that we need to issue DELETE by ROWID for. This means that
11872  *    we are able to drop the page immediately and store its state information
11873  *    needed for LCP elsewhere.
11874  *
11875  *    When dropping the page we will immediately scan the page and each
11876  *    rowid that has GCI = 0 or GCI >= lcpScanGCI will be recorded into the
11877  *    LCP keep list. However for efficiency reasons we will record multiple
11878  *    rowids in each row in the LCP keep list. So each record in the
11879  *    LCP keep list will either contain a full row as usual OR it will
11880  *    contain an indicator of containing dropped rowids, the number of
11881  *    dropped rowids in this row and the rowids in an array (each rowid
11882  *    consumes 2 words).
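 *
 *    A keep list record is thus one of two forms (a sketch of the layout,
 *    not the actual record format):
 *
 *      [full row copy]                          -- deleted row, as usual
 *      [dropped-rowid marker][count][rowid 0 (2 words)]...[rowid N (2 words)]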
11883  *
11884  *    However there is one more problem related to this. Once the page has
11885  *    been dropped before LCP scan has reached it, it can be reinserted
11886  *    again. Now if this page as mentioned above belongs to the CHANGE ROWS
11887  *    category then as explained in 4) we want to record it as a
11888  *    DELETE by PAGEID. However in this case this is not correct, the page
11889  *    has already been scanned by the LCP.
11890  *
11891  *    We can avoid problems with future updates on the page by setting the
11892  *    LCP_SKIP bit on the page when it is reinserted, but we also need some
11893  *    information to avoid inserting the DELETE by PAGEID into the LCP.
11894  *
11895  *    The place where we retain information about dropped pages is the page
11896  *    map. We have 2 32-bit words in memory for each page in the current
11897  *    set of pages. These 2 words are handled by the DynArr256 data structure.
11898  *    We need to temporarily use this place to store information about pages
11899  *    dropped during LCP scan in the CHANGE ROW part.
11900  *
11901  *    To describe how this happens requires a description of the Page Map and
11902  *    its workings and how we make use of it in this case.
11903  *
11904  *    Description of Fragment Page Map
11905  *    ................................
11906  *
11907  *    ------------------
11908  *    | Page Map Head  |
11909  *    ------------------
11910  *    The page map head is a normal head of a doubly linked list that contains
11911  *    the logical page id of the first free logical page id slot.
11912  *
11913  *    The entries in the page map is different dependent on if the slot is
11914  *    free or not. First we'll show the non-empty variant. (Actually the
11915  *    second slot can be uninitialised, in which case the DynArr256 returns
11916  *    RNIL; RNIL cannot be set in any manner since we cannot use page ids
11917  *    higher than or equal to RNIL & 0x3fffffff.)
11918  *    ------------------------------------------
11919  *    | Physical page id  | Bit 31 set any rest|
11920  *    ------------------------------------------
11921  *    Now the empty variant
11922  *
11923  *     Next reference              Previous reference
11924  *    -----------------------------------------------------------
11925  *    | Bit 31 set, logicalPageId | Bit 31 set logicalPageId    |
11926  *    -----------------------------------------------------------
11927  *    So the first position uses bit 31 to indicate that the logical
11928  *    page id position is empty; the other bits in this position are used
11929  *    to point to the next free logical page id. If the 30 lowest bits
11930  *    are all set in the logical page id it is FREE_PAGE_RNIL. FREE_PAGE_RNIL
11931  *    means that there is no next logical page id.
11932  *
11933  *    The previous position also contains a reference to a logical page id,
11934  *    in this case the previous free logical page id. If there is no free
11935  *    previous logical page id then this is set to FREE_PAGE_RNIL as
11936  *    well. Bit 31 is set in both words when the entry is free.
11937  *
11938  *    The reason that Bit 31 is set in both words is to ensure that when
11939  *    we scan the fragment page map at drop fragment to release pages
11940  *    that we don't release any pages from the second position. The
11941  *    iterator delivers each word back and we don't keep track of which
11942  *    position is which, so we need bit 31 to be set at all times for
11943  *    the second position.
11944  *
11945  *    The page map is only growing, the only manner to get rid of it is to
11946  *    either drop the table or restart the node. At restart the page map
11947  *    starts from scratch again.
11948  *
11949  *    The conclusion is that the page map is a place where we can store
11950  *    the special information about that a logical page id has been dropped
11951  *    as part of the CHANGE ROWS category and it needs no more LCP scanning
11952  *    even if reinserted. So by setting a bit here we can use this information
11953  *    to avoid inserting a DELETE by PAGEID into the LCP and we can set some
11954  *    some proper information on the page to ensure that we skip this page
11955  *    later in the LCP scan (obviously also need the LCP scan to reset this
11956  *    bit then).
11957  *
11958  *    We also use bit 30 in the second word to indicate what the page state
11959  *    was at the start of the previous LCP. This enables us to decide what
11960  *    to do in those situations when we find that the page or row is not
11961  *    used at start of this LCP.
 *
 *    Solution:
 *    ---------
 *    We will use bit 30 in the first word of the page map to indicate this
 *    special page state. This has the effect that we can at most have
 *    2^30 pages in one page map. This limits the size of the main memory
 *    fixed part to 32 TBytes. If this becomes a problem then we need to
 *    use 64-bit page ids as well; the page map will then contain
 *    2 64-bit words instead and thus the problem will be resolved.
 *    We call this bit the LCP_SCANNED_BIT. Bit 31 in the first word is
 *    already used to store the FREE_PAGE_BIT which indicates if the page
 *    entry is free or in use; if FREE_PAGE_BIT is set the two words are used
 *    as next and prev of a linked list of free page ids for the fragment.
 *
 *    Obviously we need to ensure that during all page map operations
 *    we take care in handling this special page state.
 *
 *    Note: For the pages in the ALL ROWS category where we record all
 *    rows we write all the rowids existing at start of LCP; this means that
 *    a rowid in these parts that isn't recorded is an empty rowid by
 *    definition. For parts where we only record changes we have to ensure
 *    that we get the same set of rows after executing all changes, so we
 *    need to record all changes: new rowids, deleted rowids and updates of
 *    row content.
 *
 *    We will also use the 1 free bit in the second word in the page map.
 *    This bit will be used to store the LCP state at the previous LCP.
 *    When we reach a page in the LCP scan we will set the state of the last
 *    LCP based on the current state and of other flags as described below.
 *
 *    The state that no page map entry exists is also a valid state; this
 *    state indicates that the previous LCP state was that the page was
 *    released, that the current state is the empty state as well and that
 *    the state of the LCP_SCANNED_BIT is 0.
 *
 *    So we have three bits in the page map:
 *    LCP_SCANNED_BIT: Indicates that we have taken care of everything
 *    related to LCP scans for this page in this LCP.
 *    FREE_PAGE_BIT: Indicates that the current state of the page is free.
 *    LAST_LCP_FREE_BIT: Set to 1 indicates that the last LCP state is D
 *    and set to 0 indicates that the last LCP state is A. This is bit 30
 *    of the second word in the page map.
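 *
 *    A minimal sketch of these flags (assuming the bit positions described
 *    above; the helper names are illustrative, not the real DBTUP API):
 *
 *      static const Uint32 FREE_PAGE_BIT     = 0x80000000; // bit 31, word 1
 *      static const Uint32 LCP_SCANNED_BIT   = 0x40000000; // bit 30, word 1
 *      static const Uint32 LAST_LCP_FREE_BIT = 0x40000000; // bit 30, word 2
 *
 *      static bool is_free(Uint32 word1)
 *      { return (word1 & FREE_PAGE_BIT) != 0; }
 *      static bool is_lcp_scanned(Uint32 word1)
 *      { return (word1 & LCP_SCANNED_BIT) != 0; }
 *      static bool last_lcp_state_free(Uint32 word2)
 *      { return (word2 & LAST_LCP_FREE_BIT) != 0; }
 *      static Uint32 physical_page_id(Uint32 word1)
 *      {
 *        // The remaining bits hold the physical page id when in use.
 *        return word1 & 0x3fffffff;
 *      }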
 *
 *     Detailed discussion of each case of release/allocation of page
 *     ..............................................................
 *
 * A stands for an allocation event, D stands for a release event (drop page).
 * [AD].. stands for an A followed by a D, possibly repeated several times and
 * possibly not at all.
 * E stands for an empty set of events (no A or D events happened in the
 * period).
 *
 * Case 1: Dropped before start of last LCP and dropped at start of this LCP
 * Desired action for ALL ROWS pages: Ignore page
 * Desired action for CHANGED ROWS pages: Ignore page, technically acceptable
 * to record it as DELETE by PAGEID as well.
 *
 * D  LCP_Start(n)   [AD]..    LCP_Start(n+1)  E           LCP_End(n+1) (1)
 * D  LCP_Start(n)   [AD]..    LCP_Start(n+1)  A           LCP_End(n+1) (2)
 * D  LCP_Start(n)   [AD]..    LCP_Start(n+1)  [AD]..A     LCP_End(n+1) (3)
 *
 * (1) is found by the empty page when the LCP scan finds it and the
 *     LCP_SCANNED_BIT is not set. Thus ALL ROWS pages know to ignore
 *     the page. CHANGED ROWS pages can ignore it by looking at the state
 *     of the last LCP and notice that the page was dropped also then and
 *     thus the page can be ignored.
 *
 *     In this case we set the state of last LCP to D in the LCP scan.
 *
 * (2) is found by discovering that page->is_page_to_skip_lcp() is true.
 *     The LCP_SCANNED_BIT isn't set in this case when the LCP scan reaches
 *     it. Thus ALL ROWS pages can ignore it. CHANGED ROWS pages will ignore
 *     it after checking the state of the last LCP.
 *
 *     In this case we need to keep the state of last LCP until the
 *     LCP scan has reached the page. When LCP scan reaches the page we will
 *     set the state of last LCP to D when page->is_page_to_skip_lcp() is
 *     true.
 *
 * (3) is found by discovering that LCP_SCANNED_BIT is set since the first
 *     D event after LCP start handled the page and handled any needed
 *     DELETE by PAGEID. After discovering this one needs to reset the
 *     LCP_SCANNED_BIT again. At the first A the page_to_skip_lcp bit
 *     was set, but the first D issued a DELETE by PAGEID and dropped
 *     the page, and to flag that the LCP scan was handled the
 *     LCP_SCANNED_BIT was set.
 *
 *     We read the old last LCP state and set the new last LCP state when
 *     reaching the first D event after start of LCP. The
 *     page->is_page_to_skip_lcp() flag will assist in determining what
 *     the state at start of LCP was.
 *
 * Case 2: Dropped before start of last LCP and allocated at start of LCP.
 *
 * Desired action for ALL ROWS pages: Any rows with committed data at start
 * of LCP should be recorded as INSERTs into the LCP.
 *
 * Desired action for CHANGED ROWS pages: Any rows with committed data at
 * start of LCP should be recorded as WRITEs into the LCP. All other rows
 * should be ignored, technically acceptable behaviour is to issue
 * DELETE by ROWID for those rows that should be ignored as well.
 *
 * D  LCP_Start(n)   [AD].. A  LCP_Start(n+1)  E           LCP_End(n+1) (1)
 * D  LCP_Start(n)   [AD].. A  LCP_Start(n+1)  D           LCP_End(n+1) (2)
 * D  LCP_Start(n)   [AD].. A  LCP_Start(n+1)  [DA].. D    LCP_End(n+1) (3)
 *
 * (1) is found by the page existing when being scanned, with neither the
 *     LCP_SCANNED_BIT nor the page to skip lcp flag set. Individual rows
 *     can have their LCP_SKIP flag set. All rows with committed data AND not
 *     LCP_SKIP flag set will be recorded. All rows with LCP_SKIP flag set
 *     will be ignored for ALL ROWS pages and will be ignored for CHANGED ROWS
 *     pages based on the last LCP state. Rows without committed data will be
 *     ignored for ALL ROWS pages and will be ignored based on the last LCP
 *     state for CHANGED ROWS pages.
 *
 *     When we are done executing a page for the LCP scan we can set the
 *     last LCP state to A.
 *
 * (2) is found when releasing the page. Before the page is released it will
 *     have its rows deleted; for each row that is deleted and wasn't already
 *     deleted since start of LCP we will record the row using the LCP keep
 *     list and also set the LCP_SKIP flag on the row. When releasing the
 *     page we can ignore it based on knowledge of the last LCP state.
 *
 *     In this case we set the last LCP state and also read it when reaching
 *     the D event. This event can even occur while we're in the middle of
 *     scanning the page for the LCP.
 *
 * (3) is found by discovering that the LCP_SCANNED_BIT is set. This is set
 *     by the first D event after start of LCP after handling the page as
 *     in (2).
 *
 *     Last LCP state already set in the first D event after start of LCP.
 *
 * Case 3: Allocated before start of last LCP and dropped at start of this LCP
 *
 * Desired action for ALL ROWS pages: Page ignored
 *
 * Desired action for CHANGED ROWS pages: DELETE by PAGEID recorded in LCP
 *
 * A  LCP_Start(n) D [AD]..    LCP_Start(n+1)  E           LCP_End(n+1) (1)
 * A  LCP_Start(n) D [AD]..    LCP_Start(n+1)  A           LCP_End(n+1) (2)
 * A  LCP_Start(n) D [AD]..    LCP_Start(n+1)  [AD].. A    LCP_End(n+1) (3)
 *
 * Here we will take the same action for all cases independent of whether we
 * know the state of the last LCP or not, since the state was allocated
 * before and thus we need to record the change in state.
 *
 * (1) is found by an empty page slot with no LCP_SCANNED_BIT set and no skip
 *     flag set on the page. For ALL ROWS pages we will simply ignore those
 *     pages. For CHANGED ROWS pages we will record DELETE by PAGEID based
 *     on the state of the last LCP.
 * (2) is found by discovering page->is_page_to_skip_lcp() is true when LCP
 *     scan reaches it. For ALL ROWS pages this means we can ignore it, for
 *     CHANGED ROWS pages we record it as DELETE by PAGEID based on the state
 *     of the last LCP.
 * (3) is found by discovering the LCP_SCANNED_BIT set which was set when the
 *     first D event after start of LCP was found. When this first D event
 *     occurred we handled the page as in (1) followed by setting the
 *     LCP_SCANNED_BIT.
 *
 * The same principles for handling last LCP state exist here as for Case 1.
 *
 * Case 4: Allocated before start of last LCP and allocated before start
 *         of this LCP
 *
 * Desired action for ALL ROWS pages: Record all rows with committed data at
 * start of LCP. Ignore all rows without committed data at start of LCP.
 *
 * Desired action for CHANGED ROWS pages: Record all rows with committed data
 * at start of LCP. Record all rows without committed data at start of LCP as
 * DELETE by ROWID.
 *
 * A  LCP_Start(n)   [DA]..    LCP_Start(n+1)  E           LCP_End(n+1) (1)
 * A  LCP_Start(n)   [DA]..    LCP_Start(n+1)  D           LCP_End(n+1) (2)
 * A  LCP_Start(n)   [DA]..    LCP_Start(n+1)  [DA].. D    LCP_End(n+1) (3)
 *
 * (1) is found by an existing page without LCP_SCANNED_BIT set and without
 *     the page to skip flag set on the page. We will check row by row if the
 *     row is to be copied to LCP.
 *
 *     If a row exists at start of LCP then it will be recorded in the LCP,
 *     either at LCP scan time or at first delete after the start of the LCP.
 *     When the first delete has occurred then we set the LCP_SKIP flag on
 *     the row to indicate that the row has already been processed for this
 *     LCP. The handling here is the same for ALL ROWS pages and for CHANGED
 *     ROWS pages.
 *
 *     If a row didn't exist at start of LCP then we will ignore it for ALL
 *     ROWS pages and we will record a DELETE by ROWID for CHANGED ROWS
 *     pages. We discover this as part of LCP scan for rows not inserted
 *     again before the LCP scan reaches them. For rows that are inserted
 *     after start of LCP we will mark them with the LCP_SKIP flag for ALL
 *     ROWS pages. For CHANGED ROWS pages we could record the DELETE by ROWID
 *     immediately, but there is no safe space to record this information.
 *     So instead we mark the row with LCP_DELETE to flag to the LCP scan
 *     that this row needs to generate a DELETE by ROWID.
 *
 * (2) is found when releasing a page; at this point the page has already
 *     recorded everything for ALL ROWS pages. We indicate this by setting
 *     LCP_SCANNED_BIT on the page.
 *
 *     However for CHANGED ROWS pages we can still have a set of rowids that
 *     were empty at start of LCP that we need to record before moving on.
 *     We scan the page before moving on; we ignore rows that have the
 *     LCP_SKIP flag set and rows that have rowGCI < scanGCI which indicates
 *     that they were empty also at last LCP. For all other rows we generate
 *     a DELETE by ROWID. Also here we set the LCP_SCANNED_BIT after
 *     doing this.
 *
 * (3) is found by LCP_SCANNED_BIT set when LCP scan reaches it. Any A or D
 *     event after the first D event will be ignored since LCP_SCANNED_BIT
 *     is set.
 *
 * The same principles for handling last LCP state exist here as for Case 2.
 *
 *     Requirement to record number of pages at start of LCP
 *     .....................................................
 * For partial LCPs we record the number of pages existing in the whole
 * fragment at the start of the partial LCP. This has the effect that during
 * restore we can safely ignore all LCP records on rowids with higher page id
 * than the recorded number of pages. They could never be part of the LCP even
 * if they are part of earlier LCPs.
 *
 * Let's look at an example here. Each page can be sparse or full, it doesn't
 * matter for the description; we need to ensure that the restore can recover
 * the correct set of rows.
 *
 * LCP 1: Contains 17 pages (rowids from page 0 to 16 included)
 * LCP 2: Contains 13 pages
 * LCP 3: Contains 14 pages
 *
 * When restoring LCP 3 we make use also of parts from LCP 1 and LCP 2.
 * We start by applying the LCP 1 for rowids in page 0 to 13. Next, when we
 * start applying LCP 2 we need to perform DELETE by ROWID for all rows in
 * page id 13. We know that all rowids from page id 13 have either
 * GCI = 0 or a GCI > lcpScanGci which makes them recorded as changes
 * in LCP 3.
 *
 * If we had not recorded the number of pages in LCPs we would not be
 * able to know that rows in page id 14 through 16 were deleted, since
 * the LCP scan would not see them as they were not part of the
 * pages scanned during LCP (simply because the pages no longer existed).
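 *
 * A minimal sketch of the restore-side pruning this enables (the names are
 * invented for illustration; the real logic lives in the restore code):
 *
 *   struct LcpFileSketch { Uint32 maxPageCount; };
 *
 *   void apply_lcp_chain(LcpFileSketch *lcps, Uint32 numLcps)
 *   {
 *     Uint32 currentPageCount = 0;
 *     for (Uint32 i = 0; i < numLcps; i++)
 *     {
 *       // Rows restored so far on pages >= this LCP's page count were
 *       // deleted before this LCP started, so remove them first.
 *       if (i > 0 && lcps[i].maxPageCount < currentPageCount)
 *         delete_rows_with_page_id_ge(lcps[i].maxPageCount); // assumed helper
 *       apply_lcp_records(lcps[i]); // INSERT/WRITE/DELETE by ROWID/PAGEID
 *       currentPageCount = lcps[i].maxPageCount;
 *     }
 *   }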
 *
 *
 *     Multiple LCP files to save disk space
 *     .....................................
 * Using partial LCP it is essential to be able to drop files as early as
 * possible. If an LCP file contains too many fully written parts then the
 * file needs to be retained although most of its data is no longer useful.
 *
 * To avoid this we cap the number of parts we use for large fragments
 * in each file and use a multi-file implementation of each partial LCP.
 *
 * What we do here is that we divide the LCP of each fragment into several
 * files. We will write each of those files in sequential order. Assume that
 * we have 2048 parts and that this LCP is to record 256 of those parts
 * starting at part 100. Assume that we divide this LCP into 4 files.
 *
 * The first file will record all rows from part 100-163, the second will
 * contain all rows from part 164-228, the third file will contain all
 * rows from part 229-292 and the fourth and last file will contain
 * all rows from part 293-356.
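 *
 * A sketch of the split arithmetic (an even split is assumed here; the
 * names are illustrative and the ranges in the prose example above are
 * approximate):
 *
 *   static const Uint32 TOTAL_PARTS = 2048;
 *   // Compute the (startPart, numParts) range recorded in file 'i' when
 *   // 'numParts' parts starting at 'startPart' are spread over 'numFiles'.
 *   static void file_part_range(Uint32 startPart, Uint32 numParts,
 *                               Uint32 numFiles, Uint32 i,
 *                               Uint32 &fileStart, Uint32 &fileCount)
 *   {
 *     Uint32 base = numParts / numFiles;
 *     Uint32 rem  = numParts % numFiles;
 *     fileCount = base + (i < rem ? 1 : 0); // first 'rem' files get one more
 *     Uint32 offset = i * base + (i < rem ? i : rem);
 *     fileStart = (startPart + offset) % TOTAL_PARTS; // parts wrap around
 *   }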
 *
 * The rows from the LCP keep list are written into the file currently
 * used.
 *
 * Changed rows can be written to any of the files, but we choose to write
 * them to the first file. The reason is that this means that the biggest
 * file in the LCP will be removed first and thus it is the most efficient
 * algorithm to save disk space.
 *
 * It is somewhat complicated to prove that this brings about
 * an LCP that can be correctly restored. We prove it in a number of
 * steps before proving the theorem for Partial LCPs.
 *
 * Corollary 1:
 * ............
 * For each LCP part we always start by applying an LCP where all rows
 * of the part are recorded. Then we will execute the change parts of
 * all LCPs thereafter until the last.
 *
 * Proof:
 * This is the intended recovery algorithm, so proof is not really
 * needed. Proof is only required to prove that this recovers a
 * proper LCP according to Theorem 1 above.
 *
 * Case 1:
 * Assume that the row existed at the time of the first LCP used in
 * restore and is kept all the way until the last LCP; updates can
 * occur.
 *
 * Case 2:
 * Assume that the row was inserted after the initial LCP and is kept
 * until the last LCP.
 *
 * Case 3:
 * Assume that the row existed at the time of the first LCP but has
 * been deleted before the final LCP.
 *
 * Case 4:
 * Assume that the row didn't exist at the first LCP and did not
 * exist at the time of the last LCP.
 *
 * Case 4 is obviously ok, no LCP has recorded anything regarding
 * this row, so it cannot be a problem.
 *
 * Case 1 means that the row is restored in the first LCP; if any changes
 * have occurred before the last LCP they will be recorded in one of
 * the LCPs preceding the last LCP or in the last LCP itself. It
 * could contain a newer value if the last LCP had changes that
 * occurred after start of the LCP. Thus the row is present with
 * same or newer data as it should be according to Theorem 1.
 *
 * Case 2 means that the row was not present in the first LCP.
 * It must have been inserted in one of the following LCPs
 * or the last LCP and since it will be marked with a higher GCI
 * when inserted it will be part of the next LCP after being
 * inserted; similarly any updates will be recorded in some LCP if
 * they happen before or during the last LCP. Thus the row exists
 * after applying rows according to Corollary 1 such that Theorem 1
 * holds true.
 *
 * Finally Case 3 has inserted the row as part of the first LCP. The
 * row could have been written by the LCP keep list in this first LCP.
 * However when the row is deleted the GCI of the row will be set
 * to a GCI higher than the GCI of the first LCP and this ensures that
 * the rowid is recorded in the LCP as DELETE by ROWID. Finally if the
 * entire page has been removed before the last LCP we will record
 * this in the last LCP and this means that we will ignore the row
 * that exists in the first LCP restored since we know that no
 * rows with that rowid are present in the LCP.
 *
 * This means that we have proven that the LCP also in case 3 fits
 * with Theorem 1 in that the row is certain to not be part of the
 * LCP restored.
 *
 * Thus all cases have been proven and Corollary 1 is proven to be
 * a correct restore method for LCPs with Partial LCPs.
 *
 * Corollary 2:
 * ............
 * The LCP keep list can be recorded in any LCP file in the case where
 * multiple files are used to record an LCP.
 *
 * Proof:
 * The record in the LCP from an LCP keep list will always be overwritten
 * or ignored by the following LCPs. The reason is simply that the GCI of
 * the delete is higher than the LCP scan GCI of the current LCP. Thus the
 * next LCP will either overwrite this record with a DELETE by ROWID or
 * the record will be ignored by the next LCP since the entire page has
 * been dropped or the rowid will be overwritten by another row that
 * reused the rowid of the deleted row.
 *
 * Thus it is safe to store these LCP keep list items as they come
 * and record them in any file. Obviously all the files of the last
 * LCP will be kept and applied as part of restore.
 *
 * Corollary 3:
 * ............
 * When we remove a file from an LCP we cannot be interested in any
 * of the change rows from this LCP. We are only interested in the
 * parts where we have recorded all rows.
 *
 * Proof:
 * We will only remove the oldest LCP files at any time. Thus when we
 * remove a file from an LCP we are sure that all the files from the
 * previous LCP are already deleted. This means that the LCP from which
 * we delete files can only be used to restore the all rows part as
 * described in Corollary 1. Thus we will always ignore all parts
 * with changed rows for an LCP where we are about to delete a file.
 *
 * Theorem 2:
 * ----------
 * The following algorithm will be applied using multiple files.
 * If we want to divide the parts where we record all rows into multiple
 * files we do so in the following manner:
 * 1) In the first file we will record up to 1/8th of the parts. We will
 * also record all changed rows for parts where we are not recording
 * all rows. In addition LCP keep rows are recorded as they arrive.
 * 2) In the following files we will also record all rows for up to 1/8th
 * of the parts. Also LCP keep rows for those as they arrive.
 *
 * Proof:
 * ------
 * Corollary 2 shows that it is correct to record LCP keep rows as they
 * arrive in any of the files.
 * Corollary 3 shows that any algorithm to select where to record
 * changed rows is correct; in particular this shows that the selected
 * variant to record all in the first file is correct.
 * Corollary 1 shows that the restore algorithm for this type of LCP
 * works as desired.
 *
 * Observation 2:
 * --------------
 * Given that we need two different mechanisms to deduce if a page should
 * be skipped when LCP scanned (is_page_to_skip_lcp() through state on the
 * page and the lcp_scanned_bit set in the page map), both of
 * those need to be checked to see if a row is in the remaining LCP set
 * that is used to decide whether to set the LCP_SKIP bit on the row.
 *
 * The is_page_to_skip_lcp() flag on the page is set when the first
 * alloc/release page event after start of the LCP scan is an allocation.
 * After this the page can be released and if so the last LCP state of the
 * page will be updated and the lcp scanned bit will be set.
 *
 * Similarly if the page is released as the first page event after
 * start of LCP scan we will also update the last LCP state and
 * next set the lcp scanned bit. So when we see a lcp scanned bit we
 * need never do anything more during the LCP scan, we only need to
 * reset the bit.
 *
 * Lemma 1:
 * --------
 * Based on Theorem 2 we deduce that each LCP requires an LCP control
 * file that contains at least the following information.
 *
 * MaxGciCompleted:
 * This is the GCI up to which we have all changes in the LCP. The
 * LCP can also contain changes for MaxGciCompleted + 1 and
 * MaxGciCompleted + 2 and beyond.
 *
 * MaxPageCount:
 * This is the number of pages existing (with rowids) in the LCP which
 * is recorded at the start of the partial LCP.
 *
 * A list of part ranges (one part range per file) and the file numbers.
 * This is recorded using the following variables in the LCP control file.
 *
 * MaxPartPairs:
 * This is the maximum number of LCPs that can constitute a recoverable
 * checkpoint. Thus an LCP control file can record at most this many
 * part pairs. Currently this number is set to 2048.
 *
 * NumPartPairs:
 * This is the number of files used in the restore of this LCP; there is
 * one part range per file.
 *
 * MaxNumberDataFiles:
 * This is the maximum number of files used. It is used to calculate the
 * file numbers based on a number of files (NumPartPairs) and the
 * parameter LastDataFileNumber.
 *
 * LastDataFileNumber:
 * The last LCP file, this will be the final file restored in a restore
 * situation.
 *
 * An array of pairs (startPart, numParts) where the last pair records the
 * last LCP file and the first pair records the first file to start restoring
 * from.
 *
 * In addition we record the following information in the LCP control
 * file.
 *
 * Checksum:
 * To verify the content of the LCP control file.
 *
 * TableId:
 * Table id of the checkpointed fragment.
 *
 * FragmentId:
 * Fragment id of the checkpointed fragment.
 *
 * LcpId:
 * The global LcpId this LCP belongs to.
 *
 * LocalLcpId:
 * If part of a global LCP it is 0, otherwise it is 1, 2, 3 and so forth
 * for a local LCP executed without control of DIH.
 *
 * In addition the LCP control file contains a file header as do all LCP
 * files and backup files. The most important information here is the
 * version number of the partial LCP changes as such and the version
 * number that wrote this file. This is important for any upgrade
 * scenarios.
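 *
 * Taken together, a hedged sketch of the control file payload described
 * above (the struct and helper below are illustrative; the real on-disk
 * layout and file-number derivation are defined by the implementation):
 *
 *   struct LcpCtlFileSketch
 *   {
 *     Uint32 Checksum;           // verifies the file content
 *     Uint32 TableId;            // checkpointed table
 *     Uint32 FragmentId;         // checkpointed fragment
 *     Uint32 LcpId;              // global LCP id
 *     Uint32 LocalLcpId;         // 0 if part of global LCP, else 1, 2, ...
 *     Uint32 MaxGciCompleted;    // all changes up to this GCI are included
 *     Uint32 MaxPageCount;       // pages existing at start of partial LCP
 *     Uint32 MaxPartPairs;       // currently 2048
 *     Uint32 NumPartPairs;       // number of files used in restore
 *     Uint32 MaxNumberDataFiles; // used to derive the file numbers
 *     Uint32 LastDataFileNumber; // final file restored
 *     struct { Uint32 startPart; Uint32 numParts; } partPairs[1];
 *     // actually NumPartPairs entries
 *   };
 *
 *   // Assumed derivation: file numbers count backwards from
 *   // LastDataFileNumber, wrapping modulo MaxNumberDataFiles.
 *   static Uint32 data_file_number(const LcpCtlFileSketch &c, Uint32 i)
 *   {
 *     Uint32 back = (c.NumPartPairs - 1) - i; // i = 0: first file restored
 *     return (c.LastDataFileNumber + c.MaxNumberDataFiles - back)
 *            % c.MaxNumberDataFiles;
 *   }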
 *
 * LCPs and Restarts:
 * ------------------
 * Partial LCP is developed to store less information in LCPs and also
 * to let LCPs run faster. When LCPs complete faster we can cut
 * the REDO log much sooner.
 *
 * However we still need to make a full checkpoint as part of a restart.
 * We will describe the implications this has for various types of
 * restarts.
 *
 * System Restart:
 * ...............
 * No real implication; we have ensured that a full checkpoint is
 * still divided into separate files to ensure that we save disk space.
 * There are no updates ongoing during this LCP so this LCP will simply
 * write the changed contents while executing the REDO log.
 *
 * Node restart:
 * .............
 * This restart depends to a great extent on how long the node
 * was dead; if it was dead for a long time it will have a lot more
 * to write in an LCP than otherwise.
 *
 * Initial node restart:
 * .....................
 * This is the trickiest problem to solve. Using partial LCP we aim for
 * LCPs to complete in 5-10 minutes, but writing the initial LCP after
 * synching the data with the live node might take many hours if the
 * node contains terabytes of data.
 *
 * We solve this by running local LCPs before we become part of the
 * global LCP protocol. DIH won't know about these LCPs but it doesn't
 * really matter, we can make use of them if the node crashes during
 * restart although DIH didn't know about them. But more importantly,
 * as soon as we participate in the first global LCP we can run that
 * LCP much faster since we already have logged all rows, so we only
 * need to record the changes since the last local LCP in the first
 * global LCP.
 *
 * The protocol used to tell the starting node about the state of fragments
 * is called COPY_ACTIVEREQ. This is received 2 times per fragment
 * per node restart. The first one says that we have completed the
 * synchronisation. We will use this first signal to put the fragment
 * in the queue for running an LCP.
 *
 * When all fragments have been synchronised then DIH will start the
 * second phase. In this phase each fragment will start using the
 * REDO log as preparation for the first LCP.
 *
 * Note that a local LCP cannot be used to restore the database on
 * its own. It requires either a node synchronization as part of a node
 * restart, which works fine as the rowids are synchronized one by one
 * (there might be unneeded work done if the live node uses a GCI
 * from DIH, but it will still be correct).
 *
 * It can also be restored in a system restart by using REDO logs from
 * other nodes; we can avoid applying REDO logs we don't need since we
 * know what GCP we have completely recorded in the LCP. The proof of
 * why applying REDO logs will restore a consistent database still
 * holds.
 *
 * Obviously if as part of recovery we are told to execute the REDO log
 * from GCI 77 to 119 and we know that the LCP is completed for
 * GCI 144 then we can completely skip the part where we execute the
 * REDO log for that fragment as part of the recovery. Later it will
 * be synched up in this case using a live node.
 *
 * Local LCPs during restart
 * .........................
 * When we receive the first COPY_ACTIVEREQ in DBLQH we will start a
 * new local LCP. This will insert an UNDO_LOCAL_LCP_FIRST into the
 * UNDO log. This means that we can move the UNDO log forward; we
 * still need to retain all UNDO log records from the previous LCP,
 * and the one before that since we cannot be certain that the previous
 * LCP actually completed.
 *
 * During a Local LCP we cannot insert one more UNDO_LOCAL_LCP_FIRST again
 * until we have completed a Local LCP of each and every fragment to be
 * restored.
 *
 * So what this means is that we will start running a Local LCP as part
 * of the synchronisation with the live node. It is possible to run an
 * LCP for an individual fragment several times during this round of
 * LCP. But we need to complete the Local LCP before allowing the
 * first COPY_ACTIVEREQ in the second phase to continue. If we didn't
 * do this we would run a much bigger chance of running out of UNDO
 * log. In some cases we might still run out of UNDO log and in this
 * case we will ensure that the LCP gets higher priority and that the
 * synchronisation process is blocked temporarily. We will do this
 * when certain thresholds in UNDO log usage are reached.
 *
 * We will allow for two choices in how we perform Local LCPs. We will
 * perform 1 Local LCP for all node restarts before we allow the
 * REDO logging to be activated (activated by COPY_ACTIVEREQ in the second
 * phase). After completing this first Local LCP we will measure how
 * much impact introducing the node into the distributed LCP would have.
 * If we consider the impact too high we will execute one more round of
 * Local LCP.
 *
 * We will not for the moment consider executing a third Local LCP, to
 * ensure that we don't get stuck in this state for too long.
 *
 * Executing 2 Local LCPs should in most cases be sufficient to catch
 * up with LCP times at other nodes.
 *
 * Dropped tables during a node failure
 * ....................................
 * This is a tricky problem that requires us to avoid reusing a table id
 * for a new table until we're sure that all nodes have restarted and
 * heard that the table has been dropped. We also need to tell starting
 * nodes that the table is dropped and that it requires all LCP files
 * to be removed.
 *
 * Various implementation details about LCPs
 * .........................................
 * When we commit a delete we need to know if the fragment is currently
 * performing an LCP and if so we need to know if the row has been
 * scanned yet during the LCP.
 *
 * With Partial LCP this is a bit more intricate, where we need to check
 * the scan order in the Backup block. However only DBTUP knows if a
 * page has been deleted and then followed by a new page allocation.
 *
 * For parts where we record all rows of the part these pages can be
 * skipped since all rows inserted into this page occur after start of
 * the LCP.
 *
 * However for parts where we record changed rows we need to scan these
 * pages and record DELETE by ROWID for those entries that are free.
 *
 * LCP signal flow
 * ---------------
 *
 * Description of local LCP handling when checkpointing one fragment locally
 * in this data node. DBLQH and BACKUP always execute in the same thread.
 * DICT and NDBFS mostly execute in different threads.
 *
 * The LCP_PREPARE_REQ for the next fragment to checkpoint can execute in
 * parallel with BACKUP_FRAGMENT_REQ processing. This makes LCP processing
 * faster when there are many small fragments.
 *

 DBLQH                        BACKUP             DICT              NDBFS
  |                             |
  |   LCP_PREPARE_REQ           |
  |---------------------------->|
  |                             |    2 * FSOPENREQ (control files)
  |                             |----------------------------------->|
  |                             |    2 * FSOPENCONF                  |
  |                             |<-----------------------------------|
  |                             |    2 * FSREADREQ (control files)
  |                             |----------------------------------->|
  |                             |    2 * FSREADCONF                  |
  |                             |<-----------------------------------|
  |                             |    FSCLOSEREQ (most recent control file)
  |                             |----------------------------------->|
  |                             |    FSCLOSECONF                     |
  |                             |<-----------------------------------|
  |                             |    FSOPENREQ (checkpoint data file)
  |                             |----------------------------------->|
  |                             |    FSOPENCONF                      |
  |                             |<-----------------------------------|
  |                             | CONTINUEB(ZBUFFER_FULL_META) to oneself
  |                             |--------------------------------------->
  |                             |  GET_TABINFOREQ  |
  |                             |----------------->|
  |                             | GET_TABINFO_CONF |
  |                             |<-----------------|
  |   LCP_PREPARE_CONF          |
  |<----------------------------|
  ...
  |   BACKUP_FRAGMENT_REQ       |-------> CONTINUEB(START_FILE_THREAD)|
  |---------------------------->|
  |   SCAN_FRAGREQ              |
  |<----------------------------|
  |
  | Potential CONTINUEB(ZTUP_SCAN) while scanning for tuples to record in LCP
  |
  |  TRANSID_AI                 |
  |---------------------------->|
  |.... More TRANSID_AI         | (Up to 16 TRANSID_AI, 1 per record)
  |  SCAN_FRAGCONF(close_flag)  |
  |---------------------------->|
  |  SCAN_NEXTREQ               |
  |<----------------------------|
  |
  | Potential CONTINUEB(ZTUP_SCAN) while scanning for tuples to record in LCP
  |
  |  TRANSID_AI                 |
  |---------------------------->|
  |.... More TRANSID_AI         | (Up to 16 TRANSID_AI, 1 per record)
  |  SCAN_FRAGCONF(close_flag)  |
  |---------------------------->|

  After each SCAN_FRAGCONF we check if there is enough space in the Backup
  buffer used for the LCP. We will not check it until here, so the buffer
  must be big enough to be able to store the maximum size of 16 records
  in the buffer. Given that the maximum record size is about 16kB, this means
  that we must have at least 256 kB of buffer space for LCPs. The default
  is 2MB, so one should not set it lower than this unless trying to achieve
  a really memory-optimised setup.

  If there is currently no space in the LCP buffer, then the buffer is either
  waiting to be written to disk, or it is being written to disk. In this case
  we will send a CONTINUEB(BUFFER_FULL_SCAN) delayed signal until the buffer
  is available again.

  When the buffer is available again we send a new SCAN_NEXTREQ for the next
  set of rows to be recorded in LCP.
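
  A hedged sketch of this flow control (the helper names are invented for
  illustration; only the signal names and the delay constant come from the
  surrounding code):

    // On SCAN_FRAGCONF: continue the scan only if the LCP buffer has room,
    // else poll with a short delayed CONTINUEB until it has been drained.
    void on_scan_fragconf(Signal *signal)
    {
      if (!buffer_has_space_for_16_records()) // assumed helper
      {
        signal->theData[0] = BackupContinueB::BUFFER_FULL_SCAN;
        sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                            WaitDiskBufferCapacityMillis, 1);
        return;
      }
      send_scan_nextreq(signal); // assumed helper: request next batch of rows
    }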

  CONTINUEB(START_FILE_THREAD) will either send a FSAPPENDREQ to the opened
  file or it will send a delayed CONTINUEB(BUFFER_UNDERFLOW).

  When FSAPPENDCONF arrives it will make the same check again and either
  send one more file write through FSAPPENDREQ or another
  CONTINUEB(BUFFER_UNDERFLOW). It will continue like this until the
  SCAN_FRAGCONF has been sent with close_flag set to true AND all the buffers
  have been written to disk.

  After the LCP file write has been completed the close of the fragment LCP
  is started.

  An important consideration when executing LCPs is that they conflict with
  the normal processing of user commands such as key lookups, scans and so
  forth. If we execute everything at the normal JBB level we are going to get
  problems in that we could have job buffers of thousands of signals. This
  means that we will run the LCP extremely slowly, which will be a
  significant problem.

  The other approach is to use the JBA level. This will obviously give the
  LCP too high priority; we will run LCPs until we have filled up the
  buffer or even until we have filled up our quota for the 100ms timeslot
  where we check for those things. This could end up producing 10
  MByte of LCP data before allowing user level transactions again. This
  is also obviously not a good idea.

  So most of the startup and shutdown logic for LCPs, both for the entire
  LCP and the messages per fragment LCP, is ok to raise to JBA level. These
  are short and concise messages and won't bother the user transactions at
  any noticeable level. We will avoid fixing GET_TABINFO for that since it
  is only one signal per fragment LCP and the code path is also used by
  many other activities which are not suitable to run at JBA level.

  So the major problem to handle is the actual scanning towards LQH. Here
  we need to use a mechanism that keeps the rate at appropriate levels.
  We will use a mix of keeping track of how many jobs were executed since
  last time we executed, together with sending JBA-level signals to speed
  up LCP processing for a short time and using signals sent with delay 0
  to avoid being delayed for more than 128 signals (the maximum amount
  of signals executed before we check timed signals).

  The first step to handle this is to ensure that we can send SCAN_FRAGREQ
  on priority A and that this also causes the resulting signals that these
  messages generate also to be sent on priority A level. Then each time
  we can continue the scan immediately after receiving SCAN_FRAGCONF we
  need to make a decision at which level to send the signal. We can
  either send it as a delayed signal with 0 delay or we could send it
  at priority A level to get another chunk of data for the LCP at a high
  priority.

  We send the information about priority A level as a flag in the
  SCAN_FRAGREQ signal. This will ensure that all resulting signals
  will be sent on priority A except the CONTINUEB(ZTUP_SCAN) which
  will get special treatment where it increases the length of the
  loop counter and sends the signal with delay 0. We cannot send
  this signal on priority level A since there is no bound on how
  long it will execute.

 DBLQH      PGMAN   LGMAN     BACKUP             DICT              NDBFS
  |         SYNC_PAGE_CACHE_REQ
  |          <------------------|
  |           sync_log_lcp_lsn  |
  |                  <----------|
  |           Flush UNDO log
  |                  ---------->|
  |         Flush fragment page cache
  |         SYNC_PAGE_CACHE_CONF
  |          ------------------>|
  |         If first fragment in LCP then also:
  |         SYNC_EXTENT_PAGES_REQ
  |          <------------------|
  |         Flush all extent pages
  |         SYNC_EXTENT_PAGES_CONF
  |          ------------------>|
  |
  | After all file writes to LCP data file completed:
  |
  |                             |     FSCLOSEREQ
  |                             |------------------------------------>|
  |                             |     FSCLOSECONF
  |                             |<------------------------------------|

  When all those activities are completed:
  1) Sync UNDO log
  2) Sync page cache
  3) Sync extent pages (done immediately following sync of page cache)
  4) Write and close of LCP data file
  then we are ready to write the LCP control file. After this file
  is written and closed the LCP of this fragment is completed.

  With this scheme the LCP of a fragment is immediately usable when it
  is completed, and the signal of this completion is that a written LCP
  control file exists. At restart one needs to verify
  the GCI of this file to ensure that the LCP is restorable. Otherwise
  the older LCP will be used.

  |                             |     FSWRITEREQ (LCP control file)
  |                             |----------------------------------->|
  |                             |     FSWRITECONF
  |                             |<-----------------------------------|
  |                             |     FSCLOSEREQ (LCP control file)
  |                             |----------------------------------->|
  |                             |     FSCLOSECONF
  |                             |<-----------------------------------|
  |                             |
  | BACKUP_FRAGMENT_CONF        |
  |<----------------------------|
  |
  |                     DIH (local)
  |  LCP_FRAG_REP        |
  |--------------------->|

  LCP_FRAG_REP is distributed to all DIHs from the local DIH instance.

  Finally after completing all fragments we have a number of signals sent to
  complete the LCP processing. The only one needed here is the END_LCPREQ
  to TSMAN to make the dropped pages from any dropped tables available again
  after completing the LCP. There is no need to wait for this signal to
  complete.
  DBLQH knows when the last fragment is completed since it will receive a
  special LCP_FRAG_ORD with lastFragmentFlag set from the LQH proxy which in
  turn received this from DIH.

                             LQH Proxy   PGMAN(extra)     LGMAN  TSMAN
  |   LCP_FRAG_ORD(last)        |
  |<----------------------------|
  ......
  | LCP_COMPLETE_REP            |
  |---------------------------->|

  Here the LQH Proxy block will wait for all DBLQH instances to complete.
  After all have completed the following signals will be sent.
                             LQH Proxy   PGMAN(extra)     LGMAN  TSMAN

                                | END_LCPREQ                        |
                                |---------------------------------->|
                                | END_LCPCONF                       |
                                |<----------------------------------|
                                |
                                | LCP_COMPLETE_REP(DBLQH) sent to DIH(local)


  As preparation for this DBLQH sends DEFINE_BACKUP_REQ to set up a backup
  record in restart phase 4. It must get the response DEFINE_BACKUP_CONF for
  the restart to successfully complete. This signal allocates memory for the
  LCP buffers.

  Background deletion process
  ---------------------------
  To save file space we try to delete old checkpoint files no longer needed
  as soon as possible. This is a background process fully handled by the
  BACKUP block; it is handled outside the normal LCP processing protocol.

  It could interfere with LCP processing in the exceptional case that we
  haven't managed to delete the old LCP files for a fragment before starting
  to prepare the next local checkpoint.

  From DIH's point of view we always have an LCP instance 0 and an LCP
  instance 1 for each fragment. When we complete writing a checkpoint file we
  need to keep the old checkpoint file until the new checkpoint file is
  usable in a restore case. At the time when it completes we cannot use it
  since it can contain rows from a GCI that hasn't been fully completed yet.
  As soon as we get an indication that the checkpoint is usable for restore
  we can delete the old checkpoint file.

  To handle this we maintain a list of fragments to handle deletes of
  fragment checkpoint files.

  We also need a way to handle deletion of old files after crashes. This is
  actually fairly easy to handle as part of the recovery; as we use the
  checkpoint files to restore, we can as part of that remove any old
  checkpoint files.

  Local LCP execution
  -------------------
  Normally an LCP is executed as a distributed checkpoint where all nodes
  perform the checkpoint in a synchronised manner. During restarts we might
  execute extra local LCPs that can be used to cut the logs (REDO and UNDO
  logs). We don't generate REDO logs until very late in the recovery process;
  UNDO logs however we generate all the time, so it is mainly the UNDO log
  we have to protect from being exhausted during a restart.

  Such a local checkpoint can be used to recover a system, but it can
  normally not be used to recover a node on its own. If the local LCP happens
  during a system restart there are two options. If we have seen the GCP that
  we are attempting to restore we have all checkpoints and REDO logs required
  and a local LCP during restart should normally not be necessary. If our
  node is behind and we rely on some other node to bring us the latest GCIs
  then we might have to perform a checkpoint. In this case this local LCP
  will not be recoverable on its own.

  The reason why these local LCPs are not recoverable on their own is
  twofold. First the synchronisation of data with the other node might not
  be completed yet when the local LCP starts. This means that the local LCP
  isn't seeing a unified view; some rows will see a very new version whereas
  other rows will be seeing a very old view. To make a consistent state one
  more node is required. Second, even if the local LCP started after the
  synchronisation was complete we don't have local REDO log records that
  can bring the local LCP to a consistent state since we don't write to
  the REDO log during the synchronisation phase. Even if we did write to
  the REDO log during synchronisation the various fragments would still be
  able to recover to different GCIs; thus a consistent restore of the node
  is still not possible.

  So when a node crashes the first time it is always recoverable on its
  own from a certain GCI. The node with the highest such GCI per node
  group is selected as the primary recovery node. Other nodes might have
  to rely on this node for their further recovery. Obviously each node group
  needs to be restored from the same GCI to restore a consistent database.
  As soon as we start executing a local LCP the node is no longer able to
  be restored independently of other nodes. So before starting to execute a
  local LCP we must first write something to the file system indicating that
  this node is not recoverable unless another node gives us assistance.

  So independent of what GCI the node can restore according to the system
  file, it cannot be used to recover data to other nodes without first
  recovering its own data using another node as aid.

  When a node is started we know of the GCI to restore for our node; it
  is stored in DBLQH in the variable crestartNewestGci during recovery
  and DBLQH gets it from DBDIH which got it from the system file stored
  in the DIH blocks.

  For distributed LCPs we use this GCI to restore to check if a fragment
  LCP can be used for recovery. However for local LCPs this information
  is normally not sufficient. For local LCPs we either have a fixed
  new GCI that we need to handle (during system restart) or a moving
  set of GCPs (during node start).

  So for a restore we need to know the crestartNewestGci from DBLQH, but
  we also need to know the GCIs that we can use from other nodes. This
  information must be written into the local system file of this node.

  The local system file is stored in NDBCNTR. It contains the following
  information:
  1) Flag whether the node is restorable on its own
  2) Flag whether the node has already removed old LCP files
  3) Last GCI of partial GCPs
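
  A sketch of that content (the struct name and field names are assumed
  for illustration; the real layout is owned by NDBCNTR):

    struct LocalSysfileSketch
    {
      Uint32 nodeRestorableOnItsOwn; // 1) 1 = restorable without other nodes
      Uint32 oldLcpFilesRemoved;     // 2) 1 = old LCP files already removed
      Uint32 lastGciOfPartialGcps;   // 3) last GCI of partial GCPs
    };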

  When a node is starting up and we are recovering the data (executing
  RESTORE_LCP_REQ from restore) we want to delete any files that aren't
  usable for recovery since they have a MaxGCIWritten that is larger
  than the above Last GCI of partial GCPs. Once we have completed
  the RESTORE_LCP_REQ phase we know that we have deleted all old
  LCP files that can no longer be used and we should only have one
  copy of each fragment LCP stored at this point. At this point we
  can set the flag above to indicate that we have already removed the
  old LCP files.

  The important parameters in the LCP metadata files stored here are
  the parameters MaxGCIWritten and MaxGCICompleted.

  When we write a local LCP the following holds for MaxGCIWritten.
  During system restart the MaxGCIWritten will be set to the
  GCI that the system restart is trying to restore. If the fragment
  has been fully synchronised before the local LCP started it will
  have the MaxGCICompleted set to the same GCI, otherwise it will
  have its value set to the crestartNewestGci (the last GCP we were
  part of in the distributed protocol).

  So for system restarts there are only two GCI values that can be
  used during a local LCP: the GCI we are attempting to
  restore in the cluster, or the GCI we were last involved in
  a distributed protocol for (crestartNewestGci).

  For node restarts the MaxGCIWritten is set according to what
  was set during the writing of the local LCP of the fragment.
  It will never be set smaller than crestartNewestGci.

  MaxGCICompleted is set dependent on the state at the start
  of the local LCP. If the fragment was fully synchronized
  before the start of the fragment LCP we set MaxGCICompleted
  to the GCI that was recoverable in the cluster at the time
  of the start of the local fragment LCP. If the fragment
  wasn't fully synchronised before the start of the local LCP
  we set it to crestartNewestGci or the maximum completed GCI
  in the fragment LCP restored.
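
  A hedged sketch of that choice (the helper and parameter names are
  invented for illustration):

    Uint32 choose_max_gci_completed(bool fragFullySynced,
                                    Uint32 clusterRecoverableGci,
                                    Uint32 crestartNewestGci,
                                    Uint32 restoredLcpMaxGciCompleted)
    {
      if (fragFullySynced)
      {
        // Fragment holds the same data as the live node, so we can use
        // the GCI recoverable in the cluster at the start of the LCP.
        return clusterRecoverableGci;
      }
      // Not yet synchronised: fall back to what we know we fully contain.
      return (crestartNewestGci > restoredLcpMaxGciCompleted)
               ? crestartNewestGci
               : restoredLcpMaxGciCompleted;
    }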

  MaxGCIWritten is important during recovery to know whether
  a local LCP is valid: if MaxGCIWritten is larger than the
  GCP we have seen complete, the local LCP files cannot be
  trusted and must be deleted.

  The MaxGCICompleted setting can ensure that we don't have to
  re-execute the local REDO log any more. It also takes
  into account that we don't have to synchronize more
  than necessary with the starting node.

  Information needed during restore for local LCP
  ...............................................
  We need to know about the crestartNewestGci. We also need
  to know the maximum GCI that is allowed when we encounter
  a local fragment LCP, to understand which local fragment
  LCPs to remove.
  crestartNewestGci is sent as part of RESTORE_LCP_REQ for
  each restored fragment. We also need to add the max
  GCI restorable. Actually it is sufficient to send the
  maximum of those two values. Thus if the local system
  file says that we can recover on our own we will
  continue sending crestartNewestGci. Otherwise we will
  send the maximum of crestartNewestGci and the max GCI
  found in the local system file.

  If either of MaxGciWritten and MaxGciCompleted is set
  higher than the max GCI restorable that we send to
  the restore block, we need to remove that fragment LCP.
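
  A minimal sketch of that pruning rule (illustrative names only):

    // maxRestorableGci is crestartNewestGci when the local system file
    // says the node is restorable on its own, otherwise the maximum of
    // crestartNewestGci and the max GCI in the local system file.
    bool fragment_lcp_must_be_removed(Uint32 maxGciWritten,
                                      Uint32 maxGciCompleted,
                                      Uint32 maxRestorableGci)
    {
      return (maxGciWritten > maxRestorableGci) ||
             (maxGciCompleted > maxRestorableGci);
    }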
12952 
12953   Information needed during write of local LCP
12954   ............................................
12955   We need to know the state of the synchronisation of the fragment.
12956   If m_copy_started_state == AC_NORMAL &&
12957      fragStatus == ACTIVE_CREATION in DBLQH then we have completed
12958   the synchronisation of the fragment. Otherwise we haven't.
12959   We'll get this information from DBLQH at start of write of LCP
12960   in the Backup block.
12961 
12962   The backup block is informed about the GCI that is currently
12963   completed in the cluster through the signal RESTORABLE_GCI_REP
12964   sent from DBLQH. This information DBLQH collects from
12965   the GCP_SAVEREQ signal. This information is stored in the
12966   Backup block in m_newestRestorableGci.
12967 
12968   MaxGciCompleted is set by DBLQH and retrieved by Backup block
12969   in the method lcp_max_completed_gci. For normal distributed
12970   LCPs this method will simply set the MaxGciCompleted to the
12971   last completed GCI that DBLQH knows of. DBLQH gets to know
12972   of completion of a GCI through GCP_SAVEREQ. However for
12973   local LCP the procedure is a bit more complicated.
12974 
12975   It will first check if the fragment is fully synchronised.
12976   If not it will set MaxGciCompleted to crestartNewestGci.
12977   If it is synchronised we will use the same method as for
12978   a distributed LCP, given that we have fully completed the
12979   GCI, since the fragment contains the same data as the
12980   live node even though the data isn't yet recoverable.
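
  As a sketch, the decision for a local LCP thus reduces to the
  following (illustrative pseudo-code with assumed names, not the
  actual DBLQH code):

    Uint32 maxGciCompleted = fragment_fully_synchronised
      ? last_completed_gci   /* same as for a distributed LCP */
      : MAX(crestartNewestGci, max_completed_gci_in_restored_lcp);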
12981 
12982   Writing of local system file
12983   ............................
12984   Before we start a local LCP during recovery we write
12985   the local system file to indicate that the node can
12986   no longer be restored on its own until recovered again.
12987   This sets the following information in the local system
12988   file.
12989   1) Node restorable on its own flag is set to 0 (false).
12990   2) Flag indicating whether local LCPs removed is set to 0 (false).
12991   3) max GCP recoverable value is set to
12992   System Restart case: GCI cluster is restored to
12993   Node Restart case: GCI recoverable at the moment in cluster
12994 
12995   For node restarts we also write the local system file and update
12996   the max GCI recoverable value each time a GCI has been made
12997   recoverable.
12998 
12999   During recovery we read the local system file to discover
13000   whether we can be master in the system restart and also to
13001   discover if we can recover on our own.
13002 
13003   We propagate the max GCI recoverable value to DBLQH to ensure
13004   that we drop old LCP files that are not of any value in
13005   recovery any more.
13006 
13007   After completing the restart we finally write the local system
13008   file during phase 50. In this phase all recovery of data is
13009   completed and only initialisation of SUMA clients remains, so
13010   it is safe to write the local system file here again. This time
13011   we set the values to:
13012   1) Node restorable on its own flag is set to 1 (true)
13013   2) Flag indicating whether local LCPs removed is set to 0 (ignorable)
13014   3) max GCP recoverable value is set to 0 (ignorable)
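
  As an illustration (field names assumed, for exposition only), the
  two writes of the local system file can be thought of as:

    /* Before starting a local LCP during recovery: */
    sysfile.node_restorable_on_its_own = 0;
    sysfile.local_lcps_removed = 0;
    sysfile.max_gci_restorable = gci; /* restored or recoverable GCI */

    /* In phase 50, when recovery of data is completed: */
    sysfile.node_restorable_on_its_own = 1;
    sysfile.local_lcps_removed = 0;   /* ignorable */
    sysfile.max_gci_restorable = 0;   /* ignorable */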
13015 */
13016 void
13017 Backup::execLCP_PREPARE_REQ(Signal* signal)
13018 {
13019   jamEntry();
13020   LcpPrepareReq req = *(LcpPrepareReq*)signal->getDataPtr();
13021 
13022   BackupRecordPtr ptr;
13023   c_backupPool.getPtr(ptr, req.backupPtr);
13024 
13025   TablePtr tabPtr;
13026   FragmentPtr fragPtr;
13027 
13028   jamLine(req.tableId);
13029 
13030   ndbrequire(ptr.p->prepareState == NOT_ACTIVE);
13031   ptr.p->prepareState = PREPARE_READ_CTL_FILES;
13032   ptr.p->prepareErrorCode = 0;
13033 
13034   ptr.p->prepare_table.first(tabPtr);
13035   tabPtr.p->fragments.getPtr(fragPtr, 0);
13036 
13037   tabPtr.p->tableId = req.tableId;
13038   tabPtr.p->tableType = DictTabInfo::UserTable;
13039 
13040   fragPtr.p->fragmentId = req.fragmentId;
13041   fragPtr.p->scanned = 0;
13042   fragPtr.p->scanning = 0;
13043   fragPtr.p->tableId = req.tableId;
13044   fragPtr.p->createGci = req.createGci;
13045 
13046   if (req.backupId != ptr.p->backupId ||
13047       req.localLcpId != ptr.p->localLcpId ||
13048       !ptr.p->m_initial_lcp_started)
13049   {
13050     jam();
13051     /**
13052      * These variables are only set at the very first LCP_PREPARE_REQ in
13053      * an LCP. At this point there is no parallelism, so no need to
13054      * care for concurrency on the ptr object here.
13055      *
13056      * New LCP, reset per-LCP counters. Except for this reset, noOfBytes
13057      * and noOfRecords are handled by the LCP execution phase.
13058      */
13059     ptr.p->noOfBytes = 0;
13060     ptr.p->noOfRecords = 0;
13061     ptr.p->backupId = req.backupId;
13062     ptr.p->localLcpId = req.localLcpId;
13063     ptr.p->m_initial_lcp_started = true;
13064     ndbrequire(ptr.p->m_first_fragment == false);
13065     ptr.p->m_first_fragment = true;
13066     ptr.p->m_is_lcp_scan_active = false;
13067     ptr.p->m_current_lcp_lsn = Uint64(0);
13068     ptr.p->m_high_res_lcp_start_time = getHighResTimer();
13069     m_current_dd_time_us = Uint64(0);
13070     lcp_start_point(signal);
13071     DEB_LCP_STAT(("(%u)TAGS Start new LCP, id: %u", instance(), req.backupId));
13072     LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
13073                                   m_delete_lcp_file_head);
13074     ndbrequire(queue.isEmpty());
13075   }
13076 
13077   /**
13078    * We need to open both header files. One of them contains the latest
13079    * information from the last local checkpoint. We need however to
13080    * keep the old information around since this new LCP isn't immediately
13081    * useful for recovery. This also has the added benefit that we have the
13082    * files replicated. If we crash while we are still writing the new
13083    * header file we can always recover using the old header file. We
13084    * retain the old header file. This means that we need to open both
13085    * files to discover which of them is the most recent one. We should
13086    * use the older one to write the new header information into, but
13087    * we should use the newer header file to get the information about
13088    * which parts to perform the LCP on.
13089    */
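  /**
   * A sketch of the selection rule applied once both control files
   * have been read (see lcp_read_ctl_file_done below; simplified,
   * ignoring the case where neither file existed):
   *
   *   bool file0_newest =
   *     c0.LcpId > c1.LcpId ||
   *     (c0.LcpId == c1.LcpId && c0.LcpId != 0 &&
   *      c0.LocalLcpId > c1.LocalLcpId);
   *   // Read part info from the newest file and write the new
   *   // header into the other, older one.
   */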
13090   lcp_open_ctl_file(signal, ptr, 0);
13091   lcp_open_ctl_file(signal, ptr, 1);
13092 }
13093 
13094 /**
13095  * File processing for an LCP
13096  * --------------------------
13097  * At LCP_PREPARE_REQ we prepare the files for an LCP. There are two control
13098  * files for each fragment. These two files are both opened at prepare time.
13099  * One contains the description of the previous LCP and one contains the
13100  * description of the LCP before that one. Usually only one control file
13101  * exists per fragment since as soon as the LCP is fully completed we delete
13102  * the now oldest control file.
13103  *
13104  * So the steps are:
13105  * 1) Open both control files
13106  * 2) Find out which is the most recent control file.
13107  * 3) Use data from most recent control file to prepare which parts we will
13108  *    use for this LCP. Calculate the number of the next data file to use.
13109  * 4) Open the new data file for this LCP.
13110  *    The old data file(s) will still exist
13111  * 5) Prepare phase is completed
13112  * 6) Execute phase of LCP fills the data file with data from this LCP.
13113  * 7) Flush and close the new data file.
13114  * 8) Write new control file, flush and close it.
13115  * 9) Report LCP processing as completed.
13116  *
13117  * Step 10) and onwards is handled as a background process.
13118  *
13119  * 10)Calculate data files to delete after this LCP is completed.
13120  * 11)Delete old data files no longer needed.
13121  * 12)Delete the LCP control file no longer needed.
13122  */
13123 void Backup::lcp_open_ctl_file(Signal *signal,
13124                                BackupRecordPtr ptr,
13125                                Uint32 lcpNo)
13126 {
13127   FsOpenReq * req = (FsOpenReq *)signal->getDataPtrSend();
13128   req->userReference = reference();
13129   req->fileFlags =
13130     FsOpenReq::OM_READWRITE | FsOpenReq::OM_CREATE;
13131 
13132   /**
13133    * Compressed files do not support OM_READWRITE, so we will never
13134    * use compression for the LCP control files. The files will not
13135    * take up very much space. If it is necessary to support
13136    * compressed LCP control files then it is easy to do so by first
13137    * opening the LCP control files for read in this phase and then
13138    * when deciding which file to use for the next LCP we will close
13139    * both files and open the file to use with OM_CREATE and also
13140    * with OM_TRUNCATE to ensure we overwrite the old file
13141    * content.
13142    *
13143    * O_DIRECT requires very special write semantics which we don't
13144    * follow for CTL files. So we never set this option for CTL files.
13145    */
13146 
13147   FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF);
13148 
13149   /**
13150    * Lcp header file
13151    */
13152   BackupFilePtr filePtr;
13153   TablePtr tabPtr;
13154   FragmentPtr fragPtr;
13155 
13156   c_backupFilePool.getPtr(filePtr, ptr.p->prepareCtlFilePtr[lcpNo]);
13157   ptr.p->prepare_table.first(tabPtr);
13158   tabPtr.p->fragments.getPtr(fragPtr, 0);
13159 
13160   ndbrequire(filePtr.p->m_flags == 0);
13161   filePtr.p->m_flags |= BackupFile::BF_OPENING;
13162   filePtr.p->m_flags |= BackupFile::BF_HEADER_FILE;
13163   filePtr.p->tableId = RNIL; // Will force init
13164   req->userPointer = filePtr.i;
13165   FsOpenReq::setVersion(req->fileNumber, 5);
13166   FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_CTL);
13167   FsOpenReq::v5_setLcpNo(req->fileNumber, lcpNo);
13168   FsOpenReq::v5_setTableId(req->fileNumber, tabPtr.p->tableId);
13169   FsOpenReq::v5_setFragmentId(req->fileNumber, fragPtr.p->fragmentId);
13170   sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);
13171 }
13172 
13173 void
13174 Backup::lcp_open_ctl_file_done(Signal* signal,
13175                                BackupRecordPtr ptr,
13176                                BackupFilePtr filePtr)
13177 {
13178   /**
13179    * Header file has been opened, now time to read it.
13180    * Header file is never bigger than one page. Get page from list of
13181    * pages in the file record. Page comes from global page pool.
13182    */
13183   Page32Ptr pagePtr;
13184   FsReadWriteReq* req = (FsReadWriteReq*)signal->getDataPtrSend();
13185 
13186   filePtr.p->pages.getPtr(pagePtr, 0);
13187   filePtr.p->m_flags |= BackupFile::BF_READING;
13188 
13189   req->userPointer = filePtr.i;
13190   req->filePointer = filePtr.p->filePointer;
13191   req->userReference = reference();
13192   req->varIndex = 0;
13193   req->numberOfPages = 1;
13194   req->operationFlag = 0;
13195   FsReadWriteReq::setFormatFlag(req->operationFlag,
13196                                 FsReadWriteReq::fsFormatMemAddress);
13197   FsReadWriteReq::setPartialReadFlag(req->operationFlag, 1);
13198 
13199   Uint32 mem_offset = Uint32((char*)pagePtr.p - (char*)c_startOfPages);
13200   req->data.memoryAddress.memoryOffset = mem_offset;
13201   req->data.memoryAddress.fileOffset = 0;
13202   req->data.memoryAddress.size = BackupFormat::NDB_LCP_CTL_FILE_SIZE_BIG;
13203 
13204   sendSignal(NDBFS_REF, GSN_FSREADREQ, signal,
13205              FsReadWriteReq::FixedLength + 3, JBA);
13206 }
13207 
13208 void
13209 Backup::execFSREADREF(Signal *signal)
13210 {
13211   jamEntry();
13212 
13213   FsRef * ref = (FsRef *)signal->getDataPtr();
13214   const Uint32 userPtr = ref->userPointer;
13215 
13216   BackupFilePtr filePtr;
13217   c_backupFilePool.getPtr(filePtr, userPtr);
13218   /**
13219    * Since we create the file if it doesn't exist, this should not occur
13220    * unless something is completely wrong with the file system.
13221    */
13222   ndbabort();
13223 }
13224 
13225 void
13226 Backup::execFSREADCONF(Signal *signal)
13227 {
13228   jamEntry();
13229 
13230   FsConf * conf = (FsConf *)signal->getDataPtr();
13231   const Uint32 userPtr = conf->userPointer;
13232 
13233   BackupFilePtr filePtr;
13234   c_backupFilePool.getPtr(filePtr, userPtr);
13235 
13236   /**
13237    * If we created the file in the open call, then bytes_read will be 0.
13238    * This will distinguish a non-existing file from an existing file.
13239    */
13240   filePtr.p->bytesRead = conf->bytes_read;
13241   filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_READING;
13242 
13243   BackupRecordPtr ptr;
13244   c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
13245 
13246   if (ptr.p->deleteFilePtr == filePtr.i)
13247   {
13248     jam();
13249     ndbrequire(filePtr.p->bytesRead ==
13250                  BackupFormat::NDB_LCP_CTL_FILE_SIZE_SMALL ||
13251                filePtr.p->bytesRead ==
13252                  BackupFormat::NDB_LCP_CTL_FILE_SIZE_BIG);
13253     lcp_read_ctl_file_for_rewrite_done(signal, filePtr);
13254     return;
13255   }
13256   for (Uint32 i = 0; i < 2; i++)
13257   {
13258     jam();
13259     c_backupFilePool.getPtr(filePtr, ptr.p->prepareCtlFilePtr[i]);
13260     if ((filePtr.p->m_flags & BackupFile::BF_READING) ||
13261         (filePtr.p->m_flags & BackupFile::BF_OPENING))
13262     {
13263       jam();
13264       return;
13265     }
13266   }
13267   lcp_read_ctl_file_done(signal, ptr);
13268 }
13269 
13270 void
13271 Backup::lcp_read_ctl_file_done(Signal* signal, BackupRecordPtr ptr)
13272 {
13273   BackupFilePtr filePtr[2];
13274   for (Uint32 i = 0; i < 2; i++)
13275   {
13276     jam();
13277     c_backupFilePool.getPtr(filePtr[i], ptr.p->prepareCtlFilePtr[i]);
13278     DEB_EXTRA_LCP(("(%u)ctl: %u, bytesRead: %u",
13279                    instance(), i, filePtr[i].p->bytesRead));
13280     if (filePtr[i].p->bytesRead != 0)
13281     {
13282       Page32Ptr pagePtr;
13283       jam();
13284       filePtr[i].p->pages.getPtr(pagePtr, 0);
13285       lcp_read_ctl_file(pagePtr, filePtr[i].p->bytesRead, ptr);
13286     }
13287     else
13288     {
13289       Page32Ptr pagePtr;
13290       jam();
13291       filePtr[i].p->pages.getPtr(pagePtr, 0);
13292       lcp_init_ctl_file(pagePtr);
13293     }
13294   }
13295   Page32Ptr pagePtr0, pagePtr1;
13296   filePtr[0].p->pages.getPtr(pagePtr0, 0);
13297   filePtr[1].p->pages.getPtr(pagePtr1, 0);
13298   struct BackupFormat::LCPCtlFile *lcpCtlFilePtr0 =
13299     (struct BackupFormat::LCPCtlFile*)pagePtr0.p;
13300   struct BackupFormat::LCPCtlFile *lcpCtlFilePtr1 =
13301     (struct BackupFormat::LCPCtlFile*)pagePtr1.p;
13302   struct BackupFormat::LCPCtlFile *lcpCtlFilePtr;
13303   Uint32 closeLcpNumber;
13304   Uint32 dataFileNumber;
13305   Uint32 maxGciCompleted;
13306   Uint32 maxGciWritten;
13307   Uint32 createGci;
13308   Uint32 createTableVersion;
13309   Uint32 lqhCreateTableVersion;
13310 
13311   /**
13312    * Ignore LCP files that are not valid; a file that has
13313    * CreateTableVersion equal to 0 is also not valid. This kind of
13314    * file can be created during Drop Table processing.
13315    */
13316   if (lcpCtlFilePtr0->ValidFlag == 0 ||
13317       lcpCtlFilePtr0->CreateTableVersion == 0)
13318   {
13319     jam();
13320     lcpCtlFilePtr0->ValidFlag = 0;
13321     lcpCtlFilePtr0->LcpId = 0;
13322     lcpCtlFilePtr0->LocalLcpId = 0;
13323   }
13324   if (lcpCtlFilePtr1->ValidFlag == 0 ||
13325       lcpCtlFilePtr1->CreateTableVersion == 0)
13326   {
13327     jam();
13328     lcpCtlFilePtr1->ValidFlag = 0;
13329     lcpCtlFilePtr1->LcpId = 0;
13330     lcpCtlFilePtr1->LocalLcpId = 0;
13331   }
13332   if (lcpCtlFilePtr0->LcpId > lcpCtlFilePtr1->LcpId ||
13333       (lcpCtlFilePtr0->LcpId == lcpCtlFilePtr1->LcpId &&
13334        lcpCtlFilePtr0->LcpId != 0 &&
13335        lcpCtlFilePtr0->LocalLcpId > lcpCtlFilePtr1->LocalLcpId))
13336   {
13337     jam();
13338     dataFileNumber = lcpCtlFilePtr0->LastDataFileNumber;
13339     lcpCtlFilePtr = lcpCtlFilePtr1;
13340     ptr.p->prepareNextLcpCtlFileNumber = 1;
13341     closeLcpNumber = 0;
13342     createGci = lcpCtlFilePtr0->CreateGci;
13343     createTableVersion = lcpCtlFilePtr0->CreateTableVersion;
13344     maxGciCompleted = lcpCtlFilePtr0->MaxGciCompleted;
13345     maxGciWritten = lcpCtlFilePtr0->MaxGciWritten;
13346     ptr.p->prepareDeleteCtlFileNumber = closeLcpNumber;
13347     copy_prev_lcp_info(ptr, lcpCtlFilePtr0);
13348   }
13349   else
13350   {
13351     /**
13352      * Both can have the same LCP id. This should only happen when neither
13353      * of the files existed, in which case the LCP id should be 0. This
13354      * happens after a new table is created. When upgrading from 7.4 or
13355      * earlier this is handled as part of the node or cluster restart, so
13356      * that will not be the reason here.
13357      */
13358     jam();
13359     ndbrequire(lcpCtlFilePtr0->LcpId < lcpCtlFilePtr1->LcpId ||
13360                (lcpCtlFilePtr0->LcpId == lcpCtlFilePtr1->LcpId &&
13361                 (lcpCtlFilePtr0->LcpId == 0 ||
13362                  lcpCtlFilePtr0->LocalLcpId < lcpCtlFilePtr1->LocalLcpId)));
13363     dataFileNumber = lcpCtlFilePtr1->LastDataFileNumber;
13364     lcpCtlFilePtr = lcpCtlFilePtr0;
13365     ptr.p->prepareNextLcpCtlFileNumber = 0;
13366     createGci = lcpCtlFilePtr1->CreateGci;
13367     createTableVersion = lcpCtlFilePtr1->CreateTableVersion;
13368     maxGciCompleted = lcpCtlFilePtr1->MaxGciCompleted;
13369     maxGciWritten = lcpCtlFilePtr1->MaxGciWritten;
13370     closeLcpNumber = 1;
13371     ptr.p->prepareDeleteCtlFileNumber = closeLcpNumber;
13372     if (lcpCtlFilePtr1->LcpId == 0)
13373     {
13374       jam();
13375       /**
13376        * None of the files existed before, ensure that we don't delete
13377        * any data file since no one exists at this moment. Also ensure
13378        * that the other control file is removed.
13379        *
13380        * lcpCtlFilePtr1->LcpId == 0 => lcpCtlFilePtr0->LcpId == 0 since
13381        * lcpCtlFilePtr1->LcpId >= lcpCtlFilePtr0->LcpId when we come
13382        * here.
13383        *
13384        * We set m_num_parts_in_lcp to 0 to indicate this is first LCP for
13385        * this fragment and thus needs to always be a full LCP.
13386        */
13387       ptr.p->prepareDeleteCtlFileNumber = RNIL;
13388       ptr.p->m_prepare_num_parts_in_lcp = 0;
13389       ptr.p->m_prepare_max_parts_in_lcp = 0;
13390       ptr.p->m_prepare_scan_change_gci = 0;
13391       ptr.p->m_prepare_first_start_part_in_lcp = 0;
13392       ptr.p->preparePrevLcpId = 0;
13393       ptr.p->preparePrevLocalLcpId = 0;
13394       maxGciCompleted = 0;
13395       maxGciWritten = 0;
13396       TablePtr tabPtr;
13397       FragmentPtr fragPtr;
13398       ndbrequire(ptr.p->prepare_table.first(tabPtr));
13399       tabPtr.p->fragments.getPtr(fragPtr, 0);
13400       createGci = fragPtr.p->createGci;
13401       createTableVersion = c_lqh->getCreateSchemaVersion(tabPtr.p->tableId);
13402     }
13403     else
13404     {
13405       jam();
13406       copy_prev_lcp_info(ptr, lcpCtlFilePtr1);
13407     }
13408   }
13409   /**
13410    * prepareNextLcpCtlFileNumber is the index of the prepareCtlFilePtr
13411    * which will be kept for this LCP. We have written the data in its page
13412    * with i-value of 0. This is what lcpCtlFilePtr points to at the moment.
13413    * This is the page we will later write after completing the LCP of this
13414    * fragment.
13415    *
13416    * We always take the last data file number from the control file
13417    * we are about to close, which is the most recent one, and then
13418    * add one, modulo the max number of files, to get the
13419    * new last data file number.
13420    */
13421   dataFileNumber = get_file_add(dataFileNumber, 1);
13422   ptr.p->prepareFirstDataFileNumber = dataFileNumber;
13423   TablePtr tabPtr;
13424   FragmentPtr fragPtr;
13425   ndbrequire(ptr.p->prepare_table.first(tabPtr));
13426   tabPtr.p->fragments.getPtr(fragPtr, 0);
13427   ptr.p->prepareMaxGciWritten = maxGciWritten;
13428   lqhCreateTableVersion = c_lqh->getCreateSchemaVersion(tabPtr.p->tableId);
13429 
13430   Uint32 maxGci = MAX(maxGciCompleted, maxGciWritten);
13431   if ((maxGci < fragPtr.p->createGci &&
13432        maxGci != 0) ||
13433        (c_initial_start_lcp_not_done_yet &&
13434         (ptr.p->preparePrevLocalLcpId != 0 ||
13435          ptr.p->preparePrevLcpId != 0)))
13436   {
13437     jam();
13438     /**
13439      * This case is somewhat obscure. Due to the fact that we support the
13440      * config variable __at_restart_skip_indexes we can actually come here
13441      * for a table (should be a unique index table) that has an LCP file
13442      * remaining from the previous use of this table id. It is potentially
13443      * possible also when dropping a table while this node is down and then
13444      * creating it again before this node has started. In this case we could
13445      * come here and find an old LCP file. So what we do here is
13446      * drop the old LCP fragments and then restart the
13447      * LCP handling again with an empty set of LCP files as it should be.
13448      *
13449      * This means first closing the CTL files (deleting the older one and
13450      * keeping the newer one, to ensure we keep one CTL file until all data
13451      * files have been deleted and to integrate easily into the drop file
13452      * handling in this block).
13453      *
13454      * We can only discover this case in a cluster where the master is
13455      * on a 7.6 version. So in upgrade cases we won't discover this case
13456      * since we don't get the createGci from the DICT master in that case
13457      * when the fragment is created.
13458      *
13459      * We can also get here when doing an initial node restart and there
13460      * is old LCP files to clean up.
13461      */
13462     DEB_LCP(("(%u)TAGT Drop case: tab(%u,%u).%u (now %u),"
13463              " maxGciCompleted: %u,"
13464              " maxGciWritten: %u, createGci: %u",
13465             instance(),
13466             tabPtr.p->tableId,
13467             fragPtr.p->fragmentId,
13468             createTableVersion,
13469             c_lqh->getCreateSchemaVersion(tabPtr.p->tableId),
13470             maxGciCompleted,
13471             maxGciWritten,
13472             fragPtr.p->createGci));
13473 
13474     ptr.p->prepareState = PREPARE_DROP_CLOSE;
13475     closeFile(signal, ptr, filePtr[closeLcpNumber]);
13476     closeFile(signal,
13477               ptr,
13478               filePtr[ptr.p->prepareNextLcpCtlFileNumber],
13479               true,
13480               true);
13481     return;
13482   }
13483   /* Initialise page to write to next CTL file with new LCP id */
13484   lcp_set_lcp_id(ptr, lcpCtlFilePtr);
13485 
13486   DEB_LCP(("(%u)TAGC Use ctl file: %u, prev Lcp(%u,%u), curr Lcp(%u,%u)"
13487            ", next data file: %u, tab(%u,%u).%u"
13488            ", prevMaxGciCompleted: %u, createGci: %u",
13489            instance(),
13490            ptr.p->prepareNextLcpCtlFileNumber,
13491            ptr.p->preparePrevLcpId,
13492            ptr.p->preparePrevLocalLcpId,
13493            lcpCtlFilePtr->LcpId,
13494            lcpCtlFilePtr->LocalLcpId,
13495            dataFileNumber,
13496            tabPtr.p->tableId,
13497            fragPtr.p->fragmentId,
13498            c_lqh->getCreateSchemaVersion(tabPtr.p->tableId),
13499            maxGciCompleted,
13500            fragPtr.p->createGci));
13501 
13502   /**
13503    * lqhCreateTableVersion == 0 means that the table is no longer active.
13504    * We will continue as if things were ok; the table is being dropped, so
13505    * there is no need to abort here, and the file will be dropped anyway.
13506    */
13507   if (lqhCreateTableVersion != 0 &&
13508       lqhCreateTableVersion != createTableVersion)
13509   {
13510     g_eventLogger->info("(%u) tab(%u,%u) lqhCreateTableVersion: %u"
13511                         ", createTableVersion: %u",
13512                         instance(),
13513                         tabPtr.p->tableId,
13514                         fragPtr.p->fragmentId,
13515                         lqhCreateTableVersion,
13516                         createTableVersion);
13517   }
13518   ndbrequire(createTableVersion == lqhCreateTableVersion ||
13519              lqhCreateTableVersion == 0);
13520 
13521 
13522   /**
13523    * We close the file which was the previous LCP control file. We will
13524    * retain the oldest one and use it for this LCP; it will then
13525    * become the most recent one when we are done. We keep the one to
13526    * use open for now; it will be closed later in the LCP processing.
13527    */
13528   ndbrequire(ptr.p->prepareErrorCode == 0);
13529   closeFile(signal,
13530             ptr,
13531             filePtr[closeLcpNumber],
13532             true,
13533             (ptr.p->prepareDeleteCtlFileNumber == RNIL));
13534   return;
13535 }
13536 
13537 void
13538 Backup::copy_prev_lcp_info(BackupRecordPtr ptr,
13539                            struct BackupFormat::LCPCtlFile *lcpCtlFilePtr)
13540 {
13541   Uint32 next_start_part = 0;
13542   ndbrequire(lcpCtlFilePtr->NumPartPairs > 0);
13543   ptr.p->m_prepare_max_parts_in_lcp = lcpCtlFilePtr->MaxPartPairs;
13544   ptr.p->m_prepare_num_parts_in_lcp = lcpCtlFilePtr->NumPartPairs;
13545   jam();
13546   Uint32 total_parts = 0;
13547   for (Uint32 i = 0; i < ptr.p->m_prepare_num_parts_in_lcp; i++)
13548   {
13549     Uint32 start_part = lcpCtlFilePtr->partPairs[i].startPart;
13550     Uint32 num_parts = lcpCtlFilePtr->partPairs[i].numParts;
13551     next_start_part = get_part_add(start_part, num_parts);
13552     ptr.p->m_prepare_part_info[i].startPart = start_part;
13553     ptr.p->m_prepare_part_info[i].numParts = num_parts;
13554     total_parts += num_parts;
13555   }
13556   ndbrequire(total_parts == BackupFormat::NDB_MAX_LCP_PARTS);
13557   ptr.p->m_prepare_first_start_part_in_lcp = next_start_part;
13558   ptr.p->m_prepare_scan_change_gci = lcpCtlFilePtr->MaxGciCompleted;
13559   ptr.p->preparePrevLcpId = lcpCtlFilePtr->LcpId;
13560   ptr.p->preparePrevLocalLcpId = lcpCtlFilePtr->LocalLcpId;
13561 }
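
/**
 * Worked example of the invariant verified above: with
 * BackupFormat::NDB_MAX_LCP_PARTS == 2048, the part pairs
 * { (0, 512), (512, 1536) } cover all 2048 parts, and
 * next_start_part wraps to (512 + 1536) % 2048 == 0, which is then
 * recorded as m_prepare_first_start_part_in_lcp.
 */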
13562 
13563 Uint32
13564 Backup::get_part_add(Uint32 start_part, Uint32 num_parts)
13565 {
13566   return (start_part + num_parts) % BackupFormat::NDB_MAX_LCP_PARTS;
13567 }
13568 
13569 Uint32
13570 Backup::get_file_add(Uint32 start_file, Uint32 num_files)
13571 {
13572   return (start_file + num_files) % BackupFormat::NDB_MAX_LCP_FILES;
13573 }
13574 
13575 Uint32
13576 Backup::get_file_sub(Uint32 start_file, Uint32 num_files)
13577 {
13578   if (start_file >= num_files)
13579   {
13580     jam();
13581     return (start_file - num_files);
13582   }
13583   else
13584   {
13585     jam();
13586     return (start_file + BackupFormat::NDB_MAX_LCP_FILES - num_files);
13587   }
13588 }
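
/**
 * Example of the wrap-around arithmetic above: with
 * NDB_MAX_LCP_FILES == N, get_file_add(N - 1, 1) == 0 and
 * get_file_sub(0, 1) == N - 1, so data file numbers form a ring.
 */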
13589 
13590 void
13591 Backup::lcp_read_ctl_file(Page32Ptr pagePtr,
13592                           Uint32 bytesRead,
13593                           BackupRecordPtr ptr)
13594 {
13595   struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
13596     (struct BackupFormat::LCPCtlFile*)pagePtr.p;
13597   /**
13598    * This function reads the LCP Control file data and retrieves information
13599    * about:
13600    * 1) next starting part
13601    * 2) LCP id this file is a header for
13602    *
13603    * This information is used to decide which header file to close (the most
13604    * recent one) and which header file to use for the next LCP.
13605    */
13606   ndbrequire(BackupFormat::NDB_LCP_CTL_FILE_SIZE_SMALL == bytesRead ||
13607              BackupFormat::NDB_LCP_CTL_FILE_SIZE_BIG == bytesRead);
13608   if (!convert_ctl_page_to_host(lcpCtlFilePtr))
13609   {
13610     jam();
13611     lcp_init_ctl_file(pagePtr);
13612   }
13613   {
13614     TablePtr tabPtr;
13615     FragmentPtr fragPtr;
13616     ptr.p->prepare_table.first(tabPtr);
13617     tabPtr.p->fragments.getPtr(fragPtr, 0);
13618     ndbrequire(lcpCtlFilePtr->TableId == tabPtr.p->tableId);
13619     ndbrequire(lcpCtlFilePtr->FragmentId == fragPtr.p->fragmentId);
13620   }
13621 }
13622 
13623 /**
13624  * We compress before writing LCP control and after reading it we will
13625  * decompress the part information. In compressed format we use 3 bytes
13626  * to store two numbers that can at most be 2048. In uncompressed
13627  * format each part is a 16-bit unsigned integer.
13628  */
13629 #define BYTES_PER_PART 3
13630 /**
13631  * Define the LCP control file header size by removing the one part pair
13632  * included in the common header.
13633  */
13634 #define LCP_CTL_FILE_HEADER_SIZE (sizeof(BackupFormat::LCPCtlFile) - \
13635                                   sizeof(BackupFormat::PartPair))
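
/**
 * Worked example of the 3-byte encoding (see compress_part_pairs and
 * decompress_part_pairs below): startPart = 0x123, numParts = 0x456
 * is stored as
 *
 *   byte 0: 0x12 (startPart bits 4-11)
 *   byte 1: 0x63 (numParts bits 0-3 in the high nibble,
 *                 startPart bits 0-3 in the low nibble)
 *   byte 2: 0x45 (numParts bits 4-11)
 *
 * Decoding reverses this:
 *   startPart = (byte1 & 0xF) | (byte0 << 4)          == 0x123
 *   numParts  = ((byte1 >> 4) & 0xF) | (byte2 << 4)   == 0x456
 */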
13636 
13637 bool
13638 Backup::convert_ctl_page_to_host(
13639   struct BackupFormat::LCPCtlFile *lcpCtlFilePtr)
13640 {
13641   Uint32 *pageData = (Uint32*)lcpCtlFilePtr;
13642   Uint32 numPartPairs = ntohl(lcpCtlFilePtr->NumPartPairs);
13643   Uint32 real_bytes_read = LCP_CTL_FILE_HEADER_SIZE +
13644                            (BYTES_PER_PART * numPartPairs);
13645 
13646   /* Checksum is calculated on compressed network byte order */
13647   if (numPartPairs > BackupFormat::NDB_MAX_LCP_PARTS)
13648   {
13649     DEB_LCP(("(%u)numPartPairs: %x", instance(), numPartPairs));
13650     ndbassert(false);
13651     return false;
13652   }
13653   /**
13654    * Add 3 so that the conversion to a word count also covers a final
13655    * word that is only partially filled.
13656    */
13657   Uint32 words = (real_bytes_read + 3) / sizeof(Uint32);
13658   Uint32 chksum = 0;
13659   for (Uint32 i = 0; i < words; i++)
13660   {
13661     chksum ^= pageData[i];
13662   }
13663   ndbassert(chksum == 0);
13664 
13665   if (chksum != 0)
13666   {
13667     jam();
13668     ndbassert(false);
13669     return false;
13670   }
13671   /* Magic is written/read as is */
13672   lcpCtlFilePtr->fileHeader.BackupVersion =
13673     ntohl(lcpCtlFilePtr->fileHeader.BackupVersion);
13674   lcpCtlFilePtr->fileHeader.SectionType =
13675     ntohl(lcpCtlFilePtr->fileHeader.SectionType);
13676   lcpCtlFilePtr->fileHeader.SectionLength =
13677     ntohl(lcpCtlFilePtr->fileHeader.SectionLength);
13678   lcpCtlFilePtr->fileHeader.FileType =
13679     ntohl(lcpCtlFilePtr->fileHeader.FileType);
13680   lcpCtlFilePtr->fileHeader.BackupId =
13681     ntohl(lcpCtlFilePtr->fileHeader.BackupId);
13682   ndbrequire(lcpCtlFilePtr->fileHeader.BackupKey_0 == 0);
13683   ndbrequire(lcpCtlFilePtr->fileHeader.BackupKey_1 == 0);
13684   /* ByteOrder as is */
13685   lcpCtlFilePtr->fileHeader.NdbVersion =
13686     ntohl(lcpCtlFilePtr->fileHeader.NdbVersion);
13687   lcpCtlFilePtr->fileHeader.MySQLVersion =
13688     ntohl(lcpCtlFilePtr->fileHeader.MySQLVersion);
13689 
13690   lcpCtlFilePtr->ValidFlag = ntohl(lcpCtlFilePtr->ValidFlag);
13691   lcpCtlFilePtr->TableId = ntohl(lcpCtlFilePtr->TableId);
13692   lcpCtlFilePtr->FragmentId = ntohl(lcpCtlFilePtr->FragmentId);
13693   lcpCtlFilePtr->CreateTableVersion = ntohl(lcpCtlFilePtr->CreateTableVersion);
13694   lcpCtlFilePtr->CreateGci = ntohl(lcpCtlFilePtr->CreateGci);
13695   lcpCtlFilePtr->MaxGciCompleted = ntohl(lcpCtlFilePtr->MaxGciCompleted);
13696   lcpCtlFilePtr->MaxGciWritten = ntohl(lcpCtlFilePtr->MaxGciWritten);
13697   lcpCtlFilePtr->LcpId = ntohl(lcpCtlFilePtr->LcpId);
13698   lcpCtlFilePtr->LocalLcpId = ntohl(lcpCtlFilePtr->LocalLcpId);
13699   lcpCtlFilePtr->MaxPageCount = ntohl(lcpCtlFilePtr->MaxPageCount);
13700   lcpCtlFilePtr->MaxNumberDataFiles = ntohl(lcpCtlFilePtr->MaxNumberDataFiles);
13701   lcpCtlFilePtr->LastDataFileNumber = ntohl(lcpCtlFilePtr->LastDataFileNumber);
13702   lcpCtlFilePtr->MaxPartPairs = ntohl(lcpCtlFilePtr->MaxPartPairs);
13703   lcpCtlFilePtr->NumPartPairs = ntohl(lcpCtlFilePtr->NumPartPairs);
13704 
13705   ndbrequire(BackupFormat::NDB_LCP_CTL_FILE_SIZE_BIG >= real_bytes_read);
13706   ndbrequire(lcpCtlFilePtr->fileHeader.FileType ==
13707              BackupFormat::LCP_CTL_FILE);
13708   ndbrequire(memcmp(BACKUP_MAGIC, lcpCtlFilePtr->fileHeader.Magic, 8) == 0);
13709   ndbrequire(lcpCtlFilePtr->NumPartPairs <= lcpCtlFilePtr->MaxPartPairs);
13710   ndbrequire(lcpCtlFilePtr->NumPartPairs > 0);
13711   Uint32 total_parts;
13712   ndbrequire(lcpCtlFilePtr->fileHeader.BackupVersion >= NDBD_USE_PARTIAL_LCP_v2);
13713   lcpCtlFilePtr->RowCountLow = ntohl(lcpCtlFilePtr->RowCountLow);
13714   lcpCtlFilePtr->RowCountHigh = ntohl(lcpCtlFilePtr->RowCountHigh);
13715   total_parts = decompress_part_pairs(lcpCtlFilePtr,
13716                                       lcpCtlFilePtr->NumPartPairs,
13717                                       &lcpCtlFilePtr->partPairs[0]);
13718   ndbrequire(total_parts <= lcpCtlFilePtr->MaxPartPairs);
13719   return true;
13720 }
13721 
13722 void
13723 Backup::convert_ctl_page_to_network(Uint32 *page, Uint32 file_size)
13724 {
13725   struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
13726     (struct BackupFormat::LCPCtlFile*)page;
13727   Uint32 numPartPairs = lcpCtlFilePtr->NumPartPairs;
13728   Uint32 compressed_bytes_written = LCP_CTL_FILE_HEADER_SIZE +
13729                                     (BYTES_PER_PART * numPartPairs);
13730 
13731   /**
13732    * Add 3 to take into account that the last word might be
13733    * filled with only 1 byte of information.
13734    */
13735   ndbrequire(file_size >= (compressed_bytes_written + 3));
13736 
13737   ndbrequire(memcmp(BACKUP_MAGIC, lcpCtlFilePtr->fileHeader.Magic, 8) == 0);
13738   ndbrequire(lcpCtlFilePtr->fileHeader.FileType ==
13739              BackupFormat::LCP_CTL_FILE);
13740   ndbrequire(lcpCtlFilePtr->NumPartPairs <= lcpCtlFilePtr->MaxPartPairs);
13741   ndbrequire(lcpCtlFilePtr->NumPartPairs > 0);
13742   ndbrequire(lcpCtlFilePtr->fileHeader.NdbVersion >= NDBD_USE_PARTIAL_LCP_v2);
13743   ndbrequire(lcpCtlFilePtr->fileHeader.BackupVersion == NDBD_USE_PARTIAL_LCP_v2);
13744 
13745   /* Magic is written/read as is */
13746   lcpCtlFilePtr->fileHeader.BackupVersion =
13747     htonl(lcpCtlFilePtr->fileHeader.BackupVersion);
13748   lcpCtlFilePtr->fileHeader.SectionType =
13749     htonl(lcpCtlFilePtr->fileHeader.SectionType);
13750   lcpCtlFilePtr->fileHeader.SectionLength =
13751     htonl(lcpCtlFilePtr->fileHeader.SectionLength);
13752   lcpCtlFilePtr->fileHeader.FileType =
13753     htonl(lcpCtlFilePtr->fileHeader.FileType);
13754   lcpCtlFilePtr->fileHeader.BackupId =
13755     htonl(lcpCtlFilePtr->fileHeader.BackupId);
13756   ndbrequire(lcpCtlFilePtr->fileHeader.BackupKey_0 == 0);
13757   ndbrequire(lcpCtlFilePtr->fileHeader.BackupKey_1 == 0);
13758   /* ByteOrder as is */
13759   lcpCtlFilePtr->fileHeader.NdbVersion =
13760     htonl(lcpCtlFilePtr->fileHeader.NdbVersion);
13761   lcpCtlFilePtr->fileHeader.MySQLVersion =
13762     htonl(lcpCtlFilePtr->fileHeader.MySQLVersion);
13763 
13764   lcpCtlFilePtr->ValidFlag = htonl(lcpCtlFilePtr->ValidFlag);
13765   lcpCtlFilePtr->TableId = htonl(lcpCtlFilePtr->TableId);
13766   lcpCtlFilePtr->FragmentId = htonl(lcpCtlFilePtr->FragmentId);
13767   lcpCtlFilePtr->CreateTableVersion = htonl(lcpCtlFilePtr->CreateTableVersion);
13768   lcpCtlFilePtr->CreateGci = htonl(lcpCtlFilePtr->CreateGci);
13769   lcpCtlFilePtr->MaxGciCompleted = htonl(lcpCtlFilePtr->MaxGciCompleted);
13770   lcpCtlFilePtr->MaxGciWritten = htonl(lcpCtlFilePtr->MaxGciWritten);
13771   lcpCtlFilePtr->LcpId = htonl(lcpCtlFilePtr->LcpId);
13772   lcpCtlFilePtr->LocalLcpId = htonl(lcpCtlFilePtr->LocalLcpId);
13773   lcpCtlFilePtr->MaxPageCount = htonl(lcpCtlFilePtr->MaxPageCount);
13774   lcpCtlFilePtr->MaxNumberDataFiles = htonl(lcpCtlFilePtr->MaxNumberDataFiles);
13775   lcpCtlFilePtr->LastDataFileNumber = htonl(lcpCtlFilePtr->LastDataFileNumber);
13776 
13777   Uint32 maxPartPairs = lcpCtlFilePtr->MaxPartPairs;
13778   lcpCtlFilePtr->MaxPartPairs = htonl(lcpCtlFilePtr->MaxPartPairs);
13779   lcpCtlFilePtr->NumPartPairs = htonl(lcpCtlFilePtr->NumPartPairs);
13780 
13781   lcpCtlFilePtr->RowCountLow = htonl(lcpCtlFilePtr->RowCountLow);
13782   lcpCtlFilePtr->RowCountHigh = htonl(lcpCtlFilePtr->RowCountHigh);
13783 
13784   Uint32 total_parts = compress_part_pairs(lcpCtlFilePtr,
13785                                            numPartPairs,
13786                                            file_size);
13787   ndbrequire(total_parts <= maxPartPairs);
13788 
13789   /**
13790    * Checksum is calculated on compressed network byte order.
13791    * The checksum is calculated without regard to size decreasing due to
13792    * compression. This is not a problem since we fill the remainder with
13793    * zeroes and XOR doesn't change the checksum with extra zeroes.
13794    *
13795    * Add 3 to ensure that we move to word count in a correct manner.
13796    */
13797   lcpCtlFilePtr->Checksum = 0;
13798   Uint32 words = (compressed_bytes_written + 3) / sizeof(Uint32);
13799   Uint32 chksum = 0;
13800   for (Uint32 i = 0; i < words; i++)
13801   {
13802     chksum ^= page[i];
13803   }
13804   lcpCtlFilePtr->Checksum = chksum;
13805 }
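
/**
 * The page written above satisfies a simple round-trip invariant:
 * since Checksum was 0 while chksum was computed, the XOR over all
 * words of the final page, Checksum included, is 0. This is exactly
 * what convert_ctl_page_to_host() verifies on read-back:
 *
 *   Uint32 x = 0;
 *   for (Uint32 i = 0; i < words; i++)
 *     x ^= page[i];
 *   // x == 0 for an intact page
 */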
13806 
13807 Uint32
13808 Backup::compress_part_pairs(struct BackupFormat::LCPCtlFile *lcpCtlFilePtr,
13809                             Uint32 num_parts,
13810                             Uint32 file_size)
13811 {
13812   Uint32 total_parts = 0;
13813   unsigned char *part_array =
13814     (unsigned char*)&lcpCtlFilePtr->partPairs[0].startPart;
13815   for (Uint32 part = 0; part < num_parts; part++)
13816   {
13817     /**
13818      * Compress each 32-bit number down to a 12-bit word. This means that
13819      * we can fit up to 2048 parts in 8 kBytes.
13820      * The first byte stores the upper 8 bits of the 12-bit start
13821      * part, and bits 0-3 of the second byte store bits 0-3 of the start
13822      * part. The number of parts has bits 0-3 stored in bits 4-7 of the
13823      * second byte and bits 4-11 stored in the third byte.
13824      */
13825     Uint32 startPart = lcpCtlFilePtr->partPairs[part].startPart;
13826     Uint32 numParts = lcpCtlFilePtr->partPairs[part].numParts;
13827     ndbrequire(numParts <= BackupFormat::NDB_MAX_LCP_PARTS);
13828     Uint32 startPart_bit0_3 = (startPart & 0xF);
13829     Uint32 startPart_bit4_11 = (startPart >> 4) & 0xFF;
13830     Uint32 numParts_bit0_3 = (numParts & 0xF);
13831     Uint32 numParts_bit4_11 = (numParts >> 4) & 0xFF;
13832     part_array[0] = (unsigned char)startPart_bit4_11;
13833     part_array[1] = (unsigned char)(startPart_bit0_3 + (numParts_bit0_3 << 4));
13834     part_array[2] = (unsigned char)numParts_bit4_11;
13835     part_array += 3;
13836     total_parts += numParts;
13837     DEB_EXTRA_LCP(("(%u)compress:tab(%u,%u) Part(%u), start:%u, num_parts: %u",
13838                    instance(),
13839                    ntohl(lcpCtlFilePtr->TableId),
13840                    ntohl(lcpCtlFilePtr->FragmentId),
13841                    part,
13842                    startPart,
13843                    numParts));
13844   }
13845   ndbrequire(total_parts == BackupFormat::NDB_MAX_LCP_PARTS);
13846   unsigned char *start_pos = (unsigned char*)lcpCtlFilePtr;
13847   unsigned char *end_pos = start_pos + file_size;
13848   Uint64 remaining_size_64 = end_pos - part_array;
13849   ndbrequire(remaining_size_64 < file_size);
13850   Uint32 remaining_size = Uint32(remaining_size_64);
13851   memset(part_array, 0, remaining_size);
13852   return total_parts;
13853 }
13854 
13855 Uint32 Backup::decompress_part_pairs(
13856   struct BackupFormat::LCPCtlFile *lcpCtlFilePtr,
13857   Uint32 num_parts,
13858   struct BackupFormat::PartPair *partPairs)
13859 {
13860   Uint32 total_parts = 0;
13861   unsigned char *part_array = (unsigned char*)&partPairs[0].startPart;
13862   ndbrequire(num_parts <= BackupFormat::NDB_MAX_LCP_PARTS);
13863   memcpy(c_part_array, part_array, 3 * num_parts);
13864   Uint32 j = 0;
13865   for (Uint32 part = 0; part < num_parts; part++)
13866   {
13867     Uint32 part_0 = c_part_array[j+0];
13868     Uint32 part_1 = c_part_array[j+1];
13869     Uint32 part_2 = c_part_array[j+2];
13870     Uint32 startPart = ((part_1 & 0xF) + (part_0 << 4));
13871     Uint32 numParts = (((part_1 >> 4) & 0xF)) + (part_2 << 4);
13872     ndbrequire(numParts <= BackupFormat::NDB_MAX_LCP_PARTS);
13873     partPairs[part].startPart = startPart;
13874     partPairs[part].numParts = numParts;
13875     total_parts += numParts;
13876     DEB_EXTRA_LCP(("(%u)decompress:tab(%u,%u) Part(%u), start:%u, num_parts: %u",
13877                    instance(),
13878                    lcpCtlFilePtr->TableId,
13879                    lcpCtlFilePtr->FragmentId,
13880                    part,
13881                    startPart,
13882                    numParts));
13883     j += 3;
13884   }
13885   ndbassert(total_parts == BackupFormat::NDB_MAX_LCP_PARTS);
13886   return total_parts;
13887 }
13888 
13889 void
13890 Backup::lcp_init_ctl_file(Page32Ptr pagePtr)
13891 {
13892   const Uint32 sz = sizeof(BackupFormat::FileHeader) >> 2;
13893   struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
13894     (struct BackupFormat::LCPCtlFile*)pagePtr.p;
13895 
13896   memcpy(lcpCtlFilePtr->fileHeader.Magic, BACKUP_MAGIC, 8);
13897   lcpCtlFilePtr->fileHeader.BackupVersion = NDBD_USE_PARTIAL_LCP_v2;
13898   lcpCtlFilePtr->fileHeader.SectionType = BackupFormat::FILE_HEADER;
13899   lcpCtlFilePtr->fileHeader.SectionLength = sz - 3;
13900   lcpCtlFilePtr->fileHeader.FileType = BackupFormat::LCP_CTL_FILE;
13901   lcpCtlFilePtr->fileHeader.BackupId = 0;
13902   lcpCtlFilePtr->fileHeader.BackupKey_0 = 0;
13903   lcpCtlFilePtr->fileHeader.BackupKey_1 = 0;
13904   lcpCtlFilePtr->fileHeader.ByteOrder = 0x12345678;
13905   lcpCtlFilePtr->fileHeader.NdbVersion = NDB_VERSION_D;
13906   lcpCtlFilePtr->fileHeader.MySQLVersion = NDB_MYSQL_VERSION_D;
13907 
13908   /* Checksum needs to be calculated again before the write to disk */
13909   lcpCtlFilePtr->Checksum = 0;
13910   lcpCtlFilePtr->ValidFlag = 0;
13911   lcpCtlFilePtr->TableId = 0;
13912   lcpCtlFilePtr->FragmentId = 0;
13913   lcpCtlFilePtr->CreateTableVersion = 0;
13914   lcpCtlFilePtr->CreateGci = 0;
13915   lcpCtlFilePtr->MaxGciWritten = 0;
13916   lcpCtlFilePtr->MaxGciCompleted = 0;
13917   lcpCtlFilePtr->LcpId = 0;
13918   lcpCtlFilePtr->LocalLcpId = 0;
13919   lcpCtlFilePtr->MaxPageCount = 0;
13920   lcpCtlFilePtr->MaxNumberDataFiles = BackupFormat::NDB_MAX_LCP_FILES;
13921   lcpCtlFilePtr->LastDataFileNumber = BackupFormat::NDB_MAX_LCP_FILES - 1;
13922   lcpCtlFilePtr->MaxPartPairs = BackupFormat::NDB_MAX_LCP_PARTS;
13923   lcpCtlFilePtr->NumPartPairs = 1;
13924   lcpCtlFilePtr->RowCountLow = 0;
13925   lcpCtlFilePtr->RowCountHigh = 0;
13926   lcpCtlFilePtr->partPairs[0].startPart = 0;
13927   lcpCtlFilePtr->partPairs[0].numParts = BackupFormat::NDB_MAX_LCP_PARTS;
13928 }
13929 
13930 void
13931 Backup::lcp_close_prepare_ctl_file_done(Signal* signal,
13932                                         BackupRecordPtr ptr)
13933 {
13934   /**
13935    * We have closed the old LCP control file now. We have calculated the
13936    * number of the data file to be used in this LCP. We will now open this
13937    * data file to be used by this LCP.
13938    */
13939   lcp_open_data_file(signal, ptr);
13940 }
13941 
13942 void
13943 Backup::lcp_open_data_file(Signal* signal,
13944                            BackupRecordPtr ptr)
13945 {
13946   FsOpenReq * req = (FsOpenReq *)signal->getDataPtrSend();
13947   req->userReference = reference();
13948   req->fileFlags =
13949     FsOpenReq::OM_WRITEONLY |
13950     FsOpenReq::OM_TRUNCATE |
13951     FsOpenReq::OM_CREATE |
13952     FsOpenReq::OM_APPEND |
13953     FsOpenReq::OM_AUTOSYNC;
13954 
13955   if (c_defaults.m_compressed_lcp)
13956   {
13957     req->fileFlags |= FsOpenReq::OM_GZ;
13958   }
13959 
13960   if (c_defaults.m_o_direct)
13961   {
13962     req->fileFlags |= FsOpenReq::OM_DIRECT;
13963   }
13964 
13965   FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF);
13966   req->auto_sync_size = c_defaults.m_disk_synch_size;
13967 
13968   TablePtr tabPtr;
13969   FragmentPtr fragPtr;
13970   BackupFilePtr filePtr;
13971   Uint32 dataFileNumber;
13972 
13973   ndbrequire(ptr.p->prepare_table.first(tabPtr));
13974   tabPtr.p->fragments.getPtr(fragPtr, 0);
13975 
13976   c_backupFilePool.getPtr(filePtr, ptr.p->prepareDataFilePtr[0]);
13977   dataFileNumber = ptr.p->prepareFirstDataFileNumber;
13978   ndbrequire(ptr.p->prepareState == PREPARE_READ_CTL_FILES);
13979   ptr.p->prepareState = PREPARE_OPEN_DATA_FILE;
13980 
13981   ndbrequire(filePtr.p->m_flags == 0);
13982   filePtr.p->m_flags |= BackupFile::BF_OPENING;
13983   filePtr.p->tableId = RNIL; // Will force init
13984   req->userPointer = filePtr.i;
13985   FsOpenReq::setVersion(req->fileNumber, 5);
13986   FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_DATA);
13987   FsOpenReq::v5_setLcpNo(req->fileNumber, dataFileNumber);
13988   FsOpenReq::v5_setTableId(req->fileNumber, tabPtr.p->tableId);
13989   FsOpenReq::v5_setFragmentId(req->fileNumber, fragPtr.p->fragmentId);
13990   sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);
13991 }
13992 
13993 void
13994 Backup::lcp_open_data_file_late(Signal* signal,
13995                                 BackupRecordPtr ptr,
13996                                 Uint32 index)
13997 {
13998   FsOpenReq * req = (FsOpenReq *)signal->getDataPtrSend();
13999   req->userReference = reference();
14000   req->fileFlags =
14001     FsOpenReq::OM_WRITEONLY |
14002     FsOpenReq::OM_TRUNCATE |
14003     FsOpenReq::OM_CREATE |
14004     FsOpenReq::OM_APPEND |
14005     FsOpenReq::OM_AUTOSYNC;
14006 
14007   if (c_defaults.m_compressed_lcp)
14008   {
14009     req->fileFlags |= FsOpenReq::OM_GZ;
14010   }
14011 
14012   if (c_defaults.m_o_direct)
14013   {
14014     req->fileFlags |= FsOpenReq::OM_DIRECT;
14015   }
14016 
14017   FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF);
14018   req->auto_sync_size = c_defaults.m_disk_synch_size;
14019 
14020   TablePtr tabPtr;
14021   FragmentPtr fragPtr;
14022   BackupFilePtr filePtr;
14023   ndbrequire(ptr.p->tables.first(tabPtr));
14024   tabPtr.p->fragments.getPtr(fragPtr, 0);
14025 
14026   ndbrequire(index != 0);
14027   c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[index]);
14028 
14029   Uint32 dataFileNumber = get_file_add(ptr.p->m_first_data_file_number,
14030                                        index);
14031 
14032   ndbrequire(filePtr.p->m_flags == 0);
14033   filePtr.p->m_flags |= BackupFile::BF_OPENING;
14034   req->userPointer = filePtr.i;
14035   FsOpenReq::setVersion(req->fileNumber, 5);
14036   FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_DATA);
14037   FsOpenReq::v5_setLcpNo(req->fileNumber, dataFileNumber);
14038   FsOpenReq::v5_setTableId(req->fileNumber, tabPtr.p->tableId);
14039   FsOpenReq::v5_setFragmentId(req->fileNumber, fragPtr.p->fragmentId);
14040   sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);
14041 }
14042 
14043 void
14044 Backup::lcp_open_data_file_done(Signal* signal,
14045                                 BackupRecordPtr ptr)
14046 {
14047   TablePtr tabPtr;
14048   FragmentPtr fragPtr;
14049 
14050   ndbrequire(ptr.p->prepare_table.first(tabPtr));
14051   tabPtr.p->fragments.getPtr(fragPtr, 0);
14052 
14053   BackupFilePtr filePtr;
14054   c_backupFilePool.getPtr(filePtr, ptr.p->prepareDataFilePtr[0]);
14055   ndbrequire(filePtr.p->m_flags ==
14056              (BackupFile::BF_OPEN | BackupFile::BF_LCP_META));
14057   filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_LCP_META;
14058 
14059   ndbrequire(ptr.p->prepareState == PREPARE_READ_TABLE_DESC);
14060   ptr.p->prepareState = PREPARED;
14061 
14062   LcpPrepareConf* conf= (LcpPrepareConf*)signal->getDataPtrSend();
14063   conf->senderData = ptr.p->clientData;
14064   conf->senderRef = reference();
14065   conf->tableId = tabPtr.p->tableId;
14066   conf->fragmentId = fragPtr.p->fragmentId;
14067   sendSignal(ptr.p->masterRef, GSN_LCP_PREPARE_CONF,
14068 	     signal, LcpPrepareConf::SignalLength, JBA);
14069 }
14070 
14071 void
14072 Backup::lcp_set_lcp_id(BackupRecordPtr ptr,
14073                        struct BackupFormat::LCPCtlFile *lcpCtlFilePtr)
14074 {
14075   jam();
14076   lcpCtlFilePtr->fileHeader.BackupId = ptr.p->backupId;
14077   lcpCtlFilePtr->LcpId = ptr.p->backupId;
14078   lcpCtlFilePtr->LocalLcpId = ptr.p->localLcpId;
14079   if (ptr.p->backupId == ptr.p->preparePrevLcpId)
14080   {
14081     jam();
14082     ndbrequire(ptr.p->localLcpId > ptr.p->preparePrevLocalLcpId);
14083   }
14084   else
14085   {
14086     jam();
14087     ndbrequire(ptr.p->backupId > ptr.p->preparePrevLcpId);
14088   }
14089 }
14090 
14091 void
14092 Backup::lcp_copy_ctl_page(BackupRecordPtr ptr)
14093 {
14094   Page32Ptr page_ptr, recent_page_ptr;
14095   BackupFilePtr file_ptr, recent_file_ptr;
14096   Uint32 oldest = ptr.p->prepareNextLcpCtlFileNumber;
14097   ndbrequire(oldest <= 1);
14098   Uint32 recent = oldest == 0 ? 1 : 0;
14099   c_backupFilePool.getPtr(file_ptr, ptr.p->ctlFilePtr);
14100   c_backupFilePool.getPtr(recent_file_ptr, ptr.p->prepareCtlFilePtr[recent]);
14101   file_ptr.p->pages.getPtr(page_ptr, 0);
14102   recent_file_ptr.p->pages.getPtr(recent_page_ptr, 0);
14103   /**
14104    * It is important to note here that the page is currently in expanded
14105    * format, so before we copy it we calculate how much to copy.
14106    */
14107   {
14108     struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
14109       (struct BackupFormat::LCPCtlFile*)recent_page_ptr.p;
14110     Uint32 num_parts = lcpCtlFilePtr->NumPartPairs;
14111     Uint32 size_to_copy = LCP_CTL_FILE_HEADER_SIZE;
14112     size_to_copy += (num_parts * sizeof(struct BackupFormat::PartPair));
14113     memcpy(page_ptr.p,
14114            recent_page_ptr.p,
14115            size_to_copy);
14116   }
14117 #ifdef VM_TRACE
14118   {
14119     struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
14120       (struct BackupFormat::LCPCtlFile*)page_ptr.p;
14121     jam();
14122     Uint32 total_parts = 0;
14123     Uint32 num_parts = lcpCtlFilePtr->NumPartPairs;
14124     jamLine(num_parts);
14125     for (Uint32 i = 0; i < num_parts; i++)
14126     {
14127       Uint32 parts = lcpCtlFilePtr->partPairs[i].numParts;
14128       total_parts += parts;
14129       jamLine(parts);
14130     }
14131     jam();
14132     ndbassert(total_parts == BackupFormat::NDB_MAX_LCP_PARTS);
14133   }
14134 #endif
14135 }
14136 
14137 void
14138 Backup::setRestorableGci(Uint32 restorableGci)
14139 {
14140   jam();
14141   if (restorableGci > m_newestRestorableGci)
14142   {
14143     jam();
14144     m_newestRestorableGci = restorableGci;
14145   }
14146 }
14147 
14148 void
14149 Backup::lcp_update_ctl_page(BackupRecordPtr ptr,
14150                             Page32Ptr & page_ptr,
14151                             BackupFilePtr & file_ptr)
14152 {
14153   Uint32 maxCompletedGci;
14154   c_backupFilePool.getPtr(file_ptr, ptr.p->ctlFilePtr);
14155   file_ptr.p->pages.getPtr(page_ptr, 0);
14156   struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
14157     (struct BackupFormat::LCPCtlFile*)page_ptr.p;
14158 
14159   /**
14160    * An idle LCP cannot have written anything since the last LCP. The
14161    * last LCP was definitely restorable on disk, so there is no
14162    * need to set MaxGciCompleted to an unrestorable GCI since we
14163    * haven't written anything anyway.
14164    *
14165    * Thus for idle LCPs we never need to wait for a GCI to become
14166    * restorable. We reflect this by sending a max_gci_written equal to
14167    * the restorable GCI in the lcp_max_completed_gci call.
14168    */
14169   c_lqh->lcp_max_completed_gci(maxCompletedGci,
14170                                m_newestRestorableGci,
14171                                m_newestRestorableGci);
14172   lcpCtlFilePtr->MaxGciCompleted = maxCompletedGci;
14173   ptr.p->slaveState.setState(STOPPING);
14174   c_lqh->lcp_complete_scan(ptr.p->newestGci);
14175   if (ptr.p->newestGci != lcpCtlFilePtr->MaxGciWritten)
14176   {
14177     /**
14178      * Can happen when performing an LCP as part of a restart.
14179      * We will set the newestGci as part of the restore to
14180      * the GCI we restore.
14181      */
14182     DEB_LCP(("(%u)newestGci = %u, MaxGciWritten: %u, MaxGciCompleted: %u",
14183             instance(),
14184             ptr.p->newestGci,
14185             lcpCtlFilePtr->MaxGciWritten,
14186             lcpCtlFilePtr->MaxGciCompleted));
14187   }
14188   ndbassert(ptr.p->newestGci ==
14189             lcpCtlFilePtr->MaxGciWritten ||
14190             !m_our_node_started);
14191   /* Check that the schema version is ok; 0 means we're currently deleting the table */
14192   Uint32 lqhCreateTableVersion = c_lqh->getCreateSchemaVersion(lcpCtlFilePtr->TableId);
14193   ndbrequire(lcpCtlFilePtr->CreateTableVersion == lqhCreateTableVersion ||
14194              lqhCreateTableVersion == 0);
14195 
14196   lcpCtlFilePtr->MaxGciWritten = ptr.p->newestGci;
14197 
14198   ptr.p->m_wait_gci_to_delete = MAX(maxCompletedGci, ptr.p->newestGci);
14199 
14200   lcp_set_lcp_id(ptr, lcpCtlFilePtr);
14201 
14202   ndbrequire(lcpCtlFilePtr->MaxGciWritten <= m_newestRestorableGci);
14203   ndbrequire(m_newestRestorableGci != 0);
14204   /**
14205    * Idle LCPs also have to be careful to ensure that the LCP is valid
14206    * before we write it as valid. The reason is that otherwise we won't
14207    * find the LCP record in the UNDO log and would apply too many UNDO
14208    * log records.
14208    */
14209   TablePtr tabPtr;
14210   ptr.p->tables.first(tabPtr);
14211   Uint32 tableId = tabPtr.p->tableId;
14212   ptr.p->m_disk_data_exist = c_lqh->is_disk_columns_in_table(tableId);
14213   Uint32 valid_flag = lcp_pre_sync_lsn(ptr);
14214   ptr.p->m_lcp_lsn_synced = valid_flag;
14215   lcpCtlFilePtr->ValidFlag = valid_flag;
14216 
14217   DEB_LCP(("(%u)TAGY Handle idle LCP, tab(%u,%u).%u, maxGciCompleted = %u"
14218            ", validFlag = %u",
14219             instance(),
14220             lcpCtlFilePtr->TableId,
14221             lcpCtlFilePtr->FragmentId,
14222             lcpCtlFilePtr->CreateTableVersion,
14223             lcpCtlFilePtr->MaxGciCompleted,
14224             valid_flag));
14225 }
14226 
14227 void
14228 Backup::handle_idle_lcp(Signal *signal, BackupRecordPtr ptr)
14229 {
14230   /**
14231    * In the prepare phase we opened the data file; we need to
14232    * close this file before reporting back to DBLQH as completed.
14233    *
14234    * We also need to write the new LCP control file. The
14235    * contents we will take from the most recent LCP control
14236    * file updated with a new MaxGciCompleted.
14237    *
14238    * We need to move data files and control files to the
14239    * execution part since we will start preparing a new
14240    * LCP immediately after completing this signal execution.
14241    * An LCP_PREPARE_REQ is most likely waiting to be executed
14242    * as the next signal.
14243    */
14244   Page32Ptr page_ptr;
14245   BackupFilePtr file_ptr;
14246   ptr.p->m_empty_lcp = true;
14247   lcp_copy_ctl_page(ptr);
14248   lcp_update_ctl_page(ptr, page_ptr, file_ptr);
14249   ptr.p->deleteDataFileNumber = RNIL;
14250   lcp_write_ctl_file_to_disk(signal, file_ptr, page_ptr);
14251   lcp_close_data_file(signal, ptr, true);
14252   ptr.p->m_wait_disk_data_sync = false;
14253   ptr.p->m_wait_sync_extent = false;
14254   ptr.p->m_wait_data_file_close = false;
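  /**
   * Two asynchronous operations were issued above: the write of the
   * CTL file and the close of the data file. Their completion is
   * tracked through m_outstanding_operations.
   */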
14255   ptr.p->m_outstanding_operations = 2;
14256 }
14257 
14258 void
14259 Backup::prepare_parts_for_lcp(Signal *signal, BackupRecordPtr ptr)
14260 {
14261   /**
14262    * We need to switch in prepared data file and ctl file.
14263    * We make the previous execute data file and ctl file
14264    * record to be the new prepare data and ctl file record.
14265    */
14266   ptr.p->m_empty_lcp = false;
14267   calculate_number_of_parts(ptr);
14268 }
14269 
14270 void
14271 Backup::prepare_ranges_for_parts(BackupRecordPtr ptr,
14272                                  Uint32 in_parts)
14273 {
14274 #ifdef DEBUG_LCP
14275   TablePtr debTabPtr;
14276   FragmentPtr fragPtr;
14277   ptr.p->tables.first(debTabPtr);
14278   debTabPtr.p->fragments.getPtr(fragPtr, 0);
14279 #endif
14280   Uint64 parts = Uint64(in_parts);
14281   ndbrequire(parts > 0);
14282   Uint32 start_part = ptr.p->m_first_start_part_in_lcp;
14283   Uint64 parts_per_file = parts / Uint64(ptr.p->m_num_lcp_files);
14284   Uint64 parts_extra_in_first_file =
14285     parts - (parts_per_file * Uint64(ptr.p->m_num_lcp_files));
14286   for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
14287   {
14288     ptr.p->m_scan_info[i].m_start_all_part = start_part;
14289     Uint32 num_parts;
14290     if (i == 0)
14291     {
14292       num_parts = Uint32(parts_extra_in_first_file) + Uint32(parts_per_file);
14293     }
14294     else
14295     {
14296       num_parts = Uint32(parts_per_file);
14297     }
14298     ptr.p->m_scan_info[i].m_num_all_parts = num_parts;
14299     start_part = get_part_add(start_part, num_parts);
14300     DEB_LCP(("(%u)tab(%u,%u),m_scan_info[%u].start_all_part = %u,"
14301              " num_all_parts: %u",
14302              instance(),
14303              debTabPtr.p->tableId,
14304              fragPtr.p->fragmentId,
14305              i,
14306              ptr.p->m_scan_info[i].m_start_all_part,
14307              ptr.p->m_scan_info[i].m_num_all_parts));
14308   }
14309   Uint32 num_change_parts = BackupFormat::NDB_MAX_LCP_PARTS - parts;
14310   ptr.p->m_scan_info[ptr.p->m_num_lcp_files-1].m_start_change_part =
14311     start_part;
14312   ptr.p->m_scan_info[ptr.p->m_num_lcp_files-1].m_num_change_parts =
14313     num_change_parts;
14314   start_part = get_part_add(start_part, num_change_parts);
14315   ndbassert(start_part == ptr.p->m_first_start_part_in_lcp);
14316   ndbassert(is_partial_lcp_enabled() || num_change_parts == 0);
14317   DEB_LCP(("(%u)tab(%u,%u),m_scan_info[%u].start_change_part = %u,"
14318            " num_all_parts: %u",
14319            instance(),
14320            debTabPtr.p->tableId,
14321            fragPtr.p->fragmentId,
14322            ptr.p->m_num_lcp_files - 1,
14323            ptr.p->m_scan_info[ptr.p->m_num_lcp_files-1].m_start_change_part,
14324            ptr.p->m_scan_info[ptr.p->m_num_lcp_files-1].m_num_change_parts));
14325 }
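
/**
 * A worked example of the range split above (illustrative numbers, not
 * taken from the code): with in_parts = 1002, m_num_lcp_files = 4 and
 * m_first_start_part_in_lcp = 100 we get parts_per_file = 250 and
 * parts_extra_in_first_file = 2. The ranges, wrapping modulo
 * BackupFormat::NDB_MAX_LCP_PARTS (2048) through get_part_add, become:
 *
 *   file 0: ALL parts [100, 352)     (252 parts)
 *   file 1: ALL parts [352, 602)     (250 parts)
 *   file 2: ALL parts [602, 852)     (250 parts)
 *   file 3: ALL parts [852, 1102)    (250 parts)
 *   file 3: CHANGE parts [1102, 100) (1046 parts, wrapping at 2048)
 *
 * which closes the circle of 2048 parts, as the ndbassert above checks.
 */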
14326 
14327 void
14328 Backup::prepare_new_part_info(BackupRecordPtr ptr, Uint32 new_parts)
14329 {
14330   Uint32 remove_files = 0;
14331   ptr.p->m_num_parts_in_this_lcp = new_parts;
14332   Uint32 old_num_parts = ptr.p->m_num_parts_in_lcp;
14333   if (old_num_parts != 0)
14334   {
14335     Uint32 new_start_part = ptr.p->m_first_start_part_in_lcp;
14336     Uint32 new_end_part = new_start_part + new_parts;
14337     Uint32 old_start_part = ptr.p->m_part_info[0].startPart;
14338     Uint32 old_end_part = old_start_part;
14339     ndbrequire(new_start_part == old_start_part);
14340     jam();
14341     do
14342     {
14343       jam();
14344       Uint32 old_parts = ptr.p->m_part_info[remove_files].numParts;
14345       old_end_part += old_parts;
14346       if (old_end_part > new_end_part)
14347       {
14348         jam();
14349         /* This file has to be kept */
14350         break;
14351       }
14352       old_num_parts--;
14353       remove_files++;
14354     } while (old_num_parts > 0);
14355   }
14356   Uint32 remaining_files = ptr.p->m_num_parts_in_lcp - remove_files;
14357   /* First remove all files no longer used */
14358   for (Uint32 i = 0; i < remaining_files; i++)
14359   {
14360     ptr.p->m_part_info[i] = ptr.p->m_part_info[i + remove_files];
14361     DEB_EXTRA_LCP(("(%u)Parts(%u,%u)",
14362                    instance(),
14363                    ptr.p->m_part_info[i].startPart,
14364                    ptr.p->m_part_info[i].numParts));
14365   }
14366 
14367   /**
14368    * The first set of parts now likely contains too many parts. The new
14369    * set of parts has eaten into it from the start, so it needs to be
14370    * moved ahead by as many parts as we have eaten into it.
14371    */
14372   if (remaining_files >= 1)
14373   {
14374     jam();
14375     Uint32 new_first_part = get_part_add(
14376              ptr.p->m_scan_info[0].m_start_all_part, new_parts);
14377     Uint32 old_first_part = ptr.p->m_part_info[0].startPart;
14378     Uint32 decrement_parts;
14379     if (old_first_part > new_first_part)
14380     {
14381       jam();
14382       decrement_parts = (new_first_part +
14383                          BackupFormat::NDB_MAX_LCP_PARTS) - old_first_part;
14384     }
14385     else
14386     {
14387       jam();
14388       decrement_parts = new_first_part - old_first_part;
14389     }
14390     ndbrequire(decrement_parts < ptr.p->m_part_info[0].numParts);
14391     ptr.p->m_part_info[0].numParts -= decrement_parts;
14392     ptr.p->m_part_info[0].startPart = new_first_part;
14393     DEB_EXTRA_LCP(("(%u)New first data file span is (%u,%u)",
14394                    instance(),
14395                    ptr.p->m_part_info[0].startPart,
14396                    ptr.p->m_part_info[0].numParts));
14397   }
14398 
14399   /**
14400    * Calculate file numbers of files to delete after LCP is
14401    * completed.
14402    */
14403   ptr.p->m_lcp_remove_files = remove_files;
14404   if (remove_files == 0)
14405   {
14406     jam();
14407     ptr.p->deleteDataFileNumber = RNIL;
14408   }
14409   else
14410   {
14411     Uint32 move_back_files = remove_files + remaining_files;
14412     ptr.p->deleteDataFileNumber = get_file_sub(
14413       ptr.p->m_first_data_file_number,
14414       move_back_files);
14415 
14416     DEB_LCP(("(%u)m_first_data_file_number = %u, deleteDataFileNumber: %u,"
14417              " remove_files: %u",
14418              instance(),
14419              ptr.p->m_first_data_file_number,
14420              ptr.p->deleteDataFileNumber,
14421              remove_files));
14422   }
14423 
14424   /* Insert the new parts at the end */
14425   jamLineDebug(ptr.p->m_num_lcp_files);
14426   for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
14427   {
14428     jamDebug();
14429     ptr.p->m_part_info[old_num_parts + i].startPart =
14430       ptr.p->m_scan_info[i].m_start_all_part;
14431     ptr.p->m_part_info[old_num_parts + i].numParts =
14432       ptr.p->m_scan_info[i].m_num_all_parts;
14433     ndbrequire(ptr.p->m_part_info[old_num_parts + i].startPart <
14434                BackupFormat::NDB_MAX_LCP_PARTS);
14435     ndbrequire(ptr.p->m_part_info[old_num_parts + i].numParts <=
14436                BackupFormat::NDB_MAX_LCP_PARTS);
14437   }
14438   jamLineDebug(remaining_files);
14439   ptr.p->m_num_parts_in_lcp = ptr.p->m_num_lcp_files + remaining_files;
14440   ptr.p->m_max_parts_in_lcp = BackupFormat::NDB_MAX_LCP_PARTS;
14441 #ifdef VM_TRACE
14442   Uint32 total_parts = 0;
14443   jam();
14444   for (Uint32 i = 0; i < ptr.p->m_num_parts_in_lcp; i++)
14445   {
14446     Uint32 numParts = ptr.p->m_part_info[i].numParts;
14447     total_parts += numParts;
14448   }
14449   ndbassert(total_parts == BackupFormat::NDB_MAX_LCP_PARTS);
14450 #endif
14451 }
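
/**
 * A worked example of the part bookkeeping above (illustrative numbers):
 * assume the previous LCP consisted of four files spanning the part pairs
 * (0,512), (512,512), (1024,512), (1536,512) and that the new LCP writes
 * new_parts = 600 in a single file starting at part 0. The first old file
 * ends at part 512, which is <= 600, so it is removed (remove_files = 1).
 * The next old file spans (512,512); the new parts have eaten
 * 600 - 512 = 88 parts into it, so its span is adjusted to (600,424).
 * After appending the new file at the end, the spans are
 * (600,424), (1024,512), (1536,512), (0,600), which again cover exactly
 * BackupFormat::NDB_MAX_LCP_PARTS = 2048 parts as verified by the
 * VM_TRACE check above.
 */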
14452 
14453 Uint32
14454 Backup::calculate_min_parts(Uint64 row_count,
14455                             Uint64 row_change_count,
14456                             Uint64 mem_used,
14457                             Uint64 total_mem)
14458 {
14459   /**
14460    * Calculates
14461    *   min_parts = 1 + (2048 * k) / (k + p)
14462    * let y = row_change_count / row_count
14463    * let z = y * (mem_used / total_mem)
14464    * let k = y + z * 0.5
14465    * where k = (row_change_count / row_count) *
14466    *           (1 + 0.5 * (mem_used / total_mem))
14467    * let p = RecoveryWork configuration parameter
14468    *
14469    * as explained below.
14470    *
14471    * Broken down to:
14472    * memory_used = memory_used / (1024 * 1024)
14473    * total_memory = total_memory / (1024 * 1024)
14474    * This means we are ignoring anything below the MByte range to ensure
14475    * we don't overflow the 64 bits.
14476    */
14477 
14478   Uint32 recovery_work = get_recovery_work();
14479 
14480   if (!is_partial_lcp_enabled() || row_count == 0)
14481   {
14482     jam();
14483     /**
14484      * We have configured the defaults to be that we always execute a full LCP.
14485      * The LCP can still be a multi-file one, but we will never have to handle
14486      * anything related to CHANGE ROWS pages.
14487      *
14488      * If no rows exist in the table we might as well run a full LCP.
14489      */
14490     return BackupFormat::NDB_MAX_LCP_PARTS;
14491   }
14492   if (row_count < row_change_count)
14493   {
14494     jam();
14495     row_change_count = row_count;
14496   }
14497   mem_used /= Uint64(1024 * 1024);
14498   total_mem /= Uint64(1024 * 1024);
14499   if (total_mem == Uint64(0))
14500   {
14501     jam();
14502     total_mem = 1;
14503   }
14504 
14505   double y = double(row_change_count);
14506   y = y / double(row_count);
14507 
14508   double z = double(mem_used);
14509   z = z / double(total_mem);
14510   z = z * y;
14511 
14512   double k = y + (z / double(2));
14513 
14514   double parts = double(2048) * k;
14515 
14516   double p = double(recovery_work) / double(100);
14517   double parts_divisor = p + k;
14518 
14519   parts = parts / parts_divisor;
14520   parts = parts + double(1);
14521 
14522   Uint32 min_parts = Uint32(parts);
14523   ndbrequire(min_parts < Uint32(BackupFormat::NDB_MAX_LCP_PARTS));
14524   return min_parts;
14525 }
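
/**
 * A worked example of the formula (illustrative numbers): with 1% of the
 * rows changed (y = 0.01), the fragment using 10% of the LDM's data
 * memory (z = 0.01 * 0.1 = 0.001) and the default RecoveryWork of 50%
 * (p = 0.5) we get k = y + z / 2 = 0.0105 and
 *
 *   min_parts = 1 + (2048 * 0.0105) / (0.0105 + 0.5)
 *             = 1 + 21.504 / 0.5105
 *             ~ 43
 *
 * so roughly 2% of the 2048 parts are written in full, matching the
 * small-k approximation parts = k / p (here 2 * k) described in
 * calculate_number_of_parts below.
 */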
14526 
14527 /**
14528  * This function is closely related to the simulations performed by the
14529  * lcp_simulator.cc program. These simulations show that it is sufficient
14530  * to count as little as 70% of the inserts and still maintain the
14531  * same LCP size and recovery time. Even decreasing it to 50% means
14532  * that we can only temporarily increase the LCP by 3.3%, and decreasing
14533  * it to 40% means we can increase it by 6.7%. Even decreasing it to 0, and
14534  * thus only writing the changed rows after an insert with no extra LCP
14535  * speed-up due to inserts, would still only increase the maximum LCP size
14536  * by 30%. The default setting is now 40% and it can be set between 0
14537  * and 70%. There is no particular reason to set it higher than 70%.
14538  *
14539  * If faster restarts are desired one should instead set RecoveryWork
14540  * lower.
14541  *
14542  * Deletes were shown to need a bit more parts, so we treat a delete
14543  * as equivalent to 1.2 updates. There is no common use case for
14544  * massive deletes, so we do not make this configurable; it is
14545  * hard coded.
14546  *
14547  * The idea of how to apply this is to split up row_change_count into
14548  * an update part, an insert part and a delete part. We multiply
14549  * the update part by 1, the delete part by 1.2 and the insert part
14550  * by the configured InsertRecoveryWork (defaults to 0.4).
14551  */
14552 Uint64
14553 Backup::calculate_row_change_count(BackupRecordPtr ptr)
14554 {
14555   Uint64 insert_recovery_work = (Uint64)get_insert_recovery_work();
14556   Uint64 delete_recovery_work = (Uint64)DELETE_RECOVERY_WORK;
14557   Uint64 row_count = ptr.p->m_row_count;
14558   Uint64 prev_row_count = ptr.p->m_prev_row_count;
14559   Uint64 row_change_count = ptr.p->m_row_change_count;
14560   Uint64 decrease_row_change_count = 0;
14561   Uint64 new_rows, dropped_rows;
14562   if (row_count > prev_row_count)
14563   {
14564     jam();
14565     new_rows = row_count - prev_row_count;
14566     dropped_rows = 0;
14567     decrease_row_change_count = new_rows;
14568   }
14569   else
14570   {
14571     jam();
14572     new_rows = 0;
14573     dropped_rows = prev_row_count - row_count;
14574     decrease_row_change_count = dropped_rows;
14575   }
14576   if (decrease_row_change_count > row_change_count)
14577   {
14578     g_eventLogger->info("prev_row_count: %llu, row_count: %llu,"
14579                         " row_change_count: %llu",
14580                         prev_row_count,
14581                         row_count,
14582                         row_change_count);
14583   }
14584   ndbrequire(decrease_row_change_count <= row_change_count);
14585 
14586   row_change_count -= decrease_row_change_count;
14587 
14588   new_rows *= insert_recovery_work;
14589   new_rows /= (Uint64)100;
14590 
14591   dropped_rows *= delete_recovery_work;
14592   dropped_rows /= Uint64(100);
14593 
14594   row_change_count += new_rows;
14595   row_change_count += dropped_rows;
14596 
14597   return row_change_count;
14598 }
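
/**
 * A worked example of the weighting above (illustrative numbers): with
 * prev_row_count = 1,000,000, row_count = 1,200,000 and
 * row_change_count = 500,000 we get new_rows = 200,000, so the change
 * count is first reduced to 300,000 pure updates. The inserts are then
 * added back weighted by InsertRecoveryWork (default 40%):
 *
 *   row_change_count = 300,000 + (200,000 * 40) / 100 = 380,000
 *
 * Had the row count instead shrunk by 200,000 rows, those deletes would
 * have been weighted by DELETE_RECOVERY_WORK (1.2 updates per delete),
 * giving 300,000 + 240,000 = 540,000.
 */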
14599 
14600 Uint64
14601 Backup::get_total_memory()
14602 {
14603   Resource_limit res_limit;
14604   m_ctx.m_mm.get_resource_limit(RG_DATAMEM, res_limit);
14605   const Uint32 pages_used = res_limit.m_curr;
14606   const Uint64 dm_used = Uint64(pages_used) * Uint64(sizeof(GlobalPage));
14607   const Uint64 num_ldms = getLqhWorkers() != 0 ?
14608                          (Uint64)getLqhWorkers() : (Uint64)1;
14609   const Uint64 total_memory = dm_used / num_ldms;
14610   return total_memory;
14611 }
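
/**
 * A rough numeric sketch of the above (assuming the 32 KByte GlobalPage
 * size used for the data memory pool): with 131072 pages in use, dm_used
 * is 4 GByte, and with 4 LDM workers each LDM is attributed 1 GByte.
 * This per-LDM figure is the total_mem that calculate_min_parts compares
 * the fragment's memory usage against.
 */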
14612 
14613 void
14614 Backup::calculate_number_of_parts(BackupRecordPtr ptr)
14615 {
14616   /**
14617    * Here we decide on how many parts we need to use for this LCP.
14618    * As input we have:
14619    * 1) Row count
14620    * 2) Row change count since last LCP
14621    * => Percentage of rows changed since last LCP
14622    *
14623    *   The percentage of rows changed since last LCP is the most
14624    *   important to this algorithm. This gives us a minimum number of
14625    *   parts that we need to write as part of this LCP.
14626    *
14627    *   There is an overhead in not writing full LCPs. The overhead is
14628    *   dependent on the amount of changed rows in comparison with the
14629    *   percentage of parts written.
14630    *
14631    *   The overhead formula can be written as:
14632    *   (1 - x) * (y + 0.5 * z) / x
14633    *   where:
14634    *   x = percentage of parts fully written in this LCP
14635    *   y = percentage of rows changed since last LCP
14636    *   z = percentage of rows changed during LCP
14637    *
14638    *   The (1 - x) comes from the fact that only the parts not written
14639    *   have overhead for writing changed rows.
14640    *
14641    *   The y comes from the fact that writing changed rows is an overhead.
14642    *
14643    *   The 0.5 * z comes from the fact that writing changed rows during
14644    *   the LCP is also an overhead; however, only half of those rows will
14645    *   actually be written since the LCP scan will not see rows
14646    *   changed before the scan pointer.
14647    *
14648    *   The division comes from the fact that the first part of the formula
14649    *   is the overhead cost for one LCP. However, a full LCP consists of
14650    *   1/x LCPs.
14651    *
14652    *   We want to select an x such that the overhead becomes smaller
14653    *   than some selected value.
14654    *
14655    *   We can also have overhead in that we have written more parts
14656    *   than are actually needed. To avoid making this overhead
14657    *   unnecessarily big we will ensure that we never write any files
14658    *   that contain more than 1/8th of the parts. This means that at
14659    *   most we can get 12.5% overhead due to extra parts being written.
14660    *
14661    *   We will try to ensure that x is chosen such that overhead is
14662    *   smaller than p where p is the overhead percentage. p is
14663    *   configurable in the RecoveryWork parameter and can be set between
14664    *   25 and 100%. It defaults to 50%.
14665    *
14666    *   This means that we should at most require
14667    *   60% overhead compared to the data memory size. This number
14668    *   is based on the assumption that we don't have an extreme amount of
14669    *   small fragments with very small memory sizes; in that case the
14670    *   overhead of writing the table meta data would dominate the total
14671    *   overhead. So with most applications we can guarantee that the
14672    *   overhead stays below 60% and actually in most cases we will
14673    *   probably even have an overhead of around 40%.
14674    *
14675    *   So we want to select an x such that:
14676    *   (1 - x) (y + z*0.5) / x < p
14677    *
14678    *   Now at start of an LCP for a fragment we can treat both y and z
14679    *   as constants, so let us call (y + 0.5*z) k.
14680    *   =>
14681    *   (1 - x) * k < p * x
14682    *   =>
14683    *   k - k * x < p * x
14684    *   =>
14685    *   k < (k + p) * x
14686    *   =>
14687    *   x > k / (k + p)
14688    *   where k = y + 0.5 * z
14689    *
14690    *   Now x is the percentage of parts we should use, when x = 1 we have
14691    *   2048 parts. So replacing x by parts we get.
14692    *
14693    *   parts > 2048 * k / (k + p)
14694    *   We will select min_parts = 1 + (2048 * k) / (k + p)
14695    *
14696    *   Now we know the following:
14697    *   row_count, row_change_count, memory_used_in_fragment, total_memory_used
14698    *   This gives:
14699    *   y = row_change_count / row_count
14700    *   z = (row_change_count / row_count) *
14701    *       (memory_used_in_fragment / total_memory_used)
14702    *
14703    *   The calculation of z is a prediction based on history, so a sort of
14704    *   Bayesian average.
14705    *
14706    *   Now assume that the LCP has entered a steady state with a steady
14707    *   flow of writes going on.
14708    *
14709    *   When the k-value above is large we certainly benefit most from
14710    *   writing the entire set. If for example 70% of the data set was
14711    *   changed, the execution overhead of writing everything is only 50%
14712    *   and this certainly pays off in order to make restart faster by
14713    *   writing the entire data set in this case.
14714    *
14715    *   At the other end of the spectrum we have small k-values (around 1% or
14716    *   even smaller); in this case the above equation can be simplified to
14717    *   parts = k / p
14718    *   Thus p = 25% => parts = 4 * k
14719    *   p = 50% => parts = 2 * k
14720    *   p = 100% => parts = k
14721    *
14722    *   Now k is more or less the percentage of data changing between LCPs.
14723    *   So if we have a 1 TByte database and k is 1% we will write 10 GByte
14724    *   per LCP to the database. This means 10 GByte will be written to the
14725    *   REDO log (this can be smaller or larger since the REDO log has a 4 byte
14726    *   overhead per column, but only writes changed columns), and almost
14727    *   10 GByte will be written to the CHANGE pages in the partial LCP.
14728    *
14729    *   Thus with p = 25% we will write 60 GByte to disk, with p = 50% we will
14730    *   write 40 GByte to disk and with p = 100% we will write 30 GByte to
14731    *   disk to handle 10 Gbytes of writes.
14732    *
14733    *   The other side of the picture is that increasing p means that more
14734    *   storage space is needed for LCP files. We need (1 + p) * DataMemory
14735    *   of storage space for LCP files (unless we use compression, in which
14736    *   case this should be divided by at least 2). Actually the storage space
14737    *   should in the worst case be increased by 12.5% of the DataMemory
14738    *   size since we might need to keep LCP data no longer needed since
14739    *   we only delete LCP files and not parts of a file.
14740    *
14741    *   The third side of the picture is that higher p means longer time to
14742    *   read in the LCP at restart. If we assume in the above example that
14743    *   we use p = 25%, then x = 4% (40 GByte of parts) and 25 LCPs are needed
14744    *   to restore the data. In each such LCP there will be 10 GByte of updated
14745    *   rows extra, but only half of those need to be applied (mean value).
14746    *   Thus the extra processing during restart is p/2. So with p = 25%
14747    *   we will execute 12.5% more rows compared to if all rows fitted in
14748    *   one LCP. We will have to read all LCP files from disk though, so
14749    *   we need to read 25% more from disk during restart.
14750    *
14751    *   So thus it becomes natural to think of the p value as the
14752    *   work we are willing to put into recovery during normal operation.
14753    *   The more work we do during normal operation, the less work we need
14754    *   to do during recovery.
14755    *
14756    *   Thus we call the config parameter RecoveryWork, where small values
14757    *   mean lots of work done and higher values mean a smaller amount of
14758    *   work done.
14759    *
14760    *   Given that decreasing p beyond 25% increases the load of LCPs
14761    *   exponentially we set the minimum p to be 25%. Increasing
14762    *   p beyond 100% means exponentially smaller benefits with
14763    *   linearly increasing recovery work, so we set the upper limit at 100%
14764    *   for p.
14765    *
14766    *   It is still possible to use the old algorithm where we always
14767    *   write everything in each LCP. This is kept for better backwards
14768    *   compatibility and for risk-averse users. It also still works very
14769    *   well for smaller databases that update most of the data
14770    *   all the time.
14771    *
14772    *   Independent of all these settings we will never write any new LCP
14773    *   data files (only LCP control files will be updated) when no changes
14774    *   have been made to a table. This will be a great benefit to all
14775    *   database tables that are read-only most of the time.
14776    *
14777    * 3) Total memory size used for memory part of rows
14778    * => Memory size needed to log changed rows
14779    * => Memory size needed to write each part of the LCP
14780    *
14781    *   Total memory used gives us an indication if we need to bother about
14782    *   splitting it into parts at all. We don't care about parts smaller
14783    *   than 64 kBytes. Also we will never split it into parts smaller than
14784    *   64 kBytes.
14785    *
14786    * 4) Total memory space
14787    * 5) Number of LDMs in the node
14788    * => Approximate memory space used by this LDM
14789    *
14790    *   This gives us a good understanding how large this fragment is
14791    *   compared to the rest of the memory in this LDM.
14792    *
14793    * 6) Current disk write speed
14794    *
14795    *   This gives a good approximation of how long this particular
14796    *   fragment LCP will take; it also gives us an indication of how
14797    *   long the entire LCP will take.
14798    *
14799    * 7) Total REDO log size for our log part
14800    * 8) Total free REDO log size for our log part
14801    * 9) => Percentage used of REDO log for our log part
14802    * 10) We also keep the free REDO log size from the last LCP we executed
14803    *     and the timestamp from when we were last here. This helps us
14804    *     calculate the speed at which we are writing the REDO log.
14805    *
14806    *   We mainly use this to see if we are close to running out of REDO
14807    *   log; if we are, we need to speed up LCP processing by raising the
14808    *   speed of disk writes for LCP.
14809    *
14810    * 11) Time used for last distributed LCP
14811    * 12) Time used for last LCP locally
14812    */
14813 
14814   const Uint64 total_memory = get_total_memory();
14815 
14816   /**
14817    * There are four rules that apply for choosing the number of parts to
14818    * write all rows in.
14819    * 1) Make sure that overhead doesn't exceed p% for partial LCPs
14820    *    So we call this rule 1, rule 1 says that we will select the number
14821    *    of parts that gives p% overhead.
14822    *
14823    * 2) Avoid overhead when it doesn't provide any value. If e.g. 80%
14824    *    of the rows have been changed, then the calculation
14825    *    means that we're actually going to use less than 80% (about 78%)
14826    *    since that brings about p% overhead. Obviously there is no sense
14827    *    in creating overhead in this case since we will write 78% of the
14828    *    rows + 80% of the remaining 22%. Thus we get an overhead of 25%
14829    *    to save 4.4% of the row writes, which doesn't make a lot of sense.
14830    *
14831    *    Rule 2 says that we will select all parts if we have changed
14832    *    more than 70% of the rows. Otherwise rule 2 selects 0 parts.
14833    *
14834    *    An observation here is that during heavy delete patterns we will
14835    *    very often fall back to full LCPs since the number of rows is
14836    *    getting smaller whereas the number of changed rows is increasing.
14837    *
14838    *    In a sense this is positive since it means that we will quickly
14839    *    remove LCP files that contain deleted rows, this space might be
14840    *    needed by other tables that at the same time gets many inserts.
14841    *
14842    * 3) The number of pages sets a limit on how small the number of parts
14843    *    can be. So with 1 page we can only perform full LCPs, with 2 pages
14844    *    we can never checkpoint with less than 1024 parts, so the rule
14845    *    here is that we never go below 2048 divided by number of pages.
14846    *    This ensures that most of the time there is at least one page
14847    *    that will write ALL rows in the page.
14848    *
14849    *  4) The first LCP on a fragment must always be a full LCP.
14850    *     Rule 4 is 2048 parts for the first LCP, otherwise it is 0.
14851    *
14852    *  5) This rule says that the minimum number of parts is 1; we will
14853    *     never run an LCP with 0 parts.
14854    *
14855    * In conclusion we will select the rule that returns the highest number
14856    * of parts.
14857    */
14858   Uint64 row_count = ptr.p->m_row_count;
14859   Uint64 memory_used = ptr.p->m_memory_used_in_bytes;
14860   Uint64 row_change_count = calculate_row_change_count(ptr);
14861   Uint32 min_parts_rule1 = calculate_min_parts(row_count,
14862                                                row_change_count,
14863                                                memory_used,
14864                                                total_memory);
14865 
14866   Uint32 min_parts_rule2 = 0;
14867   if ((Uint64(10) * row_change_count) >
14868       (Uint64(7) * row_count))
14869   {
14870     jam();
14871     min_parts_rule2 = BackupFormat::NDB_MAX_LCP_PARTS;
14872   }
14873 
14874   Uint32 min_parts_rule3 = BackupFormat::NDB_MAX_LCP_PARTS;
14875   if (ptr.p->m_lcp_max_page_cnt > 1)
14876   {
14877     jam();
14878     min_parts_rule3 = BackupFormat::NDB_MAX_LCP_PARTS /
14879                         ptr.p->m_lcp_max_page_cnt;
14880   }
14881   Uint32 min_parts_rule4 = 0;
14882   if (ptr.p->preparePrevLcpId == 0)
14883   {
14884     jam();
14885     min_parts_rule4 = BackupFormat::NDB_MAX_LCP_PARTS;
14886   }
14887   /**
14888    * We can never go below 1 part, this is the absolute minimum even if
14889    * all rules say 0.
14890    */
14891   Uint32 min_parts_rule5 = 1;
14892   Uint32 parts = MAX(MAX(min_parts_rule1, min_parts_rule2),
14893                      MAX(min_parts_rule3,
14894                      MAX(min_parts_rule4, min_parts_rule5)));
14895 
14896   if (ERROR_INSERTED(10048) && min_parts_rule4 == 0)
14897   {
14898     /**
14899      * We need this in test cases to ensure that we can create a situation
14900      * with 1 part per LCP, with more than 980 parts and even close to
14901      * 2048 LCPs needed to restore an LCP.
14902      */
14903     jam();
14904     g_eventLogger->info("Set to 1 part by ERROR 10048 injection");
14905     parts = 1;
14906   }
14907 #ifdef DEBUG_LCP_STAT
14908   TablePtr debTabPtr;
14909   FragmentPtr fragPtr;
14910   ptr.p->tables.first(debTabPtr);
14911   debTabPtr.p->fragments.getPtr(fragPtr, 0);
14912   DEB_LCP_STAT(("(%u)tab(%u,%u), row_count: %llu, calc_row_change_count: %llu"
14913                 ", prev_row_count: %llu, "
14914                 "memory_used: %llu kB, total_dm_memory: %llu MB, "
14915                 "parts: %u, min_parts_rule1: %u, "
14916                 "min_parts_rule3: %u",
14917                 instance(),
14918                 debTabPtr.p->tableId,
14919                 fragPtr.p->fragmentId,
14920                 row_count,
14921                 row_change_count,
14922                 ptr.p->m_prev_row_count,
14923                 memory_used / 1024,
14924                 total_memory / (1024 * 1024),
14925                 parts,
14926                 min_parts_rule1,
14927                 min_parts_rule3));
14928 #endif
14929   /**
14930    * We have now calculated the parts to use in this LCP.
14931    * Now we need to calculate how many LCP files to use for this
14932    * LCP.
14933    *
14934    * The calculation of this is to use 1 file per 12.5% of the
14935    * parts. Each file must still be at least one fixed page
14936    * since this is what makes us choose which part something
14937    * goes into.
14938    */
14939   Uint32 min_file_rule_1 =
14940     (BackupFormat::NDB_MAX_FILES_PER_LCP * parts +
14941     ((BackupFormat::NDB_MAX_LCP_PARTS / BackupFormat::NDB_MAX_FILES_PER_LCP) -
14942       1)) /
14943     BackupFormat::NDB_MAX_LCP_PARTS;
14944   Uint32 min_file_rule = MAX(1, min_file_rule_1);
14945   Uint32 max_file_rule_1 = ptr.p->m_lcp_max_page_cnt;
14946   Uint32 max_file_rule_2 = BackupFormat::NDB_MAX_FILES_PER_LCP;
14947   Uint32 max_file_rule = MIN(max_file_rule_1, max_file_rule_2);
14948   max_file_rule = MAX(1, max_file_rule);
14949   Uint32 num_lcp_files = MIN(min_file_rule, max_file_rule);
14950   if (!is_partial_lcp_enabled())
14951   {
14952     /**
14953      * Not setting EnablePartialLcp to true mostly exists to be able to
14954      * use NDB in a manner as close to 7.5 as possible; this means also not
14955      * using 8 files when partial LCP isn't enabled. So we use only one
14956      * file here; it will always be full writes in this case.
14957      */
14958     jam();
14959     num_lcp_files = 1;
14960   }
14961   ptr.p->m_num_lcp_files = num_lcp_files;
14962   DEB_EXTRA_LCP(("(%u) min_file_rules1 = %u, max_file_rule1 = %u",
14963                  instance(),
14964                  min_file_rule_1,
14965                  max_file_rule_1));
14966   DEB_LCP(("(%u) LCP using %u files",
14967            instance(),
14968            ptr.p->m_num_lcp_files));
14969 
14970   /**
14971    * We will now prepare the BackupRecord such that it has all the
14972    * information set up to execute this LCP.
14973    */
14974   prepare_ranges_for_parts(ptr, parts);
14975   prepare_new_part_info(ptr, parts);
14976 }
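
/**
 * A worked example of the file count rules (illustrative numbers): with
 * parts = 1024 we get min_file_rule_1 = (8 * 1024 + 255) / 2048 = 4, so
 * four data files are used, each covering at most 12.5% of the part
 * space. With parts = 43 the same formula gives (344 + 255) / 2048 = 0,
 * which MAX(1, ...) lifts to a single file. A fragment with only one
 * page (max_file_rule_1 = 1) is likewise capped to one file.
 */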
14977 
14978 void
14979 Backup::lcp_swap_tables(BackupRecordPtr ptr,
14980                         TablePtr & tabPtr,
14981                         Uint32 tableId)
14982 {
14983   ptr.p->prepare_table.first(tabPtr);
14984   ndbrequire(tabPtr.p->tableId == tableId);
14985   ptr.p->prepare_table.removeFirst(tabPtr);
14986 
14987   TablePtr newPrepareTablePtr;
14988   ptr.p->tables.removeFirst(newPrepareTablePtr);
14989   ptr.p->tables.addFirst(tabPtr);
14990   ptr.p->prepare_table.addFirst(newPrepareTablePtr);
14991 }
14992 
14993 void
14994 Backup::lcp_swap_data_file(BackupRecordPtr ptr)
14995 {
14996   Uint32 newPrepareDataFilePtr = ptr.p->dataFilePtr[0];
14997   ptr.p->dataFilePtr[0] = ptr.p->prepareDataFilePtr[0];
14998   ptr.p->prepareDataFilePtr[0] = newPrepareDataFilePtr;
14999 }
15000 
15001 void
15002 Backup::lcp_swap_ctl_file(BackupRecordPtr ptr)
15003 {
15004   Uint32 newPrepareCtlFilePtr = ptr.p->ctlFilePtr;
15005   ptr.p->ctlFilePtr =
15006     ptr.p->prepareCtlFilePtr[ptr.p->prepareNextLcpCtlFileNumber];
15007   ptr.p->prepareCtlFilePtr[ptr.p->prepareNextLcpCtlFileNumber] =
15008     newPrepareCtlFilePtr;
15009 }
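
/**
 * Taken together, the three swap functions above exchange the prepare
 * and execute halves of the BackupRecord. A condensed sketch of the net
 * effect (not actual code from this block):
 *
 *   swap(prepare_table first entry, tables first entry); // lcp_swap_tables
 *   swap(prepareDataFilePtr[0], dataFilePtr[0]);         // lcp_swap_data_file
 *   swap(prepareCtlFilePtr[N], ctlFilePtr);              // lcp_swap_ctl_file,
 *                                          // N = prepareNextLcpCtlFileNumber
 *
 * After the swap, the records set up by LCP_PREPARE_REQ drive the
 * executing LCP, while the previous execute records become available for
 * preparing the next fragment's LCP.
 */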
15010 
15011 void
15012 Backup::copy_lcp_info_from_prepare(BackupRecordPtr ptr)
15013 {
15014   ptr.p->m_scan_change_gci = ptr.p->m_prepare_scan_change_gci;
15015   Uint32 total_parts = 0;
15016   for (Uint32 i = 0; i < ptr.p->m_prepare_num_parts_in_lcp; i++)
15017   {
15018     Uint32 num_parts = ptr.p->m_prepare_part_info[i].numParts;
15019     total_parts += num_parts;
15020     ptr.p->m_part_info[i] = ptr.p->m_prepare_part_info[i];
15021   }
15022   ndbrequire(total_parts == 0 || /* First LCP */
15023              total_parts == BackupFormat::NDB_MAX_LCP_PARTS);
15024 
15025   ptr.p->m_num_parts_in_lcp = ptr.p->m_prepare_num_parts_in_lcp;
15026   ptr.p->m_max_parts_in_lcp = ptr.p->m_prepare_max_parts_in_lcp;
15027   ptr.p->m_first_start_part_in_lcp =
15028     ptr.p->m_prepare_first_start_part_in_lcp;
15029   ptr.p->m_first_data_file_number = ptr.p->prepareFirstDataFileNumber;
15030   ptr.p->deleteCtlFileNumber = ptr.p->prepareDeleteCtlFileNumber;
15031 }
15032 
15033 /**
15034  * An important part of starting an LCP is to insert a record in the
15035  * UNDO log indicating the start of the LCP. This is used to ensure
15036  * that the main memory rows restored and the disk data restored are in
15037  * perfect synch with each other. This UNDO log record must be
15038  * completely synchronised with start of LCP scanning.
15039  */
15040 void
15041 Backup::lcp_write_undo_log(Signal *signal,
15042                            BackupRecordPtr ptr)
15043 {
15044   TablePtr tabPtr;
15045   ptr.p->tables.first(tabPtr);
15046   if (c_lqh->is_disk_columns_in_table(tabPtr.p->tableId))
15047   {
15048     jam();
15049     LcpFragOrd *ord = (LcpFragOrd*)signal->getDataPtr();
15050     FragmentPtr fragPtr;
15051     tabPtr.p->fragments.getPtr(fragPtr, 0);
15052     ord->tableId = tabPtr.p->tableId;
15053     ord->fragmentId = fragPtr.p->fragmentId;
15054     ord->lcpId = ptr.p->backupId;
15055     {
15056       Logfile_client lgman(this, c_lgman, 0);
15057       ptr.p->m_current_lcp_lsn = lgman.exec_lcp_frag_ord(signal,
15058                                c_lqh->get_current_local_lcp_id());
15059       ndbrequire(ptr.p->m_current_lcp_lsn > Uint64(0));
15060     }
15061   }
15062   else
15063   {
15064     jam();
15065     ptr.p->m_current_lcp_lsn = Uint64(0);
15066   }
15067 }
15068 
15069 /**
15070  * Start execution of LCP after receiving BACKUP_FRAGMENT_REQ
15071  *
15072  * When executing this method we know that there is no
15073  * LCP_PREPARE processing ongoing and there is no LCP
15074  * execution processing going on. So this is a safe place to
15075  * move data from prepare part of BackupRecord to execution
15076  * part of the BackupRecord.
15077  */
15078 void
15079 Backup::start_execute_lcp(Signal *signal,
15080                           BackupRecordPtr ptr,
15081                           TablePtr & tabPtr,
15082                           Uint32 tableId)
15083 {
15084   init_extended_lcp_stat();
15085   ptr.p->slaveState.setState(STARTED);
15086   ndbrequire(ptr.p->prepareState == PREPARED);
15087   ptr.p->prepareState = NOT_ACTIVE;
15088   ptr.p->m_lcp_lsn_synced = 1;
15089   ptr.p->m_num_lcp_data_files_open = 1;
15090   ptr.p->m_bytes_written = 0;
15091   ptr.p->m_row_scan_counter = 0;
15092   ptr.p->m_last_recorded_bytes_written = 0;
15093   ptr.p->m_pause_counter = 0;
15094   pausing_lcp(3,0);
15095 
15096   copy_lcp_info_from_prepare(ptr);
15097 
15098   /**
15099    * We need to switch places between the prepare table
15100    * and the execute table.
15101    */
15102   lcp_swap_tables(ptr, tabPtr, tableId);
15103   lcp_swap_data_file(ptr);
15104   lcp_swap_ctl_file(ptr);
15105 
15106   lcp_write_undo_log(signal, ptr);
15107   /**
15108    * With the introduction of Partial LCPs we need to calculate how
15109  * many parts should be part of this LCP.
15110    *
15111    * We tell LDM that we are about to start a new LCP. This means that
15112    * we want to know the number of rows changed since last LCP. We
15113    * want also to know the current number of rows to calculate the
15114    * proportion between updated rows and the number of rows in total
15115    * in the fragment.
15116    *
15117    * We treat 0 updated rows as a special case. This means that not a
15118    * single commit has changed any rows since the last LCP started.
15119    * In this special case we can actually still use the data files
15120    * from the old LCP. We do however still need to write a new LCP
15121    * control file. This is the case since we need to update the
15122    * MaxGciCompleted in the LCP control file which is very
15123    * important. It is this value which makes it possible for us to
15124    * use the LCP to cut the REDO log tail (which in principle is
15125    * the main reason for doing LCPs, to cut the REDO log tail).
15126    *
15127    * The 0 updated rows is most likely a very common case and will
15128    * save us radical amounts of REDO log processing in idle nodes.
15129    * If this is the very first LCP we are performing, then we
15130    * will still go ahead and perform the LCP to simplify the code.
15131    */
15132   c_lqh->get_lcp_frag_stats(ptr.p->m_row_count,
15133                             ptr.p->m_prev_row_count,
15134                             ptr.p->m_row_change_count,
15135                             ptr.p->m_memory_used_in_bytes,
15136                             ptr.p->m_lcp_max_page_cnt);
15137   Uint32 newestGci = c_lqh->get_lcp_newest_gci();
15138 
15139   FragmentPtr fragPtr;
15140   ptr.p->tables.first(tabPtr);
15141   tabPtr.p->fragments.getPtr(fragPtr, 0);
15142 #ifdef DEBUG_LCP_STAT
15143   DEB_LCP_STAT((
15144            "(%u)TAGY LCP_Start: tab(%u,%u).%u, row_count: %llu,"
15145            " row_change_count: %llu,"
15146            " prev_row_count: %llu,"
15147            " memory_used_in_bytes: %llu, max_page_cnt: %u, LCP lsn: %llu",
15148            instance(),
15149            tabPtr.p->tableId,
15150            fragPtr.p->fragmentId,
15151            c_lqh->getCreateSchemaVersion(tabPtr.p->tableId),
15152            ptr.p->m_row_count,
15153            ptr.p->m_row_change_count,
15154            ptr.p->m_prev_row_count,
15155            ptr.p->m_memory_used_in_bytes,
15156            ptr.p->m_lcp_max_page_cnt,
15157            ptr.p->m_current_lcp_lsn));
15158 #endif
15159 
15160   if (ptr.p->m_row_change_count == 0 &&
15161       ptr.p->preparePrevLcpId != 0 &&
15162       (ptr.p->prepareMaxGciWritten == newestGci &&
15163        m_our_node_started) &&
15164       c_pgman->idle_fragment_lcp(tabPtr.p->tableId,
15165                                  fragPtr.p->fragmentId))
15166   {
15167     /**
15168      * We don't handle it as an idle LCP when it is the first LCP
15169      * executed on the fragment. In this case we need to run a normal
15170      * LCP even if it produces an empty LCP data file.
15171      *
15172      * Also if someone has committed a transaction on the fragment
15173      * we will not treat it as an idle LCP even if the row change
15174      * count is still 0.
15175      */
15176     jam();
15177     handle_idle_lcp(signal, ptr);
15178     return;
15179   }
15180   else
15181   {
15182     jam();
15183     prepare_parts_for_lcp(signal, ptr);
15184   }
15185 }
15186 
15187 /**
15188  * We have finished writing of a fragment, the file is written to
15189  * disk and we can start the complete processing of the LCP for
15190  * this fragment.
15191  */
15192 void
15193 Backup::lcp_close_data_file(Signal* signal,
15194                             BackupRecordPtr ptr,
15195                             bool delete_flag)
15196 {
15197   BackupFilePtr filePtr;
15198   c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[0]);
15199   closeFile(signal, ptr, filePtr, false, delete_flag);
15200 }
15201 
15202 void
15203 Backup::lcp_start_complete_processing(Signal *signal, BackupRecordPtr ptr)
15204 {
15205   /**
15206    * We start waiting here for 2 parallel events.
15207    * 1) Sync:ing page cache and extent pages
15208    * 2) Finalising write of LCP data file and closing it
15209    *
15210    * After these events are ready we will check if the LSN has been synched
15211    * yet. If it hasn't, we will still write the LCP control file, but we
15212    * will write it with an invalid flag set. We will later rewrite it before
15213    * deleting the data files.
15214    *
15215    * When all of those are done we will write the control file and when this
15216    * write is completed and the file closed then we will report the LCP back
15217    * as completed.
15218    *
15219    * The only reason for syncing the UNDO log is to ensure that if no
15220    * pages at all were written as part of the LCP for the fragment, the
15221    * UNDO_LCP log record is still flushed to
15222    * disk. We get the LSN of the UNDO_LCP record from DBLQH.
15223    *
15224    * When we sync the pages we will ensure that any writes will also
15225    * sync the UNDO log to the proper point. So we need not worry about
15226    * losing any UNDO log records as long as we sync the page cache for
15227    * a fragment as part of LCP processing. This is called the
15228    * WAL rule.
15229    *
15230    * Sync:ing the extent pages will write all dirty extent pages, so no
15231    * special phase is needed to write those at the end of all fragment
15232    * LCPs.
15233    *
15234    *
15235    * Sync:ing happens in two stages.
15236    * The first stage is syncing all data pages in the PGMAN that executes
15237    * in the same thread as we do. This goes through the list of dirty pages
15238    * on the fragment and syncs them one by one, with potential throttling of
15239    * write speed here.
15240    *
15241    * The second stage is syncing the extent pages. This always happens in
15242    * the PGMAN proxy block that takes care of the extent pages. Here we
15243    * sync all extent pages that are dirty for each fragment checkpoint. The
15244    * reason is that one extent page is shared by many fragments. Also, the
15245    * extent pages are only updated when we allocate a new page, allocate a
15246    * new extent or free an extent (this only happens at drop table). So normally
15247    * we should only dirty a page when adding another page to a fragment.
15248    * Also many of those writes will usually occur on the same fragment and
15249    * thus the number of writes on those pages will only be high when there
15250    * is high insert activity into the database. Also each extent page covers
15251    * about 1.3 GByte of disk space. So even with 10 TByte of disk space we
15252    * only have a total of 7000 extent pages. So the activity on writing those
15253    * to disk cannot be very high.
15254    *
15255    * By sync:ing data pages and extent pages after writing the main memory
15256    * part of the fragment to disk we are sure that we can recover using this
15257    * fragment LCP. After this we are ready to write the control files for
15258    * this LCP. The LCP is still not 100% ready to use, it still will have
15259    * to wait until the global checkpoint is completed of its highest GCI
15260    * that was written as part of the checkpoint.
15261    *
15262    * As explained in another place it is actually only necessary to sync
15263    * the extent pages for the first fragment containing disk data and
15264    * also at the end of the local checkpoint.
15265    *
15266    * We don't need to wait for this however since the restart will check
15267    * that we don't recover an LCP which has more recent GCI's than we are
15268    * to restore. We must however wait with deleting the old LCP control
15269    * file and data files until we have seen the GCI being completed that
15270    * we wait for.
15271    *
15272    * The localisation of LCP handling and immediate removal of old LCPs
15273    * means that we can no longer restore any older GCPs than the last
15274    * completed one. If a requirement comes up for this it is fairly
15275    * straightforward to add this feature. What is needed is that we wait
15276    * for yet some more time before deleting an old LCP. If we e.g. want
15277    * to support restoring up to 100 GCI's back from the last completed
15278    * then we have to wait for 100 GCI's after completing the one we waited
15279    * for before we can remove the old LCP files. This might require us to
15280    * maintain many LCP control files. One could handle this by ensuring
15281    * that new LCPs aren't started so fast in this case.
15282    *
15283    * However most likely there are better options to restore old versions
15284    * of the database by using backups.
15285    */
15286 
15287   ptr.p->m_wait_data_file_close = true;
15288   ptr.p->m_wait_disk_data_sync = true;
15289   ptr.p->m_wait_sync_extent = true;
15290   ptr.p->m_disk_data_exist = false;
15291 
15292   if (ptr.p->m_current_lcp_lsn == Uint64(0))
15293   {
15294     /**
15295      * No entry was created in the log file group, thus the table isn't a
15296      * disk data table. So we can safely skip going to PGMAN to sync data pages.
15297      */
15298     jam();
15299     ptr.p->m_wait_disk_data_sync = false;
15300     if (ptr.p->m_first_fragment)
15301     {
15302       jam();
15303       send_firstSYNC_EXTENT_PAGES_REQ(signal, ptr);
15304       return;
15305     }
15306     ptr.p->m_wait_sync_extent = false;
15307     lcp_write_ctl_file(signal, ptr);
15308     return;
15309   }
15310   BlockReference ref = numberToRef(PGMAN, instance(), getOwnNodeId());
15311   TablePtr tabPtr;
15312   FragmentPtr fragPtr;
15313   ptr.p->tables.first(tabPtr);
15314   tabPtr.p->fragments.getPtr(fragPtr, 0);
15315   ptr.p->m_num_sync_pages_waiting = Uint32(~0);
15316   ptr.p->m_start_sync_op = getHighResTimer();
15317 
15318   SyncPageCacheReq *sync_req = (SyncPageCacheReq*)signal->getDataPtrSend();
15319   sync_req->senderData = ptr.i;
15320   sync_req->senderRef = reference();
15321   sync_req->tableId = tabPtr.p->tableId;
15322   sync_req->fragmentId = fragPtr.p->fragmentId;
15323   sendSignal(ref, GSN_SYNC_PAGE_CACHE_REQ, signal,
15324              SyncPageCacheReq::SignalLength, JBB);
15325 }
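
/**
 * A condensed view of the completion wait logic started above (a reading
 * of this block's flags, not a new protocol): lcp_write_ctl_file() is a
 * no-op until all three of these flags have been cleared:
 *
 *   m_wait_data_file_close - cleared by lcp_close_data_file_conf()
 *   m_wait_disk_data_sync  - cleared by execSYNC_PAGE_CACHE_CONF()
 *   m_wait_sync_extent     - cleared by execSYNC_EXTENT_PAGES_CONF()
 *                            (or immediately when no extent sync is needed)
 *
 * Whichever confirmation arrives last triggers the actual control file
 * write, so the three events may complete in any order.
 */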
15326 
15327 void
15328 Backup::execSYNC_PAGE_WAIT_REP(Signal *signal)
15329 {
15330   jamEntry();
15331   BackupRecordPtr ptr;
15332   c_backupPool.getPtr(ptr, signal->theData[0]);
15333   if (ptr.p->m_wait_disk_data_sync)
15334   {
15335     jam();
15336     ptr.p->m_num_sync_pages_waiting = signal->theData[1];
15337   }
15338   else if (ptr.p->m_wait_sync_extent ||
15339            ptr.p->m_wait_final_sync_extent)
15340   {
15341     jam();
15342     ptr.p->m_num_sync_extent_pages_written = signal->theData[1];
15343   }
15344   else
15345   {
15346     ndbabort();
15347   }
15348 }
15349 
15350 void
15351 Backup::execSYNC_PAGE_CACHE_CONF(Signal *signal)
15352 {
15353   SyncPageCacheConf *conf = (SyncPageCacheConf*)signal->getDataPtr();
15354   BackupRecordPtr ptr;
15355   TablePtr tabPtr;
15356   FragmentPtr fragPtr;
15357   jamEntry();
15358 
15359   c_backupPool.getPtr(ptr, conf->senderData);
15360   ptr.p->m_num_sync_pages_waiting = 0;
15361   ptr.p->tables.first(tabPtr);
15362   tabPtr.p->fragments.getPtr(fragPtr, 0);
15363   ndbrequire(conf->tableId == tabPtr.p->tableId);
15364   ndbrequire(conf->fragmentId == fragPtr.p->fragmentId);
15365 
15366   NDB_TICKS now = getHighResTimer();
15367   Uint64 elapsed_us = NdbTick_Elapsed(ptr.p->m_start_sync_op, now).microSec();
15368   m_current_dd_time_us += elapsed_us;
15369 
15370   DEB_LCP_DD(("(%u)Completed SYNC_PAGE_CACHE_CONF for tab(%u,%u)"
15371               ", diskDataExistFlag: %u",
15372              instance(),
15373              tabPtr.p->tableId,
15374              fragPtr.p->fragmentId,
15375              conf->diskDataExistFlag));
15376 
15377   ptr.p->m_wait_disk_data_sync = false;
15378   if (conf->diskDataExistFlag)
15379   {
15380     jam();
15381     ptr.p->m_disk_data_exist = true;
15382   }
15383   if (!ptr.p->m_first_fragment)
15384   {
15385     jam();
15386     ptr.p->m_wait_sync_extent = false;
15387     lcp_write_ctl_file(signal, ptr);
15388     return;
15389   }
15390   send_firstSYNC_EXTENT_PAGES_REQ(signal, ptr);
15391 }
15392 
15393 void
15394 Backup::send_firstSYNC_EXTENT_PAGES_REQ(Signal *signal,
15395                                         BackupRecordPtr ptr)
15396 {
15397   ptr.p->m_num_sync_extent_pages_written = Uint32(~0);
15398   ptr.p->m_start_sync_op = getHighResTimer();
15399   /**
15400    * Sync extent pages, this is sent to Proxy block that routes the signal to
15401    * the "extra" PGMAN worker that handles the extent pages.
15402    */
15403   SyncExtentPagesReq *req = (SyncExtentPagesReq*)signal->getDataPtrSend();
15404   req->senderData = ptr.i;
15405   req->senderRef = reference();
15406   req->lcpOrder = SyncExtentPagesReq::FIRST_LCP;
15407   ptr.p->m_first_fragment = false;
15408   sendSignal(PGMAN_REF, GSN_SYNC_EXTENT_PAGES_REQ, signal,
15409              SyncExtentPagesReq::SignalLength, JBB);
15410 }
15411 
15412 void
15413 Backup::execSYNC_EXTENT_PAGES_CONF(Signal *signal)
15414 {
15415   SyncExtentPagesConf *conf = (SyncExtentPagesConf*)signal->getDataPtr();
15416   BackupRecordPtr ptr;
15417   jamEntry();
15418 
15419   c_backupPool.getPtr(ptr, conf->senderData);
15420   ptr.p->m_num_sync_extent_pages_written = 0;
15421 
15422   NDB_TICKS now = getHighResTimer();
15423   Uint64 elapsed_us = NdbTick_Elapsed(ptr.p->m_start_sync_op, now).microSec();
15424   m_current_dd_time_us += elapsed_us;
15425 
15426   if (ptr.p->slaveState.getState() == DEFINED)
15427   {
15428     jam();
15429     finish_end_lcp(signal, ptr);
15430     return;
15431   }
15432   ndbrequire(ptr.p->slaveState.getState() == STOPPING);
15433   ptr.p->m_wait_sync_extent = false;
15434   lcp_write_ctl_file(signal, ptr);
15435 }
15436 
15437 /**
15438  * A file has been closed as part of LCP completion processing
15439  * for a fragment.
15440  */
15441 void
15442 Backup::lcp_close_data_file_conf(Signal* signal, BackupRecordPtr ptr)
15443 {
15444   jam();
15445   /**
15446    * We could have completed only 1 part of this fragment LCP.
15447    * Check for this and start up the next part.
15448    */
15449   if (ptr.p->m_empty_lcp)
15450   {
15451     jam();
15452     finalize_lcp_processing(signal, ptr);
15453     return;
15454   }
15455   ndbrequire(ptr.p->m_wait_data_file_close);
15456   ptr.p->m_wait_data_file_close = false;
15457   lcp_write_ctl_file(signal, ptr);
15458 }
15459 
15460 Uint32
15461 Backup::lcp_pre_sync_lsn(BackupRecordPtr ptr)
15462 {
15463   Uint32 valid_flag = 1;
15464   if (ptr.p->m_disk_data_exist)
15465   {
15466     jam();
15467     Uint64 sync_lsn;
15468     {
15469       Logfile_client lgman(this, c_lgman, 0);
15470       sync_lsn = lgman.pre_sync_lsn(ptr.p->m_current_lcp_lsn);
15471     }
15472     if (sync_lsn < ptr.p->m_current_lcp_lsn)
15473     {
15474       jam();
15475       /**
15476        * The LSN for the UNDO log record of this LCP hasn't been sync:ed to disk
15477        * yet. We will still write the LCP control file, but we will write
15478        * it with an invalid indicator. Later before deleting the LCP data
15479        * files we will ensure that the LSN is sync:ed by calling sync_lsn.
15480        * We will actually call it with LSN = 0 then since the LSN we called
15481        * with here has been recorded already in LGMAN. So there is no need
15482        * to remember the individual LSNs for individual fragments. When we
15483        * call sync_lsn we will ensure that all fragment LCPs already handled
15484        * before will be sync:ed to disk.
15485        */
15486       valid_flag = 0;
15487     }
15488   }
15489   else
15490   {
15491     jam();
15492   }
15493   DEB_LCP(("(%u)Writing first with ValidFlag = %u", instance(), valid_flag));
15494   return valid_flag;
15495 }
15496 
15497 void
15498 Backup::lcp_write_ctl_file(Signal *signal, BackupRecordPtr ptr)
15499 {
15500   if (ptr.p->m_wait_data_file_close ||
15501       ptr.p->m_wait_sync_extent ||
15502       ptr.p->m_wait_disk_data_sync)
15503   {
15504     jam();
15505     return;
15506   }
15507 
15508   /**
15509    * Ensure that we didn't find more rows in LCP than what was
15510    * in fragment at start of LCP.
15511    *
15512    * If we run a full LCP we should always find as many rows as was
15513    * present in the row count at the start of the LCP.
15514    * If we run a partial LCP we should never find more rows in this
15515    * LCP file than was present at the start of the LCP, this is the
15516    * sum of rows from ALL pages and changed rows in CHANGE pages.
15517    *
15518    * This check is important so that we find inconsistencies as
15519    * soon as they occur, rather than at recovery time,
15520    * when it is very difficult to trace back the source of the
15521    * problem.
15522    *
15523    * An error means that the table was dropped during the LCP; in this
15524    * case these numbers are not consistent, since we're simply closing
15525    * the LCP scan in an orderly manner with no rows read. So we
15526    * should not crash in this case.
15527    *
15528    * We wait until we come here to check the numbers; this means
15529    * that the data file exists if we crash and can be used for
15530    * analysis.
15531    */
15532   {
15533     BackupFilePtr dataFilePtr;
15534     c_backupFilePool.getPtr(dataFilePtr,
15535                             ptr.p->dataFilePtr[0]);
15536     if (!(ptr.p->m_save_error_code != 0 ||
15537           ptr.p->m_row_count == dataFilePtr.p->m_lcp_inserts ||
15538           ((ptr.p->m_num_parts_in_this_lcp !=
15539              BackupFormat::NDB_MAX_LCP_PARTS) &&
15540            (ptr.p->m_row_count >=
15541             (dataFilePtr.p->m_lcp_inserts +
15542              dataFilePtr.p->m_lcp_writes)))))
15543     {
15544       g_eventLogger->info("errCode = %u, row_count = %llu, inserts: %llu"
15545                           ", writes: %llu, parts: %u",
15546                           ptr.p->m_save_error_code,
15547                           ptr.p->m_row_count,
15548                           dataFilePtr.p->m_lcp_inserts,
15549                           dataFilePtr.p->m_lcp_writes,
15550                           ptr.p->m_num_parts_in_this_lcp);
15551       print_extended_lcp_stat();
15552       ndbrequire(ptr.p->m_save_error_code != 0 ||
15553                  ptr.p->m_row_count == dataFilePtr.p->m_lcp_inserts ||
15554         ((ptr.p->m_num_parts_in_this_lcp != BackupFormat::NDB_MAX_LCP_PARTS) &&
15555          (ptr.p->m_row_count >=
15556           (dataFilePtr.p->m_lcp_inserts + dataFilePtr.p->m_lcp_writes))));
15557     }
15558   }
15559 
15560   Uint32 valid_flag = lcp_pre_sync_lsn(ptr);
15561 
15562   /**
15563    * This function prepares the page for the LCP control file data
15564    * and ensures the checksum is correct; values are written in network
15565    * byte order when appropriate.
15566    *
15567    * As soon as this file is properly written to disk, it can be used
15568    * in restarts. The restart code will ensure that the GCI is restored
15569    * which this LCP cannot roll back from.
15570    */

  BackupFilePtr filePtr;
  Page32Ptr pagePtr;

  jam();
  ptr.p->m_lcp_lsn_synced = valid_flag;
  c_backupFilePool.getPtr(filePtr, ptr.p->ctlFilePtr);
  filePtr.p->pages.getPtr(pagePtr, 0);
  struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
    (struct BackupFormat::LCPCtlFile*)pagePtr.p;

  memcpy(lcpCtlFilePtr->fileHeader.Magic, BACKUP_MAGIC, 8);
  lcpCtlFilePtr->fileHeader.BackupVersion = NDBD_USE_PARTIAL_LCP_v2;

  const Uint32 sz = sizeof(BackupFormat::FileHeader) >> 2;
  lcpCtlFilePtr->fileHeader.SectionType = BackupFormat::FILE_HEADER;
  lcpCtlFilePtr->fileHeader.SectionLength = sz - 3;
  lcpCtlFilePtr->fileHeader.FileType = BackupFormat::LCP_CTL_FILE;
  lcpCtlFilePtr->fileHeader.BackupId = 0;
  lcpCtlFilePtr->fileHeader.BackupKey_0 = 0;
  lcpCtlFilePtr->fileHeader.BackupKey_1 = 0;
  lcpCtlFilePtr->fileHeader.ByteOrder = 0x12345678;
  lcpCtlFilePtr->fileHeader.NdbVersion = NDB_VERSION_D;
  lcpCtlFilePtr->fileHeader.MySQLVersion = NDB_MYSQL_VERSION_D;

  lcpCtlFilePtr->ValidFlag = valid_flag;

  TablePtr tabPtr;
  FragmentPtr fragPtr;
  ptr.p->tables.first(tabPtr);
  tabPtr.p->fragments.getPtr(fragPtr, 0);

  lcpCtlFilePtr->TableId = tabPtr.p->tableId;
  lcpCtlFilePtr->FragmentId = fragPtr.p->fragmentId;
  lcpCtlFilePtr->CreateTableVersion =
    c_lqh->getCreateSchemaVersion(tabPtr.p->tableId);

  Uint32 maxCompletedGci;
  c_lqh->lcp_max_completed_gci(maxCompletedGci,
                               ptr.p->newestGci,
                               m_newestRestorableGci);
  lcpCtlFilePtr->CreateGci = fragPtr.p->createGci;
  lcpCtlFilePtr->MaxGciCompleted = maxCompletedGci;
  lcpCtlFilePtr->MaxGciWritten = ptr.p->newestGci;

  ptr.p->m_wait_gci_to_delete = MAX(maxCompletedGci, ptr.p->newestGci);

  ndbrequire(m_newestRestorableGci != 0);
  DEB_LCP(("(%u)tab(%u,%u).%u, use ctl file %u, GCI completed: %u,"
           " GCI written: %u, createGci: %u",
           instance(),
           lcpCtlFilePtr->TableId,
           lcpCtlFilePtr->FragmentId,
           lcpCtlFilePtr->CreateTableVersion,
           (ptr.p->deleteCtlFileNumber == 0 ? 1 : 0),
           lcpCtlFilePtr->MaxGciCompleted,
           lcpCtlFilePtr->MaxGciWritten,
           lcpCtlFilePtr->CreateGci));
  ndbrequire((lcpCtlFilePtr->MaxGciWritten + 1) >= fragPtr.p->createGci);
  /**
   * LcpId and LocalLcpId were set in the prepare phase.
   */
  if (lcpCtlFilePtr->LocalLcpId != c_lqh->get_current_local_lcp_id())
  {
    g_eventLogger->info("(%u)LocalLcpId: %u, local_lcp_id: %u",
     instance(),
     lcpCtlFilePtr->LocalLcpId,
     c_lqh->get_current_local_lcp_id());
  }
  ndbrequire(lcpCtlFilePtr->LocalLcpId == c_lqh->get_current_local_lcp_id());
  lcpCtlFilePtr->MaxPageCount = ptr.p->m_lcp_max_page_cnt;
  lcpCtlFilePtr->LastDataFileNumber = ptr.p->m_last_data_file_number;
  lcpCtlFilePtr->MaxNumberDataFiles =
    BackupFormat::NDB_MAX_LCP_FILES;
  lcpCtlFilePtr->NumPartPairs = ptr.p->m_num_parts_in_lcp;
  lcpCtlFilePtr->MaxPartPairs = BackupFormat::NDB_MAX_LCP_PARTS;
  lcpCtlFilePtr->RowCountLow = Uint32(ptr.p->m_row_count & 0xFFFFFFFF);
  lcpCtlFilePtr->RowCountHigh = Uint32(ptr.p->m_row_count >> 32);

  for (Uint32 i = 0; i < ptr.p->m_num_parts_in_lcp; i++)
  {
    jam();
    lcpCtlFilePtr->partPairs[i] = ptr.p->m_part_info[i];
  }

  /**
   * Since we calculated the checksum with bytes in network order we
   * write it without converting it to network order; this ensures
   * that the XOR covers the same bits as here.
   */
  lcp_write_ctl_file_to_disk(signal, filePtr, pagePtr);
}

void
Backup::lcp_write_ctl_file_to_disk(Signal *signal,
                                   BackupFilePtr filePtr,
                                   Page32Ptr pagePtr)
{
  /**
   * If the file size becomes bigger than 4096 bytes we need to write
   * 8192 bytes instead. Currently the header parts are 108 bytes and
   * each part consumes 3 bytes, which means that we can fit
   * (4096 - 108) / 3 == 1329 parts in 4096 bytes.
   * The maximum number of parts is currently 2048, thus we can
   * always fit within 8192 bytes. We use multiples of 4096 bytes
   * to fit well with disk devices; there is no need to complicate
   * file management with lots of different file sizes.
   */
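  /**
   * A quick check of the arithmetic below, assuming the 108-byte
   * header size stated above: with the maximum num_parts == 2048 the
   * computed size is 108 + (3 * 2048 + 3) == 6255 bytes, which
   * exceeds NDB_LCP_CTL_FILE_SIZE_SMALL (4096) and is rounded up to
   * the 8192-byte size. A small LCP with num_parts == 1 computes
   * 108 + 6 == 114 bytes and is written as a 4096-byte file.
   */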
  struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
    (struct BackupFormat::LCPCtlFile*)pagePtr.p;
  Uint32 num_parts = lcpCtlFilePtr->NumPartPairs;
  Uint32 file_size = LCP_CTL_FILE_HEADER_SIZE +
                     (3 * num_parts + 3);
  if (file_size > BackupFormat::NDB_LCP_CTL_FILE_SIZE_SMALL)
  {
    jam();
    DEB_LCP(("(%u)Writing 8192 byte control file", instance()));
    file_size = BackupFormat::NDB_LCP_CTL_FILE_SIZE_BIG;
  }
  else
  {
    jam();
    file_size = BackupFormat::NDB_LCP_CTL_FILE_SIZE_SMALL;
  }
  convert_ctl_page_to_network((Uint32*)pagePtr.p, file_size);
  filePtr.p->m_flags |= BackupFile::BF_WRITING;
  FsReadWriteReq* req = (FsReadWriteReq*)signal->getDataPtrSend();
  req->userPointer = filePtr.i;
  req->filePointer = filePtr.p->filePointer;
  req->userReference = reference();
  req->varIndex = 0;
  req->numberOfPages = 1;
  req->operationFlag = 0;
  FsReadWriteReq::setFormatFlag(req->operationFlag,
                                FsReadWriteReq::fsFormatMemAddress);
  FsReadWriteReq::setSyncFlag(req->operationFlag, 1);

  Uint32 mem_offset = Uint32((char*)pagePtr.p - (char*)c_startOfPages);
  req->data.memoryAddress.memoryOffset = mem_offset;
  req->data.memoryAddress.fileOffset = 0;
  req->data.memoryAddress.size = file_size;

  sendSignal(NDBFS_REF, GSN_FSWRITEREQ, signal,
             FsReadWriteReq::FixedLength + 3, JBA);
}

void
Backup::execFSWRITEREF(Signal *signal)
{
  ndbabort();
}

void
Backup::execFSWRITECONF(Signal *signal)
{
  BackupRecordPtr ptr;
  BackupFilePtr filePtr;
  FsConf * conf = (FsConf *)signal->getDataPtr();
  const Uint32 userPtr = conf->userPointer;
  jamEntry();

  c_backupFilePool.getPtr(filePtr, userPtr);
  ndbrequire((filePtr.p->m_flags & BackupFile::BF_WRITING) != 0);
  filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_WRITING;
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  if (ptr.p->ctlFilePtr == filePtr.i)
  {
    jam();
    closeFile(signal, ptr, filePtr);
    return;
  }
  else if (ptr.p->deleteFilePtr == filePtr.i)
  {
    jam();
    lcp_update_ctl_file_for_rewrite_done(signal, ptr, filePtr);
    return;
  }
  ndbabort();
}

void
Backup::finalize_lcp_processing(Signal *signal, BackupRecordPtr ptr)
{
  TablePtr tabPtr;
  FragmentPtr fragPtr;
  BackupFilePtr filePtr;

  if (ptr.p->m_empty_lcp)
  {
    jam();
    ndbrequire(ptr.p->m_outstanding_operations > 0);
    ptr.p->m_outstanding_operations--;
    if (ptr.p->m_outstanding_operations > 0)
    {
      jam();
      return;
    }
  }
  c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[0]);
  ndbrequire(ptr.p->tables.first(tabPtr));
  Uint32 tableId = tabPtr.p->tableId;

  tabPtr.p->fragments.getPtr(fragPtr, 0);
  Uint32 fragmentId = fragPtr.p->fragmentId;

  if (ptr.p->errorCode != 0)
  {
    jam();
    ndbout_c("Fatal : LCP Frag scan failed with error %u"
             " file error is: %d",
             ptr.p->errorCode,
             filePtr.p->errorCode);
    ndbrequire(filePtr.p->errorCode == ptr.p->errorCode);

    if ((filePtr.p->m_flags & BackupFile::BF_SCAN_THREAD) == 0)
    {
      jam();
      /* No active scan thread to 'find' the file error.
       * The scan is closed, so let's send backupFragmentRef
       * back to LQH now...
       */
      backupFragmentRef(signal, filePtr);
      return;
    }
    ndbabort();
  }

  /**
   * We're fully done with everything related to the LCP of this fragment.
   * Report this back to LQH so that LQH can order the start of a new
   * LCP on a new fragment when it is ready to do so.
   */
  if (ptr.p->deleteDataFileNumber != RNIL ||
      ptr.p->deleteCtlFileNumber != RNIL ||
      !ptr.p->m_lcp_lsn_synced)
  {
    /**
     * We insert a record into the list of files to delete; this ensures
     * that we delete old LCP files as soon as possible.
     * If deleteDataFileNumber is RNIL it means that this was the very
     * first LCP on this fragment, so there is no need to delete any old
     * files. It could also be an LCP that retains all files from the old
     * LCP, but we might still need to delete a control file.
     *
     * We wait an extra GCP before we delete the old LCP files. The
     * reason is to avoid calling sync_lsn unnecessarily often. Each call
     * to sync_lsn can waste log space (up to one log page) since it
     * needs to sync the LSN on the current page.
     */
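    /**
     * An illustrative example of the wait (numbers invented): if this
     * fragment LCP saw maxCompletedGci == 100 and newestGci == 102,
     * then m_wait_gci_to_delete == 102 and, on a started node,
     * wait_for_gci becomes 103. The old files are then kept until
     * GCI 103 is restorable, at which point the entry is ready for
     * deletion.
     */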
    jam();
    DeleteLcpFilePtr deleteLcpFilePtr;
    ndbrequire(c_deleteLcpFilePool.seize(deleteLcpFilePtr));
    LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                  m_delete_lcp_file_head);

    Uint32 wait_for_gci = ptr.p->m_wait_gci_to_delete;
    if (m_our_node_started)
    {
      jam();
      wait_for_gci++;
    }
    bool ready_for_delete = (wait_for_gci <= m_newestRestorableGci);
    Uint32 lastDeleteFileNumber = get_file_add(ptr.p->deleteDataFileNumber,
                          (ptr.p->m_lcp_remove_files - 1));
    deleteLcpFilePtr.p->tableId = tableId;
    deleteLcpFilePtr.p->fragmentId = fragmentId;
    deleteLcpFilePtr.p->firstFileId = ptr.p->deleteDataFileNumber;
    deleteLcpFilePtr.p->lastFileId = lastDeleteFileNumber;
    deleteLcpFilePtr.p->waitCompletedGci = wait_for_gci;
    deleteLcpFilePtr.p->lcpCtlFileNumber = ptr.p->deleteCtlFileNumber;
    deleteLcpFilePtr.p->validFlag = ptr.p->m_lcp_lsn_synced;
    deleteLcpFilePtr.p->lcpLsn = ptr.p->m_current_lcp_lsn;
#ifdef DEBUG_LCP
    if (deleteLcpFilePtr.p->firstFileId != RNIL)
    {
      DEB_LCP(("(%u)TAGI Insert delete file in queue:"
        " tab(%u,%u).%u, file(%u-%u,%u) GCI: %u, validFlag: %u",
        instance(),
        tableId,
        fragmentId,
        c_lqh->getCreateSchemaVersion(tableId),
        deleteLcpFilePtr.p->firstFileId,
        deleteLcpFilePtr.p->lastFileId,
        ptr.p->deleteCtlFileNumber,
        ptr.p->m_wait_gci_to_delete,
        ptr.p->m_lcp_lsn_synced));
    }
    else
    {
      DEB_LCP(("(%u)TAGI Insert delete file in queue:"
        " tab(%u,%u).%u, file(RNIL,%u) GCI: %u, validFlag: %u",
        instance(),
        tableId,
        fragmentId,
        c_lqh->getCreateSchemaVersion(tableId),
        ptr.p->deleteCtlFileNumber,
        ptr.p->m_wait_gci_to_delete,
        ptr.p->m_lcp_lsn_synced));
    }
#endif

    if (ready_for_delete)
    {
      /**
       * Add it first to the delete-processing queue since it is already
       * ready for deletion.
       */
      jam();
      queue.addFirst(deleteLcpFilePtr);
    }
    else
    {
      jam();
      queue.addLast(deleteLcpFilePtr);
    }
    if (!m_delete_lcp_files_ongoing && ready_for_delete)
    {
      jam();
      m_delete_lcp_files_ongoing = true;
      signal->theData[0] = BackupContinueB::ZDELETE_LCP_FILE;
      signal->theData[1] = ptr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    }
  }

  ptr.p->errorCode = 0;
  ptr.p->slaveState.forceState(DEFINED);
  check_empty_queue_waiters(signal, ptr);

  BackupFragmentConf * conf = (BackupFragmentConf*)signal->getDataPtrSend();
  conf->backupId = ptr.p->backupId;
  conf->backupPtr = ptr.i;
  conf->tableId = tableId;
  conf->fragmentNo = fragmentId;
  conf->noOfRecordsLow = (ptr.p->noOfRecords & 0xFFFFFFFF);
  conf->noOfRecordsHigh = (ptr.p->noOfRecords >> 32);
  conf->noOfBytesLow = (ptr.p->noOfBytes & 0xFFFFFFFF);
  conf->noOfBytesHigh = (ptr.p->noOfBytes >> 32);
  if (ptr.p->m_empty_lcp)
  {
    jam();
    /**
     * Slow down things a bit for empty LCPs to avoid using too much
     * CPU for idle LCP processing. This tends to get a bit bursty and
     * can affect traffic performance for short periods.
     */
    sendSignalWithDelay(ptr.p->masterRef, GSN_BACKUP_FRAGMENT_CONF, signal,
                        1, BackupFragmentConf::SignalLength);
  }
  else
  {
    jam();
    sendSignal(ptr.p->masterRef, GSN_BACKUP_FRAGMENT_CONF, signal,
               BackupFragmentConf::SignalLength, JBA);
  }
}

void
Backup::execRESTORABLE_GCI_REP(Signal *signal)
{
  Uint32 restorable_gci = signal->theData[0];
  /**
   * LQH has a more up-to-date view of the node state, so use LQH's
   * version of the node state rather than our own.
   */
  if (c_lqh->getNodeState().startLevel >= NodeState::SL_STOPPING_4)
  {
    jam();
    DEB_LCP(("(%u)Ignore RESTORABLE_GCI_REP: %u in SL_STOPPING_4",
             instance(),
             restorable_gci));
    return;
  }
  if (restorable_gci > m_newestRestorableGci)
  {
    jam();
    m_newestRestorableGci = restorable_gci;
  }
  else
  {
    jam();
    DEB_LCP(("(%u)Already received this restorable gci: %u",
             instance(),
             restorable_gci));
    return;
  }
#ifdef DEBUG_LCP_DEL_FILES
  DeleteLcpFilePtr deleteLcpFilePtr;
  LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                m_delete_lcp_file_head);
  queue.first(deleteLcpFilePtr);
  Uint32 waitGCI = (deleteLcpFilePtr.i != RNIL) ?
           deleteLcpFilePtr.p->waitCompletedGci : 0;
#endif
  if (m_delete_lcp_files_ongoing)
  {
    jam();
    DEB_LCP_DEL_FILES(("(%u)TAGX Completed GCI: %u (delete files ongoing)"
                       ", waitGCI: %u",
                       instance(),
                       m_newestRestorableGci,
                       waitGCI));
    return;
  }
  jam();
  DEB_LCP_DEL_FILES(("(%u)TAGX Completed GCI: %u (delete files not ongoing)"
                     ", waitGCI: %u",
                     instance(),
                     m_newestRestorableGci,
                     waitGCI));
  m_delete_lcp_files_ongoing = true;
  delete_lcp_file_processing(signal);
  return;
}

void
Backup::delete_lcp_file_processing(Signal *signal)
{
  BackupRecordPtr ptr;
  DeleteLcpFilePtr deleteLcpFilePtr;

  if (m_lcp_ptr.i == RNIL)
  {
    jam();
    m_delete_lcp_files_ongoing = false;
    return;
  }
  ptr = m_lcp_ptr;
  ndbrequire(m_delete_lcp_files_ongoing);

  LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                m_delete_lcp_file_head);
  if (queue.isEmpty())
  {
    jam();
    ndbrequire(!ptr.p->m_wait_end_lcp);
    m_delete_lcp_files_ongoing = false;
    if (ptr.p->prepareState == PREPARE_DROP)
    {
      jam();
      /**
       * We use this route in the obscure case of finding LCP files
       * belonging to an already dropped table. We keep the code simple
       * here and wait until the queue is completely empty even for
       * this special case, to avoid any unnecessary checks. We then
       * proceed with normal LCP_PREPARE_REQ handling.
       */
      ptr.p->prepareState = PREPARE_READ_CTL_FILES;
      DEB_LCP(("(%u)TAGT Completed wait delete files for drop case",
               instance()));
      lcp_open_ctl_file(signal, ptr, 0);
      lcp_open_ctl_file(signal, ptr, 1);
      return;
    }
    DEB_LCP_DEL_FILES(("(%u)TAGB Completed delete files,"
                       " queue empty, no LCP wait",
                       instance()));
    return;
  }
  queue.first(deleteLcpFilePtr);
  if (deleteLcpFilePtr.p->waitCompletedGci > m_newestRestorableGci)
  {
    jam();
    DEB_LCP(("(%u)TAGW Wait for completed GCI: %u",
             instance(),
             deleteLcpFilePtr.p->waitCompletedGci));
    m_delete_lcp_files_ongoing = false;
    return;
  }
  /* The delete record is ready for the deletion process to start. */
  ptr.p->currentDeleteLcpFile = deleteLcpFilePtr.i;
  if (deleteLcpFilePtr.p->validFlag == 0)
  {
    jam();
    sync_log_lcp_lsn(signal, deleteLcpFilePtr, ptr.i);
    return;
  }
  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, ptr.p->deleteFilePtr);
  lcp_close_ctl_file_for_rewrite_done(signal, ptr, filePtr);
}

/**
 * This segment of code does a rewrite of the LCP control file.
 * The LCP control file was written with the valid flag set
 * to 0. This indicates to the restore block that the LCP control
 * file isn't safe to use.
 *
 * Before the old LCP control file is deleted we must ensure that
 * the new LCP control file is ready to use by setting the validFlag
 * to 1.
 *
 * The validFlag can however only be set to 1 if we are sure that
 * the LSN of our UNDO log record for this fragment LCP has been
 * flushed to disk. This is done by calling sync_lsn.
 *
 * Calling sync_lsn for each fragment is not a good solution since
 * each such call can cause one page of UNDO log space to be wasted.
 * So to minimize the amount of wasted log space we instead wait
 * for the GCI to be completed before we call sync_lsn.
 * To pack as many LSN syncs as possible into one sync_lsn call
 * we call pre_sync_lsn earlier in the LCP process.
 *
 * So the idea is that, as much as possible, we wait for the LSN to
 * be flushed by someone else; if no one has done that job after
 * almost 2 GCPs we do it ourselves. If we do it ourselves we also
 * ensure that the LSNs of all calls to pre_sync_lsn are flushed to
 * disk in the same go.
 *
 * If the pre_sync_lsn call indicates that our LSN has already been
 * flushed to disk we can avoid this extra round of reading and
 * writing the LCP control file. We also don't need it for tables
 * without disk data columns.
 *
 * After sync:ing the UNDO LSN we will read the LCP control file,
 * set the ValidFlag in the LCP control file, write it again
 * and finally close it.
 *
 * Then we will continue deleting the old data files and the old
 * LCP control file.
 */
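
/**
 * A sketch of the resulting callback chain, as implemented by the
 * functions below; the file-system confirmations are routed to these
 * functions via the usual FS reply handlers:
 *
 *   sync_log_lcp_lsn               -> LGMAN sync_lsn (may complete inline)
 *   sync_log_lcp_lsn_callback      -> lcp_open_ctl_file_for_rewrite
 *   open done                      -> lcp_read_ctl_file_for_rewrite
 *   read done                      -> lcp_read_ctl_file_for_rewrite_done
 *                                     (sets ValidFlag = 1 in the page)
 *   lcp_update_ctl_file_for_rewrite -> lcp_write_ctl_file_to_disk
 *   write done                     -> lcp_close_ctl_file_for_rewrite
 *   close done                     -> lcp_close_ctl_file_for_rewrite_done,
 *                                     which starts removing the old files.
 */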
void
Backup::sync_log_lcp_lsn(Signal *signal,
                         DeleteLcpFilePtr deleteLcpFilePtr,
                         Uint32 ptrI)
{
  Logfile_client::Request req;
  int ret;
  req.m_callback.m_callbackData = ptrI;
  req.m_callback.m_callbackIndex = SYNC_LOG_LCP_LSN;
  {
    Logfile_client lgman(this, c_lgman, 0);
    ret = lgman.sync_lsn(signal, deleteLcpFilePtr.p->lcpLsn, &req, 1);
    jamEntry();
  }
  switch (ret)
  {
    case 0:
    {
      jam();
      return;
    }
    case -1:
    {
      g_eventLogger->info("(%u)Failed to Sync LCP lsn", instance());
      ndbabort();
    }
    default:
    {
      jam();
      execute(signal, req.m_callback, 0);
      return;
    }
  }
}

void
Backup::sync_log_lcp_lsn_callback(Signal *signal, Uint32 ptrI, Uint32 res)
{
  BackupRecordPtr ptr;
  DeleteLcpFilePtr deleteLcpFilePtr;
  jamEntry();
  c_backupPool.getPtr(ptr, ptrI);
  ndbrequire(res == 0);
  c_deleteLcpFilePool.getPtr(deleteLcpFilePtr, ptr.p->currentDeleteLcpFile);
  ndbrequire(deleteLcpFilePtr.p->validFlag == 0);
  /**
   * The LSN has now been sync:ed; now it is time to read the LCP
   * control file again to update the validFlag.
   */
  lcp_open_ctl_file_for_rewrite(signal, deleteLcpFilePtr, ptr);
}

void
Backup::lcp_open_ctl_file_for_rewrite(Signal *signal,
                                      DeleteLcpFilePtr deleteLcpFilePtr,
                                      BackupRecordPtr ptr)
{
  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, ptr.p->deleteFilePtr);
  FsOpenReq *req = (FsOpenReq*)signal->getDataPtrSend();

  req->userReference = reference();
  req->fileFlags = FsOpenReq::OM_READWRITE;
  req->userPointer = filePtr.i;

  ndbrequire(filePtr.p->m_flags == 0);
  filePtr.p->m_flags = BackupFile::BF_OPENING;

  /**
   * We use the same table id and fragment id as the file we are about
   * to delete. If we are about to delete LCP control file 0, then we
   * should rewrite LCP control file 1, and vice versa if we are to
   * delete LCP control file 1.
   */
  Uint32 tableId = deleteLcpFilePtr.p->tableId;
  Uint32 fragmentId = deleteLcpFilePtr.p->fragmentId;
  Uint32 lcpNo = (deleteLcpFilePtr.p->lcpCtlFileNumber == 0) ? 1 : 0;

  FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF);
  FsOpenReq::setVersion(req->fileNumber, 5);
  FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_CTL);
  FsOpenReq::v5_setLcpNo(req->fileNumber, lcpNo);
  FsOpenReq::v5_setTableId(req->fileNumber, tableId);
  FsOpenReq::v5_setFragmentId(req->fileNumber, fragmentId);

  sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);
}

void
Backup::lcp_open_ctl_file_for_rewrite_done(Signal *signal,
                                           BackupFilePtr filePtr)
{
  lcp_read_ctl_file_for_rewrite(signal, filePtr);
}

void
Backup::lcp_read_ctl_file_for_rewrite(Signal *signal,
                                      BackupFilePtr filePtr)
{
  FsReadWriteReq *req = (FsReadWriteReq*)signal->getDataPtrSend();
  Page32Ptr pagePtr;

  filePtr.p->pages.getPtr(pagePtr, 0);
  ndbrequire(filePtr.p->m_flags == BackupFile::BF_OPEN);
  filePtr.p->m_flags |= BackupFile::BF_READING;

  req->userPointer = filePtr.i;
  req->filePointer = filePtr.p->filePointer;
  req->userReference = reference();
  req->varIndex = 0;
  req->numberOfPages = 1;
  req->operationFlag = 0;
  FsReadWriteReq::setFormatFlag(req->operationFlag,
                                FsReadWriteReq::fsFormatMemAddress);
  FsReadWriteReq::setPartialReadFlag(req->operationFlag, 1);

  Uint32 mem_offset = Uint32(((char*)pagePtr.p) - ((char*)c_startOfPages));
  req->data.memoryAddress.memoryOffset = mem_offset;
  req->data.memoryAddress.fileOffset = 0;
  req->data.memoryAddress.size = BackupFormat::NDB_LCP_CTL_FILE_SIZE_BIG;

  sendSignal(NDBFS_REF, GSN_FSREADREQ, signal,
             FsReadWriteReq::FixedLength + 3, JBA);
}

void
Backup::lcp_read_ctl_file_for_rewrite_done(Signal *signal,
                                           BackupFilePtr filePtr)
{
  Page32Ptr pagePtr;

  filePtr.p->pages.getPtr(pagePtr, 0);
  struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
    (struct BackupFormat::LCPCtlFile*)pagePtr.p;
  ndbrequire(convert_ctl_page_to_host(lcpCtlFilePtr));
  lcpCtlFilePtr->ValidFlag = 1;
  lcp_update_ctl_file_for_rewrite(signal, filePtr, pagePtr);
}

void
Backup::lcp_update_ctl_file_for_rewrite(Signal *signal,
                                        BackupFilePtr filePtr,
                                        Page32Ptr pagePtr)
{
  ndbrequire(filePtr.p->m_flags == BackupFile::BF_OPEN);
  lcp_write_ctl_file_to_disk(signal, filePtr, pagePtr);
}

void
Backup::lcp_update_ctl_file_for_rewrite_done(Signal *signal,
                                             BackupRecordPtr ptr,
                                             BackupFilePtr filePtr)
{
  lcp_close_ctl_file_for_rewrite(signal, ptr, filePtr);
}

void
Backup::lcp_close_ctl_file_for_rewrite(Signal *signal,
                                       BackupRecordPtr ptr,
                                       BackupFilePtr filePtr)
{
  ndbrequire(ptr.p->errorCode == 0);
  closeFile(signal, ptr, filePtr, false, false);
#ifdef DEBUG_LCP
  DeleteLcpFilePtr deleteLcpFilePtr;
  c_deleteLcpFilePool.getPtr(deleteLcpFilePtr, ptr.p->currentDeleteLcpFile);
  DEB_LCP(("(%u)Completed writing with ValidFlag = 1 for tab(%u,%u).%u",
           instance(),
           deleteLcpFilePtr.p->tableId,
           deleteLcpFilePtr.p->fragmentId,
           c_lqh->getCreateSchemaVersion(deleteLcpFilePtr.p->tableId)));
#endif
}

void
Backup::lcp_close_ctl_file_for_rewrite_done(Signal *signal,
                                            BackupRecordPtr ptr,
                                            BackupFilePtr filePtr)
{
  ndbrequire(filePtr.p->m_flags == 0);
  ndbrequire(ptr.p->errorCode == 0);
  DeleteLcpFilePtr deleteLcpFilePtr;
  c_deleteLcpFilePool.getPtr(deleteLcpFilePtr, ptr.p->currentDeleteLcpFile);

  if (deleteLcpFilePtr.p->firstFileId != RNIL)
  {
    jam();
    ptr.p->m_delete_data_file_ongoing = true;
    lcp_remove_file(signal, ptr, deleteLcpFilePtr);
  }
  else if (deleteLcpFilePtr.p->lcpCtlFileNumber != RNIL)
  {
    jam();
    ptr.p->m_delete_data_file_ongoing = false;
    lcp_remove_file(signal, ptr, deleteLcpFilePtr);
  }
  else
  {
    jam();
    finished_removing_files(signal, ptr);
  }
}

void
Backup::lcp_remove_file(Signal* signal,
                        BackupRecordPtr ptr,
                        DeleteLcpFilePtr deleteLcpFilePtr)
{
  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, ptr.p->deleteFilePtr);
  FsRemoveReq * req = (FsRemoveReq *)signal->getDataPtrSend();
  req->userReference = reference();
  req->userPointer = ptr.i;
  req->directory = 0;
  req->ownDirectory = 0;

  filePtr.p->m_flags |= BackupFile::BF_REMOVING;

  FsOpenReq::setVersion(req->fileNumber, 5);
  if (ptr.p->m_delete_data_file_ongoing)
  {
    jam();
    FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_DATA);
    FsOpenReq::v5_setLcpNo(req->fileNumber, deleteLcpFilePtr.p->firstFileId);
    DEB_LCP_DEL_FILES(("(%u)TAGD Remove data file: %u for tab(%u,%u)",
                       instance(),
                       deleteLcpFilePtr.p->firstFileId,
                       deleteLcpFilePtr.p->tableId,
                       deleteLcpFilePtr.p->fragmentId));
  }
  else
  {
    jam();
    FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_CTL);
    FsOpenReq::v5_setLcpNo(req->fileNumber,
                           deleteLcpFilePtr.p->lcpCtlFileNumber);
    DEB_LCP_DEL_FILES(("(%u)TAGD Remove control file: %u for tab(%u,%u)",
                       instance(),
                       deleteLcpFilePtr.p->lcpCtlFileNumber,
                       deleteLcpFilePtr.p->tableId,
                       deleteLcpFilePtr.p->fragmentId));
  }
  FsOpenReq::v5_setTableId(req->fileNumber, deleteLcpFilePtr.p->tableId);
  FsOpenReq::v5_setFragmentId(req->fileNumber, deleteLcpFilePtr.p->fragmentId);
  sendSignal(NDBFS_REF, GSN_FSREMOVEREQ, signal, FsOpenReq::SignalLength, JBA);
}

void
Backup::lcp_remove_file_conf(Signal *signal, BackupRecordPtr ptr)
{
  BackupFilePtr filePtr;

  c_backupFilePool.getPtr(filePtr, ptr.p->deleteFilePtr);
  filePtr.p->m_flags &= (~(BackupFile::BF_REMOVING));
  ndbrequire(filePtr.p->m_flags == 0);

  if (ptr.p->m_delete_data_file_ongoing)
  {
    jam();
    DeleteLcpFilePtr deleteLcpFilePtr;
    c_deleteLcpFilePool.getPtr(deleteLcpFilePtr, ptr.p->currentDeleteLcpFile);
    if (deleteLcpFilePtr.p->firstFileId == deleteLcpFilePtr.p->lastFileId)
    {
      jam();
      /**
       * We're done with deleting the data files belonging to this LCP
       * which we no longer need. We continue with deletion of the LCP
       * control file for this LCP.
       */
      ptr.p->m_delete_data_file_ongoing = false;
      lcp_remove_file(signal, ptr, deleteLcpFilePtr);
      return;
    }
    /* Continue with deleting the next data file. */
    deleteLcpFilePtr.p->firstFileId =
      get_file_add(deleteLcpFilePtr.p->firstFileId, 1);
    lcp_remove_file(signal, ptr, deleteLcpFilePtr);
  }
  else
  {
    /**
     * We are done deleting files for this fragment LCP; send CONTINUEB
     * to see if more fragment LCPs are ready to be deleted.
     *
     * We remove it from the queue here to ensure that the next LCP can
     * now start up again.
     * It is important not to remove it from the queue until we have
     * actually deleted all the files; the logic depends on only one LCP
     * being allowed to execute at a time, and on this LCP removing all
     * the files of the old LCP before the next one is allowed to start.
     */
    jam();
    finished_removing_files(signal, ptr);
  }
}

void
Backup::finished_removing_files(Signal *signal,
                                BackupRecordPtr ptr)
{
  DeleteLcpFilePtr deleteLcpFilePtr;
  jam();
  {
    LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                  m_delete_lcp_file_head);
    c_deleteLcpFilePool.getPtr(deleteLcpFilePtr, ptr.p->currentDeleteLcpFile);
    queue.remove(deleteLcpFilePtr);
    c_deleteLcpFilePool.release(deleteLcpFilePtr);
    ptr.p->currentDeleteLcpFile = RNIL;
  }
  check_empty_queue_waiters(signal, ptr);
  if (ptr.p->m_informDropTabTableId != Uint32(~0))
  {
    jam();
    sendINFORM_BACKUP_DROP_TAB_CONF(signal, ptr);
  }
  else
  {
    jam();
    check_wait_end_lcp(signal, ptr);
  }
  {
    LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                  m_delete_lcp_file_head);
    if (!queue.isEmpty())
    {
      jam();
      signal->theData[0] = BackupContinueB::ZDELETE_LCP_FILE;
      signal->theData[1] = ptr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    }
    else
    {
      jam();
      delete_lcp_file_processing(signal);
    }
  }
}

/**
 * Wait for LCP activity to cease; in particular, wait for the delete
 * queue to become empty. When the delete queue is empty we know that
 * all fragment LCPs have completed and are recoverable. No files are
 * deleted unless the fragment LCP is completed, and even if no files
 * require deletion we insert an entry into the delete file queue if
 * we are still waiting for the LSN of the table fragment to be
 * flushed.
 *
 * See the comments in Dblqh::insert_new_fragments_into_lcp for more
 * details on the use case for this signal.
 */
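
/**
 * A sketch of the exchange (all names from this file): LQH sends
 * WAIT_LCP_IDLE_REQ with theData[0] set to the backup record index.
 * If the delete queue is empty and the slave state is DEFINED we
 * answer with WAIT_LCP_IDLE_CONF immediately; otherwise
 * m_wait_empty_queue is set and check_empty_queue_waiters sends the
 * CONF once the queue has drained.
 */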
void
Backup::execWAIT_LCP_IDLE_REQ(Signal *signal)
{
  BackupRecordPtr ptr;
  jamEntry();
  c_backupPool.getPtr(ptr, signal->theData[0]);
  jamDebug();
  ndbrequire(ptr.p->is_lcp());
  LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                m_delete_lcp_file_head);
  if (queue.isEmpty() && ptr.p->slaveState.getState() == DEFINED)
  {
    jam();
    signal->theData[0] = ptr.p->clientData;
    sendSignal(ptr.p->masterRef, GSN_WAIT_LCP_IDLE_CONF,
               signal, 1, JBB);
  }
  else
  {
    jam();
    ptr.p->m_wait_empty_queue = true;
  }
}

void
Backup::check_empty_queue_waiters(Signal *signal, BackupRecordPtr ptr)
{
  ndbrequire(ptr.p->is_lcp());
  if (ptr.p->m_wait_empty_queue)
  {
    jam();
    LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                  m_delete_lcp_file_head);
    if (queue.isEmpty() && ptr.p->slaveState.getState() == DEFINED)
    {
      jam();
      ptr.p->m_wait_empty_queue = false;
      signal->theData[0] = ptr.p->clientData;
      sendSignal(ptr.p->masterRef, GSN_WAIT_LCP_IDLE_CONF,
                 signal, 1, JBB);
    }
  }
}

void
Backup::execINFORM_BACKUP_DROP_TAB_REQ(Signal *signal)
{
  BackupRecordPtr ptr;
  get_lcp_record(ptr);
  ptr.p->m_informDropTabTableId = signal->theData[0];
  ptr.p->m_informDropTabReference = signal->theData[1];
  if (ptr.p->currentDeleteLcpFile != RNIL)
  {
    DeleteLcpFilePtr deleteLcpFilePtr;
    jam();
    c_deleteLcpFilePool.getPtr(deleteLcpFilePtr, ptr.p->currentDeleteLcpFile);
    if (deleteLcpFilePtr.p->tableId == ptr.p->m_informDropTabTableId)
    {
      jam();
      /**
       * The current delete record is deleting and rewriting files
       * belonging to the dropped table. Wait until this is completed
       * before we continue.
       */
      return;
    }
  }
  sendINFORM_BACKUP_DROP_TAB_CONF(signal, ptr);
}

void
Backup::check_wait_end_lcp(Signal *signal, BackupRecordPtr ptr)
{
  LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                m_delete_lcp_file_head);
  ndbrequire(ptr.p->is_lcp());
  if (queue.isEmpty() && ptr.p->m_wait_end_lcp)
  {
    jam();
    ndbrequire(ptr.p->prepareState != PREPARE_DROP);
    ptr.p->m_wait_end_lcp = false;
    sendEND_LCPCONF(signal, ptr);
  }
}

void
Backup::sendINFORM_BACKUP_DROP_TAB_CONF(Signal *signal,
                                        BackupRecordPtr ptr)
{
  /**
   * Before we send the confirmation we have to remove all entries from
   * the delete file queue that refer to the dropped table. We have
   * already ensured that the files of the dropped table aren't
   * currently being deleted or rewritten. It would create complex code
   * if we could remove the LCP files while we were writing them.
   */

  DEB_LCP(("(%u)Remove all delete file requests for table %u",
           instance(),
           ptr.p->m_informDropTabTableId));
  {
    DeleteLcpFilePtr deleteLcpFilePtr;
    DeleteLcpFilePtr nextDeleteLcpFilePtr;
    LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                  m_delete_lcp_file_head);
    bool is_next_available = queue.first(deleteLcpFilePtr);
    while (is_next_available)
    {
      nextDeleteLcpFilePtr = deleteLcpFilePtr;
      is_next_available = queue.next(nextDeleteLcpFilePtr);
      if (deleteLcpFilePtr.p->tableId == ptr.p->m_informDropTabTableId)
      {
        jam();
        /**
         * We found an entry from the dropped table; we can ignore it
         * since the table will be dropped and all its LCP files with
         * it.
         */
        queue.remove(deleteLcpFilePtr);
        c_deleteLcpFilePool.release(deleteLcpFilePtr);
      }
      deleteLcpFilePtr = nextDeleteLcpFilePtr;
    }
  }
  check_empty_queue_waiters(signal, ptr);
  check_wait_end_lcp(signal, ptr);

  /**
   * Now we have removed all entries from the queue and we are ready to
   * inform LQH that it can continue dropping the table.
   * At this point LQH has already ensured that no more LCPs are started
   * on this table.
   */
  BlockReference ref = ptr.p->m_informDropTabReference;
  Uint32 tableId = ptr.p->m_informDropTabTableId;
  signal->theData[0] = tableId;
  sendSignal(ref, GSN_INFORM_BACKUP_DROP_TAB_CONF, signal, 1, JBB);
  ptr.p->m_informDropTabReference = Uint32(~0);
  ptr.p->m_informDropTabTableId = Uint32(~0);
}

void
Backup::openFilesReplyLCP(Signal* signal,
                          BackupRecordPtr ptr,
                          BackupFilePtr filePtr)
{
  /**
   * Did the open succeed?
   */
  if(ptr.p->checkError())
  {
    jam();
    if(ptr.p->errorCode == FsRef::fsErrFileExists)
    {
      jam();
      ptr.p->errorCode = DefineBackupRef::FailedForBackupFilesAleadyExist;
    }
    for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
    {
      jam();
      if (ptr.p->dataFilePtr[i] == filePtr.i)
      {
        jam();
        /* Currently we can't handle failures to open a data file */
        g_eventLogger->critical("Fatal: Open file of LCP data file %u failed,"
                                " errCode: %u",
                                i,
                                ptr.p->errorCode);
        ndbabort();
      }
    }
    if (ptr.p->deleteFilePtr == filePtr.i)
    {
      jam();
      g_eventLogger->critical("Fatal: Reopen LCP control file failed,"
                              " errCode: %u",
                              ptr.p->errorCode);
      ndbabort();
    }
    defineBackupRef(signal, ptr);
    return;
  }//if

  if (ptr.p->deleteFilePtr == filePtr.i)
  {
    jam();
    lcp_open_ctl_file_for_rewrite_done(signal, filePtr);
    return;
  }
  if (filePtr.p->m_flags & BackupFile::BF_HEADER_FILE)
  {
    jam();
    filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_HEADER_FILE;
    ndbrequire(filePtr.i == ptr.p->prepareCtlFilePtr[0] ||
               filePtr.i == ptr.p->prepareCtlFilePtr[1]);
    lcp_open_ctl_file_done(signal, ptr, filePtr);
    return;
  }
  TablePtr tabPtr;
  bool prepare_phase;
  Uint32 index = 0;
  if (filePtr.i == ptr.p->prepareDataFilePtr[0])
  {
    jam();
    filePtr.p->m_flags |= BackupFile::BF_LCP_META;
    ndbrequire(ptr.p->prepareState == PREPARE_OPEN_DATA_FILE);
    ptr.p->prepareState = PREPARE_READ_TABLE_DESC;
    ptr.p->prepare_table.first(tabPtr);
    prepare_phase = true;
  }
  else
  {
    prepare_phase = true;
    for (index = 0 ; index < ptr.p->m_num_lcp_files; index++)
    {
      if (filePtr.i == ptr.p->dataFilePtr[index])
      {
        prepare_phase = false;
        break;
      }
    }
    ndbrequire(!prepare_phase);
    ptr.p->tables.first(tabPtr);
  }
  ndbrequire(insertFileHeader(BackupFormat::LCP_FILE, ptr.p, filePtr.p));
  /**
   * Insert the table list in the ctl file
   */
  FsBuffer & buf = filePtr.p->operation.dataBuffer;
  const Uint32 sz = (sizeof(BackupFormat::CtlFile::TableList) >> 2);
  Uint32 * dst;
  ndbrequire(sz < buf.getMaxWrite());
  ndbrequire(buf.getWritePtr(&dst, sz));

  BackupFormat::CtlFile::TableList* tl =
    (BackupFormat::CtlFile::TableList*)dst;

  tl->SectionType   = htonl(BackupFormat::TABLE_LIST);
  tl->SectionLength = htonl(sz);
  tl->TableIds[0] = htonl(tabPtr.p->tableId);
  buf.updateWritePtr(sz);

  if (prepare_phase)
  {
    jam();
    /**
     * Start getting table definition data
     */
    signal->theData[0] = BackupContinueB::BUFFER_FULL_META;
    signal->theData[1] = ptr.i;
    signal->theData[2] = tabPtr.i;
    signal->theData[3] = filePtr.i;
    sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
    return;
  }
  else
  {
    jam();
    FragmentPtr fragPtr;
    tabPtr.p->fragments.getPtr(fragPtr, 0);
    init_file_for_lcp(signal, index, ptr, ptr.i);
    ptr.p->m_num_lcp_data_files_open++;
    ndbrequire(ptr.p->m_num_lcp_data_files_open <= ptr.p->m_num_lcp_files);
    if (ptr.p->m_num_lcp_data_files_open < ptr.p->m_num_lcp_files)
    {
      jam();
      return;
    }
    /**
     * Now all files are open and we can start the actual scanning.
     * We always use the first file record to track the number of
     * scanned pages.
     */
    BackupFilePtr zeroFilePtr;
    c_backupFilePool.getPtr(zeroFilePtr, ptr.p->dataFilePtr[0]);
    Uint32 delay = 0;
    if (ERROR_INSERTED(10047))
    {
      g_eventLogger->info("(%u)Start LCP on tab(%u,%u) 3 seconds delay, max_page: %u",
                          instance(),
                          tabPtr.p->tableId,
                          fragPtr.p->fragmentId,
                          ptr.p->m_lcp_max_page_cnt);

      if (ptr.p->m_lcp_max_page_cnt > 20)
      {
        delay = 3000;
      }
    }
    sendScanFragReq(signal, ptr, zeroFilePtr, tabPtr, fragPtr, delay);
  }
}

void
Backup::execEND_LCPREQ(Signal* signal)
{
  BackupRecordPtr ptr;
  {
    EndLcpReq* req= (EndLcpReq*)signal->getDataPtr();
    c_backupPool.getPtr(ptr, req->backupPtr);
    ptr.p->senderData = req->senderData;
  }
  jamEntry();
  ndbrequire(ptr.p->is_lcp());

  BackupFilePtr filePtr;
  ptr.p->files.getPtr(filePtr, ptr.p->prepareCtlFilePtr[0]);
  ndbrequire(filePtr.p->m_flags == 0);
  ptr.p->files.getPtr(filePtr, ptr.p->prepareCtlFilePtr[1]);
  ndbrequire(filePtr.p->m_flags == 0);
  ptr.p->files.getPtr(filePtr, ptr.p->prepareDataFilePtr[0]);
  ndbrequire(filePtr.p->m_flags == 0);
  ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr);
  ndbrequire(filePtr.p->m_flags == 0);
  ptr.p->files.getPtr(filePtr, ptr.p->dataFilePtr[0]);
  ndbrequire(filePtr.p->m_flags == 0);

  ptr.p->errorCode = 0;
  ptr.p->slaveState.setState(CLEANING);
  ptr.p->slaveState.setState(INITIAL);
  ptr.p->slaveState.setState(DEFINING);
  ptr.p->slaveState.setState(DEFINED);

  DEB_LCP(("(%u)TAGE Send SYNC_EXTENT_PAGES_REQ", instance()));
  /**
   * As part of ending the LCP we need to ensure that the extent pages
   * are synchronised. This ensures that the case of dropped tables
   * after completing a fragment LCP is handled properly. These extent
   * pages need to be synchronised at the end of the LCP since after
   * the end of the LCP we will inform TSMAN that it is free to start
   * sharing those pages again, and then we need to ensure that the
   * free status is up-to-date in preparation for a potential restart.
   */
  ptr.p->m_wait_final_sync_extent = true;
  ptr.p->m_num_sync_extent_pages_written = Uint32(~0);
  ptr.p->m_start_sync_op = getHighResTimer();
  {
    SyncExtentPagesReq *req = (SyncExtentPagesReq*)signal->getDataPtrSend();
    req->senderData = ptr.i;
    req->senderRef = reference();
    if (ptr.p->m_first_fragment)
    {
      jam();
      ptr.p->m_first_fragment = false;
      req->lcpOrder = SyncExtentPagesReq::FIRST_AND_END_LCP;
    }
    else
    {
      jam();
      req->lcpOrder = SyncExtentPagesReq::END_LCP;
    }
    sendSignal(PGMAN_REF, GSN_SYNC_EXTENT_PAGES_REQ, signal,
               SyncExtentPagesReq::SignalLength, JBB);
  }
  return;
}

void
Backup::finish_end_lcp(Signal *signal, BackupRecordPtr ptr)
{
  DEB_LCP(("(%u)TAGE SYNC_EXTENT_PAGES_CONF: lcpId: %u",
          instance(),
          ptr.p->backupId));
  ptr.p->m_wait_final_sync_extent = false;
  LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                m_delete_lcp_file_head);
  if (!queue.isEmpty())
  {
    jam();
    ptr.p->m_wait_end_lcp = true;
    return;
  }
  /**
   * The delete LCP file queue is empty; this means that we are sure
   * that all reported LCP_FRAG_REP's are actually completed. DIH
   * will not consider any LCP_FRAG_REP ok to use until it has
   * received LCP_COMPLETE_REP, so we need to wait to send this
   * signal until we have emptied the queue and thus completed
   * the full LCP.
   */
  sendEND_LCPCONF(signal, ptr);
}

void
Backup::sendEND_LCPCONF(Signal *signal, BackupRecordPtr ptr)
{
  DEB_LCP(("(%u)TAGE END_LCPREQ: lcpId: %u",
          instance(),
          ptr.p->backupId));
  ndbrequire(!ptr.p->m_wait_end_lcp);
  ptr.p->backupId = 0; /* Ensure next LCP_PREPARE_REQ sees a new LCP id */

  {
    NDB_TICKS now = getHighResTimer();
    Uint64 lcp_elapsed_us =
      NdbTick_Elapsed(ptr.p->m_high_res_lcp_start_time, now).microSec();
    Uint64 dd_percentage = 100 * m_current_dd_time_us;
    dd_percentage = dd_percentage / lcp_elapsed_us;
    m_last_lcp_dd_percentage = dd_percentage;
    c_pgman->set_lcp_dd_percentage(dd_percentage);
  }
  DEB_LCP_STAT(("(%u)Bytes written in this LCP: %llu MB, dd_percent: %u",
                 instance(),
                 ptr.p->noOfBytes / (1024 * 1024),
                 m_last_lcp_dd_percentage));
  lcp_end_point();

  EndLcpConf* conf= (EndLcpConf*)signal->getDataPtrSend();
  conf->senderData = ptr.p->senderData;
  conf->senderRef = reference();
  sendSignal(ptr.p->masterRef, GSN_END_LCPCONF,
             signal, EndLcpConf::SignalLength, JBA);
}

void
Backup::lcp_close_ctl_file_drop_case(Signal *signal, BackupRecordPtr ptr)
{
  BackupFilePtr filePtr;
  for (Uint32 i = 0; i < 2; i++)
  {
    c_backupFilePool.getPtr(filePtr, ptr.p->prepareCtlFilePtr[i]);
    if ((filePtr.p->m_flags & BackupFile::BF_OPEN) != 0)
    {
      jam();
      /* Still waiting for the second file to close */
      return;
    }
  }
  /* Now it is time to start removing the data files. */
  DeleteLcpFilePtr deleteLcpFilePtr;
  TablePtr tabPtr;
  FragmentPtr fragPtr;
  ndbrequire(c_deleteLcpFilePool.seize(deleteLcpFilePtr));
  LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                m_delete_lcp_file_head);

  /**
   * We avoid all complexity in this code since it is an obscure case
   * that should be extremely rare. So we simply delete all potential
   * files.
   */
  ptr.p->prepare_table.first(tabPtr);
  tabPtr.p->fragments.getPtr(fragPtr, 0);
  deleteLcpFilePtr.p->tableId = fragPtr.p->tableId;
  deleteLcpFilePtr.p->fragmentId = fragPtr.p->fragmentId;
  deleteLcpFilePtr.p->firstFileId = 0;
  deleteLcpFilePtr.p->lastFileId = BackupFormat::NDB_MAX_LCP_FILES - 1;
  deleteLcpFilePtr.p->waitCompletedGci = 0;
  deleteLcpFilePtr.p->validFlag = 1;
  deleteLcpFilePtr.p->lcpCtlFileNumber =
    ptr.p->prepareNextLcpCtlFileNumber == 0 ? 1 : 0;
  queue.addFirst(deleteLcpFilePtr);
  if (!m_delete_lcp_files_ongoing)
  {
    jam();
    m_delete_lcp_files_ongoing = true;
    signal->theData[0] = BackupContinueB::ZDELETE_LCP_FILE;
    signal->theData[1] = ptr.i;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
  }
  /**
   * We have now closed the files, and as soon as the queue of files
   * to delete is empty we can proceed with starting the LCP.
   */
  ptr.p->prepareState = PREPARE_DROP;
  DEB_LCP(("(%u)TAGT Insert delete files in queue (drop case):"
    " tab(%u,%u), createGci: %u, waitCompletedGCI: 0",
    instance(),
    fragPtr.p->tableId,
    fragPtr.p->fragmentId,
    fragPtr.p->createGci));
}

inline
static
void setWords(const Uint64 src, Uint32& hi, Uint32& lo)
{
  hi = (Uint32) (src >> 32);
  lo = (Uint32) (src & 0xffffffff);
}
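
/**
 * setWords splits a 64-bit value into the hi/lo 32-bit words used in
 * signal data. An illustrative example: src == 0x0000000500000001
 * gives hi == 5 and lo == 1; this is how noOfRecords and noOfBytes
 * are reported in LCP_STATUS_CONF below.
 */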
16924 
16925 void
execLCP_STATUS_REQ(Signal * signal)16926 Backup::execLCP_STATUS_REQ(Signal* signal)
16927 {
16928   jamEntry();
16929   const LcpStatusReq* req = (const LcpStatusReq*) signal->getDataPtr();
16930 
16931   const Uint32 senderRef = req->senderRef;
16932   const Uint32 senderData = req->senderData;
16933   Uint32 failCode = LcpStatusRef::NoLCPRecord;
16934 
16935   /* Find LCP record */
16936   BackupRecordPtr ptr;
16937   get_lcp_record(ptr);
16938   do
16939   {
16940     jam();
16941     ndbrequire(ptr.p->is_lcp());
16942     {
16943       jam();
16944       LcpStatusConf::LcpState state = LcpStatusConf::LCP_IDLE;
16945       if (ptr.p->m_wait_end_lcp)
16946       {
16947         jam();
16948         state = LcpStatusConf::LCP_WAIT_END_LCP;
16949       }
16950       else if (ptr.p->m_wait_final_sync_extent)
16951       {
16952         jam();
16953         state = LcpStatusConf::LCP_WAIT_FINAL_SYNC_EXTENT;
16954       }
16955       else
16956       {
16957         jam();
        switch (ptr.p->slaveState.getState())
        {
        case STARTED:
          jam();
          state = LcpStatusConf::LCP_PREPARED;
          break;
        case SCANNING:
          jam();
          state = LcpStatusConf::LCP_SCANNING;
          break;
        case STOPPING:
          jam();
          if (ptr.p->m_wait_disk_data_sync)
          {
            jam();
            state = LcpStatusConf::LCP_WAIT_SYNC_DISK;
          }
          else if (ptr.p->m_wait_sync_extent)
          {
            jam();
            state = LcpStatusConf::LCP_WAIT_SYNC_EXTENT;
          }
          else if (ptr.p->m_wait_data_file_close)
          {
            jam();
            state = LcpStatusConf::LCP_SCANNED;
          }
          else if (ptr.p->m_empty_lcp)
          {
            jam();
            state = LcpStatusConf::LCP_WAIT_CLOSE_EMPTY;
          }
          else
          {
            jam();
            state = LcpStatusConf::LCP_WAIT_WRITE_CTL_FILE;
          }
          break;
        case DEFINED:
          jam();
          if (ptr.p->prepareState == NOT_ACTIVE ||
              ptr.p->prepareState == PREPARED)
          {
            jam();
            state = LcpStatusConf::LCP_IDLE;
          }
          else if (ptr.p->prepareState == PREPARE_READ_CTL_FILES)
          {
            jam();
            state = LcpStatusConf::LCP_PREPARE_READ_CTL_FILES;
          }
          else if (ptr.p->prepareState == PREPARE_OPEN_DATA_FILE)
          {
            jam();
            state = LcpStatusConf::LCP_PREPARE_OPEN_DATA_FILE;
          }
          else if (ptr.p->prepareState == PREPARE_READ_TABLE_DESC)
          {
            jam();
            state = LcpStatusConf::LCP_PREPARE_READ_TABLE_DESC;
          }
          else if (ptr.p->prepareState == PREPARE_ABORTING)
          {
            jam();
            state = LcpStatusConf::LCP_PREPARE_ABORTING;
          }
          else if (ptr.p->prepareState == PREPARE_DROP ||
                   ptr.p->prepareState == PREPARE_DROP_CLOSE)
          {
            jam();
            state = LcpStatusConf::LCP_PREPARE_WAIT_DROP_CASE;
          }
          else
          {
            jam();
            ndbout_c("Unusual LCP prepare state in LCP_STATUS_REQ() : %u",
                     ptr.p->prepareState);
            state = LcpStatusConf::LCP_IDLE;
          }
          break;
        default:
          jam();
          ndbout_c("Unusual LCP state in LCP_STATUS_REQ() : %u",
                   ptr.p->slaveState.getState());
          state = LcpStatusConf::LCP_IDLE;
        }
      }

      /* Not all values are set here; fields that do not apply to the
       * current state keep the UnsetConst sentinel. */
      const Uint32 UnsetConst = ~0;

      LcpStatusConf* conf = (LcpStatusConf*) signal->getDataPtrSend();
      conf->senderRef = reference();
      conf->senderData = senderData;
      conf->lcpState = state;
      conf->tableId = UnsetConst;
      conf->fragId = UnsetConst;
      conf->completionStateHi = UnsetConst;
      conf->completionStateLo = UnsetConst;
      setWords(ptr.p->noOfRecords,
               conf->lcpDoneRowsHi,
               conf->lcpDoneRowsLo);
      setWords(ptr.p->noOfBytes,
               conf->lcpDoneBytesHi,
               conf->lcpDoneBytesLo);
      conf->lcpScannedPages = 0;
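
      /**
       * completionState is overloaded per reported state: rows scanned
       * so far while scanning, the file-buffer backlog still to flush
       * once scanned, page counts during the sync phases, and the
       * newest restorable GCI while waiting for the end of the LCP.
       * The branches below fill it in accordingly.
       */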
      if (state == LcpStatusConf::LCP_SCANNING ||
          state == LcpStatusConf::LCP_WAIT_SYNC_DISK ||
          state == LcpStatusConf::LCP_WAIT_SYNC_EXTENT ||
          state == LcpStatusConf::LCP_WAIT_WRITE_CTL_FILE ||
          state == LcpStatusConf::LCP_WAIT_CLOSE_EMPTY ||
          state == LcpStatusConf::LCP_SCANNED)
      {
        jam();
        /* Actually scanning/closing a fragment, let's grab the details */
        TablePtr tabPtr;
        FragmentPtr fragPtr;
        BackupFilePtr filePtr;

        if (ptr.p->dataFilePtr[0] == RNIL)
        {
          jam();
          failCode = LcpStatusRef::NoFileRecord;
          break;
        }
        c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[0]);
        ndbrequire(filePtr.p->backupPtr == ptr.i);

        ptr.p->tables.first(tabPtr);
        if (tabPtr.i != RNIL)
        {
          jam();
          tabPtr.p->fragments.getPtr(fragPtr, 0);
          ndbrequire(fragPtr.p->tableId == tabPtr.p->tableId);
          conf->tableId = tabPtr.p->tableId;
          conf->fragId = fragPtr.p->fragmentId;
        }

        if (state == LcpStatusConf::LCP_SCANNING)
        {
          jam();
          setWords(filePtr.p->operation.noOfRecords,
                   conf->completionStateHi,
                   conf->completionStateLo);
          conf->lcpScannedPages = filePtr.p->operation.lcpScannedPages;
        }
        else if (state == LcpStatusConf::LCP_SCANNED)
        {
          jam();
          BackupFilePtr tmp_filePtr;
          Uint64 flushBacklog = 0;
          for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
          {
            c_backupFilePool.getPtr(tmp_filePtr, ptr.p->dataFilePtr[i]);
            /* Draining the filesystem buffer may take some time,
             * depending on the buffer size and the achieved write rate.
             * We report the buffer fill level so that requestors can
             * observe whether this phase is making progress.
             */
            flushBacklog +=
              tmp_filePtr.p->operation.dataBuffer.getUsableSize() -
              tmp_filePtr.p->operation.dataBuffer.getFreeSize();
          }
          setWords(flushBacklog,
                   conf->completionStateHi,
                   conf->completionStateLo);
        }
        else if (state == LcpStatusConf::LCP_WAIT_SYNC_DISK)
        {
          jam();
          conf->completionStateHi = 0;
          conf->completionStateLo = ptr.p->m_num_sync_pages_waiting;
        }
        else if (state == LcpStatusConf::LCP_WAIT_SYNC_EXTENT)
        {
          jam();
          conf->completionStateHi = 0;
          conf->completionStateLo = ptr.p->m_num_sync_extent_pages_written;
        }
        else if (state == LcpStatusConf::LCP_WAIT_WRITE_CTL_FILE)
        {
          jam();
          conf->completionStateHi = 0;
          conf->completionStateLo = 0;
        }
        else if (state == LcpStatusConf::LCP_WAIT_CLOSE_EMPTY)
        {
          jam();
          conf->completionStateHi = 0;
          conf->completionStateLo = ptr.p->m_outstanding_operations;
        }
        else
        {
          ndbabort(); // Impossible state
        }
      }
      else if (state == LcpStatusConf::LCP_WAIT_END_LCP)
      {
        jam();
        DeleteLcpFilePtr deleteLcpFilePtr;
        LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                      m_delete_lcp_file_head);
        ndbrequire(!queue.isEmpty());
        conf->completionStateHi = 0;
        conf->completionStateLo = m_newestRestorableGci;
      }
      else if (state == LcpStatusConf::LCP_WAIT_FINAL_SYNC_EXTENT)
      {
        jam();
        conf->completionStateHi = 0;
        conf->completionStateLo = ptr.p->m_num_sync_extent_pages_written;
      }
      else if (state == LcpStatusConf::LCP_PREPARED)
      {
        /**
         * We are closing the LCP control files for an idle
         * fragment LCP.
         */
        jam();
        TablePtr tabPtr;
        FragmentPtr fragPtr;
        ptr.p->tables.first(tabPtr);
        ndbrequire(tabPtr.i != RNIL);
        tabPtr.p->fragments.getPtr(fragPtr, 0);
        ndbrequire(fragPtr.p->tableId == tabPtr.p->tableId);
        conf->tableId = tabPtr.p->tableId;
        conf->fragId = fragPtr.p->fragmentId;
      }

      failCode = 0;
    }
  } while (false);

  if (failCode == 0)
  {
    jam();
    sendSignal(senderRef, GSN_LCP_STATUS_CONF,
               signal, LcpStatusConf::SignalLength, JBB);
    return;
  }

  jam();
  LcpStatusRef* ref = (LcpStatusRef*) signal->getDataPtrSend();

  ref->senderRef = reference();
  ref->senderData = senderData;
  ref->error = failCode;

  sendSignal(senderRef, GSN_LCP_STATUS_REF,
             signal, LcpStatusRef::SignalLength, JBB);
  return;
}
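
/**
 * Sketch of how a requestor could drive this handler (the field names
 * match LcpStatusReq as used above; backupRef and myRequestId are
 * hypothetical placeholders for illustration only):
 *
 *   LcpStatusReq* req = (LcpStatusReq*)signal->getDataPtrSend();
 *   req->senderRef = reference();
 *   req->senderData = myRequestId;
 *   sendSignal(backupRef, GSN_LCP_STATUS_REQ, signal,
 *              LcpStatusReq::SignalLength, JBB);
 *
 * The reply arrives as LCP_STATUS_CONF (state plus progress counters)
 * or LCP_STATUS_REF (error code) addressed to senderRef.
 */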

bool
Backup::get_backup_record(BackupRecordPtr &ptr)
{
  /**
   * When a backup is running, its record is the first one in
   * c_backups; otherwise the first record is the LCP record. We
   * return true if a backup record is found and false otherwise.
   */
  c_backups.first(ptr);
  if (ptr.p->is_lcp())
  {
    ptr.i = RNIL;
    ptr.p = 0;
    return false;
  }
  return true;
}
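
/* Usage sketch (illustrative only): callers check the return value to
 * see whether a backup is in progress:
 *
 *   BackupRecordPtr ptr;
 *   if (get_backup_record(ptr)) {
 *     // a backup record exists and ptr is valid
 *   }
 */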
17229 
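/**
 * Locate the LCP record in c_backups. Exactly one LCP record is
 * expected to exist at all times, so we crash via ndbrequire if the
 * scan finds none.
 */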
void
Backup::get_lcp_record(BackupRecordPtr &ptr)
{
  for (c_backups.first(ptr); ptr.i != RNIL; c_backups.next(ptr))
  {
    if (ptr.p->is_lcp())
    {
      return;
    }
  }
  ndbrequire(false);
}

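/**
 * Record the current UNDO log fill level (as a percentage) and track
 * the high-water mark seen so far.
 */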
void
Backup::set_undo_log_level(Uint32 percentage)
{
  m_undo_log_level_percentage = percentage;
  if (percentage > m_max_undo_log_level_percentage)
  {
    jam();
    m_max_undo_log_level_percentage = percentage;
  }
}

bool Backup::g_is_single_thr_backup_running = false;