1 /*
2 Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License, version 2.0,
6 as published by the Free Software Foundation.
7
8 This program is also distributed with certain software (including
9 but not limited to OpenSSL) that is licensed under separate terms,
10 as designated in a particular file or component or in included license
11 documentation. The authors of MySQL hereby grant you an additional
12 permission to link the program and your derivative works with the
13 separately licensed software that they have included with MySQL.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License, version 2.0, for more details.
19
20 You should have received a copy of the GNU General Public License
21 along with this program; if not, write to the Free Software
22 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 #include "Backup.hpp"
26
27 #include <ndb_version.h>
28
29 #include <NdbTCP.h>
30 #include <Bitmask.hpp>
31
32 #include <signaldata/NodeFailRep.hpp>
33 #include <signaldata/ReadNodesConf.hpp>
34
35 #include <signaldata/DihScanTab.hpp>
36 #include <signaldata/DiGetNodes.hpp>
37 #include <signaldata/ScanFrag.hpp>
38
39 #include <signaldata/GetTabInfo.hpp>
40 #include <signaldata/DictTabInfo.hpp>
41 #include <signaldata/ListTables.hpp>
42
43 #include <signaldata/FsOpenReq.hpp>
44 #include <signaldata/FsAppendReq.hpp>
45 #include <signaldata/FsCloseReq.hpp>
46 #include <signaldata/FsConf.hpp>
47 #include <signaldata/FsRef.hpp>
48 #include <signaldata/FsRemoveReq.hpp>
49 #include <signaldata/FsReadWriteReq.hpp>
50
51 #include <signaldata/BackupImpl.hpp>
52 #include <signaldata/BackupSignalData.hpp>
53 #include <signaldata/BackupContinueB.hpp>
54 #include <signaldata/EventReport.hpp>
55
56 #include <signaldata/UtilSequence.hpp>
57
58 #include <signaldata/CreateTrigImpl.hpp>
59 #include <signaldata/DropTrigImpl.hpp>
60 #include <signaldata/FireTrigOrd.hpp>
61 #include <signaldata/TrigAttrInfo.hpp>
62 #include <AttributeHeader.hpp>
63
64 #include <signaldata/WaitGCP.hpp>
65 #include <signaldata/LCP.hpp>
66 #include <signaldata/BackupLockTab.hpp>
67 #include <signaldata/DumpStateOrd.hpp>
68
69 #include <signaldata/DumpStateOrd.hpp>
70
71 #include <signaldata/DbinfoScan.hpp>
72 #include <signaldata/TransIdAI.hpp>
73
74 #include <NdbTick.h>
75 #include <dbtup/Dbtup.hpp>
76
77 #include <EventLogger.hpp>
78 extern EventLogger * g_eventLogger;
79
80 #include <math.h>
81
82 #define JAM_FILE_ID 475
83
/* Delay (in ms) before retrying when the disk buffer has no free capacity. */
static const Uint32 WaitDiskBufferCapacityMillis = 1;
/* Delay (in ms) before retrying a scan that hit a temporary error. */
static const Uint32 WaitScanTempErrorRetryMillis = 10;

/* NOTE(review): file-scope, not per-instance — presumably set once at
 * start of a timed activity; confirm usage in the rest of the file. */
static NDB_TICKS startTime;
88
89 #if (defined(VM_TRACE) || defined(ERROR_INSERT))
90 //#define DEBUG_LCP 1
91 //#define DEBUG_LCP_ROW 1
92 //#define DEBUG_LCP_DEL_FILES 1
93 //#define DEBUG_LCP_DEL 1
94 //#define DEBUG_EXTRA_LCP 1
95 //#define DEBUG_REDO_CONTROL 1
96 //#define DEBUG_REDO_CONTROL_DETAIL 1
97 //#define DEBUG_LCP_DD 1
98 //#define DEBUG_LCP_STAT 1
99 //#define DEBUG_LCP_LAG 1
100 #endif
101
102 #ifdef DEBUG_REDO_CONTROL
103 #define DEB_REDO_CONTROL(arglist) do { g_eventLogger->info arglist ; } while (0)
104 #else
105 #define DEB_REDO_CONTROL(arglist) do { } while (0)
106 #endif
107
108 #ifdef DEBUG_REDO_CONTROL_DETAIL
109 #define DEB_REDO_CONTROL_DETAIL(arglist) do { g_eventLogger->info arglist ; } while (0)
110 #else
111 #define DEB_REDO_CONTROL_DETAIL(arglist) do { } while (0)
112 #endif
113
114 #ifdef DEBUG_LCP
115 #define DEB_LCP(arglist) do { g_eventLogger->info arglist ; } while (0)
116 #else
117 #define DEB_LCP(arglist) do { } while (0)
118 #endif
119
120 #ifdef DEBUG_LCP_DD
121 #define DEB_LCP_DD(arglist) do { g_eventLogger->info arglist ; } while (0)
122 #else
123 #define DEB_LCP_DD(arglist) do { } while (0)
124 #endif
125
126 #ifdef DEBUG_LCP_DEL_FILES
127 #define DEB_LCP_DEL_FILES(arglist) do { g_eventLogger->info arglist ; } while (0)
128 #else
129 #define DEB_LCP_DEL_FILES(arglist) do { } while (0)
130 #endif
131
132 #ifdef DEBUG_LCP_DEL
133 #define DEB_LCP_DEL(arglist) do { g_eventLogger->info arglist ; } while (0)
134 #else
135 #define DEB_LCP_DEL(arglist) do { } while (0)
136 #endif
137
138 #ifdef DEBUG_LCP_STAT
139 #define DEB_LCP_STAT(arglist) do { g_eventLogger->info arglist ; } while (0)
140 #else
141 #define DEB_LCP_STAT(arglist) do { } while (0)
142 #endif
143
144 #ifdef DEBUG_LCP_LAG
145 #define DEB_LCP_LAG(arglist) do { g_eventLogger->info arglist ; } while (0)
146 #else
147 #define DEB_LCP_LAG(arglist) do { } while (0)
148 #endif
149
150 #ifdef DEBUG_EXTRA_LCP
151 #define DEB_EXTRA_LCP(arglist) do { g_eventLogger->info arglist ; } while (0)
152 #else
153 #define DEB_EXTRA_LCP(arglist) do { } while (0)
154 #endif
155
156 #ifdef VM_TRACE
157 #define DEBUG_OUT(x) ndbout << x << endl
158 #else
159 #define DEBUG_OUT(x)
160 #endif
161
162 //#define DEBUG_ABORT
163 //#define dbg globalSignalLoggers.log
164
/* Type of node start, saved in STTOR phase 3 (see execSTTOR). */
static Uint32 g_TypeOfStart = NodeState::ST_ILLEGAL_TYPE;

/*
 * Flag extraction from the BackupReq flags word.
 * NOTE(review): COMPLETED tests '> 1' where STARTED tests '> 0' on the
 * same WAITCOMPLETED mask — this only distinguishes the two if
 * WAITCOMPLETED covers more than one bit; confirm the mask value in
 * BackupSignalData.hpp before touching these.
 */
#define SEND_BACKUP_STARTED_FLAG(A) (((A) & BackupReq::WAITCOMPLETED) > 0)
#define SEND_BACKUP_COMPLETED_FLAG(A) (((A) & BackupReq::WAITCOMPLETED) > 1)
#define MT_BACKUP_FLAG(A) (((A) & BackupReq::MT_BACKUP) > 0)

/**
 * "Magic" constants used for adaptive LCP speed algorithm. These magic
 * constants tries to ensure a smooth LCP load which is high enough to
 * avoid slowing down LCPs such that we run out of REDO logs. Also low
 * enough to avoid that we use so much CPU on LCPs that we block out
 * most user transactions. We also want to avoid destroying real-time
 * characteristics due to LCPs.
 *
 * See much longer explanation of these values below.
 */
#define HIGH_LOAD_LEVEL 32
#define VERY_HIGH_LOAD_LEVEL 48
#define NUMBER_OF_SIGNALS_PER_SCAN_BATCH 3
#define MAX_RAISE_PRIO_MEMORY 16
185
void
Backup::execSTTOR(Signal* signal)
{
  jamEntry();

  /* STTOR signal layout: theData[1] = start phase, theData[7] = start type. */
  const Uint32 startphase = signal->theData[1];
  const Uint32 typeOfStart = signal->theData[7];

  if (startphase == 1)
  {
    /* Cache pointers to the sibling blocks of this LDM instance. */
    ndbrequire((c_lqh = (Dblqh*)globalData.getBlock(DBLQH, instance())) != 0);
    ndbrequire((c_tup = (Dbtup*)globalData.getBlock(DBTUP, instance())) != 0);
    ndbrequire((c_lgman =
                (Lgman*)globalData.getBlock(LGMAN, instance())) != 0);
    ndbrequire((c_pgman =
                (Pgman*)globalData.getBlock(PGMAN, instance())) != 0);

    /* Reset all disk write speed accounting and LCP/backup state. */
    m_words_written_this_period = 0;
    m_backup_words_written_this_period = 0;
    last_disk_write_speed_report = 0;
    next_disk_write_speed_report = 0;
    m_monitor_words_written = 0;
    m_backup_monitor_words_written = 0;
    m_periods_passed_in_monitor_period = 0;
    m_monitor_snapshot_start = NdbTick_getCurrentTicks();
    m_curr_lcp_id = 0;
    /* While our own node restarts we may checkpoint at maximum speed. */
    m_curr_disk_write_speed = c_defaults.m_disk_write_speed_max_own_restart;
    m_curr_backup_disk_write_speed =
      c_defaults.m_disk_write_speed_max_own_restart;
    m_overflow_disk_write = 0;
    m_backup_overflow_disk_write = 0;
    slowdowns_due_to_io_lag = 0;
    slowdowns_due_to_high_cpu = 0;
    disk_write_speed_set_to_min = 0;
    m_is_lcp_running = false;
    m_is_backup_running = false;
    m_is_any_node_restarting = false;
    m_node_restart_check_sent = false;
    m_our_node_started = false;
    m_lcp_ptr.i = RNIL;
    m_lcp_ptr.p = 0;
    m_first_lcp_started = false;
    m_newestRestorableGci = 0;
    m_delete_lcp_files_ongoing = false;
    m_reset_disk_speed_time = NdbTick_getCurrentTicks();
    m_reset_delay_used = Backup::DISK_SPEED_CHECK_DELAY;
    c_initial_start_lcp_not_done_yet = false;
    m_redo_alert_factor = 1;
    m_redo_alert_state = RedoStateRep::NO_REDO_ALERT;
    /* Kick off the periodic disk speed check loop (self-CONTINUEB). */
    signal->theData[0] = BackupContinueB::RESET_DISK_SPEED_COUNTER;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                        Backup::DISK_SPEED_CHECK_DELAY, 1);
  }
  if (startphase == 3)
  {
    jam();

    g_TypeOfStart = typeOfStart;
    if (g_TypeOfStart == NodeState::ST_INITIAL_START ||
        g_TypeOfStart == NodeState::ST_INITIAL_NODE_RESTART)
    {
      jam();
      c_initial_start_lcp_not_done_yet = true;
    }
    /* Fetch the node list; sendSTTORRY is deferred to execREAD_NODESCONF. */
    signal->theData[0] = reference();
    sendSignal(NDBCNTR_REF, GSN_READ_NODESREQ, signal, 1, JBB);
    return;
  }//if

  if (startphase == 7)
  {
    /* Node is started: drop from restart speed to the configured minimum. */
    m_monitor_words_written = 0;
    m_backup_monitor_words_written = 0;
    m_periods_passed_in_monitor_period = 0;
    m_monitor_snapshot_start = NdbTick_getCurrentTicks();
    m_curr_disk_write_speed = c_defaults.m_disk_write_speed_min;
    m_curr_backup_disk_write_speed = c_defaults.m_disk_write_speed_min;
    m_our_node_started = true;
    c_initial_start_lcp_not_done_yet = false;
  }

  /* On an initial start the master creates the backup id sequence in
   * DBUTIL before reporting phase 7 done (sendSTTORRY comes via the
   * UTIL_SEQUENCE confirmation path in that case). */
  if(startphase == 7 && g_TypeOfStart == NodeState::ST_INITIAL_START &&
     c_masterNodeId == getOwnNodeId() && !isNdbMtLqh()){
    jam();
    createSequence(signal);
    return;
  }//if

  sendSTTORRY(signal);
  return;
}//Backup::execSTTOR()
277
void
Backup::execREAD_NODESCONF(Signal* signal)
{
  jamEntry();
  ReadNodesConf * conf = (ReadNodesConf *)signal->getDataPtr();

  {
    /* The node bitmasks arrive as one long signal section; copy them
     * into the fixed-size signal struct before reading. */
    ndbrequire(signal->getNoOfSections() == 1);
    SegmentedSectionPtr ptr;
    SectionHandle handle(this, signal);
    handle.getSection(ptr, 0);
    ndbrequire(ptr.sz == 5 * NdbNodeBitmask::Size);
    copy((Uint32*)&conf->definedNodes.rep.data, ptr);
    releaseSections(handle);
  }

  c_aliveNodes.clear();

  /* Seize one node record per defined node and record its alive state. */
  Uint32 count = 0;
  for (Uint32 i = 0; i<MAX_NDB_NODES; i++) {
    jam();
    if (conf->definedNodes.get(i))
    {
      jam();
      count++;

      NodePtr node;
      ndbrequire(c_nodes.seizeFirst(node));

      node.p->nodeId = i;
      if (conf->inactiveNodes.get(i))
      {
        jam();
        node.p->alive = 0;
      } else {
        jam();
        node.p->alive = 1;
        c_aliveNodes.set(i);
      }//if
    }//if
  }//for
  c_masterNodeId = conf->masterNodeId;
  /* Sanity: number of seized records must match the reported node count. */
  ndbrequire(count == conf->noOfNodes);
  /* Complete the start phase 3 processing deferred in execSTTOR. */
  sendSTTORRY(signal);
}
323
324 void
sendSTTORRY(Signal * signal)325 Backup::sendSTTORRY(Signal* signal)
326 {
327 signal->theData[0] = 0;
328 signal->theData[3] = 1;
329 signal->theData[4] = 3;
330 signal->theData[5] = 7;
331 signal->theData[6] = 255; // No more start phases from missra
332 BlockReference cntrRef = !isNdbMtLqh() ? NDBCNTR_REF : BACKUP_REF;
333 sendSignal(cntrRef, GSN_STTORRY, signal, 7, JBB);
334 }
335
336 void
createSequence(Signal * signal)337 Backup::createSequence(Signal* signal)
338 {
339 UtilSequenceReq * req = (UtilSequenceReq*)signal->getDataPtrSend();
340
341 req->senderData = RNIL;
342 req->sequenceId = NDB_BACKUP_SEQUENCE;
343 req->requestType = UtilSequenceReq::Create;
344
345 sendSignal(DBUTIL_REF, GSN_UTIL_SEQUENCE_REQ,
346 signal, UtilSequenceReq::SignalLength, JBB);
347 }
348
349 void
handle_overflow(Uint64 & overflow_disk_write,Uint64 & words_written_this_period,Uint64 & curr_disk_write_speed)350 Backup::handle_overflow(Uint64& overflow_disk_write,
351 Uint64& words_written_this_period,
352 Uint64& curr_disk_write_speed)
353 {
354 jam();
355 /**
356 * If we overflowed in the last period, count it in
357 * this new period, potentially overflowing again into
358 * future periods...
359 *
360 * The overflow can only come from the last write we did in this
361 * period, but potentially this write is bigger than what we are
362 * allowed to write during one period.
363 *
364 * Calculate the overflow to pass into the new period
365 * (overflowThisPeriod). It can never be more than what is
366 * allowed to be written during a period.
367 *
368 * We could rarely end up in the case that the overflow of the
369 * last write in the period even overflows the entire next period.
370 * If so we put this into the remainingOverFlow and put this into
371 * overflow_disk_write (in this case nothing will be written in
372 * this period so ready_to_write need not worry about this case
373 * when setting overflow_disk_write since it isn't written any time
374 * in this case and in all other cases only written by the last write
375 * in a period.
376 *
377 * This routine is called both for collective LCP and Backup overflow
378 * and for only Backup overflow.
379 */
380 Uint64 overflowThisPeriod = MIN(overflow_disk_write,
381 curr_disk_write_speed + 1);
382
383 /* How much overflow remains after this period? */
384 Uint64 remainingOverFlow = overflow_disk_write - overflowThisPeriod;
385
386 if (overflowThisPeriod)
387 {
388 jam();
389 #ifdef DEBUG_CHECKPOINTSPEED
390 ndbout_c("Overflow of %u bytes (max/period is %u bytes)",
391 overflowThisPeriod * 4, curr_disk_write_speed * 4);
392 #endif
393 if (remainingOverFlow)
394 {
395 jam();
396 #ifdef DEBUG_CHECKPOINTSPEED
397 ndbout_c(" Extra overflow : %u bytes, will take %u further periods"
398 " to clear", remainingOverFlow * 4,
399 remainingOverFlow / curr_disk_write_speed);
400 #endif
401 }
402 }
403 if (true || curr_disk_write_speed == m_curr_disk_write_speed)
404 {
405 DEB_REDO_CONTROL_DETAIL(("(%u)bytes_written_this_period: %llu kB, "
406 " overflowThisPeriod: %llu kB, "
407 " remainingOverFlow: %llu kB, "
408 " curr_disk_write_speed %llu kB",
409 instance(),
410 words_written_this_period / 256,
411 overflowThisPeriod / 256,
412 remainingOverFlow / 256,
413 curr_disk_write_speed / 256));
414 }
415 words_written_this_period = overflowThisPeriod;
416 overflow_disk_write = remainingOverFlow;
417 }
418
void
Backup::calculate_next_delay(const NDB_TICKS curr_time)
{
  /**
   * Adjust for upto 10 millisecond delay of this signal. Longer
   * delays will not be handled, in this case the system is most
   * likely under too high load and it won't matter very much that
   * we decrease the speed of checkpoints.
   *
   * We use a technique where we allow an overflow write in one
   * period. This overflow will be removed from the next period
   * such that the load will at average be as specified.
   * Calculate new delay time based on if we overslept or underslept
   * this time. We will never regulate more than 10ms, if the
   * oversleep is bigger than we will simply ignore it. We will
   * decrease the delay by as much as we overslept or increase it by
   * as much as we underslept.
   */
  int delay_time = m_reset_delay_used;
  /* Actual elapsed time since the previous period reset, in ms. */
  int sig_delay = int(NdbTick_Elapsed(m_reset_disk_speed_time,
                                      curr_time).milliSec());
  if (sig_delay > delay_time + 10)
  {
    /* Overslept by more than 10ms: cap the compensation at -10ms. */
    delay_time = Backup::DISK_SPEED_CHECK_DELAY - 10;
  }
  else if (sig_delay < delay_time - 10)
  {
    /* Underslept by more than 10ms: cap the compensation at +10ms. */
    delay_time = Backup::DISK_SPEED_CHECK_DELAY + 10;
  }
  else
  {
    /* Compensate for exactly how much we over/underslept. */
    delay_time = Backup::DISK_SPEED_CHECK_DELAY -
                 (sig_delay - delay_time);
  }
  m_periods_passed_in_monitor_period++;
  m_reset_delay_used= delay_time;
  m_reset_disk_speed_time = curr_time;
#if 0
  ndbout << "Signal delay was = " << sig_delay;
  ndbout << " Current time = " << curr_time << endl;
  ndbout << " Delay time will be = " << delay_time << endl << endl;
#endif
}
462
463 void
report_disk_write_speed_report(Uint64 bytes_written_this_period,Uint64 backup_bytes_written_this_period,Uint64 millis_passed)464 Backup::report_disk_write_speed_report(Uint64 bytes_written_this_period,
465 Uint64 backup_bytes_written_this_period,
466 Uint64 millis_passed)
467 {
468 Uint32 report = next_disk_write_speed_report;
469 disk_write_speed_rep[report].backup_bytes_written =
470 backup_bytes_written_this_period;
471 disk_write_speed_rep[report].backup_lcp_bytes_written =
472 bytes_written_this_period;
473 disk_write_speed_rep[report].millis_passed =
474 millis_passed;
475 disk_write_speed_rep[report].redo_bytes_written =
476 c_lqh->report_redo_written_bytes();
477 disk_write_speed_rep[report].target_disk_write_speed =
478 m_curr_disk_write_speed * CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS;
479 disk_write_speed_rep[report].target_backup_disk_write_speed =
480 m_curr_backup_disk_write_speed * CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS;
481
482 next_disk_write_speed_report++;
483 if (next_disk_write_speed_report == DISK_WRITE_SPEED_REPORT_SIZE)
484 {
485 next_disk_write_speed_report = 0;
486 }
487 if (next_disk_write_speed_report == last_disk_write_speed_report)
488 {
489 last_disk_write_speed_report++;
490 if (last_disk_write_speed_report == DISK_WRITE_SPEED_REPORT_SIZE)
491 {
492 last_disk_write_speed_report = 0;
493 }
494 }
495 }
496
497 #define DELETE_RECOVERY_WORK 120
498 /**
499 * This method is a check that we haven't been writing faster than we're
500 * supposed to during the last interval.
501 */
void
Backup::monitor_disk_write_speed(const NDB_TICKS curr_time,
                                 const Uint64 millisPassed)
{
  /**
   * Independent check of DiskCheckpointSpeed.
   * We check every second or so that we are roughly sticking
   * to our diet.
   */
  jam();
  /* +1: a partially completed period still counts as one period. */
  const Uint64 periodsPassed =
    (millisPassed / DISK_SPEED_CHECK_DELAY) + 1;
  const Uint64 quotaWordsPerPeriod = m_curr_disk_write_speed;
  const Uint64 quotaWordsPerPeriodBackup = m_curr_backup_disk_write_speed;
  /* Allow the slack of one maximum-sized write (bytes -> words). */
  const Uint64 maxOverFlowWords = c_defaults.m_maxWriteSize / 4;
  const Uint64 maxExpectedWords = (periodsPassed * quotaWordsPerPeriod) +
                                  maxOverFlowWords;
  const Uint64 maxExpectedWordsBackup = (periodsPassed *
                                         quotaWordsPerPeriodBackup) +
                                        maxOverFlowWords;

  if (unlikely((m_monitor_words_written > maxExpectedWords) ||
               (m_backup_monitor_words_written > maxExpectedWordsBackup)))
  {
    jam();
    /**
     * In the last monitoring interval, we have written more words
     * than allowed by the quota (DiskCheckpointSpeed), including
     * transient spikes due to a single MaxBackupWriteSize write
     */
    ndbout << "Backup : Excessive Backup/LCP write rate in last"
           << " monitoring period - recorded = "
           << (m_monitor_words_written * 4 * 1000) / millisPassed
           << " bytes/s, "
           << endl
           << "Recorded writes to backup: "
           << (m_backup_monitor_words_written * 4 * 1000) / millisPassed
           << " bytes/s, "
           << endl;
    ndbout << "Current speed is = "
           << m_curr_disk_write_speed *
              CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS
           << " bytes/s"
           << endl;
    ndbout << "Current backup speed is = "
           << m_curr_backup_disk_write_speed *
              CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS
           << " bytes/s"
           << endl;
    ndbout << "Backup : Monitoring period : " << millisPassed
           << " millis. Bytes written : " << (m_monitor_words_written * 4)
           << ". Max allowed : " << (maxExpectedWords * 4) << endl;
    ndbout << "Backup : Monitoring period : " << millisPassed
           << " millis. Bytes written : "
           << (m_backup_monitor_words_written * 4)
           << ". Max allowed : " << (maxExpectedWordsBackup * 4) << endl;
    ndbout << "Actual number of periods in this monitoring interval: ";
    ndbout << m_periods_passed_in_monitor_period;
    ndbout << " calculated number was: " << periodsPassed << endl;
  }
  /* Counters are in words; the report interface takes bytes. */
  report_disk_write_speed_report(4 * m_monitor_words_written,
                                 4 * m_backup_monitor_words_written,
                                 millisPassed);
  /**
   * The LCP write rate is removed from the calculated LCP change rate to
   * derive the lag (a lag is a positive number, if we are ahead of the
   * calculated rate we report it as a negative number).
   * We keep track of the lag since the start of the LCP and since the
   * start of the previous LCP.
   */
  /* LCP-only bytes = total bytes written minus backup bytes written. */
  Int64 lag = m_lcp_change_rate -
              ((4 * m_monitor_words_written) -
               (4 * m_backup_monitor_words_written));
  m_lcp_lag[1] += lag;

  DEB_REDO_CONTROL(("(%u)change_rate: %llu kB, LCP+Backup: %llu kB,"
                    " Backup: %llu kB, lag: %lld kB",
                    instance(),
                    m_lcp_change_rate / 1024,
                    m_monitor_words_written / 256,
                    m_backup_monitor_words_written / 256,
                    lag / 1024));

  /* Start a fresh monitoring interval. */
  m_monitor_words_written = 0;
  m_backup_monitor_words_written = 0;
  m_periods_passed_in_monitor_period = 0;
  m_monitor_snapshot_start = curr_time;
}
590
void
Backup::debug_report_redo_control(Uint32 cpu_usage)
{
#ifdef DEBUG_REDO_CONTROL
  {
    /* Debug-only: log last second's achieved write speeds against the
     * current targets. Entire body compiles away unless
     * DEBUG_REDO_CONTROL is defined. */
    Uint64 millis_passed;
    Uint64 backup_lcp_bytes_written;
    Uint64 backup_bytes_written;
    Uint64 redo_bytes_written;
    calculate_disk_write_speed_seconds_back(1,
                                            millis_passed,
                                            backup_lcp_bytes_written,
                                            backup_bytes_written,
                                            redo_bytes_written,
                                            true);
    /* Convert byte counts over millis_passed into kB/sec.
     * NOTE(review): assumes millis_passed > 0 — presumably guaranteed
     * by calculate_disk_write_speed_seconds_back; confirm, else this
     * divides by zero. */
    backup_bytes_written *= Uint64(1000);
    backup_bytes_written /= (millis_passed * Uint64(1024));
    backup_lcp_bytes_written *= Uint64(1000);
    backup_lcp_bytes_written /= (millis_passed * Uint64(1024));
    redo_bytes_written *= Uint64(1000);
    redo_bytes_written /= (millis_passed * Uint64(1024));

    /* Report new disk write speed and last seconds achievement on disk */
    DEB_REDO_CONTROL(("(%u)Current disk write speed is %llu kB/sec"
                      " and current backup disk write speed is %llu kB/sec"
                      ", last sec REDO write speed %llu kB/sec, "
                      "LCP+Backup write speed %llu kB/sec"
                      ", Backup write speed %llu kB/sec"
                      ", cpu_usage: %u",
                      instance(),
                      ((m_curr_disk_write_speed *
                        CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS) /
                       Uint64(1024)),
                      ((m_curr_backup_disk_write_speed *
                        CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS) /
                       Uint64(1024)),
                      redo_bytes_written,
                      backup_lcp_bytes_written,
                      backup_bytes_written,
                      cpu_usage));
  }
#else
  (void)cpu_usage;
#endif
}
636
637 void
execREDO_STATE_REP(Signal * signal)638 Backup::execREDO_STATE_REP(Signal* signal)
639 {
640 RedoStateRep *rep = (RedoStateRep*)signal->getDataPtr();
641 ndbrequire(rep->receiverInfo == RedoStateRep::ToBackup);
642 m_global_redo_alert_state = (RedoStateRep::RedoAlertState)rep->redoState;
643 DEB_REDO_CONTROL(("(%u) New global redo alert state: %u",
644 instance(),
645 m_global_redo_alert_state));
646 }
647
648 /**
649 * Initialise LCP timers at the time we hear of the first writes to the
650 * REDO log. Could also be initialised by the start of the first LCP.
651 */
652 void
init_lcp_timers(Uint64 redo_written_since_last_call)653 Backup::init_lcp_timers(Uint64 redo_written_since_last_call)
654 {
655 if (redo_written_since_last_call > 0)
656 {
657 if (!NdbTick_IsValid(m_lcp_start_time))
658 {
659 m_lcp_start_time = getHighResTimer();
660 m_prev_lcp_start_time = m_lcp_start_time;
661 }
662 }
663 }
664
void
Backup::lcp_start_point(Signal *signal)
{
  /**
   * A new LCP is starting up, we need to keep track of this to handle
   * REDO control.
   * The start and end points of LCPs currently only come with an
   * accuracy of about 1 second, so if the LCP time is shorter than
   * this we can definitely ignore any REDO alerts.
   */
  if (!NdbTick_IsValid(m_prev_lcp_start_time))
  {
    jam();
    /* Very first LCP: no previous start time exists to inherit. */
    m_prev_lcp_start_time = getHighResTimer();
  }
  else
  {
    m_prev_lcp_start_time = m_lcp_start_time;
  }
  /* Inform PGMAN, handing over the UNDO/REDO peaks seen since the
   * previous LCP start, then reset the peak trackers. */
  c_pgman->lcp_start_point(signal,
                           m_max_undo_log_level_percentage + 1,
                           m_max_redo_percentage);
  m_max_undo_log_level_percentage = m_undo_log_level_percentage;
  m_max_redo_percentage = m_redo_percentage;
  m_first_lcp_started = true;
  m_lcp_start_time = getHighResTimer();
  ndbrequire(NdbTick_IsValid(m_lcp_start_time));
  /* The REDO "cut" point moves to the previous LCP start: everything
   * written since then is still needed for recovery. */
  m_lcp_current_cut_point = m_prev_lcp_start_time;
  /* Shift the per-LCP operation size windows one LCP forward:
   * [0] = as of previous LCP start, [1] = as of this LCP start. */
  m_update_size_lcp[0] = m_update_size_lcp[1];
  m_update_size_lcp[1] = m_update_size_lcp_last;
  m_insert_size_lcp[0] = m_insert_size_lcp[1];
  m_insert_size_lcp[1] = m_insert_size_lcp_last;
  m_delete_size_lcp[0] = m_delete_size_lcp[1];
  m_delete_size_lcp[1] = m_delete_size_lcp_last;
  DEB_REDO_CONTROL(("(%u)m_insert_size_lcp[0]: %llu MByte, "
                    "m_insert_size_lcp[1]: %llu MByte, "
                    "m_insert_size_lcp_last: %llu MByte",
                    instance(),
                    (m_insert_size_lcp[0] / (1024 * 1024)),
                    (m_insert_size_lcp[1] / (1024 * 1024)),
                    (m_insert_size_lcp_last / (1024 * 1024))));
}
707
void
Backup::lcp_end_point()
{
  /* An LCP has completed: record its execution time, advance the REDO
   * cut point and roll the per-LCP statistics windows forward. */
  NDB_TICKS current_time = getHighResTimer();
  ndbrequire(NdbTick_IsValid(m_lcp_start_time));
  m_last_lcp_exec_time_in_ms =
    NdbTick_Elapsed(m_lcp_start_time, current_time).milliSec();
  /* REDO written before this LCP's start is no longer needed. */
  m_lcp_current_cut_point = m_lcp_start_time;

  c_pgman->lcp_end_point(m_last_lcp_exec_time_in_ms);
  reset_lcp_timing_factors();
#ifdef DEBUG_REDO_CONTROL
  Uint64 checkpoint_size = m_insert_size_lcp[1] - m_insert_size_lcp[0];
  Uint64 checkpoint_rate = 0;
  if (m_last_lcp_exec_time_in_ms > 0)
  {
    checkpoint_rate = checkpoint_size / m_last_lcp_exec_time_in_ms;
  }
  DEB_REDO_CONTROL(("(%u)LCP END: m_insert_size_lcp[0]: %llu MByte, "
                    "Remaining lag: %lld MB, "
                    "Removed lag: %lld MB, "
                    "Checkpoint rate in this LCP: %llu kB/sec",
                    instance(),
                    (checkpoint_size / (1024 * 1024)),
                    (m_lcp_lag[1] / (1024 * 1024)),
                    (m_lcp_lag[0] / (1024 * 1024)),
                    checkpoint_rate));
#endif
  /* Collapse the statistics windows: the completed LCP becomes the
   * baseline for the next one. */
  m_update_size_lcp[0] = m_update_size_lcp[1];
  m_insert_size_lcp[0] = m_insert_size_lcp[1];
  m_delete_size_lcp[0] = m_delete_size_lcp[1];
  m_lcp_lag[0] = m_lcp_lag[1];
  m_lcp_lag[1] = Int64(0);
}
742
743 Uint64
init_change_size(Uint64 update_size,Uint64 insert_size,Uint64 delete_size,Uint64 total_memory)744 Backup::init_change_size(Uint64 update_size,
745 Uint64 insert_size,
746 Uint64 delete_size,
747 Uint64 total_memory)
748 {
749 /**
750 * The initial value for change_size is based on that the new
751 * rows or deleted rows are always changes, but updates can
752 * at times be updates of the same row. We use an exponential
753 * probability distribution that a row has been updated or not.
754 */
755 Uint64 change_size = insert_size + delete_size;
756 long double f_total_memory = (long double)total_memory;
757 long double f_change_size = update_size;
758 long double f_change_percentage = f_change_size / f_total_memory;
759 long double f_real_change_percentage = ((long double)1) -
760 exp(-f_change_percentage);
761 long double f_real_change_size = f_real_change_percentage *
762 f_total_memory;
763 change_size += (Uint64)f_real_change_size;
764 return change_size;
765 }
766
767 Uint64
modify_change_size(Uint64 update_size,Uint64 insert_size,Uint64 delete_size,Uint64 total_size,Uint64 change_size)768 Backup::modify_change_size(Uint64 update_size,
769 Uint64 insert_size,
770 Uint64 delete_size,
771 Uint64 total_size,
772 Uint64 change_size)
773 {
774 /**
775 * Now we have calculated an estimate that is comparable
776 * to the row_change_count that we get per fragment before
777 * calculating the number of parts to checkpoint.
778 *
779 * The next step is now to modify this estimate based on
780 * the amount of inserts and deletes compared to the updates.
781 */
782 Uint64 updates_percent = (update_size * Uint64(1005)) /
783 (Uint64(10) * total_size);
784 Uint64 inserts_percent = (insert_size * Uint64(1005)) /
785 (Uint64(10) * total_size);
786 Uint64 insert_recovery_work = (Uint64)get_insert_recovery_work();
787 inserts_percent *= insert_recovery_work;
788 inserts_percent /= Uint64(100);
789 Uint64 deletes_percent = (delete_size * Uint64(1005)) /
790 (Uint64(10) * total_size);
791 deletes_percent *= Uint64(DELETE_RECOVERY_WORK);
792 deletes_percent /= Uint64(100);
793 Uint64 change_factor = updates_percent +
794 inserts_percent +
795 deletes_percent;
796 change_size *= change_factor;
797 change_size /= Uint64(100);
798 return change_size;
799 }
800
Uint32
Backup::calculate_parts(Uint64 change_size,
                        Uint64 total_memory)
{
  /* One LCP "all" part corresponds to one tenth of total memory.
   * NOTE(review): total_memory is passed both as first and last
   * argument to calculate_min_parts — looks intentional (different
   * roles in its signature) but confirm against its declaration. */
  Uint64 part_total_memory = total_memory / Uint64(10);
  Uint32 min_parts = calculate_min_parts(total_memory,
                                         change_size,
                                         part_total_memory,
                                         total_memory);
  return min_parts;
}
812
813 void
calculate_seconds_since_lcp_cut(Uint64 & seconds_since_lcp_cut)814 Backup::calculate_seconds_since_lcp_cut(Uint64& seconds_since_lcp_cut)
815 {
816 NDB_TICKS now = getHighResTimer();
817 if (!NdbTick_IsValid(m_lcp_current_cut_point))
818 {
819 jam();
820 seconds_since_lcp_cut = 0;
821 return;
822 }
823 seconds_since_lcp_cut =
824 NdbTick_Elapsed(m_lcp_current_cut_point, now).seconds();
825 }
826
827 Uint64
calculate_change_rate(Uint64 change_size,Uint64 & seconds_since_lcp_cut)828 Backup::calculate_change_rate(Uint64 change_size,
829 Uint64& seconds_since_lcp_cut)
830 {
831 if (seconds_since_lcp_cut < 2)
832 {
833 jam();
834 /**
835 * We ignore very short LCPs, in this case it is hard to see
836 * how we could run out of REDO log and need more disk write
837 * speed.
838 */
839 return 0;
840 }
841 Uint64 change_size_per_sec = change_size / seconds_since_lcp_cut;
842 return change_size_per_sec;
843 }
844
Uint64
Backup::calculate_checkpoint_rate(Uint64 update_size,
                                  Uint64 insert_size,
                                  Uint64 delete_size,
                                  Uint64 total_memory,
                                  Uint64& seconds_since_lcp_cut)
{
  /* Estimate the checkpoint write rate (bytes/sec) required to keep up
   * with the current operation mix since the last LCP cut point. */
  Uint64 checkpoint_size = 0;
  Uint32 all_parts = 0;
  Uint64 all_size = 0;
  Uint64 change_size = 0;
  Uint64 mod_change_size = 0;
  Uint64 total_size = update_size + insert_size + delete_size;
  if (total_size != 0)
  {
    /* Net out inserts against deletes: a matched insert+delete pair
     * behaves like an update from a checkpoint size perspective. */
    if (delete_size > insert_size)
    {
      update_size += insert_size;
      delete_size -= insert_size;
      insert_size = 0;
    }
    else
    {
      update_size += delete_size;
      insert_size -= delete_size;
      delete_size = 0;
    }
    calculate_seconds_since_lcp_cut(seconds_since_lcp_cut);
    change_size = init_change_size(update_size,
                                   insert_size,
                                   delete_size,
                                   total_memory);
    mod_change_size = modify_change_size(update_size,
                                         insert_size,
                                         delete_size,
                                         total_size,
                                         change_size);
    all_parts = calculate_parts(mod_change_size, total_memory);
    /* Checkpoint size = fully-written parts (all_size) plus the change
     * portion of the parts not fully written. */
    all_size = total_memory * Uint64(all_parts);
    all_size /= Uint64(BackupFormat::NDB_MAX_LCP_PARTS);
    change_size = (BackupFormat::NDB_MAX_LCP_PARTS - all_parts) *
                  change_size;
    change_size /= BackupFormat::NDB_MAX_LCP_PARTS;
    checkpoint_size = all_size + change_size;
  }
  Uint64 change_rate = calculate_change_rate(checkpoint_size,
                                             seconds_since_lcp_cut);
  DEB_REDO_CONTROL(("(%u)update_size: %llu MB, insert_size: %llu MB,"
                    " delete_size: %llu MB, checkpoint_size: %llu MB"
                    ", all_parts: %u, total_memory: %llu MB, "
                    "all_size: %llu MB, change_size: %llu MB, "
                    "mod_change_size: %llu MB, "
                    "seconds_since_lcp_cut: %llu",
                    instance(),
                    update_size / (Uint64(1024) * Uint64(1024)),
                    insert_size / (Uint64(1024) * Uint64(1024)),
                    delete_size / (Uint64(1024) * Uint64(1024)),
                    checkpoint_size / (Uint64(1024) * Uint64(1024)),
                    all_parts,
                    total_memory / (Uint64(1024 * Uint64(1024))),
                    all_size / (Uint64(1024) * Uint64(1024)),
                    change_size / (Uint64(1024) * Uint64(1024)),
                    mod_change_size / (Uint64(1024) * Uint64(1024)),
                    seconds_since_lcp_cut));
  return change_rate;
}
911
void
Backup::calculate_redo_parameters(Uint64 redo_usage,
                                  Uint64 redo_size,
                                  Uint64 redo_written_since_last_call,
                                  Uint64 millis_since_last_call,
                                  Uint64& redo_percentage,
                                  Uint64& max_redo_used_before_cut,
                                  Uint64& mean_redo_used_before_cut,
                                  Uint64& mean_redo_speed_per_sec,
                                  Uint64& current_redo_speed_per_sec,
                                  Uint64& redo_available)
{
  /* Derive REDO log fill level and write speed figures used by the
   * REDO alert state machine. All outputs are in bytes / bytes/sec. */
  /* redo_size and redo_usage is in MBytes, convert to bytes */
  redo_size *= (Uint64(1024) * Uint64(1024));
  redo_usage *= (Uint64(1024) * Uint64(1024));
  redo_available = redo_size - redo_usage;
  redo_percentage = redo_usage * Uint64(100);
  redo_percentage /= redo_size;
  current_redo_speed_per_sec = redo_written_since_last_call * Uint64(1000);
  current_redo_speed_per_sec /= millis_since_last_call;
  /* Track the peak REDO write speed seen so far. */
  if (current_redo_speed_per_sec > m_max_redo_speed_per_sec)
  {
    jam();
    m_max_redo_speed_per_sec = current_redo_speed_per_sec;
  }
  mean_redo_speed_per_sec = 0;
  Uint64 seconds_since_lcp_cut = 0;
  if (NdbTick_IsValid(m_lcp_current_cut_point))
  {
    jam();
    NDB_TICKS current_time = getHighResTimer();
    seconds_since_lcp_cut =
      NdbTick_Elapsed(m_lcp_current_cut_point, current_time).seconds();
  }
  if (seconds_since_lcp_cut != 0)
  {
    jam();
    /* Mean speed = all REDO still alive divided by the time since cut. */
    mean_redo_speed_per_sec = redo_usage / seconds_since_lcp_cut;
  }
  /**
   * We assume that LCP execution time is Poisson-distributed.
   * This means that our mean estimated time is the same even
   * if the LCP has been ongoing for a while (Poisson distribution
   * has no memory). It doesn't matter so much if this estimate
   * isn't 100% correct, it will at least not be overoptimistic.
   *
   * Thus we estimate the time to complete the next LCP to be
   * the time of the last LCP.
   */
  max_redo_used_before_cut = m_max_redo_speed_per_sec *
                             m_last_lcp_exec_time_in_ms;
  max_redo_used_before_cut /= Uint64(1000);

  mean_redo_used_before_cut = mean_redo_speed_per_sec *
                              m_last_lcp_exec_time_in_ms;
  mean_redo_used_before_cut /= Uint64(1000);
}
969
970 void
change_alert_state_redo_percent(Uint64 redo_percentage)971 Backup::change_alert_state_redo_percent(Uint64 redo_percentage)
972 {
973 /**
974 * If the fill level of the REDO log reaches beyond 60% we set
975 * it in critical state independent of calculations on REDO
976 * speed. Similarly when going beyond 40% we set it in high
977 * alert state. Using more than 40% of the REDO log is
978 * not a desired state to run in. This is both too close to
979 * the end to be comfortable and it also extends the time
980 * to recover at a restart substantially.
981 */
982 m_redo_alert_state = RedoStateRep::NO_REDO_ALERT;
983 if (redo_percentage > Uint64(60) ||
984 m_undo_log_level_percentage > 60)
985 {
986 jam();
987 m_redo_alert_state = RedoStateRep::REDO_ALERT_CRITICAL;
988 }
989 else if (redo_percentage > Uint64(40) ||
990 m_undo_log_level_percentage > 40)
991 {
992 jam();
993 m_redo_alert_state = RedoStateRep::REDO_ALERT_HIGH;
994 }
995 else if (redo_percentage > Uint64(25) ||
996 m_undo_log_level_percentage > 25)
997 {
998 jam();
999 m_redo_alert_state = RedoStateRep::REDO_ALERT_LOW;
1000 }
1001 }
1002
1003 void
change_alert_state_redo_usage(Uint64 max_redo_used_before_cut,Uint64 mean_redo_used_before_cut,Uint64 redo_available)1004 Backup::change_alert_state_redo_usage(Uint64 max_redo_used_before_cut,
1005 Uint64 mean_redo_used_before_cut,
1006 Uint64 redo_available)
1007 {
1008 if (m_redo_alert_state != RedoStateRep::REDO_ALERT_CRITICAL)
1009 {
1010 jam();
1011 /**
1012 * We have estimated the REDO usage until the next LCP will cut it again.
1013 * The first estimate is based on the maximum speed we have seen so far.
1014 * The second estimate is based on the mean speed we have seen since
1015 * the first current REDO log record was generated.
1016 *
1017 * If we write at max speed and we estimate this to run out of REDO space
1018 * we are at a high alert state. If we can use only 40% of this to run out
1019 * of REDO log we are at a critical state.
1020 *
1021 * If we run at mean speed and we can run out of REDO space we are obviously
1022 * in a critical state, even with only an estimate to fill half of this we
1023 * are in a critical state and if we estimate to fill a third of this we are
1024 * in a high alert state.
1025 *
1026 * We don't even attempt those checks if we haven't got good measures of
1027 * times until the next REDO cut.
1028 */
1029 Uint64 max_critical_limit = (Uint64(2) * max_redo_used_before_cut) / Uint64(5);
1030 Uint64 max_high_limit = max_redo_used_before_cut;
1031 Uint64 mean_critical_limit = mean_redo_used_before_cut / Uint64(2);
1032 Uint64 mean_high_limit = mean_redo_used_before_cut / Uint64(3);
1033
1034 if (redo_available < max_critical_limit)
1035 {
1036 jam();
1037 m_redo_alert_state = RedoStateRep::REDO_ALERT_CRITICAL;
1038 }
1039 else if (redo_available < mean_critical_limit)
1040 {
1041 jam();
1042 m_redo_alert_state = RedoStateRep::REDO_ALERT_CRITICAL;
1043 }
1044 else if (redo_available < max_high_limit)
1045 {
1046 jam();
1047 m_redo_alert_state = RedoStateRep::REDO_ALERT_HIGH;
1048 }
1049 else if (redo_available < mean_high_limit)
1050 {
1051 jam();
1052 m_redo_alert_state = RedoStateRep::REDO_ALERT_HIGH;
1053 }
1054 }
1055 }
1056
1057 void
handle_global_alert_state(Signal * signal,RedoStateRep::RedoAlertState save_redo_alert_state)1058 Backup::handle_global_alert_state(
1059 Signal *signal,
1060 RedoStateRep::RedoAlertState save_redo_alert_state)
1061 {
1062 m_local_redo_alert_state = m_redo_alert_state;
1063 if (save_redo_alert_state != m_redo_alert_state)
1064 {
1065 jam();
1066 RedoStateRep *rep = (RedoStateRep*)signal->getDataPtrSend();
1067 rep->receiverInfo = RedoStateRep::ToNdbcntr;
1068 rep->redoState = m_redo_alert_state;
1069 sendSignal(NDBCNTR_REF, GSN_REDO_STATE_REP, signal, 2, JBB);
1070 }
1071 if (m_global_redo_alert_state > m_redo_alert_state)
1072 {
1073 jam();
1074 m_redo_alert_state = m_global_redo_alert_state;
1075 }
1076 }
1077
1078 void
set_redo_alert_factor(Uint64 redo_percentage)1079 Backup::set_redo_alert_factor(Uint64 redo_percentage)
1080 {
1081 m_redo_alert_factor = 1;
1082 if (m_redo_alert_state == RedoStateRep::REDO_ALERT_CRITICAL)
1083 {
1084 jam();
1085 m_redo_alert_factor = 24;
1086 }
1087 else if (m_redo_alert_state == RedoStateRep::REDO_ALERT_HIGH)
1088 {
1089 jam();
1090 m_redo_alert_factor = 8;
1091 }
1092 else if (m_redo_alert_state == RedoStateRep::REDO_ALERT_LOW)
1093 {
1094 jam();
1095 m_redo_alert_factor = 4;
1096 }
1097 }
1098
1099 void
set_lcp_timing_factors(Uint64 seconds_since_lcp_cut)1100 Backup::set_lcp_timing_factors(Uint64 seconds_since_lcp_cut)
1101 {
1102 if (m_last_lcp_exec_time_in_ms == 0)
1103 {
1104 return;
1105 }
1106 Uint64 lcp_time_in_secs = m_last_lcp_exec_time_in_ms / 1000;
1107
1108 /**
1109 * seconds_since_lcp_cut normally goes to a bit more than
1110 * two times the LCP time. If the LCP time increases by more
1111 * than 6 seconds we try to increase the disk write speed to
1112 * handle this. If the seconds since last cut is increasing
1113 * even to double the LCP time we increase the factor even
1114 * more.
1115 *
1116 * There is no need to set those factors in a dramatic manner.
1117 * These factors are used to keep LCP times low to ensure that
1118 * recovery times are low. They assist in protecting the REDO
1119 * log from head meeting tail, but it isn't the main purpose.
1120 * There are many other mechanisms that take care of this
1121 * purpose.
1122 */
1123 Uint64 low_threshold = Uint64(2) * lcp_time_in_secs;
1124 low_threshold += Uint64(6);
1125 Uint64 high_threshold = Uint64(3) * lcp_time_in_secs;
1126 high_threshold += Uint64(6);
1127 if (seconds_since_lcp_cut + Uint64(3) < lcp_time_in_secs)
1128 {
1129 jam();
1130 /**
1131 * Ignore checking this for a while after the LCP have just
1132 * started. First of all we write more at the start due to
1133 * lag anyways, second we give time for the state to settle
1134 * done before acting on it.
1135 */
1136 return;
1137 }
1138 if (seconds_since_lcp_cut > low_threshold)
1139 {
1140 jam();
1141 m_lcp_timing_counter = 2;
1142 Uint64 new_timing_factor = Uint64(110);
1143 if (seconds_since_lcp_cut > high_threshold)
1144 {
1145 jam();
1146 new_timing_factor = Uint64(120);
1147 }
1148 if (new_timing_factor > m_lcp_timing_factor)
1149 {
1150 jam();
1151 m_lcp_timing_factor = new_timing_factor;
1152 }
1153 }
1154 /**
1155 * Ensure that the effects of REDO Alert Level stick to some
1156 * level all through the next LCP as well. This will help
1157 * bringing us permanently down in REDO Alert levels.
1158 */
1159 if (m_redo_alert_state == RedoStateRep::REDO_ALERT_LOW)
1160 {
1161 jam();
1162 m_lcp_timing_counter = 2;
1163 Uint64 new_timing_factor = Uint64(110);
1164 if (new_timing_factor > m_lcp_timing_factor)
1165 {
1166 jam();
1167 m_lcp_timing_factor = new_timing_factor;
1168 }
1169 }
1170 else if (m_redo_alert_state == RedoStateRep::REDO_ALERT_HIGH)
1171 {
1172 jam();
1173 m_lcp_timing_counter = 2;
1174 Uint64 new_timing_factor = Uint64(120);
1175 if (new_timing_factor > m_lcp_timing_factor)
1176 {
1177 jam();
1178 m_lcp_timing_factor = new_timing_factor;
1179 }
1180 }
1181 else if (m_redo_alert_state == RedoStateRep::REDO_ALERT_CRITICAL)
1182 {
1183 jam();
1184 m_lcp_timing_counter = 2;
1185 Uint64 new_timing_factor = Uint64(130);
1186 if (new_timing_factor > m_lcp_timing_factor)
1187 {
1188 jam();
1189 m_lcp_timing_factor = new_timing_factor;
1190 }
1191 }
1192 }
1193
1194 void
reset_lcp_timing_factors()1195 Backup::reset_lcp_timing_factors()
1196 {
1197 if (m_lcp_timing_counter > 0)
1198 {
1199 jam();
1200 m_lcp_timing_counter--;
1201 if (m_lcp_timing_counter == 0)
1202 {
1203 jam();
1204 m_lcp_timing_factor = Uint64(100);
1205 }
1206 else
1207 {
1208 jam();
1209 m_lcp_timing_factor -= Uint64(10);
1210 ndbrequire(m_lcp_timing_factor >= Uint64(100));
1211 }
1212 }
1213 }
1214
/**
 * Compute m_proposed_disk_write_speed (and m_lcp_change_rate) from the
 * base change rate already stored in m_proposed_disk_write_speed by the
 * caller, scaled by the LCP timing factor, current vs mean REDO write
 * activity, the REDO alert state, and the accumulated LCP lag.
 */
void
Backup::set_proposed_disk_write_speed(Uint64 current_redo_speed_per_sec,
                                      Uint64 mean_redo_speed_per_sec,
                                      Uint64 seconds_since_lcp_cut)
{
  /**
   * When LCPs are increasing the time it takes to execute an LCP we try to
   * get it back by increasing the disk write speed until the end of the
   * next LCP. This is controlled by the m_lcp_timing_factor variable. This
   * variable is set to 100 when no such issues are at hand.
   */
  m_proposed_disk_write_speed *= m_lcp_timing_factor;
  m_proposed_disk_write_speed /= Uint64(100);

  /**
   * We save the proposed disk write speed with multiplication of LCP timing
   * factor as the m_lcp_change_rate, this is the calculated change rate with
   * some long-term factors derived from m_lcp_timing_factor.
   *
   * The short-term proposed disk write speed in addition will contain
   * additional components to ensure that we actually deliver the calculated
   * LCP change rate.
   */
  m_lcp_change_rate = m_proposed_disk_write_speed;

  /**
   * The proposed disk write speed is not always achieved and we have some
   * level of slowness in responding to this setting, so we increase the
   * proposed disk write speed by 25% to cater for this.
   *
   * There are many reasons why we won't achieve this speed. A few are:
   * 1) Variable completion of LCP execution in the LDMs in the cluster.
   * 2) High CPU usage when REDO log alert factor is still not activated
   * 3) Disk not keeping up temporarily
   * 4) Setting proposed disk write speed increases the maximum disk write
   *    speed, thus it can take a while before it affects the actual
   *    disk write speed since this is changed by an adaptive change
   *    algorithm.
   */
  m_proposed_disk_write_speed *= Uint64(125);
  m_proposed_disk_write_speed /= Uint64(100);

  /* Total lag over the two tracked LCP periods; negative means ahead. */
  Int64 lag = m_lcp_lag[0] + m_lcp_lag[1];
  Int64 lag_per_sec = 0;
  if (seconds_since_lcp_cut > 0)
  {
    lag_per_sec = lag / (Int64)seconds_since_lcp_cut;
  }
  if (current_redo_speed_per_sec > mean_redo_speed_per_sec)
  {
    jam();
    /* +1 avoids division by zero when the mean speed is 0 */
    Uint64 factor = current_redo_speed_per_sec * Uint64(100);
    factor /= (mean_redo_speed_per_sec + 1);
    if (factor > Uint64(120))
    {
      jam();
      factor = Uint64(120);
    }
    /**
     * Increase the proposed disk write speed by up to 20% if we currently
     * generate more REDO logging compared to the mean. This is aiming to
     * cater for sudden increases in write activity to ensure that we start
     * acting quickly on those changes. At the same we put a dent on this
     * change to 20% increase. This avoids too high fluctuations in the
     * disk write speed.
     */
    m_proposed_disk_write_speed *= factor;
    m_proposed_disk_write_speed /= Uint64(100);
  }
  if (m_redo_alert_state == RedoStateRep::REDO_ALERT_LOW)
  {
    jam();
    /**
     * Add another 10% to proposed speed if we are at low
     * alert level.
     */
    m_proposed_disk_write_speed *= Uint64(110);
    m_proposed_disk_write_speed /= Uint64(100);
  }
  else if (m_redo_alert_state == RedoStateRep::REDO_ALERT_HIGH)
  {
    jam();
    /**
     * Add another 25% to proposed speed if we are at high
     * alert level.
     */
    m_proposed_disk_write_speed *= Uint64(125);
    m_proposed_disk_write_speed /= Uint64(100);
  }
  else if (m_redo_alert_state == RedoStateRep::REDO_ALERT_CRITICAL)
  {
    jam();
    /**
     * Add another 50% to proposed speed if we are at critical
     * alert level.
     */
    m_proposed_disk_write_speed *= Uint64(150);
    m_proposed_disk_write_speed /= Uint64(100);
  }
  if (lag < Int64(0) &&
      m_redo_alert_state < RedoStateRep::REDO_ALERT_HIGH)
  {
    /**
     * The REDO alert level is below HIGH and we are running faster than
     * necessary, we will slow down based on the calculated lag per
     * second (which when negative means that we are ahead). We will
     * never slow down more than 20%.
     */
    lag_per_sec = Int64(-1) * lag_per_sec; /* Make number positive */
    Uint64 percentage_decrease = Uint64(lag_per_sec) * Uint64(100);
    percentage_decrease /= (m_proposed_disk_write_speed + 1);
    if (percentage_decrease > Uint64(20))
    {
      jam();
      m_proposed_disk_write_speed *= Uint64(80);
      m_proposed_disk_write_speed /= Uint64(100);
    }
    else
    {
      jam();
      m_proposed_disk_write_speed -= lag_per_sec;
    }
  }
  if (lag > Int64(0))
  {
    /**
     * We don't keep up with the calculated LCP change rate.
     * We will increase the proposed disk write speed by up
     * to 25% to keep up with the LCP change rate.
     *
     * We avoid regaining the lag too fast since it is easy
     * to write too much at the beginning of an LCP otherwise.
     * This will create a too bursty environment which is
     * undesirable.
     */
    jam();
    Uint64 percentage_increase = lag_per_sec * Uint64(100);
    percentage_increase /= (m_proposed_disk_write_speed + 1);
    DEB_LCP_LAG(("(%u)Lag per second is %lld, percent_increase: %llu",
                 instance(), lag_per_sec, percentage_increase));
    Uint64 max_percentage_increase = Uint64(25);
    if (m_last_lcp_dd_percentage > 85)
    {
      jam();
      max_percentage_increase = Uint64(600);
    }
    else if (m_last_lcp_dd_percentage > 10)
    {
      /**
       * increase = percentage / (100 - percentage)
       * Multiply by 100 to get it in percent
       */
      jam();
      Uint64 divisor = Uint64(100) - Uint64(m_last_lcp_dd_percentage);
      Uint64 mult = Uint64(100) * Uint64(m_last_lcp_dd_percentage);
      max_percentage_increase = mult / divisor;
    }

    if (percentage_increase > max_percentage_increase)
    {
      jam();
      Uint64 increase_factor = Uint64(100) + max_percentage_increase;
      m_proposed_disk_write_speed *= increase_factor;
      m_proposed_disk_write_speed /= Uint64(100);
    }
    else
    {
      jam();
      m_proposed_disk_write_speed += lag_per_sec;
    }
  }
}
1388
void
Backup::measure_change_speed(Signal *signal, Uint64 millis_since_last_call)
{
  /**
   * The aim of this function is to calculate the following values:
   * 1) m_redo_alert_state
   * 2) m_redo_alert_factor
   * 3) m_proposed_disk_write_speed
   *
   * The m_redo_alert_state variable is used to set the m_redo_alert_factor
   * that raises the priority of LCP writes towards other operation.
   *
   * The variable is kept consistent in the cluster to ensure that one
   * REDO log that is overloaded will also ensure that all other LDMs in
   * the cluster will speed up LCP execution.
   *
   * Based on this variable we raise the maximum speed based on the
   * configured disk write parameters.
   * This variable can also change the adaptive algorithm that slows down
   * LCP execution due to high CPU load. It ensures that we raise the
   * prio on LCP execution by ensuring that all LCP execution signals
   * are executed at A-level and we fill the buffers more actively when
   * set at alert levels.
   * Finally setting this variable to an alert level means that we speed up
   * handling of empty LCP fragments.
   *
   * The m_redo_alert_factor changes the amount of writes we will do in
   * one real-time break when executing at A-level.
   *
   * The proposed disk write speed is used to increase the maximum speed
   * used in the adaptive disk write speed algorithm if necessary.
   *
   * Calculation of the proposed disk write speed is fairly complicated.
   * The idea is to use the same mechanics used to decide how much an LCP
   * will execute on a fragment basis on a global level.
   *
   * get_redo_stats
   * --------------
   * To do this we keep track of the amount of changes we have done since
   * the start of the previous LCP. We keep track of this by adding the
   * average row size to a global update_size, insert_size and delete_size
   * in DBLQH. These variables are requested in the get_redo_stats call to
   * DBLQH.
   *
   * calculate_total_size
   * --------------------
   * To calculate the change size we use different change factors for
   * inserts and deletes. Deletes generate 20% more per byte compared
   * to updates and inserts generate less, 40% by default, compared to
   * updates. If we have both inserts and deletes we will only use
   * the larger of the two and the overlap is treated as updates.
   * This is the same mechanism used in the method calculate_row_change_count
   * used when deciding the number of parts to checkpoint for a specific
   * fragment.
   *
   * calculate_parts
   * ---------------
   * Updates can at times hit the same row, we estimate the number of updates
   * to the same row by using a Poisson distribution of writes to the rows.
   * This means that we can estimate the number of rows not written by using
   * an exponential distribution. Thus it is easy to calculate the percent of
   * data that has been written. Using this information we use the same
   * function (calculate_min_parts) to calculate the parts to checkpoint
   * on a global level, this function returns the number of parts with the
   * maximum number of parts being the BackupFormat::NDB_MAX_LCP_PARTS.
   *
   * calculate_change_rate
   * ---------------------
   * Finally we use the change size, the number of parts and the seconds since
   * the changes we used was started. This gives us a calculated proposed disk
   * write speed. To calculate we will retrieve the time since the start of
   * previous LCP.
   *
   * calculate_redo_parameters
   * -------------------------
   * We got redo_size, redo_usage and redo_written_since_last_call from the
   * call to get_redo_stats. Based on this information we calculate the
   * following variables.
   * redo_percentage:
   * ................
   * Percentage of REDO log currently in use. This is used directly to set the
   * m_redo_alert_factor.
   *
   * max_redo_used_before_cut:
   * mean_redo_used_before_cut:
   * redo_available:
   * ..........................
   * These three variables together are used to calculate if there is a risk
   * that we will run out of REDO log even without a high REDO percentage. If
   * so we will set the m_redo_alert_state based on these variables.
   * The max_redo_used_before_cut is an estimate of how much REDO log will
   * write before the next LCP is completed if maximum REDO write speed is
   * used. Similarly for mean_redo_used_before_cut but based on average REDO
   * write speed. redo_available is the amount of REDO log still available.
   *
   * mean_redo_speed_per_sec:
   * current_redo_speed_per_sec:
   * ...........................
   * These are used to see if we are currently very active in writing the
   * REDO log. If we are we will increase the proposed disk write speed a bit
   * as an effect of this.
   *
   * change_alert_state_redo_percent
   * -------------------------------
   * Based on redo_percentage we will set m_redo_alert_state.
   *
   * change_alert_state_redo_usage
   * -----------------------------
   * The above calculation based on max_redo_used_before_cut,
   * mean_redo_used_before_cut, and redo_available is performed here to set
   * m_redo_alert_state appropriately.
   *
   * handle_global_alert_state
   * -------------------------
   * Ensure that we are synchronised in our REDO alert state with other LDMs
   * in the cluster since the LCP protocol is global.
   *
   * set_redo_alert_factor
   * ---------------------
   * Set m_redo_alert_factor based on m_redo_alert_state and redo_percentage.
   *
   * set_proposed_disk_write_speed
   * -----------------------------
   * Calculate proposed disk write speed based on calculated value and on the
   * current activity level as reported in mean_redo_speed_per_sec and
   * current_redo_speed_per_sec. We will also increase to cater for some safety
   * levels and based on the m_redo_alert_state.
   */
  Uint64 redo_usage;
  Uint64 redo_size;
  Uint64 redo_written_since_last_call;
  Uint64 insert_size;
  Uint64 delete_size;
  Uint64 update_size;
  c_lqh->get_redo_stats(redo_usage,
                        redo_size,
                        redo_written_since_last_call,
                        update_size,
                        insert_size,
                        delete_size);

  /* No REDO log configured (e.g. diskless); nothing to measure. */
  if (redo_size == 0)
  {
    jam();
    return;
  }
  init_lcp_timers(redo_written_since_last_call);

  Uint64 total_memory = get_total_memory();
  Uint64 curr_change_rate;
  {
    /**
     * In some cases we might have had an almost idle system for a while,
     * in this case it is not so good to base our disk write speed on
     * the average change rate, in this case it is better to use the
     * current change rate. But we don't want to base on the current
     * too much, so we scale the current rate down to 75% (i.e. decrease
     * it by 25%) to avoid being too much impacted by sudden hikes in
     * write rates.
     *
     * NOTE(review): the multiplication by curr_seconds_since_lcp_cut
     * followed by the /100 * 75 scaling presumably converts the value
     * into the units expected by the MAX comparison below — confirm
     * against calculate_checkpoint_rate.
     */
    Uint64 curr_update_size = update_size - m_update_size_lcp_last;
    Uint64 curr_insert_size = insert_size - m_insert_size_lcp_last;
    Uint64 curr_delete_size = delete_size - m_delete_size_lcp_last;
    Uint64 curr_seconds_since_lcp_cut = 0;
    curr_change_rate = calculate_checkpoint_rate(curr_update_size,
                                                 curr_insert_size,
                                                 curr_delete_size,
                                                 total_memory,
                                                 curr_seconds_since_lcp_cut);
    if (curr_change_rate != 0)
    {
      curr_change_rate *= curr_seconds_since_lcp_cut;
    }
    curr_change_rate /= Uint64(100);
    curr_change_rate *= Uint64(75);
  }
  /* Remember totals so the next call can compute deltas. */
  m_update_size_lcp_last = update_size;
  m_insert_size_lcp_last = insert_size;
  m_delete_size_lcp_last = delete_size;

  Uint64 redo_percentage;
  Uint64 max_redo_used_before_cut;
  Uint64 mean_redo_used_before_cut;
  Uint64 mean_redo_speed_per_sec;
  Uint64 current_redo_speed_per_sec;
  Uint64 redo_available;
  calculate_redo_parameters(redo_usage,
                            redo_size,
                            redo_written_since_last_call,
                            millis_since_last_call,
                            redo_percentage,
                            max_redo_used_before_cut,
                            mean_redo_used_before_cut,
                            mean_redo_speed_per_sec,
                            current_redo_speed_per_sec,
                            redo_available);

  /* Changes since the current LCP cut point. */
  update_size -= m_update_size_lcp[0];
  insert_size -= m_insert_size_lcp[0];
  delete_size -= m_delete_size_lcp[0];
  Uint64 seconds_since_lcp_cut = 0;
  Uint64 change_rate = calculate_checkpoint_rate(update_size,
                                                 insert_size,
                                                 delete_size,
                                                 get_total_memory(),
                                                 seconds_since_lcp_cut);
  change_rate = MAX(change_rate, curr_change_rate);

  m_proposed_disk_write_speed = change_rate;

  m_redo_percentage = redo_percentage;
  m_max_redo_percentage = MAX(redo_percentage, m_max_redo_percentage);
  RedoStateRep::RedoAlertState save_redo_alert_state =
    m_local_redo_alert_state;
  change_alert_state_redo_percent(redo_percentage);
  change_alert_state_redo_usage(max_redo_used_before_cut,
                                mean_redo_used_before_cut,
                                redo_available);
  handle_global_alert_state(signal, save_redo_alert_state);
  c_pgman->set_redo_alert_state(m_redo_alert_state);
  set_redo_alert_factor(redo_percentage);
  set_lcp_timing_factors(seconds_since_lcp_cut);
  set_proposed_disk_write_speed(current_redo_speed_per_sec,
                                mean_redo_speed_per_sec,
                                seconds_since_lcp_cut);

#ifdef DEBUG_REDO_CONTROL
  Int64 current_lag = m_lcp_lag[0] + m_lcp_lag[1];
  DEB_REDO_CONTROL(("(%u)Proposed speed is %llu kB/sec"
                    ", current_redo_speed is %llu kB/sec and"
                    ", mean_redo_speed is %llu kB/sec"
                    ", %s is %llu MB, change_rate is: %llu kB",
                    instance(),
                    (m_proposed_disk_write_speed / Uint64(1024)),
                    (current_redo_speed_per_sec / Uint64(1024)),
                    (mean_redo_speed_per_sec / Uint64(1024)),
                    (current_lag >= 0) ? "lag" : "ahead",
                    (current_lag >= 0) ? (current_lag / (1024 * 1024)) :
                      (-current_lag/ (1024 * 1024)),
                    (m_lcp_change_rate / 1024)));
  DEB_REDO_CONTROL(("(%u)state: %u, redo_size: %llu MByte, "
                    "redo_percent: %llu, last LCP time in ms: %llu"
                    ", m_lcp_timing_factor: %llu%%",
                    instance(),
                    m_redo_alert_state,
                    redo_size,
                    redo_percentage,
                    m_last_lcp_exec_time_in_ms,
                    m_lcp_timing_factor));
#endif
}
1639
1640 Uint64
calculate_proposed_disk_write_speed()1641 Backup::calculate_proposed_disk_write_speed()
1642 {
1643 if (!is_partial_lcp_enabled() || !is_redo_control_enabled())
1644 {
1645 jam();
1646 return 0;
1647 }
1648 Uint64 proposed_speed = m_proposed_disk_write_speed;
1649 proposed_speed /= CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS;
1650 return proposed_speed;
1651 }
1652
1653 /**
1654 * Calculate the current max and min write speeds, based on the
1655 * current disk-write demands on this LDM thread
1656 */
void
Backup::calculate_current_speed_bounds(Uint64& max_speed,
                                       Uint64& max_backup_speed,
                                       Uint64& min_speed)
{
  jam();

  /**
   * Start from the configured bounds. NOTE(review): judging by the debug
   * output below, speeds here appear to be in bytes per measurement
   * period (converted to per-second via
   * CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS) — confirm against the
   * c_defaults declarations.
   */
  max_speed = c_defaults.m_disk_write_speed_max;
  max_backup_speed = c_defaults.m_disk_write_speed_max;
  min_speed = c_defaults.m_disk_write_speed_min;

  {
    /**
     * Critical level for REDO means that we need to write checkpoint
     * urgently. We set it to maximum configurable level (level at own
     * restarts).
     *
     * High level for REDO means that we need to speed up checkpoints,
     * but there is still no urgency. In this we set the maximum
     * checkpoint speed equal to the speed when another node is
     * performing a node restart.
     *
     * We calculate proposed speed based on the REDO write speed
     * adjusted based on the setting of RecoveryWork. To keep up
     * with writing in a large database we need to write about
     * CHANGE_SPEED * (1 + (100 / RecoveryWork)). Thus at default
     * setting of RecoveryWork we need to write 3x the CHANGE_SPEED
     * to LCP files to keep the checkpoints short.
     *
     * We will attempt to keep the checkpoint short, but we will
     * only adjust the maximum level for this purpose. We will
     * not decrease application writes more than necessary to keep
     * this write speed. We will impact application performance
     * more when the REDO log level comes closer to critical levels.
     *
     * We keep track of proposed disk write speed also when no LCP
     * is ongoing. Otherwise it will take a long time to speed up
     * disk write speed again when a new LCP starts up again.
     */
    jam();
    if (m_redo_alert_state == RedoStateRep::REDO_ALERT_CRITICAL)
    {
      jam();
      max_speed = c_defaults.m_disk_write_speed_max_own_restart;
      DEB_REDO_CONTROL(("(%u)Critical REDO level, new max_speed: %llu kB/sec",
                        instance(),
                        ((max_speed *
       Uint64(CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS)) / Uint64(1024))
                       ));
    }
    else if (m_redo_alert_state == RedoStateRep::REDO_ALERT_HIGH)
    {
      jam();
      max_speed = c_defaults.m_disk_write_speed_max_other_node_restart;
      DEB_REDO_CONTROL(("(%u)High REDO level, new max_speed: %llu kB/sec",
                        instance(),
                        ((max_speed *
       Uint64(CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS)) / Uint64(1024))
                       ));
    }
    else if (m_is_any_node_restarting)
    {
      jam();
      max_speed = c_defaults.m_disk_write_speed_max_other_node_restart;
      DEB_REDO_CONTROL(("(%u)Node restarting, new max_speed: %llu kB/sec",
                        instance(),
                        ((max_speed *
       Uint64(CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS)) / Uint64(1024))
                       ));
    }
    /* The proposed speed can raise the cap above the configured maximum. */
    Uint64 proposed_speed = calculate_proposed_disk_write_speed();
    if (proposed_speed > max_speed)
    {
      jam();
      max_speed = proposed_speed;
      DEB_REDO_CONTROL(("(%u)Proposed speed exceeds max_speed, "
                        "new max_speed: %llu kB/sec",
                        instance(),
                        ((max_speed *
       Uint64(CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS)) / Uint64(1024))
                       ));
    }
    DEB_REDO_CONTROL(("(%u)max_speed set to %llu kB/sec",
                      instance(),
                      ((max_speed *
       Uint64(CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS)) / Uint64(1024))));
  }

  /**
   * Thread balance
   *
   * As Backup is currently run on one LDM instance, we need to take
   * some steps to give it some extra DiskWriteSpeed allowance during
   * a Backup. This becomes more acute with more LDM threads.
   * The correct way to handle this is to parallelise backup and
   * the backup log.
   *
   * Until then, we will skew the per-LDM disk write speed bounds
   * temporarily during a Backup so that LDM 1 has a large fixed
   * portion as well as its usual 1/n share for LCP.
   *
   * When the Backup completes, balance is restored.
   */

  const Uint32 num_ldm_threads = globalData.ndbMtLqhThreads;

  if (m_is_backup_running && m_skew_disk_speed &&
      num_ldm_threads > 1)
  {
    jam();

    const Uint64 node_max_speed =
      max_backup_speed *
      num_ldm_threads;

    /* Backup will get a percentage of the node total allowance */
    Uint64 node_backup_max_speed =
      (node_max_speed * c_defaults.m_backup_disk_write_pct) /
      100;

    /* LCP gets the rest */
    Uint64 node_lcp_max_speed =
      node_max_speed - node_backup_max_speed;

    /* LDM threads get a fair share of the LCP allowance */
    Uint64 ldm_thread_lcp_max_speed =
      node_lcp_max_speed / num_ldm_threads;

    /* Backup LDM must perform both node Backup + thread LCP */
    Uint64 backup_ldm_max_speed =
      node_backup_max_speed +
      ldm_thread_lcp_max_speed;

    /* Other LDMs just do thread LCP */
    Uint64 other_ldm_max_speed =
      ldm_thread_lcp_max_speed;

    /* Sanity: the skewed shares must never exceed the node total. */
    ndbrequire(backup_ldm_max_speed +
               ((num_ldm_threads - 1) *
                other_ldm_max_speed) <=
               node_max_speed);

    if (is_backup_worker())
    {
      jam();
      /**
       * Min is set to node backup speed,
       * this should quickly increase the thread's
       * allowance.
       */
      max_backup_speed = backup_ldm_max_speed;
      min_speed = MAX(min_speed, node_backup_max_speed);
      if (!is_redo_control_enabled())
      {
        jam();
        max_speed = MAX(max_speed, max_backup_speed);
      }
    }
    else
    {
      jam();
      /**
       * Trim write bandwidth available
       * to other LDM threads
       */
      max_backup_speed = other_ldm_max_speed;
      min_speed = MIN(min_speed, max_backup_speed);
      if (!is_redo_control_enabled())
      {
        jam();
        max_speed = max_backup_speed;
      }
    }
  }
  if (m_is_backup_running &&
      is_redo_control_enabled())
  {
    /**
     * Make sure that the total can be the sum while running both a backup
     * and an LCP at the same time. The minimum is the same for total and
     * for backup. The minimum is always based on the configured value.
     */
    jam();
    max_speed += max_backup_speed;
  }
  ndbrequire(min_speed <= max_speed);
}
1844
1845 void
adjust_disk_write_speed_down(Uint64 & curr_disk_write_speed,Uint64 & loc_disk_write_speed_set_to_min,Uint64 min_speed,int adjust_speed)1846 Backup::adjust_disk_write_speed_down(Uint64& curr_disk_write_speed,
1847 Uint64& loc_disk_write_speed_set_to_min,
1848 Uint64 min_speed,
1849 int adjust_speed)
1850 {
1851 if ((Int64)curr_disk_write_speed < (Int64)adjust_speed)
1852 {
1853 loc_disk_write_speed_set_to_min++;
1854 curr_disk_write_speed = min_speed;
1855 }
1856 else
1857 {
1858 curr_disk_write_speed -= adjust_speed;
1859 if (curr_disk_write_speed < min_speed)
1860 {
1861 loc_disk_write_speed_set_to_min++;
1862 curr_disk_write_speed = min_speed;
1863 }
1864 }
1865 }
1866
1867 void
adjust_disk_write_speed_up(Uint64 & curr_disk_write_speed,Uint64 max_speed,int adjust_speed)1868 Backup::adjust_disk_write_speed_up(Uint64& curr_disk_write_speed,
1869 Uint64 max_speed,
1870 int adjust_speed)
1871 {
1872 curr_disk_write_speed += adjust_speed;
1873 if (curr_disk_write_speed > max_speed)
1874 {
1875 curr_disk_write_speed = max_speed;
1876 }
1877 }
1878
/**
 * Calculate new disk checkpoint write speed based on the new
 * multiplication factor, we decrease in steps of 10% per second.
 *
 * Called once per ~1000ms interval from the RESET_DISK_SPEED_COUNTER
 * CONTINUEB handling. Adjusts m_curr_disk_write_speed and
 * m_curr_backup_disk_write_speed within the bounds computed by
 * calculate_current_speed_bounds(), reacting to REDO IO lag and to
 * this LDM thread's CPU usage.
 */
void
Backup::calculate_disk_write_speed(Signal *signal)
{
  if (!m_our_node_started && !m_first_lcp_started)
  {
    /* No adaptiveness while we're still starting. */
    jam();
    return;
  }
  Uint64 max_disk_write_speed;
  Uint64 max_backup_disk_write_speed;
  Uint64 min_disk_write_speed;
  /* NOTE(review): jamEntry() here and below in the cpu_usage cascade is
   * unusual mid-function (jam() is the norm inside a block method) —
   * confirm whether these are intentional. */
  jamEntry();
  calculate_current_speed_bounds(max_disk_write_speed,
                                 max_backup_disk_write_speed,
                                 min_disk_write_speed);

  /**
   * Get CPU usage for the thread (synchronous request to THRMAN,
   * answer is returned in signal->theData[0] as a percentage).
   */
  EXECUTE_DIRECT_MT(THRMAN, GSN_GET_CPU_USAGE_REQ, signal,
                    1,
                    getThrmanInstance());
  Uint32 cpu_usage = signal->theData[0];

  /**
   * It is possible that the limits (max + min) have moved so that
   * the current speed is now outside them, if so we immediately
   * track to the relevant limit.
   * In these cases, the data collected for the last period regarding
   * redo log etc will not be relevant here.
   */
  bool ret_flag = false;
  if (m_curr_disk_write_speed < min_disk_write_speed)
  {
    jam();
    m_curr_disk_write_speed = min_disk_write_speed;
    DEB_REDO_CONTROL(("(%u)1:Current disk write speed is %llu kB/sec",
                      instance(),
                      ((m_curr_disk_write_speed *
                        CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS) /
                        Uint64(1024))
                      ));
    ret_flag = true;
  }
  else if (m_curr_disk_write_speed > max_disk_write_speed)
  {
    jam();
    m_curr_disk_write_speed = max_disk_write_speed;
    DEB_REDO_CONTROL(("(%u)2:Current disk write speed is %llu kB/sec",
                      instance(),
                      ((m_curr_disk_write_speed *
                        CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS) /
                        Uint64(1024))
                      ));
    ret_flag = true;
  }
  /* The backup-specific speed has its own ceiling, tracked separately. */
  if (m_curr_backup_disk_write_speed > max_backup_disk_write_speed)
  {
    jam();
    DEB_REDO_CONTROL(("(%u)Current backup disk write speed is %llu kB/sec",
                      instance(),
                      ((m_curr_backup_disk_write_speed *
                        CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS) /
                        Uint64(1024))
                      ));
    m_curr_backup_disk_write_speed = max_backup_disk_write_speed;
  }
  if (ret_flag)
  {
    /* Speed was outside the bounds: tracked to the limit, no feedback step. */
    jam();
    debug_report_redo_control(cpu_usage);
    return;
  }


  /**
   * Current speed is within bounds, now consider whether to adjust
   * based on feedback.
   *
   * Calculate the max - min and divide by 6 to get the adjustment parameter
   * which is 16% of max - min. We will never adjust faster than this to avoid
   * too quick adaptiveness. For adjustments down we will adapt faster for IO
   * lags, for CPU speed we will adapt a bit slower dependent on how high
   * the CPU load is.
   */
  int diff_disk_write_speed =
    max_disk_write_speed - min_disk_write_speed;

  int adjust_speed_up = diff_disk_write_speed / 6;
  int adjust_speed_up_high = diff_disk_write_speed / 3;
  int adjust_speed_down_high = diff_disk_write_speed / 5;
  int adjust_speed_down_medium = diff_disk_write_speed / 8;
  int adjust_speed_down_low = diff_disk_write_speed / 12;

  jam();
  if (diff_disk_write_speed <= 0 || adjust_speed_up == 0)
  {
    jam();
    /**
     * The min == max which gives no room to adapt the LCP speed.
     * or the difference is too small to adapt it.
     *
     * If min == max for total we will treat backup the same way.
     */
    DEB_REDO_CONTROL(("(%u)3:Current disk write speed is %llu kB/sec",
                      instance(),
                      ((m_curr_disk_write_speed *
                        CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS) /
                        Uint64(1024))
                      ));
    debug_report_redo_control(cpu_usage);
    return;
  }
  if (c_lqh->is_ldm_instance_io_lagging())
  {
    /**
     * With IO lagging behind we will decrease the LCP speed to accomodate
     * for more REDO logging bandwidth. The definition of REDO log IO lagging
     * is kept in DBLQH, but will be a number of seconds of outstanding REDO
     * IO requests that LQH is still waiting for completion of.
     * This is a harder condition, so here we will immediately slow down fast.
     */
    jam();
    slowdowns_due_to_io_lag++;
    adjust_disk_write_speed_down(m_curr_disk_write_speed,
                                 disk_write_speed_set_to_min,
                                 min_disk_write_speed,
                                 adjust_speed_down_high);
    adjust_disk_write_speed_down(m_curr_backup_disk_write_speed,
                                 backup_disk_write_speed_set_to_min,
                                 min_disk_write_speed,
                                 adjust_speed_down_high);
  }
  else
  {
    /**
     * Get CPU usage of this LDM thread during last second.
     * If CPU usage is over or equal to 95% we will decrease the LCP speed
     * If CPU usage is below 90% we will increase the LCP speed
     * one more step. Otherwise we will keep it where it currently is.
     *
     * We will not slow down checkpointing due to high CPU when the REDO log
     * is close to become exhausted. This should protect it from becoming
     * full.
     *
     * The speed of writing backups and LCPs are fairly linear to the
     * amount of bytes written. So e.g. writing 10 MByte/second gives
     * roughly about 10% CPU usage in one CPU. So by writing less we have a
     * more or less linear decrease of CPU usage. Naturally the speed of
     * writing is very much coupled to the CPU speed. CPUs today have all
     * sorts of power save magic, but this algorithm doesn't kick in until
     * we're at very high CPU loads where we won't be in power save mode.
     * Obviously it also works in the opposite direction that we can easily
     * speed up things when the CPU is less used.
     *
     * One complication of this algorithm is that we only measure the thread
     * CPU usage, so we don't really know here the level of CPU usage in total
     * of the system. Getting this information is quite complex and can
     * quickly change if the user is also using the machine for many other
     * things. In this case the algorithm will simply go up to the current
     * maximum value. So it will work much the same as before this algorithm
     * was put in place with the maximum value as the new DiskCheckpointSpeed
     * parameter.
     *
     * The algorithm will work best in cases where the user has locked the
     * thread to one or more CPUs and ensures that the thread can always run
     * by not allocating more than one thread per CPU.
     *
     * The reason we put the CPU usage limits fairly high is that the LDM
     * threads become more and more efficient as loads goes up. The reason
     * for this is that as more and more signals are executed in each loop
     * before checking for new signals. This means that as load goes up we
     * spend more and more time doing useful work. At low loads we spend a
     * significant time simply waiting for new signals to arrive and going to
     * sleep and waking up. So being at 95% load still means that we have
     * a bit more than 5% capacity left and even being at 90% means we
     * might have as much as 20% more capacity to use.
     */
    jam();
    bool adjust_disk_speed = true;
    bool adjust_backup_disk_speed = true;
    if (m_redo_alert_state >= RedoStateRep::REDO_ALERT_LOW)
    {
      /**
       * We are in a critical or high state for our REDO log, we must ensure
       * that we step up to use more and more CPU for checkpoints as long as
       * we don't oversubscribe the IO subsystem. This is why we check for
       * IO lag slowdown before we come here. The IO lag will still slow
       * down the checkpoint speed. CPU usage will not slow down checkpoint
       * processing.
       */
      jam();
      adjust_disk_speed = false;
      adjust_disk_write_speed_up(m_curr_disk_write_speed,
                                 max_disk_write_speed,
                                 adjust_speed_up_high);
    }
    else if (!m_our_node_started)
    {
      adjust_disk_speed = false;
      adjust_backup_disk_speed = false;
      /**
       * We are not in a critical state of the REDO log and we are
       * executing a node restart. We will allow for more CPU usage
       * in this state, but we will still slow down checkpoints when
       * CPU become overloaded.
       */
      if (cpu_usage < 99)
      {
        jam();
        /* 0-98% load, speed up */
        adjust_disk_write_speed_up(m_curr_disk_write_speed,
                                   max_disk_write_speed,
                                   adjust_speed_up);
      }
      else if (cpu_usage < 100)
      {
        jam();
        /* 99% load, slow down */
        slowdowns_due_to_high_cpu++;
        adjust_disk_write_speed_down(m_curr_disk_write_speed,
                                     disk_write_speed_set_to_min,
                                     min_disk_write_speed,
                                     adjust_speed_down_low);
      }
      else
      {
        /* 100% load, slow down a bit faster */
        jam();
        slowdowns_due_to_high_cpu++;
        adjust_disk_write_speed_down(m_curr_disk_write_speed,
                                     disk_write_speed_set_to_min,
                                     min_disk_write_speed,
                                     adjust_speed_down_medium);
      }
    }
    /**
     * CPU feedback cascade. The adjust_* flags above prevent double
     * adjustment when the REDO-alert or node-restart branch already
     * handled the checkpoint speed.
     */
    if (cpu_usage < 90)
    {
      jamEntry();
      if (adjust_disk_speed)
      {
        adjust_disk_write_speed_up(m_curr_disk_write_speed,
                                   max_disk_write_speed,
                                   adjust_speed_up);
      }
      if (adjust_backup_disk_speed)
      {
        adjust_disk_write_speed_up(m_curr_backup_disk_write_speed,
                                   max_backup_disk_write_speed,
                                   adjust_speed_up);
      }
    }
    else if (cpu_usage < 95)
    {
      /* 90-94% load, keep current speeds */
      jam();
    }
    else if (cpu_usage < 97)
    {
      jam();
      /* 95-96% load, slightly slow down */
      if (adjust_disk_speed)
      {
        slowdowns_due_to_high_cpu++;
        adjust_disk_write_speed_down(m_curr_disk_write_speed,
                                     disk_write_speed_set_to_min,
                                     min_disk_write_speed,
                                     adjust_speed_down_low);
      }
      if (adjust_backup_disk_speed)
      {
        slowdown_backups_due_to_high_cpu++;
        adjust_disk_write_speed_down(m_curr_backup_disk_write_speed,
                                     backup_disk_write_speed_set_to_min,
                                     min_disk_write_speed,
                                     adjust_speed_down_low);
      }
    }
    else if (cpu_usage < 99)
    {
      jamEntry();
      /* 97-98% load, slow down */
      if (adjust_disk_speed)
      {
        slowdowns_due_to_high_cpu++;
        adjust_disk_write_speed_down(m_curr_disk_write_speed,
                                     disk_write_speed_set_to_min,
                                     min_disk_write_speed,
                                     adjust_speed_down_medium);
      }
      if (adjust_backup_disk_speed)
      {
        slowdown_backups_due_to_high_cpu++;
        adjust_disk_write_speed_down(m_curr_backup_disk_write_speed,
                                     backup_disk_write_speed_set_to_min,
                                     min_disk_write_speed,
                                     adjust_speed_down_medium);
      }
    }
    else
    {
      jamEntry();
      /* 99-100% load, slow down a bit faster */
      if (adjust_disk_speed)
      {
        slowdowns_due_to_high_cpu++;
        adjust_disk_write_speed_down(m_curr_disk_write_speed,
                                     disk_write_speed_set_to_min,
                                     min_disk_write_speed,
                                     adjust_speed_down_high);
      }
      if (adjust_backup_disk_speed)
      {
        slowdown_backups_due_to_high_cpu++;
        adjust_disk_write_speed_down(m_curr_backup_disk_write_speed,
                                     backup_disk_write_speed_set_to_min,
                                     min_disk_write_speed,
                                     adjust_speed_down_high);
      }
    }
  }
  debug_report_redo_control(cpu_usage);
}
2205
2206 void
send_next_reset_disk_speed_counter(Signal * signal)2207 Backup::send_next_reset_disk_speed_counter(Signal *signal)
2208 {
2209 signal->theData[0] = BackupContinueB::RESET_DISK_SPEED_COUNTER;
2210 sendSignalWithDelay(reference(),
2211 GSN_CONTINUEB,
2212 signal,
2213 m_reset_delay_used,
2214 1);
2215 return;
2216 }
2217
2218 void
execCHECK_NODE_RESTARTCONF(Signal * signal)2219 Backup::execCHECK_NODE_RESTARTCONF(Signal *signal)
2220 {
2221 bool old_is_backup_running = m_is_backup_running;
2222 bool old_is_any_node_restarting = m_is_any_node_restarting;
2223 m_is_lcp_running = (signal->theData[0] == 1);
2224 m_is_backup_running = g_is_single_thr_backup_running; /* Global from backup instance */
2225 m_is_any_node_restarting = (signal->theData[1] == 1);
2226 const char* backup_text=NULL;
2227 const char* restart_text=NULL;
2228
2229 /* No logging of LCP start/stop w.r.t. Disk Speed */
2230 if (old_is_backup_running != m_is_backup_running)
2231 {
2232 if (old_is_backup_running)
2233 {
2234 backup_text=" Backup completed";
2235 }
2236 else
2237 {
2238 backup_text=" Backup started";
2239 }
2240 }
2241 if (old_is_any_node_restarting != m_is_any_node_restarting)
2242 {
2243 if (old_is_any_node_restarting)
2244 {
2245 restart_text=" Node restart finished";
2246 }
2247 else
2248 {
2249 restart_text=" Node restart ongoing";
2250 }
2251 }
2252
2253 if (is_backup_worker())
2254 {
2255 /* Just have one LDM log the transition */
2256 if (backup_text || restart_text)
2257 {
2258 g_eventLogger->info("Adjusting disk write speed bounds due to :%s%s",
2259 (backup_text ? backup_text : ""),
2260 (restart_text ? restart_text : ""));
2261 }
2262 }
2263 }
2264
/**
 * CONTINUEB dispatcher for the Backup block. theData[0] selects the
 * continuation; theData[1..3] carry per-case arguments (record/file/table
 * pool indexes, fragment numbers).
 */
void
Backup::execCONTINUEB(Signal* signal)
{
  jamEntry();
  const Uint32 Tdata0 = signal->theData[0];
  const Uint32 Tdata1 = signal->theData[1];
  const Uint32 Tdata2 = signal->theData[2];
  const Uint32 Tdata3 = signal->theData[3];

  switch(Tdata0) {
  case BackupContinueB::RESET_DISK_SPEED_COUNTER:
  {
    /* Periodic (~100ms) tick driving the adaptive disk write speed. */
    jam();
    const NDB_TICKS curr_time = NdbTick_getCurrentTicks();
    const Uint64 millisPassed =
      NdbTick_Elapsed(m_monitor_snapshot_start,curr_time).milliSec();
    if (millisPassed >= 800 && !m_node_restart_check_sent)
    {
      /**
       * Check for node restart ongoing, we will check for it and use
       * the cached copy of the node restart state when deciding on the
       * disk checkpoint speed. We will start this check a few intervals
       * before calculating the new disk checkpoint speed. We will send
       * such a check once per interval we are changing disk checkpoint
       * speed.
       *
       * So we call DIH asynchronously here after 800ms have passed such
       * that when 1000 ms have passed and we will check disk speeds we
       * have information about if there is a node restart ongoing or not.
       * This information will only affect disk write speed, so it's not
       * a problem to rely on up to 200ms old information.
       */
      jam();
      m_node_restart_check_sent = true;
      signal->theData[0] = reference();
      sendSignal(DBDIH_REF, GSN_CHECK_NODE_RESTARTREQ, signal, 1, JBB);
    }
    /**
     * We check for millis passed larger than 989 to handle the situation
     * when we wake up slightly too early. Since we only wake up once every
     * 100 millisecond, this should be better than occasionally get intervals
     * of 1100 milliseconds. All the calculations takes the real interval into
     * account, so it should not corrupt any data.
     */
    if (millisPassed > 989)
    {
      /* A full ~1s interval has elapsed: recompute disk write speeds. */
      jam();
      m_node_restart_check_sent = false;
      monitor_disk_write_speed(curr_time, millisPassed);
      measure_change_speed(signal, Uint64(millisPassed));
      calculate_disk_write_speed(signal);
      c_pgman->set_current_disk_write_speed(m_curr_disk_write_speed *
                     Uint64(CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS));
    }
    /* Carry over any over-spend from the last period into the next one. */
    handle_overflow(m_overflow_disk_write,
                    m_words_written_this_period,
                    m_curr_disk_write_speed);
    handle_overflow(m_backup_overflow_disk_write,
                    m_backup_words_written_this_period,
                    m_curr_backup_disk_write_speed);
    calculate_next_delay(curr_time);
    send_next_reset_disk_speed_counter(signal);
    break;
  }
  case BackupContinueB::BACKUP_FRAGMENT_INFO:
  {
    /**
     * Append one FragmentInfo section per fragment to the backup CTL
     * file, re-sending CONTINUEB to walk fragments; when the table's
     * fragments are exhausted, request table unlock from DICT.
     */
    jam();
    const Uint32 ptr_I = Tdata1;
    Uint32 tabPtr_I = Tdata2;
    Uint32 fragPtr_I = signal->theData[3];

    BackupRecordPtr ptr;
    c_backupPool.getPtr(ptr, ptr_I);
    TablePtr tabPtr;
    ptr.p->tables.getPtr(tabPtr, tabPtr_I);

    if (fragPtr_I != tabPtr.p->fragments.getSize())
    {
      jam();
      FragmentPtr fragPtr;
      tabPtr.p->fragments.getPtr(fragPtr, fragPtr_I);

      BackupFilePtr filePtr;
      ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr);

      /* Section size in 32-bit words (file format is word-based). */
      const Uint32 sz = sizeof(BackupFormat::CtlFile::FragmentInfo) >> 2;
      Uint32 * dst;
      if (!filePtr.p->operation.dataBuffer.getWritePtr(&dst, sz))
      {
        /* Buffer full: retry this same continuation after a short delay. */
        sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                            WaitDiskBufferCapacityMillis, 4);
        return;
      }

      /* All multi-byte fields are stored big-endian (htonl) on disk. */
      BackupFormat::CtlFile::FragmentInfo * fragInfo =
        (BackupFormat::CtlFile::FragmentInfo*)dst;
      fragInfo->SectionType = htonl(BackupFormat::FRAGMENT_INFO);
      fragInfo->SectionLength = htonl(sz);
      fragInfo->TableId = htonl(fragPtr.p->tableId);
      fragInfo->FragmentNo = htonl(fragPtr_I);
      fragInfo->NoOfRecordsLow = htonl((Uint32)(fragPtr.p->noOfRecords & 0xFFFFFFFF));
      fragInfo->NoOfRecordsHigh = htonl((Uint32)(fragPtr.p->noOfRecords >> 32));
      fragInfo->FilePosLow = htonl(0);
      fragInfo->FilePosHigh = htonl(0);

      filePtr.p->operation.dataBuffer.updateWritePtr(sz);

      fragPtr_I++;
    }

    if (fragPtr_I == tabPtr.p->fragments.getSize())
    {
      /* Table done: ask DICT to unlock it; conf resumes with next table. */
      BackupLockTab *req = (BackupLockTab *)signal->getDataPtrSend();
      req->m_senderRef = reference();
      req->m_tableId = tabPtr.p->tableId;
      req->m_lock_unlock = BackupLockTab::UNLOCK_TABLE;
      req->m_backup_state = BackupLockTab::BACKUP_FRAGMENT_INFO;
      req->m_backupRecordPtr_I = ptr_I;
      req->m_tablePtr_I = tabPtr_I;
      sendSignal(DBDICT_REF, GSN_BACKUP_LOCK_TAB_REQ, signal,
                 BackupLockTab::SignalLength, JBB);
      return;
    }

    /* More fragments in this table: continue with the next one. */
    signal->theData[0] = BackupContinueB::BACKUP_FRAGMENT_INFO;
    signal->theData[1] = ptr_I;
    signal->theData[2] = tabPtr_I;
    signal->theData[3] = fragPtr_I;
    sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
    return;
  }
  case BackupContinueB::START_FILE_THREAD:
  case BackupContinueB::BUFFER_UNDERFLOW:
  {
    /* Resume file writing for the backup file indexed by Tdata1. */
    jam();
    BackupFilePtr filePtr;
    c_backupFilePool.getPtr(filePtr, Tdata1);
    checkFile(signal, filePtr);
    return;
  }
  case BackupContinueB::BUFFER_FULL_SCAN:
  {
    jam();
    BackupFilePtr filePtr;
    BackupRecordPtr ptr;
    c_backupFilePool.getPtr(filePtr, Tdata1);
    c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
    /**
     * Given that we've been waiting a few milliseconds for buffers to become
     * free, we need to initialise the priority mode algorithm to ensure that
     * we select the correct priority mode.
     *
     * We get the number of jobs waiting at B-level to assess the current
     * activity level to get a new starting point of the algorithm.
     * Any load level below 16 signals in the buffer we ignore, if we have
     * a higher level we provide a value that will ensure that we most likely
     * will start at A-level.
     */
    init_scan_prio_level(signal, ptr);
    checkScan(signal, ptr, filePtr, true);
    return;
  }
  break; /* unreachable: the case above always returns */
  case BackupContinueB::BUFFER_FULL_FRAG_COMPLETE:
  {
    /* Buffer space freed up: finish fragment completion processing. */
    jam();
    BackupFilePtr filePtr;
    c_backupFilePool.getPtr(filePtr, Tdata1);
    fragmentCompleted(signal, filePtr, Tdata2);
    return;
  }
  break; /* unreachable: the case above always returns */
  case BackupContinueB::BUFFER_FULL_META:
  {
    /* Waiting for buffer space before writing table meta data. */
    jam();
    BackupRecordPtr ptr;
    c_backupPool.getPtr(ptr, Tdata1);

    BackupFilePtr filePtr;

    /* LCPs carry the file index in Tdata3; backups use the CTL file. */
    if (ptr.p->is_lcp())
    {
      jam();
      ptr.p->files.getPtr(filePtr, Tdata3);
    }
    else
    {
      jam();
      ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr);
    }
    FsBuffer & buf = filePtr.p->operation.dataBuffer;

    if(buf.getFreeSize() < buf.getMaxWrite()) {
      /* Still not enough room for a max-size write: retry after delay. */
      jam();
      TablePtr tabPtr;
      c_tablePool.getPtr(tabPtr, Tdata2);

      DEBUG_OUT("Backup - Buffer full - "
                << buf.getFreeSize()
                << " < " << buf.getMaxWrite()
                << " (sz: " << buf.getUsableSize()
                << " getMinRead: " << buf.getMinRead()
                << ") - tableId = " << tabPtr.p->tableId);

      signal->theData[0] = BackupContinueB::BUFFER_FULL_META;
      signal->theData[1] = Tdata1;
      signal->theData[2] = Tdata2;
      signal->theData[3] = Tdata3;
      sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                          WaitDiskBufferCapacityMillis, 4);
      return;
    }//if

    /* Room available: fetch the table definition from DICT. */
    TablePtr tabPtr;
    c_tablePool.getPtr(tabPtr, Tdata2);
    GetTabInfoReq * req = (GetTabInfoReq *)signal->getDataPtrSend();
    req->senderRef = reference();
    req->senderData = filePtr.i;
    req->requestType = GetTabInfoReq::RequestById |
      GetTabInfoReq::LongSignalConf;
    req->tableId = tabPtr.p->tableId;
    req->schemaTransId = 0;
    sendSignal(DBDICT_REF, GSN_GET_TABINFOREQ, signal,
               GetTabInfoReq::SignalLength, JBB);
    return;
  }
  case BackupContinueB::ZGET_NEXT_FRAGMENT:
  {
    /* Continue collecting fragment distribution info from DIH. */
    BackupRecordPtr backupPtr;
    TablePtr tabPtr;
    Uint32 fragNo = signal->theData[3];
    c_backupPool.getPtr(backupPtr, signal->theData[1]);
    ndbrequire(findTable(backupPtr, tabPtr, signal->theData[2]));
    getFragmentInfo(signal, backupPtr, tabPtr, fragNo);
    return;
  }
  case BackupContinueB::ZDELETE_LCP_FILE:
  {
    /* Continue background deletion of obsolete LCP files. */
    jam();
    delete_lcp_file_processing(signal);
    return;
  }
  default:
    ndbabort();
  }//switch
}
2511
/**
 * Confirmation from DICT for a BACKUP_LOCK_TAB_REQ. The conf carries back
 * the backup record and table indexes plus the state (m_backup_state)
 * that tells us which part of the backup flow to resume.
 */
void
Backup::execBACKUP_LOCK_TAB_CONF(Signal *signal)
{
  jamEntry();
  const BackupLockTab *conf = (const BackupLockTab *)signal->getDataPtr();
  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, conf->m_backupRecordPtr_I);
  TablePtr tabPtr;
  ptr.p->tables.getPtr(tabPtr, conf->m_tablePtr_I);

  switch(conf->m_backup_state) {
  case BackupLockTab::BACKUP_FRAGMENT_INFO:
  {
    /* A table was unlocked after its fragment info was written. */
    jam();
    ptr.p->tables.next(tabPtr);
    if (tabPtr.i == RNIL)
    {
      /* No more tables: all fragment info written, close the files. */
      jam();
      closeFiles(signal, ptr);
      return;
    }

    /* Resume fragment-info writing with the next table. */
    signal->theData[0] = BackupContinueB::BACKUP_FRAGMENT_INFO;
    signal->theData[1] = ptr.i;
    signal->theData[2] = tabPtr.i;
    signal->theData[3] = 0; // Start from first fragment of next table
    sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
    return;
  }
  case BackupLockTab::GET_TABINFO_CONF:
  {
    /* Table was locked as part of meta data retrieval. */
    jam();
    if (conf->errorCode)
    {
      jam();
      defineBackupRef(signal, ptr, conf->errorCode);
      return;
    }

    ptr.p->tables.next(tabPtr);
    afterGetTabinfoLockTab(signal, ptr, tabPtr);
    return;
  }
  case BackupLockTab::CLEANUP:
  {
    /* Unlock during abort/cleanup: proceed with the next table. */
    jam();
    ptr.p->tables.next(tabPtr);
    cleanupNextTable(signal, ptr, tabPtr);
    return;
  }
  default:
    ndbabort();
  }
}
2566
2567 void
execBACKUP_LOCK_TAB_REF(Signal * signal)2568 Backup::execBACKUP_LOCK_TAB_REF(Signal *signal)
2569 {
2570 jamEntry();
2571 ndbabort(); /* Not currently possible. */
2572 }
2573
get_new_speed_val64(Signal * signal)2574 Uint64 Backup::get_new_speed_val64(Signal *signal)
2575 {
2576 if (signal->length() == 3)
2577 {
2578 jam();
2579 Uint64 val = Uint64(signal->theData[1]);
2580 val <<= 32;
2581 val += Uint64(signal->theData[2]);
2582 return val;
2583 }
2584 else
2585 {
2586 jam();
2587 return 0;
2588 }
2589 }
2590
get_new_speed_val32(Signal * signal)2591 Uint64 Backup::get_new_speed_val32(Signal *signal)
2592 {
2593 if (signal->length() == 2)
2594 {
2595 jam();
2596 return Uint64(signal->theData[1]);
2597 }
2598 else
2599 {
2600 jam();
2601 return 0;
2602 }
2603 }
2604
2605 void
execDUMP_STATE_ORD(Signal * signal)2606 Backup::execDUMP_STATE_ORD(Signal* signal)
2607 {
2608 jamEntry();
2609
2610 /* Dump commands used in public interfaces */
2611 switch (signal->theData[0]) {
2612 case DumpStateOrd::BackupStatus:
2613 {
2614 /* See code in BackupProxy.cpp as well */
2615 BlockReference result_ref = CMVMI_REF;
2616 if (signal->length() == 2)
2617 result_ref = signal->theData[1];
2618
2619 BackupRecordPtr ptr;
2620 get_backup_record(ptr);
2621 reportStatus(signal, ptr, result_ref);
2622 return;
2623 }
2624 case DumpStateOrd::BackupMinWriteSpeed32:
2625 {
2626 jam();
2627 Uint64 new_val = get_new_speed_val32(signal);
2628 if (new_val < Uint64(1024*1024))
2629 {
2630 jam();
2631 g_eventLogger->info("Use: DUMP 100001 MinDiskWriteSpeed");
2632 return;
2633 }
2634 restore_disk_write_speed_numbers();
2635 c_defaults.m_disk_write_speed_min = new_val;
2636 calculate_real_disk_write_speed_parameters();
2637 return;
2638 }
2639 case DumpStateOrd::BackupMaxWriteSpeed32:
2640 {
2641 jam();
2642 Uint64 new_val = get_new_speed_val32(signal);
2643 if (new_val < Uint64(1024*1024))
2644 {
2645 jam();
2646 g_eventLogger->info("Use: DUMP 100002 MaxDiskWriteSpeed");
2647 return;
2648 }
2649 restore_disk_write_speed_numbers();
2650 c_defaults.m_disk_write_speed_max = new_val;
2651 calculate_real_disk_write_speed_parameters();
2652 return;
2653 }
2654 case DumpStateOrd::BackupMaxWriteSpeedOtherNodeRestart32:
2655 {
2656 jam();
2657 Uint64 new_val = get_new_speed_val32(signal);
2658 if (new_val < Uint64(1024*1024))
2659 {
2660 jam();
2661 g_eventLogger->info("Use: DUMP 100003 MaxDiskWriteSpeedOtherNodeRestart");
2662 return;
2663 }
2664 restore_disk_write_speed_numbers();
2665 c_defaults.m_disk_write_speed_max_other_node_restart = new_val;
2666 calculate_real_disk_write_speed_parameters();
2667 return;
2668 }
2669 case DumpStateOrd::BackupMinWriteSpeed64:
2670 {
2671 jam();
2672 Uint64 new_val = get_new_speed_val64(signal);
2673 if (new_val < Uint64(1024*1024))
2674 {
2675 jam();
2676 g_eventLogger->info("Use: DUMP 100004 MinDiskWriteSpeed(MSB) "
2677 "MinDiskWriteSpeed(LSB)");
2678 return;
2679 }
2680 restore_disk_write_speed_numbers();
2681 c_defaults.m_disk_write_speed_min = new_val;
2682 calculate_real_disk_write_speed_parameters();
2683 return;
2684 }
2685 case DumpStateOrd::BackupMaxWriteSpeed64:
2686 {
2687 jam();
2688 Uint64 new_val = get_new_speed_val64(signal);
2689 if (new_val < Uint64(1024*1024))
2690 {
2691 jam();
2692 g_eventLogger->info("Use: DUMP 100005 MaxDiskWriteSpeed(MSB) "
2693 "MaxDiskWriteSpeed(LSB)");
2694 return;
2695 }
2696 restore_disk_write_speed_numbers();
2697 c_defaults.m_disk_write_speed_max = new_val;
2698 calculate_real_disk_write_speed_parameters();
2699 return;
2700 }
2701 case DumpStateOrd::BackupMaxWriteSpeedOtherNodeRestart64:
2702 {
2703 jam();
2704 Uint64 new_val = get_new_speed_val64(signal);
2705 if (new_val < Uint64(1024*1024))
2706 {
2707 jam();
2708 g_eventLogger->info("Use: DUMP 100006"
2709 " MaxDiskWriteSpeedOtherNodeRestart(MSB)"
2710 " MaxDiskWriteSpeedOtherNodeRestart(LSB)");
2711 return;
2712 }
2713 restore_disk_write_speed_numbers();
2714 c_defaults.m_disk_write_speed_max_other_node_restart = new_val;
2715 calculate_real_disk_write_speed_parameters();
2716 return;
2717 }
2718 default:
2719 /* continue to debug section */
2720 break;
2721 }
2722
2723 /* Debugging or unclassified section */
2724
2725 if(signal->theData[0] == 20){
2726 if(signal->length() > 1){
2727 c_defaults.m_dataBufferSize = (signal->theData[1] * 1024 * 1024);
2728 }
2729 if(signal->length() > 2){
2730 c_defaults.m_logBufferSize = (signal->theData[2] * 1024 * 1024);
2731 }
2732 if(signal->length() > 3){
2733 c_defaults.m_minWriteSize = signal->theData[3] * 1024;
2734 }
2735 if(signal->length() > 4){
2736 c_defaults.m_maxWriteSize = signal->theData[4] * 1024;
2737 }
2738
2739 infoEvent("Backup: data: %d log: %d min: %d max: %d",
2740 c_defaults.m_dataBufferSize,
2741 c_defaults.m_logBufferSize,
2742 c_defaults.m_minWriteSize,
2743 c_defaults.m_maxWriteSize);
2744 return;
2745 }
2746 if(signal->theData[0] == 21){
2747 BackupReq * req = (BackupReq*)signal->getDataPtrSend();
2748 req->senderData = 23;
2749 req->backupDataLen = 0;
2750 sendSignal(reference(), GSN_BACKUP_REQ,signal,BackupReq::SignalLength, JBB);
2751 startTime = NdbTick_getCurrentTicks();
2752 return;
2753 }
2754
2755 if(signal->theData[0] == 22){
2756 const Uint32 seq = signal->theData[1];
2757 FsRemoveReq * req = (FsRemoveReq *)signal->getDataPtrSend();
2758 req->userReference = reference();
2759 req->userPointer = 23;
2760 req->directory = 1;
2761 req->ownDirectory = 1;
2762 FsOpenReq::setVersion(req->fileNumber, 2);
2763 FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_CTL);
2764 FsOpenReq::v2_setSequence(req->fileNumber, seq);
2765 FsOpenReq::v2_setNodeId(req->fileNumber, getOwnNodeId());
2766 sendSignal(NDBFS_REF, GSN_FSREMOVEREQ, signal,
2767 FsRemoveReq::SignalLength, JBA);
2768 return;
2769 }
2770
2771 if(signal->theData[0] == 23){
2772 /**
2773 * Print records
2774 */
2775 BackupRecordPtr ptr;
2776 for(c_backups.first(ptr); ptr.i != RNIL; c_backups.next(ptr)){
2777 infoEvent("BackupRecord %d: BackupId: %u MasterRef: %x ClientRef: %x",
2778 ptr.i, ptr.p->backupId, ptr.p->masterRef, ptr.p->clientRef);
2779 infoEvent(" State: %d", ptr.p->slaveState.getState());
2780 BackupFilePtr filePtr;
2781 for(ptr.p->files.first(filePtr); filePtr.i != RNIL;
2782 ptr.p->files.next(filePtr)){
2783 jam();
2784 infoEvent(" file %d: type: %d flags: H'%x",
2785 filePtr.i, filePtr.p->fileType,
2786 filePtr.p->m_flags);
2787 }
2788 }
2789
2790 const NDB_TICKS now = NdbTick_getCurrentTicks();
2791 const Uint64 resetElapsed = NdbTick_Elapsed(m_reset_disk_speed_time,now).milliSec();
2792 const Uint64 millisPassed = NdbTick_Elapsed(m_monitor_snapshot_start,now).milliSec();
2793 /* Dump measured disk write speed since last RESET_DISK_SPEED */
2794 ndbout_c("m_curr_disk_write_speed: %ukb m_words_written_this_period:"
2795 " %u kwords m_overflow_disk_write: %u kb",
2796 Uint32(4 * m_curr_disk_write_speed / 1024),
2797 Uint32(m_words_written_this_period / 1024),
2798 Uint32(m_overflow_disk_write / 1024));
2799 ndbout_c("m_backup_curr_disk_write_speed: %ukb "
2800 "m_backup_words_written_this_period:"
2801 " %u kwords m_backup_overflow_disk_write: %u kb",
2802 Uint32(4 * m_curr_backup_disk_write_speed / 1024),
2803 Uint32(m_backup_words_written_this_period / 1024),
2804 Uint32(m_backup_overflow_disk_write / 1024));
2805 ndbout_c("m_reset_delay_used: %u time since last RESET_DISK_SPEED: %llu millis",
2806 m_reset_delay_used, resetElapsed);
2807 /* Dump measured rate since last snapshot start */
2808 Uint64 byteRate = (4000 * m_monitor_words_written) / (millisPassed + 1);
2809 ndbout_c("m_monitor_words_written : %llu, duration : %llu millis, rate :"
2810 " %llu bytes/s : (%u pct of config)",
2811 m_monitor_words_written, millisPassed,
2812 byteRate,
2813 (Uint32) ((100 * byteRate / (4 * 10)) /
2814 (m_curr_disk_write_speed + 1)));
2815 byteRate = (4000 * m_backup_monitor_words_written) / (millisPassed + 1);
2816 ndbout_c("m_backup_monitor_words_written : %llu, duration : %llu"
2817 " millis, rate :"
2818 " %llu bytes/s : (%u pct of config)",
2819 m_backup_monitor_words_written, millisPassed,
2820 byteRate,
2821 (Uint32) ((100 * byteRate / (4 * 10)) /
2822 (m_curr_backup_disk_write_speed + 1)));
2823
2824 for(c_backups.first(ptr); ptr.i != RNIL; c_backups.next(ptr))
2825 {
2826 ndbout_c("BackupRecord %u: BackupId: %u MasterRef: %x ClientRef: %x",
2827 ptr.i, ptr.p->backupId, ptr.p->masterRef, ptr.p->clientRef);
2828 ndbout_c(" State: %u", ptr.p->slaveState.getState());
2829 ndbout_c(" noOfByte: %llu noOfRecords: %llu",
2830 ptr.p->noOfBytes, ptr.p->noOfRecords);
2831 ndbout_c(" noOfLogBytes: %llu noOfLogRecords: %llu",
2832 ptr.p->noOfLogBytes, ptr.p->noOfLogRecords);
2833 ndbout_c(" errorCode: %u", ptr.p->errorCode);
2834 BackupFilePtr filePtr;
2835 for(ptr.p->files.first(filePtr); filePtr.i != RNIL;
2836 ptr.p->files.next(filePtr))
2837 {
2838 ndbout_c(" file %u: type: %u flags: H'%x tableId: %u fragmentId: %u",
2839 filePtr.i, filePtr.p->fileType, filePtr.p->m_flags,
2840 filePtr.p->tableId, filePtr.p->fragmentNo);
2841 }
2842 if (ptr.p->slaveState.getState() == SCANNING && ptr.p->dataFilePtr[0] != RNIL)
2843 {
2844 c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[0]);
2845 OperationRecord & op = filePtr.p->operation;
2846 Uint32 *tmp = NULL;
2847 Uint32 sz = 0;
2848 bool eof = FALSE;
2849 bool ready = op.dataBuffer.getReadPtr(&tmp, &sz, &eof);
2850 ndbout_c("ready: %s eof: %s", ready ? "TRUE" : "FALSE", eof ? "TRUE" : "FALSE");
2851 }
2852 }
2853 return;
2854 }
2855 if(signal->theData[0] == 24){
2856 /**
2857 * Print size of records etc.
2858 */
2859 infoEvent("Backup - dump pool sizes");
2860 infoEvent("BackupPool: %d BackupFilePool: %d TablePool: %d",
2861 c_backupPool.getSize(), c_backupFilePool.getSize(),
2862 c_tablePool.getSize());
2863 infoEvent("AttrPool: %d TriggerPool: %d FragmentPool: %d",
2864 c_backupPool.getSize(), c_backupFilePool.getSize(),
2865 c_tablePool.getSize());
2866 infoEvent("PagePool: %d",
2867 c_pagePool.getSize());
2868
2869
2870 if(signal->getLength() == 2 && signal->theData[1] == 2424)
2871 {
2872 /**
2873 * Handle LCP
2874 */
2875 BackupRecordPtr lcp;
2876 get_lcp_record(lcp);
2877
2878 ndbrequire(c_backupPool.getSize() == c_backupPool.getNoOfFree() + 1);
2879 ndbrequire(c_tablePool.getSize() == c_tablePool.getNoOfFree() + 2);
2880 ndbrequire(c_fragmentPool.getSize() == c_fragmentPool.getNoOfFree() + 2);
2881 ndbrequire(c_triggerPool.getSize() == c_triggerPool.getNoOfFree());
2882
2883 ndbrequire(c_backupFilePool.getSize() == (c_backupFilePool.getNoOfFree() +
2884 (4 + 2 * BackupFormat::NDB_MAX_FILES_PER_LCP)));
2885
2886 Uint32 file_pages = 0;
2887 BackupFilePtr lcp_file;
2888
2889 c_backupFilePool.getPtr(lcp_file, lcp.p->prepareCtlFilePtr[0]);
2890 file_pages += lcp_file.p->pages.getSize();
2891
2892 c_backupFilePool.getPtr(lcp_file, lcp.p->prepareCtlFilePtr[1]);
2893 file_pages += lcp_file.p->pages.getSize();
2894
2895 for (Uint32 i = 0; i < BackupFormat::NDB_MAX_FILES_PER_LCP; i++)
2896 {
2897 c_backupFilePool.getPtr(lcp_file, lcp.p->dataFilePtr[i]);
2898 file_pages += lcp_file.p->pages.getSize();
2899
2900 c_backupFilePool.getPtr(lcp_file, lcp.p->prepareDataFilePtr[i]);
2901 file_pages += lcp_file.p->pages.getSize();
2902 }
2903
2904 c_backupFilePool.getPtr(lcp_file, lcp.p->ctlFilePtr);
2905 file_pages += lcp_file.p->pages.getSize();
2906
2907 c_backupFilePool.getPtr(lcp_file, lcp.p->deleteFilePtr);
2908 file_pages += lcp_file.p->pages.getSize();
2909
2910 ndbrequire(c_pagePool.getSize() ==
2911 c_pagePool.getNoOfFree() +
2912 file_pages);
2913 }
2914 }
2915
2916 if(signal->theData[0] == DumpStateOrd::DumpBackup)
2917 {
2918 /* Display a bunch of stuff about Backup defaults */
2919 infoEvent("Compressed Backup: %d", c_defaults.m_compressed_backup);
2920 infoEvent("Compressed LCP: %d", c_defaults.m_compressed_lcp);
2921 }
2922
2923 if(signal->theData[0] == DumpStateOrd::DumpBackupSetCompressed)
2924 {
2925 c_defaults.m_compressed_backup= signal->theData[1];
2926 infoEvent("Compressed Backup: %d", c_defaults.m_compressed_backup);
2927 }
2928
2929 if(signal->theData[0] == DumpStateOrd::DumpBackupSetCompressedLCP)
2930 {
2931 c_defaults.m_compressed_lcp= signal->theData[1];
2932 infoEvent("Compressed LCP: %d", c_defaults.m_compressed_lcp);
2933 }
2934
2935 if (signal->theData[0] == DumpStateOrd::BackupErrorInsert)
2936 {
2937 if (signal->getLength() == 1)
2938 ndbout_c("BACKUP: setting error %u", signal->theData[1]);
2939 else
2940 ndbout_c("BACKUP: setting error %u, %u",
2941 signal->theData[1], signal->theData[2]);
2942 SET_ERROR_INSERT_VALUE2(signal->theData[1], signal->theData[2]);
2943 }
2944 }
2945
2946 /**
2947 * We are using a round buffer of measurements, to simplify the code we
2948 * use this routing to quickly derive the disk write record from an index
2949 * (how many seconds back we want to check).
2950 */
2951 Uint32
get_disk_write_speed_record(Uint32 start_index)2952 Backup::get_disk_write_speed_record(Uint32 start_index)
2953 {
2954 ndbassert(start_index < DISK_WRITE_SPEED_REPORT_SIZE);
2955 if (next_disk_write_speed_report == last_disk_write_speed_report)
2956 {
2957 /* No speed reports generated yet */
2958 return DISK_WRITE_SPEED_REPORT_SIZE;
2959 }
2960 if (start_index < next_disk_write_speed_report)
2961 {
2962 return (next_disk_write_speed_report - (start_index + 1));
2963 }
2964 else if (last_disk_write_speed_report == 0)
2965 {
2966 /**
2967 * We might still be in inital phase when not all records have
2968 * been written yet.
2969 */
2970 return DISK_WRITE_SPEED_REPORT_SIZE;
2971 }
2972 else
2973 {
2974 return (DISK_WRITE_SPEED_REPORT_SIZE -
2975 ((start_index + 1) - next_disk_write_speed_report));
2976 }
2977 ndbassert(false);
2978 return 0;
2979 }
2980
2981 /**
2982 * Calculates the average speed for a number of seconds back.
2983 * reports the numbers in number of milliseconds that actually
2984 * passed and the number of bytes written in this period.
2985 */
2986 void
calculate_disk_write_speed_seconds_back(Uint32 seconds_back,Uint64 & millis_passed,Uint64 & backup_lcp_bytes_written,Uint64 & backup_bytes_written,Uint64 & redo_bytes_written,bool at_least_one)2987 Backup::calculate_disk_write_speed_seconds_back(Uint32 seconds_back,
2988 Uint64 & millis_passed,
2989 Uint64 & backup_lcp_bytes_written,
2990 Uint64 & backup_bytes_written,
2991 Uint64 & redo_bytes_written,
2992 bool at_least_one)
2993 {
2994 Uint64 millis_back = (MILLIS_IN_A_SECOND * seconds_back) -
2995 MILLIS_ADJUST_FOR_EARLY_REPORT;
2996 Uint32 start_index = 0;
2997
2998 ndbassert(seconds_back > 0);
2999
3000 millis_passed = 0;
3001 backup_lcp_bytes_written = 0;
3002 backup_bytes_written = 0;
3003 redo_bytes_written = 0;
3004 jam();
3005 while (at_least_one ||
3006 (millis_passed < millis_back &&
3007 start_index < DISK_WRITE_SPEED_REPORT_SIZE))
3008 {
3009 jam();
3010 at_least_one = false;
3011 Uint32 disk_write_speed_record = get_disk_write_speed_record(start_index);
3012 if (disk_write_speed_record == DISK_WRITE_SPEED_REPORT_SIZE)
3013 break;
3014 millis_passed +=
3015 disk_write_speed_rep[disk_write_speed_record].millis_passed;
3016 backup_lcp_bytes_written +=
3017 disk_write_speed_rep[disk_write_speed_record].backup_lcp_bytes_written;
3018 backup_bytes_written +=
3019 disk_write_speed_rep[disk_write_speed_record].backup_bytes_written;
3020 redo_bytes_written +=
3021 disk_write_speed_rep[disk_write_speed_record].redo_bytes_written;
3022 start_index++;
3023 }
3024 /**
3025 * Always report at least one millisecond to avoid risk of division
3026 * by zero later on in the code.
3027 */
3028 jam();
3029 if (millis_passed == 0)
3030 {
3031 jam();
3032 millis_passed = 1;
3033 }
3034 return;
3035 }
3036
/**
 * Computes the standard deviation (in bytes per second) of the disk
 * write speed over roughly 'seconds_back' seconds, separately for
 * backup+LCP writes, backup-only writes and REDO log writes.
 *
 * The averages passed in (bytes written and total millis) must come
 * from a preceding call to calculate_disk_write_speed_seconds_back()
 * over the same window. All three out-parameters are set to 0 when no
 * measurements are available (millis_passed_total == 0, or the ring
 * buffer yields no reports).
 */
void
Backup::calculate_std_disk_write_speed_seconds_back(Uint32 seconds_back,
                           Uint64 millis_passed_total,
                           Uint64 backup_lcp_bytes_written,
                           Uint64 backup_bytes_written,
                           Uint64 redo_bytes_written,
                           Uint64 & std_dev_backup_lcp_in_bytes_per_sec,
                           Uint64 & std_dev_backup_in_bytes_per_sec,
                           Uint64 & std_dev_redo_in_bytes_per_sec)
{
  Uint32 start_index = 0;
  Uint64 millis_passed = 0;
  /* Same early-report adjustment as in the average calculation. */
  Uint64 millis_back = (MILLIS_IN_A_SECOND * seconds_back) -
    MILLIS_ADJUST_FOR_EARLY_REPORT;
  Uint64 millis_passed_this_period;

  Uint64 avg_backup_lcp_bytes_per_milli;
  Uint64 backup_lcp_bytes_written_this_period;
  Uint64 avg_backup_lcp_bytes_per_milli_this_period;
  long double backup_lcp_temp_sum;
  long double backup_lcp_square_sum;

  Uint64 avg_backup_bytes_per_milli;
  Uint64 backup_bytes_written_this_period;
  Uint64 avg_backup_bytes_per_milli_this_period;
  long double backup_temp_sum;
  long double backup_square_sum;

  Uint64 avg_redo_bytes_per_milli;
  Uint64 redo_bytes_written_this_period;
  Uint64 avg_redo_bytes_per_milli_this_period;
  long double redo_temp_sum;
  long double redo_square_sum;

  ndbassert(seconds_back > 0);
  if (millis_passed_total == 0)
  {
    jam();
    /* No time passed: no deviation can be computed. */
    std_dev_backup_lcp_in_bytes_per_sec = 0;
    std_dev_backup_in_bytes_per_sec = 0;
    std_dev_redo_in_bytes_per_sec = 0;
    return;
  }
  /* E(X): average bytes per millisecond over the whole window. */
  avg_backup_lcp_bytes_per_milli = backup_lcp_bytes_written /
                                   millis_passed_total;
  avg_backup_bytes_per_milli = backup_bytes_written /
                               millis_passed_total;
  avg_redo_bytes_per_milli = redo_bytes_written / millis_passed_total;
  backup_lcp_square_sum = 0;
  backup_square_sum = 0;
  redo_square_sum = 0;
  jam();
  while (millis_passed < millis_back &&
         start_index < DISK_WRITE_SPEED_REPORT_SIZE)
  {
    jam();
    Uint32 disk_write_speed_record = get_disk_write_speed_record(start_index);
    if (disk_write_speed_record == DISK_WRITE_SPEED_REPORT_SIZE)
      break; /* Sentinel: no (more) reports available. */
    millis_passed_this_period =
      disk_write_speed_rep[disk_write_speed_record].millis_passed;
    backup_lcp_bytes_written_this_period =
      disk_write_speed_rep[disk_write_speed_record].backup_lcp_bytes_written;
    backup_bytes_written_this_period =
      disk_write_speed_rep[disk_write_speed_record].backup_bytes_written;
    redo_bytes_written_this_period =
      disk_write_speed_rep[disk_write_speed_record].redo_bytes_written;
    millis_passed += millis_passed_this_period;

    if (millis_passed_this_period != 0)
    {
      /**
       * We use here a calculation of standard deviation that firsts
       * calculates the variance. The variance is calculated as the square
       * mean of the difference. To get standard intervals we compute the
       * average per millisecond and then sum over all milliseconds. To
       * simplify the calculation we then multiply the square of the diffs
       * per milli to the number of millis passed in a particular measurement.
       * We divide by the total number of millis passed. We do this first to
       * avoid too big numbers. We use long double in all calculations to
       * ensure that we don't overflow.
       *
       * We also try to avoid divisions by zero in the code in multiple
       * places when we query this table before the first measurement have
       * been logged.
       *
       * Calculating standard deviation as:
       * Sum of X(i) - E(X) squared where X(i) is the average per millisecond
       * in this time period and E(X) is the average over the entire period.
       * We divide by number of periods, but to get it more real, we divide
       * by total_millis / millis_in_this_period since the periods aren't
       * exactly the same. Finally we take square root of the sum of those
       * (X(i) - E(X))^2 / #periods. Actually the standard deviation should
       * be calculated using #periods - 1 as divisor. Finally we also need
       * to convert it from standard deviation per millisecond to standard
       * deviation per second. We make that simple by multiplying the
       * result from this function by 1000.
       */
      jam();
      avg_backup_lcp_bytes_per_milli_this_period =
        backup_lcp_bytes_written_this_period / millis_passed_this_period;
      backup_lcp_temp_sum = (long double)avg_backup_lcp_bytes_per_milli;
      backup_lcp_temp_sum -=
        (long double)avg_backup_lcp_bytes_per_milli_this_period;
      backup_lcp_temp_sum *= backup_lcp_temp_sum;
      backup_lcp_temp_sum /= (long double)millis_passed_total;
      backup_lcp_temp_sum *= (long double)millis_passed_this_period;
      backup_lcp_square_sum += backup_lcp_temp_sum;

      avg_backup_bytes_per_milli_this_period =
        backup_bytes_written_this_period / millis_passed_this_period;
      backup_temp_sum = (long double)avg_backup_bytes_per_milli;
      backup_temp_sum -=
        (long double)avg_backup_bytes_per_milli_this_period;
      backup_temp_sum *= backup_temp_sum;
      backup_temp_sum /= (long double)millis_passed_total;
      backup_temp_sum *= (long double)millis_passed_this_period;
      backup_square_sum += backup_temp_sum;

      avg_redo_bytes_per_milli_this_period =
        redo_bytes_written_this_period / millis_passed_this_period;
      redo_temp_sum = (long double)avg_redo_bytes_per_milli;
      redo_temp_sum -= (long double)avg_redo_bytes_per_milli_this_period;
      redo_temp_sum *= redo_temp_sum;
      redo_temp_sum /= (long double)millis_passed_total;
      redo_temp_sum *= (long double)millis_passed_this_period;
      redo_square_sum += redo_temp_sum;
    }
    start_index++;
  }
  if (millis_passed == 0)
  {
    jam();
    std_dev_backup_lcp_in_bytes_per_sec = 0;
    std_dev_backup_in_bytes_per_sec = 0;
    std_dev_redo_in_bytes_per_sec = 0;
    return;
  }
  /**
   * Calculate standard deviation per millisecond
   * We use long double for the calculation, but we want to report it to
   * it in bytes per second, so this is easiest to do with an unsigned
   * integer number. Conversion from long double to Uint64 is a real
   * conversion that we leave to the compiler to generate code to make.
   */
  std_dev_backup_lcp_in_bytes_per_sec = (Uint64)sqrtl(backup_lcp_square_sum);
  std_dev_backup_in_bytes_per_sec = (Uint64)sqrtl(backup_square_sum);
  std_dev_redo_in_bytes_per_sec = (Uint64)sqrtl(redo_square_sum);

  /**
   * Convert to standard deviation per second
   * We calculated it in bytes per millisecond, so simple multiplication of
   * 1000 is sufficient here.
   */
  std_dev_backup_lcp_in_bytes_per_sec*= (Uint64)1000;
  std_dev_backup_in_bytes_per_sec*= (Uint64)1000;
  std_dev_redo_in_bytes_per_sec*= (Uint64)1000;
}
3195
3196 Uint64
calculate_millis_since_finished(Uint32 start_index)3197 Backup::calculate_millis_since_finished(Uint32 start_index)
3198 {
3199 Uint64 millis_passed = 0;
3200 jam();
3201 if (start_index == 0)
3202 {
3203 jam();
3204 return 0;
3205 }
3206 for (Uint32 i = 0; i < start_index; i++)
3207 {
3208 Uint32 disk_write_speed_record = get_disk_write_speed_record(i);
3209 millis_passed +=
3210 disk_write_speed_rep[disk_write_speed_record].millis_passed;
3211 }
3212 return millis_passed;
3213 }
3214
/**
 * Handler for DBINFO_SCANREQ: serves rows for the ndbinfo tables that
 * this block owns. Supports POOLS, DISK_WRITE_SPEED_AGGREGATE,
 * DISK_WRITE_SPEED_BASE and LOGBUFFERS; any other tableId falls
 * through to an immediate scan-conf. Uses the Ratelimit/cursor
 * protocol to break long scans into batches.
 */
void Backup::execDBINFO_SCANREQ(Signal *signal)
{
  jamEntry();
  DbinfoScanReq req= *(DbinfoScanReq*)signal->theData;
  const Ndbinfo::ScanCursor* cursor =
    CAST_CONSTPTR(Ndbinfo::ScanCursor, DbinfoScan::getCursorPtr(&req));

  Ndbinfo::Ratelimit rl;

  switch(req.tableId){
  case Ndbinfo::POOLS_TABLEID:
  {
    /* One row per record pool owned by this block. */
    Ndbinfo::pool_entry pools[] =
    {
      { "Backup Record",
        c_backupPool.getUsed(),
        c_backupPool.getSize(),
        c_backupPool.getEntrySize(),
        c_backupPool.getUsedHi(),
        { CFG_DB_PARALLEL_BACKUPS,0,0,0 },
        0},
      { "Backup File",
        c_backupFilePool.getUsed(),
        c_backupFilePool.getSize(),
        c_backupFilePool.getEntrySize(),
        c_backupFilePool.getUsedHi(),
        { CFG_DB_PARALLEL_BACKUPS,0,0,0 },
        0},
      { "Table",
        c_tablePool.getUsed(),
        c_tablePool.getSize(),
        c_tablePool.getEntrySize(),
        c_tablePool.getUsedHi(),
        { CFG_DB_PARALLEL_BACKUPS,
          CFG_DB_NO_TABLES,
          CFG_DB_NO_ORDERED_INDEXES,
          CFG_DB_NO_UNIQUE_HASH_INDEXES },
        0},
      { "Trigger",
        c_triggerPool.getUsed(),
        c_triggerPool.getSize(),
        c_triggerPool.getEntrySize(),
        c_triggerPool.getUsedHi(),
        { CFG_DB_PARALLEL_BACKUPS,
          CFG_DB_NO_TABLES,
          CFG_DB_NO_ORDERED_INDEXES,
          CFG_DB_NO_UNIQUE_HASH_INDEXES },
        0},
      { "Fragment",
        c_fragmentPool.getUsed(),
        c_fragmentPool.getSize(),
        c_fragmentPool.getEntrySize(),
        c_fragmentPool.getUsedHi(),
        { CFG_DB_NO_TABLES,
          CFG_DB_NO_ORDERED_INDEXES,
          CFG_DB_NO_UNIQUE_HASH_INDEXES,0 },
        0},
      { "Page",
        c_pagePool.getUsed(),
        c_pagePool.getSize(),
        c_pagePool.getEntrySize(),
        c_pagePool.getUsedHi(),
        { CFG_DB_BACKUP_MEM,
          CFG_DB_BACKUP_DATA_BUFFER_MEM,0,0 },
        0},
      { NULL, 0,0,0,0, { 0,0,0,0 }, 0}  /* Terminator entry. */
    };

    const size_t num_config_params =
      sizeof(pools[0].config_params) / sizeof(pools[0].config_params[0]);
    /* Resume from the pool index saved in the cursor at the last break. */
    Uint32 pool = cursor->data[0];
    BlockNumber bn = blockToMain(number());
    while(pools[pool].poolname)
    {
      jam();
      Ndbinfo::Row row(signal, req);
      row.write_uint32(getOwnNodeId());
      row.write_uint32(bn);           // block number
      row.write_uint32(instance());   // block instance
      row.write_string(pools[pool].poolname);

      row.write_uint64(pools[pool].used);
      row.write_uint64(pools[pool].total);
      row.write_uint64(pools[pool].used_hi);
      row.write_uint64(pools[pool].entry_size);
      for (size_t i = 0; i < num_config_params; i++)
        row.write_uint32(pools[pool].config_params[i]);
      row.write_uint32(GET_RG(pools[pool].record_type));
      row.write_uint32(GET_TID(pools[pool].record_type));
      ndbinfo_send_row(signal, req, row, rl);
      pool++;
      if (rl.need_break(req))
      {
        jam();
        /* Save the next pool index in the cursor and continue later. */
        ndbinfo_send_scan_break(signal, req, rl, pool);
        return;
      }
    }
    break;
  }
  case Ndbinfo::DISK_WRITE_SPEED_AGGREGATE_TABLEID:
  {

    jam();
    Uint64 backup_lcp_bytes_written;
    Uint64 backup_bytes_written;
    Uint64 redo_bytes_written;
    Uint64 std_dev_backup;
    Uint64 std_dev_backup_lcp;
    Uint64 std_dev_redo;
    Uint64 millis_passed;
    Ndbinfo::Row row(signal, req);
    Uint32 ldm_instance = instance();

    if (ldm_instance > 0)
    {
      /* Always start counting instances from 0 */
      ldm_instance--;
    }
    row.write_uint32(getOwnNodeId());
    row.write_uint32(ldm_instance);

    /* Report last second */
    calculate_disk_write_speed_seconds_back(1,
                                            millis_passed,
                                            backup_lcp_bytes_written,
                                            backup_bytes_written,
                                            redo_bytes_written);

    row.write_uint64((backup_lcp_bytes_written / millis_passed ) * 1000);
    row.write_uint64((redo_bytes_written / millis_passed) * 1000);

    /* Report average and std_dev of last 10 seconds */
    calculate_disk_write_speed_seconds_back(10,
                                            millis_passed,
                                            backup_lcp_bytes_written,
                                            backup_bytes_written,
                                            redo_bytes_written);

    row.write_uint64((backup_lcp_bytes_written * 1000) / millis_passed);
    row.write_uint64((redo_bytes_written * 1000) / millis_passed);

    calculate_std_disk_write_speed_seconds_back(10,
                                                millis_passed,
                                                backup_lcp_bytes_written,
                                                backup_bytes_written,
                                                redo_bytes_written,
                                                std_dev_backup_lcp,
                                                std_dev_backup,
                                                std_dev_redo);

    row.write_uint64(std_dev_backup_lcp);
    row.write_uint64(std_dev_redo);

    /* Report average and std_dev of last 60 seconds */
    calculate_disk_write_speed_seconds_back(60,
                                            millis_passed,
                                            backup_lcp_bytes_written,
                                            backup_bytes_written,
                                            redo_bytes_written);

    row.write_uint64((backup_lcp_bytes_written / millis_passed ) * 1000);
    row.write_uint64((redo_bytes_written / millis_passed) * 1000);

    calculate_std_disk_write_speed_seconds_back(60,
                                                millis_passed,
                                                backup_lcp_bytes_written,
                                                backup_bytes_written,
                                                redo_bytes_written,
                                                std_dev_backup_lcp,
                                                std_dev_backup,
                                                std_dev_redo);

    row.write_uint64(std_dev_backup_lcp);
    row.write_uint64(std_dev_redo);

    /* Counters of adaptive disk-speed decisions, plus current setting. */
    row.write_uint64(slowdowns_due_to_io_lag);
    row.write_uint64(slowdowns_due_to_high_cpu);
    row.write_uint64(disk_write_speed_set_to_min);
    row.write_uint64(m_curr_disk_write_speed *
                     CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS);

    ndbinfo_send_row(signal, req, row, rl);
    break;
  }
  case Ndbinfo::DISK_WRITE_SPEED_BASE_TABLEID:
  {
    jam();
    Uint32 ldm_instance = instance();

    if (ldm_instance > 0)
    {
      /* Always start counting instances from 0 */
      ldm_instance--;
    }
    /* One row per entry in the ring buffer, resuming from the cursor. */
    Uint32 start_index = cursor->data[0];
    for ( ; start_index < DISK_WRITE_SPEED_REPORT_SIZE;)
    {
      jam();
      Ndbinfo::Row row(signal, req);
      row.write_uint32(getOwnNodeId());
      row.write_uint32(ldm_instance);
      Uint32 disk_write_speed_record = get_disk_write_speed_record(start_index);
      if (disk_write_speed_record != DISK_WRITE_SPEED_REPORT_SIZE)
      {
        jam();
        Uint64 backup_lcp_bytes_written_this_period =
          disk_write_speed_rep[disk_write_speed_record].
            backup_lcp_bytes_written;
        Uint64 redo_bytes_written_this_period =
          disk_write_speed_rep[disk_write_speed_record].
            redo_bytes_written;
        Uint64 millis_passed_this_period =
          disk_write_speed_rep[disk_write_speed_record].millis_passed;
        Uint64 millis_since_finished =
          calculate_millis_since_finished(start_index);
        Uint64 target_disk_write_speed =
          disk_write_speed_rep[disk_write_speed_record].target_disk_write_speed;

        row.write_uint64(millis_since_finished);
        row.write_uint64(millis_passed_this_period);
        row.write_uint64(backup_lcp_bytes_written_this_period);
        row.write_uint64(redo_bytes_written_this_period);
        row.write_uint64(target_disk_write_speed);
      }
      else
      {
        jam();
        /* No report recorded at this index yet: emit an all-zero row. */
        row.write_uint64((Uint64)0);
        row.write_uint64((Uint64)0);
        row.write_uint64((Uint64)0);
        row.write_uint64((Uint64)0);
        row.write_uint64((Uint64)0);
      }
      ndbinfo_send_row(signal, req, row, rl);
      start_index++;
      if (rl.need_break(req))
      {
        jam();
        ndbinfo_send_scan_break(signal, req, rl, start_index);
        return;
      }
    }
    break;
  }
  case Ndbinfo::LOGBUFFERS_TABLEID:
  {
    jam();
    BackupRecordPtr ptr;
    if (!get_backup_record(ptr))
    {
      /* No backup in progress: nothing to report. */
      break;
    }

    jam();
    /* Two rows: the backup data buffer and the backup log buffer. */
    Uint32 files[2] = { ptr.p->dataFilePtr[0], ptr.p->logFilePtr };
    for (Uint32 i=0; i<NDB_ARRAY_SIZE(files); i++)
    {
      jam();
      Uint32 usableBytes, freeLwmBytes, freeSizeBytes;
      usableBytes = freeLwmBytes = freeSizeBytes = 0;
      Uint32 logtype = Ndbinfo::BACKUP_DATA_BUFFER;

      switch(i){
      case 0:
        logtype = Ndbinfo::BACKUP_DATA_BUFFER;
        usableBytes = c_defaults.m_dataBufferSize;
        break;
      case 1:
        logtype = Ndbinfo::BACKUP_LOG_BUFFER;
        usableBytes = c_defaults.m_logBufferSize;
        break;
      default:
        ndbabort();
        break;
      };

      BackupFilePtr filePtr;
      ptr.p->files.getPtr(filePtr, files[i]);
      /**
       * NOTE(review): logFilePtr is checked for both iterations, i.e.
       * also when reporting the data buffer (i == 0) — presumably
       * logFilePtr == RNIL implies the buffers are not yet allocated;
       * confirm against the file allocation code.
       */
      if (ptr.p->logFilePtr != RNIL)
      {
        freeSizeBytes = filePtr.p->operation.dataBuffer.getFreeSize() << 2;
        freeLwmBytes = filePtr.p->operation.dataBuffer.getFreeLwm() << 2;
      }
      else
      {
        freeSizeBytes = usableBytes;
        freeLwmBytes = usableBytes;
      }

      Ndbinfo::Row data_row(signal, req);
      data_row.write_uint32(getOwnNodeId());
      data_row.write_uint32(logtype);
      data_row.write_uint32(0);           // log id, always 0
      data_row.write_uint32(instance());  // log part, instance for ndbmtd

      data_row.write_uint64(usableBytes); // total allocated
      data_row.write_uint64(usableBytes - freeSizeBytes); // currently in use
      data_row.write_uint64(usableBytes - freeLwmBytes);  // high water mark
      // only 2 rows to send in total, so ignore ratelimit
      ndbinfo_send_row(signal, req, data_row, rl);
    }
    break;
  }
  default:
    break;
  }

  ndbinfo_send_scan_conf(signal, req, rl);
}
3525
3526 static const Uint32 MAX_TABLE_MAPS = 2;
3527 bool
findTable(const BackupRecordPtr & ptr,TablePtr & tabPtr,Uint32 tableId)3528 Backup::findTable(const BackupRecordPtr & ptr,
3529 TablePtr & tabPtr, Uint32 tableId)
3530 {
3531 Uint32 loopCount = 0;
3532 tabPtr.i = c_tableMap[tableId];
3533 while (loopCount++ < MAX_TABLE_MAPS)
3534 {
3535 if (tabPtr.i == RNIL)
3536 {
3537 jam();
3538 return false;
3539 }
3540 c_tablePool.getPtr(tabPtr);
3541 if (tabPtr.p->backupPtrI == ptr.i)
3542 {
3543 jam();
3544 return true;
3545 }
3546 jam();
3547 tabPtr.i = tabPtr.p->nextMapTable;
3548 }
3549 return false;
3550 }
3551
3552 void
insertTableMap(TablePtr & tabPtr,Uint32 backupPtrI,Uint32 tableId)3553 Backup::insertTableMap(TablePtr & tabPtr,
3554 Uint32 backupPtrI,
3555 Uint32 tableId)
3556 {
3557 tabPtr.p->backupPtrI = backupPtrI;
3558 tabPtr.p->tableId = tableId;
3559 tabPtr.p->nextMapTable = c_tableMap[tableId];
3560 c_tableMap[tableId] = tabPtr.i;
3561 }
3562
/**
 * Unlink the Table record 'tabPtr' (owned by backup record 'backupPtr')
 * from the singly-linked per-tableId chain in c_tableMap, and clear its
 * map fields. The record must be present within MAX_TABLE_MAPS hops;
 * otherwise this is a fatal error (ndbabort).
 */
void
Backup::removeTableMap(TablePtr &tabPtr,
                       Uint32 backupPtr,
                       Uint32 tableId)
{
  TablePtr prevTabPtr;
  TablePtr locTabPtr;
  Uint32 loopCount = 0;

  prevTabPtr.i = RNIL;
  prevTabPtr.p = 0;
  locTabPtr.i = c_tableMap[tableId];

  while (loopCount++ < MAX_TABLE_MAPS)
  {
    jam();
    c_tablePool.getPtr(locTabPtr);
    ndbrequire(locTabPtr.p->tableId == tableId);
    if (locTabPtr.p->backupPtrI == backupPtr)
    {
      /* Found the record: it must be the one the caller passed in. */
      ndbrequire(tabPtr.i == locTabPtr.i);
      if (prevTabPtr.i == RNIL)
      {
        jam();
        /* Removing the chain head: successor becomes the new head. */
        c_tableMap[tableId] = locTabPtr.p->nextMapTable;
      }
      else
      {
        jam();
        /* Bridge the predecessor over the removed record. */
        prevTabPtr.p->nextMapTable = locTabPtr.p->nextMapTable;
      }
      /* Reset the removed record's map fields. */
      locTabPtr.p->nextMapTable = RNIL;
      locTabPtr.p->tableId = RNIL;
      locTabPtr.p->backupPtrI = RNIL;
      return;
    }
    prevTabPtr = locTabPtr;
    locTabPtr.i = locTabPtr.p->nextMapTable;
  }
  ndbabort();  /* Record not found in the chain: must not happen. */
}
3604
xps(Uint64 x,Uint64 ms)3605 static Uint32 xps(Uint64 x, Uint64 ms)
3606 {
3607 float fx = float(x);
3608 float fs = float(ms);
3609
3610 if(ms == 0 || x == 0) {
3611 jamNoBlock();
3612 return 0;
3613 }//if
3614 jamNoBlock();
3615 return ((Uint32)(1000.0f * (fx + fs/2.1f))) / ((Uint32)fs);
3616 }
3617
3618 struct Number {
NumberNumber3619 Number(Uint64 r) { val = r;}
operator =Number3620 Number & operator=(Uint64 r) { val = r; return * this; }
3621 Uint64 val;
3622 };
3623
3624 NdbOut &
operator <<(NdbOut & out,const Number & val)3625 operator<< (NdbOut & out, const Number & val){
3626 char p = 0;
3627 Uint32 loop = 1;
3628 while(val.val > loop){
3629 loop *= 1000;
3630 p += 3;
3631 }
3632 if(loop != 1){
3633 p -= 3;
3634 loop /= 1000;
3635 }
3636
3637 switch(p){
3638 case 0:
3639 break;
3640 case 3:
3641 p = 'k';
3642 break;
3643 case 6:
3644 p = 'M';
3645 break;
3646 case 9:
3647 p = 'G';
3648 break;
3649 default:
3650 p = 0;
3651 }
3652 char str[2];
3653 str[0] = p;
3654 str[1] = 0;
3655 Uint32 tmp = (Uint32)((val.val + (loop >> 1)) / loop);
3656 #if 1
3657 if(p > 0)
3658 out << tmp << str;
3659 else
3660 out << tmp;
3661 #else
3662 out << val.val;
3663 #endif
3664
3665 return out;
3666 }
3667
3668 void
execBACKUP_CONF(Signal * signal)3669 Backup::execBACKUP_CONF(Signal* signal)
3670 {
3671 jamEntry();
3672 BackupConf * conf = (BackupConf*)signal->getDataPtr();
3673
3674 ndbout_c("Backup %u has started", conf->backupId);
3675 }
3676
3677 void
execBACKUP_REF(Signal * signal)3678 Backup::execBACKUP_REF(Signal* signal)
3679 {
3680 jamEntry();
3681 BackupRef * ref = (BackupRef*)signal->getDataPtr();
3682
3683 ndbout_c("Backup (%u) has NOT started %d", ref->senderData, ref->errorCode);
3684 }
3685
/**
 * Handler for BACKUP_COMPLETE_REP: logs a summary of a finished backup
 * with row/byte counts and throughput for both data and log portions.
 * Elapsed time is measured from the member 'startTime' — presumably set
 * when the backup request was issued; confirm against the sender code.
 */
void
Backup::execBACKUP_COMPLETE_REP(Signal* signal)
{
  jamEntry();
  BackupCompleteRep* rep = (BackupCompleteRep*)signal->getDataPtr();

  const NDB_TICKS now = NdbTick_getCurrentTicks();
  const Uint64 elapsed = NdbTick_Elapsed(startTime,now).milliSec();

  ndbout_c("Backup %u has completed", rep->backupId);
  /* Counts are transported as 32-bit low/high halves; reassemble. */
  const Uint64 bytes =
    rep->noOfBytesLow + (((Uint64)rep->noOfBytesHigh) << 32);
  const Uint64 records =
    rep->noOfRecordsLow + (((Uint64)rep->noOfRecordsHigh) << 32);

  /* xps() converts count-over-elapsed-ms to an approximate rate/s. */
  Number rps = xps(records, elapsed);
  Number bps = xps(bytes, elapsed);

  ndbout << " Data [ "
         << Number(records) << " rows "
         << Number(bytes) << " bytes " << elapsed << " ms ] "
         << " => "
         << rps << " row/s & " << bps << "b/s" << endl;

  bps = xps(rep->noOfLogBytes, elapsed);
  rps = xps(rep->noOfLogRecords, elapsed);

  ndbout << " Log [ "
         << Number(rep->noOfLogRecords) << " log records "
         << Number(rep->noOfLogBytes) << " bytes " << elapsed << " ms ] "
         << " => "
         << rps << " records/s & " << bps << "b/s" << endl;

}
3720
3721 void
execBACKUP_ABORT_REP(Signal * signal)3722 Backup::execBACKUP_ABORT_REP(Signal* signal)
3723 {
3724 jamEntry();
3725 BackupAbortRep* rep = (BackupAbortRep*)signal->getDataPtr();
3726
3727 ndbout_c("Backup %u has been aborted %d", rep->backupId, rep->reason);
3728 }
3729
/**
 * Trigger event types handled by backup, one per trigger slot.
 * Presumably the order (insert=0, update=1, delete=2) matches the
 * indexing of Table::triggerIds[]/triggerAllocated[] — confirm against
 * the trigger setup code.
 */
const TriggerEvent::Value triggerEventValues[] = {
  TriggerEvent::TE_INSERT,
  TriggerEvent::TE_UPDATE,
  TriggerEvent::TE_DELETE
};
3735
/**
 * Table of legal slave state transitions, stored as (from, to) pairs.
 * Consumed pairwise by CompoundState::setState() to validate state
 * changes; validSlaveTransitionsCount is the number of States (twice
 * the number of pairs).
 */
const Backup::State
Backup::validSlaveTransitions[] = {
  INITIAL,  DEFINING,
  DEFINING, DEFINED,
  DEFINED,  STARTED,
  STARTED,  STARTED, // Several START_BACKUP_REQ is sent
  STARTED,  SCANNING,
  SCANNING, STARTED,
  STARTED,  STOPPING,
  STOPPING, CLEANING,
  CLEANING, INITIAL,

  INITIAL,  ABORTING, // Node fail
  DEFINING, ABORTING,
  DEFINED,  ABORTING,
  STARTED,  ABORTING,
  SCANNING, ABORTING,
  STOPPING, ABORTING,
  CLEANING, ABORTING, // Node fail w/ master takeover
  ABORTING, ABORTING, // Slave who initiates ABORT should have this transition

  ABORTING, INITIAL,
  INITIAL,  INITIAL
};

const Uint32
Backup::validSlaveTransitionsCount =
sizeof(Backup::validSlaveTransitions) / sizeof(Backup::State);
3764
/**
 * Transition to 'newState', consulting the validTransitions table.
 * Note that the validity check result is currently NOT enforced (the
 * ndbrequire below is commented out), so an illegal transition is
 * silently accepted. On entering ABORTING, the pre-abort state is
 * remembered in abortState; reaching INITIAL resets abortState.
 */
void
Backup::CompoundState::setState(State newState){
  bool found = false;
  const State currState = state;
  /* Transition table is laid out as consecutive (from, to) pairs. */
  for(unsigned i = 0; i<noOfValidTransitions; i+= 2) {
    jam();
    if(validTransitions[i]   == currState &&
       validTransitions[i+1] == newState){
      jam();
      found = true;
      break;
    }
  }

  //ndbrequire(found);

  if (newState == INITIAL)
    abortState = INITIAL;
  if(newState == ABORTING && currState != ABORTING) {
    jam();
    /* Remember where we were when the abort started. */
    abortState = currState;
  }
  state = newState;
#ifdef DEBUG_ABORT
  if (newState != currState) {
    ndbout_c("%u: Old state = %u, new state = %u, abort state = %u",
             id, currState, newState, abortState);
  }
#endif
}
3795
3796 void
forceState(State newState)3797 Backup::CompoundState::forceState(State newState)
3798 {
3799 const State currState = state;
3800 if (newState == INITIAL)
3801 abortState = INITIAL;
3802 if(newState == ABORTING && currState != ABORTING) {
3803 jam();
3804 abortState = currState;
3805 }
3806 state = newState;
3807 #ifdef DEBUG_ABORT
3808 if (newState != currState) {
3809 ndbout_c("%u: FORCE: Old state = %u, new state = %u, abort state = %u",
3810 id, currState, newState, abortState);
3811 }
3812 #endif
3813 }
3814
Table(Fragment_pool & fh)3815 Backup::Table::Table(Fragment_pool & fh)
3816 : fragments(fh)
3817 {
3818 triggerIds[0] = ILLEGAL_TRIGGER_ID;
3819 triggerIds[1] = ILLEGAL_TRIGGER_ID;
3820 triggerIds[2] = ILLEGAL_TRIGGER_ID;
3821 triggerAllocated[0] = false;
3822 triggerAllocated[1] = false;
3823 triggerAllocated[2] = false;
3824 }
3825
3826 /*****************************************************************************
3827 *
3828 * Node state handling
3829 *
3830 *****************************************************************************/
/**
 * Handle NODE_FAILREP: record which nodes died, note the (possibly new)
 * master, let any ongoing backup react, and run block-level cleanup for
 * each failed node.
 */
void
Backup::execNODE_FAILREP(Signal* signal)
{
  jamEntry();

  NodeFailRep * rep = (NodeFailRep*)signal->getDataPtr();

  /* Long-signal form: the failed-node bitmask arrives in section 0.
   * Only senders whose version supports bitmask-in-section use it. */
  if(signal->getLength() == NodeFailRep::SignalLength)
  {
    ndbrequire(signal->getNoOfSections() == 1);
    ndbrequire(ndbd_send_node_bitmask_in_section(
        getNodeInfo(refToNode(signal->getSendersBlockRef())).m_version));
    SegmentedSectionPtr ptr;
    SectionHandle handle(this, signal);
    handle.getSection(ptr, 0);
    memset(rep->theNodes, 0, sizeof(rep->theNodes));
    copy(rep->theNodes, ptr);
    releaseSections(handle);
  }
  else
  {
    /* Short-signal form: only the first 48-node bitmask words are valid;
     * clear the remainder so the full-width bitmask is well defined. */
    memset(rep->theNodes + NdbNodeBitmask48::Size,
           0,
           _NDB_NBM_DIFF_BYTES);
  }
  bool doStuff = false;
  /*
  Start by saving important signal data which will be destroyed before the
  process is completed.
  */
  NodeId new_master_node_id = rep->masterNodeId;
  Uint32 theFailedNodes[NdbNodeBitmask::Size];
  for (Uint32 i = 0; i < NdbNodeBitmask::Size; i++)
    theFailedNodes[i] = rep->theNodes[i];

  c_masterNodeId = new_master_node_id;

  /* Mark every failed node as dead in the local node list. */
  NodePtr nodePtr;
  for(c_nodes.first(nodePtr); nodePtr.i != RNIL; c_nodes.next(nodePtr)) {
    jam();
    if(NdbNodeBitmask::get(theFailedNodes, nodePtr.p->nodeId)){
      if(nodePtr.p->alive){
        jam();
        ndbrequire(c_aliveNodes.get(nodePtr.p->nodeId));
        doStuff = true;  // at least one alive -> dead transition observed
      } else {
        jam();
        ndbrequire(!c_aliveNodes.get(nodePtr.p->nodeId));
      }//if
      nodePtr.p->alive = 0;
      c_aliveNodes.clear(nodePtr.p->nodeId);
    }//if
  }//for

  if(!doStuff){
    jam();
    return;  // no newly failed node: nothing further to do
  }//if

#ifdef DEBUG_ABORT
  ndbout_c("****************** Node fail rep ******************");
#endif

  /* Let an ongoing backup (if any) handle the failures, including a
   * possible master takeover. */
  NodeId newCoordinator = c_masterNodeId;
  BackupRecordPtr ptr;
  if (get_backup_record(ptr))
  {
    jam();
    checkNodeFail(signal, ptr, newCoordinator, theFailedNodes);
  }

  /* Block level cleanup */
  for(unsigned i = 1; i < MAX_NDB_NODES; i++) {
    jam();
    if(NdbNodeBitmask::get(theFailedNodes, i))
    {
      jam();
      Uint32 elementsCleaned = simBlockNodeFailure(signal, i); // No callback
      ndbassert(elementsCleaned == 0); // Backup should have no distributed frag signals
      (void) elementsCleaned; // Remove compiler warning
    }//if
  }//for
}
3914
3915 bool
verifyNodesAlive(BackupRecordPtr ptr,const NdbNodeBitmask & aNodeBitMask)3916 Backup::verifyNodesAlive(BackupRecordPtr ptr,
3917 const NdbNodeBitmask& aNodeBitMask)
3918 {
3919 Uint32 version = getNodeInfo(getOwnNodeId()).m_version;
3920 for (Uint32 i = 0; i < MAX_NDB_NODES; i++) {
3921 jam();
3922 if(aNodeBitMask.get(i)) {
3923 if(!c_aliveNodes.get(i)){
3924 jam();
3925 ptr.p->setErrorCode(AbortBackupOrd::BackupFailureDueToNodeFail);
3926 return false;
3927 }//if
3928 if(getNodeInfo(i).m_version != version)
3929 {
3930 jam();
3931 ptr.p->setErrorCode(AbortBackupOrd::IncompatibleVersions);
3932 return false;
3933 }
3934 }//if
3935 }//for
3936 return true;
3937 }
3938
/**
 * React to node failures on behalf of an ongoing backup.
 *
 * - Remove all failed nodes from the backup's participant set.
 * - If the backup master died: this instance takes over as a local
 *   master, restricts the participant set to itself, and drives the
 *   backup to abortion.
 * - If this node/instance is the backup master: fabricate REF signals to
 *   self on behalf of every failed participant, so the normal
 *   reply-counting logic completes and aborts the backup.
 */
void
Backup::checkNodeFail(Signal* signal,
                      BackupRecordPtr ptr,
                      NodeId newCoord,
                      Uint32 theFailedNodes[NdbNodeBitmask::Size])
{
  NdbNodeBitmask mask;
  mask.assign(NdbNodeBitmask::Size, theFailedNodes);

  /* Update ptr.p->nodes to be up to date with current alive nodes
   */
  NodePtr nodePtr;
  bool found = false;
  for(c_nodes.first(nodePtr); nodePtr.i != RNIL; c_nodes.next(nodePtr)) {
    jam();
    if(NdbNodeBitmask::get(theFailedNodes, nodePtr.p->nodeId)) {
      jam();
      if (ptr.p->nodes.get(nodePtr.p->nodeId)) {
        jam();
        ptr.p->nodes.clear(nodePtr.p->nodeId);
        found = true;  // a backup participant has failed
      }
    }//if
  }//for

  if(!found) {
    jam();
    return; // failed node is not part of backup process, safe to continue
  }

  if(mask.get(refToNode(ptr.p->masterRef)))
  {
    /**
     * Master died...abort
     */
    ptr.p->masterRef = reference();
    ptr.p->senderRef = reference();
    // Each ldm on each node becomes master and sends signals only to self
    ptr.p->nodes.clear();
    ptr.p->nodes.set(getOwnNodeId());
    ptr.p->fragWorkers[getOwnNodeId()].clear();
    ptr.p->fragWorkers[getOwnNodeId()].set(instance());
    ptr.p->setErrorCode(AbortBackupOrd::BackupFailureDueToNodeFail);
    /* React according to the slave-side signal currently being processed. */
    switch(ptr.p->m_gsn){
    case GSN_DEFINE_BACKUP_REQ:
    case GSN_START_BACKUP_REQ:
    case GSN_BACKUP_FRAGMENT_REQ:
    case GSN_STOP_BACKUP_REQ:
      // I'm currently processing...reply to self and abort...
      ptr.p->masterData.gsn = ptr.p->m_gsn;
      ptr.p->masterData.sendCounter = ptr.p->nodes;
      return;
    case GSN_DEFINE_BACKUP_REF:
    case GSN_DEFINE_BACKUP_CONF:
    case GSN_START_BACKUP_REF:
    case GSN_START_BACKUP_CONF:
    case GSN_BACKUP_FRAGMENT_REF:
    case GSN_BACKUP_FRAGMENT_CONF:
    case GSN_STOP_BACKUP_REF:
    case GSN_STOP_BACKUP_CONF:
      ptr.p->masterData.gsn = GSN_DEFINE_BACKUP_REQ;
      masterAbort(signal, ptr);
      return;
    case GSN_ABORT_BACKUP_ORD:
      // Already aborting
      return;
    }
  }
  else if (newCoord == getOwnNodeId() &&
           instance() == masterInstanceKey(ptr))
  {
    /**
     * I'm master for this backup: LDM1 on master node
     */
    jam();
    CRASH_INSERTION((10001));
#ifdef DEBUG_ABORT
    ndbout_c("**** Master: Node failed: Master id = %u",
             refToNode(ptr.p->masterRef));
#endif

    /* Build the REF matching the control signal currently outstanding;
     * pos is the word index of the nodeId field within the signal. */
    Uint32 gsn, len, pos;
    ptr.p->nodes.bitANDC(mask);
    switch(ptr.p->masterData.gsn){
    case GSN_DEFINE_BACKUP_REQ:
    {
      DefineBackupRef * ref = (DefineBackupRef*)signal->getDataPtrSend();
      ref->backupPtr = ptr.i;
      ref->backupId = ptr.p->backupId;
      ref->errorCode = AbortBackupOrd::BackupFailureDueToNodeFail;
      gsn= GSN_DEFINE_BACKUP_REF;
      len= DefineBackupRef::SignalLength;
      pos= Uint32(&ref->nodeId - signal->getDataPtrSend());
      break;
    }
    case GSN_START_BACKUP_REQ:
    {
      StartBackupRef * ref = (StartBackupRef*)signal->getDataPtrSend();
      ref->backupPtr = ptr.i;
      ref->backupId = ptr.p->backupId;
      ref->errorCode = AbortBackupOrd::BackupFailureDueToNodeFail;
      gsn= GSN_START_BACKUP_REF;
      len= StartBackupRef::SignalLength;
      pos= Uint32(&ref->nodeId - signal->getDataPtrSend());
      break;
    }
    case GSN_BACKUP_FRAGMENT_REQ:
    {
      BackupFragmentRef * ref = (BackupFragmentRef*)signal->getDataPtrSend();
      ref->backupPtr = ptr.i;
      ref->backupId = ptr.p->backupId;
      ref->errorCode = AbortBackupOrd::BackupFailureDueToNodeFail;
      gsn= GSN_BACKUP_FRAGMENT_REF;
      len= BackupFragmentRef::SignalLength;
      pos= Uint32(&ref->nodeId - signal->getDataPtrSend());
      break;
    }
    case GSN_STOP_BACKUP_REQ:
    {
      StopBackupRef * ref = (StopBackupRef*)signal->getDataPtrSend();
      ref->backupPtr = ptr.i;
      ref->backupId = ptr.p->backupId;
      ref->errorCode = AbortBackupOrd::BackupFailureDueToNodeFail;
      ref->nodeId = getOwnNodeId();
      gsn= GSN_STOP_BACKUP_REF;
      len= StopBackupRef::SignalLength;
      pos= Uint32(&ref->nodeId - signal->getDataPtrSend());
      break;
    }
    case GSN_WAIT_GCP_REQ:
    case GSN_DROP_TRIG_IMPL_REQ:
    case GSN_CREATE_TRIG_IMPL_REQ:
    case GSN_ALTER_TRIG_IMPL_REQ:
      /* Waiting on a non-participant service: just record the error. */
      ptr.p->setErrorCode(AbortBackupOrd::BackupFailureDueToNodeFail);
      return;
    case GSN_UTIL_SEQUENCE_REQ:
    case GSN_UTIL_LOCK_REQ:
      return;
    default:
      ndbabort();
    }

    /* Send one REF to self per failed node (per failed LDM for
     * BACKUP_FRAGMENT), patching the nodeId word each time. */
    for(Uint32 i = 0; (i = mask.find(i+1)) != NdbNodeBitmask::NotFound; )
    {
      signal->theData[pos] = i;
      if (gsn == GSN_BACKUP_FRAGMENT_REF)
      {
        // Handle mt-backup case where all LDMs process BACKUP_FRAGMENT_REQs
        // simultaneously. If any node fails, master sends REFs to self on
        // behalf of every failed node. Extend handling for BACKUP_FRAGMENT_REQ
        // so that master sends BACKUP_FRAGMENT_REFs to self from every LDM
        // on every failed node.
        Uint32 workers = getNodeInfo(i).m_lqh_workers;
        for (Uint32 j=0; j<workers; j++)
        {
          sendSignal(reference(), gsn, signal, len, JBB);
        }
      }
      else
      {
        // master sends REQs only to one instance (BackupProxy) on each node
        // send only one reply to self per node on behalf of BackupProxy
        sendSignal(reference(), gsn, signal, len, JBB);
#ifdef DEBUG_ABORT
        ndbout_c("sending %d to self from %d", gsn, i);
#endif
      }
    }
    return;
  }//if

  /**
   * I abort myself as slave if not master
   */
  CRASH_INSERTION((10021));
}
4115
4116 void
execINCL_NODEREQ(Signal * signal)4117 Backup::execINCL_NODEREQ(Signal* signal)
4118 {
4119 jamEntry();
4120
4121 const Uint32 senderRef = signal->theData[0];
4122 const Uint32 inclNode = signal->theData[1];
4123
4124 NodePtr node;
4125 for(c_nodes.first(node); node.i != RNIL; c_nodes.next(node)) {
4126 jam();
4127 const Uint32 nodeId = node.p->nodeId;
4128 if(inclNode == nodeId){
4129 jam();
4130
4131 ndbrequire(node.p->alive == 0);
4132 ndbrequire(!c_aliveNodes.get(nodeId));
4133
4134 node.p->alive = 1;
4135 c_aliveNodes.set(nodeId);
4136
4137 break;
4138 }//if
4139 }//for
4140 signal->theData[0] = inclNode;
4141 signal->theData[1] = reference();
4142 sendSignal(senderRef, GSN_INCL_NODECONF, signal, 2, JBB);
4143 }
4144
4145 /*****************************************************************************
4146 *
 * Master functionality - Define backup
4148 *
4149 * Backup master = BACKUP instance 1 (LDM1) on master node.
4150 * Backup master receives BACKUP_REQ and sends control signals to all slaves
4151 * for mt-backup, slaves = all BACKUP instances(all LDMs) on all nodes
4152 * for st-backup, slaves = BACKUP 1(LDM1) on all nodes
4153 *
4154 * File thread: A file-thread signal train of FSAPPENDREQ/FSAPPENDCONF is
4155 * started for each backup file, i.e. one train each for the ctl, data and log
4156 * file. The file-thread signal trains interleave with BACKUP-related signals
4157 * on each slave thread. The BACKUP-related signals write data to dataBuffers
4158 * as needed, using sendSignalWithDelay loops to wait in case a dataBuffer is
4159 * not accepting writes. Each file-thread signal picks up data from its
4160 * dataBuffer and writes it to the file.
4161 *
4162 * Control signals
4163 * 1) DEFINE_BACKUP_REQ
4164 * - seize BackupRecord, alloc and init file ptrs
4165 * - send LIST_TABLES_REQ to DICT to get table info to create tablemap
4166 * - send FSOPENREQ to open ctl, data and logfiles
4167 * - write file headers for ctl, data and logfiles
4168 * - start ctl file thread and write table list to ctl file
4169 * - get table info for each table, save in thread-local list
4170 * - lock tables
4171 * - get frag counts for each table + frag info for each frag on each table,
4172 save in thread-local list
4173 * - reply to sender with DEFINE_BACKUP_CONF
4174 *
4175 * 2) START_BACKUP_REQ
4176 * - start file threads for data and log files
4177 * - tell DBTUP to create triggers for logfile writes
4178 * - reply to sender with START_BACKUP_CONF
4179 *
4180 * 3) BACKUP_FRAGMENT_REQ
4181 * - send SCAN_FRAGREQ to LQH to start scan
4182 * - on receiving SCAN_FRAGCONF, reply to master with BACKUP_FRAGMENT_CONF
4183 *
4184 * 4) STOP_BACKUP_REQ
4185 * - drop all triggers in TUP
4186 * - insert footers in ctl and log files
4187 * - unlock tables
4188 * - close all files
4189 * - reply to sender with STOP_BACKUP_CONF
4190 *
4191 * 5) ABORT_BACKUP_ORD
4192 * - unlock tables
4193 * - release file pages, file ptrs, thread-local lists of frag info, table data
4194 * - release BackupRecord
4195 *
4196 *****************************************************************************/
4197
/**
 * Entry point for a client's BACKUP_REQ (master node only).
 *
 * Validates the request, seizes a backup record, computes the set of
 * participating nodes and fragment workers, decides single- vs
 * multi-threaded backup, and finally requests a backup id from the
 * DBUTIL sequence service (continues in execUTIL_SEQUENCE_CONF/REF).
 */
void
Backup::execBACKUP_REQ(Signal* signal)
{
  jamEntry();
  BackupReq * req = (BackupReq*)signal->getDataPtr();

  const Uint32 senderData = req->senderData;
  const BlockReference senderRef = signal->senderBlockRef();
  const Uint32 dataLen32 = req->backupDataLen; // In 32 bit words
  // Older/short requests carry no flags word; default to 2 (assumed to be
  // the wait-completed behaviour -- TODO confirm against BackupReq flags).
  const Uint32 flags = signal->getLength() > 2 ? req->flags : 2;
  const Uint32 input_backupId = signal->getLength() > 3 ? req->inputBackupId : 0;

  /* Only the master node may coordinate a backup. */
  if (getOwnNodeId() != getMasterNodeId())
  {
    jam();
    sendBackupRef(senderRef, flags, signal, senderData,
                  BackupRef::IAmNotMaster);
    return;
  }//if

  if (c_defaults.m_diskless)
  {
    jam();
    sendBackupRef(senderRef, flags, signal, senderData,
                  BackupRef::CannotBackupDiskless);
    return;
  }

  /* Client-supplied backup definitions are not supported. */
  if (dataLen32 != 0)
  {
    jam();
    sendBackupRef(senderRef, flags, signal, senderData,
                  BackupRef::BackupDefinitionNotImplemented);
    return;
  }//if

#ifdef DEBUG_ABORT
  dumpUsedResources();
#endif
  /**
   * Seize a backup record
   */
  BackupRecordPtr ptr;
  c_backups.seizeFirst(ptr);
  if (ptr.i == RNIL)
  {
    jam();
    sendBackupRef(senderRef, flags, signal, senderData,
                  BackupRef::OutOfBackupRecord);
    return;
  }//if

  ndbrequire(ptr.p->tables.isEmpty());

  /* Initialise the record; this block is the master for this backup. */
  ptr.p->m_gsn = 0;
  ptr.p->errorCode = 0;
  ptr.p->clientRef = senderRef;
  ptr.p->clientData = senderData;
  ptr.p->flags = flags;
  ptr.p->masterRef = reference();
  ptr.p->nodes = c_aliveNodes;

  /* Per-node setup: fragment workers, worker count, version check and
   * single- vs multi-threaded decision. */
  Uint32 node = ptr.p->nodes.find_first();
  Uint32 version = getNodeInfo(getOwnNodeId()).m_version;
  ptr.p->idleFragWorkerCount = 0;
  while(node != NdbNodeBitmask::NotFound)
  {
    const NodeInfo nodeInfo = getNodeInfo(node);
    // setup fragWorkers[] for master to control BACKUP_FRAGMENT_REQs
    ptr.p->fragWorkers[node].clear();
    Uint32 ldmCount = nodeInfo.m_lqh_workers;
    ldmCount += (nodeInfo.m_lqh_workers == 0); // set LDM1 as worker for ndbd

    for(Uint32 i=0; i<=ldmCount; i++)
      ptr.p->fragWorkers[node].set(i);

    ptr.p->idleFragWorkerCount += ldmCount;

    // Only support multithreaded backup if all nodes have multiple LDMs
    if (ldmCount <= 1 && (m_cfg_mt_backup > 0))
    {
      /* The MT_BACKUP flag is set to false in these
       * cases:
       * - ndbds
       * - ndbmtds with only one LDM worker
       */
      m_cfg_mt_backup = 0;
      g_eventLogger->info("Running single-threaded backup since node %u has only one LDM", node);
    }
    if (getNodeInfo(node).m_version != version)
    {
      jam();
      g_eventLogger->info("Detected incompatible versions, aborting backup");
      ptr.p->setErrorCode(AbortBackupOrd::IncompatibleVersions);
      sendBackupRef(senderRef, flags, signal, senderData,
                    BackupRef::BackupDuringUpgradeUnsupported);
      // clean up backup state
      ptr.p->m_gsn = 0;
      ptr.p->masterData.gsn = 0;
      c_backups.release(ptr);
      return;
    }

    node = ptr.p->nodes.find_next(node+1);
  }

  if(m_cfg_mt_backup)
  {
    /* Exec backup using all LDMs. To perform a backup, a BACKUP
     * block must receive all these signals from master:
     * 1) DEFINE_BACKUP_REQ, START_BACKUP_REQ, STOP_BACKUP_REQ to set
     * up and clean up filesets, file-write signal 'threads', triggers,
     * table locks, and to fetch metadata and write CTL and LOG files
     * 2) BACKUP_FRAGMENT_REQs to write fragments to data file, master must
     * assign frags to LDMs by sending BACKUP_FRAGMENT_REQs
     * 3) ABORT_BACKUP_ORD for failure-handling and cleanup
     * If all these signals are received by an LDM, that LDM will independently
     * execute a backup and write a restorable backup fileset.
     *
     * With MT_BACKUP enabled, all these signals will be sent to all
     * LDMs on each node.
     *
     * With MT_BACKUP disabled, the node performs a single-threaded backup.
     * In a single-threaded backup, all these signals are sent to LDM1 on each
     * node. The remaining BACKUP instances do not participate in the backup.
     */

    ptr.p->flags |= BackupReq::MT_BACKUP;
  }

  /* A non-zero input id (e.g. restored backup id) is reused; otherwise a
   * fresh id is fetched from the sequence service below. */
  if (input_backupId)
  {
    jam();
    ptr.p->backupId = input_backupId;
  }
  else
  {
    jam();
    ptr.p->backupId = 0;
  }
  ptr.p->backupKey[0] = 0;
  ptr.p->backupKey[1] = 0;
  ptr.p->backupDataLen = 0;
  ptr.p->masterData.errorCode = 0;

  /* Fetch/set the backup id via DBUTIL; temporary errors are retried. */
  ptr.p->masterData.sequence.retriesLeft = 3;
  sendUtilSequenceReq(signal, ptr);
}
4346
4347 void
sendUtilSequenceReq(Signal * signal,BackupRecordPtr ptr,Uint32 delay)4348 Backup::sendUtilSequenceReq(Signal* signal, BackupRecordPtr ptr, Uint32 delay)
4349 {
4350 jam();
4351
4352 UtilSequenceReq * utilReq = (UtilSequenceReq*)signal->getDataPtrSend();
4353 ptr.p->masterData.gsn = GSN_UTIL_SEQUENCE_REQ;
4354 utilReq->senderData = ptr.i;
4355 utilReq->sequenceId = NDB_BACKUP_SEQUENCE;
4356
4357 if (ptr.p->backupId)
4358 {
4359 jam();
4360 utilReq->requestType = UtilSequenceReq::SetVal;
4361 utilReq->value = ptr.p->backupId;
4362 }
4363 else
4364 {
4365 jam();
4366 utilReq->requestType = UtilSequenceReq::NextVal;
4367 }
4368
4369 if (delay == 0)
4370 {
4371 jam();
4372 sendSignal(DBUTIL_REF, GSN_UTIL_SEQUENCE_REQ,
4373 signal, UtilSequenceReq::SignalLength, JBB);
4374 }
4375 else
4376 {
4377 jam();
4378 sendSignalWithDelay(DBUTIL_REF, GSN_UTIL_SEQUENCE_REQ,
4379 signal, delay, UtilSequenceReq::SignalLength);
4380 }
4381 }
4382
4383 void
execUTIL_SEQUENCE_REF(Signal * signal)4384 Backup::execUTIL_SEQUENCE_REF(Signal* signal)
4385 {
4386 jamEntry();
4387 BackupRecordPtr ptr;
4388 UtilSequenceRef * utilRef = (UtilSequenceRef*)signal->getDataPtr();
4389 ptr.i = utilRef->senderData;
4390 c_backupPool.getPtr(ptr);
4391 ndbrequire(ptr.p->masterData.gsn == GSN_UTIL_SEQUENCE_REQ);
4392
4393 if (utilRef->errorCode == UtilSequenceRef::TCError)
4394 {
4395 jam();
4396 if (ptr.p->masterData.sequence.retriesLeft > 0)
4397 {
4398 jam();
4399 infoEvent("BACKUP: retrying sequence on error %u",
4400 utilRef->TCErrorCode);
4401 ptr.p->masterData.sequence.retriesLeft--;
4402 sendUtilSequenceReq(signal, ptr, 300);
4403 return;
4404 }
4405 }
4406 warningEvent("BACKUP: aborting due to sequence error (%u, %u)",
4407 utilRef->errorCode,
4408 utilRef->TCErrorCode);
4409
4410 sendBackupRef(signal, ptr, BackupRef::SequenceFailure);
4411 }//execUTIL_SEQUENCE_REF()
4412
4413 void
sendBackupRef(Signal * signal,BackupRecordPtr ptr,Uint32 errorCode)4414 Backup::sendBackupRef(Signal* signal, BackupRecordPtr ptr, Uint32 errorCode)
4415 {
4416 jam();
4417 sendBackupRef(ptr.p->clientRef, ptr.p->flags, signal,
4418 ptr.p->clientData, errorCode);
4419 cleanup(signal, ptr);
4420 }
4421
4422 void
sendBackupRef(BlockReference senderRef,Uint32 flags,Signal * signal,Uint32 senderData,Uint32 errorCode)4423 Backup::sendBackupRef(BlockReference senderRef, Uint32 flags, Signal *signal,
4424 Uint32 senderData, Uint32 errorCode)
4425 {
4426 jam();
4427 if (SEND_BACKUP_STARTED_FLAG(flags))
4428 {
4429 jam();
4430 BackupRef* ref = (BackupRef*)signal->getDataPtrSend();
4431 ref->senderData = senderData;
4432 ref->errorCode = errorCode;
4433 ref->masterRef = numberToRef(BACKUP, getMasterNodeId());
4434 sendSignal(senderRef, GSN_BACKUP_REF, signal, BackupRef::SignalLength, JBB);
4435 }
4436
4437 if (errorCode != BackupRef::IAmNotMaster)
4438 {
4439 jam();
4440 signal->theData[0] = NDB_LE_BackupFailedToStart;
4441 signal->theData[1] = senderRef;
4442 signal->theData[2] = errorCode;
4443 sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3, JBB);
4444 }
4445 }
4446
/**
 * A backup id has been obtained (or installed) via DBUTIL.  Store the id
 * and key, then begin taking the two define-phase mutexes; continues in
 * defineBackupMutex_locked().
 */
void
Backup::execUTIL_SEQUENCE_CONF(Signal* signal)
{
  jamEntry();

  UtilSequenceConf * conf = (UtilSequenceConf*)signal->getDataPtr();

  /* The Create confirm only occurs during block startup. */
  if(conf->requestType == UtilSequenceReq::Create)
  {
    jam();
    sendSTTORRY(signal); // At startup in NDB
    return;
  }

  BackupRecordPtr ptr;
  ptr.i = conf->senderData;
  c_backupPool.getPtr(ptr);

  ndbrequire(ptr.p->masterData.gsn == GSN_UTIL_SEQUENCE_REQ);

  if (ptr.p->checkError())
  {
    jam();
    sendBackupRef(signal, ptr, ptr.p->errorCode);
    return;
  }//if

  if (ERROR_INSERTED(10023))
  {
    sendBackupRef(signal, ptr, 323);
    return;
  }//if


  /* Adopt the fetched sequence value unless the caller supplied the id. */
  if(!ptr.p->backupId && conf->requestType != UtilSequenceReq::SetVal)
  {
    Uint64 backupId;
    memcpy(&backupId,conf->sequenceValue,8);
    ptr.p->backupId= (Uint32)backupId;
  }

  /* Key identifies this backup instance: (own node, id) plus timestamp. */
  ptr.p->backupKey[0] = (getOwnNodeId() << 16) | (ptr.p->backupId & 0xFFFF);
  ptr.p->backupKey[1] = Uint32(NdbTick_CurrentMillisecond());

  /* Take the first define-phase mutex; callback resumes the backup. */
  ptr.p->masterData.gsn = GSN_UTIL_LOCK_REQ;
  Mutex mutex(signal, c_mutexMgr, ptr.p->masterData.m_defineBackupMutex);
  Callback c = { safe_cast(&Backup::defineBackupMutex_locked), ptr.i };
  ndbrequire(mutex.lock(c));

  return;
}
4498
4499 void
defineBackupMutex_locked(Signal * signal,Uint32 ptrI,Uint32 retVal)4500 Backup::defineBackupMutex_locked(Signal* signal, Uint32 ptrI, Uint32 retVal){
4501 jamEntry();
4502 ndbrequire(retVal == 0);
4503
4504 BackupRecordPtr ptr;
4505 ptr.i = ptrI;
4506 c_backupPool.getPtr(ptr);
4507
4508 ndbrequire(ptr.p->masterData.gsn == GSN_UTIL_LOCK_REQ);
4509
4510 ptr.p->masterData.gsn = GSN_UTIL_LOCK_REQ;
4511 Mutex mutex(signal, c_mutexMgr, ptr.p->masterData.m_dictCommitTableMutex);
4512 Callback c = { safe_cast(&Backup::dictCommitTableMutex_locked), ptr.i };
4513 ndbrequire(mutex.lock(c));
4514 }
4515
4516 void
dictCommitTableMutex_locked(Signal * signal,Uint32 ptrI,Uint32 retVal)4517 Backup::dictCommitTableMutex_locked(Signal* signal, Uint32 ptrI,Uint32 retVal)
4518 {
4519 jamEntry();
4520 ndbrequire(retVal == 0);
4521
4522 /**
4523 * We now have both the mutexes
4524 */
4525 BackupRecordPtr ptr;
4526 ptr.i = ptrI;
4527 c_backupPool.getPtr(ptr);
4528
4529 ndbrequire(ptr.p->masterData.gsn == GSN_UTIL_LOCK_REQ);
4530
4531 if (ERROR_INSERTED(10031)) {
4532 ptr.p->setErrorCode(331);
4533 }//if
4534
4535 if (ptr.p->checkError())
4536 {
4537 jam();
4538
4539 /**
4540 * Unlock mutexes
4541 */
4542 jam();
4543 Mutex mutex1(signal, c_mutexMgr, ptr.p->masterData.m_dictCommitTableMutex);
4544 jam();
4545 mutex1.unlock(); // ignore response
4546
4547 jam();
4548 Mutex mutex2(signal, c_mutexMgr, ptr.p->masterData.m_defineBackupMutex);
4549 jam();
4550 mutex2.unlock(); // ignore response
4551
4552 sendBackupRef(signal, ptr, ptr.p->errorCode);
4553 return;
4554 }//if
4555
4556 sendDefineBackupReq(signal, ptr);
4557 }
4558
4559 /*****************************************************************************
4560 *
 * Master functionality - Define backup cont'd (from now on all slaves are in)
4562 *
4563 *****************************************************************************/
4564
4565 bool
haveAllSignals(BackupRecordPtr ptr,Uint32 gsn,Uint32 nodeId)4566 Backup::haveAllSignals(BackupRecordPtr ptr, Uint32 gsn, Uint32 nodeId)
4567 {
4568 ndbrequire(ptr.p->masterRef == reference());
4569 ndbrequire(ptr.p->masterData.gsn == gsn);
4570 ndbrequire(!ptr.p->masterData.sendCounter.done());
4571 if (ptr.p->masterData.sendCounter.isWaitingFor(nodeId))
4572 {
4573 ptr.p->masterData.sendCounter.clearWaitingFor(nodeId);
4574 }
4575 else
4576 {
4577 ndbrequire(ptr.p->errorCode == AbortBackupOrd::BackupFailureDueToNodeFail);
4578 if (ERROR_INSERTED(10051) || ERROR_INSERTED(10052) ||
4579 ERROR_INSERTED(10053))
4580 {
4581 ndbout_c("Received duplicate signal from non-master node %u for gsn %u",
4582 nodeId, gsn);
4583 CLEAR_ERROR_INSERT_VALUE;
4584 }
4585 }
4586 return ptr.p->masterData.sendCounter.done();
4587 }
4588
void
Backup::sendDefineBackupReq(Signal *signal, BackupRecordPtr ptr)
{
  /**
   * Sending define backup to all participants
   */
  DefineBackupReq * req = (DefineBackupReq*)signal->getDataPtrSend();
  req->backupId = ptr.p->backupId;
  req->clientRef = ptr.p->clientRef;
  req->clientData = ptr.p->clientData;
  req->senderRef = reference();
  req->masterRef = reference();
  req->backupPtr = ptr.i;
  req->backupKey[0] = ptr.p->backupKey[0];
  req->backupKey[1] = ptr.p->backupKey[1];
  req->backupDataLen = ptr.p->backupDataLen;
  req->flags = ptr.p->flags;

  /**
   * If backup is multithreaded, DEFINE_BACKUP_REQ sent to BackupProxy on
   * all nodes. BackupProxy fwds REQ to all LDMs, collects CONF/REFs
   * and replies to master. N backup filesets created per node, N=#ldms.
   *
   * If backup is not multithreaded, DEFINE_BACKUP_REQ sent only to LDM 1
   * on all nodes. Only 1 backup fileset created per node.
   *
   * instanceKey() selects instance to send signal to:
   * - for LCP, send to self
   * - for single-threaded backup: only one LDM thread, send to that thread
   * - for multithreaded backup, send to the BackupProxy LDM0, which then
   *   broadcasts the signal to all the LDMs on its node
   *
   * On receiving DEFINE_BACKUP_REQ, the BACKUP block creates a
   * backup fileset, queries DICT+DIH for table info, locks tables,
   * and writes table metadata into the CTL file in its fileset.
   */
  ptr.p->masterData.gsn = GSN_DEFINE_BACKUP_REQ;
  ptr.p->masterData.sendCounter = ptr.p->nodes;
  Uint32 recNode = 0;
  const Uint32 packed_length = ptr.p->nodes.getPackedLengthInWords();

  /* Send one REQ per participating node; the participant bitmask is
   * attached as long-signal section 0. */
  NdbNodeBitmask nodes = ptr.p->nodes;
  while ((recNode = nodes.find(recNode + 1)) != NdbNodeBitmask::NotFound)
  {
    const Uint32 ref = numberToRef(BACKUP, instanceKey(ptr), recNode);

    // Backup is not allowed for mixed versions data nodes
    ndbrequire(ndbd_send_node_bitmask_in_section(getNodeInfo(recNode).m_version));

    LinearSectionPtr lsptr[3];  // only entry 0 is used
    lsptr[0].p = nodes.rep.data;
    lsptr[0].sz = packed_length;
    sendSignal(ref, GSN_DEFINE_BACKUP_REQ, signal,
               DefineBackupReq::SignalLength_v1, JBB, lsptr, 1);
  }

  /**
   * Now send backup data
   */
  const Uint32 len = ptr.p->backupDataLen;
  if(len == 0){
    /**
     * No data to send
     */
    jam();
    return;
  }//if

  /**
   * Not implemented
   */
  ndbabort();
}
4662
4663 void
execDEFINE_BACKUP_REF(Signal * signal)4664 Backup::execDEFINE_BACKUP_REF(Signal* signal)
4665 {
4666 jamEntry();
4667
4668 DefineBackupRef* ref = (DefineBackupRef*)signal->getDataPtr();
4669
4670 const Uint32 ptrI = ref->backupPtr;
4671 //const Uint32 backupId = ref->backupId;
4672 const Uint32 nodeId = ref->nodeId;
4673
4674 BackupRecordPtr ptr;
4675 c_backupPool.getPtr(ptr, ptrI);
4676
4677 ptr.p->setErrorCode(ref->errorCode);
4678 defineBackupReply(signal, ptr, nodeId);
4679 }
4680
4681 void
execDEFINE_BACKUP_CONF(Signal * signal)4682 Backup::execDEFINE_BACKUP_CONF(Signal* signal)
4683 {
4684 jamEntry();
4685
4686 DefineBackupConf* conf = (DefineBackupConf*)signal->getDataPtr();
4687 const Uint32 ptrI = conf->backupPtr;
4688 //const Uint32 backupId = conf->backupId;
4689 const Uint32 nodeId = refToNode(signal->senderBlockRef());
4690
4691 BackupRecordPtr ptr;
4692 c_backupPool.getPtr(ptr, ptrI);
4693
4694 if (ERROR_INSERTED(10024))
4695 {
4696 ptr.p->setErrorCode(324);
4697 }
4698
4699 defineBackupReply(signal, ptr, nodeId);
4700 }
4701
4702 void
defineBackupReply(Signal * signal,BackupRecordPtr ptr,Uint32 nodeId)4703 Backup::defineBackupReply(Signal* signal, BackupRecordPtr ptr, Uint32 nodeId)
4704 {
4705 if (ERROR_INSERTED(10051))
4706 {
4707 if (nodeId == getOwnNodeId())
4708 {
4709 jam();
4710 ndbrequire(ptr.p->errorCode == 0)
4711 // Delay reply from self so that master waits for DEFINE_BACKUP_REFs
4712 sendSignalWithDelay(reference(), GSN_DEFINE_BACKUP_CONF, signal,
4713 5000, signal->getLength());
4714 return;
4715 }
4716 else
4717 {
4718 // Received DEFINE_BACKUP_REF/CONF from node n1, now crash n1. This will
4719 // trigger node-failure handling where master sends DEFINE_BACKUP_REF to
4720 // self on behalf of n1. So master receives 2 REFs from n1.
4721 signal->theData[0] = 9999;
4722 sendSignal(numberToRef(CMVMI, nodeId), GSN_NDB_TAMPER, signal, 1, JBB);
4723 }
4724 }
4725 if (!haveAllSignals(ptr, GSN_DEFINE_BACKUP_REQ, nodeId)) {
4726 jam();
4727 return;
4728 }
4729
4730 /**
4731 * Unlock mutexes
4732 */
4733 jam();
4734 Mutex mutex1(signal, c_mutexMgr, ptr.p->masterData.m_dictCommitTableMutex);
4735 jam();
4736 mutex1.unlock(); // ignore response
4737
4738 jam();
4739 Mutex mutex2(signal, c_mutexMgr, ptr.p->masterData.m_defineBackupMutex);
4740 jam();
4741 mutex2.unlock(); // ignore response
4742
4743 if(ptr.p->checkError())
4744 {
4745 jam();
4746 masterAbort(signal, ptr);
4747 return;
4748 }
4749
4750 CRASH_INSERTION((10034));
4751
4752 /**
4753 * We've received GSN_DEFINE_BACKUP_CONF from all participants.
4754 *
4755 * Our next step is to send START_BACKUP_REQ to all participants,
4756 * who will then send CREATE_TRIG_REQ for all tables to their local
4757 * DBTUP.
4758 */
4759 TablePtr tabPtr;
4760 ptr.p->tables.first(tabPtr);
4761
4762 sendStartBackup(signal, ptr, tabPtr);
4763 }
4764
4765 /*****************************************************************************
4766 *
 * Master functionality - Prepare triggers
4768 *
4769 *****************************************************************************/
4770 void
createAttributeMask(TablePtr tabPtr,Bitmask<MAXNROFATTRIBUTESINWORDS> & mask)4771 Backup::createAttributeMask(TablePtr tabPtr,
4772 Bitmask<MAXNROFATTRIBUTESINWORDS> & mask)
4773 {
4774 mask.clear();
4775 for (Uint32 i = 0; i<tabPtr.p->noOfAttributes; i++)
4776 mask.set(i);
4777 }
4778
/**
 * Seize three trigger records (one per trigger event) for the given table
 * and send CREATE_TRIG_IMPL_REQ to DBTUP for each of them.
 *
 * On allocation failure a START_BACKUP_REF with
 * FailedToAllocateTriggerRecord is returned to the sender (master).
 * Replies are counted in slaveData.trigSendCounter and collected by
 * createTrigReply().
 */
void
Backup::sendCreateTrig(Signal* signal,
                       BackupRecordPtr ptr, TablePtr tabPtr)
{
  CreateTrigImplReq* req = (CreateTrigImplReq*)signal->getDataPtr();

  /*
   * First, setup the structures
   */
  // All three triggers of this table log through the backup's single log
  // file, so reset that file's operation counters up front.
  OperationRecord* operation = &ptr.p->files.getPtr(ptr.p->logFilePtr)->operation;
  operation->noOfBytes = 0;
  operation->noOfRecords = 0;

  for(Uint32 j=0; j<3; j++) {
    jam();

    TriggerPtr trigPtr;
    if (!ptr.p->triggers.seizeFirst(trigPtr)) {
      jam();
      // Out of trigger records: report failure back so the master can
      // abort this backup.
      ptr.p->m_gsn = GSN_START_BACKUP_REF;
      StartBackupRef* ref = (StartBackupRef*)signal->getDataPtrSend();
      ref->backupPtr = ptr.i;
      ref->backupId = ptr.p->backupId;
      ref->errorCode = StartBackupRef::FailedToAllocateTriggerRecord;
      ref->nodeId = getOwnNodeId();
      sendSignal(ptr.p->senderRef, GSN_START_BACKUP_REF, signal,
                 StartBackupRef::SignalLength, JBB);
      return;
    } // if

    // The trigger record's pool index doubles as the trigger id.
    const Uint32 triggerId= trigPtr.i;
    tabPtr.p->triggerIds[j] = triggerId;
    tabPtr.p->triggerAllocated[j] = true;
    trigPtr.p->backupPtr = ptr.i;
    trigPtr.p->tableId = tabPtr.p->tableId;
    trigPtr.p->tab_ptr_i = tabPtr.i;
    trigPtr.p->logEntry = 0;
    trigPtr.p->event = j;  // presumably indexes triggerEventValues[] below — matches the send loop
    trigPtr.p->operation = operation;
    trigPtr.p->errorCode = 0;
  } // for

  /*
   * now ask DBTUP to create
   */
  // Expect exactly 3 replies (one per trigger event) for this table.
  ptr.p->slaveData.gsn = GSN_CREATE_TRIG_IMPL_REQ;
  ptr.p->slaveData.trigSendCounter = 3;
  ptr.p->slaveData.createTrig.tableId = tabPtr.p->tableId;

  req->senderRef = reference();
  req->receiverRef = reference();
  req->senderData = ptr.i;
  req->requestType = 0;

  // Monitor all of the table's attributes (see createAttributeMask).
  Bitmask<MAXNROFATTRIBUTESINWORDS> attrMask;
  createAttributeMask(tabPtr, attrMask);

  req->tableId = tabPtr.p->tableId;
  req->tableVersion = 0;
  req->indexId = RNIL;
  req->indexVersion = 0;

  Uint32 ti = 0;
  /*
   * We always send PK for any operations and any triggertypes.
   * For SUBSCRIPTION_BEFORE
   *   We send after image for INSERT.
   *   We send before image for DELETE.
   *   We send before+after image for UPDATE.
   * For SUBSCRIPTION
   *   We send after image for INSERT.
   *   We send only PK for DELETE.
   *   We send after image for UPDATE.
   */
  // Undo-log backups need before-images, hence SUBSCRIPTION_BEFORE.
  if(ptr.p->flags & BackupReq::USE_UNDO_LOG)
    TriggerInfo::setTriggerType(ti, TriggerType::SUBSCRIPTION_BEFORE);
  else
    TriggerInfo::setTriggerType(ti, TriggerType::SUBSCRIPTION);
  TriggerInfo::setTriggerActionTime(ti, TriggerActionTime::TA_DETACHED);
  TriggerInfo::setMonitorReplicas(ti, true);
  TriggerInfo::setMonitorAllAttributes(ti, false);

  for (int i=0; i < 3; i++) {
    req->triggerId = tabPtr.p->triggerIds[i];

    // Same base trigger info, different event (insert/update/delete).
    Uint32 ti2 = ti;
    TriggerInfo::setTriggerEvent(ti2, triggerEventValues[i]);
    req->triggerInfo = ti2;

    // The attribute mask travels as a linear section alongside the request.
    LinearSectionPtr attrPtr[3];
    attrPtr[0].p = attrMask.rep.data;
    attrPtr[0].sz = attrMask.getSizeInWords();


    if (MT_BACKUP_FLAG(ptr.p->flags))
    {
      // In mt-backup, the backup log is divided between LDMs. Each
      // BACKUP block writes insert/update/delete logs for the tuples it owns.
      // Each LDM in a multithreaded backup sends CREATE_TRIG_IMPL_REQs only to
      // its local DBTUP. Each DBTUP processes changes on its own fragments
      // and sends FIRE_TRIG_ORDs to its local BACKUP block. Since one DBTUP
      // instance has no knowledge of changes in other DBTUPs, this ensures
      // that a BACKUP block receives FIRE_TRIG_ORDs only for tuples it owns.
      // Each BACKUP block sends a CREATE_TRIG_IMPL_REQ to its local DBTUP
      // Each DBTUP processes changes on its frags and sends FIRE_TRIG_ORDs
      // to the local BACKUP block. Since one DBTUP instance has no knowledge
      // of changes in other DBTUPs, this ensures that a BACKUP block receives
      // FIRE_TRIG_ORDs for all changes on frags owned by its LDM, and for
      // no other frags.
      BlockReference ref = numberToRef(DBTUP, instance(), getOwnNodeId());
      sendSignal(ref, GSN_CREATE_TRIG_IMPL_REQ,
                 signal, CreateTrigImplReq::SignalLength, JBB, attrPtr ,1);
    }
    else
    {
      // In single-threaded backup, the BACKUP block on LDM1 sends
      // CREATE_TRIG_IMPL_REQs for insert/update/delete on all tables to the
      // DbtupProxy. The DbtupProxy broadcasts the CREATE_TRIG to all LDMs.
      // So for every insert/update/delete, the DBTUP which owns the modified
      // fragment sends a FIRE_TRIG_ORD to the trigger creator on LDM1. When
      // the BACKUP block receives a FIRE_TRIG_ORD, it extracts the details of
      // the insert/update/delete and writes it to the backup log.
      sendSignal(DBTUP_REF, GSN_CREATE_TRIG_IMPL_REQ,
                 signal, CreateTrigImplReq::SignalLength, JBB, attrPtr ,1);
    }
  }
}
4906
4907 void
execCREATE_TRIG_IMPL_CONF(Signal * signal)4908 Backup::execCREATE_TRIG_IMPL_CONF(Signal* signal)
4909 {
4910 jamEntry();
4911 const CreateTrigImplConf* conf =
4912 (const CreateTrigImplConf*)signal->getDataPtr();
4913
4914 const Uint32 ptrI = conf->senderData;
4915 const Uint32 tableId = conf->tableId;
4916 const TriggerEvent::Value type =
4917 TriggerInfo::getTriggerEvent(conf->triggerInfo);
4918
4919 BackupRecordPtr ptr;
4920 c_backupPool.getPtr(ptr, ptrI);
4921
4922 /**
4923 * Verify that I'm waiting for this conf
4924 *
4925 * ptr.p->masterRef != reference()
4926 * as slaves and masters have triggers now.
4927 */
4928 ndbrequire(ptr.p->slaveData.gsn == GSN_CREATE_TRIG_IMPL_REQ);
4929 ndbrequire(ptr.p->slaveData.trigSendCounter.done() == false);
4930 ndbrequire(ptr.p->slaveData.createTrig.tableId == tableId);
4931
4932 TablePtr tabPtr;
4933 ndbrequire(findTable(ptr, tabPtr, tableId));
4934 ndbrequire(type < 3); // if some decides to change the enums
4935
4936 createTrigReply(signal, ptr);
4937 }
4938
4939 void
execCREATE_TRIG_IMPL_REF(Signal * signal)4940 Backup::execCREATE_TRIG_IMPL_REF(Signal* signal)
4941 {
4942 jamEntry();
4943 const CreateTrigImplRef* ref =
4944 (const CreateTrigImplRef*)signal->getDataPtr();
4945
4946 const Uint32 ptrI = ref->senderData;
4947 const Uint32 tableId = ref->tableId;
4948
4949 BackupRecordPtr ptr;
4950 c_backupPool.getPtr(ptr, ptrI);
4951
4952 /**
4953 * Verify that I'm waiting for this ref
4954 *
4955 * ptr.p->masterRef != reference()
4956 * as slaves and masters have triggers now
4957 */
4958 ndbrequire(ptr.p->slaveData.gsn == GSN_CREATE_TRIG_IMPL_REQ);
4959 ndbrequire(ptr.p->slaveData.trigSendCounter.done() == false);
4960 ndbrequire(ptr.p->slaveData.createTrig.tableId == tableId);
4961
4962 ptr.p->setErrorCode(ref->errorCode);
4963
4964 createTrigReply(signal, ptr);
4965 }
4966
/**
 * Common reply handler for CREATE_TRIG_IMPL_CONF/REF.
 *
 * Counts down the three per-table trigger replies. When all have arrived:
 * on error, sends START_BACKUP_REF to the master; otherwise moves on to
 * the next table (sendCreateTrig) or, when all tables are done, sends
 * START_BACKUP_CONF.
 */
void
Backup::createTrigReply(Signal* signal, BackupRecordPtr ptr)
{
  CRASH_INSERTION(10003);

  /**
   * Check finished with table
   */
  ptr.p->slaveData.trigSendCounter--;
  if(ptr.p->slaveData.trigSendCounter.done() == false){
    jam();
    return;
  }//if

  // Error-injection point: pretend trigger creation failed (test 10025).
  if (ERROR_INSERTED(10025))
  {
    ptr.p->errorCode = 325;
  }

  if(ptr.p->checkError()) {
    jam();
    // At least one CREATE_TRIG_IMPL failed: report back to the master.
    ptr.p->m_gsn = GSN_START_BACKUP_REF;
    StartBackupRef* ref = (StartBackupRef*)signal->getDataPtrSend();
    ref->backupPtr = ptr.i;
    ref->backupId = ptr.p->backupId;
    ref->errorCode = ptr.p->errorCode;
    ref->nodeId = getOwnNodeId();
    ndbout_c("Backup::createTrigReply : CREATE_TRIG_IMPL error %d, backup id %u node %d",
             ref->errorCode, ref->backupId, ref->nodeId);
    sendSignal(ptr.p->senderRef, GSN_START_BACKUP_REF, signal,
               StartBackupRef::SignalLength, JBB);
    return;
  }//if

  TablePtr tabPtr;
  ndbrequire(findTable(ptr, tabPtr, ptr.p->slaveData.createTrig.tableId));

  /**
   * Next table
   */
  ptr.p->tables.next(tabPtr);
  if(tabPtr.i != RNIL){
    jam();
    sendCreateTrig(signal, ptr, tabPtr);
    return;
  }//if

  /**
   * We've finished creating triggers.
   *
   * send conf and wait
   */
  ptr.p->m_gsn = GSN_START_BACKUP_CONF;
  StartBackupConf* conf = (StartBackupConf*)signal->getDataPtrSend();
  conf->backupPtr = ptr.i;
  conf->backupId = ptr.p->backupId;
  sendSignal(ptr.p->senderRef, GSN_START_BACKUP_CONF, signal,
             StartBackupConf::SignalLength, JBB);
}
5026
5027 /*****************************************************************************
5028 *
5029 * Master functionallity - Start backup
5030 *
5031 *****************************************************************************/
5032 void
sendStartBackup(Signal * signal,BackupRecordPtr ptr,TablePtr tabPtr)5033 Backup::sendStartBackup(Signal* signal, BackupRecordPtr ptr, TablePtr tabPtr)
5034 {
5035
5036 ptr.p->masterData.startBackup.tablePtr = tabPtr.i;
5037
5038 StartBackupReq* req = (StartBackupReq*)signal->getDataPtrSend();
5039 req->backupId = ptr.p->backupId;
5040 req->backupPtr = ptr.i;
5041 req->senderRef = reference();
5042 /**
5043 * We use trigger Ids that are unique to BACKUP.
5044 * These don't interfere with other triggers (e.g. from DBDICT)
5045 * as there is a special case in DBTUP.
5046 *
5047 * Consequently, backups during online upgrade won't work
5048 */
5049 ptr.p->masterData.gsn = GSN_START_BACKUP_REQ;
5050 ptr.p->masterData.sendCounter = ptr.p->nodes;
5051 BlockNumber backupBlockNo = numberToBlock(BACKUP, instanceKey(ptr));
5052 NodeReceiverGroup rg(backupBlockNo, ptr.p->nodes);
5053 sendSignal(rg, GSN_START_BACKUP_REQ, signal,
5054 StartBackupReq::SignalLength, JBB);
5055 }
5056
5057 void
execSTART_BACKUP_REF(Signal * signal)5058 Backup::execSTART_BACKUP_REF(Signal* signal)
5059 {
5060 jamEntry();
5061
5062 StartBackupRef* ref = (StartBackupRef*)signal->getDataPtr();
5063 const Uint32 ptrI = ref->backupPtr;
5064 //const Uint32 backupId = ref->backupId;
5065 const Uint32 nodeId = ref->nodeId;
5066
5067 BackupRecordPtr ptr;
5068 c_backupPool.getPtr(ptr, ptrI);
5069
5070 ptr.p->setErrorCode(ref->errorCode);
5071 startBackupReply(signal, ptr, nodeId);
5072 }
5073
5074 void
execSTART_BACKUP_CONF(Signal * signal)5075 Backup::execSTART_BACKUP_CONF(Signal* signal)
5076 {
5077 jamEntry();
5078
5079 StartBackupConf* conf = (StartBackupConf*)signal->getDataPtr();
5080 const Uint32 ptrI = conf->backupPtr;
5081 //const Uint32 backupId = conf->backupId;
5082 const Uint32 nodeId = refToNode(signal->senderBlockRef());
5083
5084 BackupRecordPtr ptr;
5085 c_backupPool.getPtr(ptr, ptrI);
5086
5087 startBackupReply(signal, ptr, nodeId);
5088 }
5089
/**
 * Master: collect START_BACKUP_REF/CONF replies from all participants.
 *
 * When all replies have arrived: on error, abort the backup; otherwise
 * optionally confirm to the client (BACKUP_CONF), emit the BackupStarted
 * event, and send WAIT_GCP_REQ to DBDIH to establish the backup's
 * startGCP (needed to restore SNAPSHOTSTART backups consistently).
 */
void
Backup::startBackupReply(Signal* signal, BackupRecordPtr ptr, Uint32 nodeId)
{
  // Error-injection 10052: delay own reply and crash the remote replier,
  // forcing the master to see duplicate REFs from that node.
  if (ERROR_INSERTED(10052))
  {
    if (nodeId == getOwnNodeId())
    {
      jam();
      ndbrequire(ptr.p->errorCode == 0)
      // Delay reply from self so that master waits for START_BACKUP_REFs
      sendSignalWithDelay(reference(), GSN_START_BACKUP_CONF, signal,
                          5000, signal->getLength());
      return;
    }
    else
    {
      // Received START_BACKUP_REF/CONF from node n1, now crash n1. This will
      // trigger node-failure handling where master sends START_BACKUP_REF to
      // self on behalf of n1. So master receives 2 REFs from n1.
      signal->theData[0] = 9999;
      sendSignal(numberToRef(CMVMI, nodeId), GSN_NDB_TAMPER, signal, 1, JBB);
    }
  }

  CRASH_INSERTION((10004));

  // Still waiting for more participants to reply.
  if (!haveAllSignals(ptr, GSN_START_BACKUP_REQ, nodeId)) {
    jam();
    return;
  }

  // Error-injection 10026: force a start-backup failure.
  if (ERROR_INSERTED(10026))
  {
    ptr.p->errorCode = 326;
  }

  if(ptr.p->checkError()){
    jam();
    masterAbort(signal, ptr);
    return;
  }

  /*
   * We reply to client after create trigger
   */
  if (SEND_BACKUP_STARTED_FLAG(ptr.p->flags))
  {
    BackupConf * conf = (BackupConf*)signal->getDataPtrSend();
    conf->backupId = ptr.p->backupId;
    conf->senderData = ptr.p->clientData;
    sendSignal(ptr.p->clientRef, GSN_BACKUP_CONF, signal,
               BackupConf::SignalLength, JBB);
  }

  // Report BackupStarted to the event logger.
  signal->theData[0] = NDB_LE_BackupStarted;
  signal->theData[1] = ptr.p->clientRef;
  signal->theData[2] = ptr.p->backupId;
  // Node bitmask is not used at the receiver, so zeroing it out.
  NdbNodeBitmask::clear(signal->theData + 3, NdbNodeBitmask48::Size);
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 3 + NdbNodeBitmask48::Size, JBB);

  /**
   * Wait for startGCP to a establish a consistent point at backup start.
   * This point is consistent since backup logging has started but scans
   * have not yet started, so it needs to be identified by a GCP. Wait till
   * the existing GCP has completed and capture the GCI as the startGCP of
   * this backup.
   * This is needed for SNAPSHOTSTART backups, which are restored to a
   * consistent point at backup start by replaying the backup undo logs up
   * till the end of startGCP.
   */
  ptr.p->masterData.gsn = GSN_WAIT_GCP_REQ;
  ptr.p->masterData.waitGCP.startBackup = true;

  WaitGCPReq * waitGCPReq = (WaitGCPReq*)signal->getDataPtrSend();
  waitGCPReq->senderRef = reference();
  waitGCPReq->senderData = ptr.i;
  waitGCPReq->requestType = WaitGCPReq::CompleteForceStart;
  //we delay 10 seconds for testcases to generate events to be recorded in the UNDO log
  if (ERROR_INSERTED(10041))
  {
    sendSignalWithDelay(DBDIH_REF, GSN_WAIT_GCP_REQ, signal, 10*1000, WaitGCPReq::SignalLength);
  }
  else
    sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
               WaitGCPReq::SignalLength,JBB);
}
5177
5178 void
execWAIT_GCP_REF(Signal * signal)5179 Backup::execWAIT_GCP_REF(Signal* signal)
5180 {
5181 jamEntry();
5182
5183 CRASH_INSERTION((10006));
5184
5185 WaitGCPRef * ref = (WaitGCPRef*)signal->getDataPtr();
5186 const Uint32 ptrI = ref->senderData;
5187
5188 BackupRecordPtr ptr;
5189 c_backupPool.getPtr(ptr, ptrI);
5190
5191 ndbrequire(ptr.p->masterRef == reference());
5192 ndbrequire(ptr.p->masterData.gsn == GSN_WAIT_GCP_REQ);
5193
5194 WaitGCPReq * req = (WaitGCPReq*)signal->getDataPtrSend();
5195 req->senderRef = reference();
5196 req->senderData = ptr.i;
5197 req->requestType = WaitGCPReq::CompleteForceStart;
5198 sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
5199 WaitGCPReq::SignalLength,JBB);
5200 }
5201
/**
 * WAIT_GCP_CONF from DBDIH — a GCP has completed.
 *
 * Two cases, selected by masterData.waitGCP.startBackup:
 *  - start of backup: record startGCP and begin scanning fragments;
 *  - end of backup: once the GCI has advanced at least 3 past startGCP,
 *    record stopGCP and move on to STOP_BACKUP; otherwise keep waiting
 *    for further GCPs.
 */
void
Backup::execWAIT_GCP_CONF(Signal* signal){
  jamEntry();

  CRASH_INSERTION((10007));

  WaitGCPConf * conf = (WaitGCPConf*)signal->getDataPtr();
  const Uint32 ptrI = conf->senderData;
  const Uint32 gcp = conf->gci_hi;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  // Only the master waits for GCPs, and only while a request is pending.
  ndbrequire(ptr.p->masterRef == reference());
  ndbrequire(ptr.p->masterData.gsn == GSN_WAIT_GCP_REQ);

  if(ptr.p->checkError()) {
    jam();
    masterAbort(signal, ptr);
    return;
  }//if

  if(ptr.p->masterData.waitGCP.startBackup) {
    jam();
    CRASH_INSERTION((10008));
    // Backup-start case: this GCI is the backup's startGCP; start the
    // fragment-scan phase.
    ptr.p->startGCP = gcp;
    ptr.p->masterData.sendCounter= 0;
    ptr.p->masterData.gsn = GSN_BACKUP_FRAGMENT_REQ;
    nextFragment(signal, ptr);
    return;
  } else {
    jam();
    // Backup-stop case: require the GCI to have advanced at least 3
    // beyond startGCP before accepting it as stopGCP.
    if(gcp >= ptr.p->startGCP + 3)
    {
      CRASH_INSERTION((10009));
      ptr.p->stopGCP = gcp;
      /**
       * Backup is complete - begin cleanup
       * STOP_BACKUP_REQ is sent to participants.
       * They then drop the local triggers
       */
      sendStopBackup(signal, ptr);
      return;
    }//if

    /**
     * Make sure that we got entire stopGCP
     */
    WaitGCPReq * req = (WaitGCPReq*)signal->getDataPtrSend();
    req->senderRef = reference();
    req->senderData = ptr.i;
    req->requestType = WaitGCPReq::CompleteForceStart;
    sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
               WaitGCPReq::SignalLength,JBB);
    return;
  }
}
5259
5260 /*****************************************************************************
5261 *
5262 * Master functionallity - Backup fragment
5263 *
5264 *****************************************************************************/
/**
 * Master: dispatch BACKUP_FRAGMENT_REQs to idle fragment workers.
 *
 * Walks all tables/fragments and assigns each unscanned, not-currently-
 * scanning fragment to an idle worker LDM on its owning node (see
 * mapFragToLdm). Stops early when no idle workers remain. When every
 * fragment has been scanned, proceeds to wait for the stopGCP.
 */
void
Backup::nextFragment(Signal* signal, BackupRecordPtr ptr)
{
  jam();

  BackupFragmentReq* req = (BackupFragmentReq*)signal->getDataPtrSend();
  req->backupPtr = ptr.i;
  req->backupId = ptr.p->backupId;

  TablePtr tabPtr;
  Uint32 unscanned_frag_count = 0;  // fragments still needing a scan
  ptr.p->tables.first(tabPtr);
  for(; tabPtr.i != RNIL && ptr.p->idleFragWorkerCount > 0; ptr.p->tables.next(tabPtr))
  {
    jam();
    FragmentPtr fragPtr;
    Array<Fragment> & frags = tabPtr.p->fragments;
    const Uint32 fragCount = frags.getSize();

    for(Uint32 i = 0; i<fragCount && ptr.p->idleFragWorkerCount > 0; i++)
    {
      jam();
      tabPtr.p->fragments.getPtr(fragPtr, i);
      const Uint32 nodeId = fragPtr.p->node;
      /* Each frag is owned by a specific LDM on a specific node.
       * Master assigns each frag to an LDM on one of the nodes.
       * Frags are always assigned to nodes which own them, but
       * may be assigned to non-owner LDMs on owner nodes.
       * single-threaded backup -> always assign frag to LDM1
       * multithreaded backup -> assign frag to owner LDM
       * mapFragToLdm() detects backup type and selects LDM.
       */
      Uint32 ldm = mapFragToLdm(ptr, nodeId, fragPtr.p->lqhInstanceKey);
      req->tableId = tabPtr.p->tableId;
      req->fragmentNo = i;
      req->count = 0;
      req->senderRef = reference();
      if (fragPtr.p->scanned == 0)
        unscanned_frag_count++;

      // Dispatch only if the fragment is not done, not in progress, and
      // the target worker is currently marked idle in the bitmask.
      if ((fragPtr.p->scanned == 0) && (fragPtr.p->scanning == 0) &&
          (ptr.p->fragWorkers[nodeId].get(ldm)))
      {
        // Mark worker busy and fragment in-progress before sending.
        ptr.p->fragWorkers[nodeId].clear(ldm);
        fragPtr.p->scanning = 1;
        ptr.p->idleFragWorkerCount--;
        ptr.p->masterData.sendCounter++;
        BlockReference ref = numberToRef(BACKUP, ldm, nodeId);
        sendSignal(ref, GSN_BACKUP_FRAGMENT_REQ, signal,
                   BackupFragmentReq::SignalLength, JBB);

      }//if
    }//for
  }//for

  // Some fragments still pending: wait for more CONFs before continuing.
  if (unscanned_frag_count > 0)
  {
    jam();
    return;
  }//if

  /**
   * Finished with all tables
   */
  {
    /**
     * Wait for stopGCP to a establish a consistent point at backup stop.
     * This point is consistent since backup logging has stopped and scans
     * have completed, so it needs to be identified by a GCP. Wait till
     * the existing GCP has completed and capture the GCI as the stopGCP of
     * this backup.
     * This is needed for SNAPSHOTEND backups, which are restored to a
     * consistent point at backup stop by replaying the backup redo logs up
     * till the end of stopGCP.
     */
    ptr.p->masterData.gsn = GSN_WAIT_GCP_REQ;
    ptr.p->masterData.waitGCP.startBackup = false;

    WaitGCPReq * req = (WaitGCPReq*)signal->getDataPtrSend();
    req->senderRef = reference();
    req->senderData = ptr.i;
    req->requestType = WaitGCPReq::CompleteForceStart;
    sendSignal(DBDIH_REF, GSN_WAIT_GCP_REQ, signal,
               WaitGCPReq::SignalLength, JBB);
  }
}
5351
/**
 * BACKUP_FRAGMENT_CONF: one fragment scan finished successfully.
 *
 * Accumulates byte/record statistics, marks the fragment scanned, returns
 * the worker LDM to the idle set and asks nextFragment() for more work.
 * On a pending error, aborts once all outstanding scans have replied.
 */
void
Backup::execBACKUP_FRAGMENT_CONF(Signal* signal)
{
  jamEntry();

  CRASH_INSERTION((10010));

  BackupFragmentConf * conf = (BackupFragmentConf*)signal->getDataPtr();
  const Uint32 ptrI = conf->backupPtr;
  //const Uint32 backupId = conf->backupId;
  const Uint32 tableId = conf->tableId;
  const Uint32 fragmentNo = conf->fragmentNo;
  const Uint32 nodeId = refToNode(signal->senderBlockRef());
  // 64-bit counters arrive split into low/high 32-bit words.
  const Uint64 noOfBytes =
    conf->noOfBytesLow + (((Uint64)conf->noOfBytesHigh) << 32);
  const Uint64 noOfRecords =
    conf->noOfRecordsLow + (((Uint64)conf->noOfRecordsHigh) << 32);

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  ptr.p->noOfBytes += noOfBytes;
  ptr.p->noOfRecords += noOfRecords;
  ptr.p->masterData.sendCounter--;

  TablePtr tabPtr;
  ndbrequire(findTable(ptr, tabPtr, tableId));

  tabPtr.p->noOfRecords += noOfRecords;

  FragmentPtr fragPtr;
  tabPtr.p->fragments.getPtr(fragPtr, fragmentNo);

  fragPtr.p->noOfRecords = noOfRecords;

  // The fragment must have been in-progress on the replying node.
  ndbrequire(fragPtr.p->scanned == 0);
  ndbrequire(fragPtr.p->scanning == 1);
  ndbrequire(fragPtr.p->node == nodeId);

  fragPtr.p->scanned = 1;
  fragPtr.p->scanning = 0;

  // Error-injection 10028: force a fragment-scan failure.
  if (ERROR_INSERTED(10028))
  {
    ptr.p->errorCode = 328;
  }

  if(ptr.p->checkError())
  {
    jam();
    // Abort only after every outstanding fragment request has replied.
    if(ptr.p->masterData.sendCounter.done())
    {
      jam();
      masterAbort(signal, ptr);
      return;
    }//if
  }
  else
  {
    jam();
    // Return the worker to the idle pool and hand out more fragments.
    Uint32 ldm = mapFragToLdm(ptr, nodeId, fragPtr.p->lqhInstanceKey);
    ptr.p->fragWorkers[nodeId].set(ldm);
    ptr.p->idleFragWorkerCount++;
    nextFragment(signal, ptr);
  }
}
5418
/**
 * BACKUP_FRAGMENT_REF: a fragment scan failed on some node.
 *
 * Locates the fragment that was scanning on the refusing node, marks it
 * done, records the error, and initiates the abort protocol.
 */
void
Backup::execBACKUP_FRAGMENT_REF(Signal* signal)
{
  jamEntry();

  CRASH_INSERTION((10011));

  BackupFragmentRef * ref = (BackupFragmentRef*)signal->getDataPtr();
  const Uint32 ptrI = ref->backupPtr;
  //const Uint32 backupId = ref->backupId;
  const Uint32 nodeId = ref->nodeId;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  // Find the (single) fragment currently scanning on the failed node.
  TablePtr tabPtr;
  ptr.p->tables.first(tabPtr);
  for(; tabPtr.i != RNIL; ptr.p->tables.next(tabPtr)) {
    jam();
    FragmentPtr fragPtr;
    Array<Fragment> & frags = tabPtr.p->fragments;
    const Uint32 fragCount = frags.getSize();

    for(Uint32 i = 0; i<fragCount; i++) {
      jam();
      tabPtr.p->fragments.getPtr(fragPtr, i);
      if(fragPtr.p->scanning != 0 && nodeId == fragPtr.p->node)
      {
        jam();
        ndbrequire(fragPtr.p->scanned == 0);
        fragPtr.p->scanned = 1;
        fragPtr.p->scanning = 0;
        goto done;
      }
    }
  }
  // No in-progress fragment matched the REF: go straight to abort.
  goto err;

 done:
  ptr.p->masterData.sendCounter--;
  ptr.p->setErrorCode(ref->errorCode);

  if(ptr.p->masterData.sendCounter.done())
  {
    jam();
    masterAbort(signal, ptr);
    return;
  }//if
  // NOTE(review): when replies are still outstanding, control falls
  // through into err: below and an abort order is issued immediately —
  // appears intentional, but confirm against the abort protocol.

 err:
  AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
  ord->backupId = ptr.p->backupId;
  ord->backupPtr = ptr.i;
  ord->requestType = AbortBackupOrd::LogBufferFull;
  ord->senderData= ptr.i;
  execABORT_BACKUP_ORD(signal);
}
5476
5477 void
execBACKUP_FRAGMENT_COMPLETE_REP(Signal * signal)5478 Backup::execBACKUP_FRAGMENT_COMPLETE_REP(Signal* signal)
5479 {
5480 jamEntry();
5481 BackupFragmentCompleteRep * rep =
5482 (BackupFragmentCompleteRep*)signal->getDataPtr();
5483
5484 BackupRecordPtr ptr;
5485 c_backupPool.getPtr(ptr, rep->backupPtr);
5486
5487 TablePtr tabPtr;
5488 ndbrequire(findTable(ptr, tabPtr, rep->tableId));
5489
5490 tabPtr.p->noOfRecords =
5491 rep->noOfTableRowsLow + (((Uint64)rep->noOfTableRowsHigh) << 32);
5492
5493 FragmentPtr fragPtr;
5494 tabPtr.p->fragments.getPtr(fragPtr, rep->fragmentNo);
5495
5496 fragPtr.p->noOfRecords =
5497 rep->noOfFragmentRowsLow + (((Uint64)rep->noOfFragmentRowsHigh) << 32);
5498 }
5499
5500 /*****************************************************************************
5501 *
5502 * Slave functionallity - Drop triggers
5503 *
5504 *****************************************************************************/
5505
/**
 * Drive the drop-trigger phase, one table at a time.
 *
 * dropTrig.tableId == RNIL means "start from the first table"; otherwise
 * continue with the table after the one just processed. When all tables
 * are done: on error just close files; otherwise write the log-file
 * footer (a terminating zero word) and the control file's GCP entry,
 * then continue with per-fragment info via CONTINUEB or close files.
 */
void
Backup::sendDropTrig(Signal* signal, BackupRecordPtr ptr)
{
  TablePtr tabPtr;
  ptr.p->slaveData.gsn = GSN_DROP_TRIG_IMPL_REQ;

  if (ptr.p->slaveData.dropTrig.tableId == RNIL) {
    jam();
    if(ptr.p->tables.getCount())
      ptr.p->tables.first(tabPtr);
    else
    {
      // Early abort, go to close files
      jam();
      closeFiles(signal, ptr);
      return;
    }
  } else {
    jam();
    // Resume after the table whose triggers were just dropped.
    ndbrequire(findTable(ptr, tabPtr, ptr.p->slaveData.dropTrig.tableId));
    ptr.p->tables.next(tabPtr);
  }//if
  if (tabPtr.i != RNIL) {
    jam();
    sendDropTrig(signal, ptr, tabPtr);
  } else {
    /**
     * Insert footers
     */
    //if backup error, we needn't insert footers
    if(ptr.p->checkError())
    {
      jam();
      closeFiles(signal, ptr);
      ptr.p->errorCode = 0;
      return;
    }

    {
      // Log-file footer: a single zero word terminates the log.
      BackupFilePtr filePtr;
      ptr.p->files.getPtr(filePtr, ptr.p->logFilePtr);
      Uint32 * dst;
      ndbrequire(filePtr.p->operation.dataBuffer.getWritePtr(&dst, 1));
      * dst = 0;
      filePtr.p->operation.dataBuffer.updateWritePtr(1);
    }

    {
      // Control-file footer: write the GCP_ENTRY section recording the
      // backup's start/stop GCIs (stopGCP stored as stopGCP - 1).
      BackupFilePtr filePtr;
      ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr);

      const Uint32 gcpSz = sizeof(BackupFormat::CtlFile::GCPEntry) >> 2;

      Uint32 * dst;
      ndbrequire(filePtr.p->operation.dataBuffer.getWritePtr(&dst, gcpSz));

      BackupFormat::CtlFile::GCPEntry * gcp =
        (BackupFormat::CtlFile::GCPEntry*)dst;

      // On-disk format is big-endian, hence htonl on every field.
      gcp->SectionType   = htonl(BackupFormat::GCP_ENTRY);
      gcp->SectionLength = htonl(gcpSz);
      gcp->StartGCP      = htonl(ptr.p->startGCP);
      gcp->StopGCP       = htonl(ptr.p->stopGCP - 1);
      filePtr.p->operation.dataBuffer.updateWritePtr(gcpSz);

      {
        // Continue with per-fragment info in the background, or close
        // files directly if there are no tables.
        TablePtr tabPtr;
        if (ptr.p->tables.first(tabPtr))
        {
          jam();
          signal->theData[0] = BackupContinueB::BACKUP_FRAGMENT_INFO;
          signal->theData[1] = ptr.i;
          signal->theData[2] = tabPtr.i;
          signal->theData[3] = 0;
          sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
        }
        else
        {
          jam();
          closeFiles(signal, ptr);
        }
      }
    }
  }
}
5591
/**
 * Send DROP_TRIG_IMPL_REQ for all three triggers of one table.
 *
 * The trigger-info word must match the one used at creation time
 * (sendCreateTrig) so DBTUP can locate the trigger in the right list.
 * Replies are counted via slaveData.trigSendCounter and handled in
 * dropTrigReply().
 */
void
Backup::sendDropTrig(Signal* signal, BackupRecordPtr ptr, TablePtr tabPtr)
{
  jam();
  DropTrigImplReq* req = (DropTrigImplReq*)signal->getDataPtrSend();

  ptr.p->slaveData.gsn = GSN_DROP_TRIG_IMPL_REQ;
  ptr.p->slaveData.trigSendCounter = 0;
  req->senderRef = reference(); // Sending to myself
  req->senderData = ptr.i;
  req->requestType = 0;
  req->tableId = tabPtr.p->tableId;
  req->tableVersion = 0;
  req->indexId = RNIL;
  req->indexVersion = 0;
  req->receiverRef = reference();

  // TUP needs some triggerInfo to find right list
  Uint32 ti = 0;
  // Must mirror the trigger type chosen in sendCreateTrig.
  if(ptr.p->flags & BackupReq::USE_UNDO_LOG)
    TriggerInfo::setTriggerType(ti, TriggerType::SUBSCRIPTION_BEFORE);
  else
    TriggerInfo::setTriggerType(ti, TriggerType::SUBSCRIPTION);
  TriggerInfo::setTriggerActionTime(ti, TriggerActionTime::TA_DETACHED);
  TriggerInfo::setMonitorReplicas(ti, true);
  TriggerInfo::setMonitorAllAttributes(ti, false);

  // Remember which table we're dropping so dropTrigReply can resume
  // iteration from here.
  ptr.p->slaveData.dropTrig.tableId = tabPtr.p->tableId;
  req->tableId = tabPtr.p->tableId;

  for (int i = 0; i < 3; i++) {
    Uint32 id = tabPtr.p->triggerIds[i];
    req->triggerId = id;

    // Same base info, different event per trigger.
    Uint32 ti2 = ti;
    TriggerInfo::setTriggerEvent(ti2, triggerEventValues[i]);
    req->triggerInfo = ti2;
    if (MT_BACKUP_FLAG(ptr.p->flags))
    {
      // mt-backup: drop on the local DBTUP instance only.
      BlockReference ref = numberToRef(DBTUP, instance(), getOwnNodeId());
      sendSignal(ref, GSN_DROP_TRIG_IMPL_REQ,
                 signal, DropTrigImplReq::SignalLength, JBB);
    }
    else
    {
      // single-threaded backup: drop via the DBTUP proxy reference.
      sendSignal(DBTUP_REF, GSN_DROP_TRIG_IMPL_REQ,
                 signal, DropTrigImplReq::SignalLength, JBB);
    }

    ptr.p->slaveData.trigSendCounter ++;
  }
}
5644
5645 void
execDROP_TRIG_IMPL_REF(Signal * signal)5646 Backup::execDROP_TRIG_IMPL_REF(Signal* signal)
5647 {
5648 jamEntry();
5649
5650 const DropTrigImplRef* ref = (const DropTrigImplRef*)signal->getDataPtr();
5651 const Uint32 ptrI = ref->senderData;
5652
5653 BackupRecordPtr ptr;
5654 c_backupPool.getPtr(ptr, ptrI);
5655
5656 if(ref->triggerId != ~(Uint32) 0)
5657 {
5658 ndbout << "ERROR DROPPING TRIGGER: " << ref->triggerId;
5659 ndbout << " Err: " << ref->errorCode << endl << endl;
5660 }
5661
5662 dropTrigReply(signal, ptr);
5663 }
5664
5665 void
execDROP_TRIG_IMPL_CONF(Signal * signal)5666 Backup::execDROP_TRIG_IMPL_CONF(Signal* signal)
5667 {
5668 jamEntry();
5669
5670 const DropTrigImplConf* conf = (const DropTrigImplConf*)signal->getDataPtr();
5671 const Uint32 ptrI = conf->senderData;
5672
5673 BackupRecordPtr ptr;
5674 c_backupPool.getPtr(ptr, ptrI);
5675
5676 dropTrigReply(signal, ptr);
5677 }
5678
5679 void
dropTrigReply(Signal * signal,BackupRecordPtr ptr)5680 Backup::dropTrigReply(Signal* signal, BackupRecordPtr ptr)
5681 {
5682 CRASH_INSERTION((10012));
5683
5684 ndbrequire(ptr.p->slaveData.gsn == GSN_DROP_TRIG_IMPL_REQ);
5685 ndbrequire(ptr.p->slaveData.trigSendCounter.done() == false);
5686
5687 // move from .masterData to .slaveData
5688 ptr.p->slaveData.trigSendCounter--;
5689 if(ptr.p->slaveData.trigSendCounter.done() == false){
5690 jam();
5691 return;
5692 }//if
5693
5694 sendDropTrig(signal, ptr); // recursive next
5695 }
5696
5697 /*****************************************************************************
5698 *
5699 * Master functionallity - Stop backup
5700 *
5701 *****************************************************************************/
5702 void
execSTOP_BACKUP_REF(Signal * signal)5703 Backup::execSTOP_BACKUP_REF(Signal* signal)
5704 {
5705 jamEntry();
5706
5707 StopBackupRef* ref = (StopBackupRef*)signal->getDataPtr();
5708 const Uint32 ptrI = ref->backupPtr;
5709 //const Uint32 backupId = ref->backupId;
5710 const Uint32 nodeId = ref->nodeId;
5711
5712 BackupRecordPtr ptr;
5713 c_backupPool.getPtr(ptr, ptrI);
5714
5715 ptr.p->setErrorCode(ref->errorCode);
5716 stopBackupReply(signal, ptr, nodeId);
5717 }
5718
/**
 * Master: broadcast STOP_BACKUP_REQ (with start/stop GCPs) to the BACKUP
 * blocks on all participating nodes.
 *
 * Normally the receiver instance is instanceKey(ptr); but if this block's
 * own worker is the sole local fragment worker, senderRef/receiver are
 * redirected to this very instance so replies loop back to self.
 */
void
Backup::sendStopBackup(Signal* signal, BackupRecordPtr ptr)
{
  jam();

  StopBackupReq* stop = (StopBackupReq*)signal->getDataPtrSend();
  stop->backupPtr = ptr.i;
  stop->backupId = ptr.p->backupId;
  stop->startGCP = ptr.p->startGCP;
  stop->stopGCP = ptr.p->stopGCP;
  stop->senderRef = reference();

  ptr.p->masterData.gsn = GSN_STOP_BACKUP_REQ;
  ptr.p->masterData.sendCounter = ptr.p->nodes;
  Uint32 receiverInstance = instanceKey(ptr);

  // Send-to-self special case: this instance is the only fragment worker
  // left on this node.
  if((ptr.p->fragWorkers[getOwnNodeId()].count() == 1)
     && (ptr.p->fragWorkers[getOwnNodeId()].find_first() == instance()))
  {
    // All signal-sender functions in abort protocol detect
    // send-to-self bitmask settings and send signals accordingly.
    ptr.p->senderRef = reference();
    receiverInstance = instance();
  }
  BlockNumber backupBlockNo = numberToBlock(BACKUP, receiverInstance);
  NodeReceiverGroup rg(backupBlockNo, ptr.p->nodes);
  sendSignal(rg, GSN_STOP_BACKUP_REQ, signal,
             StopBackupReq::SignalLength, JBB);
}
5748
5749 void
execSTOP_BACKUP_CONF(Signal * signal)5750 Backup::execSTOP_BACKUP_CONF(Signal* signal)
5751 {
5752 jamEntry();
5753
5754 StopBackupConf* conf = (StopBackupConf*)signal->getDataPtr();
5755 const Uint32 ptrI = conf->backupPtr;
5756 //const Uint32 backupId = conf->backupId;
5757 const Uint32 nodeId = refToNode(signal->senderBlockRef());
5758
5759 BackupRecordPtr ptr;
5760 c_backupPool.getPtr(ptr, ptrI);
5761
5762 ptr.p->noOfLogBytes += conf->noOfLogBytes;
5763 ptr.p->noOfLogRecords += conf->noOfLogRecords;
5764
5765 stopBackupReply(signal, ptr, nodeId);
5766 }
5767
/**
 * Master: collect STOP_BACKUP_REF/CONF replies from all participants.
 *
 * When all replies are in: always send the BackupComplete abort order to
 * tear down state; then, if no error occurred, optionally report
 * BACKUP_COMPLETE_REP to the client and emit the BackupCompleted event;
 * on error, run masterAbort().
 */
void
Backup::stopBackupReply(Signal* signal, BackupRecordPtr ptr, Uint32 nodeId)
{
  // Error-injection 10053: delay own reply and crash the remote replier,
  // forcing the master to see duplicate REFs from that node.
  if (ERROR_INSERTED(10053))
  {
    if (nodeId == getOwnNodeId())
    {
      jam();
      ndbrequire(ptr.p->errorCode == 0)
      // Delay reply from self so that master waits for STOP_BACKUP_REFs
      sendSignalWithDelay(reference(), GSN_STOP_BACKUP_CONF, signal,
                          5000, signal->getLength());
      return;
    }
    else
    {
      // Received STOP_BACKUP_REF/CONF from node n1, now crash n1. This will
      // trigger node-failure handling where master sends STOP_BACKUP_REF to
      // self on behalf of n1. So master receives 2 REFs from n1.
      signal->theData[0] = 9999;
      sendSignal(numberToRef(CMVMI, nodeId), GSN_NDB_TAMPER, signal, 1, JBB);
    }
  }
  CRASH_INSERTION((10013));

  // Still waiting for more participants to reply.
  if (!haveAllSignals(ptr, GSN_STOP_BACKUP_REQ, nodeId)) {
    jam();
    return;
  }

  sendAbortBackupOrd(signal, ptr, AbortBackupOrd::BackupComplete);

  if(!ptr.p->checkError() &&  ptr.p->masterData.errorCode == 0)
  {
    if (SEND_BACKUP_COMPLETED_FLAG(ptr.p->flags))
    {
      BackupCompleteRep * rep = (BackupCompleteRep*)signal->getDataPtrSend();
      rep->backupId = ptr.p->backupId;
      rep->senderData = ptr.p->clientData;
      rep->startGCP = ptr.p->startGCP;
      rep->stopGCP = ptr.p->stopGCP;
      // 64-bit counters are split into low/high words for the signal.
      rep->noOfBytesLow = (Uint32)(ptr.p->noOfBytes & 0xFFFFFFFF);
      rep->noOfRecordsLow = (Uint32)(ptr.p->noOfRecords & 0xFFFFFFFF);
      rep->noOfBytesHigh = (Uint32)(ptr.p->noOfBytes >> 32);
      rep->noOfRecordsHigh = (Uint32)(ptr.p->noOfRecords >> 32);
      rep->noOfLogBytes = Uint32(ptr.p->noOfLogBytes); // TODO 64-bit log-bytes
      rep->noOfLogRecords = Uint32(ptr.p->noOfLogRecords); // TODO ^^
      sendSignal(ptr.p->clientRef, GSN_BACKUP_COMPLETE_REP, signal,
                 BackupCompleteRep::SignalLength, JBB);
    }

    if (ERROR_INSERTED(10042))
    {
      // Change backup statistics to reflect values > 32 bit
      ptr.p->noOfRecords = INT_MAX64;
      ptr.p->noOfBytes = INT_MAX64;
      ptr.p->noOfLogRecords = INT_MAX64;
      ptr.p->noOfLogBytes = INT_MAX64;
    }

    // Report BackupCompleted to the event logger (stats split low/high).
    signal->theData[0] = NDB_LE_BackupCompleted;
    signal->theData[1] = ptr.p->clientRef;
    signal->theData[2] = ptr.p->backupId;
    signal->theData[3] = ptr.p->startGCP;
    signal->theData[4] = ptr.p->stopGCP;
    signal->theData[5] = (Uint32)(ptr.p->noOfBytes & 0xFFFFFFFF);
    signal->theData[6] = (Uint32)(ptr.p->noOfRecords & 0xFFFFFFFF);
    signal->theData[7] = (Uint32)(ptr.p->noOfLogBytes & 0xFFFFFFFF);
    signal->theData[8] = (Uint32)(ptr.p->noOfLogRecords & 0xFFFFFFFF);
    signal->theData[9] = 0; //unused
    signal->theData[10] = 0; //unused
    signal->theData[11] = (Uint32)(ptr.p->noOfBytes >> 32);
    signal->theData[12] = (Uint32)(ptr.p->noOfRecords >> 32);
    signal->theData[13] = (Uint32)(ptr.p->noOfLogBytes >> 32);
    signal->theData[14] = (Uint32)(ptr.p->noOfLogRecords >> 32);
    sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 15, JBB);
  }
  else
  {
    masterAbort(signal, ptr);
  }
}
5850
5851 void
initReportStatus(Signal * signal,BackupRecordPtr ptr)5852 Backup::initReportStatus(Signal *signal, BackupRecordPtr ptr)
5853 {
5854 ptr.p->m_prev_report = NdbTick_getCurrentTicks();
5855 }
5856
5857 void
checkReportStatus(Signal * signal,BackupRecordPtr ptr)5858 Backup::checkReportStatus(Signal *signal, BackupRecordPtr ptr)
5859 {
5860 if (m_backup_report_frequency == 0)
5861 return;
5862
5863 const NDB_TICKS now = NdbTick_getCurrentTicks();
5864 const Uint64 elapsed = NdbTick_Elapsed(ptr.p->m_prev_report, now).seconds();
5865 if (elapsed > m_backup_report_frequency)
5866 {
5867 reportStatus(signal, ptr);
5868 ptr.p->m_prev_report = now;
5869 }
5870 }
5871
5872 void
reportStatus(Signal * signal,BackupRecordPtr ptr,BlockReference ref)5873 Backup::reportStatus(Signal* signal, BackupRecordPtr ptr,
5874 BlockReference ref)
5875 {
5876 const int signal_length = 11;
5877
5878 signal->theData[0] = NDB_LE_BackupStatus;
5879 for (int i= 1; i < signal_length; i++)
5880 signal->theData[i] = 0;
5881
5882 if (ptr.i == RNIL ||
5883 (ptr.p->m_gsn == 0 &&
5884 ptr.p->masterData.gsn == 0))
5885 {
5886 sendSignal(ref, GSN_EVENT_REP, signal, signal_length, JBB);
5887 return;
5888 }
5889 signal->theData[1] = ptr.p->clientRef;
5890 signal->theData[2] = ptr.p->backupId;
5891
5892 if (ptr.p->dataFilePtr[0] == RNIL)
5893 {
5894 sendSignal(ref, GSN_EVENT_REP, signal, signal_length, JBB);
5895 return;
5896 }
5897
5898 BackupFilePtr dataFilePtr;
5899 ptr.p->files.getPtr(dataFilePtr, ptr.p->dataFilePtr[0]);
5900 signal->theData[3] = (Uint32)(dataFilePtr.p->operation.m_bytes_total & 0xFFFFFFFF);
5901 signal->theData[4] = (Uint32)(dataFilePtr.p->operation.m_bytes_total >> 32);
5902 signal->theData[5] = (Uint32)(dataFilePtr.p->operation.m_records_total & 0xFFFFFFFF);
5903 signal->theData[6] = (Uint32)(dataFilePtr.p->operation.m_records_total >> 32);
5904
5905 if (ptr.p->logFilePtr == RNIL)
5906 {
5907 sendSignal(ref, GSN_EVENT_REP, signal, signal_length, JBB);
5908 return;
5909 }
5910
5911 BackupFilePtr logFilePtr;
5912 ptr.p->files.getPtr(logFilePtr, ptr.p->logFilePtr);
5913 signal->theData[7] = (Uint32)(logFilePtr.p->operation.m_bytes_total & 0xFFFFFFFF);
5914 signal->theData[8] = (Uint32)(logFilePtr.p->operation.m_bytes_total >> 32);
5915 signal->theData[9] = (Uint32)(logFilePtr.p->operation.m_records_total & 0xFFFFFFFF);
5916 signal->theData[10]= (Uint32)(logFilePtr.p->operation.m_records_total >> 32);
5917
5918 sendSignal(ref, GSN_EVENT_REP, signal, signal_length, JBB);
5919 }
5920
5921 /*****************************************************************************
5922 *
5923 * Master functionallity - Abort backup
5924 *
5925 *****************************************************************************/
void
Backup::masterAbort(Signal* signal, BackupRecordPtr ptr)
{
  /**
   * Master-side abort of an ongoing backup. Informs the client (if it was
   * told the backup started), logs a BackupAborted event, then drives the
   * cleanup appropriate to how far the backup protocol has progressed.
   */
  jam();
#ifdef DEBUG_ABORT
  ndbout_c("************ masterAbort");
#endif

  // Only the master coordinator runs this path.
  ndbassert(ptr.p->masterRef == reference());
  if(ptr.p->masterData.errorCode != 0)
  {
    jam();
    // An abort is already in progress; do not restart it.
    return;
  }

  if (SEND_BACKUP_STARTED_FLAG(ptr.p->flags))
  {
    // Client saw BACKUP_STARTED, so it must also be told about the abort.
    BackupAbortRep* rep = (BackupAbortRep*)signal->getDataPtrSend();
    rep->backupId = ptr.p->backupId;
    rep->senderData = ptr.p->clientData;
    rep->reason = ptr.p->errorCode;
    sendSignal(ptr.p->clientRef, GSN_BACKUP_ABORT_REP, signal,
	       BackupAbortRep::SignalLength, JBB);
  }
  // Record the abort in the cluster event log.
  signal->theData[0] = NDB_LE_BackupAborted;
  signal->theData[1] = ptr.p->clientRef;
  signal->theData[2] = ptr.p->backupId;
  signal->theData[3] = ptr.p->errorCode;
  sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 4, JBB);

  ndbrequire(ptr.p->errorCode);
  // Latch the error code; the check at the top makes this function
  // idempotent for repeated error reports.
  ptr.p->masterData.errorCode = ptr.p->errorCode;

  AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
  ord->backupId = ptr.p->backupId;
  ord->backupPtr = ptr.i;
  ord->senderData= ptr.i;
  Uint32 receiverInstance = instanceKey(ptr); // = BackupProxy for mt-backup

  if((ptr.p->fragWorkers[getOwnNodeId()].count() == 1)
     && (ptr.p->fragWorkers[getOwnNodeId()].find_first() == instance()))
  {
    // All signal-sender functions in abort protocol detect
    // send-to-self bitmask settings and send signals accordingly.
    ptr.p->senderRef = reference();
    receiverInstance = instance();
  }

  BlockNumber backupBlockNo = numberToBlock(BACKUP, receiverInstance);
  NodeReceiverGroup rg(backupBlockNo, ptr.p->nodes);

  // Dispatch on the protocol phase the master is currently waiting in.
  switch(ptr.p->masterData.gsn){
  case GSN_DEFINE_BACKUP_REQ:
    // Define phase: order every participant to drop the backup outright.
    ord->requestType = AbortBackupOrd::BackupFailure;
    sendSignal(rg, GSN_ABORT_BACKUP_ORD, signal,
	       AbortBackupOrd::SignalLength, JBB);
    return;
  case GSN_CREATE_TRIG_IMPL_REQ:
  case GSN_START_BACKUP_REQ:
  case GSN_ALTER_TRIG_REQ:
  case GSN_WAIT_GCP_REQ:
  case GSN_BACKUP_FRAGMENT_REQ:
    jam();
    // Data-collection phases: run the regular stop protocol for cleanup.
    ptr.p->stopGCP= ptr.p->startGCP + 1;
    sendStopBackup(signal, ptr); // dropping due to error
    return;
  case GSN_UTIL_SEQUENCE_REQ:
  case GSN_UTIL_LOCK_REQ:
    // Abort must never arrive while waiting for UTIL services.
    ndbabort();
  case GSN_DROP_TRIG_IMPL_REQ:
  case GSN_STOP_BACKUP_REQ:
    // Stop protocol already running; it performs the cleanup itself.
    return;
  }
}
6000
6001 void
abort_scan(Signal * signal,BackupRecordPtr ptr)6002 Backup::abort_scan(Signal * signal, BackupRecordPtr ptr)
6003 {
6004 AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
6005 ord->backupId = ptr.p->backupId;
6006 ord->backupPtr = ptr.i;
6007 ord->senderData= ptr.i;
6008 ord->requestType = AbortBackupOrd::AbortScan;
6009
6010 TablePtr tabPtr;
6011 ptr.p->tables.first(tabPtr);
6012 for(; tabPtr.i != RNIL; ptr.p->tables.next(tabPtr)) {
6013 jam();
6014 FragmentPtr fragPtr;
6015 Array<Fragment> & frags = tabPtr.p->fragments;
6016 const Uint32 fragCount = frags.getSize();
6017
6018 for(Uint32 i = 0; i<fragCount; i++) {
6019 jam();
6020 tabPtr.p->fragments.getPtr(fragPtr, i);
6021 const Uint32 nodeId = fragPtr.p->node;
6022 if(fragPtr.p->scanning != 0 && ptr.p->nodes.get(nodeId)) {
6023 jam();
6024 Uint32 ldm = mapFragToLdm(ptr, nodeId, fragPtr.p->lqhInstanceKey);
6025 BlockReference ref = numberToRef(BACKUP, ldm, nodeId);
6026 sendSignal(ref, GSN_ABORT_BACKUP_ORD, signal,
6027 AbortBackupOrd::SignalLength, JBB);
6028
6029 }
6030 }
6031 }
6032 }
6033
6034 /*****************************************************************************
6035 *
6036 * Slave functionallity: Define Backup
6037 *
6038 *****************************************************************************/
void
Backup::defineBackupRef(Signal* signal, BackupRecordPtr ptr, Uint32 errCode)
{
  /**
   * Fail the define phase with errCode. For an LCP this closes the already
   * open data/control files (waiting for in-flight closes) before replying
   * with LCP_PREPARE_REF; for a backup it replies with DEFINE_BACKUP_REF.
   * NOTE: may be re-entered while waiting for file closes (the early
   * returns below) — presumably re-driven by the close-completion path;
   * confirm against closeFile's callers.
   */
  jam();
  if(ptr.p->is_lcp())
  {
    jam();
    ptr.p->setPrepareErrorCode(errCode);
    ptr.p->prepareState = PREPARE_ABORTING;
    ndbrequire(ptr.p->ctlFilePtr != RNIL);

    /**
     * This normally happens when a table has been deleted before we got to
     * start the LCP. This is a normal behaviour.
     *
     * At this point we have both the data file and the control file to use
     * open. At this point it is ok to remove both of them since they will
     * no longer be needed. This will happen in closeFile since we have set
     * the error code here.
     */
    BackupFilePtr filePtr;
    // First: the prepare-phase data file.
    ptr.p->files.getPtr(filePtr, ptr.p->prepareDataFilePtr[0]);
    if (filePtr.p->m_flags & BackupFile::BF_OPEN &&
        !(filePtr.p->m_flags & BackupFile::BF_CLOSING))
    {
      jam();
      ndbrequire(! (filePtr.p->m_flags & BackupFile::BF_FILE_THREAD));
      filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_LCP_META;
      closeFile(signal, ptr, filePtr, true);
    }
    else if (filePtr.p->m_flags & BackupFile::BF_CLOSING)
    {
      /* Wait for the data file closing */
      jam();
      return;
    }
    else
    {
      jam();
      // Data file already fully closed; all flags must be cleared.
      ndbrequire(filePtr.p->m_flags == 0);
    }
    // Second: the prepare-phase control file currently selected.
    ptr.p->files.getPtr(filePtr,
              ptr.p->prepareCtlFilePtr[ptr.p->prepareNextLcpCtlFileNumber]);
    if (filePtr.p->m_flags & BackupFile::BF_OPEN &&
        !(filePtr.p->m_flags & BackupFile::BF_CLOSING))
    {
      jam();
      closeFile(signal, ptr, filePtr, true);
      return;
    }
    else if (filePtr.p->m_flags & BackupFile::BF_CLOSING)
    {
      /* Wait for the control file to close as well. */
      jam();
      return;
    }
    else
    {
      jam();
      ndbrequire(filePtr.p->m_flags == 0);
    }

    // Both files are closed: reply LCP_PREPARE_REF to the master with the
    // (single) table/fragment this prepare was for.
    TablePtr tabPtr;
    FragmentPtr fragPtr;

    ndbrequire(ptr.p->prepare_table.first(tabPtr));
    tabPtr.p->fragments.getPtr(fragPtr, 0);
    DEB_LCP(("(%u)LCP_PREPARE_REF", instance()));
    LcpPrepareRef* ref= (LcpPrepareRef*)signal->getDataPtrSend();
    ref->senderData = ptr.p->clientData;
    ref->senderRef = reference();
    ref->tableId = tabPtr.p->tableId;
    ref->fragmentId = fragPtr.p->fragmentId;
    ref->errorCode = ptr.p->prepareErrorCode;
    sendSignal(ptr.p->masterRef, GSN_LCP_PREPARE_REF,
	       signal, LcpPrepareRef::SignalLength, JBA);
    ptr.p->prepareState = NOT_ACTIVE;
    return;
  }
  // Backup (non-LCP) path: reply DEFINE_BACKUP_REF to the coordinator.
  ptr.p->setErrorCode(errCode);

  ptr.p->m_gsn = GSN_DEFINE_BACKUP_REF;
  ndbrequire(ptr.p->errorCode != 0);

  DefineBackupRef* ref = (DefineBackupRef*)signal->getDataPtrSend();
  ref->backupId = ptr.p->backupId;
  ref->backupPtr = ptr.i;
  ref->errorCode = ptr.p->errorCode;
  ref->nodeId = getOwnNodeId();
  sendSignal(ptr.p->senderRef, GSN_DEFINE_BACKUP_REF, signal,
	     DefineBackupRef::SignalLength, JBB);
}
6131
6132 void
init_file(BackupFilePtr filePtr,Uint32 backupPtrI)6133 Backup::init_file(BackupFilePtr filePtr, Uint32 backupPtrI)
6134 {
6135 filePtr.p->tableId = RNIL;
6136 filePtr.p->backupPtr = backupPtrI;
6137 filePtr.p->filePointer = RNIL;
6138 filePtr.p->m_flags = 0;
6139 filePtr.p->errorCode = 0;
6140 }
6141
void
Backup::execDEFINE_BACKUP_REQ(Signal* signal)
{
  /**
   * Slave-side define phase: seize (or look up) the backup record, reset
   * its state/statistics, allocate the file records and their data
   * buffers (different sets for LCP vs backup), then continue with
   * fragment info (LCP) or the table list (backup).
   */
  jamEntry();

  DefineBackupReq* req = (DefineBackupReq*)signal->getDataPtr();
  NdbNodeBitmask nodes;

  const Uint32 senderVersion =
    getNodeInfo(refToNode(signal->getSendersBlockRef())).m_version;

  // Participating-node bitmask arrives either in a signal section (newer
  // senders) or inline in the fixed signal area (older senders).
  if (signal->getNoOfSections() >= 1)
  {
    ndbrequire(ndbd_send_node_bitmask_in_section(senderVersion));
    SegmentedSectionPtr ptr;
    SectionHandle handle(this,signal);
    handle.getSection(ptr, 0);
    ndbrequire(ptr.sz <= NdbNodeBitmask::Size);
    copy(nodes.rep.data, ptr);
    releaseSections(handle);
  }
  else
  {
    nodes = req->nodes;
  }

  BackupRecordPtr ptr;
  const Uint32 ptrI = req->backupPtr;
  const Uint32 backupId = req->backupId;

  if(req->masterRef == reference())
  {
    /**
     * Signal sent from myself -> record already seized
     */
    jam();
    c_backupPool.getPtr(ptr, ptrI);
  } else { // from other node
    jam();
#ifdef DEBUG_ABORT
    dumpUsedResources();
#endif
    // Seize with the master's record id so all nodes share the same index.
    if (!c_backups.getPool().seizeId(ptr, ptrI)) {
      jam();
      ndbabort(); // If master has succeeded slave should succed
    }//if
    c_backups.addFirst(ptr);
  }//if

  CRASH_INSERTION((10014));

  if (MT_BACKUP_FLAG(ptr.p->flags))
  {
    // All LDMs participate in backup, not just LDM1
    // Prevent allotment of extra resources for LDM1
    m_skew_disk_speed = false;
  }
  else
  {
    // only LDM1 participates in backup, allot extra disk speed quota
    m_skew_disk_speed = true;
  }

  // The masterRef is the BACKUP block which coordinates the backup
  // across all the nodes, i.e. LDM1 on the master node. The senderRef
  // is the BACKUP block which sent the last REQ signal. The masterRef
  // sends signals to the BackupProxies on all the nodes, and each
  // BackupProxy sends the signals to the LDMs. So the LDMs need to reply
  // to the BackupProxy, not the master.
  //
  // - For single-threaded backup: backup master directly controls
  //   participants on all nodes, so
  //   masterRef = senderRef = LDM1_on_master_node.
  // - For multithreaded backup: backup master sends control signals to
  //   BackupProxy on each node + each BackupProxy controls backup exec
  //   across LDMs, so:
  //   For all LDMs on node N, senderRef = BackupProxy_on_node_N
  //   For all LDMs on all nodes, masterRef = LDM1_on_master_node.
  //
  // masterRef is passed in DEFINE_BACKUP_REQ so that all participants set a
  // masterRef explicitly specified by the master.
  ptr.p->masterRef = req->masterRef;
  ptr.p->senderRef = req->senderRef;
  ptr.p->m_gsn = GSN_DEFINE_BACKUP_REQ;
  ptr.p->slaveState.forceState(INITIAL);
  ptr.p->slaveState.setState(DEFINING);
  ptr.p->prepareState = NOT_ACTIVE;
  ptr.p->slaveData.dropTrig.tableId = RNIL;
  ptr.p->errorCode = 0;
  ptr.p->clientRef = req->clientRef;
  ptr.p->clientData = req->clientData;
  if(req->masterRef == reference())
  {
    ptr.p->flags = req->flags;
  }
  else
    ptr.p->flags = req->flags & ~((Uint32)BackupReq::WAITCOMPLETED); /* remove waitCompleted flags
						 * as non master should never
						 * reply
						 */
  ptr.p->nodes = nodes;
  ptr.p->backupId = backupId;
  ptr.p->backupKey[0] = req->backupKey[0];
  ptr.p->backupKey[1] = req->backupKey[1];
  ptr.p->backupDataLen = req->backupDataLen;
  ptr.p->masterData.errorCode = 0;
  // Reset all per-backup statistics counters.
  ptr.p->noOfBytes = 0;
  ptr.p->m_bytes_written = 0;
  ptr.p->m_row_scan_counter = 0;
  ptr.p->noOfRecords = 0;
  ptr.p->noOfLogBytes = 0;
  ptr.p->noOfLogRecords = 0;
  ptr.p->currGCP = 0;
  ptr.p->startGCP = 0;
  ptr.p->stopGCP = 0;
  ptr.p->m_prioA_scan_batches_to_execute = 0;
  ptr.p->m_lastSignalId = 0;

  /**
   * Allocate files
   */
  BackupFilePtr files[4 + (2*BackupFormat::NDB_MAX_FILES_PER_LCP)];
  // Per-file-class buffer sizes, indexed 0=control, 1=log, 2=data.
  Uint32 noOfPages[] = {
    NO_OF_PAGES_META_FILE,
    2,   // 32k
    0    // 3M
  };

  constexpr Uint32 maxInsert[] =
  {
    MAX_WORDS_META_FILE,                       // control files
    BackupFormat::LogFile::LogEntry::MAX_SIZE, // redo/undo log files
    BACKUP_MIN_BUFF_WORDS                      // data files
  };

  Uint32 minWrite[] = {
    8192,
    8192,
    32768
  };
  Uint32 maxWrite[] = {
    8192,
    8192,
    32768
  };

  // Log and data buffer sizes come from configuration.
  minWrite[1] = c_defaults.m_minWriteSize;
  maxWrite[1] = c_defaults.m_maxWriteSize;
  noOfPages[1] = (c_defaults.m_logBufferSize + sizeof(Page32) - 1) /
    sizeof(Page32);
  minWrite[2] = c_defaults.m_minWriteSize;
  maxWrite[2] = c_defaults.m_maxWriteSize;
  noOfPages[2] = (c_defaults.m_dataBufferSize + sizeof(Page32) - 1) /
    sizeof(Page32);

  ptr.p->ctlFilePtr = ptr.p->logFilePtr = RNIL;
  for (Uint32 i = 0; i < BackupFormat::NDB_MAX_FILES_PER_LCP; i++)
  {
    ptr.p->dataFilePtr[i] = RNIL;
    ptr.p->prepareDataFilePtr[i] = RNIL;
  }

  if (ptr.p->is_lcp())
  {
    /**
     * Allocate table and fragment object LCP prepare and execute
     * phase once and for all. This means we don't risk getting out
     * of resource issues for LCPs.
     */
    jam();
    TablePtr tabPtr;
    m_lcp_ptr = ptr;
    ndbrequire(ptr.p->prepare_table.seizeLast(tabPtr));
    ndbrequire(tabPtr.p->fragments.seize(1));
    ndbrequire(ptr.p->tables.seizeLast(tabPtr));
    ndbrequire(tabPtr.p->fragments.seize(1));

    noOfPages[2] = (c_defaults.m_lcp_buffer_size + sizeof(Page32) - 1) /
      sizeof(Page32);
    // LCP file layout: [0] exec ctl, [1..2] prepare ctl, [3] delete file,
    // [4..] prepare data files, then execute data files.
    for (Uint32 i = 0; i < (4 + (2*BackupFormat::NDB_MAX_FILES_PER_LCP)); i++)
    {
      Uint32 minWriteLcp;
      Uint32 maxWriteLcp;
      Uint32 maxInsertLcp;
      Uint32 noOfPagesLcp;
      ndbrequire(ptr.p->files.seizeFirst(files[i]));
      init_file(files[i], ptr.i);
      switch (i)
      {
        case 0:
        {
          jam();
          minWriteLcp = 1024;
          maxWriteLcp = 32768;
          maxInsertLcp = 8192;
          noOfPagesLcp = 2;
          ptr.p->ctlFilePtr = files[i].i;
          files[i].p->fileType = BackupFormat::CTL_FILE;
          break;
        }
        case 1:
        {
          jam();
          minWriteLcp = 1024;
          maxWriteLcp = 32768;
          maxInsertLcp = 8192;
          noOfPagesLcp = 2;
          ptr.p->prepareCtlFilePtr[0] = files[i].i;
          files[i].p->fileType = BackupFormat::CTL_FILE;
          break;
        }
        case 2:
        {
          jam();
          minWriteLcp = 1024;
          maxWriteLcp = 32768;
          maxInsertLcp = 8192;
          noOfPagesLcp = 2;
          ptr.p->prepareCtlFilePtr[1] = files[i].i;
          files[i].p->fileType = BackupFormat::CTL_FILE;
          break;
        }
        case 3:
        {
          jam();
          minWriteLcp = 1024;
          maxWriteLcp = 32768;
          maxInsertLcp = 8192;
          noOfPagesLcp = 2;
          ptr.p->deleteFilePtr = files[i].i;
          files[i].p->fileType = BackupFormat::DATA_FILE;
          break;
        }
        default:
        {
          if (i < 4 + BackupFormat::NDB_MAX_FILES_PER_LCP)
          {
            jam();
            minWriteLcp = minWrite[2];
            maxWriteLcp = maxWrite[2];
            maxInsertLcp = maxInsert[2];
            noOfPagesLcp = noOfPages[2];
            jam();
            ptr.p->prepareDataFilePtr[i - 4] = files[i].i;
            jam();
            files[i].p->fileType = BackupFormat::DATA_FILE;
            jam();
          }
          else
          {
            jam();
            minWriteLcp = minWrite[2];
            maxWriteLcp = maxWrite[2];
            maxInsertLcp = maxInsert[2];
            noOfPagesLcp = noOfPages[2];
            jam();
            ptr.p->dataFilePtr[i - (4 + BackupFormat::NDB_MAX_FILES_PER_LCP)] =
              files[i].i;
            jam();
            files[i].p->fileType = BackupFormat::DATA_FILE;
            jam();
          }
          break;
        }
      }
      Page32Ptr pagePtr;
      DEB_LCP(("LCP: instance: %u, i: %u, seize %u pages",
               instance(),
               i,
               noOfPagesLcp));
      // LCP buffers are mandatory; failure to seize is fatal.
      ndbrequire(files[i].p->pages.seize(noOfPagesLcp));
      files[i].p->pages.getPtr(pagePtr, 0);
      const char * msg = files[i].p->
        operation.dataBuffer.setup((Uint32*)pagePtr.p,
                                   noOfPagesLcp * (sizeof(Page32) >> 2),
                                   128,
                                   minWriteLcp >> 2,
                                   maxWriteLcp >> 2,
                                   maxInsertLcp);
      if (msg != 0)
      {
        ndbout_c("setup msg = %s, i = %u", msg, i);
        ndbabort();
      }
      files[i].p->operation.m_bytes_total = 0;
      files[i].p->operation.m_records_total = 0;
    }
  }
  else
  {
    // Backup: three files — [0] control, [1] log, [2] data. Allocation
    // failures here are reported with DEFINE_BACKUP_REF, not fatal.
    for (Uint32 i = 0; i < 3; i++)
    {
      jam();
      if (!ptr.p->files.seizeFirst(files[i]))
      {
        jam();
        defineBackupRef(signal, ptr,
                        DefineBackupRef::FailedToAllocateFileRecord);
        return;
      }//if
      init_file(files[i], ptr.i);

      if(ERROR_INSERTED(10035) || files[i].p->pages.seize(noOfPages[i]) == false)
      {
        jam();
        DEBUG_OUT("Failed to seize " << noOfPages[i] << " pages");
        defineBackupRef(signal, ptr, DefineBackupRef::FailedToAllocateBuffers);
        return;
      }//if

      Page32Ptr pagePtr;
      files[i].p->pages.getPtr(pagePtr, 0);

      const char * msg = files[i].p->
        operation.dataBuffer.setup((Uint32*)pagePtr.p,
                                   noOfPages[i] * (sizeof(Page32) >> 2),
                                   128,
                                   minWrite[i] >> 2,
                                   maxWrite[i] >> 2,
                                   maxInsert[i]);
      if(msg != 0) {
        jam();
        defineBackupRef(signal, ptr, DefineBackupRef::FailedToSetupFsBuffers);
        return;
      }//if

      switch(i)
      {
      case 0:
        files[i].p->fileType = BackupFormat::CTL_FILE;
        ptr.p->ctlFilePtr = files[i].i;
        break;
      case 1:
        // Undo-log backups use a distinct file type so restore can tell
        // the log direction.
        if(ptr.p->flags & BackupReq::USE_UNDO_LOG)
          files[i].p->fileType = BackupFormat::UNDO_FILE;
        else
          files[i].p->fileType = BackupFormat::LOG_FILE;
        ptr.p->logFilePtr = files[i].i;
        break;
      case 2:
        files[i].p->fileType = BackupFormat::DATA_FILE;
        ptr.p->dataFilePtr[0] = files[i].i;
      }
      files[i].p->operation.m_bytes_total = 0;
      files[i].p->operation.m_records_total = 0;
    }//for
  }

  initReportStatus(signal, ptr);

  if (!verifyNodesAlive(ptr, ptr.p->nodes)) {
    jam();
    defineBackupRef(signal, ptr, DefineBackupRef::Undefined);
    return;
  }//if
  if (ERROR_INSERTED(10027)) {
    jam();
    defineBackupRef(signal, ptr, 327);
    return;
  }//if

  if(ptr.p->is_lcp())
  {
    jam();
    // LCP already knows its single table/fragment; skip table listing.
    getFragmentInfoDone(signal, ptr);
    return;
  }

  if (ptr.p->backupDataLen == 0)
  {
    jam();
    // Empty backup-data spec means "back up everything".
    backupAllData(signal, ptr);
    return;
  }//if

  /**
   * Not implemented
   */
  ndbabort();
}
6522
6523 void
backupAllData(Signal * signal,BackupRecordPtr ptr)6524 Backup::backupAllData(Signal* signal, BackupRecordPtr ptr)
6525 {
6526 /**
6527 * Get all tables from dict
6528 */
6529 ListTablesReq * req = (ListTablesReq*)signal->getDataPtrSend();
6530 req->init();
6531 req->senderRef = reference();
6532 req->senderData = ptr.i;
6533 req->setTableId(0);
6534 req->setTableType(0);
6535 sendSignal(DBDICT_REF, GSN_LIST_TABLES_REQ, signal,
6536 ListTablesReq::SignalLength, JBB);
6537 }
6538
6539 void
execLIST_TABLES_CONF(Signal * signal)6540 Backup::execLIST_TABLES_CONF(Signal* signal)
6541 {
6542 jamEntry();
6543 Uint32 fragInfo = signal->header.m_fragmentInfo;
6544 ListTablesConf* conf = (ListTablesConf*)signal->getDataPtr();
6545 Uint32 noOfTables = conf->noOfTables;
6546
6547 BackupRecordPtr ptr;
6548 c_backupPool.getPtr(ptr, conf->senderData);
6549
6550 SectionHandle handle (this, signal);
6551 signal->header.m_fragmentInfo = 0;
6552 if (noOfTables > 0)
6553 {
6554 ListTablesData ltd;
6555 const Uint32 listTablesDataSizeInWords = (sizeof(ListTablesData) + 3) / 4;
6556 SegmentedSectionPtr tableDataPtr;
6557 handle.getSection(tableDataPtr, ListTablesConf::TABLE_DATA);
6558 SimplePropertiesSectionReader
6559 tableDataReader(tableDataPtr, getSectionSegmentPool());
6560
6561 tableDataReader.reset();
6562 for(unsigned int i = 0; i<noOfTables; i++) {
6563 jam();
6564 tableDataReader.getWords((Uint32 *)<d, listTablesDataSizeInWords);
6565 Uint32 tableId = ltd.getTableId();
6566 Uint32 tableType = ltd.getTableType();
6567 Uint32 state= ltd.getTableState();
6568 jamLine(tableId);
6569
6570 if (! (DictTabInfo::isTable(tableType) ||
6571 DictTabInfo::isIndex(tableType) ||
6572 DictTabInfo::isFilegroup(tableType) ||
6573 DictTabInfo::isFile(tableType)
6574 || DictTabInfo::isHashMap(tableType)
6575 || DictTabInfo::isForeignKey(tableType)
6576 ))
6577 {
6578 jam();
6579 continue;
6580 }
6581
6582 if (state != DictTabInfo::StateOnline)
6583 {
6584 jam();
6585 continue;
6586 }
6587
6588 TablePtr tabPtr;
6589 ptr.p->tables.seizeLast(tabPtr);
6590 if(tabPtr.i == RNIL) {
6591 jam();
6592 defineBackupRef(signal, ptr, DefineBackupRef::FailedToAllocateTables);
6593 releaseSections(handle);
6594 return;
6595 }//if
6596 tabPtr.p->tableType = tableType;
6597 tabPtr.p->tableId = tableId;
6598 #ifdef VM_TRACE
6599 TablePtr locTabPtr;
6600 ndbassert(findTable(ptr, locTabPtr, tabPtr.p->tableId) == false);
6601 #endif
6602 insertTableMap(tabPtr, ptr.i, tabPtr.p->tableId);
6603 }//for
6604 }
6605
6606 releaseSections(handle);
6607
6608 /*
6609 If first or not last signal
6610 then keep accumulating table data
6611 */
6612 if ((fragInfo == 1) || (fragInfo == 2))
6613 {
6614 jam();
6615 return;
6616 }
6617 openFiles(signal, ptr);
6618 }
6619
void
Backup::openFiles(Signal* signal, BackupRecordPtr ptr)
{
  /**
   * Issue FSOPENREQ for the three backup files (control, log, data).
   * The same request buffer is reused for all three; flag tweaks made for
   * one file (e.g. clearing OM_GZ for the undo log) carry over to the
   * next request on purpose or are re-applied below. Completion arrives
   * in execFSOPENCONF/execFSOPENREF.
   */
  jam();

  BackupFilePtr filePtr;

  FsOpenReq * req = (FsOpenReq *)signal->getDataPtrSend();
  req->userReference = reference();
  req->fileFlags =
    FsOpenReq::OM_WRITEONLY |
    FsOpenReq::OM_CREATE_IF_NONE |
    FsOpenReq::OM_APPEND |
    FsOpenReq::OM_AUTOSYNC;

  if (c_defaults.m_compressed_backup)
    req->fileFlags |= FsOpenReq::OM_GZ;

  FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF);
  req->auto_sync_size = c_defaults.m_disk_synch_size;
  /**
   * Ctl file
   */
  c_backupFilePool.getPtr(filePtr, ptr.p->ctlFilePtr);
  filePtr.p->m_flags |= BackupFile::BF_OPENING;

  req->userPointer = filePtr.i;
  FsOpenReq::setVersion(req->fileNumber, 2);
  FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_CTL);
  FsOpenReq::v2_setSequence(req->fileNumber, ptr.p->backupId);
  FsOpenReq::v2_setNodeId(req->fileNumber, getOwnNodeId());
  /*
   * NDBFS supports 2 backup formats: single-threaded backup and
   * multithreaded backup format.
   *
   * Example of st-backup directory structure in backup path (backup
   * files present in BACKUP-<backupID> directory):
   *
   * mysql@psangam-T460:~$ ls data2/BACKUP/
   * BACKUP-1  BACKUP-2
   * mysql@psangam-T460:~$ ls data2/BACKUP/BACKUP-1/
   * BACKUP-1-0.2.Data  BACKUP-1.2.ctl  BACKUP-1.2.log
   *
   * Example of mt-backup directory structure (backup subfolders in
   * BACKUP-<backupID>, subfolders contain backup files):
   *
   * mysql@psangam-T460:~$ ls data2/BACKUP/BACKUP-1/
   * BACKUP-1-PART-1-OF-4  BACKUP-1-PART-2-OF-4  BACKUP-1-PART-3-OF-4  BACKUP-1-PART-4-OF-4
   * mysql@psangam-T460:~$ ls data2/BACKUP/BACKUP-1/BACKUP-1-PART-1-OF-4/
   * BACKUP-1-0.2.Data  BACKUP-1.2.ctl  BACKUP-1.2.log
   * mysql@psangam-T460:~$ ls data2/BACKUP/BACKUP-1/BACKUP-1-PART-2-OF-4/
   * BACKUP-1-0.2.Data  BACKUP-1.2.ctl  BACKUP-1.2.log
   * mysql@psangam-T460:~$ ls data2/BACKUP/BACKUP-1/BACKUP-1-PART-3-OF-4/
   * BACKUP-1-0.2.Data  BACKUP-1.2.ctl  BACKUP-1.2.log
   * mysql@psangam-T460:~$ ls data2/BACKUP/BACKUP-1/BACKUP-1-PART-4-OF-4/
   * BACKUP-1-0.2.Data  BACKUP-1.2.ctl  BACKUP-1.2.log
   *
   * NDBFS is now aware of the backup part in the file-open operation, as
   * well as the total number of backup parts. If a backup part number is set
   * to 0, it creates files as per the single-threaded backup directory
   * structure. If a non-zero part number is set, it creates files as per the
   * mt-backup directory structure.
   */
  if (MT_BACKUP_FLAG(ptr.p->flags))
  {
    /*
     * If the MT_BACKUP flag is set, a non-zero backup-part-ID is
     * passed to NDBFS so that the multithreaded backup directory
     * structure is used. If it is false, the old single-threaded
     * backup structure is used.
     */
    FsOpenReq::v2_setPartNum(req->fileNumber, instance());
    FsOpenReq::v2_setTotalParts(req->fileNumber, globalData.ndbMtLqhWorkers);
  }
  else
  {
    FsOpenReq::v2_setPartNum(req->fileNumber, 0);
    FsOpenReq::v2_setTotalParts(req->fileNumber, 0);
  }
  sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);

  /**
   * Log file
   */
  c_backupFilePool.getPtr(filePtr, ptr.p->logFilePtr);
  filePtr.p->m_flags |= BackupFile::BF_OPENING;

  //write uncompressed log file when enable undo log,since log file is read from back to front.
  if(ptr.p->flags & BackupReq::USE_UNDO_LOG)
    req->fileFlags &= ~FsOpenReq::OM_GZ;

  req->userPointer = filePtr.i;
  FsOpenReq::setVersion(req->fileNumber, 2);
  FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_LOG);
  FsOpenReq::v2_setSequence(req->fileNumber, ptr.p->backupId);
  FsOpenReq::v2_setNodeId(req->fileNumber, getOwnNodeId());
  if (MT_BACKUP_FLAG(ptr.p->flags))
  {
    FsOpenReq::v2_setPartNum(req->fileNumber, instance());
    FsOpenReq::v2_setTotalParts(req->fileNumber, globalData.ndbMtLqhWorkers);
  }
  else
  {
    FsOpenReq::v2_setPartNum(req->fileNumber, 0);
    FsOpenReq::v2_setTotalParts(req->fileNumber, 0);
  }
  sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);

  /**
   * Data file
   */
  c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[0]);
  filePtr.p->m_flags |= BackupFile::BF_OPENING;

  // Data file may additionally use O_DIRECT; re-add OM_GZ in case the
  // undo-log branch above cleared it.
  if (c_defaults.m_o_direct)
    req->fileFlags |= FsOpenReq::OM_DIRECT;
  if (c_defaults.m_compressed_backup)
    req->fileFlags |= FsOpenReq::OM_GZ;
  req->userPointer = filePtr.i;
  FsOpenReq::setVersion(req->fileNumber, 2);
  FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_DATA);
  FsOpenReq::v2_setSequence(req->fileNumber, ptr.p->backupId);
  FsOpenReq::v2_setNodeId(req->fileNumber, getOwnNodeId());
  if (MT_BACKUP_FLAG(ptr.p->flags))
  {
    FsOpenReq::v2_setPartNum(req->fileNumber, instance());
    FsOpenReq::v2_setTotalParts(req->fileNumber, globalData.ndbMtLqhWorkers);
  }
  else
  {
    FsOpenReq::v2_setPartNum(req->fileNumber, 0);
    FsOpenReq::v2_setTotalParts(req->fileNumber, 0);
  }
  FsOpenReq::v2_setCount(req->fileNumber, 0);
  sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);
}
6756
6757 void
execFSOPENREF(Signal * signal)6758 Backup::execFSOPENREF(Signal* signal)
6759 {
6760 jamEntry();
6761
6762 FsRef * ref = (FsRef *)signal->getDataPtr();
6763
6764 const Uint32 userPtr = ref->userPointer;
6765
6766 BackupFilePtr filePtr;
6767 c_backupFilePool.getPtr(filePtr, userPtr);
6768
6769 ndbrequire(! (filePtr.p->m_flags & BackupFile::BF_OPEN));
6770 ndbrequire(filePtr.p->m_flags & BackupFile::BF_OPENING);
6771 filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_OPENING;
6772
6773 BackupRecordPtr ptr;
6774 c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
6775
6776 ptr.p->setErrorCode(ref->errorCode);
6777 if (ptr.p->is_lcp())
6778 {
6779 jam();
6780 openFilesReplyLCP(signal, ptr, filePtr);
6781 return;
6782 }
6783 openFilesReply(signal, ptr, filePtr);
6784 }
6785
6786 void
execFSOPENCONF(Signal * signal)6787 Backup::execFSOPENCONF(Signal* signal)
6788 {
6789 jamEntry();
6790
6791 FsConf * conf = (FsConf *)signal->getDataPtr();
6792
6793 const Uint32 userPtr = conf->userPointer;
6794 const Uint32 filePointer = conf->filePointer;
6795
6796 BackupFilePtr filePtr;
6797 c_backupFilePool.getPtr(filePtr, userPtr);
6798 filePtr.p->filePointer = filePointer;
6799
6800 BackupRecordPtr ptr;
6801 c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
6802
6803 /**
6804 * Mark files as "opened"
6805 */
6806 ndbrequire(! (filePtr.p->m_flags & BackupFile::BF_OPEN));
6807 ndbrequire(filePtr.p->m_flags & BackupFile::BF_OPENING);
6808 filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_OPENING;
6809 filePtr.p->m_flags |= BackupFile::BF_OPEN;
6810
6811 if (ptr.p->is_lcp())
6812 {
6813 jam();
6814 openFilesReplyLCP(signal, ptr, filePtr);
6815 return;
6816 }
6817 openFilesReply(signal, ptr, filePtr);
6818 }
6819
void
Backup::openFilesReply(Signal* signal,
                       BackupRecordPtr ptr, BackupFilePtr filePtr)
{
  jam();
  /**
   * Called once per FSOPENCONF/FSOPENREF during the define phase of a
   * backup (non-LCP path). When the last outstanding open reply has
   * arrived, this writes the file headers and the table list into the
   * CTL file and kicks off collection of the table definitions.
   *
   * Check if all files have received their open reply.
   */
  for(ptr.p->files.first(filePtr); filePtr.i!=RNIL;ptr.p->files.next(filePtr))
  {
    jam();
    if(filePtr.p->m_flags & BackupFile::BF_OPENING) {
      jam();
      // At least one file is still waiting for its open reply: wait.
      return;
    }//if
  }//for

  if (ERROR_INSERTED(10037)) {
    jam();
    /**
     * Don't return FailedForBackupFilesAleadyExist
     * since that would make NdbBackup auto-retry with a higher
     * backup id :-)
     */
    ptr.p->errorCode = DefineBackupRef::FailedInsertFileHeader;
    defineBackupRef(signal, ptr);
    return;
  }
  /**
   * Did open succeed for all files
   */
  if(ptr.p->checkError())
  {
    jam();
    if(ptr.p->errorCode == FsRef::fsErrFileExists)
    {
      jam();
      // Map the generic FS error to a backup-specific error code.
      ptr.p->errorCode = DefineBackupRef::FailedForBackupFilesAleadyExist;
    }
    defineBackupRef(signal, ptr);
    return;
  }//if

  /**
   * Insert file headers (CTL file, log file, first data file).
   */
  ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr);
  if(!insertFileHeader(BackupFormat::CTL_FILE, ptr.p, filePtr.p)) {
    jam();
    defineBackupRef(signal, ptr, DefineBackupRef::FailedInsertFileHeader);
    return;
  }//if

  // Log file type depends on whether this backup records undo or redo log.
  BackupFormat::FileType logfiletype;
  if(ptr.p->flags & BackupReq::USE_UNDO_LOG)
    logfiletype = BackupFormat::UNDO_FILE;
  else
    logfiletype = BackupFormat::LOG_FILE;

  ptr.p->files.getPtr(filePtr, ptr.p->logFilePtr);
  if(!insertFileHeader(logfiletype, ptr.p, filePtr.p)) {
    jam();
    defineBackupRef(signal, ptr, DefineBackupRef::FailedInsertFileHeader);
    return;
  }//if

  ptr.p->files.getPtr(filePtr, ptr.p->dataFilePtr[0]);
  if(!insertFileHeader(BackupFormat::DATA_FILE, ptr.p, filePtr.p)) {
    jam();
    defineBackupRef(signal, ptr, DefineBackupRef::FailedInsertFileHeader);
    return;
  }//if

  /**
   * Start CTL file thread (flushes the CTL file buffer to disk).
   */
  ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr);
  filePtr.p->m_flags |= BackupFile::BF_FILE_THREAD;

  signal->theData[0] = BackupContinueB::START_FILE_THREAD;
  signal->theData[1] = filePtr.i;
  sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);

  /**
   * Insert table list in ctl file
   */
  FsBuffer & buf = filePtr.p->operation.dataBuffer;

  // Section size in words: the fixed TableList part plus one word per
  // table; the struct already contains one TableIds entry, hence "- 1".
  const Uint32 sz =
    (sizeof(BackupFormat::CtlFile::TableList) >> 2) +
    ptr.p->tables.getCount() - 1;

  Uint32 * dst;
  ndbrequire(sz < buf.getMaxWrite());
  if(!buf.getWritePtr(&dst, sz)) {
    jam();
    defineBackupRef(signal, ptr, DefineBackupRef::FailedInsertTableList);
    return;
  }//if

  // All on-disk words are stored in network byte order (htonl).
  BackupFormat::CtlFile::TableList* tl =
    (BackupFormat::CtlFile::TableList*)dst;
  tl->SectionType = htonl(BackupFormat::TABLE_LIST);
  tl->SectionLength = htonl(sz);

  TablePtr tabPtr;
  Uint32 count = 0;
  for(ptr.p->tables.first(tabPtr);
      tabPtr.i != RNIL;
      ptr.p->tables.next(tabPtr)){
    jam();
    tl->TableIds[count] = htonl(tabPtr.p->tableId);
    count++;
  }//for

  buf.updateWritePtr(sz);

  /**
   * Start getting table definition data (continues via CONTINUEB so we
   * don't hold the thread for too long in one real-time break).
   */
  ndbrequire(ptr.p->tables.first(tabPtr));

  signal->theData[0] = BackupContinueB::BUFFER_FULL_META;
  signal->theData[1] = ptr.i;
  signal->theData[2] = tabPtr.i;
  signal->theData[3] = 0;
  sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
  return;
}
6948
6949 bool
insertFileHeader(BackupFormat::FileType ft,BackupRecord * ptrP,BackupFile * filePtrP)6950 Backup::insertFileHeader(BackupFormat::FileType ft,
6951 BackupRecord * ptrP,
6952 BackupFile * filePtrP){
6953 FsBuffer & buf = filePtrP->operation.dataBuffer;
6954
6955 const Uint32 sz = sizeof(BackupFormat::FileHeader) >> 2;
6956
6957 Uint32 * dst;
6958 ndbrequire(sz < buf.getMaxWrite());
6959 if(!buf.getWritePtr(&dst, sz)) {
6960 jam();
6961 return false;
6962 }//if
6963
6964 BackupFormat::FileHeader* header = (BackupFormat::FileHeader*)dst;
6965 ndbrequire(sizeof(header->Magic) == sizeof(BACKUP_MAGIC));
6966 memcpy(header->Magic, BACKUP_MAGIC, sizeof(BACKUP_MAGIC));
6967 if (ft == BackupFormat::LCP_FILE)
6968 {
6969 jam();
6970 header->BackupVersion = htonl(NDBD_USE_PARTIAL_LCP_v2);
6971 }
6972 else
6973 {
6974 jam();
6975 header->BackupVersion = htonl(NDB_BACKUP_VERSION);
6976 }
6977 header->SectionType = htonl(BackupFormat::FILE_HEADER);
6978 header->SectionLength = htonl(sz - 3);
6979 header->FileType = htonl(ft);
6980 header->BackupId = htonl(ptrP->backupId);
6981 header->BackupKey_0 = htonl(ptrP->backupKey[0]);
6982 header->BackupKey_1 = htonl(ptrP->backupKey[1]);
6983 header->ByteOrder = 0x12345678;
6984 header->NdbVersion = htonl(NDB_VERSION_D);
6985 header->MySQLVersion = htonl(NDB_MYSQL_VERSION_D);
6986
6987 buf.updateWritePtr(sz);
6988 return true;
6989 }
6990
6991 void
execGET_TABINFOREF(Signal * signal)6992 Backup::execGET_TABINFOREF(Signal* signal)
6993 {
6994 jamEntry();
6995 GetTabInfoRef * ref = (GetTabInfoRef*)signal->getDataPtr();
6996 BackupFilePtr filePtr;
6997
6998 const Uint32 senderData = ref->senderData;
6999 BackupRecordPtr ptr;
7000 c_backupFilePool.getPtr(filePtr, senderData);
7001 c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
7002
7003 ndbrequire(filePtr.i == ptr.p->prepareDataFilePtr[0] ||
7004 !ptr.p->is_lcp());
7005 defineBackupRef(signal, ptr, ref->errorCode);
7006 }
7007
void
Backup::execGET_TABINFO_CONF(Signal* signal)
{
  jamEntry();

  // GET_TABINFO_CONF may arrive as several signal fragments; wait
  // until the full signal has been assembled.
  if(!assembleFragments(signal)) {
    jam();
    return;
  }//if

  BackupFilePtr filePtr;
  GetTabInfoConf * const conf = (GetTabInfoConf*)signal->getDataPtr();
  //const Uint32 senderRef = info->senderRef;
  const Uint32 len = conf->totalLen;
  const Uint32 senderData = conf->senderData;
  const Uint32 tableType = conf->tableType;
  const Uint32 tableId = conf->tableId;

  BackupRecordPtr ptr;
  c_backupFilePool.getPtr(filePtr, senderData);
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  // The packed table description travels in a long signal section.
  SectionHandle handle(this, signal);
  SegmentedSectionPtr dictTabInfoPtr;
  handle.getSection(dictTabInfoPtr, GetTabInfoConf::DICT_TAB_INFO);
  ndbrequire(dictTabInfoPtr.sz == len);

  TablePtr tabPtr ;
  if (ptr.p->is_lcp())
  {
    jam();
    // LCP: only the single prepared table is ever fetched from DICT.
    ndbrequire(filePtr.i == ptr.p->prepareDataFilePtr[0])
    ptr.p->prepare_table.first(tabPtr);
    ndbrequire(tabPtr.p->tableId == tableId);
  }
  else
  {
    jam();
    // Backup: the description is written into the CTL file.
    ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr);
    ndbrequire(findTable(ptr, tabPtr, tableId));
  }

  FsBuffer & buf = filePtr.p->operation.dataBuffer;
  Uint32* dst = 0;
  {
    /**
     * Write into ctl file for Backups
     *
     * We don't write TABLE_DESCRIPTION into data LCP files. It is not
     * used in the restore process, so it only uses up space on
     * disk for no purpose.
     *
     * An LCP file only has the following sections:
     * 1) File header section
     * 2) Fragment Header section
     * 3) LCP data section that contains records of type:
     *    - INSERT_TYPE (normal records in ALL parts)
     *    - WRITE_TYPE (normal records in CHANGE parts)
     *    - DELETE_BY_ROWID_TYPE (record deleted in CHANGE parts)
     *    - DELETE_BY_PAGEID_TYPE (all records in page deleted in CHANGE part)
     * 4) Fragment Footer section
     *
     * We still need to copy the table description into a linear array,
     * we solve this by using the FsBuffer also for LCPs. We skip the
     * call to updateWritePtr. This means that we write into the
     * buffer, but the next time we write into the buffer we will
     * overwrite this area.
     */
    Uint32 dstLen = len + 3;  // +3 words of section header
    if(!buf.getWritePtr(&dst, dstLen)) {
      jam();
      ndbabort();
      /* NOTE(review): the statements below are unreachable after
       * ndbabort(); kept as documentation of the pre-abort handling. */
      ptr.p->setErrorCode(DefineBackupRef::FailedAllocateTableMem);
      releaseSections(handle);
      defineBackupRef(signal, ptr);
      return;
    }//if
    if(dst != 0) {
      jam();

      BackupFormat::CtlFile::TableDescription * desc =
        (BackupFormat::CtlFile::TableDescription*)dst;
      desc->SectionType = htonl(BackupFormat::TABLE_DESCRIPTION);
      desc->SectionLength = htonl(len + 3);
      desc->TableType = htonl(tableType);
      dst += 3;

      // Linearize the segmented section into the buffer.
      copy(dst, dictTabInfoPtr);
      if (!ptr.p->is_lcp())
      {
        jam();
        // Backup only: commit the section to the CTL file. For LCPs the
        // write pointer is deliberately not advanced (see comment above).
        buf.updateWritePtr(dstLen);
      }
    }//if
  }

  releaseSections(handle);

  if(ptr.p->checkError())
  {
    jam();
    ndbrequire(!ptr.p->is_lcp());
    defineBackupRef(signal, ptr);
    return;
  }//if

  if (!DictTabInfo::isTable(tabPtr.p->tableType))
  {
    jam();
    // Non-table objects (e.g. indexes) are dropped from the backup's
    // table list; continue with the next table.
    ndbrequire(!ptr.p->is_lcp());
    TablePtr tmp = tabPtr;
    removeTableMap(tmp, ptr.i, tmp.p->tableId);
    ptr.p->tables.next(tabPtr);
    ptr.p->tables.release(tmp);
    /* NOTE(review): tmp.p->tableId is read after release(); the pool
     * slot is presumably still mapped so this works, but confirm the
     * read-after-release is intentional. */
    jamLine(tmp.p->tableId);
    afterGetTabinfoLockTab(signal, ptr, tabPtr);
    return;
  }

  if (!parseTableDescription(signal, ptr, tabPtr, dst, len))
  {
    jam();
    ndbrequire(!ptr.p->is_lcp());
    defineBackupRef(signal, ptr);
    return;
  }

  if(!ptr.p->is_lcp())
  {
    jam();
    // Backup: ask DICT to lock the table against schema changes for
    // the duration of the backup.
    BackupLockTab *req = (BackupLockTab *)signal->getDataPtrSend();
    req->m_senderRef = reference();
    req->m_tableId = tabPtr.p->tableId;
    req->m_lock_unlock = BackupLockTab::LOCK_TABLE;
    req->m_backup_state = BackupLockTab::GET_TABINFO_CONF;
    req->m_backupRecordPtr_I = ptr.i;
    req->m_tablePtr_I = tabPtr.i;
    sendSignal(DBDICT_REF, GSN_BACKUP_LOCK_TAB_REQ, signal,
               BackupLockTab::SignalLength, JBB);
    if (ERROR_INSERTED(10038))
    {
      /* Test */
      AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
      ord->backupId = ptr.p->backupId;
      ord->backupPtr = ptr.i;
      ord->requestType = AbortBackupOrd::ClientAbort;
      ord->senderData= ptr.p->clientData;
      sendSignal(ptr.p->masterRef, GSN_ABORT_BACKUP_ORD, signal,
                 AbortBackupOrd::SignalLength, JBB);
    }
    return;
  }
  else
  {
    jam();
    // LCP: the data file is now fully prepared.
    ndbrequire(filePtr.i == ptr.p->prepareDataFilePtr[0]);
    lcp_open_data_file_done(signal,
                            ptr);
    return;
  }
}
7169
void
Backup::afterGetTabinfoLockTab(Signal *signal,
                               BackupRecordPtr ptr, TablePtr tabPtr)
{
  /**
   * Continuation after a table's definition has been fetched (and, for
   * real tables, locked in DICT). tabPtr is the NEXT table to process,
   * or RNIL when the table list is exhausted.
   */
  if(tabPtr.i == RNIL)
  {
    /**
     * Done with all tables...
     */
    jam();

    ndbrequire(ptr.p->tables.first(tabPtr));
    ndbrequire(!ptr.p->is_lcp());
    // Start fetching fragment distribution from DIH for the first table.
    DihScanTabReq * req = (DihScanTabReq*)signal->getDataPtrSend();
    req->senderRef = reference();
    req->senderData = ptr.i;
    req->tableId = tabPtr.p->tableId;
    req->schemaTransId = 0;
    req->jamBufferPtr = jamBuffer();
    EXECUTE_DIRECT_MT(DBDIH, GSN_DIH_SCAN_TAB_REQ, signal,
                      DihScanTabReq::SignalLength, 0);
    // DIH answered synchronously: the signal buffer now holds the CONF.
    // Patch in our senderData and run the CONF handler directly.
    DihScanTabConf * conf = (DihScanTabConf*)signal->getDataPtr();
    ndbrequire(conf->senderData == 0);
    conf->senderData = ptr.i;
    execDIH_SCAN_TAB_CONF(signal);
    return;
  }//if

  /**
   * Fetch next table...
   */
  signal->theData[0] = BackupContinueB::BUFFER_FULL_META;
  signal->theData[1] = ptr.i;
  signal->theData[2] = tabPtr.i;
  signal->theData[3] = 0;
  sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
  return;
}
7208
bool
Backup::parseTableDescription(Signal* signal,
                              BackupRecordPtr ptr,
                              TablePtr tabPtr,
                              const Uint32 * tabdescptr,
                              Uint32 len)
{
  /**
   * Unpack the DictTabInfo description at tabdescptr (len words) and
   * initialize the Table object: trigger slots, attribute count, the
   * scan attrInfo word, and the max record size used for buffer sizing.
   * Always returns true in the current code; callers still check.
   */
  SimplePropertiesLinearReader it(tabdescptr, len);

  it.first();

  DictTabInfo::Table tmpTab; tmpTab.init();
  SimpleProperties::UnpackStatus stat;
  stat = SimpleProperties::unpack(it, &tmpTab,
                                  DictTabInfo::TableMapping,
                                  DictTabInfo::TableMappingSize);
  // Break: the table-level part ends where the attribute list begins.
  ndbrequire(stat == SimpleProperties::Break);

  bool lcp = ptr.p->is_lcp();

  ndbrequire(tabPtr.p->tableId == tmpTab.TableId);
  ndbrequire(lcp || (tabPtr.p->tableType == tmpTab.TableType));

  /**
   * LCP should not save disk attributes but only mem attributes
   */

  /**
   * Initialize table object
   */
  tabPtr.p->noOfRecords = 0;
  tabPtr.p->schemaVersion = tmpTab.TableVersion;
  tabPtr.p->triggerIds[0] = ILLEGAL_TRIGGER_ID;
  tabPtr.p->triggerIds[1] = ILLEGAL_TRIGGER_ID;
  tabPtr.p->triggerIds[2] = ILLEGAL_TRIGGER_ID;
  tabPtr.p->triggerAllocated[0] = false;
  tabPtr.p->triggerAllocated[1] = false;
  tabPtr.p->triggerAllocated[2] = false;

  tabPtr.p->noOfAttributes = tmpTab.NoOfAttributes;
  tabPtr.p->maxRecordSize = 1; // LEN word
  bzero(tabPtr.p->attrInfo, sizeof(tabPtr.p->attrInfo));

  // The scan program is a single pseudo-attribute read: READ_LCP for
  // LCPs, READ_ALL (of all attributes) for backups.
  if (lcp)
  {
    jam();
    AttributeHeader::init(tabPtr.p->attrInfo, AttributeHeader::READ_LCP, 0);
  }
  else
  {
    jam();
    AttributeHeader::init(tabPtr.p->attrInfo, AttributeHeader::READ_ALL,
                          tmpTab.NoOfAttributes);
  }

  // Walk the attribute list, accumulating worst-case record size.
  Uint32 varsize = 0;
  Uint32 disk = 0;
  Uint32 null = 0;
  for(Uint32 i = 0; i<tmpTab.NoOfAttributes; i++) {
    jam();
    DictTabInfo::Attribute tmp; tmp.init();
    stat = SimpleProperties::unpack(it, &tmp,
                                    DictTabInfo::AttributeMapping,
                                    DictTabInfo::AttributeMappingSize);

    ndbrequire(stat == SimpleProperties::Break);
    it.next(); // Move Past EndOfAttribute

    // LCPs checkpoint only main-memory attributes; skip disk columns.
    if(lcp && tmp.AttributeStorageType == NDB_STORAGETYPE_DISK)
    {
      disk++;
      continue;
    }

    if (tmp.AttributeArrayType != NDB_ARRAYTYPE_FIXED)
      varsize++;

    if (tmp.AttributeNullableFlag)
      null++;

    if (tmp.AttributeSize == 0)
    {
      // Bit type: AttributeArraySize is in bits; round up to words.
      tabPtr.p->maxRecordSize += (tmp.AttributeArraySize + 31) >> 5;
    }
    else
    {
      // AttributeSize is log2 of the element size in bits.
      const Uint32 arr = tmp.AttributeArraySize;
      const Uint32 sz = 1 << tmp.AttributeSize;
      const Uint32 sz32 = (sz * arr + 31) >> 5;

      tabPtr.p->maxRecordSize += sz32;
    }
  }

  tabPtr.p->attrInfoLen = 1;

  if (lcp)
  {
    jam();
    // For LCPs, TUP knows the exact worst-case record size.
    Dbtup* tup = (Dbtup*)globalData.getBlock(DBTUP, instance());
    tabPtr.p->maxRecordSize = 1 + tup->get_max_lcp_record_size(tmpTab.TableId);
  }
  else
  {
    // mask: add words for the attribute/NULL bitmask and the
    // 2-byte length prefixes of the variable-sized attributes.
    tabPtr.p->maxRecordSize += 1 + ((tmpTab.NoOfAttributes + null + 31) >> 5);
    tabPtr.p->maxRecordSize += (2 * varsize + 3) / 4;
  }

  return true;
}
7320
void
Backup::execDIH_SCAN_TAB_CONF(Signal* signal)
{
  /**
   * DIH has returned the fragment count and a scan cookie for one
   * table. Seize and initialize the table's fragment records, then
   * either request the next table's info or move on to fetching
   * per-fragment node placement.
   */
  jamEntry();
  DihScanTabConf * conf = (DihScanTabConf*)signal->getDataPtr();
  const Uint32 fragCount = conf->fragmentCount;
  const Uint32 tableId = conf->tableId;
  const Uint32 senderData = conf->senderData;
  const Uint32 scanCookie = conf->scanCookie;
  ndbrequire(conf->reorgFlag == 0); // no backup during table reorg

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, senderData);

  TablePtr tabPtr;
  ndbrequire(findTable(ptr, tabPtr, tableId));

  // Cookie must be handed back to DIH in DIH_SCAN_TAB_COMPLETE_REP.
  tabPtr.p->m_scan_cookie = scanCookie;
  ndbrequire(tabPtr.p->fragments.seize(fragCount) != false);
  for(Uint32 i = 0; i<fragCount; i++) {
    jam();
    FragmentPtr fragPtr;
    tabPtr.p->fragments.getPtr(fragPtr, i);
    fragPtr.p->scanned = 0;
    fragPtr.p->scanning = 0;
    fragPtr.p->tableId = tableId;
    fragPtr.p->fragmentId = i;
    fragPtr.p->lqhInstanceKey = 0;
    fragPtr.p->node = 0;
  }//for

  /**
   * Next table
   */
  if(ptr.p->tables.next(tabPtr))
  {
    jam();
    DihScanTabReq * req = (DihScanTabReq*)signal->getDataPtrSend();
    req->senderRef = reference();
    req->senderData = ptr.i;
    req->tableId = tabPtr.p->tableId;
    req->schemaTransId = 0;
    req->jamBufferPtr = jamBuffer();
    // DIH answers synchronously; the CONF is left in the signal buffer.
    EXECUTE_DIRECT_MT(DBDIH, GSN_DIH_SCAN_TAB_REQ, signal,
                      DihScanTabReq::SignalLength, 0);
    jamEntry();
    DihScanTabConf * conf = (DihScanTabConf*)signal->getDataPtr();
    ndbrequire(conf->senderData == 0);
    conf->senderData = ptr.i;
    /* conf is already set up properly to be sent as signal */
    /* Real-time break to ensure we don't run for too long in one signal. */
    sendSignal(reference(), GSN_DIH_SCAN_TAB_CONF, signal,
               DihScanTabConf::SignalLength, JBB);
    return;
  }//if

  // All tables done: collect per-fragment node placement.
  ptr.p->tables.first(tabPtr);
  getFragmentInfo(signal, ptr, tabPtr, 0);
}
7380
void
Backup::getFragmentInfo(Signal* signal,
                        BackupRecordPtr ptr, TablePtr tabPtr, Uint32 fragNo)
{
  /**
   * For every not-yet-scanned fragment of every table, ask DIH (via
   * direct-executed DIGETNODESREQ) which node and LQH instance own the
   * fragment. Resumable: re-entered via CONTINUEB at (tabPtr, fragNo)
   * after a real-time break.
   */
  Uint32 loopCount = 0;
  jam();

  for(; tabPtr.i != RNIL; ptr.p->tables.next(tabPtr)) {
    jam();
    const Uint32 fragCount = tabPtr.p->fragments.getSize();
    for(; fragNo < fragCount; fragNo ++) {
      jam();
      FragmentPtr fragPtr;
      tabPtr.p->fragments.getPtr(fragPtr, fragNo);

      if(fragPtr.p->scanned == 0 && fragPtr.p->scanning == 0) {
        jam();
        DiGetNodesReq * const req = (DiGetNodesReq *)&signal->theData[0];
        req->tableId = tabPtr.p->tableId;
        // With distr_key_indicator set, hashValue is the fragment id.
        req->hashValue = fragNo;
        req->distr_key_indicator = ZTRUE;
        req->anyNode = 0;
        req->scan_indicator = ZTRUE;
        req->jamBufferPtr = jamBuffer();
        req->get_next_fragid_indicator = 0;
        EXECUTE_DIRECT_MT(DBDIH, GSN_DIGETNODESREQ, signal,
                          DiGetNodesReq::SignalLength, 0);
        jamEntry();
        DiGetNodesConf * conf = (DiGetNodesConf *)&signal->theData[0];
        Uint32 reqinfo = conf->reqinfo;
        Uint32 nodeId = conf->nodes[0];
        /* Require successful read of table fragmentation */
        ndbrequire(conf->zero == 0);
        // LQH instance key is packed into bits 24..30 of reqinfo.
        Uint32 instanceKey = (reqinfo >> 24) & 127;
        fragPtr.p->lqhInstanceKey = instanceKey;
        fragPtr.p->node = nodeId;
        // Bound the work per signal; resume via CONTINUEB.
        if (++loopCount >= DiGetNodesReq::MAX_DIGETNODESREQS ||
            ERROR_INSERTED(10046))
        {
          jam();
          if (ERROR_INSERTED(10046))
          {
            CLEAR_ERROR_INSERT_VALUE;
          }
          signal->theData[0] = BackupContinueB::ZGET_NEXT_FRAGMENT;
          signal->theData[1] = ptr.i;
          signal->theData[2] = tabPtr.p->tableId;
          signal->theData[3] = fragNo + 1;
          sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
          return;
        }
      }//if
    }//for

    // Table complete: return the scan cookie to DIH.
    DihScanTabCompleteRep*rep= (DihScanTabCompleteRep*)signal->getDataPtrSend();
    rep->tableId = tabPtr.p->tableId;
    rep->scanCookie = tabPtr.p->m_scan_cookie;
    rep->jamBufferPtr = jamBuffer();
    EXECUTE_DIRECT_MT(DBDIH, GSN_DIH_SCAN_TAB_COMPLETE_REP, signal,
                      DihScanTabCompleteRep::SignalLength, 0);

    fragNo = 0;
  }//for


  getFragmentInfoDone(signal, ptr);
}
7448
7449 void
getFragmentInfoDone(Signal * signal,BackupRecordPtr ptr)7450 Backup::getFragmentInfoDone(Signal* signal, BackupRecordPtr ptr)
7451 {
7452 ptr.p->m_gsn = GSN_DEFINE_BACKUP_CONF;
7453 ptr.p->slaveState.setState(DEFINED);
7454 DefineBackupConf * conf = (DefineBackupConf*)signal->getDataPtrSend();
7455 conf->backupPtr = ptr.i;
7456 conf->backupId = ptr.p->backupId;
7457 sendSignal(ptr.p->senderRef, GSN_DEFINE_BACKUP_CONF, signal,
7458 DefineBackupConf::SignalLength, JBB);
7459 }
7460
7461
7462 /*****************************************************************************
7463 *
 * Slave functionality: Start backup
7465 *
7466 *****************************************************************************/
void
Backup::execSTART_BACKUP_REQ(Signal* signal)
{
  /**
   * Start phase of a backup on this participant: start the file
   * writer threads and install the log-recording triggers in TUP.
   */
  jamEntry();

  CRASH_INSERTION((10015));

  StartBackupReq* req = (StartBackupReq*)signal->getDataPtr();
  const Uint32 ptrI = req->backupPtr;

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  ptr.p->slaveState.setState(STARTED);
  ptr.p->m_gsn = GSN_START_BACKUP_REQ;

  /* At this point, we are effectively starting
   * bulk file writes for this backup, so lets
   * record the fact
   */
  Backup::g_is_single_thr_backup_running = true;

  /**
   * Start file threads... (one per file not already served by a thread)
   */
  BackupFilePtr filePtr;
  for(ptr.p->files.first(filePtr); filePtr.i!=RNIL;ptr.p->files.next(filePtr))
  {
    jam();
    if(! (filePtr.p->m_flags & BackupFile::BF_FILE_THREAD))
    {
      jam();
      filePtr.p->m_flags |= BackupFile::BF_FILE_THREAD;
      signal->theData[0] = BackupContinueB::START_FILE_THREAD;
      signal->theData[1] = filePtr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    }//if
  }//for

  /* A backup needs to be restored to a consistent point, for
   * which it uses a fuzzy scan and a log.
   *
   * The fuzzy scan is restored, and then the log is replayed
   * idempotently up to some consistent point which is after
   * (SNAPSHOTEND) or before (SNAPSHOTSTART) any of the states
   * captured in the scan.
   *
   * This requires that the backup is captured in order :
   * 1) Start recording logs of all committed transactions
   * 2) Choose SNAPSHOTSTART consistent point
   * 3) Perform data scan
   * 4) Choose SNAPSHOTEND consistent point
   * 5) Stop recording logs
   *
   * Tell DBTUP to create triggers to start recording logs
   */
  TablePtr tabPtr;
  ndbrequire(ptr.p->tables.first(tabPtr));
  sendCreateTrig(signal, ptr, tabPtr);
}
7527
7528 /*****************************************************************************
7529 *
 * Slave functionality: Backup fragment
7531 *
7532 *****************************************************************************/
void
Backup::execBACKUP_FRAGMENT_REQ(Signal* signal)
{
  /**
   * Request to scan and save one fragment. Dispatches to the LCP
   * path (start_execute_lcp / start_lcp_scan) or runs the backup
   * path: init the data file operation and send SCAN_FRAGREQ.
   */
  jamEntry();
  BackupFragmentReq* req = (BackupFragmentReq*)signal->getDataPtr();

  CRASH_INSERTION((10016));

  const Uint32 ptrI = req->backupPtr;
  //const Uint32 backupId = req->backupId;
  const Uint32 tableId = req->tableId;
  const Uint32 fragNo = req->fragmentNo;
  const Uint32 count = req->count;

  /**
   * Get backup record
   */
  BackupRecordPtr ptr;
  BackupFilePtr filePtr;
  TablePtr tabPtr;

  c_backupPool.getPtr(ptr, ptrI);

  if (ptr.p->is_lcp())
  {
    jam();
    start_execute_lcp(signal, ptr, tabPtr, tableId);
    if (ptr.p->m_empty_lcp)
    {
      /**
       * No need to start LCP processing in this case, we only
       * update LCP control file and this process has already
       * been started when we come here.
       */
      jam();
    }
    else
    {
      jam();
      start_lcp_scan(signal, ptr, tabPtr, ptrI, fragNo);
    }
    return;
  }
  else
  {
    jam();
    /* Backup path */
    if (ERROR_INSERTED(10039))
    {
      // Test hook: stall the request by re-sending it with a delay.
      sendSignalWithDelay(reference(), GSN_BACKUP_FRAGMENT_REQ, signal,
                          300, signal->getLength());
      return;
    }
    /* Get Table */
    ndbrequire(findTable(ptr, tabPtr, tableId));
  }
  c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[0]);

  ptr.p->slaveState.setState(SCANNING);
  ptr.p->m_gsn = GSN_BACKUP_FRAGMENT_REQ;

  ndbrequire(filePtr.p->backupPtr == ptrI);

  /* Get fragment */
  FragmentPtr fragPtr;
  tabPtr.p->fragments.getPtr(fragPtr, fragNo);

  // A fragment must not be scanned twice; "scanning" may already be set
  // on the master node re-dispatching to itself.
  ndbrequire(fragPtr.p->scanned == 0);
  ndbrequire(fragPtr.p->scanning == 0 ||
             refToNode(ptr.p->masterRef) == getOwnNodeId());

  /**
   * Init operation (only when switching to a new table)
   */
  if (filePtr.p->tableId != tableId)
  {
    jam();
    DEB_EXTRA_LCP(("(%u)Init new tab(%u): maxRecordSize: %u",
                   instance(),
                   tableId,
                   tabPtr.p->maxRecordSize));
    filePtr.p->operation.init(tabPtr);
    filePtr.p->tableId = tableId;
  }//if

  /**
   * Check for space in buffer; if the disk buffer is full, retry the
   * whole request after a delay (count tracks the retries).
   */
  if(!filePtr.p->operation.newFragment(tableId, fragPtr.p->fragmentId)) {
    jam();
    ndbrequire(!ptr.p->is_lcp());
    req->count = count + 1;
    sendSignalWithDelay(reference(), GSN_BACKUP_FRAGMENT_REQ, signal,
                        WaitDiskBufferCapacityMillis,
                        signal->length());
    ptr.p->slaveState.setState(STARTED);
    return;
  }//if

  /**
   * Mark things as "in use"
   */
  fragPtr.p->scanning = 1;
  filePtr.p->fragmentNo = fragPtr.p->fragmentId;
  filePtr.p->m_retry_count = 0;

  ndbrequire(filePtr.p->m_flags ==
             (BackupFile::BF_OPEN | BackupFile::BF_FILE_THREAD));
  sendScanFragReq(signal, ptr, filePtr, tabPtr, fragPtr, 0);
}
7643
void
Backup::start_lcp_scan(Signal *signal,
                       BackupRecordPtr ptr,
                       TablePtr tabPtr,
                       Uint32 ptrI,
                       Uint32 fragNo)
{
  /**
   * Begin the LCP scan of one fragment: tell TUP the scan starts,
   * initialize the first data file, open any extra LCP data files
   * needed, and finally send SCAN_FRAGREQ.
   */
  BackupFilePtr filePtr;
  FragmentPtr fragPtr;

  DEB_EXTRA_LCP(("(%u)Start lcp scan",
                instance()));

  ptr.p->slaveState.setState(SCANNING);
  ptr.p->m_gsn = GSN_BACKUP_FRAGMENT_REQ;

  /* Get fragment */
  tabPtr.p->fragments.getPtr(fragPtr, fragNo);

  c_tup->start_lcp_scan(tabPtr.p->tableId,
                        fragPtr.p->fragmentId,
                        ptr.p->m_lcp_max_page_cnt);
  ptr.p->m_is_lcp_scan_active = true;
  ptr.p->m_lcp_current_page_scanned = 0;

  /**
   * Now the LCP have started for this fragment. The following
   * things have to be done in the same real-time break.
   *
   * 1) Write an LCP entry into the UNDO log.
   * 2) Get number of pages to checkpoint.
   * 3) Inform TUP that LCP scan have started
   *
   * It is not absolutely necessary to start the actual LCP scan
   * in the same real-time break. We use this opportunity to open
   * any extra LCP files that this LCP needs. If only one is needed
   * it has already been opened and we can proceed immediately.
   * However large fragments that have seen large number of writes
   * since the last LCP can require multiple LCP files. These
   * extra LCP files are opened before we actually start the
   * LCP scan.
   */

  ndbrequire(fragPtr.p->scanned == 0);
  ndbrequire(fragPtr.p->scanning == 0 ||
             refToNode(ptr.p->masterRef) == getOwnNodeId());

  // Data file numbers for this LCP are consecutive (with wrap-around
  // handled by get_file_add).
  ptr.p->m_last_data_file_number =
    get_file_add(ptr.p->m_first_data_file_number,
                 ptr.p->m_num_lcp_files - 1);

  init_file_for_lcp(signal, 0, ptr, ptrI);
  if (ptr.p->m_num_lcp_files > 1)
  {
    jam();
    // More files required: open them first; the scan starts when the
    // opens complete.
    for (Uint32 i = 1; i < ptr.p->m_num_lcp_files; i++)
    {
      jam();
      lcp_open_data_file_late(signal, ptr, i);
    }
    return;
  }
  c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[0]);
  Uint32 delay = 0;
  if (ERROR_INSERTED(10047))
  {
    g_eventLogger->info("(%u)Start LCP on tab(%u,%u) 3 seconds delay, max_page: %u",
                        instance(),
                        tabPtr.p->tableId,
                        fragPtr.p->fragmentId,
                        ptr.p->m_lcp_max_page_cnt);

    if (ptr.p->m_lcp_max_page_cnt > 20)
    {
      delay = 9000;
    }
  }
  sendScanFragReq(signal, ptr, filePtr, tabPtr, fragPtr, delay);
}
7723
void
Backup::init_file_for_lcp(Signal *signal,
                          Uint32 index,
                          BackupRecordPtr ptr,
                          Uint32 ptrI)
{
  /**
   * Prepare LCP data file number `index` for writing the current
   * fragment: init the file operation, reset per-fragment counters
   * and start its file writer thread.
   */
  TablePtr tabPtr;
  FragmentPtr fragPtr;
  BackupFilePtr filePtr;
  // An LCP works on exactly one table/fragment at a time.
  ptr.p->tables.first(tabPtr);
  tabPtr.p->fragments.getPtr(fragPtr, 0);

  c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[index]);
  ndbrequire(filePtr.p->backupPtr == ptrI);

  /**
   * Init operation
   */
  DEB_EXTRA_LCP(("(%u)Init new tab(%u): maxRecordSize: %u",
                 instance(),
                 tabPtr.p->tableId,
                 tabPtr.p->maxRecordSize));
  filePtr.p->operation.init(tabPtr);
  filePtr.p->tableId = tabPtr.p->tableId;

  /**
   * Mark things as "in use" and clear per-fragment statistics.
   */
  fragPtr.p->scanning = 1;
  filePtr.p->m_retry_count = 0;
  filePtr.p->m_lcp_inserts = 0;
  filePtr.p->m_lcp_writes = 0;
  filePtr.p->m_lcp_delete_by_rowids = 0;
  filePtr.p->m_lcp_delete_by_pageids = 0;

  filePtr.p->fragmentNo = 0;

  ndbrequire(filePtr.p->operation.newFragment(tabPtr.p->tableId,
                                              fragPtr.p->fragmentId));

  /**
   * Start file thread now that we will start writing also
   * fragment checkpoint data.
   */
  ndbrequire(filePtr.p->m_flags == BackupFile::BF_OPEN);
  filePtr.p->m_flags |= BackupFile::BF_FILE_THREAD;

  signal->theData[0] = BackupContinueB::START_FILE_THREAD;
  signal->theData[1] = filePtr.i;
  signal->theData[2] = __LINE__;
  sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
}
7776
7777 /**
7778 * Backups and LCPs are actions that operate on a long time-scale compared to
7779 * other activities in the cluster. We also have a number of similar
7780 * activities that operate on a longer time scale. These operations have to
7781 * continue to operate at some decent level even if user transactions are
7782 * arriving at extreme rates.
7783 *
7784 * Not providing sufficient activity for LCPs might mean that we run out of
7785 * REDO log, this means that no writing user transactions are allowed until
7786 * we have completed an LCP. Clearly this is not a desirable user experience.
7787 * So we need to find a balance between long-term needs and short-term needs
7788 * in scheduling LCPs and Backups versus normal user transactions.
7789 *
7790 * When designing those scheduling algorithms we need to remember the design
7791 * aim for the NDB storage engine. We want to ensure that NDB can be used in
7792 * soft real-time applications such as financial applications, telecom
7793 * applications. We do not aim for hard real-time applications such as
7794 * controlling power plants where missing a deadline can lead to major
 * catastrophes.
7796 *
7797 * Using NDB for a soft real-time application can still be done at different
7798 * levels of real-time requirements. If the aim is to provide that more or
7799 * less 100% of the transactions complete in say 100 microseconds then a
7800 * certain level of control is needed also from the application.
7801 *
7802 * Things that will affect scheduling in NDB are:
7803 * 1) Use of large rows
7804 * NDB will schedule at least one row at a time. There are currently very
7805 * few places where execution of one row operation contains breaks for
7806 * scheduling. Executing a row operation on the maximum row size of
7807 * around 14 kBytes means that signals can execute for up to about 20
7808 * microseconds as of 2018. Clearly using smaller rows can give a better
7809 * response time experience.
7810 *
7811 * 2) Using complex conditions per row
7812 * NDB supports pushing down conditions on rows in both key operations and
7813 * scan operations and even on join operations. Clearly if these pushed
7814 * conditions are very complex the time to execute them per row can extend
7815 * the time spent in executing one particular signal. Normal conditions
7816 * involving one or a number of columns doesn't present a problem but
7817 * SQL have no specific limits on conditions, so extremely complex
7818 * conditions are possible to construct.
7819 *
7820 * 3) Metadata operations
7821 * Creating tables, indexes can contain some operations that take a bit
7822 * longer to execute. However using the multi-threaded data nodes (ndbmtd)
7823 * means that most of these signals are executed in threads that are not
7824 * used for normal user transactions. So using ndbmtd is here a method to
7825 * decrease impact of response time of metadata operations.
7826 *
7827 * 4) Use of ndbd vs ndbmtd
7828 * ndbd is a single threaded data node, ndbd does receive data, operate on
7829 * the data and send the data all in one thread. In low load cases with
7830 * very high requirements on response time and strict control of the
7831 * application layer the use of ndbd for real-time operation can be
7832 * beneficial.
7833 *
7834 * Important here is to understand that the single-threaded nature of ndbd
7835 * means that it is limited in throughput. One data node using ndbd is
7836 * limited to handling on the order of 100.000 row operations per second
7837 * with maintained responsiveness as of 2015. ndbmtd can achieve a few
7838 * million row operations in very large configurations with maintained
7839 * responsiveness.
7840 *
7841 * When looking at maintaining a balance between various operations long-term
7842 * it is important to consider what types of operations that can go in parallel
7843 * in an NDB data node. These are the activities currently possible.
7844 *
7845 * 1) Normal user transactions
7846 * These consist of primary key row operations, unique key row operations
7847 * (these are implemented as two primary key row operations), scan operations
7848 * and finally a bit more complex operations that can have both key
7849 * operations and scan operations as part of them. The last category is
7850 * created as part of executing SPJ operation trees that currently is used
7851 * for executing complex SQL queries.
7852 *
7853 * 2) Local checkpoints (LCPs)
 * These can operate continuously without user interaction. The LCPs are
 * needed to ensure that we can cut the REDO log. If LCPs execute too slowly
 * then we won't have sufficient REDO log to store all user transactions that
 * are writing on logging tables.
7858 *
7859 * 3) Backups
7860 * These are started by a user, only one backup at a time is allowed. These
7861 * can be stored offsite and used by the user to restore NDB to a former
7862 * state, either as an emergency fix, it can also be used to start up a
7863 * new cluster or as part of setting up a slave cluster. A backup consists
7864 * of a data file per data node and one log file of changes since the backup
7865 * started and a control file. It is important that the backup maintains a
7866 * level of speed such that the system doesn't run out of disk space for the
7867 * log file.
7868 *
7869 * 4) Metadata operations
7870 * There are many different types of metadata operations. One can define
7871 * new tables, indexes, foreign keys, tablespaces. One can also rearrange
7872 * the tables for a new number of nodes as part of adding nodes to the
7873 * cluster. There are also operations to analyse tables, optimise tables
7874 * and so forth. Most of these are fairly short in duration and usage of
7875 * resources. But there are a few of them such as rearranging tables for
7876 * a new set of nodes that require shuffling data around in the cluster.
7877 * This can be a fairly long-running operation.
7878 *
7879 * 5) Event operations
7880 * To support replication from one MySQL Cluster to another MySQL Cluster
7881 * or a different MySQL storage engine we use event operations.
7882 * These operate always as part of the normal user transactions, so they
7883 * do not constitute anything to consider in the balance between long-term
7884 * and short-term needs. In addition in ndbmtd much of the processing happens
7885 * in a special thread for event operations.
7886 *
7887 * 6) Node synchronisation during node recovery
7888 * Recovery as such normally happens when no user transactions are happening
7889 * so thus have no special requirements on maintaining a balance between
7890 * short-term needs and long-term needs since recovery is always a long-term
7891 * operation that has no competing short-term operations. There is however
7892 * one exception to this and this is during node recovery when the starting
7893 * node needs to synchronize its data with a live node. In this case the
7894 * starting node has recovered an old version of the data node using LCPs
7895 * and REDO logs and have rebuilt the indexes. At this point it needs to
7896 * synchronize the data in each table with a live node within the same node
7897 * group.
7898 *
 * This synchronization happens row by row controlled by the live node. The
 * live node scans its own data and compares each row against the global
 * checkpoint id (GCI) that the starting node has restored. If the row has
 * been updated with a more recent GCI then the row needs to be sent over to
 * the starting node.
7904 *
7905 * Only one node recovery per node group at a time is possible when using
7906 * two replicas.
7907 *
7908 * So there can be as many as 4 long-term operations running in parallel to
7909 * the user transactions. These are 1 LCP scan, 1 Backup scan, 1 node recovery
7910 * scan and finally 1 metadata scan. All of these long-running operations
7911 * perform scans of table partitions (fragments). LCPs scan a partition and
7912 * write rows into a LCP file. Backups scan a partition and write its result
7913 * into a backup file. Node recovery scans searches for rows that have been
7914 * updated since the GCI recovered in the starting node and for each row
7915 * found it is sent over to the starting node. Metadata scans for either
7916 * all rows or using some condition and then can use this information to
7917 * send the row to another node, to build an index, to build a foreign key
7918 * index or other online operation which is performed in parallel to user
7919 * transactions.
7920 *
7921 * From this analysis it's clear that we don't want any long-running operation
7922 * to consume any major part of the resources. It's desirable that user
7923 * transactions can use at least about half of the resources even when running
7924 * in parallel with all four of those activities. Node recovery is slightly
7925 * more important than the other activities, this means that our aim should
7926 * be to ensure that LCPs, Backups and metadata operations can at least use
7927 * about 10% of the CPU resources and that node recovery operations can use
 * at least about 20% of the CPU resources. Obviously they should be able to
 * use more resources when there are fewer user transactions competing for
 * the resources. But we should try to maintain this level of CPU usage for
 * LCPs
7931 * and Backups even when the user load is at extreme levels.
7932 *
7933 * There is no absolute way of ensuring 10% CPU usage for a certain activity.
7934 * We use a number of magic numbers controlling the algorithms to ensure this.
7935 *
7936 * At first we use the coding rule that one signal should never execute for
7937 * more than 10 microseconds in the normal case. There are exceptions to this
7938 * rule as explained above, but it should be outliers that won't affect the
7939 * long-term rates very much.
7940 *
7941 * Second we use the scheduling classes we have access to. The first is B-level
7942 * signals, these can have an arbitrary long queue of other jobs waiting before
7943 * they are executed, so these have no bound on when they execute. We also
7944 * have special signals that execute with a bounded delay, in one signal they
7945 * can be delayed more than a B-level signal, but the scheduler ensures that
7946 * at most 100 B-level signals execute before they are executed. Normally it
7947 * would even operate with at most 75 B-level signals executed even in high
7948 * load scenarios and mostly even better than that. We achieve this by calling
7949 * sendSignalWithDelay with timeout BOUNDED_DELAY.
7950 *
7951 * So how fast can an LCP run that is using about 10% of the CPU. In a fairly
7952 * standard CPU of 2015, not a high-end, but also not at the very low-end,
7953 * the CPU can produce about 150 MBytes of data for LCPs per second. This is
7954 * using 100 byte rows. So this constitutes about 1.5M rows per second plus
7955 * transporting 150 MBytes of data to the write buffers in the Backup block.
 * So we use a formula here where we assume that the fixed cost of scanning
 * a row is about 550 ns and cost per word of data is 4 ns. The reason we use
 * a different formula for LCP scans compared to the formula we assume in
7959 * DBLQH for generic scans is that the copy of data is per row for LCPs
7960 * whereas it is per column for generic scans. Similarly we never use any
7961 * scan filters for LCPs, we only check for LCP_SKIP bits and FREE bits.
7962 * This is much more efficient compared to generic scan filters.
7963 *
7964 * At very high load we will assume that we have to wait about 50 signals
7965 * when sending BOUNDED_DELAY signals. Worst case can be up to about 100
7966 * signals, but the worst case won't happen very often and more common
7967 * will be much less than that.
7968 * The mean execution time of signals are about 5 microseconds. This means
7969 * that by constantly using bounded delay signals we ensure that we get at
7970 * least around 4000 executions per second. So this means that
7971 * in extreme overload situations we can allow for execution to go on
7972 * for up to about 25 microseconds without giving B-level signals access.
7973 * 25 microseconds times 4000 is 100 milliseconds so about 10% of the
7974 * CPU usage.
7975 *
7976 * LCPs and Backups also operate using conditions on how fast they can write
7977 * to the disk subsystem. The user can configure these numbers, the LCPs
7978 * and Backups gets a quota per 100 millisecond. So if the LCPs and Backups
7979 * runs too fast they will pause a part of those 100 milliseconds. However
7980 * it is a good idea to set the minimum disk write speed to at least 20%
7981 * of the possible CPU speed. So this means setting it to 30 MByte per
7982 * second. In high-load scenarios we might not be able to process more
7983 * than 15 MByte per second, but as soon as user load and other load
7984 * goes down we will get back to the higher write speed.
7985 *
7986 * Scans operate in the following fashion which is an important input to
7987 * the construction of the magic numbers. We start a scan with SCAN_FRAGREQ
7988 * and here we don't really know the row sizes other than the maximum row
7989 * size. This SCAN_FRAGREQ will return 16 rows and then it will return
7990 * SCAN_FRAGCONF. For each row it will return a TRANSID_AI signal.
7991 * If we haven't used our quota for writing LCPs and Backups AND there is
7992 * still room in the backup write buffer then we will continue with another
7993 * set of 16 rows. These will be retrieved using the SCAN_NEXTREQ signal
7994 * and the response to this signal will be SCAN_FRAGCONF when done with the
7995 * 16 rows (or all rows scanned).
7996 *
7997 * Processing 16 rows takes about 8800 ns on standard HW of 2015 and so even
7998 * for minimal rows we will use at least 10000 ns if we execute an entire batch
7999 * of 16 rows without providing access for other B-level signals. So the
8000 * absolute maximum amount of rows that we will ever execute without
8001 * giving access for B-level signals are 32 rows so that we don't go beyond
8002 * the allowed quota of 25 microsecond without giving B-level priority signal
8003 * access, this means two SCAN_FRAGREQ/SCAN_NEXTREQ executions.
8004 *
8005 * Using the formula we derive that we should never start another set of
8006 * 16 rows if we have passed 1500 words in the previous batch of 16 rows.
8007 * Even when deciding in the Backup block to send an entire batch of 16
8008 * rows at A-level we will never allow to continue gathering when we have
8009 * already gathered more than 4000 words. When we reach this limit we will
8010 * send another bounded delay signal. The reason is that we've already
8011 * reached sufficient CPU usage and going further would go beyond 15%.
8012 *
8013 * The boundary 1500 and 4000 is actually based on using 15% of the CPU
8014 * resources which is better if not all four activities happen at the
8015 * same time. When we support rate control on all activities we need to
8016 * adaptively decrease this limit to ensure that the total rate controlled
8017 * efforts doesn't go beyond 50%.
8018 *
 * The limit 4000 is ZMAX_WORDS_PER_SCAN_BATCH_HIGH_PRIO set in DblqhMain.cpp.
 * This constant limits the impact of wide rows on responsiveness.
8021 *
8022 * When operating in normal mode, we will not continue gathering when we
8023 * already gathered at least 500 words. However we will only operate in
8024 * this mode when we are in low load scenario in which case this speed will
8025 * be quite sufficient. This limit is to ensure that we don't go beyond
8026 * normal real-time break limits in normal operations. This limits LCP
8027 * execution during normal load to around 3-4 microseconds.
8028 *
8029 * In the following paragraph a high priority of LCPs means that we need to
8030 * raise LCP priority to maintain LCP write rate at the expense of user
8031 * traffic responsiveness. Low priority means that we can get sufficient
8032 * LCP write rates even with normal responsiveness to user requests.
8033 *
8034 * Finally we have to make a decision when we should execute at high priority
8035 * and when operating at normal priority. Obviously we should avoid entering
8036 * high priority mode as much as possible since it will affect response times.
8037 * At the same time once we have entered this mode we need to have some
8038 * memory of it. The reason is that we will have lost some ground while
8039 * executing at normal priority when the job buffers were long. We will limit
8040 * the memory to at most 16 executions of 16 rows at high priority. Each
8041 * time we start a new execution we will see if we need to add to this
8042 * "memory". We will add one per 48 signals that we had to wait for between
8043 * executing a set of 16 rows (normally this means execution of 3 bounded
 * delay signals). When the load level is even higher then we will add to
 * the memory such that we operate in high priority mode a bit longer since
 * we are likely to have missed a bit more opportunity to perform LCP scans
 * in this overload situation.
8048 *
8049 * The following "magic" constants control these algorithms:
8050 * 1) ZMAX_SCAN_DIRECT_COUNT set to 5
8051 * Means that at most 6 rows will be scanned per execute direct, set in
8052 * Dblqh.hpp. This applies to all scan types, not only to LCP scans.
8053 *
8054 * 2) ZMAX_WORDS_PER_SCAN_BATCH_LOW_PRIO set to 1600
8055 * This controls the maximum number of words that is allowed to be gathered
8056 * before we decide to do a real-time break when executing at normal
8057 * priority level. This is defined in Backup.hpp. This will execute for about
8058 * 2 microseconds.
8059 *
8060 * 3) ZMAX_WORDS_PER_SCAN_BATCH_HIGH_PRIO set to 8000
8061 * This controls the maximum words gathered before we decide to send the
8062 * next row to be scanned in another bounded delay signal. This is defined in
8063 * Backup.hpp. In this case the Backup block decided to execute on priority A
8064 * level due to a high load in the node. This limit is set to execute for about
8065 * 10 microseconds (around 300 MBytes can be written per second per CPU).
8066 * LCPs can override this limit with a multiplication factor of
8067 * m_redo_alert_factor.
8068 *
8069 * We will always use the priority A-level when the REDO log limit has been
8070 * reached to ensure that we execute proper batches already when seeing the
8071 * first signs of REDO log overload.
8072 *
8073 * 4) MAX_LCP_WORDS_PER_BATCH no longer used
8074 *
8075 * 5) HIGH_LOAD_LEVEL set to 32
8076 * Limit of how many signals have been executed in this LDM thread since
8077 * starting last 16 rows in order to enter high priority mode.
8078 * Defined in this block Backup.cpp.
8079 *
8080 * 6) VERY_HIGH_LOAD_LEVEL set to 48
8081 * For each additional of this we increase the memory. So e.g. with 80 signals
8082 * executed since last we will increase the memory by two, with 128 we will
8083 * increase it by three. Thus if #signals >= (32 + 48) => 2, #signals >=
8084 * (32 + 48 * 2) => 3 and so forth. Memory here means that we will remember
8085 * the high load until we have compensated for it in a sufficient manner, so
8086 * we will retain executing on high priority for a bit longer to compensate
8087 * for what we lost during execution at low priority when load suddenly
8088 * increased.
8089 * Defined in this block Backup.cpp.
8090 *
8091 * 7) MAX_RAISE_PRIO_MEMORY set to 16
8092 * Max memory of priority raising, so after load disappears we will at most
8093 * an additional set of 16*16 rows at high priority mode before going back to
8094 * normal priority mode.
8095 * Defined in this block Backup.cpp.
8096 *
8097 * 8) NUMBER_OF_SIGNALS_PER_SCAN_BATCH set to 3
8098 * When starting up the algorithm we check how many signals are in the
8099 * B-level job buffer. Based on this number we set the initial value to
8100 * high priority or not. This is based on that we expect a set of 16
8101 * rows to be executed in 3 signals with 6 rows, 6 rows and last signal
8102 * 4 rows.
8103 * Defined in this block Backup.cpp.
8104 */
8105
8106 /**
8107 * These routines are more or less our scheduling logic for LCPs. This is
8108 * how we try to achieve a balanced output from LCPs while still
8109 * processing normal transactions at a high rate.
8110 */
init_scan_prio_level(Signal * signal,BackupRecordPtr ptr)8111 void Backup::init_scan_prio_level(Signal *signal, BackupRecordPtr ptr)
8112 {
8113 Uint32 level = getSignalsInJBB();
8114 if ((level * NUMBER_OF_SIGNALS_PER_SCAN_BATCH) > HIGH_LOAD_LEVEL)
8115 {
8116 /* Ensure we use prio A and only 1 signal at prio A */
8117 jam();
8118 level = VERY_HIGH_LOAD_LEVEL;
8119 }
8120 ptr.p->m_lastSignalId = signal->getSignalId() - level;
8121 ptr.p->m_prioA_scan_batches_to_execute = 0;
8122 }
8123
8124 bool
check_scan_if_raise_prio(Signal * signal,BackupRecordPtr ptr)8125 Backup::check_scan_if_raise_prio(Signal *signal, BackupRecordPtr ptr)
8126 {
8127 bool flag = false;
8128 const Uint32 current_signal_id = signal->getSignalId();
8129 const Uint32 lastSignalId = ptr.p->m_lastSignalId;
8130 Uint32 prioA_scan_batches_to_execute =
8131 ptr.p->m_prioA_scan_batches_to_execute;
8132 const Uint32 num_signals_executed = current_signal_id - lastSignalId;
8133
8134 if (num_signals_executed > HIGH_LOAD_LEVEL)
8135 {
8136 jam();
8137 prioA_scan_batches_to_execute+=
8138 ((num_signals_executed + (VERY_HIGH_LOAD_LEVEL - 1)) /
8139 VERY_HIGH_LOAD_LEVEL);
8140 if (prioA_scan_batches_to_execute > MAX_RAISE_PRIO_MEMORY)
8141 {
8142 jam();
8143 prioA_scan_batches_to_execute = MAX_RAISE_PRIO_MEMORY;
8144 }
8145 }
8146 else if (ptr.p->is_lcp() &&
8147 m_redo_alert_state != RedoStateRep::NO_REDO_ALERT)
8148 {
8149 jam();
8150 prioA_scan_batches_to_execute = 1;
8151 }
8152 if (prioA_scan_batches_to_execute > 0)
8153 {
8154 jam();
8155 prioA_scan_batches_to_execute--;
8156 flag = true;
8157 }
8158 ptr.p->m_lastSignalId = current_signal_id;
8159 ptr.p->m_prioA_scan_batches_to_execute = prioA_scan_batches_to_execute;
8160 return flag;
8161 }
8162
8163 void
sendScanFragReq(Signal * signal,Ptr<BackupRecord> ptr,Ptr<BackupFile> filePtr,Ptr<Table> tabPtr,Ptr<Fragment> fragPtr,Uint32 delay)8164 Backup::sendScanFragReq(Signal* signal,
8165 Ptr<BackupRecord> ptr,
8166 Ptr<BackupFile> filePtr,
8167 Ptr<Table> tabPtr,
8168 Ptr<Fragment> fragPtr,
8169 Uint32 delay)
8170 {
8171 /**
8172 * Start scan
8173 */
8174 {
8175 if (!(ptr.p->is_lcp() &&
8176 ptr.p->m_num_lcp_files > 1))
8177 {
8178 jam();
8179 filePtr.p->m_flags |= BackupFile::BF_SCAN_THREAD;
8180 }
8181 else
8182 {
8183 jam();
8184 for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
8185 {
8186 BackupFilePtr loopFilePtr;
8187 c_backupFilePool.getPtr(loopFilePtr, ptr.p->dataFilePtr[i]);
8188 loopFilePtr.p->m_flags |= BackupFile::BF_SCAN_THREAD;
8189 }
8190 }
8191
8192 Table & table = * tabPtr.p;
8193 ScanFragReq * req = (ScanFragReq *)signal->getDataPtrSend();
8194 const Uint32 parallelism = ZRESERVED_SCAN_BATCH_SIZE;
8195
8196 req->senderData = filePtr.i;
8197 req->resultRef = reference();
8198 req->schemaVersion = table.schemaVersion;
8199 req->fragmentNoKeyLen = fragPtr.p->fragmentId;
8200 req->requestInfo = 0;
8201 req->savePointId = 0;
8202 req->tableId = table.tableId;
8203 ScanFragReq::setReadCommittedFlag(req->requestInfo, 1);
8204 ScanFragReq::setLockMode(req->requestInfo, 0);
8205 ScanFragReq::setHoldLockFlag(req->requestInfo, 0);
8206 ScanFragReq::setKeyinfoFlag(req->requestInfo, 0);
8207 ScanFragReq::setTupScanFlag(req->requestInfo, 1);
8208 ScanFragReq::setNotInterpretedFlag(req->requestInfo, 1);
8209 if (ptr.p->is_lcp())
8210 {
8211 ScanFragReq::setScanPrio(req->requestInfo, 1);
8212 ScanFragReq::setNoDiskFlag(req->requestInfo, 1);
8213 ScanFragReq::setLcpScanFlag(req->requestInfo, 1);
8214 NDB_TICKS now = getHighResTimer();
8215 ptr.p->m_scan_start_timer = now;
8216 }
8217 ptr.p->m_num_scan_req_on_prioa = 0;
8218 init_scan_prio_level(signal, ptr);
8219 if (check_scan_if_raise_prio(signal, ptr))
8220 {
8221 jam();
8222 ScanFragReq::setPrioAFlag(req->requestInfo, 1);
8223 ptr.p->m_num_scan_req_on_prioa = 1;
8224 }
8225
8226 req->transId1 = 0;
8227 req->transId2 = (BACKUP << 20) + (getOwnNodeId() << 8);
8228 req->clientOpPtr= filePtr.i;
8229 req->batch_size_rows= parallelism;
8230 req->batch_size_bytes= 0;
8231 BlockReference lqhRef = 0;
8232 bool delay_possible = true;
8233 if (ptr.p->is_lcp()) {
8234 lqhRef = calcInstanceBlockRef(DBLQH);
8235 } else {
8236 const Uint32 instanceKey = fragPtr.p->lqhInstanceKey;
8237 ndbrequire(instanceKey != 0);
8238 lqhRef = numberToRef(DBLQH, instanceKey, getOwnNodeId());
8239 if (lqhRef != calcInstanceBlockRef(DBLQH))
8240 {
8241 /* We can't send delayed signals to other threads. */
8242 delay_possible = false;
8243 }
8244 }
8245
8246 Uint32 attrInfo[25];
8247 memcpy(attrInfo, table.attrInfo, 4*table.attrInfoLen);
8248 LinearSectionPtr ptr[3];
8249 ptr[0].p = attrInfo;
8250 ptr[0].sz = table.attrInfoLen;
8251 if (delay_possible)
8252 {
8253 SectionHandle handle(this);
8254 ndbrequire(import(handle.m_ptr[0], ptr[0].p, ptr[0].sz));
8255 handle.m_cnt = 1;
8256 if (delay == 0)
8257 {
8258 jam();
8259 sendSignalWithDelay(lqhRef, GSN_SCAN_FRAGREQ, signal,
8260 BOUNDED_DELAY, ScanFragReq::SignalLength, &handle);
8261 }
8262 else
8263 {
8264 jam();
8265 sendSignalWithDelay(lqhRef, GSN_SCAN_FRAGREQ, signal,
8266 delay, ScanFragReq::SignalLength, &handle);
8267 }
8268 }
8269 else
8270 {
8271 /**
8272 * There is no way to send signals over to another thread at a rate
8273 * level at the moment. So we send at priority B, but the response
8274 * back to us will arrive at Priority A if necessary.
8275 */
8276 jam();
8277 sendSignal(lqhRef,
8278 GSN_SCAN_FRAGREQ,
8279 signal,
8280 ScanFragReq::SignalLength,
8281 JBB,
8282 ptr,
8283 1);
8284 }
8285 }
8286 }
8287
void
Backup::execSCAN_HBREP(Signal* signal)
{
  /**
   * Scan heartbeat report received during an ongoing scan.
   * No action is taken here; the signal is simply consumed and the
   * entry recorded in the jam trace.
   */
  jamEntry();
}
8293
8294 void
record_deleted_pageid(Uint32 pageNo,Uint32 record_size)8295 Backup::record_deleted_pageid(Uint32 pageNo, Uint32 record_size)
8296 {
8297 BackupRecordPtr ptr;
8298 BackupFilePtr zeroFilePtr;
8299 BackupFilePtr currentFilePtr;
8300 ptr = m_lcp_ptr;
8301 c_backupFilePool.getPtr(zeroFilePtr, ptr.p->dataFilePtr[0]);
8302 c_backupFilePool.getPtr(currentFilePtr, ptr.p->m_working_data_file_ptr);
8303 OperationRecord & current_op = currentFilePtr.p->operation;
8304 OperationRecord & zero_op = zeroFilePtr.p->operation;
8305 ndbrequire(ptr.p->m_num_parts_in_this_lcp != BackupFormat::NDB_MAX_LCP_PARTS);
8306 Uint32 * dst = current_op.dst;
8307 Uint32 dataLen = 2;
8308 Uint32 copy_array[2];
8309 copy_array[0] = pageNo;
8310 copy_array[1] = record_size;
8311 DEB_LCP_DEL(("(%u) DELETE_BY_PAGEID: page(%u)",
8312 instance(),
8313 pageNo));
8314 *dst = htonl(Uint32(dataLen + (BackupFormat::DELETE_BY_PAGEID_TYPE << 16)));
8315 memcpy(dst + 1, copy_array, dataLen*sizeof(Uint32));
8316 ndbrequire(dataLen < zero_op.maxRecordSize);
8317 zeroFilePtr.p->m_lcp_delete_by_pageids++;
8318 zero_op.finished(dataLen);
8319 current_op.newRecord(dst + dataLen + 1);
8320 ptr.p->noOfRecords++;
8321 ptr.p->noOfBytes += (4*(dataLen + 1));
8322 ptr.p->m_bytes_written += (4*(dataLen + 1));
8323 /**
8324 * LCP keep pages are handled out of order, so here we have prepared before
8325 * calling NEXT_SCANCONF by temporarily changing the current data file used.
8326 * Since scans use deep call chaining we restore the current data file
8327 * immediately after each row written into the LCP data file. Same happens
8328 * also for TRANSID_AI and record_deleted_rowid.
8329 */
8330 restore_current_page(ptr);
8331 }
8332
8333 void
record_deleted_rowid(Uint32 pageNo,Uint32 pageIndex,Uint32 gci)8334 Backup::record_deleted_rowid(Uint32 pageNo, Uint32 pageIndex, Uint32 gci)
8335 {
8336 BackupRecordPtr ptr;
8337 BackupFilePtr zeroFilePtr;
8338 BackupFilePtr currentFilePtr;
8339 ptr = m_lcp_ptr;
8340 c_backupFilePool.getPtr(zeroFilePtr, ptr.p->dataFilePtr[0]);
8341 c_backupFilePool.getPtr(currentFilePtr, ptr.p->m_working_data_file_ptr);
8342 OperationRecord & current_op = currentFilePtr.p->operation;
8343 OperationRecord & zero_op = zeroFilePtr.p->operation;
8344 ndbrequire(ptr.p->m_num_parts_in_this_lcp != BackupFormat::NDB_MAX_LCP_PARTS);
8345 Uint32 * dst = current_op.dst;
8346 Uint32 dataLen = 3;
8347 Uint32 copy_array[3];
8348 copy_array[0] = pageNo;
8349 copy_array[1] = pageIndex;
8350 copy_array[2] = gci;
8351 DEB_LCP_DEL(("(%u) DELETE_BY_ROWID: row(%u,%u)",
8352 instance(),
8353 pageNo,
8354 pageIndex));
8355 *dst = htonl(Uint32(dataLen + (BackupFormat::DELETE_BY_ROWID_TYPE << 16)));
8356 memcpy(dst + 1, copy_array, dataLen*sizeof(Uint32));
8357 ndbrequire(dataLen < zero_op.maxRecordSize);
8358 zeroFilePtr.p->m_lcp_delete_by_rowids++;
8359 zero_op.finished(dataLen);
8360 current_op.newRecord(dst + dataLen + 1);
8361 ptr.p->noOfRecords++;
8362 ptr.p->noOfBytes += (4*(dataLen + 1));
8363 ptr.p->m_bytes_written += (4*(dataLen + 1));
8364 restore_current_page(ptr);
8365 }
8366
void
Backup::execTRANSID_AI(Signal* signal)
{
  /**
   * One row of scanned data arriving from DBLQH/DBTUP.
   * For LCPs the row is written as an INSERT (ALL ROWS page) or WRITE
   * (CHANGED ROWS page) record into the current working LCP data file.
   * For backups the row is appended to the backup data file buffer,
   * either from inline signal data or from a long signal section.
   */
  jamEntryDebug();

  const Uint32 filePtrI = signal->theData[0];
  //const Uint32 transId1 = signal->theData[1];
  //const Uint32 transId2 = signal->theData[2];
  Uint32 dataLen = signal->length() - 3;

  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, filePtrI);

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  OperationRecord & op = filePtr.p->operation;
  if (ptr.p->is_lcp())
  {
    /* LCP path: write through the currently selected working data file. */
    BackupFilePtr currentFilePtr;
    c_backupFilePool.getPtr(currentFilePtr, ptr.p->m_working_data_file_ptr);
    OperationRecord & current_op = currentFilePtr.p->operation;
    Uint32 * dst = current_op.dst;
    Uint32 header;
    if (ptr.p->m_working_changed_row_page_flag)
    {
      /* LCP for CHANGED ROWS pages */
      jam();
      header = dataLen + (BackupFormat::WRITE_TYPE << 16);
      filePtr.p->m_lcp_writes++;
    }
    else
    {
      /* LCP for ALL ROWS pages */
      jam();
      header = dataLen + (BackupFormat::INSERT_TYPE << 16);
      filePtr.p->m_lcp_inserts++;
    }
    ptr.p->noOfRecords++;
    ptr.p->noOfBytes += (4*(dataLen + 1));
    ptr.p->m_bytes_written += (4*(dataLen + 1));
#ifdef VM_TRACE
    Uint32 th = signal->theData[4];
    ndbassert(! (th & 0x00400000)); /* Is MM_GROWN set */
#endif
    /* LCP row data always arrives inline, never in sections. */
    ndbrequire(signal->getNoOfSections() == 0);
    const Uint32 * src = &signal->theData[3];
    * dst = htonl(header);
    memcpy(dst + 1, src, 4*dataLen);
#ifdef DEBUG_LCP_ROW
    TablePtr debTabPtr;
    FragmentPtr fragPtr;
    ptr.p->tables.first(debTabPtr);
    debTabPtr.p->fragments.getPtr(fragPtr, 0);
    g_eventLogger->info("(%u) tab(%u,%u) Write row(%u,%u) into LCP, bits: %x",
                        instance(),
                        debTabPtr.p->tableId,
                        fragPtr.p->fragmentId,
                        src[0],
                        src[1],
                        src[3]);
#endif
    if (unlikely(dataLen >= op.maxRecordSize))
    {
      /* Row exceeds the agreed maximum record size: fatal error. */
      g_eventLogger->info("dataLen: %u, op.maxRecordSize = %u, header: %u",
                          dataLen, op.maxRecordSize, header);
      jamLine(dataLen);
      jamLine(op.maxRecordSize);
      ndbabort();
    }
    op.finished(dataLen);
    current_op.newRecord(dst + dataLen + 1);
    /* Undo any temporary working-file redirect (LCP keep list rows). */
    restore_current_page(ptr);
  }
  else
  {
    /* Backup handling */
    Uint32 * dst = op.dst;
    Uint32 header = dataLen;
    if (signal->getNoOfSections() == 0)
    {
      /* Row data inline in the signal. */
      jam();
      const Uint32 * src = &signal->theData[3];
      * dst = htonl(header);
      memcpy(dst + 1, src, 4*dataLen);
    }
    else
    {
      /* Row data delivered as a long signal section. */
      jam();
      SectionHandle handle(this, signal);
      SegmentedSectionPtr dataPtr;
      handle.getSection(dataPtr, 0);
      dataLen = dataPtr.sz;

      * dst = htonl(dataLen);
      copy(dst + 1, dataPtr);
      releaseSections(handle);
    }
    ptr.p->m_bytes_written += (4*(dataLen + 1));
    op.finished(dataLen);
    op.newRecord(dst + dataLen + 1);
  }
}
8470
8471 bool
is_all_rows_page(BackupRecordPtr ptr,Uint32 part_id)8472 Backup::is_all_rows_page(BackupRecordPtr ptr,
8473 Uint32 part_id)
8474 {
8475 if (check_if_in_page_range(part_id,
8476 ptr.p->m_scan_info[ptr.p->m_num_lcp_files-1].m_start_change_part,
8477 ptr.p->m_scan_info[ptr.p->m_num_lcp_files-1].m_num_change_parts))
8478 {
8479 jam();
8480 return false;
8481 }
8482 jam();
8483 return true;
8484 }
8485
8486 void
set_working_file(BackupRecordPtr ptr,Uint32 part_id,bool is_all_rows_page)8487 Backup::set_working_file(BackupRecordPtr ptr,
8488 Uint32 part_id,
8489 bool is_all_rows_page)
8490 {
8491 Uint32 index = ptr.p->m_num_lcp_files - 1; //Change pages index
8492 if (is_all_rows_page)
8493 {
8494 bool found = false;
8495 for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
8496 {
8497 if (check_if_in_page_range(part_id,
8498 ptr.p->m_scan_info[i].m_start_all_part,
8499 ptr.p->m_scan_info[i].m_num_all_parts))
8500 {
8501 jam();
8502 found = true;
8503 index = i;
8504 break;
8505 }
8506 }
8507 ndbrequire(found);
8508 }
8509 ptr.p->m_working_data_file_ptr = ptr.p->dataFilePtr[index];
8510 }
8511
8512 bool
check_if_in_page_range(Uint32 part_id,Uint32 start_part,Uint32 num_parts)8513 Backup::check_if_in_page_range(Uint32 part_id,
8514 Uint32 start_part,
8515 Uint32 num_parts)
8516 {
8517 Uint32 end_part;
8518 if (part_id >= start_part)
8519 {
8520 if ((start_part + num_parts) > part_id)
8521 {
8522 return true;
8523 }
8524 }
8525 else
8526 {
8527 end_part = start_part + num_parts;
8528 if ((part_id + BackupFormat::NDB_MAX_LCP_PARTS) < end_part)
8529 {
8530 return true;
8531 }
8532 }
8533 jam();
8534 return false;
8535 }
8536
8537 Uint32
hash_lcp_part(Uint32 page_id) const8538 Backup::hash_lcp_part(Uint32 page_id) const
8539 {
8540 /**
8541 * To ensure proper operation also with small number of pages
8542 * we make a complete bit reorder of the 11 least significant
8543 * bits of the page id and returns this as the part id to use.
8544 * This means that for e.g. 8 pages we get the following parts
8545 * used:
8546 * 0: 0, 1: 1024, 2: 512, 3: 1536, 4: 256, 5: 1280, 6: 768, 7: 1792
8547 *
8548 * This provides a fairly good spread also of small number of
8549 * pages into the various parts.
8550 *
8551 * We implement this bit reorder by handling 4 sets of 3 bits,
8552 * except for the highest bits where we only use 2 bits.
8553 * Each 3 bit set is reversed using a simple static lookup
8554 * table and then the result of those 4 lookups is put back
8555 * into the hash value in reverse order.
8556 *
8557 * As a final step we remove bit 0 which is always 0 since we
8558 * only use 11 bits and not 12 bits.
8559 */
8560 static Uint32 reverse_3bits_array[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
8561 const Uint32 lowest_3bits_page_id = page_id & 7;
8562 const Uint32 low_3bits_page_id = (page_id >> 3) & 7;
8563 const Uint32 high_3bits_page_id = (page_id >> 6) & 7;
8564 const Uint32 highest_3bits_page_id = (page_id >> 9) & 3;
8565 Uint32 part_id =
8566 reverse_3bits_array[highest_3bits_page_id] +
8567 (reverse_3bits_array[high_3bits_page_id] << 3) +
8568 (reverse_3bits_array[low_3bits_page_id] << 6) +
8569 (reverse_3bits_array[lowest_3bits_page_id] << 9);
8570 part_id >>= 1;
8571 return part_id;
8572 }
8573
8574 bool
is_change_part_state(Uint32 page_id)8575 Backup::is_change_part_state(Uint32 page_id)
8576 {
8577 BackupRecordPtr ptr;
8578 jamEntryDebug();
8579 ptr = m_lcp_ptr;
8580 Uint32 part_id = hash_lcp_part(page_id);
8581 bool is_all_part = is_all_rows_page(ptr, part_id);
8582 return !is_all_part;
8583 }
8584
/**
 * Determine how rows of a page belonging to part_id are recorded in
 * the current LCP.
 *
 * @param ptr                    LCP backup record
 * @param part_id                part the page hashes to (see hash_lcp_part)
 * @param scanGCI                [out] 0 when all rows are recorded,
 *                               otherwise the GCI limit above which a row
 *                               counts as changed
 * @param changed_row_page_flag  [out] true when only changed rows on the
 *                               page are recorded
 */
void
Backup::get_page_info(BackupRecordPtr ptr,
                      Uint32 part_id,
                      Uint32 & scanGCI,
                      bool & changed_row_page_flag)
{
  if (is_all_rows_page(ptr, part_id))
  {
    /**
     * We are within range for all parts to be changed.
     * return scanGCI = 0 such that all rows in this page becomes part
     * of this LCP.
     */
    jam();
    scanGCI = 0;
    changed_row_page_flag = false;
  }
  else
  {
    /**
     * Not all rows to be recorded, only changed rows on this page.
     */
    jam();
    /* CHANGE pages only exist when partial LCP is enabled. */
    ndbassert(is_partial_lcp_enabled());
    scanGCI = ptr.p->m_scan_change_gci;
    ndbrequire(scanGCI != 0);
    changed_row_page_flag = true;
  }
}
8614
8615 void
change_current_page_temp(Uint32 page_no)8616 Backup::change_current_page_temp(Uint32 page_no)
8617 {
8618 BackupRecordPtr ptr;
8619 jamEntry();
8620 ptr = m_lcp_ptr;
8621 Uint32 part_id = hash_lcp_part(page_no);
8622 ptr.p->m_working_changed_row_page_flag = !(is_all_rows_page(ptr, part_id));
8623 set_working_file(ptr,
8624 part_id,
8625 !ptr.p->m_working_changed_row_page_flag);
8626 }
8627
8628 /**
8629 * After each operation, whether it is INSERT, WRITE or any DELETE variant,
8630 * we restore the working data file and current page flag. We can change
8631 * those for one operation (when retrieving a record from LCP keep list).
8632 * Since we don't know when we retrieved a record from LCP keep list here,
8633 * we simply always restore. The current values always have the current
8634 * setting and the working is the one we're currently using.
8635 */
8636 void
restore_current_page(BackupRecordPtr ptr)8637 Backup::restore_current_page(BackupRecordPtr ptr)
8638 {
8639 ptr.p->m_working_data_file_ptr = ptr.p->m_current_data_file_ptr;
8640 ptr.p->m_working_changed_row_page_flag =
8641 ptr.p->m_current_changed_row_page_flag;
8642 }
8643
/**
 * Initialise the LCP scan state for page 0 and the corresponding
 * working/current data file and page-state flags.
 *
 * @param scanGCI                [out] see get_page_info
 * @param changed_row_page_flag  [out] see get_page_info
 */
void
Backup::init_lcp_scan(Uint32 & scanGCI,
                      bool & changed_row_page_flag)
{
  /**
   * Here we come to get what to do with page 0.
   *
   * The number of pages seen at start of LCP scan was set in the method
   * start_lcp_scan. It is of vital importance that this happens
   * synchronised with the insertion of the LCP record in the UNDO log
   * record. There cannot be any signal breaks between setting the
   * max page count, initialising the LCP scan variable in TUP and
   * initialising the variables in this block and finally to insert a
   * start LCP record in UNDO log to allow for proper
   * handling of commits after start of LCP scan (to ensure that we
   * set LCP_SKIP and LCP_DELETE bits when necessary). It is important
   * that we retain exactly the set of rows committed before the start
   * of the LCP scan (the commit point is when the signal TUP_COMMITREQ
   * returns to DBLQH) and that rows inserted after this point is not
   * part of the LCP, this will guarantee that we get synchronisation
   * between the LCP main memory data and the disk data parts after
   * executing the UNDO log.
   *
   * The number of pages will be stored in the LCP to ensure that we can
   * remove rowid's that have been deleted before the next LCP starts.
   * The next LCP will never see any deleted rowid's, so those need to be
   * deleted before applying the rest of the LCP. The actual LCP contains
   * DELETE by ROWID for all rowid's in the range of pages still existing,
   * but for those removed we need to delete all those rows in one go at
   * start of restore by using the number of pages that is part of LCP.
   */
  BackupRecordPtr ptr;
  jamEntry();
  ptr = m_lcp_ptr;
  /* Page 0 determines the initial part, file and page-state flags. */
  Uint32 part_id = hash_lcp_part(0);
  get_page_info(ptr,
                part_id,
                scanGCI,
                changed_row_page_flag);
  set_working_file(ptr, part_id, !changed_row_page_flag);
  /* current := working for both file pointer and page-state flag. */
  ptr.p->m_current_data_file_ptr = ptr.p->m_working_data_file_ptr;
  ptr.p->m_working_changed_row_page_flag = changed_row_page_flag;
  ptr.p->m_current_changed_row_page_flag = changed_row_page_flag;

#ifdef DEBUG_EXTRA_LCP
  TablePtr debTabPtr;
  FragmentPtr fragPtr;
  ptr.p->tables.first(debTabPtr);
  debTabPtr.p->fragments.getPtr(fragPtr, 0);
  DEB_EXTRA_LCP(("(%u)LCP scan page tab(%u,%u): %u, part_id: %u,"
                 " round: %u, %s",
                 instance(),
                 debTabPtr.p->tableId,
                 fragPtr.p->fragmentId,
                 0,
                 part_id,
                 0,
                 changed_row_page_flag ? "CHANGED ROWS page" : " ALL ROWS page"));
#endif
}
8704
8705 void
alloc_page_after_lcp_start(Uint32 page_no)8706 Backup::alloc_page_after_lcp_start(Uint32 page_no)
8707 {
8708 BackupRecordPtr ptr;
8709 jamEntry();
8710 ptr = m_lcp_ptr;
8711 ptr.p->m_any_lcp_page_ops = true;
8712 if (is_change_part_state(page_no))
8713 ptr.p->m_change_page_alloc_after_start++;
8714 else
8715 ptr.p->m_all_page_alloc_after_start++;
8716 }
8717
8718 void
alloc_dropped_page_after_lcp_start(bool is_change_page)8719 Backup::alloc_dropped_page_after_lcp_start(bool is_change_page)
8720 {
8721 BackupRecordPtr ptr;
8722 jamEntry();
8723 ptr = m_lcp_ptr;
8724 ptr.p->m_any_lcp_page_ops = true;
8725 if (is_change_page)
8726 {
8727 ptr.p->m_change_page_alloc_dropped_after_start++;
8728 }
8729 else
8730 {
8731 ptr.p->m_all_page_alloc_dropped_after_start++;
8732 }
8733 }
8734
8735 void
dropped_page_after_lcp_start(bool is_change_page,bool is_last_lcp_state_A)8736 Backup::dropped_page_after_lcp_start(bool is_change_page,
8737 bool is_last_lcp_state_A)
8738 {
8739 BackupRecordPtr ptr;
8740 jamEntry();
8741 ptr = m_lcp_ptr;
8742 ptr.p->m_any_lcp_page_ops = true;
8743 if (is_last_lcp_state_A)
8744 {
8745 if (is_change_page)
8746 ptr.p->m_change_page_dropped_A_after_start++;
8747 else
8748 ptr.p->m_all_page_dropped_A_after_start++;
8749 }
8750 else
8751 {
8752 if (is_change_page)
8753 ptr.p->m_change_page_dropped_D_after_start++;
8754 else
8755 ptr.p->m_all_page_dropped_D_after_start++;
8756 }
8757 }
8758
8759 void
skip_page_lcp_scanned_bit()8760 Backup::skip_page_lcp_scanned_bit()
8761 {
8762 BackupRecordPtr ptr;
8763 jamEntry();
8764 ptr = m_lcp_ptr;
8765 ptr.p->m_any_lcp_page_ops = true;
8766 if (ptr.p->m_working_changed_row_page_flag)
8767 ptr.p->m_skip_change_page_lcp_scanned_bit++;
8768 else
8769 ptr.p->m_skip_all_page_lcp_scanned_bit++;
8770 }
8771
8772 void
skip_no_change_page()8773 Backup::skip_no_change_page()
8774 {
8775 BackupRecordPtr ptr;
8776 jamEntryDebug();
8777 ptr = m_lcp_ptr;
8778 ptr.p->m_any_lcp_page_ops = true;
8779 ptr.p->m_skip_change_page_no_change++;
8780 }
8781
8782 void
skip_empty_page_lcp()8783 Backup::skip_empty_page_lcp()
8784 {
8785 BackupRecordPtr ptr;
8786 jamEntryDebug();
8787 ptr = m_lcp_ptr;
8788 ptr.p->m_any_lcp_page_ops = true;
8789 if (ptr.p->m_working_changed_row_page_flag)
8790 ptr.p->m_skip_empty_change_page++;
8791 else
8792 ptr.p->m_skip_empty_all_page++;
8793 }
8794
8795 void
record_dropped_empty_page_lcp()8796 Backup::record_dropped_empty_page_lcp()
8797 {
8798 BackupRecordPtr ptr;
8799 jamEntry();
8800 ptr = m_lcp_ptr;
8801 ndbrequire(ptr.p->m_working_changed_row_page_flag)
8802 ptr.p->m_any_lcp_page_ops = true;
8803 ptr.p->m_record_empty_change_page_A++;
8804 }
8805
8806 void
record_late_alloc_page_lcp()8807 Backup::record_late_alloc_page_lcp()
8808 {
8809 BackupRecordPtr ptr;
8810 jamEntry();
8811 ptr = m_lcp_ptr;
8812 ndbrequire(ptr.p->m_working_changed_row_page_flag)
8813 ptr.p->m_any_lcp_page_ops = true;
8814 ptr.p->m_record_late_alloc_change_page_A++;
8815 }
8816
8817 void
page_to_skip_lcp(bool is_last_lcp_state_A)8818 Backup::page_to_skip_lcp(bool is_last_lcp_state_A)
8819 {
8820 BackupRecordPtr ptr;
8821 jamEntry();
8822 ptr = m_lcp_ptr;
8823 ptr.p->m_any_lcp_page_ops = true;
8824 if (ptr.p->m_working_changed_row_page_flag)
8825 {
8826 ndbrequire(!is_last_lcp_state_A);
8827 ptr.p->m_skip_late_alloc_change_page_D++;
8828 }
8829 else
8830 {
8831 if (is_last_lcp_state_A)
8832 ptr.p->m_skip_late_alloc_all_page_A++;
8833 else
8834 ptr.p->m_skip_late_alloc_all_page_D++;
8835 }
8836 }
8837
8838 void
lcp_keep_delete_by_page_id()8839 Backup::lcp_keep_delete_by_page_id()
8840 {
8841 BackupRecordPtr ptr;
8842 jamEntry();
8843 ptr = m_lcp_ptr;
8844 ptr.p->m_any_lcp_page_ops = true;
8845 if (ptr.p->m_working_changed_row_page_flag)
8846 ptr.p->m_lcp_keep_delete_change_pages++;
8847 else
8848 ptr.p->m_lcp_keep_delete_all_pages++;
8849 }
8850
8851 void
lcp_keep_delete_row()8852 Backup::lcp_keep_delete_row()
8853 {
8854 BackupRecordPtr ptr;
8855 jamEntry();
8856 ptr = m_lcp_ptr;
8857 ptr.p->m_any_lcp_page_ops = true;
8858 if (ptr.p->m_working_changed_row_page_flag)
8859 ptr.p->m_lcp_keep_delete_row_change_pages++;
8860 else
8861 ptr.p->m_lcp_keep_delete_row_all_pages++;
8862 }
8863
8864 void
lcp_keep_row()8865 Backup::lcp_keep_row()
8866 {
8867 BackupRecordPtr ptr;
8868 jamEntry();
8869 ptr = m_lcp_ptr;
8870 ptr.p->m_any_lcp_page_ops = true;
8871 if (ptr.p->m_working_changed_row_page_flag)
8872 ptr.p->m_lcp_keep_row_change_pages++;
8873 else
8874 ptr.p->m_lcp_keep_row_all_pages++;
8875 }
8876
/**
 * Log the extended LCP page-operation statistics collected since
 * init_extended_lcp_stat(). No-op when no page operation was recorded.
 */
void
Backup::print_extended_lcp_stat()
{
  BackupRecordPtr ptr;
  ptr = m_lcp_ptr;
  /* Nothing recorded: stay silent. */
  if (!ptr.p->m_any_lcp_page_ops)
    return;
  g_eventLogger->info("(%u)change_page_alloc_after_start: %u, "
                      "all_page_alloc_after_start: %u, "
                      "change_page_alloc_dropped_after_start: %u, "
                      "all_page_alloc_dropped_after_start: %u",
                      instance(),
                      ptr.p->m_change_page_alloc_after_start,
                      ptr.p->m_all_page_alloc_after_start,
                      ptr.p->m_change_page_alloc_dropped_after_start,
                      ptr.p->m_all_page_alloc_dropped_after_start);
  g_eventLogger->info("(%u)change_page_dropped_A_after_start: %u, "
                      "all_page_dropped_A_after_start: %u, "
                      "change_page_dropped_D_after_start: %u, "
                      "all_page_dropped_D_after_start: %u",
                      instance(),
                      ptr.p->m_change_page_dropped_A_after_start,
                      ptr.p->m_all_page_dropped_A_after_start,
                      ptr.p->m_change_page_dropped_D_after_start,
                      ptr.p->m_all_page_dropped_D_after_start);
  g_eventLogger->info("(%u)skip_change_page_lcp_scanned_bit: %u, "
                      "skip_all_page_lcp_scanned_bit: %u, "
                      "skip_change_page_no_change: %u, "
                      "skip_empty_change_page: %u, "
                      "skip_empty_all_page: %u",
                      instance(),
                      ptr.p->m_skip_change_page_lcp_scanned_bit,
                      ptr.p->m_skip_all_page_lcp_scanned_bit,
                      ptr.p->m_skip_change_page_no_change,
                      ptr.p->m_skip_empty_change_page,
                      ptr.p->m_skip_empty_all_page);
  g_eventLogger->info("(%u)record_empty_change_page_A: %u, "
                      "record_late_alloc_change_page_A: %u, "
                      "skip_late_alloc_change_page_D: %u, "
                      "skip_late_alloc_all_page_A: %u, "
                      "skip_late_alloc_all_page_D: %u",
                      instance(),
                      ptr.p->m_record_empty_change_page_A,
                      ptr.p->m_record_late_alloc_change_page_A,
                      ptr.p->m_skip_late_alloc_change_page_D,
                      ptr.p->m_skip_late_alloc_all_page_A,
                      ptr.p->m_skip_late_alloc_all_page_D);
  /* The keep-row counters are 64-bit, hence %llu. */
  g_eventLogger->info("(%u)lcp_keep_row_change_pages: %llu, "
                      "lcp_keep_row_all_pages: %llu, "
                      "lcp_keep_delete_row_change_pages: %llu, "
                      "lcp_keep_delete_row_all_pages: %llu, "
                      "lcp_keep_delete_change_pages: %u, "
                      "lcp_keep_delete_all_pages: %u",
                      instance(),
                      ptr.p->m_lcp_keep_row_change_pages,
                      ptr.p->m_lcp_keep_row_all_pages,
                      ptr.p->m_lcp_keep_delete_row_change_pages,
                      ptr.p->m_lcp_keep_delete_row_all_pages,
                      ptr.p->m_lcp_keep_delete_change_pages,
                      ptr.p->m_lcp_keep_delete_all_pages);
}
8938
/**
 * Reset all extended LCP page-operation statistics counters ahead of
 * a new LCP; clears m_any_lcp_page_ops so print_extended_lcp_stat()
 * stays silent until something is recorded.
 */
void
Backup::init_extended_lcp_stat()
{
  BackupRecordPtr ptr;
  ptr = m_lcp_ptr;
  /* Pages allocated/dropped after the LCP scan started. */
  ptr.p->m_change_page_alloc_after_start = 0;
  ptr.p->m_all_page_alloc_after_start = 0;
  ptr.p->m_change_page_alloc_dropped_after_start = 0;
  ptr.p->m_all_page_alloc_dropped_after_start = 0;
  ptr.p->m_change_page_dropped_A_after_start = 0;
  ptr.p->m_all_page_dropped_A_after_start = 0;
  ptr.p->m_change_page_dropped_D_after_start = 0;
  ptr.p->m_all_page_dropped_D_after_start = 0;
  /* Pages skipped by the LCP scan, by reason. */
  ptr.p->m_skip_change_page_lcp_scanned_bit = 0;
  ptr.p->m_skip_all_page_lcp_scanned_bit = 0;
  ptr.p->m_skip_change_page_no_change = 0;
  ptr.p->m_skip_empty_change_page = 0;
  ptr.p->m_skip_empty_all_page = 0;
  /* Late-allocated/empty pages recorded or skipped. */
  ptr.p->m_record_empty_change_page_A = 0;
  ptr.p->m_record_late_alloc_change_page_A = 0;
  ptr.p->m_skip_late_alloc_change_page_D = 0;
  ptr.p->m_skip_late_alloc_all_page_A = 0;
  ptr.p->m_skip_late_alloc_all_page_D = 0;
  /* LCP keep-list operations. */
  ptr.p->m_lcp_keep_delete_row_change_pages = 0;
  ptr.p->m_lcp_keep_delete_row_all_pages = 0;
  ptr.p->m_lcp_keep_delete_change_pages = 0;
  ptr.p->m_lcp_keep_delete_all_pages = 0;
  ptr.p->m_lcp_keep_row_change_pages = 0;
  ptr.p->m_lcp_keep_row_all_pages = 0;
  ptr.p->m_any_lcp_page_ops = false;
}
8970
8971 /**
8972 * Return values:
8973 * +1 Page have been scanned
8974 * -1 Page have not been scanned
8975 * 0 Page is scanned, so need to check the page index as well.
8976 */
int
Backup::is_page_lcp_scanned(Uint32 page_id, bool & all_part)
{
  BackupRecordPtr ptr;
  ptr = m_lcp_ptr;
  /* all_part is also an output: true when the page's part is an
   * ALL-rows part of this LCP. */
  all_part = false;

  /* Pages beyond the max page count at LCP start are never scanned. */
  if (page_id >= ptr.p->m_lcp_max_page_cnt)
  {
    jam();
    return +1; /* Page will never be scanned */
  }
  Uint32 part_id = hash_lcp_part(page_id);
  if (is_all_rows_page(ptr, part_id))
  {
    jam();
    all_part = true;
  }
  if (!ptr.p->m_is_lcp_scan_active)
  {
    /**
     * LCP scan is already completed.
     */
    jam();
    return +1;
  }
  /* Compare against the page currently being scanned. */
  if (page_id < ptr.p->m_lcp_current_page_scanned)
  {
    jam();
    return +1; /* Page have been scanned in this LCP scan round */
  }
  else if (page_id > ptr.p->m_lcp_current_page_scanned)
  {
    jam();
    return -1; /* Page to be scanned this LCP scan round, not done yet */
  }
  else
  {
    jam();
    return 0; /* Page is currently being scanned. Need more info */
  }
}
9019
/**
 * Record progress of the LCP scan after a batch of pages and switch
 * the working/current data file and page-state flags to match the
 * part of the page now being scanned.
 *
 * @param signal                 signal object (unused except by debug code)
 * @param filePtrI               i-value of the backup file record
 * @param scanned_pages          page number reached by the scan
 * @param scanGCI                [out] see get_page_info
 * @param changed_row_page_flag  [out] see get_page_info
 */
void
Backup::update_lcp_pages_scanned(Signal *signal,
                                 Uint32 filePtrI,
                                 Uint32 scanned_pages,
                                 Uint32 & scanGCI,
                                 bool & changed_row_page_flag)
{
  BackupFilePtr filePtr;
  BackupRecordPtr ptr;
  jamEntry();

  c_backupFilePool.getPtr(filePtr, filePtrI);

  OperationRecord & op = filePtr.p->operation;

  op.set_scanned_pages(scanned_pages);

  /**
   * scanned_pages also contains the Page number which can be used
   * to deduce the part_id for the page.
   */
  ptr = m_lcp_ptr;
  Uint32 part_id = hash_lcp_part(scanned_pages);
  ptr.p->m_lcp_current_page_scanned = scanned_pages;
  get_page_info(ptr,
                part_id,
                scanGCI,
                changed_row_page_flag);
  set_working_file(ptr, part_id, !changed_row_page_flag);
  /* current := working, same pattern as in init_lcp_scan. */
  ptr.p->m_current_data_file_ptr = ptr.p->m_working_data_file_ptr;
  ptr.p->m_working_changed_row_page_flag = changed_row_page_flag;
  ptr.p->m_current_changed_row_page_flag = changed_row_page_flag;
#ifdef DEBUG_EXTRA_LCP
  TablePtr debTabPtr;
  FragmentPtr fragPtr;
  ptr.p->tables.first(debTabPtr);
  debTabPtr.p->fragments.getPtr(fragPtr, 0);
  DEB_EXTRA_LCP(("(%u)LCP scan page tab(%u,%u):%u, part_id: %u, round: %u, %s",
                 instance(),
                 debTabPtr.p->tableId,
                 fragPtr.p->fragmentId,
                 scanned_pages,
                 part_id,
                 0,
                 changed_row_page_flag ?
                 "CHANGED ROWS page" : " ALL ROWS page"));
#endif
}
9068
9069 void
init(const TablePtr & tabPtr)9070 Backup::OperationRecord::init(const TablePtr & tabPtr)
9071 {
9072 tablePtr = tabPtr.i;
9073 maxRecordSize = tabPtr.p->maxRecordSize;
9074 lcpScannedPages = 0;
9075 }
9076
/**
 * Start a new fragment in the data file: reserve buffer space for the
 * fragment header plus one scan batch, write the header, and reset
 * the per-fragment counters.
 *
 * @return true when buffer space was available, false to retry later
 */
bool
Backup::OperationRecord::newFragment(Uint32 tableId, Uint32 fragNo)
{
  Uint32 * tmp;
  const Uint32 headSz = (sizeof(BackupFormat::DataFile::FragmentHeader) >> 2);
  const Uint32 sz = headSz + ZRESERVED_SCAN_BATCH_SIZE * maxRecordSize;

  ndbrequire(sz < dataBuffer.getMaxWrite());
  if(dataBuffer.getWritePtr(&tmp, sz)) {
    jam();
    BackupFormat::DataFile::FragmentHeader * head =
      (BackupFormat::DataFile::FragmentHeader*)tmp;

    /* All header fields are stored in network byte order. */
    head->SectionType = htonl(BackupFormat::FRAGMENT_HEADER);
    head->SectionLength = htonl(headSz);
    head->TableId = htonl(tableId);
    head->FragmentNo = htonl(fragNo);
    head->ChecksumType = htonl(0);

    opNoDone = opNoConf = opLen = 0;
    /* Records start right after the header. */
    newRecord(tmp + headSz);
    scanStart = tmp;
    scanStop = (tmp + headSz);

    noOfRecords = 0;
    noOfBytes = 0;
    return true;
  }//if
  return false;
}
9107
/**
 * Finish a fragment in the data file: terminate the record stream and
 * append the fragment footer. With fill_record (used for O_DIRECT
 * files) the write is padded with an EMPTY_ENTRY up to a Page32
 * boundary so the file ends on a full page.
 *
 * @return true when the footer was written, false when buffer space
 *         was unavailable (caller retries)
 */
bool
Backup::OperationRecord::fragComplete(Uint32 tableId, Uint32 fragNo, bool fill_record)
{
  Uint32 * tmp;
  const Uint32 footSz = sizeof(BackupFormat::DataFile::FragmentFooter) >> 2;
  Uint32 sz = footSz + 1;

  if (fill_record)
  {
    Uint32 * new_tmp;
    if (!dataBuffer.getWritePtr(&tmp, sz))
      return false;
    new_tmp = tmp + sz;

    if ((UintPtr)new_tmp & (sizeof(Page32)-1))
    {
      /* padding is needed to get full write */
      new_tmp += 2 /* to fit empty header minimum 2 words*/;
      /* Round up to the next Page32 boundary. */
      new_tmp = (Uint32 *)(((UintPtr)new_tmp + sizeof(Page32)-1) &
                            ~(UintPtr)(sizeof(Page32)-1));
      /* new write sz */
      sz = Uint32(new_tmp - tmp);
    }
  }

  if(dataBuffer.getWritePtr(&tmp, sz)) {
    jam();
    * tmp = 0; // Finish record stream
    tmp++;
    BackupFormat::DataFile::FragmentFooter * foot =
      (BackupFormat::DataFile::FragmentFooter*)tmp;
    /* Footer fields are stored in network byte order. */
    foot->SectionType = htonl(BackupFormat::FRAGMENT_FOOTER);
    foot->SectionLength = htonl(footSz);
    foot->TableId = htonl(tableId);
    foot->FragmentNo = htonl(fragNo);
    foot->NoOfRecords = htonl(Uint32(noOfRecords)); // TODO
    foot->Checksum = htonl(0);

    /* sz grew above: emit an EMPTY_ENTRY section covering the pad. */
    if (sz != footSz + 1)
    {
      tmp += footSz;
      memset(tmp, 0, (sz - footSz - 1) * 4);
      *tmp = htonl(BackupFormat::EMPTY_ENTRY);
      tmp++;
      *tmp = htonl(sz - footSz - 1);
    }

    dataBuffer.updateWritePtr(sz);
    return true;
  }//if
  return false;
}
9160
9161 bool
newScan()9162 Backup::OperationRecord::newScan()
9163 {
9164 Uint32 * tmp;
9165 ndbrequire(ZRESERVED_SCAN_BATCH_SIZE * maxRecordSize < dataBuffer.getMaxWrite());
9166 if(dataBuffer.getWritePtr(&tmp, ZRESERVED_SCAN_BATCH_SIZE * maxRecordSize))
9167 {
9168 jam();
9169 opNoDone = opNoConf = opLen = 0;
9170 newRecord(tmp);
9171 scanStart = tmp;
9172 scanStop = tmp;
9173 return true;
9174 }//if
9175 return false;
9176 }
9177
/**
 * Decide whether a new scan batch should be started now.
 *
 * Reserves batch space in all involved data buffers (all LCP data
 * files when there are several, otherwise just op's buffer). Returns
 * false either when space is unavailable or when enough data is
 * already buffered that the scan can pause and let the disk catch up.
 *
 * @param ptr         backup/LCP record
 * @param op          operation of the (single) data file
 * @param after_wait  true when resuming after a delay; then one batch
 *                    is always allowed if space exists
 */
bool
Backup::check_new_scan(BackupRecordPtr ptr,
                       OperationRecord & op,
                       bool after_wait)
{
  bool any_min_buf = false;
  Uint32 tot_size_written = 0;
  if (ptr.p->is_lcp() && ptr.p->m_num_lcp_files > 1)
  {
    /* Multi-file LCP: every file must have batch space. */
    for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
    {
      jam();
      BackupFilePtr loopFilePtr;
      c_backupFilePool.getPtr(loopFilePtr, ptr.p->dataFilePtr[i]);
      OperationRecord & loop_op = loopFilePtr.p->operation;
      if (!loop_op.newScan())
      {
        jam();
        return false;
      }
      Uint32 size_written = loop_op.dataBuffer.getSizeUsed();
      if (size_written > BACKUP_DEFAULT_WRITE_SIZE)
      {
        jam();
        any_min_buf = true;
      }
      tot_size_written += size_written;
    }
  }
  else
  {
    jam();
    bool ready = op.newScan();
    if (!ready)
    {
      jam();
      return false;
    }
    tot_size_written = op.dataBuffer.getSizeUsed();
    if (tot_size_written > BACKUP_DEFAULT_WRITE_SIZE)
    {
      jam();
      any_min_buf = true;
    }
  }
  if (after_wait ||
      !any_min_buf ||
      (ptr.p->is_lcp() &&
       (m_redo_alert_state > RedoStateRep::REDO_ALERT_LOW ||
        tot_size_written < MAX_BUFFER_USED_WITHOUT_REDO_ALERT ||
        (m_redo_alert_state == RedoStateRep::REDO_ALERT_LOW &&
         tot_size_written < BACKUP_DEFAULT_BUFFER_SIZE))))
  {
    jam();
    return true;
  }
  /**
   * We have buffer space, but we are ready to write at least one
   * file, so there is no urgency in continuing the LCP/Backup scan
   * right now, we have already written at least 512 kB into the
   * buffers. At Low REDO alert levels we allow writing up to
   * 2M into the buffers. At higher alert levels we will continue
   * writing until buffer is full.
   *
   * After sleeping for a while we will always handle at least one
   * batch of scanning if there is buffer space for it (this is
   * signalled through the variable after_wait).
   */
  return false;
}
9248
/**
 * Try to complete the fragment on every involved data file (all LCP
 * data files with active scan threads when there are several,
 * otherwise filePtr's file).
 *
 * @return true when all fragment footers could be written; false when
 *         some buffer lacked space (caller retries via CONTINUEB)
 */
bool
Backup::check_frag_complete(BackupRecordPtr ptr, BackupFilePtr filePtr)
{
  if (ptr.p->is_lcp() && ptr.p->m_num_lcp_files > 1)
  {
    for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
    {
      jam();
      BackupFilePtr loopFilePtr;
      c_backupFilePool.getPtr(loopFilePtr, ptr.p->dataFilePtr[i]);
      OperationRecord & op = loopFilePtr.p->operation;
      /* Files without an active scan thread are already done;
       * fill_record is set when O_DIRECT is configured. */
      if (((loopFilePtr.p->m_flags &
            Uint32(BackupFile::BF_SCAN_THREAD)) == 0) ||
           op.fragComplete(filePtr.p->tableId,
                           filePtr.p->fragmentNo,
                           c_defaults.m_o_direct))
      {
        jam();
        loopFilePtr.p->m_flags &= ~(Uint32)BackupFile::BF_SCAN_THREAD;
      }
      else
      {
        jam();
        return false;
      }
    }
    return true;
  }
  else
  {
    OperationRecord & op = filePtr.p->operation;
    if (op.fragComplete(filePtr.p->tableId,
                        filePtr.p->fragmentNo,
                        c_defaults.m_o_direct))
    {
      jam();
      filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_SCAN_THREAD;
      return true;
    }
    return false;
  }
}
9291
9292 bool
check_min_buf_size(BackupRecordPtr ptr,OperationRecord & op)9293 Backup::check_min_buf_size(BackupRecordPtr ptr, OperationRecord &op)
9294 {
9295 bool is_lcp = ptr.p->is_lcp();
9296 if (is_lcp && m_redo_alert_state != RedoStateRep::NO_REDO_ALERT)
9297 {
9298 /**
9299 * We have reached at least 25% REDO log fill level, we will be more
9300 * active in filling up the buffers to write to disk for LCPs.
9301 */
9302 return false;
9303 }
9304 if (ptr.p->is_lcp() && ptr.p->m_num_lcp_files > 1)
9305 {
9306 for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
9307 {
9308 jam();
9309 Uint32 *tmp = NULL;
9310 Uint32 sz = 0;
9311 bool eof = FALSE;
9312 BackupFilePtr loopFilePtr;
9313 c_backupFilePool.getPtr(loopFilePtr, ptr.p->dataFilePtr[i]);
9314 OperationRecord & loop_op = loopFilePtr.p->operation;
9315 if (!loop_op.dataBuffer.getReadPtr(&tmp, &sz, &eof))
9316 {
9317 return false;
9318 }
9319 }
9320 return true;
9321 }
9322 else
9323 {
9324 jam();
9325 Uint32 *tmp = NULL;
9326 Uint32 sz = 0;
9327 bool eof = FALSE;
9328 return op.dataBuffer.getReadPtr(&tmp, &sz, &eof);
9329 }
9330 }
9331
9332 bool
check_error(BackupRecordPtr ptr,BackupFilePtr filePtr)9333 Backup::check_error(BackupRecordPtr ptr, BackupFilePtr filePtr)
9334 {
9335 if (ptr.p->checkError())
9336 {
9337 jam();
9338 return true;
9339 }
9340 if (ptr.p->is_lcp() && ptr.p->m_num_lcp_files > 1)
9341 {
9342 for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
9343 {
9344 jam();
9345 BackupFilePtr loopFilePtr;
9346 c_backupFilePool.getPtr(loopFilePtr, ptr.p->dataFilePtr[i]);
9347 if (loopFilePtr.p->errorCode != 0)
9348 {
9349 jam();
9350 return true;
9351 }
9352 }
9353 return false;
9354 }
9355 else
9356 {
9357 return (filePtr.p->errorCode != 0);
9358 }
9359 }
9360
9361 void
closeScan()9362 Backup::OperationRecord::closeScan()
9363 {
9364 opNoDone = opNoConf = opLen = 0;
9365 }
9366
/**
 * Publish the data buffered between scanStart and scanStop to the
 * data buffer's write pointer.
 *
 * @return number of words published
 */
Uint32
Backup::OperationRecord::publishBufferData()
{
  const Uint32 len = Uint32(scanStop - scanStart);
  ndbrequire(len < dataBuffer.getMaxWrite());
  dataBuffer.updateWritePtr(len);

  /**
   * In case a second SCAN_FRAGCONF is received with scanCompleted set to 2
   * follow, without any call to newScan() or newFragment() is called to reset
   * scanStart and scanStop in between, set scanStart to scanStop to indicate
   * that all buffered data already been published.
   */
  scanStart = scanStop;
  return len;
}
9383
/**
 * Account a SCAN_FRAGCONF against the operations recorded since the
 * last confirmation.
 *
 * @param noOfOps    operations reported by LQH; must match the number
 *                   recorded locally (opNoDone - opNoConf)
 * @param total_len  total length reported by LQH; must match opLen
 * @param len        published buffer length in words (counted in bytes)
 */
void
Backup::OperationRecord::scanConf(Uint32 noOfOps, Uint32 total_len, Uint32 len)
{
  const Uint32 done = Uint32(opNoDone-opNoConf);

  ndbrequire(noOfOps == done);
  ndbrequire(opLen == total_len);
  opNoConf = opNoDone;

  /* len is in words; shift by 2 converts to bytes. */
  noOfBytes += (len << 2);
  m_bytes_total += (len << 2);
  m_records_total += noOfOps;
}
9397
/**
 * Handle SCAN_FRAGREF from LQH. Transient errors are retried with a
 * delay up to 10 times before being treated as fatal; table-dropped
 * errors complete the fragment as if the scan had finished; all other
 * errors fail the fragment via backupFragmentRef.
 */
void
Backup::execSCAN_FRAGREF(Signal* signal)
{
  jamEntry();

  ScanFragRef * ref = (ScanFragRef*)signal->getDataPtr();

  const Uint32 filePtrI = ref->senderData;
  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, filePtrI);

  Uint32 errCode = ref->errorCode;
  if (filePtr.p->errorCode == 0)
  {
    // check for transient errors
    switch(errCode){
    case ScanFragRef::ZSCAN_BOOK_ACC_OP_ERROR:
    case ScanFragRef::NO_TC_CONNECT_ERROR:
    case ScanFragRef::ZTOO_MANY_ACTIVE_SCAN_ERROR:
      jam();
      /* Transient: leave errorCode at 0 so the retry path runs. */
      DEB_LCP(("(%u)execSCAN_FRAGREF(temp error: %u)",
               instance(),
               errCode));
      break;
    case ScanFragRef::TABLE_NOT_DEFINED_ERROR:
    case ScanFragRef::DROP_TABLE_IN_PROGRESS_ERROR:
      jam();
      /**
       * The table was dropped either at start of LCP scan or in the
       * middle of it. We will complete in the same manner as if we
       * got a SCAN_FRAGCONF with close flag set. The idea is that
       * the content of the LCP file in this case is not going to
       * be used anyways, so we just ensure that we complete things
       * in an ordered manner and then the higher layers will ensure
       * that the files are dropped and taken care of.
       *
       * This handling will ensure that drop table can complete
       * much faster.
       */
      DEB_LCP(("(%u)execSCAN_FRAGREF(DROP_TABLE_IN_PROGRESS)", instance()));
      fragmentCompleted(signal, filePtr, errCode);
      return;
    default:
      jam();
      filePtr.p->errorCode = errCode;
    }
  }

  if (filePtr.p->errorCode == 0)
  {
    jam();
    /* Transient error: give up after 10 retries. */
    filePtr.p->m_retry_count++;
    if (filePtr.p->m_retry_count == 10)
    {
      jam();
      filePtr.p->errorCode = errCode;
    }
  }

  if (filePtr.p->errorCode != 0)
  {
    jam();
    filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_SCAN_THREAD;
    DEB_LCP(("(%u)execSCAN_FRAGREF(backupFragmentRef)", instance()));
    backupFragmentRef(signal, filePtr);
  }
  else
  {
    jam();

    // retry

    BackupRecordPtr ptr;
    c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
    TablePtr tabPtr;
    FragmentPtr fragPtr;
    if (ptr.p->is_lcp())
    {
      /* An LCP scans exactly one fragment: the first table, fragment 0. */
      ptr.p->tables.first(tabPtr);
      ndbrequire(filePtr.p->fragmentNo == 0);
      ndbrequire(filePtr.p->tableId == tabPtr.p->tableId);
      tabPtr.p->fragments.getPtr(fragPtr, 0);
      DEB_LCP(("(%u)execSCAN_FRAGREF", instance()));
    }
    else
    {
      ndbrequire(findTable(ptr, tabPtr, filePtr.p->tableId));
      tabPtr.p->fragments.getPtr(fragPtr, filePtr.p->fragmentNo);
    }
    sendScanFragReq(signal, ptr, filePtr, tabPtr, fragPtr,
                    WaitScanTempErrorRetryMillis);
  }
}
9491
/**
 * Handle SCAN_FRAGCONF from LQH: publish buffered scan data, account
 * the confirmed operations, then either continue scanning or complete
 * the fragment (fragmentCompleted == 2 means scan finished).
 */
void
Backup::execSCAN_FRAGCONF(Signal* signal)
{
  jamEntry();

  CRASH_INSERTION((10017));

  ScanFragConf conf = *(ScanFragConf*)signal->getDataPtr();

  const Uint32 filePtrI = conf.senderData;
  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, filePtrI);

  OperationRecord & op = filePtr.p->operation;
  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  if (ptr.p->is_lcp() && c_lqh->handleLCPSurfacing(signal))
  {
    jam();
    /* The max record size may have grown while the LCP surfaced;
     * refresh it from TUP. */
    TablePtr tabPtr;
    ptr.p->tables.first(tabPtr);
    Dbtup* tup = (Dbtup*)globalData.getBlock(DBTUP, instance());
    op.maxRecordSize = tabPtr.p->maxRecordSize =
      1 + tup->get_max_lcp_record_size(tabPtr.p->tableId);
  }
  Uint32 buffer_data_len = op.publishBufferData();
  if (ptr.p->is_lcp() && ptr.p->m_num_lcp_files > 1)
  {
    jam();
    BackupFilePtr loopFilePtr;
    for (Uint32 i = 1; i < ptr.p->m_num_lcp_files; i++)
    {
      c_backupFilePool.getPtr(loopFilePtr, ptr.p->dataFilePtr[i]);
      OperationRecord & loop_op = loopFilePtr.p->operation;
      // The extra lcp files only use operation for the data buffer.
      buffer_data_len += loop_op.publishBufferData();
      // Always update maxRecordSize, op.maxRecordSize may have changed.
      loop_op.maxRecordSize = op.maxRecordSize;
    }
  }
  op.scanConf(conf.completedOps, conf.total_len, buffer_data_len);

  {
    /* Non-multithreaded backups must be confirmed from our own LQH. */
    const bool senderIsThreadLocal =
      (signal->senderBlockRef() == calcInstanceBlockRef(DBLQH));
    ndbrequire(senderIsThreadLocal ||
               !MT_BACKUP_FLAG(ptr.p->flags));
  }

  const Uint32 completed = conf.fragmentCompleted;
  if(completed != 2) {
    jam();
    checkScan(signal, ptr, filePtr, false);
    return;
  }//if

  fragmentCompleted(signal, filePtr);
}
9551
9552 void
fragmentCompleted(Signal * signal,BackupFilePtr filePtr,Uint32 errCode)9553 Backup::fragmentCompleted(Signal* signal,
9554 BackupFilePtr filePtr,
9555 Uint32 errCode)
9556 {
9557 jam();
9558
9559 if(filePtr.p->errorCode != 0)
9560 {
9561 jam();
9562 filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_SCAN_THREAD;
9563 DEB_LCP(("(%u)fragmentCompleted(backupFragmentRef)", instance()));
9564 backupFragmentRef(signal, filePtr); // Scan completed
9565 return;
9566 }//if
9567
9568 BackupRecordPtr ptr;
9569 c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
9570
9571 if (!check_frag_complete(ptr, filePtr))
9572 {
9573 jam();
9574 signal->theData[0] = BackupContinueB::BUFFER_FULL_FRAG_COMPLETE;
9575 signal->theData[1] = filePtr.i;
9576 signal->theData[2] = errCode;
9577 sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
9578 WaitDiskBufferCapacityMillis, 2);
9579 return;
9580 }//if
9581 OperationRecord & op = filePtr.p->operation;
9582 if (ptr.p->is_lcp())
9583 {
9584 jam();
9585 ptr.p->m_is_lcp_scan_active = false;
9586 for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
9587 {
9588 BackupFilePtr loopFilePtr;
9589 c_backupFilePool.getPtr(loopFilePtr,
9590 ptr.p->dataFilePtr[i]);
9591 loopFilePtr.p->operation.dataBuffer.eof();
9592 }
9593 {
9594 jam();
9595 TablePtr tabPtr;
9596 FragmentPtr fragPtr;
9597 ptr.p->tables.first(tabPtr);
9598 tabPtr.p->fragments.getPtr(fragPtr, 0);
9599 DEB_LCP_STAT(("(%u)LCP tab(%u,%u): inserts: %llu, writes: %llu"
9600 ", delete_by_row: %llu, delete_by_page: %llu"
9601 ", bytes written: %llu, num_files: %u"
9602 ", first data file: %u",
9603 instance(),
9604 tabPtr.p->tableId,
9605 fragPtr.p->fragmentId,
9606 filePtr.p->m_lcp_inserts,
9607 filePtr.p->m_lcp_writes,
9608 filePtr.p->m_lcp_delete_by_rowids,
9609 filePtr.p->m_lcp_delete_by_pageids,
9610 ptr.p->m_bytes_written,
9611 ptr.p->m_num_lcp_files,
9612 ptr.p->m_first_data_file_number));
9613 #ifdef DEBUG_LCP_EXTENDED_STAT
9614 print_extended_lcp_stat();
9615 #endif
9616 c_tup->stop_lcp_scan(tabPtr.p->tableId, fragPtr.p->fragmentId);
9617 }
9618
9619 /* Save errCode for later checks */
9620 ptr.p->m_save_error_code = errCode;
9621 ptr.p->slaveState.setState(STOPPING);
9622
9623 /**
9624 * Scan is completed, we get the newest GCI involved in the
9625 * LCP. We update both LQH and ourselves with this value.
9626 */
9627 c_lqh->lcp_complete_scan(ptr.p->newestGci);
9628
9629 /**
9630 * The actual complete processing is started from checkFile which is
9631 * called regularly from a CONTINUEB loop. We cannot start the complete
9632 * processing until all data of the fragment have been sent properly to
9633 * the disk. checkFile is called from CONTINUEB(START_FILE_THREAD).
9634 *
9635 * lcp_start_complete_processing will start by sync:ing UNDO log, sync
9636 * the page cache and sync:ing the extent pages. When all this is done
9637 * AND the fragment LCP data files are sync:ed and closed then the
9638 * LCP is done.
9639 */
9640 lcp_start_complete_processing(signal, ptr);
9641 }
9642 else
9643 {
9644 jam();
9645 BackupFragmentConf * conf = (BackupFragmentConf*)signal->getDataPtrSend();
9646 conf->backupId = ptr.p->backupId;
9647 conf->backupPtr = ptr.i;
9648 conf->tableId = filePtr.p->tableId;
9649 conf->fragmentNo = filePtr.p->fragmentNo;
9650 conf->noOfRecordsLow = (Uint32)(op.noOfRecords & 0xFFFFFFFF);
9651 conf->noOfRecordsHigh = (Uint32)(op.noOfRecords >> 32);
9652 conf->noOfBytesLow = (Uint32)(op.noOfBytes & 0xFFFFFFFF);
9653 conf->noOfBytesHigh = (Uint32)(op.noOfBytes >> 32);
9654 sendSignal(ptr.p->masterRef, GSN_BACKUP_FRAGMENT_CONF, signal,
9655 BackupFragmentConf::SignalLength, JBA);
9656
9657 ptr.p->m_gsn = GSN_BACKUP_FRAGMENT_CONF;
9658 ptr.p->slaveState.setState(STARTED);
9659 }
9660 return;
9661 }
9662
9663 void
backupFragmentRef(Signal * signal,BackupFilePtr filePtr)9664 Backup::backupFragmentRef(Signal * signal, BackupFilePtr filePtr)
9665 {
9666 BackupRecordPtr ptr;
9667 c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
9668
9669 ptr.p->m_gsn = GSN_BACKUP_FRAGMENT_REF;
9670
9671 CRASH_INSERTION((10044));
9672 CRASH_INSERTION((10045));
9673
9674 BackupFragmentRef * ref = (BackupFragmentRef*)signal->getDataPtrSend();
9675 ref->backupId = ptr.p->backupId;
9676 ref->backupPtr = ptr.i;
9677 ref->nodeId = getOwnNodeId();
9678 ref->errorCode = filePtr.p->errorCode;
9679 sendSignal(ptr.p->masterRef, GSN_BACKUP_FRAGMENT_REF, signal,
9680 BackupFragmentRef::SignalLength, JBB);
9681 }
9682
9683 void
update_pause_lcp_counter(Uint32 loop_count)9684 Backup::update_pause_lcp_counter(Uint32 loop_count)
9685 {
9686 /**
9687 * We keep track of the time we are executing LCP writes on a
9688 * fairly detailed level to ensure that our real-time properties
9689 * are ok. In some cases we can loop quite extensively in TUP
9690 * looking for rows to checkpoint. This involves scanning each
9691 * row to see if it has changed since last LCP.
9692 *
9693 * We provide a loop count where scanning a row or a page is worth
9694 * 4 ticks whereas a quick check of a row to find that it isn't
9695 * eligible is only worth one tick.
9696 * The checks of rows in CHANGED
9697 * pages is optimised since it is such a common case. This scan
9698 * uses prefetching techniques to ensure that we avoid being
9699 * hindered by cache misses. In a large database it is very
9700 * likely that these scans will touch lot of memory and will
9701 * thus require prefetching to keep up. We predict that we can
9702 * scan one row in about 25 nanoseconds. Thus one loop is equal
9703 * to 100 nanoseconds. We estimate that we will be able to write
9704 * about 320 bytes per microsecond and thus one loop count is
9705 * counted equal to 32 bytes. This cost is fairly independent of
9706 * the table size and table structure since we are only checking
9707 * the header of the row.
9708 */
9709 BackupRecordPtr ptr = m_lcp_ptr;
9710 ndbassert(ptr.p->is_lcp());
9711 ptr.p->m_row_scan_counter += (loop_count / 4);
9712 ptr.p->m_pause_counter += (loop_count * 8);
9713 }
9714
/**
 * Decide whether the current LCP scan should take a real-time break
 * (or, for backups, whether to leave prio A execution). Returns true
 * when the bytes produced since the last break exceed the allowance
 * derived from the configured disk write speed and the REDO alert
 * factor.
 */
bool
Backup::check_pause_lcp_backup(BackupRecordPtr ptr,
                               bool is_lcp,
                               bool is_send_scan_next_req)
{
  /**
   * We call this function every time it is necessary to decide if
   * we should issue a real-time break in an LCP scan, we also call
   * it to decide if we are to stay at prio A level for backups.
   *
   * We keep track of the desired write speed. We try to write as
   * much as is necessary to keep up with the desired write speed
   * since the last time we had a real-time break.
   *
   * If we are lagging for some reason the desired write speed since
   * the start of the scan, we write a bit more on each real-time
   * break until we have caught up. There could be many reasons why
   * this is necessary, one could be that we had a real-time break
   * that overslept a bit.
   *
   * To avoid problems when we overslept we also maximise the amount
   * of writes we can perform in one real-time break. This maximum
   * is dependent on the ALERT level on the REDO log.
   *
   * To handle these requirements we keep track of the start time
   * of the scan (sending SCAN_FRAGREQ). We keep track of the last
   * time this method decided to issue a real-time break, it could
   * also be decided by higher level methods, in this case they
   * will call the pausing_lcp method to cause this timer and
   * the amount of bytes written to that point before entering a
   * real-time break.
   */
  if (!is_lcp)
  {
    jam();
    ndbassert(!ptr.p->is_lcp());
    /* Backup byte allowance for one prio-A stint (words * 4 bytes). */
    Uint64 max_bytes_to_write = 4 * ZMAX_WORDS_PER_SCAN_BATCH_HIGH_PRIO;
    if (ptr.p->m_num_scan_req_on_prioa == 0)
    {
      jam();
      /* No prio-A batches outstanding: nothing to pause. */
      return false;
    }
    Uint64 bytes_written_in_last_lcp = ptr.p->m_bytes_written;
    Uint64 last_recorded_bytes_written = ptr.p->m_last_recorded_bytes_written;
    ptr.p->m_last_recorded_bytes_written = bytes_written_in_last_lcp;
    Uint64 bytes_written_since_last_delay =
      bytes_written_in_last_lcp - last_recorded_bytes_written;
    return (bytes_written_since_last_delay >= max_bytes_to_write);
  }
  jam();
  ndbassert(ptr.p->is_lcp());
  /**
   * NOTE(review): despite its name this variable holds a byte count
   * (words * 4); it is compared against byte quantities below.
   */
  Uint64 max_words_to_scan = 4 * (ZMAX_WORDS_PER_SCAN_BATCH_HIGH_PRIO *
                                  m_redo_alert_factor);
  if (ptr.p->m_num_scan_req_on_prioa == 0 &&
      !is_send_scan_next_req)
  {
    jam();
    /* Running at low priority: use the smaller batch allowance. */
    max_words_to_scan = 4 * ZMAX_WORDS_PER_SCAN_BATCH_LOW_PRIO *
                        m_redo_alert_factor;
  }
  Uint64 bytes_written_in_last_lcp = ptr.p->m_bytes_written;
  Uint64 last_recorded_bytes_written = ptr.p->m_last_recorded_bytes_written;
  Uint64 pause_counter = ptr.p->m_pause_counter;
  Uint64 bytes_written_since_last_delay =
    bytes_written_in_last_lcp - last_recorded_bytes_written;
  /* CPU time spent scanning counts towards the write budget as well
   * (accumulated by update_pause_lcp_counter). */
  bytes_written_since_last_delay += pause_counter;

  /* Calculate if we are behind since start of scan */
  /* Current disk write speed is in per 100 ms */
  Uint64 desired_write_speed =
    Uint64(CURR_DISK_SPEED_CONVERSION_FACTOR_TO_SECONDS) *
    m_curr_disk_write_speed;
  NDB_TICKS now = getHighResTimer();
  NDB_TICKS start_scan = ptr.p->m_scan_start_timer;
  Uint64 micros_since_start_scan =
    NdbTick_Elapsed(start_scan, now).microSec();
  Uint64 desired_written_bytes =
    (desired_write_speed * micros_since_start_scan) /
    (Uint64(1000) * Uint64(1000));

  /* Calculate if we are behind since last rt break */
  NDB_TICKS last_delay_timer = ptr.p->m_last_delay_scan_timer;
  Uint64 micros_since_last_delay =
    NdbTick_Elapsed(last_delay_timer, now).microSec();
  Uint64 desired_bytes_in_this_rt_break =
    (micros_since_last_delay * desired_write_speed) /
    (Uint64(1000) * Uint64(1000));

  /* Behind overall: allow 25% extra in this rt break to catch up. */
  if (bytes_written_in_last_lcp < desired_written_bytes)
  {
    desired_bytes_in_this_rt_break *= Uint64(125);
    desired_bytes_in_this_rt_break /= Uint64(100);
  }
  Uint64 max_bytes_to_write = MIN(desired_bytes_in_this_rt_break,
                                  max_words_to_scan);
  /* Always allow at least one low-prio batch worth of bytes. */
  max_bytes_to_write = MAX(max_bytes_to_write,
                           (4 * ZMAX_WORDS_PER_SCAN_BATCH_LOW_PRIO));
#ifdef VM_TRACE
  if (is_send_scan_next_req ||
      (bytes_written_since_last_delay >= max_bytes_to_write))
  {
    /* Debug-only sampling: log roughly 4 out of every ~1000004 events. */
    m_debug_redo_log_count++;
    if (m_debug_redo_log_count > 1000000)
    {
      if (m_debug_redo_log_count > 1000004)
      {
        m_debug_redo_log_count = 0;
      }
      DEB_REDO_CONTROL(("(%u)check_pause_lcp: bytes_since_last_delay: %llu"
                        ", desired_bytes_in_this_break: %llu"
                        ", max_bytes_to_write: %llu"
                        ", micros_since_last_delay: %llu"
                        ", scan_row_counter: %llu",
                        instance(),
                        bytes_written_since_last_delay,
                        desired_bytes_in_this_rt_break,
                        max_bytes_to_write,
                        micros_since_last_delay,
                        ptr.p->m_row_scan_counter));
    }
  }
#endif
  return (bytes_written_since_last_delay >= max_bytes_to_write);
}
9840
9841 void
pausing_lcp(Uint32 place,Uint32 val)9842 Backup::pausing_lcp(Uint32 place, Uint32 val)
9843 {
9844 /* Pause LCP execution, record current time and bytes written */
9845 BackupRecordPtr ptr = m_lcp_ptr;
9846 ndbassert(ptr.p->is_lcp());
9847 Uint64 bytes_written_in_last_lcp =
9848 ptr.p->m_bytes_written;
9849 NDB_TICKS now = getHighResTimer();
9850 ptr.p->m_last_recorded_bytes_written = bytes_written_in_last_lcp;
9851 ptr.p->m_last_delay_scan_timer = now;
9852 ptr.p->m_pause_counter = 0;
9853 ptr.p->m_num_scan_req_on_prioa = 0;
9854 #ifdef VM_TRACE
9855 if (m_debug_redo_log_count > 1000000)
9856 {
9857 DEB_REDO_CONTROL(("(%u)pausing_lcp from place: %u, val: %u",
9858 instance(),
9859 place,
9860 val));
9861 }
9862 #endif
9863 }
9864
/**
 * Decide how to continue a running LCP/backup fragment scan:
 * - on a recorded error, close the scan (SCAN_NEXTREQ with close flag),
 * - if a new batch may start (check_new_scan), send SCAN_NEXTREQ to
 *   LQH, possibly at prio A and/or with bounded delay to balance
 *   checkpoint progress against real-time behavior,
 * - otherwise wait for disk buffer capacity via
 *   CONTINUEB(BUFFER_FULL_SCAN).
 *
 * @param after_wait  true when re-entered after a buffer-full wait;
 *                    passed through to check_new_scan
 */
void
Backup::checkScan(Signal* signal,
                  BackupRecordPtr ptr,
                  BackupFilePtr filePtr,
                  bool after_wait)
{
  OperationRecord & op = filePtr.p->operation;
  BlockReference lqhRef = 0;
  if (ptr.p->is_lcp())
  {
    /* LCP scans always run in the co-located LQH instance. */
    lqhRef = calcInstanceBlockRef(DBLQH);
  }
  else
  {
    /* Backup scan: address the LQH instance owning this fragment. */
    TablePtr tabPtr;
    ndbrequire(findTable(ptr, tabPtr, filePtr.p->tableId));
    FragmentPtr fragPtr;
    tabPtr.p->fragments.getPtr(fragPtr, filePtr.p->fragmentNo);
    const Uint32 instanceKey = fragPtr.p->lqhInstanceKey;
    lqhRef = numberToRef(DBLQH, instanceKey, getOwnNodeId());
  }
  if (check_error(ptr, filePtr))
  {
    jam();
    /**
     * Close scan
     */
    if (ptr.p->is_lcp())
    {
      DEB_LCP(("(%u) Close LCP scan after receiving error: %u",
               instance(),
               filePtr.p->errorCode));
    }
    op.closeScan();
    ScanFragNextReq * req = (ScanFragNextReq *)signal->getDataPtrSend();
    req->senderData = filePtr.i;
    req->requestInfo = 0;
    ScanFragNextReq::setCloseFlag(req->requestInfo, 1);
    req->transId1 = 0;
    req->transId2 = (BACKUP << 20) + (getOwnNodeId() << 8);
    sendSignal(lqhRef, GSN_SCAN_NEXTREQ, signal,
               ScanFragNextReq::SignalLength, JBB);
    return;
  }//if
  if (check_new_scan(ptr, op, after_wait))
  {
    jam();

    ScanFragNextReq * req = (ScanFragNextReq *)signal->getDataPtrSend();
    req->senderData = filePtr.i;
    req->requestInfo = 0;
    req->transId1 = 0;
    req->transId2 = (BACKUP << 20) + (getOwnNodeId() << 8);
    req->batch_size_rows= ZRESERVED_SCAN_BATCH_SIZE;
    req->batch_size_bytes= 0;

    /* Error insert 10032: delay every batch request by 100 ms. */
    if(ERROR_INSERTED(10032))
      sendSignalWithDelay(lqhRef, GSN_SCAN_NEXTREQ, signal,
                          100, ScanFragNextReq::SignalLength);
    else if(ERROR_INSERTED(10033))
    {
      /* Error insert 10033: one long delay, then abort the backup
       * and fall back to behaving as 10032. */
      SET_ERROR_INSERT_VALUE(10032);
      sendSignalWithDelay(lqhRef, GSN_SCAN_NEXTREQ, signal,
                          10000, ScanFragNextReq::SignalLength);

      /* NOTE(review): this local shadows the ptr parameter; it is
       * fetched from filePtr.p->backupPtr and so presumably denotes
       * the same backup record — confirm. */
      BackupRecordPtr ptr;
      c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
      AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
      ord->backupId = ptr.p->backupId;
      ord->backupPtr = ptr.i;
      ord->requestType = AbortBackupOrd::FileOrScanError;
      ord->senderData= ptr.i;
      sendSignal(ptr.p->masterRef, GSN_ABORT_BACKUP_ORD, signal,
                 AbortBackupOrd::SignalLength, JBB);
    }
#ifdef ERROR_INSERT
    else if (ERROR_INSERTED(10042) && filePtr.p->tableId ==c_error_insert_extra)
    {
      /* Error insert 10042: short delay for one specific table. */
      sendSignalWithDelay(lqhRef, GSN_SCAN_NEXTREQ, signal,
                          10, ScanFragNextReq::SignalLength);
    }
#endif
    else
    {
      /**
       * We send all interactions with bounded delay, this means that we will
       * wait for at most 128 signals before the signal is put into the A-level
       * job buffer. After this we will execute at A-level until we arrive
       * back with a SCAN_FRAGCONF. After SCAN_FRAGCONF we get back to here
       * again, so this means we will execute at least 16 rows before any
       * B-level signals are allowed again. So this means that the LCP will
       * scan at least 16 rows per 128 signals even at complete overload.
       *
       * We will even send yet one more row of 16 rows at A-priority level
       * per 100 B-level signals if we have difficulties in even meeting the
       * minimum desired checkpoint level.
       */
      JobBufferLevel prio_level = JBB;
      bool file_buf_contains_min_write_size = false;
      if (check_scan_if_raise_prio(signal, ptr))
      {
        OperationRecord & op = filePtr.p->operation;
        file_buf_contains_min_write_size =
          check_min_buf_size(ptr, op);

        ScanFragNextReq::setPrioAFlag(req->requestInfo, 1);
        if (!file_buf_contains_min_write_size &&
            !check_pause_lcp_backup(ptr))
        {
          jam();
          /**
           * There are three reasons why we won't continue executing at
           * prio A level.
           *
           * 1) The last execution at prio A generated more than the max words
           *    per A-level batch, so we get back to a bounded delay signal.
           *
           * 2) We already have a buffer ready to be sent to the file
           *    system. No reason to execute at a very high priority simply
           *    to fill buffers not waiting to be filled. If it is an LCP and
           *    we are reaching some limit we will be more active in filling
           *    up buffers.
           *
           * We will continue a bit more if we have set m_redo_alert_factor
           * higher than 1. We will do this in very critical situations when we
           * want to ensure that LCP writes gets higher priority. The redo
           * alert factor is always 1 for backups since there is no need of
           * urgency to complete backups. It is enough to manage backups
           * properly.
           */
          /* Continue at prio A level 16 more rows */
          ptr.p->m_num_scan_req_on_prioa++;
          prio_level = JBA;
        }
      }
      if (lqhRef == calcInstanceBlockRef(DBLQH) && (prio_level == JBB))
      {
        if (ptr.p->is_lcp())
        {
          /* Record the real-time break point before yielding (place 1). */
          pausing_lcp(1,
                      (2*(ScanFragNextReq::getPrioAFlag(req->requestInfo))) +
                      file_buf_contains_min_write_size);
        }
        sendSignalWithDelay(lqhRef, GSN_SCAN_NEXTREQ, signal,
                            BOUNDED_DELAY, ScanFragNextReq::SignalLength);
      }
      else
      {
        /* Cannot send delayed signals to other threads. */
        ndbrequire(!ptr.p->is_lcp() || prio_level == JBA);
        sendSignal(lqhRef,
                   GSN_SCAN_NEXTREQ,
                   signal,
                   ScanFragNextReq::SignalLength,
                   prio_level);
      }
      /*
        check if it is time to report backup status
      */
      if (!ptr.p->is_lcp())
      {
        jam();
        checkReportStatus(signal, ptr);
      }
    }
    return;
  }//if
  /* No buffer space for a new batch: pause and poll for capacity. */
  if (ptr.p->is_lcp())
  {
    pausing_lcp(2,0);
    DEB_EXTRA_LCP(("(%u)newScan false in checkScan", instance()));
  }
  signal->theData[0] = BackupContinueB::BUFFER_FULL_SCAN;
  signal->theData[1] = filePtr.i;
  sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                      WaitDiskBufferCapacityMillis, 2);
}
10042
10043 void
execFSAPPENDREF(Signal * signal)10044 Backup::execFSAPPENDREF(Signal* signal)
10045 {
10046 jamEntry();
10047
10048 FsRef * ref = (FsRef *)signal->getDataPtr();
10049
10050 const Uint32 filePtrI = ref->userPointer;
10051 const Uint32 errCode = ref->errorCode;
10052
10053 BackupFilePtr filePtr;
10054 c_backupFilePool.getPtr(filePtr, filePtrI);
10055
10056 filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_FILE_THREAD;
10057 filePtr.p->errorCode = errCode;
10058
10059 CRASH_INSERTION(10044);
10060 CRASH_INSERTION(10045);
10061 BackupRecordPtr ptr;
10062 c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
10063 if (ptr.p->is_lcp())
10064 {
10065 /**
10066 * Log in this case for LCPs, Backups should be able to
10067 * handle out of disk space. LCPs could potentially survive for
10068 * a while, but will eventually crash or they will hit the
10069 * infamous 410 condition.
10070 */
10071 g_eventLogger->info("LCP got FSAPPENDREF, serious error: error code: %u",
10072 errCode);
10073 }
10074 checkFile(signal, filePtr);
10075 }
10076
10077 void
execFSAPPENDCONF(Signal * signal)10078 Backup::execFSAPPENDCONF(Signal* signal)
10079 {
10080 jamEntry();
10081
10082 CRASH_INSERTION((10018));
10083
10084 //FsConf * conf = (FsConf*)signal->getDataPtr();
10085 const Uint32 filePtrI = signal->theData[0]; //conf->userPointer;
10086 const Uint32 bytes = signal->theData[1]; //conf->bytes;
10087
10088 BackupFilePtr filePtr;
10089 c_backupFilePool.getPtr(filePtr, filePtrI);
10090
10091 OperationRecord & op = filePtr.p->operation;
10092
10093 op.dataBuffer.updateReadPtr(bytes >> 2);
10094
10095 checkFile(signal, filePtr);
10096 }
10097
10098 /*
10099 This routine handles two problems with writing to disk during local
10100 checkpoints and backups. The first problem is that we need to limit
10101 the writing to ensure that we don't use too much CPU and disk resources
10102 for backups and checkpoints. For LCPs we use an adaptive algorithm that
10103 changes the current disk write speed based on how much checkpointing we
10104 need to do in order to not run out of REDO log.
10105 Backup writes are added to the total disk write speed we control, but
10106 backup writes are also separately controlled to avoid that backups take
10107 up resources that are needed by the REDO log.
10108
10109 The second problem is that in Linux we can get severe problems if we
10110 write very much to the disk without synching. In the worst case we
10111 can have Gigabytes of data in the Linux page cache before we reach
10112 the limit of how much we can write. If this happens the performance
10113 will drop significantly when we reach this limit since the Linux flush
10114 daemon will spend a few minutes on writing out the page cache to disk.
  To avoid this we ensure that a file never has more than a certain
  amount of data outstanding before synch. This limit is also
  configurable.
10118 */
10119 bool
ready_to_write(bool ready,Uint32 sz,bool eof,BackupFile * fileP,BackupRecord * ptrP)10120 Backup::ready_to_write(bool ready,
10121 Uint32 sz,
10122 bool eof,
10123 BackupFile *fileP,
10124 BackupRecord *ptrP)
10125 {
10126 #if 0
10127 ndbout << "ready_to_write: ready = " << ready << " eof = " << eof;
10128 ndbout << " sz = " << sz << endl;
10129 ndbout << "words this period = " << m_words_written_this_period;
10130 ndbout << "backup words this period = "
10131 << m_backup_words_written_this_period;
10132 ndbout << endl << "overflow disk write = " << m_overflow_disk_write;
10133 ndbout << endl << "backup overflow disk write = "
10134 << m_backup_overflow_disk_write;
10135 ndbout << endl << "Current Millisecond is = ";
10136 ndbout << NdbTick_CurrentMillisecond() << endl;
10137 #endif
10138
10139 if (ERROR_INSERTED(10043) && eof)
10140 {
10141 /* Block indefinitely without closing the file */
10142 jam();
10143 return false;
10144 }
10145
10146 if ((ready || eof) &&
10147 m_words_written_this_period <= m_curr_disk_write_speed &&
10148 (ptrP->is_lcp() ||
10149 m_backup_words_written_this_period <= m_curr_backup_disk_write_speed))
10150 {
10151 /*
10152 We have a buffer ready to write or we have reached end of
10153 file and thus we must write the last before closing the
10154 file.
10155 We have already checked that we are allowed to write at this
10156 moment. We only worry about history of last 100 milliseconds.
10157 What happened before that is of no interest since a disk
10158 write that was issued more than 100 milliseconds should be
10159 completed by now.
10160 */
10161 jam();
10162 int overflow;
10163 m_monitor_words_written+= sz;
10164 m_words_written_this_period += sz;
10165 overflow = m_words_written_this_period - m_curr_disk_write_speed;
10166 if (overflow > 0)
10167 m_overflow_disk_write = overflow;
10168 if (!ptrP->is_lcp())
10169 {
10170 m_backup_monitor_words_written += sz;
10171 m_backup_words_written_this_period += sz;
10172 overflow = m_backup_words_written_this_period -
10173 m_curr_backup_disk_write_speed;
10174 if (overflow > 0)
10175 m_backup_overflow_disk_write = overflow;
10176 }
10177 #if 0
10178 ndbout << "Will write with " << endl;
10179 ndbout << endl;
10180 #endif
10181 return true;
10182 }
10183 else
10184 {
10185 #if 0
10186 ndbout << "Will not write now" << endl << endl;
10187 #endif
10188 jam();
10189 return false;
10190 }
10191 }
10192
/**
 * File thread driver: fetch the next ready chunk from the file's data
 * buffer and append it to the file via NDBFS, subject to the disk
 * write speed limits enforced by ready_to_write(). While throttled it
 * reschedules itself with CONTINUEB(BUFFER_UNDERFLOW). When the buffer
 * reports eof and no (writable) data remains, the file thread stops
 * and the file is closed.
 */
void
Backup::checkFile(Signal* signal, BackupFilePtr filePtr)
{

#ifdef DEBUG_ABORT
  //  ndbout_c("---- check file filePtr.i = %u", filePtr.i);
#endif

  OperationRecord & op = filePtr.p->operation;
  Uint32 *tmp = NULL;
  Uint32 sz = 0;
  bool eof = FALSE;
  /* ready == true when a full chunk is available for writing. */
  bool ready = op.dataBuffer.getReadPtr(&tmp, &sz, &eof);

#if 0
  ndbout << "Ptr to data = " << hex << tmp << endl;
#endif
  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  if (ERROR_INSERTED(10036))
  {
    jam();
    /* Error insert: fail the file thread with error 2810. */
    filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_FILE_THREAD;
    filePtr.p->errorCode = 2810;
    ptr.p->setErrorCode(2810);

    if(ptr.p->m_gsn == GSN_STOP_BACKUP_REQ)
    {
      jam();
      closeFile(signal, ptr, filePtr);
    }
    return;
  }

  if(filePtr.p->errorCode != 0)
  {
    jam();
    /* Propagate the file error to the backup record. */
    ptr.p->setErrorCode(filePtr.p->errorCode);

    if(ptr.p->m_gsn == GSN_STOP_BACKUP_REQ)
    {
      jam();
      closeFile(signal, ptr, filePtr);
    }

    if (ptr.p->is_lcp())
    {
      jam();
      /* Close file with error - will delete it */
      closeFile(signal, ptr, filePtr);
    }

    return;
  }

  if (!ready_to_write(ready,
                      sz,
                      eof,
                      filePtr.p,
                      ptr.p))
  {
    jam();
    /* Throttled or no data yet: poll again after a short delay. */
    signal->theData[0] = BackupContinueB::BUFFER_UNDERFLOW;
    signal->theData[1] = filePtr.i;
    sendSignalWithDelay(reference(), GSN_CONTINUEB, signal,
                        WaitDiskBufferCapacityMillis, 2);
    return;
  }
  else if (sz > 0)
  {
    jam();
#ifdef ERROR_INSERT
    /* Test APPENDREF handling */
    if (filePtr.p->fileType == BackupFormat::DATA_FILE)
    {
      if (ERROR_INSERTED(10045))
      {
        ndbout_c("BF_SCAN_THREAD = %u",
                 (filePtr.p->m_flags & BackupFile::BF_SCAN_THREAD));
      }

      if ((ERROR_INSERTED(10044) &&
           !(filePtr.p->m_flags & BackupFile::BF_SCAN_THREAD)) ||
          (ERROR_INSERTED(10045) &&
           (filePtr.p->m_flags & BackupFile::BF_SCAN_THREAD)))
      {
        jam();
        ndbout_c("REFing on append to data file for table %u, fragment %u, "
                 "BF_SCAN_THREAD running : %u",
                 filePtr.p->tableId,
                 filePtr.p->fragmentNo,
                 filePtr.p->m_flags & BackupFile::BF_SCAN_THREAD);
        FsRef* ref = (FsRef *)signal->getDataPtrSend();
        ref->userPointer = filePtr.i;
        ref->errorCode = FsRef::fsErrInvalidParameters;
        ref->osErrorCode = ~0;
        /* EXEC DIRECT to avoid change in BF_SCAN_THREAD state */
        EXECUTE_DIRECT(BACKUP, GSN_FSAPPENDREF, signal,
                       3);
        return;
      }
    }
#endif

    const bool write_to_datafile = (filePtr.i == ptr.p->dataFilePtr[0]);
    /**
     * If O_DIRECT is enabled, the write should be done in 128-word chunks.
     * For O_DIRECT writes of less than 128 words, we skip the writes when
     * we have reached end of file and we are about to abort the backup (and
     * will not be interested in its results). We avoid writing in this case
     * since we don't want to handle errors for O_DIRECT calls.
     * However we only avoid this write for data files since CTL files and
     * LOG files never use O_DIRECT. Also no need to avoid write if we don't
     * use O_DIRECT at all.
     */
    const bool skip_write = (c_defaults.m_o_direct && // O_DIRECT write
                             write_to_datafile && // to datafile
                             !ptr.p->is_lcp() && // during backup
                             eof && // last chunk of data to write to file
                             (sz % 128 != 0) && // too small for O_DIRECT
                             (ptr.p->slaveState.getState() == STOPPING) &&
                             ptr.p->checkError()); // backup to be aborted

    if(likely(!skip_write))
    {
      jam();
      /* File offsets are expressed in words from the page area start. */
      ndbassert((Uint64(tmp - c_startOfPages) >> 32) == 0); // 4Gb buffers!
      FsAppendReq * req = (FsAppendReq *)signal->getDataPtrSend();
      req->filePointer = filePtr.p->filePointer;
      req->userPointer = filePtr.i;
      req->userReference = reference();
      req->varIndex = 0;
      req->offset = Uint32(tmp - c_startOfPages); // 4Gb buffers!
      req->size = sz;
      req->synch_flag = 0;

      /* FSAPPENDCONF/FSAPPENDREF continues the file thread. */
      sendSignal(NDBFS_REF, GSN_FSAPPENDREQ, signal,
                 FsAppendReq::SignalLength, JBA);
      return;
    }
  }

  /* eof with nothing (writable) left: terminate the file thread. */
  Uint32 flags = filePtr.p->m_flags;
  filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_FILE_THREAD;

  ndbrequire(flags & BackupFile::BF_OPEN);
  ndbrequire(flags & BackupFile::BF_FILE_THREAD);

  if (ptr.p->is_lcp())
  {
    jam();
    /* LCP close takes extra flag arguments; see closeFile for their
     * meaning. */
    closeFile(signal, ptr, filePtr, false, false);
  }
  else
  {
    jam();
    closeFile(signal, ptr, filePtr);
  }
  return;
}
10354
10355
10356 /****************************************************************************
10357 *
 * Slave functionality: Perform logging
10359 *
10360 ****************************************************************************/
10361 void
execBACKUP_TRIG_REQ(Signal * signal)10362 Backup::execBACKUP_TRIG_REQ(Signal* signal)
10363 {
10364 /*
10365 TUP asks if this trigger is to be fired on this node.
10366 */
10367 TriggerPtr trigPtr;
10368 TablePtr tabPtr;
10369 FragmentPtr fragPtr;
10370 Uint32 trigger_id = signal->theData[0];
10371 Uint32 frag_id = signal->theData[1];
10372 Uint32 result;
10373
10374 jamEntry();
10375
10376 c_triggerPool.getPtr(trigPtr, trigger_id);
10377
10378 c_tablePool.getPtr(tabPtr, trigPtr.p->tab_ptr_i);
10379 tabPtr.p->fragments.getPtr(fragPtr, frag_id);
10380 if (fragPtr.p->node != getOwnNodeId()) {
10381
10382 jam();
10383 result = ZFALSE;
10384 } else {
10385 jam();
10386 result = ZTRUE;
10387 }//if
10388 signal->theData[0] = result;
10389 }
10390
/**
 * Reserve sz words in the log buffer of the trigger's operation and
 * initialize a log entry header there (Length = 0, TableId and
 * TriggerEvent stored in network byte order). On buffer-full — or
 * error insert 10030 — an ABORT_BACKUP_ORD(LogBufferFull) is sent to
 * the backup master, the caller's static TrigAttrInfo signal words
 * are preserved across the send, and 0 is returned.
 */
BackupFormat::LogFile::LogEntry *
Backup::get_log_buffer(Signal* signal,
                       TriggerPtr trigPtr, Uint32 sz)
{
  Uint32 * dst;
  if(ERROR_INSERTED(10030))
  {
    jam();
    /* Error insert: simulate a full log buffer. */
    dst = 0;
  }
  else
  {
    jam();
    FsBuffer & buf = trigPtr.p->operation->dataBuffer;
    ndbrequire(sz <= buf.getMaxWrite());
    if (unlikely(!buf.getWritePtr(&dst, sz)))
    {
      jam();
      dst = 0;
    }
  }

  if (unlikely(dst == 0))
  {
    /**
     * Buffer is full: abort the backup. Save and later restore the
     * signal's static TrigAttrInfo part, since building
     * ABORT_BACKUP_ORD overwrites the signal data area that the
     * caller still needs.
     */
    Uint32 save[TrigAttrInfo::StaticLength];
    memcpy(save, signal->getDataPtr(), 4*TrigAttrInfo::StaticLength);
    BackupRecordPtr ptr;
    c_backupPool.getPtr(ptr, trigPtr.p->backupPtr);
    trigPtr.p->errorCode = AbortBackupOrd::LogBufferFull;
    AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
    ord->backupId = ptr.p->backupId;
    ord->backupPtr = ptr.i;
    ord->requestType = AbortBackupOrd::LogBufferFull;
    ord->senderData= ptr.i;
    sendSignal(ptr.p->masterRef, GSN_ABORT_BACKUP_ORD, signal,
               AbortBackupOrd::SignalLength, JBB);

    memcpy(signal->getDataPtrSend(), save, 4*TrigAttrInfo::StaticLength);
    return 0;
  }//if

  BackupFormat::LogFile::LogEntry * logEntry =
    (BackupFormat::LogFile::LogEntry *)dst;
  logEntry->Length = 0;
  logEntry->TableId = htonl(trigPtr.p->tableId);

  /* Map the internal event code to the on-disk trigger event type. */
  if(trigPtr.p->event==0)
    logEntry->TriggerEvent= htonl(TriggerEvent::TE_INSERT);
  else if(trigPtr.p->event==1)
    logEntry->TriggerEvent= htonl(TriggerEvent::TE_UPDATE);
  else if(trigPtr.p->event==2)
    logEntry->TriggerEvent= htonl(TriggerEvent::TE_DELETE);
  else {
    ndbout << "Bad Event: " << trigPtr.p->event << endl;
    ndbabort();
  }

  return logEntry;
}
10450
/**
 * Receive TRIG_ATTRINFO for a backup log trigger and append the
 * payload to the trigger's current log entry. For UNDO logging
 * (USE_UNDO_LOG) after values are discarded; for REDO logging before
 * values are discarded. A new log entry of MAX_SIZE is allocated on
 * first use; returns silently if the trigger already has a pending
 * error or the log buffer is full (get_log_buffer then initiates an
 * abort of the backup).
 */
void
Backup::execTRIG_ATTRINFO(Signal* signal) {
  jamEntry();

  CRASH_INSERTION((10019));

  TrigAttrInfo * trg = (TrigAttrInfo*)signal->getDataPtr();

  TriggerPtr trigPtr;
  c_triggerPool.getPtr(trigPtr, trg->getTriggerId());
  ndbrequire(trigPtr.p->event != ILLEGAL_TRIGGER_ID); // Online...

  if(trigPtr.p->errorCode != 0) {
    jam();
    /* Trigger already failed: silently drop further attribute info. */
    return;
  }//if

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, trigPtr.p->backupPtr);

  if(ptr.p->flags & BackupReq::USE_UNDO_LOG) {
    if(trg->getAttrInfoType() == TrigAttrInfo::AFTER_VALUES) {
      jam();
      /**
       * Backup is doing UNDO logging and doesn't need after values
       */
      return;
    }//if
  }
  else {
    if(trg->getAttrInfoType() == TrigAttrInfo::BEFORE_VALUES) {
      jam();
      /**
       * Backup is doing REDO logging and doesn't need before values
       */
      return;
    }//if
  }

  BackupFormat::LogFile::LogEntry * logEntry = trigPtr.p->logEntry;
  if(logEntry == 0)
  {
    jam();
    /* First TRIG_ATTRINFO of this operation: allocate a log entry. */
    logEntry = get_log_buffer(signal,
                              trigPtr,
                              BackupFormat::LogFile::LogEntry::MAX_SIZE);
    trigPtr.p->logEntry = logEntry;
    if (unlikely(logEntry == 0))
    {
      jam();
      /* Log buffer full; get_log_buffer has requested an abort. */
      return;
    }
  } else {
    /* Continuation signal must belong to the same table. */
    ndbrequire(logEntry->TableId == htonl(trigPtr.p->tableId));
//    ndbrequire(logEntry->TriggerEvent == htonl(trigPtr.p->event));
  }//if

  /* Append this signal's payload words after the data logged so far. */
  const Uint32 pos = logEntry->Length;
  const Uint32 dataLen = signal->length() - TrigAttrInfo::StaticLength;
  memcpy(&logEntry->Data[pos], trg->getData(), dataLen << 2);

  logEntry->Length = pos + dataLen;
}
10514
void
Backup::execFIRE_TRIG_ORD(Signal* signal)
{
  /**
   * FIRE_TRIG_ORD is sent by the local TUP block each time a backup
   * trigger fires on a committed row change.  This routine turns the
   * row change into one entry in the backup log file: a REDO-style
   * entry (primary key + after values) for a normal backup, or an
   * UNDO-style entry (primary key + before values) when the backup
   * runs with BackupReq::USE_UNDO_LOG.
   */
  jamEntry();

  // Long signals may arrive fragmented; wait until fully assembled.
  if (!assembleFragments(signal))
  {
    jam();
    return;
  }

  FireTrigOrd* trg = (FireTrigOrd*)signal->getDataPtr();

  const Uint32 gci = trg->getGCI();
  const Uint32 trI = trg->getTriggerId();
  const Uint32 fragId = trg->fragId;

  TriggerPtr trigPtr;
  c_triggerPool.getPtr(trigPtr, trI);

  // The trigger record must still be in use (event is reset on cleanup).
  ndbrequire(trigPtr.p->event != ILLEGAL_TRIGGER_ID);

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, trigPtr.p->backupPtr);

  // Trigger already hit an error earlier: drop this event and free
  // any attached sections.
  if(trigPtr.p->errorCode != 0) {
    jam();
    SectionHandle handle(this, signal);
    releaseSections(handle);
    return;
  }//if

  if (isNdbMtLqh())
  {
    jam();
    /* This is the decision point for including
     * this row change in the log file on ndbmtd
     */
    TablePtr tabPtr;
    c_tablePool.getPtr(tabPtr, trigPtr.p->tab_ptr_i);
    FragmentPtr fragPtr;
    tabPtr.p->fragments.getPtr(fragPtr, fragId);
    if (fragPtr.p->node != getOwnNodeId())
    {
      // This node does not own the fragment: the owning node logs it.
      jam();
      trigPtr.p->logEntry = 0;
      SectionHandle handle(this,signal);
      releaseSections(handle);
      return;
    }
  }

  if (signal->getNoOfSections())
  {
    jam();
    SectionHandle handle(this,signal);
    SegmentedSectionPtr dataPtr[3];
    handle.getSection(dataPtr[0], 0);
    handle.getSection(dataPtr[1], 1);
    handle.getSection(dataPtr[2], 2);
    /**
     * dataPtr[0] : Primary key info
     * dataPtr[1] : Before values
     * dataPtr[2] : After values
     */

    // Add one word to get_log_buffer for potential gci info stored at end.
    const Uint32 log_entry_words =
      1 /* length word */ +
      BackupFormat::LogFile::LogEntry::HEADER_LENGTH_WORDS +
      1 /* gci_word */;

    // Backup is doing UNDO logging and need before values
    if(ptr.p->flags & BackupReq::USE_UNDO_LOG)
    {
      jam();
      // Add one word to get_log_buffer for logEntry length info stored at end.
      trigPtr.p->logEntry = get_log_buffer(signal,
                                           trigPtr,
                                           log_entry_words +
                                           dataPtr[0].sz +
                                           dataPtr[1].sz +
                                           1);
      if (unlikely(trigPtr.p->logEntry == 0))
      {
        // Log buffer full: give up on this entry; error handling is
        // done inside get_log_buffer.
        jam();
        releaseSections(handle);
        return;
      }
      copy(trigPtr.p->logEntry->Data, dataPtr[0]);
      copy(trigPtr.p->logEntry->Data+dataPtr[0].sz, dataPtr[1]);
      trigPtr.p->logEntry->Length = dataPtr[0].sz + dataPtr[1].sz;
    }
    // Backup is doing REDO logging and need after values
    else
    {
      jam();
      trigPtr.p->logEntry = get_log_buffer(signal,
                                           trigPtr,
                                           log_entry_words +
                                           dataPtr[0].sz +
                                           dataPtr[2].sz);
      if (unlikely(trigPtr.p->logEntry == 0))
      {
        jam();
        releaseSections(handle);
        return;
      }
      copy(trigPtr.p->logEntry->Data, dataPtr[0]);
      copy(trigPtr.p->logEntry->Data+dataPtr[0].sz, dataPtr[2]);
      trigPtr.p->logEntry->Length = dataPtr[0].sz + dataPtr[2].sz;
    }

    releaseSections(handle);
  }

  // The log entry was either allocated above or earlier while
  // collecting TRIG_ATTRINFO for this trigger.
  ndbrequire(trigPtr.p->logEntry != 0);
  Uint32 len = trigPtr.p->logEntry->Length;
  trigPtr.p->logEntry->FragId = htonl(fragId);

  /* Redo logs are always read from file start to file end, so
   * GCI content can be optimised out. If a set of N consecutive
   * log entries have the same GCI, the GCI is written only in the
   * first log entry of the set, while the remaining entries do
   * not contain a GCP. So an entry is written with a GCP only
   * when its GCP differs from the previous entry.
   * This cannot be done for undo logs since undo logs are read in
   * reverse, from file end to file start.
   */
  if ((ptr.p->flags & BackupReq::USE_UNDO_LOG) || (gci != ptr.p->currGCP))
  {
    jam();
    // Mark "GCI present" in the TriggerEvent word and append the GCI.
    trigPtr.p->logEntry->TriggerEvent|= htonl(0x10000);
    trigPtr.p->logEntry->Data[len] = htonl(gci);
    len++;
    ptr.p->currGCP = gci;
  }

  // datalen = data words (incl. optional GCI); len then grows to cover
  // the header words, and is stored in network byte order.
  Uint32 datalen = len;
  len += BackupFormat::LogFile::LogEntry::HEADER_LENGTH_WORDS;
  trigPtr.p->logEntry->Length = htonl(len);

  if(ptr.p->flags & BackupReq::USE_UNDO_LOG)
  {
    jam();
    /* keep the length at both the end of logEntry and ->logEntry variable
       The total length of logEntry is len + 2
    */
    trigPtr.p->logEntry->Data[datalen] = htonl(len);
  }

  // Words consumed in the data buffer: leading length word + entry,
  // plus the trailing length word for UNDO logs.
  Uint32 entryLength = len + 1;
  if (ptr.p->flags & BackupReq::USE_UNDO_LOG)
  {
    jam();
    entryLength++;
  }

  ndbrequire(entryLength <= trigPtr.p->operation->dataBuffer.getMaxWrite());
  trigPtr.p->operation->dataBuffer.updateWritePtr(entryLength);
  trigPtr.p->logEntry = 0;

  {
    // Update per-operation statistics (bytes/records written to the log).
    const Uint32 entryByteLength = entryLength << 2;
    trigPtr.p->operation->noOfBytes     += entryByteLength;
    trigPtr.p->operation->m_bytes_total += entryByteLength;
    trigPtr.p->operation->noOfRecords     += 1;
    trigPtr.p->operation->m_records_total += 1;
  }
}
10685
10686 void
sendAbortBackupOrd(Signal * signal,BackupRecordPtr ptr,Uint32 requestType)10687 Backup::sendAbortBackupOrd(Signal* signal, BackupRecordPtr ptr,
10688 Uint32 requestType)
10689 {
10690 jam();
10691 AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
10692 ord->backupId = ptr.p->backupId;
10693 ord->backupPtr = ptr.i;
10694 ord->requestType = requestType;
10695 ord->senderData= ptr.i;
10696 NodePtr node;
10697 Uint32 receiverInstance = instanceKey(ptr); // = BackupProxy for mt-backup
10698
10699 if((ptr.p->fragWorkers[getOwnNodeId()].count() == 1)
10700 && (ptr.p->fragWorkers[getOwnNodeId()].find_first() == instance()))
10701 {
10702 // All signal-sender functions in abort protocol detect
10703 // send-to-self bitmask settings and send signals accordingly.
10704 ptr.p->senderRef = reference();
10705 receiverInstance = instance();
10706 }
10707
10708 for(c_nodes.first(node); node.i != RNIL; c_nodes.next(node)) {
10709 jam();
10710 const Uint32 nodeId = node.p->nodeId;
10711 if(node.p->alive && ptr.p->nodes.get(nodeId)) {
10712 jam();
10713 BlockReference ref = numberToRef(BACKUP, receiverInstance, nodeId);
10714 sendSignal(ref, GSN_ABORT_BACKUP_ORD, signal,
10715 AbortBackupOrd::SignalLength, JBB);
10716 }//if
10717 }//for
10718 }
10719
10720 /*****************************************************************************
10721 *
10722 * Slave functionallity: Stop backup
10723 *
10724 *****************************************************************************/
void
Backup::execSTOP_BACKUP_REQ(Signal* signal)
{
  /**
   * Start the STOP_BACKUP phase on this slave/worker.  Records the
   * GCP interval covered by the backup and then drops the backup
   * triggers — but only after ensuring that every trigger that fired
   * before this point has been added to the backup log.
   */
  jamEntry();
  StopBackupReq * req = (StopBackupReq*)signal->getDataPtr();

  CRASH_INSERTION((10020));

  const Uint32 ptrI = req->backupPtr;
  //const Uint32 backupId = req->backupId;
  const Uint32 startGCP = req->startGCP;
  const Uint32 stopGCP = req->stopGCP;

  /**
   * At least one GCP must have passed
   */
  ndbrequire(stopGCP > startGCP);

  /**
   * Get backup record
   */
  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, ptrI);

  ptr.p->startGCP= startGCP;
  ptr.p->stopGCP= stopGCP;

  if (MT_BACKUP_FLAG(ptr.p->flags))
  {
    /**
     * In multithreaded backup, each Backup Worker sends
     * trigger-drop and trigger-firing signals only to its
     * local TUP. No sync is needed to ensure ordering of
     * trigger signals wrt STOP_BACKUP_REQ, since the
     * signals are added in order to the signal queue.
     */
    Uint32 retVal = 0;
    startDropTrig_synced(signal, ptrI, retVal);
  }
  else
  {
    /**
     * Ensure that any in-flight changes are
     * included in the backup log before
     * dropping the triggers
     *
     * This is necessary as the trigger-drop
     * signals are routed :
     *
     *   Backup Worker 1 <-> Proxy <-> TUP Worker 1..n
     *
     * While the trigger firing signals are
     * routed :
     *
     *   TUP Worker 1..n -> Backup Worker 1
     *
     * So the arrival of signal-drop acks
     * does not imply that all fired
     * triggers have been seen.
     *
     * Backup Worker 1
     *
     *    | SYNC_PATH_REQ
     *    V
     *  TUP Proxy
     *    |   |  ...  |
     *    V   V       V
     *    1   2  ...  n (Workers)
     *    |   |       |
     *    |   |       |
     *
     * Backup Worker 1
     */
    Uint32 path[] = { DBTUP, 0 };
    Callback cb = { safe_cast(&Backup::startDropTrig_synced), ptrI };
    synchronize_path(signal,
                     path,
                     cb);
    // Error-insert 10049 (master only): simulate a LogBufferFull abort
    // racing with the stop sequence.
    if (ERROR_INSERTED(10049) && (ptr.p->masterRef == reference()))
    {
      AbortBackupOrd *ord = (AbortBackupOrd*)signal->getDataPtrSend();
      ord->backupId = ptr.p->backupId;
      ord->backupPtr = ptr.i;
      ord->requestType = AbortBackupOrd::LogBufferFull;
      ord->senderData= ptr.i;
      execABORT_BACKUP_ORD(signal);
    }
  }
}
10814
10815 void
startDropTrig_synced(Signal * signal,Uint32 ptrI,Uint32 retVal)10816 Backup::startDropTrig_synced(Signal* signal, Uint32 ptrI, Uint32 retVal)
10817 {
10818 jamEntry();
10819 /**
10820 * Get backup record
10821 */
10822 BackupRecordPtr ptr;
10823 c_backupPool.getPtr(ptr, ptrI);
10824
10825 ptr.p->slaveState.setState(STOPPING);
10826 ptr.p->m_gsn = GSN_STOP_BACKUP_REQ;
10827
10828 /**
10829 * Now drop the triggers
10830 */
10831 sendDropTrig(signal, ptr);
10832 }
10833
10834 void
closeFiles(Signal * sig,BackupRecordPtr ptr)10835 Backup::closeFiles(Signal* sig, BackupRecordPtr ptr)
10836 {
10837 /**
10838 * Close all files
10839 */
10840 BackupFilePtr filePtr;
10841 int openCount = 0;
10842 for(ptr.p->files.first(filePtr); filePtr.i!=RNIL; ptr.p->files.next(filePtr))
10843 {
10844 if(! (filePtr.p->m_flags & BackupFile::BF_OPEN))
10845 {
10846 jam();
10847 continue;
10848 }
10849
10850 jam();
10851 openCount++;
10852
10853 if(filePtr.p->m_flags & BackupFile::BF_CLOSING)
10854 {
10855 jam();
10856 continue;
10857 }//if
10858
10859 filePtr.p->operation.dataBuffer.eof();
10860 if(filePtr.p->m_flags & BackupFile::BF_FILE_THREAD)
10861 {
10862 jam();
10863 #ifdef DEBUG_ABORT
10864 ndbout_c("Close files fileRunning == 1, filePtr.i=%u", filePtr.i);
10865 #endif
10866 }
10867 else
10868 {
10869 jam();
10870 closeFile(sig, ptr, filePtr);
10871 }
10872 }
10873
10874 if(openCount == 0){
10875 jam();
10876 closeFilesDone(sig, ptr);
10877 }//if
10878 }
10879
void
Backup::closeFile(Signal* signal,
                  BackupRecordPtr ptr,
                  BackupFilePtr filePtr,
                  bool prepare_phase,
                  bool remove_flag)
{
  /**
   * Send FSCLOSEREQ for one backup file.  The file must be open and
   * neither opening nor already closing; BF_CLOSING is set here and
   * cleared again in execFSCLOSECONF().  If an error has been recorded
   * for the relevant phase (or the caller passes remove_flag) the file
   * is also removed as part of the close.
   */
  ndbrequire(filePtr.p->m_flags & BackupFile::BF_OPEN);
  ndbrequire(! (filePtr.p->m_flags & BackupFile::BF_OPENING));
  ndbrequire(! (filePtr.p->m_flags & BackupFile::BF_CLOSING));
  filePtr.p->m_flags |= BackupFile::BF_CLOSING;

  FsCloseReq * req = (FsCloseReq *)signal->getDataPtrSend();
  req->filePointer = filePtr.p->filePointer;
  req->userPointer = filePtr.i;
  req->userReference = reference();
  req->fileFlag = 0;

  // Pick the error code belonging to the phase we are closing in:
  // prepare phase uses prepareErrorCode, execution phase uses errorCode.
  if (prepare_phase)
  {
    jam();
    if (ptr.p->prepareErrorCode)
    {
      jam();
      FsCloseReq::setRemoveFileFlag(req->fileFlag, 1);
    }
  }
  else
  {
    jam();
    if (ptr.p->errorCode)
    {
      jam();
      FsCloseReq::setRemoveFileFlag(req->fileFlag, 1);
    }
  }
  // Caller may force removal regardless of error state.
  if (remove_flag)
  {
    jam();
    FsCloseReq::setRemoveFileFlag(req->fileFlag, 1);
  }

#ifdef DEBUG_ABORT
  ndbout_c("***** a FSCLOSEREQ filePtr.i = %u flags: %x",
           filePtr.i, filePtr.p->m_flags);
#endif
  sendSignal(NDBFS_REF, GSN_FSCLOSEREQ, signal, FsCloseReq::SignalLength, JBA);

}
10929
void
Backup::execFSCLOSEREF(Signal* signal)
{
  /**
   * A file close request failed.  Record the error on the backup record
   * and on the file record (first error wins), log a warning that
   * identifies which file was being closed, then treat the close as
   * successful by rewriting the payload into an FsConf and invoking
   * execFSCLOSECONF() directly.
   */
  jamEntry();

  FsRef * ref = (FsRef*)signal->getDataPtr();
  const Uint32 filePtrI = ref->userPointer;

  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, filePtrI);

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  // Overwrite the signal in place so it can be handled as a CONF below.
  FsConf * conf = (FsConf*)signal->getDataPtr();
  conf->userPointer = filePtrI;

  const char *file_type_str;
  const char *op_type_str;

  // Keep the first recorded error; do not overwrite an earlier one.
  if (ptr.p->errorCode == 0)
  {
    ptr.p->errorCode = ref->errorCode;
  }
  if (filePtr.p->errorCode == 0)
  {
    filePtr.p->errorCode = ref->errorCode;
  }

  // Classify the file for the warning message by matching the file
  // pointer against the record's known file slots.
  if (ptr.p->is_lcp())
  {
    op_type_str = "LCP";
    if (ptr.p->prepareCtlFilePtr[0] == filePtrI ||
        ptr.p->prepareCtlFilePtr[1] == filePtrI)
      file_type_str = "prepare ctl";
    else if (ptr.p->prepareDataFilePtr[0] == filePtrI)
      file_type_str = "prepare data";
    else if (ptr.p->deleteFilePtr == filePtrI)
      file_type_str = "delete file";
    else if (ptr.p->dataFilePtr[0] == filePtrI)
      file_type_str = "data";
    else if (ptr.p->ctlFilePtr == filePtrI)
      file_type_str = "ctl";
    else
    {
      // Unknown file pointer: internal inconsistency.
      ndbabort();
      file_type_str = NULL;
    }
  }
  else
  {
    op_type_str = "backup";
    if (ptr.p->ctlFilePtr == filePtrI)
      file_type_str = "ctl";
    else if (ptr.p->dataFilePtr[0] == filePtrI)
      file_type_str = "data";
    else if (ptr.p->logFilePtr == filePtrI)
      file_type_str = "log";
    else
    {
      ndbabort();
      file_type_str = NULL;
    }
  }
  g_eventLogger->warning("FSCLOSEREF: errCode: %d, performing %s"
                         " for file type %s, ignoring error",
                         ref->errorCode,
                         op_type_str,
                         file_type_str);
  // Continue as if the close succeeded.
  execFSCLOSECONF(signal);
}
11000
void
Backup::execFSCLOSECONF(Signal* signal)
{
  /**
   * A file has been closed by NDBFS.  Clear the file's open/closing
   * state, optionally report log-buffer usage, and dispatch to the
   * continuation that matches which file of the backup/LCP record was
   * closed.
   */
  jamEntry();

  FsConf * conf = (FsConf*)signal->getDataPtr();
  const Uint32 filePtrI = conf->userPointer;

  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, filePtrI);

#ifdef DEBUG_ABORT
  ndbout_c("***** FSCLOSECONF filePtrI = %u", filePtrI);
#endif

  // Exactly OPEN|CLOSING must be set: closeFile() set BF_CLOSING while
  // BF_OPEN was still on, and no other flag may remain.
  ndbrequire(filePtr.p->m_flags == (BackupFile::BF_OPEN |
                                    BackupFile::BF_CLOSING));


  const Uint32 usableBytes =
    filePtr.p->operation.dataBuffer.getUsableSize() << 2;
  const Uint32 freeLwmBytes =
    filePtr.p->operation.dataBuffer.getFreeLwm() << 2;

  const BackupFormat::FileType ft = filePtr.p->fileType;

  // Report high-water-mark usage of the log buffer for (RE/UN)DO logs.
  // NOTE(review): the pct division assumes usableBytes > 0 — presumably
  // guaranteed by log buffer configuration; confirm.
  if (ft == BackupFormat::LOG_FILE ||
      ft == BackupFormat::UNDO_FILE)
  {
    g_eventLogger->info("Backup log buffer report : size %u bytes, "
                        "hwm %u bytes (%u pct)",
                        usableBytes,
                        (usableBytes - freeLwmBytes),
                        ((usableBytes - freeLwmBytes) * 100) /
                        usableBytes);
  }

  // File is now fully closed: clear flags and recycle the buffer.
  filePtr.p->m_flags &= ~(Uint32)(BackupFile::BF_OPEN |BackupFile::BF_CLOSING);
  filePtr.p->operation.dataBuffer.reset();

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, filePtr.p->backupPtr);

  // For LCPs, the continuation depends on which file slot was closed.
  if (ptr.p->is_lcp())
  {
    if (ptr.p->prepareDataFilePtr[0] == filePtrI)
    {
      /* Close of prepare data file, error condition */
      jam();
      ndbrequire(ptr.p->prepareState == PREPARE_ABORTING);
      defineBackupRef(signal, ptr, ptr.p->errorCode);
      return;
    }
    else if (ptr.p->prepareCtlFilePtr[0] == filePtrI ||
             ptr.p->prepareCtlFilePtr[1] == filePtrI)
    {
      jam();
      if (ptr.p->prepareState == PREPARE_DROP_CLOSE)
      {
        jam();
        lcp_close_ctl_file_drop_case(signal, ptr);
        return;
      }
      if (ptr.p->prepareState == PREPARE_ABORTING)
      {
        jam();
        defineBackupRef(signal, ptr, ptr.p->errorCode);
        return;
      }
      ndbrequire(ptr.p->prepareState == PREPARE_READ_CTL_FILES);
      lcp_close_prepare_ctl_file_done(signal, ptr);
      return;
    }
    else if (ptr.p->ctlFilePtr == filePtrI)
    {
      jam();
      finalize_lcp_processing(signal, ptr);
      return;
    }
    else if (ptr.p->deleteFilePtr == filePtrI)
    {
      jam();
      lcp_close_ctl_file_for_rewrite_done(signal, ptr, filePtr);
      return;
    }
    else if ((ptr.p->dataFilePtr[0] == filePtrI) ||
             (ptr.p->dataFilePtr[1] == filePtrI) ||
             (ptr.p->dataFilePtr[2] == filePtrI) ||
             (ptr.p->dataFilePtr[3] == filePtrI) ||
             (ptr.p->dataFilePtr[4] == filePtrI) ||
             (ptr.p->dataFilePtr[5] == filePtrI) ||
             (ptr.p->dataFilePtr[6] == filePtrI) ||
             (ptr.p->dataFilePtr[7] == filePtrI))
    {
      jam();
      // Wait until every open LCP data file has confirmed its close.
      ndbrequire(filePtr.p->m_flags == 0);
      ndbrequire(ptr.p->m_num_lcp_data_files_open > 0);
      ptr.p->m_num_lcp_data_files_open--;
      if (ptr.p->m_num_lcp_data_files_open > 0)
      {
        jam();
        DEB_EXTRA_LCP(("(%u) Closed LCP data file, still waiting for %u files",
                       instance(),
                       ptr.p->m_num_lcp_data_files_open));
        return;
      }
      lcp_close_data_file_conf(signal, ptr);
      return;
    }
    else
    {
      ndbabort();
    }
  }
  /* Backup closing files */
  closeFiles(signal, ptr);
}
11118
11119 void
closeFilesDone(Signal * signal,BackupRecordPtr ptr)11120 Backup::closeFilesDone(Signal* signal, BackupRecordPtr ptr)
11121 {
11122 jam();
11123 /* Record end-of-backup */
11124 //ndbassert(Backup::g_is_single_thr_backup_running); /* !set on error paths */
11125 Backup::g_is_single_thr_backup_running = false;
11126
11127 //error when do insert footer or close file
11128 if(ptr.p->checkError())
11129 {
11130 StopBackupRef * ref = (StopBackupRef*)signal->getDataPtrSend();
11131 ref->backupPtr = ptr.i;
11132 ref->backupId = ptr.p->backupId;
11133 ref->errorCode = ptr.p->errorCode;
11134 ref->nodeId = getOwnNodeId();
11135 sendSignal(ptr.p->senderRef, GSN_STOP_BACKUP_REF, signal,
11136 StopBackupConf::SignalLength, JBB);
11137
11138 ptr.p->m_gsn = GSN_STOP_BACKUP_REF;
11139 ptr.p->slaveState.setState(CLEANING);
11140 return;
11141 }
11142
11143 StopBackupConf* conf = (StopBackupConf*)signal->getDataPtrSend();
11144 conf->backupId = ptr.p->backupId;
11145 conf->backupPtr = ptr.i;
11146
11147 BackupFilePtr filePtr;
11148 if(ptr.p->logFilePtr != RNIL)
11149 {
11150 ptr.p->files.getPtr(filePtr, ptr.p->logFilePtr);
11151 conf->noOfLogBytes= Uint32(filePtr.p->operation.noOfBytes); // TODO
11152 conf->noOfLogRecords= Uint32(filePtr.p->operation.noOfRecords); // TODO
11153 }
11154 else
11155 {
11156 conf->noOfLogBytes= 0;
11157 conf->noOfLogRecords= 0;
11158 }
11159
11160 sendSignal(ptr.p->senderRef, GSN_STOP_BACKUP_CONF, signal,
11161 StopBackupConf::SignalLength, JBB);
11162
11163 ptr.p->m_gsn = GSN_STOP_BACKUP_CONF;
11164 ptr.p->slaveState.setState(CLEANING);
11165 }
11166
11167 /*****************************************************************************
11168 *
11169 * Slave functionallity: Abort backup
11170 *
11171 *****************************************************************************/
11172 /*****************************************************************************
11173 *
11174 * Slave functionallity: Abort backup
11175 *
11176 *****************************************************************************/
11177 void
execABORT_BACKUP_ORD(Signal * signal)11178 Backup::execABORT_BACKUP_ORD(Signal* signal)
11179 {
11180 jamEntry();
11181 AbortBackupOrd* ord = (AbortBackupOrd*)signal->getDataPtr();
11182 const Uint32 backupId = ord->backupId;
11183 const AbortBackupOrd::RequestType requestType =
11184 (AbortBackupOrd::RequestType)ord->requestType;
11185 const Uint32 senderData = ord->senderData;
11186
11187 #ifdef DEBUG_ABORT
11188 ndbout_c("******** ABORT_BACKUP_ORD ********* nodeId = %u",
11189 refToNode(signal->getSendersBlockRef()));
11190 ndbout_c("backupId = %u, requestType = %u, senderData = %u, ",
11191 backupId, requestType, senderData);
11192 dumpUsedResources();
11193 #endif
11194
11195 BackupRecordPtr ptr;
11196 if(requestType == AbortBackupOrd::ClientAbort) {
11197 jam();
11198 if ((!get_backup_record(ptr)) ||
11199 ptr.p->backupId != backupId ||
11200 ptr.p->clientData != senderData)
11201 {
11202 jam();
11203 return;
11204 }//if
11205 if (ptr.p->masterRef != reference())
11206 {
11207 jam();
11208 // forward to master
11209 #ifdef DEBUG_ABORT
11210 ndbout_c("---- Forward to master nodeId = %u", getMasterNodeId());
11211 #endif
11212 sendSignal(ptr.p->masterRef, GSN_ABORT_BACKUP_ORD,
11213 signal, AbortBackupOrd::SignalLength, JBB);
11214 return;
11215 }
11216 } else {
11217 if (c_backupPool.findId(senderData)) {
11218 jam();
11219 c_backupPool.getPtr(ptr, senderData);
11220 } else {
11221 jam();
11222 #ifdef DEBUG_ABORT
11223 ndbout_c("Backup: abort request type=%u on id=%u,%u not found",
11224 requestType, backupId, senderData);
11225 #endif
11226 return;
11227 }
11228 }//if
11229
11230 ptr.p->m_gsn = GSN_ABORT_BACKUP_ORD;
11231 const bool isCoordinator = (ptr.p->masterRef == reference());
11232
11233 bool ok = false;
11234 switch(requestType){
11235
11236 /**
11237 * Requests sent to master
11238 */
11239 case AbortBackupOrd::ClientAbort:
11240 jam();
11241 // fall through
11242 case AbortBackupOrd::LogBufferFull:
11243 jam();
11244 // fall through
11245 case AbortBackupOrd::FileOrScanError:
11246 jam();
11247 ndbrequire(isCoordinator);
11248 ptr.p->setErrorCode(requestType);
11249 if(ptr.p->masterData.gsn == GSN_BACKUP_FRAGMENT_REQ)
11250 {
11251 /**
11252 * Only scans are actively aborted
11253 */
11254 abort_scan(signal, ptr);
11255 }
11256 return;
11257
11258 /**
11259 * Requests sent to slave
11260 */
11261 case AbortBackupOrd::AbortScan:
11262 jam();
11263 ptr.p->setErrorCode(requestType);
11264 return;
11265
11266 case AbortBackupOrd::BackupComplete:
11267 jam();
11268 cleanup(signal, ptr);
11269 return;
11270 case AbortBackupOrd::BackupFailure:
11271 case AbortBackupOrd::BackupFailureDueToNodeFail:
11272 case AbortBackupOrd::OkToClean:
11273 case AbortBackupOrd::IncompatibleVersions:
11274 #ifndef VM_TRACE
11275 default:
11276 #endif
11277 ptr.p->setErrorCode(requestType);
11278 ptr.p->masterData.errorCode = requestType;
11279 ok= true;
11280 }
11281 ndbrequire(ok);
11282
11283 ptr.p->masterRef = reference();
11284 ptr.p->nodes.clear();
11285 ptr.p->nodes.set(getOwnNodeId());
11286
11287 // Backup aborts on node failure are handled as follows for st-backup:
11288 // - each node declares itself master
11289 // - each node modifies 'nodes' bitmask of signal receivers
11290 // to disable sending to any nodes except self
11291 // For mt-backup,
11292 // - each instance declares itself master
11293 // - each instance modifies 'nodes' bitmask of signal receivers
11294 // to disable sending to any nodes except self
11295 // - each instance modifies 'fragWorkers' bitmask of signal receivers
11296 // to disable sending to any LDM on this node except self
11297 ptr.p->fragWorkers[getOwnNodeId()].clear();
11298 ptr.p->fragWorkers[getOwnNodeId()].set(instance());
11299 ptr.p->masterRef = reference();
11300 ptr.p->senderRef = reference();
11301 ptr.p->stopGCP= ptr.p->startGCP + 1;
11302 sendStopBackup(signal, ptr);
11303 }
11304
11305
11306 void
dumpUsedResources()11307 Backup::dumpUsedResources()
11308 {
11309 jam();
11310 BackupRecordPtr ptr;
11311
11312 if (get_backup_record(ptr))
11313 {
11314 ndbout_c("Backup id=%u, slaveState.getState = %u, errorCode=%u",
11315 ptr.p->backupId,
11316 ptr.p->slaveState.getState(),
11317 ptr.p->errorCode);
11318
11319 TablePtr tabPtr;
11320 for(ptr.p->tables.first(tabPtr);
11321 tabPtr.i != RNIL;
11322 ptr.p->tables.next(tabPtr)) {
11323 jam();
11324 for(Uint32 j = 0; j<3; j++) {
11325 jam();
11326 TriggerPtr trigPtr;
11327 if(tabPtr.p->triggerAllocated[j]) {
11328 jam();
11329 c_triggerPool.getPtr(trigPtr, tabPtr.p->triggerIds[j]);
11330 ndbout_c("Allocated[%u] Triggerid = %u, event = %u",
11331 j,
11332 tabPtr.p->triggerIds[j],
11333 trigPtr.p->event);
11334 }//if
11335 }//for
11336 }//for
11337
11338 BackupFilePtr filePtr;
11339 for(ptr.p->files.first(filePtr);
11340 filePtr.i != RNIL;
11341 ptr.p->files.next(filePtr)) {
11342 jam();
11343 ndbout_c("filePtr.i = %u, flags: H'%x ",
11344 filePtr.i, filePtr.p->m_flags);
11345 }//for
11346 }
11347 }
11348
11349 void
cleanup(Signal * signal,BackupRecordPtr ptr)11350 Backup::cleanup(Signal* signal, BackupRecordPtr ptr)
11351 {
11352 TablePtr tabPtr;
11353 ptr.p->tables.first(tabPtr);
11354 cleanupNextTable(signal, ptr, tabPtr);
11355 }
11356
11357 void
release_tables(BackupRecordPtr ptr)11358 Backup::release_tables(BackupRecordPtr ptr)
11359 {
11360 TablePtr tabPtr;
11361 /* Clear backupPtr before releasing */
11362 for (ptr.p->tables.first(tabPtr);
11363 tabPtr.i != RNIL;
11364 ptr.p->tables.next(tabPtr))
11365 {
11366 jam();
11367 tabPtr.p->fragments.release();
11368 jamLine(tabPtr.p->tableId);
11369 removeTableMap(tabPtr, ptr.i, tabPtr.p->tableId);
11370 }
11371 while (ptr.p->tables.releaseFirst());
11372 }
11373
void
Backup::cleanupNextTable(Signal *signal, BackupRecordPtr ptr, TablePtr tabPtr)
{
  /**
   * Table-by-table cleanup after a backup has finished or aborted.
   * For the given table: release its fragment info, invalidate its
   * trigger bookkeeping, and ask DICT to unlock it.  The DICT reply
   * handler calls back here with the next table.  When tabPtr.i == RNIL
   * all tables are done: release files, tables and triggers, then
   * either remove the backup files on error or free the backup record.
   */
  if (tabPtr.i != RNIL)
  {
    jam();
    tabPtr.p->fragments.release();
    // Invalidate all three trigger slots (insert/update/delete).
    for(Uint32 j = 0; j<3; j++) {
      jam();
      TriggerPtr trigPtr;
      if(tabPtr.p->triggerAllocated[j]) {
        jam();
        c_triggerPool.getPtr(trigPtr, tabPtr.p->triggerIds[j]);
        trigPtr.p->event = ILLEGAL_TRIGGER_ID;
        tabPtr.p->triggerAllocated[j] = false;
      }//if
      tabPtr.p->triggerIds[j] = ILLEGAL_TRIGGER_ID;
    }//for
    {
      // Ask DICT to unlock the table; the reply continues the cleanup
      // loop with the next table (m_backup_state = CLEANUP).
      BackupLockTab *req = (BackupLockTab *)signal->getDataPtrSend();
      req->m_senderRef = reference();
      req->m_tableId = tabPtr.p->tableId;
      req->m_lock_unlock = BackupLockTab::UNLOCK_TABLE;
      req->m_backup_state = BackupLockTab::CLEANUP;
      req->m_backupRecordPtr_I = ptr.i;
      req->m_tablePtr_I = tabPtr.i;
      sendSignal(DBDICT_REF, GSN_BACKUP_LOCK_TAB_REQ, signal,
                 BackupLockTab::SignalLength, JBB);
      return;
    }
  }

  // All tables processed: every file must be fully closed by now.
  BackupFilePtr filePtr;
  for(ptr.p->files.first(filePtr);filePtr.i != RNIL;ptr.p->files.next(filePtr))
  {
    jam();
    ndbrequire(filePtr.p->m_flags == 0);
    filePtr.p->pages.release();
  }//for

  while (ptr.p->files.releaseFirst());
  release_tables(ptr);
  while (ptr.p->triggers.releaseFirst());
  ptr.p->backupId = ~0;

  /*
    report of backup status uses these variables to keep track
    if files are used
  */
  ptr.p->ctlFilePtr = ptr.p->logFilePtr = ptr.p->dataFilePtr[0] = RNIL;

  if(ptr.p->checkError())
    // On error: also remove the (partial) backup files from disk; the
    // record is released when FSREMOVECONF arrives.
    removeBackup(signal, ptr);
  else
  {
    /*
      report of backup status uses these variables to keep track
      if backup ia running and current state
    */
    ptr.p->m_gsn = 0;
    ptr.p->masterData.gsn = 0;
    c_backups.release(ptr);
  }
}
11438
11439
11440 void
removeBackup(Signal * signal,BackupRecordPtr ptr)11441 Backup::removeBackup(Signal* signal, BackupRecordPtr ptr)
11442 {
11443 jam();
11444
11445 FsRemoveReq * req = (FsRemoveReq *)signal->getDataPtrSend();
11446 req->userReference = reference();
11447 req->userPointer = ptr.i;
11448 req->directory = 1;
11449 req->ownDirectory = 1;
11450 FsOpenReq::setVersion(req->fileNumber, 2);
11451 FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_CTL);
11452 FsOpenReq::v2_setSequence(req->fileNumber, ptr.p->backupId);
11453 FsOpenReq::v2_setNodeId(req->fileNumber, getOwnNodeId());
11454 sendSignal(NDBFS_REF, GSN_FSREMOVEREQ, signal,
11455 FsRemoveReq::SignalLength, JBA);
11456 }
11457
11458 void
execFSREMOVEREF(Signal * signal)11459 Backup::execFSREMOVEREF(Signal* signal)
11460 {
11461 jamEntry();
11462 FsRef * ref = (FsRef*)signal->getDataPtr();
11463 const Uint32 ptrI = ref->userPointer;
11464
11465 FsConf * conf = (FsConf*)signal->getDataPtr();
11466 conf->userPointer = ptrI;
11467 execFSREMOVECONF(signal);
11468 }
11469
11470 void
execFSREMOVECONF(Signal * signal)11471 Backup::execFSREMOVECONF(Signal* signal)
11472 {
11473 jamEntry();
11474
11475 FsConf * conf = (FsConf*)signal->getDataPtr();
11476 const Uint32 ptrI = conf->userPointer;
11477
11478 /**
11479 * Get backup record
11480 */
11481 BackupRecordPtr ptr;
11482 c_backupPool.getPtr(ptr, ptrI);
11483
11484 if (ptr.p->is_lcp())
11485 {
11486 jam();
11487 lcp_remove_file_conf(signal, ptr);
11488 return;
11489 }
11490 /*
11491 report of backup status uses these variables to keep track
11492 if backup ia running and current state
11493 */
11494 ptr.p->m_gsn = 0;
11495 ptr.p->masterData.gsn = 0;
11496 c_backups.release(ptr);
11497 }
11498
11499 /**
11500 * Description of how LCP works and its principles
11501 * -----------------------------------------------
11502 *
11503 * Introduction of Partial LCP
11504 * ...........................
 * In MySQL Cluster 7.6 partial LCPs were introduced. This means that each
 * LCP doesn't record every single row in the system. It only records a subset
 * of all rows, plus all rows changed since the last partial LCP.
11508 *
11509 * This allows partial LCPs to complete more quickly than a full LCP, and
11510 * the REDO log to be trimmed more frequently.
11511 *
11512 * We keep track of changed rows by using the GCI stored on each row. We
11513 * know which GCI that was fully part of the previous LCP. Thus if the
11514 * previous LCP contained all changes up to and including GCI = 77 this
11515 * means that the new LCP will only need to record changes from GCI = 78
11516 * and onwards.
11517 *
11518 * There is some complication that comes from deletions here.
11519 * The restore of the system uses a number of partial LCPs back in time.
11520 * For a specific rowid this means that there is a first partial LCP file
11521 * where it is recorded. It can either be restored with an inserted value as
11522 * part of this LCP, if it isn't then the rowid will be empty after executing
11523 * this first partial LCP, further partial LCPs might add it.
11524 *
11525 * In the following LCPs this rowid will only be part of the LCP if it has
11526 * changed since the last LCP. This is absolutely no problem if the row
11527 * has been inserted or updated since then the row exists and its value will
11528 * be recorded in the LCP as a changed row.
11529 *
11530 * At start of a partial LCP we decide the number of parts to checkpoint
11531 * fully, currently we have divided the page id range into 2048 different
11532 * parts. We can checkpoint anything between 1 to 2048 parts in one
11533 * partial LCP, this is driven by data size of fragment and change percentage.
11534 *
 * Definition: The set of rows for which we record every row is called
 * ALL ROWS. The set of rows for which we record only the changed rows is
 * called CHANGE ROWS.
11538 *
11539 * The ALL ROWS parts are the same as used in earlier versions of MySQL
11540 * Cluster, and are a 'state dump' containing INSERT BY ROWID operations.
11541 * Each row existing at start of LCP will be recorded in pages belonging
11542 * to this part.
11543 *
11544 * The CHANGED ROWS parts are a kind of operation log with WRITE BY ROWID
11545 * and DELETE BY ROWID and DELETE BY PAGEID (DELETE by ROWID for all rows in a
11546 * page) operations which must be applied.
11547 *
11548 * For partial LCP we divide up the range of pages into 2048 parts using a hash
11549 * function on page id. For a specific LCP we will have one set of parts that
11550 * are checkpointed in the ALL ROWS part and the rest is checkpointed in the
11551 * CHANGE ROWS part.
11552 *
11553 * To restore we need to perform the following for each of the 2048 parts.
11554 * 1) Find the last LCP where this part belonged to the ALL ROWS part.
11555 * 2) Restore this part from this LCP.
11556 * 3) For each of the LCP after that up to the LCP we are restoring we will
11557 * restore the CHANGE ROWS part for this part.
11558 *
11559 * This means that at restore we will never need to go further back than the
11560 * oldest ALL ROWS part we have remaining which is restorable. This is
11561 * important understanding for knowing when LCP files can be deleted.
11562 *
11563 * More definitions
11564 * ----------------
11565 * Rowid: Each row has a rowid (page id and page index) which is a local key
11566 * to the fixed size part of the row. The fixed part of the row has references
11567 * to the variable sized part and it also has a reference to the disk part of
11568 * the row.
11569 *
11570 * Page Map: The page map takes a rowid as input and gives back the page
11571 * pointer to this page. The page map also knows if the page id is empty
11572 * and it is also used to keep some page state after page has been deleted
11573 * as discussed further below.
11574 *
11575 * Disk reference: This is the reference in the main memory part of the row
11576 * that refers to the disk part of the row. Currently this reference is
11577 * located in the fixed size part of the row and the disk data part is a
11578 * fixed size part.
11579 *
11580 * Row content: This is the actual values of the attributes in this row.
11581 *
11582 * Row structure:
11583 * -------------------------------------------
11584 * | Fixed size part in main memory |
11585 * | Tuple header + Fixed size attributes + |
11586 * | disk reference + variable size reference|
11587 * | + NULL bits |
11588 * ------------------------------------------
11589 *
11590 * ------------------------------------------
11591 * | Var part in main memory |
11592 * | Header part + variable sized attributes|
11593 * | + dynamic attributes |
11594 * ------------------------------------------
11595 *
11596 * ------------------------------------------
11597 * | Fixed size part on disk page |
11598 * | Header part + Fix size disk attributes |
11599 * ------------------------------------------
11600 *
11601 * The Fixed main memory part header contains also GCI, Checksum. Also the
11602 * disk part contains a GCI and a reference to the main memory part.
11603 *
11604 * Purpose of LCP
11605 * ..............
11606 * The purpose of the LCP (Local checkpoint) is to ensure that we can cut the
11607 * REDO log tail which would otherwise grow to infinity. We do this by
11608 * a regular LCP of each fragment.
11609 *
11610 * NDB contains both main memory data and disk data. The disk data part is
11611 * recovered by using a No Steal approach. This means that only committed
11612 * data is ever sent to the pages written to disk. To support this we use an
11613 * UNDO log to ensure that the disk data is possible to restore to the
11614 * exact state it had at the starting point of the LCP.
11615 *
11616 * The main memory part of the row content is stored in the LCP files
11617 * generated by the LCP. The disk part is stored in its position in the
11618 * disk pages by flushing the pages in memory to disk for the disk parts.
11619 *
11620 * Observation 1:
11621 * Only committed rows are written into any LCP for both main memory data and
11622 * disk data. Thus after restoring an LCP we only need to roll forward using
11623 * a REDO log.
11624 *
11625 * Observation 2:
11626 * Given that the LCP maintains the exact row structure at the start of the
11627 * LCP the REDO log can be a logical log (only logging actions (INSERT, DELETE,
11628 * UPDATE) and the values changed).
11629 *
11630 * The REDO log is mainly operating with primary keys, but to ensure that
11631 * we synchronize the rowids on different nodes all INSERTs must also log
11632 * the rowid they are inserted into.
11633 *
11634 * Observation 3:
11635 * Given that the REDO log is a logical log it is location and replication
11636 * independent. This means that we can restore the LCP stored locally
11637 * and then apply a mix of the local REDO log and REDO logs from other
11638 * nodes in the same node group. Using remote REDO logs is a principle we
11639 * have decided to abandon and instead fully rely on the ability to
11640 * synchronize data nodes at node restarts.
11641 *
11642 * An LCP is performed per fragment. A table consists of multiple fragments
11643 * that can be checkpointed in parallel in different LDMs.
11644 *
11645 * Only one LCP per fragment per LDM instance is currently executed. However
11646 * we allow for the prepare phase of the next LCP (opening files and preparing
11647 * the LCP execution) to proceed in parallel to the currently running
11648 * LCP. In addition the deletion of old LCP files is a background process
11649 * going on in the background to both of these processes.
11650 *
11651 * Need of LCP_SKIP bit for inserts
11652 * ................................
11653 * Performing a checkpoint for disk pages means simply writing any pages that
11654 * got dirty since the last checkpoint. It is a bit more involved to perform
11655 * checkpoints (LCPs) for main memory data. For main memory data we only
11656 * checkpoint the rows and not pages. This gives us the opportunity to write
11657 * less data in the main memory checkpoints since we don't have to save the
11658 * entire page where the changes were done.
11659 *
11660 * The idea for LCPs is that we need a LCP to contain exactly the rows present
11661 * at the start of the LCP. This means that we set an LCP_SKIP bit on rows
11662 * that are inserted during LCPs to avoid putting those rows into the LCP when
11663 * we pass by them in the LCP scan.
11664 *
11665 * The requirement to have exactly the correct set of rows that existed at
11666 * start of LCP comes from that we need the reference from main-memory rows
11667 * to disk rows to be correct. The content of the main memory row and
11668 * disk data row must not be exactly synchronized but if the row exists
11669 * in main memory the referred disk row must exist in disk pages and
11670 * vice versa.
11671 *
11672 * Tables that don't have disk data don't need this requirement, but we
11673 * treat them the same way.
11674 *
11675 * The row content in both the disk data and the main memory data can be
11676 * newer than the data at the start of the LCP, but not older.
11677 *
11678 * The reason is that the REDO log or other synchronisation efforts will
11679 * ensure that all updates from before the LCP and until the restoration
11680 * point is reapplied, so we will eventually have the correct data in
11681 * each row at the restoration point.
11682 *
11683 * Need of LCP keep list for deletes
11684 * .................................
11685 * Similarly we use an LCP keep list to record deleted rows such that we
11686 * record them in the LCP. We use this list to give those recordings a
11687 * higher priority since we will release the rowid immediately after
11688 * committing the row.
11689 *
11690 * These two principles ensure that the LCP will contain exactly the same
11691 * set of rows as we had at the start of the LCP. The row data might
11692 * differ from what it looked at the start of the LCP. This is however
11693 * of no significance since the REDO log will ensure that we will
11694 * after recovery have the correct state of the data.
11695 *
11696 * As an example a row with a certain rowid can be deleted before LCP scans
11697 * it and then the row will be sent to the LCP keep list. Later a new row
11698 * will be inserted while the LCP scan still hasn't arrived at this rowid
11699 * and then the INSERT will set the LCP_SKIP to ensure that the new row
11700 * is ignored in this rowid.
11701 *
11702 * This leads to the following observations.
11703 *
11704 * Observation 1:
11705 * ..............
11706 * In an LCP there will only be one row existing for a specific rowid. There
11707 * will never be two rows with the same rowid in an LCP.
11708 *
11709 * Proof:
11710 * ------
11711 * If two rows existed there must have been a delete followed by an insert
11712 * in the LCP scan. The delete will ensure that the first row with this
11713 * rowid will exist in LCP and the LCP_SKIP bit will ensure that the
11714 * second row with this rowid will not exist in the LCP.
11715 *
11716 * Observation 2:
11717 * ..............
11718 * It isn't allowed for any updates to change the disk reference. The disk
11719 * reference must be stable over a set of LCPs.
11720 *
11721 * Proof:
11722 * ------
11723 * If an update did change the disk reference the restored main memory row
11724 * would refer to the wrong disk data part which would not work.
11725 *
11726 * The above is the essential requirement on any LCP that is used in a
11727 * restore of NDB tables. We formulate it here as a theorem.
11728 *
11729 * Theorem 1:
11730 * ..........
11731 * An LCP used in the recovery of NDB must meet the following requirements.
11732 * 1) All committed rows that are present at start of LCP (defined as the
11733 * time when we write the marker in the UNDO log of disk data) must
11734 * all be part of LCP and no other rows may be present in the LCP.
11735 * 2) All links from main memory to disk data and vice versa must be
11736 * consistent in a checkpoint.
11737 * 3) The row data must be the same as at the time of the start of the LCP
11738 * OR at a time after the start of the LCP.
11739 *
11740 * Proof:
11741 * ------
11742 * A proof for this is presented in the Ph.D thesis of Mikael Ronström,
11743 * Design and Modelling of a Parallel Data Server for Telecom Applications,
11744 * 1998 in chapter 9.2.1. The bearing principle is that the logical REDO
11745 * log will replay all transactions from a point which is certain to be
11746 * before the start of the LCP, thus all updates, inserts and deletes
11747 * happening after the start of the LCP is certain to be part of the
11748 * REDO log execution.
11749 *
11750 * A paper at VLDB 2005 also presents some of the proof behind this in
11751 * the paper called "Recovery principles in MySQL Cluster 5.1". This paper
11752 * also takes into account the use of disk data parts.
11753 *
11754 * While applying the REDO log the following events can happen to a row that
11755 * existed in LCP. Note that the start of LCP is not known when executing
11756 * the REDO log, so this is a theoretical proof of the validity of the
11757 * algorithm, not how it works.
11758 *
11759 * 1) Delete of row before start of LCP, no problems to execute. There are
11760 * two variants, the row is not inserted again, in this case the row
11761 * won't be in the LCP and no REDO log record will reinsert it. In case
11762 * the row is later reinserted the REDO log record will be executed as
11763 * part of recovery and the row is thus certain to be part of the
11764 * restorable state.
11765 *
11766 * This operation can discover that the row doesn't exist, but this is
11767 * ok and can only occur before start of LCP.
11768 *
11769 * 2) Delete of row after start of LCP, this is ok since the row will exist
11770 * before the delete as it existed at start of LCP.
11771 *
11772 * 3) Update before start of LCP. This is ok, it will restore a value to
11773 * the record that might not be the end state, but if not so there
11774 * will be more updates recorded in the REDO log. The important principle
11775 * here is that the REDO log application must be idempotent. Since the
11776 * REDO log simply restores the values of the attributes it is
11777 * idempotent. It is possible to construct a REDO log that contains
11778 * operations also (like add one to column a). This would not work in
11779 * this algorithm since we don't have exact control how exactly we
11780 * restore a row state. Our algorithm requires an idempotent REDO log.
11781 *
11782 * This update might discover that the row doesn't exist, this can only
11783 * occur before start of LCP so it is safe to ignore the REDO log record.
11784 *
11785 * 4) Update after start of LCP. The value this REDO log entry restores
11786 * could already be in the LCP since we don't care if the LCP records a
11787 * newer record than at the start of the LCP.
11788 *
11789 * 5) Insert before start of LCP. The REDO log execution will perform this if
11790 * the row doesn't exist. If it existed already we are certain that this
11791 * insert is before start of LCP and it can be safely ignored.
11792 *
11793 * 6) Insert after start of LCP, the row won't be in LCP, so will always work
11794 * fine.
11795 *
11796 * So what we see here is that the REDO log can sometimes bring us backwards
11797 * in the row history, but it will eventually bring us forward in row history
11798 * to the desired state at a particular GCP (global checkpoint).
11799 *
11800 * Handling deletes for partial LCPs
11801 * .................................
11802 * The problematic part is deletes of a row. This could result in 5 different
11803 * scenarios.
11804 *
11805 * Special handling with reuse of rowids for partial LCPs
11806 * ......................................................
11807 * 1) A first partial LCP has inserted row A into rowid X, after the LCP the
11808 * row is deleted and then the delete is followed by a new insert of row B
11809 * into rowid X. In this case the LCP will attempt to restore a row where
11810 * a row already exists in this rowid. Here we need to remove the old row
11811 * first before inserting the new row to ensure that the primary key hash
11812 * index is correct.
11813 *
11814 * To handle this case properly we always need to drop the row in the
11815 * row id position if the primary key has changed from the previous
11816 * LCP to this LCP. One manner is to always drop it first and then
11817 * reinsert it even if it is the same row.
11818 *
11819 * Special case of handling deleted rowids with GCI > 0
11820 * ....................................................
11821 * 2) A first partial LCP has inserted row A into rowid X, after that the
11822 * row is deleted. At the next LCP this will be recorded as a DELETE
11823 * by ROWID. So when applying this partial LCP the rowid X will be
11824 * set to an empty rowid and the record A will be deleted as part of
11825 * executing that partial LCP. So after executing that partial LCP the
11826 * row will not exist.
11827 *
11828 * Special case of empty rowids (GCI = 0) for newly allocated pages
11829 * ...............................................................
11830 * 3) The first partial LCP records the rows within page Y, after the LCP
11831 * but before the new LCP the page is dropped, after the drop it is
11832 * allocated again. When the LCP starts the page has at least 1 row in
11833 * it which has been reinserted.
11834 *
11835 * The remainder of the rows in the page can have GCI = 0, these rows
11836 * need to have a DELETE by ROWID in the LCP. This DELETE by ROWID might
11837 * encounter a row that actually didn't exist, so DELETE by ROWID at
11838 * restore must be able to handle that the row didn't exist when we
11839 * try to delete it.
11840 *
11841 * Special case of empty page slot at start of LCP
11842 * ...............................................
11843 * 4) At start of the LCP the page slot is free, in this case we record
11844 * the entire page as deleted, we call this DELETE by PAGEID. In this
11845 * case restore will delete all rows in this position. This only needs
11846 * to happen if the page exists when restoring, if the page slot is
11847 * empty when this is reached, then we can ignore the DELETE by PAGEID
11848 * since it is already handled.
11849 *
11850 * We only record DELETE by PAGEID for pages that are part of CHANGE
11851 * ROWS.
11852 *
11853 * We record this information by setting a flag on the page that says
11854 * LCP_EMPTY_PAGE_FLAG. This says that the page is now allocated, but
11855 * at start of the LCP scan it was empty, so when we reach this
11856 * page we will see this state and record a DELETE by PAGEID.
11857 * Similarly if we come by an empty page slot that hasn't got the
11858 * LCP_SCANNED bit set in the page map as described in 5) we will
11859 * also record this as DELETE by PAGEID.
11860 *
11861 * Problematic case of Drop page during LCP scan
11862 * .............................................
11863 * 5) In this case the page exists at start of LCP, for ALL ROWS this is not
11864 * a problem, those rows that were deleted since the start of LCP are put
11865 * into the LCP through LCP keep lists. However for CHANGE ROWS we need to
11866 * record DELETE by ROWID for each row that has GCI = 0 or GCI > scanGCI
11867 * for LCP. We cannot drop the page without recording this information
11868 * since there is no way to recreate this information.
11869 *
11870 * To solve this issue we use the LCP keep list to enter the information
11871 * about rowids that we need to issue DELETE by ROWID for. This means that
11872 * we are able to drop the page immediately and store its state information
11873 * needed for LCP elsewhere.
11874 *
11875 * When dropping the page we will immediately scan the page and each
11876 * rowid that has GCI = 0 or GCI >= lcpScanGCI will be recorded into the
11877 * LCP keep list. However for efficiency reasons we will record multiple
11878 * rowids in each row in the LCP keep list. So each record in the
11879 * LCP keep list will either contain a full row as usual OR it will
11880 * contain an indicator of containing dropped rowids, the number of
11881 * dropped rowids in this row and the rowids in an array (each rowid
11882 * consumes 2 words).
11883 *
11884 * However there is one more problem related to this. Once the page has
11885 * been dropped before LCP scan has reached it, it can be reinserted
11886 * again. Now if this page as mentioned above belongs to the CHANGE ROWS
11887 * category then as explained in 4) we want to record it as a
11888 * DELETE by PAGEID. However in this case this is not correct, the page
11889 * has already been scanned by the LCP.
11890 *
11891 * We can avoid problems with future updates on the page by setting the
11892 * LCP_SKIP bit on the page when it is reinserted, but we also need some
11893 * information to avoid inserting the DELETE by PAGEID into the LCP.
11894 *
11895 * The place where we retain information about dropped pages is the page
11896 * map. We have 2 32-bit words in memory for each page in the current
11897 * set of pages. These 2 words are handled by the DynArr256 data structure.
11898 * We need to temporarily use this place to store information about pages
11899 * dropped during LCP scan in the CHANGE ROW part.
11900 *
11901 * To describe how this happens requires a description of the Page Map and
11902 * its workings and how we make use of it in this case.
11903 *
11904 * Description of Fragment Page Map
11905 * ................................
11906 *
11907 * ------------------
11908 * | Page Map Head |
11909 * ------------------
11910 * The page map head is a normal head of a doubly linked list that contains
11911 * the logical page id of the first free logical page id slot.
11912 *
11913 * The entries in the page map is different dependent on if the slot is
11914 * free or not. First we'll show the non-empty variant (actually the
11915 * second slot can be uninitialised in which case the DynArr256 will return
11916 * RNIL (RNIL cannot be set in any manner since we cannot use page ids
11917 * higher than or equal to RNIL & 0x3fffffff)).
11918 * ------------------------------------------
11919 * | Physical page id | Bit 31 set any rest|
11920 * ------------------------------------------
11921 * Now the empty variant
11922 *
11923 * Next reference Previous reference
11924 * -----------------------------------------------------------
11925 * | Bit 31 set, logicalPageId | Bit 31 set logicalPageId |
11926 * -----------------------------------------------------------
11927 * So the first position uses bit 31 to indicate that the logical
11928 * page id position is empty, the other 31 bits in this position are used
11929 * to point to the next free logical page id. If all 30 lowest bits
11930 * are set in the logical page id it is FREE_PAGE_RNIL. FREE_PAGE_RNIL
11931 * means that there is no next logical page id.
11932 *
11933 * The previous position also contains a reference to a logical page id,
11934 * in this case the previous free logical page id. If there is no free
11935 * previous logical page id then this is set to FREE_PAGE_RNIL as
11936 * well. Bit 31 is set in both words when the entry is free.
11937 *
11938 * The reason that Bit 31 is set in both words is to ensure that when
11939 * we scan the fragment page map at drop fragment to release pages
11940 * that we don't release any pages from the second position. The
11941 * iterator delivers each word back and we don't keep track of which
11942 * position is which, so we need bit 31 to be set at all times for
11943 * the second position.
11944 *
11945 * The page map is only growing, the only manner to get rid of it is to
11946 * either drop the table or restart the node. At restart the page map
11947 * starts from scratch again.
11948 *
11949 * The conclusion is that the page map is a place where we can store
11950 * the special information about that a logical page id has been dropped
11951 * as part of the CHANGE ROWS category and it needs no more LCP scanning
11952 * even if reinserted. So by setting a bit here we can use this information
11953 * to avoid inserting a DELETE by PAGEID into the LCP and we can set
11954 * some proper information on the page to ensure that we skip this page
11955 * later in the LCP scan (obviously also need the LCP scan to reset this
11956 * bit then).
11957 *
11958 * We also use bit 30 in the second word to indicate what the page state
11959 * was at the start of the previous LCP. This enables us to decide what
11960 * to do in those situations when we find that the page or row is not
11961 * used at start of this LCP.
11962 *
11963 * Solution:
11964 * ---------
11965 * We will use bit 30 in the first word of the page map to indicate this
11966 * special page state. This has the effect that we can at most have
11967 * 2^30 pages in one page map. This limits the size of the main memory
11968 * fixed part to 32 TBytes. If this becomes a problem then we need to
11969 * use 64-bit page id as well and that means that the page map will
11970 * contain 2 64-bit words instead and thus the problem will be resolved.
11971 * We call this bit the LCP_SCANNED_BIT. Bit 31 in the first word is
11972 * already used to store the FREE_PAGE_BIT which indicates if the page
11973 * entry is free or in use, if FREE_PAGE_BIT the two words are used
11974 * as next and prev of a linked list of free page ids for the fragment.
11975 *
11976 * Obviously we need to ensure that during all page map operations that
11977 * we take care in handling this special page state.
11978 *
11979 * Note: For the pages in the ALL ROWS category where we record all
11980 * rows we write all the rowids existing at start of LCP, this means that
11981 * a rowid in these parts that isn't recorded is an empty rowid by
11982 * definition. For parts where we only record changes we have to ensure that
11983 * we get the same set of rows after executing all changes, so we need to
11984 * record all changes, both new rowids and deleted rowids and updates of
11985 * row content of rows.
11986 *
11987 * We will also use the 1 free bit in the second word in the page map.
11988 * This bit will be used to store the LCP state at the previous LCP.
11989 * When we reach a page in the LCP scan we will set the state of the last
11990 * LCP based on the current state and on other flags as described below.
11991 *
11992 * The state that no page map entry exists is also a valid state, this
11993 * state indicates that the previous LCP state was that the page was
11994 * released and that the current state is empty state as well and that
11995 * that the state of the LCP_SCANNED_BIT is 0.
11996 *
11997 * So we have three bits in the page map:
11998 * LCP_SCANNED_BIT: Indicates that we have taken care of everything
11999 * related to LCP scans for this page in this LCP.
12000 * FREE_PAGE_BIT: Indicates that the current state of the page is free.
12001 * LAST_LCP_FREE_BIT: Set to 1 indicates that the last LCP state is D
12002 * and set to 0 indicates that the last LCP state is A. This is bit 30
12003 * of the second word in the page map.
12004 *
12005 * Detailed discussion of each case of release/allocation of page
12006 * ..............................................................
12007 *
12008 * A stands for an allocation event, D stands for an release event (drop page)
12009 * [AD].. stands for a A followed by D but possibly several ones and possibly
12010 * also no events.
12011 * E stands for empty set of events (no A or D events happened in the period).
12012 *
12013 * Case 1: Dropped before start of last LCP and dropped at start of this LCP
12014 * Desired action for ALL ROWS pages: Ignore page
12015 * Desired action for CHANGED ROWS pages: Ignore page, technically acceptable
12016 * to record it as DELETE by PAGEID as well.
12017 *
12018 * D LCP_Start(n) [AD].. LCP_Start(n+1) E LCP_End(n+1) (1)
12019 * D LCP_Start(n) [AD].. LCP_Start(n+1) A LCP_End(n+1) (2)
12020 * D LCP_Start(n) [AD].. LCP_Start(n+1) [AD]..A LCP_End(n+1) (3)
12021 *
12022 * (1) is found by the empty page when the LCP scan finds it and the
12023 * LCP_SCANNED_BIT is not set. Thus ALL ROWS pages knows to ignore the
12024 * the page. CHANGED ROWS pages can ignore it by looking at the state
12025 * of the last LCP and notice that the page was dropped also then and
12026 * thus the page can be ignored.
12027 *
12028 * In this case we set the state of last LCP to D in the LCP scan.
12029 *
12030 * (2) is found by discovering that page->is_page_to_skip_lcp() is true.
12031 * The LCP_SCANNED_BIT isn't set in this case when the LCP scan reaches
12032 * it. Thus ALL ROWS pages can ignore it. CHANGED ROWS pages will ignore
12033 * it after checking the state of the last LCP.
12034 *
12035 * In this case we need to keep the state of last LCP until the
12036 * LCP scan has reached the page. When LCP scan reaches the page we will
12037 * set the state of last LCP to D when page->is_page_to_skip_lcp() is
12038 * true.
12039 *
12040 * (3) is found by discovering that LCP_SCANNED_BIT is set since the first
12041 * D event after LCP start handled the page and handled any needed
12042 * DELETE by PAGEID. After discovering this one needs to reset the
12043 * LCP_SCANNED_BIT again. At the first A the page_to_skip_lcp bit
12044 * was set, but the first D issued a DELETE BY PAGEID and dropped
12045 * the page and to flag that the LCP scan was handled the
12046 * LCP_SCANNED_BIT was set.
12047 *
12048 * We read the old last LCP state and set the new last LCP state when
12049 * reaching the first D event after start of LCP. The
12050 * page->is_page_to_skip_lcp() flag will assist in determining what
12051 * the state at start of LCP was.
12052 *
12053 * Case 2: Dropped before start of last LCP and allocated at start of LCP.
12054 *
12055 * Desired action for ALL ROWS pages: Any rows with committed data at start
12056 * of LCP should be recorded as INSERTs into the LCP.
12057 *
12058 * Desired action for CHANGED ROWS pages: Any rows with committed data at
12059 * start of LCP should be recorded as WRITEs into the LCP. All other rows
12060 * should be ignored, technically acceptable behaviour is to issue
12061 * DELETE by ROWID for those rows that should be ignored as well.
12062 *
12063 * D LCP_Start(n) [AD].. A LCP_Start(n+1) E LCP_End(n+1) (1)
12064 * D LCP_Start(n) [AD].. A LCP_Start(n+1) D LCP_End(n+1) (2)
12065 * D LCP_Start(n) [AD].. A LCP_Start(n+1) [DA].. D LCP_End(n+1) (3)
12066 *
12067 * (1) is found by that the page exists when being scanned, no LCP_SCANNED_BIT
12068 * is set and also not the page to skip lcp flag is set. Individual rows
12069 * can have their LCP_SKIP flag set. All rows with committed data AND not
12070 * LCP_SKIP flag set will be recorded. All rows with LCP_SKIP flag set
12071 * will be ignored for ALL ROWS pages and will be ignored for CHANGED ROWS
12072 * pages based on the last LCP state. Rows without committed data will be
12073 * ignored for ALL ROWS pages and will be ignored based on the last LCP
12074 * state for CHANGED ROWS pages.
12075 *
12076 * When we are done executing a page for the LCP scan we can set the
12077 * last LCP state to A.
12078 *
12079 * (2) is found when releasing the page. Before page is released it will have
12080 * its rows deleted, for each row that is deleted and wasn't already
12081 * deleted since start of LCP we will record the row using the LCP keep
12082 * list and also setting LCP_SKIP flag on the row. When releasing the
12083 * page we can ignore it based on knowledge of the last LCP state.
12084 *
12085 * In this we set the last LCP state and also read it when reaching the
12086 * D event. This event can even occur while we're in the middle of
12087 * scanning the page for the LCP.
12088 *
12089 * (3) is found by discovering that the LCP_SCANNED_BIT is set. This is set
12090 * by the first D event after start of LCP after handling the page as
12091 * in (2).
12092 *
12093 * Last LCP state already set in the first D event after start of LCP.
12094 *
12095 * Case 3: Allocated before start of last LCP and dropped at start of this LCP
12096 *
12097 * Desired action for ALL ROWS pages: Page ignored
12098 *
12099 * Desired action for CHANGED ROWS pages: DELETE by PAGEID recorded in LCP
12100 *
12101 * A LCP_Start(n) D [AD].. LCP_Start(n+1) E LCP_End(n+1) (1)
12102 * A LCP_Start(n) D [AD].. LCP_Start(n+1) A LCP_End(n+1) (2)
12103 * A LCP_Start(n) D [AD].. LCP_Start(n+1) [AD].. A LCP_End(n+1) (3)
12104 *
12105 * Here we will take the same action for all cases independent of if we know
12106 * state of the last LCP or not since the state was allocated before and thus
12107 * we need to record the change in state.
12108 *
12109 * (1) is found by empty page slot and no LCP_SCANNED_BIT set and not skip
12110 * flag set on page. For ALL ROWS pages we will simply ignore those
12111 * pages. For CHANGED ROWS pages we will record DELETE by PAGEID based
12112 * on the state of the last LCP.
12113 * (2) is found by discovering page->is_page_to_skip_lcp() is true when LCP
12114 * scan reaches it. For ALL ROWS pages this means we can ignore it, for
12115 * CHANGED ROWS pages we record it as DELETE by PAGEID based on the state
12116 * of the last LCP.
12117 * (3) is found by discovering the LCP_SCANNED_BIT set which was set when the
12118 * first D event after start of LCP was found. When this first D event
12119 * occurred we handled the page as in (1) followed by setting the
12120 * LCP_SCANNED_BIT.
12121 *
12122 * The same principles for handling last LCP state exists here as for Case 1.
12123 *
12124 * Case 4: Allocated before start of last LCP and allocated before start
12125 * of this LCP
12126 *
12127 * Desired action for ALL ROWS pages: Record all rows with committed data at
12128 * start of LCP. Ignore all rows without committed data at start of LCP.
12129 *
12130 * Desired action for CHANGED ROWS pages: Record all rows with committed data
12131 * at start of LCP. Record all rows without committed data at start of LCP as
12132 * DELETE by ROWID.
12133 *
12134 * A LCP_Start(n) [DA].. LCP_Start(n+1) E LCP_End(n+1) (1)
12135 * A LCP_Start(n) [DA].. LCP_Start(n+1) D LCP_End(n+1) (2)
12136 * A LCP_Start(n) [DA].. LCP_Start(n+1) [DA].. D LCP_End(n+1) (3)
12137 *
12138 * (1) is found by an existing page without LCP_SCANNED_BIT set and without
12139 * the page to skip flag set on the page. We will check row by row if the
12140 * row is to be copied to LCP.
12141 *
12142 * If a row exists at start of LCP then it will be recorded in the LCP,
12143 * either at LCP scan time or at first delete after the start of the LCP.
12144 * When the first delete have occurred then we set the LCP_SKIP flag on
12145 * the row to indicate that the row has already been processed for this
12146 * LCP. The handling here is the same for ALL ROWS pages and for CHANGED
12147 * ROWS pages.
12148 *
12149 * If a row didn't exist at start of LCP then we will ignore it for ALL
12150 * ROWS pages and we will record a DELETE by ROWID for CHANGED ROWS
12151 * pages. We discover this as part of LCP scan for rows not inserted
12152 * again before the LCP scan reaches them. For rows that are inserted
12153 * after start of LCP we will mark them with LCP_SKIP flag for ALL ROWS
12154 * pages. For CHANGED ROWS pages we could record the DELETE by ROWID
12155 * immediately, but there is no safe space to record this information.
12156 * So instead we mark the row with LCP_DELETE to flag to the LCP scan
12157 * that this row needs to generate a DELETE by ROWID.
12158 *
12159 * (2) is found when releasing a page, at this point the page has already
12160 * recorded everything for ALL ROWS pages. We indicate this by setting
12161 * LCP_SCANNED_BIT on the page.
12162 *
12163 * However for CHANGED ROWS pages we can still have a set of rowids that
12164 * was empty at start of LCP that we need to record before moving on.
12165 * We scan the page before moving on, we ignore rows that have the
12166 * LCP_SKIP flag set and rows that have rowGCI < scanGCI which indicates
12167 * that they were empty also at last LCP. All other rows we generate a
12168 * DELETE by ROWID for. Also here we set the LCP_SCANNED_BIT after
12169 * doing this.
12170 *
12171 * (3) is found by LCP_SCANNED_BIT set when LCP scan reaches it. Any A or D
12172 * event after the first D event will be ignored since LCP_SCANNED_BIT
12173 * is set.
12174 *
12175 * The same principles for handling last LCP state exists here as for Case 2.
12176 *
12177 * Requirement to record number of pages at start of LCP
12178 * .....................................................
12179 * For partial LCPs we record the number of pages existing in the whole
12180 * fragment at the start of the partial LCP, this has the effect that during
12181 * restore we can safely ignore all LCP records on rowids with higher page id
12182 * than the recorded number of pages. They could never be part of the LCP even
12183 * if they are part of earlier LCPs.
12184 *
12185 * Let's look at an example here. Each page can be sparse or full, it doesn't
12186 * matter for the description, we need to ensure that the restore can recover
12187 * the correct set of rows.
12188 *
12189 * LCP 1: Contains 17 pages (rowids from page 0 to 16 included)
12190 * LCP 2: Contains 13 pages
12191 * LCP 3: Contains 14 pages
12192 *
12193 * When restoring LCP 3 we make use also of parts from LCP 1 and LCP 2.
12194 * We start by applying LCP 1 for rowids in page 0 to 13. Next, when we
12195 * start applying LCP 2, we need to perform DELETE by ROWID for all rows
12196 * in page id 13. We know that all rowids from page id 13 have either
12197 * GCI = 0 or a GCI > lcpScanGci which makes them recorded as changes
12198 * in LCP 3.
12199 *
12200 * If we had not recorded the number of pages in LCPs we would not be
12201 * able to know that rows in page id 14 through 16 were deleted since
12202 * the LCP scan would not see them since they were not part of the
12203 * pages scanned during LCP (simply because the pages no longer existed).
12204 *
12205 *
12206 * Multiple LCP files to save disk space
12207 * .....................................
12208 * Using partial LCP it is essential to be able to drop files as early as
12209 * possible. If an LCP file contain too many parts fully written then the
12210 * file needs to be retained although most of its data is no longer useful.
12211 *
12212 * To avoid this we cap the number of parts we use for large fragments
12213 * in each file and use a multi-file implementation of each partial LCP.
12214 *
12215 * What we do here is that we divide the LCP of each fragment into several
12216 * files. We will write each of those files in sequential order. Assume that
12217 * we have 2048 parts and that this LCP is to record 256 of those parts starting
12218 * at part 100. Assume that we divide this LCP into 4 files.
12219 *
12220 * The first file will record all rows from part 100-163, the second will
12221 * contain all rows from part 164-227, the third file will contain all
12222 * rows from part 228-291 and the fourth and last file will contain
12223 * all rows from part 292-355.
12224 *
12225 * The rows from the LCP keep list is written into the file currently
12226 * used.
12227 *
12228 * Changed rows are written to any of the files. But we choose to write
12229 * them to the first file. The reason is that this means that the biggest
12230 * file in the LCP will be removed first and thus it is the most efficient
12231 * algorithm to save disk space.
12232 *
12233 * It is a bit complicated to understand to prove that this brings about
12234 * an LCP that can be correctly restored. We prove it in a number of
12235 * steps before proving the theorem for Partial LCPs.
12236 *
12237 * Corollary 1:
12238 * ............
12239 * For each LCP part we always start by applying an LCP where all rows
12240 * of the part is recorded. Then we will execute the change parts of
12241 * all LCPs thereafter until the last.
12242 *
12243 * Proof:
12244 * This is the intended recovery algorithm, so proof is not really
12245 * needed. Proof is only required to prove that this recovers a
12246 * proper LCP according to Theorem 1 above.
12247 *
12248 * Case 1:
12249 * Assume that the row existed at the time of the first LCP used in
12250 * restore and is kept all the way until the last LCP, updates can
12251 * occur.
12252 *
12253 * Case 2:
12254 * Assume that the row was inserted after initial LCP and is kept
12255 * until the last LCP.
12256 *
12257 * Case 3:
12258 * Assume that the row existed at the time of the first LCP but has
12259 * been deleted before the final LCP.
12260 *
12261 * Case 4:
12262 * Assume that the row didn't exist at the first LCP and did not
12263 * exist at the time of the last LCP.
12264 *
12265 * Case 4 is obviously ok, no LCP has recorded anything regarding
12266 * this row, so it cannot be a problem.
12267 *
12268 * Case 1 means that the row is restored in first LCP, if any changes
12269 * has occurred before the last LCP they will be recorded in any of
12270 * the LCP preceding the last LCP or in the last LCP itself. It
12271 * could contain a newer value if the last LCP had changes that
12272 * occurred after start of the LCP. Thus the row is present with
12273 * same or newer data as it should be according to Theorem 1.
12274 *
12275 * Case 2 means that the row was not present in the first LCP.
12276 * It must have been inserted in either of the following LCPs
12277 * or the last LCP and since it will be marked with a higher GCI
12278 * when inserted it will be part of the next LCP after being
12279 * inserted, similarly any updates will be recorded in some LCP if
12280 * it happens before or during the last LCP. Thus the row exists
12281 * after applying rows according to Corollary 1 such that Theorem 1
12282 * holds true.
12283 *
12284 * Finally Case 3 has inserted the row as part of the first LCP. The
12285 * row could have been written by the LCP keep list in this first LCP.
12286 * However when the row is deleted the GCI of the row will be set to
12287 * a GCI higher than the GCI of the first LCP and this ensures that
12288 * the rowid is recorded in LCP as DELETE by ROWID. Finally if the
12289 * entire page has been removed before the last LCP we will record
12290 * this in the last LCP and this means that we will ignore the row
12291 * that exists in the first LCP restored since we know that not any
12292 * rows with that rowid is present in the LCP.
12293 *
12294 * This means that we have proven that the LCP also in case 3 fits
12295 * with Theorem 1 in that the row is certain to not be part of the
12296 * LCP restored.
12297 *
12298 * Thus all cases have been proven and Corollary 1 is proven to be
12299 * a correct restore method for LCPs with Partial LCPs.
12300 *
12301 * Corollary 2:
12302 * ............
12303 * The LCP keep list can be recorded in any LCP file in the case where
12304 * multiple files are used to record an LCP.
12305 *
12306 * Proof:
12307 * The record in the LCP from a LCP keep list will always be overwritten
12308 * or ignored by the following LCPs. The reason is simply that the GCI of
12309 * the delete is higher than LCP scan GCI of the current LCP. Thus the
12310 * next LCP will either overwrite this record with a DELETE by ROWID or
12311 * the record will be ignored by the next LCP since the entire page has
12312 * been dropped or the rowid will be overwritten by another row that
12313 * reused the rowid of the deleted row.
12314 *
12315 * So thus it is safe to store these LCP keep list items as they come
12316 * and record them in any list. Obviously all the files of the last
12317 * LCP will be kept and applied as part of restore.
12318 *
12319 * Corollary 3:
12320 * ............
12321 * When we remove a file from an LCP we could not be interested in any
12322 * of the change rows from this LCP. We are only interested in the
12323 * parts where we have recorded all rows.
12324 *
12325 * Proof:
12326 * We will only remove the oldest LCP files at any time. Thus when we
12327 * remove a file from an LCP we are sure that all the files from the
12328 * previous LCP are already deleted. This means that the LCP from where
12329 * we delete files can only be used to restore the all rows part as
12330 * described in Corollary 1. Thus we will always ignore all parts
12331 * with changed rows for an LCP where we are about to delete a file.
12332 *
12333 * Theorem 2:
12334 * ----------
12335 * The following algorithm will be applied using multiple files.
12336 * If we want to divide the parts where we record all rows into multiple
12337 * files we do so in the following manner:
12338 * 1) In the first file we will record up to 1/8th of the parts. We will
12339 * also record all changed rows for parts where we are not recording
12340 * all rows. In addition LCP keep rows are recorded as they arrive.
12341 * 2) In the following files we will record also all rows for up to 1/8th
12342 * of the parts. Also LCP keep rows for those as they arrive.
12343 *
12344 * Proof:
12345 * ------
12346 * Corollary 2 shows that it is correct to record LCP keep rows as they
12347 * arrive in any of the files.
12348 * Corollary 3 shows that any algorithm to select where to record
12349 * changed rows is correct, in particular this shows that the selected
12350 * variant to record all in the first file is correct.
12351 * Corollary 1 shows that the restore algorithm for this type of LCP
12352 * works as desired.
12353 *
12354 * Observation 2:
12355 * --------------
12356 * Given that we need two different mechanisms to deduce if a page should
12357 * be skipped when LCP scanned (is_page_to_skip_lcp() through state on
12358 * page and lcp_scanned_bit set in page map) this means that both of
12359 * those need to be checked to see if a row is in remaining LCP set
12360 * that is used to decide whether to set LCP_SKIP bit on the row.
12361 *
12362 * The is_page_to_skip_lcp() flag on page is set when a page as first
12363 * alloc/release page event after start of LCP scan is allocated. After
12364 * this the page can be released and if so the last LCP state of the
12365 * page will be updated and the lcp scanned bit will be set.
12366 *
12367 * Similarly if the page is released as the first page event after
12368 * start of LCP scan we will also update the last LCP state and
12369 * next set the lcp scanned bit. So when we see a lcp scanned bit we
12370 * need never do anything more during the LCP scan, we only need to
12371 * reset the bit.
12372 *
12373 * Lemma 1:
12374 * --------
12375 * Based on theorem 2 we deduce that each LCP requires a LCP control
12376 * file that contains at least the following information.
12377 *
12378 * MaxGciCompleted:
12379 * This is the GCI for which we have all changes in the LCP. The
12380 * LCP can also contain changes for MaxGciCompleted + 1 and
12381 * MaxGciCompleted + 2 and beyond.
12382 *
12383 * MaxPageCount:
12384 * This is the number of pages existing (with rowids) in the LCP which
12385 * is recorded at the start of the partial LCP.
12386 *
12387 * A list of part ranges (one part range per file) and the file numbers.
12388 * This is recorded using the following variables in the LCP control file.
12389 *
12390 * MaxPartPairs:
12391 * This is the maximum number of LCPs that can constitute a recoverable
12392 * checkpoint. Thus an LCP control file can write at most this many
12393 * parts. Currently this number is set to 2048.
12394 *
12395 * NumPartPairs:
12396 * This is the number of files used in the restore of this LCP, there is
12397 * one part range per file.
12398 *
12399 * MaxNumberDataFiles:
12400 * This is the maximum number of files used, it is used to calculate the
12401 * file numbers based on a number of files (NumPartPairs) and the
12402 * parameter LastDataFileNumber.
12403 *
12404 * LastDataFileNumber:
12405 * The last LCP file, this will be the final file restored in a restore
12406 * situation.
12407 *
12408 * An array of pairs (startPart, numParts) where the last records the
12409 * last LCP file and the first records the first file to start restoring
12410 * from.
12411 *
12412 * In addition we record the following information in the LCP control
12413 * file.
12414 *
12415 * Checksum:
12416 * To verify the content of the LCP control file.
12417 *
12418 * TableId:
12419 * Table id of the checkpointed fragment.
12420 *
12421 * FragmentId:
12422 * Fragment id of the checkpointed fragment.
12423 *
12424 * LcpId:
12425 * The global LcpId this LCP belongs to.
12426 *
12427 * LocalLcpId:
12428 * If part of global LCP it is 0, otherwise it is 1, 2, 3 and so forth
12429 * for a local LCP executed without control of DIH.
12430 *
12431 * In addition the LCP control file contains a file header as all LCP
12432 * files and backup files. The most important information here is the
12433 * version number of the partial LCP changes as such and the version
12434 * number that wrote this file. This is important for any upgrade
12435 * scenarios.
12436 *
12437 * LCPs and Restarts:
12438 * ------------------
12439 * Partial LCP is developed to store less information in LCPs and also
12440 * that LCPs can run faster. When LCPs complete faster that means that
12441 * we can cut the REDO log much sooner.
12442 *
12443 * However we still need to make a full checkpoint as part of a restart.
12444 * We will describe the implications this has for various types of
12445 * restarts.
12446 *
12447 * System Restart:
12448 * ...............
12449 * No real implication, we have ensured that doing a full checkpoint is
12450 * still divided into separate files to ensure that we save disk space.
12451 * There are no updates ongoing during this LCP so this LCP will simply
12452 * write the changed contents while executing the REDO log.
12453 *
12454 * Node restart:
12455 * .............
12456 * This restart depends to a great extent on how long time the node
12457 * was dead, if it was dead for a long time it will have a lot more
12458 * to write in a LCP than otherwise.
12459 *
12460 * Initial node restart:
12461 * .....................
12462 * This is the trickiest problem to solve. Using partial LCP we aim for
12463 * LCPs to complete in 5-10 minutes, but writing the initial LCP after
12464 * synching the data with the live node might take many hours if the
12465 * node contains terabytes of data.
12466 *
12467 * We solve this by running local LCPs before we become part of the
12468 * global LCP protocol. DIH won't know about these LCPs but it doesn't
12469 * really matter, we can make use of it if the node crashes during
12470 * restart although DIH didn't know about it. But more importantly
12471 * as soon as we participate in the first global LCP we can run that
12472 * LCP much faster since we already have logged all rows, so we only
12473 * need to record the changes since the last local LCP in the first
12474 * global LCP.
12475 *
12476 * The protocol used to tell the starting node about state of fragments
12477 * is called COPY_ACTIVEREQ. This is received 2 times per fragment
12478 * per node restart. The first one says that we have completed the
12479 * synchronisation. We will use this first signal to put the fragment
12480 * in queue for running an LCP.
12481 *
12482 * When all fragments have been synchronised then DIH will start the
12483 * second phase. In this phase each fragment will start using the
12484 * REDO log as preparation for the first LCP.
12485 *
12486 * Note that a local LCP cannot be used to restore the database on
12487 * its own. It requires either a node synchronization as part of node
12488 * restart which works fine as the rowids are synchronized one by one
12489 * and there might be unneeded work done if the live node uses a GCI
12490 * from DIH, but it will still be correct.
12491 *
12492 * It can also be restored in a system restart by using REDO logs from
12493 * other nodes, we can avoid applying REDO logs we don't need since we
12494 * know what GCP we have completely recorded in the LCP. The proof of
12495 * why applying REDO logs will restore a consistent database still
12496 * holds.
12497 *
12498 * Obviously if as part of recovery we are told to execute the REDO log
12499 * from GCI 77 to 119 and we know that the LCP is completed for GCI
12500 * 144 then we can completely skip the part where we execute the
12501 * REDO log for that fragment as part of the recovery. Later it will
12502 * be synched up in this case using a live node.
12503 *
12504 * Local LCPs during restart
12505 * .........................
12506 * When we receive the first COPY_ACTIVEREQ in DBLQH we will start a
12507 * new local LCP. This will insert an UNDO_LOCAL_LCP_FIRST into the
12508 * UNDO log. This means that we can move the UNDO log forward, we
12509 * still need to retain all UNDO log records from the previous LCP,
12510 * and the one before that since we cannot be certain that the previous
12511 * LCP actually completed.
12512 *
12513 * During Local LCP we cannot insert one more UNDO_LOCAL_LCP_FIRST again
12514 * until we have completed a Local LCP of each and every fragment to be
12515 * restored.
12516 *
12517 * So what this means is that we will start running a Local LCP as part
12518 * of the synchronisation with the live node. It is possible to run an
12519 * LCP for an individual fragment several times during this round of
12520 * LCP. But we need to complete the Local LCP before allowing the
12521 * first COPY_ACTIVEREQ in the second phase to continue. If we didn't
12522 * do this we would run a much bigger chance of running out of UNDO
12523 * log. In some cases we might still run out of UNDO log and in this
12524 * case we will ensure that the LCP gets higher priority and that the
12525 * synchronisation process is blocked temporarily. We will do this
12526 * when certain thresholds in UNDO log usage are reached.
12527 *
12528 * We will allow for two choices in how we perform Local LCPs. We will
12529 * perform 1 Local LCP for all node restarts before we allow the
12530 * REDO logging to be activated (activated by COPY_ACTIVEREQ in second
12531 * phase). After completing this first Local LCP we will measure how
12532 * much impact introducing the node into the distributed LCP would mean.
12533 * If we consider the impact too high we will execute one more round of
12534 * Local LCP.
12535 *
12536 * We will not for the moment consider executing a third Local LCP to
12537 * ensure that we don't get stuck in this state for too long.
12538 *
12539 * Executing 2 Local LCPs should in most cases be sufficient to catch
12540 * up with LCP times at other nodes.
12541 *
12542 * Dropped tables during a node failure
12543 * ....................................
12544 * This is a tricky problem that requires us to avoid reusing a table id
12545 * for a new table until we're sure that all nodes have restarted and
12546 * heard that the table has been dropped. We also need to tell starting
12547 * nodes that the table is dropped and that it requires all LCP files
12548 * to be removed.
12549 *
12550 * Various implementation details about LCPs
12551 * .........................................
12552 * When we commit a delete we need to know if the fragment is currently
12553 * performing a LCP and if so we need to know if the row has been
12554 * scanned yet during LCP.
12555 *
12556 * With Partial LCP this is a bit more intricate where we need to check
12557 * the scan order in the Backup block. However only DBTUP knows if a
12558 * page has been deleted and then followed by a new page allocation.
12559 *
12560 * For parts where we record all rows of the part these pages can be
12561 * skipped since all rows inserted into this page occurs after start of
12562 * LCP.
12563 *
12564 * However for parts where we record changed rows we need to scan these
12565 * pages and record DELETE by ROWID for those entries that are free.
12566 *
12567 * LCP signal flow
12568 * ---------------
12569 *
12570 * Description of local LCP handling when checkpointing one fragment locally in
12571 * this data node. DBLQH, BACKUP are executing always in the same thread. DICT
12572 * and NDBFS mostly execute in different threads.
12573 *
12574 * The LCP_PREPARE_REQ for the next fragment to checkpoint can execute in
12575 * parallel with BACKUP_FRAGMENT_REQ processing. This makes LCP processing
12576 * faster when there are many small fragments.
12577 *
12578
12579 DBLQH BACKUP DICT NDBFS
12580 | |
12581 | LCP_PREPARE_REQ |
12582 |---------------------------->|
12583 | | 2 * FSOPENREQ (control files)
12584 | |----------------------------------->|
12585 | | 2 * FSOPENCONF |
12586 | |<-----------------------------------|
12587 | | 2 * FSREADREQ (control files)
12588 | |----------------------------------->|
12589 | | 2 * FSREADCONF |
12590 | |<-----------------------------------|
12591 | | FSCLOSEREQ (most recent control file)
12592 | |----------------------------------->|
12593 | | FSCLOSECONF |
12594 | |<-----------------------------------|
12595 | | FSOPENREQ (checkpoint data file)
12596 | |----------------------------------->|
12597 | | FSOPENCONF |
12598 | |<-----------------------------------|
12599 | | CONTINUEB(ZBUFFER_FULL_META) to oneself
12600 | |--------------------------------------->
12601 | | GET_TABINFOREQ |
12602 | |----------------->|
12603 | | GET_TABINFO_CONF |
12604 | |<-----------------|
12605 | LCP_PREPARE_CONF |
12606 |<----------------------------|
12607 ...
12608 | BACKUP_FRAGMENT_REQ |-------> CONTINUEB(START_FILE_THREAD)|
12609 |---------------------------->|
12610 | SCAN_FRAGREQ |
12611 |<----------------------------|
12612 |
12613 | Potential CONTINUEB(ZTUP_SCAN) while scanning for tuples to record in LCP
12614 |
12615 | TRANSID_AI |
12616 |---------------------------->|
12617 |.... More TRANSID_AI | (Up to 16 TRANSID_AI, 1 per record)
12618 | SCAN_FRAGCONF(close_flag) |
12619 |---------------------------->|
12620 | SCAN_NEXTREQ |
12621 |<----------------------------|
12622 |
12623 | Potential CONTINUEB(ZTUP_SCAN) while scanning for tuples to record in LCP
12624 |
12625 | TRANSID_AI |
12626 |---------------------------->|
12627 |.... More TRANSID_AI | (Up to 16 TRANSID_AI, 1 per record)
12628 | SCAN_FRAGCONF(close_flag) |
12629 |---------------------------->|
12630
12631 After each SCAN_FRAGCONF we check if there is enough space in the Backup
12632 buffer used for the LCP. We will not check it until here, so the buffer
12633 must be big enough to be able to store the maximum size of 16 records
12634 in the buffer. Given that maximum record size is about 16kB, this means
12635 that we must have at least 256 kB of buffer space for LCPs. The default
12636 is 2MB, so one should not set it lower than this unless trying to achieve
12637 a really memory optimised setup.
12638
12639 If there is currently no space in the LCP buffer, then the buffer is either
12640 waiting to be written to disk, or it is being written to disk. In this case
12641 we will send a CONTINUEB(BUFFER_FULL_SCAN) delayed signal until the buffer
12642 is available again.
12643
12644 When the buffer is available again we send a new SCAN_NEXTREQ for the next
12645 set of rows to be recorded in LCP.
12646
12647 CONTINUEB(START_FILE_THREAD) will either send a FSAPPENDREQ to the opened
12648 file or it will send a delayed CONTINUEB(BUFFER_UNDERFLOW).
12649
12650 When FSAPPENDCONF arrives it will make the same check again and either
12651 send one more file write through FSAPPENDREQ or another
12652 CONTINUEB(BUFFER_UNDERFLOW). It will continue like this until the
12653 SCAN_FRAGCONF has been sent with close_flag set to true AND all the buffers
12654 have been written to disk.
12655
12656 After the LCP file write has been completed the close of the fragment LCP
12657 is started.
12658
12659 An important consideration when executing LCPs is that they conflict with
12660 the normal processing of user commands such as key lookups, scans and so
12661 forth. If we execute everything on normal JBB-level we are going to get
12662 problems in that we could have job buffers of thousands of signals. This
12663 means that we will run the LCP extremely slow which will be a significant
12664 problem.
12665
12666 The other approach is to use JBA-level. This will obviously give the
12667 LCP too high priority, we will run LCPs until we have filled up the
12668 buffer or even until we have filled up our quota for the 100ms timeslot
12669 where we check for those things. This could end up in producing 10
12670 MByte of LCP data before allowing user level transactions again. This
12671 is also obviously not a good idea.
12672
12673 So most of the startup and shutdown logic for LCPs, both for the entire
12674 LCP and messages per fragment LCP is ok to raise to JBA level. They are
12675 short and concise messages and won't bother the user transactions at any
12676 noticeable level. We will avoid fixing GET_TABINFO for that since it
12677 is only one signal per fragment LCP and the code path is also used by
12678 many other activities which are not suitable to run at JBA-level.
12679
12680 So the major problem to handle is the actual scanning towards LQH. Here
12681 we need to use a mechanism that keeps the rate at appropriate levels.
12682 We will use a mix of keeping track of how many jobs were executed since
12683 last time we executed together with sending JBA-level signals to speed
12684 up LCP processing for a short time and using signals sent with delay 0
12685 to avoid being delayed for more than 128 signals (the maximum amount
12686 of signals executed before we check timed signals).
12687
12688 The first step to handle this is to ensure that we can send SCAN_FRAGREQ
12689 on priority A and that this also causes the resulting signals that these
12690 messages generate also to be sent on priority A level. Then each time
12691 we can continue the scan immediately after receiving SCAN_FRAGCONF we
12692 need to make a decision at which level to send the signal. We can
12693 either send it as delayed signal with 0 delay or we could send them
12694 at priority A level to get another chunk of data for the LCP at a high
12695 priority.
12696
12697 We send the information about Priority A-level as a flag in the
12698 SCAN_FRAGREQ signal. This will ensure that all resulting signals
12699 will be sent on Priority A except the CONTINUEB(ZTUP_SCAN) which
12700 will get special treatment where it increases the length of the
12701 loop counter and sends the signal with delay 0. We cannot send
12702 this signal on priority level A since there is no bound on how
12703 long it will execute.
12704
12705 DBLQH PGMAN LGMAN BACKUP DICT NDBFS
12706 | SYNC_PAGE_CACHE_REQ
12707 | <------------------|
12708 | sync_log_lcp_lsn |
12709 | <----------|
12710 | Flush UNDO log
12711 | ---------->|
12712 | Flush fragment page cache
12713 | SYNC_PAGE_CACHE_CONF
12714 | ------------------>|
12715 | If first fragment in LCP then also:
12716 | SYNC_EXTENT_PAGES_REQ
12717 | <------------------|
12718 | Flush all extent pages
12719 | SYNC_EXTENT_PAGES_CONF
12720 | ------------------>|
12721 |
12722 | After all file writes to LCP data file completed:
12723 |
12724 | | FSCLOSEREQ
12725 | |------------------------------------>|
12726 | | FSCLOSECONF
12727 | |<------------------------------------|
12728
12729 When all those activities are completed:
12730 1) Sync UNDO log
12731 2) Sync page cache
12732 3) Sync extent pages (done immediately following sync of page cache)
12733 4) Write and close of LCP data file
12734 then we are ready to write the LCP control file. After this file
12735 is written and closed the LCP of this fragment is completed.
12736
12737 With this scheme the LCP of a fragment is immediately usable when the
12738 LCP of a fragment is completed and the signal of this completion is
12739 that a written LCP control file exists. At restart one needs to verify
12740 the GCI of this file to ensure that the LCP is restorable. Otherwise
12741 the older LCP will be used.
12742
12743 | | FSWRITEREQ (LCP control file)
12744 | |----------------------------------->|
12745 | | FSWRITECONF
12746 | |<-----------------------------------|
12747 | | FSCLOSEREQ (LCP control file)
12748 | |----------------------------------->|
12749 | | FSCLOSECONF
12750 | |<-----------------------------------|
12751 | |
12752 | BACKUP_FRAGMENT_CONF |
12753 |<----------------------------|
12754 |
12755 | DIH (local)
12756 | LCP_FRAG_REP |
12757 |--------------------->|
12758
12759 LCP_FRAG_REP is distributed to all DIHs from the local DIH instance.
12760
12761 Finally after completing all fragments we have a number of signals sent to
12762 complete the LCP processing. The only one needed here is the END_LCPREQ
12763 to TSMAN to make the dropped pages from any dropped tables available again
12764 after completing the LCP. This signal needs no wait for it to complete.
12765 DBLQH knows when the last fragment is completed since it will receive a
12766 special LCP_FRAG_ORD with lastFragmentFlag set from LQH proxy which in
12767 turn received this from DIH.
12768
12769 LQH Proxy PGMAN(extra) LGMAN TSMAN
12770 | LCP_FRAG_ORD(last) |
12771 |<----------------------------|
12772 ......
12773 | LCP_COMPLETE_REP |
12774 |---------------------------->|
12775
12776 Here the LQH Proxy block will wait for all DBLQH instances to complete.
12777 After all have completed the following signals will be sent.
12778 LQH Proxy PGMAN(extra) LGMAN TSMAN
12779
12780 | END_LCPREQ |
12781 |---------------------------------->|
12782 | END_LCPCONF |
12783 |<----------------------------------|
12784 |
12785 | LCP_COMPLETE_REP(DBLQH) sent to DIH(local)
12786
12787
12788 As preparation for this DBLQH sent DEFINE_BACKUP_REQ to setup a backup
12789 record in restart phase 4. It must get the response DEFINE_BACKUP_CONF for
12790 the restart to successfully complete. This signal allocates memory for the
12791 LCP buffers.
12792
12793 Background deletion process
12794 ---------------------------
12795 To save file space we try to delete old checkpoint files no longer needed
12796 as soon as possible. This is a background process fully handled by the
12797 BACKUP block, it is handled outside the normal LCP processing protocol.
12798
12799 It could interfere with LCP processing in the exceptional case that we
12800 haven't managed to delete the old LCP files for a fragment before starting
12801 to prepare the next local checkpoint.
12802
12803 From DIH's point of view we always have a LCP instance 0 and a LCP instance
12804 1 for each fragment. When we complete writing a checkpoint file we need to
12805 keep the old checkpoint file until the new checkpoint file is usable in a
12806 restore case. At the time when it completes we cannot use it since it can
12807 contain rows from a GCI that haven't been fully completed yet. As soon as
12808 we get an indication of that the checkpoint is useful for restore we can
12809 delete the old checkpoint file.
12810
12811 To handle this we maintain a list of fragments to handle deletes of fragment
12812 checkpoint files.
12813
12814 We also need a way to handle deletion of old files after crashes. This is
12815 actually fairly easy to handle as part of the recovery as we use the
12816 checkpoint files to restore, we can as part of that remove any old
12817 checkpoint files.
12818
12819 Local LCP execution
12820 -------------------
12821 Normally an LCP is executed as a distributed checkpoint where all nodes
12822 perform the checkpoint in a synchronised manner. During restarts we might
12823 execute extra local LCPs that can be used to cut the logs (REDO and UNDO
12824 logs). We don't generate REDO logs until very late in the recovery process,
12825 UNDO logs however we generate all the time, so it is mainly the UNDO log
12826 we have to protect from being exhausted during a restart.
12827
12828 Such a local checkpoint can be used to recover a system, but it can normally
12829 not be used to recover a node on its own. If the local LCP happens during a
12830 system restart there are two options. If we have seen the GCP that we are
12831 attempting to restore we have all checkpoints and REDO logs required and
12832 a local LCP during restart should not be necessary normally. If our node is
12833 behind and we rely on some other node to bring us the latest GCIs then we
12834 might have to perform a checkpoint. In this case this local LCP will not
12835 be recoverable on its own.
12836
12837 The reason why these local LCPs are not recoverable on their own is two
12838 things. First the synchronisation of data with the other node might not
12839 be completed yet when the local LCP starts. This means that the local LCP
12840 isn't seeing a united view, some rows will see a very new version whereas
12841 other rows will be seeing a very old view. To make a consistent state one
12842 more node is required. Second even if the local LCP started after the
12843 synchronisation was complete we don't have local REDO log records that
12844 can bring the local LCP to a consistent state since we don't write to
12845 the REDO log during the synchronisation phase. Even if we did write to
12846 the REDO log during synchronisation the various fragments would still be
12847 able to recover to different GCIs, thus a consistent restore of the node
12848 is still not possible.
12849
12850 So when a node crashes the first time it is always recoverable on its
12851 own from a certain GCI. The node with the highest such GCI per node
12852 group is selected as the primary recovery node. Other nodes might have
12853 to rely on this node for its further recovery. Obviously each node group
12854 need to be restored from the same GCI to restore a consistent database.
12855 As soon as we start executing a local LCP the node is no longer able to
12856 be restored independent of other nodes. So before starting to execute a
12857 local LCP we must first write something to the file system indicating that
12858 this node is now not recoverable unless another node gives us assistance.
12859
12860 So independent of what GCI this can restore according to the system file
12861 it cannot be used to recover data to other nodes without first recovering
12862 its own data using another node as aid.
12863
12864 When a node is started we know of the GCI to restore for our node, it
12865 is stored in DBLQH in the variable crestartNewestGci during recovery
12866 and DBLQH gets it from DBDIH that got it from the system file stored
12867 in the DIH blocks.
12868
12869 For distributed LCPs we use this GCI to restore to check if a fragment
12870 LCP can be used for recovery. However for local LCPs this information
12871 is normally not sufficient. For local LCPs we either have a fixed
12872 new GCI that we need to handle (during system restart) or a moving
12873 set of GCPs (during node start).
12874
12875 So for a restore we need to know the crestartNewestGci from DBLQH, but
12876 we also need to know the GCIs that we can use from other nodes. This
12877 information must be written into the local system file of this node.
12878
12879 The local system file is stored in NDBCNTR. It contains the following
12880 information:
12881 1) Flag whether node is restorable on its own
12882 2) Flag whether node have already removed old LCP files
12883 3) Last GCI of partial GCPs
12884
12885 When a node is starting up and we are recovering the data (executing
RESTORE_LCP_REQ from restore) we want to delete any files that aren't
12887 usable for recovery since they have a MaxGCIWritten that is larger
12888 than the above Last GCP of partial GCPs. Once we have completed
12889 the RESTORE_LCP_REQ phase we know that we have deleted all old
12890 LCP files that can no longer be used and we should only have one
12891 copy of each fragment LCP stored at this point. At this point we
12892 can set the flag above to indicate that we have already removed the
12893 old LCP files.
12894
12895 The important parameters in the LCP metadata files stored here are
12896 the parameters MaxGCIWritten and MaxGCICompleted.
12897
12898 When we write a local LCP the following holds for MaxGCIWritten.
12899 During system restart the MaxGCIWritten will be set to the
12900 GCI that the system restart is trying to restore. If the fragment
12901 has been fully synchronised before the local LCP started it will
12902 have the MaxGCICompleted set to the same GCI, otherwise it will
12903 have its value set to the crestartNewestGci (the GCP that was
12904 the last GCP we were part of the distributed protocol).
12905
12906 So for system restarts there are only two GCI values that can be
12907 used during a local LCP. It is the GCI we are attempting to
restore in the cluster or it is the GCI we were last involved in
a distributed protocol for (crestartNewestGci).
12910
12911 For node restarts the MaxGCIWritten is set according to what
12912 was set during the writing of the local LCP of the fragment.
12913 It will never be set smaller than crestartNewestGci.
12914
12915 MaxGCICompleted is set dependent on the state at the start
12916 of the local LCP. If the fragment was fully synchronized
12917 before the start of the fragment LCP we set MaxGCICompleted
12918 to the GCI that was recoverable in the cluster at the time
12919 of the start of the local fragment LCP. If the fragment
12920 wasn't fully synchronised before the start of the local LCP
12921 we set it to crestartNewestGci or the maximum completed GCI
12922 in the fragment LCP restored.
12923
12924 MaxGCIWritten is important during recovery to know whether
12925 a local LCP is valid, if MaxGCIWritten is larger than the
12926 GCP we have seen complete, the local LCP files cannot be
12927 trusted and must be deleted.
12928
12929 MaxGCICompleted setting can ensure that we don't have to
12930 re-execute the local REDO log any more. It also takes
12931 into account that we don't have to synchronize more
12932 than necessary with the starting node.
12933
12934 Information needed during restore for local LCP
12935 ...............................................
12936 We need to know about the crestartNewestGci. We also need
12937 to know the maximum GCI that is allowed when we encounter
12938 a local fragment LCP to understand which local fragment
12939 LCPs to remove.
12940 crestartNewestGci is sent as part of RESTORE_LCP_REQ for
12941 each restored fragment. We also need to add the max
12942 GCI restorable. Actually it is sufficient to send the
12943 maximum of those two values. Thus if the local system
12944 file says that we can recover on our own we will
12945 continue sending crestartNewestGci. Otherwise we will
12946 send the maximum of crestartNewestGci and the max GCI
12947 found in local system file.
12948
12949 If any of the MaxGciWritten and MaxGciCompleted is set
12950 higher than the max GCI restorable we are sending to
12951 the restore block we need to remove that fragment LCP.
12952
12953 Information needed during write of local LCP
12954 ............................................
12955 We need to know the state of the synchronisation of the fragment.
12956 If m_copy_started_state == AC_NORMAL &&
12957 fragStatus == ACTIVE_CREATION in DBLQH then we have completed
12958 the synchronisation of the fragment. Otherwise we haven't.
12959 We'll get this information from DBLQH at start of write of LCP
12960 in the Backup block.
12961
12962 The backup block is informed about the GCI that is currently
12963 completed in the cluster through the signal RESTORABLE_GCI_REP
12964 sent from DBLQH. This information DBLQH collects from
12965 the GCP_SAVEREQ signal. This information is stored in the
12966 Backup block in m_newestRestorableGci.
12967
12968 MaxGciCompleted is set by DBLQH and retrieved by Backup block
12969 in the method lcp_max_completed_gci. For normal distributed
12970 LCPs this method will simply set the MaxGciCompleted to the
12971 last completed GCI that DBLQH knows of. DBLQH gets to know
12972 of completion of a GCI through GCP_SAVEREQ. However for
12973 local LCP the procedure is a bit more complicated.
12974
12975 It will first check if the fragment is fully synchronised.
12976 If not it will set MaxGciCompleted to crestartNewestGci.
12977 If it is synchronised we will use the same method as for
12978 a distributed LCP given that we have completed the
12979 GCI fully since the fragment contains the same data as the
12980 live node although the data isn't yet recoverable.
12981
12982 Writing of local system file
12983 ............................
12984 Before we start a local LCP during recovery we write
12985 the local system file to indicate that the node can
12986 no longer be restored on its own until recovered again.
12987 This sets the following information in the local system
12988 file.
12989 1) Node restorable on its own flag is set to 0 (false).
12990 2) Flag indicating whether local LCPs removed is set to 0 (false).
12991 3) max GCP recoverable value is set to
12992 System Restart case: GCI cluster is restored to
12993 Node Restart case: GCI recoverable at the moment in cluster
12994
12995 For node restarts we also write the local system file and update
the max GCI recoverable value each time a GCI has been made
12997 recoverable.
12998
12999 During recovery we read the local system file to discover
13000 whether we can be master in the system restart and also to
13001 discover if we can recover on our own.
13002
13003 We propagate the max GCI recoverable value to DBLQH to ensure
13004 that we drop old LCP files that are not of any value in
13005 recovery any more.
13006
13007 After completing the restart we finally write the local system
13008 file during phase 50. In this phase all recovery of data is
13009 completed and only initialisation of SUMA clients remains, so
13010 it is safe to write the local system file here again. This time
13011 we set the values to:
13012 1) Node restorable on its own flag is set to 1 (true)
13013 2) Flag indicating whether local LCPs removed is set to 0 (ignorable)
13014 3) max GCP recoverable value is set to 0 (ignorable)
13015 */
/**
 * Entry point for LCP_PREPARE_REQ sent by DBLQH.
 *
 * Starts the prepare phase for a fragment LCP:
 * - checks that no prepare is currently outstanding and moves the
 *   backup record into PREPARE_READ_CTL_FILES state,
 * - records table and fragment identity in the single table/fragment
 *   records used for LCPs,
 * - if this is the first LCP_PREPARE_REQ of a new LCP (new backupId or
 *   localLcpId, or first LCP ever), resets the per-LCP counters and
 *   verifies that the background delete-file queue is empty,
 * - opens both LCP control files; processing continues asynchronously
 *   when NDBFS confirms the opens and reads.
 */
void
Backup::execLCP_PREPARE_REQ(Signal* signal)
{
  jamEntry();
  LcpPrepareReq req = *(LcpPrepareReq*)signal->getDataPtr();

  BackupRecordPtr ptr;
  c_backupPool.getPtr(ptr, req.backupPtr);

  TablePtr tabPtr;
  FragmentPtr fragPtr;

  jamLine(req.tableId);

  /* Only one fragment prepare may be outstanding at a time. */
  ndbrequire(ptr.p->prepareState == NOT_ACTIVE);
  ptr.p->prepareState = PREPARE_READ_CTL_FILES;
  ptr.p->prepareErrorCode = 0;

  /* LCPs use one table record with a single fragment record. */
  ptr.p->prepare_table.first(tabPtr);
  tabPtr.p->fragments.getPtr(fragPtr, 0);

  tabPtr.p->tableId = req.tableId;
  tabPtr.p->tableType = DictTabInfo::UserTable;

  fragPtr.p->fragmentId = req.fragmentId;
  fragPtr.p->scanned = 0;
  fragPtr.p->scanning = 0;
  fragPtr.p->tableId = req.tableId;
  fragPtr.p->createGci = req.createGci;

  if (req.backupId != ptr.p->backupId ||
      req.localLcpId != ptr.p->localLcpId ||
      !ptr.p->m_initial_lcp_started)
  {
    jam();
    /**
     * These variables are only set at the very first LCP_PREPARE_REQ in
     * an LCP. At this point there is no parallelism, so no need to
     * care for concurrency on the ptr object here.
     *
     * New LCP, reset per-LCP counters. noOfBytes and noOfRecords is other
     * than here handled by the LCP execution phase.
     */
    ptr.p->noOfBytes = 0;
    ptr.p->noOfRecords = 0;
    ptr.p->backupId = req.backupId;
    ptr.p->localLcpId = req.localLcpId;
    ptr.p->m_initial_lcp_started = true;
    ndbrequire(ptr.p->m_first_fragment == false);
    ptr.p->m_first_fragment = true;
    ptr.p->m_is_lcp_scan_active = false;
    ptr.p->m_current_lcp_lsn = Uint64(0);
    ptr.p->m_high_res_lcp_start_time = getHighResTimer();
    m_current_dd_time_us = Uint64(0);
    lcp_start_point(signal);
    DEB_LCP_STAT(("(%u)TAGS Start new LCP, id: %u", instance(), req.backupId));
    /* A new LCP must not start with old LCP files still queued for delete. */
    LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                  m_delete_lcp_file_head);
    ndbrequire(queue.isEmpty());
  }

  /**
   * We need to open both header files. One of them contains the latest
   * information from the last local checkpoint. We need however to
   * keep the old information around since this new LCP isn't immediately
   * useful for recovery. This also has the added benefit that we have the
   * files replicated. If we crash while we are still writing the new
   * header file we can always recover using the old header file. We
   * retain the old header file. This means that we need to open both
   * files to discover which of them is the most recent one. We should
   * use the older one to write the new header information into, but
   * we should use the newer header file to get the information about
   * which parts to perform the LCP on.
   */
  lcp_open_ctl_file(signal, ptr, 0);
  lcp_open_ctl_file(signal, ptr, 1);
}
13093
13094 /**
13095 * File processing for an LCP
13096 * --------------------------
13097 * At LCP_PREPARE_REQ we prepare the files for an LCP. There are two control
13098 * files for each fragment. These two files are both opened at prepare time.
13099 * One contains the description of the previous LCP and one contains the
13100 * description of the LCP before that one. Usually only one control file
13101 * exist per fragment since as soon as the LCP is fully completed we delete
13102 * the now oldest control file.
13103 *
13104 * So the steps are:
13105 * 1) Open both control files
13106 * 2) Find out which is the most recent control file.
13107 * 3) Use data from most recent control file to prepare which parts we will
 *    use for this LCP. Calculate number of next data file to use.
13109 * 4) Open the new data file for this LCP.
13110 * The old data file(s) will still exist
13111 * 5) Prepare phase is completed
13112 * 6) Execute phase of LCP fills the data file with data from this LCP.
13113 * 7) Flush and close the new data file.
13114 * 8) Write new control file, flush and close it.
13115 * 9) Report LCP processing as completed.
13116 *
13117 * Step 10) and onwards is handled as a background process.
13118 *
13119 * 10)Calculate data files to delete after this LCP is completed.
13120 * 11)Delete old data files no longer needed.
 * 12)Delete the LCP control file no longer needed.
13122 */
/**
 * Send FSOPENREQ to NDBFS for one of the two LCP control (CTL) files of
 * the fragment currently being prepared.
 *
 * @param signal  signal object used to send FSOPENREQ
 * @param ptr     backup record in prepare phase
 * @param lcpNo   which CTL file to open (0 or 1)
 *
 * The file is opened OM_READWRITE | OM_CREATE, so a missing file is
 * created empty; execFSREADCONF later distinguishes a created file from
 * a pre-existing one by bytes_read == 0.
 */
void Backup::lcp_open_ctl_file(Signal *signal,
                               BackupRecordPtr ptr,
                               Uint32 lcpNo)
{
  FsOpenReq * req = (FsOpenReq *)signal->getDataPtrSend();
  req->userReference = reference();
  req->fileFlags =
    FsOpenReq::OM_READWRITE | FsOpenReq::OM_CREATE;

  /**
   * Compressed files do not support OM_READWRITE, so we will never
   * use compression for the LCP control files. The files will not
   * take up very much space. If it is necessary to support
   * compressed LCP control files then it is easy to do so by first
   * opening the LCP control files for read in this phase and then
   * when deciding which file to use for the next LCP we will close
   * both files and open the file to use with OM_CREATE and also
   * with OM_TRUNCATE to ensure we overwrite the old file
   * content.
   *
   * O_DIRECT requires very special write semantics which we don't
   * follow for CTL files. So we never set this option for CTL files.
   */

  FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF);

  /**
   * Lcp header file
   */
  BackupFilePtr filePtr;
  TablePtr tabPtr;
  FragmentPtr fragPtr;

  c_backupFilePool.getPtr(filePtr, ptr.p->prepareCtlFilePtr[lcpNo]);
  ptr.p->prepare_table.first(tabPtr);
  tabPtr.p->fragments.getPtr(fragPtr, 0);

  /* The file record must be idle; mark it as an opening header file. */
  ndbrequire(filePtr.p->m_flags == 0);
  filePtr.p->m_flags |= BackupFile::BF_OPENING;
  filePtr.p->m_flags |= BackupFile::BF_HEADER_FILE;
  filePtr.p->tableId = RNIL; // Will force init
  req->userPointer = filePtr.i;
  /* File name is the version-5 pattern: table id, fragment id, lcpNo. */
  FsOpenReq::setVersion(req->fileNumber, 5);
  FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_CTL);
  FsOpenReq::v5_setLcpNo(req->fileNumber, lcpNo);
  FsOpenReq::v5_setTableId(req->fileNumber, tabPtr.p->tableId);
  FsOpenReq::v5_setFragmentId(req->fileNumber, fragPtr.p->fragmentId);
  sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);
}
13172
/**
 * An LCP control file has been opened (FSOPENCONF); issue an FSREADREQ
 * for its content. The reply is handled in execFSREADCONF.
 *
 * @param signal   signal object used to send FSREADREQ
 * @param ptr      backup record in prepare phase
 * @param filePtr  file record of the opened CTL file
 */
void
Backup::lcp_open_ctl_file_done(Signal* signal,
                               BackupRecordPtr ptr,
                               BackupFilePtr filePtr)
{
  /**
   * Header file has been opened, now time to read it.
   * Header file is never bigger than one page. Get page from list of
   * pages in the file record. Page comes from global page pool.
   */
  Page32Ptr pagePtr;
  FsReadWriteReq* req = (FsReadWriteReq*)signal->getDataPtrSend();

  filePtr.p->pages.getPtr(pagePtr, 0);
  filePtr.p->m_flags |= BackupFile::BF_READING;

  req->userPointer = filePtr.i;
  req->filePointer = filePtr.p->filePointer;
  req->userReference = reference();
  req->varIndex = 0;
  req->numberOfPages = 1;
  req->operationFlag = 0;
  FsReadWriteReq::setFormatFlag(req->operationFlag,
                                FsReadWriteReq::fsFormatMemAddress);
  /**
   * We request up to the BIG CTL file size but allow a partial read:
   * the file on disk can be smaller (or empty if it was just created).
   */
  FsReadWriteReq::setPartialReadFlag(req->operationFlag, 1);

  /* The page is addressed as a byte offset into the global page area. */
  Uint32 mem_offset = Uint32((char*)pagePtr.p - (char*)c_startOfPages);
  req->data.memoryAddress.memoryOffset = mem_offset;
  req->data.memoryAddress.fileOffset = 0;
  req->data.memoryAddress.size = BackupFormat::NDB_LCP_CTL_FILE_SIZE_BIG;

  /* FixedLength + 3 words for the memoryAddress section set above. */
  sendSignal(NDBFS_REF, GSN_FSREADREQ, signal,
             FsReadWriteReq::FixedLength + 3, JBA);
}
13207
13208 void
execFSREADREF(Signal * signal)13209 Backup::execFSREADREF(Signal *signal)
13210 {
13211 jamEntry();
13212
13213 FsRef * ref = (FsRef *)signal->getDataPtr();
13214 const Uint32 userPtr = ref->userPointer;
13215
13216 BackupFilePtr filePtr;
13217 c_backupFilePool.getPtr(filePtr, userPtr);
13218 /**
13219 * Since we create the file if it doesn't exist, this should not occur
13220 * unless something is completely wrong with the file system.
13221 */
13222 ndbabort();
13223 }
13224
13225 void
execFSREADCONF(Signal * signal)13226 Backup::execFSREADCONF(Signal *signal)
13227 {
13228 jamEntry();
13229
13230 FsConf * conf = (FsConf *)signal->getDataPtr();
13231 const Uint32 userPtr = conf->userPointer;
13232
13233 BackupFilePtr filePtr;
13234 c_backupFilePool.getPtr(filePtr, userPtr);
13235
13236 /**
13237 * If we created the file in the open call, then bytes_read will be 0.
13238 * This will distinguish a non-existing file from an existing file.
13239 */
13240 filePtr.p->bytesRead = conf->bytes_read;
13241 filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_READING;
13242
13243 BackupRecordPtr ptr;
13244 c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
13245
13246 if (ptr.p->deleteFilePtr == filePtr.i)
13247 {
13248 jam();
13249 ndbrequire(filePtr.p->bytesRead ==
13250 BackupFormat::NDB_LCP_CTL_FILE_SIZE_SMALL ||
13251 filePtr.p->bytesRead ==
13252 BackupFormat::NDB_LCP_CTL_FILE_SIZE_BIG);
13253 lcp_read_ctl_file_for_rewrite_done(signal, filePtr);
13254 return;
13255 }
13256 for (Uint32 i = 0; i < 2; i++)
13257 {
13258 jam();
13259 c_backupFilePool.getPtr(filePtr, ptr.p->prepareCtlFilePtr[i]);
13260 if ((filePtr.p->m_flags & BackupFile::BF_READING) ||
13261 (filePtr.p->m_flags & BackupFile::BF_OPENING))
13262 {
13263 jam();
13264 return;
13265 }
13266 }
13267 lcp_read_ctl_file_done(signal, ptr);
13268 }
13269
/**
 * Both prepare-phase LCP control files are now opened and read.
 *
 * Decide which of the two CTL files holds the most recent valid LCP,
 * copy the previous LCP's part/GCI information from it, and select the
 * older slot to be overwritten by this LCP. Handles the special cases
 * where neither file existed (first LCP for this fragment) and where
 * the files stem from a dropped/recreated table and must be removed
 * (PREPARE_DROP_CLOSE path).
 */
void
Backup::lcp_read_ctl_file_done(Signal* signal, BackupRecordPtr ptr)
{
  BackupFilePtr filePtr[2];
  for (Uint32 i = 0; i < 2; i++)
  {
    jam();
    c_backupFilePool.getPtr(filePtr[i], ptr.p->prepareCtlFilePtr[i]);
    DEB_EXTRA_LCP(("(%u)ctl: %u, bytesRead: %u",
                   instance(), i, filePtr[i].p->bytesRead));
    if (filePtr[i].p->bytesRead != 0)
    {
      Page32Ptr pagePtr;
      jam();
      filePtr[i].p->pages.getPtr(pagePtr, 0);
      /* File existed on disk: validate and convert its content. */
      lcp_read_ctl_file(pagePtr, filePtr[i].p->bytesRead, ptr);
    }
    else
    {
      Page32Ptr pagePtr;
      jam();
      filePtr[i].p->pages.getPtr(pagePtr, 0);
      /* File was just created by the open: initialise an empty CTL page. */
      lcp_init_ctl_file(pagePtr);
    }
  }
  Page32Ptr pagePtr0, pagePtr1;
  filePtr[0].p->pages.getPtr(pagePtr0, 0);
  filePtr[1].p->pages.getPtr(pagePtr1, 0);
  struct BackupFormat::LCPCtlFile *lcpCtlFilePtr0 =
    (struct BackupFormat::LCPCtlFile*)pagePtr0.p;
  struct BackupFormat::LCPCtlFile *lcpCtlFilePtr1 =
    (struct BackupFormat::LCPCtlFile*)pagePtr1.p;
  struct BackupFormat::LCPCtlFile *lcpCtlFilePtr;
  Uint32 closeLcpNumber;        // CTL file slot holding the newest LCP
  Uint32 dataFileNumber;
  Uint32 maxGciCompleted;
  Uint32 maxGciWritten;
  Uint32 createGci;
  Uint32 createTableVersion;
  Uint32 lqhCreateTableVersion;

  /**
   * Ignore LCP files that are not valid, a file that has
   * CreateTableVersion equal to 0 is also not valid. This kind of
   * file can be created during Drop Table processing.
   * Clearing LcpId/LocalLcpId makes an invalid file lose in the
   * "most recent" comparison below.
   */
  if (lcpCtlFilePtr0->ValidFlag == 0 ||
      lcpCtlFilePtr0->CreateTableVersion == 0)
  {
    jam();
    lcpCtlFilePtr0->ValidFlag = 0;
    lcpCtlFilePtr0->LcpId = 0;
    lcpCtlFilePtr0->LocalLcpId = 0;
  }
  if (lcpCtlFilePtr1->ValidFlag == 0 ||
      lcpCtlFilePtr1->CreateTableVersion == 0)
  {
    jam();
    lcpCtlFilePtr1->ValidFlag = 0;
    lcpCtlFilePtr1->LcpId = 0;
    lcpCtlFilePtr1->LocalLcpId = 0;
  }
  /* CTL file 0 is the most recent: keep slot 1 for writing this LCP. */
  if (lcpCtlFilePtr0->LcpId > lcpCtlFilePtr1->LcpId ||
      (lcpCtlFilePtr0->LcpId == lcpCtlFilePtr1->LcpId &&
       lcpCtlFilePtr0->LcpId != 0 &&
       lcpCtlFilePtr0->LocalLcpId > lcpCtlFilePtr1->LocalLcpId))
  {
    jam();
    dataFileNumber = lcpCtlFilePtr0->LastDataFileNumber;
    lcpCtlFilePtr = lcpCtlFilePtr1;
    ptr.p->prepareNextLcpCtlFileNumber = 1;
    closeLcpNumber = 0;
    createGci = lcpCtlFilePtr0->CreateGci;
    createTableVersion = lcpCtlFilePtr0->CreateTableVersion;
    maxGciCompleted = lcpCtlFilePtr0->MaxGciCompleted;
    maxGciWritten = lcpCtlFilePtr0->MaxGciWritten;
    ptr.p->prepareDeleteCtlFileNumber = closeLcpNumber;
    copy_prev_lcp_info(ptr, lcpCtlFilePtr0);
  }
  else
  {
    /**
     * Both can have the same LCP id. This should only happen when none of the
     * files existed and in this case the LCP id should be 0.
     * This will happen after a new table is created. If upgrading from 7.4 or
     * earlier than this is handled as part of node or cluster restart. So this
     * will not be the reason.
     */
    jam();
    ndbrequire(lcpCtlFilePtr0->LcpId < lcpCtlFilePtr1->LcpId ||
               (lcpCtlFilePtr0->LcpId == lcpCtlFilePtr1->LcpId &&
                (lcpCtlFilePtr0->LcpId == 0 ||
                 lcpCtlFilePtr0->LocalLcpId < lcpCtlFilePtr1->LocalLcpId)));
    dataFileNumber = lcpCtlFilePtr1->LastDataFileNumber;
    lcpCtlFilePtr = lcpCtlFilePtr0;
    ptr.p->prepareNextLcpCtlFileNumber = 0;
    createGci = lcpCtlFilePtr1->CreateGci;
    createTableVersion = lcpCtlFilePtr1->CreateTableVersion;
    maxGciCompleted = lcpCtlFilePtr1->MaxGciCompleted;
    maxGciWritten = lcpCtlFilePtr1->MaxGciWritten;
    closeLcpNumber = 1;
    ptr.p->prepareDeleteCtlFileNumber = closeLcpNumber;
    if (lcpCtlFilePtr1->LcpId == 0)
    {
      jam();
      /**
       * None of the files existed before, ensure that we don't delete
       * any data file since no one exists at this moment. Also ensure
       * that the other control file is removed.
       *
       * lcpCtlFilePtr1->LcpId == 0 => lcpCtlFilePtr0->LcpId == 0 since
       * lcpCtlFilePtr1->LcpId >= lcpCtlFilePtr0->LcpId when we come
       * here.
       *
       * We set m_num_parts_in_lcp to 0 to indicate this is first LCP for
       * this fragment and thus needs to always be a full LCP.
       */
      ptr.p->prepareDeleteCtlFileNumber = RNIL;
      ptr.p->m_prepare_num_parts_in_lcp = 0;
      ptr.p->m_prepare_max_parts_in_lcp = 0;
      ptr.p->m_prepare_scan_change_gci = 0;
      ptr.p->m_prepare_first_start_part_in_lcp = 0;
      ptr.p->preparePrevLcpId = 0;
      ptr.p->preparePrevLocalLcpId = 0;
      maxGciCompleted = 0;
      maxGciWritten = 0;
      TablePtr tabPtr;
      FragmentPtr fragPtr;
      ndbrequire(ptr.p->prepare_table.first(tabPtr));
      tabPtr.p->fragments.getPtr(fragPtr, 0);
      createGci = fragPtr.p->createGci;
      createTableVersion = c_lqh->getCreateSchemaVersion(tabPtr.p->tableId);
    }
    else
    {
      jam();
      copy_prev_lcp_info(ptr, lcpCtlFilePtr1);
    }
  }
  /**
   * prepareNextLcpCtlFileNumber is the number of the prepareCtlFilePtr's
   * which will be kept for this LCP. We have written the data in its page
   * with i-value of 0. This is what lcpCtlFilePtr points to at the moment.
   * This is the page we will later write after completing the LCP of this
   * fragment.
   *
   * We will always get the last data file number by getting the last
   * data file number from the control file to close which is the most
   * recent, then we will add one modulo the max number to get the
   * new last data file number.
   */
  dataFileNumber = get_file_add(dataFileNumber, 1);
  ptr.p->prepareFirstDataFileNumber = dataFileNumber;
  TablePtr tabPtr;
  FragmentPtr fragPtr;
  ndbrequire(ptr.p->prepare_table.first(tabPtr));
  tabPtr.p->fragments.getPtr(fragPtr, 0);
  ptr.p->prepareMaxGciWritten = maxGciWritten;
  lqhCreateTableVersion = c_lqh->getCreateSchemaVersion(tabPtr.p->tableId);

  Uint32 maxGci = MAX(maxGciCompleted, maxGciWritten);
  if ((maxGci < fragPtr.p->createGci &&
       maxGci != 0) ||
      (c_initial_start_lcp_not_done_yet &&
       (ptr.p->preparePrevLocalLcpId != 0 ||
        ptr.p->preparePrevLcpId != 0)))
  {
    jam();
    /**
     * This case is somewhat obscure. Due to the fact that we support the
     * config variable __at_restart_skip_indexes we can actually come here
     * for a table (should be a unique index table) that have an LCP file
     * remaining from the previous use of this table id. It is potentially
     * possible also when dropping a table while this node is down and then
     * creating it again before this node has started. In this case we could
     * come here and find an old LCP file. So what we do here is that we
     * perform the drop of the old LCP fragments and then we restart the
     * LCP handling again with an empty set of LCP files as it should be.
     *
     * This means first closing the CTL files (deleting the older one and
     * keeping the newer one to ensure we keep one CTL file until all data
     * files have been deleted and to integrate easily into the drop file
     * handling in this block.
     *
     * We can only discover this case in a cluster where the master is
     * on 7.6 version. So in upgrade cases we won't discover this case
     * since we don't get the createGci from the DICT master in that case
     * when the fragment is created.
     *
     * We can also get here when doing an initial node restart and there
     * is old LCP files to clean up.
     */
    DEB_LCP(("(%u)TAGT Drop case: tab(%u,%u).%u (now %u),"
             " maxGciCompleted: %u,"
             " maxGciWritten: %u, createGci: %u",
             instance(),
             tabPtr.p->tableId,
             fragPtr.p->fragmentId,
             createTableVersion,
             c_lqh->getCreateSchemaVersion(tabPtr.p->tableId),
             maxGciCompleted,
             maxGciWritten,
             fragPtr.p->createGci));

    ptr.p->prepareState = PREPARE_DROP_CLOSE;
    closeFile(signal, ptr, filePtr[closeLcpNumber]);
    closeFile(signal,
              ptr,
              filePtr[ptr.p->prepareNextLcpCtlFileNumber],
              true,
              true);
    return;
  }
  /* Initialise page to write to next CTL file with new LCP id */
  lcp_set_lcp_id(ptr, lcpCtlFilePtr);

  DEB_LCP(("(%u)TAGC Use ctl file: %u, prev Lcp(%u,%u), curr Lcp(%u,%u)"
           ", next data file: %u, tab(%u,%u).%u"
           ", prevMaxGciCompleted: %u, createGci: %u",
           instance(),
           ptr.p->prepareNextLcpCtlFileNumber,
           ptr.p->preparePrevLcpId,
           ptr.p->preparePrevLocalLcpId,
           lcpCtlFilePtr->LcpId,
           lcpCtlFilePtr->LocalLcpId,
           dataFileNumber,
           tabPtr.p->tableId,
           fragPtr.p->fragmentId,
           c_lqh->getCreateSchemaVersion(tabPtr.p->tableId),
           maxGciCompleted,
           fragPtr.p->createGci));

  /**
   * lqhCreateTableVersion == 0 means that the table is no longer active.
   * We will continue as if things were ok, the table is being dropped so
   * no need to abort here, the file will be dropped anyways.
   */
  if (lqhCreateTableVersion != 0 &&
      lqhCreateTableVersion != createTableVersion)
  {
    g_eventLogger->info("(%u) tab(%u,%u) lqhCreateTableVersion: %u"
                        ", createTableVersion: %u",
                        instance(),
                        tabPtr.p->tableId,
                        fragPtr.p->fragmentId,
                        lqhCreateTableVersion,
                        createTableVersion);
  }
  ndbrequire(createTableVersion == lqhCreateTableVersion ||
             lqhCreateTableVersion == 0);


  /**
   * We close the file which was the previous LCP control file. We will
   * retain the oldest one and use this for this LCP, it will then
   * become the most recent one when we are done. We keep the one to
   * use open for now, it will be closed later in the LCP processing.
   */
  ndbrequire(ptr.p->prepareErrorCode == 0);
  closeFile(signal,
            ptr,
            filePtr[closeLcpNumber],
            true,
            (ptr.p->prepareDeleteCtlFileNumber == RNIL));
  return;
}
13536
13537 void
copy_prev_lcp_info(BackupRecordPtr ptr,struct BackupFormat::LCPCtlFile * lcpCtlFilePtr)13538 Backup::copy_prev_lcp_info(BackupRecordPtr ptr,
13539 struct BackupFormat::LCPCtlFile *lcpCtlFilePtr)
13540 {
13541 Uint32 next_start_part = 0;
13542 ndbrequire(lcpCtlFilePtr->NumPartPairs > 0);
13543 ptr.p->m_prepare_max_parts_in_lcp = lcpCtlFilePtr->MaxPartPairs;
13544 ptr.p->m_prepare_num_parts_in_lcp = lcpCtlFilePtr->NumPartPairs;
13545 jam();
13546 Uint32 total_parts = 0;
13547 for (Uint32 i = 0; i < ptr.p->m_prepare_num_parts_in_lcp; i++)
13548 {
13549 Uint32 start_part = lcpCtlFilePtr->partPairs[i].startPart;
13550 Uint32 num_parts = lcpCtlFilePtr->partPairs[i].numParts;
13551 next_start_part = get_part_add(start_part, num_parts);
13552 ptr.p->m_prepare_part_info[i].startPart = start_part;
13553 ptr.p->m_prepare_part_info[i].numParts = num_parts;
13554 total_parts += num_parts;
13555 }
13556 ndbrequire(total_parts == BackupFormat::NDB_MAX_LCP_PARTS);
13557 ptr.p->m_prepare_first_start_part_in_lcp = next_start_part;
13558 ptr.p->m_prepare_scan_change_gci = lcpCtlFilePtr->MaxGciCompleted;
13559 ptr.p->preparePrevLcpId = lcpCtlFilePtr->LcpId;
13560 ptr.p->preparePrevLocalLcpId = lcpCtlFilePtr->LocalLcpId;
13561 }
13562
13563 Uint32
get_part_add(Uint32 start_part,Uint32 num_parts)13564 Backup::get_part_add(Uint32 start_part, Uint32 num_parts)
13565 {
13566 return (start_part + num_parts) % BackupFormat::NDB_MAX_LCP_PARTS;
13567 }
13568
13569 Uint32
get_file_add(Uint32 start_file,Uint32 num_files)13570 Backup::get_file_add(Uint32 start_file, Uint32 num_files)
13571 {
13572 return (start_file + num_files) % BackupFormat::NDB_MAX_LCP_FILES;
13573 }
13574
13575 Uint32
get_file_sub(Uint32 start_file,Uint32 num_files)13576 Backup::get_file_sub(Uint32 start_file, Uint32 num_files)
13577 {
13578 if (start_file >= num_files)
13579 {
13580 jam();
13581 return (start_file - num_files);
13582 }
13583 else
13584 {
13585 jam();
13586 return (start_file + BackupFormat::NDB_MAX_LCP_FILES - num_files);
13587 }
13588 }
13589
13590 void
lcp_read_ctl_file(Page32Ptr pagePtr,Uint32 bytesRead,BackupRecordPtr ptr)13591 Backup::lcp_read_ctl_file(Page32Ptr pagePtr,
13592 Uint32 bytesRead,
13593 BackupRecordPtr ptr)
13594 {
13595 struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
13596 (struct BackupFormat::LCPCtlFile*)pagePtr.p;
13597 /**
13598 * This function reads the LCP Control file data and retrieves information
13599 * about:
13600 * 1) next starting part
13601 * 2) LCP id this file is a header for
13602 *
13603 * This information is used to decide which header file to close (the most
13604 * recent one) and which header file to use for the next LCP.
13605 */
13606 ndbrequire(BackupFormat::NDB_LCP_CTL_FILE_SIZE_SMALL == bytesRead ||
13607 BackupFormat::NDB_LCP_CTL_FILE_SIZE_BIG == bytesRead);
13608 if (!convert_ctl_page_to_host(lcpCtlFilePtr))
13609 {
13610 jam();
13611 lcp_init_ctl_file(pagePtr);
13612 }
13613 {
13614 TablePtr tabPtr;
13615 FragmentPtr fragPtr;
13616 ptr.p->prepare_table.first(tabPtr);
13617 tabPtr.p->fragments.getPtr(fragPtr, 0);
13618 ndbrequire(lcpCtlFilePtr->TableId == tabPtr.p->tableId)
13619 ndbrequire(lcpCtlFilePtr->FragmentId == fragPtr.p->fragmentId);
13620 }
13621 }
13622
13623 /**
13624 * We compress before writing LCP control and after reading it we will
13625 * decompress the part information. In compressed format we use 3 bytes
13626 * to store two numbers that can at most be 2048. In uncompressed
13627 * format each part is a 16-bit unsigned integer.
13628 */
13629 #define BYTES_PER_PART 3
13630 /**
13631 * Define the LCP Control file header size, remove the one part pair
13632 * defined in the common header.
13633 */
13634 #define LCP_CTL_FILE_HEADER_SIZE (sizeof(BackupFormat::LCPCtlFile) - \
13635 sizeof(BackupFormat::PartPair))
13636
13637 bool
convert_ctl_page_to_host(struct BackupFormat::LCPCtlFile * lcpCtlFilePtr)13638 Backup::convert_ctl_page_to_host(
13639 struct BackupFormat::LCPCtlFile *lcpCtlFilePtr)
13640 {
13641 Uint32 *pageData = (Uint32*)lcpCtlFilePtr;
13642 Uint32 numPartPairs = ntohl(lcpCtlFilePtr->NumPartPairs);
13643 Uint32 real_bytes_read = LCP_CTL_FILE_HEADER_SIZE +
13644 (BYTES_PER_PART * numPartPairs);
13645
13646 /* Checksum is calculated on compressed network byte order */
13647 if (numPartPairs > BackupFormat::NDB_MAX_LCP_PARTS)
13648 {
13649 DEB_LCP(("(%u)numPartPairs: %x", instance(), numPartPairs));
13650 ndbassert(false);
13651 return false;
13652 }
13653 /**
13654 * Add 3 to ensure that we get also the last word with anything not
13655 * equal to 0 when changing to word count.
13656 */
13657 Uint32 words = (real_bytes_read + 3) / sizeof(Uint32);
13658 Uint32 chksum = 0;
13659 for (Uint32 i = 0; i < words; i++)
13660 {
13661 chksum ^= pageData[i];
13662 }
13663 ndbassert(chksum == 0);
13664
13665 if (chksum != 0)
13666 {
13667 jam();
13668 ndbassert(false);
13669 return false;
13670 }
13671 /* Magic is written/read as is */
13672 lcpCtlFilePtr->fileHeader.BackupVersion =
13673 ntohl(lcpCtlFilePtr->fileHeader.BackupVersion);
13674 lcpCtlFilePtr->fileHeader.SectionType =
13675 ntohl(lcpCtlFilePtr->fileHeader.SectionType);
13676 lcpCtlFilePtr->fileHeader.SectionLength =
13677 ntohl(lcpCtlFilePtr->fileHeader.SectionLength);
13678 lcpCtlFilePtr->fileHeader.FileType =
13679 ntohl(lcpCtlFilePtr->fileHeader.FileType);
13680 lcpCtlFilePtr->fileHeader.BackupId =
13681 ntohl(lcpCtlFilePtr->fileHeader.BackupId);
13682 ndbrequire(lcpCtlFilePtr->fileHeader.BackupKey_0 == 0);
13683 ndbrequire(lcpCtlFilePtr->fileHeader.BackupKey_1 == 0);
13684 /* ByteOrder as is */
13685 lcpCtlFilePtr->fileHeader.NdbVersion =
13686 ntohl(lcpCtlFilePtr->fileHeader.NdbVersion);
13687 lcpCtlFilePtr->fileHeader.MySQLVersion =
13688 ntohl(lcpCtlFilePtr->fileHeader.MySQLVersion);
13689
13690 lcpCtlFilePtr->ValidFlag = ntohl(lcpCtlFilePtr->ValidFlag);
13691 lcpCtlFilePtr->TableId = ntohl(lcpCtlFilePtr->TableId);
13692 lcpCtlFilePtr->FragmentId = ntohl(lcpCtlFilePtr->FragmentId);
13693 lcpCtlFilePtr->CreateTableVersion = ntohl(lcpCtlFilePtr->CreateTableVersion);
13694 lcpCtlFilePtr->CreateGci = ntohl(lcpCtlFilePtr->CreateGci);
13695 lcpCtlFilePtr->MaxGciCompleted = ntohl(lcpCtlFilePtr->MaxGciCompleted);
13696 lcpCtlFilePtr->MaxGciWritten = ntohl(lcpCtlFilePtr->MaxGciWritten);
13697 lcpCtlFilePtr->LcpId = ntohl(lcpCtlFilePtr->LcpId);
13698 lcpCtlFilePtr->LocalLcpId = ntohl(lcpCtlFilePtr->LocalLcpId);
13699 lcpCtlFilePtr->MaxPageCount = ntohl(lcpCtlFilePtr->MaxPageCount);
13700 lcpCtlFilePtr->MaxNumberDataFiles = ntohl(lcpCtlFilePtr->MaxNumberDataFiles);
13701 lcpCtlFilePtr->LastDataFileNumber = ntohl(lcpCtlFilePtr->LastDataFileNumber);
13702 lcpCtlFilePtr->MaxPartPairs = ntohl(lcpCtlFilePtr->MaxPartPairs);
13703 lcpCtlFilePtr->NumPartPairs = ntohl(lcpCtlFilePtr->NumPartPairs);
13704
13705 ndbrequire(BackupFormat::NDB_LCP_CTL_FILE_SIZE_BIG >= real_bytes_read);
13706 ndbrequire(lcpCtlFilePtr->fileHeader.FileType ==
13707 BackupFormat::LCP_CTL_FILE);
13708 ndbrequire(memcmp(BACKUP_MAGIC, lcpCtlFilePtr->fileHeader.Magic, 8) == 0);
13709 ndbrequire(lcpCtlFilePtr->NumPartPairs <= lcpCtlFilePtr->MaxPartPairs);
13710 ndbrequire(lcpCtlFilePtr->NumPartPairs > 0);
13711 Uint32 total_parts;
13712 ndbrequire(lcpCtlFilePtr->fileHeader.BackupVersion >= NDBD_USE_PARTIAL_LCP_v2)
13713 lcpCtlFilePtr->RowCountLow = ntohl(lcpCtlFilePtr->RowCountLow);
13714 lcpCtlFilePtr->RowCountHigh = ntohl(lcpCtlFilePtr->RowCountHigh);
13715 total_parts = decompress_part_pairs(lcpCtlFilePtr,
13716 lcpCtlFilePtr->NumPartPairs,
13717 &lcpCtlFilePtr->partPairs[0]);
13718 ndbrequire(total_parts <= lcpCtlFilePtr->MaxPartPairs);
13719 return true;
13720 }
13721
/**
 * Convert an LCP control file page from in-memory (host byte order,
 * expanded part pairs) to on-disk (network byte order, compressed part
 * pairs) representation, and compute the XOR checksum over the resulting
 * network-order words.
 *
 * @param page      Page to convert in place.
 * @param file_size Size of the on-disk file; used to bound the compressed
 *                  data and to zero-fill the remainder of the page.
 */
void
Backup::convert_ctl_page_to_network(Uint32 *page, Uint32 file_size)
{
  struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
    (struct BackupFormat::LCPCtlFile*)page;
  Uint32 numPartPairs = lcpCtlFilePtr->NumPartPairs;
  Uint32 compressed_bytes_written = LCP_CTL_FILE_HEADER_SIZE +
                                    (BYTES_PER_PART * numPartPairs);

  /**
   * Add 3 to ensure that we take into account the last word that might
   * filled with only 1 byte of information.
   */
  ndbrequire(file_size >= (compressed_bytes_written + 3));

  /* Validate the page before conversion, while fields are host order. */
  ndbrequire(memcmp(BACKUP_MAGIC, lcpCtlFilePtr->fileHeader.Magic, 8) == 0);
  ndbrequire(lcpCtlFilePtr->fileHeader.FileType ==
             BackupFormat::LCP_CTL_FILE);
  ndbrequire(lcpCtlFilePtr->NumPartPairs <= lcpCtlFilePtr->MaxPartPairs);
  ndbrequire(lcpCtlFilePtr->NumPartPairs > 0);
  ndbrequire(lcpCtlFilePtr->fileHeader.NdbVersion >= NDBD_USE_PARTIAL_LCP_v2);
  ndbrequire(lcpCtlFilePtr->fileHeader.BackupVersion == NDBD_USE_PARTIAL_LCP_v2);

  /* Magic is written/read as is */
  lcpCtlFilePtr->fileHeader.BackupVersion =
    htonl(lcpCtlFilePtr->fileHeader.BackupVersion);
  lcpCtlFilePtr->fileHeader.SectionType =
    htonl(lcpCtlFilePtr->fileHeader.SectionType);
  lcpCtlFilePtr->fileHeader.SectionLength =
    htonl(lcpCtlFilePtr->fileHeader.SectionLength);
  lcpCtlFilePtr->fileHeader.FileType =
    htonl(lcpCtlFilePtr->fileHeader.FileType);
  lcpCtlFilePtr->fileHeader.BackupId =
    htonl(lcpCtlFilePtr->fileHeader.BackupId);
  ndbrequire(lcpCtlFilePtr->fileHeader.BackupKey_0 == 0);
  ndbrequire(lcpCtlFilePtr->fileHeader.BackupKey_1 == 0);
  /* ByteOrder as is */
  lcpCtlFilePtr->fileHeader.NdbVersion =
    htonl(lcpCtlFilePtr->fileHeader.NdbVersion);
  lcpCtlFilePtr->fileHeader.MySQLVersion =
    htonl(lcpCtlFilePtr->fileHeader.MySQLVersion);

  lcpCtlFilePtr->ValidFlag = htonl(lcpCtlFilePtr->ValidFlag);
  lcpCtlFilePtr->TableId = htonl(lcpCtlFilePtr->TableId);
  lcpCtlFilePtr->FragmentId = htonl(lcpCtlFilePtr->FragmentId);
  lcpCtlFilePtr->CreateTableVersion = htonl(lcpCtlFilePtr->CreateTableVersion);
  lcpCtlFilePtr->CreateGci = htonl(lcpCtlFilePtr->CreateGci);
  lcpCtlFilePtr->MaxGciCompleted = htonl(lcpCtlFilePtr->MaxGciCompleted);
  lcpCtlFilePtr->MaxGciWritten = htonl(lcpCtlFilePtr->MaxGciWritten);
  lcpCtlFilePtr->LcpId = htonl(lcpCtlFilePtr->LcpId);
  lcpCtlFilePtr->LocalLcpId = htonl(lcpCtlFilePtr->LocalLcpId);
  lcpCtlFilePtr->MaxPageCount = htonl(lcpCtlFilePtr->MaxPageCount);
  lcpCtlFilePtr->MaxNumberDataFiles = htonl(lcpCtlFilePtr->MaxNumberDataFiles);
  lcpCtlFilePtr->LastDataFileNumber = htonl(lcpCtlFilePtr->LastDataFileNumber);

  /* Keep a host-order copy for the sanity check after compression. */
  Uint32 maxPartPairs = lcpCtlFilePtr->MaxPartPairs;
  lcpCtlFilePtr->MaxPartPairs = htonl(lcpCtlFilePtr->MaxPartPairs);
  lcpCtlFilePtr->NumPartPairs = htonl(lcpCtlFilePtr->NumPartPairs);

  lcpCtlFilePtr->RowCountLow = htonl(lcpCtlFilePtr->RowCountLow);
  lcpCtlFilePtr->RowCountHigh = htonl(lcpCtlFilePtr->RowCountHigh);

  Uint32 total_parts = compress_part_pairs(lcpCtlFilePtr,
                                           numPartPairs,
                                           file_size);
  ndbrequire(total_parts <= maxPartPairs);

  /**
   * Checksum is calculated on compressed network byte order.
   * The checksum is calculated without regard to size decreasing due to
   * compression. This is not a problem since we fill the remainder with
   * zeroes and XOR doesn't change the checksum with extra zeroes.
   *
   * Add 3 to ensure that we move to word count in a correct manner.
   */
  lcpCtlFilePtr->Checksum = 0;
  Uint32 words = (compressed_bytes_written + 3) / sizeof(Uint32);
  Uint32 chksum = 0;
  for (Uint32 i = 0; i < words; i++)
  {
    chksum ^= page[i];
  }
  /* With this stored, XOR over the same words now yields 0 on read-back. */
  lcpCtlFilePtr->Checksum = chksum;
}
13806
/**
 * Compress the expanded PartPair array (2 x Uint32 per pair) into the
 * on-disk 3-byte-per-pair encoding, in place, and zero-fill the rest of
 * the file area so the checksum is unaffected by the unused tail.
 *
 * @param lcpCtlFilePtr Control file page whose partPairs are compressed.
 * @param num_parts     Number of part pairs to compress.
 * @param file_size     On-disk file size; everything after the compressed
 *                      data up to this size is zeroed.
 * @return Total number of parts summed over all pairs (must equal
 *         NDB_MAX_LCP_PARTS).
 */
Uint32
Backup::compress_part_pairs(struct BackupFormat::LCPCtlFile *lcpCtlFilePtr,
                            Uint32 num_parts,
                            Uint32 file_size)
{
  Uint32 total_parts = 0;
  unsigned char *part_array =
    (unsigned char*)&lcpCtlFilePtr->partPairs[0].startPart;
  for (Uint32 part = 0; part < num_parts; part++)
  {
    /**
     * Compress the 32 bit by only using 12 bits word. This means that we
     * can fit up to 2048 parts in 8 kBytes.
     * The start part uses the first byte to store the upper 8 bits of
     * 12 bits and bits 0-3 of the second byte is bit 0-3 of the start
     * part. The number of parts has bit 0-3 stored in bit 4-7 of the
     * second byte and bit 4-11 stored in the third byte.
     */
    Uint32 startPart = lcpCtlFilePtr->partPairs[part].startPart;
    Uint32 numParts = lcpCtlFilePtr->partPairs[part].numParts;
    ndbrequire(numParts <= BackupFormat::NDB_MAX_LCP_PARTS);
    Uint32 startPart_bit0_3 = (startPart & 0xF);
    Uint32 startPart_bit4_11 = (startPart >> 4) & 0xFF;
    Uint32 numParts_bit0_3 = (numParts & 0xF);
    Uint32 numParts_bit4_11 = (numParts >> 4) & 0xFF;
    part_array[0] = (unsigned char)startPart_bit4_11;
    part_array[1] = (unsigned char)(startPart_bit0_3 + (numParts_bit0_3 << 4));
    part_array[2] = (unsigned char)numParts_bit4_11;
    part_array += 3;
    total_parts += numParts;
    DEB_EXTRA_LCP(("(%u)compress:tab(%u,%u) Part(%u), start:%u, num_parts: %u",
                   instance(),
                   ntohl(lcpCtlFilePtr->TableId),
                   ntohl(lcpCtlFilePtr->FragmentId),
                   part,
                   startPart,
                   numParts));
  }
  ndbrequire(total_parts == BackupFormat::NDB_MAX_LCP_PARTS);
  /* Zero-fill from the end of the compressed data to the file size. */
  unsigned char *start_pos = (unsigned char*)lcpCtlFilePtr;
  unsigned char *end_pos = start_pos + file_size;
  Uint64 remaining_size_64 = end_pos - part_array;
  ndbrequire(remaining_size_64 < file_size);
  Uint32 remaining_size = Uint32(remaining_size_64);
  memset(part_array, 0, remaining_size);
  return total_parts;
}
13854
/**
 * Expand the on-disk 3-byte-per-pair encoding into the in-memory PartPair
 * representation (2 x Uint32 per pair), in place. The compressed bytes are
 * first copied aside into c_part_array because the expanded output
 * overlaps and would otherwise overwrite the compressed input.
 *
 * @param lcpCtlFilePtr Control file page (used for debug output only).
 * @param num_parts     Number of part pairs to decompress.
 * @param partPairs     Array that holds the compressed input and receives
 *                      the expanded output.
 * @return Total number of parts summed over all pairs.
 */
Uint32 Backup::decompress_part_pairs(
  struct BackupFormat::LCPCtlFile *lcpCtlFilePtr,
  Uint32 num_parts,
  struct BackupFormat::PartPair *partPairs)
{
  Uint32 total_parts = 0;
  unsigned char *part_array = (unsigned char*)&partPairs[0].startPart;
  ndbrequire(num_parts <= BackupFormat::NDB_MAX_LCP_PARTS);
  /* Save the compressed bytes before expanding over them. */
  memcpy(c_part_array, part_array, 3 * num_parts);
  Uint32 j = 0;
  for (Uint32 part = 0; part < num_parts; part++)
  {
    /* Inverse of the 12+12 bit packing done in compress_part_pairs. */
    Uint32 part_0 = c_part_array[j+0];
    Uint32 part_1 = c_part_array[j+1];
    Uint32 part_2 = c_part_array[j+2];
    Uint32 startPart = ((part_1 & 0xF) + (part_0 << 4));
    Uint32 numParts = (((part_1 >> 4) & 0xF)) + (part_2 << 4);
    ndbrequire(numParts <= BackupFormat::NDB_MAX_LCP_PARTS);
    partPairs[part].startPart = startPart;
    partPairs[part].numParts = numParts;
    total_parts += numParts;
    DEB_EXTRA_LCP(("(%u)decompress:tab(%u,%u) Part(%u), start:%u, num_parts: %u",
                   instance(),
                   lcpCtlFilePtr->TableId,
                   lcpCtlFilePtr->FragmentId,
                   part,
                   startPart,
                   numParts));
    j += 3;
  }
  ndbassert(total_parts == BackupFormat::NDB_MAX_LCP_PARTS);
  return total_parts;
}
13888
/**
 * Initialize a page as an empty LCP control file template: valid header,
 * ValidFlag cleared, zeroed identity fields, and a single part pair that
 * covers the whole part space. Used when no usable control file exists
 * (e.g. the file on disk failed validation).
 *
 * @param pagePtr Page to initialize in place.
 */
void
Backup::lcp_init_ctl_file(Page32Ptr pagePtr)
{
  const Uint32 sz = sizeof(BackupFormat::FileHeader) >> 2;
  struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
    (struct BackupFormat::LCPCtlFile*)pagePtr.p;

  memcpy(lcpCtlFilePtr->fileHeader.Magic, BACKUP_MAGIC, 8);
  lcpCtlFilePtr->fileHeader.BackupVersion = NDBD_USE_PARTIAL_LCP_v2;
  lcpCtlFilePtr->fileHeader.SectionType = BackupFormat::FILE_HEADER;
  lcpCtlFilePtr->fileHeader.SectionLength = sz - 3;
  lcpCtlFilePtr->fileHeader.FileType = BackupFormat::LCP_CTL_FILE;
  lcpCtlFilePtr->fileHeader.BackupId = 0;
  lcpCtlFilePtr->fileHeader.BackupKey_0 = 0;
  lcpCtlFilePtr->fileHeader.BackupKey_1 = 0;
  lcpCtlFilePtr->fileHeader.ByteOrder = 0x12345678;
  lcpCtlFilePtr->fileHeader.NdbVersion = NDB_VERSION_D;
  lcpCtlFilePtr->fileHeader.MySQLVersion = NDB_MYSQL_VERSION_D;

  /* Checksum needs to calculated again before write to disk */
  lcpCtlFilePtr->Checksum = 0;
  lcpCtlFilePtr->ValidFlag = 0;
  lcpCtlFilePtr->TableId = 0;
  lcpCtlFilePtr->FragmentId = 0;
  lcpCtlFilePtr->CreateTableVersion = 0;
  lcpCtlFilePtr->CreateGci = 0;
  lcpCtlFilePtr->MaxGciWritten = 0;
  lcpCtlFilePtr->MaxGciCompleted = 0;
  lcpCtlFilePtr->LcpId = 0;
  lcpCtlFilePtr->LocalLcpId = 0;
  lcpCtlFilePtr->MaxPageCount = 0;
  lcpCtlFilePtr->MaxNumberDataFiles = BackupFormat::NDB_MAX_LCP_FILES;
  lcpCtlFilePtr->LastDataFileNumber = BackupFormat::NDB_MAX_LCP_FILES - 1;
  lcpCtlFilePtr->MaxPartPairs = BackupFormat::NDB_MAX_LCP_PARTS;
  /* One pair spanning all parts: a full (non-partial) LCP. */
  lcpCtlFilePtr->NumPartPairs = 1;
  lcpCtlFilePtr->RowCountLow = 0;
  lcpCtlFilePtr->RowCountHigh = 0;
  lcpCtlFilePtr->partPairs[0].startPart = 0;
  lcpCtlFilePtr->partPairs[0].numParts = BackupFormat::NDB_MAX_LCP_PARTS;
}
13929
13930 void
lcp_close_prepare_ctl_file_done(Signal * signal,BackupRecordPtr ptr)13931 Backup::lcp_close_prepare_ctl_file_done(Signal* signal,
13932 BackupRecordPtr ptr)
13933 {
13934 /**
13935 * We have closed the old LCP control file now. We have calculated the
13936 * number of the data file to be used in this LCP. We will now open this
13937 * data file to be used by this LCP.
13938 */
13939 lcp_open_data_file(signal, ptr);
13940 }
13941
/**
 * Send FSOPENREQ to open the first LCP data file (number decided during
 * prepare) for the fragment being prepared. The reply is routed back to
 * the BackupFile record via req->userPointer. Moves prepareState from
 * PREPARE_READ_CTL_FILES to PREPARE_OPEN_DATA_FILE.
 */
void
Backup::lcp_open_data_file(Signal* signal,
                           BackupRecordPtr ptr)
{
  FsOpenReq * req = (FsOpenReq *)signal->getDataPtrSend();
  req->userReference = reference();
  /* Write-only append file: created, truncated, auto-synced in chunks. */
  req->fileFlags =
    FsOpenReq::OM_WRITEONLY |
    FsOpenReq::OM_TRUNCATE |
    FsOpenReq::OM_CREATE |
    FsOpenReq::OM_APPEND |
    FsOpenReq::OM_AUTOSYNC;

  if (c_defaults.m_compressed_lcp)
  {
    req->fileFlags |= FsOpenReq::OM_GZ;
  }

  if (c_defaults.m_o_direct)
  {
    req->fileFlags |= FsOpenReq::OM_DIRECT;
  }

  FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF);
  req->auto_sync_size = c_defaults.m_disk_synch_size;

  TablePtr tabPtr;
  FragmentPtr fragPtr;
  BackupFilePtr filePtr;
  Uint32 dataFileNumber;

  ndbrequire(ptr.p->prepare_table.first(tabPtr));
  tabPtr.p->fragments.getPtr(fragPtr, 0);

  c_backupFilePool.getPtr(filePtr, ptr.p->prepareDataFilePtr[0]);
  dataFileNumber = ptr.p->prepareFirstDataFileNumber;
  ndbrequire(ptr.p->prepareState == PREPARE_READ_CTL_FILES);
  ptr.p->prepareState = PREPARE_OPEN_DATA_FILE;

  ndbrequire(filePtr.p->m_flags == 0);
  filePtr.p->m_flags |= BackupFile::BF_OPENING;
  filePtr.p->tableId = RNIL; // Will force init
  req->userPointer = filePtr.i;
  /* Encode LCP file identity (file number, table, fragment) in the name. */
  FsOpenReq::setVersion(req->fileNumber, 5);
  FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_DATA);
  FsOpenReq::v5_setLcpNo(req->fileNumber, dataFileNumber);
  FsOpenReq::v5_setTableId(req->fileNumber, tabPtr.p->tableId);
  FsOpenReq::v5_setFragmentId(req->fileNumber, fragPtr.p->fragmentId);
  sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);
}
13992
/**
 * Send FSOPENREQ for an additional LCP data file (index > 0) during LCP
 * execution; the file number is derived from the first data file number
 * plus the index, wrapping in the circular file number space. Mirrors
 * lcp_open_data_file but operates on the execution-phase (tables/
 * dataFilePtr) records rather than the prepare-phase ones.
 *
 * @param index Which extra data file to open; must not be 0 (file 0 was
 *              opened in the prepare phase).
 */
void
Backup::lcp_open_data_file_late(Signal* signal,
                                BackupRecordPtr ptr,
                                Uint32 index)
{
  FsOpenReq * req = (FsOpenReq *)signal->getDataPtrSend();
  req->userReference = reference();
  /* Same write-only append mode as the first data file. */
  req->fileFlags =
    FsOpenReq::OM_WRITEONLY |
    FsOpenReq::OM_TRUNCATE |
    FsOpenReq::OM_CREATE |
    FsOpenReq::OM_APPEND |
    FsOpenReq::OM_AUTOSYNC;

  if (c_defaults.m_compressed_lcp)
  {
    req->fileFlags |= FsOpenReq::OM_GZ;
  }

  if (c_defaults.m_o_direct)
  {
    req->fileFlags |= FsOpenReq::OM_DIRECT;
  }

  FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF);
  req->auto_sync_size = c_defaults.m_disk_synch_size;

  TablePtr tabPtr;
  FragmentPtr fragPtr;
  BackupFilePtr filePtr;
  ndbrequire(ptr.p->tables.first(tabPtr));
  tabPtr.p->fragments.getPtr(fragPtr, 0);

  ndbrequire(index != 0);
  c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[index]);

  /* File number advances circularly from the first data file number. */
  Uint32 dataFileNumber = get_file_add(ptr.p->m_first_data_file_number,
                                       index);

  ndbrequire(filePtr.p->m_flags == 0);
  filePtr.p->m_flags |= BackupFile::BF_OPENING;
  req->userPointer = filePtr.i;
  FsOpenReq::setVersion(req->fileNumber, 5);
  FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_DATA);
  FsOpenReq::v5_setLcpNo(req->fileNumber, dataFileNumber);
  FsOpenReq::v5_setTableId(req->fileNumber, tabPtr.p->tableId);
  FsOpenReq::v5_setFragmentId(req->fileNumber, fragPtr.p->fragmentId);
  sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);
}
14042
/**
 * Continuation after the first LCP data file is open and its meta data
 * has been written: clear the BF_LCP_META flag, move prepareState from
 * PREPARE_READ_TABLE_DESC to PREPARED, and report LCP_PREPARE_CONF to
 * the master (DBLQH).
 */
void
Backup::lcp_open_data_file_done(Signal* signal,
                                BackupRecordPtr ptr)
{
  TablePtr tabPtr;
  FragmentPtr fragPtr;

  ndbrequire(ptr.p->prepare_table.first(tabPtr));
  tabPtr.p->fragments.getPtr(fragPtr, 0);

  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, ptr.p->prepareDataFilePtr[0]);
  /* Meta data writing is done; only the OPEN flag remains set. */
  ndbrequire(filePtr.p->m_flags ==
             (BackupFile::BF_OPEN | BackupFile::BF_LCP_META));
  filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_LCP_META;

  ndbrequire(ptr.p->prepareState == PREPARE_READ_TABLE_DESC);
  ptr.p->prepareState = PREPARED;

  LcpPrepareConf* conf= (LcpPrepareConf*)signal->getDataPtrSend();
  conf->senderData = ptr.p->clientData;
  conf->senderRef = reference();
  conf->tableId = tabPtr.p->tableId;
  conf->fragmentId = fragPtr.p->fragmentId;
  sendSignal(ptr.p->masterRef, GSN_LCP_PREPARE_CONF,
             signal, LcpPrepareConf::SignalLength, JBA);
}
14070
/**
 * Stamp the control file with the identity of the LCP being executed
 * (BackupId/LcpId = backupId, LocalLcpId) and verify that this identity
 * is strictly newer than the previous LCP found during prepare: either a
 * higher LCP id, or the same LCP id with a higher local LCP id.
 */
void
Backup::lcp_set_lcp_id(BackupRecordPtr ptr,
                       struct BackupFormat::LCPCtlFile *lcpCtlFilePtr)
{
  jam();
  lcpCtlFilePtr->fileHeader.BackupId = ptr.p->backupId;
  lcpCtlFilePtr->LcpId = ptr.p->backupId;
  lcpCtlFilePtr->LocalLcpId = ptr.p->localLcpId;
  if (ptr.p->backupId == ptr.p->preparePrevLcpId)
  {
    jam();
    /* Same global LCP: the local LCP id must have advanced. */
    ndbrequire(ptr.p->localLcpId > ptr.p->preparePrevLocalLcpId);
  }
  else
  {
    jam();
    ndbrequire(ptr.p->backupId > ptr.p->preparePrevLcpId);
  }
}
14090
/**
 * Copy the most recent prepare-phase LCP control page into the execution
 * control file's page. prepareNextLcpCtlFileNumber identifies the oldest
 * control file (the one to reuse), so the other one (0 or 1) holds the
 * most recent contents to copy from. Only the used portion of the page
 * (header plus NumPartPairs expanded pairs) is copied.
 */
void
Backup::lcp_copy_ctl_page(BackupRecordPtr ptr)
{
  Page32Ptr page_ptr, recent_page_ptr;
  BackupFilePtr file_ptr, recent_file_ptr;
  Uint32 oldest = ptr.p->prepareNextLcpCtlFileNumber;
  ndbrequire(oldest <= 1);
  Uint32 recent = oldest == 0 ? 1 : 0;
  c_backupFilePool.getPtr(file_ptr, ptr.p->ctlFilePtr);
  c_backupFilePool.getPtr(recent_file_ptr, ptr.p->prepareCtlFilePtr[recent]);
  file_ptr.p->pages.getPtr(page_ptr, 0);
  recent_file_ptr.p->pages.getPtr(recent_page_ptr, 0);
  /**
   * Important to consider here that the page is currently in expanded
   * format. So before we copy it we calculate how much to copy.
   */
  {
    struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
      (struct BackupFormat::LCPCtlFile*)recent_page_ptr.p;
    Uint32 num_parts = lcpCtlFilePtr->NumPartPairs;
    Uint32 size_to_copy = LCP_CTL_FILE_HEADER_SIZE;
    size_to_copy += (num_parts * sizeof(struct BackupFormat::PartPair));
    memcpy(page_ptr.p,
           recent_page_ptr.p,
           size_to_copy);
  }
#ifdef VM_TRACE
  /* Debug-build check: the copied pairs still cover all parts exactly. */
  {
    struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
      (struct BackupFormat::LCPCtlFile*)page_ptr.p;
    jam();
    Uint32 total_parts = 0;
    Uint32 num_parts = lcpCtlFilePtr->NumPartPairs;
    jamLine(num_parts);
    for (Uint32 i = 0; i < num_parts; i++)
    {
      Uint32 parts = lcpCtlFilePtr->partPairs[i].numParts;
      total_parts += parts;
      jamLine(parts);
    }
    jam();
    ndbassert(total_parts == BackupFormat::NDB_MAX_LCP_PARTS);
  }
#endif
}
14136
14137 void
setRestorableGci(Uint32 restorableGci)14138 Backup::setRestorableGci(Uint32 restorableGci)
14139 {
14140 jam();
14141 if (restorableGci > m_newestRestorableGci)
14142 {
14143 jam();
14144 m_newestRestorableGci = restorableGci;
14145 }
14146 }
14147
/**
 * Update the (already copied) control file page for an idle LCP — one
 * where nothing was written since the previous LCP. Fills in the GCI
 * bounds from DBLQH, stamps the new LCP identity, and decides the
 * ValidFlag based on whether the disk data LSN is already synced.
 *
 * @param ptr      Backup record for the idle LCP.
 * @param page_ptr [out] Control file page that was updated.
 * @param file_ptr [out] Control file record owning the page.
 */
void
Backup::lcp_update_ctl_page(BackupRecordPtr ptr,
                            Page32Ptr & page_ptr,
                            BackupFilePtr & file_ptr)
{
  Uint32 maxCompletedGci;
  c_backupFilePool.getPtr(file_ptr, ptr.p->ctlFilePtr);
  file_ptr.p->pages.getPtr(page_ptr, 0);
  struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
    (struct BackupFormat::LCPCtlFile*)page_ptr.p;

  /**
   * An idle LCP cannot have written anything since last LCP. The
   * last LCP was definitely restorable on disk, so there is no
   * need to set MaxGciCompleted to an unrestorable GCI since we
   * haven't written this anyways.
   *
   * Thus for idle LCPs we need not wait for a GCI to be restorable
   * ever. We reflect this by sending max_gci_written equal to the
   * restorable gci in the lcp_max_completed_gci call.
   */
  c_lqh->lcp_max_completed_gci(maxCompletedGci,
                               m_newestRestorableGci,
                               m_newestRestorableGci);
  lcpCtlFilePtr->MaxGciCompleted = maxCompletedGci;
  ptr.p->slaveState.setState(STOPPING);
  c_lqh->lcp_complete_scan(ptr.p->newestGci);
  if (ptr.p->newestGci != lcpCtlFilePtr->MaxGciWritten)
  {
    /**
     * Can happen when performing a LCP as part of restart
     * We will set the newestGci as part of the restore to
     * the GCI we restore.
     */
    DEB_LCP(("(%u)newestGci = %u, MaxGciWritten: %u, MaxGciCompleted: %u",
            instance(),
            ptr.p->newestGci,
            lcpCtlFilePtr->MaxGciWritten,
            lcpCtlFilePtr->MaxGciCompleted));
  }
  ndbassert(ptr.p->newestGci ==
            lcpCtlFilePtr->MaxGciWritten ||
            !m_our_node_started);
  /* Check that schema version is ok, 0 means we're currently deleting table */
  Uint32 lqhCreateTableVersion = c_lqh->getCreateSchemaVersion(lcpCtlFilePtr->TableId);
  ndbrequire(lcpCtlFilePtr->CreateTableVersion == lqhCreateTableVersion ||
             lqhCreateTableVersion == 0);

  lcpCtlFilePtr->MaxGciWritten = ptr.p->newestGci;

  /* Old data files can only be deleted once this GCI is restorable. */
  ptr.p->m_wait_gci_to_delete = MAX(maxCompletedGci, ptr.p->newestGci);

  lcp_set_lcp_id(ptr, lcpCtlFilePtr);

  ndbrequire(lcpCtlFilePtr->MaxGciWritten <= m_newestRestorableGci);
  ndbrequire(m_newestRestorableGci != 0);
  /**
   * Also idle LCPs have to be careful to ensure that the LCP is valid before
   * we write it as valid. The reason is that otherwise we won't find the
   * LCP record in the UNDO log and apply too many UNDO log records.
   */
  TablePtr tabPtr;
  ptr.p->tables.first(tabPtr);
  Uint32 tableId = tabPtr.p->tableId;
  ptr.p->m_disk_data_exist = c_lqh->is_disk_columns_in_table(tableId);
  Uint32 valid_flag = lcp_pre_sync_lsn(ptr);
  ptr.p->m_lcp_lsn_synced = valid_flag;
  lcpCtlFilePtr->ValidFlag = valid_flag;

  DEB_LCP(("(%u)TAGY Handle idle LCP, tab(%u,%u).%u, maxGciCompleted = %u"
           ", validFlag = %u",
           instance(),
           lcpCtlFilePtr->TableId,
           lcpCtlFilePtr->FragmentId,
           lcpCtlFilePtr->CreateTableVersion,
           lcpCtlFilePtr->MaxGciCompleted,
           valid_flag));
}
14226
/**
 * Complete an idle LCP (no rows changed since the previous LCP): copy and
 * update the control file page, write it to disk, and close the data file
 * opened during prepare. The two I/O operations started here (control
 * file write and data file close) are tracked via
 * m_outstanding_operations.
 */
void
Backup::handle_idle_lcp(Signal *signal, BackupRecordPtr ptr)
{
  /**
   * In the prepare phase we opened the data file, we need to
   * close this file before returning to DBLQH as completed.
   *
   * We also need to write the new LCP control file. The
   * contents we will take from the most recent LCP control
   * file updated with a new MaxGciCompleted.
   *
   * We need to move data files and control files to the
   * execution part since we will start preparing a new
   * LCP immediately after completing this signal execution.
   * A LCP_PREPARE_REQ is most likely waiting to be executed
   * as the next signal.
   */
  Page32Ptr page_ptr;
  BackupFilePtr file_ptr;
  ptr.p->m_empty_lcp = true;
  lcp_copy_ctl_page(ptr);
  lcp_update_ctl_page(ptr, page_ptr, file_ptr);
  /* Nothing new was written, so no old data file becomes deletable. */
  ptr.p->deleteDataFileNumber = RNIL;
  lcp_write_ctl_file_to_disk(signal, file_ptr, page_ptr);
  lcp_close_data_file(signal, ptr, true);
  ptr.p->m_wait_disk_data_sync = false;
  ptr.p->m_wait_sync_extent = false;
  ptr.p->m_wait_data_file_close = false;
  ptr.p->m_outstanding_operations = 2;
}
14257
/**
 * Start a non-idle LCP: clear the empty-LCP flag and compute how many
 * parts this LCP will fully checkpoint (which in turn switches in the
 * prepared data/control file records).
 */
void
Backup::prepare_parts_for_lcp(Signal *signal, BackupRecordPtr ptr)
{
  /**
   * We need to switch in prepared data file and ctl file.
   * We make the previous execute data file and ctl file
   * record to be the new prepare data and ctl file record.
   */
  ptr.p->m_empty_lcp = false;
  calculate_number_of_parts(ptr);
}
14269
/**
 * Distribute the in_parts fully-checkpointed ("all") parts of this LCP
 * over the m_num_lcp_files data files, starting at
 * m_first_start_part_in_lcp. The first file also takes the remainder
 * when the division is uneven. The remaining parts of the circular part
 * space are recorded as change parts on the last file, so that the full
 * circle is covered and we end up back at the starting part.
 *
 * @param ptr      Backup record whose m_scan_info ranges are filled in.
 * @param in_parts Number of parts to checkpoint fully in this LCP (> 0).
 */
void
Backup::prepare_ranges_for_parts(BackupRecordPtr ptr,
                                 Uint32 in_parts)
{
#ifdef DEBUG_LCP
  TablePtr debTabPtr;
  FragmentPtr fragPtr;
  ptr.p->tables.first(debTabPtr);
  debTabPtr.p->fragments.getPtr(fragPtr, 0);
#endif
  Uint64 parts = Uint64(in_parts);
  ndbrequire(parts > 0);
  Uint32 start_part = ptr.p->m_first_start_part_in_lcp;
  Uint64 parts_per_file = parts / Uint64(ptr.p->m_num_lcp_files);
  /* The division remainder is assigned to the first file. */
  Uint64 parts_extra_in_first_file =
    parts - (parts_per_file * Uint64(ptr.p->m_num_lcp_files));
  for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
  {
    ptr.p->m_scan_info[i].m_start_all_part = start_part;
    Uint32 num_parts;
    if (i == 0)
    {
      num_parts = Uint32(parts_extra_in_first_file) + Uint32(parts_per_file);
    }
    else
    {
      num_parts = Uint32(parts_per_file);
    }
    ptr.p->m_scan_info[i].m_num_all_parts = num_parts;
    start_part = get_part_add(start_part, num_parts);
    DEB_LCP(("(%u)tab(%u,%u),m_scan_info[%u].start_all_part = %u,"
             " num_all_parts: %u",
             instance(),
             debTabPtr.p->tableId,
             fragPtr.p->fragmentId,
             i,
             ptr.p->m_scan_info[i].m_start_all_part,
             ptr.p->m_scan_info[i].m_num_all_parts));
  }
  /* Whatever is not checkpointed fully is recorded as change parts. */
  Uint32 num_change_parts = BackupFormat::NDB_MAX_LCP_PARTS - parts;
  ptr.p->m_scan_info[ptr.p->m_num_lcp_files-1].m_start_change_part =
    start_part;
  ptr.p->m_scan_info[ptr.p->m_num_lcp_files-1].m_num_change_parts =
    num_change_parts;
  start_part = get_part_add(start_part, num_change_parts);
  /* After all+change parts we must be back where we started. */
  ndbassert(start_part == ptr.p->m_first_start_part_in_lcp);
  ndbassert(is_partial_lcp_enabled() || num_change_parts == 0);
  DEB_LCP(("(%u)tab(%u,%u),m_scan_info[%u].start_change_part = %u,"
           " num_all_parts: %u",
           instance(),
           debTabPtr.p->tableId,
           fragPtr.p->fragmentId,
           ptr.p->m_num_lcp_files - 1,
           ptr.p->m_scan_info[ptr.p->m_num_lcp_files-1].m_start_change_part,
           ptr.p->m_scan_info[ptr.p->m_num_lcp_files-1].m_num_change_parts));
}
14326
void
Backup::prepare_new_part_info(BackupRecordPtr ptr, Uint32 new_parts)
{
  /**
   * Merge the part range of this LCP (new_parts ALL-parts starting at
   * m_first_start_part_in_lcp) into the history of part ranges kept in
   * m_part_info. Data files from older LCPs whose part range is now
   * completely covered by the new parts are no longer needed for
   * restore and are scheduled for deletion after the LCP completes.
   */
  Uint32 remove_files = 0;
  ptr.p->m_num_parts_in_this_lcp = new_parts;
  Uint32 old_num_parts = ptr.p->m_num_parts_in_lcp;
  if (old_num_parts != 0)
  {
    /**
     * Count how many of the oldest files are fully covered by the new
     * part range [new_start_part, new_end_part). Both ranges start at
     * the same part (verified below), so plain additive end-points can
     * be compared without wrap-around handling here.
     */
    Uint32 new_start_part = ptr.p->m_first_start_part_in_lcp;
    Uint32 new_end_part = new_start_part + new_parts;
    Uint32 old_start_part = ptr.p->m_part_info[0].startPart;
    Uint32 old_end_part = old_start_part;
    ndbrequire(new_start_part == old_start_part);
    jam();
    do
    {
      jam();
      Uint32 old_parts = ptr.p->m_part_info[remove_files].numParts;
      old_end_part += old_parts;
      if (old_end_part > new_end_part)
      {
        jam();
        /* This file has to be kept */
        break;
      }
      old_num_parts--;
      remove_files++;
    } while (old_num_parts > 0);
  }
  Uint32 remaining_files = ptr.p->m_num_parts_in_lcp - remove_files;
  /* First remove all files no longer used */
  for (Uint32 i = 0; i < remaining_files; i++)
  {
    /* Shift the kept entries down over the removed ones. */
    ptr.p->m_part_info[i] = ptr.p->m_part_info[i + remove_files];
    DEB_EXTRA_LCP(("(%u)Parts(%u,%u)",
                   instance(),
                   ptr.p->m_part_info[i].startPart,
                   ptr.p->m_part_info[i].numParts));
  }

  /**
   * The first set of parts is now likely too many parts. The new set of
   * parts have eaten into this from the start. So it needs to be moved
   * ahead as many parts as we have eaten into it.
   */
  if (remaining_files >= 1)
  {
    jam();
    Uint32 new_first_part = get_part_add(
      ptr.p->m_scan_info[0].m_start_all_part, new_parts);
    Uint32 old_first_part = ptr.p->m_part_info[0].startPart;
    Uint32 decrement_parts;
    if (old_first_part > new_first_part)
    {
      jam();
      /* The circular part numbering wrapped past NDB_MAX_LCP_PARTS. */
      decrement_parts = (new_first_part +
        BackupFormat::NDB_MAX_LCP_PARTS) - old_first_part;
    }
    else
    {
      jam();
      decrement_parts = new_first_part - old_first_part;
    }
    /* The oldest kept file must retain at least one part. */
    ndbrequire(decrement_parts < ptr.p->m_part_info[0].numParts);
    ptr.p->m_part_info[0].numParts -= decrement_parts;
    ptr.p->m_part_info[0].startPart = new_first_part;
    DEB_EXTRA_LCP(("(%u)New first data file span is (%u,%u)",
                   instance(),
                   ptr.p->m_part_info[0].startPart,
                   ptr.p->m_part_info[0].numParts));
  }

  /**
   * Calculate file numbers of files to delete after LCP is
   * completed.
   */
  ptr.p->m_lcp_remove_files = remove_files;
  if (remove_files == 0)
  {
    jam();
    ptr.p->deleteDataFileNumber = RNIL;
  }
  else
  {
    /**
     * Data file numbers are circular as well; step back past both the
     * kept and the removed files to find the first file to delete.
     */
    Uint32 move_back_files = remove_files + remaining_files;
    ptr.p->deleteDataFileNumber = get_file_sub(
      ptr.p->m_first_data_file_number,
      move_back_files);

    DEB_LCP(("(%u)m_first_data_file_number = %u, deleteDataFileNumber: %u,"
             " remove_files: %u",
             instance(),
             ptr.p->m_first_data_file_number,
             ptr.p->deleteDataFileNumber,
             remove_files));
  }

  /* Insert the new parts at the end */
  /* NOTE: old_num_parts equals remaining_files at this point: it was
   * decremented once per removed file in the loop above (or both are 0
   * when there was no part history). */
  jamLineDebug(ptr.p->m_num_lcp_files);
  for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
  {
    jamDebug();
    ptr.p->m_part_info[old_num_parts + i].startPart =
      ptr.p->m_scan_info[i].m_start_all_part;
    ptr.p->m_part_info[old_num_parts + i].numParts =
      ptr.p->m_scan_info[i].m_num_all_parts;
    ndbrequire(ptr.p->m_part_info[old_num_parts + i].startPart <
               BackupFormat::NDB_MAX_LCP_PARTS);
    ndbrequire(ptr.p->m_part_info[old_num_parts + i].numParts <=
               BackupFormat::NDB_MAX_LCP_PARTS);
  }
  jamLineDebug(remaining_files);
  ptr.p->m_num_parts_in_lcp = ptr.p->m_num_lcp_files + remaining_files;
  ptr.p->m_max_parts_in_lcp = BackupFormat::NDB_MAX_LCP_PARTS;
#ifdef VM_TRACE
  /* Debug-build check: the recorded ranges must cover all parts exactly. */
  Uint32 total_parts = 0;
  jam();
  for (Uint32 i = 0; i < ptr.p->m_num_parts_in_lcp; i++)
  {
    Uint32 numParts = ptr.p->m_part_info[i].numParts;
    total_parts += numParts;
  }
  ndbassert(total_parts == BackupFormat::NDB_MAX_LCP_PARTS);
#endif
}
14452
14453 Uint32
calculate_min_parts(Uint64 row_count,Uint64 row_change_count,Uint64 mem_used,Uint64 total_mem)14454 Backup::calculate_min_parts(Uint64 row_count,
14455 Uint64 row_change_count,
14456 Uint64 mem_used,
14457 Uint64 total_mem)
14458 {
14459 /**
14460 * Calculates
14461 * min_parts = 1 + (2048 * k) / (k + p)
14462 * let y = row_change_count / row_count
14463 * let z = y * (mem_used / total_mem)
14464 * let k = y + z * 0.5
14465 * where k = (row_change_count / row_count) +
14466 * 0.5 * (mem_used / total_mem)
14467 * let p = RecoveryWork configuration parameter
14468 *
14469 * as explained below.
14470 *
14471 * Broken down to:
14472 * memory_used = memory_used / (1024 * 1024)
14473 * total_memory = total_memory / (1024 * 1024)
14474 * This means we are ignoring anything not in the range of MBytes to ensure
14475 * we don't overflow the 64 bits.
14476 */
14477
14478 Uint32 recovery_work = get_recovery_work();
14479
14480 if (!is_partial_lcp_enabled() || row_count == 0)
14481 {
14482 jam();
14483 /**
14484 * We have configured the defaults to be that we always execute a full LCP.
14485 * The LCP can still be a multi-file one, but we will never have to handle
14486 * anything related to CHANGE ROWS pages.
14487 *
14488 * If no rows exists in table we might as well run a full LCP.
14489 */
14490 return BackupFormat::NDB_MAX_LCP_PARTS;
14491 }
14492 if (row_count < row_change_count)
14493 {
14494 jam();
14495 row_change_count = row_count;
14496 }
14497 mem_used /= Uint64(1024 * 1024);
14498 total_mem /= Uint64(1024 * 1024);
14499 if (total_mem == Uint64(0))
14500 {
14501 jam();
14502 total_mem = 1;
14503 }
14504
14505 double y = double(row_change_count);
14506 y = y / double(row_count);
14507
14508 double z = double(mem_used);
14509 z = z / double(total_mem);
14510 z = z * y;
14511
14512 double k = y + (z / double(2));
14513
14514 double parts = double(2048) * k;
14515
14516 double p = double(recovery_work) / double(100);
14517 double parts_divisor = p + k;
14518
14519 parts = parts / parts_divisor;
14520 parts = parts + double(1);
14521
14522 Uint32 min_parts = Uint32(parts);
14523 ndbrequire(min_parts < Uint32(BackupFormat::NDB_MAX_LCP_PARTS));
14524 return min_parts;
14525 }
14526
14527 /**
14528 * This function is closely related to the simulations performed by the
 * lcp_simulator.cc program. These simulations show that it is
 * sufficient to count as little as 70% of the inserts and still
 * maintain the same LCP size and recovery time. Even decreasing it to
 * 50% means that we can only temporarily increase the LCP by 3.3%, and
 * decreasing it to 40% means we can increase it by 6.7%. Even
 * decreasing it to 0, and thus only writing the changed rows after an
 * insert with no extra speed-up of LCPs due to inserts, would still
 * only increase the maximum LCP size by 30%. The default setting is
 * now 40% and it can be set between 0
14538 *
14539 * If faster restarts are desired one should instead set RecoveryWork
14540 * lower.
14541 *
14542 * Deletes were shown to need a bit more parts, so we set a delete to
14543 * mean the same as 1.2 updates. There are no common use cases for
14544 * massive deletes, so we do not make this configurable, this is
14545 * hard coded.
14546 *
14547 * The idea of how to apply this is to split up row_change_count in
14548 * an update part, an insert part and a delete part. We multiply
14549 * the update part by 1, the delete part by 1.2 and the insert part
14550 * by the configured InsertRecoveryWork (defaults to 0.4).
14551 */
14552 Uint64
calculate_row_change_count(BackupRecordPtr ptr)14553 Backup::calculate_row_change_count(BackupRecordPtr ptr)
14554 {
14555 Uint64 insert_recovery_work = (Uint64)get_insert_recovery_work();
14556 Uint64 delete_recovery_work = (Uint64)DELETE_RECOVERY_WORK;
14557 Uint64 row_count = ptr.p->m_row_count;
14558 Uint64 prev_row_count = ptr.p->m_prev_row_count;
14559 Uint64 row_change_count = ptr.p->m_row_change_count;
14560 Uint64 decrease_row_change_count = 0;
14561 Uint64 new_rows, dropped_rows;
14562 if (row_count > prev_row_count)
14563 {
14564 jam();
14565 new_rows = row_count - prev_row_count;
14566 dropped_rows = 0;
14567 decrease_row_change_count = new_rows;
14568 }
14569 else
14570 {
14571 jam();
14572 new_rows = 0;
14573 dropped_rows = prev_row_count - row_count;
14574 decrease_row_change_count = dropped_rows;
14575 }
14576 if (decrease_row_change_count > row_change_count)
14577 {
14578 g_eventLogger->info("prev_row_count: %llu, row_count: %llu,"
14579 " row_change_count: %llu",
14580 prev_row_count,
14581 row_count,
14582 row_change_count);
14583 }
14584 ndbrequire(decrease_row_change_count <= row_change_count);
14585
14586 row_change_count -= decrease_row_change_count;
14587
14588 new_rows *= insert_recovery_work;
14589 new_rows /= (Uint64)100;
14590
14591 dropped_rows *= delete_recovery_work;
14592 dropped_rows /= Uint64(100);
14593
14594 row_change_count += new_rows;
14595 row_change_count += dropped_rows;
14596
14597 return row_change_count;
14598 }
14599
14600 Uint64
get_total_memory()14601 Backup::get_total_memory()
14602 {
14603 Resource_limit res_limit;
14604 m_ctx.m_mm.get_resource_limit(RG_DATAMEM, res_limit);
14605 const Uint32 pages_used = res_limit.m_curr;
14606 const Uint64 dm_used = Uint64(pages_used) * Uint64(sizeof(GlobalPage));
14607 const Uint64 num_ldms = getLqhWorkers() != 0 ?
14608 (Uint64)getLqhWorkers() : (Uint64)1;
14609 const Uint64 total_memory = dm_used / num_ldms;
14610 return total_memory;
14611 }
14612
void
Backup::calculate_number_of_parts(BackupRecordPtr ptr)
{
  /**
   * Here we decide on how many parts we need to use for this LCP.
   * As input we have:
   * 1) Row count
   * 2) Row change count since last LCP
   *    => Percentage of rows changed since last LCP
   *
   * The percentage of rows changed since last LCP is the most
   * important input to this algorithm. This gives us a minimum number
   * of parts that we need to write as part of this LCP.
   *
   * There is an overhead in not writing full LCPs. The overhead is
   * dependent on the amount of changed rows in comparison with the
   * percentage of parts written.
   *
   * The overhead formula can be written as:
   * (1 - x) * (y + 0.5 * z) / x
   * where:
   * x = percentage of parts fully written in this LCP
   * y = percentage of rows changed since last LCP
   * z = percentage of rows changed during LCP
   *
   * The (1 - x) comes from that only the parts not written have
   * overhead for writing changed rows.
   *
   * The y comes from that writing changed rows is an overhead.
   *
   * The 0.5 * z comes from that writing changed rows during the LCP
   * is also an overhead, however only half of those rows will
   * actually be written since the LCP scan will not see rows
   * changed before the scan pointer.
   *
   * The division comes from that the first part of the formula is
   * the overhead cost for one LCP. However a full LCP consists of
   * 1/x LCPs.
   *
   * We want to select an x such that the overhead becomes smaller
   * than some select value.
   *
   * We can also have overhead in that we have written more parts
   * than are actually needed. To avoid that this overhead is
   * unnecessary big we will ensure that we never write any files
   * that contains more than 1/8th of the parts. This means that at
   * most we can get 12.5% overhead due to extra parts being written.
   *
   * We will try to ensure that x is chosen such that overhead is
   * smaller than p where p is the overhead percentage. p is
   * configurable in the RecoveryWork parameter and can be set between
   * 25 and 100%. It defaults to 50%.
   *
   * This means that we should at most require
   * 60% overhead compared to the data memory size. This number
   * is based on that we don't have an extreme amount of small
   * fragments with very small memory sizes. In this case the
   * overhead of writing table meta data as well will make the
   * overhead. So with most applications we can guarantee that the
   * overhead stays below 60% and actually in most cases we will
   * probably even have an overhead of around 40%.
   *
   * So we want to select an x such that:
   * (1 - x) (y + z*0.5) / x < p
   *
   * Now at start of an LCP for a fragment we can treat both y and z
   * as constants, so let us call (y + 0.5*z) k.
   * =>
   * (1 - x) * k < p * x
   * =>
   * k - k * x < p * x
   * =>
   * k < (k + p) * x
   * =>
   * x > k / (k + p)
   * where k = y + 0.5 * z
   *
   * Now x is the percentage of parts we should use, when x = 1 we have
   * 2048 parts. So replacing x by parts we get.
   *
   * parts > 2048 * k / (k + p)
   * We will select min_parts = 1 + (2048 * k) / (k + p)
   *
   * Now we know the following:
   * row_count, row_change_count, memory_used_in_fragment, total_memory_used
   * This gives:
   * y = row_change_count / row_count
   * z = (row_change_count / row_count) *
   *     (memory_used_in_fragment / total_memory_used)
   *
   * The calculation of z is a prediction based on history, so a sort of
   * Bayesian average.
   *
   * Now if we assume that the LCP have entered a steady state with a steady
   * flow of writes going on.
   *
   * When the k-value above is large we certainly benefits most from writing
   * entire set. If for example 70% of the data set was changed the execution
   * overhead of writing everything is only 50% and this certainly pays off
   * in order to make restart faster by writing the entire data set in this
   * case.
   *
   * At the other end of the spectrum we have small k-values (around 1% or
   * even smaller), in this the above equation can be simplified to
   * parts = k / p
   * Thus p = 25% => parts = 4 * k
   * p = 50% => parts = 2 * k
   * p = 100% => parts = k
   *
   * Now k is more or less the percentage of data changing between LCPs.
   * So if we have a 1 TByte database and k is 1% we will write 10 GByte
   * per LCP to the database. This means 10 GByte will be written to the
   * REDO log (can be smaller or larger since REDO log have a 4 byte overhead
   * per column, but the REDO log only writes changed columns), almost
   * 10 GByte will be written to the CHANGE pages in the partial LCP
   *
   * Thus with p = 25% we will write 60 GByte to disk, with p = 50% we will
   * write 40 GByte to disk and with p = 100% we will write 30 GByte to
   * disk to handle 10 Gbytes of writes.
   *
   * The other side of the picture is that increasing p means that more
   * storage space is needed for LCP files. We need (1 + p) * DataMemory
   * of storage space for LCP files (unless we use compression when
   * this should be divided by at least 2). Actually the storage space
   * should in the worst case be increased by 12.5% of the DataMemory
   * size since we might need to keep LCP data no longer needed since
   * we only delete LCP files and not parts of a file.
   *
   * The third side of the picture is that higher p means longer time to
   * read in the LCP at restart. If we assume in the above example that
   * we use p = 25%, thus x = 40GByte of parts, thus 25 LCPs are needed
   * to restore data. In each such LCP there will be 10 GByte of updated
   * rows extra, but only half of those need to be applied (mean value).
   * Thus the extra processing during restart is p/2%. So with p = 25%
   * we will execute 12.5% more rows compared to if all rows fitted in
   * one LCP. We will have to read all LCP files from disk though, so
   * we need to read 25% more from disk during restart.
   *
   * So thus it becomes natural to think of the p value as the
   * work we are willing to put into recovery during normal operation.
   * The more work we do during normal operation, the less work we need
   * to do during recovery.
   *
   * Thus we call the config parameter RecoveryWork where small values
   * means lots of work done and higher values means smaller amount of
   * work done.
   *
   * Given that decreasing p beyond 25% increases the load of LCPs
   * exponentially we set the minimum p to be 25%. Increasing
   * p beyond 100% means exponentially smaller benefits with
   * linearly increasing recovery, we set the upper limit at 100%
   * for p.
   *
   * It is still possible to use the old algorithm where we always
   * write everything in each LCP. This is kept for better backwards
   * compatibility and for risk averse users. It also works very well
   * still for smaller database sizes that updates most of the data
   * all the time.
   *
   * Independent of all these settings we will never write any new LCP
   * data files (only LCP control files will be updated) when no changes
   * have been made to a table. This will be a great benefit to all
   * database tables that are read-only most of the time.
   *
   * 3) Total memory size used for memory part of rows
   *    => Memory size needed to log changed rows
   *    => Memory size needed to write each part of the LCP
   *
   * Total memory used gives us an indication if we need to bother about
   * splitting it into parts at all. We don't care about parts smaller
   * than 64 kBytes. Also we will never split it into parts smaller than
   * 64 kBytes.
   *
   * 4) Total memory space
   * 5) Number of LDMs in the node
   *    => Approximate memory space used by this LDM
   *
   * This gives us a good understanding how large this fragment is
   * compared to the rest of the memory in this LDM.
   *
   * 6) Current disk write speed
   *
   * This gives a good approximation of how long time this particular
   * fragment LCP will take, it will also give us an indication of how
   * long time the entire LCP will take.
   *
   * 7) Total REDO log size for our log part
   * 8) Total free REDO log size for our log part
   * 9) => Percentage used of REDO log for our log part
   * 10) We also keep free REDO log size from last LCP we executed and the
   *     timestamp for when we last was here. This helps us calculating the
   *     speed we are writing REDO log at.
   *
   * We mainly use this to see if we are close to running out of REDO
   * log, if we are we need to speed up LCP processing by raising the
   * speed of disk writes for LCP.
   *
   * 11) Time used for last distributed LCP
   * 12) Time used for last LCP locally
   */

  const Uint64 total_memory = get_total_memory();

  /**
   * There are four rules that apply for choosing the number of parts to
   * write all rows in.
   * 1) Make sure that overhead doesn't exceed p% for partial LCPs
   *    So we call this rule 1, rule 1 says that we will select the number
   *    of parts that gives p% overhead.
   *
   * 2) Avoid overhead when it doesn't provide any value, if e.g. we
   *    have 80% of the rows that have been changed then the calculation
   *    means that we're going to use actually less than 80% (about 78%)
   *    since that brings about p% overhead. Obviously there is no sense
   *    in creating overhead in this case since we will write 78% of the
   *    rows + 80% of the remaining 22%. Thus we get an overhead of 25%
   *    to save 4.4% of the row writes which doesn't make a lot of sense.
   *
   *    Rule 2 says that we will select all parts if we have changed
   *    more than 70% of the rows. Otherwise rule 2 selects 0 parts.
   *
   *    An observation here is that during heavy deletes patterns we will
   *    very often fall back to full LCPs since the number of rows is
   *    getting smaller whereas the number of changed rows is increasing.
   *
   *    In a sense this is positive since it means that we will quickly
   *    remove LCP files that contain deleted rows, this space might be
   *    needed by other tables that at the same time gets many inserts.
   *
   * 3) The number of pages sets a limit on how small the number of parts
   *    can be. So with 1 page we can only perform full LCPs, with 2 pages
   *    we can never checkpoint with less than 1024 parts, so the rule
   *    here is that we never go below 2048 divided by number of pages.
   *    This ensures that most of the time there is at least one page
   *    that will write ALL rows in the page.
   *
   * 4) First LCP on fragment must always be a full LCP.
   *    Rule 4 is 2048 parts when first LCP, otherwise it is 0.
   *
   * 5) This rules says that the minimum number of parts is 1, we will
   *    never run an LCP with 0 parts.
   *
   * In conclusion we will select the rule that returns the highest number
   * of parts.
   */
  Uint64 row_count = ptr.p->m_row_count;
  Uint64 memory_used = ptr.p->m_memory_used_in_bytes;
  Uint64 row_change_count = calculate_row_change_count(ptr);
  /* Rule 1: enough parts to keep partial LCP overhead below p%. */
  Uint32 min_parts_rule1 = calculate_min_parts(row_count,
                                               row_change_count,
                                               memory_used,
                                               total_memory);

  /* Rule 2: full LCP when more than 70% of the rows have changed. */
  Uint32 min_parts_rule2 = 0;
  if ((Uint64(10) * row_change_count) >
      (Uint64(7) * row_count))
  {
    jam();
    min_parts_rule2 = BackupFormat::NDB_MAX_LCP_PARTS;
  }

  /* Rule 3: never fewer than NDB_MAX_LCP_PARTS / page count parts. */
  Uint32 min_parts_rule3 = BackupFormat::NDB_MAX_LCP_PARTS;
  if (ptr.p->m_lcp_max_page_cnt > 1)
  {
    jam();
    min_parts_rule3 = BackupFormat::NDB_MAX_LCP_PARTS /
                      ptr.p->m_lcp_max_page_cnt;
  }
  /* Rule 4: the first LCP on the fragment is always a full LCP. */
  Uint32 min_parts_rule4 = 0;
  if (ptr.p->preparePrevLcpId == 0)
  {
    jam();
    min_parts_rule4 = BackupFormat::NDB_MAX_LCP_PARTS;
  }
  /**
   * We can never go below 1 part, this is the absolute minimum even if
   * all rules say 0.
   */
  Uint32 min_parts_rule5 = 1;
  Uint32 parts = MAX(MAX(min_parts_rule1, min_parts_rule2),
                     MAX(min_parts_rule3,
                         MAX(min_parts_rule4, min_parts_rule5)));

  if (ERROR_INSERTED(10048) && min_parts_rule4 == 0)
  {
    /**
     * We need this in test cases to ensure that we can create a situation
     * with 1 part per LCP and having more than 980 parts and even close to
     * 2048 LCPs to restore a LCP.
     */
    jam();
    g_eventLogger->info("Set to 1 part by ERROR 10048 injection");
    parts = 1;
  }
#ifdef DEBUG_LCP_STAT
  TablePtr debTabPtr;
  FragmentPtr fragPtr;
  ptr.p->tables.first(debTabPtr);
  debTabPtr.p->fragments.getPtr(fragPtr, 0);
  DEB_LCP_STAT(("(%u)tab(%u,%u), row_count: %llu, calc_row_change_count: %llu"
                ", prev_row_count: %llu, "
                "memory_used: %llu kB, total_dm_memory: %llu MB, "
                "parts: %u, min_parts_rule1: %u, "
                "min_parts_rule3: %u",
                instance(),
                debTabPtr.p->tableId,
                fragPtr.p->fragmentId,
                row_count,
                row_change_count,
                ptr.p->m_prev_row_count,
                memory_used / 1024,
                total_memory / (1024 * 1024),
                parts,
                min_parts_rule1,
                min_parts_rule3));
#endif
  /**
   * We have now calculated the parts to use in this LCP.
   * Now we need to calculate how many LCP files to use for this
   * LCP.
   *
   * The calculation of this is to use 1 file per 12.5% of the
   * parts. Each file must still be at least one fixed page
   * since this is what makes us choose which part something
   * goes into.
   */
  Uint32 min_file_rule_1 =
    (BackupFormat::NDB_MAX_FILES_PER_LCP * parts +
     ((BackupFormat::NDB_MAX_LCP_PARTS / BackupFormat::NDB_MAX_FILES_PER_LCP) -
      1)) /
    BackupFormat::NDB_MAX_LCP_PARTS;
  Uint32 min_file_rule = MAX(1, min_file_rule_1);
  Uint32 max_file_rule_1 = ptr.p->m_lcp_max_page_cnt;
  Uint32 max_file_rule_2 = BackupFormat::NDB_MAX_FILES_PER_LCP;
  Uint32 max_file_rule = MIN(max_file_rule_1, max_file_rule_2);
  max_file_rule = MAX(1, max_file_rule);
  Uint32 num_lcp_files = MIN(min_file_rule, max_file_rule);
  if (!is_partial_lcp_enabled())
  {
    /**
     * To not set EnablePartialLcp to true is mostly there to be able to
     * use NDB as close to the 7.5 manner as possible, this means also not
     * using 8 files when partial LCP isn't enabled. So we use only one
     * file here, it will always be full writes in this case.
     */
    jam();
    num_lcp_files = 1;
  }
  ptr.p->m_num_lcp_files = num_lcp_files;
  DEB_EXTRA_LCP(("(%u) min_file_rules1 = %u, max_file_rule1 = %u",
                 instance(),
                 min_file_rule_1,
                 max_file_rule_1));
  DEB_LCP(("(%u) LCP using %u files",
           instance(),
           ptr.p->m_num_lcp_files));

  /**
   * We will now prepare the BackupRecord such that it has all the
   * information set up to execute this LCP.
   */
  prepare_ranges_for_parts(ptr, parts);
  prepare_new_part_info(ptr, parts);
}
14977
14978 void
lcp_swap_tables(BackupRecordPtr ptr,TablePtr & tabPtr,Uint32 tableId)14979 Backup::lcp_swap_tables(BackupRecordPtr ptr,
14980 TablePtr & tabPtr,
14981 Uint32 tableId)
14982 {
14983 ptr.p->prepare_table.first(tabPtr);
14984 ndbrequire(tabPtr.p->tableId == tableId);
14985 ptr.p->prepare_table.removeFirst(tabPtr);
14986
14987 TablePtr newPrepareTablePtr;
14988 ptr.p->tables.removeFirst(newPrepareTablePtr);
14989 ptr.p->tables.addFirst(tabPtr);
14990 ptr.p->prepare_table.addFirst(newPrepareTablePtr);
14991 }
14992
14993 void
lcp_swap_data_file(BackupRecordPtr ptr)14994 Backup::lcp_swap_data_file(BackupRecordPtr ptr)
14995 {
14996 Uint32 newPrepareDataFilePtr = ptr.p->dataFilePtr[0];
14997 ptr.p->dataFilePtr[0] = ptr.p->prepareDataFilePtr[0];
14998 ptr.p->prepareDataFilePtr[0] = newPrepareDataFilePtr;
14999 }
15000
15001 void
lcp_swap_ctl_file(BackupRecordPtr ptr)15002 Backup::lcp_swap_ctl_file(BackupRecordPtr ptr)
15003 {
15004 Uint32 newPrepareCtlFilePtr = ptr.p->ctlFilePtr;
15005 ptr.p->ctlFilePtr =
15006 ptr.p->prepareCtlFilePtr[ptr.p->prepareNextLcpCtlFileNumber];
15007 ptr.p->prepareCtlFilePtr[ptr.p->prepareNextLcpCtlFileNumber] =
15008 newPrepareCtlFilePtr;
15009 }
15010
15011 void
copy_lcp_info_from_prepare(BackupRecordPtr ptr)15012 Backup::copy_lcp_info_from_prepare(BackupRecordPtr ptr)
15013 {
15014 ptr.p->m_scan_change_gci = ptr.p->m_prepare_scan_change_gci;
15015 Uint32 total_parts = 0;
15016 for (Uint32 i = 0; i < ptr.p->m_prepare_num_parts_in_lcp; i++)
15017 {
15018 Uint32 num_parts = ptr.p->m_prepare_part_info[i].numParts;
15019 total_parts += num_parts;
15020 ptr.p->m_part_info[i] = ptr.p->m_prepare_part_info[i];
15021 }
15022 ndbrequire(total_parts == 0 || /* First LCP */
15023 total_parts == BackupFormat::NDB_MAX_LCP_PARTS);
15024
15025 ptr.p->m_num_parts_in_lcp = ptr.p->m_prepare_num_parts_in_lcp;
15026 ptr.p->m_max_parts_in_lcp = ptr.p->m_prepare_max_parts_in_lcp;
15027 ptr.p->m_first_start_part_in_lcp =
15028 ptr.p->m_prepare_first_start_part_in_lcp;
15029 ptr.p->m_first_data_file_number = ptr.p->prepareFirstDataFileNumber;
15030 ptr.p->deleteCtlFileNumber = ptr.p->prepareDeleteCtlFileNumber;
15031 }
15032
15033 /**
15034 * An important part of starting an LCP is to insert a record in the
15035 * UNDO log record indicating start of the LCP. This is used to ensure
15036 * that main memory rows restored and the disk data restored is in
15037 * perfect synch with each other. This UNDO log record must be
15038 * completely synchronised with start of LCP scanning.
15039 */
void
Backup::lcp_write_undo_log(Signal *signal,
                           BackupRecordPtr ptr)
{
  /**
   * Write the UNDO_LCP log record for this fragment if the table has
   * disk columns, and record its LSN in m_current_lcp_lsn. An LSN of
   * 0 marks a memory-only table for which no UNDO log record is
   * needed. See the comment above this function: the UNDO_LCP record
   * must be synchronised exactly with the start of the LCP scan.
   */
  TablePtr tabPtr;
  ptr.p->tables.first(tabPtr);
  if (c_lqh->is_disk_columns_in_table(tabPtr.p->tableId))
  {
    jam();
    /* Fill in the LcpFragOrd signal payload consumed by LGMAN. */
    LcpFragOrd *ord = (LcpFragOrd*)signal->getDataPtr();
    FragmentPtr fragPtr;
    tabPtr.p->fragments.getPtr(fragPtr, 0);
    ord->tableId = tabPtr.p->tableId;
    ord->fragmentId = fragPtr.p->fragmentId;
    ord->lcpId = ptr.p->backupId;
    {
      /* Inner scope bounds the lifetime of the Logfile_client object. */
      Logfile_client lgman(this, c_lgman, 0);
      ptr.p->m_current_lcp_lsn = lgman.exec_lcp_frag_ord(signal,
                                   c_lqh->get_current_local_lcp_id());
      /* A valid UNDO_LCP record always has a non-zero LSN. */
      ndbrequire(ptr.p->m_current_lcp_lsn > Uint64(0));
    }
  }
  else
  {
    jam();
    /* No disk columns: no UNDO log record, flag with LSN 0. */
    ptr.p->m_current_lcp_lsn = Uint64(0);
  }
}
15068
15069 /**
15070 * Start execution of LCP after receiving BACKUP_FRAGMENT_REQ
15071 *
15072 * When executing this method we know that there is no
15073 * LCP_PREPARE processing ongoing and there is no LCP
15074 * execution processing going on. So this is a safe place to
15075 * move data from prepare part of BackupRecord to execution
15076 * part of the BackupRecord.
15077 */
15078 void
start_execute_lcp(Signal * signal,BackupRecordPtr ptr,TablePtr & tabPtr,Uint32 tableId)15079 Backup::start_execute_lcp(Signal *signal,
15080 BackupRecordPtr ptr,
15081 TablePtr & tabPtr,
15082 Uint32 tableId)
15083 {
15084 init_extended_lcp_stat();
15085 ptr.p->slaveState.setState(STARTED);
15086 ndbrequire(ptr.p->prepareState == PREPARED);
15087 ptr.p->prepareState = NOT_ACTIVE;
15088 ptr.p->m_lcp_lsn_synced = 1;
15089 ptr.p->m_num_lcp_data_files_open = 1;
15090 ptr.p->m_bytes_written = 0;
15091 ptr.p->m_row_scan_counter = 0;
15092 ptr.p->m_last_recorded_bytes_written = 0;
15093 ptr.p->m_pause_counter = 0;
15094 pausing_lcp(3,0);
15095
15096 copy_lcp_info_from_prepare(ptr);
15097
15098 /**
15099 * We need to switch places on prepare table
15100 * execute table.
15101 */
15102 lcp_swap_tables(ptr, tabPtr, tableId);
15103 lcp_swap_data_file(ptr);
15104 lcp_swap_ctl_file(ptr);
15105
15106 lcp_write_undo_log(signal, ptr);
15107 /**
15108 * With the introduction of Partial LCPs we need to calculate how
15109 * many parts that should be part of this LCP.
15110 *
15111 * We tell LDM that we are about to start a new LCP. This means that
15112 * we want to know the number of rows changed since last LCP. We
15113 * want also to know the current number of rows to calculate the
15114 * proportion between updated rows and the number of rows in total
15115 * in the fragment.
15116 *
15117 * We treat 0 updated rows as a special case. This means that not a
15118 * single commit has changed any rows since the last LCP started.
15119 * In this special case we can actually still use the data files
15120 * from the old LCP. We do however still need to write a new LCP
15121 * control file. This is the case since we need to update the
15122 * MaxGciCompleted in the LCP control file which is very
15123 * important. It is this value which makes it possible for us to
15124 * use the LCP to cut the REDO log tail (which in principle is
15125 * the main reason for doing LCPs, to cut the REDO log tail).
15126 *
15127 * The 0 updated rows is most likely a very common case and will
15128 * save us radical amounts of REDO log processing in idle nodes.
15129 * If this is the very first LCP we are performing, then we
15130 * will still go ahead and perform the LCP to simplify the code.
15131 */
15132 c_lqh->get_lcp_frag_stats(ptr.p->m_row_count,
15133 ptr.p->m_prev_row_count,
15134 ptr.p->m_row_change_count,
15135 ptr.p->m_memory_used_in_bytes,
15136 ptr.p->m_lcp_max_page_cnt);
15137 Uint32 newestGci = c_lqh->get_lcp_newest_gci();
15138
15139 FragmentPtr fragPtr;
15140 ptr.p->tables.first(tabPtr);
15141 tabPtr.p->fragments.getPtr(fragPtr, 0);
15142 #ifdef DEBUG_LCP_STAT
15143 DEB_LCP_STAT((
15144 "(%u)TAGY LCP_Start: tab(%u,%u).%u, row_count: %llu,"
15145 " row_change_count: %llu,"
15146 " prev_row_count: %llu,"
15147 " memory_used_in_bytes: %llu, max_page_cnt: %u, LCP lsn: %llu",
15148 instance(),
15149 tabPtr.p->tableId,
15150 fragPtr.p->fragmentId,
15151 c_lqh->getCreateSchemaVersion(ttabPtr.p->tableId),
15152 ptr.p->m_row_count,
15153 ptr.p->m_row_change_count,
15154 ptr.p->m_prev_row_count,
15155 ptr.p->m_memory_used_in_bytes,
15156 ptr.p->m_lcp_max_page_cnt,
15157 ptr.p->m_current_lcp_lsn));
15158 #endif
15159
15160 if (ptr.p->m_row_change_count == 0 &&
15161 ptr.p->preparePrevLcpId != 0 &&
15162 (ptr.p->prepareMaxGciWritten == newestGci &&
15163 m_our_node_started) &&
15164 c_pgman->idle_fragment_lcp(tabPtr.p->tableId,
15165 fragPtr.p->fragmentId))
15166 {
15167 /**
15168 * We don't handle it as an idle LCP when it is the first LCP
15169 * executed on the fragment. In this case we need to run a normal
15170 * LCP even if it produces an empty LCP data file.
15171 *
15172 * Also if someone has committed a transaction on the fragment
15173 * we will not treat it as an idle LCP even if row change count
15174 * hasn't changed.
15175 */
15176 jam();
15177 handle_idle_lcp(signal, ptr);
15178 return;
15179 }
15180 else
15181 {
15182 jam();
15183 prepare_parts_for_lcp(signal, ptr);
15184 }
15185 }
15186
15187 /**
15188 * We have finished writing of a fragment, the file is written to
15189 * disk and we can start the complete processing of the LCP for
15190 * this fragment.
15191 */
void
Backup::lcp_close_data_file(Signal* signal,
                            BackupRecordPtr ptr,
                            bool delete_flag)
{
  /**
   * Close the first LCP data file of this fragment LCP.
   * delete_flag is passed on to closeFile so the file can be
   * removed on close when the LCP decided it is not needed.
   */
  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[0]);
  closeFile(signal, ptr, filePtr, false, delete_flag);
}
15201
15202 void
lcp_start_complete_processing(Signal * signal,BackupRecordPtr ptr)15203 Backup::lcp_start_complete_processing(Signal *signal, BackupRecordPtr ptr)
15204 {
15205 /**
15206 * We start wait here for 2 parallel events.
15207 * 1) Sync:ing page cache and extent pages
15208 * 2) Finalising write of LCP data file and closing it
15209 *
15210 * After these events are ready we will check if the LSN have been synched
15211 * yet. If it hasn't we will still write the LCP control file, but we will
15212 * write with an invalid flag set in it. We will later rewrite it before
15213 * deleting the data files.
15214 *
15215 * When all of those are done we will write the control file and when this
15216 * write is completed and the file closed then we will report the LCP back
15217 * as completed.
15218 *
15219 * The only reason for syncing the UNDO log is to ensure that if no
15220 * pages at all was written as part of LCP for the fragment, then we
15221 * still need to ensure that the UNDO_LCP log record is flushed to
15222 * disk. We get the LSN of the UNDO_LCP record from DBLQH.
15223 *
15224 * When we sync the pages we will ensure that any writes will also
15225 * sync the UNDO log to the proper point. So we need not worry about
15226 * losing any UNDO log records as long as we sync the page cache for
15227 * a fragment as part of LCP processing. This is called the
15228 * WAL rule.
15229 *
15230 * Sync:ing the extent pages will write all dirty extent pages, so no
15231 * special phase is needed to write those at the end of all fragment
15232 * LCPs.
15233 *
15234 *
15235 * Sync:ing happens in two stages
15236 * The first stage is syncing all data pages in the PGMAN which executes
15237 * in the same thread as we do. This goes through the list of dirty pages
15238 * on the fragment and sync's them one by one with potential throttling of
15239 * write speed here.
15240 *
15241 * The second stage is synching the extent pages. This always happens in
15242 * the PGMAN proxy block that takes care of the extent pages. Here we
15243 * sync all extent pages that are dirty for each fragment checkpoint. The
15244 * reason is that one extent page is shared by many fragments, also the
15245 * extent pages are only updated when we allocate a new page, allocate a
15246 * new extent or free an extent (only happens at drop table). So normally
15247 * we should only dirty a page when adding another page to a fragment.
15248 * Also many of those writes will usually occur on the same fragment and
15249 * thus the number of writes on those pages will only be high when there
15250 * is high insert activity into the database. Also each extent page covers
15251 * about 1.3 GByte of disk space. So even with 10 TByte of disk space we
15252 * only have a total of 7000 extent pages. So the activity on writing those
15253 * to disk cannot be very high.
15254 *
15255 * By sync:ing data pages and extent pages after writing the main memory
15256 * part of the fragment to disk we are sure that we can recover using this
15257 * fragment LCP. After this we are ready to write the control files for
15258 * this LCP. The LCP is still not 100% ready to use, it still will have
15259 * to wait until the global checkpoint is completed of its highest GCI
15260 * that was written as part of the checkpoint.
15261 *
15262 * As explained in another place it is actually only necessary to sync
15263 * the extent pages for the first fragment containing disk data and
15264 * also at the end of the local checkpoint.
15265 *
15266 * We don't need to wait for this however since the restart will check
15267 * that we don't recover an LCP which has more recent GCI's than we are
15268 * to restore. We must however wait with deleting the old LCP control
15269 * file and data files until we have seen the GCI being completed that
15270 * we wait for.
15271 *
15272 * The localisation of LCP handling and immediate removal of old LCPs
15273 * means that we can no longer restore any older GCPs than the last
15274 * completed one. If a requirement comes up for this it is fairly
15275 * straightforward to add this feature. What is needed is that we wait
15276 * for yet some more time before deleting an old LCP. If we e.g. want
15277 * to support restoring up to 100 GCI's back from the last completed
15278 * than we have to wait for 100 GCI's after completing the one we waited
15279 * for before we can remove the old LCP files. This might require us to
15280 * maintain many LCP control files. One could handle this by ensuring
15281 * that new LCPs aren't started so fast in this case.
15282 *
15283 * However most likely there are better options to restore old versions
15284 * of the database by using backups.
15285 */
15286
15287 ptr.p->m_wait_data_file_close = true;
15288 ptr.p->m_wait_disk_data_sync = true;
15289 ptr.p->m_wait_sync_extent = true;
15290 ptr.p->m_disk_data_exist = false;
15291
15292 if (ptr.p->m_current_lcp_lsn == Uint64(0))
15293 {
15294 /**
15295 * No entry in log file group created, thus table isn't a disk data
15296 * table. So we can safely ignore going to PGMAN to sync data pages.
15297 */
15298 jam();
15299 ptr.p->m_wait_disk_data_sync = false;
15300 if (ptr.p->m_first_fragment)
15301 {
15302 jam();
15303 send_firstSYNC_EXTENT_PAGES_REQ(signal, ptr);
15304 return;
15305 }
15306 ptr.p->m_wait_sync_extent = false;
15307 lcp_write_ctl_file(signal, ptr);
15308 return;
15309 }
15310 BlockReference ref = numberToRef(PGMAN, instance(), getOwnNodeId());
15311 TablePtr tabPtr;
15312 FragmentPtr fragPtr;
15313 ptr.p->tables.first(tabPtr);
15314 tabPtr.p->fragments.getPtr(fragPtr, 0);
15315 ptr.p->m_num_sync_pages_waiting = Uint32(~0);
15316 ptr.p->m_start_sync_op = getHighResTimer();
15317
15318 SyncPageCacheReq *sync_req = (SyncPageCacheReq*)signal->getDataPtrSend();
15319 sync_req->senderData = ptr.i;
15320 sync_req->senderRef = reference();
15321 sync_req->tableId = tabPtr.p->tableId;
15322 sync_req->fragmentId = fragPtr.p->fragmentId;
15323 sendSignal(ref, GSN_SYNC_PAGE_CACHE_REQ, signal,
15324 SyncPageCacheReq::SignalLength, JBB);
15325 }
15326
15327 void
execSYNC_PAGE_WAIT_REP(Signal * signal)15328 Backup::execSYNC_PAGE_WAIT_REP(Signal *signal)
15329 {
15330 jamEntry();
15331 BackupRecordPtr ptr;
15332 c_backupPool.getPtr(ptr, signal->theData[0]);
15333 if (ptr.p->m_wait_disk_data_sync)
15334 {
15335 jam();
15336 ptr.p->m_num_sync_pages_waiting = signal->theData[1];
15337 }
15338 else if (ptr.p->m_wait_sync_extent ||
15339 ptr.p->m_wait_final_sync_extent)
15340 {
15341 jam();
15342 ptr.p->m_num_sync_extent_pages_written = signal->theData[1];
15343 }
15344 else
15345 {
15346 ndbabort();
15347 }
15348 }
15349
15350 void
execSYNC_PAGE_CACHE_CONF(Signal * signal)15351 Backup::execSYNC_PAGE_CACHE_CONF(Signal *signal)
15352 {
15353 SyncPageCacheConf *conf = (SyncPageCacheConf*)signal->getDataPtr();
15354 BackupRecordPtr ptr;
15355 TablePtr tabPtr;
15356 FragmentPtr fragPtr;
15357 jamEntry();
15358
15359 c_backupPool.getPtr(ptr, conf->senderData);
15360 ptr.p->m_num_sync_pages_waiting = 0;
15361 ptr.p->tables.first(tabPtr);
15362 tabPtr.p->fragments.getPtr(fragPtr, 0);
15363 ndbrequire(conf->tableId == tabPtr.p->tableId);
15364 ndbrequire(conf->fragmentId == fragPtr.p->fragmentId);
15365
15366 NDB_TICKS now = getHighResTimer();
15367 Uint64 elapsed_us = NdbTick_Elapsed(ptr.p->m_start_sync_op, now).microSec();
15368 m_current_dd_time_us += elapsed_us;
15369
15370 DEB_LCP_DD(("(%u)Completed SYNC_PAGE_CACHE_CONF for tab(%u,%u)"
15371 ", diskDataExistFlag: %u",
15372 instance(),
15373 tabPtr.p->tableId,
15374 fragPtr.p->fragmentId,
15375 conf->diskDataExistFlag));
15376
15377 ptr.p->m_wait_disk_data_sync = false;
15378 if (conf->diskDataExistFlag)
15379 {
15380 jam();
15381 ptr.p->m_disk_data_exist = true;
15382 }
15383 if (!ptr.p->m_first_fragment)
15384 {
15385 jam();
15386 ptr.p->m_wait_sync_extent = false;
15387 lcp_write_ctl_file(signal, ptr);
15388 return;
15389 }
15390 send_firstSYNC_EXTENT_PAGES_REQ(signal, ptr);
15391 }
15392
15393 void
send_firstSYNC_EXTENT_PAGES_REQ(Signal * signal,BackupRecordPtr ptr)15394 Backup::send_firstSYNC_EXTENT_PAGES_REQ(Signal *signal,
15395 BackupRecordPtr ptr)
15396 {
15397 ptr.p->m_num_sync_extent_pages_written = Uint32(~0);
15398 ptr.p->m_start_sync_op = getHighResTimer();
15399 /**
15400 * Sync extent pages, this is sent to Proxy block that routes the signal to
15401 * the "extra" PGMAN worker that handles the extent pages.
15402 */
15403 SyncExtentPagesReq *req = (SyncExtentPagesReq*)signal->getDataPtrSend();
15404 req->senderData = ptr.i;
15405 req->senderRef = reference();
15406 req->lcpOrder = SyncExtentPagesReq::FIRST_LCP;
15407 ptr.p->m_first_fragment = false;
15408 sendSignal(PGMAN_REF, GSN_SYNC_EXTENT_PAGES_REQ, signal,
15409 SyncExtentPagesReq::SignalLength, JBB);
15410 }
15411
15412 void
execSYNC_EXTENT_PAGES_CONF(Signal * signal)15413 Backup::execSYNC_EXTENT_PAGES_CONF(Signal *signal)
15414 {
15415 SyncExtentPagesConf *conf = (SyncExtentPagesConf*)signal->getDataPtr();
15416 BackupRecordPtr ptr;
15417 jamEntry();
15418
15419 c_backupPool.getPtr(ptr, conf->senderData);
15420 ptr.p->m_num_sync_extent_pages_written = 0;
15421
15422 NDB_TICKS now = getHighResTimer();
15423 Uint64 elapsed_us = NdbTick_Elapsed(ptr.p->m_start_sync_op, now).microSec();
15424 m_current_dd_time_us += elapsed_us;
15425
15426 if (ptr.p->slaveState.getState() == DEFINED)
15427 {
15428 jam();
15429 finish_end_lcp(signal, ptr);
15430 return;
15431 }
15432 ndbrequire(ptr.p->slaveState.getState() == STOPPING);
15433 ptr.p->m_wait_sync_extent = false;
15434 lcp_write_ctl_file(signal, ptr);
15435 }
15436
15437 /**
15438 * A file has been closed as part of LCP completion processing
15439 * for a fragment.
15440 */
void
Backup::lcp_close_data_file_conf(Signal* signal, BackupRecordPtr ptr)
{
  jam();
  /**
   * We could have completed only 1 part of this fragment LCP.
   * Check for this and start up next part.
   */
  if (ptr.p->m_empty_lcp)
  {
    jam();
    finalize_lcp_processing(signal, ptr);
    return;
  }
  /* Data file close was one of the events we waited for; record it
   * and try to proceed with writing the LCP control file (which is
   * a no-op while the sync events are still outstanding). */
  ndbrequire(ptr.p->m_wait_data_file_close);
  ptr.p->m_wait_data_file_close = false;
  lcp_write_ctl_file(signal, ptr);
}
15459
15460 Uint32
lcp_pre_sync_lsn(BackupRecordPtr ptr)15461 Backup::lcp_pre_sync_lsn(BackupRecordPtr ptr)
15462 {
15463 Uint32 valid_flag = 1;
15464 if (ptr.p->m_disk_data_exist)
15465 {
15466 jam();
15467 Uint64 sync_lsn;
15468 {
15469 Logfile_client lgman(this, c_lgman, 0);
15470 sync_lsn = lgman.pre_sync_lsn(ptr.p->m_current_lcp_lsn);
15471 }
15472 if (sync_lsn < ptr.p->m_current_lcp_lsn)
15473 {
15474 jam();
15475 /**
15476 * LSN for UNDO log record of this LCP haven't been sync:ed to disk
15477 * yet. We will still write the LCP control file, but we will write
15478 * it with an invalid indicator. Later before deleting the LCP data
15479 * files we will ensure that the LSN is sync:ed by calling sync_lsn.
15480 * We will actually call it with LSN = 0 then since the LSN we called
15481 * with here has been recorded already in LGMAN. So there is no need
15482 * to remember the individual LSNs for individual fragments. When we
15483 * call sync_lsn we will ensure that all fragment LCPs already handled
15484 * before will be sync:ed to disk.
15485 */
15486 valid_flag = 0;
15487 }
15488 }
15489 else
15490 {
15491 jam();
15492 }
15493 DEB_LCP(("(%u)Writing first with ValidFlag = %u", instance(), valid_flag));
15494 return valid_flag;
15495 }
15496
/**
 * Write the LCP control file for the current fragment LCP.
 * Called once all three wait conditions (data file closed, extent
 * pages synched, disk data synched) have been cleared; earlier calls
 * simply return. Performs a sanity check of the row counts against
 * the data file, determines the ValidFlag via lcp_pre_sync_lsn and
 * fills in the control-file page before writing it to disk.
 */
void
Backup::lcp_write_ctl_file(Signal *signal, BackupRecordPtr ptr)
{
  /* Wait until all parallel completion events have arrived. */
  if (ptr.p->m_wait_data_file_close ||
      ptr.p->m_wait_sync_extent ||
      ptr.p->m_wait_disk_data_sync)
  {
    jam();
    return;
  }

  /**
   * Ensure that we didn't find more rows in LCP than what was
   * in fragment at start of LCP.
   *
   * If we run a full LCP we should always find as many rows as was
   * present in the row count at the start of the LCP.
   * If we run a partial LCP we should never find more rows in this
   * LCP file than was present at the start of the LCP, this is the
   * sum of rows from ALL pages and changed rows in CHANGE pages.
   *
   * This check is important such that we find inconsistencies as
   * soon as they occur, rather than at the time when we recover
   * when it is very difficult to trace back the source of the
   * problem.
   *
   * Error means that the table was dropped during LCP and in this
   * case these numbers are not consistent, we're simply closing
   * the LCP scan in an orderly manner with no rows read. So we
   * should not crash in this case.
   *
   * We wait until we come here to check the numbers, this means
   * that the data file exists when we crash and can be used for
   * analysis.
   */
  {
    BackupFilePtr dataFilePtr;
    c_backupFilePool.getPtr(dataFilePtr,
                            ptr.p->dataFilePtr[0]);
    if (!(ptr.p->m_save_error_code != 0 ||
          ptr.p->m_row_count == dataFilePtr.p->m_lcp_inserts ||
          ((ptr.p->m_num_parts_in_this_lcp !=
            BackupFormat::NDB_MAX_LCP_PARTS) &&
           (ptr.p->m_row_count >=
            (dataFilePtr.p->m_lcp_inserts +
             dataFilePtr.p->m_lcp_writes)))))
    {
      /* Log the inconsistent numbers before crashing below. */
      g_eventLogger->info("errCode = %u, row_count = %llu, inserts: %llu"
                          ", writes: %llu, parts: %u",
                          ptr.p->m_save_error_code,
                          ptr.p->m_row_count,
                          dataFilePtr.p->m_lcp_inserts,
                          dataFilePtr.p->m_lcp_writes,
                          ptr.p->m_num_parts_in_this_lcp);
      print_extended_lcp_stat();
      ndbrequire(ptr.p->m_save_error_code != 0 ||
                 ptr.p->m_row_count == dataFilePtr.p->m_lcp_inserts ||
                 ((ptr.p->m_num_parts_in_this_lcp != BackupFormat::NDB_MAX_LCP_PARTS) &&
                  (ptr.p->m_row_count >=
                   (dataFilePtr.p->m_lcp_inserts + dataFilePtr.p->m_lcp_writes))));
    }
  }

  /* 1 if the UNDO LSN is already safe on disk, otherwise 0. */
  Uint32 valid_flag = lcp_pre_sync_lsn(ptr);

  /**
   * This function prepares the page for the LCP Control file data
   * and ensures checksum is correct, values are written in network
   * byte order when appropriate.
   *
   * As soon as this file is properly written to disk, it can be used
   * in restarts. The restart code will ensure that the GCI is restored
   * which this LCP cannot roll back from.
   */

  BackupFilePtr filePtr;
  Page32Ptr pagePtr;

  jam();
  ptr.p->m_lcp_lsn_synced = valid_flag;
  c_backupFilePool.getPtr(filePtr, ptr.p->ctlFilePtr);
  filePtr.p->pages.getPtr(pagePtr, 0);
  struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
    (struct BackupFormat::LCPCtlFile*)pagePtr.p;

  /* Standard backup file header; BackupId/keys unused for LCP files. */
  memcpy(lcpCtlFilePtr->fileHeader.Magic, BACKUP_MAGIC, 8);
  lcpCtlFilePtr->fileHeader.BackupVersion = NDBD_USE_PARTIAL_LCP_v2;

  const Uint32 sz = sizeof(BackupFormat::FileHeader) >> 2;
  lcpCtlFilePtr->fileHeader.SectionType = BackupFormat::FILE_HEADER;
  lcpCtlFilePtr->fileHeader.SectionLength = sz - 3;
  lcpCtlFilePtr->fileHeader.FileType = BackupFormat::LCP_CTL_FILE;
  lcpCtlFilePtr->fileHeader.BackupId = 0;
  lcpCtlFilePtr->fileHeader.BackupKey_0 = 0;
  lcpCtlFilePtr->fileHeader.BackupKey_1 = 0;
  lcpCtlFilePtr->fileHeader.ByteOrder = 0x12345678;
  lcpCtlFilePtr->fileHeader.NdbVersion = NDB_VERSION_D;
  lcpCtlFilePtr->fileHeader.MySQLVersion = NDB_MYSQL_VERSION_D;

  lcpCtlFilePtr->ValidFlag = valid_flag;

  TablePtr tabPtr;
  FragmentPtr fragPtr;
  ptr.p->tables.first(tabPtr);
  tabPtr.p->fragments.getPtr(fragPtr, 0);

  lcpCtlFilePtr->TableId = tabPtr.p->tableId;
  lcpCtlFilePtr->FragmentId = fragPtr.p->fragmentId;
  lcpCtlFilePtr->CreateTableVersion =
    c_lqh->getCreateSchemaVersion(tabPtr.p->tableId);

  Uint32 maxCompletedGci;
  c_lqh->lcp_max_completed_gci(maxCompletedGci,
                               ptr.p->newestGci,
                               m_newestRestorableGci);
  lcpCtlFilePtr->CreateGci = fragPtr.p->createGci;
  lcpCtlFilePtr->MaxGciCompleted = maxCompletedGci;
  lcpCtlFilePtr->MaxGciWritten = ptr.p->newestGci;

  /* Old LCP files may only be deleted once this GCI is restorable. */
  ptr.p->m_wait_gci_to_delete = MAX(maxCompletedGci, ptr.p->newestGci);

  ndbrequire(m_newestRestorableGci != 0);
  DEB_LCP(("(%u)tab(%u,%u).%u, use ctl file %u, GCI completed: %u,"
           " GCI written: %u, createGci: %u",
           instance(),
           lcpCtlFilePtr->TableId,
           lcpCtlFilePtr->FragmentId,
           lcpCtlFilePtr->CreateTableVersion,
           (ptr.p->deleteCtlFileNumber == 0 ? 1 : 0),
           lcpCtlFilePtr->MaxGciCompleted,
           lcpCtlFilePtr->MaxGciWritten,
           lcpCtlFilePtr->CreateGci));
  ndbrequire((lcpCtlFilePtr->MaxGciWritten + 1) >= fragPtr.p->createGci);
  /**
   * LcpId and LocalLcpId was set in prepare phase.
   */
  if (lcpCtlFilePtr->LocalLcpId != c_lqh->get_current_local_lcp_id())
  {
    /* Log the mismatch before the ndbrequire below crashes. */
    g_eventLogger->info("(%u)LocalLcpId: %u, local_lcp_id: %u",
                        instance(),
                        lcpCtlFilePtr->LocalLcpId,
                        c_lqh->get_current_local_lcp_id());
  }
  ndbrequire(lcpCtlFilePtr->LocalLcpId == c_lqh->get_current_local_lcp_id());
  lcpCtlFilePtr->MaxPageCount = ptr.p->m_lcp_max_page_cnt;
  lcpCtlFilePtr->LastDataFileNumber = ptr.p->m_last_data_file_number;
  lcpCtlFilePtr->MaxNumberDataFiles =
    BackupFormat::NDB_MAX_LCP_FILES;
  lcpCtlFilePtr->NumPartPairs = ptr.p->m_num_parts_in_lcp;
  lcpCtlFilePtr->MaxPartPairs = BackupFormat::NDB_MAX_LCP_PARTS;
  lcpCtlFilePtr->RowCountLow = Uint32(ptr.p->m_row_count & 0xFFFFFFFF);
  lcpCtlFilePtr->RowCountHigh = Uint32(ptr.p->m_row_count >> 32);

  /* Copy the (part start, number of parts) pairs of this LCP. */
  for (Uint32 i = 0; i < ptr.p->m_num_parts_in_lcp; i++)
  {
    jam();
    lcpCtlFilePtr->partPairs[i] = ptr.p->m_part_info[i];
  }

  /**
   * Since we calculated checksum with bytes in network order we will write it
   * without setting it in network order, this will ensure that the XOR will
   * be over the same bits as here.
   */
  lcp_write_ctl_file_to_disk(signal, filePtr, pagePtr);
}
15663
15664 void
lcp_write_ctl_file_to_disk(Signal * signal,BackupFilePtr filePtr,Page32Ptr pagePtr)15665 Backup::lcp_write_ctl_file_to_disk(Signal *signal,
15666 BackupFilePtr filePtr,
15667 Page32Ptr pagePtr)
15668 {
15669 /**
15670 * If file size becomes bigger than 4096 bytes we need to write
15671 * 8192 bytes instead. Currently the header parts are 108 bytes,
15672 * each part consumes 3 bytes, this means that we can fit
15673 * (4096 - 108) / 3 parts in 4096 bytes == 1329 parts.
15674 * Maximum number of parts is currently 2048, thus we can
15675 * always fit in 8192 bytes. We use multiples of 4096 bytes
15676 * to fit well with disk devices, no need to complicate
15677 * file management with lots of different file sizes.
15678 */
15679 struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
15680 (struct BackupFormat::LCPCtlFile*)pagePtr.p;
15681 Uint32 num_parts = lcpCtlFilePtr->NumPartPairs;
15682 Uint32 file_size = LCP_CTL_FILE_HEADER_SIZE +
15683 (3 * num_parts + 3);
15684 if (file_size > BackupFormat::NDB_LCP_CTL_FILE_SIZE_SMALL)
15685 {
15686 jam();
15687 DEB_LCP(("(%u)Writing 8192 byte control file", instance()));
15688 file_size = BackupFormat::NDB_LCP_CTL_FILE_SIZE_BIG;
15689 }
15690 else
15691 {
15692 jam();
15693 file_size = BackupFormat::NDB_LCP_CTL_FILE_SIZE_SMALL;
15694 }
15695 convert_ctl_page_to_network((Uint32*)pagePtr.p, file_size);
15696 filePtr.p->m_flags |= BackupFile::BF_WRITING;
15697 FsReadWriteReq* req = (FsReadWriteReq*)signal->getDataPtrSend();
15698 req->userPointer = filePtr.i;
15699 req->filePointer = filePtr.p->filePointer;
15700 req->userReference = reference();
15701 req->varIndex = 0;
15702 req->numberOfPages = 1;
15703 req->operationFlag = 0;
15704 FsReadWriteReq::setFormatFlag(req->operationFlag,
15705 FsReadWriteReq::fsFormatMemAddress);
15706 FsReadWriteReq::setSyncFlag(req->operationFlag, 1);
15707
15708 Uint32 mem_offset = Uint32((char*)pagePtr.p - (char*)c_startOfPages);
15709 req->data.memoryAddress.memoryOffset = mem_offset;
15710 req->data.memoryAddress.fileOffset = 0;
15711 req->data.memoryAddress.size = file_size;
15712
15713 sendSignal(NDBFS_REF, GSN_FSWRITEREQ, signal,
15714 FsReadWriteReq::FixedLength + 3, JBA);
15715 }
15716
void
Backup::execFSWRITEREF(Signal *signal)
{
  /* A failed write of an LCP control file cannot be handled;
   * crash the node so recovery uses the previous LCP. */
  ndbabort();
}
15722
15723 void
execFSWRITECONF(Signal * signal)15724 Backup::execFSWRITECONF(Signal *signal)
15725 {
15726 BackupRecordPtr ptr;
15727 BackupFilePtr filePtr;
15728 FsConf * conf = (FsConf *)signal->getDataPtr();
15729 const Uint32 userPtr = conf->userPointer;
15730 jamEntry();
15731
15732 c_backupFilePool.getPtr(filePtr, userPtr);
15733 ndbrequire((filePtr.p->m_flags & BackupFile::BF_WRITING) != 0);
15734 filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_WRITING;
15735 c_backupPool.getPtr(ptr, filePtr.p->backupPtr);
15736
15737 if (ptr.p->ctlFilePtr == filePtr.i)
15738 {
15739 jam();
15740 closeFile(signal, ptr, filePtr);
15741 return;
15742 }
15743 else if (ptr.p->deleteFilePtr == filePtr.i)
15744 {
15745 jam();
15746 lcp_update_ctl_file_for_rewrite_done(signal, ptr, filePtr);
15747 return;
15748 }
15749 ndbabort();
15750 }
15751
/**
 * Final step of a fragment LCP: queue deletion of the previous LCP's
 * files (where needed), reset the slave state and report completion
 * back to LQH with BACKUP_FRAGMENT_CONF (or BACKUP_FRAGMENT_REF on
 * scan error).
 */
void
Backup::finalize_lcp_processing(Signal *signal, BackupRecordPtr ptr)
{
  TablePtr tabPtr;
  FragmentPtr fragPtr;
  BackupFilePtr filePtr;

  /* For empty LCPs several operations run in parallel; wait until
   * the last one reaches this point. */
  if (ptr.p->m_empty_lcp)
  {
    jam();
    ndbrequire(ptr.p->m_outstanding_operations > 0);
    ptr.p->m_outstanding_operations--;
    if (ptr.p->m_outstanding_operations > 0)
    {
      jam();
      return;
    }
  }
  c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[0]);
  ndbrequire(ptr.p->tables.first(tabPtr));
  Uint32 tableId = tabPtr.p->tableId;

  tabPtr.p->fragments.getPtr(fragPtr, 0);
  Uint32 fragmentId = fragPtr.p->fragmentId;

  if (ptr.p->errorCode != 0)
  {
    jam();
    ndbout_c("Fatal : LCP Frag scan failed with error %u"
             " file error is: %d",
             ptr.p->errorCode,
             filePtr.p->errorCode);
    ndbrequire(filePtr.p->errorCode == ptr.p->errorCode);

    if ((filePtr.p->m_flags & BackupFile::BF_SCAN_THREAD) == 0)
    {
      jam();
      /* No active scan thread to 'find' the file error.
       * Scan is closed, so let's send backupFragmentRef
       * back to LQH now...
       */
      backupFragmentRef(signal, filePtr);
      return;
    }
    ndbabort();
  }

  /**
   * We're fully done with everything related to the LCP of this fragment.
   * Report this back to LQH such that LQH can order the start of a new
   * LCP on a new fragment when it is ready to do so.
   */
  if (ptr.p->deleteDataFileNumber != RNIL ||
      ptr.p->deleteCtlFileNumber != RNIL ||
      !ptr.p->m_lcp_lsn_synced)
  {
    /**
     * We insert a record into the list for files to delete that will ensure
     * that we will delete old LCP files as soon as possible.
     * If deleteDataFileNumber is RNIL it means that this was the very first
     * LCP on this fragment, so no need to delete any old files. It could
     * also be an LCP that retains all files from the old LCP, but we might
     * still need to delete a control file.
     *
     * We wait an extra GCP before we delete the old LCP files. The reason is
     * to avoid calling sync_lsn unnecessarily often. Calling sync_lsn will
     * remove log space (up to one log page) each time it is called and it
     * needs to sync the LSN on the current page.
     */
    jam();
    DeleteLcpFilePtr deleteLcpFilePtr;
    ndbrequire(c_deleteLcpFilePool.seize(deleteLcpFilePtr));
    LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                  m_delete_lcp_file_head);

    Uint32 wait_for_gci = ptr.p->m_wait_gci_to_delete;
    if (m_our_node_started)
    {
      jam();
      wait_for_gci++;
    }
    bool ready_for_delete = (wait_for_gci <= m_newestRestorableGci);
    /* Delete the contiguous range of data files replaced by this LCP. */
    Uint32 lastDeleteFileNumber= get_file_add(ptr.p->deleteDataFileNumber,
                                              (ptr.p->m_lcp_remove_files - 1));
    deleteLcpFilePtr.p->tableId = tableId;
    deleteLcpFilePtr.p->fragmentId = fragmentId;
    deleteLcpFilePtr.p->firstFileId = ptr.p->deleteDataFileNumber;
    deleteLcpFilePtr.p->lastFileId = lastDeleteFileNumber;
    deleteLcpFilePtr.p->waitCompletedGci = wait_for_gci;
    deleteLcpFilePtr.p->lcpCtlFileNumber = ptr.p->deleteCtlFileNumber;
    deleteLcpFilePtr.p->validFlag = ptr.p->m_lcp_lsn_synced;
    deleteLcpFilePtr.p->lcpLsn = ptr.p->m_current_lcp_lsn;
#ifdef DEBUG_LCP
    if (deleteLcpFilePtr.p->firstFileId != RNIL)
    {
      DEB_LCP(("(%u)TAGI Insert delete file in queue:"
               " tab(%u,%u).%u, file(%u-%u,%u) GCI: %u, validFlag: %u",
               instance(),
               tableId,
               fragmentId,
               c_lqh->getCreateSchemaVersion(tableId),
               deleteLcpFilePtr.p->firstFileId,
               deleteLcpFilePtr.p->lastFileId,
               ptr.p->deleteCtlFileNumber,
               ptr.p->m_wait_gci_to_delete,
               ptr.p->m_lcp_lsn_synced));
    }
    else
    {
      DEB_LCP(("(%u)TAGI Insert delete file in queue:"
               " tab(%u,%u).%u, file(RNIL,%u) GCI: %u, validFlag: %u",
               instance(),
               tableId,
               fragmentId,
               c_lqh->getCreateSchemaVersion(tableId),
               ptr.p->deleteCtlFileNumber,
               ptr.p->m_wait_gci_to_delete,
               ptr.p->m_lcp_lsn_synced));
    }
#endif

    if (ready_for_delete)
    {
      /**
       * Add first to delete processing queue since it is already ready for
       * deletion.
       */
      jam();
      queue.addFirst(deleteLcpFilePtr);
    }
    else
    {
      jam();
      queue.addLast(deleteLcpFilePtr);
    }
    /* Kick off delete processing unless it is already running or
     * still has to wait for the GCI. */
    if (!m_delete_lcp_files_ongoing && ready_for_delete)
    {
      jam();
      m_delete_lcp_files_ongoing = true;
      signal->theData[0] = BackupContinueB::ZDELETE_LCP_FILE;
      signal->theData[1] = ptr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    }
  }

  ptr.p->errorCode = 0;
  ptr.p->slaveState.forceState(DEFINED);
  check_empty_queue_waiters(signal, ptr);

  BackupFragmentConf * conf = (BackupFragmentConf*)signal->getDataPtrSend();
  conf->backupId = ptr.p->backupId;
  conf->backupPtr = ptr.i;
  conf->tableId = tableId;
  conf->fragmentNo = fragmentId;
  conf->noOfRecordsLow = (ptr.p->noOfRecords & 0xFFFFFFFF);
  conf->noOfRecordsHigh = (ptr.p->noOfRecords >> 32);
  conf->noOfBytesLow = (ptr.p->noOfBytes & 0xFFFFFFFF);
  conf->noOfBytesHigh = (ptr.p->noOfBytes >> 32);
  if (ptr.p->m_empty_lcp)
  {
    jam();
    /**
     * Slow down things a bit for empty LCPs to avoid that we use too much
     * CPU for idle LCP processing. This tends to get a bit bursty and can
     * affect traffic performance for short times.
     */
    sendSignalWithDelay(ptr.p->masterRef, GSN_BACKUP_FRAGMENT_CONF, signal,
                        1, BackupFragmentConf::SignalLength);
  }
  else
  {
    jam();
    sendSignal(ptr.p->masterRef, GSN_BACKUP_FRAGMENT_CONF, signal,
               BackupFragmentConf::SignalLength, JBA);
  }
}
15928
/**
 * DIH/LQH reports a new restorable GCI. Record it and, if delete-file
 * processing is idle, restart it since queued old-LCP file deletions
 * may now have their wait-GCI satisfied.
 */
void
Backup::execRESTORABLE_GCI_REP(Signal *signal)
{
  Uint32 restorable_gci = signal->theData[0];
  /**
   * LQH has a more up-to-date view of the node state so use LQHs version
   * of the node state rather than our own.
   */
  if (c_lqh->getNodeState().startLevel >= NodeState::SL_STOPPING_4)
  {
    jam();
    DEB_LCP(("(%u)Ignore RESTORABLE_GCI_REP: %u in SL_STOPPING_4",
             instance(),
             restorable_gci));
    return;
  }
  if (restorable_gci > m_newestRestorableGci)
  {
    jam();
    m_newestRestorableGci = restorable_gci;
  }
  else
  {
    jam();
    DEB_LCP(("(%u)Already received this restorable gci: %u",
             instance(),
             restorable_gci));
    return;
  }
#ifdef DEBUG_LCP_DEL_FILES
  /* waitGCI is only referenced by the DEB_LCP_DEL_FILES macro below,
   * which compiles away together with this block when the define is
   * not set. */
  DeleteLcpFilePtr deleteLcpFilePtr;
  LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                m_delete_lcp_file_head);
  queue.first(deleteLcpFilePtr);
  Uint32 waitGCI = (deleteLcpFilePtr.i != RNIL) ?
                    deleteLcpFilePtr.p->waitCompletedGci : 0;
#endif
  if (m_delete_lcp_files_ongoing)
  {
    jam();
    DEB_LCP_DEL_FILES(("(%u)TAGX Completed GCI: %u (delete files ongoing)"
                       ", waitGCI: %u",
                       instance(),
                       m_newestRestorableGci,
                       waitGCI));
    return;
  }
  jam();
  DEB_LCP_DEL_FILES(("(%u)TAGX Completed GCI: %u (delete files not ongoing)"
                     ", waitGCI: %u",
                     instance(),
                     m_newestRestorableGci,
                     waitGCI));
  m_delete_lcp_files_ongoing = true;
  delete_lcp_file_processing(signal);
  return;
}
15986
/**
 * Process the head of the queue of old LCP files to delete.
 * Stops (clearing m_delete_lcp_files_ongoing) when the queue is empty
 * or the head must still wait for its GCI to become restorable.
 * For a head whose control file was written invalid, first syncs the
 * UNDO LSN; otherwise proceeds directly with the rewrite/delete chain.
 */
void
Backup::delete_lcp_file_processing(Signal *signal)
{
  BackupRecordPtr ptr;
  DeleteLcpFilePtr deleteLcpFilePtr;

  /* No LCP record active: nothing to delete. */
  if (m_lcp_ptr.i == RNIL)
  {
    jam();
    m_delete_lcp_files_ongoing = false;
    return;
  }
  ptr = m_lcp_ptr;
  ndbrequire(m_delete_lcp_files_ongoing);

  LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                m_delete_lcp_file_head);
  if (queue.isEmpty())
  {
    jam();
    ndbrequire(!ptr.p->m_wait_end_lcp);
    m_delete_lcp_files_ongoing = false;
    if (ptr.p->prepareState == PREPARE_DROP)
    {
      jam();
      /**
       * We use this route when we find the obscure case of
       * finding LCP files belonging to an already dropped table.
       * We keep the code simple here and even wait until the
       * queue is completely empty also for this special case to
       * avoid any unnecessary checks. We then proceed with normal
       * LCP_PREPARE_REQ handling for this case.
       */
      ptr.p->prepareState = PREPARE_READ_CTL_FILES;
      DEB_LCP(("(%u)TAGT Completed wait delete files for drop case",
               instance()));
      lcp_open_ctl_file(signal, ptr, 0);
      lcp_open_ctl_file(signal, ptr, 1);
      return;
    }
    DEB_LCP_DEL_FILES(("(%u)TAGB Completed delete files,"
                       " queue empty, no LCP wait",
                       instance()));
    return;
  }
  queue.first(deleteLcpFilePtr);
  /* Head not yet restorable: pause until the GCI report wakes us. */
  if (deleteLcpFilePtr.p->waitCompletedGci > m_newestRestorableGci)
  {
    jam();
    DEB_LCP(("(%u)TAGW Wait for completed GCI: %u",
             instance(),
             deleteLcpFilePtr.p->waitCompletedGci));
    m_delete_lcp_files_ongoing = false;
    return;
  }
  /* The delete record is ready for deletion process to start. */
  ptr.p->currentDeleteLcpFile = deleteLcpFilePtr.i;
  /* Control file was written invalid: sync the UNDO LSN first so it
   * can be rewritten as valid before old files are removed. */
  if (deleteLcpFilePtr.p->validFlag == 0)
  {
    jam();
    sync_log_lcp_lsn(signal, deleteLcpFilePtr, ptr.i);
    return;
  }
  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, ptr.p->deleteFilePtr);
  lcp_close_ctl_file_for_rewrite_done(signal, ptr, filePtr);
}
16054
16055 /**
16056 * This segment of code does a rewrite of the LCP control file.
16057 * The LCP control file was written with the valid flag set to
16058 * to 0. This indicates to the restore block that the LCP control
16059 * file isn't safe to use.
16060 *
16061 * Before the old LCP control file is deleted we must ensure that
16062 * the new LCP control file is ready to use by setting the validFlag
16063 * to 1.
16064 *
16065 * The validFlag can however only be set to 1 if we are sure that
16066 * the LSN of our UNDO log record for this fragment LCP has been
16067 * flushed to disk. This is done by calling sync_lsn.
16068 *
16069 * Calling sync_lsn for each fragment is not a good solution since
16070 * each such call can cause one page of UNDO log space to be wasted.
16071 * So to ensure that we minimize the amount of wasted log space we
16072 * instead wait for the GCI to be completed before we call sync_lsn.
16073 * To ensure that we pack as many sync_lsn into one sync_lsn as
16074 * possible we call pre_sync_lsn earlier in the LCP process.
16075 *
16076 * So the idea is that as much as possible we will wait for the
16077 * LSN to be flushed by someone else, if no one has done that job
16078 * after almost 2 GCPs we will do it ourselves. If we do it ourselves
16079 * we will also ensure that all LSNs of calls to pre_sync_lsn will
16080 * be flushed to disk in the same go.
16081 *
16082 * If we find that pre_sync_lsn call indicates that our LSN has already
16083 * been flushed to disk we can avoid this extra round of read and write
16084 * of the LCP control file. We also don't need it for tables without
16085 * disk data columns.
16086 *
16087 * After sync:ing the UNDO LSN we will read the LCP control file,
16088 * set the ValidFlag in the LCP control file and write it again
16089 * and finally close it.
16090 *
16091 * Then we will continue deleting the old data files and old
16092 * LCP control file.
16093 */
void
Backup::sync_log_lcp_lsn(Signal *signal,
                         DeleteLcpFilePtr deleteLcpFilePtr,
                         Uint32 ptrI)
{
  /**
   * Request LGMAN to flush the UNDO log up to the LSN recorded for this
   * fragment LCP. The callback (SYNC_LOG_LCP_LSN ->
   * sync_log_lcp_lsn_callback) continues with the control file rewrite
   * once the LSN is durable.
   *
   * ptrI is the BackupRecord index, passed through as callback data.
   */
  Logfile_client::Request req;
  int ret;
  req.m_callback.m_callbackData = ptrI;
  req.m_callback.m_callbackIndex = SYNC_LOG_LCP_LSN;
  {
    /* Scoped: the Logfile_client only needs to live for this one call. */
    Logfile_client lgman(this, c_lgman, 0);
    ret = lgman.sync_lsn(signal, deleteLcpFilePtr.p->lcpLsn, &req, 1);
    jamEntry();
  }
  switch (ret)
  {
  case 0:
  {
    jam();
    /* Sync in progress; LGMAN will execute the callback when done. */
    return;
  }
  case -1:
  {
    g_eventLogger->info("(%u)Failed to Sync LCP lsn", instance());
    ndbabort();
  }
  default:
  {
    jam();
    /* LSN already durable; run the callback immediately ourselves. */
    execute(signal, req.m_callback, 0);
    return;
  }
  }
}
16128
16129 void
sync_log_lcp_lsn_callback(Signal * signal,Uint32 ptrI,Uint32 res)16130 Backup::sync_log_lcp_lsn_callback(Signal *signal, Uint32 ptrI, Uint32 res)
16131 {
16132 BackupRecordPtr ptr;
16133 DeleteLcpFilePtr deleteLcpFilePtr;
16134 jamEntry();
16135 c_backupPool.getPtr(ptr, ptrI);
16136 ndbrequire(res == 0);
16137 c_deleteLcpFilePool.getPtr(deleteLcpFilePtr, ptr.p->currentDeleteLcpFile);
16138 ndbrequire(deleteLcpFilePtr.p->validFlag == 0);
16139 /**
16140 * The LSN have now been sync:ed, now time to read the LCP control file
16141 * again to update the validFlag.
16142 */
16143 lcp_open_ctl_file_for_rewrite(signal, deleteLcpFilePtr, ptr);
16144 }
16145
void
Backup::lcp_open_ctl_file_for_rewrite(Signal *signal,
                                      DeleteLcpFilePtr deleteLcpFilePtr,
                                      BackupRecordPtr ptr)
{
  /**
   * Open (read-write) the LCP control file that survives the delete so
   * that its ValidFlag can be rewritten to 1. Completion arrives as
   * FSOPENCONF and is routed via openFilesReplyLCP to
   * lcp_open_ctl_file_for_rewrite_done.
   */
  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, ptr.p->deleteFilePtr);
  FsOpenReq *req = (FsOpenReq*)signal->getDataPtrSend();

  req->userReference = reference();
  req->fileFlags = FsOpenReq::OM_READWRITE;
  req->userPointer = filePtr.i;

  /* The delete-file record must be idle before starting a new open. */
  ndbrequire(filePtr.p->m_flags == 0);
  filePtr.p->m_flags = BackupFile::BF_OPENING;

  /**
   * We use same table id and fragment id as the one we are about to
   * delete. If we are about to delete LCP control file 0, then we should
   * rewrite LCP control file 1 and vice versa if we are to delete LCP
   * control file 1.
   */
  Uint32 tableId = deleteLcpFilePtr.p->tableId;
  Uint32 fragmentId = deleteLcpFilePtr.p->fragmentId;
  Uint32 lcpNo = (deleteLcpFilePtr.p->lcpCtlFileNumber == 0) ? 1 : 0;

  /* Encode the file name: version-5 naming scheme, .ctl suffix. */
  FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF);
  FsOpenReq::setVersion(req->fileNumber, 5);
  FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_CTL);
  FsOpenReq::v5_setLcpNo(req->fileNumber, lcpNo);
  FsOpenReq::v5_setTableId(req->fileNumber, tableId);
  FsOpenReq::v5_setFragmentId(req->fileNumber, fragmentId);

  sendSignal(NDBFS_REF, GSN_FSOPENREQ, signal, FsOpenReq::SignalLength, JBA);
}
16181
/**
 * FSOPENCONF arrived for the control file opened for rewrite;
 * continue by reading its current contents.
 */
void
Backup::lcp_open_ctl_file_for_rewrite_done(Signal *signal,
                                           BackupFilePtr filePtr)
{
  lcp_read_ctl_file_for_rewrite(signal, filePtr);
}
16188
void
Backup::lcp_read_ctl_file_for_rewrite(Signal *signal,
                                      BackupFilePtr filePtr)
{
  /**
   * Read the whole LCP control file (one page) into page 0 of the file
   * record, using NDBFS memory-address format so the read lands directly
   * in our page memory.
   */
  FsReadWriteReq *req = (FsReadWriteReq*)signal->getDataPtrSend();
  Page32Ptr pagePtr;

  filePtr.p->pages.getPtr(pagePtr, 0);
  ndbrequire(filePtr.p->m_flags == BackupFile::BF_OPEN);
  filePtr.p->m_flags |= BackupFile::BF_READING;

  req->userPointer = filePtr.i;
  req->filePointer = filePtr.p->filePointer;
  req->userReference = reference();
  req->varIndex = 0;
  req->numberOfPages = 1;
  req->operationFlag = 0;
  FsReadWriteReq::setFormatFlag(req->operationFlag,
                                FsReadWriteReq::fsFormatMemAddress);
  /* File may be smaller than the requested size; allow a short read. */
  FsReadWriteReq::setPartialReadFlag(req->operationFlag, 1);

  /* Destination expressed as offset from the start of our page memory. */
  Uint32 mem_offset = Uint32(((char*)pagePtr.p) - ((char*)c_startOfPages));
  req->data.memoryAddress.memoryOffset = mem_offset;
  req->data.memoryAddress.fileOffset = 0;
  req->data.memoryAddress.size = BackupFormat::NDB_LCP_CTL_FILE_SIZE_BIG;

  sendSignal(NDBFS_REF, GSN_FSREADREQ, signal,
             FsReadWriteReq::FixedLength + 3, JBA);
}
16218
16219 void
lcp_read_ctl_file_for_rewrite_done(Signal * signal,BackupFilePtr filePtr)16220 Backup::lcp_read_ctl_file_for_rewrite_done(Signal *signal,
16221 BackupFilePtr filePtr)
16222 {
16223 Page32Ptr pagePtr;
16224
16225 filePtr.p->pages.getPtr(pagePtr, 0);
16226 struct BackupFormat::LCPCtlFile *lcpCtlFilePtr =
16227 (struct BackupFormat::LCPCtlFile*)pagePtr.p;
16228 ndbrequire(convert_ctl_page_to_host(lcpCtlFilePtr));
16229 lcpCtlFilePtr->ValidFlag = 1;
16230 lcp_update_ctl_file_for_rewrite(signal, filePtr, pagePtr);
16231 }
16232
/**
 * Write the updated (ValidFlag = 1) control file page back to disk.
 * The file must be open and idle (no read/write in flight).
 */
void
Backup::lcp_update_ctl_file_for_rewrite(Signal *signal,
                                        BackupFilePtr filePtr,
                                        Page32Ptr pagePtr)
{
  ndbrequire(filePtr.p->m_flags == BackupFile::BF_OPEN);
  lcp_write_ctl_file_to_disk(signal, filePtr, pagePtr);
}
16241
/**
 * The rewritten control file page is on disk; close the file before
 * proceeding with the actual file deletions.
 */
void
Backup::lcp_update_ctl_file_for_rewrite_done(Signal *signal,
                                             BackupRecordPtr ptr,
                                             BackupFilePtr filePtr)
{
  lcp_close_ctl_file_for_rewrite(signal, ptr, filePtr);
}
16249
/**
 * Close the rewritten control file. Completion continues in
 * lcp_close_ctl_file_for_rewrite_done.
 */
void
Backup::lcp_close_ctl_file_for_rewrite(Signal *signal,
                                       BackupRecordPtr ptr,
                                       BackupFilePtr filePtr)
{
  /* No error may have occurred anywhere in the rewrite sequence. */
  ndbrequire(ptr.p->errorCode == 0);
  closeFile(signal, ptr, filePtr, false, false);
#ifdef DEBUG_LCP
  DeleteLcpFilePtr deleteLcpFilePtr;
  c_deleteLcpFilePool.getPtr(deleteLcpFilePtr, ptr.p->currentDeleteLcpFile);
  DEB_LCP(("(%u)Completed writing with ValidFlag = 1 for tab(%u,%u).%u",
           instance(),
           deleteLcpFilePtr.p->tableId,
           deleteLcpFilePtr.p->fragmentId,
           c_lqh->getCreateSchemaVersion(deleteLcpFilePtr.p->tableId)));
#endif
}
16267
/**
 * The surviving control file is closed with ValidFlag = 1. Now start
 * removing the old files described by the current delete record:
 * first the data files (if any), then the old control file (if any),
 * otherwise we are already done.
 */
void
Backup::lcp_close_ctl_file_for_rewrite_done(Signal *signal,
                                            BackupRecordPtr ptr,
                                            BackupFilePtr filePtr)
{
  ndbrequire(filePtr.p->m_flags == 0);
  ndbrequire(ptr.p->errorCode == 0);
  DeleteLcpFilePtr deleteLcpFilePtr;
  c_deleteLcpFilePool.getPtr(deleteLcpFilePtr, ptr.p->currentDeleteLcpFile);

  if (deleteLcpFilePtr.p->firstFileId != RNIL)
  {
    jam();
    /* There are data files to remove; start with those. */
    ptr.p->m_delete_data_file_ongoing = true;
    lcp_remove_file(signal, ptr, deleteLcpFilePtr);
  }
  else if (deleteLcpFilePtr.p->lcpCtlFileNumber != RNIL)
  {
    jam();
    /* No data files, but an old control file needs removal. */
    ptr.p->m_delete_data_file_ongoing = false;
    lcp_remove_file(signal, ptr, deleteLcpFilePtr);
  }
  else
  {
    jam();
    /* Nothing to delete for this record; finish up directly. */
    finished_removing_files(signal, ptr);
  }
}
16296
/**
 * Send FSREMOVEREQ for one file of the current delete record: a data
 * file (firstFileId) when m_delete_data_file_ongoing is set, otherwise
 * the old LCP control file. Completion arrives as FSREMOVECONF and is
 * handled by lcp_remove_file_conf.
 */
void
Backup::lcp_remove_file(Signal* signal,
                        BackupRecordPtr ptr,
                        DeleteLcpFilePtr deleteLcpFilePtr)
{
  BackupFilePtr filePtr;
  c_backupFilePool.getPtr(filePtr, ptr.p->deleteFilePtr);
  FsRemoveReq * req = (FsRemoveReq *)signal->getDataPtrSend();
  req->userReference = reference();
  req->userPointer = ptr.i;
  req->directory = 0;
  req->ownDirectory = 0;

  filePtr.p->m_flags |= BackupFile::BF_REMOVING;

  /* Version-5 file naming; suffix selects .Data vs .ctl below. */
  FsOpenReq::setVersion(req->fileNumber, 5);
  if (ptr.p->m_delete_data_file_ongoing)
  {
    jam();
    FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_DATA);
    /* For data files the "LCP number" slot carries the file id. */
    FsOpenReq::v5_setLcpNo(req->fileNumber, deleteLcpFilePtr.p->firstFileId);
    DEB_LCP_DEL_FILES(("(%u)TAGD Remove data file: %u for tab(%u,%u)",
                       instance(),
                       deleteLcpFilePtr.p->firstFileId,
                       deleteLcpFilePtr.p->tableId,
                       deleteLcpFilePtr.p->fragmentId));
  }
  else
  {
    jam();
    FsOpenReq::setSuffix(req->fileNumber, FsOpenReq::S_CTL);
    FsOpenReq::v5_setLcpNo(req->fileNumber,
                           deleteLcpFilePtr.p->lcpCtlFileNumber);
    DEB_LCP_DEL_FILES(("(%u)TAGD Remove control file: %u for tab(%u,%u)",
                       instance(),
                       deleteLcpFilePtr.p->lcpCtlFileNumber,
                       deleteLcpFilePtr.p->tableId,
                       deleteLcpFilePtr.p->fragmentId));
  }
  FsOpenReq::v5_setTableId(req->fileNumber, deleteLcpFilePtr.p->tableId);
  FsOpenReq::v5_setFragmentId(req->fileNumber, deleteLcpFilePtr.p->fragmentId);
  sendSignal(NDBFS_REF, GSN_FSREMOVEREQ, signal, FsOpenReq::SignalLength, JBA);
}
16340
/**
 * FSREMOVECONF handler: one file of the current delete record has been
 * removed. Either continue with the next data file, switch over to
 * removing the control file, or finish the record.
 */
void
Backup::lcp_remove_file_conf(Signal *signal, BackupRecordPtr ptr)
{
  BackupFilePtr filePtr;

  c_backupFilePool.getPtr(filePtr, ptr.p->deleteFilePtr);
  filePtr.p->m_flags &= (~(BackupFile::BF_REMOVING));
  /* BF_REMOVING must have been the only flag set. */
  ndbrequire(filePtr.p->m_flags == 0);

  if (ptr.p->m_delete_data_file_ongoing)
  {
    jam();
    DeleteLcpFilePtr deleteLcpFilePtr;
    c_deleteLcpFilePool.getPtr(deleteLcpFilePtr, ptr.p->currentDeleteLcpFile);
    if (deleteLcpFilePtr.p->firstFileId == deleteLcpFilePtr.p->lastFileId)
    {
      jam();
      /**
       * We're done with deleting the data files belonging to this LCP which
       * we no longer need. We continue with deletion of the control LCP
       * file for this LCP.
       */
      ptr.p->m_delete_data_file_ongoing = false;
      lcp_remove_file(signal, ptr, deleteLcpFilePtr);
      return;
    }
    /* Continue with deleting the next data file. */
    deleteLcpFilePtr.p->firstFileId =
      get_file_add(deleteLcpFilePtr.p->firstFileId, 1);
    lcp_remove_file(signal, ptr, deleteLcpFilePtr);
  }
  else
  {
    /**
     * We are done deleting files for this fragment LCP, send CONTINUEB
     * to see if more fragment LCPs are ready to be deleted.
     *
     * We remove it from queue here to ensure that the next LCP can now
     * start up again.
     * It is important to not remove it from queue until we actually deleted
     * all the files, the logic depends on that only one LCP is allowed to
     * execute at a time and that this LCP will remove all the files
     * of the old LCP before the next one is allowed to start.
     */
    jam();
    finished_removing_files(signal, ptr);
  }
}
16389
/**
 * All files of the current delete record are gone. Release the record,
 * wake anyone waiting for the delete queue (WAIT_LCP_IDLE, drop-table,
 * END_LCP), and either schedule processing of the next queued record
 * via CONTINUEB or let delete processing wind down.
 */
void
Backup::finished_removing_files(Signal *signal,
                                BackupRecordPtr ptr)
{
  DeleteLcpFilePtr deleteLcpFilePtr;
  jam();
  {
    /* Release the completed record before notifying waiters. */
    LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                  m_delete_lcp_file_head);
    c_deleteLcpFilePool.getPtr(deleteLcpFilePtr, ptr.p->currentDeleteLcpFile);
    queue.remove(deleteLcpFilePtr);
    c_deleteLcpFilePool.release(deleteLcpFilePtr);
    ptr.p->currentDeleteLcpFile = RNIL;
  }
  check_empty_queue_waiters(signal, ptr);
  if (ptr.p->m_informDropTabTableId != Uint32(~0))
  {
    jam();
    /* A drop-table request was parked waiting for this record. */
    sendINFORM_BACKUP_DROP_TAB_CONF(signal, ptr);
  }
  else
  {
    jam();
    check_wait_end_lcp(signal, ptr);
  }
  {
    LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                  m_delete_lcp_file_head);
    if (!queue.isEmpty())
    {
      jam();
      /* More records queued: continue in a fresh signal (real-time break). */
      signal->theData[0] = BackupContinueB::ZDELETE_LCP_FILE;
      signal->theData[1] = ptr.i;
      sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
    }
    else
    {
      jam();
      /* Queue empty: let delete processing decide whether to stop. */
      delete_lcp_file_processing(signal);
    }
  }
}
16432
16433 /**
16434 * Wait for LCP activity to cease, in particular wait for the delete queue
16435 * to become empty. When the delete queue is empty we know that all fragment
16436 * LCPs have completed and are recoverable. No files will be deleted unless
16437 * the fragment LCP is completed and even if no files require deletion we will
16438 * insert an entry into the delete file queue if we are still waiting for the
16439 * LSN of the table fragment to be flushed.
16440 *
16441 * See comments in Dblqh::insert_new_fragments_into_lcp for more details on
16442 * the use case for this signal.
16443 */
void
Backup::execWAIT_LCP_IDLE_REQ(Signal *signal)
{
  /**
   * DBLQH asks us to report when LCP activity is fully idle: the delete
   * queue must be empty and the slave state back in DEFINED. Either
   * answer immediately or set m_wait_empty_queue so that
   * check_empty_queue_waiters sends the CONF later.
   */
  BackupRecordPtr ptr;
  jamEntry();
  c_backupPool.getPtr(ptr, signal->theData[0]);
  jamDebug();
  ndbrequire(ptr.p->is_lcp());
  LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                m_delete_lcp_file_head);
  if (queue.isEmpty() && ptr.p->slaveState.getState() == DEFINED)
  {
    jam();
    /* Already idle: confirm right away. */
    signal->theData[0] = ptr.p->clientData;
    sendSignal(ptr.p->masterRef, GSN_WAIT_LCP_IDLE_CONF,
               signal, 1, JBB);
  }
  else
  {
    jam();
    /* Defer; answered from check_empty_queue_waiters when idle. */
    ptr.p->m_wait_empty_queue = true;
  }
}
16467
16468 void
check_empty_queue_waiters(Signal * signal,BackupRecordPtr ptr)16469 Backup::check_empty_queue_waiters(Signal *signal, BackupRecordPtr ptr)
16470 {
16471 ndbrequire(ptr.p->is_lcp());
16472 if (ptr.p->m_wait_empty_queue)
16473 {
16474 jam();
16475 LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
16476 m_delete_lcp_file_head);
16477 if (queue.isEmpty() && ptr.p->slaveState.getState() == DEFINED)
16478 {
16479 jam();
16480 ptr.p->m_wait_empty_queue = false;
16481 signal->theData[0] = ptr.p->clientData;
16482 sendSignal(ptr.p->masterRef, GSN_WAIT_LCP_IDLE_CONF,
16483 signal, 1, JBB);
16484 }
16485 }
16486 }
16487
void
Backup::execINFORM_BACKUP_DROP_TAB_REQ(Signal *signal)
{
  /**
   * A table is being dropped. Record the table id and the reference to
   * answer. If the delete-file machinery is currently working on files
   * of that very table we must wait for it to finish (the CONF is then
   * sent from finished_removing_files); otherwise answer at once.
   */
  BackupRecordPtr ptr;
  get_lcp_record(ptr);
  ptr.p->m_informDropTabTableId = signal->theData[0];
  ptr.p->m_informDropTabReference = signal->theData[1];
  if (ptr.p->currentDeleteLcpFile != RNIL)
  {
    DeleteLcpFilePtr deleteLcpFilePtr;
    jam();
    c_deleteLcpFilePool.getPtr(deleteLcpFilePtr, ptr.p->currentDeleteLcpFile);
    if (deleteLcpFilePtr.p->tableId == ptr.p->m_informDropTabTableId)
    {
      jam();
      /**
       * The current delete record is deleting files and writing files
       * from the dropped table. Wait until this is completed before
       * we continue.
       */
      return;
    }
  }
  sendINFORM_BACKUP_DROP_TAB_CONF(signal, ptr);
}
16513
16514 void
check_wait_end_lcp(Signal * signal,BackupRecordPtr ptr)16515 Backup::check_wait_end_lcp(Signal *signal, BackupRecordPtr ptr)
16516 {
16517 LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
16518 m_delete_lcp_file_head);
16519 ndbrequire(ptr.p->is_lcp());
16520 if (queue.isEmpty() && ptr.p->m_wait_end_lcp)
16521 {
16522 jam();
16523 ndbrequire(ptr.p->prepareState != PREPARE_DROP);
16524 ptr.p->m_wait_end_lcp = false;
16525 sendEND_LCPCONF(signal, ptr);
16526 }
16527 }
16528
void
Backup::sendINFORM_BACKUP_DROP_TAB_CONF(Signal *signal,
                                        BackupRecordPtr ptr)
{
  /**
   * Before we send the confirm we have to remove all entries from
   * drop delete queue that refer to the dropped table. We have already
   * ensured that the dropped table isn't currently involved in drops.
   * It would create complex code if we could remove the LCP files
   * while we were writing them.
   */

  DEB_LCP(("(%u)Remove all delete file requests for table %u",
           instance(),
           ptr.p->m_informDropTabTableId));
  {
    DeleteLcpFilePtr deleteLcpFilePtr;
    DeleteLcpFilePtr nextDeleteLcpFilePtr;
    LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                  m_delete_lcp_file_head);
    /* Advance to the successor BEFORE removing, so iteration stays valid. */
    bool is_next_available = queue.first(deleteLcpFilePtr);
    while (is_next_available)
    {
      nextDeleteLcpFilePtr = deleteLcpFilePtr;
      is_next_available = queue.next(nextDeleteLcpFilePtr);
      if (deleteLcpFilePtr.p->tableId == ptr.p->m_informDropTabTableId)
      {
        jam();
        /**
         * We found an entry that is from the dropped table, we can
         * ignore this since the table will be dropped and all
         * LCP files with it.
         */
        queue.remove(deleteLcpFilePtr);
        c_deleteLcpFilePool.release(deleteLcpFilePtr);
      }
      deleteLcpFilePtr = nextDeleteLcpFilePtr;
    }
  }
  /* Queue may have become empty: wake idle/END_LCP waiters. */
  check_empty_queue_waiters(signal, ptr);
  check_wait_end_lcp(signal, ptr);

  /**
   * Now we have removed all entries from queue and we are ready to inform
   * LQH that he can continue dropping the table.
   * At this point LQH have already ensured that no more LCPs are started
   * on this table.
   */
  BlockReference ref = ptr.p->m_informDropTabReference;
  Uint32 tableId = ptr.p->m_informDropTabTableId;
  signal->theData[0] = tableId;
  sendSignal(ref, GSN_INFORM_BACKUP_DROP_TAB_CONF, signal, 1, JBB);
  /* Reset to "no drop pending" so finished_removing_files won't re-send. */
  ptr.p->m_informDropTabReference = Uint32(~0);
  ptr.p->m_informDropTabTableId = Uint32(~0);
}
16584
/**
 * Common FSOPENCONF/FSOPENREF dispatcher for all LCP-related file opens:
 * control files opened for rewrite (deleteFilePtr), prepare-phase control
 * files, the prepare-phase meta data file, and the execute-phase data
 * files. Routes each reply to the matching continuation.
 */
void
Backup::openFilesReplyLCP(Signal* signal,
                          BackupRecordPtr ptr,
                          BackupFilePtr filePtr)
{
  /**
   * Did open succeed
   */
  if(ptr.p->checkError())
  {
    jam();
    if(ptr.p->errorCode == FsRef::fsErrFileExists)
    {
      jam();
      ptr.p->errorCode = DefineBackupRef::FailedForBackupFilesAleadyExist;
    }
    /* Open failures on LCP data files or the rewrite file are fatal. */
    for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
    {
      jam();
      if (ptr.p->dataFilePtr[i] == filePtr.i)
      {
        jam();
        /* Currently we can't handle failures to open data file */
        g_eventLogger->critical("Fatal: Open file of LCP data file %u failed,"
                                " errCode: %u",
                                i,
                                ptr.p->errorCode);
        ndbabort();
      }
    }
    if (ptr.p->deleteFilePtr == filePtr.i)
    {
      jam();
      g_eventLogger->critical("Fatal: Reopen LCP control file failed,"
                              " errCode: %u",
                              ptr.p->errorCode);
      ndbabort();
    }
    defineBackupRef(signal, ptr);
    return;
  }//if

  /* Control file reopened for the ValidFlag rewrite path. */
  if (ptr.p->deleteFilePtr == filePtr.i)
  {
    jam();
    lcp_open_ctl_file_for_rewrite_done(signal, filePtr);
    return;
  }
  /* Prepare-phase LCP control file (flagged BF_HEADER_FILE at open). */
  if (filePtr.p->m_flags & BackupFile::BF_HEADER_FILE)
  {
    jam();
    filePtr.p->m_flags &= ~(Uint32)BackupFile::BF_HEADER_FILE;
    ndbrequire(filePtr.i == ptr.p->prepareCtlFilePtr[0] ||
               filePtr.i == ptr.p->prepareCtlFilePtr[1]);
    lcp_open_ctl_file_done(signal, ptr, filePtr);
    return;
  }
  TablePtr tabPtr;
  bool prepare_phase;
  Uint32 index = 0;
  if (filePtr.i == ptr.p->prepareDataFilePtr[0])
  {
    jam();
    /* Meta data file of the prepare phase. */
    filePtr.p->m_flags |= BackupFile::BF_LCP_META;
    ndbrequire(ptr.p->prepareState == PREPARE_OPEN_DATA_FILE);
    ptr.p->prepareState = PREPARE_READ_TABLE_DESC;
    ptr.p->prepare_table.first(tabPtr);
    prepare_phase = true;
  }
  else
  {
    /* Must be one of the execute-phase data files. */
    prepare_phase = true;
    for (index = 0 ; index < ptr.p->m_num_lcp_files; index++)
    {
      if (filePtr.i == ptr.p->dataFilePtr[index])
      {
        prepare_phase = false;
        break;
      }
    }
    ndbrequire(!prepare_phase);
    ptr.p->tables.first(tabPtr);
  }
  ndbrequire(insertFileHeader(BackupFormat::LCP_FILE, ptr.p, filePtr.p));
  /**
   * Insert table list in ctl file
   */
  FsBuffer & buf = filePtr.p->operation.dataBuffer;
  const Uint32 sz = (sizeof(BackupFormat::CtlFile::TableList) >> 2);
  Uint32 * dst;
  ndbrequire(sz < buf.getMaxWrite());
  /* NOTE(review): no trailing ';' — relies on the ndbrequire macro
   * expansion forming a complete statement; confirm before reformatting. */
  ndbrequire(buf.getWritePtr(&dst, sz))

  BackupFormat::CtlFile::TableList* tl =
    (BackupFormat::CtlFile::TableList*)dst;

  /* File format is big-endian on disk, hence htonl. */
  tl->SectionType = htonl(BackupFormat::TABLE_LIST);
  tl->SectionLength = htonl(sz);
  tl->TableIds[0] = htonl(tabPtr.p->tableId);
  buf.updateWritePtr(sz);

  if (prepare_phase)
  {
    jam();
    /**
     * Start getting table definition data
     */
    signal->theData[0] = BackupContinueB::BUFFER_FULL_META;
    signal->theData[1] = ptr.i;
    signal->theData[2] = tabPtr.i;
    signal->theData[3] = filePtr.i;
    sendSignal(reference(), GSN_CONTINUEB, signal, 4, JBB);
    return;
  }
  else
  {
    jam();
    FragmentPtr fragPtr;
    tabPtr.p->fragments.getPtr(fragPtr, 0);
    init_file_for_lcp(signal, index, ptr, ptr.i);
    ptr.p->m_num_lcp_data_files_open++;
    ndbrequire(ptr.p->m_num_lcp_data_files_open <= ptr.p->m_num_lcp_files);
    if (ptr.p->m_num_lcp_data_files_open < ptr.p->m_num_lcp_files)
    {
      jam();
      /* Wait until every data file is open before scanning. */
      return;
    }
    /**
     * Now all files are open and we can start the actual scanning.
     * We always use the first file record to track number of scanned
     * pages.
     */
    BackupFilePtr zeroFilePtr;
    c_backupFilePool.getPtr(zeroFilePtr, ptr.p->dataFilePtr[0]);
    Uint32 delay = 0;
    if (ERROR_INSERTED(10047))
    {
      /* Test hook: delay the scan start for sufficiently large fragments. */
      g_eventLogger->info("(%u)Start LCP on tab(%u,%u) 3 seconds delay, max_page: %u",
                          instance(),
                          tabPtr.p->tableId,
                          fragPtr.p->fragmentId,
                          ptr.p->m_lcp_max_page_cnt);

      if (ptr.p->m_lcp_max_page_cnt > 20)
      {
        delay = 3000;
      }
    }
    sendScanFragReq(signal, ptr, zeroFilePtr, tabPtr, fragPtr, delay);
  }
}
16736
void
Backup::execEND_LCPREQ(Signal* signal)
{
  /**
   * DBLQH signals that the distributed LCP round is over for this
   * instance. Verify that every LCP file record is idle, reset the
   * slave state machine, then kick off the final sync of extent pages
   * in PGMAN before END_LCPCONF can be sent (see finish_end_lcp).
   */
  BackupRecordPtr ptr;
  {
    EndLcpReq* req= (EndLcpReq*)signal->getDataPtr();
    c_backupPool.getPtr(ptr, req->backupPtr);
    ptr.p->senderData = req->senderData;
  }
  jamEntry();
  ndbrequire(ptr.p->is_lcp());

  /* All file records must be closed and idle at end of LCP. */
  BackupFilePtr filePtr;
  ptr.p->files.getPtr(filePtr, ptr.p->prepareCtlFilePtr[0]);
  ndbrequire(filePtr.p->m_flags == 0);
  ptr.p->files.getPtr(filePtr, ptr.p->prepareCtlFilePtr[1]);
  ndbrequire(filePtr.p->m_flags == 0);
  ptr.p->files.getPtr(filePtr, ptr.p->prepareDataFilePtr[0]);
  ndbrequire(filePtr.p->m_flags == 0);
  ptr.p->files.getPtr(filePtr, ptr.p->ctlFilePtr);
  ndbrequire(filePtr.p->m_flags == 0);
  ptr.p->files.getPtr(filePtr, ptr.p->dataFilePtr[0]);
  ndbrequire(filePtr.p->m_flags == 0);

  /* Step the slave state machine back to DEFINED for the next LCP. */
  ptr.p->errorCode = 0;
  ptr.p->slaveState.setState(CLEANING);
  ptr.p->slaveState.setState(INITIAL);
  ptr.p->slaveState.setState(DEFINING);
  ptr.p->slaveState.setState(DEFINED);

  DEB_LCP(("(%u)TAGE Send SYNC_EXTENT_PAGES_REQ", instance()));
  /**
   * As part of ending the LCP we need to ensure that the extent pages
   * are synchronised. This is to ensure that the case with dropped
   * tables after completing a fragment LCP is handled properly. These
   * extent pages need to be synchronised at end of LCP since after the
   * end of the LCP here we will inform TSMAN that it is free to start
   * sharing those pages again and then we need to ensure that the
   * free status is up-to-date in preparation for a potential restart.
   */
  ptr.p->m_wait_final_sync_extent = true;
  ptr.p->m_num_sync_extent_pages_written = Uint32(~0);
  ptr.p->m_start_sync_op = getHighResTimer();
  {
    SyncExtentPagesReq *req = (SyncExtentPagesReq*)signal->getDataPtrSend();
    req->senderData = ptr.i;
    req->senderRef = reference();
    if (ptr.p->m_first_fragment)
    {
      jam();
      /* No fragment was checkpointed: first sync and end sync coincide. */
      ptr.p->m_first_fragment = false;
      req->lcpOrder = SyncExtentPagesReq::FIRST_AND_END_LCP;
    }
    else
    {
      jam();
      req->lcpOrder = SyncExtentPagesReq::END_LCP;
    }
    sendSignal(PGMAN_REF, GSN_SYNC_EXTENT_PAGES_REQ, signal,
               SyncExtentPagesReq::SignalLength, JBB);
  }
  return;
}
16800
/**
 * SYNC_EXTENT_PAGES_CONF handler for the end-of-LCP sync. If the delete
 * queue still has work, defer END_LCPCONF (m_wait_end_lcp is checked by
 * check_wait_end_lcp once the queue drains); otherwise confirm now.
 */
void
Backup::finish_end_lcp(Signal *signal, BackupRecordPtr ptr)
{
  DEB_LCP(("(%u)TAGE SYNC_EXTENT_PAGES_CONF: lcpId: %u",
           instance(),
           ptr.p->backupId));
  ptr.p->m_wait_final_sync_extent = false;
  LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                m_delete_lcp_file_head);
  if (!queue.isEmpty())
  {
    jam();
    ptr.p->m_wait_end_lcp = true;
    return;
  }
  /**
   * The delete LCP file queue is empty, this means that we are sure
   * that all reported LCP_FRAG_REP's are actually completed. DIH
   * will not think that any LCP_FRAG_REP is ok to use until we have
   * received LCP_COMPLETE_REP and so we need to wait with sending
   * this signal until we have emptied the queue and thus completed
   * the full LCP.
   */
  sendEND_LCPCONF(signal, ptr);
}
16826
16827 void
sendEND_LCPCONF(Signal * signal,BackupRecordPtr ptr)16828 Backup::sendEND_LCPCONF(Signal *signal, BackupRecordPtr ptr)
16829 {
16830 DEB_LCP(("(%u)TAGE END_LCPREQ: lcpId: %u",
16831 instance(),
16832 ptr.p->backupId));
16833 ndbrequire(!ptr.p->m_wait_end_lcp);
16834 ptr.p->backupId = 0; /* Ensure next LCP_PREPARE_REQ sees a new LCP id */
16835
16836 {
16837 NDB_TICKS now = getHighResTimer();
16838 Uint64 lcp_elapsed_us =
16839 NdbTick_Elapsed(ptr.p->m_high_res_lcp_start_time, now).microSec();
16840 Uint64 dd_percentage = 100 * m_current_dd_time_us;
16841 dd_percentage = dd_percentage / lcp_elapsed_us;
16842 m_last_lcp_dd_percentage = dd_percentage;
16843 c_pgman->set_lcp_dd_percentage(dd_percentage);
16844 }
16845 DEB_LCP_STAT(("(%u)Bytes written in this LCP: %llu MB, dd_percent: %u",
16846 instance(),
16847 ptr.p->noOfBytes / (1024 * 1024),
16848 m_last_lcp_dd_percentage));
16849 lcp_end_point();
16850
16851 EndLcpConf* conf= (EndLcpConf*)signal->getDataPtrSend();
16852 conf->senderData = ptr.p->senderData;
16853 conf->senderRef = reference();
16854 sendSignal(ptr.p->masterRef, GSN_END_LCPCONF,
16855 signal, EndLcpConf::SignalLength, JBA);
16856 }
16857
/**
 * Handle the obscure case where LCP files from an already dropped table
 * were found during prepare: once both prepare control files are closed,
 * queue a delete record that removes every possible file of the fragment
 * and move the prepare state to PREPARE_DROP until the queue drains.
 */
void
Backup::lcp_close_ctl_file_drop_case(Signal *signal, BackupRecordPtr ptr)
{
  BackupFilePtr filePtr;
  for (Uint32 i = 0; i < 2; i++)
  {
    c_backupFilePool.getPtr(filePtr, ptr.p->prepareCtlFilePtr[i]);
    if ((filePtr.p->m_flags & BackupFile::BF_OPEN) != 0)
    {
      jam();
      /* Still waiting for second file to close */
      return;
    }
  }
  /* Now time to start removing data files. */
  DeleteLcpFilePtr deleteLcpFilePtr;
  TablePtr tabPtr;
  FragmentPtr fragPtr;
  ndbrequire(c_deleteLcpFilePool.seize(deleteLcpFilePtr));
  LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                m_delete_lcp_file_head);

  /**
   * We avoid all complexity for this code since it is an obscure case that
   * should be extremely rare. So we simply delete all potential files.
   */
  ptr.p->prepare_table.first(tabPtr);
  tabPtr.p->fragments.getPtr(fragPtr, 0);
  deleteLcpFilePtr.p->tableId = fragPtr.p->tableId;
  deleteLcpFilePtr.p->fragmentId = fragPtr.p->fragmentId;
  /* Cover the full possible data-file range for the fragment. */
  deleteLcpFilePtr.p->firstFileId = 0;
  deleteLcpFilePtr.p->lastFileId = BackupFormat::NDB_MAX_LCP_FILES - 1;
  /* No GCI to wait for and no rewrite needed for a dropped table. */
  deleteLcpFilePtr.p->waitCompletedGci = 0;
  deleteLcpFilePtr.p->validFlag = 1;
  deleteLcpFilePtr.p->lcpCtlFileNumber =
    ptr.p->prepareNextLcpCtlFileNumber == 0 ? 1 : 0;
  /* Jump the queue: this delete must run before other pending ones. */
  queue.addFirst(deleteLcpFilePtr);
  if (!m_delete_lcp_files_ongoing)
  {
    jam();
    m_delete_lcp_files_ongoing = true;
    signal->theData[0] = BackupContinueB::ZDELETE_LCP_FILE;
    signal->theData[1] = ptr.i;
    sendSignal(reference(), GSN_CONTINUEB, signal, 2, JBB);
  }
  /**
   * We have now closed the files and as soon as the queue of
   * deleted files are empty we can proceed with starting of
   * the LCP.
   */
  ptr.p->prepareState = PREPARE_DROP;
  DEB_LCP(("(%u)TAGT Insert delete files in queue (drop case):"
           " tab(%u,%u), createGci: %u, waitCompletedGCI: 0",
           instance(),
           fragPtr.p->tableId,
           fragPtr.p->fragmentId,
           fragPtr.p->createGci));
}
16916
16917 inline
16918 static
setWords(const Uint64 src,Uint32 & hi,Uint32 & lo)16919 void setWords(const Uint64 src, Uint32& hi, Uint32& lo)
16920 {
16921 hi = (Uint32) (src >> 32);
16922 lo = (Uint32) (src & 0xffffffff);
16923 }
16924
/**
 * Handle LCP_STATUS_REQ: report the progress of the ongoing (or idle)
 * local checkpoint back to the requestor.
 *
 * On success an LCP_STATUS_CONF is sent to senderRef carrying the
 * current LCP state plus, where applicable, the table/fragment being
 * worked on and a state-specific completion counter. On failure an
 * LCP_STATUS_REF with an error code is sent instead.
 */
void
Backup::execLCP_STATUS_REQ(Signal* signal)
{
  jamEntry();
  const LcpStatusReq* req = (const LcpStatusReq*) signal->getDataPtr();

  const Uint32 senderRef = req->senderRef;
  const Uint32 senderData = req->senderData;
  /* Default error until we successfully fill in a CONF below. */
  Uint32 failCode = LcpStatusRef::NoLCPRecord;

  /* Find LCP record */
  BackupRecordPtr ptr;
  get_lcp_record(ptr);
  /* do/while(false): 'break' aborts CONF construction and sends a REF. */
  do
  {
    jam();
    ndbrequire(ptr.p->is_lcp());
    {
      jam();
      /*
       * Map the backup record's internal flags and slave state onto the
       * externally visible LcpStatusConf::LcpState enumeration.
       */
      LcpStatusConf::LcpState state = LcpStatusConf::LCP_IDLE;
      if (ptr.p->m_wait_end_lcp)
      {
        jam();
        state = LcpStatusConf::LCP_WAIT_END_LCP;
      }
      else if (ptr.p->m_wait_final_sync_extent)
      {
        jam();
        state = LcpStatusConf::LCP_WAIT_FINAL_SYNC_EXTENT;
      }
      else
      {
        jam();
        switch (ptr.p->slaveState.getState())
        {
        case STARTED:
          jam();
          state = LcpStatusConf::LCP_PREPARED;
          break;
        case SCANNING:
          jam();
          state = LcpStatusConf::LCP_SCANNING;
          break;
        case STOPPING:
          jam();
          /* STOPPING has several sub-phases, distinguished by flags. */
          if (ptr.p->m_wait_disk_data_sync)
          {
            jam();
            state = LcpStatusConf::LCP_WAIT_SYNC_DISK;
          }
          else if (ptr.p->m_wait_sync_extent)
          {
            jam();
            state = LcpStatusConf::LCP_WAIT_SYNC_EXTENT;
          }
          else if (ptr.p->m_wait_data_file_close)
          {
            jam();
            state = LcpStatusConf::LCP_SCANNED;
          }
          else if (ptr.p->m_empty_lcp)
          {
            jam();
            state = LcpStatusConf::LCP_WAIT_CLOSE_EMPTY;
          }
          else
          {
            jam();
            state = LcpStatusConf::LCP_WAIT_WRITE_CTL_FILE;
          }
          break;
        case DEFINED:
          jam();
          /* DEFINED: report the prepare-phase sub-state. */
          if (ptr.p->prepareState == NOT_ACTIVE ||
              ptr.p->prepareState == PREPARED)
          {
            jam();
            state = LcpStatusConf::LCP_IDLE;
          }
          else if (ptr.p->prepareState == PREPARE_READ_CTL_FILES)
          {
            jam();
            state = LcpStatusConf::LCP_PREPARE_READ_CTL_FILES;
          }
          else if (ptr.p->prepareState == PREPARE_OPEN_DATA_FILE)
          {
            jam();
            state = LcpStatusConf::LCP_PREPARE_OPEN_DATA_FILE;
          }
          else if (ptr.p->prepareState == PREPARE_READ_TABLE_DESC)
          {
            jam();
            state = LcpStatusConf::LCP_PREPARE_READ_TABLE_DESC;
          }
          else if (ptr.p->prepareState == PREPARE_ABORTING)
          {
            jam();
            state = LcpStatusConf::LCP_PREPARE_ABORTING;
          }
          else if (ptr.p->prepareState == PREPARE_DROP ||
                   ptr.p->prepareState == PREPARE_DROP_CLOSE)
          {
            jam();
            state = LcpStatusConf::LCP_PREPARE_WAIT_DROP_CASE;
          }
          else
          {
            /* Unknown prepare sub-state: log it and fall back to IDLE. */
            jam();
            ndbout_c("Unusual LCP prepare state in LCP_STATUS_REQ() : %u",
                     ptr.p->prepareState);
            state = LcpStatusConf::LCP_IDLE;
          }
          break;
        default:
          /* Unknown slave state: log it and fall back to IDLE. */
          jam();
          ndbout_c("Unusual LCP state in LCP_STATUS_REQ() : %u",
                   ptr.p->slaveState.getState());
          state = LcpStatusConf::LCP_IDLE;
        };
      }

      /* Not all values are set here */
      const Uint32 UnsetConst = ~0;

      LcpStatusConf* conf = (LcpStatusConf*) signal->getDataPtrSend();
      conf->senderRef = reference();
      conf->senderData = senderData;
      conf->lcpState = state;
      conf->tableId = UnsetConst;
      conf->fragId = UnsetConst;
      conf->completionStateHi = UnsetConst;
      conf->completionStateLo = UnsetConst;
      /* Totals for the whole LCP so far, split into hi/lo 32-bit words. */
      setWords(ptr.p->noOfRecords,
               conf->lcpDoneRowsHi,
               conf->lcpDoneRowsLo);
      setWords(ptr.p->noOfBytes,
               conf->lcpDoneBytesHi,
               conf->lcpDoneBytesLo);
      conf->lcpScannedPages = 0;

      if (state == LcpStatusConf::LCP_SCANNING ||
          state == LcpStatusConf::LCP_WAIT_SYNC_DISK ||
          state == LcpStatusConf::LCP_WAIT_SYNC_EXTENT ||
          state == LcpStatusConf::LCP_WAIT_WRITE_CTL_FILE ||
          state == LcpStatusConf::LCP_WAIT_CLOSE_EMPTY ||
          state == LcpStatusConf::LCP_SCANNED)
      {
        jam();
        /* Actually scanning/closing a fragment, let's grab the details */
        TablePtr tabPtr;
        FragmentPtr fragPtr;
        BackupFilePtr filePtr;

        if (ptr.p->dataFilePtr[0] == RNIL)
        {
          jam();
          /* No data file attached: answer with a REF instead. */
          failCode = LcpStatusRef::NoFileRecord;
          break;
        }
        c_backupFilePool.getPtr(filePtr, ptr.p->dataFilePtr[0]);
        ndbrequire(filePtr.p->backupPtr == ptr.i);

        /* Identify the fragment currently checkpointed (if any). */
        ptr.p->tables.first(tabPtr);
        if (tabPtr.i != RNIL)
        {
          jam();
          tabPtr.p->fragments.getPtr(fragPtr, 0);
          ndbrequire(fragPtr.p->tableId == tabPtr.p->tableId);
          conf->tableId = tabPtr.p->tableId;
          conf->fragId = fragPtr.p->fragmentId;
        }

        if (state == LcpStatusConf::LCP_SCANNING)
        {
          jam();
          /* completionState = rows written so far for this fragment. */
          setWords(filePtr.p->operation.noOfRecords,
                   conf->completionStateHi,
                   conf->completionStateLo);
          conf->lcpScannedPages = filePtr.p->operation.lcpScannedPages;
        }
        else if (state == LcpStatusConf::LCP_SCANNED)
        {
          jam();
          BackupFilePtr tmp_filePtr;
          Uint64 flushBacklog = 0;
          /* Sum unflushed bytes over all data files of this LCP. */
          for (Uint32 i = 0; i < ptr.p->m_num_lcp_files; i++)
          {
            c_backupFilePool.getPtr(tmp_filePtr, ptr.p->dataFilePtr[i]);
            /* May take some time to drain the FS buffer, depending on
             * size of buff, achieved rate.
             * We provide the buffer fill level so that requestors
             * can observe whether there's progress in this phase.
             */
            flushBacklog +=
              tmp_filePtr.p->operation.dataBuffer.getUsableSize() -
              tmp_filePtr.p->operation.dataBuffer.getFreeSize();
          }
          setWords(flushBacklog,
                   conf->completionStateHi,
                   conf->completionStateLo);
        }
        else if (state == LcpStatusConf::LCP_WAIT_SYNC_DISK)
        {
          jam();
          /* completionState = pages still waiting for disk sync. */
          conf->completionStateHi = 0;
          conf->completionStateLo = ptr.p->m_num_sync_pages_waiting;
        }
        else if (state == LcpStatusConf::LCP_WAIT_SYNC_EXTENT)
        {
          jam();
          /* completionState = extent pages written so far. */
          conf->completionStateHi = 0;
          conf->completionStateLo = ptr.p->m_num_sync_extent_pages_written;
        }
        else if (state == LcpStatusConf::LCP_WAIT_WRITE_CTL_FILE)
        {
          jam();
          conf->completionStateHi = 0;
          conf->completionStateLo = 0;
        }
        else if (state == LcpStatusConf::LCP_WAIT_CLOSE_EMPTY)
        {
          jam();
          /* completionState = file operations still outstanding. */
          conf->completionStateHi = 0;
          conf->completionStateLo = ptr.p->m_outstanding_operations;
        }
        else
        {
          ndbabort(); // Impossible state
        }
      }
      else if (state == LcpStatusConf::LCP_WAIT_END_LCP)
      {
        jam();
        /* Waiting for file deletions; report the restorable GCI. */
        DeleteLcpFilePtr deleteLcpFilePtr;
        LocalDeleteLcpFile_list queue(c_deleteLcpFilePool,
                                      m_delete_lcp_file_head);
        ndbrequire(!queue.isEmpty());
        conf->completionStateHi = 0;
        conf->completionStateLo = m_newestRestorableGci;
      }
      else if (state == LcpStatusConf::LCP_WAIT_FINAL_SYNC_EXTENT)
      {
        jam();
        conf->completionStateHi = 0;
        conf->completionStateLo = ptr.p->m_num_sync_extent_pages_written;
      }
      else if (state == LcpStatusConf::LCP_PREPARED)
      {
        /**
         * We are in state of closing LCP control files with a
         * idle fragment LCP.
         */
        jam();
        TablePtr tabPtr;
        FragmentPtr fragPtr;
        ptr.p->tables.first(tabPtr);
        ndbrequire(tabPtr.i != RNIL);
        tabPtr.p->fragments.getPtr(fragPtr, 0);
        ndbrequire(fragPtr.p->tableId == tabPtr.p->tableId);
        conf->tableId = tabPtr.p->tableId;
        conf->fragId = fragPtr.p->fragmentId;
      }

      /* CONF fully populated: clear the error code. */
      failCode = 0;
    }
  } while (false);

  if (failCode == 0)
  {
    jam();
    sendSignal(senderRef, GSN_LCP_STATUS_CONF,
               signal, LcpStatusConf::SignalLength, JBB);
    return;
  }

  /* Something went wrong above: answer with a REF carrying failCode. */
  jam();
  LcpStatusRef* ref = (LcpStatusRef*) signal->getDataPtrSend();

  ref->senderRef = reference();
  ref->senderData = senderData;
  ref->error = failCode;

  sendSignal(senderRef, GSN_LCP_STATUS_REF,
             signal, LcpStatusRef::SignalLength, JBB);
  return;
}
17211
17212 bool
get_backup_record(BackupRecordPtr & ptr)17213 Backup::get_backup_record(BackupRecordPtr &ptr)
17214 {
17215 /**
17216 * The first record in c_backups is the LCP record when no backup
17217 * is running, if a backup is running, it will be first one. We will
17218 * return true if a backup record is found and false otherwise.
17219 */
17220 c_backups.first(ptr);
17221 if (ptr.p->is_lcp())
17222 {
17223 ptr.i = RNIL;
17224 ptr.p = 0;
17225 return false;
17226 }
17227 return true;
17228 }
17229
17230 void
get_lcp_record(BackupRecordPtr & ptr)17231 Backup::get_lcp_record(BackupRecordPtr &ptr)
17232 {
17233 for(c_backups.first(ptr); ptr.i != RNIL; c_backups.next(ptr))
17234 {
17235 if (ptr.p->is_lcp())
17236 {
17237 return;
17238 }
17239 }
17240 ndbrequire(false);
17241 }
17242
17243 void
set_undo_log_level(Uint32 percentage)17244 Backup::set_undo_log_level(Uint32 percentage)
17245 {
17246 m_undo_log_level_percentage = percentage;
17247 if (percentage > m_max_undo_log_level_percentage)
17248 {
17249 jam();
17250 m_max_undo_log_level_percentage = percentage;
17251 }
17252 }
/* Presumably flags that a single-threaded backup is in progress across
 * Backup block instances — set/read sites are outside this view; confirm
 * against the rest of Backup.cpp. */
bool Backup::g_is_single_thr_backup_running = false;
17254