1 /*****************************************************************************
2
3 Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2012, Facebook Inc.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License, version 2.0,
8 as published by the Free Software Foundation.
9
10 This program is also distributed with certain software (including
11 but not limited to OpenSSL) that is licensed under separate terms,
12 as designated in a particular file or component or in included license
13 documentation. The authors of MySQL hereby grant you an additional
14 permission to link the program and your derivative works with the
15 separately licensed software that they have included with MySQL.
16
17 This program is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License, version 2.0, for more details.
21
22 You should have received a copy of the GNU General Public License along with
23 this program; if not, write to the Free Software Foundation, Inc.,
24 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
25
26 *****************************************************************************/
27
28 /**************************************************//**
29 @file btr/btr0btr.cc
30 The B-tree
31
32 Created 6/2/1994 Heikki Tuuri
33 *******************************************************/
34
35 #include "btr0btr.h"
36
37 #ifdef UNIV_NONINL
38 #include "btr0btr.ic"
39 #endif
40
41 #include "fsp0fsp.h"
42 #include "page0page.h"
43 #include "page0zip.h"
44
45 #ifndef UNIV_HOTBACKUP
46 #include "btr0cur.h"
47 #include "btr0sea.h"
48 #include "btr0pcur.h"
49 #include "rem0cmp.h"
50 #include "lock0lock.h"
51 #include "ibuf0ibuf.h"
52 #include "trx0trx.h"
53 #include "srv0mon.h"
54
/**************************************************************//**
Checks if the page in the cursor can be merged with given page.
If necessary, re-organize the merge_page.
(Forward declaration; the definition appears later in this file.)
@return TRUE if possible to merge. */
UNIV_INTERN
ibool
btr_can_merge_with_page(
/*====================*/
	btr_cur_t*	cursor,		/*!< in: cursor on the page to merge */
	ulint		page_no,	/*!< in: a sibling page */
	buf_block_t**	merge_block,	/*!< out: the merge block */
	mtr_t*		mtr);		/*!< in: mini-transaction */
67
68 #endif /* UNIV_HOTBACKUP */
69
/**************************************************************//**
Report that an index page is corrupted: write a diagnostic line
identifying the space, page and index to the error log, then dump
the page image(s) via buf_page_print(). */
UNIV_INTERN
void
btr_corruption_report(
/*==================*/
	const buf_block_t*	block,	/*!< in: corrupted block */
	const dict_index_t*	index)	/*!< in: index tree */
{
	fprintf(stderr, "InnoDB: flag mismatch in space %u page %u"
		" index %s of table %s\n",
		(unsigned) buf_block_get_space(block),
		(unsigned) buf_block_get_page_no(block),
		index->name, index->table_name);
	if (block->page.zip.data) {
		/* The block has a compressed page image as well:
		print it first, with BUF_PAGE_PRINT_NO_CRASH so that
		the uncompressed frame below is also printed. */
		buf_page_print(block->page.zip.data,
			       buf_block_get_zip_size(block),
			       BUF_PAGE_PRINT_NO_CRASH);
	}
	buf_page_print(buf_block_get_frame(block), 0, 0);
}
91
92 #ifndef UNIV_HOTBACKUP
93 #ifdef UNIV_BLOB_DEBUG
94 # include "srv0srv.h"
95 # include "ut0rbt.h"
96
97 /** TRUE when messages about index->blobs modification are enabled. */
98 static ibool btr_blob_dbg_msg;
99
/** Issue a message about an operation on index->blobs.
Prints the BLOB reference key (ref_page_no:ref_heap_no:ref_field_no),
the BLOB page number, the caller-supplied context string and the
ownership/deletion flags.
@param op operation (string literal, e.g. "insert" or "delete")
@param b the entry (btr_blob_dbg_t*) being subjected to the operation
@param ctx the context of the operation */
#define btr_blob_dbg_msg_issue(op, b, ctx)			\
	fprintf(stderr, op " %u:%u:%u->%u %s(%u,%u,%u)\n",	\
		(b)->ref_page_no, (b)->ref_heap_no,		\
		(b)->ref_field_no, (b)->blob_page_no, ctx,	\
		(b)->owner, (b)->always_owner, (b)->del)
109
/** Insert to index->blobs a reference to an off-page column.
@param index the index tree
@param b the reference
@param ctx context (for logging) */
UNIV_INTERN
void
btr_blob_dbg_rbt_insert(
/*====================*/
	dict_index_t*		index,	/*!< in/out: index tree */
	const btr_blob_dbg_t*	b,	/*!< in: the reference */
	const char*		ctx)	/*!< in: context (for logging) */
{
	if (btr_blob_dbg_msg) {
		btr_blob_dbg_msg_issue("insert", b, ctx);
	}
	/* index->blobs_mutex protects the index->blobs
	red-black tree of BLOB references. */
	mutex_enter(&index->blobs_mutex);
	rbt_insert(index->blobs, b, b);
	mutex_exit(&index->blobs_mutex);
}
129
/** Remove from index->blobs a reference to an off-page column.
@param index the index tree
@param b the reference
@param ctx context (for logging) */
UNIV_INTERN
void
btr_blob_dbg_rbt_delete(
/*====================*/
	dict_index_t*		index,	/*!< in/out: index tree */
	const btr_blob_dbg_t*	b,	/*!< in: the reference */
	const char*		ctx)	/*!< in: context (for logging) */
{
	if (btr_blob_dbg_msg) {
		btr_blob_dbg_msg_issue("delete", b, ctx);
	}
	mutex_enter(&index->blobs_mutex);
	/* The reference being removed must exist in the tree. */
	ut_a(rbt_delete(index->blobs, b));
	mutex_exit(&index->blobs_mutex);
}
149
150 /**************************************************************//**
151 Comparator for items (btr_blob_dbg_t) in index->blobs.
152 The key in index->blobs is (ref_page_no, ref_heap_no, ref_field_no).
153 @return negative, 0 or positive if *a<*b, *a=*b, *a>*b */
154 static
155 int
btr_blob_dbg_cmp(const void * a,const void * b)156 btr_blob_dbg_cmp(
157 /*=============*/
158 const void* a, /*!< in: first btr_blob_dbg_t to compare */
159 const void* b) /*!< in: second btr_blob_dbg_t to compare */
160 {
161 const btr_blob_dbg_t* aa = static_cast<const btr_blob_dbg_t*>(a);
162 const btr_blob_dbg_t* bb = static_cast<const btr_blob_dbg_t*>(b);
163
164 ut_ad(aa != NULL);
165 ut_ad(bb != NULL);
166
167 if (aa->ref_page_no != bb->ref_page_no) {
168 return(aa->ref_page_no < bb->ref_page_no ? -1 : 1);
169 }
170 if (aa->ref_heap_no != bb->ref_heap_no) {
171 return(aa->ref_heap_no < bb->ref_heap_no ? -1 : 1);
172 }
173 if (aa->ref_field_no != bb->ref_field_no) {
174 return(aa->ref_field_no < bb->ref_field_no ? -1 : 1);
175 }
176 return(0);
177 }
178
/**************************************************************//**
Add a reference to an off-page column to the index->blobs map.
The key of the new entry is (page number, heap number, field number)
of the given clustered index record; the value records the BLOB start
page and the ownership/deletion state. */
UNIV_INTERN
void
btr_blob_dbg_add_blob(
/*==================*/
	const rec_t*	rec,		/*!< in: clustered index record */
	ulint		field_no,	/*!< in: off-page column number */
	ulint		page_no,	/*!< in: start page of the column */
	dict_index_t*	index,		/*!< in/out: index tree */
	const char*	ctx)		/*!< in: context (for logging) */
{
	btr_blob_dbg_t	b;
	const page_t*	page	= page_align(rec);

	ut_a(index->blobs);

	b.blob_page_no = page_no;
	b.ref_page_no = page_get_page_no(page);
	b.ref_heap_no = page_rec_get_heap_no(rec);
	b.ref_field_no = field_no;
	/* Off-page columns cannot occur within the first
	n_uniq (unique key) fields of the record. */
	ut_a(b.ref_field_no >= index->n_uniq);
	/* A newly stored BLOB is owned by this record. */
	b.always_owner = b.owner = TRUE;
	b.del = FALSE;
	/* The record must not be delete-marked when the
	column is being stored. */
	ut_a(!rec_get_deleted_flag(rec, page_is_comp(page)));
	btr_blob_dbg_rbt_insert(index, &b, ctx);
}
206
/**************************************************************//**
Add to index->blobs any references to off-page columns from a record.
Skips columns whose BLOB pointer is still all-zero (not stored yet).
@return number of references added */
UNIV_INTERN
ulint
btr_blob_dbg_add_rec(
/*=================*/
	const rec_t*	rec,	/*!< in: record */
	dict_index_t*	index,	/*!< in/out: index */
	const ulint*	offsets,/*!< in: offsets */
	const char*	ctx)	/*!< in: context (for logging) */
{
	ulint		count	= 0;
	ulint		i;
	btr_blob_dbg_t	b;
	ibool		del;

	ut_ad(rec_offs_validate(rec, index, offsets));

	if (!rec_offs_any_extern(offsets)) {
		/* No externally stored columns: nothing to add. */
		return(0);
	}

	b.ref_page_no = page_get_page_no(page_align(rec));
	b.ref_heap_no = page_rec_get_heap_no(rec);
	/* All references added from this record inherit its
	delete-mark state. */
	del = (rec_get_deleted_flag(rec, rec_offs_comp(offsets)) != 0);

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (rec_offs_nth_extern(offsets, i)) {
			ulint		len;
			const byte*	field_ref = rec_get_nth_field(
				rec, offsets, i, &len);

			ut_a(len != UNIV_SQL_NULL);
			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
			/* The BLOB pointer occupies the last
			BTR_EXTERN_FIELD_REF_SIZE bytes of the field. */
			field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;

			if (!memcmp(field_ref, field_ref_zero,
				    BTR_EXTERN_FIELD_REF_SIZE)) {
				/* the column has not been stored yet */
				continue;
			}

			b.ref_field_no = i;
			b.blob_page_no = mach_read_from_4(
				field_ref + BTR_EXTERN_PAGE_NO);
			ut_a(b.ref_field_no >= index->n_uniq);
			/* The ownership flag bit is set when the
			record does NOT own the BLOB. */
			b.always_owner = b.owner
				= !(field_ref[BTR_EXTERN_LEN]
				    & BTR_EXTERN_OWNER_FLAG);
			b.del = del;

			btr_blob_dbg_rbt_insert(index, &b, ctx);
			count++;
		}
	}

	return(count);
}
266
/**************************************************************//**
Display the references to off-page columns.
This function is to be called from a debugger,
for example when a breakpoint on ut_dbg_assertion_failed is hit. */
UNIV_INTERN
void
btr_blob_dbg_print(
/*===============*/
	const dict_index_t*	index)	/*!< in: index tree */
{
	const ib_rbt_node_t*	node;

	if (!index->blobs) {
		return;
	}

	/* We intentionally do not acquire index->blobs_mutex here.
	This function is to be called from a debugger, and the caller
	should make sure that the index->blobs_mutex is held. */

	for (node = rbt_first(index->blobs);
	     node != NULL; node = rbt_next(index->blobs, node)) {
		const btr_blob_dbg_t*	b
			= rbt_value(btr_blob_dbg_t, node);
		/* Print each reference as key->blob_page_no,
		annotated with its ownership and deletion state. */
		fprintf(stderr, "%u:%u:%u->%u%s%s%s\n",
			b->ref_page_no, b->ref_heap_no, b->ref_field_no,
			b->blob_page_no,
			b->owner ? "" : "(disowned)",
			b->always_owner ? "" : "(has disowned)",
			b->del ? "(deleted)" : "");
	}
}
299
/**************************************************************//**
Remove from index->blobs any references to off-page columns from a record.
Columns that were never stored (all-zero pointer) or already freed
(page number FIL_NULL) are skipped.
@return number of references removed */
UNIV_INTERN
ulint
btr_blob_dbg_remove_rec(
/*====================*/
	const rec_t*	rec,	/*!< in: record */
	dict_index_t*	index,	/*!< in/out: index */
	const ulint*	offsets,/*!< in: offsets */
	const char*	ctx)	/*!< in: context (for logging) */
{
	ulint		i;
	ulint		count	= 0;
	btr_blob_dbg_t	b;

	ut_ad(rec_offs_validate(rec, index, offsets));

	if (!rec_offs_any_extern(offsets)) {
		/* No externally stored columns: nothing to remove. */
		return(0);
	}

	b.ref_page_no = page_get_page_no(page_align(rec));
	b.ref_heap_no = page_rec_get_heap_no(rec);

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (rec_offs_nth_extern(offsets, i)) {
			ulint		len;
			const byte*	field_ref = rec_get_nth_field(
				rec, offsets, i, &len);

			ut_a(len != UNIV_SQL_NULL);
			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
			/* The BLOB pointer occupies the last
			BTR_EXTERN_FIELD_REF_SIZE bytes of the field. */
			field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;

			b.ref_field_no = i;
			b.blob_page_no = mach_read_from_4(
				field_ref + BTR_EXTERN_PAGE_NO);

			switch (b.blob_page_no) {
			case 0:
				/* The column has not been stored yet.
				The BLOB pointer must be all zero.
				There cannot be a BLOB starting at
				page 0, because page 0 is reserved for
				the tablespace header. */
				ut_a(!memcmp(field_ref, field_ref_zero,
					     BTR_EXTERN_FIELD_REF_SIZE));
				/* fall through */
			case FIL_NULL:
				/* the column has been freed already */
				continue;
			}

			btr_blob_dbg_rbt_delete(index, &b, ctx);
			count++;
		}
	}

	return(count);
}
361
/**************************************************************//**
Check that there are no references to off-page columns from or to
the given page. Invoked when freeing or clearing a page.
Every matching reference is reported to stderr; the only tolerated
case is a disowned, delete-marked BLOB that starts on the page.
@return TRUE when no orphan references exist */
UNIV_INTERN
ibool
btr_blob_dbg_is_empty(
/*==================*/
	dict_index_t*	index,		/*!< in: index */
	ulint		page_no)	/*!< in: page number */
{
	const ib_rbt_node_t*	node;
	ibool			success	= TRUE;

	if (!index->blobs) {
		return(success);
	}

	mutex_enter(&index->blobs_mutex);

	for (node = rbt_first(index->blobs);
	     node != NULL; node = rbt_next(index->blobs, node)) {
		const btr_blob_dbg_t*	b
			= rbt_value(btr_blob_dbg_t, node);

		if (b->ref_page_no != page_no && b->blob_page_no != page_no) {
			/* The reference neither originates from nor
			points to the page being checked. */
			continue;
		}

		fprintf(stderr,
			"InnoDB: orphan BLOB ref%s%s%s %u:%u:%u->%u\n",
			b->owner ? "" : "(disowned)",
			b->always_owner ? "" : "(has disowned)",
			b->del ? "(deleted)" : "",
			b->ref_page_no, b->ref_heap_no, b->ref_field_no,
			b->blob_page_no);

		if (b->blob_page_no != page_no || b->owner || !b->del) {
			/* Fail unless this is a BLOB that starts on
			the page, is disowned, and is delete-marked. */
			success = FALSE;
		}
	}

	mutex_exit(&index->blobs_mutex);
	return(success);
}
407
/**************************************************************//**
Count and process all references to off-page columns on a page.
Applies the callback op to every user record from rec (or the start
of the page) up to, but not including, the page supremum.
@return number of references processed */
UNIV_INTERN
ulint
btr_blob_dbg_op(
/*============*/
	const page_t*		page,	/*!< in: B-tree leaf page */
	const rec_t*		rec,	/*!< in: record to start from
					(NULL to process the whole page) */
	dict_index_t*		index,	/*!< in/out: index */
	const char*		ctx,	/*!< in: context (for logging) */
	const btr_blob_dbg_op_f	op)	/*!< in: operation on records */
{
	ulint		count	= 0;
	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	rec_offs_init(offsets_);

	ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX);
	ut_a(!rec || page_align(rec) == page);

	if (!index->blobs || !page_is_leaf(page)
	    || !dict_index_is_clust(index)) {
		/* Off-page columns are tracked only on leaf pages
		of clustered indexes. */
		return(0);
	}

	if (rec == NULL) {
		rec = page_get_infimum_rec(page);
	}

	do {
		/* rec_get_offsets() may allocate from heap when the
		stack buffer offsets_ is too small. */
		offsets = rec_get_offsets(rec, index, offsets,
					  ULINT_UNDEFINED, &heap);
		count += op(rec, index, offsets, ctx);
		rec = page_rec_get_next_const(rec);
	} while (!page_rec_is_supremum(rec));

	if (heap) {
		mem_heap_free(heap);
	}

	return(count);
}
453
454 /**************************************************************//**
455 Count and add to index->blobs any references to off-page columns
456 from records on a page.
457 @return number of references added */
458 UNIV_INTERN
459 ulint
btr_blob_dbg_add(const page_t * page,dict_index_t * index,const char * ctx)460 btr_blob_dbg_add(
461 /*=============*/
462 const page_t* page, /*!< in: rewritten page */
463 dict_index_t* index, /*!< in/out: index */
464 const char* ctx) /*!< in: context (for logging) */
465 {
466 btr_blob_dbg_assert_empty(index, page_get_page_no(page));
467
468 return(btr_blob_dbg_op(page, NULL, index, ctx, btr_blob_dbg_add_rec));
469 }
470
471 /**************************************************************//**
472 Count and remove from index->blobs any references to off-page columns
473 from records on a page.
474 Used when reorganizing a page, before copying the records.
475 @return number of references removed */
476 UNIV_INTERN
477 ulint
btr_blob_dbg_remove(const page_t * page,dict_index_t * index,const char * ctx)478 btr_blob_dbg_remove(
479 /*================*/
480 const page_t* page, /*!< in: b-tree page */
481 dict_index_t* index, /*!< in/out: index */
482 const char* ctx) /*!< in: context (for logging) */
483 {
484 ulint count;
485
486 count = btr_blob_dbg_op(page, NULL, index, ctx,
487 btr_blob_dbg_remove_rec);
488
489 /* Check that no references exist. */
490 btr_blob_dbg_assert_empty(index, page_get_page_no(page));
491
492 return(count);
493 }
494
/**************************************************************//**
Restore in index->blobs any references to off-page columns.
Used when page reorganize fails due to compressed page overflow:
removes the references recorded for the failed new page and re-adds
those of the preserved original page copy.  The two pages must be
the same page (same space id and page number). */
UNIV_INTERN
void
btr_blob_dbg_restore(
/*=================*/
	const page_t*	npage,	/*!< in: page that failed to compress */
	const page_t*	page,	/*!< in: copy of original page */
	dict_index_t*	index,	/*!< in/out: index */
	const char*	ctx)	/*!< in: context (for logging) */
{
	ulint	removed;
	ulint	added;

	ut_a(page_get_page_no(npage) == page_get_page_no(page));
	ut_a(page_get_space_id(npage) == page_get_space_id(page));

	removed = btr_blob_dbg_remove(npage, index, ctx);
	added = btr_blob_dbg_add(page, index, ctx);
	/* The restore must be exact: both page images must carry
	the same number of BLOB references. */
	ut_a(added == removed);
}
517
/**************************************************************//**
Modify the 'deleted' flag of a record: update the del field of every
index->blobs entry that belongs to an externally stored column of
the record. */
UNIV_INTERN
void
btr_blob_dbg_set_deleted_flag(
/*==========================*/
	const rec_t*		rec,	/*!< in: record */
	dict_index_t*		index,	/*!< in/out: index */
	const ulint*		offsets,/*!< in: rec_get_offs(rec, index) */
	ibool			del)	/*!< in: TRUE=deleted, FALSE=exists */
{
	const ib_rbt_node_t*	node;
	btr_blob_dbg_t		b;
	btr_blob_dbg_t*		c;
	ulint			i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_a(dict_index_is_clust(index));
	ut_a(del == !!del);/* must be FALSE==0 or TRUE==1 */

	if (!rec_offs_any_extern(offsets) || !index->blobs) {

		return;
	}

	b.ref_page_no = page_get_page_no(page_align(rec));
	b.ref_heap_no = page_rec_get_heap_no(rec);

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (rec_offs_nth_extern(offsets, i)) {
			ulint		len;
			const byte*	field_ref = rec_get_nth_field(
				rec, offsets, i, &len);

			ut_a(len != UNIV_SQL_NULL);
			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
			/* The BLOB pointer occupies the last
			BTR_EXTERN_FIELD_REF_SIZE bytes of the field. */
			field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;

			b.ref_field_no = i;
			b.blob_page_no = mach_read_from_4(
				field_ref + BTR_EXTERN_PAGE_NO);

			switch (b.blob_page_no) {
			case 0:
				/* The pointer must not be all-zero;
				an unstored column would not reach
				this point. */
				ut_a(memcmp(field_ref, field_ref_zero,
					    BTR_EXTERN_FIELD_REF_SIZE));
				/* page number 0 is for the
				page allocation bitmap */
				/* fall through: page 0 is always
				an error here, like FIL_NULL */
			case FIL_NULL:
				/* the column has been freed already */
				ut_error;
			}

			mutex_enter(&index->blobs_mutex);
			node = rbt_lookup(index->blobs, &b);
			/* The reference must be tracked. */
			ut_a(node);

			c = rbt_value(btr_blob_dbg_t, node);
			/* The flag should be modified. */
			c->del = del;
			if (btr_blob_dbg_msg) {
				/* Copy the entry so the message can be
				issued outside the mutex. */
				b = *c;
				mutex_exit(&index->blobs_mutex);
				btr_blob_dbg_msg_issue("del_mk", &b, "");
			} else {
				mutex_exit(&index->blobs_mutex);
			}
		}
	}
}
588
/**************************************************************//**
Change the ownership of an off-page column: update the owner flag of
the corresponding index->blobs entry, and clear always_owner when
ownership is being given up. */
UNIV_INTERN
void
btr_blob_dbg_owner(
/*===============*/
	const rec_t*		rec,	/*!< in: record */
	dict_index_t*		index,	/*!< in/out: index */
	const ulint*		offsets,/*!< in: rec_get_offs(rec, index) */
	ulint			i,	/*!< in: ith field in rec */
	ibool			own)	/*!< in: TRUE=owned, FALSE=disowned */
{
	const ib_rbt_node_t*	node;
	btr_blob_dbg_t		b;
	const byte*		field_ref;
	ulint			len;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_a(rec_offs_nth_extern(offsets, i));

	field_ref = rec_get_nth_field(rec, offsets, i, &len);
	ut_a(len != UNIV_SQL_NULL);
	ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
	/* The BLOB pointer occupies the last
	BTR_EXTERN_FIELD_REF_SIZE bytes of the field. */
	field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;

	b.ref_page_no = page_get_page_no(page_align(rec));
	b.ref_heap_no = page_rec_get_heap_no(rec);
	b.ref_field_no = i;
	/* The ownership flag bit is set when the record does
	NOT own the BLOB. */
	b.owner = !(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG);
	b.blob_page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);

	/* The flag stored on the page must already reflect the
	requested ownership state. */
	ut_a(b.owner == own);

	mutex_enter(&index->blobs_mutex);
	node = rbt_lookup(index->blobs, &b);
	/* row_ins_clust_index_entry_by_modify() invokes
	btr_cur_unmark_extern_fields() also for the newly inserted
	references, which are all zero bytes until the columns are stored.
	The node lookup must fail if and only if that is the case. */
	ut_a(!memcmp(field_ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)
	     == !node);

	if (node) {
		btr_blob_dbg_t*	c = rbt_value(btr_blob_dbg_t, node);
		/* Some code sets ownership from TRUE to TRUE.
		We do not allow changing ownership from FALSE to FALSE. */
		ut_a(own || c->owner);

		c->owner = own;
		if (!own) {
			c->always_owner = FALSE;
		}
	}

	mutex_exit(&index->blobs_mutex);
}
645 #endif /* UNIV_BLOB_DEBUG */
646
647 /*
648 Latching strategy of the InnoDB B-tree
649 --------------------------------------
650 A tree latch protects all non-leaf nodes of the tree. Each node of a tree
651 also has a latch of its own.
652
653 A B-tree operation normally first acquires an S-latch on the tree. It
654 searches down the tree and releases the tree latch when it has the
655 leaf node latch. To save CPU time we do not acquire any latch on
656 non-leaf nodes of the tree during a search, those pages are only bufferfixed.
657
658 If an operation needs to restructure the tree, it acquires an X-latch on
659 the tree before searching to a leaf node. If it needs, for example, to
660 split a leaf,
661 (1) InnoDB decides the split point in the leaf,
662 (2) allocates a new page,
663 (3) inserts the appropriate node pointer to the first non-leaf level,
664 (4) releases the tree X-latch,
(5) and then moves records from the leaf to the newly allocated page.
666
667 Node pointers
668 -------------
669 Leaf pages of a B-tree contain the index records stored in the
670 tree. On levels n > 0 we store 'node pointers' to pages on level
671 n - 1. For each page there is exactly one node pointer stored:
thus our tree is an ordinary B-tree, not a B-link tree.
673
674 A node pointer contains a prefix P of an index record. The prefix
675 is long enough so that it determines an index record uniquely.
676 The file page number of the child page is added as the last
677 field. To the child page we can store node pointers or index records
678 which are >= P in the alphabetical order, but < P1 if there is
679 a next node pointer on the level, and P1 is its prefix.
680
681 If a node pointer with a prefix P points to a non-leaf child,
682 then the leftmost record in the child must have the same
683 prefix P. If it points to a leaf node, the child is not required
684 to contain any record with a prefix equal to P. The leaf case
685 is decided this way to allow arbitrary deletions in a leaf node
686 without touching upper levels of the tree.
687
688 We have predefined a special minimum record which we
689 define as the smallest record in any alphabetical order.
690 A minimum record is denoted by setting a bit in the record
691 header. A minimum record acts as the prefix of a node pointer
692 which points to a leftmost node on any level of the tree.
693
694 File page allocation
695 --------------------
696 In the root node of a B-tree there are two file segment headers.
697 The leaf pages of a tree are allocated from one file segment, to
698 make them consecutive on disk if possible. From the other file segment
699 we allocate pages for the non-leaf levels of the tree.
700 */
701
#ifdef UNIV_BTR_DEBUG
/**************************************************************//**
Checks a file segment header within a B-tree root page: the header
must refer to the expected tablespace and its byte offset must lie
within the usable data area of a page.  Failure is fatal (ut_a).
@return TRUE if valid */
static
ibool
btr_root_fseg_validate(
/*===================*/
	const fseg_header_t*	seg_header,	/*!< in: segment header */
	ulint			space)		/*!< in: tablespace identifier */
{
	ulint	offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);

	ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space);
	/* The offset must point inside the page payload area:
	past the page header and before the page trailer. */
	ut_a(offset >= FIL_PAGE_DATA);
	ut_a(offset <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END);
	return(TRUE);
}
#endif /* UNIV_BTR_DEBUG */
721
/**************************************************************//**
Gets the root node of a tree and x- or s-latches it.
The root page number, tablespace and compressed page size are read
from the index dictionary object.
@return root page, x- or s-latched */
static
buf_block_t*
btr_root_block_get(
/*===============*/
	const dict_index_t*	index,	/*!< in: index tree */
	ulint			mode,	/*!< in: either RW_S_LATCH
					or RW_X_LATCH */
	mtr_t*			mtr)	/*!< in: mtr */
{
	ulint		space;
	ulint		zip_size;
	ulint		root_page_no;
	buf_block_t*	block;

	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);
	root_page_no = dict_index_get_page(index);

	block = btr_block_get(space, zip_size, root_page_no, mode, index, mtr);
	btr_assert_not_corrupted(block, index);
#ifdef UNIV_BTR_DEBUG
	if (!dict_index_is_ibuf(index)) {
		/* Validate both file segment headers stored in the
		root page: one for the leaf level, one for the
		non-leaf levels. */
		const page_t*	root = buf_block_get_frame(block);

		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
					    + root, space));
		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
					    + root, space));
	}
#endif /* UNIV_BTR_DEBUG */

	return(block);
}
758
759 /**************************************************************//**
760 Gets the root node of a tree and x-latches it.
761 @return root page, x-latched */
762 UNIV_INTERN
763 page_t*
btr_root_get(const dict_index_t * index,mtr_t * mtr)764 btr_root_get(
765 /*=========*/
766 const dict_index_t* index, /*!< in: index tree */
767 mtr_t* mtr) /*!< in: mtr */
768 {
769 return(buf_block_get_frame(btr_root_block_get(index, RW_X_LATCH,
770 mtr)));
771 }
772
/**************************************************************//**
Gets the height of the B-tree (the level of the root, when the leaf
level is assumed to be 0). The caller must hold an S or X latch on
the index.
@return tree height (level of the root) */
UNIV_INTERN
ulint
btr_height_get(
/*===========*/
	dict_index_t*	index,	/*!< in: index tree */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	ulint		height;
	buf_block_t*	root_block;

	/* The caller must hold the index lock in S or X mode. */
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_S_LOCK)
	      || mtr_memo_contains(mtr, dict_index_get_lock(index),
				   MTR_MEMO_X_LOCK));

	/* S latches the page */
	root_block = btr_root_block_get(index, RW_S_LATCH, mtr);

	height = btr_page_get_level(buf_block_get_frame(root_block), mtr);

	/* Release the S latch on the root page. */
	mtr_memo_release(mtr, root_block, MTR_MEMO_PAGE_S_FIX);
#ifdef UNIV_SYNC_DEBUG
	sync_thread_reset_level(&root_block->lock);
#endif /* UNIV_SYNC_DEBUG */

	return(height);
}
806
/**************************************************************//**
Checks a file segment header within a B-tree root page and updates
the segment header space id.  Unlike btr_root_fseg_validate(), an
invalid offset is reported by the return value instead of asserting.
@return TRUE if valid */
static
bool
btr_root_fseg_adjust_on_import(
/*===========================*/
	fseg_header_t*	seg_header,	/*!< in/out: segment header */
	page_zip_des_t*	page_zip,	/*!< in/out: compressed page,
					or NULL */
	ulint		space,		/*!< in: tablespace identifier */
	mtr_t*		mtr)		/*!< in/out: mini-transaction */
{
	ulint	offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);

	if (offset < FIL_PAGE_DATA
	    || offset > UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) {

		/* The header offset lies outside the usable page
		data area: reject the page. */
		return(FALSE);

	} else if (page_zip) {
		/* Compressed page: update the uncompressed frame and
		log the change through the page_zip interface. */
		mach_write_to_4(seg_header + FSEG_HDR_SPACE, space);
		page_zip_write_header(page_zip, seg_header + FSEG_HDR_SPACE,
				      4, mtr);
	} else {
		mlog_write_ulint(seg_header + FSEG_HDR_SPACE,
				 space, MLOG_4BYTES, mtr);
	}

	return(TRUE);
}
839
/**************************************************************//**
Checks and adjusts the root node of a tree during IMPORT TABLESPACE:
verifies that the root is a sibling-less B-tree page whose format and
flags match the table, and rewrites the space id in both file segment
headers to the importing tablespace.
@return error code, or DB_SUCCESS */
UNIV_INTERN
dberr_t
btr_root_adjust_on_import(
/*======================*/
	const dict_index_t*	index)	/*!< in: index tree */
{
	dberr_t		err;
	mtr_t		mtr;
	page_t*		page;
	buf_block_t*	block;
	page_zip_des_t*	page_zip;
	dict_table_t*	table		= index->table;
	ulint		space_id	= dict_index_get_space(index);
	ulint		zip_size	= dict_table_zip_size(table);
	ulint		root_page_no	= dict_index_get_page(index);

	mtr_start(&mtr);

	/* No redo logging for this mini-transaction. */
	mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);

	DBUG_EXECUTE_IF("ib_import_trigger_corruption_3",
			return(DB_CORRUPTION););

	block = btr_block_get(
		space_id, zip_size, root_page_no, RW_X_LATCH, index, &mtr);

	page = buf_block_get_frame(block);
	page_zip = buf_block_get_page_zip(block);

	/* Check that this is a B-tree page and both the PREV and NEXT
	pointers are FIL_NULL, because the root page does not have any
	siblings. */
	if (fil_page_get_type(page) != FIL_PAGE_INDEX
	    || fil_page_get_prev(page) != FIL_NULL
	    || fil_page_get_next(page) != FIL_NULL) {

		err = DB_CORRUPTION;

	} else if (dict_index_is_clust(index)) {
		bool	page_is_compact_format;

		page_is_compact_format = page_is_comp(page) > 0;

		/* Check if the page format and table format agree. */
		if (page_is_compact_format != dict_table_is_comp(table)) {
			err = DB_CORRUPTION;
		} else {

			/* Check that the table flags and the tablespace
			flags match. */
			ulint	flags = fil_space_get_flags(table->space);

			if (flags
			    && flags != dict_tf_to_fsp_flags(table->flags)) {

				err = DB_CORRUPTION;
			} else {
				err = DB_SUCCESS;
			}
		}
	} else {
		/* Secondary indexes: no further format checks here. */
		err = DB_SUCCESS;
	}

	/* Check and adjust the file segment headers, if all OK so far. */
	if (err == DB_SUCCESS
	    && (!btr_root_fseg_adjust_on_import(
			FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
			+ page, page_zip, space_id, &mtr)
		|| !btr_root_fseg_adjust_on_import(
			FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
			+ page, page_zip, space_id, &mtr))) {

		err = DB_CORRUPTION;
	}

	mtr_commit(&mtr);

	return(err);
}
923
/*************************************************************//**
Gets pointer to the previous user record in the tree. It is assumed that
the caller has appropriate latches on the page and its neighbor.
If the record has a predecessor on the same page, that record is
returned directly; otherwise the last user record of the previous
page (if any) is returned.
@return previous user record, NULL if there is none */
UNIV_INTERN
rec_t*
btr_get_prev_user_rec(
/*==================*/
	rec_t*	rec,	/*!< in: record on leaf level */
	mtr_t*	mtr)	/*!< in: mtr holding a latch on the page, and if
			needed, also to the previous page */
{
	page_t*	page;
	page_t*	prev_page;
	ulint	prev_page_no;

	if (!page_rec_is_infimum(rec)) {

		rec_t*	prev_rec = page_rec_get_prev(rec);

		if (!page_rec_is_infimum(prev_rec)) {

			/* The predecessor is a user record on the
			same page. */
			return(prev_rec);
		}
	}

	page = page_align(rec);
	prev_page_no = btr_page_get_prev(page, mtr);

	if (prev_page_no != FIL_NULL) {

		ulint		space;
		ulint		zip_size;
		buf_block_t*	prev_block;

		space = page_get_space_id(page);
		zip_size = fil_space_get_zip_size(space);

		/* Fetch without latching: the caller already holds
		the required latch (checked by the debug assertion
		below). */
		prev_block = buf_page_get_with_no_latch(space, zip_size,
							prev_page_no, mtr);
		prev_page = buf_block_get_frame(prev_block);
		/* The caller must already have a latch to the brother */
		ut_ad(mtr_memo_contains(mtr, prev_block,
					MTR_MEMO_PAGE_S_FIX)
		      || mtr_memo_contains(mtr, prev_block,
					   MTR_MEMO_PAGE_X_FIX));
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(prev_page) == page_is_comp(page));
		ut_a(btr_page_get_next(prev_page, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		/* Return the last user record of the previous page. */
		return(page_rec_get_prev(page_get_supremum_rec(prev_page)));
	}

	return(NULL);
}
981
982 /*************************************************************//**
983 Gets pointer to the next user record in the tree. It is assumed that the
984 caller has appropriate latches on the page and its neighbor.
985 @return next user record, NULL if there is none */
986 UNIV_INTERN
987 rec_t*
btr_get_next_user_rec(rec_t * rec,mtr_t * mtr)988 btr_get_next_user_rec(
989 /*==================*/
990 rec_t* rec, /*!< in: record on leaf level */
991 mtr_t* mtr) /*!< in: mtr holding a latch on the page, and if
992 needed, also to the next page */
993 {
994 page_t* page;
995 page_t* next_page;
996 ulint next_page_no;
997
998 if (!page_rec_is_supremum(rec)) {
999
1000 rec_t* next_rec = page_rec_get_next(rec);
1001
1002 if (!page_rec_is_supremum(next_rec)) {
1003
1004 return(next_rec);
1005 }
1006 }
1007
1008 page = page_align(rec);
1009 next_page_no = btr_page_get_next(page, mtr);
1010
1011 if (next_page_no != FIL_NULL) {
1012 ulint space;
1013 ulint zip_size;
1014 buf_block_t* next_block;
1015
1016 space = page_get_space_id(page);
1017 zip_size = fil_space_get_zip_size(space);
1018
1019 next_block = buf_page_get_with_no_latch(space, zip_size,
1020 next_page_no, mtr);
1021 next_page = buf_block_get_frame(next_block);
1022 /* The caller must already have a latch to the brother */
1023 ut_ad(mtr_memo_contains(mtr, next_block, MTR_MEMO_PAGE_S_FIX)
1024 || mtr_memo_contains(mtr, next_block,
1025 MTR_MEMO_PAGE_X_FIX));
1026 #ifdef UNIV_BTR_DEBUG
1027 ut_a(page_is_comp(next_page) == page_is_comp(page));
1028 ut_a(btr_page_get_prev(next_page, mtr)
1029 == page_get_page_no(page));
1030 #endif /* UNIV_BTR_DEBUG */
1031
1032 return(page_rec_get_next(page_get_infimum_rec(next_page)));
1033 }
1034
1035 return(NULL);
1036 }
1037
1038 /**************************************************************//**
1039 Creates a new index page (not the root, and also not
1040 used in page reorganization). @see btr_page_empty(). */
1041 static
1042 void
btr_page_create(buf_block_t * block,page_zip_des_t * page_zip,dict_index_t * index,ulint level,mtr_t * mtr)1043 btr_page_create(
1044 /*============*/
1045 buf_block_t* block, /*!< in/out: page to be created */
1046 page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
1047 dict_index_t* index, /*!< in: index */
1048 ulint level, /*!< in: the B-tree level of the page */
1049 mtr_t* mtr) /*!< in: mtr */
1050 {
1051 page_t* page = buf_block_get_frame(block);
1052
1053 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1054 btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));
1055
1056 if (page_zip) {
1057 page_create_zip(block, index, level, 0, mtr);
1058 } else {
1059 page_create(block, mtr, dict_table_is_comp(index->table));
1060 /* Set the level of the new index page */
1061 btr_page_set_level(page, NULL, level, mtr);
1062 }
1063
1064 block->check_index_page_at_flush = TRUE;
1065
1066 btr_page_set_index_id(page, page_zip, index->id, mtr);
1067 }
1068
1069 /**************************************************************//**
1070 Allocates a new file page to be used in an ibuf tree. Takes the page from
1071 the free list of the tree, which must contain pages!
1072 @return new allocated block, x-latched */
1073 static
1074 buf_block_t*
btr_page_alloc_for_ibuf(dict_index_t * index,mtr_t * mtr)1075 btr_page_alloc_for_ibuf(
1076 /*====================*/
1077 dict_index_t* index, /*!< in: index tree */
1078 mtr_t* mtr) /*!< in: mtr */
1079 {
1080 fil_addr_t node_addr;
1081 page_t* root;
1082 page_t* new_page;
1083 buf_block_t* new_block;
1084
1085 root = btr_root_get(index, mtr);
1086
1087 node_addr = flst_get_first(root + PAGE_HEADER
1088 + PAGE_BTR_IBUF_FREE_LIST, mtr);
1089 ut_a(node_addr.page != FIL_NULL);
1090
1091 new_block = buf_page_get(dict_index_get_space(index),
1092 dict_table_zip_size(index->table),
1093 node_addr.page, RW_X_LATCH, mtr);
1094 new_page = buf_block_get_frame(new_block);
1095 buf_block_dbg_add_level(new_block, SYNC_IBUF_TREE_NODE_NEW);
1096
1097 flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
1098 new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
1099 mtr);
1100 ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
1101 mtr));
1102
1103 return(new_block);
1104 }
1105
1106 /**************************************************************//**
1107 Allocates a new file page to be used in an index tree. NOTE: we assume
1108 that the caller has made the reservation for free extents!
1109 @retval NULL if no page could be allocated
1110 @retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
1111 (init_mtr == mtr, or the page was not previously freed in mtr)
1112 @retval block (not allocated or initialized) otherwise */
1113 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1114 buf_block_t*
btr_page_alloc_low(dict_index_t * index,ulint hint_page_no,byte file_direction,ulint level,mtr_t * mtr,mtr_t * init_mtr)1115 btr_page_alloc_low(
1116 /*===============*/
1117 dict_index_t* index, /*!< in: index */
1118 ulint hint_page_no, /*!< in: hint of a good page */
1119 byte file_direction, /*!< in: direction where a possible
1120 page split is made */
1121 ulint level, /*!< in: level where the page is placed
1122 in the tree */
1123 mtr_t* mtr, /*!< in/out: mini-transaction
1124 for the allocation */
1125 mtr_t* init_mtr) /*!< in/out: mtr or another
1126 mini-transaction in which the
1127 page should be initialized.
1128 If init_mtr!=mtr, but the page
1129 is already X-latched in mtr, do
1130 not initialize the page. */
1131 {
1132 fseg_header_t* seg_header;
1133 page_t* root;
1134
1135 root = btr_root_get(index, mtr);
1136
1137 if (level == 0) {
1138 seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
1139 } else {
1140 seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
1141 }
1142
1143 /* Parameter TRUE below states that the caller has made the
1144 reservation for free extents, and thus we know that a page can
1145 be allocated: */
1146
1147 return(fseg_alloc_free_page_general(
1148 seg_header, hint_page_no, file_direction,
1149 TRUE, mtr, init_mtr));
1150 }
1151
1152 /**************************************************************//**
1153 Allocates a new file page to be used in an index tree. NOTE: we assume
1154 that the caller has made the reservation for free extents!
1155 @retval NULL if no page could be allocated
1156 @retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
1157 (init_mtr == mtr, or the page was not previously freed in mtr)
1158 @retval block (not allocated or initialized) otherwise */
1159 UNIV_INTERN
1160 buf_block_t*
btr_page_alloc(dict_index_t * index,ulint hint_page_no,byte file_direction,ulint level,mtr_t * mtr,mtr_t * init_mtr)1161 btr_page_alloc(
1162 /*===========*/
1163 dict_index_t* index, /*!< in: index */
1164 ulint hint_page_no, /*!< in: hint of a good page */
1165 byte file_direction, /*!< in: direction where a possible
1166 page split is made */
1167 ulint level, /*!< in: level where the page is placed
1168 in the tree */
1169 mtr_t* mtr, /*!< in/out: mini-transaction
1170 for the allocation */
1171 mtr_t* init_mtr) /*!< in/out: mini-transaction
1172 for x-latching and initializing
1173 the page */
1174 {
1175 buf_block_t* new_block;
1176
1177 if (dict_index_is_ibuf(index)) {
1178
1179 return(btr_page_alloc_for_ibuf(index, mtr));
1180 }
1181
1182 new_block = btr_page_alloc_low(
1183 index, hint_page_no, file_direction, level, mtr, init_mtr);
1184
1185 if (new_block) {
1186 buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
1187 }
1188
1189 return(new_block);
1190 }
1191
1192 /**************************************************************//**
1193 Gets the number of pages in a B-tree.
1194 @return number of pages, or ULINT_UNDEFINED if the index is unavailable */
1195 UNIV_INTERN
1196 ulint
btr_get_size(dict_index_t * index,ulint flag,mtr_t * mtr)1197 btr_get_size(
1198 /*=========*/
1199 dict_index_t* index, /*!< in: index */
1200 ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
1201 mtr_t* mtr) /*!< in/out: mini-transaction where index
1202 is s-latched */
1203 {
1204 fseg_header_t* seg_header;
1205 page_t* root;
1206 ulint n;
1207 ulint dummy;
1208
1209 ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
1210 MTR_MEMO_S_LOCK));
1211
1212 if (index->page == FIL_NULL || dict_index_is_online_ddl(index)
1213 || *index->name == TEMP_INDEX_PREFIX) {
1214 return(ULINT_UNDEFINED);
1215 }
1216
1217 root = btr_root_get(index, mtr);
1218
1219 if (flag == BTR_N_LEAF_PAGES) {
1220 seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
1221
1222 fseg_n_reserved_pages(seg_header, &n, mtr);
1223
1224 } else if (flag == BTR_TOTAL_SIZE) {
1225 seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
1226
1227 n = fseg_n_reserved_pages(seg_header, &dummy, mtr);
1228
1229 seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
1230
1231 n += fseg_n_reserved_pages(seg_header, &dummy, mtr);
1232 } else {
1233 ut_error;
1234 }
1235
1236 return(n);
1237 }
1238
1239 /**************************************************************//**
1240 Frees a page used in an ibuf tree. Puts the page to the free list of the
1241 ibuf tree. */
1242 static
1243 void
btr_page_free_for_ibuf(dict_index_t * index,buf_block_t * block,mtr_t * mtr)1244 btr_page_free_for_ibuf(
1245 /*===================*/
1246 dict_index_t* index, /*!< in: index tree */
1247 buf_block_t* block, /*!< in: block to be freed, x-latched */
1248 mtr_t* mtr) /*!< in: mtr */
1249 {
1250 page_t* root;
1251
1252 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1253 root = btr_root_get(index, mtr);
1254
1255 flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
1256 buf_block_get_frame(block)
1257 + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
1258
1259 ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
1260 mtr));
1261 }
1262
1263 /**************************************************************//**
1264 Frees a file page used in an index tree. Can be used also to (BLOB)
1265 external storage pages, because the page level 0 can be given as an
1266 argument. */
1267 UNIV_INTERN
1268 void
btr_page_free_low(dict_index_t * index,buf_block_t * block,ulint level,mtr_t * mtr)1269 btr_page_free_low(
1270 /*==============*/
1271 dict_index_t* index, /*!< in: index tree */
1272 buf_block_t* block, /*!< in: block to be freed, x-latched */
1273 ulint level, /*!< in: page level */
1274 mtr_t* mtr) /*!< in: mtr */
1275 {
1276 fseg_header_t* seg_header;
1277 page_t* root;
1278
1279 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1280 /* The page gets invalid for optimistic searches: increment the frame
1281 modify clock */
1282
1283 buf_block_modify_clock_inc(block);
1284 btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));
1285
1286 if (dict_index_is_ibuf(index)) {
1287
1288 btr_page_free_for_ibuf(index, block, mtr);
1289
1290 return;
1291 }
1292
1293 root = btr_root_get(index, mtr);
1294
1295 if (level == 0) {
1296 seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
1297 } else {
1298 seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
1299 }
1300
1301 fseg_free_page(seg_header,
1302 buf_block_get_space(block),
1303 buf_block_get_page_no(block), mtr);
1304
1305 /* The page was marked free in the allocation bitmap, but it
1306 should remain buffer-fixed until mtr_commit(mtr) or until it
1307 is explicitly freed from the mini-transaction. */
1308 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1309 /* TODO: Discard any operations on the page from the redo log
1310 and remove the block from the flush list and the buffer pool.
1311 This would free up buffer pool earlier and reduce writes to
1312 both the tablespace and the redo log. */
1313 }
1314
1315 /**************************************************************//**
1316 Frees a file page used in an index tree. NOTE: cannot free field external
1317 storage pages because the page must contain info on its level. */
1318 UNIV_INTERN
1319 void
btr_page_free(dict_index_t * index,buf_block_t * block,mtr_t * mtr)1320 btr_page_free(
1321 /*==========*/
1322 dict_index_t* index, /*!< in: index tree */
1323 buf_block_t* block, /*!< in: block to be freed, x-latched */
1324 mtr_t* mtr) /*!< in: mtr */
1325 {
1326 const page_t* page = buf_block_get_frame(block);
1327 ulint level = btr_page_get_level(page, mtr);
1328
1329 ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_INDEX);
1330 btr_page_free_low(index, block, level, mtr);
1331 }
1332
1333 /**************************************************************//**
1334 Sets the child node file address in a node pointer. */
1335 UNIV_INLINE
1336 void
btr_node_ptr_set_child_page_no(rec_t * rec,page_zip_des_t * page_zip,const ulint * offsets,ulint page_no,mtr_t * mtr)1337 btr_node_ptr_set_child_page_no(
1338 /*===========================*/
1339 rec_t* rec, /*!< in: node pointer record */
1340 page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
1341 part will be updated, or NULL */
1342 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
1343 ulint page_no,/*!< in: child node address */
1344 mtr_t* mtr) /*!< in: mtr */
1345 {
1346 byte* field;
1347 ulint len;
1348
1349 ut_ad(rec_offs_validate(rec, NULL, offsets));
1350 ut_ad(!page_is_leaf(page_align(rec)));
1351 ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
1352
1353 /* The child address is in the last field */
1354 field = rec_get_nth_field(rec, offsets,
1355 rec_offs_n_fields(offsets) - 1, &len);
1356
1357 ut_ad(len == REC_NODE_PTR_SIZE);
1358
1359 if (page_zip) {
1360 page_zip_write_node_ptr(page_zip, rec,
1361 rec_offs_data_size(offsets),
1362 page_no, mtr);
1363 } else {
1364 mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr);
1365 }
1366 }
1367
1368 /************************************************************//**
1369 Returns the child page of a node pointer and x-latches it.
1370 @return child page, x-latched */
1371 static
1372 buf_block_t*
btr_node_ptr_get_child(const rec_t * node_ptr,dict_index_t * index,const ulint * offsets,mtr_t * mtr)1373 btr_node_ptr_get_child(
1374 /*===================*/
1375 const rec_t* node_ptr,/*!< in: node pointer */
1376 dict_index_t* index, /*!< in: index */
1377 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
1378 mtr_t* mtr) /*!< in: mtr */
1379 {
1380 ulint page_no;
1381 ulint space;
1382
1383 ut_ad(rec_offs_validate(node_ptr, index, offsets));
1384 space = page_get_space_id(page_align(node_ptr));
1385 page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
1386
1387 return(btr_block_get(space, dict_table_zip_size(index->table),
1388 page_no, RW_X_LATCH, index, mtr));
1389 }
1390
1391 /************************************************************//**
1392 Returns the upper level node pointer to a page. It is assumed that mtr holds
1393 an x-latch on the tree.
1394 @return rec_get_offsets() of the node pointer record */
1395 static
1396 ulint*
btr_page_get_father_node_ptr_func(ulint * offsets,mem_heap_t * heap,btr_cur_t * cursor,const char * file,ulint line,mtr_t * mtr)1397 btr_page_get_father_node_ptr_func(
1398 /*==============================*/
1399 ulint* offsets,/*!< in: work area for the return value */
1400 mem_heap_t* heap, /*!< in: memory heap to use */
1401 btr_cur_t* cursor, /*!< in: cursor pointing to user record,
1402 out: cursor on node pointer record,
1403 its page x-latched */
1404 const char* file, /*!< in: file name */
1405 ulint line, /*!< in: line where called */
1406 mtr_t* mtr) /*!< in: mtr */
1407 {
1408 dtuple_t* tuple;
1409 rec_t* user_rec;
1410 rec_t* node_ptr;
1411 ulint level;
1412 ulint page_no;
1413 dict_index_t* index;
1414
1415 page_no = buf_block_get_page_no(btr_cur_get_block(cursor));
1416 index = btr_cur_get_index(cursor);
1417
1418 ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
1419 MTR_MEMO_X_LOCK));
1420
1421 ut_ad(dict_index_get_page(index) != page_no);
1422
1423 level = btr_page_get_level(btr_cur_get_page(cursor), mtr);
1424
1425 user_rec = btr_cur_get_rec(cursor);
1426 ut_a(page_rec_is_user_rec(user_rec));
1427 tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);
1428
1429 btr_cur_search_to_nth_level(index, level + 1, tuple, PAGE_CUR_LE,
1430 BTR_CONT_MODIFY_TREE, cursor, 0,
1431 file, line, mtr);
1432
1433 node_ptr = btr_cur_get_rec(cursor);
1434 ut_ad(!page_rec_is_comp(node_ptr)
1435 || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR);
1436 offsets = rec_get_offsets(node_ptr, index, offsets,
1437 ULINT_UNDEFINED, &heap);
1438
1439 if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) {
1440 rec_t* print_rec;
1441 fputs("InnoDB: Dump of the child page:\n", stderr);
1442 buf_page_print(page_align(user_rec), 0,
1443 BUF_PAGE_PRINT_NO_CRASH);
1444 fputs("InnoDB: Dump of the parent page:\n", stderr);
1445 buf_page_print(page_align(node_ptr), 0,
1446 BUF_PAGE_PRINT_NO_CRASH);
1447
1448 fputs("InnoDB: Corruption of an index tree: table ", stderr);
1449 ut_print_name(stderr, NULL, TRUE, index->table_name);
1450 fputs(", index ", stderr);
1451 ut_print_name(stderr, NULL, FALSE, index->name);
1452 fprintf(stderr, ",\n"
1453 "InnoDB: father ptr page no %lu, child page no %lu\n",
1454 (ulong)
1455 btr_node_ptr_get_child_page_no(node_ptr, offsets),
1456 (ulong) page_no);
1457 print_rec = page_rec_get_next(
1458 page_get_infimum_rec(page_align(user_rec)));
1459 offsets = rec_get_offsets(print_rec, index,
1460 offsets, ULINT_UNDEFINED, &heap);
1461 page_rec_print(print_rec, offsets);
1462 offsets = rec_get_offsets(node_ptr, index, offsets,
1463 ULINT_UNDEFINED, &heap);
1464 page_rec_print(node_ptr, offsets);
1465
1466 fputs("InnoDB: You should dump + drop + reimport the table"
1467 " to fix the\n"
1468 "InnoDB: corruption. If the crash happens at "
1469 "the database startup, see\n"
1470 "InnoDB: " REFMAN "forcing-innodb-recovery.html about\n"
1471 "InnoDB: forcing recovery. "
1472 "Then dump + drop + reimport.\n", stderr);
1473
1474 ut_error;
1475 }
1476
1477 return(offsets);
1478 }
1479
1480 #define btr_page_get_father_node_ptr(of,heap,cur,mtr) \
1481 btr_page_get_father_node_ptr_func(of,heap,cur,__FILE__,__LINE__,mtr)
1482
1483 /************************************************************//**
1484 Returns the upper level node pointer to a page. It is assumed that mtr holds
1485 an x-latch on the tree.
1486 @return rec_get_offsets() of the node pointer record */
1487 static
1488 ulint*
btr_page_get_father_block(ulint * offsets,mem_heap_t * heap,dict_index_t * index,buf_block_t * block,mtr_t * mtr,btr_cur_t * cursor)1489 btr_page_get_father_block(
1490 /*======================*/
1491 ulint* offsets,/*!< in: work area for the return value */
1492 mem_heap_t* heap, /*!< in: memory heap to use */
1493 dict_index_t* index, /*!< in: b-tree index */
1494 buf_block_t* block, /*!< in: child page in the index */
1495 mtr_t* mtr, /*!< in: mtr */
1496 btr_cur_t* cursor) /*!< out: cursor on node pointer record,
1497 its page x-latched */
1498 {
1499 rec_t* rec
1500 = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
1501 block)));
1502 btr_cur_position(index, rec, block, cursor);
1503 return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr));
1504 }
1505
1506 /************************************************************//**
1507 Seeks to the upper level node pointer to a page.
1508 It is assumed that mtr holds an x-latch on the tree. */
1509 static
1510 void
btr_page_get_father(dict_index_t * index,buf_block_t * block,mtr_t * mtr,btr_cur_t * cursor)1511 btr_page_get_father(
1512 /*================*/
1513 dict_index_t* index, /*!< in: b-tree index */
1514 buf_block_t* block, /*!< in: child page in the index */
1515 mtr_t* mtr, /*!< in: mtr */
1516 btr_cur_t* cursor) /*!< out: cursor on node pointer record,
1517 its page x-latched */
1518 {
1519 mem_heap_t* heap;
1520 rec_t* rec
1521 = page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
1522 block)));
1523 btr_cur_position(index, rec, block, cursor);
1524
1525 heap = mem_heap_create(100);
1526 btr_page_get_father_node_ptr(NULL, heap, cursor, mtr);
1527 mem_heap_free(heap);
1528 }
1529
1530 /************************************************************//**
1531 Creates the root node for a new index tree.
1532 @return page number of the created root, FIL_NULL if did not succeed */
1533 UNIV_INTERN
1534 ulint
btr_create(ulint type,ulint space,ulint zip_size,index_id_t index_id,dict_index_t * index,mtr_t * mtr)1535 btr_create(
1536 /*=======*/
1537 ulint type, /*!< in: type of the index */
1538 ulint space, /*!< in: space where created */
1539 ulint zip_size,/*!< in: compressed page size in bytes
1540 or 0 for uncompressed pages */
1541 index_id_t index_id,/*!< in: index id */
1542 dict_index_t* index, /*!< in: index */
1543 mtr_t* mtr) /*!< in: mini-transaction handle */
1544 {
1545 ulint page_no;
1546 buf_block_t* block;
1547 buf_frame_t* frame;
1548 page_t* page;
1549 page_zip_des_t* page_zip;
1550
1551 /* Create the two new segments (one, in the case of an ibuf tree) for
1552 the index tree; the segment headers are put on the allocated root page
1553 (for an ibuf tree, not in the root, but on a separate ibuf header
1554 page) */
1555
1556 if (type & DICT_IBUF) {
1557 /* Allocate first the ibuf header page */
1558 buf_block_t* ibuf_hdr_block = fseg_create(
1559 space, 0,
1560 IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);
1561
1562 buf_block_dbg_add_level(
1563 ibuf_hdr_block, SYNC_IBUF_TREE_NODE_NEW);
1564
1565 ut_ad(buf_block_get_page_no(ibuf_hdr_block)
1566 == IBUF_HEADER_PAGE_NO);
1567 /* Allocate then the next page to the segment: it will be the
1568 tree root page */
1569
1570 block = fseg_alloc_free_page(
1571 buf_block_get_frame(ibuf_hdr_block)
1572 + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
1573 IBUF_TREE_ROOT_PAGE_NO,
1574 FSP_UP, mtr);
1575 ut_ad(buf_block_get_page_no(block) == IBUF_TREE_ROOT_PAGE_NO);
1576 } else {
1577 #ifdef UNIV_BLOB_DEBUG
1578 if ((type & DICT_CLUSTERED) && !index->blobs) {
1579 mutex_create(PFS_NOT_INSTRUMENTED,
1580 &index->blobs_mutex, SYNC_ANY_LATCH);
1581 index->blobs = rbt_create(sizeof(btr_blob_dbg_t),
1582 btr_blob_dbg_cmp);
1583 }
1584 #endif /* UNIV_BLOB_DEBUG */
1585 block = fseg_create(space, 0,
1586 PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
1587 }
1588
1589 if (block == NULL) {
1590
1591 return(FIL_NULL);
1592 }
1593
1594 page_no = buf_block_get_page_no(block);
1595 frame = buf_block_get_frame(block);
1596
1597 if (type & DICT_IBUF) {
1598 /* It is an insert buffer tree: initialize the free list */
1599 buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
1600
1601 ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);
1602
1603 flst_init(frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
1604 } else {
1605 /* It is a non-ibuf tree: create a file segment for leaf
1606 pages */
1607 buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
1608
1609 if (!fseg_create(space, page_no,
1610 PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr)) {
1611 /* Not enough space for new segment, free root
1612 segment before return. */
1613 btr_free_root(space, zip_size, page_no, mtr);
1614
1615 return(FIL_NULL);
1616 }
1617
1618 /* The fseg create acquires a second latch on the page,
1619 therefore we must declare it: */
1620 buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
1621 }
1622
1623 /* Create a new index page on the allocated segment page */
1624 page_zip = buf_block_get_page_zip(block);
1625
1626 if (page_zip) {
1627 page = page_create_zip(block, index, 0, 0, mtr);
1628 } else {
1629 page = page_create(block, mtr,
1630 dict_table_is_comp(index->table));
1631 /* Set the level of the new index page */
1632 btr_page_set_level(page, NULL, 0, mtr);
1633 }
1634
1635 block->check_index_page_at_flush = TRUE;
1636
1637 /* Set the index id of the page */
1638 btr_page_set_index_id(page, page_zip, index_id, mtr);
1639
1640 /* Set the next node and previous node fields */
1641 btr_page_set_next(page, page_zip, FIL_NULL, mtr);
1642 btr_page_set_prev(page, page_zip, FIL_NULL, mtr);
1643
1644 /* We reset the free bits for the page to allow creation of several
1645 trees in the same mtr, otherwise the latch on a bitmap page would
1646 prevent it because of the latching order */
1647
1648 if (!(type & DICT_CLUSTERED)) {
1649 ibuf_reset_free_bits(block);
1650 }
1651
1652 /* In the following assertion we test that two records of maximum
1653 allowed size fit on the root page: this fact is needed to ensure
1654 correctness of split algorithms */
1655
1656 ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);
1657
1658 return(page_no);
1659 }
1660
1661 /************************************************************//**
1662 Frees a B-tree except the root page, which MUST be freed after this
1663 by calling btr_free_root. */
1664 UNIV_INTERN
1665 void
btr_free_but_not_root(ulint space,ulint zip_size,ulint root_page_no)1666 btr_free_but_not_root(
1667 /*==================*/
1668 ulint space, /*!< in: space where created */
1669 ulint zip_size, /*!< in: compressed page size in bytes
1670 or 0 for uncompressed pages */
1671 ulint root_page_no) /*!< in: root page number */
1672 {
1673 ibool finished;
1674 page_t* root;
1675 mtr_t mtr;
1676
1677 leaf_loop:
1678 mtr_start(&mtr);
1679
1680 root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH,
1681 NULL, &mtr);
1682 #ifdef UNIV_BTR_DEBUG
1683 ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
1684 + root, space));
1685 ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
1686 + root, space));
1687 #endif /* UNIV_BTR_DEBUG */
1688
1689 /* NOTE: page hash indexes are dropped when a page is freed inside
1690 fsp0fsp. */
1691
1692 finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
1693 &mtr);
1694 mtr_commit(&mtr);
1695
1696 if (!finished) {
1697
1698 goto leaf_loop;
1699 }
1700 top_loop:
1701 mtr_start(&mtr);
1702
1703 root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH,
1704 NULL, &mtr);
1705 #ifdef UNIV_BTR_DEBUG
1706 ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
1707 + root, space));
1708 #endif /* UNIV_BTR_DEBUG */
1709
1710 finished = fseg_free_step_not_header(
1711 root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
1712 mtr_commit(&mtr);
1713
1714 if (!finished) {
1715
1716 goto top_loop;
1717 }
1718 }
1719
1720 /************************************************************//**
1721 Frees the B-tree root page. Other tree MUST already have been freed. */
1722 UNIV_INTERN
1723 void
btr_free_root(ulint space,ulint zip_size,ulint root_page_no,mtr_t * mtr)1724 btr_free_root(
1725 /*==========*/
1726 ulint space, /*!< in: space where created */
1727 ulint zip_size, /*!< in: compressed page size in bytes
1728 or 0 for uncompressed pages */
1729 ulint root_page_no, /*!< in: root page number */
1730 mtr_t* mtr) /*!< in/out: mini-transaction */
1731 {
1732 buf_block_t* block;
1733 fseg_header_t* header;
1734
1735 block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH,
1736 NULL, mtr);
1737
1738 btr_search_drop_page_hash_index(block);
1739
1740 header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP;
1741 #ifdef UNIV_BTR_DEBUG
1742 ut_a(btr_root_fseg_validate(header, space));
1743 #endif /* UNIV_BTR_DEBUG */
1744
1745 while (!fseg_free_step(header, mtr)) {
1746 /* Free the entire segment in small steps. */
1747 }
1748 }
1749 #endif /* !UNIV_HOTBACKUP */
1750
1751 /*************************************************************//**
1752 Reorganizes an index page.
1753
1754 IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
1755 if this is a compressed leaf page in a secondary index. This has to
1756 be done either within the same mini-transaction, or by invoking
1757 ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
1758 IBUF_BITMAP_FREE is unaffected by reorganization.
1759
1760 @retval true if the operation was successful
1761 @retval false if it is a compressed page, and recompression failed */
1762 UNIV_INTERN
1763 bool
btr_page_reorganize_low(bool recovery,ulint z_level,page_cur_t * cursor,dict_index_t * index,mtr_t * mtr)1764 btr_page_reorganize_low(
1765 /*====================*/
1766 bool recovery,/*!< in: true if called in recovery:
1767 locks should not be updated, i.e.,
1768 there cannot exist locks on the
1769 page, and a hash index should not be
1770 dropped: it cannot exist */
1771 ulint z_level,/*!< in: compression level to be used
1772 if dealing with compressed page */
1773 page_cur_t* cursor, /*!< in/out: page cursor */
1774 dict_index_t* index, /*!< in: the index tree of the page */
1775 mtr_t* mtr) /*!< in/out: mini-transaction */
1776 {
1777 buf_block_t* block = page_cur_get_block(cursor);
1778 #ifndef UNIV_HOTBACKUP
1779 buf_pool_t* buf_pool = buf_pool_from_bpage(&block->page);
1780 #endif /* !UNIV_HOTBACKUP */
1781 page_t* page = buf_block_get_frame(block);
1782 page_zip_des_t* page_zip = buf_block_get_page_zip(block);
1783 buf_block_t* temp_block;
1784 page_t* temp_page;
1785 ulint log_mode;
1786 ulint data_size1;
1787 ulint data_size2;
1788 ulint max_ins_size1;
1789 ulint max_ins_size2;
1790 bool success = false;
1791 ulint pos;
1792 bool log_compressed;
1793
1794 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1795 btr_assert_not_corrupted(block, index);
1796 #ifdef UNIV_ZIP_DEBUG
1797 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
1798 #endif /* UNIV_ZIP_DEBUG */
1799 data_size1 = page_get_data_size(page);
1800 max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1);
1801
1802 /* Turn logging off */
1803 log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
1804
1805 #ifndef UNIV_HOTBACKUP
1806 temp_block = buf_block_alloc(buf_pool);
1807 #else /* !UNIV_HOTBACKUP */
1808 ut_ad(block == back_block1);
1809 temp_block = back_block2;
1810 #endif /* !UNIV_HOTBACKUP */
1811 temp_page = temp_block->frame;
1812
1813 MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS);
1814
1815 /* Copy the old page to temporary space */
1816 buf_frame_copy(temp_page, page);
1817
1818 #ifndef UNIV_HOTBACKUP
1819 if (!recovery) {
1820 btr_search_drop_page_hash_index(block);
1821 }
1822
1823 block->check_index_page_at_flush = TRUE;
1824 #endif /* !UNIV_HOTBACKUP */
1825 btr_blob_dbg_remove(page, index, "btr_page_reorganize");
1826
1827 /* Save the cursor position. */
1828 pos = page_rec_get_n_recs_before(page_cur_get_rec(cursor));
1829
1830 /* Recreate the page: note that global data on page (possible
1831 segment headers, next page-field, etc.) is preserved intact */
1832
1833 page_create(block, mtr, dict_table_is_comp(index->table));
1834
1835 /* Copy the records from the temporary space to the recreated page;
1836 do not copy the lock bits yet */
1837
1838 page_copy_rec_list_end_no_locks(block, temp_block,
1839 page_get_infimum_rec(temp_page),
1840 index, mtr);
1841
1842 if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
1843 /* Copy max trx id to recreated page */
1844 trx_id_t max_trx_id = page_get_max_trx_id(temp_page);
1845 page_set_max_trx_id(block, NULL, max_trx_id, mtr);
1846 /* In crash recovery, dict_index_is_sec_or_ibuf() always
1847 holds, even for clustered indexes. max_trx_id is
1848 unused in clustered index pages. */
1849 ut_ad(max_trx_id != 0 || recovery);
1850 }
1851
1852 /* If innodb_log_compressed_pages is ON, page reorganize should log the
1853 compressed page image.*/
1854 log_compressed = page_zip && page_zip_log_pages;
1855
1856 if (log_compressed) {
1857 mtr_set_log_mode(mtr, log_mode);
1858 }
1859
1860 if (page_zip
1861 && !page_zip_compress(page_zip, page, index, z_level, mtr)) {
1862
1863 /* Restore the old page and exit. */
1864 btr_blob_dbg_restore(page, temp_page, index,
1865 "btr_page_reorganize_compress_fail");
1866
1867 #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
1868 /* Check that the bytes that we skip are identical. */
1869 ut_a(!memcmp(page, temp_page, PAGE_HEADER));
1870 ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
1871 PAGE_HEADER + PAGE_N_RECS + temp_page,
1872 PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
1873 ut_a(!memcmp(UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page,
1874 UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + temp_page,
1875 FIL_PAGE_DATA_END));
1876 #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
1877
1878 memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
1879 PAGE_N_RECS - PAGE_N_DIR_SLOTS);
1880 memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
1881 UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END);
1882
1883 #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
1884 ut_a(!memcmp(page, temp_page, UNIV_PAGE_SIZE));
1885 #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
1886
1887 goto func_exit;
1888 }
1889
1890 #ifndef UNIV_HOTBACKUP
1891 if (!recovery) {
1892 /* Update the record lock bitmaps */
1893 lock_move_reorganize_page(block, temp_block);
1894 }
1895 #endif /* !UNIV_HOTBACKUP */
1896
1897 data_size2 = page_get_data_size(page);
1898 max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);
1899
1900 if (data_size1 != data_size2 || max_ins_size1 != max_ins_size2) {
1901 buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
1902 buf_page_print(temp_page, 0, BUF_PAGE_PRINT_NO_CRASH);
1903
1904 fprintf(stderr,
1905 "InnoDB: Error: page old data size %lu"
1906 " new data size %lu\n"
1907 "InnoDB: Error: page old max ins size %lu"
1908 " new max ins size %lu\n"
1909 "InnoDB: Submit a detailed bug report"
1910 " to http://bugs.mysql.com\n",
1911 (unsigned long) data_size1, (unsigned long) data_size2,
1912 (unsigned long) max_ins_size1,
1913 (unsigned long) max_ins_size2);
1914 ut_ad(0);
1915 } else {
1916 success = true;
1917 }
1918
1919 /* Restore the cursor position. */
1920 if (pos > 0) {
1921 cursor->rec = page_rec_get_nth(page, pos);
1922 } else {
1923 ut_ad(cursor->rec == page_get_infimum_rec(page));
1924 }
1925
1926 func_exit:
1927 #ifdef UNIV_ZIP_DEBUG
1928 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
1929 #endif /* UNIV_ZIP_DEBUG */
1930 #ifndef UNIV_HOTBACKUP
1931 buf_block_free(temp_block);
1932 #endif /* !UNIV_HOTBACKUP */
1933
1934 /* Restore logging mode */
1935 mtr_set_log_mode(mtr, log_mode);
1936
1937 #ifndef UNIV_HOTBACKUP
1938 if (success) {
1939 byte type;
1940 byte* log_ptr;
1941
1942 /* Write the log record */
1943 if (page_zip) {
1944 ut_ad(page_is_comp(page));
1945 type = MLOG_ZIP_PAGE_REORGANIZE;
1946 } else if (page_is_comp(page)) {
1947 type = MLOG_COMP_PAGE_REORGANIZE;
1948 } else {
1949 type = MLOG_PAGE_REORGANIZE;
1950 }
1951
1952 log_ptr = log_compressed
1953 ? NULL
1954 : mlog_open_and_write_index(
1955 mtr, page, index, type,
1956 page_zip ? 1 : 0);
1957
1958 /* For compressed pages write the compression level. */
1959 if (log_ptr && page_zip) {
1960 mach_write_to_1(log_ptr, z_level);
1961 mlog_close(mtr, log_ptr + 1);
1962 }
1963
1964 MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL);
1965 }
1966 #endif /* !UNIV_HOTBACKUP */
1967
1968 return(success);
1969 }
1970
1971 /*************************************************************//**
1972 Reorganizes an index page.
1973
1974 IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
1975 if this is a compressed leaf page in a secondary index. This has to
1976 be done either within the same mini-transaction, or by invoking
1977 ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
1978 IBUF_BITMAP_FREE is unaffected by reorganization.
1979
1980 @retval true if the operation was successful
1981 @retval false if it is a compressed page, and recompression failed */
1982 static MY_ATTRIBUTE((nonnull))
1983 bool
btr_page_reorganize_block(bool recovery,ulint z_level,buf_block_t * block,dict_index_t * index,mtr_t * mtr)1984 btr_page_reorganize_block(
1985 /*======================*/
1986 bool recovery,/*!< in: true if called in recovery:
1987 locks should not be updated, i.e.,
1988 there cannot exist locks on the
1989 page, and a hash index should not be
1990 dropped: it cannot exist */
1991 ulint z_level,/*!< in: compression level to be used
1992 if dealing with compressed page */
1993 buf_block_t* block, /*!< in/out: B-tree page */
1994 dict_index_t* index, /*!< in: the index tree of the page */
1995 mtr_t* mtr) /*!< in/out: mini-transaction */
1996 {
1997 page_cur_t cur;
1998 page_cur_set_before_first(block, &cur);
1999
2000 return(btr_page_reorganize_low(recovery, z_level, &cur, index, mtr));
2001 }
2002
2003 #ifndef UNIV_HOTBACKUP
2004 /*************************************************************//**
2005 Reorganizes an index page.
2006
2007 IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
2008 if this is a compressed leaf page in a secondary index. This has to
2009 be done either within the same mini-transaction, or by invoking
2010 ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
2011 IBUF_BITMAP_FREE is unaffected by reorganization.
2012
2013 @retval true if the operation was successful
2014 @retval false if it is a compressed page, and recompression failed */
2015 UNIV_INTERN
2016 bool
btr_page_reorganize(page_cur_t * cursor,dict_index_t * index,mtr_t * mtr)2017 btr_page_reorganize(
2018 /*================*/
2019 page_cur_t* cursor, /*!< in/out: page cursor */
2020 dict_index_t* index, /*!< in: the index tree of the page */
2021 mtr_t* mtr) /*!< in/out: mini-transaction */
2022 {
2023 return(btr_page_reorganize_low(false, page_zip_level,
2024 cursor, index, mtr));
2025 }
2026 #endif /* !UNIV_HOTBACKUP */
2027
2028 /***********************************************************//**
2029 Parses a redo log record of reorganizing a page.
2030 @return end of log record or NULL */
2031 UNIV_INTERN
2032 byte*
btr_parse_page_reorganize(byte * ptr,byte * end_ptr,dict_index_t * index,bool compressed,buf_block_t * block,mtr_t * mtr)2033 btr_parse_page_reorganize(
2034 /*======================*/
2035 byte* ptr, /*!< in: buffer */
2036 byte* end_ptr,/*!< in: buffer end */
2037 dict_index_t* index, /*!< in: record descriptor */
2038 bool compressed,/*!< in: true if compressed page */
2039 buf_block_t* block, /*!< in: page to be reorganized, or NULL */
2040 mtr_t* mtr) /*!< in: mtr or NULL */
2041 {
2042 ulint level;
2043
2044 ut_ad(ptr != NULL);
2045 ut_ad(end_ptr != NULL);
2046
2047 /* If dealing with a compressed page the record has the
2048 compression level used during original compression written in
2049 one byte. Otherwise record is empty. */
2050 if (compressed) {
2051 if (ptr == end_ptr) {
2052 return(NULL);
2053 }
2054
2055 level = mach_read_from_1(ptr);
2056
2057 ut_a(level <= 9);
2058 ++ptr;
2059 } else {
2060 level = page_zip_level;
2061 }
2062
2063 if (block != NULL) {
2064 btr_page_reorganize_block(true, level, block, index, mtr);
2065 }
2066
2067 return(ptr);
2068 }
2069
2070 #ifndef UNIV_HOTBACKUP
2071 /*************************************************************//**
2072 Empties an index page. @see btr_page_create(). */
2073 static
2074 void
btr_page_empty(buf_block_t * block,page_zip_des_t * page_zip,dict_index_t * index,ulint level,mtr_t * mtr)2075 btr_page_empty(
2076 /*===========*/
2077 buf_block_t* block, /*!< in: page to be emptied */
2078 page_zip_des_t* page_zip,/*!< out: compressed page, or NULL */
2079 dict_index_t* index, /*!< in: index of the page */
2080 ulint level, /*!< in: the B-tree level of the page */
2081 mtr_t* mtr) /*!< in: mtr */
2082 {
2083 page_t* page = buf_block_get_frame(block);
2084
2085 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2086 ut_ad(page_zip == buf_block_get_page_zip(block));
2087 #ifdef UNIV_ZIP_DEBUG
2088 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
2089 #endif /* UNIV_ZIP_DEBUG */
2090
2091 btr_search_drop_page_hash_index(block);
2092 btr_blob_dbg_remove(page, index, "btr_page_empty");
2093
2094 /* Recreate the page: note that global data on page (possible
2095 segment headers, next page-field, etc.) is preserved intact */
2096
2097 if (page_zip) {
2098 page_create_zip(block, index, level, 0, mtr);
2099 } else {
2100 page_create(block, mtr, dict_table_is_comp(index->table));
2101 btr_page_set_level(page, NULL, level, mtr);
2102 }
2103
2104 block->check_index_page_at_flush = TRUE;
2105 }
2106
/*************************************************************//**
Makes tree one level higher by splitting the root, and inserts
the tuple. It is assumed that mtr contains an x-latch on the tree.
NOTE that the operation of this function must always succeed,
we cannot reverse it: therefore enough free disk space must be
guaranteed to be available before this function is called.
@return inserted record or NULL if run out of space */
UNIV_INTERN
rec_t*
btr_root_raise_and_insert(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor at which to insert: must be
				on the root page; when the function returns,
				the cursor is positioned on the predecessor
				of the inserted record */
	ulint**		offsets,/*!< out: offsets on inserted record */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mtr_t*		mtr)	/*!< in: mtr */
{
	dict_index_t*	index;
	page_t*		root;
	page_t*		new_page;
	ulint		new_page_no;
	rec_t*		rec;
	dtuple_t*	node_ptr;
	ulint		level;
	rec_t*		node_ptr_rec;
	page_cur_t*	page_cursor;
	page_zip_des_t*	root_page_zip;
	page_zip_des_t*	new_page_zip;
	buf_block_t*	root_block;
	buf_block_t*	new_block;

	root = btr_cur_get_page(cursor);
	root_block = btr_cur_get_block(cursor);
	root_page_zip = buf_block_get_page_zip(root_block);
	ut_ad(!page_is_empty(root));
	index = btr_cur_get_index(cursor);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!root_page_zip || page_zip_validate(root_page_zip, root, index));
#endif /* UNIV_ZIP_DEBUG */
#ifdef UNIV_BTR_DEBUG
	if (!dict_index_is_ibuf(index)) {
		ulint	space = dict_index_get_space(index);

		/* The root page carries both file segment headers
		(leaf and non-leaf); verify they are intact. */
		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
					    + root, space));
		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
					    + root, space));
	}

	ut_a(dict_index_get_page(index) == page_get_page_no(root));
#endif /* UNIV_BTR_DEBUG */
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX));

	/* Allocate a new page to the tree. Root splitting is done by first
	moving the root records to the new page, emptying the root, putting
	a node pointer to the new page, and then splitting the new page. */

	level = btr_page_get_level(root, mtr);

	new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr);

	/* NOTE(review): if btr_page_alloc() returns NULL while
	os_has_said_disk_full is not set, new_block is dereferenced
	below -- confirm that this combination cannot occur. */
	if (new_block == NULL && os_has_said_disk_full) {
		return(NULL);
	}

	new_page = buf_block_get_frame(new_block);
	new_page_zip = buf_block_get_page_zip(new_block);
	/* The new page must have the same (un)compressed format and
	compressed-page size as the root. */
	ut_a(!new_page_zip == !root_page_zip);
	ut_a(!new_page_zip
	     || page_zip_get_size(new_page_zip)
	     == page_zip_get_size(root_page_zip));

	btr_page_create(new_block, new_page_zip, index, level, mtr);

	/* Set the next node and previous node fields of new page */
	btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr);
	btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr);

	/* Copy the records from root to the new page one by one. */

	if (0
#ifdef UNIV_ZIP_COPY
	    || new_page_zip
#endif /* UNIV_ZIP_COPY */
	    || !page_copy_rec_list_end(new_block, root_block,
				       page_get_infimum_rec(root),
				       index, mtr)) {
		/* The logical copy failed (recompression of the new
		page did not succeed); this can only happen on a
		compressed page. */
		ut_a(new_page_zip);

		/* Copy the page byte for byte. */
		page_zip_copy_recs(new_page_zip, new_page,
				   root_page_zip, root, index, mtr);

		/* Update the lock table and possible hash index. */

		lock_move_rec_list_end(new_block, root_block,
				       page_get_infimum_rec(root));

		btr_search_move_or_delete_hash_entries(new_block, root_block,
						       index);
	}

	/* If this is a pessimistic insert which is actually done to
	perform a pessimistic update then we have stored the lock
	information of the record to be inserted on the infimum of the
	root page: we cannot discard the lock structs on the root page */

	lock_update_root_raise(new_block, root_block);

	/* Create a memory heap where the node pointer is stored */
	if (!*heap) {
		*heap = mem_heap_create(1000);
	}

	rec = page_rec_get_next(page_get_infimum_rec(new_page));
	new_page_no = buf_block_get_page_no(new_block);

	/* Build the node pointer (= node key and page address) for the
	child */

	node_ptr = dict_index_build_node_ptr(
		index, rec, new_page_no, *heap, level);
	/* The node pointer must be marked as the predefined minimum record,
	as there is no lower alphabetical limit to records in the leftmost
	node of a level: */
	dtuple_set_info_bits(node_ptr,
			     dtuple_get_info_bits(node_ptr)
			     | REC_INFO_MIN_REC_FLAG);

	/* Rebuild the root page to get free space; the root is now
	one level above its former contents. */
	btr_page_empty(root_block, root_page_zip, index, level + 1, mtr);

	/* Set the next node and previous node fields, although
	they should already have been set. The previous node field
	must be FIL_NULL if root_page_zip != NULL, because the
	REC_INFO_MIN_REC_FLAG (of the first user record) will be
	set if and only if btr_page_get_prev() == FIL_NULL. */
	btr_page_set_next(root, root_page_zip, FIL_NULL, mtr);
	btr_page_set_prev(root, root_page_zip, FIL_NULL, mtr);

	page_cursor = btr_cur_get_page_cur(cursor);

	/* Insert node pointer to the root */

	page_cur_set_before_first(root_block, page_cursor);

	node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
					     index, offsets, heap, 0, mtr);

	/* The root page should only contain the node pointer
	to new_page at this point. Thus, the data should fit. */
	ut_a(node_ptr_rec);

	/* We play safe and reset the free bits for the new page */

#if 0
	fprintf(stderr, "Root raise new page no %lu\n", new_page_no);
#endif

	if (!dict_index_is_clust(index)) {
		ibuf_reset_free_bits(new_block);
	}

	/* Reposition the cursor to the child node */
	page_cur_search(new_block, index, tuple,
			PAGE_CUR_LE, page_cursor);

	/* Split the child and insert tuple */
	return(btr_page_split_and_insert(flags, cursor, offsets, heap,
					 tuple, n_ext, mtr));
}
2285
2286 /*************************************************************//**
2287 Decides if the page should be split at the convergence point of inserts
2288 converging to the left.
2289 @return TRUE if split recommended */
2290 UNIV_INTERN
2291 ibool
btr_page_get_split_rec_to_left(btr_cur_t * cursor,rec_t ** split_rec)2292 btr_page_get_split_rec_to_left(
2293 /*===========================*/
2294 btr_cur_t* cursor, /*!< in: cursor at which to insert */
2295 rec_t** split_rec) /*!< out: if split recommended,
2296 the first record on upper half page,
2297 or NULL if tuple to be inserted should
2298 be first */
2299 {
2300 page_t* page;
2301 rec_t* insert_point;
2302 rec_t* infimum;
2303
2304 page = btr_cur_get_page(cursor);
2305 insert_point = btr_cur_get_rec(cursor);
2306
2307 if (page_header_get_ptr(page, PAGE_LAST_INSERT)
2308 == page_rec_get_next(insert_point)) {
2309
2310 infimum = page_get_infimum_rec(page);
2311
2312 /* If the convergence is in the middle of a page, include also
2313 the record immediately before the new insert to the upper
2314 page. Otherwise, we could repeatedly move from page to page
2315 lots of records smaller than the convergence point. */
2316
2317 if (infimum != insert_point
2318 && page_rec_get_next(infimum) != insert_point) {
2319
2320 *split_rec = insert_point;
2321 } else {
2322 *split_rec = page_rec_get_next(insert_point);
2323 }
2324
2325 return(TRUE);
2326 }
2327
2328 return(FALSE);
2329 }
2330
2331 /*************************************************************//**
2332 Decides if the page should be split at the convergence point of inserts
2333 converging to the right.
2334 @return TRUE if split recommended */
2335 UNIV_INTERN
2336 ibool
btr_page_get_split_rec_to_right(btr_cur_t * cursor,rec_t ** split_rec)2337 btr_page_get_split_rec_to_right(
2338 /*============================*/
2339 btr_cur_t* cursor, /*!< in: cursor at which to insert */
2340 rec_t** split_rec) /*!< out: if split recommended,
2341 the first record on upper half page,
2342 or NULL if tuple to be inserted should
2343 be first */
2344 {
2345 page_t* page;
2346 rec_t* insert_point;
2347
2348 page = btr_cur_get_page(cursor);
2349 insert_point = btr_cur_get_rec(cursor);
2350
2351 /* We use eager heuristics: if the new insert would be right after
2352 the previous insert on the same page, we assume that there is a
2353 pattern of sequential inserts here. */
2354
2355 if (page_header_get_ptr(page, PAGE_LAST_INSERT) == insert_point) {
2356
2357 rec_t* next_rec;
2358
2359 next_rec = page_rec_get_next(insert_point);
2360
2361 if (page_rec_is_supremum(next_rec)) {
2362 split_at_new:
2363 /* Split at the new record to insert */
2364 *split_rec = NULL;
2365 } else {
2366 rec_t* next_next_rec = page_rec_get_next(next_rec);
2367 if (page_rec_is_supremum(next_next_rec)) {
2368
2369 goto split_at_new;
2370 }
2371
2372 /* If there are >= 2 user records up from the insert
2373 point, split all but 1 off. We want to keep one because
2374 then sequential inserts can use the adaptive hash
2375 index, as they can do the necessary checks of the right
2376 search position just by looking at the records on this
2377 page. */
2378
2379 *split_rec = next_next_rec;
2380 }
2381
2382 return(TRUE);
2383 }
2384
2385 return(FALSE);
2386 }
2387
/*************************************************************//**
Calculates a split record such that the tuple will certainly fit on
its half-page when the split is performed. We assume in this function
only that the cursor page has at least one user record.
@return split record, or NULL if tuple will be the first record on
the lower or upper half-page (determined by btr_page_tuple_smaller()) */
static
rec_t*
btr_page_get_split_rec(
/*===================*/
	btr_cur_t*	cursor,	/*!< in: cursor at which insert should be made */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext)	/*!< in: number of externally stored columns */
{
	page_t*		page;
	page_zip_des_t*	page_zip;
	ulint		insert_size;	/* size the tuple will occupy */
	ulint		free_space;	/* free space of an empty new page */
	ulint		total_data;	/* data size incl. the new tuple */
	ulint		total_n_recs;	/* record count incl. the new tuple */
	ulint		total_space;	/* total_data + directory overhead */
	ulint		incl_data;	/* data size included so far */
	rec_t*		ins_rec;	/* record the cursor points at */
	rec_t*		rec;		/* iterator; NULL = the new tuple */
	rec_t*		next_rec;
	ulint		n;		/* number of records included */
	mem_heap_t*	heap;
	ulint*		offsets;

	page = btr_cur_get_page(cursor);

	insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
	free_space = page_get_free_space_of_empty(page_is_comp(page));

	page_zip = btr_cur_get_page_zip(cursor);
	if (page_zip) {
		/* Estimate the free space of an empty compressed page. */
		ulint	free_space_zip = page_zip_empty_size(
			cursor->index->n_fields,
			page_zip_get_size(page_zip));

		/* Use the smaller of the two estimates so that the
		chosen half certainly fits even when compressed. */
		if (free_space > (ulint) free_space_zip) {
			free_space = (ulint) free_space_zip;
		}
	}

	/* free_space is now the free space of a created new page */

	total_data = page_get_data_size(page) + insert_size;
	total_n_recs = page_get_n_recs(page) + 1;
	ut_ad(total_n_recs >= 2);
	total_space = total_data + page_dir_calc_reserved_space(total_n_recs);

	n = 0;
	incl_data = 0;
	ins_rec = btr_cur_get_rec(cursor);
	rec = page_get_infimum_rec(page);

	heap = NULL;
	offsets = NULL;

	/* We start to include records to the left half, and when the
	space reserved by them exceeds half of total_space, then if
	the included records fit on the left page, they will be put there
	if something was left over also for the right page,
	otherwise the last included record will be the first on the right
	half page */

	/* The iteration walks the page in order, visiting the new tuple
	at its insert position; rec == NULL is the sentinel meaning
	"the tuple was the record just included". */
	do {
		/* Decide the next record to include */
		if (rec == ins_rec) {
			rec = NULL;	/* NULL denotes that tuple is
					now included */
		} else if (rec == NULL) {
			rec = page_rec_get_next(ins_rec);
		} else {
			rec = page_rec_get_next(rec);
		}

		if (rec == NULL) {
			/* Include tuple */
			incl_data += insert_size;
		} else {
			offsets = rec_get_offsets(rec, cursor->index,
						  offsets, ULINT_UNDEFINED,
						  &heap);
			incl_data += rec_offs_size(offsets);
		}

		n++;
	} while (incl_data + page_dir_calc_reserved_space(n)
		 < total_space / 2);

	if (incl_data + page_dir_calc_reserved_space(n) <= free_space) {
		/* The next record will be the first on
		the right half page if it is not the
		supremum record of page */

		if (rec == ins_rec) {
			rec = NULL;

			goto func_exit;
		} else if (rec == NULL) {
			next_rec = page_rec_get_next(ins_rec);
		} else {
			next_rec = page_rec_get_next(rec);
		}
		ut_ad(next_rec);
		if (!page_rec_is_supremum(next_rec)) {
			rec = next_rec;
		}
	}

func_exit:
	if (heap) {
		mem_heap_free(heap);
	}
	return(rec);
}
2507
/*************************************************************//**
Returns TRUE if the insert fits on the appropriate half-page with the
chosen split_rec.
@return true if fits */
static MY_ATTRIBUTE((nonnull(1,3,4,6), warn_unused_result))
bool
btr_page_insert_fits(
/*=================*/
	btr_cur_t*	cursor,	/*!< in: cursor at which insert
				should be made */
	const rec_t*	split_rec,/*!< in: suggestion for first record
				on upper half-page, or NULL if
				tuple to be inserted should be first */
	ulint**		offsets,/*!< in: rec_get_offsets(
				split_rec, cursor->index); out: garbage */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mem_heap_t**	heap)	/*!< in: temporary memory heap */
{
	page_t*		page;
	ulint		insert_size;	/* size the tuple will occupy */
	ulint		free_space;	/* free space of an empty new page */
	ulint		total_data;	/* data remaining on tuple's half */
	ulint		total_n_recs;	/* records remaining on tuple's half */
	const rec_t*	rec;		/* first record going to the OTHER half */
	const rec_t*	end_rec;	/* one past the last such record */

	page = btr_cur_get_page(cursor);

	ut_ad(!split_rec
	      || !page_is_comp(page) == !rec_offs_comp(*offsets));
	ut_ad(!split_rec
	      || rec_offs_validate(split_rec, cursor->index, *offsets));

	insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
	free_space = page_get_free_space_of_empty(page_is_comp(page));

	/* free_space is now the free space of a created new page */

	total_data = page_get_data_size(page) + insert_size;
	total_n_recs = page_get_n_recs(page) + 1;

	/* We determine which records (from rec to end_rec, not including
	end_rec) will end up on the other half page from tuple when it is
	inserted. */

	if (split_rec == NULL) {
		/* The tuple itself will be the split point: everything
		up to and including the cursor record moves away. */
		rec = page_rec_get_next(page_get_infimum_rec(page));
		end_rec = page_rec_get_next(btr_cur_get_rec(cursor));

	} else if (cmp_dtuple_rec(tuple, split_rec, *offsets) >= 0) {

		/* The tuple goes to the upper half: the records before
		split_rec move to the other (lower) half. */
		rec = page_rec_get_next(page_get_infimum_rec(page));
		end_rec = split_rec;
	} else {
		/* The tuple goes to the lower half: the records from
		split_rec onwards move to the other (upper) half. */
		rec = split_rec;
		end_rec = page_get_supremum_rec(page);
	}

	if (total_data + page_dir_calc_reserved_space(total_n_recs)
	    <= free_space) {

		/* Ok, there will be enough available space on the
		half page where the tuple is inserted */

		return(true);
	}

	while (rec != end_rec) {
		/* In this loop we calculate the amount of reserved
		space after rec is removed from page. */

		*offsets = rec_get_offsets(rec, cursor->index, *offsets,
					   ULINT_UNDEFINED, heap);

		total_data -= rec_offs_size(*offsets);
		total_n_recs--;

		if (total_data + page_dir_calc_reserved_space(total_n_recs)
		    <= free_space) {

			/* Ok, there will be enough available space on the
			half page where the tuple is inserted */

			return(true);
		}

		rec = page_rec_get_next_const(rec);
	}

	return(false);
}
2600
2601 /*******************************************************//**
2602 Inserts a data tuple to a tree on a non-leaf level. It is assumed
2603 that mtr holds an x-latch on the tree. */
2604 UNIV_INTERN
2605 void
btr_insert_on_non_leaf_level_func(ulint flags,dict_index_t * index,ulint level,dtuple_t * tuple,const char * file,ulint line,mtr_t * mtr)2606 btr_insert_on_non_leaf_level_func(
2607 /*==============================*/
2608 ulint flags, /*!< in: undo logging and locking flags */
2609 dict_index_t* index, /*!< in: index */
2610 ulint level, /*!< in: level, must be > 0 */
2611 dtuple_t* tuple, /*!< in: the record to be inserted */
2612 const char* file, /*!< in: file name */
2613 ulint line, /*!< in: line where called */
2614 mtr_t* mtr) /*!< in: mtr */
2615 {
2616 big_rec_t* dummy_big_rec;
2617 btr_cur_t cursor;
2618 dberr_t err;
2619 rec_t* rec;
2620 ulint* offsets = NULL;
2621 mem_heap_t* heap = NULL;
2622
2623 ut_ad(level > 0);
2624
2625 btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE,
2626 BTR_CONT_MODIFY_TREE,
2627 &cursor, 0, file, line, mtr);
2628
2629 ut_ad(cursor.flag == BTR_CUR_BINARY);
2630
2631 err = btr_cur_optimistic_insert(
2632 flags
2633 | BTR_NO_LOCKING_FLAG
2634 | BTR_KEEP_SYS_FLAG
2635 | BTR_NO_UNDO_LOG_FLAG,
2636 &cursor, &offsets, &heap,
2637 tuple, &rec, &dummy_big_rec, 0, NULL, mtr);
2638
2639 if (err == DB_FAIL) {
2640 err = btr_cur_pessimistic_insert(flags
2641 | BTR_NO_LOCKING_FLAG
2642 | BTR_KEEP_SYS_FLAG
2643 | BTR_NO_UNDO_LOG_FLAG,
2644 &cursor, &offsets, &heap,
2645 tuple, &rec,
2646 &dummy_big_rec, 0, NULL, mtr);
2647 ut_a(err == DB_SUCCESS);
2648 }
2649 mem_heap_free(heap);
2650 }
2651
/**************************************************************//**
Attaches the halves of an index page on the appropriate level in an
index tree. */
static MY_ATTRIBUTE((nonnull))
void
btr_attach_half_pages(
/*==================*/
	ulint		flags,		/*!< in: undo logging and
					locking flags */
	dict_index_t*	index,		/*!< in: the index tree */
	buf_block_t*	block,		/*!< in/out: page to be split */
	const rec_t*	split_rec,	/*!< in: first record on upper
					half page */
	buf_block_t*	new_block,	/*!< in/out: the new half page */
	ulint		direction,	/*!< in: FSP_UP or FSP_DOWN */
	mtr_t*		mtr)		/*!< in: mtr */
{
	ulint		space;
	ulint		zip_size;
	ulint		prev_page_no;
	ulint		next_page_no;
	ulint		level;
	page_t*		page		= buf_block_get_frame(block);
	page_t*		lower_page;
	page_t*		upper_page;
	ulint		lower_page_no;
	ulint		upper_page_no;
	page_zip_des_t*	lower_page_zip;
	page_zip_des_t*	upper_page_zip;
	dtuple_t*	node_ptr_upper;
	mem_heap_t*	heap;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX));

	/* Create a memory heap where the data tuple is stored */
	heap = mem_heap_create(1024);

	/* Based on split direction, decide upper and lower pages */
	if (direction == FSP_DOWN) {

		btr_cur_t	cursor;
		ulint*		offsets;

		/* The new page becomes the lower half; the original
		page keeps the upper-half records, so the father node
		pointer (whose key stays valid for the lower half)
		must be redirected to the new page. */
		lower_page = buf_block_get_frame(new_block);
		lower_page_no = buf_block_get_page_no(new_block);
		lower_page_zip = buf_block_get_page_zip(new_block);
		upper_page = buf_block_get_frame(block);
		upper_page_no = buf_block_get_page_no(block);
		upper_page_zip = buf_block_get_page_zip(block);

		/* Look up the index for the node pointer to page */
		offsets = btr_page_get_father_block(NULL, heap, index,
						    block, mtr, &cursor);

		/* Replace the address of the old child node (= page) with the
		address of the new lower half */

		btr_node_ptr_set_child_page_no(
			btr_cur_get_rec(&cursor),
			btr_cur_get_page_zip(&cursor),
			offsets, lower_page_no, mtr);
		mem_heap_empty(heap);
	} else {
		/* FSP_UP: the original page stays the lower half and
		its father pointer remains correct as-is. */
		lower_page = buf_block_get_frame(block);
		lower_page_no = buf_block_get_page_no(block);
		lower_page_zip = buf_block_get_page_zip(block);
		upper_page = buf_block_get_frame(new_block);
		upper_page_no = buf_block_get_page_no(new_block);
		upper_page_zip = buf_block_get_page_zip(new_block);
	}

	/* Get the level of the split pages */
	level = btr_page_get_level(buf_block_get_frame(block), mtr);
	ut_ad(level
	      == btr_page_get_level(buf_block_get_frame(new_block), mtr));

	/* Build the node pointer (= node key and page address) for the upper
	half */

	node_ptr_upper = dict_index_build_node_ptr(index, split_rec,
						   upper_page_no, heap, level);

	/* Insert it next to the pointer to the lower half. Note that this
	may generate recursion leading to a split on the higher level. */

	btr_insert_on_non_leaf_level(flags, index, level + 1,
				     node_ptr_upper, mtr);

	/* Free the memory heap */
	mem_heap_free(heap);

	/* Get the previous and next pages of page */

	prev_page_no = btr_page_get_prev(page, mtr);
	next_page_no = btr_page_get_next(page, mtr);
	space = buf_block_get_space(block);
	zip_size = buf_block_get_zip_size(block);

	/* Update page links of the level */

	if (prev_page_no != FIL_NULL) {
		buf_block_t*	prev_block = btr_block_get(
			space, zip_size, prev_page_no, RW_X_LATCH, index, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
		ut_a(btr_page_get_next(prev_block->frame, mtr)
		     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */

		btr_page_set_next(buf_block_get_frame(prev_block),
				  buf_block_get_page_zip(prev_block),
				  lower_page_no, mtr);
	}

	if (next_page_no != FIL_NULL) {
		buf_block_t*	next_block = btr_block_get(
			space, zip_size, next_page_no, RW_X_LATCH, index, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
		ut_a(btr_page_get_prev(next_block->frame, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		btr_page_set_prev(buf_block_get_frame(next_block),
				  buf_block_get_page_zip(next_block),
				  upper_page_no, mtr);
	}

	/* Finally, link the two halves to each other and to their
	(possibly updated) neighbours. */
	btr_page_set_prev(lower_page, lower_page_zip, prev_page_no, mtr);
	btr_page_set_next(lower_page, lower_page_zip, upper_page_no, mtr);

	btr_page_set_prev(upper_page, upper_page_zip, lower_page_no, mtr);
	btr_page_set_next(upper_page, upper_page_zip, next_page_no, mtr);
}
2787
2788 /*************************************************************//**
2789 Determine if a tuple is smaller than any record on the page.
2790 @return TRUE if smaller */
2791 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2792 bool
btr_page_tuple_smaller(btr_cur_t * cursor,const dtuple_t * tuple,ulint ** offsets,ulint n_uniq,mem_heap_t ** heap)2793 btr_page_tuple_smaller(
2794 /*===================*/
2795 btr_cur_t* cursor, /*!< in: b-tree cursor */
2796 const dtuple_t* tuple, /*!< in: tuple to consider */
2797 ulint** offsets,/*!< in/out: temporary storage */
2798 ulint n_uniq, /*!< in: number of unique fields
2799 in the index page records */
2800 mem_heap_t** heap) /*!< in/out: heap for offsets */
2801 {
2802 buf_block_t* block;
2803 const rec_t* first_rec;
2804 page_cur_t pcur;
2805
2806 /* Read the first user record in the page. */
2807 block = btr_cur_get_block(cursor);
2808 page_cur_set_before_first(block, &pcur);
2809 page_cur_move_to_next(&pcur);
2810 first_rec = page_cur_get_rec(&pcur);
2811
2812 *offsets = rec_get_offsets(
2813 first_rec, cursor->index, *offsets,
2814 n_uniq, heap);
2815
2816 return(cmp_dtuple_rec(tuple, first_rec, *offsets) < 0);
2817 }
2818
/** Insert the tuple into the right sibling page, if the cursor is at the end
of a page.
@param[in]	flags	undo logging and locking flags
@param[in,out]	cursor	cursor at which to insert; when the function succeeds,
the cursor is positioned before the insert point.
@param[out]	offsets	offsets on inserted record
@param[in,out]	heap	memory heap for allocating offsets
@param[in]	tuple	tuple to insert
@param[in]	n_ext	number of externally stored columns
@param[in,out]	mtr	mini-transaction
@return inserted record (first record on the right sibling page);
the cursor will be positioned on the page infimum
@retval NULL if the operation was not performed */
static
rec_t*
btr_insert_into_right_sibling(
	ulint		flags,
	btr_cur_t*	cursor,
	ulint**		offsets,
	mem_heap_t*	heap,
	const dtuple_t*	tuple,
	ulint		n_ext,
	mtr_t*		mtr)
{
	buf_block_t*	block = btr_cur_get_block(cursor);
	page_t*		page = buf_block_get_frame(block);
	ulint		next_page_no = btr_page_get_next(page, mtr);

	/* Caller must hold the index tree x-latch (so we may latch the
	sibling and its father) and an x-latch on the cursor's page. */
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(heap);

	/* This optimization only applies when a right sibling exists
	and the cursor is at the very end of its page (the next record
	is the supremum). Otherwise let the caller do a normal split. */
	if (next_page_no == FIL_NULL || !page_rec_is_supremum(
			page_rec_get_next(btr_cur_get_rec(cursor)))) {

		return(NULL);
	}

	page_cur_t	next_page_cursor;
	buf_block_t*	next_block;
	page_t*		next_page;
	btr_cur_t	next_father_cursor;
	rec_t*		rec = NULL;
	ulint		zip_size = buf_block_get_zip_size(block);
	ulint		max_size;

	/* Latch the right sibling page in x-mode. */
	next_block = btr_block_get(
		buf_block_get_space(block), zip_size,
		next_page_no, RW_X_LATCH, cursor->index, mtr);
	next_page = buf_block_get_frame(next_block);

	bool	is_leaf = page_is_leaf(next_page);

	/* Position a cursor on the node pointer to the sibling in its
	father page; if the insert succeeds we must rebuild that node
	pointer, because the sibling's first record changes. */
	btr_page_get_father(
		cursor->index, next_block, mtr, &next_father_cursor);

	page_cur_search(
		next_block, cursor->index, tuple, PAGE_CUR_LE,
		&next_page_cursor);

	/* Remember the free space before the insert, for the insert
	buffer bitmap update below. */
	max_size = page_get_max_insert_size_after_reorganize(next_page, 1);

	/* Extend the gap locks from the next page, as if a split to
	the left had occurred. */
	lock_update_split_left(next_block, block);

	rec = page_cur_tuple_insert(
		&next_page_cursor, tuple, cursor->index, offsets, &heap,
		n_ext, mtr);

	if (rec == NULL) {
		if (zip_size && is_leaf
		    && !dict_index_is_clust(cursor->index)) {
			/* Reset the IBUF_BITMAP_FREE bits, because
			page_cur_tuple_insert() will have attempted page
			reorganize before failing. */
			ibuf_reset_free_bits(next_block);
		}
		return(NULL);
	}

	ibool	compressed;
	dberr_t	err;
	ulint	level = btr_page_get_level(next_page, mtr);

	/* adjust cursor position */
	*btr_cur_get_page_cur(cursor) = next_page_cursor;

	/* The inserted record must be the first user record of the
	sibling page, and the cursor rests on the infimum before it. */
	ut_ad(btr_cur_get_rec(cursor) == page_get_infimum_rec(next_page));
	ut_ad(page_rec_get_next(page_get_infimum_rec(next_page)) == rec);

	/* We have to change the parent node pointer */

	/* Delete the old node pointer; RB_NONE because node pointers
	carry no rollback information. This must succeed. */
	compressed = btr_cur_pessimistic_delete(
		&err, TRUE, &next_father_cursor,
		BTR_CREATE_FLAG, RB_NONE, mtr);

	ut_a(err == DB_SUCCESS);

	if (!compressed) {
		/* The delete did not merge the father page; consider
		compressing it now. */
		btr_cur_compress_if_useful(&next_father_cursor, FALSE, mtr);
	}

	/* Build and insert the new node pointer (new first record of
	the sibling, plus its page number) on the upper level. */
	dtuple_t*	node_ptr = dict_index_build_node_ptr(
		cursor->index, rec, buf_block_get_page_no(next_block),
		heap, level);

	btr_insert_on_non_leaf_level(
		flags, cursor->index, level + 1, node_ptr, mtr);

	ut_ad(rec_offs_validate(rec, cursor->index, *offsets));

	if (is_leaf && !dict_index_is_clust(cursor->index)) {
		/* Update the free bits of the B-tree page in the
		insert buffer bitmap. */

		if (zip_size) {
			ibuf_update_free_bits_zip(next_block, mtr);
		} else {
			ibuf_update_free_bits_if_full(
				next_block, max_size,
				rec_offs_size(*offsets) + PAGE_DIR_SLOT_SIZE);
		}
	}

	return(rec);
}
2946
/*************************************************************//**
Splits an index page to halves and inserts the tuple. It is assumed
that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
released within this function! NOTE that the operation of this
function must always succeed, we cannot reverse it: therefore enough
free disk space (2 pages) must be guaranteed to be available before
this function is called.

@return inserted record or NULL if run out of space */
UNIV_INTERN
rec_t*
btr_page_split_and_insert(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor at which to insert; when the
				function returns, the cursor is positioned
				on the predecessor of the inserted record */
	ulint**		offsets,/*!< out: offsets on inserted record */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mtr_t*		mtr)	/*!< in: mtr */
{
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	ulint		page_no;
	byte		direction;
	ulint		hint_page_no;
	buf_block_t*	new_block;
	page_t*		new_page;
	page_zip_des_t*	new_page_zip;
	rec_t*		split_rec;
	buf_block_t*	left_block;
	buf_block_t*	right_block;
	buf_block_t*	insert_block;
	page_cur_t*	page_cursor;
	rec_t*		first_rec;
	byte*		buf = 0; /* remove warning */
	rec_t*		move_limit;
	ibool		insert_will_fit;
	ibool		insert_left;
	ulint		n_iterations = 0;
	rec_t*		rec;
	ulint		n_uniq;

	if (!*heap) {
		*heap = mem_heap_create(1024);
	}
	n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
func_start:
	/* Restart point: if the insert still does not fit after a
	split, we loop back here (n_iterations > 0) and split again. */
	mem_heap_empty(*heap);
	*offsets = NULL;

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
				MTR_MEMO_X_LOCK));
	ut_ad(!dict_index_is_online_ddl(cursor->index)
	      || (flags & BTR_CREATE_FLAG)
	      || dict_index_is_clust(cursor->index));
#ifdef UNIV_SYNC_DEBUG
	ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	page_zip = buf_block_get_page_zip(block);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(!page_is_empty(page));

	/* try to insert to the next page if possible before split */
	rec = btr_insert_into_right_sibling(
		flags, cursor, offsets, *heap, tuple, n_ext, mtr);

	if (rec != NULL) {
		return(rec);
	}

	page_no = buf_block_get_page_no(block);

	/* 1. Decide the split record; split_rec == NULL means that the
	tuple to be inserted should be the first record on the upper
	half-page */
	insert_left = FALSE;

	if (n_iterations > 0) {
		/* A previous split did not make room: split at the
		insert point itself to guarantee progress. */
		direction = FSP_UP;
		hint_page_no = page_no + 1;
		split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);

		if (split_rec == NULL) {
			insert_left = btr_page_tuple_smaller(
				cursor, tuple, offsets, n_uniq, heap);
		}
	} else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
		/* Sequential ascending inserts detected: split so that
		the new record goes to a mostly-empty right page. */
		direction = FSP_UP;
		hint_page_no = page_no + 1;

	} else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
		/* Sequential descending inserts: split to the left. */
		direction = FSP_DOWN;
		hint_page_no = page_no - 1;
		ut_ad(split_rec);
	} else {
		direction = FSP_UP;
		hint_page_no = page_no + 1;

		/* If there is only one record in the index page, we
		can't split the node in the middle by default. We need
		to determine whether the new record will be inserted
		to the left or right. */

		if (page_get_n_recs(page) > 1) {
			split_rec = page_get_middle_rec(page);
		} else if (btr_page_tuple_smaller(cursor, tuple,
						  offsets, n_uniq, heap)) {
			split_rec = page_rec_get_next(
				page_get_infimum_rec(page));
		} else {
			split_rec = NULL;
		}
	}

	DBUG_EXECUTE_IF("disk_is_full",
			os_has_said_disk_full = true;
			return(NULL););

	/* 2. Allocate a new page to the index */
	new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
				   btr_page_get_level(page, mtr), mtr, mtr);

	if (new_block == NULL && os_has_said_disk_full) {
		return(NULL);
	}

	new_page = buf_block_get_frame(new_block);
	new_page_zip = buf_block_get_page_zip(new_block);
	btr_page_create(new_block, new_page_zip, cursor->index,
			btr_page_get_level(page, mtr), mtr);

	/* 3. Calculate the first record on the upper half-page, and the
	first record (move_limit) on original page which ends up on the
	upper half */

	if (split_rec) {
		first_rec = move_limit = split_rec;

		*offsets = rec_get_offsets(split_rec, cursor->index, *offsets,
					   n_uniq, heap);

		insert_left = cmp_dtuple_rec(tuple, split_rec, *offsets) < 0;

		if (!insert_left && new_page_zip && n_iterations > 0) {
			/* If a compressed page has already been split,
			avoid further splits by inserting the record
			to an empty page. */
			split_rec = NULL;
			goto insert_empty;
		}
	} else if (insert_left) {
		ut_a(n_iterations > 0);
		first_rec = page_rec_get_next(page_get_infimum_rec(page));
		move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
	} else {
insert_empty:
		ut_ad(!split_rec);
		ut_ad(!insert_left);
		/* The tuple itself becomes the first record of the upper
		half; convert it to record format in a temporary buffer
		so that a node pointer can be built from it. */
		buf = (byte*) mem_alloc(rec_get_converted_size(cursor->index,
							       tuple, n_ext));

		first_rec = rec_convert_dtuple_to_rec(buf, cursor->index,
						      tuple, n_ext);
		move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
	}

	/* 4. Do first the modifications in the tree structure */

	btr_attach_half_pages(flags, cursor->index, block,
			      first_rec, new_block, direction, mtr);

	/* If the split is made on the leaf level and the insert will fit
	on the appropriate half-page, we may release the tree x-latch.
	We can then move the records after releasing the tree latch,
	thus reducing the tree latch contention. */

	if (split_rec) {
		insert_will_fit = !new_page_zip
			&& btr_page_insert_fits(cursor, split_rec,
						offsets, tuple, n_ext, heap);
	} else {
		if (!insert_left) {
			/* The temporary record buffer is no longer
			needed after btr_attach_half_pages(). */
			mem_free(buf);
			buf = NULL;
		}

		insert_will_fit = !new_page_zip
			&& btr_page_insert_fits(cursor, NULL,
						offsets, tuple, n_ext, heap);
	}

	if (insert_will_fit && page_is_leaf(page)
	    && !dict_index_is_online_ddl(cursor->index)) {

		/* NOTE: this releases the index tree x-latch early;
		from here on only the page latches protect us. */
		mtr_memo_release(mtr, dict_index_get_lock(cursor->index),
				 MTR_MEMO_X_LOCK);
	}

	/* 5. Move then the records to the new page */
	if (direction == FSP_DOWN) {
		/* fputs("Split left\n", stderr); */

		if (0
#ifdef UNIV_ZIP_COPY
		    || page_zip
#endif /* UNIV_ZIP_COPY */
		    || !page_move_rec_list_start(new_block, block, move_limit,
						 cursor->index, mtr)) {
			/* For some reason, compressing new_page failed,
			even though it should contain fewer records than
			the original page.  Copy the page byte for byte
			and then delete the records from both pages
			as appropriate.  Deleting will always succeed. */
			ut_a(new_page_zip);

			page_zip_copy_recs(new_page_zip, new_page,
					   page_zip, page, cursor->index, mtr);
			page_delete_rec_list_end(move_limit - page + new_page,
						 new_block, cursor->index,
						 ULINT_UNDEFINED,
						 ULINT_UNDEFINED, mtr);

			/* Update the lock table and possible hash index. */

			lock_move_rec_list_start(
				new_block, block, move_limit,
				new_page + PAGE_NEW_INFIMUM);

			btr_search_move_or_delete_hash_entries(
				new_block, block, cursor->index);

			/* Delete the records from the source page. */

			page_delete_rec_list_start(move_limit, block,
						   cursor->index, mtr);
		}

		left_block = new_block;
		right_block = block;

		lock_update_split_left(right_block, left_block);
	} else {
		/* fputs("Split right\n", stderr); */

		if (0
#ifdef UNIV_ZIP_COPY
		    || page_zip
#endif /* UNIV_ZIP_COPY */
		    || !page_move_rec_list_end(new_block, block, move_limit,
					       cursor->index, mtr)) {
			/* For some reason, compressing new_page failed,
			even though it should contain fewer records than
			the original page.  Copy the page byte for byte
			and then delete the records from both pages
			as appropriate.  Deleting will always succeed. */
			ut_a(new_page_zip);

			page_zip_copy_recs(new_page_zip, new_page,
					   page_zip, page, cursor->index, mtr);
			page_delete_rec_list_start(move_limit - page
						   + new_page, new_block,
						   cursor->index, mtr);

			/* Update the lock table and possible hash index. */

			lock_move_rec_list_end(new_block, block, move_limit);

			btr_search_move_or_delete_hash_entries(
				new_block, block, cursor->index);

			/* Delete the records from the source page. */

			page_delete_rec_list_end(move_limit, block,
						 cursor->index,
						 ULINT_UNDEFINED,
						 ULINT_UNDEFINED, mtr);
		}

		left_block = block;
		right_block = new_block;

		lock_update_split_right(right_block, left_block);
	}

#ifdef UNIV_ZIP_DEBUG
	if (page_zip) {
		ut_a(page_zip_validate(page_zip, page, cursor->index));
		ut_a(page_zip_validate(new_page_zip, new_page, cursor->index));
	}
#endif /* UNIV_ZIP_DEBUG */

	/* At this point, split_rec, move_limit and first_rec may point
	to garbage on the old page. */

	/* 6. The split and the tree modification is now completed. Decide the
	page where the tuple should be inserted */

	if (insert_left) {
		insert_block = left_block;
	} else {
		insert_block = right_block;
	}

	/* 7. Reposition the cursor for insert and try insertion */
	page_cursor = btr_cur_get_page_cur(cursor);

	page_cur_search(insert_block, cursor->index, tuple,
			PAGE_CUR_LE, page_cursor);

	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
				    offsets, heap, n_ext, mtr);

#ifdef UNIV_ZIP_DEBUG
	{
		page_t*		insert_page
			= buf_block_get_frame(insert_block);

		page_zip_des_t*	insert_page_zip
			= buf_block_get_page_zip(insert_block);

		ut_a(!insert_page_zip
		     || page_zip_validate(insert_page_zip, insert_page,
					  cursor->index));
	}
#endif /* UNIV_ZIP_DEBUG */

	if (rec != NULL) {

		goto func_exit;
	}

	/* 8. If insert did not fit, try page reorganization.
	For compressed pages, page_cur_tuple_insert() will have
	attempted this already. */

	if (page_cur_get_page_zip(page_cursor)
	    || !btr_page_reorganize(page_cursor, cursor->index, mtr)) {

		goto insert_failed;
	}

	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
				    offsets, heap, n_ext, mtr);

	if (rec == NULL) {
		/* The insert did not fit on the page: loop back to the
		start of the function for a new split */
insert_failed:
		/* We play safe and reset the free bits */
		if (!dict_index_is_clust(cursor->index)) {
			ibuf_reset_free_bits(new_block);
			ibuf_reset_free_bits(block);
		}

		/* fprintf(stderr, "Split second round %lu\n",
		page_get_page_no(page)); */
		n_iterations++;
		ut_ad(n_iterations < 2
		      || buf_block_get_page_zip(insert_block));
		ut_ad(!insert_will_fit);

		goto func_start;
	}

func_exit:
	/* Insert fit on the page: update the free bits for the
	left and right pages in the same mtr */

	if (!dict_index_is_clust(cursor->index) && page_is_leaf(page)) {
		ibuf_update_free_bits_for_two_pages_low(
			buf_block_get_zip_size(left_block),
			left_block, right_block, mtr);
	}

#if 0
	fprintf(stderr, "Split and insert done %lu %lu\n",
		buf_block_get_page_no(left_block),
		buf_block_get_page_no(right_block));
#endif
	MONITOR_INC(MONITOR_INDEX_SPLIT);

	ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
	ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));

	ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
	return(rec);
}
3342
#ifdef UNIV_SYNC_DEBUG
/*************************************************************//**
Removes a page from the level list of pages.
@param space	in: space where removed
@param zip_size	in: compressed page size in bytes, or 0 for uncompressed
@param page	in/out: page to remove
@param index	in: index tree
@param mtr	in/out: mini-transaction */
# define btr_level_list_remove(space,zip_size,page,index,mtr)		\
	btr_level_list_remove_func(space,zip_size,page,index,mtr)
#else /* UNIV_SYNC_DEBUG */
/*************************************************************//**
Removes a page from the level list of pages.
@param space	in: space where removed
@param zip_size	in: compressed page size in bytes, or 0 for uncompressed
@param page	in/out: page to remove
@param index	in: index tree
@param mtr	in/out: mini-transaction */
/* NOTE: in non-UNIV_SYNC_DEBUG builds the index argument is dropped
here, matching the conditional parameter list of the function below. */
# define btr_level_list_remove(space,zip_size,page,index,mtr)		\
	btr_level_list_remove_func(space,zip_size,page,mtr)
#endif /* UNIV_SYNC_DEBUG */
3364
/*************************************************************//**
Removes a page from the doubly linked list of pages on its level by
making its previous and next neighbors point to each other.  The page
itself is not modified or freed here. */
static MY_ATTRIBUTE((nonnull))
void
btr_level_list_remove_func(
/*=======================*/
	ulint		space,	/*!< in: space where removed */
	ulint		zip_size,/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	page_t*		page,	/*!< in/out: page to remove */
#ifdef UNIV_SYNC_DEBUG
	const dict_index_t*	index,	/*!< in: index tree */
#endif /* UNIV_SYNC_DEBUG */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	ulint	prev_page_no;
	ulint	next_page_no;

	ut_ad(page != NULL);
	ut_ad(mtr != NULL);
	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
	ut_ad(space == page_get_space_id(page));
	/* Get the previous and next page numbers of page */

	prev_page_no = btr_page_get_prev(page, mtr);
	next_page_no = btr_page_get_next(page, mtr);

	/* Update page links of the level */

	if (prev_page_no != FIL_NULL) {
		/* Make the left neighbor point past this page. */
		buf_block_t*	prev_block
			= btr_block_get(space, zip_size, prev_page_no,
					RW_X_LATCH, index, mtr);
		page_t*		prev_page
			= buf_block_get_frame(prev_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(prev_page) == page_is_comp(page));
		ut_a(btr_page_get_next(prev_page, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		btr_page_set_next(prev_page,
				  buf_block_get_page_zip(prev_block),
				  next_page_no, mtr);
	}

	if (next_page_no != FIL_NULL) {
		/* Make the right neighbor point back past this page. */
		buf_block_t*	next_block
			= btr_block_get(space, zip_size, next_page_no,
					RW_X_LATCH, index, mtr);
		page_t*		next_page
			= buf_block_get_frame(next_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(next_page) == page_is_comp(page));
		ut_a(btr_page_get_prev(next_page, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		btr_page_set_prev(next_page,
				  buf_block_get_page_zip(next_block),
				  prev_page_no, mtr);
	}
}
3428
/****************************************************************//**
Writes the redo log record for setting an index record as the predefined
minimum record. */
UNIV_INLINE
void
btr_set_min_rec_mark_log(
/*=====================*/
	rec_t*	rec,	/*!< in: record */
	byte	type,	/*!< in: MLOG_COMP_REC_MIN_MARK or MLOG_REC_MIN_MARK */
	mtr_t*	mtr)	/*!< in: mtr */
{
	/* Write the initial log record header for the record's page. */
	mlog_write_initial_log_record(rec, type, mtr);

	/* Write rec offset as a 2-byte ulint; this is the payload read
	back by btr_parse_set_min_rec_mark(). */
	mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES);
}
#else /* !UNIV_HOTBACKUP */
/* Hot backup builds do not write redo log; make the call a no-op. */
# define btr_set_min_rec_mark_log(rec,comp,mtr) ((void) 0)
#endif /* !UNIV_HOTBACKUP */
3448
3449 /****************************************************************//**
3450 Parses the redo log record for setting an index record as the predefined
3451 minimum record.
3452 @return end of log record or NULL */
3453 UNIV_INTERN
3454 byte*
btr_parse_set_min_rec_mark(byte * ptr,byte * end_ptr,ulint comp,page_t * page,mtr_t * mtr)3455 btr_parse_set_min_rec_mark(
3456 /*=======================*/
3457 byte* ptr, /*!< in: buffer */
3458 byte* end_ptr,/*!< in: buffer end */
3459 ulint comp, /*!< in: nonzero=compact page format */
3460 page_t* page, /*!< in: page or NULL */
3461 mtr_t* mtr) /*!< in: mtr or NULL */
3462 {
3463 rec_t* rec;
3464
3465 if (end_ptr < ptr + 2) {
3466
3467 return(NULL);
3468 }
3469
3470 if (page) {
3471 ut_a(!page_is_comp(page) == !comp);
3472
3473 rec = page + mach_read_from_2(ptr);
3474
3475 btr_set_min_rec_mark(rec, mtr);
3476 }
3477
3478 return(ptr + 2);
3479 }
3480
3481 /****************************************************************//**
3482 Sets a record as the predefined minimum record. */
3483 UNIV_INTERN
3484 void
btr_set_min_rec_mark(rec_t * rec,mtr_t * mtr)3485 btr_set_min_rec_mark(
3486 /*=================*/
3487 rec_t* rec, /*!< in: record */
3488 mtr_t* mtr) /*!< in: mtr */
3489 {
3490 ulint info_bits;
3491
3492 if (page_rec_is_comp(rec)) {
3493 info_bits = rec_get_info_bits(rec, TRUE);
3494
3495 rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG);
3496
3497 btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr);
3498 } else {
3499 info_bits = rec_get_info_bits(rec, FALSE);
3500
3501 rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG);
3502
3503 btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr);
3504 }
3505 }
3506
3507 #ifndef UNIV_HOTBACKUP
3508 /*************************************************************//**
3509 Deletes on the upper level the node pointer to a page. */
3510 UNIV_INTERN
3511 void
btr_node_ptr_delete(dict_index_t * index,buf_block_t * block,mtr_t * mtr)3512 btr_node_ptr_delete(
3513 /*================*/
3514 dict_index_t* index, /*!< in: index tree */
3515 buf_block_t* block, /*!< in: page whose node pointer is deleted */
3516 mtr_t* mtr) /*!< in: mtr */
3517 {
3518 btr_cur_t cursor;
3519 ibool compressed;
3520 dberr_t err;
3521
3522 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3523
3524 /* Delete node pointer on father page */
3525 btr_page_get_father(index, block, mtr, &cursor);
3526
3527 compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor,
3528 BTR_CREATE_FLAG, RB_NONE, mtr);
3529 ut_a(err == DB_SUCCESS);
3530
3531 if (!compressed) {
3532 btr_cur_compress_if_useful(&cursor, FALSE, mtr);
3533 }
3534 }
3535
/*************************************************************//**
If page is the only on its level, this function moves its records to the
father page, thus reducing the tree height.
@return father block */
static
buf_block_t*
btr_lift_page_up(
/*=============*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: page which is the only on its level;
				must not be empty: use
				btr_discard_only_page_on_level if the last
				record from the page should be removed */
	mtr_t*		mtr)	/*!< in: mtr */
{
	buf_block_t*	father_block;
	page_t*		father_page;
	ulint		page_level;
	page_zip_des_t*	father_page_zip;
	page_t*		page = buf_block_get_frame(block);
	ulint		root_page_no;
	buf_block_t*	blocks[BTR_MAX_LEVELS];
	ulint		n_blocks;	/*!< last used index in blocks[] */
	ulint		i;
	bool		lift_father_up;
	buf_block_t*	block_orig = block;

	/* The page must be alone on its level: no siblings. */
	ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
	ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

	page_level = btr_page_get_level(page, mtr);
	root_page_no = dict_index_get_page(index);

	{
		btr_cur_t	cursor;
		ulint*		offsets	= NULL;
		mem_heap_t*	heap	= mem_heap_create(
			sizeof(*offsets)
			* (REC_OFFS_HEADER_SIZE + 1 + 1 + index->n_fields));
		buf_block_t*	b;

		offsets = btr_page_get_father_block(offsets, heap, index,
						    block, mtr, &cursor);
		father_block = btr_cur_get_block(&cursor);
		father_page_zip = buf_block_get_page_zip(father_block);
		father_page = buf_block_get_frame(father_block);

		n_blocks = 0;

		/* Store all ancestor pages so we can reset their
		levels later on.  We have to do all the searches on
		the tree now because later on, after we've replaced
		the first level, the tree is in an inconsistent state
		and can not be searched. */
		for (b = father_block;
		     buf_block_get_page_no(b) != root_page_no; ) {
			ut_a(n_blocks < BTR_MAX_LEVELS);

			offsets = btr_page_get_father_block(offsets, heap,
							    index, b,
							    mtr, &cursor);

			blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
		}

		lift_father_up = (n_blocks && page_level == 0);
		if (lift_father_up) {
			/* The father page also should be the only on its level (not
			root). We should lift up the father page at first.
			Because the leaf page should be lifted up only for root page.
			The freeing page is based on page_level (==0 or !=0)
			to choose segment. If the page_level is changed ==0 from !=0,
			later freeing of the page doesn't find the page allocation
			to be freed.*/

			/* Redirect the lift to operate on the father
			instead of the original leaf page. */
			block = father_block;
			page = buf_block_get_frame(block);
			page_level = btr_page_get_level(page, mtr);

			ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
			ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
			ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

			father_block = blocks[0];
			father_page_zip = buf_block_get_page_zip(father_block);
			father_page = buf_block_get_frame(father_block);
		}

		mem_heap_free(heap);
	}

	/* Drop adaptive hash index entries pointing into the page
	whose records are about to move. */
	btr_search_drop_page_hash_index(block);

	/* Make the father empty */
	btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
	/* The father page now takes over the role (and level) of the
	lifted page; ancestors above it will be renumbered below. */
	page_level++;

	/* Copy the records to the father page one by one. */
	if (0
#ifdef UNIV_ZIP_COPY
	    || father_page_zip
#endif /* UNIV_ZIP_COPY */
	    || !page_copy_rec_list_end(father_block, block,
				       page_get_infimum_rec(page),
				       index, mtr)) {
		const page_zip_des_t*	page_zip
			= buf_block_get_page_zip(block);
		ut_a(father_page_zip);
		ut_a(page_zip);

		/* Copy the page byte for byte. */
		page_zip_copy_recs(father_page_zip, father_page,
				   page_zip, page, index, mtr);

		/* Update the lock table and possible hash index. */

		lock_move_rec_list_end(father_block, block,
				       page_get_infimum_rec(page));

		btr_search_move_or_delete_hash_entries(father_block, block,
						       index);
	}

	btr_blob_dbg_remove(page, index, "btr_lift_page_up");
	lock_update_copy_and_discard(father_block, block);

	/* Go upward to root page, decrementing levels by one. */
	for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) {
		page_t*		page	= buf_block_get_frame(blocks[i]);
		page_zip_des_t*	page_zip= buf_block_get_page_zip(blocks[i]);

		ut_ad(btr_page_get_level(page, mtr) == page_level + 1);

		btr_page_set_level(page, page_zip, page_level, mtr);
#ifdef UNIV_ZIP_DEBUG
		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
	}

	/* Free the file page */
	btr_page_free(index, block, mtr);

	/* We play it safe and reset the free bits for the father */
	if (!dict_index_is_clust(index)) {
		ibuf_reset_free_bits(father_block);
	}
	ut_ad(page_validate(father_page, index));
	ut_ad(btr_check_node_ptr(index, father_block, mtr));

	return(lift_father_up ? block_orig : father_block);
}
3688
/*************************************************************//**
Tries to merge the page first to the left immediate brother if such a
brother exists, and the node pointers to the current page and to the brother
reside on the same page. If the left brother does not satisfy these
conditions, looks at the right brother. If the page is the only one on that
level lifts the records of the page to the father page, thus reducing the
tree height. It is assumed that mtr holds an x-latch on the tree and on the
page. If cursor is on the leaf level, mtr must also hold x-latches to the
brothers, if they exist.
@return TRUE on success */
UNIV_INTERN
ibool
btr_compress(
/*=========*/
	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to merge
				or lift; the page must not be empty:
				when deleting records, use btr_discard_page()
				if the page would become empty */
	ibool		adjust,	/*!< in: TRUE if should adjust the
				cursor position even if compression occurs */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	dict_index_t*	index;
	ulint		space;
	ulint		zip_size;
	ulint		left_page_no;
	ulint		right_page_no;
	buf_block_t*	merge_block;
	page_t*		merge_page = NULL;
	page_zip_des_t*	merge_page_zip;
	ibool		is_left;
	buf_block_t*	block;
	page_t*		page;
	btr_cur_t	father_cursor;
	mem_heap_t*	heap;
	ulint*		offsets;
	ulint		nth_rec = 0; /* remove bogus warning */
	DBUG_ENTER("btr_compress");

	block = btr_cur_get_block(cursor);
	page = btr_cur_get_page(cursor);
	index = btr_cur_get_index(cursor);

	btr_assert_not_corrupted(block, index);

	/* The caller must hold the index tree x-lock and an x-fix on the
	page that is being merged away. */
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);

	MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS);

	left_page_no = btr_page_get_prev(page, mtr);
	right_page_no = btr_page_get_next(page, mtr);

#ifdef UNIV_DEBUG
	/* On a leftmost non-leaf page the first user record must carry
	the predefined minimum record flag. */
	if (!page_is_leaf(page) && left_page_no == FIL_NULL) {
		ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
			page_rec_get_next(page_get_infimum_rec(page)),
			page_is_comp(page)));
	}
#endif /* UNIV_DEBUG */

	heap = mem_heap_create(100);
	offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
					    &father_cursor);

	if (adjust) {
		/* Remember the cursor position as a record count, so the
		cursor can be repositioned on the merge page afterwards. */
		nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
		ut_ad(nth_rec > 0);
	}

	if (left_page_no == FIL_NULL && right_page_no == FIL_NULL) {
		/* The page is the only one on the level, lift the records
		to the father */

		merge_block = btr_lift_page_up(index, block, mtr);
		goto func_exit;
	}

	/* Decide the page to which we try to merge and which will inherit
	the locks */

	is_left = btr_can_merge_with_page(cursor, left_page_no,
					  &merge_block, mtr);

	DBUG_EXECUTE_IF("ib_always_merge_right", is_left = FALSE;);

	if(!is_left
	   && !btr_can_merge_with_page(cursor, right_page_no, &merge_block,
				       mtr)) {
		/* Neither sibling can absorb this page's records. */
		goto err_exit;
	}

	merge_page = buf_block_get_frame(merge_block);

#ifdef UNIV_BTR_DEBUG
	if (is_left) {
		ut_a(btr_page_get_next(merge_page, mtr)
		     == buf_block_get_page_no(block));
	} else {
		ut_a(btr_page_get_prev(merge_page, mtr)
		     == buf_block_get_page_no(block));
	}
#endif /* UNIV_BTR_DEBUG */

	ut_ad(page_validate(merge_page, index));

	merge_page_zip = buf_block_get_page_zip(merge_block);
#ifdef UNIV_ZIP_DEBUG
	if (merge_page_zip) {
		const page_zip_des_t*	page_zip
			= buf_block_get_page_zip(block);
		ut_a(page_zip);
		ut_a(page_zip_validate(merge_page_zip, merge_page, index));
		ut_a(page_zip_validate(page_zip, page, index));
	}
#endif /* UNIV_ZIP_DEBUG */

	/* Move records to the merge page */
	if (is_left) {
		rec_t*	orig_pred = page_copy_rec_list_start(
			merge_block, block, page_get_supremum_rec(page),
			index, mtr);

		if (!orig_pred) {
			/* Copy can fail for a compressed page; the pages
			are unchanged, so just give up. */
			goto err_exit;
		}

		btr_search_drop_page_hash_index(block);

		/* Remove the page from the level list */
		btr_level_list_remove(space, zip_size, page, index, mtr);

		btr_node_ptr_delete(index, block, mtr);
		lock_update_merge_left(merge_block, orig_pred, block);

		if (adjust) {
			/* The copied records were appended after the
			records already on the merge page. */
			nth_rec += page_rec_get_n_recs_before(orig_pred);
		}
	} else {
		rec_t*		orig_succ;
		ibool		compressed;
		dberr_t		err;
		btr_cur_t	cursor2;
					/* father cursor pointing to node ptr
					of the right sibling */
#ifdef UNIV_BTR_DEBUG
		byte		fil_page_prev[4];
#endif /* UNIV_BTR_DEBUG */

		btr_page_get_father(index, merge_block, mtr, &cursor2);

		if (merge_page_zip && left_page_no == FIL_NULL) {

			/* The function page_zip_compress(), which will be
			invoked by page_copy_rec_list_end() below,
			requires that FIL_PAGE_PREV be FIL_NULL.
			Clear the field, but prepare to restore it. */
#ifdef UNIV_BTR_DEBUG
			memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
#endif /* UNIV_BTR_DEBUG */
#if FIL_NULL != 0xffffffff
# error "FIL_NULL != 0xffffffff"
#endif
			memset(merge_page + FIL_PAGE_PREV, 0xff, 4);
		}

		orig_succ = page_copy_rec_list_end(merge_block, block,
						   page_get_infimum_rec(page),
						   cursor->index, mtr);

		if (!orig_succ) {
			ut_a(merge_page_zip);
#ifdef UNIV_BTR_DEBUG
			if (left_page_no == FIL_NULL) {
				/* FIL_PAGE_PREV was restored from
				merge_page_zip. */
				ut_a(!memcmp(fil_page_prev,
					     merge_page + FIL_PAGE_PREV, 4));
			}
#endif /* UNIV_BTR_DEBUG */
			goto err_exit;
		}

		btr_search_drop_page_hash_index(block);

#ifdef UNIV_BTR_DEBUG
		if (merge_page_zip && left_page_no == FIL_NULL) {

			/* Restore FIL_PAGE_PREV in order to avoid an assertion
			failure in btr_level_list_remove(), which will set
			the field again to FIL_NULL. Even though this makes
			merge_page and merge_page_zip inconsistent for a
			split second, it is harmless, because the pages
			are X-latched. */
			memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4);
		}
#endif /* UNIV_BTR_DEBUG */

		/* Remove the page from the level list */
		btr_level_list_remove(space, zip_size, page, index, mtr);

		/* Replace the address of the old child node (= page) with the
		address of the merge page to the right */
		btr_node_ptr_set_child_page_no(
			btr_cur_get_rec(&father_cursor),
			btr_cur_get_page_zip(&father_cursor),
			offsets, right_page_no, mtr);

		/* Delete the right sibling's old node pointer; the father
		record updated above now points to the merge page. */
		compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor2,
							BTR_CREATE_FLAG,
							RB_NONE, mtr);
		ut_a(err == DB_SUCCESS);

		if (!compressed) {
			btr_cur_compress_if_useful(&cursor2, FALSE, mtr);
		}

		lock_update_merge_right(merge_block, orig_succ, block);
	}

	btr_blob_dbg_remove(page, index, "btr_compress");

	if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) {
		/* Update the free bits of the B-tree page in the
		insert buffer bitmap. This has to be done in a
		separate mini-transaction that is committed before the
		main mini-transaction. We cannot update the insert
		buffer bitmap in this mini-transaction, because
		btr_compress() can be invoked recursively without
		committing the mini-transaction in between. Since
		insert buffer bitmap pages have a lower rank than
		B-tree pages, we must not access other pages in the
		same mini-transaction after accessing an insert buffer
		bitmap page. */

		/* The free bits in the insert buffer bitmap must
		never exceed the free space on a page.  It is safe to
		decrement or reset the bits in the bitmap in a
		mini-transaction that is committed before the
		mini-transaction that affects the free space. */

		/* It is unsafe to increment the bits in a separately
		committed mini-transaction, because in crash recovery,
		the free bits could momentarily be set too high. */

		if (zip_size) {
			/* Because the free bits may be incremented
			and we cannot update the insert buffer bitmap
			in the same mini-transaction, the only safe
			thing we can do here is the pessimistic
			approach: reset the free bits. */
			ibuf_reset_free_bits(merge_block);
		} else {
			/* On uncompressed pages, the free bits will
			never increase here.  Thus, it is safe to
			write the bits accurately in a separate
			mini-transaction. */
			ibuf_update_free_bits_if_full(merge_block,
						      UNIV_PAGE_SIZE,
						      ULINT_UNDEFINED);
		}
	}

	ut_ad(page_validate(merge_page, index));
#ifdef UNIV_ZIP_DEBUG
	ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page,
						  index));
#endif /* UNIV_ZIP_DEBUG */

	/* Free the file page */
	btr_page_free(index, block, mtr);

	ut_ad(btr_check_node_ptr(index, merge_block, mtr));
func_exit:
	mem_heap_free(heap);

	if (adjust) {
		/* Reposition the cursor on the merge page at the record
		count remembered before the merge. */
		ut_ad(nth_rec > 0);
		btr_cur_position(
			index,
			page_rec_get_nth(merge_block->frame, nth_rec),
			merge_block, cursor);
	}

	MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL);

	DBUG_RETURN(TRUE);

err_exit:
	/* We play it safe and reset the free bits. */
	if (zip_size
	    && merge_page
	    && page_is_leaf(merge_page)
	    && !dict_index_is_clust(index)) {
		ibuf_reset_free_bits(merge_block);
	}

	mem_heap_free(heap);
	DBUG_RETURN(FALSE);
}
3992
/*************************************************************//**
Discards a page that is the only page on its level.  This will empty
the whole B-tree, leaving just an empty root page.  This function
should never be reached, because btr_compress(), which is invoked in
delete operations, calls btr_lift_page_up() to flatten the B-tree. */
static
void
btr_discard_only_page_on_level(
/*===========================*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: page which is the only on its level */
	mtr_t*		mtr)	/*!< in: mtr */
{
	ulint		page_level = 0;
	trx_id_t	max_trx_id;

	/* Save the PAGE_MAX_TRX_ID from the leaf page. */
	max_trx_id = page_get_max_trx_id(buf_block_get_frame(block));

	/* Walk up the tree, freeing each page that became the only one on
	its level, until the root page is reached. */
	while (buf_block_get_page_no(block) != dict_index_get_page(index)) {
		btr_cur_t	cursor;
		buf_block_t*	father;
		const page_t*	page	= buf_block_get_frame(block);

		/* Each page on the way up holds exactly one node pointer
		and has no siblings. */
		ut_a(page_get_n_recs(page) == 1);
		ut_a(page_level == btr_page_get_level(page, mtr));
		ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
		ut_a(btr_page_get_next(page, mtr) == FIL_NULL);

		ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
		btr_search_drop_page_hash_index(block);

		btr_page_get_father(index, block, mtr, &cursor);
		father = btr_cur_get_block(&cursor);

		/* Transfer the record locks to the father page before the
		page is freed. */
		lock_update_discard(father, PAGE_HEAP_NO_SUPREMUM, block);

		/* Free the file page */
		btr_page_free(index, block, mtr);

		block = father;
		page_level++;
	}

	/* block is the root page, which must be empty, except
	for the node pointer to the (now discarded) block(s). */

#ifdef UNIV_BTR_DEBUG
	if (!dict_index_is_ibuf(index)) {
		const page_t*	root	= buf_block_get_frame(block);
		const ulint	space	= dict_index_get_space(index);
		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
					    + root, space));
		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
					    + root, space));
	}
#endif /* UNIV_BTR_DEBUG */

	/* Re-initialize the root as an empty leaf page (level 0). */
	btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr);
	ut_ad(page_is_leaf(buf_block_get_frame(block)));

	if (!dict_index_is_clust(index)) {
		/* We play it safe and reset the free bits for the root */
		ibuf_reset_free_bits(block);

		/* Carry over the PAGE_MAX_TRX_ID saved from the original
		leaf page to the now-leaf root. */
		ut_a(max_trx_id);
		page_set_max_trx_id(block,
				    buf_block_get_page_zip(block),
				    max_trx_id, mtr);
	}
}
4064
/*************************************************************//**
Discards a page from a B-tree.  This is used to remove the last record from
a B-tree page: the whole page must be removed at the same time.  This cannot
be used for the root page, which is allowed to be empty. */
UNIV_INTERN
void
btr_discard_page(
/*=============*/
	btr_cur_t*	cursor,	/*!< in: cursor on the page to discard: not on
				the root page */
	mtr_t*		mtr)	/*!< in: mtr */
{
	dict_index_t*	index;
	ulint		space;
	ulint		zip_size;
	ulint		left_page_no;
	ulint		right_page_no;
	buf_block_t*	merge_block;
	page_t*		merge_page;
	buf_block_t*	block;
	page_t*		page;
	rec_t*		node_ptr;

	block = btr_cur_get_block(cursor);
	index = btr_cur_get_index(cursor);

	/* The root page must not be discarded; the caller must hold the
	index tree x-lock and an x-fix on the page. */
	ut_ad(dict_index_get_page(index) != buf_block_get_page_no(block));
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);

	MONITOR_INC(MONITOR_INDEX_DISCARD);

	/* Decide the page which will inherit the locks */

	left_page_no = btr_page_get_prev(buf_block_get_frame(block), mtr);
	right_page_no = btr_page_get_next(buf_block_get_frame(block), mtr);

	if (left_page_no != FIL_NULL) {
		/* Prefer the left sibling as the lock inheritor. */
		merge_block = btr_block_get(space, zip_size, left_page_no,
					    RW_X_LATCH, index, mtr);
		merge_page = buf_block_get_frame(merge_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(btr_page_get_next(merge_page, mtr)
		     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */
	} else if (right_page_no != FIL_NULL) {
		merge_block = btr_block_get(space, zip_size, right_page_no,
					    RW_X_LATCH, index, mtr);
		merge_page = buf_block_get_frame(merge_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(btr_page_get_prev(merge_page, mtr)
		     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */
	} else {
		/* No siblings: the page is the only one on its level. */
		btr_discard_only_page_on_level(index, block, mtr);

		return;
	}

	page = buf_block_get_frame(block);
	ut_a(page_is_comp(merge_page) == page_is_comp(page));
	btr_search_drop_page_hash_index(block);

	if (left_page_no == FIL_NULL && !page_is_leaf(page)) {

		/* We have to mark the leftmost node pointer on the right
		side page as the predefined minimum record */
		node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page));

		ut_ad(page_rec_is_user_rec(node_ptr));

		/* This will make page_zip_validate() fail on merge_page
		until btr_level_list_remove() completes.  This is harmless,
		because everything will take place within a single
		mini-transaction and because writing to the redo log
		is an atomic operation (performed by mtr_commit()). */
		btr_set_min_rec_mark(node_ptr, mtr);
	}

	/* Delete the node pointer of the discarded page in its father. */
	btr_node_ptr_delete(index, block, mtr);

	/* Remove the page from the level list */
	btr_level_list_remove(space, zip_size, page, index, mtr);
#ifdef UNIV_ZIP_DEBUG
	{
		page_zip_des_t*	merge_page_zip
			= buf_block_get_page_zip(merge_block);
		ut_a(!merge_page_zip
		     || page_zip_validate(merge_page_zip, merge_page, index));
	}
#endif /* UNIV_ZIP_DEBUG */

	/* Transfer the record locks of the discarded page to the
	inheritor page. */
	if (left_page_no != FIL_NULL) {
		lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM,
				    block);
	} else {
		lock_update_discard(merge_block,
				    lock_get_min_heap_no(merge_block),
				    block);
	}

	btr_blob_dbg_remove(page, index, "btr_discard_page");

	/* Free the file page */
	btr_page_free(index, block, mtr);

	ut_ad(btr_check_node_ptr(index, merge_block, mtr));
}
4176
4177 #ifdef UNIV_BTR_PRINT
4178 /*************************************************************//**
4179 Prints size info of a B-tree. */
4180 UNIV_INTERN
4181 void
btr_print_size(dict_index_t * index)4182 btr_print_size(
4183 /*===========*/
4184 dict_index_t* index) /*!< in: index tree */
4185 {
4186 page_t* root;
4187 fseg_header_t* seg;
4188 mtr_t mtr;
4189
4190 if (dict_index_is_ibuf(index)) {
4191 fputs("Sorry, cannot print info of an ibuf tree:"
4192 " use ibuf functions\n", stderr);
4193
4194 return;
4195 }
4196
4197 mtr_start(&mtr);
4198
4199 root = btr_root_get(index, &mtr);
4200
4201 seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
4202
4203 fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
4204 fseg_print(seg, &mtr);
4205
4206 if (!dict_index_is_univ(index)) {
4207
4208 seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
4209
4210 fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
4211 fseg_print(seg, &mtr);
4212 }
4213
4214 mtr_commit(&mtr);
4215 }
4216
/************************************************************//**
Prints recursively index tree pages.  On each non-leaf page, up to
'width' node pointers from the start and from the end are followed. */
static
void
btr_print_recursive(
/*================*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: index page */
	ulint		width,	/*!< in: print this many entries from start
				and end */
	mem_heap_t**	heap,	/*!< in/out: heap for rec_get_offsets() */
	ulint**		offsets,/*!< in/out: buffer for rec_get_offsets() */
	mtr_t*		mtr)	/*!< in: mtr */
{
	const page_t*	page	= buf_block_get_frame(block);
	page_cur_t	cursor;
	ulint		n_recs;
	ulint		i	= 0;
	mtr_t		mtr2;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n",
		(ulong) btr_page_get_level(page, mtr),
		(ulong) buf_block_get_page_no(block));

	page_print(block, index, width, width);

	n_recs = page_get_n_recs(page);

	page_cur_set_before_first(block, &cursor);
	page_cur_move_to_next(&cursor);

	while (!page_cur_is_after_last(&cursor)) {

		if (page_is_leaf(page)) {

			/* If this is the leaf level, do nothing */

		} else if ((i <= width) || (i >= n_recs - width)) {

			const rec_t*	node_ptr;

			/* Descend into the child in a nested
			mini-transaction, which is committed as soon as
			the subtree has been printed. */
			mtr_start(&mtr2);

			node_ptr = page_cur_get_rec(&cursor);

			*offsets = rec_get_offsets(node_ptr, index, *offsets,
						   ULINT_UNDEFINED, heap);
			btr_print_recursive(index,
					    btr_node_ptr_get_child(node_ptr,
								   index,
								   *offsets,
								   &mtr2),
					    width, heap, offsets, &mtr2);
			mtr_commit(&mtr2);
		}

		page_cur_move_to_next(&cursor);
		i++;
	}
}
4278
4279 /**************************************************************//**
4280 Prints directories and other info of all nodes in the tree. */
4281 UNIV_INTERN
4282 void
btr_print_index(dict_index_t * index,ulint width)4283 btr_print_index(
4284 /*============*/
4285 dict_index_t* index, /*!< in: index */
4286 ulint width) /*!< in: print this many entries from start
4287 and end */
4288 {
4289 mtr_t mtr;
4290 buf_block_t* root;
4291 mem_heap_t* heap = NULL;
4292 ulint offsets_[REC_OFFS_NORMAL_SIZE];
4293 ulint* offsets = offsets_;
4294 rec_offs_init(offsets_);
4295
4296 fputs("--------------------------\n"
4297 "INDEX TREE PRINT\n", stderr);
4298
4299 mtr_start(&mtr);
4300
4301 root = btr_root_block_get(index, RW_X_LATCH, &mtr);
4302
4303 btr_print_recursive(index, root, width, &heap, &offsets, &mtr);
4304 if (heap) {
4305 mem_heap_free(heap);
4306 }
4307
4308 mtr_commit(&mtr);
4309
4310 btr_validate_index(index, 0);
4311 }
4312 #endif /* UNIV_BTR_PRINT */
4313
#ifdef UNIV_DEBUG
/************************************************************//**
Checks that the node pointer to a page is appropriate: the node pointer
stored in the father page must equal one built from the first user
record on the page.  Always returns TRUE; mismatches fire ut_a().
@return TRUE */
UNIV_INTERN
ibool
btr_check_node_ptr(
/*===============*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: index page */
	mtr_t*		mtr)	/*!< in: mtr */
{
	btr_cur_t	cursor;
	mem_heap_t*	heap;
	ulint*		offsets;
	page_t*		page	= buf_block_get_frame(block);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

	/* The root page has no father: nothing to check. */
	if (dict_index_get_page(index) == buf_block_get_page_no(block)) {

		return(TRUE);
	}

	heap = mem_heap_create(256);
	offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
					    &cursor);

	if (!page_is_leaf(page)) {
		/* Build the node pointer from the first user record on
		this page and compare it with the father record. */
		dtuple_t*	node_tuple = dict_index_build_node_ptr(
			index,
			page_rec_get_next(page_get_infimum_rec(page)),
			0, heap, btr_page_get_level(page, mtr));

		ut_a(!cmp_dtuple_rec(node_tuple, btr_cur_get_rec(&cursor),
				     offsets));
	}

	mem_heap_free(heap);

	return(TRUE);
}
#endif /* UNIV_DEBUG */
4358
4359 /************************************************************//**
4360 Display identification information for a record. */
4361 static
4362 void
btr_index_rec_validate_report(const page_t * page,const rec_t * rec,const dict_index_t * index)4363 btr_index_rec_validate_report(
4364 /*==========================*/
4365 const page_t* page, /*!< in: index page */
4366 const rec_t* rec, /*!< in: index record */
4367 const dict_index_t* index) /*!< in: index */
4368 {
4369 fputs("InnoDB: Record in ", stderr);
4370 dict_index_name_print(stderr, NULL, index);
4371 fprintf(stderr, ", page %lu, at offset %lu\n",
4372 page_get_page_no(page), (ulint) page_offset(rec));
4373 }
4374
/************************************************************//**
Checks the size and number of fields in a record based on the definition of
the index.
@return TRUE if ok */
UNIV_INTERN
ibool
btr_index_rec_validate(
/*===================*/
	const rec_t*		rec,		/*!< in: index record */
	const dict_index_t*	index,		/*!< in: index */
	ibool			dump_on_error)	/*!< in: TRUE if the function
						should print hex dump of record
						and page on error */
{
	ulint		len;
	ulint		n;
	ulint		i;
	const page_t*	page;
	mem_heap_t*	heap	= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets	= offsets_;
	rec_offs_init(offsets_);

	page = page_align(rec);

	if (dict_index_is_univ(index)) {
		/* The insert buffer index tree can contain records from any
		other index: we cannot check the number of fields or
		their length */

		return(TRUE);
	}

	/* The ROW_FORMAT (compact flag) of the page must agree with the
	table definition. */
	if ((ibool)!!page_is_comp(page) != dict_table_is_comp(index->table)) {
		btr_index_rec_validate_report(page, rec, index);
		fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n",
			(ulong) !!page_is_comp(page),
			(ulong) dict_table_is_comp(index->table));

		return(FALSE);
	}

	n = dict_index_get_n_fields(index);

	/* In the old (redundant) format the field count is stored in the
	record itself and must match the index definition. */
	if (!page_is_comp(page) && rec_get_n_fields_old(rec) != n) {
		btr_index_rec_validate_report(page, rec, index);
		fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n",
			(ulong) rec_get_n_fields_old(rec), (ulong) n);

		if (dump_on_error) {
			buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);

			fputs("InnoDB: corrupt record ", stderr);
			rec_print_old(stderr, rec);
			putc('\n', stderr);
		}
		return(FALSE);
	}

	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);

	/* Check each field length against the column definition. */
	for (i = 0; i < n; i++) {
		ulint	fixed_size = dict_col_get_fixed_size(
			dict_index_get_nth_col(index, i), page_is_comp(page));

		rec_get_nth_field_offs(offsets, i, &len);

		/* Note that if fixed_size != 0, it equals the
		length of a fixed-size column in the clustered index.
		A prefix index of the column is of fixed, but different
		length.  When fixed_size == 0, prefix_len is the maximum
		length of the prefix index column. */

		if ((dict_index_get_nth_field(index, i)->prefix_len == 0
		     && len != UNIV_SQL_NULL && fixed_size
		     && len != fixed_size)
		    || (dict_index_get_nth_field(index, i)->prefix_len > 0
			&& len != UNIV_SQL_NULL
			&& len
			> dict_index_get_nth_field(index, i)->prefix_len)) {

			btr_index_rec_validate_report(page, rec, index);
			fprintf(stderr,
				"InnoDB: field %lu len is %lu,"
				" should be %lu\n",
				(ulong) i, (ulong) len, (ulong) fixed_size);

			if (dump_on_error) {
				buf_page_print(page, 0,
					       BUF_PAGE_PRINT_NO_CRASH);

				fputs("InnoDB: corrupt record ", stderr);
				rec_print_new(stderr, rec, offsets);
				putc('\n', stderr);
			}
			/* rec_get_offsets() may have allocated a heap. */
			if (heap) {
				mem_heap_free(heap);
			}
			return(FALSE);
		}
	}

	if (heap) {
		mem_heap_free(heap);
	}
	return(TRUE);
}
4482
4483 /************************************************************//**
4484 Checks the size and number of fields in records based on the definition of
4485 the index.
4486 @return TRUE if ok */
4487 static
4488 ibool
btr_index_page_validate(buf_block_t * block,dict_index_t * index)4489 btr_index_page_validate(
4490 /*====================*/
4491 buf_block_t* block, /*!< in: index page */
4492 dict_index_t* index) /*!< in: index */
4493 {
4494 page_cur_t cur;
4495 ibool ret = TRUE;
4496 #ifndef DBUG_OFF
4497 ulint nth = 1;
4498 #endif /* !DBUG_OFF */
4499
4500 page_cur_set_before_first(block, &cur);
4501
4502 /* Directory slot 0 should only contain the infimum record. */
4503 DBUG_EXECUTE_IF("check_table_rec_next",
4504 ut_a(page_rec_get_nth_const(
4505 page_cur_get_page(&cur), 0)
4506 == cur.rec);
4507 ut_a(page_dir_slot_get_n_owned(
4508 page_dir_get_nth_slot(
4509 page_cur_get_page(&cur), 0))
4510 == 1););
4511
4512 page_cur_move_to_next(&cur);
4513
4514 for (;;) {
4515 if (page_cur_is_after_last(&cur)) {
4516
4517 break;
4518 }
4519
4520 if (!btr_index_rec_validate(cur.rec, index, TRUE)) {
4521
4522 return(FALSE);
4523 }
4524
4525 /* Verify that page_rec_get_nth_const() is correctly
4526 retrieving each record. */
4527 DBUG_EXECUTE_IF("check_table_rec_next",
4528 ut_a(cur.rec == page_rec_get_nth_const(
4529 page_cur_get_page(&cur),
4530 page_rec_get_n_recs_before(
4531 cur.rec)));
4532 ut_a(nth++ == page_rec_get_n_recs_before(
4533 cur.rec)););
4534
4535 page_cur_move_to_next(&cur);
4536 }
4537
4538 return(ret);
4539 }
4540
4541 /************************************************************//**
4542 Report an error on one page of an index tree. */
4543 static
4544 void
btr_validate_report1(dict_index_t * index,ulint level,const buf_block_t * block)4545 btr_validate_report1(
4546 /*=================*/
4547 dict_index_t* index, /*!< in: index */
4548 ulint level, /*!< in: B-tree level */
4549 const buf_block_t* block) /*!< in: index page */
4550 {
4551 fprintf(stderr, "InnoDB: Error in page %lu of ",
4552 buf_block_get_page_no(block));
4553 dict_index_name_print(stderr, NULL, index);
4554 if (level) {
4555 fprintf(stderr, ", index tree level %lu", level);
4556 }
4557 putc('\n', stderr);
4558 }
4559
4560 /************************************************************//**
4561 Report an error on two pages of an index tree. */
4562 static
4563 void
btr_validate_report2(const dict_index_t * index,ulint level,const buf_block_t * block1,const buf_block_t * block2)4564 btr_validate_report2(
4565 /*=================*/
4566 const dict_index_t* index, /*!< in: index */
4567 ulint level, /*!< in: B-tree level */
4568 const buf_block_t* block1, /*!< in: first index page */
4569 const buf_block_t* block2) /*!< in: second index page */
4570 {
4571 fprintf(stderr, "InnoDB: Error in pages %lu and %lu of ",
4572 buf_block_get_page_no(block1),
4573 buf_block_get_page_no(block2));
4574 dict_index_name_print(stderr, NULL, index);
4575 if (level) {
4576 fprintf(stderr, ", index tree level %lu", level);
4577 }
4578 putc('\n', stderr);
4579 }
4580
4581 /************************************************************//**
4582 Validates index tree level.
4583 @return TRUE if ok */
4584 static
4585 bool
btr_validate_level(dict_index_t * index,const trx_t * trx,ulint level)4586 btr_validate_level(
4587 /*===============*/
4588 dict_index_t* index, /*!< in: index tree */
4589 const trx_t* trx, /*!< in: transaction or NULL */
4590 ulint level) /*!< in: level number */
4591 {
4592 ulint space;
4593 ulint space_flags;
4594 ulint zip_size;
4595 buf_block_t* block;
4596 page_t* page;
4597 buf_block_t* right_block = 0; /* remove warning */
4598 page_t* right_page = 0; /* remove warning */
4599 page_t* father_page;
4600 btr_cur_t node_cur;
4601 btr_cur_t right_node_cur;
4602 rec_t* rec;
4603 ulint right_page_no;
4604 ulint left_page_no;
4605 page_cur_t cursor;
4606 dtuple_t* node_ptr_tuple;
4607 bool ret = true;
4608 mtr_t mtr;
4609 mem_heap_t* heap = mem_heap_create(256);
4610 fseg_header_t* seg;
4611 ulint* offsets = NULL;
4612 ulint* offsets2= NULL;
4613 #ifdef UNIV_ZIP_DEBUG
4614 page_zip_des_t* page_zip;
4615 #endif /* UNIV_ZIP_DEBUG */
4616
4617 mtr_start(&mtr);
4618
4619 mtr_x_lock(dict_index_get_lock(index), &mtr);
4620
4621 block = btr_root_block_get(index, RW_X_LATCH, &mtr);
4622 page = buf_block_get_frame(block);
4623 seg = page + PAGE_HEADER + PAGE_BTR_SEG_TOP;
4624
4625 space = dict_index_get_space(index);
4626 zip_size = dict_table_zip_size(index->table);
4627
4628 fil_space_get_latch(space, &space_flags);
4629
4630 if (zip_size != dict_tf_get_zip_size(space_flags)) {
4631
4632 ib_logf(IB_LOG_LEVEL_WARN,
4633 "Flags mismatch: table=%lu, tablespace=%lu",
4634 (ulint) index->table->flags, (ulint) space_flags);
4635
4636 mtr_commit(&mtr);
4637
4638 return(false);
4639 }
4640
4641 while (level != btr_page_get_level(page, &mtr)) {
4642 const rec_t* node_ptr;
4643
4644 if (fseg_page_is_free(seg,
4645 block->page.space, block->page.offset)) {
4646
4647 btr_validate_report1(index, level, block);
4648
4649 ib_logf(IB_LOG_LEVEL_WARN, "page is free");
4650
4651 ret = false;
4652 }
4653
4654 ut_a(space == buf_block_get_space(block));
4655 ut_a(space == page_get_space_id(page));
4656 #ifdef UNIV_ZIP_DEBUG
4657 page_zip = buf_block_get_page_zip(block);
4658 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4659 #endif /* UNIV_ZIP_DEBUG */
4660 ut_a(!page_is_leaf(page));
4661
4662 page_cur_set_before_first(block, &cursor);
4663 page_cur_move_to_next(&cursor);
4664
4665 node_ptr = page_cur_get_rec(&cursor);
4666 offsets = rec_get_offsets(node_ptr, index, offsets,
4667 ULINT_UNDEFINED, &heap);
4668 block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr);
4669 page = buf_block_get_frame(block);
4670 }
4671
4672 /* Now we are on the desired level. Loop through the pages on that
4673 level. */
4674
4675 if (level == 0) {
4676 /* Leaf pages are managed in their own file segment. */
4677 seg -= PAGE_BTR_SEG_TOP - PAGE_BTR_SEG_LEAF;
4678 }
4679
4680 loop:
4681 mem_heap_empty(heap);
4682 offsets = offsets2 = NULL;
4683 mtr_x_lock(dict_index_get_lock(index), &mtr);
4684
4685 #ifdef UNIV_ZIP_DEBUG
4686 page_zip = buf_block_get_page_zip(block);
4687 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4688 #endif /* UNIV_ZIP_DEBUG */
4689
4690 ut_a(block->page.space == space);
4691
4692 if (fseg_page_is_free(seg, block->page.space, block->page.offset)) {
4693
4694 btr_validate_report1(index, level, block);
4695
4696 ib_logf(IB_LOG_LEVEL_WARN, "Page is marked as free");
4697 ret = false;
4698
4699 } else if (btr_page_get_index_id(page) != index->id) {
4700
4701 ib_logf(IB_LOG_LEVEL_ERROR,
4702 "Page index id " IB_ID_FMT " != data dictionary "
4703 "index id " IB_ID_FMT,
4704 btr_page_get_index_id(page), index->id);
4705
4706 ret = false;
4707
4708 } else if (!page_validate(page, index)) {
4709
4710 btr_validate_report1(index, level, block);
4711 ret = false;
4712
4713 } else if (level == 0 && !btr_index_page_validate(block, index)) {
4714
4715 /* We are on level 0. Check that the records have the right
4716 number of fields, and field lengths are right. */
4717
4718 ret = false;
4719 }
4720
4721 ut_a(btr_page_get_level(page, &mtr) == level);
4722
4723 right_page_no = btr_page_get_next(page, &mtr);
4724 left_page_no = btr_page_get_prev(page, &mtr);
4725
4726 ut_a(!page_is_empty(page)
4727 || (level == 0
4728 && page_get_page_no(page) == dict_index_get_page(index)));
4729
4730 if (right_page_no != FIL_NULL) {
4731 const rec_t* right_rec;
4732 right_block = btr_block_get(space, zip_size, right_page_no,
4733 RW_X_LATCH, index, &mtr);
4734 right_page = buf_block_get_frame(right_block);
4735 if (btr_page_get_prev(right_page, &mtr)
4736 != page_get_page_no(page)) {
4737
4738 btr_validate_report2(index, level, block, right_block);
4739 fputs("InnoDB: broken FIL_PAGE_NEXT"
4740 " or FIL_PAGE_PREV links\n", stderr);
4741 buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
4742 buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH);
4743
4744 ret = false;
4745 }
4746
4747 if (page_is_comp(right_page) != page_is_comp(page)) {
4748 btr_validate_report2(index, level, block, right_block);
4749 fputs("InnoDB: 'compact' flag mismatch\n", stderr);
4750 buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
4751 buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH);
4752
4753 ret = false;
4754
4755 goto node_ptr_fails;
4756 }
4757
4758 rec = page_rec_get_prev(page_get_supremum_rec(page));
4759 right_rec = page_rec_get_next(page_get_infimum_rec(
4760 right_page));
4761 offsets = rec_get_offsets(rec, index,
4762 offsets, ULINT_UNDEFINED, &heap);
4763 offsets2 = rec_get_offsets(right_rec, index,
4764 offsets2, ULINT_UNDEFINED, &heap);
4765 if (cmp_rec_rec(rec, right_rec, offsets, offsets2,
4766 index) >= 0) {
4767
4768 btr_validate_report2(index, level, block, right_block);
4769
4770 fputs("InnoDB: records in wrong order"
4771 " on adjacent pages\n", stderr);
4772
4773 buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
4774 buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH);
4775
4776 fputs("InnoDB: record ", stderr);
4777 rec = page_rec_get_prev(page_get_supremum_rec(page));
4778 rec_print(stderr, rec, index);
4779 putc('\n', stderr);
4780 fputs("InnoDB: record ", stderr);
4781 rec = page_rec_get_next(
4782 page_get_infimum_rec(right_page));
4783 rec_print(stderr, rec, index);
4784 putc('\n', stderr);
4785
4786 ret = false;
4787 }
4788 }
4789
4790 if (level > 0 && left_page_no == FIL_NULL) {
4791 ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
4792 page_rec_get_next(page_get_infimum_rec(page)),
4793 page_is_comp(page)));
4794 }
4795
4796 if (buf_block_get_page_no(block) != dict_index_get_page(index)) {
4797
4798 /* Check father node pointers */
4799
4800 rec_t* node_ptr;
4801
4802 offsets = btr_page_get_father_block(offsets, heap, index,
4803 block, &mtr, &node_cur);
4804 father_page = btr_cur_get_page(&node_cur);
4805 node_ptr = btr_cur_get_rec(&node_cur);
4806
4807 btr_cur_position(
4808 index, page_rec_get_prev(page_get_supremum_rec(page)),
4809 block, &node_cur);
4810 offsets = btr_page_get_father_node_ptr(offsets, heap,
4811 &node_cur, &mtr);
4812
4813 if (node_ptr != btr_cur_get_rec(&node_cur)
4814 || btr_node_ptr_get_child_page_no(node_ptr, offsets)
4815 != buf_block_get_page_no(block)) {
4816
4817 btr_validate_report1(index, level, block);
4818
4819 fputs("InnoDB: node pointer to the page is wrong\n",
4820 stderr);
4821
4822 buf_page_print(father_page, 0, BUF_PAGE_PRINT_NO_CRASH);
4823 buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
4824
4825 fputs("InnoDB: node ptr ", stderr);
4826 rec_print(stderr, node_ptr, index);
4827
4828 rec = btr_cur_get_rec(&node_cur);
4829 fprintf(stderr, "\n"
4830 "InnoDB: node ptr child page n:o %lu\n",
4831 (ulong) btr_node_ptr_get_child_page_no(
4832 rec, offsets));
4833
4834 fputs("InnoDB: record on page ", stderr);
4835 rec_print_new(stderr, rec, offsets);
4836 putc('\n', stderr);
4837 ret = false;
4838
4839 goto node_ptr_fails;
4840 }
4841
4842 if (!page_is_leaf(page)) {
4843 node_ptr_tuple = dict_index_build_node_ptr(
4844 index,
4845 page_rec_get_next(page_get_infimum_rec(page)),
4846 0, heap, btr_page_get_level(page, &mtr));
4847
4848 if (cmp_dtuple_rec(node_ptr_tuple, node_ptr,
4849 offsets)) {
4850 const rec_t* first_rec = page_rec_get_next(
4851 page_get_infimum_rec(page));
4852
4853 btr_validate_report1(index, level, block);
4854
4855 buf_page_print(father_page, 0,
4856 BUF_PAGE_PRINT_NO_CRASH);
4857 buf_page_print(page, 0,
4858 BUF_PAGE_PRINT_NO_CRASH);
4859
4860 fputs("InnoDB: Error: node ptrs differ"
4861 " on levels > 0\n"
4862 "InnoDB: node ptr ", stderr);
4863 rec_print_new(stderr, node_ptr, offsets);
4864 fputs("InnoDB: first rec ", stderr);
4865 rec_print(stderr, first_rec, index);
4866 putc('\n', stderr);
4867 ret = false;
4868
4869 goto node_ptr_fails;
4870 }
4871 }
4872
4873 if (left_page_no == FIL_NULL) {
4874 ut_a(node_ptr == page_rec_get_next(
4875 page_get_infimum_rec(father_page)));
4876 ut_a(btr_page_get_prev(father_page, &mtr) == FIL_NULL);
4877 }
4878
4879 if (right_page_no == FIL_NULL) {
4880 ut_a(node_ptr == page_rec_get_prev(
4881 page_get_supremum_rec(father_page)));
4882 ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL);
4883 } else {
4884 const rec_t* right_node_ptr
4885 = page_rec_get_next(node_ptr);
4886
4887 offsets = btr_page_get_father_block(
4888 offsets, heap, index, right_block,
4889 &mtr, &right_node_cur);
4890 if (right_node_ptr
4891 != page_get_supremum_rec(father_page)) {
4892
4893 if (btr_cur_get_rec(&right_node_cur)
4894 != right_node_ptr) {
4895 ret = false;
4896 fputs("InnoDB: node pointer to"
4897 " the right page is wrong\n",
4898 stderr);
4899
4900 btr_validate_report1(index, level,
4901 block);
4902
4903 buf_page_print(
4904 father_page, 0,
4905 BUF_PAGE_PRINT_NO_CRASH);
4906 buf_page_print(
4907 page, 0,
4908 BUF_PAGE_PRINT_NO_CRASH);
4909 buf_page_print(
4910 right_page, 0,
4911 BUF_PAGE_PRINT_NO_CRASH);
4912 }
4913 } else {
4914 page_t* right_father_page
4915 = btr_cur_get_page(&right_node_cur);
4916
4917 if (btr_cur_get_rec(&right_node_cur)
4918 != page_rec_get_next(
4919 page_get_infimum_rec(
4920 right_father_page))) {
4921 ret = false;
4922 fputs("InnoDB: node pointer 2 to"
4923 " the right page is wrong\n",
4924 stderr);
4925
4926 btr_validate_report1(index, level,
4927 block);
4928
4929 buf_page_print(
4930 father_page, 0,
4931 BUF_PAGE_PRINT_NO_CRASH);
4932 buf_page_print(
4933 right_father_page, 0,
4934 BUF_PAGE_PRINT_NO_CRASH);
4935 buf_page_print(
4936 page, 0,
4937 BUF_PAGE_PRINT_NO_CRASH);
4938 buf_page_print(
4939 right_page, 0,
4940 BUF_PAGE_PRINT_NO_CRASH);
4941 }
4942
4943 if (page_get_page_no(right_father_page)
4944 != btr_page_get_next(father_page, &mtr)) {
4945
4946 ret = false;
4947 fputs("InnoDB: node pointer 3 to"
4948 " the right page is wrong\n",
4949 stderr);
4950
4951 btr_validate_report1(index, level,
4952 block);
4953
4954 buf_page_print(
4955 father_page, 0,
4956 BUF_PAGE_PRINT_NO_CRASH);
4957 buf_page_print(
4958 right_father_page, 0,
4959 BUF_PAGE_PRINT_NO_CRASH);
4960 buf_page_print(
4961 page, 0,
4962 BUF_PAGE_PRINT_NO_CRASH);
4963 buf_page_print(
4964 right_page, 0,
4965 BUF_PAGE_PRINT_NO_CRASH);
4966 }
4967 }
4968 }
4969 }
4970
4971 node_ptr_fails:
4972 /* Commit the mini-transaction to release the latch on 'page'.
4973 Re-acquire the latch on right_page, which will become 'page'
4974 on the next loop. The page has already been checked. */
4975 mtr_commit(&mtr);
4976
4977 if (trx_is_interrupted(trx)) {
4978 /* On interrupt, return the current status. */
4979 } else if (right_page_no != FIL_NULL) {
4980
4981 mtr_start(&mtr);
4982
4983 block = btr_block_get(
4984 space, zip_size, right_page_no,
4985 RW_X_LATCH, index, &mtr);
4986
4987 page = buf_block_get_frame(block);
4988
4989 goto loop;
4990 }
4991
4992 mem_heap_free(heap);
4993
4994 return(ret);
4995 }
4996
4997 /**************************************************************//**
4998 Checks the consistency of an index tree.
4999 @return TRUE if ok */
5000 UNIV_INTERN
5001 bool
btr_validate_index(dict_index_t * index,const trx_t * trx)5002 btr_validate_index(
5003 /*===============*/
5004 dict_index_t* index, /*!< in: index */
5005 const trx_t* trx) /*!< in: transaction or NULL */
5006 {
5007 /* Full Text index are implemented by auxiliary tables,
5008 not the B-tree */
5009 if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) {
5010 return(true);
5011 }
5012
5013 mtr_t mtr;
5014
5015 mtr_start(&mtr);
5016
5017 mtr_x_lock(dict_index_get_lock(index), &mtr);
5018
5019 bool ok = true;
5020 page_t* root = btr_root_get(index, &mtr);
5021 ulint n = btr_page_get_level(root, &mtr);
5022
5023 for (ulint i = 0; i <= n; ++i) {
5024
5025 if (!btr_validate_level(index, trx, n - i)) {
5026 ok = false;
5027 break;
5028 }
5029 }
5030
5031 mtr_commit(&mtr);
5032
5033 return(ok);
5034 }
5035
5036 /**************************************************************//**
5037 Checks if the page in the cursor can be merged with given page.
5038 If necessary, re-organize the merge_page.
5039 @return TRUE if possible to merge. */
5040 UNIV_INTERN
5041 ibool
btr_can_merge_with_page(btr_cur_t * cursor,ulint page_no,buf_block_t ** merge_block,mtr_t * mtr)5042 btr_can_merge_with_page(
5043 /*====================*/
5044 btr_cur_t* cursor, /*!< in: cursor on the page to merge */
5045 ulint page_no, /*!< in: a sibling page */
5046 buf_block_t** merge_block, /*!< out: the merge block */
5047 mtr_t* mtr) /*!< in: mini-transaction */
5048 {
5049 dict_index_t* index;
5050 page_t* page;
5051 ulint space;
5052 ulint zip_size;
5053 ulint n_recs;
5054 ulint data_size;
5055 ulint max_ins_size_reorg;
5056 ulint max_ins_size;
5057 buf_block_t* mblock;
5058 page_t* mpage;
5059 DBUG_ENTER("btr_can_merge_with_page");
5060
5061 if (page_no == FIL_NULL) {
5062 goto error;
5063 }
5064
5065 index = btr_cur_get_index(cursor);
5066 page = btr_cur_get_page(cursor);
5067 space = dict_index_get_space(index);
5068 zip_size = dict_table_zip_size(index->table);
5069
5070 mblock = btr_block_get(space, zip_size, page_no, RW_X_LATCH, index,
5071 mtr);
5072 mpage = buf_block_get_frame(mblock);
5073
5074 n_recs = page_get_n_recs(page);
5075 data_size = page_get_data_size(page);
5076
5077 max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
5078 mpage, n_recs);
5079
5080 if (data_size > max_ins_size_reorg) {
5081 goto error;
5082 }
5083
5084 /* If compression padding tells us that merging will result in
5085 too packed up page i.e.: which is likely to cause compression
5086 failure then don't merge the pages. */
5087 if (zip_size && page_is_leaf(mpage)
5088 && (page_get_data_size(mpage) + data_size
5089 >= dict_index_zip_pad_optimal_page_size(index))) {
5090
5091 goto error;
5092 }
5093
5094
5095 max_ins_size = page_get_max_insert_size(mpage, n_recs);
5096
5097 if (data_size > max_ins_size) {
5098
5099 /* We have to reorganize mpage */
5100
5101 if (!btr_page_reorganize_block(
5102 false, page_zip_level, mblock, index, mtr)) {
5103
5104 goto error;
5105 }
5106
5107 max_ins_size = page_get_max_insert_size(mpage, n_recs);
5108
5109 ut_ad(page_validate(mpage, index));
5110 ut_ad(max_ins_size == max_ins_size_reorg);
5111
5112 if (data_size > max_ins_size) {
5113
5114 /* Add fault tolerance, though this should
5115 never happen */
5116
5117 goto error;
5118 }
5119 }
5120
5121 *merge_block = mblock;
5122 DBUG_RETURN(TRUE);
5123
5124 error:
5125 *merge_block = NULL;
5126 DBUG_RETURN(FALSE);
5127 }
5128
5129 #endif /* !UNIV_HOTBACKUP */
5130