1 /*****************************************************************************
2 
3 Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2012, Facebook Inc.
5 
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License, version 2.0,
8 as published by the Free Software Foundation.
9 
10 This program is also distributed with certain software (including
11 but not limited to OpenSSL) that is licensed under separate terms,
12 as designated in a particular file or component or in included license
13 documentation.  The authors of MySQL hereby grant you an additional
14 permission to link the program and your derivative works with the
15 separately licensed software that they have included with MySQL.
16 
17 This program is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 GNU General Public License, version 2.0, for more details.
21 
22 You should have received a copy of the GNU General Public License along with
23 this program; if not, write to the Free Software Foundation, Inc.,
24 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
25 
26 *****************************************************************************/
27 
28 /**************************************************//**
29 @file btr/btr0btr.cc
30 The B-tree
31 
32 Created 6/2/1994 Heikki Tuuri
33 *******************************************************/
34 
35 #include "btr0btr.h"
36 
37 #ifdef UNIV_NONINL
38 #include "btr0btr.ic"
39 #endif
40 
41 #include "fsp0fsp.h"
42 #include "page0page.h"
43 #include "page0zip.h"
44 
45 #ifndef UNIV_HOTBACKUP
46 #include "btr0cur.h"
47 #include "btr0sea.h"
48 #include "btr0pcur.h"
49 #include "rem0cmp.h"
50 #include "lock0lock.h"
51 #include "ibuf0ibuf.h"
52 #include "trx0trx.h"
53 #include "srv0mon.h"
54 
55 /**************************************************************//**
56 Checks if the page in the cursor can be merged with given page.
57 If necessary, re-organize the merge_page.
58 @return	TRUE if possible to merge. */
59 UNIV_INTERN
60 ibool
61 btr_can_merge_with_page(
62 /*====================*/
63 	btr_cur_t*	cursor,		/*!< in: cursor on the page to merge */
64 	ulint		page_no,	/*!< in: a sibling page */
65 	buf_block_t**	merge_block,	/*!< out: the merge block */
66 	mtr_t*		mtr);		/*!< in: mini-transaction */
67 
68 #endif /* UNIV_HOTBACKUP */
69 
70 /**************************************************************//**
71 Report that an index page is corrupted. */
72 UNIV_INTERN
73 void
btr_corruption_report(const buf_block_t * block,const dict_index_t * index)74 btr_corruption_report(
75 /*==================*/
76 	const buf_block_t*	block,	/*!< in: corrupted block */
77 	const dict_index_t*	index)	/*!< in: index tree */
78 {
79 	fprintf(stderr, "InnoDB: flag mismatch in space %u page %u"
80 		" index %s of table %s\n",
81 		(unsigned) buf_block_get_space(block),
82 		(unsigned) buf_block_get_page_no(block),
83 		index->name, index->table_name);
84 	if (block->page.zip.data) {
85 		buf_page_print(block->page.zip.data,
86 			       buf_block_get_zip_size(block),
87 			       BUF_PAGE_PRINT_NO_CRASH);
88 	}
89 	buf_page_print(buf_block_get_frame(block), 0, 0);
90 }
91 
92 #ifndef UNIV_HOTBACKUP
93 #ifdef UNIV_BLOB_DEBUG
94 # include "srv0srv.h"
95 # include "ut0rbt.h"
96 
/** TRUE when messages about index->blobs modification are enabled.
There is no run-time switch; this is meant to be toggled from a
debugger. */
static ibool btr_blob_dbg_msg;

/** Issue a message about an operation on index->blobs.
Output format: "<op> ref_page:heap_no:field_no->blob_page ctx(owner,
always_owner,del)".
@param op	operation
@param b	the entry being subjected to the operation
@param ctx	the context of the operation */
#define btr_blob_dbg_msg_issue(op, b, ctx)			\
	fprintf(stderr, op " %u:%u:%u->%u %s(%u,%u,%u)\n",	\
		(b)->ref_page_no, (b)->ref_heap_no,		\
		(b)->ref_field_no, (b)->blob_page_no, ctx,	\
		(b)->owner, (b)->always_owner, (b)->del)
109 
/** Insert to index->blobs a reference to an off-page column.
@param index	the index tree
@param b	the reference
@param ctx	context (for logging) */
UNIV_INTERN
void
btr_blob_dbg_rbt_insert(
/*====================*/
	dict_index_t*		index,	/*!< in/out: index tree */
	const btr_blob_dbg_t*	b,	/*!< in: the reference */
	const char*		ctx)	/*!< in: context (for logging) */
{
	/* Log outside the mutex; the message is purely diagnostic. */
	if (btr_blob_dbg_msg) {
		btr_blob_dbg_msg_issue("insert", b, ctx);
	}
	/* index->blobs_mutex protects the index->blobs red-black tree. */
	mutex_enter(&index->blobs_mutex);
	rbt_insert(index->blobs, b, b);
	mutex_exit(&index->blobs_mutex);
}
129 
/** Remove from index->blobs a reference to an off-page column.
@param index	the index tree
@param b	the reference
@param ctx	context (for logging) */
UNIV_INTERN
void
btr_blob_dbg_rbt_delete(
/*====================*/
	dict_index_t*		index,	/*!< in/out: index tree */
	const btr_blob_dbg_t*	b,	/*!< in: the reference */
	const char*		ctx)	/*!< in: context (for logging) */
{
	/* Log outside the mutex; the message is purely diagnostic. */
	if (btr_blob_dbg_msg) {
		btr_blob_dbg_msg_issue("delete", b, ctx);
	}
	mutex_enter(&index->blobs_mutex);
	/* The entry must exist; a failed delete would mean that
	index->blobs got out of sync with the B-tree. */
	ut_a(rbt_delete(index->blobs, b));
	mutex_exit(&index->blobs_mutex);
}
149 
150 /**************************************************************//**
151 Comparator for items (btr_blob_dbg_t) in index->blobs.
152 The key in index->blobs is (ref_page_no, ref_heap_no, ref_field_no).
153 @return negative, 0 or positive if *a<*b, *a=*b, *a>*b */
154 static
155 int
btr_blob_dbg_cmp(const void * a,const void * b)156 btr_blob_dbg_cmp(
157 /*=============*/
158 	const void*	a,	/*!< in: first btr_blob_dbg_t to compare */
159 	const void*	b)	/*!< in: second btr_blob_dbg_t to compare */
160 {
161 	const btr_blob_dbg_t*	aa = static_cast<const btr_blob_dbg_t*>(a);
162 	const btr_blob_dbg_t*	bb = static_cast<const btr_blob_dbg_t*>(b);
163 
164 	ut_ad(aa != NULL);
165 	ut_ad(bb != NULL);
166 
167 	if (aa->ref_page_no != bb->ref_page_no) {
168 		return(aa->ref_page_no < bb->ref_page_no ? -1 : 1);
169 	}
170 	if (aa->ref_heap_no != bb->ref_heap_no) {
171 		return(aa->ref_heap_no < bb->ref_heap_no ? -1 : 1);
172 	}
173 	if (aa->ref_field_no != bb->ref_field_no) {
174 		return(aa->ref_field_no < bb->ref_field_no ? -1 : 1);
175 	}
176 	return(0);
177 }
178 
179 /**************************************************************//**
180 Add a reference to an off-page column to the index->blobs map. */
181 UNIV_INTERN
182 void
btr_blob_dbg_add_blob(const rec_t * rec,ulint field_no,ulint page_no,dict_index_t * index,const char * ctx)183 btr_blob_dbg_add_blob(
184 /*==================*/
185 	const rec_t*	rec,		/*!< in: clustered index record */
186 	ulint		field_no,	/*!< in: off-page column number */
187 	ulint		page_no,	/*!< in: start page of the column */
188 	dict_index_t*	index,		/*!< in/out: index tree */
189 	const char*	ctx)		/*!< in: context (for logging) */
190 {
191 	btr_blob_dbg_t	b;
192 	const page_t*	page	= page_align(rec);
193 
194 	ut_a(index->blobs);
195 
196 	b.blob_page_no = page_no;
197 	b.ref_page_no = page_get_page_no(page);
198 	b.ref_heap_no = page_rec_get_heap_no(rec);
199 	b.ref_field_no = field_no;
200 	ut_a(b.ref_field_no >= index->n_uniq);
201 	b.always_owner = b.owner = TRUE;
202 	b.del = FALSE;
203 	ut_a(!rec_get_deleted_flag(rec, page_is_comp(page)));
204 	btr_blob_dbg_rbt_insert(index, &b, ctx);
205 }
206 
207 /**************************************************************//**
208 Add to index->blobs any references to off-page columns from a record.
209 @return number of references added */
210 UNIV_INTERN
211 ulint
btr_blob_dbg_add_rec(const rec_t * rec,dict_index_t * index,const ulint * offsets,const char * ctx)212 btr_blob_dbg_add_rec(
213 /*=================*/
214 	const rec_t*	rec,	/*!< in: record */
215 	dict_index_t*	index,	/*!< in/out: index */
216 	const ulint*	offsets,/*!< in: offsets */
217 	const char*	ctx)	/*!< in: context (for logging) */
218 {
219 	ulint		count	= 0;
220 	ulint		i;
221 	btr_blob_dbg_t	b;
222 	ibool		del;
223 
224 	ut_ad(rec_offs_validate(rec, index, offsets));
225 
226 	if (!rec_offs_any_extern(offsets)) {
227 		return(0);
228 	}
229 
230 	b.ref_page_no = page_get_page_no(page_align(rec));
231 	b.ref_heap_no = page_rec_get_heap_no(rec);
232 	del = (rec_get_deleted_flag(rec, rec_offs_comp(offsets)) != 0);
233 
234 	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
235 		if (rec_offs_nth_extern(offsets, i)) {
236 			ulint		len;
237 			const byte*	field_ref = rec_get_nth_field(
238 				rec, offsets, i, &len);
239 
240 			ut_a(len != UNIV_SQL_NULL);
241 			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
242 			field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
243 
244 			if (!memcmp(field_ref, field_ref_zero,
245 				    BTR_EXTERN_FIELD_REF_SIZE)) {
246 				/* the column has not been stored yet */
247 				continue;
248 			}
249 
250 			b.ref_field_no = i;
251 			b.blob_page_no = mach_read_from_4(
252 				field_ref + BTR_EXTERN_PAGE_NO);
253 			ut_a(b.ref_field_no >= index->n_uniq);
254 			b.always_owner = b.owner
255 				= !(field_ref[BTR_EXTERN_LEN]
256 				    & BTR_EXTERN_OWNER_FLAG);
257 			b.del = del;
258 
259 			btr_blob_dbg_rbt_insert(index, &b, ctx);
260 			count++;
261 		}
262 	}
263 
264 	return(count);
265 }
266 
267 /**************************************************************//**
268 Display the references to off-page columns.
269 This function is to be called from a debugger,
270 for example when a breakpoint on ut_dbg_assertion_failed is hit. */
271 UNIV_INTERN
272 void
btr_blob_dbg_print(const dict_index_t * index)273 btr_blob_dbg_print(
274 /*===============*/
275 	const dict_index_t*	index)	/*!< in: index tree */
276 {
277 	const ib_rbt_node_t*	node;
278 
279 	if (!index->blobs) {
280 		return;
281 	}
282 
283 	/* We intentionally do not acquire index->blobs_mutex here.
284 	This function is to be called from a debugger, and the caller
285 	should make sure that the index->blobs_mutex is held. */
286 
287 	for (node = rbt_first(index->blobs);
288 	     node != NULL; node = rbt_next(index->blobs, node)) {
289 		const btr_blob_dbg_t*	b
290 			= rbt_value(btr_blob_dbg_t, node);
291 		fprintf(stderr, "%u:%u:%u->%u%s%s%s\n",
292 			b->ref_page_no, b->ref_heap_no, b->ref_field_no,
293 			b->blob_page_no,
294 			b->owner ? "" : "(disowned)",
295 			b->always_owner ? "" : "(has disowned)",
296 			b->del ? "(deleted)" : "");
297 	}
298 }
299 
300 /**************************************************************//**
301 Remove from index->blobs any references to off-page columns from a record.
302 @return number of references removed */
303 UNIV_INTERN
304 ulint
btr_blob_dbg_remove_rec(const rec_t * rec,dict_index_t * index,const ulint * offsets,const char * ctx)305 btr_blob_dbg_remove_rec(
306 /*====================*/
307 	const rec_t*	rec,	/*!< in: record */
308 	dict_index_t*	index,	/*!< in/out: index */
309 	const ulint*	offsets,/*!< in: offsets */
310 	const char*	ctx)	/*!< in: context (for logging) */
311 {
312 	ulint		i;
313 	ulint		count	= 0;
314 	btr_blob_dbg_t	b;
315 
316 	ut_ad(rec_offs_validate(rec, index, offsets));
317 
318 	if (!rec_offs_any_extern(offsets)) {
319 		return(0);
320 	}
321 
322 	b.ref_page_no = page_get_page_no(page_align(rec));
323 	b.ref_heap_no = page_rec_get_heap_no(rec);
324 
325 	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
326 		if (rec_offs_nth_extern(offsets, i)) {
327 			ulint		len;
328 			const byte*	field_ref = rec_get_nth_field(
329 				rec, offsets, i, &len);
330 
331 			ut_a(len != UNIV_SQL_NULL);
332 			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
333 			field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
334 
335 			b.ref_field_no = i;
336 			b.blob_page_no = mach_read_from_4(
337 				field_ref + BTR_EXTERN_PAGE_NO);
338 
339 			switch (b.blob_page_no) {
340 			case 0:
341 				/* The column has not been stored yet.
342 				The BLOB pointer must be all zero.
343 				There cannot be a BLOB starting at
344 				page 0, because page 0 is reserved for
345 				the tablespace header. */
346 				ut_a(!memcmp(field_ref, field_ref_zero,
347 					     BTR_EXTERN_FIELD_REF_SIZE));
348 				/* fall through */
349 			case FIL_NULL:
350 				/* the column has been freed already */
351 				continue;
352 			}
353 
354 			btr_blob_dbg_rbt_delete(index, &b, ctx);
355 			count++;
356 		}
357 	}
358 
359 	return(count);
360 }
361 
362 /**************************************************************//**
363 Check that there are no references to off-page columns from or to
364 the given page. Invoked when freeing or clearing a page.
365 @return TRUE when no orphan references exist */
366 UNIV_INTERN
367 ibool
btr_blob_dbg_is_empty(dict_index_t * index,ulint page_no)368 btr_blob_dbg_is_empty(
369 /*==================*/
370 	dict_index_t*	index,		/*!< in: index */
371 	ulint		page_no)	/*!< in: page number */
372 {
373 	const ib_rbt_node_t*	node;
374 	ibool			success	= TRUE;
375 
376 	if (!index->blobs) {
377 		return(success);
378 	}
379 
380 	mutex_enter(&index->blobs_mutex);
381 
382 	for (node = rbt_first(index->blobs);
383 	     node != NULL; node = rbt_next(index->blobs, node)) {
384 		const btr_blob_dbg_t*	b
385 			= rbt_value(btr_blob_dbg_t, node);
386 
387 		if (b->ref_page_no != page_no && b->blob_page_no != page_no) {
388 			continue;
389 		}
390 
391 		fprintf(stderr,
392 			"InnoDB: orphan BLOB ref%s%s%s %u:%u:%u->%u\n",
393 			b->owner ? "" : "(disowned)",
394 			b->always_owner ? "" : "(has disowned)",
395 			b->del ? "(deleted)" : "",
396 			b->ref_page_no, b->ref_heap_no, b->ref_field_no,
397 			b->blob_page_no);
398 
399 		if (b->blob_page_no != page_no || b->owner || !b->del) {
400 			success = FALSE;
401 		}
402 	}
403 
404 	mutex_exit(&index->blobs_mutex);
405 	return(success);
406 }
407 
408 /**************************************************************//**
409 Count and process all references to off-page columns on a page.
410 @return number of references processed */
411 UNIV_INTERN
412 ulint
btr_blob_dbg_op(const page_t * page,const rec_t * rec,dict_index_t * index,const char * ctx,const btr_blob_dbg_op_f op)413 btr_blob_dbg_op(
414 /*============*/
415 	const page_t*		page,	/*!< in: B-tree leaf page */
416 	const rec_t*		rec,	/*!< in: record to start from
417 					(NULL to process the whole page) */
418 	dict_index_t*		index,	/*!< in/out: index */
419 	const char*		ctx,	/*!< in: context (for logging) */
420 	const btr_blob_dbg_op_f	op)	/*!< in: operation on records */
421 {
422 	ulint		count	= 0;
423 	mem_heap_t*	heap	= NULL;
424 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
425 	ulint*		offsets	= offsets_;
426 	rec_offs_init(offsets_);
427 
428 	ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX);
429 	ut_a(!rec || page_align(rec) == page);
430 
431 	if (!index->blobs || !page_is_leaf(page)
432 	    || !dict_index_is_clust(index)) {
433 		return(0);
434 	}
435 
436 	if (rec == NULL) {
437 		rec = page_get_infimum_rec(page);
438 	}
439 
440 	do {
441 		offsets = rec_get_offsets(rec, index, offsets,
442 					  ULINT_UNDEFINED, &heap);
443 		count += op(rec, index, offsets, ctx);
444 		rec = page_rec_get_next_const(rec);
445 	} while (!page_rec_is_supremum(rec));
446 
447 	if (heap) {
448 		mem_heap_free(heap);
449 	}
450 
451 	return(count);
452 }
453 
454 /**************************************************************//**
455 Count and add to index->blobs any references to off-page columns
456 from records on a page.
457 @return number of references added */
458 UNIV_INTERN
459 ulint
btr_blob_dbg_add(const page_t * page,dict_index_t * index,const char * ctx)460 btr_blob_dbg_add(
461 /*=============*/
462 	const page_t*	page,	/*!< in: rewritten page */
463 	dict_index_t*	index,	/*!< in/out: index */
464 	const char*	ctx)	/*!< in: context (for logging) */
465 {
466 	btr_blob_dbg_assert_empty(index, page_get_page_no(page));
467 
468 	return(btr_blob_dbg_op(page, NULL, index, ctx, btr_blob_dbg_add_rec));
469 }
470 
471 /**************************************************************//**
472 Count and remove from index->blobs any references to off-page columns
473 from records on a page.
474 Used when reorganizing a page, before copying the records.
475 @return number of references removed */
476 UNIV_INTERN
477 ulint
btr_blob_dbg_remove(const page_t * page,dict_index_t * index,const char * ctx)478 btr_blob_dbg_remove(
479 /*================*/
480 	const page_t*	page,	/*!< in: b-tree page */
481 	dict_index_t*	index,	/*!< in/out: index */
482 	const char*	ctx)	/*!< in: context (for logging) */
483 {
484 	ulint	count;
485 
486 	count = btr_blob_dbg_op(page, NULL, index, ctx,
487 				btr_blob_dbg_remove_rec);
488 
489 	/* Check that no references exist. */
490 	btr_blob_dbg_assert_empty(index, page_get_page_no(page));
491 
492 	return(count);
493 }
494 
495 /**************************************************************//**
496 Restore in index->blobs any references to off-page columns
497 Used when page reorganize fails due to compressed page overflow. */
498 UNIV_INTERN
499 void
btr_blob_dbg_restore(const page_t * npage,const page_t * page,dict_index_t * index,const char * ctx)500 btr_blob_dbg_restore(
501 /*=================*/
502 	const page_t*	npage,	/*!< in: page that failed to compress  */
503 	const page_t*	page,	/*!< in: copy of original page */
504 	dict_index_t*	index,	/*!< in/out: index */
505 	const char*	ctx)	/*!< in: context (for logging) */
506 {
507 	ulint	removed;
508 	ulint	added;
509 
510 	ut_a(page_get_page_no(npage) == page_get_page_no(page));
511 	ut_a(page_get_space_id(npage) == page_get_space_id(page));
512 
513 	removed = btr_blob_dbg_remove(npage, index, ctx);
514 	added = btr_blob_dbg_add(page, index, ctx);
515 	ut_a(added == removed);
516 }
517 
518 /**************************************************************//**
519 Modify the 'deleted' flag of a record. */
520 UNIV_INTERN
521 void
btr_blob_dbg_set_deleted_flag(const rec_t * rec,dict_index_t * index,const ulint * offsets,ibool del)522 btr_blob_dbg_set_deleted_flag(
523 /*==========================*/
524 	const rec_t*		rec,	/*!< in: record */
525 	dict_index_t*		index,	/*!< in/out: index */
526 	const ulint*		offsets,/*!< in: rec_get_offs(rec, index) */
527 	ibool			del)	/*!< in: TRUE=deleted, FALSE=exists */
528 {
529 	const ib_rbt_node_t*	node;
530 	btr_blob_dbg_t		b;
531 	btr_blob_dbg_t*		c;
532 	ulint			i;
533 
534 	ut_ad(rec_offs_validate(rec, index, offsets));
535 	ut_a(dict_index_is_clust(index));
536 	ut_a(del == !!del);/* must be FALSE==0 or TRUE==1 */
537 
538 	if (!rec_offs_any_extern(offsets) || !index->blobs) {
539 
540 		return;
541 	}
542 
543 	b.ref_page_no = page_get_page_no(page_align(rec));
544 	b.ref_heap_no = page_rec_get_heap_no(rec);
545 
546 	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
547 		if (rec_offs_nth_extern(offsets, i)) {
548 			ulint		len;
549 			const byte*	field_ref = rec_get_nth_field(
550 				rec, offsets, i, &len);
551 
552 			ut_a(len != UNIV_SQL_NULL);
553 			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
554 			field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
555 
556 			b.ref_field_no = i;
557 			b.blob_page_no = mach_read_from_4(
558 				field_ref + BTR_EXTERN_PAGE_NO);
559 
560 			switch (b.blob_page_no) {
561 			case 0:
562 				ut_a(memcmp(field_ref, field_ref_zero,
563 					    BTR_EXTERN_FIELD_REF_SIZE));
564 				/* page number 0 is for the
565 				page allocation bitmap */
566 			case FIL_NULL:
567 				/* the column has been freed already */
568 				ut_error;
569 			}
570 
571 			mutex_enter(&index->blobs_mutex);
572 			node = rbt_lookup(index->blobs, &b);
573 			ut_a(node);
574 
575 			c = rbt_value(btr_blob_dbg_t, node);
576 			/* The flag should be modified. */
577 			c->del = del;
578 			if (btr_blob_dbg_msg) {
579 				b = *c;
580 				mutex_exit(&index->blobs_mutex);
581 				btr_blob_dbg_msg_issue("del_mk", &b, "");
582 			} else {
583 				mutex_exit(&index->blobs_mutex);
584 			}
585 		}
586 	}
587 }
588 
589 /**************************************************************//**
590 Change the ownership of an off-page column. */
591 UNIV_INTERN
592 void
btr_blob_dbg_owner(const rec_t * rec,dict_index_t * index,const ulint * offsets,ulint i,ibool own)593 btr_blob_dbg_owner(
594 /*===============*/
595 	const rec_t*		rec,	/*!< in: record */
596 	dict_index_t*		index,	/*!< in/out: index */
597 	const ulint*		offsets,/*!< in: rec_get_offs(rec, index) */
598 	ulint			i,	/*!< in: ith field in rec */
599 	ibool			own)	/*!< in: TRUE=owned, FALSE=disowned */
600 {
601 	const ib_rbt_node_t*	node;
602 	btr_blob_dbg_t		b;
603 	const byte*		field_ref;
604 	ulint			len;
605 
606 	ut_ad(rec_offs_validate(rec, index, offsets));
607 	ut_a(rec_offs_nth_extern(offsets, i));
608 
609 	field_ref = rec_get_nth_field(rec, offsets, i, &len);
610 	ut_a(len != UNIV_SQL_NULL);
611 	ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
612 	field_ref += len - BTR_EXTERN_FIELD_REF_SIZE;
613 
614 	b.ref_page_no = page_get_page_no(page_align(rec));
615 	b.ref_heap_no = page_rec_get_heap_no(rec);
616 	b.ref_field_no = i;
617 	b.owner = !(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG);
618 	b.blob_page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
619 
620 	ut_a(b.owner == own);
621 
622 	mutex_enter(&index->blobs_mutex);
623 	node = rbt_lookup(index->blobs, &b);
624 	/* row_ins_clust_index_entry_by_modify() invokes
625 	btr_cur_unmark_extern_fields() also for the newly inserted
626 	references, which are all zero bytes until the columns are stored.
627 	The node lookup must fail if and only if that is the case. */
628 	ut_a(!memcmp(field_ref, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE)
629 	     == !node);
630 
631 	if (node) {
632 		btr_blob_dbg_t*	c = rbt_value(btr_blob_dbg_t, node);
633 		/* Some code sets ownership from TRUE to TRUE.
634 		We do not allow changing ownership from FALSE to FALSE. */
635 		ut_a(own || c->owner);
636 
637 		c->owner = own;
638 		if (!own) {
639 			c->always_owner = FALSE;
640 		}
641 	}
642 
643 	mutex_exit(&index->blobs_mutex);
644 }
645 #endif /* UNIV_BLOB_DEBUG */
646 
647 /*
648 Latching strategy of the InnoDB B-tree
649 --------------------------------------
650 A tree latch protects all non-leaf nodes of the tree. Each node of a tree
651 also has a latch of its own.
652 
653 A B-tree operation normally first acquires an S-latch on the tree. It
654 searches down the tree and releases the tree latch when it has the
655 leaf node latch. To save CPU time we do not acquire any latch on
656 non-leaf nodes of the tree during a search, those pages are only bufferfixed.
657 
658 If an operation needs to restructure the tree, it acquires an X-latch on
659 the tree before searching to a leaf node. If it needs, for example, to
660 split a leaf,
661 (1) InnoDB decides the split point in the leaf,
662 (2) allocates a new page,
663 (3) inserts the appropriate node pointer to the first non-leaf level,
664 (4) releases the tree X-latch,
(5) and then moves records from the leaf to the newly allocated page.
666 
667 Node pointers
668 -------------
669 Leaf pages of a B-tree contain the index records stored in the
670 tree. On levels n > 0 we store 'node pointers' to pages on level
671 n - 1. For each page there is exactly one node pointer stored:
thus our tree is an ordinary B-tree, not a B-link tree.
673 
674 A node pointer contains a prefix P of an index record. The prefix
675 is long enough so that it determines an index record uniquely.
676 The file page number of the child page is added as the last
677 field. To the child page we can store node pointers or index records
678 which are >= P in the alphabetical order, but < P1 if there is
679 a next node pointer on the level, and P1 is its prefix.
680 
681 If a node pointer with a prefix P points to a non-leaf child,
682 then the leftmost record in the child must have the same
683 prefix P. If it points to a leaf node, the child is not required
684 to contain any record with a prefix equal to P. The leaf case
685 is decided this way to allow arbitrary deletions in a leaf node
686 without touching upper levels of the tree.
687 
688 We have predefined a special minimum record which we
689 define as the smallest record in any alphabetical order.
690 A minimum record is denoted by setting a bit in the record
691 header. A minimum record acts as the prefix of a node pointer
692 which points to a leftmost node on any level of the tree.
693 
694 File page allocation
695 --------------------
696 In the root node of a B-tree there are two file segment headers.
697 The leaf pages of a tree are allocated from one file segment, to
698 make them consecutive on disk if possible. From the other file segment
699 we allocate pages for the non-leaf levels of the tree.
700 */
701 
#ifdef UNIV_BTR_DEBUG
/**************************************************************//**
Checks a file segment header within a B-tree root page.
@return	TRUE if valid */
static
ibool
btr_root_fseg_validate(
/*===================*/
	const fseg_header_t*	seg_header,	/*!< in: segment header */
	ulint			space)		/*!< in: tablespace identifier */
{
	const ulint	hdr_offset = mach_read_from_2(
		seg_header + FSEG_HDR_OFFSET);

	/* The header must name this tablespace, and the segment inode
	offset must lie within the usable payload area of a page. */
	ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space);
	ut_a(hdr_offset >= FIL_PAGE_DATA);
	ut_a(hdr_offset <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END);
	return(TRUE);
}
#endif /* UNIV_BTR_DEBUG */
721 
722 /**************************************************************//**
723 Gets the root node of a tree and x- or s-latches it.
724 @return	root page, x- or s-latched */
725 static
726 buf_block_t*
btr_root_block_get(const dict_index_t * index,ulint mode,mtr_t * mtr)727 btr_root_block_get(
728 /*===============*/
729 	const dict_index_t*	index,	/*!< in: index tree */
730 	ulint			mode,	/*!< in: either RW_S_LATCH
731 					or RW_X_LATCH */
732 	mtr_t*			mtr)	/*!< in: mtr */
733 {
734 	ulint		space;
735 	ulint		zip_size;
736 	ulint		root_page_no;
737 	buf_block_t*	block;
738 
739 	space = dict_index_get_space(index);
740 	zip_size = dict_table_zip_size(index->table);
741 	root_page_no = dict_index_get_page(index);
742 
743 	block = btr_block_get(space, zip_size, root_page_no, mode, index, mtr);
744 	btr_assert_not_corrupted(block, index);
745 #ifdef UNIV_BTR_DEBUG
746 	if (!dict_index_is_ibuf(index)) {
747 		const page_t*	root = buf_block_get_frame(block);
748 
749 		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
750 					    + root, space));
751 		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
752 					    + root, space));
753 	}
754 #endif /* UNIV_BTR_DEBUG */
755 
756 	return(block);
757 }
758 
759 /**************************************************************//**
760 Gets the root node of a tree and x-latches it.
761 @return	root page, x-latched */
762 UNIV_INTERN
763 page_t*
btr_root_get(const dict_index_t * index,mtr_t * mtr)764 btr_root_get(
765 /*=========*/
766 	const dict_index_t*	index,	/*!< in: index tree */
767 	mtr_t*			mtr)	/*!< in: mtr */
768 {
769 	return(buf_block_get_frame(btr_root_block_get(index, RW_X_LATCH,
770 						      mtr)));
771 }
772 
773 /**************************************************************//**
774 Gets the height of the B-tree (the level of the root, when the leaf
775 level is assumed to be 0). The caller must hold an S or X latch on
776 the index.
777 @return	tree height (level of the root) */
778 UNIV_INTERN
779 ulint
btr_height_get(dict_index_t * index,mtr_t * mtr)780 btr_height_get(
781 /*===========*/
782 	dict_index_t*	index,	/*!< in: index tree */
783 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
784 {
785 	ulint		height;
786 	buf_block_t*	root_block;
787 
788 	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
789 				MTR_MEMO_S_LOCK)
790 	      || mtr_memo_contains(mtr, dict_index_get_lock(index),
791 				MTR_MEMO_X_LOCK));
792 
793         /* S latches the page */
794         root_block = btr_root_block_get(index, RW_S_LATCH, mtr);
795 
796         height = btr_page_get_level(buf_block_get_frame(root_block), mtr);
797 
798         /* Release the S latch on the root page. */
799         mtr_memo_release(mtr, root_block, MTR_MEMO_PAGE_S_FIX);
800 #ifdef UNIV_SYNC_DEBUG
801         sync_thread_reset_level(&root_block->lock);
802 #endif /* UNIV_SYNC_DEBUG */
803 
804 	return(height);
805 }
806 
807 /**************************************************************//**
808 Checks a file segment header within a B-tree root page and updates
809 the segment header space id.
810 @return	TRUE if valid */
811 static
812 bool
btr_root_fseg_adjust_on_import(fseg_header_t * seg_header,page_zip_des_t * page_zip,ulint space,mtr_t * mtr)813 btr_root_fseg_adjust_on_import(
814 /*===========================*/
815 	fseg_header_t*	seg_header,	/*!< in/out: segment header */
816 	page_zip_des_t*	page_zip,	/*!< in/out: compressed page,
817 					or NULL */
818 	ulint		space,		/*!< in: tablespace identifier */
819 	mtr_t*		mtr)		/*!< in/out: mini-transaction */
820 {
821 	ulint	offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);
822 
823 	if (offset < FIL_PAGE_DATA
824 	    || offset > UNIV_PAGE_SIZE - FIL_PAGE_DATA_END) {
825 
826 		return(FALSE);
827 
828 	} else if (page_zip) {
829 		mach_write_to_4(seg_header + FSEG_HDR_SPACE, space);
830 		page_zip_write_header(page_zip, seg_header + FSEG_HDR_SPACE,
831 				      4, mtr);
832 	} else {
833 		mlog_write_ulint(seg_header + FSEG_HDR_SPACE,
834 				 space, MLOG_4BYTES, mtr);
835 	}
836 
837 	return(TRUE);
838 }
839 
840 /**************************************************************//**
841 Checks and adjusts the root node of a tree during IMPORT TABLESPACE.
842 @return error code, or DB_SUCCESS */
843 UNIV_INTERN
844 dberr_t
btr_root_adjust_on_import(const dict_index_t * index)845 btr_root_adjust_on_import(
846 /*======================*/
847 	const dict_index_t*	index)	/*!< in: index tree */
848 {
849 	dberr_t		err;
850 	mtr_t		mtr;
851 	page_t*		page;
852 	buf_block_t*	block;
853 	page_zip_des_t*	page_zip;
854 	dict_table_t*	table		= index->table;
855 	ulint		space_id	= dict_index_get_space(index);
856 	ulint		zip_size	= dict_table_zip_size(table);
857 	ulint		root_page_no	= dict_index_get_page(index);
858 
859 	mtr_start(&mtr);
860 
861 	mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
862 
863 	DBUG_EXECUTE_IF("ib_import_trigger_corruption_3",
864 			return(DB_CORRUPTION););
865 
866 	block = btr_block_get(
867 		space_id, zip_size, root_page_no, RW_X_LATCH, index, &mtr);
868 
869 	page = buf_block_get_frame(block);
870 	page_zip = buf_block_get_page_zip(block);
871 
872 	/* Check that this is a B-tree page and both the PREV and NEXT
873 	pointers are FIL_NULL, because the root page does not have any
874 	siblings. */
875 	if (fil_page_get_type(page) != FIL_PAGE_INDEX
876 	    || fil_page_get_prev(page) != FIL_NULL
877 	    || fil_page_get_next(page) != FIL_NULL) {
878 
879 		err = DB_CORRUPTION;
880 
881 	} else if (dict_index_is_clust(index)) {
882 		bool	page_is_compact_format;
883 
884 		page_is_compact_format = page_is_comp(page) > 0;
885 
886 		/* Check if the page format and table format agree. */
887 		if (page_is_compact_format != dict_table_is_comp(table)) {
888 			err = DB_CORRUPTION;
889 		} else {
890 
891 			/* Check that the table flags and the tablespace
892 			flags match. */
893 			ulint	flags = fil_space_get_flags(table->space);
894 
895 			if (flags
896 			    && flags != dict_tf_to_fsp_flags(table->flags)) {
897 
898 				err = DB_CORRUPTION;
899 			} else {
900 				err = DB_SUCCESS;
901 			}
902 		}
903 	} else {
904 		err = DB_SUCCESS;
905 	}
906 
907 	/* Check and adjust the file segment headers, if all OK so far. */
908 	if (err == DB_SUCCESS
909 	    && (!btr_root_fseg_adjust_on_import(
910 			FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
911 			+ page, page_zip, space_id, &mtr)
912 		|| !btr_root_fseg_adjust_on_import(
913 			FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
914 			+ page, page_zip, space_id, &mtr))) {
915 
916 		err = DB_CORRUPTION;
917 	}
918 
919 	mtr_commit(&mtr);
920 
921 	return(err);
922 }
923 
/*************************************************************//**
Gets pointer to the previous user record in the tree. It is assumed that
the caller has appropriate latches on the page and its neighbor.
@return	previous user record, NULL if there is none */
UNIV_INTERN
rec_t*
btr_get_prev_user_rec(
/*==================*/
	rec_t*	rec,	/*!< in: record on leaf level */
	mtr_t*	mtr)	/*!< in: mtr holding a latch on the page, and if
			needed, also to the previous page */
{
	page_t*	page;
	page_t*	prev_page;
	ulint	prev_page_no;

	/* Fast path: if rec is not the page infimum, the preceding
	record on the same page may already be a user record. */
	if (!page_rec_is_infimum(rec)) {

		rec_t*	prev_rec = page_rec_get_prev(rec);

		if (!page_rec_is_infimum(prev_rec)) {

			return(prev_rec);
		}
	}

	/* Otherwise the previous user record, if any, is the last user
	record on the previous page in the doubly linked page list. */
	page = page_align(rec);
	prev_page_no = btr_page_get_prev(page, mtr);

	if (prev_page_no != FIL_NULL) {

		ulint		space;
		ulint		zip_size;
		buf_block_t*	prev_block;

		space = page_get_space_id(page);
		zip_size = fil_space_get_zip_size(space);

		/* Fetch with no latch: the caller is required to hold a
		latch on the sibling already (asserted just below). */
		prev_block = buf_page_get_with_no_latch(space, zip_size,
							prev_page_no, mtr);
		prev_page = buf_block_get_frame(prev_block);
		/* The caller must already have a latch to the brother */
		ut_ad(mtr_memo_contains(mtr, prev_block,
					MTR_MEMO_PAGE_S_FIX)
		      || mtr_memo_contains(mtr, prev_block,
					   MTR_MEMO_PAGE_X_FIX));
#ifdef UNIV_BTR_DEBUG
		/* The sibling must use the same record format, and its
		next-page pointer must point back at this page. */
		ut_a(page_is_comp(prev_page) == page_is_comp(page));
		ut_a(btr_page_get_next(prev_page, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		/* Return the last user record of the previous page,
		i.e. the record just before its supremum. */
		return(page_rec_get_prev(page_get_supremum_rec(prev_page)));
	}

	return(NULL);
}
981 
/*************************************************************//**
Gets pointer to the next user record in the tree. It is assumed that the
caller has appropriate latches on the page and its neighbor.
@return	next user record, NULL if there is none */
UNIV_INTERN
rec_t*
btr_get_next_user_rec(
/*==================*/
	rec_t*	rec,	/*!< in: record on leaf level */
	mtr_t*	mtr)	/*!< in: mtr holding a latch on the page, and if
			needed, also to the next page */
{
	page_t*	page;
	page_t*	next_page;
	ulint	next_page_no;

	/* Fast path: if rec is not the page supremum, the following
	record on the same page may already be a user record. */
	if (!page_rec_is_supremum(rec)) {

		rec_t*	next_rec = page_rec_get_next(rec);

		if (!page_rec_is_supremum(next_rec)) {

			return(next_rec);
		}
	}

	/* Otherwise the next user record, if any, is the first user
	record on the next page in the doubly linked page list. */
	page = page_align(rec);
	next_page_no = btr_page_get_next(page, mtr);

	if (next_page_no != FIL_NULL) {
		ulint		space;
		ulint		zip_size;
		buf_block_t*	next_block;

		space = page_get_space_id(page);
		zip_size = fil_space_get_zip_size(space);

		/* Fetch with no latch: the caller is required to hold a
		latch on the sibling already (asserted just below). */
		next_block = buf_page_get_with_no_latch(space, zip_size,
							next_page_no, mtr);
		next_page = buf_block_get_frame(next_block);
		/* The caller must already have a latch to the brother */
		ut_ad(mtr_memo_contains(mtr, next_block, MTR_MEMO_PAGE_S_FIX)
		      || mtr_memo_contains(mtr, next_block,
					   MTR_MEMO_PAGE_X_FIX));
#ifdef UNIV_BTR_DEBUG
		/* The sibling must use the same record format, and its
		prev-page pointer must point back at this page. */
		ut_a(page_is_comp(next_page) == page_is_comp(page));
		ut_a(btr_page_get_prev(next_page, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		/* Return the first user record of the next page,
		i.e. the record just after its infimum. */
		return(page_rec_get_next(page_get_infimum_rec(next_page)));
	}

	return(NULL);
}
1037 
1038 /**************************************************************//**
1039 Creates a new index page (not the root, and also not
1040 used in page reorganization).  @see btr_page_empty(). */
1041 static
1042 void
btr_page_create(buf_block_t * block,page_zip_des_t * page_zip,dict_index_t * index,ulint level,mtr_t * mtr)1043 btr_page_create(
1044 /*============*/
1045 	buf_block_t*	block,	/*!< in/out: page to be created */
1046 	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
1047 	dict_index_t*	index,	/*!< in: index */
1048 	ulint		level,	/*!< in: the B-tree level of the page */
1049 	mtr_t*		mtr)	/*!< in: mtr */
1050 {
1051 	page_t*		page = buf_block_get_frame(block);
1052 
1053 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1054 	btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));
1055 
1056 	if (page_zip) {
1057 		page_create_zip(block, index, level, 0, mtr);
1058 	} else {
1059 		page_create(block, mtr, dict_table_is_comp(index->table));
1060 		/* Set the level of the new index page */
1061 		btr_page_set_level(page, NULL, level, mtr);
1062 	}
1063 
1064 	block->check_index_page_at_flush = TRUE;
1065 
1066 	btr_page_set_index_id(page, page_zip, index->id, mtr);
1067 }
1068 
/**************************************************************//**
Allocates a new file page to be used in an ibuf tree. Takes the page from
the free list of the tree, which must contain pages!
@return	new allocated block, x-latched */
static
buf_block_t*
btr_page_alloc_for_ibuf(
/*====================*/
	dict_index_t*	index,	/*!< in: index tree */
	mtr_t*		mtr)	/*!< in: mtr */
{
	fil_addr_t	node_addr;
	page_t*		root;
	page_t*		new_page;
	buf_block_t*	new_block;

	root = btr_root_get(index, mtr);

	/* Take the first page of the ibuf free list anchored in the
	root page header; the caller guarantees the list is non-empty. */
	node_addr = flst_get_first(root + PAGE_HEADER
				   + PAGE_BTR_IBUF_FREE_LIST, mtr);
	ut_a(node_addr.page != FIL_NULL);

	new_block = buf_page_get(dict_index_get_space(index),
				 dict_table_zip_size(index->table),
				 node_addr.page, RW_X_LATCH, mtr);
	new_page = buf_block_get_frame(new_block);
	buf_block_dbg_add_level(new_block, SYNC_IBUF_TREE_NODE_NEW);

	/* Detach the page from the free list; the list node lives in
	the allocated page's own header. */
	flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
		    new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
		    mtr);
	ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
			    mtr));

	return(new_block);
}
1105 
1106 /**************************************************************//**
1107 Allocates a new file page to be used in an index tree. NOTE: we assume
1108 that the caller has made the reservation for free extents!
1109 @retval NULL if no page could be allocated
1110 @retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
1111 (init_mtr == mtr, or the page was not previously freed in mtr)
1112 @retval block (not allocated or initialized) otherwise */
1113 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1114 buf_block_t*
btr_page_alloc_low(dict_index_t * index,ulint hint_page_no,byte file_direction,ulint level,mtr_t * mtr,mtr_t * init_mtr)1115 btr_page_alloc_low(
1116 /*===============*/
1117 	dict_index_t*	index,		/*!< in: index */
1118 	ulint		hint_page_no,	/*!< in: hint of a good page */
1119 	byte		file_direction,	/*!< in: direction where a possible
1120 					page split is made */
1121 	ulint		level,		/*!< in: level where the page is placed
1122 					in the tree */
1123 	mtr_t*		mtr,		/*!< in/out: mini-transaction
1124 					for the allocation */
1125 	mtr_t*		init_mtr)	/*!< in/out: mtr or another
1126 					mini-transaction in which the
1127 					page should be initialized.
1128 					If init_mtr!=mtr, but the page
1129 					is already X-latched in mtr, do
1130 					not initialize the page. */
1131 {
1132 	fseg_header_t*	seg_header;
1133 	page_t*		root;
1134 
1135 	root = btr_root_get(index, mtr);
1136 
1137 	if (level == 0) {
1138 		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
1139 	} else {
1140 		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
1141 	}
1142 
1143 	/* Parameter TRUE below states that the caller has made the
1144 	reservation for free extents, and thus we know that a page can
1145 	be allocated: */
1146 
1147 	return(fseg_alloc_free_page_general(
1148 		       seg_header, hint_page_no, file_direction,
1149 		       TRUE, mtr, init_mtr));
1150 }
1151 
1152 /**************************************************************//**
1153 Allocates a new file page to be used in an index tree. NOTE: we assume
1154 that the caller has made the reservation for free extents!
1155 @retval NULL if no page could be allocated
1156 @retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
1157 (init_mtr == mtr, or the page was not previously freed in mtr)
1158 @retval block (not allocated or initialized) otherwise */
1159 UNIV_INTERN
1160 buf_block_t*
btr_page_alloc(dict_index_t * index,ulint hint_page_no,byte file_direction,ulint level,mtr_t * mtr,mtr_t * init_mtr)1161 btr_page_alloc(
1162 /*===========*/
1163 	dict_index_t*	index,		/*!< in: index */
1164 	ulint		hint_page_no,	/*!< in: hint of a good page */
1165 	byte		file_direction,	/*!< in: direction where a possible
1166 					page split is made */
1167 	ulint		level,		/*!< in: level where the page is placed
1168 					in the tree */
1169 	mtr_t*		mtr,		/*!< in/out: mini-transaction
1170 					for the allocation */
1171 	mtr_t*		init_mtr)	/*!< in/out: mini-transaction
1172 					for x-latching and initializing
1173 					the page */
1174 {
1175 	buf_block_t*	new_block;
1176 
1177 	if (dict_index_is_ibuf(index)) {
1178 
1179 		return(btr_page_alloc_for_ibuf(index, mtr));
1180 	}
1181 
1182 	new_block = btr_page_alloc_low(
1183 		index, hint_page_no, file_direction, level, mtr, init_mtr);
1184 
1185 	if (new_block) {
1186 		buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
1187 	}
1188 
1189 	return(new_block);
1190 }
1191 
1192 /**************************************************************//**
1193 Gets the number of pages in a B-tree.
1194 @return	number of pages, or ULINT_UNDEFINED if the index is unavailable */
1195 UNIV_INTERN
1196 ulint
btr_get_size(dict_index_t * index,ulint flag,mtr_t * mtr)1197 btr_get_size(
1198 /*=========*/
1199 	dict_index_t*	index,	/*!< in: index */
1200 	ulint		flag,	/*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
1201 	mtr_t*		mtr)	/*!< in/out: mini-transaction where index
1202 				is s-latched */
1203 {
1204 	fseg_header_t*	seg_header;
1205 	page_t*		root;
1206 	ulint		n;
1207 	ulint		dummy;
1208 
1209 	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
1210 				MTR_MEMO_S_LOCK));
1211 
1212 	if (index->page == FIL_NULL || dict_index_is_online_ddl(index)
1213 	    || *index->name == TEMP_INDEX_PREFIX) {
1214 		return(ULINT_UNDEFINED);
1215 	}
1216 
1217 	root = btr_root_get(index, mtr);
1218 
1219 	if (flag == BTR_N_LEAF_PAGES) {
1220 		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
1221 
1222 		fseg_n_reserved_pages(seg_header, &n, mtr);
1223 
1224 	} else if (flag == BTR_TOTAL_SIZE) {
1225 		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
1226 
1227 		n = fseg_n_reserved_pages(seg_header, &dummy, mtr);
1228 
1229 		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
1230 
1231 		n += fseg_n_reserved_pages(seg_header, &dummy, mtr);
1232 	} else {
1233 		ut_error;
1234 	}
1235 
1236 	return(n);
1237 }
1238 
/**************************************************************//**
Frees a page used in an ibuf tree. Puts the page to the free list of the
ibuf tree. */
static
void
btr_page_free_for_ibuf(
/*===================*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_t*		root;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	root = btr_root_get(index, mtr);

	/* The page is not returned to file space management; it is
	pushed onto the ibuf free list anchored in the root page header,
	from where btr_page_alloc_for_ibuf() can reuse it. */
	flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
		       buf_block_get_frame(block)
		       + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);

	ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
			    mtr));
}
1262 
/**************************************************************//**
Frees a file page used in an index tree. Can be used also to (BLOB)
external storage pages, because the page level 0 can be given as an
argument. */
UNIV_INTERN
void
btr_page_free_low(
/*==============*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
	ulint		level,	/*!< in: page level */
	mtr_t*		mtr)	/*!< in: mtr */
{
	fseg_header_t*	seg_header;
	page_t*		root;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	/* The page gets invalid for optimistic searches: increment the frame
	modify clock */

	buf_block_modify_clock_inc(block);
	btr_blob_dbg_assert_empty(index, buf_block_get_page_no(block));

	/* Insert buffer pages are recycled through the ibuf free list
	instead of being returned to file space management. */
	if (dict_index_is_ibuf(index)) {

		btr_page_free_for_ibuf(index, block, mtr);

		return;
	}

	root = btr_root_get(index, mtr);

	/* Return the page to the segment it was allocated from: the
	leaf segment for level 0 (including BLOB pages passed with
	level == 0), the non-leaf segment otherwise. */
	if (level == 0) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
	} else {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
	}

	fseg_free_page(seg_header,
		       buf_block_get_space(block),
		       buf_block_get_page_no(block), mtr);

	/* The page was marked free in the allocation bitmap, but it
	should remain buffer-fixed until mtr_commit(mtr) or until it
	is explicitly freed from the mini-transaction. */
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	/* TODO: Discard any operations on the page from the redo log
	and remove the block from the flush list and the buffer pool.
	This would free up buffer pool earlier and reduce writes to
	both the tablespace and the redo log. */
}
1314 
1315 /**************************************************************//**
1316 Frees a file page used in an index tree. NOTE: cannot free field external
1317 storage pages because the page must contain info on its level. */
1318 UNIV_INTERN
1319 void
btr_page_free(dict_index_t * index,buf_block_t * block,mtr_t * mtr)1320 btr_page_free(
1321 /*==========*/
1322 	dict_index_t*	index,	/*!< in: index tree */
1323 	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
1324 	mtr_t*		mtr)	/*!< in: mtr */
1325 {
1326 	const page_t*	page	= buf_block_get_frame(block);
1327 	ulint		level	= btr_page_get_level(page, mtr);
1328 
1329 	ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_INDEX);
1330 	btr_page_free_low(index, block, level, mtr);
1331 }
1332 
/**************************************************************//**
Sets the child node file address in a node pointer. */
UNIV_INLINE
void
btr_node_ptr_set_child_page_no(
/*===========================*/
	rec_t*		rec,	/*!< in: node pointer record */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	ulint		page_no,/*!< in: child node address */
	mtr_t*		mtr)	/*!< in: mtr */
{
	byte*	field;
	ulint	len;

	ut_ad(rec_offs_validate(rec, NULL, offsets));
	/* Node pointers exist only on non-leaf pages. */
	ut_ad(!page_is_leaf(page_align(rec)));
	ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));

	/* The child address is in the last field */
	field = rec_get_nth_field(rec, offsets,
				  rec_offs_n_fields(offsets) - 1, &len);

	ut_ad(len == REC_NODE_PTR_SIZE);

	/* Compressed pages have a dedicated redo-logged write path for
	node pointers; uncompressed pages take a plain 4-byte mlog write. */
	if (page_zip) {
		page_zip_write_node_ptr(page_zip, rec,
					rec_offs_data_size(offsets),
					page_no, mtr);
	} else {
		mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr);
	}
}
1367 
1368 /************************************************************//**
1369 Returns the child page of a node pointer and x-latches it.
1370 @return	child page, x-latched */
1371 static
1372 buf_block_t*
btr_node_ptr_get_child(const rec_t * node_ptr,dict_index_t * index,const ulint * offsets,mtr_t * mtr)1373 btr_node_ptr_get_child(
1374 /*===================*/
1375 	const rec_t*	node_ptr,/*!< in: node pointer */
1376 	dict_index_t*	index,	/*!< in: index */
1377 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
1378 	mtr_t*		mtr)	/*!< in: mtr */
1379 {
1380 	ulint	page_no;
1381 	ulint	space;
1382 
1383 	ut_ad(rec_offs_validate(node_ptr, index, offsets));
1384 	space = page_get_space_id(page_align(node_ptr));
1385 	page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
1386 
1387 	return(btr_block_get(space, dict_table_zip_size(index->table),
1388 			     page_no, RW_X_LATCH, index, mtr));
1389 }
1390 
/************************************************************//**
Returns the upper level node pointer to a page. It is assumed that mtr holds
an x-latch on the tree.
@return	rec_get_offsets() of the node pointer record */
static
ulint*
btr_page_get_father_node_ptr_func(
/*==============================*/
	ulint*		offsets,/*!< in: work area for the return value */
	mem_heap_t*	heap,	/*!< in: memory heap to use */
	btr_cur_t*	cursor,	/*!< in: cursor pointing to user record,
				out: cursor on node pointer record,
				its page x-latched */
	const char*	file,	/*!< in: file name */
	ulint		line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mtr */
{
	dtuple_t*	tuple;
	rec_t*		user_rec;
	rec_t*		node_ptr;
	ulint		level;
	ulint		page_no;
	dict_index_t*	index;

	page_no = buf_block_get_page_no(btr_cur_get_block(cursor));
	index = btr_cur_get_index(cursor);

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));

	/* The root has no father; the caller must not ask for it. */
	ut_ad(dict_index_get_page(index) != page_no);

	level = btr_page_get_level(btr_cur_get_page(cursor), mtr);

	/* Build a node pointer tuple from the record the cursor is on,
	then search for it one level up to find the father record. */
	user_rec = btr_cur_get_rec(cursor);
	ut_a(page_rec_is_user_rec(user_rec));
	tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);

	btr_cur_search_to_nth_level(index, level + 1, tuple, PAGE_CUR_LE,
				    BTR_CONT_MODIFY_TREE, cursor, 0,
				    file, line, mtr);

	node_ptr = btr_cur_get_rec(cursor);
	ut_ad(!page_rec_is_comp(node_ptr)
	      || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR);
	offsets = rec_get_offsets(node_ptr, index, offsets,
				  ULINT_UNDEFINED, &heap);

	/* Cross-check: the found node pointer must point back to the
	child page we started from.  If not, the tree is corrupt: dump
	diagnostics and crash. */
	if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) {
		rec_t*	print_rec;
		fputs("InnoDB: Dump of the child page:\n", stderr);
		buf_page_print(page_align(user_rec), 0,
			       BUF_PAGE_PRINT_NO_CRASH);
		fputs("InnoDB: Dump of the parent page:\n", stderr);
		buf_page_print(page_align(node_ptr), 0,
			       BUF_PAGE_PRINT_NO_CRASH);

		fputs("InnoDB: Corruption of an index tree: table ", stderr);
		ut_print_name(stderr, NULL, TRUE, index->table_name);
		fputs(", index ", stderr);
		ut_print_name(stderr, NULL, FALSE, index->name);
		fprintf(stderr, ",\n"
			"InnoDB: father ptr page no %lu, child page no %lu\n",
			(ulong)
			btr_node_ptr_get_child_page_no(node_ptr, offsets),
			(ulong) page_no);
		print_rec = page_rec_get_next(
			page_get_infimum_rec(page_align(user_rec)));
		offsets = rec_get_offsets(print_rec, index,
					  offsets, ULINT_UNDEFINED, &heap);
		page_rec_print(print_rec, offsets);
		offsets = rec_get_offsets(node_ptr, index, offsets,
					  ULINT_UNDEFINED, &heap);
		page_rec_print(node_ptr, offsets);

		fputs("InnoDB: You should dump + drop + reimport the table"
		      " to fix the\n"
		      "InnoDB: corruption. If the crash happens at "
		      "the database startup, see\n"
		      "InnoDB: " REFMAN "forcing-innodb-recovery.html about\n"
		      "InnoDB: forcing recovery. "
		      "Then dump + drop + reimport.\n", stderr);

		ut_error;
	}

	return(offsets);
}

/* Convenience wrapper supplying the caller's source location for
diagnostics. */
#define btr_page_get_father_node_ptr(of,heap,cur,mtr)			\
	btr_page_get_father_node_ptr_func(of,heap,cur,__FILE__,__LINE__,mtr)
1482 
1483 /************************************************************//**
1484 Returns the upper level node pointer to a page. It is assumed that mtr holds
1485 an x-latch on the tree.
1486 @return	rec_get_offsets() of the node pointer record */
1487 static
1488 ulint*
btr_page_get_father_block(ulint * offsets,mem_heap_t * heap,dict_index_t * index,buf_block_t * block,mtr_t * mtr,btr_cur_t * cursor)1489 btr_page_get_father_block(
1490 /*======================*/
1491 	ulint*		offsets,/*!< in: work area for the return value */
1492 	mem_heap_t*	heap,	/*!< in: memory heap to use */
1493 	dict_index_t*	index,	/*!< in: b-tree index */
1494 	buf_block_t*	block,	/*!< in: child page in the index */
1495 	mtr_t*		mtr,	/*!< in: mtr */
1496 	btr_cur_t*	cursor)	/*!< out: cursor on node pointer record,
1497 				its page x-latched */
1498 {
1499 	rec_t*	rec
1500 		= page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
1501 								 block)));
1502 	btr_cur_position(index, rec, block, cursor);
1503 	return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr));
1504 }
1505 
1506 /************************************************************//**
1507 Seeks to the upper level node pointer to a page.
1508 It is assumed that mtr holds an x-latch on the tree. */
1509 static
1510 void
btr_page_get_father(dict_index_t * index,buf_block_t * block,mtr_t * mtr,btr_cur_t * cursor)1511 btr_page_get_father(
1512 /*================*/
1513 	dict_index_t*	index,	/*!< in: b-tree index */
1514 	buf_block_t*	block,	/*!< in: child page in the index */
1515 	mtr_t*		mtr,	/*!< in: mtr */
1516 	btr_cur_t*	cursor)	/*!< out: cursor on node pointer record,
1517 				its page x-latched */
1518 {
1519 	mem_heap_t*	heap;
1520 	rec_t*		rec
1521 		= page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
1522 								 block)));
1523 	btr_cur_position(index, rec, block, cursor);
1524 
1525 	heap = mem_heap_create(100);
1526 	btr_page_get_father_node_ptr(NULL, heap, cursor, mtr);
1527 	mem_heap_free(heap);
1528 }
1529 
/************************************************************//**
Creates the root node for a new index tree.
@return	page number of the created root, FIL_NULL if did not succeed */
UNIV_INTERN
ulint
btr_create(
/*=======*/
	ulint		type,	/*!< in: type of the index */
	ulint		space,	/*!< in: space where created */
	ulint		zip_size,/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	index_id_t	index_id,/*!< in: index id */
	dict_index_t*	index,	/*!< in: index */
	mtr_t*		mtr)	/*!< in: mini-transaction handle */
{
	ulint		page_no;
	buf_block_t*	block;
	buf_frame_t*	frame;
	page_t*		page;
	page_zip_des_t*	page_zip;

	/* Create the two new segments (one, in the case of an ibuf tree) for
	the index tree; the segment headers are put on the allocated root page
	(for an ibuf tree, not in the root, but on a separate ibuf header
	page) */

	if (type & DICT_IBUF) {
		/* Allocate first the ibuf header page */
		buf_block_t*	ibuf_hdr_block = fseg_create(
			space, 0,
			IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);

		buf_block_dbg_add_level(
			ibuf_hdr_block, SYNC_IBUF_TREE_NODE_NEW);

		/* The ibuf header page has a fixed page number. */
		ut_ad(buf_block_get_page_no(ibuf_hdr_block)
		      == IBUF_HEADER_PAGE_NO);
		/* Allocate then the next page to the segment: it will be the
		tree root page */

		block = fseg_alloc_free_page(
			buf_block_get_frame(ibuf_hdr_block)
			+ IBUF_HEADER + IBUF_TREE_SEG_HEADER,
			IBUF_TREE_ROOT_PAGE_NO,
			FSP_UP, mtr);
		ut_ad(buf_block_get_page_no(block) == IBUF_TREE_ROOT_PAGE_NO);
	} else {
#ifdef UNIV_BLOB_DEBUG
		if ((type & DICT_CLUSTERED) && !index->blobs) {
			mutex_create(PFS_NOT_INSTRUMENTED,
				     &index->blobs_mutex, SYNC_ANY_LATCH);
			index->blobs = rbt_create(sizeof(btr_blob_dbg_t),
						  btr_blob_dbg_cmp);
		}
#endif /* UNIV_BLOB_DEBUG */
		/* Create the non-leaf segment; its header goes on the
		root page itself. */
		block = fseg_create(space, 0,
				    PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
	}

	if (block == NULL) {

		/* Out of space: no segment, nothing to undo. */
		return(FIL_NULL);
	}

	page_no = buf_block_get_page_no(block);
	frame = buf_block_get_frame(block);

	if (type & DICT_IBUF) {
		/* It is an insert buffer tree: initialize the free list */
		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);

		ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);

		flst_init(frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
	} else {
		/* It is a non-ibuf tree: create a file segment for leaf
		pages */
		buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);

		if (!fseg_create(space, page_no,
				 PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr)) {
			/* Not enough space for new segment, free root
			segment before return. */
			btr_free_root(space, zip_size, page_no, mtr);

			return(FIL_NULL);
		}

		/* The fseg create acquires a second latch on the page,
		therefore we must declare it: */
		buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
	}

	/* Create a new index page on the allocated segment page */
	page_zip = buf_block_get_page_zip(block);

	if (page_zip) {
		page = page_create_zip(block, index, 0, 0, mtr);
	} else {
		page = page_create(block, mtr,
				   dict_table_is_comp(index->table));
		/* Set the level of the new index page */
		btr_page_set_level(page, NULL, 0, mtr);
	}

	block->check_index_page_at_flush = TRUE;

	/* Set the index id of the page */
	btr_page_set_index_id(page, page_zip, index_id, mtr);

	/* Set the next node and previous node fields: the root has no
	siblings. */
	btr_page_set_next(page, page_zip, FIL_NULL, mtr);
	btr_page_set_prev(page, page_zip, FIL_NULL, mtr);

	/* We reset the free bits for the page to allow creation of several
	trees in the same mtr, otherwise the latch on a bitmap page would
	prevent it because of the latching order */

	if (!(type & DICT_CLUSTERED)) {
		ibuf_reset_free_bits(block);
	}

	/* In the following assertion we test that two records of maximum
	allowed size fit on the root page: this fact is needed to ensure
	correctness of split algorithms */

	ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);

	return(page_no);
}
1660 
1661 /************************************************************//**
1662 Frees a B-tree except the root page, which MUST be freed after this
1663 by calling btr_free_root. */
1664 UNIV_INTERN
1665 void
btr_free_but_not_root(ulint space,ulint zip_size,ulint root_page_no)1666 btr_free_but_not_root(
1667 /*==================*/
1668 	ulint	space,		/*!< in: space where created */
1669 	ulint	zip_size,	/*!< in: compressed page size in bytes
1670 				or 0 for uncompressed pages */
1671 	ulint	root_page_no)	/*!< in: root page number */
1672 {
1673 	ibool	finished;
1674 	page_t*	root;
1675 	mtr_t	mtr;
1676 
1677 leaf_loop:
1678 	mtr_start(&mtr);
1679 
1680 	root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH,
1681 			    NULL, &mtr);
1682 #ifdef UNIV_BTR_DEBUG
1683 	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
1684 				    + root, space));
1685 	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
1686 				    + root, space));
1687 #endif /* UNIV_BTR_DEBUG */
1688 
1689 	/* NOTE: page hash indexes are dropped when a page is freed inside
1690 	fsp0fsp. */
1691 
1692 	finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
1693 				  &mtr);
1694 	mtr_commit(&mtr);
1695 
1696 	if (!finished) {
1697 
1698 		goto leaf_loop;
1699 	}
1700 top_loop:
1701 	mtr_start(&mtr);
1702 
1703 	root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH,
1704 			    NULL, &mtr);
1705 #ifdef UNIV_BTR_DEBUG
1706 	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
1707 				    + root, space));
1708 #endif /* UNIV_BTR_DEBUG */
1709 
1710 	finished = fseg_free_step_not_header(
1711 		root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
1712 	mtr_commit(&mtr);
1713 
1714 	if (!finished) {
1715 
1716 		goto top_loop;
1717 	}
1718 }
1719 
/************************************************************//**
Frees the B-tree root page. Other tree MUST already have been freed. */
UNIV_INTERN
void
btr_free_root(
/*==========*/
	ulint	space,		/*!< in: space where created */
	ulint	zip_size,	/*!< in: compressed page size in bytes
				or 0 for uncompressed pages */
	ulint	root_page_no,	/*!< in: root page number */
	mtr_t*	mtr)		/*!< in/out: mini-transaction */
{
	buf_block_t*	block;
	fseg_header_t*	header;

	block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH,
			      NULL, mtr);

	/* Invalidate any adaptive hash index entries built on the page
	before it is freed. */
	btr_search_drop_page_hash_index(block);

	/* Freeing the non-leaf segment also frees the root page that
	carries its header; the leaf segment must already be gone
	(see btr_free_but_not_root()). */
	header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP;
#ifdef UNIV_BTR_DEBUG
	ut_a(btr_root_fseg_validate(header, space));
#endif /* UNIV_BTR_DEBUG */

	while (!fseg_free_step(header, mtr)) {
		/* Free the entire segment in small steps. */
	}
}
1749 #endif /* !UNIV_HOTBACKUP */
1750 
1751 /*************************************************************//**
1752 Reorganizes an index page.
1753 
1754 IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
1755 if this is a compressed leaf page in a secondary index. This has to
1756 be done either within the same mini-transaction, or by invoking
1757 ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
1758 IBUF_BITMAP_FREE is unaffected by reorganization.
1759 
1760 @retval true if the operation was successful
1761 @retval false if it is a compressed page, and recompression failed */
UNIV_INTERN
bool
btr_page_reorganize_low(
/*====================*/
	bool		recovery,/*!< in: true if called in recovery:
				locks should not be updated, i.e.,
				there cannot exist locks on the
				page, and a hash index should not be
				dropped: it cannot exist */
	ulint		z_level,/*!< in: compression level to be used
				if dealing with compressed page */
	page_cur_t*	cursor,	/*!< in/out: page cursor */
	dict_index_t*	index,	/*!< in: the index tree of the page */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	buf_block_t*	block		= page_cur_get_block(cursor);
#ifndef UNIV_HOTBACKUP
	buf_pool_t*	buf_pool	= buf_pool_from_bpage(&block->page);
#endif /* !UNIV_HOTBACKUP */
	page_t*		page		= buf_block_get_frame(block);
	page_zip_des_t*	page_zip	= buf_block_get_page_zip(block);
	buf_block_t*	temp_block;
	page_t*		temp_page;
	ulint		log_mode;
	ulint		data_size1;
	ulint		data_size2;
	ulint		max_ins_size1;
	ulint		max_ins_size2;
	bool		success		= false;
	ulint		pos;
	bool		log_compressed;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	btr_assert_not_corrupted(block, index);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
	/* Record data size and reorganized max insert size before the
	rebuild, so the result can be cross-checked afterwards: a correct
	reorganization must not change either quantity. */
	data_size1 = page_get_data_size(page);
	max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1);

	/* Turn logging off: the whole reorganization is redo-logged as a
	single MLOG_*_PAGE_REORGANIZE record at the end, instead of
	logging every record move individually. */
	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);

#ifndef UNIV_HOTBACKUP
	temp_block = buf_block_alloc(buf_pool);
#else /* !UNIV_HOTBACKUP */
	ut_ad(block == back_block1);
	temp_block = back_block2;
#endif /* !UNIV_HOTBACKUP */
	temp_page = temp_block->frame;

	MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS);

	/* Copy the old page to temporary space */
	buf_frame_copy(temp_page, page);

#ifndef UNIV_HOTBACKUP
	if (!recovery) {
		/* The adaptive hash index refers to records by address;
		those addresses will change during the rebuild below. */
		btr_search_drop_page_hash_index(block);
	}

	block->check_index_page_at_flush = TRUE;
#endif /* !UNIV_HOTBACKUP */
	btr_blob_dbg_remove(page, index, "btr_page_reorganize");

	/* Save the cursor position. */
	pos = page_rec_get_n_recs_before(page_cur_get_rec(cursor));

	/* Recreate the page: note that global data on page (possible
	segment headers, next page-field, etc.) is preserved intact */

	page_create(block, mtr, dict_table_is_comp(index->table));

	/* Copy the records from the temporary space to the recreated page;
	do not copy the lock bits yet */

	page_copy_rec_list_end_no_locks(block, temp_block,
					page_get_infimum_rec(temp_page),
					index, mtr);

	if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
		/* Copy max trx id to recreated page */
		trx_id_t	max_trx_id = page_get_max_trx_id(temp_page);
		page_set_max_trx_id(block, NULL, max_trx_id, mtr);
		/* In crash recovery, dict_index_is_sec_or_ibuf() always
		holds, even for clustered indexes.  max_trx_id is
		unused in clustered index pages. */
		ut_ad(max_trx_id != 0 || recovery);
	}

	/* If innodb_log_compressed_pages is ON, page reorganize should log the
	compressed page image.*/
	log_compressed = page_zip && page_zip_log_pages;

	if (log_compressed) {
		/* Re-enable redo logging so that page_zip_compress()
		below logs the compressed page image. */
		mtr_set_log_mode(mtr, log_mode);
	}

	if (page_zip
	    && !page_zip_compress(page_zip, page, index, z_level, mtr)) {

		/* Restore the old page and exit. */
		btr_blob_dbg_restore(page, temp_page, index,
				     "btr_page_reorganize_compress_fail");

#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
		/* Check that the bytes that we skip are identical. */
		ut_a(!memcmp(page, temp_page, PAGE_HEADER));
		ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
			     PAGE_HEADER + PAGE_N_RECS + temp_page,
			     PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
		ut_a(!memcmp(UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page,
			     UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + temp_page,
			     FIL_PAGE_DATA_END));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */

		/* Copy back only the parts that the rebuild may have
		modified: a slice of the page header, and the record
		data between PAGE_DATA and the page trailer. */
		memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
		       PAGE_N_RECS - PAGE_N_DIR_SLOTS);
		memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
		       UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END);

#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
		ut_a(!memcmp(page, temp_page, UNIV_PAGE_SIZE));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */

		goto func_exit;
	}

#ifndef UNIV_HOTBACKUP
	if (!recovery) {
		/* Update the record lock bitmaps */
		lock_move_reorganize_page(block, temp_block);
	}
#endif /* !UNIV_HOTBACKUP */

	data_size2 = page_get_data_size(page);
	max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);

	if (data_size1 != data_size2 || max_ins_size1 != max_ins_size2) {
		/* A mismatch means records were lost or corrupted during
		the rebuild; dump both page images for diagnosis. */
		buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
		buf_page_print(temp_page, 0, BUF_PAGE_PRINT_NO_CRASH);

		fprintf(stderr,
			"InnoDB: Error: page old data size %lu"
			" new data size %lu\n"
			"InnoDB: Error: page old max ins size %lu"
			" new max ins size %lu\n"
			"InnoDB: Submit a detailed bug report"
			" to http://bugs.mysql.com\n",
			(unsigned long) data_size1, (unsigned long) data_size2,
			(unsigned long) max_ins_size1,
			(unsigned long) max_ins_size2);
		ut_ad(0);
	} else {
		success = true;
	}

	/* Restore the cursor position. */
	if (pos > 0) {
		cursor->rec = page_rec_get_nth(page, pos);
	} else {
		ut_ad(cursor->rec == page_get_infimum_rec(page));
	}

func_exit:
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
#ifndef UNIV_HOTBACKUP
	buf_block_free(temp_block);
#endif /* !UNIV_HOTBACKUP */

	/* Restore logging mode */
	mtr_set_log_mode(mtr, log_mode);

#ifndef UNIV_HOTBACKUP
	if (success) {
		byte	type;
		byte*	log_ptr;

		/* Write the log record */
		if (page_zip) {
			ut_ad(page_is_comp(page));
			type = MLOG_ZIP_PAGE_REORGANIZE;
		} else if (page_is_comp(page)) {
			type = MLOG_COMP_PAGE_REORGANIZE;
		} else {
			type = MLOG_PAGE_REORGANIZE;
		}

		/* If the compressed page image was already logged above,
		no separate reorganize record is needed. */
		log_ptr = log_compressed
			? NULL
			: mlog_open_and_write_index(
				mtr, page, index, type,
				page_zip ? 1 : 0);

		/* For compressed pages write the compression level. */
		if (log_ptr && page_zip) {
			mach_write_to_1(log_ptr, z_level);
			mlog_close(mtr, log_ptr + 1);
		}

		MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL);
	}
#endif /* !UNIV_HOTBACKUP */

	return(success);
}
1970 
1971 /*************************************************************//**
1972 Reorganizes an index page.
1973 
1974 IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
1975 if this is a compressed leaf page in a secondary index. This has to
1976 be done either within the same mini-transaction, or by invoking
1977 ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
1978 IBUF_BITMAP_FREE is unaffected by reorganization.
1979 
1980 @retval true if the operation was successful
1981 @retval false if it is a compressed page, and recompression failed */
1982 static MY_ATTRIBUTE((nonnull))
1983 bool
btr_page_reorganize_block(bool recovery,ulint z_level,buf_block_t * block,dict_index_t * index,mtr_t * mtr)1984 btr_page_reorganize_block(
1985 /*======================*/
1986 	bool		recovery,/*!< in: true if called in recovery:
1987 				locks should not be updated, i.e.,
1988 				there cannot exist locks on the
1989 				page, and a hash index should not be
1990 				dropped: it cannot exist */
1991 	ulint		z_level,/*!< in: compression level to be used
1992 				if dealing with compressed page */
1993 	buf_block_t*	block,	/*!< in/out: B-tree page */
1994 	dict_index_t*	index,	/*!< in: the index tree of the page */
1995 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
1996 {
1997 	page_cur_t	cur;
1998 	page_cur_set_before_first(block, &cur);
1999 
2000 	return(btr_page_reorganize_low(recovery, z_level, &cur, index, mtr));
2001 }
2002 
2003 #ifndef UNIV_HOTBACKUP
2004 /*************************************************************//**
2005 Reorganizes an index page.
2006 
2007 IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
2008 if this is a compressed leaf page in a secondary index. This has to
2009 be done either within the same mini-transaction, or by invoking
2010 ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
2011 IBUF_BITMAP_FREE is unaffected by reorganization.
2012 
2013 @retval true if the operation was successful
2014 @retval false if it is a compressed page, and recompression failed */
2015 UNIV_INTERN
2016 bool
btr_page_reorganize(page_cur_t * cursor,dict_index_t * index,mtr_t * mtr)2017 btr_page_reorganize(
2018 /*================*/
2019 	page_cur_t*	cursor,	/*!< in/out: page cursor */
2020 	dict_index_t*	index,	/*!< in: the index tree of the page */
2021 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
2022 {
2023 	return(btr_page_reorganize_low(false, page_zip_level,
2024 				       cursor, index, mtr));
2025 }
2026 #endif /* !UNIV_HOTBACKUP */
2027 
2028 /***********************************************************//**
2029 Parses a redo log record of reorganizing a page.
2030 @return	end of log record or NULL */
2031 UNIV_INTERN
2032 byte*
btr_parse_page_reorganize(byte * ptr,byte * end_ptr,dict_index_t * index,bool compressed,buf_block_t * block,mtr_t * mtr)2033 btr_parse_page_reorganize(
2034 /*======================*/
2035 	byte*		ptr,	/*!< in: buffer */
2036 	byte*		end_ptr,/*!< in: buffer end */
2037 	dict_index_t*	index,	/*!< in: record descriptor */
2038 	bool		compressed,/*!< in: true if compressed page */
2039 	buf_block_t*	block,	/*!< in: page to be reorganized, or NULL */
2040 	mtr_t*		mtr)	/*!< in: mtr or NULL */
2041 {
2042 	ulint	level;
2043 
2044 	ut_ad(ptr != NULL);
2045 	ut_ad(end_ptr != NULL);
2046 
2047 	/* If dealing with a compressed page the record has the
2048 	compression level used during original compression written in
2049 	one byte. Otherwise record is empty. */
2050 	if (compressed) {
2051 		if (ptr == end_ptr) {
2052 			return(NULL);
2053 		}
2054 
2055 		level = mach_read_from_1(ptr);
2056 
2057 		ut_a(level <= 9);
2058 		++ptr;
2059 	} else {
2060 		level = page_zip_level;
2061 	}
2062 
2063 	if (block != NULL) {
2064 		btr_page_reorganize_block(true, level, block, index, mtr);
2065 	}
2066 
2067 	return(ptr);
2068 }
2069 
2070 #ifndef UNIV_HOTBACKUP
2071 /*************************************************************//**
2072 Empties an index page.  @see btr_page_create(). */
2073 static
2074 void
btr_page_empty(buf_block_t * block,page_zip_des_t * page_zip,dict_index_t * index,ulint level,mtr_t * mtr)2075 btr_page_empty(
2076 /*===========*/
2077 	buf_block_t*	block,	/*!< in: page to be emptied */
2078 	page_zip_des_t*	page_zip,/*!< out: compressed page, or NULL */
2079 	dict_index_t*	index,	/*!< in: index of the page */
2080 	ulint		level,	/*!< in: the B-tree level of the page */
2081 	mtr_t*		mtr)	/*!< in: mtr */
2082 {
2083 	page_t*	page = buf_block_get_frame(block);
2084 
2085 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2086 	ut_ad(page_zip == buf_block_get_page_zip(block));
2087 #ifdef UNIV_ZIP_DEBUG
2088 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
2089 #endif /* UNIV_ZIP_DEBUG */
2090 
2091 	btr_search_drop_page_hash_index(block);
2092 	btr_blob_dbg_remove(page, index, "btr_page_empty");
2093 
2094 	/* Recreate the page: note that global data on page (possible
2095 	segment headers, next page-field, etc.) is preserved intact */
2096 
2097 	if (page_zip) {
2098 		page_create_zip(block, index, level, 0, mtr);
2099 	} else {
2100 		page_create(block, mtr, dict_table_is_comp(index->table));
2101 		btr_page_set_level(page, NULL, level, mtr);
2102 	}
2103 
2104 	block->check_index_page_at_flush = TRUE;
2105 }
2106 
2107 /*************************************************************//**
2108 Makes tree one level higher by splitting the root, and inserts
2109 the tuple. It is assumed that mtr contains an x-latch on the tree.
2110 NOTE that the operation of this function must always succeed,
2111 we cannot reverse it: therefore enough free disk space must be
2112 guaranteed to be available before this function is called.
2113 @return	inserted record or NULL if run out of space */
UNIV_INTERN
rec_t*
btr_root_raise_and_insert(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor at which to insert: must be
				on the root page; when the function returns,
				the cursor is positioned on the predecessor
				of the inserted record */
	ulint**		offsets,/*!< out: offsets on inserted record */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mtr_t*		mtr)	/*!< in: mtr */
{
	dict_index_t*	index;
	page_t*		root;
	page_t*		new_page;
	ulint		new_page_no;
	rec_t*		rec;
	dtuple_t*	node_ptr;
	ulint		level;
	rec_t*		node_ptr_rec;
	page_cur_t*	page_cursor;
	page_zip_des_t*	root_page_zip;
	page_zip_des_t*	new_page_zip;
	buf_block_t*	root_block;
	buf_block_t*	new_block;

	root = btr_cur_get_page(cursor);
	root_block = btr_cur_get_block(cursor);
	root_page_zip = buf_block_get_page_zip(root_block);
	ut_ad(!page_is_empty(root));
	index = btr_cur_get_index(cursor);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!root_page_zip || page_zip_validate(root_page_zip, root, index));
#endif /* UNIV_ZIP_DEBUG */
#ifdef UNIV_BTR_DEBUG
	if (!dict_index_is_ibuf(index)) {
		ulint	space = dict_index_get_space(index);

		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
					    + root, space));
		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
					    + root, space));
	}

	ut_a(dict_index_get_page(index) == page_get_page_no(root));
#endif /* UNIV_BTR_DEBUG */
	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX));

	/* Allocate a new page to the tree. Root splitting is done by first
	moving the root records to the new page, emptying the root, putting
	a node pointer to the new page, and then splitting the new page. */

	level = btr_page_get_level(root, mtr);

	new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr);

	/* NOTE(review): if btr_page_alloc() returns NULL without
	os_has_said_disk_full being set, new_block would be dereferenced
	below -- confirm allocation can only fail in the disk-full case. */
	if (new_block == NULL && os_has_said_disk_full) {
		return(NULL);
        }

	new_page = buf_block_get_frame(new_block);
	new_page_zip = buf_block_get_page_zip(new_block);
	/* Root and child must agree on their compressed-page status
	and compressed page size. */
	ut_a(!new_page_zip == !root_page_zip);
	ut_a(!new_page_zip
	     || page_zip_get_size(new_page_zip)
	     == page_zip_get_size(root_page_zip));

	btr_page_create(new_block, new_page_zip, index, level, mtr);

	/* Set the next node and previous node fields of new page */
	btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr);
	btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr);

	/* Copy the records from root to the new page one by one. */

	if (0
#ifdef UNIV_ZIP_COPY
	    || new_page_zip
#endif /* UNIV_ZIP_COPY */
	    || !page_copy_rec_list_end(new_block, root_block,
				       page_get_infimum_rec(root),
				       index, mtr)) {
		/* The record-by-record copy failed (it can fail only on
		a compressed page); fall back to a byte-wise copy. */
		ut_a(new_page_zip);

		/* Copy the page byte for byte. */
		page_zip_copy_recs(new_page_zip, new_page,
				   root_page_zip, root, index, mtr);

		/* Update the lock table and possible hash index. */

		lock_move_rec_list_end(new_block, root_block,
				       page_get_infimum_rec(root));

		btr_search_move_or_delete_hash_entries(new_block, root_block,
						       index);
	}

	/* If this is a pessimistic insert which is actually done to
	perform a pessimistic update then we have stored the lock
	information of the record to be inserted on the infimum of the
	root page: we cannot discard the lock structs on the root page */

	lock_update_root_raise(new_block, root_block);

	/* Create a memory heap where the node pointer is stored */
	if (!*heap) {
		*heap = mem_heap_create(1000);
	}

	rec = page_rec_get_next(page_get_infimum_rec(new_page));
	new_page_no = buf_block_get_page_no(new_block);

	/* Build the node pointer (= node key and page address) for the
	child */

	node_ptr = dict_index_build_node_ptr(
		index, rec, new_page_no, *heap, level);
	/* The node pointer must be marked as the predefined minimum record,
	as there is no lower alphabetical limit to records in the leftmost
	node of a level: */
	dtuple_set_info_bits(node_ptr,
			     dtuple_get_info_bits(node_ptr)
			     | REC_INFO_MIN_REC_FLAG);

	/* Rebuild the root page to get free space */
	btr_page_empty(root_block, root_page_zip, index, level + 1, mtr);

	/* Set the next node and previous node fields, although
	they should already have been set.  The previous node field
	must be FIL_NULL if root_page_zip != NULL, because the
	REC_INFO_MIN_REC_FLAG (of the first user record) will be
	set if and only if btr_page_get_prev() == FIL_NULL. */
	btr_page_set_next(root, root_page_zip, FIL_NULL, mtr);
	btr_page_set_prev(root, root_page_zip, FIL_NULL, mtr);

	page_cursor = btr_cur_get_page_cur(cursor);

	/* Insert node pointer to the root */

	page_cur_set_before_first(root_block, page_cursor);

	node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
					     index, offsets, heap, 0, mtr);

	/* The root page should only contain the node pointer
	to new_page at this point.  Thus, the data should fit. */
	ut_a(node_ptr_rec);

	/* We play safe and reset the free bits for the new page */

#if 0
	fprintf(stderr, "Root raise new page no %lu\n", new_page_no);
#endif

	if (!dict_index_is_clust(index)) {
		ibuf_reset_free_bits(new_block);
	}

	/* Reposition the cursor to the child node */
	page_cur_search(new_block, index, tuple,
			PAGE_CUR_LE, page_cursor);

	/* Split the child and insert tuple */
	return(btr_page_split_and_insert(flags, cursor, offsets, heap,
					 tuple, n_ext, mtr));
}
2285 
2286 /*************************************************************//**
2287 Decides if the page should be split at the convergence point of inserts
2288 converging to the left.
2289 @return	TRUE if split recommended */
2290 UNIV_INTERN
2291 ibool
btr_page_get_split_rec_to_left(btr_cur_t * cursor,rec_t ** split_rec)2292 btr_page_get_split_rec_to_left(
2293 /*===========================*/
2294 	btr_cur_t*	cursor,	/*!< in: cursor at which to insert */
2295 	rec_t**		split_rec) /*!< out: if split recommended,
2296 				the first record on upper half page,
2297 				or NULL if tuple to be inserted should
2298 				be first */
2299 {
2300 	page_t*	page;
2301 	rec_t*	insert_point;
2302 	rec_t*	infimum;
2303 
2304 	page = btr_cur_get_page(cursor);
2305 	insert_point = btr_cur_get_rec(cursor);
2306 
2307 	if (page_header_get_ptr(page, PAGE_LAST_INSERT)
2308 	    == page_rec_get_next(insert_point)) {
2309 
2310 		infimum = page_get_infimum_rec(page);
2311 
2312 		/* If the convergence is in the middle of a page, include also
2313 		the record immediately before the new insert to the upper
2314 		page. Otherwise, we could repeatedly move from page to page
2315 		lots of records smaller than the convergence point. */
2316 
2317 		if (infimum != insert_point
2318 		    && page_rec_get_next(infimum) != insert_point) {
2319 
2320 			*split_rec = insert_point;
2321 		} else {
2322 			*split_rec = page_rec_get_next(insert_point);
2323 		}
2324 
2325 		return(TRUE);
2326 	}
2327 
2328 	return(FALSE);
2329 }
2330 
2331 /*************************************************************//**
2332 Decides if the page should be split at the convergence point of inserts
2333 converging to the right.
2334 @return	TRUE if split recommended */
2335 UNIV_INTERN
2336 ibool
btr_page_get_split_rec_to_right(btr_cur_t * cursor,rec_t ** split_rec)2337 btr_page_get_split_rec_to_right(
2338 /*============================*/
2339 	btr_cur_t*	cursor,	/*!< in: cursor at which to insert */
2340 	rec_t**		split_rec) /*!< out: if split recommended,
2341 				the first record on upper half page,
2342 				or NULL if tuple to be inserted should
2343 				be first */
2344 {
2345 	page_t*	page;
2346 	rec_t*	insert_point;
2347 
2348 	page = btr_cur_get_page(cursor);
2349 	insert_point = btr_cur_get_rec(cursor);
2350 
2351 	/* We use eager heuristics: if the new insert would be right after
2352 	the previous insert on the same page, we assume that there is a
2353 	pattern of sequential inserts here. */
2354 
2355 	if (page_header_get_ptr(page, PAGE_LAST_INSERT) == insert_point) {
2356 
2357 		rec_t*	next_rec;
2358 
2359 		next_rec = page_rec_get_next(insert_point);
2360 
2361 		if (page_rec_is_supremum(next_rec)) {
2362 split_at_new:
2363 			/* Split at the new record to insert */
2364 			*split_rec = NULL;
2365 		} else {
2366 			rec_t*	next_next_rec = page_rec_get_next(next_rec);
2367 			if (page_rec_is_supremum(next_next_rec)) {
2368 
2369 				goto split_at_new;
2370 			}
2371 
2372 			/* If there are >= 2 user records up from the insert
2373 			point, split all but 1 off. We want to keep one because
2374 			then sequential inserts can use the adaptive hash
2375 			index, as they can do the necessary checks of the right
2376 			search position just by looking at the records on this
2377 			page. */
2378 
2379 			*split_rec = next_next_rec;
2380 		}
2381 
2382 		return(TRUE);
2383 	}
2384 
2385 	return(FALSE);
2386 }
2387 
2388 /*************************************************************//**
2389 Calculates a split record such that the tuple will certainly fit on
2390 its half-page when the split is performed. We assume in this function
2391 only that the cursor page has at least one user record.
2392 @return split record, or NULL if tuple will be the first record on
2393 the lower or upper half-page (determined by btr_page_tuple_smaller()) */
static
rec_t*
btr_page_get_split_rec(
/*===================*/
	btr_cur_t*	cursor,	/*!< in: cursor at which insert should be made */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext)	/*!< in: number of externally stored columns */
{
	page_t*		page;
	page_zip_des_t*	page_zip;
	ulint		insert_size;
	ulint		free_space;
	ulint		total_data;
	ulint		total_n_recs;
	ulint		total_space;
	ulint		incl_data;
	rec_t*		ins_rec;
	rec_t*		rec;
	rec_t*		next_rec;
	ulint		n;
	mem_heap_t*	heap;
	ulint*		offsets;

	page = btr_cur_get_page(cursor);

	insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
	free_space  = page_get_free_space_of_empty(page_is_comp(page));

	page_zip = btr_cur_get_page_zip(cursor);
	if (page_zip) {
		/* Estimate the free space of an empty compressed page. */
		ulint	free_space_zip = page_zip_empty_size(
			cursor->index->n_fields,
			page_zip_get_size(page_zip));

		/* Use the smaller of the two estimates, so the split
		point also fits on a compressed page. */
		if (free_space > (ulint) free_space_zip) {
			free_space = (ulint) free_space_zip;
		}
	}

	/* free_space is now the free space of a created new page */

	total_data   = page_get_data_size(page) + insert_size;
	total_n_recs = page_get_n_recs(page) + 1;
	ut_ad(total_n_recs >= 2);
	total_space  = total_data + page_dir_calc_reserved_space(total_n_recs);

	n = 0;
	incl_data = 0;
	ins_rec = btr_cur_get_rec(cursor);
	rec = page_get_infimum_rec(page);

	heap = NULL;
	offsets = NULL;

	/* We start to include records to the left half, and when the
	space reserved by them exceeds half of total_space, then if
	the included records fit on the left page, they will be put there
	if something was left over also for the right page,
	otherwise the last included record will be the first on the right
	half page */

	/* In the loop below, rec == NULL is a sentinel meaning that the
	tuple to be inserted has just been counted in; rec then resumes
	from the record after the insert point. */
	do {
		/* Decide the next record to include */
		if (rec == ins_rec) {
			rec = NULL;	/* NULL denotes that tuple is
					now included */
		} else if (rec == NULL) {
			rec = page_rec_get_next(ins_rec);
		} else {
			rec = page_rec_get_next(rec);
		}

		if (rec == NULL) {
			/* Include tuple */
			incl_data += insert_size;
		} else {
			offsets = rec_get_offsets(rec, cursor->index,
						  offsets, ULINT_UNDEFINED,
						  &heap);
			incl_data += rec_offs_size(offsets);
		}

		n++;
	} while (incl_data + page_dir_calc_reserved_space(n)
		 < total_space / 2);

	if (incl_data + page_dir_calc_reserved_space(n) <= free_space) {
		/* The next record will be the first on
		the right half page if it is not the
		supremum record of page */

		if (rec == ins_rec) {
			/* rec == NULL on return means: split so that the
			tuple to be inserted is the first record on the
			upper half-page. */
			rec = NULL;

			goto func_exit;
		} else if (rec == NULL) {
			next_rec = page_rec_get_next(ins_rec);
		} else {
			next_rec = page_rec_get_next(rec);
		}
		ut_ad(next_rec);
		if (!page_rec_is_supremum(next_rec)) {
			rec = next_rec;
		}
	}

func_exit:
	if (heap) {
		mem_heap_free(heap);
	}
	return(rec);
}
2507 
2508 /*************************************************************//**
2509 Returns TRUE if the insert fits on the appropriate half-page with the
2510 chosen split_rec.
2511 @return	true if fits */
static MY_ATTRIBUTE((nonnull(1,3,4,6), warn_unused_result))
bool
btr_page_insert_fits(
/*=================*/
	btr_cur_t*	cursor,	/*!< in: cursor at which insert
				should be made */
	const rec_t*	split_rec,/*!< in: suggestion for first record
				on upper half-page, or NULL if
				tuple to be inserted should be first */
	ulint**		offsets,/*!< in: rec_get_offsets(
				split_rec, cursor->index); out: garbage */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mem_heap_t**	heap)	/*!< in: temporary memory heap */
{
	page_t*		page;
	ulint		insert_size;
	ulint		free_space;
	ulint		total_data;
	ulint		total_n_recs;
	const rec_t*	rec;
	const rec_t*	end_rec;

	page = btr_cur_get_page(cursor);

	ut_ad(!split_rec
	      || !page_is_comp(page) == !rec_offs_comp(*offsets));
	ut_ad(!split_rec
	      || rec_offs_validate(split_rec, cursor->index, *offsets));

	insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
	free_space  = page_get_free_space_of_empty(page_is_comp(page));

	/* free_space is now the free space of a created new page */

	total_data   = page_get_data_size(page) + insert_size;
	total_n_recs = page_get_n_recs(page) + 1;

	/* We determine which records (from rec to end_rec, not including
	end_rec) will end up on the other half page from tuple when it is
	inserted. */

	if (split_rec == NULL) {
		/* The tuple will be the first record on the upper half;
		everything up to and including the insert point moves
		to the other half. */
		rec = page_rec_get_next(page_get_infimum_rec(page));
		end_rec = page_rec_get_next(btr_cur_get_rec(cursor));

	} else if (cmp_dtuple_rec(tuple, split_rec, *offsets) >= 0) {

		/* The tuple belongs to the upper half; the lower half
		[infimum+1, split_rec) moves to the other page. */
		rec = page_rec_get_next(page_get_infimum_rec(page));
		end_rec = split_rec;
	} else {
		/* The tuple belongs to the lower half; the upper half
		[split_rec, supremum) moves to the other page. */
		rec = split_rec;
		end_rec = page_get_supremum_rec(page);
	}

	if (total_data + page_dir_calc_reserved_space(total_n_recs)
	    <= free_space) {

		/* Ok, there will be enough available space on the
		half page where the tuple is inserted */

		return(true);
	}

	while (rec != end_rec) {
		/* In this loop we calculate the amount of reserved
		space after rec is removed from page. */

		*offsets = rec_get_offsets(rec, cursor->index, *offsets,
					   ULINT_UNDEFINED, heap);

		total_data -= rec_offs_size(*offsets);
		total_n_recs--;

		if (total_data + page_dir_calc_reserved_space(total_n_recs)
		    <= free_space) {

			/* Ok, there will be enough available space on the
			half page where the tuple is inserted */

			return(true);
		}

		rec = page_rec_get_next_const(rec);
	}

	return(false);
}
2600 
2601 /*******************************************************//**
2602 Inserts a data tuple to a tree on a non-leaf level. It is assumed
2603 that mtr holds an x-latch on the tree. */
UNIV_INTERN
void
btr_insert_on_non_leaf_level_func(
/*==============================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	dict_index_t*	index,	/*!< in: index */
	ulint		level,	/*!< in: level, must be > 0 */
	dtuple_t*	tuple,	/*!< in: the record to be inserted */
	const char*	file,	/*!< in: file name */
	ulint		line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mtr */
{
	big_rec_t*	dummy_big_rec;
	btr_cur_t	cursor;
	dberr_t		err;
	rec_t*		rec;
	ulint*		offsets	= NULL;
	mem_heap_t*	heap = NULL;

	ut_ad(level > 0);

	btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE,
				    BTR_CONT_MODIFY_TREE,
				    &cursor, 0, file, line, mtr);

	ut_ad(cursor.flag == BTR_CUR_BINARY);

	/* Node pointer inserts never need locking or undo logging:
	locks and undo records apply to leaf-level user records only. */
	err = btr_cur_optimistic_insert(
		flags
		| BTR_NO_LOCKING_FLAG
		| BTR_KEEP_SYS_FLAG
		| BTR_NO_UNDO_LOG_FLAG,
		&cursor, &offsets, &heap,
		tuple, &rec, &dummy_big_rec, 0, NULL, mtr);

	if (err == DB_FAIL) {
		/* Not enough room on the page: retry pessimistically,
		splitting pages as needed.  This must succeed, because
		the caller holds an x-latch on the whole tree. */
		err = btr_cur_pessimistic_insert(flags
						 | BTR_NO_LOCKING_FLAG
						 | BTR_KEEP_SYS_FLAG
						 | BTR_NO_UNDO_LOG_FLAG,
						 &cursor, &offsets, &heap,
						 tuple, &rec,
						 &dummy_big_rec, 0, NULL, mtr);
		ut_a(err == DB_SUCCESS);
	}
	/* NOTE(review): assumes the insert calls above always allocate
	the offsets heap, so heap is non-NULL here -- confirm. */
	mem_heap_free(heap);
}
2651 
/**************************************************************//**
Attaches the halves of an index page on the appropriate level in an
index tree: replaces the old node pointer with one to the lower half,
inserts a new node pointer for the upper half on the parent level, and
links both halves into the doubly-linked page list of their level. */
static MY_ATTRIBUTE((nonnull))
void
btr_attach_half_pages(
/*==================*/
	ulint		flags,		/*!< in: undo logging and
					locking flags */
	dict_index_t*	index,		/*!< in: the index tree */
	buf_block_t*	block,		/*!< in/out: page to be split */
	const rec_t*	split_rec,	/*!< in: first record on upper
					half page */
	buf_block_t*	new_block,	/*!< in/out: the new half page */
	ulint		direction,	/*!< in: FSP_UP or FSP_DOWN */
	mtr_t*		mtr)		/*!< in: mtr */
{
	ulint		space;
	ulint		zip_size;
	ulint		prev_page_no;
	ulint		next_page_no;
	ulint		level;
	page_t*		page		= buf_block_get_frame(block);
	page_t*		lower_page;
	page_t*		upper_page;
	ulint		lower_page_no;
	ulint		upper_page_no;
	page_zip_des_t*	lower_page_zip;
	page_zip_des_t*	upper_page_zip;
	dtuple_t*	node_ptr_upper;
	mem_heap_t*	heap;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX));

	/* Create a memory heap where the data tuple is stored */
	heap = mem_heap_create(1024);

	/* Based on split direction, decide upper and lower pages */
	if (direction == FSP_DOWN) {

		btr_cur_t	cursor;
		ulint*		offsets;

		/* Downward split: the new page becomes the lower half,
		and the parent's existing node pointer (which addresses
		the old page) must be redirected to it. */
		lower_page = buf_block_get_frame(new_block);
		lower_page_no = buf_block_get_page_no(new_block);
		lower_page_zip = buf_block_get_page_zip(new_block);
		upper_page = buf_block_get_frame(block);
		upper_page_no = buf_block_get_page_no(block);
		upper_page_zip = buf_block_get_page_zip(block);

		/* Look up the index for the node pointer to page */
		offsets = btr_page_get_father_block(NULL, heap, index,
						    block, mtr, &cursor);

		/* Replace the address of the old child node (= page) with the
		address of the new lower half */

		btr_node_ptr_set_child_page_no(
			btr_cur_get_rec(&cursor),
			btr_cur_get_page_zip(&cursor),
			offsets, lower_page_no, mtr);
		mem_heap_empty(heap);
	} else {
		/* Upward split: the old page keeps its node pointer and
		stays the lower half; no parent update is needed here. */
		lower_page = buf_block_get_frame(block);
		lower_page_no = buf_block_get_page_no(block);
		lower_page_zip = buf_block_get_page_zip(block);
		upper_page = buf_block_get_frame(new_block);
		upper_page_no = buf_block_get_page_no(new_block);
		upper_page_zip = buf_block_get_page_zip(new_block);
	}

	/* Get the level of the split pages */
	level = btr_page_get_level(buf_block_get_frame(block), mtr);
	ut_ad(level
	      == btr_page_get_level(buf_block_get_frame(new_block), mtr));

	/* Build the node pointer (= node key and page address) for the upper
	half */

	node_ptr_upper = dict_index_build_node_ptr(index, split_rec,
						   upper_page_no, heap, level);

	/* Insert it next to the pointer to the lower half. Note that this
	may generate recursion leading to a split on the higher level. */

	btr_insert_on_non_leaf_level(flags, index, level + 1,
				     node_ptr_upper, mtr);

	/* Free the memory heap */
	mem_heap_free(heap);

	/* Get the previous and next pages of page */

	prev_page_no = btr_page_get_prev(page, mtr);
	next_page_no = btr_page_get_next(page, mtr);
	space = buf_block_get_space(block);
	zip_size = buf_block_get_zip_size(block);

	/* Update page links of the level */

	if (prev_page_no != FIL_NULL) {
		buf_block_t*	prev_block = btr_block_get(
			space, zip_size, prev_page_no, RW_X_LATCH, index, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
		ut_a(btr_page_get_next(prev_block->frame, mtr)
		     == buf_block_get_page_no(block));
#endif /* UNIV_BTR_DEBUG */

		btr_page_set_next(buf_block_get_frame(prev_block),
				  buf_block_get_page_zip(prev_block),
				  lower_page_no, mtr);
	}

	if (next_page_no != FIL_NULL) {
		buf_block_t*	next_block = btr_block_get(
			space, zip_size, next_page_no, RW_X_LATCH, index, mtr);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
		ut_a(btr_page_get_prev(next_block->frame, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		btr_page_set_prev(buf_block_get_frame(next_block),
				  buf_block_get_page_zip(next_block),
				  upper_page_no, mtr);
	}

	/* Finally link the two halves to each other and to the old
	neighbors: prev <-> lower <-> upper <-> next. */
	btr_page_set_prev(lower_page, lower_page_zip, prev_page_no, mtr);
	btr_page_set_next(lower_page, lower_page_zip, upper_page_no, mtr);

	btr_page_set_prev(upper_page, upper_page_zip, lower_page_no, mtr);
	btr_page_set_next(upper_page, upper_page_zip, next_page_no, mtr);
}
2787 
2788 /*************************************************************//**
2789 Determine if a tuple is smaller than any record on the page.
2790 @return TRUE if smaller */
2791 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2792 bool
btr_page_tuple_smaller(btr_cur_t * cursor,const dtuple_t * tuple,ulint ** offsets,ulint n_uniq,mem_heap_t ** heap)2793 btr_page_tuple_smaller(
2794 /*===================*/
2795 	btr_cur_t*	cursor,	/*!< in: b-tree cursor */
2796 	const dtuple_t*	tuple,	/*!< in: tuple to consider */
2797 	ulint**		offsets,/*!< in/out: temporary storage */
2798 	ulint		n_uniq,	/*!< in: number of unique fields
2799 				in the index page records */
2800 	mem_heap_t**	heap)	/*!< in/out: heap for offsets */
2801 {
2802 	buf_block_t*	block;
2803 	const rec_t*	first_rec;
2804 	page_cur_t	pcur;
2805 
2806 	/* Read the first user record in the page. */
2807 	block = btr_cur_get_block(cursor);
2808 	page_cur_set_before_first(block, &pcur);
2809 	page_cur_move_to_next(&pcur);
2810 	first_rec = page_cur_get_rec(&pcur);
2811 
2812 	*offsets = rec_get_offsets(
2813 		first_rec, cursor->index, *offsets,
2814 		n_uniq, heap);
2815 
2816 	return(cmp_dtuple_rec(tuple, first_rec, *offsets) < 0);
2817 }
2818 
/** Insert the tuple into the right sibling page, if the cursor is at the end
of a page.
@param[in]	flags	undo logging and locking flags
@param[in,out]	cursor	cursor at which to insert; when the function succeeds,
			the cursor is positioned before the insert point.
@param[out]	offsets	offsets on inserted record
@param[in,out]	heap	memory heap for allocating offsets
@param[in]	tuple	tuple to insert
@param[in]	n_ext	number of externally stored columns
@param[in,out]	mtr	mini-transaction
@return	inserted record (first record on the right sibling page);
	the cursor will be positioned on the page infimum
@retval	NULL if the operation was not performed */
static
rec_t*
btr_insert_into_right_sibling(
	ulint		flags,
	btr_cur_t*	cursor,
	ulint**		offsets,
	mem_heap_t*	heap,
	const dtuple_t*	tuple,
	ulint		n_ext,
	mtr_t*		mtr)
{
	buf_block_t*	block = btr_cur_get_block(cursor);
	page_t*		page = buf_block_get_frame(block);
	ulint		next_page_no = btr_page_get_next(page, mtr);

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
				MTR_MEMO_X_LOCK));
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(heap);

	/* Only applicable when there is a right sibling and the cursor
	is at the very end of the current page. */
	if (next_page_no == FIL_NULL || !page_rec_is_supremum(
			page_rec_get_next(btr_cur_get_rec(cursor)))) {

		return(NULL);
	}

	page_cur_t	next_page_cursor;
	buf_block_t*	next_block;
	page_t*		next_page;
	btr_cur_t	next_father_cursor;
	rec_t*		rec = NULL;
	ulint		zip_size = buf_block_get_zip_size(block);
	ulint		max_size;

	next_block = btr_block_get(
		buf_block_get_space(block), zip_size,
		next_page_no, RW_X_LATCH, cursor->index, mtr);
	next_page = buf_block_get_frame(next_block);

	bool	is_leaf = page_is_leaf(next_page);

	/* Locate the node pointer to the sibling before modifying it;
	it must be deleted and re-inserted below if the insert succeeds. */
	btr_page_get_father(
		cursor->index, next_block, mtr, &next_father_cursor);

	page_cur_search(
		next_block, cursor->index, tuple, PAGE_CUR_LE,
		&next_page_cursor);

	/* Remember the free space before the insert, for the insert
	buffer bitmap update at the end. */
	max_size = page_get_max_insert_size_after_reorganize(next_page, 1);

	/* Extends gap lock for the next page */
	lock_update_split_left(next_block, block);

	rec = page_cur_tuple_insert(
		&next_page_cursor, tuple, cursor->index, offsets, &heap,
		n_ext, mtr);

	if (rec == NULL) {
		if (zip_size && is_leaf
		    && !dict_index_is_clust(cursor->index)) {
			/* Reset the IBUF_BITMAP_FREE bits, because
			page_cur_tuple_insert() will have attempted page
			reorganize before failing. */
			ibuf_reset_free_bits(next_block);
		}
		return(NULL);
	}

	ibool	compressed;
	dberr_t	err;
	ulint	level = btr_page_get_level(next_page, mtr);

	/* adjust cursor position */
	*btr_cur_get_page_cur(cursor) = next_page_cursor;

	ut_ad(btr_cur_get_rec(cursor) == page_get_infimum_rec(next_page));
	ut_ad(page_rec_get_next(page_get_infimum_rec(next_page)) == rec);

	/* We have to change the parent node pointer */

	compressed = btr_cur_pessimistic_delete(
		&err, TRUE, &next_father_cursor,
		BTR_CREATE_FLAG, RB_NONE, mtr);

	ut_a(err == DB_SUCCESS);

	if (!compressed) {
		btr_cur_compress_if_useful(&next_father_cursor, FALSE, mtr);
	}

	/* Re-insert a node pointer for the sibling, keyed on its new
	first record (the one we just inserted). */
	dtuple_t*	node_ptr = dict_index_build_node_ptr(
		cursor->index, rec, buf_block_get_page_no(next_block),
		heap, level);

	btr_insert_on_non_leaf_level(
		flags, cursor->index, level + 1, node_ptr, mtr);

	ut_ad(rec_offs_validate(rec, cursor->index, *offsets));

	if (is_leaf && !dict_index_is_clust(cursor->index)) {
		/* Update the free bits of the B-tree page in the
		insert buffer bitmap. */

		if (zip_size) {
			ibuf_update_free_bits_zip(next_block, mtr);
		} else {
			ibuf_update_free_bits_if_full(
				next_block, max_size,
				rec_offs_size(*offsets) + PAGE_DIR_SLOT_SIZE);
		}
	}

	return(rec);
}
2946 
/*************************************************************//**
Splits an index page to halves and inserts the tuple. It is assumed
that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
released within this function! NOTE that the operation of this
function must always succeed, we cannot reverse it: therefore enough
free disk space (2 pages) must be guaranteed to be available before
this function is called.

@return inserted record or NULL if run out of space */
UNIV_INTERN
rec_t*
btr_page_split_and_insert(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor at which to insert; when the
				function returns, the cursor is positioned
				on the predecessor of the inserted record */
	ulint**		offsets,/*!< out: offsets on inserted record */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
	const dtuple_t*	tuple,	/*!< in: tuple to insert */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	mtr_t*		mtr)	/*!< in: mtr */
{
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;
	ulint		page_no;
	byte		direction;
	ulint		hint_page_no;
	buf_block_t*	new_block;
	page_t*		new_page;
	page_zip_des_t*	new_page_zip;
	rec_t*		split_rec;
	buf_block_t*	left_block;
	buf_block_t*	right_block;
	buf_block_t*	insert_block;
	page_cur_t*	page_cursor;
	rec_t*		first_rec;
	byte*		buf = 0; /* remove warning */
	rec_t*		move_limit;
	ibool		insert_will_fit;
	ibool		insert_left;
	ulint		n_iterations = 0;
	rec_t*		rec;
	ulint		n_uniq;

	if (!*heap) {
		*heap = mem_heap_create(1024);
	}
	n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
func_start:
	/* We loop back here if the insert does not fit even after the
	split; each iteration performs one more split. */
	mem_heap_empty(*heap);
	*offsets = NULL;

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
				MTR_MEMO_X_LOCK));
	ut_ad(!dict_index_is_online_ddl(cursor->index)
	      || (flags & BTR_CREATE_FLAG)
	      || dict_index_is_clust(cursor->index));
#ifdef UNIV_SYNC_DEBUG
	ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	page_zip = buf_block_get_page_zip(block);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(!page_is_empty(page));

	/* try to insert to the next page if possible before split */
	rec = btr_insert_into_right_sibling(
		flags, cursor, offsets, *heap, tuple, n_ext, mtr);

	if (rec != NULL) {
		return(rec);
	}

	page_no = buf_block_get_page_no(block);

	/* 1. Decide the split record; split_rec == NULL means that the
	tuple to be inserted should be the first record on the upper
	half-page */
	insert_left = FALSE;

	if (n_iterations > 0) {
		/* On a repeated split, split at the insert position
		itself so the record is guaranteed to fit. */
		direction = FSP_UP;
		hint_page_no = page_no + 1;
		split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);

		if (split_rec == NULL) {
			insert_left = btr_page_tuple_smaller(
				cursor, tuple, offsets, n_uniq, heap);
		}
	} else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
		direction = FSP_UP;
		hint_page_no = page_no + 1;

	} else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
		direction = FSP_DOWN;
		hint_page_no = page_no - 1;
		ut_ad(split_rec);
	} else {
		direction = FSP_UP;
		hint_page_no = page_no + 1;

		/* If there is only one record in the index page, we
		can't split the node in the middle by default. We need
		to determine whether the new record will be inserted
		to the left or right. */

		if (page_get_n_recs(page) > 1) {
			split_rec = page_get_middle_rec(page);
		} else if (btr_page_tuple_smaller(cursor, tuple,
						  offsets, n_uniq, heap)) {
			split_rec = page_rec_get_next(
				page_get_infimum_rec(page));
		} else {
			split_rec = NULL;
		}
	}

	DBUG_EXECUTE_IF("disk_is_full",
			os_has_said_disk_full = true;
                        return(NULL););

	/* 2. Allocate a new page to the index */
	new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
				   btr_page_get_level(page, mtr), mtr, mtr);

	if (new_block == NULL && os_has_said_disk_full) {
		return(NULL);
        }

	new_page = buf_block_get_frame(new_block);
	new_page_zip = buf_block_get_page_zip(new_block);
	btr_page_create(new_block, new_page_zip, cursor->index,
			btr_page_get_level(page, mtr), mtr);

	/* 3. Calculate the first record on the upper half-page, and the
	first record (move_limit) on original page which ends up on the
	upper half */

	if (split_rec) {
		first_rec = move_limit = split_rec;

		*offsets = rec_get_offsets(split_rec, cursor->index, *offsets,
					   n_uniq, heap);

		insert_left = cmp_dtuple_rec(tuple, split_rec, *offsets) < 0;

		if (!insert_left && new_page_zip && n_iterations > 0) {
			/* If a compressed page has already been split,
			avoid further splits by inserting the record
			to an empty page. */
			split_rec = NULL;
			goto insert_empty;
		}
	} else if (insert_left) {
		ut_a(n_iterations > 0);
		first_rec = page_rec_get_next(page_get_infimum_rec(page));
		move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
	} else {
insert_empty:
		ut_ad(!split_rec);
		ut_ad(!insert_left);
		/* The tuple itself will be the first record on the upper
		half; convert it so it can serve as the node-pointer key. */
		buf = (byte*) mem_alloc(rec_get_converted_size(cursor->index,
							       tuple, n_ext));

		first_rec = rec_convert_dtuple_to_rec(buf, cursor->index,
						      tuple, n_ext);
		move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
	}

	/* 4. Do first the modifications in the tree structure */

	btr_attach_half_pages(flags, cursor->index, block,
			      first_rec, new_block, direction, mtr);

	/* If the split is made on the leaf level and the insert will fit
	on the appropriate half-page, we may release the tree x-latch.
	We can then move the records after releasing the tree latch,
	thus reducing the tree latch contention. */

	if (split_rec) {
		insert_will_fit = !new_page_zip
			&& btr_page_insert_fits(cursor, split_rec,
						offsets, tuple, n_ext, heap);
	} else {
		if (!insert_left) {
			mem_free(buf);
			buf = NULL;
		}

		insert_will_fit = !new_page_zip
			&& btr_page_insert_fits(cursor, NULL,
						offsets, tuple, n_ext, heap);
	}

	if (insert_will_fit && page_is_leaf(page)
	    && !dict_index_is_online_ddl(cursor->index)) {

		mtr_memo_release(mtr, dict_index_get_lock(cursor->index),
				 MTR_MEMO_X_LOCK);
	}

	/* 5. Move then the records to the new page */
	if (direction == FSP_DOWN) {
		/*		fputs("Split left\n", stderr); */

		if (0
#ifdef UNIV_ZIP_COPY
		    || page_zip
#endif /* UNIV_ZIP_COPY */
		    || !page_move_rec_list_start(new_block, block, move_limit,
						 cursor->index, mtr)) {
			/* For some reason, compressing new_page failed,
			even though it should contain fewer records than
			the original page.  Copy the page byte for byte
			and then delete the records from both pages
			as appropriate.  Deleting will always succeed. */
			ut_a(new_page_zip);

			page_zip_copy_recs(new_page_zip, new_page,
					   page_zip, page, cursor->index, mtr);
			page_delete_rec_list_end(move_limit - page + new_page,
						 new_block, cursor->index,
						 ULINT_UNDEFINED,
						 ULINT_UNDEFINED, mtr);

			/* Update the lock table and possible hash index. */

			lock_move_rec_list_start(
				new_block, block, move_limit,
				new_page + PAGE_NEW_INFIMUM);

			btr_search_move_or_delete_hash_entries(
				new_block, block, cursor->index);

			/* Delete the records from the source page. */

			page_delete_rec_list_start(move_limit, block,
						   cursor->index, mtr);
		}

		left_block = new_block;
		right_block = block;

		lock_update_split_left(right_block, left_block);
	} else {
		/*		fputs("Split right\n", stderr); */

		if (0
#ifdef UNIV_ZIP_COPY
		    || page_zip
#endif /* UNIV_ZIP_COPY */
		    || !page_move_rec_list_end(new_block, block, move_limit,
					       cursor->index, mtr)) {
			/* For some reason, compressing new_page failed,
			even though it should contain fewer records than
			the original page.  Copy the page byte for byte
			and then delete the records from both pages
			as appropriate.  Deleting will always succeed. */
			ut_a(new_page_zip);

			page_zip_copy_recs(new_page_zip, new_page,
					   page_zip, page, cursor->index, mtr);
			page_delete_rec_list_start(move_limit - page
						   + new_page, new_block,
						   cursor->index, mtr);

			/* Update the lock table and possible hash index. */

			lock_move_rec_list_end(new_block, block, move_limit);

			btr_search_move_or_delete_hash_entries(
				new_block, block, cursor->index);

			/* Delete the records from the source page. */

			page_delete_rec_list_end(move_limit, block,
						 cursor->index,
						 ULINT_UNDEFINED,
						 ULINT_UNDEFINED, mtr);
		}

		left_block = block;
		right_block = new_block;

		lock_update_split_right(right_block, left_block);
	}

#ifdef UNIV_ZIP_DEBUG
	if (page_zip) {
		ut_a(page_zip_validate(page_zip, page, cursor->index));
		ut_a(page_zip_validate(new_page_zip, new_page, cursor->index));
	}
#endif /* UNIV_ZIP_DEBUG */

	/* At this point, split_rec, move_limit and first_rec may point
	to garbage on the old page. */

	/* 6. The split and the tree modification is now completed. Decide the
	page where the tuple should be inserted */

	if (insert_left) {
		insert_block = left_block;
	} else {
		insert_block = right_block;
	}

	/* 7. Reposition the cursor for insert and try insertion */
	page_cursor = btr_cur_get_page_cur(cursor);

	page_cur_search(insert_block, cursor->index, tuple,
			PAGE_CUR_LE, page_cursor);

	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
				    offsets, heap, n_ext, mtr);

#ifdef UNIV_ZIP_DEBUG
	{
		page_t*		insert_page
			= buf_block_get_frame(insert_block);

		page_zip_des_t*	insert_page_zip
			= buf_block_get_page_zip(insert_block);

		ut_a(!insert_page_zip
		     || page_zip_validate(insert_page_zip, insert_page,
					  cursor->index));
	}
#endif /* UNIV_ZIP_DEBUG */

	if (rec != NULL) {

		goto func_exit;
	}

	/* 8. If insert did not fit, try page reorganization.
	For compressed pages, page_cur_tuple_insert() will have
	attempted this already. */

	if (page_cur_get_page_zip(page_cursor)
	    || !btr_page_reorganize(page_cursor, cursor->index, mtr)) {

		goto insert_failed;
	}

	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
				    offsets, heap, n_ext, mtr);

	if (rec == NULL) {
		/* The insert did not fit on the page: loop back to the
		start of the function for a new split */
insert_failed:
		/* We play safe and reset the free bits */
		if (!dict_index_is_clust(cursor->index)) {
			ibuf_reset_free_bits(new_block);
			ibuf_reset_free_bits(block);
		}

		/* fprintf(stderr, "Split second round %lu\n",
		page_get_page_no(page)); */
		n_iterations++;
		ut_ad(n_iterations < 2
		      || buf_block_get_page_zip(insert_block));
		ut_ad(!insert_will_fit);

		goto func_start;
	}

func_exit:
	/* Insert fit on the page: update the free bits for the
	left and right pages in the same mtr */

	if (!dict_index_is_clust(cursor->index) && page_is_leaf(page)) {
		ibuf_update_free_bits_for_two_pages_low(
			buf_block_get_zip_size(left_block),
			left_block, right_block, mtr);
	}

#if 0
	fprintf(stderr, "Split and insert done %lu %lu\n",
		buf_block_get_page_no(left_block),
		buf_block_get_page_no(right_block));
#endif
	MONITOR_INC(MONITOR_INDEX_SPLIT);

	ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
	ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));

	ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
	return(rec);
}
3342 
#ifdef UNIV_SYNC_DEBUG
/*************************************************************//**
Removes a page from the level list of pages.
@param space	in: space where removed
@param zip_size	in: compressed page size in bytes, or 0 for uncompressed
@param page	in/out: page to remove
@param index	in: index tree
@param mtr	in/out: mini-transaction */
/* NOTE: the index argument is forwarded to btr_level_list_remove_func()
only in UNIV_SYNC_DEBUG builds, matching the #ifdef'd parameter in that
function's definition below; non-debug builds drop it. */
# define btr_level_list_remove(space,zip_size,page,index,mtr)		\
	btr_level_list_remove_func(space,zip_size,page,index,mtr)
#else /* UNIV_SYNC_DEBUG */
/*************************************************************//**
Removes a page from the level list of pages.
@param space	in: space where removed
@param zip_size	in: compressed page size in bytes, or 0 for uncompressed
@param page	in/out: page to remove
@param index	in: index tree
@param mtr	in/out: mini-transaction */
# define btr_level_list_remove(space,zip_size,page,index,mtr)		\
	btr_level_list_remove_func(space,zip_size,page,mtr)
#endif /* UNIV_SYNC_DEBUG */
3364 
/*************************************************************//**
Removes a page from the level list of pages: makes its previous and next
neighbors (if any) point to each other, bypassing the page. The page's
own prev/next pointers are left unchanged. */
static MY_ATTRIBUTE((nonnull))
void
btr_level_list_remove_func(
/*=======================*/
	ulint			space,	/*!< in: space where removed */
	ulint			zip_size,/*!< in: compressed page size in bytes
					or 0 for uncompressed pages */
	page_t*			page,	/*!< in/out: page to remove */
#ifdef UNIV_SYNC_DEBUG
	const dict_index_t*	index,	/*!< in: index tree */
#endif /* UNIV_SYNC_DEBUG */
	mtr_t*			mtr)	/*!< in/out: mini-transaction */
{
	ulint	prev_page_no;
	ulint	next_page_no;

	ut_ad(page != NULL);
	ut_ad(mtr != NULL);
	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
	ut_ad(space == page_get_space_id(page));
	/* Get the previous and next page numbers of page */

	prev_page_no = btr_page_get_prev(page, mtr);
	next_page_no = btr_page_get_next(page, mtr);

	/* Update page links of the level */

	if (prev_page_no != FIL_NULL) {
		buf_block_t*	prev_block
			= btr_block_get(space, zip_size, prev_page_no,
					RW_X_LATCH, index, mtr);
		page_t*		prev_page
			= buf_block_get_frame(prev_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(prev_page) == page_is_comp(page));
		ut_a(btr_page_get_next(prev_page, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		/* Link the previous neighbor directly to the next page. */
		btr_page_set_next(prev_page,
				  buf_block_get_page_zip(prev_block),
				  next_page_no, mtr);
	}

	if (next_page_no != FIL_NULL) {
		buf_block_t*	next_block
			= btr_block_get(space, zip_size, next_page_no,
					RW_X_LATCH, index, mtr);
		page_t*		next_page
			= buf_block_get_frame(next_block);
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(next_page) == page_is_comp(page));
		ut_a(btr_page_get_prev(next_page, mtr)
		     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */

		/* Link the next neighbor directly to the previous page. */
		btr_page_set_prev(next_page,
				  buf_block_get_page_zip(next_block),
				  prev_page_no, mtr);
	}
}
3428 
/****************************************************************//**
Writes the redo log record for setting an index record as the predefined
minimum record. The log body is the 2-byte page offset of the record. */
UNIV_INLINE
void
btr_set_min_rec_mark_log(
/*=====================*/
	rec_t*	rec,	/*!< in: record */
	byte	type,	/*!< in: MLOG_COMP_REC_MIN_MARK or MLOG_REC_MIN_MARK */
	mtr_t*	mtr)	/*!< in: mtr */
{
	mlog_write_initial_log_record(rec, type, mtr);

	/* Write rec offset as a 2-byte ulint */
	mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES);
}
#else /* !UNIV_HOTBACKUP */
/* Hot backup builds write no redo log: make the call a no-op. */
# define btr_set_min_rec_mark_log(rec,comp,mtr) ((void) 0)
#endif /* !UNIV_HOTBACKUP */
3448 
3449 /****************************************************************//**
3450 Parses the redo log record for setting an index record as the predefined
3451 minimum record.
3452 @return	end of log record or NULL */
3453 UNIV_INTERN
3454 byte*
btr_parse_set_min_rec_mark(byte * ptr,byte * end_ptr,ulint comp,page_t * page,mtr_t * mtr)3455 btr_parse_set_min_rec_mark(
3456 /*=======================*/
3457 	byte*	ptr,	/*!< in: buffer */
3458 	byte*	end_ptr,/*!< in: buffer end */
3459 	ulint	comp,	/*!< in: nonzero=compact page format */
3460 	page_t*	page,	/*!< in: page or NULL */
3461 	mtr_t*	mtr)	/*!< in: mtr or NULL */
3462 {
3463 	rec_t*	rec;
3464 
3465 	if (end_ptr < ptr + 2) {
3466 
3467 		return(NULL);
3468 	}
3469 
3470 	if (page) {
3471 		ut_a(!page_is_comp(page) == !comp);
3472 
3473 		rec = page + mach_read_from_2(ptr);
3474 
3475 		btr_set_min_rec_mark(rec, mtr);
3476 	}
3477 
3478 	return(ptr + 2);
3479 }
3480 
3481 /****************************************************************//**
3482 Sets a record as the predefined minimum record. */
3483 UNIV_INTERN
3484 void
btr_set_min_rec_mark(rec_t * rec,mtr_t * mtr)3485 btr_set_min_rec_mark(
3486 /*=================*/
3487 	rec_t*	rec,	/*!< in: record */
3488 	mtr_t*	mtr)	/*!< in: mtr */
3489 {
3490 	ulint	info_bits;
3491 
3492 	if (page_rec_is_comp(rec)) {
3493 		info_bits = rec_get_info_bits(rec, TRUE);
3494 
3495 		rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG);
3496 
3497 		btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr);
3498 	} else {
3499 		info_bits = rec_get_info_bits(rec, FALSE);
3500 
3501 		rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG);
3502 
3503 		btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr);
3504 	}
3505 }
3506 
3507 #ifndef UNIV_HOTBACKUP
3508 /*************************************************************//**
3509 Deletes on the upper level the node pointer to a page. */
3510 UNIV_INTERN
3511 void
btr_node_ptr_delete(dict_index_t * index,buf_block_t * block,mtr_t * mtr)3512 btr_node_ptr_delete(
3513 /*================*/
3514 	dict_index_t*	index,	/*!< in: index tree */
3515 	buf_block_t*	block,	/*!< in: page whose node pointer is deleted */
3516 	mtr_t*		mtr)	/*!< in: mtr */
3517 {
3518 	btr_cur_t	cursor;
3519 	ibool		compressed;
3520 	dberr_t		err;
3521 
3522 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3523 
3524 	/* Delete node pointer on father page */
3525 	btr_page_get_father(index, block, mtr, &cursor);
3526 
3527 	compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor,
3528 						BTR_CREATE_FLAG, RB_NONE, mtr);
3529 	ut_a(err == DB_SUCCESS);
3530 
3531 	if (!compressed) {
3532 		btr_cur_compress_if_useful(&cursor, FALSE, mtr);
3533 	}
3534 }
3535 
3536 /*************************************************************//**
3537 If page is the only on its level, this function moves its records to the
3538 father page, thus reducing the tree height.
3539 @return father block */
3540 static
3541 buf_block_t*
btr_lift_page_up(dict_index_t * index,buf_block_t * block,mtr_t * mtr)3542 btr_lift_page_up(
3543 /*=============*/
3544 	dict_index_t*	index,	/*!< in: index tree */
3545 	buf_block_t*	block,	/*!< in: page which is the only on its level;
3546 				must not be empty: use
3547 				btr_discard_only_page_on_level if the last
3548 				record from the page should be removed */
3549 	mtr_t*		mtr)	/*!< in: mtr */
3550 {
3551 	buf_block_t*	father_block;
3552 	page_t*		father_page;
3553 	ulint		page_level;
3554 	page_zip_des_t*	father_page_zip;
3555 	page_t*		page		= buf_block_get_frame(block);
3556 	ulint		root_page_no;
3557 	buf_block_t*	blocks[BTR_MAX_LEVELS];
3558 	ulint		n_blocks;	/*!< last used index in blocks[] */
3559 	ulint		i;
3560 	bool		lift_father_up;
3561 	buf_block_t*	block_orig	= block;
3562 
3563 	ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
3564 	ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
3565 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3566 
3567 	page_level = btr_page_get_level(page, mtr);
3568 	root_page_no = dict_index_get_page(index);
3569 
3570 	{
3571 		btr_cur_t	cursor;
3572 		ulint*		offsets	= NULL;
3573 		mem_heap_t*	heap	= mem_heap_create(
3574 			sizeof(*offsets)
3575 			* (REC_OFFS_HEADER_SIZE + 1 + 1 + index->n_fields));
3576 		buf_block_t*	b;
3577 
3578 		offsets = btr_page_get_father_block(offsets, heap, index,
3579 						    block, mtr, &cursor);
3580 		father_block = btr_cur_get_block(&cursor);
3581 		father_page_zip = buf_block_get_page_zip(father_block);
3582 		father_page = buf_block_get_frame(father_block);
3583 
3584 		n_blocks = 0;
3585 
3586 		/* Store all ancestor pages so we can reset their
3587 		levels later on.  We have to do all the searches on
3588 		the tree now because later on, after we've replaced
3589 		the first level, the tree is in an inconsistent state
3590 		and can not be searched. */
3591 		for (b = father_block;
3592 		     buf_block_get_page_no(b) != root_page_no; ) {
3593 			ut_a(n_blocks < BTR_MAX_LEVELS);
3594 
3595 			offsets = btr_page_get_father_block(offsets, heap,
3596 							    index, b,
3597 							    mtr, &cursor);
3598 
3599 			blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
3600 		}
3601 
3602 		lift_father_up = (n_blocks && page_level == 0);
3603 		if (lift_father_up) {
3604 			/* The father page also should be the only on its level (not
3605 			root). We should lift up the father page at first.
3606 			Because the leaf page should be lifted up only for root page.
3607 			The freeing page is based on page_level (==0 or !=0)
3608 			to choose segment. If the page_level is changed ==0 from !=0,
3609 			later freeing of the page doesn't find the page allocation
3610 			to be freed.*/
3611 
3612 			block = father_block;
3613 			page = buf_block_get_frame(block);
3614 			page_level = btr_page_get_level(page, mtr);
3615 
3616 			ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
3617 			ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
3618 			ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3619 
3620 			father_block = blocks[0];
3621 			father_page_zip = buf_block_get_page_zip(father_block);
3622 			father_page = buf_block_get_frame(father_block);
3623 		}
3624 
3625 		mem_heap_free(heap);
3626 	}
3627 
3628 	btr_search_drop_page_hash_index(block);
3629 
3630 	/* Make the father empty */
3631 	btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
3632 	page_level++;
3633 
3634 	/* Copy the records to the father page one by one. */
3635 	if (0
3636 #ifdef UNIV_ZIP_COPY
3637 	    || father_page_zip
3638 #endif /* UNIV_ZIP_COPY */
3639 	    || !page_copy_rec_list_end(father_block, block,
3640 				       page_get_infimum_rec(page),
3641 				       index, mtr)) {
3642 		const page_zip_des_t*	page_zip
3643 			= buf_block_get_page_zip(block);
3644 		ut_a(father_page_zip);
3645 		ut_a(page_zip);
3646 
3647 		/* Copy the page byte for byte. */
3648 		page_zip_copy_recs(father_page_zip, father_page,
3649 				   page_zip, page, index, mtr);
3650 
3651 		/* Update the lock table and possible hash index. */
3652 
3653 		lock_move_rec_list_end(father_block, block,
3654 				       page_get_infimum_rec(page));
3655 
3656 		btr_search_move_or_delete_hash_entries(father_block, block,
3657 						       index);
3658 	}
3659 
3660 	btr_blob_dbg_remove(page, index, "btr_lift_page_up");
3661 	lock_update_copy_and_discard(father_block, block);
3662 
3663 	/* Go upward to root page, decrementing levels by one. */
3664 	for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) {
3665 		page_t*		page	= buf_block_get_frame(blocks[i]);
3666 		page_zip_des_t*	page_zip= buf_block_get_page_zip(blocks[i]);
3667 
3668 		ut_ad(btr_page_get_level(page, mtr) == page_level + 1);
3669 
3670 		btr_page_set_level(page, page_zip, page_level, mtr);
3671 #ifdef UNIV_ZIP_DEBUG
3672 		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
3673 #endif /* UNIV_ZIP_DEBUG */
3674 	}
3675 
3676 	/* Free the file page */
3677 	btr_page_free(index, block, mtr);
3678 
3679 	/* We play it safe and reset the free bits for the father */
3680 	if (!dict_index_is_clust(index)) {
3681 		ibuf_reset_free_bits(father_block);
3682 	}
3683 	ut_ad(page_validate(father_page, index));
3684 	ut_ad(btr_check_node_ptr(index, father_block, mtr));
3685 
3686 	return(lift_father_up ? block_orig : father_block);
3687 }
3688 
3689 /*************************************************************//**
3690 Tries to merge the page first to the left immediate brother if such a
3691 brother exists, and the node pointers to the current page and to the brother
3692 reside on the same page. If the left brother does not satisfy these
3693 conditions, looks at the right brother. If the page is the only one on that
3694 level lifts the records of the page to the father page, thus reducing the
3695 tree height. It is assumed that mtr holds an x-latch on the tree and on the
3696 page. If cursor is on the leaf level, mtr must also hold x-latches to the
3697 brothers, if they exist.
3698 @return	TRUE on success */
3699 UNIV_INTERN
3700 ibool
btr_compress(btr_cur_t * cursor,ibool adjust,mtr_t * mtr)3701 btr_compress(
3702 /*=========*/
3703 	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to merge
3704 				or lift; the page must not be empty:
3705 				when deleting records, use btr_discard_page()
3706 				if the page would become empty */
3707 	ibool		adjust,	/*!< in: TRUE if should adjust the
3708 				cursor position even if compression occurs */
3709 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
3710 {
3711 	dict_index_t*	index;
3712 	ulint		space;
3713 	ulint		zip_size;
3714 	ulint		left_page_no;
3715 	ulint		right_page_no;
3716 	buf_block_t*	merge_block;
3717 	page_t*		merge_page = NULL;
3718 	page_zip_des_t*	merge_page_zip;
3719 	ibool		is_left;
3720 	buf_block_t*	block;
3721 	page_t*		page;
3722 	btr_cur_t	father_cursor;
3723 	mem_heap_t*	heap;
3724 	ulint*		offsets;
3725 	ulint		nth_rec = 0; /* remove bogus warning */
3726 	DBUG_ENTER("btr_compress");
3727 
3728 	block = btr_cur_get_block(cursor);
3729 	page = btr_cur_get_page(cursor);
3730 	index = btr_cur_get_index(cursor);
3731 
3732 	btr_assert_not_corrupted(block, index);
3733 
3734 	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
3735 				MTR_MEMO_X_LOCK));
3736 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3737 	space = dict_index_get_space(index);
3738 	zip_size = dict_table_zip_size(index->table);
3739 
3740 	MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS);
3741 
3742 	left_page_no = btr_page_get_prev(page, mtr);
3743 	right_page_no = btr_page_get_next(page, mtr);
3744 
3745 #ifdef UNIV_DEBUG
3746 	if (!page_is_leaf(page) && left_page_no == FIL_NULL) {
3747 		ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
3748 			page_rec_get_next(page_get_infimum_rec(page)),
3749 			page_is_comp(page)));
3750 	}
3751 #endif /* UNIV_DEBUG */
3752 
3753 	heap = mem_heap_create(100);
3754 	offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
3755 					    &father_cursor);
3756 
3757 	if (adjust) {
3758 		nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
3759 		ut_ad(nth_rec > 0);
3760 	}
3761 
3762 	if (left_page_no == FIL_NULL && right_page_no == FIL_NULL) {
3763 		/* The page is the only one on the level, lift the records
3764 		to the father */
3765 
3766 		merge_block = btr_lift_page_up(index, block, mtr);
3767 		goto func_exit;
3768 	}
3769 
3770 	/* Decide the page to which we try to merge and which will inherit
3771 	the locks */
3772 
3773 	is_left = btr_can_merge_with_page(cursor, left_page_no,
3774 					  &merge_block, mtr);
3775 
3776 	DBUG_EXECUTE_IF("ib_always_merge_right", is_left = FALSE;);
3777 
3778 	if(!is_left
3779 	   && !btr_can_merge_with_page(cursor, right_page_no, &merge_block,
3780 				       mtr)) {
3781 		goto err_exit;
3782 	}
3783 
3784 	merge_page = buf_block_get_frame(merge_block);
3785 
3786 #ifdef UNIV_BTR_DEBUG
3787 	if (is_left) {
3788                 ut_a(btr_page_get_next(merge_page, mtr)
3789                      == buf_block_get_page_no(block));
3790 	} else {
3791                ut_a(btr_page_get_prev(merge_page, mtr)
3792                      == buf_block_get_page_no(block));
3793 	}
3794 #endif /* UNIV_BTR_DEBUG */
3795 
3796 	ut_ad(page_validate(merge_page, index));
3797 
3798 	merge_page_zip = buf_block_get_page_zip(merge_block);
3799 #ifdef UNIV_ZIP_DEBUG
3800 	if (merge_page_zip) {
3801 		const page_zip_des_t*	page_zip
3802 			= buf_block_get_page_zip(block);
3803 		ut_a(page_zip);
3804 		ut_a(page_zip_validate(merge_page_zip, merge_page, index));
3805 		ut_a(page_zip_validate(page_zip, page, index));
3806 	}
3807 #endif /* UNIV_ZIP_DEBUG */
3808 
3809 	/* Move records to the merge page */
3810 	if (is_left) {
3811 		rec_t*	orig_pred = page_copy_rec_list_start(
3812 			merge_block, block, page_get_supremum_rec(page),
3813 			index, mtr);
3814 
3815 		if (!orig_pred) {
3816 			goto err_exit;
3817 		}
3818 
3819 		btr_search_drop_page_hash_index(block);
3820 
3821 		/* Remove the page from the level list */
3822 		btr_level_list_remove(space, zip_size, page, index, mtr);
3823 
3824 		btr_node_ptr_delete(index, block, mtr);
3825 		lock_update_merge_left(merge_block, orig_pred, block);
3826 
3827 		if (adjust) {
3828 			nth_rec += page_rec_get_n_recs_before(orig_pred);
3829 		}
3830 	} else {
3831 		rec_t*		orig_succ;
3832 		ibool		compressed;
3833 		dberr_t		err;
3834 		btr_cur_t	cursor2;
3835 					/* father cursor pointing to node ptr
3836 					of the right sibling */
3837 #ifdef UNIV_BTR_DEBUG
3838 		byte		fil_page_prev[4];
3839 #endif /* UNIV_BTR_DEBUG */
3840 
3841 		btr_page_get_father(index, merge_block, mtr, &cursor2);
3842 
3843 		if (merge_page_zip && left_page_no == FIL_NULL) {
3844 
3845 			/* The function page_zip_compress(), which will be
3846 			invoked by page_copy_rec_list_end() below,
3847 			requires that FIL_PAGE_PREV be FIL_NULL.
3848 			Clear the field, but prepare to restore it. */
3849 #ifdef UNIV_BTR_DEBUG
3850 			memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
3851 #endif /* UNIV_BTR_DEBUG */
3852 #if FIL_NULL != 0xffffffff
3853 # error "FIL_NULL != 0xffffffff"
3854 #endif
3855 			memset(merge_page + FIL_PAGE_PREV, 0xff, 4);
3856 		}
3857 
3858 		orig_succ = page_copy_rec_list_end(merge_block, block,
3859 						   page_get_infimum_rec(page),
3860 						   cursor->index, mtr);
3861 
3862 		if (!orig_succ) {
3863 			ut_a(merge_page_zip);
3864 #ifdef UNIV_BTR_DEBUG
3865 			if (left_page_no == FIL_NULL) {
3866 				/* FIL_PAGE_PREV was restored from
3867 				merge_page_zip. */
3868 				ut_a(!memcmp(fil_page_prev,
3869 					     merge_page + FIL_PAGE_PREV, 4));
3870 			}
3871 #endif /* UNIV_BTR_DEBUG */
3872 			goto err_exit;
3873 		}
3874 
3875 		btr_search_drop_page_hash_index(block);
3876 
3877 #ifdef UNIV_BTR_DEBUG
3878 		if (merge_page_zip && left_page_no == FIL_NULL) {
3879 
3880 			/* Restore FIL_PAGE_PREV in order to avoid an assertion
3881 			failure in btr_level_list_remove(), which will set
3882 			the field again to FIL_NULL.  Even though this makes
3883 			merge_page and merge_page_zip inconsistent for a
3884 			split second, it is harmless, because the pages
3885 			are X-latched. */
3886 			memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4);
3887 		}
3888 #endif /* UNIV_BTR_DEBUG */
3889 
3890 		/* Remove the page from the level list */
3891 		btr_level_list_remove(space, zip_size, page, index, mtr);
3892 
3893 		/* Replace the address of the old child node (= page) with the
3894 		address of the merge page to the right */
3895 		btr_node_ptr_set_child_page_no(
3896 			btr_cur_get_rec(&father_cursor),
3897 			btr_cur_get_page_zip(&father_cursor),
3898 			offsets, right_page_no, mtr);
3899 
3900 		compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor2,
3901 							BTR_CREATE_FLAG,
3902 							RB_NONE, mtr);
3903 		ut_a(err == DB_SUCCESS);
3904 
3905 		if (!compressed) {
3906 			btr_cur_compress_if_useful(&cursor2, FALSE, mtr);
3907 		}
3908 
3909 		lock_update_merge_right(merge_block, orig_succ, block);
3910 	}
3911 
3912 	btr_blob_dbg_remove(page, index, "btr_compress");
3913 
3914 	if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) {
3915 		/* Update the free bits of the B-tree page in the
3916 		insert buffer bitmap.  This has to be done in a
3917 		separate mini-transaction that is committed before the
3918 		main mini-transaction.  We cannot update the insert
3919 		buffer bitmap in this mini-transaction, because
3920 		btr_compress() can be invoked recursively without
3921 		committing the mini-transaction in between.  Since
3922 		insert buffer bitmap pages have a lower rank than
3923 		B-tree pages, we must not access other pages in the
3924 		same mini-transaction after accessing an insert buffer
3925 		bitmap page. */
3926 
3927 		/* The free bits in the insert buffer bitmap must
3928 		never exceed the free space on a page.  It is safe to
3929 		decrement or reset the bits in the bitmap in a
3930 		mini-transaction that is committed before the
3931 		mini-transaction that affects the free space. */
3932 
3933 		/* It is unsafe to increment the bits in a separately
3934 		committed mini-transaction, because in crash recovery,
3935 		the free bits could momentarily be set too high. */
3936 
3937 		if (zip_size) {
3938 			/* Because the free bits may be incremented
3939 			and we cannot update the insert buffer bitmap
3940 			in the same mini-transaction, the only safe
3941 			thing we can do here is the pessimistic
3942 			approach: reset the free bits. */
3943 			ibuf_reset_free_bits(merge_block);
3944 		} else {
3945 			/* On uncompressed pages, the free bits will
3946 			never increase here.  Thus, it is safe to
3947 			write the bits accurately in a separate
3948 			mini-transaction. */
3949 			ibuf_update_free_bits_if_full(merge_block,
3950 						      UNIV_PAGE_SIZE,
3951 						      ULINT_UNDEFINED);
3952 		}
3953 	}
3954 
3955 	ut_ad(page_validate(merge_page, index));
3956 #ifdef UNIV_ZIP_DEBUG
3957 	ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page,
3958 						  index));
3959 #endif /* UNIV_ZIP_DEBUG */
3960 
3961 	/* Free the file page */
3962 	btr_page_free(index, block, mtr);
3963 
3964 	ut_ad(btr_check_node_ptr(index, merge_block, mtr));
3965 func_exit:
3966 	mem_heap_free(heap);
3967 
3968 	if (adjust) {
3969 		ut_ad(nth_rec > 0);
3970 		btr_cur_position(
3971 			index,
3972 			page_rec_get_nth(merge_block->frame, nth_rec),
3973 			merge_block, cursor);
3974 	}
3975 
3976 	MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL);
3977 
3978 	DBUG_RETURN(TRUE);
3979 
3980 err_exit:
3981 	/* We play it safe and reset the free bits. */
3982 	if (zip_size
3983 	    && merge_page
3984 	    && page_is_leaf(merge_page)
3985 	    && !dict_index_is_clust(index)) {
3986 		ibuf_reset_free_bits(merge_block);
3987 	}
3988 
3989 	mem_heap_free(heap);
3990 	DBUG_RETURN(FALSE);
3991 }
3992 
3993 /*************************************************************//**
3994 Discards a page that is the only page on its level.  This will empty
3995 the whole B-tree, leaving just an empty root page.  This function
3996 should never be reached, because btr_compress(), which is invoked in
3997 delete operations, calls btr_lift_page_up() to flatten the B-tree. */
3998 static
3999 void
btr_discard_only_page_on_level(dict_index_t * index,buf_block_t * block,mtr_t * mtr)4000 btr_discard_only_page_on_level(
4001 /*===========================*/
4002 	dict_index_t*	index,	/*!< in: index tree */
4003 	buf_block_t*	block,	/*!< in: page which is the only on its level */
4004 	mtr_t*		mtr)	/*!< in: mtr */
4005 {
4006 	ulint		page_level = 0;
4007 	trx_id_t	max_trx_id;
4008 
4009 	/* Save the PAGE_MAX_TRX_ID from the leaf page. */
4010 	max_trx_id = page_get_max_trx_id(buf_block_get_frame(block));
4011 
4012 	while (buf_block_get_page_no(block) != dict_index_get_page(index)) {
4013 		btr_cur_t	cursor;
4014 		buf_block_t*	father;
4015 		const page_t*	page	= buf_block_get_frame(block);
4016 
4017 		ut_a(page_get_n_recs(page) == 1);
4018 		ut_a(page_level == btr_page_get_level(page, mtr));
4019 		ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
4020 		ut_a(btr_page_get_next(page, mtr) == FIL_NULL);
4021 
4022 		ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
4023 		btr_search_drop_page_hash_index(block);
4024 
4025 		btr_page_get_father(index, block, mtr, &cursor);
4026 		father = btr_cur_get_block(&cursor);
4027 
4028 		lock_update_discard(father, PAGE_HEAP_NO_SUPREMUM, block);
4029 
4030 		/* Free the file page */
4031 		btr_page_free(index, block, mtr);
4032 
4033 		block = father;
4034 		page_level++;
4035 	}
4036 
4037 	/* block is the root page, which must be empty, except
4038 	for the node pointer to the (now discarded) block(s). */
4039 
4040 #ifdef UNIV_BTR_DEBUG
4041 	if (!dict_index_is_ibuf(index)) {
4042 		const page_t*	root	= buf_block_get_frame(block);
4043 		const ulint	space	= dict_index_get_space(index);
4044 		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
4045 					    + root, space));
4046 		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
4047 					    + root, space));
4048 	}
4049 #endif /* UNIV_BTR_DEBUG */
4050 
4051 	btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr);
4052 	ut_ad(page_is_leaf(buf_block_get_frame(block)));
4053 
4054 	if (!dict_index_is_clust(index)) {
4055 		/* We play it safe and reset the free bits for the root */
4056 		ibuf_reset_free_bits(block);
4057 
4058 		ut_a(max_trx_id);
4059 		page_set_max_trx_id(block,
4060 				    buf_block_get_page_zip(block),
4061 				    max_trx_id, mtr);
4062 	}
4063 }
4064 
4065 /*************************************************************//**
4066 Discards a page from a B-tree. This is used to remove the last record from
4067 a B-tree page: the whole page must be removed at the same time. This cannot
4068 be used for the root page, which is allowed to be empty. */
4069 UNIV_INTERN
4070 void
btr_discard_page(btr_cur_t * cursor,mtr_t * mtr)4071 btr_discard_page(
4072 /*=============*/
4073 	btr_cur_t*	cursor,	/*!< in: cursor on the page to discard: not on
4074 				the root page */
4075 	mtr_t*		mtr)	/*!< in: mtr */
4076 {
4077 	dict_index_t*	index;
4078 	ulint		space;
4079 	ulint		zip_size;
4080 	ulint		left_page_no;
4081 	ulint		right_page_no;
4082 	buf_block_t*	merge_block;
4083 	page_t*		merge_page;
4084 	buf_block_t*	block;
4085 	page_t*		page;
4086 	rec_t*		node_ptr;
4087 
4088 	block = btr_cur_get_block(cursor);
4089 	index = btr_cur_get_index(cursor);
4090 
4091 	ut_ad(dict_index_get_page(index) != buf_block_get_page_no(block));
4092 	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
4093 				MTR_MEMO_X_LOCK));
4094 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
4095 	space = dict_index_get_space(index);
4096 	zip_size = dict_table_zip_size(index->table);
4097 
4098 	MONITOR_INC(MONITOR_INDEX_DISCARD);
4099 
4100 	/* Decide the page which will inherit the locks */
4101 
4102 	left_page_no = btr_page_get_prev(buf_block_get_frame(block), mtr);
4103 	right_page_no = btr_page_get_next(buf_block_get_frame(block), mtr);
4104 
4105 	if (left_page_no != FIL_NULL) {
4106 		merge_block = btr_block_get(space, zip_size, left_page_no,
4107 					    RW_X_LATCH, index, mtr);
4108 		merge_page = buf_block_get_frame(merge_block);
4109 #ifdef UNIV_BTR_DEBUG
4110 		ut_a(btr_page_get_next(merge_page, mtr)
4111 		     == buf_block_get_page_no(block));
4112 #endif /* UNIV_BTR_DEBUG */
4113 	} else if (right_page_no != FIL_NULL) {
4114 		merge_block = btr_block_get(space, zip_size, right_page_no,
4115 					    RW_X_LATCH, index, mtr);
4116 		merge_page = buf_block_get_frame(merge_block);
4117 #ifdef UNIV_BTR_DEBUG
4118 		ut_a(btr_page_get_prev(merge_page, mtr)
4119 		     == buf_block_get_page_no(block));
4120 #endif /* UNIV_BTR_DEBUG */
4121 	} else {
4122 		btr_discard_only_page_on_level(index, block, mtr);
4123 
4124 		return;
4125 	}
4126 
4127 	page = buf_block_get_frame(block);
4128 	ut_a(page_is_comp(merge_page) == page_is_comp(page));
4129 	btr_search_drop_page_hash_index(block);
4130 
4131 	if (left_page_no == FIL_NULL && !page_is_leaf(page)) {
4132 
4133 		/* We have to mark the leftmost node pointer on the right
4134 		side page as the predefined minimum record */
4135 		node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page));
4136 
4137 		ut_ad(page_rec_is_user_rec(node_ptr));
4138 
4139 		/* This will make page_zip_validate() fail on merge_page
4140 		until btr_level_list_remove() completes.  This is harmless,
4141 		because everything will take place within a single
4142 		mini-transaction and because writing to the redo log
4143 		is an atomic operation (performed by mtr_commit()). */
4144 		btr_set_min_rec_mark(node_ptr, mtr);
4145 	}
4146 
4147 	btr_node_ptr_delete(index, block, mtr);
4148 
4149 	/* Remove the page from the level list */
4150 	btr_level_list_remove(space, zip_size, page, index, mtr);
4151 #ifdef UNIV_ZIP_DEBUG
4152 	{
4153 		page_zip_des_t*	merge_page_zip
4154 			= buf_block_get_page_zip(merge_block);
4155 		ut_a(!merge_page_zip
4156 		     || page_zip_validate(merge_page_zip, merge_page, index));
4157 	}
4158 #endif /* UNIV_ZIP_DEBUG */
4159 
4160 	if (left_page_no != FIL_NULL) {
4161 		lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM,
4162 				    block);
4163 	} else {
4164 		lock_update_discard(merge_block,
4165 				    lock_get_min_heap_no(merge_block),
4166 				    block);
4167 	}
4168 
4169 	btr_blob_dbg_remove(page, index, "btr_discard_page");
4170 
4171 	/* Free the file page */
4172 	btr_page_free(index, block, mtr);
4173 
4174 	ut_ad(btr_check_node_ptr(index, merge_block, mtr));
4175 }
4176 
4177 #ifdef UNIV_BTR_PRINT
4178 /*************************************************************//**
4179 Prints size info of a B-tree. */
4180 UNIV_INTERN
4181 void
btr_print_size(dict_index_t * index)4182 btr_print_size(
4183 /*===========*/
4184 	dict_index_t*	index)	/*!< in: index tree */
4185 {
4186 	page_t*		root;
4187 	fseg_header_t*	seg;
4188 	mtr_t		mtr;
4189 
4190 	if (dict_index_is_ibuf(index)) {
4191 		fputs("Sorry, cannot print info of an ibuf tree:"
4192 		      " use ibuf functions\n", stderr);
4193 
4194 		return;
4195 	}
4196 
4197 	mtr_start(&mtr);
4198 
4199 	root = btr_root_get(index, &mtr);
4200 
4201 	seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
4202 
4203 	fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
4204 	fseg_print(seg, &mtr);
4205 
4206 	if (!dict_index_is_univ(index)) {
4207 
4208 		seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
4209 
4210 		fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
4211 		fseg_print(seg, &mtr);
4212 	}
4213 
4214 	mtr_commit(&mtr);
4215 }
4216 
4217 /************************************************************//**
4218 Prints recursively index tree pages. */
4219 static
4220 void
btr_print_recursive(dict_index_t * index,buf_block_t * block,ulint width,mem_heap_t ** heap,ulint ** offsets,mtr_t * mtr)4221 btr_print_recursive(
4222 /*================*/
4223 	dict_index_t*	index,	/*!< in: index tree */
4224 	buf_block_t*	block,	/*!< in: index page */
4225 	ulint		width,	/*!< in: print this many entries from start
4226 				and end */
4227 	mem_heap_t**	heap,	/*!< in/out: heap for rec_get_offsets() */
4228 	ulint**		offsets,/*!< in/out: buffer for rec_get_offsets() */
4229 	mtr_t*		mtr)	/*!< in: mtr */
4230 {
4231 	const page_t*	page	= buf_block_get_frame(block);
4232 	page_cur_t	cursor;
4233 	ulint		n_recs;
4234 	ulint		i	= 0;
4235 	mtr_t		mtr2;
4236 
4237 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
4238 	fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n",
4239 		(ulong) btr_page_get_level(page, mtr),
4240 		(ulong) buf_block_get_page_no(block));
4241 
4242 	page_print(block, index, width, width);
4243 
4244 	n_recs = page_get_n_recs(page);
4245 
4246 	page_cur_set_before_first(block, &cursor);
4247 	page_cur_move_to_next(&cursor);
4248 
4249 	while (!page_cur_is_after_last(&cursor)) {
4250 
4251 		if (page_is_leaf(page)) {
4252 
4253 			/* If this is the leaf level, do nothing */
4254 
4255 		} else if ((i <= width) || (i >= n_recs - width)) {
4256 
4257 			const rec_t*	node_ptr;
4258 
4259 			mtr_start(&mtr2);
4260 
4261 			node_ptr = page_cur_get_rec(&cursor);
4262 
4263 			*offsets = rec_get_offsets(node_ptr, index, *offsets,
4264 						   ULINT_UNDEFINED, heap);
4265 			btr_print_recursive(index,
4266 					    btr_node_ptr_get_child(node_ptr,
4267 								   index,
4268 								   *offsets,
4269 								   &mtr2),
4270 					    width, heap, offsets, &mtr2);
4271 			mtr_commit(&mtr2);
4272 		}
4273 
4274 		page_cur_move_to_next(&cursor);
4275 		i++;
4276 	}
4277 }
4278 
4279 /**************************************************************//**
4280 Prints directories and other info of all nodes in the tree. */
4281 UNIV_INTERN
4282 void
btr_print_index(dict_index_t * index,ulint width)4283 btr_print_index(
4284 /*============*/
4285 	dict_index_t*	index,	/*!< in: index */
4286 	ulint		width)	/*!< in: print this many entries from start
4287 				and end */
4288 {
4289 	mtr_t		mtr;
4290 	buf_block_t*	root;
4291 	mem_heap_t*	heap	= NULL;
4292 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
4293 	ulint*		offsets	= offsets_;
4294 	rec_offs_init(offsets_);
4295 
4296 	fputs("--------------------------\n"
4297 	      "INDEX TREE PRINT\n", stderr);
4298 
4299 	mtr_start(&mtr);
4300 
4301 	root = btr_root_block_get(index, RW_X_LATCH, &mtr);
4302 
4303 	btr_print_recursive(index, root, width, &heap, &offsets, &mtr);
4304 	if (heap) {
4305 		mem_heap_free(heap);
4306 	}
4307 
4308 	mtr_commit(&mtr);
4309 
4310 	btr_validate_index(index, 0);
4311 }
4312 #endif /* UNIV_BTR_PRINT */
4313 
4314 #ifdef UNIV_DEBUG
4315 /************************************************************//**
4316 Checks that the node pointer to a page is appropriate.
4317 @return	TRUE */
4318 UNIV_INTERN
4319 ibool
btr_check_node_ptr(dict_index_t * index,buf_block_t * block,mtr_t * mtr)4320 btr_check_node_ptr(
4321 /*===============*/
4322 	dict_index_t*	index,	/*!< in: index tree */
4323 	buf_block_t*	block,	/*!< in: index page */
4324 	mtr_t*		mtr)	/*!< in: mtr */
4325 {
4326 	mem_heap_t*	heap;
4327 	dtuple_t*	tuple;
4328 	ulint*		offsets;
4329 	btr_cur_t	cursor;
4330 	page_t*		page = buf_block_get_frame(block);
4331 
4332 	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
4333 	if (dict_index_get_page(index) == buf_block_get_page_no(block)) {
4334 
4335 		return(TRUE);
4336 	}
4337 
4338 	heap = mem_heap_create(256);
4339 	offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
4340 					    &cursor);
4341 
4342 	if (page_is_leaf(page)) {
4343 
4344 		goto func_exit;
4345 	}
4346 
4347 	tuple = dict_index_build_node_ptr(
4348 		index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap,
4349 		btr_page_get_level(page, mtr));
4350 
4351 	ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets));
4352 func_exit:
4353 	mem_heap_free(heap);
4354 
4355 	return(TRUE);
4356 }
4357 #endif /* UNIV_DEBUG */
4358 
4359 /************************************************************//**
4360 Display identification information for a record. */
4361 static
4362 void
btr_index_rec_validate_report(const page_t * page,const rec_t * rec,const dict_index_t * index)4363 btr_index_rec_validate_report(
4364 /*==========================*/
4365 	const page_t*		page,	/*!< in: index page */
4366 	const rec_t*		rec,	/*!< in: index record */
4367 	const dict_index_t*	index)	/*!< in: index */
4368 {
4369 	fputs("InnoDB: Record in ", stderr);
4370 	dict_index_name_print(stderr, NULL, index);
4371 	fprintf(stderr, ", page %lu, at offset %lu\n",
4372 		page_get_page_no(page), (ulint) page_offset(rec));
4373 }
4374 
4375 /************************************************************//**
4376 Checks the size and number of fields in a record based on the definition of
4377 the index.
4378 @return	TRUE if ok */
4379 UNIV_INTERN
4380 ibool
btr_index_rec_validate(const rec_t * rec,const dict_index_t * index,ibool dump_on_error)4381 btr_index_rec_validate(
4382 /*===================*/
4383 	const rec_t*		rec,		/*!< in: index record */
4384 	const dict_index_t*	index,		/*!< in: index */
4385 	ibool			dump_on_error)	/*!< in: TRUE if the function
4386 						should print hex dump of record
4387 						and page on error */
4388 {
4389 	ulint		len;
4390 	ulint		n;
4391 	ulint		i;
4392 	const page_t*	page;
4393 	mem_heap_t*	heap	= NULL;
4394 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
4395 	ulint*		offsets	= offsets_;
4396 	rec_offs_init(offsets_);
4397 
4398 	page = page_align(rec);
4399 
4400 	if (dict_index_is_univ(index)) {
4401 		/* The insert buffer index tree can contain records from any
4402 		other index: we cannot check the number of fields or
4403 		their length */
4404 
4405 		return(TRUE);
4406 	}
4407 
4408 	if ((ibool)!!page_is_comp(page) != dict_table_is_comp(index->table)) {
4409 		btr_index_rec_validate_report(page, rec, index);
4410 		fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n",
4411 			(ulong) !!page_is_comp(page),
4412 			(ulong) dict_table_is_comp(index->table));
4413 
4414 		return(FALSE);
4415 	}
4416 
4417 	n = dict_index_get_n_fields(index);
4418 
4419 	if (!page_is_comp(page) && rec_get_n_fields_old(rec) != n) {
4420 		btr_index_rec_validate_report(page, rec, index);
4421 		fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n",
4422 			(ulong) rec_get_n_fields_old(rec), (ulong) n);
4423 
4424 		if (dump_on_error) {
4425 			buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
4426 
4427 			fputs("InnoDB: corrupt record ", stderr);
4428 			rec_print_old(stderr, rec);
4429 			putc('\n', stderr);
4430 		}
4431 		return(FALSE);
4432 	}
4433 
4434 	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
4435 
4436 	for (i = 0; i < n; i++) {
4437 		ulint	fixed_size = dict_col_get_fixed_size(
4438 			dict_index_get_nth_col(index, i), page_is_comp(page));
4439 
4440 		rec_get_nth_field_offs(offsets, i, &len);
4441 
4442 		/* Note that if fixed_size != 0, it equals the
4443 		length of a fixed-size column in the clustered index.
4444 		A prefix index of the column is of fixed, but different
4445 		length.  When fixed_size == 0, prefix_len is the maximum
4446 		length of the prefix index column. */
4447 
4448 		if ((dict_index_get_nth_field(index, i)->prefix_len == 0
4449 		     && len != UNIV_SQL_NULL && fixed_size
4450 		     && len != fixed_size)
4451 		    || (dict_index_get_nth_field(index, i)->prefix_len > 0
4452 			&& len != UNIV_SQL_NULL
4453 			&& len
4454 			> dict_index_get_nth_field(index, i)->prefix_len)) {
4455 
4456 			btr_index_rec_validate_report(page, rec, index);
4457 			fprintf(stderr,
4458 				"InnoDB: field %lu len is %lu,"
4459 				" should be %lu\n",
4460 				(ulong) i, (ulong) len, (ulong) fixed_size);
4461 
4462 			if (dump_on_error) {
4463 				buf_page_print(page, 0,
4464 					       BUF_PAGE_PRINT_NO_CRASH);
4465 
4466 				fputs("InnoDB: corrupt record ", stderr);
4467 				rec_print_new(stderr, rec, offsets);
4468 				putc('\n', stderr);
4469 			}
4470 			if (heap) {
4471 				mem_heap_free(heap);
4472 			}
4473 			return(FALSE);
4474 		}
4475 	}
4476 
4477 	if (heap) {
4478 		mem_heap_free(heap);
4479 	}
4480 	return(TRUE);
4481 }
4482 
4483 /************************************************************//**
4484 Checks the size and number of fields in records based on the definition of
4485 the index.
4486 @return	TRUE if ok */
4487 static
4488 ibool
btr_index_page_validate(buf_block_t * block,dict_index_t * index)4489 btr_index_page_validate(
4490 /*====================*/
4491 	buf_block_t*	block,	/*!< in: index page */
4492 	dict_index_t*	index)	/*!< in: index */
4493 {
4494 	page_cur_t	cur;
4495 	ibool		ret	= TRUE;
4496 #ifndef DBUG_OFF
4497 	ulint		nth	= 1;
4498 #endif /* !DBUG_OFF */
4499 
4500 	page_cur_set_before_first(block, &cur);
4501 
4502 	/* Directory slot 0 should only contain the infimum record. */
4503 	DBUG_EXECUTE_IF("check_table_rec_next",
4504 			ut_a(page_rec_get_nth_const(
4505 				     page_cur_get_page(&cur), 0)
4506 			     == cur.rec);
4507 			ut_a(page_dir_slot_get_n_owned(
4508 				     page_dir_get_nth_slot(
4509 					     page_cur_get_page(&cur), 0))
4510 			     == 1););
4511 
4512 	page_cur_move_to_next(&cur);
4513 
4514 	for (;;) {
4515 		if (page_cur_is_after_last(&cur)) {
4516 
4517 			break;
4518 		}
4519 
4520 		if (!btr_index_rec_validate(cur.rec, index, TRUE)) {
4521 
4522 			return(FALSE);
4523 		}
4524 
4525 		/* Verify that page_rec_get_nth_const() is correctly
4526 		retrieving each record. */
4527 		DBUG_EXECUTE_IF("check_table_rec_next",
4528 				ut_a(cur.rec == page_rec_get_nth_const(
4529 					     page_cur_get_page(&cur),
4530 					     page_rec_get_n_recs_before(
4531 						     cur.rec)));
4532 				ut_a(nth++ == page_rec_get_n_recs_before(
4533 					     cur.rec)););
4534 
4535 		page_cur_move_to_next(&cur);
4536 	}
4537 
4538 	return(ret);
4539 }
4540 
4541 /************************************************************//**
4542 Report an error on one page of an index tree. */
4543 static
4544 void
btr_validate_report1(dict_index_t * index,ulint level,const buf_block_t * block)4545 btr_validate_report1(
4546 /*=================*/
4547 	dict_index_t*		index,	/*!< in: index */
4548 	ulint			level,	/*!< in: B-tree level */
4549 	const buf_block_t*	block)	/*!< in: index page */
4550 {
4551 	fprintf(stderr, "InnoDB: Error in page %lu of ",
4552 		buf_block_get_page_no(block));
4553 	dict_index_name_print(stderr, NULL, index);
4554 	if (level) {
4555 		fprintf(stderr, ", index tree level %lu", level);
4556 	}
4557 	putc('\n', stderr);
4558 }
4559 
4560 /************************************************************//**
4561 Report an error on two pages of an index tree. */
4562 static
4563 void
btr_validate_report2(const dict_index_t * index,ulint level,const buf_block_t * block1,const buf_block_t * block2)4564 btr_validate_report2(
4565 /*=================*/
4566 	const dict_index_t*	index,	/*!< in: index */
4567 	ulint			level,	/*!< in: B-tree level */
4568 	const buf_block_t*	block1,	/*!< in: first index page */
4569 	const buf_block_t*	block2)	/*!< in: second index page */
4570 {
4571 	fprintf(stderr, "InnoDB: Error in pages %lu and %lu of ",
4572 		buf_block_get_page_no(block1),
4573 		buf_block_get_page_no(block2));
4574 	dict_index_name_print(stderr, NULL, index);
4575 	if (level) {
4576 		fprintf(stderr, ", index tree level %lu", level);
4577 	}
4578 	putc('\n', stderr);
4579 }
4580 
/************************************************************//**
Validates one level of an index tree.  Descends from the root to the
requested level, then walks the pages of that level left to right via
their FIL_PAGE_NEXT links, checking on each page: that the page is not
marked free in the file segment, the stored index id, page consistency
(page_validate), record sizes (level 0 only), sibling link symmetry,
record ordering across page boundaries, and the father node pointers.
Errors are reported to stderr but the walk continues where safe, so
several problems can be reported in one pass.
@return	TRUE if ok */
static
bool
btr_validate_level(
/*===============*/
	dict_index_t*	index,	/*!< in: index tree */
	const trx_t*	trx,	/*!< in: transaction or NULL */
	ulint		level)	/*!< in: level number */
{
	ulint		space;
	ulint		space_flags;
	ulint		zip_size;
	buf_block_t*	block;
	page_t*		page;
	buf_block_t*	right_block = 0; /* remove warning */
	page_t*		right_page = 0; /* remove warning */
	page_t*		father_page;
	btr_cur_t	node_cur;
	btr_cur_t	right_node_cur;
	rec_t*		rec;
	ulint		right_page_no;
	ulint		left_page_no;
	page_cur_t	cursor;
	dtuple_t*	node_ptr_tuple;
	bool		ret	= true;
	mtr_t		mtr;
	mem_heap_t*	heap	= mem_heap_create(256);
	fseg_header_t*	seg;
	ulint*		offsets	= NULL;
	ulint*		offsets2= NULL;
#ifdef UNIV_ZIP_DEBUG
	page_zip_des_t*	page_zip;
#endif /* UNIV_ZIP_DEBUG */

	mtr_start(&mtr);

	mtr_x_lock(dict_index_get_lock(index), &mtr);

	block = btr_root_block_get(index, RW_X_LATCH, &mtr);
	page = buf_block_get_frame(block);
	/* The root page header holds the segment header for the
	non-leaf pages of the tree. */
	seg = page + PAGE_HEADER + PAGE_BTR_SEG_TOP;

	space = dict_index_get_space(index);
	zip_size = dict_table_zip_size(index->table);

	fil_space_get_latch(space, &space_flags);

	/* Sanity check: the compressed page size recorded in the data
	dictionary must agree with the tablespace flags on disk. */
	if (zip_size != dict_tf_get_zip_size(space_flags)) {

		ib_logf(IB_LOG_LEVEL_WARN,
			"Flags mismatch: table=%lu, tablespace=%lu",
			(ulint) index->table->flags, (ulint) space_flags);

		mtr_commit(&mtr);

		return(false);
	}

	/* Descend from the root to the requested level by always
	following the first (leftmost) node pointer on each page. */
	while (level != btr_page_get_level(page, &mtr)) {
		const rec_t*	node_ptr;

		if (fseg_page_is_free(seg,
				      block->page.space, block->page.offset)) {

			btr_validate_report1(index, level, block);

			ib_logf(IB_LOG_LEVEL_WARN, "page is free");

			ret = false;
		}

		ut_a(space == buf_block_get_space(block));
		ut_a(space == page_get_space_id(page));
#ifdef UNIV_ZIP_DEBUG
		page_zip = buf_block_get_page_zip(block);
		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
		ut_a(!page_is_leaf(page));

		page_cur_set_before_first(block, &cursor);
		page_cur_move_to_next(&cursor);

		node_ptr = page_cur_get_rec(&cursor);
		offsets = rec_get_offsets(node_ptr, index, offsets,
					  ULINT_UNDEFINED, &heap);
		block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr);
		page = buf_block_get_frame(block);
	}

	/* Now we are on the desired level. Loop through the pages on that
	level. */

	if (level == 0) {
		/* Leaf pages are managed in their own file segment. */
		seg -= PAGE_BTR_SEG_TOP - PAGE_BTR_SEG_LEAF;
	}

loop:
	/* One iteration per page of the level.  The heap and offsets
	are recycled; the index latch was released at the bottom of the
	previous iteration and is re-taken here. */
	mem_heap_empty(heap);
	offsets = offsets2 = NULL;
	mtr_x_lock(dict_index_get_lock(index), &mtr);

#ifdef UNIV_ZIP_DEBUG
	page_zip = buf_block_get_page_zip(block);
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

	ut_a(block->page.space == space);

	if (fseg_page_is_free(seg, block->page.space, block->page.offset)) {

		btr_validate_report1(index, level, block);

		ib_logf(IB_LOG_LEVEL_WARN, "Page is marked as free");
		ret = false;

	} else if (btr_page_get_index_id(page) != index->id) {

		ib_logf(IB_LOG_LEVEL_ERROR,
			"Page index id " IB_ID_FMT " != data dictionary "
			"index id " IB_ID_FMT,
			btr_page_get_index_id(page), index->id);

		ret = false;

	} else if (!page_validate(page, index)) {

		btr_validate_report1(index, level, block);
		ret = false;

	} else if (level == 0 && !btr_index_page_validate(block, index)) {

		/* We are on level 0. Check that the records have the right
		number of fields, and field lengths are right. */

		ret = false;
	}

	ut_a(btr_page_get_level(page, &mtr) == level);

	right_page_no = btr_page_get_next(page, &mtr);
	left_page_no = btr_page_get_prev(page, &mtr);

	/* Only the root page of an empty tree may itself be empty. */
	ut_a(!page_is_empty(page)
	     || (level == 0
		 && page_get_page_no(page) == dict_index_get_page(index)));

	if (right_page_no != FIL_NULL) {
		const rec_t*	right_rec;
		right_block = btr_block_get(space, zip_size, right_page_no,
					    RW_X_LATCH, index, &mtr);
		right_page = buf_block_get_frame(right_block);
		/* The right sibling must point back to this page. */
		if (btr_page_get_prev(right_page, &mtr)
		    != page_get_page_no(page)) {

			btr_validate_report2(index, level, block, right_block);
			fputs("InnoDB: broken FIL_PAGE_NEXT"
			      " or FIL_PAGE_PREV links\n", stderr);
			buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
			buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH);

			ret = false;
		}

		if (page_is_comp(right_page) != page_is_comp(page)) {
			btr_validate_report2(index, level, block, right_block);
			fputs("InnoDB: 'compact' flag mismatch\n", stderr);
			buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
			buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH);

			ret = false;

			goto node_ptr_fails;
		}

		/* Last user record of this page must sort strictly
		before the first user record of the right sibling. */
		rec = page_rec_get_prev(page_get_supremum_rec(page));
		right_rec = page_rec_get_next(page_get_infimum_rec(
						      right_page));
		offsets = rec_get_offsets(rec, index,
					  offsets, ULINT_UNDEFINED, &heap);
		offsets2 = rec_get_offsets(right_rec, index,
					   offsets2, ULINT_UNDEFINED, &heap);
		if (cmp_rec_rec(rec, right_rec, offsets, offsets2,
			        index) >= 0) {

			btr_validate_report2(index, level, block, right_block);

			fputs("InnoDB: records in wrong order"
			      " on adjacent pages\n", stderr);

			buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);
			buf_page_print(right_page, 0, BUF_PAGE_PRINT_NO_CRASH);

			fputs("InnoDB: record ", stderr);
			rec = page_rec_get_prev(page_get_supremum_rec(page));
			rec_print(stderr, rec, index);
			putc('\n', stderr);
			fputs("InnoDB: record ", stderr);
			rec = page_rec_get_next(
				page_get_infimum_rec(right_page));
			rec_print(stderr, rec, index);
			putc('\n', stderr);

			ret = false;
		}
	}

	/* On non-leaf levels the first record of the leftmost page must
	carry the minimum-record flag. */
	if (level > 0 && left_page_no == FIL_NULL) {
		ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
			     page_rec_get_next(page_get_infimum_rec(page)),
			     page_is_comp(page)));
	}

	if (buf_block_get_page_no(block) != dict_index_get_page(index)) {

		/* Check father node pointers */

		rec_t*	node_ptr;

		offsets = btr_page_get_father_block(offsets, heap, index,
						    block, &mtr, &node_cur);
		father_page = btr_cur_get_page(&node_cur);
		node_ptr = btr_cur_get_rec(&node_cur);

		/* Position on the last user record and look up its
		father: it must be the same node pointer as the one
		found for the first record above. */
		btr_cur_position(
			index, page_rec_get_prev(page_get_supremum_rec(page)),
			block, &node_cur);
		offsets = btr_page_get_father_node_ptr(offsets, heap,
						       &node_cur, &mtr);

		if (node_ptr != btr_cur_get_rec(&node_cur)
		    || btr_node_ptr_get_child_page_no(node_ptr, offsets)
				     != buf_block_get_page_no(block)) {

			btr_validate_report1(index, level, block);

			fputs("InnoDB: node pointer to the page is wrong\n",
			      stderr);

			buf_page_print(father_page, 0, BUF_PAGE_PRINT_NO_CRASH);
			buf_page_print(page, 0, BUF_PAGE_PRINT_NO_CRASH);

			fputs("InnoDB: node ptr ", stderr);
			rec_print(stderr, node_ptr, index);

			rec = btr_cur_get_rec(&node_cur);
			fprintf(stderr, "\n"
				"InnoDB: node ptr child page n:o %lu\n",
				(ulong) btr_node_ptr_get_child_page_no(
					rec, offsets));

			fputs("InnoDB: record on page ", stderr);
			rec_print_new(stderr, rec, offsets);
			putc('\n', stderr);
			ret = false;

			goto node_ptr_fails;
		}

		if (!page_is_leaf(page)) {
			/* The father node pointer must equal the node
			pointer built from this page's first record. */
			node_ptr_tuple = dict_index_build_node_ptr(
				index,
				page_rec_get_next(page_get_infimum_rec(page)),
				0, heap, btr_page_get_level(page, &mtr));

			if (cmp_dtuple_rec(node_ptr_tuple, node_ptr,
					   offsets)) {
				const rec_t* first_rec = page_rec_get_next(
					page_get_infimum_rec(page));

				btr_validate_report1(index, level, block);

				buf_page_print(father_page, 0,
					       BUF_PAGE_PRINT_NO_CRASH);
				buf_page_print(page, 0,
					       BUF_PAGE_PRINT_NO_CRASH);

				fputs("InnoDB: Error: node ptrs differ"
				      " on levels > 0\n"
				      "InnoDB: node ptr ", stderr);
				rec_print_new(stderr, node_ptr, offsets);
				fputs("InnoDB: first rec ", stderr);
				rec_print(stderr, first_rec, index);
				putc('\n', stderr);
				ret = false;

				goto node_ptr_fails;
			}
		}

		if (left_page_no == FIL_NULL) {
			/* Leftmost page: its node pointer must be the
			first user record of the father page, and the
			father itself must be leftmost on its level. */
			ut_a(node_ptr == page_rec_get_next(
				     page_get_infimum_rec(father_page)));
			ut_a(btr_page_get_prev(father_page, &mtr) == FIL_NULL);
		}

		if (right_page_no == FIL_NULL) {
			/* Rightmost page: symmetric check. */
			ut_a(node_ptr == page_rec_get_prev(
				     page_get_supremum_rec(father_page)));
			ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL);
		} else {
			const rec_t*	right_node_ptr
				= page_rec_get_next(node_ptr);

			offsets = btr_page_get_father_block(
				offsets, heap, index, right_block,
				&mtr, &right_node_cur);
			if (right_node_ptr
			    != page_get_supremum_rec(father_page)) {

				/* Both siblings share the same father:
				the right sibling's node pointer must be
				the record after ours. */
				if (btr_cur_get_rec(&right_node_cur)
				    != right_node_ptr) {
					ret = false;
					fputs("InnoDB: node pointer to"
					      " the right page is wrong\n",
					      stderr);

					btr_validate_report1(index, level,
							     block);

					buf_page_print(
						father_page, 0,
						BUF_PAGE_PRINT_NO_CRASH);
					buf_page_print(
						page, 0,
						BUF_PAGE_PRINT_NO_CRASH);
					buf_page_print(
						right_page, 0,
						BUF_PAGE_PRINT_NO_CRASH);
				}
			} else {
				/* The father boundary falls between the
				siblings: the right sibling's node
				pointer must be the first record on the
				father's right sibling. */
				page_t*	right_father_page
					= btr_cur_get_page(&right_node_cur);

				if (btr_cur_get_rec(&right_node_cur)
				    != page_rec_get_next(
					    page_get_infimum_rec(
						    right_father_page))) {
					ret = false;
					fputs("InnoDB: node pointer 2 to"
					      " the right page is wrong\n",
					      stderr);

					btr_validate_report1(index, level,
							     block);

					buf_page_print(
						father_page, 0,
						BUF_PAGE_PRINT_NO_CRASH);
					buf_page_print(
						right_father_page, 0,
						BUF_PAGE_PRINT_NO_CRASH);
					buf_page_print(
						page, 0,
						BUF_PAGE_PRINT_NO_CRASH);
					buf_page_print(
						right_page, 0,
						BUF_PAGE_PRINT_NO_CRASH);
				}

				if (page_get_page_no(right_father_page)
				    != btr_page_get_next(father_page, &mtr)) {

					ret = false;
					fputs("InnoDB: node pointer 3 to"
					      " the right page is wrong\n",
					      stderr);

					btr_validate_report1(index, level,
							     block);

					buf_page_print(
						father_page, 0,
						BUF_PAGE_PRINT_NO_CRASH);
					buf_page_print(
						right_father_page, 0,
						BUF_PAGE_PRINT_NO_CRASH);
					buf_page_print(
						page, 0,
						BUF_PAGE_PRINT_NO_CRASH);
					buf_page_print(
						right_page, 0,
						BUF_PAGE_PRINT_NO_CRASH);
				}
			}
		}
	}

node_ptr_fails:
	/* Commit the mini-transaction to release the latch on 'page'.
	Re-acquire the latch on right_page, which will become 'page'
	on the next loop.  The page has already been checked. */
	mtr_commit(&mtr);

	if (trx_is_interrupted(trx)) {
		/* On interrupt, return the current status. */
	} else if (right_page_no != FIL_NULL) {

		mtr_start(&mtr);

		block = btr_block_get(
			space, zip_size, right_page_no,
			RW_X_LATCH, index, &mtr);

		page = buf_block_get_frame(block);

		goto loop;
	}

	mem_heap_free(heap);

	return(ret);
}
4996 
4997 /**************************************************************//**
4998 Checks the consistency of an index tree.
4999 @return	TRUE if ok */
5000 UNIV_INTERN
5001 bool
btr_validate_index(dict_index_t * index,const trx_t * trx)5002 btr_validate_index(
5003 /*===============*/
5004 	dict_index_t*	index,	/*!< in: index */
5005 	const trx_t*	trx)	/*!< in: transaction or NULL */
5006 {
5007 	/* Full Text index are implemented by auxiliary tables,
5008 	not the B-tree */
5009 	if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) {
5010 		return(true);
5011 	}
5012 
5013 	mtr_t		mtr;
5014 
5015 	mtr_start(&mtr);
5016 
5017 	mtr_x_lock(dict_index_get_lock(index), &mtr);
5018 
5019 	bool	ok = true;
5020 	page_t*	root = btr_root_get(index, &mtr);
5021 	ulint	n = btr_page_get_level(root, &mtr);
5022 
5023 	for (ulint i = 0; i <= n; ++i) {
5024 
5025 		if (!btr_validate_level(index, trx, n - i)) {
5026 			ok = false;
5027 			break;
5028 		}
5029 	}
5030 
5031 	mtr_commit(&mtr);
5032 
5033 	return(ok);
5034 }
5035 
5036 /**************************************************************//**
5037 Checks if the page in the cursor can be merged with given page.
5038 If necessary, re-organize the merge_page.
5039 @return	TRUE if possible to merge. */
5040 UNIV_INTERN
5041 ibool
btr_can_merge_with_page(btr_cur_t * cursor,ulint page_no,buf_block_t ** merge_block,mtr_t * mtr)5042 btr_can_merge_with_page(
5043 /*====================*/
5044 	btr_cur_t*	cursor,		/*!< in: cursor on the page to merge */
5045 	ulint		page_no,	/*!< in: a sibling page */
5046 	buf_block_t**	merge_block,	/*!< out: the merge block */
5047 	mtr_t*		mtr)		/*!< in: mini-transaction */
5048 {
5049 	dict_index_t*	index;
5050 	page_t*		page;
5051 	ulint		space;
5052 	ulint		zip_size;
5053 	ulint		n_recs;
5054 	ulint		data_size;
5055         ulint           max_ins_size_reorg;
5056 	ulint		max_ins_size;
5057 	buf_block_t*	mblock;
5058 	page_t*		mpage;
5059 	DBUG_ENTER("btr_can_merge_with_page");
5060 
5061 	if (page_no == FIL_NULL) {
5062 		goto error;
5063 	}
5064 
5065 	index = btr_cur_get_index(cursor);
5066 	page  = btr_cur_get_page(cursor);
5067 	space = dict_index_get_space(index);
5068         zip_size = dict_table_zip_size(index->table);
5069 
5070 	mblock = btr_block_get(space, zip_size, page_no, RW_X_LATCH, index,
5071 			       mtr);
5072 	mpage = buf_block_get_frame(mblock);
5073 
5074         n_recs = page_get_n_recs(page);
5075         data_size = page_get_data_size(page);
5076 
5077         max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
5078                 mpage, n_recs);
5079 
5080 	if (data_size > max_ins_size_reorg) {
5081 		goto error;
5082 	}
5083 
5084 	/* If compression padding tells us that merging will result in
5085 	too packed up page i.e.: which is likely to cause compression
5086 	failure then don't merge the pages. */
5087 	if (zip_size && page_is_leaf(mpage)
5088 	    && (page_get_data_size(mpage) + data_size
5089 		>= dict_index_zip_pad_optimal_page_size(index))) {
5090 
5091 		goto error;
5092 	}
5093 
5094 
5095 	max_ins_size = page_get_max_insert_size(mpage, n_recs);
5096 
5097 	if (data_size > max_ins_size) {
5098 
5099 		/* We have to reorganize mpage */
5100 
5101 		if (!btr_page_reorganize_block(
5102 			    false, page_zip_level, mblock, index, mtr)) {
5103 
5104 			goto error;
5105 		}
5106 
5107 		max_ins_size = page_get_max_insert_size(mpage, n_recs);
5108 
5109 		ut_ad(page_validate(mpage, index));
5110 		ut_ad(max_ins_size == max_ins_size_reorg);
5111 
5112 		if (data_size > max_ins_size) {
5113 
5114 			/* Add fault tolerance, though this should
5115 			never happen */
5116 
5117 			goto error;
5118 		}
5119 	}
5120 
5121 	*merge_block = mblock;
5122 	DBUG_RETURN(TRUE);
5123 
5124 error:
5125 	*merge_block = NULL;
5126 	DBUG_RETURN(FALSE);
5127 }
5128 
5129 #endif /* !UNIV_HOTBACKUP */
5130