1 /*****************************************************************************
2 
3 Copyright (c) 1994, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2008, Google Inc.
5 Copyright (c) 2012, Facebook Inc.
6 
7 Portions of this file contain modifications contributed and copyrighted by
8 Google, Inc. Those modifications are gratefully acknowledged and are described
9 briefly in the InnoDB documentation. The contributions by Google are
10 incorporated with their permission, and subject to the conditions contained in
11 the file COPYING.Google.
12 
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16 
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation.  The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23 
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27 GNU General Public License, version 2.0, for more details.
28 
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32 
33 *****************************************************************************/
34 
35 /**************************************************//**
36 @file btr/btr0cur.cc
37 The index tree cursor
38 
39 All changes that row operations make to a B-tree or the records
40 there must go through this module! Undo log records are written here
for every modification or insertion of a clustered index record.
42 
43 			NOTE!!!
To make sure we do not run out of disk space during a pessimistic
insert or update, we have to reserve 2 x the height of the index tree
in pages in the tablespace before we start the operation, because
if leaf splitting has been started, it is difficult to undo, except
by crashing the database and doing a roll-forward.
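For example, if the index tree is three levels high, at least six free pages
must be reserved in the tablespace before the operation begins.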
49 
50 Created 10/16/1994 Heikki Tuuri
51 *******************************************************/
52 
53 #include "btr0cur.h"
54 
55 #ifdef UNIV_NONINL
56 #include "btr0cur.ic"
57 #endif
58 
59 #include "row0upd.h"
60 #ifndef UNIV_HOTBACKUP
61 #include "mtr0log.h"
62 #include "page0page.h"
63 #include "page0zip.h"
64 #include "rem0rec.h"
65 #include "rem0cmp.h"
66 #include "buf0lru.h"
67 #include "btr0btr.h"
68 #include "btr0sea.h"
69 #include "row0log.h"
70 #include "row0purge.h"
71 #include "row0upd.h"
72 #include "trx0rec.h"
73 #include "trx0roll.h"
74 #include "que0que.h"
75 #include "row0row.h"
76 #include "srv0srv.h"
77 #include "ibuf0ibuf.h"
78 #include "lock0lock.h"
79 #include "zlib.h"
80 #include "srv0start.h"
81 
82 /** Buffered B-tree operation types, introduced as part of delete buffering. */
83 enum btr_op_t {
84 	BTR_NO_OP = 0,			/*!< Not buffered */
85 	BTR_INSERT_OP,			/*!< Insert, do not ignore UNIQUE */
86 	BTR_INSERT_IGNORE_UNIQUE_OP,	/*!< Insert, ignoring UNIQUE */
87 	BTR_DELETE_OP,			/*!< Purge a delete-marked record */
88 	BTR_DELMARK_OP			/*!< Mark a record for deletion */
89 };
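/* The btr_op_t value is derived in btr_cur_search_to_nth_level() from the
BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK flags that may be ORed into
latch_mode; see the switch at the start of that function. */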
90 
91 /** Modification types for the B-tree operation. */
92 enum btr_intention_t {
93 	BTR_INTENTION_DELETE,
94 	BTR_INTENTION_BOTH,
95 	BTR_INTENTION_INSERT
96 };
97 #if BTR_INTENTION_DELETE > BTR_INTENTION_BOTH
98 #error "BTR_INTENTION_DELETE > BTR_INTENTION_BOTH"
99 #endif
100 #if BTR_INTENTION_BOTH > BTR_INTENTION_INSERT
101 #error "BTR_INTENTION_BOTH > BTR_INTENTION_INSERT"
102 #endif
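/* The ordering BTR_INTENTION_DELETE < BTR_INTENTION_BOTH
< BTR_INTENTION_INSERT asserted above is relied upon below: for example,
btr_cur_will_modify_tree() uses "lock_intention <= BTR_INTENTION_BOTH" to
cover the delete cases and "lock_intention >= BTR_INTENTION_BOTH" to cover
the insert cases. */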
103 
/** With the index->lock scalability improvement, the only clear performance
regression observed was caused by the history list growing very long. That is
because the previous exclusive use of index->lock also had the side effect of
reserving free blocks and read IO bandwidth with priority. To avoid the
history list growing longer than it did under the previous implementation,
pessimistic tree operations issued by purge are prioritized, as before,
when the list appears to be growing too long.

Experimentally, the history list length starts to clearly affect throughput
at about 100000. */
113 #define BTR_CUR_FINE_HISTORY_LENGTH	100000
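/* Illustrative note: in btr_cur_search_to_nth_level() below, a
BTR_MODIFY_TREE search with delete intention takes an X latch on the index
(instead of an SX latch) when trx_sys->rseg_history_len exceeds this
threshold and read IOs are pending, so that purge is not starved of free
blocks and IO bandwidth. */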
114 
115 /** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
116 ulint	btr_cur_n_non_sea	= 0;
117 /** Number of successful adaptive hash index lookups in
118 btr_cur_search_to_nth_level(). */
119 ulint	btr_cur_n_sea		= 0;
120 /** Old value of btr_cur_n_non_sea.  Copied by
121 srv_refresh_innodb_monitor_stats().  Referenced by
122 srv_printf_innodb_monitor(). */
123 ulint	btr_cur_n_non_sea_old	= 0;
124 /** Old value of btr_cur_n_sea.  Copied by
125 srv_refresh_innodb_monitor_stats().  Referenced by
126 srv_printf_innodb_monitor(). */
127 ulint	btr_cur_n_sea_old	= 0;
128 
129 #ifdef UNIV_DEBUG
/* Debug variable to limit the number of records in a page for optimistic
inserts */
131 uint	btr_cur_limit_optimistic_insert_debug = 0;
132 #endif /* UNIV_DEBUG */
133 
/** In the optimistic insert, if the insert does not fit but at least this
much space can be released by a page reorganize, then the page is
reorganized */
136 #define BTR_CUR_PAGE_REORGANIZE_LIMIT	(UNIV_PAGE_SIZE / 32)
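/* For example, with the default 16 KB UNIV_PAGE_SIZE this limit is
512 bytes. */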
137 
138 /** The structure of a BLOB part header */
139 /* @{ */
140 /*--------------------------------------*/
141 #define BTR_BLOB_HDR_PART_LEN		0	/*!< BLOB part len on this
142 						page */
143 #define BTR_BLOB_HDR_NEXT_PAGE_NO	4	/*!< next BLOB part page no,
144 						FIL_NULL if none */
145 /*--------------------------------------*/
146 #define BTR_BLOB_HDR_SIZE		8	/*!< Size of a BLOB
147 						part header, in bytes */
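/* Illustrative layout of a BLOB part header on each BLOB page:
bytes [0, 4): length of the BLOB data stored on this page;
bytes [4, 8): page number of the next BLOB part, or FIL_NULL if none. */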
148 
149 /** Estimated table level stats from sampled value.
150 @param value sampled stats
151 @param index index being sampled
152 @param sample number of sampled rows
153 @param ext_size external stored data size
154 @param not_empty table not empty
155 @return estimated table wide stats from sampled value */
156 #define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
157 	(((value) * static_cast<int64_t>(index->stat_n_leaf_pages) \
158 	  + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
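/* Worked example with assumed sample numbers: if value == 40 was measured
over a sample of 20 leaf pages of an index with stat_n_leaf_pages == 100,
with ext_size == 0 and not_empty == 1, the estimate is
(40 * 100 + 20 - 1 + 0 + 1) / (20 + 0) = 201. */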
159 
160 /* @} */
161 #endif /* !UNIV_HOTBACKUP */
162 
163 #ifndef UNIV_HOTBACKUP
164 /*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a record that is
not delete-marked always owns all its extern fields. */
168 static
169 void
170 btr_cur_unmark_extern_fields(
171 /*=========================*/
172 	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
173 				part will be updated, or NULL */
174 	rec_t*		rec,	/*!< in/out: record in a clustered index */
175 	dict_index_t*	index,	/*!< in: index of the page */
176 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
177 	mtr_t*		mtr);	/*!< in: mtr, or NULL if not logged */
178 /*******************************************************************//**
179 Adds path information to the cursor for the current page, for which
180 the binary search has been performed. */
181 static
182 void
183 btr_cur_add_path_info(
184 /*==================*/
185 	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
186 	ulint		height,		/*!< in: height of the page in tree;
187 					0 means leaf node */
188 	ulint		root_height);	/*!< in: root node height in tree */
189 /***********************************************************//**
190 Frees the externally stored fields for a record, if the field is mentioned
191 in the update vector. */
192 static
193 void
194 btr_rec_free_updated_extern_fields(
195 /*===============================*/
196 	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
197 				X-latched */
198 	rec_t*		rec,	/*!< in: record */
199 	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
200 				part will be updated, or NULL */
201 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
202 	const upd_t*	update,	/*!< in: update vector */
203 	bool		rollback,/*!< in: performing rollback? */
204 	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
205 				an X-latch to record page and to the tree */
206 /***********************************************************//**
207 Frees the externally stored fields for a record. */
208 static
209 void
210 btr_rec_free_externally_stored_fields(
211 /*==================================*/
212 	dict_index_t*	index,	/*!< in: index of the data, the index
213 				tree MUST be X-latched */
214 	rec_t*		rec,	/*!< in: record */
215 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
216 	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
217 				part will be updated, or NULL */
218 	bool		rollback,/*!< in: performing rollback? */
219 	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
220 				an X-latch to record page and to the index
221 				tree */
222 #endif /* !UNIV_HOTBACKUP */
223 
224 #ifndef UNIV_HOTBACKUP
225 /*==================== B-TREE SEARCH =========================*/
226 
227 #if MTR_MEMO_PAGE_S_FIX != RW_S_LATCH
228 #error "MTR_MEMO_PAGE_S_FIX != RW_S_LATCH"
229 #endif
230 #if MTR_MEMO_PAGE_X_FIX != RW_X_LATCH
231 #error "MTR_MEMO_PAGE_X_FIX != RW_X_LATCH"
232 #endif
233 #if MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH
234 #error "MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH"
235 #endif
236 
237 /** Latches the leaf page or pages requested.
238 @param[in]	block		leaf page where the search converged
239 @param[in]	page_id		page id of the leaf
240 @param[in]	latch_mode	BTR_SEARCH_LEAF, ...
241 @param[in]	cursor		cursor
242 @param[in]	mtr		mini-transaction
@return	blocks and savepoints which were actually latched. */
244 btr_latch_leaves_t
btr_cur_latch_leaves(
246 	buf_block_t*		block,
247 	const page_id_t&	page_id,
248 	const page_size_t&	page_size,
249 	ulint			latch_mode,
250 	btr_cur_t*		cursor,
251 	mtr_t*			mtr)
252 {
253 	ulint		mode;
254 	ulint		left_page_no;
255 	ulint		right_page_no;
256 	buf_block_t*	get_block;
257 	page_t*		page = buf_block_get_frame(block);
258 	bool		spatial;
259 	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
260 
261 	spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info;
262 	ut_ad(buf_page_in_file(&block->page));
263 
264 	switch (latch_mode) {
265 	case BTR_SEARCH_LEAF:
266 	case BTR_MODIFY_LEAF:
267 	case BTR_SEARCH_TREE:
268 		if (spatial) {
269 			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS]
270 				= mtr_set_savepoint(mtr);
271 		}
272 
273 		mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH;
274 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
275 		get_block = btr_block_get(page_id, page_size, mode,
276 					  cursor->index, mtr);
277 
278 		SRV_CORRUPT_TABLE_CHECK(get_block, return latch_leaves;);
279 
280 		latch_leaves.blocks[1] = get_block;
281 #ifdef UNIV_BTR_DEBUG
282 		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
283 #endif /* UNIV_BTR_DEBUG */
284 		if (spatial) {
285 			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
286 				= get_block;
287 		}
288 
289 		return(latch_leaves);
290 	case BTR_MODIFY_TREE:
		/* The tree latch is held exclusively against other
		operations that call btr_page_set_prev() */
293 		ut_ad(mtr_memo_contains_flagged(mtr,
294 			dict_index_get_lock(cursor->index),
295 			MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
296 		      || dict_table_is_intrinsic(cursor->index->table));
297 		/* x-latch also siblings from left to right */
298 		left_page_no = btr_page_get_prev(page, mtr);
299 
300 		if (left_page_no != FIL_NULL) {
301 
302 			if (spatial) {
303 				cursor->rtr_info->tree_savepoints[
304 					RTR_MAX_LEVELS] = mtr_set_savepoint(mtr);
305 			}
306 
307 			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
308 			get_block = btr_block_get(
309 				page_id_t(page_id.space(), left_page_no),
310 				page_size, RW_X_LATCH, cursor->index, mtr);
311 
312 			SRV_CORRUPT_TABLE_CHECK(get_block, return latch_leaves;);
313 
314 			latch_leaves.blocks[0] = get_block;
315 
316 			if (spatial) {
317 				cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
318 					= get_block;
319 			}
320 		}
321 
322 		if (spatial) {
323 			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS + 1]
324 				= mtr_set_savepoint(mtr);
325 		}
326 
327 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
328 		get_block = btr_block_get(
329 			page_id, page_size, RW_X_LATCH, cursor->index, mtr);
330 
331 		SRV_CORRUPT_TABLE_CHECK(get_block, return latch_leaves;);
332 
333 		latch_leaves.blocks[1] = get_block;
334 
335 #ifdef UNIV_BTR_DEBUG
336 		/* Sanity check only after both the blocks are latched. */
337 		if (latch_leaves.blocks[0] != NULL) {
338 			ut_a(page_is_comp(latch_leaves.blocks[0]->frame)
339 				== page_is_comp(page));
340 			ut_a(btr_page_get_next(
341 				latch_leaves.blocks[0]->frame, mtr)
342 				== page_get_page_no(page));
343 		}
344 		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
345 #endif /* UNIV_BTR_DEBUG */
346 
347 		if (spatial) {
348 			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS + 1]
349 				= get_block;
350 		}
351 
352 		right_page_no = btr_page_get_next(page, mtr);
353 
354 		if (right_page_no != FIL_NULL) {
355 			if (spatial) {
356 				cursor->rtr_info->tree_savepoints[
357 					RTR_MAX_LEVELS + 2] = mtr_set_savepoint(
358 								mtr);
359 			}
360 			latch_leaves.savepoints[2] = mtr_set_savepoint(mtr);
361 			get_block = btr_block_get(
362 				page_id_t(page_id.space(), right_page_no),
363 				page_size, RW_X_LATCH, cursor->index, mtr);
364 
365 			SRV_CORRUPT_TABLE_CHECK(get_block, return latch_leaves;);
366 
367 			latch_leaves.blocks[2] = get_block;
368 #ifdef UNIV_BTR_DEBUG
369 			ut_a(page_is_comp(get_block->frame)
370 			     == page_is_comp(page));
371 			ut_a(btr_page_get_prev(get_block->frame, mtr)
372 			     == page_get_page_no(page));
373 #endif /* UNIV_BTR_DEBUG */
374 			if (spatial) {
375 				cursor->rtr_info->tree_blocks[
376 					RTR_MAX_LEVELS + 2] = get_block;
377 			}
378 		}
379 
380 		return(latch_leaves);
381 
382 	case BTR_SEARCH_PREV:
383 	case BTR_MODIFY_PREV:
384 		mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
385 		/* latch also left sibling */
386 		rw_lock_s_lock(&block->lock);
387 		left_page_no = btr_page_get_prev(page, mtr);
388 		rw_lock_s_unlock(&block->lock);
389 
390 		if (left_page_no != FIL_NULL) {
391 			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
392 			get_block = btr_block_get(
393 				page_id_t(page_id.space(), left_page_no),
394 				page_size, mode, cursor->index, mtr);
395 			latch_leaves.blocks[0] = get_block;
396 			cursor->left_block = get_block;
397 
398 			SRV_CORRUPT_TABLE_CHECK(get_block, return latch_leaves;);
399 		}
400 
401 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
402 		get_block = btr_block_get(page_id, page_size, mode,
403 					  cursor->index, mtr);
404 
405 		SRV_CORRUPT_TABLE_CHECK(get_block, return latch_leaves;);
406 
407 		latch_leaves.blocks[1] = get_block;
408 #ifdef UNIV_BTR_DEBUG
409 		/* Sanity check only after both the blocks are latched. */
410 		if (latch_leaves.blocks[0] != NULL) {
411 			ut_a(page_is_comp(latch_leaves.blocks[0]->frame)
			     == page_is_comp(page));
413 			ut_a(btr_page_get_next(latch_leaves.blocks[0]->frame, mtr)
414 			     == page_get_page_no(page));
415 		}
416 		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
417 #endif /* UNIV_BTR_DEBUG */
418 		return(latch_leaves);
419 	case BTR_CONT_MODIFY_TREE:
420 		ut_ad(dict_index_is_spatial(cursor->index));
421 		return(latch_leaves);
422 	}
423 
424 	ut_error;
425 	return(latch_leaves);
426 }
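/* Usage note: btr_cur_search_to_nth_level() calls this function when the
leaf page was fetched with RW_NO_LATCH, so that the leaf (and, for
BTR_MODIFY_TREE, BTR_SEARCH_PREV and BTR_MODIFY_PREV, the required siblings)
is latched according to latch_mode. */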
427 
428 /** Optimistically latches the leaf page or pages requested.
429 @param[in]	block		guessed buffer block
430 @param[in]	modify_clock	modify clock value
431 @param[in,out]	latch_mode	BTR_SEARCH_LEAF, ...
432 @param[in,out]	cursor		cursor
433 @param[in]	file		file name
434 @param[in]	line		line where called
435 @param[in]	mtr		mini-transaction
436 @return true if success */
437 bool
btr_cur_optimistic_latch_leaves(
439 	buf_block_t*	block,
440 	ib_uint64_t	modify_clock,
441 	ulint*		latch_mode,
442 	btr_cur_t*	cursor,
443 	const char*	file,
444 	ulint		line,
445 	mtr_t*		mtr)
446 {
447 	ulint		mode;
448 	ulint		left_page_no;
449 	ut_ad(block->page.buf_fix_count > 0);
450 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
451 
452 	switch (*latch_mode) {
453 	case BTR_SEARCH_LEAF:
454 	case BTR_MODIFY_LEAF:
455 		return(buf_page_optimistic_get(*latch_mode, block,
456 				modify_clock, file, line, mtr));
457 	case BTR_SEARCH_PREV:
458 	case BTR_MODIFY_PREV:
459 		mode = *latch_mode == BTR_SEARCH_PREV
460 			? RW_S_LATCH : RW_X_LATCH;
461 
462 		rw_lock_s_lock(&block->lock);
463 		if (block->modify_clock != modify_clock) {
464 			rw_lock_s_unlock(&block->lock);
465 
466 			return(false);
467 		}
468 		left_page_no = btr_page_get_prev(
469 			buf_block_get_frame(block), mtr);
470 		rw_lock_s_unlock(&block->lock);
471 
472 		if (left_page_no != FIL_NULL) {
473 			const page_id_t	page_id(
474 				dict_index_get_space(cursor->index),
475 				left_page_no);
476 
477 			cursor->left_block = btr_block_get(
478 				page_id,
479 				dict_table_page_size(cursor->index->table),
480 				mode, cursor->index, mtr);
481 		} else {
482 			cursor->left_block = NULL;
483 		}
484 
485 		if (buf_page_optimistic_get(mode, block, modify_clock,
486 					    file, line, mtr)) {
487 			if (btr_page_get_prev(buf_block_get_frame(block), mtr)
488 			    == left_page_no) {
489 				/* We've entered this function with the block already buffer-fixed,
490 				and buf_page_optimistic_get() buffer-fixes it again. The caller should
491 				unfix the block once (to undo their buffer-fixing). */
492 				ut_ad(2 <= block->page.buf_fix_count);
493 				*latch_mode = mode;
494 				return(true);
495 			} else {
				/* Release the block, which also decrements buf_fix_count once,
				undoing the increment done by the successful buf_page_optimistic_get() */
498 				btr_leaf_page_release(block, mode, mtr);
499 			}
500 		}
501 
502 		/* If we are still here then buf_page_optimistic_get() did not buffer-fix
503 		the page, but it should still be buffer-fixed as it was before the call.*/
504 		ut_ad(0 < block->page.buf_fix_count);
505 		/* release the left block */
506 		if (cursor->left_block != NULL) {
507 			btr_leaf_page_release(cursor->left_block,
508 					      mode, mtr);
509 		}
510 
511 		return(false);
512 
513 	default:
514 		ut_error;
515 		return(false);
516 	}
517 }
518 
/**
Gets the intention (btr_intention_t) from latch_mode, and clears the
intention bits in latch_mode.
@param latch_mode	in/out: pointer to latch_mode
@return intention for latching the tree */
524 static
525 btr_intention_t
btr_cur_get_and_clear_intention(
527 	ulint	*latch_mode)
528 {
529 	btr_intention_t	intention;
530 
531 	switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) {
532 	case BTR_LATCH_FOR_INSERT:
533 		intention = BTR_INTENTION_INSERT;
534 		break;
535 	case BTR_LATCH_FOR_DELETE:
536 		intention = BTR_INTENTION_DELETE;
537 		break;
538 	default:
539 		/* both or unknown */
540 		intention = BTR_INTENTION_BOTH;
541 	}
542 	*latch_mode &= ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE);
543 
544 	return(intention);
545 }
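/* A minimal usage sketch with illustrative values: given
latch_mode = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE, the call
btr_cur_get_and_clear_intention(&latch_mode) returns BTR_INTENTION_DELETE
and leaves latch_mode == BTR_MODIFY_TREE. */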
546 
/**
Gets the desired latch type for the root leaf (the root page is also the
leaf) for the given latch mode.
@param latch_mode	in: BTR_SEARCH_LEAF, ...
@return latch type */
552 static
553 rw_lock_type_t
btr_cur_latch_for_root_leaf(
555 	ulint	latch_mode)
556 {
557 	switch (latch_mode) {
558 	case BTR_SEARCH_LEAF:
559 	case BTR_SEARCH_TREE:
560 	case BTR_SEARCH_PREV:
561 		return(RW_S_LATCH);
562 	case BTR_MODIFY_LEAF:
563 	case BTR_MODIFY_TREE:
564 	case BTR_MODIFY_PREV:
565 		return(RW_X_LATCH);
566 	case BTR_CONT_MODIFY_TREE:
567 	case BTR_CONT_SEARCH_TREE:
		/* The root page should already be latched,
		and does not need to be latched here.
		Fall through (RW_NO_LATCH) */
571 	case BTR_NO_LATCHES:
572 		return(RW_NO_LATCH);
573 	}
574 
575 	ut_error;
576 	return(RW_NO_LATCH); /* avoid compiler warnings */
577 }
578 
/** Detects whether modifying the record might require modifying the tree structure.
580 @param[in]	index		index
581 @param[in]	page		page
582 @param[in]	lock_intention	lock intention for the tree operation
583 @param[in]	rec		record (current node_ptr)
584 @param[in]	rec_size	size of the record or max size of node_ptr
585 @param[in]	page_size	page size
586 @param[in]	mtr		mtr
587 @return true if tree modification is needed */
588 static
589 bool
btr_cur_will_modify_tree(
591 	dict_index_t*	index,
592 	const page_t*	page,
593 	btr_intention_t	lock_intention,
594 	const rec_t*	rec,
595 	ulint		rec_size,
596 	const page_size_t&	page_size,
597 	mtr_t*		mtr)
598 {
599 	ut_ad(!page_is_leaf(page));
600 	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
601 					MTR_MEMO_X_LOCK
602 					| MTR_MEMO_SX_LOCK)
603 	      || dict_table_is_intrinsic(index->table));
604 
	/* Pessimistic delete of the first record causes a delete & insert
	of a node_ptr at the upper level. A subsequent page shrink is then
	possible, which in turn deletes a node_ptr at the upper level.
	So we should pay attention not only to the first and last records
	but also to the 2nd record: if the "delete & insert" lands on a
	different page, the 2nd record becomes the first record, and a
	following compress might delete it, causing an upper level
	node_ptr modification. */
613 
614 	if (lock_intention <= BTR_INTENTION_BOTH) {
615 		ulint	margin;
616 
617 		if (lock_intention == BTR_INTENTION_BOTH) {
618 			ulint	level = btr_page_get_level(page, mtr);
619 
			/* This value is the worst-case expectation of how many
			node_ptr records may be deleted from this page. It is
			used to estimate whether the cursor position could be
			the leftmost record in this page or not. */
624 			ulint   max_nodes_deleted = 0;
625 
			/* Tree-modifying operations coming from levels below
			this one can logically cause at most (2 ^ (level - 1))
			record deletions here, even in the rarest case. */
629 			if (level > 7) {
630 				/* TODO: adjust this practical limit. */
631 				max_nodes_deleted = 64;
632 			} else if (level > 0) {
633 				max_nodes_deleted = (ulint)1 << (level - 1);
634 			}
635 
			/* Check what a delete may cause (BTR_INTENTION_BOTH
			or BTR_INTENTION_DELETE). */
638 			if (page_get_n_recs(page) <= max_nodes_deleted * 2
639 			    || page_rec_is_first(rec, page)) {
640 				/* The cursor record can be the left most record
641 				in this page. */
642 				return(true);
643 			}
644 
645 			if (fil_page_get_prev(page) != FIL_NULL
646 			    && page_rec_distance_is_at_most(
647 					page_get_infimum_rec(page), rec,
648 					max_nodes_deleted)) {
649 				return (true);
650 			}
651 
652 			if (fil_page_get_next(page) != FIL_NULL
653 			    && page_rec_distance_is_at_most(
654 					rec, page_get_supremum_rec(page),
655 					max_nodes_deleted)) {
656 				return (true);
657 			}
658 
			/* A delete at the leftmost record of a page causes a
			delete & insert at its parent page. After that, the
			delete might trigger btr_compress(), deleting a record
			at the parent page. Thus we should consider the maximum
			number of deletes. */
663 
664 			margin = rec_size * max_nodes_deleted;
665 		} else {
666 			ut_ad(lock_intention == BTR_INTENTION_DELETE);
667 
668 			margin = rec_size;
669 		}
670 		/* Safe because we already have SX latch of the index tree */
671 		if (page_get_data_size(page)
672 			< margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)
673 		    || (fil_page_get_next(page) == FIL_NULL
674 			&& fil_page_get_prev(page) == FIL_NULL)) {
675 			return(true);
676 		}
677 	}
678 
679 	if (lock_intention >= BTR_INTENTION_BOTH) {
		/* Check what an insert may cause (BTR_INTENTION_BOTH
		or BTR_INTENTION_INSERT). */
682 
		/* If btr_cur_limit_optimistic_insert_debug is in effect,
		check it here in advance, since the maximum number of records
		allowed in a page is limited. */
686 		LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
687 					      return(true));
688 
		/* We need space for 2 records, for the case where a single
		split and insert cannot fit.
		page_get_max_insert_size_after_reorganize() already includes
		space for the page directory. */
693 		ulint	max_size
694 			= page_get_max_insert_size_after_reorganize(page, 2);
695 
696 		if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size
697 		    || max_size < rec_size * 2) {
698 			return(true);
699 		}
		/* TODO: optimize this condition for a compressed page.
		This is based on the worst compression rate. Currently we look
		only at the uncompressed page, but we could also look at the
		compressed page with page_zip_available() if it is already in
		the buffer pool. */
		/* We also need space for 2 records at the worst compression
		rate. */
706 		if (page_size.is_compressed()
707 		    && page_zip_empty_size(index->n_fields,
708 					   page_size.physical())
709 		       < rec_size * 2 + page_get_data_size(page)
710 			 + page_dir_calc_reserved_space(
711 				page_get_n_recs(page) + 2) + 1) {
712 			return(true);
713 		}
714 	}
715 
716 	return(false);
717 }
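/* Illustrative example with assumed numbers: at level 3 with
BTR_INTENTION_BOTH, max_nodes_deleted is 1 << 2 == 4, so the function
returns true if the page holds at most 8 node_ptr records, if the cursor is
within about 4 records of either end of a page that has the corresponding
sibling, or if the remaining data size or a worst-case insert hits the
limits checked above. */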
718 
/** Detects whether modifying the record might require a modification
opposite to the intention.
721 @param[in]	page		page
722 @param[in]	lock_intention	lock intention for the tree operation
723 @param[in]	rec		record (current node_ptr)
724 @return	true if tree modification is needed */
725 static
726 bool
btr_cur_need_opposite_intention(
728 	const page_t*	page,
729 	btr_intention_t	lock_intention,
730 	const rec_t*	rec)
731 {
732 	switch (lock_intention) {
733 	case BTR_INTENTION_DELETE:
734 		return((mach_read_from_4(page + FIL_PAGE_PREV) != FIL_NULL
735 			&& page_rec_is_first(rec, page))
736 		       || (mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
737 			   && page_rec_is_last(rec, page)));
738 	case BTR_INTENTION_INSERT:
739 		return(mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
740 		       && page_rec_is_last(rec, page));
741 	case BTR_INTENTION_BOTH:
742 		return(false);
743 	}
744 
745 	ut_error;
746 	return(false);
747 }
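/* Illustrative example: with BTR_INTENTION_DELETE, if the cursor is on the
first record of a page that has a left sibling (or on the last record of a
page that has a right sibling), removing it may require a node_ptr insert at
the upper level, i.e. the opposite modification to the stated intention. */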
748 
749 /********************************************************************//**
750 Searches an index tree and positions a tree cursor on a given level.
751 NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
752 to node pointer page number fields on the upper levels of the tree!
753 Note that if mode is PAGE_CUR_LE, which is used in inserts, then
754 cursor->up_match and cursor->low_match both will have sensible values.
If mode is PAGE_CUR_GE, then up_match will have a sensible value.

If mode is PAGE_CUR_LE, the cursor is left at the place where an insert of the
758 search tuple should be performed in the B-tree. InnoDB does an insert
759 immediately after the cursor. Thus, the cursor may end up on a user record,
760 or on a page infimum record. */
761 dberr_t
btr_cur_search_to_nth_level(
763 /*========================*/
764 	dict_index_t*	index,	/*!< in: index */
765 	ulint		level,	/*!< in: the tree level of search */
766 	const dtuple_t*	tuple,	/*!< in: data tuple; NOTE: n_fields_cmp in
767 				tuple must be set so that it cannot get
768 				compared to the node ptr page number field! */
769 	page_cur_mode_t	mode,	/*!< in: PAGE_CUR_L, ...;
770 				Inserts should always be made using
771 				PAGE_CUR_LE to search the position! */
772 	ulint		latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
773 				at most one of BTR_INSERT, BTR_DELETE_MARK,
774 				BTR_DELETE, or BTR_ESTIMATE;
775 				cursor->left_block is used to store a pointer
776 				to the left neighbor page, in the cases
777 				BTR_SEARCH_PREV and BTR_MODIFY_PREV;
				NOTE that if has_search_latch
				is != 0, we may not have a latch set
				on the cursor page; we assume
				the caller uses their search latch
				to protect the record! */
783 	btr_cur_t*	cursor, /*!< in/out: tree cursor; the cursor page is
784 				s- or x-latched, but see also above! */
785 	ulint		has_search_latch,
786 				/*!< in: info on the latch mode the
787 				caller currently has on search system:
788 				RW_S_LATCH, or 0 */
789 	const char*	file,	/*!< in: file name */
790 	ulint		line,	/*!< in: line where called */
791 	mtr_t*		mtr)	/*!< in: mtr */
792 {
793 	page_t*		page = NULL; /* remove warning */
794 	buf_block_t*	block;
795 	ulint		height;
796 	ulint		up_match;
797 	ulint		up_bytes;
798 	ulint		low_match;
799 	ulint		low_bytes;
800 	ulint		savepoint;
801 	ulint		rw_latch;
802 	page_cur_mode_t	page_mode;
803 	page_cur_mode_t	search_mode = PAGE_CUR_UNSUPP;
804 	ulint		buf_mode;
805 	ulint		estimate;
806 	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
807 	page_cur_t*	page_cursor;
808 	btr_op_t	btr_op;
809 	ulint		root_height = 0; /* remove warning */
810 	dberr_t		err = DB_SUCCESS;
811 
812 	ulint		upper_rw_latch, root_leaf_rw_latch;
813 	btr_intention_t	lock_intention;
814 	bool		modify_external;
815 	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
816 	ulint		tree_savepoints[BTR_MAX_LEVELS];
817 	ulint		n_blocks = 0;
818 	ulint		n_releases = 0;
819 	bool		detected_same_key_root = false;
820 
821 	bool		retrying_for_search_prev = false;
822 	ulint		leftmost_from_level = 0;
823 	buf_block_t**	prev_tree_blocks = NULL;
824 	ulint*		prev_tree_savepoints = NULL;
825 	ulint		prev_n_blocks = 0;
826 	ulint		prev_n_releases = 0;
827 	bool		need_path = true;
828 	bool		rtree_parent_modified = false;
829 	bool		mbr_adj = false;
830 	bool		found = false;
831 
832 	DBUG_ENTER("btr_cur_search_to_nth_level");
833 
834 	btr_search_t*	info;
835 	mem_heap_t*	heap		= NULL;
836 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
837 	ulint*		offsets		= offsets_;
838 	ulint		offsets2_[REC_OFFS_NORMAL_SIZE];
839 	ulint*		offsets2	= offsets2_;
840 	rec_offs_init(offsets_);
841 	rec_offs_init(offsets2_);
	/* Currently, PAGE_CUR_LE is the only search mode used for searches
	ending at upper levels */
844 
845 	ut_ad(level == 0 || mode == PAGE_CUR_LE
846 	      || RTREE_SEARCH_MODE(mode));
847 	ut_ad(dict_index_check_search_tuple(index, tuple));
848 	ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
849 	ut_ad(dtuple_check_typed(tuple));
850 	ut_ad(!(index->type & DICT_FTS));
851 	ut_ad(index->page != FIL_NULL);
852 
853 	UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
854 	UNIV_MEM_INVALID(&cursor->up_bytes, sizeof cursor->up_bytes);
855 	UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
856 	UNIV_MEM_INVALID(&cursor->low_bytes, sizeof cursor->low_bytes);
857 #ifdef UNIV_DEBUG
858 	cursor->up_match = ULINT_UNDEFINED;
859 	cursor->low_match = ULINT_UNDEFINED;
860 #endif /* UNIV_DEBUG */
861 
862 	ibool	s_latch_by_caller;
863 
864 	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
865 
866 	ut_ad(!s_latch_by_caller
867 	      || srv_read_only_mode
868 	      || mtr_memo_contains_flagged(mtr,
869 					   dict_index_get_lock(index),
870 					   MTR_MEMO_S_LOCK
871 					   | MTR_MEMO_SX_LOCK));
872 
873 	/* These flags are mutually exclusive, they are lumped together
874 	with the latch mode for historical reasons. It's possible for
875 	none of the flags to be set. */
876 	switch (UNIV_EXPECT(latch_mode
877 			    & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
878 			    0)) {
879 	case 0:
880 		btr_op = BTR_NO_OP;
881 		break;
882 	case BTR_INSERT:
883 		btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
884 			? BTR_INSERT_IGNORE_UNIQUE_OP
885 			: BTR_INSERT_OP;
886 		break;
887 	case BTR_DELETE:
888 		btr_op = BTR_DELETE_OP;
889 		ut_a(cursor->purge_node);
890 		break;
891 	case BTR_DELETE_MARK:
892 		btr_op = BTR_DELMARK_OP;
893 		break;
894 	default:
895 		/* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
896 		should be specified at a time */
897 		ut_error;
898 	}
899 
900 	/* Operations on the insert buffer tree cannot be buffered. */
901 	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
902 	/* Operations on the clustered index cannot be buffered. */
903 	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
904 	/* Operations on the temporary table(indexes) cannot be buffered. */
905 	ut_ad(btr_op == BTR_NO_OP || !dict_table_is_temporary(index->table));
906 	/* Operation on the spatial index cannot be buffered. */
907 	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index));
908 
909 	estimate = latch_mode & BTR_ESTIMATE;
910 
911 	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
912 
913 	modify_external = latch_mode & BTR_MODIFY_EXTERNAL;
914 
915 	/* Turn the flags unrelated to the latch mode off. */
916 	latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
917 
918 	ut_ad(!modify_external || latch_mode == BTR_MODIFY_LEAF);
919 
920 	ut_ad(!s_latch_by_caller
921 	      || latch_mode == BTR_SEARCH_LEAF
922 	      || latch_mode == BTR_SEARCH_TREE
923 	      || latch_mode == BTR_MODIFY_LEAF);
924 
925 	cursor->flag = BTR_CUR_BINARY;
926 	cursor->index = index;
927 
928 	info = btr_search_get_info(index);
929 
930 # ifdef UNIV_SEARCH_PERF_STAT
931 	info->n_searches++;
932 # endif
	/* Use of the AHI is disabled for intrinsic tables, as these tables
	re-use the index-id and AHI validation is based on the index-id. */
935 	if (rw_lock_get_writer(btr_get_search_latch(index))
936 		== RW_LOCK_NOT_LOCKED
937 	    && latch_mode <= BTR_MODIFY_LEAF
938 	    && info->last_hash_succ
939 	    && !index->disable_ahi
940 	    && !estimate
941 # ifdef PAGE_CUR_LE_OR_EXTENDS
942 	    && mode != PAGE_CUR_LE_OR_EXTENDS
943 # endif /* PAGE_CUR_LE_OR_EXTENDS */
944 	    && !dict_index_is_spatial(index)
945 	    /* If !has_search_latch, we do a dirty read of
946 	    btr_search_enabled below, and btr_search_guess_on_hash()
947 	    will have to check it again. */
948 	    && UNIV_LIKELY(btr_search_enabled)
949 	    && !modify_external
950 	    && btr_search_guess_on_hash(index, info, tuple, mode,
951 					latch_mode, cursor,
952 					has_search_latch, mtr)) {
953 
954 		/* Search using the hash index succeeded */
955 
956 		ut_ad(cursor->up_match != ULINT_UNDEFINED
957 		      || mode != PAGE_CUR_GE);
958 		ut_ad(cursor->up_match != ULINT_UNDEFINED
959 		      || mode != PAGE_CUR_LE);
960 		ut_ad(cursor->low_match != ULINT_UNDEFINED
961 		      || mode != PAGE_CUR_LE);
962 		btr_cur_n_sea++;
963 
964 		DBUG_RETURN(err);
965 	}
966 	btr_cur_n_non_sea++;
967 
968 	/* If the hash search did not succeed, do binary search down the
969 	tree */
970 
971 	if (has_search_latch) {
972 		/* Release possible search latch to obey latching order */
973 		rw_lock_s_unlock(btr_get_search_latch(index));
974 	}
975 
976 	/* Store the position of the tree latch we push to mtr so that we
977 	know how to release it when we have latched leaf node(s) */
978 
979 	savepoint = mtr_set_savepoint(mtr);
980 
981 	switch (latch_mode) {
982 	case BTR_MODIFY_TREE:
		/* Most delete-intended operations are purges.
		Free blocks and read IO bandwidth should be given to them
		with priority when the history list is growing huge. */
986 		if (lock_intention == BTR_INTENTION_DELETE
987 		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
988 			&& buf_get_n_pending_read_ios()) {
989 			mtr_x_lock(dict_index_get_lock(index), mtr);
990 		} else if (dict_index_is_spatial(index)
991 			   && lock_intention <= BTR_INTENTION_BOTH) {
			/* X-lock the index if there is a possibility of a
			pessimistic delete on a spatial index, as we could
			lock upward in the tree */
995 
996 			mtr_x_lock(dict_index_get_lock(index), mtr);
997 		} else {
998 			mtr_sx_lock(dict_index_get_lock(index), mtr);
999 		}
1000 		upper_rw_latch = RW_X_LATCH;
1001 		break;
1002 	case BTR_CONT_MODIFY_TREE:
1003 	case BTR_CONT_SEARCH_TREE:
1004 		/* Do nothing */
1005 		ut_ad(srv_read_only_mode
1006 		      || mtr_memo_contains_flagged(mtr,
1007 						   dict_index_get_lock(index),
1008 						   MTR_MEMO_X_LOCK
1009 						   | MTR_MEMO_SX_LOCK));
1010 		if (dict_index_is_spatial(index)
1011 		    && latch_mode == BTR_CONT_MODIFY_TREE) {
			/* If we are about to locate the parent page for a
			split and/or merge operation on an R-Tree index,
			X-latch the parent */
1015 			upper_rw_latch = RW_X_LATCH;
1016 		} else {
1017 			upper_rw_latch = RW_NO_LATCH;
1018 		}
1019 		break;
1020 	default:
1021 		if (!srv_read_only_mode) {
1022 			if (s_latch_by_caller) {
1023 				ut_ad(rw_lock_own(dict_index_get_lock(index),
1024 				              RW_LOCK_S));
1025 			} else if (!modify_external) {
1026 				/* BTR_SEARCH_TREE is intended to be used with
1027 				BTR_ALREADY_S_LATCHED */
1028 				ut_ad(latch_mode != BTR_SEARCH_TREE);
1029 
1030 				mtr_s_lock(dict_index_get_lock(index), mtr);
1031 			} else {
1032 				/* BTR_MODIFY_EXTERNAL needs to be excluded */
1033 				mtr_sx_lock(dict_index_get_lock(index), mtr);
1034 			}
1035 			upper_rw_latch = RW_S_LATCH;
1036 		} else {
1037 			upper_rw_latch = RW_NO_LATCH;
1038 		}
1039 	}
1040 	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
1041 
1042 	page_cursor = btr_cur_get_page_cur(cursor);
1043 
1044 	const ulint		space = dict_index_get_space(index);
1045 	const page_size_t	page_size(dict_table_page_size(index->table));
1046 
1047 	/* Start with the root page. */
1048 	page_id_t		page_id(space, dict_index_get_page(index));
1049 
1050 	if (root_leaf_rw_latch == RW_X_LATCH) {
1051 		node_ptr_max_size = dict_index_node_ptr_max_size(index);
1052 	}
1053 
1054 	up_match = 0;
1055 	up_bytes = 0;
1056 	low_match = 0;
1057 	low_bytes = 0;
1058 
1059 	height = ULINT_UNDEFINED;
1060 
1061 	/* We use these modified search modes on non-leaf levels of the
1062 	B-tree. These let us end up in the right B-tree leaf. In that leaf
1063 	we use the original search mode. */
1064 
1065 	switch (mode) {
1066 	case PAGE_CUR_GE:
1067 		page_mode = PAGE_CUR_L;
1068 		break;
1069 	case PAGE_CUR_G:
1070 		page_mode = PAGE_CUR_LE;
1071 		break;
1072 	default:
1073 #ifdef PAGE_CUR_LE_OR_EXTENDS
1074 		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
1075 		      || RTREE_SEARCH_MODE(mode)
1076 		      || mode == PAGE_CUR_LE_OR_EXTENDS);
1077 #else /* PAGE_CUR_LE_OR_EXTENDS */
1078 		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
1079 		      || RTREE_SEARCH_MODE(mode));
1080 #endif /* PAGE_CUR_LE_OR_EXTENDS */
1081 		page_mode = mode;
1082 		break;
1083 	}
1084 
1085 	/* Loop and search until we arrive at the desired level */
1086 	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
1087 
1088 search_loop:
1089 	buf_mode = BUF_GET;
1090 	rw_latch = RW_NO_LATCH;
1091 	rtree_parent_modified = false;
1092 
1093 	if (height != 0) {
1094 		/* We are about to fetch the root or a non-leaf page. */
1095 		if ((latch_mode != BTR_MODIFY_TREE
1096 		     || height == level)
1097 		    && !retrying_for_search_prev) {
			/* If we do not have an SX or X latch on the index,
			each page should be latched before reading. */
1100 			if (modify_external
1101 			    && height == ULINT_UNDEFINED
1102 			    && upper_rw_latch == RW_S_LATCH) {
				/* We need an sx-latch on the root page
				for the fseg operation */
1105 				rw_latch = RW_SX_LATCH;
1106 			} else {
1107 				rw_latch = upper_rw_latch;
1108 			}
1109 		}
1110 	} else if (latch_mode <= BTR_MODIFY_LEAF) {
1111 		rw_latch = latch_mode;
1112 
1113 		if (btr_op != BTR_NO_OP
1114 		    && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {
1115 
1116 			/* Try to buffer the operation if the leaf
1117 			page is not in the buffer pool. */
1118 
1119 			buf_mode = btr_op == BTR_DELETE_OP
1120 				? BUF_GET_IF_IN_POOL_OR_WATCH
1121 				: BUF_GET_IF_IN_POOL;
1122 		}
1123 	}
1124 
1125 retry_page_get:
1126 	ut_ad(n_blocks < BTR_MAX_LEVELS);
1127 	tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
1128 	block = buf_page_get_gen(
1129 		page_id, page_size, rw_latch,
1130 		(height == ULINT_UNDEFINED ? info->root_guess : NULL),
1131 		buf_mode, file, line, mtr, false, &err
1132 	);
1133 
1134 	tree_blocks[n_blocks] = block;
1135 
1136 	if (err != DB_SUCCESS) {
1137 		ut_ad(block == NULL);
1138 		if (err == DB_DECRYPTION_FAILED) {
1139 			ib::warn() << "Table is encrypted but encryption service or"
1140 				" used key_id is not available. "
1141 				" Can't continue reading table.";
1142 
1143 			page_cursor->block = 0;
1144 			page_cursor->rec = 0;
1145 			index->table->set_file_unreadable();
1146 			if (estimate) {
1147 
1148 				cursor->path_arr->nth_rec =
1149 					ULINT_UNDEFINED;
1150 			}
1151 		}
1152 
1153 		goto func_exit;
1154 	}
1155 
1156 	if (block == NULL) {
1157 		SRV_CORRUPT_TABLE_CHECK(buf_mode == BUF_GET_IF_IN_POOL ||
1158 					buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH,
1159 			{
1160 				page_cursor->block = 0;
1161 				page_cursor->rec = 0;
1162 				if (estimate) {
1163 
1164 					cursor->path_arr->nth_rec =
1165 						ULINT_UNDEFINED;
1166 				}
1167 
1168 				goto func_exit;
1169 			});
1170 
		/* This must be a search to perform an insert/delete-mark/
		delete; try using the insert/delete buffer */
1173 
1174 		ut_ad(height == 0);
1175 		ut_ad(cursor->thr);
1176 
1177 		switch (btr_op) {
1178 		case BTR_INSERT_OP:
1179 		case BTR_INSERT_IGNORE_UNIQUE_OP:
1180 			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
1181 			ut_ad(!dict_index_is_spatial(index));
1182 
1183 			if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
1184 					page_id, page_size, cursor->thr)) {
1185 
1186 				cursor->flag = BTR_CUR_INSERT_TO_IBUF;
1187 
1188 				goto func_exit;
1189 			}
1190 			break;
1191 
1192 		case BTR_DELMARK_OP:
1193 			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
1194 			ut_ad(!dict_index_is_spatial(index));
1195 
1196 			if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
1197 					index, page_id, page_size,
1198 					cursor->thr)) {
1199 
1200 				cursor->flag = BTR_CUR_DEL_MARK_IBUF;
1201 
1202 				goto func_exit;
1203 			}
1204 
1205 			break;
1206 
1207 		case BTR_DELETE_OP:
1208 			ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
1209 			ut_ad(!dict_index_is_spatial(index));
1210 
1211 			if (!row_purge_poss_sec(cursor->purge_node,
1212 						index, tuple)) {
1213 
1214 				/* The record cannot be purged yet. */
1215 				cursor->flag = BTR_CUR_DELETE_REF;
1216 			} else if (ibuf_insert(IBUF_OP_DELETE, tuple,
1217 					       index, page_id, page_size,
1218 					       cursor->thr)) {
1219 
1220 				/* The purge was buffered. */
1221 				cursor->flag = BTR_CUR_DELETE_IBUF;
1222 			} else {
1223 				/* The purge could not be buffered. */
1224 				buf_pool_watch_unset(page_id);
1225 				break;
1226 			}
1227 
1228 			buf_pool_watch_unset(page_id);
1229 			goto func_exit;
1230 
1231 		default:
1232 			ut_error;
1233 		}
1234 
1235 		/* Insert to the insert/delete buffer did not succeed, we
1236 		must read the page from disk. */
1237 
1238 		buf_mode = BUF_GET;
1239 
1240 		goto retry_page_get;
1241 	}
1242 
1243 	if (retrying_for_search_prev && height != 0) {
1244 		/* also latch left sibling */
1245 		ulint		left_page_no;
1246 		buf_block_t*	get_block;
1247 
1248 		ut_ad(rw_latch == RW_NO_LATCH);
1249 
1250 		rw_latch = upper_rw_latch;
1251 
1252 		rw_lock_s_lock(&block->lock);
1253 		left_page_no = btr_page_get_prev(
1254 			buf_block_get_frame(block), mtr);
1255 		rw_lock_s_unlock(&block->lock);
1256 
1257 		if (left_page_no != FIL_NULL) {
1258 			ut_ad(prev_n_blocks < leftmost_from_level);
1259 
1260 			prev_tree_savepoints[prev_n_blocks]
1261 				= mtr_set_savepoint(mtr);
1262 			get_block = buf_page_get_gen(
1263 				page_id_t(page_id.space(), left_page_no),
1264 				page_size, rw_latch, NULL, buf_mode,
1265 				file, line, mtr, false, &err);
1266 			prev_tree_blocks[prev_n_blocks] = get_block;
1267 			prev_n_blocks++;
1268 
1269 			if (err != DB_SUCCESS) {
1270 				if (err == DB_DECRYPTION_FAILED) {
1271 					ib::warn() << "Table is encrypted but encryption service or"
1272 						" used key_id is not available. "
1273 						" Can't continue reading table.";
1274 					if (estimate) {
1275 
1276 						page_cursor->block = 0;
1277 						page_cursor->rec = 0;
1278 						cursor->path_arr->nth_rec =
1279 							ULINT_UNDEFINED;
1280 					}
1281 					index->table->set_file_unreadable();
1282 				}
1283 				goto func_exit;
1284 			}
1285 
1286 
			/* BTR_MODIFY_TREE does not update prev/next_page_no
			without holding the parent page's lock. So there is no
			need to retry here, because we hold the parent page's
			lock. */
1290 		}
1291 
1292 		/* release RW_NO_LATCH page and lock with RW_S_LATCH */
1293 		mtr_release_block_at_savepoint(
1294 			mtr, tree_savepoints[n_blocks],
1295 			tree_blocks[n_blocks]);
1296 
1297 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
1298 		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
1299 					 buf_mode, file, line, mtr, false, &err);
1300 		tree_blocks[n_blocks] = block;
1301 
1302 		if (err != DB_SUCCESS) {
1303 			if (err == DB_DECRYPTION_FAILED) {
1304 				ib::warn() << "Table is encrypted but encryption service or"
1305 					" used key_id is not available. "
1306 					" Can't continue reading table.";
1307 				if (estimate) {
1308 					page_cursor->block = 0;
1309 					page_cursor->rec = 0;
1310 
1311 					cursor->path_arr->nth_rec =
1312 						ULINT_UNDEFINED;
1313 				}
1314 				index->table->set_file_unreadable();
1315 			}
1316 
1317 			goto func_exit;
1318 		}
1319 	}
1320 
1321 	page = buf_block_get_frame(block);
1322 
1323 	SRV_CORRUPT_TABLE_CHECK(page,
1324 		{
1325 		    page_cursor->block = 0;
1326 		    page_cursor->rec = 0;
1327 
1328 		    if (estimate) {
1329 
1330 			cursor->path_arr->nth_rec = ULINT_UNDEFINED;
1331 		    }
1332 
1333 		    goto func_exit;
1334 		});
1335 
1336 	if (height == ULINT_UNDEFINED
1337 	    && page_is_leaf(page)
1338 	    && rw_latch != RW_NO_LATCH
1339 	    && rw_latch != root_leaf_rw_latch) {
		/* We should retry fetching the page, because the root page
		has been latched in a different mode than a leaf page
		requires. */
1342 		ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
1343 		ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH);
1344 		ut_ad(rw_latch == RW_S_LATCH || modify_external);
1345 
1346 		ut_ad(n_blocks == 0);
1347 		mtr_release_block_at_savepoint(
1348 			mtr, tree_savepoints[n_blocks],
1349 			tree_blocks[n_blocks]);
1350 
1351 		upper_rw_latch = root_leaf_rw_latch;
1352 		goto search_loop;
1353 	}
1354 
1355 	if (rw_latch != RW_NO_LATCH) {
1356 #ifdef UNIV_ZIP_DEBUG
1357 		const page_zip_des_t*	page_zip
1358 			= buf_block_get_page_zip(block);
1359 		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
1360 #endif /* UNIV_ZIP_DEBUG */
1361 
1362 		buf_block_dbg_add_level(
1363 			block, dict_index_is_ibuf(index)
1364 			? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
1365 	}
1366 
1367 	ut_ad(fil_page_index_page_check(page));
1368 	ut_ad(index->id == btr_page_get_index_id(page));
1369 
1370 	if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
1371 		/* We are in the root node */
1372 
1373 		height = btr_page_get_level(page, mtr);
1374 		root_height = height;
1375 		cursor->tree_height = root_height + 1;
1376 
1377 		if (dict_index_is_spatial(index)) {
1378 			ut_ad(cursor->rtr_info);
1379 
1380 			node_seq_t      seq_no = rtr_get_current_ssn_id(index);
1381 
1382 			/* If SSN in memory is not initialized, fetch
1383 			it from root page */
1384 			if (seq_no < 1) {
1385 				node_seq_t      root_seq_no;
1386 
1387 				root_seq_no = page_get_ssn_id(page);
1388 
1389 				mutex_enter(&(index->rtr_ssn.mutex));
1390 				index->rtr_ssn.seq_no = root_seq_no + 1;
1391 				mutex_exit(&(index->rtr_ssn.mutex));
1392 			}
1393 
1394 			/* Save the MBR */
1395 			cursor->rtr_info->thr = cursor->thr;
1396 			rtr_get_mbr_from_tuple(tuple, &cursor->rtr_info->mbr);
1397 		}
1398 
1399 		info->root_guess = block;
1400 	}
1401 
1402 	if (height == 0) {
1403 		if (rw_latch == RW_NO_LATCH) {
1404 
1405 			latch_leaves = btr_cur_latch_leaves(
1406 				block, page_id, page_size, latch_mode,
1407 				cursor, mtr);
1408 		}
1409 
1410 		switch (latch_mode) {
1411 		case BTR_MODIFY_TREE:
1412 		case BTR_CONT_MODIFY_TREE:
1413 		case BTR_CONT_SEARCH_TREE:
1414 			break;
1415 		default:
1416 			if (!s_latch_by_caller
1417 			    && !srv_read_only_mode
1418 			    && !modify_external) {
1419 				/* Release the tree s-latch */
1420 				/* NOTE: BTR_MODIFY_EXTERNAL
1421 				needs to keep tree sx-latch */
1422 				mtr_release_s_latch_at_savepoint(
1423 					mtr, savepoint,
1424 					dict_index_get_lock(index));
1425 			}
1426 
1427 			/* release upper blocks */
1428 			if (retrying_for_search_prev) {
1429 				for (;
1430 				     prev_n_releases < prev_n_blocks;
1431 				     prev_n_releases++) {
1432 					mtr_release_block_at_savepoint(
1433 						mtr,
1434 						prev_tree_savepoints[
1435 							prev_n_releases],
1436 						prev_tree_blocks[
1437 							prev_n_releases]);
1438 				}
1439 			}
1440 
1441 			for (; n_releases < n_blocks; n_releases++) {
1442 				if (n_releases == 0 && modify_external) {
1443 					/* keep latch of root page */
1444 					ut_ad(mtr_memo_contains_flagged(
1445 						mtr, tree_blocks[n_releases],
1446 						MTR_MEMO_PAGE_SX_FIX
1447 						| MTR_MEMO_PAGE_X_FIX));
1448 					continue;
1449 				}
1450 
1451 				mtr_release_block_at_savepoint(
1452 					mtr, tree_savepoints[n_releases],
1453 					tree_blocks[n_releases]);
1454 			}
1455 		}
1456 
1457 		page_mode = mode;
1458 	}
1459 
1460 	if (dict_index_is_spatial(index)) {
1461 		/* Remember the page search mode */
1462 		search_mode = page_mode;
1463 
		/* Adjust the search mode when the page search mode
		is PAGE_CUR_RTREE_LOCATE or PAGE_CUR_RTREE_INSERT,
		as we are searching with MBRs. When we are not at the
		target level, we should search all sub-trees that
		"CONTAIN" the search range/MBR. At the target level,
		the search becomes PAGE_CUR_LE */
1471 		if (page_mode == PAGE_CUR_RTREE_LOCATE
1472 		    && level == height) {
1473 			if (level == 0) {
1474 				page_mode = PAGE_CUR_LE;
1475 			} else {
1476 				page_mode = PAGE_CUR_RTREE_GET_FATHER;
1477 			}
1478 		}
1479 
1480 		if (page_mode == PAGE_CUR_RTREE_INSERT) {
1481 			page_mode = (level == height)
1482 					? PAGE_CUR_LE
1483 					: PAGE_CUR_RTREE_INSERT;
1484 
1485 			ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE);
1486 		}
1487 
		/* "need_path" indicates whether we need to track the parent
		pages. If this is not a spatial comparison, there is no need
		to track them */
1491 		if (page_mode < PAGE_CUR_CONTAIN) {
1492 			need_path = false;
1493 		}
1494 
1495 		up_match = 0;
1496 		low_match = 0;
1497 
1498 		if (latch_mode == BTR_MODIFY_TREE
1499 		    || latch_mode == BTR_CONT_MODIFY_TREE
1500 		    || latch_mode == BTR_CONT_SEARCH_TREE) {
			/* The tree is locked; no need for a page lock to
			protect the "path" */
1503 			cursor->rtr_info->need_page_lock = false;
1504 		}
1505         }
1506 
1507 	if (dict_index_is_spatial(index) && page_mode >= PAGE_CUR_CONTAIN) {
1508 		ut_ad(need_path);
1509 		found = rtr_cur_search_with_match(
1510 			block, index, tuple, page_mode, page_cursor,
1511 			cursor->rtr_info);
1512 
1513 		/* Need to use BTR_MODIFY_TREE to do the MBR adjustment */
1514 		if (search_mode == PAGE_CUR_RTREE_INSERT
1515 		    && cursor->rtr_info->mbr_adj) {
1516 			if (latch_mode & BTR_MODIFY_LEAF) {
				/* The parent MBR needs to be updated; the
				caller should retry with BTR_MODIFY_TREE */
1519 				goto func_exit;
1520 			} else if (latch_mode & BTR_MODIFY_TREE) {
1521 				rtree_parent_modified = true;
1522 				cursor->rtr_info->mbr_adj = false;
1523 				mbr_adj = true;
1524 			} else {
1525 				ut_ad(0);
1526 			}
1527 		}
1528 
1529 		if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) {
1530 			cursor->low_match =
1531 				DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
1532 		}
1533 	} else if (height == 0 && btr_search_enabled
1534 		   && !dict_index_is_spatial(index)) {
1535 		/* The adaptive hash index is only used when searching
1536 		for leaf pages (height==0), but not in r-trees.
1537 		We only need the byte prefix comparison for the purpose
1538 		of updating the adaptive hash index. */
1539 		page_cur_search_with_match_bytes(
1540 			block, index, tuple, page_mode, &up_match, &up_bytes,
1541 			&low_match, &low_bytes, page_cursor);
1542 	} else {
1543 		/* Search for complete index fields. */
1544 		up_bytes = low_bytes = 0;
1545 		page_cur_search_with_match(
1546 			block, index, tuple, page_mode, &up_match,
1547 			&low_match, page_cursor,
1548 			need_path ? cursor->rtr_info : NULL);
1549 	}
1550 
1551 	if (estimate) {
1552 		btr_cur_add_path_info(cursor, height, root_height);
1553 	}
1554 
1555 	/* If this is the desired level, leave the loop */
1556 
1557 	ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor),
1558 					   mtr));
1559 
	/* Add a predicate lock if the isolation level is serializable,
	and only in the search case */
1562 	if (dict_index_is_spatial(index)
1563 	    && cursor->rtr_info->need_prdt_lock
1564 	    && mode != PAGE_CUR_RTREE_INSERT
1565 	    && mode != PAGE_CUR_RTREE_LOCATE
1566 	    && mode >= PAGE_CUR_CONTAIN) {
1567 		trx_t*		trx = thr_get_trx(cursor->thr);
1568 		lock_prdt_t	prdt;
1569 
1570 		lock_mutex_enter();
1571 		lock_init_prdt_from_mbr(
1572 			&prdt, &cursor->rtr_info->mbr, mode,
1573 			trx->lock.lock_heap);
1574 		lock_mutex_exit();
1575 
1576 		if (rw_latch == RW_NO_LATCH && height != 0) {
1577 			rw_lock_s_lock(&(block->lock));
1578 		}
1579 
1580 		lock_prdt_lock(block, &prdt, index, LOCK_S,
1581 			       LOCK_PREDICATE, cursor->thr, mtr);
1582 
1583 		if (rw_latch == RW_NO_LATCH && height != 0) {
1584 			rw_lock_s_unlock(&(block->lock));
1585 		}
1586 	}
1587 
1588 	if (level != height) {
1589 
1590 		const rec_t*	node_ptr;
1591 		ut_ad(height > 0);
1592 
1593 		height--;
1594 
1595 		node_ptr = page_cur_get_rec(page_cursor);
1596 
1597 		offsets = rec_get_offsets(
1598 			node_ptr, index, offsets, ULINT_UNDEFINED, &heap);
1599 
		/* If the rec is the first or last in the page and the
		intention is pessimistic delete, it might cause a node_ptr
		insert at the upper level. We should change the intention
		and retry. */
1604 		if (latch_mode == BTR_MODIFY_TREE
1605 		    && btr_cur_need_opposite_intention(
1606 			page, lock_intention, node_ptr)) {
1607 
1608 need_opposite_intention:
1609 			ut_ad(upper_rw_latch == RW_X_LATCH);
1610 
1611 			if (n_releases > 0) {
1612 				/* release root block */
1613 				mtr_release_block_at_savepoint(
1614 					mtr, tree_savepoints[0],
1615 					tree_blocks[0]);
1616 			}
1617 
1618 			/* release all blocks */
1619 			for (; n_releases <= n_blocks; n_releases++) {
1620 				mtr_release_block_at_savepoint(
1621 					mtr, tree_savepoints[n_releases],
1622 					tree_blocks[n_releases]);
1623 			}
1624 
1625 			lock_intention = BTR_INTENTION_BOTH;
1626 
1627 			page_id.reset(space, dict_index_get_page(index));
1628 			up_match = 0;
1629 			low_match = 0;
1630 			height = ULINT_UNDEFINED;
1631 
1632 			n_blocks = 0;
1633 			n_releases = 0;
1634 
1635 			goto search_loop;
1636 		}
1637 
1638 		if (dict_index_is_spatial(index)) {
1639 			if (page_rec_is_supremum(node_ptr)) {
1640 				cursor->low_match = 0;
1641 				cursor->up_match = 0;
1642 				goto func_exit;
1643 			}
1644 
1645 			/* If we are doing insertion or record locating,
1646 			remember the tree nodes we visited */
1647 			if (page_mode == PAGE_CUR_RTREE_INSERT
1648 			    || (search_mode == PAGE_CUR_RTREE_LOCATE
1649 			        && (latch_mode != BTR_MODIFY_LEAF))) {
1650 				bool		add_latch = false;
1651 
1652 				if (latch_mode == BTR_MODIFY_TREE
1653 				    && rw_latch == RW_NO_LATCH) {
1654 					ut_ad(mtr_memo_contains_flagged(
1655 						mtr, dict_index_get_lock(index),
1656 						MTR_MEMO_X_LOCK
1657 						| MTR_MEMO_SX_LOCK));
1658 					rw_lock_s_lock(&block->lock);
1659 					add_latch = true;
1660 				}
1661 
1662 				/* Store the parent cursor location */
1663 #ifdef UNIV_DEBUG
1664 				ulint	num_stored = rtr_store_parent_path(
1665 					block, cursor, latch_mode,
1666 					height + 1, mtr);
1667 #else
1668 				rtr_store_parent_path(
1669 					block, cursor, latch_mode,
1670 					height + 1, mtr);
1671 #endif
1672 
1673 				if (page_mode == PAGE_CUR_RTREE_INSERT) {
1674 					btr_pcur_t*     r_cursor =
1675 						rtr_get_parent_cursor(
1676 							cursor, height + 1,
1677 							true);
1678 					/* If it is insertion, there should
1679 					be only one parent for each level
1680 					traverse */
1681 #ifdef UNIV_DEBUG
1682 					ut_ad(num_stored == 1);
1683 #endif
1684 
1685 					node_ptr = btr_pcur_get_rec(r_cursor);
1686 
1687 				}
1688 
1689 				if (add_latch) {
1690 					rw_lock_s_unlock(&block->lock);
1691 				}
1692 
1693 				ut_ad(!page_rec_is_supremum(node_ptr));
1694 			}
1695 
1696 			ut_ad(page_mode == search_mode
1697 			      || (page_mode == PAGE_CUR_WITHIN
1698 				  && search_mode == PAGE_CUR_RTREE_LOCATE));
1699 
1700 			page_mode = search_mode;
1701 		}
1702 
1703 		/* If the cursor is on the first or last record of the page,
1704 		or on a record with the same key value as the first or last
1705 		record, another page might be chosen under BTR_CONT_MODIFY_TREE.
1706 		So the parent page should not be released, to avoid a deadlock
1707 		that blocks another search with the same key value. */
1708 		if (!detected_same_key_root
1709 		    && lock_intention == BTR_INTENTION_BOTH
1710 		    && !dict_index_is_unique(index)
1711 		    && latch_mode == BTR_MODIFY_TREE
1712 		    && (up_match >= rec_offs_n_fields(offsets) - 1
1713 			|| low_match >= rec_offs_n_fields(offsets) - 1)) {
1714 			const rec_t*	first_rec
1715 						= page_rec_get_next_const(
1716 							page_get_infimum_rec(
1717 								page));
1718 			ulint		matched_fields;
1719 
1720 			ut_ad(upper_rw_latch == RW_X_LATCH);
1721 
1722 			if (node_ptr == first_rec
1723 			    || page_rec_is_last(node_ptr, page)) {
1724 				detected_same_key_root = true;
1725 			} else {
1726 				matched_fields = 0;
1727 
1728 				offsets2 = rec_get_offsets(
1729 					first_rec, index, offsets2,
1730 					ULINT_UNDEFINED, &heap);
1731 				cmp_rec_rec_with_match(node_ptr, first_rec,
1732 					offsets, offsets2, index,
1733 					page_is_spatial_non_leaf(first_rec, index),
1734 					false, &matched_fields);
1735 
1736 				if (matched_fields
1737 				    >= rec_offs_n_fields(offsets) - 1) {
1738 					detected_same_key_root = true;
1739 				} else {
1740 					const rec_t*	last_rec;
1741 
1742 					last_rec = page_rec_get_prev_const(
1743 							page_get_supremum_rec(
1744 								page));
1745 
1746 					matched_fields = 0;
1747 
1748 					offsets2 = rec_get_offsets(
1749 						last_rec, index, offsets2,
1750 						ULINT_UNDEFINED, &heap);
1751 					cmp_rec_rec_with_match(
1752 						node_ptr, last_rec,
1753 						offsets, offsets2, index,
1754 						page_is_spatial_non_leaf(last_rec, index),
1755 						false, &matched_fields);
1756 					if (matched_fields
1757 					    >= rec_offs_n_fields(offsets) - 1) {
1758 						detected_same_key_root = true;
1759 					}
1760 				}
1761 			}
1762 		}
1763 
1764 		/* If the page might cause modify_tree,
1765 		we should not release the parent page's lock. */
1766 		if (!detected_same_key_root
1767 		    && latch_mode == BTR_MODIFY_TREE
1768 		    && !btr_cur_will_modify_tree(
1769 				index, page, lock_intention, node_ptr,
1770 				node_ptr_max_size, page_size, mtr)
1771 		    && !rtree_parent_modified) {
1772 			ut_ad(upper_rw_latch == RW_X_LATCH);
1773 			ut_ad(n_releases <= n_blocks);
1774 
1775 			/* we can release upper blocks */
1776 			for (; n_releases < n_blocks; n_releases++) {
1777 				if (n_releases == 0) {
1778 					/* do not release the root page,
1779 					to keep it pinned to the same block. */
1780 					continue;
1781 				}
1782 
1783 				/* release unused blocks to unpin */
1784 				mtr_release_block_at_savepoint(
1785 					mtr, tree_savepoints[n_releases],
1786 					tree_blocks[n_releases]);
1787 			}
1788 		}
1789 
1790 		if (height == level
1791 		    && latch_mode == BTR_MODIFY_TREE) {
1792 			ut_ad(upper_rw_latch == RW_X_LATCH);
1793 			/* we should sx-latch root page, if released already.
1794 			It contains seg_header. */
1795 			if (n_releases > 0) {
1796 				mtr_block_sx_latch_at_savepoint(
1797 					mtr, tree_savepoints[0],
1798 					tree_blocks[0]);
1799 			}
1800 
1801 			/* x-latch the branch blocks not released yet. */
1802 			for (ulint i = n_releases; i <= n_blocks; i++) {
1803 				mtr_block_x_latch_at_savepoint(
1804 					mtr, tree_savepoints[i],
1805 					tree_blocks[i]);
1806 			}
1807 		}
1808 
1809 		/* We should consider the prev_page of the parent page, if the
1810 		node_ptr is the leftmost record of the page, because
1811 		BTR_SEARCH_PREV and BTR_MODIFY_PREV latch the prev_page of the leaf page. */
1812 		if ((latch_mode == BTR_SEARCH_PREV
1813 		     || latch_mode == BTR_MODIFY_PREV)
1814 		    && !retrying_for_search_prev) {
1815 			/* block should be latched for consistent
1816 			   btr_page_get_prev() */
1817 			ut_ad(mtr_memo_contains_flagged(mtr, block,
1818 				MTR_MEMO_PAGE_S_FIX
1819 				| MTR_MEMO_PAGE_X_FIX));
1820 
1821 			if (btr_page_get_prev(page, mtr) != FIL_NULL
1822 			    && page_rec_is_first(node_ptr, page)) {
1823 
1824 				if (leftmost_from_level == 0) {
1825 					leftmost_from_level = height + 1;
1826 				}
1827 			} else {
1828 				leftmost_from_level = 0;
1829 			}
1830 
1831 			if (height == 0 && leftmost_from_level > 0) {
1832 				/* retry, so that we also get the prev_page
1833 				from level==leftmost_from_level. */
1834 				retrying_for_search_prev = true;
1835 
1836 				prev_tree_blocks = static_cast<buf_block_t**>(
1837 					ut_malloc_nokey(sizeof(buf_block_t*)
1838 							* leftmost_from_level));
1839 
1840 				prev_tree_savepoints = static_cast<ulint*>(
1841 					ut_malloc_nokey(sizeof(ulint)
1842 							* leftmost_from_level));
1843 
1844 				/* back to the level (leftmost_from_level+1) */
1845 				ulint	idx = n_blocks
1846 					- (leftmost_from_level - 1);
1847 
1848 				page_id.reset(
1849 					space,
1850 					tree_blocks[idx]->page.id.page_no());
1851 
1852 				for (ulint i = n_blocks
1853 					       - (leftmost_from_level - 1);
1854 				     i <= n_blocks; i++) {
1855 					mtr_release_block_at_savepoint(
1856 						mtr, tree_savepoints[i],
1857 						tree_blocks[i]);
1858 				}
1859 
1860 				n_blocks -= (leftmost_from_level - 1);
1861 				height = leftmost_from_level;
1862 				ut_ad(n_releases == 0);
1863 
1864 				/* replay up_match, low_match */
1865 				up_match = 0;
1866 				low_match = 0;
1867 				rtr_info_t*	rtr_info	= need_path
1868 					? cursor->rtr_info : NULL;
1869 
1870 				for (ulint i = 0; i < n_blocks; i++) {
1871 					page_cur_search_with_match(
1872 						tree_blocks[i], index, tuple,
1873 						page_mode, &up_match,
1874 						&low_match, page_cursor,
1875 						rtr_info);
1876 				}
1877 
1878 				goto search_loop;
1879 			}
1880 		}
1881 
1882 		/* Go to the child node */
1883 		page_id.reset(
1884 			space,
1885 			btr_node_ptr_get_child_page_no(node_ptr, offsets));
1886 
1887 		n_blocks++;
1888 
1889 		if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
1890 			/* We're doing a search on an ibuf tree and we're one
1891 			level above the leaf page. */
1892 
1893 			ut_ad(level == 0);
1894 
1895 			buf_mode = BUF_GET;
1896 			rw_latch = RW_NO_LATCH;
1897 			goto retry_page_get;
1898 		}
1899 
1900 		if (dict_index_is_spatial(index)
1901 		    && page_mode >= PAGE_CUR_CONTAIN
1902 		    && page_mode != PAGE_CUR_RTREE_INSERT) {
1903 			ut_ad(need_path);
1904 			rtr_node_path_t* path =
1905 				cursor->rtr_info->path;
1906 
1907 			if (!path->empty() && found) {
1908 #ifdef UNIV_DEBUG
1909 				node_visit_t    last_visit = path->back();
1910 
1911 				ut_ad(last_visit.page_no == page_id.page_no());
1912 #endif /* UNIV_DEBUG */
1913 
1914 				path->pop_back();
1915 
1916 #ifdef UNIV_DEBUG
1917 				if (page_mode == PAGE_CUR_RTREE_LOCATE
1918 				    && (latch_mode != BTR_MODIFY_LEAF)) {
1919 					btr_pcur_t*	cur
1920 					= cursor->rtr_info->parent_path->back(
1921 					  ).cursor;
1922 					rec_t*	my_node_ptr
1923 						= btr_pcur_get_rec(cur);
1924 
1925 					offsets = rec_get_offsets(
1926 						my_node_ptr, index, offsets,
1927 						ULINT_UNDEFINED, &heap);
1928 
1929 					ulint	my_page_no
1930 					= btr_node_ptr_get_child_page_no(
1931 						my_node_ptr, offsets);
1932 
1933 					ut_ad(page_id.page_no() == my_page_no);
1934 
1935 				}
1936 #endif
1937 			}
1938 		}
1939 
1940 		goto search_loop;
1941 	} else if (!dict_index_is_spatial(index)
1942 		   && latch_mode == BTR_MODIFY_TREE
1943 		   && lock_intention == BTR_INTENTION_INSERT
1944 		   && mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
1945 		   && page_rec_is_last(page_cur_get_rec(page_cursor), page)) {
1946 
1947 		/* btr_insert_into_right_sibling() might cause the node_ptr
1948 		at the upper level to be deleted */
1949 
1950 		if (height == 0) {
1951 			/* release the leaf pages if latched */
1952 			for (uint i = 0; i < 3; i++) {
1953 				if (latch_leaves.blocks[i] != NULL) {
1954 					mtr_release_block_at_savepoint(
1955 						mtr, latch_leaves.savepoints[i],
1956 						latch_leaves.blocks[i]);
1957 					latch_leaves.blocks[i] = NULL;
1958 				}
1959 			}
1960 		}
1961 
1962 		goto need_opposite_intention;
1963 	}
1964 
1965 	if (level != 0) {
1966 		if (upper_rw_latch == RW_NO_LATCH) {
1967 			/* latch the page */
1968 			buf_block_t*	child_block;
1969 
1970 			if (latch_mode == BTR_CONT_MODIFY_TREE) {
1971 				child_block = btr_block_get(
1972 					page_id, page_size, RW_X_LATCH,
1973 					index, mtr);
1974 			} else {
1975 				ut_ad(latch_mode == BTR_CONT_SEARCH_TREE);
1976 				child_block = btr_block_get(
1977 					page_id, page_size, RW_SX_LATCH,
1978 					index, mtr);
1979 			}
1980 
1981 			btr_assert_not_corrupted(child_block, index);
1982 		} else {
1983 			ut_ad(mtr_memo_contains(mtr, block, upper_rw_latch));
1984 			btr_assert_not_corrupted(block, index);
1985 
1986 			if (s_latch_by_caller) {
1987 				ut_ad(latch_mode == BTR_SEARCH_TREE);
1988 				/* the caller should sx-latch the index to
1989 				exclude tree-modifying operations. */
1990 				ut_ad(mtr_memo_contains(
1991 					mtr, dict_index_get_lock(index),
1992 					MTR_MEMO_SX_LOCK));
1993 				/* since we hold the sx-latch on the index,
1994 				we can release the upper blocks. */
1995 				for (; n_releases < n_blocks; n_releases++) {
1996 					mtr_release_block_at_savepoint(
1997 						mtr,
1998 						tree_savepoints[n_releases],
1999 						tree_blocks[n_releases]);
2000 				}
2001 			}
2002 		}
2003 
2004 		if (page_mode <= PAGE_CUR_LE) {
2005 			cursor->low_match = low_match;
2006 			cursor->up_match = up_match;
2007 		}
2008 	} else {
2009 		cursor->low_match = low_match;
2010 		cursor->low_bytes = low_bytes;
2011 		cursor->up_match = up_match;
2012 		cursor->up_bytes = up_bytes;
2013 
2014 		/* We do a dirty read of btr_search_enabled here.  We
2015 		will properly check btr_search_enabled again in
2016 		btr_search_build_page_hash_index() before building a
2017 		page hash index, while holding search latch. */
2018 		if (btr_search_enabled && !index->disable_ahi) {
2019 			btr_search_info_update(index, cursor);
2020 		}
2021 		ut_ad(cursor->up_match != ULINT_UNDEFINED
2022 		      || mode != PAGE_CUR_GE);
2023 		ut_ad(cursor->up_match != ULINT_UNDEFINED
2024 		      || mode != PAGE_CUR_LE);
2025 		ut_ad(cursor->low_match != ULINT_UNDEFINED
2026 		      || mode != PAGE_CUR_LE);
2027 	}
2028 
2029 	/* For spatial index, remember what blocks are still latched */
2030 	if (dict_index_is_spatial(index)
2031 	    && (latch_mode == BTR_MODIFY_TREE
2032 		|| latch_mode == BTR_MODIFY_LEAF)) {
2033 		for (ulint i = 0; i < n_releases; i++) {
2034 			cursor->rtr_info->tree_blocks[i] = NULL;
2035 			cursor->rtr_info->tree_savepoints[i] = 0;
2036 		}
2037 
2038 		for (ulint i = n_releases; i <= n_blocks; i++) {
2039 			cursor->rtr_info->tree_blocks[i] = tree_blocks[i];
2040 			cursor->rtr_info->tree_savepoints[i] = tree_savepoints[i];
2041 		}
2042 	}
2043 
2044 func_exit:
2045 
2046 	if (UNIV_LIKELY_NULL(heap)) {
2047 		mem_heap_free(heap);
2048 	}
2049 
2050 	if (retrying_for_search_prev) {
2051 		ut_free(prev_tree_blocks);
2052 		ut_free(prev_tree_savepoints);
2053 	}
2054 
2055 	if (has_search_latch) {
2056 
2057 		rw_lock_s_lock(btr_get_search_latch(index));
2058 	}
2059 
2060 	if (mbr_adj) {
2061 		/* remember that we will need to adjust parent MBR */
2062 		cursor->rtr_info->mbr_adj = true;
2063 	}
2064 
2065 	DBUG_RETURN(err);
2066 }
2067 
2068 /** Searches an index tree and positions a tree cursor on a given level.
2069 This function avoids latching the traversal path and so should be
2070 used only in cases where latching is not needed.
2071 
2072 @param[in,out]	index	index
2073 @param[in]	level	the tree level of search
2074 @param[in]	tuple	data tuple; Note: n_fields_cmp in tuple must be set
2075 			so that it cannot get compared to the node ptr page number field
2076 @param[in]	mode	PAGE_CUR_L, ....
2077 			Insert should always be made using PAGE_CUR_LE
2078 			to search the position.
2079 @param[in,out]	cursor	tree cursor; points to record of interest.
2080 @param[in]	file	file name
2081 @param[in]	line	line where called from
2082 @param[in,out]	mtr	mtr
2083 @param[in]	mark_dirty
2084 			if true then mark the block as dirty */
2085 void
2086 btr_cur_search_to_nth_level_with_no_latch(
2087 	dict_index_t*		index,
2088 	ulint			level,
2089 	const dtuple_t*		tuple,
2090 	page_cur_mode_t		mode,
2091 	btr_cur_t*		cursor,
2092 	const char*		file,
2093 	ulint			line,
2094 	mtr_t*			mtr,
2095 	bool			mark_dirty)
2096 {
2097 	page_t*		page = NULL; /* remove warning */
2098 	buf_block_t*	block;
2099 	ulint		height;
2100 	ulint		up_match;
2101 	ulint		low_match;
2102 	ulint		rw_latch;
2103 	page_cur_mode_t	page_mode;
2104 	ulint		buf_mode;
2105 	page_cur_t*	page_cursor;
2106 	ulint		root_height = 0; /* remove warning */
2107 	ulint		n_blocks = 0;
2108 
2109 	mem_heap_t*	heap		= NULL;
2110 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
2111 	ulint*		offsets		= offsets_;
2112 	rec_offs_init(offsets_);
2113 
2114 	DBUG_ENTER("btr_cur_search_to_nth_level_with_no_latch");
2115 
2116 	ut_ad(dict_table_is_intrinsic(index->table));
2117 	ut_ad(level == 0 || mode == PAGE_CUR_LE);
2118 	ut_ad(dict_index_check_search_tuple(index, tuple));
2119 	ut_ad(dtuple_check_typed(tuple));
2120 	ut_ad(index->page != FIL_NULL);
2121 
2122 	UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
2123 	UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
2124 #ifdef UNIV_DEBUG
2125 	cursor->up_match = ULINT_UNDEFINED;
2126 	cursor->low_match = ULINT_UNDEFINED;
2127 #endif /* UNIV_DEBUG */
2128 
2129 	cursor->flag = BTR_CUR_BINARY;
2130 	cursor->index = index;
2131 
2132 	page_cursor = btr_cur_get_page_cur(cursor);
2133 
2134         const ulint		space = dict_index_get_space(index);
2135         const page_size_t	page_size(dict_table_page_size(index->table));
2136         /* Start with the root page. */
2137         page_id_t		page_id(space, dict_index_get_page(index));
2138 
2139 	up_match = 0;
2140 	low_match = 0;
2141 
2142 	height = ULINT_UNDEFINED;
2143 
2144 	/* We use these modified search modes on non-leaf levels of the
2145 	B-tree. These let us end up in the right B-tree leaf. In that leaf
2146 	we use the original search mode. */
2147 
2148 	switch (mode) {
2149 	case PAGE_CUR_GE:
2150 		page_mode = PAGE_CUR_L;
2151 		break;
2152 	case PAGE_CUR_G:
2153 		page_mode = PAGE_CUR_LE;
2154 		break;
2155 	default:
2156 		page_mode = mode;
2157 		break;
2158 	}
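	/* Added note (not in the original source): node pointer records
	carry the smallest key of their child page, so records that satisfy
	the leaf-level condition can still live under the preceding node
	pointer (for example when the search tuple is only a key prefix, or
	when duplicates straddle a page boundary).  Shifting PAGE_CUR_GE to
	PAGE_CUR_L and PAGE_CUR_G to PAGE_CUR_LE on the upper levels makes
	the descent land on the leftmost subtree that can hold a match; the
	original mode is restored once height == 0. */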
2159 
2160 	/* Loop and search until we arrive at the desired level */
2161 	bool at_desired_level = false;
2162 	while (!at_desired_level) {
2163 		buf_mode = BUF_GET;
2164 		rw_latch = RW_NO_LATCH;
2165 
2166 		ut_ad(n_blocks < BTR_MAX_LEVELS);
2167 
2168 		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2169 				buf_mode, file, line, mtr, mark_dirty);
2170 
2171 		page = buf_block_get_frame(block);
2172 
2173 		if (height == ULINT_UNDEFINED) {
2174 			/* We are in the root node */
2175 
2176 			height = btr_page_get_level(page, mtr);
2177 			root_height = height;
2178 			cursor->tree_height = root_height + 1;
2179 		}
2180 
2181 		if (height == 0) {
2182 			/* On leaf level. Switch back to original search mode.*/
2183 			page_mode = mode;
2184 		}
2185 
2186 		page_cur_search_with_match(
2187 				block, index, tuple, page_mode, &up_match,
2188 				&low_match, page_cursor, NULL);
2189 
2190 		ut_ad(height == btr_page_get_level(
2191 			page_cur_get_page(page_cursor), mtr));
2192 
2193 		if (level != height) {
2194 
2195 			const rec_t*	node_ptr;
2196 			ut_ad(height > 0);
2197 
2198 			height--;
2199 
2200 			node_ptr = page_cur_get_rec(page_cursor);
2201 
2202 			offsets = rec_get_offsets(
2203 					node_ptr, index, offsets,
2204 					ULINT_UNDEFINED, &heap);
2205 
2206 			/* Go to the child node */
2207 			page_id.reset(space, btr_node_ptr_get_child_page_no(
2208 				node_ptr, offsets));
2209 
2210 			n_blocks++;
2211 		} else {
2212 			/* If this is the desired level, leave the loop */
2213 			at_desired_level = true;
2214 		}
2215 	}
2216 
2217 	cursor->low_match = low_match;
2218 	cursor->up_match = up_match;
2219 
2220 	if (heap != NULL) {
2221 		mem_heap_free(heap);
2222 	}
2223 
2224 	DBUG_VOID_RETURN;
2225 }
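/* A hedged usage sketch (hypothetical caller, not part of the original
source): a point lookup on an intrinsic table could look roughly like

	btr_cur_t	cursor;
	mtr_t		mtr;

	mtr_start(&mtr);
	btr_cur_search_to_nth_level_with_no_latch(
		index, 0, tuple, PAGE_CUR_LE, &cursor,
		__FILE__, __LINE__, &mtr, false);
	rec = btr_cur_get_rec(&cursor);
	...
	mtr_commit(&mtr);

assuming the caller owns the intrinsic table exclusively, which is what
makes the latch-free traversal safe. */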
2226 
2227 /*****************************************************************//**
2228 Opens a cursor at either end of an index. */
2229 dberr_t
2230 btr_cur_open_at_index_side_func(
2231 /*============================*/
2232 	bool		from_left,	/*!< in: true if open to the low end,
2233 					false if to the high end */
2234 	dict_index_t*	index,		/*!< in: index */
2235 	ulint		latch_mode,	/*!< in: latch mode */
2236 	btr_cur_t*	cursor,		/*!< in/out: cursor */
2237 	ulint		level,		/*!< in: level to search for
2238 					(0=leaf). */
2239 	const char*	file,		/*!< in: file name */
2240 	ulint		line,		/*!< in: line where called */
2241 	mtr_t*		mtr)		/*!< in/out: mini-transaction */
2242 {
2243 	page_cur_t*	page_cursor;
2244 	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
2245 	ulint		height;
2246 	ulint		root_height = 0; /* remove warning */
2247 	rec_t*		node_ptr;
2248 	ulint		estimate;
2249 	ulint		savepoint;
2250 	ulint		upper_rw_latch, root_leaf_rw_latch;
2251 	btr_intention_t	lock_intention;
2252 	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
2253 	ulint		tree_savepoints[BTR_MAX_LEVELS];
2254 	ulint		n_blocks = 0;
2255 	ulint		n_releases = 0;
2256 	mem_heap_t*	heap		= NULL;
2257 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
2258 	ulint*		offsets		= offsets_;
2259 	dberr_t		err = DB_SUCCESS;
2260 	rec_offs_init(offsets_);
2261 
2262 	estimate = latch_mode & BTR_ESTIMATE;
2263 	latch_mode &= ~BTR_ESTIMATE;
2264 
2265 	ut_ad(level != ULINT_UNDEFINED);
2266 
2267 	bool	s_latch_by_caller;
2268 
2269 	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
2270 	latch_mode &= ~BTR_ALREADY_S_LATCHED;
2271 
2272 	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2273 
2274 	ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2275 
2276 	/* This function doesn't need to lock left page of the leaf page */
2277 	if (latch_mode == BTR_SEARCH_PREV) {
2278 		latch_mode = BTR_SEARCH_LEAF;
2279 	} else if (latch_mode == BTR_MODIFY_PREV) {
2280 		latch_mode = BTR_MODIFY_LEAF;
2281 	}
2282 
2283 	/* Store the position of the tree latch we push to mtr so that we
2284 	know how to release it when we have latched the leaf node */
2285 
2286 	savepoint = mtr_set_savepoint(mtr);
2287 
2288 	switch (latch_mode) {
2289 	case BTR_CONT_MODIFY_TREE:
2290 	case BTR_CONT_SEARCH_TREE:
2291 		upper_rw_latch = RW_NO_LATCH;
2292 		break;
2293 	case BTR_MODIFY_TREE:
2294 		/* Most delete-intended operations are purges.
2295 		Free blocks and read IO bandwidth should be prioritized
2296 		for them when the history list is growing huge. */
2297 		if (lock_intention == BTR_INTENTION_DELETE
2298 		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
2299 		    && buf_get_n_pending_read_ios()) {
2300 			mtr_x_lock(dict_index_get_lock(index), mtr);
2301 		} else {
2302 			mtr_sx_lock(dict_index_get_lock(index), mtr);
2303 		}
2304 		upper_rw_latch = RW_X_LATCH;
2305 		break;
2306 	default:
2307 		ut_ad(!s_latch_by_caller
2308 		      || mtr_memo_contains_flagged(mtr,
2309 						 dict_index_get_lock(index),
2310 						 MTR_MEMO_SX_LOCK
2311 						 | MTR_MEMO_S_LOCK));
2312 		if (!srv_read_only_mode) {
2313 			if (!s_latch_by_caller) {
2314 				/* BTR_SEARCH_TREE is intended to be used with
2315 				BTR_ALREADY_S_LATCHED */
2316 				ut_ad(latch_mode != BTR_SEARCH_TREE);
2317 
2318 				mtr_s_lock(dict_index_get_lock(index), mtr);
2319 			}
2320 			upper_rw_latch = RW_S_LATCH;
2321 		} else {
2322 			upper_rw_latch = RW_NO_LATCH;
2323 		}
2324 	}
2325 	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
2326 
2327 	page_cursor = btr_cur_get_page_cur(cursor);
2328 	cursor->index = index;
2329 
2330 	page_id_t		page_id(dict_index_get_space(index),
2331 					dict_index_get_page(index));
2332 	const page_size_t&	page_size = dict_table_page_size(index->table);
2333 
2334 	if (root_leaf_rw_latch == RW_X_LATCH) {
2335 		node_ptr_max_size = dict_index_node_ptr_max_size(index);
2336 	}
2337 
2338 	height = ULINT_UNDEFINED;
2339 
2340 	for (;;) {
2341 		buf_block_t*	block;
2342 		page_t*		page;
2343 		ulint		rw_latch;
2344 
2345 		ut_ad(n_blocks < BTR_MAX_LEVELS);
2346 
2347 		if (height != 0
2348 		    && (latch_mode != BTR_MODIFY_TREE
2349 			|| height == level)) {
2350 			rw_latch = upper_rw_latch;
2351 		} else {
2352 			rw_latch = RW_NO_LATCH;
2353 		}
2354 
2355 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2356 		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2357 					 BUF_GET, file, line, mtr, false, &err);
2358 		tree_blocks[n_blocks] = block;
2359 
2360 		if (err != DB_SUCCESS) {
2361 			if (err == DB_DECRYPTION_FAILED) {
2362 				ib::warn() << "Table is encrypted but encryption service or"
2363 					" used key_id is not available. "
2364 					" Can't continue reading table.";
2365 				page_cursor->block = 0;
2366 				page_cursor->rec = 0;
2367 				if (estimate) {
2368 
2369 					cursor->path_arr->nth_rec = ULINT_UNDEFINED;
2370 				}
2371 
2372 				index->table->set_file_unreadable();
2373 			}
2374 			goto exit_loop;
2375 		}
2376 
2377 		page = buf_block_get_frame(block);
2378 
2379 		SRV_CORRUPT_TABLE_CHECK(page,
2380 		{
2381 			page_cursor->block = 0;
2382 			page_cursor->rec = 0;
2383 
2384 			if (estimate) {
2385 
2386 				cursor->path_arr->nth_rec = ULINT_UNDEFINED;
2387 			}
2388 			/* Can't use break with the macro */
2389 			goto exit_loop;
2390 		});
2391 
2392 		if (height == ULINT_UNDEFINED
2393 		    && btr_page_get_level(page, mtr) == 0
2394 		    && rw_latch != RW_NO_LATCH
2395 		    && rw_latch != root_leaf_rw_latch) {
2396 			/* We should retry to get the page, because the root page
2397 			is also a leaf page, but was latched with a different mode. */
2398 			ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
2399 			ut_ad(rw_latch == RW_S_LATCH);
2400 
2401 			ut_ad(n_blocks == 0);
2402 			mtr_release_block_at_savepoint(
2403 				mtr, tree_savepoints[n_blocks],
2404 				tree_blocks[n_blocks]);
2405 
2406 			upper_rw_latch = root_leaf_rw_latch;
2407 			continue;
2408 		}
2409 
2410 		ut_ad(fil_page_index_page_check(page));
2411 		ut_ad(index->id == btr_page_get_index_id(page));
2412 
2413 		if (height == ULINT_UNDEFINED) {
2414 			/* We are in the root node */
2415 
2416 			height = btr_page_get_level(page, mtr);
2417 			root_height = height;
2418 			ut_a(height >= level);
2419 		} else {
2420 			/* TODO: flag the index corrupted if this fails */
2421 			ut_ad(height == btr_page_get_level(page, mtr));
2422 		}
2423 
2424 		if (height == level) {
2425 			if (srv_read_only_mode) {
2426 				btr_cur_latch_leaves(
2427 					block, page_id, page_size,
2428 					latch_mode, cursor, mtr);
2429 			} else if (height == 0) {
2430 				if (rw_latch == RW_NO_LATCH) {
2431 					btr_cur_latch_leaves(
2432 						block, page_id, page_size,
2433 						latch_mode, cursor, mtr);
2434 				}
2435 				/* In versions <= 3.23.52 we had
2436 				forgotten to release the tree latch
2437 				here. If in an index scan we had to
2438 				scan far to find a record visible to
2439 				the current transaction, that could
2440 				starve others waiting for the tree
2441 				latch. */
2442 
2443 				switch (latch_mode) {
2444 				case BTR_MODIFY_TREE:
2445 				case BTR_CONT_MODIFY_TREE:
2446 				case BTR_CONT_SEARCH_TREE:
2447 					break;
2448 				default:
2449 					if (!s_latch_by_caller) {
2450 						/* Release the tree s-latch */
2451 						mtr_release_s_latch_at_savepoint(
2452 							mtr, savepoint,
2453 							dict_index_get_lock(
2454 								index));
2455 					}
2456 
2457 					/* release upper blocks */
2458 					for (; n_releases < n_blocks;
2459 					     n_releases++) {
2460 						mtr_release_block_at_savepoint(
2461 							mtr,
2462 							tree_savepoints[
2463 								n_releases],
2464 							tree_blocks[
2465 								n_releases]);
2466 					}
2467 				}
2468 			} else { /* height != 0 */
2469 				/* We already have the block latched. */
2470 				ut_ad(latch_mode == BTR_SEARCH_TREE);
2471 				ut_ad(s_latch_by_caller);
2472 				ut_ad(upper_rw_latch == RW_S_LATCH);
2473 
2474 				ut_ad(mtr_memo_contains(mtr, block,
2475 							upper_rw_latch));
2476 
2477 				if (s_latch_by_caller) {
2478 					/* the caller should sx-latch the index
2479 					to exclude tree-modifying operations. */
2480 					ut_ad(mtr_memo_contains(
2481 						mtr,
2482 						dict_index_get_lock(index),
2483 						MTR_MEMO_SX_LOCK));
2484 					/* since we hold the sx-latch on the
2485 					index, we can release the upper blocks. */
2486 					for (; n_releases < n_blocks;
2487 					     n_releases++) {
2488 						mtr_release_block_at_savepoint(
2489 							mtr,
2490 							tree_savepoints[
2491 								n_releases],
2492 							tree_blocks[
2493 								n_releases]);
2494 					}
2495 				}
2496 			}
2497 		}
2498 
2499 		if (from_left) {
2500 			page_cur_set_before_first(block, page_cursor);
2501 		} else {
2502 			page_cur_set_after_last(block, page_cursor);
2503 		}
2504 
2505 		if (height == level) {
2506 			if (estimate) {
2507 				btr_cur_add_path_info(cursor, height,
2508 						      root_height);
2509 			}
2510 
2511 			break;
2512 		}
2513 
2514 		ut_ad(height > 0);
2515 
2516 		if (from_left) {
2517 			page_cur_move_to_next(page_cursor);
2518 		} else {
2519 			page_cur_move_to_prev(page_cursor);
2520 		}
2521 
2522 		if (estimate) {
2523 			btr_cur_add_path_info(cursor, height, root_height);
2524 		}
2525 
2526 		height--;
2527 
2528 		node_ptr = page_cur_get_rec(page_cursor);
2529 		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2530 					  ULINT_UNDEFINED, &heap);
2531 
2532 		/* If the rec is the first or last in the page and the
2533 		intention is a pessimistic delete, it might require a node_ptr
2534 		insert at the upper level. We should change the intention
2535 		and retry. */
2536 		if (latch_mode == BTR_MODIFY_TREE
2537 		    && btr_cur_need_opposite_intention(
2538 			page, lock_intention, node_ptr)) {
2539 
2540 			ut_ad(upper_rw_latch == RW_X_LATCH);
2541 			/* release all blocks */
2542 			for (; n_releases <= n_blocks; n_releases++) {
2543 				mtr_release_block_at_savepoint(
2544 					mtr, tree_savepoints[n_releases],
2545 					tree_blocks[n_releases]);
2546 			}
2547 
2548 			lock_intention = BTR_INTENTION_BOTH;
2549 
2550 			page_id.set_page_no(dict_index_get_page(index));
2551 
2552 			height = ULINT_UNDEFINED;
2553 
2554 			n_blocks = 0;
2555 			n_releases = 0;
2556 
2557 			continue;
2558 		}
2559 
2560 		if (latch_mode == BTR_MODIFY_TREE
2561 		    && !btr_cur_will_modify_tree(
2562 				cursor->index, page, lock_intention, node_ptr,
2563 				node_ptr_max_size, page_size, mtr)) {
2564 			ut_ad(upper_rw_latch == RW_X_LATCH);
2565 			ut_ad(n_releases <= n_blocks);
2566 
2567 			/* we can release upper blocks */
2568 			for (; n_releases < n_blocks; n_releases++) {
2569 				if (n_releases == 0) {
2570 					/* do not release the root page,
2571 					to keep it pinned to the same block. */
2572 					continue;
2573 				}
2574 
2575 				/* release unused blocks to unpin */
2576 				mtr_release_block_at_savepoint(
2577 					mtr, tree_savepoints[n_releases],
2578 					tree_blocks[n_releases]);
2579 			}
2580 		}
2581 
2582 		if (height == level
2583 		    && latch_mode == BTR_MODIFY_TREE) {
2584 			ut_ad(upper_rw_latch == RW_X_LATCH);
2585 			/* we should sx-latch root page, if released already.
2586 			It contains seg_header. */
2587 			if (n_releases > 0) {
2588 				mtr_block_sx_latch_at_savepoint(
2589 					mtr, tree_savepoints[0],
2590 					tree_blocks[0]);
2591 			}
2592 
2593 			/* x-latch the branch blocks not released yet. */
2594 			for (ulint i = n_releases; i <= n_blocks; i++) {
2595 				mtr_block_x_latch_at_savepoint(
2596 					mtr, tree_savepoints[i],
2597 					tree_blocks[i]);
2598 			}
2599 		}
2600 
2601 		/* Go to the child node */
2602 		page_id.set_page_no(
2603 			btr_node_ptr_get_child_page_no(node_ptr, offsets));
2604 
2605 		n_blocks++;
2606 	}
2607 
2608 exit_loop:
2609 	if (heap) {
2610 		mem_heap_free(heap);
2611 	}
2612 
2613 	return err;
2614 }
2615 
2616 /** Opens a cursor at either end of an index.
2617 Avoids taking latches on the buffer blocks; it just pins them (by
2618 incrementing fix_count) to keep them in the buffer pool. This mode is used
2619 for intrinsic tables, as they are not shared and so need no latching.
2620 @param[in]	from_left	true if open to low end, false if open
2621 				to high end.
2622 @param[in]	index		index
2623 @param[in,out]	cursor		cursor
2624 @param[in]	file		file name
2625 @param[in]	line		line where called
2626 @param[in,out]	mtr		mini transaction
2627 */
2628 void
2629 btr_cur_open_at_index_side_with_no_latch_func(
2630 	bool		from_left,
2631 	dict_index_t*	index,
2632 	btr_cur_t*	cursor,
2633 	ulint		level,
2634 	const char*	file,
2635 	ulint		line,
2636 	mtr_t*		mtr)
2637 {
2638 	page_cur_t*	page_cursor;
2639 	ulint		height;
2640 	rec_t*		node_ptr;
2641 	ulint		n_blocks = 0;
2642 	mem_heap_t*	heap		= NULL;
2643 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
2644 	ulint*		offsets		= offsets_;
2645 	rec_offs_init(offsets_);
2646 
2647 	ut_ad(level != ULINT_UNDEFINED);
2648 
2649 	page_cursor = btr_cur_get_page_cur(cursor);
2650 	cursor->index = index;
2651 	page_id_t		page_id(dict_index_get_space(index),
2652 					dict_index_get_page(index));
2653 	const page_size_t&	page_size = dict_table_page_size(index->table);
2654 
2655 	height = ULINT_UNDEFINED;
2656 
2657 	for (;;) {
2658 		buf_block_t*	block;
2659 		page_t*		page;
2660 		ulint		rw_latch = RW_NO_LATCH;
2661 
2662 		ut_ad(n_blocks < BTR_MAX_LEVELS);
2663 
2664 		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2665 					 BUF_GET, file, line, mtr);
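		/* Added note (not in the original source): with RW_NO_LATCH,
		buf_page_get_gen() does not latch the page; it only
		buffer-fixes the block (increments fix_count), which is enough
		to keep it from being evicted for the lifetime of the mtr.
		That is the only protection this latch-free traversal relies
		on. */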
2666 
2667 		page = buf_block_get_frame(block);
2668 
2669 		ut_ad(fil_page_index_page_check(page));
2670 		ut_ad(index->id == btr_page_get_index_id(page));
2671 
2672 		if (height == ULINT_UNDEFINED) {
2673 			/* We are in the root node */
2674 
2675 			height = btr_page_get_level(page, mtr);
2676 			ut_a(height >= level);
2677 		} else {
2678 			/* TODO: flag the index corrupted if this fails */
2679 			ut_ad(height == btr_page_get_level(page, mtr));
2680 		}
2681 
2682 		if (from_left) {
2683 			page_cur_set_before_first(block, page_cursor);
2684 		} else {
2685 			page_cur_set_after_last(block, page_cursor);
2686 		}
2687 
2688 		if (height == level) {
2689 			break;
2690 		}
2691 
2692 		ut_ad(height > 0);
2693 
2694 		if (from_left) {
2695 			page_cur_move_to_next(page_cursor);
2696 		} else {
2697 			page_cur_move_to_prev(page_cursor);
2698 		}
2699 
2700 		height--;
2701 
2702 		node_ptr = page_cur_get_rec(page_cursor);
2703 		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2704 					  ULINT_UNDEFINED, &heap);
2705 
2706 		/* Go to the child node */
2707 		page_id.set_page_no(
2708 			btr_node_ptr_get_child_page_no(node_ptr, offsets));
2709 
2710 		n_blocks++;
2711 	}
2712 
2713 	if (heap != NULL) {
2714 		mem_heap_free(heap);
2715 	}
2716 }
2717 
2718 /**********************************************************************//**
2719 Positions a cursor at a randomly chosen position within a B-tree.
2720 @return true if the index is available and we have put the cursor, false
2721 if the index is unavailable */
2722 bool
2723 btr_cur_open_at_rnd_pos_func(
2724 /*=========================*/
2725 	dict_index_t*	index,		/*!< in: index */
2726 	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
2727 	btr_cur_t*	cursor,		/*!< in/out: B-tree cursor */
2728 	const char*	file,		/*!< in: file name */
2729 	ulint		line,		/*!< in: line where called */
2730 	mtr_t*		mtr)		/*!< in: mtr */
2731 {
2732 	page_cur_t*	page_cursor;
2733 	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
2734 	ulint		height;
2735 	rec_t*		node_ptr;
2736 	ulint		savepoint;
2737 	ulint		upper_rw_latch, root_leaf_rw_latch;
2738 	btr_intention_t	lock_intention;
2739 	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
2740 	ulint		tree_savepoints[BTR_MAX_LEVELS];
2741 	ulint		n_blocks = 0;
2742 	ulint		n_releases = 0;
2743 	mem_heap_t*	heap		= NULL;
2744 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
2745 	ulint*		offsets		= offsets_;
2746 	rec_offs_init(offsets_);
2747 
2748 	ut_ad(!dict_index_is_spatial(index));
2749 
2750 	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2751 
2752 	ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2753 
2754 	savepoint = mtr_set_savepoint(mtr);
2755 
2756 	switch (latch_mode) {
2757 	case BTR_MODIFY_TREE:
2758 		/* Most delete-intended operations are purges.
2759 		Free blocks and read IO bandwidth should be prioritized
2760 		for them when the history list is growing huge. */
2761 		if (lock_intention == BTR_INTENTION_DELETE
2762 		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
2763 		    && buf_get_n_pending_read_ios()) {
2764 			mtr_x_lock(dict_index_get_lock(index), mtr);
2765 		} else {
2766 			mtr_sx_lock(dict_index_get_lock(index), mtr);
2767 		}
2768 		upper_rw_latch = RW_X_LATCH;
2769 		break;
2770 	case BTR_SEARCH_PREV:
2771 	case BTR_MODIFY_PREV:
2772 		/* This function does not support latching the left uncle
2773 		   page, which would be needed to latch the left leaf
2774 		   page. */
2775 	case BTR_SEARCH_TREE:
2776 	case BTR_CONT_MODIFY_TREE:
2777 	case BTR_CONT_SEARCH_TREE:
2778 		ut_ad(0);
2779 		/* fall through */
2780 	default:
2781 		if (!srv_read_only_mode) {
2782 			mtr_s_lock(dict_index_get_lock(index), mtr);
2783 			upper_rw_latch = RW_S_LATCH;
2784 		} else {
2785 			upper_rw_latch = RW_NO_LATCH;
2786 		}
2787 	}
2788 
2789 	DBUG_EXECUTE_IF("test_index_is_unavailable",
2790 			return(false););
2791 
2792 	if (index->page == FIL_NULL) {
2793 		/* Since we did not hold the index lock until just now, the
2794 		index could have been modified by others. For example, if this
2795 		is a statistics updater for a referenced table, the index could
2796 		have been marked as unavailable by 'DROP TABLE' in the meantime,
2797 		since we do not hold a lock for the statistics updater. */
2798 		return(false);
2799 	}
2800 
2801 	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
2802 
2803 	page_cursor = btr_cur_get_page_cur(cursor);
2804 	cursor->index = index;
2805 
2806 	page_id_t		page_id(dict_index_get_space(index),
2807 					dict_index_get_page(index));
2808 	const page_size_t&	page_size = dict_table_page_size(index->table);
2809 	dberr_t			err = DB_SUCCESS;
2810 
2811 	if (root_leaf_rw_latch == RW_X_LATCH) {
2812 		node_ptr_max_size = dict_index_node_ptr_max_size(index);
2813 	}
2814 
2815 	height = ULINT_UNDEFINED;
2816 
2817 	for (;;) {
2818 		buf_block_t*	block;
2819 		page_t*		page;
2820 		ulint		rw_latch;
2821 
2822 		ut_ad(n_blocks < BTR_MAX_LEVELS);
2823 
2824 		if (height != 0
2825 		    && latch_mode != BTR_MODIFY_TREE) {
2826 			rw_latch = upper_rw_latch;
2827 		} else {
2828 			rw_latch = RW_NO_LATCH;
2829 		}
2830 
2831 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2832 		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2833 					 BUF_GET, file, line, mtr, false, &err);
2834 		tree_blocks[n_blocks] = block;
2835 
2836 		ut_ad((block != NULL) == (err == DB_SUCCESS));
2837 
2838 		if (err != DB_SUCCESS) {
2839 			if (err == DB_DECRYPTION_FAILED) {
2840 				ib::warn() << "Table is encrypted but encryption service or"
2841 						" used key_id is not available. "
2842 						" Can't continue reading table.";
2843 				page_cursor->block = 0;
2844 				page_cursor->rec = 0;
2845 				index->table->set_file_unreadable();
2846 			}
2847 
2848 			goto exit_loop;
2849 		}
2850 
2851 
2852 		page = buf_block_get_frame(block);
2853 
2854 		SRV_CORRUPT_TABLE_CHECK(page,
2855 		{
2856 			page_cursor->block = 0;
2857 			page_cursor->rec = 0;
2858 
2859 			goto exit_loop;
2860 		});
2861 
2862 		if (height == ULINT_UNDEFINED
2863 		    && btr_page_get_level(page, mtr) == 0
2864 		    && rw_latch != RW_NO_LATCH
2865 		    && rw_latch != root_leaf_rw_latch) {
2866 			/* We should retry to get the page, because the root page
2867 			is also a leaf page, but was latched with a different mode. */
2868 			ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
2869 			ut_ad(rw_latch == RW_S_LATCH);
2870 
2871 			ut_ad(n_blocks == 0);
2872 			mtr_release_block_at_savepoint(
2873 				mtr, tree_savepoints[n_blocks],
2874 				tree_blocks[n_blocks]);
2875 
2876 			upper_rw_latch = root_leaf_rw_latch;
2877 			continue;
2878 		}
2879 
2880 		ut_ad(fil_page_index_page_check(page));
2881 		ut_ad(index->id == btr_page_get_index_id(page));
2882 
2883 		if (height == ULINT_UNDEFINED) {
2884 			/* We are in the root node */
2885 
2886 			height = btr_page_get_level(page, mtr);
2887 		}
2888 
2889 		if (height == 0) {
2890 			if (rw_latch == RW_NO_LATCH
2891 			    || srv_read_only_mode) {
2892 				btr_cur_latch_leaves(
2893 					block, page_id, page_size,
2894 					latch_mode, cursor, mtr);
2895 			}
2896 
2897 			/* btr_cur_open_at_index_side_func() and
2898 			btr_cur_search_to_nth_level() release
2899 			tree s-latch here.*/
2900 			switch (latch_mode) {
2901 			case BTR_MODIFY_TREE:
2902 			case BTR_CONT_MODIFY_TREE:
2903 			case BTR_CONT_SEARCH_TREE:
2904 				break;
2905 			default:
2906 				/* Release the tree s-latch */
2907 				if (!srv_read_only_mode) {
2908 					mtr_release_s_latch_at_savepoint(
2909 						mtr, savepoint,
2910 						dict_index_get_lock(index));
2911 				}
2912 
2913 				/* release upper blocks */
2914 				for (; n_releases < n_blocks; n_releases++) {
2915 					mtr_release_block_at_savepoint(
2916 						mtr,
2917 						tree_savepoints[n_releases],
2918 						tree_blocks[n_releases]);
2919 				}
2920 			}
2921 		}
2922 
2923 		page_cur_open_on_rnd_user_rec(block, page_cursor);
2924 
2925 		if (height == 0) {
2926 
2927 			break;
2928 		}
2929 
2930 		ut_ad(height > 0);
2931 
2932 		height--;
2933 
2934 		node_ptr = page_cur_get_rec(page_cursor);
2935 		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2936 					  ULINT_UNDEFINED, &heap);
2937 
2938 		/* If the rec is the first or last in the page and the
2939 		intention is a pessimistic delete, it might require a node_ptr
2940 		insert at the upper level. We should change the intention
2941 		and retry. */
2942 		if (latch_mode == BTR_MODIFY_TREE
2943 		    && btr_cur_need_opposite_intention(
2944 			page, lock_intention, node_ptr)) {
2945 
2946 			ut_ad(upper_rw_latch == RW_X_LATCH);
2947 			/* release all blocks */
2948 			for (; n_releases <= n_blocks; n_releases++) {
2949 				mtr_release_block_at_savepoint(
2950 					mtr, tree_savepoints[n_releases],
2951 					tree_blocks[n_releases]);
2952 			}
2953 
2954 			lock_intention = BTR_INTENTION_BOTH;
2955 
2956 			page_id.set_page_no(dict_index_get_page(index));
2957 
2958 			height = ULINT_UNDEFINED;
2959 
2960 			n_blocks = 0;
2961 			n_releases = 0;
2962 
2963 			continue;
2964 		}
2965 
2966 		if (latch_mode == BTR_MODIFY_TREE
2967 		    && !btr_cur_will_modify_tree(
2968 				cursor->index, page, lock_intention, node_ptr,
2969 				node_ptr_max_size, page_size, mtr)) {
2970 			ut_ad(upper_rw_latch == RW_X_LATCH);
2971 			ut_ad(n_releases <= n_blocks);
2972 
2973 			/* we can release upper blocks */
2974 			for (; n_releases < n_blocks; n_releases++) {
2975 				if (n_releases == 0) {
2976 					/* do not release the root page,
2977 					to keep it pinned to the same block. */
2978 					continue;
2979 				}
2980 
2981 				/* release unused blocks to unpin */
2982 				mtr_release_block_at_savepoint(
2983 					mtr, tree_savepoints[n_releases],
2984 					tree_blocks[n_releases]);
2985 			}
2986 		}
2987 
2988 		if (height == 0
2989 		    && latch_mode == BTR_MODIFY_TREE) {
2990 			ut_ad(upper_rw_latch == RW_X_LATCH);
2991 			/* we should sx-latch root page, if released already.
2992 			It contains seg_header. */
2993 			if (n_releases > 0) {
2994 				mtr_block_sx_latch_at_savepoint(
2995 					mtr, tree_savepoints[0],
2996 					tree_blocks[0]);
2997 			}
2998 
2999 			/* x-latch the branch blocks not released yet. */
3000 			for (ulint i = n_releases; i <= n_blocks; i++) {
3001 				mtr_block_x_latch_at_savepoint(
3002 					mtr, tree_savepoints[i],
3003 					tree_blocks[i]);
3004 			}
3005 		}
3006 
3007 		/* Go to the child node */
3008 		page_id.set_page_no(
3009 			btr_node_ptr_get_child_page_no(node_ptr, offsets));
3010 
3011 		n_blocks++;
3012 	}
3013 
3014 exit_loop:
3015 	if (UNIV_LIKELY_NULL(heap)) {
3016 		mem_heap_free(heap);
3017 	}
3018 
3019 	return(true);
3020 }
3021 
3022 /*==================== B-TREE INSERT =========================*/
3023 
3024 /*************************************************************//**
3025 Inserts a record if there is enough space, or if enough space can
3026 be freed by reorganizing. Differs from btr_cur_optimistic_insert because
3027 no heuristic is applied to whether it pays to use CPU time for
3028 reorganizing the page or not.
3029 
3030 IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
3031 if this is a compressed leaf page in a secondary index.
3032 This has to be done either within the same mini-transaction,
3033 or by invoking ibuf_reset_free_bits() before mtr_commit().
3034 
3035 @return pointer to inserted record if succeed, else NULL */
3036 static MY_ATTRIBUTE((nonnull, warn_unused_result))
3037 rec_t*
3038 btr_cur_insert_if_possible(
3039 /*=======================*/
3040 	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
3041 				cursor stays valid */
3042 	const dtuple_t*	tuple,	/*!< in: tuple to insert; the size info need not
3043 				have been stored to tuple */
3044 	ulint**		offsets,/*!< out: offsets on *rec */
3045 	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
3046 	ulint		n_ext,	/*!< in: number of externally stored columns */
3047 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
3048 {
3049 	page_cur_t*	page_cursor;
3050 	rec_t*		rec;
3051 
3052 	ut_ad(dtuple_check_typed(tuple));
3053 
3054 	ut_ad(mtr_is_block_fix(
3055 		mtr, btr_cur_get_block(cursor),
3056 		MTR_MEMO_PAGE_X_FIX, cursor->index->table));
3057 	page_cursor = btr_cur_get_page_cur(cursor);
3058 
3059 	/* Now, try the insert */
3060 	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
3061 				    offsets, heap, n_ext, mtr);
3062 
3063 	/* If the record did not fit, reorganize.
3064 	For compressed pages, page_cur_tuple_insert()
3065 	attempted this already. */
3066 	if (!rec && !page_cur_get_page_zip(page_cursor)
3067 	    && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
3068 		rec = page_cur_tuple_insert(
3069 			page_cursor, tuple, cursor->index,
3070 			offsets, heap, n_ext, mtr);
3071 	}
3072 
3073 	ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
3074 	return(rec);
3075 }
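/* A hedged sketch of the caller obligation noted above (not part of the
original source): after a successful insert on a compressed leaf page of a
secondary index, a caller is expected to do roughly

	if (rec != NULL && !dict_index_is_clust(index)) {
		ibuf_reset_free_bits(btr_cur_get_block(cursor));
	}
	mtr_commit(&mtr);

or, alternatively, to update the bits within this same mini-transaction,
e.g. with ibuf_update_free_bits_zip(). */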
3076 
3077 /*************************************************************//**
3078 For an insert, checks the locks and does the undo logging if desired.
3079 @return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
3080 UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
3081 dberr_t
3082 btr_cur_ins_lock_and_undo(
3083 /*======================*/
3084 	ulint		flags,	/*!< in: undo logging and locking flags: if
3085 				not zero, the parameters index and thr
3086 				should be specified */
3087 	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert */
3088 	dtuple_t*	entry,	/*!< in/out: entry to insert */
3089 	que_thr_t*	thr,	/*!< in: query thread or NULL */
3090 	mtr_t*		mtr,	/*!< in/out: mini-transaction */
3091 	ibool*		inherit)/*!< out: TRUE if the inserted new record maybe
3092 				should inherit LOCK_GAP type locks from the
3093 				successor record */
3094 {
3095 	dict_index_t*	index;
3096 	dberr_t		err = DB_SUCCESS;
3097 	rec_t*		rec;
3098 	roll_ptr_t	roll_ptr;
3099 
3100 	/* Check if we have to wait for a lock: enqueue an explicit lock
3101 	request if yes */
3102 
3103 	rec = btr_cur_get_rec(cursor);
3104 	index = cursor->index;
3105 
3106 	ut_ad(!dict_index_is_online_ddl(index)
3107 	      || dict_index_is_clust(index)
3108 	      || (flags & BTR_CREATE_FLAG));
3109 	ut_ad(mtr->is_named_space(index->space));
3110 
3111 	/* Check if there is predicate or GAP lock preventing the insertion */
3112 	if (!(flags & BTR_NO_LOCKING_FLAG)) {
3113 		if (dict_index_is_spatial(index)) {
3114 			lock_prdt_t	prdt;
3115 			rtr_mbr_t	mbr;
3116 
3117 			rtr_get_mbr_from_tuple(entry, &mbr);
3118 
3119 			/* Use an on-stack MBR variable to test if a lock is
3120 			needed. If so, the predicate (MBR) will be allocated
3121 			from lock heap in lock_prdt_insert_check_and_lock() */
3122 			lock_init_prdt_from_mbr(
3123 				&prdt, &mbr, 0, NULL);
3124 
3125 			err = lock_prdt_insert_check_and_lock(
3126 				flags, rec, btr_cur_get_block(cursor),
3127 				index, thr, mtr, &prdt);
3128 			*inherit = false;
3129 		} else {
3130 			err = lock_rec_insert_check_and_lock(
3131 				flags, rec, btr_cur_get_block(cursor),
3132 				index, thr, mtr, inherit);
3133 		}
3134 	}
3135 
3136 	if (err != DB_SUCCESS
3137 	    || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {
3138 
3139 		return(err);
3140 	}
3141 
3142 	err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
3143 					    thr, index, entry,
3144 					    NULL, 0, NULL, NULL,
3145 					    &roll_ptr);
3146 	if (err != DB_SUCCESS) {
3147 
3148 		return(err);
3149 	}
3150 
3151 	/* Now we can fill in the roll ptr field in entry
3152 	(except if table is intrinsic) */
3153 
3154 	if (!(flags & BTR_KEEP_SYS_FLAG)
3155 	    && !dict_table_is_intrinsic(index->table)) {
3156 
3157 		row_upd_index_entry_sys_field(entry, index,
3158 					      DATA_ROLL_PTR, roll_ptr);
3159 	}
3160 
3161 	return(DB_SUCCESS);
3162 }
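/* Added note (not part of the original source): the roll_ptr written into
the DATA_ROLL_PTR system field above makes the clustered index record point
at the undo log entry that was just created, which is what rollback and
multi-version reads later follow.  Secondary index entries carry no such
field, which is why this function returns early for them. */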
3163 
3164 /**
3165 Prefetch siblings of the leaf for the pessimistic operation.
3166 @param block	leaf page */
3167 static
3168 void
3169 btr_cur_prefetch_siblings(
3170 	buf_block_t*	block)
3171 {
3172 	page_t*	page = buf_block_get_frame(block);
3173 
3174 	ut_ad(page_is_leaf(page));
3175 
3176 	ulint left_page_no = fil_page_get_prev(page);
3177 	ulint right_page_no = fil_page_get_next(page);
3178 
3179 	if (left_page_no != FIL_NULL) {
3180 		buf_read_page_background(
3181 			page_id_t(block->page.id.space(), left_page_no),
3182 			block->page.size, false);
3183 	}
3184 	if (right_page_no != FIL_NULL) {
3185 		buf_read_page_background(
3186 			page_id_t(block->page.id.space(), right_page_no),
3187 			block->page.size, false);
3188 	}
3189 	if (left_page_no != FIL_NULL
3190 	    || right_page_no != FIL_NULL) {
3191 		os_aio_simulated_wake_handler_threads();
3192 	}
3193 }
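/* Added note (not part of the original source): a pessimistic insert or
delete may have to split into, or merge with, the neighbouring leaf pages,
so queueing background reads for them here hides the I/O latency that would
otherwise be paid synchronously while the index tree is x-latched. */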
3194 
3195 /*************************************************************//**
3196 Tries to perform an insert to a page in an index tree, next to cursor.
3197 It is assumed that mtr holds an x-latch on the page. The operation does
3198 not succeed if there is too little space on the page. If there is just
3199 one record on the page, the insert will always succeed; this is to
3200 prevent trying to split a page with just one record.
3201 @return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
3202 dberr_t
3203 btr_cur_optimistic_insert(
3204 /*======================*/
3205 	ulint		flags,	/*!< in: undo logging and locking flags: if not
3206 				zero, the parameters index and thr should be
3207 				specified */
3208 	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
3209 				cursor stays valid */
3210 	ulint**		offsets,/*!< out: offsets on *rec */
3211 	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
3212 	dtuple_t*	entry,	/*!< in/out: entry to insert */
3213 	rec_t**		rec,	/*!< out: pointer to inserted record if
3214 				succeed */
3215 	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
3216 				be stored externally by the caller, or
3217 				NULL */
3218 	ulint		n_ext,	/*!< in: number of externally stored columns */
3219 	que_thr_t*	thr,	/*!< in: query thread or NULL */
3220 	mtr_t*		mtr)	/*!< in/out: mini-transaction;
3221 				if this function returns DB_SUCCESS on
3222 				a leaf page of a secondary index in a
3223 				compressed tablespace, the caller must
3224 				mtr_commit(mtr) before latching
3225 				any further pages */
3226 {
3227 	big_rec_t*	big_rec_vec	= NULL;
3228 	dict_index_t*	index;
3229 	page_cur_t*	page_cursor;
3230 	buf_block_t*	block;
3231 	page_t*		page;
3232 	rec_t*		dummy;
3233 	ibool		leaf;
3234 	ibool		reorg;
3235 	ibool		inherit = TRUE;
3236 	ulint		rec_size;
3237 	dberr_t		err;
3238 
3239 	*big_rec = NULL;
3240 
3241 	block = btr_cur_get_block(cursor);
3242 
3243 	SRV_CORRUPT_TABLE_CHECK(block, return(DB_CORRUPTION););
3244 
3245 	page = buf_block_get_frame(block);
3246 	index = cursor->index;
3247 
3248 	/* Blocks are not latched for insert if the table is intrinsic
3249 	and the index is an auto-generated clustered index. */
3250 	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
3251 	ut_ad(!dict_index_is_online_ddl(index)
3252 	      || dict_index_is_clust(index)
3253 	      || (flags & BTR_CREATE_FLAG));
3254 	ut_ad(dtuple_check_typed(entry));
3255 
3256 	const page_size_t&	page_size = block->page.size;
3257 
3258 #ifdef UNIV_DEBUG_VALGRIND
3259 	if (page_size.is_compressed()) {
3260 		UNIV_MEM_ASSERT_RW(page, page_size.logical());
3261 		UNIV_MEM_ASSERT_RW(block->page.zip.data, page_size.physical());
3262 	}
3263 #endif /* UNIV_DEBUG_VALGRIND */
3264 
3265 	leaf = page_is_leaf(page);
3266 
3267 	/* Calculate the record size when entry is converted to a record */
3268 	rec_size = rec_get_converted_size(index, entry, n_ext);
3269 
3270 	if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
3271 				   dtuple_get_n_fields(entry), page_size)) {
3272 
3273 		/* The record is so big that we have to store some fields
3274 		externally on separate database pages */
3275 		big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3276 
3277 		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
3278 
3279 			return(DB_TOO_BIG_RECORD);
3280 		}
3281 
3282 		rec_size = rec_get_converted_size(index, entry, n_ext);
3283 	}
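	/* Added note (not part of the original source):
	dtuple_convert_big_rec() moves the longest externally storable
	fields out of the entry, leaving only a 20-byte external field
	reference (BTR_EXTERN_FIELD_REF_SIZE) in their place; the caller
	receives them through the *big_rec out parameter and must store
	them on separate BLOB pages after this insert succeeds. */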
3284 
3285 	if (page_size.is_compressed() && page_zip_is_too_big(index, entry)) {
3286 		if (big_rec_vec != NULL) {
3287 			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3288 		}
3289 
3290 		return(DB_TOO_BIG_RECORD);
3291 	}
3292 
3293 	LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
3294 				      goto fail);
3295 
3296 	if (leaf && page_size.is_compressed()
3297 	    && (page_get_data_size(page) + rec_size
3298 		>= dict_index_zip_pad_optimal_page_size(index))) {
3299 		/* If compression padding tells us that insertion will
3300 		result in a too densely packed page, i.e. one that is likely
3301 		to cause a compression failure, then do not attempt an
3302 		optimistic insertion. */
3303 fail:
3304 		err = DB_FAIL;
3305 
3306 		/* prefetch siblings of the leaf for the pessimistic
3307 		operation, if the page is leaf. */
3308 		if (page_is_leaf(page)) {
3309 			btr_cur_prefetch_siblings(block);
3310 		}
3311 fail_err:
3312 
3313 		if (big_rec_vec) {
3314 			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3315 		}
3316 
3317 		return(err);
3318 	}
3319 
3320 	ulint	max_size = page_get_max_insert_size_after_reorganize(page, 1);
3321 
3322 	if (page_has_garbage(page)) {
3323 		if ((max_size < rec_size
3324 		     || max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT)
3325 		    && page_get_n_recs(page) > 1
3326 		    && page_get_max_insert_size(page, 1) < rec_size) {
3327 
3328 			goto fail;
3329 		}
3330 	} else if (max_size < rec_size) {
3331 		goto fail;
3332 	}
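	/* Added note (not part of the original source): max_size is the
	room that would be available after reorganizing the page.  When the
	page holds garbage we fall back to the pessimistic path only if even
	reorganization cannot make the record fit (or would leave less than
	BTR_CUR_PAGE_REORGANIZE_LIMIT), the record does not fit as-is, and
	the page has more than one record, in line with the rule above that
	a page with a single record should not be split. */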
3333 
3334 	/* If there have been many consecutive inserts to the
3335 	clustered index leaf page of an uncompressed table, check if
3336 	we have to split the page to reserve enough free space for
3337 	future updates of records. */
3338 
3339 	if (leaf && !page_size.is_compressed() && dict_index_is_clust(index)
3340 	    && page_get_n_recs(page) >= 2
3341 	    && dict_index_get_space_reserve() + rec_size > max_size
3342 	    && (btr_page_get_split_rec_to_right(cursor, &dummy)
3343 		|| btr_page_get_split_rec_to_left(cursor, &dummy))) {
3344 		goto fail;
3345 	}
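	/* Added note (not part of the original source):
	dict_index_get_space_reserve() is a small per-page headroom (about
	1/16 of the page in current sources, i.e. roughly 1 KiB with 16 KiB
	pages) kept free on clustered index leaf pages so that later updates
	which grow records do not immediately force page splits; if the
	insert would eat into that headroom and a reasonable split point
	exists, we prefer to split via the pessimistic path. */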
3346 
3347 	page_cursor = btr_cur_get_page_cur(cursor);
3348 
3349 	DBUG_PRINT("ib_cur", ("insert %s (" IB_ID_FMT ") by " TRX_ID_FMT
3350 			      ": %s",
3351 			      index->name(), index->id,
3352 			      thr != NULL
3353 			      ? trx_get_id_for_print(thr_get_trx(thr))
3354 			      : 0,
3355 			      rec_printer(entry).str().c_str()));
3356 
3357 	DBUG_EXECUTE_IF("do_page_reorganize",
3358 			btr_page_reorganize(page_cursor, index, mtr););
3359 
3360 	/* Now, try the insert */
3361 	{
3362 		const rec_t*	page_cursor_rec = page_cur_get_rec(page_cursor);
3363 
3364 		if (dict_table_is_intrinsic(index->table)) {
3365 
3366 			index->rec_cache.rec_size = rec_size;
3367 
3368 			*rec = page_cur_tuple_direct_insert(
3369 				page_cursor, entry, index, n_ext, mtr);
3370 		} else {
3371 			/* Check locks and write to the undo log,
3372 			if specified */
3373 			err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3374 							thr, mtr, &inherit);
3375 
3376 			if (err != DB_SUCCESS) {
3377 				goto fail_err;
3378 			}
3379 
3380 			*rec = page_cur_tuple_insert(
3381 				page_cursor, entry, index, offsets, heap,
3382 				n_ext, mtr);
3383 		}
3384 
3385 		reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
3386 	}
3387 
3388 	if (*rec) {
3389 	} else if (page_size.is_compressed()) {
3390 		/* Reset the IBUF_BITMAP_FREE bits, because
3391 		page_cur_tuple_insert() will have attempted page
3392 		reorganize before failing. */
3393 		if (leaf
3394 		    && !dict_index_is_clust(index)
3395 		    && !dict_table_is_temporary(index->table)) {
3396 			ibuf_reset_free_bits(block);
3397 		}
3398 
3399 		goto fail;
3400 	} else {
3401 
3402 		/* For intrinsic tables we take a consistent path and
3403 		re-organize via the pessimistic path. */
3404 		if (dict_table_is_intrinsic(index->table)) {
3405 			goto fail;
3406 		}
3407 
3408 		ut_ad(!reorg);
3409 
3410 		/* If the record did not fit, reorganize */
3411 		if (!btr_page_reorganize(page_cursor, index, mtr)) {
3412 			ut_ad(0);
3413 			goto fail;
3414 		}
3415 
3416 		ut_ad(page_get_max_insert_size(page, 1) == max_size);
3417 
3418 		reorg = TRUE;
3419 
3420 		*rec = page_cur_tuple_insert(page_cursor, entry, index,
3421 					     offsets, heap, n_ext, mtr);
3422 
3423 		if (UNIV_UNLIKELY(!*rec)) {
3424 			ib::fatal() << "Cannot insert tuple " << *entry
3425 				<< " into index " << index->name
3426 				<< " of table " << index->table->name
3427 				<< ". Max size: " << max_size;
3428 		}
3429 	}
3430 
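	/* Keep the adaptive hash index in sync.  If the page was not
	reorganized and the cursor was positioned via the hash index
	(BTR_CUR_HASH), it is enough to update the existing hash node;
	otherwise build a hash entry for the newly inserted record. */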
3431 	if (!index->disable_ahi) {
3432 		if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
3433 			btr_search_update_hash_node_on_insert(cursor);
3434 		} else {
3435 			btr_search_update_hash_on_insert(cursor);
3436 		}
3437 	}
3438 
3439 	if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
3440 
3441 		lock_update_insert(block, *rec);
3442 	}
3443 
3444 	if (leaf
3445 	    && !dict_index_is_clust(index)
3446 	    && !dict_table_is_temporary(index->table)) {
3447 		/* Update the free bits of the B-tree page in the
3448 		insert buffer bitmap. */
3449 
3450 		/* The free bits in the insert buffer bitmap must
3451 		never exceed the free space on a page.  It is safe to
3452 		decrement or reset the bits in the bitmap in a
3453 		mini-transaction that is committed before the
3454 		mini-transaction that affects the free space. */
3455 
3456 		/* It is unsafe to increment the bits in a separately
3457 		committed mini-transaction, because in crash recovery,
3458 		the free bits could momentarily be set too high. */
3459 
3460 		if (page_size.is_compressed()) {
3461 			/* Update the bits in the same mini-transaction. */
3462 			ibuf_update_free_bits_zip(block, mtr);
3463 		} else {
3464 			/* Decrement the bits in a separate
3465 			mini-transaction. */
3466 			ibuf_update_free_bits_if_full(
3467 				block, max_size,
3468 				rec_size + PAGE_DIR_SLOT_SIZE);
3469 		}
3470 	}
3471 
3472 	*big_rec = big_rec_vec;
3473 
3474 	return(DB_SUCCESS);
3475 }
3476 
3477 /*************************************************************//**
3478 Performs an insert on a page of an index tree. It is assumed that mtr
3479 holds an x-latch on the tree and on the cursor page. If the insert is
3480 made on the leaf level, to avoid deadlocks, mtr must also own x-latches
3481 to brothers of page, if those brothers exist.
3482 @return DB_SUCCESS or error number */
3483 dberr_t
3484 btr_cur_pessimistic_insert(
3485 /*=======================*/
3486 	ulint		flags,	/*!< in: undo logging and locking flags: if not
3487 				zero, the parameter thr should be
3488 				specified; if no undo logging is specified,
3489 				then the caller must have reserved enough
3490 				free extents in the file space so that the
3491 				insertion will certainly succeed */
3492 	btr_cur_t*	cursor,	/*!< in: cursor after which to insert;
3493 				cursor stays valid */
3494 	ulint**		offsets,/*!< out: offsets on *rec */
3495 	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
3496 				that can be emptied, or NULL */
3497 	dtuple_t*	entry,	/*!< in/out: entry to insert */
3498 	rec_t**		rec,	/*!< out: pointer to inserted record if
3499 				successful */
3500 	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
3501 				be stored externally by the caller, or
3502 				NULL */
3503 	ulint		n_ext,	/*!< in: number of externally stored columns */
3504 	que_thr_t*	thr,	/*!< in: query thread or NULL */
3505 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
3506 {
3507 	dict_index_t*	index		= cursor->index;
3508 	big_rec_t*	big_rec_vec	= NULL;
3509 	dberr_t		err;
3510 	ibool		inherit = FALSE;
3511 	bool		success;
3512 	ulint		n_reserved	= 0;
3513 
3514 	ut_ad(dtuple_check_typed(entry));
3515 
3516 	*big_rec = NULL;
3517 
3518 	ut_ad(mtr_memo_contains_flagged(
3519 		mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
3520 		MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
3521 	      || dict_table_is_intrinsic(cursor->index->table));
3522 	ut_ad(mtr_is_block_fix(
3523 		mtr, btr_cur_get_block(cursor),
3524 		MTR_MEMO_PAGE_X_FIX, cursor->index->table));
3525 	ut_ad(!dict_index_is_online_ddl(index)
3526 	      || dict_index_is_clust(index)
3527 	      || (flags & BTR_CREATE_FLAG));
3528 
3529 	cursor->flag = BTR_CUR_BINARY;
3530 
3531 	/* Check locks and write to undo log, if specified */
3532 
3533 	err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3534 					thr, mtr, &inherit);
3535 
3536 	if (err != DB_SUCCESS) {
3537 
3538 		return(err);
3539 	}
3540 
3541 	if (!(flags & BTR_NO_UNDO_LOG_FLAG)
3542 	    || dict_table_is_intrinsic(index->table)) {
3543 
3544 		ut_a(cursor->tree_height != ULINT_UNDEFINED);
3545 
3546 		/* First reserve enough free space for the file segments
3547 		of the index tree, so that the insert will not fail because
3548 		of lack of space */
3549 
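		/* Heuristic: reserve one extent per 16 levels of the tree,
		plus a margin of three extents, so that the page splits made
		by this insert should not run out of reserved space. */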
3550 		ulint	n_extents = cursor->tree_height / 16 + 3;
3551 
3552 		success = fsp_reserve_free_extents(&n_reserved, index->space,
3553 						   n_extents, FSP_NORMAL, mtr);
3554 		if (!success) {
3555 			return(DB_OUT_OF_FILE_SPACE);
3556 		}
3557 	}
3558 
3559 	if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
3560 				   dict_table_is_comp(index->table),
3561 				   dtuple_get_n_fields(entry),
3562 				   dict_table_page_size(index->table))) {
3563 		/* The record is so big that we have to store some fields
3564 		externally on separate database pages */
3565 
3566 		if (UNIV_LIKELY_NULL(big_rec_vec)) {
3567 			/* This should never happen, but we handle
3568 			the situation in a robust manner. */
3569 			ut_ad(0);
3570 			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3571 		}
3572 
3573 		big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3574 
3575 		if (big_rec_vec == NULL) {
3576 
3577 			if (n_reserved > 0) {
3578 				fil_space_release_free_extents(index->space,
3579 							       n_reserved);
3580 			}
3581 			return(DB_TOO_BIG_RECORD);
3582 		}
3583 	}
3584 
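	/* A pessimistic insert always splits a page: if the cursor is on
	the root page, raise the tree by one level and insert; otherwise
	split the cursor page and insert into one of the resulting pages. */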
3585 	if (dict_index_get_page(index)
3586 	    == btr_cur_get_block(cursor)->page.id.page_no()) {
3587 
3588 		/* The page is the root page */
3589 		*rec = btr_root_raise_and_insert(
3590 			flags, cursor, offsets, heap, entry, n_ext, mtr);
3591 	} else {
3592 		*rec = btr_page_split_and_insert(
3593 			flags, cursor, offsets, heap, entry, n_ext, mtr);
3594 	}
3595 
3596 	ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec
3597 	      || dict_index_is_spatial(index));
3598 
3599 	if (!(flags & BTR_NO_LOCKING_FLAG)) {
3600 		ut_ad(!dict_table_is_temporary(index->table));
3601 		if (dict_index_is_spatial(index)) {
3602 			/* Do nothing */
3603 		} else {
3604 			/* The cursor might have been moved to another page,
3605 			so the max trx id field must be updated after the
3606 			cursor has been positioned. */
3607 			if (!dict_index_is_clust(index)) {
3608 				page_update_max_trx_id(
3609 					btr_cur_get_block(cursor),
3610 					btr_cur_get_page_zip(cursor),
3611 					thr_get_trx(thr)->id, mtr);
3612 			}
3613 			if (!page_rec_is_infimum(btr_cur_get_rec(cursor))
3614 			    || btr_page_get_prev(
3615 				buf_block_get_frame(
3616 					btr_cur_get_block(cursor)), mtr)
3617 			       == FIL_NULL) {
3618 				/* After a split-and-insert we always need
3619 				to call lock_update_insert(). */
3620 				inherit = TRUE;
3621 			}
3622 		}
3623 	}
3624 
3625 	if (!index->disable_ahi) {
3626 		btr_search_update_hash_on_insert(cursor);
3627 	}
3628 	if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {
3629 
3630 		lock_update_insert(btr_cur_get_block(cursor), *rec);
3631 	}
3632 
3633 	if (n_reserved > 0) {
3634 		fil_space_release_free_extents(index->space, n_reserved);
3635 	}
3636 
3637 	*big_rec = big_rec_vec;
3638 
3639 	return(DB_SUCCESS);
3640 }
3641 
3642 /*==================== B-TREE UPDATE =========================*/
3643 
3644 /*************************************************************//**
3645 For an update, checks the locks and does the undo logging.
3646 @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
3647 UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
3648 dberr_t
3649 btr_cur_upd_lock_and_undo(
3650 /*======================*/
3651 	ulint		flags,	/*!< in: undo logging and locking flags */
3652 	btr_cur_t*	cursor,	/*!< in: cursor on record to update */
3653 	const ulint*	offsets,/*!< in: rec_get_offsets() on cursor */
3654 	const upd_t*	update,	/*!< in: update vector */
3655 	ulint		cmpl_info,/*!< in: compiler info on secondary index
3656 				updates */
3657 	que_thr_t*	thr,	/*!< in: query thread
3658 				(can be NULL if BTR_NO_LOCKING_FLAG) */
3659 	mtr_t*		mtr,	/*!< in/out: mini-transaction */
3660 	roll_ptr_t*	roll_ptr)/*!< out: roll pointer */
3661 {
3662 	dict_index_t*	index;
3663 	const rec_t*	rec;
3664 	dberr_t		err;
3665 
3666 	ut_ad(thr != NULL || (flags & BTR_NO_LOCKING_FLAG));
3667 
3668 	rec = btr_cur_get_rec(cursor);
3669 	index = cursor->index;
3670 
3671 	ut_ad(rec_offs_validate(rec, index, offsets));
3672 	ut_ad(mtr->is_named_space(index->space));
3673 
3674 	if (!dict_index_is_clust(index)) {
3675 		ut_ad(dict_index_is_online_ddl(index)
3676 		      == !!(flags & BTR_CREATE_FLAG));
3677 
3678 		/* We do undo logging only when we update a clustered index
3679 		record */
3680 		return(lock_sec_rec_modify_check_and_lock(
3681 			       flags, btr_cur_get_block(cursor), rec,
3682 			       index, thr, mtr));
3683 	}
3684 
3685 	/* Check if we have to wait for a lock: enqueue an explicit lock
3686 	request if yes */
3687 
3688 	if (!(flags & BTR_NO_LOCKING_FLAG)) {
3689 		err = lock_clust_rec_modify_check_and_lock(
3690 			flags, btr_cur_get_block(cursor), rec, index,
3691 			offsets, thr);
3692 		if (err != DB_SUCCESS) {
3693 			return(err);
3694 		}
3695 	}
3696 
3697 	/* Append the info about the update in the undo log */
3698 
3699 	return(trx_undo_report_row_operation(
3700 		       flags, TRX_UNDO_MODIFY_OP, thr,
3701 		       index, NULL, update,
3702 		       cmpl_info, rec, offsets, roll_ptr));
3703 }
3704 
3705 /***********************************************************//**
3706 Writes a redo log record of updating a record in-place. */
3707 void
3708 btr_cur_update_in_place_log(
3709 /*========================*/
3710 	ulint		flags,		/*!< in: flags */
3711 	const rec_t*	rec,		/*!< in: record */
3712 	dict_index_t*	index,		/*!< in: index of the record */
3713 	const upd_t*	update,		/*!< in: update vector */
3714 	trx_id_t	trx_id,		/*!< in: transaction id */
3715 	roll_ptr_t	roll_ptr,	/*!< in: roll ptr */
3716 	mtr_t*		mtr)		/*!< in: mtr */
3717 {
3718 	byte*		log_ptr;
3719 	const page_t*	page	= page_align(rec);
3720 	ut_ad(flags < 256);
3721 	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
3722 
3723 	log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
3724 					    ? MLOG_COMP_REC_UPDATE_IN_PLACE
3725 					    : MLOG_REC_UPDATE_IN_PLACE,
3726 					    1 + DATA_ROLL_PTR_LEN + 14 + 2
3727 					    + MLOG_BUF_MARGIN);
3728 
3729 	if (!log_ptr) {
3730 		/* Logging in mtr is switched off during crash recovery */
3731 		return;
3732 	}
3733 
3734 	/* For secondary indexes, we could skip writing the dummy system
3735 	fields to the redo log, but that would require changing the redo log
3736 	parsing of MLOG_REC_UPDATE_IN_PLACE/MLOG_COMP_REC_UPDATE_IN_PLACE or
3737 	adding a new redo log record type. For now, just write the dummy
3738 	system fields to the redo log if we are updating a secondary index
3739 	record. */
3740 	mach_write_to_1(log_ptr, flags);
3741 	log_ptr++;
3742 
3743 	if (dict_index_is_clust(index)) {
3744 		log_ptr = row_upd_write_sys_vals_to_log(
3745 				index, trx_id, roll_ptr, log_ptr, mtr);
3746 	} else {
3747 		/* Dummy system fields for a secondary index */
3748 		/* TRX_ID Position */
3749 		log_ptr += mach_write_compressed(log_ptr, 0);
3750 		/* ROLL_PTR */
3751 		trx_write_roll_ptr(log_ptr, 0);
3752 		log_ptr += DATA_ROLL_PTR_LEN;
3753 		/* TRX_ID */
3754 		log_ptr += mach_u64_write_compressed(log_ptr, 0);
3755 	}
3756 
3757 	mach_write_to_2(log_ptr, page_offset(rec));
3758 	log_ptr += 2;
3759 
3760 	row_upd_index_write_log(update, log_ptr, mtr);
3761 }
3762 #endif /* UNIV_HOTBACKUP */
3763 
3764 /***********************************************************//**
3765 Parses a redo log record of updating a record in-place.
3766 @return end of log record or NULL */
3767 byte*
3768 btr_cur_parse_update_in_place(
3769 /*==========================*/
3770 	byte*		ptr,	/*!< in: buffer */
3771 	byte*		end_ptr,/*!< in: buffer end */
3772 	page_t*		page,	/*!< in/out: page or NULL */
3773 	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
3774 	dict_index_t*	index)	/*!< in: index corresponding to page */
3775 {
3776 	ulint		flags;
3777 	rec_t*		rec;
3778 	upd_t*		update;
3779 	ulint		pos;
3780 	trx_id_t	trx_id;
3781 	roll_ptr_t	roll_ptr;
3782 	ulint		rec_offset;
3783 	mem_heap_t*	heap;
3784 	ulint*		offsets;
3785 
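	/* The log record body parsed here mirrors what
	btr_cur_update_in_place_log() writes: one byte of flags, the system
	column values (TRX_ID position, ROLL_PTR, TRX_ID), a two-byte page
	offset of the record, and the update vector. */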
3786 	if (end_ptr < ptr + 1) {
3787 
3788 		return(NULL);
3789 	}
3790 
3791 	flags = mach_read_from_1(ptr);
3792 	ptr++;
3793 
3794 	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
3795 
3796 	if (ptr == NULL) {
3797 
3798 		return(NULL);
3799 	}
3800 
3801 	if (end_ptr < ptr + 2) {
3802 
3803 		return(NULL);
3804 	}
3805 
3806 	rec_offset = mach_read_from_2(ptr);
3807 	ptr += 2;
3808 
3809 	ut_a(rec_offset <= UNIV_PAGE_SIZE);
3810 
3811 	heap = mem_heap_create(256);
3812 
3813 	ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
3814 
3815 	if (!ptr || !page) {
3816 
3817 		goto func_exit;
3818 	}
3819 
3820 	ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
3821 	rec = page + rec_offset;
3822 
3823 	/* We do not need to reserve search latch, as the page is only
3824 	being recovered, and there cannot be a hash index to it. */
3825 
3826 	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
3827 
3828 	if (!(flags & BTR_KEEP_SYS_FLAG)) {
3829 		row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
3830 						   pos, trx_id, roll_ptr);
3831 	}
3832 
3833 	row_upd_rec_in_place(rec, index, offsets, update, page_zip);
3834 
3835 func_exit:
3836 	mem_heap_free(heap);
3837 
3838 	return(ptr);
3839 }
3840 
3841 #ifndef UNIV_HOTBACKUP
3842 /*************************************************************//**
3843 See if there is enough space in the page modification log to log
3844 an update-in-place.
3845 
3846 @retval false if out of space; IBUF_BITMAP_FREE will be reset
3847 outside mtr if the page was recompressed
3848 @retval true if enough space
3849 
3850 IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
3851 a secondary index leaf page. This has to be done either within the
3852 same mini-transaction, or by invoking ibuf_reset_free_bits() before
3853 mtr_commit(mtr). */
3854 bool
3855 btr_cur_update_alloc_zip_func(
3856 /*==========================*/
3857 	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
3858 	page_cur_t*	cursor,	/*!< in/out: B-tree page cursor */
3859 	dict_index_t*	index,	/*!< in: the index corresponding to cursor */
3860 #ifdef UNIV_DEBUG
3861 	ulint*		offsets,/*!< in/out: offsets of the cursor record */
3862 #endif /* UNIV_DEBUG */
3863 	ulint		length,	/*!< in: size needed */
3864 	bool		create,	/*!< in: true=delete-and-insert,
3865 				false=update-in-place */
3866 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
3867 {
3868 	const page_t*	page = page_cur_get_page(cursor);
3869 
3870 	ut_ad(page_zip == page_cur_get_page_zip(cursor));
3871 	ut_ad(page_zip);
3872 	ut_ad(!dict_index_is_ibuf(index));
3873 	ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
3874 
3875 	if (page_zip_available(page_zip, dict_index_is_clust(index),
3876 			       length, create)) {
3877 		return(true);
3878 	}
3879 
3880 	if (!page_zip->m_nonempty && !page_has_garbage(page)) {
3881 		/* The page has been freshly compressed, so
3882 		reorganizing it will not help. */
3883 		return(false);
3884 	}
3885 
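	/* For a delete-and-insert on a leaf page, refuse to grow the data
	size beyond the compression padding target even if a reorganization
	might make room; this keeps a margin against future compression
	failures (see dict_index_zip_pad_optimal_page_size()). */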
3886 	if (create && page_is_leaf(page)
3887 	    && (length + page_get_data_size(page)
3888 		>= dict_index_zip_pad_optimal_page_size(index))) {
3889 		return(false);
3890 	}
3891 
3892 	if (!btr_page_reorganize(cursor, index, mtr)) {
3893 		goto out_of_space;
3894 	}
3895 
3896 	rec_offs_make_valid(page_cur_get_rec(cursor), index, offsets);
3897 
3898 	/* After recompressing a page, we must make sure that the free
3899 	bits in the insert buffer bitmap will not exceed the free
3900 	space on the page.  Because this function will not attempt
3901 	recompression unless page_zip_available() fails above, it is
3902 	safe to reset the free bits if page_zip_available() fails
3903 	again, below.  The free bits can safely be reset in a separate
3904 	mini-transaction.  If page_zip_available() succeeds below, we
3905 	can be sure that the btr_page_reorganize() above did not reduce
3906 	the free space available on the page. */
3907 
3908 	if (page_zip_available(page_zip, dict_index_is_clust(index),
3909 			       length, create)) {
3910 		return(true);
3911 	}
3912 
3913 out_of_space:
3914 	ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
3915 
3916 	/* Out of space: reset the free bits. */
3917 	if (!dict_index_is_clust(index)
3918 	    && !dict_table_is_temporary(index->table)
3919 	    && page_is_leaf(page)) {
3920 		ibuf_reset_free_bits(page_cur_get_block(cursor));
3921 	}
3922 
3923 	return(false);
3924 }
3925 
3926 /*************************************************************//**
3927 Updates a record when the update causes no size changes in its fields.
3928 We assume here that the ordering fields of the record do not change.
3929 @return locking or undo log related error code, or
3930 @retval DB_SUCCESS on success
3931 @retval DB_ZIP_OVERFLOW if there is not enough space left
3932 on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
3933 dberr_t
3934 btr_cur_update_in_place(
3935 /*====================*/
3936 	ulint		flags,	/*!< in: undo logging and locking flags */
3937 	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
3938 				cursor stays valid and positioned on the
3939 				same record */
3940 	ulint*		offsets,/*!< in/out: offsets on cursor->page_cur.rec */
3941 	const upd_t*	update,	/*!< in: update vector */
3942 	ulint		cmpl_info,/*!< in: compiler info on secondary index
3943 				updates */
3944 	que_thr_t*	thr,	/*!< in: query thread */
3945 	trx_id_t	trx_id,	/*!< in: transaction id */
3946 	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
3947 				is a secondary index, the caller must
3948 				mtr_commit(mtr) before latching any
3949 				further pages */
3950 {
3951 	dict_index_t*	index;
3952 	buf_block_t*	block;
3953 	page_zip_des_t*	page_zip;
3954 	dberr_t		err;
3955 	rec_t*		rec;
3956 	roll_ptr_t	roll_ptr	= 0;
3957 	ulint		was_delete_marked;
3958 	ibool		is_hashed;
3959 
3960 	rec = btr_cur_get_rec(cursor);
3961 	index = cursor->index;
3962 	ut_ad(rec_offs_validate(rec, index, offsets));
3963 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
3964 	ut_ad(trx_id > 0
3965 	      || (flags & BTR_KEEP_SYS_FLAG)
3966 	      || dict_table_is_intrinsic(index->table));
3967 	/* The insert buffer tree should never be updated in place. */
3968 	ut_ad(!dict_index_is_ibuf(index));
3969 	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
3970 	      || dict_index_is_clust(index));
3971 	ut_ad(thr_get_trx(thr)->id == trx_id
3972 	      || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
3973 	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
3974 		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
3975 	ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor)));
3976 	ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
3977 
3978 	DBUG_PRINT("ib_cur", ("update-in-place %s (" IB_ID_FMT
3979 			      ") by " TRX_ID_FMT ": %s",
3980 			      index->name(), index->id, trx_id,
3981 			      rec_printer(rec, offsets).str().c_str()));
3982 
3983 	block = btr_cur_get_block(cursor);
3984 	page_zip = buf_block_get_page_zip(block);
3985 
3986 	/* Check that enough space is available on the compressed page. */
3987 	if (page_zip) {
3988 		if (!btr_cur_update_alloc_zip(
3989 			    page_zip, btr_cur_get_page_cur(cursor),
3990 			    index, offsets, rec_offs_size(offsets),
3991 			    false, mtr)) {
3992 			return(DB_ZIP_OVERFLOW);
3993 		}
3994 
3995 		rec = btr_cur_get_rec(cursor);
3996 	}
3997 
3998 	/* Do lock checking and undo logging */
3999 	err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
4000 					update, cmpl_info,
4001 					thr, mtr, &roll_ptr);
4002 	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
4003 		/* We may need to update the IBUF_BITMAP_FREE
4004 		bits after a reorganize that was done in
4005 		btr_cur_update_alloc_zip(). */
4006 		goto func_exit;
4007 	}
4008 
4009 	if (!(flags & BTR_KEEP_SYS_FLAG)
4010 	    && !dict_table_is_intrinsic(index->table)) {
4011 		row_upd_rec_sys_fields(rec, NULL, index, offsets,
4012 				       thr_get_trx(thr), roll_ptr);
4013 	}
4014 
4015 	was_delete_marked = rec_get_deleted_flag(
4016 		rec, page_is_comp(buf_block_get_frame(block)));
4017 
4018 	is_hashed = (block->index != NULL);
4019 
4020 	if (is_hashed) {
4021 		/* TODO: Can we skip this if none of the first
4022 		index->search_info->curr_n_fields fields are being
4023 		updated? */
4024 
4025 		/* The function row_upd_changes_ord_field_binary() works only
4026 		if the update vector was built for a clustered index; we must
4027 		NOT call it if the index is secondary. */
4028 
4029 		if (!dict_index_is_clust(index)
4030 		    || row_upd_changes_ord_field_binary(index, update, thr,
4031 							NULL, NULL)) {
4032 
4033 			/* Remove possible hash index pointer to this record */
4034 			btr_search_update_hash_on_delete(cursor);
4035 		}
4036 
4037 		rw_lock_x_lock(btr_get_search_latch(index));
4038 	}
4039 
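	/* Apply the update to the record bytes in place.  If the block is
	in the adaptive hash index, this is done while holding the AHI
	x-latch acquired above, so hash searches do not observe a partially
	updated record. */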
4040 	assert_block_ahi_valid(block);
4041 	row_upd_rec_in_place(rec, index, offsets, update, page_zip);
4042 
4043 	if (is_hashed) {
4044 		rw_lock_x_unlock(btr_get_search_latch(index));
4045 	}
4046 
4047 	btr_cur_update_in_place_log(flags, rec, index, update,
4048 				    trx_id, roll_ptr, mtr);
4049 
4050 	if (was_delete_marked
4051 	    && !rec_get_deleted_flag(
4052 		    rec, page_is_comp(buf_block_get_frame(block)))) {
4053 		/* The new updated record owns its possible externally
4054 		stored fields */
4055 
4056 		btr_cur_unmark_extern_fields(page_zip,
4057 					     rec, index, offsets, mtr);
4058 	}
4059 
4060 	ut_ad(err == DB_SUCCESS);
4061 
4062 func_exit:
4063 	if (page_zip
4064 	    && !(flags & BTR_KEEP_IBUF_BITMAP)
4065 	    && !dict_index_is_clust(index)
4066 	    && !dict_table_is_temporary(index->table)
4067 	    && page_is_leaf(buf_block_get_frame(block))) {
4068 		/* Update the free bits in the insert buffer. */
4069 		ibuf_update_free_bits_zip(block, mtr);
4070 	}
4071 
4072 	return(err);
4073 }
4074 
4075 /*************************************************************//**
4076 Tries to update a record on a page in an index tree. It is assumed that mtr
4077 holds an x-latch on the page. The operation does not succeed if there is too
4078 little space on the page or if the update would result in too empty a page,
4079 so that tree compression is recommended. We assume here that the ordering
4080 fields of the record do not change.
4081 @return error code, including
4082 @retval DB_SUCCESS on success
4083 @retval DB_OVERFLOW if the updated record does not fit
4084 @retval DB_UNDERFLOW if the page would become too empty
4085 @retval DB_ZIP_OVERFLOW if there is not enough space left
4086 on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
4087 dberr_t
4088 btr_cur_optimistic_update(
4089 /*======================*/
4090 	ulint		flags,	/*!< in: undo logging and locking flags */
4091 	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
4092 				cursor stays valid and positioned on the
4093 				same record */
4094 	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
4095 	mem_heap_t**	heap,	/*!< in/out: pointer to NULL or memory heap */
4096 	const upd_t*	update,	/*!< in: update vector; this must also
4097 				contain trx id and roll ptr fields */
4098 	ulint		cmpl_info,/*!< in: compiler info on secondary index
4099 				updates */
4100 	que_thr_t*	thr,	/*!< in: query thread */
4101 	trx_id_t	trx_id,	/*!< in: transaction id */
4102 	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
4103 				is a secondary index, the caller must
4104 				mtr_commit(mtr) before latching any
4105 				further pages */
4106 {
4107 	dict_index_t*	index;
4108 	page_cur_t*	page_cursor;
4109 	dberr_t		err;
4110 	buf_block_t*	block;
4111 	page_t*		page;
4112 	page_zip_des_t*	page_zip;
4113 	rec_t*		rec;
4114 	ulint		max_size;
4115 	ulint		new_rec_size;
4116 	ulint		old_rec_size;
4117 	ulint		max_ins_size = 0;
4118 	dtuple_t*	new_entry;
4119 	roll_ptr_t	roll_ptr;
4120 	ulint		i;
4121 	ulint		n_ext;
4122 
4123 	block = btr_cur_get_block(cursor);
4124 	page = buf_block_get_frame(block);
4125 	rec = btr_cur_get_rec(cursor);
4126 	index = cursor->index;
4127 	ut_ad(trx_id > 0
4128 	      || (flags & BTR_KEEP_SYS_FLAG)
4129 	      || dict_table_is_intrinsic(index->table));
4130 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4131 	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
4132 	/* This is intended only for leaf page updates */
4133 	ut_ad(page_is_leaf(page));
4134 	/* The insert buffer tree should never be updated in place. */
4135 	ut_ad(!dict_index_is_ibuf(index));
4136 	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4137 	      || dict_index_is_clust(index));
4138 	ut_ad(thr_get_trx(thr)->id == trx_id
4139 	      || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
4140 	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4141 		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4142 	ut_ad(fil_page_index_page_check(page));
4143 	ut_ad(btr_page_get_index_id(page) == index->id);
4144 
4145 	*offsets = rec_get_offsets(rec, index, *offsets,
4146 				   ULINT_UNDEFINED, heap);
4147 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
4148 	ut_a(!rec_offs_any_null_extern(rec, *offsets)
4149 	     || trx_is_recv(thr_get_trx(thr)));
4150 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
4151 
4152 	if (!row_upd_changes_field_size_or_external(index, *offsets, update)) {
4153 
4154 		/* The simplest and the most common case: the update does not
4155 		change the size of any field and none of the updated fields is
4156 		externally stored in rec or update, and there is enough space
4157 		on the compressed page to log the update. */
4158 
4159 		return(btr_cur_update_in_place(
4160 			       flags, cursor, *offsets, update,
4161 			       cmpl_info, thr, trx_id, mtr));
4162 	}
4163 
4164 	if (rec_offs_any_extern(*offsets)) {
4165 any_extern:
4166 		/* Externally stored fields are handled by the pessimistic
4167 		update path. */
4168 
4169 		/* prefetch siblings of the leaf for the pessimistic
4170 		operation. */
4171 		btr_cur_prefetch_siblings(block);
4172 
4173 		return(DB_OVERFLOW);
4174 	}
4175 
4176 	for (i = 0; i < upd_get_n_fields(update); i++) {
4177 		if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
4178 
4179 			goto any_extern;
4180 		}
4181 	}
4182 
4183 	DBUG_PRINT("ib_cur", ("update %s (" IB_ID_FMT ") by " TRX_ID_FMT
4184 			      ": %s",
4185 			      index->name(), index->id, trx_id,
4186 			      rec_printer(rec, *offsets).str().c_str()));
4187 
4188 	page_cursor = btr_cur_get_page_cur(cursor);
4189 
4190 	if (!*heap) {
4191 		*heap = mem_heap_create(
4192 			rec_offs_size(*offsets)
4193 			+ DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
4194 	}
4195 
4196 	new_entry = row_rec_to_index_entry(rec, index, *offsets,
4197 					   &n_ext, *heap);
4198 	/* We checked above that there are no externally stored fields. */
4199 	ut_a(!n_ext);
4200 
4201 	/* The page containing the clustered index record
4202 	corresponding to new_entry is latched in mtr.
4203 	Thus the following call is safe. */
4204 	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4205 						     FALSE, *heap);
4206 	old_rec_size = rec_offs_size(*offsets);
4207 	new_rec_size = rec_get_converted_size(index, new_entry, 0);
4208 
4209 	page_zip = buf_block_get_page_zip(block);
4210 #ifdef UNIV_ZIP_DEBUG
4211 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4212 #endif /* UNIV_ZIP_DEBUG */
4213 
4214 	if (page_zip) {
4215 		if (!btr_cur_update_alloc_zip(
4216 			    page_zip, page_cursor, index, *offsets,
4217 			    new_rec_size, true, mtr)) {
4218 			return(DB_ZIP_OVERFLOW);
4219 		}
4220 
4221 		rec = page_cur_get_rec(page_cursor);
4222 	}
4223 
4224 	/* We limit max record size to 16k even for 64k page size. */
4225 	if (new_rec_size >= REC_MAX_DATA_SIZE) {
4226 		err = DB_OVERFLOW;
4227 
4228 		goto func_exit;
4229 	}
4230 
4231 	if (UNIV_UNLIKELY(new_rec_size
4232 			  >= (page_get_free_space_of_empty(page_is_comp(page))
4233 			      / 2))) {
4234 		/* We may need to update the IBUF_BITMAP_FREE
4235 		bits after a reorganize that was done in
4236 		btr_cur_update_alloc_zip(). */
4237 		err = DB_OVERFLOW;
4238 		goto func_exit;
4239 	}
4240 
4241 	if (UNIV_UNLIKELY(page_get_data_size(page)
4242 			  - old_rec_size + new_rec_size
4243 			  < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) {
4244 		/* We may need to update the IBUF_BITMAP_FREE
4245 		bits after a reorganize that was done in
4246 		btr_cur_update_alloc_zip(). */
4247 
4248 		/* The page would become too empty */
4249 		err = DB_UNDERFLOW;
4250 		goto func_exit;
4251 	}
4252 
4253 	/* We do not attempt to reorganize if the page is compressed.
4254 	This is because the page may fail to compress after reorganization. */
4255 	max_size = page_zip
4256 		? page_get_max_insert_size(page, 1)
4257 		: (old_rec_size
4258 		   + page_get_max_insert_size_after_reorganize(page, 1));
4259 
4260 	if (!page_zip) {
4261 		max_ins_size = page_get_max_insert_size_after_reorganize(
4262 				page, 1);
4263 	}
4264 
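	/* Proceed only if a reorganized page would both reach the
	reorganization limit and fit the new record, or if the page holds
	at most one record. */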
4265 	if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
4266 	       && (max_size >= new_rec_size))
4267 	      || (page_get_n_recs(page) <= 1))) {
4268 
4269 		/* We may need to update the IBUF_BITMAP_FREE
4270 		bits after a reorganize that was done in
4271 		btr_cur_update_alloc_zip(). */
4272 
4273 		/* There was not enough space, or it did not pay to
4274 		reorganize: for simplicity, we decide what to do assuming a
4275 		reorganization is needed, though it might not be necessary */
4276 
4277 		err = DB_OVERFLOW;
4278 		goto func_exit;
4279 	}
4280 
4281 	/* Do lock checking and undo logging */
4282 	err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4283 					update, cmpl_info,
4284 					thr, mtr, &roll_ptr);
4285 	if (err != DB_SUCCESS) {
4286 		/* We may need to update the IBUF_BITMAP_FREE
4287 		bits after a reorganize that was done in
4288 		btr_cur_update_alloc_zip(). */
4289 		goto func_exit;
4290 	}
4291 
4292 	/* Ok, we may do the replacement. Store on the page infimum the
4293 	explicit locks on rec, before deleting rec (see the comment in
4294 	btr_cur_pessimistic_update). */
4295 	if (!dict_table_is_locking_disabled(index->table)) {
4296 		lock_rec_store_on_page_infimum(block, rec);
4297 	}
4298 
4299 	btr_search_update_hash_on_delete(cursor);
4300 
4301 	page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4302 
4303 	page_cur_move_to_prev(page_cursor);
4304 
4305 	if (!(flags & BTR_KEEP_SYS_FLAG)
4306 	    && !dict_table_is_intrinsic(index->table)) {
4307 		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
4308 					      roll_ptr);
4309 		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
4310 					      trx_id);
4311 	}
4312 
4313 	/* There are no externally stored columns in new_entry */
4314 	rec = btr_cur_insert_if_possible(
4315 		cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
4316 	ut_a(rec); /* <- We calculated above the insert would fit */
4317 
4318 	/* Restore the old explicit lock state on the record */
4319 	if (!dict_table_is_locking_disabled(index->table)) {
4320 		lock_rec_restore_from_page_infimum(block, rec, block);
4321 	}
4322 
4323 	page_cur_move_to_next(page_cursor);
4324 	ut_ad(err == DB_SUCCESS);
4325 
4326 func_exit:
4327 	if (!(flags & BTR_KEEP_IBUF_BITMAP)
4328 	    && !dict_index_is_clust(index)
4329 	    && !dict_table_is_temporary(index->table)) {
4330 		/* Update the free bits in the insert buffer. */
4331 		if (page_zip) {
4332 			ibuf_update_free_bits_zip(block, mtr);
4333 		} else {
4334 			ibuf_update_free_bits_low(block, max_ins_size, mtr);
4335 		}
4336 	}
4337 
4338 	if (err != DB_SUCCESS) {
4339 		/* prefetch siblings of the leaf for the pessimistic
4340 		operation. */
4341 		btr_cur_prefetch_siblings(block);
4342 	}
4343 
4344 	return(err);
4345 }
4346 
4347 /*************************************************************//**
4348 If, in a split, a new supremum record was created as the predecessor of the
4349 updated record, the supremum record must inherit exactly the locks on the
4350 updated record. In the split it may have inherited locks from the successor
4351 of the updated record, which is not correct. This function restores the
4352 right locks for the new supremum. */
4353 static
4354 void
4355 btr_cur_pess_upd_restore_supremum(
4356 /*==============================*/
4357 	buf_block_t*	block,	/*!< in: buffer block of rec */
4358 	const rec_t*	rec,	/*!< in: updated record */
4359 	mtr_t*		mtr)	/*!< in: mtr */
4360 {
4361 	page_t*		page;
4362 	buf_block_t*	prev_block;
4363 
4364 	page = buf_block_get_frame(block);
4365 
4366 	if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
4367 		/* Updated record is not the first user record on its page */
4368 
4369 		return;
4370 	}
4371 
4372 	const ulint	prev_page_no = btr_page_get_prev(page, mtr);
4373 
4374 	const page_id_t	page_id(block->page.id.space(), prev_page_no);
4375 
4376 	ut_ad(prev_page_no != FIL_NULL);
4377 	prev_block = buf_page_get_with_no_latch(page_id, block->page.size, mtr);
4378 #ifdef UNIV_BTR_DEBUG
4379 	ut_a(btr_page_get_next(prev_block->frame, mtr)
4380 	     == page_get_page_no(page));
4381 #endif /* UNIV_BTR_DEBUG */
4382 
4383 	/* We must already have an x-latch on prev_block! */
4384 	ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
4385 
4386 	lock_rec_reset_and_inherit_gap_locks(prev_block, block,
4387 					     PAGE_HEAP_NO_SUPREMUM,
4388 					     page_rec_get_heap_no(rec));
4389 }
4390 
4391 /*************************************************************//**
4392 Performs an update of a record on a page of a tree. It is assumed
4393 that mtr holds an x-latch on the tree and on the cursor page. If the
4394 update is made on the leaf level, to avoid deadlocks, mtr must also
4395 own x-latches to brothers of page, if those brothers exist. We assume
4396 here that the ordering fields of the record do not change.
4397 @return DB_SUCCESS or error code */
4398 dberr_t
4399 btr_cur_pessimistic_update(
4400 /*=======================*/
4401 	ulint		flags,	/*!< in: undo logging, locking, and rollback
4402 				flags */
4403 	btr_cur_t*	cursor,	/*!< in/out: cursor on the record to update;
4404 				cursor may become invalid if *big_rec == NULL
4405 				|| !(flags & BTR_KEEP_POS_FLAG) */
4406 	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
4407 	mem_heap_t**	offsets_heap,
4408 				/*!< in/out: pointer to memory heap
4409 				that can be emptied, or NULL */
4410 	mem_heap_t*	entry_heap,
4411 				/*!< in/out: memory heap for allocating
4412 				big_rec and the index tuple */
4413 	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
4414 				be stored externally by the caller, or NULL */
4415 	upd_t*		update,	/*!< in/out: update vector; this is allowed to
4416 				also contain trx id and roll ptr fields.
4417 				Non-updated columns that are moved offpage will
4418 				be appended to this. */
4419 	ulint		cmpl_info,/*!< in: compiler info on secondary index
4420 				updates */
4421 	que_thr_t*	thr,	/*!< in: query thread */
4422 	trx_id_t	trx_id,	/*!< in: transaction id */
4423 	mtr_t*		mtr)	/*!< in/out: mini-transaction; must be
4424 				committed before latching any further pages */
4425 {
4426 	big_rec_t*	big_rec_vec	= NULL;
4427 	big_rec_t*	dummy_big_rec;
4428 	dict_index_t*	index;
4429 	buf_block_t*	block;
4430 	page_t*		page;
4431 	page_zip_des_t*	page_zip;
4432 	rec_t*		rec;
4433 	page_cur_t*	page_cursor;
4434 	dberr_t		err;
4435 	dberr_t		optim_err;
4436 	roll_ptr_t	roll_ptr;
4437 	ibool		was_first;
4438 	ulint		n_reserved	= 0;
4439 	ulint		n_ext;
4440 	ulint		max_ins_size	= 0;
4441 
4442 	*offsets = NULL;
4443 	*big_rec = NULL;
4444 
4445 	block = btr_cur_get_block(cursor);
4446 	page = buf_block_get_frame(block);
4447 	page_zip = buf_block_get_page_zip(block);
4448 	index = cursor->index;
4449 
4450 	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
4451 					MTR_MEMO_X_LOCK |
4452 					MTR_MEMO_SX_LOCK)
4453 	      || dict_table_is_intrinsic(index->table));
4454 	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
4455 #ifdef UNIV_ZIP_DEBUG
4456 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4457 #endif /* UNIV_ZIP_DEBUG */
4458 	/* The insert buffer tree should never be updated in place. */
4459 	ut_ad(!dict_index_is_ibuf(index));
4460 	ut_ad(trx_id > 0
4461 	      || (flags & BTR_KEEP_SYS_FLAG)
4462 	      || dict_table_is_intrinsic(index->table));
4463 	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4464 	      || dict_index_is_clust(index));
4465 	ut_ad(thr_get_trx(thr)->id == trx_id
4466 	      || (flags & ~BTR_KEEP_POS_FLAG)
4467 	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4468 		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4469 
4470 	err = optim_err = btr_cur_optimistic_update(
4471 		flags | BTR_KEEP_IBUF_BITMAP,
4472 		cursor, offsets, offsets_heap, update,
4473 		cmpl_info, thr, trx_id, mtr);
4474 
4475 	switch (err) {
4476 	case DB_ZIP_OVERFLOW:
4477 	case DB_UNDERFLOW:
4478 	case DB_OVERFLOW:
4479 		break;
4480 	default:
4481 	err_exit:
4482 		/* We suppressed this with BTR_KEEP_IBUF_BITMAP.
4483 		For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
4484 		already reset by btr_cur_update_alloc_zip() if the
4485 		page was recompressed. */
4486 		if (page_zip
4487 		    && optim_err != DB_ZIP_OVERFLOW
4488 		    && !dict_index_is_clust(index)
4489 		    && !dict_table_is_temporary(index->table)
4490 		    && page_is_leaf(page)) {
4491 			ibuf_update_free_bits_zip(block, mtr);
4492 		}
4493 
4494 		if (big_rec_vec != NULL) {
4495 			dtuple_big_rec_free(big_rec_vec);
4496 		}
4497 
4498 		return(err);
4499 	}
4500 
4501 	rec = btr_cur_get_rec(cursor);
4502 
4503 	*offsets = rec_get_offsets(
4504 		rec, index, *offsets, ULINT_UNDEFINED, offsets_heap);
4505 
4506 	dtuple_t*	new_entry = row_rec_to_index_entry(
4507 		rec, index, *offsets, &n_ext, entry_heap);
4508 
4509 	/* The page containing the clustered index record
4510 	corresponding to new_entry is latched in mtr.  If the
4511 	clustered index record is delete-marked, then its externally
4512 	stored fields cannot have been purged yet, because then the
4513 	purge would also have removed the clustered index record
4514 	itself.  Thus the following call is safe. */
4515 	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4516 						     FALSE, entry_heap);
4517 
4518 	/* We have to set appropriate extern storage bits in the new
4519 	record to be inserted: we have to remember which fields were such */
4520 
4521 	ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
4522 	ut_ad(rec_offs_validate(rec, index, *offsets));
4523 
4524 	/* Get number of externally stored columns in updated record */
4525 	n_ext = new_entry->get_n_ext();
4526 
4527 	/* Undo logging is also turned off during normal operation on intrinsic
4528 	tables, so the condition must also ensure the table is not intrinsic. */
4529 	if ((flags & BTR_NO_UNDO_LOG_FLAG)
4530 	    && rec_offs_any_extern(*offsets)
4531 	    && !dict_table_is_intrinsic(index->table)) {
4532 		/* We are in a transaction rollback undoing a row
4533 		update: we must free possible externally stored fields
4534 		which got new values in the update, if they are not
4535 		inherited values. They can be inherited if we have
4536 		updated the primary key to another value, and then
4537 		update it back again. */
4538 
4539 		ut_ad(big_rec_vec == NULL);
4540 		ut_ad(dict_index_is_clust(index));
4541 		ut_ad(thr_get_trx(thr)->in_rollback);
4542 
4543 		DBUG_EXECUTE_IF("ib_blob_update_rollback", DBUG_SUICIDE(););
4544 		RECOVERY_CRASH(99);
4545 
4546 		btr_rec_free_updated_extern_fields(
4547 			index, rec, page_zip, *offsets, update, true, mtr);
4548 	}
4549 
4550 	if (page_zip_rec_needs_ext(
4551 			rec_get_converted_size(index, new_entry, n_ext),
4552 			page_is_comp(page),
4553 			dict_index_get_n_fields(index),
4554 			block->page.size)) {
4555 
4556 		big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext);
4557 		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
4558 
4559 			/* We cannot goto return_after_reservations,
4560 			because we may need to update the
4561 			IBUF_BITMAP_FREE bits, which was suppressed by
4562 			BTR_KEEP_IBUF_BITMAP. */
4563 #ifdef UNIV_ZIP_DEBUG
4564 			ut_a(!page_zip
4565 			     || page_zip_validate(page_zip, page, index));
4566 #endif /* UNIV_ZIP_DEBUG */
4567 			if (n_reserved > 0) {
4568 				fil_space_release_free_extents(
4569 					index->space, n_reserved);
4570 			}
4571 
4572 			err = DB_TOO_BIG_RECORD;
4573 			goto err_exit;
4574 		}
4575 
4576 		ut_ad(page_is_leaf(page));
4577 		ut_ad(dict_index_is_clust(index));
4578 		ut_ad(flags & BTR_KEEP_POS_FLAG);
4579 	}
4580 
4581 	/* Do lock checking and undo logging */
4582 	err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4583 					update, cmpl_info,
4584 					thr, mtr, &roll_ptr);
4585 	if (err != DB_SUCCESS) {
4586 		goto err_exit;
4587 	}
4588 
4589 	if (optim_err == DB_OVERFLOW) {
4590 
4591 		/* First reserve enough free space for the file segments
4592 		of the index tree, so that the update will not fail because
4593 		of lack of space */
4594 
4595 		ulint	n_extents = cursor->tree_height / 16 + 3;
4596 
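		/* When rolling back (BTR_NO_UNDO_LOG_FLAG), the reservation
		is made as FSP_CLEANING rather than FSP_NORMAL, so that the
		cleanup operation can draw on space that is kept back from
		ordinary inserts. */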
4597 		if (!fsp_reserve_free_extents(
4598 		            &n_reserved, index->space, n_extents,
4599 		            flags & BTR_NO_UNDO_LOG_FLAG
4600 		            ? FSP_CLEANING : FSP_NORMAL,
4601 		            mtr)) {
4602 			err = DB_OUT_OF_FILE_SPACE;
4603 			goto err_exit;
4604 		}
4605 	}
4606 
4607 	if (!(flags & BTR_KEEP_SYS_FLAG)
4608 	    && !dict_table_is_intrinsic(index->table)) {
4609 		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
4610 					      roll_ptr);
4611 		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
4612 					      trx_id);
4613 	}
4614 
4615 	if (!page_zip) {
4616 		max_ins_size = page_get_max_insert_size_after_reorganize(
4617 				page, 1);
4618 	}
4619 
4620 	/* Store state of explicit locks on rec on the page infimum record,
4621 	before deleting rec. The page infimum acts as a dummy carrier of the
4622 	locks, taking care also of lock releases, before we can move the locks
4623 	back on the actual record. There is a special case: if we are
4624 	inserting on the root page and the insert causes a call of
4625 	btr_root_raise_and_insert. Therefore we cannot in the lock system
4626 	delete the lock structs set on the root page even if the root
4627 	page carries just node pointers. */
4628 	if (!dict_table_is_locking_disabled(index->table)) {
4629 		lock_rec_store_on_page_infimum(block, rec);
4630 	}
4631 
4632 	btr_search_update_hash_on_delete(cursor);
4633 
4634 #ifdef UNIV_ZIP_DEBUG
4635 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4636 #endif /* UNIV_ZIP_DEBUG */
4637 	page_cursor = btr_cur_get_page_cur(cursor);
4638 
4639 	page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4640 
4641 	page_cur_move_to_prev(page_cursor);
4642 
4643 	rec = btr_cur_insert_if_possible(cursor, new_entry,
4644 					 offsets, offsets_heap, n_ext, mtr);
4645 
4646 	if (rec) {
4647 		page_cursor->rec = rec;
4648 
4649 		if (!dict_table_is_locking_disabled(index->table)) {
4650 			lock_rec_restore_from_page_infimum(
4651 				btr_cur_get_block(cursor), rec, block);
4652 		}
4653 
4654 		if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
4655 			/* The new inserted record owns its possible externally
4656 			stored fields */
4657 			btr_cur_unmark_extern_fields(
4658 				page_zip, rec, index, *offsets, mtr);
4659 		}
4660 
4661 		bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
4662 
4663 		if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
4664 			if (adjust) {
4665 				rec_offs_make_valid(
4666 					page_cursor->rec, index, *offsets);
4667 			}
4668 		} else if (!dict_index_is_clust(index)
4669 			   && !dict_table_is_temporary(index->table)
4670 			   && page_is_leaf(page)) {
4671 			/* Update the free bits in the insert buffer.
4672 			This is the same block which was skipped by
4673 			BTR_KEEP_IBUF_BITMAP. */
4674 			if (page_zip) {
4675 				ibuf_update_free_bits_zip(block, mtr);
4676 			} else {
4677 				ibuf_update_free_bits_low(block, max_ins_size,
4678 							  mtr);
4679 			}
4680 		}
4681 
4682 		if (!srv_read_only_mode
4683 		    && !big_rec_vec
4684 		    && page_is_leaf(page)
4685 		    && !dict_index_is_online_ddl(index)) {
4686 
4687 			mtr_memo_release(mtr, dict_index_get_lock(index),
4688 					 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
4689 
4690 			/* NOTE: We cannot release the root block latch here, because
4691 			it contains the segment header and has already been modified in most cases. */
4692 		}
4693 
4694 		err = DB_SUCCESS;
4695 		goto return_after_reservations;
4696 	} else {
4697 		/* If the page is compressed and it initially
4698 		compresses very well, and there is a subsequent insert
4699 		of a badly-compressing record, it is possible for
4700 		btr_cur_optimistic_update() to return DB_UNDERFLOW and
4701 		btr_cur_insert_if_possible() to return FALSE. */
4702 		ut_a(page_zip || optim_err != DB_UNDERFLOW);
4703 
4704 		/* Out of space: reset the free bits.
4705 		This is the same block which was skipped by
4706 		BTR_KEEP_IBUF_BITMAP. */
4707 		if (!dict_index_is_clust(index)
4708 		    && !dict_table_is_temporary(index->table)
4709 		    && page_is_leaf(page)) {
4710 			ibuf_reset_free_bits(block);
4711 		}
4712 	}
4713 
4714 	if (big_rec_vec != NULL && !dict_table_is_intrinsic(index->table)) {
4715 		ut_ad(page_is_leaf(page));
4716 		ut_ad(dict_index_is_clust(index));
4717 		ut_ad(flags & BTR_KEEP_POS_FLAG);
4718 
4719 		/* btr_page_split_and_insert() in
4720 		btr_cur_pessimistic_insert() invokes
4721 		mtr_memo_release(mtr, index->lock, MTR_MEMO_SX_LOCK).
4722 		We must keep the index->lock when we created a
4723 		big_rec, so that row_upd_clust_rec() can store the
4724 		big_rec in the same mini-transaction. */
4725 
4726 		ut_ad(mtr_memo_contains_flagged(mtr,
4727 						dict_index_get_lock(index),
4728 						MTR_MEMO_X_LOCK |
4729 						MTR_MEMO_SX_LOCK));
4730 
4731 		mtr_sx_lock(dict_index_get_lock(index), mtr);
4732 	}
4733 
4734 	/* Was the record to be updated positioned as the first user
4735 	record on its page? */
4736 	was_first = page_cur_is_before_first(page_cursor);
4737 
4738 	/* Lock checks and undo logging were already performed by
4739 	btr_cur_upd_lock_and_undo(). We do not try
4740 	btr_cur_optimistic_insert() because
4741 	btr_cur_insert_if_possible() already failed above. */
4742 
4743 	err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
4744 					 | BTR_NO_LOCKING_FLAG
4745 					 | BTR_KEEP_SYS_FLAG,
4746 					 cursor, offsets, offsets_heap,
4747 					 new_entry, &rec,
4748 					 &dummy_big_rec, n_ext, NULL, mtr);
4749 	ut_a(rec);
4750 	ut_a(err == DB_SUCCESS);
4751 	ut_a(dummy_big_rec == NULL);
4752 	ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
4753 	page_cursor->rec = rec;
4754 
4755 	/* Multiple transactions cannot operate on the same temp-table
4756 	in parallel.
4757 	max_trx_id is ignored for temp tables because it is not required
4758 	for MVCC. */
4759 	if (dict_index_is_sec_or_ibuf(index)
4760 	    && !dict_table_is_temporary(index->table)) {
4761 		/* Update PAGE_MAX_TRX_ID in the index page header.
4762 		It was not updated by btr_cur_pessimistic_insert()
4763 		because of BTR_NO_LOCKING_FLAG. */
4764 		buf_block_t*	rec_block;
4765 
4766 		rec_block = btr_cur_get_block(cursor);
4767 
4768 		page_update_max_trx_id(rec_block,
4769 				       buf_block_get_page_zip(rec_block),
4770 				       trx_id, mtr);
4771 	}
4772 
4773 	if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
4774 		/* The new inserted record owns its possible externally
4775 		stored fields */
4776 		buf_block_t*	rec_block = btr_cur_get_block(cursor);
4777 
4778 #ifdef UNIV_ZIP_DEBUG
4779 		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4780 		page = buf_block_get_frame(rec_block);
4781 #endif /* UNIV_ZIP_DEBUG */
4782 		page_zip = buf_block_get_page_zip(rec_block);
4783 
4784 		btr_cur_unmark_extern_fields(page_zip,
4785 					     rec, index, *offsets, mtr);
4786 	}
4787 
4788 	if (!dict_table_is_locking_disabled(index->table)) {
4789 		lock_rec_restore_from_page_infimum(
4790 			btr_cur_get_block(cursor), rec, block);
4791 	}
4792 
4793 	/* If necessary, restore also the correct lock state for a new,
4794 	preceding supremum record created in a page split. While the old
4795 	record was nonexistent, the supremum might have inherited its locks
4796 	from a wrong record. */
4797 
4798 	if (!was_first && !dict_table_is_locking_disabled(index->table)) {
4799 		btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
4800 						  rec, mtr);
4801 	}
4802 
4803 return_after_reservations:
4804 #ifdef UNIV_ZIP_DEBUG
4805 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4806 #endif /* UNIV_ZIP_DEBUG */
4807 
4808 	if (n_reserved > 0) {
4809 		fil_space_release_free_extents(index->space, n_reserved);
4810 	}
4811 
4812 	*big_rec = big_rec_vec;
4813 
4814 	return(err);
4815 }
4816 
4817 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
4818 
4819 /****************************************************************//**
4820 Writes the redo log record for delete marking or unmarking of an index
4821 record. */
4822 UNIV_INLINE
4823 void
4824 btr_cur_del_mark_set_clust_rec_log(
4825 /*===============================*/
4826 	rec_t*		rec,	/*!< in: record */
4827 	dict_index_t*	index,	/*!< in: index of the record */
4828 	trx_id_t	trx_id,	/*!< in: transaction id */
4829 	roll_ptr_t	roll_ptr,/*!< in: roll ptr to the undo log record */
4830 	mtr_t*		mtr)	/*!< in: mtr */
4831 {
4832 	byte*	log_ptr;
4833 
4834 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4835 	ut_ad(mtr->is_named_space(index->space));
4836 
4837 	log_ptr = mlog_open_and_write_index(mtr, rec, index,
4838 					    page_rec_is_comp(rec)
4839 					    ? MLOG_COMP_REC_CLUST_DELETE_MARK
4840 					    : MLOG_REC_CLUST_DELETE_MARK,
4841 					    1 + 1 + DATA_ROLL_PTR_LEN
4842 					    + 14 + 2);
4843 
4844 	if (!log_ptr) {
4845 		/* Logging in mtr is switched off during crash recovery */
4846 		return;
4847 	}
4848 
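	/* Body of the delete-mark log record: one byte of flags (always 0
	here), one byte for the delete-mark value (always 1), the system
	column values, and the two-byte page offset of the record. */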
4849 	*log_ptr++ = 0;
4850 	*log_ptr++ = 1;
4851 
4852 	log_ptr = row_upd_write_sys_vals_to_log(
4853 		index, trx_id, roll_ptr, log_ptr, mtr);
4854 	mach_write_to_2(log_ptr, page_offset(rec));
4855 	log_ptr += 2;
4856 
4857 	mlog_close(mtr, log_ptr);
4858 }
4859 #endif /* !UNIV_HOTBACKUP */
4860 
4861 /****************************************************************//**
4862 Parses the redo log record for delete marking or unmarking of a clustered
4863 index record.
4864 @return end of log record or NULL */
4865 byte*
4866 btr_cur_parse_del_mark_set_clust_rec(
4867 /*=================================*/
4868 	byte*		ptr,	/*!< in: buffer */
4869 	byte*		end_ptr,/*!< in: buffer end */
4870 	page_t*		page,	/*!< in/out: page or NULL */
4871 	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
4872 	dict_index_t*	index)	/*!< in: index corresponding to page */
4873 {
4874 	ulint		flags;
4875 	ulint		val;
4876 	ulint		pos;
4877 	trx_id_t	trx_id;
4878 	roll_ptr_t	roll_ptr;
4879 	ulint		offset;
4880 	rec_t*		rec;
4881 
4882 	ut_ad(!page
4883 	      || !!page_is_comp(page) == dict_table_is_comp(index->table));
4884 
4885 	if (end_ptr < ptr + 2) {
4886 
4887 		return(NULL);
4888 	}
4889 
4890 	flags = mach_read_from_1(ptr);
4891 	ptr++;
4892 	val = mach_read_from_1(ptr);
4893 	ptr++;
4894 
4895 	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
4896 
4897 	if (ptr == NULL) {
4898 
4899 		return(NULL);
4900 	}
4901 
4902 	if (end_ptr < ptr + 2) {
4903 
4904 		return(NULL);
4905 	}
4906 
4907 	offset = mach_read_from_2(ptr);
4908 	ptr += 2;
4909 
4910 	ut_a(offset <= UNIV_PAGE_SIZE);
4911 
4912 	if (page) {
4913 		rec = page + offset;
4914 
4915 		/* We do not need to reserve search latch, as the page
4916 		is only being recovered, and there cannot be a hash index to
4917 		it. Besides, these fields are being updated in place
4918 		and the adaptive hash index does not depend on them. */
4919 
4920 		btr_rec_set_deleted_flag(rec, page_zip, val);
4921 
4922 		if (!(flags & BTR_KEEP_SYS_FLAG)) {
4923 			mem_heap_t*	heap		= NULL;
4924 			ulint		offsets_[REC_OFFS_NORMAL_SIZE];
4925 			rec_offs_init(offsets_);
4926 
4927 			row_upd_rec_sys_fields_in_recovery(
4928 				rec, page_zip,
4929 				rec_get_offsets(rec, index, offsets_,
4930 						ULINT_UNDEFINED, &heap),
4931 				pos, trx_id, roll_ptr);
4932 			if (UNIV_LIKELY_NULL(heap)) {
4933 				mem_heap_free(heap);
4934 			}
4935 		}
4936 	}
4937 
4938 	return(ptr);
4939 }
4940 
4941 #ifndef UNIV_HOTBACKUP
4942 /***********************************************************//**
4943 Marks a clustered index record deleted. Writes an undo log record to
4944 undo log on this delete marking. Writes in the trx id field the id
4945 of the deleting transaction, and in the roll ptr field pointer to the
4946 undo log record created.
4947 @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
4948 dberr_t
4949 btr_cur_del_mark_set_clust_rec(
4950 /*===========================*/
4951 	ulint		flags,  /*!< in: undo logging and locking flags */
4952 	buf_block_t*	block,	/*!< in/out: buffer block of the record */
4953 	rec_t*		rec,	/*!< in/out: record */
4954 	dict_index_t*	index,	/*!< in: clustered index of the record */
4955 	const ulint*	offsets,/*!< in: rec_get_offsets(rec) */
4956 	que_thr_t*	thr,	/*!< in: query thread */
4957 	const dtuple_t*	entry,	/*!< in: dtuple for the deleting record, also
4958 				contains the virtual cols if there are any */
4959 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
4960 {
4961 	roll_ptr_t	roll_ptr;
4962 	dberr_t		err;
4963 	page_zip_des_t*	page_zip;
4964 	trx_t*		trx;
4965 
4966 	ut_ad(dict_index_is_clust(index));
4967 	ut_ad(rec_offs_validate(rec, index, offsets));
4968 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4969 	ut_ad(buf_block_get_frame(block) == page_align(rec));
4970 	ut_ad(page_is_leaf(page_align(rec)));
4971 	ut_ad(mtr->is_named_space(index->space));
4972 
4973 	if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
4974 		/* This can happen during cascading delete operations. */
4975 		ut_ad(rec_get_trx_id(rec, index) == thr_get_trx(thr)->id);
4976 		return(DB_SUCCESS);
4977 	}
4978 
4979 	err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
4980 						   rec, index, offsets, thr);
4981 
4982 	if (err != DB_SUCCESS) {
4983 
4984 		return(err);
4985 	}
4986 
4987 	err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
4988 					    index, entry, NULL, 0, rec, offsets,
4989 					    &roll_ptr);
4990 	if (err != DB_SUCCESS) {
4991 
4992 		return(err);
4993 	}
4994 
4995 	/* The search latch is not needed here, because
4996 	the adaptive hash index does not depend on the delete-mark
4997 	and the delete-mark is being updated in place. */
4998 
4999 	page_zip = buf_block_get_page_zip(block);
5000 
5001 	btr_rec_set_deleted_flag(rec, page_zip, TRUE);
5002 
5003 	/* For intrinsic table, roll-ptr is not maintained as there is no UNDO
5004 	logging. Skip updating it. */
5005 	if (dict_table_is_intrinsic(index->table)) {
5006 		return(err);
5007 	}
5008 
5009 	trx = thr_get_trx(thr);
5010 	/* This function must not be invoked during rollback
5011 	(of a TRX_STATE_PREPARE transaction or otherwise). */
5012 	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
5013 	ut_ad(!trx->in_rollback);
5014 
5015 	DBUG_PRINT("ib_cur", ("delete-mark clust %s (" IB_ID_FMT
5016 			      ") by " TRX_ID_FMT ": %s",
5017 			      index->table_name, index->id,
5018 			      trx_get_id_for_print(trx),
5019 			      rec_printer(rec, offsets).str().c_str()));
5020 
5021 	if (dict_index_is_online_ddl(index)) {
5022 		row_log_table_delete(rec, entry, index, offsets, NULL);
5023 	}
5024 
5025 	row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr);
5026 
5027 	btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id,
5028 					   roll_ptr, mtr);
5029 
5030 	return(err);
5031 }
5032 
5033 /****************************************************************//**
5034 Writes the redo log record for a delete mark setting of a secondary
5035 index record. */
5036 UNIV_INLINE
5037 void
5038 btr_cur_del_mark_set_sec_rec_log(
5039 /*=============================*/
5040 	rec_t*		rec,	/*!< in: record */
5041 	ibool		val,	/*!< in: value to set */
5042 	mtr_t*		mtr)	/*!< in: mtr */
5043 {
5044 	byte*	log_ptr;
5045 	ut_ad(val <= 1);
5046 
5047 	log_ptr = mlog_open(mtr, 11 + 1 + 2);
5048 
5049 	if (!log_ptr) {
5050 		/* Logging in mtr is switched off during crash recovery:
5051 		in that case mlog_open returns NULL */
5052 		return;
5053 	}
5054 
5055 	log_ptr = mlog_write_initial_log_record_fast(
5056 		rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
5057 	mach_write_to_1(log_ptr, val);
5058 	log_ptr++;
5059 
5060 	mach_write_to_2(log_ptr, page_offset(rec));
5061 	log_ptr += 2;
5062 
5063 	mlog_close(mtr, log_ptr);
5064 }
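
/* Roughly, the MLOG_REC_SEC_DELETE_MARK record written above is:
   [initial log record: type, space id, page number -- at most 11 bytes]
   [1 byte: delete-mark value]
   [2 bytes: offset of the record within the page]
which is what btr_cur_parse_del_mark_set_sec_rec() reads back below. */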
5065 #endif /* !UNIV_HOTBACKUP */
5066 
5067 /****************************************************************//**
5068 Parses the redo log record for delete marking or unmarking of a secondary
5069 index record.
5070 @return end of log record or NULL */
5071 byte*
5072 btr_cur_parse_del_mark_set_sec_rec(
5073 /*===============================*/
5074 	byte*		ptr,	/*!< in: buffer */
5075 	byte*		end_ptr,/*!< in: buffer end */
5076 	page_t*		page,	/*!< in/out: page or NULL */
5077 	page_zip_des_t*	page_zip)/*!< in/out: compressed page, or NULL */
5078 {
5079 	ulint	val;
5080 	ulint	offset;
5081 	rec_t*	rec;
5082 
5083 	if (end_ptr < ptr + 3) {
5084 
5085 		return(NULL);
5086 	}
5087 
5088 	val = mach_read_from_1(ptr);
5089 	ptr++;
5090 
5091 	offset = mach_read_from_2(ptr);
5092 	ptr += 2;
5093 
5094 	ut_a(offset <= UNIV_PAGE_SIZE);
5095 
5096 	if (page) {
5097 		rec = page + offset;
5098 
5099 		/* We do not need to reserve search latch, as the page
5100 		is only being recovered, and there cannot be a hash index to
5101 		it. Besides, the delete-mark flag is being updated in place
5102 		and the adaptive hash index does not depend on it. */
5103 
5104 		btr_rec_set_deleted_flag(rec, page_zip, val);
5105 	}
5106 
5107 	return(ptr);
5108 }
5109 
5110 #ifndef UNIV_HOTBACKUP
5111 /***********************************************************//**
5112 Sets a secondary index record delete mark to TRUE or FALSE.
5113 @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
5114 dberr_t
5115 btr_cur_del_mark_set_sec_rec(
5116 /*=========================*/
5117 	ulint		flags,	/*!< in: locking flag */
5118 	btr_cur_t*	cursor,	/*!< in: cursor */
5119 	ibool		val,	/*!< in: value to set */
5120 	que_thr_t*	thr,	/*!< in: query thread */
5121 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
5122 {
5123 	buf_block_t*	block;
5124 	rec_t*		rec;
5125 	dberr_t		err;
5126 
5127 	block = btr_cur_get_block(cursor);
5128 	rec = btr_cur_get_rec(cursor);
5129 
5130 	err = lock_sec_rec_modify_check_and_lock(flags,
5131 						 btr_cur_get_block(cursor),
5132 						 rec, cursor->index, thr, mtr);
5133 	if (err != DB_SUCCESS) {
5134 
5135 		return(err);
5136 	}
5137 
5138 	ut_ad(!!page_rec_is_comp(rec)
5139 	      == dict_table_is_comp(cursor->index->table));
5140 
5141 	DBUG_PRINT("ib_cur", ("delete-mark=%u sec %u:%u:%u in %s("
5142 			      IB_ID_FMT ") by " TRX_ID_FMT,
5143 			      unsigned(val),
5144 			      block->page.id.space(), block->page.id.page_no(),
5145 			      unsigned(page_rec_get_heap_no(rec)),
5146 			      cursor->index->name(), cursor->index->id,
5147 			      trx_get_id_for_print(thr_get_trx(thr))));
5148 
5149 	/* We do not need to reserve search latch, as the
5150 	delete-mark flag is being updated in place and the adaptive
5151 	hash index does not depend on it. */
5152 	btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);
5153 
5154 	btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
5155 
5156 	return(DB_SUCCESS);
5157 }
5158 
5159 /***********************************************************//**
5160 Sets a secondary index record's delete mark to the given value. This
5161 function is only used by the insert buffer merge mechanism. */
5162 void
5163 btr_cur_set_deleted_flag_for_ibuf(
5164 /*==============================*/
5165 	rec_t*		rec,		/*!< in/out: record */
5166 	page_zip_des_t*	page_zip,	/*!< in/out: compressed page
5167 					corresponding to rec, or NULL
5168 					when the tablespace is
5169 					uncompressed */
5170 	ibool		val,		/*!< in: value to set */
5171 	mtr_t*		mtr)		/*!< in/out: mini-transaction */
5172 {
5173 	/* We do not need to reserve search latch, as the page
5174 	has just been read to the buffer pool and there cannot be
5175 	a hash index to it.  Besides, the delete-mark flag is being
5176 	updated in place and the adaptive hash index does not depend
5177 	on it. */
5178 
5179 	btr_rec_set_deleted_flag(rec, page_zip, val);
5180 
5181 	btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
5182 }
5183 
5184 /*==================== B-TREE RECORD REMOVE =========================*/
5185 
5186 /*************************************************************//**
5187 Tries to compress a page of the tree if it seems useful. It is assumed
5188 that mtr holds an x-latch on the tree and on the cursor page. To avoid
5189 deadlocks, mtr must also own x-latches to brothers of page, if those
5190 brothers exist. NOTE: it is assumed that the caller has reserved enough
5191 free extents so that the compression will always succeed if done!
5192 @return TRUE if compression occurred */
5193 ibool
5194 btr_cur_compress_if_useful(
5195 /*=======================*/
5196 	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to compress;
5197 				cursor does not stay valid if !adjust and
5198 				compression occurs */
5199 	ibool		adjust,	/*!< in: TRUE if should adjust the
5200 				cursor position even if compression occurs */
5201 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
5202 {
5203 	/* Avoid applying compression: given the workload of intrinsic
5204 	tables, they do not accumulate much page garbage to reclaim. */
5205 	if (dict_table_is_intrinsic(cursor->index->table)) {
5206 		return(FALSE);
5207 	}
5208 
5209 	ut_ad(mtr_memo_contains_flagged(
5210 		mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
5211 		MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
5212 	      || dict_table_is_intrinsic(cursor->index->table));
5213 	ut_ad(mtr_is_block_fix(
5214 		mtr, btr_cur_get_block(cursor),
5215 		MTR_MEMO_PAGE_X_FIX, cursor->index->table));
5216 
5217 	if (dict_index_is_spatial(cursor->index)) {
5218 		const page_t*   page = btr_cur_get_page(cursor);
5219 		const trx_t*	trx = NULL;
5220 
5221 		if (cursor->rtr_info->thr != NULL) {
5222 			trx = thr_get_trx(cursor->rtr_info->thr);
5223 		}
5224 
5225 		/* Check whether page lock prevents the compression */
5226 		if (!lock_test_prdt_page_lock(trx, page_get_space_id(page),
5227 					      page_get_page_no(page))) {
5228 			return(false);
5229 		}
5230 	}
5231 
5232 	return(btr_cur_compress_recommendation(cursor, mtr)
5233 	       && btr_compress(cursor, adjust, mtr));
5234 }
5235 
5236 /*******************************************************//**
5237 Removes the record on which the tree cursor is positioned on a leaf page.
5238 It is assumed that the mtr has an x-latch on the page where the cursor is
5239 positioned, but no latch on the whole tree.
5240 @return TRUE if success, i.e., the page did not become too empty */
5241 ibool
5242 btr_cur_optimistic_delete_func(
5243 /*===========================*/
5244 	btr_cur_t*	cursor,	/*!< in: cursor on leaf page, on the record to
5245 				delete; cursor stays valid: if deletion
5246 				succeeds, on function exit it points to the
5247 				successor of the deleted record */
5248 #ifdef UNIV_DEBUG
5249 	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
5250 #endif /* UNIV_DEBUG */
5251 	mtr_t*		mtr)	/*!< in: mtr; if this function returns
5252 				TRUE on a leaf page of a secondary
5253 				index, the mtr must be committed
5254 				before latching any further pages */
5255 {
5256 	buf_block_t*	block;
5257 	rec_t*		rec;
5258 	mem_heap_t*	heap		= NULL;
5259 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
5260 	ulint*		offsets		= offsets_;
5261 	ibool		no_compress_needed;
5262 	rec_offs_init(offsets_);
5263 
5264 	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
5265 	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
5266 				MTR_MEMO_PAGE_X_FIX));
5267 	ut_ad(mtr_is_block_fix(mtr, btr_cur_get_block(cursor),
5268 			       MTR_MEMO_PAGE_X_FIX, cursor->index->table));
5269 	ut_ad(mtr->is_named_space(cursor->index->space));
5270 
5271 	/* This is intended only for leaf page deletions */
5272 
5273 	block = btr_cur_get_block(cursor);
5274 
5275 	SRV_CORRUPT_TABLE_CHECK(block, return(DB_CORRUPTION););
5276 
5277 	ut_ad(page_is_leaf(buf_block_get_frame(block)));
5278 	ut_ad(!dict_index_is_online_ddl(cursor->index)
5279 	      || dict_index_is_clust(cursor->index)
5280 	      || (flags & BTR_CREATE_FLAG));
5281 
5282 	rec = btr_cur_get_rec(cursor);
5283 	offsets = rec_get_offsets(rec, cursor->index, offsets,
5284 				  ULINT_UNDEFINED, &heap);
5285 
5286 	no_compress_needed = !rec_offs_any_extern(offsets)
5287 		&& btr_cur_can_delete_without_compress(
5288 			cursor, rec_offs_size(offsets), mtr);
5289 
5290 	if (no_compress_needed) {
5291 
5292 		page_t*		page	= buf_block_get_frame(block);
5293 		page_zip_des_t*	page_zip= buf_block_get_page_zip(block);
5294 
5295 		lock_update_delete(block, rec);
5296 
5297 		btr_search_update_hash_on_delete(cursor);
5298 
5299 		if (page_zip) {
5300 #ifdef UNIV_ZIP_DEBUG
5301 			ut_a(page_zip_validate(page_zip, page, cursor->index));
5302 #endif /* UNIV_ZIP_DEBUG */
5303 			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5304 					    cursor->index, offsets, mtr);
5305 #ifdef UNIV_ZIP_DEBUG
5306 			ut_a(page_zip_validate(page_zip, page, cursor->index));
5307 #endif /* UNIV_ZIP_DEBUG */
5308 
5309 			/* On compressed pages, the IBUF_BITMAP_FREE
5310 			space is not affected by deleting (purging)
5311 			records, because it is defined as the minimum
5312 			of space available *without* reorganize, and
5313 			space available in the modification log. */
5314 		} else {
5315 			const ulint	max_ins
5316 				= page_get_max_insert_size_after_reorganize(
5317 					page, 1);
5318 
5319 			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5320 					    cursor->index, offsets, mtr);
5321 
5322 			/* The change buffer does not handle inserts
5323 			into non-leaf pages, into clustered indexes,
5324 			or into the change buffer. */
5325 			if (!dict_index_is_clust(cursor->index)
5326 			    && !dict_table_is_temporary(cursor->index->table)
5327 			    && !dict_index_is_ibuf(cursor->index)) {
5328 				ibuf_update_free_bits_low(block, max_ins, mtr);
5329 			}
5330 		}
5331 	} else {
5332 		/* prefetch siblings of the leaf for the pessimistic
5333 		operation. */
5334 		btr_cur_prefetch_siblings(block);
5335 	}
5336 
5337 	if (UNIV_LIKELY_NULL(heap)) {
5338 		mem_heap_free(heap);
5339 	}
5340 
5341 	return(no_compress_needed);
5342 }
5343 
5344 /*************************************************************//**
5345 Removes the record on which the tree cursor is positioned. Tries
5346 to compress the page if its fillfactor drops below a threshold
5347 or if it is the only page on the level. It is assumed that mtr holds
5348 an x-latch on the tree and on the cursor page. To avoid deadlocks,
5349 mtr must also own x-latches to brothers of page, if those brothers
5350 exist.
5351 @return TRUE if compression occurred, FALSE if it did not or if
5352 something went wrong. */
5353 ibool
5354 btr_cur_pessimistic_delete(
5355 /*=======================*/
5356 	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
5357 				the latter may occur because we may have
5358 				to update node pointers on upper levels,
5359 				and in the case of variable length keys
5360 				these may actually grow in size */
5361 	ibool		has_reserved_extents, /*!< in: TRUE if the
5362 				caller has already reserved enough free
5363 				extents so that he knows that the operation
5364 				will succeed */
5365 	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
5366 				if compression does not occur, the cursor
5367 				stays valid: it points to successor of
5368 				deleted record on function exit */
5369 	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
5370 	bool		rollback,/*!< in: performing rollback? */
5371 	mtr_t*		mtr)	/*!< in: mtr */
5372 {
5373 	buf_block_t*	block;
5374 	page_t*		page;
5375 	page_zip_des_t*	page_zip;
5376 	dict_index_t*	index;
5377 	rec_t*		rec;
5378 	ulint		n_reserved	= 0;
5379 	bool		success;
5380 	ibool		ret		= FALSE;
5381 	ulint		level;
5382 	mem_heap_t*	heap;
5383 	ulint*		offsets;
5384 	bool		allow_merge = true; /* if true, we have taken the page
5385 			latches needed to merge this page. */
5386 #ifdef UNIV_DEBUG
5387 	bool		parent_latched	= false;
5388 #endif /* UNIV_DEBUG */
5389 
5390 	block = btr_cur_get_block(cursor);
5391 	page = buf_block_get_frame(block);
5392 	index = btr_cur_get_index(cursor);
5393 
5394 	ulint rec_size_est = dict_index_node_ptr_max_size(index);
5395 	const page_size_t       page_size(dict_table_page_size(index->table));
5396 
5397 	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
5398 	ut_ad(!dict_index_is_online_ddl(index)
5399 	      || dict_index_is_clust(index)
5400 	      || (flags & BTR_CREATE_FLAG));
5401 	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
5402 					MTR_MEMO_X_LOCK
5403 					| MTR_MEMO_SX_LOCK)
5404 	      || dict_table_is_intrinsic(index->table));
5405 	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
5406 	ut_ad(mtr->is_named_space(index->space));
5407 
5408 	if (!has_reserved_extents) {
5409 		/* First reserve enough free space for the file segments
5410 		of the index tree, so that the node pointer updates will
5411 		not fail because of lack of space */
5412 
5413 		ut_a(cursor->tree_height != ULINT_UNDEFINED);
5414 
5415 		ulint	n_extents = cursor->tree_height / 32 + 1;
5416 
5417 		success = fsp_reserve_free_extents(&n_reserved,
5418 						   index->space,
5419 						   n_extents,
5420 						   FSP_CLEANING, mtr);
5421 		if (!success) {
5422 			*err = DB_OUT_OF_FILE_SPACE;
5423 
5424 			return(FALSE);
5425 		}
5426 	}
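
	/* For illustration: a tree of height 3 reserves 3 / 32 + 1 == 1
	extent above; only a tree 32 or more levels high would reserve a
	second one. */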
5427 
5428 	heap = mem_heap_create(1024);
5429 	rec = btr_cur_get_rec(cursor);
5430 	page_zip = buf_block_get_page_zip(block);
5431 #ifdef UNIV_ZIP_DEBUG
5432 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5433 #endif /* UNIV_ZIP_DEBUG */
5434 
5435 	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
5436 
5437 	if (rec_offs_any_extern(offsets)) {
5438 		btr_rec_free_externally_stored_fields(index,
5439 						      rec, offsets, page_zip,
5440 						      rollback, mtr);
5441 #ifdef UNIV_ZIP_DEBUG
5442 		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5443 #endif /* UNIV_ZIP_DEBUG */
5444 	}
5445 
5446 	if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
5447 	    && UNIV_UNLIKELY(dict_index_get_page(index)
5448 			     != block->page.id.page_no())) {
5449 
5450 		/* If there is only one record, drop the whole page in
5451 		btr_discard_page, if this is not the root page */
5452 
5453 		btr_discard_page(cursor, mtr);
5454 
5455 		ret = TRUE;
5456 
5457 		goto return_after_reservations;
5458 	}
5459 
5460 	if (flags == 0) {
5461 		lock_update_delete(block, rec);
5462 	}
5463 
5464 	level = btr_page_get_level(page, mtr);
5465 
5466 	if (level > 0
5467 	    && UNIV_UNLIKELY(rec == page_rec_get_next(
5468 				     page_get_infimum_rec(page)))) {
5469 
5470 		rec_t*	next_rec = page_rec_get_next(rec);
5471 
5472 		if (btr_page_get_prev(page, mtr) == FIL_NULL) {
5473 
5474 			/* If we delete the leftmost node pointer on a
5475 			non-leaf level, we must mark the new leftmost node
5476 			pointer as the predefined minimum record */
5477 
5478 			/* This will make page_zip_validate() fail until
5479 			page_cur_delete_rec() completes.  This is harmless,
5480 			because everything will take place within a single
5481 			mini-transaction and because writing to the redo log
5482 			is an atomic operation (performed by mtr_commit()). */
5483 			btr_set_min_rec_mark(next_rec, mtr);
5484 		} else if (dict_index_is_spatial(index)) {
5485 			/* For rtree, if delete the leftmost node pointer,
5486 			we need to update parent page. */
5487 			rtr_mbr_t	father_mbr;
5488 			rec_t*		father_rec;
5489 			btr_cur_t	father_cursor;
5490 			ulint*		offsets;
5491 			bool		upd_ret;
5492 			ulint		len;
5493 
5494 			rtr_page_get_father_block(NULL, heap, index,
5495 						  block, mtr, NULL,
5496 						  &father_cursor);
5497 			offsets = rec_get_offsets(
5498 				btr_cur_get_rec(&father_cursor), index,
5499 				NULL, ULINT_UNDEFINED, &heap);
5500 
5501 			father_rec = btr_cur_get_rec(&father_cursor);
5502 			rtr_read_mbr(rec_get_nth_field(
5503 				father_rec, offsets, 0, &len), &father_mbr);
5504 
5505 			upd_ret = rtr_update_mbr_field(&father_cursor, offsets,
5506 						       NULL, page, &father_mbr,
5507 						       next_rec, mtr);
5508 
5509 			if (!upd_ret) {
5510 				*err = DB_ERROR;
5511 
5512 				mem_heap_free(heap);
5513 				return(FALSE);
5514 			}
5515 
5516 			ut_d(parent_latched = true);
5517 		} else {
5518 			/* Otherwise, if we delete the leftmost node pointer
5519 			on a page, we have to change the parent node pointer
5520 			so that it is equal to the new leftmost node pointer
5521 			on the page */
5522 
5523 			btr_node_ptr_delete(index, block, mtr);
5524 
5525 			dtuple_t*	node_ptr = dict_index_build_node_ptr(
5526 				index, next_rec, block->page.id.page_no(),
5527 				heap, level);
5528 
5529 			btr_insert_on_non_leaf_level(
5530 				flags, index, level + 1, node_ptr, mtr);
5531 
5532 			ut_d(parent_latched = true);
5533 		}
5534 	}
5535 
5536 	btr_search_update_hash_on_delete(cursor);
5537 
5538 	if (page_is_leaf(page) || dict_index_is_spatial(index)) {
5539 		/* For spatial indexes the tree is X-locked during a delete
5540 		operation, which avoids any possibility of upward locking,
5541 		so merging is always allowed. */
5542 		allow_merge = true;
5543 	} else {
5544 		allow_merge = btr_cur_will_modify_tree(index, page,
5545 			BTR_INTENTION_DELETE, rec, rec_size_est, page_size, mtr);
5546 	}
5547 	page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
5548 #ifdef UNIV_ZIP_DEBUG
5549 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5550 #endif /* UNIV_ZIP_DEBUG */
5551 
5552 	/* btr_check_node_ptr() needs parent block latched */
5553 	ut_ad(!parent_latched || btr_check_node_ptr(index, block, mtr));
5554 
5555 return_after_reservations:
5556 	*err = DB_SUCCESS;
5557 
5558 	mem_heap_free(heap);
5559 
5560 	if (!ret) {
5561 		bool do_merge = btr_cur_compress_recommendation(cursor, mtr);
5562 		/* We are not allowed to do the merge, because the appropriate
5563 		locks were not taken while positioning the cursor. */
5564 		if (!allow_merge && do_merge) {
5565 			ib::info() << "Ignoring merge recommendation for page "
5566 				<< page_get_page_no(page)
5567 				<< " of index " << index->name
5568 				<< " as it could not be predicted early";
5569 			ut_ad(false);
5570 		} else if (do_merge) {
5571 
5572 			ret = btr_cur_compress_if_useful(cursor, FALSE, mtr);
5573 		}
5574 	}
5575 
5576 	if (!srv_read_only_mode
5577 	    && page_is_leaf(page)
5578 	    && !dict_index_is_online_ddl(index)) {
5579 
5580 		mtr_memo_release(mtr, dict_index_get_lock(index),
5581 				 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
5582 
5583 		/* NOTE: We cannot release the root block latch here: it holds
5584 		the segment header and has already been modified in most cases. */
5585 	}
5586 
5587 	if (n_reserved > 0) {
5588 		fil_space_release_free_extents(index->space, n_reserved);
5589 	}
5590 
5591 	return(ret);
5592 }
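
/* A rough usage sketch, assuming the caller has positioned the cursor
under BTR_MODIFY_TREE latches and, when passing FALSE for
has_reserved_extents, has set cursor->tree_height (the variable names
below are illustrative only):

	dberr_t	err;
	ibool	compressed;

	compressed = btr_cur_pessimistic_delete(&err, FALSE, &cursor,
						0, false, &mtr);
	if (err == DB_OUT_OF_FILE_SPACE) {
		(retry after extending the tablespace)
	}
*/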
5593 
5594 /*******************************************************************//**
5595 Adds path information to the cursor for the current page, for which
5596 the binary search has been performed. */
5597 static
5598 void
5599 btr_cur_add_path_info(
5600 /*==================*/
5601 	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
5602 	ulint		height,		/*!< in: height of the page in tree;
5603 					0 means leaf node */
5604 	ulint		root_height)	/*!< in: root node height in tree */
5605 {
5606 	btr_path_t*	slot;
5607 	const rec_t*	rec;
5608 	const page_t*	page;
5609 
5610 	ut_a(cursor->path_arr);
5611 
5612 	if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
5613 		/* Do nothing; return empty path */
5614 
5615 		slot = cursor->path_arr;
5616 		slot->nth_rec = ULINT_UNDEFINED;
5617 
5618 		return;
5619 	}
5620 
5621 	if (height == 0) {
5622 		/* Mark end of slots for path */
5623 		slot = cursor->path_arr + root_height + 1;
5624 		slot->nth_rec = ULINT_UNDEFINED;
5625 	}
5626 
5627 	rec = btr_cur_get_rec(cursor);
5628 
5629 	slot = cursor->path_arr + (root_height - height);
5630 
5631 	page = page_align(rec);
5632 
5633 	slot->nth_rec = page_rec_get_n_recs_before(rec);
5634 	slot->n_recs = page_get_n_recs(page);
5635 	slot->page_no = page_get_page_no(page);
5636 	slot->page_level = btr_page_get_level_low(page);
5637 }
5638 
5639 /*******************************************************************//**
5640 Estimate the number of rows between slot1 and slot2 for any level on a
5641 B-tree. This function starts from slot1->page and reads a few pages to
5642 the right, counting their records. If we reach slot2->page quickly then
5643 we know exactly how many records there are between slot1 and slot2 and
5644 we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
5645 then we calculate the average number of records in the pages scanned
5646 so far and assume that all pages that we did not scan up to slot2->page
5647 contain the same number of records, then we multiply that average by
5648 the number of pages between slot1->page and slot2->page (which is
5649 n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
5650 @return number of rows, not including the borders (exact or estimated) */
5651 static
5652 int64_t
5653 btr_estimate_n_rows_in_range_on_level(
5654 /*==================================*/
5655 	dict_index_t*	index,			/*!< in: index */
5656 	btr_path_t*	slot1,			/*!< in: left border */
5657 	btr_path_t*	slot2,			/*!< in: right border */
5658 	int64_t		n_rows_on_prev_level,	/*!< in: number of rows
5659 						on the previous level for the
5660 						same descend paths; used to
5661 						determine the number of pages
5662 						on this level */
5663 	ibool*		is_n_rows_exact)	/*!< out: TRUE if the returned
5664 						value is exact i.e. not an
5665 						estimation */
5666 {
5667 	int64_t		n_rows;
5668 	ulint		n_pages_read;
5669 	ulint		level;
5670 
5671 	n_rows = 0;
5672 	n_pages_read = 0;
5673 
5674 	/* Assume by default that we will scan all pages between
5675 	slot1->page_no and slot2->page_no. */
5676 	*is_n_rows_exact = TRUE;
5677 
5678 	/* Add records from slot1->page_no which are to the right of
5679 	the record which serves as a left border of the range, if any
5680 	(we don't include the record itself in this count). */
5681 	if (slot1->nth_rec <= slot1->n_recs) {
5682 		n_rows += slot1->n_recs - slot1->nth_rec;
5683 	}
5684 
5685 	/* Add records from slot2->page_no which are to the left of
5686 	the record which serves as a right border of the range, if any
5687 	(we don't include the record itself in this count). */
5688 	if (slot2->nth_rec > 1) {
5689 		n_rows += slot2->nth_rec - 1;
5690 	}
5691 
5692 	/* Count the records in the pages between slot1->page_no and
5693 	slot2->page_no (non inclusive), if any. */
5694 
5695 	/* Do not read more than this number of pages in order not to hurt
5696 	performance with this code which is just an estimation. If we read
5697 	this many pages before reaching slot2->page_no then we estimate the
5698 	average from the pages scanned so far. */
5699 #	define N_PAGES_READ_LIMIT	10
5700 
5701 	page_id_t		page_id(
5702 		dict_index_get_space(index), slot1->page_no);
5703 	const fil_space_t*	space = fil_space_get(index->space);
5704 	ut_ad(space);
5705 	const page_size_t	page_size(space->flags);
5706 
5707 	level = slot1->page_level;
5708 
5709 	do {
5710 		mtr_t		mtr;
5711 		page_t*		page;
5712 		buf_block_t*	block;
5713 		dberr_t		err = DB_SUCCESS;
5714 
5715 		mtr_start(&mtr);
5716 
5717 		/* Fetch the page. Because we are not holding the
5718 		index->lock, the tree may have changed and we may be
5719 		attempting to read a page that is no longer part of
5720 		the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
5721 		silence a debug assertion about this. */
5722 		block = buf_page_get_gen(page_id, page_size, RW_S_LATCH,
5723 					 NULL, BUF_GET_POSSIBLY_FREED,
5724 					 __FILE__, __LINE__, &mtr);
5725 
5726 		ut_ad((block != NULL) == (err == DB_SUCCESS));
5727 
5728 		if (err != DB_SUCCESS) {
5729 			if (err == DB_DECRYPTION_FAILED) {
5730 				ib::warn() << "Table is encrypted but encryption service or"
5731 					" used key_id is not available. "
5732 					" Can't continue reading table.";
5733 
5734 				index->table->set_file_unreadable();
5735 			}
5736 
5737 			mtr_commit(&mtr);
5738 			goto inexact;
5739 		}
5740 
5741 		page = buf_block_get_frame(block);
5742 
5743 		/* It is possible that the tree has been reorganized in the
5744 		meantime and this is a different page. If this happens the
5745 		calculated estimate will be bogus, which is not fatal as
5746 		this is only an estimate. We are sure that a page with
5747 		page_no exists because InnoDB never frees pages, only
5748 		reuses them. */
5749 		if (!fil_page_index_page_check(page)
5750 		    || btr_page_get_index_id(page) != index->id
5751 		    || btr_page_get_level_low(page) != level) {
5752 
5753 			/* The page got reused for something else */
5754 			mtr_commit(&mtr);
5755 			goto inexact;
5756 		}
5757 
5758 		/* It is possible but highly unlikely that the page was
5759 		originally written by an old version of InnoDB that did
5760 		not initialize FIL_PAGE_TYPE on other than B-tree pages.
5761 		For example, this could be an almost-empty BLOB page
5762 		that happens to contain the magic values in the fields
5763 		that we checked above. */
5764 
5765 		n_pages_read++;
5766 
5767 		if (page_id.page_no() != slot1->page_no) {
5768 			/* Do not count the records on slot1->page_no,
5769 			we already counted them before this loop. */
5770 			n_rows += page_get_n_recs(page);
5771 		}
5772 
5773 		page_id.set_page_no(btr_page_get_next(page, &mtr));
5774 
5775 		mtr_commit(&mtr);
5776 
5777 		if (n_pages_read == N_PAGES_READ_LIMIT
5778 		    || page_id.page_no() == FIL_NULL) {
5779 			/* Either we read too many pages or
5780 			we reached the end of the level without passing
5781 			through slot2->page_no, the tree must have changed
5782 			in the meantime */
5783 			goto inexact;
5784 		}
5785 
5786 	} while (page_id.page_no() != slot2->page_no);
5787 
5788 	return(n_rows);
5789 
5790 inexact:
5791 
5792 	*is_n_rows_exact = FALSE;
5793 
5794 	/* We did interrupt before reaching slot2->page */
5795 
5796 	if (n_pages_read > 0) {
5797 		/* The number of pages on this level is
5798 		n_rows_on_prev_level, multiply it by the
5799 		average number of recs per page so far */
5800 		n_rows = n_rows_on_prev_level
5801 			* n_rows / n_pages_read;
5802 	} else {
5803 		/* The tree changed before we could even
5804 		start with slot1->page_no */
5805 		n_rows = 10;
5806 	}
5807 
5808 	return(n_rows);
5809 }
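
/* For illustration of the inexact branch above: if the previous level
suggested about 100 pages on this level (n_rows_on_prev_level == 100) and
the 10 pages read before giving up held 150 records in total, the
estimate becomes 100 * 150 / 10 == 1500 rows. */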
5810 
5811 /** If the tree gets changed too much between the two dives for the left
5812 and right boundary then btr_estimate_n_rows_in_range_low() will retry
5813 that many times before giving up and returning the value stored in
5814 rows_in_range_arbitrary_ret_val. */
5815 static const unsigned	rows_in_range_max_retries = 4;
5816 
5817 /** We pretend that a range has that many records if the tree keeps changing
5818 for rows_in_range_max_retries retries while we try to estimate the records
5819 in a given range. */
5820 static const int64_t	rows_in_range_arbitrary_ret_val = 10;
5821 
5822 /** Estimates the number of rows in a given index range.
5823 @param[in]	index		index
5824 @param[in]	tuple1		range start, may also be empty tuple
5825 @param[in]	mode1		search mode for range start
5826 @param[in]	tuple2		range end, may also be empty tuple
5827 @param[in]	mode2		search mode for range end
5828 @param[in]	nth_attempt	if the tree gets modified too much while
5829 we are trying to analyze it, then we will retry (this function will call
5830 itself, incrementing this parameter)
5831 @return estimated number of rows; if after rows_in_range_max_retries
5832 retries the tree keeps changing, then we will just return
5833 rows_in_range_arbitrary_ret_val as a result (if
5834 nth_attempt >= rows_in_range_max_retries and the tree is modified between
5835 the two dives). */
5836 static
5837 int64_t
5838 btr_estimate_n_rows_in_range_low(
5839 	dict_index_t*	index,
5840 	const dtuple_t*	tuple1,
5841 	page_cur_mode_t	mode1,
5842 	const dtuple_t*	tuple2,
5843 	page_cur_mode_t	mode2,
5844 	unsigned	nth_attempt)
5845 {
5846 	btr_path_t	path1[BTR_PATH_ARRAY_N_SLOTS];
5847 	btr_path_t	path2[BTR_PATH_ARRAY_N_SLOTS];
5848 	btr_cur_t	cursor;
5849 	btr_path_t*	slot1;
5850 	btr_path_t*	slot2;
5851 	ibool		diverged;
5852 	ibool		diverged_lot;
5853 	ulint		divergence_level;
5854 	int64_t		n_rows;
5855 	ibool		is_n_rows_exact;
5856 	ulint		i;
5857 	mtr_t		mtr;
5858 	int64_t		table_n_rows;
5859 
5860 	table_n_rows = dict_table_get_n_rows(index->table);
5861 
5862 	/* Below we dive to the two records specified by tuple1 and tuple2 and
5863 	we remember the entire dive paths from the tree root. The place where
5864 	the tuple1 path ends on the leaf level we call "left border" of our
5865 	interval and the place where the tuple2 path ends on the leaf level -
5866 	"right border". We take care to either include or exclude the interval
5867 	boundaries depending on whether <, <=, > or >= was specified. For
5868 	example if "5 < x AND x <= 10" then we should not include the left
5869 	boundary, but should include the right one. */
5870 
5871 	mtr_start(&mtr);
5872 
5873 	cursor.path_arr = path1;
5874 
5875 	bool	should_count_the_left_border = false;
5876 
5877 	if (dtuple_get_n_fields(tuple1) > 0) {
5878 
5879 		btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
5880 					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
5881 					    &cursor, 0,
5882 					    __FILE__, __LINE__, &mtr);
5883 
5884 		if (index->is_readable())
5885 		{
5886 			ut_ad(!page_rec_is_infimum(btr_cur_get_rec(&cursor)));
5887 
5888 			/* We should count the border if there are any records to
5889 			match the criteria, i.e. if the maximum record on the tree is
5890 			5 and x > 3 is specified then the cursor will be positioned at
5891 			5 and we should count the border, but if x > 7 is specified,
5892 			then the cursor will be positioned at 'sup' on the rightmost
5893 			leaf page in the tree and we should not count the border. */
5894 			should_count_the_left_border
5895 				= !page_rec_is_supremum(btr_cur_get_rec(&cursor));
5896 		}
5897 	} else {
5898 		dberr_t err = btr_cur_open_at_index_side(true, index,
5899 					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
5900 					   &cursor, 0, &mtr);
5901 
5902 		if (err != DB_SUCCESS) {
5903 			ib::warn() << " Error code: " << err
5904 				   << " btr_estimate_n_rows_in_range_low "
5905 				   << " called from file: "
5906 				   << __FILE__ << " line: " << __LINE__
5907 				   << " table: " << index->table->name
5908 				   << " index: " << index->name;
5909 		}
5910 
5911 		if (index->is_readable()) {
5912 			ut_ad(page_rec_is_infimum(btr_cur_get_rec(&cursor)));
5913 
5914 			/* The range specified is without a left border, just
5915 			'x < 123' or 'x <= 123' and btr_cur_open_at_index_side()
5916 			positioned the cursor on the infimum record on the leftmost
5917 			page, which must not be counted. */
5918 			should_count_the_left_border = false;
5919 		}
5920 	}
5921 
5922 	mtr_commit(&mtr);
5923 
5924 	if (!index->is_readable()) {
5925 		return 0;
5926 	}
5927 
5928 #ifdef UNIV_DEBUG
5929 	if (!strcmp(index->name, "iC")) {
5930 		DEBUG_SYNC_C("btr_estimate_n_rows_in_range_between_dives");
5931 	}
5932 #endif
5933 
5934 	mtr_start(&mtr);
5935 
5936 	cursor.path_arr = path2;
5937 
5938 	bool	should_count_the_right_border;
5939 
5940 	if (dtuple_get_n_fields(tuple2) > 0) {
5941 
5942 		btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
5943 					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
5944 					    &cursor, 0,
5945 					    __FILE__, __LINE__, &mtr);
5946 
5947 		const rec_t*	rec = btr_cur_get_rec(&cursor);
5948 
5949 		ut_ad(!(mode2 == PAGE_CUR_L && page_rec_is_supremum(rec)));
5950 
5951 		should_count_the_right_border
5952 			= (mode2 == PAGE_CUR_LE /* if the range is '<=' */
5953 			   /* and the record was found */
5954 			   && cursor.low_match >= dtuple_get_n_fields(tuple2))
5955 			|| (mode2 == PAGE_CUR_L /* or if the range is '<' */
5956 			    /* and there are any records to match the criteria,
5957 			    i.e. if the minimum record on the tree is 5 and
5958 			    x < 7 is specified then the cursor will be
5959 			    positioned at 5 and we should count the border, but
5960 			    if x < 2 is specified, then the cursor will be
5961 			    positioned at 'inf' and we should not count the
5962 			    border */
5963 			    && !page_rec_is_infimum(rec));
5964 		/* Notice that for "WHERE col <= 'foo'" MySQL passes to
5965 		ha_innobase::records_in_range():
5966 		min_key=NULL (left-unbounded) which is expected
5967 		max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is
5968 		unexpected - one would expect
5969 		flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In this case the
5970 		cursor will be positioned on the first record to the right of
5971 		the requested one (can also be positioned on the 'sup') and
5972 		we should not count the right border. */
5973 	} else {
5974 		dberr_t err = btr_cur_open_at_index_side(false, index,
5975 					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
5976 					   &cursor, 0, &mtr);
5977 
5978 		if (err != DB_SUCCESS) {
5979 			ib::warn() << " Error code: " << err
5980 				   << " btr_estimate_n_rows_in_range_low "
5981 				   << " called from file: "
5982 				   << __FILE__ << " line: " << __LINE__
5983 				   << " table: " << index->table->name
5984 				   << " index: " << index->name;
5985 		}
5986 
5987 		ut_ad(page_rec_is_supremum(btr_cur_get_rec(&cursor)));
5988 
5989 		/* The range specified is without a right border, just
5990 		'x > 123' or 'x >= 123' and btr_cur_open_at_index_side()
5991 		positioned the cursor on the supremum record on the rightmost
5992 		page, which must not be counted. */
5993 		should_count_the_right_border = false;
5994 	}
5995 
5996 	mtr_commit(&mtr);
5997 
5998 	/* We have the path information for the range in path1 and path2 */
5999 
6000 	n_rows = 0;
6001 	is_n_rows_exact = TRUE;
6002 
6003 	/* This becomes true when the two paths do not pass through the
6004 	same pages anymore. */
6005 	diverged = FALSE;
6006 
6007 	/* This becomes true when the paths are not the same or adjacent
6008 	any more. This means that they pass through the same or
6009 	neighboring-on-the-same-level pages only. */
6010 	diverged_lot = FALSE;
6011 
6012 	/* This is the level where paths diverged a lot. */
6013 	divergence_level = 1000000;
6014 
6015 	for (i = 0; ; i++) {
6016 		ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
6017 
6018 		slot1 = path1 + i;
6019 		slot2 = path2 + i;
6020 
6021 		if (slot1->nth_rec == ULINT_UNDEFINED
6022 		    || slot2->nth_rec == ULINT_UNDEFINED) {
6023 
6024 			/* Here none of the borders were counted. For example,
6025 			if on the leaf level we descended to:
6026 			(inf, a, b, c, d, e, f, sup)
6027 			         ^        ^
6028 			       path1    path2
6029 			then n_rows will be 2 (c and d). */
6030 
6031 			if (is_n_rows_exact) {
6032 				/* Only fiddle to adjust this off-by-one
6033 				if the number is exact, otherwise we do
6034 				much grosser adjustments below. */
6035 
6036 				btr_path_t*	last1 = &path1[i - 1];
6037 				btr_path_t*	last2 = &path2[i - 1];
6038 
6039 				/* If both paths end up on the same record on
6040 				the leaf level. */
6041 				if (last1->page_no == last2->page_no
6042 				    && last1->nth_rec == last2->nth_rec) {
6043 
6044 					/* n_rows can be > 0 here if the paths
6045 					were first different and then converged
6046 					to the same record on the leaf level.
6047 					For example:
6048 					SELECT ... LIKE 'wait/synch/rwlock%'
6049 					mode1=PAGE_CUR_GE,
6050 					tuple1="wait/synch/rwlock"
6051 					path1[0]={nth_rec=58, n_recs=58,
6052 						  page_no=3, page_level=1}
6053 					path1[1]={nth_rec=56, n_recs=55,
6054 						  page_no=119, page_level=0}
6055 
6056 					mode2=PAGE_CUR_G
6057 					tuple2="wait/synch/rwlock"
6058 					path2[0]={nth_rec=57, n_recs=57,
6059 						  page_no=3, page_level=1}
6060 					path2[1]={nth_rec=56, n_recs=55,
6061 						  page_no=119, page_level=0} */
6062 
6063 					/* If the range is such that we should
6064 					count both borders, then avoid
6065 					counting that record twice - once as a
6066 					left border and once as a right
6067 					border. */
6068 					if (should_count_the_left_border
6069 					    && should_count_the_right_border) {
6070 
6071 						n_rows = 1;
6072 					} else {
6073 						/* Some of the borders should
6074 						not be counted, e.g. [3,3). */
6075 						n_rows = 0;
6076 					}
6077 				} else {
6078 					if (should_count_the_left_border) {
6079 						n_rows++;
6080 					}
6081 
6082 					if (should_count_the_right_border) {
6083 						n_rows++;
6084 					}
6085 				}
6086 			}
6087 
6088 			if (i > divergence_level + 1 && !is_n_rows_exact) {
6089 				/* In trees whose height is > 1 our algorithm
6090 				tends to underestimate: multiply the estimate
6091 				by 2: */
6092 
6093 				n_rows = n_rows * 2;
6094 			}
6095 
6096 			DBUG_EXECUTE_IF("bug14007649", return(n_rows););
6097 
6098 			/* Do not estimate the number of rows in the range
6099 			to over 1 / 2 of the estimated rows in the whole
6100 			table */
6101 
6102 			if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {
6103 
6104 				n_rows = table_n_rows / 2;
6105 
6106 				/* If there are just 0 or 1 rows in the table,
6107 				then we estimate all rows are in the range */
6108 
6109 				if (n_rows == 0) {
6110 					n_rows = table_n_rows;
6111 				}
6112 			}
6113 
6114 			return(n_rows);
6115 		}
6116 
6117 		if (!diverged && slot1->nth_rec != slot2->nth_rec) {
6118 
6119 			/* If both slots do not point to the same page or if
6120 			the paths have crossed and the same page on both
6121 			apparently contains a different number of records,
6122 			this means that the tree must have changed between
6123 			the dive for slot1 and the dive for slot2 at the
6124 			beginning of this function. */
6125 			if (slot1->page_no != slot2->page_no
6126 			    || slot1->page_level != slot2->page_level
6127 			    || (slot1->nth_rec >= slot2->nth_rec
6128 				&& slot1->n_recs != slot2->n_recs)) {
6129 
6130 				/* If the tree keeps changing even after a
6131 				few attempts, then just return some arbitrary
6132 				number. */
6133 				if (nth_attempt >= rows_in_range_max_retries) {
6134 					return(rows_in_range_arbitrary_ret_val);
6135 				}
6136 
6137 				const int64_t	ret =
6138 					btr_estimate_n_rows_in_range_low(
6139 						index, tuple1, mode1,
6140 						tuple2, mode2, nth_attempt + 1);
6141 
6142 				return(ret);
6143 			}
6144 
6145 			diverged = TRUE;
6146 
6147 			if (slot1->nth_rec < slot2->nth_rec) {
6148 				/* We do not count the borders (nor the left
6149 				nor the right one), thus "- 1". */
6150 				n_rows = slot2->nth_rec - slot1->nth_rec - 1;
6151 
6152 				if (n_rows > 0) {
6153 					/* There is at least one row between
6154 					the two borders pointed to by slot1
6155 					and slot2, so on the level below the
6156 					slots will point to non-adjacent
6157 					pages. */
6158 					diverged_lot = TRUE;
6159 					divergence_level = i;
6160 				}
6161 			} else {
6162 				/* It is possible that
6163 				slot1->nth_rec >= slot2->nth_rec
6164 				if, for example, we have a single page
6165 				tree which contains (inf, 5, 6, supr)
6166 				and we select where x > 20 and x < 30;
6167 				in this case slot1->nth_rec will point
6168 				to the supr record and slot2->nth_rec
6169 				will point to 6 */
6170 				return(0);
6171 			}
6172 
6173 		} else if (diverged && !diverged_lot) {
6174 
6175 			if (slot1->nth_rec < slot1->n_recs
6176 			    || slot2->nth_rec > 1) {
6177 
6178 				diverged_lot = TRUE;
6179 				divergence_level = i;
6180 
6181 				n_rows = 0;
6182 
6183 				if (slot1->nth_rec < slot1->n_recs) {
6184 					n_rows += slot1->n_recs
6185 						- slot1->nth_rec;
6186 				}
6187 
6188 				if (slot2->nth_rec > 1) {
6189 					n_rows += slot2->nth_rec - 1;
6190 				}
6191 			}
6192 		} else if (diverged_lot) {
6193 
6194 			n_rows = btr_estimate_n_rows_in_range_on_level(
6195 				index, slot1, slot2, n_rows,
6196 				&is_n_rows_exact);
6197 		}
6198 	}
6199 }
6200 
6201 /** Estimates the number of rows in a given index range.
6202 @param[in]	index	index
6203 @param[in]	tuple1	range start, may also be empty tuple
6204 @param[in]	mode1	search mode for range start
6205 @param[in]	tuple2	range end, may also be empty tuple
6206 @param[in]	mode2	search mode for range end
6207 @return estimated number of rows */
6208 int64_t
6209 btr_estimate_n_rows_in_range(
6210 	dict_index_t*	index,
6211 	const dtuple_t*	tuple1,
6212 	page_cur_mode_t	mode1,
6213 	const dtuple_t*	tuple2,
6214 	page_cur_mode_t	mode2)
6215 {
6216 	const int64_t	ret = btr_estimate_n_rows_in_range_low(
6217 		index, tuple1, mode1, tuple2, mode2, 1 /* first attempt */);
6218 
6219 	return(ret);
6220 }
6221 
6222 /*******************************************************************//**
6223 Record the number of non_null key values in a given index for
6224 each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6225 The estimates are eventually stored in the array:
6226 index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
6227 static
6228 void
6229 btr_record_not_null_field_in_rec(
6230 /*=============================*/
6231 	ulint		n_unique,	/*!< in: dict_index_get_n_unique(index),
6232 					number of columns uniquely determine
6233 					an index entry */
6234 	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
6235 					its size could be for all fields or
6236 					that of "n_unique" */
6237 	ib_uint64_t*	n_not_null)	/*!< in/out: array to record number of
6238 					not null rows for n-column prefix */
6239 {
6240 	ulint	i;
6241 
6242 	ut_ad(rec_offs_n_fields(offsets) >= n_unique);
6243 
6244 	if (n_not_null == NULL) {
6245 		return;
6246 	}
6247 
6248 	for (i = 0; i < n_unique; i++) {
6249 		if (rec_offs_nth_sql_null(offsets, i)) {
6250 			break;
6251 		}
6252 
6253 		n_not_null[i]++;
6254 	}
6255 }
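
/* For example, with n_unique == 3 and a record whose key prefix is
(1, NULL, 3), only n_not_null[0] is incremented: the loop above stops at
the first SQL NULL column, so later non-NULL columns are not counted. */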
6256 
6257 /*******************************************************************//**
6258 Estimates the number of different key values in a given index, for
6259 each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6260 The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
6261 0..n_uniq-1) and the number of pages that were sampled is saved in
6262 index->stat_n_sample_sizes[].
6263 If innodb_stats_method is nulls_ignored, we also record the number of
6264 non-null values for each prefix and store the estimates in the
6265 array index->stat_n_non_null_key_vals.
6266 @return true if the index is available and we get the estimated numbers,
6267 false if the index is unavailable. */
6268 bool
6269 btr_estimate_number_of_different_key_vals(
6270 /*======================================*/
6271 	dict_index_t*	index)	/*!< in: index */
6272 {
6273 	btr_cur_t	cursor;
6274 	page_t*		page;
6275 	rec_t*		rec;
6276 	ulint		n_cols;
6277 	ib_uint64_t*	n_diff;
6278 	ib_uint64_t*	n_not_null;
6279 	ibool		stats_null_not_equal;
6280 	uintmax_t	n_sample_pages; /* number of pages to sample */
6281 	ulint		not_empty_flag	= 0;
6282 	ulint		total_external_size = 0;
6283 	ulint		i;
6284 	ulint		j;
6285 	uintmax_t	add_on;
6286 	mtr_t		mtr;
6287 	mem_heap_t*	heap		= NULL;
6288 	ulint*		offsets_rec	= NULL;
6289 	ulint*		offsets_next_rec = NULL;
6290 
6291 	/* For a spatial index, no such statistics can be
6292 	gathered. */
6293 	if (dict_index_is_spatial(index)) {
6294 		return(false);
6295 	}
6296 
6297 	n_cols = dict_index_get_n_unique(index);
6298 
6299 	heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
6300 			       * n_cols
6301 			       + dict_index_get_n_fields(index)
6302 			       * (sizeof *offsets_rec
6303 				  + sizeof *offsets_next_rec));
6304 
6305 	n_diff = (ib_uint64_t*) mem_heap_zalloc(
6306 		heap, n_cols * sizeof(n_diff[0]));
6307 
6308 	n_not_null = NULL;
6309 
6310 	/* Check srv_innodb_stats_method setting, and decide whether we
6311 	need to record non-null value and also decide if NULL is
6312 	considered equal (by setting stats_null_not_equal value) */
6313 	switch (srv_innodb_stats_method) {
6314 	case SRV_STATS_NULLS_IGNORED:
6315 		n_not_null = (ib_uint64_t*) mem_heap_zalloc(
6316 			heap, n_cols * sizeof *n_not_null);
6317 		/* fall through */
6318 
6319 	case SRV_STATS_NULLS_UNEQUAL:
6320 		/* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
6321 		case, we will treat NULLs as unequal value */
6322 		stats_null_not_equal = TRUE;
6323 		break;
6324 
6325 	case SRV_STATS_NULLS_EQUAL:
6326 		stats_null_not_equal = FALSE;
6327 		break;
6328 
6329 	default:
6330 		ut_error;
6331 	}
6332 
6333 	/* It makes no sense to test more pages than are contained
6334 	in the index, thus we lower the number if it is too high */
6335 	if (srv_stats_transient_sample_pages > index->stat_index_size) {
6336 		if (index->stat_index_size > 0) {
6337 			n_sample_pages = index->stat_index_size;
6338 		} else {
6339 			n_sample_pages = 1;
6340 		}
6341 	} else {
6342 		n_sample_pages = srv_stats_transient_sample_pages;
6343 	}
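
	/* For example, with srv_stats_transient_sample_pages == 8 but an
	index occupying only 3 pages, n_sample_pages becomes 3; an empty
	index still samples a single page. */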
6344 
6345 	/* We sample some pages in the index to get an estimate */
6346 
6347 	for (i = 0; i < n_sample_pages; i++) {
6348 		mtr_start(&mtr);
6349 
6350 		bool	available;
6351 
6352 		available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
6353 						    &cursor, &mtr);
6354 
6355 		if (!available) {
6356 			mtr_commit(&mtr);
6357 			mem_heap_free(heap);
6358 
6359 			return(false);
6360 		}
6361 
6362 		/* Count the number of different key values for each prefix of
6363 		the key on this index page. If the prefix does not determine
6364 		the index record uniquely in the B-tree, then we subtract one
6365 		because otherwise our algorithm would give a wrong estimate
6366 		for an index where there is just one key value. */
6367 
6368 		page = btr_cur_get_page(&cursor);
6369 
6370 		SRV_CORRUPT_TABLE_CHECK(page, goto exit_loop;);
6371 		DBUG_EXECUTE_IF("ib_corrupt_page_while_stats_calc",
6372 				page = NULL;);
6373 
6374 		SRV_CORRUPT_TABLE_CHECK(page,
6375 		{
6376 			mtr_commit(&mtr);
6377 			goto exit_loop;
6378 		});
6379 
6380 		rec = page_rec_get_next(page_get_infimum_rec(page));
6381 
6382 		if (!page_rec_is_supremum(rec)) {
6383 			not_empty_flag = 1;
6384 			offsets_rec = rec_get_offsets(rec, index, offsets_rec,
6385 						      ULINT_UNDEFINED, &heap);
6386 
6387 			if (n_not_null != NULL) {
6388 				btr_record_not_null_field_in_rec(
6389 					n_cols, offsets_rec, n_not_null);
6390 			}
6391 		}
6392 
6393 		while (!page_rec_is_supremum(rec)) {
6394 			ulint	matched_fields;
6395 			rec_t*	next_rec = page_rec_get_next(rec);
6396 			if (page_rec_is_supremum(next_rec)) {
6397 				total_external_size +=
6398 					btr_rec_get_externally_stored_len(
6399 						rec, offsets_rec);
6400 				break;
6401 			}
6402 
6403 			offsets_next_rec = rec_get_offsets(next_rec, index,
6404 							   offsets_next_rec,
6405 							   ULINT_UNDEFINED,
6406 							   &heap);
6407 
6408 			cmp_rec_rec_with_match(rec, next_rec,
6409 					       offsets_rec, offsets_next_rec,
6410 					       index,
6411 					       page_is_spatial_non_leaf(next_rec, index),
6412 					       stats_null_not_equal,
6413 					       &matched_fields);
6414 
6415 			for (j = matched_fields; j < n_cols; j++) {
6416 				/* We add one if this index record has
6417 				a different prefix from the previous */
6418 
6419 				n_diff[j]++;
6420 			}
6421 
6422 			if (n_not_null != NULL) {
6423 				btr_record_not_null_field_in_rec(
6424 					n_cols, offsets_next_rec, n_not_null);
6425 			}
6426 
6427 			total_external_size
6428 				+= btr_rec_get_externally_stored_len(
6429 					rec, offsets_rec);
6430 
6431 			rec = next_rec;
6432 			/* Initialize offsets_rec for the next round
6433 			and assign the old offsets_rec buffer to
6434 			offsets_next_rec. */
6435 			{
6436 				ulint*	offsets_tmp = offsets_rec;
6437 				offsets_rec = offsets_next_rec;
6438 				offsets_next_rec = offsets_tmp;
6439 			}
6440 		}
6441 
6442 
6443 		if (n_cols == dict_index_get_n_unique_in_tree(index)) {
6444 
6445 			/* If there is more than one leaf page in the tree,
6446 			we add one because we know that the first record
6447 			on the page certainly had a different prefix than the
6448 			last record on the previous index page in the
6449 			alphabetical order. Before this fix, if there was
6450 			just one big record on each clustered index page, the
6451 			algorithm grossly underestimated the number of rows
6452 			in the table. */
6453 
6454 			if (btr_page_get_prev(page, &mtr) != FIL_NULL
6455 			    || btr_page_get_next(page, &mtr) != FIL_NULL) {
6456 
6457 				n_diff[n_cols - 1]++;
6458 			}
6459 		}
6460 
6461 		mtr_commit(&mtr);
6462 	}
6463 
6464 exit_loop:
6465 	/* If we saw k borders between different key values on
6466 	n_sample_pages leaf pages, we can estimate how many
6467 	there will be in index->stat_n_leaf_pages */
6468 
6469 	/* We must take into account that our sample actually represents
6470 	also the pages used for external storage of fields (those pages are
6471 	included in index->stat_n_leaf_pages) */
6472 
6473 	for (j = 0; j < n_cols; j++) {
6474 		index->stat_n_diff_key_vals[j]
6475 			= BTR_TABLE_STATS_FROM_SAMPLE(
6476 				n_diff[j], index, n_sample_pages,
6477 				total_external_size, not_empty_flag);
6478 
6479 		/* If the tree is small, smaller than
6480 		10 * n_sample_pages + total_external_size, then
6481 		the above estimate is ok. For bigger trees it is common that we
6482 		do not see any borders between key values in the few pages
6483 		we pick. But still there may be n_sample_pages
6484 		different key values, or even more. Let us try to approximate
6485 		that: */
6486 
6487 		add_on = index->stat_n_leaf_pages
6488 			/ (10 * (n_sample_pages
6489 				 + total_external_size));
6490 
6491 		if (add_on > n_sample_pages) {
6492 			add_on = n_sample_pages;
6493 		}
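
		/* For illustration: with index->stat_n_leaf_pages == 10000,
		n_sample_pages == 8 and total_external_size == 0, add_on is
		10000 / 80 == 125, which the cap above reduces back to 8. */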
6494 
6495 		index->stat_n_diff_key_vals[j] += add_on;
6496 
6497 		index->stat_n_sample_sizes[j] = n_sample_pages;
6498 
6499 		/* Update the stat_n_non_null_key_vals[] with our
6500 		sampled result. stat_n_non_null_key_vals[] is created
6501 		and initialized to zero in dict_index_add_to_cache(),
6502 		along with stat_n_diff_key_vals[] array */
6503 		if (n_not_null != NULL) {
6504 			index->stat_n_non_null_key_vals[j] =
6505 				 BTR_TABLE_STATS_FROM_SAMPLE(
6506 					n_not_null[j], index, n_sample_pages,
6507 					total_external_size, not_empty_flag);
6508 		}
6509 	}
6510 
6511 	mem_heap_free(heap);
6512 
6513 	return(true);
6514 }
6515 
6516 /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
6517 
6518 /***********************************************************//**
6519 Gets the offset of the pointer to the externally stored part of a field.
6520 @return offset of the pointer to the externally stored part */
6521 static
6522 ulint
6523 btr_rec_get_field_ref_offs(
6524 /*=======================*/
6525 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
6526 	ulint		n)	/*!< in: index of the external field */
6527 {
6528 	ulint	field_ref_offs;
6529 	ulint	local_len;
6530 
6531 	ut_a(rec_offs_nth_extern(offsets, n));
6532 	field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
6533 	ut_a(local_len != UNIV_SQL_NULL);
6534 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6535 
6536 	return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
6537 }
6538 
6539 /** Gets a pointer to the externally stored part of a field.
6540 @param rec record
6541 @param offsets rec_get_offsets(rec)
6542 @param n index of the externally stored field
6543 @return pointer to the externally stored part */
6544 #define btr_rec_get_field_ref(rec, offsets, n)			\
6545 	((rec) + btr_rec_get_field_ref_offs(offsets, n))
6546 
6547 /** Gets the externally stored size of a record, in units of a database page.
6548 @param[in]	rec	record
6549 @param[in]	offsets	array returned by rec_get_offsets()
6550 @return externally stored part, in units of a database page */
6551 ulint
6552 btr_rec_get_externally_stored_len(
6553 	const rec_t*	rec,
6554 	const ulint*	offsets)
6555 {
6556 	ulint	n_fields;
6557 	ulint	total_extern_len = 0;
6558 	ulint	i;
6559 
6560 	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6561 
6562 	if (!rec_offs_any_extern(offsets)) {
6563 		return(0);
6564 	}
6565 
6566 	n_fields = rec_offs_n_fields(offsets);
6567 
6568 	for (i = 0; i < n_fields; i++) {
6569 		if (rec_offs_nth_extern(offsets, i)) {
6570 
6571 			ulint	extern_len = mach_read_from_4(
6572 				btr_rec_get_field_ref(rec, offsets, i)
6573 				+ BTR_EXTERN_LEN + 4);
6574 
6575 			total_extern_len += ut_calc_align(extern_len,
6576 							  UNIV_PAGE_SIZE);
6577 		}
6578 	}
6579 
6580 	return(total_extern_len / UNIV_PAGE_SIZE);
6581 }
6582 
6583 /*******************************************************************//**
6584 Sets the ownership bit of an externally stored field in a record. */
6585 static
6586 void
6587 btr_cur_set_ownership_of_extern_field(
6588 /*==================================*/
6589 	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
6590 				part will be updated, or NULL */
6591 	rec_t*		rec,	/*!< in/out: clustered index record */
6592 	dict_index_t*	index,	/*!< in: index of the page */
6593 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
6594 	ulint		i,	/*!< in: field number */
6595 	ibool		val,	/*!< in: value to set */
6596 	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
6597 {
6598 	byte*	data;
6599 	ulint	local_len;
6600 	ulint	byte_val;
6601 
6602 	data = rec_get_nth_field(rec, offsets, i, &local_len);
6603 	ut_ad(rec_offs_nth_extern(offsets, i));
6604 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6605 
6606 	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
6607 
6608 	byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
6609 
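	/* Note that the flag encodes non-ownership: when the record owns the
	externally stored field (val == TRUE) BTR_EXTERN_OWNER_FLAG is
	cleared, and when the record does not own it the flag is set. */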
6610 	if (val) {
6611 		byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
6612 	} else {
6613 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6614 		ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
6615 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6616 		byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
6617 	}
6618 
6619 	if (page_zip) {
6620 		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
6621 		page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
6622 	} else if (mtr != NULL) {
6623 
6624 		mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
6625 				 MLOG_1BYTE, mtr);
6626 	} else {
6627 		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
6628 	}
6629 }
6630 
6631 /*******************************************************************//**
6632 Marks non-updated off-page fields as disowned by this record. The ownership
6633 must be transferred to the updated record which is inserted elsewhere in the
6634 index tree. In purge only the owner of an externally stored field is allowed
6635 to free the field. */
6636 void
6637 btr_cur_disown_inherited_fields(
6638 /*============================*/
6639 	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
6640 				part will be updated, or NULL */
6641 	rec_t*		rec,	/*!< in/out: record in a clustered index */
6642 	dict_index_t*	index,	/*!< in: index of the page */
6643 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
6644 	const upd_t*	update,	/*!< in: update vector */
6645 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
6646 {
6647 	ulint	i;
6648 
6649 	ut_ad(rec_offs_validate(rec, index, offsets));
6650 	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6651 	ut_ad(rec_offs_any_extern(offsets));
6652 	ut_ad(mtr);
6653 
6654 	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
6655 		if (rec_offs_nth_extern(offsets, i)
6656 		    && !upd_get_field_by_field_no(update, i, false)) {
6657 			btr_cur_set_ownership_of_extern_field(
6658 				page_zip, rec, index, offsets, i, FALSE, mtr);
6659 		}
6660 	}
6661 }
6662 
6663 /*******************************************************************//**
6664 Marks all extern fields in a record as owned by the record. This function
6665 should be called if the delete mark of a record is removed: a record that
6666 is not delete-marked always owns all its extern fields.
6667 static
6668 void
6669 btr_cur_unmark_extern_fields(
6670 /*=========================*/
6671 	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
6672 				part will be updated, or NULL */
6673 	rec_t*		rec,	/*!< in/out: record in a clustered index */
6674 	dict_index_t*	index,	/*!< in: index of the page */
6675 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
6676 	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
6677 {
6678 	ulint	n;
6679 	ulint	i;
6680 
6681 	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6682 	n = rec_offs_n_fields(offsets);
6683 
6684 	if (!rec_offs_any_extern(offsets)) {
6685 
6686 		return;
6687 	}
6688 
6689 	for (i = 0; i < n; i++) {
6690 		if (rec_offs_nth_extern(offsets, i)) {
6691 
6692 			btr_cur_set_ownership_of_extern_field(
6693 				page_zip, rec, index, offsets, i, TRUE, mtr);
6694 		}
6695 	}
6696 }
6697 
6698 /*******************************************************************//**
6699 Returns the length of a BLOB part stored on the header page.
6700 @return part length */
6701 static
6702 ulint
6703 btr_blob_get_part_len(
6704 /*==================*/
6705 	const byte*	blob_header)	/*!< in: blob header */
6706 {
6707 	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
6708 }
6709 
6710 /*******************************************************************//**
6711 Returns the page number where the next BLOB part is stored.
6712 @return page number or FIL_NULL if no more pages */
6713 static
6714 ulint
6715 btr_blob_get_next_page_no(
6716 /*======================*/
6717 	const byte*	blob_header)	/*!< in: blob header */
6718 {
6719 	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
6720 }
6721 
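/* An uncompressed BLOB page thus looks, roughly, as follows (informal
sketch; see the BTR_BLOB_HDR_* definitions):

	FIL_PAGE_DATA bytes of page header
	4-byte BLOB part length        (BTR_BLOB_HDR_PART_LEN)
	4-byte next BLOB page number   (BTR_BLOB_HDR_NEXT_PAGE_NO,
					FIL_NULL on the last page)
	BLOB data payload
	FIL_PAGE_DATA_END bytes of page trailer

Compressed BLOB pages instead carry a raw zlib stream that starts at
FIL_PAGE_DATA and are chained through FIL_PAGE_NEXT in the page header. */
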
6722 /*******************************************************************//**
6723 Deallocate a buffer block that was reserved for a BLOB part. */
6724 static
6725 void
6726 btr_blob_free(
6727 /*==========*/
6728 	dict_index_t*	index,	/*!< in: index */
6729 	buf_block_t*	block,	/*!< in: buffer block */
6730 	ibool		all,	/*!< in: TRUE=remove also the compressed page
6731 				if there is one */
6732 	mtr_t*		mtr)	/*!< in: mini-transaction to commit */
6733 {
6734 	buf_pool_t*	buf_pool = buf_pool_from_block(block);
6735 	page_id_t	page_id(block->page.id.space(),
6736 				block->page.id.page_no());
6737 	bool	freed	= false;
6738 
6739 	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
6740 
6741 	mtr_commit(mtr);
6742 
6743 	mutex_enter(&buf_pool->LRU_list_mutex);
6744 	buf_page_mutex_enter(block);
6745 
6746 	/* Only free the block if it is still allocated to
6747 	the same file page. */
6748 
6749 	if (buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE
6750 	    && page_id.equals_to(block->page.id)) {
6751 
6752 		freed = buf_LRU_free_page(&block->page, all);
6753 
6754 		if (!freed && all && block->page.zip.data
6755 		    && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE
6756 		    && page_id.equals_to(block->page.id)) {
6757 
6758 			/* Attempt to deallocate the uncompressed page
6759 			if the whole block cannot be deallocated. */
6760 
6761 			freed = buf_LRU_free_page(&block->page, false);
6762 		}
6763 	}
6764 
6765 	if (!freed) {
6766 		mutex_exit(&buf_pool->LRU_list_mutex);
6767 		buf_page_mutex_exit(block);
6768 	}
6769 }
6770 
6771 /** Helper class used while writing blob pages, during insert or update. */
6772 struct btr_blob_log_check_t {
6773 	/** Persistent cursor on a clustered index record with blobs. */
6774 	btr_pcur_t*	m_pcur;
6775 	/** Mini-transaction holding the latches for m_pcur */
6776 	mtr_t*		m_mtr;
6777 	/** rec_get_offsets(rec, index); offsets of clust_rec */
6778 	const ulint*	m_offsets;
6779 	/** The block containing the clustered record */
6780 	buf_block_t**	m_block;
6781 	/** The clustered record pointer */
6782 	rec_t**		m_rec;
6783 	/** The blob operation code */
6784 	enum blob_op	m_op;
6785 
6786 	/** Constructor
6787 	@param[in]	pcur		persistent cursor on a clustered
6788 					index record with blobs.
6789 	@param[in]	mtr		mini-transaction holding latches for
6790 					pcur.
6791 	@param[in]	offsets		offsets of the clust_rec
6792 	@param[in,out]	block		record block containing pcur record
6793 	@param[in,out]	rec		the clustered record pointer
6794 	@param[in]	op		the blob operation code */
6795 	btr_blob_log_check_t(
6796 		btr_pcur_t*	pcur,
6797 		mtr_t*		mtr,
6798 		const ulint*	offsets,
6799 		buf_block_t**	block,
6800 		rec_t**		rec,
6801 		enum blob_op	op)
6802 		: m_pcur(pcur),
6803 		  m_mtr(mtr),
6804 		  m_offsets(offsets),
6805 		  m_block(block),
6806 		  m_rec(rec),
6807 		  m_op(op)
6808 	{
6809 		ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
6810 		ut_ad((*m_block)->frame == page_align(*m_rec));
6811 		ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
6812 	}
6813 
6814 	/** Check if there is enough space in the redo log. Commit and restart
6815 	the mini-transaction. */
6816 	void check()
6817 	{
6818 		dict_index_t*	index = m_pcur->index();
6819 		ulint		offs = 0;
6820 		ulint		page_no = ULINT_UNDEFINED;
6821 		FlushObserver*	observer = m_mtr->get_flush_observer();
6822 
6823 		if (m_op == BTR_STORE_INSERT_BULK) {
6824 			offs = page_offset(*m_rec);
6825 			page_no = page_get_page_no(
6826 				buf_block_get_frame(*m_block));
6827 
6828 			buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__);
6829 		} else {
6830 			btr_pcur_store_position(m_pcur, m_mtr);
6831 		}
6832 		m_mtr->commit();
6833 
6834 		DEBUG_SYNC_C("blob_write_middle");
6835 
6836 		log_free_check();
6837 
6838 		DEBUG_SYNC_C("blob_write_middle_after_check");
6839 
6840 		const mtr_log_t log_mode = m_mtr->get_log_mode();
6841 		m_mtr->start();
6842 		m_mtr->set_log_mode(log_mode);
6843 		m_mtr->set_named_space(index->space);
6844 		m_mtr->set_flush_observer(observer);
6845 
6846 		if (m_op == BTR_STORE_INSERT_BULK) {
6847 			page_id_t       page_id(dict_index_get_space(index),
6848 						page_no);
6849 			page_size_t     page_size(dict_table_page_size(
6850 						index->table));
6851 			page_cur_t*	page_cur = &m_pcur->btr_cur.page_cur;
6852 
6853 			mtr_x_lock(dict_index_get_lock(index), m_mtr);
6854 			page_cur->block = btr_block_get(
6855 				page_id, page_size, RW_X_LATCH, index, m_mtr);
6856 			page_cur->rec = buf_block_get_frame(page_cur->block)
6857 				+ offs;
6858 
6859 			buf_block_buf_fix_dec(page_cur->block);
6860 		} else {
6861 			ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
6862 			bool ret = btr_pcur_restore_position(
6863 				BTR_MODIFY_LEAF | BTR_MODIFY_EXTERNAL,
6864 				m_pcur, m_mtr);
6865 
6866 			ut_a(ret);
6867 		}
6868 
6869 		*m_block	= btr_pcur_get_block(m_pcur);
6870 		*m_rec		= btr_pcur_get_rec(m_pcur);
6871 
6872 		ut_d(rec_offs_make_valid(
6873 			*m_rec, index, const_cast<ulint*>(m_offsets)));
6874 
6875 		ut_ad(m_mtr->memo_contains_page_flagged(
6876 		      *m_rec,
6877 		      MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)
6878 		      || dict_table_is_intrinsic(index->table));
6879 
6880 		ut_ad(mtr_memo_contains_flagged(m_mtr,
6881 		      dict_index_get_lock(index),
6882 		      MTR_MEMO_SX_LOCK | MTR_MEMO_X_LOCK)
6883 		      || dict_table_is_intrinsic(index->table));
6884 	}
6885 };
6886 
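/* Typical use, as in btr_store_big_rec_extern_fields() below (sketch only):

	btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
				      &rec, op);
	...
	redo_log.check();	// every commit_freq BLOB pages

check() commits the mini-transaction, calls log_free_check() so that we do
not run out of redo log space while holding the index latches, and then
restarts the mini-transaction; the caller's block and record pointers are
refreshed through the pointers passed to the constructor. */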
6887 
6888 /*******************************************************************//**
6889 Stores the fields in big_rec_vec to the tablespace and puts pointers to
6890 them in rec.  The extern flags in rec will have to be set beforehand.
6891 The fields are stored on pages allocated from the leaf node
6892 file segment of the index tree.
6893 
6894 TODO: If the allocation extends the tablespace, it will not be redo logged, in
6895 any mini-transaction.  Tablespace extension should be redo-logged, so that
6896 recovery will not fail when the big_rec was written to the extended portion of
6897 the file, in case the file was somehow truncated in the crash.
6898 
6899 @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
6900 dberr_t
6901 btr_store_big_rec_extern_fields(
6902 /*============================*/
6903 	btr_pcur_t*	pcur,		/*!< in/out: a persistent cursor. If
6904 					btr_mtr is restarted, then this can
6905 					be repositioned. */
6906 	const upd_t*	upd,		/*!< in: update vector */
6907 	ulint*		offsets,	/*!< in/out: rec_get_offsets() on
6908 					pcur. the "external storage" flags
6909 					in offsets will correctly correspond
6910 					to rec when this function returns */
6911 	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
6912 					to be stored externally */
6913 	mtr_t*		btr_mtr,	/*!< in/out: mtr containing the
6914 					latches to the clustered index. can be
6915 					committed and restarted. */
6916 	enum blob_op	op)		/*!< in: operation code */
6917 {
6918 	ulint		rec_page_no;
6919 	byte*		field_ref;
6920 	ulint		extern_len;
6921 	ulint		store_len;
6922 	ulint		page_no;
6923 	ulint		space_id;
6924 	ulint		prev_page_no;
6925 	ulint		hint_page_no;
6926 	ulint		i;
6927 	mtr_t		mtr;
6928 	mtr_t		mtr_bulk;
6929 	mem_heap_t*	heap = NULL;
6930 	page_zip_des_t*	page_zip;
6931 	z_stream	c_stream;
6932 	dberr_t		error		= DB_SUCCESS;
6933 	dict_index_t*	index		= pcur->index();
6934 	buf_block_t*	rec_block	= btr_pcur_get_block(pcur);
6935 	rec_t*		rec		= btr_pcur_get_rec(pcur);
6936 
6937 	ut_ad(rec_offs_validate(rec, index, offsets));
6938 	ut_ad(rec_offs_any_extern(offsets));
6939 	ut_ad(btr_mtr);
6940 	ut_ad(mtr_memo_contains_flagged(btr_mtr, dict_index_get_lock(index),
6941 					MTR_MEMO_X_LOCK
6942 					| MTR_MEMO_SX_LOCK)
6943 	      || dict_table_is_intrinsic(index->table)
6944 	      || !index->is_committed());
6945 	ut_ad(mtr_is_block_fix(
6946 		btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX, index->table));
6947 	ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
6948 	ut_a(dict_index_is_clust(index));
6949 
6950 	ut_a(dict_table_page_size(index->table)
6951 		.equals_to(rec_block->page.size));
6952 
6953 	btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
6954 				      &rec, op);
6955 	page_zip = buf_block_get_page_zip(rec_block);
6956 	space_id = rec_block->page.id.space();
6957 	rec_page_no = rec_block->page.id.page_no();
6958 	ut_a(fil_page_index_page_check(page_align(rec))
6959 	     || op == BTR_STORE_INSERT_BULK);
6960 
6961 	if (page_zip) {
6962 		int	err;
6963 
6964 		/* Zlib deflate needs 128 kilobytes for the default
6965 		window size, plus 512 << memLevel, plus a few
6966 		kilobytes for small objects.  We use reduced memLevel
6967 		to limit the memory consumption, and preallocate the
6968 		heap, hoping to avoid memory fragmentation. */
6969 		heap = mem_heap_create(250000);
6970 		page_zip_set_alloc(&c_stream, heap);
6971 
6972 		err = deflateInit2(&c_stream, page_zip_level,
6973 				   Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
6974 		ut_a(err == Z_OK);
6975 	}
6976 
6977 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6978 	/* All pointers to externally stored columns in the record
6979 	must either be zero or they must be pointers to inherited
6980 	columns, owned by this record or an earlier record version. */
6981 	for (i = 0; i < big_rec_vec->n_fields; i++) {
6982 		field_ref = btr_rec_get_field_ref(
6983 			rec, offsets, big_rec_vec->fields[i].field_no);
6984 
6985 		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
6986 		/* Either this must be an update in place,
6987 		or the BLOB must be inherited, or the BLOB pointer
6988 		must be zero (will be written in this function). */
6989 		ut_a(op == BTR_STORE_UPDATE
6990 		     || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
6991 		     || !memcmp(field_ref, field_ref_zero,
6992 				BTR_EXTERN_FIELD_REF_SIZE));
6993 	}
6994 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6995 
6996 	const page_size_t	page_size(dict_table_page_size(index->table));
6997 
6998 	/* Space available in compressed page to carry blob data */
6999 	const ulint	payload_size_zip = page_size.physical()
7000 		- FIL_PAGE_DATA;
7001 
7002 	/* Space available in uncompressed page to carry blob data */
7003 	const ulint	payload_size = page_size.physical()
7004 		- FIL_PAGE_DATA - BTR_BLOB_HDR_SIZE - FIL_PAGE_DATA_END;
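
	/* Note: a compressed BLOB page has no BLOB part header and no page
	trailer; the zlib stream starts right at FIL_PAGE_DATA.  An
	uncompressed BLOB page carries a BTR_BLOB_HDR_SIZE part header after
	FIL_PAGE_DATA and keeps the FIL_PAGE_DATA_END trailer, hence its
	smaller payload. */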
7005 
7006 	/* We have to create a file segment to the tablespace
7007 	for each field and put the pointer to the field in rec */
7008 
7009 	for (i = 0; i < big_rec_vec->n_fields; i++) {
7010 		const ulint field_no = big_rec_vec->fields[i].field_no;
7011 
7012 		field_ref = btr_rec_get_field_ref(rec, offsets, field_no);
7013 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
7014 		/* A zero BLOB pointer should have been initially inserted. */
7015 		ut_a(!memcmp(field_ref, field_ref_zero,
7016 			     BTR_EXTERN_FIELD_REF_SIZE));
7017 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
7018 		extern_len = big_rec_vec->fields[i].len;
7019 		UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
7020 				   extern_len);
7021 
7022 		ut_a(extern_len > 0);
7023 
7024 		prev_page_no = FIL_NULL;
7025 
7026 		if (page_zip) {
7027 			int	err = deflateReset(&c_stream);
7028 			ut_a(err == Z_OK);
7029 
7030 			c_stream.next_in = (Bytef*)
7031 				big_rec_vec->fields[i].data;
7032 			c_stream.avail_in = static_cast<uInt>(extern_len);
7033 		}
7034 
7035 		for (ulint blob_npages = 0;; ++blob_npages) {
7036 			buf_block_t*	block;
7037 			page_t*		page;
7038 			const ulint	commit_freq = 4;
7039 			ulint		r_extents;
7040 
7041 			ut_ad(page_align(field_ref) == page_align(rec));
7042 
7043 			if (!(blob_npages % commit_freq)) {
7044 
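				/* Periodically commit the mini-transaction
				and run log_free_check(); see
				btr_blob_log_check_t::check() above.  The
				cursor may be repositioned, so the cached
				field_ref, page_zip and rec_page_no are
				refreshed below. */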
7045 				redo_log.check();
7046 
7047 				field_ref = btr_rec_get_field_ref(
7048 					rec, offsets, field_no);
7049 
7050 				page_zip = buf_block_get_page_zip(rec_block);
7051 				rec_page_no = rec_block->page.id.page_no();
7052 			}
7053 
7054 			mtr_start(&mtr);
7055 			mtr.set_named_space(index->space);
7056 			mtr.set_log_mode(btr_mtr->get_log_mode());
7057 			mtr.set_flush_observer(btr_mtr->get_flush_observer());
7058 
7059 			buf_page_get(rec_block->page.id,
7060 				     rec_block->page.size, RW_X_LATCH, &mtr);
7061 
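			/* Pick an allocation hint that keeps BLOB pages
			close together: the first BLOB page next to the
			clustered index page, and each further page next to
			the previously allocated one. */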
7062 			if (prev_page_no == FIL_NULL) {
7063 				hint_page_no = 1 + rec_page_no;
7064 			} else {
7065 				hint_page_no = prev_page_no + 1;
7066 			}
7067 
7068 			mtr_t	*alloc_mtr;
7069 
7070 			if (op == BTR_STORE_INSERT_BULK) {
7071 				mtr_start(&mtr_bulk);
7072 				mtr_bulk.set_spaces(mtr);
7073 				alloc_mtr = &mtr_bulk;
7074 			} else {
7075 				alloc_mtr = &mtr;
7076 			}
7077 
7078 			if (!fsp_reserve_free_extents(&r_extents, space_id, 1,
7079 						      FSP_BLOB, alloc_mtr,
7080 						      1)) {
7081 
7082 				mtr_commit(alloc_mtr);
7083 				error = DB_OUT_OF_FILE_SPACE;
7084 				goto func_exit;
7085 			}
7086 
7087 			block = btr_page_alloc(index, hint_page_no, FSP_NO_DIR,
7088 					       0, alloc_mtr, &mtr);
7089 
7090 			alloc_mtr->release_free_extents(r_extents);
7091 
7092 			if (op == BTR_STORE_INSERT_BULK) {
7093 				mtr_commit(&mtr_bulk);
7094 			}
7095 
7096 			ut_a(block != NULL);
7097 
7098 			page_no = block->page.id.page_no();
7099 			page = buf_block_get_frame(block);
7100 
7101 			if (prev_page_no != FIL_NULL) {
7102 				buf_block_t*	prev_block;
7103 				page_t*		prev_page;
7104 
7105 				prev_block = buf_page_get(
7106 					page_id_t(space_id, prev_page_no),
7107 					rec_block->page.size,
7108 					RW_X_LATCH, &mtr);
7109 
7110 				buf_block_dbg_add_level(prev_block,
7111 							SYNC_EXTERN_STORAGE);
7112 				prev_page = buf_block_get_frame(prev_block);
7113 
7114 				if (page_zip) {
7115 					mlog_write_ulint(
7116 						prev_page + FIL_PAGE_NEXT,
7117 						page_no, MLOG_4BYTES, &mtr);
7118 					memcpy(buf_block_get_page_zip(
7119 						       prev_block)
7120 					       ->data + FIL_PAGE_NEXT,
7121 					       prev_page + FIL_PAGE_NEXT, 4);
7122 				} else {
7123 					mlog_write_ulint(
7124 						prev_page + FIL_PAGE_DATA
7125 						+ BTR_BLOB_HDR_NEXT_PAGE_NO,
7126 						page_no, MLOG_4BYTES, &mtr);
7127 				}
7128 
7129 			} else if (dict_index_is_online_ddl(index)) {
7130 				row_log_table_blob_alloc(index, page_no);
7131 			}
7132 
7133 			if (page_zip) {
7134 				int		err;
7135 				page_zip_des_t*	blob_page_zip;
7136 
7137 				/* Write FIL_PAGE_TYPE to the redo log
7138 				separately, before logging any other
7139 				changes to the page, so that the debug
7140 				assertions in
7141 				recv_parse_or_apply_log_rec_body() can
7142 				be made simpler.  Before InnoDB Plugin
7143 				1.0.4, the initialization of
7144 				FIL_PAGE_TYPE was logged as part of
7145 				the mlog_log_string() below. */
7146 
7147 				mlog_write_ulint(page + FIL_PAGE_TYPE,
7148 						 prev_page_no == FIL_NULL
7149 						 ? FIL_PAGE_TYPE_ZBLOB
7150 						 : FIL_PAGE_TYPE_ZBLOB2,
7151 						 MLOG_2BYTES, &mtr);
7152 
7153 				c_stream.next_out = page
7154 					+ FIL_PAGE_DATA;
7155 				c_stream.avail_out = static_cast<uInt>(
7156 					payload_size_zip);
7157 
7158 				err = deflate(&c_stream, Z_FINISH);
7159 				ut_a(err == Z_OK || err == Z_STREAM_END);
7160 				ut_a(err == Z_STREAM_END
7161 				     || c_stream.avail_out == 0);
7162 
7163 				/* Write the "next BLOB page" pointer */
7164 				mlog_write_ulint(page + FIL_PAGE_NEXT,
7165 						 FIL_NULL, MLOG_4BYTES, &mtr);
7166 				/* Initialize the unused "prev page" pointer */
7167 				mlog_write_ulint(page + FIL_PAGE_PREV,
7168 						 FIL_NULL, MLOG_4BYTES, &mtr);
7169 				/* Write a back pointer to the record
7170 				into the otherwise unused area.  This
7171 				information could be useful in
7172 				debugging.  Later, we might want to
7173 				implement the possibility to relocate
7174 				BLOB pages.  Then, we would need to be
7175 				able to adjust the BLOB pointer in the
7176 				record.  We do not store the heap
7177 				number of the record, because it can
7178 				change in page_zip_reorganize() or
7179 				btr_page_reorganize().  However, also
7180 				the page number of the record may
7181 				change when B-tree nodes are split or
7182 				merged.
7183 				NOTE: FIL_PAGE_FILE_FLUSH_LSN space is
7184 				used by R-tree index for a Split Sequence
7185 				Number */
7186 				ut_ad(!dict_index_is_spatial(index));
7187 
7188 				mlog_write_ulint(page
7189 						 + FIL_PAGE_FILE_FLUSH_LSN,
7190 						 space_id,
7191 						 MLOG_4BYTES, &mtr);
7192 				mlog_write_ulint(page
7193 						 + FIL_PAGE_FILE_FLUSH_LSN + 4,
7194 						 rec_page_no,
7195 						 MLOG_4BYTES, &mtr);
7196 
7197 				/* Zero out the unused part of the page. */
7198 				memset(page + page_zip_get_size(page_zip)
7199 				       - c_stream.avail_out,
7200 				       0, c_stream.avail_out);
7201 				mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN,
7202 						page_zip_get_size(page_zip)
7203 						- FIL_PAGE_FILE_FLUSH_LSN,
7204 						&mtr);
7205 				/* Copy the page to compressed storage,
7206 				because it will be flushed to disk
7207 				from there. */
7208 				blob_page_zip = buf_block_get_page_zip(block);
7209 				ut_ad(blob_page_zip);
7210 				ut_ad(page_zip_get_size(blob_page_zip)
7211 				      == page_zip_get_size(page_zip));
7212 				memcpy(blob_page_zip->data, page,
7213 				       page_zip_get_size(page_zip));
7214 
7215 				if (err == Z_OK && prev_page_no != FIL_NULL) {
7216 
7217 					goto next_zip_page;
7218 				}
7219 
7220 				if (err == Z_STREAM_END) {
7221 					mach_write_to_4(field_ref
7222 							+ BTR_EXTERN_LEN, 0);
7223 					mach_write_to_4(field_ref
7224 							+ BTR_EXTERN_LEN + 4,
7225 							c_stream.total_in);
7226 				} else {
7227 					memset(field_ref + BTR_EXTERN_LEN,
7228 					       0, 8);
7229 				}
7230 
7231 				if (prev_page_no == FIL_NULL) {
7232 					ut_ad(blob_npages == 0);
7233 					mach_write_to_4(field_ref
7234 							+ BTR_EXTERN_SPACE_ID,
7235 							space_id);
7236 
7237 					mach_write_to_4(field_ref
7238 							+ BTR_EXTERN_PAGE_NO,
7239 							page_no);
7240 
7241 					mach_write_to_4(field_ref
7242 							+ BTR_EXTERN_OFFSET,
7243 							FIL_PAGE_NEXT);
7244 				}
7245 
7246 				/* The page is compressed only when the bulk insert finishes. */
7247 				if (op != BTR_STORE_INSERT_BULK) {
7248 					page_zip_write_blob_ptr(
7249 						page_zip, rec, index, offsets,
7250 						field_no, &mtr);
7251 				}
7252 
7253 next_zip_page:
7254 				prev_page_no = page_no;
7255 
7256 				/* Commit mtr and release the
7257 				uncompressed page frame to save memory. */
7258 				btr_blob_free(index, block, FALSE, &mtr);
7259 
7260 				if (err == Z_STREAM_END) {
7261 					break;
7262 				}
7263 			} else {
7264 				mlog_write_ulint(page + FIL_PAGE_TYPE,
7265 						 FIL_PAGE_TYPE_BLOB,
7266 						 MLOG_2BYTES, &mtr);
7267 
7268 				if (extern_len > payload_size) {
7269 					store_len = payload_size;
7270 				} else {
7271 					store_len = extern_len;
7272 				}
7273 
7274 				mlog_write_string(page + FIL_PAGE_DATA
7275 						  + BTR_BLOB_HDR_SIZE,
7276 						  (const byte*)
7277 						  big_rec_vec->fields[i].data
7278 						  + big_rec_vec->fields[i].len
7279 						  - extern_len,
7280 						  store_len, &mtr);
7281 				mlog_write_ulint(page + FIL_PAGE_DATA
7282 						 + BTR_BLOB_HDR_PART_LEN,
7283 						 store_len, MLOG_4BYTES, &mtr);
7284 				mlog_write_ulint(page + FIL_PAGE_DATA
7285 						 + BTR_BLOB_HDR_NEXT_PAGE_NO,
7286 						 FIL_NULL, MLOG_4BYTES, &mtr);
7287 
7288 				extern_len -= store_len;
7289 
7290 				mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
7291 						 MLOG_4BYTES, &mtr);
7292 				mlog_write_ulint(field_ref
7293 						 + BTR_EXTERN_LEN + 4,
7294 						 big_rec_vec->fields[i].len
7295 						 - extern_len,
7296 						 MLOG_4BYTES, &mtr);
7297 
7298 				if (prev_page_no == FIL_NULL) {
7299 					ut_ad(blob_npages == 0);
7300 					mlog_write_ulint(field_ref
7301 							 + BTR_EXTERN_SPACE_ID,
7302 							 space_id, MLOG_4BYTES,
7303 							 &mtr);
7304 
7305 					mlog_write_ulint(field_ref
7306 							 + BTR_EXTERN_PAGE_NO,
7307 							 page_no, MLOG_4BYTES,
7308 							 &mtr);
7309 
7310 					mlog_write_ulint(field_ref
7311 							 + BTR_EXTERN_OFFSET,
7312 							 FIL_PAGE_DATA,
7313 							 MLOG_4BYTES,
7314 							 &mtr);
7315 				}
7316 
7317 				prev_page_no = page_no;
7318 
7319 				mtr_commit(&mtr);
7320 
7321 				if (extern_len == 0) {
7322 					break;
7323 				}
7324 			}
7325 		}
7326 
7327 		DBUG_EXECUTE_IF("btr_store_big_rec_extern",
7328 				error = DB_OUT_OF_FILE_SPACE;
7329 				goto func_exit;);
7330 
7331 		rec_offs_make_nth_extern(offsets, field_no);
7332 	}
7333 
7334 func_exit:
7335 	if (page_zip) {
7336 		deflateEnd(&c_stream);
7337 	}
7338 
7339 	if (heap != NULL) {
7340 		mem_heap_free(heap);
7341 	}
7342 
7343 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
7344 	/* All pointers to externally stored columns in the record
7345 	must be valid. */
7346 	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
7347 		if (!rec_offs_nth_extern(offsets, i)) {
7348 			continue;
7349 		}
7350 
7351 		field_ref = btr_rec_get_field_ref(rec, offsets, i);
7352 
7353 		/* The pointer must not be zero if the operation
7354 		succeeded. */
7355 		ut_a(0 != memcmp(field_ref, field_ref_zero,
7356 				 BTR_EXTERN_FIELD_REF_SIZE)
7357 		     || error != DB_SUCCESS);
7358 		/* The column must not be disowned by this record. */
7359 		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
7360 	}
7361 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
7362 	return(error);
7363 }
7364 
7365 /*******************************************************************//**
7366 Check the FIL_PAGE_TYPE on an uncompressed BLOB page. */
7367 static
7368 void
7369 btr_check_blob_fil_page_type(
7370 /*=========================*/
7371 	ulint		space_id,	/*!< in: space id */
7372 	ulint		page_no,	/*!< in: page number */
7373 	const page_t*	page,		/*!< in: page */
7374 	ibool		read)		/*!< in: TRUE=read, FALSE=purge */
7375 {
7376 	ulint	type = fil_page_get_type(page);
7377 
7378 	ut_a(space_id == page_get_space_id(page));
7379 	ut_a(page_no == page_get_page_no(page));
7380 
7381 	if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
7382 		ulint	flags = fil_space_get_flags(space_id);
7383 
7384 #ifndef UNIV_DEBUG /* Improve debug test coverage */
7385 		if (dict_tf_get_format(flags) == UNIV_FORMAT_A) {
7386 			/* Old versions of InnoDB did not initialize
7387 			FIL_PAGE_TYPE on BLOB pages.  Do not print
7388 			anything about the type mismatch when reading
7389 			a BLOB page that is in Antelope format.*/
7390 			return;
7391 		}
7392 #endif /* !UNIV_DEBUG */
7393 
7394 		ib::fatal() << "FIL_PAGE_TYPE=" << type
7395 			<< " on BLOB " << (read ? "read" : "purge")
7396 			<< " space " << space_id << " page " << page_no
7397 			<< " flags " << flags;
7398 	}
7399 }
7400 
7401 /*******************************************************************//**
7402 Frees the space in an externally stored field to the file space
7403 management if the field in data is owned by the externally stored field;
7404 in a rollback we may have the additional condition that the field must
7405 not be inherited. */
7406 void
7407 btr_free_externally_stored_field(
7408 /*=============================*/
7409 	dict_index_t*	index,		/*!< in: index of the data, the index
7410 					tree MUST be X-latched; if the tree
7411 					height is 1, then also the root page
7412 					must be X-latched! (this is relevant
7413 					in the case this function is called
7414 					from purge where 'data' is located on
7415 					an undo log page, not an index
7416 					page) */
7417 	byte*		field_ref,	/*!< in/out: field reference */
7418 	const rec_t*	rec,		/*!< in: record containing field_ref, for
7419 					page_zip_write_blob_ptr(), or NULL */
7420 	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
7421 					or NULL */
7422 	page_zip_des_t*	page_zip,	/*!< in: compressed page corresponding
7423 					to rec, or NULL if rec == NULL */
7424 	ulint		i,		/*!< in: field number of field_ref;
7425 					ignored if rec == NULL */
7426 	bool		rollback,	/*!< in: performing rollback? */
7427 	mtr_t*		local_mtr)	/*!< in: mtr
7428 					containing the latch to data and an
7429 					X-latch to the index tree */
7430 {
7431 	page_t*		page;
7432 	const ulint	space_id	= mach_read_from_4(
7433 		field_ref + BTR_EXTERN_SPACE_ID);
7434 	const ulint	start_page	= mach_read_from_4(
7435 		field_ref + BTR_EXTERN_PAGE_NO);
7436 	ulint		page_no;
7437 	ulint		next_page_no;
7438 	mtr_t		mtr;
7439 
7440 	ut_ad(dict_index_is_clust(index));
7441 	ut_ad(mtr_memo_contains_flagged(local_mtr, dict_index_get_lock(index),
7442 					MTR_MEMO_X_LOCK
7443 					| MTR_MEMO_SX_LOCK)
7444 	      || dict_table_is_intrinsic(index->table));
7445 	ut_ad(mtr_is_page_fix(
7446 		local_mtr, field_ref, MTR_MEMO_PAGE_X_FIX, index->table));
7447 	ut_ad(!rec || rec_offs_validate(rec, index, offsets));
7448 	ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
7449 	ut_ad(local_mtr->is_named_space(
7450 		      page_get_space_id(page_align(field_ref))));
7451 
7452 	if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
7453 				  BTR_EXTERN_FIELD_REF_SIZE))) {
7454 		/* In the rollback, we may encounter a clustered index
7455 		record with some unwritten off-page columns. There is
7456 		nothing to free then. */
7457 		ut_a(rollback);
7458 		return;
7459 	}
7460 
7461 	ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN)
7462 	        & ~((BTR_EXTERN_OWNER_FLAG
7463 	             | BTR_EXTERN_INHERITED_FLAG) << 24)));
7464 	ut_ad(space_id == index->space);
7465 
7466 	const page_size_t	ext_page_size(dict_table_page_size(index->table));
7467 	const page_size_t&	rec_page_size(rec == NULL
7468 					      ? univ_page_size
7469 					      : ext_page_size);
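
	/* When called from purge, rec == NULL and field_ref resides on an
	undo log page (see the function comment above), which is never
	compressed; hence univ_page_size is used to latch that page. */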
7470 	if (rec == NULL) {
7471 		/* This is a call from row_purge_upd_exist_or_extern(). */
7472 		ut_ad(!page_zip);
7473 	}
7474 
7475 	for (;;) {
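		/* Each iteration frees one BLOB page in its own
		mini-transaction and then advances the field reference to the
		next page in the chain, so an interrupted free leaves the
		reference pointing at the head of the remaining chain. */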
7476 #ifdef UNIV_DEBUG
7477 		buf_block_t*	rec_block;
7478 #endif /* UNIV_DEBUG */
7479 		buf_block_t*	ext_block;
7480 
7481 		mtr_start(&mtr);
7482 		mtr.set_spaces(*local_mtr);
7483 		mtr.set_log_mode(local_mtr->get_log_mode());
7484 
7485 		ut_ad(!dict_table_is_temporary(index->table)
7486 		      || local_mtr->get_log_mode() == MTR_LOG_NO_REDO);
7487 
7488 		const page_t*	p = page_align(field_ref);
7489 
7490 		const page_id_t	page_id(page_get_space_id(p),
7491 					page_get_page_no(p));
7492 
7493 #ifdef UNIV_DEBUG
7494 		rec_block =
7495 #endif /* UNIV_DEBUG */
7496 		buf_page_get(page_id, rec_page_size, RW_X_LATCH, &mtr);
7497 
7498 		buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
7499 		page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
7500 
7501 		if (/* There is no external storage data */
7502 		    page_no == FIL_NULL
7503 		    /* This field does not own the externally stored field */
7504 		    || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
7505 			& BTR_EXTERN_OWNER_FLAG)
7506 		    /* Rollback and inherited field */
7507 		    || (rollback
7508 			&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
7509 			    & BTR_EXTERN_INHERITED_FLAG))) {
7510 
7511 			/* Do not free */
7512 			mtr_commit(&mtr);
7513 
7514 			return;
7515 		}
7516 
7517 		if (page_no == start_page && dict_index_is_online_ddl(index)) {
7518 			row_log_table_blob_free(index, start_page);
7519 		}
7520 
7521 		ext_block = buf_page_get(
7522 			page_id_t(space_id, page_no), ext_page_size,
7523 			RW_X_LATCH, &mtr);
7524 
7525 		buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
7526 		page = buf_block_get_frame(ext_block);
7527 
7528 		if (ext_page_size.is_compressed()) {
7529 			/* Note that page_zip will be NULL
7530 			in row_purge_upd_exist_or_extern(). */
7531 			switch (fil_page_get_type(page)) {
7532 			case FIL_PAGE_TYPE_ZBLOB:
7533 			case FIL_PAGE_TYPE_ZBLOB2:
7534 				break;
7535 			default:
7536 				ut_error;
7537 			}
7538 			next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
7539 
7540 			btr_page_free_low(index, ext_block, ULINT_UNDEFINED,
7541 					  &mtr);
7542 
7543 			if (page_zip != NULL) {
7544 				mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
7545 						next_page_no);
7546 				mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
7547 						0);
7548 				page_zip_write_blob_ptr(page_zip, rec, index,
7549 							offsets, i, &mtr);
7550 			} else {
7551 				mlog_write_ulint(field_ref
7552 						 + BTR_EXTERN_PAGE_NO,
7553 						 next_page_no,
7554 						 MLOG_4BYTES, &mtr);
7555 				mlog_write_ulint(field_ref
7556 						 + BTR_EXTERN_LEN + 4, 0,
7557 						 MLOG_4BYTES, &mtr);
7558 			}
7559 		} else {
7560 			ut_a(!page_zip);
7561 			btr_check_blob_fil_page_type(space_id, page_no, page,
7562 						     FALSE);
7563 
7564 			next_page_no = mach_read_from_4(
7565 				page + FIL_PAGE_DATA
7566 				+ BTR_BLOB_HDR_NEXT_PAGE_NO);
7567 
7568 			btr_page_free_low(index, ext_block, ULINT_UNDEFINED,
7569 					  &mtr);
7570 
7571 			mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
7572 					 next_page_no,
7573 					 MLOG_4BYTES, &mtr);
7574 			/* Zero out the BLOB length.  If the server
7575 			crashes during the execution of this function,
7576 			trx_rollback_or_clean_all_recovered() could
7577 			dereference the half-deleted BLOB, fetching a
7578 			wrong prefix for the BLOB. */
7579 			mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
7580 					 0,
7581 					 MLOG_4BYTES, &mtr);
7582 		}
7583 
7584 		/* Commit mtr and release the BLOB block to save memory. */
7585 		btr_blob_free(index, ext_block, TRUE, &mtr);
7586 	}
7587 }
7588 
7589 /***********************************************************//**
7590 Frees the externally stored fields for a record. */
7591 static
7592 void
7593 btr_rec_free_externally_stored_fields(
7594 /*==================================*/
7595 	dict_index_t*	index,	/*!< in: index of the data, the index
7596 				tree MUST be X-latched */
7597 	rec_t*		rec,	/*!< in/out: record */
7598 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
7599 	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
7600 				part will be updated, or NULL */
7601 	bool		rollback,/*!< in: performing rollback? */
7602 	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
7603 				an X-latch to record page and to the index
7604 				tree */
7605 {
7606 	ulint	n_fields;
7607 	ulint	i;
7608 
7609 	ut_ad(rec_offs_validate(rec, index, offsets));
7610 	ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table));
7611 	/* Free possible externally stored fields in the record */
7612 
7613 	ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
7614 	n_fields = rec_offs_n_fields(offsets);
7615 
7616 	for (i = 0; i < n_fields; i++) {
7617 		if (rec_offs_nth_extern(offsets, i)) {
7618 			btr_free_externally_stored_field(
7619 				index, btr_rec_get_field_ref(rec, offsets, i),
7620 				rec, offsets, page_zip, i, rollback, mtr);
7621 		}
7622 	}
7623 }
7624 
7625 /***********************************************************//**
7626 Frees the externally stored fields for a record, if the field is mentioned
7627 in the update vector. */
7628 static
7629 void
7630 btr_rec_free_updated_extern_fields(
7631 /*===============================*/
7632 	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
7633 				X-latched */
7634 	rec_t*		rec,	/*!< in/out: record */
7635 	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
7636 				part will be updated, or NULL */
7637 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
7638 	const upd_t*	update,	/*!< in: update vector */
7639 	bool		rollback,/*!< in: performing rollback? */
7640 	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
7641 				an X-latch to record page and to the tree */
7642 {
7643 	ulint	n_fields;
7644 	ulint	i;
7645 
7646 	ut_ad(rec_offs_validate(rec, index, offsets));
7647 	ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table));
7648 
7649 	/* Free possible externally stored fields in the record */
7650 
7651 	n_fields = upd_get_n_fields(update);
7652 
7653 	for (i = 0; i < n_fields; i++) {
7654 		const upd_field_t* ufield = upd_get_nth_field(update, i);
7655 
7656 		if (rec_offs_nth_extern(offsets, ufield->field_no)) {
7657 			ulint	len;
7658 			byte*	data = rec_get_nth_field(
7659 				rec, offsets, ufield->field_no, &len);
7660 			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
7661 
7662 			btr_free_externally_stored_field(
7663 				index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
7664 				rec, offsets, page_zip,
7665 				ufield->field_no, rollback, mtr);
7666 		}
7667 	}
7668 }
7669 
7670 /*******************************************************************//**
7671 Copies the prefix of an uncompressed BLOB.  The clustered index record
7672 that points to this BLOB must be protected by a lock or a page latch.
7673 @return number of bytes written to buf */
7674 static
7675 ulint
7676 btr_copy_blob_prefix(
7677 /*=================*/
7678 	byte*		buf,	/*!< out: the externally stored part of
7679 				the field, or a prefix of it */
7680 	ulint		len,	/*!< in: length of buf, in bytes */
7681 	ulint		space_id,/*!< in: space id of the BLOB pages */
7682 	ulint		page_no,/*!< in: page number of the first BLOB page */
7683 	ulint		offset)	/*!< in: offset on the first BLOB page */
7684 {
7685 	ulint	copied_len	= 0;
7686 
7687 	for (;;) {
7688 		mtr_t		mtr;
7689 		buf_block_t*	block;
7690 		const page_t*	page;
7691 		const byte*	blob_header;
7692 		ulint		part_len;
7693 		ulint		copy_len;
7694 
7695 		mtr_start(&mtr);
7696 
7697 		block = buf_page_get(page_id_t(space_id, page_no),
7698 				     univ_page_size, RW_S_LATCH, &mtr);
7699 		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
7700 		page = buf_block_get_frame(block);
7701 
7702 		btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);
7703 
7704 		blob_header = page + offset;
7705 		part_len = btr_blob_get_part_len(blob_header);
7706 		copy_len = ut_min(part_len, len - copied_len);
7707 
7708 		memcpy(buf + copied_len,
7709 		       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
7710 		copied_len += copy_len;
7711 
7712 		page_no = btr_blob_get_next_page_no(blob_header);
7713 
7714 		mtr_commit(&mtr);
7715 
7716 		if (page_no == FIL_NULL || copy_len != part_len) {
7717 			UNIV_MEM_ASSERT_RW(buf, copied_len);
7718 			return(copied_len);
7719 		}
7720 
7721 		/* On BLOB pages other than the first, the BLOB header is
7722 		always at the start of the page data: */
7723 
7724 		offset = FIL_PAGE_DATA;
7725 
7726 		ut_ad(copied_len <= len);
7727 	}
7728 }
7729 
7730 /** Copies the prefix of a compressed BLOB.
7731 The clustered index record that points to this BLOB must be protected
7732 by a lock or a page latch.
7733 @param[out]	buf		the externally stored part of the field,
7734 or a prefix of it
7735 @param[in]	len		length of buf, in bytes
7736 @param[in]	page_size	compressed BLOB page size
7737 @param[in]	space_id	space id of the BLOB pages
@param[in]	page_no		page number of the first BLOB page
7738 @param[in]	offset		offset on the first BLOB page
7739 @return number of bytes written to buf */
7740 static
7741 ulint
7742 btr_copy_zblob_prefix(
7743 	byte*			buf,
7744 	ulint			len,
7745 	const page_size_t&	page_size,
7746 	ulint			space_id,
7747 	ulint			page_no,
7748 	ulint			offset)
7749 {
7750 	ulint		page_type = FIL_PAGE_TYPE_ZBLOB;
7751 	mem_heap_t*	heap;
7752 	int		err;
7753 	z_stream	d_stream;
7754 
7755 	d_stream.next_out = buf;
7756 	d_stream.avail_out = static_cast<uInt>(len);
7757 	d_stream.next_in = Z_NULL;
7758 	d_stream.avail_in = 0;
7759 
7760 	/* Zlib inflate needs 32 kilobytes for the default
7761 	window size, plus a few kilobytes for small objects. */
7762 	heap = mem_heap_create(40000);
7763 	page_zip_set_alloc(&d_stream, heap);
7764 
7765 	ut_ad(page_size.is_compressed());
7766 	ut_ad(space_id);
7767 
7768 	err = inflateInit(&d_stream);
7769 	ut_a(err == Z_OK);
7770 
7771 	for (;;) {
7772 		buf_page_t*	bpage;
7773 		ulint		next_page_no;
7774 
7775 		/* There is no latch on bpage directly.  Instead,
7776 		bpage is protected by the B-tree page latch that
7777 		is being held on the clustered index record, or,
7778 		in row_merge_copy_blobs(), by an exclusive table lock. */
7779 		bpage = buf_page_get_zip(page_id_t(space_id, page_no),
7780 					 page_size);
7781 
7782 		if (UNIV_UNLIKELY(!bpage)) {
7783 			ib::error() << "Cannot load compressed BLOB "
7784 				<< page_id_t(space_id, page_no);
7785 			goto func_exit;
7786 		}
7787 
7788 		if (UNIV_UNLIKELY
7789 		    (fil_page_get_type(bpage->zip.data) != page_type)) {
7790 
7791 			ib::error() << "Unexpected type "
7792 				<< fil_page_get_type(bpage->zip.data)
7793 				<< " of compressed BLOB page "
7794 				<< page_id_t(space_id, page_no);
7795 
7796 			ut_ad(0);
7797 			goto end_of_blob;
7798 		}
7799 
7800 		next_page_no = mach_read_from_4(bpage->zip.data + offset);
7801 
7802 		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
7803 			/* When the BLOB begins at page header,
7804 			the compressed data payload does not
7805 			immediately follow the next page pointer. */
7806 			offset = FIL_PAGE_DATA;
7807 		} else {
7808 			offset += 4;
7809 		}
7810 
7811 		d_stream.next_in = bpage->zip.data + offset;
7812 		d_stream.avail_in = static_cast<uInt>(page_size.physical()
7813 						      - offset);
7814 
7815 		err = inflate(&d_stream, Z_NO_FLUSH);
7816 		switch (err) {
7817 		case Z_OK:
7818 			if (!d_stream.avail_out) {
7819 				goto end_of_blob;
7820 			}
7821 			break;
7822 		case Z_STREAM_END:
7823 			if (next_page_no == FIL_NULL) {
7824 				goto end_of_blob;
7825 			}
7826 			/* fall through */
7827 		default:
7828 inflate_error:
7829 			ib::error() << "inflate() of compressed BLOB page "
7830 				<< page_id_t(space_id, page_no)
7831 				<< " returned " << err
7832 				<< " (" << d_stream.msg << ")";
7833 
7834 		case Z_BUF_ERROR:
7835 			goto end_of_blob;
7836 		}
7837 
7838 		if (next_page_no == FIL_NULL) {
7839 			if (!d_stream.avail_in) {
7840 				ib::error()
7841 					<< "Unexpected end of compressed "
7842 					<< "BLOB page "
7843 					<< page_id_t(space_id, page_no);
7844 			} else {
7845 				err = inflate(&d_stream, Z_FINISH);
7846 				switch (err) {
7847 				case Z_STREAM_END:
7848 				case Z_BUF_ERROR:
7849 					break;
7850 				default:
7851 					goto inflate_error;
7852 				}
7853 			}
7854 
7855 end_of_blob:
7856 			buf_page_release_zip(bpage);
7857 			goto func_exit;
7858 		}
7859 
7860 		buf_page_release_zip(bpage);
7861 
7862 		/* On BLOB pages other than the first, the BLOB
7863 		header is always in the page header: */
7864 
7865 		page_no = next_page_no;
7866 		offset = FIL_PAGE_NEXT;
7867 		page_type = FIL_PAGE_TYPE_ZBLOB2;
7868 	}
7869 
7870 func_exit:
7871 	inflateEnd(&d_stream);
7872 	mem_heap_free(heap);
7873 	UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
7874 	return(d_stream.total_out);
7875 }
7876 
7877 /** Copies the prefix of an externally stored field of a record.
7878 The clustered index record that points to this BLOB must be protected
7879 by a lock or a page latch.
7880 @param[out]	buf		the externally stored part of the
7881 field, or a prefix of it
7882 @param[in]	len		length of buf, in bytes
7883 @param[in]	page_size	BLOB page size
7884 @param[in]	space_id	space id of the first BLOB page
7885 @param[in]	page_no		page number of the first BLOB page
7886 @param[in]	offset		offset on the first BLOB page
7887 @return number of bytes written to buf */
7888 static
7889 ulint
7890 btr_copy_externally_stored_field_prefix_low(
7891 	byte*			buf,
7892 	ulint			len,
7893 	const page_size_t&	page_size,
7894 	ulint			space_id,
7895 	ulint			page_no,
7896 	ulint			offset)
7897 {
7898 	if (len == 0) {
7899 		return(0);
7900 	}
7901 
7902 	if (page_size.is_compressed()) {
7903 		return(btr_copy_zblob_prefix(buf, len, page_size,
7904 					     space_id, page_no, offset));
7905 	} else {
7906 		ut_ad(page_size.equals_to(univ_page_size));
7907 		return(btr_copy_blob_prefix(buf, len, space_id,
7908 					    page_no, offset));
7909 	}
7910 }
7911 
7912 /** Copies the prefix of an externally stored field of a record.
7913 The clustered index record must be protected by a lock or a page latch.
7914 @param[out]	buf		the field, or a prefix of it
7915 @param[in]	len		length of buf, in bytes
7916 @param[in]	page_size	BLOB page size
7917 @param[in]	data		'internally' stored part of the field
7918 containing also the reference to the external part; must be protected by
7919 a lock or a page latch
7920 @param[in]	local_len	length of data, in bytes
7921 @return the length of the copied field, or 0 if the column was being
7922 or has been deleted */
7923 ulint
7924 btr_copy_externally_stored_field_prefix(
7925 	byte*			buf,
7926 	ulint			len,
7927 	const page_size_t&	page_size,
7928 	const byte*		data,
7929 	ulint			local_len)
7930 {
7931 	ulint	space_id;
7932 	ulint	page_no;
7933 	ulint	offset;
7934 
7935 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
7936 
7937 	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
7938 
7939 	if (UNIV_UNLIKELY(local_len >= len)) {
7940 		memcpy(buf, data, len);
7941 		return(len);
7942 	}
7943 
7944 	memcpy(buf, data, local_len);
7945 	data += local_len;
7946 
7947 	ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
7948 
7949 	if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
7950 		/* The externally stored part of the column has been
7951 		(partially) deleted.  Signal the half-deleted BLOB
7952 		to the caller. */
7953 
7954 		return(0);
7955 	}
7956 
7957 	space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
7958 
7959 	page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
7960 
7961 	offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
7962 
7963 	return(local_len
7964 	       + btr_copy_externally_stored_field_prefix_low(buf + local_len,
7965 							     len - local_len,
7966 							     page_size,
7967 							     space_id, page_no,
7968 							     offset));
7969 }
7970 
7971 /** Copies an externally stored field of a record to mem heap.
7972 The clustered index record must be protected by a lock or a page latch.
7973 @param[out]	len		length of the whole field
7974 @param[in]	data		'internally' stored part of the field
7975 containing also the reference to the external part; must be protected by
7976 a lock or a page latch
7977 @param[in]	page_size	BLOB page size
7978 @param[in]	local_len	length of data
7979 @param[in,out]	heap		mem heap
7980 @return the whole field copied to heap */
7981 byte*
7982 btr_copy_externally_stored_field(
7983 	ulint*			len,
7984 	const byte*		data,
7985 	const page_size_t&	page_size,
7986 	ulint			local_len,
7987 	mem_heap_t*		heap)
7988 {
7989 	ulint	space_id;
7990 	ulint	page_no;
7991 	ulint	offset;
7992 	ulint	extern_len;
7993 	byte*	buf;
7994 
7995 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
7996 
7997 	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
7998 
7999 	space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);
8000 
8001 	page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);
8002 
8003 	offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);
8004 
8005 	/* Currently a BLOB cannot be bigger than 4 GB; we
8006 	leave the 4 upper bytes in the length field unused */
8007 
8008 	extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
8009 
8010 	buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);
8011 
8012 	memcpy(buf, data, local_len);
8013 	*len = local_len
8014 		+ btr_copy_externally_stored_field_prefix_low(buf + local_len,
8015 							      extern_len,
8016 							      page_size,
8017 							      space_id,
8018 							      page_no, offset);
8019 
8020 	return(buf);
8021 }
8022 
8023 /** Copies an externally stored field of a record to mem heap.
8024 @param[in]	rec		record in a clustered index; must be
8025 protected by a lock or a page latch
8026 @param[in]	offsets		array returned by rec_get_offsets()
8027 @param[in]	page_size	BLOB page size
8028 @param[in]	no		field number
8029 @param[out]	len		length of the field
8030 @param[in,out]	heap		mem heap
8031 @return the field copied to heap, or NULL if the field is incomplete */
8032 byte*
8033 btr_rec_copy_externally_stored_field(
8034 	const rec_t*		rec,
8035 	const ulint*		offsets,
8036 	const page_size_t&	page_size,
8037 	ulint			no,
8038 	ulint*			len,
8039 	mem_heap_t*		heap)
8040 {
8041 	ulint		local_len;
8042 	const byte*	data;
8043 
8044 	ut_a(rec_offs_nth_extern(offsets, no));
8045 
8046 	/* An externally stored field can contain some initial
8047 	data from the field, and in the last 20 bytes it has the
8048 	space id, page number, and offset where the rest of the
8049 	field data is stored, and the data length in addition to
8050 	the data stored locally. We may need to store some data
8051 	locally to get the local record length above the 128 byte
8052 	limit so that field offsets are stored in two bytes, and
8053 	the extern bit is available in those two bytes. */
8054 
8055 	data = rec_get_nth_field(rec, offsets, no, &local_len);
8056 
8057 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
8058 
8059 	if (UNIV_UNLIKELY
8060 	    (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
8061 		     field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
8062 		/* The externally stored field was not written yet.
8063 		This record should only be seen by
8064 		recv_recovery_rollback_active() or any
8065 		TRX_ISO_READ_UNCOMMITTED transactions. */
8066 		return(NULL);
8067 	}
8068 
8069 	return(btr_copy_externally_stored_field(len, data,
8070 						page_size, local_len, heap));
8071 }
8072 #endif /* !UNIV_HOTBACKUP */
8073