1 /*****************************************************************************
2 
3 Copyright (c) 1994, 2019, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2008, Google Inc.
5 Copyright (c) 2012, Facebook Inc.
6 Copyright (c) 2015, 2021, MariaDB Corporation.
7 
8 Portions of this file contain modifications contributed and copyrighted by
9 Google, Inc. Those modifications are gratefully acknowledged and are described
10 briefly in the InnoDB documentation. The contributions by Google are
11 incorporated with their permission, and subject to the conditions contained in
12 the file COPYING.Google.
13 
14 This program is free software; you can redistribute it and/or modify it under
15 the terms of the GNU General Public License as published by the Free Software
16 Foundation; version 2 of the License.
17 
18 This program is distributed in the hope that it will be useful, but WITHOUT
19 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
20 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
21 
22 You should have received a copy of the GNU General Public License along with
23 this program; if not, write to the Free Software Foundation, Inc.,
24 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
25 
26 *****************************************************************************/
27 
28 /**************************************************//**
29 @file btr/btr0cur.cc
30 The index tree cursor
31 
32 All changes that row operations make to a B-tree or the records
33 there must go through this module! Undo log records are written here
34 of every modify or insert of a clustered index record.
35 
36 			NOTE!!!
37 To make sure we do not run out of disk space during a pessimistic
38 insert or update, we have to reserve 2 x the height of the index tree
39 many pages in the tablespace before we start the operation, because
40 if leaf splitting has been started, it is difficult to undo, except
41 by crashing the database and doing a roll-forward.
42 
43 Created 10/16/1994 Heikki Tuuri
44 *******************************************************/
45 
46 #include "btr0cur.h"
47 #include "row0upd.h"
48 #include "mtr0log.h"
49 #include "page0page.h"
50 #include "page0zip.h"
51 #include "rem0rec.h"
52 #include "rem0cmp.h"
53 #include "buf0lru.h"
54 #include "btr0btr.h"
55 #include "btr0sea.h"
56 #include "row0log.h"
57 #include "row0purge.h"
58 #include "row0upd.h"
59 #include "trx0rec.h"
60 #include "trx0roll.h"
61 #include "que0que.h"
62 #include "row0row.h"
63 #include "srv0srv.h"
64 #include "ibuf0ibuf.h"
65 #include "lock0lock.h"
66 #include "zlib.h"
67 #include "srv0start.h"
68 #include "mysql_com.h"
69 #include "dict0stats.h"
70 #ifdef WITH_WSREP
71 #include "mysql/service_wsrep.h"
72 #endif /* WITH_WSREP */
73 
74 /** Buffered B-tree operation types, introduced as part of delete buffering. */
75 enum btr_op_t {
76 	BTR_NO_OP = 0,			/*!< Not buffered */
77 	BTR_INSERT_OP,			/*!< Insert, do not ignore UNIQUE */
78 	BTR_INSERT_IGNORE_UNIQUE_OP,	/*!< Insert, ignoring UNIQUE */
79 	BTR_DELETE_OP,			/*!< Purge a delete-marked record */
80 	BTR_DELMARK_OP			/*!< Mark a record for deletion */
81 };
82 
83 /** Modification types for the B-tree operation.
84     Note that the order must be DELETE, BOTH, INSERT !!
85  */
86 enum btr_intention_t {
87 	BTR_INTENTION_DELETE,
88 	BTR_INTENTION_BOTH,
89 	BTR_INTENTION_INSERT
90 };
91 
92 /** For the index->lock scalability improvement, only possibility of clear
93 performance regression observed was caused by grown huge history list length.
94 That is because the exclusive use of index->lock also worked as reserving
95 free blocks and read IO bandwidth with priority. To avoid huge glowing history
96 list as same level with previous implementation, prioritizes pessimistic tree
97 operations by purge as the previous, when it seems to be growing huge.
98 
99  Experimentally, the history list length starts to affect to performance
100 throughput clearly from about 100000. */
101 #define BTR_CUR_FINE_HISTORY_LENGTH	100000
102 
103 /** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
104 Atomic_counter<ulint>	btr_cur_n_non_sea;
105 /** Old value of btr_cur_n_non_sea.  Copied by
106 srv_refresh_innodb_monitor_stats().  Referenced by
107 srv_printf_innodb_monitor(). */
108 ulint	btr_cur_n_non_sea_old;
109 #ifdef BTR_CUR_HASH_ADAPT
110 /** Number of successful adaptive hash index lookups in
111 btr_cur_search_to_nth_level(). */
112 ulint	btr_cur_n_sea;
113 /** Old value of btr_cur_n_sea.  Copied by
114 srv_refresh_innodb_monitor_stats().  Referenced by
115 srv_printf_innodb_monitor(). */
116 ulint	btr_cur_n_sea_old;
117 #endif /* BTR_CUR_HASH_ADAPT */
118 
119 #ifdef UNIV_DEBUG
120 /* Flag to limit optimistic insert records */
121 uint	btr_cur_limit_optimistic_insert_debug;
122 #endif /* UNIV_DEBUG */
123 
124 /** In the optimistic insert, if the insert does not fit, but this much space
125 can be released by page reorganize, then it is reorganized */
126 #define BTR_CUR_PAGE_REORGANIZE_LIMIT	(srv_page_size / 32)
127 
128 /** The structure of a BLOB part header */
129 /* @{ */
130 /*--------------------------------------*/
131 #define BTR_BLOB_HDR_PART_LEN		0	/*!< BLOB part len on this
132 						page */
133 #define BTR_BLOB_HDR_NEXT_PAGE_NO	4	/*!< next BLOB part page no,
134 						FIL_NULL if none */
135 /*--------------------------------------*/
136 #define BTR_BLOB_HDR_SIZE		8	/*!< Size of a BLOB
137 						part header, in bytes */
138 
139 /** Estimated table level stats from sampled value.
140 @param value sampled stats
141 @param index index being sampled
142 @param sample number of sampled rows
143 @param ext_size external stored data size
144 @param not_empty table not empty
145 @return estimated table wide stats from sampled value */
146 #define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
147 	(((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
148 	  + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
149 
150 /* @} */
151 
152 /*******************************************************************//**
153 Marks all extern fields in a record as owned by the record. This function
154 should be called if the delete mark of a record is removed: a not delete
155 marked record always owns all its extern fields. */
156 static
157 void
158 btr_cur_unmark_extern_fields(
159 /*=========================*/
160 	buf_block_t*	block,	/*!< in/out: index page */
161 	rec_t*		rec,	/*!< in/out: record in a clustered index */
162 	dict_index_t*	index,	/*!< in: index of the page */
163 	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
164 	mtr_t*		mtr);	/*!< in: mtr, or NULL if not logged */
165 /*******************************************************************//**
166 Adds path information to the cursor for the current page, for which
167 the binary search has been performed. */
168 static
169 void
170 btr_cur_add_path_info(
171 /*==================*/
172 	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
173 	ulint		height,		/*!< in: height of the page in tree;
174 					0 means leaf node */
175 	ulint		root_height);	/*!< in: root node height in tree */
176 /***********************************************************//**
177 Frees the externally stored fields for a record, if the field is mentioned
178 in the update vector. */
179 static
180 void
181 btr_rec_free_updated_extern_fields(
182 /*===============================*/
183 	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
184 				X-latched */
185 	rec_t*		rec,	/*!< in: record */
186 	buf_block_t*	block,	/*!< in: index page of rec */
187 	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
188 	const upd_t*	update,	/*!< in: update vector */
189 	bool		rollback,/*!< in: performing rollback? */
190 	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
191 				an X-latch to record page and to the tree */
192 /***********************************************************//**
193 Frees the externally stored fields for a record. */
194 static
195 void
196 btr_rec_free_externally_stored_fields(
197 /*==================================*/
198 	dict_index_t*	index,	/*!< in: index of the data, the index
199 				tree MUST be X-latched */
200 	rec_t*		rec,	/*!< in: record */
201 	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
202 	buf_block_t*	block,	/*!< in: index page of rec */
203 	bool		rollback,/*!< in: performing rollback? */
204 	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
205 				an X-latch to record page and to the index
206 				tree */
207 
208 /*==================== B-TREE SEARCH =========================*/
209 
210 /** Latches the leaf page or pages requested.
211 @param[in]	block		leaf page where the search converged
212 @param[in]	latch_mode	BTR_SEARCH_LEAF, ...
213 @param[in]	cursor		cursor
214 @param[in]	mtr		mini-transaction
215 @return	blocks and savepoints which actually latched. */
216 btr_latch_leaves_t
btr_cur_latch_leaves(buf_block_t * block,ulint latch_mode,btr_cur_t * cursor,mtr_t * mtr)217 btr_cur_latch_leaves(
218 	buf_block_t*		block,
219 	ulint			latch_mode,
220 	btr_cur_t*		cursor,
221 	mtr_t*			mtr)
222 {
223 	rw_lock_type_t	mode;
224 	uint32_t	left_page_no;
225 	uint32_t	right_page_no;
226 	buf_block_t*	get_block;
227 	bool		spatial;
228 	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
229 
230 	compile_time_assert(int(MTR_MEMO_PAGE_S_FIX) == int(RW_S_LATCH));
231 	compile_time_assert(int(MTR_MEMO_PAGE_X_FIX) == int(RW_X_LATCH));
232 	compile_time_assert(int(MTR_MEMO_PAGE_SX_FIX) == int(RW_SX_LATCH));
233 	ut_ad(block->page.id().space() == cursor->index->table->space->id);
234 
235 	spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info;
236 	ut_ad(block->page.in_file());
237 
238 	switch (latch_mode) {
239 	case BTR_SEARCH_LEAF:
240 	case BTR_MODIFY_LEAF:
241 	case BTR_SEARCH_TREE:
242 		if (spatial) {
243 			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS]
244 				= mtr_set_savepoint(mtr);
245 		}
246 
247 		mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH;
248 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
249 		get_block = btr_block_get(*cursor->index,
250 					  block->page.id().page_no(), mode,
251 					  true, mtr);
252 		latch_leaves.blocks[1] = get_block;
253 #ifdef UNIV_BTR_DEBUG
254 		ut_a(page_is_comp(get_block->frame)
255 		     == page_is_comp(block->frame));
256 #endif /* UNIV_BTR_DEBUG */
257 		if (spatial) {
258 			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
259 				= get_block;
260 		}
261 
262 		return(latch_leaves);
263 	case BTR_MODIFY_TREE:
264 		/* It is exclusive for other operations which calls
265 		btr_page_set_prev() */
266 		ut_ad(mtr->memo_contains_flagged(&cursor->index->lock,
267 						 MTR_MEMO_X_LOCK
268 						 | MTR_MEMO_SX_LOCK));
269 		/* x-latch also siblings from left to right */
270 		left_page_no = btr_page_get_prev(block->frame);
271 
272 		if (left_page_no != FIL_NULL) {
273 
274 			if (spatial) {
275 				cursor->rtr_info->tree_savepoints[
276 					RTR_MAX_LEVELS] = mtr_set_savepoint(mtr);
277 			}
278 
279 			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
280 			get_block = btr_block_get(
281 				*cursor->index, left_page_no, RW_X_LATCH,
282 				true, mtr);
283 			latch_leaves.blocks[0] = get_block;
284 
285 			if (spatial) {
286 				cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
287 					= get_block;
288 			}
289 		}
290 
291 		if (spatial) {
292 			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS + 1]
293 				= mtr_set_savepoint(mtr);
294 		}
295 
296 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
297 		get_block = btr_block_get(
298 			*cursor->index, block->page.id().page_no(),
299 			RW_X_LATCH, true, mtr);
300 		latch_leaves.blocks[1] = get_block;
301 
302 #ifdef UNIV_BTR_DEBUG
303 		/* Sanity check only after both the blocks are latched. */
304 		if (latch_leaves.blocks[0] != NULL) {
305 			ut_a(page_is_comp(latch_leaves.blocks[0]->frame)
306 			     == page_is_comp(block->frame));
307 			ut_a(btr_page_get_next(latch_leaves.blocks[0]->frame)
308 			     == block->page.id().page_no());
309 		}
310 		ut_a(page_is_comp(get_block->frame)
311 		     == page_is_comp(block->frame));
312 #endif /* UNIV_BTR_DEBUG */
313 
314 		if (spatial) {
315 			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS + 1]
316 				= get_block;
317 		}
318 
319 		right_page_no = btr_page_get_next(block->frame);
320 
321 		if (right_page_no != FIL_NULL) {
322 			if (spatial) {
323 				cursor->rtr_info->tree_savepoints[
324 					RTR_MAX_LEVELS + 2] = mtr_set_savepoint(
325 								mtr);
326 			}
327 			latch_leaves.savepoints[2] = mtr_set_savepoint(mtr);
328 			get_block = btr_block_get(*cursor->index,
329 						  right_page_no, RW_X_LATCH,
330 						  true, mtr);
331 			latch_leaves.blocks[2] = get_block;
332 #ifdef UNIV_BTR_DEBUG
333 			if (get_block) {
334 				ut_a(page_is_comp(get_block->frame)
335 				     == page_is_comp(block->frame));
336 				ut_a(btr_page_get_prev(get_block->frame)
337 				     == block->page.id().page_no());
338 			}
339 #endif /* UNIV_BTR_DEBUG */
340 			if (spatial) {
341 				cursor->rtr_info->tree_blocks[
342 					RTR_MAX_LEVELS + 2] = get_block;
343 			}
344 		}
345 
346 		return(latch_leaves);
347 
348 	case BTR_SEARCH_PREV:
349 	case BTR_MODIFY_PREV:
350 		mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
351 		/* latch also left sibling */
352 		rw_lock_s_lock(&block->lock);
353 		left_page_no = btr_page_get_prev(block->frame);
354 		rw_lock_s_unlock(&block->lock);
355 
356 		if (left_page_no != FIL_NULL) {
357 			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
358 			get_block = btr_block_get(
359 				*cursor->index, left_page_no, mode,
360 				true, mtr);
361 			latch_leaves.blocks[0] = get_block;
362 			cursor->left_block = get_block;
363 #ifdef UNIV_BTR_DEBUG
364 			ut_a(page_is_comp(get_block->frame)
365 			     == page_is_comp(block->frame));
366 			ut_a(btr_page_get_next(get_block->frame)
367 			     == block->page.id().page_no());
368 #endif /* UNIV_BTR_DEBUG */
369 		}
370 
371 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
372 		get_block = btr_block_get(*cursor->index,
373 					  block->page.id().page_no(), mode,
374 					  true, mtr);
375 		latch_leaves.blocks[1] = get_block;
376 #ifdef UNIV_BTR_DEBUG
377 		ut_a(page_is_comp(get_block->frame)
378 		     == page_is_comp(block->frame));
379 #endif /* UNIV_BTR_DEBUG */
380 		return(latch_leaves);
381 	case BTR_CONT_MODIFY_TREE:
382 		ut_ad(dict_index_is_spatial(cursor->index));
383 		return(latch_leaves);
384 	}
385 
386 	ut_error;
387 	return(latch_leaves);
388 }
389 
390 /** Load the instant ALTER TABLE metadata from the clustered index
391 when loading a table definition.
392 @param[in,out]	index	clustered index definition
393 @param[in,out]	mtr	mini-transaction
394 @return	error code
395 @retval	DB_SUCCESS	if no error occurred
396 @retval	DB_CORRUPTION	if any corruption was noticed */
btr_cur_instant_init_low(dict_index_t * index,mtr_t * mtr)397 static dberr_t btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr)
398 {
399 	ut_ad(index->is_primary());
400 	ut_ad(index->n_core_null_bytes == dict_index_t::NO_CORE_NULL_BYTES);
401 	ut_ad(index->table->supports_instant());
402 	ut_ad(index->table->is_readable());
403 
404 	const fil_space_t* space = index->table->space;
405 	if (!space) {
406 unreadable:
407 		ib::error() << "Table " << index->table->name
408 			    << " has an unreadable root page";
409 		index->table->corrupted = true;
410 		return DB_CORRUPTION;
411 	}
412 
413 	page_t* root = btr_root_get(index, mtr);
414 
415 	if (!root || btr_cur_instant_root_init(index, root)) {
416 		goto unreadable;
417 	}
418 
419 	ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES);
420 
421 	if (fil_page_get_type(root) == FIL_PAGE_INDEX) {
422 		ut_ad(!index->is_instant());
423 		return DB_SUCCESS;
424 	}
425 
426 	btr_cur_t cur;
427 	/* Relax the assertion in rec_init_offsets(). */
428 	ut_ad(!index->in_instant_init);
429 	ut_d(index->in_instant_init = true);
430 	dberr_t err = btr_cur_open_at_index_side(true, index, BTR_SEARCH_LEAF,
431 						 &cur, 0, mtr);
432 	ut_d(index->in_instant_init = false);
433 	if (err != DB_SUCCESS) {
434 		index->table->corrupted = true;
435 		return err;
436 	}
437 
438 	ut_ad(page_cur_is_before_first(&cur.page_cur));
439 	ut_ad(page_is_leaf(cur.page_cur.block->frame));
440 
441 	page_cur_move_to_next(&cur.page_cur);
442 
443 	const rec_t* rec = cur.page_cur.rec;
444 	const ulint comp = dict_table_is_comp(index->table);
445 	const ulint info_bits = rec_get_info_bits(rec, comp);
446 
447 	if (page_rec_is_supremum(rec)
448 	    || !(info_bits & REC_INFO_MIN_REC_FLAG)) {
449 		if (!index->is_instant()) {
450 			/* The FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be
451 			assigned even if instant ADD COLUMN was not
452 			committed. Changes to these page header fields are not
453 			undo-logged, but changes to the hidden metadata record
454 			are. If the server is killed and restarted, the page
455 			header fields could remain set even though no metadata
456 			record is present. */
457 			return DB_SUCCESS;
458 		}
459 
460 		ib::error() << "Table " << index->table->name
461 			    << " is missing instant ALTER metadata";
462 		index->table->corrupted = true;
463 		return DB_CORRUPTION;
464 	}
465 
466 	if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG
467 	    || (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) {
468 incompatible:
469 		ib::error() << "Table " << index->table->name
470 			<< " contains unrecognizable instant ALTER metadata";
471 		index->table->corrupted = true;
472 		return DB_CORRUPTION;
473 	}
474 
475 	/* Read the metadata. We can get here on server restart
476 	or when the table was evicted from the data dictionary cache
477 	and is now being accessed again.
478 
479 	Here, READ COMMITTED and REPEATABLE READ should be equivalent.
480 	Committing the ADD COLUMN operation would acquire
481 	MDL_EXCLUSIVE and LOCK_X|LOCK_TABLE, which would prevent any
482 	concurrent operations on the table, including table eviction
483 	from the cache. */
484 
485 	if (info_bits & REC_INFO_DELETED_FLAG) {
486 		/* This metadata record includes a BLOB that identifies
487 		any dropped or reordered columns. */
488 		ulint trx_id_offset = index->trx_id_offset;
489 		/* If !index->trx_id_offset, the PRIMARY KEY contains
490 		variable-length columns. For the metadata record,
491 		variable-length columns should be written with zero
492 		length. However, before MDEV-21088 was fixed, for
493 		variable-length encoded PRIMARY KEY column of type
494 		CHAR, we wrote more than zero bytes. That is why we
495 		must determine the actual length of each PRIMARY KEY
496 		column.  The DB_TRX_ID will start right after any
497 		PRIMARY KEY columns. */
498 		ut_ad(index->n_uniq);
499 
500 		/* We cannot invoke rec_get_offsets() before
501 		index->table->deserialise_columns(). Therefore,
502 		we must duplicate some logic here. */
503 		if (trx_id_offset) {
504 		} else if (index->table->not_redundant()) {
505 			/* The PRIMARY KEY contains variable-length columns.
506 			For the metadata record, variable-length columns are
507 			always written with zero length. The DB_TRX_ID will
508 			start right after any fixed-length columns. */
509 
510 			/* OK, before MDEV-21088 was fixed, for
511 			variable-length encoded PRIMARY KEY column of
512 			type CHAR, we wrote more than zero bytes. In
513 			order to allow affected tables to be accessed,
514 			it would be nice to determine the actual
515 			length of each PRIMARY KEY column. However, to
516 			be able to do that, we should determine the
517 			size of the null-bit bitmap in the metadata
518 			record. And we cannot know that before reading
519 			the metadata BLOB, whose starting point we are
520 			trying to find here. (Although the PRIMARY KEY
521 			columns cannot be NULL, we would have to know
522 			where the lengths of variable-length PRIMARY KEY
523 			columns start.)
524 
525 			So, unfortunately we cannot help users who
526 			were affected by MDEV-21088 on a ROW_FORMAT=COMPACT
527 			or ROW_FORMAT=DYNAMIC table. */
528 
529 			for (uint i = index->n_uniq; i--; ) {
530 				trx_id_offset += index->fields[i].fixed_len;
531 			}
532 		} else if (rec_get_1byte_offs_flag(rec)) {
533 			trx_id_offset = rec_1_get_field_end_info(
534 				rec, index->n_uniq - 1);
535 			ut_ad(!(trx_id_offset & REC_1BYTE_SQL_NULL_MASK));
536 			trx_id_offset &= ~REC_1BYTE_SQL_NULL_MASK;
537 		} else {
538 			trx_id_offset = rec_2_get_field_end_info(
539 				rec, index->n_uniq - 1);
540 			ut_ad(!(trx_id_offset & REC_2BYTE_SQL_NULL_MASK));
541 			trx_id_offset &= ~REC_2BYTE_SQL_NULL_MASK;
542 		}
543 
544 		const byte* ptr = rec + trx_id_offset
545 			+ (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
546 
547 		if (mach_read_from_4(ptr + BTR_EXTERN_LEN)) {
548 			goto incompatible;
549 		}
550 
551 		uint len = mach_read_from_4(ptr + BTR_EXTERN_LEN + 4);
552 		if (!len
553 		    || mach_read_from_4(ptr + BTR_EXTERN_OFFSET)
554 		    != FIL_PAGE_DATA
555 		    || mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
556 		    != space->id) {
557 			goto incompatible;
558 		}
559 
560 		buf_block_t* block = buf_page_get(
561 			page_id_t(space->id,
562 				  mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
563 			0, RW_S_LATCH, mtr);
564 		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
565 		if (fil_page_get_type(block->frame) != FIL_PAGE_TYPE_BLOB
566 		    || mach_read_from_4(&block->frame[FIL_PAGE_DATA
567 						      + BTR_BLOB_HDR_NEXT_PAGE_NO])
568 		    != FIL_NULL
569 		    || mach_read_from_4(&block->frame[FIL_PAGE_DATA
570 						      + BTR_BLOB_HDR_PART_LEN])
571 		    != len) {
572 			goto incompatible;
573 		}
574 
575 		/* The unused part of the BLOB page should be zero-filled. */
576 		for (const byte* b = block->frame
577 		       + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + len,
578 		       * const end = block->frame + srv_page_size
579 		       - BTR_EXTERN_LEN;
580 		     b < end; ) {
581 			if (*b++) {
582 				goto incompatible;
583 			}
584 		}
585 
586 		if (index->table->deserialise_columns(
587 			    &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE],
588 			    len)) {
589 			goto incompatible;
590 		}
591 
592 		/* Proceed to initialize the default values of
593 		any instantly added columns. */
594 	}
595 
596 	mem_heap_t* heap = NULL;
597 	rec_offs* offsets = rec_get_offsets(rec, index, NULL,
598 					    index->n_core_fields,
599 					    ULINT_UNDEFINED, &heap);
600 	if (rec_offs_any_default(offsets)) {
601 inconsistent:
602 		mem_heap_free(heap);
603 		goto incompatible;
604 	}
605 
606 	/* In fact, because we only ever append fields to the metadata
607 	record, it is also OK to perform READ UNCOMMITTED and
608 	then ignore any extra fields, provided that
609 	trx_sys.is_registered(DB_TRX_ID). */
610 	if (rec_offs_n_fields(offsets)
611 	    > ulint(index->n_fields) + !!index->table->instant
612 	    && !trx_sys.is_registered(current_trx(),
613 				      row_get_rec_trx_id(rec, index,
614 							 offsets))) {
615 		goto inconsistent;
616 	}
617 
618 	for (unsigned i = index->n_core_fields; i < index->n_fields; i++) {
619 		dict_col_t* col = index->fields[i].col;
620 		const unsigned o = i + !!index->table->instant;
621 		ulint len;
622 		const byte* data = rec_get_nth_field(rec, offsets, o, &len);
623 		ut_ad(!col->is_added());
624 		ut_ad(!col->def_val.data);
625 		col->def_val.len = len;
626 		switch (len) {
627 		case UNIV_SQL_NULL:
628 			continue;
629 		case 0:
630 			col->def_val.data = field_ref_zero;
631 			continue;
632 		}
633 		ut_ad(len != UNIV_SQL_DEFAULT);
634 		if (!rec_offs_nth_extern(offsets, o)) {
635 			col->def_val.data = mem_heap_dup(
636 				index->table->heap, data, len);
637 		} else if (len < BTR_EXTERN_FIELD_REF_SIZE
638 			   || !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE,
639 				      field_ref_zero,
640 				      BTR_EXTERN_FIELD_REF_SIZE)) {
641 			col->def_val.len = UNIV_SQL_DEFAULT;
642 			goto inconsistent;
643 		} else {
644 			col->def_val.data = btr_copy_externally_stored_field(
645 				&col->def_val.len, data,
646 				cur.page_cur.block->zip_size(),
647 				len, index->table->heap);
648 		}
649 	}
650 
651 	mem_heap_free(heap);
652 	return DB_SUCCESS;
653 }
654 
655 /** Load the instant ALTER TABLE metadata from the clustered index
656 when loading a table definition.
657 @param[in,out]	table	table definition from the data dictionary
658 @return	error code
659 @retval	DB_SUCCESS	if no error occurred */
660 dberr_t
btr_cur_instant_init(dict_table_t * table)661 btr_cur_instant_init(dict_table_t* table)
662 {
663 	mtr_t		mtr;
664 	dict_index_t*	index = dict_table_get_first_index(table);
665 	mtr.start();
666 	dberr_t	err = index
667 		? btr_cur_instant_init_low(index, &mtr)
668 		: DB_CORRUPTION;
669 	mtr.commit();
670 	return(err);
671 }
672 
673 /** Initialize the n_core_null_bytes on first access to a clustered
674 index root page.
675 @param[in]	index	clustered index that is on its first access
676 @param[in]	page	clustered index root page
677 @return	whether the page is corrupted */
btr_cur_instant_root_init(dict_index_t * index,const page_t * page)678 bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
679 {
680 	ut_ad(!index->is_dummy);
681 	ut_ad(fil_page_index_page_check(page));
682 	ut_ad(!page_has_siblings(page));
683 	ut_ad(page_get_space_id(page) == index->table->space_id);
684 	ut_ad(page_get_page_no(page) == index->page);
685 	ut_ad(!page_is_comp(page) == !dict_table_is_comp(index->table));
686 	ut_ad(index->is_primary());
687 	ut_ad(!index->is_instant());
688 	ut_ad(index->table->supports_instant());
689 	/* This is normally executed as part of btr_cur_instant_init()
690 	when dict_load_table_one() is loading a table definition.
691 	Other threads should not access or modify the n_core_null_bytes,
692 	n_core_fields before dict_load_table_one() returns.
693 
694 	This can also be executed during IMPORT TABLESPACE, where the
695 	table definition is exclusively locked. */
696 
697 	switch (fil_page_get_type(page)) {
698 	default:
699 		ut_ad("wrong page type" == 0);
700 		return true;
701 	case FIL_PAGE_INDEX:
702 		/* The field PAGE_INSTANT is guaranteed 0 on clustered
703 		index root pages of ROW_FORMAT=COMPACT or
704 		ROW_FORMAT=DYNAMIC when instant ADD COLUMN is not used. */
705 		ut_ad(!page_is_comp(page) || !page_get_instant(page));
706 		index->n_core_null_bytes = static_cast<uint8_t>(
707 			UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
708 		return false;
709 	case FIL_PAGE_TYPE_INSTANT:
710 		break;
711 	}
712 
713 	const uint16_t n = page_get_instant(page);
714 
715 	if (n < index->n_uniq + DATA_ROLL_PTR) {
716 		/* The PRIMARY KEY (or hidden DB_ROW_ID) and
717 		DB_TRX_ID,DB_ROLL_PTR columns must always be present
718 		as 'core' fields. */
719 		return true;
720 	}
721 
722 	if (n > REC_MAX_N_FIELDS) {
723 		return true;
724 	}
725 
726 	index->n_core_fields = n & dict_index_t::MAX_N_FIELDS;
727 
728 	const rec_t* infimum = page_get_infimum_rec(page);
729 	const rec_t* supremum = page_get_supremum_rec(page);
730 
731 	if (!memcmp(infimum, "infimum", 8)
732 	    && !memcmp(supremum, "supremum", 8)) {
733 		if (n > index->n_fields) {
734 			/* All fields, including those for instantly
735 			added columns, must be present in the
736 			data dictionary. */
737 			return true;
738 		}
739 
740 		ut_ad(!index->is_dummy);
741 		ut_d(index->is_dummy = true);
742 		index->n_core_null_bytes = static_cast<uint8_t>(
743 			UT_BITS_IN_BYTES(index->get_n_nullable(n)));
744 		ut_d(index->is_dummy = false);
745 		return false;
746 	}
747 
748 	if (memcmp(infimum, field_ref_zero, 8)
749 	    || memcmp(supremum, field_ref_zero, 7)) {
750 		/* The infimum and supremum records must either contain
751 		the original strings, or they must be filled with zero
752 		bytes, except for the bytes that we have repurposed. */
753 		return true;
754 	}
755 
756 	index->n_core_null_bytes = supremum[7];
757 	return index->n_core_null_bytes > 128;
758 }
759 
760 /** Optimistically latches the leaf page or pages requested.
761 @param[in]	block		guessed buffer block
762 @param[in]	modify_clock	modify clock value
763 @param[in,out]	latch_mode	BTR_SEARCH_LEAF, ...
764 @param[in,out]	cursor		cursor
765 @param[in]	file		file name
766 @param[in]	line		line where called
767 @param[in]	mtr		mini-transaction
768 @return true if success */
769 bool
btr_cur_optimistic_latch_leaves(buf_block_t * block,ib_uint64_t modify_clock,ulint * latch_mode,btr_cur_t * cursor,const char * file,unsigned line,mtr_t * mtr)770 btr_cur_optimistic_latch_leaves(
771 	buf_block_t*	block,
772 	ib_uint64_t	modify_clock,
773 	ulint*		latch_mode,
774 	btr_cur_t*	cursor,
775 	const char*	file,
776 	unsigned	line,
777 	mtr_t*		mtr)
778 {
779 	ut_ad(block->page.buf_fix_count());
780 	ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE);
781 
782 	switch (*latch_mode) {
783 	default:
784 		ut_error;
785 		return(false);
786 	case BTR_SEARCH_LEAF:
787 	case BTR_MODIFY_LEAF:
788 		return(buf_page_optimistic_get(*latch_mode, block,
789 				modify_clock, file, line, mtr));
790 	case BTR_SEARCH_PREV:
791 	case BTR_MODIFY_PREV:
792 		rw_lock_s_lock(&block->lock);
793 		if (block->modify_clock != modify_clock) {
794 			rw_lock_s_unlock(&block->lock);
795 			return false;
796 		}
797 		const uint32_t curr_page_no = block->page.id().page_no();
798 		const uint32_t left_page_no = btr_page_get_prev(block->frame);
799 		rw_lock_s_unlock(&block->lock);
800 
801 		const rw_lock_type_t mode = *latch_mode == BTR_SEARCH_PREV
802 			? RW_S_LATCH : RW_X_LATCH;
803 
804 		if (left_page_no != FIL_NULL) {
805 			dberr_t	err = DB_SUCCESS;
806 			cursor->left_block = buf_page_get_gen(
807 				page_id_t(cursor->index->table->space_id,
808 					  left_page_no),
809 				cursor->index->table->space->zip_size(),
810 				mode, nullptr, BUF_GET_POSSIBLY_FREED,
811 				__FILE__, __LINE__, mtr, &err);
812 
813 			if (!cursor->left_block) {
814 				cursor->index->table->file_unreadable = true;
815 			}
816 
817 			if (cursor->left_block->page.status
818 			    == buf_page_t::FREED
819 			    || btr_page_get_next(cursor->left_block->frame)
820 			    != curr_page_no) {
821 				/* release the left block */
822 				btr_leaf_page_release(
823 					cursor->left_block, mode, mtr);
824 				return false;
825 			}
826 		} else {
827 			cursor->left_block = NULL;
828 		}
829 
830 		if (buf_page_optimistic_get(mode, block, modify_clock,
831 					    file, line, mtr)) {
832 			if (btr_page_get_prev(block->frame) == left_page_no) {
833 				/* block was already buffer-fixed while
834 				entering the function and
835 				buf_page_optimistic_get() buffer-fixes
836 				it again. */
837 				ut_ad(2 <= block->page.buf_fix_count());
838 				*latch_mode = mode;
839 				return(true);
840 			} else {
841 				/* release the block and decrement of
842 				buf_fix_count which was incremented
843 				in buf_page_optimistic_get() */
844 				btr_leaf_page_release(block, mode, mtr);
845 			}
846 		}
847 
848 		ut_ad(block->page.buf_fix_count());
849 		/* release the left block */
850 		if (cursor->left_block != NULL) {
851 			btr_leaf_page_release(cursor->left_block,
852 					      mode, mtr);
853 		}
854 	}
855 
856 	return false;
857 }
858 
859 /**
860 Gets intention in btr_intention_t from latch_mode, and cleares the intention
861 at the latch_mode.
862 @param latch_mode	in/out: pointer to latch_mode
863 @return intention for latching tree */
864 static
865 btr_intention_t
btr_cur_get_and_clear_intention(ulint * latch_mode)866 btr_cur_get_and_clear_intention(
867 	ulint	*latch_mode)
868 {
869 	btr_intention_t	intention;
870 
871 	switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) {
872 	case BTR_LATCH_FOR_INSERT:
873 		intention = BTR_INTENTION_INSERT;
874 		break;
875 	case BTR_LATCH_FOR_DELETE:
876 		intention = BTR_INTENTION_DELETE;
877 		break;
878 	default:
879 		/* both or unknown */
880 		intention = BTR_INTENTION_BOTH;
881 	}
882 	*latch_mode &= ulint(~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE));
883 
884 	return(intention);
885 }
886 
887 /**
888 Gets the desired latch type for the root leaf (root page is root leaf)
889 at the latch mode.
890 @param latch_mode	in: BTR_SEARCH_LEAF, ...
891 @return latch type */
892 static
893 rw_lock_type_t
btr_cur_latch_for_root_leaf(ulint latch_mode)894 btr_cur_latch_for_root_leaf(
895 	ulint	latch_mode)
896 {
897 	switch (latch_mode) {
898 	case BTR_SEARCH_LEAF:
899 	case BTR_SEARCH_TREE:
900 	case BTR_SEARCH_PREV:
901 		return(RW_S_LATCH);
902 	case BTR_MODIFY_LEAF:
903 	case BTR_MODIFY_TREE:
904 	case BTR_MODIFY_PREV:
905 		return(RW_X_LATCH);
906 	case BTR_CONT_MODIFY_TREE:
907 	case BTR_CONT_SEARCH_TREE:
908 		/* A root page should be latched already,
909 		and don't need to be latched here.
910 		fall through (RW_NO_LATCH) */
911 	case BTR_NO_LATCHES:
912 		return(RW_NO_LATCH);
913 	}
914 
915 	ut_error;
916 	return(RW_NO_LATCH); /* avoid compiler warnings */
917 }
918 
919 /** Detects whether the modifying record might need a modifying tree structure.
920 @param[in]	index		index
921 @param[in]	page		page
922 @param[in]	lock_intention	lock intention for the tree operation
923 @param[in]	rec		record (current node_ptr)
924 @param[in]	rec_size	size of the record or max size of node_ptr
925 @param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
926 @param[in]	mtr		mtr
927 @return true if tree modification is needed */
928 static
929 bool
btr_cur_will_modify_tree(dict_index_t * index,const page_t * page,btr_intention_t lock_intention,const rec_t * rec,ulint rec_size,ulint zip_size,mtr_t * mtr)930 btr_cur_will_modify_tree(
931 	dict_index_t*	index,
932 	const page_t*	page,
933 	btr_intention_t	lock_intention,
934 	const rec_t*	rec,
935 	ulint		rec_size,
936 	ulint		zip_size,
937 	mtr_t*		mtr)
938 {
939 	ut_ad(!page_is_leaf(page));
940 	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
941 					 | MTR_MEMO_SX_LOCK));
942 
943 	/* Pessimistic delete of the first record causes delete & insert
944 	of node_ptr at upper level. And a subsequent page shrink is
945 	possible. It causes delete of node_ptr at the upper level.
946 	So we should pay attention also to 2nd record not only
947 	first record and last record. Because if the "delete & insert" are
948 	done for the different page, the 2nd record become
949 	first record and following compress might delete the record and causes
950 	the uppper level node_ptr modification. */
951 
952 	const ulint n_recs = page_get_n_recs(page);
953 
954 	if (lock_intention <= BTR_INTENTION_BOTH) {
955 		compile_time_assert(BTR_INTENTION_DELETE < BTR_INTENTION_BOTH);
956 		compile_time_assert(BTR_INTENTION_BOTH < BTR_INTENTION_INSERT);
957 
958 		if (!page_has_siblings(page)) {
959 			return true;
960 		}
961 
962 		ulint margin = rec_size;
963 
964 		if (lock_intention == BTR_INTENTION_BOTH) {
965 			ulint	level = btr_page_get_level(page);
966 
967 			/* This value is the worst expectation for the node_ptr
968 			records to be deleted from this page. It is used to
969 			expect whether the cursor position can be the left_most
970 			record in this page or not. */
971 			ulint   max_nodes_deleted = 0;
972 
973 			/* By modifying tree operations from the under of this
974 			level, logically (2 ^ (level - 1)) opportunities to
975 			deleting records in maximum even unreally rare case. */
976 			if (level > 7) {
977 				/* TODO: adjust this practical limit. */
978 				max_nodes_deleted = 64;
979 			} else if (level > 0) {
980 				max_nodes_deleted = (ulint)1 << (level - 1);
981 			}
982 			/* check delete will cause. (BTR_INTENTION_BOTH
983 			or BTR_INTENTION_DELETE) */
984 			if (n_recs <= max_nodes_deleted * 2
985 			    || page_rec_is_first(rec, page)) {
986 				/* The cursor record can be the left most record
987 				in this page. */
988 				return true;
989 			}
990 
991 			if (page_has_prev(page)
992 			    && page_rec_distance_is_at_most(
993 				    page_get_infimum_rec(page), rec,
994 				    max_nodes_deleted)) {
995 				return true;
996 			}
997 
998 			if (page_has_next(page)
999 			    && page_rec_distance_is_at_most(
1000 				    rec, page_get_supremum_rec(page),
1001 				    max_nodes_deleted)) {
1002 				return true;
1003 			}
1004 
1005 			/* Delete at leftmost record in a page causes delete
1006 			& insert at its parent page. After that, the delete
1007 			might cause btr_compress() and delete record at its
1008 			parent page. Thus we should consider max deletes. */
1009 			margin *= max_nodes_deleted;
1010 		}
1011 
1012 		/* Safe because we already have SX latch of the index tree */
1013 		if (page_get_data_size(page)
1014 		    < margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)) {
1015 			return(true);
1016 		}
1017 	}
1018 
1019 	if (lock_intention >= BTR_INTENTION_BOTH) {
1020 		/* check insert will cause. BTR_INTENTION_BOTH
1021 		or BTR_INTENTION_INSERT*/
1022 
1023 		/* Once we invoke the btr_cur_limit_optimistic_insert_debug,
1024 		we should check it here in advance, since the max allowable
1025 		records in a page is limited. */
1026 		LIMIT_OPTIMISTIC_INSERT_DEBUG(n_recs, return true);
1027 
1028 		/* needs 2 records' space for the case the single split and
1029 		insert cannot fit.
1030 		page_get_max_insert_size_after_reorganize() includes space
1031 		for page directory already */
1032 		ulint	max_size
1033 			= page_get_max_insert_size_after_reorganize(page, 2);
1034 
1035 		if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size
1036 		    || max_size < rec_size * 2) {
1037 			return(true);
1038 		}
1039 
1040 		/* TODO: optimize this condition for ROW_FORMAT=COMPRESSED.
1041 		This is based on the worst case, and we could invoke
1042 		page_zip_available() on the block->page.zip. */
1043 		/* needs 2 records' space also for worst compress rate. */
1044 		if (zip_size
1045 		    && page_zip_empty_size(index->n_fields, zip_size)
1046 		    <= rec_size * 2 + page_get_data_size(page)
1047 		    + page_dir_calc_reserved_space(n_recs + 2)) {
1048 			return(true);
1049 		}
1050 	}
1051 
1052 	return(false);
1053 }
1054 
1055 /** Detects whether the modifying record might need a opposite modification
1056 to the intention.
1057 @param[in]	page		page
1058 @param[in]	lock_intention	lock intention for the tree operation
1059 @param[in]	rec		record (current node_ptr)
1060 @return	true if tree modification is needed */
1061 static
1062 bool
btr_cur_need_opposite_intention(const page_t * page,btr_intention_t lock_intention,const rec_t * rec)1063 btr_cur_need_opposite_intention(
1064 	const page_t*	page,
1065 	btr_intention_t	lock_intention,
1066 	const rec_t*	rec)
1067 {
1068 	switch (lock_intention) {
1069 	case BTR_INTENTION_DELETE:
1070 		return (page_has_prev(page) && page_rec_is_first(rec, page)) ||
1071 			(page_has_next(page) && page_rec_is_last(rec, page));
1072 	case BTR_INTENTION_INSERT:
1073 		return page_has_next(page) && page_rec_is_last(rec, page);
1074 	case BTR_INTENTION_BOTH:
1075 		return(false);
1076 	}
1077 
1078 	ut_error;
1079 	return(false);
1080 }
1081 
1082 /**
1083 @param[in]	index b-tree
1084 @return maximum size of a node pointer record in bytes */
btr_node_ptr_max_size(const dict_index_t * index)1085 static ulint btr_node_ptr_max_size(const dict_index_t* index)
1086 {
1087 	if (dict_index_is_ibuf(index)) {
1088 		/* cannot estimate accurately */
1089 		/* This is universal index for change buffer.
1090 		The max size of the entry is about max key length * 2.
1091 		(index key + primary key to be inserted to the index)
1092 		(The max key length is UNIV_PAGE_SIZE / 16 * 3 at
1093 		 ha_innobase::max_supported_key_length(),
1094 		 considering MAX_KEY_LENGTH = 3072 at MySQL imposes
1095 		 the 3500 historical InnoDB value for 16K page size case.)
1096 		For the universal index, node_ptr contains most of the entry.
1097 		And 512 is enough to contain ibuf columns and meta-data */
1098 		return srv_page_size / 8 * 3 + 512;
1099 	}
1100 
1101 	/* Each record has page_no, length of page_no and header. */
1102 	ulint comp = dict_table_is_comp(index->table);
1103 	ulint rec_max_size = comp
1104 		? REC_NODE_PTR_SIZE + 1 + REC_N_NEW_EXTRA_BYTES
1105 		+ UT_BITS_IN_BYTES(index->n_nullable)
1106 		: REC_NODE_PTR_SIZE + 2 + REC_N_OLD_EXTRA_BYTES
1107 		+ 2 * index->n_fields;
1108 
1109 	/* Compute the maximum possible record size. */
1110 	for (ulint i = 0; i < dict_index_get_n_unique_in_tree(index); i++) {
1111 		const dict_field_t*	field
1112 			= dict_index_get_nth_field(index, i);
1113 		const dict_col_t*	col
1114 			= dict_field_get_col(field);
1115 		ulint			field_max_size;
1116 		ulint			field_ext_max_size;
1117 
1118 		/* Determine the maximum length of the index field. */
1119 
1120 		field_max_size = dict_col_get_fixed_size(col, comp);
1121 		if (field_max_size) {
1122 			/* dict_index_add_col() should guarantee this */
1123 			ut_ad(!field->prefix_len
1124 			      || field->fixed_len == field->prefix_len);
1125 			/* Fixed lengths are not encoded
1126 			in ROW_FORMAT=COMPACT. */
1127 			rec_max_size += field_max_size;
1128 			continue;
1129 		}
1130 
1131 		field_max_size = dict_col_get_max_size(col);
1132 		if (UNIV_UNLIKELY(!field_max_size)) {
1133 			switch (col->mtype) {
1134 			case DATA_VARCHAR:
1135 				if (!comp
1136 				    && (!strcmp(index->table->name.m_name,
1137 						"SYS_FOREIGN")
1138 					|| !strcmp(index->table->name.m_name,
1139 						   "SYS_FOREIGN_COLS"))) {
1140 					break;
1141 				}
1142 				/* fall through */
1143 			case DATA_VARMYSQL:
1144 			case DATA_CHAR:
1145 			case DATA_MYSQL:
1146 				/* CHAR(0) and VARCHAR(0) are possible
1147 				data type definitions in MariaDB.
1148 				The InnoDB internal SQL parser maps
1149 				CHAR to DATA_VARCHAR, so DATA_CHAR (or
1150 				DATA_MYSQL) is only coming from the
1151 				MariaDB SQL layer. */
1152 				if (comp) {
1153 					/* Add a length byte, because
1154 					fixed-length empty field are
1155 					encoded as variable-length.
1156 					For ROW_FORMAT=REDUNDANT,
1157 					these bytes were added to
1158 					rec_max_size before this loop. */
1159 					rec_max_size++;
1160 				}
1161 				continue;
1162 			}
1163 
1164 			/* SYS_FOREIGN.ID is defined as CHAR in the
1165 			InnoDB internal SQL parser, which translates
1166 			into the incorrect VARCHAR(0).  InnoDB does
1167 			not enforce maximum lengths of columns, so
1168 			that is why any data can be inserted in the
1169 			first place.
1170 
1171 			Likewise, SYS_FOREIGN.FOR_NAME,
1172 			SYS_FOREIGN.REF_NAME, SYS_FOREIGN_COLS.ID, are
1173 			defined as CHAR, and also they are part of a key. */
1174 
1175 			ut_ad(!strcmp(index->table->name.m_name,
1176 				      "SYS_FOREIGN")
1177 			      || !strcmp(index->table->name.m_name,
1178 					 "SYS_FOREIGN_COLS"));
1179 			ut_ad(!comp);
1180 			ut_ad(col->mtype == DATA_VARCHAR);
1181 
1182 			rec_max_size += (srv_page_size == UNIV_PAGE_SIZE_MAX)
1183 				? REDUNDANT_REC_MAX_DATA_SIZE
1184 				: page_get_free_space_of_empty(FALSE) / 2;
1185 		} else if (field_max_size == NAME_LEN && i == 1
1186 			   && (!strcmp(index->table->name.m_name,
1187 				       TABLE_STATS_NAME)
1188 			       || !strcmp(index->table->name.m_name,
1189 					  INDEX_STATS_NAME))) {
1190 			/* Interpret "table_name" as VARCHAR(199) even
1191 			if it was incorrectly defined as VARCHAR(64).
1192 			While the caller of ha_innobase enforces the
1193 			maximum length on any data written, the InnoDB
1194 			internal SQL parser will happily write as much
1195 			data as is provided. The purpose of this hack
1196 			is to avoid InnoDB hangs after persistent
1197 			statistics on partitioned tables are
1198 			deleted. */
1199 			field_max_size = 199 * SYSTEM_CHARSET_MBMAXLEN;
1200 		}
1201 		field_ext_max_size = field_max_size < 256 ? 1 : 2;
1202 
1203 		if (field->prefix_len
1204 		    && field->prefix_len < field_max_size) {
1205 			field_max_size = field->prefix_len;
1206 		}
1207 
1208 		if (comp) {
1209 			/* Add the extra size for ROW_FORMAT=COMPACT.
1210 			For ROW_FORMAT=REDUNDANT, these bytes were
1211 			added to rec_max_size before this loop. */
1212 			rec_max_size += field_ext_max_size;
1213 		}
1214 
1215 		rec_max_size += field_max_size;
1216 	}
1217 
1218 	return rec_max_size;
1219 }
1220 
1221 /********************************************************************//**
1222 Searches an index tree and positions a tree cursor on a given level.
1223 NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
1224 to node pointer page number fields on the upper levels of the tree!
1225 Note that if mode is PAGE_CUR_LE, which is used in inserts, then
1226 cursor->up_match and cursor->low_match both will have sensible values.
1227 If mode is PAGE_CUR_GE, then up_match will a have a sensible value.
1228 
1229 If mode is PAGE_CUR_LE , cursor is left at the place where an insert of the
1230 search tuple should be performed in the B-tree. InnoDB does an insert
1231 immediately after the cursor. Thus, the cursor may end up on a user record,
1232 or on a page infimum record. */
1233 dberr_t
btr_cur_search_to_nth_level_func(dict_index_t * index,ulint level,const dtuple_t * tuple,page_cur_mode_t mode,ulint latch_mode,btr_cur_t * cursor,rw_lock_t * ahi_latch,const char * file,unsigned line,mtr_t * mtr,ib_uint64_t autoinc)1234 btr_cur_search_to_nth_level_func(
1235 	dict_index_t*	index,	/*!< in: index */
1236 	ulint		level,	/*!< in: the tree level of search */
1237 	const dtuple_t*	tuple,	/*!< in: data tuple; NOTE: n_fields_cmp in
1238 				tuple must be set so that it cannot get
1239 				compared to the node ptr page number field! */
1240 	page_cur_mode_t	mode,	/*!< in: PAGE_CUR_L, ...;
1241 				Inserts should always be made using
1242 				PAGE_CUR_LE to search the position! */
1243 	ulint		latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
1244 				at most one of BTR_INSERT, BTR_DELETE_MARK,
1245 				BTR_DELETE, or BTR_ESTIMATE;
1246 				cursor->left_block is used to store a pointer
1247 				to the left neighbor page, in the cases
1248 				BTR_SEARCH_PREV and BTR_MODIFY_PREV;
1249 				NOTE that if ahi_latch, we might not have a
1250 				cursor page latch, we assume that ahi_latch
1251 				protects the record! */
1252 	btr_cur_t*	cursor, /*!< in/out: tree cursor; the cursor page is
1253 				s- or x-latched, but see also above! */
1254 #ifdef BTR_CUR_HASH_ADAPT
1255 	rw_lock_t*	ahi_latch,
1256 				/*!< in: currently held btr_search_latch
1257 				(in RW_S_LATCH mode), or NULL */
1258 #endif /* BTR_CUR_HASH_ADAPT */
1259 	const char*	file,	/*!< in: file name */
1260 	unsigned	line,	/*!< in: line where called */
1261 	mtr_t*		mtr,	/*!< in: mtr */
1262 	ib_uint64_t	autoinc)/*!< in: PAGE_ROOT_AUTO_INC to be written
1263 				(0 if none) */
1264 {
1265 	page_t*		page = NULL; /* remove warning */
1266 	buf_block_t*	block;
1267 	buf_block_t*	guess;
1268 	ulint		height;
1269 	ulint		up_match;
1270 	ulint		up_bytes;
1271 	ulint		low_match;
1272 	ulint		low_bytes;
1273 	ulint		rw_latch;
1274 	page_cur_mode_t	page_mode;
1275 	page_cur_mode_t	search_mode = PAGE_CUR_UNSUPP;
1276 	ulint		buf_mode;
1277 	ulint		estimate;
1278 	ulint		node_ptr_max_size = srv_page_size / 2;
1279 	page_cur_t*	page_cursor;
1280 	btr_op_t	btr_op;
1281 	ulint		root_height = 0; /* remove warning */
1282 	dberr_t		err = DB_SUCCESS;
1283 
1284 	btr_intention_t	lock_intention;
1285 	bool		modify_external;
1286 	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
1287 	ulint		tree_savepoints[BTR_MAX_LEVELS];
1288 	ulint		n_blocks = 0;
1289 	ulint		n_releases = 0;
1290 	bool		detected_same_key_root = false;
1291 
1292 	bool		retrying_for_search_prev = false;
1293 	ulint		leftmost_from_level = 0;
1294 	buf_block_t**	prev_tree_blocks = NULL;
1295 	ulint*		prev_tree_savepoints = NULL;
1296 	ulint		prev_n_blocks = 0;
1297 	ulint		prev_n_releases = 0;
1298 	bool		need_path = true;
1299 	bool		rtree_parent_modified = false;
1300 	bool		mbr_adj = false;
1301 	bool		found = false;
1302 
1303 	DBUG_ENTER("btr_cur_search_to_nth_level");
1304 
1305 #ifdef BTR_CUR_ADAPT
1306 	btr_search_t*	info;
1307 #endif /* BTR_CUR_ADAPT */
1308 	mem_heap_t*	heap		= NULL;
1309 	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
1310 	rec_offs*	offsets		= offsets_;
1311 	rec_offs	offsets2_[REC_OFFS_NORMAL_SIZE];
1312 	rec_offs*	offsets2	= offsets2_;
1313 	rec_offs_init(offsets_);
1314 	rec_offs_init(offsets2_);
1315 	/* Currently, PAGE_CUR_LE is the only search mode used for searches
1316 	ending to upper levels */
1317 
1318 	ut_ad(level == 0 || mode == PAGE_CUR_LE
1319 	      || RTREE_SEARCH_MODE(mode));
1320 	ut_ad(dict_index_check_search_tuple(index, tuple));
1321 	ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
1322 	ut_ad(dtuple_check_typed(tuple));
1323 	ut_ad(!(index->type & DICT_FTS));
1324 	ut_ad(index->page != FIL_NULL);
1325 
1326 	MEM_UNDEFINED(&cursor->up_match, sizeof cursor->up_match);
1327 	MEM_UNDEFINED(&cursor->up_bytes, sizeof cursor->up_bytes);
1328 	MEM_UNDEFINED(&cursor->low_match, sizeof cursor->low_match);
1329 	MEM_UNDEFINED(&cursor->low_bytes, sizeof cursor->low_bytes);
1330 #ifdef UNIV_DEBUG
1331 	cursor->up_match = ULINT_UNDEFINED;
1332 	cursor->low_match = ULINT_UNDEFINED;
1333 #endif /* UNIV_DEBUG */
1334 
1335 	ibool	s_latch_by_caller;
1336 
1337 	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
1338 
1339 	ut_ad(!s_latch_by_caller
1340 	      || srv_read_only_mode
1341 	      || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK
1342 					    | MTR_MEMO_SX_LOCK));
1343 
1344 	/* These flags are mutually exclusive, they are lumped together
1345 	with the latch mode for historical reasons. It's possible for
1346 	none of the flags to be set. */
1347 	switch (UNIV_EXPECT(latch_mode
1348 			    & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
1349 			    0)) {
1350 	case 0:
1351 		btr_op = BTR_NO_OP;
1352 		break;
1353 	case BTR_INSERT:
1354 		btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
1355 			? BTR_INSERT_IGNORE_UNIQUE_OP
1356 			: BTR_INSERT_OP;
1357 		break;
1358 	case BTR_DELETE:
1359 		btr_op = BTR_DELETE_OP;
1360 		ut_a(cursor->purge_node);
1361 		break;
1362 	case BTR_DELETE_MARK:
1363 		btr_op = BTR_DELMARK_OP;
1364 		break;
1365 	default:
1366 		/* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
1367 		should be specified at a time */
1368 		ut_error;
1369 	}
1370 
1371 	/* Operations on the insert buffer tree cannot be buffered. */
1372 	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
1373 	/* Operations on the clustered index cannot be buffered. */
1374 	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
1375 	/* Operations on the temporary table(indexes) cannot be buffered. */
1376 	ut_ad(btr_op == BTR_NO_OP || !index->table->is_temporary());
1377 	/* Operation on the spatial index cannot be buffered. */
1378 	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index));
1379 
1380 	estimate = latch_mode & BTR_ESTIMATE;
1381 
1382 	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
1383 
1384 	modify_external = latch_mode & BTR_MODIFY_EXTERNAL;
1385 
1386 	/* Turn the flags unrelated to the latch mode off. */
1387 	latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
1388 
1389 	ut_ad(!modify_external || latch_mode == BTR_MODIFY_LEAF);
1390 
1391 	ut_ad(!s_latch_by_caller
1392 	      || latch_mode == BTR_SEARCH_LEAF
1393 	      || latch_mode == BTR_SEARCH_TREE
1394 	      || latch_mode == BTR_MODIFY_LEAF);
1395 
1396 	ut_ad(autoinc == 0 || dict_index_is_clust(index));
1397 	ut_ad(autoinc == 0
1398 	      || latch_mode == BTR_MODIFY_TREE
1399 	      || latch_mode == BTR_MODIFY_LEAF);
1400 	ut_ad(autoinc == 0 || level == 0);
1401 
1402 	cursor->flag = BTR_CUR_BINARY;
1403 	cursor->index = index;
1404 
1405 #ifndef BTR_CUR_ADAPT
1406 	guess = NULL;
1407 #else
1408 	info = btr_search_get_info(index);
1409 	guess = info->root_guess;
1410 
1411 #ifdef BTR_CUR_HASH_ADAPT
1412 
1413 # ifdef UNIV_SEARCH_PERF_STAT
1414 	info->n_searches++;
1415 # endif
1416 	if (autoinc == 0
1417 	    && latch_mode <= BTR_MODIFY_LEAF
1418 	    && info->last_hash_succ
1419 # ifdef MYSQL_INDEX_DISABLE_AHI
1420 	    && !index->disable_ahi
1421 # endif
1422 	    && !estimate
1423 # ifdef PAGE_CUR_LE_OR_EXTENDS
1424 	    && mode != PAGE_CUR_LE_OR_EXTENDS
1425 # endif /* PAGE_CUR_LE_OR_EXTENDS */
1426 	    && !dict_index_is_spatial(index)
1427 	    /* If !ahi_latch, we do a dirty read of
1428 	    btr_search_enabled below, and btr_search_guess_on_hash()
1429 	    will have to check it again. */
1430 	    && btr_search_enabled
1431 	    && !modify_external
1432 	    && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG)
1433 	    && btr_search_guess_on_hash(index, info, tuple, mode,
1434 					latch_mode, cursor,
1435 					ahi_latch, mtr)) {
1436 
1437 		/* Search using the hash index succeeded */
1438 
1439 		ut_ad(cursor->up_match != ULINT_UNDEFINED
1440 		      || mode != PAGE_CUR_GE);
1441 		ut_ad(cursor->up_match != ULINT_UNDEFINED
1442 		      || mode != PAGE_CUR_LE);
1443 		ut_ad(cursor->low_match != ULINT_UNDEFINED
1444 		      || mode != PAGE_CUR_LE);
1445 		btr_cur_n_sea++;
1446 
1447 		DBUG_RETURN(err);
1448 	}
1449 # endif /* BTR_CUR_HASH_ADAPT */
1450 #endif /* BTR_CUR_ADAPT */
1451 	btr_cur_n_non_sea++;
1452 
1453 	/* If the hash search did not succeed, do binary search down the
1454 	tree */
1455 
1456 #ifdef BTR_CUR_HASH_ADAPT
1457 	if (ahi_latch) {
1458 		/* Release possible search latch to obey latching order */
1459 		rw_lock_s_unlock(ahi_latch);
1460 	}
1461 #endif /* BTR_CUR_HASH_ADAPT */
1462 
1463 	/* Store the position of the tree latch we push to mtr so that we
1464 	know how to release it when we have latched leaf node(s) */
1465 
1466 	ulint savepoint = mtr_set_savepoint(mtr);
1467 
1468 	rw_lock_type_t upper_rw_latch;
1469 
1470 	switch (latch_mode) {
1471 	case BTR_MODIFY_TREE:
1472 		/* Most of delete-intended operations are purging.
1473 		Free blocks and read IO bandwidth should be prior
1474 		for them, when the history list is glowing huge. */
1475 		if (lock_intention == BTR_INTENTION_DELETE
1476 		    && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
1477 		    && buf_pool.n_pend_reads) {
1478 x_latch_index:
1479 			mtr_x_lock_index(index, mtr);
1480 		} else if (index->is_spatial()
1481 			   && lock_intention <= BTR_INTENTION_BOTH) {
1482 			/* X lock the if there is possibility of
1483 			pessimistic delete on spatial index. As we could
1484 			lock upward for the tree */
1485 			goto x_latch_index;
1486 		} else {
1487 			mtr_sx_lock_index(index, mtr);
1488 		}
1489 		upper_rw_latch = RW_X_LATCH;
1490 		break;
1491 	case BTR_CONT_MODIFY_TREE:
1492 	case BTR_CONT_SEARCH_TREE:
1493 		/* Do nothing */
1494 		ut_ad(srv_read_only_mode
1495 		      || mtr->memo_contains_flagged(&index->lock,
1496 						    MTR_MEMO_X_LOCK
1497 						    | MTR_MEMO_SX_LOCK));
1498 		if (dict_index_is_spatial(index)
1499 		    && latch_mode == BTR_CONT_MODIFY_TREE) {
1500 			/* If we are about to locating parent page for split
1501 			and/or merge operation for R-Tree index, X latch
1502 			the parent */
1503 			upper_rw_latch = RW_X_LATCH;
1504 		} else {
1505 			upper_rw_latch = RW_NO_LATCH;
1506 		}
1507 		break;
1508 	default:
1509 		if (!srv_read_only_mode) {
1510 			if (s_latch_by_caller) {
1511 				ut_ad(rw_lock_own(dict_index_get_lock(index),
1512 				              RW_LOCK_S));
1513 			} else if (!modify_external) {
1514 				/* BTR_SEARCH_TREE is intended to be used with
1515 				BTR_ALREADY_S_LATCHED */
1516 				ut_ad(latch_mode != BTR_SEARCH_TREE);
1517 
1518 				mtr_s_lock_index(index, mtr);
1519 			} else {
1520 				/* BTR_MODIFY_EXTERNAL needs to be excluded */
1521 				mtr_sx_lock_index(index, mtr);
1522 			}
1523 			upper_rw_latch = RW_S_LATCH;
1524 		} else {
1525 			upper_rw_latch = RW_NO_LATCH;
1526 		}
1527 	}
1528 	const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
1529 		latch_mode);
1530 
1531 	page_cursor = btr_cur_get_page_cur(cursor);
1532 
1533 	const ulint		zip_size = index->table->space->zip_size();
1534 
1535 	/* Start with the root page. */
1536 	page_id_t		page_id(index->table->space_id, index->page);
1537 
1538 	if (root_leaf_rw_latch == RW_X_LATCH) {
1539 		node_ptr_max_size = btr_node_ptr_max_size(index);
1540 	}
1541 
1542 	up_match = 0;
1543 	up_bytes = 0;
1544 	low_match = 0;
1545 	low_bytes = 0;
1546 
1547 	height = ULINT_UNDEFINED;
1548 
1549 	/* We use these modified search modes on non-leaf levels of the
1550 	B-tree. These let us end up in the right B-tree leaf. In that leaf
1551 	we use the original search mode. */
1552 
1553 	switch (mode) {
1554 	case PAGE_CUR_GE:
1555 		page_mode = PAGE_CUR_L;
1556 		break;
1557 	case PAGE_CUR_G:
1558 		page_mode = PAGE_CUR_LE;
1559 		break;
1560 	default:
1561 #ifdef PAGE_CUR_LE_OR_EXTENDS
1562 		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
1563 		      || RTREE_SEARCH_MODE(mode)
1564 		      || mode == PAGE_CUR_LE_OR_EXTENDS);
1565 #else /* PAGE_CUR_LE_OR_EXTENDS */
1566 		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
1567 		      || RTREE_SEARCH_MODE(mode));
1568 #endif /* PAGE_CUR_LE_OR_EXTENDS */
1569 		page_mode = mode;
1570 		break;
1571 	}
1572 
1573 	/* Loop and search until we arrive at the desired level */
1574 	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
1575 
1576 search_loop:
1577 	buf_mode = BUF_GET;
1578 	rw_latch = RW_NO_LATCH;
1579 	rtree_parent_modified = false;
1580 
1581 	if (height != 0) {
1582 		/* We are about to fetch the root or a non-leaf page. */
1583 		if ((latch_mode != BTR_MODIFY_TREE || height == level)
1584 		    && !retrying_for_search_prev) {
1585 			/* If doesn't have SX or X latch of index,
1586 			each pages should be latched before reading. */
1587 			if (height == ULINT_UNDEFINED
1588 			    && upper_rw_latch == RW_S_LATCH
1589 			    && (modify_external || autoinc)) {
1590 				/* needs sx-latch of root page
1591 				for fseg operation or for writing
1592 				PAGE_ROOT_AUTO_INC */
1593 				rw_latch = RW_SX_LATCH;
1594 			} else {
1595 				rw_latch = upper_rw_latch;
1596 			}
1597 		}
1598 	} else if (latch_mode <= BTR_MODIFY_LEAF) {
1599 		rw_latch = latch_mode;
1600 
1601 		if (btr_op != BTR_NO_OP
1602 		    && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {
1603 
1604 			/* Try to buffer the operation if the leaf
1605 			page is not in the buffer pool. */
1606 
1607 			buf_mode = btr_op == BTR_DELETE_OP
1608 				? BUF_GET_IF_IN_POOL_OR_WATCH
1609 				: BUF_GET_IF_IN_POOL;
1610 		}
1611 	}
1612 
1613 retry_page_get:
1614 	ut_ad(n_blocks < BTR_MAX_LEVELS);
1615 	tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
1616 	block = buf_page_get_gen(page_id, zip_size, rw_latch, guess,
1617 				 buf_mode, file, line, mtr, &err,
1618 				 height == 0 && !index->is_clust());
1619 	tree_blocks[n_blocks] = block;
1620 
1621 	/* Note that block==NULL signifies either an error or change
1622 	buffering. */
1623 
1624 	if (err != DB_SUCCESS) {
1625 		ut_ad(block == NULL);
1626 		if (err == DB_DECRYPTION_FAILED) {
1627 			ib_push_warning((void *)NULL,
1628 				DB_DECRYPTION_FAILED,
1629 				"Table %s is encrypted but encryption service or"
1630 				" used key_id is not available. "
1631 				" Can't continue reading table.",
1632 				index->table->name.m_name);
1633 			index->table->file_unreadable = true;
1634 		}
1635 
1636 		goto func_exit;
1637 	}
1638 
1639 	if (block == NULL) {
1640 		/* This must be a search to perform an insert/delete
1641 		mark/ delete; try using the insert/delete buffer */
1642 
1643 		ut_ad(height == 0);
1644 		ut_ad(cursor->thr);
1645 
1646 		switch (btr_op) {
1647 		case BTR_INSERT_OP:
1648 		case BTR_INSERT_IGNORE_UNIQUE_OP:
1649 			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
1650 			ut_ad(!dict_index_is_spatial(index));
1651 
1652 			if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
1653 					page_id, zip_size, cursor->thr)) {
1654 
1655 				cursor->flag = BTR_CUR_INSERT_TO_IBUF;
1656 
1657 				goto func_exit;
1658 			}
1659 			break;
1660 
1661 		case BTR_DELMARK_OP:
1662 			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
1663 			ut_ad(!dict_index_is_spatial(index));
1664 
1665 			if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
1666 					index, page_id, zip_size,
1667 					cursor->thr)) {
1668 
1669 				cursor->flag = BTR_CUR_DEL_MARK_IBUF;
1670 
1671 				goto func_exit;
1672 			}
1673 
1674 			break;
1675 
1676 		case BTR_DELETE_OP:
1677 			ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
1678 			ut_ad(!dict_index_is_spatial(index));
1679 
1680 			if (!row_purge_poss_sec(cursor->purge_node,
1681 						index, tuple)) {
1682 
1683 				/* The record cannot be purged yet. */
1684 				cursor->flag = BTR_CUR_DELETE_REF;
1685 			} else if (ibuf_insert(IBUF_OP_DELETE, tuple,
1686 					       index, page_id, zip_size,
1687 					       cursor->thr)) {
1688 
1689 				/* The purge was buffered. */
1690 				cursor->flag = BTR_CUR_DELETE_IBUF;
1691 			} else {
1692 				/* The purge could not be buffered. */
1693 				buf_pool.watch_unset(page_id);
1694 				break;
1695 			}
1696 
1697 			buf_pool.watch_unset(page_id);
1698 			goto func_exit;
1699 
1700 		default:
1701 			ut_error;
1702 		}
1703 
1704 		/* Insert to the insert/delete buffer did not succeed, we
1705 		must read the page from disk. */
1706 
1707 		buf_mode = BUF_GET;
1708 
1709 		goto retry_page_get;
1710 	}
1711 
1712 	if (retrying_for_search_prev && height != 0) {
1713 		/* also latch left sibling */
1714 		uint32_t	left_page_no;
1715 		buf_block_t*	get_block;
1716 
1717 		ut_ad(rw_latch == RW_NO_LATCH);
1718 
1719 		rw_latch = upper_rw_latch;
1720 
1721 		rw_lock_s_lock(&block->lock);
1722 		left_page_no = btr_page_get_prev(buf_block_get_frame(block));
1723 		rw_lock_s_unlock(&block->lock);
1724 
1725 		if (left_page_no != FIL_NULL) {
1726 			ut_ad(prev_n_blocks < leftmost_from_level);
1727 
1728 			prev_tree_savepoints[prev_n_blocks]
1729 				= mtr_set_savepoint(mtr);
1730 			get_block = buf_page_get_gen(
1731 				page_id_t(page_id.space(), left_page_no),
1732 				zip_size, rw_latch, NULL, buf_mode,
1733 				file, line, mtr, &err);
1734 			prev_tree_blocks[prev_n_blocks] = get_block;
1735 			prev_n_blocks++;
1736 
1737 			if (err != DB_SUCCESS) {
1738 				if (err == DB_DECRYPTION_FAILED) {
1739 					ib_push_warning((void *)NULL,
1740 						DB_DECRYPTION_FAILED,
1741 						"Table %s is encrypted but encryption service or"
1742 						" used key_id is not available. "
1743 						" Can't continue reading table.",
1744 						index->table->name.m_name);
1745 					index->table->file_unreadable = true;
1746 				}
1747 
1748 				goto func_exit;
1749 			}
1750 
1751 			/* BTR_MODIFY_TREE doesn't update prev/next_page_no,
1752 			without their parent page's lock. So, not needed to
1753 			retry here, because we have the parent page's lock. */
1754 		}
1755 
1756 		/* release RW_NO_LATCH page and lock with RW_S_LATCH */
1757 		mtr_release_block_at_savepoint(
1758 			mtr, tree_savepoints[n_blocks],
1759 			tree_blocks[n_blocks]);
1760 
1761 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
1762 		block = buf_page_get_gen(page_id, zip_size,
1763 					 rw_latch, NULL, buf_mode,
1764 					 file, line, mtr, &err);
1765 		tree_blocks[n_blocks] = block;
1766 
1767 		if (err != DB_SUCCESS) {
1768 			if (err == DB_DECRYPTION_FAILED) {
1769 				ib_push_warning((void *)NULL,
1770 					DB_DECRYPTION_FAILED,
1771 					"Table %s is encrypted but encryption service or"
1772 					" used key_id is not available. "
1773 					" Can't continue reading table.",
1774 					index->table->name.m_name);
1775 				index->table->file_unreadable = true;
1776 			}
1777 
1778 			goto func_exit;
1779 		}
1780 	}
1781 
1782 	page = buf_block_get_frame(block);
1783 
1784 	if (height == ULINT_UNDEFINED
1785 	    && page_is_leaf(page)
1786 	    && rw_latch != RW_NO_LATCH
1787 	    && rw_latch != root_leaf_rw_latch) {
1788 		/* The root page is also a leaf page (root_leaf).
1789 		We should reacquire the page, because the root page
1790 		is latched differently from leaf pages. */
1791 		ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
1792 		ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH);
1793 		ut_ad(rw_latch == RW_S_LATCH || modify_external || autoinc);
1794 		ut_ad(!autoinc || root_leaf_rw_latch == RW_X_LATCH);
1795 
1796 		ut_ad(n_blocks == 0);
1797 		mtr_release_block_at_savepoint(
1798 			mtr, tree_savepoints[n_blocks],
1799 			tree_blocks[n_blocks]);
1800 
1801 		upper_rw_latch = root_leaf_rw_latch;
1802 		goto search_loop;
1803 	}
1804 
1805 	if (rw_latch != RW_NO_LATCH) {
1806 #ifdef UNIV_ZIP_DEBUG
1807 		const page_zip_des_t*	page_zip
1808 			= buf_block_get_page_zip(block);
1809 		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
1810 #endif /* UNIV_ZIP_DEBUG */
1811 
1812 		buf_block_dbg_add_level(
1813 			block, dict_index_is_ibuf(index)
1814 			? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
1815 	}
1816 
1817 	ut_ad(fil_page_index_page_check(page));
1818 	ut_ad(index->id == btr_page_get_index_id(page));
1819 
1820 	if (height == ULINT_UNDEFINED) {
1821 		/* We are in the root node */
1822 
1823 		height = btr_page_get_level(page);
1824 		root_height = height;
1825 		cursor->tree_height = root_height + 1;
1826 
1827 		if (dict_index_is_spatial(index)) {
1828 			ut_ad(cursor->rtr_info);
1829 
1830 			/* If SSN in memory is not initialized, fetch
1831 			it from root page */
1832 			if (!rtr_get_current_ssn_id(index)) {
1833 				/* FIXME: do this in dict_load_table_one() */
1834 				index->set_ssn(page_get_ssn_id(page) + 1);
1835 			}
1836 
1837 			/* Save the MBR */
1838 			cursor->rtr_info->thr = cursor->thr;
1839 			rtr_get_mbr_from_tuple(tuple, &cursor->rtr_info->mbr);
1840 		}
1841 
1842 #ifdef BTR_CUR_ADAPT
1843 		info->root_guess = block;
1844 #endif
1845 	}
1846 
1847 	if (height == 0) {
1848 		if (rw_latch == RW_NO_LATCH) {
1849 			latch_leaves = btr_cur_latch_leaves(
1850 				block, latch_mode, cursor, mtr);
1851 		}
1852 
1853 		switch (latch_mode) {
1854 		case BTR_MODIFY_TREE:
1855 		case BTR_CONT_MODIFY_TREE:
1856 		case BTR_CONT_SEARCH_TREE:
1857 			break;
1858 		default:
1859 			if (!s_latch_by_caller
1860 			    && !srv_read_only_mode
1861 			    && !modify_external) {
1862 				/* Release the tree s-latch */
1863 				/* NOTE: BTR_MODIFY_EXTERNAL
1864 				needs to keep tree sx-latch */
1865 				mtr_release_s_latch_at_savepoint(
1866 					mtr, savepoint,
1867 					dict_index_get_lock(index));
1868 			}
1869 
1870 			/* release upper blocks */
1871 			if (retrying_for_search_prev) {
1872 				ut_ad(!autoinc);
1873 				for (;
1874 				     prev_n_releases < prev_n_blocks;
1875 				     prev_n_releases++) {
1876 					mtr_release_block_at_savepoint(
1877 						mtr,
1878 						prev_tree_savepoints[
1879 							prev_n_releases],
1880 						prev_tree_blocks[
1881 							prev_n_releases]);
1882 				}
1883 			}
1884 
1885 			for (; n_releases < n_blocks; n_releases++) {
1886 				if (n_releases == 0
1887 				    && (modify_external || autoinc)) {
1888 					/* keep the root page latch */
1889 					ut_ad(mtr->memo_contains_flagged(
1890 						      tree_blocks[n_releases],
1891 						      MTR_MEMO_PAGE_SX_FIX
1892 						      | MTR_MEMO_PAGE_X_FIX));
1893 					continue;
1894 				}
1895 
1896 				mtr_release_block_at_savepoint(
1897 					mtr, tree_savepoints[n_releases],
1898 					tree_blocks[n_releases]);
1899 			}
1900 		}
1901 
1902 		page_mode = mode;
1903 	}
1904 
1905 	if (dict_index_is_spatial(index)) {
1906 		/* Remember the page search mode */
1907 		search_mode = page_mode;
1908 
1909 		/* Some adjustment on search mode, when the
1910 		page search mode is PAGE_CUR_RTREE_LOCATE
1911 		or PAGE_CUR_RTREE_INSERT, as we are searching
1912 		with MBRs. When it is not the target level, we
1913 		should search all sub-trees that "CONTAIN" the
1914 		search range/MBR. When it is at the target
1915 		level, the search becomes PAGE_CUR_LE */
1916 		if (page_mode == PAGE_CUR_RTREE_LOCATE
1917 		    && level == height) {
1918 			if (level == 0) {
1919 				page_mode = PAGE_CUR_LE;
1920 			} else {
1921 				page_mode = PAGE_CUR_RTREE_GET_FATHER;
1922 			}
1923 		}
1924 
1925 		if (page_mode == PAGE_CUR_RTREE_INSERT) {
1926 			page_mode = (level == height)
1927 					? PAGE_CUR_LE
1928 					: PAGE_CUR_RTREE_INSERT;
1929 
1930 			ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE);
1931 		}
1932 
1933 		/* "need_path" indicates if we need to tracking the parent
1934 		pages, if it is not spatial comparison, then no need to
1935 		track it */
1936 		if (page_mode < PAGE_CUR_CONTAIN) {
1937 			need_path = false;
1938 		}
1939 
1940 		up_match = 0;
1941 		low_match = 0;
1942 
1943 		if (latch_mode == BTR_MODIFY_TREE
1944 		    || latch_mode == BTR_CONT_MODIFY_TREE
1945 		    || latch_mode == BTR_CONT_SEARCH_TREE) {
1946 			/* Tree are locked, no need for Page Lock to protect
1947 			the "path" */
1948 			cursor->rtr_info->need_page_lock = false;
1949 		}
1950         }
1951 
1952 	if (dict_index_is_spatial(index) && page_mode >= PAGE_CUR_CONTAIN) {
1953 		ut_ad(need_path);
1954 		found = rtr_cur_search_with_match(
1955 			block, index, tuple, page_mode, page_cursor,
1956 			cursor->rtr_info);
1957 
1958 		/* Need to use BTR_MODIFY_TREE to do the MBR adjustment */
1959 		if (search_mode == PAGE_CUR_RTREE_INSERT
1960 		    && cursor->rtr_info->mbr_adj) {
1961 			if (latch_mode & BTR_MODIFY_LEAF) {
1962 				/* Parent MBR needs updated, should retry
1963 				with BTR_MODIFY_TREE */
1964 				goto func_exit;
1965 			} else if (latch_mode & BTR_MODIFY_TREE) {
1966 				rtree_parent_modified = true;
1967 				cursor->rtr_info->mbr_adj = false;
1968 				mbr_adj = true;
1969 			} else {
1970 				ut_ad(0);
1971 			}
1972 		}
1973 
1974 		if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) {
1975 			cursor->low_match =
1976 				DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
1977 		}
1978 #ifdef BTR_CUR_HASH_ADAPT
1979 	} else if (height == 0 && btr_search_enabled
1980 		   && !(tuple->info_bits & REC_INFO_MIN_REC_FLAG)
1981 		   && !dict_index_is_spatial(index)) {
1982 		/* The adaptive hash index is only used when searching
1983 		for leaf pages (height==0), but not in r-trees.
1984 		We only need the byte prefix comparison for the purpose
1985 		of updating the adaptive hash index. */
1986 		page_cur_search_with_match_bytes(
1987 			block, index, tuple, page_mode, &up_match, &up_bytes,
1988 			&low_match, &low_bytes, page_cursor);
1989 #endif /* BTR_CUR_HASH_ADAPT */
1990 	} else {
1991 		/* Search for complete index fields. */
1992 		up_bytes = low_bytes = 0;
1993 		page_cur_search_with_match(
1994 			block, index, tuple, page_mode, &up_match,
1995 			&low_match, page_cursor,
1996 			need_path ? cursor->rtr_info : NULL);
1997 	}
1998 
1999 	if (estimate) {
2000 		btr_cur_add_path_info(cursor, height, root_height);
2001 	}
2002 
2003 	/* If this is the desired level, leave the loop */
2004 
2005 	ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor)));
2006 
2007 	/* Add Predicate lock if it is serializable isolation
2008 	and only if it is in the search case */
2009 	if (dict_index_is_spatial(index)
2010 	    && cursor->rtr_info->need_prdt_lock
2011 	    && mode != PAGE_CUR_RTREE_INSERT
2012 	    && mode != PAGE_CUR_RTREE_LOCATE
2013 	    && mode >= PAGE_CUR_CONTAIN) {
2014 		trx_t*		trx = thr_get_trx(cursor->thr);
2015 		lock_prdt_t	prdt;
2016 
2017 		lock_mutex_enter();
2018 		lock_init_prdt_from_mbr(
2019 			&prdt, &cursor->rtr_info->mbr, mode,
2020 			trx->lock.lock_heap);
2021 		lock_mutex_exit();
2022 
2023 		if (rw_latch == RW_NO_LATCH && height != 0) {
2024 			rw_lock_s_lock(&(block->lock));
2025 		}
2026 
2027 		lock_prdt_lock(block, &prdt, index, LOCK_S,
2028 			       LOCK_PREDICATE, cursor->thr);
2029 
2030 		if (rw_latch == RW_NO_LATCH && height != 0) {
2031 			rw_lock_s_unlock(&(block->lock));
2032 		}
2033 	}
2034 
2035 	if (level != height) {
2036 
2037 		const rec_t*	node_ptr;
2038 		ut_ad(height > 0);
2039 
2040 		height--;
2041 		guess = NULL;
2042 
2043 		node_ptr = page_cur_get_rec(page_cursor);
2044 
2045 		offsets = rec_get_offsets(node_ptr, index, offsets, 0,
2046 					  ULINT_UNDEFINED, &heap);
2047 
2048 		/* If the rec is the first or last in the page for
2049 		pessimistic delete intention, it might cause node_ptr insert
2050 		for the upper level. We should change the intention and retry.
2051 		*/
2052 		if (latch_mode == BTR_MODIFY_TREE
2053 		    && btr_cur_need_opposite_intention(
2054 			page, lock_intention, node_ptr)) {
2055 
2056 need_opposite_intention:
2057 			ut_ad(upper_rw_latch == RW_X_LATCH);
2058 
2059 			if (n_releases > 0) {
2060 				/* release root block */
2061 				mtr_release_block_at_savepoint(
2062 					mtr, tree_savepoints[0],
2063 					tree_blocks[0]);
2064 			}
2065 
2066 			/* release all blocks */
2067 			for (; n_releases <= n_blocks; n_releases++) {
2068 				mtr_release_block_at_savepoint(
2069 					mtr, tree_savepoints[n_releases],
2070 					tree_blocks[n_releases]);
2071 			}
2072 
2073 			lock_intention = BTR_INTENTION_BOTH;
2074 
2075 			page_id.set_page_no(index->page);
2076 			up_match = 0;
2077 			low_match = 0;
2078 			height = ULINT_UNDEFINED;
2079 
2080 			n_blocks = 0;
2081 			n_releases = 0;
2082 
2083 			goto search_loop;
2084 		}
2085 
2086 		if (dict_index_is_spatial(index)) {
2087 			if (page_rec_is_supremum(node_ptr)) {
2088 				cursor->low_match = 0;
2089 				cursor->up_match = 0;
2090 				goto func_exit;
2091 			}
2092 
2093 			/* If we are doing insertion or record locating,
2094 			remember the tree nodes we visited */
2095 			if (page_mode == PAGE_CUR_RTREE_INSERT
2096 			    || (search_mode == PAGE_CUR_RTREE_LOCATE
2097 			        && (latch_mode != BTR_MODIFY_LEAF))) {
2098 				bool		add_latch = false;
2099 
2100 				if (latch_mode == BTR_MODIFY_TREE
2101 				    && rw_latch == RW_NO_LATCH) {
2102 					ut_ad(mtr->memo_contains_flagged(
2103 						&index->lock, MTR_MEMO_X_LOCK
2104 						| MTR_MEMO_SX_LOCK));
2105 					rw_lock_s_lock(&block->lock);
2106 					add_latch = true;
2107 				}
2108 
2109 				/* Store the parent cursor location */
2110 #ifdef UNIV_DEBUG
2111 				ulint	num_stored = rtr_store_parent_path(
2112 					block, cursor, latch_mode,
2113 					height + 1, mtr);
2114 #else
2115 				rtr_store_parent_path(
2116 					block, cursor, latch_mode,
2117 					height + 1, mtr);
2118 #endif
2119 
2120 				if (page_mode == PAGE_CUR_RTREE_INSERT) {
2121 					btr_pcur_t*     r_cursor =
2122 						rtr_get_parent_cursor(
2123 							cursor, height + 1,
2124 							true);
2125 					/* If it is insertion, there should
2126 					be only one parent for each level
2127 					traverse */
2128 #ifdef UNIV_DEBUG
2129 					ut_ad(num_stored == 1);
2130 #endif
2131 
2132 					node_ptr = btr_pcur_get_rec(r_cursor);
2133 
2134 				}
2135 
2136 				if (add_latch) {
2137 					rw_lock_s_unlock(&block->lock);
2138 				}
2139 
2140 				ut_ad(!page_rec_is_supremum(node_ptr));
2141 			}
2142 
2143 			ut_ad(page_mode == search_mode
2144 			      || (page_mode == PAGE_CUR_WITHIN
2145 				  && search_mode == PAGE_CUR_RTREE_LOCATE));
2146 
2147 			page_mode = search_mode;
2148 		}
2149 
2150 		/* If the first or the last record of the page
2151 		or the same key value to the first record or last record,
2152 		the another page might be chosen when BTR_CONT_MODIFY_TREE.
2153 		So, the parent page should not released to avoiding deadlock
2154 		with blocking the another search with the same key value. */
2155 		if (!detected_same_key_root
2156 		    && lock_intention == BTR_INTENTION_BOTH
2157 		    && !dict_index_is_unique(index)
2158 		    && latch_mode == BTR_MODIFY_TREE
2159 		    && (up_match >= rec_offs_n_fields(offsets) - 1
2160 			|| low_match >= rec_offs_n_fields(offsets) - 1)) {
2161 			const rec_t*	first_rec = page_rec_get_next_const(
2162 				page_get_infimum_rec(page));
2163 			ulint		matched_fields;
2164 
2165 			ut_ad(upper_rw_latch == RW_X_LATCH);
2166 
2167 			if (node_ptr == first_rec
2168 			    || page_rec_is_last(node_ptr, page)) {
2169 				detected_same_key_root = true;
2170 			} else {
2171 				matched_fields = 0;
2172 
2173 				offsets2 = rec_get_offsets(
2174 					first_rec, index, offsets2,
2175 					0, ULINT_UNDEFINED, &heap);
2176 				cmp_rec_rec(node_ptr, first_rec,
2177 					    offsets, offsets2, index, false,
2178 					    &matched_fields);
2179 
2180 				if (matched_fields
2181 				    >= rec_offs_n_fields(offsets) - 1) {
2182 					detected_same_key_root = true;
2183 				} else {
2184 					const rec_t*	last_rec;
2185 
2186 					last_rec = page_rec_get_prev_const(
2187 						page_get_supremum_rec(page));
2188 
2189 					matched_fields = 0;
2190 
2191 					offsets2 = rec_get_offsets(
2192 						last_rec, index, offsets2,
2193 						0, ULINT_UNDEFINED, &heap);
2194 					cmp_rec_rec(
2195 						node_ptr, last_rec,
2196 						offsets, offsets2, index,
2197 						false, &matched_fields);
2198 					if (matched_fields
2199 					    >= rec_offs_n_fields(offsets) - 1) {
2200 						detected_same_key_root = true;
2201 					}
2202 				}
2203 			}
2204 		}
2205 
2206 		/* If the page might cause modify_tree,
2207 		we should not release the parent page's lock. */
2208 		if (!detected_same_key_root
2209 		    && latch_mode == BTR_MODIFY_TREE
2210 		    && !btr_cur_will_modify_tree(
2211 				index, page, lock_intention, node_ptr,
2212 				node_ptr_max_size, zip_size, mtr)
2213 		    && !rtree_parent_modified) {
2214 			ut_ad(upper_rw_latch == RW_X_LATCH);
2215 			ut_ad(n_releases <= n_blocks);
2216 
2217 			/* we can release upper blocks */
2218 			for (; n_releases < n_blocks; n_releases++) {
2219 				if (n_releases == 0) {
2220 					/* we should not release root page
2221 					to pin to same block. */
2222 					continue;
2223 				}
2224 
2225 				/* release unused blocks to unpin */
2226 				mtr_release_block_at_savepoint(
2227 					mtr, tree_savepoints[n_releases],
2228 					tree_blocks[n_releases]);
2229 			}
2230 		}
2231 
2232 		if (height == level
2233 		    && latch_mode == BTR_MODIFY_TREE) {
2234 			ut_ad(upper_rw_latch == RW_X_LATCH);
2235 			/* we should sx-latch root page, if released already.
2236 			It contains seg_header. */
2237 			if (n_releases > 0) {
2238 				mtr_block_sx_latch_at_savepoint(
2239 					mtr, tree_savepoints[0],
2240 					tree_blocks[0]);
2241 			}
2242 
2243 			/* x-latch the branch blocks not released yet. */
2244 			for (ulint i = n_releases; i <= n_blocks; i++) {
2245 				mtr_block_x_latch_at_savepoint(
2246 					mtr, tree_savepoints[i],
2247 					tree_blocks[i]);
2248 			}
2249 		}
2250 
2251 		/* We should consider prev_page of parent page, if the node_ptr
2252 		is the leftmost of the page. because BTR_SEARCH_PREV and
2253 		BTR_MODIFY_PREV latches prev_page of the leaf page. */
2254 		if ((latch_mode == BTR_SEARCH_PREV
2255 		     || latch_mode == BTR_MODIFY_PREV)
2256 		    && !retrying_for_search_prev) {
2257 			/* block should be latched for consistent
2258 			   btr_page_get_prev() */
2259 			ut_ad(mtr->memo_contains_flagged(
2260 				      block, MTR_MEMO_PAGE_S_FIX
2261 				      | MTR_MEMO_PAGE_X_FIX));
2262 
2263 			if (page_has_prev(page)
2264 			    && page_rec_is_first(node_ptr, page)) {
2265 
2266 				if (leftmost_from_level == 0) {
2267 					leftmost_from_level = height + 1;
2268 				}
2269 			} else {
2270 				leftmost_from_level = 0;
2271 			}
2272 
2273 			if (height == 0 && leftmost_from_level > 0) {
2274 				/* should retry to get also prev_page
2275 				from level==leftmost_from_level. */
2276 				retrying_for_search_prev = true;
2277 
2278 				prev_tree_blocks = static_cast<buf_block_t**>(
2279 					ut_malloc_nokey(sizeof(buf_block_t*)
2280 							* leftmost_from_level));
2281 
2282 				prev_tree_savepoints = static_cast<ulint*>(
2283 					ut_malloc_nokey(sizeof(ulint)
2284 							* leftmost_from_level));
2285 
2286 				/* back to the level (leftmost_from_level+1) */
2287 				ulint	idx = n_blocks
2288 					- (leftmost_from_level - 1);
2289 
2290 				page_id.set_page_no(
2291 					tree_blocks[idx]->page.id().page_no());
2292 
2293 				for (ulint i = n_blocks
2294 					       - (leftmost_from_level - 1);
2295 				     i <= n_blocks; i++) {
2296 					mtr_release_block_at_savepoint(
2297 						mtr, tree_savepoints[i],
2298 						tree_blocks[i]);
2299 				}
2300 
2301 				n_blocks -= (leftmost_from_level - 1);
2302 				height = leftmost_from_level;
2303 				ut_ad(n_releases == 0);
2304 
2305 				/* replay up_match, low_match */
2306 				up_match = 0;
2307 				low_match = 0;
2308 				rtr_info_t*	rtr_info	= need_path
2309 					? cursor->rtr_info : NULL;
2310 
2311 				for (ulint i = 0; i < n_blocks; i++) {
2312 					page_cur_search_with_match(
2313 						tree_blocks[i], index, tuple,
2314 						page_mode, &up_match,
2315 						&low_match, page_cursor,
2316 						rtr_info);
2317 				}
2318 
2319 				goto search_loop;
2320 			}
2321 		}
2322 
2323 		/* Go to the child node */
2324 		page_id.set_page_no(
2325 			btr_node_ptr_get_child_page_no(node_ptr, offsets));
2326 
2327 		n_blocks++;
2328 
2329 		if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
2330 			/* We're doing a search on an ibuf tree and we're one
2331 			level above the leaf page. */
2332 
2333 			ut_ad(level == 0);
2334 
2335 			buf_mode = BUF_GET;
2336 			rw_latch = RW_NO_LATCH;
2337 			goto retry_page_get;
2338 		}
2339 
2340 		if (dict_index_is_spatial(index)
2341 		    && page_mode >= PAGE_CUR_CONTAIN
2342 		    && page_mode != PAGE_CUR_RTREE_INSERT) {
2343 			ut_ad(need_path);
2344 			rtr_node_path_t* path =
2345 				cursor->rtr_info->path;
2346 
2347 			if (!path->empty() && found) {
2348 				ut_ad(path->back().page_no
2349 				      == page_id.page_no());
2350 				path->pop_back();
2351 #ifdef UNIV_DEBUG
2352 				if (page_mode == PAGE_CUR_RTREE_LOCATE
2353 				    && (latch_mode != BTR_MODIFY_LEAF)) {
2354 					btr_pcur_t*	cur
2355 					= cursor->rtr_info->parent_path->back(
2356 					  ).cursor;
2357 					rec_t*	my_node_ptr
2358 						= btr_pcur_get_rec(cur);
2359 
2360 					offsets = rec_get_offsets(
2361 						my_node_ptr, index, offsets,
2362 						0, ULINT_UNDEFINED, &heap);
2363 
2364 					ulint	my_page_no
2365 					= btr_node_ptr_get_child_page_no(
2366 						my_node_ptr, offsets);
2367 
2368 					ut_ad(page_id.page_no() == my_page_no);
2369 				}
2370 #endif
2371 			}
2372 		}
2373 
2374 		goto search_loop;
2375 	} else if (!dict_index_is_spatial(index)
2376 		   && latch_mode == BTR_MODIFY_TREE
2377 		   && lock_intention == BTR_INTENTION_INSERT
2378 		   && page_has_next(page)
2379 		   && page_rec_is_last(page_cur_get_rec(page_cursor), page)) {
2380 
2381 		/* btr_insert_into_right_sibling() might cause
2382 		deleting node_ptr at upper level */
2383 
2384 		guess = NULL;
2385 
2386 		if (height == 0) {
2387 			/* release the leaf pages if latched */
2388 			for (uint i = 0; i < 3; i++) {
2389 				if (latch_leaves.blocks[i] != NULL) {
2390 					mtr_release_block_at_savepoint(
2391 						mtr, latch_leaves.savepoints[i],
2392 						latch_leaves.blocks[i]);
2393 					latch_leaves.blocks[i] = NULL;
2394 				}
2395 			}
2396 		}
2397 
2398 		goto need_opposite_intention;
2399 	}
2400 
2401 	if (level != 0) {
2402 		ut_ad(!autoinc);
2403 
2404 		if (upper_rw_latch == RW_NO_LATCH) {
2405 			ut_ad(latch_mode == BTR_CONT_MODIFY_TREE
2406 			      || latch_mode == BTR_CONT_SEARCH_TREE);
2407 			buf_block_t* child_block = btr_block_get(
2408 				*index, page_id.page_no(),
2409 				latch_mode == BTR_CONT_MODIFY_TREE
2410 				? RW_X_LATCH : RW_SX_LATCH, false, mtr);
2411 			btr_assert_not_corrupted(child_block, index);
2412 		} else {
2413 			ut_ad(mtr->memo_contains_flagged(block,
2414 							 upper_rw_latch));
2415 			btr_assert_not_corrupted(block, index);
2416 
2417 			if (s_latch_by_caller) {
2418 				ut_ad(latch_mode == BTR_SEARCH_TREE);
2419 				/* to exclude modifying tree operations
2420 				should sx-latch the index. */
2421 				ut_ad(mtr->memo_contains(index->lock,
2422 							 MTR_MEMO_SX_LOCK));
2423 				/* because has sx-latch of index,
2424 				can release upper blocks. */
2425 				for (; n_releases < n_blocks; n_releases++) {
2426 					mtr_release_block_at_savepoint(
2427 						mtr,
2428 						tree_savepoints[n_releases],
2429 						tree_blocks[n_releases]);
2430 				}
2431 			}
2432 		}
2433 
2434 		if (page_mode <= PAGE_CUR_LE) {
2435 			cursor->low_match = low_match;
2436 			cursor->up_match = up_match;
2437 		}
2438 	} else {
2439 		cursor->low_match = low_match;
2440 		cursor->low_bytes = low_bytes;
2441 		cursor->up_match = up_match;
2442 		cursor->up_bytes = up_bytes;
2443 
2444 		if (autoinc) {
2445 			page_set_autoinc(tree_blocks[0], autoinc, mtr, false);
2446 		}
2447 
2448 #ifdef BTR_CUR_HASH_ADAPT
2449 		/* We do a dirty read of btr_search_enabled here.  We
2450 		will properly check btr_search_enabled again in
2451 		btr_search_build_page_hash_index() before building a
2452 		page hash index, while holding search latch. */
2453 		if (!btr_search_enabled) {
2454 # ifdef MYSQL_INDEX_DISABLE_AHI
2455 		} else if (index->disable_ahi) {
2456 # endif
2457 		} else if (tuple->info_bits & REC_INFO_MIN_REC_FLAG) {
2458 			ut_ad(index->is_instant());
2459 			/* This may be a search tuple for
2460 			btr_pcur_restore_position(). */
2461 			ut_ad(tuple->is_metadata()
2462 			      || (tuple->is_metadata(tuple->info_bits
2463 						     ^ REC_STATUS_INSTANT)));
2464 		} else if (rec_is_metadata(btr_cur_get_rec(cursor), *index)) {
2465 			/* Only user records belong in the adaptive
2466 			hash index. */
2467 		} else {
2468 			btr_search_info_update(index, cursor);
2469 		}
2470 #endif /* BTR_CUR_HASH_ADAPT */
2471 		ut_ad(cursor->up_match != ULINT_UNDEFINED
2472 		      || mode != PAGE_CUR_GE);
2473 		ut_ad(cursor->up_match != ULINT_UNDEFINED
2474 		      || mode != PAGE_CUR_LE);
2475 		ut_ad(cursor->low_match != ULINT_UNDEFINED
2476 		      || mode != PAGE_CUR_LE);
2477 	}
2478 
2479 	/* For spatial index, remember  what blocks are still latched */
2480 	if (dict_index_is_spatial(index)
2481 	    && (latch_mode == BTR_MODIFY_TREE
2482 		|| latch_mode == BTR_MODIFY_LEAF)) {
2483 		for (ulint i = 0; i < n_releases; i++) {
2484 			cursor->rtr_info->tree_blocks[i] = NULL;
2485 			cursor->rtr_info->tree_savepoints[i] = 0;
2486 		}
2487 
2488 		for (ulint i = n_releases; i <= n_blocks; i++) {
2489 			cursor->rtr_info->tree_blocks[i] = tree_blocks[i];
2490 			cursor->rtr_info->tree_savepoints[i] = tree_savepoints[i];
2491 		}
2492 	}
2493 
2494 func_exit:
2495 
2496 	if (UNIV_LIKELY_NULL(heap)) {
2497 		mem_heap_free(heap);
2498 	}
2499 
2500 	if (retrying_for_search_prev) {
2501 		ut_free(prev_tree_blocks);
2502 		ut_free(prev_tree_savepoints);
2503 	}
2504 
2505 	if (mbr_adj) {
2506 		/* remember that we will need to adjust parent MBR */
2507 		cursor->rtr_info->mbr_adj = true;
2508 	}
2509 
2510 #ifdef BTR_CUR_HASH_ADAPT
2511 	if (ahi_latch) {
2512 		rw_lock_s_lock(ahi_latch);
2513 	}
2514 #endif /* BTR_CUR_HASH_ADAPT */
2515 
2516 	DBUG_RETURN(err);
2517 }
2518 
2519 /*****************************************************************//**
2520 Opens a cursor at either end of an index. */
2521 dberr_t
btr_cur_open_at_index_side_func(bool from_left,dict_index_t * index,ulint latch_mode,btr_cur_t * cursor,ulint level,const char * file,unsigned line,mtr_t * mtr)2522 btr_cur_open_at_index_side_func(
2523 /*============================*/
2524 	bool		from_left,	/*!< in: true if open to the low end,
2525 					false if to the high end */
2526 	dict_index_t*	index,		/*!< in: index */
2527 	ulint		latch_mode,	/*!< in: latch mode */
2528 	btr_cur_t*	cursor,		/*!< in/out: cursor */
2529 	ulint		level,		/*!< in: level to search for
2530 					(0=leaf). */
2531 	const char*	file,		/*!< in: file name */
2532 	unsigned	line,		/*!< in: line where called */
2533 	mtr_t*		mtr)		/*!< in/out: mini-transaction */
2534 {
2535 	page_cur_t*	page_cursor;
2536 	ulint		node_ptr_max_size = srv_page_size / 2;
2537 	ulint		height;
2538 	ulint		root_height = 0; /* remove warning */
2539 	rec_t*		node_ptr;
2540 	ulint		estimate;
2541 	btr_intention_t	lock_intention;
2542 	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
2543 	ulint		tree_savepoints[BTR_MAX_LEVELS];
2544 	ulint		n_blocks = 0;
2545 	ulint		n_releases = 0;
2546 	mem_heap_t*	heap		= NULL;
2547 	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
2548 	rec_offs*	offsets		= offsets_;
2549 	dberr_t		err = DB_SUCCESS;
2550 
2551 	rec_offs_init(offsets_);
2552 
2553 	estimate = latch_mode & BTR_ESTIMATE;
2554 	latch_mode &= ulint(~BTR_ESTIMATE);
2555 
2556 	ut_ad(level != ULINT_UNDEFINED);
2557 
2558 	bool	s_latch_by_caller;
2559 
2560 	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
2561 	latch_mode &= ulint(~BTR_ALREADY_S_LATCHED);
2562 
2563 	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2564 
2565 	ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2566 
2567 	/* This function doesn't need to lock left page of the leaf page */
2568 	if (latch_mode == BTR_SEARCH_PREV) {
2569 		latch_mode = BTR_SEARCH_LEAF;
2570 	} else if (latch_mode == BTR_MODIFY_PREV) {
2571 		latch_mode = BTR_MODIFY_LEAF;
2572 	}
2573 
2574 	/* Store the position of the tree latch we push to mtr so that we
2575 	know how to release it when we have latched the leaf node */
2576 
2577 	ulint savepoint = mtr_set_savepoint(mtr);
2578 
2579 	rw_lock_type_t upper_rw_latch;
2580 
2581 	switch (latch_mode) {
2582 	case BTR_CONT_MODIFY_TREE:
2583 	case BTR_CONT_SEARCH_TREE:
2584 		upper_rw_latch = RW_NO_LATCH;
2585 		break;
2586 	case BTR_MODIFY_TREE:
2587 		/* Most of delete-intended operations are purging.
2588 		Free blocks and read IO bandwidth should be prior
2589 		for them, when the history list is glowing huge. */
2590 		if (lock_intention == BTR_INTENTION_DELETE
2591 		    && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
2592 		    && buf_pool.n_pend_reads) {
2593 			mtr_x_lock_index(index, mtr);
2594 		} else {
2595 			mtr_sx_lock_index(index, mtr);
2596 		}
2597 		upper_rw_latch = RW_X_LATCH;
2598 		break;
2599 	default:
2600 		ut_ad(!s_latch_by_caller
2601 		      || mtr->memo_contains_flagged(&index->lock,
2602 						    MTR_MEMO_SX_LOCK
2603 						    | MTR_MEMO_S_LOCK));
2604 		if (!srv_read_only_mode) {
2605 			if (!s_latch_by_caller) {
2606 				/* BTR_SEARCH_TREE is intended to be used with
2607 				BTR_ALREADY_S_LATCHED */
2608 				ut_ad(latch_mode != BTR_SEARCH_TREE);
2609 
2610 				mtr_s_lock_index(index, mtr);
2611 			}
2612 			upper_rw_latch = RW_S_LATCH;
2613 		} else {
2614 			upper_rw_latch = RW_NO_LATCH;
2615 		}
2616 	}
2617 
2618 	const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
2619 		latch_mode);
2620 
2621 	page_cursor = btr_cur_get_page_cur(cursor);
2622 	cursor->index = index;
2623 
2624 	page_id_t		page_id(index->table->space_id, index->page);
2625 	const ulint		zip_size = index->table->space->zip_size();
2626 
2627 	if (root_leaf_rw_latch == RW_X_LATCH) {
2628 		node_ptr_max_size = btr_node_ptr_max_size(index);
2629 	}
2630 
2631 	height = ULINT_UNDEFINED;
2632 
2633 	for (;;) {
2634 		ut_ad(n_blocks < BTR_MAX_LEVELS);
2635 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2636 
2637 		const ulint rw_latch = height
2638 			&& (latch_mode != BTR_MODIFY_TREE || height == level)
2639 			? upper_rw_latch : RW_NO_LATCH;
2640 		buf_block_t* block = buf_page_get_gen(page_id, zip_size,
2641 						      rw_latch, NULL, BUF_GET,
2642 						      file, line, mtr, &err,
2643 						      height == 0
2644 						      && !index->is_clust());
2645 		ut_ad((block != NULL) == (err == DB_SUCCESS));
2646 		tree_blocks[n_blocks] = block;
2647 
2648 		if (err != DB_SUCCESS) {
2649 			if (err == DB_DECRYPTION_FAILED) {
2650 				ib_push_warning((void *)NULL,
2651 					DB_DECRYPTION_FAILED,
2652 					"Table %s is encrypted but encryption service or"
2653 					" used key_id is not available. "
2654 					" Can't continue reading table.",
2655 					index->table->name.m_name);
2656 				index->table->file_unreadable = true;
2657 			}
2658 
2659 			goto exit_loop;
2660 		}
2661 
2662 		const page_t* page = buf_block_get_frame(block);
2663 
2664 		if (height == ULINT_UNDEFINED
2665 		    && page_is_leaf(page)
2666 		    && rw_latch != RW_NO_LATCH
2667 		    && rw_latch != root_leaf_rw_latch) {
2668 			/* We should retry to get the page, because the root page
2669 			is latched with different level as a leaf page. */
2670 			ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
2671 			ut_ad(rw_latch == RW_S_LATCH);
2672 
2673 			ut_ad(n_blocks == 0);
2674 			mtr_release_block_at_savepoint(
2675 				mtr, tree_savepoints[n_blocks],
2676 				tree_blocks[n_blocks]);
2677 
2678 			upper_rw_latch = root_leaf_rw_latch;
2679 			continue;
2680 		}
2681 
2682 		ut_ad(fil_page_index_page_check(page));
2683 		ut_ad(index->id == btr_page_get_index_id(page));
2684 
2685 		if (height == ULINT_UNDEFINED) {
2686 			/* We are in the root node */
2687 
2688 			height = btr_page_get_level(page);
2689 			root_height = height;
2690 			ut_a(height >= level);
2691 		} else {
2692 			/* TODO: flag the index corrupted if this fails */
2693 			ut_ad(height == btr_page_get_level(page));
2694 		}
2695 
2696 		if (height == 0) {
2697 			if (rw_latch == RW_NO_LATCH) {
2698 				btr_cur_latch_leaves(block, latch_mode,
2699 						     cursor, mtr);
2700 			}
2701 
2702 			/* In versions <= 3.23.52 we had forgotten to
2703 			release the tree latch here. If in an index
2704 			scan we had to scan far to find a record
2705 			visible to the current transaction, that could
2706 			starve others waiting for the tree latch. */
2707 
2708 			switch (latch_mode) {
2709 			case BTR_MODIFY_TREE:
2710 			case BTR_CONT_MODIFY_TREE:
2711 			case BTR_CONT_SEARCH_TREE:
2712 				break;
2713 			default:
2714 				if (UNIV_UNLIKELY(srv_read_only_mode)) {
2715 					break;
2716 				}
2717 				if (!s_latch_by_caller) {
2718 					/* Release the tree s-latch */
2719 					mtr_release_s_latch_at_savepoint(
2720 						mtr, savepoint, &index->lock);
2721 				}
2722 
2723 				/* release upper blocks */
2724 				for (; n_releases < n_blocks; n_releases++) {
2725 					mtr_release_block_at_savepoint(
2726 						mtr,
2727 						tree_savepoints[n_releases],
2728 						tree_blocks[n_releases]);
2729 				}
2730 			}
2731 		} else if (height == level /* height != 0 */
2732 			   && UNIV_LIKELY(!srv_read_only_mode)) {
2733 			/* We already have the block latched. */
2734 			ut_ad(latch_mode == BTR_SEARCH_TREE);
2735 			ut_ad(s_latch_by_caller);
2736 			ut_ad(upper_rw_latch == RW_S_LATCH);
2737 			ut_ad(mtr->memo_contains_flagged(block,
2738 							 MTR_MEMO_PAGE_S_FIX));
2739 
2740 			if (s_latch_by_caller) {
2741 				/* to exclude modifying tree operations
2742 				should sx-latch the index. */
2743 				ut_ad(mtr->memo_contains(index->lock,
2744 							 MTR_MEMO_SX_LOCK));
2745 				/* because has sx-latch of index,
2746 				can release upper blocks. */
2747 				for (; n_releases < n_blocks; n_releases++) {
2748 					mtr_release_block_at_savepoint(
2749 						mtr,
2750 						tree_savepoints[n_releases],
2751 						tree_blocks[n_releases]);
2752 				}
2753 			}
2754 		}
2755 
2756 		if (from_left) {
2757 			page_cur_set_before_first(block, page_cursor);
2758 		} else {
2759 			page_cur_set_after_last(block, page_cursor);
2760 		}
2761 
2762 		if (height == level) {
2763 			if (estimate) {
2764 				btr_cur_add_path_info(cursor, height,
2765 						      root_height);
2766 			}
2767 
2768 			break;
2769 		}
2770 
2771 		ut_ad(height > 0);
2772 
2773 		if (from_left) {
2774 			page_cur_move_to_next(page_cursor);
2775 		} else {
2776 			page_cur_move_to_prev(page_cursor);
2777 		}
2778 
2779 		if (estimate) {
2780 			btr_cur_add_path_info(cursor, height, root_height);
2781 		}
2782 
2783 		height--;
2784 
2785 		node_ptr = page_cur_get_rec(page_cursor);
2786 		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2787 					  0, ULINT_UNDEFINED, &heap);
2788 
2789 		/* If the rec is the first or last in the page for
2790 		pessimistic delete intention, it might cause node_ptr insert
2791 		for the upper level. We should change the intention and retry.
2792 		*/
2793 		if (latch_mode == BTR_MODIFY_TREE
2794 		    && btr_cur_need_opposite_intention(
2795 			page, lock_intention, node_ptr)) {
2796 
2797 			ut_ad(upper_rw_latch == RW_X_LATCH);
2798 			/* release all blocks */
2799 			for (; n_releases <= n_blocks; n_releases++) {
2800 				mtr_release_block_at_savepoint(
2801 					mtr, tree_savepoints[n_releases],
2802 					tree_blocks[n_releases]);
2803 			}
2804 
2805 			lock_intention = BTR_INTENTION_BOTH;
2806 
2807 			page_id.set_page_no(dict_index_get_page(index));
2808 
2809 			height = ULINT_UNDEFINED;
2810 
2811 			n_blocks = 0;
2812 			n_releases = 0;
2813 
2814 			continue;
2815 		}
2816 
2817 		if (latch_mode == BTR_MODIFY_TREE
2818 		    && !btr_cur_will_modify_tree(
2819 				cursor->index, page, lock_intention, node_ptr,
2820 				node_ptr_max_size, zip_size, mtr)) {
2821 			ut_ad(upper_rw_latch == RW_X_LATCH);
2822 			ut_ad(n_releases <= n_blocks);
2823 
2824 			/* we can release upper blocks */
2825 			for (; n_releases < n_blocks; n_releases++) {
2826 				if (n_releases == 0) {
2827 					/* we should not release root page
2828 					to pin to same block. */
2829 					continue;
2830 				}
2831 
2832 				/* release unused blocks to unpin */
2833 				mtr_release_block_at_savepoint(
2834 					mtr, tree_savepoints[n_releases],
2835 					tree_blocks[n_releases]);
2836 			}
2837 		}
2838 
2839 		if (height == level
2840 		    && latch_mode == BTR_MODIFY_TREE) {
2841 			ut_ad(upper_rw_latch == RW_X_LATCH);
2842 			/* we should sx-latch root page, if released already.
2843 			It contains seg_header. */
2844 			if (n_releases > 0) {
2845 				mtr_block_sx_latch_at_savepoint(
2846 					mtr, tree_savepoints[0],
2847 					tree_blocks[0]);
2848 			}
2849 
2850 			/* x-latch the branch blocks not released yet. */
2851 			for (ulint i = n_releases; i <= n_blocks; i++) {
2852 				mtr_block_x_latch_at_savepoint(
2853 					mtr, tree_savepoints[i],
2854 					tree_blocks[i]);
2855 			}
2856 		}
2857 
2858 		/* Go to the child node */
2859 		page_id.set_page_no(
2860 			btr_node_ptr_get_child_page_no(node_ptr, offsets));
2861 
2862 		n_blocks++;
2863 	}
2864 
2865  exit_loop:
2866 	if (heap) {
2867 		mem_heap_free(heap);
2868 	}
2869 
2870 	return err;
2871 }
2872 
2873 /**********************************************************************//**
2874 Positions a cursor at a randomly chosen position within a B-tree.
2875 @return true if the index is available and we have put the cursor, false
2876 if the index is unavailable */
2877 bool
btr_cur_open_at_rnd_pos_func(dict_index_t * index,ulint latch_mode,btr_cur_t * cursor,const char * file,unsigned line,mtr_t * mtr)2878 btr_cur_open_at_rnd_pos_func(
2879 /*=========================*/
2880 	dict_index_t*	index,		/*!< in: index */
2881 	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
2882 	btr_cur_t*	cursor,		/*!< in/out: B-tree cursor */
2883 	const char*	file,		/*!< in: file name */
2884 	unsigned	line,		/*!< in: line where called */
2885 	mtr_t*		mtr)		/*!< in: mtr */
2886 {
2887 	page_cur_t*	page_cursor;
2888 	ulint		node_ptr_max_size = srv_page_size / 2;
2889 	ulint		height;
2890 	rec_t*		node_ptr;
2891 	btr_intention_t	lock_intention;
2892 	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
2893 	ulint		tree_savepoints[BTR_MAX_LEVELS];
2894 	ulint		n_blocks = 0;
2895 	ulint		n_releases = 0;
2896 	mem_heap_t*	heap		= NULL;
2897 	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
2898 	rec_offs*	offsets		= offsets_;
2899 	rec_offs_init(offsets_);
2900 
2901 	ut_ad(!index->is_spatial());
2902 
2903 	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2904 
2905 	ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2906 
2907 	ulint savepoint = mtr_set_savepoint(mtr);
2908 
2909 	rw_lock_type_t upper_rw_latch;
2910 
2911 	switch (latch_mode) {
2912 	case BTR_MODIFY_TREE:
2913 		/* Most of delete-intended operations are purging.
2914 		Free blocks and read IO bandwidth should be prior
2915 		for them, when the history list is glowing huge. */
2916 		if (lock_intention == BTR_INTENTION_DELETE
2917 		    && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
2918 		    && buf_pool.n_pend_reads) {
2919 			mtr_x_lock_index(index, mtr);
2920 		} else {
2921 			mtr_sx_lock_index(index, mtr);
2922 		}
2923 		upper_rw_latch = RW_X_LATCH;
2924 		break;
2925 	case BTR_SEARCH_PREV:
2926 	case BTR_MODIFY_PREV:
2927 		/* This function doesn't support left uncle
2928 		   page lock for left leaf page lock, when
2929 		   needed. */
2930 	case BTR_SEARCH_TREE:
2931 	case BTR_CONT_MODIFY_TREE:
2932 	case BTR_CONT_SEARCH_TREE:
2933 		ut_ad(0);
2934 		/* fall through */
2935 	default:
2936 		if (!srv_read_only_mode) {
2937 			mtr_s_lock_index(index, mtr);
2938 			upper_rw_latch = RW_S_LATCH;
2939 		} else {
2940 			upper_rw_latch = RW_NO_LATCH;
2941 		}
2942 	}
2943 
2944 	DBUG_EXECUTE_IF("test_index_is_unavailable",
2945 			return(false););
2946 
2947 	if (index->page == FIL_NULL) {
2948 		/* Since we don't hold index lock until just now, the index
2949 		could be modified by others, for example, if this is a
2950 		statistics updater for referenced table, it could be marked
2951 		as unavailable by 'DROP TABLE' in the mean time, since
2952 		we don't hold lock for statistics updater */
2953 		return(false);
2954 	}
2955 
2956 	const rw_lock_type_t root_leaf_rw_latch = btr_cur_latch_for_root_leaf(
2957 		latch_mode);
2958 
2959 	page_cursor = btr_cur_get_page_cur(cursor);
2960 	cursor->index = index;
2961 
2962 	page_id_t		page_id(index->table->space_id, index->page);
2963 	const ulint		zip_size = index->table->space->zip_size();
2964 	dberr_t			err = DB_SUCCESS;
2965 
2966 	if (root_leaf_rw_latch == RW_X_LATCH) {
2967 		node_ptr_max_size = btr_node_ptr_max_size(index);
2968 	}
2969 
2970 	height = ULINT_UNDEFINED;
2971 
2972 	for (;;) {
2973 		page_t*		page;
2974 
2975 		ut_ad(n_blocks < BTR_MAX_LEVELS);
2976 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2977 
2978 		const rw_lock_type_t rw_latch = height
2979 			&& latch_mode != BTR_MODIFY_TREE
2980 			? upper_rw_latch : RW_NO_LATCH;
2981 		buf_block_t* block = buf_page_get_gen(page_id, zip_size,
2982 						      rw_latch, NULL, BUF_GET,
2983 						      file, line, mtr, &err,
2984 						      height == 0
2985 						      && !index->is_clust());
2986 		tree_blocks[n_blocks] = block;
2987 
2988 		ut_ad((block != NULL) == (err == DB_SUCCESS));
2989 
2990 		if (err != DB_SUCCESS) {
2991 			if (err == DB_DECRYPTION_FAILED) {
2992 				ib_push_warning((void *)NULL,
2993 					DB_DECRYPTION_FAILED,
2994 					"Table %s is encrypted but encryption service or"
2995 					" used key_id is not available. "
2996 					" Can't continue reading table.",
2997 					index->table->name.m_name);
2998 				index->table->file_unreadable = true;
2999 			}
3000 
3001 			break;
3002 		}
3003 
3004 		page = buf_block_get_frame(block);
3005 
3006 		if (height == ULINT_UNDEFINED
3007 		    && page_is_leaf(page)
3008 		    && rw_latch != RW_NO_LATCH
3009 		    && rw_latch != root_leaf_rw_latch) {
3010 			/* We should retry to get the page, because the root page
3011 			is latched with different level as a leaf page. */
3012 			ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
3013 			ut_ad(rw_latch == RW_S_LATCH);
3014 
3015 			ut_ad(n_blocks == 0);
3016 			mtr_release_block_at_savepoint(
3017 				mtr, tree_savepoints[n_blocks],
3018 				tree_blocks[n_blocks]);
3019 
3020 			upper_rw_latch = root_leaf_rw_latch;
3021 			continue;
3022 		}
3023 
3024 		ut_ad(fil_page_index_page_check(page));
3025 		ut_ad(index->id == btr_page_get_index_id(page));
3026 
3027 		if (height == ULINT_UNDEFINED) {
3028 			/* We are in the root node */
3029 
3030 			height = btr_page_get_level(page);
3031 		}
3032 
3033 		if (height == 0) {
3034 			if (rw_latch == RW_NO_LATCH
3035 			    || srv_read_only_mode) {
3036 				btr_cur_latch_leaves(block, latch_mode, cursor,
3037 						     mtr);
3038 			}
3039 
3040 			/* btr_cur_open_at_index_side_func() and
3041 			btr_cur_search_to_nth_level() release
3042 			tree s-latch here.*/
3043 			switch (latch_mode) {
3044 			case BTR_MODIFY_TREE:
3045 			case BTR_CONT_MODIFY_TREE:
3046 			case BTR_CONT_SEARCH_TREE:
3047 				break;
3048 			default:
3049 				/* Release the tree s-latch */
3050 				if (!srv_read_only_mode) {
3051 					mtr_release_s_latch_at_savepoint(
3052 						mtr, savepoint,
3053 						dict_index_get_lock(index));
3054 				}
3055 
3056 				/* release upper blocks */
3057 				for (; n_releases < n_blocks; n_releases++) {
3058 					mtr_release_block_at_savepoint(
3059 						mtr,
3060 						tree_savepoints[n_releases],
3061 						tree_blocks[n_releases]);
3062 				}
3063 			}
3064 		}
3065 
3066 		page_cur_open_on_rnd_user_rec(block, page_cursor);
3067 
3068 		if (height == 0) {
3069 
3070 			break;
3071 		}
3072 
3073 		ut_ad(height > 0);
3074 
3075 		height--;
3076 
3077 		node_ptr = page_cur_get_rec(page_cursor);
3078 		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
3079 					  0, ULINT_UNDEFINED, &heap);
3080 
3081 		/* If the rec is the first or last in the page for
3082 		pessimistic delete intention, it might cause node_ptr insert
3083 		for the upper level. We should change the intention and retry.
3084 		*/
3085 		if (latch_mode == BTR_MODIFY_TREE
3086 		    && btr_cur_need_opposite_intention(
3087 			page, lock_intention, node_ptr)) {
3088 
3089 			ut_ad(upper_rw_latch == RW_X_LATCH);
3090 			/* release all blocks */
3091 			for (; n_releases <= n_blocks; n_releases++) {
3092 				mtr_release_block_at_savepoint(
3093 					mtr, tree_savepoints[n_releases],
3094 					tree_blocks[n_releases]);
3095 			}
3096 
3097 			lock_intention = BTR_INTENTION_BOTH;
3098 
3099 			page_id.set_page_no(dict_index_get_page(index));
3100 
3101 			height = ULINT_UNDEFINED;
3102 
3103 			n_blocks = 0;
3104 			n_releases = 0;
3105 
3106 			continue;
3107 		}
3108 
3109 		if (latch_mode == BTR_MODIFY_TREE
3110 		    && !btr_cur_will_modify_tree(
3111 				cursor->index, page, lock_intention, node_ptr,
3112 				node_ptr_max_size, zip_size, mtr)) {
3113 			ut_ad(upper_rw_latch == RW_X_LATCH);
3114 			ut_ad(n_releases <= n_blocks);
3115 
3116 			/* we can release upper blocks */
3117 			for (; n_releases < n_blocks; n_releases++) {
3118 				if (n_releases == 0) {
3119 					/* we should not release root page
3120 					to pin to same block. */
3121 					continue;
3122 				}
3123 
3124 				/* release unused blocks to unpin */
3125 				mtr_release_block_at_savepoint(
3126 					mtr, tree_savepoints[n_releases],
3127 					tree_blocks[n_releases]);
3128 			}
3129 		}
3130 
3131 		if (height == 0
3132 		    && latch_mode == BTR_MODIFY_TREE) {
3133 			ut_ad(upper_rw_latch == RW_X_LATCH);
3134 			/* we should sx-latch root page, if released already.
3135 			It contains seg_header. */
3136 			if (n_releases > 0) {
3137 				mtr_block_sx_latch_at_savepoint(
3138 					mtr, tree_savepoints[0],
3139 					tree_blocks[0]);
3140 			}
3141 
3142 			/* x-latch the branch blocks not released yet. */
3143 			for (ulint i = n_releases; i <= n_blocks; i++) {
3144 				mtr_block_x_latch_at_savepoint(
3145 					mtr, tree_savepoints[i],
3146 					tree_blocks[i]);
3147 			}
3148 		}
3149 
3150 		/* Go to the child node */
3151 		page_id.set_page_no(
3152 			btr_node_ptr_get_child_page_no(node_ptr, offsets));
3153 
3154 		n_blocks++;
3155 	}
3156 
3157 	if (UNIV_LIKELY_NULL(heap)) {
3158 		mem_heap_free(heap);
3159 	}
3160 
3161 	return err == DB_SUCCESS;
3162 }
3163 
3164 /*==================== B-TREE INSERT =========================*/
3165 
3166 /*************************************************************//**
3167 Inserts a record if there is enough space, or if enough space can
3168 be freed by reorganizing. Differs from btr_cur_optimistic_insert because
3169 no heuristics is applied to whether it pays to use CPU time for
3170 reorganizing the page or not.
3171 
3172 IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
3173 if this is a compressed leaf page in a secondary index.
3174 This has to be done either within the same mini-transaction,
3175 or by invoking ibuf_reset_free_bits() before mtr_commit().
3176 
3177 @return pointer to inserted record if succeed, else NULL */
3178 static MY_ATTRIBUTE((nonnull, warn_unused_result))
3179 rec_t*
btr_cur_insert_if_possible(btr_cur_t * cursor,const dtuple_t * tuple,rec_offs ** offsets,mem_heap_t ** heap,ulint n_ext,mtr_t * mtr)3180 btr_cur_insert_if_possible(
3181 /*=======================*/
3182 	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
3183 				cursor stays valid */
3184 	const dtuple_t*	tuple,	/*!< in: tuple to insert; the size info need not
3185 				have been stored to tuple */
3186 	rec_offs**	offsets,/*!< out: offsets on *rec */
3187 	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
3188 	ulint		n_ext,	/*!< in: number of externally stored columns */
3189 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
3190 {
3191 	page_cur_t*	page_cursor;
3192 	rec_t*		rec;
3193 
3194 	ut_ad(dtuple_check_typed(tuple));
3195 
3196 	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
3197 					 MTR_MEMO_PAGE_X_FIX));
3198 	page_cursor = btr_cur_get_page_cur(cursor);
3199 
3200 	/* Now, try the insert */
3201 	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
3202 				    offsets, heap, n_ext, mtr);
3203 
3204 	/* If the record did not fit, reorganize.
3205 	For compressed pages, page_cur_tuple_insert()
3206 	attempted this already. */
3207 	if (!rec && !page_cur_get_page_zip(page_cursor)
3208 	    && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
3209 		rec = page_cur_tuple_insert(
3210 			page_cursor, tuple, cursor->index,
3211 			offsets, heap, n_ext, mtr);
3212 	}
3213 
3214 	ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
3215 	return(rec);
3216 }
3217 
3218 /*************************************************************//**
3219 For an insert, checks the locks and does the undo logging if desired.
3220 @return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
3221 UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
3222 dberr_t
btr_cur_ins_lock_and_undo(ulint flags,btr_cur_t * cursor,dtuple_t * entry,que_thr_t * thr,mtr_t * mtr,bool * inherit)3223 btr_cur_ins_lock_and_undo(
3224 /*======================*/
3225 	ulint		flags,	/*!< in: undo logging and locking flags: if
3226 				not zero, the parameters index and thr
3227 				should be specified */
3228 	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert */
3229 	dtuple_t*	entry,	/*!< in/out: entry to insert */
3230 	que_thr_t*	thr,	/*!< in: query thread or NULL */
3231 	mtr_t*		mtr,	/*!< in/out: mini-transaction */
3232 	bool*		inherit)/*!< out: true if the inserted new record maybe
3233 				should inherit LOCK_GAP type locks from the
3234 				successor record */
3235 {
3236 	dict_index_t*	index;
3237 	dberr_t		err = DB_SUCCESS;
3238 	rec_t*		rec;
3239 	roll_ptr_t	roll_ptr;
3240 
3241 	/* Check if we have to wait for a lock: enqueue an explicit lock
3242 	request if yes */
3243 
3244 	rec = btr_cur_get_rec(cursor);
3245 	index = cursor->index;
3246 
3247 	ut_ad(!dict_index_is_online_ddl(index)
3248 	      || dict_index_is_clust(index)
3249 	      || (flags & BTR_CREATE_FLAG));
3250 	ut_ad(mtr->is_named_space(index->table->space));
3251 
3252 	/* Check if there is predicate or GAP lock preventing the insertion */
3253 	if (!(flags & BTR_NO_LOCKING_FLAG)) {
3254 		const unsigned type = index->type;
3255 		if (UNIV_UNLIKELY(type & DICT_SPATIAL)) {
3256 			lock_prdt_t	prdt;
3257 			rtr_mbr_t	mbr;
3258 
3259 			rtr_get_mbr_from_tuple(entry, &mbr);
3260 
3261 			/* Use on stack MBR variable to test if a lock is
3262 			needed. If so, the predicate (MBR) will be allocated
3263 			from lock heap in lock_prdt_insert_check_and_lock() */
3264 			lock_init_prdt_from_mbr(
3265 				&prdt, &mbr, 0, NULL);
3266 
3267 			err = lock_prdt_insert_check_and_lock(
3268 				flags, rec, btr_cur_get_block(cursor),
3269 				index, thr, mtr, &prdt);
3270 			*inherit = false;
3271 		} else {
3272 #ifdef WITH_WSREP
3273 			trx_t* trx= thr_get_trx(thr);
3274 			/* If transaction scanning an unique secondary
3275 			key is wsrep high priority thread (brute
3276 			force) this scanning may involve GAP-locking
3277 			in the index. As this locking happens also
3278 			when applying replication events in high
3279 			priority applier threads, there is a
3280 			probability for lock conflicts between two
3281 			wsrep high priority threads. To avoid this
3282 			GAP-locking we mark that this transaction
3283 			is using unique key scan here. */
3284 			if ((type & (DICT_CLUSTERED | DICT_UNIQUE)) == DICT_UNIQUE
3285 			    && trx->is_wsrep()
3286 			    && wsrep_thd_is_BF(trx->mysql_thd, false)) {
3287 				trx->wsrep_UK_scan= true;
3288 			}
3289 #endif /* WITH_WSREP */
3290 			err = lock_rec_insert_check_and_lock(
3291 				flags, rec, btr_cur_get_block(cursor),
3292 				index, thr, mtr, inherit);
3293 #ifdef WITH_WSREP
3294 			trx->wsrep_UK_scan= false;
3295 #endif /* WITH_WSREP */
3296 		}
3297 	}
3298 
3299 	if (err != DB_SUCCESS
3300 	    || !(~flags | (BTR_NO_UNDO_LOG_FLAG | BTR_KEEP_SYS_FLAG))
3301 	    || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {
3302 
3303 		return(err);
3304 	}
3305 
3306 	if (flags & BTR_NO_UNDO_LOG_FLAG) {
3307 		roll_ptr = roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS;
3308 		if (!(flags & BTR_KEEP_SYS_FLAG)) {
3309 upd_sys:
3310 			dfield_t* r = dtuple_get_nth_field(
3311 				entry, index->db_roll_ptr());
3312 			ut_ad(r->len == DATA_ROLL_PTR_LEN);
3313 			trx_write_roll_ptr(static_cast<byte*>(r->data),
3314 					   roll_ptr);
3315 		}
3316 	} else {
3317 		err = trx_undo_report_row_operation(thr, index, entry,
3318 						    NULL, 0, NULL, NULL,
3319 						    &roll_ptr);
3320 		if (err == DB_SUCCESS) {
3321 			goto upd_sys;
3322 		}
3323 	}
3324 
3325 	return(err);
3326 }
3327 
3328 /**
3329 Prefetch siblings of the leaf for the pessimistic operation.
3330 @param block	leaf page
3331 @param index    index of the page */
btr_cur_prefetch_siblings(const buf_block_t * block,const dict_index_t * index)3332 static void btr_cur_prefetch_siblings(const buf_block_t *block,
3333                                       const dict_index_t *index)
3334 {
3335   ut_ad(page_is_leaf(block->frame));
3336 
3337   if (index->is_ibuf())
3338     return;
3339 
3340   const page_t *page= block->frame;
3341   uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV));
3342   uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT));
3343 
3344   fil_space_t *space= index->table->space;
3345 
3346   if (prev == FIL_NULL);
3347   else if (space->acquire())
3348     buf_read_page_background(space, page_id_t(space->id, prev),
3349                              block->zip_size());
3350   if (next == FIL_NULL);
3351   else if (space->acquire())
3352     buf_read_page_background(space, page_id_t(space->id, next),
3353                              block->zip_size());
3354 }
3355 
3356 /*************************************************************//**
3357 Tries to perform an insert to a page in an index tree, next to cursor.
3358 It is assumed that mtr holds an x-latch on the page. The operation does
3359 not succeed if there is too little space on the page. If there is just
3360 one record on the page, the insert will always succeed; this is to
3361 prevent trying to split a page with just one record.
3362 @return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
3363 dberr_t
btr_cur_optimistic_insert(ulint flags,btr_cur_t * cursor,rec_offs ** offsets,mem_heap_t ** heap,dtuple_t * entry,rec_t ** rec,big_rec_t ** big_rec,ulint n_ext,que_thr_t * thr,mtr_t * mtr)3364 btr_cur_optimistic_insert(
3365 /*======================*/
3366 	ulint		flags,	/*!< in: undo logging and locking flags: if not
3367 				zero, the parameters index and thr should be
3368 				specified */
3369 	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
3370 				cursor stays valid */
3371 	rec_offs**	offsets,/*!< out: offsets on *rec */
3372 	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap */
3373 	dtuple_t*	entry,	/*!< in/out: entry to insert */
3374 	rec_t**		rec,	/*!< out: pointer to inserted record if
3375 				succeed */
3376 	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
3377 				be stored externally by the caller */
3378 	ulint		n_ext,	/*!< in: number of externally stored columns */
3379 	que_thr_t*	thr,	/*!< in/out: query thread; can be NULL if
3380 				!(~flags
3381 				& (BTR_NO_LOCKING_FLAG
3382 				| BTR_NO_UNDO_LOG_FLAG)) */
3383 	mtr_t*		mtr)	/*!< in/out: mini-transaction;
3384 				if this function returns DB_SUCCESS on
3385 				a leaf page of a secondary index in a
3386 				compressed tablespace, the caller must
3387 				mtr_commit(mtr) before latching
3388 				any further pages */
3389 {
3390 	big_rec_t*	big_rec_vec	= NULL;
3391 	dict_index_t*	index;
3392 	page_cur_t*	page_cursor;
3393 	buf_block_t*	block;
3394 	page_t*		page;
3395 	rec_t*		dummy;
3396 	bool		leaf;
3397 	bool		reorg __attribute__((unused));
3398 	bool		inherit = true;
3399 	ulint		rec_size;
3400 	dberr_t		err;
3401 
3402 	ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));
3403 	*big_rec = NULL;
3404 
3405 	block = btr_cur_get_block(cursor);
3406 	page = buf_block_get_frame(block);
3407 	index = cursor->index;
3408 
3409 	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
3410 	ut_ad(!dict_index_is_online_ddl(index)
3411 	      || dict_index_is_clust(index)
3412 	      || (flags & BTR_CREATE_FLAG));
3413 	ut_ad(dtuple_check_typed(entry));
3414 
3415 #ifdef HAVE_valgrind
3416 	if (block->page.zip.data) {
3417 		MEM_CHECK_DEFINED(page, srv_page_size);
3418 		MEM_CHECK_DEFINED(block->page.zip.data, block->zip_size());
3419 	}
3420 #endif /* HAVE_valgrind */
3421 
3422 	leaf = page_is_leaf(page);
3423 
3424 	if (UNIV_UNLIKELY(entry->is_alter_metadata())) {
3425 		ut_ad(leaf);
3426 		goto convert_big_rec;
3427 	}
3428 
3429 	/* Calculate the record size when entry is converted to a record */
3430 	rec_size = rec_get_converted_size(index, entry, n_ext);
3431 
3432 	if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
3433 				   dtuple_get_n_fields(entry),
3434 				   block->zip_size())) {
3435 convert_big_rec:
3436 		/* The record is so big that we have to store some fields
3437 		externally on separate database pages */
3438 		big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3439 
3440 		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
3441 
3442 			return(DB_TOO_BIG_RECORD);
3443 		}
3444 
3445 		rec_size = rec_get_converted_size(index, entry, n_ext);
3446 	}
3447 
3448 	if (block->page.zip.data && page_zip_is_too_big(index, entry)) {
3449 		if (big_rec_vec != NULL) {
3450 			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3451 		}
3452 
3453 		return(DB_TOO_BIG_RECORD);
3454 	}
3455 
3456 	LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
3457 				      goto fail);
3458 
3459 	if (block->page.zip.data && leaf
3460 	    && (page_get_data_size(page) + rec_size
3461 		>= dict_index_zip_pad_optimal_page_size(index))) {
3462 		/* If compression padding tells us that insertion will
3463 		result in too packed up page i.e.: which is likely to
3464 		cause compression failure then don't do an optimistic
3465 		insertion. */
3466 fail:
3467 		err = DB_FAIL;
3468 
3469 		/* prefetch siblings of the leaf for the pessimistic
3470 		operation, if the page is leaf. */
3471 		if (page_is_leaf(page)) {
3472 			btr_cur_prefetch_siblings(block, index);
3473 		}
3474 fail_err:
3475 
3476 		if (big_rec_vec) {
3477 			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3478 		}
3479 
3480 		return(err);
3481 	}
3482 
3483 	ulint	max_size = page_get_max_insert_size_after_reorganize(page, 1);
3484 	if (max_size < rec_size) {
3485 		goto fail;
3486 	}
3487 
3488 	const ulint n_recs = page_get_n_recs(page);
3489 	if (UNIV_UNLIKELY(n_recs >= 8189)) {
3490 		ut_ad(srv_page_size == 65536);
3491 		goto fail;
3492 	}
3493 
3494 	if (page_has_garbage(page)) {
3495 		if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
3496 		    && n_recs > 1
3497 		    && page_get_max_insert_size(page, 1) < rec_size) {
3498 
3499 			goto fail;
3500 		}
3501 	}
3502 
3503 	/* If there have been many consecutive inserts to the
3504 	clustered index leaf page of an uncompressed table, check if
3505 	we have to split the page to reserve enough free space for
3506 	future updates of records. */
3507 
3508 	if (leaf && !block->page.zip.data && dict_index_is_clust(index)
3509 	    && page_get_n_recs(page) >= 2
3510 	    && dict_index_get_space_reserve() + rec_size > max_size
3511 	    && (btr_page_get_split_rec_to_right(cursor, &dummy)
3512 		|| btr_page_get_split_rec_to_left(cursor))) {
3513 		goto fail;
3514 	}
3515 
3516 	page_cursor = btr_cur_get_page_cur(cursor);
3517 
3518 	DBUG_LOG("ib_cur",
3519 		 "insert " << index->name << " (" << index->id << ") by "
3520 		 << ib::hex(thr ? thr->graph->trx->id : 0)
3521 		 << ' ' << rec_printer(entry).str());
3522 	DBUG_EXECUTE_IF("do_page_reorganize",
3523 			btr_page_reorganize(page_cursor, index, mtr););
3524 
3525 	/* Now, try the insert */
3526 	{
3527 		const rec_t*	page_cursor_rec = page_cur_get_rec(page_cursor);
3528 
3529 		/* Check locks and write to the undo log,
3530 		if specified */
3531 		err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3532 						thr, mtr, &inherit);
3533 		if (err != DB_SUCCESS) {
3534 			goto fail_err;
3535 		}
3536 
3537 #ifdef UNIV_DEBUG
3538 		if (!(flags & BTR_CREATE_FLAG)
3539 		    && index->is_primary() && page_is_leaf(page)) {
3540 			const dfield_t* trx_id = dtuple_get_nth_field(
3541 				entry, dict_col_get_clust_pos(
3542 					dict_table_get_sys_col(index->table,
3543 							       DATA_TRX_ID),
3544 					index));
3545 
3546 			ut_ad(trx_id->len == DATA_TRX_ID_LEN);
3547 			ut_ad(trx_id[1].len == DATA_ROLL_PTR_LEN);
3548 			ut_ad(*static_cast<const byte*>
3549 			      (trx_id[1].data) & 0x80);
3550 			if (flags & BTR_NO_UNDO_LOG_FLAG) {
3551 				ut_ad(!memcmp(trx_id->data, reset_trx_id,
3552 					      DATA_TRX_ID_LEN));
3553 			} else {
3554 				ut_ad(thr->graph->trx->id);
3555 				ut_ad(thr->graph->trx->id
3556 				      == trx_read_trx_id(
3557 					      static_cast<const byte*>(
3558 							trx_id->data))
3559 				      || index->table->is_temporary());
3560 			}
3561 		}
3562 #endif
3563 
3564 		*rec = page_cur_tuple_insert(
3565 			page_cursor, entry, index, offsets, heap,
3566 			n_ext, mtr);
3567 
3568 		reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
3569 	}
3570 
3571 	if (*rec) {
3572 	} else if (block->page.zip.data) {
3573 		ut_ad(!index->table->is_temporary());
3574 		/* Reset the IBUF_BITMAP_FREE bits, because
3575 		page_cur_tuple_insert() will have attempted page
3576 		reorganize before failing. */
3577 		if (leaf
3578 		    && !dict_index_is_clust(index)) {
3579 			ibuf_reset_free_bits(block);
3580 		}
3581 
3582 		goto fail;
3583 	} else {
3584 		ut_ad(!reorg);
3585 
3586 		/* If the record did not fit, reorganize */
3587 		if (!btr_page_reorganize(page_cursor, index, mtr)) {
3588 			ut_ad(0);
3589 			goto fail;
3590 		}
3591 
3592 		ut_ad(page_get_max_insert_size(page, 1) == max_size);
3593 
3594 		reorg = TRUE;
3595 
3596 		*rec = page_cur_tuple_insert(page_cursor, entry, index,
3597 					     offsets, heap, n_ext, mtr);
3598 
3599 		if (UNIV_UNLIKELY(!*rec)) {
3600 			ib::fatal() <<  "Cannot insert tuple " << *entry
3601 				<< "into index " << index->name
3602 				<< " of table " << index->table->name
3603 				<< ". Max size: " << max_size;
3604 		}
3605 	}
3606 
3607 #ifdef BTR_CUR_HASH_ADAPT
3608 	if (!leaf) {
3609 # ifdef MYSQL_INDEX_DISABLE_AHI
3610 	} else if (index->disable_ahi) {
3611 # endif
3612 	} else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
3613 		ut_ad(entry->is_metadata());
3614 		ut_ad(index->is_instant());
3615 		ut_ad(flags == BTR_NO_LOCKING_FLAG);
3616 	} else {
3617 		rw_lock_t* ahi_latch = btr_search_sys.get_latch(*index);
3618 		if (!reorg && cursor->flag == BTR_CUR_HASH) {
3619 			btr_search_update_hash_node_on_insert(
3620 				cursor, ahi_latch);
3621 		} else {
3622 			btr_search_update_hash_on_insert(cursor, ahi_latch);
3623 		}
3624 	}
3625 #endif /* BTR_CUR_HASH_ADAPT */
3626 
3627 	if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
3628 
3629 		lock_update_insert(block, *rec);
3630 	}
3631 
3632 	if (leaf
3633 	    && !dict_index_is_clust(index)
3634 	    && !index->table->is_temporary()) {
3635 		/* Update the free bits of the B-tree page in the
3636 		insert buffer bitmap. */
3637 
3638 		/* The free bits in the insert buffer bitmap must
3639 		never exceed the free space on a page.  It is safe to
3640 		decrement or reset the bits in the bitmap in a
3641 		mini-transaction that is committed before the
3642 		mini-transaction that affects the free space. */
3643 
3644 		/* It is unsafe to increment the bits in a separately
3645 		committed mini-transaction, because in crash recovery,
3646 		the free bits could momentarily be set too high. */
3647 
3648 		if (block->page.zip.data) {
3649 			/* Update the bits in the same mini-transaction. */
3650 			ibuf_update_free_bits_zip(block, mtr);
3651 		} else {
3652 			/* Decrement the bits in a separate
3653 			mini-transaction. */
3654 			ibuf_update_free_bits_if_full(
3655 				block, max_size,
3656 				rec_size + PAGE_DIR_SLOT_SIZE);
3657 		}
3658 	}
3659 
3660 	*big_rec = big_rec_vec;
3661 
3662 	return(DB_SUCCESS);
3663 }
3664 
3665 /*************************************************************//**
3666 Performs an insert on a page of an index tree. It is assumed that mtr
3667 holds an x-latch on the tree and on the cursor page. If the insert is
3668 made on the leaf level, to avoid deadlocks, mtr must also own x-latches
3669 to brothers of page, if those brothers exist.
3670 @return DB_SUCCESS or error number */
3671 dberr_t
btr_cur_pessimistic_insert(ulint flags,btr_cur_t * cursor,rec_offs ** offsets,mem_heap_t ** heap,dtuple_t * entry,rec_t ** rec,big_rec_t ** big_rec,ulint n_ext,que_thr_t * thr,mtr_t * mtr)3672 btr_cur_pessimistic_insert(
3673 /*=======================*/
3674 	ulint		flags,	/*!< in: undo logging and locking flags: if not
3675 				zero, the parameter thr should be
3676 				specified; if no undo logging is specified,
3677 				then the caller must have reserved enough
3678 				free extents in the file space so that the
3679 				insertion will certainly succeed */
3680 	btr_cur_t*	cursor,	/*!< in: cursor after which to insert;
3681 				cursor stays valid */
3682 	rec_offs**	offsets,/*!< out: offsets on *rec */
3683 	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
3684 				that can be emptied */
3685 	dtuple_t*	entry,	/*!< in/out: entry to insert */
3686 	rec_t**		rec,	/*!< out: pointer to inserted record if
3687 				succeed */
3688 	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
3689 				be stored externally by the caller */
3690 	ulint		n_ext,	/*!< in: number of externally stored columns */
3691 	que_thr_t*	thr,	/*!< in/out: query thread; can be NULL if
3692 				!(~flags
3693 				& (BTR_NO_LOCKING_FLAG
3694 				| BTR_NO_UNDO_LOG_FLAG)) */
3695 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
3696 {
3697 	dict_index_t*	index		= cursor->index;
3698 	big_rec_t*	big_rec_vec	= NULL;
3699 	dberr_t		err;
3700 	bool		inherit = false;
3701 	bool		success;
3702 	uint32_t	n_reserved	= 0;
3703 
3704 	ut_ad(dtuple_check_typed(entry));
3705 	ut_ad(thr || !(~flags & (BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG)));
3706 
3707 	*big_rec = NULL;
3708 
3709 	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
3710 					 | MTR_MEMO_SX_LOCK));
3711 	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
3712 					 MTR_MEMO_PAGE_X_FIX));
3713 	ut_ad(!dict_index_is_online_ddl(index)
3714 	      || dict_index_is_clust(index)
3715 	      || (flags & BTR_CREATE_FLAG));
3716 
3717 	cursor->flag = BTR_CUR_BINARY;
3718 
3719 	/* Check locks and write to undo log, if specified */
3720 
3721 	err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3722 					thr, mtr, &inherit);
3723 
3724 	if (err != DB_SUCCESS) {
3725 
3726 		return(err);
3727 	}
3728 
3729 	if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
3730 		/* First reserve enough free space for the file segments
3731 		of the index tree, so that the insert will not fail because
3732 		of lack of space */
3733 
3734 		uint32_t n_extents = uint32_t(cursor->tree_height / 16 + 3);
3735 
3736 		success = fsp_reserve_free_extents(&n_reserved,
3737 						   index->table->space,
3738 						   n_extents, FSP_NORMAL, mtr);
3739 		if (!success) {
3740 			return(DB_OUT_OF_FILE_SPACE);
3741 		}
3742 	}
3743 
3744 	if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
3745 				   index->table->not_redundant(),
3746 				   dtuple_get_n_fields(entry),
3747 				   btr_cur_get_block(cursor)->zip_size())
3748 	    || UNIV_UNLIKELY(entry->is_alter_metadata()
3749 			     && !dfield_is_ext(
3750 				     dtuple_get_nth_field(
3751 					     entry,
3752 					     index->first_user_field())))) {
3753 		/* The record is so big that we have to store some fields
3754 		externally on separate database pages */
3755 
3756 		if (UNIV_LIKELY_NULL(big_rec_vec)) {
3757 			/* This should never happen, but we handle
3758 			the situation in a robust manner. */
3759 			ut_ad(0);
3760 			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3761 		}
3762 
3763 		big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3764 
3765 		if (big_rec_vec == NULL) {
3766 
3767 			index->table->space->release_free_extents(n_reserved);
3768 			return(DB_TOO_BIG_RECORD);
3769 		}
3770 	}
3771 
3772 	if (dict_index_get_page(index)
3773 	    == btr_cur_get_block(cursor)->page.id().page_no()) {
3774 
3775 		/* The page is the root page */
3776 		*rec = btr_root_raise_and_insert(
3777 			flags, cursor, offsets, heap, entry, n_ext, mtr);
3778 	} else {
3779 		*rec = btr_page_split_and_insert(
3780 			flags, cursor, offsets, heap, entry, n_ext, mtr);
3781 	}
3782 
3783 	if (*rec == NULL && os_has_said_disk_full) {
3784 		return(DB_OUT_OF_FILE_SPACE);
3785 	}
3786 
3787 	ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec
3788 	      || dict_index_is_spatial(index));
3789 
3790 	if (!(flags & BTR_NO_LOCKING_FLAG)) {
3791 		ut_ad(!index->table->is_temporary());
3792 		if (dict_index_is_spatial(index)) {
3793 			/* Do nothing */
3794 		} else {
3795 			/* The cursor might be moved to the other page
3796 			and the max trx id field should be updated after
3797 			the cursor was fixed. */
3798 			if (!dict_index_is_clust(index)) {
3799 				page_update_max_trx_id(
3800 					btr_cur_get_block(cursor),
3801 					btr_cur_get_page_zip(cursor),
3802 					thr_get_trx(thr)->id, mtr);
3803 			}
3804 
3805 			if (!page_rec_is_infimum(btr_cur_get_rec(cursor))
3806 			    || !page_has_prev(btr_cur_get_page(cursor))) {
3807 				/* split and inserted need to call
3808 				lock_update_insert() always. */
3809 				inherit = true;
3810 			}
3811 		}
3812 	}
3813 
3814 	if (!page_is_leaf(btr_cur_get_page(cursor))) {
3815 		ut_ad(!big_rec_vec);
3816 	} else {
3817 #ifdef BTR_CUR_HASH_ADAPT
3818 # ifdef MYSQL_INDEX_DISABLE_AHI
3819 		if (index->disable_ahi); else
3820 # endif
3821 		if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
3822 			ut_ad(entry->is_metadata());
3823 			ut_ad(index->is_instant());
3824 			ut_ad(flags & BTR_NO_LOCKING_FLAG);
3825 			ut_ad(!(flags & BTR_CREATE_FLAG));
3826 		} else {
3827 			btr_search_update_hash_on_insert(
3828 				cursor, btr_search_sys.get_latch(*index));
3829 		}
3830 #endif /* BTR_CUR_HASH_ADAPT */
3831 		if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {
3832 
3833 			lock_update_insert(btr_cur_get_block(cursor), *rec);
3834 		}
3835 	}
3836 
3837 	index->table->space->release_free_extents(n_reserved);
3838 	*big_rec = big_rec_vec;
3839 
3840 	return(DB_SUCCESS);
3841 }
3842 
3843 /*==================== B-TREE UPDATE =========================*/
3844 
3845 /*************************************************************//**
3846 For an update, checks the locks and does the undo logging.
3847 @return DB_SUCCESS, DB_WAIT_LOCK, or error number */
UNIV_INLINE(warn_unused_result)3848 UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
3849 dberr_t
3850 btr_cur_upd_lock_and_undo(
3851 /*======================*/
3852 	ulint		flags,	/*!< in: undo logging and locking flags */
3853 	btr_cur_t*	cursor,	/*!< in: cursor on record to update */
3854 	const rec_offs*	offsets,/*!< in: rec_get_offsets() on cursor */
3855 	const upd_t*	update,	/*!< in: update vector */
3856 	ulint		cmpl_info,/*!< in: compiler info on secondary index
3857 				updates */
3858 	que_thr_t*	thr,	/*!< in: query thread
3859 				(can be NULL if BTR_NO_LOCKING_FLAG) */
3860 	mtr_t*		mtr,	/*!< in/out: mini-transaction */
3861 	roll_ptr_t*	roll_ptr)/*!< out: roll pointer */
3862 {
3863 	dict_index_t*	index;
3864 	const rec_t*	rec;
3865 	dberr_t		err;
3866 
3867 	ut_ad((thr != NULL) || (flags & BTR_NO_LOCKING_FLAG));
3868 
3869 	rec = btr_cur_get_rec(cursor);
3870 	index = cursor->index;
3871 
3872 	ut_ad(rec_offs_validate(rec, index, offsets));
3873 	ut_ad(mtr->is_named_space(index->table->space));
3874 
3875 	if (!dict_index_is_clust(index)) {
3876 		ut_ad(dict_index_is_online_ddl(index)
3877 		      == !!(flags & BTR_CREATE_FLAG));
3878 
3879 		/* We do undo logging only when we update a clustered index
3880 		record */
3881 		return(lock_sec_rec_modify_check_and_lock(
3882 			       flags, btr_cur_get_block(cursor), rec,
3883 			       index, thr, mtr));
3884 	}
3885 
3886 	/* Check if we have to wait for a lock: enqueue an explicit lock
3887 	request if yes */
3888 
3889 	if (!(flags & BTR_NO_LOCKING_FLAG)) {
3890 		err = lock_clust_rec_modify_check_and_lock(
3891 			flags, btr_cur_get_block(cursor), rec, index,
3892 			offsets, thr);
3893 		if (err != DB_SUCCESS) {
3894 			return(err);
3895 		}
3896 	}
3897 
3898 	/* Append the info about the update in the undo log */
3899 
3900 	return((flags & BTR_NO_UNDO_LOG_FLAG)
3901 	       ? DB_SUCCESS
3902 	       : trx_undo_report_row_operation(
3903 		       thr, index, NULL, update,
3904 		       cmpl_info, rec, offsets, roll_ptr));
3905 }
3906 
3907 /** Write DB_TRX_ID,DB_ROLL_PTR to a clustered index entry.
3908 @param[in,out]	entry		clustered index entry
3909 @param[in]	index		clustered index
3910 @param[in]	trx_id		DB_TRX_ID
3911 @param[in]	roll_ptr	DB_ROLL_PTR */
btr_cur_write_sys(dtuple_t * entry,const dict_index_t * index,trx_id_t trx_id,roll_ptr_t roll_ptr)3912 static void btr_cur_write_sys(
3913 	dtuple_t*		entry,
3914 	const dict_index_t*	index,
3915 	trx_id_t		trx_id,
3916 	roll_ptr_t		roll_ptr)
3917 {
3918 	dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id());
3919 	ut_ad(t->len == DATA_TRX_ID_LEN);
3920 	trx_write_trx_id(static_cast<byte*>(t->data), trx_id);
3921 	dfield_t* r = dtuple_get_nth_field(entry, index->db_roll_ptr());
3922 	ut_ad(r->len == DATA_ROLL_PTR_LEN);
3923 	trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr);
3924 }
3925 
3926 /** Update DB_TRX_ID, DB_ROLL_PTR in a clustered index record.
3927 @param[in,out]  block           clustered index leaf page
3928 @param[in,out]  rec             clustered index record
3929 @param[in]      index           clustered index
3930 @param[in]      offsets         rec_get_offsets(rec, index)
3931 @param[in]      trx             transaction
3932 @param[in]      roll_ptr        DB_ROLL_PTR value
3933 @param[in,out]  mtr             mini-transaction */
btr_cur_upd_rec_sys(buf_block_t * block,rec_t * rec,dict_index_t * index,const rec_offs * offsets,const trx_t * trx,roll_ptr_t roll_ptr,mtr_t * mtr)3934 static void btr_cur_upd_rec_sys(buf_block_t *block, rec_t *rec,
3935                                 dict_index_t *index, const rec_offs *offsets,
3936                                 const trx_t *trx, roll_ptr_t roll_ptr,
3937                                 mtr_t *mtr)
3938 {
3939   ut_ad(index->is_primary());
3940   ut_ad(rec_offs_validate(rec, index, offsets));
3941 
3942   if (UNIV_LIKELY_NULL(block->page.zip.data))
3943   {
3944     page_zip_write_trx_id_and_roll_ptr(block, rec, offsets, index->db_trx_id(),
3945                                        trx->id, roll_ptr, mtr);
3946     return;
3947   }
3948 
3949   ulint offset= index->trx_id_offset;
3950 
3951   if (!offset)
3952     offset= row_get_trx_id_offset(index, offsets);
3953 
3954   compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
3955 
3956   /* During IMPORT the trx id in the record can be in the future, if
3957   the .ibd file is being imported from another instance. During IMPORT
3958   roll_ptr will be 0. */
3959   ut_ad(roll_ptr == 0 ||
3960         lock_check_trx_id_sanity(trx_read_trx_id(rec + offset),
3961                                  rec, index, offsets));
3962 
3963   byte sys[DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN];
3964 
3965   trx_write_trx_id(sys, trx->id);
3966   trx_write_roll_ptr(sys + DATA_TRX_ID_LEN, roll_ptr);
3967 
3968   ulint d= 0;
3969   const byte *src= nullptr;
3970   byte *dest= rec + offset;
3971   ulint len= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
3972 
3973   if (UNIV_LIKELY(index->trx_id_offset))
3974   {
3975     const rec_t *prev= page_rec_get_prev_const(rec);
3976     if (UNIV_UNLIKELY(prev == rec))
3977       ut_ad(0);
3978     else if (page_rec_is_infimum(prev));
3979     else
3980       for (src= prev + offset; d < DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN; d++)
3981         if (src[d] != sys[d])
3982           break;
3983     if (d > 6 && memcmp(dest, sys, d))
3984     {
3985       /* We save space by replacing a single record
3986 
3987       WRITE,page_offset(dest),byte[13]
3988 
3989       with two records:
3990 
3991       MEMMOVE,page_offset(dest),d(1 byte),offset(1..3 bytes),
3992       WRITE|0x80,0,byte[13-d]
3993 
3994       The single WRITE record would be x+13 bytes long, with x>2.
3995       The MEMMOVE record would be up to x+1+3 = x+4 bytes, and the
3996       second WRITE would be 1+1+13-d = 15-d bytes.
3997 
3998       The total size is: x+13 versus x+4+15-d = x+19-d bytes.
3999       To save space, we must have d>6, that is, the complete DB_TRX_ID and
4000       the first byte(s) of DB_ROLL_PTR must match the previous record. */
4001       memcpy(dest, src, d);
4002       mtr->memmove(*block, page_offset(dest), page_offset(src), d);
4003       dest+= d;
4004       len-= d;
4005       /* DB_TRX_ID,DB_ROLL_PTR must be unique in each record when
4006       DB_TRX_ID refers to an active transaction. */
4007       ut_ad(len);
4008     }
4009     else
4010       d= 0;
4011   }
4012 
4013   if (UNIV_LIKELY(len)) /* extra safety, to avoid corrupting the log */
4014     mtr->memcpy<mtr_t::MAYBE_NOP>(*block, dest, sys + d, len);
4015 }
4016 
4017 /*************************************************************//**
4018 See if there is enough place in the page modification log to log
4019 an update-in-place.
4020 
4021 @retval false if out of space; IBUF_BITMAP_FREE will be reset
4022 outside mtr if the page was recompressed
4023 @retval true if enough place;
4024 
4025 IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
4026 a secondary index leaf page. This has to be done either within the
4027 same mini-transaction, or by invoking ibuf_reset_free_bits() before
4028 mtr_commit(mtr). */
4029 bool
btr_cur_update_alloc_zip_func(page_zip_des_t * page_zip,page_cur_t * cursor,dict_index_t * index,rec_offs * offsets,ulint length,bool create,mtr_t * mtr)4030 btr_cur_update_alloc_zip_func(
4031 /*==========================*/
4032 	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
4033 	page_cur_t*	cursor,	/*!< in/out: B-tree page cursor */
4034 	dict_index_t*	index,	/*!< in: the index corresponding to cursor */
4035 #ifdef UNIV_DEBUG
4036 	rec_offs*	offsets,/*!< in/out: offsets of the cursor record */
4037 #endif /* UNIV_DEBUG */
4038 	ulint		length,	/*!< in: size needed */
4039 	bool		create,	/*!< in: true=delete-and-insert,
4040 				false=update-in-place */
4041 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
4042 {
4043 
4044 	/* Have a local copy of the variables as these can change
4045 	dynamically. */
4046 	const page_t*	page = page_cur_get_page(cursor);
4047 
4048 	ut_ad(page_zip == page_cur_get_page_zip(cursor));
4049 	ut_ad(!dict_index_is_ibuf(index));
4050 	ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
4051 
4052 	if (page_zip_available(page_zip, dict_index_is_clust(index),
4053 			       length, create)) {
4054 		return(true);
4055 	}
4056 
4057 	if (!page_zip->m_nonempty && !page_has_garbage(page)) {
4058 		/* The page has been freshly compressed, so
4059 		reorganizing it will not help. */
4060 		return(false);
4061 	}
4062 
4063 	if (create && page_is_leaf(page)
4064 	    && (length + page_get_data_size(page)
4065 		>= dict_index_zip_pad_optimal_page_size(index))) {
4066 		return(false);
4067 	}
4068 
4069 	if (!btr_page_reorganize(cursor, index, mtr)) {
4070 		goto out_of_space;
4071 	}
4072 
4073 	rec_offs_make_valid(page_cur_get_rec(cursor), index,
4074 			    page_is_leaf(page), offsets);
4075 
4076 	/* After recompressing a page, we must make sure that the free
4077 	bits in the insert buffer bitmap will not exceed the free
4078 	space on the page.  Because this function will not attempt
4079 	recompression unless page_zip_available() fails above, it is
4080 	safe to reset the free bits if page_zip_available() fails
4081 	again, below.  The free bits can safely be reset in a separate
4082 	mini-transaction.  If page_zip_available() succeeds below, we
4083 	can be sure that the btr_page_reorganize() above did not reduce
4084 	the free space available on the page. */
4085 
4086 	if (page_zip_available(page_zip, dict_index_is_clust(index),
4087 			       length, create)) {
4088 		return(true);
4089 	}
4090 
4091 out_of_space:
4092 	ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
4093 
4094 	/* Out of space: reset the free bits. */
4095 	if (!dict_index_is_clust(index)
4096 	    && !index->table->is_temporary()
4097 	    && page_is_leaf(page)) {
4098 		ibuf_reset_free_bits(page_cur_get_block(cursor));
4099 	}
4100 
4101 	return(false);
4102 }
4103 
4104 /** Apply an update vector to a record. No field size changes are allowed.
4105 
4106 This is usually invoked on a clustered index. The only use case for a
4107 secondary index is row_ins_sec_index_entry_by_modify() or its
4108 counterpart in ibuf_insert_to_index_page().
4109 @param[in,out]  rec     index record
4110 @param[in]      index   the index of the record
4111 @param[in]      offsets rec_get_offsets(rec, index)
4112 @param[in]      update  update vector
4113 @param[in,out]  block   index page
4114 @param[in,out]  mtr     mini-transaction */
btr_cur_upd_rec_in_place(rec_t * rec,const dict_index_t * index,const rec_offs * offsets,const upd_t * update,buf_block_t * block,mtr_t * mtr)4115 void btr_cur_upd_rec_in_place(rec_t *rec, const dict_index_t *index,
4116                               const rec_offs *offsets, const upd_t *update,
4117                               buf_block_t *block, mtr_t *mtr)
4118 {
4119 	ut_ad(rec_offs_validate(rec, index, offsets));
4120 	ut_ad(!index->table->skip_alter_undo);
4121 	ut_ad(!block->page.zip.data || index->table->not_redundant());
4122 
4123 #ifdef UNIV_DEBUG
4124 	if (rec_offs_comp(offsets)) {
4125 		switch (rec_get_status(rec)) {
4126 		case REC_STATUS_ORDINARY:
4127 			break;
4128 		case REC_STATUS_INSTANT:
4129 			ut_ad(index->is_instant());
4130 			break;
4131 		case REC_STATUS_NODE_PTR:
4132 		case REC_STATUS_INFIMUM:
4133 		case REC_STATUS_SUPREMUM:
4134 			ut_ad("wrong record status in update" == 0);
4135 		}
4136 	}
4137 #endif /* UNIV_DEBUG */
4138 
4139 	static_assert(REC_INFO_BITS_SHIFT == 0, "compatibility");
4140 	if (UNIV_LIKELY_NULL(block->page.zip.data)) {
4141 		ut_ad(rec_offs_comp(offsets));
4142 		byte* info_bits = &rec[-REC_NEW_INFO_BITS];
4143 		const bool flip_del_mark = (*info_bits ^ update->info_bits)
4144 			& REC_INFO_DELETED_FLAG;
4145 		*info_bits &= byte(~REC_INFO_BITS_MASK);
4146 		*info_bits |= update->info_bits;
4147 
4148 		if (flip_del_mark) {
4149 			page_zip_rec_set_deleted(block, rec, update->info_bits
4150 						 & REC_INFO_DELETED_FLAG, mtr);
4151 		}
4152 	} else {
4153 		byte* info_bits = &rec[rec_offs_comp(offsets)
4154 				       ? -REC_NEW_INFO_BITS
4155 				       : -REC_OLD_INFO_BITS];
4156 
4157 		mtr->write<1,mtr_t::MAYBE_NOP>(*block, info_bits,
4158 					       (*info_bits
4159 						& ~REC_INFO_BITS_MASK)
4160 					       | update->info_bits);
4161 	}
4162 
4163 	for (ulint i = 0; i < update->n_fields; i++) {
4164 		const upd_field_t* uf = upd_get_nth_field(update, i);
4165 		if (upd_fld_is_virtual_col(uf) && !index->has_virtual()) {
4166 			continue;
4167 		}
4168 		const ulint n = uf->field_no;
4169 
4170 		ut_ad(!dfield_is_ext(&uf->new_val)
4171 		      == !rec_offs_nth_extern(offsets, n));
4172 		ut_ad(!rec_offs_nth_default(offsets, n));
4173 
4174 		if (UNIV_UNLIKELY(dfield_is_null(&uf->new_val))) {
4175 			if (rec_offs_nth_sql_null(offsets, n)) {
4176 				ut_ad(index->table->is_instant());
4177 				ut_ad(n >= index->n_core_fields);
4178 				continue;
4179 			}
4180 
4181 			ut_ad(!index->table->not_redundant());
4182 			switch (ulint size = rec_get_nth_field_size(rec, n)) {
4183 			case 0:
4184 				break;
4185 			case 1:
4186 				mtr->write<1,mtr_t::MAYBE_NOP>(
4187 					*block,
4188 					rec_get_field_start_offs(rec, n) + rec,
4189 					0U);
4190 				break;
4191 			default:
4192 				mtr->memset(
4193 					block,
4194 					page_offset(rec_get_field_start_offs(
4195 							    rec, n) + rec),
4196 					size, 0);
4197 			}
4198 			ulint l = rec_get_1byte_offs_flag(rec)
4199 				? (n + 1) : (n + 1) * 2;
4200 			byte* b = rec - REC_N_OLD_EXTRA_BYTES - l;
4201 			compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
4202 					    == REC_2BYTE_SQL_NULL_MASK);
4203 			mtr->write<1>(*block, b,
4204 				      byte(*b | REC_1BYTE_SQL_NULL_MASK));
4205 			continue;
4206 		}
4207 
4208 		ulint len;
4209 		byte* data = rec_get_nth_field(rec, offsets, n, &len);
4210 		if (UNIV_LIKELY_NULL(block->page.zip.data)) {
4211 			ut_ad(len == uf->new_val.len);
4212 			memcpy(data, uf->new_val.data, len);
4213 			continue;
4214 		}
4215 
4216 		if (UNIV_UNLIKELY(len != uf->new_val.len)) {
4217 			ut_ad(len == UNIV_SQL_NULL);
4218 			ut_ad(!rec_offs_comp(offsets));
4219 			len = uf->new_val.len;
4220 			ut_ad(len == rec_get_nth_field_size(rec, n));
4221 			ulint l = rec_get_1byte_offs_flag(rec)
4222 				? (n + 1) : (n + 1) * 2;
4223 			byte* b = rec - REC_N_OLD_EXTRA_BYTES - l;
4224 			compile_time_assert(REC_1BYTE_SQL_NULL_MASK << 8
4225 					    == REC_2BYTE_SQL_NULL_MASK);
4226 			mtr->write<1>(*block, b,
4227 				      byte(*b & ~REC_1BYTE_SQL_NULL_MASK));
4228 		}
4229 
4230 		if (len) {
4231 			mtr->memcpy<mtr_t::MAYBE_NOP>(*block, data,
4232 						      uf->new_val.data, len);
4233 		}
4234 	}
4235 
4236 	if (UNIV_LIKELY_NULL(block->page.zip.data)) {
4237 		page_zip_write_rec(block, rec, index, offsets, 0, mtr);
4238 	}
4239 }
4240 
4241 /*************************************************************//**
4242 Updates a record when the update causes no size changes in its fields.
4243 We assume here that the ordering fields of the record do not change.
4244 @return locking or undo log related error code, or
4245 @retval DB_SUCCESS on success
4246 @retval DB_ZIP_OVERFLOW if there is not enough space left
4247 on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
4248 dberr_t
btr_cur_update_in_place(ulint flags,btr_cur_t * cursor,rec_offs * offsets,const upd_t * update,ulint cmpl_info,que_thr_t * thr,trx_id_t trx_id,mtr_t * mtr)4249 btr_cur_update_in_place(
4250 /*====================*/
4251 	ulint		flags,	/*!< in: undo logging and locking flags */
4252 	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
4253 				cursor stays valid and positioned on the
4254 				same record */
4255 	rec_offs*	offsets,/*!< in/out: offsets on cursor->page_cur.rec */
4256 	const upd_t*	update,	/*!< in: update vector */
4257 	ulint		cmpl_info,/*!< in: compiler info on secondary index
4258 				updates */
4259 	que_thr_t*	thr,	/*!< in: query thread */
4260 	trx_id_t	trx_id,	/*!< in: transaction id */
4261 	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
4262 				is a secondary index, the caller must
4263 				mtr_commit(mtr) before latching any
4264 				further pages */
4265 {
4266 	dict_index_t*	index;
4267 	dberr_t		err;
4268 	rec_t*		rec;
4269 	roll_ptr_t	roll_ptr	= 0;
4270 	ulint		was_delete_marked;
4271 
4272 	ut_ad(page_is_leaf(cursor->page_cur.block->frame));
4273 	rec = btr_cur_get_rec(cursor);
4274 	index = cursor->index;
4275 	ut_ad(rec_offs_validate(rec, index, offsets));
4276 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4277 	ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
4278 	      || index->table->is_temporary());
4279 	/* The insert buffer tree should never be updated in place. */
4280 	ut_ad(!dict_index_is_ibuf(index));
4281 	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4282 	      || dict_index_is_clust(index));
4283 	ut_ad(thr_get_trx(thr)->id == trx_id
4284 	      || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)))
4285 	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4286 		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4287 	ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor)));
4288 	ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
4289 	ut_ad(!(update->info_bits & REC_INFO_MIN_REC_FLAG));
4290 
4291 	DBUG_LOG("ib_cur",
4292 		 "update-in-place " << index->name << " (" << index->id
4293 		 << ") by " << ib::hex(trx_id) << ": "
4294 		 << rec_printer(rec, offsets).str());
4295 
4296 	buf_block_t* block = btr_cur_get_block(cursor);
4297 	page_zip_des_t*	page_zip = buf_block_get_page_zip(block);
4298 
4299 	/* Check that enough space is available on the compressed page. */
4300 	if (UNIV_LIKELY_NULL(page_zip)) {
4301 		ut_ad(!index->table->is_temporary());
4302 
4303 		if (!btr_cur_update_alloc_zip(
4304 			    page_zip, btr_cur_get_page_cur(cursor),
4305 			    index, offsets, rec_offs_size(offsets),
4306 			    false, mtr)) {
4307 			return(DB_ZIP_OVERFLOW);
4308 		}
4309 
4310 		rec = btr_cur_get_rec(cursor);
4311 	}
4312 
4313 	/* Do lock checking and undo logging */
4314 	err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
4315 					update, cmpl_info,
4316 					thr, mtr, &roll_ptr);
4317 	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
4318 		/* We may need to update the IBUF_BITMAP_FREE
4319 		bits after a reorganize that was done in
4320 		btr_cur_update_alloc_zip(). */
4321 		goto func_exit;
4322 	}
4323 
4324 	if (!(flags & BTR_KEEP_SYS_FLAG)) {
4325 		btr_cur_upd_rec_sys(block, rec, index, offsets,
4326 				    thr_get_trx(thr), roll_ptr, mtr);
4327 	}
4328 
4329 	was_delete_marked = rec_get_deleted_flag(
4330 		rec, page_is_comp(buf_block_get_frame(block)));
4331 	/* In delete-marked records, DB_TRX_ID must always refer to an
4332 	existing undo log record. */
4333 	ut_ad(!was_delete_marked
4334 	      || !dict_index_is_clust(index)
4335 	      || row_get_rec_trx_id(rec, index, offsets));
4336 
4337 #ifdef BTR_CUR_HASH_ADAPT
4338 	{
4339 		rw_lock_t* ahi_latch = block->index
4340 			? btr_search_sys.get_latch(*index) : NULL;
4341 		if (ahi_latch) {
4342 			/* TO DO: Can we skip this if none of the fields
4343 			index->search_info->curr_n_fields
4344 			are being updated? */
4345 
4346 			/* The function row_upd_changes_ord_field_binary
4347 			does not work on a secondary index. */
4348 
4349 			if (!dict_index_is_clust(index)
4350 			    || row_upd_changes_ord_field_binary(
4351 				    index, update, thr, NULL, NULL)) {
4352 				ut_ad(!(update->info_bits
4353 					& REC_INFO_MIN_REC_FLAG));
4354 				/* Remove possible hash index pointer
4355 				to this record */
4356 				btr_search_update_hash_on_delete(cursor);
4357 			}
4358 
4359 			rw_lock_x_lock(ahi_latch);
4360 		}
4361 
4362 		assert_block_ahi_valid(block);
4363 #endif /* BTR_CUR_HASH_ADAPT */
4364 
4365 		btr_cur_upd_rec_in_place(rec, index, offsets, update, block,
4366 					 mtr);
4367 
4368 #ifdef BTR_CUR_HASH_ADAPT
4369 		if (ahi_latch) {
4370 			rw_lock_x_unlock(ahi_latch);
4371 		}
4372 	}
4373 #endif /* BTR_CUR_HASH_ADAPT */
4374 
4375 	if (was_delete_marked
4376 	    && !rec_get_deleted_flag(
4377 		    rec, page_is_comp(buf_block_get_frame(block)))) {
4378 		/* The new updated record owns its possible externally
4379 		stored fields */
4380 
4381 		btr_cur_unmark_extern_fields(block, rec, index, offsets, mtr);
4382 	}
4383 
4384 	ut_ad(err == DB_SUCCESS);
4385 
4386 func_exit:
4387 	if (page_zip
4388 	    && !(flags & BTR_KEEP_IBUF_BITMAP)
4389 	    && !dict_index_is_clust(index)
4390 	    && page_is_leaf(buf_block_get_frame(block))) {
4391 		/* Update the free bits in the insert buffer. */
4392 		ut_ad(!index->table->is_temporary());
4393 		ibuf_update_free_bits_zip(block, mtr);
4394 	}
4395 
4396 	return(err);
4397 }
4398 
4399 /** Trim a metadata record during the rollback of instant ALTER TABLE.
4400 @param[in]	entry	metadata tuple
4401 @param[in]	index	primary key
4402 @param[in]	update	update vector for the rollback */
4403 ATTRIBUTE_COLD
btr_cur_trim_alter_metadata(dtuple_t * entry,const dict_index_t * index,const upd_t * update)4404 static void btr_cur_trim_alter_metadata(dtuple_t* entry,
4405 					const dict_index_t* index,
4406 					const upd_t* update)
4407 {
4408 	ut_ad(index->is_instant());
4409 	ut_ad(update->is_alter_metadata());
4410 	ut_ad(entry->is_alter_metadata());
4411 
4412 	ut_ad(update->fields[0].field_no == index->first_user_field());
4413 	ut_ad(update->fields[0].new_val.ext);
4414 	ut_ad(update->fields[0].new_val.len == FIELD_REF_SIZE);
4415 	ut_ad(entry->n_fields - 1 == index->n_fields);
4416 
4417 	const byte* ptr = static_cast<const byte*>(
4418 		update->fields[0].new_val.data);
4419 	ut_ad(!mach_read_from_4(ptr + BTR_EXTERN_LEN));
4420 	ut_ad(mach_read_from_4(ptr + BTR_EXTERN_LEN + 4) > 4);
4421 	ut_ad(mach_read_from_4(ptr + BTR_EXTERN_OFFSET) == FIL_PAGE_DATA);
4422 	ut_ad(mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
4423 	      == index->table->space->id);
4424 
4425 	ulint n_fields = update->fields[1].field_no;
4426 	ut_ad(n_fields <= index->n_fields);
4427 	if (n_fields != index->n_uniq) {
4428 		ut_ad(n_fields
4429 		      >= index->n_core_fields);
4430 		entry->n_fields = n_fields;
4431 		return;
4432 	}
4433 
4434 	/* This is based on dict_table_t::deserialise_columns()
4435 	and btr_cur_instant_init_low(). */
4436 	mtr_t mtr;
4437 	mtr.start();
4438 	buf_block_t* block = buf_page_get(
4439 		page_id_t(index->table->space->id,
4440 			  mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
4441 		0, RW_S_LATCH, &mtr);
4442 	buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
4443 	ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_TYPE_BLOB);
4444 	ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA
4445 					     + BTR_BLOB_HDR_NEXT_PAGE_NO])
4446 	      == FIL_NULL);
4447 	ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA
4448 					     + BTR_BLOB_HDR_PART_LEN])
4449 	      == mach_read_from_4(ptr + BTR_EXTERN_LEN + 4));
4450 	n_fields = mach_read_from_4(
4451 		&block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE])
4452 		+ index->first_user_field();
4453 	/* Rollback should not increase the number of fields. */
4454 	ut_ad(n_fields <= index->n_fields);
4455 	ut_ad(n_fields + 1 <= entry->n_fields);
4456 	/* dict_index_t::clear_instant_alter() cannot be invoked while
4457 	rollback of an instant ALTER TABLE transaction is in progress
4458 	for an is_alter_metadata() record. */
4459 	ut_ad(n_fields >= index->n_core_fields);
4460 
4461 	mtr.commit();
4462 	entry->n_fields = n_fields + 1;
4463 }
4464 
4465 /** Trim an update tuple due to instant ADD COLUMN, if needed.
4466 For normal records, the trailing instantly added fields that match
4467 the initial default values are omitted.
4468 
4469 For the special metadata record on a table on which instant
4470 ADD COLUMN has already been executed, both ADD COLUMN and the
4471 rollback of ADD COLUMN need to be handled specially.
4472 
4473 @param[in,out]	entry	index entry
4474 @param[in]	index	index
4475 @param[in]	update	update vector
4476 @param[in]	thr	execution thread */
4477 static inline
4478 void
btr_cur_trim(dtuple_t * entry,const dict_index_t * index,const upd_t * update,const que_thr_t * thr)4479 btr_cur_trim(
4480 	dtuple_t*		entry,
4481 	const dict_index_t*	index,
4482 	const upd_t*		update,
4483 	const que_thr_t*	thr)
4484 {
4485 	if (!index->is_instant()) {
4486 	} else if (UNIV_UNLIKELY(update->is_metadata())) {
4487 		/* We are either updating a metadata record
4488 		(instant ALTER TABLE on a table where instant ALTER was
4489 		already executed) or rolling back such an operation. */
4490 		ut_ad(!upd_get_nth_field(update, 0)->orig_len);
4491 		ut_ad(entry->is_metadata());
4492 
4493 		if (thr->graph->trx->in_rollback) {
4494 			/* This rollback can occur either as part of
4495 			ha_innobase::commit_inplace_alter_table() rolling
4496 			back after a failed innobase_add_instant_try(),
4497 			or as part of crash recovery. Either way, the
4498 			table will be in the data dictionary cache, with
4499 			the instantly added columns going to be removed
4500 			later in the rollback. */
4501 			ut_ad(index->table->cached);
4502 			/* The DB_TRX_ID,DB_ROLL_PTR are always last,
4503 			and there should be some change to roll back.
4504 			The first field in the update vector is the
4505 			first instantly added column logged by
4506 			innobase_add_instant_try(). */
4507 			ut_ad(update->n_fields > 2);
4508 			if (update->is_alter_metadata()) {
4509 				btr_cur_trim_alter_metadata(
4510 					entry, index, update);
4511 				return;
4512 			}
4513 			ut_ad(!entry->is_alter_metadata());
4514 
4515 			ulint n_fields = upd_get_nth_field(update, 0)
4516 				->field_no;
4517 			ut_ad(n_fields + 1 >= entry->n_fields);
4518 			entry->n_fields = n_fields;
4519 		}
4520 	} else {
4521 		entry->trim(*index);
4522 	}
4523 }
4524 
4525 /*************************************************************//**
4526 Tries to update a record on a page in an index tree. It is assumed that mtr
4527 holds an x-latch on the page. The operation does not succeed if there is too
4528 little space on the page or if the update would result in too empty a page,
4529 so that tree compression is recommended. We assume here that the ordering
4530 fields of the record do not change.
4531 @return error code, including
4532 @retval DB_SUCCESS on success
4533 @retval DB_OVERFLOW if the updated record does not fit
4534 @retval DB_UNDERFLOW if the page would become too empty
4535 @retval DB_ZIP_OVERFLOW if there is not enough space left
4536 on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
4537 dberr_t
btr_cur_optimistic_update(ulint flags,btr_cur_t * cursor,rec_offs ** offsets,mem_heap_t ** heap,const upd_t * update,ulint cmpl_info,que_thr_t * thr,trx_id_t trx_id,mtr_t * mtr)4538 btr_cur_optimistic_update(
4539 /*======================*/
4540 	ulint		flags,	/*!< in: undo logging and locking flags */
4541 	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
4542 				cursor stays valid and positioned on the
4543 				same record */
4544 	rec_offs**	offsets,/*!< out: offsets on cursor->page_cur.rec */
4545 	mem_heap_t**	heap,	/*!< in/out: pointer to NULL or memory heap */
4546 	const upd_t*	update,	/*!< in: update vector; this must also
4547 				contain trx id and roll ptr fields */
4548 	ulint		cmpl_info,/*!< in: compiler info on secondary index
4549 				updates */
4550 	que_thr_t*	thr,	/*!< in: query thread */
4551 	trx_id_t	trx_id,	/*!< in: transaction id */
4552 	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
4553 				is a secondary index, the caller must
4554 				mtr_commit(mtr) before latching any
4555 				further pages */
4556 {
4557 	dict_index_t*	index;
4558 	page_cur_t*	page_cursor;
4559 	dberr_t		err;
4560 	buf_block_t*	block;
4561 	page_t*		page;
4562 	page_zip_des_t*	page_zip;
4563 	rec_t*		rec;
4564 	ulint		max_size;
4565 	ulint		new_rec_size;
4566 	ulint		old_rec_size;
4567 	ulint		max_ins_size = 0;
4568 	dtuple_t*	new_entry;
4569 	roll_ptr_t	roll_ptr;
4570 	ulint		i;
4571 
4572 	block = btr_cur_get_block(cursor);
4573 	page = buf_block_get_frame(block);
4574 	rec = btr_cur_get_rec(cursor);
4575 	index = cursor->index;
4576 	ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
4577 	      || index->table->is_temporary());
4578 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4579 	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
4580 	/* This is intended only for leaf page updates */
4581 	ut_ad(page_is_leaf(page));
4582 	/* The insert buffer tree should never be updated in place. */
4583 	ut_ad(!dict_index_is_ibuf(index));
4584 	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4585 	      || dict_index_is_clust(index));
4586 	ut_ad(thr_get_trx(thr)->id == trx_id
4587 	      || (flags & ulint(~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP)))
4588 	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4589 		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4590 	ut_ad(fil_page_index_page_check(page));
4591 	ut_ad(btr_page_get_index_id(page) == index->id);
4592 
4593 	*offsets = rec_get_offsets(rec, index, *offsets, index->n_core_fields,
4594 				   ULINT_UNDEFINED, heap);
4595 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
4596 	ut_a(!rec_offs_any_null_extern(rec, *offsets)
4597 	     || thr_get_trx(thr) == trx_roll_crash_recv_trx);
4598 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
4599 
4600 	if (UNIV_LIKELY(!update->is_metadata())
4601 	    && !row_upd_changes_field_size_or_external(index, *offsets,
4602 						       update)) {
4603 
4604 		/* The simplest and the most common case: the update does not
4605 		change the size of any field and none of the updated fields is
4606 		externally stored in rec or update, and there is enough space
4607 		on the compressed page to log the update. */
4608 
4609 		return(btr_cur_update_in_place(
4610 			       flags, cursor, *offsets, update,
4611 			       cmpl_info, thr, trx_id, mtr));
4612 	}
4613 
4614 	if (rec_offs_any_extern(*offsets)) {
4615 any_extern:
4616 		ut_ad(!index->is_ibuf());
4617 		/* Externally stored fields are treated in pessimistic
4618 		update */
4619 
4620 		/* prefetch siblings of the leaf for the pessimistic
4621 		operation. */
4622 		btr_cur_prefetch_siblings(block, index);
4623 
4624 		return(DB_OVERFLOW);
4625 	}
4626 
4627 	if (rec_is_metadata(rec, *index) && index->table->instant) {
4628 		goto any_extern;
4629 	}
4630 
4631 	for (i = 0; i < upd_get_n_fields(update); i++) {
4632 		if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
4633 
4634 			goto any_extern;
4635 		}
4636 	}
4637 
4638 	DBUG_LOG("ib_cur",
4639 		 "update " << index->name << " (" << index->id << ") by "
4640 		 << ib::hex(trx_id) << ": "
4641 		 << rec_printer(rec, *offsets).str());
4642 
4643 	page_cursor = btr_cur_get_page_cur(cursor);
4644 
4645 	if (!*heap) {
4646 		*heap = mem_heap_create(
4647 			rec_offs_size(*offsets)
4648 			+ DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
4649 	}
4650 
4651 	new_entry = row_rec_to_index_entry(rec, index, *offsets, *heap);
4652 	ut_ad(!dtuple_get_n_ext(new_entry));
4653 
4654 	/* The page containing the clustered index record
4655 	corresponding to new_entry is latched in mtr.
4656 	Thus the following call is safe. */
4657 	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4658 						     *heap);
4659 	btr_cur_trim(new_entry, index, update, thr);
4660 	old_rec_size = rec_offs_size(*offsets);
4661 	new_rec_size = rec_get_converted_size(index, new_entry, 0);
4662 
4663 	page_zip = buf_block_get_page_zip(block);
4664 #ifdef UNIV_ZIP_DEBUG
4665 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4666 #endif /* UNIV_ZIP_DEBUG */
4667 
4668 	if (page_zip) {
4669 		ut_ad(!index->table->is_temporary());
4670 
4671 		if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page),
4672 					   dict_index_get_n_fields(index),
4673 					   block->zip_size())) {
4674 			goto any_extern;
4675 		}
4676 
4677 		if (!btr_cur_update_alloc_zip(
4678 			    page_zip, page_cursor, index, *offsets,
4679 			    new_rec_size, true, mtr)) {
4680 			return(DB_ZIP_OVERFLOW);
4681 		}
4682 
4683 		rec = page_cur_get_rec(page_cursor);
4684 	}
4685 
4686 	/* We limit max record size to 16k even for 64k page size. */
4687 	if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE ||
4688 			(!dict_table_is_comp(index->table)
4689 			 && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) {
4690 		err = DB_OVERFLOW;
4691 
4692 		goto func_exit;
4693 	}
4694 
4695 	if (UNIV_UNLIKELY(new_rec_size
4696 			  >= (page_get_free_space_of_empty(page_is_comp(page))
4697 			      / 2))) {
4698 		/* We may need to update the IBUF_BITMAP_FREE
4699 		bits after a reorganize that was done in
4700 		btr_cur_update_alloc_zip(). */
4701 		err = DB_OVERFLOW;
4702 		goto func_exit;
4703 	}
4704 
4705 	if (UNIV_UNLIKELY(page_get_data_size(page)
4706 			  - old_rec_size + new_rec_size
4707 			  < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) {
4708 		/* We may need to update the IBUF_BITMAP_FREE
4709 		bits after a reorganize that was done in
4710 		btr_cur_update_alloc_zip(). */
4711 
4712 		/* The page would become too empty */
4713 		err = DB_UNDERFLOW;
4714 		goto func_exit;
4715 	}
4716 
4717 	/* We do not attempt to reorganize if the page is compressed.
4718 	This is because the page may fail to compress after reorganization. */
4719 	max_size = page_zip
4720 		? page_get_max_insert_size(page, 1)
4721 		: (old_rec_size
4722 		   + page_get_max_insert_size_after_reorganize(page, 1));
4723 
4724 	if (!page_zip) {
4725 		max_ins_size = page_get_max_insert_size_after_reorganize(
4726 				page, 1);
4727 	}
4728 
4729 	if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
4730 	       && (max_size >= new_rec_size))
4731 	      || (page_get_n_recs(page) <= 1))) {
4732 
4733 		/* We may need to update the IBUF_BITMAP_FREE
4734 		bits after a reorganize that was done in
4735 		btr_cur_update_alloc_zip(). */
4736 
4737 		/* There was not enough space, or it did not pay to
4738 		reorganize: for simplicity, we decide what to do assuming a
4739 		reorganization is needed, though it might not be necessary */
4740 
4741 		err = DB_OVERFLOW;
4742 		goto func_exit;
4743 	}
4744 
4745 	/* Do lock checking and undo logging */
4746 	err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4747 					update, cmpl_info,
4748 					thr, mtr, &roll_ptr);
4749 	if (err != DB_SUCCESS) {
4750 		/* We may need to update the IBUF_BITMAP_FREE
4751 		bits after a reorganize that was done in
4752 		btr_cur_update_alloc_zip(). */
4753 		goto func_exit;
4754 	}
4755 
4756 	/* Ok, we may do the replacement. Store on the page infimum the
4757 	explicit locks on rec, before deleting rec (see the comment in
4758 	btr_cur_pessimistic_update). */
4759 	if (!dict_table_is_locking_disabled(index->table)) {
4760 		lock_rec_store_on_page_infimum(block, rec);
4761 	}
4762 
4763 	if (UNIV_UNLIKELY(update->is_metadata())) {
4764 		ut_ad(new_entry->is_metadata());
4765 		ut_ad(index->is_instant());
4766 		/* This can be innobase_add_instant_try() performing a
4767 		subsequent instant ADD COLUMN, or its rollback by
4768 		row_undo_mod_clust_low(). */
4769 		ut_ad(flags & BTR_NO_LOCKING_FLAG);
4770 	} else {
4771 		btr_search_update_hash_on_delete(cursor);
4772 	}
4773 
4774 	page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4775 
4776 	page_cur_move_to_prev(page_cursor);
4777 
4778 	if (!(flags & BTR_KEEP_SYS_FLAG)) {
4779 		btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
4780 	}
4781 
4782 	/* There are no externally stored columns in new_entry */
4783 	rec = btr_cur_insert_if_possible(
4784 		cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
4785 	ut_a(rec); /* <- We calculated above the insert would fit */
4786 
4787 	if (UNIV_UNLIKELY(update->is_metadata())) {
4788 		/* We must empty the PAGE_FREE list, because if this
4789 		was a rollback, the shortened metadata record
4790 		would have too many fields, and we would be unable to
4791 		know the size of the freed record. */
4792 		btr_page_reorganize(page_cursor, index, mtr);
4793 	} else if (!dict_table_is_locking_disabled(index->table)) {
4794 		/* Restore the old explicit lock state on the record */
4795 		lock_rec_restore_from_page_infimum(block, rec, block);
4796 	}
4797 
4798 	page_cur_move_to_next(page_cursor);
4799 	ut_ad(err == DB_SUCCESS);
4800 
4801 func_exit:
4802 	if (!(flags & BTR_KEEP_IBUF_BITMAP)
4803 	    && !dict_index_is_clust(index)) {
4804 		/* Update the free bits in the insert buffer. */
4805 		if (page_zip) {
4806 			ut_ad(!index->table->is_temporary());
4807 			ibuf_update_free_bits_zip(block, mtr);
4808 		} else if (!index->table->is_temporary()) {
4809 			ibuf_update_free_bits_low(block, max_ins_size, mtr);
4810 		}
4811 	}
4812 
4813 	if (err != DB_SUCCESS) {
4814 		/* prefetch siblings of the leaf for the pessimistic
4815 		operation. */
4816 		btr_cur_prefetch_siblings(block, index);
4817 	}
4818 
4819 	return(err);
4820 }
4821 
4822 /*************************************************************//**
4823 If, in a split, a new supremum record was created as the predecessor of the
4824 updated record, the supremum record must inherit exactly the locks on the
4825 updated record. In the split it may have inherited locks from the successor
4826 of the updated record, which is not correct. This function restores the
4827 right locks for the new supremum. */
4828 static
4829 void
btr_cur_pess_upd_restore_supremum(buf_block_t * block,const rec_t * rec,mtr_t * mtr)4830 btr_cur_pess_upd_restore_supremum(
4831 /*==============================*/
4832 	buf_block_t*	block,	/*!< in: buffer block of rec */
4833 	const rec_t*	rec,	/*!< in: updated record */
4834 	mtr_t*		mtr)	/*!< in: mtr */
4835 {
4836 	page_t*		page;
4837 	buf_block_t*	prev_block;
4838 
4839 	page = buf_block_get_frame(block);
4840 
4841 	if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
4842 		/* Updated record is not the first user record on its page */
4843 
4844 		return;
4845 	}
4846 
4847 	const uint32_t	prev_page_no = btr_page_get_prev(page);
4848 
4849 	const page_id_t	page_id(block->page.id().space(), prev_page_no);
4850 
4851 	ut_ad(prev_page_no != FIL_NULL);
4852 	prev_block = buf_page_get_with_no_latch(page_id, block->zip_size(),
4853 						mtr);
4854 #ifdef UNIV_BTR_DEBUG
4855 	ut_a(btr_page_get_next(prev_block->frame)
4856 	     == block->page.id().page_no());
4857 #endif /* UNIV_BTR_DEBUG */
4858 
4859 	/* We must already have an x-latch on prev_block! */
4860 	ut_ad(mtr->memo_contains_flagged(prev_block, MTR_MEMO_PAGE_X_FIX));
4861 
4862 	lock_rec_reset_and_inherit_gap_locks(prev_block, block,
4863 					     PAGE_HEAP_NO_SUPREMUM,
4864 					     page_rec_get_heap_no(rec));
4865 }
4866 
4867 /*************************************************************//**
4868 Performs an update of a record on a page of a tree. It is assumed
4869 that mtr holds an x-latch on the tree and on the cursor page. If the
4870 update is made on the leaf level, to avoid deadlocks, mtr must also
4871 own x-latches to brothers of page, if those brothers exist. We assume
4872 here that the ordering fields of the record do not change.
4873 @return DB_SUCCESS or error code */
4874 dberr_t
btr_cur_pessimistic_update(ulint flags,btr_cur_t * cursor,rec_offs ** offsets,mem_heap_t ** offsets_heap,mem_heap_t * entry_heap,big_rec_t ** big_rec,upd_t * update,ulint cmpl_info,que_thr_t * thr,trx_id_t trx_id,mtr_t * mtr)4875 btr_cur_pessimistic_update(
4876 /*=======================*/
4877 	ulint		flags,	/*!< in: undo logging, locking, and rollback
4878 				flags */
4879 	btr_cur_t*	cursor,	/*!< in/out: cursor on the record to update;
4880 				cursor may become invalid if *big_rec == NULL
4881 				|| !(flags & BTR_KEEP_POS_FLAG) */
4882 	rec_offs**	offsets,/*!< out: offsets on cursor->page_cur.rec */
4883 	mem_heap_t**	offsets_heap,
4884 				/*!< in/out: pointer to memory heap
4885 				that can be emptied */
4886 	mem_heap_t*	entry_heap,
4887 				/*!< in/out: memory heap for allocating
4888 				big_rec and the index tuple */
4889 	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
4890 				be stored externally by the caller */
4891 	upd_t*		update,	/*!< in/out: update vector; this is allowed to
4892 				also contain trx id and roll ptr fields.
4893 				Non-updated columns that are moved offpage will
4894 				be appended to this. */
4895 	ulint		cmpl_info,/*!< in: compiler info on secondary index
4896 				updates */
4897 	que_thr_t*	thr,	/*!< in: query thread */
4898 	trx_id_t	trx_id,	/*!< in: transaction id */
4899 	mtr_t*		mtr)	/*!< in/out: mini-transaction; must be
4900 				committed before latching any further pages */
4901 {
4902 	big_rec_t*	big_rec_vec	= NULL;
4903 	big_rec_t*	dummy_big_rec;
4904 	dict_index_t*	index;
4905 	buf_block_t*	block;
4906 	page_zip_des_t*	page_zip;
4907 	rec_t*		rec;
4908 	page_cur_t*	page_cursor;
4909 	dberr_t		err;
4910 	dberr_t		optim_err;
4911 	roll_ptr_t	roll_ptr;
4912 	bool		was_first;
4913 	uint32_t	n_reserved	= 0;
4914 
4915 	*offsets = NULL;
4916 	*big_rec = NULL;
4917 
4918 	block = btr_cur_get_block(cursor);
4919 	page_zip = buf_block_get_page_zip(block);
4920 	index = cursor->index;
4921 
4922 	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK |
4923 					 MTR_MEMO_SX_LOCK));
4924 	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
4925 #ifdef UNIV_ZIP_DEBUG
4926 	ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index));
4927 #endif /* UNIV_ZIP_DEBUG */
4928 	ut_ad(!page_zip || !index->table->is_temporary());
4929 	/* The insert buffer tree should never be updated in place. */
4930 	ut_ad(!dict_index_is_ibuf(index));
4931 	ut_ad(trx_id > 0 || (flags & BTR_KEEP_SYS_FLAG)
4932 	      || index->table->is_temporary());
4933 	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4934 	      || dict_index_is_clust(index));
4935 	ut_ad(thr_get_trx(thr)->id == trx_id
4936 	      || (flags & ulint(~BTR_KEEP_POS_FLAG))
4937 	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4938 		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4939 
4940 	err = optim_err = btr_cur_optimistic_update(
4941 		flags | BTR_KEEP_IBUF_BITMAP,
4942 		cursor, offsets, offsets_heap, update,
4943 		cmpl_info, thr, trx_id, mtr);
4944 
4945 	switch (err) {
4946 	case DB_ZIP_OVERFLOW:
4947 	case DB_UNDERFLOW:
4948 	case DB_OVERFLOW:
4949 		break;
4950 	default:
4951 	err_exit:
4952 		/* We suppressed this with BTR_KEEP_IBUF_BITMAP.
4953 		For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
4954 		already reset by btr_cur_update_alloc_zip() if the
4955 		page was recompressed. */
4956 		if (page_zip
4957 		    && optim_err != DB_ZIP_OVERFLOW
4958 		    && !dict_index_is_clust(index)
4959 		    && page_is_leaf(block->frame)) {
4960 			ut_ad(!index->table->is_temporary());
4961 			ibuf_update_free_bits_zip(block, mtr);
4962 		}
4963 
4964 		if (big_rec_vec != NULL) {
4965 			dtuple_big_rec_free(big_rec_vec);
4966 		}
4967 
4968 		return(err);
4969 	}
4970 
4971 	rec = btr_cur_get_rec(cursor);
4972 	ut_ad(rec_offs_validate(rec, index, *offsets));
4973 
4974 	dtuple_t* new_entry;
4975 
4976 	const bool is_metadata = rec_is_metadata(rec, *index);
4977 
4978 	if (UNIV_UNLIKELY(is_metadata)) {
4979 		ut_ad(update->is_metadata());
4980 		ut_ad(flags & BTR_NO_LOCKING_FLAG);
4981 		ut_ad(index->is_instant());
4982 		new_entry = row_metadata_to_tuple(
4983 			rec, index, *offsets, entry_heap,
4984 			update->info_bits, !thr_get_trx(thr)->in_rollback);
4985 		ut_ad(new_entry->n_fields
4986 		      == ulint(index->n_fields)
4987 		      + update->is_alter_metadata());
4988 	} else {
4989 		new_entry = row_rec_to_index_entry(rec, index, *offsets,
4990 						   entry_heap);
4991 	}
4992 
4993 	/* The page containing the clustered index record
4994 	corresponding to new_entry is latched in mtr.  If the
4995 	clustered index record is delete-marked, then its externally
4996 	stored fields cannot have been purged yet, because then the
4997 	purge would also have removed the clustered index record
4998 	itself.  Thus the following call is safe. */
4999 	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
5000 						     entry_heap);
5001 	btr_cur_trim(new_entry, index, update, thr);
5002 
5003 	/* We have to set appropriate extern storage bits in the new
5004 	record to be inserted: we have to remember which fields were such */
5005 
5006 	ut_ad(!page_is_comp(block->frame) || !rec_get_node_ptr_flag(rec));
5007 	ut_ad(rec_offs_validate(rec, index, *offsets));
5008 
5009 	if ((flags & BTR_NO_UNDO_LOG_FLAG)
5010 	    && rec_offs_any_extern(*offsets)) {
5011 		/* We are in a transaction rollback undoing a row
5012 		update: we must free possible externally stored fields
5013 		which got new values in the update, if they are not
5014 		inherited values. They can be inherited if we have
5015 		updated the primary key to another value, and then
5016 		update it back again. */
5017 
5018 		ut_ad(big_rec_vec == NULL);
5019 		ut_ad(dict_index_is_clust(index));
5020 		ut_ad(thr_get_trx(thr)->in_rollback);
5021 
5022 		DEBUG_SYNC_C("blob_rollback_middle");
5023 
5024 		btr_rec_free_updated_extern_fields(
5025 			index, rec, block, *offsets, update, true, mtr);
5026 	}
5027 
5028 	ulint n_ext = index->is_primary() ? dtuple_get_n_ext(new_entry) : 0;
5029 
5030 	if (page_zip_rec_needs_ext(
5031 		    rec_get_converted_size(index, new_entry, n_ext),
5032 		    page_is_comp(block->frame),
5033 		    dict_index_get_n_fields(index),
5034 		    block->zip_size())
5035 	    || (UNIV_UNLIKELY(update->is_alter_metadata())
5036 		&& !dfield_is_ext(dtuple_get_nth_field(
5037 					  new_entry,
5038 					  index->first_user_field())))) {
5039 		big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext);
5040 		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
5041 
5042 			/* We cannot goto return_after_reservations,
5043 			because we may need to update the
5044 			IBUF_BITMAP_FREE bits, which was suppressed by
5045 			BTR_KEEP_IBUF_BITMAP. */
5046 #ifdef UNIV_ZIP_DEBUG
5047 			ut_a(!page_zip
5048 			     || page_zip_validate(page_zip, block->frame,
5049 						  index));
5050 #endif /* UNIV_ZIP_DEBUG */
5051 			index->table->space->release_free_extents(n_reserved);
5052 			err = DB_TOO_BIG_RECORD;
5053 			goto err_exit;
5054 		}
5055 
5056 		ut_ad(page_is_leaf(block->frame));
5057 		ut_ad(dict_index_is_clust(index));
5058 		ut_ad(flags & BTR_KEEP_POS_FLAG);
5059 	}
5060 
5061 	/* Do lock checking and undo logging */
5062 	err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
5063 					update, cmpl_info,
5064 					thr, mtr, &roll_ptr);
5065 	if (err != DB_SUCCESS) {
5066 		goto err_exit;
5067 	}
5068 
5069 	if (optim_err == DB_OVERFLOW) {
5070 
5071 		/* First reserve enough free space for the file segments
5072 		of the index tree, so that the update will not fail because
5073 		of lack of space */
5074 
5075 		uint32_t n_extents = uint32_t(cursor->tree_height / 16 + 3);
5076 
5077 		if (!fsp_reserve_free_extents(
5078 		            &n_reserved, index->table->space, n_extents,
5079 		            flags & BTR_NO_UNDO_LOG_FLAG
5080 		            ? FSP_CLEANING : FSP_NORMAL,
5081 		            mtr)) {
5082 			err = DB_OUT_OF_FILE_SPACE;
5083 			goto err_exit;
5084 		}
5085 	}
5086 
5087 	if (!(flags & BTR_KEEP_SYS_FLAG)) {
5088 		btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
5089 	}
5090 
5091 	const ulint max_ins_size = page_zip
5092 		? 0 : page_get_max_insert_size_after_reorganize(block->frame,
5093 								1);
5094 
5095 	if (UNIV_UNLIKELY(is_metadata)) {
5096 		ut_ad(new_entry->is_metadata());
5097 		ut_ad(index->is_instant());
5098 		/* This can be innobase_add_instant_try() performing a
5099 		subsequent instant ALTER TABLE, or its rollback by
5100 		row_undo_mod_clust_low(). */
5101 		ut_ad(flags & BTR_NO_LOCKING_FLAG);
5102 	} else {
5103 		btr_search_update_hash_on_delete(cursor);
5104 
5105 		/* Store state of explicit locks on rec on the page
5106 		infimum record, before deleting rec. The page infimum
5107 		acts as a dummy carrier of the locks, taking care also
5108 		of lock releases, before we can move the locks back on
5109 		the actual record. There is a special case: if we are
5110 		inserting on the root page and the insert causes a
5111 		call of btr_root_raise_and_insert. Therefore we cannot
5112 		in the lock system delete the lock structs set on the
5113 		root page even if the root page carries just node
5114 		pointers. */
5115 		if (!dict_table_is_locking_disabled(index->table)) {
5116 			lock_rec_store_on_page_infimum(block, rec);
5117 		}
5118 	}
5119 
5120 #ifdef UNIV_ZIP_DEBUG
5121 	ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index));
5122 #endif /* UNIV_ZIP_DEBUG */
5123 	page_cursor = btr_cur_get_page_cur(cursor);
5124 
5125 	page_cur_delete_rec(page_cursor, index, *offsets, mtr);
5126 
5127 	page_cur_move_to_prev(page_cursor);
5128 
5129 	rec = btr_cur_insert_if_possible(cursor, new_entry,
5130 					 offsets, offsets_heap, n_ext, mtr);
5131 
5132 	if (rec) {
5133 		page_cursor->rec = rec;
5134 
5135 		if (UNIV_UNLIKELY(is_metadata)) {
5136 			/* We must empty the PAGE_FREE list, because if this
5137 			was a rollback, the shortened metadata record
5138 			would have too many fields, and we would be unable to
5139 			know the size of the freed record. */
5140 			btr_page_reorganize(page_cursor, index, mtr);
5141 			rec = page_cursor->rec;
5142 			rec_offs_make_valid(rec, index, true, *offsets);
5143 			if (page_cursor->block->page.id().page_no()
5144 			    == index->page) {
5145 				btr_set_instant(page_cursor->block, *index,
5146 						mtr);
5147 			}
5148 		} else if (!dict_table_is_locking_disabled(index->table)) {
5149 			lock_rec_restore_from_page_infimum(
5150 				btr_cur_get_block(cursor), rec, block);
5151 		}
5152 
5153 		if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))
5154 		    || rec_is_alter_metadata(rec, *index)) {
5155 			/* The new inserted record owns its possible externally
5156 			stored fields */
5157 			btr_cur_unmark_extern_fields(btr_cur_get_block(cursor),
5158 						     rec, index, *offsets, mtr);
5159 		} else {
5160 			/* In delete-marked records, DB_TRX_ID must
5161 			always refer to an existing undo log record. */
5162 			ut_ad(row_get_rec_trx_id(rec, index, *offsets));
5163 		}
5164 
5165 		bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
5166 		ut_ad(!adjust || page_is_leaf(block->frame));
5167 
5168 		if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
5169 			if (adjust) {
5170 				rec_offs_make_valid(page_cursor->rec, index,
5171 						    true, *offsets);
5172 			}
5173 		} else if (!dict_index_is_clust(index)
5174 			   && page_is_leaf(block->frame)) {
5175 			/* Update the free bits in the insert buffer.
5176 			This is the same block which was skipped by
5177 			BTR_KEEP_IBUF_BITMAP. */
5178 			if (page_zip) {
5179 				ut_ad(!index->table->is_temporary());
5180 				ibuf_update_free_bits_zip(block, mtr);
5181 			} else if (!index->table->is_temporary()) {
5182 				ibuf_update_free_bits_low(block, max_ins_size,
5183 							  mtr);
5184 			}
5185 		}
5186 
5187 		if (!srv_read_only_mode
5188 		    && !big_rec_vec
5189 		    && page_is_leaf(block->frame)
5190 		    && !dict_index_is_online_ddl(index)) {
5191 
5192 			mtr_memo_release(mtr, dict_index_get_lock(index),
5193 					 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
5194 
5195 			/* NOTE: We cannot release root block latch here, because it
5196 			has segment header and already modified in most of cases.*/
5197 		}
5198 
5199 		err = DB_SUCCESS;
5200 		goto return_after_reservations;
5201 	} else {
5202 		/* If the page is compressed and it initially
5203 		compresses very well, and there is a subsequent insert
5204 		of a badly-compressing record, it is possible for
5205 		btr_cur_optimistic_update() to return DB_UNDERFLOW and
5206 		btr_cur_insert_if_possible() to return FALSE. */
5207 		ut_a(page_zip || optim_err != DB_UNDERFLOW);
5208 
5209 		/* Out of space: reset the free bits.
5210 		This is the same block which was skipped by
5211 		BTR_KEEP_IBUF_BITMAP. */
5212 		if (!dict_index_is_clust(index)
5213 		    && !index->table->is_temporary()
5214 		    && page_is_leaf(block->frame)) {
5215 			ibuf_reset_free_bits(block);
5216 		}
5217 	}
5218 
5219 	if (big_rec_vec != NULL) {
5220 		ut_ad(page_is_leaf(block->frame));
5221 		ut_ad(dict_index_is_clust(index));
5222 		ut_ad(flags & BTR_KEEP_POS_FLAG);
5223 
5224 		/* btr_page_split_and_insert() in
5225 		btr_cur_pessimistic_insert() invokes
5226 		mtr_memo_release(mtr, index->lock, MTR_MEMO_SX_LOCK).
5227 		We must keep the index->lock when we created a
5228 		big_rec, so that row_upd_clust_rec() can store the
5229 		big_rec in the same mini-transaction. */
5230 
5231 		ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
5232 						 | MTR_MEMO_SX_LOCK));
5233 		mtr_sx_lock_index(index, mtr);
5234 	}
5235 
5236 	/* Was the record to be updated positioned as the first user
5237 	record on its page? */
5238 	was_first = page_cur_is_before_first(page_cursor);
5239 
5240 	/* Lock checks and undo logging were already performed by
5241 	btr_cur_upd_lock_and_undo(). We do not try
5242 	btr_cur_optimistic_insert() because
5243 	btr_cur_insert_if_possible() already failed above. */
5244 
5245 	err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
5246 					 | BTR_NO_LOCKING_FLAG
5247 					 | BTR_KEEP_SYS_FLAG,
5248 					 cursor, offsets, offsets_heap,
5249 					 new_entry, &rec,
5250 					 &dummy_big_rec, n_ext, NULL, mtr);
5251 	ut_a(rec);
5252 	ut_a(err == DB_SUCCESS);
5253 	ut_a(dummy_big_rec == NULL);
5254 	ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
5255 	page_cursor->rec = rec;
5256 
5257 	/* Multiple transactions cannot simultaneously operate on the
5258 	same temp-table in parallel.
5259 	max_trx_id is ignored for temp tables because it not required
5260 	for MVCC. */
5261 	if (dict_index_is_sec_or_ibuf(index)
5262 	    && !index->table->is_temporary()) {
5263 		/* Update PAGE_MAX_TRX_ID in the index page header.
5264 		It was not updated by btr_cur_pessimistic_insert()
5265 		because of BTR_NO_LOCKING_FLAG. */
5266 		page_update_max_trx_id(btr_cur_get_block(cursor),
5267 				       btr_cur_get_page_zip(cursor),
5268 				       trx_id, mtr);
5269 	}
5270 
5271 	if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
5272 		/* The new inserted record owns its possible externally
5273 		stored fields */
5274 #ifdef UNIV_ZIP_DEBUG
5275 		ut_a(!page_zip || page_zip_validate(page_zip, block->frame,
5276 						    index));
5277 #endif /* UNIV_ZIP_DEBUG */
5278 		btr_cur_unmark_extern_fields(btr_cur_get_block(cursor), rec,
5279 					     index, *offsets, mtr);
5280 	} else {
5281 		/* In delete-marked records, DB_TRX_ID must
5282 		always refer to an existing undo log record. */
5283 		ut_ad(row_get_rec_trx_id(rec, index, *offsets));
5284 	}
5285 
5286 	if (UNIV_UNLIKELY(is_metadata)) {
5287 		/* We must empty the PAGE_FREE list, because if this
5288 		was a rollback, the shortened metadata record
5289 		would have too many fields, and we would be unable to
5290 		know the size of the freed record. */
5291 		btr_page_reorganize(page_cursor, index, mtr);
5292 		rec = page_cursor->rec;
5293 	} else if (!dict_table_is_locking_disabled(index->table)) {
5294 		lock_rec_restore_from_page_infimum(
5295 			btr_cur_get_block(cursor), rec, block);
5296 	}
5297 
5298 	/* If necessary, restore also the correct lock state for a new,
5299 	preceding supremum record created in a page split. While the old
5300 	record was nonexistent, the supremum might have inherited its locks
5301 	from a wrong record. */
5302 
5303 	if (!was_first && !dict_table_is_locking_disabled(index->table)) {
5304 		btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
5305 						  rec, mtr);
5306 	}
5307 
5308 return_after_reservations:
5309 #ifdef UNIV_ZIP_DEBUG
5310 	ut_a(!page_zip || page_zip_validate(btr_cur_get_page_zip(cursor),
5311 					    btr_cur_get_page(cursor), index));
5312 #endif /* UNIV_ZIP_DEBUG */
5313 
5314 	index->table->space->release_free_extents(n_reserved);
5315 	*big_rec = big_rec_vec;
5316 	return(err);
5317 }
5318 
5319 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
5320 
5321 /** Modify the delete-mark flag of a record.
5322 @tparam         flag    the value of the delete-mark flag
5323 @param[in,out]  block   buffer block
5324 @param[in,out]  rec     record on a physical index page
5325 @param[in,out]  mtr     mini-transaction  */
5326 template<bool flag>
btr_rec_set_deleted(buf_block_t * block,rec_t * rec,mtr_t * mtr)5327 void btr_rec_set_deleted(buf_block_t *block, rec_t *rec, mtr_t *mtr)
5328 {
5329   if (page_rec_is_comp(rec))
5330   {
5331     byte *b= &rec[-REC_NEW_INFO_BITS];
5332     const byte v= flag
5333       ? (*b | REC_INFO_DELETED_FLAG)
5334       : (*b & byte(~REC_INFO_DELETED_FLAG));
5335     if (*b == v);
5336     else if (UNIV_LIKELY_NULL(block->page.zip.data))
5337     {
5338       *b= v;
5339       page_zip_rec_set_deleted(block, rec, flag, mtr);
5340     }
5341     else
5342       mtr->write<1>(*block, b, v);
5343   }
5344   else
5345   {
5346     ut_ad(!block->page.zip.data);
5347     byte *b= &rec[-REC_OLD_INFO_BITS];
5348     const byte v = flag
5349       ? (*b | REC_INFO_DELETED_FLAG)
5350       : (*b & byte(~REC_INFO_DELETED_FLAG));
5351     mtr->write<1,mtr_t::MAYBE_NOP>(*block, b, v);
5352   }
5353 }
5354 
5355 template void btr_rec_set_deleted<false>(buf_block_t *, rec_t *, mtr_t *);
5356 template void btr_rec_set_deleted<true>(buf_block_t *, rec_t *, mtr_t *);
5357 
5358 /***********************************************************//**
5359 Marks a clustered index record deleted. Writes an undo log record to
5360 undo log on this delete marking. Writes in the trx id field the id
5361 of the deleting transaction, and in the roll ptr field pointer to the
5362 undo log record created.
5363 @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
5364 dberr_t
btr_cur_del_mark_set_clust_rec(buf_block_t * block,rec_t * rec,dict_index_t * index,const rec_offs * offsets,que_thr_t * thr,const dtuple_t * entry,mtr_t * mtr)5365 btr_cur_del_mark_set_clust_rec(
5366 /*===========================*/
5367 	buf_block_t*	block,	/*!< in/out: buffer block of the record */
5368 	rec_t*		rec,	/*!< in/out: record */
5369 	dict_index_t*	index,	/*!< in: clustered index of the record */
5370 	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec) */
5371 	que_thr_t*	thr,	/*!< in: query thread */
5372 	const dtuple_t*	entry,	/*!< in: dtuple for the deleting record, also
5373 				contains the virtual cols if there are any */
5374 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
5375 {
5376 	roll_ptr_t	roll_ptr;
5377 	dberr_t		err;
5378 	trx_t*		trx;
5379 
5380 	ut_ad(dict_index_is_clust(index));
5381 	ut_ad(rec_offs_validate(rec, index, offsets));
5382 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
5383 	ut_ad(buf_block_get_frame(block) == page_align(rec));
5384 	ut_ad(page_rec_is_leaf(rec));
5385 	ut_ad(mtr->is_named_space(index->table->space));
5386 
5387 	if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
5388 		/* We may already have delete-marked this record
5389 		when executing an ON DELETE CASCADE operation. */
5390 		ut_ad(row_get_rec_trx_id(rec, index, offsets)
5391 		      == thr_get_trx(thr)->id);
5392 		return(DB_SUCCESS);
5393 	}
5394 
5395 	err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
5396 						   rec, index, offsets, thr);
5397 
5398 	if (err != DB_SUCCESS) {
5399 
5400 		return(err);
5401 	}
5402 
5403 	err = trx_undo_report_row_operation(thr, index,
5404 					    entry, NULL, 0, rec, offsets,
5405 					    &roll_ptr);
5406 	if (err != DB_SUCCESS) {
5407 
5408 		return(err);
5409 	}
5410 
5411 	/* The search latch is not needed here, because
5412 	the adaptive hash index does not depend on the delete-mark
5413 	and the delete-mark is being updated in place. */
5414 
5415 	btr_rec_set_deleted<true>(block, rec, mtr);
5416 
5417 	trx = thr_get_trx(thr);
5418 
5419 	DBUG_LOG("ib_cur",
5420 		 "delete-mark clust " << index->table->name
5421 		 << " (" << index->id << ") by "
5422 		 << ib::hex(trx_get_id_for_print(trx)) << ": "
5423 		 << rec_printer(rec, offsets).str());
5424 
5425 	if (dict_index_is_online_ddl(index)) {
5426 		row_log_table_delete(rec, index, offsets, NULL);
5427 	}
5428 
5429 	btr_cur_upd_rec_sys(block, rec, index, offsets, trx, roll_ptr, mtr);
5430 	return(err);
5431 }
5432 
5433 /*==================== B-TREE RECORD REMOVE =========================*/
5434 
5435 /*************************************************************//**
5436 Tries to compress a page of the tree if it seems useful. It is assumed
5437 that mtr holds an x-latch on the tree and on the cursor page. To avoid
5438 deadlocks, mtr must also own x-latches to brothers of page, if those
5439 brothers exist. NOTE: it is assumed that the caller has reserved enough
5440 free extents so that the compression will always succeed if done!
5441 @return TRUE if compression occurred */
5442 ibool
btr_cur_compress_if_useful(btr_cur_t * cursor,ibool adjust,mtr_t * mtr)5443 btr_cur_compress_if_useful(
5444 /*=======================*/
5445 	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to compress;
5446 				cursor does not stay valid if !adjust and
5447 				compression occurs */
5448 	ibool		adjust,	/*!< in: TRUE if should adjust the
5449 				cursor position even if compression occurs */
5450 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
5451 {
5452 	ut_ad(mtr->memo_contains_flagged(&cursor->index->lock,
5453 					 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
5454 	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
5455 					 MTR_MEMO_PAGE_X_FIX));
5456 
5457 	if (cursor->index->is_spatial()) {
5458 		const trx_t*	trx = cursor->rtr_info->thr
5459 			? thr_get_trx(cursor->rtr_info->thr)
5460 			: NULL;
5461 		const buf_block_t* block = btr_cur_get_block(cursor);
5462 
5463 		/* Check whether page lock prevents the compression */
5464 		if (!lock_test_prdt_page_lock(trx, block->page.id())) {
5465 			return(false);
5466 		}
5467 	}
5468 
5469 	return(btr_cur_compress_recommendation(cursor, mtr)
5470 	       && btr_compress(cursor, adjust, mtr));
5471 }
5472 
5473 /*******************************************************//**
5474 Removes the record on which the tree cursor is positioned on a leaf page.
5475 It is assumed that the mtr has an x-latch on the page where the cursor is
5476 positioned, but no latch on the whole tree.
5477 @return TRUE if success, i.e., the page did not become too empty */
5478 ibool
btr_cur_optimistic_delete_func(btr_cur_t * cursor,ulint flags,mtr_t * mtr)5479 btr_cur_optimistic_delete_func(
5480 /*===========================*/
5481 	btr_cur_t*	cursor,	/*!< in: cursor on leaf page, on the record to
5482 				delete; cursor stays valid: if deletion
5483 				succeeds, on function exit it points to the
5484 				successor of the deleted record */
5485 #ifdef UNIV_DEBUG
5486 	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
5487 #endif /* UNIV_DEBUG */
5488 	mtr_t*		mtr)	/*!< in: mtr; if this function returns
5489 				TRUE on a leaf page of a secondary
5490 				index, the mtr must be committed
5491 				before latching any further pages */
5492 {
5493 	buf_block_t*	block;
5494 	rec_t*		rec;
5495 	mem_heap_t*	heap		= NULL;
5496 	rec_offs	offsets_[REC_OFFS_NORMAL_SIZE];
5497 	rec_offs*	offsets		= offsets_;
5498 	rec_offs_init(offsets_);
5499 
5500 	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
5501 	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(cursor),
5502 					 MTR_MEMO_PAGE_X_FIX));
5503 	ut_ad(mtr->is_named_space(cursor->index->table->space));
5504 	ut_ad(!cursor->index->is_dummy);
5505 
5506 	/* This is intended only for leaf page deletions */
5507 
5508 	block = btr_cur_get_block(cursor);
5509 
5510 	ut_ad(block->page.id().space() == cursor->index->table->space->id);
5511 	ut_ad(page_is_leaf(buf_block_get_frame(block)));
5512 	ut_ad(!dict_index_is_online_ddl(cursor->index)
5513 	      || dict_index_is_clust(cursor->index)
5514 	      || (flags & BTR_CREATE_FLAG));
5515 
5516 	rec = btr_cur_get_rec(cursor);
5517 
5518 	offsets = rec_get_offsets(rec, cursor->index, offsets,
5519 				  cursor->index->n_core_fields,
5520 				  ULINT_UNDEFINED, &heap);
5521 
5522 	const ibool no_compress_needed = !rec_offs_any_extern(offsets)
5523 		&& btr_cur_can_delete_without_compress(
5524 			cursor, rec_offs_size(offsets), mtr);
5525 
5526 	if (!no_compress_needed) {
5527 		/* prefetch siblings of the leaf for the pessimistic
5528 		operation. */
5529 		btr_cur_prefetch_siblings(block, cursor->index);
5530 		goto func_exit;
5531 	}
5532 
5533 	if (UNIV_UNLIKELY(block->page.id().page_no() == cursor->index->page
5534 			  && page_get_n_recs(block->frame) == 1
5535 			  + (cursor->index->is_instant()
5536 			     && !rec_is_metadata(rec, *cursor->index))
5537 			  && !cursor->index->must_avoid_clear_instant_add())) {
5538 		/* The whole index (and table) becomes logically empty.
5539 		Empty the whole page. That is, if we are deleting the
5540 		only user record, also delete the metadata record
5541 		if one exists for instant ADD COLUMN (not generic ALTER TABLE).
5542 		If we are deleting the metadata record and the
5543 		table becomes empty, clean up the whole page. */
5544 		dict_index_t* index = cursor->index;
5545 		const rec_t* first_rec = page_rec_get_next_const(
5546 			page_get_infimum_rec(block->frame));
5547 		ut_ad(!index->is_instant()
5548 		      || rec_is_metadata(first_rec, *index));
5549 		const bool is_metadata = rec_is_metadata(rec, *index);
5550 		/* We can remove the metadata when rolling back an
5551 		instant ALTER TABLE operation, or when deleting the
5552 		last user record on the page such that only metadata for
5553 		instant ADD COLUMN (not generic ALTER TABLE) remains. */
5554 		const bool empty_table = is_metadata
5555 			|| !index->is_instant()
5556 			|| (first_rec != rec
5557 			    && rec_is_add_metadata(first_rec, *index));
5558 		if (UNIV_LIKELY(empty_table)) {
5559 			if (UNIV_LIKELY(!is_metadata)) {
5560 				lock_update_delete(block, rec);
5561 			}
5562 			btr_page_empty(block, buf_block_get_page_zip(block),
5563 				       index, 0, mtr);
5564 			if (index->is_instant()) {
5565 				/* MDEV-17383: free metadata BLOBs! */
5566 				index->clear_instant_alter();
5567 			}
5568 			page_cur_set_after_last(block,
5569 						btr_cur_get_page_cur(cursor));
5570 			goto func_exit;
5571 		}
5572 	}
5573 
5574 	{
5575 		page_t*		page	= buf_block_get_frame(block);
5576 		page_zip_des_t*	page_zip= buf_block_get_page_zip(block);
5577 
5578 		if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec))
5579 				  & REC_INFO_MIN_REC_FLAG)) {
5580 			/* This should be rolling back instant ADD COLUMN.
5581 			If this is a recovered transaction, then
5582 			index->is_instant() will hold until the
5583 			insert into SYS_COLUMNS is rolled back. */
5584 			ut_ad(cursor->index->table->supports_instant());
5585 			ut_ad(cursor->index->is_primary());
5586 			ut_ad(!page_zip);
5587 			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5588 					    cursor->index, offsets, mtr);
5589 			/* We must empty the PAGE_FREE list, because
5590 			after rollback, this deleted metadata record
5591 			would have too many fields, and we would be
5592 			unable to know the size of the freed record. */
5593 			btr_page_reorganize(btr_cur_get_page_cur(cursor),
5594 					    cursor->index, mtr);
5595 			goto func_exit;
5596 		} else {
5597 			lock_update_delete(block, rec);
5598 
5599 			btr_search_update_hash_on_delete(cursor);
5600 		}
5601 
5602 		if (page_zip) {
5603 #ifdef UNIV_ZIP_DEBUG
5604 			ut_a(page_zip_validate(page_zip, page, cursor->index));
5605 #endif /* UNIV_ZIP_DEBUG */
5606 			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5607 					    cursor->index, offsets, mtr);
5608 #ifdef UNIV_ZIP_DEBUG
5609 			ut_a(page_zip_validate(page_zip, page, cursor->index));
5610 #endif /* UNIV_ZIP_DEBUG */
5611 
5612 			/* On compressed pages, the IBUF_BITMAP_FREE
5613 			space is not affected by deleting (purging)
5614 			records, because it is defined as the minimum
5615 			of space available *without* reorganize, and
5616 			space available in the modification log. */
5617 		} else {
5618 			const ulint	max_ins
5619 				= page_get_max_insert_size_after_reorganize(
5620 					page, 1);
5621 
5622 			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5623 					    cursor->index, offsets, mtr);
5624 
5625 			/* The change buffer does not handle inserts
5626 			into non-leaf pages, into clustered indexes,
5627 			or into the change buffer. */
5628 			if (!dict_index_is_clust(cursor->index)
5629 			    && !cursor->index->table->is_temporary()
5630 			    && !dict_index_is_ibuf(cursor->index)) {
5631 				ibuf_update_free_bits_low(block, max_ins, mtr);
5632 			}
5633 		}
5634 	}
5635 
5636 func_exit:
5637 	if (UNIV_LIKELY_NULL(heap)) {
5638 		mem_heap_free(heap);
5639 	}
5640 
5641 	return(no_compress_needed);
5642 }
5643 
5644 /*************************************************************//**
5645 Removes the record on which the tree cursor is positioned. Tries
5646 to compress the page if its fillfactor drops below a threshold
5647 or if it is the only page on the level. It is assumed that mtr holds
5648 an x-latch on the tree and on the cursor page. To avoid deadlocks,
5649 mtr must also own x-latches to brothers of page, if those brothers
5650 exist.
5651 @return TRUE if compression occurred and FALSE if not or something
5652 wrong. */
5653 ibool
btr_cur_pessimistic_delete(dberr_t * err,ibool has_reserved_extents,btr_cur_t * cursor,ulint flags,bool rollback,mtr_t * mtr)5654 btr_cur_pessimistic_delete(
5655 /*=======================*/
5656 	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
5657 				the latter may occur because we may have
5658 				to update node pointers on upper levels,
5659 				and in the case of variable length keys
5660 				these may actually grow in size */
5661 	ibool		has_reserved_extents, /*!< in: TRUE if the
5662 				caller has already reserved enough free
5663 				extents so that he knows that the operation
5664 				will succeed */
5665 	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
5666 				if compression does not occur, the cursor
5667 				stays valid: it points to successor of
5668 				deleted record on function exit */
5669 	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
5670 	bool		rollback,/*!< in: performing rollback? */
5671 	mtr_t*		mtr)	/*!< in: mtr */
5672 {
5673 	buf_block_t*	block;
5674 	page_t*		page;
5675 	page_zip_des_t*	page_zip;
5676 	dict_index_t*	index;
5677 	rec_t*		rec;
5678 	uint32_t	n_reserved	= 0;
5679 	bool		success;
5680 	ibool		ret		= FALSE;
5681 	mem_heap_t*	heap;
5682 	rec_offs*	offsets;
5683 #ifdef UNIV_DEBUG
5684 	bool		parent_latched	= false;
5685 #endif /* UNIV_DEBUG */
5686 
5687 	block = btr_cur_get_block(cursor);
5688 	page = buf_block_get_frame(block);
5689 	index = btr_cur_get_index(cursor);
5690 
5691 	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
5692 	ut_ad(!dict_index_is_online_ddl(index)
5693 	      || dict_index_is_clust(index)
5694 	      || (flags & BTR_CREATE_FLAG));
5695 	ut_ad(mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
5696 					 | MTR_MEMO_SX_LOCK));
5697 	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
5698 	ut_ad(mtr->is_named_space(index->table->space));
5699 	ut_ad(!index->is_dummy);
5700 	ut_ad(block->page.id().space() == index->table->space->id);
5701 
5702 	if (!has_reserved_extents) {
5703 		/* First reserve enough free space for the file segments
5704 		of the index tree, so that the node pointer updates will
5705 		not fail because of lack of space */
5706 
5707 		uint32_t n_extents = uint32_t(cursor->tree_height / 32 + 1);
5708 
5709 		success = fsp_reserve_free_extents(&n_reserved,
5710 						   index->table->space,
5711 						   n_extents,
5712 						   FSP_CLEANING, mtr);
5713 		if (!success) {
5714 			*err = DB_OUT_OF_FILE_SPACE;
5715 
5716 			return(FALSE);
5717 		}
5718 	}
5719 
5720 	heap = mem_heap_create(1024);
5721 	rec = btr_cur_get_rec(cursor);
5722 	page_zip = buf_block_get_page_zip(block);
5723 #ifdef UNIV_ZIP_DEBUG
5724 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5725 #endif /* UNIV_ZIP_DEBUG */
5726 
5727 	offsets = rec_get_offsets(rec, index, NULL, page_is_leaf(page)
5728 				  ? index->n_core_fields : 0,
5729 				  ULINT_UNDEFINED, &heap);
5730 
5731 	if (rec_offs_any_extern(offsets)) {
5732 		btr_rec_free_externally_stored_fields(index,
5733 						      rec, offsets, block,
5734 						      rollback, mtr);
5735 #ifdef UNIV_ZIP_DEBUG
5736 		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5737 #endif /* UNIV_ZIP_DEBUG */
5738 	}
5739 
5740 	rec_t* next_rec = NULL;
5741 	bool min_mark_next_rec = false;
5742 
5743 	if (page_is_leaf(page)) {
5744 		const bool is_metadata = rec_is_metadata(
5745 			rec, page_rec_is_comp(rec));
5746 		if (UNIV_UNLIKELY(is_metadata)) {
5747 			/* This should be rolling back instant ALTER TABLE.
5748 			If this is a recovered transaction, then
5749 			index->is_instant() will hold until the
5750 			insert into SYS_COLUMNS is rolled back. */
5751 			ut_ad(rollback);
5752 			ut_ad(index->table->supports_instant());
5753 			ut_ad(index->is_primary());
5754 		} else if (flags == 0) {
5755 			lock_update_delete(block, rec);
5756 		}
5757 
5758 		if (block->page.id().page_no() != index->page) {
5759 			if (page_get_n_recs(page) < 2) {
5760 				goto discard_page;
5761 			}
5762 		} else if (page_get_n_recs(page) == 1
5763 			   + (index->is_instant() && !is_metadata)
5764 			   && !index->must_avoid_clear_instant_add()) {
5765 			/* The whole index (and table) becomes logically empty.
5766 			Empty the whole page. That is, if we are deleting the
5767 			only user record, also delete the metadata record
5768 			if one exists for instant ADD COLUMN
5769 			(not generic ALTER TABLE).
5770 			If we are deleting the metadata record
5771 			(in the rollback of instant ALTER TABLE) and the
5772 			table becomes empty, clean up the whole page. */
5773 
5774 			const rec_t* first_rec = page_rec_get_next_const(
5775 				page_get_infimum_rec(page));
5776 			ut_ad(!index->is_instant()
5777 			      || rec_is_metadata(first_rec, *index));
5778 			if (is_metadata || !index->is_instant()
5779 			    || (first_rec != rec
5780 				&& rec_is_add_metadata(first_rec, *index))) {
5781 				btr_page_empty(block, page_zip, index, 0, mtr);
5782 				if (index->is_instant()) {
5783 					/* MDEV-17383: free metadata BLOBs! */
5784 					index->clear_instant_alter();
5785 				}
5786 				page_cur_set_after_last(
5787 					block,
5788 					btr_cur_get_page_cur(cursor));
5789 				ret = TRUE;
5790 				goto return_after_reservations;
5791 			}
5792 		}
5793 
5794 		if (UNIV_LIKELY(!is_metadata)) {
5795 			btr_search_update_hash_on_delete(cursor);
5796 		} else {
5797 			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5798 					    index, offsets, mtr);
5799 			/* We must empty the PAGE_FREE list, because
5800 			after rollback, this deleted metadata record
5801 			would carry too many fields, and we would be
5802 			unable to know the size of the freed record. */
5803 			btr_page_reorganize(btr_cur_get_page_cur(cursor),
5804 					    index, mtr);
5805 			ut_ad(!ret);
5806 			goto return_after_reservations;
5807 		}
5808 	} else if (UNIV_UNLIKELY(page_rec_is_first(rec, page))) {
5809 		if (page_rec_is_last(rec, page)) {
5810 discard_page:
5811 			ut_ad(page_get_n_recs(page) == 1);
5812 			/* If there is only one record, drop
5813 			the whole page. */
5814 
5815 			btr_discard_page(cursor, mtr);
5816 
5817 			ret = TRUE;
5818 			goto return_after_reservations;
5819 		}
5820 
5821 		next_rec = page_rec_get_next(rec);
5822 
5823 		if (!page_has_prev(page)) {
5824 			/* If we delete the leftmost node pointer on a
5825 			non-leaf level, we must mark the new leftmost node
5826 			pointer as the predefined minimum record */
5827 
5828 			min_mark_next_rec = true;
5829 		} else if (index->is_spatial()) {
5830 			/* For rtree, if delete the leftmost node pointer,
5831 			we need to update parent page. */
5832 			rtr_mbr_t	father_mbr;
5833 			rec_t*		father_rec;
5834 			btr_cur_t	father_cursor;
5835 			rec_offs*	offsets;
5836 			bool		upd_ret;
5837 			ulint		len;
5838 
5839 			rtr_page_get_father_block(NULL, heap, index,
5840 						  block, mtr, NULL,
5841 						  &father_cursor);
5842 			offsets = rec_get_offsets(
5843 				btr_cur_get_rec(&father_cursor), index, NULL,
5844 				0, ULINT_UNDEFINED, &heap);
5845 
5846 			father_rec = btr_cur_get_rec(&father_cursor);
5847 			rtr_read_mbr(rec_get_nth_field(
5848 				father_rec, offsets, 0, &len), &father_mbr);
5849 
5850 			upd_ret = rtr_update_mbr_field(&father_cursor, offsets,
5851 						       NULL, page, &father_mbr,
5852 						       next_rec, mtr);
5853 
5854 			if (!upd_ret) {
5855 				*err = DB_ERROR;
5856 
5857 				mem_heap_free(heap);
5858 				return(FALSE);
5859 			}
5860 
5861 			ut_d(parent_latched = true);
5862 		} else {
5863 			/* Otherwise, if we delete the leftmost node pointer
5864 			on a page, we have to change the parent node pointer
5865 			so that it is equal to the new leftmost node pointer
5866 			on the page */
5867 			btr_cur_t cursor;
5868 			btr_page_get_father(index, block, mtr, &cursor);
5869 			btr_cur_node_ptr_delete(&cursor, mtr);
5870 			const ulint	level = btr_page_get_level(page);
5871 			// FIXME: reuse the node_ptr from above
5872 			dtuple_t*	node_ptr = dict_index_build_node_ptr(
5873 				index, next_rec, block->page.id().page_no(),
5874 				heap, level);
5875 
5876 			btr_insert_on_non_leaf_level(
5877 				flags, index, level + 1, node_ptr, mtr);
5878 
5879 			ut_d(parent_latched = true);
5880 		}
5881 	}
5882 
5883 	/* SPATIAL INDEX never use SX locks; we can allow page merges
5884 	while holding X lock on the spatial index tree.
5885 	Do not allow merges of non-leaf B-tree pages unless it is
5886 	safe to do so. */
5887 	{
5888 		const bool allow_merge = page_is_leaf(page)
5889 			|| dict_index_is_spatial(index)
5890 			|| btr_cur_will_modify_tree(
5891 				index, page, BTR_INTENTION_DELETE, rec,
5892 				btr_node_ptr_max_size(index),
5893 				block->zip_size(), mtr);
5894 		page_cur_delete_rec(btr_cur_get_page_cur(cursor), index,
5895 				    offsets, mtr);
5896 
5897 		if (min_mark_next_rec) {
5898 			btr_set_min_rec_mark(next_rec, *block, mtr);
5899 		}
5900 
5901 #ifdef UNIV_ZIP_DEBUG
5902 		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5903 #endif /* UNIV_ZIP_DEBUG */
5904 
5905 		ut_ad(!parent_latched
5906 		      || btr_check_node_ptr(index, block, mtr));
5907 
5908 		if (!ret && btr_cur_compress_recommendation(cursor, mtr)) {
5909 			if (UNIV_LIKELY(allow_merge)) {
5910 				ret = btr_cur_compress_if_useful(
5911 					cursor, FALSE, mtr);
5912 			} else {
5913 				ib::warn() << "Not merging page "
5914 					   << block->page.id()
5915 					   << " in index " << index->name
5916 					   << " of " << index->table->name;
5917 				ut_ad("MDEV-14637" == 0);
5918 			}
5919 		}
5920 	}
5921 
5922 return_after_reservations:
5923 	*err = DB_SUCCESS;
5924 
5925 	mem_heap_free(heap);
5926 
5927 	if (!srv_read_only_mode
5928 	    && page_is_leaf(page)
5929 	    && !dict_index_is_online_ddl(index)) {
5930 
5931 		mtr_memo_release(mtr, dict_index_get_lock(index),
5932 				 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
5933 
5934 		/* NOTE: We cannot release root block latch here, because it
5935 		has segment header and already modified in most of cases.*/
5936 	}
5937 
5938 	index->table->space->release_free_extents(n_reserved);
5939 	return(ret);
5940 }
5941 
5942 /** Delete the node pointer in a parent page.
5943 @param[in,out]	parent	cursor pointing to parent record
5944 @param[in,out]	mtr	mini-transaction */
btr_cur_node_ptr_delete(btr_cur_t * parent,mtr_t * mtr)5945 void btr_cur_node_ptr_delete(btr_cur_t* parent, mtr_t* mtr)
5946 {
5947 	ut_ad(mtr->memo_contains_flagged(btr_cur_get_block(parent),
5948 					 MTR_MEMO_PAGE_X_FIX));
5949 	dberr_t err;
5950 	ibool compressed = btr_cur_pessimistic_delete(&err, TRUE, parent,
5951 						      BTR_CREATE_FLAG, false,
5952 						      mtr);
5953 	ut_a(err == DB_SUCCESS);
5954 	if (!compressed) {
5955 		btr_cur_compress_if_useful(parent, FALSE, mtr);
5956 	}
5957 }
5958 
5959 /*******************************************************************//**
5960 Adds path information to the cursor for the current page, for which
5961 the binary search has been performed. */
5962 static
5963 void
btr_cur_add_path_info(btr_cur_t * cursor,ulint height,ulint root_height)5964 btr_cur_add_path_info(
5965 /*==================*/
5966 	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
5967 	ulint		height,		/*!< in: height of the page in tree;
5968 					0 means leaf node */
5969 	ulint		root_height)	/*!< in: root node height in tree */
5970 {
5971 	btr_path_t*	slot;
5972 
5973 	ut_a(cursor->path_arr);
5974 
5975 	if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
5976 		/* Do nothing; return empty path */
5977 
5978 		slot = cursor->path_arr;
5979 		slot->nth_rec = ULINT_UNDEFINED;
5980 
5981 		return;
5982 	}
5983 
5984 	if (height == 0) {
5985 		/* Mark end of slots for path */
5986 		slot = cursor->path_arr + root_height + 1;
5987 		slot->nth_rec = ULINT_UNDEFINED;
5988 	}
5989 
5990 	slot = cursor->path_arr + (root_height - height);
5991 
5992 	const buf_block_t* block = btr_cur_get_block(cursor);
5993 
5994 	slot->nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
5995 	slot->n_recs = page_get_n_recs(block->frame);
5996 	slot->page_no = block->page.id().page_no();
5997 	slot->page_level = btr_page_get_level(block->frame);
5998 }
5999 
6000 /*******************************************************************//**
6001 Estimate the number of rows between slot1 and slot2 for any level on a
6002 B-tree. This function starts from slot1->page and reads a few pages to
6003 the right, counting their records. If we reach slot2->page quickly then
6004 we know exactly how many records there are between slot1 and slot2 and
6005 we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
6006 then we calculate the average number of records in the pages scanned
6007 so far and assume that all pages that we did not scan up to slot2->page
6008 contain the same number of records, then we multiply that average to
6009 the number of pages between slot1->page and slot2->page (which is
6010 n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
6011 @return number of rows, not including the borders (exact or estimated) */
6012 static
6013 ha_rows
btr_estimate_n_rows_in_range_on_level(dict_index_t * index,btr_path_t * slot1,btr_path_t * slot2,ha_rows n_rows_on_prev_level,bool * is_n_rows_exact)6014 btr_estimate_n_rows_in_range_on_level(
6015 /*==================================*/
6016 	dict_index_t*	index,			/*!< in: index */
6017 	btr_path_t*	slot1,			/*!< in: left border */
6018 	btr_path_t*	slot2,			/*!< in: right border */
6019 	ha_rows		n_rows_on_prev_level,	/*!< in: number of rows
6020 						on the previous level for the
6021 						same descend paths; used to
6022 						determine the number of pages
6023 						on this level */
6024 	bool*		is_n_rows_exact)	/*!< out: TRUE if the returned
6025 						value is exact i.e. not an
6026 						estimation */
6027 {
6028 	ha_rows		n_rows = 0;
6029 	uint		n_pages_read = 0;
6030 	ulint		level;
6031 
6032 	/* Assume by default that we will scan all pages between
6033 	slot1->page_no and slot2->page_no. */
6034 	*is_n_rows_exact = true;
6035 
6036 	/* Add records from slot1->page_no which are to the right of
6037 	the record which serves as a left border of the range, if any
6038 	(we don't include the record itself in this count). */
6039 	if (slot1->nth_rec <= slot1->n_recs) {
6040 		n_rows += slot1->n_recs - slot1->nth_rec;
6041 	}
6042 
6043 	/* Add records from slot2->page_no which are to the left of
6044 	the record which servers as a right border of the range, if any
6045 	(we don't include the record itself in this count). */
6046 	if (slot2->nth_rec > 1) {
6047 		n_rows += slot2->nth_rec - 1;
6048 	}
6049 
6050 	/* Count the records in the pages between slot1->page_no and
6051 	slot2->page_no (non inclusive), if any. */
6052 
6053 	/* Do not read more than this number of pages in order not to hurt
6054 	performance with this code which is just an estimation. If we read
6055 	this many pages before reaching slot2->page_no then we estimate the
6056 	average from the pages scanned so far. */
6057 #	define N_PAGES_READ_LIMIT	10
6058 
6059 	const fil_space_t*	space = index->table->space;
6060 	page_id_t		page_id(space->id, slot1->page_no);
6061 	const ulint		zip_size = space->zip_size();
6062 
6063 	level = slot1->page_level;
6064 
6065 	do {
6066 		mtr_t		mtr;
6067 		page_t*		page;
6068 		buf_block_t*	block;
6069 		dberr_t		err=DB_SUCCESS;
6070 
6071 		mtr_start(&mtr);
6072 
6073 		/* Fetch the page. Because we are not holding the
6074 		index->lock, the tree may have changed and we may be
6075 		attempting to read a page that is no longer part of
6076 		the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
6077 		silence a debug assertion about this. */
6078 		block = buf_page_get_gen(page_id, zip_size, RW_S_LATCH,
6079 					 NULL, BUF_GET_POSSIBLY_FREED,
6080 					 __FILE__, __LINE__, &mtr, &err);
6081 
6082 		ut_ad((block != NULL) == (err == DB_SUCCESS));
6083 
6084 		if (!block) {
6085 			if (err == DB_DECRYPTION_FAILED) {
6086 				ib_push_warning((void *)NULL,
6087 					DB_DECRYPTION_FAILED,
6088 					"Table %s is encrypted but encryption service or"
6089 					" used key_id is not available. "
6090 					" Can't continue reading table.",
6091 					index->table->name.m_name);
6092 				index->table->file_unreadable = true;
6093 			}
6094 
6095 			mtr_commit(&mtr);
6096 			goto inexact;
6097 		}
6098 
6099 		page = buf_block_get_frame(block);
6100 
6101 		/* It is possible that the tree has been reorganized in the
6102 		meantime and this is a different page. If this happens the
6103 		calculated estimate will be bogus, which is not fatal as
6104 		this is only an estimate. We are sure that a page with
6105 		page_no exists because InnoDB never frees pages, only
6106 		reuses them. */
6107 		if (!fil_page_index_page_check(page)
6108 		    || btr_page_get_index_id(page) != index->id
6109 		    || btr_page_get_level(page) != level) {
6110 
6111 			/* The page got reused for something else */
6112 			mtr_commit(&mtr);
6113 			goto inexact;
6114 		}
6115 
6116 		/* It is possible but highly unlikely that the page was
6117 		originally written by an old version of InnoDB that did
6118 		not initialize FIL_PAGE_TYPE on other than B-tree pages.
6119 		For example, this could be an almost-empty BLOB page
6120 		that happens to contain the magic values in the fields
6121 		that we checked above. */
6122 
6123 		n_pages_read++;
6124 
6125 		if (page_id.page_no() != slot1->page_no) {
6126 			/* Do not count the records on slot1->page_no,
6127 			we already counted them before this loop. */
6128 			n_rows += page_get_n_recs(page);
6129 		}
6130 
6131 		page_id.set_page_no(btr_page_get_next(page));
6132 
6133 		mtr_commit(&mtr);
6134 
6135 		if (n_pages_read == N_PAGES_READ_LIMIT
6136 		    || page_id.page_no() == FIL_NULL) {
6137 			/* Either we read too many pages or
6138 			we reached the end of the level without passing
6139 			through slot2->page_no, the tree must have changed
6140 			in the meantime */
6141 			goto inexact;
6142 		}
6143 
6144 	} while (page_id.page_no() != slot2->page_no);
6145 
6146 	return(n_rows);
6147 
6148 inexact:
6149 
6150 	*is_n_rows_exact = false;
6151 
6152 	/* We did interrupt before reaching slot2->page */
6153 
6154 	if (n_pages_read > 0) {
6155 		/* The number of pages on this level is
6156 		n_rows_on_prev_level, multiply it by the
6157 		average number of recs per page so far */
6158 		n_rows = n_rows_on_prev_level * n_rows / n_pages_read;
6159 	} else {
6160 		/* The tree changed before we could even
6161 		start with slot1->page_no */
6162 		n_rows = 10;
6163 	}
6164 
6165 	return(n_rows);
6166 }
6167 
6168 /** If the tree gets changed too much between the two dives for the left
6169 and right boundary then btr_estimate_n_rows_in_range_low() will retry
6170 that many times before giving up and returning the value stored in
6171 rows_in_range_arbitrary_ret_val. */
6172 static const unsigned	rows_in_range_max_retries = 4;
6173 
6174 /** We pretend that a range has that many records if the tree keeps changing
6175 for rows_in_range_max_retries retries while we try to estimate the records
6176 in a given range. */
6177 static const ha_rows	rows_in_range_arbitrary_ret_val = 10;
6178 
6179 /** Estimates the number of rows in a given index range.
6180 @param[in]	index		index
6181 @param[in]	tuple1		range start
6182 @param[in]	tuple2		range end
6183 @param[in]	nth_attempt	if the tree gets modified too much while
6184 we are trying to analyze it, then we will retry (this function will call
6185 itself, incrementing this parameter)
6186 @return estimated number of rows; if after rows_in_range_max_retries
6187 retries the tree keeps changing, then we will just return
6188 rows_in_range_arbitrary_ret_val as a result (if
6189 nth_attempt >= rows_in_range_max_retries and the tree is modified between
6190 the two dives). */
6191 static
6192 ha_rows
btr_estimate_n_rows_in_range_low(dict_index_t * index,btr_pos_t * tuple1,btr_pos_t * tuple2,unsigned nth_attempt)6193 btr_estimate_n_rows_in_range_low(
6194 	dict_index_t*	index,
6195 	btr_pos_t*	tuple1,
6196 	btr_pos_t*	tuple2,
6197 	unsigned	nth_attempt)
6198 {
6199 	btr_path_t	path1[BTR_PATH_ARRAY_N_SLOTS];
6200 	btr_path_t	path2[BTR_PATH_ARRAY_N_SLOTS];
6201 	btr_cur_t	cursor;
6202 	btr_path_t*	slot1;
6203 	btr_path_t*	slot2;
6204 	bool		diverged;
6205 	bool		diverged_lot;
6206 	ulint		divergence_level;
6207 	ha_rows		n_rows;
6208 	bool		is_n_rows_exact;
6209 	ulint		i;
6210 	mtr_t		mtr;
6211 	ha_rows		table_n_rows;
6212         page_cur_mode_t mode2= tuple2->mode;
6213 
6214 	table_n_rows = dict_table_get_n_rows(index->table);
6215 
6216 	/* Below we dive to the two records specified by tuple1 and tuple2 and
6217 	we remember the entire dive paths from the tree root. The place where
6218 	the tuple1 path ends on the leaf level we call "left border" of our
6219 	interval and the place where the tuple2 path ends on the leaf level -
6220 	"right border". We take care to either include or exclude the interval
6221 	boundaries depending on whether <, <=, > or >= was specified. For
6222 	example if "5 < x AND x <= 10" then we should not include the left
6223 	boundary, but should include the right one. */
6224 
6225 	mtr_start(&mtr);
6226 
6227 	cursor.path_arr = path1;
6228 
6229 	bool	should_count_the_left_border;
6230 
6231 	if (dtuple_get_n_fields(tuple1->tuple) > 0) {
6232 
6233               btr_cur_search_to_nth_level(index, 0, tuple1->tuple,
6234                                             tuple1->mode,
6235 					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
6236 					    &cursor, 0,
6237 					    __FILE__, __LINE__, &mtr);
6238 
6239 		ut_ad(!page_rec_is_infimum(btr_cur_get_rec(&cursor)));
6240 
6241 		/* We should count the border if there are any records to
6242 		match the criteria, i.e. if the maximum record on the tree is
6243 		5 and x > 3 is specified then the cursor will be positioned at
6244 		5 and we should count the border, but if x > 7 is specified,
6245 		then the cursor will be positioned at 'sup' on the rightmost
6246 		leaf page in the tree and we should not count the border. */
6247 		should_count_the_left_border
6248 			= !page_rec_is_supremum(btr_cur_get_rec(&cursor));
6249 	} else {
6250 		dberr_t err = DB_SUCCESS;
6251 
6252 		err = btr_cur_open_at_index_side(true, index,
6253 					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
6254 					   &cursor, 0, &mtr);
6255 
6256 		if (err != DB_SUCCESS) {
6257 			ib::warn() << " Error code: " << err
6258 				   << " btr_estimate_n_rows_in_range_low "
6259 				   << " called from file: "
6260 				   << __FILE__ << " line: " << __LINE__
6261 				   << " table: " << index->table->name
6262 				   << " index: " << index->name;
6263 		}
6264 
6265 		ut_ad(page_rec_is_infimum(btr_cur_get_rec(&cursor)));
6266 
6267 		/* The range specified is wihout a left border, just
6268 		'x < 123' or 'x <= 123' and btr_cur_open_at_index_side()
6269 		positioned the cursor on the infimum record on the leftmost
6270 		page, which must not be counted. */
6271 		should_count_the_left_border = false;
6272 	}
6273 
6274         tuple1->page_id= cursor.page_cur.block->page.id();
6275 
6276 	mtr_commit(&mtr);
6277 
6278 	if (!index->is_readable()) {
6279 		return 0;
6280 	}
6281 
6282 	mtr_start(&mtr);
6283 
6284 	cursor.path_arr = path2;
6285 
6286 	bool	should_count_the_right_border;
6287 
6288 	if (dtuple_get_n_fields(tuple2->tuple) > 0) {
6289 
6290 		btr_cur_search_to_nth_level(index, 0, tuple2->tuple,
6291                                             mode2,
6292 					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
6293 					    &cursor, 0,
6294 					    __FILE__, __LINE__, &mtr);
6295 
6296 		const rec_t*	rec = btr_cur_get_rec(&cursor);
6297 
6298 		ut_ad(!(mode2 == PAGE_CUR_L && page_rec_is_supremum(rec)));
6299 
6300 		should_count_the_right_border
6301 			= (mode2 == PAGE_CUR_LE /* if the range is '<=' */
6302 			   /* and the record was found */
6303 			   && cursor.low_match >= dtuple_get_n_fields(tuple2->tuple))
6304 			|| (mode2 == PAGE_CUR_L /* or if the range is '<' */
6305 			    /* and there are any records to match the criteria,
6306 			    i.e. if the minimum record on the tree is 5 and
6307 			    x < 7 is specified then the cursor will be
6308 			    positioned at 5 and we should count the border, but
6309 			    if x < 2 is specified, then the cursor will be
6310 			    positioned at 'inf' and we should not count the
6311 			    border */
6312 			    && !page_rec_is_infimum(rec));
6313 		/* Notice that for "WHERE col <= 'foo'" MySQL passes to
6314 		ha_innobase::records_in_range():
6315 		min_key=NULL (left-unbounded) which is expected
6316 		max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is
6317 		unexpected - one would expect
6318 		flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In this case the
6319 		cursor will be positioned on the first record to the right of
6320 		the requested one (can also be positioned on the 'sup') and
6321 		we should not count the right border. */
6322 	} else {
6323 		dberr_t err = DB_SUCCESS;
6324 
6325 		err = btr_cur_open_at_index_side(false, index,
6326 					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
6327 					   &cursor, 0, &mtr);
6328 
6329 		if (err != DB_SUCCESS) {
6330 			ib::warn() << " Error code: " << err
6331 				   << " btr_estimate_n_rows_in_range_low "
6332 				   << " called from file: "
6333 				   << __FILE__ << " line: " << __LINE__
6334 				   << " table: " << index->table->name
6335 				   << " index: " << index->name;
6336 		}
6337 
6338 		ut_ad(page_rec_is_supremum(btr_cur_get_rec(&cursor)));
6339 
6340 		/* The range specified is wihout a right border, just
6341 		'x > 123' or 'x >= 123' and btr_cur_open_at_index_side()
6342 		positioned the cursor on the supremum record on the rightmost
6343 		page, which must not be counted. */
6344 		should_count_the_right_border = false;
6345 	}
6346 
6347         tuple2->page_id= cursor.page_cur.block->page.id();
6348 
6349 	mtr_commit(&mtr);
6350 
6351 	/* We have the path information for the range in path1 and path2 */
6352 
6353 	n_rows = 0;
6354 	is_n_rows_exact = true;
6355 
6356 	/* This becomes true when the two paths do not pass through the
6357 	same pages anymore. */
6358 	diverged = false;
6359 
6360 	/* This becomes true when the paths are not the same or adjacent
6361 	any more. This means that they pass through the same or
6362 	neighboring-on-the-same-level pages only. */
6363 	diverged_lot = false;
6364 
6365 	/* This is the level where paths diverged a lot. */
6366 	divergence_level = 1000000;
6367 
6368 	for (i = 0; ; i++) {
6369 		ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
6370 
6371 		slot1 = path1 + i;
6372 		slot2 = path2 + i;
6373 
6374 		if (slot1->nth_rec == ULINT_UNDEFINED
6375 		    || slot2->nth_rec == ULINT_UNDEFINED) {
6376 
6377 			/* Here none of the borders were counted. For example,
6378 			if on the leaf level we descended to:
6379 			(inf, a, b, c, d, e, f, sup)
6380 			         ^        ^
6381 			       path1    path2
6382 			then n_rows will be 2 (c and d). */
6383 
6384 			if (is_n_rows_exact) {
6385 				/* Only fiddle to adjust this off-by-one
6386 				if the number is exact, otherwise we do
6387 				much grosser adjustments below. */
6388 
6389 				btr_path_t*	last1 = &path1[i - 1];
6390 				btr_path_t*	last2 = &path2[i - 1];
6391 
6392 				/* If both paths end up on the same record on
6393 				the leaf level. */
6394 				if (last1->page_no == last2->page_no
6395 				    && last1->nth_rec == last2->nth_rec) {
6396 
6397 					/* n_rows can be > 0 here if the paths
6398 					were first different and then converged
6399 					to the same record on the leaf level.
6400 					For example:
6401 					SELECT ... LIKE 'wait/synch/rwlock%'
6402 					mode1=PAGE_CUR_GE,
6403 					tuple1="wait/synch/rwlock"
6404 					path1[0]={nth_rec=58, n_recs=58,
6405 						  page_no=3, page_level=1}
6406 					path1[1]={nth_rec=56, n_recs=55,
6407 						  page_no=119, page_level=0}
6408 
6409 					mode2=PAGE_CUR_G
6410 					tuple2="wait/synch/rwlock"
6411 					path2[0]={nth_rec=57, n_recs=57,
6412 						  page_no=3, page_level=1}
6413 					path2[1]={nth_rec=56, n_recs=55,
6414 						  page_no=119, page_level=0} */
6415 
6416 					/* If the range is such that we should
6417 					count both borders, then avoid
6418 					counting that record twice - once as a
6419 					left border and once as a right
6420 					border. */
6421 					if (should_count_the_left_border
6422 					    && should_count_the_right_border) {
6423 
6424 						n_rows = 1;
6425 					} else {
6426 						/* Some of the borders should
6427 						not be counted, e.g. [3,3). */
6428 						n_rows = 0;
6429 					}
6430 				} else {
6431 					if (should_count_the_left_border) {
6432 						n_rows++;
6433 					}
6434 
6435 					if (should_count_the_right_border) {
6436 						n_rows++;
6437 					}
6438 				}
6439 			}
6440 
6441 			if (i > divergence_level + 1 && !is_n_rows_exact) {
6442 				/* In trees whose height is > 1 our algorithm
6443 				tends to underestimate: multiply the estimate
6444 				by 2: */
6445 
6446 				n_rows = n_rows * 2;
6447 			}
6448 
6449 			DBUG_EXECUTE_IF("bug14007649", return(n_rows););
6450 
6451 			/* Do not estimate the number of rows in the range
6452 			to over 1 / 2 of the estimated rows in the whole
6453 			table */
6454 
6455 			if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {
6456 
6457 				n_rows = table_n_rows / 2;
6458 
6459 				/* If there are just 0 or 1 rows in the table,
6460 				then we estimate all rows are in the range */
6461 
6462 				if (n_rows == 0) {
6463 					n_rows = table_n_rows;
6464 				}
6465 			}
6466 
6467 			return(n_rows);
6468 		}
6469 
6470 		if (!diverged && slot1->nth_rec != slot2->nth_rec) {
6471 
6472 			/* If both slots do not point to the same page,
6473 			this means that the tree must have changed between
6474 			the dive for slot1 and the dive for slot2 at the
6475 			beginning of this function. */
6476 			if (slot1->page_no != slot2->page_no
6477 			    || slot1->page_level != slot2->page_level) {
6478 
6479 				/* If the tree keeps changing even after a
6480 				few attempts, then just return some arbitrary
6481 				number. */
6482 				if (nth_attempt >= rows_in_range_max_retries) {
6483 					return(rows_in_range_arbitrary_ret_val);
6484 				}
6485 
6486 				return btr_estimate_n_rows_in_range_low(
6487                                        index, tuple1, tuple2,
6488                                        nth_attempt + 1);
6489 			}
6490 
6491 			diverged = true;
6492 
6493 			if (slot1->nth_rec < slot2->nth_rec) {
6494 				/* We do not count the borders (nor the left
6495 				nor the right one), thus "- 1". */
6496 				n_rows = slot2->nth_rec - slot1->nth_rec - 1;
6497 
6498 				if (n_rows > 0) {
6499 					/* There is at least one row between
6500 					the two borders pointed to by slot1
6501 					and slot2, so on the level below the
6502 					slots will point to non-adjacent
6503 					pages. */
6504 					diverged_lot = true;
6505 					divergence_level = i;
6506 				}
6507 			} else {
6508 				/* It is possible that
6509 				slot1->nth_rec >= slot2->nth_rec
6510 				if, for example, we have a single page
6511 				tree which contains (inf, 5, 6, supr)
6512 				and we select where x > 20 and x < 30;
6513 				in this case slot1->nth_rec will point
6514 				to the supr record and slot2->nth_rec
6515 				will point to 6. */
6516 				n_rows = 0;
6517 				should_count_the_left_border = false;
6518 				should_count_the_right_border = false;
6519 			}
6520 
6521 		} else if (diverged && !diverged_lot) {
6522 
6523 			if (slot1->nth_rec < slot1->n_recs
6524 			    || slot2->nth_rec > 1) {
6525 
6526 				diverged_lot = true;
6527 				divergence_level = i;
6528 
6529 				n_rows = 0;
6530 
6531 				if (slot1->nth_rec < slot1->n_recs) {
6532 					n_rows += slot1->n_recs
6533 						- slot1->nth_rec;
6534 				}
6535 
6536 				if (slot2->nth_rec > 1) {
6537 					n_rows += slot2->nth_rec - 1;
6538 				}
6539 			}
6540 		} else if (diverged_lot) {
6541 
6542 			n_rows = btr_estimate_n_rows_in_range_on_level(
6543 				index, slot1, slot2, n_rows,
6544 				&is_n_rows_exact);
6545 		}
6546 	}
6547 }
6548 
6549 /** Estimates the number of rows in a given index range.
6550 @param[in]	index	index
6551 @param[in]	tuple1	range start, may also be empty tuple
6552 @param[in]	mode1	search mode for range start
6553 @param[in]	tuple2	range end, may also be empty tuple
6554 @param[in]	mode2	search mode for range end
6555 @return estimated number of rows */
6556 ha_rows
btr_estimate_n_rows_in_range(dict_index_t * index,btr_pos_t * tuple1,btr_pos_t * tuple2)6557 btr_estimate_n_rows_in_range(
6558 	dict_index_t*	index,
6559         btr_pos_t       *tuple1,
6560         btr_pos_t       *tuple2)
6561 {
6562 	return btr_estimate_n_rows_in_range_low(
6563 		index, tuple1, tuple2, 1);
6564 }
6565 
6566 /*******************************************************************//**
6567 Record the number of non_null key values in a given index for
6568 each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6569 The estimates are eventually stored in the array:
6570 index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
6571 static
6572 void
btr_record_not_null_field_in_rec(ulint n_unique,const rec_offs * offsets,ib_uint64_t * n_not_null)6573 btr_record_not_null_field_in_rec(
6574 /*=============================*/
6575 	ulint		n_unique,	/*!< in: dict_index_get_n_unique(index),
6576 					number of columns uniquely determine
6577 					an index entry */
6578 	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index),
6579 					its size could be for all fields or
6580 					that of "n_unique" */
6581 	ib_uint64_t*	n_not_null)	/*!< in/out: array to record number of
6582 					not null rows for n-column prefix */
6583 {
6584 	ulint	i;
6585 
6586 	ut_ad(rec_offs_n_fields(offsets) >= n_unique);
6587 
6588 	if (n_not_null == NULL) {
6589 		return;
6590 	}
6591 
6592 	for (i = 0; i < n_unique; i++) {
6593 		if (rec_offs_nth_sql_null(offsets, i)) {
6594 			break;
6595 		}
6596 
6597 		n_not_null[i]++;
6598 	}
6599 }
6600 
6601 /** Estimates the number of different key values in a given index, for
6602 each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6603 The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
6604 0..n_uniq-1) and the number of pages that were sampled is saved in
6605 result.n_sample_sizes[].
6606 If innodb_stats_method is nulls_ignored, we also record the number of
6607 non-null values for each prefix and stored the estimates in
6608 array result.n_non_null_key_vals.
6609 @param[in]	index	index
6610 @return vector with statistics information
6611 empty vector if the index is unavailable. */
6612 std::vector<index_field_stats_t>
btr_estimate_number_of_different_key_vals(dict_index_t * index)6613 btr_estimate_number_of_different_key_vals(dict_index_t* index)
6614 {
6615 	btr_cur_t	cursor;
6616 	page_t*		page;
6617 	rec_t*		rec;
6618 	ulint		n_cols;
6619 	ib_uint64_t*	n_diff;
6620 	ib_uint64_t*	n_not_null;
6621 	ibool		stats_null_not_equal;
6622 	uintmax_t	n_sample_pages=1; /* number of pages to sample */
6623 	ulint		not_empty_flag	= 0;
6624 	ulint		total_external_size = 0;
6625 	ulint		i;
6626 	ulint		j;
6627 	uintmax_t	add_on;
6628 	mtr_t		mtr;
6629 	mem_heap_t*	heap		= NULL;
6630 	rec_offs*	offsets_rec	= NULL;
6631 	rec_offs*	offsets_next_rec = NULL;
6632 
6633 	std::vector<index_field_stats_t> result;
6634 
6635 	/* For spatial index, there is no such stats can be
6636 	fetched. */
6637 	ut_ad(!dict_index_is_spatial(index));
6638 
6639 	n_cols = dict_index_get_n_unique(index);
6640 
6641 	heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
6642 			       * n_cols
6643 			       + dict_index_get_n_fields(index)
6644 			       * (sizeof *offsets_rec
6645 				  + sizeof *offsets_next_rec));
6646 
6647 	n_diff = (ib_uint64_t*) mem_heap_zalloc(
6648 		heap, n_cols * sizeof(n_diff[0]));
6649 
6650 	n_not_null = NULL;
6651 
6652 	/* Check srv_innodb_stats_method setting, and decide whether we
6653 	need to record non-null value and also decide if NULL is
6654 	considered equal (by setting stats_null_not_equal value) */
6655 	switch (srv_innodb_stats_method) {
6656 	case SRV_STATS_NULLS_IGNORED:
6657 		n_not_null = (ib_uint64_t*) mem_heap_zalloc(
6658 			heap, n_cols * sizeof *n_not_null);
6659 		/* fall through */
6660 
6661 	case SRV_STATS_NULLS_UNEQUAL:
6662 		/* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
6663 		case, we will treat NULLs as unequal value */
6664 		stats_null_not_equal = TRUE;
6665 		break;
6666 
6667 	case SRV_STATS_NULLS_EQUAL:
6668 		stats_null_not_equal = FALSE;
6669 		break;
6670 
6671 	default:
6672 		ut_error;
6673 	}
6674 
6675 	if (srv_stats_sample_traditional) {
6676 		/* It makes no sense to test more pages than are contained
6677 		in the index, thus we lower the number if it is too high */
6678 		if (srv_stats_transient_sample_pages > index->stat_index_size) {
6679 			if (index->stat_index_size > 0) {
6680 				n_sample_pages = index->stat_index_size;
6681 			}
6682 		} else {
6683 			n_sample_pages = srv_stats_transient_sample_pages;
6684 		}
6685 	} else {
6686 		/* New logaritmic number of pages that are estimated.
6687 		Number of pages estimated should be between 1 and
6688 		index->stat_index_size.
6689 
6690 		If we have only 0 or 1 index pages then we can only take 1
6691 		sample. We have already initialized n_sample_pages to 1.
6692 
6693 		So taking index size as I and sample as S and log(I)*S as L
6694 
6695 		requirement 1) we want the out limit of the expression to not exceed I;
6696 		requirement 2) we want the ideal pages to be at least S;
6697 		so the current expression is min(I, max( min(S,I), L)
6698 
6699 		looking for simplifications:
6700 
6701 		case 1: assume S < I
6702 		min(I, max( min(S,I), L) -> min(I , max( S, L))
6703 
6704 		but since L=LOG2(I)*S and log2(I) >=1   L>S always so max(S,L) = L.
6705 
6706 		so we have: min(I , L)
6707 
6708 		case 2: assume I < S
6709 		    min(I, max( min(S,I), L) -> min(I, max( I, L))
6710 
6711 		case 2a: L > I
6712 		    min(I, max( I, L)) -> min(I, L) -> I
6713 
6714 		case 2b: when L < I
6715 		    min(I, max( I, L))  ->  min(I, I ) -> I
6716 
6717 		so taking all case2 paths is I, our expression is:
6718 		n_pages = S < I? min(I,L) : I
6719                 */
6720 		if (index->stat_index_size > 1) {
6721 			n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size)
6722 				? ut_min(index->stat_index_size,
6723 					 static_cast<ulint>(
6724 						 log2(double(index->stat_index_size))
6725 						 * double(srv_stats_transient_sample_pages)))
6726 				: index->stat_index_size;
6727 		}
6728 	}
6729 
6730 	/* Sanity check */
6731 	ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));
6732 
6733 	/* We sample some pages in the index to get an estimate */
6734 
6735 	for (i = 0; i < n_sample_pages; i++) {
6736 		mtr_start(&mtr);
6737 
6738 		bool	available;
6739 
6740 		available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
6741 						    &cursor, &mtr);
6742 
6743 		if (!available) {
6744 			mtr_commit(&mtr);
6745 			mem_heap_free(heap);
6746 
6747 			return result;
6748 		}
6749 
6750 		/* Count the number of different key values for each prefix of
6751 		the key on this index page. If the prefix does not determine
6752 		the index record uniquely in the B-tree, then we subtract one
6753 		because otherwise our algorithm would give a wrong estimate
6754 		for an index where there is just one key value. */
6755 
6756 		if (!index->is_readable()) {
6757 			mtr_commit(&mtr);
6758 			goto exit_loop;
6759 		}
6760 
6761 		page = btr_cur_get_page(&cursor);
6762 
6763 		rec = page_rec_get_next(page_get_infimum_rec(page));
6764 		const ulint n_core = page_is_leaf(page)
6765 			? index->n_core_fields : 0;
6766 
6767 		if (!page_rec_is_supremum(rec)) {
6768 			not_empty_flag = 1;
6769 			offsets_rec = rec_get_offsets(rec, index, offsets_rec,
6770 						      n_core,
6771 						      ULINT_UNDEFINED, &heap);
6772 
6773 			if (n_not_null != NULL) {
6774 				btr_record_not_null_field_in_rec(
6775 					n_cols, offsets_rec, n_not_null);
6776 			}
6777 		}
6778 
6779 		while (!page_rec_is_supremum(rec)) {
6780 			ulint	matched_fields;
6781 			rec_t*	next_rec = page_rec_get_next(rec);
6782 			if (page_rec_is_supremum(next_rec)) {
6783 				total_external_size +=
6784 					btr_rec_get_externally_stored_len(
6785 						rec, offsets_rec);
6786 				break;
6787 			}
6788 
6789 			offsets_next_rec = rec_get_offsets(next_rec, index,
6790 							   offsets_next_rec,
6791 							   n_core,
6792 							   ULINT_UNDEFINED,
6793 							   &heap);
6794 
6795 			cmp_rec_rec(rec, next_rec,
6796 				    offsets_rec, offsets_next_rec,
6797 				    index, stats_null_not_equal,
6798 				    &matched_fields);
6799 
6800 			for (j = matched_fields; j < n_cols; j++) {
6801 				/* We add one if this index record has
6802 				a different prefix from the previous */
6803 
6804 				n_diff[j]++;
6805 			}
6806 
6807 			if (n_not_null != NULL) {
6808 				btr_record_not_null_field_in_rec(
6809 					n_cols, offsets_next_rec, n_not_null);
6810 			}
6811 
6812 			total_external_size
6813 				+= btr_rec_get_externally_stored_len(
6814 					rec, offsets_rec);
6815 
6816 			rec = next_rec;
6817 			/* Initialize offsets_rec for the next round
6818 			and assign the old offsets_rec buffer to
6819 			offsets_next_rec. */
6820 			{
6821 				rec_offs* offsets_tmp = offsets_rec;
6822 				offsets_rec = offsets_next_rec;
6823 				offsets_next_rec = offsets_tmp;
6824 			}
6825 		}
6826 
6827 		if (n_cols == dict_index_get_n_unique_in_tree(index)
6828 		    && page_has_siblings(page)) {
6829 
6830 			/* If there is more than one leaf page in the tree,
6831 			we add one because we know that the first record
6832 			on the page certainly had a different prefix than the
6833 			last record on the previous index page in the
6834 			alphabetical order. Before this fix, if there was
6835 			just one big record on each clustered index page, the
6836 			algorithm grossly underestimated the number of rows
6837 			in the table. */
6838 
6839 			n_diff[n_cols - 1]++;
6840 		}
6841 
6842 		mtr_commit(&mtr);
6843 	}
6844 
6845 exit_loop:
6846 	/* If we saw k borders between different key values on
6847 	n_sample_pages leaf pages, we can estimate how many
6848 	there will be in index->stat_n_leaf_pages */
6849 
6850 	/* We must take into account that our sample actually represents
6851 	also the pages used for external storage of fields (those pages are
6852 	included in index->stat_n_leaf_pages) */
6853 
6854 	result.reserve(n_cols);
6855 
6856 	for (j = 0; j < n_cols; j++) {
6857 		index_field_stats_t stat;
6858 
6859 		stat.n_diff_key_vals
6860 			= BTR_TABLE_STATS_FROM_SAMPLE(
6861 				n_diff[j], index, n_sample_pages,
6862 				total_external_size, not_empty_flag);
6863 
6864 		/* If the tree is small, smaller than
6865 		10 * n_sample_pages + total_external_size, then
6866 		the above estimate is ok. For bigger trees it is common that we
6867 		do not see any borders between key values in the few pages
6868 		we pick. But still there may be n_sample_pages
6869 		different key values, or even more. Let us try to approximate
6870 		that: */
6871 
6872 		add_on = index->stat_n_leaf_pages
6873 			/ (10 * (n_sample_pages
6874 				 + total_external_size));
6875 
6876 		if (add_on > n_sample_pages) {
6877 			add_on = n_sample_pages;
6878 		}
6879 
6880 		stat.n_diff_key_vals += add_on;
6881 
6882 		stat.n_sample_sizes = n_sample_pages;
6883 
6884 		if (n_not_null != NULL) {
6885 			stat.n_non_null_key_vals =
6886 				 BTR_TABLE_STATS_FROM_SAMPLE(
6887 					n_not_null[j], index, n_sample_pages,
6888 					total_external_size, not_empty_flag);
6889 		}
6890 
6891 		result.push_back(stat);
6892 	}
6893 
6894 	mem_heap_free(heap);
6895 
6896 	return result;
6897 }
6898 
6899 /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
6900 
6901 /***********************************************************//**
6902 Gets the offset of the pointer to the externally stored part of a field.
6903 @return offset of the pointer to the externally stored part */
6904 static
6905 ulint
btr_rec_get_field_ref_offs(const rec_offs * offsets,ulint n)6906 btr_rec_get_field_ref_offs(
6907 /*=======================*/
6908 	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
6909 	ulint		n)	/*!< in: index of the external field */
6910 {
6911 	ulint	field_ref_offs;
6912 	ulint	local_len;
6913 
6914 	ut_a(rec_offs_nth_extern(offsets, n));
6915 	field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
6916 	ut_a(len_is_stored(local_len));
6917 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6918 
6919 	return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
6920 }
6921 
6922 /** Gets a pointer to the externally stored part of a field.
6923 @param rec record
6924 @param offsets rec_get_offsets(rec)
6925 @param n index of the externally stored field
6926 @return pointer to the externally stored part */
6927 #define btr_rec_get_field_ref(rec, offsets, n)			\
6928 	((rec) + btr_rec_get_field_ref_offs(offsets, n))
6929 
6930 /** Gets the externally stored size of a record, in units of a database page.
6931 @param[in]	rec	record
6932 @param[in]	offsets	array returned by rec_get_offsets()
6933 @return externally stored part, in units of a database page */
6934 ulint
btr_rec_get_externally_stored_len(const rec_t * rec,const rec_offs * offsets)6935 btr_rec_get_externally_stored_len(
6936 	const rec_t*	rec,
6937 	const rec_offs*	offsets)
6938 {
6939 	ulint	n_fields;
6940 	ulint	total_extern_len = 0;
6941 	ulint	i;
6942 
6943 	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6944 
6945 	if (!rec_offs_any_extern(offsets)) {
6946 		return(0);
6947 	}
6948 
6949 	n_fields = rec_offs_n_fields(offsets);
6950 
6951 	for (i = 0; i < n_fields; i++) {
6952 		if (rec_offs_nth_extern(offsets, i)) {
6953 
6954 			ulint	extern_len = mach_read_from_4(
6955 				btr_rec_get_field_ref(rec, offsets, i)
6956 				+ BTR_EXTERN_LEN + 4);
6957 
6958 			total_extern_len += ut_calc_align(
6959 				extern_len, ulint(srv_page_size));
6960 		}
6961 	}
6962 
6963 	return total_extern_len >> srv_page_size_shift;
6964 }
6965 
6966 /*******************************************************************//**
6967 Sets the ownership bit of an externally stored field in a record. */
6968 static
6969 void
btr_cur_set_ownership_of_extern_field(buf_block_t * block,rec_t * rec,dict_index_t * index,const rec_offs * offsets,ulint i,bool val,mtr_t * mtr)6970 btr_cur_set_ownership_of_extern_field(
6971 /*==================================*/
6972 	buf_block_t*	block,	/*!< in/out: index page */
6973 	rec_t*		rec,	/*!< in/out: clustered index record */
6974 	dict_index_t*	index,	/*!< in: index of the page */
6975 	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
6976 	ulint		i,	/*!< in: field number */
6977 	bool		val,	/*!< in: value to set */
6978 	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
6979 {
6980 	byte*	data;
6981 	ulint	local_len;
6982 	ulint	byte_val;
6983 
6984 	data = rec_get_nth_field(rec, offsets, i, &local_len);
6985 	ut_ad(rec_offs_nth_extern(offsets, i));
6986 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6987 
6988 	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
6989 
6990 	byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
6991 
6992 	if (val) {
6993 		byte_val &= ~BTR_EXTERN_OWNER_FLAG;
6994 	} else {
6995 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6996 		ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
6997 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6998 		byte_val |= BTR_EXTERN_OWNER_FLAG;
6999 	}
7000 
7001 	if (UNIV_LIKELY_NULL(block->page.zip.data)) {
7002 		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
7003 		page_zip_write_blob_ptr(block, rec, index, offsets, i, mtr);
7004 	} else {
7005 		mtr->write<1,mtr_t::MAYBE_NOP>(*block, data + local_len
7006 					       + BTR_EXTERN_LEN, byte_val);
7007 	}
7008 }
7009 
7010 /*******************************************************************//**
7011 Marks non-updated off-page fields as disowned by this record. The ownership
7012 must be transferred to the updated record which is inserted elsewhere in the
7013 index tree. In purge only the owner of externally stored field is allowed
7014 to free the field. */
7015 void
btr_cur_disown_inherited_fields(buf_block_t * block,rec_t * rec,dict_index_t * index,const rec_offs * offsets,const upd_t * update,mtr_t * mtr)7016 btr_cur_disown_inherited_fields(
7017 /*============================*/
7018 	buf_block_t*	block,	/*!< in/out: index page */
7019 	rec_t*		rec,	/*!< in/out: record in a clustered index */
7020 	dict_index_t*	index,	/*!< in: index of the page */
7021 	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
7022 	const upd_t*	update,	/*!< in: update vector */
7023 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
7024 {
7025 	ut_ad(rec_offs_validate(rec, index, offsets));
7026 	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
7027 	ut_ad(rec_offs_any_extern(offsets));
7028 
7029 	for (uint16_t i = 0; i < rec_offs_n_fields(offsets); i++) {
7030 		if (rec_offs_nth_extern(offsets, i)
7031 		    && !upd_get_field_by_field_no(update, i, false)) {
7032 			btr_cur_set_ownership_of_extern_field(
7033 				block, rec, index, offsets, i, false, mtr);
7034 		}
7035 	}
7036 }
7037 
7038 /*******************************************************************//**
7039 Marks all extern fields in a record as owned by the record. This function
7040 should be called if the delete mark of a record is removed: a not delete
7041 marked record always owns all its extern fields. */
7042 static
7043 void
btr_cur_unmark_extern_fields(buf_block_t * block,rec_t * rec,dict_index_t * index,const rec_offs * offsets,mtr_t * mtr)7044 btr_cur_unmark_extern_fields(
7045 /*=========================*/
7046 	buf_block_t*	block,	/*!< in/out: index page */
7047 	rec_t*		rec,	/*!< in/out: record in a clustered index */
7048 	dict_index_t*	index,	/*!< in: index of the page */
7049 	const rec_offs*	offsets,/*!< in: array returned by rec_get_offsets() */
7050 	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
7051 {
7052 	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
7053 	if (!rec_offs_any_extern(offsets)) {
7054 		return;
7055 	}
7056 
7057 	const ulint n = rec_offs_n_fields(offsets);
7058 
7059 	for (ulint i = 0; i < n; i++) {
7060 		if (rec_offs_nth_extern(offsets, i)) {
7061 			btr_cur_set_ownership_of_extern_field(
7062 				block, rec, index, offsets, i, true, mtr);
7063 		}
7064 	}
7065 }
7066 
7067 /*******************************************************************//**
7068 Returns the length of a BLOB part stored on the header page.
7069 @return part length */
7070 static
7071 uint32_t
btr_blob_get_part_len(const byte * blob_header)7072 btr_blob_get_part_len(
7073 /*==================*/
7074 	const byte*	blob_header)	/*!< in: blob header */
7075 {
7076 	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
7077 }
7078 
7079 /*******************************************************************//**
7080 Returns the page number where the next BLOB part is stored.
7081 @return page number or FIL_NULL if no more pages */
7082 static
7083 uint32_t
btr_blob_get_next_page_no(const byte * blob_header)7084 btr_blob_get_next_page_no(
7085 /*======================*/
7086 	const byte*	blob_header)	/*!< in: blob header */
7087 {
7088 	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
7089 }
7090 
7091 /** Deallocate a buffer block that was reserved for a BLOB part.
7092 @param block   buffer block
7093 @param all     flag whether to remove a ROW_FORMAT=COMPRESSED page
7094 @param mtr     mini-transaction to commit */
btr_blob_free(buf_block_t * block,bool all,mtr_t * mtr)7095 static void btr_blob_free(buf_block_t *block, bool all, mtr_t *mtr)
7096 {
7097   const page_id_t page_id(block->page.id());
7098   ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
7099   mtr->commit();
7100 
7101   const ulint fold= page_id.fold();
7102 
7103   mysql_mutex_lock(&buf_pool.mutex);
7104 
7105   if (buf_page_t *bpage= buf_pool.page_hash_get_low(page_id, fold))
7106     if (!buf_LRU_free_page(bpage, all) && all && bpage->zip.data)
7107       /* Attempt to deallocate the redundant copy of the uncompressed page
7108       if the whole ROW_FORMAT=COMPRESSED block cannot be deallocted. */
7109       buf_LRU_free_page(bpage, false);
7110 
7111   mysql_mutex_unlock(&buf_pool.mutex);
7112 }
7113 
7114 /** Helper class used while writing blob pages, during insert or update. */
7115 struct btr_blob_log_check_t {
7116 	/** Persistent cursor on a clusterex index record with blobs. */
7117 	btr_pcur_t*	m_pcur;
7118 	/** Mini transaction holding the latches for m_pcur */
7119 	mtr_t*		m_mtr;
7120 	/** rec_get_offsets(rec, index); offset of clust_rec */
7121 	const rec_offs*	m_offsets;
7122 	/** The block containing clustered record */
7123 	buf_block_t**	m_block;
7124 	/** The clustered record pointer */
7125 	rec_t**		m_rec;
7126 	/** The blob operation code */
7127 	enum blob_op	m_op;
7128 
7129 	/** Constructor
7130 	@param[in]	pcur		persistent cursor on a clustered
7131 					index record with blobs.
7132 	@param[in]	mtr		mini-transaction holding latches for
7133 					pcur.
7134 	@param[in]	offsets		offsets of the clust_rec
7135 	@param[in,out]	block		record block containing pcur record
7136 	@param[in,out]	rec		the clustered record pointer
7137 	@param[in]	op		the blob operation code */
btr_blob_log_check_tbtr_blob_log_check_t7138 	btr_blob_log_check_t(
7139 		btr_pcur_t*	pcur,
7140 		mtr_t*		mtr,
7141 		const rec_offs*	offsets,
7142 		buf_block_t**	block,
7143 		rec_t**		rec,
7144 		enum blob_op	op)
7145 		: m_pcur(pcur),
7146 		  m_mtr(mtr),
7147 		  m_offsets(offsets),
7148 		  m_block(block),
7149 		  m_rec(rec),
7150 		  m_op(op)
7151 	{
7152 		ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
7153 		ut_ad((*m_block)->frame == page_align(*m_rec));
7154 		ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
7155 	}
7156 
7157 	/** Check if there is enough space in log file. Commit and re-start the
7158 	mini transaction. */
checkbtr_blob_log_check_t7159 	void check()
7160 	{
7161 		dict_index_t*	index = m_pcur->index();
7162 		ulint		offs = 0;
7163 		uint32_t	page_no = FIL_NULL;
7164 
7165 		if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) {
7166 			offs = page_offset(*m_rec);
7167 			page_no = (*m_block)->page.id().page_no();
7168 			buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__);
7169 			ut_ad(page_no != FIL_NULL);
7170 		} else {
7171 			btr_pcur_store_position(m_pcur, m_mtr);
7172 		}
7173 		m_mtr->commit();
7174 
7175 		DEBUG_SYNC_C("blob_write_middle");
7176 
7177 		log_free_check();
7178 
7179 		DEBUG_SYNC_C("blob_write_middle_after_check");
7180 
7181 		const mtr_log_t log_mode = m_mtr->get_log_mode();
7182 		m_mtr->start();
7183 		m_mtr->set_log_mode(log_mode);
7184 		index->set_modified(*m_mtr);
7185 
7186 		if (UNIV_UNLIKELY(page_no != FIL_NULL)) {
7187 			m_pcur->btr_cur.page_cur.block = btr_block_get(
7188 				*index, page_no, RW_X_LATCH, false, m_mtr);
7189 			m_pcur->btr_cur.page_cur.rec
7190 				= m_pcur->btr_cur.page_cur.block->frame
7191 				+ offs;
7192 
7193 			buf_block_buf_fix_dec(m_pcur->btr_cur.page_cur.block);
7194 		} else {
7195 			ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
7196 			bool ret = btr_pcur_restore_position(
7197 				BTR_MODIFY_LEAF | BTR_MODIFY_EXTERNAL,
7198 				m_pcur, m_mtr);
7199 
7200 			ut_a(ret);
7201 		}
7202 
7203 		*m_block	= btr_pcur_get_block(m_pcur);
7204 		*m_rec		= btr_pcur_get_rec(m_pcur);
7205 
7206 		rec_offs_make_valid(*m_rec, index, true,
7207 				    const_cast<rec_offs*>(m_offsets));
7208 
7209 		ut_ad(m_mtr->memo_contains_page_flagged(
7210 		      *m_rec,
7211 		      MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX));
7212 
7213 		ut_ad((m_op == BTR_STORE_INSERT_BULK)
7214 		      == !m_mtr->memo_contains_flagged(&index->lock,
7215 						       MTR_MEMO_SX_LOCK
7216 						       | MTR_MEMO_X_LOCK));
7217 	}
7218 };
7219 
7220 /*******************************************************************//**
7221 Stores the fields in big_rec_vec to the tablespace and puts pointers to
7222 them in rec.  The extern flags in rec will have to be set beforehand.
7223 The fields are stored on pages allocated from leaf node
7224 file segment of the index tree.
7225 
7226 TODO: If the allocation extends the tablespace, it will not be redo logged, in
7227 any mini-transaction.  Tablespace extension should be redo-logged, so that
7228 recovery will not fail when the big_rec was written to the extended portion of
7229 the file, in case the file was somehow truncated in the crash.
7230 
7231 @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
7232 dberr_t
btr_store_big_rec_extern_fields(btr_pcur_t * pcur,rec_offs * offsets,const big_rec_t * big_rec_vec,mtr_t * btr_mtr,enum blob_op op)7233 btr_store_big_rec_extern_fields(
7234 /*============================*/
7235 	btr_pcur_t*	pcur,		/*!< in/out: a persistent cursor. if
7236 					btr_mtr is restarted, then this can
7237 					be repositioned. */
7238 	rec_offs*	offsets,	/*!< in/out: rec_get_offsets() on
7239 					pcur. the "external storage" flags
7240 					in offsets will correctly correspond
7241 					to rec when this function returns */
7242 	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
7243 					to be stored externally */
7244 	mtr_t*		btr_mtr,	/*!< in/out: mtr containing the
7245 					latches to the clustered index. can be
7246 					committed and restarted. */
7247 	enum blob_op	op)		/*! in: operation code */
7248 {
7249 	byte*		field_ref;
7250 	ulint		extern_len;
7251 	ulint		store_len;
7252 	ulint		space_id;
7253 	ulint		i;
7254 	mtr_t		mtr;
7255 	mem_heap_t*	heap = NULL;
7256 	page_zip_des_t*	page_zip;
7257 	z_stream	c_stream;
7258 	dberr_t		error		= DB_SUCCESS;
7259 	dict_index_t*	index		= pcur->index();
7260 	buf_block_t*	rec_block	= btr_pcur_get_block(pcur);
7261 	rec_t*		rec		= btr_pcur_get_rec(pcur);
7262 
7263 	ut_ad(rec_offs_validate(rec, index, offsets));
7264 	ut_ad(rec_offs_any_extern(offsets));
7265 	ut_ad(op == BTR_STORE_INSERT_BULK
7266 	      || btr_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
7267 						| MTR_MEMO_SX_LOCK));
7268 	ut_ad(btr_mtr->memo_contains_flagged(rec_block, MTR_MEMO_PAGE_X_FIX));
7269 	ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
7270 	ut_a(dict_index_is_clust(index));
7271 
7272 	btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
7273 				      &rec, op);
7274 	page_zip = buf_block_get_page_zip(rec_block);
7275 	space_id = rec_block->page.id().space();
7276 	ut_a(fil_page_index_page_check(page_align(rec))
7277 	     || op == BTR_STORE_INSERT_BULK);
7278 
7279 	if (page_zip) {
7280 		int	err;
7281 
7282 		/* Zlib deflate needs 128 kilobytes for the default
7283 		window size, plus 512 << memLevel, plus a few
7284 		kilobytes for small objects.  We use reduced memLevel
7285 		to limit the memory consumption, and preallocate the
7286 		heap, hoping to avoid memory fragmentation. */
7287 		heap = mem_heap_create(250000);
7288 		page_zip_set_alloc(&c_stream, heap);
7289 
7290 		err = deflateInit2(&c_stream, int(page_zip_level),
7291 				   Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
7292 		ut_a(err == Z_OK);
7293 	}
7294 
7295 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
7296 	/* All pointers to externally stored columns in the record
7297 	must either be zero or they must be pointers to inherited
7298 	columns, owned by this record or an earlier record version. */
7299 	for (i = 0; i < big_rec_vec->n_fields; i++) {
7300 		field_ref = btr_rec_get_field_ref(
7301 			rec, offsets, big_rec_vec->fields[i].field_no);
7302 
7303 		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
7304 		/* Either this must be an update in place,
7305 		or the BLOB must be inherited, or the BLOB pointer
7306 		must be zero (will be written in this function). */
7307 		ut_a(op == BTR_STORE_UPDATE
7308 		     || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
7309 		     || !memcmp(field_ref, field_ref_zero,
7310 				BTR_EXTERN_FIELD_REF_SIZE));
7311 	}
7312 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
7313 
7314 	/* Space available in compressed page to carry blob data */
7315 	const ulint	payload_size_zip = rec_block->physical_size()
7316 		- FIL_PAGE_DATA;
7317 
7318 	/* Space available in uncompressed page to carry blob data */
7319 	const ulint	payload_size = payload_size_zip
7320 		- (BTR_BLOB_HDR_SIZE + FIL_PAGE_DATA_END);
7321 
7322 	/* We have to create a file segment to the tablespace
7323 	for each field and put the pointer to the field in rec */
7324 
7325 	for (i = 0; i < big_rec_vec->n_fields; i++) {
7326 		const ulint field_no = big_rec_vec->fields[i].field_no;
7327 
7328 		field_ref = btr_rec_get_field_ref(rec, offsets, field_no);
7329 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
7330 		/* A zero BLOB pointer should have been initially inserted. */
7331 		ut_a(!memcmp(field_ref, field_ref_zero,
7332 			     BTR_EXTERN_FIELD_REF_SIZE));
7333 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
7334 		extern_len = big_rec_vec->fields[i].len;
7335 		MEM_CHECK_DEFINED(big_rec_vec->fields[i].data, extern_len);
7336 		ut_a(extern_len > 0);
7337 
7338 		uint32_t prev_page_no = FIL_NULL;
7339 
7340 		if (page_zip) {
7341 			int	err = deflateReset(&c_stream);
7342 			ut_a(err == Z_OK);
7343 
7344 			c_stream.next_in = (Bytef*)
7345 				big_rec_vec->fields[i].data;
7346 			c_stream.avail_in = static_cast<uInt>(extern_len);
7347 		}
7348 
7349 		for (ulint blob_npages = 0;; ++blob_npages) {
7350 			buf_block_t*	block;
7351 			const ulint	commit_freq = 4;
7352 			uint32_t	r_extents;
7353 
7354 			ut_ad(page_align(field_ref) == page_align(rec));
7355 
7356 			if (!(blob_npages % commit_freq)) {
7357 
7358 				redo_log.check();
7359 
7360 				field_ref = btr_rec_get_field_ref(
7361 					rec, offsets, field_no);
7362 
7363 				page_zip = buf_block_get_page_zip(rec_block);
7364 			}
7365 
7366 			mtr.start();
7367 			index->set_modified(mtr);
7368 			mtr.set_log_mode(btr_mtr->get_log_mode());
7369 
7370 			buf_page_get(rec_block->page.id(),
7371 				     rec_block->zip_size(), RW_X_LATCH, &mtr);
7372 
7373 			uint32_t hint_prev = prev_page_no;
7374 			if (hint_prev == FIL_NULL) {
7375 				hint_prev = rec_block->page.id().page_no();
7376 			}
7377 
7378 			if (!fsp_reserve_free_extents(&r_extents,
7379 						      index->table->space, 1,
7380 						      FSP_BLOB, &mtr, 1)) {
7381 				mtr.commit();
7382 				error = DB_OUT_OF_FILE_SPACE;
7383 				goto func_exit;
7384 			}
7385 
7386 			block = btr_page_alloc(index, hint_prev + 1,
7387 					       FSP_NO_DIR, 0, &mtr, &mtr);
7388 
7389 			index->table->space->release_free_extents(r_extents);
7390 
7391 			ut_a(block != NULL);
7392 
7393 			const uint32_t page_no = block->page.id().page_no();
7394 
7395 			if (prev_page_no != FIL_NULL) {
7396 				buf_block_t*	prev_block;
7397 
7398 				prev_block = buf_page_get(
7399 					page_id_t(space_id, prev_page_no),
7400 					rec_block->zip_size(),
7401 					RW_X_LATCH, &mtr);
7402 
7403 				buf_block_dbg_add_level(prev_block,
7404 							SYNC_EXTERN_STORAGE);
7405 
7406 				if (page_zip) {
7407 					mtr.write<4>(*prev_block,
7408 						     prev_block->frame
7409 						     + FIL_PAGE_NEXT,
7410 						     page_no);
7411 					memcpy_aligned<4>(
7412 						buf_block_get_page_zip(
7413 							prev_block)
7414 						->data + FIL_PAGE_NEXT,
7415 						prev_block->frame
7416 						+ FIL_PAGE_NEXT, 4);
7417 				} else {
7418 					mtr.write<4>(*prev_block,
7419 						     BTR_BLOB_HDR_NEXT_PAGE_NO
7420 						     + FIL_PAGE_DATA
7421 						     + prev_block->frame,
7422 						     page_no);
7423 				}
7424 			} else if (dict_index_is_online_ddl(index)) {
7425 				row_log_table_blob_alloc(index, page_no);
7426 			}
7427 
7428 			ut_ad(!page_has_siblings(block->frame));
7429 			ut_ad(!fil_page_get_type(block->frame));
7430 
7431 			if (page_zip) {
7432 				int		err;
7433 				page_zip_des_t*	blob_page_zip;
7434 
7435 				mtr.write<1>(*block,
7436 					     FIL_PAGE_TYPE + 1 + block->frame,
7437 					     prev_page_no == FIL_NULL
7438 					     ? FIL_PAGE_TYPE_ZBLOB
7439 					     : FIL_PAGE_TYPE_ZBLOB2);
7440 				block->page.zip.data[FIL_PAGE_TYPE + 1]
7441 					= block->frame[FIL_PAGE_TYPE + 1];
7442 
7443 				c_stream.next_out = block->frame
7444 					+ FIL_PAGE_DATA;
7445 				c_stream.avail_out = static_cast<uInt>(
7446 					payload_size_zip);
7447 
7448 				err = deflate(&c_stream, Z_FINISH);
7449 				ut_a(err == Z_OK || err == Z_STREAM_END);
7450 				ut_a(err == Z_STREAM_END
7451 				     || c_stream.avail_out == 0);
7452 
7453 				mtr.memcpy(*block,
7454 					   FIL_PAGE_DATA,
7455 					   page_zip_get_size(page_zip)
7456 					   - FIL_PAGE_DATA
7457 					   - c_stream.avail_out);
7458 				/* Copy the page to compressed storage,
7459 				because it will be flushed to disk
7460 				from there. */
7461 				blob_page_zip = buf_block_get_page_zip(block);
7462 				ut_ad(blob_page_zip);
7463 				ut_ad(page_zip_get_size(blob_page_zip)
7464 				      == page_zip_get_size(page_zip));
7465 				memcpy(blob_page_zip->data, block->frame,
7466 				       page_zip_get_size(page_zip));
7467 
7468 				if (err == Z_OK && prev_page_no != FIL_NULL) {
7469 
7470 					goto next_zip_page;
7471 				}
7472 
7473 				if (err == Z_STREAM_END) {
7474 					mach_write_to_4(field_ref
7475 							+ BTR_EXTERN_LEN, 0);
7476 					mach_write_to_4(field_ref
7477 							+ BTR_EXTERN_LEN + 4,
7478 							c_stream.total_in);
7479 				} else {
7480 					memset(field_ref + BTR_EXTERN_LEN,
7481 					       0, 8);
7482 				}
7483 
7484 				if (prev_page_no == FIL_NULL) {
7485 					ut_ad(blob_npages == 0);
7486 					mach_write_to_4(field_ref
7487 							+ BTR_EXTERN_SPACE_ID,
7488 							space_id);
7489 
7490 					mach_write_to_4(field_ref
7491 							+ BTR_EXTERN_PAGE_NO,
7492 							page_no);
7493 
7494 					mach_write_to_4(field_ref
7495 							+ BTR_EXTERN_OFFSET,
7496 							FIL_PAGE_NEXT);
7497 				}
7498 
7499 				/* We compress a page when finish bulk insert.*/
7500 				if (UNIV_LIKELY(op != BTR_STORE_INSERT_BULK)) {
7501 					page_zip_write_blob_ptr(
7502 						rec_block, rec, index, offsets,
7503 						field_no, &mtr);
7504 				}
7505 
7506 next_zip_page:
7507 				prev_page_no = page_no;
7508 
7509 				/* Commit mtr and release the
7510 				uncompressed page frame to save memory. */
7511 				btr_blob_free(block, FALSE, &mtr);
7512 
7513 				if (err == Z_STREAM_END) {
7514 					break;
7515 				}
7516 			} else {
7517 				mtr.write<1>(*block, FIL_PAGE_TYPE + 1
7518 					     + block->frame,
7519 					     FIL_PAGE_TYPE_BLOB);
7520 
7521 				if (extern_len > payload_size) {
7522 					store_len = payload_size;
7523 				} else {
7524 					store_len = extern_len;
7525 				}
7526 
7527 				mtr.memcpy<mtr_t::MAYBE_NOP>(
7528 					*block,
7529 					FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE
7530 					+ block->frame,
7531 					static_cast<const byte*>
7532 					(big_rec_vec->fields[i].data)
7533 					+ big_rec_vec->fields[i].len
7534 					- extern_len, store_len);
7535 				mtr.write<4>(*block, BTR_BLOB_HDR_PART_LEN
7536 					     + FIL_PAGE_DATA + block->frame,
7537 					     store_len);
7538 				compile_time_assert(FIL_NULL == 0xffffffff);
7539 				mtr.memset(block, BTR_BLOB_HDR_NEXT_PAGE_NO
7540 					   + FIL_PAGE_DATA, 4, 0xff);
7541 
7542 				extern_len -= store_len;
7543 
7544 				ut_ad(!mach_read_from_4(BTR_EXTERN_LEN
7545 							+ field_ref));
7546 				mtr.write<4>(*rec_block,
7547 					     BTR_EXTERN_LEN + 4 + field_ref,
7548 					     big_rec_vec->fields[i].len
7549 					     - extern_len);
7550 
7551 				if (prev_page_no == FIL_NULL) {
7552 					ut_ad(blob_npages == 0);
7553 					mtr.write<4,mtr_t::MAYBE_NOP>(
7554 						*rec_block,
7555 						field_ref + BTR_EXTERN_SPACE_ID,
7556 						space_id);
7557 
7558 					mtr.write<4>(*rec_block, field_ref
7559 						     + BTR_EXTERN_PAGE_NO,
7560 						     page_no);
7561 
7562 					mtr.write<4>(*rec_block, field_ref
7563 						     + BTR_EXTERN_OFFSET,
7564 						     FIL_PAGE_DATA);
7565 				}
7566 
7567 				prev_page_no = page_no;
7568 
7569 				mtr.commit();
7570 
7571 				if (extern_len == 0) {
7572 					break;
7573 				}
7574 			}
7575 		}
7576 
7577 		DBUG_EXECUTE_IF("btr_store_big_rec_extern",
7578 				error = DB_OUT_OF_FILE_SPACE;
7579 				goto func_exit;);
7580 
7581 		rec_offs_make_nth_extern(offsets, field_no);
7582 	}
7583 
7584 func_exit:
7585 	if (page_zip) {
7586 		deflateEnd(&c_stream);
7587 	}
7588 
7589 	if (heap != NULL) {
7590 		mem_heap_free(heap);
7591 	}
7592 
7593 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
7594 	/* All pointers to externally stored columns in the record
7595 	must be valid. */
7596 	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
7597 		if (!rec_offs_nth_extern(offsets, i)) {
7598 			continue;
7599 		}
7600 
7601 		field_ref = btr_rec_get_field_ref(rec, offsets, i);
7602 
7603 		/* The pointer must not be zero if the operation
7604 		succeeded. */
7605 		ut_a(0 != memcmp(field_ref, field_ref_zero,
7606 				 BTR_EXTERN_FIELD_REF_SIZE)
7607 		     || error != DB_SUCCESS);
7608 		/* The column must not be disowned by this record. */
7609 		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
7610 	}
7611 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
7612 	return(error);
7613 }
7614 
7615 /** Check the FIL_PAGE_TYPE on an uncompressed BLOB page.
7616 @param[in]      block   uncompressed BLOB page
7617 @param[in]      read    true=read, false=purge */
btr_check_blob_fil_page_type(const buf_block_t & block,bool read)7618 static void btr_check_blob_fil_page_type(const buf_block_t& block, bool read)
7619 {
7620   uint16_t type= fil_page_get_type(block.frame);
7621 
7622   if (UNIV_LIKELY(type == FIL_PAGE_TYPE_BLOB))
7623     return;
7624   /* FIXME: take the tablespace as a parameter */
7625   if (fil_space_t *space= fil_space_t::get(block.page.id().space()))
7626   {
7627     /* Old versions of InnoDB did not initialize FIL_PAGE_TYPE on BLOB
7628     pages.  Do not print anything about the type mismatch when reading
7629     a BLOB page that may be from old versions. */
7630     if (space->full_crc32() || DICT_TF_HAS_ATOMIC_BLOBS(space->flags))
7631     {
7632       ib::fatal() << "FIL_PAGE_TYPE=" << type
7633 		  << (read ? " on BLOB read file " : " on BLOB purge file ")
7634 		  << space->chain.start->name
7635 		  << " page " << block.page.id().page_no();
7636     }
7637     space->release();
7638   }
7639 }
7640 
7641 /*******************************************************************//**
7642 Frees the space in an externally stored field to the file space
7643 management if the field in data is owned by the externally stored field,
7644 in a rollback we may have the additional condition that the field must
7645 not be inherited. */
7646 void
btr_free_externally_stored_field(dict_index_t * index,byte * field_ref,const rec_t * rec,const rec_offs * offsets,buf_block_t * block,ulint i,bool rollback,mtr_t * local_mtr)7647 btr_free_externally_stored_field(
7648 /*=============================*/
7649 	dict_index_t*	index,		/*!< in: index of the data, the index
7650 					tree MUST be X-latched; if the tree
7651 					height is 1, then also the root page
7652 					must be X-latched! (this is relevant
7653 					in the case this function is called
7654 					from purge where 'data' is located on
7655 					an undo log page, not an index
7656 					page) */
7657 	byte*		field_ref,	/*!< in/out: field reference */
7658 	const rec_t*	rec,		/*!< in: record containing field_ref, for
7659 					page_zip_write_blob_ptr(), or NULL */
7660 	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index),
7661 					or NULL */
7662 	buf_block_t*	block,		/*!< in/out: page of field_ref */
7663 	ulint		i,		/*!< in: field number of field_ref;
7664 					ignored if rec == NULL */
7665 	bool		rollback,	/*!< in: performing rollback? */
7666 	mtr_t*		local_mtr)	/*!< in: mtr
7667 					containing the latch to data an an
7668 					X-latch to the index tree */
7669 {
7670 	page_t*		page;
7671 	const uint32_t	space_id	= mach_read_from_4(
7672 		field_ref + BTR_EXTERN_SPACE_ID);
7673 	const uint32_t	start_page	= mach_read_from_4(
7674 		field_ref + BTR_EXTERN_PAGE_NO);
7675 	uint32_t	page_no;
7676 	uint32_t	next_page_no;
7677 	mtr_t		mtr;
7678 
7679 	ut_ad(index->is_primary());
7680 	ut_ad(local_mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK
7681 					       | MTR_MEMO_SX_LOCK));
7682 	ut_ad(local_mtr->memo_contains_page_flagged(field_ref,
7683 						    MTR_MEMO_PAGE_X_FIX));
7684 	ut_ad(!rec || rec_offs_validate(rec, index, offsets));
7685 	ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
7686 	ut_ad(local_mtr->is_named_space(
7687 		      page_get_space_id(page_align(field_ref))));
7688 
7689 	if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
7690 				  BTR_EXTERN_FIELD_REF_SIZE))) {
7691 		/* In the rollback, we may encounter a clustered index
7692 		record with some unwritten off-page columns. There is
7693 		nothing to free then. */
7694 		ut_a(rollback);
7695 		return;
7696 	}
7697 
7698 	ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN)
7699 	        & ~((BTR_EXTERN_OWNER_FLAG
7700 	             | BTR_EXTERN_INHERITED_FLAG) << 24)));
7701 	ut_ad(space_id == index->table->space->id);
7702 	ut_ad(space_id == index->table->space_id);
7703 
7704 	const ulint ext_zip_size = index->table->space->zip_size();
7705 	const ulint rec_zip_size = rec ? ext_zip_size : 0;
7706 
7707 	/* !rec holds in a call from purge when field_ref is in an undo page */
7708 	ut_ad(rec || !block->page.zip.data);
7709 
7710 	for (;;) {
7711 #ifdef UNIV_DEBUG
7712 		buf_block_t*	rec_block;
7713 #endif /* UNIV_DEBUG */
7714 		buf_block_t*	ext_block;
7715 
7716 		mtr_start(&mtr);
7717 		mtr.set_spaces(*local_mtr);
7718 		mtr.set_log_mode(local_mtr->get_log_mode());
7719 
7720 		ut_ad(!index->table->is_temporary()
7721 		      || local_mtr->get_log_mode() == MTR_LOG_NO_REDO);
7722 
7723 		const page_t*	p = page_align(field_ref);
7724 
7725 		const page_id_t	page_id(page_get_space_id(p),
7726 					page_get_page_no(p));
7727 
7728 #ifdef UNIV_DEBUG
7729 		rec_block =
7730 #endif /* UNIV_DEBUG */
7731 		buf_page_get(page_id, rec_zip_size, RW_X_LATCH, &mtr);
7732 
7733 		buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
7734 		page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
7735 
7736 		if (/* There is no external storage data */
7737 		    page_no == FIL_NULL
7738 		    /* This field does not own the externally stored field */
7739 		    || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
7740 			& BTR_EXTERN_OWNER_FLAG)
7741 		    /* Rollback and inherited field */
7742 		    || (rollback
7743 			&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
7744 			    & BTR_EXTERN_INHERITED_FLAG))) {
7745 
7746 			/* Do not free */
7747 			mtr_commit(&mtr);
7748 
7749 			return;
7750 		}
7751 
7752 		if (page_no == start_page && dict_index_is_online_ddl(index)) {
7753 			row_log_table_blob_free(index, start_page);
7754 		}
7755 
7756 		ext_block = buf_page_get(
7757 			page_id_t(space_id, page_no), ext_zip_size,
7758 			RW_X_LATCH, &mtr);
7759 
7760 		buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
7761 		page = buf_block_get_frame(ext_block);
7762 
7763 		if (ext_zip_size) {
7764 			/* Note that page_zip will be NULL
7765 			in row_purge_upd_exist_or_extern(). */
7766 			switch (fil_page_get_type(page)) {
7767 			case FIL_PAGE_TYPE_ZBLOB:
7768 			case FIL_PAGE_TYPE_ZBLOB2:
7769 				break;
7770 			default:
7771 				ut_error;
7772 			}
7773 			next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
7774 
7775 			btr_page_free(index, ext_block, &mtr, true);
7776 
7777 			if (UNIV_LIKELY_NULL(block->page.zip.data)) {
7778 				mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
7779 						next_page_no);
7780 				memset(field_ref + BTR_EXTERN_LEN + 4, 0, 4);
7781 				page_zip_write_blob_ptr(block, rec, index,
7782 							offsets, i, &mtr);
7783 			} else {
7784 				mtr.write<4>(*block,
7785 					     BTR_EXTERN_PAGE_NO + field_ref,
7786 					     next_page_no);
7787 				mtr.write<4,mtr_t::MAYBE_NOP>(*block,
7788 							      BTR_EXTERN_LEN
7789 							      + 4 + field_ref,
7790 							      0U);
7791 			}
7792 		} else {
7793 			ut_ad(!block->page.zip.data);
7794 			btr_check_blob_fil_page_type(*ext_block, false);
7795 
7796 			next_page_no = mach_read_from_4(
7797 				page + FIL_PAGE_DATA
7798 				+ BTR_BLOB_HDR_NEXT_PAGE_NO);
7799 			btr_page_free(index, ext_block, &mtr, true);
7800 
7801 			mtr.write<4>(*block, BTR_EXTERN_PAGE_NO + field_ref,
7802 				     next_page_no);
7803 			/* Zero out the BLOB length.  If the server
7804 			crashes during the execution of this function,
7805 			trx_rollback_all_recovered() could
7806 			dereference the half-deleted BLOB, fetching a
7807 			wrong prefix for the BLOB. */
7808 			mtr.write<4,mtr_t::MAYBE_NOP>(*block,
7809 						      BTR_EXTERN_LEN + 4
7810 						      + field_ref, 0U);
7811 		}
7812 
7813 		/* Commit mtr and release the BLOB block to save memory. */
7814 		btr_blob_free(ext_block, TRUE, &mtr);
7815 	}
7816 }
7817 
7818 /***********************************************************//**
7819 Frees the externally stored fields for a record. */
7820 static
7821 void
btr_rec_free_externally_stored_fields(dict_index_t * index,rec_t * rec,const rec_offs * offsets,buf_block_t * block,bool rollback,mtr_t * mtr)7822 btr_rec_free_externally_stored_fields(
7823 /*==================================*/
7824 	dict_index_t*	index,	/*!< in: index of the data, the index
7825 				tree MUST be X-latched */
7826 	rec_t*		rec,	/*!< in/out: record */
7827 	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
7828 	buf_block_t*	block,	/*!< in: index page of rec */
7829 	bool		rollback,/*!< in: performing rollback? */
7830 	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
7831 				an X-latch to record page and to the index
7832 				tree */
7833 {
7834 	ulint	n_fields;
7835 	ulint	i;
7836 
7837 	ut_ad(rec_offs_validate(rec, index, offsets));
7838 	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX));
7839 	ut_ad(index->is_primary());
7840 	ut_ad(page_rec_is_leaf(rec));
7841 	/* Free possible externally stored fields in the record */
7842 
7843 	ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
7844 	n_fields = rec_offs_n_fields(offsets);
7845 
7846 	for (i = 0; i < n_fields; i++) {
7847 		if (rec_offs_nth_extern(offsets, i)) {
7848 			btr_free_externally_stored_field(
7849 				index, btr_rec_get_field_ref(rec, offsets, i),
7850 				rec, offsets, block, i, rollback, mtr);
7851 		}
7852 	}
7853 }
7854 
7855 /***********************************************************//**
7856 Frees the externally stored fields for a record, if the field is mentioned
7857 in the update vector. */
7858 static
7859 void
btr_rec_free_updated_extern_fields(dict_index_t * index,rec_t * rec,buf_block_t * block,const rec_offs * offsets,const upd_t * update,bool rollback,mtr_t * mtr)7860 btr_rec_free_updated_extern_fields(
7861 /*===============================*/
7862 	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
7863 				X-latched */
7864 	rec_t*		rec,	/*!< in/out: record */
7865 	buf_block_t*	block,	/*!< in: index page of rec */
7866 	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
7867 	const upd_t*	update,	/*!< in: update vector */
7868 	bool		rollback,/*!< in: performing rollback? */
7869 	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
7870 				an X-latch to record page and to the tree */
7871 {
7872 	ulint	n_fields;
7873 	ulint	i;
7874 
7875 	ut_ad(rec_offs_validate(rec, index, offsets));
7876 	ut_ad(mtr->memo_contains_page_flagged(rec, MTR_MEMO_PAGE_X_FIX));
7877 
7878 	/* Free possible externally stored fields in the record */
7879 
7880 	n_fields = upd_get_n_fields(update);
7881 
7882 	for (i = 0; i < n_fields; i++) {
7883 		const upd_field_t* ufield = upd_get_nth_field(update, i);
7884 
7885 		if (rec_offs_nth_extern(offsets, ufield->field_no)) {
7886 			ulint	len;
7887 			byte*	data = rec_get_nth_field(
7888 				rec, offsets, ufield->field_no, &len);
7889 			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
7890 
7891 			btr_free_externally_stored_field(
7892 				index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
7893 				rec, offsets, block,
7894 				ufield->field_no, rollback, mtr);
7895 		}
7896 	}
7897 }
7898 
7899 /*******************************************************************//**
7900 Copies the prefix of an uncompressed BLOB.  The clustered index record
7901 that points to this BLOB must be protected by a lock or a page latch.
7902 @return number of bytes written to buf */
7903 static
7904 ulint
btr_copy_blob_prefix(byte * buf,uint32_t len,page_id_t id,uint32_t offset)7905 btr_copy_blob_prefix(
7906 /*=================*/
7907 	byte*		buf,	/*!< out: the externally stored part of
7908 				the field, or a prefix of it */
7909 	uint32_t	len,	/*!< in: length of buf, in bytes */
7910 	page_id_t	id,	/*!< in: page identifier of the first BLOB page */
7911 	uint32_t	offset)	/*!< in: offset on the first BLOB page */
7912 {
7913 	ulint	copied_len	= 0;
7914 
7915 	for (;;) {
7916 		mtr_t		mtr;
7917 		buf_block_t*	block;
7918 		const page_t*	page;
7919 		const byte*	blob_header;
7920 		ulint		part_len;
7921 		ulint		copy_len;
7922 
7923 		mtr_start(&mtr);
7924 
7925 		block = buf_page_get(id, 0, RW_S_LATCH, &mtr);
7926 		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
7927 		page = buf_block_get_frame(block);
7928 
7929 		btr_check_blob_fil_page_type(*block, true);
7930 
7931 		blob_header = page + offset;
7932 		part_len = btr_blob_get_part_len(blob_header);
7933 		copy_len = ut_min(part_len, len - copied_len);
7934 
7935 		memcpy(buf + copied_len,
7936 		       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
7937 		copied_len += copy_len;
7938 
7939 		id.set_page_no(btr_blob_get_next_page_no(blob_header));
7940 
7941 		mtr_commit(&mtr);
7942 
7943 		if (id.page_no() == FIL_NULL || copy_len != part_len) {
7944 			MEM_CHECK_DEFINED(buf, copied_len);
7945 			return(copied_len);
7946 		}
7947 
7948 		/* On other BLOB pages except the first the BLOB header
7949 		always is at the page data start: */
7950 
7951 		offset = FIL_PAGE_DATA;
7952 
7953 		ut_ad(copied_len <= len);
7954 	}
7955 }
7956 
7957 /** Copies the prefix of a compressed BLOB.
7958 The clustered index record that points to this BLOB must be protected
7959 by a lock or a page latch.
7960 @param[out]	buf		the externally stored part of the field,
7961 or a prefix of it
7962 @param[in]	len		length of buf, in bytes
7963 @param[in]	zip_size	ROW_FORMAT=COMPRESSED page size
7964 @param[in]	id		page identifier of the BLOB pages
7965 @return number of bytes written to buf */
7966 static
7967 ulint
btr_copy_zblob_prefix(byte * buf,uint32_t len,ulint zip_size,page_id_t id,uint32_t offset)7968 btr_copy_zblob_prefix(
7969 	byte*			buf,
7970 	uint32_t		len,
7971 	ulint			zip_size,
7972 	page_id_t		id,
7973 	uint32_t		offset)
7974 {
7975 	ulint		page_type = FIL_PAGE_TYPE_ZBLOB;
7976 	mem_heap_t*	heap;
7977 	int		err;
7978 	z_stream	d_stream;
7979 
7980 	d_stream.next_out = buf;
7981 	d_stream.avail_out = static_cast<uInt>(len);
7982 	d_stream.next_in = Z_NULL;
7983 	d_stream.avail_in = 0;
7984 
7985 	/* Zlib inflate needs 32 kilobytes for the default
7986 	window size, plus a few kilobytes for small objects. */
7987 	heap = mem_heap_create(40000);
7988 	page_zip_set_alloc(&d_stream, heap);
7989 
7990 	ut_ad(zip_size);
7991 	ut_ad(ut_is_2pow(zip_size));
7992 	ut_ad(id.space());
7993 
7994 	err = inflateInit(&d_stream);
7995 	ut_a(err == Z_OK);
7996 
7997 	for (;;) {
7998 		buf_page_t*	bpage;
7999 		uint32_t	next_page_no;
8000 
8001 		/* There is no latch on bpage directly.  Instead,
8002 		bpage is protected by the B-tree page latch that
8003 		is being held on the clustered index record, or,
8004 		in row_merge_copy_blobs(), by an exclusive table lock. */
8005 		bpage = buf_page_get_zip(id, zip_size);
8006 
8007 		if (UNIV_UNLIKELY(!bpage)) {
8008 			ib::error() << "Cannot load compressed BLOB " << id;
8009 			goto func_exit;
8010 		}
8011 
8012 		if (UNIV_UNLIKELY
8013 		    (fil_page_get_type(bpage->zip.data) != page_type)) {
8014 
8015 			ib::error() << "Unexpected type "
8016 				<< fil_page_get_type(bpage->zip.data)
8017 				<< " of compressed BLOB page " << id;
8018 
8019 			ut_ad(0);
8020 			goto end_of_blob;
8021 		}
8022 
8023 		next_page_no = mach_read_from_4(bpage->zip.data + offset);
8024 
8025 		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
8026 			/* When the BLOB begins at page header,
8027 			the compressed data payload does not
8028 			immediately follow the next page pointer. */
8029 			offset = FIL_PAGE_DATA;
8030 		} else {
8031 			offset += 4;
8032 		}
8033 
8034 		d_stream.next_in = bpage->zip.data + offset;
8035 		d_stream.avail_in = uInt(zip_size - offset);
8036 
8037 		err = inflate(&d_stream, Z_NO_FLUSH);
8038 		switch (err) {
8039 		case Z_OK:
8040 			if (!d_stream.avail_out) {
8041 				goto end_of_blob;
8042 			}
8043 			break;
8044 		case Z_STREAM_END:
8045 			if (next_page_no == FIL_NULL) {
8046 				goto end_of_blob;
8047 			}
8048 			/* fall through */
8049 		default:
8050 inflate_error:
8051 			ib::error() << "inflate() of compressed BLOB page "
8052 				<< id
8053 				<< " returned " << err
8054 				<< " (" << d_stream.msg << ")";
8055 
8056 		case Z_BUF_ERROR:
8057 			goto end_of_blob;
8058 		}
8059 
8060 		if (next_page_no == FIL_NULL) {
8061 			if (!d_stream.avail_in) {
8062 				ib::error()
8063 					<< "Unexpected end of compressed "
8064 					<< "BLOB page " << id;
8065 			} else {
8066 				err = inflate(&d_stream, Z_FINISH);
8067 				switch (err) {
8068 				case Z_STREAM_END:
8069 				case Z_BUF_ERROR:
8070 					break;
8071 				default:
8072 					goto inflate_error;
8073 				}
8074 			}
8075 
8076 end_of_blob:
8077 			buf_page_release_zip(bpage);
8078 			goto func_exit;
8079 		}
8080 
8081 		buf_page_release_zip(bpage);
8082 
8083 		/* On other BLOB pages except the first
8084 		the BLOB header always is at the page header: */
8085 
8086 		id.set_page_no(next_page_no);
8087 		offset = FIL_PAGE_NEXT;
8088 		page_type = FIL_PAGE_TYPE_ZBLOB2;
8089 	}
8090 
8091 func_exit:
8092 	inflateEnd(&d_stream);
8093 	mem_heap_free(heap);
8094 	MEM_CHECK_DEFINED(buf, d_stream.total_out);
8095 	return(d_stream.total_out);
8096 }
8097 
8098 /** Copies the prefix of an externally stored field of a record.
8099 The clustered index record that points to this BLOB must be protected
8100 by a lock or a page latch.
8101 @param[out]	buf		the externally stored part of the
8102 field, or a prefix of it
8103 @param[in]	len		length of buf, in bytes
8104 @param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
8105 @param[in]	id		page identifier of the first BLOB page
8106 @param[in]	offset		offset on the first BLOB page
8107 @return number of bytes written to buf */
8108 static
8109 ulint
btr_copy_externally_stored_field_prefix_low(byte * buf,uint32_t len,ulint zip_size,page_id_t id,uint32_t offset)8110 btr_copy_externally_stored_field_prefix_low(
8111 	byte*			buf,
8112 	uint32_t		len,
8113 	ulint			zip_size,
8114 	page_id_t		id,
8115 	uint32_t		offset)
8116 {
8117   if (len == 0)
8118     return 0;
8119 
8120   return zip_size
8121     ? btr_copy_zblob_prefix(buf, len, zip_size, id, offset)
8122     : btr_copy_blob_prefix(buf, len, id, offset);
8123 }
8124 
8125 /** Copies the prefix of an externally stored field of a record.
8126 The clustered index record must be protected by a lock or a page latch.
8127 @param[out]	buf		the field, or a prefix of it
8128 @param[in]	len		length of buf, in bytes
8129 @param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
8130 @param[in]	data		'internally' stored part of the field
8131 containing also the reference to the external part; must be protected by
8132 a lock or a page latch
8133 @param[in]	local_len	length of data, in bytes
8134 @return the length of the copied field, or 0 if the column was being
8135 or has been deleted */
8136 ulint
btr_copy_externally_stored_field_prefix(byte * buf,ulint len,ulint zip_size,const byte * data,ulint local_len)8137 btr_copy_externally_stored_field_prefix(
8138 	byte*			buf,
8139 	ulint			len,
8140 	ulint			zip_size,
8141 	const byte*		data,
8142 	ulint			local_len)
8143 {
8144 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
8145 
8146 	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
8147 
8148 	if (UNIV_UNLIKELY(local_len >= len)) {
8149 		memcpy(buf, data, len);
8150 		return(len);
8151 	}
8152 
8153 	memcpy(buf, data, local_len);
8154 	data += local_len;
8155 
8156 	ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
8157 
8158 	if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
8159 		/* The externally stored part of the column has been
8160 		(partially) deleted.  Signal the half-deleted BLOB
8161 		to the caller. */
8162 
8163 		return(0);
8164 	}
8165 
8166 	uint32_t space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
8167 	uint32_t page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
8168 	uint32_t offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
8169 	len -= local_len;
8170 
8171 	return(local_len
8172 	       + btr_copy_externally_stored_field_prefix_low(buf + local_len,
8173 							     uint32_t(len),
8174 							     zip_size,
8175 							     page_id_t(
8176 								     space_id,
8177 								     page_no),
8178 							     offset));
8179 }
8180 
8181 /** Copies an externally stored field of a record to mem heap.
8182 The clustered index record must be protected by a lock or a page latch.
8183 @param[out]	len		length of the whole field
8184 @param[in]	data		'internally' stored part of the field
8185 containing also the reference to the external part; must be protected by
8186 a lock or a page latch
8187 @param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
8188 @param[in]	local_len	length of data
8189 @param[in,out]	heap		mem heap
8190 @return the whole field copied to heap */
8191 byte*
btr_copy_externally_stored_field(ulint * len,const byte * data,ulint zip_size,ulint local_len,mem_heap_t * heap)8192 btr_copy_externally_stored_field(
8193 	ulint*			len,
8194 	const byte*		data,
8195 	ulint			zip_size,
8196 	ulint			local_len,
8197 	mem_heap_t*		heap)
8198 {
8199 	byte*	buf;
8200 
8201 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
8202 
8203 	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
8204 
8205 	uint32_t space_id = mach_read_from_4(data + local_len
8206 					     + BTR_EXTERN_SPACE_ID);
8207 	uint32_t page_no = mach_read_from_4(data + local_len
8208 					    + BTR_EXTERN_PAGE_NO);
8209 	uint32_t offset = mach_read_from_4(data + local_len
8210 					   + BTR_EXTERN_OFFSET);
8211 
8212 	/* Currently a BLOB cannot be bigger than 4 GB; we
8213 	leave the 4 upper bytes in the length field unused */
8214 
8215 	uint32_t extern_len = mach_read_from_4(data + local_len
8216 					       + BTR_EXTERN_LEN + 4);
8217 
8218 	buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);
8219 
8220 	memcpy(buf, data, local_len);
8221 	*len = local_len
8222 		+ btr_copy_externally_stored_field_prefix_low(buf + local_len,
8223 							      extern_len,
8224 							      zip_size,
8225 							      page_id_t(
8226 								      space_id,
8227 								      page_no),
8228 							      offset);
8229 
8230 	return(buf);
8231 }
8232 
8233 /** Copies an externally stored field of a record to mem heap.
8234 @param[in]	rec		record in a clustered index; must be
8235 protected by a lock or a page latch
8236 @param[in]	offset		array returned by rec_get_offsets()
8237 @param[in]	zip_size	ROW_FORMAT=COMPRESSED page size, or 0
8238 @param[in]	no		field number
8239 @param[out]	len		length of the field
8240 @param[in,out]	heap		mem heap
8241 @return the field copied to heap, or NULL if the field is incomplete */
8242 byte*
btr_rec_copy_externally_stored_field(const rec_t * rec,const rec_offs * offsets,ulint zip_size,ulint no,ulint * len,mem_heap_t * heap)8243 btr_rec_copy_externally_stored_field(
8244 	const rec_t*		rec,
8245 	const rec_offs*		offsets,
8246 	ulint			zip_size,
8247 	ulint			no,
8248 	ulint*			len,
8249 	mem_heap_t*		heap)
8250 {
8251 	ulint		local_len;
8252 	const byte*	data;
8253 
8254 	ut_a(rec_offs_nth_extern(offsets, no));
8255 
8256 	/* An externally stored field can contain some initial
8257 	data from the field, and in the last 20 bytes it has the
8258 	space id, page number, and offset where the rest of the
8259 	field data is stored, and the data length in addition to
8260 	the data stored locally. We may need to store some data
8261 	locally to get the local record length above the 128 byte
8262 	limit so that field offsets are stored in two bytes, and
8263 	the extern bit is available in those two bytes. */
8264 
8265 	data = rec_get_nth_field(rec, offsets, no, &local_len);
8266 
8267 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
8268 
8269 	if (UNIV_UNLIKELY
8270 	    (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
8271 		     field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
8272 		/* The externally stored field was not written yet.
8273 		This record should only be seen by
8274 		trx_rollback_recovered() or any
8275 		TRX_ISO_READ_UNCOMMITTED transactions. */
8276 		return(NULL);
8277 	}
8278 
8279 	return(btr_copy_externally_stored_field(len, data,
8280 						zip_size, local_len, heap));
8281 }
8282