1 /*****************************************************************************
2 
3 Copyright (c) 1994, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2008, Google Inc.
5 Copyright (c) 2012, Facebook Inc.
6 
7 Portions of this file contain modifications contributed and copyrighted by
8 Google, Inc. Those modifications are gratefully acknowledged and are described
9 briefly in the InnoDB documentation. The contributions by Google are
10 incorporated with their permission, and subject to the conditions contained in
11 the file COPYING.Google.
12 
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16 
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation.  The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23 
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27 GNU General Public License, version 2.0, for more details.
28 
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32 
33 *****************************************************************************/
34 
35 /**************************************************//**
36 @file btr/btr0cur.cc
37 The index tree cursor
38 
All changes that row operations make to a B-tree or the records
there must go through this module! Undo log records are written here
for every modify or insert of a clustered index record.

			NOTE!!!
To make sure we do not run out of disk space during a pessimistic
insert or update, we have to reserve, before starting the operation,
2 x the height of the index tree in pages in the tablespace, because
if leaf splitting has been started, it is difficult to undo, except
by crashing the database and doing a roll-forward.
49 
50 Created 10/16/1994 Heikki Tuuri
51 *******************************************************/
52 
53 #include "btr0cur.h"
54 
55 #ifdef UNIV_NONINL
56 #include "btr0cur.ic"
57 #endif
58 
59 #include "row0upd.h"
60 #ifndef UNIV_HOTBACKUP
61 #include "mtr0log.h"
62 #include "page0page.h"
63 #include "page0zip.h"
64 #include "rem0rec.h"
65 #include "rem0cmp.h"
66 #include "buf0lru.h"
67 #include "btr0btr.h"
68 #include "btr0sea.h"
69 #include "row0log.h"
70 #include "row0purge.h"
71 #include "row0upd.h"
72 #include "trx0rec.h"
73 #include "trx0roll.h"
74 #include "que0que.h"
75 #include "row0row.h"
76 #include "srv0srv.h"
77 #include "ibuf0ibuf.h"
78 #include "lock0lock.h"
79 #include "zlib.h"
80 #include "srv0start.h"
81 
82 /** Buffered B-tree operation types, introduced as part of delete buffering. */
83 enum btr_op_t {
84 	BTR_NO_OP = 0,			/*!< Not buffered */
85 	BTR_INSERT_OP,			/*!< Insert, do not ignore UNIQUE */
86 	BTR_INSERT_IGNORE_UNIQUE_OP,	/*!< Insert, ignoring UNIQUE */
87 	BTR_DELETE_OP,			/*!< Purge a delete-marked record */
88 	BTR_DELMARK_OP			/*!< Mark a record for deletion */
89 };
90 
91 /** Modification types for the B-tree operation. */
92 enum btr_intention_t {
93 	BTR_INTENTION_DELETE,
94 	BTR_INTENTION_BOTH,
95 	BTR_INTENTION_INSERT
96 };
97 #if BTR_INTENTION_DELETE > BTR_INTENTION_BOTH
98 #error "BTR_INTENTION_DELETE > BTR_INTENTION_BOTH"
99 #endif
100 #if BTR_INTENTION_BOTH > BTR_INTENTION_INSERT
101 #error "BTR_INTENTION_BOTH > BTR_INTENTION_INSERT"
102 #endif
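/* These #error checks pin the declaration order of btr_intention_t, because
code below relies on ordered comparisons such as
"lock_intention <= BTR_INTENTION_BOTH" (a delete may happen) and
"lock_intention >= BTR_INTENTION_BOTH" (an insert may happen). */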
103 
/** For the index->lock scalability improvement, the only clear performance
regression observed was caused by a hugely grown history list. That is
because the former exclusive use of index->lock also had the side effect of
reserving free blocks and read IO bandwidth for purge with priority. To keep
the history list from growing much larger than it could under the previous
implementation, pessimistic tree operations issued by purge are prioritized,
as before, when the list appears to be growing too large.

Experimentally, the history list length starts to noticeably affect
throughput at about 100000. */
#define BTR_CUR_FINE_HISTORY_LENGTH	100000
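/* The sketch below (illustrative only, not compiled) shows how this
threshold is used later in btr_cur_search_to_nth_level() when latching the
index for BTR_MODIFY_TREE: a purge-issued (delete-intention) operation takes
the exclusive index latch instead of the SX latch once the history list has
grown past the limit and read IOs are pending. The helper name is
hypothetical. */
#if 0
static void
btr_cur_example_latch_for_modify_tree(
	dict_index_t*	index,		/*!< in: index to latch */
	btr_intention_t	lock_intention,	/*!< in: intention of the caller */
	mtr_t*		mtr)		/*!< in/out: mini-transaction */
{
	if (lock_intention == BTR_INTENTION_DELETE
	    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
	    && buf_get_n_pending_read_ios()) {
		/* Purge is falling behind: take the exclusive index latch
		so that the purge operation is prioritized. */
		mtr_x_lock(dict_index_get_lock(index), mtr);
	} else {
		/* Normal case: the relaxed SX latch is enough. */
		mtr_sx_lock(dict_index_get_lock(index), mtr);
	}
}
#endif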
114 
115 /** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
116 ulint	btr_cur_n_non_sea	= 0;
117 /** Number of successful adaptive hash index lookups in
118 btr_cur_search_to_nth_level(). */
119 ulint	btr_cur_n_sea		= 0;
120 /** Old value of btr_cur_n_non_sea.  Copied by
121 srv_refresh_innodb_monitor_stats().  Referenced by
122 srv_printf_innodb_monitor(). */
123 ulint	btr_cur_n_non_sea_old	= 0;
124 /** Old value of btr_cur_n_sea.  Copied by
125 srv_refresh_innodb_monitor_stats().  Referenced by
126 srv_printf_innodb_monitor(). */
127 ulint	btr_cur_n_sea_old	= 0;
128 
129 #ifdef UNIV_DEBUG
130 /* Flag to limit optimistic insert records */
131 uint	btr_cur_limit_optimistic_insert_debug = 0;
132 #endif /* UNIV_DEBUG */
133 
134 /** In the optimistic insert, if the insert does not fit, but this much space
135 can be released by page reorganize, then it is reorganized */
136 #define BTR_CUR_PAGE_REORGANIZE_LIMIT	(UNIV_PAGE_SIZE / 32)
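/* A hedged sketch (not compiled) of how this limit is meant to be used by
the optimistic insert path: if the record does not fit into the free space
of the page, but reorganizing the page could release at least this much
space, the page is reorganized and the insert is retried instead of
splitting the page. The helper name and exact checks are illustrative. */
#if 0
static bool
btr_cur_example_should_reorganize(
	const page_t*	page,		/*!< in: index page */
	ulint		rec_size)	/*!< in: size of the record to insert */
{
	ulint	max_size = page_get_max_insert_size_after_reorganize(page, 1);

	return(max_size >= rec_size
	       && max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT);
}
#endif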
137 
138 /** The structure of a BLOB part header */
139 /* @{ */
140 /*--------------------------------------*/
141 #define BTR_BLOB_HDR_PART_LEN		0	/*!< BLOB part len on this
142 						page */
143 #define BTR_BLOB_HDR_NEXT_PAGE_NO	4	/*!< next BLOB part page no,
144 						FIL_NULL if none */
145 /*--------------------------------------*/
146 #define BTR_BLOB_HDR_SIZE		8	/*!< Size of a BLOB
147 						part header, in bytes */
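/* Illustrative sketch (not compiled): reading the two fields of a BLOB part
header using the byte offsets defined above. "blob_header" is assumed to
point to the start of the header within the BLOB page frame; the helper name
is hypothetical. */
#if 0
static void
btr_example_read_blob_part_header(
	const byte*	blob_header,	/*!< in: start of the BLOB part header */
	ulint*		part_len,	/*!< out: BLOB part length on this page */
	ulint*		next_page_no)	/*!< out: next BLOB part page number,
					or FIL_NULL if none */
{
	*part_len = mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN);
	*next_page_no = mach_read_from_4(
		blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO);
}
#endif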
148 
149 /** Estimated table level stats from sampled value.
150 @param value sampled stats
151 @param index index being sampled
152 @param sample number of sampled rows
153 @param ext_size external stored data size
154 @param not_empty table not empty
155 @return estimated table wide stats from sampled value */
156 #define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
157 	(((value) * static_cast<int64_t>(index->stat_n_leaf_pages) \
158 	  + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
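/* Worked example (illustrative numbers only): with a sampled value of 10
records per page, an index with stat_n_leaf_pages == 1000, sample == 10
analyzed pages, ext_size == 0 and not_empty == 1, the macro computes
(10 * 1000 + 10 - 1 + 0 + 1) / (10 + 0) = 10010 / 10 = 1001, i.e. the
per-page sample is scaled up by the number of leaf pages, rounding up so
that a non-empty table never yields an estimate of zero. */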
159 
160 /* @} */
161 #endif /* !UNIV_HOTBACKUP */
162 
163 #ifndef UNIV_HOTBACKUP
/*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a record that is
not delete-marked always owns all its extern fields. */
168 static
169 void
170 btr_cur_unmark_extern_fields(
171 /*=========================*/
172 	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
173 				part will be updated, or NULL */
174 	rec_t*		rec,	/*!< in/out: record in a clustered index */
175 	dict_index_t*	index,	/*!< in: index of the page */
176 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
177 	mtr_t*		mtr);	/*!< in: mtr, or NULL if not logged */
178 /*******************************************************************//**
179 Adds path information to the cursor for the current page, for which
180 the binary search has been performed. */
181 static
182 void
183 btr_cur_add_path_info(
184 /*==================*/
185 	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
186 	ulint		height,		/*!< in: height of the page in tree;
187 					0 means leaf node */
188 	ulint		root_height);	/*!< in: root node height in tree */
189 /***********************************************************//**
190 Frees the externally stored fields for a record, if the field is mentioned
191 in the update vector. */
192 static
193 void
194 btr_rec_free_updated_extern_fields(
195 /*===============================*/
196 	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
197 				X-latched */
198 	rec_t*		rec,	/*!< in: record */
199 	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
200 				part will be updated, or NULL */
201 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
202 	const upd_t*	update,	/*!< in: update vector */
203 	bool		rollback,/*!< in: performing rollback? */
204 	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
205 				an X-latch to record page and to the tree */
206 /***********************************************************//**
207 Frees the externally stored fields for a record. */
208 static
209 void
210 btr_rec_free_externally_stored_fields(
211 /*==================================*/
212 	dict_index_t*	index,	/*!< in: index of the data, the index
213 				tree MUST be X-latched */
214 	rec_t*		rec,	/*!< in: record */
215 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
216 	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
217 				part will be updated, or NULL */
218 	bool		rollback,/*!< in: performing rollback? */
219 	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
220 				an X-latch to record page and to the index
221 				tree */
222 #endif /* !UNIV_HOTBACKUP */
223 
224 #ifndef UNIV_HOTBACKUP
225 /*==================== B-TREE SEARCH =========================*/
226 
227 #if MTR_MEMO_PAGE_S_FIX != RW_S_LATCH
228 #error "MTR_MEMO_PAGE_S_FIX != RW_S_LATCH"
229 #endif
230 #if MTR_MEMO_PAGE_X_FIX != RW_X_LATCH
231 #error "MTR_MEMO_PAGE_X_FIX != RW_X_LATCH"
232 #endif
233 #if MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH
234 #error "MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH"
235 #endif
236 
/** Latches the leaf page or pages requested.
@param[in]	block		leaf page where the search converged
@param[in]	page_id		page id of the leaf
@param[in]	page_size	page size of the leaf
@param[in]	latch_mode	BTR_SEARCH_LEAF, ...
@param[in]	cursor		cursor
@param[in]	mtr		mini-transaction
@return	blocks and savepoints which are actually latched. */
btr_latch_leaves_t
btr_cur_latch_leaves(
246 	buf_block_t*		block,
247 	const page_id_t&	page_id,
248 	const page_size_t&	page_size,
249 	ulint			latch_mode,
250 	btr_cur_t*		cursor,
251 	mtr_t*			mtr)
252 {
253 	ulint		mode;
254 	ulint		left_page_no;
255 	ulint		right_page_no;
256 	buf_block_t*	get_block;
257 	page_t*		page = buf_block_get_frame(block);
258 	bool		spatial;
259 	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
260 
261 	spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info;
262 	ut_ad(buf_page_in_file(&block->page));
263 
264 	switch (latch_mode) {
265 	case BTR_SEARCH_LEAF:
266 	case BTR_MODIFY_LEAF:
267 	case BTR_SEARCH_TREE:
268 		if (spatial) {
269 			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS]
270 				= mtr_set_savepoint(mtr);
271 		}
272 
273 		mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH;
274 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
275 		get_block = btr_block_get(page_id, page_size, mode,
276 					  cursor->index, mtr);
277 		latch_leaves.blocks[1] = get_block;
278 #ifdef UNIV_BTR_DEBUG
279 		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
280 #endif /* UNIV_BTR_DEBUG */
281 		if (spatial) {
282 			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
283 				= get_block;
284 		}
285 
286 		return(latch_leaves);
	case BTR_MODIFY_TREE:
		/* It is exclusive for other operations which call
		btr_page_set_prev() */
290 		ut_ad(mtr_memo_contains_flagged(mtr,
291 			dict_index_get_lock(cursor->index),
292 			MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
293 		      || dict_table_is_intrinsic(cursor->index->table));
294 		/* x-latch also siblings from left to right */
295 		left_page_no = btr_page_get_prev(page, mtr);
296 
297 		if (left_page_no != FIL_NULL) {
298 
299 			if (spatial) {
300 				cursor->rtr_info->tree_savepoints[
301 					RTR_MAX_LEVELS] = mtr_set_savepoint(mtr);
302 			}
303 
304 			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
305 			get_block = btr_block_get(
306 				page_id_t(page_id.space(), left_page_no),
307 				page_size, RW_X_LATCH, cursor->index, mtr);
308 			latch_leaves.blocks[0] = get_block;
309 
310 			if (spatial) {
311 				cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
312 					= get_block;
313 			}
314 		}
315 
316 		if (spatial) {
317 			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS + 1]
318 				= mtr_set_savepoint(mtr);
319 		}
320 
321 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
322 		get_block = btr_block_get(
323 			page_id, page_size, RW_X_LATCH, cursor->index, mtr);
324 		latch_leaves.blocks[1] = get_block;
325 
326 #ifdef UNIV_BTR_DEBUG
327 		/* Sanity check only after both the blocks are latched. */
328 		if (latch_leaves.blocks[0] != NULL) {
329 			ut_a(page_is_comp(latch_leaves.blocks[0]->frame)
330 				== page_is_comp(page));
331 			ut_a(btr_page_get_next(
332 				latch_leaves.blocks[0]->frame, mtr)
333 				== page_get_page_no(page));
334 		}
335 		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
336 #endif /* UNIV_BTR_DEBUG */
337 
338 		if (spatial) {
339 			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS + 1]
340 				= get_block;
341 		}
342 
343 		right_page_no = btr_page_get_next(page, mtr);
344 
345 		if (right_page_no != FIL_NULL) {
346 			if (spatial) {
347 				cursor->rtr_info->tree_savepoints[
348 					RTR_MAX_LEVELS + 2] = mtr_set_savepoint(
349 								mtr);
350 			}
351 			latch_leaves.savepoints[2] = mtr_set_savepoint(mtr);
352 			get_block = btr_block_get(
353 				page_id_t(page_id.space(), right_page_no),
354 				page_size, RW_X_LATCH, cursor->index, mtr);
355 			latch_leaves.blocks[2] = get_block;
356 #ifdef UNIV_BTR_DEBUG
357 			ut_a(page_is_comp(get_block->frame)
358 			     == page_is_comp(page));
359 			ut_a(btr_page_get_prev(get_block->frame, mtr)
360 			     == page_get_page_no(page));
361 #endif /* UNIV_BTR_DEBUG */
362 			if (spatial) {
363 				cursor->rtr_info->tree_blocks[
364 					RTR_MAX_LEVELS + 2] = get_block;
365 			}
366 		}
367 
368 		return(latch_leaves);
369 
370 	case BTR_SEARCH_PREV:
371 	case BTR_MODIFY_PREV:
372 		mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
373 		/* latch also left sibling */
374 		rw_lock_s_lock(&block->lock);
375 		left_page_no = btr_page_get_prev(page, mtr);
376 		rw_lock_s_unlock(&block->lock);
377 
378 		if (left_page_no != FIL_NULL) {
379 			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
380 			get_block = btr_block_get(
381 				page_id_t(page_id.space(), left_page_no),
382 				page_size, mode, cursor->index, mtr);
383 			latch_leaves.blocks[0] = get_block;
384 			cursor->left_block = get_block;
385 #ifdef UNIV_BTR_DEBUG
386 			ut_a(page_is_comp(get_block->frame)
387 			     == page_is_comp(page));
388 			ut_a(btr_page_get_next(get_block->frame, mtr)
389 			     == page_get_page_no(page));
390 #endif /* UNIV_BTR_DEBUG */
391 		}
392 
393 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
394 		get_block = btr_block_get(page_id, page_size, mode,
395 					  cursor->index, mtr);
396 		latch_leaves.blocks[1] = get_block;
397 #ifdef UNIV_BTR_DEBUG
398 		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
399 #endif /* UNIV_BTR_DEBUG */
400 		return(latch_leaves);
401 	case BTR_CONT_MODIFY_TREE:
402 		ut_ad(dict_index_is_spatial(cursor->index));
403 		return(latch_leaves);
404 	}
405 
406 	ut_error;
407 	return(latch_leaves);
408 }
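/* Usage sketch (not compiled): a caller under BTR_MODIFY_TREE receives the
latched leaf and its siblings together with their mtr savepoints, and can
later release an individual block through its savepoint. The control flow
is illustrative only; block, page_id, page_size, cursor and mtr are assumed
to be set up as in the real callers. */
#if 0
	btr_latch_leaves_t	latch_leaves;

	latch_leaves = btr_cur_latch_leaves(
		block, page_id, page_size, BTR_MODIFY_TREE, cursor, mtr);

	/* blocks[0] = left sibling, blocks[1] = the requested leaf,
	blocks[2] = right sibling (any of them may be NULL). */
	if (latch_leaves.blocks[2] != NULL) {
		/* Release the right sibling if it turns out not to be
		needed for the operation. */
		mtr_release_block_at_savepoint(
			mtr, latch_leaves.savepoints[2],
			latch_leaves.blocks[2]);
	}
#endif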
409 
410 /** Optimistically latches the leaf page or pages requested.
411 @param[in]	block		guessed buffer block
412 @param[in]	modify_clock	modify clock value
413 @param[in,out]	latch_mode	BTR_SEARCH_LEAF, ...
414 @param[in,out]	cursor		cursor
415 @param[in]	file		file name
416 @param[in]	line		line where called
417 @param[in]	mtr		mini-transaction
418 @return true if success */
419 bool
btr_cur_optimistic_latch_leaves(
421 	buf_block_t*	block,
422 	ib_uint64_t	modify_clock,
423 	ulint*		latch_mode,
424 	btr_cur_t*	cursor,
425 	const char*	file,
426 	ulint		line,
427 	mtr_t*		mtr)
428 {
429 	ulint		mode;
430 	ulint		left_page_no;
431 	ut_ad(block->page.buf_fix_count > 0);
432 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
433 
434 	switch (*latch_mode) {
435 	case BTR_SEARCH_LEAF:
436 	case BTR_MODIFY_LEAF:
437 		return(buf_page_optimistic_get(*latch_mode, block,
438 				modify_clock, file, line, mtr));
439 	case BTR_SEARCH_PREV:
440 	case BTR_MODIFY_PREV:
441 		mode = *latch_mode == BTR_SEARCH_PREV
442 			? RW_S_LATCH : RW_X_LATCH;
443 
444 		rw_lock_s_lock(&block->lock);
445 		if (block->modify_clock != modify_clock) {
446 			rw_lock_s_unlock(&block->lock);
447 
448 			return(false);
449 		}
450 		left_page_no = btr_page_get_prev(
451 			buf_block_get_frame(block), mtr);
452 		rw_lock_s_unlock(&block->lock);
453 
454 		if (left_page_no != FIL_NULL) {
455 			const page_id_t	page_id(
456 				dict_index_get_space(cursor->index),
457 				left_page_no);
458 
459 			cursor->left_block = btr_block_get(
460 				page_id,
461 				dict_table_page_size(cursor->index->table),
462 				mode, cursor->index, mtr);
463 		} else {
464 			cursor->left_block = NULL;
465 		}
466 
467 		if (buf_page_optimistic_get(mode, block, modify_clock,
468 					    file, line, mtr)) {
469 			if (btr_page_get_prev(buf_block_get_frame(block), mtr)
470 			    == left_page_no) {
471 				/* We've entered this function with the block already buffer-fixed,
472 				and buf_page_optimistic_get() buffer-fixes it again. The caller should
473 				unfix the block once (to undo their buffer-fixing). */
474 				ut_ad(2 <= block->page.buf_fix_count);
475 				*latch_mode = mode;
476 				return(true);
477 			} else {
				/* release the block, which will also decrement the buf_fix_count
				once, undoing the increment done by the successful
				buf_page_optimistic_get() */
480 				btr_leaf_page_release(block, mode, mtr);
481 			}
482 		}
483 
484 		/* If we are still here then buf_page_optimistic_get() did not buffer-fix
485 		the page, but it should still be buffer-fixed as it was before the call.*/
486 		ut_ad(0 < block->page.buf_fix_count);
487 		/* release the left block */
488 		if (cursor->left_block != NULL) {
489 			btr_leaf_page_release(cursor->left_block,
490 					      mode, mtr);
491 		}
492 
493 		return(false);
494 
495 	default:
496 		ut_error;
497 		return(false);
498 	}
499 }
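/* Usage sketch (not compiled) of the buffer-fix contract described above.
"guess" is a block remembered from an earlier positioning and is already
buffer-fixed by the caller; "modify_clock" was saved at that time; both
names are illustrative. */
#if 0
	ulint	latch_mode = BTR_SEARCH_PREV;

	if (btr_cur_optimistic_latch_leaves(guess, modify_clock, &latch_mode,
					    cursor, __FILE__, __LINE__, mtr)) {
		/* Success: the block is latched and buffer-fixed twice (the
		caller's fix plus the one from buf_page_optimistic_get()),
		and latch_mode has been narrowed to RW_S_LATCH or RW_X_LATCH.
		The caller must undo its own buffer-fix once. */
	} else {
		/* Failure: nothing is latched; only the caller's original
		buffer-fix remains, and a full search must be done instead. */
	}
#endif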
500 
/**
Gets the intention in btr_intention_t from latch_mode, and clears the
intention flags in latch_mode.
@param latch_mode	in/out: pointer to latch_mode
@return intention for latching tree */
static
btr_intention_t
btr_cur_get_and_clear_intention(
509 	ulint	*latch_mode)
510 {
511 	btr_intention_t	intention;
512 
513 	switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) {
514 	case BTR_LATCH_FOR_INSERT:
515 		intention = BTR_INTENTION_INSERT;
516 		break;
517 	case BTR_LATCH_FOR_DELETE:
518 		intention = BTR_INTENTION_DELETE;
519 		break;
520 	default:
521 		/* both or unknown */
522 		intention = BTR_INTENTION_BOTH;
523 	}
524 	*latch_mode &= ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE);
525 
526 	return(intention);
527 }
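/* Example (not compiled): extracting and clearing the intention flags from
a combined latch mode value. */
#if 0
	ulint		latch_mode = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE;
	btr_intention_t	intention;

	intention = btr_cur_get_and_clear_intention(&latch_mode);
	/* Now intention == BTR_INTENTION_DELETE and
	latch_mode == BTR_MODIFY_TREE. */
#endif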
528 
/**
Gets the desired latch type for the root leaf (the root page is also a leaf)
for the given latch mode.
@param latch_mode	in: BTR_SEARCH_LEAF, ...
@return latch type */
static
rw_lock_type_t
btr_cur_latch_for_root_leaf(
537 	ulint	latch_mode)
538 {
539 	switch (latch_mode) {
540 	case BTR_SEARCH_LEAF:
541 	case BTR_SEARCH_TREE:
542 	case BTR_SEARCH_PREV:
543 		return(RW_S_LATCH);
544 	case BTR_MODIFY_LEAF:
545 	case BTR_MODIFY_TREE:
546 	case BTR_MODIFY_PREV:
547 		return(RW_X_LATCH);
548 	case BTR_CONT_MODIFY_TREE:
549 	case BTR_CONT_SEARCH_TREE:
		/* The root page should already be latched, so it does
		not need to be latched here.
		fall through (RW_NO_LATCH) */
553 	case BTR_NO_LATCHES:
554 		return(RW_NO_LATCH);
555 	}
556 
557 	ut_error;
558 	return(RW_NO_LATCH); /* avoid compiler warnings */
559 }
560 
/** Detects whether modifying the record might require modifying the tree structure.
562 @param[in]	index		index
563 @param[in]	page		page
564 @param[in]	lock_intention	lock intention for the tree operation
565 @param[in]	rec		record (current node_ptr)
566 @param[in]	rec_size	size of the record or max size of node_ptr
567 @param[in]	page_size	page size
568 @param[in]	mtr		mtr
569 @return true if tree modification is needed */
570 static
571 bool
btr_cur_will_modify_tree(
573 	dict_index_t*	index,
574 	const page_t*	page,
575 	btr_intention_t	lock_intention,
576 	const rec_t*	rec,
577 	ulint		rec_size,
578 	const page_size_t&	page_size,
579 	mtr_t*		mtr)
580 {
581 	ut_ad(!page_is_leaf(page));
582 	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
583 					MTR_MEMO_X_LOCK
584 					| MTR_MEMO_SX_LOCK)
585 	      || dict_table_is_intrinsic(index->table));
586 
	/* Pessimistic delete of the first record causes a delete & insert
	of the node_ptr at the upper level. A subsequent page shrink is
	also possible, which causes a delete of the node_ptr at the upper
	level. So we should pay attention not only to the first and last
	records but also to the 2nd record: if the "delete & insert" are
	done on different pages, the 2nd record becomes the first record,
	and a following compress might delete the record and cause the
	upper level node_ptr modification. */
595 
596 	if (lock_intention <= BTR_INTENTION_BOTH) {
597 		ulint	margin;
598 
599 		if (lock_intention == BTR_INTENTION_BOTH) {
600 			ulint	level = btr_page_get_level(page, mtr);
601 
			/* This value is the worst-case expectation of how
			many node_ptr records may be deleted from this page.
			It is used to estimate whether the cursor position
			can become the leftmost record in this page. */
606 			ulint   max_nodes_deleted = 0;
607 
			/* Tree-modifying operations coming from below this
			level can logically cause at most (2 ^ (level - 1))
			record deletions, even in the rarest worst case. */
611 			if (level > 7) {
612 				/* TODO: adjust this practical limit. */
613 				max_nodes_deleted = 64;
614 			} else if (level > 0) {
615 				max_nodes_deleted = (ulint)1 << (level - 1);
616 			}
617 
			/* Check what a delete would cause
			(BTR_INTENTION_BOTH or BTR_INTENTION_DELETE). */
620 			if (page_get_n_recs(page) <= max_nodes_deleted * 2
621 			    || page_rec_is_first(rec, page)) {
				/* The cursor record can become the leftmost
				record in this page. */
624 				return(true);
625 			}
626 
627 			if (fil_page_get_prev(page) != FIL_NULL
628 			    && page_rec_distance_is_at_most(
629 					page_get_infimum_rec(page), rec,
630 					max_nodes_deleted)) {
631 				return (true);
632 			}
633 
634 			if (fil_page_get_next(page) != FIL_NULL
635 			    && page_rec_distance_is_at_most(
636 					rec, page_get_supremum_rec(page),
637 					max_nodes_deleted)) {
638 				return (true);
639 			}
640 
641 			/* Delete at leftmost record in a page causes delete
642 			& insert at its parent page. After that, the delete
643 			might cause btr_compress() and delete record at its
644 			parent page. Thus we should consider max deletes. */
645 
646 			margin = rec_size * max_nodes_deleted;
647 		} else {
648 			ut_ad(lock_intention == BTR_INTENTION_DELETE);
649 
650 			margin = rec_size;
651 		}
652 		/* Safe because we already have SX latch of the index tree */
653 		if (page_get_data_size(page)
654 			< margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)
655 		    || (fil_page_get_next(page) == FIL_NULL
656 			&& fil_page_get_prev(page) == FIL_NULL)) {
657 			return(true);
658 		}
659 	}
660 
661 	if (lock_intention >= BTR_INTENTION_BOTH) {
		/* Check what an insert would cause (BTR_INTENTION_BOTH
		or BTR_INTENTION_INSERT). */
664 
665 		/* Once we invoke the btr_cur_limit_optimistic_insert_debug,
666 		we should check it here in advance, since the max allowable
667 		records in a page is limited. */
668 		LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
669 					      return(true));
670 
		/* We need space for 2 records, for the case where a single
		split-and-insert cannot fit.
		page_get_max_insert_size_after_reorganize() already includes
		space for the page directory. */
675 		ulint	max_size
676 			= page_get_max_insert_size_after_reorganize(page, 2);
677 
678 		if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size
679 		    || max_size < rec_size * 2) {
680 			return(true);
681 		}
		/* TODO: optimize this condition for a compressed page.
		This is based on the worst compression rate.
		Currently we look only at the uncompressed page, but we could
		also check the compressed page with page_zip_available()
		if it is already in the buffer pool. */
		/* We need space for 2 records also at the worst
		compression rate. */
688 		if (page_size.is_compressed()
689 		    && page_zip_empty_size(index->n_fields,
690 					   page_size.physical())
691 		       < rec_size * 2 + page_get_data_size(page)
692 			 + page_dir_calc_reserved_space(
693 				page_get_n_recs(page) + 2) + 1) {
694 			return(true);
695 		}
696 	}
697 
698 	return(false);
699 }
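/* Worked example of the deletion-margin heuristic above (illustrative
numbers only): for a node pointer page at level 3 with BTR_INTENTION_BOTH,
the worst-case number of node_ptr deletions is 2 ^ (3 - 1) = 4, so the
function returns true if no more than 4 * 2 = 8 records remain on the page,
if the cursor record is within 4 records of an end of the page that has a
sibling on that side, if the page data size is below
4 * rec_size + BTR_CUR_PAGE_COMPRESS_LIMIT(index) (a compress may follow),
or if the page has no siblings at all. */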
700 
/** Detects whether modifying the record might require a modification
opposite to the intention.
@param[in]	page		page
@param[in]	lock_intention	lock intention for the tree operation
@param[in]	rec		record (current node_ptr)
@return	true if tree modification is needed */
static
bool
btr_cur_need_opposite_intention(
710 	const page_t*	page,
711 	btr_intention_t	lock_intention,
712 	const rec_t*	rec)
713 {
714 	switch (lock_intention) {
715 	case BTR_INTENTION_DELETE:
716 		return((mach_read_from_4(page + FIL_PAGE_PREV) != FIL_NULL
717 			&& page_rec_is_first(rec, page))
718 		       || (mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
719 			   && page_rec_is_last(rec, page)));
720 	case BTR_INTENTION_INSERT:
721 		return(mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
722 		       && page_rec_is_last(rec, page));
723 	case BTR_INTENTION_BOTH:
724 		return(false);
725 	}
726 
727 	ut_error;
728 	return(false);
729 }
730 
/********************************************************************//**
Searches an index tree and positions a tree cursor on a given level.
NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
to node pointer page number fields on the upper levels of the tree!
Note that if mode is PAGE_CUR_LE, which is used in inserts, then
cursor->up_match and cursor->low_match both will have sensible values.
If mode is PAGE_CUR_GE, then up_match will have a sensible value.

If mode is PAGE_CUR_LE, the cursor is left at the place where an insert of
the search tuple should be performed in the B-tree. InnoDB does an insert
immediately after the cursor. Thus, the cursor may end up on a user record,
or on a page infimum record. */
void
btr_cur_search_to_nth_level(
745 /*========================*/
746 	dict_index_t*	index,	/*!< in: index */
747 	ulint		level,	/*!< in: the tree level of search */
748 	const dtuple_t*	tuple,	/*!< in: data tuple; NOTE: n_fields_cmp in
749 				tuple must be set so that it cannot get
750 				compared to the node ptr page number field! */
751 	page_cur_mode_t	mode,	/*!< in: PAGE_CUR_L, ...;
752 				Inserts should always be made using
753 				PAGE_CUR_LE to search the position! */
754 	ulint		latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
755 				at most one of BTR_INSERT, BTR_DELETE_MARK,
756 				BTR_DELETE, or BTR_ESTIMATE;
757 				cursor->left_block is used to store a pointer
758 				to the left neighbor page, in the cases
759 				BTR_SEARCH_PREV and BTR_MODIFY_PREV;
				NOTE that if has_search_latch
				is != 0, we may not have a latch set
				on the cursor page, as we assume
				the caller uses the search latch
				to protect the record! */
765 	btr_cur_t*	cursor, /*!< in/out: tree cursor; the cursor page is
766 				s- or x-latched, but see also above! */
767 	ulint		has_search_latch,
768 				/*!< in: info on the latch mode the
769 				caller currently has on search system:
770 				RW_S_LATCH, or 0 */
771 	const char*	file,	/*!< in: file name */
772 	ulint		line,	/*!< in: line where called */
773 	mtr_t*		mtr)	/*!< in: mtr */
774 {
775 	page_t*		page = NULL; /* remove warning */
776 	buf_block_t*	block;
777 	ulint		height;
778 	ulint		up_match;
779 	ulint		up_bytes;
780 	ulint		low_match;
781 	ulint		low_bytes;
782 	ulint		savepoint;
783 	ulint		rw_latch;
784 	page_cur_mode_t	page_mode;
785 	page_cur_mode_t	search_mode = PAGE_CUR_UNSUPP;
786 	ulint		buf_mode;
787 	ulint		estimate;
788 	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
789 	page_cur_t*	page_cursor;
790 	btr_op_t	btr_op;
791 	ulint		root_height = 0; /* remove warning */
792 
793 	ulint		upper_rw_latch, root_leaf_rw_latch;
794 	btr_intention_t	lock_intention;
795 	bool		modify_external;
796 	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
797 	ulint		tree_savepoints[BTR_MAX_LEVELS];
798 	ulint		n_blocks = 0;
799 	ulint		n_releases = 0;
800 	bool		detected_same_key_root = false;
801 
802 	bool		retrying_for_search_prev = false;
803 	ulint		leftmost_from_level = 0;
804 	buf_block_t**	prev_tree_blocks = NULL;
805 	ulint*		prev_tree_savepoints = NULL;
806 	ulint		prev_n_blocks = 0;
807 	ulint		prev_n_releases = 0;
808 	bool		need_path = true;
809 	bool		rtree_parent_modified = false;
810 	bool		mbr_adj = false;
811 	bool		found = false;
812 
813 	DBUG_ENTER("btr_cur_search_to_nth_level");
814 
815 	btr_search_t*	info;
816 	mem_heap_t*	heap		= NULL;
817 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
818 	ulint*		offsets		= offsets_;
819 	ulint		offsets2_[REC_OFFS_NORMAL_SIZE];
820 	ulint*		offsets2	= offsets2_;
821 	rec_offs_init(offsets_);
822 	rec_offs_init(offsets2_);
823 	/* Currently, PAGE_CUR_LE is the only search mode used for searches
	ending at the upper levels */
825 
826 	ut_ad(level == 0 || mode == PAGE_CUR_LE
827 	      || RTREE_SEARCH_MODE(mode));
828 	ut_ad(dict_index_check_search_tuple(index, tuple));
829 	ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
830 	ut_ad(dtuple_check_typed(tuple));
831 	ut_ad(!(index->type & DICT_FTS));
832 	ut_ad(index->page != FIL_NULL);
833 
834 	UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
835 	UNIV_MEM_INVALID(&cursor->up_bytes, sizeof cursor->up_bytes);
836 	UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
837 	UNIV_MEM_INVALID(&cursor->low_bytes, sizeof cursor->low_bytes);
838 #ifdef UNIV_DEBUG
839 	cursor->up_match = ULINT_UNDEFINED;
840 	cursor->low_match = ULINT_UNDEFINED;
841 #endif /* UNIV_DEBUG */
842 
843 	ibool	s_latch_by_caller;
844 
845 	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
846 
847 	ut_ad(!s_latch_by_caller
848 	      || srv_read_only_mode
849 	      || mtr_memo_contains_flagged(mtr,
850 					   dict_index_get_lock(index),
851 					   MTR_MEMO_S_LOCK
852 					   | MTR_MEMO_SX_LOCK));
853 
854 	/* These flags are mutually exclusive, they are lumped together
855 	with the latch mode for historical reasons. It's possible for
856 	none of the flags to be set. */
857 	switch (UNIV_EXPECT(latch_mode
858 			    & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
859 			    0)) {
860 	case 0:
861 		btr_op = BTR_NO_OP;
862 		break;
863 	case BTR_INSERT:
864 		btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
865 			? BTR_INSERT_IGNORE_UNIQUE_OP
866 			: BTR_INSERT_OP;
867 		break;
868 	case BTR_DELETE:
869 		btr_op = BTR_DELETE_OP;
870 		ut_a(cursor->purge_node);
871 		break;
872 	case BTR_DELETE_MARK:
873 		btr_op = BTR_DELMARK_OP;
874 		break;
875 	default:
876 		/* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
877 		should be specified at a time */
878 		ut_error;
879 	}
880 
881 	/* Operations on the insert buffer tree cannot be buffered. */
882 	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
883 	/* Operations on the clustered index cannot be buffered. */
884 	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
885 	/* Operations on the temporary table(indexes) cannot be buffered. */
886 	ut_ad(btr_op == BTR_NO_OP || !dict_table_is_temporary(index->table));
887 	/* Operation on the spatial index cannot be buffered. */
888 	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index));
889 
890 	estimate = latch_mode & BTR_ESTIMATE;
891 
892 	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
893 
894 	modify_external = latch_mode & BTR_MODIFY_EXTERNAL;
895 
896 	/* Turn the flags unrelated to the latch mode off. */
897 	latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
898 
899 	ut_ad(!modify_external || latch_mode == BTR_MODIFY_LEAF);
900 
901 	ut_ad(!s_latch_by_caller
902 	      || latch_mode == BTR_SEARCH_LEAF
903 	      || latch_mode == BTR_SEARCH_TREE
904 	      || latch_mode == BTR_MODIFY_LEAF);
905 
906 	cursor->flag = BTR_CUR_BINARY;
907 	cursor->index = index;
908 
909 	info = btr_search_get_info(index);
910 
911 # ifdef UNIV_SEARCH_PERF_STAT
912 	info->n_searches++;
913 # endif
	/* Use of the AHI is disabled for intrinsic tables, as these tables
	re-use the index-id and AHI validation is based on the index-id. */
916 	if (rw_lock_get_writer(btr_get_search_latch(index))
917 		== RW_LOCK_NOT_LOCKED
918 	    && latch_mode <= BTR_MODIFY_LEAF
919 	    && info->last_hash_succ
920 	    && !index->disable_ahi
921 	    && !estimate
922 # ifdef PAGE_CUR_LE_OR_EXTENDS
923 	    && mode != PAGE_CUR_LE_OR_EXTENDS
924 # endif /* PAGE_CUR_LE_OR_EXTENDS */
925 	    && !dict_index_is_spatial(index)
926 	    /* If !has_search_latch, we do a dirty read of
927 	    btr_search_enabled below, and btr_search_guess_on_hash()
928 	    will have to check it again. */
929 	    && UNIV_LIKELY(btr_search_enabled)
930 	    && !modify_external
931 	    && btr_search_guess_on_hash(index, info, tuple, mode,
932 					latch_mode, cursor,
933 					has_search_latch, mtr)) {
934 
935 		/* Search using the hash index succeeded */
936 
937 		ut_ad(cursor->up_match != ULINT_UNDEFINED
938 		      || mode != PAGE_CUR_GE);
939 		ut_ad(cursor->up_match != ULINT_UNDEFINED
940 		      || mode != PAGE_CUR_LE);
941 		ut_ad(cursor->low_match != ULINT_UNDEFINED
942 		      || mode != PAGE_CUR_LE);
943 		btr_cur_n_sea++;
944 
945 		DBUG_VOID_RETURN;
946 	}
947 	btr_cur_n_non_sea++;
948 
949 	/* If the hash search did not succeed, do binary search down the
950 	tree */
951 
952 	if (has_search_latch) {
953 		/* Release possible search latch to obey latching order */
954 		rw_lock_s_unlock(btr_get_search_latch(index));
955 	}
956 
957 	/* Store the position of the tree latch we push to mtr so that we
958 	know how to release it when we have latched leaf node(s) */
959 
960 	savepoint = mtr_set_savepoint(mtr);
961 
962 	switch (latch_mode) {
963 	case BTR_MODIFY_TREE:
		/* Most delete-intended operations are purging.
		Free blocks and read IO bandwidth should be given to them
		with priority when the history list is growing huge. */
967 		if (lock_intention == BTR_INTENTION_DELETE
968 		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
969 			&& buf_get_n_pending_read_ios()) {
970 			mtr_x_lock(dict_index_get_lock(index), mtr);
971 		} else if (dict_index_is_spatial(index)
972 			   && lock_intention <= BTR_INTENTION_BOTH) {
			/* X-lock the index tree if there is a possibility
			of a pessimistic delete on a spatial index, as we
			could need to lock upward in the tree */
976 
977 			mtr_x_lock(dict_index_get_lock(index), mtr);
978 		} else {
979 			mtr_sx_lock(dict_index_get_lock(index), mtr);
980 		}
981 		upper_rw_latch = RW_X_LATCH;
982 		break;
983 	case BTR_CONT_MODIFY_TREE:
984 	case BTR_CONT_SEARCH_TREE:
985 		/* Do nothing */
986 		ut_ad(srv_read_only_mode
987 		      || mtr_memo_contains_flagged(mtr,
988 						   dict_index_get_lock(index),
989 						   MTR_MEMO_X_LOCK
990 						   | MTR_MEMO_SX_LOCK));
991 		if (dict_index_is_spatial(index)
992 		    && latch_mode == BTR_CONT_MODIFY_TREE) {
			/* If we are about to locate the parent page for a
			split and/or merge operation on an R-Tree index,
			X-latch the parent */
996 			upper_rw_latch = RW_X_LATCH;
997 		} else {
998 			upper_rw_latch = RW_NO_LATCH;
999 		}
1000 		break;
1001 	default:
1002 		if (!srv_read_only_mode) {
1003 			if (s_latch_by_caller) {
1004 				ut_ad(rw_lock_own(dict_index_get_lock(index),
1005 				              RW_LOCK_S));
1006 			} else if (!modify_external) {
1007 				/* BTR_SEARCH_TREE is intended to be used with
1008 				BTR_ALREADY_S_LATCHED */
1009 				ut_ad(latch_mode != BTR_SEARCH_TREE);
1010 
1011 				mtr_s_lock(dict_index_get_lock(index), mtr);
1012 			} else {
1013 				/* BTR_MODIFY_EXTERNAL needs to be excluded */
1014 				mtr_sx_lock(dict_index_get_lock(index), mtr);
1015 			}
1016 			upper_rw_latch = RW_S_LATCH;
1017 		} else {
1018 			upper_rw_latch = RW_NO_LATCH;
1019 		}
1020 	}
1021 	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
1022 
1023 	page_cursor = btr_cur_get_page_cur(cursor);
1024 
1025 	const ulint		space = dict_index_get_space(index);
1026 	const page_size_t	page_size(dict_table_page_size(index->table));
1027 
1028 	/* Start with the root page. */
1029 	page_id_t		page_id(space, dict_index_get_page(index));
1030 
1031 	if (root_leaf_rw_latch == RW_X_LATCH) {
1032 		node_ptr_max_size = dict_index_node_ptr_max_size(index);
1033 	}
1034 
1035 	up_match = 0;
1036 	up_bytes = 0;
1037 	low_match = 0;
1038 	low_bytes = 0;
1039 
1040 	height = ULINT_UNDEFINED;
1041 
1042 	/* We use these modified search modes on non-leaf levels of the
1043 	B-tree. These let us end up in the right B-tree leaf. In that leaf
1044 	we use the original search mode. */
1045 
1046 	switch (mode) {
1047 	case PAGE_CUR_GE:
1048 		page_mode = PAGE_CUR_L;
1049 		break;
1050 	case PAGE_CUR_G:
1051 		page_mode = PAGE_CUR_LE;
1052 		break;
1053 	default:
1054 #ifdef PAGE_CUR_LE_OR_EXTENDS
1055 		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
1056 		      || RTREE_SEARCH_MODE(mode)
1057 		      || mode == PAGE_CUR_LE_OR_EXTENDS);
1058 #else /* PAGE_CUR_LE_OR_EXTENDS */
1059 		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
1060 		      || RTREE_SEARCH_MODE(mode));
1061 #endif /* PAGE_CUR_LE_OR_EXTENDS */
1062 		page_mode = mode;
1063 		break;
1064 	}
1065 
1066 	/* Loop and search until we arrive at the desired level */
1067 	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
1068 
1069 search_loop:
1070 	buf_mode = BUF_GET;
1071 	rw_latch = RW_NO_LATCH;
1072 	rtree_parent_modified = false;
1073 
1074 	if (height != 0) {
1075 		/* We are about to fetch the root or a non-leaf page. */
1076 		if ((latch_mode != BTR_MODIFY_TREE
1077 		     || height == level)
1078 		    && !retrying_for_search_prev) {
			/* If we do not have an SX or X latch on the index,
			each page should be latched before reading. */
1081 			if (modify_external
1082 			    && height == ULINT_UNDEFINED
1083 			    && upper_rw_latch == RW_S_LATCH) {
				/* the root page needs an sx-latch
				for the fseg operation */
1086 				rw_latch = RW_SX_LATCH;
1087 			} else {
1088 				rw_latch = upper_rw_latch;
1089 			}
1090 		}
1091 	} else if (latch_mode <= BTR_MODIFY_LEAF) {
1092 		rw_latch = latch_mode;
1093 
1094 		if (btr_op != BTR_NO_OP
1095 		    && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {
1096 
1097 			/* Try to buffer the operation if the leaf
1098 			page is not in the buffer pool. */
1099 
1100 			buf_mode = btr_op == BTR_DELETE_OP
1101 				? BUF_GET_IF_IN_POOL_OR_WATCH
1102 				: BUF_GET_IF_IN_POOL;
1103 		}
1104 	}
1105 
1106 retry_page_get:
1107 	ut_ad(n_blocks < BTR_MAX_LEVELS);
1108 	tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
1109 	block = buf_page_get_gen(
1110 		page_id, page_size, rw_latch,
1111 		(height == ULINT_UNDEFINED ? info->root_guess : NULL),
1112 		buf_mode, file, line, mtr
1113 	);
1114 
1115 	tree_blocks[n_blocks] = block;
1116 
1117 	if (block == NULL) {
		/* This must be a search to perform an insert, delete-mark
		or delete; try using the insert/delete buffer */
1120 
1121 		ut_ad(height == 0);
1122 		ut_ad(cursor->thr);
1123 
1124 		switch (btr_op) {
1125 		case BTR_INSERT_OP:
1126 		case BTR_INSERT_IGNORE_UNIQUE_OP:
1127 			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
1128 			ut_ad(!dict_index_is_spatial(index));
1129 
1130 			if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
1131 					page_id, page_size, cursor->thr)) {
1132 
1133 				cursor->flag = BTR_CUR_INSERT_TO_IBUF;
1134 
1135 				goto func_exit;
1136 			}
1137 			break;
1138 
1139 		case BTR_DELMARK_OP:
1140 			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
1141 			ut_ad(!dict_index_is_spatial(index));
1142 
1143 			if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
1144 					index, page_id, page_size,
1145 					cursor->thr)) {
1146 
1147 				cursor->flag = BTR_CUR_DEL_MARK_IBUF;
1148 
1149 				goto func_exit;
1150 			}
1151 
1152 			break;
1153 
1154 		case BTR_DELETE_OP:
1155 			ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
1156 			ut_ad(!dict_index_is_spatial(index));
1157 
1158 			if (!row_purge_poss_sec(cursor->purge_node,
1159 						index, tuple)) {
1160 
1161 				/* The record cannot be purged yet. */
1162 				cursor->flag = BTR_CUR_DELETE_REF;
1163 			} else if (ibuf_insert(IBUF_OP_DELETE, tuple,
1164 					       index, page_id, page_size,
1165 					       cursor->thr)) {
1166 
1167 				/* The purge was buffered. */
1168 				cursor->flag = BTR_CUR_DELETE_IBUF;
1169 			} else {
1170 				/* The purge could not be buffered. */
1171 				buf_pool_watch_unset(page_id);
1172 				break;
1173 			}
1174 
1175 			buf_pool_watch_unset(page_id);
1176 			goto func_exit;
1177 
1178 		default:
1179 			ut_error;
1180 		}
1181 
1182 		/* Insert to the insert/delete buffer did not succeed, we
1183 		must read the page from disk. */
1184 
1185 		buf_mode = BUF_GET;
1186 
1187 		goto retry_page_get;
1188 	}
1189 
1190 	if (retrying_for_search_prev && height != 0) {
1191 		/* also latch left sibling */
1192 		ulint		left_page_no;
1193 		buf_block_t*	get_block;
1194 
1195 		ut_ad(rw_latch == RW_NO_LATCH);
1196 
1197 		rw_latch = upper_rw_latch;
1198 
1199 		rw_lock_s_lock(&block->lock);
1200 		left_page_no = btr_page_get_prev(
1201 			buf_block_get_frame(block), mtr);
1202 		rw_lock_s_unlock(&block->lock);
1203 
1204 		if (left_page_no != FIL_NULL) {
1205 			ut_ad(prev_n_blocks < leftmost_from_level);
1206 
1207 			prev_tree_savepoints[prev_n_blocks]
1208 				= mtr_set_savepoint(mtr);
1209 			get_block = buf_page_get_gen(
1210 				page_id_t(page_id.space(), left_page_no),
1211 				page_size, rw_latch, NULL, buf_mode,
1212 				file, line, mtr);
1213 			prev_tree_blocks[prev_n_blocks] = get_block;
1214 			prev_n_blocks++;
1215 
			/* BTR_MODIFY_TREE doesn't update prev/next_page_no
			without holding the parent page's lock. So there is
			no need to retry here, because we hold the parent
			page's lock. */
1219 		}
1220 
1221 		/* release RW_NO_LATCH page and lock with RW_S_LATCH */
1222 		mtr_release_block_at_savepoint(
1223 			mtr, tree_savepoints[n_blocks],
1224 			tree_blocks[n_blocks]);
1225 
1226 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
1227 		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
1228 					 buf_mode, file, line, mtr);
1229 		tree_blocks[n_blocks] = block;
1230 	}
1231 
1232 	page = buf_block_get_frame(block);
1233 
1234 	if (height == ULINT_UNDEFINED
1235 	    && page_is_leaf(page)
1236 	    && rw_latch != RW_NO_LATCH
1237 	    && rw_latch != root_leaf_rw_latch) {
		/* We should retry to get the page, because the root page
		is latched differently from a leaf page. */
1240 		ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
1241 		ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH);
1242 		ut_ad(rw_latch == RW_S_LATCH || modify_external);
1243 
1244 		ut_ad(n_blocks == 0);
1245 		mtr_release_block_at_savepoint(
1246 			mtr, tree_savepoints[n_blocks],
1247 			tree_blocks[n_blocks]);
1248 
1249 		upper_rw_latch = root_leaf_rw_latch;
1250 		goto search_loop;
1251 	}
1252 
1253 	if (rw_latch != RW_NO_LATCH) {
1254 #ifdef UNIV_ZIP_DEBUG
1255 		const page_zip_des_t*	page_zip
1256 			= buf_block_get_page_zip(block);
1257 		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
1258 #endif /* UNIV_ZIP_DEBUG */
1259 
1260 		buf_block_dbg_add_level(
1261 			block, dict_index_is_ibuf(index)
1262 			? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
1263 	}
1264 
1265 	ut_ad(fil_page_index_page_check(page));
1266 	ut_ad(index->id == btr_page_get_index_id(page));
1267 
1268 	if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
1269 		/* We are in the root node */
1270 
1271 		height = btr_page_get_level(page, mtr);
1272 		root_height = height;
1273 		cursor->tree_height = root_height + 1;
1274 
1275 		if (dict_index_is_spatial(index)) {
1276 			ut_ad(cursor->rtr_info);
1277 
1278 			node_seq_t      seq_no = rtr_get_current_ssn_id(index);
1279 
1280 			/* If SSN in memory is not initialized, fetch
1281 			it from root page */
1282 			if (seq_no < 1) {
1283 				node_seq_t      root_seq_no;
1284 
1285 				root_seq_no = page_get_ssn_id(page);
1286 
1287 				mutex_enter(&(index->rtr_ssn.mutex));
1288 				index->rtr_ssn.seq_no = root_seq_no + 1;
1289 				mutex_exit(&(index->rtr_ssn.mutex));
1290 			}
1291 
1292 			/* Save the MBR */
1293 			cursor->rtr_info->thr = cursor->thr;
1294 			rtr_get_mbr_from_tuple(tuple, &cursor->rtr_info->mbr);
1295 		}
1296 
1297 		info->root_guess = block;
1298 	}
1299 
1300 	if (height == 0) {
1301 		if (rw_latch == RW_NO_LATCH) {
1302 
1303 			latch_leaves = btr_cur_latch_leaves(
1304 				block, page_id, page_size, latch_mode,
1305 				cursor, mtr);
1306 		}
1307 
1308 		switch (latch_mode) {
1309 		case BTR_MODIFY_TREE:
1310 		case BTR_CONT_MODIFY_TREE:
1311 		case BTR_CONT_SEARCH_TREE:
1312 			break;
1313 		default:
1314 			if (!s_latch_by_caller
1315 			    && !srv_read_only_mode
1316 			    && !modify_external) {
1317 				/* Release the tree s-latch */
1318 				/* NOTE: BTR_MODIFY_EXTERNAL
1319 				needs to keep tree sx-latch */
1320 				mtr_release_s_latch_at_savepoint(
1321 					mtr, savepoint,
1322 					dict_index_get_lock(index));
1323 			}
1324 
1325 			/* release upper blocks */
1326 			if (retrying_for_search_prev) {
1327 				for (;
1328 				     prev_n_releases < prev_n_blocks;
1329 				     prev_n_releases++) {
1330 					mtr_release_block_at_savepoint(
1331 						mtr,
1332 						prev_tree_savepoints[
1333 							prev_n_releases],
1334 						prev_tree_blocks[
1335 							prev_n_releases]);
1336 				}
1337 			}
1338 
1339 			for (; n_releases < n_blocks; n_releases++) {
1340 				if (n_releases == 0 && modify_external) {
1341 					/* keep latch of root page */
1342 					ut_ad(mtr_memo_contains_flagged(
1343 						mtr, tree_blocks[n_releases],
1344 						MTR_MEMO_PAGE_SX_FIX
1345 						| MTR_MEMO_PAGE_X_FIX));
1346 					continue;
1347 				}
1348 
1349 				mtr_release_block_at_savepoint(
1350 					mtr, tree_savepoints[n_releases],
1351 					tree_blocks[n_releases]);
1352 			}
1353 		}
1354 
1355 		page_mode = mode;
1356 	}
1357 
1358 	if (dict_index_is_spatial(index)) {
1359 		/* Remember the page search mode */
1360 		search_mode = page_mode;
1361 
		/* Adjust the search mode when the page search mode is
		PAGE_CUR_RTREE_LOCATE or PAGE_CUR_RTREE_INSERT, as we are
		searching with MBRs. When we are not at the target level,
		we should search all sub-trees that "CONTAIN" the search
		range/MBR. At the target level, the search becomes
		PAGE_CUR_LE */
1369 		if (page_mode == PAGE_CUR_RTREE_LOCATE
1370 		    && level == height) {
1371 			if (level == 0) {
1372 				page_mode = PAGE_CUR_LE;
1373 			} else {
1374 				page_mode = PAGE_CUR_RTREE_GET_FATHER;
1375 			}
1376 		}
1377 
1378 		if (page_mode == PAGE_CUR_RTREE_INSERT) {
1379 			page_mode = (level == height)
1380 					? PAGE_CUR_LE
1381 					: PAGE_CUR_RTREE_INSERT;
1382 
1383 			ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE);
1384 		}
1385 
		/* "need_path" indicates whether we need to track the parent
		pages. If it is not a spatial comparison, there is no need
		to track them */
1389 		if (page_mode < PAGE_CUR_CONTAIN) {
1390 			need_path = false;
1391 		}
1392 
1393 		up_match = 0;
1394 		low_match = 0;
1395 
1396 		if (latch_mode == BTR_MODIFY_TREE
1397 		    || latch_mode == BTR_CONT_MODIFY_TREE
1398 		    || latch_mode == BTR_CONT_SEARCH_TREE) {
			/* The tree is locked; no need for a page lock to
			protect the "path" */
1401 			cursor->rtr_info->need_page_lock = false;
1402 		}
1403         }
1404 
1405 	if (dict_index_is_spatial(index) && page_mode >= PAGE_CUR_CONTAIN) {
1406 		ut_ad(need_path);
1407 		found = rtr_cur_search_with_match(
1408 			block, index, tuple, page_mode, page_cursor,
1409 			cursor->rtr_info);
1410 
1411 		/* Need to use BTR_MODIFY_TREE to do the MBR adjustment */
1412 		if (search_mode == PAGE_CUR_RTREE_INSERT
1413 		    && cursor->rtr_info->mbr_adj) {
1414 			if (latch_mode & BTR_MODIFY_LEAF) {
				/* The parent MBR needs to be updated;
				we should retry with BTR_MODIFY_TREE */
1417 				goto func_exit;
1418 			} else if (latch_mode & BTR_MODIFY_TREE) {
1419 				rtree_parent_modified = true;
1420 				cursor->rtr_info->mbr_adj = false;
1421 				mbr_adj = true;
1422 			} else {
1423 				ut_ad(0);
1424 			}
1425 		}
1426 
1427 		if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) {
1428 			cursor->low_match =
1429 				DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
1430 		}
1431 	} else if (height == 0 && btr_search_enabled
1432 		   && !dict_index_is_spatial(index)) {
1433 		/* The adaptive hash index is only used when searching
1434 		for leaf pages (height==0), but not in r-trees.
1435 		We only need the byte prefix comparison for the purpose
1436 		of updating the adaptive hash index. */
1437 		page_cur_search_with_match_bytes(
1438 			block, index, tuple, page_mode, &up_match, &up_bytes,
1439 			&low_match, &low_bytes, page_cursor);
1440 	} else {
1441 		/* Search for complete index fields. */
1442 		up_bytes = low_bytes = 0;
1443 		page_cur_search_with_match(
1444 			block, index, tuple, page_mode, &up_match,
1445 			&low_match, page_cursor,
1446 			need_path ? cursor->rtr_info : NULL);
1447 	}
1448 
1449 	if (estimate) {
1450 		btr_cur_add_path_info(cursor, height, root_height);
1451 	}
1452 
1453 	/* If this is the desired level, leave the loop */
1454 
1455 	ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor),
1456 					   mtr));
1457 
	/* Add a predicate lock if this is the serializable isolation
	level, and only if it is in the search case */
1460 	if (dict_index_is_spatial(index)
1461 	    && cursor->rtr_info->need_prdt_lock
1462 	    && mode != PAGE_CUR_RTREE_INSERT
1463 	    && mode != PAGE_CUR_RTREE_LOCATE
1464 	    && mode >= PAGE_CUR_CONTAIN) {
1465 		trx_t*		trx = thr_get_trx(cursor->thr);
1466 		lock_prdt_t	prdt;
1467 
1468 		lock_mutex_enter();
1469 		lock_init_prdt_from_mbr(
1470 			&prdt, &cursor->rtr_info->mbr, mode,
1471 			trx->lock.lock_heap);
1472 		lock_mutex_exit();
1473 
1474 		if (rw_latch == RW_NO_LATCH && height != 0) {
1475 			rw_lock_s_lock(&(block->lock));
1476 		}
1477 
1478 		lock_prdt_lock(block, &prdt, index, LOCK_S,
1479 			       LOCK_PREDICATE, cursor->thr, mtr);
1480 
1481 		if (rw_latch == RW_NO_LATCH && height != 0) {
1482 			rw_lock_s_unlock(&(block->lock));
1483 		}
1484 	}
1485 
1486 	if (level != height) {
1487 
1488 		const rec_t*	node_ptr;
1489 		ut_ad(height > 0);
1490 
1491 		height--;
1492 
1493 		node_ptr = page_cur_get_rec(page_cursor);
1494 
1495 		offsets = rec_get_offsets(
1496 			node_ptr, index, offsets, ULINT_UNDEFINED, &heap);
1497 
		/* If the rec is the first or last record in the page, and
		the intention is a pessimistic delete, it might cause a
		node_ptr insert at the upper level. We should change the
		intention and retry.
		*/
1502 		if (latch_mode == BTR_MODIFY_TREE
1503 		    && btr_cur_need_opposite_intention(
1504 			page, lock_intention, node_ptr)) {
1505 
1506 need_opposite_intention:
1507 			ut_ad(upper_rw_latch == RW_X_LATCH);
1508 
1509 			if (n_releases > 0) {
1510 				/* release root block */
1511 				mtr_release_block_at_savepoint(
1512 					mtr, tree_savepoints[0],
1513 					tree_blocks[0]);
1514 			}
1515 
1516 			/* release all blocks */
1517 			for (; n_releases <= n_blocks; n_releases++) {
1518 				mtr_release_block_at_savepoint(
1519 					mtr, tree_savepoints[n_releases],
1520 					tree_blocks[n_releases]);
1521 			}
1522 
1523 			lock_intention = BTR_INTENTION_BOTH;
1524 
1525 			page_id.reset(space, dict_index_get_page(index));
1526 			up_match = 0;
1527 			low_match = 0;
1528 			height = ULINT_UNDEFINED;
1529 
1530 			n_blocks = 0;
1531 			n_releases = 0;
1532 
1533 			goto search_loop;
1534 		}
1535 
1536 		if (dict_index_is_spatial(index)) {
1537 			if (page_rec_is_supremum(node_ptr)) {
1538 				cursor->low_match = 0;
1539 				cursor->up_match = 0;
1540 				goto func_exit;
1541 			}
1542 
1543 			/* If we are doing insertion or record locating,
1544 			remember the tree nodes we visited */
1545 			if (page_mode == PAGE_CUR_RTREE_INSERT
1546 			    || (search_mode == PAGE_CUR_RTREE_LOCATE
1547 			        && (latch_mode != BTR_MODIFY_LEAF))) {
1548 				bool		add_latch = false;
1549 
1550 				if (latch_mode == BTR_MODIFY_TREE
1551 				    && rw_latch == RW_NO_LATCH) {
1552 					ut_ad(mtr_memo_contains_flagged(
1553 						mtr, dict_index_get_lock(index),
1554 						MTR_MEMO_X_LOCK
1555 						| MTR_MEMO_SX_LOCK));
1556 					rw_lock_s_lock(&block->lock);
1557 					add_latch = true;
1558 				}
1559 
1560 				/* Store the parent cursor location */
1561 #ifdef UNIV_DEBUG
1562 				ulint	num_stored = rtr_store_parent_path(
1563 					block, cursor, latch_mode,
1564 					height + 1, mtr);
1565 #else
1566 				rtr_store_parent_path(
1567 					block, cursor, latch_mode,
1568 					height + 1, mtr);
1569 #endif
1570 
1571 				if (page_mode == PAGE_CUR_RTREE_INSERT) {
1572 					btr_pcur_t*     r_cursor =
1573 						rtr_get_parent_cursor(
1574 							cursor, height + 1,
1575 							true);
					/* If it is an insertion, there
					should be only one parent for each
					level traversed */
1579 #ifdef UNIV_DEBUG
1580 					ut_ad(num_stored == 1);
1581 #endif
1582 
1583 					node_ptr = btr_pcur_get_rec(r_cursor);
1584 
1585 				}
1586 
1587 				if (add_latch) {
1588 					rw_lock_s_unlock(&block->lock);
1589 				}
1590 
1591 				ut_ad(!page_rec_is_supremum(node_ptr));
1592 			}
1593 
1594 			ut_ad(page_mode == search_mode
1595 			      || (page_mode == PAGE_CUR_WITHIN
1596 				  && search_mode == PAGE_CUR_RTREE_LOCATE));
1597 
1598 			page_mode = search_mode;
1599 		}
1600 
		/* If the node_ptr is the first or the last record of the
		page, or has the same key value as the first or last record,
		then another page might be chosen under BTR_CONT_MODIFY_TREE.
		So the parent page should not be released, to avoid a
		deadlock from blocking another search with the same key
		value. */
1606 		if (!detected_same_key_root
1607 		    && lock_intention == BTR_INTENTION_BOTH
1608 		    && !dict_index_is_unique(index)
1609 		    && latch_mode == BTR_MODIFY_TREE
1610 		    && (up_match >= rec_offs_n_fields(offsets) - 1
1611 			|| low_match >= rec_offs_n_fields(offsets) - 1)) {
1612 			const rec_t*	first_rec
1613 						= page_rec_get_next_const(
1614 							page_get_infimum_rec(
1615 								page));
1616 			ulint		matched_fields;
1617 
1618 			ut_ad(upper_rw_latch == RW_X_LATCH);
1619 
1620 			if (node_ptr == first_rec
1621 			    || page_rec_is_last(node_ptr, page)) {
1622 				detected_same_key_root = true;
1623 			} else {
1624 				matched_fields = 0;
1625 
1626 				offsets2 = rec_get_offsets(
1627 					first_rec, index, offsets2,
1628 					ULINT_UNDEFINED, &heap);
1629 				cmp_rec_rec_with_match(node_ptr, first_rec,
1630 					offsets, offsets2, index,
1631 					page_is_spatial_non_leaf(first_rec, index),
1632 					false, &matched_fields);
1633 
1634 				if (matched_fields
1635 				    >= rec_offs_n_fields(offsets) - 1) {
1636 					detected_same_key_root = true;
1637 				} else {
1638 					const rec_t*	last_rec;
1639 
1640 					last_rec = page_rec_get_prev_const(
1641 							page_get_supremum_rec(
1642 								page));
1643 
1644 					matched_fields = 0;
1645 
1646 					offsets2 = rec_get_offsets(
1647 						last_rec, index, offsets2,
1648 						ULINT_UNDEFINED, &heap);
1649 					cmp_rec_rec_with_match(
1650 						node_ptr, last_rec,
1651 						offsets, offsets2, index,
1652 						page_is_spatial_non_leaf(last_rec, index),
1653 						false, &matched_fields);
1654 					if (matched_fields
1655 					    >= rec_offs_n_fields(offsets) - 1) {
1656 						detected_same_key_root = true;
1657 					}
1658 				}
1659 			}
1660 		}
1661 
1662 		/* If the page might cause modify_tree,
1663 		we should not release the parent page's lock. */
1664 		if (!detected_same_key_root
1665 		    && latch_mode == BTR_MODIFY_TREE
1666 		    && !btr_cur_will_modify_tree(
1667 				index, page, lock_intention, node_ptr,
1668 				node_ptr_max_size, page_size, mtr)
1669 		    && !rtree_parent_modified) {
1670 			ut_ad(upper_rw_latch == RW_X_LATCH);
1671 			ut_ad(n_releases <= n_blocks);
1672 
1673 			/* we can release upper blocks */
1674 			for (; n_releases < n_blocks; n_releases++) {
1675 				if (n_releases == 0) {
1676 					/* we should not release the root page,
1677 					so that it stays pinned to the same block. */
1678 					continue;
1679 				}
1680 
1681 				/* release unused blocks to unpin */
1682 				mtr_release_block_at_savepoint(
1683 					mtr, tree_savepoints[n_releases],
1684 					tree_blocks[n_releases]);
1685 			}
1686 		}
1687 
1688 		if (height == level
1689 		    && latch_mode == BTR_MODIFY_TREE) {
1690 			ut_ad(upper_rw_latch == RW_X_LATCH);
1691 			/* we should sx-latch the root page if it was
1692 			released already. It contains the seg_header. */
1693 			if (n_releases > 0) {
1694 				mtr_block_sx_latch_at_savepoint(
1695 					mtr, tree_savepoints[0],
1696 					tree_blocks[0]);
1697 			}
1698 
1699 			/* x-latch the branch blocks not released yet. */
1700 			for (ulint i = n_releases; i <= n_blocks; i++) {
1701 				mtr_block_x_latch_at_savepoint(
1702 					mtr, tree_savepoints[i],
1703 					tree_blocks[i]);
1704 			}
1705 		}
1706 
1707 		/* We should consider the prev_page of the parent page, if the
1708 		node_ptr is the leftmost record of the page, because
1709 		BTR_SEARCH_PREV and BTR_MODIFY_PREV latch the prev_page of the leaf page. */
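		/* If the node pointer is the leftmost record on every level
		from leftmost_from_level down to the leaf, the previous leaf
		page hangs under a different parent page, so the descent below
		is restarted from level == leftmost_from_level so that the
		prev_page blocks on those levels can be latched as well
		(tracked in prev_tree_blocks / prev_tree_savepoints). */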
1710 		if ((latch_mode == BTR_SEARCH_PREV
1711 		     || latch_mode == BTR_MODIFY_PREV)
1712 		    && !retrying_for_search_prev) {
1713 			/* block should be latched for consistent
1714 			   btr_page_get_prev() */
1715 			ut_ad(mtr_memo_contains_flagged(mtr, block,
1716 				MTR_MEMO_PAGE_S_FIX
1717 				| MTR_MEMO_PAGE_X_FIX));
1718 
1719 			if (btr_page_get_prev(page, mtr) != FIL_NULL
1720 			    && page_rec_is_first(node_ptr, page)) {
1721 
1722 				if (leftmost_from_level == 0) {
1723 					leftmost_from_level = height + 1;
1724 				}
1725 			} else {
1726 				leftmost_from_level = 0;
1727 			}
1728 
1729 			if (height == 0 && leftmost_from_level > 0) {
1730 				/* we should retry so that we also get the
1731 				prev_page from level == leftmost_from_level. */
1732 				retrying_for_search_prev = true;
1733 
1734 				prev_tree_blocks = static_cast<buf_block_t**>(
1735 					ut_malloc_nokey(sizeof(buf_block_t*)
1736 							* leftmost_from_level));
1737 
1738 				prev_tree_savepoints = static_cast<ulint*>(
1739 					ut_malloc_nokey(sizeof(ulint)
1740 							* leftmost_from_level));
1741 
1742 				/* back to the level (leftmost_from_level+1) */
1743 				ulint	idx = n_blocks
1744 					- (leftmost_from_level - 1);
1745 
1746 				page_id.reset(
1747 					space,
1748 					tree_blocks[idx]->page.id.page_no());
1749 
1750 				for (ulint i = n_blocks
1751 					       - (leftmost_from_level - 1);
1752 				     i <= n_blocks; i++) {
1753 					mtr_release_block_at_savepoint(
1754 						mtr, tree_savepoints[i],
1755 						tree_blocks[i]);
1756 				}
1757 
1758 				n_blocks -= (leftmost_from_level - 1);
1759 				height = leftmost_from_level;
1760 				ut_ad(n_releases == 0);
1761 
1762 				/* replay up_match, low_match */
1763 				up_match = 0;
1764 				low_match = 0;
1765 				rtr_info_t*	rtr_info	= need_path
1766 					? cursor->rtr_info : NULL;
1767 
1768 				for (ulint i = 0; i < n_blocks; i++) {
1769 					page_cur_search_with_match(
1770 						tree_blocks[i], index, tuple,
1771 						page_mode, &up_match,
1772 						&low_match, page_cursor,
1773 						rtr_info);
1774 				}
1775 
1776 				goto search_loop;
1777 			}
1778 		}
1779 
1780 		/* Go to the child node */
1781 		page_id.reset(
1782 			space,
1783 			btr_node_ptr_get_child_page_no(node_ptr, offsets));
1784 
1785 		n_blocks++;
1786 
1787 		if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
1788 			/* We're doing a search on an ibuf tree and we're one
1789 			level above the leaf page. */
1790 
1791 			ut_ad(level == 0);
1792 
1793 			buf_mode = BUF_GET;
1794 			rw_latch = RW_NO_LATCH;
1795 			goto retry_page_get;
1796 		}
1797 
1798 		if (dict_index_is_spatial(index)
1799 		    && page_mode >= PAGE_CUR_CONTAIN
1800 		    && page_mode != PAGE_CUR_RTREE_INSERT) {
1801 			ut_ad(need_path);
1802 			rtr_node_path_t* path =
1803 				cursor->rtr_info->path;
1804 
1805 			if (!path->empty() && found) {
1806 #ifdef UNIV_DEBUG
1807 				node_visit_t    last_visit = path->back();
1808 
1809 				ut_ad(last_visit.page_no == page_id.page_no());
1810 #endif /* UNIV_DEBUG */
1811 
1812 				path->pop_back();
1813 
1814 #ifdef UNIV_DEBUG
1815 				if (page_mode == PAGE_CUR_RTREE_LOCATE
1816 				    && (latch_mode != BTR_MODIFY_LEAF)) {
1817 					btr_pcur_t*	cur
1818 					= cursor->rtr_info->parent_path->back(
1819 					  ).cursor;
1820 					rec_t*	my_node_ptr
1821 						= btr_pcur_get_rec(cur);
1822 
1823 					offsets = rec_get_offsets(
1824 						my_node_ptr, index, offsets,
1825 						ULINT_UNDEFINED, &heap);
1826 
1827 					ulint	my_page_no
1828 					= btr_node_ptr_get_child_page_no(
1829 						my_node_ptr, offsets);
1830 
1831 					ut_ad(page_id.page_no() == my_page_no);
1832 
1833 				}
1834 #endif
1835 			}
1836 		}
1837 
1838 		goto search_loop;
1839 	} else if (!dict_index_is_spatial(index)
1840 		   && latch_mode == BTR_MODIFY_TREE
1841 		   && lock_intention == BTR_INTENTION_INSERT
1842 		   && mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
1843 		   && page_rec_is_last(page_cur_get_rec(page_cursor), page)) {
1844 
1845 		/* btr_insert_into_right_sibling() might cause the node_ptr
1846 		at the upper level to be deleted */
1847 
1848 		if (height == 0) {
1849 			/* release the leaf pages if latched */
1850 			for (uint i = 0; i < 3; i++) {
1851 				if (latch_leaves.blocks[i] != NULL) {
1852 					mtr_release_block_at_savepoint(
1853 						mtr, latch_leaves.savepoints[i],
1854 						latch_leaves.blocks[i]);
1855 					latch_leaves.blocks[i] = NULL;
1856 				}
1857 			}
1858 		}
1859 
1860 		goto need_opposite_intention;
1861 	}
1862 
1863 	if (level != 0) {
1864 		if (upper_rw_latch == RW_NO_LATCH) {
1865 			/* latch the page */
1866 			buf_block_t*	child_block;
1867 
1868 			if (latch_mode == BTR_CONT_MODIFY_TREE) {
1869 				child_block = btr_block_get(
1870 					page_id, page_size, RW_X_LATCH,
1871 					index, mtr);
1872 			} else {
1873 				ut_ad(latch_mode == BTR_CONT_SEARCH_TREE);
1874 				child_block = btr_block_get(
1875 					page_id, page_size, RW_SX_LATCH,
1876 					index, mtr);
1877 			}
1878 
1879 			btr_assert_not_corrupted(child_block, index);
1880 		} else {
1881 			ut_ad(mtr_memo_contains(mtr, block, upper_rw_latch));
1882 			btr_assert_not_corrupted(block, index);
1883 
1884 			if (s_latch_by_caller) {
1885 				ut_ad(latch_mode == BTR_SEARCH_TREE);
1886 				/* the caller should have sx-latched the index
1887 				to exclude tree-modifying operations. */
1888 				ut_ad(mtr_memo_contains(
1889 					mtr, dict_index_get_lock(index),
1890 					MTR_MEMO_SX_LOCK));
1891 				/* because the index sx-latch is held, the
1892 				upper blocks can be released. */
1893 				for (; n_releases < n_blocks; n_releases++) {
1894 					mtr_release_block_at_savepoint(
1895 						mtr,
1896 						tree_savepoints[n_releases],
1897 						tree_blocks[n_releases]);
1898 				}
1899 			}
1900 		}
1901 
1902 		if (page_mode <= PAGE_CUR_LE) {
1903 			cursor->low_match = low_match;
1904 			cursor->up_match = up_match;
1905 		}
1906 	} else {
1907 		cursor->low_match = low_match;
1908 		cursor->low_bytes = low_bytes;
1909 		cursor->up_match = up_match;
1910 		cursor->up_bytes = up_bytes;
1911 
1912 		/* We do a dirty read of btr_search_enabled here.  We
1913 		will properly check btr_search_enabled again in
1914 		btr_search_build_page_hash_index() before building a
1915 		page hash index, while holding search latch. */
1916 		if (btr_search_enabled && !index->disable_ahi) {
1917 			btr_search_info_update(index, cursor);
1918 		}
1919 		ut_ad(cursor->up_match != ULINT_UNDEFINED
1920 		      || mode != PAGE_CUR_GE);
1921 		ut_ad(cursor->up_match != ULINT_UNDEFINED
1922 		      || mode != PAGE_CUR_LE);
1923 		ut_ad(cursor->low_match != ULINT_UNDEFINED
1924 		      || mode != PAGE_CUR_LE);
1925 	}
1926 
1927 	/* For a spatial index, remember what blocks are still latched */
1928 	if (dict_index_is_spatial(index)
1929 	    && (latch_mode == BTR_MODIFY_TREE
1930 		|| latch_mode == BTR_MODIFY_LEAF)) {
1931 		for (ulint i = 0; i < n_releases; i++) {
1932 			cursor->rtr_info->tree_blocks[i] = NULL;
1933 			cursor->rtr_info->tree_savepoints[i] = 0;
1934 		}
1935 
1936 		for (ulint i = n_releases; i <= n_blocks; i++) {
1937 			cursor->rtr_info->tree_blocks[i] = tree_blocks[i];
1938 			cursor->rtr_info->tree_savepoints[i] = tree_savepoints[i];
1939 		}
1940 	}
1941 
1942 func_exit:
1943 
1944 	if (UNIV_LIKELY_NULL(heap)) {
1945 		mem_heap_free(heap);
1946 	}
1947 
1948 	if (retrying_for_search_prev) {
1949 		ut_free(prev_tree_blocks);
1950 		ut_free(prev_tree_savepoints);
1951 	}
1952 
1953 	if (has_search_latch) {
1954 
1955 		rw_lock_s_lock(btr_get_search_latch(index));
1956 	}
1957 
1958 	if (mbr_adj) {
1959 		/* remember that we will need to adjust parent MBR */
1960 		cursor->rtr_info->mbr_adj = true;
1961 	}
1962 
1963 	DBUG_VOID_RETURN;
1964 }
1965 
1966 /** Searches an index tree and positions a tree cursor on a given level.
1967 This function will avoid latching the traversal path and so should be
1968 used only for cases wherein latching is not needed.
1969 
1970 @param[in,out]	index	index
1971 @param[in]	level	the tree level of search
1972 @param[in]	tuple	data tuple; Note: n_fields_cmp in tuple must be set
1973 			so that it cannot get compared to the node ptr page number field
1974 @param[in]	mode	PAGE_CUR_L, ....
1975 			Insert should always be made using PAGE_CUR_LE
1976 			to search the position.
1977 @param[in,out]	cursor	tree cursor; points to record of interest.
1978 @param[in]	file	file name
1979 @param[in]	line	line where called from
1980 @param[in,out]	mtr	mtr
1981 @param[in]	mark_dirty
1982 			if true then mark the block as dirty */
1983 void
1984 btr_cur_search_to_nth_level_with_no_latch(
1985 	dict_index_t*		index,
1986 	ulint			level,
1987 	const dtuple_t*		tuple,
1988 	page_cur_mode_t		mode,
1989 	btr_cur_t*		cursor,
1990 	const char*		file,
1991 	ulint			line,
1992 	mtr_t*			mtr,
1993 	bool			mark_dirty)
1994 {
1995 	page_t*		page = NULL; /* remove warning */
1996 	buf_block_t*	block;
1997 	ulint		height;
1998 	ulint		up_match;
1999 	ulint		low_match;
2000 	ulint		rw_latch;
2001 	page_cur_mode_t	page_mode;
2002 	ulint		buf_mode;
2003 	page_cur_t*	page_cursor;
2004 	ulint		root_height = 0; /* remove warning */
2005 	ulint		n_blocks = 0;
2006 
2007 	mem_heap_t*	heap		= NULL;
2008 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
2009 	ulint*		offsets		= offsets_;
2010 	rec_offs_init(offsets_);
2011 
2012 	DBUG_ENTER("btr_cur_search_to_nth_level_with_no_latch");
2013 
2014 	ut_ad(dict_table_is_intrinsic(index->table));
2015 	ut_ad(level == 0 || mode == PAGE_CUR_LE);
2016 	ut_ad(dict_index_check_search_tuple(index, tuple));
2017 	ut_ad(dtuple_check_typed(tuple));
2018 	ut_ad(index->page != FIL_NULL);
2019 
2020 	UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
2021 	UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
2022 #ifdef UNIV_DEBUG
2023 	cursor->up_match = ULINT_UNDEFINED;
2024 	cursor->low_match = ULINT_UNDEFINED;
2025 #endif /* UNIV_DEBUG */
2026 
2027 	cursor->flag = BTR_CUR_BINARY;
2028 	cursor->index = index;
2029 
2030 	page_cursor = btr_cur_get_page_cur(cursor);
2031 
2032         const ulint		space = dict_index_get_space(index);
2033         const page_size_t	page_size(dict_table_page_size(index->table));
2034         /* Start with the root page. */
2035         page_id_t		page_id(space, dict_index_get_page(index));
2036 
2037 	up_match = 0;
2038 	low_match = 0;
2039 
2040 	height = ULINT_UNDEFINED;
2041 
2042 	/* We use these modified search modes on non-leaf levels of the
2043 	B-tree. These let us end up in the right B-tree leaf. In that leaf
2044 	we use the original search mode. */
2045 
2046 	switch (mode) {
2047 	case PAGE_CUR_GE:
2048 		page_mode = PAGE_CUR_L;
2049 		break;
2050 	case PAGE_CUR_G:
2051 		page_mode = PAGE_CUR_LE;
2052 		break;
2053 	default:
2054 		page_mode = mode;
2055 		break;
2056 	}
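	/* For example, a PAGE_CUR_GE search descends with PAGE_CUR_L on the
	non-leaf levels: when the search tuple equals a node pointer key,
	records with that key may still sit at the end of the preceding child
	page, so the descent has to err towards the left to be able to find
	the first qualifying record at the leaf level. */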
2057 
2058 	/* Loop and search until we arrive at the desired level */
2059 	bool at_desired_level = false;
2060 	while (!at_desired_level) {
2061 		buf_mode = BUF_GET;
2062 		rw_latch = RW_NO_LATCH;
2063 
2064 		ut_ad(n_blocks < BTR_MAX_LEVELS);
2065 
2066 		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2067 				buf_mode, file, line, mtr, mark_dirty);
2068 
2069 		page = buf_block_get_frame(block);
2070 
2071 		if (height == ULINT_UNDEFINED) {
2072 			/* We are in the root node */
2073 
2074 			height = btr_page_get_level(page, mtr);
2075 			root_height = height;
2076 			cursor->tree_height = root_height + 1;
2077 		}
2078 
2079 		if (height == 0) {
2080 			/* On leaf level. Switch back to original search mode.*/
2081 			page_mode = mode;
2082 		}
2083 
2084 		page_cur_search_with_match(
2085 				block, index, tuple, page_mode, &up_match,
2086 				&low_match, page_cursor, NULL);
2087 
2088 		ut_ad(height == btr_page_get_level(
2089 			page_cur_get_page(page_cursor), mtr));
2090 
2091 		if (level != height) {
2092 
2093 			const rec_t*	node_ptr;
2094 			ut_ad(height > 0);
2095 
2096 			height--;
2097 
2098 			node_ptr = page_cur_get_rec(page_cursor);
2099 
2100 			offsets = rec_get_offsets(
2101 					node_ptr, index, offsets,
2102 					ULINT_UNDEFINED, &heap);
2103 
2104 			/* Go to the child node */
2105 			page_id.reset(space, btr_node_ptr_get_child_page_no(
2106 				node_ptr, offsets));
2107 
2108 			n_blocks++;
2109 		} else {
2110 			/* If this is the desired level, leave the loop */
2111 			at_desired_level = true;
2112 		}
2113 	}
2114 
2115 	cursor->low_match = low_match;
2116 	cursor->up_match = up_match;
2117 
2118 	if (heap != NULL) {
2119 		mem_heap_free(heap);
2120 	}
2121 
2122 	DBUG_VOID_RETURN;
2123 }
2124 
2125 /*****************************************************************//**
2126 Opens a cursor at either end of an index. */
2127 void
2128 btr_cur_open_at_index_side_func(
2129 /*============================*/
2130 	bool		from_left,	/*!< in: true if open to the low end,
2131 					false if to the high end */
2132 	dict_index_t*	index,		/*!< in: index */
2133 	ulint		latch_mode,	/*!< in: latch mode */
2134 	btr_cur_t*	cursor,		/*!< in/out: cursor */
2135 	ulint		level,		/*!< in: level to search for
2136 					(0=leaf). */
2137 	const char*	file,		/*!< in: file name */
2138 	ulint		line,		/*!< in: line where called */
2139 	mtr_t*		mtr)		/*!< in/out: mini-transaction */
2140 {
2141 	page_cur_t*	page_cursor;
2142 	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
2143 	ulint		height;
2144 	ulint		root_height = 0; /* remove warning */
2145 	rec_t*		node_ptr;
2146 	ulint		estimate;
2147 	ulint		savepoint;
2148 	ulint		upper_rw_latch, root_leaf_rw_latch;
2149 	btr_intention_t	lock_intention;
2150 	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
2151 	ulint		tree_savepoints[BTR_MAX_LEVELS];
2152 	ulint		n_blocks = 0;
2153 	ulint		n_releases = 0;
2154 	mem_heap_t*	heap		= NULL;
2155 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
2156 	ulint*		offsets		= offsets_;
2157 	rec_offs_init(offsets_);
2158 
2159 	estimate = latch_mode & BTR_ESTIMATE;
2160 	latch_mode &= ~BTR_ESTIMATE;
2161 
2162 	ut_ad(level != ULINT_UNDEFINED);
2163 
2164 	bool	s_latch_by_caller;
2165 
2166 	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
2167 	latch_mode &= ~BTR_ALREADY_S_LATCHED;
2168 
2169 	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2170 
2171 	ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2172 
2173 	/* This function does not need to latch the left page of the leaf page */
2174 	if (latch_mode == BTR_SEARCH_PREV) {
2175 		latch_mode = BTR_SEARCH_LEAF;
2176 	} else if (latch_mode == BTR_MODIFY_PREV) {
2177 		latch_mode = BTR_MODIFY_LEAF;
2178 	}
2179 
2180 	/* Store the position of the tree latch we push to mtr so that we
2181 	know how to release it when we have latched the leaf node */
2182 
2183 	savepoint = mtr_set_savepoint(mtr);
2184 
2185 	switch (latch_mode) {
2186 	case BTR_CONT_MODIFY_TREE:
2187 	case BTR_CONT_SEARCH_TREE:
2188 		upper_rw_latch = RW_NO_LATCH;
2189 		break;
2190 	case BTR_MODIFY_TREE:
2191 		/* Most delete-intended operations are purges.
2192 		Free blocks and read IO bandwidth should be prioritized
2193 		for them when the history list is growing huge. */
2194 		if (lock_intention == BTR_INTENTION_DELETE
2195 		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
2196 		    && buf_get_n_pending_read_ios()) {
2197 			mtr_x_lock(dict_index_get_lock(index), mtr);
2198 		} else {
2199 			mtr_sx_lock(dict_index_get_lock(index), mtr);
2200 		}
2201 		upper_rw_latch = RW_X_LATCH;
2202 		break;
2203 	default:
2204 		ut_ad(!s_latch_by_caller
2205 		      || mtr_memo_contains_flagged(mtr,
2206 						 dict_index_get_lock(index),
2207 						 MTR_MEMO_SX_LOCK
2208 						 | MTR_MEMO_S_LOCK));
2209 		if (!srv_read_only_mode) {
2210 			if (!s_latch_by_caller) {
2211 				/* BTR_SEARCH_TREE is intended to be used with
2212 				BTR_ALREADY_S_LATCHED */
2213 				ut_ad(latch_mode != BTR_SEARCH_TREE);
2214 
2215 				mtr_s_lock(dict_index_get_lock(index), mtr);
2216 			}
2217 			upper_rw_latch = RW_S_LATCH;
2218 		} else {
2219 			upper_rw_latch = RW_NO_LATCH;
2220 		}
2221 	}
2222 	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
2223 
2224 	page_cursor = btr_cur_get_page_cur(cursor);
2225 	cursor->index = index;
2226 
2227 	page_id_t		page_id(dict_index_get_space(index),
2228 					dict_index_get_page(index));
2229 	const page_size_t&	page_size = dict_table_page_size(index->table);
2230 
2231 	if (root_leaf_rw_latch == RW_X_LATCH) {
2232 		node_ptr_max_size = dict_index_node_ptr_max_size(index);
2233 	}
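	/* Otherwise the conservative default of UNIV_PAGE_SIZE / 2 set above
	is kept; the exact maximum is only consumed by
	btr_cur_will_modify_tree() below, i.e. under BTR_MODIFY_TREE. */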
2234 
2235 	height = ULINT_UNDEFINED;
2236 
2237 	for (;;) {
2238 		buf_block_t*	block;
2239 		page_t*		page;
2240 		ulint		rw_latch;
2241 
2242 		ut_ad(n_blocks < BTR_MAX_LEVELS);
2243 
2244 		if (height != 0
2245 		    && (latch_mode != BTR_MODIFY_TREE
2246 			|| height == level)) {
2247 			rw_latch = upper_rw_latch;
2248 		} else {
2249 			rw_latch = RW_NO_LATCH;
2250 		}
2251 
2252 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2253 		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2254 					 BUF_GET, file, line, mtr);
2255 		tree_blocks[n_blocks] = block;
2256 
2257 		page = buf_block_get_frame(block);
2258 
2259 		if (height == ULINT_UNDEFINED
2260 		    && btr_page_get_level(page, mtr) == 0
2261 		    && rw_latch != RW_NO_LATCH
2262 		    && rw_latch != root_leaf_rw_latch) {
2263 			/* We should retry to get the page, because the root
2264 			page is a leaf but was latched with a non-leaf latch mode. */
2265 			ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
2266 			ut_ad(rw_latch == RW_S_LATCH);
2267 
2268 			ut_ad(n_blocks == 0);
2269 			mtr_release_block_at_savepoint(
2270 				mtr, tree_savepoints[n_blocks],
2271 				tree_blocks[n_blocks]);
2272 
2273 			upper_rw_latch = root_leaf_rw_latch;
2274 			continue;
2275 		}
2276 
2277 		ut_ad(fil_page_index_page_check(page));
2278 		ut_ad(index->id == btr_page_get_index_id(page));
2279 
2280 		if (height == ULINT_UNDEFINED) {
2281 			/* We are in the root node */
2282 
2283 			height = btr_page_get_level(page, mtr);
2284 			root_height = height;
2285 			ut_a(height >= level);
2286 		} else {
2287 			/* TODO: flag the index corrupted if this fails */
2288 			ut_ad(height == btr_page_get_level(page, mtr));
2289 		}
2290 
2291 		if (height == level) {
2292 			if (srv_read_only_mode) {
2293 				btr_cur_latch_leaves(
2294 					block, page_id, page_size,
2295 					latch_mode, cursor, mtr);
2296 			} else if (height == 0) {
2297 				if (rw_latch == RW_NO_LATCH) {
2298 					btr_cur_latch_leaves(
2299 						block, page_id, page_size,
2300 						latch_mode, cursor, mtr);
2301 				}
2302 				/* In versions <= 3.23.52 we had
2303 				forgotten to release the tree latch
2304 				here. If in an index scan we had to
2305 				scan far to find a record visible to
2306 				the current transaction, that could
2307 				starve others waiting for the tree
2308 				latch. */
2309 
2310 				switch (latch_mode) {
2311 				case BTR_MODIFY_TREE:
2312 				case BTR_CONT_MODIFY_TREE:
2313 				case BTR_CONT_SEARCH_TREE:
2314 					break;
2315 				default:
2316 					if (!s_latch_by_caller) {
2317 						/* Release the tree s-latch */
2318 						mtr_release_s_latch_at_savepoint(
2319 							mtr, savepoint,
2320 							dict_index_get_lock(
2321 								index));
2322 					}
2323 
2324 					/* release upper blocks */
2325 					for (; n_releases < n_blocks;
2326 					     n_releases++) {
2327 						mtr_release_block_at_savepoint(
2328 							mtr,
2329 							tree_savepoints[
2330 								n_releases],
2331 							tree_blocks[
2332 								n_releases]);
2333 					}
2334 				}
2335 			} else { /* height != 0 */
2336 				/* We already have the block latched. */
2337 				ut_ad(latch_mode == BTR_SEARCH_TREE);
2338 				ut_ad(s_latch_by_caller);
2339 				ut_ad(upper_rw_latch == RW_S_LATCH);
2340 
2341 				ut_ad(mtr_memo_contains(mtr, block,
2342 							upper_rw_latch));
2343 
2344 				if (s_latch_by_caller) {
2345 					/* the caller should have sx-latched the
2346 					index to exclude tree-modifying operations. */
2347 					ut_ad(mtr_memo_contains(
2348 						mtr,
2349 						dict_index_get_lock(index),
2350 						MTR_MEMO_SX_LOCK));
2351 					/* because the index sx-latch is held,
2352 					the upper blocks can be released. */
2353 					for (; n_releases < n_blocks;
2354 					     n_releases++) {
2355 						mtr_release_block_at_savepoint(
2356 							mtr,
2357 							tree_savepoints[
2358 								n_releases],
2359 							tree_blocks[
2360 								n_releases]);
2361 					}
2362 				}
2363 			}
2364 		}
2365 
2366 		if (from_left) {
2367 			page_cur_set_before_first(block, page_cursor);
2368 		} else {
2369 			page_cur_set_after_last(block, page_cursor);
2370 		}
2371 
2372 		if (height == level) {
2373 			if (estimate) {
2374 				btr_cur_add_path_info(cursor, height,
2375 						      root_height);
2376 			}
2377 
2378 			break;
2379 		}
2380 
2381 		ut_ad(height > 0);
2382 
2383 		if (from_left) {
2384 			page_cur_move_to_next(page_cursor);
2385 		} else {
2386 			page_cur_move_to_prev(page_cursor);
2387 		}
2388 
2389 		if (estimate) {
2390 			btr_cur_add_path_info(cursor, height, root_height);
2391 		}
2392 
2393 		height--;
2394 
2395 		node_ptr = page_cur_get_rec(page_cursor);
2396 		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2397 					  ULINT_UNDEFINED, &heap);
2398 
2399 		/* If the rec is the first or the last on the page and the
2400 		intention is a pessimistic delete, it might cause a node_ptr
2401 		insert at the upper level. We should change the intention
2402 		and retry. */
2403 		if (latch_mode == BTR_MODIFY_TREE
2404 		    && btr_cur_need_opposite_intention(
2405 			page, lock_intention, node_ptr)) {
2406 
2407 			ut_ad(upper_rw_latch == RW_X_LATCH);
2408 			/* release all blocks */
2409 			for (; n_releases <= n_blocks; n_releases++) {
2410 				mtr_release_block_at_savepoint(
2411 					mtr, tree_savepoints[n_releases],
2412 					tree_blocks[n_releases]);
2413 			}
2414 
2415 			lock_intention = BTR_INTENTION_BOTH;
2416 
2417 			page_id.set_page_no(dict_index_get_page(index));
2418 
2419 			height = ULINT_UNDEFINED;
2420 
2421 			n_blocks = 0;
2422 			n_releases = 0;
2423 
2424 			continue;
2425 		}
2426 
2427 		if (latch_mode == BTR_MODIFY_TREE
2428 		    && !btr_cur_will_modify_tree(
2429 				cursor->index, page, lock_intention, node_ptr,
2430 				node_ptr_max_size, page_size, mtr)) {
2431 			ut_ad(upper_rw_latch == RW_X_LATCH);
2432 			ut_ad(n_releases <= n_blocks);
2433 
2434 			/* we can release upper blocks */
2435 			for (; n_releases < n_blocks; n_releases++) {
2436 				if (n_releases == 0) {
2437 					/* we should not release the root page,
2438 					so that it stays pinned to the same block. */
2439 					continue;
2440 				}
2441 
2442 				/* release unused blocks to unpin */
2443 				mtr_release_block_at_savepoint(
2444 					mtr, tree_savepoints[n_releases],
2445 					tree_blocks[n_releases]);
2446 			}
2447 		}
2448 
2449 		if (height == level
2450 		    && latch_mode == BTR_MODIFY_TREE) {
2451 			ut_ad(upper_rw_latch == RW_X_LATCH);
2452 			/* we should sx-latch the root page if it was
2453 			released already. It contains the seg_header. */
2454 			if (n_releases > 0) {
2455 				mtr_block_sx_latch_at_savepoint(
2456 					mtr, tree_savepoints[0],
2457 					tree_blocks[0]);
2458 			}
2459 
2460 			/* x-latch the branch blocks not released yet. */
2461 			for (ulint i = n_releases; i <= n_blocks; i++) {
2462 				mtr_block_x_latch_at_savepoint(
2463 					mtr, tree_savepoints[i],
2464 					tree_blocks[i]);
2465 			}
2466 		}
2467 
2468 		/* Go to the child node */
2469 		page_id.set_page_no(
2470 			btr_node_ptr_get_child_page_no(node_ptr, offsets));
2471 
2472 		n_blocks++;
2473 	}
2474 
2475 	if (heap) {
2476 		mem_heap_free(heap);
2477 	}
2478 }
2479 
2480 /** Opens a cursor at either end of an index.
2481 Avoids taking latches on buffer pool pages; just pins them (by incrementing
2482 fix_count) to keep them in the buffer pool. This mode is used for intrinsic
2483 tables, as they are not shared and so there is no need for latching.
2484 @param[in]	from_left	true if open to low end, false if open
2485 				to high end.
2486 @param[in]	index		index
2487 @param[in,out]	cursor		cursor
2488 @param[in]	file		file name
2489 @param[in]	line		line where called
2490 @param[in,out]	mtr		mini-transaction
2491 */
2492 void
2493 btr_cur_open_at_index_side_with_no_latch_func(
2494 	bool		from_left,
2495 	dict_index_t*	index,
2496 	btr_cur_t*	cursor,
2497 	ulint		level,
2498 	const char*	file,
2499 	ulint		line,
2500 	mtr_t*		mtr)
2501 {
2502 	page_cur_t*	page_cursor;
2503 	ulint		height;
2504 	rec_t*		node_ptr;
2505 	ulint		n_blocks = 0;
2506 	mem_heap_t*	heap		= NULL;
2507 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
2508 	ulint*		offsets		= offsets_;
2509 	rec_offs_init(offsets_);
2510 
2511 	ut_ad(level != ULINT_UNDEFINED);
2512 
2513 	page_cursor = btr_cur_get_page_cur(cursor);
2514 	cursor->index = index;
2515 	page_id_t		page_id(dict_index_get_space(index),
2516 					dict_index_get_page(index));
2517 	const page_size_t&	page_size = dict_table_page_size(index->table);
2518 
2519 	height = ULINT_UNDEFINED;
2520 
2521 	for (;;) {
2522 		buf_block_t*	block;
2523 		page_t*		page;
2524 		ulint		rw_latch = RW_NO_LATCH;
2525 
2526 		ut_ad(n_blocks < BTR_MAX_LEVELS);
2527 
2528 		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2529 					 BUF_GET, file, line, mtr);
2530 
2531 		page = buf_block_get_frame(block);
2532 
2533 		ut_ad(fil_page_index_page_check(page));
2534 		ut_ad(index->id == btr_page_get_index_id(page));
2535 
2536 		if (height == ULINT_UNDEFINED) {
2537 			/* We are in the root node */
2538 
2539 			height = btr_page_get_level(page, mtr);
2540 			ut_a(height >= level);
2541 		} else {
2542 			/* TODO: flag the index corrupted if this fails */
2543 			ut_ad(height == btr_page_get_level(page, mtr));
2544 		}
2545 
2546 		if (from_left) {
2547 			page_cur_set_before_first(block, page_cursor);
2548 		} else {
2549 			page_cur_set_after_last(block, page_cursor);
2550 		}
2551 
2552 		if (height == level) {
2553 			break;
2554 		}
2555 
2556 		ut_ad(height > 0);
2557 
2558 		if (from_left) {
2559 			page_cur_move_to_next(page_cursor);
2560 		} else {
2561 			page_cur_move_to_prev(page_cursor);
2562 		}
2563 
2564 		height--;
2565 
2566 		node_ptr = page_cur_get_rec(page_cursor);
2567 		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2568 					  ULINT_UNDEFINED, &heap);
2569 
2570 		/* Go to the child node */
2571 		page_id.set_page_no(
2572 			btr_node_ptr_get_child_page_no(node_ptr, offsets));
2573 
2574 		n_blocks++;
2575 	}
2576 
2577 	if (heap != NULL) {
2578 		mem_heap_free(heap);
2579 	}
2580 }
2581 
2582 /**********************************************************************//**
2583 Positions a cursor at a randomly chosen position within a B-tree.
2584 @return true if the index is available and we have put the cursor, false
2585 if the index is unavailable */
2586 bool
2587 btr_cur_open_at_rnd_pos_func(
2588 /*=========================*/
2589 	dict_index_t*	index,		/*!< in: index */
2590 	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
2591 	btr_cur_t*	cursor,		/*!< in/out: B-tree cursor */
2592 	const char*	file,		/*!< in: file name */
2593 	ulint		line,		/*!< in: line where called */
2594 	mtr_t*		mtr)		/*!< in: mtr */
2595 {
2596 	page_cur_t*	page_cursor;
2597 	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
2598 	ulint		height;
2599 	rec_t*		node_ptr;
2600 	ulint		savepoint;
2601 	ulint		upper_rw_latch, root_leaf_rw_latch;
2602 	btr_intention_t	lock_intention;
2603 	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
2604 	ulint		tree_savepoints[BTR_MAX_LEVELS];
2605 	ulint		n_blocks = 0;
2606 	ulint		n_releases = 0;
2607 	mem_heap_t*	heap		= NULL;
2608 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
2609 	ulint*		offsets		= offsets_;
2610 	rec_offs_init(offsets_);
2611 
2612 	ut_ad(!dict_index_is_spatial(index));
2613 
2614 	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2615 
2616 	ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2617 
2618 	savepoint = mtr_set_savepoint(mtr);
2619 
2620 	switch (latch_mode) {
2621 	case BTR_MODIFY_TREE:
2622 		/* Most delete-intended operations are purges.
2623 		Free blocks and read IO bandwidth should be prioritized
2624 		for them when the history list is growing huge. */
2625 		if (lock_intention == BTR_INTENTION_DELETE
2626 		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
2627 		    && buf_get_n_pending_read_ios()) {
2628 			mtr_x_lock(dict_index_get_lock(index), mtr);
2629 		} else {
2630 			mtr_sx_lock(dict_index_get_lock(index), mtr);
2631 		}
2632 		upper_rw_latch = RW_X_LATCH;
2633 		break;
2634 	case BTR_SEARCH_PREV:
2635 	case BTR_MODIFY_PREV:
2636 		/* This function does not support taking the left
2637 		   uncle page latch, which would be needed for the
2638 		   left leaf page latch. */
2639 	case BTR_SEARCH_TREE:
2640 	case BTR_CONT_MODIFY_TREE:
2641 	case BTR_CONT_SEARCH_TREE:
2642 		ut_ad(0);
2643 		/* fall through */
2644 	default:
2645 		if (!srv_read_only_mode) {
2646 			mtr_s_lock(dict_index_get_lock(index), mtr);
2647 			upper_rw_latch = RW_S_LATCH;
2648 		} else {
2649 			upper_rw_latch = RW_NO_LATCH;
2650 		}
2651 	}
2652 
2653 	DBUG_EXECUTE_IF("test_index_is_unavailable",
2654 			return(false););
2655 
2656 	if (index->page == FIL_NULL) {
2657 		/* Since we were not holding the index lock until just now,
2658 		the index could have been modified by others. For example,
2659 		if this is a statistics updater for a referenced table, the
2660 		index could have been marked as unavailable by 'DROP TABLE'
2661 		in the meantime, since we hold no lock for the statistics updater. */
2662 		return(false);
2663 	}
2664 
2665 	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
2666 
2667 	page_cursor = btr_cur_get_page_cur(cursor);
2668 	cursor->index = index;
2669 
2670 	page_id_t		page_id(dict_index_get_space(index),
2671 					dict_index_get_page(index));
2672 	const page_size_t&	page_size = dict_table_page_size(index->table);
2673 
2674 	if (root_leaf_rw_latch == RW_X_LATCH) {
2675 		node_ptr_max_size = dict_index_node_ptr_max_size(index);
2676 	}
2677 
2678 	height = ULINT_UNDEFINED;
2679 
2680 	for (;;) {
2681 		buf_block_t*	block;
2682 		page_t*		page;
2683 		ulint		rw_latch;
2684 
2685 		ut_ad(n_blocks < BTR_MAX_LEVELS);
2686 
2687 		if (height != 0
2688 		    && latch_mode != BTR_MODIFY_TREE) {
2689 			rw_latch = upper_rw_latch;
2690 		} else {
2691 			rw_latch = RW_NO_LATCH;
2692 		}
2693 
2694 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2695 		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2696 					 BUF_GET, file, line, mtr);
2697 		tree_blocks[n_blocks] = block;
2698 
2699 		page = buf_block_get_frame(block);
2700 
2701 		if (height == ULINT_UNDEFINED
2702 		    && btr_page_get_level(page, mtr) == 0
2703 		    && rw_latch != RW_NO_LATCH
2704 		    && rw_latch != root_leaf_rw_latch) {
2705 			/* We should retry to get the page, because the root
2706 			page is a leaf but was latched with a non-leaf latch mode. */
2707 			ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
2708 			ut_ad(rw_latch == RW_S_LATCH);
2709 
2710 			ut_ad(n_blocks == 0);
2711 			mtr_release_block_at_savepoint(
2712 				mtr, tree_savepoints[n_blocks],
2713 				tree_blocks[n_blocks]);
2714 
2715 			upper_rw_latch = root_leaf_rw_latch;
2716 			continue;
2717 		}
2718 
2719 		ut_ad(fil_page_index_page_check(page));
2720 		ut_ad(index->id == btr_page_get_index_id(page));
2721 
2722 		if (height == ULINT_UNDEFINED) {
2723 			/* We are in the root node */
2724 
2725 			height = btr_page_get_level(page, mtr);
2726 		}
2727 
2728 		if (height == 0) {
2729 			if (rw_latch == RW_NO_LATCH
2730 			    || srv_read_only_mode) {
2731 				btr_cur_latch_leaves(
2732 					block, page_id, page_size,
2733 					latch_mode, cursor, mtr);
2734 			}
2735 
2736 			/* btr_cur_open_at_index_side_func() and
2737 			btr_cur_search_to_nth_level() release
2738 			tree s-latch here.*/
2739 			switch (latch_mode) {
2740 			case BTR_MODIFY_TREE:
2741 			case BTR_CONT_MODIFY_TREE:
2742 			case BTR_CONT_SEARCH_TREE:
2743 				break;
2744 			default:
2745 				/* Release the tree s-latch */
2746 				if (!srv_read_only_mode) {
2747 					mtr_release_s_latch_at_savepoint(
2748 						mtr, savepoint,
2749 						dict_index_get_lock(index));
2750 				}
2751 
2752 				/* release upper blocks */
2753 				for (; n_releases < n_blocks; n_releases++) {
2754 					mtr_release_block_at_savepoint(
2755 						mtr,
2756 						tree_savepoints[n_releases],
2757 						tree_blocks[n_releases]);
2758 				}
2759 			}
2760 		}
2761 
2762 		page_cur_open_on_rnd_user_rec(block, page_cursor);
2763 
2764 		if (height == 0) {
2765 
2766 			break;
2767 		}
2768 
2769 		ut_ad(height > 0);
2770 
2771 		height--;
2772 
2773 		node_ptr = page_cur_get_rec(page_cursor);
2774 		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2775 					  ULINT_UNDEFINED, &heap);
2776 
2777 		/* If the rec is the first or the last on the page and the
2778 		intention is a pessimistic delete, it might cause a node_ptr
2779 		insert at the upper level. We should change the intention
2780 		and retry. */
2781 		if (latch_mode == BTR_MODIFY_TREE
2782 		    && btr_cur_need_opposite_intention(
2783 			page, lock_intention, node_ptr)) {
2784 
2785 			ut_ad(upper_rw_latch == RW_X_LATCH);
2786 			/* release all blocks */
2787 			for (; n_releases <= n_blocks; n_releases++) {
2788 				mtr_release_block_at_savepoint(
2789 					mtr, tree_savepoints[n_releases],
2790 					tree_blocks[n_releases]);
2791 			}
2792 
2793 			lock_intention = BTR_INTENTION_BOTH;
2794 
2795 			page_id.set_page_no(dict_index_get_page(index));
2796 
2797 			height = ULINT_UNDEFINED;
2798 
2799 			n_blocks = 0;
2800 			n_releases = 0;
2801 
2802 			continue;
2803 		}
2804 
2805 		if (latch_mode == BTR_MODIFY_TREE
2806 		    && !btr_cur_will_modify_tree(
2807 				cursor->index, page, lock_intention, node_ptr,
2808 				node_ptr_max_size, page_size, mtr)) {
2809 			ut_ad(upper_rw_latch == RW_X_LATCH);
2810 			ut_ad(n_releases <= n_blocks);
2811 
2812 			/* we can release upper blocks */
2813 			for (; n_releases < n_blocks; n_releases++) {
2814 				if (n_releases == 0) {
2815 					/* we should not release the root page,
2816 					so that it stays pinned to the same block. */
2817 					continue;
2818 				}
2819 
2820 				/* release unused blocks to unpin */
2821 				mtr_release_block_at_savepoint(
2822 					mtr, tree_savepoints[n_releases],
2823 					tree_blocks[n_releases]);
2824 			}
2825 		}
2826 
2827 		if (height == 0
2828 		    && latch_mode == BTR_MODIFY_TREE) {
2829 			ut_ad(upper_rw_latch == RW_X_LATCH);
2830 			/* we should sx-latch the root page if it was
2831 			released already. It contains the seg_header. */
2832 			if (n_releases > 0) {
2833 				mtr_block_sx_latch_at_savepoint(
2834 					mtr, tree_savepoints[0],
2835 					tree_blocks[0]);
2836 			}
2837 
2838 			/* x-latch the branch blocks not released yet. */
2839 			for (ulint i = n_releases; i <= n_blocks; i++) {
2840 				mtr_block_x_latch_at_savepoint(
2841 					mtr, tree_savepoints[i],
2842 					tree_blocks[i]);
2843 			}
2844 		}
2845 
2846 		/* Go to the child node */
2847 		page_id.set_page_no(
2848 			btr_node_ptr_get_child_page_no(node_ptr, offsets));
2849 
2850 		n_blocks++;
2851 	}
2852 
2853 	if (UNIV_LIKELY_NULL(heap)) {
2854 		mem_heap_free(heap);
2855 	}
2856 
2857 	return(true);
2858 }
2859 
2860 /*==================== B-TREE INSERT =========================*/
2861 
2862 /*************************************************************//**
2863 Inserts a record if there is enough space, or if enough space can
2864 be freed by reorganizing. Differs from btr_cur_optimistic_insert because
2865 no heuristic is applied as to whether it pays to use CPU time for
2866 reorganizing the page or not.
2867 
2868 IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
2869 if this is a compressed leaf page in a secondary index.
2870 This has to be done either within the same mini-transaction,
2871 or by invoking ibuf_reset_free_bits() before mtr_commit().
2872 
2873 @return pointer to inserted record if succeed, else NULL */
2874 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2875 rec_t*
2876 btr_cur_insert_if_possible(
2877 /*=======================*/
2878 	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
2879 				cursor stays valid */
2880 	const dtuple_t*	tuple,	/*!< in: tuple to insert; the size info need not
2881 				have been stored to tuple */
2882 	ulint**		offsets,/*!< out: offsets on *rec */
2883 	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
2884 	ulint		n_ext,	/*!< in: number of externally stored columns */
2885 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
2886 {
2887 	page_cur_t*	page_cursor;
2888 	rec_t*		rec;
2889 
2890 	ut_ad(dtuple_check_typed(tuple));
2891 
2892 	ut_ad(mtr_is_block_fix(
2893 		mtr, btr_cur_get_block(cursor),
2894 		MTR_MEMO_PAGE_X_FIX, cursor->index->table));
2895 	page_cursor = btr_cur_get_page_cur(cursor);
2896 
2897 	/* Now, try the insert */
2898 	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
2899 				    offsets, heap, n_ext, mtr);
2900 
2901 	/* If the record did not fit, reorganize.
2902 	For compressed pages, page_cur_tuple_insert()
2903 	attempted this already. */
2904 	if (!rec && !page_cur_get_page_zip(page_cursor)
2905 	    && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
2906 		rec = page_cur_tuple_insert(
2907 			page_cursor, tuple, cursor->index,
2908 			offsets, heap, n_ext, mtr);
2909 	}
2910 
2911 	ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
2912 	return(rec);
2913 }
2914 
2915 /*************************************************************//**
2916 For an insert, checks the locks and does the undo logging if desired.
2917 @return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
2918 UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
2919 dberr_t
2920 btr_cur_ins_lock_and_undo(
2921 /*======================*/
2922 	ulint		flags,	/*!< in: undo logging and locking flags: if
2923 				not zero, the parameters index and thr
2924 				should be specified */
2925 	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert */
2926 	dtuple_t*	entry,	/*!< in/out: entry to insert */
2927 	que_thr_t*	thr,	/*!< in: query thread or NULL */
2928 	mtr_t*		mtr,	/*!< in/out: mini-transaction */
2929 	ibool*		inherit)/*!< out: TRUE if the inserted new record maybe
2930 				should inherit LOCK_GAP type locks from the
2931 				successor record */
2932 {
2933 	dict_index_t*	index;
2934 	dberr_t		err = DB_SUCCESS;
2935 	rec_t*		rec;
2936 	roll_ptr_t	roll_ptr;
2937 
2938 	/* Check if we have to wait for a lock: enqueue an explicit lock
2939 	request if yes */
2940 
2941 	rec = btr_cur_get_rec(cursor);
2942 	index = cursor->index;
2943 
2944 	ut_ad(!dict_index_is_online_ddl(index)
2945 	      || dict_index_is_clust(index)
2946 	      || (flags & BTR_CREATE_FLAG));
2947 	ut_ad(mtr->is_named_space(index->space));
2948 
2949 	/* Check if there is predicate or GAP lock preventing the insertion */
2950 	if (!(flags & BTR_NO_LOCKING_FLAG)) {
2951 		if (dict_index_is_spatial(index)) {
2952 			lock_prdt_t	prdt;
2953 			rtr_mbr_t	mbr;
2954 
2955 			rtr_get_mbr_from_tuple(entry, &mbr);
2956 
2957 			/* Use an on-stack MBR variable to test if a lock is
2958 			needed. If so, the predicate (MBR) will be allocated
2959 			from the lock heap in lock_prdt_insert_check_and_lock() */
2960 			lock_init_prdt_from_mbr(
2961 				&prdt, &mbr, 0, NULL);
2962 
2963 			err = lock_prdt_insert_check_and_lock(
2964 				flags, rec, btr_cur_get_block(cursor),
2965 				index, thr, mtr, &prdt);
2966 			*inherit = false;
2967 		} else {
2968 			err = lock_rec_insert_check_and_lock(
2969 				flags, rec, btr_cur_get_block(cursor),
2970 				index, thr, mtr, inherit);
2971 		}
2972 	}
2973 
2974 	if (err != DB_SUCCESS
2975 	    || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {
2976 
2977 		return(err);
2978 	}
2979 
2980 	err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
2981 					    thr, index, entry,
2982 					    NULL, 0, NULL, NULL,
2983 					    &roll_ptr);
2984 	if (err != DB_SUCCESS) {
2985 
2986 		return(err);
2987 	}
2988 
2989 	/* Now we can fill in the roll ptr field in entry
2990 	(except if table is intrinsic) */
2991 
2992 	if (!(flags & BTR_KEEP_SYS_FLAG)
2993 	    && !dict_table_is_intrinsic(index->table)) {
2994 
2995 		row_upd_index_entry_sys_field(entry, index,
2996 					      DATA_ROLL_PTR, roll_ptr);
2997 	}
2998 
2999 	return(DB_SUCCESS);
3000 }
3001 
3002 /**
3003 Prefetch siblings of the leaf for the pessimistic operation.
3004 @param block	leaf page */
3005 static
3006 void
3007 btr_cur_prefetch_siblings(
3008 	buf_block_t*	block)
3009 {
3010 	page_t*	page = buf_block_get_frame(block);
3011 
3012 	ut_ad(page_is_leaf(page));
3013 
3014 	ulint left_page_no = fil_page_get_prev(page);
3015 	ulint right_page_no = fil_page_get_next(page);
3016 
3017 	if (left_page_no != FIL_NULL) {
3018 		buf_read_page_background(
3019 			page_id_t(block->page.id.space(), left_page_no),
3020 			block->page.size, false);
3021 	}
3022 	if (right_page_no != FIL_NULL) {
3023 		buf_read_page_background(
3024 			page_id_t(block->page.id.space(), right_page_no),
3025 			block->page.size, false);
3026 	}
3027 	if (left_page_no != FIL_NULL
3028 	    || right_page_no != FIL_NULL) {
3029 		os_aio_simulated_wake_handler_threads();
3030 	}
3031 }
3032 
3033 /*************************************************************//**
3034 Tries to perform an insert to a page in an index tree, next to cursor.
3035 It is assumed that mtr holds an x-latch on the page. The operation does
3036 not succeed if there is too little space on the page. If there is just
3037 one record on the page, the insert will always succeed; this is to
3038 prevent trying to split a page with just one record.
3039 @return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
3040 dberr_t
3041 btr_cur_optimistic_insert(
3042 /*======================*/
3043 	ulint		flags,	/*!< in: undo logging and locking flags: if not
3044 				zero, the parameters index and thr should be
3045 				specified */
3046 	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
3047 				cursor stays valid */
3048 	ulint**		offsets,/*!< out: offsets on *rec */
3049 	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
3050 	dtuple_t*	entry,	/*!< in/out: entry to insert */
3051 	rec_t**		rec,	/*!< out: pointer to inserted record if
3052 				succeed */
3053 	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
3054 				be stored externally by the caller, or
3055 				NULL */
3056 	ulint		n_ext,	/*!< in: number of externally stored columns */
3057 	que_thr_t*	thr,	/*!< in: query thread or NULL */
3058 	mtr_t*		mtr)	/*!< in/out: mini-transaction;
3059 				if this function returns DB_SUCCESS on
3060 				a leaf page of a secondary index in a
3061 				compressed tablespace, the caller must
3062 				mtr_commit(mtr) before latching
3063 				any further pages */
3064 {
3065 	big_rec_t*	big_rec_vec	= NULL;
3066 	dict_index_t*	index;
3067 	page_cur_t*	page_cursor;
3068 	buf_block_t*	block;
3069 	page_t*		page;
3070 	rec_t*		dummy;
3071 	ibool		leaf;
3072 	ibool		reorg;
3073 	ibool		inherit = TRUE;
3074 	ulint		rec_size;
3075 	dberr_t		err;
3076 
3077 	*big_rec = NULL;
3078 
3079 	block = btr_cur_get_block(cursor);
3080 	page = buf_block_get_frame(block);
3081 	index = cursor->index;
3082 
3083 	/* Blocks are not latched for insert if the table is intrinsic
3084 	and the index is an auto-generated clustered index. */
3085 	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
3086 	ut_ad(!dict_index_is_online_ddl(index)
3087 	      || dict_index_is_clust(index)
3088 	      || (flags & BTR_CREATE_FLAG));
3089 	ut_ad(dtuple_check_typed(entry));
3090 
3091 	const page_size_t&	page_size = block->page.size;
3092 
3093 #ifdef UNIV_DEBUG_VALGRIND
3094 	if (page_size.is_compressed()) {
3095 		UNIV_MEM_ASSERT_RW(page, page_size.logical());
3096 		UNIV_MEM_ASSERT_RW(block->page.zip.data, page_size.physical());
3097 	}
3098 #endif /* UNIV_DEBUG_VALGRIND */
3099 
3100 	leaf = page_is_leaf(page);
3101 
3102 	/* Calculate the record size when entry is converted to a record */
3103 	rec_size = rec_get_converted_size(index, entry, n_ext);
3104 
3105 	if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
3106 				   dtuple_get_n_fields(entry), page_size)) {
3107 
3108 		/* The record is so big that we have to store some fields
3109 		externally on separate database pages */
3110 		big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3111 
3112 		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
3113 
3114 			return(DB_TOO_BIG_RECORD);
3115 		}
3116 
3117 		rec_size = rec_get_converted_size(index, entry, n_ext);
3118 	}
3119 
3120 	if (page_size.is_compressed() && page_zip_is_too_big(index, entry)) {
3121 		if (big_rec_vec != NULL) {
3122 			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3123 		}
3124 
3125 		return(DB_TOO_BIG_RECORD);
3126 	}
3127 
3128 	LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
3129 				      goto fail);
3130 
3131 	if (leaf && page_size.is_compressed()
3132 	    && (page_get_data_size(page) + rec_size
3133 		>= dict_index_zip_pad_optimal_page_size(index))) {
3134 		/* If compression padding tells us that insertion will
3135 		result in a too densely packed page, i.e. one that is likely
3136 		to cause a compression failure, then do not attempt an
3137 		optimistic insertion. */
3138 fail:
3139 		err = DB_FAIL;
3140 
3141 		/* prefetch siblings of the leaf for the pessimistic
3142 		operation, if the page is leaf. */
3143 		if (page_is_leaf(page)) {
3144 			btr_cur_prefetch_siblings(block);
3145 		}
3146 fail_err:
3147 
3148 		if (big_rec_vec) {
3149 			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3150 		}
3151 
3152 		return(err);
3153 	}
3154 
3155 	ulint	max_size = page_get_max_insert_size_after_reorganize(page, 1);
3156 
3157 	if (page_has_garbage(page)) {
3158 		if ((max_size < rec_size
3159 		     || max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT)
3160 		    && page_get_n_recs(page) > 1
3161 		    && page_get_max_insert_size(page, 1) < rec_size) {
3162 
3163 			goto fail;
3164 		}
3165 	} else if (max_size < rec_size) {
3166 		goto fail;
3167 	}
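	/* In other words: when the page contains garbage we only give up if
	even a reorganization would (likely) not make the record fit and the
	record does not fit into the currently available space either; note
	that this check never fails for a page holding just a single record. */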
3168 
3169 	/* If there have been many consecutive inserts to the
3170 	clustered index leaf page of an uncompressed table, check if
3171 	we have to split the page to reserve enough free space for
3172 	future updates of records. */
3173 
3174 	if (leaf && !page_size.is_compressed() && dict_index_is_clust(index)
3175 	    && page_get_n_recs(page) >= 2
3176 	    && dict_index_get_space_reserve() + rec_size > max_size
3177 	    && (btr_page_get_split_rec_to_right(cursor, &dummy)
3178 		|| btr_page_get_split_rec_to_left(cursor, &dummy))) {
3179 		goto fail;
3180 	}
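	/* dict_index_get_space_reserve() is the slack (about 1 / 16 of a page)
	kept free on uncompressed clustered index leaf pages for future updates
	of existing records. */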
3181 
3182 	page_cursor = btr_cur_get_page_cur(cursor);
3183 
3184 	DBUG_PRINT("ib_cur", ("insert %s (" IB_ID_FMT ") by " TRX_ID_FMT
3185 			      ": %s",
3186 			      index->name(), index->id,
3187 			      thr != NULL
3188 			      ? trx_get_id_for_print(thr_get_trx(thr))
3189 			      : 0,
3190 			      rec_printer(entry).str().c_str()));
3191 
3192 	DBUG_EXECUTE_IF("do_page_reorganize",
3193 			btr_page_reorganize(page_cursor, index, mtr););
3194 
3195 	/* Now, try the insert */
3196 	{
3197 		const rec_t*	page_cursor_rec = page_cur_get_rec(page_cursor);
3198 
3199 		if (dict_table_is_intrinsic(index->table)) {
3200 
3201 			index->rec_cache.rec_size = rec_size;
3202 
3203 			*rec = page_cur_tuple_direct_insert(
3204 				page_cursor, entry, index, n_ext, mtr);
3205 		} else {
3206 			/* Check locks and write to the undo log,
3207 			if specified */
3208 			err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3209 							thr, mtr, &inherit);
3210 
3211 			if (err != DB_SUCCESS) {
3212 				goto fail_err;
3213 			}
3214 
3215 			*rec = page_cur_tuple_insert(
3216 				page_cursor, entry, index, offsets, heap,
3217 				n_ext, mtr);
3218 		}
3219 
3220 		reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
3221 	}
3222 
3223 	if (*rec) {
3224 	} else if (page_size.is_compressed()) {
3225 		/* Reset the IBUF_BITMAP_FREE bits, because
3226 		page_cur_tuple_insert() will have attempted page
3227 		reorganize before failing. */
3228 		if (leaf
3229 		    && !dict_index_is_clust(index)
3230 		    && !dict_table_is_temporary(index->table)) {
3231 			ibuf_reset_free_bits(block);
3232 		}
3233 
3234 		goto fail;
3235 	} else {
3236 
3237 		/* For an intrinsic table we take a consistent path and
3238 		re-organize using the pessimistic path. */
3239 		if (dict_table_is_intrinsic(index->table)) {
3240 			goto fail;
3241 		}
3242 
3243 		ut_ad(!reorg);
3244 
3245 		/* If the record did not fit, reorganize */
3246 		if (!btr_page_reorganize(page_cursor, index, mtr)) {
3247 			ut_ad(0);
3248 			goto fail;
3249 		}
3250 
3251 		ut_ad(page_get_max_insert_size(page, 1) == max_size);
3252 
3253 		reorg = TRUE;
3254 
3255 		*rec = page_cur_tuple_insert(page_cursor, entry, index,
3256 					     offsets, heap, n_ext, mtr);
3257 
3258 		if (UNIV_UNLIKELY(!*rec)) {
3259 			ib::fatal() << "Cannot insert tuple " << *entry
3260 				<< " into index " << index->name
3261 				<< " of table " << index->table->name
3262 				<< ". Max size: " << max_size;
3263 		}
3264 	}
3265 
3266 	if (!index->disable_ahi) {
3267 		if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
3268 			btr_search_update_hash_node_on_insert(cursor);
3269 		} else {
3270 			btr_search_update_hash_on_insert(cursor);
3271 		}
3272 	}
3273 
3274 	if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
3275 
3276 		lock_update_insert(block, *rec);
3277 	}
3278 
3279 	if (leaf
3280 	    && !dict_index_is_clust(index)
3281 	    && !dict_table_is_temporary(index->table)) {
3282 		/* Update the free bits of the B-tree page in the
3283 		insert buffer bitmap. */
3284 
3285 		/* The free bits in the insert buffer bitmap must
3286 		never exceed the free space on a page.  It is safe to
3287 		decrement or reset the bits in the bitmap in a
3288 		mini-transaction that is committed before the
3289 		mini-transaction that affects the free space. */
3290 
3291 		/* It is unsafe to increment the bits in a separately
3292 		committed mini-transaction, because in crash recovery,
3293 		the free bits could momentarily be set too high. */
3294 
3295 		if (page_size.is_compressed()) {
3296 			/* Update the bits in the same mini-transaction. */
3297 			ibuf_update_free_bits_zip(block, mtr);
3298 		} else {
3299 			/* Decrement the bits in a separate
3300 			mini-transaction. */
3301 			ibuf_update_free_bits_if_full(
3302 				block, max_size,
3303 				rec_size + PAGE_DIR_SLOT_SIZE);
3304 		}
3305 	}
3306 
3307 	*big_rec = big_rec_vec;
3308 
3309 	return(DB_SUCCESS);
3310 }
3311 
3312 /*************************************************************//**
3313 Performs an insert on a page of an index tree. It is assumed that mtr
3314 holds an x-latch on the tree and on the cursor page. If the insert is
3315 made on the leaf level, to avoid deadlocks, mtr must also own x-latches
3316 to brothers of page, if those brothers exist.
3317 @return DB_SUCCESS or error number */
3318 dberr_t
3319 btr_cur_pessimistic_insert(
3320 /*=======================*/
3321 	ulint		flags,	/*!< in: undo logging and locking flags: if not
3322 				zero, the parameter thr should be
3323 				specified; if no undo logging is specified,
3324 				then the caller must have reserved enough
3325 				free extents in the file space so that the
3326 				insertion will certainly succeed */
3327 	btr_cur_t*	cursor,	/*!< in: cursor after which to insert;
3328 				cursor stays valid */
3329 	ulint**		offsets,/*!< out: offsets on *rec */
3330 	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
3331 				that can be emptied, or NULL */
3332 	dtuple_t*	entry,	/*!< in/out: entry to insert */
3333 	rec_t**		rec,	/*!< out: pointer to inserted record if
3334 				succeed */
3335 	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
3336 				be stored externally by the caller, or
3337 				NULL */
3338 	ulint		n_ext,	/*!< in: number of externally stored columns */
3339 	que_thr_t*	thr,	/*!< in: query thread or NULL */
3340 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
3341 {
3342 	dict_index_t*	index		= cursor->index;
3343 	big_rec_t*	big_rec_vec	= NULL;
3344 	dberr_t		err;
3345 	ibool		inherit = FALSE;
3346 	bool		success;
3347 	ulint		n_reserved	= 0;
3348 
3349 	ut_ad(dtuple_check_typed(entry));
3350 
3351 	*big_rec = NULL;
3352 
3353 	ut_ad(mtr_memo_contains_flagged(
3354 		mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
3355 		MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
3356 	      || dict_table_is_intrinsic(cursor->index->table));
3357 	ut_ad(mtr_is_block_fix(
3358 		mtr, btr_cur_get_block(cursor),
3359 		MTR_MEMO_PAGE_X_FIX, cursor->index->table));
3360 	ut_ad(!dict_index_is_online_ddl(index)
3361 	      || dict_index_is_clust(index)
3362 	      || (flags & BTR_CREATE_FLAG));
3363 
3364 	cursor->flag = BTR_CUR_BINARY;
3365 
3366 	/* Check locks and write to undo log, if specified */
3367 
3368 	err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3369 					thr, mtr, &inherit);
3370 
3371 	if (err != DB_SUCCESS) {
3372 
3373 		return(err);
3374 	}
3375 
3376 	if (!(flags & BTR_NO_UNDO_LOG_FLAG)
3377 	    || dict_table_is_intrinsic(index->table)) {
3378 		/* First reserve enough free space for the file segments
3379 		of the index tree, so that the insert will not fail because
3380 		of lack of space */
3381 
3382 		ulint	n_extents = cursor->tree_height / 16 + 3;
3383 
3384 		success = fsp_reserve_free_extents(&n_reserved, index->space,
3385 						   n_extents, FSP_NORMAL, mtr);
3386 		if (!success) {
3387 			return(DB_OUT_OF_FILE_SPACE);
3388 		}
3389 	}
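	/* For example, with cursor->tree_height == 2 the formula above
	reserves 2 / 16 + 3 = 3 extents; the reservation only grows beyond
	this baseline of 3 extents for very tall trees. */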
3390 
3391 	if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
3392 				   dict_table_is_comp(index->table),
3393 				   dtuple_get_n_fields(entry),
3394 				   dict_table_page_size(index->table))) {
3395 		/* The record is so big that we have to store some fields
3396 		externally on separate database pages */
3397 
3398 		if (UNIV_LIKELY_NULL(big_rec_vec)) {
3399 			/* This should never happen, but we handle
3400 			the situation in a robust manner. */
3401 			ut_ad(0);
3402 			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3403 		}
3404 
3405 		big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3406 
3407 		if (big_rec_vec == NULL) {
3408 
3409 			if (n_reserved > 0) {
3410 				fil_space_release_free_extents(index->space,
3411 							       n_reserved);
3412 			}
3413 			return(DB_TOO_BIG_RECORD);
3414 		}
3415 	}
3416 
3417 	if (dict_index_get_page(index)
3418 	    == btr_cur_get_block(cursor)->page.id.page_no()) {
3419 
3420 		/* The page is the root page */
3421 		*rec = btr_root_raise_and_insert(
3422 			flags, cursor, offsets, heap, entry, n_ext, mtr);
3423 	} else {
3424 		*rec = btr_page_split_and_insert(
3425 			flags, cursor, offsets, heap, entry, n_ext, mtr);
3426 	}
3427 
3428 	ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec
3429 	      || dict_index_is_spatial(index));
3430 
3431 	if (!(flags & BTR_NO_LOCKING_FLAG)) {
3432 		ut_ad(!dict_table_is_temporary(index->table));
3433 		if (dict_index_is_spatial(index)) {
3434 			/* Do nothing */
3435 		} else {
3436 			/* The cursor might have been moved to another
3437 			page, so the max trx id field must be updated
3438 			after the cursor position has been fixed. */
3439 			if (!dict_index_is_clust(index)) {
3440 				page_update_max_trx_id(
3441 					btr_cur_get_block(cursor),
3442 					btr_cur_get_page_zip(cursor),
3443 					thr_get_trx(thr)->id, mtr);
3444 			}
3445 			if (!page_rec_is_infimum(btr_cur_get_rec(cursor))
3446 			    || btr_page_get_prev(
3447 				buf_block_get_frame(
3448 					btr_cur_get_block(cursor)), mtr)
3449 			       == FIL_NULL) {
3450 				/* After a split-and-insert we must
3451 				always call lock_update_insert(). */
3452 				inherit = TRUE;
3453 			}
3454 		}
3455 	}
3456 
3457 	if (!index->disable_ahi) {
3458 		btr_search_update_hash_on_insert(cursor);
3459 	}
3460 	if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {
3461 
3462 		lock_update_insert(btr_cur_get_block(cursor), *rec);
3463 	}
3464 
3465 	if (n_reserved > 0) {
3466 		fil_space_release_free_extents(index->space, n_reserved);
3467 	}
3468 
3469 	*big_rec = big_rec_vec;
3470 
3471 	return(DB_SUCCESS);
3472 }
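/* Note: besides ordinary pessimistic inserts, btr_cur_pessimistic_update()
below calls btr_cur_pessimistic_insert() with BTR_NO_UNDO_LOG_FLAG
| BTR_NO_LOCKING_FLAG | BTR_KEEP_SYS_FLAG to re-insert the updated record
when the in-place replacement does not fit on the page. */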
3473 
3474 /*==================== B-TREE UPDATE =========================*/
3475 
3476 /*************************************************************//**
3477 For an update, checks the locks and does the undo logging.
3478 @return DB_SUCCESS, DB_WAIT_LOCK, or error number */
3479 UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
3480 dberr_t
3481 btr_cur_upd_lock_and_undo(
3482 /*======================*/
3483 	ulint		flags,	/*!< in: undo logging and locking flags */
3484 	btr_cur_t*	cursor,	/*!< in: cursor on record to update */
3485 	const ulint*	offsets,/*!< in: rec_get_offsets() on cursor */
3486 	const upd_t*	update,	/*!< in: update vector */
3487 	ulint		cmpl_info,/*!< in: compiler info on secondary index
3488 				updates */
3489 	que_thr_t*	thr,	/*!< in: query thread
3490 				(can be NULL if BTR_NO_LOCKING_FLAG) */
3491 	mtr_t*		mtr,	/*!< in/out: mini-transaction */
3492 	roll_ptr_t*	roll_ptr)/*!< out: roll pointer */
3493 {
3494 	dict_index_t*	index;
3495 	const rec_t*	rec;
3496 	dberr_t		err;
3497 
3498 	ut_ad(thr != NULL || (flags & BTR_NO_LOCKING_FLAG));
3499 
3500 	rec = btr_cur_get_rec(cursor);
3501 	index = cursor->index;
3502 
3503 	ut_ad(rec_offs_validate(rec, index, offsets));
3504 	ut_ad(mtr->is_named_space(index->space));
3505 
3506 	if (!dict_index_is_clust(index)) {
3507 		ut_ad(dict_index_is_online_ddl(index)
3508 		      == !!(flags & BTR_CREATE_FLAG));
3509 
3510 		/* We do undo logging only when we update a clustered index
3511 		record */
3512 		return(lock_sec_rec_modify_check_and_lock(
3513 			       flags, btr_cur_get_block(cursor), rec,
3514 			       index, thr, mtr));
3515 	}
3516 
3517 	/* Check if we have to wait for a lock: enqueue an explicit lock
3518 	request if yes */
3519 
3520 	if (!(flags & BTR_NO_LOCKING_FLAG)) {
3521 		err = lock_clust_rec_modify_check_and_lock(
3522 			flags, btr_cur_get_block(cursor), rec, index,
3523 			offsets, thr);
3524 		if (err != DB_SUCCESS) {
3525 			return(err);
3526 		}
3527 	}
3528 
3529 	/* Append the info about the update in the undo log */
3530 
3531 	return(trx_undo_report_row_operation(
3532 		       flags, TRX_UNDO_MODIFY_OP, thr,
3533 		       index, NULL, update,
3534 		       cmpl_info, rec, offsets, roll_ptr));
3535 }
3536 
3537 /***********************************************************//**
3538 Writes a redo log record of updating a record in-place. */
3539 void
3540 btr_cur_update_in_place_log(
3541 /*========================*/
3542 	ulint		flags,		/*!< in: flags */
3543 	const rec_t*	rec,		/*!< in: record */
3544 	dict_index_t*	index,		/*!< in: index of the record */
3545 	const upd_t*	update,		/*!< in: update vector */
3546 	trx_id_t	trx_id,		/*!< in: transaction id */
3547 	roll_ptr_t	roll_ptr,	/*!< in: roll ptr */
3548 	mtr_t*		mtr)		/*!< in: mtr */
3549 {
3550 	byte*		log_ptr;
3551 	const page_t*	page	= page_align(rec);
3552 	ut_ad(flags < 256);
3553 	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
3554 
3555 	log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
3556 					    ? MLOG_COMP_REC_UPDATE_IN_PLACE
3557 					    : MLOG_REC_UPDATE_IN_PLACE,
3558 					    1 + DATA_ROLL_PTR_LEN + 14 + 2
3559 					    + MLOG_BUF_MARGIN);
3560 
3561 	if (!log_ptr) {
3562 		/* Logging in mtr is switched off during crash recovery */
3563 		return;
3564 	}
3565 
3566 	/* For secondary indexes, we could skip writing the dummy system
3567 	fields to the redo log, but that would require changing the redo log
3568 	parsing of MLOG_REC_UPDATE_IN_PLACE/MLOG_COMP_REC_UPDATE_IN_PLACE or
3569 	adding a new redo log record type. For now, just write dummy sys
3570 	fields to the redo log if we are updating a secondary index record.
3571 	*/
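	/* The log record body written below is, in order: one byte of
	flags, the system fields (TRX_ID field position, roll pointer and
	TRX_ID; dummy zero values for a secondary index), a two-byte page
	offset of the record, and finally the update vector. */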
3572 	mach_write_to_1(log_ptr, flags);
3573 	log_ptr++;
3574 
3575 	if (dict_index_is_clust(index)) {
3576 		log_ptr = row_upd_write_sys_vals_to_log(
3577 				index, trx_id, roll_ptr, log_ptr, mtr);
3578 	} else {
3579 		/* Dummy system fields for a secondary index */
3580 		/* TRX_ID Position */
3581 		log_ptr += mach_write_compressed(log_ptr, 0);
3582 		/* ROLL_PTR */
3583 		trx_write_roll_ptr(log_ptr, 0);
3584 		log_ptr += DATA_ROLL_PTR_LEN;
3585 		/* TRX_ID */
3586 		log_ptr += mach_u64_write_compressed(log_ptr, 0);
3587 	}
3588 
3589 	mach_write_to_2(log_ptr, page_offset(rec));
3590 	log_ptr += 2;
3591 
3592 	row_upd_index_write_log(update, log_ptr, mtr);
3593 }
3594 #endif /* UNIV_HOTBACKUP */
3595 
3596 /***********************************************************//**
3597 Parses a redo log record of updating a record in-place.
3598 @return end of log record or NULL */
3599 byte*
3600 btr_cur_parse_update_in_place(
3601 /*==========================*/
3602 	byte*		ptr,	/*!< in: buffer */
3603 	byte*		end_ptr,/*!< in: buffer end */
3604 	page_t*		page,	/*!< in/out: page or NULL */
3605 	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
3606 	dict_index_t*	index)	/*!< in: index corresponding to page */
3607 {
3608 	ulint		flags;
3609 	rec_t*		rec;
3610 	upd_t*		update;
3611 	ulint		pos;
3612 	trx_id_t	trx_id;
3613 	roll_ptr_t	roll_ptr;
3614 	ulint		rec_offset;
3615 	mem_heap_t*	heap;
3616 	ulint*		offsets;
3617 
3618 	if (end_ptr < ptr + 1) {
3619 
3620 		return(NULL);
3621 	}
3622 
3623 	flags = mach_read_from_1(ptr);
3624 	ptr++;
3625 
3626 	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
3627 
3628 	if (ptr == NULL) {
3629 
3630 		return(NULL);
3631 	}
3632 
3633 	if (end_ptr < ptr + 2) {
3634 
3635 		return(NULL);
3636 	}
3637 
3638 	rec_offset = mach_read_from_2(ptr);
3639 	ptr += 2;
3640 
3641 	ut_a(rec_offset <= UNIV_PAGE_SIZE);
3642 
3643 	heap = mem_heap_create(256);
3644 
3645 	ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
3646 
3647 	if (!ptr || !page) {
3648 
3649 		goto func_exit;
3650 	}
3651 
3652 	ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
3653 	rec = page + rec_offset;
3654 
3655 	/* We do not need to reserve search latch, as the page is only
3656 	being recovered, and there cannot be a hash index to it. */
3657 
3658 	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
3659 
3660 	if (!(flags & BTR_KEEP_SYS_FLAG)) {
3661 		row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
3662 						   pos, trx_id, roll_ptr);
3663 	}
3664 
3665 	row_upd_rec_in_place(rec, index, offsets, update, page_zip);
3666 
3667 func_exit:
3668 	mem_heap_free(heap);
3669 
3670 	return(ptr);
3671 }
3672 
3673 #ifndef UNIV_HOTBACKUP
3674 /*************************************************************//**
3675 See if there is enough space in the page modification log to log
3676 an update-in-place.
3677 
3678 @retval false if out of space; IBUF_BITMAP_FREE will be reset
3679 outside mtr if the page was recompressed
3680 @retval true if there is enough space
3681 
3682 IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
3683 a secondary index leaf page. This has to be done either within the
3684 same mini-transaction, or by invoking ibuf_reset_free_bits() before
3685 mtr_commit(mtr). */
3686 bool
3687 btr_cur_update_alloc_zip_func(
3688 /*==========================*/
3689 	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
3690 	page_cur_t*	cursor,	/*!< in/out: B-tree page cursor */
3691 	dict_index_t*	index,	/*!< in: the index corresponding to cursor */
3692 #ifdef UNIV_DEBUG
3693 	ulint*		offsets,/*!< in/out: offsets of the cursor record */
3694 #endif /* UNIV_DEBUG */
3695 	ulint		length,	/*!< in: size needed */
3696 	bool		create,	/*!< in: true=delete-and-insert,
3697 				false=update-in-place */
3698 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
3699 {
3700 	const page_t*	page = page_cur_get_page(cursor);
3701 
3702 	ut_ad(page_zip == page_cur_get_page_zip(cursor));
3703 	ut_ad(page_zip);
3704 	ut_ad(!dict_index_is_ibuf(index));
3705 	ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
3706 
3707 	if (page_zip_available(page_zip, dict_index_is_clust(index),
3708 			       length, create)) {
3709 		return(true);
3710 	}
3711 
3712 	if (!page_zip->m_nonempty && !page_has_garbage(page)) {
3713 		/* The page has been freshly compressed, so
3714 		reorganizing it will not help. */
3715 		return(false);
3716 	}
3717 
3718 	if (create && page_is_leaf(page)
3719 	    && (length + page_get_data_size(page)
3720 		>= dict_index_zip_pad_optimal_page_size(index))) {
3721 		return(false);
3722 	}
3723 
3724 	if (!btr_page_reorganize(cursor, index, mtr)) {
3725 		goto out_of_space;
3726 	}
3727 
3728 	rec_offs_make_valid(page_cur_get_rec(cursor), index, offsets);
3729 
3730 	/* After recompressing a page, we must make sure that the free
3731 	bits in the insert buffer bitmap will not exceed the free
3732 	space on the page.  Because this function will not attempt
3733 	recompression unless page_zip_available() fails above, it is
3734 	safe to reset the free bits if page_zip_available() fails
3735 	again, below.  The free bits can safely be reset in a separate
3736 	mini-transaction.  If page_zip_available() succeeds below, we
3737 	can be sure that the btr_page_reorganize() above did not reduce
3738 	the free space available on the page. */
3739 
3740 	if (page_zip_available(page_zip, dict_index_is_clust(index),
3741 			       length, create)) {
3742 		return(true);
3743 	}
3744 
3745 out_of_space:
3746 	ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
3747 
3748 	/* Out of space: reset the free bits. */
3749 	if (!dict_index_is_clust(index)
3750 	    && !dict_table_is_temporary(index->table)
3751 	    && page_is_leaf(page)) {
3752 		ibuf_reset_free_bits(page_cur_get_block(cursor));
3753 	}
3754 
3755 	return(false);
3756 }
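/* A minimal sketch of the caller contract described above (abbreviated;
the real handling is in btr_cur_update_in_place() further below):

	if (!btr_cur_update_alloc_zip(page_zip, page_cursor, index,
				      offsets, length, false, mtr)) {
		return(DB_ZIP_OVERFLOW);  // free bits were already reset
	}
	... perform the update ...
	if (!dict_index_is_clust(index) && page_is_leaf(page)) {
		// same mini-transaction: keep IBUF_BITMAP_FREE consistent
		ibuf_update_free_bits_zip(block, mtr);
	}
*/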
3757 
3758 /*************************************************************//**
3759 Updates a record when the update causes no size changes in its fields.
3760 We assume here that the ordering fields of the record do not change.
3761 @return locking or undo log related error code, or
3762 @retval DB_SUCCESS on success
3763 @retval DB_ZIP_OVERFLOW if there is not enough space left
3764 on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
3765 dberr_t
3766 btr_cur_update_in_place(
3767 /*====================*/
3768 	ulint		flags,	/*!< in: undo logging and locking flags */
3769 	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
3770 				cursor stays valid and positioned on the
3771 				same record */
3772 	ulint*		offsets,/*!< in/out: offsets on cursor->page_cur.rec */
3773 	const upd_t*	update,	/*!< in: update vector */
3774 	ulint		cmpl_info,/*!< in: compiler info on secondary index
3775 				updates */
3776 	que_thr_t*	thr,	/*!< in: query thread */
3777 	trx_id_t	trx_id,	/*!< in: transaction id */
3778 	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
3779 				is a secondary index, the caller must
3780 				mtr_commit(mtr) before latching any
3781 				further pages */
3782 {
3783 	dict_index_t*	index;
3784 	buf_block_t*	block;
3785 	page_zip_des_t*	page_zip;
3786 	dberr_t		err;
3787 	rec_t*		rec;
3788 	roll_ptr_t	roll_ptr	= 0;
3789 	ulint		was_delete_marked;
3790 	ibool		is_hashed;
3791 
3792 	rec = btr_cur_get_rec(cursor);
3793 	index = cursor->index;
3794 	ut_ad(rec_offs_validate(rec, index, offsets));
3795 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
3796 	ut_ad(trx_id > 0
3797 	      || (flags & BTR_KEEP_SYS_FLAG)
3798 	      || dict_table_is_intrinsic(index->table));
3799 	/* The insert buffer tree should never be updated in place. */
3800 	ut_ad(!dict_index_is_ibuf(index));
3801 	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
3802 	      || dict_index_is_clust(index));
3803 	ut_ad(thr_get_trx(thr)->id == trx_id
3804 	      || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
3805 	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
3806 		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
3807 	ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor)));
3808 	ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
3809 
3810 	DBUG_PRINT("ib_cur", ("update-in-place %s (" IB_ID_FMT
3811 			      ") by " TRX_ID_FMT ": %s",
3812 			      index->name(), index->id, trx_id,
3813 			      rec_printer(rec, offsets).str().c_str()));
3814 
3815 	block = btr_cur_get_block(cursor);
3816 	page_zip = buf_block_get_page_zip(block);
3817 
3818 	/* Check that enough space is available on the compressed page. */
3819 	if (page_zip) {
3820 		if (!btr_cur_update_alloc_zip(
3821 			    page_zip, btr_cur_get_page_cur(cursor),
3822 			    index, offsets, rec_offs_size(offsets),
3823 			    false, mtr)) {
3824 			return(DB_ZIP_OVERFLOW);
3825 		}
3826 
3827 		rec = btr_cur_get_rec(cursor);
3828 	}
3829 
3830 	/* Do lock checking and undo logging */
3831 	err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
3832 					update, cmpl_info,
3833 					thr, mtr, &roll_ptr);
3834 	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
3835 		/* We may need to update the IBUF_BITMAP_FREE
3836 		bits after a reorganize that was done in
3837 		btr_cur_update_alloc_zip(). */
3838 		goto func_exit;
3839 	}
3840 
3841 	if (!(flags & BTR_KEEP_SYS_FLAG)
3842 	    && !dict_table_is_intrinsic(index->table)) {
3843 		row_upd_rec_sys_fields(rec, NULL, index, offsets,
3844 				       thr_get_trx(thr), roll_ptr);
3845 	}
3846 
3847 	was_delete_marked = rec_get_deleted_flag(
3848 		rec, page_is_comp(buf_block_get_frame(block)));
3849 
3850 	is_hashed = (block->index != NULL);
3851 
3852 	if (is_hashed) {
3853 		/* TO DO: Can we skip this if none of the first
3854 		index->search_info->curr_n_fields fields
3855 		are being updated? */
3856 
3857 		/* The function row_upd_changes_ord_field_binary works only
3858 		if the update vector was built for a clustered index; we must
3859 		NOT call it if the index is secondary. */
3860 
3861 		if (!dict_index_is_clust(index)
3862 		    || row_upd_changes_ord_field_binary(index, update, thr,
3863 							NULL, NULL)) {
3864 
3865 			/* Remove possible hash index pointer to this record */
3866 			btr_search_update_hash_on_delete(cursor);
3867 		}
3868 
3869 		rw_lock_x_lock(btr_get_search_latch(index));
3870 	}
3871 
3872 	assert_block_ahi_valid(block);
3873 	row_upd_rec_in_place(rec, index, offsets, update, page_zip);
3874 
3875 	if (is_hashed) {
3876 		rw_lock_x_unlock(btr_get_search_latch(index));
3877 	}
3878 
3879 	btr_cur_update_in_place_log(flags, rec, index, update,
3880 				    trx_id, roll_ptr, mtr);
3881 
3882 	if (was_delete_marked
3883 	    && !rec_get_deleted_flag(
3884 		    rec, page_is_comp(buf_block_get_frame(block)))) {
3885 		/* The new updated record owns its possible externally
3886 		stored fields */
3887 
3888 		btr_cur_unmark_extern_fields(page_zip,
3889 					     rec, index, offsets, mtr);
3890 	}
3891 
3892 	ut_ad(err == DB_SUCCESS);
3893 
3894 func_exit:
3895 	if (page_zip
3896 	    && !(flags & BTR_KEEP_IBUF_BITMAP)
3897 	    && !dict_index_is_clust(index)
3898 	    && !dict_table_is_temporary(index->table)
3899 	    && page_is_leaf(buf_block_get_frame(block))) {
3900 		/* Update the free bits in the insert buffer. */
3901 		ibuf_update_free_bits_zip(block, mtr);
3902 	}
3903 
3904 	return(err);
3905 }
3906 
3907 /*************************************************************//**
3908 Tries to update a record on a page in an index tree. It is assumed that mtr
3909 holds an x-latch on the page. The operation does not succeed if there is too
3910 little space on the page or if the update would result in too empty a page,
3911 so that tree compression is recommended. We assume here that the ordering
3912 fields of the record do not change.
3913 @return error code, including
3914 @retval DB_SUCCESS on success
3915 @retval DB_OVERFLOW if the updated record does not fit
3916 @retval DB_UNDERFLOW if the page would become too empty
3917 @retval DB_ZIP_OVERFLOW if there is not enough space left
3918 on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
3919 dberr_t
3920 btr_cur_optimistic_update(
3921 /*======================*/
3922 	ulint		flags,	/*!< in: undo logging and locking flags */
3923 	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
3924 				cursor stays valid and positioned on the
3925 				same record */
3926 	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
3927 	mem_heap_t**	heap,	/*!< in/out: pointer to NULL or memory heap */
3928 	const upd_t*	update,	/*!< in: update vector; this must also
3929 				contain trx id and roll ptr fields */
3930 	ulint		cmpl_info,/*!< in: compiler info on secondary index
3931 				updates */
3932 	que_thr_t*	thr,	/*!< in: query thread */
3933 	trx_id_t	trx_id,	/*!< in: transaction id */
3934 	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
3935 				is a secondary index, the caller must
3936 				mtr_commit(mtr) before latching any
3937 				further pages */
3938 {
3939 	dict_index_t*	index;
3940 	page_cur_t*	page_cursor;
3941 	dberr_t		err;
3942 	buf_block_t*	block;
3943 	page_t*		page;
3944 	page_zip_des_t*	page_zip;
3945 	rec_t*		rec;
3946 	ulint		max_size;
3947 	ulint		new_rec_size;
3948 	ulint		old_rec_size;
3949 	ulint		max_ins_size = 0;
3950 	dtuple_t*	new_entry;
3951 	roll_ptr_t	roll_ptr;
3952 	ulint		i;
3953 	ulint		n_ext;
3954 
3955 	block = btr_cur_get_block(cursor);
3956 	page = buf_block_get_frame(block);
3957 	rec = btr_cur_get_rec(cursor);
3958 	index = cursor->index;
3959 	ut_ad(trx_id > 0
3960 	      || (flags & BTR_KEEP_SYS_FLAG)
3961 	      || dict_table_is_intrinsic(index->table));
3962 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
3963 	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
3964 	/* This is intended only for leaf page updates */
3965 	ut_ad(page_is_leaf(page));
3966 	/* The insert buffer tree should never be updated in place. */
3967 	ut_ad(!dict_index_is_ibuf(index));
3968 	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
3969 	      || dict_index_is_clust(index));
3970 	ut_ad(thr_get_trx(thr)->id == trx_id
3971 	      || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
3972 	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
3973 		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
3974 	ut_ad(fil_page_index_page_check(page));
3975 	ut_ad(btr_page_get_index_id(page) == index->id);
3976 
3977 	*offsets = rec_get_offsets(rec, index, *offsets,
3978 				   ULINT_UNDEFINED, heap);
3979 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
3980 	ut_a(!rec_offs_any_null_extern(rec, *offsets)
3981 	     || trx_is_recv(thr_get_trx(thr)));
3982 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
3983 
3984 	if (!row_upd_changes_field_size_or_external(index, *offsets, update)) {
3985 
3986 		/* The simplest and the most common case: the update does not
3987 		change the size of any field and none of the updated fields is
3988 		externally stored in rec or update, and there is enough space
3989 		on the compressed page to log the update. */
3990 
3991 		return(btr_cur_update_in_place(
3992 			       flags, cursor, *offsets, update,
3993 			       cmpl_info, thr, trx_id, mtr));
3994 	}
3995 
3996 	if (rec_offs_any_extern(*offsets)) {
3997 any_extern:
3998 		/* Externally stored fields are handled by the pessimistic
3999 		update path */
4000 
4001 		/* prefetch siblings of the leaf for the pessimistic
4002 		operation. */
4003 		btr_cur_prefetch_siblings(block);
4004 
4005 		return(DB_OVERFLOW);
4006 	}
4007 
4008 	for (i = 0; i < upd_get_n_fields(update); i++) {
4009 		if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
4010 
4011 			goto any_extern;
4012 		}
4013 	}
4014 
4015 	DBUG_PRINT("ib_cur", ("update %s (" IB_ID_FMT ") by " TRX_ID_FMT
4016 			      ": %s",
4017 			      index->name(), index->id, trx_id,
4018 			      rec_printer(rec, *offsets).str().c_str()));
4019 
4020 	page_cursor = btr_cur_get_page_cur(cursor);
4021 
4022 	if (!*heap) {
4023 		*heap = mem_heap_create(
4024 			rec_offs_size(*offsets)
4025 			+ DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
4026 	}
4027 
4028 	new_entry = row_rec_to_index_entry(rec, index, *offsets,
4029 					   &n_ext, *heap);
4030 	/* We checked above that there are no externally stored fields. */
4031 	ut_a(!n_ext);
4032 
4033 	/* The page containing the clustered index record
4034 	corresponding to new_entry is latched in mtr.
4035 	Thus the following call is safe. */
4036 	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4037 						     FALSE, *heap);
4038 	old_rec_size = rec_offs_size(*offsets);
4039 	new_rec_size = rec_get_converted_size(index, new_entry, 0);
4040 
4041 	page_zip = buf_block_get_page_zip(block);
4042 #ifdef UNIV_ZIP_DEBUG
4043 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4044 #endif /* UNIV_ZIP_DEBUG */
4045 
4046 	if (page_zip) {
4047 		if (!btr_cur_update_alloc_zip(
4048 			    page_zip, page_cursor, index, *offsets,
4049 			    new_rec_size, true, mtr)) {
4050 			return(DB_ZIP_OVERFLOW);
4051 		}
4052 
4053 		rec = page_cur_get_rec(page_cursor);
4054 	}
4055 
4056 	/* We limit max record size to 16k even for 64k page size. */
4057 	if (new_rec_size >= REC_MAX_DATA_SIZE) {
4058 		err = DB_OVERFLOW;
4059 
4060 		goto func_exit;
4061 	}
4062 
4063 	if (UNIV_UNLIKELY(new_rec_size
4064 			  >= (page_get_free_space_of_empty(page_is_comp(page))
4065 			      / 2))) {
4066 		/* We may need to update the IBUF_BITMAP_FREE
4067 		bits after a reorganize that was done in
4068 		btr_cur_update_alloc_zip(). */
4069 		err = DB_OVERFLOW;
4070 		goto func_exit;
4071 	}
4072 
4073 	if (UNIV_UNLIKELY(page_get_data_size(page)
4074 			  - old_rec_size + new_rec_size
4075 			  < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) {
4076 		/* We may need to update the IBUF_BITMAP_FREE
4077 		bits after a reorganize that was done in
4078 		btr_cur_update_alloc_zip(). */
4079 
4080 		/* The page would become too empty */
4081 		err = DB_UNDERFLOW;
4082 		goto func_exit;
4083 	}
4084 
4085 	/* We do not attempt to reorganize if the page is compressed.
4086 	This is because the page may fail to compress after reorganization. */
4087 	max_size = page_zip
4088 		? page_get_max_insert_size(page, 1)
4089 		: (old_rec_size
4090 		   + page_get_max_insert_size_after_reorganize(page, 1));
4091 
4092 	if (!page_zip) {
4093 		max_ins_size = page_get_max_insert_size_after_reorganize(
4094 				page, 1);
4095 	}
4096 
4097 	if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
4098 	       && (max_size >= new_rec_size))
4099 	      || (page_get_n_recs(page) <= 1))) {
4100 
4101 		/* We may need to update the IBUF_BITMAP_FREE
4102 		bits after a reorganize that was done in
4103 		btr_cur_update_alloc_zip(). */
4104 
4105 		/* There was not enough space, or it did not pay to
4106 		reorganize: for simplicity, we decide what to do assuming a
4107 		reorganization is needed, though it might not be necessary */
4108 
4109 		err = DB_OVERFLOW;
4110 		goto func_exit;
4111 	}
4112 
4113 	/* Do lock checking and undo logging */
4114 	err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4115 					update, cmpl_info,
4116 					thr, mtr, &roll_ptr);
4117 	if (err != DB_SUCCESS) {
4118 		/* We may need to update the IBUF_BITMAP_FREE
4119 		bits after a reorganize that was done in
4120 		btr_cur_update_alloc_zip(). */
4121 		goto func_exit;
4122 	}
4123 
4124 	/* Ok, we may do the replacement. Store on the page infimum the
4125 	explicit locks on rec, before deleting rec (see the comment in
4126 	btr_cur_pessimistic_update). */
4127 	if (!dict_table_is_locking_disabled(index->table)) {
4128 		lock_rec_store_on_page_infimum(block, rec);
4129 	}
4130 
4131 	btr_search_update_hash_on_delete(cursor);
4132 
4133 	page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4134 
4135 	page_cur_move_to_prev(page_cursor);
4136 
4137 	if (!(flags & BTR_KEEP_SYS_FLAG)
4138 	    && !dict_table_is_intrinsic(index->table)) {
4139 		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
4140 					      roll_ptr);
4141 		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
4142 					      trx_id);
4143 	}
4144 
4145 	/* There are no externally stored columns in new_entry */
4146 	rec = btr_cur_insert_if_possible(
4147 		cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
4148 	ut_a(rec); /* <- We calculated above the insert would fit */
4149 
4150 	/* Restore the old explicit lock state on the record */
4151 	if (!dict_table_is_locking_disabled(index->table)) {
4152 		lock_rec_restore_from_page_infimum(block, rec, block);
4153 	}
4154 
4155 	page_cur_move_to_next(page_cursor);
4156 	ut_ad(err == DB_SUCCESS);
4157 
4158 func_exit:
4159 	if (!(flags & BTR_KEEP_IBUF_BITMAP)
4160 	    && !dict_index_is_clust(index)
4161 	    && !dict_table_is_temporary(index->table)) {
4162 		/* Update the free bits in the insert buffer. */
4163 		if (page_zip) {
4164 			ibuf_update_free_bits_zip(block, mtr);
4165 		} else {
4166 			ibuf_update_free_bits_low(block, max_ins_size, mtr);
4167 		}
4168 	}
4169 
4170 	if (err != DB_SUCCESS) {
4171 		/* prefetch siblings of the leaf for the pessimistic
4172 		operation. */
4173 		btr_cur_prefetch_siblings(block);
4174 	}
4175 
4176 	return(err);
4177 }
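/* Note: btr_cur_pessimistic_update() below first attempts this optimistic
path (with BTR_KEEP_IBUF_BITMAP) and falls through to the pessimistic code
only for DB_OVERFLOW, DB_UNDERFLOW and DB_ZIP_OVERFLOW; any other return
value is passed back to the caller unchanged. */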
4178 
4179 /*************************************************************//**
4180 If, in a split, a new supremum record was created as the predecessor of the
4181 updated record, the supremum record must inherit exactly the locks on the
4182 updated record. In the split it may have inherited locks from the successor
4183 of the updated record, which is not correct. This function restores the
4184 right locks for the new supremum. */
4185 static
4186 void
4187 btr_cur_pess_upd_restore_supremum(
4188 /*==============================*/
4189 	buf_block_t*	block,	/*!< in: buffer block of rec */
4190 	const rec_t*	rec,	/*!< in: updated record */
4191 	mtr_t*		mtr)	/*!< in: mtr */
4192 {
4193 	page_t*		page;
4194 	buf_block_t*	prev_block;
4195 
4196 	page = buf_block_get_frame(block);
4197 
4198 	if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
4199 		/* Updated record is not the first user record on its page */
4200 
4201 		return;
4202 	}
4203 
4204 	const ulint	prev_page_no = btr_page_get_prev(page, mtr);
4205 
4206 	const page_id_t	page_id(block->page.id.space(), prev_page_no);
4207 
4208 	ut_ad(prev_page_no != FIL_NULL);
4209 	prev_block = buf_page_get_with_no_latch(page_id, block->page.size, mtr);
4210 #ifdef UNIV_BTR_DEBUG
4211 	ut_a(btr_page_get_next(prev_block->frame, mtr)
4212 	     == page_get_page_no(page));
4213 #endif /* UNIV_BTR_DEBUG */
4214 
4215 	/* We must already have an x-latch on prev_block! */
4216 	ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
4217 
4218 	lock_rec_reset_and_inherit_gap_locks(prev_block, block,
4219 					     PAGE_HEAP_NO_SUPREMUM,
4220 					     page_rec_get_heap_no(rec));
4221 }
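/* Informal example of the situation handled above: after the split, the
updated record may have ended up as the first user record of its page, so
the supremum of the preceding page must carry exactly the locks on the
updated record; during the split it may instead have inherited locks from
the updated record's successor, so the supremum's locks are reset here and
re-inherited from the updated record itself. */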
4222 
4223 /*************************************************************//**
4224 Performs an update of a record on a page of a tree. It is assumed
4225 that mtr holds an x-latch on the tree and on the cursor page. If the
4226 update is made on the leaf level, to avoid deadlocks, mtr must also
4227 own x-latches to brothers of page, if those brothers exist. We assume
4228 here that the ordering fields of the record do not change.
4229 @return DB_SUCCESS or error code */
4230 dberr_t
4231 btr_cur_pessimistic_update(
4232 /*=======================*/
4233 	ulint		flags,	/*!< in: undo logging, locking, and rollback
4234 				flags */
4235 	btr_cur_t*	cursor,	/*!< in/out: cursor on the record to update;
4236 				cursor may become invalid if *big_rec == NULL
4237 				|| !(flags & BTR_KEEP_POS_FLAG) */
4238 	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
4239 	mem_heap_t**	offsets_heap,
4240 				/*!< in/out: pointer to memory heap
4241 				that can be emptied, or NULL */
4242 	mem_heap_t*	entry_heap,
4243 				/*!< in/out: memory heap for allocating
4244 				big_rec and the index tuple */
4245 	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
4246 				be stored externally by the caller, or NULL */
4247 	upd_t*		update,	/*!< in/out: update vector; this is allowed to
4248 				also contain trx id and roll ptr fields.
4249 				Non-updated columns that are moved offpage will
4250 				be appended to this. */
4251 	ulint		cmpl_info,/*!< in: compiler info on secondary index
4252 				updates */
4253 	que_thr_t*	thr,	/*!< in: query thread */
4254 	trx_id_t	trx_id,	/*!< in: transaction id */
4255 	mtr_t*		mtr)	/*!< in/out: mini-transaction; must be
4256 				committed before latching any further pages */
4257 {
4258 	big_rec_t*	big_rec_vec	= NULL;
4259 	big_rec_t*	dummy_big_rec;
4260 	dict_index_t*	index;
4261 	buf_block_t*	block;
4262 	page_t*		page;
4263 	page_zip_des_t*	page_zip;
4264 	rec_t*		rec;
4265 	page_cur_t*	page_cursor;
4266 	dberr_t		err;
4267 	dberr_t		optim_err;
4268 	roll_ptr_t	roll_ptr;
4269 	ibool		was_first;
4270 	ulint		n_reserved	= 0;
4271 	ulint		n_ext;
4272 	ulint		max_ins_size	= 0;
4273 
4274 	*offsets = NULL;
4275 	*big_rec = NULL;
4276 
4277 	block = btr_cur_get_block(cursor);
4278 	page = buf_block_get_frame(block);
4279 	page_zip = buf_block_get_page_zip(block);
4280 	index = cursor->index;
4281 
4282 	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
4283 					MTR_MEMO_X_LOCK |
4284 					MTR_MEMO_SX_LOCK)
4285 	      || dict_table_is_intrinsic(index->table));
4286 	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
4287 #ifdef UNIV_ZIP_DEBUG
4288 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4289 #endif /* UNIV_ZIP_DEBUG */
4290 	/* The insert buffer tree should never be updated in place. */
4291 	ut_ad(!dict_index_is_ibuf(index));
4292 	ut_ad(trx_id > 0
4293 	      || (flags & BTR_KEEP_SYS_FLAG)
4294 	      || dict_table_is_intrinsic(index->table));
4295 	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4296 	      || dict_index_is_clust(index));
4297 	ut_ad(thr_get_trx(thr)->id == trx_id
4298 	      || (flags & ~BTR_KEEP_POS_FLAG)
4299 	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4300 		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4301 
4302 	err = optim_err = btr_cur_optimistic_update(
4303 		flags | BTR_KEEP_IBUF_BITMAP,
4304 		cursor, offsets, offsets_heap, update,
4305 		cmpl_info, thr, trx_id, mtr);
4306 
4307 	switch (err) {
4308 	case DB_ZIP_OVERFLOW:
4309 	case DB_UNDERFLOW:
4310 	case DB_OVERFLOW:
4311 		break;
4312 	default:
4313 	err_exit:
4314 		/* We suppressed this with BTR_KEEP_IBUF_BITMAP.
4315 		For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
4316 		already reset by btr_cur_update_alloc_zip() if the
4317 		page was recompressed. */
4318 		if (page_zip
4319 		    && optim_err != DB_ZIP_OVERFLOW
4320 		    && !dict_index_is_clust(index)
4321 		    && !dict_table_is_temporary(index->table)
4322 		    && page_is_leaf(page)) {
4323 			ibuf_update_free_bits_zip(block, mtr);
4324 		}
4325 
4326 		if (big_rec_vec != NULL) {
4327 			dtuple_big_rec_free(big_rec_vec);
4328 		}
4329 
4330 		return(err);
4331 	}
4332 
4333 	rec = btr_cur_get_rec(cursor);
4334 
4335 	*offsets = rec_get_offsets(
4336 		rec, index, *offsets, ULINT_UNDEFINED, offsets_heap);
4337 
4338 	dtuple_t*	new_entry = row_rec_to_index_entry(
4339 		rec, index, *offsets, &n_ext, entry_heap);
4340 
4341 	/* The page containing the clustered index record
4342 	corresponding to new_entry is latched in mtr.  If the
4343 	clustered index record is delete-marked, then its externally
4344 	stored fields cannot have been purged yet, because then the
4345 	purge would also have removed the clustered index record
4346 	itself.  Thus the following call is safe. */
4347 	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4348 						     FALSE, entry_heap);
4349 
4350 	/* We have to set appropriate extern storage bits in the new
4351 	record to be inserted: we have to remember which fields were such */
4352 
4353 	ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
4354 	ut_ad(rec_offs_validate(rec, index, *offsets));
4355 
4356 	/* Get number of externally stored columns in updated record */
4357 	n_ext = new_entry->get_n_ext();
4358 
4359 	/* UNDO logging is also turned off during normal operation on an
4360 	intrinsic table, so the condition must ensure that the table is not intrinsic. */
4361 	if ((flags & BTR_NO_UNDO_LOG_FLAG)
4362 	    && rec_offs_any_extern(*offsets)
4363 	    && !dict_table_is_intrinsic(index->table)) {
4364 		/* We are in a transaction rollback undoing a row
4365 		update: we must free possible externally stored fields
4366 		which got new values in the update, if they are not
4367 		inherited values. They can be inherited if we have
4368 		updated the primary key to another value, and then
4369 		update it back again. */
4370 
4371 		ut_ad(big_rec_vec == NULL);
4372 		ut_ad(dict_index_is_clust(index));
4373 		ut_ad(thr_get_trx(thr)->in_rollback);
4374 
4375 		DBUG_EXECUTE_IF("ib_blob_update_rollback", DBUG_SUICIDE(););
4376 		RECOVERY_CRASH(99);
4377 
4378 		btr_rec_free_updated_extern_fields(
4379 			index, rec, page_zip, *offsets, update, true, mtr);
4380 	}
4381 
4382 	if (page_zip_rec_needs_ext(
4383 			rec_get_converted_size(index, new_entry, n_ext),
4384 			page_is_comp(page),
4385 			dict_index_get_n_fields(index),
4386 			block->page.size)) {
4387 
4388 		big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext);
4389 		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
4390 
4391 			/* We cannot goto return_after_reservations,
4392 			because we may need to update the
4393 			IBUF_BITMAP_FREE bits, which was suppressed by
4394 			BTR_KEEP_IBUF_BITMAP. */
4395 #ifdef UNIV_ZIP_DEBUG
4396 			ut_a(!page_zip
4397 			     || page_zip_validate(page_zip, page, index));
4398 #endif /* UNIV_ZIP_DEBUG */
4399 			if (n_reserved > 0) {
4400 				fil_space_release_free_extents(
4401 					index->space, n_reserved);
4402 			}
4403 
4404 			err = DB_TOO_BIG_RECORD;
4405 			goto err_exit;
4406 		}
4407 
4408 		ut_ad(page_is_leaf(page));
4409 		ut_ad(dict_index_is_clust(index));
4410 		ut_ad(flags & BTR_KEEP_POS_FLAG);
4411 	}
4412 
4413 	/* Do lock checking and undo logging */
4414 	err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4415 					update, cmpl_info,
4416 					thr, mtr, &roll_ptr);
4417 	if (err != DB_SUCCESS) {
4418 		goto err_exit;
4419 	}
4420 
4421 	if (optim_err == DB_OVERFLOW) {
4422 
4423 		/* First reserve enough free space for the file segments
4424 		of the index tree, so that the update will not fail because
4425 		of lack of space */
4426 
4427 		ulint	n_extents = cursor->tree_height / 16 + 3;
4428 
4429 		if (!fsp_reserve_free_extents(
4430 		            &n_reserved, index->space, n_extents,
4431 		            flags & BTR_NO_UNDO_LOG_FLAG
4432 		            ? FSP_CLEANING : FSP_NORMAL,
4433 		            mtr)) {
4434 			err = DB_OUT_OF_FILE_SPACE;
4435 			goto err_exit;
4436 		}
4437 	}
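	/* If BTR_NO_UNDO_LOG_FLAG is set we are presumably undoing or
	purging an earlier change, so the reservation above uses
	FSP_CLEANING rather than FSP_NORMAL; the intent is that such
	operations may use space that ordinary reservations must leave
	free, so that they do not fail for lack of room (see
	fsp_reserve_free_extents()). */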
4438 
4439 	if (!(flags & BTR_KEEP_SYS_FLAG)
4440 	    && !dict_table_is_intrinsic(index->table)) {
4441 		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
4442 					      roll_ptr);
4443 		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
4444 					      trx_id);
4445 	}
4446 
4447 	if (!page_zip) {
4448 		max_ins_size = page_get_max_insert_size_after_reorganize(
4449 				page, 1);
4450 	}
4451 
4452 	/* Store state of explicit locks on rec on the page infimum record,
4453 	before deleting rec. The page infimum acts as a dummy carrier of the
4454 	locks, taking care also of lock releases, before we can move the locks
4455 	back on the actual record. There is a special case: the insert may
4456 	happen on the root page and cause a call of
4457 	btr_root_raise_and_insert. Therefore we cannot in the lock system
4458 	delete the lock structs set on the root page even if the root
4459 	page carries just node pointers. */
4460 	if (!dict_table_is_locking_disabled(index->table)) {
4461 		lock_rec_store_on_page_infimum(block, rec);
4462 	}
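	/* Illustration: any explicit lock a transaction holds on rec is
	parked on the page infimum here, survives the delete and re-insert
	of the record, and is moved back onto the new copy of the record by
	lock_rec_restore_from_page_infimum() further below. */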
4463 
4464 	btr_search_update_hash_on_delete(cursor);
4465 
4466 #ifdef UNIV_ZIP_DEBUG
4467 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4468 #endif /* UNIV_ZIP_DEBUG */
4469 	page_cursor = btr_cur_get_page_cur(cursor);
4470 
4471 	page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4472 
4473 	page_cur_move_to_prev(page_cursor);
4474 
4475 	rec = btr_cur_insert_if_possible(cursor, new_entry,
4476 					 offsets, offsets_heap, n_ext, mtr);
4477 
4478 	if (rec) {
4479 		page_cursor->rec = rec;
4480 
4481 		if (!dict_table_is_locking_disabled(index->table)) {
4482 			lock_rec_restore_from_page_infimum(
4483 				btr_cur_get_block(cursor), rec, block);
4484 		}
4485 
4486 		if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
4487 			/* The new inserted record owns its possible externally
4488 			stored fields */
4489 			btr_cur_unmark_extern_fields(
4490 				page_zip, rec, index, *offsets, mtr);
4491 		}
4492 
4493 		bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
4494 
4495 		if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
4496 			if (adjust) {
4497 				rec_offs_make_valid(
4498 					page_cursor->rec, index, *offsets);
4499 			}
4500 		} else if (!dict_index_is_clust(index)
4501 			   && !dict_table_is_temporary(index->table)
4502 			   && page_is_leaf(page)) {
4503 			/* Update the free bits in the insert buffer.
4504 			This is the same block which was skipped by
4505 			BTR_KEEP_IBUF_BITMAP. */
4506 			if (page_zip) {
4507 				ibuf_update_free_bits_zip(block, mtr);
4508 			} else {
4509 				ibuf_update_free_bits_low(block, max_ins_size,
4510 							  mtr);
4511 			}
4512 		}
4513 
4514 		if (!srv_read_only_mode
4515 		    && !big_rec_vec
4516 		    && page_is_leaf(page)
4517 		    && !dict_index_is_online_ddl(index)) {
4518 
4519 			mtr_memo_release(mtr, dict_index_get_lock(index),
4520 					 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
4521 
4522 			/* NOTE: We cannot release the root block latch here,
4523 			because it contains the segment header and has usually already been modified. */
4524 		}
4525 
4526 		err = DB_SUCCESS;
4527 		goto return_after_reservations;
4528 	} else {
4529 		/* If the page is compressed and it initially
4530 		compresses very well, and there is a subsequent insert
4531 		of a badly-compressing record, it is possible for
4532 		btr_cur_optimistic_update() to return DB_UNDERFLOW and
4533 		btr_cur_insert_if_possible() to return NULL. */
4534 		ut_a(page_zip || optim_err != DB_UNDERFLOW);
4535 
4536 		/* Out of space: reset the free bits.
4537 		This is the same block which was skipped by
4538 		BTR_KEEP_IBUF_BITMAP. */
4539 		if (!dict_index_is_clust(index)
4540 		    && !dict_table_is_temporary(index->table)
4541 		    && page_is_leaf(page)) {
4542 			ibuf_reset_free_bits(block);
4543 		}
4544 	}
4545 
4546 	if (big_rec_vec != NULL && !dict_table_is_intrinsic(index->table)) {
4547 		ut_ad(page_is_leaf(page));
4548 		ut_ad(dict_index_is_clust(index));
4549 		ut_ad(flags & BTR_KEEP_POS_FLAG);
4550 
4551 		/* btr_page_split_and_insert() in
4552 		btr_cur_pessimistic_insert() invokes
4553 		mtr_memo_release(mtr, index->lock, MTR_MEMO_SX_LOCK).
4554 		We must keep the index->lock when we created a
4555 		big_rec, so that row_upd_clust_rec() can store the
4556 		big_rec in the same mini-transaction. */
4557 
4558 		ut_ad(mtr_memo_contains_flagged(mtr,
4559 						dict_index_get_lock(index),
4560 						MTR_MEMO_X_LOCK |
4561 						MTR_MEMO_SX_LOCK));
4562 
4563 		mtr_sx_lock(dict_index_get_lock(index), mtr);
4564 	}
4565 
4566 	/* Was the record to be updated positioned as the first user
4567 	record on its page? */
4568 	was_first = page_cur_is_before_first(page_cursor);
4569 
4570 	/* Lock checks and undo logging were already performed by
4571 	btr_cur_upd_lock_and_undo(). We do not try
4572 	btr_cur_optimistic_insert() because
4573 	btr_cur_insert_if_possible() already failed above. */
4574 
4575 	err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
4576 					 | BTR_NO_LOCKING_FLAG
4577 					 | BTR_KEEP_SYS_FLAG,
4578 					 cursor, offsets, offsets_heap,
4579 					 new_entry, &rec,
4580 					 &dummy_big_rec, n_ext, NULL, mtr);
4581 	ut_a(rec);
4582 	ut_a(err == DB_SUCCESS);
4583 	ut_a(dummy_big_rec == NULL);
4584 	ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
4585 	page_cursor->rec = rec;
4586 
4587 	/* Multiple transactions cannot operate on the same temp-table
4588 	in parallel.
4589 	max_trx_id is ignored for temp tables because it is not required
4590 	for MVCC. */
4591 	if (dict_index_is_sec_or_ibuf(index)
4592 	    && !dict_table_is_temporary(index->table)) {
4593 		/* Update PAGE_MAX_TRX_ID in the index page header.
4594 		It was not updated by btr_cur_pessimistic_insert()
4595 		because of BTR_NO_LOCKING_FLAG. */
4596 		buf_block_t*	rec_block;
4597 
4598 		rec_block = btr_cur_get_block(cursor);
4599 
4600 		page_update_max_trx_id(rec_block,
4601 				       buf_block_get_page_zip(rec_block),
4602 				       trx_id, mtr);
4603 	}
4604 
4605 	if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
4606 		/* The new inserted record owns its possible externally
4607 		stored fields */
4608 		buf_block_t*	rec_block = btr_cur_get_block(cursor);
4609 
4610 #ifdef UNIV_ZIP_DEBUG
4611 		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4612 		page = buf_block_get_frame(rec_block);
4613 #endif /* UNIV_ZIP_DEBUG */
4614 		page_zip = buf_block_get_page_zip(rec_block);
4615 
4616 		btr_cur_unmark_extern_fields(page_zip,
4617 					     rec, index, *offsets, mtr);
4618 	}
4619 
4620 	if (!dict_table_is_locking_disabled(index->table)) {
4621 		lock_rec_restore_from_page_infimum(
4622 			btr_cur_get_block(cursor), rec, block);
4623 	}
4624 
4625 	/* If necessary, restore also the correct lock state for a new,
4626 	preceding supremum record created in a page split. While the old
4627 	record was nonexistent, the supremum might have inherited its locks
4628 	from a wrong record. */
4629 
4630 	if (!was_first && !dict_table_is_locking_disabled(index->table)) {
4631 		btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
4632 						  rec, mtr);
4633 	}
4634 
4635 return_after_reservations:
4636 #ifdef UNIV_ZIP_DEBUG
4637 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4638 #endif /* UNIV_ZIP_DEBUG */
4639 
4640 	if (n_reserved > 0) {
4641 		fil_space_release_free_extents(index->space, n_reserved);
4642 	}
4643 
4644 	*big_rec = big_rec_vec;
4645 
4646 	return(err);
4647 }
4648 
4649 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
4650 
4651 /****************************************************************//**
4652 Writes the redo log record for delete marking or unmarking of an index
4653 record. */
4654 UNIV_INLINE
4655 void
4656 btr_cur_del_mark_set_clust_rec_log(
4657 /*===============================*/
4658 	rec_t*		rec,	/*!< in: record */
4659 	dict_index_t*	index,	/*!< in: index of the record */
4660 	trx_id_t	trx_id,	/*!< in: transaction id */
4661 	roll_ptr_t	roll_ptr,/*!< in: roll ptr to the undo log record */
4662 	mtr_t*		mtr)	/*!< in: mtr */
4663 {
4664 	byte*	log_ptr;
4665 
4666 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4667 	ut_ad(mtr->is_named_space(index->space));
4668 
4669 	log_ptr = mlog_open_and_write_index(mtr, rec, index,
4670 					    page_rec_is_comp(rec)
4671 					    ? MLOG_COMP_REC_CLUST_DELETE_MARK
4672 					    : MLOG_REC_CLUST_DELETE_MARK,
4673 					    1 + 1 + DATA_ROLL_PTR_LEN
4674 					    + 14 + 2);
4675 
4676 	if (!log_ptr) {
4677 		/* Logging in mtr is switched off during crash recovery */
4678 		return;
4679 	}
4680 
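	/* The log record body written below is: one byte of flags (0), one
	byte for the delete-mark value (1), the system fields (TRX_ID field
	position, roll pointer and TRX_ID), and a two-byte page offset of
	the record. */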
4681 	*log_ptr++ = 0;
4682 	*log_ptr++ = 1;
4683 
4684 	log_ptr = row_upd_write_sys_vals_to_log(
4685 		index, trx_id, roll_ptr, log_ptr, mtr);
4686 	mach_write_to_2(log_ptr, page_offset(rec));
4687 	log_ptr += 2;
4688 
4689 	mlog_close(mtr, log_ptr);
4690 }
4691 #endif /* !UNIV_HOTBACKUP */
4692 
4693 /****************************************************************//**
4694 Parses the redo log record for delete marking or unmarking of a clustered
4695 index record.
4696 @return end of log record or NULL */
4697 byte*
4698 btr_cur_parse_del_mark_set_clust_rec(
4699 /*=================================*/
4700 	byte*		ptr,	/*!< in: buffer */
4701 	byte*		end_ptr,/*!< in: buffer end */
4702 	page_t*		page,	/*!< in/out: page or NULL */
4703 	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
4704 	dict_index_t*	index)	/*!< in: index corresponding to page */
4705 {
4706 	ulint		flags;
4707 	ulint		val;
4708 	ulint		pos;
4709 	trx_id_t	trx_id;
4710 	roll_ptr_t	roll_ptr;
4711 	ulint		offset;
4712 	rec_t*		rec;
4713 
4714 	ut_ad(!page
4715 	      || !!page_is_comp(page) == dict_table_is_comp(index->table));
4716 
4717 	if (end_ptr < ptr + 2) {
4718 
4719 		return(NULL);
4720 	}
4721 
4722 	flags = mach_read_from_1(ptr);
4723 	ptr++;
4724 	val = mach_read_from_1(ptr);
4725 	ptr++;
4726 
4727 	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
4728 
4729 	if (ptr == NULL) {
4730 
4731 		return(NULL);
4732 	}
4733 
4734 	if (end_ptr < ptr + 2) {
4735 
4736 		return(NULL);
4737 	}
4738 
4739 	offset = mach_read_from_2(ptr);
4740 	ptr += 2;
4741 
4742 	ut_a(offset <= UNIV_PAGE_SIZE);
4743 
4744 	if (page) {
4745 		rec = page + offset;
4746 
4747 		/* We do not need to reserve search latch, as the page
4748 		is only being recovered, and there cannot be a hash index to
4749 		it. Besides, these fields are being updated in place
4750 		and the adaptive hash index does not depend on them. */
4751 
4752 		btr_rec_set_deleted_flag(rec, page_zip, val);
4753 
4754 		if (!(flags & BTR_KEEP_SYS_FLAG)) {
4755 			mem_heap_t*	heap		= NULL;
4756 			ulint		offsets_[REC_OFFS_NORMAL_SIZE];
4757 			rec_offs_init(offsets_);
4758 
4759 			row_upd_rec_sys_fields_in_recovery(
4760 				rec, page_zip,
4761 				rec_get_offsets(rec, index, offsets_,
4762 						ULINT_UNDEFINED, &heap),
4763 				pos, trx_id, roll_ptr);
4764 			if (UNIV_LIKELY_NULL(heap)) {
4765 				mem_heap_free(heap);
4766 			}
4767 		}
4768 	}
4769 
4770 	return(ptr);
4771 }
4772 
4773 #ifndef UNIV_HOTBACKUP
4774 /***********************************************************//**
4775 Marks a clustered index record deleted. Writes an undo log record to
4776 undo log on this delete marking. Writes in the trx id field the id
4777 of the deleting transaction, and in the roll ptr field a pointer to the
4778 undo log record created.
4779 @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
4780 dberr_t
4781 btr_cur_del_mark_set_clust_rec(
4782 /*===========================*/
4783 	ulint		flags,  /*!< in: undo logging and locking flags */
4784 	buf_block_t*	block,	/*!< in/out: buffer block of the record */
4785 	rec_t*		rec,	/*!< in/out: record */
4786 	dict_index_t*	index,	/*!< in: clustered index of the record */
4787 	const ulint*	offsets,/*!< in: rec_get_offsets(rec) */
4788 	que_thr_t*	thr,	/*!< in: query thread */
4789 	const dtuple_t*	entry,	/*!< in: dtuple for the deleting record, also
4790 				contains the virtual cols if there are any */
4791 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
4792 {
4793 	roll_ptr_t	roll_ptr;
4794 	dberr_t		err;
4795 	page_zip_des_t*	page_zip;
4796 	trx_t*		trx;
4797 
4798 	ut_ad(dict_index_is_clust(index));
4799 	ut_ad(rec_offs_validate(rec, index, offsets));
4800 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4801 	ut_ad(buf_block_get_frame(block) == page_align(rec));
4802 	ut_ad(page_is_leaf(page_align(rec)));
4803 	ut_ad(mtr->is_named_space(index->space));
4804 
4805 	if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
4806 		/* This can happen during cascading delete operations. */
4807 		ut_ad(rec_get_trx_id(rec, index) == thr_get_trx(thr)->id);
4808 		return(DB_SUCCESS);
4809 	}
4810 
4811 	err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
4812 						   rec, index, offsets, thr);
4813 
4814 	if (err != DB_SUCCESS) {
4815 
4816 		return(err);
4817 	}
4818 
4819 	err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
4820 					    index, entry, NULL, 0, rec, offsets,
4821 					    &roll_ptr);
4822 	if (err != DB_SUCCESS) {
4823 
4824 		return(err);
4825 	}
4826 
4827 	/* The search latch is not needed here, because
4828 	the adaptive hash index does not depend on the delete-mark
4829 	and the delete-mark is being updated in place. */
4830 
4831 	page_zip = buf_block_get_page_zip(block);
4832 
4833 	btr_rec_set_deleted_flag(rec, page_zip, TRUE);
4834 
4835 	/* For an intrinsic table, the roll pointer is not maintained, as
4836 	there is no UNDO logging. Skip updating it. */
4837 	if (dict_table_is_intrinsic(index->table)) {
4838 		return(err);
4839 	}
4840 
4841 	trx = thr_get_trx(thr);
4842 	/* This function must not be invoked during rollback
4843 	(of a TRX_STATE_PREPARE transaction or otherwise). */
4844 	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
4845 	ut_ad(!trx->in_rollback);
4846 
4847 	DBUG_PRINT("ib_cur", ("delete-mark clust %s (" IB_ID_FMT
4848 			      ") by " TRX_ID_FMT ": %s",
4849 			      index->table_name, index->id,
4850 			      trx_get_id_for_print(trx),
4851 			      rec_printer(rec, offsets).str().c_str()));
4852 
4853 	if (dict_index_is_online_ddl(index)) {
4854 		row_log_table_delete(rec, entry, index, offsets, NULL);
4855 	}
4856 
4857 	row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr);
4858 
4859 	btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id,
4860 					   roll_ptr, mtr);
4861 
4862 	return(err);
4863 }
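/* Note: delete marking leaves the record physically in place; the
delete-marked record (and any externally stored fields) is removed later
by purge, once no active transaction can still need the old version. */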
4864 
4865 /****************************************************************//**
4866 Writes the redo log record for a delete mark setting of a secondary
4867 index record. */
4868 UNIV_INLINE
4869 void
4870 btr_cur_del_mark_set_sec_rec_log(
4871 /*=============================*/
4872 	rec_t*		rec,	/*!< in: record */
4873 	ibool		val,	/*!< in: value to set */
4874 	mtr_t*		mtr)	/*!< in: mtr */
4875 {
4876 	byte*	log_ptr;
4877 	ut_ad(val <= 1);
4878 
4879 	log_ptr = mlog_open(mtr, 11 + 1 + 2);
4880 
4881 	if (!log_ptr) {
4882 		/* Logging in mtr is switched off during crash recovery:
4883 		in that case mlog_open returns NULL */
4884 		return;
4885 	}
4886 
4887 	log_ptr = mlog_write_initial_log_record_fast(
4888 		rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
4889 	mach_write_to_1(log_ptr, val);
4890 	log_ptr++;
4891 
4892 	mach_write_to_2(log_ptr, page_offset(rec));
4893 	log_ptr += 2;
4894 
4895 	mlog_close(mtr, log_ptr);
4896 }
4897 #endif /* !UNIV_HOTBACKUP */
4898 
4899 /****************************************************************//**
4900 Parses the redo log record for delete marking or unmarking of a secondary
4901 index record.
4902 @return end of log record or NULL */
4903 byte*
4904 btr_cur_parse_del_mark_set_sec_rec(
4905 /*===============================*/
4906 	byte*		ptr,	/*!< in: buffer */
4907 	byte*		end_ptr,/*!< in: buffer end */
4908 	page_t*		page,	/*!< in/out: page or NULL */
4909 	page_zip_des_t*	page_zip)/*!< in/out: compressed page, or NULL */
4910 {
4911 	ulint	val;
4912 	ulint	offset;
4913 	rec_t*	rec;
4914 
4915 	if (end_ptr < ptr + 3) {
4916 
4917 		return(NULL);
4918 	}
4919 
4920 	val = mach_read_from_1(ptr);
4921 	ptr++;
4922 
4923 	offset = mach_read_from_2(ptr);
4924 	ptr += 2;
4925 
4926 	ut_a(offset <= UNIV_PAGE_SIZE);
4927 
4928 	if (page) {
4929 		rec = page + offset;
4930 
4931 		/* We do not need to reserve search latch, as the page
4932 		is only being recovered, and there cannot be a hash index to
4933 		it. Besides, the delete-mark flag is being updated in place
4934 		and the adaptive hash index does not depend on it. */
4935 
4936 		btr_rec_set_deleted_flag(rec, page_zip, val);
4937 	}
4938 
4939 	return(ptr);
4940 }
4941 
4942 #ifndef UNIV_HOTBACKUP
4943 /***********************************************************//**
4944 Sets a secondary index record delete mark to TRUE or FALSE.
4945 @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
4946 dberr_t
4947 btr_cur_del_mark_set_sec_rec(
4948 /*=========================*/
4949 	ulint		flags,	/*!< in: locking flag */
4950 	btr_cur_t*	cursor,	/*!< in: cursor */
4951 	ibool		val,	/*!< in: value to set */
4952 	que_thr_t*	thr,	/*!< in: query thread */
4953 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
4954 {
4955 	buf_block_t*	block;
4956 	rec_t*		rec;
4957 	dberr_t		err;
4958 
4959 	block = btr_cur_get_block(cursor);
4960 	rec = btr_cur_get_rec(cursor);
4961 
4962 	err = lock_sec_rec_modify_check_and_lock(flags,
4963 						 btr_cur_get_block(cursor),
4964 						 rec, cursor->index, thr, mtr);
4965 	if (err != DB_SUCCESS) {
4966 
4967 		return(err);
4968 	}
4969 
4970 	ut_ad(!!page_rec_is_comp(rec)
4971 	      == dict_table_is_comp(cursor->index->table));
4972 
4973 	DBUG_PRINT("ib_cur", ("delete-mark=%u sec %u:%u:%u in %s("
4974 			      IB_ID_FMT ") by " TRX_ID_FMT,
4975 			      unsigned(val),
4976 			      block->page.id.space(), block->page.id.page_no(),
4977 			      unsigned(page_rec_get_heap_no(rec)),
4978 			      cursor->index->name(), cursor->index->id,
4979 			      trx_get_id_for_print(thr_get_trx(thr))));
4980 
4981 	/* We do not need to reserve search latch, as the
4982 	delete-mark flag is being updated in place and the adaptive
4983 	hash index does not depend on it. */
4984 	btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);
4985 
4986 	btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
4987 
4988 	return(DB_SUCCESS);
4989 }
4990 
4991 /***********************************************************//**
4992 Sets a secondary index record's delete mark to the given value. This
4993 function is only used by the insert buffer merge mechanism. */
4994 void
4995 btr_cur_set_deleted_flag_for_ibuf(
4996 /*==============================*/
4997 	rec_t*		rec,		/*!< in/out: record */
4998 	page_zip_des_t*	page_zip,	/*!< in/out: compressed page
4999 					corresponding to rec, or NULL
5000 					when the tablespace is
5001 					uncompressed */
5002 	ibool		val,		/*!< in: value to set */
5003 	mtr_t*		mtr)		/*!< in/out: mini-transaction */
5004 {
5005 	/* We do not need to reserve search latch, as the page
5006 	has just been read to the buffer pool and there cannot be
5007 	a hash index to it.  Besides, the delete-mark flag is being
5008 	updated in place and the adaptive hash index does not depend
5009 	on it. */
5010 
5011 	btr_rec_set_deleted_flag(rec, page_zip, val);
5012 
5013 	btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
5014 }
5015 
5016 /*==================== B-TREE RECORD REMOVE =========================*/
5017 
5018 /*************************************************************//**
5019 Tries to compress a page of the tree if it seems useful. It is assumed
5020 that mtr holds an x-latch on the tree and on the cursor page. To avoid
5021 deadlocks, mtr must also own x-latches to brothers of page, if those
5022 brothers exist. NOTE: it is assumed that the caller has reserved enough
5023 free extents so that the compression will always succeed if done!
5024 @return TRUE if compression occurred */
5025 ibool
5026 btr_cur_compress_if_useful(
5027 /*=======================*/
5028 	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to compress;
5029 				cursor does not stay valid if !adjust and
5030 				compression occurs */
5031 	ibool		adjust,	/*!< in: TRUE if should adjust the
5032 				cursor position even if compression occurs */
5033 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
5034 {
5035 	/* Avoid applying compression: given the workload of intrinsic
5036 	tables, they do not accumulate much page garbage. */
5037 	if (dict_table_is_intrinsic(cursor->index->table)) {
5038 		return(FALSE);
5039 	}
5040 
5041 	ut_ad(mtr_memo_contains_flagged(
5042 		mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
5043 		MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
5044 	      || dict_table_is_intrinsic(cursor->index->table));
5045 	ut_ad(mtr_is_block_fix(
5046 		mtr, btr_cur_get_block(cursor),
5047 		MTR_MEMO_PAGE_X_FIX, cursor->index->table));
5048 
5049 	if (dict_index_is_spatial(cursor->index)) {
5050 		const page_t*   page = btr_cur_get_page(cursor);
5051 		const trx_t*	trx = NULL;
5052 
5053 		if (cursor->rtr_info->thr != NULL) {
5054 			trx = thr_get_trx(cursor->rtr_info->thr);
5055 		}
5056 
5057 		/* Check whether page lock prevents the compression */
5058 		if (!lock_test_prdt_page_lock(trx, page_get_space_id(page),
5059 					      page_get_page_no(page))) {
5060 			return(false);
5061 		}
5062 	}
5063 
5064 	return(btr_cur_compress_recommendation(cursor, mtr)
5065 	       && btr_compress(cursor, adjust, mtr));
5066 }
5067 
5068 /*******************************************************//**
5069 Removes the record on which the tree cursor is positioned on a leaf page.
5070 It is assumed that the mtr has an x-latch on the page where the cursor is
5071 positioned, but no latch on the whole tree.
5072 @return TRUE if success, i.e., the page did not become too empty */
5073 ibool
5074 btr_cur_optimistic_delete_func(
5075 /*===========================*/
5076 	btr_cur_t*	cursor,	/*!< in: cursor on leaf page, on the record to
5077 				delete; cursor stays valid: if deletion
5078 				succeeds, on function exit it points to the
5079 				successor of the deleted record */
5080 #ifdef UNIV_DEBUG
5081 	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
5082 #endif /* UNIV_DEBUG */
5083 	mtr_t*		mtr)	/*!< in: mtr; if this function returns
5084 				TRUE on a leaf page of a secondary
5085 				index, the mtr must be committed
5086 				before latching any further pages */
5087 {
5088 	buf_block_t*	block;
5089 	rec_t*		rec;
5090 	mem_heap_t*	heap		= NULL;
5091 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
5092 	ulint*		offsets		= offsets_;
5093 	ibool		no_compress_needed;
5094 	rec_offs_init(offsets_);
5095 
5096 	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
5097 	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
5098 				MTR_MEMO_PAGE_X_FIX));
5099 	ut_ad(mtr_is_block_fix(mtr, btr_cur_get_block(cursor),
5100 			       MTR_MEMO_PAGE_X_FIX, cursor->index->table));
5101 	ut_ad(mtr->is_named_space(cursor->index->space));
5102 
5103 	/* This is intended only for leaf page deletions */
5104 
5105 	block = btr_cur_get_block(cursor);
5106 
5107 	ut_ad(page_is_leaf(buf_block_get_frame(block)));
5108 	ut_ad(!dict_index_is_online_ddl(cursor->index)
5109 	      || dict_index_is_clust(cursor->index)
5110 	      || (flags & BTR_CREATE_FLAG));
5111 
5112 	rec = btr_cur_get_rec(cursor);
5113 	offsets = rec_get_offsets(rec, cursor->index, offsets,
5114 				  ULINT_UNDEFINED, &heap);
5115 
5116 	no_compress_needed = !rec_offs_any_extern(offsets)
5117 		&& btr_cur_can_delete_without_compress(
5118 			cursor, rec_offs_size(offsets), mtr);
5119 
5120 	if (no_compress_needed) {
5121 
5122 		page_t*		page	= buf_block_get_frame(block);
5123 		page_zip_des_t*	page_zip= buf_block_get_page_zip(block);
5124 
5125 		lock_update_delete(block, rec);
5126 
5127 		btr_search_update_hash_on_delete(cursor);
5128 
5129 		if (page_zip) {
5130 #ifdef UNIV_ZIP_DEBUG
5131 			ut_a(page_zip_validate(page_zip, page, cursor->index));
5132 #endif /* UNIV_ZIP_DEBUG */
5133 			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5134 					    cursor->index, offsets, mtr);
5135 #ifdef UNIV_ZIP_DEBUG
5136 			ut_a(page_zip_validate(page_zip, page, cursor->index));
5137 #endif /* UNIV_ZIP_DEBUG */
5138 
5139 			/* On compressed pages, the IBUF_BITMAP_FREE
5140 			space is not affected by deleting (purging)
5141 			records, because it is defined as the minimum
5142 			of space available *without* reorganize, and
5143 			space available in the modification log. */
5144 		} else {
5145 			const ulint	max_ins
5146 				= page_get_max_insert_size_after_reorganize(
5147 					page, 1);
5148 
5149 			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5150 					    cursor->index, offsets, mtr);
5151 
5152 			/* The change buffer does not handle inserts
5153 			into non-leaf pages, into clustered indexes,
5154 			or into the change buffer. */
5155 			if (!dict_index_is_clust(cursor->index)
5156 			    && !dict_table_is_temporary(cursor->index->table)
5157 			    && !dict_index_is_ibuf(cursor->index)) {
5158 				ibuf_update_free_bits_low(block, max_ins, mtr);
5159 			}
5160 		}
5161 	} else {
5162 		/* Prefetch the siblings of the leaf page for the pessimistic
5163 		operation that will follow. */
5164 		btr_cur_prefetch_siblings(block);
5165 	}
5166 
5167 	if (UNIV_LIKELY_NULL(heap)) {
5168 		mem_heap_free(heap);
5169 	}
5170 
5171 	return(no_compress_needed);
5172 }
5173 
5174 /*************************************************************//**
5175 Removes the record on which the tree cursor is positioned. Tries
5176 to compress the page if its fillfactor drops below a threshold
5177 or if it is the only page on the level. It is assumed that mtr holds
5178 an x-latch on the tree and on the cursor page. To avoid deadlocks,
5179 mtr must also own x-latches to brothers of page, if those brothers
5180 exist.
5181 @return TRUE if compression occurred, FALSE if not or if something
5182 went wrong. */
5183 ibool
5184 btr_cur_pessimistic_delete(
5185 /*=======================*/
5186 	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
5187 				the latter may occur because we may have
5188 				to update node pointers on upper levels,
5189 				and in the case of variable length keys
5190 				these may actually grow in size */
5191 	ibool		has_reserved_extents, /*!< in: TRUE if the
5192 				caller has already reserved enough free
5193 				extents so that he knows that the operation
5194 				will succeed */
5195 	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
5196 				if compression does not occur, the cursor
5197 				stays valid: it points to successor of
5198 				deleted record on function exit */
5199 	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
5200 	bool		rollback,/*!< in: performing rollback? */
5201 	mtr_t*		mtr)	/*!< in: mtr */
5202 {
5203 	buf_block_t*	block;
5204 	page_t*		page;
5205 	page_zip_des_t*	page_zip;
5206 	dict_index_t*	index;
5207 	rec_t*		rec;
5208 	ulint		n_reserved	= 0;
5209 	bool		success;
5210 	ibool		ret		= FALSE;
5211 	ulint		level;
5212 	mem_heap_t*	heap;
5213 	ulint*		offsets;
5214 	bool		allow_merge = true; /* if true, implies we have taken appropriate page
5215 			latches needed to merge this page.*/
5216 #ifdef UNIV_DEBUG
5217 	bool		parent_latched	= false;
5218 #endif /* UNIV_DEBUG */
5219 
5220 	block = btr_cur_get_block(cursor);
5221 	page = buf_block_get_frame(block);
5222 	index = btr_cur_get_index(cursor);
5223 
5224 	ulint rec_size_est = dict_index_node_ptr_max_size(index);
5225 	const page_size_t       page_size(dict_table_page_size(index->table));
5226 
5227 	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
5228 	ut_ad(!dict_index_is_online_ddl(index)
5229 	      || dict_index_is_clust(index)
5230 	      || (flags & BTR_CREATE_FLAG));
5231 	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
5232 					MTR_MEMO_X_LOCK
5233 					| MTR_MEMO_SX_LOCK)
5234 	      || dict_table_is_intrinsic(index->table));
5235 	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
5236 	ut_ad(mtr->is_named_space(index->space));
5237 
5238 	if (!has_reserved_extents) {
5239 		/* First reserve enough free space for the file segments
5240 		of the index tree, so that the node pointer updates will
5241 		not fail because of lack of space */
5242 
5243 		ulint	n_extents = cursor->tree_height / 32 + 1;
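		/* As a rough example: with cursor->tree_height == 3 this
		reserves 3 / 32 + 1 = 1 extent, and with a height of 65 it
		reserves 65 / 32 + 1 = 3 extents (an extent is typically
		64 pages, i.e. 1 MiB with the default 16 KiB page size). */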
5244 
5245 		success = fsp_reserve_free_extents(&n_reserved,
5246 						   index->space,
5247 						   n_extents,
5248 						   FSP_CLEANING, mtr);
5249 		if (!success) {
5250 			*err = DB_OUT_OF_FILE_SPACE;
5251 
5252 			return(FALSE);
5253 		}
5254 	}
5255 
5256 	heap = mem_heap_create(1024);
5257 	rec = btr_cur_get_rec(cursor);
5258 	page_zip = buf_block_get_page_zip(block);
5259 #ifdef UNIV_ZIP_DEBUG
5260 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5261 #endif /* UNIV_ZIP_DEBUG */
5262 
5263 	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
5264 
5265 	if (rec_offs_any_extern(offsets)) {
5266 		btr_rec_free_externally_stored_fields(index,
5267 						      rec, offsets, page_zip,
5268 						      rollback, mtr);
5269 #ifdef UNIV_ZIP_DEBUG
5270 		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5271 #endif /* UNIV_ZIP_DEBUG */
5272 	}
5273 
5274 	if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
5275 	    && UNIV_UNLIKELY(dict_index_get_page(index)
5276 			     != block->page.id.page_no())) {
5277 
5278 		/* If there is only one record, drop the whole page in
5279 		btr_discard_page, if this is not the root page */
5280 
5281 		btr_discard_page(cursor, mtr);
5282 
5283 		ret = TRUE;
5284 
5285 		goto return_after_reservations;
5286 	}
5287 
5288 	if (flags == 0) {
5289 		lock_update_delete(block, rec);
5290 	}
5291 
5292 	level = btr_page_get_level(page, mtr);
5293 
5294 	if (level > 0
5295 	    && UNIV_UNLIKELY(rec == page_rec_get_next(
5296 				     page_get_infimum_rec(page)))) {
5297 
5298 		rec_t*	next_rec = page_rec_get_next(rec);
5299 
5300 		if (btr_page_get_prev(page, mtr) == FIL_NULL) {
5301 
5302 			/* If we delete the leftmost node pointer on a
5303 			non-leaf level, we must mark the new leftmost node
5304 			pointer as the predefined minimum record */
5305 
5306 			/* This will make page_zip_validate() fail until
5307 			page_cur_delete_rec() completes.  This is harmless,
5308 			because everything will take place within a single
5309 			mini-transaction and because writing to the redo log
5310 			is an atomic operation (performed by mtr_commit()). */
5311 			btr_set_min_rec_mark(next_rec, mtr);
5312 		} else if (dict_index_is_spatial(index)) {
5313 			/* For an rtree, if we delete the leftmost node
5314 			pointer, we need to update the parent page. */
5315 			rtr_mbr_t	father_mbr;
5316 			rec_t*		father_rec;
5317 			btr_cur_t	father_cursor;
5318 			ulint*		offsets;
5319 			bool		upd_ret;
5320 			ulint		len;
5321 
5322 			rtr_page_get_father_block(NULL, heap, index,
5323 						  block, mtr, NULL,
5324 						  &father_cursor);
5325 			offsets = rec_get_offsets(
5326 				btr_cur_get_rec(&father_cursor), index,
5327 				NULL, ULINT_UNDEFINED, &heap);
5328 
5329 			father_rec = btr_cur_get_rec(&father_cursor);
5330 			rtr_read_mbr(rec_get_nth_field(
5331 				father_rec, offsets, 0, &len), &father_mbr);
5332 
5333 			upd_ret = rtr_update_mbr_field(&father_cursor, offsets,
5334 						       NULL, page, &father_mbr,
5335 						       next_rec, mtr);
5336 
5337 			if (!upd_ret) {
5338 				*err = DB_ERROR;
5339 
5340 				mem_heap_free(heap);
5341 				return(FALSE);
5342 			}
5343 
5344 			ut_d(parent_latched = true);
5345 		} else {
5346 			/* Otherwise, if we delete the leftmost node pointer
5347 			on a page, we have to change the parent node pointer
5348 			so that it is equal to the new leftmost node pointer
5349 			on the page */
5350 
5351 			btr_node_ptr_delete(index, block, mtr);
5352 
5353 			dtuple_t*	node_ptr = dict_index_build_node_ptr(
5354 				index, next_rec, block->page.id.page_no(),
5355 				heap, level);
5356 
5357 			btr_insert_on_non_leaf_level(
5358 				flags, index, level + 1, node_ptr, mtr);
5359 
5360 			ut_d(parent_latched = true);
5361 		}
5362 	}
5363 
5364 	btr_search_update_hash_on_delete(cursor);
5365 
5366 	if (page_is_leaf(page) || dict_index_is_spatial(index)) {
5367 		/* Set allow_merge to true for spatial indexes, as the tree
5368 		is X-locked in case of a delete operation on a spatial index,
5369 		thus avoiding the possibility of upward locking. */
5370 		allow_merge = true;
5371 	} else {
5372 		allow_merge = btr_cur_will_modify_tree(index, page,
5373 			BTR_INTENTION_DELETE, rec, rec_size_est, page_size, mtr);
5374 	}
5375 	page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
5376 #ifdef UNIV_ZIP_DEBUG
5377 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5378 #endif /* UNIV_ZIP_DEBUG */
5379 
5380 	/* btr_check_node_ptr() needs parent block latched */
5381 	ut_ad(!parent_latched || btr_check_node_ptr(index, block, mtr));
5382 
5383 return_after_reservations:
5384 	*err = DB_SUCCESS;
5385 
5386 	mem_heap_free(heap);
5387 
5388 	if (!ret) {
5389 		bool do_merge = btr_cur_compress_recommendation(cursor,mtr);
5390 		/* We are not allowed to merge, because the appropriate locks
5391 		were not taken while positioning the cursor. */
5392 		if (!allow_merge && do_merge) {
5393 			ib::info() << "Ignoring merge recommendation for page "
5394 				<< page_get_page_no(page)
5395 				<< " of index " << index->name
5396 				<< " as we could not predict it early.";
5397 			ut_ad(false);
5398 		} else if (do_merge) {
5399 
5400 			ret = btr_cur_compress_if_useful(cursor, FALSE, mtr);
5401 		}
5402 	}
5403 
5404 	if (!srv_read_only_mode
5405 	    && page_is_leaf(page)
5406 	    && !dict_index_is_online_ddl(index)) {
5407 
5408 		mtr_memo_release(mtr, dict_index_get_lock(index),
5409 				 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
5410 
5411 		/* NOTE: We cannot release the root block latch here, because it
5412 		contains the segment header and has already been modified in most cases. */
5413 	}
5414 
5415 	if (n_reserved > 0) {
5416 		fil_space_release_free_extents(index->space, n_reserved);
5417 	}
5418 
5419 	return(ret);
5420 }
5421 
5422 /*******************************************************************//**
5423 Adds path information to the cursor for the current page, for which
5424 the binary search has been performed. */
5425 static
5426 void
5427 btr_cur_add_path_info(
5428 /*==================*/
5429 	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
5430 	ulint		height,		/*!< in: height of the page in tree;
5431 					0 means leaf node */
5432 	ulint		root_height)	/*!< in: root node height in tree */
5433 {
5434 	btr_path_t*	slot;
5435 	const rec_t*	rec;
5436 	const page_t*	page;
5437 
5438 	ut_a(cursor->path_arr);
5439 
5440 	if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
5441 		/* Do nothing; return empty path */
5442 
5443 		slot = cursor->path_arr;
5444 		slot->nth_rec = ULINT_UNDEFINED;
5445 
5446 		return;
5447 	}
5448 
5449 	if (height == 0) {
5450 		/* Mark end of slots for path */
5451 		slot = cursor->path_arr + root_height + 1;
5452 		slot->nth_rec = ULINT_UNDEFINED;
5453 	}
5454 
5455 	rec = btr_cur_get_rec(cursor);
5456 
5457 	slot = cursor->path_arr + (root_height - height);
5458 
5459 	page = page_align(rec);
5460 
5461 	slot->nth_rec = page_rec_get_n_recs_before(rec);
5462 	slot->n_recs = page_get_n_recs(page);
5463 	slot->page_no = page_get_page_no(page);
5464 	slot->page_level = btr_page_get_level_low(page);
5465 }
5466 
5467 /*******************************************************************//**
5468 Estimate the number of rows between slot1 and slot2 for any level on a
5469 B-tree. This function starts from slot1->page and reads a few pages to
5470 the right, counting their records. If we reach slot2->page quickly then
5471 we know exactly how many records there are between slot1 and slot2 and
5472 we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
5473 then we calculate the average number of records in the pages scanned
5474 so far and assume that all pages that we did not scan up to slot2->page
5475 contain the same number of records, then we multiply that average by
5476 the number of pages between slot1->page and slot2->page (which is
5477 n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
5478 @return number of rows, not including the borders (exact or estimated) */
5479 static
5480 int64_t
5481 btr_estimate_n_rows_in_range_on_level(
5482 /*==================================*/
5483 	dict_index_t*	index,			/*!< in: index */
5484 	btr_path_t*	slot1,			/*!< in: left border */
5485 	btr_path_t*	slot2,			/*!< in: right border */
5486 	int64_t		n_rows_on_prev_level,	/*!< in: number of rows
5487 						on the previous level for the
5488 						same descend paths; used to
5489 						determine the number of pages
5490 						on this level */
5491 	ibool*		is_n_rows_exact)	/*!< out: TRUE if the returned
5492 						value is exact i.e. not an
5493 						estimation */
5494 {
5495 	int64_t		n_rows;
5496 	ulint		n_pages_read;
5497 	ulint		level;
5498 
5499 	n_rows = 0;
5500 	n_pages_read = 0;
5501 
5502 	/* Assume by default that we will scan all pages between
5503 	slot1->page_no and slot2->page_no. */
5504 	*is_n_rows_exact = TRUE;
5505 
5506 	/* Add records from slot1->page_no which are to the right of
5507 	the record which serves as a left border of the range, if any
5508 	(we don't include the record itself in this count). */
5509 	if (slot1->nth_rec <= slot1->n_recs) {
5510 		n_rows += slot1->n_recs - slot1->nth_rec;
5511 	}
5512 
5513 	/* Add records from slot2->page_no which are to the left of
5514 	the record which serves as a right border of the range, if any
5515 	(we don't include the record itself in this count). */
5516 	if (slot2->nth_rec > 1) {
5517 		n_rows += slot2->nth_rec - 1;
5518 	}
5519 
5520 	/* Count the records in the pages between slot1->page_no and
5521 	slot2->page_no (non inclusive), if any. */
5522 
5523 	/* Do not read more than this number of pages in order not to hurt
5524 	performance with this code which is just an estimation. If we read
5525 	this many pages before reaching slot2->page_no then we estimate the
5526 	average from the pages scanned so far. */
5527 #	define N_PAGES_READ_LIMIT	10
5528 
5529 	page_id_t		page_id(
5530 		dict_index_get_space(index), slot1->page_no);
5531 	const fil_space_t*	space = fil_space_get(index->space);
5532 	ut_ad(space);
5533 	const page_size_t	page_size(space->flags);
5534 
5535 	level = slot1->page_level;
5536 
5537 	do {
5538 		mtr_t		mtr;
5539 		page_t*		page;
5540 		buf_block_t*	block;
5541 
5542 		mtr_start(&mtr);
5543 
5544 		/* Fetch the page. Because we are not holding the
5545 		index->lock, the tree may have changed and we may be
5546 		attempting to read a page that is no longer part of
5547 		the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
5548 		silence a debug assertion about this. */
5549 		block = buf_page_get_gen(page_id, page_size, RW_S_LATCH,
5550 					 NULL, BUF_GET_POSSIBLY_FREED,
5551 					 __FILE__, __LINE__, &mtr);
5552 
5553 		page = buf_block_get_frame(block);
5554 
5555 		/* It is possible that the tree has been reorganized in the
5556 		meantime and this is a different page. If this happens the
5557 		calculated estimate will be bogus, which is not fatal as
5558 		this is only an estimate. We are sure that a page with
5559 		page_no exists because InnoDB never frees pages, only
5560 		reuses them. */
5561 		if (!fil_page_index_page_check(page)
5562 		    || btr_page_get_index_id(page) != index->id
5563 		    || btr_page_get_level_low(page) != level) {
5564 
5565 			/* The page got reused for something else */
5566 			mtr_commit(&mtr);
5567 			goto inexact;
5568 		}
5569 
5570 		/* It is possible but highly unlikely that the page was
5571 		originally written by an old version of InnoDB that did
5572 		not initialize FIL_PAGE_TYPE on other than B-tree pages.
5573 		For example, this could be an almost-empty BLOB page
5574 		that happens to contain the magic values in the fields
5575 		that we checked above. */
5576 
5577 		n_pages_read++;
5578 
5579 		if (page_id.page_no() != slot1->page_no) {
5580 			/* Do not count the records on slot1->page_no,
5581 			we already counted them before this loop. */
5582 			n_rows += page_get_n_recs(page);
5583 		}
5584 
5585 		page_id.set_page_no(btr_page_get_next(page, &mtr));
5586 
5587 		mtr_commit(&mtr);
5588 
5589 		if (n_pages_read == N_PAGES_READ_LIMIT
5590 		    || page_id.page_no() == FIL_NULL) {
5591 			/* Either we read too many pages or
5592 			we reached the end of the level without passing
5593 			through slot2->page_no, the tree must have changed
5594 			in the meantime */
5595 			goto inexact;
5596 		}
5597 
5598 	} while (page_id.page_no() != slot2->page_no);
5599 
5600 	return(n_rows);
5601 
5602 inexact:
5603 
5604 	*is_n_rows_exact = FALSE;
5605 
5606 	/* We interrupted the scan before reaching slot2->page */
5607 
5608 	if (n_pages_read > 0) {
5609 		/* The number of pages on this level is
5610 		n_rows_on_prev_level, multiply it by the
5611 		average number of recs per page so far */
5612 		n_rows = n_rows_on_prev_level
5613 			* n_rows / n_pages_read;
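		/* A sketch of the arithmetic: if n_rows_on_prev_level == 40
		(so roughly 40 pages on this level) and the 3 pages we did
		read held 330 records in total, the estimate becomes
		40 * 330 / 3 = 4400 rows. */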
5614 	} else {
5615 		/* The tree changed before we could even
5616 		start with slot1->page_no */
5617 		n_rows = 10;
5618 	}
5619 
5620 	return(n_rows);
5621 }
5622 
5623 /** If the tree gets changed too much between the two dives for the left
5624 and right boundary then btr_estimate_n_rows_in_range_low() will retry
5625 that many times before giving up and returning the value stored in
5626 rows_in_range_arbitrary_ret_val. */
5627 static const unsigned	rows_in_range_max_retries = 4;
5628 
5629 /** We pretend that a range has that many records if the tree keeps changing
5630 for rows_in_range_max_retries retries while we try to estimate the records
5631 in a given range. */
5632 static const int64_t	rows_in_range_arbitrary_ret_val = 10;
5633 
5634 /** Estimates the number of rows in a given index range.
5635 @param[in]	index		index
5636 @param[in]	tuple1		range start, may also be empty tuple
5637 @param[in]	mode1		search mode for range start
5638 @param[in]	tuple2		range end, may also be empty tuple
5639 @param[in]	mode2		search mode for range end
5640 @param[in]	nth_attempt	if the tree gets modified too much while
5641 we are trying to analyze it, then we will retry (this function will call
5642 itself, incrementing this parameter)
5643 @return estimated number of rows; if after rows_in_range_max_retries
5644 retries the tree keeps changing, then we will just return
5645 rows_in_range_arbitrary_ret_val as a result (if
5646 nth_attempt >= rows_in_range_max_retries and the tree is modified between
5647 the two dives). */
5648 static
5649 int64_t
5650 btr_estimate_n_rows_in_range_low(
5651 	dict_index_t*	index,
5652 	const dtuple_t*	tuple1,
5653 	page_cur_mode_t	mode1,
5654 	const dtuple_t*	tuple2,
5655 	page_cur_mode_t	mode2,
5656 	unsigned	nth_attempt)
5657 {
5658 	btr_path_t	path1[BTR_PATH_ARRAY_N_SLOTS];
5659 	btr_path_t	path2[BTR_PATH_ARRAY_N_SLOTS];
5660 	btr_cur_t	cursor;
5661 	btr_path_t*	slot1;
5662 	btr_path_t*	slot2;
5663 	ibool		diverged;
5664 	ibool		diverged_lot;
5665 	ulint		divergence_level;
5666 	int64_t		n_rows;
5667 	ibool		is_n_rows_exact;
5668 	ulint		i;
5669 	mtr_t		mtr;
5670 	int64_t		table_n_rows;
5671 
5672 	table_n_rows = dict_table_get_n_rows(index->table);
5673 
5674 	/* Below we dive to the two records specified by tuple1 and tuple2 and
5675 	we remember the entire dive paths from the tree root. The place where
5676 	the tuple1 path ends on the leaf level we call "left border" of our
5677 	interval and the place where the tuple2 path ends on the leaf level -
5678 	"right border". We take care to either include or exclude the interval
5679 	boundaries depending on whether <, <=, > or >= was specified. For
5680 	example if "5 < x AND x <= 10" then we should not include the left
5681 	boundary, but should include the right one. */
5682 
5683 	mtr_start(&mtr);
5684 
5685 	cursor.path_arr = path1;
5686 
5687 	bool	should_count_the_left_border;
5688 
5689 	if (dtuple_get_n_fields(tuple1) > 0) {
5690 
5691 		btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
5692 					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
5693 					    &cursor, 0,
5694 					    __FILE__, __LINE__, &mtr);
5695 
5696 		ut_ad(!page_rec_is_infimum(btr_cur_get_rec(&cursor)));
5697 
5698 		/* We should count the border if there are any records to
5699 		match the criteria, i.e. if the maximum record on the tree is
5700 		5 and x > 3 is specified then the cursor will be positioned at
5701 		5 and we should count the border, but if x > 7 is specified,
5702 		then the cursor will be positioned at 'sup' on the rightmost
5703 		leaf page in the tree and we should not count the border. */
5704 		should_count_the_left_border
5705 			= !page_rec_is_supremum(btr_cur_get_rec(&cursor));
5706 	} else {
5707 		btr_cur_open_at_index_side(true, index,
5708 					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
5709 					   &cursor, 0, &mtr);
5710 
5711 		ut_ad(page_rec_is_infimum(btr_cur_get_rec(&cursor)));
5712 
5713 		/* The range specified is without a left border, just
5714 		'x < 123' or 'x <= 123' and btr_cur_open_at_index_side()
5715 		positioned the cursor on the infimum record on the leftmost
5716 		page, which must not be counted. */
5717 		should_count_the_left_border = false;
5718 	}
5719 
5720 	mtr_commit(&mtr);
5721 
5722 	mtr_start(&mtr);
5723 
5724 	cursor.path_arr = path2;
5725 
5726 	bool	should_count_the_right_border;
5727 
5728 	if (dtuple_get_n_fields(tuple2) > 0) {
5729 
5730 		btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
5731 					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
5732 					    &cursor, 0,
5733 					    __FILE__, __LINE__, &mtr);
5734 
5735 		const rec_t*	rec = btr_cur_get_rec(&cursor);
5736 
5737 		ut_ad(!(mode2 == PAGE_CUR_L && page_rec_is_supremum(rec)));
5738 
5739 		should_count_the_right_border
5740 			= (mode2 == PAGE_CUR_LE /* if the range is '<=' */
5741 			   /* and the record was found */
5742 			   && cursor.low_match >= dtuple_get_n_fields(tuple2))
5743 			|| (mode2 == PAGE_CUR_L /* or if the range is '<' */
5744 			    /* and there are any records to match the criteria,
5745 			    i.e. if the minimum record on the tree is 5 and
5746 			    x < 7 is specified then the cursor will be
5747 			    positioned at 5 and we should count the border, but
5748 			    if x < 2 is specified, then the cursor will be
5749 			    positioned at 'inf' and we should not count the
5750 			    border */
5751 			    && !page_rec_is_infimum(rec));
5752 		/* Notice that for "WHERE col <= 'foo'" MySQL passes to
5753 		ha_innobase::records_in_range():
5754 		min_key=NULL (left-unbounded) which is expected
5755 		max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is
5756 		unexpected - one would expect
5757 		flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In this case the
5758 		cursor will be positioned on the first record to the right of
5759 		the requested one (can also be positioned on the 'sup') and
5760 		we should not count the right border. */
5761 	} else {
5762 		btr_cur_open_at_index_side(false, index,
5763 					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
5764 					   &cursor, 0, &mtr);
5765 
5766 		ut_ad(page_rec_is_supremum(btr_cur_get_rec(&cursor)));
5767 
5768 		/* The range specified is without a right border, just
5769 		'x > 123' or 'x >= 123' and btr_cur_open_at_index_side()
5770 		positioned the cursor on the supremum record on the rightmost
5771 		page, which must not be counted. */
5772 		should_count_the_right_border = false;
5773 	}
5774 
5775 	mtr_commit(&mtr);
5776 
5777 	/* We have the path information for the range in path1 and path2 */
5778 
5779 	n_rows = 0;
5780 	is_n_rows_exact = TRUE;
5781 
5782 	/* This becomes true when the two paths do not pass through the
5783 	same pages anymore. */
5784 	diverged = FALSE;
5785 
5786 	/* This becomes true when the paths are no longer the same or even
5787 	adjacent; until then, they pass through the same or
5788 	neighboring-on-the-same-level pages only. */
5789 	diverged_lot = FALSE;
5790 
5791 	/* This is the level where paths diverged a lot. */
5792 	divergence_level = 1000000;
5793 
5794 	for (i = 0; ; i++) {
5795 		ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
5796 
5797 		slot1 = path1 + i;
5798 		slot2 = path2 + i;
5799 
5800 		if (slot1->nth_rec == ULINT_UNDEFINED
5801 		    || slot2->nth_rec == ULINT_UNDEFINED) {
5802 
5803 			/* Here none of the borders were counted. For example,
5804 			if on the leaf level we descended to:
5805 			(inf, a, b, c, d, e, f, sup)
5806 			         ^        ^
5807 			       path1    path2
5808 			then n_rows will be 2 (c and d). */
5809 
5810 			if (is_n_rows_exact) {
5811 				/* Only fiddle to adjust this off-by-one
5812 				if the number is exact, otherwise we do
5813 				much grosser adjustments below. */
5814 
5815 				btr_path_t*	last1 = &path1[i - 1];
5816 				btr_path_t*	last2 = &path2[i - 1];
5817 
5818 				/* If both paths end up on the same record on
5819 				the leaf level. */
5820 				if (last1->page_no == last2->page_no
5821 				    && last1->nth_rec == last2->nth_rec) {
5822 
5823 					/* n_rows can be > 0 here if the paths
5824 					were first different and then converged
5825 					to the same record on the leaf level.
5826 					For example:
5827 					SELECT ... LIKE 'wait/synch/rwlock%'
5828 					mode1=PAGE_CUR_GE,
5829 					tuple1="wait/synch/rwlock"
5830 					path1[0]={nth_rec=58, n_recs=58,
5831 						  page_no=3, page_level=1}
5832 					path1[1]={nth_rec=56, n_recs=55,
5833 						  page_no=119, page_level=0}
5834 
5835 					mode2=PAGE_CUR_G
5836 					tuple2="wait/synch/rwlock"
5837 					path2[0]={nth_rec=57, n_recs=57,
5838 						  page_no=3, page_level=1}
5839 					path2[1]={nth_rec=56, n_recs=55,
5840 						  page_no=119, page_level=0} */
5841 
5842 					/* If the range is such that we should
5843 					count both borders, then avoid
5844 					counting that record twice - once as a
5845 					left border and once as a right
5846 					border. */
5847 					if (should_count_the_left_border
5848 					    && should_count_the_right_border) {
5849 
5850 						n_rows = 1;
5851 					} else {
5852 						/* Some of the borders should
5853 						not be counted, e.g. [3,3). */
5854 						n_rows = 0;
5855 					}
5856 				} else {
5857 					if (should_count_the_left_border) {
5858 						n_rows++;
5859 					}
5860 
5861 					if (should_count_the_right_border) {
5862 						n_rows++;
5863 					}
5864 				}
5865 			}
5866 
5867 			if (i > divergence_level + 1 && !is_n_rows_exact) {
5868 				/* In trees whose height is > 1 our algorithm
5869 				tends to underestimate: multiply the estimate
5870 				by 2: */
5871 
5872 				n_rows = n_rows * 2;
5873 			}
5874 
5875 			DBUG_EXECUTE_IF("bug14007649", return(n_rows););
5876 
5877 			/* Do not estimate the number of rows in the range
5878 			to over 1 / 2 of the estimated rows in the whole
5879 			table */
5880 
5881 			if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {
5882 
5883 				n_rows = table_n_rows / 2;
5884 
5885 				/* If there are just 0 or 1 rows in the table,
5886 				then we estimate all rows are in the range */
5887 
5888 				if (n_rows == 0) {
5889 					n_rows = table_n_rows;
5890 				}
5891 			}
5892 
5893 			return(n_rows);
5894 		}
5895 
5896 		if (!diverged && slot1->nth_rec != slot2->nth_rec) {
5897 
5898 			/* If both slots do not point to the same page,
5899 			this means that the tree must have changed between
5900 			the dive for slot1 and the dive for slot2 at the
5901 			beginning of this function. */
5902 			if (slot1->page_no != slot2->page_no
5903 			    || slot1->page_level != slot2->page_level) {
5904 
5905 				/* If the tree keeps changing even after a
5906 				few attempts, then just return some arbitrary
5907 				number. */
5908 				if (nth_attempt >= rows_in_range_max_retries) {
5909 					return(rows_in_range_arbitrary_ret_val);
5910 				}
5911 
5912 				const int64_t	ret =
5913 					btr_estimate_n_rows_in_range_low(
5914 						index, tuple1, mode1,
5915 						tuple2, mode2, nth_attempt + 1);
5916 
5917 				return(ret);
5918 			}
5919 
5920 			diverged = TRUE;
5921 
5922 			if (slot1->nth_rec < slot2->nth_rec) {
5923 				/* We do not count the borders (nor the left
5924 				nor the right one), thus "- 1". */
5925 				n_rows = slot2->nth_rec - slot1->nth_rec - 1;
5926 
5927 				if (n_rows > 0) {
5928 					/* There is at least one row between
5929 					the two borders pointed to by slot1
5930 					and slot2, so on the level below the
5931 					slots will point to non-adjacent
5932 					pages. */
5933 					diverged_lot = TRUE;
5934 					divergence_level = i;
5935 				}
5936 			} else {
5937 				/* It is possible that
5938 				slot1->nth_rec >= slot2->nth_rec
5939 				if, for example, we have a single page
5940 				tree which contains (inf, 5, 6, supr)
5941 				and we select where x > 20 and x < 30;
5942 				in this case slot1->nth_rec will point
5943 				to the supr record and slot2->nth_rec
5944 				will point to 6. */
5945 				n_rows = 0;
5946 				should_count_the_left_border = false;
5947 				should_count_the_right_border = false;
5948 			}
5949 
5950 		} else if (diverged && !diverged_lot) {
5951 
5952 			if (slot1->nth_rec < slot1->n_recs
5953 			    || slot2->nth_rec > 1) {
5954 
5955 				diverged_lot = TRUE;
5956 				divergence_level = i;
5957 
5958 				n_rows = 0;
5959 
5960 				if (slot1->nth_rec < slot1->n_recs) {
5961 					n_rows += slot1->n_recs
5962 						- slot1->nth_rec;
5963 				}
5964 
5965 				if (slot2->nth_rec > 1) {
5966 					n_rows += slot2->nth_rec - 1;
5967 				}
5968 			}
5969 		} else if (diverged_lot) {
5970 
5971 			n_rows = btr_estimate_n_rows_in_range_on_level(
5972 				index, slot1, slot2, n_rows,
5973 				&is_n_rows_exact);
5974 		}
5975 	}
5976 }
5977 
5978 /** Estimates the number of rows in a given index range.
5979 @param[in]	index	index
5980 @param[in]	tuple1	range start, may also be empty tuple
5981 @param[in]	mode1	search mode for range start
5982 @param[in]	tuple2	range end, may also be empty tuple
5983 @param[in]	mode2	search mode for range end
5984 @return estimated number of rows */
5985 int64_t
5986 btr_estimate_n_rows_in_range(
5987 	dict_index_t*	index,
5988 	const dtuple_t*	tuple1,
5989 	page_cur_mode_t	mode1,
5990 	const dtuple_t*	tuple2,
5991 	page_cur_mode_t	mode2)
5992 {
5993 	const int64_t	ret = btr_estimate_n_rows_in_range_low(
5994 		index, tuple1, mode1, tuple2, mode2, 1 /* first attempt */);
5995 
5996 	return(ret);
5997 }
5998 
5999 /*******************************************************************//**
6000 Record the number of non_null key values in a given index for
6001 each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6002 The estimates are eventually stored in the array:
6003 index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
6004 static
6005 void
6006 btr_record_not_null_field_in_rec(
6007 /*=============================*/
6008 	ulint		n_unique,	/*!< in: dict_index_get_n_unique(index),
6009 					number of columns that uniquely
6010 					determine an index entry */
6011 	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
6012 					its size could be for all fields or
6013 					that of "n_unique" */
6014 	ib_uint64_t*	n_not_null)	/*!< in/out: array to record number of
6015 					not null rows for n-column prefix */
6016 {
6017 	ulint	i;
6018 
6019 	ut_ad(rec_offs_n_fields(offsets) >= n_unique);
6020 
6021 	if (n_not_null == NULL) {
6022 		return;
6023 	}
6024 
6025 	for (i = 0; i < n_unique; i++) {
6026 		if (rec_offs_nth_sql_null(offsets, i)) {
6027 			break;
6028 		}
6029 
6030 		n_not_null[i]++;
6031 	}
6032 }
6033 
6034 /*******************************************************************//**
6035 Estimates the number of different key values in a given index, for
6036 each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6037 The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
6038 0..n_uniq-1) and the number of pages that were sampled is saved in
6039 index->stat_n_sample_sizes[].
6040 If innodb_stats_method is nulls_ignored, we also record the number of
6041 non-null values for each prefix and store the estimates in the
6042 array index->stat_n_non_null_key_vals.
6043 @return true if the index is available and we get the estimated numbers,
6044 false if the index is unavailable. */
6045 bool
6046 btr_estimate_number_of_different_key_vals(
6047 /*======================================*/
6048 	dict_index_t*	index)	/*!< in: index */
6049 {
6050 	btr_cur_t	cursor;
6051 	page_t*		page;
6052 	rec_t*		rec;
6053 	ulint		n_cols;
6054 	ib_uint64_t*	n_diff;
6055 	ib_uint64_t*	n_not_null;
6056 	ibool		stats_null_not_equal;
6057 	uintmax_t	n_sample_pages; /* number of pages to sample */
6058 	ulint		not_empty_flag	= 0;
6059 	ulint		total_external_size = 0;
6060 	ulint		i;
6061 	ulint		j;
6062 	uintmax_t	add_on;
6063 	mtr_t		mtr;
6064 	mem_heap_t*	heap		= NULL;
6065 	ulint*		offsets_rec	= NULL;
6066 	ulint*		offsets_next_rec = NULL;
6067 
6068 	/* For a spatial index, no such stats can be
6069 	fetched. */
6070 	if (dict_index_is_spatial(index)) {
6071 		return(false);
6072 	}
6073 
6074 	n_cols = dict_index_get_n_unique(index);
6075 
6076 	heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
6077 			       * n_cols
6078 			       + dict_index_get_n_fields(index)
6079 			       * (sizeof *offsets_rec
6080 				  + sizeof *offsets_next_rec));
6081 
6082 	n_diff = (ib_uint64_t*) mem_heap_zalloc(
6083 		heap, n_cols * sizeof(n_diff[0]));
6084 
6085 	n_not_null = NULL;
6086 
6087 	/* Check srv_innodb_stats_method setting, and decide whether we
6088 	need to record non-null value and also decide if NULL is
6089 	considered equal (by setting stats_null_not_equal value) */
6090 	switch (srv_innodb_stats_method) {
6091 	case SRV_STATS_NULLS_IGNORED:
6092 		n_not_null = (ib_uint64_t*) mem_heap_zalloc(
6093 			heap, n_cols * sizeof *n_not_null);
6094 		/* fall through */
6095 
6096 	case SRV_STATS_NULLS_UNEQUAL:
6097 		/* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
6098 		case, we will treat NULLs as unequal value */
6099 		stats_null_not_equal = TRUE;
6100 		break;
6101 
6102 	case SRV_STATS_NULLS_EQUAL:
6103 		stats_null_not_equal = FALSE;
6104 		break;
6105 
6106 	default:
6107 		ut_error;
6108 	}
6109 
6110 	/* It makes no sense to test more pages than are contained
6111 	in the index, thus we lower the number if it is too high */
6112 	if (srv_stats_transient_sample_pages > index->stat_index_size) {
6113 		if (index->stat_index_size > 0) {
6114 			n_sample_pages = index->stat_index_size;
6115 		} else {
6116 			n_sample_pages = 1;
6117 		}
6118 	} else {
6119 		n_sample_pages = srv_stats_transient_sample_pages;
6120 	}
6121 
6122 	/* We sample some pages in the index to get an estimate */
6123 
6124 	for (i = 0; i < n_sample_pages; i++) {
6125 		mtr_start(&mtr);
6126 
6127 		bool	available;
6128 
6129 		available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
6130 						    &cursor, &mtr);
6131 
6132 		if (!available) {
6133 			mtr_commit(&mtr);
6134 			mem_heap_free(heap);
6135 
6136 			return(false);
6137 		}
6138 
6139 		/* Count the number of different key values for each prefix of
6140 		the key on this index page. If the prefix does not determine
6141 		the index record uniquely in the B-tree, then we subtract one
6142 		because otherwise our algorithm would give a wrong estimate
6143 		for an index where there is just one key value. */
6144 
6145 		page = btr_cur_get_page(&cursor);
6146 
6147 		rec = page_rec_get_next(page_get_infimum_rec(page));
6148 
6149 		if (!page_rec_is_supremum(rec)) {
6150 			not_empty_flag = 1;
6151 			offsets_rec = rec_get_offsets(rec, index, offsets_rec,
6152 						      ULINT_UNDEFINED, &heap);
6153 
6154 			if (n_not_null != NULL) {
6155 				btr_record_not_null_field_in_rec(
6156 					n_cols, offsets_rec, n_not_null);
6157 			}
6158 		}
6159 
6160 		while (!page_rec_is_supremum(rec)) {
6161 			ulint	matched_fields;
6162 			rec_t*	next_rec = page_rec_get_next(rec);
6163 			if (page_rec_is_supremum(next_rec)) {
6164 				total_external_size +=
6165 					btr_rec_get_externally_stored_len(
6166 						rec, offsets_rec);
6167 				break;
6168 			}
6169 
6170 			offsets_next_rec = rec_get_offsets(next_rec, index,
6171 							   offsets_next_rec,
6172 							   ULINT_UNDEFINED,
6173 							   &heap);
6174 
6175 			cmp_rec_rec_with_match(rec, next_rec,
6176 					       offsets_rec, offsets_next_rec,
6177 					       index,
6178 					       page_is_spatial_non_leaf(next_rec, index),
6179 					       stats_null_not_equal,
6180 					       &matched_fields);
6181 
6182 			for (j = matched_fields; j < n_cols; j++) {
6183 				/* We add one if this index record has
6184 				a different prefix from the previous */
6185 
6186 				n_diff[j]++;
6187 			}
6188 
6189 			if (n_not_null != NULL) {
6190 				btr_record_not_null_field_in_rec(
6191 					n_cols, offsets_next_rec, n_not_null);
6192 			}
6193 
6194 			total_external_size
6195 				+= btr_rec_get_externally_stored_len(
6196 					rec, offsets_rec);
6197 
6198 			rec = next_rec;
6199 			/* Initialize offsets_rec for the next round
6200 			and assign the old offsets_rec buffer to
6201 			offsets_next_rec. */
6202 			{
6203 				ulint*	offsets_tmp = offsets_rec;
6204 				offsets_rec = offsets_next_rec;
6205 				offsets_next_rec = offsets_tmp;
6206 			}
6207 		}
6208 
6209 
6210 		if (n_cols == dict_index_get_n_unique_in_tree(index)) {
6211 
6212 			/* If there is more than one leaf page in the tree,
6213 			we add one because we know that the first record
6214 			on the page certainly had a different prefix than the
6215 			last record on the previous index page in the
6216 			alphabetical order. Before this fix, if there was
6217 			just one big record on each clustered index page, the
6218 			algorithm grossly underestimated the number of rows
6219 			in the table. */
6220 
6221 			if (btr_page_get_prev(page, &mtr) != FIL_NULL
6222 			    || btr_page_get_next(page, &mtr) != FIL_NULL) {
6223 
6224 				n_diff[n_cols - 1]++;
6225 			}
6226 		}
6227 
6228 		mtr_commit(&mtr);
6229 	}
6230 
6231 	/* If we saw k borders between different key values on
6232 	n_sample_pages leaf pages, we can estimate how many
6233 	there will be in index->stat_n_leaf_pages */
6234 
6235 	/* We must take into account that our sample actually represents
6236 	also the pages used for external storage of fields (those pages are
6237 	included in index->stat_n_leaf_pages) */
6238 
6239 	for (j = 0; j < n_cols; j++) {
6240 		index->stat_n_diff_key_vals[j]
6241 			= BTR_TABLE_STATS_FROM_SAMPLE(
6242 				n_diff[j], index, n_sample_pages,
6243 				total_external_size, not_empty_flag);
6244 
6245 		/* If the tree is small, smaller than
6246 		10 * n_sample_pages + total_external_size, then
6247 		the above estimate is ok. For bigger trees it is common that we
6248 		do not see any borders between key values in the few pages
6249 		we pick. But still there may be n_sample_pages
6250 		different key values, or even more. Let us try to approximate
6251 		that: */
6252 
6253 		add_on = index->stat_n_leaf_pages
6254 			/ (10 * (n_sample_pages
6255 				 + total_external_size));
6256 
6257 		if (add_on > n_sample_pages) {
6258 			add_on = n_sample_pages;
6259 		}
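		/* For illustration: with stat_n_leaf_pages == 10000,
		n_sample_pages == 20 and total_external_size == 0, the
		formula above gives add_on = 10000 / (10 * 20) = 50, which
		is then capped to n_sample_pages = 20. */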
6260 
6261 		index->stat_n_diff_key_vals[j] += add_on;
6262 
6263 		index->stat_n_sample_sizes[j] = n_sample_pages;
6264 
6265 		/* Update the stat_n_non_null_key_vals[] with our
6266 		sampled result. stat_n_non_null_key_vals[] is created
6267 		and initialized to zero in dict_index_add_to_cache(),
6268 		along with stat_n_diff_key_vals[] array */
6269 		if (n_not_null != NULL) {
6270 			index->stat_n_non_null_key_vals[j] =
6271 				 BTR_TABLE_STATS_FROM_SAMPLE(
6272 					n_not_null[j], index, n_sample_pages,
6273 					total_external_size, not_empty_flag);
6274 		}
6275 	}
6276 
6277 	mem_heap_free(heap);
6278 
6279 	return(true);
6280 }
6281 
6282 /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
6283 
6284 /***********************************************************//**
6285 Gets the offset of the pointer to the externally stored part of a field.
6286 @return offset of the pointer to the externally stored part */
6287 static
6288 ulint
6289 btr_rec_get_field_ref_offs(
6290 /*=======================*/
6291 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
6292 	ulint		n)	/*!< in: index of the external field */
6293 {
6294 	ulint	field_ref_offs;
6295 	ulint	local_len;
6296 
6297 	ut_a(rec_offs_nth_extern(offsets, n));
6298 	field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
6299 	ut_a(local_len != UNIV_SQL_NULL);
6300 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6301 
6302 	return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
6303 }
6304 
6305 /** Gets a pointer to the externally stored part of a field.
6306 @param rec record
6307 @param offsets rec_get_offsets(rec)
6308 @param n index of the externally stored field
6309 @return pointer to the externally stored part */
6310 #define btr_rec_get_field_ref(rec, offsets, n)			\
6311 	((rec) + btr_rec_get_field_ref_offs(offsets, n))
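/* For illustration: the pointer obtained this way addresses the
BTR_EXTERN_FIELD_REF_SIZE (20 byte) external reference stored at the end
of the locally stored prefix of field n, as computed by
btr_rec_get_field_ref_offs() above. */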
6312 
6313 /** Gets the externally stored size of a record, in units of a database page.
6314 @param[in]	rec	record
6315 @param[in]	offsets	array returned by rec_get_offsets()
6316 @return externally stored part, in units of a database page */
6317 ulint
6318 btr_rec_get_externally_stored_len(
6319 	const rec_t*	rec,
6320 	const ulint*	offsets)
6321 {
6322 	ulint	n_fields;
6323 	ulint	total_extern_len = 0;
6324 	ulint	i;
6325 
6326 	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6327 
6328 	if (!rec_offs_any_extern(offsets)) {
6329 		return(0);
6330 	}
6331 
6332 	n_fields = rec_offs_n_fields(offsets);
6333 
6334 	for (i = 0; i < n_fields; i++) {
6335 		if (rec_offs_nth_extern(offsets, i)) {
6336 
6337 			ulint	extern_len = mach_read_from_4(
6338 				btr_rec_get_field_ref(rec, offsets, i)
6339 				+ BTR_EXTERN_LEN + 4);
6340 
6341 			total_extern_len += ut_calc_align(extern_len,
6342 							  UNIV_PAGE_SIZE);
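			/* For example, assuming the default 16 KiB
			UNIV_PAGE_SIZE, an externally stored part of
			70000 bytes is aligned up to 81920 bytes here
			and counted as 5 pages by the final division
			below. */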
6343 		}
6344 	}
6345 
6346 	return(total_extern_len / UNIV_PAGE_SIZE);
6347 }
6348 
6349 /*******************************************************************//**
6350 Sets the ownership bit of an externally stored field in a record. */
6351 static
6352 void
6353 btr_cur_set_ownership_of_extern_field(
6354 /*==================================*/
6355 	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
6356 				part will be updated, or NULL */
6357 	rec_t*		rec,	/*!< in/out: clustered index record */
6358 	dict_index_t*	index,	/*!< in: index of the page */
6359 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
6360 	ulint		i,	/*!< in: field number */
6361 	ibool		val,	/*!< in: value to set */
6362 	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
6363 {
6364 	byte*	data;
6365 	ulint	local_len;
6366 	ulint	byte_val;
6367 
6368 	data = rec_get_nth_field(rec, offsets, i, &local_len);
6369 	ut_ad(rec_offs_nth_extern(offsets, i));
6370 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6371 
6372 	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
6373 
6374 	byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
6375 
6376 	if (val) {
6377 		byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
6378 	} else {
6379 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6380 		ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
6381 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6382 		byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
6383 	}
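	/* Note that a set BTR_EXTERN_OWNER_FLAG means that the record does
	NOT own the externally stored field; this is why the flag is cleared
	above when val is TRUE and set when val is FALSE. */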
6384 
6385 	if (page_zip) {
6386 		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
6387 		page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
6388 	} else if (mtr != NULL) {
6389 
6390 		mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
6391 				 MLOG_1BYTE, mtr);
6392 	} else {
6393 		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
6394 	}
6395 }
6396 
6397 /*******************************************************************//**
6398 Marks non-updated off-page fields as disowned by this record. The ownership
6399 must be transferred to the updated record which is inserted elsewhere in the
6400 index tree. In purge, only the owner of an externally stored field is allowed
6401 to free the field. */
6402 void
6403 btr_cur_disown_inherited_fields(
6404 /*============================*/
6405 	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
6406 				part will be updated, or NULL */
6407 	rec_t*		rec,	/*!< in/out: record in a clustered index */
6408 	dict_index_t*	index,	/*!< in: index of the page */
6409 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
6410 	const upd_t*	update,	/*!< in: update vector */
6411 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
6412 {
6413 	ulint	i;
6414 
6415 	ut_ad(rec_offs_validate(rec, index, offsets));
6416 	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6417 	ut_ad(rec_offs_any_extern(offsets));
6418 	ut_ad(mtr);
6419 
6420 	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
6421 		if (rec_offs_nth_extern(offsets, i)
6422 		    && !upd_get_field_by_field_no(update, i, false)) {
6423 			btr_cur_set_ownership_of_extern_field(
6424 				page_zip, rec, index, offsets, i, FALSE, mtr);
6425 		}
6426 	}
6427 }
6428 
6429 /*******************************************************************//**
6430 Marks all extern fields in a record as owned by the record. This function
6431 should be called if the delete mark of a record is removed: a record that
6432 is not delete-marked always owns all its extern fields. */
6433 static
6434 void
6435 btr_cur_unmark_extern_fields(
6436 /*=========================*/
6437 	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
6438 				part will be updated, or NULL */
6439 	rec_t*		rec,	/*!< in/out: record in a clustered index */
6440 	dict_index_t*	index,	/*!< in: index of the page */
6441 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
6442 	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
6443 {
6444 	ulint	n;
6445 	ulint	i;
6446 
6447 	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6448 	n = rec_offs_n_fields(offsets);
6449 
6450 	if (!rec_offs_any_extern(offsets)) {
6451 
6452 		return;
6453 	}
6454 
6455 	for (i = 0; i < n; i++) {
6456 		if (rec_offs_nth_extern(offsets, i)) {
6457 
6458 			btr_cur_set_ownership_of_extern_field(
6459 				page_zip, rec, index, offsets, i, TRUE, mtr);
6460 		}
6461 	}
6462 }
6463 
6464 /*******************************************************************//**
6465 Returns the length of a BLOB part stored on the header page.
6466 @return part length */
6467 static
6468 ulint
6469 btr_blob_get_part_len(
6470 /*==================*/
6471 	const byte*	blob_header)	/*!< in: blob header */
6472 {
6473 	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
6474 }
6475 
6476 /*******************************************************************//**
6477 Returns the page number where the next BLOB part is stored.
6478 @return page number or FIL_NULL if no more pages */
6479 static
6480 ulint
6481 btr_blob_get_next_page_no(
6482 /*======================*/
6483 	const byte*	blob_header)	/*!< in: blob header */
6484 {
6485 	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
6486 }
6487 
6488 /*******************************************************************//**
6489 Deallocate a buffer block that was reserved for a BLOB part. */
6490 static
6491 void
6492 btr_blob_free(
6493 /*==========*/
6494 	dict_index_t*	index,	/*!< in: index */
6495 	buf_block_t*	block,	/*!< in: buffer block */
6496 	ibool		all,	/*!< in: TRUE=remove also the compressed page
6497 				if there is one */
6498 	mtr_t*		mtr)	/*!< in: mini-transaction to commit */
6499 {
6500 	buf_pool_t*	buf_pool = buf_pool_from_block(block);
6501 	ulint		space = block->page.id.space();
6502 	ulint		page_no	= block->page.id.page_no();
6503 
6504 	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
6505 
6506 	mtr_commit(mtr);
6507 
6508 	buf_pool_mutex_enter(buf_pool);
6509 
6510 	/* Only free the block if it is still allocated to
6511 	the same file page. */
6512 
6513 	if (buf_block_get_state(block)
6514 	    == BUF_BLOCK_FILE_PAGE
6515 	    && block->page.id.space() == space
6516 	    && block->page.id.page_no() == page_no) {
6517 
6518 		if (!buf_LRU_free_page(&block->page, all)
6519 		    && all && block->page.zip.data) {
6520 			/* Attempt to deallocate the uncompressed page
6521 			if the whole block cannot be deallocated. */
6522 
6523 			buf_LRU_free_page(&block->page, false);
6524 		}
6525 	}
6526 
6527 	buf_pool_mutex_exit(buf_pool);
6528 }
6529 
6530 /** Helper class used while writing blob pages, during insert or update. */
6531 struct btr_blob_log_check_t {
6532 	/** Persistent cursor on a clustered index record with blobs. */
6533 	btr_pcur_t*	m_pcur;
6534 	/** Mini-transaction holding the latches for m_pcur */
6535 	mtr_t*		m_mtr;
6536 	/** rec_get_offsets(rec, index); offset of clust_rec */
6537 	const ulint*	m_offsets;
6538 	/** The block containing clustered record */
6539 	buf_block_t**	m_block;
6540 	/** The clustered record pointer */
6541 	rec_t**		m_rec;
6542 	/** The blob operation code */
6543 	enum blob_op	m_op;
6544 
6545 	/** Constructor
6546 	@param[in]	pcur		persistent cursor on a clustered
6547 					index record with blobs.
6548 	@param[in]	mtr		mini-transaction holding latches for
6549 					pcur.
6550 	@param[in]	offsets		offsets of the clust_rec
6551 	@param[in,out]	block		record block containing pcur record
6552 	@param[in,out]	rec		the clustered record pointer
6553 	@param[in]	op		the blob operation code */
6554 	btr_blob_log_check_t(
6555 		btr_pcur_t*	pcur,
6556 		mtr_t*		mtr,
6557 		const ulint*	offsets,
6558 		buf_block_t**	block,
6559 		rec_t**		rec,
6560 		enum blob_op	op)
6561 		: m_pcur(pcur),
6562 		  m_mtr(mtr),
6563 		  m_offsets(offsets),
6564 		  m_block(block),
6565 		  m_rec(rec),
6566 		  m_op(op)
6567 	{
6568 		ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
6569 		ut_ad((*m_block)->frame == page_align(*m_rec));
6570 		ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
6571 	}
6572 
6573 	/** Check if there is enough space in the log file. Commit and re-start
6574 	the mini-transaction. */
6575 	void check()
6576 	{
6577 		dict_index_t*	index = m_pcur->index();
6578 		ulint		offs = 0;
6579 		ulint		page_no = ULINT_UNDEFINED;
6580 		FlushObserver*	observer = m_mtr->get_flush_observer();
6581 
6582 		if (m_op == BTR_STORE_INSERT_BULK) {
6583 			offs = page_offset(*m_rec);
6584 			page_no = page_get_page_no(
6585 				buf_block_get_frame(*m_block));
6586 
6587 			buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__);
6588 		} else {
6589 			btr_pcur_store_position(m_pcur, m_mtr);
6590 		}
6591 		m_mtr->commit();
6592 
6593 		DEBUG_SYNC_C("blob_write_middle");
6594 
6595 		log_free_check();
6596 
6597 		DEBUG_SYNC_C("blob_write_middle_after_check");
6598 
6599 		const mtr_log_t log_mode = m_mtr->get_log_mode();
6600 		m_mtr->start();
6601 		m_mtr->set_log_mode(log_mode);
6602 		m_mtr->set_named_space(index->space);
6603 		m_mtr->set_flush_observer(observer);
6604 
6605 		if (m_op == BTR_STORE_INSERT_BULK) {
6606 			page_id_t       page_id(dict_index_get_space(index),
6607 						page_no);
6608 			page_size_t     page_size(dict_table_page_size(
6609 						index->table));
6610 			page_cur_t*	page_cur = &m_pcur->btr_cur.page_cur;
6611 
6612 			mtr_x_lock(dict_index_get_lock(index), m_mtr);
6613 			page_cur->block = btr_block_get(
6614 				page_id, page_size, RW_X_LATCH, index, m_mtr);
6615 			page_cur->rec = buf_block_get_frame(page_cur->block)
6616 				+ offs;
6617 
6618 			buf_block_buf_fix_dec(page_cur->block);
6619 		} else {
6620 			ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
6621 			bool ret = btr_pcur_restore_position(
6622 				BTR_MODIFY_LEAF | BTR_MODIFY_EXTERNAL,
6623 				m_pcur, m_mtr);
6624 
6625 			ut_a(ret);
6626 		}
6627 
6628 		*m_block	= btr_pcur_get_block(m_pcur);
6629 		*m_rec		= btr_pcur_get_rec(m_pcur);
6630 
6631 		ut_d(rec_offs_make_valid(
6632 			*m_rec, index, const_cast<ulint*>(m_offsets)));
6633 
6634 		ut_ad(m_mtr->memo_contains_page_flagged(
6635 		      *m_rec,
6636 		      MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)
6637 		      || dict_table_is_intrinsic(index->table));
6638 
6639 		ut_ad(mtr_memo_contains_flagged(m_mtr,
6640 		      dict_index_get_lock(index),
6641 		      MTR_MEMO_SX_LOCK | MTR_MEMO_X_LOCK)
6642 		      || dict_table_is_intrinsic(index->table));
6643 	}
6644 };
6645 
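/* Usage sketch: btr_store_big_rec_extern_fields() below constructs one
of these helpers and calls check() every few BLOB pages, so that the
clustered-index mini-transaction is committed and restarted before the
redo log can fill up.  After check() returns, rec_block and rec may
point to a different location, so anything derived from them has to be
re-read (see the commit_freq logic in that function):

	btr_blob_log_check_t	redo_log(pcur, btr_mtr, offsets,
					 &rec_block, &rec, op);
	...
	if (!(blob_npages % commit_freq)) {
		redo_log.check();
		// re-read field_ref, page_zip, rec_page_no from rec/rec_block
	}
*/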
6646 
6647 /*******************************************************************//**
6648 Stores the fields in big_rec_vec to the tablespace and puts pointers to
6649 them in rec.  The extern flags in rec will have to be set beforehand.
6650 The fields are stored on pages allocated from leaf node
6651 file segment of the index tree.
6652 
6653 TODO: If the allocation extends the tablespace, it will not be redo logged in
6654 any mini-transaction.  Tablespace extension should be redo-logged, so that
6655 recovery will not fail when the big_rec was written to the extended portion of
6656 the file, in case the file was somehow truncated in the crash.
6657 
6658 @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
6659 dberr_t
6660 btr_store_big_rec_extern_fields(
6661 /*============================*/
6662 	btr_pcur_t*	pcur,		/*!< in/out: a persistent cursor. if
6663 					btr_mtr is restarted, then this can
6664 					be repositioned. */
6665 	const upd_t*	upd,		/*!< in: update vector */
6666 	ulint*		offsets,	/*!< in/out: rec_get_offsets() on
6667 					pcur. the "external storage" flags
6668 					in offsets will correctly correspond
6669 					to rec when this function returns */
6670 	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
6671 					to be stored externally */
6672 	mtr_t*		btr_mtr,	/*!< in/out: mtr containing the
6673 					latches to the clustered index. can be
6674 					committed and restarted. */
6675 	enum blob_op	op)		/*!< in: operation code */
6676 {
6677 	ulint		rec_page_no;
6678 	byte*		field_ref;
6679 	ulint		extern_len;
6680 	ulint		store_len;
6681 	ulint		page_no;
6682 	ulint		space_id;
6683 	ulint		prev_page_no;
6684 	ulint		hint_page_no;
6685 	ulint		i;
6686 	mtr_t		mtr;
6687 	mtr_t		mtr_bulk;
6688 	mem_heap_t*	heap = NULL;
6689 	page_zip_des_t*	page_zip;
6690 	z_stream	c_stream;
6691 	dberr_t		error		= DB_SUCCESS;
6692 	dict_index_t*	index		= pcur->index();
6693 	buf_block_t*	rec_block	= btr_pcur_get_block(pcur);
6694 	rec_t*		rec		= btr_pcur_get_rec(pcur);
6695 
6696 	ut_ad(rec_offs_validate(rec, index, offsets));
6697 	ut_ad(rec_offs_any_extern(offsets));
6698 	ut_ad(btr_mtr);
6699 	ut_ad(mtr_memo_contains_flagged(btr_mtr, dict_index_get_lock(index),
6700 					MTR_MEMO_X_LOCK
6701 					| MTR_MEMO_SX_LOCK)
6702 	      || dict_table_is_intrinsic(index->table)
6703 	      || !index->is_committed());
6704 	ut_ad(mtr_is_block_fix(
6705 		btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX, index->table));
6706 	ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
6707 	ut_a(dict_index_is_clust(index));
6708 
6709 	ut_a(dict_table_page_size(index->table)
6710 		.equals_to(rec_block->page.size));
6711 
6712 	btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
6713 				      &rec, op);
6714 	page_zip = buf_block_get_page_zip(rec_block);
6715 	space_id = rec_block->page.id.space();
6716 	rec_page_no = rec_block->page.id.page_no();
6717 	ut_a(fil_page_index_page_check(page_align(rec))
6718 	     || op == BTR_STORE_INSERT_BULK);
6719 
6720 	if (page_zip) {
6721 		int	err;
6722 
6723 		/* Zlib deflate needs 128 kilobytes for the default
6724 		window size, plus 512 << memLevel, plus a few
6725 		kilobytes for small objects.  We use reduced memLevel
6726 		to limit the memory consumption, and preallocate the
6727 		heap, hoping to avoid memory fragmentation. */
6728 		heap = mem_heap_create(250000);
6729 		page_zip_set_alloc(&c_stream, heap);
6730 
6731 		err = deflateInit2(&c_stream, page_zip_level,
6732 				   Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
6733 		ut_a(err == Z_OK);
6734 	}
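
	/* With windowBits == 15 and memLevel == 7 as passed above, zlib's
	documented estimate of deflate memory use is about
	(1 << (windowBits + 2)) + (1 << (memLevel + 9))
	= 131072 + 65536 bytes, roughly 192 KiB, so the 250000-byte heap
	preallocated above should normally be sufficient. */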
6735 
6736 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6737 	/* All pointers to externally stored columns in the record
6738 	must either be zero or they must be pointers to inherited
6739 	columns, owned by this record or an earlier record version. */
6740 	for (i = 0; i < big_rec_vec->n_fields; i++) {
6741 		field_ref = btr_rec_get_field_ref(
6742 			rec, offsets, big_rec_vec->fields[i].field_no);
6743 
6744 		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
6745 		/* Either this must be an update in place,
6746 		or the BLOB must be inherited, or the BLOB pointer
6747 		must be zero (will be written in this function). */
6748 		ut_a(op == BTR_STORE_UPDATE
6749 		     || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
6750 		     || !memcmp(field_ref, field_ref_zero,
6751 				BTR_EXTERN_FIELD_REF_SIZE));
6752 	}
6753 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6754 
6755 	const page_size_t	page_size(dict_table_page_size(index->table));
6756 
6757 	/* Space available in compressed page to carry blob data */
6758 	const ulint	payload_size_zip = page_size.physical()
6759 		- FIL_PAGE_DATA;
6760 
6761 	/* Space available in uncompressed page to carry blob data */
6762 	const ulint	payload_size = page_size.physical()
6763 		- FIL_PAGE_DATA - BTR_BLOB_HDR_SIZE - FIL_PAGE_DATA_END;
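
	/* For example, with the default 16 KiB uncompressed page and the
	usual header sizes (FIL_PAGE_DATA == 38, BTR_BLOB_HDR_SIZE == 8,
	FIL_PAGE_DATA_END == 8), payload_size = 16384 - 38 - 8 - 8 = 16330
	bytes of BLOB data fit on each page, while an 8 KiB compressed page
	carries payload_size_zip = 8192 - 38 = 8154 bytes. */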
6764 
6765 	/* We have to create a file segment to the tablespace
6766 	for each field and put the pointer to the field in rec */
6767 
6768 	for (i = 0; i < big_rec_vec->n_fields; i++) {
6769 		const ulint field_no = big_rec_vec->fields[i].field_no;
6770 
6771 		field_ref = btr_rec_get_field_ref(rec, offsets, field_no);
6772 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6773 		/* A zero BLOB pointer should have been initially inserted. */
6774 		ut_a(!memcmp(field_ref, field_ref_zero,
6775 			     BTR_EXTERN_FIELD_REF_SIZE));
6776 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6777 		extern_len = big_rec_vec->fields[i].len;
6778 		UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
6779 				   extern_len);
6780 
6781 		ut_a(extern_len > 0);
6782 
6783 		prev_page_no = FIL_NULL;
6784 
6785 		if (page_zip) {
6786 			int	err = deflateReset(&c_stream);
6787 			ut_a(err == Z_OK);
6788 
6789 			c_stream.next_in = (Bytef*)
6790 				big_rec_vec->fields[i].data;
6791 			c_stream.avail_in = static_cast<uInt>(extern_len);
6792 		}
6793 
6794 		for (ulint blob_npages = 0;; ++blob_npages) {
6795 			buf_block_t*	block;
6796 			page_t*		page;
6797 			const ulint	commit_freq = 4;
6798 			ulint		r_extents;
6799 
6800 			ut_ad(page_align(field_ref) == page_align(rec));
6801 
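			/* Every commit_freq BLOB pages, commit and restart
			the clustered-index mini-transaction through
			redo_log.check(), so that the redo log cannot fill up
			while a very long column is being written.  The
			record may have moved, so the pointers derived from
			it are re-read below. */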
6802 			if (!(blob_npages % commit_freq)) {
6803 
6804 				redo_log.check();
6805 
6806 				field_ref = btr_rec_get_field_ref(
6807 					rec, offsets, field_no);
6808 
6809 				page_zip = buf_block_get_page_zip(rec_block);
6810 				rec_page_no = rec_block->page.id.page_no();
6811 			}
6812 
6813 			mtr_start(&mtr);
6814 			mtr.set_named_space(index->space);
6815 			mtr.set_log_mode(btr_mtr->get_log_mode());
6816 			mtr.set_flush_observer(btr_mtr->get_flush_observer());
6817 
6818 			buf_page_get(rec_block->page.id,
6819 				     rec_block->page.size, RW_X_LATCH, &mtr);
6820 
6821 			if (prev_page_no == FIL_NULL) {
6822 				hint_page_no = 1 + rec_page_no;
6823 			} else {
6824 				hint_page_no = prev_page_no + 1;
6825 			}
6826 
6827 			mtr_t	*alloc_mtr;
6828 
6829 			if (op == BTR_STORE_INSERT_BULK) {
6830 				mtr_start(&mtr_bulk);
6831 				mtr_bulk.set_spaces(mtr);
6832 				alloc_mtr = &mtr_bulk;
6833 			} else {
6834 				alloc_mtr = &mtr;
6835 			}
6836 
6837 			if (!fsp_reserve_free_extents(&r_extents, space_id, 1,
6838 						      FSP_BLOB, alloc_mtr,
6839 						      1)) {
6840 
6841 				mtr_commit(alloc_mtr);
6842 				error = DB_OUT_OF_FILE_SPACE;
6843 				goto func_exit;
6844 			}
6845 
6846 			block = btr_page_alloc(index, hint_page_no, FSP_NO_DIR,
6847 					       0, alloc_mtr, &mtr);
6848 
6849 			alloc_mtr->release_free_extents(r_extents);
6850 
6851 			if (op == BTR_STORE_INSERT_BULK) {
6852 				mtr_commit(&mtr_bulk);
6853 			}
6854 
6855 			ut_a(block != NULL);
6856 
6857 			page_no = block->page.id.page_no();
6858 			page = buf_block_get_frame(block);
6859 
6860 			if (prev_page_no != FIL_NULL) {
6861 				buf_block_t*	prev_block;
6862 				page_t*		prev_page;
6863 
6864 				prev_block = buf_page_get(
6865 					page_id_t(space_id, prev_page_no),
6866 					rec_block->page.size,
6867 					RW_X_LATCH, &mtr);
6868 
6869 				buf_block_dbg_add_level(prev_block,
6870 							SYNC_EXTERN_STORAGE);
6871 				prev_page = buf_block_get_frame(prev_block);
6872 
6873 				if (page_zip) {
6874 					mlog_write_ulint(
6875 						prev_page + FIL_PAGE_NEXT,
6876 						page_no, MLOG_4BYTES, &mtr);
6877 					memcpy(buf_block_get_page_zip(
6878 						       prev_block)
6879 					       ->data + FIL_PAGE_NEXT,
6880 					       prev_page + FIL_PAGE_NEXT, 4);
6881 				} else {
6882 					mlog_write_ulint(
6883 						prev_page + FIL_PAGE_DATA
6884 						+ BTR_BLOB_HDR_NEXT_PAGE_NO,
6885 						page_no, MLOG_4BYTES, &mtr);
6886 				}
6887 
6888 			} else if (dict_index_is_online_ddl(index)) {
6889 				row_log_table_blob_alloc(index, page_no);
6890 			}
6891 
6892 			if (page_zip) {
6893 				int		err;
6894 				page_zip_des_t*	blob_page_zip;
6895 
6896 				/* Write FIL_PAGE_TYPE to the redo log
6897 				separately, before logging any other
6898 				changes to the page, so that the debug
6899 				assertions in
6900 				recv_parse_or_apply_log_rec_body() can
6901 				be made simpler.  Before InnoDB Plugin
6902 				1.0.4, the initialization of
6903 				FIL_PAGE_TYPE was logged as part of
6904 				the mlog_log_string() below. */
6905 
6906 				mlog_write_ulint(page + FIL_PAGE_TYPE,
6907 						 prev_page_no == FIL_NULL
6908 						 ? FIL_PAGE_TYPE_ZBLOB
6909 						 : FIL_PAGE_TYPE_ZBLOB2,
6910 						 MLOG_2BYTES, &mtr);
6911 
6912 				c_stream.next_out = page
6913 					+ FIL_PAGE_DATA;
6914 				c_stream.avail_out = static_cast<uInt>(
6915 					payload_size_zip);
6916 
6917 				err = deflate(&c_stream, Z_FINISH);
6918 				ut_a(err == Z_OK || err == Z_STREAM_END);
6919 				ut_a(err == Z_STREAM_END
6920 				     || c_stream.avail_out == 0);
6921 
6922 				/* Write the "next BLOB page" pointer */
6923 				mlog_write_ulint(page + FIL_PAGE_NEXT,
6924 						 FIL_NULL, MLOG_4BYTES, &mtr);
6925 				/* Initialize the unused "prev page" pointer */
6926 				mlog_write_ulint(page + FIL_PAGE_PREV,
6927 						 FIL_NULL, MLOG_4BYTES, &mtr);
6928 				/* Write a back pointer to the record
6929 				into the otherwise unused area.  This
6930 				information could be useful in
6931 				debugging.  Later, we might want to
6932 				implement the possibility to relocate
6933 				BLOB pages.  Then, we would need to be
6934 				able to adjust the BLOB pointer in the
6935 				record.  We do not store the heap
6936 				number of the record, because it can
6937 				change in page_zip_reorganize() or
6938 				btr_page_reorganize().  However, also
6939 				the page number of the record may
6940 				change when B-tree nodes are split or
6941 				merged.
6942 				NOTE: FIL_PAGE_FILE_FLUSH_LSN space is
6943 				used by R-tree index for a Split Sequence
6944 				Number */
6945 				ut_ad(!dict_index_is_spatial(index));
6946 
6947 				mlog_write_ulint(page
6948 						 + FIL_PAGE_FILE_FLUSH_LSN,
6949 						 space_id,
6950 						 MLOG_4BYTES, &mtr);
6951 				mlog_write_ulint(page
6952 						 + FIL_PAGE_FILE_FLUSH_LSN + 4,
6953 						 rec_page_no,
6954 						 MLOG_4BYTES, &mtr);
6955 
6956 				/* Zero out the unused part of the page. */
6957 				memset(page + page_zip_get_size(page_zip)
6958 				       - c_stream.avail_out,
6959 				       0, c_stream.avail_out);
6960 				mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN,
6961 						page_zip_get_size(page_zip)
6962 						- FIL_PAGE_FILE_FLUSH_LSN,
6963 						&mtr);
6964 				/* Copy the page to compressed storage,
6965 				because it will be flushed to disk
6966 				from there. */
6967 				blob_page_zip = buf_block_get_page_zip(block);
6968 				ut_ad(blob_page_zip);
6969 				ut_ad(page_zip_get_size(blob_page_zip)
6970 				      == page_zip_get_size(page_zip));
6971 				memcpy(blob_page_zip->data, page,
6972 				       page_zip_get_size(page_zip));
6973 
6974 				if (err == Z_OK && prev_page_no != FIL_NULL) {
6975 
6976 					goto next_zip_page;
6977 				}
6978 
6979 				if (err == Z_STREAM_END) {
6980 					mach_write_to_4(field_ref
6981 							+ BTR_EXTERN_LEN, 0);
6982 					mach_write_to_4(field_ref
6983 							+ BTR_EXTERN_LEN + 4,
6984 							c_stream.total_in);
6985 				} else {
6986 					memset(field_ref + BTR_EXTERN_LEN,
6987 					       0, 8);
6988 				}
6989 
6990 				if (prev_page_no == FIL_NULL) {
6991 					ut_ad(blob_npages == 0);
6992 					mach_write_to_4(field_ref
6993 							+ BTR_EXTERN_SPACE_ID,
6994 							space_id);
6995 
6996 					mach_write_to_4(field_ref
6997 							+ BTR_EXTERN_PAGE_NO,
6998 							page_no);
6999 
7000 					mach_write_to_4(field_ref
7001 							+ BTR_EXTERN_OFFSET,
7002 							FIL_PAGE_NEXT);
7003 				}
7004 
7005 				/* The page is compressed only when the bulk insert finishes. */
7006 				if (op != BTR_STORE_INSERT_BULK) {
7007 					page_zip_write_blob_ptr(
7008 						page_zip, rec, index, offsets,
7009 						field_no, &mtr);
7010 				}
7011 
7012 next_zip_page:
7013 				prev_page_no = page_no;
7014 
7015 				/* Commit mtr and release the
7016 				uncompressed page frame to save memory. */
7017 				btr_blob_free(index, block, FALSE, &mtr);
7018 
7019 				if (err == Z_STREAM_END) {
7020 					break;
7021 				}
7022 			} else {
7023 				mlog_write_ulint(page + FIL_PAGE_TYPE,
7024 						 FIL_PAGE_TYPE_BLOB,
7025 						 MLOG_2BYTES, &mtr);
7026 
7027 				if (extern_len > payload_size) {
7028 					store_len = payload_size;
7029 				} else {
7030 					store_len = extern_len;
7031 				}
7032 
7033 				mlog_write_string(page + FIL_PAGE_DATA
7034 						  + BTR_BLOB_HDR_SIZE,
7035 						  (const byte*)
7036 						  big_rec_vec->fields[i].data
7037 						  + big_rec_vec->fields[i].len
7038 						  - extern_len,
7039 						  store_len, &mtr);
7040 				mlog_write_ulint(page + FIL_PAGE_DATA
7041 						 + BTR_BLOB_HDR_PART_LEN,
7042 						 store_len, MLOG_4BYTES, &mtr);
7043 				mlog_write_ulint(page + FIL_PAGE_DATA
7044 						 + BTR_BLOB_HDR_NEXT_PAGE_NO,
7045 						 FIL_NULL, MLOG_4BYTES, &mtr);
7046 
7047 				extern_len -= store_len;
7048 
7049 				mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
7050 						 MLOG_4BYTES, &mtr);
7051 				mlog_write_ulint(field_ref
7052 						 + BTR_EXTERN_LEN + 4,
7053 						 big_rec_vec->fields[i].len
7054 						 - extern_len,
7055 						 MLOG_4BYTES, &mtr);
7056 
7057 				if (prev_page_no == FIL_NULL) {
7058 					ut_ad(blob_npages == 0);
7059 					mlog_write_ulint(field_ref
7060 							 + BTR_EXTERN_SPACE_ID,
7061 							 space_id, MLOG_4BYTES,
7062 							 &mtr);
7063 
7064 					mlog_write_ulint(field_ref
7065 							 + BTR_EXTERN_PAGE_NO,
7066 							 page_no, MLOG_4BYTES,
7067 							 &mtr);
7068 
7069 					mlog_write_ulint(field_ref
7070 							 + BTR_EXTERN_OFFSET,
7071 							 FIL_PAGE_DATA,
7072 							 MLOG_4BYTES,
7073 							 &mtr);
7074 				}
7075 
7076 				prev_page_no = page_no;
7077 
7078 				mtr_commit(&mtr);
7079 
7080 				if (extern_len == 0) {
7081 					break;
7082 				}
7083 			}
7084 		}
7085 
7086 		DBUG_EXECUTE_IF("btr_store_big_rec_extern",
7087 				error = DB_OUT_OF_FILE_SPACE;
7088 				goto func_exit;);
7089 
7090 		rec_offs_make_nth_extern(offsets, field_no);
7091 	}
7092 
7093 func_exit:
7094 	if (page_zip) {
7095 		deflateEnd(&c_stream);
7096 	}
7097 
7098 	if (heap != NULL) {
7099 		mem_heap_free(heap);
7100 	}
7101 
7102 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
7103 	/* All pointers to externally stored columns in the record
7104 	must be valid. */
7105 	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
7106 		if (!rec_offs_nth_extern(offsets, i)) {
7107 			continue;
7108 		}
7109 
7110 		field_ref = btr_rec_get_field_ref(rec, offsets, i);
7111 
7112 		/* The pointer must not be zero if the operation
7113 		succeeded. */
7114 		ut_a(0 != memcmp(field_ref, field_ref_zero,
7115 				 BTR_EXTERN_FIELD_REF_SIZE)
7116 		     || error != DB_SUCCESS);
7117 		/* The column must not be disowned by this record. */
7118 		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
7119 	}
7120 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
7121 	return(error);
7122 }
7123 
7124 /*******************************************************************//**
7125 Check the FIL_PAGE_TYPE on an uncompressed BLOB page. */
7126 static
7127 void
7128 btr_check_blob_fil_page_type(
7129 /*=========================*/
7130 	ulint		space_id,	/*!< in: space id */
7131 	ulint		page_no,	/*!< in: page number */
7132 	const page_t*	page,		/*!< in: page */
7133 	ibool		read)		/*!< in: TRUE=read, FALSE=purge */
7134 {
7135 	ulint	type = fil_page_get_type(page);
7136 
7137 	ut_a(space_id == page_get_space_id(page));
7138 	ut_a(page_no == page_get_page_no(page));
7139 
7140 	if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
7141 		ulint	flags = fil_space_get_flags(space_id);
7142 
7143 #ifndef UNIV_DEBUG /* Improve debug test coverage */
7144 		if (dict_tf_get_format(flags) == UNIV_FORMAT_A) {
7145 			/* Old versions of InnoDB did not initialize
7146 			FIL_PAGE_TYPE on BLOB pages.  Do not print
7147 			anything about the type mismatch when reading
7148 			a BLOB page that is in Antelope format.*/
7149 			return;
7150 		}
7151 #endif /* !UNIV_DEBUG */
7152 
7153 		ib::fatal() << "FIL_PAGE_TYPE=" << type
7154 			<< " on BLOB " << (read ? "read" : "purge")
7155 			<< " space " << space_id << " page " << page_no
7156 			<< " flags " << flags;
7157 	}
7158 }
7159 
7160 /*******************************************************************//**
7161 Frees the space in an externally stored field to the file space
7162 management if the field in data is owned by the externally stored field.
7163 In a rollback we may have the additional condition that the field must
7164 not be inherited. */
7165 void
7166 btr_free_externally_stored_field(
7167 /*=============================*/
7168 	dict_index_t*	index,		/*!< in: index of the data, the index
7169 					tree MUST be X-latched; if the tree
7170 					height is 1, then also the root page
7171 					must be X-latched! (this is relevant
7172 					in the case this function is called
7173 					from purge where 'data' is located on
7174 					an undo log page, not an index
7175 					page) */
7176 	byte*		field_ref,	/*!< in/out: field reference */
7177 	const rec_t*	rec,		/*!< in: record containing field_ref, for
7178 					page_zip_write_blob_ptr(), or NULL */
7179 	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
7180 					or NULL */
7181 	page_zip_des_t*	page_zip,	/*!< in: compressed page corresponding
7182 					to rec, or NULL if rec == NULL */
7183 	ulint		i,		/*!< in: field number of field_ref;
7184 					ignored if rec == NULL */
7185 	bool		rollback,	/*!< in: performing rollback? */
7186 	mtr_t*		local_mtr)	/*!< in: mtr
7187 					containing the latch to the data and
7188 					an X-latch to the index tree */
7189 {
7190 	page_t*		page;
7191 	const ulint	space_id	= mach_read_from_4(
7192 		field_ref + BTR_EXTERN_SPACE_ID);
7193 	const ulint	start_page	= mach_read_from_4(
7194 		field_ref + BTR_EXTERN_PAGE_NO);
7195 	ulint		page_no;
7196 	ulint		next_page_no;
7197 	mtr_t		mtr;
7198 
7199 	ut_ad(dict_index_is_clust(index));
7200 	ut_ad(mtr_memo_contains_flagged(local_mtr, dict_index_get_lock(index),
7201 					MTR_MEMO_X_LOCK
7202 					| MTR_MEMO_SX_LOCK)
7203 	      || dict_table_is_intrinsic(index->table));
7204 	ut_ad(mtr_is_page_fix(
7205 		local_mtr, field_ref, MTR_MEMO_PAGE_X_FIX, index->table));
7206 	ut_ad(!rec || rec_offs_validate(rec, index, offsets));
7207 	ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
7208 	ut_ad(local_mtr->is_named_space(
7209 		      page_get_space_id(page_align(field_ref))));
7210 
7211 	if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
7212 				  BTR_EXTERN_FIELD_REF_SIZE))) {
7213 		/* In the rollback, we may encounter a clustered index
7214 		record with some unwritten off-page columns. There is
7215 		nothing to free then. */
7216 		ut_a(rollback);
7217 		return;
7218 	}
7219 
7220 	ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN)
7221 	        & ~((BTR_EXTERN_OWNER_FLAG
7222 	             | BTR_EXTERN_INHERITED_FLAG) << 24)));
7223 	ut_ad(space_id == index->space);
7224 
7225 	const page_size_t	ext_page_size(dict_table_page_size(index->table));
7226 	const page_size_t&	rec_page_size(rec == NULL
7227 					      ? univ_page_size
7228 					      : ext_page_size);
7229 	if (rec == NULL) {
7230 		/* This is a call from row_purge_upd_exist_or_extern(). */
7231 		ut_ad(!page_zip);
7232 	}
7233 
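	/* Free the BLOB chain one page at a time.  Each iteration runs in
	its own mini-transaction and advances BTR_EXTERN_PAGE_NO in
	field_ref to the next page, so the chain is released incrementally
	rather than holding all BLOB pages latched at once. */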
7234 	for (;;) {
7235 #ifdef UNIV_DEBUG
7236 		buf_block_t*	rec_block;
7237 #endif /* UNIV_DEBUG */
7238 		buf_block_t*	ext_block;
7239 
7240 		mtr_start(&mtr);
7241 		mtr.set_spaces(*local_mtr);
7242 		mtr.set_log_mode(local_mtr->get_log_mode());
7243 
7244 		ut_ad(!dict_table_is_temporary(index->table)
7245 		      || local_mtr->get_log_mode() == MTR_LOG_NO_REDO);
7246 
7247 		const page_t*	p = page_align(field_ref);
7248 
7249 		const page_id_t	page_id(page_get_space_id(p),
7250 					page_get_page_no(p));
7251 
7252 #ifdef UNIV_DEBUG
7253 		rec_block =
7254 #endif /* UNIV_DEBUG */
7255 		buf_page_get(page_id, rec_page_size, RW_X_LATCH, &mtr);
7256 
7257 		buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
7258 		page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
7259 
7260 		if (/* There is no external storage data */
7261 		    page_no == FIL_NULL
7262 		    /* This field does not own the externally stored field */
7263 		    || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
7264 			& BTR_EXTERN_OWNER_FLAG)
7265 		    /* Rollback and inherited field */
7266 		    || (rollback
7267 			&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
7268 			    & BTR_EXTERN_INHERITED_FLAG))) {
7269 
7270 			/* Do not free */
7271 			mtr_commit(&mtr);
7272 
7273 			return;
7274 		}
7275 
7276 		if (page_no == start_page && dict_index_is_online_ddl(index)) {
7277 			row_log_table_blob_free(index, start_page);
7278 		}
7279 
7280 		ext_block = buf_page_get(
7281 			page_id_t(space_id, page_no), ext_page_size,
7282 			RW_X_LATCH, &mtr);
7283 
7284 		buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
7285 		page = buf_block_get_frame(ext_block);
7286 
7287 		if (ext_page_size.is_compressed()) {
7288 			/* Note that page_zip will be NULL
7289 			in row_purge_upd_exist_or_extern(). */
7290 			switch (fil_page_get_type(page)) {
7291 			case FIL_PAGE_TYPE_ZBLOB:
7292 			case FIL_PAGE_TYPE_ZBLOB2:
7293 				break;
7294 			default:
7295 				ut_error;
7296 			}
7297 			next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
7298 
7299 			btr_page_free_low(index, ext_block, ULINT_UNDEFINED,
7300 					  &mtr);
7301 
7302 			if (page_zip != NULL) {
7303 				mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
7304 						next_page_no);
7305 				mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
7306 						0);
7307 				page_zip_write_blob_ptr(page_zip, rec, index,
7308 							offsets, i, &mtr);
7309 			} else {
7310 				mlog_write_ulint(field_ref
7311 						 + BTR_EXTERN_PAGE_NO,
7312 						 next_page_no,
7313 						 MLOG_4BYTES, &mtr);
7314 				mlog_write_ulint(field_ref
7315 						 + BTR_EXTERN_LEN + 4, 0,
7316 						 MLOG_4BYTES, &mtr);
7317 			}
7318 		} else {
7319 			ut_a(!page_zip);
7320 			btr_check_blob_fil_page_type(space_id, page_no, page,
7321 						     FALSE);
7322 
7323 			next_page_no = mach_read_from_4(
7324 				page + FIL_PAGE_DATA
7325 				+ BTR_BLOB_HDR_NEXT_PAGE_NO);
7326 
7327 			btr_page_free_low(index, ext_block, ULINT_UNDEFINED,
7328 					  &mtr);
7329 
7330 			mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
7331 					 next_page_no,
7332 					 MLOG_4BYTES, &mtr);
7333 			/* Zero out the BLOB length.  If the server
7334 			crashes during the execution of this function,
7335 			trx_rollback_or_clean_all_recovered() could
7336 			dereference the half-deleted BLOB, fetching a
7337 			wrong prefix for the BLOB. */
7338 			mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
7339 					 0,
7340 					 MLOG_4BYTES, &mtr);
7341 		}
7342 
7343 		/* Commit mtr and release the BLOB block to save memory. */
7344 		btr_blob_free(index, ext_block, TRUE, &mtr);
7345 	}
7346 }
7347 
7348 /***********************************************************//**
7349 Frees the externally stored fields for a record. */
7350 static
7351 void
7352 btr_rec_free_externally_stored_fields(
7353 /*==================================*/
7354 	dict_index_t*	index,	/*!< in: index of the data, the index
7355 				tree MUST be X-latched */
7356 	rec_t*		rec,	/*!< in/out: record */
7357 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
7358 	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
7359 				part will be updated, or NULL */
7360 	bool		rollback,/*!< in: performing rollback? */
7361 	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
7362 				an X-latch to record page and to the index
7363 				tree */
7364 {
7365 	ulint	n_fields;
7366 	ulint	i;
7367 
7368 	ut_ad(rec_offs_validate(rec, index, offsets));
7369 	ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table));
7370 	/* Free possible externally stored fields in the record */
7371 
7372 	ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
7373 	n_fields = rec_offs_n_fields(offsets);
7374 
7375 	for (i = 0; i < n_fields; i++) {
7376 		if (rec_offs_nth_extern(offsets, i)) {
7377 			btr_free_externally_stored_field(
7378 				index, btr_rec_get_field_ref(rec, offsets, i),
7379 				rec, offsets, page_zip, i, rollback, mtr);
7380 		}
7381 	}
7382 }
7383 
7384 /***********************************************************//**
7385 Frees the externally stored fields for a record, if the field is mentioned
7386 in the update vector. */
7387 static
7388 void
7389 btr_rec_free_updated_extern_fields(
7390 /*===============================*/
7391 	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
7392 				X-latched */
7393 	rec_t*		rec,	/*!< in/out: record */
7394 	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
7395 				part will be updated, or NULL */
7396 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
7397 	const upd_t*	update,	/*!< in: update vector */
7398 	bool		rollback,/*!< in: performing rollback? */
7399 	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
7400 				an X-latch to record page and to the tree */
7401 {
7402 	ulint	n_fields;
7403 	ulint	i;
7404 
7405 	ut_ad(rec_offs_validate(rec, index, offsets));
7406 	ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table));
7407 
7408 	/* Free possible externally stored fields in the record */
7409 
7410 	n_fields = upd_get_n_fields(update);
7411 
7412 	for (i = 0; i < n_fields; i++) {
7413 		const upd_field_t* ufield = upd_get_nth_field(update, i);
7414 
7415 		if (rec_offs_nth_extern(offsets, ufield->field_no)) {
7416 			ulint	len;
7417 			byte*	data = rec_get_nth_field(
7418 				rec, offsets, ufield->field_no, &len);
7419 			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
7420 
7421 			btr_free_externally_stored_field(
7422 				index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
7423 				rec, offsets, page_zip,
7424 				ufield->field_no, rollback, mtr);
7425 		}
7426 	}
7427 }
7428 
7429 /*******************************************************************//**
7430 Copies the prefix of an uncompressed BLOB.  The clustered index record
7431 that points to this BLOB must be protected by a lock or a page latch.
7432 @return number of bytes written to buf */
7433 static
7434 ulint
7435 btr_copy_blob_prefix(
7436 /*=================*/
7437 	byte*		buf,	/*!< out: the externally stored part of
7438 				the field, or a prefix of it */
7439 	ulint		len,	/*!< in: length of buf, in bytes */
7440 	ulint		space_id,/*!< in: space id of the BLOB pages */
7441 	ulint		page_no,/*!< in: page number of the first BLOB page */
7442 	ulint		offset)	/*!< in: offset on the first BLOB page */
7443 {
7444 	ulint	copied_len	= 0;
7445 
7446 	for (;;) {
7447 		mtr_t		mtr;
7448 		buf_block_t*	block;
7449 		const page_t*	page;
7450 		const byte*	blob_header;
7451 		ulint		part_len;
7452 		ulint		copy_len;
7453 
7454 		mtr_start(&mtr);
7455 
7456 		block = buf_page_get(page_id_t(space_id, page_no),
7457 				     univ_page_size, RW_S_LATCH, &mtr);
7458 		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
7459 		page = buf_block_get_frame(block);
7460 
7461 		btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);
7462 
7463 		blob_header = page + offset;
7464 		part_len = btr_blob_get_part_len(blob_header);
7465 		copy_len = ut_min(part_len, len - copied_len);
7466 
7467 		memcpy(buf + copied_len,
7468 		       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
7469 		copied_len += copy_len;
7470 
7471 		page_no = btr_blob_get_next_page_no(blob_header);
7472 
7473 		mtr_commit(&mtr);
7474 
7475 		if (page_no == FIL_NULL || copy_len != part_len) {
7476 			UNIV_MEM_ASSERT_RW(buf, copied_len);
7477 			return(copied_len);
7478 		}
7479 
7480 		/* On BLOB pages other than the first, the BLOB header
7481 		is always at the start of the page data: */
7482 
7483 		offset = FIL_PAGE_DATA;
7484 
7485 		ut_ad(copied_len <= len);
7486 	}
7487 }
7488 
7489 /** Copies the prefix of a compressed BLOB.
7490 The clustered index record that points to this BLOB must be protected
7491 by a lock or a page latch.
7492 @param[out]	buf		the externally stored part of the field,
7493 or a prefix of it
7494 @param[in]	len		length of buf, in bytes
7495 @param[in]	page_size	compressed BLOB page size
7496 @param[in]	space_id	space id of the BLOB pages
@param[in]	page_no		page number of the first BLOB page
7497 @param[in]	offset		offset on the first BLOB page
7498 @return number of bytes written to buf */
7499 static
7500 ulint
7501 btr_copy_zblob_prefix(
7502 	byte*			buf,
7503 	ulint			len,
7504 	const page_size_t&	page_size,
7505 	ulint			space_id,
7506 	ulint			page_no,
7507 	ulint			offset)
7508 {
7509 	ulint		page_type = FIL_PAGE_TYPE_ZBLOB;
7510 	mem_heap_t*	heap;
7511 	int		err;
7512 	z_stream	d_stream;
7513 
7514 	d_stream.next_out = buf;
7515 	d_stream.avail_out = static_cast<uInt>(len);
7516 	d_stream.next_in = Z_NULL;
7517 	d_stream.avail_in = 0;
7518 
7519 	/* Zlib inflate needs 32 kilobytes for the default
7520 	window size, plus a few kilobytes for small objects. */
7521 	heap = mem_heap_create(40000);
7522 	page_zip_set_alloc(&d_stream, heap);
7523 
7524 	ut_ad(page_size.is_compressed());
7525 	ut_ad(space_id);
7526 
7527 	err = inflateInit(&d_stream);
7528 	ut_a(err == Z_OK);
7529 
7530 	for (;;) {
7531 		buf_page_t*	bpage;
7532 		ulint		next_page_no;
7533 
7534 		/* There is no latch on bpage directly.  Instead,
7535 		bpage is protected by the B-tree page latch that
7536 		is being held on the clustered index record, or,
7537 		in row_merge_copy_blobs(), by an exclusive table lock. */
7538 		bpage = buf_page_get_zip(page_id_t(space_id, page_no),
7539 					 page_size);
7540 
7541 		if (UNIV_UNLIKELY(!bpage)) {
7542 			ib::error() << "Cannot load compressed BLOB "
7543 				<< page_id_t(space_id, page_no);
7544 			goto func_exit;
7545 		}
7546 
7547 		if (UNIV_UNLIKELY
7548 		    (fil_page_get_type(bpage->zip.data) != page_type)) {
7549 
7550 			ib::error() << "Unexpected type "
7551 				<< fil_page_get_type(bpage->zip.data)
7552 				<< " of compressed BLOB page "
7553 				<< page_id_t(space_id, page_no);
7554 
7555 			ut_ad(0);
7556 			goto end_of_blob;
7557 		}
7558 
7559 		next_page_no = mach_read_from_4(bpage->zip.data + offset);
7560 
7561 		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
7562 			/* When the BLOB begins at page header,
7563 			the compressed data payload does not
7564 			immediately follow the next page pointer. */
7565 			offset = FIL_PAGE_DATA;
7566 		} else {
7567 			offset += 4;
7568 		}
7569 
7570 		d_stream.next_in = bpage->zip.data + offset;
7571 		d_stream.avail_in = static_cast<uInt>(page_size.physical()
7572 						      - offset);
7573 
7574 		err = inflate(&d_stream, Z_NO_FLUSH);
7575 		switch (err) {
7576 		case Z_OK:
7577 			if (!d_stream.avail_out) {
7578 				goto end_of_blob;
7579 			}
7580 			break;
7581 		case Z_STREAM_END:
7582 			if (next_page_no == FIL_NULL) {
7583 				goto end_of_blob;
7584 			}
7585 			/* fall through */
7586 		default:
7587 inflate_error:
7588 			ib::error() << "inflate() of compressed BLOB page "
7589 				<< page_id_t(space_id, page_no)
7590 				<< " returned " << err
7591 				<< " (" << d_stream.msg << ")";
7592 
7593 		case Z_BUF_ERROR:
7594 			goto end_of_blob;
7595 		}
7596 
7597 		if (next_page_no == FIL_NULL) {
7598 			if (!d_stream.avail_in) {
7599 				ib::error()
7600 					<< "Unexpected end of compressed "
7601 					<< "BLOB page "
7602 					<< page_id_t(space_id, page_no);
7603 			} else {
7604 				err = inflate(&d_stream, Z_FINISH);
7605 				switch (err) {
7606 				case Z_STREAM_END:
7607 				case Z_BUF_ERROR:
7608 					break;
7609 				default:
7610 					goto inflate_error;
7611 				}
7612 			}
7613 
7614 end_of_blob:
7615 			buf_page_release_zip(bpage);
7616 			goto func_exit;
7617 		}
7618 
7619 		buf_page_release_zip(bpage);
7620 
7621 		/* On BLOB pages other than the first, the BLOB
7622 		header is always at the page header: */
7623 
7624 		page_no = next_page_no;
7625 		offset = FIL_PAGE_NEXT;
7626 		page_type = FIL_PAGE_TYPE_ZBLOB2;
7627 	}
7628 
7629 func_exit:
7630 	inflateEnd(&d_stream);
7631 	mem_heap_free(heap);
7632 	UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
7633 	return(d_stream.total_out);
7634 }
7635 
7636 /** Copies the prefix of an externally stored field of a record.
7637 The clustered index record that points to this BLOB must be protected
7638 by a lock or a page latch.
7639 @param[out]	buf		the externally stored part of the
7640 field, or a prefix of it
7641 @param[in]	len		length of buf, in bytes
7642 @param[in]	page_size	BLOB page size
7643 @param[in]	space_id	space id of the first BLOB page
7644 @param[in]	page_no		page number of the first BLOB page
7645 @param[in]	offset		offset on the first BLOB page
7646 @return number of bytes written to buf */
7647 static
7648 ulint
7649 btr_copy_externally_stored_field_prefix_low(
7650 	byte*			buf,
7651 	ulint			len,
7652 	const page_size_t&	page_size,
7653 	ulint			space_id,
7654 	ulint			page_no,
7655 	ulint			offset)
7656 {
7657 	if (len == 0) {
7658 		return(0);
7659 	}
7660 
7661 	if (page_size.is_compressed()) {
7662 		return(btr_copy_zblob_prefix(buf, len, page_size,
7663 					     space_id, page_no, offset));
7664 	} else {
7665 		ut_ad(page_size.equals_to(univ_page_size));
7666 		return(btr_copy_blob_prefix(buf, len, space_id,
7667 					    page_no, offset));
7668 	}
7669 }
7670 
7671 /** Copies the prefix of an externally stored field of a record.
7672 The clustered index record must be protected by a lock or a page latch.
7673 @param[out]	buf		the field, or a prefix of it
7674 @param[in]	len		length of buf, in bytes
7675 @param[in]	page_size	BLOB page size
7676 @param[in]	data		'internally' stored part of the field
7677 containing also the reference to the external part; must be protected by
7678 a lock or a page latch
7679 @param[in]	local_len	length of data, in bytes
7680 @return the length of the copied field, or 0 if the column was being
7681 or has been deleted */
7682 ulint
7683 btr_copy_externally_stored_field_prefix(
7684 	byte*			buf,
7685 	ulint			len,
7686 	const page_size_t&	page_size,
7687 	const byte*		data,
7688 	ulint			local_len)
7689 {
7690 	ulint	space_id;
7691 	ulint	page_no;
7692 	ulint	offset;
7693 
7694 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
7695 
7696 	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
7697 
7698 	if (UNIV_UNLIKELY(local_len >= len)) {
7699 		memcpy(buf, data, len);
7700 		return(len);
7701 	}
7702 
7703 	memcpy(buf, data, local_len);
7704 	data += local_len;
7705 
7706 	ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
7707 
7708 	if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
7709 		/* The externally stored part of the column has been
7710 		(partially) deleted.  Signal the half-deleted BLOB
7711 		to the caller. */
7712 
7713 		return(0);
7714 	}
7715 
7716 	space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
7717 
7718 	page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
7719 
7720 	offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
7721 
7722 	return(local_len
7723 	       + btr_copy_externally_stored_field_prefix_low(buf + local_len,
7724 							     len - local_len,
7725 							     page_size,
7726 							     space_id, page_no,
7727 							     offset));
7728 }
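
/* A rough usage sketch (buffer name and size are illustrative only):
given the locally stored part "data"/"local_len" of an externally
stored column and the page_size of its tablespace, a caller can fetch
up to sizeof(prefix_buf) bytes of the column prefix like this:

	byte	prefix_buf[512];
	ulint	copied = btr_copy_externally_stored_field_prefix(
		prefix_buf, sizeof(prefix_buf), page_size, data, local_len);
	if (copied == 0) {
		// the BLOB was being or has been deleted; prefix unavailable
	}
*/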
7729 
7730 /** Copies an externally stored field of a record to mem heap.
7731 The clustered index record must be protected by a lock or a page latch.
7732 @param[out]	len		length of the whole field
7733 @param[in]	data		'internally' stored part of the field
7734 containing also the reference to the external part; must be protected by
7735 a lock or a page latch
7736 @param[in]	page_size	BLOB page size
7737 @param[in]	local_len	length of data
7738 @param[in,out]	heap		mem heap
7739 @return the whole field copied to heap */
7740 byte*
7741 btr_copy_externally_stored_field(
7742 	ulint*			len,
7743 	const byte*		data,
7744 	const page_size_t&	page_size,
7745 	ulint			local_len,
7746 	mem_heap_t*		heap)
7747 {
7748 	ulint	space_id;
7749 	ulint	page_no;
7750 	ulint	offset;
7751 	ulint	extern_len;
7752 	byte*	buf;
7753 
7754 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
7755 
7756 	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
7757 
7758 	space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);
7759 
7760 	page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);
7761 
7762 	offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);
7763 
7764 	/* Currently a BLOB cannot be bigger than 4 GB; we
7765 	leave the 4 upper bytes in the length field unused */
7766 
7767 	extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
7768 
7769 	buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);
7770 
7771 	memcpy(buf, data, local_len);
7772 	*len = local_len
7773 		+ btr_copy_externally_stored_field_prefix_low(buf + local_len,
7774 							      extern_len,
7775 							      page_size,
7776 							      space_id,
7777 							      page_no, offset);
7778 
7779 	return(buf);
7780 }
7781 
7782 /** Copies an externally stored field of a record to mem heap.
7783 @param[in]	rec		record in a clustered index; must be
7784 protected by a lock or a page latch
7785 @param[in]	offsets		array returned by rec_get_offsets()
7786 @param[in]	page_size	BLOB page size
7787 @param[in]	no		field number
7788 @param[out]	len		length of the field
7789 @param[in,out]	heap		mem heap
7790 @return the field copied to heap, or NULL if the field is incomplete */
7791 byte*
7792 btr_rec_copy_externally_stored_field(
7793 	const rec_t*		rec,
7794 	const ulint*		offsets,
7795 	const page_size_t&	page_size,
7796 	ulint			no,
7797 	ulint*			len,
7798 	mem_heap_t*		heap)
7799 {
7800 	ulint		local_len;
7801 	const byte*	data;
7802 
7803 	ut_a(rec_offs_nth_extern(offsets, no));
7804 
7805 	/* An externally stored field can contain some initial
7806 	data from the field, and in the last 20 bytes it has the
7807 	space id, page number, and offset where the rest of the
7808 	field data is stored, and the data length in addition to
7809 	the data stored locally. We may need to store some data
7810 	locally to get the local record length above the 128 byte
7811 	limit so that field offsets are stored in two bytes, and
7812 	the extern bit is available in those two bytes. */
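	/* For reference, the 20-byte (BTR_EXTERN_FIELD_REF_SIZE) reference
	stored at the end of the local prefix is laid out as:
	BTR_EXTERN_SPACE_ID (offset 0, 4 bytes), BTR_EXTERN_PAGE_NO
	(offset 4, 4 bytes), BTR_EXTERN_OFFSET (offset 8, 4 bytes) and
	BTR_EXTERN_LEN (offset 12, 8 bytes), whose first byte carries the
	OWNER/INHERITED flags and whose low 4 bytes hold the externally
	stored length (a BLOB is currently at most 4 GB). */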
7813 
7814 	data = rec_get_nth_field(rec, offsets, no, &local_len);
7815 
7816 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
7817 
7818 	if (UNIV_UNLIKELY
7819 	    (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
7820 		     field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
7821 		/* The externally stored field was not written yet.
7822 		This record should only be seen by
7823 		recv_recovery_rollback_active() or any
7824 		TRX_ISO_READ_UNCOMMITTED transactions. */
7825 		return(NULL);
7826 	}
7827 
7828 	return(btr_copy_externally_stored_field(len, data,
7829 						page_size, local_len, heap));
7830 }
7831 #endif /* !UNIV_HOTBACKUP */
7832