1 /*****************************************************************************
2 
3 Copyright (c) 1994, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2008, Google Inc.
5 Copyright (c) 2012, Facebook Inc.
6 
7 Portions of this file contain modifications contributed and copyrighted by
8 Google, Inc. Those modifications are gratefully acknowledged and are described
9 briefly in the InnoDB documentation. The contributions by Google are
10 incorporated with their permission, and subject to the conditions contained in
11 the file COPYING.Google.
12 
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16 
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation.  The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23 
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27 GNU General Public License, version 2.0, for more details.
28 
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32 
33 *****************************************************************************/
34 
35 /**************************************************//**
36 @file btr/btr0cur.cc
37 The index tree cursor
38 
39 All changes that row operations make to a B-tree or the records
40 there must go through this module! Undo log records are written here
41 for every modify or insert of a clustered index record.
42 
43 			NOTE!!!
44 To make sure we do not run out of disk space during a pessimistic
45 insert or update, we have to reserve as many pages in the tablespace
46 as 2 x the height of the index tree before we start the operation,
47 because once leaf splitting has been started, it is difficult to undo,
48 except by crashing the database and doing a roll-forward.
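For example, if the index tree has height 3, roughly 2 x 3 = 6 free pages
must be reserved in the tablespace before starting such an operation.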
49 
50 Created 10/16/1994 Heikki Tuuri
51 *******************************************************/
52 
53 #include "btr0cur.h"
54 
55 #ifdef UNIV_NONINL
56 #include "btr0cur.ic"
57 #endif
58 
59 #include "row0upd.h"
60 #ifndef UNIV_HOTBACKUP
61 #include "mtr0log.h"
62 #include "page0page.h"
63 #include "page0zip.h"
64 #include "rem0rec.h"
65 #include "rem0cmp.h"
66 #include "buf0lru.h"
67 #include "btr0btr.h"
68 #include "btr0sea.h"
69 #include "row0log.h"
70 #include "row0purge.h"
71 #include "row0upd.h"
72 #include "trx0rec.h"
73 #include "trx0roll.h"
74 #include "que0que.h"
75 #include "row0row.h"
76 #include "srv0srv.h"
77 #include "ibuf0ibuf.h"
78 #include "lock0lock.h"
79 #include "zlib.h"
80 #include "srv0start.h"
81 
82 /** Buffered B-tree operation types, introduced as part of delete buffering. */
83 enum btr_op_t {
84 	BTR_NO_OP = 0,			/*!< Not buffered */
85 	BTR_INSERT_OP,			/*!< Insert, do not ignore UNIQUE */
86 	BTR_INSERT_IGNORE_UNIQUE_OP,	/*!< Insert, ignoring UNIQUE */
87 	BTR_DELETE_OP,			/*!< Purge a delete-marked record */
88 	BTR_DELMARK_OP			/*!< Mark a record for deletion */
89 };
90 
91 /** Modification types for the B-tree operation. */
92 enum btr_intention_t {
93 	BTR_INTENTION_DELETE,
94 	BTR_INTENTION_BOTH,
95 	BTR_INTENTION_INSERT
96 };
97 #if BTR_INTENTION_DELETE > BTR_INTENTION_BOTH
98 #error "BTR_INTENTION_DELETE > BTR_INTENTION_BOTH"
99 #endif
100 #if BTR_INTENTION_BOTH > BTR_INTENTION_INSERT
101 #error "BTR_INTENTION_BOTH > BTR_INTENTION_INSERT"
102 #endif
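/* The two compile-time checks above guarantee the ordering
BTR_INTENTION_DELETE < BTR_INTENTION_BOTH < BTR_INTENTION_INSERT,
which later code relies on in comparisons such as
"lock_intention <= BTR_INTENTION_BOTH" in btr_cur_will_modify_tree(). */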
103 
104 /** With the index->lock scalability improvement, the only clear performance
105 regression observed was caused by a hugely grown history list length. That is
106 because the previous exclusive use of index->lock also worked as a way to
107 reserve free blocks and read IO bandwidth for purge with priority. To keep
108 the history list from growing as huge as with the previous implementation,
109 pessimistic tree operations by purge are prioritized, as before, when the
110 list seems to be growing too long.
111 
112 Experimentally, the history list length starts to clearly affect throughput at about 100000. */
113 #define BTR_CUR_FINE_HISTORY_LENGTH	100000
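/* A minimal sketch of how this threshold is used (see the BTR_MODIFY_TREE
case in btr_cur_search_to_nth_level() below): when a delete-intended tree
operation sees trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH and
there are pending read I/Os, it takes an X lock on the index instead of an
SX lock, so that purge keeps priority on free blocks and read IO
bandwidth. */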
114 
115 /** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
116 ulint	btr_cur_n_non_sea	= 0;
117 /** Number of successful adaptive hash index lookups in
118 btr_cur_search_to_nth_level(). */
119 ulint	btr_cur_n_sea		= 0;
120 /** Old value of btr_cur_n_non_sea.  Copied by
121 srv_refresh_innodb_monitor_stats().  Referenced by
122 srv_printf_innodb_monitor(). */
123 ulint	btr_cur_n_non_sea_old	= 0;
124 /** Old value of btr_cur_n_sea.  Copied by
125 srv_refresh_innodb_monitor_stats().  Referenced by
126 srv_printf_innodb_monitor(). */
127 ulint	btr_cur_n_sea_old	= 0;
128 
129 #ifdef UNIV_DEBUG
130 /* Flag to limit optimistic insert records */
131 uint	btr_cur_limit_optimistic_insert_debug = 0;
132 #endif /* UNIV_DEBUG */
133 
134 /** In an optimistic insert, if the record does not fit but at least this much
135 space can be released by a page reorganize, then the page is reorganized */
136 #define BTR_CUR_PAGE_REORGANIZE_LIMIT	(UNIV_PAGE_SIZE / 32)
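/* For example, with the default 16KB UNIV_PAGE_SIZE this limit evaluates to
16384 / 32 = 512 bytes. */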
137 
138 /** The structure of a BLOB part header */
139 /* @{ */
140 /*--------------------------------------*/
141 #define BTR_BLOB_HDR_PART_LEN		0	/*!< BLOB part len on this
142 						page */
143 #define BTR_BLOB_HDR_NEXT_PAGE_NO	4	/*!< next BLOB part page no,
144 						FIL_NULL if none */
145 /*--------------------------------------*/
146 #define BTR_BLOB_HDR_SIZE		8	/*!< Size of a BLOB
147 						part header, in bytes */
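/* Illustrative layout of a BLOB part page payload, as implied by the offsets
above (the BLOB data itself follows the 8-byte header):
	bytes 0..3  BTR_BLOB_HDR_PART_LEN      BLOB part length on this page
	bytes 4..7  BTR_BLOB_HDR_NEXT_PAGE_NO  next BLOB part page, or FIL_NULL
	bytes 8..   BLOB data */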
148 
149 /** Estimated table level stats from sampled value.
150 @param value sampled stats
151 @param index index being sampled
152 @param sample number of sampled rows
153 @param ext_size external stored data size
154 @param not_empty table not empty
155 @return estimated table wide stats from sampled value */
156 #define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
157 	(((value) * static_cast<int64_t>(index->stat_n_leaf_pages) \
158 	  + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
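/* A worked example of the estimate above, with purely illustrative values:
value = 10, index->stat_n_leaf_pages = 100, sample = 20, ext_size = 0 and
not_empty = 1 give (10 * 100 + 20 - 1 + 0 + 1) / (20 + 0) = 1020 / 20 = 51. */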
159 
160 /* @} */
161 #endif /* !UNIV_HOTBACKUP */
162 
163 #ifndef UNIV_HOTBACKUP
164 /*******************************************************************//**
165 Marks all extern fields in a record as owned by the record. This function
166 should be called if the delete mark of a record is removed: a record that is
167 not delete marked always owns all its extern fields. */
168 static
169 void
170 btr_cur_unmark_extern_fields(
171 /*=========================*/
172 	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
173 				part will be updated, or NULL */
174 	rec_t*		rec,	/*!< in/out: record in a clustered index */
175 	dict_index_t*	index,	/*!< in: index of the page */
176 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
177 	mtr_t*		mtr);	/*!< in: mtr, or NULL if not logged */
178 /*******************************************************************//**
179 Adds path information to the cursor for the current page, for which
180 the binary search has been performed. */
181 static
182 void
183 btr_cur_add_path_info(
184 /*==================*/
185 	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
186 	ulint		height,		/*!< in: height of the page in tree;
187 					0 means leaf node */
188 	ulint		root_height);	/*!< in: root node height in tree */
189 /***********************************************************//**
190 Frees the externally stored fields for a record, if the field is mentioned
191 in the update vector. */
192 static
193 void
194 btr_rec_free_updated_extern_fields(
195 /*===============================*/
196 	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
197 				X-latched */
198 	rec_t*		rec,	/*!< in: record */
199 	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
200 				part will be updated, or NULL */
201 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
202 	const upd_t*	update,	/*!< in: update vector */
203 	bool		rollback,/*!< in: performing rollback? */
204 	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
205 				an X-latch to record page and to the tree */
206 /***********************************************************//**
207 Frees the externally stored fields for a record. */
208 static
209 void
210 btr_rec_free_externally_stored_fields(
211 /*==================================*/
212 	dict_index_t*	index,	/*!< in: index of the data, the index
213 				tree MUST be X-latched */
214 	rec_t*		rec,	/*!< in: record */
215 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
216 	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
217 				part will be updated, or NULL */
218 	bool		rollback,/*!< in: performing rollback? */
219 	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
220 				an X-latch to record page and to the index
221 				tree */
222 #endif /* !UNIV_HOTBACKUP */
223 
224 #ifndef UNIV_HOTBACKUP
225 /*==================== B-TREE SEARCH =========================*/
226 
227 #if MTR_MEMO_PAGE_S_FIX != RW_S_LATCH
228 #error "MTR_MEMO_PAGE_S_FIX != RW_S_LATCH"
229 #endif
230 #if MTR_MEMO_PAGE_X_FIX != RW_X_LATCH
231 #error "MTR_MEMO_PAGE_X_FIX != RW_X_LATCH"
232 #endif
233 #if MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH
234 #error "MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH"
235 #endif
236 
237 /** Latches the leaf page or pages requested.
238 @param[in]	block		leaf page where the search converged
239 @param[in]	page_id		page id of the leaf
240 @param[in]	page_size	page size of the leaf page
@param[in]	latch_mode	BTR_SEARCH_LEAF, ...
241 @param[in]	cursor		cursor
242 @param[in]	mtr		mini-transaction
243 @return	blocks and savepoints which were actually latched. */
244 btr_latch_leaves_t
245 btr_cur_latch_leaves(
246 	buf_block_t*		block,
247 	const page_id_t&	page_id,
248 	const page_size_t&	page_size,
249 	ulint			latch_mode,
250 	btr_cur_t*		cursor,
251 	mtr_t*			mtr)
252 {
253 	ulint		mode;
254 	ulint		left_page_no;
255 	ulint		right_page_no;
256 	buf_block_t*	get_block;
257 	page_t*		page = buf_block_get_frame(block);
258 	bool		spatial;
259 	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
260 
261 	spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info;
262 	ut_ad(buf_page_in_file(&block->page));
263 
264 	switch (latch_mode) {
265 	case BTR_SEARCH_LEAF:
266 	case BTR_MODIFY_LEAF:
267 	case BTR_SEARCH_TREE:
268 		if (spatial) {
269 			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS]
270 				= mtr_set_savepoint(mtr);
271 		}
272 
273 		mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH;
274 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
275 		get_block = btr_block_get(page_id, page_size, mode,
276 					  cursor->index, mtr);
277 		latch_leaves.blocks[1] = get_block;
278 #ifdef UNIV_BTR_DEBUG
279 		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
280 #endif /* UNIV_BTR_DEBUG */
281 		if (spatial) {
282 			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
283 				= get_block;
284 		}
285 
286 		return(latch_leaves);
287 	case BTR_MODIFY_TREE:
288 		/* The index lock is exclusive with other operations which
289 		call btr_page_set_prev() */
290 		ut_ad(mtr_memo_contains_flagged(mtr,
291 			dict_index_get_lock(cursor->index),
292 			MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
293 		      || dict_table_is_intrinsic(cursor->index->table));
294 		/* x-latch also siblings from left to right */
295 		left_page_no = btr_page_get_prev(page, mtr);
296 
297 		if (left_page_no != FIL_NULL) {
298 
299 			if (spatial) {
300 				cursor->rtr_info->tree_savepoints[
301 					RTR_MAX_LEVELS] = mtr_set_savepoint(mtr);
302 			}
303 
304 			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
305 			get_block = btr_block_get(
306 				page_id_t(page_id.space(), left_page_no),
307 				page_size, RW_X_LATCH, cursor->index, mtr);
308 			latch_leaves.blocks[0] = get_block;
309 
310 			if (spatial) {
311 				cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
312 					= get_block;
313 			}
314 		}
315 
316 		if (spatial) {
317 			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS + 1]
318 				= mtr_set_savepoint(mtr);
319 		}
320 
321 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
322 		get_block = btr_block_get(
323 			page_id, page_size, RW_X_LATCH, cursor->index, mtr);
324 		latch_leaves.blocks[1] = get_block;
325 
326 #ifdef UNIV_BTR_DEBUG
327 		/* Sanity check only after both the blocks are latched. */
328 		if (latch_leaves.blocks[0] != NULL) {
329 			ut_a(page_is_comp(latch_leaves.blocks[0]->frame)
330 				== page_is_comp(page));
331 			ut_a(btr_page_get_next(
332 				latch_leaves.blocks[0]->frame, mtr)
333 				== page_get_page_no(page));
334 		}
335 		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
336 #endif /* UNIV_BTR_DEBUG */
337 
338 		if (spatial) {
339 			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS + 1]
340 				= get_block;
341 		}
342 
343 		right_page_no = btr_page_get_next(page, mtr);
344 
345 		if (right_page_no != FIL_NULL) {
346 			if (spatial) {
347 				cursor->rtr_info->tree_savepoints[
348 					RTR_MAX_LEVELS + 2] = mtr_set_savepoint(
349 								mtr);
350 			}
351 			latch_leaves.savepoints[2] = mtr_set_savepoint(mtr);
352 			get_block = btr_block_get(
353 				page_id_t(page_id.space(), right_page_no),
354 				page_size, RW_X_LATCH, cursor->index, mtr);
355 			latch_leaves.blocks[2] = get_block;
356 #ifdef UNIV_BTR_DEBUG
357 			ut_a(page_is_comp(get_block->frame)
358 			     == page_is_comp(page));
359 			ut_a(btr_page_get_prev(get_block->frame, mtr)
360 			     == page_get_page_no(page));
361 #endif /* UNIV_BTR_DEBUG */
362 			if (spatial) {
363 				cursor->rtr_info->tree_blocks[
364 					RTR_MAX_LEVELS + 2] = get_block;
365 			}
366 		}
367 
368 		return(latch_leaves);
369 
370 	case BTR_SEARCH_PREV:
371 	case BTR_MODIFY_PREV:
372 		mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
373 		/* latch also left sibling */
374 		rw_lock_s_lock(&block->lock);
375 		left_page_no = btr_page_get_prev(page, mtr);
376 		rw_lock_s_unlock(&block->lock);
377 
378 		if (left_page_no != FIL_NULL) {
379 			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
380 			get_block = btr_block_get(
381 				page_id_t(page_id.space(), left_page_no),
382 				page_size, mode, cursor->index, mtr);
383 			latch_leaves.blocks[0] = get_block;
384 			cursor->left_block = get_block;
385 #ifdef UNIV_BTR_DEBUG
386 			ut_a(page_is_comp(get_block->frame)
387 			     == page_is_comp(page));
388 			ut_a(btr_page_get_next(get_block->frame, mtr)
389 			     == page_get_page_no(page));
390 #endif /* UNIV_BTR_DEBUG */
391 		}
392 
393 		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
394 		get_block = btr_block_get(page_id, page_size, mode,
395 					  cursor->index, mtr);
396 		latch_leaves.blocks[1] = get_block;
397 #ifdef UNIV_BTR_DEBUG
398 		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
399 #endif /* UNIV_BTR_DEBUG */
400 		return(latch_leaves);
401 	case BTR_CONT_MODIFY_TREE:
402 		ut_ad(dict_index_is_spatial(cursor->index));
403 		return(latch_leaves);
404 	}
405 
406 	ut_error;
407 	return(latch_leaves);
408 }
409 
410 /** Optimistically latches the leaf page or pages requested.
411 @param[in]	block		guessed buffer block
412 @param[in]	modify_clock	modify clock value
413 @param[in,out]	latch_mode	BTR_SEARCH_LEAF, ...
414 @param[in,out]	cursor		cursor
415 @param[in]	file		file name
416 @param[in]	line		line where called
417 @param[in]	mtr		mini-transaction
418 @return true if success */
419 bool
420 btr_cur_optimistic_latch_leaves(
421 	buf_block_t*	block,
422 	ib_uint64_t	modify_clock,
423 	ulint*		latch_mode,
424 	btr_cur_t*	cursor,
425 	const char*	file,
426 	ulint		line,
427 	mtr_t*		mtr)
428 {
429 	ulint		mode;
430 	ulint		left_page_no;
431 	ut_ad(block->page.buf_fix_count > 0);
432 	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
433 
434 	switch (*latch_mode) {
435 	case BTR_SEARCH_LEAF:
436 	case BTR_MODIFY_LEAF:
437 		return(buf_page_optimistic_get(*latch_mode, block,
438 				modify_clock, file, line, mtr));
439 	case BTR_SEARCH_PREV:
440 	case BTR_MODIFY_PREV:
441 		mode = *latch_mode == BTR_SEARCH_PREV
442 			? RW_S_LATCH : RW_X_LATCH;
443 
444 		rw_lock_s_lock(&block->lock);
445 		if (block->modify_clock != modify_clock) {
446 			rw_lock_s_unlock(&block->lock);
447 
448 			return(false);
449 		}
450 		left_page_no = btr_page_get_prev(
451 			buf_block_get_frame(block), mtr);
452 		rw_lock_s_unlock(&block->lock);
453 
454 		if (left_page_no != FIL_NULL) {
455 			const page_id_t	page_id(
456 				dict_index_get_space(cursor->index),
457 				left_page_no);
458 
459 			cursor->left_block = btr_block_get(
460 				page_id,
461 				dict_table_page_size(cursor->index->table),
462 				mode, cursor->index, mtr);
463 		} else {
464 			cursor->left_block = NULL;
465 		}
466 
467 		if (buf_page_optimistic_get(mode, block, modify_clock,
468 					    file, line, mtr)) {
469 			if (btr_page_get_prev(buf_block_get_frame(block), mtr)
470 			    == left_page_no) {
471 				/* We've entered this function with the block already buffer-fixed,
472 				and buf_page_optimistic_get() buffer-fixes it again. The caller should
473 				unfix the block once (to undo their buffer-fixing). */
474 				ut_ad(2 <= block->page.buf_fix_count);
475 				*latch_mode = mode;
476 				return(true);
477 			} else {
478 				/* Release the block, which also decrements buf_fix_count once,
479 				undoing the increment done by the successful buf_page_optimistic_get() */
480 				btr_leaf_page_release(block, mode, mtr);
481 			}
482 		}
483 
484 		/* If we are still here then buf_page_optimistic_get() did not buffer-fix
485 		the page, but it should still be buffer-fixed as it was before the call.*/
486 		ut_ad(0 < block->page.buf_fix_count);
487 		/* release the left block */
488 		if (cursor->left_block != NULL) {
489 			btr_leaf_page_release(cursor->left_block,
490 					      mode, mtr);
491 		}
492 
493 		return(false);
494 
495 	default:
496 		ut_error;
497 		return(false);
498 	}
499 }
500 
501 /**
502 Gets the intention as a btr_intention_t from latch_mode, and clears the
503 intention flags in latch_mode.
504 @param latch_mode	in/out: pointer to latch_mode
505 @return intention for latching tree */
506 static
507 btr_intention_t
508 btr_cur_get_and_clear_intention(
509 	ulint	*latch_mode)
510 {
511 	btr_intention_t	intention;
512 
513 	switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) {
514 	case BTR_LATCH_FOR_INSERT:
515 		intention = BTR_INTENTION_INSERT;
516 		break;
517 	case BTR_LATCH_FOR_DELETE:
518 		intention = BTR_INTENTION_DELETE;
519 		break;
520 	default:
521 		/* both or unknown */
522 		intention = BTR_INTENTION_BOTH;
523 	}
524 	*latch_mode &= ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE);
525 
526 	return(intention);
527 }
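/* Illustrative use of btr_cur_get_and_clear_intention() (not actual code
from this file; the values are arbitrary):

	ulint		latch_mode = BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE;
	btr_intention_t	intention
		= btr_cur_get_and_clear_intention(&latch_mode);
	// now intention == BTR_INTENTION_DELETE
	// and latch_mode == BTR_MODIFY_TREE

A latch_mode with neither or both of BTR_LATCH_FOR_INSERT and
BTR_LATCH_FOR_DELETE set yields BTR_INTENTION_BOTH. */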
528 
529 /**
530 Gets the desired latch type for the root leaf (the root page is also a leaf)
531 according to the latch mode.
532 @param latch_mode	in: BTR_SEARCH_LEAF, ...
533 @return latch type */
534 static
535 rw_lock_type_t
536 btr_cur_latch_for_root_leaf(
537 	ulint	latch_mode)
538 {
539 	switch (latch_mode) {
540 	case BTR_SEARCH_LEAF:
541 	case BTR_SEARCH_TREE:
542 	case BTR_SEARCH_PREV:
543 		return(RW_S_LATCH);
544 	case BTR_MODIFY_LEAF:
545 	case BTR_MODIFY_TREE:
546 	case BTR_MODIFY_PREV:
547 		return(RW_X_LATCH);
548 	case BTR_CONT_MODIFY_TREE:
549 	case BTR_CONT_SEARCH_TREE:
550 		/* A root page should be latched already,
551 		and does not need to be latched here.
552 		Fall through (RW_NO_LATCH) */
553 	case BTR_NO_LATCHES:
554 		return(RW_NO_LATCH);
555 	}
556 
557 	ut_error;
558 	return(RW_NO_LATCH); /* avoid compiler warnings */
559 }
560 
561 /** Detects whether modifying the record might require modifying the tree structure.
562 @param[in]	index		index
563 @param[in]	page		page
564 @param[in]	lock_intention	lock intention for the tree operation
565 @param[in]	rec		record (current node_ptr)
566 @param[in]	rec_size	size of the record or max size of node_ptr
567 @param[in]	page_size	page size
568 @param[in]	mtr		mtr
569 @return true if tree modification is needed */
570 static
571 bool
572 btr_cur_will_modify_tree(
573 	dict_index_t*	index,
574 	const page_t*	page,
575 	btr_intention_t	lock_intention,
576 	const rec_t*	rec,
577 	ulint		rec_size,
578 	const page_size_t&	page_size,
579 	mtr_t*		mtr)
580 {
581 	ut_ad(!page_is_leaf(page));
582 	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
583 					MTR_MEMO_X_LOCK
584 					| MTR_MEMO_SX_LOCK)
585 	      || dict_table_is_intrinsic(index->table));
586 
587 	/* A pessimistic delete of the first record causes a delete & insert
588 	of a node_ptr at the upper level, and a subsequent page shrink is
589 	possible, which causes another node_ptr delete at the upper level.
590 	So we should pay attention not only to the first and last records
591 	but also to the 2nd record, because if the "delete & insert" is done
592 	on a different page, the 2nd record becomes the first record, and the
593 	following compress might delete that record and cause an upper level
594 	node_ptr modification. */
595 
596 	if (lock_intention <= BTR_INTENTION_BOTH) {
597 		ulint	margin;
598 
599 		if (lock_intention == BTR_INTENTION_BOTH) {
600 			ulint	level = btr_page_get_level(page, mtr);
601 
602 			/* This value is the worst-case expectation of the
603 			number of node_ptr records deleted from this page.
604 			It is used to estimate whether the cursor position
605 			could become the leftmost record in this page. */
606 			ulint   max_nodes_deleted = 0;
607 
608 			/* Tree modifications coming from below this level can
609 			logically cause at most (2 ^ (level - 1)) record
610 			deletions here, even in the rarest case. */
611 			if (level > 7) {
612 				/* TODO: adjust this practical limit. */
613 				max_nodes_deleted = 64;
614 			} else if (level > 0) {
615 				max_nodes_deleted = (ulint)1 << (level - 1);
616 			}
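			/* For example, level == 3 gives
			max_nodes_deleted == 1 << 2 == 4; levels above 7 are
			capped at 64 by the practical limit above. */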
617 
618 			/* Check what a delete may cause (BTR_INTENTION_BOTH
619 			or BTR_INTENTION_DELETE) */
620 			if (page_get_n_recs(page) <= max_nodes_deleted * 2
621 			    || page_rec_is_first(rec, page)) {
622 				/* The cursor record can be the left most record
623 				in this page. */
624 				return(true);
625 			}
626 
627 			if (fil_page_get_prev(page) != FIL_NULL
628 			    && page_rec_distance_is_at_most(
629 					page_get_infimum_rec(page), rec,
630 					max_nodes_deleted)) {
631 				return (true);
632 			}
633 
634 			if (fil_page_get_next(page) != FIL_NULL
635 			    && page_rec_distance_is_at_most(
636 					rec, page_get_supremum_rec(page),
637 					max_nodes_deleted)) {
638 				return (true);
639 			}
640 
641 			/* A delete of the leftmost record in a page causes a
642 			delete & insert at its parent page. After that, the
643 			delete might cause btr_compress() and a record delete
644 			at its parent page. Thus we should consider max deletes. */
645 
646 			margin = rec_size * max_nodes_deleted;
647 		} else {
648 			ut_ad(lock_intention == BTR_INTENTION_DELETE);
649 
650 			margin = rec_size;
651 		}
652 		/* Safe because we already have SX latch of the index tree */
653 		if (page_get_data_size(page)
654 			< margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)
655 		    || (fil_page_get_next(page) == FIL_NULL
656 			&& fil_page_get_prev(page) == FIL_NULL)) {
657 			return(true);
658 		}
659 	}
660 
661 	if (lock_intention >= BTR_INTENTION_BOTH) {
662 		/* Check what an insert may cause (BTR_INTENTION_BOTH
663 		or BTR_INTENTION_INSERT) */
664 
665 		/* Once btr_cur_limit_optimistic_insert_debug is set,
666 		we should check it here in advance, since the maximum number
667 		of records allowed in a page is limited. */
668 		LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
669 					      return(true));
670 
671 		/* We need space for 2 records, for the case where a single
672 		split and insert cannot fit.
673 		page_get_max_insert_size_after_reorganize() already includes
674 		space for the page directory */
675 		ulint	max_size
676 			= page_get_max_insert_size_after_reorganize(page, 2);
677 
678 		if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size
679 		    || max_size < rec_size * 2) {
680 			return(true);
681 		}
682 		/* TODO: optimize this condition for a compressed page.
683 		This is based on the worst compression rate.
684 		Currently we look only at the uncompressed page, but we could
685 		also check the compressed page with page_zip_available()
686 		if it is already in the buffer pool */
687 		/* We need space for 2 records also at the worst compression rate. */
688 		if (page_size.is_compressed()
689 		    && page_zip_empty_size(index->n_fields,
690 					   page_size.physical())
691 		       < rec_size * 2 + page_get_data_size(page)
692 			 + page_dir_calc_reserved_space(
693 				page_get_n_recs(page) + 2) + 1) {
694 			return(true);
695 		}
696 	}
697 
698 	return(false);
699 }
700 
701 /** Detects whether modifying the record might require a modification
702 opposite to the intention.
703 @param[in]	page		page
704 @param[in]	lock_intention	lock intention for the tree operation
705 @param[in]	rec		record (current node_ptr)
706 @return	true if tree modification is needed */
707 static
708 bool
709 btr_cur_need_opposite_intention(
710 	const page_t*	page,
711 	btr_intention_t	lock_intention,
712 	const rec_t*	rec)
713 {
714 	switch (lock_intention) {
715 	case BTR_INTENTION_DELETE:
716 		return((mach_read_from_4(page + FIL_PAGE_PREV) != FIL_NULL
717 			&& page_rec_is_first(rec, page))
718 		       || (mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
719 			   && page_rec_is_last(rec, page)));
720 	case BTR_INTENTION_INSERT:
721 		return(mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
722 		       && page_rec_is_last(rec, page));
723 	case BTR_INTENTION_BOTH:
724 		return(false);
725 	}
726 
727 	ut_error;
728 	return(false);
729 }
730 
731 /********************************************************************//**
732 Searches an index tree and positions a tree cursor on a given level.
733 NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
734 to node pointer page number fields on the upper levels of the tree!
735 Note that if mode is PAGE_CUR_LE, which is used in inserts, then
736 cursor->up_match and cursor->low_match both will have sensible values.
737 If mode is PAGE_CUR_GE, then up_match will have a sensible value.
738 
739 If mode is PAGE_CUR_LE, the cursor is left at the place where an insert of the
740 search tuple should be performed in the B-tree. InnoDB does an insert
741 immediately after the cursor. Thus, the cursor may end up on a user record,
742 or on a page infimum record. */
743 void
744 btr_cur_search_to_nth_level(
745 /*========================*/
746 	dict_index_t*	index,	/*!< in: index */
747 	ulint		level,	/*!< in: the tree level of search */
748 	const dtuple_t*	tuple,	/*!< in: data tuple; NOTE: n_fields_cmp in
749 				tuple must be set so that it cannot get
750 				compared to the node ptr page number field! */
751 	page_cur_mode_t	mode,	/*!< in: PAGE_CUR_L, ...;
752 				Inserts should always be made using
753 				PAGE_CUR_LE to search the position! */
754 	ulint		latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
755 				at most one of BTR_INSERT, BTR_DELETE_MARK,
756 				BTR_DELETE, or BTR_ESTIMATE;
757 				cursor->left_block is used to store a pointer
758 				to the left neighbor page, in the cases
759 				BTR_SEARCH_PREV and BTR_MODIFY_PREV;
760 				NOTE that if has_search_latch
761 				is != 0, we may not have a latch set
762 				on the cursor page; we assume
763 				the caller uses their search latch
764 				to protect the record! */
765 	btr_cur_t*	cursor, /*!< in/out: tree cursor; the cursor page is
766 				s- or x-latched, but see also above! */
767 	ulint		has_search_latch,
768 				/*!< in: info on the latch mode the
769 				caller currently has on search system:
770 				RW_S_LATCH, or 0 */
771 	const char*	file,	/*!< in: file name */
772 	ulint		line,	/*!< in: line where called */
773 	mtr_t*		mtr)	/*!< in: mtr */
774 {
775 	page_t*		page = NULL; /* remove warning */
776 	buf_block_t*	block;
777 	ulint		height;
778 	ulint		up_match;
779 	ulint		up_bytes;
780 	ulint		low_match;
781 	ulint		low_bytes;
782 	ulint		savepoint;
783 	ulint		rw_latch;
784 	page_cur_mode_t	page_mode;
785 	page_cur_mode_t	search_mode = PAGE_CUR_UNSUPP;
786 	ulint		buf_mode;
787 	ulint		estimate;
788 	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
789 	page_cur_t*	page_cursor;
790 	btr_op_t	btr_op;
791 	ulint		root_height = 0; /* remove warning */
792 
793 	ulint		upper_rw_latch, root_leaf_rw_latch;
794 	btr_intention_t	lock_intention;
795 	bool		modify_external;
796 	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
797 	ulint		tree_savepoints[BTR_MAX_LEVELS];
798 	ulint		n_blocks = 0;
799 	ulint		n_releases = 0;
800 	bool		detected_same_key_root = false;
801 
802 	bool		retrying_for_search_prev = false;
803 	ulint		leftmost_from_level = 0;
804 	buf_block_t**	prev_tree_blocks = NULL;
805 	ulint*		prev_tree_savepoints = NULL;
806 	ulint		prev_n_blocks = 0;
807 	ulint		prev_n_releases = 0;
808 	bool		need_path = true;
809 	bool		rtree_parent_modified = false;
810 	bool		mbr_adj = false;
811 	bool		found = false;
812 
813 	DBUG_ENTER("btr_cur_search_to_nth_level");
814 
815 	btr_search_t*	info;
816 	mem_heap_t*	heap		= NULL;
817 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
818 	ulint*		offsets		= offsets_;
819 	ulint		offsets2_[REC_OFFS_NORMAL_SIZE];
820 	ulint*		offsets2	= offsets2_;
821 	rec_offs_init(offsets_);
822 	rec_offs_init(offsets2_);
823 	/* Currently, PAGE_CUR_LE is the only search mode used for searches
824 	ending at upper levels */
825 
826 	ut_ad(level == 0 || mode == PAGE_CUR_LE
827 	      || RTREE_SEARCH_MODE(mode));
828 	ut_ad(dict_index_check_search_tuple(index, tuple));
829 	ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
830 	ut_ad(dtuple_check_typed(tuple));
831 	ut_ad(!(index->type & DICT_FTS));
832 	ut_ad(index->page != FIL_NULL);
833 
834 	UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
835 	UNIV_MEM_INVALID(&cursor->up_bytes, sizeof cursor->up_bytes);
836 	UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
837 	UNIV_MEM_INVALID(&cursor->low_bytes, sizeof cursor->low_bytes);
838 #ifdef UNIV_DEBUG
839 	cursor->up_match = ULINT_UNDEFINED;
840 	cursor->low_match = ULINT_UNDEFINED;
841 #endif /* UNIV_DEBUG */
842 
843 	ibool	s_latch_by_caller;
844 
845 	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
846 
847 	ut_ad(!s_latch_by_caller
848 	      || srv_read_only_mode
849 	      || mtr_memo_contains_flagged(mtr,
850 					   dict_index_get_lock(index),
851 					   MTR_MEMO_S_LOCK
852 					   | MTR_MEMO_SX_LOCK));
853 
854 	/* These flags are mutually exclusive, they are lumped together
855 	with the latch mode for historical reasons. It's possible for
856 	none of the flags to be set. */
857 	switch (UNIV_EXPECT(latch_mode
858 			    & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
859 			    0)) {
860 	case 0:
861 		btr_op = BTR_NO_OP;
862 		break;
863 	case BTR_INSERT:
864 		btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
865 			? BTR_INSERT_IGNORE_UNIQUE_OP
866 			: BTR_INSERT_OP;
867 		break;
868 	case BTR_DELETE:
869 		btr_op = BTR_DELETE_OP;
870 		ut_a(cursor->purge_node);
871 		break;
872 	case BTR_DELETE_MARK:
873 		btr_op = BTR_DELMARK_OP;
874 		break;
875 	default:
876 		/* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
877 		should be specified at a time */
878 		ut_error;
879 	}
880 
881 	/* Operations on the insert buffer tree cannot be buffered. */
882 	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
883 	/* Operations on the clustered index cannot be buffered. */
884 	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
885 	/* Operations on the temporary table(indexes) cannot be buffered. */
886 	ut_ad(btr_op == BTR_NO_OP || !dict_table_is_temporary(index->table));
887 	/* Operation on the spatial index cannot be buffered. */
888 	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index));
889 
890 	estimate = latch_mode & BTR_ESTIMATE;
891 
892 	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
893 
894 	modify_external = latch_mode & BTR_MODIFY_EXTERNAL;
895 
896 	/* Turn the flags unrelated to the latch mode off. */
897 	latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
898 
899 	ut_ad(!modify_external || latch_mode == BTR_MODIFY_LEAF);
900 
901 	ut_ad(!s_latch_by_caller
902 	      || latch_mode == BTR_SEARCH_LEAF
903 	      || latch_mode == BTR_SEARCH_TREE
904 	      || latch_mode == BTR_MODIFY_LEAF);
905 
906 	cursor->flag = BTR_CUR_BINARY;
907 	cursor->index = index;
908 
909 	info = btr_search_get_info(index);
910 
911 # ifdef UNIV_SEARCH_PERF_STAT
912 	info->n_searches++;
913 # endif
914 	/* Use of AHI is disabled for intrinsic tables, as these tables re-use
915 	the index-id and AHI validation is based on the index-id. */
916 	if (rw_lock_get_writer(btr_get_search_latch(index))
917 		== RW_LOCK_NOT_LOCKED
918 	    && latch_mode <= BTR_MODIFY_LEAF
919 	    && info->last_hash_succ
920 	    && !index->disable_ahi
921 	    && !estimate
922 # ifdef PAGE_CUR_LE_OR_EXTENDS
923 	    && mode != PAGE_CUR_LE_OR_EXTENDS
924 # endif /* PAGE_CUR_LE_OR_EXTENDS */
925 	    && !dict_index_is_spatial(index)
926 	    /* If !has_search_latch, we do a dirty read of
927 	    btr_search_enabled below, and btr_search_guess_on_hash()
928 	    will have to check it again. */
929 	    && UNIV_LIKELY(btr_search_enabled)
930 	    && !modify_external
931 	    && btr_search_guess_on_hash(index, info, tuple, mode,
932 					latch_mode, cursor,
933 					has_search_latch, mtr)) {
934 
935 		/* Search using the hash index succeeded */
936 
937 		ut_ad(cursor->up_match != ULINT_UNDEFINED
938 		      || mode != PAGE_CUR_GE);
939 		ut_ad(cursor->up_match != ULINT_UNDEFINED
940 		      || mode != PAGE_CUR_LE);
941 		ut_ad(cursor->low_match != ULINT_UNDEFINED
942 		      || mode != PAGE_CUR_LE);
943 		btr_cur_n_sea++;
944 
945 		DBUG_VOID_RETURN;
946 	}
947 	btr_cur_n_non_sea++;
948 
949 	/* If the hash search did not succeed, do binary search down the
950 	tree */
951 
952 	if (has_search_latch) {
953 		/* Release possible search latch to obey latching order */
954 		rw_lock_s_unlock(btr_get_search_latch(index));
955 	}
956 
957 	/* Store the position of the tree latch we push to mtr so that we
958 	know how to release it when we have latched leaf node(s) */
959 
960 	savepoint = mtr_set_savepoint(mtr);
961 
962 	switch (latch_mode) {
963 	case BTR_MODIFY_TREE:
964 		/* Most delete-intended operations are purges.
965 		Free blocks and read IO bandwidth should be given to them
966 		with priority, when the history list is growing huge. */
967 		if (lock_intention == BTR_INTENTION_DELETE
968 		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
969 			&& buf_get_n_pending_read_ios()) {
970 			mtr_x_lock(dict_index_get_lock(index), mtr);
971 		} else if (dict_index_is_spatial(index)
972 			   && lock_intention <= BTR_INTENTION_BOTH) {
973 			/* X lock the index if there is a possibility of a
974 			pessimistic delete on the spatial index, as we could
975 			lock upward in the tree */
976 
977 			mtr_x_lock(dict_index_get_lock(index), mtr);
978 		} else {
979 			mtr_sx_lock(dict_index_get_lock(index), mtr);
980 		}
981 		upper_rw_latch = RW_X_LATCH;
982 		break;
983 	case BTR_CONT_MODIFY_TREE:
984 	case BTR_CONT_SEARCH_TREE:
985 		/* Do nothing */
986 		ut_ad(srv_read_only_mode
987 		      || mtr_memo_contains_flagged(mtr,
988 						   dict_index_get_lock(index),
989 						   MTR_MEMO_X_LOCK
990 						   | MTR_MEMO_SX_LOCK));
991 		if (dict_index_is_spatial(index)
992 		    && latch_mode == BTR_CONT_MODIFY_TREE) {
993 			/* If we are about to locate the parent page for a
994 			split and/or merge operation on an R-Tree index,
995 			X latch the parent */
996 			upper_rw_latch = RW_X_LATCH;
997 		} else {
998 			upper_rw_latch = RW_NO_LATCH;
999 		}
1000 		break;
1001 	default:
1002 		if (!srv_read_only_mode) {
1003 			if (s_latch_by_caller) {
1004 				ut_ad(rw_lock_own(dict_index_get_lock(index),
1005 				              RW_LOCK_S));
1006 			} else if (!modify_external) {
1007 				/* BTR_SEARCH_TREE is intended to be used with
1008 				BTR_ALREADY_S_LATCHED */
1009 				ut_ad(latch_mode != BTR_SEARCH_TREE);
1010 
1011 				mtr_s_lock(dict_index_get_lock(index), mtr);
1012 			} else {
1013 				/* BTR_MODIFY_EXTERNAL needs to be excluded */
1014 				mtr_sx_lock(dict_index_get_lock(index), mtr);
1015 			}
1016 			upper_rw_latch = RW_S_LATCH;
1017 		} else {
1018 			upper_rw_latch = RW_NO_LATCH;
1019 		}
1020 	}
1021 	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
1022 
1023 	page_cursor = btr_cur_get_page_cur(cursor);
1024 
1025 	const ulint		space = dict_index_get_space(index);
1026 	const page_size_t	page_size(dict_table_page_size(index->table));
1027 
1028 	/* Start with the root page. */
1029 	page_id_t		page_id(space, dict_index_get_page(index));
1030 
1031 	if (root_leaf_rw_latch == RW_X_LATCH) {
1032 		node_ptr_max_size = dict_index_node_ptr_max_size(index);
1033 	}
1034 
1035 	up_match = 0;
1036 	up_bytes = 0;
1037 	low_match = 0;
1038 	low_bytes = 0;
1039 
1040 	height = ULINT_UNDEFINED;
1041 
1042 	/* We use these modified search modes on non-leaf levels of the
1043 	B-tree. These let us end up in the right B-tree leaf. In that leaf
1044 	we use the original search mode. */
1045 
1046 	switch (mode) {
1047 	case PAGE_CUR_GE:
1048 		page_mode = PAGE_CUR_L;
1049 		break;
1050 	case PAGE_CUR_G:
1051 		page_mode = PAGE_CUR_LE;
1052 		break;
1053 	default:
1054 #ifdef PAGE_CUR_LE_OR_EXTENDS
1055 		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
1056 		      || RTREE_SEARCH_MODE(mode)
1057 		      || mode == PAGE_CUR_LE_OR_EXTENDS);
1058 #else /* PAGE_CUR_LE_OR_EXTENDS */
1059 		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
1060 		      || RTREE_SEARCH_MODE(mode));
1061 #endif /* PAGE_CUR_LE_OR_EXTENDS */
1062 		page_mode = mode;
1063 		break;
1064 	}
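	/* For example (roughly): a PAGE_CUR_GE search for the first record
	>= tuple descends the non-leaf levels with PAGE_CUR_L, following the
	node pointer just before the tuple, and applies PAGE_CUR_GE only on
	the leaf page itself. */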
1065 
1066 	/* Loop and search until we arrive at the desired level */
1067 	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
1068 
1069 search_loop:
1070 	buf_mode = BUF_GET;
1071 	rw_latch = RW_NO_LATCH;
1072 	rtree_parent_modified = false;
1073 
1074 	if (height != 0) {
1075 		/* We are about to fetch the root or a non-leaf page. */
1076 		if ((latch_mode != BTR_MODIFY_TREE
1077 		     || height == level)
1078 		    && !retrying_for_search_prev) {
1079 			/* If we do not have an SX or X latch on the index,
1080 			each page should be latched before reading. */
1081 			if (modify_external
1082 			    && height == ULINT_UNDEFINED
1083 			    && upper_rw_latch == RW_S_LATCH) {
1084 				/* needs sx-latch of root page
1085 				for fseg operation */
1086 				rw_latch = RW_SX_LATCH;
1087 			} else {
1088 				rw_latch = upper_rw_latch;
1089 			}
1090 		}
1091 	} else if (latch_mode <= BTR_MODIFY_LEAF) {
1092 		rw_latch = latch_mode;
1093 
1094 		if (btr_op != BTR_NO_OP
1095 		    && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {
1096 
1097 			/* Try to buffer the operation if the leaf
1098 			page is not in the buffer pool. */
1099 
1100 			buf_mode = btr_op == BTR_DELETE_OP
1101 				? BUF_GET_IF_IN_POOL_OR_WATCH
1102 				: BUF_GET_IF_IN_POOL;
1103 		}
1104 	}
1105 
1106 retry_page_get:
1107 	ut_ad(n_blocks < BTR_MAX_LEVELS);
1108 	tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
1109 	block = buf_page_get_gen(
1110 		page_id, page_size, rw_latch,
1111 		(height == ULINT_UNDEFINED ? info->root_guess : NULL),
1112 		buf_mode, file, line, mtr
1113 	);
1114 
1115 	tree_blocks[n_blocks] = block;
1116 
1117 	if (block == NULL) {
1118 		/* This must be a search to perform an insert, delete-mark,
1119 		or delete; try using the insert/delete buffer */
1120 
1121 		ut_ad(height == 0);
1122 		ut_ad(cursor->thr);
1123 
1124 		switch (btr_op) {
1125 		case BTR_INSERT_OP:
1126 		case BTR_INSERT_IGNORE_UNIQUE_OP:
1127 			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
1128 			ut_ad(!dict_index_is_spatial(index));
1129 
1130 			if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
1131 					page_id, page_size, cursor->thr)) {
1132 
1133 				cursor->flag = BTR_CUR_INSERT_TO_IBUF;
1134 
1135 				goto func_exit;
1136 			}
1137 			break;
1138 
1139 		case BTR_DELMARK_OP:
1140 			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
1141 			ut_ad(!dict_index_is_spatial(index));
1142 
1143 			if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
1144 					index, page_id, page_size,
1145 					cursor->thr)) {
1146 
1147 				cursor->flag = BTR_CUR_DEL_MARK_IBUF;
1148 
1149 				goto func_exit;
1150 			}
1151 
1152 			break;
1153 
1154 		case BTR_DELETE_OP:
1155 			ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
1156 			ut_ad(!dict_index_is_spatial(index));
1157 
1158 			if (!row_purge_poss_sec(cursor->purge_node,
1159 						index, tuple)) {
1160 
1161 				/* The record cannot be purged yet. */
1162 				cursor->flag = BTR_CUR_DELETE_REF;
1163 			} else if (ibuf_insert(IBUF_OP_DELETE, tuple,
1164 					       index, page_id, page_size,
1165 					       cursor->thr)) {
1166 
1167 				/* The purge was buffered. */
1168 				cursor->flag = BTR_CUR_DELETE_IBUF;
1169 			} else {
1170 				/* The purge could not be buffered. */
1171 				buf_pool_watch_unset(page_id);
1172 				break;
1173 			}
1174 
1175 			buf_pool_watch_unset(page_id);
1176 			goto func_exit;
1177 
1178 		default:
1179 			ut_error;
1180 		}
1181 
1182 		/* Insert to the insert/delete buffer did not succeed, we
1183 		must read the page from disk. */
1184 
1185 		buf_mode = BUF_GET;
1186 
1187 		goto retry_page_get;
1188 	}
1189 
1190 	if (retrying_for_search_prev && height != 0) {
1191 		/* also latch left sibling */
1192 		ulint		left_page_no;
1193 		buf_block_t*	get_block;
1194 
1195 		ut_ad(rw_latch == RW_NO_LATCH);
1196 
1197 		rw_latch = upper_rw_latch;
1198 
1199 		rw_lock_s_lock(&block->lock);
1200 		left_page_no = btr_page_get_prev(
1201 			buf_block_get_frame(block), mtr);
1202 		rw_lock_s_unlock(&block->lock);
1203 
1204 		if (left_page_no != FIL_NULL) {
1205 			ut_ad(prev_n_blocks < leftmost_from_level);
1206 
1207 			prev_tree_savepoints[prev_n_blocks]
1208 				= mtr_set_savepoint(mtr);
1209 			get_block = buf_page_get_gen(
1210 				page_id_t(page_id.space(), left_page_no),
1211 				page_size, rw_latch, NULL, buf_mode,
1212 				file, line, mtr);
1213 			prev_tree_blocks[prev_n_blocks] = get_block;
1214 			prev_n_blocks++;
1215 
1216 			/* BTR_MODIFY_TREE doesn't update prev/next_page_no
1217 			without the parent page's lock. So there is no need to
1218 			retry here, because we have the parent page's lock. */
1219 		}
1220 
1221 		/* release RW_NO_LATCH page and lock with RW_S_LATCH */
1222 		mtr_release_block_at_savepoint(
1223 			mtr, tree_savepoints[n_blocks],
1224 			tree_blocks[n_blocks]);
1225 
1226 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
1227 		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
1228 					 buf_mode, file, line, mtr);
1229 		tree_blocks[n_blocks] = block;
1230 	}
1231 
1232 	page = buf_block_get_frame(block);
1233 
1234 	if (height == ULINT_UNDEFINED
1235 	    && page_is_leaf(page)
1236 	    && rw_latch != RW_NO_LATCH
1237 	    && rw_latch != root_leaf_rw_latch) {
1238 		/* We should retry getting the page, because the root page
1239 		is latched with a different latch mode than a leaf page needs. */
1240 		ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
1241 		ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH);
1242 		ut_ad(rw_latch == RW_S_LATCH || modify_external);
1243 
1244 		ut_ad(n_blocks == 0);
1245 		mtr_release_block_at_savepoint(
1246 			mtr, tree_savepoints[n_blocks],
1247 			tree_blocks[n_blocks]);
1248 
1249 		upper_rw_latch = root_leaf_rw_latch;
1250 		goto search_loop;
1251 	}
1252 
1253 	if (rw_latch != RW_NO_LATCH) {
1254 #ifdef UNIV_ZIP_DEBUG
1255 		const page_zip_des_t*	page_zip
1256 			= buf_block_get_page_zip(block);
1257 		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
1258 #endif /* UNIV_ZIP_DEBUG */
1259 
1260 		buf_block_dbg_add_level(
1261 			block, dict_index_is_ibuf(index)
1262 			? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
1263 	}
1264 
1265 	ut_ad(fil_page_index_page_check(page));
1266 	ut_ad(index->id == btr_page_get_index_id(page));
1267 
1268 	if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
1269 		/* We are in the root node */
1270 
1271 		height = btr_page_get_level(page, mtr);
1272 		root_height = height;
1273 		cursor->tree_height = root_height + 1;
1274 
1275 		if (dict_index_is_spatial(index)) {
1276 			ut_ad(cursor->rtr_info);
1277 
1278 			node_seq_t      seq_no = rtr_get_current_ssn_id(index);
1279 
1280 			/* If SSN in memory is not initialized, fetch
1281 			it from root page */
1282 			if (seq_no < 1) {
1283 				node_seq_t      root_seq_no;
1284 
1285 				root_seq_no = page_get_ssn_id(page);
1286 
1287 				mutex_enter(&(index->rtr_ssn.mutex));
1288 				index->rtr_ssn.seq_no = root_seq_no + 1;
1289 				mutex_exit(&(index->rtr_ssn.mutex));
1290 			}
1291 
1292 			/* Save the MBR */
1293 			cursor->rtr_info->thr = cursor->thr;
1294 			rtr_get_mbr_from_tuple(tuple, &cursor->rtr_info->mbr);
1295 		}
1296 
1297 		info->root_guess = block;
1298 	}
1299 
1300 	if (height == 0) {
1301 		if (rw_latch == RW_NO_LATCH) {
1302 
1303 			latch_leaves = btr_cur_latch_leaves(
1304 				block, page_id, page_size, latch_mode,
1305 				cursor, mtr);
1306 		}
1307 
1308 		switch (latch_mode) {
1309 		case BTR_MODIFY_TREE:
1310 		case BTR_CONT_MODIFY_TREE:
1311 		case BTR_CONT_SEARCH_TREE:
1312 			break;
1313 		default:
1314 			if (!s_latch_by_caller
1315 			    && !srv_read_only_mode
1316 			    && !modify_external) {
1317 				/* Release the tree s-latch */
1318 				/* NOTE: BTR_MODIFY_EXTERNAL
1319 				needs to keep tree sx-latch */
1320 				mtr_release_s_latch_at_savepoint(
1321 					mtr, savepoint,
1322 					dict_index_get_lock(index));
1323 			}
1324 
1325 			/* release upper blocks */
1326 			if (retrying_for_search_prev) {
1327 				for (;
1328 				     prev_n_releases < prev_n_blocks;
1329 				     prev_n_releases++) {
1330 					mtr_release_block_at_savepoint(
1331 						mtr,
1332 						prev_tree_savepoints[
1333 							prev_n_releases],
1334 						prev_tree_blocks[
1335 							prev_n_releases]);
1336 				}
1337 			}
1338 
1339 			for (; n_releases < n_blocks; n_releases++) {
1340 				if (n_releases == 0 && modify_external) {
1341 					/* keep latch of root page */
1342 					ut_ad(mtr_memo_contains_flagged(
1343 						mtr, tree_blocks[n_releases],
1344 						MTR_MEMO_PAGE_SX_FIX
1345 						| MTR_MEMO_PAGE_X_FIX));
1346 					continue;
1347 				}
1348 
1349 				mtr_release_block_at_savepoint(
1350 					mtr, tree_savepoints[n_releases],
1351 					tree_blocks[n_releases]);
1352 			}
1353 		}
1354 
1355 		page_mode = mode;
1356 	}
1357 
1358 	if (dict_index_is_spatial(index)) {
1359 		/* Remember the page search mode */
1360 		search_mode = page_mode;
1361 
1362 		/* Some adjustment on search mode, when the
1363 		page search mode is PAGE_CUR_RTREE_LOCATE
1364 		or PAGE_CUR_RTREE_INSERT, as we are searching
1365 		with MBRs. When it is not the target level, we
1366 		should search all sub-trees that "CONTAIN" the
1367 		search range/MBR. When it is at the target
1368 		level, the search becomes PAGE_CUR_LE */
1369 		if (page_mode == PAGE_CUR_RTREE_LOCATE
1370 		    && level == height) {
1371 			if (level == 0) {
1372 				page_mode = PAGE_CUR_LE;
1373 			} else {
1374 				page_mode = PAGE_CUR_RTREE_GET_FATHER;
1375 			}
1376 		}
1377 
1378 		if (page_mode == PAGE_CUR_RTREE_INSERT) {
1379 			page_mode = (level == height)
1380 					? PAGE_CUR_LE
1381 					: PAGE_CUR_RTREE_INSERT;
1382 
1383 			ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE);
1384 		}
1385 
1386 		/* "need_path" indicates whether we need to track the parent
1387 		pages; if it is not a spatial comparison, there is no need
1388 		to track them */
1389 		if (page_mode < PAGE_CUR_CONTAIN) {
1390 			need_path = false;
1391 		}
1392 
1393 		up_match = 0;
1394 		low_match = 0;
1395 
1396 		if (latch_mode == BTR_MODIFY_TREE
1397 		    || latch_mode == BTR_CONT_MODIFY_TREE
1398 		    || latch_mode == BTR_CONT_SEARCH_TREE) {
1399 			/* The tree is locked, no need for a page lock to
1400 			protect the "path" */
1401 			cursor->rtr_info->need_page_lock = false;
1402 		}
1403         }
1404 
1405 	if (dict_index_is_spatial(index) && page_mode >= PAGE_CUR_CONTAIN) {
1406 		ut_ad(need_path);
1407 		found = rtr_cur_search_with_match(
1408 			block, index, tuple, page_mode, page_cursor,
1409 			cursor->rtr_info);
1410 
1411 		/* Need to use BTR_MODIFY_TREE to do the MBR adjustment */
1412 		if (search_mode == PAGE_CUR_RTREE_INSERT
1413 		    && cursor->rtr_info->mbr_adj) {
1414 			if (latch_mode & BTR_MODIFY_LEAF) {
1415 				/* The parent MBR needs to be updated; retry
1416 				with BTR_MODIFY_TREE */
1417 				goto func_exit;
1418 			} else if (latch_mode & BTR_MODIFY_TREE) {
1419 				rtree_parent_modified = true;
1420 				cursor->rtr_info->mbr_adj = false;
1421 				mbr_adj = true;
1422 			} else {
1423 				ut_ad(0);
1424 			}
1425 		}
1426 
1427 		if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) {
1428 			cursor->low_match =
1429 				DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
1430 		}
1431 	} else if (height == 0 && btr_search_enabled
1432 		   && !dict_index_is_spatial(index)) {
1433 		/* The adaptive hash index is only used when searching
1434 		for leaf pages (height==0), but not in r-trees.
1435 		We only need the byte prefix comparison for the purpose
1436 		of updating the adaptive hash index. */
1437 		page_cur_search_with_match_bytes(
1438 			block, index, tuple, page_mode, &up_match, &up_bytes,
1439 			&low_match, &low_bytes, page_cursor);
1440 	} else {
1441 		/* Search for complete index fields. */
1442 		up_bytes = low_bytes = 0;
1443 		page_cur_search_with_match(
1444 			block, index, tuple, page_mode, &up_match,
1445 			&low_match, page_cursor,
1446 			need_path ? cursor->rtr_info : NULL);
1447 	}
1448 
1449 	if (estimate) {
1450 		btr_cur_add_path_info(cursor, height, root_height);
1451 	}
1452 
1453 	/* If this is the desired level, leave the loop */
1454 
1455 	ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor),
1456 					   mtr));
1457 
1458 	/* Add a predicate lock if the isolation level is serializable,
1459 	and only in the search case */
1460 	if (dict_index_is_spatial(index)
1461 	    && cursor->rtr_info->need_prdt_lock
1462 	    && mode != PAGE_CUR_RTREE_INSERT
1463 	    && mode != PAGE_CUR_RTREE_LOCATE
1464 	    && mode >= PAGE_CUR_CONTAIN) {
1465 		trx_t*		trx = thr_get_trx(cursor->thr);
1466 		lock_prdt_t	prdt;
1467 
1468 		lock_mutex_enter();
1469 		lock_init_prdt_from_mbr(
1470 			&prdt, &cursor->rtr_info->mbr, mode,
1471 			trx->lock.lock_heap);
1472 		lock_mutex_exit();
1473 
1474 		if (rw_latch == RW_NO_LATCH && height != 0) {
1475 			rw_lock_s_lock(&(block->lock));
1476 		}
1477 
1478 		lock_prdt_lock(block, &prdt, index, LOCK_S,
1479 			       LOCK_PREDICATE, cursor->thr, mtr);
1480 
1481 		if (rw_latch == RW_NO_LATCH && height != 0) {
1482 			rw_lock_s_unlock(&(block->lock));
1483 		}
1484 	}
1485 
1486 	if (level != height) {
1487 
1488 		const rec_t*	node_ptr;
1489 		ut_ad(height > 0);
1490 
1491 		height--;
1492 
1493 		node_ptr = page_cur_get_rec(page_cursor);
1494 
1495 		offsets = rec_get_offsets(
1496 			node_ptr, index, offsets, ULINT_UNDEFINED, &heap);
1497 
1498 		/* If the rec is the first or last in the page and the
1499 		intention is pessimistic delete, it might cause a node_ptr
1500 		insert at the upper level. We should change the intention
1501 		and retry. */
1502 		if (latch_mode == BTR_MODIFY_TREE
1503 		    && btr_cur_need_opposite_intention(
1504 			page, lock_intention, node_ptr)) {
1505 
1506 need_opposite_intention:
1507 			ut_ad(upper_rw_latch == RW_X_LATCH);
1508 
1509 			if (n_releases > 0) {
1510 				/* release root block */
1511 				mtr_release_block_at_savepoint(
1512 					mtr, tree_savepoints[0],
1513 					tree_blocks[0]);
1514 			}
1515 
1516 			/* release all blocks */
1517 			for (; n_releases <= n_blocks; n_releases++) {
1518 				mtr_release_block_at_savepoint(
1519 					mtr, tree_savepoints[n_releases],
1520 					tree_blocks[n_releases]);
1521 			}
1522 
1523 			lock_intention = BTR_INTENTION_BOTH;
1524 
1525 			page_id.reset(space, dict_index_get_page(index));
1526 			up_match = 0;
1527 			low_match = 0;
1528 			height = ULINT_UNDEFINED;
1529 
1530 			n_blocks = 0;
1531 			n_releases = 0;
1532 
1533 			goto search_loop;
1534 		}
1535 
1536 		if (dict_index_is_spatial(index)) {
1537 			if (page_rec_is_supremum(node_ptr)) {
1538 				cursor->low_match = 0;
1539 				cursor->up_match = 0;
1540 				goto func_exit;
1541 			}
1542 
1543 			/* If we are doing insertion or record locating,
1544 			remember the tree nodes we visited */
1545 			if (page_mode == PAGE_CUR_RTREE_INSERT
1546 			    || (search_mode == PAGE_CUR_RTREE_LOCATE
1547 			        && (latch_mode != BTR_MODIFY_LEAF))) {
1548 				bool		add_latch = false;
1549 
1550 				if (latch_mode == BTR_MODIFY_TREE
1551 				    && rw_latch == RW_NO_LATCH) {
1552 					ut_ad(mtr_memo_contains_flagged(
1553 						mtr, dict_index_get_lock(index),
1554 						MTR_MEMO_X_LOCK
1555 						| MTR_MEMO_SX_LOCK));
1556 					rw_lock_s_lock(&block->lock);
1557 					add_latch = true;
1558 				}
1559 
1560 				/* Store the parent cursor location */
1561 #ifdef UNIV_DEBUG
1562 				ulint	num_stored = rtr_store_parent_path(
1563 					block, cursor, latch_mode,
1564 					height + 1, mtr);
1565 #else
1566 				rtr_store_parent_path(
1567 					block, cursor, latch_mode,
1568 					height + 1, mtr);
1569 #endif
1570 
1571 				if (page_mode == PAGE_CUR_RTREE_INSERT) {
1572 					btr_pcur_t*     r_cursor =
1573 						rtr_get_parent_cursor(
1574 							cursor, height + 1,
1575 							true);
1576 					/* If it is insertion, there should
1577 					be only one parent for each level
1578 					traverse */
1579 #ifdef UNIV_DEBUG
1580 					ut_ad(num_stored == 1);
1581 #endif
1582 
1583 					node_ptr = btr_pcur_get_rec(r_cursor);
1584 
1585 				}
1586 
1587 				if (add_latch) {
1588 					rw_lock_s_unlock(&block->lock);
1589 				}
1590 
1591 				ut_ad(!page_rec_is_supremum(node_ptr));
1592 			}
1593 
1594 			ut_ad(page_mode == search_mode
1595 			      || (page_mode == PAGE_CUR_WITHIN
1596 				  && search_mode == PAGE_CUR_RTREE_LOCATE));
1597 
1598 			page_mode = search_mode;
1599 		}
1600 
1601 		/* If the node_ptr is the first or the last record of the page,
1602 		or has the same key value as the first or last record, another
1603 		page might be chosen under BTR_CONT_MODIFY_TREE.
1604 		So the parent page should not be released, to avoid a deadlock
1605 		caused by blocking another search with the same key value. */
1606 		if (!detected_same_key_root
1607 		    && lock_intention == BTR_INTENTION_BOTH
1608 		    && !dict_index_is_unique(index)
1609 		    && latch_mode == BTR_MODIFY_TREE
1610 		    && (up_match >= rec_offs_n_fields(offsets) - 1
1611 			|| low_match >= rec_offs_n_fields(offsets) - 1)) {
1612 			const rec_t*	first_rec
1613 						= page_rec_get_next_const(
1614 							page_get_infimum_rec(
1615 								page));
1616 			ulint		matched_fields;
1617 
1618 			ut_ad(upper_rw_latch == RW_X_LATCH);
1619 
1620 			if (node_ptr == first_rec
1621 			    || page_rec_is_last(node_ptr, page)) {
1622 				detected_same_key_root = true;
1623 			} else {
1624 				matched_fields = 0;
1625 
1626 				offsets2 = rec_get_offsets(
1627 					first_rec, index, offsets2,
1628 					ULINT_UNDEFINED, &heap);
1629 				cmp_rec_rec_with_match(node_ptr, first_rec,
1630 					offsets, offsets2, index,
1631 					page_is_spatial_non_leaf(first_rec, index),
1632 					false, &matched_fields);
1633 
1634 				if (matched_fields
1635 				    >= rec_offs_n_fields(offsets) - 1) {
1636 					detected_same_key_root = true;
1637 				} else {
1638 					const rec_t*	last_rec;
1639 
1640 					last_rec = page_rec_get_prev_const(
1641 							page_get_supremum_rec(
1642 								page));
1643 
1644 					matched_fields = 0;
1645 
1646 					offsets2 = rec_get_offsets(
1647 						last_rec, index, offsets2,
1648 						ULINT_UNDEFINED, &heap);
1649 					cmp_rec_rec_with_match(
1650 						node_ptr, last_rec,
1651 						offsets, offsets2, index,
1652 						page_is_spatial_non_leaf(last_rec, index),
1653 						false, &matched_fields);
1654 					if (matched_fields
1655 					    >= rec_offs_n_fields(offsets) - 1) {
1656 						detected_same_key_root = true;
1657 					}
1658 				}
1659 			}
1660 		}
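		/* When detected_same_key_root is set, the release of the
		upper (parent) blocks in the check below is skipped, so those
		parent blocks stay latched while the descent continues. */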
1661 
1662 		/* If the page might cause a tree modification,
1663 		we should not release the parent page's latch. */
1664 		if (!detected_same_key_root
1665 		    && latch_mode == BTR_MODIFY_TREE
1666 		    && !btr_cur_will_modify_tree(
1667 				index, page, lock_intention, node_ptr,
1668 				node_ptr_max_size, page_size, mtr)
1669 		    && !rtree_parent_modified) {
1670 			ut_ad(upper_rw_latch == RW_X_LATCH);
1671 			ut_ad(n_releases <= n_blocks);
1672 
1673 			/* we can release upper blocks */
1674 			for (; n_releases < n_blocks; n_releases++) {
1675 				if (n_releases == 0) {
1676 					/* we should not release the root page,
1677 					to keep it pinned to the same block. */
1678 					continue;
1679 				}
1680 
1681 				/* release unused blocks to unpin */
1682 				mtr_release_block_at_savepoint(
1683 					mtr, tree_savepoints[n_releases],
1684 					tree_blocks[n_releases]);
1685 			}
1686 		}
1687 
1688 		if (height == level
1689 		    && latch_mode == BTR_MODIFY_TREE) {
1690 			ut_ad(upper_rw_latch == RW_X_LATCH);
1691 			/* we should sx-latch the root page if it was
1692 			released already; it contains the segment headers. */
1693 			if (n_releases > 0) {
1694 				mtr_block_sx_latch_at_savepoint(
1695 					mtr, tree_savepoints[0],
1696 					tree_blocks[0]);
1697 			}
1698 
1699 			/* x-latch the branch blocks not released yet. */
1700 			for (ulint i = n_releases; i <= n_blocks; i++) {
1701 				mtr_block_x_latch_at_savepoint(
1702 					mtr, tree_savepoints[i],
1703 					tree_blocks[i]);
1704 			}
1705 		}
1706 
1707 		/* We should consider the prev_page of the parent page, if
1708 		the node_ptr is the leftmost record of the page, because
1709 		BTR_SEARCH_PREV and BTR_MODIFY_PREV latch the prev_page of the leaf page. */
1710 		if ((latch_mode == BTR_SEARCH_PREV
1711 		     || latch_mode == BTR_MODIFY_PREV)
1712 		    && !retrying_for_search_prev) {
1713 			/* block should be latched for consistent
1714 			   btr_page_get_prev() */
1715 			ut_ad(mtr_memo_contains_flagged(mtr, block,
1716 				MTR_MEMO_PAGE_S_FIX
1717 				| MTR_MEMO_PAGE_X_FIX));
1718 
1719 			if (btr_page_get_prev(page, mtr) != FIL_NULL
1720 			    && page_rec_is_first(node_ptr, page)) {
1721 
1722 				if (leftmost_from_level == 0) {
1723 					leftmost_from_level = height + 1;
1724 				}
1725 			} else {
1726 				leftmost_from_level = 0;
1727 			}
1728 
1729 			if (height == 0 && leftmost_from_level > 0) {
1730 				/* we should retry, also fetching prev_page
1731 				from level==leftmost_from_level. */
1732 				retrying_for_search_prev = true;
1733 
1734 				prev_tree_blocks = static_cast<buf_block_t**>(
1735 					ut_malloc_nokey(sizeof(buf_block_t*)
1736 							* leftmost_from_level));
1737 
1738 				prev_tree_savepoints = static_cast<ulint*>(
1739 					ut_malloc_nokey(sizeof(ulint)
1740 							* leftmost_from_level));
1741 
1742 				/* back to the level (leftmost_from_level+1) */
1743 				ulint	idx = n_blocks
1744 					- (leftmost_from_level - 1);
1745 
1746 				page_id.reset(
1747 					space,
1748 					tree_blocks[idx]->page.id.page_no());
1749 
1750 				for (ulint i = n_blocks
1751 					       - (leftmost_from_level - 1);
1752 				     i <= n_blocks; i++) {
1753 					mtr_release_block_at_savepoint(
1754 						mtr, tree_savepoints[i],
1755 						tree_blocks[i]);
1756 				}
1757 
1758 				n_blocks -= (leftmost_from_level - 1);
1759 				height = leftmost_from_level;
1760 				ut_ad(n_releases == 0);
1761 
1762 				/* replay up_match, low_match */
1763 				up_match = 0;
1764 				low_match = 0;
1765 				rtr_info_t*	rtr_info	= need_path
1766 					? cursor->rtr_info : NULL;
1767 
1768 				for (ulint i = 0; i < n_blocks; i++) {
1769 					page_cur_search_with_match(
1770 						tree_blocks[i], index, tuple,
1771 						page_mode, &up_match,
1772 						&low_match, page_cursor,
1773 						rtr_info);
1774 				}
1775 
1776 				goto search_loop;
1777 			}
1778 		}
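		/* The retry above re-descends from level leftmost_from_level
		with retrying_for_search_prev set, so that the left sibling
		pages can also be latched during the renewed descent;
		prev_tree_blocks and prev_tree_savepoints are the bookkeeping
		buffers for those extra latches, freed again at func_exit. */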
1779 
1780 		/* Go to the child node */
1781 		page_id.reset(
1782 			space,
1783 			btr_node_ptr_get_child_page_no(node_ptr, offsets));
1784 
1785 		n_blocks++;
1786 
1787 		if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
1788 			/* We're doing a search on an ibuf tree and we're one
1789 			level above the leaf page. */
1790 
1791 			ut_ad(level == 0);
1792 
1793 			buf_mode = BUF_GET;
1794 			rw_latch = RW_NO_LATCH;
1795 			goto retry_page_get;
1796 		}
1797 
1798 		if (dict_index_is_spatial(index)
1799 		    && page_mode >= PAGE_CUR_CONTAIN
1800 		    && page_mode != PAGE_CUR_RTREE_INSERT) {
1801 			ut_ad(need_path);
1802 			rtr_node_path_t* path =
1803 				cursor->rtr_info->path;
1804 
1805 			if (!path->empty() && found) {
1806 #ifdef UNIV_DEBUG
1807 				node_visit_t    last_visit = path->back();
1808 
1809 				ut_ad(last_visit.page_no == page_id.page_no());
1810 #endif /* UNIV_DEBUG */
1811 
1812 				path->pop_back();
1813 
1814 #ifdef UNIV_DEBUG
1815 				if (page_mode == PAGE_CUR_RTREE_LOCATE
1816 				    && (latch_mode != BTR_MODIFY_LEAF)) {
1817 					btr_pcur_t*	cur
1818 					= cursor->rtr_info->parent_path->back(
1819 					  ).cursor;
1820 					rec_t*	my_node_ptr
1821 						= btr_pcur_get_rec(cur);
1822 
1823 					offsets = rec_get_offsets(
1824 						my_node_ptr, index, offsets,
1825 						ULINT_UNDEFINED, &heap);
1826 
1827 					ulint	my_page_no
1828 					= btr_node_ptr_get_child_page_no(
1829 						my_node_ptr, offsets);
1830 
1831 					ut_ad(page_id.page_no() == my_page_no);
1832 
1833 				}
1834 #endif
1835 			}
1836 		}
1837 
1838 		goto search_loop;
1839 	} else if (!dict_index_is_spatial(index)
1840 		   && latch_mode == BTR_MODIFY_TREE
1841 		   && lock_intention == BTR_INTENTION_INSERT
1842 		   && mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
1843 		   && page_rec_is_last(page_cur_get_rec(page_cursor), page)) {
1844 
1845 		/* btr_insert_into_right_sibling() might cause the node_ptr
1846 		at the upper level to be deleted */
1847 
1848 		if (height == 0) {
1849 			/* release the leaf pages if latched */
1850 			for (uint i = 0; i < 3; i++) {
1851 				if (latch_leaves.blocks[i] != NULL) {
1852 					mtr_release_block_at_savepoint(
1853 						mtr, latch_leaves.savepoints[i],
1854 						latch_leaves.blocks[i]);
1855 					latch_leaves.blocks[i] = NULL;
1856 				}
1857 			}
1858 		}
1859 
1860 		goto need_opposite_intention;
1861 	}
1862 
1863 	if (level != 0) {
1864 		if (upper_rw_latch == RW_NO_LATCH) {
1865 			/* latch the page */
1866 			buf_block_t*	child_block;
1867 
1868 			if (latch_mode == BTR_CONT_MODIFY_TREE) {
1869 				child_block = btr_block_get(
1870 					page_id, page_size, RW_X_LATCH,
1871 					index, mtr);
1872 			} else {
1873 				ut_ad(latch_mode == BTR_CONT_SEARCH_TREE);
1874 				child_block = btr_block_get(
1875 					page_id, page_size, RW_SX_LATCH,
1876 					index, mtr);
1877 			}
1878 
1879 			btr_assert_not_corrupted(child_block, index);
1880 		} else {
1881 			ut_ad(mtr_memo_contains(mtr, block, upper_rw_latch));
1882 			btr_assert_not_corrupted(block, index);
1883 
1884 			if (s_latch_by_caller) {
1885 				ut_ad(latch_mode == BTR_SEARCH_TREE);
1886 				/* to exclude tree-modifying operations,
1887 				the index should be sx-latched. */
1888 				ut_ad(mtr_memo_contains(
1889 					mtr, dict_index_get_lock(index),
1890 					MTR_MEMO_SX_LOCK));
1891 				/* because we hold the sx-latch on the index,
1892 				we can release the upper blocks. */
1893 				for (; n_releases < n_blocks; n_releases++) {
1894 					mtr_release_block_at_savepoint(
1895 						mtr,
1896 						tree_savepoints[n_releases],
1897 						tree_blocks[n_releases]);
1898 				}
1899 			}
1900 		}
1901 
1902 		if (page_mode <= PAGE_CUR_LE) {
1903 			cursor->low_match = low_match;
1904 			cursor->up_match = up_match;
1905 		}
1906 	} else {
1907 		cursor->low_match = low_match;
1908 		cursor->low_bytes = low_bytes;
1909 		cursor->up_match = up_match;
1910 		cursor->up_bytes = up_bytes;
1911 
1912 		/* We do a dirty read of btr_search_enabled here.  We
1913 		will properly check btr_search_enabled again in
1914 		btr_search_build_page_hash_index() before building a
1915 		page hash index, while holding search latch. */
1916 		if (btr_search_enabled && !index->disable_ahi) {
1917 			btr_search_info_update(index, cursor);
1918 		}
1919 		ut_ad(cursor->up_match != ULINT_UNDEFINED
1920 		      || mode != PAGE_CUR_GE);
1921 		ut_ad(cursor->up_match != ULINT_UNDEFINED
1922 		      || mode != PAGE_CUR_LE);
1923 		ut_ad(cursor->low_match != ULINT_UNDEFINED
1924 		      || mode != PAGE_CUR_LE);
1925 	}
1926 
1927 	/* For a spatial index, remember which blocks are still latched */
1928 	if (dict_index_is_spatial(index)
1929 	    && (latch_mode == BTR_MODIFY_TREE
1930 		|| latch_mode == BTR_MODIFY_LEAF)) {
1931 		for (ulint i = 0; i < n_releases; i++) {
1932 			cursor->rtr_info->tree_blocks[i] = NULL;
1933 			cursor->rtr_info->tree_savepoints[i] = 0;
1934 		}
1935 
1936 		for (ulint i = n_releases; i <= n_blocks; i++) {
1937 			cursor->rtr_info->tree_blocks[i] = tree_blocks[i];
1938 			cursor->rtr_info->tree_savepoints[i] = tree_savepoints[i];
1939 		}
1940 	}
1941 
1942 func_exit:
1943 
1944 	if (UNIV_LIKELY_NULL(heap)) {
1945 		mem_heap_free(heap);
1946 	}
1947 
1948 	if (retrying_for_search_prev) {
1949 		ut_free(prev_tree_blocks);
1950 		ut_free(prev_tree_savepoints);
1951 	}
1952 
1953 	if (has_search_latch) {
1954 
1955 		rw_lock_s_lock(btr_get_search_latch(index));
1956 	}
1957 
1958 	if (mbr_adj) {
1959 		/* remember that we will need to adjust parent MBR */
1960 		cursor->rtr_info->mbr_adj = true;
1961 	}
1962 
1963 	DBUG_VOID_RETURN;
1964 }
1965 
1966 /** Searches an index tree and positions a tree cursor on a given level.
1967 This function avoids latching the traversal path and so should be
1968 used only in cases where latching is not needed.
1969 
1970 @param[in,out]	index	index
1971 @param[in]	level	the tree level of search
1972 @param[in]	tuple	data tuple; note: n_fields_cmp in the tuple must be set
1973 			so that it cannot get compared to the node ptr page number field
1974 @param[in]	mode	PAGE_CUR_L, ....
1975 			Insert should always be made using PAGE_CUR_LE
1976 			to search the position.
1977 @param[in,out]	cursor	tree cursor; points to record of interest.
1978 @param[in]	file	file name
1979 @param[in]	line	line where called from
1980 @param[in,out]	mtr	mtr
1981 @param[in]	mark_dirty
1982 			if true then mark the block as dirty */
1983 void
1984 btr_cur_search_to_nth_level_with_no_latch(
1985 	dict_index_t*		index,
1986 	ulint			level,
1987 	const dtuple_t*		tuple,
1988 	page_cur_mode_t		mode,
1989 	btr_cur_t*		cursor,
1990 	const char*		file,
1991 	ulint			line,
1992 	mtr_t*			mtr,
1993 	bool			mark_dirty)
1994 {
1995 	page_t*		page = NULL; /* remove warning */
1996 	buf_block_t*	block;
1997 	ulint		height;
1998 	ulint		up_match;
1999 	ulint		low_match;
2000 	ulint		rw_latch;
2001 	page_cur_mode_t	page_mode;
2002 	ulint		buf_mode;
2003 	page_cur_t*	page_cursor;
2004 	ulint		root_height = 0; /* remove warning */
2005 	ulint		n_blocks = 0;
2006 
2007 	mem_heap_t*	heap		= NULL;
2008 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
2009 	ulint*		offsets		= offsets_;
2010 	rec_offs_init(offsets_);
2011 
2012 	DBUG_ENTER("btr_cur_search_to_nth_level_with_no_latch");
2013 
2014 	ut_ad(dict_table_is_intrinsic(index->table));
2015 	ut_ad(level == 0 || mode == PAGE_CUR_LE);
2016 	ut_ad(dict_index_check_search_tuple(index, tuple));
2017 	ut_ad(dtuple_check_typed(tuple));
2018 	ut_ad(index->page != FIL_NULL);
2019 
2020 	UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
2021 	UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
2022 #ifdef UNIV_DEBUG
2023 	cursor->up_match = ULINT_UNDEFINED;
2024 	cursor->low_match = ULINT_UNDEFINED;
2025 #endif /* UNIV_DEBUG */
2026 
2027 	cursor->flag = BTR_CUR_BINARY;
2028 	cursor->index = index;
2029 
2030 	page_cursor = btr_cur_get_page_cur(cursor);
2031 
2032         const ulint		space = dict_index_get_space(index);
2033         const page_size_t	page_size(dict_table_page_size(index->table));
2034         /* Start with the root page. */
2035         page_id_t		page_id(space, dict_index_get_page(index));
2036 
2037 	up_match = 0;
2038 	low_match = 0;
2039 
2040 	height = ULINT_UNDEFINED;
2041 
2042 	/* We use these modified search modes on non-leaf levels of the
2043 	B-tree. These let us end up in the right B-tree leaf. In that leaf
2044 	we use the original search mode. */
2045 
2046 	switch (mode) {
2047 	case PAGE_CUR_GE:
2048 		page_mode = PAGE_CUR_L;
2049 		break;
2050 	case PAGE_CUR_G:
2051 		page_mode = PAGE_CUR_LE;
2052 		break;
2053 	default:
2054 		page_mode = mode;
2055 		break;
2056 	}
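	/* For example, a PAGE_CUR_GE search for a key K uses PAGE_CUR_L on
	the non-leaf levels: on each internal page the cursor is placed on
	the node pointer preceding K, whose subtree is the one that may
	contain the first record >= K. On the leaf page the original mode
	(PAGE_CUR_GE in this example) is restored before the final
	page_cur_search_with_match() call. */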
2057 
2058 	/* Loop and search until we arrive at the desired level */
2059 	bool at_desired_level = false;
2060 	while (!at_desired_level) {
2061 		buf_mode = BUF_GET;
2062 		rw_latch = RW_NO_LATCH;
2063 
2064 		ut_ad(n_blocks < BTR_MAX_LEVELS);
2065 
2066 		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2067 				buf_mode, file, line, mtr, mark_dirty);
2068 
2069 		page = buf_block_get_frame(block);
2070 
2071 		if (height == ULINT_UNDEFINED) {
2072 			/* We are in the root node */
2073 
2074 			height = btr_page_get_level(page, mtr);
2075 			root_height = height;
2076 			cursor->tree_height = root_height + 1;
2077 		}
2078 
2079 		if (height == 0) {
2080 			/* On leaf level. Switch back to original search mode.*/
2081 			page_mode = mode;
2082 		}
2083 
2084 		page_cur_search_with_match(
2085 				block, index, tuple, page_mode, &up_match,
2086 				&low_match, page_cursor, NULL);
2087 
2088 		ut_ad(height == btr_page_get_level(
2089 			page_cur_get_page(page_cursor), mtr));
2090 
2091 		if (level != height) {
2092 
2093 			const rec_t*	node_ptr;
2094 			ut_ad(height > 0);
2095 
2096 			height--;
2097 
2098 			node_ptr = page_cur_get_rec(page_cursor);
2099 
2100 			offsets = rec_get_offsets(
2101 					node_ptr, index, offsets,
2102 					ULINT_UNDEFINED, &heap);
2103 
2104 			/* Go to the child node */
2105 			page_id.reset(space, btr_node_ptr_get_child_page_no(
2106 				node_ptr, offsets));
2107 
2108 			n_blocks++;
2109 		} else {
2110 			/* If this is the desired level, leave the loop */
2111 			at_desired_level = true;
2112 		}
2113 	}
2114 
2115 	cursor->low_match = low_match;
2116 	cursor->up_match = up_match;
2117 
2118 	if (heap != NULL) {
2119 		mem_heap_free(heap);
2120 	}
2121 
2122 	DBUG_VOID_RETURN;
2123 }
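
/* Illustrative usage sketch (not from the original source; index, tuple
and their setup are assumed): positioning a cursor on the leaf level of an
intrinsic-table index without latching, roughly as callers of the function
above would do.

	btr_cur_t	cursor;
	mtr_t		mtr;

	mtr_start(&mtr);
	btr_cur_search_to_nth_level_with_no_latch(
		index, 0, tuple, PAGE_CUR_LE, &cursor,
		__FILE__, __LINE__, &mtr, true);
	... use btr_cur_get_rec(&cursor) ...
	mtr_commit(&mtr);
*/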
2124 
2125 /*****************************************************************//**
2126 Opens a cursor at either end of an index. */
2127 void
2128 btr_cur_open_at_index_side_func(
2129 /*============================*/
2130 	bool		from_left,	/*!< in: true if open to the low end,
2131 					false if to the high end */
2132 	dict_index_t*	index,		/*!< in: index */
2133 	ulint		latch_mode,	/*!< in: latch mode */
2134 	btr_cur_t*	cursor,		/*!< in/out: cursor */
2135 	ulint		level,		/*!< in: level to search for
2136 					(0=leaf). */
2137 	const char*	file,		/*!< in: file name */
2138 	ulint		line,		/*!< in: line where called */
2139 	mtr_t*		mtr)		/*!< in/out: mini-transaction */
2140 {
2141 	page_cur_t*	page_cursor;
2142 	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
2143 	ulint		height;
2144 	ulint		root_height = 0; /* remove warning */
2145 	rec_t*		node_ptr;
2146 	ulint		estimate;
2147 	ulint		savepoint;
2148 	ulint		upper_rw_latch, root_leaf_rw_latch;
2149 	btr_intention_t	lock_intention;
2150 	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
2151 	ulint		tree_savepoints[BTR_MAX_LEVELS];
2152 	ulint		n_blocks = 0;
2153 	ulint		n_releases = 0;
2154 	mem_heap_t*	heap		= NULL;
2155 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
2156 	ulint*		offsets		= offsets_;
2157 	rec_offs_init(offsets_);
2158 
2159 	estimate = latch_mode & BTR_ESTIMATE;
2160 	latch_mode &= ~BTR_ESTIMATE;
2161 
2162 	ut_ad(level != ULINT_UNDEFINED);
2163 
2164 	bool	s_latch_by_caller;
2165 
2166 	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
2167 	latch_mode &= ~BTR_ALREADY_S_LATCHED;
2168 
2169 	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2170 
2171 	ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2172 
2173 	/* This function does not need to latch the left page of the leaf page */
2174 	if (latch_mode == BTR_SEARCH_PREV) {
2175 		latch_mode = BTR_SEARCH_LEAF;
2176 	} else if (latch_mode == BTR_MODIFY_PREV) {
2177 		latch_mode = BTR_MODIFY_LEAF;
2178 	}
2179 
2180 	/* Store the position of the tree latch we push to mtr so that we
2181 	know how to release it when we have latched the leaf node */
2182 
2183 	savepoint = mtr_set_savepoint(mtr);
2184 
2185 	switch (latch_mode) {
2186 	case BTR_CONT_MODIFY_TREE:
2187 	case BTR_CONT_SEARCH_TREE:
2188 		upper_rw_latch = RW_NO_LATCH;
2189 		break;
2190 	case BTR_MODIFY_TREE:
2191 		/* Most delete-intended operations are purges.
2192 		Free blocks and read I/O bandwidth should be prioritized
2193 		for them when the history list is growing huge. */
2194 		if (lock_intention == BTR_INTENTION_DELETE
2195 		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
2196 		    && buf_get_n_pending_read_ios()) {
2197 			mtr_x_lock(dict_index_get_lock(index), mtr);
2198 		} else {
2199 			mtr_sx_lock(dict_index_get_lock(index), mtr);
2200 		}
2201 		upper_rw_latch = RW_X_LATCH;
2202 		break;
2203 	default:
2204 		ut_ad(!s_latch_by_caller
2205 		      || mtr_memo_contains_flagged(mtr,
2206 						 dict_index_get_lock(index),
2207 						 MTR_MEMO_SX_LOCK
2208 						 | MTR_MEMO_S_LOCK));
2209 		if (!srv_read_only_mode) {
2210 			if (!s_latch_by_caller) {
2211 				/* BTR_SEARCH_TREE is intended to be used with
2212 				BTR_ALREADY_S_LATCHED */
2213 				ut_ad(latch_mode != BTR_SEARCH_TREE);
2214 
2215 				mtr_s_lock(dict_index_get_lock(index), mtr);
2216 			}
2217 			upper_rw_latch = RW_S_LATCH;
2218 		} else {
2219 			upper_rw_latch = RW_NO_LATCH;
2220 		}
2221 	}
2222 	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
2223 
2224 	page_cursor = btr_cur_get_page_cur(cursor);
2225 	cursor->index = index;
2226 
2227 	page_id_t		page_id(dict_index_get_space(index),
2228 					dict_index_get_page(index));
2229 	const page_size_t&	page_size = dict_table_page_size(index->table);
2230 
2231 	if (root_leaf_rw_latch == RW_X_LATCH) {
2232 		node_ptr_max_size = dict_index_node_ptr_max_size(index);
2233 	}
2234 
2235 	height = ULINT_UNDEFINED;
2236 
2237 	for (;;) {
2238 		buf_block_t*	block;
2239 		page_t*		page;
2240 		ulint		rw_latch;
2241 
2242 		ut_ad(n_blocks < BTR_MAX_LEVELS);
2243 
2244 		if (height != 0
2245 		    && (latch_mode != BTR_MODIFY_TREE
2246 			|| height == level)) {
2247 			rw_latch = upper_rw_latch;
2248 		} else {
2249 			rw_latch = RW_NO_LATCH;
2250 		}
2251 
2252 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2253 		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2254 					 BUF_GET, file, line, mtr);
2255 		tree_blocks[n_blocks] = block;
2256 
2257 		page = buf_block_get_frame(block);
2258 
2259 		if (height == ULINT_UNDEFINED
2260 		    && btr_page_get_level(page, mtr) == 0
2261 		    && rw_latch != RW_NO_LATCH
2262 		    && rw_latch != root_leaf_rw_latch) {
2263 			/* We should retry getting the page, because the root page
2264 			is latched with a different mode than a leaf page requires. */
2265 			ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
2266 			ut_ad(rw_latch == RW_S_LATCH);
2267 
2268 			ut_ad(n_blocks == 0);
2269 			mtr_release_block_at_savepoint(
2270 				mtr, tree_savepoints[n_blocks],
2271 				tree_blocks[n_blocks]);
2272 
2273 			upper_rw_latch = root_leaf_rw_latch;
2274 			continue;
2275 		}
2276 
2277 		ut_ad(fil_page_index_page_check(page));
2278 		ut_ad(index->id == btr_page_get_index_id(page));
2279 
2280 		if (height == ULINT_UNDEFINED) {
2281 			/* We are in the root node */
2282 
2283 			height = btr_page_get_level(page, mtr);
2284 			root_height = height;
2285 			ut_a(height >= level);
2286 		} else {
2287 			/* TODO: flag the index corrupted if this fails */
2288 			ut_ad(height == btr_page_get_level(page, mtr));
2289 		}
2290 
2291 		if (height == level) {
2292 			if (srv_read_only_mode) {
2293 				btr_cur_latch_leaves(
2294 					block, page_id, page_size,
2295 					latch_mode, cursor, mtr);
2296 			} else if (height == 0) {
2297 				if (rw_latch == RW_NO_LATCH) {
2298 					btr_cur_latch_leaves(
2299 						block, page_id, page_size,
2300 						latch_mode, cursor, mtr);
2301 				}
2302 				/* In versions <= 3.23.52 we had
2303 				forgotten to release the tree latch
2304 				here. If in an index scan we had to
2305 				scan far to find a record visible to
2306 				the current transaction, that could
2307 				starve others waiting for the tree
2308 				latch. */
2309 
2310 				switch (latch_mode) {
2311 				case BTR_MODIFY_TREE:
2312 				case BTR_CONT_MODIFY_TREE:
2313 				case BTR_CONT_SEARCH_TREE:
2314 					break;
2315 				default:
2316 					if (!s_latch_by_caller) {
2317 						/* Release the tree s-latch */
2318 						mtr_release_s_latch_at_savepoint(
2319 							mtr, savepoint,
2320 							dict_index_get_lock(
2321 								index));
2322 					}
2323 
2324 					/* release upper blocks */
2325 					for (; n_releases < n_blocks;
2326 					     n_releases++) {
2327 						mtr_release_block_at_savepoint(
2328 							mtr,
2329 							tree_savepoints[
2330 								n_releases],
2331 							tree_blocks[
2332 								n_releases]);
2333 					}
2334 				}
2335 			} else { /* height != 0 */
2336 				/* We already have the block latched. */
2337 				ut_ad(latch_mode == BTR_SEARCH_TREE);
2338 				ut_ad(s_latch_by_caller);
2339 				ut_ad(upper_rw_latch == RW_S_LATCH);
2340 
2341 				ut_ad(mtr_memo_contains(mtr, block,
2342 							upper_rw_latch));
2343 
2344 				if (s_latch_by_caller) {
2345 					/* to exclude tree-modifying operations,
2346 					the index should be sx-latched. */
2347 					ut_ad(mtr_memo_contains(
2348 						mtr,
2349 						dict_index_get_lock(index),
2350 						MTR_MEMO_SX_LOCK));
2351 					/* because we hold the sx-latch on the index,
2352 					we can release the upper blocks. */
2353 					for (; n_releases < n_blocks;
2354 					     n_releases++) {
2355 						mtr_release_block_at_savepoint(
2356 							mtr,
2357 							tree_savepoints[
2358 								n_releases],
2359 							tree_blocks[
2360 								n_releases]);
2361 					}
2362 				}
2363 			}
2364 		}
2365 
2366 		if (from_left) {
2367 			page_cur_set_before_first(block, page_cursor);
2368 		} else {
2369 			page_cur_set_after_last(block, page_cursor);
2370 		}
2371 
2372 		if (height == level) {
2373 			if (estimate) {
2374 				btr_cur_add_path_info(cursor, height,
2375 						      root_height);
2376 			}
2377 
2378 			break;
2379 		}
2380 
2381 		ut_ad(height > 0);
2382 
2383 		if (from_left) {
2384 			page_cur_move_to_next(page_cursor);
2385 		} else {
2386 			page_cur_move_to_prev(page_cursor);
2387 		}
2388 
2389 		if (estimate) {
2390 			btr_cur_add_path_info(cursor, height, root_height);
2391 		}
2392 
2393 		height--;
2394 
2395 		node_ptr = page_cur_get_rec(page_cursor);
2396 		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2397 					  ULINT_UNDEFINED, &heap);
2398 
2399 		/* If the rec is the first or last in the page and the
2400 		intention is a pessimistic delete, it might cause a node_ptr
2401 		insert at the upper level. We should change the intention
2402 		and retry. */
2403 		if (latch_mode == BTR_MODIFY_TREE
2404 		    && btr_cur_need_opposite_intention(
2405 			page, lock_intention, node_ptr)) {
2406 
2407 			ut_ad(upper_rw_latch == RW_X_LATCH);
2408 			/* release all blocks */
2409 			for (; n_releases <= n_blocks; n_releases++) {
2410 				mtr_release_block_at_savepoint(
2411 					mtr, tree_savepoints[n_releases],
2412 					tree_blocks[n_releases]);
2413 			}
2414 
2415 			lock_intention = BTR_INTENTION_BOTH;
2416 
2417 			page_id.set_page_no(dict_index_get_page(index));
2418 
2419 			height = ULINT_UNDEFINED;
2420 
2421 			n_blocks = 0;
2422 			n_releases = 0;
2423 
2424 			continue;
2425 		}
2426 
2427 		if (latch_mode == BTR_MODIFY_TREE
2428 		    && !btr_cur_will_modify_tree(
2429 				cursor->index, page, lock_intention, node_ptr,
2430 				node_ptr_max_size, page_size, mtr)) {
2431 			ut_ad(upper_rw_latch == RW_X_LATCH);
2432 			ut_ad(n_releases <= n_blocks);
2433 
2434 			/* we can release upper blocks */
2435 			for (; n_releases < n_blocks; n_releases++) {
2436 				if (n_releases == 0) {
2437 					/* we should not release the root page,
2438 					to keep it pinned to the same block. */
2439 					continue;
2440 				}
2441 
2442 				/* release unused blocks to unpin */
2443 				mtr_release_block_at_savepoint(
2444 					mtr, tree_savepoints[n_releases],
2445 					tree_blocks[n_releases]);
2446 			}
2447 		}
2448 
2449 		if (height == level
2450 		    && latch_mode == BTR_MODIFY_TREE) {
2451 			ut_ad(upper_rw_latch == RW_X_LATCH);
2452 			/* we should sx-latch the root page if it was
2453 			released already; it contains the segment headers. */
2454 			if (n_releases > 0) {
2455 				mtr_block_sx_latch_at_savepoint(
2456 					mtr, tree_savepoints[0],
2457 					tree_blocks[0]);
2458 			}
2459 
2460 			/* x-latch the branch blocks not released yet. */
2461 			for (ulint i = n_releases; i <= n_blocks; i++) {
2462 				mtr_block_x_latch_at_savepoint(
2463 					mtr, tree_savepoints[i],
2464 					tree_blocks[i]);
2465 			}
2466 		}
2467 
2468 		/* Go to the child node */
2469 		page_id.set_page_no(
2470 			btr_node_ptr_get_child_page_no(node_ptr, offsets));
2471 
2472 		n_blocks++;
2473 	}
2474 
2475 	if (heap) {
2476 		mem_heap_free(heap);
2477 	}
2478 }
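
/* Illustrative usage sketch (not from the original source; index and mtr
are assumed to be prepared by the caller): opening a cursor before the
first user record on the leaf level, e.g. to start a full index scan from
the low end. The first argument true selects the low end and level 0
selects the leaf level.

	btr_cur_t	cursor;

	btr_cur_open_at_index_side_func(
		true, index, BTR_SEARCH_LEAF, &cursor,
		0, __FILE__, __LINE__, &mtr);
*/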
2479 
2480 /** Opens a cursor at either end of an index.
2481 Avoids taking latches on the buffer blocks; they are only pinned (by
2482 incrementing fix_count) to keep them in the buffer pool. This mode is used
2483 for intrinsic tables, as they are not shared and so need no latching.
2484 @param[in]	from_left	true if open to low end, false if open
2485 				to high end.
2486 @param[in]	index		index
2487 @param[in,out]	cursor		cursor
2488 @param[in]	file		file name
2489 @param[in]	line		line where called
2490 @param[in,out]	mtr		mini transaction
2491 */
2492 void
2493 btr_cur_open_at_index_side_with_no_latch_func(
2494 	bool		from_left,
2495 	dict_index_t*	index,
2496 	btr_cur_t*	cursor,
2497 	ulint		level,
2498 	const char*	file,
2499 	ulint		line,
2500 	mtr_t*		mtr)
2501 {
2502 	page_cur_t*	page_cursor;
2503 	ulint		height;
2504 	rec_t*		node_ptr;
2505 	ulint		n_blocks = 0;
2506 	mem_heap_t*	heap		= NULL;
2507 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
2508 	ulint*		offsets		= offsets_;
2509 	rec_offs_init(offsets_);
2510 
2511 	ut_ad(level != ULINT_UNDEFINED);
2512 
2513 	page_cursor = btr_cur_get_page_cur(cursor);
2514 	cursor->index = index;
2515 	page_id_t		page_id(dict_index_get_space(index),
2516 					dict_index_get_page(index));
2517 	const page_size_t&	page_size = dict_table_page_size(index->table);
2518 
2519 	height = ULINT_UNDEFINED;
2520 
2521 	for (;;) {
2522 		buf_block_t*	block;
2523 		page_t*		page;
2524 		ulint		rw_latch = RW_NO_LATCH;
2525 
2526 		ut_ad(n_blocks < BTR_MAX_LEVELS);
2527 
2528 		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2529 					 BUF_GET, file, line, mtr);
2530 
2531 		page = buf_block_get_frame(block);
2532 
2533 		ut_ad(fil_page_index_page_check(page));
2534 		ut_ad(index->id == btr_page_get_index_id(page));
2535 
2536 		if (height == ULINT_UNDEFINED) {
2537 			/* We are in the root node */
2538 
2539 			height = btr_page_get_level(page, mtr);
2540 			ut_a(height >= level);
2541 		} else {
2542 			/* TODO: flag the index corrupted if this fails */
2543 			ut_ad(height == btr_page_get_level(page, mtr));
2544 		}
2545 
2546 		if (from_left) {
2547 			page_cur_set_before_first(block, page_cursor);
2548 		} else {
2549 			page_cur_set_after_last(block, page_cursor);
2550 		}
2551 
2552 		if (height == level) {
2553 			break;
2554 		}
2555 
2556 		ut_ad(height > 0);
2557 
2558 		if (from_left) {
2559 			page_cur_move_to_next(page_cursor);
2560 		} else {
2561 			page_cur_move_to_prev(page_cursor);
2562 		}
2563 
2564 		height--;
2565 
2566 		node_ptr = page_cur_get_rec(page_cursor);
2567 		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2568 					  ULINT_UNDEFINED, &heap);
2569 
2570 		/* Go to the child node */
2571 		page_id.set_page_no(
2572 			btr_node_ptr_get_child_page_no(node_ptr, offsets));
2573 
2574 		n_blocks++;
2575 	}
2576 
2577 	if (heap != NULL) {
2578 		mem_heap_free(heap);
2579 	}
2580 }
2581 
2582 /**********************************************************************//**
2583 Positions a cursor at a randomly chosen position within a B-tree.
2584 @return true if the index is available and we have put the cursor, false
2585 if the index is unavailable */
2586 bool
2587 btr_cur_open_at_rnd_pos_func(
2588 /*=========================*/
2589 	dict_index_t*	index,		/*!< in: index */
2590 	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
2591 	btr_cur_t*	cursor,		/*!< in/out: B-tree cursor */
2592 	const char*	file,		/*!< in: file name */
2593 	ulint		line,		/*!< in: line where called */
2594 	mtr_t*		mtr)		/*!< in: mtr */
2595 {
2596 	page_cur_t*	page_cursor;
2597 	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
2598 	ulint		height;
2599 	rec_t*		node_ptr;
2600 	ulint		savepoint;
2601 	ulint		upper_rw_latch, root_leaf_rw_latch;
2602 	btr_intention_t	lock_intention;
2603 	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
2604 	ulint		tree_savepoints[BTR_MAX_LEVELS];
2605 	ulint		n_blocks = 0;
2606 	ulint		n_releases = 0;
2607 	mem_heap_t*	heap		= NULL;
2608 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
2609 	ulint*		offsets		= offsets_;
2610 	rec_offs_init(offsets_);
2611 
2612 	ut_ad(!dict_index_is_spatial(index));
2613 
2614 	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2615 
2616 	ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2617 
2618 	savepoint = mtr_set_savepoint(mtr);
2619 
2620 	switch (latch_mode) {
2621 	case BTR_MODIFY_TREE:
2622 		/* Most delete-intended operations are purges.
2623 		Free blocks and read I/O bandwidth should be prioritized
2624 		for them when the history list is growing huge. */
2625 		if (lock_intention == BTR_INTENTION_DELETE
2626 		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
2627 		    && buf_get_n_pending_read_ios()) {
2628 			mtr_x_lock(dict_index_get_lock(index), mtr);
2629 		} else {
2630 			mtr_sx_lock(dict_index_get_lock(index), mtr);
2631 		}
2632 		upper_rw_latch = RW_X_LATCH;
2633 		break;
2634 	case BTR_SEARCH_PREV:
2635 	case BTR_MODIFY_PREV:
2636 		/* This function does not support taking the left uncle
2637 		   page latch that would be needed when latching the
2638 		   left leaf page. */
2639 	case BTR_SEARCH_TREE:
2640 	case BTR_CONT_MODIFY_TREE:
2641 	case BTR_CONT_SEARCH_TREE:
2642 		ut_ad(0);
2643 		/* fall through */
2644 	default:
2645 		if (!srv_read_only_mode) {
2646 			mtr_s_lock(dict_index_get_lock(index), mtr);
2647 			upper_rw_latch = RW_S_LATCH;
2648 		} else {
2649 			upper_rw_latch = RW_NO_LATCH;
2650 		}
2651 	}
2652 
2653 	DBUG_EXECUTE_IF("test_index_is_unavailable",
2654 			return(false););
2655 
2656 	if (index->page == FIL_NULL) {
2657 		/* Since we did not hold the index lock until just now, the
2658 		index could have been modified by others; for example, if this
2659 		is a statistics updater for a referenced table, the index could
2660 		have been marked as unavailable by 'DROP TABLE' in the meantime,
2661 		since we hold no lock for the statistics updater. */
2662 		return(false);
2663 	}
2664 
2665 	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
2666 
2667 	page_cursor = btr_cur_get_page_cur(cursor);
2668 	cursor->index = index;
2669 
2670 	page_id_t		page_id(dict_index_get_space(index),
2671 					dict_index_get_page(index));
2672 	const page_size_t&	page_size = dict_table_page_size(index->table);
2673 
2674 	if (root_leaf_rw_latch == RW_X_LATCH) {
2675 		node_ptr_max_size = dict_index_node_ptr_max_size(index);
2676 	}
2677 
2678 	height = ULINT_UNDEFINED;
2679 
2680 	for (;;) {
2681 		buf_block_t*	block;
2682 		page_t*		page;
2683 		ulint		rw_latch;
2684 
2685 		ut_ad(n_blocks < BTR_MAX_LEVELS);
2686 
2687 		if (height != 0
2688 		    && latch_mode != BTR_MODIFY_TREE) {
2689 			rw_latch = upper_rw_latch;
2690 		} else {
2691 			rw_latch = RW_NO_LATCH;
2692 		}
2693 
2694 		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2695 		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2696 					 BUF_GET, file, line, mtr);
2697 		tree_blocks[n_blocks] = block;
2698 
2699 		page = buf_block_get_frame(block);
2700 
2701 		if (height == ULINT_UNDEFINED
2702 		    && btr_page_get_level(page, mtr) == 0
2703 		    && rw_latch != RW_NO_LATCH
2704 		    && rw_latch != root_leaf_rw_latch) {
2705 			/* We should retry getting the page, because the root page
2706 			is latched with a different mode than a leaf page requires. */
2707 			ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
2708 			ut_ad(rw_latch == RW_S_LATCH);
2709 
2710 			ut_ad(n_blocks == 0);
2711 			mtr_release_block_at_savepoint(
2712 				mtr, tree_savepoints[n_blocks],
2713 				tree_blocks[n_blocks]);
2714 
2715 			upper_rw_latch = root_leaf_rw_latch;
2716 			continue;
2717 		}
2718 
2719 		ut_ad(fil_page_index_page_check(page));
2720 		ut_ad(index->id == btr_page_get_index_id(page));
2721 
2722 		if (height == ULINT_UNDEFINED) {
2723 			/* We are in the root node */
2724 
2725 			height = btr_page_get_level(page, mtr);
2726 		}
2727 
2728 		if (height == 0) {
2729 			if (rw_latch == RW_NO_LATCH
2730 			    || srv_read_only_mode) {
2731 				btr_cur_latch_leaves(
2732 					block, page_id, page_size,
2733 					latch_mode, cursor, mtr);
2734 			}
2735 
2736 			/* btr_cur_open_at_index_side_func() and
2737 			btr_cur_search_to_nth_level() release
2738 			tree s-latch here.*/
2739 			switch (latch_mode) {
2740 			case BTR_MODIFY_TREE:
2741 			case BTR_CONT_MODIFY_TREE:
2742 			case BTR_CONT_SEARCH_TREE:
2743 				break;
2744 			default:
2745 				/* Release the tree s-latch */
2746 				if (!srv_read_only_mode) {
2747 					mtr_release_s_latch_at_savepoint(
2748 						mtr, savepoint,
2749 						dict_index_get_lock(index));
2750 				}
2751 
2752 				/* release upper blocks */
2753 				for (; n_releases < n_blocks; n_releases++) {
2754 					mtr_release_block_at_savepoint(
2755 						mtr,
2756 						tree_savepoints[n_releases],
2757 						tree_blocks[n_releases]);
2758 				}
2759 			}
2760 		}
2761 
2762 		page_cur_open_on_rnd_user_rec(block, page_cursor);
2763 
2764 		if (height == 0) {
2765 
2766 			break;
2767 		}
2768 
2769 		ut_ad(height > 0);
2770 
2771 		height--;
2772 
2773 		node_ptr = page_cur_get_rec(page_cursor);
2774 		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2775 					  ULINT_UNDEFINED, &heap);
2776 
2777 		/* If the rec is the first or last in the page and the
2778 		intention is a pessimistic delete, it might cause a node_ptr
2779 		insert at the upper level. We should change the intention
2780 		and retry. */
2781 		if (latch_mode == BTR_MODIFY_TREE
2782 		    && btr_cur_need_opposite_intention(
2783 			page, lock_intention, node_ptr)) {
2784 
2785 			ut_ad(upper_rw_latch == RW_X_LATCH);
2786 			/* release all blocks */
2787 			for (; n_releases <= n_blocks; n_releases++) {
2788 				mtr_release_block_at_savepoint(
2789 					mtr, tree_savepoints[n_releases],
2790 					tree_blocks[n_releases]);
2791 			}
2792 
2793 			lock_intention = BTR_INTENTION_BOTH;
2794 
2795 			page_id.set_page_no(dict_index_get_page(index));
2796 
2797 			height = ULINT_UNDEFINED;
2798 
2799 			n_blocks = 0;
2800 			n_releases = 0;
2801 
2802 			continue;
2803 		}
2804 
2805 		if (latch_mode == BTR_MODIFY_TREE
2806 		    && !btr_cur_will_modify_tree(
2807 				cursor->index, page, lock_intention, node_ptr,
2808 				node_ptr_max_size, page_size, mtr)) {
2809 			ut_ad(upper_rw_latch == RW_X_LATCH);
2810 			ut_ad(n_releases <= n_blocks);
2811 
2812 			/* we can release upper blocks */
2813 			for (; n_releases < n_blocks; n_releases++) {
2814 				if (n_releases == 0) {
2815 					/* we should not release the root page,
2816 					to keep it pinned to the same block. */
2817 					continue;
2818 				}
2819 
2820 				/* release unused blocks to unpin */
2821 				mtr_release_block_at_savepoint(
2822 					mtr, tree_savepoints[n_releases],
2823 					tree_blocks[n_releases]);
2824 			}
2825 		}
2826 
2827 		if (height == 0
2828 		    && latch_mode == BTR_MODIFY_TREE) {
2829 			ut_ad(upper_rw_latch == RW_X_LATCH);
2830 			/* we should sx-latch the root page if it was
2831 			released already; it contains the segment headers. */
2832 			if (n_releases > 0) {
2833 				mtr_block_sx_latch_at_savepoint(
2834 					mtr, tree_savepoints[0],
2835 					tree_blocks[0]);
2836 			}
2837 
2838 			/* x-latch the branch blocks not released yet. */
2839 			for (ulint i = n_releases; i <= n_blocks; i++) {
2840 				mtr_block_x_latch_at_savepoint(
2841 					mtr, tree_savepoints[i],
2842 					tree_blocks[i]);
2843 			}
2844 		}
2845 
2846 		/* Go to the child node */
2847 		page_id.set_page_no(
2848 			btr_node_ptr_get_child_page_no(node_ptr, offsets));
2849 
2850 		n_blocks++;
2851 	}
2852 
2853 	if (UNIV_LIKELY_NULL(heap)) {
2854 		mem_heap_free(heap);
2855 	}
2856 
2857 	return(true);
2858 }
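
/* Note: this function descends by picking a random user record on each
level (page_cur_open_on_rnd_user_rec() above), which makes it suited to
sampling-style callers such as index cardinality estimation. The false
return value only means that the index has become unavailable
(index->page == FIL_NULL), not that the descent itself failed. */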
2859 
2860 /*==================== B-TREE INSERT =========================*/
2861 
2862 /*************************************************************//**
2863 Inserts a record if there is enough space, or if enough space can
2864 be freed by reorganizing. Differs from btr_cur_optimistic_insert because
2865 no heuristic is applied to whether it pays to use CPU time for
2866 reorganizing the page or not.
2867 
2868 IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
2869 if this is a compressed leaf page in a secondary index.
2870 This has to be done either within the same mini-transaction,
2871 or by invoking ibuf_reset_free_bits() before mtr_commit().
2872 
2873 @return pointer to inserted record if succeed, else NULL */
2874 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2875 rec_t*
2876 btr_cur_insert_if_possible(
2877 /*=======================*/
2878 	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
2879 				cursor stays valid */
2880 	const dtuple_t*	tuple,	/*!< in: tuple to insert; the size info need not
2881 				have been stored to tuple */
2882 	ulint**		offsets,/*!< out: offsets on *rec */
2883 	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
2884 	ulint		n_ext,	/*!< in: number of externally stored columns */
2885 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
2886 {
2887 	page_cur_t*	page_cursor;
2888 	rec_t*		rec;
2889 
2890 	ut_ad(dtuple_check_typed(tuple));
2891 
2892 	ut_ad(mtr_is_block_fix(
2893 		mtr, btr_cur_get_block(cursor),
2894 		MTR_MEMO_PAGE_X_FIX, cursor->index->table));
2895 	page_cursor = btr_cur_get_page_cur(cursor);
2896 
2897 	/* Now, try the insert */
2898 	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
2899 				    offsets, heap, n_ext, mtr);
2900 
2901 	/* If the record did not fit, reorganize.
2902 	For compressed pages, page_cur_tuple_insert()
2903 	attempted this already. */
2904 	if (!rec && !page_cur_get_page_zip(page_cursor)
2905 	    && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
2906 		rec = page_cur_tuple_insert(
2907 			page_cursor, tuple, cursor->index,
2908 			offsets, heap, n_ext, mtr);
2909 	}
2910 
2911 	ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
2912 	return(rec);
2913 }
2914 
2915 /*************************************************************//**
2916 For an insert, checks the locks and does the undo logging if desired.
2917 @return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
2918 UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
2919 dberr_t
2920 btr_cur_ins_lock_and_undo(
2921 /*======================*/
2922 	ulint		flags,	/*!< in: undo logging and locking flags: if
2923 				not zero, the parameters index and thr
2924 				should be specified */
2925 	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert */
2926 	dtuple_t*	entry,	/*!< in/out: entry to insert */
2927 	que_thr_t*	thr,	/*!< in: query thread or NULL */
2928 	mtr_t*		mtr,	/*!< in/out: mini-transaction */
2929 	ibool*		inherit)/*!< out: TRUE if the inserted new record maybe
2930 				should inherit LOCK_GAP type locks from the
2931 				successor record */
2932 {
2933 	dict_index_t*	index;
2934 	dberr_t		err = DB_SUCCESS;
2935 	rec_t*		rec;
2936 	roll_ptr_t	roll_ptr;
2937 
2938 	/* Check if we have to wait for a lock: enqueue an explicit lock
2939 	request if yes */
2940 
2941 	rec = btr_cur_get_rec(cursor);
2942 	index = cursor->index;
2943 
2944 	ut_ad(!dict_index_is_online_ddl(index)
2945 	      || dict_index_is_clust(index)
2946 	      || (flags & BTR_CREATE_FLAG));
2947 	ut_ad(mtr->is_named_space(index->space));
2948 
2949 	/* Check if there is predicate or GAP lock preventing the insertion */
2950 	if (!(flags & BTR_NO_LOCKING_FLAG)) {
2951 		if (dict_index_is_spatial(index)) {
2952 			lock_prdt_t	prdt;
2953 			rtr_mbr_t	mbr;
2954 
2955 			rtr_get_mbr_from_tuple(entry, &mbr);
2956 
2957 			/* Use an on-stack MBR variable to test if a lock is
2958 			needed. If so, the predicate (MBR) will be allocated
2959 			from lock heap in lock_prdt_insert_check_and_lock() */
2960 			lock_init_prdt_from_mbr(
2961 				&prdt, &mbr, 0, NULL);
2962 
2963 			err = lock_prdt_insert_check_and_lock(
2964 				flags, rec, btr_cur_get_block(cursor),
2965 				index, thr, mtr, &prdt);
2966 			*inherit = false;
2967 		} else {
2968 			err = lock_rec_insert_check_and_lock(
2969 				flags, rec, btr_cur_get_block(cursor),
2970 				index, thr, mtr, inherit);
2971 		}
2972 	}
2973 
2974 	if (err != DB_SUCCESS
2975 	    || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {
2976 
2977 		return(err);
2978 	}
2979 
2980 	err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
2981 					    thr, index, entry,
2982 					    NULL, 0, NULL, NULL,
2983 					    &roll_ptr);
2984 	if (err != DB_SUCCESS) {
2985 
2986 		return(err);
2987 	}
2988 
2989 	/* Now we can fill in the roll ptr field in entry
2990 	(except if table is intrinsic) */
2991 
2992 	if (!(flags & BTR_KEEP_SYS_FLAG)
2993 	    && !dict_table_is_intrinsic(index->table)) {
2994 
2995 		row_upd_index_entry_sys_field(entry, index,
2996 					      DATA_ROLL_PTR, roll_ptr);
2997 	}
2998 
2999 	return(DB_SUCCESS);
3000 }
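
/* Summary of btr_cur_ins_lock_and_undo() above: unless BTR_NO_LOCKING_FLAG
is set, it first checks for conflicting locks (a predicate lock built from
the entry's MBR for a spatial index, the ordinary insert/gap lock check
otherwise). Undo logging and the roll pointer update of the entry
(DATA_ROLL_PTR) are then performed only for clustered indexes that are not
the insert buffer. */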
3001 
3002 /**
3003 Prefetch siblings of the leaf for the pessimistic operation.
3004 @param block	leaf page */
3005 static
3006 void
3007 btr_cur_prefetch_siblings(
3008 	buf_block_t*	block)
3009 {
3010 	page_t*	page = buf_block_get_frame(block);
3011 
3012 	ut_ad(page_is_leaf(page));
3013 
3014 	ulint left_page_no = fil_page_get_prev(page);
3015 	ulint right_page_no = fil_page_get_next(page);
3016 
3017 	if (left_page_no != FIL_NULL) {
3018 		buf_read_page_background(
3019 			page_id_t(block->page.id.space(), left_page_no),
3020 			block->page.size, false);
3021 	}
3022 	if (right_page_no != FIL_NULL) {
3023 		buf_read_page_background(
3024 			page_id_t(block->page.id.space(), right_page_no),
3025 			block->page.size, false);
3026 	}
3027 	if (left_page_no != FIL_NULL
3028 	    || right_page_no != FIL_NULL) {
3029 		os_aio_simulated_wake_handler_threads();
3030 	}
3031 }
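
/* The reads above are issued as background requests only; this is just a
hint so that the pessimistic operation which follows, and which will latch
the siblings itself, is more likely to find them already in the buffer
pool. */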
3032 
3033 /*************************************************************//**
3034 Tries to perform an insert to a page in an index tree, next to cursor.
3035 It is assumed that mtr holds an x-latch on the page. The operation does
3036 not succeed if there is too little space on the page. If there is just
3037 one record on the page, the insert will always succeed; this is to
3038 prevent trying to split a page with just one record.
3039 @return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
3040 dberr_t
3041 btr_cur_optimistic_insert(
3042 /*======================*/
3043 	ulint		flags,	/*!< in: undo logging and locking flags: if not
3044 				zero, the parameters index and thr should be
3045 				specified */
3046 	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
3047 				cursor stays valid */
3048 	ulint**		offsets,/*!< out: offsets on *rec */
3049 	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
3050 	dtuple_t*	entry,	/*!< in/out: entry to insert */
3051 	rec_t**		rec,	/*!< out: pointer to inserted record if
3052 				succeed */
3053 	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
3054 				be stored externally by the caller, or
3055 				NULL */
3056 	ulint		n_ext,	/*!< in: number of externally stored columns */
3057 	que_thr_t*	thr,	/*!< in: query thread or NULL */
3058 	mtr_t*		mtr)	/*!< in/out: mini-transaction;
3059 				if this function returns DB_SUCCESS on
3060 				a leaf page of a secondary index in a
3061 				compressed tablespace, the caller must
3062 				mtr_commit(mtr) before latching
3063 				any further pages */
3064 {
3065 	big_rec_t*	big_rec_vec	= NULL;
3066 	dict_index_t*	index;
3067 	page_cur_t*	page_cursor;
3068 	buf_block_t*	block;
3069 	page_t*		page;
3070 	rec_t*		dummy;
3071 	ibool		leaf;
3072 	ibool		reorg;
3073 	ibool		inherit = TRUE;
3074 	ulint		rec_size;
3075 	dberr_t		err;
3076 
3077 	*big_rec = NULL;
3078 
3079 	block = btr_cur_get_block(cursor);
3080 	page = buf_block_get_frame(block);
3081 	index = cursor->index;
3082 
3083 	/* Blocks are not latched for insert if the table is intrinsic
3084 	and the index is the auto-generated clustered index. */
3085 	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
3086 	ut_ad(!dict_index_is_online_ddl(index)
3087 	      || dict_index_is_clust(index)
3088 	      || (flags & BTR_CREATE_FLAG));
3089 	ut_ad(dtuple_check_typed(entry));
3090 
3091 	const page_size_t&	page_size = block->page.size;
3092 
3093 #ifdef UNIV_DEBUG_VALGRIND
3094 	if (page_size.is_compressed()) {
3095 		UNIV_MEM_ASSERT_RW(page, page_size.logical());
3096 		UNIV_MEM_ASSERT_RW(block->page.zip.data, page_size.physical());
3097 	}
3098 #endif /* UNIV_DEBUG_VALGRIND */
3099 
3100 	leaf = page_is_leaf(page);
3101 
3102 	/* Calculate the record size when entry is converted to a record */
3103 	rec_size = rec_get_converted_size(index, entry, n_ext);
3104 
3105 	if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
3106 				   dtuple_get_n_fields(entry), page_size)) {
3107 
3108 		/* The record is so big that we have to store some fields
3109 		externally on separate database pages */
3110 		big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3111 
3112 		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
3113 
3114 			return(DB_TOO_BIG_RECORD);
3115 		}
3116 
3117 		rec_size = rec_get_converted_size(index, entry, n_ext);
3118 	}
3119 
3120 	if (page_size.is_compressed() && page_zip_is_too_big(index, entry)) {
3121 		if (big_rec_vec != NULL) {
3122 			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3123 		}
3124 
3125 		return(DB_TOO_BIG_RECORD);
3126 	}
3127 
3128 	LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
3129 				      goto fail);
3130 
3131 	if (leaf && page_size.is_compressed()
3132 	    && (page_get_data_size(page) + rec_size
3133 		>= dict_index_zip_pad_optimal_page_size(index))) {
3134 		/* If compression padding tells us that the insertion would
3135 		result in a page packed too tight, i.e. one that is likely to
3136 		cause a compression failure, then do not attempt an optimistic
3137 		insertion. */
3138 fail:
3139 		err = DB_FAIL;
3140 
3141 		/* prefetch siblings of the leaf for the pessimistic
3142 		operation, if the page is leaf. */
3143 		if (page_is_leaf(page)) {
3144 			btr_cur_prefetch_siblings(block);
3145 		}
3146 fail_err:
3147 
3148 		if (big_rec_vec) {
3149 			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3150 		}
3151 
3152 		return(err);
3153 	}
3154 
3155 	ulint	max_size = page_get_max_insert_size_after_reorganize(page, 1);
3156 
3157 	if (page_has_garbage(page)) {
3158 		if ((max_size < rec_size
3159 		     || max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT)
3160 		    && page_get_n_recs(page) > 1
3161 		    && page_get_max_insert_size(page, 1) < rec_size) {
3162 
3163 			goto fail;
3164 		}
3165 	} else if (max_size < rec_size) {
3166 		goto fail;
3167 	}
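	/* In other words: max_size is the insert size that would be
	available after reorganizing the page, so the checks above let the
	optimistic insert proceed only when the record can be expected to
	fit (possibly after a reorganization); otherwise we fail over to the
	pessimistic code path. */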
3168 
3169 	/* If there have been many consecutive inserts to the
3170 	clustered index leaf page of an uncompressed table, check if
3171 	we have to split the page to reserve enough free space for
3172 	future updates of records. */
3173 
3174 	if (leaf && !page_size.is_compressed() && dict_index_is_clust(index)
3175 	    && page_get_n_recs(page) >= 2
3176 	    && dict_index_get_space_reserve() + rec_size > max_size
3177 	    && (btr_page_get_split_rec_to_right(cursor, &dummy)
3178 		|| btr_page_get_split_rec_to_left(cursor, &dummy))) {
3179 		goto fail;
3180 	}
3181 
3182 	page_cursor = btr_cur_get_page_cur(cursor);
3183 
3184 	DBUG_PRINT("ib_cur", ("insert %s (" IB_ID_FMT ") by " TRX_ID_FMT
3185 			      ": %s",
3186 			      index->name(), index->id,
3187 			      thr != NULL
3188 			      ? trx_get_id_for_print(thr_get_trx(thr))
3189 			      : 0,
3190 			      rec_printer(entry).str().c_str()));
3191 
3192 	DBUG_EXECUTE_IF("do_page_reorganize",
3193 			btr_page_reorganize(page_cursor, index, mtr););
3194 
3195 	/* Now, try the insert */
3196 	{
3197 		const rec_t*	page_cursor_rec = page_cur_get_rec(page_cursor);
3198 
3199 		if (dict_table_is_intrinsic(index->table)) {
3200 
3201 			index->rec_cache.rec_size = rec_size;
3202 
3203 			*rec = page_cur_tuple_direct_insert(
3204 				page_cursor, entry, index, n_ext, mtr);
3205 		} else {
3206 			/* Check locks and write to the undo log,
3207 			if specified */
3208 			err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3209 							thr, mtr, &inherit);
3210 
3211 			if (err != DB_SUCCESS) {
3212 				goto fail_err;
3213 			}
3214 
3215 			*rec = page_cur_tuple_insert(
3216 				page_cursor, entry, index, offsets, heap,
3217 				n_ext, mtr);
3218 		}
3219 
3220 		reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
3221 	}
3222 
3223 	if (*rec) {
3224 	} else if (page_size.is_compressed()) {
3225 		/* Reset the IBUF_BITMAP_FREE bits, because
3226 		page_cur_tuple_insert() will have attempted page
3227 		reorganize before failing. */
3228 		if (leaf
3229 		    && !dict_index_is_clust(index)
3230 		    && !dict_table_is_temporary(index->table)) {
3231 			ibuf_reset_free_bits(block);
3232 		}
3233 
3234 		goto fail;
3235 	} else {
3236 
3237 		/* For an intrinsic table we consistently reorganize
3238 		via the pessimistic path. */
3239 		if (dict_table_is_intrinsic(index->table)) {
3240 			goto fail;
3241 		}
3242 
3243 		ut_ad(!reorg);
3244 
3245 		/* If the record did not fit, reorganize */
3246 		if (!btr_page_reorganize(page_cursor, index, mtr)) {
3247 			ut_ad(0);
3248 			goto fail;
3249 		}
3250 
3251 		ut_ad(page_get_max_insert_size(page, 1) == max_size);
3252 
3253 		reorg = TRUE;
3254 
3255 		*rec = page_cur_tuple_insert(page_cursor, entry, index,
3256 					     offsets, heap, n_ext, mtr);
3257 
3258 		if (UNIV_UNLIKELY(!*rec)) {
3259 			ib::fatal() << "Cannot insert tuple " << *entry
3260 				<< " into index " << index->name
3261 				<< " of table " << index->table->name
3262 				<< ". Max size: " << max_size;
3263 		}
3264 	}
3265 
3266 	if (!index->disable_ahi) {
3267 		if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
3268 			btr_search_update_hash_node_on_insert(cursor);
3269 		} else {
3270 			btr_search_update_hash_on_insert(cursor);
3271 		}
3272 	}
3273 
3274 	if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
3275 
3276 		lock_update_insert(block, *rec);
3277 	}
3278 
3279 	if (leaf
3280 	    && !dict_index_is_clust(index)
3281 	    && !dict_table_is_temporary(index->table)) {
3282 		/* Update the free bits of the B-tree page in the
3283 		insert buffer bitmap. */
3284 
3285 		/* The free bits in the insert buffer bitmap must
3286 		never exceed the free space on a page.  It is safe to
3287 		decrement or reset the bits in the bitmap in a
3288 		mini-transaction that is committed before the
3289 		mini-transaction that affects the free space. */
3290 
3291 		/* It is unsafe to increment the bits in a separately
3292 		committed mini-transaction, because in crash recovery,
3293 		the free bits could momentarily be set too high. */
3294 
3295 		if (page_size.is_compressed()) {
3296 			/* Update the bits in the same mini-transaction. */
3297 			ibuf_update_free_bits_zip(block, mtr);
3298 		} else {
3299 			/* Decrement the bits in a separate
3300 			mini-transaction. */
3301 			ibuf_update_free_bits_if_full(
3302 				block, max_size,
3303 				rec_size + PAGE_DIR_SLOT_SIZE);
3304 		}
3305 	}
3306 
3307 	*big_rec = big_rec_vec;
3308 
3309 	return(DB_SUCCESS);
3310 }
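
/* Illustrative caller pattern (not from the original source; the
surrounding setup and error handling are assumed): try the optimistic
insert first and fall back to the pessimistic variant when it returns
DB_FAIL. Real callers normally re-position the cursor with BTR_MODIFY_TREE
latching before making the pessimistic attempt.

	err = btr_cur_optimistic_insert(flags, cursor, &offsets, &heap,
					entry, &rec, &big_rec, n_ext,
					thr, mtr);
	if (err == DB_FAIL) {
		err = btr_cur_pessimistic_insert(flags, cursor, &offsets,
						 &heap, entry, &rec,
						 &big_rec, n_ext, thr, mtr);
	}
*/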
3311 
3312 /*************************************************************//**
3313 Performs an insert on a page of an index tree. It is assumed that mtr
3314 holds an x-latch on the tree and on the cursor page. If the insert is
3315 made on the leaf level, to avoid deadlocks, mtr must also own x-latches
3316 on the brothers of the page, if those brothers exist.
3317 @return DB_SUCCESS or error number */
3318 dberr_t
3319 btr_cur_pessimistic_insert(
3320 /*=======================*/
3321 	ulint		flags,	/*!< in: undo logging and locking flags: if not
3322 				zero, the parameter thr should be
3323 				specified; if no undo logging is specified,
3324 				then the caller must have reserved enough
3325 				free extents in the file space so that the
3326 				insertion will certainly succeed */
3327 	btr_cur_t*	cursor,	/*!< in: cursor after which to insert;
3328 				cursor stays valid */
3329 	ulint**		offsets,/*!< out: offsets on *rec */
3330 	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
3331 				that can be emptied, or NULL */
3332 	dtuple_t*	entry,	/*!< in/out: entry to insert */
3333 	rec_t**		rec,	/*!< out: pointer to inserted record if
3334 				succeed */
3335 	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
3336 				be stored externally by the caller, or
3337 				NULL */
3338 	ulint		n_ext,	/*!< in: number of externally stored columns */
3339 	que_thr_t*	thr,	/*!< in: query thread or NULL */
3340 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
3341 {
3342 	dict_index_t*	index		= cursor->index;
3343 	big_rec_t*	big_rec_vec	= NULL;
3344 	dberr_t		err;
3345 	ibool		inherit = FALSE;
3346 	bool		success;
3347 	ulint		n_reserved	= 0;
3348 
3349 	ut_ad(dtuple_check_typed(entry));
3350 
3351 	*big_rec = NULL;
3352 
3353 	ut_ad(mtr_memo_contains_flagged(
3354 		mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
3355 		MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
3356 	      || dict_table_is_intrinsic(cursor->index->table));
3357 	ut_ad(mtr_is_block_fix(
3358 		mtr, btr_cur_get_block(cursor),
3359 		MTR_MEMO_PAGE_X_FIX, cursor->index->table));
3360 	ut_ad(!dict_index_is_online_ddl(index)
3361 	      || dict_index_is_clust(index)
3362 	      || (flags & BTR_CREATE_FLAG));
3363 
3364 	cursor->flag = BTR_CUR_BINARY;
3365 
3366 	/* Check locks and write to undo log, if specified */
3367 
3368 	err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3369 					thr, mtr, &inherit);
3370 
3371 	if (err != DB_SUCCESS) {
3372 
3373 		return(err);
3374 	}
3375 
3376 	if (!(flags & BTR_NO_UNDO_LOG_FLAG)
3377 	    || dict_table_is_intrinsic(index->table)) {
3378 		/* First reserve enough free space for the file segments
3379 		of the index tree, so that the insert will not fail because
3380 		of lack of space */
3381 
3382 		ulint	n_extents = cursor->tree_height / 16 + 3;
3383 
3384 		success = fsp_reserve_free_extents(&n_reserved, index->space,
3385 						   n_extents, FSP_NORMAL, mtr);
3386 		if (!success) {
3387 			return(DB_OUT_OF_FILE_SPACE);
3388 		}
3389 	}
3390 
3391 	if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
3392 				   dict_table_is_comp(index->table),
3393 				   dtuple_get_n_fields(entry),
3394 				   dict_table_page_size(index->table))) {
3395 		/* The record is so big that we have to store some fields
3396 		externally on separate database pages */
3397 
3398 		if (UNIV_LIKELY_NULL(big_rec_vec)) {
3399 			/* This should never happen, but we handle
3400 			the situation in a robust manner. */
3401 			ut_ad(0);
3402 			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3403 		}
3404 
3405 		big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3406 
3407 		if (big_rec_vec == NULL) {
3408 
3409 			if (n_reserved > 0) {
3410 				fil_space_release_free_extents(index->space,
3411 							       n_reserved);
3412 			}
3413 			return(DB_TOO_BIG_RECORD);
3414 		}
3415 	}
3416 
3417 	if (dict_index_get_page(index)
3418 	    == btr_cur_get_block(cursor)->page.id.page_no()) {
3419 
3420 		/* The page is the root page */
3421 		*rec = btr_root_raise_and_insert(
3422 			flags, cursor, offsets, heap, entry, n_ext, mtr);
3423 	} else {
3424 		*rec = btr_page_split_and_insert(
3425 			flags, cursor, offsets, heap, entry, n_ext, mtr);
3426 	}
3427 
3428 	ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec
3429 	      || dict_index_is_spatial(index));
3430 
3431 	if (!(flags & BTR_NO_LOCKING_FLAG)) {
3432 		ut_ad(!dict_table_is_temporary(index->table));
3433 		if (dict_index_is_spatial(index)) {
3434 			/* Do nothing */
3435 		} else {
3436 			/* The cursor might have been moved to the other
3437 			page, so the max trx id field must be updated
3438 			after the cursor has been positioned. */
3439 			if (!dict_index_is_clust(index)) {
3440 				page_update_max_trx_id(
3441 					btr_cur_get_block(cursor),
3442 					btr_cur_get_page_zip(cursor),
3443 					thr_get_trx(thr)->id, mtr);
3444 			}
3445 			if (!page_rec_is_infimum(btr_cur_get_rec(cursor))
3446 			    || btr_page_get_prev(
3447 				buf_block_get_frame(
3448 					btr_cur_get_block(cursor)), mtr)
3449 			       == FIL_NULL) {
3450 				/* After a split and insert, we must
3451 				always call lock_update_insert(). */
3452 				inherit = TRUE;
3453 			}
3454 		}
3455 	}
3456 
3457 	if (!index->disable_ahi) {
3458 		btr_search_update_hash_on_insert(cursor);
3459 	}
3460 	if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {
3461 
3462 		lock_update_insert(btr_cur_get_block(cursor), *rec);
3463 	}
3464 
3465 	if (n_reserved > 0) {
3466 		fil_space_release_free_extents(index->space, n_reserved);
3467 	}
3468 
3469 	*big_rec = big_rec_vec;
3470 
3471 	return(DB_SUCCESS);
3472 }
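/* Illustrative sketch (hypothetical caller code, not part of the build):
callers such as the row insert code typically try the optimistic path
first and fall back to the pessimistic path only when the page must be
split; the flags, local variable names and error handling below are the
caller's responsibility and are shown only as an example:

	err = btr_cur_optimistic_insert(flags, cursor, &offsets, &heap,
					entry, &rec, &big_rec, n_ext,
					thr, mtr);
	if (err == DB_FAIL) {
		err = btr_cur_pessimistic_insert(flags, cursor, &offsets,
						 &heap, entry, &rec,
						 &big_rec, n_ext, thr, mtr);
	}
*/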
3473 
3474 /*==================== B-TREE UPDATE =========================*/
3475 
3476 /*************************************************************//**
3477 For an update, checks the locks and does the undo logging.
3478 @return DB_SUCCESS, DB_WAIT_LOCK, or error number */
3479 UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
3480 dberr_t
3481 btr_cur_upd_lock_and_undo(
3482 /*======================*/
3483 	ulint		flags,	/*!< in: undo logging and locking flags */
3484 	btr_cur_t*	cursor,	/*!< in: cursor on record to update */
3485 	const ulint*	offsets,/*!< in: rec_get_offsets() on cursor */
3486 	const upd_t*	update,	/*!< in: update vector */
3487 	ulint		cmpl_info,/*!< in: compiler info on secondary index
3488 				updates */
3489 	que_thr_t*	thr,	/*!< in: query thread
3490 				(can be NULL if BTR_NO_LOCKING_FLAG) */
3491 	mtr_t*		mtr,	/*!< in/out: mini-transaction */
3492 	roll_ptr_t*	roll_ptr)/*!< out: roll pointer */
3493 {
3494 	dict_index_t*	index;
3495 	const rec_t*	rec;
3496 	dberr_t		err;
3497 
3498 	ut_ad(thr != NULL || (flags & BTR_NO_LOCKING_FLAG));
3499 
3500 	rec = btr_cur_get_rec(cursor);
3501 	index = cursor->index;
3502 
3503 	ut_ad(rec_offs_validate(rec, index, offsets));
3504 	ut_ad(mtr->is_named_space(index->space));
3505 
3506 	if (!dict_index_is_clust(index)) {
3507 		ut_ad(dict_index_is_online_ddl(index)
3508 		      == !!(flags & BTR_CREATE_FLAG));
3509 
3510 		/* We do undo logging only when we update a clustered index
3511 		record */
3512 		return(lock_sec_rec_modify_check_and_lock(
3513 			       flags, btr_cur_get_block(cursor), rec,
3514 			       index, thr, mtr));
3515 	}
3516 
3517 	/* Check if we have to wait for a lock: enqueue an explicit lock
3518 	request if yes */
3519 
3520 	if (!(flags & BTR_NO_LOCKING_FLAG)) {
3521 		err = lock_clust_rec_modify_check_and_lock(
3522 			flags, btr_cur_get_block(cursor), rec, index,
3523 			offsets, thr);
3524 		if (err != DB_SUCCESS) {
3525 			return(err);
3526 		}
3527 	}
3528 
3529 	/* Append the info about the update in the undo log */
3530 
3531 	return(trx_undo_report_row_operation(
3532 		       flags, TRX_UNDO_MODIFY_OP, thr,
3533 		       index, NULL, update,
3534 		       cmpl_info, rec, offsets, roll_ptr));
3535 }
3536 
3537 /***********************************************************//**
3538 Writes a redo log record of updating a record in-place. */
3539 void
3540 btr_cur_update_in_place_log(
3541 /*========================*/
3542 	ulint		flags,		/*!< in: flags */
3543 	const rec_t*	rec,		/*!< in: record */
3544 	dict_index_t*	index,		/*!< in: index of the record */
3545 	const upd_t*	update,		/*!< in: update vector */
3546 	trx_id_t	trx_id,		/*!< in: transaction id */
3547 	roll_ptr_t	roll_ptr,	/*!< in: roll ptr */
3548 	mtr_t*		mtr)		/*!< in: mtr */
3549 {
3550 	byte*		log_ptr;
3551 	const page_t*	page	= page_align(rec);
3552 	ut_ad(flags < 256);
3553 	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
3554 
3555 	log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
3556 					    ? MLOG_COMP_REC_UPDATE_IN_PLACE
3557 					    : MLOG_REC_UPDATE_IN_PLACE,
3558 					    1 + DATA_ROLL_PTR_LEN + 14 + 2
3559 					    + MLOG_BUF_MARGIN);
3560 
3561 	if (!log_ptr) {
3562 		/* Logging in mtr is switched off during crash recovery */
3563 		return;
3564 	}
3565 
3566 	/* For secondary indexes, we could skip writing the dummy system
3567 	fields to the redo log, but that would require changing the redo log
3568 	parsing of MLOG_REC_UPDATE_IN_PLACE/MLOG_COMP_REC_UPDATE_IN_PLACE or
3569 	adding a new redo log record type. For now, just write the dummy sys
3570 	fields to the redo log if we are updating a secondary index
3571 	record. */
3572 	mach_write_to_1(log_ptr, flags);
3573 	log_ptr++;
3574 
3575 	if (dict_index_is_clust(index)) {
3576 		log_ptr = row_upd_write_sys_vals_to_log(
3577 				index, trx_id, roll_ptr, log_ptr, mtr);
3578 	} else {
3579 		/* Dummy system fields for a secondary index */
3580 		/* TRX_ID Position */
3581 		log_ptr += mach_write_compressed(log_ptr, 0);
3582 		/* ROLL_PTR */
3583 		trx_write_roll_ptr(log_ptr, 0);
3584 		log_ptr += DATA_ROLL_PTR_LEN;
3585 		/* TRX_ID */
3586 		log_ptr += mach_u64_write_compressed(log_ptr, 0);
3587 	}
3588 
3589 	mach_write_to_2(log_ptr, page_offset(rec));
3590 	log_ptr += 2;
3591 
3592 	row_upd_index_write_log(update, log_ptr, mtr);
3593 }
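/* For reference (derived from the code above): the body of an
MLOG_REC_UPDATE_IN_PLACE / MLOG_COMP_REC_UPDATE_IN_PLACE record is
laid out as

	1 byte		flags
	sys fields	DB_TRX_ID position (compressed), DB_ROLL_PTR
			(DATA_ROLL_PTR_LEN bytes), DB_TRX_ID (compressed)
	2 bytes		page offset of the record
	...		update vector (row_upd_index_write_log())

btr_cur_parse_update_in_place() below consumes the fields in the same
order. */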
3594 #endif /* UNIV_HOTBACKUP */
3595 
3596 /***********************************************************//**
3597 Parses a redo log record of updating a record in-place.
3598 @return end of log record or NULL */
3599 byte*
3600 btr_cur_parse_update_in_place(
3601 /*==========================*/
3602 	byte*		ptr,	/*!< in: buffer */
3603 	byte*		end_ptr,/*!< in: buffer end */
3604 	page_t*		page,	/*!< in/out: page or NULL */
3605 	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
3606 	dict_index_t*	index)	/*!< in: index corresponding to page */
3607 {
3608 	ulint		flags;
3609 	rec_t*		rec;
3610 	upd_t*		update;
3611 	ulint		pos;
3612 	trx_id_t	trx_id;
3613 	roll_ptr_t	roll_ptr;
3614 	ulint		rec_offset;
3615 	mem_heap_t*	heap;
3616 	ulint*		offsets;
3617 
3618 	if (end_ptr < ptr + 1) {
3619 
3620 		return(NULL);
3621 	}
3622 
3623 	flags = mach_read_from_1(ptr);
3624 	ptr++;
3625 
3626 	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
3627 
3628 	if (ptr == NULL) {
3629 
3630 		return(NULL);
3631 	}
3632 
3633 	if (end_ptr < ptr + 2) {
3634 
3635 		return(NULL);
3636 	}
3637 
3638 	rec_offset = mach_read_from_2(ptr);
3639 	ptr += 2;
3640 
3641 	ut_a(rec_offset <= UNIV_PAGE_SIZE);
3642 
3643 	heap = mem_heap_create(256);
3644 
3645 	ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
3646 
3647 	if (!ptr || !page) {
3648 
3649 		goto func_exit;
3650 	}
3651 
3652 	ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
3653 	rec = page + rec_offset;
3654 
3655 	/* We do not need to reserve search latch, as the page is only
3656 	being recovered, and there cannot be a hash index to it. */
3657 
3658 	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
3659 
3660 	if (!(flags & BTR_KEEP_SYS_FLAG)) {
3661 		row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
3662 						   pos, trx_id, roll_ptr);
3663 	}
3664 
3665 	row_upd_rec_in_place(rec, index, offsets, update, page_zip);
3666 
3667 func_exit:
3668 	mem_heap_free(heap);
3669 
3670 	return(ptr);
3671 }
3672 
3673 #ifndef UNIV_HOTBACKUP
3674 /*************************************************************//**
3675 See if there is enough space in the page modification log to log
3676 an update-in-place.
3677 
3678 @retval false if out of space; IBUF_BITMAP_FREE will be reset
3679 outside mtr if the page was recompressed
3680 @retval true if there is enough space
3681 
3682 IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
3683 a secondary index leaf page. This has to be done either within the
3684 same mini-transaction, or by invoking ibuf_reset_free_bits() before
3685 mtr_commit(mtr). */
3686 bool
3687 btr_cur_update_alloc_zip_func(
3688 /*==========================*/
3689 	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
3690 	page_cur_t*	cursor,	/*!< in/out: B-tree page cursor */
3691 	dict_index_t*	index,	/*!< in: the index corresponding to cursor */
3692 #ifdef UNIV_DEBUG
3693 	ulint*		offsets,/*!< in/out: offsets of the cursor record */
3694 #endif /* UNIV_DEBUG */
3695 	ulint		length,	/*!< in: size needed */
3696 	bool		create,	/*!< in: true=delete-and-insert,
3697 				false=update-in-place */
3698 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
3699 {
3700 	const page_t*	page = page_cur_get_page(cursor);
3701 
3702 	ut_ad(page_zip == page_cur_get_page_zip(cursor));
3703 	ut_ad(page_zip);
3704 	ut_ad(!dict_index_is_ibuf(index));
3705 	ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
3706 
3707 	if (page_zip_available(page_zip, dict_index_is_clust(index),
3708 			       length, create)) {
3709 		return(true);
3710 	}
3711 
3712 	if (!page_zip->m_nonempty && !page_has_garbage(page)) {
3713 		/* The page has been freshly compressed, so
3714 		reorganizing it will not help. */
3715 		return(false);
3716 	}
3717 
3718 	if (create && page_is_leaf(page)
3719 	    && (length + page_get_data_size(page)
3720 		>= dict_index_zip_pad_optimal_page_size(index))) {
3721 		return(false);
3722 	}
3723 
3724 	if (!btr_page_reorganize(cursor, index, mtr)) {
3725 		goto out_of_space;
3726 	}
3727 
3728 	rec_offs_make_valid(page_cur_get_rec(cursor), index, offsets);
3729 
3730 	/* After recompressing a page, we must make sure that the free
3731 	bits in the insert buffer bitmap will not exceed the free
3732 	space on the page.  Because this function will not attempt
3733 	recompression unless page_zip_available() fails above, it is
3734 	safe to reset the free bits if page_zip_available() fails
3735 	again, below.  The free bits can safely be reset in a separate
3736 	mini-transaction.  If page_zip_available() succeeds below, we
3737 	can be sure that the btr_page_reorganize() above did not reduce
3738 	the free space available on the page. */
3739 
3740 	if (page_zip_available(page_zip, dict_index_is_clust(index),
3741 			       length, create)) {
3742 		return(true);
3743 	}
3744 
3745 out_of_space:
3746 	ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
3747 
3748 	/* Out of space: reset the free bits. */
3749 	if (!dict_index_is_clust(index)
3750 	    && !dict_table_is_temporary(index->table)
3751 	    && page_is_leaf(page)) {
3752 		ibuf_reset_free_bits(page_cur_get_block(cursor));
3753 	}
3754 
3755 	return(false);
3756 }
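/* Note: callers in this file invoke the function above through the
btr_cur_update_alloc_zip() wrapper (presumably a macro in btr0cur.h)
that passes the offsets argument only in UNIV_DEBUG builds, matching
the #ifdef UNIV_DEBUG parameter above. A typical call site, as used
later in this file:

	if (!btr_cur_update_alloc_zip(page_zip, page_cursor, index,
				      offsets, new_rec_size, true, mtr)) {
		return(DB_ZIP_OVERFLOW);
	}
*/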
3757 
3758 /*************************************************************//**
3759 Updates a record when the update causes no size changes in its fields.
3760 We assume here that the ordering fields of the record do not change.
3761 @return locking or undo log related error code, or
3762 @retval DB_SUCCESS on success
3763 @retval DB_ZIP_OVERFLOW if there is not enough space left
3764 on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
3765 dberr_t
3766 btr_cur_update_in_place(
3767 /*====================*/
3768 	ulint		flags,	/*!< in: undo logging and locking flags */
3769 	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
3770 				cursor stays valid and positioned on the
3771 				same record */
3772 	ulint*		offsets,/*!< in/out: offsets on cursor->page_cur.rec */
3773 	const upd_t*	update,	/*!< in: update vector */
3774 	ulint		cmpl_info,/*!< in: compiler info on secondary index
3775 				updates */
3776 	que_thr_t*	thr,	/*!< in: query thread */
3777 	trx_id_t	trx_id,	/*!< in: transaction id */
3778 	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
3779 				is a secondary index, the caller must
3780 				mtr_commit(mtr) before latching any
3781 				further pages */
3782 {
3783 	dict_index_t*	index;
3784 	buf_block_t*	block;
3785 	page_zip_des_t*	page_zip;
3786 	dberr_t		err;
3787 	rec_t*		rec;
3788 	roll_ptr_t	roll_ptr	= 0;
3789 	ulint		was_delete_marked;
3790 	ibool		is_hashed;
3791 
3792 	rec = btr_cur_get_rec(cursor);
3793 	index = cursor->index;
3794 	ut_ad(rec_offs_validate(rec, index, offsets));
3795 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
3796 	ut_ad(trx_id > 0
3797 	      || (flags & BTR_KEEP_SYS_FLAG)
3798 	      || dict_table_is_intrinsic(index->table));
3799 	/* The insert buffer tree should never be updated in place. */
3800 	ut_ad(!dict_index_is_ibuf(index));
3801 	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
3802 	      || dict_index_is_clust(index));
3803 	ut_ad(thr_get_trx(thr)->id == trx_id
3804 	      || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
3805 	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
3806 		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
3807 	ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor)));
3808 	ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
3809 
3810 	DBUG_PRINT("ib_cur", ("update-in-place %s (" IB_ID_FMT
3811 			      ") by " TRX_ID_FMT ": %s",
3812 			      index->name(), index->id, trx_id,
3813 			      rec_printer(rec, offsets).str().c_str()));
3814 
3815 	block = btr_cur_get_block(cursor);
3816 	page_zip = buf_block_get_page_zip(block);
3817 
3818 	/* Check that enough space is available on the compressed page. */
3819 	if (page_zip) {
3820 		if (!btr_cur_update_alloc_zip(
3821 			    page_zip, btr_cur_get_page_cur(cursor),
3822 			    index, offsets, rec_offs_size(offsets),
3823 			    false, mtr)) {
3824 			return(DB_ZIP_OVERFLOW);
3825 		}
3826 
3827 		rec = btr_cur_get_rec(cursor);
3828 	}
3829 
3830 	/* Do lock checking and undo logging */
3831 	err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
3832 					update, cmpl_info,
3833 					thr, mtr, &roll_ptr);
3834 	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
3835 		/* We may need to update the IBUF_BITMAP_FREE
3836 		bits after a reorganize that was done in
3837 		btr_cur_update_alloc_zip(). */
3838 		goto func_exit;
3839 	}
3840 
3841 	if (!(flags & BTR_KEEP_SYS_FLAG)
3842 	    && !dict_table_is_intrinsic(index->table)) {
3843 		row_upd_rec_sys_fields(rec, NULL, index, offsets,
3844 				       thr_get_trx(thr), roll_ptr);
3845 	}
3846 
3847 	was_delete_marked = rec_get_deleted_flag(
3848 		rec, page_is_comp(buf_block_get_frame(block)));
3849 
3850 	is_hashed = (block->index != NULL);
3851 
3852 	if (is_hashed) {
3853 		/* TO DO: Can we skip this if none of the fields
3854 		index->search_info->curr_n_fields
3855 		are being updated? */
3856 
3857 		/* The function row_upd_changes_ord_field_binary works only
3858 		if the update vector was built for a clustered index; we must
3859 		NOT call it if the index is secondary. */
3860 
3861 		if (!dict_index_is_clust(index)
3862 		    || row_upd_changes_ord_field_binary(index, update, thr,
3863 							NULL, NULL)) {
3864 
3865 			/* Remove possible hash index pointer to this record */
3866 			btr_search_update_hash_on_delete(cursor);
3867 		}
3868 
3869 		rw_lock_x_lock(btr_get_search_latch(index));
3870 	}
3871 
3872 	assert_block_ahi_valid(block);
3873 	row_upd_rec_in_place(rec, index, offsets, update, page_zip);
3874 
3875 	if (is_hashed) {
3876 		rw_lock_x_unlock(btr_get_search_latch(index));
3877 	}
3878 
3879 	btr_cur_update_in_place_log(flags, rec, index, update,
3880 				    trx_id, roll_ptr, mtr);
3881 
3882 	if (was_delete_marked
3883 	    && !rec_get_deleted_flag(
3884 		    rec, page_is_comp(buf_block_get_frame(block)))) {
3885 		/* The new updated record owns its possible externally
3886 		stored fields */
3887 
3888 		btr_cur_unmark_extern_fields(page_zip,
3889 					     rec, index, offsets, mtr);
3890 	}
3891 
3892 	ut_ad(err == DB_SUCCESS);
3893 
3894 func_exit:
3895 	if (page_zip
3896 	    && !(flags & BTR_KEEP_IBUF_BITMAP)
3897 	    && !dict_index_is_clust(index)
3898 	    && !dict_table_is_temporary(index->table)
3899 	    && page_is_leaf(buf_block_get_frame(block))) {
3900 		/* Update the free bits in the insert buffer. */
3901 		ibuf_update_free_bits_zip(block, mtr);
3902 	}
3903 
3904 	return(err);
3905 }
3906 
3907 /*************************************************************//**
3908 Tries to update a record on a page in an index tree. It is assumed that mtr
3909 holds an x-latch on the page. The operation does not succeed if there is too
3910 little space on the page or if the update would leave the page so empty
3911 that tree compression is recommended. We assume here that the ordering
3912 fields of the record do not change.
3913 @return error code, including
3914 @retval DB_SUCCESS on success
3915 @retval DB_OVERFLOW if the updated record does not fit
3916 @retval DB_UNDERFLOW if the page would become too empty
3917 @retval DB_ZIP_OVERFLOW if there is not enough space left
3918 on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
3919 dberr_t
3920 btr_cur_optimistic_update(
3921 /*======================*/
3922 	ulint		flags,	/*!< in: undo logging and locking flags */
3923 	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
3924 				cursor stays valid and positioned on the
3925 				same record */
3926 	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
3927 	mem_heap_t**	heap,	/*!< in/out: pointer to NULL or memory heap */
3928 	const upd_t*	update,	/*!< in: update vector; this must also
3929 				contain trx id and roll ptr fields */
3930 	ulint		cmpl_info,/*!< in: compiler info on secondary index
3931 				updates */
3932 	que_thr_t*	thr,	/*!< in: query thread */
3933 	trx_id_t	trx_id,	/*!< in: transaction id */
3934 	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
3935 				is a secondary index, the caller must
3936 				mtr_commit(mtr) before latching any
3937 				further pages */
3938 {
3939 	dict_index_t*	index;
3940 	page_cur_t*	page_cursor;
3941 	dberr_t		err;
3942 	buf_block_t*	block;
3943 	page_t*		page;
3944 	page_zip_des_t*	page_zip;
3945 	rec_t*		rec;
3946 	ulint		max_size;
3947 	ulint		new_rec_size;
3948 	ulint		old_rec_size;
3949 	ulint		max_ins_size = 0;
3950 	dtuple_t*	new_entry;
3951 	roll_ptr_t	roll_ptr;
3952 	ulint		i;
3953 	ulint		n_ext;
3954 
3955 	block = btr_cur_get_block(cursor);
3956 	page = buf_block_get_frame(block);
3957 	rec = btr_cur_get_rec(cursor);
3958 	index = cursor->index;
3959 	ut_ad(trx_id > 0
3960 	      || (flags & BTR_KEEP_SYS_FLAG)
3961 	      || dict_table_is_intrinsic(index->table));
3962 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
3963 	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
3964 	/* This is intended only for leaf page updates */
3965 	ut_ad(page_is_leaf(page));
3966 	/* The insert buffer tree should never be updated in place. */
3967 	ut_ad(!dict_index_is_ibuf(index));
3968 	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
3969 	      || dict_index_is_clust(index));
3970 	ut_ad(thr_get_trx(thr)->id == trx_id
3971 	      || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
3972 	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
3973 		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
3974 	ut_ad(fil_page_index_page_check(page));
3975 	ut_ad(btr_page_get_index_id(page) == index->id);
3976 
3977 	*offsets = rec_get_offsets(rec, index, *offsets,
3978 				   ULINT_UNDEFINED, heap);
3979 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
3980 	ut_a(!rec_offs_any_null_extern(rec, *offsets)
3981 	     || trx_is_recv(thr_get_trx(thr)));
3982 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
3983 
3984 	if (!row_upd_changes_field_size_or_external(index, *offsets, update)) {
3985 
3986 		/* The simplest and the most common case: the update does not
3987 		change the size of any field and none of the updated fields is
3988 		externally stored in rec or update, and there is enough space
3989 		on the compressed page to log the update. */
3990 
3991 		return(btr_cur_update_in_place(
3992 			       flags, cursor, *offsets, update,
3993 			       cmpl_info, thr, trx_id, mtr));
3994 	}
3995 
3996 	if (rec_offs_any_extern(*offsets)) {
3997 any_extern:
3998 		/* Externally stored fields are treated in pessimistic
3999 		update */
4000 
4001 		/* prefetch siblings of the leaf for the pessimistic
4002 		operation. */
4003 		btr_cur_prefetch_siblings(block);
4004 
4005 		return(DB_OVERFLOW);
4006 	}
4007 
4008 	for (i = 0; i < upd_get_n_fields(update); i++) {
4009 		if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
4010 
4011 			goto any_extern;
4012 		}
4013 	}
4014 
4015 	DBUG_PRINT("ib_cur", ("update %s (" IB_ID_FMT ") by " TRX_ID_FMT
4016 			      ": %s",
4017 			      index->name(), index->id, trx_id,
4018 			      rec_printer(rec, *offsets).str().c_str()));
4019 
4020 	page_cursor = btr_cur_get_page_cur(cursor);
4021 
4022 	if (!*heap) {
4023 		*heap = mem_heap_create(
4024 			rec_offs_size(*offsets)
4025 			+ DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
4026 	}
4027 
4028 	new_entry = row_rec_to_index_entry(rec, index, *offsets,
4029 					   &n_ext, *heap);
4030 	/* We checked above that there are no externally stored fields. */
4031 	ut_a(!n_ext);
4032 
4033 	/* The page containing the clustered index record
4034 	corresponding to new_entry is latched in mtr.
4035 	Thus the following call is safe. */
4036 	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4037 						     FALSE, *heap);
4038 	old_rec_size = rec_offs_size(*offsets);
4039 	new_rec_size = rec_get_converted_size(index, new_entry, 0);
4040 
4041 	page_zip = buf_block_get_page_zip(block);
4042 #ifdef UNIV_ZIP_DEBUG
4043 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4044 #endif /* UNIV_ZIP_DEBUG */
4045 
4046 	if (page_zip) {
4047 		if (!btr_cur_update_alloc_zip(
4048 			    page_zip, page_cursor, index, *offsets,
4049 			    new_rec_size, true, mtr)) {
4050 			return(DB_ZIP_OVERFLOW);
4051 		}
4052 
4053 		rec = page_cur_get_rec(page_cursor);
4054 	}
4055 
4056 	/* We limit max record size to 16k even for 64k page size. */
4057 	if (new_rec_size >= REC_MAX_DATA_SIZE) {
4058 		err = DB_OVERFLOW;
4059 
4060 		goto func_exit;
4061 	}
4062 
4063 	if (UNIV_UNLIKELY(new_rec_size
4064 			  >= (page_get_free_space_of_empty(page_is_comp(page))
4065 			      / 2))) {
4066 		/* We may need to update the IBUF_BITMAP_FREE
4067 		bits after a reorganize that was done in
4068 		btr_cur_update_alloc_zip(). */
4069 		err = DB_OVERFLOW;
4070 		goto func_exit;
4071 	}
4072 
4073 	if (UNIV_UNLIKELY(page_get_data_size(page)
4074 			  - old_rec_size + new_rec_size
4075 			  < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) {
4076 		/* We may need to update the IBUF_BITMAP_FREE
4077 		bits after a reorganize that was done in
4078 		btr_cur_update_alloc_zip(). */
4079 
4080 		/* The page would become too empty */
4081 		err = DB_UNDERFLOW;
4082 		goto func_exit;
4083 	}
4084 
4085 	/* We do not attempt to reorganize if the page is compressed.
4086 	This is because the page may fail to compress after reorganization. */
4087 	max_size = page_zip
4088 		? page_get_max_insert_size(page, 1)
4089 		: (old_rec_size
4090 		   + page_get_max_insert_size_after_reorganize(page, 1));
4091 
4092 	if (!page_zip) {
4093 		max_ins_size = page_get_max_insert_size_after_reorganize(
4094 				page, 1);
4095 	}
4096 
4097 	if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
4098 	       && (max_size >= new_rec_size))
4099 	      || (page_get_n_recs(page) <= 1))) {
4100 
4101 		/* We may need to update the IBUF_BITMAP_FREE
4102 		bits after a reorganize that was done in
4103 		btr_cur_update_alloc_zip(). */
4104 
4105 		/* There was not enough space, or it did not pay to
4106 		reorganize: for simplicity, we decide what to do assuming a
4107 		reorganization is needed, though it might not be necessary */
4108 
4109 		err = DB_OVERFLOW;
4110 		goto func_exit;
4111 	}
4112 
4113 	/* Do lock checking and undo logging */
4114 	err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4115 					update, cmpl_info,
4116 					thr, mtr, &roll_ptr);
4117 	if (err != DB_SUCCESS) {
4118 		/* We may need to update the IBUF_BITMAP_FREE
4119 		bits after a reorganize that was done in
4120 		btr_cur_update_alloc_zip(). */
4121 		goto func_exit;
4122 	}
4123 
4124 	/* Ok, we may do the replacement. Store on the page infimum the
4125 	explicit locks on rec, before deleting rec (see the comment in
4126 	btr_cur_pessimistic_update). */
4127 	if (!dict_table_is_locking_disabled(index->table)) {
4128 		lock_rec_store_on_page_infimum(block, rec);
4129 	}
4130 
4131 	btr_search_update_hash_on_delete(cursor);
4132 
4133 	page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4134 
4135 	page_cur_move_to_prev(page_cursor);
4136 
4137 	if (!(flags & BTR_KEEP_SYS_FLAG)
4138 	    && !dict_table_is_intrinsic(index->table)) {
4139 		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
4140 					      roll_ptr);
4141 		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
4142 					      trx_id);
4143 	}
4144 
4145 	/* There are no externally stored columns in new_entry */
4146 	rec = btr_cur_insert_if_possible(
4147 		cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
4148 	ut_a(rec); /* <- We calculated above the insert would fit */
4149 
4150 	/* Restore the old explicit lock state on the record */
4151 	if (!dict_table_is_locking_disabled(index->table)) {
4152 		lock_rec_restore_from_page_infimum(block, rec, block);
4153 	}
4154 
4155 	page_cur_move_to_next(page_cursor);
4156 	ut_ad(err == DB_SUCCESS);
4157 
4158 func_exit:
4159 	if (!(flags & BTR_KEEP_IBUF_BITMAP)
4160 	    && !dict_index_is_clust(index)
4161 	    && !dict_table_is_temporary(index->table)) {
4162 		/* Update the free bits in the insert buffer. */
4163 		if (page_zip) {
4164 			ibuf_update_free_bits_zip(block, mtr);
4165 		} else {
4166 			ibuf_update_free_bits_low(block, max_ins_size, mtr);
4167 		}
4168 	}
4169 
4170 	if (err != DB_SUCCESS) {
4171 		/* prefetch siblings of the leaf for the pessimistic
4172 		operation. */
4173 		btr_cur_prefetch_siblings(block);
4174 	}
4175 
4176 	return(err);
4177 }
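/* Illustrative note (excerpted from btr_cur_pessimistic_update() below):
the pessimistic update first retries this optimistic path with
BTR_KEEP_IBUF_BITMAP added to the flags, and continues with the
pessimistic code only on the "soft" failure codes:

	err = btr_cur_optimistic_update(
		flags | BTR_KEEP_IBUF_BITMAP,
		cursor, offsets, offsets_heap, update,
		cmpl_info, thr, trx_id, mtr);

	switch (err) {
	case DB_ZIP_OVERFLOW:
	case DB_UNDERFLOW:
	case DB_OVERFLOW:
		break;		(fall through to the pessimistic path)
	default:
		return(err);	(success or hard error)
	}
*/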
4178 
4179 /*************************************************************//**
4180 If, in a split, a new supremum record was created as the predecessor of the
4181 updated record, the supremum record must inherit exactly the locks on the
4182 updated record. In the split it may have inherited locks from the successor
4183 of the updated record, which is not correct. This function restores the
4184 right locks for the new supremum. */
4185 static
4186 void
4187 btr_cur_pess_upd_restore_supremum(
4188 /*==============================*/
4189 	buf_block_t*	block,	/*!< in: buffer block of rec */
4190 	const rec_t*	rec,	/*!< in: updated record */
4191 	mtr_t*		mtr)	/*!< in: mtr */
4192 {
4193 	page_t*		page;
4194 	buf_block_t*	prev_block;
4195 
4196 	page = buf_block_get_frame(block);
4197 
4198 	if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
4199 		/* Updated record is not the first user record on its page */
4200 
4201 		return;
4202 	}
4203 
4204 	const ulint	prev_page_no = btr_page_get_prev(page, mtr);
4205 
4206 	const page_id_t	page_id(block->page.id.space(), prev_page_no);
4207 
4208 	ut_ad(prev_page_no != FIL_NULL);
4209 	prev_block = buf_page_get_with_no_latch(page_id, block->page.size, mtr);
4210 #ifdef UNIV_BTR_DEBUG
4211 	ut_a(btr_page_get_next(prev_block->frame, mtr)
4212 	     == page_get_page_no(page));
4213 #endif /* UNIV_BTR_DEBUG */
4214 
4215 	/* We must already have an x-latch on prev_block! */
4216 	ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
4217 
4218 	lock_rec_reset_and_inherit_gap_locks(prev_block, block,
4219 					     PAGE_HEAP_NO_SUPREMUM,
4220 					     page_rec_get_heap_no(rec));
4221 }
4222 
4223 /*************************************************************//**
4224 Performs an update of a record on a page of a tree. It is assumed
4225 that mtr holds an x-latch on the tree and on the cursor page. If the
4226 update is made on the leaf level, to avoid deadlocks, mtr must also
4227 own x-latches to brothers of page, if those brothers exist. We assume
4228 here that the ordering fields of the record do not change.
4229 @return DB_SUCCESS or error code */
4230 dberr_t
4231 btr_cur_pessimistic_update(
4232 /*=======================*/
4233 	ulint		flags,	/*!< in: undo logging, locking, and rollback
4234 				flags */
4235 	btr_cur_t*	cursor,	/*!< in/out: cursor on the record to update;
4236 				cursor may become invalid if *big_rec == NULL
4237 				|| !(flags & BTR_KEEP_POS_FLAG) */
4238 	ulint**		offsets,/*!< out: offsets on cursor->page_cur.rec */
4239 	mem_heap_t**	offsets_heap,
4240 				/*!< in/out: pointer to memory heap
4241 				that can be emptied, or NULL */
4242 	mem_heap_t*	entry_heap,
4243 				/*!< in/out: memory heap for allocating
4244 				big_rec and the index tuple */
4245 	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
4246 				be stored externally by the caller, or NULL */
4247 	upd_t*		update,	/*!< in/out: update vector; this is allowed to
4248 				also contain trx id and roll ptr fields.
4249 				Non-updated columns that are moved offpage will
4250 				be appended to this. */
4251 	ulint		cmpl_info,/*!< in: compiler info on secondary index
4252 				updates */
4253 	que_thr_t*	thr,	/*!< in: query thread */
4254 	trx_id_t	trx_id,	/*!< in: transaction id */
4255 	mtr_t*		mtr)	/*!< in/out: mini-transaction; must be
4256 				committed before latching any further pages */
4257 {
4258 	big_rec_t*	big_rec_vec	= NULL;
4259 	big_rec_t*	dummy_big_rec;
4260 	dict_index_t*	index;
4261 	buf_block_t*	block;
4262 	page_t*		page;
4263 	page_zip_des_t*	page_zip;
4264 	rec_t*		rec;
4265 	page_cur_t*	page_cursor;
4266 	dberr_t		err;
4267 	dberr_t		optim_err;
4268 	roll_ptr_t	roll_ptr;
4269 	ibool		was_first;
4270 	ulint		n_reserved	= 0;
4271 	ulint		n_ext;
4272 	ulint		max_ins_size	= 0;
4273 
4274 	*offsets = NULL;
4275 	*big_rec = NULL;
4276 
4277 	block = btr_cur_get_block(cursor);
4278 	page = buf_block_get_frame(block);
4279 	page_zip = buf_block_get_page_zip(block);
4280 	index = cursor->index;
4281 
4282 	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
4283 					MTR_MEMO_X_LOCK |
4284 					MTR_MEMO_SX_LOCK)
4285 	      || dict_table_is_intrinsic(index->table));
4286 	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
4287 #ifdef UNIV_ZIP_DEBUG
4288 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4289 #endif /* UNIV_ZIP_DEBUG */
4290 	/* The insert buffer tree should never be updated in place. */
4291 	ut_ad(!dict_index_is_ibuf(index));
4292 	ut_ad(trx_id > 0
4293 	      || (flags & BTR_KEEP_SYS_FLAG)
4294 	      || dict_table_is_intrinsic(index->table));
4295 	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4296 	      || dict_index_is_clust(index));
4297 	ut_ad(thr_get_trx(thr)->id == trx_id
4298 	      || (flags & ~BTR_KEEP_POS_FLAG)
4299 	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4300 		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4301 
4302 	err = optim_err = btr_cur_optimistic_update(
4303 		flags | BTR_KEEP_IBUF_BITMAP,
4304 		cursor, offsets, offsets_heap, update,
4305 		cmpl_info, thr, trx_id, mtr);
4306 
4307 	switch (err) {
4308 	case DB_ZIP_OVERFLOW:
4309 	case DB_UNDERFLOW:
4310 	case DB_OVERFLOW:
4311 		break;
4312 	default:
4313 	err_exit:
4314 		/* We suppressed this with BTR_KEEP_IBUF_BITMAP.
4315 		For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
4316 		already reset by btr_cur_update_alloc_zip() if the
4317 		page was recompressed. */
4318 		if (page_zip
4319 		    && optim_err != DB_ZIP_OVERFLOW
4320 		    && !dict_index_is_clust(index)
4321 		    && !dict_table_is_temporary(index->table)
4322 		    && page_is_leaf(page)) {
4323 			ibuf_update_free_bits_zip(block, mtr);
4324 		}
4325 
4326 		if (big_rec_vec != NULL) {
4327 			dtuple_big_rec_free(big_rec_vec);
4328 		}
4329 
4330 		return(err);
4331 	}
4332 
4333 	rec = btr_cur_get_rec(cursor);
4334 
4335 	*offsets = rec_get_offsets(
4336 		rec, index, *offsets, ULINT_UNDEFINED, offsets_heap);
4337 
4338 	dtuple_t*	new_entry = row_rec_to_index_entry(
4339 		rec, index, *offsets, &n_ext, entry_heap);
4340 
4341 	/* The page containing the clustered index record
4342 	corresponding to new_entry is latched in mtr.  If the
4343 	clustered index record is delete-marked, then its externally
4344 	stored fields cannot have been purged yet, because then the
4345 	purge would also have removed the clustered index record
4346 	itself.  Thus the following call is safe. */
4347 	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4348 						     FALSE, entry_heap);
4349 
4350 	/* We have to set appropriate extern storage bits in the new
4351 	record to be inserted: we have to remember which fields were such */
4352 
4353 	ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
4354 	ut_ad(rec_offs_validate(rec, index, *offsets));
4355 
4356 	/* Get number of externally stored columns in updated record */
4357 	n_ext = new_entry->get_n_ext();
4358 
4359 	/* UNDO logging is also turned off during normal operation on an intrinsic
4360 	table, so the condition needs to ensure that the table is not intrinsic. */
4361 	if ((flags & BTR_NO_UNDO_LOG_FLAG)
4362 	    && rec_offs_any_extern(*offsets)
4363 	    && !dict_table_is_intrinsic(index->table)) {
4364 		/* We are in a transaction rollback undoing a row
4365 		update: we must free possible externally stored fields
4366 		which got new values in the update, if they are not
4367 		inherited values. They can be inherited if we have
4368 		updated the primary key to another value, and then
4369 		update it back again. */
4370 
4371 		ut_ad(big_rec_vec == NULL);
4372 		ut_ad(dict_index_is_clust(index));
4373 		ut_ad(thr_get_trx(thr)->in_rollback);
4374 
4375 		DBUG_EXECUTE_IF("ib_blob_update_rollback", DBUG_SUICIDE(););
4376 		RECOVERY_CRASH(99);
4377 
4378 		btr_rec_free_updated_extern_fields(
4379 			index, rec, page_zip, *offsets, update, true, mtr);
4380 	}
4381 
4382 	if (page_zip_rec_needs_ext(
4383 			rec_get_converted_size(index, new_entry, n_ext),
4384 			page_is_comp(page),
4385 			dict_index_get_n_fields(index),
4386 			block->page.size)) {
4387 
4388 		big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext);
4389 		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
4390 
4391 			/* We cannot goto return_after_reservations,
4392 			because we may need to update the
4393 			IBUF_BITMAP_FREE bits, which was suppressed by
4394 			BTR_KEEP_IBUF_BITMAP. */
4395 #ifdef UNIV_ZIP_DEBUG
4396 			ut_a(!page_zip
4397 			     || page_zip_validate(page_zip, page, index));
4398 #endif /* UNIV_ZIP_DEBUG */
4399 			if (n_reserved > 0) {
4400 				fil_space_release_free_extents(
4401 					index->space, n_reserved);
4402 			}
4403 
4404 			err = DB_TOO_BIG_RECORD;
4405 			goto err_exit;
4406 		}
4407 
4408 		ut_ad(page_is_leaf(page));
4409 		ut_ad(dict_index_is_clust(index));
4410 		ut_ad(flags & BTR_KEEP_POS_FLAG);
4411 	}
4412 
4413 	/* Do lock checking and undo logging */
4414 	err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4415 					update, cmpl_info,
4416 					thr, mtr, &roll_ptr);
4417 	if (err != DB_SUCCESS) {
4418 		goto err_exit;
4419 	}
4420 
4421 	if (optim_err == DB_OVERFLOW) {
4422 
4423 		/* First reserve enough free space for the file segments
4424 		of the index tree, so that the update will not fail because
4425 		of lack of space */
4426 
4427 		ulint	n_extents = cursor->tree_height / 16 + 3;
4428 
4429 		if (!fsp_reserve_free_extents(
4430 		            &n_reserved, index->space, n_extents,
4431 		            flags & BTR_NO_UNDO_LOG_FLAG
4432 		            ? FSP_CLEANING : FSP_NORMAL,
4433 		            mtr)) {
4434 			err = DB_OUT_OF_FILE_SPACE;
4435 			goto err_exit;
4436 		}
4437 	}
4438 
4439 	if (!(flags & BTR_KEEP_SYS_FLAG)
4440 	    && !dict_table_is_intrinsic(index->table)) {
4441 		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
4442 					      roll_ptr);
4443 		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
4444 					      trx_id);
4445 	}
4446 
4447 	if (!page_zip) {
4448 		max_ins_size = page_get_max_insert_size_after_reorganize(
4449 				page, 1);
4450 	}
4451 
4452 	/* Store state of explicit locks on rec on the page infimum record,
4453 	before deleting rec. The page infimum acts as a dummy carrier of the
4454 	locks, taking care also of lock releases, before we can move the locks
4455 	back on the actual record. There is a special case: if we are
4456 	inserting on the root page and the insert causes a call of
4457 	btr_root_raise_and_insert, we cannot, in the lock system, delete
4458 	the lock structs set on the root page even if the root page
4459 	carries just node pointers. */
4460 	if (!dict_table_is_locking_disabled(index->table)) {
4461 		lock_rec_store_on_page_infimum(block, rec);
4462 	}
4463 
4464 	btr_search_update_hash_on_delete(cursor);
4465 
4466 #ifdef UNIV_ZIP_DEBUG
4467 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4468 #endif /* UNIV_ZIP_DEBUG */
4469 	page_cursor = btr_cur_get_page_cur(cursor);
4470 
4471 	page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4472 
4473 	page_cur_move_to_prev(page_cursor);
4474 
4475 	rec = btr_cur_insert_if_possible(cursor, new_entry,
4476 					 offsets, offsets_heap, n_ext, mtr);
4477 
4478 	if (rec) {
4479 		page_cursor->rec = rec;
4480 
4481 		if (!dict_table_is_locking_disabled(index->table)) {
4482 			lock_rec_restore_from_page_infimum(
4483 				btr_cur_get_block(cursor), rec, block);
4484 		}
4485 
4486 		if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
4487 			/* The new inserted record owns its possible externally
4488 			stored fields */
4489 			btr_cur_unmark_extern_fields(
4490 				page_zip, rec, index, *offsets, mtr);
4491 		}
4492 
4493 		bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
4494 
4495 		if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
4496 			if (adjust) {
4497 				rec_offs_make_valid(
4498 					page_cursor->rec, index, *offsets);
4499 			}
4500 		} else if (!dict_index_is_clust(index)
4501 			   && !dict_table_is_temporary(index->table)
4502 			   && page_is_leaf(page)) {
4503 			/* Update the free bits in the insert buffer.
4504 			This is the same block which was skipped by
4505 			BTR_KEEP_IBUF_BITMAP. */
4506 			if (page_zip) {
4507 				ibuf_update_free_bits_zip(block, mtr);
4508 			} else {
4509 				ibuf_update_free_bits_low(block, max_ins_size,
4510 							  mtr);
4511 			}
4512 		}
4513 
4514 		if (!srv_read_only_mode
4515 		    && !big_rec_vec
4516 		    && page_is_leaf(page)
4517 		    && !dict_index_is_online_ddl(index)) {
4518 
4519 			mtr_memo_release(mtr, dict_index_get_lock(index),
4520 					 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
4521 
4522 			/* NOTE: We cannot release the root block latch here, because
4523 			it contains the segment header and has already been modified in most cases. */
4524 		}
4525 
4526 		err = DB_SUCCESS;
4527 		goto return_after_reservations;
4528 	} else {
4529 		/* If the page is compressed and it initially
4530 		compresses very well, and there is a subsequent insert
4531 		of a badly-compressing record, it is possible for
4532 		btr_cur_optimistic_update() to return DB_UNDERFLOW and
4533 		btr_cur_insert_if_possible() to return FALSE. */
4534 		ut_a(page_zip || optim_err != DB_UNDERFLOW);
4535 
4536 		/* Out of space: reset the free bits.
4537 		This is the same block which was skipped by
4538 		BTR_KEEP_IBUF_BITMAP. */
4539 		if (!dict_index_is_clust(index)
4540 		    && !dict_table_is_temporary(index->table)
4541 		    && page_is_leaf(page)) {
4542 			ibuf_reset_free_bits(block);
4543 		}
4544 	}
4545 
4546 	if (big_rec_vec != NULL && !dict_table_is_intrinsic(index->table)) {
4547 		ut_ad(page_is_leaf(page));
4548 		ut_ad(dict_index_is_clust(index));
4549 		ut_ad(flags & BTR_KEEP_POS_FLAG);
4550 
4551 		/* btr_page_split_and_insert() in
4552 		btr_cur_pessimistic_insert() invokes
4553 		mtr_memo_release(mtr, index->lock, MTR_MEMO_SX_LOCK).
4554 		We must keep the index->lock when we created a
4555 		big_rec, so that row_upd_clust_rec() can store the
4556 		big_rec in the same mini-transaction. */
4557 
4558 		ut_ad(mtr_memo_contains_flagged(mtr,
4559 						dict_index_get_lock(index),
4560 						MTR_MEMO_X_LOCK |
4561 						MTR_MEMO_SX_LOCK));
4562 
4563 		mtr_sx_lock(dict_index_get_lock(index), mtr);
4564 	}
4565 
4566 	/* Was the record to be updated positioned as the first user
4567 	record on its page? */
4568 	was_first = page_cur_is_before_first(page_cursor);
4569 
4570 	/* Lock checks and undo logging were already performed by
4571 	btr_cur_upd_lock_and_undo(). We do not try
4572 	btr_cur_optimistic_insert() because
4573 	btr_cur_insert_if_possible() already failed above. */
4574 
4575 	err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
4576 					 | BTR_NO_LOCKING_FLAG
4577 					 | BTR_KEEP_SYS_FLAG,
4578 					 cursor, offsets, offsets_heap,
4579 					 new_entry, &rec,
4580 					 &dummy_big_rec, n_ext, NULL, mtr);
4581 	ut_a(rec);
4582 	ut_a(err == DB_SUCCESS);
4583 	ut_a(dummy_big_rec == NULL);
4584 	ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
4585 	page_cursor->rec = rec;
4586 
4587 	/* Multiple transactions cannot simultaneously operate on the
4588 	same temp-table in parallel.
4589 	max_trx_id is ignored for temp tables because it is not required
4590 	for MVCC. */
4591 	if (dict_index_is_sec_or_ibuf(index)
4592 	    && !dict_table_is_temporary(index->table)) {
4593 		/* Update PAGE_MAX_TRX_ID in the index page header.
4594 		It was not updated by btr_cur_pessimistic_insert()
4595 		because of BTR_NO_LOCKING_FLAG. */
4596 		buf_block_t*	rec_block;
4597 
4598 		rec_block = btr_cur_get_block(cursor);
4599 
4600 		page_update_max_trx_id(rec_block,
4601 				       buf_block_get_page_zip(rec_block),
4602 				       trx_id, mtr);
4603 	}
4604 
4605 	if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
4606 		/* The new inserted record owns its possible externally
4607 		stored fields */
4608 		buf_block_t*	rec_block = btr_cur_get_block(cursor);
4609 
4610 #ifdef UNIV_ZIP_DEBUG
4611 		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4612 		page = buf_block_get_frame(rec_block);
4613 #endif /* UNIV_ZIP_DEBUG */
4614 		page_zip = buf_block_get_page_zip(rec_block);
4615 
4616 		btr_cur_unmark_extern_fields(page_zip,
4617 					     rec, index, *offsets, mtr);
4618 	}
4619 
4620 	if (!dict_table_is_locking_disabled(index->table)) {
4621 		lock_rec_restore_from_page_infimum(
4622 			btr_cur_get_block(cursor), rec, block);
4623 	}
4624 
4625 	/* If necessary, restore also the correct lock state for a new,
4626 	preceding supremum record created in a page split. While the old
4627 	record was nonexistent, the supremum might have inherited its locks
4628 	from a wrong record. */
4629 
4630 	if (!was_first && !dict_table_is_locking_disabled(index->table)) {
4631 		btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
4632 						  rec, mtr);
4633 	}
4634 
4635 return_after_reservations:
4636 #ifdef UNIV_ZIP_DEBUG
4637 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4638 #endif /* UNIV_ZIP_DEBUG */
4639 
4640 	if (n_reserved > 0) {
4641 		fil_space_release_free_extents(index->space, n_reserved);
4642 	}
4643 
4644 	*big_rec = big_rec_vec;
4645 
4646 	return(err);
4647 }
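/* Illustrative sketch (hypothetical caller code, not part of the build):
when *big_rec is returned, the caller (for example row_upd_clust_rec())
must store the externally stored fields while still holding the index
lock that was kept above, roughly along these lines; the exact
signature of btr_store_big_rec_extern_fields() may differ, see its
definition later in this file:

	if (big_rec != NULL) {
		err = btr_store_big_rec_extern_fields(
			pcur, update, offsets, big_rec, mtr,
			BTR_STORE_UPDATE);
	}
*/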
4648 
4649 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
4650 
4651 /****************************************************************//**
4652 Writes the redo log record for delete marking or unmarking of an index
4653 record. */
4654 UNIV_INLINE
4655 void
4656 btr_cur_del_mark_set_clust_rec_log(
4657 /*===============================*/
4658 	rec_t*		rec,	/*!< in: record */
4659 	dict_index_t*	index,	/*!< in: index of the record */
4660 	trx_id_t	trx_id,	/*!< in: transaction id */
4661 	roll_ptr_t	roll_ptr,/*!< in: roll ptr to the undo log record */
4662 	mtr_t*		mtr)	/*!< in: mtr */
4663 {
4664 	byte*	log_ptr;
4665 
4666 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4667 	ut_ad(mtr->is_named_space(index->space));
4668 
4669 	log_ptr = mlog_open_and_write_index(mtr, rec, index,
4670 					    page_rec_is_comp(rec)
4671 					    ? MLOG_COMP_REC_CLUST_DELETE_MARK
4672 					    : MLOG_REC_CLUST_DELETE_MARK,
4673 					    1 + 1 + DATA_ROLL_PTR_LEN
4674 					    + 14 + 2);
4675 
4676 	if (!log_ptr) {
4677 		/* Logging in mtr is switched off during crash recovery */
4678 		return;
4679 	}
4680 
4681 	*log_ptr++ = 0;
4682 	*log_ptr++ = 1;
4683 
4684 	log_ptr = row_upd_write_sys_vals_to_log(
4685 		index, trx_id, roll_ptr, log_ptr, mtr);
4686 	mach_write_to_2(log_ptr, page_offset(rec));
4687 	log_ptr += 2;
4688 
4689 	mlog_close(mtr, log_ptr);
4690 }
4691 #endif /* !UNIV_HOTBACKUP */
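/* For reference (derived from the code above): the body of an
MLOG_REC_CLUST_DELETE_MARK / MLOG_COMP_REC_CLUST_DELETE_MARK record is

	1 byte		flags (written as 0 above)
	1 byte		delete-mark value (written as 1 above)
	sys fields	DB_TRX_ID position, DB_ROLL_PTR, DB_TRX_ID
			(row_upd_write_sys_vals_to_log())
	2 bytes		page offset of the record

btr_cur_parse_del_mark_set_clust_rec() below reads the fields back in
the same order. */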
4692 
4693 /****************************************************************//**
4694 Parses the redo log record for delete marking or unmarking of a clustered
4695 index record.
4696 @return end of log record or NULL */
4697 byte*
4698 btr_cur_parse_del_mark_set_clust_rec(
4699 /*=================================*/
4700 	byte*		ptr,	/*!< in: buffer */
4701 	byte*		end_ptr,/*!< in: buffer end */
4702 	page_t*		page,	/*!< in/out: page or NULL */
4703 	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
4704 	dict_index_t*	index)	/*!< in: index corresponding to page */
4705 {
4706 	ulint		flags;
4707 	ulint		val;
4708 	ulint		pos;
4709 	trx_id_t	trx_id;
4710 	roll_ptr_t	roll_ptr;
4711 	ulint		offset;
4712 	rec_t*		rec;
4713 
4714 	ut_ad(!page
4715 	      || !!page_is_comp(page) == dict_table_is_comp(index->table));
4716 
4717 	if (end_ptr < ptr + 2) {
4718 
4719 		return(NULL);
4720 	}
4721 
4722 	flags = mach_read_from_1(ptr);
4723 	ptr++;
4724 	val = mach_read_from_1(ptr);
4725 	ptr++;
4726 
4727 	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
4728 
4729 	if (ptr == NULL) {
4730 
4731 		return(NULL);
4732 	}
4733 
4734 	if (end_ptr < ptr + 2) {
4735 
4736 		return(NULL);
4737 	}
4738 
4739 	offset = mach_read_from_2(ptr);
4740 	ptr += 2;
4741 
4742 	ut_a(offset <= UNIV_PAGE_SIZE);
4743 
4744 	if (page) {
4745 		rec = page + offset;
4746 
4747 		/* We do not need to reserve search latch, as the page
4748 		is only being recovered, and there cannot be a hash index to
4749 		it. Besides, these fields are being updated in place
4750 		and the adaptive hash index does not depend on them. */
4751 
4752 		btr_rec_set_deleted_flag(rec, page_zip, val);
4753 
4754 		if (!(flags & BTR_KEEP_SYS_FLAG)) {
4755 			mem_heap_t*	heap		= NULL;
4756 			ulint		offsets_[REC_OFFS_NORMAL_SIZE];
4757 			rec_offs_init(offsets_);
4758 
4759 			row_upd_rec_sys_fields_in_recovery(
4760 				rec, page_zip,
4761 				rec_get_offsets(rec, index, offsets_,
4762 						ULINT_UNDEFINED, &heap),
4763 				pos, trx_id, roll_ptr);
4764 			if (UNIV_LIKELY_NULL(heap)) {
4765 				mem_heap_free(heap);
4766 			}
4767 		}
4768 	}
4769 
4770 	return(ptr);
4771 }
4772 
4773 #ifndef UNIV_HOTBACKUP
4774 /***********************************************************//**
4775 Marks a clustered index record deleted. Writes an undo log record to
4776 undo log on this delete marking. Writes in the trx id field the id
4777 of the deleting transaction, and in the roll ptr field pointer to the
4778 undo log record created.
4779 @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
4780 dberr_t
4781 btr_cur_del_mark_set_clust_rec(
4782 /*===========================*/
4783 	ulint		flags,  /*!< in: undo logging and locking flags */
4784 	buf_block_t*	block,	/*!< in/out: buffer block of the record */
4785 	rec_t*		rec,	/*!< in/out: record */
4786 	dict_index_t*	index,	/*!< in: clustered index of the record */
4787 	const ulint*	offsets,/*!< in: rec_get_offsets(rec) */
4788 	que_thr_t*	thr,	/*!< in: query thread */
4789 	const dtuple_t*	entry,	/*!< in: dtuple for the deleting record, also
4790 				contains the virtual cols if there are any */
4791 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
4792 {
4793 	roll_ptr_t	roll_ptr;
4794 	dberr_t		err;
4795 	page_zip_des_t*	page_zip;
4796 	trx_t*		trx;
4797 
4798 	ut_ad(dict_index_is_clust(index));
4799 	ut_ad(rec_offs_validate(rec, index, offsets));
4800 	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4801 	ut_ad(buf_block_get_frame(block) == page_align(rec));
4802 	ut_ad(page_is_leaf(page_align(rec)));
4803 	ut_ad(mtr->is_named_space(index->space));
4804 
4805 	if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
4806 		/* During cascading delete operations, this becomes possible. */
4807 #ifdef WITH_WSREP
4808 		/* The following assertion may fail with wsrep. */
4809 #endif
4810 		ut_ad(rec_get_trx_id(rec, index) == thr_get_trx(thr)->id);
4811 		return(DB_SUCCESS);
4812 	}
4813 
4814 	err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
4815 						   rec, index, offsets, thr);
4816 
4817 	if (err != DB_SUCCESS) {
4818 
4819 		return(err);
4820 	}
4821 
4822 	err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
4823 					    index, entry, NULL, 0, rec, offsets,
4824 					    &roll_ptr);
4825 	if (err != DB_SUCCESS) {
4826 
4827 		return(err);
4828 	}
4829 
4830 	/* The search latch is not needed here, because
4831 	the adaptive hash index does not depend on the delete-mark
4832 	and the delete-mark is being updated in place. */
4833 
4834 	page_zip = buf_block_get_page_zip(block);
4835 
4836 	btr_rec_set_deleted_flag(rec, page_zip, TRUE);
4837 
4838 	/* For an intrinsic table, the roll-ptr is not maintained as there is
4839 	no UNDO logging. Skip updating it. */
4840 	if (dict_table_is_intrinsic(index->table)) {
4841 		return(err);
4842 	}
4843 
4844 	trx = thr_get_trx(thr);
4845 	/* This function must not be invoked during rollback
4846 	(of a TRX_STATE_PREPARE transaction or otherwise). */
4847 	ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
4848 	ut_ad(!trx->in_rollback);
4849 
4850 	DBUG_PRINT("ib_cur", ("delete-mark clust %s (" IB_ID_FMT
4851 			      ") by " TRX_ID_FMT ": %s",
4852 			      index->table_name, index->id,
4853 			      trx_get_id_for_print(trx),
4854 			      rec_printer(rec, offsets).str().c_str()));
4855 
4856 	if (dict_index_is_online_ddl(index)) {
4857 		row_log_table_delete(rec, entry, index, offsets, NULL);
4858 	}
4859 
4860 	row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr);
4861 
4862 	btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id,
4863 					   roll_ptr, mtr);
4864 
4865 	return(err);
4866 }
4867 
4868 /****************************************************************//**
4869 Writes the redo log record for a delete mark setting of a secondary
4870 index record. */
4871 UNIV_INLINE
4872 void
4873 btr_cur_del_mark_set_sec_rec_log(
4874 /*=============================*/
4875 	rec_t*		rec,	/*!< in: record */
4876 	ibool		val,	/*!< in: value to set */
4877 	mtr_t*		mtr)	/*!< in: mtr */
4878 {
4879 	byte*	log_ptr;
4880 	ut_ad(val <= 1);
4881 
4882 	log_ptr = mlog_open(mtr, 11 + 1 + 2);
4883 
4884 	if (!log_ptr) {
4885 		/* Logging in mtr is switched off during crash recovery:
4886 		in that case mlog_open returns NULL */
4887 		return;
4888 	}
4889 
4890 	log_ptr = mlog_write_initial_log_record_fast(
4891 		rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
4892 	mach_write_to_1(log_ptr, val);
4893 	log_ptr++;
4894 
4895 	mach_write_to_2(log_ptr, page_offset(rec));
4896 	log_ptr += 2;
4897 
4898 	mlog_close(mtr, log_ptr);
4899 }
4900 #endif /* !UNIV_HOTBACKUP */
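/* For reference (derived from the code above): the body of an
MLOG_REC_SEC_DELETE_MARK record is simply

	1 byte		delete-mark value
	2 bytes		page offset of the record

following the initial log record header written by
mlog_write_initial_log_record_fast();
btr_cur_parse_del_mark_set_sec_rec() reads it back in the same
order. */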
4901 
4902 /****************************************************************//**
4903 Parses the redo log record for delete marking or unmarking of a secondary
4904 index record.
4905 @return end of log record or NULL */
4906 byte*
4907 btr_cur_parse_del_mark_set_sec_rec(
4908 /*===============================*/
4909 	byte*		ptr,	/*!< in: buffer */
4910 	byte*		end_ptr,/*!< in: buffer end */
4911 	page_t*		page,	/*!< in/out: page or NULL */
4912 	page_zip_des_t*	page_zip)/*!< in/out: compressed page, or NULL */
4913 {
4914 	ulint	val;
4915 	ulint	offset;
4916 	rec_t*	rec;
4917 
4918 	if (end_ptr < ptr + 3) {
4919 
4920 		return(NULL);
4921 	}
4922 
4923 	val = mach_read_from_1(ptr);
4924 	ptr++;
4925 
4926 	offset = mach_read_from_2(ptr);
4927 	ptr += 2;
4928 
4929 	ut_a(offset <= UNIV_PAGE_SIZE);
4930 
4931 	if (page) {
4932 		rec = page + offset;
4933 
4934 		/* We do not need to reserve search latch, as the page
4935 		is only being recovered, and there cannot be a hash index to
4936 		it. Besides, the delete-mark flag is being updated in place
4937 		and the adaptive hash index does not depend on it. */
4938 
4939 		btr_rec_set_deleted_flag(rec, page_zip, val);
4940 	}
4941 
4942 	return(ptr);
4943 }
4944 
4945 #ifndef UNIV_HOTBACKUP
4946 /***********************************************************//**
4947 Sets a secondary index record delete mark to TRUE or FALSE.
4948 @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
4949 dberr_t
4950 btr_cur_del_mark_set_sec_rec(
4951 /*=========================*/
4952 	ulint		flags,	/*!< in: locking flag */
4953 	btr_cur_t*	cursor,	/*!< in: cursor */
4954 	ibool		val,	/*!< in: value to set */
4955 	que_thr_t*	thr,	/*!< in: query thread */
4956 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
4957 {
4958 	buf_block_t*	block;
4959 	rec_t*		rec;
4960 	dberr_t		err;
4961 
4962 	block = btr_cur_get_block(cursor);
4963 	rec = btr_cur_get_rec(cursor);
4964 
4965 	err = lock_sec_rec_modify_check_and_lock(flags,
4966 						 btr_cur_get_block(cursor),
4967 						 rec, cursor->index, thr, mtr);
4968 	if (err != DB_SUCCESS) {
4969 
4970 		return(err);
4971 	}
4972 
4973 	ut_ad(!!page_rec_is_comp(rec)
4974 	      == dict_table_is_comp(cursor->index->table));
4975 
4976 	DBUG_PRINT("ib_cur", ("delete-mark=%u sec %u:%u:%u in %s("
4977 			      IB_ID_FMT ") by " TRX_ID_FMT,
4978 			      unsigned(val),
4979 			      block->page.id.space(), block->page.id.page_no(),
4980 			      unsigned(page_rec_get_heap_no(rec)),
4981 			      cursor->index->name(), cursor->index->id,
4982 			      trx_get_id_for_print(thr_get_trx(thr))));
4983 
4984 	/* We do not need to reserve search latch, as the
4985 	delete-mark flag is being updated in place and the adaptive
4986 	hash index does not depend on it. */
4987 	btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);
4988 
4989 	btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
4990 
4991 	return(DB_SUCCESS);
4992 }
4993 
4994 /***********************************************************//**
4995 Sets a secondary index record's delete mark to the given value. This
4996 function is only used by the insert buffer merge mechanism. */
4997 void
4998 btr_cur_set_deleted_flag_for_ibuf(
4999 /*==============================*/
5000 	rec_t*		rec,		/*!< in/out: record */
5001 	page_zip_des_t*	page_zip,	/*!< in/out: compressed page
5002 					corresponding to rec, or NULL
5003 					when the tablespace is
5004 					uncompressed */
5005 	ibool		val,		/*!< in: value to set */
5006 	mtr_t*		mtr)		/*!< in/out: mini-transaction */
5007 {
5008 	/* We do not need to reserve search latch, as the page
5009 	has just been read to the buffer pool and there cannot be
5010 	a hash index to it.  Besides, the delete-mark flag is being
5011 	updated in place and the adaptive hash index does not depend
5012 	on it. */
5013 
5014 	btr_rec_set_deleted_flag(rec, page_zip, val);
5015 
5016 	btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
5017 }
5018 
5019 /*==================== B-TREE RECORD REMOVE =========================*/
5020 
5021 /*************************************************************//**
5022 Tries to compress a page of the tree if it seems useful. It is assumed
5023 that mtr holds an x-latch on the tree and on the cursor page. To avoid
5024 deadlocks, mtr must also own x-latches to brothers of page, if those
5025 brothers exist. NOTE: it is assumed that the caller has reserved enough
5026 free extents so that the compression will always succeed if done!
5027 @return TRUE if compression occurred */
5028 ibool
5029 btr_cur_compress_if_useful(
5030 /*=======================*/
5031 	btr_cur_t*	cursor,	/*!< in/out: cursor on the page to compress;
5032 				cursor does not stay valid if !adjust and
5033 				compression occurs */
5034 	ibool		adjust,	/*!< in: TRUE if should adjust the
5035 				cursor position even if compression occurs */
5036 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
5037 {
5038 	/* Avoid applying compression: given the workload of an intrinsic
5039 	table, we do not expect much page garbage to accumulate. */
5040 	if (dict_table_is_intrinsic(cursor->index->table)) {
5041 		return(FALSE);
5042 	}
5043 
5044 	ut_ad(mtr_memo_contains_flagged(
5045 		mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
5046 		MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
5047 	      || dict_table_is_intrinsic(cursor->index->table));
5048 	ut_ad(mtr_is_block_fix(
5049 		mtr, btr_cur_get_block(cursor),
5050 		MTR_MEMO_PAGE_X_FIX, cursor->index->table));
5051 
5052 	if (dict_index_is_spatial(cursor->index)) {
5053 		const page_t*   page = btr_cur_get_page(cursor);
5054 		const trx_t*	trx = NULL;
5055 
5056 		if (cursor->rtr_info->thr != NULL) {
5057 			trx = thr_get_trx(cursor->rtr_info->thr);
5058 		}
5059 
5060 		/* Check whether page lock prevents the compression */
5061 		if (!lock_test_prdt_page_lock(trx, page_get_space_id(page),
5062 					      page_get_page_no(page))) {
5063 			return(false);
5064 		}
5065 	}
5066 
5067 	return(btr_cur_compress_recommendation(cursor, mtr)
5068 	       && btr_compress(cursor, adjust, mtr));
5069 }
5070 
5071 /*******************************************************//**
5072 Removes the record on which the tree cursor is positioned on a leaf page.
5073 It is assumed that the mtr has an x-latch on the page where the cursor is
5074 positioned, but no latch on the whole tree.
5075 @return TRUE if success, i.e., the page did not become too empty */
5076 ibool
5077 btr_cur_optimistic_delete_func(
5078 /*===========================*/
5079 	btr_cur_t*	cursor,	/*!< in: cursor on leaf page, on the record to
5080 				delete; cursor stays valid: if deletion
5081 				succeeds, on function exit it points to the
5082 				successor of the deleted record */
5083 #ifdef UNIV_DEBUG
5084 	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
5085 #endif /* UNIV_DEBUG */
5086 	mtr_t*		mtr)	/*!< in: mtr; if this function returns
5087 				TRUE on a leaf page of a secondary
5088 				index, the mtr must be committed
5089 				before latching any further pages */
5090 {
5091 	buf_block_t*	block;
5092 	rec_t*		rec;
5093 	mem_heap_t*	heap		= NULL;
5094 	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
5095 	ulint*		offsets		= offsets_;
5096 	ibool		no_compress_needed;
5097 	rec_offs_init(offsets_);
5098 
5099 	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
5100 	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
5101 				MTR_MEMO_PAGE_X_FIX));
5102 	ut_ad(mtr_is_block_fix(mtr, btr_cur_get_block(cursor),
5103 			       MTR_MEMO_PAGE_X_FIX, cursor->index->table));
5104 	ut_ad(mtr->is_named_space(cursor->index->space));
5105 
5106 	/* This is intended only for leaf page deletions */
5107 
5108 	block = btr_cur_get_block(cursor);
5109 
5110 	ut_ad(page_is_leaf(buf_block_get_frame(block)));
5111 	ut_ad(!dict_index_is_online_ddl(cursor->index)
5112 	      || dict_index_is_clust(cursor->index)
5113 	      || (flags & BTR_CREATE_FLAG));
5114 
5115 	rec = btr_cur_get_rec(cursor);
5116 	offsets = rec_get_offsets(rec, cursor->index, offsets,
5117 				  ULINT_UNDEFINED, &heap);
5118 
5119 	no_compress_needed = !rec_offs_any_extern(offsets)
5120 		&& btr_cur_can_delete_without_compress(
5121 			cursor, rec_offs_size(offsets), mtr);
5122 
5123 	if (no_compress_needed) {
5124 
5125 		page_t*		page	= buf_block_get_frame(block);
5126 		page_zip_des_t*	page_zip= buf_block_get_page_zip(block);
5127 
5128 		lock_update_delete(block, rec);
5129 
5130 		btr_search_update_hash_on_delete(cursor);
5131 
5132 		if (page_zip) {
5133 #ifdef UNIV_ZIP_DEBUG
5134 			ut_a(page_zip_validate(page_zip, page, cursor->index));
5135 #endif /* UNIV_ZIP_DEBUG */
5136 			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5137 					    cursor->index, offsets, mtr);
5138 #ifdef UNIV_ZIP_DEBUG
5139 			ut_a(page_zip_validate(page_zip, page, cursor->index));
5140 #endif /* UNIV_ZIP_DEBUG */
5141 
5142 			/* On compressed pages, the IBUF_BITMAP_FREE
5143 			space is not affected by deleting (purging)
5144 			records, because it is defined as the minimum
5145 			of space available *without* reorganize, and
5146 			space available in the modification log. */
5147 		} else {
5148 			const ulint	max_ins
5149 				= page_get_max_insert_size_after_reorganize(
5150 					page, 1);
5151 
5152 			page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5153 					    cursor->index, offsets, mtr);
5154 
5155 			/* The change buffer does not handle inserts
5156 			into non-leaf pages, into clustered indexes,
5157 			or into the change buffer. */
5158 			if (!dict_index_is_clust(cursor->index)
5159 			    && !dict_table_is_temporary(cursor->index->table)
5160 			    && !dict_index_is_ibuf(cursor->index)) {
5161 				ibuf_update_free_bits_low(block, max_ins, mtr);
5162 			}
5163 		}
5164 	} else {
5165 		/* Prefetch the siblings of the leaf page for the
5166 		pessimistic operation. */
5167 		btr_cur_prefetch_siblings(block);
5168 	}
5169 
5170 	if (UNIV_LIKELY_NULL(heap)) {
5171 		mem_heap_free(heap);
5172 	}
5173 
5174 	return(no_compress_needed);
5175 }
5176 
5177 /*************************************************************//**
5178 Removes the record on which the tree cursor is positioned. Tries
5179 to compress the page if its fillfactor drops below a threshold
5180 or if it is the only page on the level. It is assumed that mtr holds
5181 an x-latch on the tree and on the cursor page. To avoid deadlocks,
5182 mtr must also own x-latches to brothers of page, if those brothers
5183 exist.
5184 @return TRUE if compression occurred, FALSE if not or if something
5185 went wrong */
5186 ibool
5187 btr_cur_pessimistic_delete(
5188 /*=======================*/
5189 	dberr_t*	err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
5190 				the latter may occur because we may have
5191 				to update node pointers on upper levels,
5192 				and in the case of variable length keys
5193 				these may actually grow in size */
5194 	ibool		has_reserved_extents, /*!< in: TRUE if the
5195 				caller has already reserved enough free
5196 				extents so that he knows that the operation
5197 				will succeed */
5198 	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
5199 				if compression does not occur, the cursor
5200 				stays valid: it points to successor of
5201 				deleted record on function exit */
5202 	ulint		flags,	/*!< in: BTR_CREATE_FLAG or 0 */
5203 	bool		rollback,/*!< in: performing rollback? */
5204 	mtr_t*		mtr)	/*!< in: mtr */
5205 {
5206 	buf_block_t*	block;
5207 	page_t*		page;
5208 	page_zip_des_t*	page_zip;
5209 	dict_index_t*	index;
5210 	rec_t*		rec;
5211 	ulint		n_reserved	= 0;
5212 	bool		success;
5213 	ibool		ret		= FALSE;
5214 	ulint		level;
5215 	mem_heap_t*	heap;
5216 	ulint*		offsets;
5217 	bool		allow_merge = true; /* if true, the appropriate page
5218 			latches needed to merge this page have been taken */
5219 #ifdef UNIV_DEBUG
5220 	bool		parent_latched	= false;
5221 #endif /* UNIV_DEBUG */
5222 
5223 	block = btr_cur_get_block(cursor);
5224 	page = buf_block_get_frame(block);
5225 	index = btr_cur_get_index(cursor);
5226 
5227 	ulint rec_size_est = dict_index_node_ptr_max_size(index);
5228 	const page_size_t       page_size(dict_table_page_size(index->table));
5229 
5230 	ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
5231 	ut_ad(!dict_index_is_online_ddl(index)
5232 	      || dict_index_is_clust(index)
5233 	      || (flags & BTR_CREATE_FLAG));
5234 	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
5235 					MTR_MEMO_X_LOCK
5236 					| MTR_MEMO_SX_LOCK)
5237 	      || dict_table_is_intrinsic(index->table));
5238 	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
5239 	ut_ad(mtr->is_named_space(index->space));
5240 
5241 	if (!has_reserved_extents) {
5242 		/* First reserve enough free space for the file segments
5243 		of the index tree, so that the node pointer updates will
5244 		not fail because of lack of space */
5245 
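		/* For example, if cursor->tree_height == 3, this reserves
		3 / 32 + 1 == 1 extent; the reservation is released at the
		end of this function via fil_space_release_free_extents(). */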
5246 		ulint	n_extents = cursor->tree_height / 32 + 1;
5247 
5248 		success = fsp_reserve_free_extents(&n_reserved,
5249 						   index->space,
5250 						   n_extents,
5251 						   FSP_CLEANING, mtr);
5252 		if (!success) {
5253 			*err = DB_OUT_OF_FILE_SPACE;
5254 
5255 			return(FALSE);
5256 		}
5257 	}
5258 
5259 	heap = mem_heap_create(1024);
5260 	rec = btr_cur_get_rec(cursor);
5261 	page_zip = buf_block_get_page_zip(block);
5262 #ifdef UNIV_ZIP_DEBUG
5263 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5264 #endif /* UNIV_ZIP_DEBUG */
5265 
5266 	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
5267 
5268 	if (rec_offs_any_extern(offsets)) {
5269 		btr_rec_free_externally_stored_fields(index,
5270 						      rec, offsets, page_zip,
5271 						      rollback, mtr);
5272 #ifdef UNIV_ZIP_DEBUG
5273 		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5274 #endif /* UNIV_ZIP_DEBUG */
5275 	}
5276 
5277 	if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
5278 	    && UNIV_UNLIKELY(dict_index_get_page(index)
5279 			     != block->page.id.page_no())) {
5280 
5281 		/* If there is only one record, drop the whole page in
5282 		btr_discard_page, if this is not the root page */
5283 
5284 		btr_discard_page(cursor, mtr);
5285 
5286 		ret = TRUE;
5287 
5288 		goto return_after_reservations;
5289 	}
5290 
5291 	if (flags == 0) {
5292 		lock_update_delete(block, rec);
5293 	}
5294 
5295 	level = btr_page_get_level(page, mtr);
5296 
5297 	if (level > 0
5298 	    && UNIV_UNLIKELY(rec == page_rec_get_next(
5299 				     page_get_infimum_rec(page)))) {
5300 
5301 		rec_t*	next_rec = page_rec_get_next(rec);
5302 
5303 		if (btr_page_get_prev(page, mtr) == FIL_NULL) {
5304 
5305 			/* If we delete the leftmost node pointer on a
5306 			non-leaf level, we must mark the new leftmost node
5307 			pointer as the predefined minimum record */
5308 
5309 			/* This will make page_zip_validate() fail until
5310 			page_cur_delete_rec() completes.  This is harmless,
5311 			because everything will take place within a single
5312 			mini-transaction and because writing to the redo log
5313 			is an atomic operation (performed by mtr_commit()). */
5314 			btr_set_min_rec_mark(next_rec, mtr);
5315 		} else if (dict_index_is_spatial(index)) {
5316 			/* For rtree, if delete the leftmost node pointer,
5317 			we need to update parent page. */
5318 			rtr_mbr_t	father_mbr;
5319 			rec_t*		father_rec;
5320 			btr_cur_t	father_cursor;
5321 			ulint*		offsets;
5322 			bool		upd_ret;
5323 			ulint		len;
5324 
5325 			rtr_page_get_father_block(NULL, heap, index,
5326 						  block, mtr, NULL,
5327 						  &father_cursor);
5328 			offsets = rec_get_offsets(
5329 				btr_cur_get_rec(&father_cursor), index,
5330 				NULL, ULINT_UNDEFINED, &heap);
5331 
5332 			father_rec = btr_cur_get_rec(&father_cursor);
5333 			rtr_read_mbr(rec_get_nth_field(
5334 				father_rec, offsets, 0, &len), &father_mbr);
5335 
5336 			upd_ret = rtr_update_mbr_field(&father_cursor, offsets,
5337 						       NULL, page, &father_mbr,
5338 						       next_rec, mtr);
5339 
5340 			if (!upd_ret) {
5341 				*err = DB_ERROR;
5342 
5343 				mem_heap_free(heap);
5344 				return(FALSE);
5345 			}
5346 
5347 			ut_d(parent_latched = true);
5348 		} else {
5349 			/* Otherwise, if we delete the leftmost node pointer
5350 			on a page, we have to change the parent node pointer
5351 			so that it is equal to the new leftmost node pointer
5352 			on the page */
5353 
5354 			btr_node_ptr_delete(index, block, mtr);
5355 
5356 			dtuple_t*	node_ptr = dict_index_build_node_ptr(
5357 				index, next_rec, block->page.id.page_no(),
5358 				heap, level);
5359 
5360 			btr_insert_on_non_leaf_level(
5361 				flags, index, level + 1, node_ptr, mtr);
5362 
5363 			ut_d(parent_latched = true);
5364 		}
5365 	}
5366 
5367 	btr_search_update_hash_on_delete(cursor);
5368 
5369 	if (page_is_leaf(page) || dict_index_is_spatial(index)) {
5370 		/* Allow the merge for spatial indexes too: the tree is
5371 		X-locked during a delete on a spatial index, which avoids any
5372 		possibility of upward latching. */
5373 		allow_merge = true;
5374 	} else {
5375 		allow_merge = btr_cur_will_modify_tree(index, page,
5376 			BTR_INTENTION_DELETE, rec, rec_size_est, page_size, mtr);
5377 	}
5378 	page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
5379 #ifdef UNIV_ZIP_DEBUG
5380 	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5381 #endif /* UNIV_ZIP_DEBUG */
5382 
5383 	/* btr_check_node_ptr() needs parent block latched */
5384 	ut_ad(!parent_latched || btr_check_node_ptr(index, block, mtr));
5385 
5386 return_after_reservations:
5387 	*err = DB_SUCCESS;
5388 
5389 	mem_heap_free(heap);
5390 
5391 	if (!ret) {
5392 		bool do_merge = btr_cur_compress_recommendation(cursor, mtr);
5393 		/* We are not allowed to merge, because the appropriate
5394 		latches were not taken while positioning the cursor. */
5395 		if (!allow_merge && do_merge) {
5396 			ib::info() << "Ignoring merge recommendation for page"
5397 				" as we could not predict it early. Page"
5398 				" number: " << page_get_page_no(page)
5399 				<< ", index name: " << index->name;
5400 			ut_ad(false);
5401 		} else if (do_merge) {
5402 
5403 			ret = btr_cur_compress_if_useful(cursor, FALSE, mtr);
5404 		}
5405 	}
5406 
5407 	if (!srv_read_only_mode
5408 	    && page_is_leaf(page)
5409 	    && !dict_index_is_online_ddl(index)) {
5410 
5411 		mtr_memo_release(mtr, dict_index_get_lock(index),
5412 				 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
5413 
5414 		/* NOTE: We cannot release the root block latch here, because
5415 		it contains the segment header and has usually been modified. */
5416 	}
5417 
5418 	if (n_reserved > 0) {
5419 		fil_space_release_free_extents(index->space, n_reserved);
5420 	}
5421 
5422 	return(ret);
5423 }
5424 
5425 /*******************************************************************//**
5426 Adds path information to the cursor for the current page, for which
5427 the binary search has been performed. */
5428 static
5429 void
5430 btr_cur_add_path_info(
5431 /*==================*/
5432 	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
5433 	ulint		height,		/*!< in: height of the page in tree;
5434 					0 means leaf node */
5435 	ulint		root_height)	/*!< in: root node height in tree */
5436 {
5437 	btr_path_t*	slot;
5438 	const rec_t*	rec;
5439 	const page_t*	page;
5440 
5441 	ut_a(cursor->path_arr);
5442 
5443 	if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
5444 		/* Do nothing; return empty path */
5445 
5446 		slot = cursor->path_arr;
5447 		slot->nth_rec = ULINT_UNDEFINED;
5448 
5449 		return;
5450 	}
5451 
5452 	if (height == 0) {
5453 		/* Mark end of slots for path */
5454 		slot = cursor->path_arr + root_height + 1;
5455 		slot->nth_rec = ULINT_UNDEFINED;
5456 	}
5457 
5458 	rec = btr_cur_get_rec(cursor);
5459 
5460 	slot = cursor->path_arr + (root_height - height);
5461 
5462 	page = page_align(rec);
5463 
5464 	slot->nth_rec = page_rec_get_n_recs_before(rec);
5465 	slot->n_recs = page_get_n_recs(page);
5466 	slot->page_no = page_get_page_no(page);
5467 	slot->page_level = btr_page_get_level_low(page);
5468 }
5469 
5470 /*******************************************************************//**
5471 Estimate the number of rows between slot1 and slot2 for any level on a
5472 B-tree. This function starts from slot1->page and reads a few pages to
5473 the right, counting their records. If we reach slot2->page quickly then
5474 we know exactly how many records there are between slot1 and slot2 and
5475 we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
5476 then we calculate the average number of records in the pages scanned
5477 so far and assume that all pages that we did not scan up to slot2->page
5478 contain the same number of records, then we multiply that average to
5479 the number of pages between slot1->page and slot2->page (which is
5480 n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
5481 @return number of rows, not including the borders (exact or estimated) */
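/* An illustrative sketch (not from an actual trace): if the scan below reads
N_PAGES_READ_LIMIT pages averaging about 100 records each without reaching
slot2->page, the inexact estimate becomes roughly n_rows_on_prev_level times
the average records per page scanned, e.g. 8 * 100 = 800 rows. */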
5482 static
5483 int64_t
5484 btr_estimate_n_rows_in_range_on_level(
5485 /*==================================*/
5486 	dict_index_t*	index,			/*!< in: index */
5487 	btr_path_t*	slot1,			/*!< in: left border */
5488 	btr_path_t*	slot2,			/*!< in: right border */
5489 	int64_t		n_rows_on_prev_level,	/*!< in: number of rows
5490 						on the previous level for the
5491 						same descend paths; used to
5492 						determine the number of pages
5493 						on this level */
5494 	ibool*		is_n_rows_exact)	/*!< out: TRUE if the returned
5495 						value is exact i.e. not an
5496 						estimation */
5497 {
5498 	int64_t		n_rows;
5499 	ulint		n_pages_read;
5500 	ulint		level;
5501 
5502 	n_rows = 0;
5503 	n_pages_read = 0;
5504 
5505 	/* Assume by default that we will scan all pages between
5506 	slot1->page_no and slot2->page_no. */
5507 	*is_n_rows_exact = TRUE;
5508 
5509 	/* Add records from slot1->page_no which are to the right of
5510 	the record which serves as a left border of the range, if any
5511 	(we don't include the record itself in this count). */
5512 	if (slot1->nth_rec <= slot1->n_recs) {
5513 		n_rows += slot1->n_recs - slot1->nth_rec;
5514 	}
5515 
5516 	/* Add records from slot2->page_no which are to the left of
5517 	the record which serves as a right border of the range, if any
5518 	(we don't include the record itself in this count). */
5519 	if (slot2->nth_rec > 1) {
5520 		n_rows += slot2->nth_rec - 1;
5521 	}
5522 
5523 	/* Count the records in the pages between slot1->page_no and
5524 	slot2->page_no (non inclusive), if any. */
5525 
5526 	/* Do not read more than this number of pages in order not to hurt
5527 	performance with this code which is just an estimation. If we read
5528 	this many pages before reaching slot2->page_no then we estimate the
5529 	average from the pages scanned so far. */
5530 #	define N_PAGES_READ_LIMIT	10
5531 
5532 	page_id_t		page_id(
5533 		dict_index_get_space(index), slot1->page_no);
5534 	const fil_space_t*	space = fil_space_get(index->space);
5535 	ut_ad(space);
5536 	const page_size_t	page_size(space->flags);
5537 
5538 	level = slot1->page_level;
5539 
5540 	do {
5541 		mtr_t		mtr;
5542 		page_t*		page;
5543 		buf_block_t*	block;
5544 
5545 		mtr_start(&mtr);
5546 
5547 		/* Fetch the page. Because we are not holding the
5548 		index->lock, the tree may have changed and we may be
5549 		attempting to read a page that is no longer part of
5550 		the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
5551 		silence a debug assertion about this. */
5552 		block = buf_page_get_gen(page_id, page_size, RW_S_LATCH,
5553 					 NULL, BUF_GET_POSSIBLY_FREED,
5554 					 __FILE__, __LINE__, &mtr);
5555 
5556 		page = buf_block_get_frame(block);
5557 
5558 		/* It is possible that the tree has been reorganized in the
5559 		meantime and this is a different page. If this happens the
5560 		calculated estimate will be bogus, which is not fatal as
5561 		this is only an estimate. We are sure that a page with
5562 		page_no exists because InnoDB never frees pages, only
5563 		reuses them. */
5564 		if (!fil_page_index_page_check(page)
5565 		    || btr_page_get_index_id(page) != index->id
5566 		    || btr_page_get_level_low(page) != level) {
5567 
5568 			/* The page got reused for something else */
5569 			mtr_commit(&mtr);
5570 			goto inexact;
5571 		}
5572 
5573 		/* It is possible but highly unlikely that the page was
5574 		originally written by an old version of InnoDB that did
5575 		not initialize FIL_PAGE_TYPE on other than B-tree pages.
5576 		For example, this could be an almost-empty BLOB page
5577 		that happens to contain the magic values in the fields
5578 		that we checked above. */
5579 
5580 		n_pages_read++;
5581 
5582 		if (page_id.page_no() != slot1->page_no) {
5583 			/* Do not count the records on slot1->page_no,
5584 			we already counted them before this loop. */
5585 			n_rows += page_get_n_recs(page);
5586 		}
5587 
5588 		page_id.set_page_no(btr_page_get_next(page, &mtr));
5589 
5590 		mtr_commit(&mtr);
5591 
5592 		if (n_pages_read == N_PAGES_READ_LIMIT
5593 		    || page_id.page_no() == FIL_NULL) {
5594 			/* Either we read too many pages or
5595 			we reached the end of the level without passing
5596 			through slot2->page_no, the tree must have changed
5597 			in the meantime */
5598 			goto inexact;
5599 		}
5600 
5601 	} while (page_id.page_no() != slot2->page_no);
5602 
5603 	return(n_rows);
5604 
5605 inexact:
5606 
5607 	*is_n_rows_exact = FALSE;
5608 
5609 	/* We stopped the scan before reaching slot2->page */
5610 
5611 	if (n_pages_read > 0) {
5612 		/* The number of pages on this level is
5613 		n_rows_on_prev_level, multiply it by the
5614 		average number of recs per page so far */
5615 		n_rows = n_rows_on_prev_level
5616 			* n_rows / n_pages_read;
5617 	} else {
5618 		/* The tree changed before we could even
5619 		start with slot1->page_no */
5620 		n_rows = 10;
5621 	}
5622 
5623 	return(n_rows);
5624 }
5625 
5626 /** If the tree gets changed too much between the two dives for the left
5627 and right boundary then btr_estimate_n_rows_in_range_low() will retry
5628 that many times before giving up and returning the value stored in
5629 rows_in_range_arbitrary_ret_val. */
5630 static const unsigned	rows_in_range_max_retries = 4;
5631 
5632 /** We pretend that a range has that many records if the tree keeps changing
5633 for rows_in_range_max_retries retries while we try to estimate the records
5634 in a given range. */
5635 static const int64_t	rows_in_range_arbitrary_ret_val = 10;
5636 
5637 /** Estimates the number of rows in a given index range.
5638 @param[in]	index		index
5639 @param[in]	tuple1		range start, may also be empty tuple
5640 @param[in]	mode1		search mode for range start
5641 @param[in]	tuple2		range end, may also be empty tuple
5642 @param[in]	mode2		search mode for range end
5643 @param[in]	nth_attempt	if the tree gets modified too much while
5644 we are trying to analyze it, then we will retry (this function will call
5645 itself, incrementing this parameter)
5646 @return estimated number of rows; if after rows_in_range_max_retries
5647 retries the tree keeps changing, then we will just return
5648 rows_in_range_arbitrary_ret_val as a result (if
5649 nth_attempt >= rows_in_range_max_retries and the tree is modified between
5650 the two dives). */
5651 static
5652 int64_t
5653 btr_estimate_n_rows_in_range_low(
5654 	dict_index_t*	index,
5655 	const dtuple_t*	tuple1,
5656 	page_cur_mode_t	mode1,
5657 	const dtuple_t*	tuple2,
5658 	page_cur_mode_t	mode2,
5659 	unsigned	nth_attempt)
5660 {
5661 	btr_path_t	path1[BTR_PATH_ARRAY_N_SLOTS];
5662 	btr_path_t	path2[BTR_PATH_ARRAY_N_SLOTS];
5663 	btr_cur_t	cursor;
5664 	btr_path_t*	slot1;
5665 	btr_path_t*	slot2;
5666 	ibool		diverged;
5667 	ibool		diverged_lot;
5668 	ulint		divergence_level;
5669 	int64_t		n_rows;
5670 	ibool		is_n_rows_exact;
5671 	ulint		i;
5672 	mtr_t		mtr;
5673 	int64_t		table_n_rows;
5674 
5675 	table_n_rows = dict_table_get_n_rows(index->table);
5676 
5677 	/* Below we dive to the two records specified by tuple1 and tuple2 and
5678 	we remember the entire dive paths from the tree root. The place where
5679 	the tuple1 path ends on the leaf level we call "left border" of our
5680 	interval and the place where the tuple2 path ends on the leaf level -
5681 	"right border". We take care to either include or exclude the interval
5682 	boundaries depending on whether <, <=, > or >= was specified. For
5683 	example if "5 < x AND x <= 10" then we should not include the left
5684 	boundary, but should include the right one. */
5685 
5686 	mtr_start(&mtr);
5687 
5688 	cursor.path_arr = path1;
5689 
5690 	bool	should_count_the_left_border;
5691 
5692 	if (dtuple_get_n_fields(tuple1) > 0) {
5693 
5694 		btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
5695 					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
5696 					    &cursor, 0,
5697 					    __FILE__, __LINE__, &mtr);
5698 
5699 		ut_ad(!page_rec_is_infimum(btr_cur_get_rec(&cursor)));
5700 
5701 		/* We should count the border if there are any records to
5702 		match the criteria, i.e. if the maximum record on the tree is
5703 		5 and x > 3 is specified then the cursor will be positioned at
5704 		5 and we should count the border, but if x > 7 is specified,
5705 		then the cursor will be positioned at 'sup' on the rightmost
5706 		leaf page in the tree and we should not count the border. */
5707 		should_count_the_left_border
5708 			= !page_rec_is_supremum(btr_cur_get_rec(&cursor));
5709 	} else {
5710 		btr_cur_open_at_index_side(true, index,
5711 					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
5712 					   &cursor, 0, &mtr);
5713 
5714 		ut_ad(page_rec_is_infimum(btr_cur_get_rec(&cursor)));
5715 
5716 		/* The range specified is without a left border, just
5717 		'x < 123' or 'x <= 123' and btr_cur_open_at_index_side()
5718 		positioned the cursor on the infimum record on the leftmost
5719 		page, which must not be counted. */
5720 		should_count_the_left_border = false;
5721 	}
5722 
5723 	mtr_commit(&mtr);
5724 
5725 	mtr_start(&mtr);
5726 
5727 	cursor.path_arr = path2;
5728 
5729 	bool	should_count_the_right_border;
5730 
5731 	if (dtuple_get_n_fields(tuple2) > 0) {
5732 
5733 		btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
5734 					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
5735 					    &cursor, 0,
5736 					    __FILE__, __LINE__, &mtr);
5737 
5738 		const rec_t*	rec = btr_cur_get_rec(&cursor);
5739 
5740 		ut_ad(!(mode2 == PAGE_CUR_L && page_rec_is_supremum(rec)));
5741 
5742 		should_count_the_right_border
5743 			= (mode2 == PAGE_CUR_LE /* if the range is '<=' */
5744 			   /* and the record was found */
5745 			   && cursor.low_match >= dtuple_get_n_fields(tuple2))
5746 			|| (mode2 == PAGE_CUR_L /* or if the range is '<' */
5747 			    /* and there are any records to match the criteria,
5748 			    i.e. if the minimum record on the tree is 5 and
5749 			    x < 7 is specified then the cursor will be
5750 			    positioned at 5 and we should count the border, but
5751 			    if x < 2 is specified, then the cursor will be
5752 			    positioned at 'inf' and we should not count the
5753 			    border */
5754 			    && !page_rec_is_infimum(rec));
5755 		/* Notice that for "WHERE col <= 'foo'" MySQL passes to
5756 		ha_innobase::records_in_range():
5757 		min_key=NULL (left-unbounded) which is expected
5758 		max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is
5759 		unexpected - one would expect
5760 		flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In this case the
5761 		cursor will be positioned on the first record to the right of
5762 		the requested one (can also be positioned on the 'sup') and
5763 		we should not count the right border. */
5764 	} else {
5765 		btr_cur_open_at_index_side(false, index,
5766 					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
5767 					   &cursor, 0, &mtr);
5768 
5769 		ut_ad(page_rec_is_supremum(btr_cur_get_rec(&cursor)));
5770 
5771 		/* The range specified is without a right border, just
5772 		'x > 123' or 'x >= 123' and btr_cur_open_at_index_side()
5773 		positioned the cursor on the supremum record on the rightmost
5774 		page, which must not be counted. */
5775 		should_count_the_right_border = false;
5776 	}
5777 
5778 	mtr_commit(&mtr);
5779 
5780 	/* We have the path information for the range in path1 and path2 */
5781 
5782 	n_rows = 0;
5783 	is_n_rows_exact = TRUE;
5784 
5785 	/* This becomes true when the two paths do not pass through the
5786 	same pages anymore. */
5787 	diverged = FALSE;
5788 
5789 	/* This becomes true when the paths are no longer the same or
5790 	adjacent; as long as it is false, the two paths pass through the
5791 	same or neighboring-on-the-same-level pages only. */
5792 	diverged_lot = FALSE;
5793 
5794 	/* This is the level where paths diverged a lot. */
5795 	divergence_level = 1000000;
5796 
5797 	for (i = 0; ; i++) {
5798 		ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
5799 
5800 		slot1 = path1 + i;
5801 		slot2 = path2 + i;
5802 
5803 		if (slot1->nth_rec == ULINT_UNDEFINED
5804 		    || slot2->nth_rec == ULINT_UNDEFINED) {
5805 
5806 			/* Here none of the borders were counted. For example,
5807 			if on the leaf level we descended to:
5808 			(inf, a, b, c, d, e, f, sup)
5809 			         ^        ^
5810 			       path1    path2
5811 			then n_rows will be 2 (c and d). */
5812 
5813 			if (is_n_rows_exact) {
5814 				/* Only fiddle to adjust this off-by-one
5815 				if the number is exact, otherwise we do
5816 				much grosser adjustments below. */
5817 
5818 				btr_path_t*	last1 = &path1[i - 1];
5819 				btr_path_t*	last2 = &path2[i - 1];
5820 
5821 				/* If both paths end up on the same record on
5822 				the leaf level. */
5823 				if (last1->page_no == last2->page_no
5824 				    && last1->nth_rec == last2->nth_rec) {
5825 
5826 					/* n_rows can be > 0 here if the paths
5827 					were first different and then converged
5828 					to the same record on the leaf level.
5829 					For example:
5830 					SELECT ... LIKE 'wait/synch/rwlock%'
5831 					mode1=PAGE_CUR_GE,
5832 					tuple1="wait/synch/rwlock"
5833 					path1[0]={nth_rec=58, n_recs=58,
5834 						  page_no=3, page_level=1}
5835 					path1[1]={nth_rec=56, n_recs=55,
5836 						  page_no=119, page_level=0}
5837 
5838 					mode2=PAGE_CUR_G
5839 					tuple2="wait/synch/rwlock"
5840 					path2[0]={nth_rec=57, n_recs=57,
5841 						  page_no=3, page_level=1}
5842 					path2[1]={nth_rec=56, n_recs=55,
5843 						  page_no=119, page_level=0} */
5844 
5845 					/* If the range is such that we should
5846 					count both borders, then avoid
5847 					counting that record twice - once as a
5848 					left border and once as a right
5849 					border. */
5850 					if (should_count_the_left_border
5851 					    && should_count_the_right_border) {
5852 
5853 						n_rows = 1;
5854 					} else {
5855 						/* Some of the borders should
5856 						not be counted, e.g. [3,3). */
5857 						n_rows = 0;
5858 					}
5859 				} else {
5860 					if (should_count_the_left_border) {
5861 						n_rows++;
5862 					}
5863 
5864 					if (should_count_the_right_border) {
5865 						n_rows++;
5866 					}
5867 				}
5868 			}
5869 
5870 			if (i > divergence_level + 1 && !is_n_rows_exact) {
5871 				/* In trees whose height is > 1 our algorithm
5872 				tends to underestimate: multiply the estimate
5873 				by 2: */
5874 
5875 				n_rows = n_rows * 2;
5876 			}
5877 
5878 			DBUG_EXECUTE_IF("bug14007649", return(n_rows););
5879 
5880 			/* Do not estimate the number of rows in the range
5881 			to over 1 / 2 of the estimated rows in the whole
5882 			table */
5883 
5884 			if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {
5885 
5886 				n_rows = table_n_rows / 2;
5887 
5888 				/* If there are just 0 or 1 rows in the table,
5889 				then we estimate all rows are in the range */
5890 
5891 				if (n_rows == 0) {
5892 					n_rows = table_n_rows;
5893 				}
5894 			}
5895 
5896 			return(n_rows);
5897 		}
5898 
5899 		if (!diverged && slot1->nth_rec != slot2->nth_rec) {
5900 
5901 			/* If both slots do not point to the same page,
5902 			this means that the tree must have changed between
5903 			the dive for slot1 and the dive for slot2 at the
5904 			beginning of this function. */
5905 			if (slot1->page_no != slot2->page_no
5906 			    || slot1->page_level != slot2->page_level) {
5907 
5908 				/* If the tree keeps changing even after a
5909 				few attempts, then just return some arbitrary
5910 				number. */
5911 				if (nth_attempt >= rows_in_range_max_retries) {
5912 					return(rows_in_range_arbitrary_ret_val);
5913 				}
5914 
5915 				const int64_t	ret =
5916 					btr_estimate_n_rows_in_range_low(
5917 						index, tuple1, mode1,
5918 						tuple2, mode2, nth_attempt + 1);
5919 
5920 				return(ret);
5921 			}
5922 
5923 			diverged = TRUE;
5924 
5925 			if (slot1->nth_rec < slot2->nth_rec) {
5926 				/* We do not count the borders (neither the
5927 				left nor the right one), thus "- 1". */
5928 				n_rows = slot2->nth_rec - slot1->nth_rec - 1;
5929 
5930 				if (n_rows > 0) {
5931 					/* There is at least one row between
5932 					the two borders pointed to by slot1
5933 					and slot2, so on the level below the
5934 					slots will point to non-adjacent
5935 					pages. */
5936 					diverged_lot = TRUE;
5937 					divergence_level = i;
5938 				}
5939 			} else {
5940 				/* It is possible that
5941 				slot1->nth_rec >= slot2->nth_rec
5942 				if, for example, we have a single page
5943 				tree which contains (inf, 5, 6, supr)
5944 				and we select where x > 20 and x < 30;
5945 				in this case slot1->nth_rec will point
5946 				to the supr record and slot2->nth_rec
5947 				will point to 6. */
5948 				n_rows = 0;
5949 				should_count_the_left_border = false;
5950 				should_count_the_right_border = false;
5951 			}
5952 
5953 		} else if (diverged && !diverged_lot) {
5954 
5955 			if (slot1->nth_rec < slot1->n_recs
5956 			    || slot2->nth_rec > 1) {
5957 
5958 				diverged_lot = TRUE;
5959 				divergence_level = i;
5960 
5961 				n_rows = 0;
5962 
5963 				if (slot1->nth_rec < slot1->n_recs) {
5964 					n_rows += slot1->n_recs
5965 						- slot1->nth_rec;
5966 				}
5967 
5968 				if (slot2->nth_rec > 1) {
5969 					n_rows += slot2->nth_rec - 1;
5970 				}
5971 			}
5972 		} else if (diverged_lot) {
5973 
5974 			n_rows = btr_estimate_n_rows_in_range_on_level(
5975 				index, slot1, slot2, n_rows,
5976 				&is_n_rows_exact);
5977 		}
5978 	}
5979 }
5980 
5981 /** Estimates the number of rows in a given index range.
5982 @param[in]	index	index
5983 @param[in]	tuple1	range start, may also be empty tuple
5984 @param[in]	mode1	search mode for range start
5985 @param[in]	tuple2	range end, may also be empty tuple
5986 @param[in]	mode2	search mode for range end
5987 @return estimated number of rows */
5988 int64_t
5989 btr_estimate_n_rows_in_range(
5990 	dict_index_t*	index,
5991 	const dtuple_t*	tuple1,
5992 	page_cur_mode_t	mode1,
5993 	const dtuple_t*	tuple2,
5994 	page_cur_mode_t	mode2)
5995 {
5996 	const int64_t	ret = btr_estimate_n_rows_in_range_low(
5997 		index, tuple1, mode1, tuple2, mode2, 1 /* first attempt */);
5998 
5999 	return(ret);
6000 }
6001 
6002 /*******************************************************************//**
6003 Record the number of non_null key values in a given index for
6004 each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6005 The estimates are eventually stored in the array:
6006 index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
6007 static
6008 void
6009 btr_record_not_null_field_in_rec(
6010 /*=============================*/
6011 	ulint		n_unique,	/*!< in: dict_index_get_n_unique(index),
6012 					number of columns that uniquely
6013 					determine an index entry */
6014 	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
6015 					its size could be for all fields or
6016 					that of "n_unique" */
6017 	ib_uint64_t*	n_not_null)	/*!< in/out: array to record number of
6018 					not null rows for n-column prefix */
6019 {
6020 	ulint	i;
6021 
6022 	ut_ad(rec_offs_n_fields(offsets) >= n_unique);
6023 
6024 	if (n_not_null == NULL) {
6025 		return;
6026 	}
6027 
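	/* Once a NULL is found in the i-th column, every longer column
	prefix of this record also contains that NULL, so the loop below
	stops counting at the first NULL. */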
6028 	for (i = 0; i < n_unique; i++) {
6029 		if (rec_offs_nth_sql_null(offsets, i)) {
6030 			break;
6031 		}
6032 
6033 		n_not_null[i]++;
6034 	}
6035 }
6036 
6037 /*******************************************************************//**
6038 Estimates the number of different key values in a given index, for
6039 each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6040 The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
6041 0..n_uniq-1) and the number of pages that were sampled is saved in
6042 index->stat_n_sample_sizes[].
6043 If innodb_stats_method is nulls_ignored, we also record the number of
6044 non-null values for each prefix and stored the estimates in
6045 array index->stat_n_non_null_key_vals.
6046 @return true if the index is available and we get the estimated numbers,
6047 false if the index is unavailable. */
6048 bool
6049 btr_estimate_number_of_different_key_vals(
6050 /*======================================*/
6051 	dict_index_t*	index)	/*!< in: index */
6052 {
6053 	btr_cur_t	cursor;
6054 	page_t*		page;
6055 	rec_t*		rec;
6056 	ulint		n_cols;
6057 	ib_uint64_t*	n_diff;
6058 	ib_uint64_t*	n_not_null;
6059 	ibool		stats_null_not_equal;
6060 	uintmax_t	n_sample_pages; /* number of pages to sample */
6061 	ulint		not_empty_flag	= 0;
6062 	ulint		total_external_size = 0;
6063 	ulint		i;
6064 	ulint		j;
6065 	uintmax_t	add_on;
6066 	mtr_t		mtr;
6067 	mem_heap_t*	heap		= NULL;
6068 	ulint*		offsets_rec	= NULL;
6069 	ulint*		offsets_next_rec = NULL;
6070 
6071 	/* For a spatial index, no such statistics can be
6072 	fetched. */
6073 	if (dict_index_is_spatial(index)) {
6074 		return(false);
6075 	}
6076 
6077 	n_cols = dict_index_get_n_unique(index);
6078 
6079 	heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
6080 			       * n_cols
6081 			       + dict_index_get_n_fields(index)
6082 			       * (sizeof *offsets_rec
6083 				  + sizeof *offsets_next_rec));
6084 
6085 	n_diff = (ib_uint64_t*) mem_heap_zalloc(
6086 		heap, n_cols * sizeof(n_diff[0]));
6087 
6088 	n_not_null = NULL;
6089 
6090 	/* Check srv_innodb_stats_method setting, and decide whether we
6091 	need to record non-null value and also decide if NULL is
6092 	considered equal (by setting stats_null_not_equal value) */
6093 	switch (srv_innodb_stats_method) {
6094 	case SRV_STATS_NULLS_IGNORED:
6095 		n_not_null = (ib_uint64_t*) mem_heap_zalloc(
6096 			heap, n_cols * sizeof *n_not_null);
6097 		/* fall through */
6098 
6099 	case SRV_STATS_NULLS_UNEQUAL:
6100 		/* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
6101 		case, we will treat NULLs as unequal value */
6102 		stats_null_not_equal = TRUE;
6103 		break;
6104 
6105 	case SRV_STATS_NULLS_EQUAL:
6106 		stats_null_not_equal = FALSE;
6107 		break;
6108 
6109 	default:
6110 		ut_error;
6111 	}
6112 
6113 	/* It makes no sense to test more pages than are contained
6114 	in the index, thus we lower the number if it is too high */
6115 	if (srv_stats_transient_sample_pages > index->stat_index_size) {
6116 		if (index->stat_index_size > 0) {
6117 			n_sample_pages = index->stat_index_size;
6118 		} else {
6119 			n_sample_pages = 1;
6120 		}
6121 	} else {
6122 		n_sample_pages = srv_stats_transient_sample_pages;
6123 	}
6124 
6125 	/* We sample some pages in the index to get an estimate */
6126 
6127 	for (i = 0; i < n_sample_pages; i++) {
6128 		mtr_start(&mtr);
6129 
6130 		bool	available;
6131 
6132 		available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
6133 						    &cursor, &mtr);
6134 
6135 		if (!available) {
6136 			mtr_commit(&mtr);
6137 			mem_heap_free(heap);
6138 
6139 			return(false);
6140 		}
6141 
6142 		/* Count the number of different key values for each prefix of
6143 		the key on this index page. If the prefix does not determine
6144 		the index record uniquely in the B-tree, then we subtract one
6145 		because otherwise our algorithm would give a wrong estimate
6146 		for an index where there is just one key value. */
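		/* For example (a sketch): with n_cols == 2 and a page
		containing the keys (1,1), (1,2), (2,1), the adjacent-record
		comparisons yield matched_fields of 1 and 0, so n_diff[0] is
		incremented once and n_diff[1] twice. */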
6147 
6148 		page = btr_cur_get_page(&cursor);
6149 
6150 		rec = page_rec_get_next(page_get_infimum_rec(page));
6151 
6152 		if (!page_rec_is_supremum(rec)) {
6153 			not_empty_flag = 1;
6154 			offsets_rec = rec_get_offsets(rec, index, offsets_rec,
6155 						      ULINT_UNDEFINED, &heap);
6156 
6157 			if (n_not_null != NULL) {
6158 				btr_record_not_null_field_in_rec(
6159 					n_cols, offsets_rec, n_not_null);
6160 			}
6161 		}
6162 
6163 		while (!page_rec_is_supremum(rec)) {
6164 			ulint	matched_fields;
6165 			rec_t*	next_rec = page_rec_get_next(rec);
6166 			if (page_rec_is_supremum(next_rec)) {
6167 				total_external_size +=
6168 					btr_rec_get_externally_stored_len(
6169 						rec, offsets_rec);
6170 				break;
6171 			}
6172 
6173 			offsets_next_rec = rec_get_offsets(next_rec, index,
6174 							   offsets_next_rec,
6175 							   ULINT_UNDEFINED,
6176 							   &heap);
6177 
6178 			cmp_rec_rec_with_match(rec, next_rec,
6179 					       offsets_rec, offsets_next_rec,
6180 					       index,
6181 					       page_is_spatial_non_leaf(next_rec, index),
6182 					       stats_null_not_equal,
6183 					       &matched_fields);
6184 
6185 			for (j = matched_fields; j < n_cols; j++) {
6186 				/* We add one if this index record has
6187 				a different prefix from the previous */
6188 
6189 				n_diff[j]++;
6190 			}
6191 
6192 			if (n_not_null != NULL) {
6193 				btr_record_not_null_field_in_rec(
6194 					n_cols, offsets_next_rec, n_not_null);
6195 			}
6196 
6197 			total_external_size
6198 				+= btr_rec_get_externally_stored_len(
6199 					rec, offsets_rec);
6200 
6201 			rec = next_rec;
6202 			/* Initialize offsets_rec for the next round
6203 			and assign the old offsets_rec buffer to
6204 			offsets_next_rec. */
6205 			{
6206 				ulint*	offsets_tmp = offsets_rec;
6207 				offsets_rec = offsets_next_rec;
6208 				offsets_next_rec = offsets_tmp;
6209 			}
6210 		}
6211 
6212 
6213 		if (n_cols == dict_index_get_n_unique_in_tree(index)) {
6214 
6215 			/* If there is more than one leaf page in the tree,
6216 			we add one because we know that the first record
6217 			on the page certainly had a different prefix than the
6218 			last record on the previous index page in the
6219 			alphabetical order. Before this fix, if there was
6220 			just one big record on each clustered index page, the
6221 			algorithm grossly underestimated the number of rows
6222 			in the table. */
6223 
6224 			if (btr_page_get_prev(page, &mtr) != FIL_NULL
6225 			    || btr_page_get_next(page, &mtr) != FIL_NULL) {
6226 
6227 				n_diff[n_cols - 1]++;
6228 			}
6229 		}
6230 
6231 		mtr_commit(&mtr);
6232 	}
6233 
6234 	/* If we saw k borders between different key values on
6235 	n_sample_pages leaf pages, we can estimate how many
6236 	there will be in index->stat_n_leaf_pages */
6237 
6238 	/* We must take into account that our sample actually represents
6239 	also the pages used for external storage of fields (those pages are
6240 	included in index->stat_n_leaf_pages) */
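	/* As a back-of-the-envelope sketch (illustrative numbers only): if
	8 sampled leaf pages showed 400 distinct-prefix borders and
	index->stat_n_leaf_pages is 800, the scaled estimate is on the order
	of 400 * 800 / 8 = 40000 distinct values, before the add_on
	correction applied below. */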
6241 
6242 	for (j = 0; j < n_cols; j++) {
6243 		index->stat_n_diff_key_vals[j]
6244 			= BTR_TABLE_STATS_FROM_SAMPLE(
6245 				n_diff[j], index, n_sample_pages,
6246 				total_external_size, not_empty_flag);
6247 
6248 		/* If the tree is small, smaller than
6249 		10 * n_sample_pages + total_external_size, then
6250 		the above estimate is ok. For bigger trees it is common that we
6251 		do not see any borders between key values in the few pages
6252 		we pick. But still there may be n_sample_pages
6253 		different key values, or even more. Let us try to approximate
6254 		that: */
6255 
6256 		add_on = index->stat_n_leaf_pages
6257 			/ (10 * (n_sample_pages
6258 				 + total_external_size));
6259 
6260 		if (add_on > n_sample_pages) {
6261 			add_on = n_sample_pages;
6262 		}
6263 
6264 		index->stat_n_diff_key_vals[j] += add_on;
6265 
6266 		index->stat_n_sample_sizes[j] = n_sample_pages;
6267 
6268 		/* Update the stat_n_non_null_key_vals[] with our
6269 		sampled result. stat_n_non_null_key_vals[] is created
6270 		and initialized to zero in dict_index_add_to_cache(),
6271 		along with stat_n_diff_key_vals[] array */
6272 		if (n_not_null != NULL) {
6273 			index->stat_n_non_null_key_vals[j] =
6274 				 BTR_TABLE_STATS_FROM_SAMPLE(
6275 					n_not_null[j], index, n_sample_pages,
6276 					total_external_size, not_empty_flag);
6277 		}
6278 	}
6279 
6280 	mem_heap_free(heap);
6281 
6282 	return(true);
6283 }
6284 
6285 /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
6286 
6287 /***********************************************************//**
6288 Gets the offset of the pointer to the externally stored part of a field.
6289 @return offset of the pointer to the externally stored part */
6290 static
6291 ulint
6292 btr_rec_get_field_ref_offs(
6293 /*=======================*/
6294 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
6295 	ulint		n)	/*!< in: index of the external field */
6296 {
6297 	ulint	field_ref_offs;
6298 	ulint	local_len;
6299 
6300 	ut_a(rec_offs_nth_extern(offsets, n));
6301 	field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
6302 	ut_a(local_len != UNIV_SQL_NULL);
6303 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6304 
6305 	return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
6306 }
6307 
6308 /** Gets a pointer to the externally stored part of a field.
6309 @param rec record
6310 @param offsets rec_get_offsets(rec)
6311 @param n index of the externally stored field
6312 @return pointer to the externally stored part */
6313 #define btr_rec_get_field_ref(rec, offsets, n)			\
6314 	((rec) + btr_rec_get_field_ref_offs(offsets, n))
6315 
6316 /** Gets the externally stored size of a record, in units of a database page.
6317 @param[in]	rec	record
6318 @param[in]	offsets	array returned by rec_get_offsets()
6319 @return externally stored part, in units of a database page */
6320 ulint
6321 btr_rec_get_externally_stored_len(
6322 	const rec_t*	rec,
6323 	const ulint*	offsets)
6324 {
6325 	ulint	n_fields;
6326 	ulint	total_extern_len = 0;
6327 	ulint	i;
6328 
6329 	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6330 
6331 	if (!rec_offs_any_extern(offsets)) {
6332 		return(0);
6333 	}
6334 
6335 	n_fields = rec_offs_n_fields(offsets);
6336 
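	/* Each externally stored field contributes its length rounded up to
	a whole page: e.g. a 100000-byte BLOB with a 16KB UNIV_PAGE_SIZE
	counts as 7 pages. */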
6337 	for (i = 0; i < n_fields; i++) {
6338 		if (rec_offs_nth_extern(offsets, i)) {
6339 
6340 			ulint	extern_len = mach_read_from_4(
6341 				btr_rec_get_field_ref(rec, offsets, i)
6342 				+ BTR_EXTERN_LEN + 4);
6343 
6344 			total_extern_len += ut_calc_align(extern_len,
6345 							  UNIV_PAGE_SIZE);
6346 		}
6347 	}
6348 
6349 	return(total_extern_len / UNIV_PAGE_SIZE);
6350 }
6351 
6352 /*******************************************************************//**
6353 Sets the ownership bit of an externally stored field in a record. */
6354 static
6355 void
6356 btr_cur_set_ownership_of_extern_field(
6357 /*==================================*/
6358 	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
6359 				part will be updated, or NULL */
6360 	rec_t*		rec,	/*!< in/out: clustered index record */
6361 	dict_index_t*	index,	/*!< in: index of the page */
6362 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
6363 	ulint		i,	/*!< in: field number */
6364 	ibool		val,	/*!< in: value to set */
6365 	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
6366 {
6367 	byte*	data;
6368 	ulint	local_len;
6369 	ulint	byte_val;
6370 
6371 	data = rec_get_nth_field(rec, offsets, i, &local_len);
6372 	ut_ad(rec_offs_nth_extern(offsets, i));
6373 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6374 
6375 	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
6376 
6377 	byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
6378 
6379 	if (val) {
6380 		byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
6381 	} else {
6382 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6383 		ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
6384 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6385 		byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
6386 	}
6387 
6388 	if (page_zip) {
6389 		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
6390 		page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
6391 	} else if (mtr != NULL) {
6392 
6393 		mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
6394 				 MLOG_1BYTE, mtr);
6395 	} else {
6396 		mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
6397 	}
6398 }
6399 
6400 /*******************************************************************//**
6401 Marks non-updated off-page fields as disowned by this record. The ownership
6402 must be transferred to the updated record which is inserted elsewhere in the
6403 index tree. In purge only the owner of an externally stored field is allowed
6404 to free the field. */
6405 void
6406 btr_cur_disown_inherited_fields(
6407 /*============================*/
6408 	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
6409 				part will be updated, or NULL */
6410 	rec_t*		rec,	/*!< in/out: record in a clustered index */
6411 	dict_index_t*	index,	/*!< in: index of the page */
6412 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
6413 	const upd_t*	update,	/*!< in: update vector */
6414 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
6415 {
6416 	ulint	i;
6417 
6418 	ut_ad(rec_offs_validate(rec, index, offsets));
6419 	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6420 	ut_ad(rec_offs_any_extern(offsets));
6421 	ut_ad(mtr);
6422 
6423 	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
6424 		if (rec_offs_nth_extern(offsets, i)
6425 		    && !upd_get_field_by_field_no(update, i, false)) {
6426 			btr_cur_set_ownership_of_extern_field(
6427 				page_zip, rec, index, offsets, i, FALSE, mtr);
6428 		}
6429 	}
6430 }
6431 
6432 /*******************************************************************//**
6433 Marks all extern fields in a record as owned by the record. This function
6434 should be called if the delete mark of a record is removed: a record that is
6435 not delete-marked always owns all its extern fields. */
6436 static
6437 void
6438 btr_cur_unmark_extern_fields(
6439 /*=========================*/
6440 	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
6441 				part will be updated, or NULL */
6442 	rec_t*		rec,	/*!< in/out: record in a clustered index */
6443 	dict_index_t*	index,	/*!< in: index of the page */
6444 	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
6445 	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
6446 {
6447 	ulint	n;
6448 	ulint	i;
6449 
6450 	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6451 	n = rec_offs_n_fields(offsets);
6452 
6453 	if (!rec_offs_any_extern(offsets)) {
6454 
6455 		return;
6456 	}
6457 
6458 	for (i = 0; i < n; i++) {
6459 		if (rec_offs_nth_extern(offsets, i)) {
6460 
6461 			btr_cur_set_ownership_of_extern_field(
6462 				page_zip, rec, index, offsets, i, TRUE, mtr);
6463 		}
6464 	}
6465 }
6466 
6467 /*******************************************************************//**
6468 Returns the length of a BLOB part stored on the header page.
6469 @return part length */
6470 static
6471 ulint
6472 btr_blob_get_part_len(
6473 /*==================*/
6474 	const byte*	blob_header)	/*!< in: blob header */
6475 {
6476 	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
6477 }
6478 
6479 /*******************************************************************//**
6480 Returns the page number where the next BLOB part is stored.
6481 @return page number or FIL_NULL if no more pages */
6482 static
6483 ulint
6484 btr_blob_get_next_page_no(
6485 /*======================*/
6486 	const byte*	blob_header)	/*!< in: blob header */
6487 {
6488 	return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
6489 }
6490 
6491 /*******************************************************************//**
6492 Deallocate a buffer block that was reserved for a BLOB part. */
6493 static
6494 void
6495 btr_blob_free(
6496 /*==========*/
6497 	dict_index_t*	index,	/*!< in: index */
6498 	buf_block_t*	block,	/*!< in: buffer block */
6499 	ibool		all,	/*!< in: TRUE=remove also the compressed page
6500 				if there is one */
6501 	mtr_t*		mtr)	/*!< in: mini-transaction to commit */
6502 {
6503 	buf_pool_t*	buf_pool = buf_pool_from_block(block);
6504 	ulint		space = block->page.id.space();
6505 	ulint		page_no	= block->page.id.page_no();
6506 
6507 	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
6508 
6509 	mtr_commit(mtr);
6510 
6511 	buf_pool_mutex_enter(buf_pool);
6512 
6513 	/* Only free the block if it is still allocated to
6514 	the same file page. */
6515 
6516 	if (buf_block_get_state(block)
6517 	    == BUF_BLOCK_FILE_PAGE
6518 	    && block->page.id.space() == space
6519 	    && block->page.id.page_no() == page_no) {
6520 
6521 		if (!buf_LRU_free_page(&block->page, all)
6522 		    && all && block->page.zip.data) {
6523 			/* Attempt to deallocate the uncompressed page
6524 			if the whole block cannot be deallocated. */
6525 
6526 			buf_LRU_free_page(&block->page, false);
6527 		}
6528 	}
6529 
6530 	buf_pool_mutex_exit(buf_pool);
6531 }
6532 
6533 /** Helper class used while writing blob pages, during insert or update. */
6534 struct btr_blob_log_check_t {
6535 	/** Persistent cursor on a clustered index record with blobs. */
6536 	btr_pcur_t*	m_pcur;
6537 	/** Mini transaction holding the latches for m_pcur */
6538 	mtr_t*		m_mtr;
6539 	/** rec_get_offsets(rec, index); offsets of clust_rec */
6540 	const ulint*	m_offsets;
6541 	/** The block containing clustered record */
6542 	buf_block_t**	m_block;
6543 	/** The clustered record pointer */
6544 	rec_t**		m_rec;
6545 	/** The blob operation code */
6546 	enum blob_op	m_op;
6547 
6548 	/** Constructor
6549 	@param[in]	pcur		persistent cursor on a clustered
6550 					index record with blobs.
6551 	@param[in]	mtr		mini-transaction holding latches for
6552 					pcur.
6553 	@param[in]	offsets		offsets of the clust_rec
6554 	@param[in,out]	block		record block containing pcur record
6555 	@param[in,out]	rec		the clustered record pointer
6556 	@param[in]	op		the blob operation code */
6557 	btr_blob_log_check_t(
6558 		btr_pcur_t*	pcur,
6559 		mtr_t*		mtr,
6560 		const ulint*	offsets,
6561 		buf_block_t**	block,
6562 		rec_t**		rec,
6563 		enum blob_op	op)
6564 		: m_pcur(pcur),
6565 		  m_mtr(mtr),
6566 		  m_offsets(offsets),
6567 		  m_block(block),
6568 		  m_rec(rec),
6569 		  m_op(op)
6570 	{
6571 		ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
6572 		ut_ad((*m_block)->frame == page_align(*m_rec));
6573 		ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
6574 	}
6575 
6576 	/** Check if there is enough space in the log file. Commit and re-start the
6577 	mini-transaction. */
6578 	void check()
6579 	{
6580 		dict_index_t*	index = m_pcur->index();
6581 		ulint		offs = 0;
6582 		ulint		page_no = ULINT_UNDEFINED;
6583 		FlushObserver*	observer = m_mtr->get_flush_observer();
6584 
6585 		if (m_op == BTR_STORE_INSERT_BULK) {
6586 			offs = page_offset(*m_rec);
6587 			page_no = page_get_page_no(
6588 				buf_block_get_frame(*m_block));
6589 
6590 			buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__);
6591 		} else {
6592 			btr_pcur_store_position(m_pcur, m_mtr);
6593 		}
6594 		m_mtr->commit();
6595 
6596 		DEBUG_SYNC_C("blob_write_middle");
6597 
6598 		log_free_check();
6599 
6600 		DEBUG_SYNC_C("blob_write_middle_after_check");
6601 
6602 		const mtr_log_t log_mode = m_mtr->get_log_mode();
6603 		m_mtr->start();
6604 		m_mtr->set_log_mode(log_mode);
6605 		m_mtr->set_named_space(index->space);
6606 		m_mtr->set_flush_observer(observer);
6607 
6608 		if (m_op == BTR_STORE_INSERT_BULK) {
6609 			page_id_t       page_id(dict_index_get_space(index),
6610 						page_no);
6611 			page_size_t     page_size(dict_table_page_size(
6612 						index->table));
6613 			page_cur_t*	page_cur = &m_pcur->btr_cur.page_cur;
6614 
6615 			mtr_x_lock(dict_index_get_lock(index), m_mtr);
6616 			page_cur->block = btr_block_get(
6617 				page_id, page_size, RW_X_LATCH, index, m_mtr);
6618 			page_cur->rec = buf_block_get_frame(page_cur->block)
6619 				+ offs;
6620 
6621 			buf_block_buf_fix_dec(page_cur->block);
6622 		} else {
6623 			ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
6624 			bool ret = btr_pcur_restore_position(
6625 				BTR_MODIFY_LEAF | BTR_MODIFY_EXTERNAL,
6626 				m_pcur, m_mtr);
6627 
6628 			ut_a(ret);
6629 		}
6630 
6631 		*m_block	= btr_pcur_get_block(m_pcur);
6632 		*m_rec		= btr_pcur_get_rec(m_pcur);
6633 
6634 		ut_d(rec_offs_make_valid(
6635 			*m_rec, index, const_cast<ulint*>(m_offsets)));
6636 
6637 		ut_ad(m_mtr->memo_contains_page_flagged(
6638 		      *m_rec,
6639 		      MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)
6640 		      || dict_table_is_intrinsic(index->table));
6641 
6642 		ut_ad(mtr_memo_contains_flagged(m_mtr,
6643 		      dict_index_get_lock(index),
6644 		      MTR_MEMO_SX_LOCK | MTR_MEMO_X_LOCK)
6645 		      || dict_table_is_intrinsic(index->table));
6646 	}
6647 };
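
/* Usage pattern (see btr_store_big_rec_extern_fields() below): the caller
constructs one btr_blob_log_check_t over the persistent cursor and, every few
BLOB pages written, calls check(). check() saves the cursor position (or, for
BTR_STORE_INSERT_BULK, buffer-fixes the block and remembers the record
offset), commits the mini-transaction so that log_free_check() can wait for
redo log space without any page latches being held, then restarts the
mini-transaction with the same log mode, tablespace and flush observer,
repositions the cursor, and refreshes *m_block and *m_rec for the caller. */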
6648 
6649 
6650 /*******************************************************************//**
6651 Stores the fields in big_rec_vec to the tablespace and puts pointers to
6652 them in rec.  The extern flags in rec will have to be set beforehand.
6653 The fields are stored on pages allocated from the leaf node
6654 file segment of the index tree.
6655 
6656 TODO: If the allocation extends the tablespace, it will not be redo logged in
6657 any mini-transaction.  Tablespace extension should be redo-logged, so that
6658 recovery will not fail when the big_rec was written to the extended portion of
6659 the file, in case the file was somehow truncated in the crash.
6660 
6661 @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
6662 dberr_t
6663 btr_store_big_rec_extern_fields(
6664 /*============================*/
6665 	btr_pcur_t*	pcur,		/*!< in/out: a persistent cursor. if
6666 					btr_mtr is restarted, then this can
6667 					be repositioned. */
6668 	const upd_t*	upd,		/*!< in: update vector */
6669 	ulint*		offsets,	/*!< in/out: rec_get_offsets() on
6670 					pcur. the "external storage" flags
6671 					in offsets will correctly correspond
6672 					to rec when this function returns */
6673 	const big_rec_t*big_rec_vec,	/*!< in: vector containing fields
6674 					to be stored externally */
6675 	mtr_t*		btr_mtr,	/*!< in/out: mtr containing the
6676 					latches to the clustered index. can be
6677 					committed and restarted. */
6678 	enum blob_op	op)		/*!< in: operation code */
6679 {
6680 	ulint		rec_page_no;
6681 	byte*		field_ref;
6682 	ulint		extern_len;
6683 	ulint		store_len;
6684 	ulint		page_no;
6685 	ulint		space_id;
6686 	ulint		prev_page_no;
6687 	ulint		hint_page_no;
6688 	ulint		i;
6689 	mtr_t		mtr;
6690 	mtr_t		mtr_bulk;
6691 	mem_heap_t*	heap = NULL;
6692 	page_zip_des_t*	page_zip;
6693 	z_stream	c_stream;
6694 	dberr_t		error		= DB_SUCCESS;
6695 	dict_index_t*	index		= pcur->index();
6696 	buf_block_t*	rec_block	= btr_pcur_get_block(pcur);
6697 	rec_t*		rec		= btr_pcur_get_rec(pcur);
6698 
6699 	ut_ad(rec_offs_validate(rec, index, offsets));
6700 	ut_ad(rec_offs_any_extern(offsets));
6701 	ut_ad(btr_mtr);
6702 	ut_ad(mtr_memo_contains_flagged(btr_mtr, dict_index_get_lock(index),
6703 					MTR_MEMO_X_LOCK
6704 					| MTR_MEMO_SX_LOCK)
6705 	      || dict_table_is_intrinsic(index->table)
6706 	      || !index->is_committed());
6707 	ut_ad(mtr_is_block_fix(
6708 		btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX, index->table));
6709 	ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
6710 	ut_a(dict_index_is_clust(index));
6711 
6712 	ut_a(dict_table_page_size(index->table)
6713 		.equals_to(rec_block->page.size));
6714 
6715 	btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
6716 				      &rec, op);
6717 	page_zip = buf_block_get_page_zip(rec_block);
6718 	space_id = rec_block->page.id.space();
6719 	rec_page_no = rec_block->page.id.page_no();
6720 	ut_a(fil_page_index_page_check(page_align(rec))
6721 	     || op == BTR_STORE_INSERT_BULK);
6722 
6723 	if (page_zip) {
6724 		int	err;
6725 
6726 		/* Zlib deflate needs 128 kilobytes for the default
6727 		window size, plus 512 << memLevel, plus a few
6728 		kilobytes for small objects.  We use reduced memLevel
6729 		to limit the memory consumption, and preallocate the
6730 		heap, hoping to avoid memory fragmentation. */
6731 		heap = mem_heap_create(250000);
6732 		page_zip_set_alloc(&c_stream, heap);
6733 
6734 		err = deflateInit2(&c_stream, page_zip_level,
6735 				   Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
6736 		ut_a(err == Z_OK);
6737 	}
6738 
6739 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6740 	/* All pointers to externally stored columns in the record
6741 	must either be zero or they must be pointers to inherited
6742 	columns, owned by this record or an earlier record version. */
6743 	for (i = 0; i < big_rec_vec->n_fields; i++) {
6744 		field_ref = btr_rec_get_field_ref(
6745 			rec, offsets, big_rec_vec->fields[i].field_no);
6746 
6747 		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
6748 		/* Either this must be an update in place,
6749 		or the BLOB must be inherited, or the BLOB pointer
6750 		must be zero (will be written in this function). */
6751 		ut_a(op == BTR_STORE_UPDATE
6752 		     || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
6753 		     || !memcmp(field_ref, field_ref_zero,
6754 				BTR_EXTERN_FIELD_REF_SIZE));
6755 	}
6756 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6757 
6758 	const page_size_t	page_size(dict_table_page_size(index->table));
6759 
6760 	/* Space available in compressed page to carry blob data */
6761 	const ulint	payload_size_zip = page_size.physical()
6762 		- FIL_PAGE_DATA;
6763 
6764 	/* Space available in uncompressed page to carry blob data */
6765 	const ulint	payload_size = page_size.physical()
6766 		- FIL_PAGE_DATA - BTR_BLOB_HDR_SIZE - FIL_PAGE_DATA_END;
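
	/* For example, with the default 16KB universal page size and the
	usual header/trailer sizes (38-byte FIL_PAGE_DATA, 8-byte
	FIL_PAGE_DATA_END, 8-byte BTR_BLOB_HDR_SIZE), this works out to
	payload_size_zip = 16384 - 38 = 16346 bytes and
	payload_size = 16384 - 38 - 8 - 8 = 16330 bytes of BLOB data
	per page. */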
6767 
6768 	/* We have to create a file segment in the tablespace
6769 	for each field and put the pointer to the field in rec */
6770 
6771 	for (i = 0; i < big_rec_vec->n_fields; i++) {
6772 		const ulint field_no = big_rec_vec->fields[i].field_no;
6773 
6774 		field_ref = btr_rec_get_field_ref(rec, offsets, field_no);
6775 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6776 		/* A zero BLOB pointer should have been initially inserted. */
6777 		ut_a(!memcmp(field_ref, field_ref_zero,
6778 			     BTR_EXTERN_FIELD_REF_SIZE));
6779 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6780 		extern_len = big_rec_vec->fields[i].len;
6781 		UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
6782 				   extern_len);
6783 
6784 		ut_a(extern_len > 0);
6785 
6786 		prev_page_no = FIL_NULL;
6787 
6788 		if (page_zip) {
6789 			int	err = deflateReset(&c_stream);
6790 			ut_a(err == Z_OK);
6791 
6792 			c_stream.next_in = (Bytef*)
6793 				big_rec_vec->fields[i].data;
6794 			c_stream.avail_in = static_cast<uInt>(extern_len);
6795 		}
6796 
6797 		for (ulint blob_npages = 0;; ++blob_npages) {
6798 			buf_block_t*	block;
6799 			page_t*		page;
6800 			const ulint	commit_freq = 4;
6801 			ulint		r_extents;
6802 
6803 			ut_ad(page_align(field_ref) == page_align(rec));
6804 
6805 			if (!(blob_npages % commit_freq)) {
6806 
6807 				redo_log.check();
6808 
6809 				field_ref = btr_rec_get_field_ref(
6810 					rec, offsets, field_no);
6811 
6812 				page_zip = buf_block_get_page_zip(rec_block);
6813 				rec_page_no = rec_block->page.id.page_no();
6814 			}
6815 
6816 			mtr_start(&mtr);
6817 			mtr.set_named_space(index->space);
6818 			mtr.set_log_mode(btr_mtr->get_log_mode());
6819 			mtr.set_flush_observer(btr_mtr->get_flush_observer());
6820 
6821 			buf_page_get(rec_block->page.id,
6822 				     rec_block->page.size, RW_X_LATCH, &mtr);
6823 
6824 			if (prev_page_no == FIL_NULL) {
6825 				hint_page_no = 1 + rec_page_no;
6826 			} else {
6827 				hint_page_no = prev_page_no + 1;
6828 			}
6829 
6830 			mtr_t	*alloc_mtr;
6831 
6832 			if (op == BTR_STORE_INSERT_BULK) {
6833 				mtr_start(&mtr_bulk);
6834 				mtr_bulk.set_spaces(mtr);
6835 				alloc_mtr = &mtr_bulk;
6836 			} else {
6837 				alloc_mtr = &mtr;
6838 			}
6839 
6840 			if (!fsp_reserve_free_extents(&r_extents, space_id, 1,
6841 						      FSP_BLOB, alloc_mtr,
6842 						      1)) {
6843 
6844 				mtr_commit(alloc_mtr);
6845 				error = DB_OUT_OF_FILE_SPACE;
6846 				goto func_exit;
6847 			}
6848 
6849 			block = btr_page_alloc(index, hint_page_no, FSP_NO_DIR,
6850 					       0, alloc_mtr, &mtr);
6851 
6852 			alloc_mtr->release_free_extents(r_extents);
6853 
6854 			if (op == BTR_STORE_INSERT_BULK) {
6855 				mtr_commit(&mtr_bulk);
6856 			}
6857 
6858 			ut_a(block != NULL);
6859 
6860 			page_no = block->page.id.page_no();
6861 			page = buf_block_get_frame(block);
6862 
6863 			if (prev_page_no != FIL_NULL) {
6864 				buf_block_t*	prev_block;
6865 				page_t*		prev_page;
6866 
6867 				prev_block = buf_page_get(
6868 					page_id_t(space_id, prev_page_no),
6869 					rec_block->page.size,
6870 					RW_X_LATCH, &mtr);
6871 
6872 				buf_block_dbg_add_level(prev_block,
6873 							SYNC_EXTERN_STORAGE);
6874 				prev_page = buf_block_get_frame(prev_block);
6875 
6876 				if (page_zip) {
6877 					mlog_write_ulint(
6878 						prev_page + FIL_PAGE_NEXT,
6879 						page_no, MLOG_4BYTES, &mtr);
6880 					memcpy(buf_block_get_page_zip(
6881 						       prev_block)
6882 					       ->data + FIL_PAGE_NEXT,
6883 					       prev_page + FIL_PAGE_NEXT, 4);
6884 				} else {
6885 					mlog_write_ulint(
6886 						prev_page + FIL_PAGE_DATA
6887 						+ BTR_BLOB_HDR_NEXT_PAGE_NO,
6888 						page_no, MLOG_4BYTES, &mtr);
6889 				}
6890 
6891 			} else if (dict_index_is_online_ddl(index)) {
6892 				row_log_table_blob_alloc(index, page_no);
6893 			}
6894 
6895 			if (page_zip) {
6896 				int		err;
6897 				page_zip_des_t*	blob_page_zip;
6898 
6899 				/* Write FIL_PAGE_TYPE to the redo log
6900 				separately, before logging any other
6901 				changes to the page, so that the debug
6902 				assertions in
6903 				recv_parse_or_apply_log_rec_body() can
6904 				be made simpler.  Before InnoDB Plugin
6905 				1.0.4, the initialization of
6906 				FIL_PAGE_TYPE was logged as part of
6907 				the mlog_log_string() below. */
6908 
6909 				mlog_write_ulint(page + FIL_PAGE_TYPE,
6910 						 prev_page_no == FIL_NULL
6911 						 ? FIL_PAGE_TYPE_ZBLOB
6912 						 : FIL_PAGE_TYPE_ZBLOB2,
6913 						 MLOG_2BYTES, &mtr);
6914 
6915 				c_stream.next_out = page
6916 					+ FIL_PAGE_DATA;
6917 				c_stream.avail_out = static_cast<uInt>(
6918 					payload_size_zip);
6919 
6920 				err = deflate(&c_stream, Z_FINISH);
6921 				ut_a(err == Z_OK || err == Z_STREAM_END);
6922 				ut_a(err == Z_STREAM_END
6923 				     || c_stream.avail_out == 0);
6924 
6925 				/* Write the "next BLOB page" pointer */
6926 				mlog_write_ulint(page + FIL_PAGE_NEXT,
6927 						 FIL_NULL, MLOG_4BYTES, &mtr);
6928 				/* Initialize the unused "prev page" pointer */
6929 				mlog_write_ulint(page + FIL_PAGE_PREV,
6930 						 FIL_NULL, MLOG_4BYTES, &mtr);
6931 				/* Write a back pointer to the record
6932 				into the otherwise unused area.  This
6933 				information could be useful in
6934 				debugging.  Later, we might want to
6935 				implement the possibility to relocate
6936 				BLOB pages.  Then, we would need to be
6937 				able to adjust the BLOB pointer in the
6938 				record.  We do not store the heap
6939 				number of the record, because it can
6940 				change in page_zip_reorganize() or
6941 				btr_page_reorganize().  However, also
6942 				the page number of the record may
6943 				change when B-tree nodes are split or
6944 				merged.
6945 				NOTE: FIL_PAGE_FILE_FLUSH_LSN space is
6946 				used by R-tree index for a Split Sequence
6947 				Number */
6948 				ut_ad(!dict_index_is_spatial(index));
6949 
6950 				mlog_write_ulint(page
6951 						 + FIL_PAGE_FILE_FLUSH_LSN,
6952 						 space_id,
6953 						 MLOG_4BYTES, &mtr);
6954 				mlog_write_ulint(page
6955 						 + FIL_PAGE_FILE_FLUSH_LSN + 4,
6956 						 rec_page_no,
6957 						 MLOG_4BYTES, &mtr);
6958 
6959 				/* Zero out the unused part of the page. */
6960 				memset(page + page_zip_get_size(page_zip)
6961 				       - c_stream.avail_out,
6962 				       0, c_stream.avail_out);
6963 				mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN,
6964 						page_zip_get_size(page_zip)
6965 						- FIL_PAGE_FILE_FLUSH_LSN,
6966 						&mtr);
6967 				/* Copy the page to compressed storage,
6968 				because it will be flushed to disk
6969 				from there. */
6970 				blob_page_zip = buf_block_get_page_zip(block);
6971 				ut_ad(blob_page_zip);
6972 				ut_ad(page_zip_get_size(blob_page_zip)
6973 				      == page_zip_get_size(page_zip));
6974 				memcpy(blob_page_zip->data, page,
6975 				       page_zip_get_size(page_zip));
6976 
6977 				if (err == Z_OK && prev_page_no != FIL_NULL) {
6978 
6979 					goto next_zip_page;
6980 				}
6981 
6982 				if (err == Z_STREAM_END) {
6983 					mach_write_to_4(field_ref
6984 							+ BTR_EXTERN_LEN, 0);
6985 					mach_write_to_4(field_ref
6986 							+ BTR_EXTERN_LEN + 4,
6987 							c_stream.total_in);
6988 				} else {
6989 					memset(field_ref + BTR_EXTERN_LEN,
6990 					       0, 8);
6991 				}
6992 
6993 				if (prev_page_no == FIL_NULL) {
6994 					ut_ad(blob_npages == 0);
6995 					mach_write_to_4(field_ref
6996 							+ BTR_EXTERN_SPACE_ID,
6997 							space_id);
6998 
6999 					mach_write_to_4(field_ref
7000 							+ BTR_EXTERN_PAGE_NO,
7001 							page_no);
7002 
7003 					mach_write_to_4(field_ref
7004 							+ BTR_EXTERN_OFFSET,
7005 							FIL_PAGE_NEXT);
7006 				}
7007 
7008 				/* We compress the page when the bulk insert finishes. */
7009 				if (op != BTR_STORE_INSERT_BULK) {
7010 					page_zip_write_blob_ptr(
7011 						page_zip, rec, index, offsets,
7012 						field_no, &mtr);
7013 				}
7014 
7015 next_zip_page:
7016 				prev_page_no = page_no;
7017 
7018 				/* Commit mtr and release the
7019 				uncompressed page frame to save memory. */
7020 				btr_blob_free(index, block, FALSE, &mtr);
7021 
7022 				if (err == Z_STREAM_END) {
7023 					break;
7024 				}
7025 			} else {
7026 				mlog_write_ulint(page + FIL_PAGE_TYPE,
7027 						 FIL_PAGE_TYPE_BLOB,
7028 						 MLOG_2BYTES, &mtr);
7029 
7030 				if (extern_len > payload_size) {
7031 					store_len = payload_size;
7032 				} else {
7033 					store_len = extern_len;
7034 				}
7035 
7036 				mlog_write_string(page + FIL_PAGE_DATA
7037 						  + BTR_BLOB_HDR_SIZE,
7038 						  (const byte*)
7039 						  big_rec_vec->fields[i].data
7040 						  + big_rec_vec->fields[i].len
7041 						  - extern_len,
7042 						  store_len, &mtr);
7043 				mlog_write_ulint(page + FIL_PAGE_DATA
7044 						 + BTR_BLOB_HDR_PART_LEN,
7045 						 store_len, MLOG_4BYTES, &mtr);
7046 				mlog_write_ulint(page + FIL_PAGE_DATA
7047 						 + BTR_BLOB_HDR_NEXT_PAGE_NO,
7048 						 FIL_NULL, MLOG_4BYTES, &mtr);
7049 
7050 				extern_len -= store_len;
7051 
7052 				mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
7053 						 MLOG_4BYTES, &mtr);
7054 				mlog_write_ulint(field_ref
7055 						 + BTR_EXTERN_LEN + 4,
7056 						 big_rec_vec->fields[i].len
7057 						 - extern_len,
7058 						 MLOG_4BYTES, &mtr);
7059 
7060 				if (prev_page_no == FIL_NULL) {
7061 					ut_ad(blob_npages == 0);
7062 					mlog_write_ulint(field_ref
7063 							 + BTR_EXTERN_SPACE_ID,
7064 							 space_id, MLOG_4BYTES,
7065 							 &mtr);
7066 
7067 					mlog_write_ulint(field_ref
7068 							 + BTR_EXTERN_PAGE_NO,
7069 							 page_no, MLOG_4BYTES,
7070 							 &mtr);
7071 
7072 					mlog_write_ulint(field_ref
7073 							 + BTR_EXTERN_OFFSET,
7074 							 FIL_PAGE_DATA,
7075 							 MLOG_4BYTES,
7076 							 &mtr);
7077 				}
7078 
7079 				prev_page_no = page_no;
7080 
7081 				mtr_commit(&mtr);
7082 
7083 				if (extern_len == 0) {
7084 					break;
7085 				}
7086 			}
7087 		}
7088 
7089 		DBUG_EXECUTE_IF("btr_store_big_rec_extern",
7090 				error = DB_OUT_OF_FILE_SPACE;
7091 				goto func_exit;);
7092 
7093 		rec_offs_make_nth_extern(offsets, field_no);
7094 	}
7095 
7096 func_exit:
7097 	if (page_zip) {
7098 		deflateEnd(&c_stream);
7099 	}
7100 
7101 	if (heap != NULL) {
7102 		mem_heap_free(heap);
7103 	}
7104 
7105 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
7106 	/* All pointers to externally stored columns in the record
7107 	must be valid. */
7108 	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
7109 		if (!rec_offs_nth_extern(offsets, i)) {
7110 			continue;
7111 		}
7112 
7113 		field_ref = btr_rec_get_field_ref(rec, offsets, i);
7114 
7115 		/* The pointer must not be zero if the operation
7116 		succeeded. */
7117 		ut_a(0 != memcmp(field_ref, field_ref_zero,
7118 				 BTR_EXTERN_FIELD_REF_SIZE)
7119 		     || error != DB_SUCCESS);
7120 		/* The column must not be disowned by this record. */
7121 		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
7122 	}
7123 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
7124 	return(error);
7125 }
7126 
7127 /*******************************************************************//**
7128 Check the FIL_PAGE_TYPE on an uncompressed BLOB page. */
7129 static
7130 void
7131 btr_check_blob_fil_page_type(
7132 /*=========================*/
7133 	ulint		space_id,	/*!< in: space id */
7134 	ulint		page_no,	/*!< in: page number */
7135 	const page_t*	page,		/*!< in: page */
7136 	ibool		read)		/*!< in: TRUE=read, FALSE=purge */
7137 {
7138 	ulint	type = fil_page_get_type(page);
7139 
7140 	ut_a(space_id == page_get_space_id(page));
7141 	ut_a(page_no == page_get_page_no(page));
7142 
7143 	if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
7144 		ulint	flags = fil_space_get_flags(space_id);
7145 
7146 #ifndef UNIV_DEBUG /* Improve debug test coverage */
7147 		if (dict_tf_get_format(flags) == UNIV_FORMAT_A) {
7148 			/* Old versions of InnoDB did not initialize
7149 			FIL_PAGE_TYPE on BLOB pages.  Do not print
7150 			anything about the type mismatch when reading
7151 			a BLOB page that is in Antelope format.*/
7152 			return;
7153 		}
7154 #endif /* !UNIV_DEBUG */
7155 
7156 		ib::fatal() << "FIL_PAGE_TYPE=" << type
7157 			<< " on BLOB " << (read ? "read" : "purge")
7158 			<< " space " << space_id << " page " << page_no
7159 			<< " flags " << flags;
7160 	}
7161 }
7162 
7163 /*******************************************************************//**
7164 Frees the space in an externally stored field to the file space
7165 management if the field in data is owned by the externally stored field;
7166 in a rollback we have the additional condition that the field must
7167 not be inherited. */
7168 void
7169 btr_free_externally_stored_field(
7170 /*=============================*/
7171 	dict_index_t*	index,		/*!< in: index of the data, the index
7172 					tree MUST be X-latched; if the tree
7173 					height is 1, then also the root page
7174 					must be X-latched! (this is relevant
7175 					in the case this function is called
7176 					from purge where 'data' is located on
7177 					an undo log page, not an index
7178 					page) */
7179 	byte*		field_ref,	/*!< in/out: field reference */
7180 	const rec_t*	rec,		/*!< in: record containing field_ref, for
7181 					page_zip_write_blob_ptr(), or NULL */
7182 	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
7183 					or NULL */
7184 	page_zip_des_t*	page_zip,	/*!< in: compressed page corresponding
7185 					to rec, or NULL if rec == NULL */
7186 	ulint		i,		/*!< in: field number of field_ref;
7187 					ignored if rec == NULL */
7188 	bool		rollback,	/*!< in: performing rollback? */
7189 	mtr_t*		local_mtr)	/*!< in: mtr
7190 					containing the latch to data and an
7191 					X-latch to the index tree */
7192 {
7193 	page_t*		page;
7194 	const ulint	space_id	= mach_read_from_4(
7195 		field_ref + BTR_EXTERN_SPACE_ID);
7196 	const ulint	start_page	= mach_read_from_4(
7197 		field_ref + BTR_EXTERN_PAGE_NO);
7198 	ulint		page_no;
7199 	ulint		next_page_no;
7200 	mtr_t		mtr;
7201 
7202 	ut_ad(dict_index_is_clust(index));
7203 	ut_ad(mtr_memo_contains_flagged(local_mtr, dict_index_get_lock(index),
7204 					MTR_MEMO_X_LOCK
7205 					| MTR_MEMO_SX_LOCK)
7206 	      || dict_table_is_intrinsic(index->table));
7207 	ut_ad(mtr_is_page_fix(
7208 		local_mtr, field_ref, MTR_MEMO_PAGE_X_FIX, index->table));
7209 	ut_ad(!rec || rec_offs_validate(rec, index, offsets));
7210 	ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
7211 	ut_ad(local_mtr->is_named_space(
7212 		      page_get_space_id(page_align(field_ref))));
7213 
7214 	if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
7215 				  BTR_EXTERN_FIELD_REF_SIZE))) {
7216 		/* In the rollback, we may encounter a clustered index
7217 		record with some unwritten off-page columns. There is
7218 		nothing to free then. */
7219 		ut_a(rollback);
7220 		return;
7221 	}
7222 
7223 	ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN)
7224 	        & ~((BTR_EXTERN_OWNER_FLAG
7225 	             | BTR_EXTERN_INHERITED_FLAG) << 24)));
7226 	ut_ad(space_id == index->space);
7227 
7228 	const page_size_t	ext_page_size(dict_table_page_size(index->table));
7229 	const page_size_t&	rec_page_size(rec == NULL
7230 					      ? univ_page_size
7231 					      : ext_page_size);
7232 	if (rec == NULL) {
7233 		/* This is a call from row_purge_upd_exist_or_extern(). */
7234 		ut_ad(!page_zip);
7235 	}
7236 
7237 	for (;;) {
7238 #ifdef UNIV_DEBUG
7239 		buf_block_t*	rec_block;
7240 #endif /* UNIV_DEBUG */
7241 		buf_block_t*	ext_block;
7242 
7243 		mtr_start(&mtr);
7244 		mtr.set_spaces(*local_mtr);
7245 		mtr.set_log_mode(local_mtr->get_log_mode());
7246 
7247 		ut_ad(!dict_table_is_temporary(index->table)
7248 		      || local_mtr->get_log_mode() == MTR_LOG_NO_REDO);
7249 
7250 		const page_t*	p = page_align(field_ref);
7251 
7252 		const page_id_t	page_id(page_get_space_id(p),
7253 					page_get_page_no(p));
7254 
7255 #ifdef UNIV_DEBUG
7256 		rec_block =
7257 #endif /* UNIV_DEBUG */
7258 		buf_page_get(page_id, rec_page_size, RW_X_LATCH, &mtr);
7259 
7260 		buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
7261 		page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
7262 
7263 		if (/* There is no external storage data */
7264 		    page_no == FIL_NULL
7265 		    /* This field does not own the externally stored field */
7266 		    || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
7267 			& BTR_EXTERN_OWNER_FLAG)
7268 		    /* Rollback and inherited field */
7269 		    || (rollback
7270 			&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
7271 			    & BTR_EXTERN_INHERITED_FLAG))) {
7272 
7273 			/* Do not free */
7274 			mtr_commit(&mtr);
7275 
7276 			return;
7277 		}
7278 
7279 		if (page_no == start_page && dict_index_is_online_ddl(index)) {
7280 			row_log_table_blob_free(index, start_page);
7281 		}
7282 
7283 		ext_block = buf_page_get(
7284 			page_id_t(space_id, page_no), ext_page_size,
7285 			RW_X_LATCH, &mtr);
7286 
7287 		buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
7288 		page = buf_block_get_frame(ext_block);
7289 
7290 		if (ext_page_size.is_compressed()) {
7291 			/* Note that page_zip will be NULL
7292 			in row_purge_upd_exist_or_extern(). */
7293 			switch (fil_page_get_type(page)) {
7294 			case FIL_PAGE_TYPE_ZBLOB:
7295 			case FIL_PAGE_TYPE_ZBLOB2:
7296 				break;
7297 			default:
7298 				ut_error;
7299 			}
7300 			next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
7301 
7302 			btr_page_free_low(index, ext_block, ULINT_UNDEFINED,
7303 					  &mtr);
7304 
7305 			if (page_zip != NULL) {
7306 				mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
7307 						next_page_no);
7308 				mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
7309 						0);
7310 				page_zip_write_blob_ptr(page_zip, rec, index,
7311 							offsets, i, &mtr);
7312 			} else {
7313 				mlog_write_ulint(field_ref
7314 						 + BTR_EXTERN_PAGE_NO,
7315 						 next_page_no,
7316 						 MLOG_4BYTES, &mtr);
7317 				mlog_write_ulint(field_ref
7318 						 + BTR_EXTERN_LEN + 4, 0,
7319 						 MLOG_4BYTES, &mtr);
7320 			}
7321 		} else {
7322 			ut_a(!page_zip);
7323 			btr_check_blob_fil_page_type(space_id, page_no, page,
7324 						     FALSE);
7325 
7326 			next_page_no = mach_read_from_4(
7327 				page + FIL_PAGE_DATA
7328 				+ BTR_BLOB_HDR_NEXT_PAGE_NO);
7329 
7330 			btr_page_free_low(index, ext_block, ULINT_UNDEFINED,
7331 					  &mtr);
7332 
7333 			mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
7334 					 next_page_no,
7335 					 MLOG_4BYTES, &mtr);
7336 			/* Zero out the BLOB length.  If the server
7337 			crashes during the execution of this function,
7338 			trx_rollback_or_clean_all_recovered() could
7339 			dereference the half-deleted BLOB, fetching a
7340 			wrong prefix for the BLOB. */
7341 			mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
7342 					 0,
7343 					 MLOG_4BYTES, &mtr);
7344 		}
7345 
7346 		/* Commit mtr and release the BLOB block to save memory. */
7347 		btr_blob_free(index, ext_block, TRUE, &mtr);
7348 	}
7349 }
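
/* Note that each iteration above frees one BLOB page and, in the same
mini-transaction, advances BTR_EXTERN_PAGE_NO in the field reference to the
next page of the chain (and zeroes the stored length). The reference thus
always points at the first page that has not yet been freed, so if the
operation is interrupted it can simply be repeated. */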
7350 
7351 /***********************************************************//**
7352 Frees the externally stored fields for a record. */
7353 static
7354 void
7355 btr_rec_free_externally_stored_fields(
7356 /*==================================*/
7357 	dict_index_t*	index,	/*!< in: index of the data, the index
7358 				tree MUST be X-latched */
7359 	rec_t*		rec,	/*!< in/out: record */
7360 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
7361 	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
7362 				part will be updated, or NULL */
7363 	bool		rollback,/*!< in: performing rollback? */
7364 	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
7365 				an X-latch to record page and to the index
7366 				tree */
7367 {
7368 	ulint	n_fields;
7369 	ulint	i;
7370 
7371 	ut_ad(rec_offs_validate(rec, index, offsets));
7372 	ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table));
7373 	/* Free possible externally stored fields in the record */
7374 
7375 	ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
7376 	n_fields = rec_offs_n_fields(offsets);
7377 
7378 	for (i = 0; i < n_fields; i++) {
7379 		if (rec_offs_nth_extern(offsets, i)) {
7380 			btr_free_externally_stored_field(
7381 				index, btr_rec_get_field_ref(rec, offsets, i),
7382 				rec, offsets, page_zip, i, rollback, mtr);
7383 		}
7384 	}
7385 }
7386 
7387 /***********************************************************//**
7388 Frees the externally stored fields for a record, if the field is mentioned
7389 in the update vector. */
7390 static
7391 void
7392 btr_rec_free_updated_extern_fields(
7393 /*===============================*/
7394 	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
7395 				X-latched */
7396 	rec_t*		rec,	/*!< in/out: record */
7397 	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
7398 				part will be updated, or NULL */
7399 	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
7400 	const upd_t*	update,	/*!< in: update vector */
7401 	bool		rollback,/*!< in: performing rollback? */
7402 	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
7403 				an X-latch to record page and to the tree */
7404 {
7405 	ulint	n_fields;
7406 	ulint	i;
7407 
7408 	ut_ad(rec_offs_validate(rec, index, offsets));
7409 	ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table));
7410 
7411 	/* Free possible externally stored fields in the record */
7412 
7413 	n_fields = upd_get_n_fields(update);
7414 
7415 	for (i = 0; i < n_fields; i++) {
7416 		const upd_field_t* ufield = upd_get_nth_field(update, i);
7417 
7418 		if (rec_offs_nth_extern(offsets, ufield->field_no)) {
7419 			ulint	len;
7420 			byte*	data = rec_get_nth_field(
7421 				rec, offsets, ufield->field_no, &len);
7422 			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
7423 
7424 			btr_free_externally_stored_field(
7425 				index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
7426 				rec, offsets, page_zip,
7427 				ufield->field_no, rollback, mtr);
7428 		}
7429 	}
7430 }
7431 
7432 /*******************************************************************//**
7433 Copies the prefix of an uncompressed BLOB.  The clustered index record
7434 that points to this BLOB must be protected by a lock or a page latch.
7435 @return number of bytes written to buf */
7436 static
7437 ulint
7438 btr_copy_blob_prefix(
7439 /*=================*/
7440 	byte*		buf,	/*!< out: the externally stored part of
7441 				the field, or a prefix of it */
7442 	ulint		len,	/*!< in: length of buf, in bytes */
7443 	ulint		space_id,/*!< in: space id of the BLOB pages */
7444 	ulint		page_no,/*!< in: page number of the first BLOB page */
7445 	ulint		offset)	/*!< in: offset on the first BLOB page */
7446 {
7447 	ulint	copied_len	= 0;
7448 
7449 	for (;;) {
7450 		mtr_t		mtr;
7451 		buf_block_t*	block;
7452 		const page_t*	page;
7453 		const byte*	blob_header;
7454 		ulint		part_len;
7455 		ulint		copy_len;
7456 
7457 		mtr_start(&mtr);
7458 
7459 		block = buf_page_get(page_id_t(space_id, page_no),
7460 				     univ_page_size, RW_S_LATCH, &mtr);
7461 		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
7462 		page = buf_block_get_frame(block);
7463 
7464 		btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);
7465 
7466 		blob_header = page + offset;
7467 		part_len = btr_blob_get_part_len(blob_header);
7468 		copy_len = ut_min(part_len, len - copied_len);
7469 
7470 		memcpy(buf + copied_len,
7471 		       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
7472 		copied_len += copy_len;
7473 
7474 		page_no = btr_blob_get_next_page_no(blob_header);
7475 
7476 		mtr_commit(&mtr);
7477 
7478 		if (page_no == FIL_NULL || copy_len != part_len) {
7479 			UNIV_MEM_ASSERT_RW(buf, copied_len);
7480 			return(copied_len);
7481 		}
7482 
7483 		/* On BLOB pages after the first one, the BLOB header
7484 		is always at the start of the page data: */
7485 
7486 		offset = FIL_PAGE_DATA;
7487 
7488 		ut_ad(copied_len <= len);
7489 	}
7490 }
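
/* A minimal, self-contained sketch (kept out of the build by the #if 0 guard)
of the chain walk performed by btr_copy_blob_prefix() above, using an
in-memory map of page images in place of the buffer pool. The function and
constant names below are invented for this sketch; the 38-byte data offset,
the 8-byte part header and the 0xFFFFFFFF end-of-chain marker mirror
FIL_PAGE_DATA, BTR_BLOB_HDR_SIZE and FIL_NULL. */
#if 0
#include <cstring>
#include <algorithm>
#include <map>
#include <vector>

static unsigned long sketch_be32(const unsigned char* p)
{
	return ((unsigned long) p[0] << 24) | ((unsigned long) p[1] << 16)
		| ((unsigned long) p[2] << 8) | (unsigned long) p[3];
}

/* Copy up to len bytes of a BLOB into buf, starting at (page_no, offset);
pages maps a page number to its full page image. Returns the number of
bytes copied. */
static size_t sketch_copy_blob_prefix(
	unsigned char* buf, size_t len,
	const std::map<unsigned long, std::vector<unsigned char> >& pages,
	unsigned long page_no, size_t offset)
{
	const unsigned long	END_OF_CHAIN = 0xFFFFFFFFUL;	/* FIL_NULL */
	const size_t		DATA_OFFSET = 38;	/* FIL_PAGE_DATA */
	const size_t		HDR_SIZE = 8;		/* BTR_BLOB_HDR_SIZE */
	size_t			copied = 0;

	while (copied < len) {
		std::map<unsigned long,
			 std::vector<unsigned char> >::const_iterator it
			= pages.find(page_no);
		if (it == pages.end()) {
			break;
		}

		const unsigned char*	hdr = &it->second[offset];
		size_t		part_len = sketch_be32(hdr + 0);
		unsigned long	next_page_no = sketch_be32(hdr + 4);
		size_t		copy_len = std::min(part_len, len - copied);

		memcpy(buf + copied, hdr + HDR_SIZE, copy_len);
		copied += copy_len;

		if (next_page_no == END_OF_CHAIN || copy_len != part_len) {
			break;
		}

		page_no = next_page_no;
		offset = DATA_OFFSET;	/* header is at the data start on
					later pages */
	}

	return(copied);
}
#endif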
7491 
7492 /** Copies the prefix of a compressed BLOB.
7493 The clustered index record that points to this BLOB must be protected
7494 by a lock or a page latch.
7495 @param[out]	buf		the externally stored part of the field,
7496 or a prefix of it
7497 @param[in]	len		length of buf, in bytes
7498 @param[in]	page_size	compressed BLOB page size
7499 @param[in]	space_id	space id of the BLOB pages
7500 @param[in]	offset		offset on the first BLOB page
7501 @return number of bytes written to buf */
7502 static
7503 ulint
7504 btr_copy_zblob_prefix(
7505 	byte*			buf,
7506 	ulint			len,
7507 	const page_size_t&	page_size,
7508 	ulint			space_id,
7509 	ulint			page_no,
7510 	ulint			offset)
7511 {
7512 	ulint		page_type = FIL_PAGE_TYPE_ZBLOB;
7513 	mem_heap_t*	heap;
7514 	int		err;
7515 	z_stream	d_stream;
7516 
7517 	d_stream.next_out = buf;
7518 	d_stream.avail_out = static_cast<uInt>(len);
7519 	d_stream.next_in = Z_NULL;
7520 	d_stream.avail_in = 0;
7521 
7522 	/* Zlib inflate needs 32 kilobytes for the default
7523 	window size, plus a few kilobytes for small objects. */
7524 	heap = mem_heap_create(40000);
7525 	page_zip_set_alloc(&d_stream, heap);
7526 
7527 	ut_ad(page_size.is_compressed());
7528 	ut_ad(space_id);
7529 
7530 	err = inflateInit(&d_stream);
7531 	ut_a(err == Z_OK);
7532 
7533 	for (;;) {
7534 		buf_page_t*	bpage;
7535 		ulint		next_page_no;
7536 
7537 		/* There is no latch on bpage directly.  Instead,
7538 		bpage is protected by the B-tree page latch that
7539 		is being held on the clustered index record, or,
7540 		in row_merge_copy_blobs(), by an exclusive table lock. */
7541 		bpage = buf_page_get_zip(page_id_t(space_id, page_no),
7542 					 page_size);
7543 
7544 		if (UNIV_UNLIKELY(!bpage)) {
7545 			ib::error() << "Cannot load compressed BLOB "
7546 				<< page_id_t(space_id, page_no);
7547 			goto func_exit;
7548 		}
7549 
7550 		if (UNIV_UNLIKELY
7551 		    (fil_page_get_type(bpage->zip.data) != page_type)) {
7552 
7553 			ib::error() << "Unexpected type "
7554 				<< fil_page_get_type(bpage->zip.data)
7555 				<< " of compressed BLOB page "
7556 				<< page_id_t(space_id, page_no);
7557 
7558 			ut_ad(0);
7559 			goto end_of_blob;
7560 		}
7561 
7562 		next_page_no = mach_read_from_4(bpage->zip.data + offset);
7563 
7564 		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
7565 			/* When the BLOB begins at page header,
7566 			the compressed data payload does not
7567 			immediately follow the next page pointer. */
7568 			offset = FIL_PAGE_DATA;
7569 		} else {
7570 			offset += 4;
7571 		}
7572 
7573 		d_stream.next_in = bpage->zip.data + offset;
7574 		d_stream.avail_in = static_cast<uInt>(page_size.physical()
7575 						      - offset);
7576 
7577 		err = inflate(&d_stream, Z_NO_FLUSH);
7578 		switch (err) {
7579 		case Z_OK:
7580 			if (!d_stream.avail_out) {
7581 				goto end_of_blob;
7582 			}
7583 			break;
7584 		case Z_STREAM_END:
7585 			if (next_page_no == FIL_NULL) {
7586 				goto end_of_blob;
7587 			}
7588 			/* fall through */
7589 		default:
7590 inflate_error:
7591 			ib::error() << "inflate() of compressed BLOB page "
7592 				<< page_id_t(space_id, page_no)
7593 				<< " returned " << err
7594 				<< " (" << d_stream.msg << ")";
7595 
7596 		case Z_BUF_ERROR:
7597 			goto end_of_blob;
7598 		}
7599 
7600 		if (next_page_no == FIL_NULL) {
7601 			if (!d_stream.avail_in) {
7602 				ib::error()
7603 					<< "Unexpected end of compressed "
7604 					<< "BLOB page "
7605 					<< page_id_t(space_id, page_no);
7606 			} else {
7607 				err = inflate(&d_stream, Z_FINISH);
7608 				switch (err) {
7609 				case Z_STREAM_END:
7610 				case Z_BUF_ERROR:
7611 					break;
7612 				default:
7613 					goto inflate_error;
7614 				}
7615 			}
7616 
7617 end_of_blob:
7618 			buf_page_release_zip(bpage);
7619 			goto func_exit;
7620 		}
7621 
7622 		buf_page_release_zip(bpage);
7623 
7624 		/* On BLOB pages after the first one, the next-page
7625 		pointer is at the page header (FIL_PAGE_NEXT): */
7626 
7627 		page_no = next_page_no;
7628 		offset = FIL_PAGE_NEXT;
7629 		page_type = FIL_PAGE_TYPE_ZBLOB2;
7630 	}
7631 
7632 func_exit:
7633 	inflateEnd(&d_stream);
7634 	mem_heap_free(heap);
7635 	UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
7636 	return(d_stream.total_out);
7637 }
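
/* A minimal, self-contained sketch (kept out of the build by the #if 0 guard)
of the streamed zlib inflate pattern used by btr_copy_zblob_prefix() above:
the compressed data arrives in several chunks (one per BLOB page in the real
code) and is fed to inflate() with Z_NO_FLUSH until the output buffer fills
up or the stream ends. Only the plain zlib API is used; the function name and
the chunked input are invented for the sketch. */
#if 0
#include <cstring>
#include <vector>
#include "zlib.h"

/* Decompress up to out_len bytes from the given input chunks into out,
returning the number of bytes produced. */
static size_t sketch_inflate_chunks(
	unsigned char* out, size_t out_len,
	const std::vector<std::vector<unsigned char> >& chunks)
{
	z_stream	strm;
	size_t		produced = 0;

	memset(&strm, 0, sizeof(strm));	/* Z_NULL allocators: use defaults */

	if (inflateInit(&strm) != Z_OK) {
		return(0);
	}

	strm.next_out = out;
	strm.avail_out = static_cast<uInt>(out_len);

	for (size_t i = 0; i < chunks.size(); i++) {
		if (chunks[i].empty()) {
			continue;
		}

		strm.next_in = const_cast<Bytef*>(&chunks[i][0]);
		strm.avail_in = static_cast<uInt>(chunks[i].size());

		int	err = inflate(&strm, Z_NO_FLUSH);

		if (err == Z_STREAM_END || strm.avail_out == 0
		    || (err != Z_OK && err != Z_BUF_ERROR)) {
			break;
		}
	}

	produced = strm.total_out;
	inflateEnd(&strm);
	return(produced);
}
#endif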
7638 
7639 /** Copies the prefix of an externally stored field of a record.
7640 The clustered index record that points to this BLOB must be protected
7641 by a lock or a page latch.
7642 @param[out]	buf		the externally stored part of the
7643 field, or a prefix of it
7644 @param[in]	len		length of buf, in bytes
7645 @param[in]	page_size	BLOB page size
7646 @param[in]	space_id	space id of the first BLOB page
7647 @param[in]	page_no		page number of the first BLOB page
7648 @param[in]	offset		offset on the first BLOB page
7649 @return number of bytes written to buf */
7650 static
7651 ulint
7652 btr_copy_externally_stored_field_prefix_low(
7653 	byte*			buf,
7654 	ulint			len,
7655 	const page_size_t&	page_size,
7656 	ulint			space_id,
7657 	ulint			page_no,
7658 	ulint			offset)
7659 {
7660 	if (len == 0) {
7661 		return(0);
7662 	}
7663 
7664 	if (page_size.is_compressed()) {
7665 		return(btr_copy_zblob_prefix(buf, len, page_size,
7666 					     space_id, page_no, offset));
7667 	} else {
7668 		ut_ad(page_size.equals_to(univ_page_size));
7669 		return(btr_copy_blob_prefix(buf, len, space_id,
7670 					    page_no, offset));
7671 	}
7672 }
7673 
7674 /** Copies the prefix of an externally stored field of a record.
7675 The clustered index record must be protected by a lock or a page latch.
7676 @param[out]	buf		the field, or a prefix of it
7677 @param[in]	len		length of buf, in bytes
7678 @param[in]	page_size	BLOB page size
7679 @param[in]	data		'internally' stored part of the field
7680 containing also the reference to the external part; must be protected by
7681 a lock or a page latch
7682 @param[in]	local_len	length of data, in bytes
7683 @return the length of the copied field, or 0 if the column was being
7684 or has been deleted */
7685 ulint
7686 btr_copy_externally_stored_field_prefix(
7687 	byte*			buf,
7688 	ulint			len,
7689 	const page_size_t&	page_size,
7690 	const byte*		data,
7691 	ulint			local_len)
7692 {
7693 	ulint	space_id;
7694 	ulint	page_no;
7695 	ulint	offset;
7696 
7697 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
7698 
7699 	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
7700 
7701 	if (UNIV_UNLIKELY(local_len >= len)) {
7702 		memcpy(buf, data, len);
7703 		return(len);
7704 	}
7705 
7706 	memcpy(buf, data, local_len);
7707 	data += local_len;
7708 
7709 	ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
7710 
7711 	if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
7712 		/* The externally stored part of the column has been
7713 		(partially) deleted.  Signal the half-deleted BLOB
7714 		to the caller. */
7715 
7716 		return(0);
7717 	}
7718 
7719 	space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
7720 
7721 	page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
7722 
7723 	offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
7724 
7725 	return(local_len
7726 	       + btr_copy_externally_stored_field_prefix_low(buf + local_len,
7727 							     len - local_len,
7728 							     page_size,
7729 							     space_id, page_no,
7730 							     offset));
7731 }
7732 
7733 /** Copies an externally stored field of a record to mem heap.
7734 The clustered index record must be protected by a lock or a page latch.
7735 @param[out]	len		length of the whole field
7736 @param[in]	data		'internally' stored part of the field
7737 containing also the reference to the external part; must be protected by
7738 a lock or a page latch
7739 @param[in]	page_size	BLOB page size
7740 @param[in]	local_len	length of data
7741 @param[in,out]	heap		mem heap
7742 @return the whole field copied to heap */
7743 byte*
7744 btr_copy_externally_stored_field(
7745 	ulint*			len,
7746 	const byte*		data,
7747 	const page_size_t&	page_size,
7748 	ulint			local_len,
7749 	mem_heap_t*		heap)
7750 {
7751 	ulint	space_id;
7752 	ulint	page_no;
7753 	ulint	offset;
7754 	ulint	extern_len;
7755 	byte*	buf;
7756 
7757 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
7758 
7759 	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
7760 
7761 	space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);
7762 
7763 	page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);
7764 
7765 	offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);
7766 
7767 	/* Currently a BLOB cannot be bigger than 4 GB; we
7768 	leave the 4 upper bytes in the length field unused */
7769 
7770 	extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
7771 
7772 	buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);
7773 
7774 	memcpy(buf, data, local_len);
7775 	*len = local_len
7776 		+ btr_copy_externally_stored_field_prefix_low(buf + local_len,
7777 							      extern_len,
7778 							      page_size,
7779 							      space_id,
7780 							      page_no, offset);
7781 
7782 	return(buf);
7783 }
7784 
7785 /** Copies an externally stored field of a record to mem heap.
7786 @param[in]	rec		record in a clustered index; must be
7787 protected by a lock or a page latch
7788 @param[in]	offsets		array returned by rec_get_offsets()
7789 @param[in]	page_size	BLOB page size
7790 @param[in]	no		field number
7791 @param[out]	len		length of the field
7792 @param[in,out]	heap		mem heap
7793 @return the field copied to heap, or NULL if the field is incomplete */
7794 byte*
7795 btr_rec_copy_externally_stored_field(
7796 	const rec_t*		rec,
7797 	const ulint*		offsets,
7798 	const page_size_t&	page_size,
7799 	ulint			no,
7800 	ulint*			len,
7801 	mem_heap_t*		heap)
7802 {
7803 	ulint		local_len;
7804 	const byte*	data;
7805 
7806 	ut_a(rec_offs_nth_extern(offsets, no));
7807 
7808 	/* An externally stored field can contain some initial
7809 	data from the field, and in the last 20 bytes it has the
7810 	space id, page number, and offset where the rest of the
7811 	field data is stored, and the data length in addition to
7812 	the data stored locally. We may need to store some data
7813 	locally to get the local record length above the 128 byte
7814 	limit so that field offsets are stored in two bytes, and
7815 	the extern bit is available in those two bytes. */
7816 
7817 	data = rec_get_nth_field(rec, offsets, no, &local_len);
7818 
7819 	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
7820 
7821 	if (UNIV_UNLIKELY
7822 	    (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
7823 		     field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
7824 		/* The externally stored field was not written yet.
7825 		This record should only be seen by
7826 		recv_recovery_rollback_active() or any
7827 		TRX_ISO_READ_UNCOMMITTED transactions. */
7828 		return(NULL);
7829 	}
7830 
7831 	return(btr_copy_externally_stored_field(len, data,
7832 						page_size, local_len, heap));
7833 }
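
/* A minimal, self-contained sketch (kept out of the build by the #if 0 guard)
of decoding the 20-byte external field reference described above, assuming
the usual layout: 4-byte space id, 4-byte page number, 4-byte offset of the
BLOB header on the first page, and an 8-byte length whose most significant
byte carries the ownership/inheritance flags and whose upper four bytes are
otherwise unused (a BLOB cannot exceed 4GB). The struct and function names
are invented; the byte offsets mirror BTR_EXTERN_SPACE_ID, BTR_EXTERN_PAGE_NO,
BTR_EXTERN_OFFSET and BTR_EXTERN_LEN. */
#if 0
struct sketch_extern_ref {
	unsigned long	space_id;
	unsigned long	page_no;
	unsigned long	offset;		/* offset of the BLOB header on the
					first BLOB page */
	unsigned long	len;		/* externally stored length, low
					4 bytes */
	bool		owned;		/* false if BTR_EXTERN_OWNER_FLAG is
					set, i.e. the field was disowned */
	bool		inherited;	/* BTR_EXTERN_INHERITED_FLAG */
};

static unsigned long sketch_ref_be32(const unsigned char* p)
{
	return ((unsigned long) p[0] << 24) | ((unsigned long) p[1] << 16)
		| ((unsigned long) p[2] << 8) | (unsigned long) p[3];
}

static sketch_extern_ref sketch_decode_extern_ref(const unsigned char* ref)
{
	sketch_extern_ref	r;

	r.space_id = sketch_ref_be32(ref + 0);	/* BTR_EXTERN_SPACE_ID */
	r.page_no = sketch_ref_be32(ref + 4);	/* BTR_EXTERN_PAGE_NO */
	r.offset = sketch_ref_be32(ref + 8);	/* BTR_EXTERN_OFFSET */
	r.owned = !(ref[12] & 0x80);		/* BTR_EXTERN_OWNER_FLAG */
	r.inherited = (ref[12] & 0x40) != 0;	/* BTR_EXTERN_INHERITED_FLAG */
	r.len = sketch_ref_be32(ref + 16);	/* BTR_EXTERN_LEN + 4 */
	return(r);
}
#endif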
7834 #endif /* !UNIV_HOTBACKUP */
7835