1 /*****************************************************************************
2
3 Copyright (c) 1994, 2021, Oracle and/or its affiliates.
4 Copyright (c) 2008, Google Inc.
5 Copyright (c) 2012, Facebook Inc.
6
7 Portions of this file contain modifications contributed and copyrighted by
8 Google, Inc. Those modifications are gratefully acknowledged and are described
9 briefly in the InnoDB documentation. The contributions by Google are
10 incorporated with their permission, and subject to the conditions contained in
11 the file COPYING.Google.
12
13 This program is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License, version 2.0,
15 as published by the Free Software Foundation.
16
17 This program is also distributed with certain software (including
18 but not limited to OpenSSL) that is licensed under separate terms,
19 as designated in a particular file or component or in included license
20 documentation. The authors of MySQL hereby grant you an additional
21 permission to link the program and your derivative works with the
22 separately licensed software that they have included with MySQL.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License, version 2.0, for more details.
28
29 You should have received a copy of the GNU General Public License along with
30 this program; if not, write to the Free Software Foundation, Inc.,
31 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
32
33 *****************************************************************************/
34
35 /**************************************************//**
36 @file btr/btr0cur.cc
37 The index tree cursor
38
39 All changes that row operations make to a B-tree or the records
40 there must go through this module! Undo log records are written here
41 of every modify or insert of a clustered index record.
42
43 NOTE!!!
44 To make sure we do not run out of disk space during a pessimistic
45 insert or update, we have to reserve 2 x the height of the index tree
46 many pages in the tablespace before we start the operation, because
47 if leaf splitting has been started, it is difficult to undo, except
48 by crashing the database and doing a roll-forward.
49
50 Created 10/16/1994 Heikki Tuuri
51 *******************************************************/
52
53 #include "btr0cur.h"
54
55 #ifdef UNIV_NONINL
56 #include "btr0cur.ic"
57 #endif
58
59 #include "row0upd.h"
60 #ifndef UNIV_HOTBACKUP
61 #include "mtr0log.h"
62 #include "page0page.h"
63 #include "page0zip.h"
64 #include "rem0rec.h"
65 #include "rem0cmp.h"
66 #include "buf0lru.h"
67 #include "btr0btr.h"
68 #include "btr0sea.h"
69 #include "row0log.h"
70 #include "row0purge.h"
71 #include "row0upd.h"
72 #include "trx0rec.h"
73 #include "trx0roll.h"
74 #include "que0que.h"
75 #include "row0row.h"
76 #include "srv0srv.h"
77 #include "ibuf0ibuf.h"
78 #include "lock0lock.h"
79 #include "zlib.h"
80 #include "srv0start.h"
81
/** Buffered B-tree operation types, introduced as part of delete buffering.
These describe what a buffered (change-buffer) operation intends to do when
the leaf page is not in the buffer pool. */
enum btr_op_t {
	BTR_NO_OP = 0,			/*!< Not buffered */
	BTR_INSERT_OP,			/*!< Insert, do not ignore UNIQUE */
	BTR_INSERT_IGNORE_UNIQUE_OP,	/*!< Insert, ignoring UNIQUE */
	BTR_DELETE_OP,			/*!< Purge a delete-marked record */
	BTR_DELMARK_OP			/*!< Mark a record for deletion */
};
90
/** Modification types for the B-tree operation.
The relative order of the enumerators matters: callers compare intentions
with <= / >= (see btr_cur_will_modify_tree()), relying on
BTR_INTENTION_DELETE < BTR_INTENTION_BOTH < BTR_INTENTION_INSERT. */
enum btr_intention_t {
	BTR_INTENTION_DELETE,	/*!< may delete records */
	BTR_INTENTION_BOTH,	/*!< may both insert and delete */
	BTR_INTENTION_INSERT	/*!< may insert records */
};
/* NOTE(review): the preprocessor replaces non-macro identifiers (such as
enumerators) with 0 inside #if, so both checks below compare 0 > 0 and can
never fire; they document intent rather than enforce it — TODO confirm. */
#if BTR_INTENTION_DELETE > BTR_INTENTION_BOTH
#error "BTR_INTENTION_DELETE > BTR_INTENTION_BOTH"
#endif
#if BTR_INTENTION_BOTH > BTR_INTENTION_INSERT
#error "BTR_INTENTION_BOTH > BTR_INTENTION_INSERT"
#endif
103
/** For the index->lock scalability improvement, the only possibility of a
clear performance regression observed was caused by a hugely grown history
list length. That is because the exclusive use of index->lock also worked
as reserving free blocks and read IO bandwidth with priority. To avoid a
huge growing history list, as with the previous implementation, pessimistic
tree operations by purge are prioritized as before, when the list seems to
be growing huge.

Experimentally, the history list length starts to clearly affect performance
throughput from about 100000. */
#define BTR_CUR_FINE_HISTORY_LENGTH	100000

/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
ulint	btr_cur_n_non_sea	= 0;
/** Number of successful adaptive hash index lookups in
btr_cur_search_to_nth_level(). */
ulint	btr_cur_n_sea		= 0;
/** Old value of btr_cur_n_non_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
ulint	btr_cur_n_non_sea_old	= 0;
/** Old value of btr_cur_n_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
ulint	btr_cur_n_sea_old	= 0;

#ifdef UNIV_DEBUG
/* Flag to limit optimistic insert records (debug builds only) */
uint	btr_cur_limit_optimistic_insert_debug = 0;
#endif /* UNIV_DEBUG */

/** In the optimistic insert, if the insert does not fit, but this much space
can be released by page reorganize, then it is reorganized */
#define BTR_CUR_PAGE_REORGANIZE_LIMIT	(UNIV_PAGE_SIZE / 32)

/** The structure of a BLOB part header */
/* @{ */
/*--------------------------------------*/
#define BTR_BLOB_HDR_PART_LEN		0	/*!< BLOB part len on this
						page */
#define BTR_BLOB_HDR_NEXT_PAGE_NO	4	/*!< next BLOB part page no,
						FIL_NULL if none */
/*--------------------------------------*/
#define BTR_BLOB_HDR_SIZE		8	/*!< Size of a BLOB
						part header, in bytes */

/** Estimated table level stats from sampled value.
@param value		sampled stats
@param index		index being sampled
@param sample		number of sampled rows
@param ext_size		external stored data size
@param not_empty	table not empty
@return estimated table wide stats from sampled value */
#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
	(((value) * static_cast<int64_t>(index->stat_n_leaf_pages) \
	  + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))

/* @} */
#endif /* !UNIV_HOTBACKUP */
162
#ifndef UNIV_HOTBACKUP
/*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a not delete
marked record always owns all its extern fields. */
static
void
btr_cur_unmark_extern_fields(
/*=========================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	mtr_t*		mtr);	/*!< in: mtr, or NULL if not logged */
/*******************************************************************//**
Adds path information to the cursor for the current page, for which
the binary search has been performed. */
static
void
btr_cur_add_path_info(
/*==================*/
	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
	ulint		height,		/*!< in: height of the page in tree;
					0 means leaf node */
	ulint		root_height);	/*!< in: root node height in tree */
/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
				X-latched */
	rec_t*		rec,	/*!< in: record */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,	/*!< in: update vector */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the tree */
/***********************************************************//**
Frees the externally stored fields for a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
	dict_index_t*	index,	/*!< in: index of the data, the index
				tree MUST be X-latched */
	rec_t*		rec,	/*!< in: record */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the index
				tree */
#endif /* !UNIV_HOTBACKUP */
223
224 #ifndef UNIV_HOTBACKUP
225 /*==================== B-TREE SEARCH =========================*/
226
227 #if MTR_MEMO_PAGE_S_FIX != RW_S_LATCH
228 #error "MTR_MEMO_PAGE_S_FIX != RW_S_LATCH"
229 #endif
230 #if MTR_MEMO_PAGE_X_FIX != RW_X_LATCH
231 #error "MTR_MEMO_PAGE_X_FIX != RW_X_LATCH"
232 #endif
233 #if MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH
234 #error "MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH"
235 #endif
236
/** Latches the leaf page or pages requested.
For tree-modifying modes, siblings are latched from left to right so that
the fixed left-to-right latching order is never violated. The savepoints
stored in the result (and, for R-trees, in cursor->rtr_info) allow the
caller to release exactly the latches taken here.
@param[in]	block		leaf page where the search converged
@param[in]	page_id		page id of the leaf
@param[in]	page_size	page size of the index
@param[in]	latch_mode	BTR_SEARCH_LEAF, ...
@param[in]	cursor		cursor
@param[in]	mtr		mini-transaction
@return blocks and savepoints which actually latched. */
btr_latch_leaves_t
btr_cur_latch_leaves(
	buf_block_t*		block,
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	ulint			latch_mode,
	btr_cur_t*		cursor,
	mtr_t*			mtr)
{
	ulint		mode;
	ulint		left_page_no;
	ulint		right_page_no;
	buf_block_t*	get_block;
	page_t*		page = buf_block_get_frame(block);
	bool		spatial;
	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};

	/* R-tree searches additionally record savepoints/blocks in
	rtr_info so they can be released during rtree traversal. */
	spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info;
	ut_ad(buf_page_in_file(&block->page));

	switch (latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_MODIFY_LEAF:
	case BTR_SEARCH_TREE:
		/* Only the target leaf itself is latched: S for searches,
		X for leaf modification. */
		if (spatial) {
			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS]
				= mtr_set_savepoint(mtr);
		}

		mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH;
		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
		get_block = btr_block_get(page_id, page_size, mode,
					  cursor->index, mtr);
		latch_leaves.blocks[1] = get_block;
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		if (spatial) {
			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
				= get_block;
		}

		return(latch_leaves);
	case BTR_MODIFY_TREE:
		/* It is exclusive for other operations which calls
		btr_page_set_prev() */
		ut_ad(mtr_memo_contains_flagged(mtr,
			dict_index_get_lock(cursor->index),
			MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
		      || dict_table_is_intrinsic(cursor->index->table));
		/* x-latch also siblings from left to right */
		left_page_no = btr_page_get_prev(page, mtr);

		if (left_page_no != FIL_NULL) {

			if (spatial) {
				cursor->rtr_info->tree_savepoints[
					RTR_MAX_LEVELS] = mtr_set_savepoint(mtr);
			}

			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
			get_block = btr_block_get(
				page_id_t(page_id.space(), left_page_no),
				page_size, RW_X_LATCH, cursor->index, mtr);
			latch_leaves.blocks[0] = get_block;

			if (spatial) {
				cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
					= get_block;
			}
		}

		/* Then the target page itself. */
		if (spatial) {
			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS + 1]
				= mtr_set_savepoint(mtr);
		}

		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
		get_block = btr_block_get(
			page_id, page_size, RW_X_LATCH, cursor->index, mtr);
		latch_leaves.blocks[1] = get_block;

#ifdef UNIV_BTR_DEBUG
		/* Sanity check only after both the blocks are latched. */
		if (latch_leaves.blocks[0] != NULL) {
			ut_a(page_is_comp(latch_leaves.blocks[0]->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_next(
				latch_leaves.blocks[0]->frame, mtr)
			     == page_get_page_no(page));
		}
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */

		if (spatial) {
			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS + 1]
				= get_block;
		}

		/* Finally the right sibling, if any. */
		right_page_no = btr_page_get_next(page, mtr);

		if (right_page_no != FIL_NULL) {
			if (spatial) {
				cursor->rtr_info->tree_savepoints[
					RTR_MAX_LEVELS + 2] = mtr_set_savepoint(
								mtr);
			}
			latch_leaves.savepoints[2] = mtr_set_savepoint(mtr);
			get_block = btr_block_get(
				page_id_t(page_id.space(), right_page_no),
				page_size, RW_X_LATCH, cursor->index, mtr);
			latch_leaves.blocks[2] = get_block;
#ifdef UNIV_BTR_DEBUG
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_prev(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
			if (spatial) {
				cursor->rtr_info->tree_blocks[
					RTR_MAX_LEVELS + 2] = get_block;
			}
		}

		return(latch_leaves);

	case BTR_SEARCH_PREV:
	case BTR_MODIFY_PREV:
		mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
		/* latch also left sibling */
		/* The block lock protects reading the FIL_PAGE_PREV field
		while the page itself is not yet latched in this mtr. */
		rw_lock_s_lock(&block->lock);
		left_page_no = btr_page_get_prev(page, mtr);
		rw_lock_s_unlock(&block->lock);

		if (left_page_no != FIL_NULL) {
			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
			get_block = btr_block_get(
				page_id_t(page_id.space(), left_page_no),
				page_size, mode, cursor->index, mtr);
			latch_leaves.blocks[0] = get_block;
			cursor->left_block = get_block;
#ifdef UNIV_BTR_DEBUG
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_next(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
		}

		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
		get_block = btr_block_get(page_id, page_size, mode,
					  cursor->index, mtr);
		latch_leaves.blocks[1] = get_block;
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		return(latch_leaves);
	case BTR_CONT_MODIFY_TREE:
		/* Used by R-tree only; the needed pages are assumed to be
		latched already, so nothing is done here. */
		ut_ad(dict_index_is_spatial(cursor->index));
		return(latch_leaves);
	}

	ut_error;
	return(latch_leaves);
}
409
/** Optimistically latches the leaf page or pages requested.
The guessed block must already be buffer-fixed by the caller; the latch is
only taken if the page has not been modified since the recorded
modify_clock value (checked by buf_page_optimistic_get()).
@param[in]	block		guessed buffer block
@param[in]	modify_clock	modify clock value
@param[in,out]	latch_mode	BTR_SEARCH_LEAF, ...; on success of a
				_PREV mode, replaced with the plain
				RW_S_LATCH/RW_X_LATCH actually taken
@param[in,out]	cursor		cursor
@param[in]	file		file name
@param[in]	line		line where called
@param[in]	mtr		mini-transaction
@return true if success */
bool
btr_cur_optimistic_latch_leaves(
	buf_block_t*	block,
	ib_uint64_t	modify_clock,
	ulint*		latch_mode,
	btr_cur_t*	cursor,
	const char*	file,
	ulint		line,
	mtr_t*		mtr)
{
	ulint		mode;
	ulint		left_page_no;

	ut_ad(block->page.buf_fix_count > 0);
	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

	switch (*latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_MODIFY_LEAF:
		/* Single leaf: delegate the modify-clock check and
		latching to buf_page_optimistic_get(). */
		return(buf_page_optimistic_get(*latch_mode, block,
					       modify_clock, file, line, mtr));
	case BTR_SEARCH_PREV:
	case BTR_MODIFY_PREV:
		mode = *latch_mode == BTR_SEARCH_PREV
			? RW_S_LATCH : RW_X_LATCH;

		/* Read FIL_PAGE_PREV under the block lock, verifying the
		modify clock first so the prev pointer is not stale. */
		rw_lock_s_lock(&block->lock);
		if (block->modify_clock != modify_clock) {
			rw_lock_s_unlock(&block->lock);

			return(false);
		}
		left_page_no = btr_page_get_prev(
			buf_block_get_frame(block), mtr);
		rw_lock_s_unlock(&block->lock);

		/* Latch the left sibling first to keep the left-to-right
		latching order. */
		if (left_page_no != FIL_NULL) {
			const page_id_t	page_id(
				dict_index_get_space(cursor->index),
				left_page_no);

			cursor->left_block = btr_block_get(
				page_id,
				dict_table_page_size(cursor->index->table),
				mode, cursor->index, mtr);
		} else {
			cursor->left_block = NULL;
		}

		if (buf_page_optimistic_get(mode, block, modify_clock,
					    file, line, mtr)) {
			if (btr_page_get_prev(buf_block_get_frame(block), mtr)
			    == left_page_no) {
				/* We've entered this function with the block already buffer-fixed,
				and buf_page_optimistic_get() buffer-fixes it again. The caller should
				unfix the block once (to undo their buffer-fixing). */
				ut_ad(2 <= block->page.buf_fix_count);
				*latch_mode = mode;
				return(true);
			} else {
				/* The prev pointer changed between the two
				reads: the guess is no longer valid. */
				/* release the block, which will also decrement the buf_fix_count once
				undoing the increment in successful buf_page_optimistic_get() */
				btr_leaf_page_release(block, mode, mtr);
			}
		}

		/* If we are still here then buf_page_optimistic_get() did not buffer-fix
		the page, but it should still be buffer-fixed as it was before the call.*/
		ut_ad(0 < block->page.buf_fix_count);
		/* release the left block */
		if (cursor->left_block != NULL) {
			btr_leaf_page_release(cursor->left_block,
					      mode, mtr);
		}

		return(false);

	default:
		ut_error;
		return(false);
	}
}
500
501 /**
502 Gets intention in btr_intention_t from latch_mode, and cleares the intention
503 at the latch_mode.
504 @param latch_mode in/out: pointer to latch_mode
505 @return intention for latching tree */
506 static
507 btr_intention_t
btr_cur_get_and_clear_intention(ulint * latch_mode)508 btr_cur_get_and_clear_intention(
509 ulint *latch_mode)
510 {
511 btr_intention_t intention;
512
513 switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) {
514 case BTR_LATCH_FOR_INSERT:
515 intention = BTR_INTENTION_INSERT;
516 break;
517 case BTR_LATCH_FOR_DELETE:
518 intention = BTR_INTENTION_DELETE;
519 break;
520 default:
521 /* both or unknown */
522 intention = BTR_INTENTION_BOTH;
523 }
524 *latch_mode &= ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE);
525
526 return(intention);
527 }
528
529 /**
530 Gets the desired latch type for the root leaf (root page is root leaf)
531 at the latch mode.
532 @param latch_mode in: BTR_SEARCH_LEAF, ...
533 @return latch type */
534 static
535 rw_lock_type_t
btr_cur_latch_for_root_leaf(ulint latch_mode)536 btr_cur_latch_for_root_leaf(
537 ulint latch_mode)
538 {
539 switch (latch_mode) {
540 case BTR_SEARCH_LEAF:
541 case BTR_SEARCH_TREE:
542 case BTR_SEARCH_PREV:
543 return(RW_S_LATCH);
544 case BTR_MODIFY_LEAF:
545 case BTR_MODIFY_TREE:
546 case BTR_MODIFY_PREV:
547 return(RW_X_LATCH);
548 case BTR_CONT_MODIFY_TREE:
549 case BTR_CONT_SEARCH_TREE:
550 /* A root page should be latched already,
551 and don't need to be latched here.
552 fall through (RW_NO_LATCH) */
553 case BTR_NO_LATCHES:
554 return(RW_NO_LATCH);
555 }
556
557 ut_error;
558 return(RW_NO_LATCH); /* avoid compiler warnings */
559 }
560
/** Detects whether the modifying record might need a modifying tree structure.
This is a heuristic: it answers "could this operation on the given node
pointer record cause a page split/merge that modifies the tree structure?",
erring on the side of true.
@param[in]	index		index
@param[in]	page		page (non-leaf)
@param[in]	lock_intention	lock intention for the tree operation
@param[in]	rec		record (current node_ptr)
@param[in]	rec_size	size of the record or max size of node_ptr
@param[in]	page_size	page size
@param[in]	mtr		mtr
@return true if tree modification is needed */
static
bool
btr_cur_will_modify_tree(
	dict_index_t*	index,
	const page_t*	page,
	btr_intention_t	lock_intention,
	const rec_t*	rec,
	ulint		rec_size,
	const page_size_t&	page_size,
	mtr_t*		mtr)
{
	ut_ad(!page_is_leaf(page));
	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
					MTR_MEMO_X_LOCK
					| MTR_MEMO_SX_LOCK)
	      || dict_table_is_intrinsic(index->table));

	/* Pessimistic delete of the first record causes delete & insert
	of node_ptr at upper level. And a subsequent page shrink is
	possible. It causes delete of node_ptr at the upper level.
	So we should pay attention also to 2nd record not only
	first record and last record. Because if the "delete & insert" are
	done for the different page, the 2nd record become
	first record and following compress might delete the record and causes
	the upper level node_ptr modification. */

	if (lock_intention <= BTR_INTENTION_BOTH) {
		/* BTR_INTENTION_DELETE or BTR_INTENTION_BOTH:
		estimate whether a delete could trigger a merge. */
		ulint	margin;

		if (lock_intention == BTR_INTENTION_BOTH) {
			ulint	level = btr_page_get_level(page, mtr);

			/* This value is the worst expectation for the node_ptr
			records to be deleted from this page. It is used to
			expect whether the cursor position can be the left_most
			record in this page or not. */
			ulint	max_nodes_deleted = 0;

			/* By modifying tree operations from the under of this
			level, logically (2 ^ (level - 1)) opportunities to
			deleting records in maximum even unreally rare case. */
			if (level > 7) {
				/* TODO: adjust this practical limit. */
				max_nodes_deleted = 64;
			} else if (level > 0) {
				max_nodes_deleted = (ulint)1 << (level - 1);
			}

			/* check delete will cause. (BTR_INTENTION_BOTH
			or BTR_INTENTION_DELETE) */
			if (page_get_n_recs(page) <= max_nodes_deleted * 2
			    || page_rec_is_first(rec, page)) {
				/* The cursor record can be the left most record
				in this page. */
				return(true);
			}

			/* Near the left edge of a page that has a left
			sibling: deletions could make rec the first record. */
			if (fil_page_get_prev(page) != FIL_NULL
			    && page_rec_distance_is_at_most(
					page_get_infimum_rec(page), rec,
					max_nodes_deleted)) {
				return (true);
			}

			/* Symmetrically for the right edge. */
			if (fil_page_get_next(page) != FIL_NULL
			    && page_rec_distance_is_at_most(
					rec, page_get_supremum_rec(page),
					max_nodes_deleted)) {
				return (true);
			}

			/* Delete at leftmost record in a page causes delete
			& insert at its parent page. After that, the delete
			might cause btr_compress() and delete record at its
			parent page. Thus we should consider max deletes. */

			margin = rec_size * max_nodes_deleted;
		} else {
			ut_ad(lock_intention == BTR_INTENTION_DELETE);

			margin = rec_size;
		}
		/* Safe because we already have SX latch of the index tree */
		if (page_get_data_size(page)
		    < margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)
		    || (fil_page_get_next(page) == FIL_NULL
			&& fil_page_get_prev(page) == FIL_NULL)) {
			return(true);
		}
	}

	if (lock_intention >= BTR_INTENTION_BOTH) {
		/* check insert will cause. BTR_INTENTION_BOTH
		or BTR_INTENTION_INSERT*/

		/* Once we invoke the btr_cur_limit_optimistic_insert_debug,
		we should check it here in advance, since the max allowable
		records in a page is limited. */
		LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
					      return(true));

		/* needs 2 records' space for the case the single split and
		insert cannot fit.
		page_get_max_insert_size_after_reorganize() includes space
		for page directory already */
		ulint	max_size
			= page_get_max_insert_size_after_reorganize(page, 2);

		if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size
		    || max_size < rec_size * 2) {
			return(true);
		}
		/* TODO: optimize this condition for compressed page.
		this is based on the worst compress rate.
		currently looking only uncompressed page, but we can look
		also compressed page page_zip_available() if already in the
		buffer pool */
		/* needs 2 records' space also for worst compress rate. */
		if (page_size.is_compressed()
		    && page_zip_empty_size(index->n_fields,
					   page_size.physical())
		       < rec_size * 2 + page_get_data_size(page)
			 + page_dir_calc_reserved_space(
				page_get_n_recs(page) + 2) + 1) {
			return(true);
		}
	}

	return(false);
}
700
701 /** Detects whether the modifying record might need a opposite modification
702 to the intention.
703 @param[in] page page
704 @param[in] lock_intention lock intention for the tree operation
705 @param[in] rec record (current node_ptr)
706 @return true if tree modification is needed */
707 static
708 bool
btr_cur_need_opposite_intention(const page_t * page,btr_intention_t lock_intention,const rec_t * rec)709 btr_cur_need_opposite_intention(
710 const page_t* page,
711 btr_intention_t lock_intention,
712 const rec_t* rec)
713 {
714 switch (lock_intention) {
715 case BTR_INTENTION_DELETE:
716 return((mach_read_from_4(page + FIL_PAGE_PREV) != FIL_NULL
717 && page_rec_is_first(rec, page))
718 || (mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
719 && page_rec_is_last(rec, page)));
720 case BTR_INTENTION_INSERT:
721 return(mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
722 && page_rec_is_last(rec, page));
723 case BTR_INTENTION_BOTH:
724 return(false);
725 }
726
727 ut_error;
728 return(false);
729 }
730
731 /********************************************************************//**
732 Searches an index tree and positions a tree cursor on a given level.
733 NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
734 to node pointer page number fields on the upper levels of the tree!
735 Note that if mode is PAGE_CUR_LE, which is used in inserts, then
736 cursor->up_match and cursor->low_match both will have sensible values.
737 If mode is PAGE_CUR_GE, then up_match will a have a sensible value.
738
739 If mode is PAGE_CUR_LE , cursor is left at the place where an insert of the
740 search tuple should be performed in the B-tree. InnoDB does an insert
741 immediately after the cursor. Thus, the cursor may end up on a user record,
742 or on a page infimum record. */
743 void
btr_cur_search_to_nth_level(dict_index_t * index,ulint level,const dtuple_t * tuple,page_cur_mode_t mode,ulint latch_mode,btr_cur_t * cursor,ulint has_search_latch,const char * file,ulint line,mtr_t * mtr)744 btr_cur_search_to_nth_level(
745 /*========================*/
746 dict_index_t* index, /*!< in: index */
747 ulint level, /*!< in: the tree level of search */
748 const dtuple_t* tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
749 tuple must be set so that it cannot get
750 compared to the node ptr page number field! */
751 page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
752 Inserts should always be made using
753 PAGE_CUR_LE to search the position! */
754 ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
755 at most one of BTR_INSERT, BTR_DELETE_MARK,
756 BTR_DELETE, or BTR_ESTIMATE;
757 cursor->left_block is used to store a pointer
758 to the left neighbor page, in the cases
759 BTR_SEARCH_PREV and BTR_MODIFY_PREV;
760 NOTE that if has_search_latch
761 is != 0, we maybe do not have a latch set
762 on the cursor page, we assume
763 the caller uses his search latch
764 to protect the record! */
765 btr_cur_t* cursor, /*!< in/out: tree cursor; the cursor page is
766 s- or x-latched, but see also above! */
767 ulint has_search_latch,
768 /*!< in: info on the latch mode the
769 caller currently has on search system:
770 RW_S_LATCH, or 0 */
771 const char* file, /*!< in: file name */
772 ulint line, /*!< in: line where called */
773 mtr_t* mtr) /*!< in: mtr */
774 {
775 page_t* page = NULL; /* remove warning */
776 buf_block_t* block;
777 ulint height;
778 ulint up_match;
779 ulint up_bytes;
780 ulint low_match;
781 ulint low_bytes;
782 ulint savepoint;
783 ulint rw_latch;
784 page_cur_mode_t page_mode;
785 page_cur_mode_t search_mode = PAGE_CUR_UNSUPP;
786 ulint buf_mode;
787 ulint estimate;
788 ulint node_ptr_max_size = UNIV_PAGE_SIZE / 2;
789 page_cur_t* page_cursor;
790 btr_op_t btr_op;
791 ulint root_height = 0; /* remove warning */
792
793 ulint upper_rw_latch, root_leaf_rw_latch;
794 btr_intention_t lock_intention;
795 bool modify_external;
796 buf_block_t* tree_blocks[BTR_MAX_LEVELS];
797 ulint tree_savepoints[BTR_MAX_LEVELS];
798 ulint n_blocks = 0;
799 ulint n_releases = 0;
800 bool detected_same_key_root = false;
801
802 bool retrying_for_search_prev = false;
803 ulint leftmost_from_level = 0;
804 buf_block_t** prev_tree_blocks = NULL;
805 ulint* prev_tree_savepoints = NULL;
806 ulint prev_n_blocks = 0;
807 ulint prev_n_releases = 0;
808 bool need_path = true;
809 bool rtree_parent_modified = false;
810 bool mbr_adj = false;
811 bool found = false;
812
813 DBUG_ENTER("btr_cur_search_to_nth_level");
814
815 btr_search_t* info;
816 mem_heap_t* heap = NULL;
817 ulint offsets_[REC_OFFS_NORMAL_SIZE];
818 ulint* offsets = offsets_;
819 ulint offsets2_[REC_OFFS_NORMAL_SIZE];
820 ulint* offsets2 = offsets2_;
821 rec_offs_init(offsets_);
822 rec_offs_init(offsets2_);
823 /* Currently, PAGE_CUR_LE is the only search mode used for searches
824 ending to upper levels */
825
826 ut_ad(level == 0 || mode == PAGE_CUR_LE
827 || RTREE_SEARCH_MODE(mode));
828 ut_ad(dict_index_check_search_tuple(index, tuple));
829 ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
830 ut_ad(dtuple_check_typed(tuple));
831 ut_ad(!(index->type & DICT_FTS));
832 ut_ad(index->page != FIL_NULL);
833
834 UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
835 UNIV_MEM_INVALID(&cursor->up_bytes, sizeof cursor->up_bytes);
836 UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
837 UNIV_MEM_INVALID(&cursor->low_bytes, sizeof cursor->low_bytes);
838 #ifdef UNIV_DEBUG
839 cursor->up_match = ULINT_UNDEFINED;
840 cursor->low_match = ULINT_UNDEFINED;
841 #endif /* UNIV_DEBUG */
842
843 ibool s_latch_by_caller;
844
845 s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
846
847 ut_ad(!s_latch_by_caller
848 || srv_read_only_mode
849 || mtr_memo_contains_flagged(mtr,
850 dict_index_get_lock(index),
851 MTR_MEMO_S_LOCK
852 | MTR_MEMO_SX_LOCK));
853
854 /* These flags are mutually exclusive, they are lumped together
855 with the latch mode for historical reasons. It's possible for
856 none of the flags to be set. */
857 switch (UNIV_EXPECT(latch_mode
858 & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
859 0)) {
860 case 0:
861 btr_op = BTR_NO_OP;
862 break;
863 case BTR_INSERT:
864 btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
865 ? BTR_INSERT_IGNORE_UNIQUE_OP
866 : BTR_INSERT_OP;
867 break;
868 case BTR_DELETE:
869 btr_op = BTR_DELETE_OP;
870 ut_a(cursor->purge_node);
871 break;
872 case BTR_DELETE_MARK:
873 btr_op = BTR_DELMARK_OP;
874 break;
875 default:
876 /* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
877 should be specified at a time */
878 ut_error;
879 }
880
881 /* Operations on the insert buffer tree cannot be buffered. */
882 ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
883 /* Operations on the clustered index cannot be buffered. */
884 ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
885 /* Operations on the temporary table(indexes) cannot be buffered. */
886 ut_ad(btr_op == BTR_NO_OP || !dict_table_is_temporary(index->table));
887 /* Operation on the spatial index cannot be buffered. */
888 ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index));
889
890 estimate = latch_mode & BTR_ESTIMATE;
891
892 lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
893
894 modify_external = latch_mode & BTR_MODIFY_EXTERNAL;
895
896 /* Turn the flags unrelated to the latch mode off. */
897 latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);
898
899 ut_ad(!modify_external || latch_mode == BTR_MODIFY_LEAF);
900
901 ut_ad(!s_latch_by_caller
902 || latch_mode == BTR_SEARCH_LEAF
903 || latch_mode == BTR_SEARCH_TREE
904 || latch_mode == BTR_MODIFY_LEAF);
905
906 cursor->flag = BTR_CUR_BINARY;
907 cursor->index = index;
908
909 info = btr_search_get_info(index);
910
911 # ifdef UNIV_SEARCH_PERF_STAT
912 info->n_searches++;
913 # endif
914 /* Use of AHI is disabled for intrinsic table as these tables re-use
915 the index-id and AHI validation is based on index-id. */
916 if (rw_lock_get_writer(btr_get_search_latch(index))
917 == RW_LOCK_NOT_LOCKED
918 && latch_mode <= BTR_MODIFY_LEAF
919 && info->last_hash_succ
920 && !index->disable_ahi
921 && !estimate
922 # ifdef PAGE_CUR_LE_OR_EXTENDS
923 && mode != PAGE_CUR_LE_OR_EXTENDS
924 # endif /* PAGE_CUR_LE_OR_EXTENDS */
925 && !dict_index_is_spatial(index)
926 /* If !has_search_latch, we do a dirty read of
927 btr_search_enabled below, and btr_search_guess_on_hash()
928 will have to check it again. */
929 && UNIV_LIKELY(btr_search_enabled)
930 && !modify_external
931 && btr_search_guess_on_hash(index, info, tuple, mode,
932 latch_mode, cursor,
933 has_search_latch, mtr)) {
934
935 /* Search using the hash index succeeded */
936
937 ut_ad(cursor->up_match != ULINT_UNDEFINED
938 || mode != PAGE_CUR_GE);
939 ut_ad(cursor->up_match != ULINT_UNDEFINED
940 || mode != PAGE_CUR_LE);
941 ut_ad(cursor->low_match != ULINT_UNDEFINED
942 || mode != PAGE_CUR_LE);
943 btr_cur_n_sea++;
944
945 DBUG_VOID_RETURN;
946 }
947 btr_cur_n_non_sea++;
948
949 /* If the hash search did not succeed, do binary search down the
950 tree */
951
952 if (has_search_latch) {
953 /* Release possible search latch to obey latching order */
954 rw_lock_s_unlock(btr_get_search_latch(index));
955 }
956
957 /* Store the position of the tree latch we push to mtr so that we
958 know how to release it when we have latched leaf node(s) */
959
960 savepoint = mtr_set_savepoint(mtr);
961
962 switch (latch_mode) {
963 case BTR_MODIFY_TREE:
964 /* Most of delete-intended operations are purging.
965 Free blocks and read IO bandwidth should be prioritized
966 for them when the history list is growing huge. */
967 if (lock_intention == BTR_INTENTION_DELETE
968 && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
969 && buf_get_n_pending_read_ios()) {
970 mtr_x_lock(dict_index_get_lock(index), mtr);
971 } else if (dict_index_is_spatial(index)
972 && lock_intention <= BTR_INTENTION_BOTH) {
973 /* X-latch the tree if there is a possibility of a
974 pessimistic delete on the spatial index, as we could
975 latch upward in the tree */
976
977 mtr_x_lock(dict_index_get_lock(index), mtr);
978 } else {
979 mtr_sx_lock(dict_index_get_lock(index), mtr);
980 }
981 upper_rw_latch = RW_X_LATCH;
982 break;
983 case BTR_CONT_MODIFY_TREE:
984 case BTR_CONT_SEARCH_TREE:
985 /* Do nothing */
986 ut_ad(srv_read_only_mode
987 || mtr_memo_contains_flagged(mtr,
988 dict_index_get_lock(index),
989 MTR_MEMO_X_LOCK
990 | MTR_MEMO_SX_LOCK));
991 if (dict_index_is_spatial(index)
992 && latch_mode == BTR_CONT_MODIFY_TREE) {
993 /* If we are about to locate the parent page for a split
994 and/or merge operation on an R-Tree index, X-latch
995 the parent */
996 upper_rw_latch = RW_X_LATCH;
997 } else {
998 upper_rw_latch = RW_NO_LATCH;
999 }
1000 break;
1001 default:
1002 if (!srv_read_only_mode) {
1003 if (s_latch_by_caller) {
1004 ut_ad(rw_lock_own(dict_index_get_lock(index),
1005 RW_LOCK_S));
1006 } else if (!modify_external) {
1007 /* BTR_SEARCH_TREE is intended to be used with
1008 BTR_ALREADY_S_LATCHED */
1009 ut_ad(latch_mode != BTR_SEARCH_TREE);
1010
1011 mtr_s_lock(dict_index_get_lock(index), mtr);
1012 } else {
1013 /* BTR_MODIFY_EXTERNAL needs to be excluded */
1014 mtr_sx_lock(dict_index_get_lock(index), mtr);
1015 }
1016 upper_rw_latch = RW_S_LATCH;
1017 } else {
1018 upper_rw_latch = RW_NO_LATCH;
1019 }
1020 }
1021 root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
1022
1023 page_cursor = btr_cur_get_page_cur(cursor);
1024
1025 const ulint space = dict_index_get_space(index);
1026 const page_size_t page_size(dict_table_page_size(index->table));
1027
1028 /* Start with the root page. */
1029 page_id_t page_id(space, dict_index_get_page(index));
1030
1031 if (root_leaf_rw_latch == RW_X_LATCH) {
1032 node_ptr_max_size = dict_index_node_ptr_max_size(index);
1033 }
1034
1035 up_match = 0;
1036 up_bytes = 0;
1037 low_match = 0;
1038 low_bytes = 0;
1039
1040 height = ULINT_UNDEFINED;
1041
1042 /* We use these modified search modes on non-leaf levels of the
1043 B-tree. These let us end up in the right B-tree leaf. In that leaf
1044 we use the original search mode. */
1045
1046 switch (mode) {
1047 case PAGE_CUR_GE:
1048 page_mode = PAGE_CUR_L;
1049 break;
1050 case PAGE_CUR_G:
1051 page_mode = PAGE_CUR_LE;
1052 break;
1053 default:
1054 #ifdef PAGE_CUR_LE_OR_EXTENDS
1055 ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
1056 || RTREE_SEARCH_MODE(mode)
1057 || mode == PAGE_CUR_LE_OR_EXTENDS);
1058 #else /* PAGE_CUR_LE_OR_EXTENDS */
1059 ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
1060 || RTREE_SEARCH_MODE(mode));
1061 #endif /* PAGE_CUR_LE_OR_EXTENDS */
1062 page_mode = mode;
1063 break;
1064 }
1065
1066 /* Loop and search until we arrive at the desired level */
1067 btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};
1068
1069 search_loop:
1070 buf_mode = BUF_GET;
1071 rw_latch = RW_NO_LATCH;
1072 rtree_parent_modified = false;
1073
1074 if (height != 0) {
1075 /* We are about to fetch the root or a non-leaf page. */
1076 if ((latch_mode != BTR_MODIFY_TREE
1077 || height == level)
1078 && !retrying_for_search_prev) {
1079 /* If we do not hold an SX or X latch on the index,
1080 each page should be latched before reading. */
1081 if (modify_external
1082 && height == ULINT_UNDEFINED
1083 && upper_rw_latch == RW_S_LATCH) {
1084 /* needs sx-latch of root page
1085 for fseg operation */
1086 rw_latch = RW_SX_LATCH;
1087 } else {
1088 rw_latch = upper_rw_latch;
1089 }
1090 }
1091 } else if (latch_mode <= BTR_MODIFY_LEAF) {
1092 rw_latch = latch_mode;
1093
1094 if (btr_op != BTR_NO_OP
1095 && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {
1096
1097 /* Try to buffer the operation if the leaf
1098 page is not in the buffer pool. */
1099
1100 buf_mode = btr_op == BTR_DELETE_OP
1101 ? BUF_GET_IF_IN_POOL_OR_WATCH
1102 : BUF_GET_IF_IN_POOL;
1103 }
1104 }
1105
1106 retry_page_get:
1107 ut_ad(n_blocks < BTR_MAX_LEVELS);
1108 tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
1109 block = buf_page_get_gen(
1110 page_id, page_size, rw_latch,
1111 (height == ULINT_UNDEFINED ? info->root_guess : NULL),
1112 buf_mode, file, line, mtr
1113 );
1114
1115 tree_blocks[n_blocks] = block;
1116
1117 if (block == NULL) {
1118 /* This must be a search to perform an insert, delete
1119 mark, or delete; try using the insert/delete buffer */
1120
1121 ut_ad(height == 0);
1122 ut_ad(cursor->thr);
1123
1124 switch (btr_op) {
1125 case BTR_INSERT_OP:
1126 case BTR_INSERT_IGNORE_UNIQUE_OP:
1127 ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
1128 ut_ad(!dict_index_is_spatial(index));
1129
1130 if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
1131 page_id, page_size, cursor->thr)) {
1132
1133 cursor->flag = BTR_CUR_INSERT_TO_IBUF;
1134
1135 goto func_exit;
1136 }
1137 break;
1138
1139 case BTR_DELMARK_OP:
1140 ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
1141 ut_ad(!dict_index_is_spatial(index));
1142
1143 if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
1144 index, page_id, page_size,
1145 cursor->thr)) {
1146
1147 cursor->flag = BTR_CUR_DEL_MARK_IBUF;
1148
1149 goto func_exit;
1150 }
1151
1152 break;
1153
1154 case BTR_DELETE_OP:
1155 ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
1156 ut_ad(!dict_index_is_spatial(index));
1157
1158 if (!row_purge_poss_sec(cursor->purge_node,
1159 index, tuple)) {
1160
1161 /* The record cannot be purged yet. */
1162 cursor->flag = BTR_CUR_DELETE_REF;
1163 } else if (ibuf_insert(IBUF_OP_DELETE, tuple,
1164 index, page_id, page_size,
1165 cursor->thr)) {
1166
1167 /* The purge was buffered. */
1168 cursor->flag = BTR_CUR_DELETE_IBUF;
1169 } else {
1170 /* The purge could not be buffered. */
1171 buf_pool_watch_unset(page_id);
1172 break;
1173 }
1174
1175 buf_pool_watch_unset(page_id);
1176 goto func_exit;
1177
1178 default:
1179 ut_error;
1180 }
1181
1182 /* Insert to the insert/delete buffer did not succeed, we
1183 must read the page from disk. */
1184
1185 buf_mode = BUF_GET;
1186
1187 goto retry_page_get;
1188 }
1189
1190 if (retrying_for_search_prev && height != 0) {
1191 /* also latch left sibling */
1192 ulint left_page_no;
1193 buf_block_t* get_block;
1194
1195 ut_ad(rw_latch == RW_NO_LATCH);
1196
1197 rw_latch = upper_rw_latch;
1198
1199 rw_lock_s_lock(&block->lock);
1200 left_page_no = btr_page_get_prev(
1201 buf_block_get_frame(block), mtr);
1202 rw_lock_s_unlock(&block->lock);
1203
1204 if (left_page_no != FIL_NULL) {
1205 ut_ad(prev_n_blocks < leftmost_from_level);
1206
1207 prev_tree_savepoints[prev_n_blocks]
1208 = mtr_set_savepoint(mtr);
1209 get_block = buf_page_get_gen(
1210 page_id_t(page_id.space(), left_page_no),
1211 page_size, rw_latch, NULL, buf_mode,
1212 file, line, mtr);
1213 prev_tree_blocks[prev_n_blocks] = get_block;
1214 prev_n_blocks++;
1215
1216 /* BTR_MODIFY_TREE doesn't update prev/next_page_no,
1217 without their parent page's lock. So, not needed to
1218 retry here, because we have the parent page's lock. */
1219 }
1220
1221 /* release RW_NO_LATCH page and lock with RW_S_LATCH */
1222 mtr_release_block_at_savepoint(
1223 mtr, tree_savepoints[n_blocks],
1224 tree_blocks[n_blocks]);
1225
1226 tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
1227 block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
1228 buf_mode, file, line, mtr);
1229 tree_blocks[n_blocks] = block;
1230 }
1231
1232 page = buf_block_get_frame(block);
1233
1234 if (height == ULINT_UNDEFINED
1235 && page_is_leaf(page)
1236 && rw_latch != RW_NO_LATCH
1237 && rw_latch != root_leaf_rw_latch) {
1238 /* We should retry to get the page, because the root page
1239 is latched with different level as a leaf page. */
1240 ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
1241 ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH);
1242 ut_ad(rw_latch == RW_S_LATCH || modify_external);
1243
1244 ut_ad(n_blocks == 0);
1245 mtr_release_block_at_savepoint(
1246 mtr, tree_savepoints[n_blocks],
1247 tree_blocks[n_blocks]);
1248
1249 upper_rw_latch = root_leaf_rw_latch;
1250 goto search_loop;
1251 }
1252
1253 if (rw_latch != RW_NO_LATCH) {
1254 #ifdef UNIV_ZIP_DEBUG
1255 const page_zip_des_t* page_zip
1256 = buf_block_get_page_zip(block);
1257 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
1258 #endif /* UNIV_ZIP_DEBUG */
1259
1260 buf_block_dbg_add_level(
1261 block, dict_index_is_ibuf(index)
1262 ? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
1263 }
1264
1265 ut_ad(fil_page_index_page_check(page));
1266 ut_ad(index->id == btr_page_get_index_id(page));
1267
1268 if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
1269 /* We are in the root node */
1270
1271 height = btr_page_get_level(page, mtr);
1272 root_height = height;
1273 cursor->tree_height = root_height + 1;
1274
1275 if (dict_index_is_spatial(index)) {
1276 ut_ad(cursor->rtr_info);
1277
1278 node_seq_t seq_no = rtr_get_current_ssn_id(index);
1279
1280 /* If SSN in memory is not initialized, fetch
1281 it from root page */
1282 if (seq_no < 1) {
1283 node_seq_t root_seq_no;
1284
1285 root_seq_no = page_get_ssn_id(page);
1286
1287 mutex_enter(&(index->rtr_ssn.mutex));
1288 index->rtr_ssn.seq_no = root_seq_no + 1;
1289 mutex_exit(&(index->rtr_ssn.mutex));
1290 }
1291
1292 /* Save the MBR */
1293 cursor->rtr_info->thr = cursor->thr;
1294 rtr_get_mbr_from_tuple(tuple, &cursor->rtr_info->mbr);
1295 }
1296
1297 info->root_guess = block;
1298 }
1299
1300 if (height == 0) {
1301 if (rw_latch == RW_NO_LATCH) {
1302
1303 latch_leaves = btr_cur_latch_leaves(
1304 block, page_id, page_size, latch_mode,
1305 cursor, mtr);
1306 }
1307
1308 switch (latch_mode) {
1309 case BTR_MODIFY_TREE:
1310 case BTR_CONT_MODIFY_TREE:
1311 case BTR_CONT_SEARCH_TREE:
1312 break;
1313 default:
1314 if (!s_latch_by_caller
1315 && !srv_read_only_mode
1316 && !modify_external) {
1317 /* Release the tree s-latch */
1318 /* NOTE: BTR_MODIFY_EXTERNAL
1319 needs to keep tree sx-latch */
1320 mtr_release_s_latch_at_savepoint(
1321 mtr, savepoint,
1322 dict_index_get_lock(index));
1323 }
1324
1325 /* release upper blocks */
1326 if (retrying_for_search_prev) {
1327 for (;
1328 prev_n_releases < prev_n_blocks;
1329 prev_n_releases++) {
1330 mtr_release_block_at_savepoint(
1331 mtr,
1332 prev_tree_savepoints[
1333 prev_n_releases],
1334 prev_tree_blocks[
1335 prev_n_releases]);
1336 }
1337 }
1338
1339 for (; n_releases < n_blocks; n_releases++) {
1340 if (n_releases == 0 && modify_external) {
1341 /* keep latch of root page */
1342 ut_ad(mtr_memo_contains_flagged(
1343 mtr, tree_blocks[n_releases],
1344 MTR_MEMO_PAGE_SX_FIX
1345 | MTR_MEMO_PAGE_X_FIX));
1346 continue;
1347 }
1348
1349 mtr_release_block_at_savepoint(
1350 mtr, tree_savepoints[n_releases],
1351 tree_blocks[n_releases]);
1352 }
1353 }
1354
1355 page_mode = mode;
1356 }
1357
1358 if (dict_index_is_spatial(index)) {
1359 /* Remember the page search mode */
1360 search_mode = page_mode;
1361
1362 /* Some adjustment on search mode, when the
1363 page search mode is PAGE_CUR_RTREE_LOCATE
1364 or PAGE_CUR_RTREE_INSERT, as we are searching
1365 with MBRs. When it is not the target level, we
1366 should search all sub-trees that "CONTAIN" the
1367 search range/MBR. When it is at the target
1368 level, the search becomes PAGE_CUR_LE */
1369 if (page_mode == PAGE_CUR_RTREE_LOCATE
1370 && level == height) {
1371 if (level == 0) {
1372 page_mode = PAGE_CUR_LE;
1373 } else {
1374 page_mode = PAGE_CUR_RTREE_GET_FATHER;
1375 }
1376 }
1377
1378 if (page_mode == PAGE_CUR_RTREE_INSERT) {
1379 page_mode = (level == height)
1380 ? PAGE_CUR_LE
1381 : PAGE_CUR_RTREE_INSERT;
1382
1383 ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE);
1384 }
1385
1386 /* "need_path" indicates whether we need to track the parent
1387 pages; if it is not a spatial comparison, then there is no
1388 need to track them */
1389 if (page_mode < PAGE_CUR_CONTAIN) {
1390 need_path = false;
1391 }
1392
1393 up_match = 0;
1394 low_match = 0;
1395
1396 if (latch_mode == BTR_MODIFY_TREE
1397 || latch_mode == BTR_CONT_MODIFY_TREE
1398 || latch_mode == BTR_CONT_SEARCH_TREE) {
1399 /* Tree are locked, no need for Page Lock to protect
1400 the "path" */
1401 cursor->rtr_info->need_page_lock = false;
1402 }
1403 }
1404
1405 if (dict_index_is_spatial(index) && page_mode >= PAGE_CUR_CONTAIN) {
1406 ut_ad(need_path);
1407 found = rtr_cur_search_with_match(
1408 block, index, tuple, page_mode, page_cursor,
1409 cursor->rtr_info);
1410
1411 /* Need to use BTR_MODIFY_TREE to do the MBR adjustment */
1412 if (search_mode == PAGE_CUR_RTREE_INSERT
1413 && cursor->rtr_info->mbr_adj) {
1414 if (latch_mode & BTR_MODIFY_LEAF) {
1415 /* Parent MBR needs updated, should retry
1416 with BTR_MODIFY_TREE */
1417 goto func_exit;
1418 } else if (latch_mode & BTR_MODIFY_TREE) {
1419 rtree_parent_modified = true;
1420 cursor->rtr_info->mbr_adj = false;
1421 mbr_adj = true;
1422 } else {
1423 ut_ad(0);
1424 }
1425 }
1426
1427 if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) {
1428 cursor->low_match =
1429 DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
1430 }
1431 } else if (height == 0 && btr_search_enabled
1432 && !dict_index_is_spatial(index)) {
1433 /* The adaptive hash index is only used when searching
1434 for leaf pages (height==0), but not in r-trees.
1435 We only need the byte prefix comparison for the purpose
1436 of updating the adaptive hash index. */
1437 page_cur_search_with_match_bytes(
1438 block, index, tuple, page_mode, &up_match, &up_bytes,
1439 &low_match, &low_bytes, page_cursor);
1440 } else {
1441 /* Search for complete index fields. */
1442 up_bytes = low_bytes = 0;
1443 page_cur_search_with_match(
1444 block, index, tuple, page_mode, &up_match,
1445 &low_match, page_cursor,
1446 need_path ? cursor->rtr_info : NULL);
1447 }
1448
1449 if (estimate) {
1450 btr_cur_add_path_info(cursor, height, root_height);
1451 }
1452
1453 /* If this is the desired level, leave the loop */
1454
1455 ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor),
1456 mtr));
1457
1458 /* Add Predicate lock if it is serializable isolation
1459 and only if it is in the search case */
1460 if (dict_index_is_spatial(index)
1461 && cursor->rtr_info->need_prdt_lock
1462 && mode != PAGE_CUR_RTREE_INSERT
1463 && mode != PAGE_CUR_RTREE_LOCATE
1464 && mode >= PAGE_CUR_CONTAIN) {
1465 trx_t* trx = thr_get_trx(cursor->thr);
1466 lock_prdt_t prdt;
1467
1468 lock_mutex_enter();
1469 lock_init_prdt_from_mbr(
1470 &prdt, &cursor->rtr_info->mbr, mode,
1471 trx->lock.lock_heap);
1472 lock_mutex_exit();
1473
1474 if (rw_latch == RW_NO_LATCH && height != 0) {
1475 rw_lock_s_lock(&(block->lock));
1476 }
1477
1478 lock_prdt_lock(block, &prdt, index, LOCK_S,
1479 LOCK_PREDICATE, cursor->thr, mtr);
1480
1481 if (rw_latch == RW_NO_LATCH && height != 0) {
1482 rw_lock_s_unlock(&(block->lock));
1483 }
1484 }
1485
1486 if (level != height) {
1487
1488 const rec_t* node_ptr;
1489 ut_ad(height > 0);
1490
1491 height--;
1492
1493 node_ptr = page_cur_get_rec(page_cursor);
1494
1495 offsets = rec_get_offsets(
1496 node_ptr, index, offsets, ULINT_UNDEFINED, &heap);
1497
1498 /* If the rec is the first or last in the page for
1499 pessimistic delete intention, it might cause node_ptr insert
1500 for the upper level. We should change the intention and retry.
1501 */
1502 if (latch_mode == BTR_MODIFY_TREE
1503 && btr_cur_need_opposite_intention(
1504 page, lock_intention, node_ptr)) {
1505
1506 need_opposite_intention:
1507 ut_ad(upper_rw_latch == RW_X_LATCH);
1508
1509 if (n_releases > 0) {
1510 /* release root block */
1511 mtr_release_block_at_savepoint(
1512 mtr, tree_savepoints[0],
1513 tree_blocks[0]);
1514 }
1515
1516 /* release all blocks */
1517 for (; n_releases <= n_blocks; n_releases++) {
1518 mtr_release_block_at_savepoint(
1519 mtr, tree_savepoints[n_releases],
1520 tree_blocks[n_releases]);
1521 }
1522
1523 lock_intention = BTR_INTENTION_BOTH;
1524
1525 page_id.reset(space, dict_index_get_page(index));
1526 up_match = 0;
1527 low_match = 0;
1528 height = ULINT_UNDEFINED;
1529
1530 n_blocks = 0;
1531 n_releases = 0;
1532
1533 goto search_loop;
1534 }
1535
1536 if (dict_index_is_spatial(index)) {
1537 if (page_rec_is_supremum(node_ptr)) {
1538 cursor->low_match = 0;
1539 cursor->up_match = 0;
1540 goto func_exit;
1541 }
1542
1543 /* If we are doing insertion or record locating,
1544 remember the tree nodes we visited */
1545 if (page_mode == PAGE_CUR_RTREE_INSERT
1546 || (search_mode == PAGE_CUR_RTREE_LOCATE
1547 && (latch_mode != BTR_MODIFY_LEAF))) {
1548 bool add_latch = false;
1549
1550 if (latch_mode == BTR_MODIFY_TREE
1551 && rw_latch == RW_NO_LATCH) {
1552 ut_ad(mtr_memo_contains_flagged(
1553 mtr, dict_index_get_lock(index),
1554 MTR_MEMO_X_LOCK
1555 | MTR_MEMO_SX_LOCK));
1556 rw_lock_s_lock(&block->lock);
1557 add_latch = true;
1558 }
1559
1560 /* Store the parent cursor location */
1561 #ifdef UNIV_DEBUG
1562 ulint num_stored = rtr_store_parent_path(
1563 block, cursor, latch_mode,
1564 height + 1, mtr);
1565 #else
1566 rtr_store_parent_path(
1567 block, cursor, latch_mode,
1568 height + 1, mtr);
1569 #endif
1570
1571 if (page_mode == PAGE_CUR_RTREE_INSERT) {
1572 btr_pcur_t* r_cursor =
1573 rtr_get_parent_cursor(
1574 cursor, height + 1,
1575 true);
1576 /* If it is insertion, there should
1577 be only one parent for each level
1578 traverse */
1579 #ifdef UNIV_DEBUG
1580 ut_ad(num_stored == 1);
1581 #endif
1582
1583 node_ptr = btr_pcur_get_rec(r_cursor);
1584
1585 }
1586
1587 if (add_latch) {
1588 rw_lock_s_unlock(&block->lock);
1589 }
1590
1591 ut_ad(!page_rec_is_supremum(node_ptr));
1592 }
1593
1594 ut_ad(page_mode == search_mode
1595 || (page_mode == PAGE_CUR_WITHIN
1596 && search_mode == PAGE_CUR_RTREE_LOCATE));
1597
1598 page_mode = search_mode;
1599 }
1600
1601 /* If the node_ptr is the first or the last record of the page,
1602 or has the same key value as the first or the last record,
1603 another page might be chosen under BTR_CONT_MODIFY_TREE.
1604 So the parent page should not be released, to avoid a deadlock
1605 from blocking another search with the same key value. */
1606 if (!detected_same_key_root
1607 && lock_intention == BTR_INTENTION_BOTH
1608 && !dict_index_is_unique(index)
1609 && latch_mode == BTR_MODIFY_TREE
1610 && (up_match >= rec_offs_n_fields(offsets) - 1
1611 || low_match >= rec_offs_n_fields(offsets) - 1)) {
1612 const rec_t* first_rec
1613 = page_rec_get_next_const(
1614 page_get_infimum_rec(
1615 page));
1616 ulint matched_fields;
1617
1618 ut_ad(upper_rw_latch == RW_X_LATCH);
1619
1620 if (node_ptr == first_rec
1621 || page_rec_is_last(node_ptr, page)) {
1622 detected_same_key_root = true;
1623 } else {
1624 matched_fields = 0;
1625
1626 offsets2 = rec_get_offsets(
1627 first_rec, index, offsets2,
1628 ULINT_UNDEFINED, &heap);
1629 cmp_rec_rec_with_match(node_ptr, first_rec,
1630 offsets, offsets2, index,
1631 page_is_spatial_non_leaf(first_rec, index),
1632 false, &matched_fields);
1633
1634 if (matched_fields
1635 >= rec_offs_n_fields(offsets) - 1) {
1636 detected_same_key_root = true;
1637 } else {
1638 const rec_t* last_rec;
1639
1640 last_rec = page_rec_get_prev_const(
1641 page_get_supremum_rec(
1642 page));
1643
1644 matched_fields = 0;
1645
1646 offsets2 = rec_get_offsets(
1647 last_rec, index, offsets2,
1648 ULINT_UNDEFINED, &heap);
1649 cmp_rec_rec_with_match(
1650 node_ptr, last_rec,
1651 offsets, offsets2, index,
1652 page_is_spatial_non_leaf(last_rec, index),
1653 false, &matched_fields);
1654 if (matched_fields
1655 >= rec_offs_n_fields(offsets) - 1) {
1656 detected_same_key_root = true;
1657 }
1658 }
1659 }
1660 }
1661
1662 /* If the page might cause modify_tree,
1663 we should not release the parent page's lock. */
1664 if (!detected_same_key_root
1665 && latch_mode == BTR_MODIFY_TREE
1666 && !btr_cur_will_modify_tree(
1667 index, page, lock_intention, node_ptr,
1668 node_ptr_max_size, page_size, mtr)
1669 && !rtree_parent_modified) {
1670 ut_ad(upper_rw_latch == RW_X_LATCH);
1671 ut_ad(n_releases <= n_blocks);
1672
1673 /* we can release upper blocks */
1674 for (; n_releases < n_blocks; n_releases++) {
1675 if (n_releases == 0) {
1676 /* we should not release root page
1677 to pin to same block. */
1678 continue;
1679 }
1680
1681 /* release unused blocks to unpin */
1682 mtr_release_block_at_savepoint(
1683 mtr, tree_savepoints[n_releases],
1684 tree_blocks[n_releases]);
1685 }
1686 }
1687
1688 if (height == level
1689 && latch_mode == BTR_MODIFY_TREE) {
1690 ut_ad(upper_rw_latch == RW_X_LATCH);
1691 /* we should sx-latch root page, if released already.
1692 It contains seg_header. */
1693 if (n_releases > 0) {
1694 mtr_block_sx_latch_at_savepoint(
1695 mtr, tree_savepoints[0],
1696 tree_blocks[0]);
1697 }
1698
1699 /* x-latch the branch blocks not released yet. */
1700 for (ulint i = n_releases; i <= n_blocks; i++) {
1701 mtr_block_x_latch_at_savepoint(
1702 mtr, tree_savepoints[i],
1703 tree_blocks[i]);
1704 }
1705 }
1706
1707 /* We should consider prev_page of parent page, if the node_ptr
1708 is the leftmost of the page. because BTR_SEARCH_PREV and
1709 BTR_MODIFY_PREV latches prev_page of the leaf page. */
1710 if ((latch_mode == BTR_SEARCH_PREV
1711 || latch_mode == BTR_MODIFY_PREV)
1712 && !retrying_for_search_prev) {
1713 /* block should be latched for consistent
1714 btr_page_get_prev() */
1715 ut_ad(mtr_memo_contains_flagged(mtr, block,
1716 MTR_MEMO_PAGE_S_FIX
1717 | MTR_MEMO_PAGE_X_FIX));
1718
1719 if (btr_page_get_prev(page, mtr) != FIL_NULL
1720 && page_rec_is_first(node_ptr, page)) {
1721
1722 if (leftmost_from_level == 0) {
1723 leftmost_from_level = height + 1;
1724 }
1725 } else {
1726 leftmost_from_level = 0;
1727 }
1728
1729 if (height == 0 && leftmost_from_level > 0) {
1730 /* should retry to get also prev_page
1731 from level==leftmost_from_level. */
1732 retrying_for_search_prev = true;
1733
1734 prev_tree_blocks = static_cast<buf_block_t**>(
1735 ut_malloc_nokey(sizeof(buf_block_t*)
1736 * leftmost_from_level));
1737
1738 prev_tree_savepoints = static_cast<ulint*>(
1739 ut_malloc_nokey(sizeof(ulint)
1740 * leftmost_from_level));
1741
1742 /* back to the level (leftmost_from_level+1) */
1743 ulint idx = n_blocks
1744 - (leftmost_from_level - 1);
1745
1746 page_id.reset(
1747 space,
1748 tree_blocks[idx]->page.id.page_no());
1749
1750 for (ulint i = n_blocks
1751 - (leftmost_from_level - 1);
1752 i <= n_blocks; i++) {
1753 mtr_release_block_at_savepoint(
1754 mtr, tree_savepoints[i],
1755 tree_blocks[i]);
1756 }
1757
1758 n_blocks -= (leftmost_from_level - 1);
1759 height = leftmost_from_level;
1760 ut_ad(n_releases == 0);
1761
1762 /* replay up_match, low_match */
1763 up_match = 0;
1764 low_match = 0;
1765 rtr_info_t* rtr_info = need_path
1766 ? cursor->rtr_info : NULL;
1767
1768 for (ulint i = 0; i < n_blocks; i++) {
1769 page_cur_search_with_match(
1770 tree_blocks[i], index, tuple,
1771 page_mode, &up_match,
1772 &low_match, page_cursor,
1773 rtr_info);
1774 }
1775
1776 goto search_loop;
1777 }
1778 }
1779
1780 /* Go to the child node */
1781 page_id.reset(
1782 space,
1783 btr_node_ptr_get_child_page_no(node_ptr, offsets));
1784
1785 n_blocks++;
1786
1787 if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
1788 /* We're doing a search on an ibuf tree and we're one
1789 level above the leaf page. */
1790
1791 ut_ad(level == 0);
1792
1793 buf_mode = BUF_GET;
1794 rw_latch = RW_NO_LATCH;
1795 goto retry_page_get;
1796 }
1797
1798 if (dict_index_is_spatial(index)
1799 && page_mode >= PAGE_CUR_CONTAIN
1800 && page_mode != PAGE_CUR_RTREE_INSERT) {
1801 ut_ad(need_path);
1802 rtr_node_path_t* path =
1803 cursor->rtr_info->path;
1804
1805 if (!path->empty() && found) {
1806 #ifdef UNIV_DEBUG
1807 node_visit_t last_visit = path->back();
1808
1809 ut_ad(last_visit.page_no == page_id.page_no());
1810 #endif /* UNIV_DEBUG */
1811
1812 path->pop_back();
1813
1814 #ifdef UNIV_DEBUG
1815 if (page_mode == PAGE_CUR_RTREE_LOCATE
1816 && (latch_mode != BTR_MODIFY_LEAF)) {
1817 btr_pcur_t* cur
1818 = cursor->rtr_info->parent_path->back(
1819 ).cursor;
1820 rec_t* my_node_ptr
1821 = btr_pcur_get_rec(cur);
1822
1823 offsets = rec_get_offsets(
1824 my_node_ptr, index, offsets,
1825 ULINT_UNDEFINED, &heap);
1826
1827 ulint my_page_no
1828 = btr_node_ptr_get_child_page_no(
1829 my_node_ptr, offsets);
1830
1831 ut_ad(page_id.page_no() == my_page_no);
1832
1833 }
1834 #endif
1835 }
1836 }
1837
1838 goto search_loop;
1839 } else if (!dict_index_is_spatial(index)
1840 && latch_mode == BTR_MODIFY_TREE
1841 && lock_intention == BTR_INTENTION_INSERT
1842 && mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
1843 && page_rec_is_last(page_cur_get_rec(page_cursor), page)) {
1844
1845 /* btr_insert_into_right_sibling() might cause
1846 deleting node_ptr at upper level */
1847
1848 if (height == 0) {
1849 /* release the leaf pages if latched */
1850 for (uint i = 0; i < 3; i++) {
1851 if (latch_leaves.blocks[i] != NULL) {
1852 mtr_release_block_at_savepoint(
1853 mtr, latch_leaves.savepoints[i],
1854 latch_leaves.blocks[i]);
1855 latch_leaves.blocks[i] = NULL;
1856 }
1857 }
1858 }
1859
1860 goto need_opposite_intention;
1861 }
1862
1863 if (level != 0) {
1864 if (upper_rw_latch == RW_NO_LATCH) {
1865 /* latch the page */
1866 buf_block_t* child_block;
1867
1868 if (latch_mode == BTR_CONT_MODIFY_TREE) {
1869 child_block = btr_block_get(
1870 page_id, page_size, RW_X_LATCH,
1871 index, mtr);
1872 } else {
1873 ut_ad(latch_mode == BTR_CONT_SEARCH_TREE);
1874 child_block = btr_block_get(
1875 page_id, page_size, RW_SX_LATCH,
1876 index, mtr);
1877 }
1878
1879 btr_assert_not_corrupted(child_block, index);
1880 } else {
1881 ut_ad(mtr_memo_contains(mtr, block, upper_rw_latch));
1882 btr_assert_not_corrupted(block, index);
1883
1884 if (s_latch_by_caller) {
1885 ut_ad(latch_mode == BTR_SEARCH_TREE);
1886 /* to exclude modifying tree operations
1887 should sx-latch the index. */
1888 ut_ad(mtr_memo_contains(
1889 mtr, dict_index_get_lock(index),
1890 MTR_MEMO_SX_LOCK));
1891 /* because has sx-latch of index,
1892 can release upper blocks. */
1893 for (; n_releases < n_blocks; n_releases++) {
1894 mtr_release_block_at_savepoint(
1895 mtr,
1896 tree_savepoints[n_releases],
1897 tree_blocks[n_releases]);
1898 }
1899 }
1900 }
1901
1902 if (page_mode <= PAGE_CUR_LE) {
1903 cursor->low_match = low_match;
1904 cursor->up_match = up_match;
1905 }
1906 } else {
1907 cursor->low_match = low_match;
1908 cursor->low_bytes = low_bytes;
1909 cursor->up_match = up_match;
1910 cursor->up_bytes = up_bytes;
1911
1912 /* We do a dirty read of btr_search_enabled here. We
1913 will properly check btr_search_enabled again in
1914 btr_search_build_page_hash_index() before building a
1915 page hash index, while holding search latch. */
1916 if (btr_search_enabled && !index->disable_ahi) {
1917 btr_search_info_update(index, cursor);
1918 }
1919 ut_ad(cursor->up_match != ULINT_UNDEFINED
1920 || mode != PAGE_CUR_GE);
1921 ut_ad(cursor->up_match != ULINT_UNDEFINED
1922 || mode != PAGE_CUR_LE);
1923 ut_ad(cursor->low_match != ULINT_UNDEFINED
1924 || mode != PAGE_CUR_LE);
1925 }
1926
1927 /* For spatial index, remember what blocks are still latched */
1928 if (dict_index_is_spatial(index)
1929 && (latch_mode == BTR_MODIFY_TREE
1930 || latch_mode == BTR_MODIFY_LEAF)) {
1931 for (ulint i = 0; i < n_releases; i++) {
1932 cursor->rtr_info->tree_blocks[i] = NULL;
1933 cursor->rtr_info->tree_savepoints[i] = 0;
1934 }
1935
1936 for (ulint i = n_releases; i <= n_blocks; i++) {
1937 cursor->rtr_info->tree_blocks[i] = tree_blocks[i];
1938 cursor->rtr_info->tree_savepoints[i] = tree_savepoints[i];
1939 }
1940 }
1941
1942 func_exit:
1943
1944 if (UNIV_LIKELY_NULL(heap)) {
1945 mem_heap_free(heap);
1946 }
1947
1948 if (retrying_for_search_prev) {
1949 ut_free(prev_tree_blocks);
1950 ut_free(prev_tree_savepoints);
1951 }
1952
1953 if (has_search_latch) {
1954
1955 rw_lock_s_lock(btr_get_search_latch(index));
1956 }
1957
1958 if (mbr_adj) {
1959 /* remember that we will need to adjust parent MBR */
1960 cursor->rtr_info->mbr_adj = true;
1961 }
1962
1963 DBUG_VOID_RETURN;
1964 }
1965
1966 /** Searches an index tree and positions a tree cursor on a given level.
1967 This function will avoid latching the traversal path and so should be
1968 used only for cases where-in latching is not needed.
1969
1970 @param[in,out] index index
1971 @param[in] level the tree level of search
1972 @param[in] tuple data tuple; Note: n_fields_cmp in compared
1973 to the node ptr page node field
1974 @param[in] mode PAGE_CUR_L, ....
1975 Insert should always be made using PAGE_CUR_LE
1976 to search the position.
1977 @param[in,out] cursor tree cursor; points to record of interest.
1978 @param[in] file file name
1979 @param[in]	line			line where called from
1980 @param[in,out] mtr mtr
1981 @param[in] mark_dirty
1982 if true then mark the block as dirty */
void
btr_cur_search_to_nth_level_with_no_latch(
	dict_index_t*	index,
	ulint		level,
	const dtuple_t*	tuple,
	page_cur_mode_t	mode,
	btr_cur_t*	cursor,
	const char*	file,
	ulint		line,
	mtr_t*		mtr,
	bool		mark_dirty)
{
	page_t*		page = NULL; /* remove warning */
	buf_block_t*	block;
	ulint		height;
	ulint		up_match;
	ulint		low_match;
	ulint		rw_latch;
	page_cur_mode_t	page_mode;
	ulint		buf_mode;
	page_cur_t*	page_cursor;
	ulint		root_height = 0; /* remove warning */
	ulint		n_blocks = 0;

	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	DBUG_ENTER("btr_cur_search_to_nth_level_with_no_latch");

	/* Only intrinsic tables may be traversed without page latches:
	the assert below enforces that precondition. NOTE(review): the
	latch-free safety presumably relies on intrinsic tables not being
	shared between threads — confirm against dict_table_is_intrinsic()
	callers. */
	ut_ad(dict_table_is_intrinsic(index->table));
	ut_ad(level == 0 || mode == PAGE_CUR_LE);
	ut_ad(dict_index_check_search_tuple(index, tuple));
	ut_ad(dtuple_check_typed(tuple));
	ut_ad(index->page != FIL_NULL);

	UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
	UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
#ifdef UNIV_DEBUG
	cursor->up_match = ULINT_UNDEFINED;
	cursor->low_match = ULINT_UNDEFINED;
#endif /* UNIV_DEBUG */

	cursor->flag = BTR_CUR_BINARY;
	cursor->index = index;

	page_cursor = btr_cur_get_page_cur(cursor);

	const ulint		space = dict_index_get_space(index);
	const page_size_t	page_size(dict_table_page_size(index->table));
	/* Start with the root page. */
	page_id_t		page_id(space, dict_index_get_page(index));

	up_match = 0;
	low_match = 0;

	height = ULINT_UNDEFINED;

	/* We use these modified search modes on non-leaf levels of the
	B-tree. These let us end up in the right B-tree leaf. In that leaf
	we use the original search mode. */

	switch (mode) {
	case PAGE_CUR_GE:
		page_mode = PAGE_CUR_L;
		break;
	case PAGE_CUR_G:
		page_mode = PAGE_CUR_LE;
		break;
	default:
		page_mode = mode;
		break;
	}

	/* Loop and search until we arrive at the desired level */
	bool at_desired_level = false;
	while (!at_desired_level) {
		buf_mode = BUF_GET;
		/* RW_NO_LATCH: the block is only buffer-fixed (pinned),
		never latched, throughout the descent. */
		rw_latch = RW_NO_LATCH;

		ut_ad(n_blocks < BTR_MAX_LEVELS);

		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
				buf_mode, file, line, mtr, mark_dirty);

		page = buf_block_get_frame(block);

		if (height == ULINT_UNDEFINED) {
			/* We are in the root node */

			height = btr_page_get_level(page, mtr);
			root_height = height;
			cursor->tree_height = root_height + 1;
		}

		if (height == 0) {
			/* On leaf level. Switch back to original search mode.*/
			page_mode = mode;
		}

		page_cur_search_with_match(
				block, index, tuple, page_mode, &up_match,
				&low_match, page_cursor, NULL);

		ut_ad(height == btr_page_get_level(
			page_cur_get_page(page_cursor), mtr));

		if (level != height) {

			const rec_t*	node_ptr;
			ut_ad(height > 0);

			height--;

			/* Follow the node pointer the search positioned
			us on, down to the child page. */
			node_ptr = page_cur_get_rec(page_cursor);

			offsets = rec_get_offsets(
					node_ptr, index, offsets,
					ULINT_UNDEFINED, &heap);

			/* Go to the child node */
			page_id.reset(space, btr_node_ptr_get_child_page_no(
				node_ptr, offsets));

			n_blocks++;
		} else {
			/* If this is the desired level, leave the loop */
			at_desired_level = true;
		}
	}

	/* Report the match lengths of the final positioning back to
	the caller through the cursor. */
	cursor->low_match = low_match;
	cursor->up_match = up_match;

	if (heap != NULL) {
		mem_heap_free(heap);
	}

	DBUG_VOID_RETURN;
}
2124
2125 /*****************************************************************//**
2126 Opens a cursor at either end of an index. */
void
btr_cur_open_at_index_side_func(
/*============================*/
	bool		from_left,	/*!< in: true if open to the low end,
					false if to the high end */
	dict_index_t*	index,		/*!< in: index */
	ulint		latch_mode,	/*!< in: latch mode */
	btr_cur_t*	cursor,		/*!< in/out: cursor */
	ulint		level,		/*!< in: level to search for
					(0=leaf). */
	const char*	file,		/*!< in: file name */
	ulint		line,		/*!< in: line where called */
	mtr_t*		mtr)		/*!< in/out: mini-transaction */
{
	page_cur_t*	page_cursor;
	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
	ulint		height;
	ulint		root_height = 0; /* remove warning */
	rec_t*		node_ptr;
	ulint		estimate;
	ulint		savepoint;
	ulint		upper_rw_latch, root_leaf_rw_latch;
	btr_intention_t	lock_intention;
	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
	ulint		tree_savepoints[BTR_MAX_LEVELS];
	ulint		n_blocks = 0;
	ulint		n_releases = 0;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	/* Strip the modifier bits out of latch_mode before it is
	compared against the plain BTR_* modes below. */
	estimate = latch_mode & BTR_ESTIMATE;
	latch_mode &= ~BTR_ESTIMATE;

	ut_ad(level != ULINT_UNDEFINED);

	bool	s_latch_by_caller;

	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
	latch_mode &= ~BTR_ALREADY_S_LATCHED;

	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);

	ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));

	/* This function doesn't need to lock left page of the leaf page */
	if (latch_mode == BTR_SEARCH_PREV) {
		latch_mode = BTR_SEARCH_LEAF;
	} else if (latch_mode == BTR_MODIFY_PREV) {
		latch_mode = BTR_MODIFY_LEAF;
	}

	/* Store the position of the tree latch we push to mtr so that we
	know how to release it when we have latched the leaf node */

	savepoint = mtr_set_savepoint(mtr);

	switch (latch_mode) {
	case BTR_CONT_MODIFY_TREE:
	case BTR_CONT_SEARCH_TREE:
		upper_rw_latch = RW_NO_LATCH;
		break;
	case BTR_MODIFY_TREE:
		/* Most of delete-intended operations are purging.
		Free blocks and read IO bandwidth should be prior
		for them, when the history list is growing huge. */
		if (lock_intention == BTR_INTENTION_DELETE
		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
		    && buf_get_n_pending_read_ios()) {
			mtr_x_lock(dict_index_get_lock(index), mtr);
		} else {
			mtr_sx_lock(dict_index_get_lock(index), mtr);
		}
		upper_rw_latch = RW_X_LATCH;
		break;
	default:
		ut_ad(!s_latch_by_caller
		      || mtr_memo_contains_flagged(mtr,
						   dict_index_get_lock(index),
						   MTR_MEMO_SX_LOCK
						   | MTR_MEMO_S_LOCK));
		if (!srv_read_only_mode) {
			if (!s_latch_by_caller) {
				/* BTR_SEARCH_TREE is intended to be used with
				BTR_ALREADY_S_LATCHED */
				ut_ad(latch_mode != BTR_SEARCH_TREE);

				mtr_s_lock(dict_index_get_lock(index), mtr);
			}
			upper_rw_latch = RW_S_LATCH;
		} else {
			/* In read-only mode nobody modifies the tree,
			so no index or block latches are needed. */
			upper_rw_latch = RW_NO_LATCH;
		}
	}
	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);

	page_cursor = btr_cur_get_page_cur(cursor);
	cursor->index = index;

	page_id_t		page_id(dict_index_get_space(index),
					dict_index_get_page(index));
	const page_size_t&	page_size = dict_table_page_size(index->table);

	if (root_leaf_rw_latch == RW_X_LATCH) {
		node_ptr_max_size = dict_index_node_ptr_max_size(index);
	}

	height = ULINT_UNDEFINED;

	/* Descend from the root, always following the first (from_left)
	or last (!from_left) node pointer at each level. */
	for (;;) {
		buf_block_t*	block;
		page_t*		page;
		ulint		rw_latch;

		ut_ad(n_blocks < BTR_MAX_LEVELS);

		if (height != 0
		    && (latch_mode != BTR_MODIFY_TREE
			|| height == level)) {
			rw_latch = upper_rw_latch;
		} else {
			rw_latch = RW_NO_LATCH;
		}

		/* Remember a savepoint per visited block so each can be
		released individually once it is no longer needed. */
		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
					 BUF_GET, file, line, mtr);
		tree_blocks[n_blocks] = block;

		page = buf_block_get_frame(block);

		if (height == ULINT_UNDEFINED
		    && btr_page_get_level(page, mtr) == 0
		    && rw_latch != RW_NO_LATCH
		    && rw_latch != root_leaf_rw_latch) {
			/* We should retry to get the page, because the root page
			is latched with different level as a leaf page. */
			ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
			ut_ad(rw_latch == RW_S_LATCH);

			ut_ad(n_blocks == 0);
			mtr_release_block_at_savepoint(
				mtr, tree_savepoints[n_blocks],
				tree_blocks[n_blocks]);

			upper_rw_latch = root_leaf_rw_latch;
			continue;
		}

		ut_ad(fil_page_index_page_check(page));
		ut_ad(index->id == btr_page_get_index_id(page));

		if (height == ULINT_UNDEFINED) {
			/* We are in the root node */

			height = btr_page_get_level(page, mtr);
			root_height = height;
			ut_a(height >= level);
		} else {
			/* TODO: flag the index corrupted if this fails */
			ut_ad(height == btr_page_get_level(page, mtr));
		}

		if (height == level) {
			if (srv_read_only_mode) {
				btr_cur_latch_leaves(
					block, page_id, page_size,
					latch_mode, cursor, mtr);
			} else if (height == 0) {
				if (rw_latch == RW_NO_LATCH) {
					btr_cur_latch_leaves(
						block, page_id, page_size,
						latch_mode, cursor, mtr);
				}
				/* In versions <= 3.23.52 we had
				forgotten to release the tree latch
				here. If in an index scan we had to
				scan far to find a record visible to
				the current transaction, that could
				starve others waiting for the tree
				latch. */

				switch (latch_mode) {
				case BTR_MODIFY_TREE:
				case BTR_CONT_MODIFY_TREE:
				case BTR_CONT_SEARCH_TREE:
					break;
				default:
					if (!s_latch_by_caller) {
						/* Release the tree s-latch */
						mtr_release_s_latch_at_savepoint(
							mtr, savepoint,
							dict_index_get_lock(
								index));
					}

					/* release upper blocks */
					for (; n_releases < n_blocks;
					     n_releases++) {
						mtr_release_block_at_savepoint(
							mtr,
							tree_savepoints[
								n_releases],
							tree_blocks[
								n_releases]);
					}
				}
			} else { /* height != 0 */
				/* We already have the block latched. */
				ut_ad(latch_mode == BTR_SEARCH_TREE);
				ut_ad(s_latch_by_caller);
				ut_ad(upper_rw_latch == RW_S_LATCH);

				ut_ad(mtr_memo_contains(mtr, block,
							upper_rw_latch));

				/* NOTE(review): s_latch_by_caller is
				asserted true just above, so this test is
				always true here; kept as belt-and-braces
				for release builds. */
				if (s_latch_by_caller) {
					/* to exclude modifying tree operations
					should sx-latch the index. */
					ut_ad(mtr_memo_contains(
						mtr,
						dict_index_get_lock(index),
						MTR_MEMO_SX_LOCK));
					/* because has sx-latch of index,
					can release upper blocks. */
					for (; n_releases < n_blocks;
					     n_releases++) {
						mtr_release_block_at_savepoint(
							mtr,
							tree_savepoints[
								n_releases],
							tree_blocks[
								n_releases]);
					}
				}
			}
		}

		if (from_left) {
			page_cur_set_before_first(block, page_cursor);
		} else {
			page_cur_set_after_last(block, page_cursor);
		}

		if (height == level) {
			if (estimate) {
				btr_cur_add_path_info(cursor, height,
						      root_height);
			}

			break;
		}

		ut_ad(height > 0);

		/* Step from the infimum/supremum onto the first or last
		node-pointer record of this page. */
		if (from_left) {
			page_cur_move_to_next(page_cursor);
		} else {
			page_cur_move_to_prev(page_cursor);
		}

		if (estimate) {
			btr_cur_add_path_info(cursor, height, root_height);
		}

		height--;

		node_ptr = page_cur_get_rec(page_cursor);
		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
					  ULINT_UNDEFINED, &heap);

		/* If the rec is the first or last in the page for
		pessimistic delete intention, it might cause node_ptr insert
		for the upper level. We should change the intention and retry.
		*/
		if (latch_mode == BTR_MODIFY_TREE
		    && btr_cur_need_opposite_intention(
			page, lock_intention, node_ptr)) {

			ut_ad(upper_rw_latch == RW_X_LATCH);
			/* release all blocks */
			for (; n_releases <= n_blocks; n_releases++) {
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}

			lock_intention = BTR_INTENTION_BOTH;

			/* Restart the whole descent from the root. */
			page_id.set_page_no(dict_index_get_page(index));

			height = ULINT_UNDEFINED;

			n_blocks = 0;
			n_releases = 0;

			continue;
		}

		if (latch_mode == BTR_MODIFY_TREE
		    && !btr_cur_will_modify_tree(
			    cursor->index, page, lock_intention, node_ptr,
			    node_ptr_max_size, page_size, mtr)) {
			ut_ad(upper_rw_latch == RW_X_LATCH);
			ut_ad(n_releases <= n_blocks);

			/* we can release upper blocks */
			for (; n_releases < n_blocks; n_releases++) {
				if (n_releases == 0) {
					/* we should not release root page
					to pin to same block. */
					continue;
				}

				/* release unused blocks to unpin */
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}
		}

		if (height == level
		    && latch_mode == BTR_MODIFY_TREE) {
			ut_ad(upper_rw_latch == RW_X_LATCH);
			/* we should sx-latch root page, if released already.
			It contains seg_header. */
			if (n_releases > 0) {
				mtr_block_sx_latch_at_savepoint(
					mtr, tree_savepoints[0],
					tree_blocks[0]);
			}

			/* x-latch the branch blocks not released yet. */
			for (ulint i = n_releases; i <= n_blocks; i++) {
				mtr_block_x_latch_at_savepoint(
					mtr, tree_savepoints[i],
					tree_blocks[i]);
			}
		}

		/* Go to the child node */
		page_id.set_page_no(
			btr_node_ptr_get_child_page_no(node_ptr, offsets));

		n_blocks++;
	}

	if (heap) {
		mem_heap_free(heap);
	}
}
2479
2480 /** Opens a cursor at either end of an index.
2481 Avoid taking latches on buffer, just pin (by incrementing fix_count)
2482 to keep them in buffer pool. This mode is used by intrinsic table
2483 as they are not shared and so there is no need of latching.
2484 @param[in] from_left true if open to low end, false if open
2485 to high end.
2486 @param[in] index index
@param[in,out]	cursor		cursor
@param[in]	level		the tree level to position on (0=leaf)
2488 @param[in] file file name
2489 @param[in] line line where called
2490 @param[in,out] mtr mini transaction
2491 */
2492 void
btr_cur_open_at_index_side_with_no_latch_func(bool from_left,dict_index_t * index,btr_cur_t * cursor,ulint level,const char * file,ulint line,mtr_t * mtr)2493 btr_cur_open_at_index_side_with_no_latch_func(
2494 bool from_left,
2495 dict_index_t* index,
2496 btr_cur_t* cursor,
2497 ulint level,
2498 const char* file,
2499 ulint line,
2500 mtr_t* mtr)
2501 {
2502 page_cur_t* page_cursor;
2503 ulint height;
2504 rec_t* node_ptr;
2505 ulint n_blocks = 0;
2506 mem_heap_t* heap = NULL;
2507 ulint offsets_[REC_OFFS_NORMAL_SIZE];
2508 ulint* offsets = offsets_;
2509 rec_offs_init(offsets_);
2510
2511 ut_ad(level != ULINT_UNDEFINED);
2512
2513 page_cursor = btr_cur_get_page_cur(cursor);
2514 cursor->index = index;
2515 page_id_t page_id(dict_index_get_space(index),
2516 dict_index_get_page(index));
2517 const page_size_t& page_size = dict_table_page_size(index->table);
2518
2519 height = ULINT_UNDEFINED;
2520
2521 for (;;) {
2522 buf_block_t* block;
2523 page_t* page;
2524 ulint rw_latch = RW_NO_LATCH;
2525
2526 ut_ad(n_blocks < BTR_MAX_LEVELS);
2527
2528 block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2529 BUF_GET, file, line, mtr);
2530
2531 page = buf_block_get_frame(block);
2532
2533 ut_ad(fil_page_index_page_check(page));
2534 ut_ad(index->id == btr_page_get_index_id(page));
2535
2536 if (height == ULINT_UNDEFINED) {
2537 /* We are in the root node */
2538
2539 height = btr_page_get_level(page, mtr);
2540 ut_a(height >= level);
2541 } else {
2542 /* TODO: flag the index corrupted if this fails */
2543 ut_ad(height == btr_page_get_level(page, mtr));
2544 }
2545
2546 if (from_left) {
2547 page_cur_set_before_first(block, page_cursor);
2548 } else {
2549 page_cur_set_after_last(block, page_cursor);
2550 }
2551
2552 if (height == level) {
2553 break;
2554 }
2555
2556 ut_ad(height > 0);
2557
2558 if (from_left) {
2559 page_cur_move_to_next(page_cursor);
2560 } else {
2561 page_cur_move_to_prev(page_cursor);
2562 }
2563
2564 height--;
2565
2566 node_ptr = page_cur_get_rec(page_cursor);
2567 offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2568 ULINT_UNDEFINED, &heap);
2569
2570 /* Go to the child node */
2571 page_id.set_page_no(
2572 btr_node_ptr_get_child_page_no(node_ptr, offsets));
2573
2574 n_blocks++;
2575 }
2576
2577 if (heap != NULL) {
2578 mem_heap_free(heap);
2579 }
2580 }
2581
2582 /**********************************************************************//**
2583 Positions a cursor at a randomly chosen position within a B-tree.
2584 @return true if the index is available and we have put the cursor, false
2585 if the index is unavailable */
bool
btr_cur_open_at_rnd_pos_func(
/*=========================*/
	dict_index_t*	index,		/*!< in: index */
	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
	btr_cur_t*	cursor,		/*!< in/out: B-tree cursor */
	const char*	file,		/*!< in: file name */
	ulint		line,		/*!< in: line where called */
	mtr_t*		mtr)		/*!< in: mtr */
{
	page_cur_t*	page_cursor;
	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
	ulint		height;
	rec_t*		node_ptr;
	ulint		savepoint;
	ulint		upper_rw_latch, root_leaf_rw_latch;
	btr_intention_t	lock_intention;
	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
	ulint		tree_savepoints[BTR_MAX_LEVELS];
	ulint		n_blocks = 0;
	ulint		n_releases = 0;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	rec_offs_init(offsets_);

	ut_ad(!dict_index_is_spatial(index));

	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);

	ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));

	/* Savepoint marking the index tree latch, so that it can be
	released once the leaf has been latched. */
	savepoint = mtr_set_savepoint(mtr);

	switch (latch_mode) {
	case BTR_MODIFY_TREE:
		/* Most of delete-intended operations are purging.
		Free blocks and read IO bandwidth should be prior
		for them, when the history list is growing huge. */
		if (lock_intention == BTR_INTENTION_DELETE
		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
		    && buf_get_n_pending_read_ios()) {
			mtr_x_lock(dict_index_get_lock(index), mtr);
		} else {
			mtr_sx_lock(dict_index_get_lock(index), mtr);
		}
		upper_rw_latch = RW_X_LATCH;
		break;
	case BTR_SEARCH_PREV:
	case BTR_MODIFY_PREV:
		/* This function doesn't support left uncle
		   page lock for left leaf page lock, when
		   needed. */
	case BTR_SEARCH_TREE:
	case BTR_CONT_MODIFY_TREE:
	case BTR_CONT_SEARCH_TREE:
		/* These latch modes are not supported here. */
		ut_ad(0);
		/* fall through */
	default:
		if (!srv_read_only_mode) {
			mtr_s_lock(dict_index_get_lock(index), mtr);
			upper_rw_latch = RW_S_LATCH;
		} else {
			upper_rw_latch = RW_NO_LATCH;
		}
	}

	DBUG_EXECUTE_IF("test_index_is_unavailable",
			return(false););

	if (index->page == FIL_NULL) {
		/* Since we don't hold index lock until just now, the index
		could be modified by others, for example, if this is a
		statistics updater for referenced table, it could be marked
		as unavailable by 'DROP TABLE' in the mean time, since
		we don't hold lock for statistics updater */
		return(false);
	}

	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);

	page_cursor = btr_cur_get_page_cur(cursor);
	cursor->index = index;

	page_id_t		page_id(dict_index_get_space(index),
					dict_index_get_page(index));
	const page_size_t&	page_size = dict_table_page_size(index->table);

	if (root_leaf_rw_latch == RW_X_LATCH) {
		node_ptr_max_size = dict_index_node_ptr_max_size(index);
	}

	height = ULINT_UNDEFINED;

	/* Descend from the root, picking a random user record on every
	page and following its subtree, until the leaf level. */
	for (;;) {
		buf_block_t*	block;
		page_t*		page;
		ulint		rw_latch;

		ut_ad(n_blocks < BTR_MAX_LEVELS);

		if (height != 0
		    && latch_mode != BTR_MODIFY_TREE) {
			rw_latch = upper_rw_latch;
		} else {
			rw_latch = RW_NO_LATCH;
		}

		/* One savepoint per visited block, so blocks can be
		released individually as soon as they become unneeded. */
		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
					 BUF_GET, file, line, mtr);
		tree_blocks[n_blocks] = block;

		page = buf_block_get_frame(block);

		if (height == ULINT_UNDEFINED
		    && btr_page_get_level(page, mtr) == 0
		    && rw_latch != RW_NO_LATCH
		    && rw_latch != root_leaf_rw_latch) {
			/* We should retry to get the page, because the root page
			is latched with different level as a leaf page. */
			ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
			ut_ad(rw_latch == RW_S_LATCH);

			ut_ad(n_blocks == 0);
			mtr_release_block_at_savepoint(
				mtr, tree_savepoints[n_blocks],
				tree_blocks[n_blocks]);

			upper_rw_latch = root_leaf_rw_latch;
			continue;
		}

		ut_ad(fil_page_index_page_check(page));
		ut_ad(index->id == btr_page_get_index_id(page));

		if (height == ULINT_UNDEFINED) {
			/* We are in the root node */

			height = btr_page_get_level(page, mtr);
		}

		if (height == 0) {
			if (rw_latch == RW_NO_LATCH
			    || srv_read_only_mode) {
				btr_cur_latch_leaves(
					block, page_id, page_size,
					latch_mode, cursor, mtr);
			}

			/* btr_cur_open_at_index_side_func() and
			btr_cur_search_to_nth_level() release
			tree s-latch here.*/
			switch (latch_mode) {
			case BTR_MODIFY_TREE:
			case BTR_CONT_MODIFY_TREE:
			case BTR_CONT_SEARCH_TREE:
				break;
			default:
				/* Release the tree s-latch */
				if (!srv_read_only_mode) {
					mtr_release_s_latch_at_savepoint(
						mtr, savepoint,
						dict_index_get_lock(index));
				}

				/* release upper blocks */
				for (; n_releases < n_blocks; n_releases++) {
					mtr_release_block_at_savepoint(
						mtr,
						tree_savepoints[n_releases],
						tree_blocks[n_releases]);
				}
			}
		}

		page_cur_open_on_rnd_user_rec(block, page_cursor);

		if (height == 0) {

			break;
		}

		ut_ad(height > 0);

		height--;

		node_ptr = page_cur_get_rec(page_cursor);
		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
					  ULINT_UNDEFINED, &heap);

		/* If the rec is the first or last in the page for
		pessimistic delete intention, it might cause node_ptr insert
		for the upper level. We should change the intention and retry.
		*/
		if (latch_mode == BTR_MODIFY_TREE
		    && btr_cur_need_opposite_intention(
			page, lock_intention, node_ptr)) {

			ut_ad(upper_rw_latch == RW_X_LATCH);
			/* release all blocks */
			for (; n_releases <= n_blocks; n_releases++) {
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}

			lock_intention = BTR_INTENTION_BOTH;

			/* Restart the whole descent from the root. */
			page_id.set_page_no(dict_index_get_page(index));

			height = ULINT_UNDEFINED;

			n_blocks = 0;
			n_releases = 0;

			continue;
		}

		if (latch_mode == BTR_MODIFY_TREE
		    && !btr_cur_will_modify_tree(
			    cursor->index, page, lock_intention, node_ptr,
			    node_ptr_max_size, page_size, mtr)) {
			ut_ad(upper_rw_latch == RW_X_LATCH);
			ut_ad(n_releases <= n_blocks);

			/* we can release upper blocks */
			for (; n_releases < n_blocks; n_releases++) {
				if (n_releases == 0) {
					/* we should not release root page
					to pin to same block. */
					continue;
				}

				/* release unused blocks to unpin */
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}
		}

		if (height == 0
		    && latch_mode == BTR_MODIFY_TREE) {
			ut_ad(upper_rw_latch == RW_X_LATCH);
			/* we should sx-latch root page, if released already.
			It contains seg_header. */
			if (n_releases > 0) {
				mtr_block_sx_latch_at_savepoint(
					mtr, tree_savepoints[0],
					tree_blocks[0]);
			}

			/* x-latch the branch blocks not released yet. */
			for (ulint i = n_releases; i <= n_blocks; i++) {
				mtr_block_x_latch_at_savepoint(
					mtr, tree_savepoints[i],
					tree_blocks[i]);
			}
		}

		/* Go to the child node */
		page_id.set_page_no(
			btr_node_ptr_get_child_page_no(node_ptr, offsets));

		n_blocks++;
	}

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}

	return(true);
}
2859
2860 /*==================== B-TREE INSERT =========================*/
2861
2862 /*************************************************************//**
2863 Inserts a record if there is enough space, or if enough space can
2864 be freed by reorganizing. Differs from btr_cur_optimistic_insert because
2865 no heuristics is applied to whether it pays to use CPU time for
2866 reorganizing the page or not.
2867
2868 IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
2869 if this is a compressed leaf page in a secondary index.
2870 This has to be done either within the same mini-transaction,
2871 or by invoking ibuf_reset_free_bits() before mtr_commit().
2872
2873 @return pointer to inserted record if succeed, else NULL */
2874 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2875 rec_t*
btr_cur_insert_if_possible(btr_cur_t * cursor,const dtuple_t * tuple,ulint ** offsets,mem_heap_t ** heap,ulint n_ext,mtr_t * mtr)2876 btr_cur_insert_if_possible(
2877 /*=======================*/
2878 btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
2879 cursor stays valid */
2880 const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not
2881 have been stored to tuple */
2882 ulint** offsets,/*!< out: offsets on *rec */
2883 mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
2884 ulint n_ext, /*!< in: number of externally stored columns */
2885 mtr_t* mtr) /*!< in/out: mini-transaction */
2886 {
2887 page_cur_t* page_cursor;
2888 rec_t* rec;
2889
2890 ut_ad(dtuple_check_typed(tuple));
2891
2892 ut_ad(mtr_is_block_fix(
2893 mtr, btr_cur_get_block(cursor),
2894 MTR_MEMO_PAGE_X_FIX, cursor->index->table));
2895 page_cursor = btr_cur_get_page_cur(cursor);
2896
2897 /* Now, try the insert */
2898 rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
2899 offsets, heap, n_ext, mtr);
2900
2901 /* If the record did not fit, reorganize.
2902 For compressed pages, page_cur_tuple_insert()
2903 attempted this already. */
2904 if (!rec && !page_cur_get_page_zip(page_cursor)
2905 && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
2906 rec = page_cur_tuple_insert(
2907 page_cursor, tuple, cursor->index,
2908 offsets, heap, n_ext, mtr);
2909 }
2910
2911 ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
2912 return(rec);
2913 }
2914
2915 /*************************************************************//**
2916 For an insert, checks the locks and does the undo logging if desired.
2917 @return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
dberr_t
btr_cur_ins_lock_and_undo(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags: if
				not zero, the parameters index and thr
				should be specified */
	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert */
	dtuple_t*	entry,	/*!< in/out: entry to insert */
	que_thr_t*	thr,	/*!< in: query thread or NULL */
	mtr_t*		mtr,	/*!< in/out: mini-transaction */
	ibool*		inherit)/*!< out: TRUE if the inserted new record maybe
				should inherit LOCK_GAP type locks from the
				successor record */
{
	dict_index_t*	index;
	dberr_t		err	= DB_SUCCESS;
	rec_t*		rec;
	roll_ptr_t	roll_ptr;

	/* Check if we have to wait for a lock: enqueue an explicit lock
	request if yes */

	rec = btr_cur_get_rec(cursor);
	index = cursor->index;

	ut_ad(!dict_index_is_online_ddl(index)
	      || dict_index_is_clust(index)
	      || (flags & BTR_CREATE_FLAG));
	ut_ad(mtr->is_named_space(index->space));

	/* Check if there is predicate or GAP lock preventing the insertion */
	if (!(flags & BTR_NO_LOCKING_FLAG)) {
		if (dict_index_is_spatial(index)) {
			/* Spatial indexes use predicate (MBR) locks
			instead of next-key gap locks. */
			lock_prdt_t	prdt;
			rtr_mbr_t	mbr;

			rtr_get_mbr_from_tuple(entry, &mbr);

			/* Use on stack MBR variable to test if a lock is
			needed. If so, the predicate (MBR) will be allocated
			from lock heap in lock_prdt_insert_check_and_lock() */
			lock_init_prdt_from_mbr(
				&prdt, &mbr, 0, NULL);

			err = lock_prdt_insert_check_and_lock(
				flags, rec, btr_cur_get_block(cursor),
				index, thr, mtr, &prdt);
			*inherit = false;
		} else {
			err = lock_rec_insert_check_and_lock(
				flags, rec, btr_cur_get_block(cursor),
				index, thr, mtr, inherit);
		}
	}

	if (err != DB_SUCCESS
	    || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {

		/* Undo logging is only done for clustered,
		non-insert-buffer indexes; otherwise we are done. */
		return(err);
	}

	err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
					    thr, index, entry,
					    NULL, 0, NULL, NULL,
					    &roll_ptr);
	if (err != DB_SUCCESS) {

		return(err);
	}

	/* Now we can fill in the roll ptr field in entry
	(except if table is intrinsic) */

	if (!(flags & BTR_KEEP_SYS_FLAG)
	    && !dict_table_is_intrinsic(index->table)) {

		row_upd_index_entry_sys_field(entry, index,
					      DATA_ROLL_PTR, roll_ptr);
	}

	return(DB_SUCCESS);
}
3001
3002 /**
3003 Prefetch siblings of the leaf for the pessimistic operation.
3004 @param block leaf page */
3005 static
3006 void
btr_cur_prefetch_siblings(buf_block_t * block)3007 btr_cur_prefetch_siblings(
3008 buf_block_t* block)
3009 {
3010 page_t* page = buf_block_get_frame(block);
3011
3012 ut_ad(page_is_leaf(page));
3013
3014 ulint left_page_no = fil_page_get_prev(page);
3015 ulint right_page_no = fil_page_get_next(page);
3016
3017 if (left_page_no != FIL_NULL) {
3018 buf_read_page_background(
3019 page_id_t(block->page.id.space(), left_page_no),
3020 block->page.size, false);
3021 }
3022 if (right_page_no != FIL_NULL) {
3023 buf_read_page_background(
3024 page_id_t(block->page.id.space(), right_page_no),
3025 block->page.size, false);
3026 }
3027 if (left_page_no != FIL_NULL
3028 || right_page_no != FIL_NULL) {
3029 os_aio_simulated_wake_handler_threads();
3030 }
3031 }
3032
/*************************************************************//**
Tries to perform an insert to a page in an index tree, next to cursor.
It is assumed that mtr holds an x-latch on the page. The operation does
not succeed if there is too little space on the page. If there is just
one record on the page, the insert will always succeed; this is to
prevent trying to split a page with just one record.
@return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
dberr_t
btr_cur_optimistic_insert(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags: if not
				zero, the parameters index and thr should be
				specified */
	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
				cursor stays valid */
	ulint**		offsets,/*!< out: offsets on *rec */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
	dtuple_t*	entry,	/*!< in/out: entry to insert */
	rec_t**		rec,	/*!< out: pointer to inserted record if
				succeed */
	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
				be stored externally by the caller, or
				NULL */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	que_thr_t*	thr,	/*!< in: query thread or NULL */
	mtr_t*		mtr)	/*!< in/out: mini-transaction;
				if this function returns DB_SUCCESS on
				a leaf page of a secondary index in a
				compressed tablespace, the caller must
				mtr_commit(mtr) before latching
				any further pages */
{
	big_rec_t*	big_rec_vec	= NULL;
	dict_index_t*	index;
	page_cur_t*	page_cursor;
	buf_block_t*	block;
	page_t*		page;
	rec_t*		dummy;
	ibool		leaf;
	ibool		reorg;
	ibool		inherit	= TRUE;
	ulint		rec_size;
	dberr_t		err;

	*big_rec = NULL;

	block = btr_cur_get_block(cursor);
	page = buf_block_get_frame(block);
	index = cursor->index;

	/* Block are not latched for insert if table is intrinsic
	and index is auto-generated clustered index. */
	ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
	ut_ad(!dict_index_is_online_ddl(index)
	      || dict_index_is_clust(index)
	      || (flags & BTR_CREATE_FLAG));
	ut_ad(dtuple_check_typed(entry));

	const page_size_t&	page_size = block->page.size;

#ifdef UNIV_DEBUG_VALGRIND
	if (page_size.is_compressed()) {
		UNIV_MEM_ASSERT_RW(page, page_size.logical());
		UNIV_MEM_ASSERT_RW(block->page.zip.data, page_size.physical());
	}
#endif /* UNIV_DEBUG_VALGRIND */

	leaf = page_is_leaf(page);

	/* Calculate the record size when entry is converted to a record */
	rec_size = rec_get_converted_size(index, entry, n_ext);

	if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
				   dtuple_get_n_fields(entry), page_size)) {

		/* The record is so big that we have to store some fields
		externally on separate database pages */
		big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);

		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {

			return(DB_TOO_BIG_RECORD);
		}

		/* Recompute the size: dtuple_convert_big_rec() moved some
		fields out of the in-page record. */
		rec_size = rec_get_converted_size(index, entry, n_ext);
	}

	if (page_size.is_compressed() && page_zip_is_too_big(index, entry)) {
		if (big_rec_vec != NULL) {
			/* Undo the big-rec conversion before returning, so
			that the caller's entry is left unmodified. */
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		return(DB_TOO_BIG_RECORD);
	}

	LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
				      goto fail);

	if (leaf && page_size.is_compressed()
	    && (page_get_data_size(page) + rec_size
		>= dict_index_zip_pad_optimal_page_size(index))) {
		/* If compression padding tells us that insertion will
		result in too packed up page i.e.: which is likely to
		cause compression failure then don't do an optimistic
		insertion. */
fail:
		err = DB_FAIL;

		/* prefetch siblings of the leaf for the pessimistic
		operation, if the page is leaf. */
		if (page_is_leaf(page)) {
			btr_cur_prefetch_siblings(block);
		}
fail_err:

		if (big_rec_vec) {
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		return(err);
	}

	ulint	max_size = page_get_max_insert_size_after_reorganize(page, 1);

	if (page_has_garbage(page)) {
		/* Reorganization could reclaim the garbage; only fail if
		the record would not fit even then, or if it does not fit
		without reorganizing either. */
		if ((max_size < rec_size
		     || max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT)
		    && page_get_n_recs(page) > 1
		    && page_get_max_insert_size(page, 1) < rec_size) {

			goto fail;
		}
	} else if (max_size < rec_size) {
		goto fail;
	}

	/* If there have been many consecutive inserts to the
	clustered index leaf page of an uncompressed table, check if
	we have to split the page to reserve enough free space for
	future updates of records. */

	if (leaf && !page_size.is_compressed() && dict_index_is_clust(index)
	    && page_get_n_recs(page) >= 2
	    && dict_index_get_space_reserve() + rec_size > max_size
	    && (btr_page_get_split_rec_to_right(cursor, &dummy)
		|| btr_page_get_split_rec_to_left(cursor, &dummy))) {
		goto fail;
	}

	page_cursor = btr_cur_get_page_cur(cursor);

	DBUG_PRINT("ib_cur", ("insert %s (" IB_ID_FMT ") by " TRX_ID_FMT
			      ": %s",
			      index->name(), index->id,
			      thr != NULL
			      ? trx_get_id_for_print(thr_get_trx(thr))
			      : 0,
			      rec_printer(entry).str().c_str()));

	DBUG_EXECUTE_IF("do_page_reorganize",
			btr_page_reorganize(page_cursor, index, mtr););

	/* Now, try the insert */
	{
		const rec_t*	page_cursor_rec = page_cur_get_rec(page_cursor);

		if (dict_table_is_intrinsic(index->table)) {

			index->rec_cache.rec_size = rec_size;

			*rec = page_cur_tuple_direct_insert(
				page_cursor, entry, index, n_ext, mtr);
		} else {
			/* Check locks and write to the undo log,
			if specified */
			err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
							thr, mtr, &inherit);

			if (err != DB_SUCCESS) {
				goto fail_err;
			}

			*rec = page_cur_tuple_insert(
				page_cursor, entry, index, offsets, heap,
				n_ext, mtr);
		}

		/* If the cursor record moved, the insert routine
		reorganized (recompressed) the page. */
		reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
	}

	if (*rec) {
		/* The insert succeeded; fall through to the
		post-insert bookkeeping below. */
	} else if (page_size.is_compressed()) {
		/* Reset the IBUF_BITMAP_FREE bits, because
		page_cur_tuple_insert() will have attempted page
		reorganize before failing. */
		if (leaf
		    && !dict_index_is_clust(index)
		    && !dict_table_is_temporary(index->table)) {
			ibuf_reset_free_bits(block);
		}

		goto fail;
	} else {

		/* For intrinsic table we take a consistent path
		to re-organize using pessimistic path. */
		if (dict_table_is_intrinsic(index->table)) {
			goto fail;
		}

		ut_ad(!reorg);

		/* If the record did not fit, reorganize */
		if (!btr_page_reorganize(page_cursor, index, mtr)) {
			ut_ad(0);
			goto fail;
		}

		ut_ad(page_get_max_insert_size(page, 1) == max_size);

		reorg = TRUE;

		*rec = page_cur_tuple_insert(page_cursor, entry, index,
					     offsets, heap, n_ext, mtr);

		if (UNIV_UNLIKELY(!*rec)) {
			/* The insert was estimated to fit after
			reorganization; failing here indicates
			corruption, so abort the server. */
			ib::fatal() << "Cannot insert tuple " << *entry
				<< "into index " << index->name
				<< " of table " << index->table->name
				<< ". Max size: " << max_size;
		}
	}

	if (!index->disable_ahi) {
		if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
			btr_search_update_hash_node_on_insert(cursor);
		} else {
			btr_search_update_hash_on_insert(cursor);
		}
	}

	if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {

		lock_update_insert(block, *rec);
	}

	if (leaf
	    && !dict_index_is_clust(index)
	    && !dict_table_is_temporary(index->table)) {
		/* Update the free bits of the B-tree page in the
		insert buffer bitmap. */

		/* The free bits in the insert buffer bitmap must
		never exceed the free space on a page.  It is safe to
		decrement or reset the bits in the bitmap in a
		mini-transaction that is committed before the
		mini-transaction that affects the free space. */

		/* It is unsafe to increment the bits in a separately
		committed mini-transaction, because in crash recovery,
		the free bits could momentarily be set too high. */

		if (page_size.is_compressed()) {
			/* Update the bits in the same mini-transaction. */
			ibuf_update_free_bits_zip(block, mtr);
		} else {
			/* Decrement the bits in a separate
			mini-transaction. */
			ibuf_update_free_bits_if_full(
				block, max_size,
				rec_size + PAGE_DIR_SLOT_SIZE);
		}
	}

	*big_rec = big_rec_vec;

	return(DB_SUCCESS);
}
3311
/*************************************************************//**
Performs an insert on a page of an index tree. It is assumed that mtr
holds an x-latch on the tree and on the cursor page. If the insert is
made on the leaf level, to avoid deadlocks, mtr must also own x-latches
to brothers of page, if those brothers exist.
@return DB_SUCCESS or error number */
dberr_t
btr_cur_pessimistic_insert(
/*=======================*/
	ulint		flags,	/*!< in: undo logging and locking flags: if not
				zero, the parameter thr should be
				specified; if no undo logging is specified,
				then the caller must have reserved enough
				free extents in the file space so that the
				insertion will certainly succeed */
	btr_cur_t*	cursor,	/*!< in: cursor after which to insert;
				cursor stays valid */
	ulint**		offsets,/*!< out: offsets on *rec */
	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap
				that can be emptied, or NULL */
	dtuple_t*	entry,	/*!< in/out: entry to insert */
	rec_t**		rec,	/*!< out: pointer to inserted record if
				succeed */
	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
				be stored externally by the caller, or
				NULL */
	ulint		n_ext,	/*!< in: number of externally stored columns */
	que_thr_t*	thr,	/*!< in: query thread or NULL */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	dict_index_t*	index		= cursor->index;
	big_rec_t*	big_rec_vec	= NULL;
	dberr_t		err;
	ibool		inherit		= FALSE;
	bool		success;
	ulint		n_reserved	= 0;

	ut_ad(dtuple_check_typed(entry));

	*big_rec = NULL;

	ut_ad(mtr_memo_contains_flagged(
		      mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
		      MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
	      || dict_table_is_intrinsic(cursor->index->table));
	ut_ad(mtr_is_block_fix(
		      mtr, btr_cur_get_block(cursor),
		      MTR_MEMO_PAGE_X_FIX, cursor->index->table));
	ut_ad(!dict_index_is_online_ddl(index)
	      || dict_index_is_clust(index)
	      || (flags & BTR_CREATE_FLAG));

	cursor->flag = BTR_CUR_BINARY;

	/* Check locks and write to undo log, if specified */

	err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
					thr, mtr, &inherit);

	if (err != DB_SUCCESS) {

		return(err);
	}

	if (!(flags & BTR_NO_UNDO_LOG_FLAG)
	    || dict_table_is_intrinsic(index->table)) {
		/* First reserve enough free space for the file segments
		of the index tree, so that the insert will not fail because
		of lack of space */

		/* Heuristic: reserve more extents for a taller tree,
		since a split may propagate up to the root. */
		ulint	n_extents = cursor->tree_height / 16 + 3;

		success = fsp_reserve_free_extents(&n_reserved, index->space,
						   n_extents, FSP_NORMAL, mtr);
		if (!success) {
			return(DB_OUT_OF_FILE_SPACE);
		}
	}

	if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
				   dict_table_is_comp(index->table),
				   dtuple_get_n_fields(entry),
				   dict_table_page_size(index->table))) {
		/* The record is so big that we have to store some fields
		externally on separate database pages */

		if (UNIV_LIKELY_NULL(big_rec_vec)) {
			/* This should never happen, but we handle
			the situation in a robust manner. */
			ut_ad(0);
			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
		}

		big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);

		if (big_rec_vec == NULL) {

			/* Release any extents reserved above before
			reporting the failure. */
			if (n_reserved > 0) {
				fil_space_release_free_extents(index->space,
							       n_reserved);
			}
			return(DB_TOO_BIG_RECORD);
		}
	}

	if (dict_index_get_page(index)
	    == btr_cur_get_block(cursor)->page.id.page_no()) {

		/* The page is the root page */
		*rec = btr_root_raise_and_insert(
			flags, cursor, offsets, heap, entry, n_ext, mtr);
	} else {
		*rec = btr_page_split_and_insert(
			flags, cursor, offsets, heap, entry, n_ext, mtr);
	}

	ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec
	      || dict_index_is_spatial(index));

	if (!(flags & BTR_NO_LOCKING_FLAG)) {
		ut_ad(!dict_table_is_temporary(index->table));
		if (dict_index_is_spatial(index)) {
			/* Do nothing */
		} else {
			/* The cursor might be moved to the other page
			and the max trx id field should be updated after
			the cursor was fixed. */
			if (!dict_index_is_clust(index)) {
				page_update_max_trx_id(
					btr_cur_get_block(cursor),
					btr_cur_get_page_zip(cursor),
					thr_get_trx(thr)->id, mtr);
			}
			if (!page_rec_is_infimum(btr_cur_get_rec(cursor))
			    || btr_page_get_prev(
				buf_block_get_frame(
					btr_cur_get_block(cursor)), mtr)
			       == FIL_NULL) {
				/* split and inserted need to call
				lock_update_insert() always. */
				inherit = TRUE;
			}
		}
	}

	if (!index->disable_ahi) {
		btr_search_update_hash_on_insert(cursor);
	}
	if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {

		lock_update_insert(btr_cur_get_block(cursor), *rec);
	}

	/* Return any unused reserved extents to the file space. */
	if (n_reserved > 0) {
		fil_space_release_free_extents(index->space, n_reserved);
	}

	*big_rec = big_rec_vec;

	return(DB_SUCCESS);
}
3473
3474 /*==================== B-TREE UPDATE =========================*/
3475
/*************************************************************//**
For an update, checks the locks and does the undo logging.
@return DB_SUCCESS, DB_WAIT_LOCK, or error number */
UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
dberr_t
btr_cur_upd_lock_and_undo(
/*======================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor on record to update */
	const ulint*	offsets,/*!< in: rec_get_offsets() on cursor */
	const upd_t*	update,	/*!< in: update vector */
	ulint		cmpl_info,/*!< in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/*!< in: query thread
				(can be NULL if BTR_NO_LOCKING_FLAG) */
	mtr_t*		mtr,	/*!< in/out: mini-transaction */
	roll_ptr_t*	roll_ptr)/*!< out: roll pointer */
{
	dict_index_t*	index;
	const rec_t*	rec;
	dberr_t		err;

	ut_ad(thr != NULL || (flags & BTR_NO_LOCKING_FLAG));

	rec = btr_cur_get_rec(cursor);
	index = cursor->index;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr->is_named_space(index->space));

	if (!dict_index_is_clust(index)) {
		ut_ad(dict_index_is_online_ddl(index)
		      == !!(flags & BTR_CREATE_FLAG));

		/* We do undo logging only when we update a clustered index
		record */
		return(lock_sec_rec_modify_check_and_lock(
			       flags, btr_cur_get_block(cursor), rec,
			       index, thr, mtr));
	}

	/* Check if we have to wait for a lock: enqueue an explicit lock
	request if yes */

	if (!(flags & BTR_NO_LOCKING_FLAG)) {
		err = lock_clust_rec_modify_check_and_lock(
			flags, btr_cur_get_block(cursor), rec, index,
			offsets, thr);
		if (err != DB_SUCCESS) {
			return(err);
		}
	}

	/* Append the info about the update in the undo log */

	return(trx_undo_report_row_operation(
		       flags, TRX_UNDO_MODIFY_OP, thr,
		       index, NULL, update,
		       cmpl_info, rec, offsets, roll_ptr));
}
3536
/***********************************************************//**
Writes a redo log record of updating a record in-place. */
void
btr_cur_update_in_place_log(
/*========================*/
	ulint		flags,		/*!< in: flags */
	const rec_t*	rec,		/*!< in: record */
	dict_index_t*	index,		/*!< in: index of the record */
	const upd_t*	update,		/*!< in: update vector */
	trx_id_t	trx_id,		/*!< in: transaction id */
	roll_ptr_t	roll_ptr,	/*!< in: roll ptr */
	mtr_t*		mtr)		/*!< in: mtr */
{
	byte*		log_ptr;
	const page_t*	page = page_align(rec);
	/* flags must fit in the single byte written below */
	ut_ad(flags < 256);
	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));

	/* Reserve space for: 1 byte of flags, the system fields
	(roll pointer plus compressed integers), the 2-byte record
	offset, and the usual log buffer margin. */
	log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
					    ? MLOG_COMP_REC_UPDATE_IN_PLACE
					    : MLOG_REC_UPDATE_IN_PLACE,
					    1 + DATA_ROLL_PTR_LEN + 14 + 2
					    + MLOG_BUF_MARGIN);

	if (!log_ptr) {
		/* Logging in mtr is switched off during crash recovery */
		return;
	}

	/* For secondary indexes, we could skip writing the dummy system fields
	to the redo log but we have to change redo log parsing of
	MLOG_REC_UPDATE_IN_PLACE/MLOG_COMP_REC_UPDATE_IN_PLACE or we have to add
	new redo log record. For now, just write dummy sys fields to the redo
	log if we are updating a secondary index record.
	*/
	mach_write_to_1(log_ptr, flags);
	log_ptr++;

	if (dict_index_is_clust(index)) {
		log_ptr = row_upd_write_sys_vals_to_log(
			index, trx_id, roll_ptr, log_ptr, mtr);
	} else {
		/* Dummy system fields for a secondary index */
		/* TRX_ID Position */
		log_ptr += mach_write_compressed(log_ptr, 0);
		/* ROLL_PTR */
		trx_write_roll_ptr(log_ptr, 0);
		log_ptr += DATA_ROLL_PTR_LEN;
		/* TRX_ID */
		log_ptr += mach_u64_write_compressed(log_ptr, 0);
	}

	mach_write_to_2(log_ptr, page_offset(rec));
	log_ptr += 2;

	row_upd_index_write_log(update, log_ptr, mtr);
}
3594 #endif /* UNIV_HOTBACKUP */
3595
/***********************************************************//**
Parses a redo log record of updating a record in-place.
@return end of log record or NULL */
byte*
btr_cur_parse_update_in_place(
/*==========================*/
	byte*		ptr,	/*!< in: buffer */
	byte*		end_ptr,/*!< in: buffer end */
	page_t*		page,	/*!< in/out: page or NULL */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
	dict_index_t*	index)	/*!< in: index corresponding to page */
{
	ulint		flags;
	rec_t*		rec;
	upd_t*		update;
	ulint		pos;
	trx_id_t	trx_id;
	roll_ptr_t	roll_ptr;
	ulint		rec_offset;
	mem_heap_t*	heap;
	ulint*		offsets;

	/* Need at least 1 byte for the flags; a NULL return tells the
	caller that the log record is incomplete in this buffer. */
	if (end_ptr < ptr + 1) {

		return(NULL);
	}

	flags = mach_read_from_1(ptr);
	ptr++;

	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);

	if (ptr == NULL) {

		return(NULL);
	}

	/* Need 2 more bytes for the record offset within the page. */
	if (end_ptr < ptr + 2) {

		return(NULL);
	}

	rec_offset = mach_read_from_2(ptr);
	ptr += 2;

	ut_a(rec_offset <= UNIV_PAGE_SIZE);

	heap = mem_heap_create(256);

	ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);

	/* If the parse failed, or there is no page to apply to (we are
	only scanning the log), skip the application step. */
	if (!ptr || !page) {

		goto func_exit;
	}

	ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
	rec = page + rec_offset;

	/* We do not need to reserve search latch, as the page is only
	being recovered, and there cannot be a hash index to it. */

	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);

	if (!(flags & BTR_KEEP_SYS_FLAG)) {
		row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
						   pos, trx_id, roll_ptr);
	}

	row_upd_rec_in_place(rec, index, offsets, update, page_zip);

func_exit:
	mem_heap_free(heap);

	return(ptr);
}
3672
3673 #ifndef UNIV_HOTBACKUP
/*************************************************************//**
See if there is enough place in the page modification log to log
an update-in-place.

@retval false if out of space; IBUF_BITMAP_FREE will be reset
outside mtr if the page was recompressed
@retval true if enough place;

IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
a secondary index leaf page. This has to be done either within the
same mini-transaction, or by invoking ibuf_reset_free_bits() before
mtr_commit(mtr). */
bool
btr_cur_update_alloc_zip_func(
/*==========================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
	page_cur_t*	cursor,	/*!< in/out: B-tree page cursor */
	dict_index_t*	index,	/*!< in: the index corresponding to cursor */
#ifdef UNIV_DEBUG
	ulint*		offsets,/*!< in/out: offsets of the cursor record */
#endif /* UNIV_DEBUG */
	ulint		length,	/*!< in: size needed */
	bool		create,	/*!< in: true=delete-and-insert,
				false=update-in-place */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	const page_t*	page = page_cur_get_page(cursor);

	ut_ad(page_zip == page_cur_get_page_zip(cursor));
	ut_ad(page_zip);
	ut_ad(!dict_index_is_ibuf(index));
	ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));

	/* Fast path: the modification log already has room. */
	if (page_zip_available(page_zip, dict_index_is_clust(index),
			       length, create)) {
		return(true);
	}

	if (!page_zip->m_nonempty && !page_has_garbage(page)) {
		/* The page has been freshly compressed, so
		reorganizing it will not help. */
		return(false);
	}

	/* Respect the compression padding heuristic: do not pack a
	leaf page so full that a later compression is likely to fail. */
	if (create && page_is_leaf(page)
	    && (length + page_get_data_size(page)
		>= dict_index_zip_pad_optimal_page_size(index))) {
		return(false);
	}

	if (!btr_page_reorganize(cursor, index, mtr)) {
		goto out_of_space;
	}

	/* Reorganization may have moved the record; refresh offsets. */
	rec_offs_make_valid(page_cur_get_rec(cursor), index, offsets);

	/* After recompressing a page, we must make sure that the free
	bits in the insert buffer bitmap will not exceed the free
	space on the page.  Because this function will not attempt
	recompression unless page_zip_available() fails above, it is
	safe to reset the free bits if page_zip_available() fails
	again, below.  The free bits can safely be reset in a separate
	mini-transaction.  If page_zip_available() succeeds below, we
	can be sure that the btr_page_reorganize() above did not reduce
	the free space available on the page. */

	if (page_zip_available(page_zip, dict_index_is_clust(index),
			       length, create)) {
		return(true);
	}

out_of_space:
	ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));

	/* Out of space: reset the free bits. */
	if (!dict_index_is_clust(index)
	    && !dict_table_is_temporary(index->table)
	    && page_is_leaf(page)) {
		ibuf_reset_free_bits(page_cur_get_block(cursor));
	}

	return(false);
}
3757
/*************************************************************//**
Updates a record when the update causes no size changes in its fields.
We assume here that the ordering fields of the record do not change.
@return locking or undo log related error code, or
@retval DB_SUCCESS on success
@retval DB_ZIP_OVERFLOW if there is not enough space left
on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
dberr_t
btr_cur_update_in_place(
/*====================*/
	ulint		flags,	/*!< in: undo logging and locking flags */
	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
				cursor stays valid and positioned on the
				same record */
	ulint*		offsets,/*!< in/out: offsets on cursor->page_cur.rec */
	const upd_t*	update,	/*!< in: update vector */
	ulint		cmpl_info,/*!< in: compiler info on secondary index
				updates */
	que_thr_t*	thr,	/*!< in: query thread */
	trx_id_t	trx_id,	/*!< in: transaction id */
	mtr_t*		mtr)	/*!< in/out: mini-transaction; if this
				is a secondary index, the caller must
				mtr_commit(mtr) before latching any
				further pages */
{
	dict_index_t*	index;
	buf_block_t*	block;
	page_zip_des_t*	page_zip;
	dberr_t		err;
	rec_t*		rec;
	roll_ptr_t	roll_ptr	= 0;
	ulint		was_delete_marked;
	ibool		is_hashed;

	rec = btr_cur_get_rec(cursor);
	index = cursor->index;
	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
	ut_ad(trx_id > 0
	      || (flags & BTR_KEEP_SYS_FLAG)
	      || dict_table_is_intrinsic(index->table));
	/* The insert buffer tree should never be updated in place. */
	ut_ad(!dict_index_is_ibuf(index));
	ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
	      || dict_index_is_clust(index));
	ut_ad(thr_get_trx(thr)->id == trx_id
	      || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
	      == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
		  | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
	ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor)));
	ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);

	DBUG_PRINT("ib_cur", ("update-in-place %s (" IB_ID_FMT
			      ") by " TRX_ID_FMT ": %s",
			      index->name(), index->id, trx_id,
			      rec_printer(rec, offsets).str().c_str()));

	block = btr_cur_get_block(cursor);
	page_zip = buf_block_get_page_zip(block);

	/* Check that enough space is available on the compressed page. */
	if (page_zip) {
		if (!btr_cur_update_alloc_zip(
			    page_zip, btr_cur_get_page_cur(cursor),
			    index, offsets, rec_offs_size(offsets),
			    false, mtr)) {
			return(DB_ZIP_OVERFLOW);
		}

		/* btr_cur_update_alloc_zip() may reorganize the page;
		re-read the record pointer. */
		rec = btr_cur_get_rec(cursor);
	}

	/* Do lock checking and undo logging */
	err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
					update, cmpl_info,
					thr, mtr, &roll_ptr);
	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
		/* We may need to update the IBUF_BITMAP_FREE
		bits after a reorganize that was done in
		btr_cur_update_alloc_zip(). */
		goto func_exit;
	}

	if (!(flags & BTR_KEEP_SYS_FLAG)
	    && !dict_table_is_intrinsic(index->table)) {
		row_upd_rec_sys_fields(rec, NULL, index, offsets,
				       thr_get_trx(thr), roll_ptr);
	}

	was_delete_marked = rec_get_deleted_flag(
		rec, page_is_comp(buf_block_get_frame(block)));

	/* block->index != NULL means the adaptive hash index may
	reference this block; the in-place write below must then be
	protected by the AHI search latch. */
	is_hashed = (block->index != NULL);

	if (is_hashed) {
		/* TO DO: Can we skip this if none of the fields
		index->search_info->curr_n_fields
		are being updated? */

		/* The function row_upd_changes_ord_field_binary works only
		if the update vector was built for a clustered index, we must
		NOT call it if index is secondary */

		if (!dict_index_is_clust(index)
		    || row_upd_changes_ord_field_binary(index, update, thr,
							NULL, NULL)) {

			/* Remove possible hash index pointer to this record */
			btr_search_update_hash_on_delete(cursor);
		}

		rw_lock_x_lock(btr_get_search_latch(index));
	}

	assert_block_ahi_valid(block);
	row_upd_rec_in_place(rec, index, offsets, update, page_zip);

	if (is_hashed) {
		rw_lock_x_unlock(btr_get_search_latch(index));
	}

	btr_cur_update_in_place_log(flags, rec, index, update,
				    trx_id, roll_ptr, mtr);

	if (was_delete_marked
	    && !rec_get_deleted_flag(
		    rec, page_is_comp(buf_block_get_frame(block)))) {
		/* The new updated record owns its possible externally
		stored fields */

		btr_cur_unmark_extern_fields(page_zip,
					     rec, index, offsets, mtr);
	}

	ut_ad(err == DB_SUCCESS);

func_exit:
	if (page_zip
	    && !(flags & BTR_KEEP_IBUF_BITMAP)
	    && !dict_index_is_clust(index)
	    && !dict_table_is_temporary(index->table)
	    && page_is_leaf(buf_block_get_frame(block))) {
		/* Update the free bits in the insert buffer. */
		ibuf_update_free_bits_zip(block, mtr);
	}

	return(err);
}
3906
3907 /*************************************************************//**
3908 Tries to update a record on a page in an index tree. It is assumed that mtr
3909 holds an x-latch on the page. The operation does not succeed if there is too
3910 little space on the page or if the update would result in too empty a page,
3911 so that tree compression is recommended. We assume here that the ordering
3912 fields of the record do not change.
3913 @return error code, including
3914 @retval DB_SUCCESS on success
3915 @retval DB_OVERFLOW if the updated record does not fit
3916 @retval DB_UNDERFLOW if the page would become too empty
3917 @retval DB_ZIP_OVERFLOW if there is not enough space left
3918 on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
3919 dberr_t
btr_cur_optimistic_update(ulint flags,btr_cur_t * cursor,ulint ** offsets,mem_heap_t ** heap,const upd_t * update,ulint cmpl_info,que_thr_t * thr,trx_id_t trx_id,mtr_t * mtr)3920 btr_cur_optimistic_update(
3921 /*======================*/
3922 ulint flags, /*!< in: undo logging and locking flags */
3923 btr_cur_t* cursor, /*!< in: cursor on the record to update;
3924 cursor stays valid and positioned on the
3925 same record */
3926 ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
3927 mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */
3928 const upd_t* update, /*!< in: update vector; this must also
3929 contain trx id and roll ptr fields */
3930 ulint cmpl_info,/*!< in: compiler info on secondary index
3931 updates */
3932 que_thr_t* thr, /*!< in: query thread */
3933 trx_id_t trx_id, /*!< in: transaction id */
3934 mtr_t* mtr) /*!< in/out: mini-transaction; if this
3935 is a secondary index, the caller must
3936 mtr_commit(mtr) before latching any
3937 further pages */
3938 {
3939 dict_index_t* index;
3940 page_cur_t* page_cursor;
3941 dberr_t err;
3942 buf_block_t* block;
3943 page_t* page;
3944 page_zip_des_t* page_zip;
3945 rec_t* rec;
3946 ulint max_size;
3947 ulint new_rec_size;
3948 ulint old_rec_size;
3949 ulint max_ins_size = 0;
3950 dtuple_t* new_entry;
3951 roll_ptr_t roll_ptr;
3952 ulint i;
3953 ulint n_ext;
3954
3955 block = btr_cur_get_block(cursor);
3956 page = buf_block_get_frame(block);
3957 rec = btr_cur_get_rec(cursor);
3958 index = cursor->index;
3959 ut_ad(trx_id > 0
3960 || (flags & BTR_KEEP_SYS_FLAG)
3961 || dict_table_is_intrinsic(index->table));
3962 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
3963 ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
3964 /* This is intended only for leaf page updates */
3965 ut_ad(page_is_leaf(page));
3966 /* The insert buffer tree should never be updated in place. */
3967 ut_ad(!dict_index_is_ibuf(index));
3968 ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
3969 || dict_index_is_clust(index));
3970 ut_ad(thr_get_trx(thr)->id == trx_id
3971 || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
3972 == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
3973 | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
3974 ut_ad(fil_page_index_page_check(page));
3975 ut_ad(btr_page_get_index_id(page) == index->id);
3976
3977 *offsets = rec_get_offsets(rec, index, *offsets,
3978 ULINT_UNDEFINED, heap);
3979 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
3980 ut_a(!rec_offs_any_null_extern(rec, *offsets)
3981 || trx_is_recv(thr_get_trx(thr)));
3982 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
3983
3984 if (!row_upd_changes_field_size_or_external(index, *offsets, update)) {
3985
3986 /* The simplest and the most common case: the update does not
3987 change the size of any field and none of the updated fields is
3988 externally stored in rec or update, and there is enough space
3989 on the compressed page to log the update. */
3990
3991 return(btr_cur_update_in_place(
3992 flags, cursor, *offsets, update,
3993 cmpl_info, thr, trx_id, mtr));
3994 }
3995
3996 if (rec_offs_any_extern(*offsets)) {
3997 any_extern:
3998 /* Externally stored fields are treated in pessimistic
3999 update */
4000
4001 /* prefetch siblings of the leaf for the pessimistic
4002 operation. */
4003 btr_cur_prefetch_siblings(block);
4004
4005 return(DB_OVERFLOW);
4006 }
4007
4008 for (i = 0; i < upd_get_n_fields(update); i++) {
4009 if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
4010
4011 goto any_extern;
4012 }
4013 }
4014
4015 DBUG_PRINT("ib_cur", ("update %s (" IB_ID_FMT ") by " TRX_ID_FMT
4016 ": %s",
4017 index->name(), index->id, trx_id,
4018 rec_printer(rec, *offsets).str().c_str()));
4019
4020 page_cursor = btr_cur_get_page_cur(cursor);
4021
4022 if (!*heap) {
4023 *heap = mem_heap_create(
4024 rec_offs_size(*offsets)
4025 + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
4026 }
4027
4028 new_entry = row_rec_to_index_entry(rec, index, *offsets,
4029 &n_ext, *heap);
4030 /* We checked above that there are no externally stored fields. */
4031 ut_a(!n_ext);
4032
4033 /* The page containing the clustered index record
4034 corresponding to new_entry is latched in mtr.
4035 Thus the following call is safe. */
4036 row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4037 FALSE, *heap);
4038 old_rec_size = rec_offs_size(*offsets);
4039 new_rec_size = rec_get_converted_size(index, new_entry, 0);
4040
4041 page_zip = buf_block_get_page_zip(block);
4042 #ifdef UNIV_ZIP_DEBUG
4043 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4044 #endif /* UNIV_ZIP_DEBUG */
4045
4046 if (page_zip) {
4047 if (!btr_cur_update_alloc_zip(
4048 page_zip, page_cursor, index, *offsets,
4049 new_rec_size, true, mtr)) {
4050 return(DB_ZIP_OVERFLOW);
4051 }
4052
4053 rec = page_cur_get_rec(page_cursor);
4054 }
4055
4056 /* We limit max record size to 16k even for 64k page size. */
4057 if (new_rec_size >= REC_MAX_DATA_SIZE) {
4058 err = DB_OVERFLOW;
4059
4060 goto func_exit;
4061 }
4062
4063 if (UNIV_UNLIKELY(new_rec_size
4064 >= (page_get_free_space_of_empty(page_is_comp(page))
4065 / 2))) {
4066 /* We may need to update the IBUF_BITMAP_FREE
4067 bits after a reorganize that was done in
4068 btr_cur_update_alloc_zip(). */
4069 err = DB_OVERFLOW;
4070 goto func_exit;
4071 }
4072
4073 if (UNIV_UNLIKELY(page_get_data_size(page)
4074 - old_rec_size + new_rec_size
4075 < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) {
4076 /* We may need to update the IBUF_BITMAP_FREE
4077 bits after a reorganize that was done in
4078 btr_cur_update_alloc_zip(). */
4079
4080 /* The page would become too empty */
4081 err = DB_UNDERFLOW;
4082 goto func_exit;
4083 }
4084
4085 /* We do not attempt to reorganize if the page is compressed.
4086 This is because the page may fail to compress after reorganization. */
4087 max_size = page_zip
4088 ? page_get_max_insert_size(page, 1)
4089 : (old_rec_size
4090 + page_get_max_insert_size_after_reorganize(page, 1));
4091
4092 if (!page_zip) {
4093 max_ins_size = page_get_max_insert_size_after_reorganize(
4094 page, 1);
4095 }
4096
4097 if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
4098 && (max_size >= new_rec_size))
4099 || (page_get_n_recs(page) <= 1))) {
4100
4101 /* We may need to update the IBUF_BITMAP_FREE
4102 bits after a reorganize that was done in
4103 btr_cur_update_alloc_zip(). */
4104
4105 /* There was not enough space, or it did not pay to
4106 reorganize: for simplicity, we decide what to do assuming a
4107 reorganization is needed, though it might not be necessary */
4108
4109 err = DB_OVERFLOW;
4110 goto func_exit;
4111 }
4112
4113 /* Do lock checking and undo logging */
4114 err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4115 update, cmpl_info,
4116 thr, mtr, &roll_ptr);
4117 if (err != DB_SUCCESS) {
4118 /* We may need to update the IBUF_BITMAP_FREE
4119 bits after a reorganize that was done in
4120 btr_cur_update_alloc_zip(). */
4121 goto func_exit;
4122 }
4123
4124 /* Ok, we may do the replacement. Store on the page infimum the
4125 explicit locks on rec, before deleting rec (see the comment in
4126 btr_cur_pessimistic_update). */
4127 if (!dict_table_is_locking_disabled(index->table)) {
4128 lock_rec_store_on_page_infimum(block, rec);
4129 }
4130
4131 btr_search_update_hash_on_delete(cursor);
4132
4133 page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4134
4135 page_cur_move_to_prev(page_cursor);
4136
4137 if (!(flags & BTR_KEEP_SYS_FLAG)
4138 && !dict_table_is_intrinsic(index->table)) {
4139 row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
4140 roll_ptr);
4141 row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
4142 trx_id);
4143 }
4144
4145 /* There are no externally stored columns in new_entry */
4146 rec = btr_cur_insert_if_possible(
4147 cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
4148 ut_a(rec); /* <- We calculated above the insert would fit */
4149
4150 /* Restore the old explicit lock state on the record */
4151 if (!dict_table_is_locking_disabled(index->table)) {
4152 lock_rec_restore_from_page_infimum(block, rec, block);
4153 }
4154
4155 page_cur_move_to_next(page_cursor);
4156 ut_ad(err == DB_SUCCESS);
4157
4158 func_exit:
4159 if (!(flags & BTR_KEEP_IBUF_BITMAP)
4160 && !dict_index_is_clust(index)
4161 && !dict_table_is_temporary(index->table)) {
4162 /* Update the free bits in the insert buffer. */
4163 if (page_zip) {
4164 ibuf_update_free_bits_zip(block, mtr);
4165 } else {
4166 ibuf_update_free_bits_low(block, max_ins_size, mtr);
4167 }
4168 }
4169
4170 if (err != DB_SUCCESS) {
4171 /* prefetch siblings of the leaf for the pessimistic
4172 operation. */
4173 btr_cur_prefetch_siblings(block);
4174 }
4175
4176 return(err);
4177 }
4178
4179 /*************************************************************//**
4180 If, in a split, a new supremum record was created as the predecessor of the
4181 updated record, the supremum record must inherit exactly the locks on the
4182 updated record. In the split it may have inherited locks from the successor
4183 of the updated record, which is not correct. This function restores the
4184 right locks for the new supremum. */
4185 static
4186 void
btr_cur_pess_upd_restore_supremum(buf_block_t * block,const rec_t * rec,mtr_t * mtr)4187 btr_cur_pess_upd_restore_supremum(
4188 /*==============================*/
4189 buf_block_t* block, /*!< in: buffer block of rec */
4190 const rec_t* rec, /*!< in: updated record */
4191 mtr_t* mtr) /*!< in: mtr */
4192 {
4193 page_t* page;
4194 buf_block_t* prev_block;
4195
4196 page = buf_block_get_frame(block);
4197
4198 if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
4199 /* Updated record is not the first user record on its page */
4200
4201 return;
4202 }
4203
/* rec is the first user record on its page, so the supremum of the
previous page immediately precedes rec in key order: that supremum must
carry exactly rec's locks. */
4204 const ulint prev_page_no = btr_page_get_prev(page, mtr);
4205
4206 const page_id_t page_id(block->page.id.space(), prev_page_no);
4207
4208 ut_ad(prev_page_no != FIL_NULL);
/* Fetching with no latch is OK here because the mtr must already hold
an x-latch on prev_block; this is asserted below. */
4209 prev_block = buf_page_get_with_no_latch(page_id, block->page.size, mtr);
4210 #ifdef UNIV_BTR_DEBUG
4211 ut_a(btr_page_get_next(prev_block->frame, mtr)
4212 == page_get_page_no(page));
4213 #endif /* UNIV_BTR_DEBUG */
4214
4215 /* We must already have an x-latch on prev_block! */
4216 ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
4217
/* Reset the supremum's locks and make it inherit the gap locks from
rec, discarding anything it may have wrongly inherited from rec's
successor during the split. */
4218 lock_rec_reset_and_inherit_gap_locks(prev_block, block,
4219 PAGE_HEAP_NO_SUPREMUM,
4220 page_rec_get_heap_no(rec));
4221 }
4222
4223 /*************************************************************//**
4224 Performs an update of a record on a page of a tree. It is assumed
4225 that mtr holds an x-latch on the tree and on the cursor page. If the
4226 update is made on the leaf level, to avoid deadlocks, mtr must also
4227 own x-latches to brothers of page, if those brothers exist. We assume
4228 here that the ordering fields of the record do not change.
4229 @return DB_SUCCESS or error code */
4230 dberr_t
btr_cur_pessimistic_update(ulint flags,btr_cur_t * cursor,ulint ** offsets,mem_heap_t ** offsets_heap,mem_heap_t * entry_heap,big_rec_t ** big_rec,upd_t * update,ulint cmpl_info,que_thr_t * thr,trx_id_t trx_id,mtr_t * mtr)4231 btr_cur_pessimistic_update(
4232 /*=======================*/
4233 ulint flags, /*!< in: undo logging, locking, and rollback
4234 flags */
4235 btr_cur_t* cursor, /*!< in/out: cursor on the record to update;
4236 cursor may become invalid if *big_rec == NULL
4237 || !(flags & BTR_KEEP_POS_FLAG) */
4238 ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
4239 mem_heap_t** offsets_heap,
4240 /*!< in/out: pointer to memory heap
4241 that can be emptied, or NULL */
4242 mem_heap_t* entry_heap,
4243 /*!< in/out: memory heap for allocating
4244 big_rec and the index tuple */
4245 big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
4246 be stored externally by the caller, or NULL */
4247 upd_t* update, /*!< in/out: update vector; this is allowed to
4248 also contain trx id and roll ptr fields.
4249 Non-updated columns that are moved offpage will
4250 be appended to this. */
4251 ulint cmpl_info,/*!< in: compiler info on secondary index
4252 updates */
4253 que_thr_t* thr, /*!< in: query thread */
4254 trx_id_t trx_id, /*!< in: transaction id */
4255 mtr_t* mtr) /*!< in/out: mini-transaction; must be
4256 committed before latching any further pages */
4257 {
4258 big_rec_t* big_rec_vec = NULL;
4259 big_rec_t* dummy_big_rec;
4260 dict_index_t* index;
4261 buf_block_t* block;
4262 page_t* page;
4263 page_zip_des_t* page_zip;
4264 rec_t* rec;
4265 page_cur_t* page_cursor;
4266 dberr_t err;
4267 dberr_t optim_err;
4268 roll_ptr_t roll_ptr;
4269 ibool was_first;
4270 ulint n_reserved = 0;
4271 ulint n_ext;
4272 ulint max_ins_size = 0;
4273
4274 *offsets = NULL;
4275 *big_rec = NULL;
4276
4277 block = btr_cur_get_block(cursor);
4278 page = buf_block_get_frame(block);
4279 page_zip = buf_block_get_page_zip(block);
4280 index = cursor->index;
4281
/* The caller must hold the index lock (X or SX) and an x-latch on the
cursor page; intrinsic tables are exempt from the index-lock rule. */
4282 ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
4283 MTR_MEMO_X_LOCK |
4284 MTR_MEMO_SX_LOCK)
4285 || dict_table_is_intrinsic(index->table));
4286 ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
4287 #ifdef UNIV_ZIP_DEBUG
4288 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4289 #endif /* UNIV_ZIP_DEBUG */
4290 /* The insert buffer tree should never be updated in place. */
4291 ut_ad(!dict_index_is_ibuf(index));
4292 ut_ad(trx_id > 0
4293 || (flags & BTR_KEEP_SYS_FLAG)
4294 || dict_table_is_intrinsic(index->table));
4295 ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4296 || dict_index_is_clust(index));
4297 ut_ad(thr_get_trx(thr)->id == trx_id
4298 || (flags & ~BTR_KEEP_POS_FLAG)
4299 == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4300 | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4301
/* First try an optimistic (in-page) update. BTR_KEEP_IBUF_BITMAP
suppresses the IBUF_BITMAP_FREE update inside the optimistic path;
the bitmap is brought up to date in this function instead, once the
final outcome is known. */
4302 err = optim_err = btr_cur_optimistic_update(
4303 flags | BTR_KEEP_IBUF_BITMAP,
4304 cursor, offsets, offsets_heap, update,
4305 cmpl_info, thr, trx_id, mtr);
4306
4307 switch (err) {
4308 case DB_ZIP_OVERFLOW:
4309 case DB_UNDERFLOW:
4310 case DB_OVERFLOW:
/* The update does not fit in place; fall through to the
pessimistic delete-and-reinsert path below. */
4311 break;
4312 default:
4313 err_exit:
4314 /* We suppressed this with BTR_KEEP_IBUF_BITMAP.
4315 For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
4316 already reset by btr_cur_update_alloc_zip() if the
4317 page was recompressed. */
4318 if (page_zip
4319 && optim_err != DB_ZIP_OVERFLOW
4320 && !dict_index_is_clust(index)
4321 && !dict_table_is_temporary(index->table)
4322 && page_is_leaf(page)) {
4323 ibuf_update_free_bits_zip(block, mtr);
4324 }
4325
4326 if (big_rec_vec != NULL) {
4327 dtuple_big_rec_free(big_rec_vec);
4328 }
4329
4330 return(err);
4331 }
4332
/* Rebuild the record as an index entry and apply the update vector to
the entry; the updated entry will be reinserted below. */
4333 rec = btr_cur_get_rec(cursor);
4334
4335 *offsets = rec_get_offsets(
4336 rec, index, *offsets, ULINT_UNDEFINED, offsets_heap);
4337
4338 dtuple_t* new_entry = row_rec_to_index_entry(
4339 rec, index, *offsets, &n_ext, entry_heap);
4340
4341 /* The page containing the clustered index record
4342 corresponding to new_entry is latched in mtr. If the
4343 clustered index record is delete-marked, then its externally
4344 stored fields cannot have been purged yet, because then the
4345 purge would also have removed the clustered index record
4346 itself. Thus the following call is safe. */
4347 row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4348 FALSE, entry_heap);
4349
4350 /* We have to set appropriate extern storage bits in the new
4351 record to be inserted: we have to remember which fields were such */
4352
4353 ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
4354 ut_ad(rec_offs_validate(rec, index, *offsets));
4355
4356 /* Get number of externally stored columns in updated record */
4357 n_ext = new_entry->get_n_ext();
4358
4359 /* UNDO logging is also turned-off during normal operation on intrinsic
4360 table so condition needs to ensure that table is not intrinsic. */
4361 if ((flags & BTR_NO_UNDO_LOG_FLAG)
4362 && rec_offs_any_extern(*offsets)
4363 && !dict_table_is_intrinsic(index->table)) {
4364 /* We are in a transaction rollback undoing a row
4365 update: we must free possible externally stored fields
4366 which got new values in the update, if they are not
4367 inherited values. They can be inherited if we have
4368 updated the primary key to another value, and then
4369 update it back again. */
4370
4371 ut_ad(big_rec_vec == NULL);
4372 ut_ad(dict_index_is_clust(index));
4373 ut_ad(thr_get_trx(thr)->in_rollback);
4374
4375 DBUG_EXECUTE_IF("ib_blob_update_rollback", DBUG_SUICIDE(););
4376 RECOVERY_CRASH(99);
4377
4378 btr_rec_free_updated_extern_fields(
4379 index, rec, page_zip, *offsets, update, true, mtr);
4380 }
4381
/* If the updated entry would be too large for the page, move the
longest fields off-page into big_rec_vec; the caller stores them
externally after this function returns. */
4382 if (page_zip_rec_needs_ext(
4383 rec_get_converted_size(index, new_entry, n_ext),
4384 page_is_comp(page),
4385 dict_index_get_n_fields(index),
4386 block->page.size)) {
4387
4388 big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext);
4389 if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
4390
4391 /* We cannot goto return_after_reservations,
4392 because we may need to update the
4393 IBUF_BITMAP_FREE bits, which was suppressed by
4394 BTR_KEEP_IBUF_BITMAP. */
4395 #ifdef UNIV_ZIP_DEBUG
4396 ut_a(!page_zip
4397 || page_zip_validate(page_zip, page, index));
4398 #endif /* UNIV_ZIP_DEBUG */
4399 if (n_reserved > 0) {
4400 fil_space_release_free_extents(
4401 index->space, n_reserved);
4402 }
4403
4404 err = DB_TOO_BIG_RECORD;
4405 goto err_exit;
4406 }
4407
4408 ut_ad(page_is_leaf(page));
4409 ut_ad(dict_index_is_clust(index));
4410 ut_ad(flags & BTR_KEEP_POS_FLAG);
4411 }
4412
4413 /* Do lock checking and undo logging */
4414 err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4415 update, cmpl_info,
4416 thr, mtr, &roll_ptr);
4417 if (err != DB_SUCCESS) {
4418 goto err_exit;
4419 }
4420
4421 if (optim_err == DB_OVERFLOW) {
4422
4423 /* First reserve enough free space for the file segments
4424 of the index tree, so that the update will not fail because
4425 of lack of space */
4426
4427 ulint n_extents = cursor->tree_height / 16 + 3;
4428
4429 if (!fsp_reserve_free_extents(
4430 &n_reserved, index->space, n_extents,
4431 flags & BTR_NO_UNDO_LOG_FLAG
4432 ? FSP_CLEANING : FSP_NORMAL,
4433 mtr)) {
4434 err = DB_OUT_OF_FILE_SPACE;
4435 goto err_exit;
4436 }
4437 }
4438
/* Fill in the system columns (DATA_TRX_ID, DATA_ROLL_PTR) unless the
caller asked to keep them or the table is intrinsic (no undo). */
4439 if (!(flags & BTR_KEEP_SYS_FLAG)
4440 && !dict_table_is_intrinsic(index->table)) {
4441 row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
4442 roll_ptr);
4443 row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
4444 trx_id);
4445 }
4446
/* Remember the max insert size before the delete so that the ibuf
free bits can be updated correctly on the success path below. */
4447 if (!page_zip) {
4448 max_ins_size = page_get_max_insert_size_after_reorganize(
4449 page, 1);
4450 }
4451
4452 /* Store state of explicit locks on rec on the page infimum record,
4453 before deleting rec. The page infimum acts as a dummy carrier of the
4454 locks, taking care also of lock releases, before we can move the locks
4455 back on the actual record. There is a special case: if we are
4456 inserting on the root page and the insert causes a call of
4457 btr_root_raise_and_insert. Therefore we cannot in the lock system
4458 delete the lock structs set on the root page even if the root
4459 page carries just node pointers. */
4460 if (!dict_table_is_locking_disabled(index->table)) {
4461 lock_rec_store_on_page_infimum(block, rec);
4462 }
4463
4464 btr_search_update_hash_on_delete(cursor);
4465
4466 #ifdef UNIV_ZIP_DEBUG
4467 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4468 #endif /* UNIV_ZIP_DEBUG */
4469 page_cursor = btr_cur_get_page_cur(cursor);
4470
/* Delete the old record and try to reinsert the updated entry on the
same page without splitting. */
4471 page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4472
4473 page_cur_move_to_prev(page_cursor);
4474
4475 rec = btr_cur_insert_if_possible(cursor, new_entry,
4476 offsets, offsets_heap, n_ext, mtr);
4477
4478 if (rec) {
4479 page_cursor->rec = rec;
4480
4481 if (!dict_table_is_locking_disabled(index->table)) {
4482 lock_rec_restore_from_page_infimum(
4483 btr_cur_get_block(cursor), rec, block);
4484 }
4485
4486 if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
4487 /* The new inserted record owns its possible externally
4488 stored fields */
4489 btr_cur_unmark_extern_fields(
4490 page_zip, rec, index, *offsets, mtr);
4491 }
4492
/* If the cursor position must survive (the caller will store
big_rec), the offsets must be revalidated after a compress. */
4493 bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
4494
4495 if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
4496 if (adjust) {
4497 rec_offs_make_valid(
4498 page_cursor->rec, index, *offsets);
4499 }
4500 } else if (!dict_index_is_clust(index)
4501 && !dict_table_is_temporary(index->table)
4502 && page_is_leaf(page)) {
4503 /* Update the free bits in the insert buffer.
4504 This is the same block which was skipped by
4505 BTR_KEEP_IBUF_BITMAP. */
4506 if (page_zip) {
4507 ibuf_update_free_bits_zip(block, mtr);
4508 } else {
4509 ibuf_update_free_bits_low(block, max_ins_size,
4510 mtr);
4511 }
4512 }
4513
4514 if (!srv_read_only_mode
4515 && !big_rec_vec
4516 && page_is_leaf(page)
4517 && !dict_index_is_online_ddl(index)) {
4518
4519 mtr_memo_release(mtr, dict_index_get_lock(index),
4520 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
4521
4522 /* NOTE: We cannot release root block latch here, because it
4523 has segment header and already modified in most of cases.*/
4524 }
4525
4526 err = DB_SUCCESS;
4527 goto return_after_reservations;
4528 } else {
4529 /* If the page is compressed and it initially
4530 compresses very well, and there is a subsequent insert
4531 of a badly-compressing record, it is possible for
4532 btr_cur_optimistic_update() to return DB_UNDERFLOW and
4533 btr_cur_insert_if_possible() to return FALSE. */
4534 ut_a(page_zip || optim_err != DB_UNDERFLOW);
4535
4536 /* Out of space: reset the free bits.
4537 This is the same block which was skipped by
4538 BTR_KEEP_IBUF_BITMAP. */
4539 if (!dict_index_is_clust(index)
4540 && !dict_table_is_temporary(index->table)
4541 && page_is_leaf(page)) {
4542 ibuf_reset_free_bits(block);
4543 }
4544 }
4545
4546 if (big_rec_vec != NULL && !dict_table_is_intrinsic(index->table)) {
4547 ut_ad(page_is_leaf(page));
4548 ut_ad(dict_index_is_clust(index));
4549 ut_ad(flags & BTR_KEEP_POS_FLAG);
4550
4551 /* btr_page_split_and_insert() in
4552 btr_cur_pessimistic_insert() invokes
4553 mtr_memo_release(mtr, index->lock, MTR_MEMO_SX_LOCK).
4554 We must keep the index->lock when we created a
4555 big_rec, so that row_upd_clust_rec() can store the
4556 big_rec in the same mini-transaction. */
4557
4558 ut_ad(mtr_memo_contains_flagged(mtr,
4559 dict_index_get_lock(index),
4560 MTR_MEMO_X_LOCK |
4561 MTR_MEMO_SX_LOCK));
4562
4563 mtr_sx_lock(dict_index_get_lock(index), mtr);
4564 }
4565
4566 /* Was the record to be updated positioned as the first user
4567 record on its page? */
4568 was_first = page_cur_is_before_first(page_cursor);
4569
4570 /* Lock checks and undo logging were already performed by
4571 btr_cur_upd_lock_and_undo(). We do not try
4572 btr_cur_optimistic_insert() because
4573 btr_cur_insert_if_possible() already failed above. */
4574
4575 err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
4576 | BTR_NO_LOCKING_FLAG
4577 | BTR_KEEP_SYS_FLAG,
4578 cursor, offsets, offsets_heap,
4579 new_entry, &rec,
4580 &dummy_big_rec, n_ext, NULL, mtr);
4581 ut_a(rec);
4582 ut_a(err == DB_SUCCESS);
4583 ut_a(dummy_big_rec == NULL);
4584 ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
4585 page_cursor->rec = rec;
4586
4587 /* Multiple transactions cannot simultaneously operate on the
4588 same temp-table in parallel.
4589 max_trx_id is ignored for temp tables because it not required
4590 for MVCC. */
4591 if (dict_index_is_sec_or_ibuf(index)
4592 && !dict_table_is_temporary(index->table)) {
4593 /* Update PAGE_MAX_TRX_ID in the index page header.
4594 It was not updated by btr_cur_pessimistic_insert()
4595 because of BTR_NO_LOCKING_FLAG. */
4596 buf_block_t* rec_block;
4597
4598 rec_block = btr_cur_get_block(cursor);
4599
4600 page_update_max_trx_id(rec_block,
4601 buf_block_get_page_zip(rec_block),
4602 trx_id, mtr);
4603 }
4604
4605 if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
4606 /* The new inserted record owns its possible externally
4607 stored fields */
4608 buf_block_t* rec_block = btr_cur_get_block(cursor);
4609
4610 #ifdef UNIV_ZIP_DEBUG
4611 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4612 page = buf_block_get_frame(rec_block);
4613 #endif /* UNIV_ZIP_DEBUG */
/* The insert may have landed on a different block after the
split; refresh page_zip from the block actually holding rec. */
4614 page_zip = buf_block_get_page_zip(rec_block);
4615
4616 btr_cur_unmark_extern_fields(page_zip,
4617 rec, index, *offsets, mtr);
4618 }
4619
4620 if (!dict_table_is_locking_disabled(index->table)) {
4621 lock_rec_restore_from_page_infimum(
4622 btr_cur_get_block(cursor), rec, block);
4623 }
4624
4625 /* If necessary, restore also the correct lock state for a new,
4626 preceding supremum record created in a page split. While the old
4627 record was nonexistent, the supremum might have inherited its locks
4628 from a wrong record. */
4629
4630 if (!was_first && !dict_table_is_locking_disabled(index->table)) {
4631 btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
4632 rec, mtr);
4633 }
4634
4635 return_after_reservations:
4636 #ifdef UNIV_ZIP_DEBUG
4637 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4638 #endif /* UNIV_ZIP_DEBUG */
4639
4640 if (n_reserved > 0) {
4641 fil_space_release_free_extents(index->space, n_reserved);
4642 }
4643
4644 *big_rec = big_rec_vec;
4645
4646 return(err);
4647 }
4648
4649 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
4650
4651 /****************************************************************//**
4652 Writes the redo log record for delete marking or unmarking of an index
4653 record. */
4654 UNIV_INLINE
4655 void
btr_cur_del_mark_set_clust_rec_log(rec_t * rec,dict_index_t * index,trx_id_t trx_id,roll_ptr_t roll_ptr,mtr_t * mtr)4656 btr_cur_del_mark_set_clust_rec_log(
4657 /*===============================*/
4658 rec_t* rec, /*!< in: record */
4659 dict_index_t* index, /*!< in: index of the record */
4660 trx_id_t trx_id, /*!< in: transaction id */
4661 roll_ptr_t roll_ptr,/*!< in: roll ptr to the undo log record */
4662 mtr_t* mtr) /*!< in: mtr */
4663 {
4664 byte* log_ptr;
4665
4666 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4667 ut_ad(mtr->is_named_space(index->space));
4668
/* Open a log record with enough room for: 1 byte flags, 1 byte value,
the system fields (roll ptr + compressed pos/trx id, up to 14 bytes),
and the 2-byte page offset of rec. This layout must match
btr_cur_parse_del_mark_set_clust_rec(). */
4669 log_ptr = mlog_open_and_write_index(mtr, rec, index,
4670 page_rec_is_comp(rec)
4671 ? MLOG_COMP_REC_CLUST_DELETE_MARK
4672 : MLOG_REC_CLUST_DELETE_MARK,
4673 1 + 1 + DATA_ROLL_PTR_LEN
4674 + 14 + 2);
4675
4676 if (!log_ptr) {
4677 /* Logging in mtr is switched off during crash recovery */
4678 return;
4679 }
4680
/* flags = 0 (sys fields follow) and value = 1 (set the delete mark);
read back as "flags" and "val" by the parse function. */
4681 *log_ptr++ = 0;
4682 *log_ptr++ = 1;
4683
4684 log_ptr = row_upd_write_sys_vals_to_log(
4685 index, trx_id, roll_ptr, log_ptr, mtr);
4686 mach_write_to_2(log_ptr, page_offset(rec));
4687 log_ptr += 2;
4688
4689 mlog_close(mtr, log_ptr);
4690 }
4691 #endif /* !UNIV_HOTBACKUP */
4692
4693 /****************************************************************//**
4694 Parses the redo log record for delete marking or unmarking of a clustered
4695 index record.
4696 @return end of log record or NULL */
4697 byte*
btr_cur_parse_del_mark_set_clust_rec(byte * ptr,byte * end_ptr,page_t * page,page_zip_des_t * page_zip,dict_index_t * index)4698 btr_cur_parse_del_mark_set_clust_rec(
4699 /*=================================*/
4700 byte* ptr, /*!< in: buffer */
4701 byte* end_ptr,/*!< in: buffer end */
4702 page_t* page, /*!< in/out: page or NULL */
4703 page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
4704 dict_index_t* index) /*!< in: index corresponding to page */
4705 {
4706 ulint flags;
4707 ulint val;
4708 ulint pos;
4709 trx_id_t trx_id;
4710 roll_ptr_t roll_ptr;
4711 ulint offset;
4712 rec_t* rec;
4713
4714 ut_ad(!page
4715 || !!page_is_comp(page) == dict_table_is_comp(index->table));
4716
/* The record layout, written by btr_cur_del_mark_set_clust_rec_log():
1 byte flags, 1 byte delete-mark value, the system-field values, and a
2-byte page offset. Returning NULL means the record is incomplete and
more log must be read. */
4717 if (end_ptr < ptr + 2) {
4718
4719 return(NULL);
4720 }
4721
4722 flags = mach_read_from_1(ptr);
4723 ptr++;
4724 val = mach_read_from_1(ptr);
4725 ptr++;
4726
4727 ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
4728
4729 if (ptr == NULL) {
4730
4731 return(NULL);
4732 }
4733
4734 if (end_ptr < ptr + 2) {
4735
4736 return(NULL);
4737 }
4738
4739 offset = mach_read_from_2(ptr);
4740 ptr += 2;
4741
4742 ut_a(offset <= UNIV_PAGE_SIZE);
4743
/* page == NULL means we are only parsing (scanning) the log; apply
the change only when the page is supplied. */
4744 if (page) {
4745 rec = page + offset;
4746
4747 /* We do not need to reserve search latch, as the page
4748 is only being recovered, and there cannot be a hash index to
4749 it. Besides, these fields are being updated in place
4750 and the adaptive hash index does not depend on them. */
4751
4752 btr_rec_set_deleted_flag(rec, page_zip, val);
4753
4754 if (!(flags & BTR_KEEP_SYS_FLAG)) {
/* Restore DATA_TRX_ID / DATA_ROLL_PTR in the record
from the values carried by the log record. */
4755 mem_heap_t* heap = NULL;
4756 ulint offsets_[REC_OFFS_NORMAL_SIZE];
4757 rec_offs_init(offsets_);
4758
4759 row_upd_rec_sys_fields_in_recovery(
4760 rec, page_zip,
4761 rec_get_offsets(rec, index, offsets_,
4762 ULINT_UNDEFINED, &heap),
4763 pos, trx_id, roll_ptr);
4764 if (UNIV_LIKELY_NULL(heap)) {
4765 mem_heap_free(heap);
4766 }
4767 }
4768 }
4769
4770 return(ptr);
4771 }
4772
4773 #ifndef UNIV_HOTBACKUP
4774 /***********************************************************//**
4775 Marks a clustered index record deleted. Writes an undo log record to
4776 undo log on this delete marking. Writes in the trx id field the id
4777 of the deleting transaction, and in the roll ptr field pointer to the
4778 undo log record created.
4779 @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
4780 dberr_t
btr_cur_del_mark_set_clust_rec(ulint flags,buf_block_t * block,rec_t * rec,dict_index_t * index,const ulint * offsets,que_thr_t * thr,const dtuple_t * entry,mtr_t * mtr)4781 btr_cur_del_mark_set_clust_rec(
4782 /*===========================*/
4783 ulint flags, /*!< in: undo logging and locking flags */
4784 buf_block_t* block, /*!< in/out: buffer block of the record */
4785 rec_t* rec, /*!< in/out: record */
4786 dict_index_t* index, /*!< in: clustered index of the record */
4787 const ulint* offsets,/*!< in: rec_get_offsets(rec) */
4788 que_thr_t* thr, /*!< in: query thread */
4789 const dtuple_t* entry, /*!< in: dtuple for the deleting record, also
4790 contains the virtual cols if there are any */
4791 mtr_t* mtr) /*!< in/out: mini-transaction */
4792 {
4793 roll_ptr_t roll_ptr;
4794 dberr_t err;
4795 page_zip_des_t* page_zip;
4796 trx_t* trx;
4797
4798 ut_ad(dict_index_is_clust(index));
4799 ut_ad(rec_offs_validate(rec, index, offsets));
4800 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4801 ut_ad(buf_block_get_frame(block) == page_align(rec));
4802 ut_ad(page_is_leaf(page_align(rec)));
4803 ut_ad(mtr->is_named_space(index->space));
4804
/* If the record is already delete-marked, there is nothing to do;
this can happen with cascading delete operations. */
4805 if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
4806 /* While cascading delete operations, this becomes possible. */
4807 #ifdef WITH_WSREP
4808 // following may assert with wsrep
4809 #endif
4810 ut_ad(rec_get_trx_id(rec, index) == thr_get_trx(thr)->id);
4811 return(DB_SUCCESS);
4812 }
4813
/* Note: BTR_NO_LOCKING_FLAG is passed unconditionally here, instead
of the caller-supplied flags. */
4814 err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
4815 rec, index, offsets, thr);
4816
4817 if (err != DB_SUCCESS) {
4818
4819 return(err);
4820 }
4821
/* Write an undo log record for this delete-mark and obtain the roll
pointer that will be stored into the record's system fields. */
4822 err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
4823 index, entry, NULL, 0, rec, offsets,
4824 &roll_ptr);
4825 if (err != DB_SUCCESS) {
4826
4827 return(err);
4828 }
4829
4830 /* The search latch is not needed here, because
4831 the adaptive hash index does not depend on the delete-mark
4832 and the delete-mark is being updated in place. */
4833
4834 page_zip = buf_block_get_page_zip(block);
4835
4836 btr_rec_set_deleted_flag(rec, page_zip, TRUE);
4837
4838 /* For intrinsic table, roll-ptr is not maintained as there is no UNDO
4839 logging. Skip updating it. */
4840 if (dict_table_is_intrinsic(index->table)) {
4841 return(err);
4842 }
4843
4844 trx = thr_get_trx(thr);
4845 /* This function must not be invoked during rollback
4846 (of a TRX_STATE_PREPARE transaction or otherwise). */
4847 ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
4848 ut_ad(!trx->in_rollback);
4849
4850 DBUG_PRINT("ib_cur", ("delete-mark clust %s (" IB_ID_FMT
4851 ") by " TRX_ID_FMT ": %s",
4852 index->table_name, index->id,
4853 trx_get_id_for_print(trx),
4854 rec_printer(rec, offsets).str().c_str()));
4855
4856 if (dict_index_is_online_ddl(index)) {
4857 row_log_table_delete(rec, entry, index, offsets, NULL);
4858 }
4859
/* Stamp DATA_TRX_ID and DATA_ROLL_PTR into the record and write the
redo log record for this delete-mark. */
4860 row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr);
4861
4862 btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id,
4863 roll_ptr, mtr);
4864
4865 return(err);
4866 }
4867
4868 /****************************************************************//**
4869 Writes the redo log record for a delete mark setting of a secondary
4870 index record. */
4871 UNIV_INLINE
4872 void
btr_cur_del_mark_set_sec_rec_log(rec_t * rec,ibool val,mtr_t * mtr)4873 btr_cur_del_mark_set_sec_rec_log(
4874 /*=============================*/
4875 rec_t* rec, /*!< in: record */
4876 ibool val, /*!< in: value to set */
4877 mtr_t* mtr) /*!< in: mtr */
4878 {
4879 byte* log_ptr;
4880 ut_ad(val <= 1);
4881
/* Reserve room for: initial log record header (up to 11 bytes),
1 byte delete-mark value, 2-byte page offset of rec. This layout must
match btr_cur_parse_del_mark_set_sec_rec(). */
4882 log_ptr = mlog_open(mtr, 11 + 1 + 2);
4883
4884 if (!log_ptr) {
4885 /* Logging in mtr is switched off during crash recovery:
4886 in that case mlog_open returns NULL */
4887 return;
4888 }
4889
4890 log_ptr = mlog_write_initial_log_record_fast(
4891 rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
4892 mach_write_to_1(log_ptr, val);
4893 log_ptr++;
4894
4895 mach_write_to_2(log_ptr, page_offset(rec));
4896 log_ptr += 2;
4897
4898 mlog_close(mtr, log_ptr);
4899 }
4900 #endif /* !UNIV_HOTBACKUP */
4901
4902 /****************************************************************//**
4903 Parses the redo log record for delete marking or unmarking of a secondary
4904 index record.
4905 @return end of log record or NULL */
4906 byte*
btr_cur_parse_del_mark_set_sec_rec(byte * ptr,byte * end_ptr,page_t * page,page_zip_des_t * page_zip)4907 btr_cur_parse_del_mark_set_sec_rec(
4908 /*===============================*/
4909 byte* ptr, /*!< in: buffer */
4910 byte* end_ptr,/*!< in: buffer end */
4911 page_t* page, /*!< in/out: page or NULL */
4912 page_zip_des_t* page_zip)/*!< in/out: compressed page, or NULL */
4913 {
4914 ulint val;
4915 ulint offset;
4916 rec_t* rec;
4917
4918 if (end_ptr < ptr + 3) {
4919
4920 return(NULL);
4921 }
4922
4923 val = mach_read_from_1(ptr);
4924 ptr++;
4925
4926 offset = mach_read_from_2(ptr);
4927 ptr += 2;
4928
4929 ut_a(offset <= UNIV_PAGE_SIZE);
4930
4931 if (page) {
4932 rec = page + offset;
4933
4934 /* We do not need to reserve search latch, as the page
4935 is only being recovered, and there cannot be a hash index to
4936 it. Besides, the delete-mark flag is being updated in place
4937 and the adaptive hash index does not depend on it. */
4938
4939 btr_rec_set_deleted_flag(rec, page_zip, val);
4940 }
4941
4942 return(ptr);
4943 }
4944
4945 #ifndef UNIV_HOTBACKUP
4946 /***********************************************************//**
4947 Sets a secondary index record delete mark to TRUE or FALSE.
4948 @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
4949 dberr_t
btr_cur_del_mark_set_sec_rec(ulint flags,btr_cur_t * cursor,ibool val,que_thr_t * thr,mtr_t * mtr)4950 btr_cur_del_mark_set_sec_rec(
4951 /*=========================*/
4952 ulint flags, /*!< in: locking flag */
4953 btr_cur_t* cursor, /*!< in: cursor */
4954 ibool val, /*!< in: value to set */
4955 que_thr_t* thr, /*!< in: query thread */
4956 mtr_t* mtr) /*!< in/out: mini-transaction */
4957 {
4958 buf_block_t* block;
4959 rec_t* rec;
4960 dberr_t err;
4961
4962 block = btr_cur_get_block(cursor);
4963 rec = btr_cur_get_rec(cursor);
4964
/* Perform the lock check for modifying a secondary index record;
`flags` is forwarded unchanged here (unlike the clustered-index
variant above). */
4965 err = lock_sec_rec_modify_check_and_lock(flags,
4966 btr_cur_get_block(cursor),
4967 rec, cursor->index, thr, mtr);
4968 if (err != DB_SUCCESS) {
4969
4970 return(err);
4971 }
4972
4973 ut_ad(!!page_rec_is_comp(rec)
4974 == dict_table_is_comp(cursor->index->table));
4975
4976 DBUG_PRINT("ib_cur", ("delete-mark=%u sec %u:%u:%u in %s("
4977 IB_ID_FMT ") by " TRX_ID_FMT,
4978 unsigned(val),
4979 block->page.id.space(), block->page.id.page_no(),
4980 unsigned(page_rec_get_heap_no(rec)),
4981 cursor->index->name(), cursor->index->id,
4982 trx_get_id_for_print(thr_get_trx(thr))));
4983
4984 /* We do not need to reserve search latch, as the
4985 delete-mark flag is being updated in place and the adaptive
4986 hash index does not depend on it. */
4987 btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);
4988
/* Secondary index delete-marks carry no system fields: only the
mark itself and the record offset are redo-logged. */
4989 btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
4990
4991 return(DB_SUCCESS);
4992 }
4993
4994 /***********************************************************//**
4995 Sets a secondary index record's delete mark to the given value. This
4996 function is only used by the insert buffer merge mechanism. */
4997 void
btr_cur_set_deleted_flag_for_ibuf(rec_t * rec,page_zip_des_t * page_zip,ibool val,mtr_t * mtr)4998 btr_cur_set_deleted_flag_for_ibuf(
4999 /*==============================*/
5000 rec_t* rec, /*!< in/out: record */
5001 page_zip_des_t* page_zip, /*!< in/out: compressed page
5002 corresponding to rec, or NULL
5003 when the tablespace is
5004 uncompressed */
5005 ibool val, /*!< in: value to set */
5006 mtr_t* mtr) /*!< in/out: mini-transaction */
5007 {
5008 /* We do not need to reserve search latch, as the page
5009 has just been read to the buffer pool and there cannot be
5010 a hash index to it. Besides, the delete-mark flag is being
5011 updated in place and the adaptive hash index does not depend
5012 on it. */
5013
5014 btr_rec_set_deleted_flag(rec, page_zip, val);
5015
5016 btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
5017 }
5018
5019 /*==================== B-TREE RECORD REMOVE =========================*/
5020
5021 /*************************************************************//**
5022 Tries to compress a page of the tree if it seems useful. It is assumed
5023 that mtr holds an x-latch on the tree and on the cursor page. To avoid
5024 deadlocks, mtr must also own x-latches to brothers of page, if those
5025 brothers exist. NOTE: it is assumed that the caller has reserved enough
5026 free extents so that the compression will always succeed if done!
5027 @return TRUE if compression occurred */
5028 ibool
btr_cur_compress_if_useful(btr_cur_t * cursor,ibool adjust,mtr_t * mtr)5029 btr_cur_compress_if_useful(
5030 /*=======================*/
5031 btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
5032 cursor does not stay valid if !adjust and
5033 compression occurs */
5034 ibool adjust, /*!< in: TRUE if should adjust the
5035 cursor position even if compression occurs */
5036 mtr_t* mtr) /*!< in/out: mini-transaction */
5037 {
5038 /* Avoid applying compression as we don't accept lot of page garbage
5039 given the workload of intrinsic table. */
5040 if (dict_table_is_intrinsic(cursor->index->table)) {
5041 return(FALSE);
5042 }
5043
5044 ut_ad(mtr_memo_contains_flagged(
5045 mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
5046 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
5047 || dict_table_is_intrinsic(cursor->index->table));
5048 ut_ad(mtr_is_block_fix(
5049 mtr, btr_cur_get_block(cursor),
5050 MTR_MEMO_PAGE_X_FIX, cursor->index->table));
5051
5052 if (dict_index_is_spatial(cursor->index)) {
5053 const page_t* page = btr_cur_get_page(cursor);
5054 const trx_t* trx = NULL;
5055
5056 if (cursor->rtr_info->thr != NULL) {
5057 trx = thr_get_trx(cursor->rtr_info->thr);
5058 }
5059
5060 /* Check whether page lock prevents the compression */
5061 if (!lock_test_prdt_page_lock(trx, page_get_space_id(page),
5062 page_get_page_no(page))) {
5063 return(false);
5064 }
5065 }
5066
5067 return(btr_cur_compress_recommendation(cursor, mtr)
5068 && btr_compress(cursor, adjust, mtr));
5069 }
5070
5071 /*******************************************************//**
5072 Removes the record on which the tree cursor is positioned on a leaf page.
5073 It is assumed that the mtr has an x-latch on the page where the cursor is
5074 positioned, but no latch on the whole tree.
5075 @return TRUE if success, i.e., the page did not become too empty */
5076 ibool
btr_cur_optimistic_delete_func(btr_cur_t * cursor,ulint flags,mtr_t * mtr)5077 btr_cur_optimistic_delete_func(
5078 /*===========================*/
5079 btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to
5080 delete; cursor stays valid: if deletion
5081 succeeds, on function exit it points to the
5082 successor of the deleted record */
5083 #ifdef UNIV_DEBUG
5084 ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
5085 #endif /* UNIV_DEBUG */
5086 mtr_t* mtr) /*!< in: mtr; if this function returns
5087 TRUE on a leaf page of a secondary
5088 index, the mtr must be committed
5089 before latching any further pages */
5090 {
5091 buf_block_t* block;
5092 rec_t* rec;
5093 mem_heap_t* heap = NULL;
5094 ulint offsets_[REC_OFFS_NORMAL_SIZE];
5095 ulint* offsets = offsets_;
5096 ibool no_compress_needed;
5097 rec_offs_init(offsets_);
5098
5099 ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
5100 ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
5101 MTR_MEMO_PAGE_X_FIX));
5102 ut_ad(mtr_is_block_fix(mtr, btr_cur_get_block(cursor),
5103 MTR_MEMO_PAGE_X_FIX, cursor->index->table));
5104 ut_ad(mtr->is_named_space(cursor->index->space));
5105
5106 /* This is intended only for leaf page deletions */
5107
5108 block = btr_cur_get_block(cursor);
5109
5110 ut_ad(page_is_leaf(buf_block_get_frame(block)));
5111 ut_ad(!dict_index_is_online_ddl(cursor->index)
5112 || dict_index_is_clust(cursor->index)
5113 || (flags & BTR_CREATE_FLAG));
5114
5115 rec = btr_cur_get_rec(cursor);
5116 offsets = rec_get_offsets(rec, cursor->index, offsets,
5117 ULINT_UNDEFINED, &heap);
5118
5119 no_compress_needed = !rec_offs_any_extern(offsets)
5120 && btr_cur_can_delete_without_compress(
5121 cursor, rec_offs_size(offsets), mtr);
5122
5123 if (no_compress_needed) {
5124
5125 page_t* page = buf_block_get_frame(block);
5126 page_zip_des_t* page_zip= buf_block_get_page_zip(block);
5127
5128 lock_update_delete(block, rec);
5129
5130 btr_search_update_hash_on_delete(cursor);
5131
5132 if (page_zip) {
5133 #ifdef UNIV_ZIP_DEBUG
5134 ut_a(page_zip_validate(page_zip, page, cursor->index));
5135 #endif /* UNIV_ZIP_DEBUG */
5136 page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5137 cursor->index, offsets, mtr);
5138 #ifdef UNIV_ZIP_DEBUG
5139 ut_a(page_zip_validate(page_zip, page, cursor->index));
5140 #endif /* UNIV_ZIP_DEBUG */
5141
5142 /* On compressed pages, the IBUF_BITMAP_FREE
5143 space is not affected by deleting (purging)
5144 records, because it is defined as the minimum
5145 of space available *without* reorganize, and
5146 space available in the modification log. */
5147 } else {
5148 const ulint max_ins
5149 = page_get_max_insert_size_after_reorganize(
5150 page, 1);
5151
5152 page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5153 cursor->index, offsets, mtr);
5154
5155 /* The change buffer does not handle inserts
5156 into non-leaf pages, into clustered indexes,
5157 or into the change buffer. */
5158 if (!dict_index_is_clust(cursor->index)
5159 && !dict_table_is_temporary(cursor->index->table)
5160 && !dict_index_is_ibuf(cursor->index)) {
5161 ibuf_update_free_bits_low(block, max_ins, mtr);
5162 }
5163 }
5164 } else {
5165 /* prefetch siblings of the leaf for the pessimistic
5166 operation. */
5167 btr_cur_prefetch_siblings(block);
5168 }
5169
5170 if (UNIV_LIKELY_NULL(heap)) {
5171 mem_heap_free(heap);
5172 }
5173
5174 return(no_compress_needed);
5175 }
5176
5177 /*************************************************************//**
5178 Removes the record on which the tree cursor is positioned. Tries
5179 to compress the page if its fillfactor drops below a threshold
5180 or if it is the only page on the level. It is assumed that mtr holds
5181 an x-latch on the tree and on the cursor page. To avoid deadlocks,
5182 mtr must also own x-latches to brothers of page, if those brothers
5183 exist.
5184 @return TRUE if compression occurred and FALSE if not or something
5185 wrong. */
5186 ibool
btr_cur_pessimistic_delete(dberr_t * err,ibool has_reserved_extents,btr_cur_t * cursor,ulint flags,bool rollback,mtr_t * mtr)5187 btr_cur_pessimistic_delete(
5188 /*=======================*/
5189 dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
5190 the latter may occur because we may have
5191 to update node pointers on upper levels,
5192 and in the case of variable length keys
5193 these may actually grow in size */
5194 ibool has_reserved_extents, /*!< in: TRUE if the
5195 caller has already reserved enough free
5196 extents so that he knows that the operation
5197 will succeed */
5198 btr_cur_t* cursor, /*!< in: cursor on the record to delete;
5199 if compression does not occur, the cursor
5200 stays valid: it points to successor of
5201 deleted record on function exit */
5202 ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
5203 bool rollback,/*!< in: performing rollback? */
5204 mtr_t* mtr) /*!< in: mtr */
5205 {
5206 buf_block_t* block;
5207 page_t* page;
5208 page_zip_des_t* page_zip;
5209 dict_index_t* index;
5210 rec_t* rec;
5211 ulint n_reserved = 0;
5212 bool success;
5213 ibool ret = FALSE;
5214 ulint level;
5215 mem_heap_t* heap;
5216 ulint* offsets;
5217 bool allow_merge = true; /* if true, implies we have taken appropriate page
5218 latches needed to merge this page.*/
5219 #ifdef UNIV_DEBUG
5220 bool parent_latched = false;
5221 #endif /* UNIV_DEBUG */
5222
5223 block = btr_cur_get_block(cursor);
5224 page = buf_block_get_frame(block);
5225 index = btr_cur_get_index(cursor);
5226
5227 ulint rec_size_est = dict_index_node_ptr_max_size(index);
5228 const page_size_t page_size(dict_table_page_size(index->table));
5229
5230 ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
5231 ut_ad(!dict_index_is_online_ddl(index)
5232 || dict_index_is_clust(index)
5233 || (flags & BTR_CREATE_FLAG));
5234 ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
5235 MTR_MEMO_X_LOCK
5236 | MTR_MEMO_SX_LOCK)
5237 || dict_table_is_intrinsic(index->table));
5238 ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
5239 ut_ad(mtr->is_named_space(index->space));
5240
5241 if (!has_reserved_extents) {
5242 /* First reserve enough free space for the file segments
5243 of the index tree, so that the node pointer updates will
5244 not fail because of lack of space */
5245
5246 ulint n_extents = cursor->tree_height / 32 + 1;
5247
5248 success = fsp_reserve_free_extents(&n_reserved,
5249 index->space,
5250 n_extents,
5251 FSP_CLEANING, mtr);
5252 if (!success) {
5253 *err = DB_OUT_OF_FILE_SPACE;
5254
5255 return(FALSE);
5256 }
5257 }
5258
5259 heap = mem_heap_create(1024);
5260 rec = btr_cur_get_rec(cursor);
5261 page_zip = buf_block_get_page_zip(block);
5262 #ifdef UNIV_ZIP_DEBUG
5263 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5264 #endif /* UNIV_ZIP_DEBUG */
5265
5266 offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
5267
5268 if (rec_offs_any_extern(offsets)) {
5269 btr_rec_free_externally_stored_fields(index,
5270 rec, offsets, page_zip,
5271 rollback, mtr);
5272 #ifdef UNIV_ZIP_DEBUG
5273 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5274 #endif /* UNIV_ZIP_DEBUG */
5275 }
5276
5277 if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
5278 && UNIV_UNLIKELY(dict_index_get_page(index)
5279 != block->page.id.page_no())) {
5280
5281 /* If there is only one record, drop the whole page in
5282 btr_discard_page, if this is not the root page */
5283
5284 btr_discard_page(cursor, mtr);
5285
5286 ret = TRUE;
5287
5288 goto return_after_reservations;
5289 }
5290
5291 if (flags == 0) {
5292 lock_update_delete(block, rec);
5293 }
5294
5295 level = btr_page_get_level(page, mtr);
5296
5297 if (level > 0
5298 && UNIV_UNLIKELY(rec == page_rec_get_next(
5299 page_get_infimum_rec(page)))) {
5300
5301 rec_t* next_rec = page_rec_get_next(rec);
5302
5303 if (btr_page_get_prev(page, mtr) == FIL_NULL) {
5304
5305 /* If we delete the leftmost node pointer on a
5306 non-leaf level, we must mark the new leftmost node
5307 pointer as the predefined minimum record */
5308
5309 /* This will make page_zip_validate() fail until
5310 page_cur_delete_rec() completes. This is harmless,
5311 because everything will take place within a single
5312 mini-transaction and because writing to the redo log
5313 is an atomic operation (performed by mtr_commit()). */
5314 btr_set_min_rec_mark(next_rec, mtr);
5315 } else if (dict_index_is_spatial(index)) {
5316 /* For rtree, if delete the leftmost node pointer,
5317 we need to update parent page. */
5318 rtr_mbr_t father_mbr;
5319 rec_t* father_rec;
5320 btr_cur_t father_cursor;
5321 ulint* offsets;
5322 bool upd_ret;
5323 ulint len;
5324
5325 rtr_page_get_father_block(NULL, heap, index,
5326 block, mtr, NULL,
5327 &father_cursor);
5328 offsets = rec_get_offsets(
5329 btr_cur_get_rec(&father_cursor), index,
5330 NULL, ULINT_UNDEFINED, &heap);
5331
5332 father_rec = btr_cur_get_rec(&father_cursor);
5333 rtr_read_mbr(rec_get_nth_field(
5334 father_rec, offsets, 0, &len), &father_mbr);
5335
5336 upd_ret = rtr_update_mbr_field(&father_cursor, offsets,
5337 NULL, page, &father_mbr,
5338 next_rec, mtr);
5339
5340 if (!upd_ret) {
5341 *err = DB_ERROR;
5342
5343 mem_heap_free(heap);
5344 return(FALSE);
5345 }
5346
5347 ut_d(parent_latched = true);
5348 } else {
5349 /* Otherwise, if we delete the leftmost node pointer
5350 on a page, we have to change the parent node pointer
5351 so that it is equal to the new leftmost node pointer
5352 on the page */
5353
5354 btr_node_ptr_delete(index, block, mtr);
5355
5356 dtuple_t* node_ptr = dict_index_build_node_ptr(
5357 index, next_rec, block->page.id.page_no(),
5358 heap, level);
5359
5360 btr_insert_on_non_leaf_level(
5361 flags, index, level + 1, node_ptr, mtr);
5362
5363 ut_d(parent_latched = true);
5364 }
5365 }
5366
5367 btr_search_update_hash_on_delete(cursor);
5368
5369 if (page_is_leaf(page) || dict_index_is_spatial(index)) {
5370 /* Set allow merge to true for spatial indexes as the tree is X
5371 locked incase of delete operation on spatial indexes thus avoiding
5372 possibility of upward locking.*/
5373 allow_merge = true;
5374 } else {
5375 allow_merge = btr_cur_will_modify_tree(index,page,BTR_INTENTION_DELETE,
5376 rec,rec_size_est,page_size,mtr);
5377 }
5378 page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
5379 #ifdef UNIV_ZIP_DEBUG
5380 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5381 #endif /* UNIV_ZIP_DEBUG */
5382
5383 /* btr_check_node_ptr() needs parent block latched */
5384 ut_ad(!parent_latched || btr_check_node_ptr(index, block, mtr));
5385
5386 return_after_reservations:
5387 *err = DB_SUCCESS;
5388
5389 mem_heap_free(heap);
5390
5391 if(!ret) {
5392 bool do_merge = btr_cur_compress_recommendation(cursor,mtr);
5393 /* We are not allowed do merge because appropriate locks
5394 are not taken while positioning the cursor. */
5395 if (!allow_merge && do_merge) {
5396 ib::info() << "Ignoring merge recommendation for page"
5397 "as we could not predict it early .Page"
5398 "number being\n" << page_get_page_no(page) <<
5399 "Index name\n" << index->name;
5400 ut_ad(false);
5401 } else if (do_merge) {
5402
5403 ret = btr_cur_compress_if_useful(cursor, FALSE, mtr);
5404 }
5405 }
5406
5407 if (!srv_read_only_mode
5408 && page_is_leaf(page)
5409 && !dict_index_is_online_ddl(index)) {
5410
5411 mtr_memo_release(mtr, dict_index_get_lock(index),
5412 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
5413
5414 /* NOTE: We cannot release root block latch here, because it
5415 has segment header and already modified in most of cases.*/
5416 }
5417
5418 if (n_reserved > 0) {
5419 fil_space_release_free_extents(index->space, n_reserved);
5420 }
5421
5422 return(ret);
5423 }
5424
5425 /*******************************************************************//**
5426 Adds path information to the cursor for the current page, for which
5427 the binary search has been performed. */
5428 static
5429 void
btr_cur_add_path_info(btr_cur_t * cursor,ulint height,ulint root_height)5430 btr_cur_add_path_info(
5431 /*==================*/
5432 btr_cur_t* cursor, /*!< in: cursor positioned on a page */
5433 ulint height, /*!< in: height of the page in tree;
5434 0 means leaf node */
5435 ulint root_height) /*!< in: root node height in tree */
5436 {
5437 btr_path_t* slot;
5438 const rec_t* rec;
5439 const page_t* page;
5440
5441 ut_a(cursor->path_arr);
5442
5443 if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
5444 /* Do nothing; return empty path */
5445
5446 slot = cursor->path_arr;
5447 slot->nth_rec = ULINT_UNDEFINED;
5448
5449 return;
5450 }
5451
5452 if (height == 0) {
5453 /* Mark end of slots for path */
5454 slot = cursor->path_arr + root_height + 1;
5455 slot->nth_rec = ULINT_UNDEFINED;
5456 }
5457
5458 rec = btr_cur_get_rec(cursor);
5459
5460 slot = cursor->path_arr + (root_height - height);
5461
5462 page = page_align(rec);
5463
5464 slot->nth_rec = page_rec_get_n_recs_before(rec);
5465 slot->n_recs = page_get_n_recs(page);
5466 slot->page_no = page_get_page_no(page);
5467 slot->page_level = btr_page_get_level_low(page);
5468 }
5469
5470 /*******************************************************************//**
5471 Estimate the number of rows between slot1 and slot2 for any level on a
5472 B-tree. This function starts from slot1->page and reads a few pages to
5473 the right, counting their records. If we reach slot2->page quickly then
5474 we know exactly how many records there are between slot1 and slot2 and
5475 we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
5476 then we calculate the average number of records in the pages scanned
5477 so far and assume that all pages that we did not scan up to slot2->page
5478 contain the same number of records, then we multiply that average to
5479 the number of pages between slot1->page and slot2->page (which is
5480 n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
5481 @return number of rows, not including the borders (exact or estimated) */
5482 static
5483 int64_t
btr_estimate_n_rows_in_range_on_level(dict_index_t * index,btr_path_t * slot1,btr_path_t * slot2,int64_t n_rows_on_prev_level,ibool * is_n_rows_exact)5484 btr_estimate_n_rows_in_range_on_level(
5485 /*==================================*/
5486 dict_index_t* index, /*!< in: index */
5487 btr_path_t* slot1, /*!< in: left border */
5488 btr_path_t* slot2, /*!< in: right border */
5489 int64_t n_rows_on_prev_level, /*!< in: number of rows
5490 on the previous level for the
5491 same descend paths; used to
5492 determine the number of pages
5493 on this level */
5494 ibool* is_n_rows_exact) /*!< out: TRUE if the returned
5495 value is exact i.e. not an
5496 estimation */
5497 {
5498 int64_t n_rows;
5499 ulint n_pages_read;
5500 ulint level;
5501
5502 n_rows = 0;
5503 n_pages_read = 0;
5504
5505 /* Assume by default that we will scan all pages between
5506 slot1->page_no and slot2->page_no. */
5507 *is_n_rows_exact = TRUE;
5508
5509 /* Add records from slot1->page_no which are to the right of
5510 the record which serves as a left border of the range, if any
5511 (we don't include the record itself in this count). */
5512 if (slot1->nth_rec <= slot1->n_recs) {
5513 n_rows += slot1->n_recs - slot1->nth_rec;
5514 }
5515
5516 /* Add records from slot2->page_no which are to the left of
5517 the record which servers as a right border of the range, if any
5518 (we don't include the record itself in this count). */
5519 if (slot2->nth_rec > 1) {
5520 n_rows += slot2->nth_rec - 1;
5521 }
5522
5523 /* Count the records in the pages between slot1->page_no and
5524 slot2->page_no (non inclusive), if any. */
5525
5526 /* Do not read more than this number of pages in order not to hurt
5527 performance with this code which is just an estimation. If we read
5528 this many pages before reaching slot2->page_no then we estimate the
5529 average from the pages scanned so far. */
5530 # define N_PAGES_READ_LIMIT 10
5531
5532 page_id_t page_id(
5533 dict_index_get_space(index), slot1->page_no);
5534 const fil_space_t* space = fil_space_get(index->space);
5535 ut_ad(space);
5536 const page_size_t page_size(space->flags);
5537
5538 level = slot1->page_level;
5539
5540 do {
5541 mtr_t mtr;
5542 page_t* page;
5543 buf_block_t* block;
5544
5545 mtr_start(&mtr);
5546
5547 /* Fetch the page. Because we are not holding the
5548 index->lock, the tree may have changed and we may be
5549 attempting to read a page that is no longer part of
5550 the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
5551 silence a debug assertion about this. */
5552 block = buf_page_get_gen(page_id, page_size, RW_S_LATCH,
5553 NULL, BUF_GET_POSSIBLY_FREED,
5554 __FILE__, __LINE__, &mtr);
5555
5556 page = buf_block_get_frame(block);
5557
5558 /* It is possible that the tree has been reorganized in the
5559 meantime and this is a different page. If this happens the
5560 calculated estimate will be bogus, which is not fatal as
5561 this is only an estimate. We are sure that a page with
5562 page_no exists because InnoDB never frees pages, only
5563 reuses them. */
5564 if (!fil_page_index_page_check(page)
5565 || btr_page_get_index_id(page) != index->id
5566 || btr_page_get_level_low(page) != level) {
5567
5568 /* The page got reused for something else */
5569 mtr_commit(&mtr);
5570 goto inexact;
5571 }
5572
5573 /* It is possible but highly unlikely that the page was
5574 originally written by an old version of InnoDB that did
5575 not initialize FIL_PAGE_TYPE on other than B-tree pages.
5576 For example, this could be an almost-empty BLOB page
5577 that happens to contain the magic values in the fields
5578 that we checked above. */
5579
5580 n_pages_read++;
5581
5582 if (page_id.page_no() != slot1->page_no) {
5583 /* Do not count the records on slot1->page_no,
5584 we already counted them before this loop. */
5585 n_rows += page_get_n_recs(page);
5586 }
5587
5588 page_id.set_page_no(btr_page_get_next(page, &mtr));
5589
5590 mtr_commit(&mtr);
5591
5592 if (n_pages_read == N_PAGES_READ_LIMIT
5593 || page_id.page_no() == FIL_NULL) {
5594 /* Either we read too many pages or
5595 we reached the end of the level without passing
5596 through slot2->page_no, the tree must have changed
5597 in the meantime */
5598 goto inexact;
5599 }
5600
5601 } while (page_id.page_no() != slot2->page_no);
5602
5603 return(n_rows);
5604
5605 inexact:
5606
5607 *is_n_rows_exact = FALSE;
5608
5609 /* We did interrupt before reaching slot2->page */
5610
5611 if (n_pages_read > 0) {
5612 /* The number of pages on this level is
5613 n_rows_on_prev_level, multiply it by the
5614 average number of recs per page so far */
5615 n_rows = n_rows_on_prev_level
5616 * n_rows / n_pages_read;
5617 } else {
5618 /* The tree changed before we could even
5619 start with slot1->page_no */
5620 n_rows = 10;
5621 }
5622
5623 return(n_rows);
5624 }
5625
/** If the tree gets changed too much between the two dives for the left
and right boundary then btr_estimate_n_rows_in_range_low() will retry
that many times before giving up and returning the value stored in
rows_in_range_arbitrary_ret_val. */
static const unsigned	rows_in_range_max_retries = 4;

/** We pretend that a range has that many records if the tree keeps changing
for rows_in_range_max_retries retries while we try to estimate the records
in a given range.  This is an arbitrary fallback, not a computed value. */
static const int64_t	rows_in_range_arbitrary_ret_val = 10;
5636
5637 /** Estimates the number of rows in a given index range.
5638 @param[in] index index
5639 @param[in] tuple1 range start, may also be empty tuple
5640 @param[in] mode1 search mode for range start
5641 @param[in] tuple2 range end, may also be empty tuple
5642 @param[in] mode2 search mode for range end
5643 @param[in] nth_attempt if the tree gets modified too much while
5644 we are trying to analyze it, then we will retry (this function will call
5645 itself, incrementing this parameter)
5646 @return estimated number of rows; if after rows_in_range_max_retries
5647 retries the tree keeps changing, then we will just return
5648 rows_in_range_arbitrary_ret_val as a result (if
5649 nth_attempt >= rows_in_range_max_retries and the tree is modified between
5650 the two dives). */
5651 static
5652 int64_t
btr_estimate_n_rows_in_range_low(dict_index_t * index,const dtuple_t * tuple1,page_cur_mode_t mode1,const dtuple_t * tuple2,page_cur_mode_t mode2,unsigned nth_attempt)5653 btr_estimate_n_rows_in_range_low(
5654 dict_index_t* index,
5655 const dtuple_t* tuple1,
5656 page_cur_mode_t mode1,
5657 const dtuple_t* tuple2,
5658 page_cur_mode_t mode2,
5659 unsigned nth_attempt)
5660 {
5661 btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS];
5662 btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS];
5663 btr_cur_t cursor;
5664 btr_path_t* slot1;
5665 btr_path_t* slot2;
5666 ibool diverged;
5667 ibool diverged_lot;
5668 ulint divergence_level;
5669 int64_t n_rows;
5670 ibool is_n_rows_exact;
5671 ulint i;
5672 mtr_t mtr;
5673 int64_t table_n_rows;
5674
5675 table_n_rows = dict_table_get_n_rows(index->table);
5676
5677 /* Below we dive to the two records specified by tuple1 and tuple2 and
5678 we remember the entire dive paths from the tree root. The place where
5679 the tuple1 path ends on the leaf level we call "left border" of our
5680 interval and the place where the tuple2 path ends on the leaf level -
5681 "right border". We take care to either include or exclude the interval
5682 boundaries depending on whether <, <=, > or >= was specified. For
5683 example if "5 < x AND x <= 10" then we should not include the left
5684 boundary, but should include the right one. */
5685
5686 mtr_start(&mtr);
5687
5688 cursor.path_arr = path1;
5689
5690 bool should_count_the_left_border;
5691
5692 if (dtuple_get_n_fields(tuple1) > 0) {
5693
5694 btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
5695 BTR_SEARCH_LEAF | BTR_ESTIMATE,
5696 &cursor, 0,
5697 __FILE__, __LINE__, &mtr);
5698
5699 ut_ad(!page_rec_is_infimum(btr_cur_get_rec(&cursor)));
5700
5701 /* We should count the border if there are any records to
5702 match the criteria, i.e. if the maximum record on the tree is
5703 5 and x > 3 is specified then the cursor will be positioned at
5704 5 and we should count the border, but if x > 7 is specified,
5705 then the cursor will be positioned at 'sup' on the rightmost
5706 leaf page in the tree and we should not count the border. */
5707 should_count_the_left_border
5708 = !page_rec_is_supremum(btr_cur_get_rec(&cursor));
5709 } else {
5710 btr_cur_open_at_index_side(true, index,
5711 BTR_SEARCH_LEAF | BTR_ESTIMATE,
5712 &cursor, 0, &mtr);
5713
5714 ut_ad(page_rec_is_infimum(btr_cur_get_rec(&cursor)));
5715
5716 /* The range specified is wihout a left border, just
5717 'x < 123' or 'x <= 123' and btr_cur_open_at_index_side()
5718 positioned the cursor on the infimum record on the leftmost
5719 page, which must not be counted. */
5720 should_count_the_left_border = false;
5721 }
5722
5723 mtr_commit(&mtr);
5724
5725 mtr_start(&mtr);
5726
5727 cursor.path_arr = path2;
5728
5729 bool should_count_the_right_border;
5730
5731 if (dtuple_get_n_fields(tuple2) > 0) {
5732
5733 btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
5734 BTR_SEARCH_LEAF | BTR_ESTIMATE,
5735 &cursor, 0,
5736 __FILE__, __LINE__, &mtr);
5737
5738 const rec_t* rec = btr_cur_get_rec(&cursor);
5739
5740 ut_ad(!(mode2 == PAGE_CUR_L && page_rec_is_supremum(rec)));
5741
5742 should_count_the_right_border
5743 = (mode2 == PAGE_CUR_LE /* if the range is '<=' */
5744 /* and the record was found */
5745 && cursor.low_match >= dtuple_get_n_fields(tuple2))
5746 || (mode2 == PAGE_CUR_L /* or if the range is '<' */
5747 /* and there are any records to match the criteria,
5748 i.e. if the minimum record on the tree is 5 and
5749 x < 7 is specified then the cursor will be
5750 positioned at 5 and we should count the border, but
5751 if x < 2 is specified, then the cursor will be
5752 positioned at 'inf' and we should not count the
5753 border */
5754 && !page_rec_is_infimum(rec));
5755 /* Notice that for "WHERE col <= 'foo'" MySQL passes to
5756 ha_innobase::records_in_range():
5757 min_key=NULL (left-unbounded) which is expected
5758 max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is
5759 unexpected - one would expect
5760 flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In this case the
5761 cursor will be positioned on the first record to the right of
5762 the requested one (can also be positioned on the 'sup') and
5763 we should not count the right border. */
5764 } else {
5765 btr_cur_open_at_index_side(false, index,
5766 BTR_SEARCH_LEAF | BTR_ESTIMATE,
5767 &cursor, 0, &mtr);
5768
5769 ut_ad(page_rec_is_supremum(btr_cur_get_rec(&cursor)));
5770
5771 /* The range specified is wihout a right border, just
5772 'x > 123' or 'x >= 123' and btr_cur_open_at_index_side()
5773 positioned the cursor on the supremum record on the rightmost
5774 page, which must not be counted. */
5775 should_count_the_right_border = false;
5776 }
5777
5778 mtr_commit(&mtr);
5779
5780 /* We have the path information for the range in path1 and path2 */
5781
5782 n_rows = 0;
5783 is_n_rows_exact = TRUE;
5784
5785 /* This becomes true when the two paths do not pass through the
5786 same pages anymore. */
5787 diverged = FALSE;
5788
5789 /* This becomes true when the paths are not the same or adjacent
5790 any more. This means that they pass through the same or
5791 neighboring-on-the-same-level pages only. */
5792 diverged_lot = FALSE;
5793
5794 /* This is the level where paths diverged a lot. */
5795 divergence_level = 1000000;
5796
5797 for (i = 0; ; i++) {
5798 ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
5799
5800 slot1 = path1 + i;
5801 slot2 = path2 + i;
5802
5803 if (slot1->nth_rec == ULINT_UNDEFINED
5804 || slot2->nth_rec == ULINT_UNDEFINED) {
5805
5806 /* Here none of the borders were counted. For example,
5807 if on the leaf level we descended to:
5808 (inf, a, b, c, d, e, f, sup)
5809 ^ ^
5810 path1 path2
5811 then n_rows will be 2 (c and d). */
5812
5813 if (is_n_rows_exact) {
5814 /* Only fiddle to adjust this off-by-one
5815 if the number is exact, otherwise we do
5816 much grosser adjustments below. */
5817
5818 btr_path_t* last1 = &path1[i - 1];
5819 btr_path_t* last2 = &path2[i - 1];
5820
5821 /* If both paths end up on the same record on
5822 the leaf level. */
5823 if (last1->page_no == last2->page_no
5824 && last1->nth_rec == last2->nth_rec) {
5825
5826 /* n_rows can be > 0 here if the paths
5827 were first different and then converged
5828 to the same record on the leaf level.
5829 For example:
5830 SELECT ... LIKE 'wait/synch/rwlock%'
5831 mode1=PAGE_CUR_GE,
5832 tuple1="wait/synch/rwlock"
5833 path1[0]={nth_rec=58, n_recs=58,
5834 page_no=3, page_level=1}
5835 path1[1]={nth_rec=56, n_recs=55,
5836 page_no=119, page_level=0}
5837
5838 mode2=PAGE_CUR_G
5839 tuple2="wait/synch/rwlock"
5840 path2[0]={nth_rec=57, n_recs=57,
5841 page_no=3, page_level=1}
5842 path2[1]={nth_rec=56, n_recs=55,
5843 page_no=119, page_level=0} */
5844
5845 /* If the range is such that we should
5846 count both borders, then avoid
5847 counting that record twice - once as a
5848 left border and once as a right
5849 border. */
5850 if (should_count_the_left_border
5851 && should_count_the_right_border) {
5852
5853 n_rows = 1;
5854 } else {
5855 /* Some of the borders should
5856 not be counted, e.g. [3,3). */
5857 n_rows = 0;
5858 }
5859 } else {
5860 if (should_count_the_left_border) {
5861 n_rows++;
5862 }
5863
5864 if (should_count_the_right_border) {
5865 n_rows++;
5866 }
5867 }
5868 }
5869
5870 if (i > divergence_level + 1 && !is_n_rows_exact) {
5871 /* In trees whose height is > 1 our algorithm
5872 tends to underestimate: multiply the estimate
5873 by 2: */
5874
5875 n_rows = n_rows * 2;
5876 }
5877
5878 DBUG_EXECUTE_IF("bug14007649", return(n_rows););
5879
5880 /* Do not estimate the number of rows in the range
5881 to over 1 / 2 of the estimated rows in the whole
5882 table */
5883
5884 if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {
5885
5886 n_rows = table_n_rows / 2;
5887
5888 /* If there are just 0 or 1 rows in the table,
5889 then we estimate all rows are in the range */
5890
5891 if (n_rows == 0) {
5892 n_rows = table_n_rows;
5893 }
5894 }
5895
5896 return(n_rows);
5897 }
5898
5899 if (!diverged && slot1->nth_rec != slot2->nth_rec) {
5900
5901 /* If both slots do not point to the same page,
5902 this means that the tree must have changed between
5903 the dive for slot1 and the dive for slot2 at the
5904 beginning of this function. */
5905 if (slot1->page_no != slot2->page_no
5906 || slot1->page_level != slot2->page_level) {
5907
5908 /* If the tree keeps changing even after a
5909 few attempts, then just return some arbitrary
5910 number. */
5911 if (nth_attempt >= rows_in_range_max_retries) {
5912 return(rows_in_range_arbitrary_ret_val);
5913 }
5914
5915 const int64_t ret =
5916 btr_estimate_n_rows_in_range_low(
5917 index, tuple1, mode1,
5918 tuple2, mode2, nth_attempt + 1);
5919
5920 return(ret);
5921 }
5922
5923 diverged = TRUE;
5924
5925 if (slot1->nth_rec < slot2->nth_rec) {
5926 /* We do not count the borders (nor the left
5927 nor the right one), thus "- 1". */
5928 n_rows = slot2->nth_rec - slot1->nth_rec - 1;
5929
5930 if (n_rows > 0) {
5931 /* There is at least one row between
5932 the two borders pointed to by slot1
5933 and slot2, so on the level below the
5934 slots will point to non-adjacent
5935 pages. */
5936 diverged_lot = TRUE;
5937 divergence_level = i;
5938 }
5939 } else {
5940 /* It is possible that
5941 slot1->nth_rec >= slot2->nth_rec
5942 if, for example, we have a single page
5943 tree which contains (inf, 5, 6, supr)
5944 and we select where x > 20 and x < 30;
5945 in this case slot1->nth_rec will point
5946 to the supr record and slot2->nth_rec
5947 will point to 6. */
5948 n_rows = 0;
5949 should_count_the_left_border = false;
5950 should_count_the_right_border = false;
5951 }
5952
5953 } else if (diverged && !diverged_lot) {
5954
5955 if (slot1->nth_rec < slot1->n_recs
5956 || slot2->nth_rec > 1) {
5957
5958 diverged_lot = TRUE;
5959 divergence_level = i;
5960
5961 n_rows = 0;
5962
5963 if (slot1->nth_rec < slot1->n_recs) {
5964 n_rows += slot1->n_recs
5965 - slot1->nth_rec;
5966 }
5967
5968 if (slot2->nth_rec > 1) {
5969 n_rows += slot2->nth_rec - 1;
5970 }
5971 }
5972 } else if (diverged_lot) {
5973
5974 n_rows = btr_estimate_n_rows_in_range_on_level(
5975 index, slot1, slot2, n_rows,
5976 &is_n_rows_exact);
5977 }
5978 }
5979 }
5980
5981 /** Estimates the number of rows in a given index range.
5982 @param[in] index index
5983 @param[in] tuple1 range start, may also be empty tuple
5984 @param[in] mode1 search mode for range start
5985 @param[in] tuple2 range end, may also be empty tuple
5986 @param[in] mode2 search mode for range end
5987 @return estimated number of rows */
5988 int64_t
btr_estimate_n_rows_in_range(dict_index_t * index,const dtuple_t * tuple1,page_cur_mode_t mode1,const dtuple_t * tuple2,page_cur_mode_t mode2)5989 btr_estimate_n_rows_in_range(
5990 dict_index_t* index,
5991 const dtuple_t* tuple1,
5992 page_cur_mode_t mode1,
5993 const dtuple_t* tuple2,
5994 page_cur_mode_t mode2)
5995 {
5996 const int64_t ret = btr_estimate_n_rows_in_range_low(
5997 index, tuple1, mode1, tuple2, mode2, 1 /* first attempt */);
5998
5999 return(ret);
6000 }
6001
6002 /*******************************************************************//**
6003 Record the number of non_null key values in a given index for
6004 each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6005 The estimates are eventually stored in the array:
6006 index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
6007 static
6008 void
btr_record_not_null_field_in_rec(ulint n_unique,const ulint * offsets,ib_uint64_t * n_not_null)6009 btr_record_not_null_field_in_rec(
6010 /*=============================*/
6011 ulint n_unique, /*!< in: dict_index_get_n_unique(index),
6012 number of columns uniquely determine
6013 an index entry */
6014 const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
6015 its size could be for all fields or
6016 that of "n_unique" */
6017 ib_uint64_t* n_not_null) /*!< in/out: array to record number of
6018 not null rows for n-column prefix */
6019 {
6020 ulint i;
6021
6022 ut_ad(rec_offs_n_fields(offsets) >= n_unique);
6023
6024 if (n_not_null == NULL) {
6025 return;
6026 }
6027
6028 for (i = 0; i < n_unique; i++) {
6029 if (rec_offs_nth_sql_null(offsets, i)) {
6030 break;
6031 }
6032
6033 n_not_null[i]++;
6034 }
6035 }
6036
6037 /*******************************************************************//**
6038 Estimates the number of different key values in a given index, for
6039 each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6040 The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
6041 0..n_uniq-1) and the number of pages that were sampled is saved in
6042 index->stat_n_sample_sizes[].
6043 If innodb_stats_method is nulls_ignored, we also record the number of
6044 non-null values for each prefix and stored the estimates in
6045 array index->stat_n_non_null_key_vals.
6046 @return true if the index is available and we get the estimated numbers,
6047 false if the index is unavailable. */
6048 bool
btr_estimate_number_of_different_key_vals(dict_index_t * index)6049 btr_estimate_number_of_different_key_vals(
6050 /*======================================*/
6051 dict_index_t* index) /*!< in: index */
6052 {
6053 btr_cur_t cursor;
6054 page_t* page;
6055 rec_t* rec;
6056 ulint n_cols;
6057 ib_uint64_t* n_diff;
6058 ib_uint64_t* n_not_null;
6059 ibool stats_null_not_equal;
6060 uintmax_t n_sample_pages; /* number of pages to sample */
6061 ulint not_empty_flag = 0;
6062 ulint total_external_size = 0;
6063 ulint i;
6064 ulint j;
6065 uintmax_t add_on;
6066 mtr_t mtr;
6067 mem_heap_t* heap = NULL;
6068 ulint* offsets_rec = NULL;
6069 ulint* offsets_next_rec = NULL;
6070
6071 /* For spatial index, there is no such stats can be
6072 fetched. */
6073 if (dict_index_is_spatial(index)) {
6074 return(false);
6075 }
6076
6077 n_cols = dict_index_get_n_unique(index);
6078
6079 heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
6080 * n_cols
6081 + dict_index_get_n_fields(index)
6082 * (sizeof *offsets_rec
6083 + sizeof *offsets_next_rec));
6084
6085 n_diff = (ib_uint64_t*) mem_heap_zalloc(
6086 heap, n_cols * sizeof(n_diff[0]));
6087
6088 n_not_null = NULL;
6089
6090 /* Check srv_innodb_stats_method setting, and decide whether we
6091 need to record non-null value and also decide if NULL is
6092 considered equal (by setting stats_null_not_equal value) */
6093 switch (srv_innodb_stats_method) {
6094 case SRV_STATS_NULLS_IGNORED:
6095 n_not_null = (ib_uint64_t*) mem_heap_zalloc(
6096 heap, n_cols * sizeof *n_not_null);
6097 /* fall through */
6098
6099 case SRV_STATS_NULLS_UNEQUAL:
6100 /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
6101 case, we will treat NULLs as unequal value */
6102 stats_null_not_equal = TRUE;
6103 break;
6104
6105 case SRV_STATS_NULLS_EQUAL:
6106 stats_null_not_equal = FALSE;
6107 break;
6108
6109 default:
6110 ut_error;
6111 }
6112
6113 /* It makes no sense to test more pages than are contained
6114 in the index, thus we lower the number if it is too high */
6115 if (srv_stats_transient_sample_pages > index->stat_index_size) {
6116 if (index->stat_index_size > 0) {
6117 n_sample_pages = index->stat_index_size;
6118 } else {
6119 n_sample_pages = 1;
6120 }
6121 } else {
6122 n_sample_pages = srv_stats_transient_sample_pages;
6123 }
6124
6125 /* We sample some pages in the index to get an estimate */
6126
6127 for (i = 0; i < n_sample_pages; i++) {
6128 mtr_start(&mtr);
6129
6130 bool available;
6131
6132 available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
6133 &cursor, &mtr);
6134
6135 if (!available) {
6136 mtr_commit(&mtr);
6137 mem_heap_free(heap);
6138
6139 return(false);
6140 }
6141
6142 /* Count the number of different key values for each prefix of
6143 the key on this index page. If the prefix does not determine
6144 the index record uniquely in the B-tree, then we subtract one
6145 because otherwise our algorithm would give a wrong estimate
6146 for an index where there is just one key value. */
6147
6148 page = btr_cur_get_page(&cursor);
6149
6150 rec = page_rec_get_next(page_get_infimum_rec(page));
6151
6152 if (!page_rec_is_supremum(rec)) {
6153 not_empty_flag = 1;
6154 offsets_rec = rec_get_offsets(rec, index, offsets_rec,
6155 ULINT_UNDEFINED, &heap);
6156
6157 if (n_not_null != NULL) {
6158 btr_record_not_null_field_in_rec(
6159 n_cols, offsets_rec, n_not_null);
6160 }
6161 }
6162
6163 while (!page_rec_is_supremum(rec)) {
6164 ulint matched_fields;
6165 rec_t* next_rec = page_rec_get_next(rec);
6166 if (page_rec_is_supremum(next_rec)) {
6167 total_external_size +=
6168 btr_rec_get_externally_stored_len(
6169 rec, offsets_rec);
6170 break;
6171 }
6172
6173 offsets_next_rec = rec_get_offsets(next_rec, index,
6174 offsets_next_rec,
6175 ULINT_UNDEFINED,
6176 &heap);
6177
6178 cmp_rec_rec_with_match(rec, next_rec,
6179 offsets_rec, offsets_next_rec,
6180 index,
6181 page_is_spatial_non_leaf(next_rec, index),
6182 stats_null_not_equal,
6183 &matched_fields);
6184
6185 for (j = matched_fields; j < n_cols; j++) {
6186 /* We add one if this index record has
6187 a different prefix from the previous */
6188
6189 n_diff[j]++;
6190 }
6191
6192 if (n_not_null != NULL) {
6193 btr_record_not_null_field_in_rec(
6194 n_cols, offsets_next_rec, n_not_null);
6195 }
6196
6197 total_external_size
6198 += btr_rec_get_externally_stored_len(
6199 rec, offsets_rec);
6200
6201 rec = next_rec;
6202 /* Initialize offsets_rec for the next round
6203 and assign the old offsets_rec buffer to
6204 offsets_next_rec. */
6205 {
6206 ulint* offsets_tmp = offsets_rec;
6207 offsets_rec = offsets_next_rec;
6208 offsets_next_rec = offsets_tmp;
6209 }
6210 }
6211
6212
6213 if (n_cols == dict_index_get_n_unique_in_tree(index)) {
6214
6215 /* If there is more than one leaf page in the tree,
6216 we add one because we know that the first record
6217 on the page certainly had a different prefix than the
6218 last record on the previous index page in the
6219 alphabetical order. Before this fix, if there was
6220 just one big record on each clustered index page, the
6221 algorithm grossly underestimated the number of rows
6222 in the table. */
6223
6224 if (btr_page_get_prev(page, &mtr) != FIL_NULL
6225 || btr_page_get_next(page, &mtr) != FIL_NULL) {
6226
6227 n_diff[n_cols - 1]++;
6228 }
6229 }
6230
6231 mtr_commit(&mtr);
6232 }
6233
6234 /* If we saw k borders between different key values on
6235 n_sample_pages leaf pages, we can estimate how many
6236 there will be in index->stat_n_leaf_pages */
6237
6238 /* We must take into account that our sample actually represents
6239 also the pages used for external storage of fields (those pages are
6240 included in index->stat_n_leaf_pages) */
6241
6242 for (j = 0; j < n_cols; j++) {
6243 index->stat_n_diff_key_vals[j]
6244 = BTR_TABLE_STATS_FROM_SAMPLE(
6245 n_diff[j], index, n_sample_pages,
6246 total_external_size, not_empty_flag);
6247
6248 /* If the tree is small, smaller than
6249 10 * n_sample_pages + total_external_size, then
6250 the above estimate is ok. For bigger trees it is common that we
6251 do not see any borders between key values in the few pages
6252 we pick. But still there may be n_sample_pages
6253 different key values, or even more. Let us try to approximate
6254 that: */
6255
6256 add_on = index->stat_n_leaf_pages
6257 / (10 * (n_sample_pages
6258 + total_external_size));
6259
6260 if (add_on > n_sample_pages) {
6261 add_on = n_sample_pages;
6262 }
6263
6264 index->stat_n_diff_key_vals[j] += add_on;
6265
6266 index->stat_n_sample_sizes[j] = n_sample_pages;
6267
6268 /* Update the stat_n_non_null_key_vals[] with our
6269 sampled result. stat_n_non_null_key_vals[] is created
6270 and initialized to zero in dict_index_add_to_cache(),
6271 along with stat_n_diff_key_vals[] array */
6272 if (n_not_null != NULL) {
6273 index->stat_n_non_null_key_vals[j] =
6274 BTR_TABLE_STATS_FROM_SAMPLE(
6275 n_not_null[j], index, n_sample_pages,
6276 total_external_size, not_empty_flag);
6277 }
6278 }
6279
6280 mem_heap_free(heap);
6281
6282 return(true);
6283 }
6284
6285 /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
6286
6287 /***********************************************************//**
6288 Gets the offset of the pointer to the externally stored part of a field.
6289 @return offset of the pointer to the externally stored part */
6290 static
6291 ulint
btr_rec_get_field_ref_offs(const ulint * offsets,ulint n)6292 btr_rec_get_field_ref_offs(
6293 /*=======================*/
6294 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6295 ulint n) /*!< in: index of the external field */
6296 {
6297 ulint field_ref_offs;
6298 ulint local_len;
6299
6300 ut_a(rec_offs_nth_extern(offsets, n));
6301 field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
6302 ut_a(local_len != UNIV_SQL_NULL);
6303 ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6304
6305 return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
6306 }
6307
/** Gets a pointer to the externally stored part of a field.
@param rec record
@param offsets rec_get_offsets(rec)
@param n index of the externally stored field
@return pointer to the externally stored part */
#define btr_rec_get_field_ref(rec, offsets, n)			\
	((rec) + btr_rec_get_field_ref_offs(offsets, n))
6315
6316 /** Gets the externally stored size of a record, in units of a database page.
6317 @param[in] rec record
6318 @param[in] offsets array returned by rec_get_offsets()
6319 @return externally stored part, in units of a database page */
6320 ulint
btr_rec_get_externally_stored_len(const rec_t * rec,const ulint * offsets)6321 btr_rec_get_externally_stored_len(
6322 const rec_t* rec,
6323 const ulint* offsets)
6324 {
6325 ulint n_fields;
6326 ulint total_extern_len = 0;
6327 ulint i;
6328
6329 ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6330
6331 if (!rec_offs_any_extern(offsets)) {
6332 return(0);
6333 }
6334
6335 n_fields = rec_offs_n_fields(offsets);
6336
6337 for (i = 0; i < n_fields; i++) {
6338 if (rec_offs_nth_extern(offsets, i)) {
6339
6340 ulint extern_len = mach_read_from_4(
6341 btr_rec_get_field_ref(rec, offsets, i)
6342 + BTR_EXTERN_LEN + 4);
6343
6344 total_extern_len += ut_calc_align(extern_len,
6345 UNIV_PAGE_SIZE);
6346 }
6347 }
6348
6349 return(total_extern_len / UNIV_PAGE_SIZE);
6350 }
6351
6352 /*******************************************************************//**
6353 Sets the ownership bit of an externally stored field in a record. */
6354 static
6355 void
btr_cur_set_ownership_of_extern_field(page_zip_des_t * page_zip,rec_t * rec,dict_index_t * index,const ulint * offsets,ulint i,ibool val,mtr_t * mtr)6356 btr_cur_set_ownership_of_extern_field(
6357 /*==================================*/
6358 page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
6359 part will be updated, or NULL */
6360 rec_t* rec, /*!< in/out: clustered index record */
6361 dict_index_t* index, /*!< in: index of the page */
6362 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6363 ulint i, /*!< in: field number */
6364 ibool val, /*!< in: value to set */
6365 mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
6366 {
6367 byte* data;
6368 ulint local_len;
6369 ulint byte_val;
6370
6371 data = rec_get_nth_field(rec, offsets, i, &local_len);
6372 ut_ad(rec_offs_nth_extern(offsets, i));
6373 ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6374
6375 local_len -= BTR_EXTERN_FIELD_REF_SIZE;
6376
6377 byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
6378
6379 if (val) {
6380 byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
6381 } else {
6382 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6383 ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
6384 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6385 byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
6386 }
6387
6388 if (page_zip) {
6389 mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
6390 page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
6391 } else if (mtr != NULL) {
6392
6393 mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
6394 MLOG_1BYTE, mtr);
6395 } else {
6396 mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
6397 }
6398 }
6399
6400 /*******************************************************************//**
6401 Marks non-updated off-page fields as disowned by this record. The ownership
6402 must be transferred to the updated record which is inserted elsewhere in the
6403 index tree. In purge only the owner of externally stored field is allowed
6404 to free the field. */
6405 void
btr_cur_disown_inherited_fields(page_zip_des_t * page_zip,rec_t * rec,dict_index_t * index,const ulint * offsets,const upd_t * update,mtr_t * mtr)6406 btr_cur_disown_inherited_fields(
6407 /*============================*/
6408 page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
6409 part will be updated, or NULL */
6410 rec_t* rec, /*!< in/out: record in a clustered index */
6411 dict_index_t* index, /*!< in: index of the page */
6412 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6413 const upd_t* update, /*!< in: update vector */
6414 mtr_t* mtr) /*!< in/out: mini-transaction */
6415 {
6416 ulint i;
6417
6418 ut_ad(rec_offs_validate(rec, index, offsets));
6419 ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6420 ut_ad(rec_offs_any_extern(offsets));
6421 ut_ad(mtr);
6422
6423 for (i = 0; i < rec_offs_n_fields(offsets); i++) {
6424 if (rec_offs_nth_extern(offsets, i)
6425 && !upd_get_field_by_field_no(update, i, false)) {
6426 btr_cur_set_ownership_of_extern_field(
6427 page_zip, rec, index, offsets, i, FALSE, mtr);
6428 }
6429 }
6430 }
6431
6432 /*******************************************************************//**
6433 Marks all extern fields in a record as owned by the record. This function
6434 should be called if the delete mark of a record is removed: a not delete
6435 marked record always owns all its extern fields. */
6436 static
6437 void
btr_cur_unmark_extern_fields(page_zip_des_t * page_zip,rec_t * rec,dict_index_t * index,const ulint * offsets,mtr_t * mtr)6438 btr_cur_unmark_extern_fields(
6439 /*=========================*/
6440 page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
6441 part will be updated, or NULL */
6442 rec_t* rec, /*!< in/out: record in a clustered index */
6443 dict_index_t* index, /*!< in: index of the page */
6444 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6445 mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
6446 {
6447 ulint n;
6448 ulint i;
6449
6450 ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6451 n = rec_offs_n_fields(offsets);
6452
6453 if (!rec_offs_any_extern(offsets)) {
6454
6455 return;
6456 }
6457
6458 for (i = 0; i < n; i++) {
6459 if (rec_offs_nth_extern(offsets, i)) {
6460
6461 btr_cur_set_ownership_of_extern_field(
6462 page_zip, rec, index, offsets, i, TRUE, mtr);
6463 }
6464 }
6465 }
6466
6467 /*******************************************************************//**
6468 Returns the length of a BLOB part stored on the header page.
6469 @return part length */
6470 static
6471 ulint
btr_blob_get_part_len(const byte * blob_header)6472 btr_blob_get_part_len(
6473 /*==================*/
6474 const byte* blob_header) /*!< in: blob header */
6475 {
6476 return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
6477 }
6478
6479 /*******************************************************************//**
6480 Returns the page number where the next BLOB part is stored.
6481 @return page number or FIL_NULL if no more pages */
6482 static
6483 ulint
btr_blob_get_next_page_no(const byte * blob_header)6484 btr_blob_get_next_page_no(
6485 /*======================*/
6486 const byte* blob_header) /*!< in: blob header */
6487 {
6488 return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
6489 }
6490
6491 /*******************************************************************//**
6492 Deallocate a buffer block that was reserved for a BLOB part. */
6493 static
6494 void
btr_blob_free(dict_index_t * index,buf_block_t * block,ibool all,mtr_t * mtr)6495 btr_blob_free(
6496 /*==========*/
6497 dict_index_t* index, /*!< in: index */
6498 buf_block_t* block, /*!< in: buffer block */
6499 ibool all, /*!< in: TRUE=remove also the compressed page
6500 if there is one */
6501 mtr_t* mtr) /*!< in: mini-transaction to commit */
6502 {
6503 buf_pool_t* buf_pool = buf_pool_from_block(block);
6504 ulint space = block->page.id.space();
6505 ulint page_no = block->page.id.page_no();
6506
6507 ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
6508
6509 mtr_commit(mtr);
6510
6511 buf_pool_mutex_enter(buf_pool);
6512
6513 /* Only free the block if it is still allocated to
6514 the same file page. */
6515
6516 if (buf_block_get_state(block)
6517 == BUF_BLOCK_FILE_PAGE
6518 && block->page.id.space() == space
6519 && block->page.id.page_no() == page_no) {
6520
6521 if (!buf_LRU_free_page(&block->page, all)
6522 && all && block->page.zip.data) {
6523 /* Attempt to deallocate the uncompressed page
6524 if the whole block cannot be deallocted. */
6525
6526 buf_LRU_free_page(&block->page, false);
6527 }
6528 }
6529
6530 buf_pool_mutex_exit(buf_pool);
6531 }
6532
/** Helper class used while writing blob pages, during insert or update.
It commits and restarts the mini-transaction that holds the latches on
the clustered index record, so that the redo log does not grow without
bound while a large BLOB is being written page by page. */
struct btr_blob_log_check_t {
	/** Persistent cursor on a clustered index record with blobs. */
	btr_pcur_t*	m_pcur;
	/** Mini transaction holding the latches for m_pcur */
	mtr_t*		m_mtr;
	/** rec_get_offsets(rec, index); offset of clust_rec */
	const ulint*	m_offsets;
	/** The block containing clustered record */
	buf_block_t**	m_block;
	/** The clustered record pointer */
	rec_t**		m_rec;
	/** The blob operation code */
	enum blob_op	m_op;

	/** Constructor
	@param[in]	pcur		persistent cursor on a clustered
					index record with blobs.
	@param[in]	mtr		mini-transaction holding latches for
					pcur.
	@param[in]	offsets		offsets of the clust_rec
	@param[in,out]	block		record block containing pcur record
	@param[in,out]	rec		the clustered record pointer
	@param[in]	op		the blob operation code */
	btr_blob_log_check_t(
		btr_pcur_t*	pcur,
		mtr_t*		mtr,
		const ulint*	offsets,
		buf_block_t**	block,
		rec_t**		rec,
		enum blob_op	op)
		: m_pcur(pcur),
		  m_mtr(mtr),
		  m_offsets(offsets),
		  m_block(block),
		  m_rec(rec),
		  m_op(op)
	{
		ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
		ut_ad((*m_block)->frame == page_align(*m_rec));
		ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
	}

	/** Check if there is enough space in log file. Commit and re-start the
	mini transaction. On return, *m_block and *m_rec are refreshed to
	point at the re-latched clustered index record. */
	void check()
	{
		dict_index_t*	index = m_pcur->index();
		ulint		offs = 0;
		ulint		page_no = ULINT_UNDEFINED;
		FlushObserver*	observer = m_mtr->get_flush_observer();

		if (m_op == BTR_STORE_INSERT_BULK) {
			/* In bulk insert the cursor position is not
			stored/restored; instead, remember the record by
			page number and page offset, and buffer-fix the
			block so that it stays in the buffer pool across
			the mtr restart. */
			offs = page_offset(*m_rec);
			page_no = page_get_page_no(
				buf_block_get_frame(*m_block));

			buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__);
		} else {
			btr_pcur_store_position(m_pcur, m_mtr);
		}
		m_mtr->commit();

		DEBUG_SYNC_C("blob_write_middle");

		/* While no latches are held, wait for log space if needed. */
		log_free_check();

		DEBUG_SYNC_C("blob_write_middle_after_check");

		/* Restart the mtr, carrying over the log mode, the named
		space and the flush observer of the committed mtr. */
		const mtr_log_t log_mode = m_mtr->get_log_mode();
		m_mtr->start();
		m_mtr->set_log_mode(log_mode);
		m_mtr->set_named_space(index->space);
		m_mtr->set_flush_observer(observer);

		if (m_op == BTR_STORE_INSERT_BULK) {
			/* Re-latch the remembered page and rebuild the
			page cursor from the saved page offset. */
			page_id_t       page_id(dict_index_get_space(index),
						page_no);
			page_size_t     page_size(dict_table_page_size(
						index->table));
			page_cur_t*	page_cur = &m_pcur->btr_cur.page_cur;

			mtr_x_lock(dict_index_get_lock(index), m_mtr);
			page_cur->block = btr_block_get(
				page_id, page_size, RW_X_LATCH, index, m_mtr);
			page_cur->rec = buf_block_get_frame(page_cur->block)
				+ offs;

			/* Release the buffer fix taken before the commit. */
			buf_block_buf_fix_dec(page_cur->block);
		} else {
			ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
			bool ret = btr_pcur_restore_position(
				BTR_MODIFY_LEAF | BTR_MODIFY_EXTERNAL,
				m_pcur, m_mtr);

			ut_a(ret);
		}

		*m_block	= btr_pcur_get_block(m_pcur);
		*m_rec		= btr_pcur_get_rec(m_pcur);

		/* The record may have moved; revalidate the cached offsets
		for the new record address (debug builds only). */
		ut_d(rec_offs_make_valid(
			*m_rec, index, const_cast<ulint*>(m_offsets)));

		ut_ad(m_mtr->memo_contains_page_flagged(
		      *m_rec,
		      MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)
		      || dict_table_is_intrinsic(index->table));

		ut_ad(mtr_memo_contains_flagged(m_mtr,
		      dict_index_get_lock(index),
		      MTR_MEMO_SX_LOCK | MTR_MEMO_X_LOCK)
		      || dict_table_is_intrinsic(index->table));
	}
};
6648
6649
6650 /*******************************************************************//**
6651 Stores the fields in big_rec_vec to the tablespace and puts pointers to
6652 them in rec. The extern flags in rec will have to be set beforehand.
6653 The fields are stored on pages allocated from leaf node
6654 file segment of the index tree.
6655
6656 TODO: If the allocation extends the tablespace, it will not be redo logged, in
6657 any mini-transaction. Tablespace extension should be redo-logged, so that
6658 recovery will not fail when the big_rec was written to the extended portion of
6659 the file, in case the file was somehow truncated in the crash.
6660
6661 @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
6662 dberr_t
btr_store_big_rec_extern_fields(btr_pcur_t * pcur,const upd_t * upd,ulint * offsets,const big_rec_t * big_rec_vec,mtr_t * btr_mtr,enum blob_op op)6663 btr_store_big_rec_extern_fields(
6664 /*============================*/
6665 btr_pcur_t* pcur, /*!< in/out: a persistent cursor. if
6666 btr_mtr is restarted, then this can
6667 be repositioned. */
6668 const upd_t* upd, /*!< in: update vector */
6669 ulint* offsets, /*!< in/out: rec_get_offsets() on
6670 pcur. the "external storage" flags
6671 in offsets will correctly correspond
6672 to rec when this function returns */
6673 const big_rec_t*big_rec_vec, /*!< in: vector containing fields
6674 to be stored externally */
6675 mtr_t* btr_mtr, /*!< in/out: mtr containing the
6676 latches to the clustered index. can be
6677 committed and restarted. */
6678 enum blob_op op) /*! in: operation code */
6679 {
6680 ulint rec_page_no;
6681 byte* field_ref;
6682 ulint extern_len;
6683 ulint store_len;
6684 ulint page_no;
6685 ulint space_id;
6686 ulint prev_page_no;
6687 ulint hint_page_no;
6688 ulint i;
6689 mtr_t mtr;
6690 mtr_t mtr_bulk;
6691 mem_heap_t* heap = NULL;
6692 page_zip_des_t* page_zip;
6693 z_stream c_stream;
6694 dberr_t error = DB_SUCCESS;
6695 dict_index_t* index = pcur->index();
6696 buf_block_t* rec_block = btr_pcur_get_block(pcur);
6697 rec_t* rec = btr_pcur_get_rec(pcur);
6698
6699 ut_ad(rec_offs_validate(rec, index, offsets));
6700 ut_ad(rec_offs_any_extern(offsets));
6701 ut_ad(btr_mtr);
6702 ut_ad(mtr_memo_contains_flagged(btr_mtr, dict_index_get_lock(index),
6703 MTR_MEMO_X_LOCK
6704 | MTR_MEMO_SX_LOCK)
6705 || dict_table_is_intrinsic(index->table)
6706 || !index->is_committed());
6707 ut_ad(mtr_is_block_fix(
6708 btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX, index->table));
6709 ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
6710 ut_a(dict_index_is_clust(index));
6711
6712 ut_a(dict_table_page_size(index->table)
6713 .equals_to(rec_block->page.size));
6714
6715 btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
6716 &rec, op);
6717 page_zip = buf_block_get_page_zip(rec_block);
6718 space_id = rec_block->page.id.space();
6719 rec_page_no = rec_block->page.id.page_no();
6720 ut_a(fil_page_index_page_check(page_align(rec))
6721 || op == BTR_STORE_INSERT_BULK);
6722
6723 if (page_zip) {
6724 int err;
6725
6726 /* Zlib deflate needs 128 kilobytes for the default
6727 window size, plus 512 << memLevel, plus a few
6728 kilobytes for small objects. We use reduced memLevel
6729 to limit the memory consumption, and preallocate the
6730 heap, hoping to avoid memory fragmentation. */
6731 heap = mem_heap_create(250000);
6732 page_zip_set_alloc(&c_stream, heap);
6733
6734 err = deflateInit2(&c_stream, page_zip_level,
6735 Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
6736 ut_a(err == Z_OK);
6737 }
6738
6739 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6740 /* All pointers to externally stored columns in the record
6741 must either be zero or they must be pointers to inherited
6742 columns, owned by this record or an earlier record version. */
6743 for (i = 0; i < big_rec_vec->n_fields; i++) {
6744 field_ref = btr_rec_get_field_ref(
6745 rec, offsets, big_rec_vec->fields[i].field_no);
6746
6747 ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
6748 /* Either this must be an update in place,
6749 or the BLOB must be inherited, or the BLOB pointer
6750 must be zero (will be written in this function). */
6751 ut_a(op == BTR_STORE_UPDATE
6752 || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
6753 || !memcmp(field_ref, field_ref_zero,
6754 BTR_EXTERN_FIELD_REF_SIZE));
6755 }
6756 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6757
6758 const page_size_t page_size(dict_table_page_size(index->table));
6759
6760 /* Space available in compressed page to carry blob data */
6761 const ulint payload_size_zip = page_size.physical()
6762 - FIL_PAGE_DATA;
6763
6764 /* Space available in uncompressed page to carry blob data */
6765 const ulint payload_size = page_size.physical()
6766 - FIL_PAGE_DATA - BTR_BLOB_HDR_SIZE - FIL_PAGE_DATA_END;
6767
6768 /* We have to create a file segment to the tablespace
6769 for each field and put the pointer to the field in rec */
6770
6771 for (i = 0; i < big_rec_vec->n_fields; i++) {
6772 const ulint field_no = big_rec_vec->fields[i].field_no;
6773
6774 field_ref = btr_rec_get_field_ref(rec, offsets, field_no);
6775 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6776 /* A zero BLOB pointer should have been initially inserted. */
6777 ut_a(!memcmp(field_ref, field_ref_zero,
6778 BTR_EXTERN_FIELD_REF_SIZE));
6779 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6780 extern_len = big_rec_vec->fields[i].len;
6781 UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
6782 extern_len);
6783
6784 ut_a(extern_len > 0);
6785
6786 prev_page_no = FIL_NULL;
6787
6788 if (page_zip) {
6789 int err = deflateReset(&c_stream);
6790 ut_a(err == Z_OK);
6791
6792 c_stream.next_in = (Bytef*)
6793 big_rec_vec->fields[i].data;
6794 c_stream.avail_in = static_cast<uInt>(extern_len);
6795 }
6796
6797 for (ulint blob_npages = 0;; ++blob_npages) {
6798 buf_block_t* block;
6799 page_t* page;
6800 const ulint commit_freq = 4;
6801 ulint r_extents;
6802
6803 ut_ad(page_align(field_ref) == page_align(rec));
6804
6805 if (!(blob_npages % commit_freq)) {
6806
6807 redo_log.check();
6808
6809 field_ref = btr_rec_get_field_ref(
6810 rec, offsets, field_no);
6811
6812 page_zip = buf_block_get_page_zip(rec_block);
6813 rec_page_no = rec_block->page.id.page_no();
6814 }
6815
6816 mtr_start(&mtr);
6817 mtr.set_named_space(index->space);
6818 mtr.set_log_mode(btr_mtr->get_log_mode());
6819 mtr.set_flush_observer(btr_mtr->get_flush_observer());
6820
6821 buf_page_get(rec_block->page.id,
6822 rec_block->page.size, RW_X_LATCH, &mtr);
6823
6824 if (prev_page_no == FIL_NULL) {
6825 hint_page_no = 1 + rec_page_no;
6826 } else {
6827 hint_page_no = prev_page_no + 1;
6828 }
6829
6830 mtr_t *alloc_mtr;
6831
6832 if (op == BTR_STORE_INSERT_BULK) {
6833 mtr_start(&mtr_bulk);
6834 mtr_bulk.set_spaces(mtr);
6835 alloc_mtr = &mtr_bulk;
6836 } else {
6837 alloc_mtr = &mtr;
6838 }
6839
6840 if (!fsp_reserve_free_extents(&r_extents, space_id, 1,
6841 FSP_BLOB, alloc_mtr,
6842 1)) {
6843
6844 mtr_commit(alloc_mtr);
6845 error = DB_OUT_OF_FILE_SPACE;
6846 goto func_exit;
6847 }
6848
6849 block = btr_page_alloc(index, hint_page_no, FSP_NO_DIR,
6850 0, alloc_mtr, &mtr);
6851
6852 alloc_mtr->release_free_extents(r_extents);
6853
6854 if (op == BTR_STORE_INSERT_BULK) {
6855 mtr_commit(&mtr_bulk);
6856 }
6857
6858 ut_a(block != NULL);
6859
6860 page_no = block->page.id.page_no();
6861 page = buf_block_get_frame(block);
6862
6863 if (prev_page_no != FIL_NULL) {
6864 buf_block_t* prev_block;
6865 page_t* prev_page;
6866
6867 prev_block = buf_page_get(
6868 page_id_t(space_id, prev_page_no),
6869 rec_block->page.size,
6870 RW_X_LATCH, &mtr);
6871
6872 buf_block_dbg_add_level(prev_block,
6873 SYNC_EXTERN_STORAGE);
6874 prev_page = buf_block_get_frame(prev_block);
6875
6876 if (page_zip) {
6877 mlog_write_ulint(
6878 prev_page + FIL_PAGE_NEXT,
6879 page_no, MLOG_4BYTES, &mtr);
6880 memcpy(buf_block_get_page_zip(
6881 prev_block)
6882 ->data + FIL_PAGE_NEXT,
6883 prev_page + FIL_PAGE_NEXT, 4);
6884 } else {
6885 mlog_write_ulint(
6886 prev_page + FIL_PAGE_DATA
6887 + BTR_BLOB_HDR_NEXT_PAGE_NO,
6888 page_no, MLOG_4BYTES, &mtr);
6889 }
6890
6891 } else if (dict_index_is_online_ddl(index)) {
6892 row_log_table_blob_alloc(index, page_no);
6893 }
6894
6895 if (page_zip) {
6896 int err;
6897 page_zip_des_t* blob_page_zip;
6898
6899 /* Write FIL_PAGE_TYPE to the redo log
6900 separately, before logging any other
6901 changes to the page, so that the debug
6902 assertions in
6903 recv_parse_or_apply_log_rec_body() can
6904 be made simpler. Before InnoDB Plugin
6905 1.0.4, the initialization of
6906 FIL_PAGE_TYPE was logged as part of
6907 the mlog_log_string() below. */
6908
6909 mlog_write_ulint(page + FIL_PAGE_TYPE,
6910 prev_page_no == FIL_NULL
6911 ? FIL_PAGE_TYPE_ZBLOB
6912 : FIL_PAGE_TYPE_ZBLOB2,
6913 MLOG_2BYTES, &mtr);
6914
6915 c_stream.next_out = page
6916 + FIL_PAGE_DATA;
6917 c_stream.avail_out = static_cast<uInt>(
6918 payload_size_zip);
6919
6920 err = deflate(&c_stream, Z_FINISH);
6921 ut_a(err == Z_OK || err == Z_STREAM_END);
6922 ut_a(err == Z_STREAM_END
6923 || c_stream.avail_out == 0);
6924
6925 /* Write the "next BLOB page" pointer */
6926 mlog_write_ulint(page + FIL_PAGE_NEXT,
6927 FIL_NULL, MLOG_4BYTES, &mtr);
6928 /* Initialize the unused "prev page" pointer */
6929 mlog_write_ulint(page + FIL_PAGE_PREV,
6930 FIL_NULL, MLOG_4BYTES, &mtr);
6931 /* Write a back pointer to the record
6932 into the otherwise unused area. This
6933 information could be useful in
6934 debugging. Later, we might want to
6935 implement the possibility to relocate
6936 BLOB pages. Then, we would need to be
6937 able to adjust the BLOB pointer in the
6938 record. We do not store the heap
6939 number of the record, because it can
6940 change in page_zip_reorganize() or
6941 btr_page_reorganize(). However, also
6942 the page number of the record may
6943 change when B-tree nodes are split or
6944 merged.
6945 NOTE: FIL_PAGE_FILE_FLUSH_LSN space is
6946 used by R-tree index for a Split Sequence
6947 Number */
6948 ut_ad(!dict_index_is_spatial(index));
6949
6950 mlog_write_ulint(page
6951 + FIL_PAGE_FILE_FLUSH_LSN,
6952 space_id,
6953 MLOG_4BYTES, &mtr);
6954 mlog_write_ulint(page
6955 + FIL_PAGE_FILE_FLUSH_LSN + 4,
6956 rec_page_no,
6957 MLOG_4BYTES, &mtr);
6958
6959 /* Zero out the unused part of the page. */
6960 memset(page + page_zip_get_size(page_zip)
6961 - c_stream.avail_out,
6962 0, c_stream.avail_out);
6963 mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN,
6964 page_zip_get_size(page_zip)
6965 - FIL_PAGE_FILE_FLUSH_LSN,
6966 &mtr);
6967 /* Copy the page to compressed storage,
6968 because it will be flushed to disk
6969 from there. */
6970 blob_page_zip = buf_block_get_page_zip(block);
6971 ut_ad(blob_page_zip);
6972 ut_ad(page_zip_get_size(blob_page_zip)
6973 == page_zip_get_size(page_zip));
6974 memcpy(blob_page_zip->data, page,
6975 page_zip_get_size(page_zip));
6976
6977 if (err == Z_OK && prev_page_no != FIL_NULL) {
6978
6979 goto next_zip_page;
6980 }
6981
6982 if (err == Z_STREAM_END) {
6983 mach_write_to_4(field_ref
6984 + BTR_EXTERN_LEN, 0);
6985 mach_write_to_4(field_ref
6986 + BTR_EXTERN_LEN + 4,
6987 c_stream.total_in);
6988 } else {
6989 memset(field_ref + BTR_EXTERN_LEN,
6990 0, 8);
6991 }
6992
6993 if (prev_page_no == FIL_NULL) {
6994 ut_ad(blob_npages == 0);
6995 mach_write_to_4(field_ref
6996 + BTR_EXTERN_SPACE_ID,
6997 space_id);
6998
6999 mach_write_to_4(field_ref
7000 + BTR_EXTERN_PAGE_NO,
7001 page_no);
7002
7003 mach_write_to_4(field_ref
7004 + BTR_EXTERN_OFFSET,
7005 FIL_PAGE_NEXT);
7006 }
7007
7008 /* We compress a page when finish bulk insert.*/
7009 if (op != BTR_STORE_INSERT_BULK) {
7010 page_zip_write_blob_ptr(
7011 page_zip, rec, index, offsets,
7012 field_no, &mtr);
7013 }
7014
7015 next_zip_page:
7016 prev_page_no = page_no;
7017
7018 /* Commit mtr and release the
7019 uncompressed page frame to save memory. */
7020 btr_blob_free(index, block, FALSE, &mtr);
7021
7022 if (err == Z_STREAM_END) {
7023 break;
7024 }
7025 } else {
7026 mlog_write_ulint(page + FIL_PAGE_TYPE,
7027 FIL_PAGE_TYPE_BLOB,
7028 MLOG_2BYTES, &mtr);
7029
7030 if (extern_len > payload_size) {
7031 store_len = payload_size;
7032 } else {
7033 store_len = extern_len;
7034 }
7035
7036 mlog_write_string(page + FIL_PAGE_DATA
7037 + BTR_BLOB_HDR_SIZE,
7038 (const byte*)
7039 big_rec_vec->fields[i].data
7040 + big_rec_vec->fields[i].len
7041 - extern_len,
7042 store_len, &mtr);
7043 mlog_write_ulint(page + FIL_PAGE_DATA
7044 + BTR_BLOB_HDR_PART_LEN,
7045 store_len, MLOG_4BYTES, &mtr);
7046 mlog_write_ulint(page + FIL_PAGE_DATA
7047 + BTR_BLOB_HDR_NEXT_PAGE_NO,
7048 FIL_NULL, MLOG_4BYTES, &mtr);
7049
7050 extern_len -= store_len;
7051
7052 mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
7053 MLOG_4BYTES, &mtr);
7054 mlog_write_ulint(field_ref
7055 + BTR_EXTERN_LEN + 4,
7056 big_rec_vec->fields[i].len
7057 - extern_len,
7058 MLOG_4BYTES, &mtr);
7059
7060 if (prev_page_no == FIL_NULL) {
7061 ut_ad(blob_npages == 0);
7062 mlog_write_ulint(field_ref
7063 + BTR_EXTERN_SPACE_ID,
7064 space_id, MLOG_4BYTES,
7065 &mtr);
7066
7067 mlog_write_ulint(field_ref
7068 + BTR_EXTERN_PAGE_NO,
7069 page_no, MLOG_4BYTES,
7070 &mtr);
7071
7072 mlog_write_ulint(field_ref
7073 + BTR_EXTERN_OFFSET,
7074 FIL_PAGE_DATA,
7075 MLOG_4BYTES,
7076 &mtr);
7077 }
7078
7079 prev_page_no = page_no;
7080
7081 mtr_commit(&mtr);
7082
7083 if (extern_len == 0) {
7084 break;
7085 }
7086 }
7087 }
7088
7089 DBUG_EXECUTE_IF("btr_store_big_rec_extern",
7090 error = DB_OUT_OF_FILE_SPACE;
7091 goto func_exit;);
7092
7093 rec_offs_make_nth_extern(offsets, field_no);
7094 }
7095
7096 func_exit:
7097 if (page_zip) {
7098 deflateEnd(&c_stream);
7099 }
7100
7101 if (heap != NULL) {
7102 mem_heap_free(heap);
7103 }
7104
7105 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
7106 /* All pointers to externally stored columns in the record
7107 must be valid. */
7108 for (i = 0; i < rec_offs_n_fields(offsets); i++) {
7109 if (!rec_offs_nth_extern(offsets, i)) {
7110 continue;
7111 }
7112
7113 field_ref = btr_rec_get_field_ref(rec, offsets, i);
7114
7115 /* The pointer must not be zero if the operation
7116 succeeded. */
7117 ut_a(0 != memcmp(field_ref, field_ref_zero,
7118 BTR_EXTERN_FIELD_REF_SIZE)
7119 || error != DB_SUCCESS);
7120 /* The column must not be disowned by this record. */
7121 ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
7122 }
7123 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
7124 return(error);
7125 }
7126
7127 /*******************************************************************//**
7128 Check the FIL_PAGE_TYPE on an uncompressed BLOB page. */
7129 static
7130 void
btr_check_blob_fil_page_type(ulint space_id,ulint page_no,const page_t * page,ibool read)7131 btr_check_blob_fil_page_type(
7132 /*=========================*/
7133 ulint space_id, /*!< in: space id */
7134 ulint page_no, /*!< in: page number */
7135 const page_t* page, /*!< in: page */
7136 ibool read) /*!< in: TRUE=read, FALSE=purge */
7137 {
7138 ulint type = fil_page_get_type(page);
7139
7140 ut_a(space_id == page_get_space_id(page));
7141 ut_a(page_no == page_get_page_no(page));
7142
7143 if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
7144 ulint flags = fil_space_get_flags(space_id);
7145
7146 #ifndef UNIV_DEBUG /* Improve debug test coverage */
7147 if (dict_tf_get_format(flags) == UNIV_FORMAT_A) {
7148 /* Old versions of InnoDB did not initialize
7149 FIL_PAGE_TYPE on BLOB pages. Do not print
7150 anything about the type mismatch when reading
7151 a BLOB page that is in Antelope format.*/
7152 return;
7153 }
7154 #endif /* !UNIV_DEBUG */
7155
7156 ib::fatal() << "FIL_PAGE_TYPE=" << type
7157 << " on BLOB " << (read ? "read" : "purge")
7158 << " space " << space_id << " page " << page_no
7159 << " flags " << flags;
7160 }
7161 }
7162
7163 /*******************************************************************//**
7164 Frees the space in an externally stored field to the file space
7165 management if the field in data is owned by the externally stored field,
7166 in a rollback we may have the additional condition that the field must
7167 not be inherited. */
7168 void
btr_free_externally_stored_field(dict_index_t * index,byte * field_ref,const rec_t * rec,const ulint * offsets,page_zip_des_t * page_zip,ulint i,bool rollback,mtr_t * local_mtr)7169 btr_free_externally_stored_field(
7170 /*=============================*/
7171 dict_index_t* index, /*!< in: index of the data, the index
7172 tree MUST be X-latched; if the tree
7173 height is 1, then also the root page
7174 must be X-latched! (this is relevant
7175 in the case this function is called
7176 from purge where 'data' is located on
7177 an undo log page, not an index
7178 page) */
7179 byte* field_ref, /*!< in/out: field reference */
7180 const rec_t* rec, /*!< in: record containing field_ref, for
7181 page_zip_write_blob_ptr(), or NULL */
7182 const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
7183 or NULL */
7184 page_zip_des_t* page_zip, /*!< in: compressed page corresponding
7185 to rec, or NULL if rec == NULL */
7186 ulint i, /*!< in: field number of field_ref;
7187 ignored if rec == NULL */
7188 bool rollback, /*!< in: performing rollback? */
7189 mtr_t* local_mtr) /*!< in: mtr
7190 containing the latch to data an an
7191 X-latch to the index tree */
7192 {
7193 page_t* page;
7194 const ulint space_id = mach_read_from_4(
7195 field_ref + BTR_EXTERN_SPACE_ID);
7196 const ulint start_page = mach_read_from_4(
7197 field_ref + BTR_EXTERN_PAGE_NO);
7198 ulint page_no;
7199 ulint next_page_no;
7200 mtr_t mtr;
7201
7202 ut_ad(dict_index_is_clust(index));
7203 ut_ad(mtr_memo_contains_flagged(local_mtr, dict_index_get_lock(index),
7204 MTR_MEMO_X_LOCK
7205 | MTR_MEMO_SX_LOCK)
7206 || dict_table_is_intrinsic(index->table));
7207 ut_ad(mtr_is_page_fix(
7208 local_mtr, field_ref, MTR_MEMO_PAGE_X_FIX, index->table));
7209 ut_ad(!rec || rec_offs_validate(rec, index, offsets));
7210 ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
7211 ut_ad(local_mtr->is_named_space(
7212 page_get_space_id(page_align(field_ref))));
7213
7214 if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
7215 BTR_EXTERN_FIELD_REF_SIZE))) {
7216 /* In the rollback, we may encounter a clustered index
7217 record with some unwritten off-page columns. There is
7218 nothing to free then. */
7219 ut_a(rollback);
7220 return;
7221 }
7222
7223 ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN)
7224 & ~((BTR_EXTERN_OWNER_FLAG
7225 | BTR_EXTERN_INHERITED_FLAG) << 24)));
7226 ut_ad(space_id == index->space);
7227
7228 const page_size_t ext_page_size(dict_table_page_size(index->table));
7229 const page_size_t& rec_page_size(rec == NULL
7230 ? univ_page_size
7231 : ext_page_size);
7232 if (rec == NULL) {
7233 /* This is a call from row_purge_upd_exist_or_extern(). */
7234 ut_ad(!page_zip);
7235 }
7236
7237 for (;;) {
7238 #ifdef UNIV_DEBUG
7239 buf_block_t* rec_block;
7240 #endif /* UNIV_DEBUG */
7241 buf_block_t* ext_block;
7242
7243 mtr_start(&mtr);
7244 mtr.set_spaces(*local_mtr);
7245 mtr.set_log_mode(local_mtr->get_log_mode());
7246
7247 ut_ad(!dict_table_is_temporary(index->table)
7248 || local_mtr->get_log_mode() == MTR_LOG_NO_REDO);
7249
7250 const page_t* p = page_align(field_ref);
7251
7252 const page_id_t page_id(page_get_space_id(p),
7253 page_get_page_no(p));
7254
7255 #ifdef UNIV_DEBUG
7256 rec_block =
7257 #endif /* UNIV_DEBUG */
7258 buf_page_get(page_id, rec_page_size, RW_X_LATCH, &mtr);
7259
7260 buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
7261 page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
7262
7263 if (/* There is no external storage data */
7264 page_no == FIL_NULL
7265 /* This field does not own the externally stored field */
7266 || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
7267 & BTR_EXTERN_OWNER_FLAG)
7268 /* Rollback and inherited field */
7269 || (rollback
7270 && (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
7271 & BTR_EXTERN_INHERITED_FLAG))) {
7272
7273 /* Do not free */
7274 mtr_commit(&mtr);
7275
7276 return;
7277 }
7278
7279 if (page_no == start_page && dict_index_is_online_ddl(index)) {
7280 row_log_table_blob_free(index, start_page);
7281 }
7282
7283 ext_block = buf_page_get(
7284 page_id_t(space_id, page_no), ext_page_size,
7285 RW_X_LATCH, &mtr);
7286
7287 buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
7288 page = buf_block_get_frame(ext_block);
7289
7290 if (ext_page_size.is_compressed()) {
7291 /* Note that page_zip will be NULL
7292 in row_purge_upd_exist_or_extern(). */
7293 switch (fil_page_get_type(page)) {
7294 case FIL_PAGE_TYPE_ZBLOB:
7295 case FIL_PAGE_TYPE_ZBLOB2:
7296 break;
7297 default:
7298 ut_error;
7299 }
7300 next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
7301
7302 btr_page_free_low(index, ext_block, ULINT_UNDEFINED,
7303 &mtr);
7304
7305 if (page_zip != NULL) {
7306 mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
7307 next_page_no);
7308 mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
7309 0);
7310 page_zip_write_blob_ptr(page_zip, rec, index,
7311 offsets, i, &mtr);
7312 } else {
7313 mlog_write_ulint(field_ref
7314 + BTR_EXTERN_PAGE_NO,
7315 next_page_no,
7316 MLOG_4BYTES, &mtr);
7317 mlog_write_ulint(field_ref
7318 + BTR_EXTERN_LEN + 4, 0,
7319 MLOG_4BYTES, &mtr);
7320 }
7321 } else {
7322 ut_a(!page_zip);
7323 btr_check_blob_fil_page_type(space_id, page_no, page,
7324 FALSE);
7325
7326 next_page_no = mach_read_from_4(
7327 page + FIL_PAGE_DATA
7328 + BTR_BLOB_HDR_NEXT_PAGE_NO);
7329
7330 btr_page_free_low(index, ext_block, ULINT_UNDEFINED,
7331 &mtr);
7332
7333 mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
7334 next_page_no,
7335 MLOG_4BYTES, &mtr);
7336 /* Zero out the BLOB length. If the server
7337 crashes during the execution of this function,
7338 trx_rollback_or_clean_all_recovered() could
7339 dereference the half-deleted BLOB, fetching a
7340 wrong prefix for the BLOB. */
7341 mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
7342 0,
7343 MLOG_4BYTES, &mtr);
7344 }
7345
7346 /* Commit mtr and release the BLOB block to save memory. */
7347 btr_blob_free(index, ext_block, TRUE, &mtr);
7348 }
7349 }
7350
7351 /***********************************************************//**
7352 Frees the externally stored fields for a record. */
7353 static
7354 void
btr_rec_free_externally_stored_fields(dict_index_t * index,rec_t * rec,const ulint * offsets,page_zip_des_t * page_zip,bool rollback,mtr_t * mtr)7355 btr_rec_free_externally_stored_fields(
7356 /*==================================*/
7357 dict_index_t* index, /*!< in: index of the data, the index
7358 tree MUST be X-latched */
7359 rec_t* rec, /*!< in/out: record */
7360 const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
7361 page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
7362 part will be updated, or NULL */
7363 bool rollback,/*!< in: performing rollback? */
7364 mtr_t* mtr) /*!< in: mini-transaction handle which contains
7365 an X-latch to record page and to the index
7366 tree */
7367 {
7368 ulint n_fields;
7369 ulint i;
7370
7371 ut_ad(rec_offs_validate(rec, index, offsets));
7372 ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table));
7373 /* Free possible externally stored fields in the record */
7374
7375 ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
7376 n_fields = rec_offs_n_fields(offsets);
7377
7378 for (i = 0; i < n_fields; i++) {
7379 if (rec_offs_nth_extern(offsets, i)) {
7380 btr_free_externally_stored_field(
7381 index, btr_rec_get_field_ref(rec, offsets, i),
7382 rec, offsets, page_zip, i, rollback, mtr);
7383 }
7384 }
7385 }
7386
7387 /***********************************************************//**
7388 Frees the externally stored fields for a record, if the field is mentioned
7389 in the update vector. */
7390 static
7391 void
btr_rec_free_updated_extern_fields(dict_index_t * index,rec_t * rec,page_zip_des_t * page_zip,const ulint * offsets,const upd_t * update,bool rollback,mtr_t * mtr)7392 btr_rec_free_updated_extern_fields(
7393 /*===============================*/
7394 dict_index_t* index, /*!< in: index of rec; the index tree MUST be
7395 X-latched */
7396 rec_t* rec, /*!< in/out: record */
7397 page_zip_des_t* page_zip,/*!< in: compressed page whose uncompressed
7398 part will be updated, or NULL */
7399 const ulint* offsets,/*!< in: rec_get_offsets(rec, index) */
7400 const upd_t* update, /*!< in: update vector */
7401 bool rollback,/*!< in: performing rollback? */
7402 mtr_t* mtr) /*!< in: mini-transaction handle which contains
7403 an X-latch to record page and to the tree */
7404 {
7405 ulint n_fields;
7406 ulint i;
7407
7408 ut_ad(rec_offs_validate(rec, index, offsets));
7409 ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table));
7410
7411 /* Free possible externally stored fields in the record */
7412
7413 n_fields = upd_get_n_fields(update);
7414
7415 for (i = 0; i < n_fields; i++) {
7416 const upd_field_t* ufield = upd_get_nth_field(update, i);
7417
7418 if (rec_offs_nth_extern(offsets, ufield->field_no)) {
7419 ulint len;
7420 byte* data = rec_get_nth_field(
7421 rec, offsets, ufield->field_no, &len);
7422 ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
7423
7424 btr_free_externally_stored_field(
7425 index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
7426 rec, offsets, page_zip,
7427 ufield->field_no, rollback, mtr);
7428 }
7429 }
7430 }
7431
7432 /*******************************************************************//**
7433 Copies the prefix of an uncompressed BLOB. The clustered index record
7434 that points to this BLOB must be protected by a lock or a page latch.
7435 @return number of bytes written to buf */
7436 static
7437 ulint
btr_copy_blob_prefix(byte * buf,ulint len,ulint space_id,ulint page_no,ulint offset)7438 btr_copy_blob_prefix(
7439 /*=================*/
7440 byte* buf, /*!< out: the externally stored part of
7441 the field, or a prefix of it */
7442 ulint len, /*!< in: length of buf, in bytes */
7443 ulint space_id,/*!< in: space id of the BLOB pages */
7444 ulint page_no,/*!< in: page number of the first BLOB page */
7445 ulint offset) /*!< in: offset on the first BLOB page */
7446 {
7447 ulint copied_len = 0;
7448
7449 for (;;) {
7450 mtr_t mtr;
7451 buf_block_t* block;
7452 const page_t* page;
7453 const byte* blob_header;
7454 ulint part_len;
7455 ulint copy_len;
7456
7457 mtr_start(&mtr);
7458
7459 block = buf_page_get(page_id_t(space_id, page_no),
7460 univ_page_size, RW_S_LATCH, &mtr);
7461 buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
7462 page = buf_block_get_frame(block);
7463
7464 btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);
7465
7466 blob_header = page + offset;
7467 part_len = btr_blob_get_part_len(blob_header);
7468 copy_len = ut_min(part_len, len - copied_len);
7469
7470 memcpy(buf + copied_len,
7471 blob_header + BTR_BLOB_HDR_SIZE, copy_len);
7472 copied_len += copy_len;
7473
7474 page_no = btr_blob_get_next_page_no(blob_header);
7475
7476 mtr_commit(&mtr);
7477
7478 if (page_no == FIL_NULL || copy_len != part_len) {
7479 UNIV_MEM_ASSERT_RW(buf, copied_len);
7480 return(copied_len);
7481 }
7482
7483 /* On other BLOB pages except the first the BLOB header
7484 always is at the page data start: */
7485
7486 offset = FIL_PAGE_DATA;
7487
7488 ut_ad(copied_len <= len);
7489 }
7490 }
7491
/** Copies the prefix of a compressed BLOB.
The clustered index record that points to this BLOB must be protected
by a lock or a page latch.
@param[out]	buf		the externally stored part of the field,
or a prefix of it
@param[in]	len		length of buf, in bytes
@param[in]	page_size	compressed BLOB page size
@param[in]	space_id	space id of the BLOB pages
@param[in]	page_no		page number of the first BLOB page
@param[in]	offset		offset on the first BLOB page
@return number of bytes written to buf */
static
ulint
btr_copy_zblob_prefix(
	byte*			buf,
	ulint			len,
	const page_size_t&	page_size,
	ulint			space_id,
	ulint			page_no,
	ulint			offset)
{
	/* The first page of a chain is FIL_PAGE_TYPE_ZBLOB; the
	continuation pages are FIL_PAGE_TYPE_ZBLOB2 (set at the
	bottom of the loop). */
	ulint		page_type = FIL_PAGE_TYPE_ZBLOB;
	mem_heap_t*	heap;
	int		err;
	z_stream	d_stream;

	d_stream.next_out = buf;
	d_stream.avail_out = static_cast<uInt>(len);
	d_stream.next_in = Z_NULL;
	d_stream.avail_in = 0;

	/* Zlib inflate needs 32 kilobytes for the default
	window size, plus a few kilobytes for small objects. */
	heap = mem_heap_create(40000);
	page_zip_set_alloc(&d_stream, heap);

	ut_ad(page_size.is_compressed());
	ut_ad(space_id);

	err = inflateInit(&d_stream);
	ut_a(err == Z_OK);

	for (;;) {
		buf_page_t*	bpage;
		ulint		next_page_no;

		/* There is no latch on bpage directly.  Instead,
		bpage is protected by the B-tree page latch that
		is being held on the clustered index record, or,
		in row_merge_copy_blobs(), by an exclusive table lock. */
		bpage = buf_page_get_zip(page_id_t(space_id, page_no),
					 page_size);

		if (UNIV_UNLIKELY(!bpage)) {
			ib::error() << "Cannot load compressed BLOB "
				<< page_id_t(space_id, page_no);
			goto func_exit;
		}

		if (UNIV_UNLIKELY
		    (fil_page_get_type(bpage->zip.data) != page_type)) {

			ib::error() << "Unexpected type "
				<< fil_page_get_type(bpage->zip.data)
				<< " of compressed BLOB page "
				<< page_id_t(space_id, page_no);

			ut_ad(0);
			goto end_of_blob;
		}

		next_page_no = mach_read_from_4(bpage->zip.data + offset);

		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
			/* When the BLOB begins at page header,
			the compressed data payload does not
			immediately follow the next page pointer. */
			offset = FIL_PAGE_DATA;
		} else {
			/* First page of the chain: the payload starts
			right after the 4-byte next-page pointer. */
			offset += 4;
		}

		/* Feed this page's compressed payload to zlib. */
		d_stream.next_in = bpage->zip.data + offset;
		d_stream.avail_in = static_cast<uInt>(page_size.physical()
						      - offset);

		err = inflate(&d_stream, Z_NO_FLUSH);
		switch (err) {
		case Z_OK:
			if (!d_stream.avail_out) {
				/* The output buffer is full: the
				requested prefix is complete. */
				goto end_of_blob;
			}
			break;
		case Z_STREAM_END:
			if (next_page_no == FIL_NULL) {
				goto end_of_blob;
			}
			/* fall through */
		default:
inflate_error:
			ib::error() << "inflate() of compressed BLOB page "
				<< page_id_t(space_id, page_no)
				<< " returned " << err
				<< " (" << d_stream.msg << ")";
			/* fall through */

		case Z_BUF_ERROR:
			goto end_of_blob;
		}

		if (next_page_no == FIL_NULL) {
			/* Last page of the chain: drain zlib. */
			if (!d_stream.avail_in) {
				ib::error()
					<< "Unexpected end of compressed "
					<< "BLOB page "
					<< page_id_t(space_id, page_no);
			} else {
				err = inflate(&d_stream, Z_FINISH);
				switch (err) {
				case Z_STREAM_END:
				case Z_BUF_ERROR:
					break;
				default:
					goto inflate_error;
				}
			}

end_of_blob:
			buf_page_release_zip(bpage);
			goto func_exit;
		}

		buf_page_release_zip(bpage);

		/* On other BLOB pages except the first
		the BLOB header always is at the page header: */

		page_no = next_page_no;
		offset = FIL_PAGE_NEXT;
		page_type = FIL_PAGE_TYPE_ZBLOB2;
	}

func_exit:
	inflateEnd(&d_stream);
	mem_heap_free(heap);
	UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
	return(d_stream.total_out);
}
7638
7639 /** Copies the prefix of an externally stored field of a record.
7640 The clustered index record that points to this BLOB must be protected
7641 by a lock or a page latch.
7642 @param[out] buf the externally stored part of the
7643 field, or a prefix of it
7644 @param[in] len length of buf, in bytes
7645 @param[in] page_size BLOB page size
7646 @param[in] space_id space id of the first BLOB page
7647 @param[in] page_no page number of the first BLOB page
7648 @param[in] offset offset on the first BLOB page
7649 @return number of bytes written to buf */
7650 static
7651 ulint
btr_copy_externally_stored_field_prefix_low(byte * buf,ulint len,const page_size_t & page_size,ulint space_id,ulint page_no,ulint offset)7652 btr_copy_externally_stored_field_prefix_low(
7653 byte* buf,
7654 ulint len,
7655 const page_size_t& page_size,
7656 ulint space_id,
7657 ulint page_no,
7658 ulint offset)
7659 {
7660 if (len == 0) {
7661 return(0);
7662 }
7663
7664 if (page_size.is_compressed()) {
7665 return(btr_copy_zblob_prefix(buf, len, page_size,
7666 space_id, page_no, offset));
7667 } else {
7668 ut_ad(page_size.equals_to(univ_page_size));
7669 return(btr_copy_blob_prefix(buf, len, space_id,
7670 page_no, offset));
7671 }
7672 }
7673
7674 /** Copies the prefix of an externally stored field of a record.
7675 The clustered index record must be protected by a lock or a page latch.
7676 @param[out] buf the field, or a prefix of it
7677 @param[in] len length of buf, in bytes
7678 @param[in] page_size BLOB page size
7679 @param[in] data 'internally' stored part of the field
7680 containing also the reference to the external part; must be protected by
7681 a lock or a page latch
7682 @param[in] local_len length of data, in bytes
7683 @return the length of the copied field, or 0 if the column was being
7684 or has been deleted */
7685 ulint
btr_copy_externally_stored_field_prefix(byte * buf,ulint len,const page_size_t & page_size,const byte * data,ulint local_len)7686 btr_copy_externally_stored_field_prefix(
7687 byte* buf,
7688 ulint len,
7689 const page_size_t& page_size,
7690 const byte* data,
7691 ulint local_len)
7692 {
7693 ulint space_id;
7694 ulint page_no;
7695 ulint offset;
7696
7697 ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
7698
7699 local_len -= BTR_EXTERN_FIELD_REF_SIZE;
7700
7701 if (UNIV_UNLIKELY(local_len >= len)) {
7702 memcpy(buf, data, len);
7703 return(len);
7704 }
7705
7706 memcpy(buf, data, local_len);
7707 data += local_len;
7708
7709 ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
7710
7711 if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
7712 /* The externally stored part of the column has been
7713 (partially) deleted. Signal the half-deleted BLOB
7714 to the caller. */
7715
7716 return(0);
7717 }
7718
7719 space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);
7720
7721 page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);
7722
7723 offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);
7724
7725 return(local_len
7726 + btr_copy_externally_stored_field_prefix_low(buf + local_len,
7727 len - local_len,
7728 page_size,
7729 space_id, page_no,
7730 offset));
7731 }
7732
7733 /** Copies an externally stored field of a record to mem heap.
7734 The clustered index record must be protected by a lock or a page latch.
7735 @param[out] len length of the whole field
7736 @param[in] data 'internally' stored part of the field
7737 containing also the reference to the external part; must be protected by
7738 a lock or a page latch
7739 @param[in] page_size BLOB page size
7740 @param[in] local_len length of data
7741 @param[in,out] heap mem heap
7742 @return the whole field copied to heap */
7743 byte*
btr_copy_externally_stored_field(ulint * len,const byte * data,const page_size_t & page_size,ulint local_len,mem_heap_t * heap)7744 btr_copy_externally_stored_field(
7745 ulint* len,
7746 const byte* data,
7747 const page_size_t& page_size,
7748 ulint local_len,
7749 mem_heap_t* heap)
7750 {
7751 ulint space_id;
7752 ulint page_no;
7753 ulint offset;
7754 ulint extern_len;
7755 byte* buf;
7756
7757 ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
7758
7759 local_len -= BTR_EXTERN_FIELD_REF_SIZE;
7760
7761 space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);
7762
7763 page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);
7764
7765 offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);
7766
7767 /* Currently a BLOB cannot be bigger than 4 GB; we
7768 leave the 4 upper bytes in the length field unused */
7769
7770 extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);
7771
7772 buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);
7773
7774 memcpy(buf, data, local_len);
7775 *len = local_len
7776 + btr_copy_externally_stored_field_prefix_low(buf + local_len,
7777 extern_len,
7778 page_size,
7779 space_id,
7780 page_no, offset);
7781
7782 return(buf);
7783 }
7784
7785 /** Copies an externally stored field of a record to mem heap.
7786 @param[in] rec record in a clustered index; must be
7787 protected by a lock or a page latch
7788 @param[in] offset array returned by rec_get_offsets()
7789 @param[in] page_size BLOB page size
7790 @param[in] no field number
7791 @param[out] len length of the field
7792 @param[in,out] heap mem heap
7793 @return the field copied to heap, or NULL if the field is incomplete */
7794 byte*
btr_rec_copy_externally_stored_field(const rec_t * rec,const ulint * offsets,const page_size_t & page_size,ulint no,ulint * len,mem_heap_t * heap)7795 btr_rec_copy_externally_stored_field(
7796 const rec_t* rec,
7797 const ulint* offsets,
7798 const page_size_t& page_size,
7799 ulint no,
7800 ulint* len,
7801 mem_heap_t* heap)
7802 {
7803 ulint local_len;
7804 const byte* data;
7805
7806 ut_a(rec_offs_nth_extern(offsets, no));
7807
7808 /* An externally stored field can contain some initial
7809 data from the field, and in the last 20 bytes it has the
7810 space id, page number, and offset where the rest of the
7811 field data is stored, and the data length in addition to
7812 the data stored locally. We may need to store some data
7813 locally to get the local record length above the 128 byte
7814 limit so that field offsets are stored in two bytes, and
7815 the extern bit is available in those two bytes. */
7816
7817 data = rec_get_nth_field(rec, offsets, no, &local_len);
7818
7819 ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
7820
7821 if (UNIV_UNLIKELY
7822 (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
7823 field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
7824 /* The externally stored field was not written yet.
7825 This record should only be seen by
7826 recv_recovery_rollback_active() or any
7827 TRX_ISO_READ_UNCOMMITTED transactions. */
7828 return(NULL);
7829 }
7830
7831 return(btr_copy_externally_stored_field(len, data,
7832 page_size, local_len, heap));
7833 }
7834 #endif /* !UNIV_HOTBACKUP */
7835