/*****************************************************************************

Copyright (c) 1994, 2021, Oracle and/or its affiliates.
Copyright (c) 2008, Google Inc.
Copyright (c) 2012, Facebook Inc.

Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2.0,
as published by the Free Software Foundation.

This program is also distributed with certain software (including
but not limited to OpenSSL) that is licensed under separate terms,
as designated in a particular file or component or in included license
documentation. The authors of MySQL hereby grant you an additional
permission to link the program and your derivative works with the
separately licensed software that they have included with MySQL.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License, version 2.0, for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file btr/btr0cur.cc
The index tree cursor

All changes that row operations make to a B-tree or the records
there must go through this module! Undo log records are written here
for every modify or insert of a clustered index record.

NOTE!!!
To make sure we do not run out of disk space during a pessimistic
insert or update, we have to reserve 2 x the height of the index tree
in pages in the tablespace before we start the operation, because
if leaf splitting has been started, it is difficult to undo, except
by crashing the database and doing a roll-forward.
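
For example, with an index tree of height 3, a pessimistic insert or
update must be able to reserve 2 x 3 = 6 free pages in the tablespace
before the operation is started.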

Created 10/16/1994 Heikki Tuuri
*******************************************************/

#include "btr0cur.h"

#ifdef UNIV_NONINL
#include "btr0cur.ic"
#endif

#include "row0upd.h"
#ifndef UNIV_HOTBACKUP
#include "mtr0log.h"
#include "page0page.h"
#include "page0zip.h"
#include "rem0rec.h"
#include "rem0cmp.h"
#include "buf0lru.h"
#include "btr0btr.h"
#include "btr0sea.h"
#include "row0log.h"
#include "row0purge.h"
#include "row0upd.h"
#include "trx0rec.h"
#include "trx0roll.h"
#include "que0que.h"
#include "row0row.h"
#include "srv0srv.h"
#include "ibuf0ibuf.h"
#include "lock0lock.h"
#include "zlib.h"
#include "srv0start.h"

/** Buffered B-tree operation types, introduced as part of delete buffering. */
enum btr_op_t {
	BTR_NO_OP = 0,			/*!< Not buffered */
	BTR_INSERT_OP,			/*!< Insert, do not ignore UNIQUE */
	BTR_INSERT_IGNORE_UNIQUE_OP,	/*!< Insert, ignoring UNIQUE */
	BTR_DELETE_OP,			/*!< Purge a delete-marked record */
	BTR_DELMARK_OP			/*!< Mark a record for deletion */
};

/** Modification types for the B-tree operation. */
enum btr_intention_t {
	BTR_INTENTION_DELETE,
	BTR_INTENTION_BOTH,
	BTR_INTENTION_INSERT
};
#if BTR_INTENTION_DELETE > BTR_INTENTION_BOTH
#error "BTR_INTENTION_DELETE > BTR_INTENTION_BOTH"
#endif
#if BTR_INTENTION_BOTH > BTR_INTENTION_INSERT
#error "BTR_INTENTION_BOTH > BTR_INTENTION_INSERT"
#endif

/** With the index->lock scalability improvement, the only clear performance
regression observed was caused by the history list growing very long. That is
because the old exclusive use of index->lock also had the side effect of
reserving free blocks and read IO bandwidth with priority for purge. To keep
the history list at the same level as with the previous implementation, we
prioritize pessimistic tree operations done by purge, as before, when the
list appears to be growing too long.

Experimentally, the history list length starts to clearly affect throughput
at about 100000. */
#define BTR_CUR_FINE_HISTORY_LENGTH	100000

/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
ulint	btr_cur_n_non_sea	= 0;
/** Number of successful adaptive hash index lookups in
btr_cur_search_to_nth_level(). */
ulint	btr_cur_n_sea		= 0;
/** Old value of btr_cur_n_non_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
ulint	btr_cur_n_non_sea_old	= 0;
/** Old value of btr_cur_n_sea. Copied by
srv_refresh_innodb_monitor_stats(). Referenced by
srv_printf_innodb_monitor(). */
ulint	btr_cur_n_sea_old	= 0;

#ifdef UNIV_DEBUG
/* Flag to limit the number of records for optimistic inserts (debug) */
uint	btr_cur_limit_optimistic_insert_debug = 0;
#endif /* UNIV_DEBUG */

/** In the optimistic insert, if the insert does not fit, but this much space
can be released by page reorganization, then the page is reorganized */
#define BTR_CUR_PAGE_REORGANIZE_LIMIT	(UNIV_PAGE_SIZE / 32)
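/* For illustration: with the default 16KiB UNIV_PAGE_SIZE, this limit is
16384 / 32 = 512 bytes. */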

/** The structure of a BLOB part header */
/* @{ */
/*--------------------------------------*/
#define BTR_BLOB_HDR_PART_LEN		0	/*!< BLOB part len on this
						page */
#define BTR_BLOB_HDR_NEXT_PAGE_NO	4	/*!< next BLOB part page no,
						FIL_NULL if none */
/*--------------------------------------*/
#define BTR_BLOB_HDR_SIZE		8	/*!< Size of a BLOB
						part header, in bytes */
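
/* Illustration only (this sketch is not used below): given a pointer
blob_hdr to a BLOB part header, the fields could be read with the 4-byte
accessor, e.g.

	part_len  = mach_read_from_4(blob_hdr + BTR_BLOB_HDR_PART_LEN);
	next_page = mach_read_from_4(blob_hdr + BTR_BLOB_HDR_NEXT_PAGE_NO);

where next_page == FIL_NULL means that this is the last part of the BLOB. */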

/** Estimate table-level stats from a sampled value.
@param value sampled stats
@param index index being sampled
@param sample number of sampled rows
@param ext_size external stored data size
@param not_empty table not empty
@return estimated table-wide stats from the sampled value */
#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
	(((value) * static_cast<int64_t>(index->stat_n_leaf_pages) \
	  + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
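
/* A worked example of the estimate above, assuming ext_size == 0:
sampling sample = 5 leaf pages of an index with stat_n_leaf_pages = 100
and observing value = 10 (say, 10 distinct key values), with not_empty = 1,
gives (10 * 100 + 5 - 1 + 0 + 1) / (5 + 0) = 1005 / 5 = 201, i.e. roughly
the sampled value scaled up by the leaf page count over the sample size,
rounded upwards. */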

/* @} */
#endif /* !UNIV_HOTBACKUP */

#ifndef UNIV_HOTBACKUP
/*******************************************************************//**
Marks all extern fields in a record as owned by the record. This function
should be called if the delete mark of a record is removed: a record that
is not delete-marked always owns all its extern fields. */
static
void
btr_cur_unmark_extern_fields(
/*=========================*/
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	rec_t*		rec,	/*!< in/out: record in a clustered index */
	dict_index_t*	index,	/*!< in: index of the page */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	mtr_t*		mtr);	/*!< in: mtr, or NULL if not logged */
/*******************************************************************//**
Adds path information to the cursor for the current page, for which
the binary search has been performed. */
static
void
btr_cur_add_path_info(
/*==================*/
	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
	ulint		height,		/*!< in: height of the page in tree;
					0 means leaf node */
	ulint		root_height);	/*!< in: root node height in tree */
/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
				X-latched */
	rec_t*		rec,	/*!< in: record */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,	/*!< in: update vector */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the tree */
/***********************************************************//**
Frees the externally stored fields for a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
	dict_index_t*	index,	/*!< in: index of the data, the index
				tree MUST be X-latched */
	rec_t*		rec,	/*!< in: record */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the index
				tree */
#endif /* !UNIV_HOTBACKUP */

#ifndef UNIV_HOTBACKUP
/*==================== B-TREE SEARCH =========================*/

#if MTR_MEMO_PAGE_S_FIX != RW_S_LATCH
#error "MTR_MEMO_PAGE_S_FIX != RW_S_LATCH"
#endif
#if MTR_MEMO_PAGE_X_FIX != RW_X_LATCH
#error "MTR_MEMO_PAGE_X_FIX != RW_X_LATCH"
#endif
#if MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH
#error "MTR_MEMO_PAGE_SX_FIX != RW_SX_LATCH"
#endif

/** Latches the leaf page or pages requested.
@param[in]	block		leaf page where the search converged
@param[in]	page_id		page id of the leaf
@param[in]	page_size	page size
@param[in]	latch_mode	BTR_SEARCH_LEAF, ...
@param[in]	cursor		cursor
@param[in]	mtr		mini-transaction
@return blocks and savepoints which were actually latched. */
btr_latch_leaves_t
btr_cur_latch_leaves(
	buf_block_t*		block,
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	ulint			latch_mode,
	btr_cur_t*		cursor,
	mtr_t*			mtr)
{
	ulint		mode;
	ulint		left_page_no;
	ulint		right_page_no;
	buf_block_t*	get_block;
	page_t*		page = buf_block_get_frame(block);
	bool		spatial;
	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};

	spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info;
	ut_ad(buf_page_in_file(&block->page));

	switch (latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_MODIFY_LEAF:
	case BTR_SEARCH_TREE:
		if (spatial) {
			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS]
				= mtr_set_savepoint(mtr);
		}

		mode = latch_mode == BTR_MODIFY_LEAF ? RW_X_LATCH : RW_S_LATCH;
		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
		get_block = btr_block_get(page_id, page_size, mode,
					  cursor->index, mtr);
		latch_leaves.blocks[1] = get_block;
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		if (spatial) {
			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
				= get_block;
		}

		return(latch_leaves);
	case BTR_MODIFY_TREE:
		/* It is exclusive for other operations which call
		btr_page_set_prev() */
		ut_ad(mtr_memo_contains_flagged(mtr,
			dict_index_get_lock(cursor->index),
			MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
		      || dict_table_is_intrinsic(cursor->index->table));
		/* x-latch also siblings from left to right */
		left_page_no = btr_page_get_prev(page, mtr);

		if (left_page_no != FIL_NULL) {

			if (spatial) {
				cursor->rtr_info->tree_savepoints[
					RTR_MAX_LEVELS] = mtr_set_savepoint(mtr);
			}

			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
			get_block = btr_block_get(
				page_id_t(page_id.space(), left_page_no),
				page_size, RW_X_LATCH, cursor->index, mtr);
			latch_leaves.blocks[0] = get_block;

			if (spatial) {
				cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS]
					= get_block;
			}
		}

		if (spatial) {
			cursor->rtr_info->tree_savepoints[RTR_MAX_LEVELS + 1]
				= mtr_set_savepoint(mtr);
		}

		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
		get_block = btr_block_get(
			page_id, page_size, RW_X_LATCH, cursor->index, mtr);
		latch_leaves.blocks[1] = get_block;

#ifdef UNIV_BTR_DEBUG
		/* Sanity check only after both the blocks are latched. */
		if (latch_leaves.blocks[0] != NULL) {
			ut_a(page_is_comp(latch_leaves.blocks[0]->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_next(
				latch_leaves.blocks[0]->frame, mtr)
			     == page_get_page_no(page));
		}
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */

		if (spatial) {
			cursor->rtr_info->tree_blocks[RTR_MAX_LEVELS + 1]
				= get_block;
		}

		right_page_no = btr_page_get_next(page, mtr);

		if (right_page_no != FIL_NULL) {
			if (spatial) {
				cursor->rtr_info->tree_savepoints[
					RTR_MAX_LEVELS + 2] = mtr_set_savepoint(
								mtr);
			}
			latch_leaves.savepoints[2] = mtr_set_savepoint(mtr);
			get_block = btr_block_get(
				page_id_t(page_id.space(), right_page_no),
				page_size, RW_X_LATCH, cursor->index, mtr);
			latch_leaves.blocks[2] = get_block;
#ifdef UNIV_BTR_DEBUG
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_prev(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
			if (spatial) {
				cursor->rtr_info->tree_blocks[
					RTR_MAX_LEVELS + 2] = get_block;
			}
		}

		return(latch_leaves);

	case BTR_SEARCH_PREV:
	case BTR_MODIFY_PREV:
		mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
		/* latch also left sibling */
		rw_lock_s_lock(&block->lock);
		left_page_no = btr_page_get_prev(page, mtr);
		rw_lock_s_unlock(&block->lock);

		if (left_page_no != FIL_NULL) {
			latch_leaves.savepoints[0] = mtr_set_savepoint(mtr);
			get_block = btr_block_get(
				page_id_t(page_id.space(), left_page_no),
				page_size, mode, cursor->index, mtr);
			latch_leaves.blocks[0] = get_block;
			cursor->left_block = get_block;
#ifdef UNIV_BTR_DEBUG
			ut_a(page_is_comp(get_block->frame)
			     == page_is_comp(page));
			ut_a(btr_page_get_next(get_block->frame, mtr)
			     == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
		}

		latch_leaves.savepoints[1] = mtr_set_savepoint(mtr);
		get_block = btr_block_get(page_id, page_size, mode,
					  cursor->index, mtr);
		latch_leaves.blocks[1] = get_block;
#ifdef UNIV_BTR_DEBUG
		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
#endif /* UNIV_BTR_DEBUG */
		return(latch_leaves);
	case BTR_CONT_MODIFY_TREE:
		ut_ad(dict_index_is_spatial(cursor->index));
		return(latch_leaves);
	}

	ut_error;
	return(latch_leaves);
}

/** Optimistically latches the leaf page or pages requested.
@param[in]	block		guessed buffer block
@param[in]	modify_clock	modify clock value
@param[in,out]	latch_mode	BTR_SEARCH_LEAF, ...
@param[in,out]	cursor		cursor
@param[in]	file		file name
@param[in]	line		line where called
@param[in]	mtr		mini-transaction
@return true if success */
bool
btr_cur_optimistic_latch_leaves(
	buf_block_t*	block,
	ib_uint64_t	modify_clock,
	ulint*		latch_mode,
	btr_cur_t*	cursor,
	const char*	file,
	ulint		line,
	mtr_t*		mtr)
{
	ulint		mode;
	ulint		left_page_no;

	ut_ad(block->page.buf_fix_count > 0);
	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

	switch (*latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_MODIFY_LEAF:
		return(buf_page_optimistic_get(*latch_mode, block,
				modify_clock, file, line, mtr));
	case BTR_SEARCH_PREV:
	case BTR_MODIFY_PREV:
		mode = *latch_mode == BTR_SEARCH_PREV
			? RW_S_LATCH : RW_X_LATCH;

		rw_lock_s_lock(&block->lock);
		if (block->modify_clock != modify_clock) {
			rw_lock_s_unlock(&block->lock);

			return(false);
		}
		left_page_no = btr_page_get_prev(
			buf_block_get_frame(block), mtr);
		rw_lock_s_unlock(&block->lock);

		if (left_page_no != FIL_NULL) {
			const page_id_t	page_id(
				dict_index_get_space(cursor->index),
				left_page_no);

			cursor->left_block = btr_block_get(
				page_id,
				dict_table_page_size(cursor->index->table),
				mode, cursor->index, mtr);
		} else {
			cursor->left_block = NULL;
		}

		if (buf_page_optimistic_get(mode, block, modify_clock,
					    file, line, mtr)) {
			if (btr_page_get_prev(buf_block_get_frame(block), mtr)
			    == left_page_no) {
				/* We've entered this function with the block
				already buffer-fixed, and
				buf_page_optimistic_get() buffer-fixes it
				again. The caller should unfix the block once
				(to undo their buffer-fixing). */
				ut_ad(2 <= block->page.buf_fix_count);
				*latch_mode = mode;
				return(true);
			} else {
				/* Release the block, which also decrements
				the buf_fix_count once, undoing the increment
				of the successful buf_page_optimistic_get() */
				btr_leaf_page_release(block, mode, mtr);
			}
		}

		/* If we are still here, buf_page_optimistic_get() did not
		buffer-fix the page, but it should still be buffer-fixed as
		it was before the call. */
		ut_ad(0 < block->page.buf_fix_count);
		/* release the left block */
		if (cursor->left_block != NULL) {
			btr_leaf_page_release(cursor->left_block,
					      mode, mtr);
		}

		return(false);

	default:
		ut_error;
		return(false);
	}
}

/**
Gets the intention from the latch_mode as a btr_intention_t, and clears the
intention flags in the latch_mode.
@param latch_mode	in/out: pointer to latch_mode
@return intention for latching tree */
static
btr_intention_t
btr_cur_get_and_clear_intention(
	ulint	*latch_mode)
{
	btr_intention_t	intention;

	switch (*latch_mode & (BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE)) {
	case BTR_LATCH_FOR_INSERT:
		intention = BTR_INTENTION_INSERT;
		break;
	case BTR_LATCH_FOR_DELETE:
		intention = BTR_INTENTION_DELETE;
		break;
	default:
		/* both or unknown */
		intention = BTR_INTENTION_BOTH;
	}
	*latch_mode &= ~(BTR_LATCH_FOR_INSERT | BTR_LATCH_FOR_DELETE);

	return(intention);
}
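
/* Illustration only: a caller passing
latch_mode == (BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE) gets back
BTR_INTENTION_DELETE, and latch_mode is left as plain BTR_MODIFY_TREE. */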

/**
Gets the desired latch type for the root leaf (root page is root leaf)
at the latch mode.
@param latch_mode	in: BTR_SEARCH_LEAF, ...
@return latch type */
static
rw_lock_type_t
btr_cur_latch_for_root_leaf(
	ulint	latch_mode)
{
	switch (latch_mode) {
	case BTR_SEARCH_LEAF:
	case BTR_SEARCH_TREE:
	case BTR_SEARCH_PREV:
		return(RW_S_LATCH);
	case BTR_MODIFY_LEAF:
	case BTR_MODIFY_TREE:
	case BTR_MODIFY_PREV:
		return(RW_X_LATCH);
	case BTR_CONT_MODIFY_TREE:
	case BTR_CONT_SEARCH_TREE:
		/* A root page should be latched already, and does not need
		to be latched here. fall through (RW_NO_LATCH) */
	case BTR_NO_LATCHES:
		return(RW_NO_LATCH);
	}

	ut_error;
	return(RW_NO_LATCH); /* avoid compiler warnings */
}

/** Detects whether modifying the given record might require modifying the
tree structure.
@param[in]	index		index
@param[in]	page		page
@param[in]	lock_intention	lock intention for the tree operation
@param[in]	rec		record (current node_ptr)
@param[in]	rec_size	size of the record or max size of node_ptr
@param[in]	page_size	page size
@param[in]	mtr		mtr
@return true if tree modification is needed */
static
bool
btr_cur_will_modify_tree(
	dict_index_t*		index,
	const page_t*		page,
	btr_intention_t		lock_intention,
	const rec_t*		rec,
	ulint			rec_size,
	const page_size_t&	page_size,
	mtr_t*			mtr)
{
	ut_ad(!page_is_leaf(page));
	ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
					MTR_MEMO_X_LOCK
					| MTR_MEMO_SX_LOCK)
	      || dict_table_is_intrinsic(index->table));

	/* Pessimistic delete of the first record causes delete & insert
	of node_ptr at the upper level. And a subsequent page shrink is
	possible, which causes delete of node_ptr at the upper level.
	So we should pay attention also to the 2nd record, not only the
	first record and the last record. Because if the "delete & insert"
	are done for a different page, the 2nd record becomes the first
	record, and the following compress might delete the record, causing
	the upper level node_ptr modification. */

	if (lock_intention <= BTR_INTENTION_BOTH) {
		ulint	margin;

		if (lock_intention == BTR_INTENTION_BOTH) {
			ulint	level = btr_page_get_level(page, mtr);

			/* This value is the worst expectation for the
			node_ptr records to be deleted from this page. It is
			used to predict whether the cursor position can end
			up as the leftmost record in this page or not. */
			ulint	max_nodes_deleted = 0;

			/* By tree modification operations from below this
			level, there are logically at most (2 ^ (level - 1))
			opportunities to delete records, even in the
			unrealistically rare worst case. */
			if (level > 7) {
				/* TODO: adjust this practical limit. */
				max_nodes_deleted = 64;
			} else if (level > 0) {
				max_nodes_deleted = (ulint)1 << (level - 1);
			}

			/* check what delete will cause. (BTR_INTENTION_BOTH
			or BTR_INTENTION_DELETE) */
			if (page_get_n_recs(page) <= max_nodes_deleted * 2
			    || page_rec_is_first(rec, page)) {
				/* The cursor record can be the leftmost
				record in this page. */
				return(true);
			}

			if (fil_page_get_prev(page) != FIL_NULL
			    && page_rec_distance_is_at_most(
					page_get_infimum_rec(page), rec,
					max_nodes_deleted)) {
				return(true);
			}

			if (fil_page_get_next(page) != FIL_NULL
			    && page_rec_distance_is_at_most(
					rec, page_get_supremum_rec(page),
					max_nodes_deleted)) {
				return(true);
			}

			/* Delete at the leftmost record in a page causes
			delete & insert at its parent page. After that, the
			delete might cause btr_compress() and deletion of a
			record at its parent page. Thus we should consider
			the maximum number of deletes. */

			margin = rec_size * max_nodes_deleted;
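
			/* Illustration only: at level 3 with
			BTR_INTENTION_BOTH, max_nodes_deleted
			= 1 << (3 - 1) = 4, so a node_ptr of
			rec_size = 100 bytes gives a margin of
			100 * 4 = 400 bytes. */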
		} else {
			ut_ad(lock_intention == BTR_INTENTION_DELETE);

			margin = rec_size;
		}
		/* Safe because we already have SX latch of the index tree */
		if (page_get_data_size(page)
		    < margin + BTR_CUR_PAGE_COMPRESS_LIMIT(index)
		    || (fil_page_get_next(page) == FIL_NULL
			&& fil_page_get_prev(page) == FIL_NULL)) {
			return(true);
		}
	}

	if (lock_intention >= BTR_INTENTION_BOTH) {
		/* check what insert will cause. (BTR_INTENTION_BOTH
		or BTR_INTENTION_INSERT) */

		/* Once we invoke btr_cur_limit_optimistic_insert_debug,
		we should check it here in advance, since the max allowable
		number of records in a page is limited. */
		LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
					      return(true));

		/* needs 2 records' space for the case the single split and
		insert cannot fit.
		page_get_max_insert_size_after_reorganize() includes space
		for the page directory already */
		ulint	max_size
			= page_get_max_insert_size_after_reorganize(page, 2);

		if (max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT + rec_size
		    || max_size < rec_size * 2) {
			return(true);
		}
		/* TODO: optimize this condition for compressed pages.
		This is based on the worst compression rate.
		Currently we look only at the uncompressed page, but we
		could also check the compressed page with
		page_zip_available() if it is already in the buffer pool */
		/* needs 2 records' space also for the worst compression
		rate. */
		if (page_size.is_compressed()
		    && page_zip_empty_size(index->n_fields,
					   page_size.physical())
		       < rec_size * 2 + page_get_data_size(page)
			 + page_dir_calc_reserved_space(
				page_get_n_recs(page) + 2) + 1) {
			return(true);
		}
	}

	return(false);
}

/** Detects whether modifying the given record might require a modification
opposite to the intention.
@param[in]	page		page
@param[in]	lock_intention	lock intention for the tree operation
@param[in]	rec		record (current node_ptr)
@return true if tree modification is needed */
static
bool
btr_cur_need_opposite_intention(
	const page_t*	page,
	btr_intention_t	lock_intention,
	const rec_t*	rec)
{
	switch (lock_intention) {
	case BTR_INTENTION_DELETE:
		return((mach_read_from_4(page + FIL_PAGE_PREV) != FIL_NULL
			&& page_rec_is_first(rec, page))
		       || (mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
			   && page_rec_is_last(rec, page)));
	case BTR_INTENTION_INSERT:
		return(mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
		       && page_rec_is_last(rec, page));
	case BTR_INTENTION_BOTH:
		return(false);
	}

	ut_error;
	return(false);
}
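
/* Illustration only: with BTR_INTENTION_INSERT, if rec is the last record
of a page that has a right sibling, btr_insert_into_right_sibling() may move
the insert there and end up deleting a node pointer at the upper level,
i.e. a delete-like modification; hence the opposite intention is needed. */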

/********************************************************************//**
Searches an index tree and positions a tree cursor on a given level.
NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
to node pointer page number fields on the upper levels of the tree!
Note that if mode is PAGE_CUR_LE, which is used in inserts, then
cursor->up_match and cursor->low_match both will have sensible values.
If mode is PAGE_CUR_GE, then up_match will have a sensible value.

If mode is PAGE_CUR_LE, the cursor is left at the place where an insert of
the search tuple should be performed in the B-tree. InnoDB does an insert
immediately after the cursor. Thus, the cursor may end up on a user record,
or on a page infimum record. For example, with PAGE_CUR_LE and leaf records
(3, 7), a search for 5 leaves the cursor on 3, and 5 would be inserted right
after it. */
void
btr_cur_search_to_nth_level(
/*========================*/
	dict_index_t*	index,	/*!< in: index */
	ulint		level,	/*!< in: the tree level of search */
	const dtuple_t*	tuple,	/*!< in: data tuple; NOTE: n_fields_cmp in
				tuple must be set so that it cannot get
				compared to the node ptr page number field! */
	page_cur_mode_t	mode,	/*!< in: PAGE_CUR_L, ...;
				Inserts should always be made using
				PAGE_CUR_LE to search the position! */
	ulint		latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
				at most one of BTR_INSERT, BTR_DELETE_MARK,
				BTR_DELETE, or BTR_ESTIMATE;
				cursor->left_block is used to store a pointer
				to the left neighbor page, in the cases
				BTR_SEARCH_PREV and BTR_MODIFY_PREV;
				NOTE that if has_search_latch
				is != 0, we may not have a latch set
				on the cursor page; we assume
				the caller uses their search latch
				to protect the record! */
	btr_cur_t*	cursor, /*!< in/out: tree cursor; the cursor page is
				s- or x-latched, but see also above! */
	ulint		has_search_latch,
				/*!< in: info on the latch mode the
				caller currently has on search system:
				RW_S_LATCH, or 0 */
	const char*	file,	/*!< in: file name */
	ulint		line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_t*		page = NULL; /* remove warning */
	buf_block_t*	block;
	ulint		height;
	ulint		up_match;
	ulint		up_bytes;
	ulint		low_match;
	ulint		low_bytes;
	ulint		savepoint;
	ulint		rw_latch;
	page_cur_mode_t	page_mode;
	page_cur_mode_t	search_mode = PAGE_CUR_UNSUPP;
	ulint		buf_mode;
	ulint		estimate;
	ulint		node_ptr_max_size = UNIV_PAGE_SIZE / 2;
	page_cur_t*	page_cursor;
	btr_op_t	btr_op;
	ulint		root_height = 0; /* remove warning */

	ulint		upper_rw_latch, root_leaf_rw_latch;
	btr_intention_t	lock_intention;
	bool		modify_external;
	buf_block_t*	tree_blocks[BTR_MAX_LEVELS];
	ulint		tree_savepoints[BTR_MAX_LEVELS];
	ulint		n_blocks = 0;
	ulint		n_releases = 0;
	bool		detected_same_key_root = false;

	bool		retrying_for_search_prev = false;
	ulint		leftmost_from_level = 0;
	buf_block_t**	prev_tree_blocks = NULL;
	ulint*		prev_tree_savepoints = NULL;
	ulint		prev_n_blocks = 0;
	ulint		prev_n_releases = 0;
	bool		need_path = true;
	bool		rtree_parent_modified = false;
	bool		mbr_adj = false;
	bool		found = false;

	DBUG_ENTER("btr_cur_search_to_nth_level");

	btr_search_t*	info;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	ulint		offsets2_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets2	= offsets2_;
	rec_offs_init(offsets_);
	rec_offs_init(offsets2_);
	/* Currently, PAGE_CUR_LE is the only search mode used for searches
	ending on upper levels */

	ut_ad(level == 0 || mode == PAGE_CUR_LE
	      || RTREE_SEARCH_MODE(mode));
	ut_ad(dict_index_check_search_tuple(index, tuple));
	ut_ad(!dict_index_is_ibuf(index) || ibuf_inside(mtr));
	ut_ad(dtuple_check_typed(tuple));
	ut_ad(!(index->type & DICT_FTS));
	ut_ad(index->page != FIL_NULL);

	UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
	UNIV_MEM_INVALID(&cursor->up_bytes, sizeof cursor->up_bytes);
	UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
	UNIV_MEM_INVALID(&cursor->low_bytes, sizeof cursor->low_bytes);
#ifdef UNIV_DEBUG
	cursor->up_match = ULINT_UNDEFINED;
	cursor->low_match = ULINT_UNDEFINED;
#endif /* UNIV_DEBUG */

	ibool	s_latch_by_caller;

	s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;

	ut_ad(!s_latch_by_caller
	      || srv_read_only_mode
	      || mtr_memo_contains_flagged(mtr,
					   dict_index_get_lock(index),
					   MTR_MEMO_S_LOCK
					   | MTR_MEMO_SX_LOCK));

	/* These flags are mutually exclusive, they are lumped together
	with the latch mode for historical reasons. It's possible for
	none of the flags to be set. */
	switch (UNIV_EXPECT(latch_mode
			    & (BTR_INSERT | BTR_DELETE | BTR_DELETE_MARK),
			    0)) {
	case 0:
		btr_op = BTR_NO_OP;
		break;
	case BTR_INSERT:
		btr_op = (latch_mode & BTR_IGNORE_SEC_UNIQUE)
			? BTR_INSERT_IGNORE_UNIQUE_OP
			: BTR_INSERT_OP;
		break;
	case BTR_DELETE:
		btr_op = BTR_DELETE_OP;
		ut_a(cursor->purge_node);
		break;
	case BTR_DELETE_MARK:
		btr_op = BTR_DELMARK_OP;
		break;
	default:
		/* only one of BTR_INSERT, BTR_DELETE, BTR_DELETE_MARK
		should be specified at a time */
		ut_error;
	}

	/* Operations on the insert buffer tree cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_ibuf(index));
	/* Operations on the clustered index cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_clust(index));
	/* Operations on temporary table indexes cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_table_is_temporary(index->table));
	/* Operations on a spatial index cannot be buffered. */
	ut_ad(btr_op == BTR_NO_OP || !dict_index_is_spatial(index));

	estimate = latch_mode & BTR_ESTIMATE;

	lock_intention = btr_cur_get_and_clear_intention(&latch_mode);

	modify_external = latch_mode & BTR_MODIFY_EXTERNAL;

	/* Turn the flags unrelated to the latch mode off. */
	latch_mode = BTR_LATCH_MODE_WITHOUT_FLAGS(latch_mode);

	ut_ad(!modify_external || latch_mode == BTR_MODIFY_LEAF);

	ut_ad(!s_latch_by_caller
	      || latch_mode == BTR_SEARCH_LEAF
	      || latch_mode == BTR_SEARCH_TREE
	      || latch_mode == BTR_MODIFY_LEAF);

	cursor->flag = BTR_CUR_BINARY;
	cursor->index = index;

	info = btr_search_get_info(index);

# ifdef UNIV_SEARCH_PERF_STAT
	info->n_searches++;
# endif
	/* Use of the AHI is disabled for intrinsic tables, as these tables
	re-use the index-id, and AHI validation is based on the index-id. */
	if (rw_lock_get_writer(btr_get_search_latch(index))
	    == RW_LOCK_NOT_LOCKED
	    && latch_mode <= BTR_MODIFY_LEAF
	    && info->last_hash_succ
	    && !index->disable_ahi
	    && !estimate
# ifdef PAGE_CUR_LE_OR_EXTENDS
	    && mode != PAGE_CUR_LE_OR_EXTENDS
# endif /* PAGE_CUR_LE_OR_EXTENDS */
	    && !dict_index_is_spatial(index)
	    /* If !has_search_latch, we do a dirty read of
	    btr_search_enabled below, and btr_search_guess_on_hash()
	    will have to check it again. */
	    && UNIV_LIKELY(btr_search_enabled)
	    && !modify_external
	    && btr_search_guess_on_hash(index, info, tuple, mode,
					latch_mode, cursor,
					has_search_latch, mtr)) {

		/* Search using the hash index succeeded */

		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_GE);
		ut_ad(cursor->up_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		ut_ad(cursor->low_match != ULINT_UNDEFINED
		      || mode != PAGE_CUR_LE);
		btr_cur_n_sea++;

		DBUG_VOID_RETURN;
	}
	btr_cur_n_non_sea++;

	/* If the hash search did not succeed, do binary search down the
	tree */

	if (has_search_latch) {
		/* Release possible search latch to obey latching order */
		rw_lock_s_unlock(btr_get_search_latch(index));
	}

	/* Store the position of the tree latch we push to mtr so that we
	know how to release it when we have latched leaf node(s) */

	savepoint = mtr_set_savepoint(mtr);

	switch (latch_mode) {
	case BTR_MODIFY_TREE:
		/* Most delete-intended operations are purging. Free blocks
		and read IO bandwidth should be given to them with priority,
		when the history list is growing huge. */
		if (lock_intention == BTR_INTENTION_DELETE
		    && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
		    && buf_get_n_pending_read_ios()) {
			mtr_x_lock(dict_index_get_lock(index), mtr);
		} else if (dict_index_is_spatial(index)
			   && lock_intention <= BTR_INTENTION_BOTH) {
			/* X-latch the index if there is a possibility of a
			pessimistic delete on a spatial index, as we could
			latch upward in the tree */

			mtr_x_lock(dict_index_get_lock(index), mtr);
		} else {
			mtr_sx_lock(dict_index_get_lock(index), mtr);
		}
		upper_rw_latch = RW_X_LATCH;
		break;
	case BTR_CONT_MODIFY_TREE:
	case BTR_CONT_SEARCH_TREE:
		/* Do nothing */
		ut_ad(srv_read_only_mode
		      || mtr_memo_contains_flagged(mtr,
						   dict_index_get_lock(index),
						   MTR_MEMO_X_LOCK
						   | MTR_MEMO_SX_LOCK));
		if (dict_index_is_spatial(index)
		    && latch_mode == BTR_CONT_MODIFY_TREE) {
			/* If we are about to locate the parent page for a
			split and/or merge operation on an R-tree index,
			X-latch the parent */
			upper_rw_latch = RW_X_LATCH;
		} else {
			upper_rw_latch = RW_NO_LATCH;
		}
		break;
	default:
		if (!srv_read_only_mode) {
			if (s_latch_by_caller) {
				ut_ad(rw_lock_own(dict_index_get_lock(index),
						  RW_LOCK_S));
			} else if (!modify_external) {
				/* BTR_SEARCH_TREE is intended to be used with
				BTR_ALREADY_S_LATCHED */
				ut_ad(latch_mode != BTR_SEARCH_TREE);

				mtr_s_lock(dict_index_get_lock(index), mtr);
			} else {
				/* BTR_MODIFY_EXTERNAL needs to be excluded */
				mtr_sx_lock(dict_index_get_lock(index), mtr);
			}
			upper_rw_latch = RW_S_LATCH;
		} else {
			upper_rw_latch = RW_NO_LATCH;
		}
	}
	root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);

	page_cursor = btr_cur_get_page_cur(cursor);

	const ulint		space = dict_index_get_space(index);
	const page_size_t	page_size(dict_table_page_size(index->table));

	/* Start with the root page. */
	page_id_t		page_id(space, dict_index_get_page(index));

	if (root_leaf_rw_latch == RW_X_LATCH) {
		node_ptr_max_size = dict_index_node_ptr_max_size(index);
	}

	up_match = 0;
	up_bytes = 0;
	low_match = 0;
	low_bytes = 0;

	height = ULINT_UNDEFINED;

	/* We use these modified search modes on non-leaf levels of the
	B-tree. These let us end up in the right B-tree leaf. In that leaf
	we use the original search mode. */

	switch (mode) {
	case PAGE_CUR_GE:
		page_mode = PAGE_CUR_L;
		break;
	case PAGE_CUR_G:
		page_mode = PAGE_CUR_LE;
		break;
	default:
#ifdef PAGE_CUR_LE_OR_EXTENDS
		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
		      || RTREE_SEARCH_MODE(mode)
		      || mode == PAGE_CUR_LE_OR_EXTENDS);
#else /* PAGE_CUR_LE_OR_EXTENDS */
		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
		      || RTREE_SEARCH_MODE(mode));
#endif /* PAGE_CUR_LE_OR_EXTENDS */
		page_mode = mode;
		break;
	}
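
	/* Illustration only: for a PAGE_CUR_G search (find the first record
	strictly greater than the tuple), non-leaf levels use PAGE_CUR_LE:
	at each node we follow the node pointer with the greatest key less
	than or equal to the tuple. Because a node pointer key is the
	minimum key of its child page, this descends to the leaf page that
	holds the position of the tuple, where the original PAGE_CUR_G mode
	is then applied. */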

	/* Loop and search until we arrive at the desired level */
	btr_latch_leaves_t latch_leaves = {{NULL, NULL, NULL}, {0, 0, 0}};

search_loop:
	buf_mode = BUF_GET;
	rw_latch = RW_NO_LATCH;
	rtree_parent_modified = false;

	if (height != 0) {
		/* We are about to fetch the root or a non-leaf page. */
		if ((latch_mode != BTR_MODIFY_TREE
		     || height == level)
		    && !retrying_for_search_prev) {
			/* If we don't have an SX or X latch on the index,
			each page should be latched before reading. */
			if (modify_external
			    && height == ULINT_UNDEFINED
			    && upper_rw_latch == RW_S_LATCH) {
				/* needs sx-latch of root page
				for fseg operation */
				rw_latch = RW_SX_LATCH;
			} else {
				rw_latch = upper_rw_latch;
			}
		}
	} else if (latch_mode <= BTR_MODIFY_LEAF) {
		rw_latch = latch_mode;

		if (btr_op != BTR_NO_OP
		    && ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {

			/* Try to buffer the operation if the leaf
			page is not in the buffer pool. */

			buf_mode = btr_op == BTR_DELETE_OP
				? BUF_GET_IF_IN_POOL_OR_WATCH
				: BUF_GET_IF_IN_POOL;
		}
	}

retry_page_get:
	ut_ad(n_blocks < BTR_MAX_LEVELS);
	tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
	block = buf_page_get_gen(
		page_id, page_size, rw_latch,
		(height == ULINT_UNDEFINED ? info->root_guess : NULL),
		buf_mode, file, line, mtr);

	tree_blocks[n_blocks] = block;

	if (block == NULL) {
		/* This must be a search to perform an insert/delete
		mark/delete; try using the insert/delete buffer */

		ut_ad(height == 0);
		ut_ad(cursor->thr);

		switch (btr_op) {
		case BTR_INSERT_OP:
		case BTR_INSERT_IGNORE_UNIQUE_OP:
			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
			ut_ad(!dict_index_is_spatial(index));

			if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
					page_id, page_size, cursor->thr)) {

				cursor->flag = BTR_CUR_INSERT_TO_IBUF;

				goto func_exit;
			}
			break;

		case BTR_DELMARK_OP:
			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
			ut_ad(!dict_index_is_spatial(index));

			if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
					index, page_id, page_size,
					cursor->thr)) {

				cursor->flag = BTR_CUR_DEL_MARK_IBUF;

				goto func_exit;
			}

			break;

		case BTR_DELETE_OP:
			ut_ad(buf_mode == BUF_GET_IF_IN_POOL_OR_WATCH);
			ut_ad(!dict_index_is_spatial(index));

			if (!row_purge_poss_sec(cursor->purge_node,
						index, tuple)) {

				/* The record cannot be purged yet. */
				cursor->flag = BTR_CUR_DELETE_REF;
			} else if (ibuf_insert(IBUF_OP_DELETE, tuple,
					       index, page_id, page_size,
					       cursor->thr)) {

				/* The purge was buffered. */
				cursor->flag = BTR_CUR_DELETE_IBUF;
			} else {
				/* The purge could not be buffered. */
				buf_pool_watch_unset(page_id);
				break;
			}

			buf_pool_watch_unset(page_id);
			goto func_exit;

		default:
			ut_error;
		}

		/* Insert to the insert/delete buffer did not succeed, we
		must read the page from disk. */

		buf_mode = BUF_GET;

		goto retry_page_get;
	}

	if (retrying_for_search_prev && height != 0) {
		/* also latch left sibling */
		ulint		left_page_no;
		buf_block_t*	get_block;

		ut_ad(rw_latch == RW_NO_LATCH);

		rw_latch = upper_rw_latch;

		rw_lock_s_lock(&block->lock);
		left_page_no = btr_page_get_prev(
			buf_block_get_frame(block), mtr);
		rw_lock_s_unlock(&block->lock);

		if (left_page_no != FIL_NULL) {
			ut_ad(prev_n_blocks < leftmost_from_level);

			prev_tree_savepoints[prev_n_blocks]
				= mtr_set_savepoint(mtr);
			get_block = buf_page_get_gen(
				page_id_t(page_id.space(), left_page_no),
				page_size, rw_latch, NULL, buf_mode,
				file, line, mtr);
			prev_tree_blocks[prev_n_blocks] = get_block;
			prev_n_blocks++;

			/* BTR_MODIFY_TREE doesn't update prev/next_page_no
			without their parent page's lock. So, there is no
			need to retry here, because we have the parent
			page's lock. */
		}

		/* release RW_NO_LATCH page and lock with RW_S_LATCH */
		mtr_release_block_at_savepoint(
			mtr, tree_savepoints[n_blocks],
			tree_blocks[n_blocks]);

		tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
		block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
					 buf_mode, file, line, mtr);
		tree_blocks[n_blocks] = block;
	}

	page = buf_block_get_frame(block);

	if (height == ULINT_UNDEFINED
	    && page_is_leaf(page)
	    && rw_latch != RW_NO_LATCH
	    && rw_latch != root_leaf_rw_latch) {
		/* We should retry to get the page, because the root page
		is latched at a different level than a leaf page would be. */
		ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
		ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_SX_LATCH);
		ut_ad(rw_latch == RW_S_LATCH || modify_external);

		ut_ad(n_blocks == 0);
		mtr_release_block_at_savepoint(
			mtr, tree_savepoints[n_blocks],
			tree_blocks[n_blocks]);

		upper_rw_latch = root_leaf_rw_latch;
		goto search_loop;
	}

	if (rw_latch != RW_NO_LATCH) {
#ifdef UNIV_ZIP_DEBUG
		const page_zip_des_t*	page_zip
			= buf_block_get_page_zip(block);
		ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

		buf_block_dbg_add_level(
			block, dict_index_is_ibuf(index)
			? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);
	}

	ut_ad(fil_page_index_page_check(page));
	ut_ad(index->id == btr_page_get_index_id(page));

	if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
		/* We are in the root node */

		height = btr_page_get_level(page, mtr);
		root_height = height;
		cursor->tree_height = root_height + 1;

		if (dict_index_is_spatial(index)) {
			ut_ad(cursor->rtr_info);

			node_seq_t	seq_no = rtr_get_current_ssn_id(index);

			/* If the SSN in memory is not initialized, fetch
			it from the root page */
			if (seq_no < 1) {
				node_seq_t	root_seq_no;

				root_seq_no = page_get_ssn_id(page);

				mutex_enter(&(index->rtr_ssn.mutex));
				index->rtr_ssn.seq_no = root_seq_no + 1;
				mutex_exit(&(index->rtr_ssn.mutex));
			}

			/* Save the MBR */
			cursor->rtr_info->thr = cursor->thr;
			rtr_get_mbr_from_tuple(tuple, &cursor->rtr_info->mbr);
		}

		info->root_guess = block;
	}

	if (height == 0) {
		if (rw_latch == RW_NO_LATCH) {

			latch_leaves = btr_cur_latch_leaves(
				block, page_id, page_size, latch_mode,
				cursor, mtr);
		}

		switch (latch_mode) {
		case BTR_MODIFY_TREE:
		case BTR_CONT_MODIFY_TREE:
		case BTR_CONT_SEARCH_TREE:
			break;
		default:
			if (!s_latch_by_caller
			    && !srv_read_only_mode
			    && !modify_external) {
				/* Release the tree s-latch */
				/* NOTE: BTR_MODIFY_EXTERNAL
				needs to keep tree sx-latch */
				mtr_release_s_latch_at_savepoint(
					mtr, savepoint,
					dict_index_get_lock(index));
			}

			/* release upper blocks */
			if (retrying_for_search_prev) {
				for (;
				     prev_n_releases < prev_n_blocks;
				     prev_n_releases++) {
					mtr_release_block_at_savepoint(
						mtr,
						prev_tree_savepoints[
							prev_n_releases],
						prev_tree_blocks[
							prev_n_releases]);
				}
			}

			for (; n_releases < n_blocks; n_releases++) {
				if (n_releases == 0 && modify_external) {
					/* keep latch of root page */
					ut_ad(mtr_memo_contains_flagged(
						mtr, tree_blocks[n_releases],
						MTR_MEMO_PAGE_SX_FIX
						| MTR_MEMO_PAGE_X_FIX));
					continue;
				}

				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}
		}

		page_mode = mode;
	}

	if (dict_index_is_spatial(index)) {
		/* Remember the page search mode */
		search_mode = page_mode;

		/* Some adjustment on the search mode, when the page search
		mode is PAGE_CUR_RTREE_LOCATE or PAGE_CUR_RTREE_INSERT, as
		we are searching with MBRs. When it is not the target level,
		we should search all sub-trees that "CONTAIN" the search
		range/MBR. When it is at the target level, the search
		becomes PAGE_CUR_LE */
		if (page_mode == PAGE_CUR_RTREE_LOCATE
		    && level == height) {
			if (level == 0) {
				page_mode = PAGE_CUR_LE;
			} else {
				page_mode = PAGE_CUR_RTREE_GET_FATHER;
			}
		}

		if (page_mode == PAGE_CUR_RTREE_INSERT) {
			page_mode = (level == height)
					? PAGE_CUR_LE
					: PAGE_CUR_RTREE_INSERT;

			ut_ad(!page_is_leaf(page) || page_mode == PAGE_CUR_LE);
		}

		/* "need_path" indicates whether we need to track the parent
		pages; if it is not a spatial comparison, there is no need
		to track them */
		if (page_mode < PAGE_CUR_CONTAIN) {
			need_path = false;
		}

		up_match = 0;
		low_match = 0;

		if (latch_mode == BTR_MODIFY_TREE
		    || latch_mode == BTR_CONT_MODIFY_TREE
		    || latch_mode == BTR_CONT_SEARCH_TREE) {
			/* The tree is locked, no need for a page latch to
			protect the "path" */
			cursor->rtr_info->need_page_lock = false;
		}
	}

	if (dict_index_is_spatial(index) && page_mode >= PAGE_CUR_CONTAIN) {
		ut_ad(need_path);
		found = rtr_cur_search_with_match(
			block, index, tuple, page_mode, page_cursor,
			cursor->rtr_info);

		/* Need to use BTR_MODIFY_TREE to do the MBR adjustment */
		if (search_mode == PAGE_CUR_RTREE_INSERT
		    && cursor->rtr_info->mbr_adj) {
			if (latch_mode & BTR_MODIFY_LEAF) {
				/* The parent MBR needs to be updated;
				should retry with BTR_MODIFY_TREE */
				goto func_exit;
			} else if (latch_mode & BTR_MODIFY_TREE) {
				rtree_parent_modified = true;
				cursor->rtr_info->mbr_adj = false;
				mbr_adj = true;
			} else {
				ut_ad(0);
			}
		}

		if (found && page_mode == PAGE_CUR_RTREE_GET_FATHER) {
			cursor->low_match =
				DICT_INDEX_SPATIAL_NODEPTR_SIZE + 1;
		}
	} else if (height == 0 && btr_search_enabled
		   && !dict_index_is_spatial(index)) {
		/* The adaptive hash index is only used when searching
		for leaf pages (height==0), but not in r-trees.
		We only need the byte prefix comparison for the purpose
		of updating the adaptive hash index. */
		page_cur_search_with_match_bytes(
			block, index, tuple, page_mode, &up_match, &up_bytes,
			&low_match, &low_bytes, page_cursor);
	} else {
		/* Search for complete index fields. */
		up_bytes = low_bytes = 0;
		page_cur_search_with_match(
			block, index, tuple, page_mode, &up_match,
			&low_match, page_cursor,
			need_path ? cursor->rtr_info : NULL);
	}

	if (estimate) {
		btr_cur_add_path_info(cursor, height, root_height);
	}

	/* If this is the desired level, leave the loop */

	ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor),
					   mtr));

	/* Add a predicate lock if it is serializable isolation,
	and only in the search case */
	if (dict_index_is_spatial(index)
	    && cursor->rtr_info->need_prdt_lock
	    && mode != PAGE_CUR_RTREE_INSERT
	    && mode != PAGE_CUR_RTREE_LOCATE
	    && mode >= PAGE_CUR_CONTAIN) {
		trx_t*		trx = thr_get_trx(cursor->thr);
		lock_prdt_t	prdt;

		lock_mutex_enter();
		lock_init_prdt_from_mbr(
			&prdt, &cursor->rtr_info->mbr, mode,
			trx->lock.lock_heap);
		lock_mutex_exit();

		if (rw_latch == RW_NO_LATCH && height != 0) {
			rw_lock_s_lock(&(block->lock));
		}

		lock_prdt_lock(block, &prdt, index, LOCK_S,
			       LOCK_PREDICATE, cursor->thr, mtr);

		if (rw_latch == RW_NO_LATCH && height != 0) {
			rw_lock_s_unlock(&(block->lock));
		}
	}

	if (level != height) {

		const rec_t*	node_ptr;
		ut_ad(height > 0);

		height--;

		node_ptr = page_cur_get_rec(page_cursor);

		offsets = rec_get_offsets(
			node_ptr, index, offsets, ULINT_UNDEFINED, &heap);

		/* If the rec is the first or last in the page, then for a
		pessimistic delete intention it might cause a node_ptr
		insert at the upper level. We should change the intention
		and retry. */
		if (latch_mode == BTR_MODIFY_TREE
		    && btr_cur_need_opposite_intention(
			page, lock_intention, node_ptr)) {

need_opposite_intention:
			ut_ad(upper_rw_latch == RW_X_LATCH);

			if (n_releases > 0) {
				/* release root block */
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[0],
					tree_blocks[0]);
			}

			/* release all blocks */
			for (; n_releases <= n_blocks; n_releases++) {
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}

			lock_intention = BTR_INTENTION_BOTH;

			page_id.reset(space, dict_index_get_page(index));
			up_match = 0;
			low_match = 0;
			height = ULINT_UNDEFINED;

			n_blocks = 0;
			n_releases = 0;

			goto search_loop;
		}

		if (dict_index_is_spatial(index)) {
			if (page_rec_is_supremum(node_ptr)) {
				cursor->low_match = 0;
				cursor->up_match = 0;
				goto func_exit;
			}

			/* If we are doing insertion or record locating,
			remember the tree nodes we visited */
			if (page_mode == PAGE_CUR_RTREE_INSERT
			    || (search_mode == PAGE_CUR_RTREE_LOCATE
				&& (latch_mode != BTR_MODIFY_LEAF))) {
				bool	add_latch = false;

				if (latch_mode == BTR_MODIFY_TREE
				    && rw_latch == RW_NO_LATCH) {
					ut_ad(mtr_memo_contains_flagged(
						mtr, dict_index_get_lock(index),
						MTR_MEMO_X_LOCK
						| MTR_MEMO_SX_LOCK));
					rw_lock_s_lock(&block->lock);
					add_latch = true;
				}

				/* Store the parent cursor location */
#ifdef UNIV_DEBUG
				ulint	num_stored = rtr_store_parent_path(
					block, cursor, latch_mode,
					height + 1, mtr);
#else
				rtr_store_parent_path(
					block, cursor, latch_mode,
					height + 1, mtr);
#endif

				if (page_mode == PAGE_CUR_RTREE_INSERT) {
					btr_pcur_t*	r_cursor =
						rtr_get_parent_cursor(
							cursor, height + 1,
							true);
					/* If it is insertion, there should
					be only one parent for each level
					traversed */
#ifdef UNIV_DEBUG
					ut_ad(num_stored == 1);
#endif

					node_ptr = btr_pcur_get_rec(r_cursor);

				}

				if (add_latch) {
					rw_lock_s_unlock(&block->lock);
				}

				ut_ad(!page_rec_is_supremum(node_ptr));
			}

			ut_ad(page_mode == search_mode
			      || (page_mode == PAGE_CUR_WITHIN
				  && search_mode == PAGE_CUR_RTREE_LOCATE));

			page_mode = search_mode;
		}

		/* If the node_ptr is the first or the last record of the
		page, or has the same key value as the first or the last
		record, another page might be chosen under
		BTR_CONT_MODIFY_TREE. So, the parent page should not be
		released, to avoid deadlocking by blocking another search
		with the same key value. */
		if (!detected_same_key_root
		    && lock_intention == BTR_INTENTION_BOTH
		    && !dict_index_is_unique(index)
		    && latch_mode == BTR_MODIFY_TREE
		    && (up_match >= rec_offs_n_fields(offsets) - 1
			|| low_match >= rec_offs_n_fields(offsets) - 1)) {
			const rec_t*	first_rec
				= page_rec_get_next_const(
					page_get_infimum_rec(page));
			ulint		matched_fields;

			ut_ad(upper_rw_latch == RW_X_LATCH);

			if (node_ptr == first_rec
			    || page_rec_is_last(node_ptr, page)) {
				detected_same_key_root = true;
			} else {
				matched_fields = 0;

				offsets2 = rec_get_offsets(
					first_rec, index, offsets2,
					ULINT_UNDEFINED, &heap);
				cmp_rec_rec_with_match(node_ptr, first_rec,
					offsets, offsets2, index,
					page_is_spatial_non_leaf(first_rec, index),
					false, &matched_fields);

				if (matched_fields
				    >= rec_offs_n_fields(offsets) - 1) {
					detected_same_key_root = true;
				} else {
					const rec_t*	last_rec;

					last_rec = page_rec_get_prev_const(
						page_get_supremum_rec(page));

					matched_fields = 0;

					offsets2 = rec_get_offsets(
						last_rec, index, offsets2,
						ULINT_UNDEFINED, &heap);
					cmp_rec_rec_with_match(
						node_ptr, last_rec,
						offsets, offsets2, index,
						page_is_spatial_non_leaf(last_rec, index),
						false, &matched_fields);
					if (matched_fields
					    >= rec_offs_n_fields(offsets) - 1) {
						detected_same_key_root = true;
					}
				}
			}
		}

		/* If the page might cause modify_tree,
		we should not release the parent page's lock. */
		if (!detected_same_key_root
		    && latch_mode == BTR_MODIFY_TREE
		    && !btr_cur_will_modify_tree(
				index, page, lock_intention, node_ptr,
				node_ptr_max_size, page_size, mtr)
		    && !rtree_parent_modified) {
			ut_ad(upper_rw_latch == RW_X_LATCH);
			ut_ad(n_releases <= n_blocks);

			/* we can release upper blocks */
			for (; n_releases < n_blocks; n_releases++) {
				if (n_releases == 0) {
					/* we should not release the root
					page, to keep it pinned to the
					same block. */
					continue;
				}

				/* release unused blocks to unpin */
				mtr_release_block_at_savepoint(
					mtr, tree_savepoints[n_releases],
					tree_blocks[n_releases]);
			}
		}

		if (height == level
		    && latch_mode == BTR_MODIFY_TREE) {
			ut_ad(upper_rw_latch == RW_X_LATCH);
			/* we should sx-latch the root page, if it was
			released already. It contains the seg_header. */
			if (n_releases > 0) {
				mtr_block_sx_latch_at_savepoint(
					mtr, tree_savepoints[0],
					tree_blocks[0]);
			}

			/* x-latch the branch blocks not released yet. */
			for (ulint i = n_releases; i <= n_blocks; i++) {
				mtr_block_x_latch_at_savepoint(
					mtr, tree_savepoints[i],
					tree_blocks[i]);
			}
		}

		/* We should consider the prev_page of the parent page, if
		the node_ptr is the leftmost of the page, because
		BTR_SEARCH_PREV and BTR_MODIFY_PREV latch the prev_page of
		the leaf page. */
		if ((latch_mode == BTR_SEARCH_PREV
		     || latch_mode == BTR_MODIFY_PREV)
		    && !retrying_for_search_prev) {
			/* block should be latched for consistent
			btr_page_get_prev() */
			ut_ad(mtr_memo_contains_flagged(mtr, block,
				MTR_MEMO_PAGE_S_FIX
				| MTR_MEMO_PAGE_X_FIX));

			if (btr_page_get_prev(page, mtr) != FIL_NULL
			    && page_rec_is_first(node_ptr, page)) {

				if (leftmost_from_level == 0) {
					leftmost_from_level = height + 1;
				}
			} else {
				leftmost_from_level = 0;
			}

			if (height == 0 && leftmost_from_level > 0) {
				/* should retry to get also the prev_page
				from level == leftmost_from_level. */
				retrying_for_search_prev = true;

				prev_tree_blocks = static_cast<buf_block_t**>(
					ut_malloc_nokey(sizeof(buf_block_t*)
							* leftmost_from_level));

				prev_tree_savepoints = static_cast<ulint*>(
					ut_malloc_nokey(sizeof(ulint)
							* leftmost_from_level));

				/* back to the level (leftmost_from_level+1) */
				ulint	idx = n_blocks
					- (leftmost_from_level - 1);

				page_id.reset(
					space,
					tree_blocks[idx]->page.id.page_no());

				for (ulint i = n_blocks
					       - (leftmost_from_level - 1);
				     i <= n_blocks; i++) {
					mtr_release_block_at_savepoint(
						mtr, tree_savepoints[i],
						tree_blocks[i]);
				}

				n_blocks -= (leftmost_from_level - 1);
				height = leftmost_from_level;
				ut_ad(n_releases == 0);

				/* replay up_match, low_match */
				up_match = 0;
				low_match = 0;
				rtr_info_t*	rtr_info	= need_path
					? cursor->rtr_info : NULL;

				for (ulint i = 0; i < n_blocks; i++) {
					page_cur_search_with_match(
						tree_blocks[i], index, tuple,
						page_mode, &up_match,
						&low_match, page_cursor,
						rtr_info);
				}

				goto search_loop;
			}
		}
1779
1780 /* Go to the child node */
1781 page_id.reset(
1782 space,
1783 btr_node_ptr_get_child_page_no(node_ptr, offsets));
1784
1785 n_blocks++;
1786
1787 if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
1788 /* We're doing a search on an ibuf tree and we're one
1789 level above the leaf page. */
1790
1791 ut_ad(level == 0);
1792
1793 buf_mode = BUF_GET;
1794 rw_latch = RW_NO_LATCH;
1795 goto retry_page_get;
1796 }
1797
1798 if (dict_index_is_spatial(index)
1799 && page_mode >= PAGE_CUR_CONTAIN
1800 && page_mode != PAGE_CUR_RTREE_INSERT) {
1801 ut_ad(need_path);
1802 rtr_node_path_t* path =
1803 cursor->rtr_info->path;
1804
1805 if (!path->empty() && found) {
1806 #ifdef UNIV_DEBUG
1807 node_visit_t last_visit = path->back();
1808
1809 ut_ad(last_visit.page_no == page_id.page_no());
1810 #endif /* UNIV_DEBUG */
1811
1812 path->pop_back();
1813
1814 #ifdef UNIV_DEBUG
1815 if (page_mode == PAGE_CUR_RTREE_LOCATE
1816 && (latch_mode != BTR_MODIFY_LEAF)) {
1817 btr_pcur_t* cur
1818 = cursor->rtr_info->parent_path->back(
1819 ).cursor;
1820 rec_t* my_node_ptr
1821 = btr_pcur_get_rec(cur);
1822
1823 offsets = rec_get_offsets(
1824 my_node_ptr, index, offsets,
1825 ULINT_UNDEFINED, &heap);
1826
1827 ulint my_page_no
1828 = btr_node_ptr_get_child_page_no(
1829 my_node_ptr, offsets);
1830
1831 ut_ad(page_id.page_no() == my_page_no);
1832
1833 }
1834 #endif
1835 }
1836 }
1837
1838 goto search_loop;
1839 } else if (!dict_index_is_spatial(index)
1840 && latch_mode == BTR_MODIFY_TREE
1841 && lock_intention == BTR_INTENTION_INSERT
1842 && mach_read_from_4(page + FIL_PAGE_NEXT) != FIL_NULL
1843 && page_rec_is_last(page_cur_get_rec(page_cursor), page)) {
1844
1845 /* btr_insert_into_right_sibling() might cause
1846 the node_ptr at the upper level to be deleted */
1847
1848 if (height == 0) {
1849 /* release the leaf pages if latched */
1850 for (uint i = 0; i < 3; i++) {
1851 if (latch_leaves.blocks[i] != NULL) {
1852 mtr_release_block_at_savepoint(
1853 mtr, latch_leaves.savepoints[i],
1854 latch_leaves.blocks[i]);
1855 latch_leaves.blocks[i] = NULL;
1856 }
1857 }
1858 }
1859
1860 goto need_opposite_intention;
1861 }
1862
1863 if (level != 0) {
1864 if (upper_rw_latch == RW_NO_LATCH) {
1865 /* latch the page */
1866 buf_block_t* child_block;
1867
1868 if (latch_mode == BTR_CONT_MODIFY_TREE) {
1869 child_block = btr_block_get(
1870 page_id, page_size, RW_X_LATCH,
1871 index, mtr);
1872 } else {
1873 ut_ad(latch_mode == BTR_CONT_SEARCH_TREE);
1874 child_block = btr_block_get(
1875 page_id, page_size, RW_SX_LATCH,
1876 index, mtr);
1877 }
1878
1879 btr_assert_not_corrupted(child_block, index);
1880 } else {
1881 ut_ad(mtr_memo_contains(mtr, block, upper_rw_latch));
1882 btr_assert_not_corrupted(block, index);
1883
1884 if (s_latch_by_caller) {
1885 ut_ad(latch_mode == BTR_SEARCH_TREE);
1886 /* to exclude tree-modifying operations, the
1887 caller should hold an sx-latch on the index. */
1888 ut_ad(mtr_memo_contains(
1889 mtr, dict_index_get_lock(index),
1890 MTR_MEMO_SX_LOCK));
1891 /* because we hold an sx-latch on the index,
1892 we can release the upper blocks. */
1893 for (; n_releases < n_blocks; n_releases++) {
1894 mtr_release_block_at_savepoint(
1895 mtr,
1896 tree_savepoints[n_releases],
1897 tree_blocks[n_releases]);
1898 }
1899 }
1900 }
1901
1902 if (page_mode <= PAGE_CUR_LE) {
1903 cursor->low_match = low_match;
1904 cursor->up_match = up_match;
1905 }
1906 } else {
1907 cursor->low_match = low_match;
1908 cursor->low_bytes = low_bytes;
1909 cursor->up_match = up_match;
1910 cursor->up_bytes = up_bytes;
1911
1912 /* We do a dirty read of btr_search_enabled here. We
1913 will properly check btr_search_enabled again in
1914 btr_search_build_page_hash_index() before building a
1915 page hash index, while holding search latch. */
1916 if (btr_search_enabled && !index->disable_ahi) {
1917 btr_search_info_update(index, cursor);
1918 }
1919 ut_ad(cursor->up_match != ULINT_UNDEFINED
1920 || mode != PAGE_CUR_GE);
1921 ut_ad(cursor->up_match != ULINT_UNDEFINED
1922 || mode != PAGE_CUR_LE);
1923 ut_ad(cursor->low_match != ULINT_UNDEFINED
1924 || mode != PAGE_CUR_LE);
1925 }
1926
1927 /* For spatial index, remember what blocks are still latched */
1928 if (dict_index_is_spatial(index)
1929 && (latch_mode == BTR_MODIFY_TREE
1930 || latch_mode == BTR_MODIFY_LEAF)) {
1931 for (ulint i = 0; i < n_releases; i++) {
1932 cursor->rtr_info->tree_blocks[i] = NULL;
1933 cursor->rtr_info->tree_savepoints[i] = 0;
1934 }
1935
1936 for (ulint i = n_releases; i <= n_blocks; i++) {
1937 cursor->rtr_info->tree_blocks[i] = tree_blocks[i];
1938 cursor->rtr_info->tree_savepoints[i] = tree_savepoints[i];
1939 }
1940 }
1941
1942 func_exit:
1943
1944 if (UNIV_LIKELY_NULL(heap)) {
1945 mem_heap_free(heap);
1946 }
1947
1948 if (retrying_for_search_prev) {
1949 ut_free(prev_tree_blocks);
1950 ut_free(prev_tree_savepoints);
1951 }
1952
1953 if (has_search_latch) {
1954
1955 rw_lock_s_lock(btr_get_search_latch(index));
1956 }
1957
1958 if (mbr_adj) {
1959 /* remember that we will need to adjust parent MBR */
1960 cursor->rtr_info->mbr_adj = true;
1961 }
1962
1963 DBUG_VOID_RETURN;
1964 }
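
/* Illustrative usage sketch (an assumption about typical callers, not
part of this file): a leaf-level point lookup positions a cursor with
the search function above inside a mini-transaction. The index and
tuple variables are assumed to be set up by the caller.

	btr_cur_t	cursor;
	mtr_t		mtr;

	mtr_start(&mtr);
	btr_cur_search_to_nth_level(
		index, 0, tuple, PAGE_CUR_LE, BTR_SEARCH_LEAF,
		&cursor, 0, __FILE__, __LINE__, &mtr);
	/* ... read btr_cur_get_rec(&cursor) ... */
	mtr_commit(&mtr);
*/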
1965
1966 /** Searches an index tree and positions a tree cursor on a given level.
1967 This function avoids latching the traversal path and so should be
1968 used only for cases where latching is not needed.
1969
1970 @param[in,out] index index
1971 @param[in] level the tree level of search
1972 @param[in] tuple data tuple; Note: n_fields_cmp in tuple must be
1973 set so that it cannot be compared to the
1974 node ptr page number field
1975 @param[in] mode PAGE_CUR_L, ....
1976 An insert should always be made using PAGE_CUR_LE
1977 to search for the position.
1977 @param[in,out] cursor tree cursor; points to record of interest.
1978 @param[in] file file name
1979 @param[in] line line where called from
1980 @param[in,out] mtr mtr
1981 @param[in] mark_dirty
1982 if true then mark the block as dirty */
1983 void
1984 btr_cur_search_to_nth_level_with_no_latch(
1985 dict_index_t* index,
1986 ulint level,
1987 const dtuple_t* tuple,
1988 page_cur_mode_t mode,
1989 btr_cur_t* cursor,
1990 const char* file,
1991 ulint line,
1992 mtr_t* mtr,
1993 bool mark_dirty)
1994 {
1995 page_t* page = NULL; /* remove warning */
1996 buf_block_t* block;
1997 ulint height;
1998 ulint up_match;
1999 ulint low_match;
2000 ulint rw_latch;
2001 page_cur_mode_t page_mode;
2002 ulint buf_mode;
2003 page_cur_t* page_cursor;
2004 ulint root_height = 0; /* remove warning */
2005 ulint n_blocks = 0;
2006
2007 mem_heap_t* heap = NULL;
2008 ulint offsets_[REC_OFFS_NORMAL_SIZE];
2009 ulint* offsets = offsets_;
2010 rec_offs_init(offsets_);
2011
2012 DBUG_ENTER("btr_cur_search_to_nth_level_with_no_latch");
2013
2014 ut_ad(dict_table_is_intrinsic(index->table));
2015 ut_ad(level == 0 || mode == PAGE_CUR_LE);
2016 ut_ad(dict_index_check_search_tuple(index, tuple));
2017 ut_ad(dtuple_check_typed(tuple));
2018 ut_ad(index->page != FIL_NULL);
2019
2020 UNIV_MEM_INVALID(&cursor->up_match, sizeof cursor->up_match);
2021 UNIV_MEM_INVALID(&cursor->low_match, sizeof cursor->low_match);
2022 #ifdef UNIV_DEBUG
2023 cursor->up_match = ULINT_UNDEFINED;
2024 cursor->low_match = ULINT_UNDEFINED;
2025 #endif /* UNIV_DEBUG */
2026
2027 cursor->flag = BTR_CUR_BINARY;
2028 cursor->index = index;
2029
2030 page_cursor = btr_cur_get_page_cur(cursor);
2031
2032 const ulint space = dict_index_get_space(index);
2033 const page_size_t page_size(dict_table_page_size(index->table));
2034 /* Start with the root page. */
2035 page_id_t page_id(space, dict_index_get_page(index));
2036
2037 up_match = 0;
2038 low_match = 0;
2039
2040 height = ULINT_UNDEFINED;
2041
2042 /* We use these modified search modes on non-leaf levels of the
2043 B-tree. These let us end up in the right B-tree leaf. In that leaf
2044 we use the original search mode. */
2045
2046 switch (mode) {
2047 case PAGE_CUR_GE:
2048 page_mode = PAGE_CUR_L;
2049 break;
2050 case PAGE_CUR_G:
2051 page_mode = PAGE_CUR_LE;
2052 break;
2053 default:
2054 page_mode = mode;
2055 break;
2056 }
2057
2058 /* Loop and search until we arrive at the desired level */
2059 bool at_desired_level = false;
2060 while (!at_desired_level) {
2061 buf_mode = BUF_GET;
2062 rw_latch = RW_NO_LATCH;
2063
2064 ut_ad(n_blocks < BTR_MAX_LEVELS);
2065
2066 block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2067 buf_mode, file, line, mtr, mark_dirty);
2068
2069 page = buf_block_get_frame(block);
2070
2071 if (height == ULINT_UNDEFINED) {
2072 /* We are in the root node */
2073
2074 height = btr_page_get_level(page, mtr);
2075 root_height = height;
2076 cursor->tree_height = root_height + 1;
2077 }
2078
2079 if (height == 0) {
2080 /* On the leaf level. Switch back to the original search mode. */
2081 page_mode = mode;
2082 }
2083
2084 page_cur_search_with_match(
2085 block, index, tuple, page_mode, &up_match,
2086 &low_match, page_cursor, NULL);
2087
2088 ut_ad(height == btr_page_get_level(
2089 page_cur_get_page(page_cursor), mtr));
2090
2091 if (level != height) {
2092
2093 const rec_t* node_ptr;
2094 ut_ad(height > 0);
2095
2096 height--;
2097
2098 node_ptr = page_cur_get_rec(page_cursor);
2099
2100 offsets = rec_get_offsets(
2101 node_ptr, index, offsets,
2102 ULINT_UNDEFINED, &heap);
2103
2104 /* Go to the child node */
2105 page_id.reset(space, btr_node_ptr_get_child_page_no(
2106 node_ptr, offsets));
2107
2108 n_blocks++;
2109 } else {
2110 /* If this is the desired level, leave the loop */
2111 at_desired_level = true;
2112 }
2113 }
2114
2115 cursor->low_match = low_match;
2116 cursor->up_match = up_match;
2117
2118 if (heap != NULL) {
2119 mem_heap_free(heap);
2120 }
2121
2122 DBUG_VOID_RETURN;
2123 }
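
/* Illustrative usage sketch (assumed caller code for an intrinsic
table; the index and tuple variables are assumptions): the no-latch
variant above positions a cursor on the leaf level while only
buffer-fixing the pages it traverses.

	btr_cur_t	cursor;
	mtr_t		mtr;

	mtr_start(&mtr);
	btr_cur_search_to_nth_level_with_no_latch(
		index, 0, tuple, PAGE_CUR_LE, &cursor,
		__FILE__, __LINE__, &mtr, false);
	/* ... use btr_cur_get_rec(&cursor) ... */
	mtr_commit(&mtr);
*/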
2124
2125 /*****************************************************************//**
2126 Opens a cursor at either end of an index. */
2127 void
2128 btr_cur_open_at_index_side_func(
2129 /*============================*/
2130 bool from_left, /*!< in: true if open to the low end,
2131 false if to the high end */
2132 dict_index_t* index, /*!< in: index */
2133 ulint latch_mode, /*!< in: latch mode */
2134 btr_cur_t* cursor, /*!< in/out: cursor */
2135 ulint level, /*!< in: level to search for
2136 (0=leaf). */
2137 const char* file, /*!< in: file name */
2138 ulint line, /*!< in: line where called */
2139 mtr_t* mtr) /*!< in/out: mini-transaction */
2140 {
2141 page_cur_t* page_cursor;
2142 ulint node_ptr_max_size = UNIV_PAGE_SIZE / 2;
2143 ulint height;
2144 ulint root_height = 0; /* remove warning */
2145 rec_t* node_ptr;
2146 ulint estimate;
2147 ulint savepoint;
2148 ulint upper_rw_latch, root_leaf_rw_latch;
2149 btr_intention_t lock_intention;
2150 buf_block_t* tree_blocks[BTR_MAX_LEVELS];
2151 ulint tree_savepoints[BTR_MAX_LEVELS];
2152 ulint n_blocks = 0;
2153 ulint n_releases = 0;
2154 mem_heap_t* heap = NULL;
2155 ulint offsets_[REC_OFFS_NORMAL_SIZE];
2156 ulint* offsets = offsets_;
2157 rec_offs_init(offsets_);
2158
2159 estimate = latch_mode & BTR_ESTIMATE;
2160 latch_mode &= ~BTR_ESTIMATE;
2161
2162 ut_ad(level != ULINT_UNDEFINED);
2163
2164 bool s_latch_by_caller;
2165
2166 s_latch_by_caller = latch_mode & BTR_ALREADY_S_LATCHED;
2167 latch_mode &= ~BTR_ALREADY_S_LATCHED;
2168
2169 lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2170
2171 ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2172
2173 /* This function does not need to lock the left page of the leaf page */
2174 if (latch_mode == BTR_SEARCH_PREV) {
2175 latch_mode = BTR_SEARCH_LEAF;
2176 } else if (latch_mode == BTR_MODIFY_PREV) {
2177 latch_mode = BTR_MODIFY_LEAF;
2178 }
2179
2180 /* Store the position of the tree latch we push to mtr so that we
2181 know how to release it when we have latched the leaf node */
2182
2183 savepoint = mtr_set_savepoint(mtr);
2184
2185 switch (latch_mode) {
2186 case BTR_CONT_MODIFY_TREE:
2187 case BTR_CONT_SEARCH_TREE:
2188 upper_rw_latch = RW_NO_LATCH;
2189 break;
2190 case BTR_MODIFY_TREE:
2191 /* Most delete-intended operations are purges.
2192 Free blocks and read I/O bandwidth should be prioritized
2193 for them when the history list is growing huge. */
2194 if (lock_intention == BTR_INTENTION_DELETE
2195 && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
2196 && buf_get_n_pending_read_ios()) {
2197 mtr_x_lock(dict_index_get_lock(index), mtr);
2198 } else {
2199 mtr_sx_lock(dict_index_get_lock(index), mtr);
2200 }
2201 upper_rw_latch = RW_X_LATCH;
2202 break;
2203 default:
2204 ut_ad(!s_latch_by_caller
2205 || mtr_memo_contains_flagged(mtr,
2206 dict_index_get_lock(index),
2207 MTR_MEMO_SX_LOCK
2208 | MTR_MEMO_S_LOCK));
2209 if (!srv_read_only_mode) {
2210 if (!s_latch_by_caller) {
2211 /* BTR_SEARCH_TREE is intended to be used with
2212 BTR_ALREADY_S_LATCHED */
2213 ut_ad(latch_mode != BTR_SEARCH_TREE);
2214
2215 mtr_s_lock(dict_index_get_lock(index), mtr);
2216 }
2217 upper_rw_latch = RW_S_LATCH;
2218 } else {
2219 upper_rw_latch = RW_NO_LATCH;
2220 }
2221 }
2222 root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
2223
2224 page_cursor = btr_cur_get_page_cur(cursor);
2225 cursor->index = index;
2226
2227 page_id_t page_id(dict_index_get_space(index),
2228 dict_index_get_page(index));
2229 const page_size_t& page_size = dict_table_page_size(index->table);
2230
2231 if (root_leaf_rw_latch == RW_X_LATCH) {
2232 node_ptr_max_size = dict_index_node_ptr_max_size(index);
2233 }
2234
2235 height = ULINT_UNDEFINED;
2236
2237 for (;;) {
2238 buf_block_t* block;
2239 page_t* page;
2240 ulint rw_latch;
2241
2242 ut_ad(n_blocks < BTR_MAX_LEVELS);
2243
2244 if (height != 0
2245 && (latch_mode != BTR_MODIFY_TREE
2246 || height == level)) {
2247 rw_latch = upper_rw_latch;
2248 } else {
2249 rw_latch = RW_NO_LATCH;
2250 }
2251
2252 tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2253 block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2254 BUF_GET, file, line, mtr);
2255 tree_blocks[n_blocks] = block;
2256
2257 page = buf_block_get_frame(block);
2258
2259 if (height == ULINT_UNDEFINED
2260 && btr_page_get_level(page, mtr) == 0
2261 && rw_latch != RW_NO_LATCH
2262 && rw_latch != root_leaf_rw_latch) {
2263 /* We should retry getting the page: the root page is also a
2264 leaf page, but it was latched with a different latch mode. */
2265 ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
2266 ut_ad(rw_latch == RW_S_LATCH);
2267
2268 ut_ad(n_blocks == 0);
2269 mtr_release_block_at_savepoint(
2270 mtr, tree_savepoints[n_blocks],
2271 tree_blocks[n_blocks]);
2272
2273 upper_rw_latch = root_leaf_rw_latch;
2274 continue;
2275 }
2276
2277 ut_ad(fil_page_index_page_check(page));
2278 ut_ad(index->id == btr_page_get_index_id(page));
2279
2280 if (height == ULINT_UNDEFINED) {
2281 /* We are in the root node */
2282
2283 height = btr_page_get_level(page, mtr);
2284 root_height = height;
2285 ut_a(height >= level);
2286 } else {
2287 /* TODO: flag the index corrupted if this fails */
2288 ut_ad(height == btr_page_get_level(page, mtr));
2289 }
2290
2291 if (height == level) {
2292 if (srv_read_only_mode) {
2293 btr_cur_latch_leaves(
2294 block, page_id, page_size,
2295 latch_mode, cursor, mtr);
2296 } else if (height == 0) {
2297 if (rw_latch == RW_NO_LATCH) {
2298 btr_cur_latch_leaves(
2299 block, page_id, page_size,
2300 latch_mode, cursor, mtr);
2301 }
2302 /* In versions <= 3.23.52 we had
2303 forgotten to release the tree latch
2304 here. If in an index scan we had to
2305 scan far to find a record visible to
2306 the current transaction, that could
2307 starve others waiting for the tree
2308 latch. */
2309
2310 switch (latch_mode) {
2311 case BTR_MODIFY_TREE:
2312 case BTR_CONT_MODIFY_TREE:
2313 case BTR_CONT_SEARCH_TREE:
2314 break;
2315 default:
2316 if (!s_latch_by_caller) {
2317 /* Release the tree s-latch */
2318 mtr_release_s_latch_at_savepoint(
2319 mtr, savepoint,
2320 dict_index_get_lock(
2321 index));
2322 }
2323
2324 /* release upper blocks */
2325 for (; n_releases < n_blocks;
2326 n_releases++) {
2327 mtr_release_block_at_savepoint(
2328 mtr,
2329 tree_savepoints[
2330 n_releases],
2331 tree_blocks[
2332 n_releases]);
2333 }
2334 }
2335 } else { /* height != 0 */
2336 /* We already have the block latched. */
2337 ut_ad(latch_mode == BTR_SEARCH_TREE);
2338 ut_ad(s_latch_by_caller);
2339 ut_ad(upper_rw_latch == RW_S_LATCH);
2340
2341 ut_ad(mtr_memo_contains(mtr, block,
2342 upper_rw_latch));
2343
2344 if (s_latch_by_caller) {
2345 /* to exclude tree-modifying operations, the
2346 caller should hold an sx-latch on the index. */
2347 ut_ad(mtr_memo_contains(
2348 mtr,
2349 dict_index_get_lock(index),
2350 MTR_MEMO_SX_LOCK));
2351 /* because we hold an sx-latch on the index,
2352 we can release the upper blocks. */
2353 for (; n_releases < n_blocks;
2354 n_releases++) {
2355 mtr_release_block_at_savepoint(
2356 mtr,
2357 tree_savepoints[
2358 n_releases],
2359 tree_blocks[
2360 n_releases]);
2361 }
2362 }
2363 }
2364 }
2365
2366 if (from_left) {
2367 page_cur_set_before_first(block, page_cursor);
2368 } else {
2369 page_cur_set_after_last(block, page_cursor);
2370 }
2371
2372 if (height == level) {
2373 if (estimate) {
2374 btr_cur_add_path_info(cursor, height,
2375 root_height);
2376 }
2377
2378 break;
2379 }
2380
2381 ut_ad(height > 0);
2382
2383 if (from_left) {
2384 page_cur_move_to_next(page_cursor);
2385 } else {
2386 page_cur_move_to_prev(page_cursor);
2387 }
2388
2389 if (estimate) {
2390 btr_cur_add_path_info(cursor, height, root_height);
2391 }
2392
2393 height--;
2394
2395 node_ptr = page_cur_get_rec(page_cursor);
2396 offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2397 ULINT_UNDEFINED, &heap);
2398
2399 /* If the rec is the first or last on the page and the intention
2400 is pessimistic delete, the operation might require a node_ptr
2401 insert at the upper level. We should change the intention and
2402 retry. */
2403 if (latch_mode == BTR_MODIFY_TREE
2404 && btr_cur_need_opposite_intention(
2405 page, lock_intention, node_ptr)) {
2406
2407 ut_ad(upper_rw_latch == RW_X_LATCH);
2408 /* release all blocks */
2409 for (; n_releases <= n_blocks; n_releases++) {
2410 mtr_release_block_at_savepoint(
2411 mtr, tree_savepoints[n_releases],
2412 tree_blocks[n_releases]);
2413 }
2414
2415 lock_intention = BTR_INTENTION_BOTH;
2416
2417 page_id.set_page_no(dict_index_get_page(index));
2418
2419 height = ULINT_UNDEFINED;
2420
2421 n_blocks = 0;
2422 n_releases = 0;
2423
2424 continue;
2425 }
2426
2427 if (latch_mode == BTR_MODIFY_TREE
2428 && !btr_cur_will_modify_tree(
2429 cursor->index, page, lock_intention, node_ptr,
2430 node_ptr_max_size, page_size, mtr)) {
2431 ut_ad(upper_rw_latch == RW_X_LATCH);
2432 ut_ad(n_releases <= n_blocks);
2433
2434 /* we can release upper blocks */
2435 for (; n_releases < n_blocks; n_releases++) {
2436 if (n_releases == 0) {
2437 /* do not release the root page, so that
2438 we stay pinned to the same block. */
2439 continue;
2440 }
2441
2442 /* release unused blocks to unpin */
2443 mtr_release_block_at_savepoint(
2444 mtr, tree_savepoints[n_releases],
2445 tree_blocks[n_releases]);
2446 }
2447 }
2448
2449 if (height == level
2450 && latch_mode == BTR_MODIFY_TREE) {
2451 ut_ad(upper_rw_latch == RW_X_LATCH);
2452 /* sx-latch the root page if it has already been
2453 released; it contains the segment header. */
2454 if (n_releases > 0) {
2455 mtr_block_sx_latch_at_savepoint(
2456 mtr, tree_savepoints[0],
2457 tree_blocks[0]);
2458 }
2459
2460 /* x-latch the branch blocks not released yet. */
2461 for (ulint i = n_releases; i <= n_blocks; i++) {
2462 mtr_block_x_latch_at_savepoint(
2463 mtr, tree_savepoints[i],
2464 tree_blocks[i]);
2465 }
2466 }
2467
2468 /* Go to the child node */
2469 page_id.set_page_no(
2470 btr_node_ptr_get_child_page_no(node_ptr, offsets));
2471
2472 n_blocks++;
2473 }
2474
2475 if (heap) {
2476 mem_heap_free(heap);
2477 }
2478 }
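
/* Illustrative usage sketch (assuming a btr_cur_open_at_index_side()
wrapper macro that supplies __FILE__ and __LINE__): starting a full
index scan from the low end of the leaf level.

	btr_cur_t	cursor;
	mtr_t		mtr;

	mtr_start(&mtr);
	btr_cur_open_at_index_side(
		true, index, BTR_SEARCH_LEAF, &cursor, 0, &mtr);
	/* the page cursor now stands before the first user record */
	page_cur_move_to_next(btr_cur_get_page_cur(&cursor));
	mtr_commit(&mtr);
*/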
2479
2480 /** Opens a cursor at either end of an index.
2481 Avoids taking latches on buffer blocks; it just pins them (by
2482 incrementing fix_count) to keep them in the buffer pool. This mode is
2483 used for intrinsic tables, as they are not shared and need no latching.
2484 @param[in] from_left true if open to low end, false if open
2485 to high end.
2486 @param[in] index index
2487 @param[in,out] cursor cursor
2488 @param[in] file file name
2489 @param[in] line line where called
2490 @param[in,out] mtr mini-transaction
2491 */
2492 void
2493 btr_cur_open_at_index_side_with_no_latch_func(
2494 bool from_left,
2495 dict_index_t* index,
2496 btr_cur_t* cursor,
2497 ulint level,
2498 const char* file,
2499 ulint line,
2500 mtr_t* mtr)
2501 {
2502 page_cur_t* page_cursor;
2503 ulint height;
2504 rec_t* node_ptr;
2505 ulint n_blocks = 0;
2506 mem_heap_t* heap = NULL;
2507 ulint offsets_[REC_OFFS_NORMAL_SIZE];
2508 ulint* offsets = offsets_;
2509 rec_offs_init(offsets_);
2510
2511 ut_ad(level != ULINT_UNDEFINED);
2512
2513 page_cursor = btr_cur_get_page_cur(cursor);
2514 cursor->index = index;
2515 page_id_t page_id(dict_index_get_space(index),
2516 dict_index_get_page(index));
2517 const page_size_t& page_size = dict_table_page_size(index->table);
2518
2519 height = ULINT_UNDEFINED;
2520
2521 for (;;) {
2522 buf_block_t* block;
2523 page_t* page;
2524 ulint rw_latch = RW_NO_LATCH;
2525
2526 ut_ad(n_blocks < BTR_MAX_LEVELS);
2527
2528 block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2529 BUF_GET, file, line, mtr);
2530
2531 page = buf_block_get_frame(block);
2532
2533 ut_ad(fil_page_index_page_check(page));
2534 ut_ad(index->id == btr_page_get_index_id(page));
2535
2536 if (height == ULINT_UNDEFINED) {
2537 /* We are in the root node */
2538
2539 height = btr_page_get_level(page, mtr);
2540 ut_a(height >= level);
2541 } else {
2542 /* TODO: flag the index corrupted if this fails */
2543 ut_ad(height == btr_page_get_level(page, mtr));
2544 }
2545
2546 if (from_left) {
2547 page_cur_set_before_first(block, page_cursor);
2548 } else {
2549 page_cur_set_after_last(block, page_cursor);
2550 }
2551
2552 if (height == level) {
2553 break;
2554 }
2555
2556 ut_ad(height > 0);
2557
2558 if (from_left) {
2559 page_cur_move_to_next(page_cursor);
2560 } else {
2561 page_cur_move_to_prev(page_cursor);
2562 }
2563
2564 height--;
2565
2566 node_ptr = page_cur_get_rec(page_cursor);
2567 offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2568 ULINT_UNDEFINED, &heap);
2569
2570 /* Go to the child node */
2571 page_id.set_page_no(
2572 btr_node_ptr_get_child_page_no(node_ptr, offsets));
2573
2574 n_blocks++;
2575 }
2576
2577 if (heap != NULL) {
2578 mem_heap_free(heap);
2579 }
2580 }
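
/* Illustrative sketch (assumed caller code for an intrinsic table,
per the comment above): the same side-open without latching, stepping
from the infimum onto the first user record.

	btr_cur_open_at_index_side_with_no_latch_func(
		true, index, &cursor, 0, __FILE__, __LINE__, &mtr);
	page_cur_move_to_next(btr_cur_get_page_cur(&cursor));
*/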
2581
2582 /**********************************************************************//**
2583 Positions a cursor at a randomly chosen position within a B-tree.
2584 @return true if the index is available and we have positioned the
2585 cursor; false if the index is unavailable */
2586 bool
2587 btr_cur_open_at_rnd_pos_func(
2588 /*=========================*/
2589 dict_index_t* index, /*!< in: index */
2590 ulint latch_mode, /*!< in: BTR_SEARCH_LEAF, ... */
2591 btr_cur_t* cursor, /*!< in/out: B-tree cursor */
2592 const char* file, /*!< in: file name */
2593 ulint line, /*!< in: line where called */
2594 mtr_t* mtr) /*!< in: mtr */
2595 {
2596 page_cur_t* page_cursor;
2597 ulint node_ptr_max_size = UNIV_PAGE_SIZE / 2;
2598 ulint height;
2599 rec_t* node_ptr;
2600 ulint savepoint;
2601 ulint upper_rw_latch, root_leaf_rw_latch;
2602 btr_intention_t lock_intention;
2603 buf_block_t* tree_blocks[BTR_MAX_LEVELS];
2604 ulint tree_savepoints[BTR_MAX_LEVELS];
2605 ulint n_blocks = 0;
2606 ulint n_releases = 0;
2607 mem_heap_t* heap = NULL;
2608 ulint offsets_[REC_OFFS_NORMAL_SIZE];
2609 ulint* offsets = offsets_;
2610 rec_offs_init(offsets_);
2611
2612 ut_ad(!dict_index_is_spatial(index));
2613
2614 lock_intention = btr_cur_get_and_clear_intention(&latch_mode);
2615
2616 ut_ad(!(latch_mode & BTR_MODIFY_EXTERNAL));
2617
2618 savepoint = mtr_set_savepoint(mtr);
2619
2620 switch (latch_mode) {
2621 case BTR_MODIFY_TREE:
2622 /* Most delete-intended operations are purges.
2623 Free blocks and read I/O bandwidth should be prioritized
2624 for them when the history list is growing huge. */
2625 if (lock_intention == BTR_INTENTION_DELETE
2626 && trx_sys->rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
2627 && buf_get_n_pending_read_ios()) {
2628 mtr_x_lock(dict_index_get_lock(index), mtr);
2629 } else {
2630 mtr_sx_lock(dict_index_get_lock(index), mtr);
2631 }
2632 upper_rw_latch = RW_X_LATCH;
2633 break;
2634 case BTR_SEARCH_PREV:
2635 case BTR_MODIFY_PREV:
2636 /* This function does not support latching the
2637 left uncle page, which latching the left leaf
2638 page would require. */
2639 case BTR_SEARCH_TREE:
2640 case BTR_CONT_MODIFY_TREE:
2641 case BTR_CONT_SEARCH_TREE:
2642 ut_ad(0);
2643 /* fall through */
2644 default:
2645 if (!srv_read_only_mode) {
2646 mtr_s_lock(dict_index_get_lock(index), mtr);
2647 upper_rw_latch = RW_S_LATCH;
2648 } else {
2649 upper_rw_latch = RW_NO_LATCH;
2650 }
2651 }
2652
2653 DBUG_EXECUTE_IF("test_index_is_unavailable",
2654 return(false););
2655
2656 if (index->page == FIL_NULL) {
2657 /* Since we did not hold the index lock until just now, the
2658 index could have been modified by others. For example, if this
2659 is a statistics updater for a referenced table, the index could
2660 have been marked unavailable by 'DROP TABLE' in the meantime,
2661 because we hold no lock for the statistics updater. */
2662 return(false);
2663 }
2664
2665 root_leaf_rw_latch = btr_cur_latch_for_root_leaf(latch_mode);
2666
2667 page_cursor = btr_cur_get_page_cur(cursor);
2668 cursor->index = index;
2669
2670 page_id_t page_id(dict_index_get_space(index),
2671 dict_index_get_page(index));
2672 const page_size_t& page_size = dict_table_page_size(index->table);
2673
2674 if (root_leaf_rw_latch == RW_X_LATCH) {
2675 node_ptr_max_size = dict_index_node_ptr_max_size(index);
2676 }
2677
2678 height = ULINT_UNDEFINED;
2679
2680 for (;;) {
2681 buf_block_t* block;
2682 page_t* page;
2683 ulint rw_latch;
2684
2685 ut_ad(n_blocks < BTR_MAX_LEVELS);
2686
2687 if (height != 0
2688 && latch_mode != BTR_MODIFY_TREE) {
2689 rw_latch = upper_rw_latch;
2690 } else {
2691 rw_latch = RW_NO_LATCH;
2692 }
2693
2694 tree_savepoints[n_blocks] = mtr_set_savepoint(mtr);
2695 block = buf_page_get_gen(page_id, page_size, rw_latch, NULL,
2696 BUF_GET, file, line, mtr);
2697 tree_blocks[n_blocks] = block;
2698
2699 page = buf_block_get_frame(block);
2700
2701 if (height == ULINT_UNDEFINED
2702 && btr_page_get_level(page, mtr) == 0
2703 && rw_latch != RW_NO_LATCH
2704 && rw_latch != root_leaf_rw_latch) {
2705 /* We should retry getting the page: the root page is also a
2706 leaf page, but it was latched with a different latch mode. */
2707 ut_ad(root_leaf_rw_latch != RW_NO_LATCH);
2708 ut_ad(rw_latch == RW_S_LATCH);
2709
2710 ut_ad(n_blocks == 0);
2711 mtr_release_block_at_savepoint(
2712 mtr, tree_savepoints[n_blocks],
2713 tree_blocks[n_blocks]);
2714
2715 upper_rw_latch = root_leaf_rw_latch;
2716 continue;
2717 }
2718
2719 ut_ad(fil_page_index_page_check(page));
2720 ut_ad(index->id == btr_page_get_index_id(page));
2721
2722 if (height == ULINT_UNDEFINED) {
2723 /* We are in the root node */
2724
2725 height = btr_page_get_level(page, mtr);
2726 }
2727
2728 if (height == 0) {
2729 if (rw_latch == RW_NO_LATCH
2730 || srv_read_only_mode) {
2731 btr_cur_latch_leaves(
2732 block, page_id, page_size,
2733 latch_mode, cursor, mtr);
2734 }
2735
2736 /* btr_cur_open_at_index_side_func() and
2737 btr_cur_search_to_nth_level() release the
2738 tree s-latch here. */
2739 switch (latch_mode) {
2740 case BTR_MODIFY_TREE:
2741 case BTR_CONT_MODIFY_TREE:
2742 case BTR_CONT_SEARCH_TREE:
2743 break;
2744 default:
2745 /* Release the tree s-latch */
2746 if (!srv_read_only_mode) {
2747 mtr_release_s_latch_at_savepoint(
2748 mtr, savepoint,
2749 dict_index_get_lock(index));
2750 }
2751
2752 /* release upper blocks */
2753 for (; n_releases < n_blocks; n_releases++) {
2754 mtr_release_block_at_savepoint(
2755 mtr,
2756 tree_savepoints[n_releases],
2757 tree_blocks[n_releases]);
2758 }
2759 }
2760 }
2761
2762 page_cur_open_on_rnd_user_rec(block, page_cursor);
2763
2764 if (height == 0) {
2765
2766 break;
2767 }
2768
2769 ut_ad(height > 0);
2770
2771 height--;
2772
2773 node_ptr = page_cur_get_rec(page_cursor);
2774 offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
2775 ULINT_UNDEFINED, &heap);
2776
2777 /* If the rec is the first or last on the page and the intention
2778 is pessimistic delete, the operation might require a node_ptr
2779 insert at the upper level. We should change the intention and
2780 retry. */
2781 if (latch_mode == BTR_MODIFY_TREE
2782 && btr_cur_need_opposite_intention(
2783 page, lock_intention, node_ptr)) {
2784
2785 ut_ad(upper_rw_latch == RW_X_LATCH);
2786 /* release all blocks */
2787 for (; n_releases <= n_blocks; n_releases++) {
2788 mtr_release_block_at_savepoint(
2789 mtr, tree_savepoints[n_releases],
2790 tree_blocks[n_releases]);
2791 }
2792
2793 lock_intention = BTR_INTENTION_BOTH;
2794
2795 page_id.set_page_no(dict_index_get_page(index));
2796
2797 height = ULINT_UNDEFINED;
2798
2799 n_blocks = 0;
2800 n_releases = 0;
2801
2802 continue;
2803 }
2804
2805 if (latch_mode == BTR_MODIFY_TREE
2806 && !btr_cur_will_modify_tree(
2807 cursor->index, page, lock_intention, node_ptr,
2808 node_ptr_max_size, page_size, mtr)) {
2809 ut_ad(upper_rw_latch == RW_X_LATCH);
2810 ut_ad(n_releases <= n_blocks);
2811
2812 /* we can release upper blocks */
2813 for (; n_releases < n_blocks; n_releases++) {
2814 if (n_releases == 0) {
2815 /* do not release the root page, so that
2816 we stay pinned to the same block. */
2817 continue;
2818 }
2819
2820 /* release unused blocks to unpin */
2821 mtr_release_block_at_savepoint(
2822 mtr, tree_savepoints[n_releases],
2823 tree_blocks[n_releases]);
2824 }
2825 }
2826
2827 if (height == 0
2828 && latch_mode == BTR_MODIFY_TREE) {
2829 ut_ad(upper_rw_latch == RW_X_LATCH);
2830 /* sx-latch the root page if it has already been
2831 released; it contains the segment header. */
2832 if (n_releases > 0) {
2833 mtr_block_sx_latch_at_savepoint(
2834 mtr, tree_savepoints[0],
2835 tree_blocks[0]);
2836 }
2837
2838 /* x-latch the branch blocks not released yet. */
2839 for (ulint i = n_releases; i <= n_blocks; i++) {
2840 mtr_block_x_latch_at_savepoint(
2841 mtr, tree_savepoints[i],
2842 tree_blocks[i]);
2843 }
2844 }
2845
2846 /* Go to the child node */
2847 page_id.set_page_no(
2848 btr_node_ptr_get_child_page_no(node_ptr, offsets));
2849
2850 n_blocks++;
2851 }
2852
2853 if (UNIV_LIKELY_NULL(heap)) {
2854 mem_heap_free(heap);
2855 }
2856
2857 return(true);
2858 }
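
/* Illustrative usage sketch (assuming a btr_cur_open_at_rnd_pos()
wrapper macro that supplies __FILE__ and __LINE__), e.g. for sampling
random records from an index:

	mtr_start(&mtr);
	if (btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
				    &cursor, &mtr)) {
		/* process the randomly chosen record */
		const rec_t*	rec = btr_cur_get_rec(&cursor);
	}
	mtr_commit(&mtr);
*/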
2859
2860 /*==================== B-TREE INSERT =========================*/
2861
2862 /*************************************************************//**
2863 Inserts a record if there is enough space, or if enough space can
2864 be freed by reorganizing. Differs from btr_cur_optimistic_insert in that
2865 no heuristic is applied as to whether it pays off to spend CPU time on
2866 reorganizing the page.
2867
2868 IMPORTANT: The caller will have to update IBUF_BITMAP_FREE
2869 if this is a compressed leaf page in a secondary index.
2870 This has to be done either within the same mini-transaction,
2871 or by invoking ibuf_reset_free_bits() before mtr_commit().
2872
2873 @return pointer to inserted record if the insert succeeds, else NULL */
2874 static MY_ATTRIBUTE((nonnull, warn_unused_result))
2875 rec_t*
2876 btr_cur_insert_if_possible(
2877 /*=======================*/
2878 btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
2879 cursor stays valid */
2880 const dtuple_t* tuple, /*!< in: tuple to insert; the size info need not
2881 have been stored to tuple */
2882 ulint** offsets,/*!< out: offsets on *rec */
2883 mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
2884 ulint n_ext, /*!< in: number of externally stored columns */
2885 mtr_t* mtr) /*!< in/out: mini-transaction */
2886 {
2887 page_cur_t* page_cursor;
2888 rec_t* rec;
2889
2890 ut_ad(dtuple_check_typed(tuple));
2891
2892 ut_ad(mtr_is_block_fix(
2893 mtr, btr_cur_get_block(cursor),
2894 MTR_MEMO_PAGE_X_FIX, cursor->index->table));
2895 page_cursor = btr_cur_get_page_cur(cursor);
2896
2897 /* Now, try the insert */
2898 rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
2899 offsets, heap, n_ext, mtr);
2900
2901 /* If the record did not fit, reorganize.
2902 For compressed pages, page_cur_tuple_insert()
2903 attempted this already. */
2904 if (!rec && !page_cur_get_page_zip(page_cursor)
2905 && btr_page_reorganize(page_cursor, cursor->index, mtr)) {
2906 rec = page_cur_tuple_insert(
2907 page_cursor, tuple, cursor->index,
2908 offsets, heap, n_ext, mtr);
2909 }
2910
2911 ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
2912 return(rec);
2913 }
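
/* Illustrative caller pattern (a sketch; cursor, tuple and mtr setup
is assumed): offsets and heap are passed by reference, so a successful
insert leaves them describing the inserted record.

	ulint*		offsets	= NULL;
	mem_heap_t*	heap	= NULL;
	rec_t*		rec;

	rec = btr_cur_insert_if_possible(
		cursor, tuple, &offsets, &heap, 0, mtr);
	if (rec == NULL) {
		/* too little space even after reorganizing:
		the caller falls back to a pessimistic operation */
	}
*/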
2914
2915 /*************************************************************//**
2916 For an insert, checks the locks and does the undo logging if desired.
2917 @return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
2918 UNIV_INLINE MY_ATTRIBUTE((warn_unused_result, nonnull(2,3,5,6)))
2919 dberr_t
2920 btr_cur_ins_lock_and_undo(
2921 /*======================*/
2922 ulint flags, /*!< in: undo logging and locking flags: if
2923 not zero, the parameters index and thr
2924 should be specified */
2925 btr_cur_t* cursor, /*!< in: cursor on page after which to insert */
2926 dtuple_t* entry, /*!< in/out: entry to insert */
2927 que_thr_t* thr, /*!< in: query thread or NULL */
2928 mtr_t* mtr, /*!< in/out: mini-transaction */
2929 ibool* inherit)/*!< out: TRUE if the newly inserted record
2930 should possibly inherit LOCK_GAP type locks
2931 from the successor record */
2932 {
2933 dict_index_t* index;
2934 dberr_t err = DB_SUCCESS;
2935 rec_t* rec;
2936 roll_ptr_t roll_ptr;
2937
2938 /* Check if we have to wait for a lock: enqueue an explicit lock
2939 request if yes */
2940
2941 rec = btr_cur_get_rec(cursor);
2942 index = cursor->index;
2943
2944 ut_ad(!dict_index_is_online_ddl(index)
2945 || dict_index_is_clust(index)
2946 || (flags & BTR_CREATE_FLAG));
2947 ut_ad(mtr->is_named_space(index->space));
2948
2949 /* Check if there is predicate or GAP lock preventing the insertion */
2950 if (!(flags & BTR_NO_LOCKING_FLAG)) {
2951 if (dict_index_is_spatial(index)) {
2952 lock_prdt_t prdt;
2953 rtr_mbr_t mbr;
2954
2955 rtr_get_mbr_from_tuple(entry, &mbr);
2956
2957 /* Use an on-stack MBR variable to test whether a lock
2958 is needed. If so, the predicate (MBR) will be allocated
2959 from the lock heap in lock_prdt_insert_check_and_lock() */
2960 lock_init_prdt_from_mbr(
2961 &prdt, &mbr, 0, NULL);
2962
2963 err = lock_prdt_insert_check_and_lock(
2964 flags, rec, btr_cur_get_block(cursor),
2965 index, thr, mtr, &prdt);
2966 *inherit = false;
2967 } else {
2968 err = lock_rec_insert_check_and_lock(
2969 flags, rec, btr_cur_get_block(cursor),
2970 index, thr, mtr, inherit);
2971 }
2972 }
2973
2974 if (err != DB_SUCCESS
2975 || !dict_index_is_clust(index) || dict_index_is_ibuf(index)) {
2976
2977 return(err);
2978 }
2979
2980 err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
2981 thr, index, entry,
2982 NULL, 0, NULL, NULL,
2983 &roll_ptr);
2984 if (err != DB_SUCCESS) {
2985
2986 return(err);
2987 }
2988
2989 /* Now we can fill in the roll ptr field in entry
2990 (except if table is intrinsic) */
2991
2992 if (!(flags & BTR_KEEP_SYS_FLAG)
2993 && !dict_table_is_intrinsic(index->table)) {
2994
2995 row_upd_index_entry_sys_field(entry, index,
2996 DATA_ROLL_PTR, roll_ptr);
2997 }
2998
2999 return(DB_SUCCESS);
3000 }
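
/* Illustrative note (a sketch under assumptions about the caller):
with BTR_NO_LOCKING_FLAG the lock check is skipped entirely, and for
secondary or ibuf indexes the function returns before undo logging,
so only clustered index inserts reach
trx_undo_report_row_operation().

	ibool	inherit;
	dberr_t	err = btr_cur_ins_lock_and_undo(
		BTR_NO_LOCKING_FLAG, cursor, entry, thr, mtr, &inherit);
*/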
3001
3002 /**
3003 Prefetch siblings of the leaf for the pessimistic operation.
3004 @param block leaf page */
3005 static
3006 void
3007 btr_cur_prefetch_siblings(
3008 buf_block_t* block)
3009 {
3010 page_t* page = buf_block_get_frame(block);
3011
3012 ut_ad(page_is_leaf(page));
3013
3014 ulint left_page_no = fil_page_get_prev(page);
3015 ulint right_page_no = fil_page_get_next(page);
3016
3017 if (left_page_no != FIL_NULL) {
3018 buf_read_page_background(
3019 page_id_t(block->page.id.space(), left_page_no),
3020 block->page.size, false);
3021 }
3022 if (right_page_no != FIL_NULL) {
3023 buf_read_page_background(
3024 page_id_t(block->page.id.space(), right_page_no),
3025 block->page.size, false);
3026 }
3027 if (left_page_no != FIL_NULL
3028 || right_page_no != FIL_NULL) {
3029 os_aio_simulated_wake_handler_threads();
3030 }
3031 }
3032
3033 /*************************************************************//**
3034 Tries to perform an insert to a page in an index tree, next to cursor.
3035 It is assumed that mtr holds an x-latch on the page. The operation does
3036 not succeed if there is too little space on the page. If there is just
3037 one record on the page, the insert will always succeed; this is to
3038 prevent trying to split a page with just one record.
3039 @return DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
3040 dberr_t
3041 btr_cur_optimistic_insert(
3042 /*======================*/
3043 ulint flags, /*!< in: undo logging and locking flags: if not
3044 zero, the parameters index and thr should be
3045 specified */
3046 btr_cur_t* cursor, /*!< in: cursor on page after which to insert;
3047 cursor stays valid */
3048 ulint** offsets,/*!< out: offsets on *rec */
3049 mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
3050 dtuple_t* entry, /*!< in/out: entry to insert */
3051 rec_t** rec, /*!< out: pointer to inserted record if
3052 succeed */
3053 big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
3054 be stored externally by the caller, or
3055 NULL */
3056 ulint n_ext, /*!< in: number of externally stored columns */
3057 que_thr_t* thr, /*!< in: query thread or NULL */
3058 mtr_t* mtr) /*!< in/out: mini-transaction;
3059 if this function returns DB_SUCCESS on
3060 a leaf page of a secondary index in a
3061 compressed tablespace, the caller must
3062 mtr_commit(mtr) before latching
3063 any further pages */
3064 {
3065 big_rec_t* big_rec_vec = NULL;
3066 dict_index_t* index;
3067 page_cur_t* page_cursor;
3068 buf_block_t* block;
3069 page_t* page;
3070 rec_t* dummy;
3071 ibool leaf;
3072 ibool reorg;
3073 ibool inherit = TRUE;
3074 ulint rec_size;
3075 dberr_t err;
3076
3077 *big_rec = NULL;
3078
3079 block = btr_cur_get_block(cursor);
3080 page = buf_block_get_frame(block);
3081 index = cursor->index;
3082
3083 /* Blocks are not latched for an insert if the table is intrinsic
3084 and the index is an auto-generated clustered index. */
3085 ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
3086 ut_ad(!dict_index_is_online_ddl(index)
3087 || dict_index_is_clust(index)
3088 || (flags & BTR_CREATE_FLAG));
3089 ut_ad(dtuple_check_typed(entry));
3090
3091 const page_size_t& page_size = block->page.size;
3092
3093 #ifdef UNIV_DEBUG_VALGRIND
3094 if (page_size.is_compressed()) {
3095 UNIV_MEM_ASSERT_RW(page, page_size.logical());
3096 UNIV_MEM_ASSERT_RW(block->page.zip.data, page_size.physical());
3097 }
3098 #endif /* UNIV_DEBUG_VALGRIND */
3099
3100 leaf = page_is_leaf(page);
3101
3102 /* Calculate the record size when entry is converted to a record */
3103 rec_size = rec_get_converted_size(index, entry, n_ext);
3104
3105 if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
3106 dtuple_get_n_fields(entry), page_size)) {
3107
3108 /* The record is so big that we have to store some fields
3109 externally on separate database pages */
3110 big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3111
3112 if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
3113
3114 return(DB_TOO_BIG_RECORD);
3115 }
3116
3117 rec_size = rec_get_converted_size(index, entry, n_ext);
3118 }
3119
3120 if (page_size.is_compressed() && page_zip_is_too_big(index, entry)) {
3121 if (big_rec_vec != NULL) {
3122 dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3123 }
3124
3125 return(DB_TOO_BIG_RECORD);
3126 }
3127
3128 LIMIT_OPTIMISTIC_INSERT_DEBUG(page_get_n_recs(page),
3129 goto fail);
3130
3131 if (leaf && page_size.is_compressed()
3132 && (page_get_data_size(page) + rec_size
3133 >= dict_index_zip_pad_optimal_page_size(index))) {
3134 /* If compression padding tells us that the insertion would
3135 result in a too densely packed page, which is likely to cause
3136 a compression failure, then do not attempt an optimistic
3137 insertion. */
3138 fail:
3139 err = DB_FAIL;
3140
3141 /* prefetch the siblings of the leaf for the pessimistic
3142 operation, if the page is a leaf. */
3143 if (page_is_leaf(page)) {
3144 btr_cur_prefetch_siblings(block);
3145 }
3146 fail_err:
3147
3148 if (big_rec_vec) {
3149 dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3150 }
3151
3152 return(err);
3153 }
3154
3155 ulint max_size = page_get_max_insert_size_after_reorganize(page, 1);
3156
3157 if (page_has_garbage(page)) {
3158 if ((max_size < rec_size
3159 || max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT)
3160 && page_get_n_recs(page) > 1
3161 && page_get_max_insert_size(page, 1) < rec_size) {
3162
3163 goto fail;
3164 }
3165 } else if (max_size < rec_size) {
3166 goto fail;
3167 }
3168
3169 /* If there have been many consecutive inserts to the
3170 clustered index leaf page of an uncompressed table, check if
3171 we have to split the page to reserve enough free space for
3172 future updates of records. */
3173
3174 if (leaf && !page_size.is_compressed() && dict_index_is_clust(index)
3175 && page_get_n_recs(page) >= 2
3176 && dict_index_get_space_reserve() + rec_size > max_size
3177 && (btr_page_get_split_rec_to_right(cursor, &dummy)
3178 || btr_page_get_split_rec_to_left(cursor, &dummy))) {
3179 goto fail;
3180 }
3181
3182 page_cursor = btr_cur_get_page_cur(cursor);
3183
3184 DBUG_PRINT("ib_cur", ("insert %s (" IB_ID_FMT ") by " TRX_ID_FMT
3185 ": %s",
3186 index->name(), index->id,
3187 thr != NULL
3188 ? trx_get_id_for_print(thr_get_trx(thr))
3189 : 0,
3190 rec_printer(entry).str().c_str()));
3191
3192 DBUG_EXECUTE_IF("do_page_reorganize",
3193 btr_page_reorganize(page_cursor, index, mtr););
3194
3195 /* Now, try the insert */
3196 {
3197 const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
3198
3199 if (dict_table_is_intrinsic(index->table)) {
3200
3201 index->rec_cache.rec_size = rec_size;
3202
3203 *rec = page_cur_tuple_direct_insert(
3204 page_cursor, entry, index, n_ext, mtr);
3205 } else {
3206 /* Check locks and write to the undo log,
3207 if specified */
3208 err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3209 thr, mtr, &inherit);
3210
3211 if (err != DB_SUCCESS) {
3212 goto fail_err;
3213 }
3214
3215 *rec = page_cur_tuple_insert(
3216 page_cursor, entry, index, offsets, heap,
3217 n_ext, mtr);
3218 }
3219
3220 reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
3221 }
3222
3223 if (*rec) {
3224 } else if (page_size.is_compressed()) {
3225 /* Reset the IBUF_BITMAP_FREE bits, because
3226 page_cur_tuple_insert() will have attempted page
3227 reorganize before failing. */
3228 if (leaf
3229 && !dict_index_is_clust(index)
3230 && !dict_table_is_temporary(index->table)) {
3231 ibuf_reset_free_bits(block);
3232 }
3233
3234 goto fail;
3235 } else {
3236
3237 /* For an intrinsic table we take a consistent path
3238 and reorganize via the pessimistic path. */
3239 if (dict_table_is_intrinsic(index->table)) {
3240 goto fail;
3241 }
3242
3243 ut_ad(!reorg);
3244
3245 /* If the record did not fit, reorganize */
3246 if (!btr_page_reorganize(page_cursor, index, mtr)) {
3247 ut_ad(0);
3248 goto fail;
3249 }
3250
3251 ut_ad(page_get_max_insert_size(page, 1) == max_size);
3252
3253 reorg = TRUE;
3254
3255 *rec = page_cur_tuple_insert(page_cursor, entry, index,
3256 offsets, heap, n_ext, mtr);
3257
3258 if (UNIV_UNLIKELY(!*rec)) {
3259 ib::fatal() << "Cannot insert tuple " << *entry
3260 << "into index " << index->name
3261 << " of table " << index->table->name
3262 << ". Max size: " << max_size;
3263 }
3264 }
3265
3266 if (!index->disable_ahi) {
3267 if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
3268 btr_search_update_hash_node_on_insert(cursor);
3269 } else {
3270 btr_search_update_hash_on_insert(cursor);
3271 }
3272 }
3273
3274 if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
3275
3276 lock_update_insert(block, *rec);
3277 }
3278
3279 if (leaf
3280 && !dict_index_is_clust(index)
3281 && !dict_table_is_temporary(index->table)) {
3282 /* Update the free bits of the B-tree page in the
3283 insert buffer bitmap. */
3284
3285 /* The free bits in the insert buffer bitmap must
3286 never exceed the free space on a page. It is safe to
3287 decrement or reset the bits in the bitmap in a
3288 mini-transaction that is committed before the
3289 mini-transaction that affects the free space. */
3290
3291 /* It is unsafe to increment the bits in a separately
3292 committed mini-transaction, because in crash recovery,
3293 the free bits could momentarily be set too high. */
3294
3295 if (page_size.is_compressed()) {
3296 /* Update the bits in the same mini-transaction. */
3297 ibuf_update_free_bits_zip(block, mtr);
3298 } else {
3299 /* Decrement the bits in a separate
3300 mini-transaction. */
3301 ibuf_update_free_bits_if_full(
3302 block, max_size,
3303 rec_size + PAGE_DIR_SLOT_SIZE);
3304 }
3305 }
3306
3307 *big_rec = big_rec_vec;
3308
3309 return(DB_SUCCESS);
3310 }
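
/* Illustrative caller pattern (a sketch of the usual optimistic ->
pessimistic retry; variable setup is assumed):

	err = btr_cur_optimistic_insert(
		flags, cursor, &offsets, &heap, entry,
		&rec, &big_rec, n_ext, thr, &mtr);

	if (err == DB_FAIL) {
		/* too little space on the page: retry with the
		pessimistic routine, which may split pages */
		err = btr_cur_pessimistic_insert(
			flags, cursor, &offsets, &heap, entry,
			&rec, &big_rec, n_ext, thr, &mtr);
	}
*/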
3311
3312 /*************************************************************//**
3313 Performs an insert on a page of an index tree. It is assumed that mtr
3314 holds an x-latch on the tree and on the cursor page. If the insert is
3315 made on the leaf level, to avoid deadlocks, mtr must also own x-latches
3316 to brothers of page, if those brothers exist.
3317 @return DB_SUCCESS or error number */
3318 dberr_t
3319 btr_cur_pessimistic_insert(
3320 /*=======================*/
3321 ulint flags, /*!< in: undo logging and locking flags: if not
3322 zero, the parameter thr should be
3323 specified; if no undo logging is specified,
3324 then the caller must have reserved enough
3325 free extents in the file space so that the
3326 insertion will certainly succeed */
3327 btr_cur_t* cursor, /*!< in: cursor after which to insert;
3328 cursor stays valid */
3329 ulint** offsets,/*!< out: offsets on *rec */
3330 mem_heap_t** heap, /*!< in/out: pointer to memory heap
3331 that can be emptied, or NULL */
3332 dtuple_t* entry, /*!< in/out: entry to insert */
3333 rec_t** rec, /*!< out: pointer to inserted record if
3334 succeed */
3335 big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
3336 be stored externally by the caller, or
3337 NULL */
3338 ulint n_ext, /*!< in: number of externally stored columns */
3339 que_thr_t* thr, /*!< in: query thread or NULL */
3340 mtr_t* mtr) /*!< in/out: mini-transaction */
3341 {
3342 dict_index_t* index = cursor->index;
3343 big_rec_t* big_rec_vec = NULL;
3344 dberr_t err;
3345 ibool inherit = FALSE;
3346 bool success;
3347 ulint n_reserved = 0;
3348
3349 ut_ad(dtuple_check_typed(entry));
3350
3351 *big_rec = NULL;
3352
3353 ut_ad(mtr_memo_contains_flagged(
3354 mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
3355 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
3356 || dict_table_is_intrinsic(cursor->index->table));
3357 ut_ad(mtr_is_block_fix(
3358 mtr, btr_cur_get_block(cursor),
3359 MTR_MEMO_PAGE_X_FIX, cursor->index->table));
3360 ut_ad(!dict_index_is_online_ddl(index)
3361 || dict_index_is_clust(index)
3362 || (flags & BTR_CREATE_FLAG));
3363
3364 cursor->flag = BTR_CUR_BINARY;
3365
3366 /* Check locks and write to undo log, if specified */
3367
3368 err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
3369 thr, mtr, &inherit);
3370
3371 if (err != DB_SUCCESS) {
3372
3373 return(err);
3374 }
3375
3376 if (!(flags & BTR_NO_UNDO_LOG_FLAG)
3377 || dict_table_is_intrinsic(index->table)) {
3378 /* First reserve enough free space for the file segments
3379 of the index tree, so that the insert will not fail because
3380 of lack of space */
3381
3382 ulint n_extents = cursor->tree_height / 16 + 3;
3383
3384 success = fsp_reserve_free_extents(&n_reserved, index->space,
3385 n_extents, FSP_NORMAL, mtr);
3386 if (!success) {
3387 return(DB_OUT_OF_FILE_SPACE);
3388 }
3389 }
3390
3391 if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
3392 dict_table_is_comp(index->table),
3393 dtuple_get_n_fields(entry),
3394 dict_table_page_size(index->table))) {
3395 /* The record is so big that we have to store some fields
3396 externally on separate database pages */
3397
3398 if (UNIV_LIKELY_NULL(big_rec_vec)) {
3399 /* This should never happen, but we handle
3400 the situation in a robust manner. */
3401 ut_ad(0);
3402 dtuple_convert_back_big_rec(index, entry, big_rec_vec);
3403 }
3404
3405 big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
3406
3407 if (big_rec_vec == NULL) {
3408
3409 if (n_reserved > 0) {
3410 fil_space_release_free_extents(index->space,
3411 n_reserved);
3412 }
3413 return(DB_TOO_BIG_RECORD);
3414 }
3415 }
3416
3417 if (dict_index_get_page(index)
3418 == btr_cur_get_block(cursor)->page.id.page_no()) {
3419
3420 /* The page is the root page */
3421 *rec = btr_root_raise_and_insert(
3422 flags, cursor, offsets, heap, entry, n_ext, mtr);
3423 } else {
3424 *rec = btr_page_split_and_insert(
3425 flags, cursor, offsets, heap, entry, n_ext, mtr);
3426 }
3427
3428 ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec
3429 || dict_index_is_spatial(index));
3430
3431 if (!(flags & BTR_NO_LOCKING_FLAG)) {
3432 ut_ad(!dict_table_is_temporary(index->table));
3433 if (dict_index_is_spatial(index)) {
3434 /* Do nothing */
3435 } else {
3436 /* The cursor might have been moved to another page,
3437 so the max trx id field should be updated after the
3438 cursor has been fixed. */
3439 if (!dict_index_is_clust(index)) {
3440 page_update_max_trx_id(
3441 btr_cur_get_block(cursor),
3442 btr_cur_get_page_zip(cursor),
3443 thr_get_trx(thr)->id, mtr);
3444 }
3445 if (!page_rec_is_infimum(btr_cur_get_rec(cursor))
3446 || btr_page_get_prev(
3447 buf_block_get_frame(
3448 btr_cur_get_block(cursor)), mtr)
3449 == FIL_NULL) {
3450 /* after a split-and-insert we always need to
3451 call lock_update_insert(). */
3452 inherit = TRUE;
3453 }
3454 }
3455 }
3456
3457 if (!index->disable_ahi) {
3458 btr_search_update_hash_on_insert(cursor);
3459 }
3460 if (inherit && !(flags & BTR_NO_LOCKING_FLAG)) {
3461
3462 lock_update_insert(btr_cur_get_block(cursor), *rec);
3463 }
3464
3465 if (n_reserved > 0) {
3466 fil_space_release_free_extents(index->space, n_reserved);
3467 }
3468
3469 *big_rec = big_rec_vec;
3470
3471 return(DB_SUCCESS);
3472 }
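
/* Worked example for the extent reservation above (illustrative):
with cursor->tree_height == 3, n_extents = 3 / 16 + 3 = 3, so three
extents are reserved via fsp_reserve_free_extents() before any page
is split. This ensures the insert cannot fail half-way for lack of
tablespace; the reservation is released when the insert completes. */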
3473
3474 /*==================== B-TREE UPDATE =========================*/
3475
3476 /*************************************************************//**
3477 For an update, checks the locks and does the undo logging.
3478 @return DB_SUCCESS, DB_WAIT_LOCK, or error number */
3479 UNIV_INLINE MY_ATTRIBUTE((warn_unused_result))
3480 dberr_t
3481 btr_cur_upd_lock_and_undo(
3482 /*======================*/
3483 ulint flags, /*!< in: undo logging and locking flags */
3484 btr_cur_t* cursor, /*!< in: cursor on record to update */
3485 const ulint* offsets,/*!< in: rec_get_offsets() on cursor */
3486 const upd_t* update, /*!< in: update vector */
3487 ulint cmpl_info,/*!< in: compiler info on secondary index
3488 updates */
3489 que_thr_t* thr, /*!< in: query thread
3490 (can be NULL if BTR_NO_LOCKING_FLAG) */
3491 mtr_t* mtr, /*!< in/out: mini-transaction */
3492 roll_ptr_t* roll_ptr)/*!< out: roll pointer */
3493 {
3494 dict_index_t* index;
3495 const rec_t* rec;
3496 dberr_t err;
3497
3498 ut_ad(thr != NULL || (flags & BTR_NO_LOCKING_FLAG));
3499
3500 rec = btr_cur_get_rec(cursor);
3501 index = cursor->index;
3502
3503 ut_ad(rec_offs_validate(rec, index, offsets));
3504 ut_ad(mtr->is_named_space(index->space));
3505
3506 if (!dict_index_is_clust(index)) {
3507 ut_ad(dict_index_is_online_ddl(index)
3508 == !!(flags & BTR_CREATE_FLAG));
3509
3510 /* We do undo logging only when we update a clustered index
3511 record */
3512 return(lock_sec_rec_modify_check_and_lock(
3513 flags, btr_cur_get_block(cursor), rec,
3514 index, thr, mtr));
3515 }
3516
3517 /* Check if we have to wait for a lock: enqueue an explicit lock
3518 request if yes */
3519
3520 if (!(flags & BTR_NO_LOCKING_FLAG)) {
3521 err = lock_clust_rec_modify_check_and_lock(
3522 flags, btr_cur_get_block(cursor), rec, index,
3523 offsets, thr);
3524 if (err != DB_SUCCESS) {
3525 return(err);
3526 }
3527 }
3528
3529 /* Append the info about the update in the undo log */
3530
3531 return(trx_undo_report_row_operation(
3532 flags, TRX_UNDO_MODIFY_OP, thr,
3533 index, NULL, update,
3534 cmpl_info, rec, offsets, roll_ptr));
3535 }
3536
3537 /***********************************************************//**
3538 Writes a redo log record of updating a record in-place. */
3539 void
3540 btr_cur_update_in_place_log(
3541 /*========================*/
3542 ulint flags, /*!< in: flags */
3543 const rec_t* rec, /*!< in: record */
3544 dict_index_t* index, /*!< in: index of the record */
3545 const upd_t* update, /*!< in: update vector */
3546 trx_id_t trx_id, /*!< in: transaction id */
3547 roll_ptr_t roll_ptr, /*!< in: roll ptr */
3548 mtr_t* mtr) /*!< in: mtr */
3549 {
3550 byte* log_ptr;
3551 const page_t* page = page_align(rec);
3552 ut_ad(flags < 256);
3553 ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
3554
3555 log_ptr = mlog_open_and_write_index(mtr, rec, index, page_is_comp(page)
3556 ? MLOG_COMP_REC_UPDATE_IN_PLACE
3557 : MLOG_REC_UPDATE_IN_PLACE,
3558 1 + DATA_ROLL_PTR_LEN + 14 + 2
3559 + MLOG_BUF_MARGIN);
3560
3561 if (!log_ptr) {
3562 /* Logging in mtr is switched off during crash recovery */
3563 return;
3564 }
3565
3566 /* For secondary indexes, we could skip writing the dummy system fields
3567 to the redo log, but then we would have to change the redo log parsing
3568 of MLOG_REC_UPDATE_IN_PLACE/MLOG_COMP_REC_UPDATE_IN_PLACE or add a new
3569 redo log record type. For now, just write the dummy sys fields to the
3570 redo log when updating a secondary index record.
3571 */
3572 mach_write_to_1(log_ptr, flags);
3573 log_ptr++;
3574
3575 if (dict_index_is_clust(index)) {
3576 log_ptr = row_upd_write_sys_vals_to_log(
3577 index, trx_id, roll_ptr, log_ptr, mtr);
3578 } else {
3579 /* Dummy system fields for a secondary index */
3580 /* TRX_ID Position */
3581 log_ptr += mach_write_compressed(log_ptr, 0);
3582 /* ROLL_PTR */
3583 trx_write_roll_ptr(log_ptr, 0);
3584 log_ptr += DATA_ROLL_PTR_LEN;
3585 /* TRX_ID */
3586 log_ptr += mach_u64_write_compressed(log_ptr, 0);
3587 }
3588
3589 mach_write_to_2(log_ptr, page_offset(rec));
3590 log_ptr += 2;
3591
3592 row_upd_index_write_log(update, log_ptr, mtr);
3593 }
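
/* Illustrative layout of the redo record written above (derived from
the writes in this function):

	flags                   1 byte
	sys vals (or dummies)   TRX_ID position (compressed)
	                        + DATA_ROLL_PTR_LEN bytes of roll ptr
	                        + TRX_ID (compressed)
	page offset of rec      2 bytes
	update vector           written by row_upd_index_write_log()

btr_cur_parse_update_in_place() must consume these fields in exactly
this order. */
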
3594 #endif /* UNIV_HOTBACKUP */
3595
3596 /***********************************************************//**
3597 Parses a redo log record of updating a record in-place.
3598 @return end of log record or NULL */
3599 byte*
3600 btr_cur_parse_update_in_place(
3601 /*==========================*/
3602 byte* ptr, /*!< in: buffer */
3603 byte* end_ptr,/*!< in: buffer end */
3604 page_t* page, /*!< in/out: page or NULL */
3605 page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
3606 dict_index_t* index) /*!< in: index corresponding to page */
3607 {
3608 ulint flags;
3609 rec_t* rec;
3610 upd_t* update;
3611 ulint pos;
3612 trx_id_t trx_id;
3613 roll_ptr_t roll_ptr;
3614 ulint rec_offset;
3615 mem_heap_t* heap;
3616 ulint* offsets;
3617
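	/* Note: the layout parsed below mirrors what
	btr_cur_update_in_place_log() writes:
	[flags: 1 byte][system fields: position, roll pointer, trx id]
	[record offset on the page: 2 bytes][update vector].
	Each read is bounds-checked against end_ptr; NULL is returned
	if the log record is truncated. */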
3618 if (end_ptr < ptr + 1) {
3619
3620 return(NULL);
3621 }
3622
3623 flags = mach_read_from_1(ptr);
3624 ptr++;
3625
3626 ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
3627
3628 if (ptr == NULL) {
3629
3630 return(NULL);
3631 }
3632
3633 if (end_ptr < ptr + 2) {
3634
3635 return(NULL);
3636 }
3637
3638 rec_offset = mach_read_from_2(ptr);
3639 ptr += 2;
3640
3641 ut_a(rec_offset <= UNIV_PAGE_SIZE);
3642
3643 heap = mem_heap_create(256);
3644
3645 ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
3646
3647 if (!ptr || !page) {
3648
3649 goto func_exit;
3650 }
3651
3652 ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
3653 rec = page + rec_offset;
3654
3655 	/* We do not need to reserve the search latch, as the page is only
3656 	being recovered, and there cannot be a hash index to it. */
3657
3658 offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
3659
3660 if (!(flags & BTR_KEEP_SYS_FLAG)) {
3661 row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
3662 pos, trx_id, roll_ptr);
3663 }
3664
3665 row_upd_rec_in_place(rec, index, offsets, update, page_zip);
3666
3667 func_exit:
3668 mem_heap_free(heap);
3669
3670 return(ptr);
3671 }
3672
3673 #ifndef UNIV_HOTBACKUP
3674 /*************************************************************//**
3675 See if there is enough space in the page modification log to log
3676 an update-in-place.
3677
3678 @retval false if out of space; IBUF_BITMAP_FREE will be reset
3679 outside mtr if the page was recompressed
3680 @retval true if there is enough space
3681
3682 IMPORTANT: The caller will have to update IBUF_BITMAP_FREE if this is
3683 a secondary index leaf page. This has to be done either within the
3684 same mini-transaction, or by invoking ibuf_reset_free_bits() before
3685 mtr_commit(mtr). */
3686 bool
3687 btr_cur_update_alloc_zip_func(
3688 /*==========================*/
3689 page_zip_des_t* page_zip,/*!< in/out: compressed page */
3690 page_cur_t* cursor, /*!< in/out: B-tree page cursor */
3691 dict_index_t* index, /*!< in: the index corresponding to cursor */
3692 #ifdef UNIV_DEBUG
3693 ulint* offsets,/*!< in/out: offsets of the cursor record */
3694 #endif /* UNIV_DEBUG */
3695 ulint length, /*!< in: size needed */
3696 bool create, /*!< in: true=delete-and-insert,
3697 false=update-in-place */
3698 mtr_t* mtr) /*!< in/out: mini-transaction */
3699 {
3700 const page_t* page = page_cur_get_page(cursor);
3701
3702 ut_ad(page_zip == page_cur_get_page_zip(cursor));
3703 ut_ad(page_zip);
3704 ut_ad(!dict_index_is_ibuf(index));
3705 ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
3706
3707 if (page_zip_available(page_zip, dict_index_is_clust(index),
3708 length, create)) {
3709 return(true);
3710 }
3711
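	/* Not enough space in the modification log: try to reorganize
	the page (recompressing it from scratch) and check again, unless
	reorganization clearly cannot help. */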
3712 if (!page_zip->m_nonempty && !page_has_garbage(page)) {
3713 /* The page has been freshly compressed, so
3714 reorganizing it will not help. */
3715 return(false);
3716 }
3717
3718 if (create && page_is_leaf(page)
3719 && (length + page_get_data_size(page)
3720 >= dict_index_zip_pad_optimal_page_size(index))) {
3721 return(false);
3722 }
3723
3724 if (!btr_page_reorganize(cursor, index, mtr)) {
3725 goto out_of_space;
3726 }
3727
3728 rec_offs_make_valid(page_cur_get_rec(cursor), index, offsets);
3729
3730 /* After recompressing a page, we must make sure that the free
3731 bits in the insert buffer bitmap will not exceed the free
3732 space on the page. Because this function will not attempt
3733 recompression unless page_zip_available() fails above, it is
3734 safe to reset the free bits if page_zip_available() fails
3735 again, below. The free bits can safely be reset in a separate
3736 mini-transaction. If page_zip_available() succeeds below, we
3737 can be sure that the btr_page_reorganize() above did not reduce
3738 the free space available on the page. */
3739
3740 if (page_zip_available(page_zip, dict_index_is_clust(index),
3741 length, create)) {
3742 return(true);
3743 }
3744
3745 out_of_space:
3746 ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets));
3747
3748 /* Out of space: reset the free bits. */
3749 if (!dict_index_is_clust(index)
3750 && !dict_table_is_temporary(index->table)
3751 && page_is_leaf(page)) {
3752 ibuf_reset_free_bits(page_cur_get_block(cursor));
3753 }
3754
3755 return(false);
3756 }
3757
3758 /*************************************************************//**
3759 Updates a record when the update causes no size changes in its fields.
3760 We assume here that the ordering fields of the record do not change.
3761 @return locking or undo log related error code, or
3762 @retval DB_SUCCESS on success
3763 @retval DB_ZIP_OVERFLOW if there is not enough space left
3764 on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
3765 dberr_t
3766 btr_cur_update_in_place(
3767 /*====================*/
3768 ulint flags, /*!< in: undo logging and locking flags */
3769 btr_cur_t* cursor, /*!< in: cursor on the record to update;
3770 cursor stays valid and positioned on the
3771 same record */
3772 ulint* offsets,/*!< in/out: offsets on cursor->page_cur.rec */
3773 const upd_t* update, /*!< in: update vector */
3774 ulint cmpl_info,/*!< in: compiler info on secondary index
3775 updates */
3776 que_thr_t* thr, /*!< in: query thread */
3777 trx_id_t trx_id, /*!< in: transaction id */
3778 mtr_t* mtr) /*!< in/out: mini-transaction; if this
3779 is a secondary index, the caller must
3780 mtr_commit(mtr) before latching any
3781 further pages */
3782 {
3783 dict_index_t* index;
3784 buf_block_t* block;
3785 page_zip_des_t* page_zip;
3786 dberr_t err;
3787 rec_t* rec;
3788 roll_ptr_t roll_ptr = 0;
3789 ulint was_delete_marked;
3790 ibool is_hashed;
3791
3792 rec = btr_cur_get_rec(cursor);
3793 index = cursor->index;
3794 ut_ad(rec_offs_validate(rec, index, offsets));
3795 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
3796 ut_ad(trx_id > 0
3797 || (flags & BTR_KEEP_SYS_FLAG)
3798 || dict_table_is_intrinsic(index->table));
3799 /* The insert buffer tree should never be updated in place. */
3800 ut_ad(!dict_index_is_ibuf(index));
3801 ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
3802 || dict_index_is_clust(index));
3803 ut_ad(thr_get_trx(thr)->id == trx_id
3804 || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
3805 == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
3806 | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
3807 ut_ad(fil_page_index_page_check(btr_cur_get_page(cursor)));
3808 ut_ad(btr_page_get_index_id(btr_cur_get_page(cursor)) == index->id);
3809
3810 DBUG_PRINT("ib_cur", ("update-in-place %s (" IB_ID_FMT
3811 ") by " TRX_ID_FMT ": %s",
3812 index->name(), index->id, trx_id,
3813 rec_printer(rec, offsets).str().c_str()));
3814
3815 block = btr_cur_get_block(cursor);
3816 page_zip = buf_block_get_page_zip(block);
3817
3818 /* Check that enough space is available on the compressed page. */
3819 if (page_zip) {
3820 if (!btr_cur_update_alloc_zip(
3821 page_zip, btr_cur_get_page_cur(cursor),
3822 index, offsets, rec_offs_size(offsets),
3823 false, mtr)) {
3824 return(DB_ZIP_OVERFLOW);
3825 }
3826
3827 rec = btr_cur_get_rec(cursor);
3828 }
3829
3830 /* Do lock checking and undo logging */
3831 err = btr_cur_upd_lock_and_undo(flags, cursor, offsets,
3832 update, cmpl_info,
3833 thr, mtr, &roll_ptr);
3834 if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
3835 /* We may need to update the IBUF_BITMAP_FREE
3836 bits after a reorganize that was done in
3837 btr_cur_update_alloc_zip(). */
3838 goto func_exit;
3839 }
3840
3841 if (!(flags & BTR_KEEP_SYS_FLAG)
3842 && !dict_table_is_intrinsic(index->table)) {
3843 row_upd_rec_sys_fields(rec, NULL, index, offsets,
3844 thr_get_trx(thr), roll_ptr);
3845 }
3846
3847 was_delete_marked = rec_get_deleted_flag(
3848 rec, page_is_comp(buf_block_get_frame(block)));
3849
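	/* block->index is set when the adaptive hash index references
	this block. In that case the code below drops the hash pointer to
	this record when the hashed fields may change, and holds the
	search latch in exclusive mode around the in-place update,
	presumably so that concurrent hash searches cannot observe a
	half-written record. */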
3850 is_hashed = (block->index != NULL);
3851
3852 if (is_hashed) {
3853 		/* TODO: can we skip this if none of the first
3854 		index->search_info->curr_n_fields fields
3855 		are being updated? */
3856
3857 		/* The function row_upd_changes_ord_field_binary() works only
3858 		if the update vector was built for a clustered index; we must
3859 		NOT call it if the index is secondary */
3860
3861 if (!dict_index_is_clust(index)
3862 || row_upd_changes_ord_field_binary(index, update, thr,
3863 NULL, NULL)) {
3864
3865 /* Remove possible hash index pointer to this record */
3866 btr_search_update_hash_on_delete(cursor);
3867 }
3868
3869 rw_lock_x_lock(btr_get_search_latch(index));
3870 }
3871
3872 assert_block_ahi_valid(block);
3873 row_upd_rec_in_place(rec, index, offsets, update, page_zip);
3874
3875 if (is_hashed) {
3876 rw_lock_x_unlock(btr_get_search_latch(index));
3877 }
3878
3879 btr_cur_update_in_place_log(flags, rec, index, update,
3880 trx_id, roll_ptr, mtr);
3881
3882 if (was_delete_marked
3883 && !rec_get_deleted_flag(
3884 rec, page_is_comp(buf_block_get_frame(block)))) {
3885 /* The new updated record owns its possible externally
3886 stored fields */
3887
3888 btr_cur_unmark_extern_fields(page_zip,
3889 rec, index, offsets, mtr);
3890 }
3891
3892 ut_ad(err == DB_SUCCESS);
3893
3894 func_exit:
3895 if (page_zip
3896 && !(flags & BTR_KEEP_IBUF_BITMAP)
3897 && !dict_index_is_clust(index)
3898 && !dict_table_is_temporary(index->table)
3899 && page_is_leaf(buf_block_get_frame(block))) {
3900 /* Update the free bits in the insert buffer. */
3901 ibuf_update_free_bits_zip(block, mtr);
3902 }
3903
3904 return(err);
3905 }
3906
3907 /*************************************************************//**
3908 Tries to update a record on a page in an index tree. It is assumed that mtr
3909 holds an x-latch on the page. The operation does not succeed if there is too
3910 little space on the page or if the update would result in too empty a page,
3911 so that tree compression is recommended. We assume here that the ordering
3912 fields of the record do not change.
3913 @return error code, including
3914 @retval DB_SUCCESS on success
3915 @retval DB_OVERFLOW if the updated record does not fit
3916 @retval DB_UNDERFLOW if the page would become too empty
3917 @retval DB_ZIP_OVERFLOW if there is not enough space left
3918 on the compressed page (IBUF_BITMAP_FREE was reset outside mtr) */
3919 dberr_t
3920 btr_cur_optimistic_update(
3921 /*======================*/
3922 ulint flags, /*!< in: undo logging and locking flags */
3923 btr_cur_t* cursor, /*!< in: cursor on the record to update;
3924 cursor stays valid and positioned on the
3925 same record */
3926 ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
3927 mem_heap_t** heap, /*!< in/out: pointer to NULL or memory heap */
3928 const upd_t* update, /*!< in: update vector; this must also
3929 contain trx id and roll ptr fields */
3930 ulint cmpl_info,/*!< in: compiler info on secondary index
3931 updates */
3932 que_thr_t* thr, /*!< in: query thread */
3933 trx_id_t trx_id, /*!< in: transaction id */
3934 mtr_t* mtr) /*!< in/out: mini-transaction; if this
3935 is a secondary index, the caller must
3936 mtr_commit(mtr) before latching any
3937 further pages */
3938 {
3939 dict_index_t* index;
3940 page_cur_t* page_cursor;
3941 dberr_t err;
3942 buf_block_t* block;
3943 page_t* page;
3944 page_zip_des_t* page_zip;
3945 rec_t* rec;
3946 ulint max_size;
3947 ulint new_rec_size;
3948 ulint old_rec_size;
3949 ulint max_ins_size = 0;
3950 dtuple_t* new_entry;
3951 roll_ptr_t roll_ptr;
3952 ulint i;
3953 ulint n_ext;
3954
3955 block = btr_cur_get_block(cursor);
3956 page = buf_block_get_frame(block);
3957 rec = btr_cur_get_rec(cursor);
3958 index = cursor->index;
3959 ut_ad(trx_id > 0
3960 || (flags & BTR_KEEP_SYS_FLAG)
3961 || dict_table_is_intrinsic(index->table));
3962 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
3963 ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
3964 /* This is intended only for leaf page updates */
3965 ut_ad(page_is_leaf(page));
3966 /* The insert buffer tree should never be updated in place. */
3967 ut_ad(!dict_index_is_ibuf(index));
3968 ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
3969 || dict_index_is_clust(index));
3970 ut_ad(thr_get_trx(thr)->id == trx_id
3971 || (flags & ~(BTR_KEEP_POS_FLAG | BTR_KEEP_IBUF_BITMAP))
3972 == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
3973 | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
3974 ut_ad(fil_page_index_page_check(page));
3975 ut_ad(btr_page_get_index_id(page) == index->id);
3976
3977 *offsets = rec_get_offsets(rec, index, *offsets,
3978 ULINT_UNDEFINED, heap);
3979 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
3980 ut_a(!rec_offs_any_null_extern(rec, *offsets)
3981 || trx_is_recv(thr_get_trx(thr)));
3982 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
3983
3984 if (!row_upd_changes_field_size_or_external(index, *offsets, update)) {
3985
3986 /* The simplest and the most common case: the update does not
3987 change the size of any field and none of the updated fields is
3988 externally stored in rec or update, and there is enough space
3989 on the compressed page to log the update. */
3990
3991 return(btr_cur_update_in_place(
3992 flags, cursor, *offsets, update,
3993 cmpl_info, thr, trx_id, mtr));
3994 }
3995
3996 if (rec_offs_any_extern(*offsets)) {
3997 any_extern:
3998 /* Externally stored fields are treated in pessimistic
3999 update */
4000
4001 /* prefetch siblings of the leaf for the pessimistic
4002 operation. */
4003 btr_cur_prefetch_siblings(block);
4004
4005 return(DB_OVERFLOW);
4006 }
4007
4008 for (i = 0; i < upd_get_n_fields(update); i++) {
4009 if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
4010
4011 goto any_extern;
4012 }
4013 }
4014
4015 DBUG_PRINT("ib_cur", ("update %s (" IB_ID_FMT ") by " TRX_ID_FMT
4016 ": %s",
4017 index->name(), index->id, trx_id,
4018 rec_printer(rec, *offsets).str().c_str()));
4019
4020 page_cursor = btr_cur_get_page_cur(cursor);
4021
4022 if (!*heap) {
4023 *heap = mem_heap_create(
4024 rec_offs_size(*offsets)
4025 + DTUPLE_EST_ALLOC(rec_offs_n_fields(*offsets)));
4026 }
4027
4028 new_entry = row_rec_to_index_entry(rec, index, *offsets,
4029 &n_ext, *heap);
4030 /* We checked above that there are no externally stored fields. */
4031 ut_a(!n_ext);
4032
4033 /* The page containing the clustered index record
4034 corresponding to new_entry is latched in mtr.
4035 Thus the following call is safe. */
4036 row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4037 FALSE, *heap);
4038 old_rec_size = rec_offs_size(*offsets);
4039 new_rec_size = rec_get_converted_size(index, new_entry, 0);
4040
4041 page_zip = buf_block_get_page_zip(block);
4042 #ifdef UNIV_ZIP_DEBUG
4043 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4044 #endif /* UNIV_ZIP_DEBUG */
4045
4046 if (page_zip) {
4047 if (!btr_cur_update_alloc_zip(
4048 page_zip, page_cursor, index, *offsets,
4049 new_rec_size, true, mtr)) {
4050 return(DB_ZIP_OVERFLOW);
4051 }
4052
4053 rec = page_cur_get_rec(page_cursor);
4054 }
4055
4056 /* We limit max record size to 16k even for 64k page size. */
4057 if (new_rec_size >= REC_MAX_DATA_SIZE) {
4058 err = DB_OVERFLOW;
4059
4060 goto func_exit;
4061 }
4062
4063 if (UNIV_UNLIKELY(new_rec_size
4064 >= (page_get_free_space_of_empty(page_is_comp(page))
4065 / 2))) {
4066 /* We may need to update the IBUF_BITMAP_FREE
4067 bits after a reorganize that was done in
4068 btr_cur_update_alloc_zip(). */
4069 err = DB_OVERFLOW;
4070 goto func_exit;
4071 }
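	/* Note: with the default 16KB page size, the free space of an
	empty page is on the order of 16KB, so the check above caps an
	updated record at roughly 8KB; larger results are pushed to the
	pessimistic path, which can move columns to external storage. */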
4072
4073 if (UNIV_UNLIKELY(page_get_data_size(page)
4074 - old_rec_size + new_rec_size
4075 < BTR_CUR_PAGE_COMPRESS_LIMIT(index))) {
4076 /* We may need to update the IBUF_BITMAP_FREE
4077 bits after a reorganize that was done in
4078 btr_cur_update_alloc_zip(). */
4079
4080 /* The page would become too empty */
4081 err = DB_UNDERFLOW;
4082 goto func_exit;
4083 }
4084
4085 /* We do not attempt to reorganize if the page is compressed.
4086 This is because the page may fail to compress after reorganization. */
4087 max_size = page_zip
4088 ? page_get_max_insert_size(page, 1)
4089 : (old_rec_size
4090 + page_get_max_insert_size_after_reorganize(page, 1));
4091
4092 if (!page_zip) {
4093 max_ins_size = page_get_max_insert_size_after_reorganize(
4094 page, 1);
4095 }
4096
4097 if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
4098 && (max_size >= new_rec_size))
4099 || (page_get_n_recs(page) <= 1))) {
4100
4101 /* We may need to update the IBUF_BITMAP_FREE
4102 bits after a reorganize that was done in
4103 btr_cur_update_alloc_zip(). */
4104
4105 /* There was not enough space, or it did not pay to
4106 reorganize: for simplicity, we decide what to do assuming a
4107 reorganization is needed, though it might not be necessary */
4108
4109 err = DB_OVERFLOW;
4110 goto func_exit;
4111 }
4112
4113 /* Do lock checking and undo logging */
4114 err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4115 update, cmpl_info,
4116 thr, mtr, &roll_ptr);
4117 if (err != DB_SUCCESS) {
4118 /* We may need to update the IBUF_BITMAP_FREE
4119 bits after a reorganize that was done in
4120 btr_cur_update_alloc_zip(). */
4121 goto func_exit;
4122 }
4123
4124 /* Ok, we may do the replacement. Store on the page infimum the
4125 explicit locks on rec, before deleting rec (see the comment in
4126 btr_cur_pessimistic_update). */
4127 if (!dict_table_is_locking_disabled(index->table)) {
4128 lock_rec_store_on_page_infimum(block, rec);
4129 }
4130
4131 btr_search_update_hash_on_delete(cursor);
4132
4133 page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4134
4135 page_cur_move_to_prev(page_cursor);
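	/* The page cursor now rests on the predecessor of the deleted
	record, so that btr_cur_insert_if_possible() below re-inserts
	new_entry at the position the old record occupied. */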
4136
4137 if (!(flags & BTR_KEEP_SYS_FLAG)
4138 && !dict_table_is_intrinsic(index->table)) {
4139 row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
4140 roll_ptr);
4141 row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
4142 trx_id);
4143 }
4144
4145 /* There are no externally stored columns in new_entry */
4146 rec = btr_cur_insert_if_possible(
4147 cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
4148 ut_a(rec); /* <- We calculated above the insert would fit */
4149
4150 /* Restore the old explicit lock state on the record */
4151 if (!dict_table_is_locking_disabled(index->table)) {
4152 lock_rec_restore_from_page_infimum(block, rec, block);
4153 }
4154
4155 page_cur_move_to_next(page_cursor);
4156 ut_ad(err == DB_SUCCESS);
4157
4158 func_exit:
4159 if (!(flags & BTR_KEEP_IBUF_BITMAP)
4160 && !dict_index_is_clust(index)
4161 && !dict_table_is_temporary(index->table)) {
4162 /* Update the free bits in the insert buffer. */
4163 if (page_zip) {
4164 ibuf_update_free_bits_zip(block, mtr);
4165 } else {
4166 ibuf_update_free_bits_low(block, max_ins_size, mtr);
4167 }
4168 }
4169
4170 if (err != DB_SUCCESS) {
4171 /* prefetch siblings of the leaf for the pessimistic
4172 operation. */
4173 btr_cur_prefetch_siblings(block);
4174 }
4175
4176 return(err);
4177 }
4178
4179 /*************************************************************//**
4180 If, in a split, a new supremum record was created as the predecessor of the
4181 updated record, the supremum record must inherit exactly the locks on the
4182 updated record. In the split it may have inherited locks from the successor
4183 of the updated record, which is not correct. This function restores the
4184 right locks for the new supremum. */
4185 static
4186 void
4187 btr_cur_pess_upd_restore_supremum(
4188 /*==============================*/
4189 buf_block_t* block, /*!< in: buffer block of rec */
4190 const rec_t* rec, /*!< in: updated record */
4191 mtr_t* mtr) /*!< in: mtr */
4192 {
4193 page_t* page;
4194 buf_block_t* prev_block;
4195
4196 page = buf_block_get_frame(block);
4197
4198 if (page_rec_get_next(page_get_infimum_rec(page)) != rec) {
4199 /* Updated record is not the first user record on its page */
4200
4201 return;
4202 }
4203
4204 const ulint prev_page_no = btr_page_get_prev(page, mtr);
4205
4206 const page_id_t page_id(block->page.id.space(), prev_page_no);
4207
4208 ut_ad(prev_page_no != FIL_NULL);
4209 prev_block = buf_page_get_with_no_latch(page_id, block->page.size, mtr);
4210 #ifdef UNIV_BTR_DEBUG
4211 ut_a(btr_page_get_next(prev_block->frame, mtr)
4212 == page_get_page_no(page));
4213 #endif /* UNIV_BTR_DEBUG */
4214
4215 /* We must already have an x-latch on prev_block! */
4216 ut_ad(mtr_memo_contains(mtr, prev_block, MTR_MEMO_PAGE_X_FIX));
4217
4218 lock_rec_reset_and_inherit_gap_locks(prev_block, block,
4219 PAGE_HEAP_NO_SUPREMUM,
4220 page_rec_get_heap_no(rec));
4221 }
4222
4223 /*************************************************************//**
4224 Performs an update of a record on a page of a tree. It is assumed
4225 that mtr holds an x-latch on the tree and on the cursor page. If the
4226 update is made on the leaf level, to avoid deadlocks, mtr must also
4227 own x-latches to brothers of page, if those brothers exist. We assume
4228 here that the ordering fields of the record do not change.
4229 @return DB_SUCCESS or error code */
4230 dberr_t
4231 btr_cur_pessimistic_update(
4232 /*=======================*/
4233 ulint flags, /*!< in: undo logging, locking, and rollback
4234 flags */
4235 btr_cur_t* cursor, /*!< in/out: cursor on the record to update;
4236 cursor may become invalid if *big_rec == NULL
4237 || !(flags & BTR_KEEP_POS_FLAG) */
4238 ulint** offsets,/*!< out: offsets on cursor->page_cur.rec */
4239 mem_heap_t** offsets_heap,
4240 /*!< in/out: pointer to memory heap
4241 that can be emptied, or NULL */
4242 mem_heap_t* entry_heap,
4243 /*!< in/out: memory heap for allocating
4244 big_rec and the index tuple */
4245 big_rec_t** big_rec,/*!< out: big rec vector whose fields have to
4246 be stored externally by the caller, or NULL */
4247 upd_t* update, /*!< in/out: update vector; this is allowed to
4248 also contain trx id and roll ptr fields.
4249 Non-updated columns that are moved offpage will
4250 be appended to this. */
4251 ulint cmpl_info,/*!< in: compiler info on secondary index
4252 updates */
4253 que_thr_t* thr, /*!< in: query thread */
4254 trx_id_t trx_id, /*!< in: transaction id */
4255 mtr_t* mtr) /*!< in/out: mini-transaction; must be
4256 committed before latching any further pages */
4257 {
4258 big_rec_t* big_rec_vec = NULL;
4259 big_rec_t* dummy_big_rec;
4260 dict_index_t* index;
4261 buf_block_t* block;
4262 page_t* page;
4263 page_zip_des_t* page_zip;
4264 rec_t* rec;
4265 page_cur_t* page_cursor;
4266 dberr_t err;
4267 dberr_t optim_err;
4268 roll_ptr_t roll_ptr;
4269 ibool was_first;
4270 ulint n_reserved = 0;
4271 ulint n_ext;
4272 ulint max_ins_size = 0;
4273
4274 *offsets = NULL;
4275 *big_rec = NULL;
4276
4277 block = btr_cur_get_block(cursor);
4278 page = buf_block_get_frame(block);
4279 page_zip = buf_block_get_page_zip(block);
4280 index = cursor->index;
4281
4282 ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
4283 MTR_MEMO_X_LOCK |
4284 MTR_MEMO_SX_LOCK)
4285 || dict_table_is_intrinsic(index->table));
4286 ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
4287 #ifdef UNIV_ZIP_DEBUG
4288 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4289 #endif /* UNIV_ZIP_DEBUG */
4290 /* The insert buffer tree should never be updated in place. */
4291 ut_ad(!dict_index_is_ibuf(index));
4292 ut_ad(trx_id > 0
4293 || (flags & BTR_KEEP_SYS_FLAG)
4294 || dict_table_is_intrinsic(index->table));
4295 ut_ad(dict_index_is_online_ddl(index) == !!(flags & BTR_CREATE_FLAG)
4296 || dict_index_is_clust(index));
4297 ut_ad(thr_get_trx(thr)->id == trx_id
4298 || (flags & ~BTR_KEEP_POS_FLAG)
4299 == (BTR_NO_UNDO_LOG_FLAG | BTR_NO_LOCKING_FLAG
4300 | BTR_CREATE_FLAG | BTR_KEEP_SYS_FLAG));
4301
4302 err = optim_err = btr_cur_optimistic_update(
4303 flags | BTR_KEEP_IBUF_BITMAP,
4304 cursor, offsets, offsets_heap, update,
4305 cmpl_info, thr, trx_id, mtr);
4306
4307 switch (err) {
4308 case DB_ZIP_OVERFLOW:
4309 case DB_UNDERFLOW:
4310 case DB_OVERFLOW:
4311 break;
4312 default:
4313 err_exit:
4314 /* We suppressed this with BTR_KEEP_IBUF_BITMAP.
4315 For DB_ZIP_OVERFLOW, the IBUF_BITMAP_FREE bits were
4316 already reset by btr_cur_update_alloc_zip() if the
4317 page was recompressed. */
4318 if (page_zip
4319 && optim_err != DB_ZIP_OVERFLOW
4320 && !dict_index_is_clust(index)
4321 && !dict_table_is_temporary(index->table)
4322 && page_is_leaf(page)) {
4323 ibuf_update_free_bits_zip(block, mtr);
4324 }
4325
4326 if (big_rec_vec != NULL) {
4327 dtuple_big_rec_free(big_rec_vec);
4328 }
4329
4330 return(err);
4331 }
4332
4333 rec = btr_cur_get_rec(cursor);
4334
4335 *offsets = rec_get_offsets(
4336 rec, index, *offsets, ULINT_UNDEFINED, offsets_heap);
4337
4338 dtuple_t* new_entry = row_rec_to_index_entry(
4339 rec, index, *offsets, &n_ext, entry_heap);
4340
4341 /* The page containing the clustered index record
4342 corresponding to new_entry is latched in mtr. If the
4343 clustered index record is delete-marked, then its externally
4344 stored fields cannot have been purged yet, because then the
4345 purge would also have removed the clustered index record
4346 itself. Thus the following call is safe. */
4347 row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
4348 FALSE, entry_heap);
4349
4350 /* We have to set appropriate extern storage bits in the new
4351 record to be inserted: we have to remember which fields were such */
4352
4353 ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
4354 ut_ad(rec_offs_validate(rec, index, *offsets));
4355
4356 /* Get number of externally stored columns in updated record */
4357 n_ext = new_entry->get_n_ext();
4358
4359 	/* UNDO logging is also turned off during normal operation on intrinsic
4360 	tables, so the condition needs to ensure that the table is not intrinsic. */
4361 if ((flags & BTR_NO_UNDO_LOG_FLAG)
4362 && rec_offs_any_extern(*offsets)
4363 && !dict_table_is_intrinsic(index->table)) {
4364 /* We are in a transaction rollback undoing a row
4365 update: we must free possible externally stored fields
4366 which got new values in the update, if they are not
4367 inherited values. They can be inherited if we have
4368 updated the primary key to another value, and then
4369 update it back again. */
4370
4371 ut_ad(big_rec_vec == NULL);
4372 ut_ad(dict_index_is_clust(index));
4373 ut_ad(thr_get_trx(thr)->in_rollback);
4374
4375 DBUG_EXECUTE_IF("ib_blob_update_rollback", DBUG_SUICIDE(););
4376 RECOVERY_CRASH(99);
4377
4378 btr_rec_free_updated_extern_fields(
4379 index, rec, page_zip, *offsets, update, true, mtr);
4380 }
4381
4382 if (page_zip_rec_needs_ext(
4383 rec_get_converted_size(index, new_entry, n_ext),
4384 page_is_comp(page),
4385 dict_index_get_n_fields(index),
4386 block->page.size)) {
4387
4388 big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext);
4389 if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
4390
4391 /* We cannot goto return_after_reservations,
4392 because we may need to update the
4393 IBUF_BITMAP_FREE bits, which was suppressed by
4394 BTR_KEEP_IBUF_BITMAP. */
4395 #ifdef UNIV_ZIP_DEBUG
4396 ut_a(!page_zip
4397 || page_zip_validate(page_zip, page, index));
4398 #endif /* UNIV_ZIP_DEBUG */
4399 if (n_reserved > 0) {
4400 fil_space_release_free_extents(
4401 index->space, n_reserved);
4402 }
4403
4404 err = DB_TOO_BIG_RECORD;
4405 goto err_exit;
4406 }
4407
4408 ut_ad(page_is_leaf(page));
4409 ut_ad(dict_index_is_clust(index));
4410 ut_ad(flags & BTR_KEEP_POS_FLAG);
4411 }
4412
4413 /* Do lock checking and undo logging */
4414 err = btr_cur_upd_lock_and_undo(flags, cursor, *offsets,
4415 update, cmpl_info,
4416 thr, mtr, &roll_ptr);
4417 if (err != DB_SUCCESS) {
4418 goto err_exit;
4419 }
4420
4421 if (optim_err == DB_OVERFLOW) {
4422
4423 /* First reserve enough free space for the file segments
4424 of the index tree, so that the update will not fail because
4425 of lack of space */
4426
4427 ulint n_extents = cursor->tree_height / 16 + 3;
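		/* For example, a tree of height 3 reserves
		3 / 16 + 3 = 3 extents. The reservation is generous,
		presumably because page splits can propagate
		node-pointer inserts up the tree. */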
4428
4429 if (!fsp_reserve_free_extents(
4430 &n_reserved, index->space, n_extents,
4431 flags & BTR_NO_UNDO_LOG_FLAG
4432 ? FSP_CLEANING : FSP_NORMAL,
4433 mtr)) {
4434 err = DB_OUT_OF_FILE_SPACE;
4435 goto err_exit;
4436 }
4437 }
4438
4439 if (!(flags & BTR_KEEP_SYS_FLAG)
4440 && !dict_table_is_intrinsic(index->table)) {
4441 row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
4442 roll_ptr);
4443 row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
4444 trx_id);
4445 }
4446
4447 if (!page_zip) {
4448 max_ins_size = page_get_max_insert_size_after_reorganize(
4449 page, 1);
4450 }
4451
4452 /* Store state of explicit locks on rec on the page infimum record,
4453 before deleting rec. The page infimum acts as a dummy carrier of the
4454 locks, taking care also of lock releases, before we can move the locks
4455 back on the actual record. There is a special case: if we are
4456 inserting on the root page and the insert causes a call of
4457 btr_root_raise_and_insert. Therefore we cannot in the lock system
4458 delete the lock structs set on the root page even if the root
4459 page carries just node pointers. */
4460 if (!dict_table_is_locking_disabled(index->table)) {
4461 lock_rec_store_on_page_infimum(block, rec);
4462 }
4463
4464 btr_search_update_hash_on_delete(cursor);
4465
4466 #ifdef UNIV_ZIP_DEBUG
4467 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4468 #endif /* UNIV_ZIP_DEBUG */
4469 page_cursor = btr_cur_get_page_cur(cursor);
4470
4471 page_cur_delete_rec(page_cursor, index, *offsets, mtr);
4472
4473 page_cur_move_to_prev(page_cursor);
4474
4475 rec = btr_cur_insert_if_possible(cursor, new_entry,
4476 offsets, offsets_heap, n_ext, mtr);
4477
4478 if (rec) {
4479 page_cursor->rec = rec;
4480
4481 if (!dict_table_is_locking_disabled(index->table)) {
4482 lock_rec_restore_from_page_infimum(
4483 btr_cur_get_block(cursor), rec, block);
4484 }
4485
4486 if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
4487 /* The new inserted record owns its possible externally
4488 stored fields */
4489 btr_cur_unmark_extern_fields(
4490 page_zip, rec, index, *offsets, mtr);
4491 }
4492
4493 bool adjust = big_rec_vec && (flags & BTR_KEEP_POS_FLAG);
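		/* adjust is set when a big_rec still has to be written:
		the caller must be able to find this record again in
		order to store the external columns, so the cursor
		position has to remain valid across a possible page
		compression. */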
4494
4495 if (btr_cur_compress_if_useful(cursor, adjust, mtr)) {
4496 if (adjust) {
4497 rec_offs_make_valid(
4498 page_cursor->rec, index, *offsets);
4499 }
4500 } else if (!dict_index_is_clust(index)
4501 && !dict_table_is_temporary(index->table)
4502 && page_is_leaf(page)) {
4503 /* Update the free bits in the insert buffer.
4504 This is the same block which was skipped by
4505 BTR_KEEP_IBUF_BITMAP. */
4506 if (page_zip) {
4507 ibuf_update_free_bits_zip(block, mtr);
4508 } else {
4509 ibuf_update_free_bits_low(block, max_ins_size,
4510 mtr);
4511 }
4512 }
4513
4514 if (!srv_read_only_mode
4515 && !big_rec_vec
4516 && page_is_leaf(page)
4517 && !dict_index_is_online_ddl(index)) {
4518
4519 mtr_memo_release(mtr, dict_index_get_lock(index),
4520 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
4521
4522 		/* NOTE: We cannot release the root block latch here, because it
4523 		contains the segment headers and has already been modified in most cases. */
4524 }
4525
4526 err = DB_SUCCESS;
4527 goto return_after_reservations;
4528 } else {
4529 		/* If the page is compressed and it initially
4530 		compresses very well, and there is a subsequent insert
4531 		of a badly-compressing record, it is possible for
4532 		btr_cur_optimistic_update() to return DB_UNDERFLOW and
4533 		btr_cur_insert_if_possible() to return NULL. */
4534 ut_a(page_zip || optim_err != DB_UNDERFLOW);
4535
4536 /* Out of space: reset the free bits.
4537 This is the same block which was skipped by
4538 BTR_KEEP_IBUF_BITMAP. */
4539 if (!dict_index_is_clust(index)
4540 && !dict_table_is_temporary(index->table)
4541 && page_is_leaf(page)) {
4542 ibuf_reset_free_bits(block);
4543 }
4544 }
4545
4546 if (big_rec_vec != NULL && !dict_table_is_intrinsic(index->table)) {
4547 ut_ad(page_is_leaf(page));
4548 ut_ad(dict_index_is_clust(index));
4549 ut_ad(flags & BTR_KEEP_POS_FLAG);
4550
4551 /* btr_page_split_and_insert() in
4552 btr_cur_pessimistic_insert() invokes
4553 mtr_memo_release(mtr, index->lock, MTR_MEMO_SX_LOCK).
4554 We must keep the index->lock when we created a
4555 big_rec, so that row_upd_clust_rec() can store the
4556 big_rec in the same mini-transaction. */
4557
4558 ut_ad(mtr_memo_contains_flagged(mtr,
4559 dict_index_get_lock(index),
4560 MTR_MEMO_X_LOCK |
4561 MTR_MEMO_SX_LOCK));
4562
4563 mtr_sx_lock(dict_index_get_lock(index), mtr);
4564 }
4565
4566 /* Was the record to be updated positioned as the first user
4567 record on its page? */
4568 was_first = page_cur_is_before_first(page_cursor);
4569
4570 /* Lock checks and undo logging were already performed by
4571 btr_cur_upd_lock_and_undo(). We do not try
4572 btr_cur_optimistic_insert() because
4573 btr_cur_insert_if_possible() already failed above. */
4574
4575 err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
4576 | BTR_NO_LOCKING_FLAG
4577 | BTR_KEEP_SYS_FLAG,
4578 cursor, offsets, offsets_heap,
4579 new_entry, &rec,
4580 &dummy_big_rec, n_ext, NULL, mtr);
4581 ut_a(rec);
4582 ut_a(err == DB_SUCCESS);
4583 ut_a(dummy_big_rec == NULL);
4584 ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
4585 page_cursor->rec = rec;
4586
4587 	/* Multiple transactions cannot operate on the same temp-table
4588 	in parallel.
4589 	max_trx_id is ignored for temp tables because it is not required
4590 	for MVCC. */
4591 if (dict_index_is_sec_or_ibuf(index)
4592 && !dict_table_is_temporary(index->table)) {
4593 /* Update PAGE_MAX_TRX_ID in the index page header.
4594 It was not updated by btr_cur_pessimistic_insert()
4595 because of BTR_NO_LOCKING_FLAG. */
4596 buf_block_t* rec_block;
4597
4598 rec_block = btr_cur_get_block(cursor);
4599
4600 page_update_max_trx_id(rec_block,
4601 buf_block_get_page_zip(rec_block),
4602 trx_id, mtr);
4603 }
4604
4605 if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
4606 /* The new inserted record owns its possible externally
4607 stored fields */
4608 buf_block_t* rec_block = btr_cur_get_block(cursor);
4609
4610 #ifdef UNIV_ZIP_DEBUG
4611 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4612 page = buf_block_get_frame(rec_block);
4613 #endif /* UNIV_ZIP_DEBUG */
4614 page_zip = buf_block_get_page_zip(rec_block);
4615
4616 btr_cur_unmark_extern_fields(page_zip,
4617 rec, index, *offsets, mtr);
4618 }
4619
4620 if (!dict_table_is_locking_disabled(index->table)) {
4621 lock_rec_restore_from_page_infimum(
4622 btr_cur_get_block(cursor), rec, block);
4623 }
4624
4625 /* If necessary, restore also the correct lock state for a new,
4626 preceding supremum record created in a page split. While the old
4627 record was nonexistent, the supremum might have inherited its locks
4628 from a wrong record. */
4629
4630 if (!was_first && !dict_table_is_locking_disabled(index->table)) {
4631 btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
4632 rec, mtr);
4633 }
4634
4635 return_after_reservations:
4636 #ifdef UNIV_ZIP_DEBUG
4637 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4638 #endif /* UNIV_ZIP_DEBUG */
4639
4640 if (n_reserved > 0) {
4641 fil_space_release_free_extents(index->space, n_reserved);
4642 }
4643
4644 *big_rec = big_rec_vec;
4645
4646 return(err);
4647 }
4648
4649 /*==================== B-TREE DELETE MARK AND UNMARK ===============*/
4650
4651 /****************************************************************//**
4652 Writes the redo log record for delete marking or unmarking of an index
4653 record. */
4654 UNIV_INLINE
4655 void
4656 btr_cur_del_mark_set_clust_rec_log(
4657 /*===============================*/
4658 rec_t* rec, /*!< in: record */
4659 dict_index_t* index, /*!< in: index of the record */
4660 trx_id_t trx_id, /*!< in: transaction id */
4661 roll_ptr_t roll_ptr,/*!< in: roll ptr to the undo log record */
4662 mtr_t* mtr) /*!< in: mtr */
4663 {
4664 byte* log_ptr;
4665
4666 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4667 ut_ad(mtr->is_named_space(index->space));
4668
4669 log_ptr = mlog_open_and_write_index(mtr, rec, index,
4670 page_rec_is_comp(rec)
4671 ? MLOG_COMP_REC_CLUST_DELETE_MARK
4672 : MLOG_REC_CLUST_DELETE_MARK,
4673 1 + 1 + DATA_ROLL_PTR_LEN
4674 + 14 + 2);
4675
4676 if (!log_ptr) {
4677 /* Logging in mtr is switched off during crash recovery */
4678 return;
4679 }
4680
4681 *log_ptr++ = 0;
4682 *log_ptr++ = 1;
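	/* The two bytes written above are the flags (always 0 here) and
	the delete-mark value (always 1, i.e. the mark is being set);
	they are read back in the same order by
	btr_cur_parse_del_mark_set_clust_rec(). */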
4683
4684 log_ptr = row_upd_write_sys_vals_to_log(
4685 index, trx_id, roll_ptr, log_ptr, mtr);
4686 mach_write_to_2(log_ptr, page_offset(rec));
4687 log_ptr += 2;
4688
4689 mlog_close(mtr, log_ptr);
4690 }
4691 #endif /* !UNIV_HOTBACKUP */
4692
4693 /****************************************************************//**
4694 Parses the redo log record for delete marking or unmarking of a clustered
4695 index record.
4696 @return end of log record or NULL */
4697 byte*
4698 btr_cur_parse_del_mark_set_clust_rec(
4699 /*=================================*/
4700 byte* ptr, /*!< in: buffer */
4701 byte* end_ptr,/*!< in: buffer end */
4702 page_t* page, /*!< in/out: page or NULL */
4703 page_zip_des_t* page_zip,/*!< in/out: compressed page, or NULL */
4704 dict_index_t* index) /*!< in: index corresponding to page */
4705 {
4706 ulint flags;
4707 ulint val;
4708 ulint pos;
4709 trx_id_t trx_id;
4710 roll_ptr_t roll_ptr;
4711 ulint offset;
4712 rec_t* rec;
4713
4714 ut_ad(!page
4715 || !!page_is_comp(page) == dict_table_is_comp(index->table));
4716
4717 if (end_ptr < ptr + 2) {
4718
4719 return(NULL);
4720 }
4721
4722 flags = mach_read_from_1(ptr);
4723 ptr++;
4724 val = mach_read_from_1(ptr);
4725 ptr++;
4726
4727 ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
4728
4729 if (ptr == NULL) {
4730
4731 return(NULL);
4732 }
4733
4734 if (end_ptr < ptr + 2) {
4735
4736 return(NULL);
4737 }
4738
4739 offset = mach_read_from_2(ptr);
4740 ptr += 2;
4741
4742 ut_a(offset <= UNIV_PAGE_SIZE);
4743
4744 if (page) {
4745 rec = page + offset;
4746
4747 		/* We do not need to reserve the search latch, as the page
4748 is only being recovered, and there cannot be a hash index to
4749 it. Besides, these fields are being updated in place
4750 and the adaptive hash index does not depend on them. */
4751
4752 btr_rec_set_deleted_flag(rec, page_zip, val);
4753
4754 if (!(flags & BTR_KEEP_SYS_FLAG)) {
4755 mem_heap_t* heap = NULL;
4756 ulint offsets_[REC_OFFS_NORMAL_SIZE];
4757 rec_offs_init(offsets_);
4758
4759 row_upd_rec_sys_fields_in_recovery(
4760 rec, page_zip,
4761 rec_get_offsets(rec, index, offsets_,
4762 ULINT_UNDEFINED, &heap),
4763 pos, trx_id, roll_ptr);
4764 if (UNIV_LIKELY_NULL(heap)) {
4765 mem_heap_free(heap);
4766 }
4767 }
4768 }
4769
4770 return(ptr);
4771 }
4772
4773 #ifndef UNIV_HOTBACKUP
4774 /***********************************************************//**
4775 Marks a clustered index record deleted. Writes an undo log record of
4776 this delete marking. Writes in the trx id field the id
4777 of the deleting transaction, and in the roll ptr field a pointer to the
4778 undo log record created.
4779 @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
4780 dberr_t
4781 btr_cur_del_mark_set_clust_rec(
4782 /*===========================*/
4783 ulint flags, /*!< in: undo logging and locking flags */
4784 buf_block_t* block, /*!< in/out: buffer block of the record */
4785 rec_t* rec, /*!< in/out: record */
4786 dict_index_t* index, /*!< in: clustered index of the record */
4787 const ulint* offsets,/*!< in: rec_get_offsets(rec) */
4788 que_thr_t* thr, /*!< in: query thread */
4789 const dtuple_t* entry, /*!< in: dtuple for the deleting record, also
4790 contains the virtual cols if there are any */
4791 mtr_t* mtr) /*!< in/out: mini-transaction */
4792 {
4793 roll_ptr_t roll_ptr;
4794 dberr_t err;
4795 page_zip_des_t* page_zip;
4796 trx_t* trx;
4797
4798 ut_ad(dict_index_is_clust(index));
4799 ut_ad(rec_offs_validate(rec, index, offsets));
4800 ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
4801 ut_ad(buf_block_get_frame(block) == page_align(rec));
4802 ut_ad(page_is_leaf(page_align(rec)));
4803 ut_ad(mtr->is_named_space(index->space));
4804
4805 if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
4806 /* While cascading delete operations, this becomes possible. */
4807 ut_ad(rec_get_trx_id(rec, index) == thr_get_trx(thr)->id);
4808 return(DB_SUCCESS);
4809 }
4810
4811 err = lock_clust_rec_modify_check_and_lock(BTR_NO_LOCKING_FLAG, block,
4812 rec, index, offsets, thr);
4813
4814 if (err != DB_SUCCESS) {
4815
4816 return(err);
4817 }
4818
4819 err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
4820 index, entry, NULL, 0, rec, offsets,
4821 &roll_ptr);
4822 if (err != DB_SUCCESS) {
4823
4824 return(err);
4825 }
4826
4827 /* The search latch is not needed here, because
4828 the adaptive hash index does not depend on the delete-mark
4829 and the delete-mark is being updated in place. */
4830
4831 page_zip = buf_block_get_page_zip(block);
4832
4833 btr_rec_set_deleted_flag(rec, page_zip, TRUE);
4834
4835 /* For intrinsic table, roll-ptr is not maintained as there is no UNDO
4836 logging. Skip updating it. */
4837 if (dict_table_is_intrinsic(index->table)) {
4838 return(err);
4839 }
4840
4841 trx = thr_get_trx(thr);
4842 	/* This function must not be invoked during rollback
4843 	(of a TRX_STATE_PREPARED transaction or otherwise). */
4844 ut_ad(trx_state_eq(trx, TRX_STATE_ACTIVE));
4845 ut_ad(!trx->in_rollback);
4846
4847 DBUG_PRINT("ib_cur", ("delete-mark clust %s (" IB_ID_FMT
4848 ") by " TRX_ID_FMT ": %s",
4849 index->table_name, index->id,
4850 trx_get_id_for_print(trx),
4851 rec_printer(rec, offsets).str().c_str()));
4852
4853 if (dict_index_is_online_ddl(index)) {
4854 row_log_table_delete(rec, entry, index, offsets, NULL);
4855 }
4856
4857 row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr);
4858
4859 btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id,
4860 roll_ptr, mtr);
4861
4862 return(err);
4863 }
4864
4865 /****************************************************************//**
4866 Writes the redo log record for a delete mark setting of a secondary
4867 index record. */
4868 UNIV_INLINE
4869 void
4870 btr_cur_del_mark_set_sec_rec_log(
4871 /*=============================*/
4872 rec_t* rec, /*!< in: record */
4873 ibool val, /*!< in: value to set */
4874 mtr_t* mtr) /*!< in: mtr */
4875 {
4876 byte* log_ptr;
4877 ut_ad(val <= 1);
4878
4879 log_ptr = mlog_open(mtr, 11 + 1 + 2);
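	/* Note: the 11 bytes cover the initial log record written by
	mlog_write_initial_log_record_fast() (a type byte plus the
	compressed space id and page number), followed by 1 byte for the
	delete-mark value and 2 bytes for the record offset. */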
4880
4881 if (!log_ptr) {
4882 /* Logging in mtr is switched off during crash recovery:
4883 in that case mlog_open returns NULL */
4884 return;
4885 }
4886
4887 log_ptr = mlog_write_initial_log_record_fast(
4888 rec, MLOG_REC_SEC_DELETE_MARK, log_ptr, mtr);
4889 mach_write_to_1(log_ptr, val);
4890 log_ptr++;
4891
4892 mach_write_to_2(log_ptr, page_offset(rec));
4893 log_ptr += 2;
4894
4895 mlog_close(mtr, log_ptr);
4896 }
4897 #endif /* !UNIV_HOTBACKUP */
4898
4899 /****************************************************************//**
4900 Parses the redo log record for delete marking or unmarking of a secondary
4901 index record.
4902 @return end of log record or NULL */
4903 byte*
4904 btr_cur_parse_del_mark_set_sec_rec(
4905 /*===============================*/
4906 byte* ptr, /*!< in: buffer */
4907 byte* end_ptr,/*!< in: buffer end */
4908 page_t* page, /*!< in/out: page or NULL */
4909 page_zip_des_t* page_zip)/*!< in/out: compressed page, or NULL */
4910 {
4911 ulint val;
4912 ulint offset;
4913 rec_t* rec;
4914
4915 if (end_ptr < ptr + 3) {
4916
4917 return(NULL);
4918 }
4919
4920 val = mach_read_from_1(ptr);
4921 ptr++;
4922
4923 offset = mach_read_from_2(ptr);
4924 ptr += 2;
4925
4926 ut_a(offset <= UNIV_PAGE_SIZE);
4927
4928 if (page) {
4929 rec = page + offset;
4930
4931 		/* We do not need to reserve the search latch, as the page
4932 is only being recovered, and there cannot be a hash index to
4933 it. Besides, the delete-mark flag is being updated in place
4934 and the adaptive hash index does not depend on it. */
4935
4936 btr_rec_set_deleted_flag(rec, page_zip, val);
4937 }
4938
4939 return(ptr);
4940 }
4941
4942 #ifndef UNIV_HOTBACKUP
4943 /***********************************************************//**
4944 Sets a secondary index record delete mark to TRUE or FALSE.
4945 @return DB_SUCCESS, DB_LOCK_WAIT, or error number */
4946 dberr_t
4947 btr_cur_del_mark_set_sec_rec(
4948 /*=========================*/
4949 ulint flags, /*!< in: locking flag */
4950 btr_cur_t* cursor, /*!< in: cursor */
4951 ibool val, /*!< in: value to set */
4952 que_thr_t* thr, /*!< in: query thread */
4953 mtr_t* mtr) /*!< in/out: mini-transaction */
4954 {
4955 buf_block_t* block;
4956 rec_t* rec;
4957 dberr_t err;
4958
4959 block = btr_cur_get_block(cursor);
4960 rec = btr_cur_get_rec(cursor);
4961
4962 err = lock_sec_rec_modify_check_and_lock(flags,
4963 btr_cur_get_block(cursor),
4964 rec, cursor->index, thr, mtr);
4965 if (err != DB_SUCCESS) {
4966
4967 return(err);
4968 }
4969
4970 ut_ad(!!page_rec_is_comp(rec)
4971 == dict_table_is_comp(cursor->index->table));
4972
4973 DBUG_PRINT("ib_cur", ("delete-mark=%u sec %u:%u:%u in %s("
4974 IB_ID_FMT ") by " TRX_ID_FMT,
4975 unsigned(val),
4976 block->page.id.space(), block->page.id.page_no(),
4977 unsigned(page_rec_get_heap_no(rec)),
4978 cursor->index->name(), cursor->index->id,
4979 trx_get_id_for_print(thr_get_trx(thr))));
4980
4981 	/* We do not need to reserve the search latch, as the
4982 delete-mark flag is being updated in place and the adaptive
4983 hash index does not depend on it. */
4984 btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block), val);
4985
4986 btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
4987
4988 return(DB_SUCCESS);
4989 }
4990
4991 /***********************************************************//**
4992 Sets a secondary index record's delete mark to the given value. This
4993 function is only used by the insert buffer merge mechanism. */
4994 void
4995 btr_cur_set_deleted_flag_for_ibuf(
4996 /*==============================*/
4997 rec_t* rec, /*!< in/out: record */
4998 page_zip_des_t* page_zip, /*!< in/out: compressed page
4999 corresponding to rec, or NULL
5000 when the tablespace is
5001 uncompressed */
5002 ibool val, /*!< in: value to set */
5003 mtr_t* mtr) /*!< in/out: mini-transaction */
5004 {
5005 	/* We do not need to reserve the search latch, as the page
5006 has just been read to the buffer pool and there cannot be
5007 a hash index to it. Besides, the delete-mark flag is being
5008 updated in place and the adaptive hash index does not depend
5009 on it. */
5010
5011 btr_rec_set_deleted_flag(rec, page_zip, val);
5012
5013 btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
5014 }
5015
5016 /*==================== B-TREE RECORD REMOVE =========================*/
5017
5018 /*************************************************************//**
5019 Tries to compress a page of the tree if it seems useful. It is assumed
5020 that mtr holds an x-latch on the tree and on the cursor page. To avoid
5021 deadlocks, mtr must also own x-latches to brothers of page, if those
5022 brothers exist. NOTE: it is assumed that the caller has reserved enough
5023 free extents so that the compression will always succeed if done!
5024 @return TRUE if compression occurred */
5025 ibool
5026 btr_cur_compress_if_useful(
5027 /*=======================*/
5028 btr_cur_t* cursor, /*!< in/out: cursor on the page to compress;
5029 cursor does not stay valid if !adjust and
5030 compression occurs */
5031 ibool adjust, /*!< in: TRUE if should adjust the
5032 cursor position even if compression occurs */
5033 mtr_t* mtr) /*!< in/out: mini-transaction */
5034 {
5035 	/* Avoid applying compression, as we do not expect much page garbage
5036 	given the workload of intrinsic tables. */
5037 if (dict_table_is_intrinsic(cursor->index->table)) {
5038 return(FALSE);
5039 }
5040
5041 ut_ad(mtr_memo_contains_flagged(
5042 mtr, dict_index_get_lock(btr_cur_get_index(cursor)),
5043 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)
5044 || dict_table_is_intrinsic(cursor->index->table));
5045 ut_ad(mtr_is_block_fix(
5046 mtr, btr_cur_get_block(cursor),
5047 MTR_MEMO_PAGE_X_FIX, cursor->index->table));
5048
5049 if (dict_index_is_spatial(cursor->index)) {
5050 const page_t* page = btr_cur_get_page(cursor);
5051 const trx_t* trx = NULL;
5052
5053 if (cursor->rtr_info->thr != NULL) {
5054 trx = thr_get_trx(cursor->rtr_info->thr);
5055 }
5056
5057 /* Check whether page lock prevents the compression */
5058 if (!lock_test_prdt_page_lock(trx, page_get_space_id(page),
5059 page_get_page_no(page))) {
5060 return(false);
5061 }
5062 }
5063
5064 return(btr_cur_compress_recommendation(cursor, mtr)
5065 && btr_compress(cursor, adjust, mtr));
5066 }
5067
5068 /*******************************************************//**
5069 Removes the record on which the tree cursor is positioned on a leaf page.
5070 It is assumed that the mtr has an x-latch on the page where the cursor is
5071 positioned, but no latch on the whole tree.
5072 @return TRUE if success, i.e., the page did not become too empty */
5073 ibool
5074 btr_cur_optimistic_delete_func(
5075 /*===========================*/
5076 btr_cur_t* cursor, /*!< in: cursor on leaf page, on the record to
5077 delete; cursor stays valid: if deletion
5078 succeeds, on function exit it points to the
5079 successor of the deleted record */
5080 #ifdef UNIV_DEBUG
5081 ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
5082 #endif /* UNIV_DEBUG */
5083 mtr_t* mtr) /*!< in: mtr; if this function returns
5084 TRUE on a leaf page of a secondary
5085 index, the mtr must be committed
5086 before latching any further pages */
5087 {
5088 buf_block_t* block;
5089 rec_t* rec;
5090 mem_heap_t* heap = NULL;
5091 ulint offsets_[REC_OFFS_NORMAL_SIZE];
5092 ulint* offsets = offsets_;
5093 ibool no_compress_needed;
5094 rec_offs_init(offsets_);
5095
5096 ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
5097 ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
5098 MTR_MEMO_PAGE_X_FIX));
5099 ut_ad(mtr_is_block_fix(mtr, btr_cur_get_block(cursor),
5100 MTR_MEMO_PAGE_X_FIX, cursor->index->table));
5101 ut_ad(mtr->is_named_space(cursor->index->space));
5102
5103 /* This is intended only for leaf page deletions */
5104
5105 block = btr_cur_get_block(cursor);
5106
5107 ut_ad(page_is_leaf(buf_block_get_frame(block)));
5108 ut_ad(!dict_index_is_online_ddl(cursor->index)
5109 || dict_index_is_clust(cursor->index)
5110 || (flags & BTR_CREATE_FLAG));
5111
5112 rec = btr_cur_get_rec(cursor);
5113 offsets = rec_get_offsets(rec, cursor->index, offsets,
5114 ULINT_UNDEFINED, &heap);
5115
5116 no_compress_needed = !rec_offs_any_extern(offsets)
5117 && btr_cur_can_delete_without_compress(
5118 cursor, rec_offs_size(offsets), mtr);
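	/* The delete can be done on this page alone only if the record
	has no externally stored columns and removing rec_offs_size()
	bytes still leaves the page full enough that no tree compression
	is recommended; otherwise we only prefetch the siblings, leaving
	the actual removal to a pessimistic delete. */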
5119
5120 if (no_compress_needed) {
5121
5122 page_t* page = buf_block_get_frame(block);
5123 page_zip_des_t* page_zip= buf_block_get_page_zip(block);
5124
5125 lock_update_delete(block, rec);
5126
5127 btr_search_update_hash_on_delete(cursor);
5128
5129 if (page_zip) {
5130 #ifdef UNIV_ZIP_DEBUG
5131 ut_a(page_zip_validate(page_zip, page, cursor->index));
5132 #endif /* UNIV_ZIP_DEBUG */
5133 page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5134 cursor->index, offsets, mtr);
5135 #ifdef UNIV_ZIP_DEBUG
5136 ut_a(page_zip_validate(page_zip, page, cursor->index));
5137 #endif /* UNIV_ZIP_DEBUG */
5138
5139 /* On compressed pages, the IBUF_BITMAP_FREE
5140 space is not affected by deleting (purging)
5141 records, because it is defined as the minimum
5142 of space available *without* reorganize, and
5143 space available in the modification log. */
5144 } else {
5145 const ulint max_ins
5146 = page_get_max_insert_size_after_reorganize(
5147 page, 1);
5148
5149 page_cur_delete_rec(btr_cur_get_page_cur(cursor),
5150 cursor->index, offsets, mtr);
5151
5152 /* The change buffer does not handle inserts
5153 into non-leaf pages, into clustered indexes,
5154 or into the change buffer. */
5155 if (!dict_index_is_clust(cursor->index)
5156 && !dict_table_is_temporary(cursor->index->table)
5157 && !dict_index_is_ibuf(cursor->index)) {
5158 ibuf_update_free_bits_low(block, max_ins, mtr);
5159 }
5160 }
5161 } else {
5162 /* prefetch siblings of the leaf for the pessimistic
5163 operation. */
5164 btr_cur_prefetch_siblings(block);
5165 }
5166
5167 if (UNIV_LIKELY_NULL(heap)) {
5168 mem_heap_free(heap);
5169 }
5170
5171 return(no_compress_needed);
5172 }
5173
5174 /*************************************************************//**
5175 Removes the record on which the tree cursor is positioned. Tries
5176 to compress the page if its fillfactor drops below a threshold
5177 or if it is the only page on the level. It is assumed that mtr holds
5178 an x-latch on the tree and on the cursor page. To avoid deadlocks,
5179 mtr must also own x-latches to brothers of page, if those brothers
5180 exist.
5181 @return TRUE if compression occurred and FALSE if not or something
5182 wrong. */
5183 ibool
5184 btr_cur_pessimistic_delete(
5185 /*=======================*/
5186 dberr_t* err, /*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
5187 the latter may occur because we may have
5188 to update node pointers on upper levels,
5189 and in the case of variable length keys
5190 these may actually grow in size */
5191 ibool has_reserved_extents, /*!< in: TRUE if the
5192 caller has already reserved enough free
5193 				extents, so that the operation is known
5194 				to succeed */
5195 btr_cur_t* cursor, /*!< in: cursor on the record to delete;
5196 if compression does not occur, the cursor
5197 stays valid: it points to successor of
5198 deleted record on function exit */
5199 ulint flags, /*!< in: BTR_CREATE_FLAG or 0 */
5200 bool rollback,/*!< in: performing rollback? */
5201 mtr_t* mtr) /*!< in: mtr */
5202 {
5203 buf_block_t* block;
5204 page_t* page;
5205 page_zip_des_t* page_zip;
5206 dict_index_t* index;
5207 rec_t* rec;
5208 ulint n_reserved = 0;
5209 bool success;
5210 ibool ret = FALSE;
5211 ulint level;
5212 mem_heap_t* heap;
5213 ulint* offsets;
5214 	bool		allow_merge = true; /* if true, implies we have taken the
5215 	page latches needed to merge this page. */
5216 #ifdef UNIV_DEBUG
5217 bool parent_latched = false;
5218 #endif /* UNIV_DEBUG */
5219
5220 block = btr_cur_get_block(cursor);
5221 page = buf_block_get_frame(block);
5222 index = btr_cur_get_index(cursor);
5223
5224 ulint rec_size_est = dict_index_node_ptr_max_size(index);
5225 const page_size_t page_size(dict_table_page_size(index->table));
5226
5227 ut_ad(flags == 0 || flags == BTR_CREATE_FLAG);
5228 ut_ad(!dict_index_is_online_ddl(index)
5229 || dict_index_is_clust(index)
5230 || (flags & BTR_CREATE_FLAG));
5231 ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
5232 MTR_MEMO_X_LOCK
5233 | MTR_MEMO_SX_LOCK)
5234 || dict_table_is_intrinsic(index->table));
5235 ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
5236 ut_ad(mtr->is_named_space(index->space));
5237
5238 if (!has_reserved_extents) {
5239 /* First reserve enough free space for the file segments
5240 of the index tree, so that the node pointer updates will
5241 not fail because of lack of space */
5242
5243 ulint n_extents = cursor->tree_height / 32 + 1;
5244
5245 success = fsp_reserve_free_extents(&n_reserved,
5246 index->space,
5247 n_extents,
5248 FSP_CLEANING, mtr);
5249 if (!success) {
5250 *err = DB_OUT_OF_FILE_SPACE;
5251
5252 return(FALSE);
5253 }
5254 }
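/* Illustrative note (not in the original source): with
cursor->tree_height == 3, the code above reserves
3 / 32 + 1 == 1 extent, i.e. 64 pages on a 16KB-page tablespace,
which covers the "2 x tree height" worst case for node pointer
updates described in the file header comment. */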
5255
5256 heap = mem_heap_create(1024);
5257 rec = btr_cur_get_rec(cursor);
5258 page_zip = buf_block_get_page_zip(block);
5259 #ifdef UNIV_ZIP_DEBUG
5260 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5261 #endif /* UNIV_ZIP_DEBUG */
5262
5263 offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
5264
5265 if (rec_offs_any_extern(offsets)) {
5266 btr_rec_free_externally_stored_fields(index,
5267 rec, offsets, page_zip,
5268 rollback, mtr);
5269 #ifdef UNIV_ZIP_DEBUG
5270 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5271 #endif /* UNIV_ZIP_DEBUG */
5272 }
5273
5274 if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
5275 && UNIV_UNLIKELY(dict_index_get_page(index)
5276 != block->page.id.page_no())) {
5277
5278 /* If there is only one record, drop the whole page in
5279 btr_discard_page, if this is not the root page */
5280
5281 btr_discard_page(cursor, mtr);
5282
5283 ret = TRUE;
5284
5285 goto return_after_reservations;
5286 }
5287
5288 if (flags == 0) {
5289 lock_update_delete(block, rec);
5290 }
5291
5292 level = btr_page_get_level(page, mtr);
5293
5294 if (level > 0
5295 && UNIV_UNLIKELY(rec == page_rec_get_next(
5296 page_get_infimum_rec(page)))) {
5297
5298 rec_t* next_rec = page_rec_get_next(rec);
5299
5300 if (btr_page_get_prev(page, mtr) == FIL_NULL) {
5301
5302 /* If we delete the leftmost node pointer on a
5303 non-leaf level, we must mark the new leftmost node
5304 pointer as the predefined minimum record */
5305
5306 /* This will make page_zip_validate() fail until
5307 page_cur_delete_rec() completes. This is harmless,
5308 because everything will take place within a single
5309 mini-transaction and because writing to the redo log
5310 is an atomic operation (performed by mtr_commit()). */
5311 btr_set_min_rec_mark(next_rec, mtr);
5312 } else if (dict_index_is_spatial(index)) {
5313 /* For an R-tree, if we delete the leftmost node
5314 pointer, we need to update the parent page. */
5315 rtr_mbr_t father_mbr;
5316 rec_t* father_rec;
5317 btr_cur_t father_cursor;
5318 ulint* offsets;
5319 bool upd_ret;
5320 ulint len;
5321
5322 rtr_page_get_father_block(NULL, heap, index,
5323 block, mtr, NULL,
5324 &father_cursor);
5325 offsets = rec_get_offsets(
5326 btr_cur_get_rec(&father_cursor), index,
5327 NULL, ULINT_UNDEFINED, &heap);
5328
5329 father_rec = btr_cur_get_rec(&father_cursor);
5330 rtr_read_mbr(rec_get_nth_field(
5331 father_rec, offsets, 0, &len), &father_mbr);
5332
5333 upd_ret = rtr_update_mbr_field(&father_cursor, offsets,
5334 NULL, page, &father_mbr,
5335 next_rec, mtr);
5336
5337 if (!upd_ret) {
5338 *err = DB_ERROR;
5339
5340 mem_heap_free(heap);
5341 return(FALSE);
5342 }
5343
5344 ut_d(parent_latched = true);
5345 } else {
5346 /* Otherwise, if we delete the leftmost node pointer
5347 on a page, we have to change the parent node pointer
5348 so that it is equal to the new leftmost node pointer
5349 on the page */
5350
5351 btr_node_ptr_delete(index, block, mtr);
5352
5353 dtuple_t* node_ptr = dict_index_build_node_ptr(
5354 index, next_rec, block->page.id.page_no(),
5355 heap, level);
5356
5357 btr_insert_on_non_leaf_level(
5358 flags, index, level + 1, node_ptr, mtr);
5359
5360 ut_d(parent_latched = true);
5361 }
5362 }
5363
5364 btr_search_update_hash_on_delete(cursor);
5365
5366 if (page_is_leaf(page) || dict_index_is_spatial(index)) {
5367 /* Set allow_merge to true for spatial indexes, as the tree is X
5368 locked in case of a delete operation on a spatial index, thus
5369 avoiding any possibility of upward locking. */
5370 allow_merge = true;
5371 } else {
5372 allow_merge = btr_cur_will_modify_tree(index, page, BTR_INTENTION_DELETE,
5373 rec, rec_size_est, page_size, mtr);
5374 }
5375 page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
5376 #ifdef UNIV_ZIP_DEBUG
5377 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5378 #endif /* UNIV_ZIP_DEBUG */
5379
5380 /* btr_check_node_ptr() needs parent block latched */
5381 ut_ad(!parent_latched || btr_check_node_ptr(index, block, mtr));
5382
5383 return_after_reservations:
5384 *err = DB_SUCCESS;
5385
5386 mem_heap_free(heap);
5387
5388 if (!ret) {
5389 bool do_merge = btr_cur_compress_recommendation(cursor, mtr);
5390 /* We are not allowed to merge, because the appropriate
5391 latches were not taken while positioning the cursor. */
5392 if (!allow_merge && do_merge) {
5393 ib::info() << "Ignoring merge recommendation for page "
5394 << page_get_page_no(page)
5395 << " of index " << index->name
5396 << " as we could not predict it early.";
5397 ut_ad(false);
5398 } else if (do_merge) {
5399
5400 ret = btr_cur_compress_if_useful(cursor, FALSE, mtr);
5401 }
5402 }
5403
5404 if (!srv_read_only_mode
5405 && page_is_leaf(page)
5406 && !dict_index_is_online_ddl(index)) {
5407
5408 mtr_memo_release(mtr, dict_index_get_lock(index),
5409 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
5410
5411 /* NOTE: We cannot release the root block latch here, because it
5412 contains the segment header and has already been modified in most cases. */
5413 }
5414
5415 if (n_reserved > 0) {
5416 fil_space_release_free_extents(index->space, n_reserved);
5417 }
5418
5419 return(ret);
5420 }
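/* Usage sketch (illustrative, not part of the original file): callers
such as the purge thread typically try the cheap optimistic path first
and fall back to the pessimistic variant above only when needed. The
signatures follow this file; treat the snippet as a sketch only.

	dberr_t	err;
	if (!btr_cur_optimistic_delete(cursor, 0, mtr)) {
		btr_cur_pessimistic_delete(&err, FALSE, cursor,
					   0, false, mtr);
		ut_a(err == DB_SUCCESS);
	}
*/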
5421
5422 /*******************************************************************//**
5423 Adds path information to the cursor for the current page, for which
5424 the binary search has been performed. */
5425 static
5426 void
5427 btr_cur_add_path_info(
5428 /*==================*/
5429 btr_cur_t* cursor, /*!< in: cursor positioned on a page */
5430 ulint height, /*!< in: height of the page in tree;
5431 0 means leaf node */
5432 ulint root_height) /*!< in: root node height in tree */
5433 {
5434 btr_path_t* slot;
5435 const rec_t* rec;
5436 const page_t* page;
5437
5438 ut_a(cursor->path_arr);
5439
5440 if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
5441 /* Do nothing; return empty path */
5442
5443 slot = cursor->path_arr;
5444 slot->nth_rec = ULINT_UNDEFINED;
5445
5446 return;
5447 }
5448
5449 if (height == 0) {
5450 /* Mark end of slots for path */
5451 slot = cursor->path_arr + root_height + 1;
5452 slot->nth_rec = ULINT_UNDEFINED;
5453 }
5454
5455 rec = btr_cur_get_rec(cursor);
5456
5457 slot = cursor->path_arr + (root_height - height);
5458
5459 page = page_align(rec);
5460
5461 slot->nth_rec = page_rec_get_n_recs_before(rec);
5462 slot->n_recs = page_get_n_recs(page);
5463 slot->page_no = page_get_page_no(page);
5464 slot->page_level = btr_page_get_level_low(page);
5465 }
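/* Illustrative example (not in the original source): for a descent in
a tree whose root_height == 2, the slots are filled as

	path_arr[0]  root page         (root_height - height == 0)
	path_arr[1]  intermediate page
	path_arr[2]  leaf page         (height == 0)
	path_arr[3]  nth_rec == ULINT_UNDEFINED, the end marker
*/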
5466
5467 /*******************************************************************//**
5468 Estimate the number of rows between slot1 and slot2 for any level on a
5469 B-tree. This function starts from slot1->page and reads a few pages to
5470 the right, counting their records. If we reach slot2->page quickly then
5471 we know exactly how many records there are between slot1 and slot2 and
5472 we set is_n_rows_exact to TRUE. If we cannot reach slot2->page quickly
5473 then we calculate the average number of records in the pages scanned
5474 so far and assume that all pages that we did not scan up to slot2->page
5475 contain the same number of records, and multiply that average by
5476 the number of pages between slot1->page and slot2->page (which is
5477 n_rows_on_prev_level). In this case we set is_n_rows_exact to FALSE.
5478 @return number of rows, not including the borders (exact or estimated) */
5479 static
5480 int64_t
5481 btr_estimate_n_rows_in_range_on_level(
5482 /*==================================*/
5483 dict_index_t* index, /*!< in: index */
5484 btr_path_t* slot1, /*!< in: left border */
5485 btr_path_t* slot2, /*!< in: right border */
5486 int64_t n_rows_on_prev_level, /*!< in: number of rows
5487 on the previous level for the
5488 same descend paths; used to
5489 determine the number of pages
5490 on this level */
5491 ibool* is_n_rows_exact) /*!< out: TRUE if the returned
5492 value is exact i.e. not an
5493 estimation */
5494 {
5495 int64_t n_rows;
5496 ulint n_pages_read;
5497 ulint level;
5498
5499 n_rows = 0;
5500 n_pages_read = 0;
5501
5502 /* Assume by default that we will scan all pages between
5503 slot1->page_no and slot2->page_no. */
5504 *is_n_rows_exact = TRUE;
5505
5506 /* Add records from slot1->page_no which are to the right of
5507 the record which serves as a left border of the range, if any
5508 (we don't include the record itself in this count). */
5509 if (slot1->nth_rec <= slot1->n_recs) {
5510 n_rows += slot1->n_recs - slot1->nth_rec;
5511 }
5512
5513 /* Add records from slot2->page_no which are to the left of
5514 the record which serves as a right border of the range, if any
5515 (we don't include the record itself in this count). */
5516 if (slot2->nth_rec > 1) {
5517 n_rows += slot2->nth_rec - 1;
5518 }
5519
5520 /* Count the records in the pages between slot1->page_no and
5521 slot2->page_no (non inclusive), if any. */
5522
5523 /* Do not read more than this number of pages in order not to hurt
5524 performance with this code which is just an estimation. If we read
5525 this many pages before reaching slot2->page_no then we estimate the
5526 average from the pages scanned so far. */
5527 # define N_PAGES_READ_LIMIT 10
5528
5529 page_id_t page_id(
5530 dict_index_get_space(index), slot1->page_no);
5531 const fil_space_t* space = fil_space_get(index->space);
5532 ut_ad(space);
5533 const page_size_t page_size(space->flags);
5534
5535 level = slot1->page_level;
5536
5537 do {
5538 mtr_t mtr;
5539 page_t* page;
5540 buf_block_t* block;
5541
5542 mtr_start(&mtr);
5543
5544 /* Fetch the page. Because we are not holding the
5545 index->lock, the tree may have changed and we may be
5546 attempting to read a page that is no longer part of
5547 the B-tree. We pass BUF_GET_POSSIBLY_FREED in order to
5548 silence a debug assertion about this. */
5549 block = buf_page_get_gen(page_id, page_size, RW_S_LATCH,
5550 NULL, BUF_GET_POSSIBLY_FREED,
5551 __FILE__, __LINE__, &mtr);
5552
5553 page = buf_block_get_frame(block);
5554
5555 /* It is possible that the tree has been reorganized in the
5556 meantime and this is a different page. If this happens the
5557 calculated estimate will be bogus, which is not fatal as
5558 this is only an estimate. We are sure that a page with
5559 page_no exists because InnoDB never frees pages, only
5560 reuses them. */
5561 if (!fil_page_index_page_check(page)
5562 || btr_page_get_index_id(page) != index->id
5563 || btr_page_get_level_low(page) != level) {
5564
5565 /* The page got reused for something else */
5566 mtr_commit(&mtr);
5567 goto inexact;
5568 }
5569
5570 /* It is possible but highly unlikely that the page was
5571 originally written by an old version of InnoDB that did
5572 not initialize FIL_PAGE_TYPE on other than B-tree pages.
5573 For example, this could be an almost-empty BLOB page
5574 that happens to contain the magic values in the fields
5575 that we checked above. */
5576
5577 n_pages_read++;
5578
5579 if (page_id.page_no() != slot1->page_no) {
5580 /* Do not count the records on slot1->page_no,
5581 we already counted them before this loop. */
5582 n_rows += page_get_n_recs(page);
5583 }
5584
5585 page_id.set_page_no(btr_page_get_next(page, &mtr));
5586
5587 mtr_commit(&mtr);
5588
5589 if (n_pages_read == N_PAGES_READ_LIMIT
5590 || page_id.page_no() == FIL_NULL) {
5591 /* Either we read too many pages or
5592 we reached the end of the level without passing
5593 through slot2->page_no, the tree must have changed
5594 in the meantime */
5595 goto inexact;
5596 }
5597
5598 } while (page_id.page_no() != slot2->page_no);
5599
5600 return(n_rows);
5601
5602 inexact:
5603
5604 *is_n_rows_exact = FALSE;
5605
5606 /* We interrupted the scan before reaching slot2->page */
5607
5608 if (n_pages_read > 0) {
5609 /* The number of pages on this level is
5610 n_rows_on_prev_level, multiply it by the
5611 average number of recs per page so far */
5612 n_rows = n_rows_on_prev_level
5613 * n_rows / n_pages_read;
5614 } else {
5615 /* The tree changed before we could even
5616 start with slot1->page_no */
5617 n_rows = 10;
5618 }
5619
5620 return(n_rows);
5621 }
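/* Worked example of the inexact branch above (illustrative numbers):
if we counted n_rows == 1200 records over n_pages_read == 10 pages
before hitting N_PAGES_READ_LIMIT, and the previous level indicated
n_rows_on_prev_level == 50 pages on this level, the estimate becomes
50 * 1200 / 10 == 6000 rows. */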
5622
5623 /** If the tree gets changed too much between the two dives for the left
5624 and right boundary then btr_estimate_n_rows_in_range_low() will retry
5625 that many times before giving up and returning the value stored in
5626 rows_in_range_arbitrary_ret_val. */
5627 static const unsigned rows_in_range_max_retries = 4;
5628
5629 /** We pretend that a range has that many records if the tree keeps changing
5630 for rows_in_range_max_retries retries while we try to estimate the records
5631 in a given range. */
5632 static const int64_t rows_in_range_arbitrary_ret_val = 10;
5633
5634 /** Estimates the number of rows in a given index range.
5635 @param[in] index index
5636 @param[in] tuple1 range start, may also be empty tuple
5637 @param[in] mode1 search mode for range start
5638 @param[in] tuple2 range end, may also be empty tuple
5639 @param[in] mode2 search mode for range end
5640 @param[in] nth_attempt if the tree gets modified too much while
5641 we are trying to analyze it, then we will retry (this function will call
5642 itself, incrementing this parameter)
5643 @return estimated number of rows; if after rows_in_range_max_retries
5644 retries the tree keeps changing, then we will just return
5645 rows_in_range_arbitrary_ret_val as a result (if
5646 nth_attempt >= rows_in_range_max_retries and the tree is modified between
5647 the two dives). */
5648 static
5649 int64_t
5650 btr_estimate_n_rows_in_range_low(
5651 dict_index_t* index,
5652 const dtuple_t* tuple1,
5653 page_cur_mode_t mode1,
5654 const dtuple_t* tuple2,
5655 page_cur_mode_t mode2,
5656 unsigned nth_attempt)
5657 {
5658 btr_path_t path1[BTR_PATH_ARRAY_N_SLOTS];
5659 btr_path_t path2[BTR_PATH_ARRAY_N_SLOTS];
5660 btr_cur_t cursor;
5661 btr_path_t* slot1;
5662 btr_path_t* slot2;
5663 ibool diverged;
5664 ibool diverged_lot;
5665 ulint divergence_level;
5666 int64_t n_rows;
5667 ibool is_n_rows_exact;
5668 ulint i;
5669 mtr_t mtr;
5670 int64_t table_n_rows;
5671
5672 table_n_rows = dict_table_get_n_rows(index->table);
5673
5674 /* Below we dive to the two records specified by tuple1 and tuple2 and
5675 we remember the entire dive paths from the tree root. The place where
5676 the tuple1 path ends on the leaf level we call "left border" of our
5677 interval and the place where the tuple2 path ends on the leaf level -
5678 "right border". We take care to either include or exclude the interval
5679 boundaries depending on whether <, <=, > or >= was specified. For
5680 example if "5 < x AND x <= 10" then we should not include the left
5681 boundary, but should include the right one. */
5682
5683 mtr_start(&mtr);
5684
5685 cursor.path_arr = path1;
5686
5687 bool should_count_the_left_border;
5688
5689 if (dtuple_get_n_fields(tuple1) > 0) {
5690
5691 btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
5692 BTR_SEARCH_LEAF | BTR_ESTIMATE,
5693 &cursor, 0,
5694 __FILE__, __LINE__, &mtr);
5695
5696 ut_ad(!page_rec_is_infimum(btr_cur_get_rec(&cursor)));
5697
5698 /* We should count the border if there are any records to
5699 match the criteria, i.e. if the maximum record on the tree is
5700 5 and x > 3 is specified then the cursor will be positioned at
5701 5 and we should count the border, but if x > 7 is specified,
5702 then the cursor will be positioned at 'sup' on the rightmost
5703 leaf page in the tree and we should not count the border. */
5704 should_count_the_left_border
5705 = !page_rec_is_supremum(btr_cur_get_rec(&cursor));
5706 } else {
5707 btr_cur_open_at_index_side(true, index,
5708 BTR_SEARCH_LEAF | BTR_ESTIMATE,
5709 &cursor, 0, &mtr);
5710
5711 ut_ad(page_rec_is_infimum(btr_cur_get_rec(&cursor)));
5712
5713 /* The range specified is without a left border, just
5714 'x < 123' or 'x <= 123' and btr_cur_open_at_index_side()
5715 positioned the cursor on the infimum record on the leftmost
5716 page, which must not be counted. */
5717 should_count_the_left_border = false;
5718 }
5719
5720 mtr_commit(&mtr);
5721
5722 mtr_start(&mtr);
5723
5724 cursor.path_arr = path2;
5725
5726 bool should_count_the_right_border;
5727
5728 if (dtuple_get_n_fields(tuple2) > 0) {
5729
5730 btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
5731 BTR_SEARCH_LEAF | BTR_ESTIMATE,
5732 &cursor, 0,
5733 __FILE__, __LINE__, &mtr);
5734
5735 const rec_t* rec = btr_cur_get_rec(&cursor);
5736
5737 ut_ad(!(mode2 == PAGE_CUR_L && page_rec_is_supremum(rec)));
5738
5739 should_count_the_right_border
5740 = (mode2 == PAGE_CUR_LE /* if the range is '<=' */
5741 /* and the record was found */
5742 && cursor.low_match >= dtuple_get_n_fields(tuple2))
5743 || (mode2 == PAGE_CUR_L /* or if the range is '<' */
5744 /* and there are any records to match the criteria,
5745 i.e. if the minimum record on the tree is 5 and
5746 x < 7 is specified then the cursor will be
5747 positioned at 5 and we should count the border, but
5748 if x < 2 is specified, then the cursor will be
5749 positioned at 'inf' and we should not count the
5750 border */
5751 && !page_rec_is_infimum(rec));
5752 /* Notice that for "WHERE col <= 'foo'" MySQL passes to
5753 ha_innobase::records_in_range():
5754 min_key=NULL (left-unbounded) which is expected
5755 max_key='foo' flag=HA_READ_AFTER_KEY (PAGE_CUR_G), which is
5756 unexpected - one would expect
5757 flag=HA_READ_KEY_OR_PREV (PAGE_CUR_LE). In this case the
5758 cursor will be positioned on the first record to the right of
5759 the requested one (can also be positioned on the 'sup') and
5760 we should not count the right border. */
5761 } else {
5762 btr_cur_open_at_index_side(false, index,
5763 BTR_SEARCH_LEAF | BTR_ESTIMATE,
5764 &cursor, 0, &mtr);
5765
5766 ut_ad(page_rec_is_supremum(btr_cur_get_rec(&cursor)));
5767
5768 /* The range specified is without a right border, just
5769 'x > 123' or 'x >= 123' and btr_cur_open_at_index_side()
5770 positioned the cursor on the supremum record on the rightmost
5771 page, which must not be counted. */
5772 should_count_the_right_border = false;
5773 }
5774
5775 mtr_commit(&mtr);
5776
5777 /* We have the path information for the range in path1 and path2 */
5778
5779 n_rows = 0;
5780 is_n_rows_exact = TRUE;
5781
5782 /* This becomes true when the two paths do not pass through the
5783 same pages anymore. */
5784 diverged = FALSE;
5785
5786 /* This becomes true when the paths are no longer the same or
5787 adjacent. While it is false, the two paths pass through the same
5788 or neighboring-on-the-same-level pages only. */
5789 diverged_lot = FALSE;
5790
5791 /* This is the level where paths diverged a lot. */
5792 divergence_level = 1000000;
5793
5794 for (i = 0; ; i++) {
5795 ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
5796
5797 slot1 = path1 + i;
5798 slot2 = path2 + i;
5799
5800 if (slot1->nth_rec == ULINT_UNDEFINED
5801 || slot2->nth_rec == ULINT_UNDEFINED) {
5802
5803 /* Here none of the borders were counted. For example,
5804 if on the leaf level we descended to:
5805 (inf, a, b, c, d, e, f, sup)
5806 ^ ^
5807 path1 path2
5808 then n_rows will be 2 (c and d). */
5809
5810 if (is_n_rows_exact) {
5811 /* Only fiddle to adjust this off-by-one
5812 if the number is exact, otherwise we do
5813 much grosser adjustments below. */
5814
5815 btr_path_t* last1 = &path1[i - 1];
5816 btr_path_t* last2 = &path2[i - 1];
5817
5818 /* If both paths end up on the same record on
5819 the leaf level. */
5820 if (last1->page_no == last2->page_no
5821 && last1->nth_rec == last2->nth_rec) {
5822
5823 /* n_rows can be > 0 here if the paths
5824 were first different and then converged
5825 to the same record on the leaf level.
5826 For example:
5827 SELECT ... LIKE 'wait/synch/rwlock%'
5828 mode1=PAGE_CUR_GE,
5829 tuple1="wait/synch/rwlock"
5830 path1[0]={nth_rec=58, n_recs=58,
5831 page_no=3, page_level=1}
5832 path1[1]={nth_rec=56, n_recs=55,
5833 page_no=119, page_level=0}
5834
5835 mode2=PAGE_CUR_G
5836 tuple2="wait/synch/rwlock"
5837 path2[0]={nth_rec=57, n_recs=57,
5838 page_no=3, page_level=1}
5839 path2[1]={nth_rec=56, n_recs=55,
5840 page_no=119, page_level=0} */
5841
5842 /* If the range is such that we should
5843 count both borders, then avoid
5844 counting that record twice - once as a
5845 left border and once as a right
5846 border. */
5847 if (should_count_the_left_border
5848 && should_count_the_right_border) {
5849
5850 n_rows = 1;
5851 } else {
5852 /* Some of the borders should
5853 not be counted, e.g. [3,3). */
5854 n_rows = 0;
5855 }
5856 } else {
5857 if (should_count_the_left_border) {
5858 n_rows++;
5859 }
5860
5861 if (should_count_the_right_border) {
5862 n_rows++;
5863 }
5864 }
5865 }
5866
5867 if (i > divergence_level + 1 && !is_n_rows_exact) {
5868 /* In trees whose height is > 1 our algorithm
5869 tends to underestimate: multiply the estimate
5870 by 2: */
5871
5872 n_rows = n_rows * 2;
5873 }
5874
5875 DBUG_EXECUTE_IF("bug14007649", return(n_rows););
5876
5877 /* Do not estimate the number of rows in the range
5878 to over 1 / 2 of the estimated rows in the whole
5879 table */
5880
5881 if (n_rows > table_n_rows / 2 && !is_n_rows_exact) {
5882
5883 n_rows = table_n_rows / 2;
5884
5885 /* If there are just 0 or 1 rows in the table,
5886 then we estimate all rows are in the range */
5887
5888 if (n_rows == 0) {
5889 n_rows = table_n_rows;
5890 }
5891 }
5892
5893 return(n_rows);
5894 }
5895
5896 if (!diverged && slot1->nth_rec != slot2->nth_rec) {
5897
5898 /* If both slots do not point to the same page,
5899 this means that the tree must have changed between
5900 the dive for slot1 and the dive for slot2 at the
5901 beginning of this function. */
5902 if (slot1->page_no != slot2->page_no
5903 || slot1->page_level != slot2->page_level) {
5904
5905 /* If the tree keeps changing even after a
5906 few attempts, then just return some arbitrary
5907 number. */
5908 if (nth_attempt >= rows_in_range_max_retries) {
5909 return(rows_in_range_arbitrary_ret_val);
5910 }
5911
5912 const int64_t ret =
5913 btr_estimate_n_rows_in_range_low(
5914 index, tuple1, mode1,
5915 tuple2, mode2, nth_attempt + 1);
5916
5917 return(ret);
5918 }
5919
5920 diverged = TRUE;
5921
5922 if (slot1->nth_rec < slot2->nth_rec) {
5923 /* We do not count the borders (nor the left
5924 nor the right one), thus "- 1". */
5925 n_rows = slot2->nth_rec - slot1->nth_rec - 1;
5926
5927 if (n_rows > 0) {
5928 /* There is at least one row between
5929 the two borders pointed to by slot1
5930 and slot2, so on the level below the
5931 slots will point to non-adjacent
5932 pages. */
5933 diverged_lot = TRUE;
5934 divergence_level = i;
5935 }
5936 } else {
5937 /* It is possible that
5938 slot1->nth_rec >= slot2->nth_rec
5939 if, for example, we have a single page
5940 tree which contains (inf, 5, 6, supr)
5941 and we select where x > 20 and x < 30;
5942 in this case slot1->nth_rec will point
5943 to the supr record and slot2->nth_rec
5944 will point to 6. */
5945 n_rows = 0;
5946 should_count_the_left_border = false;
5947 should_count_the_right_border = false;
5948 }
5949
5950 } else if (diverged && !diverged_lot) {
5951
5952 if (slot1->nth_rec < slot1->n_recs
5953 || slot2->nth_rec > 1) {
5954
5955 diverged_lot = TRUE;
5956 divergence_level = i;
5957
5958 n_rows = 0;
5959
5960 if (slot1->nth_rec < slot1->n_recs) {
5961 n_rows += slot1->n_recs
5962 - slot1->nth_rec;
5963 }
5964
5965 if (slot2->nth_rec > 1) {
5966 n_rows += slot2->nth_rec - 1;
5967 }
5968 }
5969 } else if (diverged_lot) {
5970
5971 n_rows = btr_estimate_n_rows_in_range_on_level(
5972 index, slot1, slot2, n_rows,
5973 &is_n_rows_exact);
5974 }
5975 }
5976 }
5977
5978 /** Estimates the number of rows in a given index range.
5979 @param[in] index index
5980 @param[in] tuple1 range start, may also be empty tuple
5981 @param[in] mode1 search mode for range start
5982 @param[in] tuple2 range end, may also be empty tuple
5983 @param[in] mode2 search mode for range end
5984 @return estimated number of rows */
5985 int64_t
5986 btr_estimate_n_rows_in_range(
5987 dict_index_t* index,
5988 const dtuple_t* tuple1,
5989 page_cur_mode_t mode1,
5990 const dtuple_t* tuple2,
5991 page_cur_mode_t mode2)
5992 {
5993 const int64_t ret = btr_estimate_n_rows_in_range_low(
5994 index, tuple1, mode1, tuple2, mode2, 1 /* first attempt */);
5995
5996 return(ret);
5997 }
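/* Usage sketch (illustrative, simplified from the handler layer): the
optimizer's records_in_range() builds two search tuples and maps the
SQL comparison operators to page cursor modes. For "5 < x AND x <= 10"
this would be roughly

	n = btr_estimate_n_rows_in_range(index,
					 tuple_5, PAGE_CUR_G,
					 tuple_10, PAGE_CUR_LE);

where tuple_5 and tuple_10 are hypothetical dtuples built from the
range bounds. */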
5998
5999 /*******************************************************************//**
6000 Record the number of non-null key values in a given index for
6001 each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6002 The estimates are eventually stored in the array:
6003 index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
6004 static
6005 void
6006 btr_record_not_null_field_in_rec(
6007 /*=============================*/
6008 ulint n_unique, /*!< in: dict_index_get_n_unique(index),
6009 number of columns that uniquely determine
6010 an index entry */
6011 const ulint* offsets, /*!< in: rec_get_offsets(rec, index),
6012 its size could be for all fields or
6013 that of "n_unique" */
6014 ib_uint64_t* n_not_null) /*!< in/out: array to record number of
6015 not null rows for n-column prefix */
6016 {
6017 ulint i;
6018
6019 ut_ad(rec_offs_n_fields(offsets) >= n_unique);
6020
6021 if (n_not_null == NULL) {
6022 return;
6023 }
6024
6025 for (i = 0; i < n_unique; i++) {
6026 if (rec_offs_nth_sql_null(offsets, i)) {
6027 break;
6028 }
6029
6030 n_not_null[i]++;
6031 }
6032 }
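/* Illustrative example (not in the original source): for an index on
(a, b, c) with n_unique == 3, a record (1, NULL, 7) increments only
n_not_null[0]; the loop breaks at the NULL in the second column, so
neither the (a, b) nor the (a, b, c) prefix is counted. */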
6033
6034 /*******************************************************************//**
6035 Estimates the number of different key values in a given index, for
6036 each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
6037 The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
6038 0..n_uniq-1) and the number of pages that were sampled is saved in
6039 index->stat_n_sample_sizes[].
6040 If innodb_stats_method is nulls_ignored, we also record the number of
6041 non-null values for each prefix and store the estimates in the
6042 array index->stat_n_non_null_key_vals.
6043 @return true if the index is available and we get the estimated numbers,
6044 false if the index is unavailable. */
6045 bool
6046 btr_estimate_number_of_different_key_vals(
6047 /*======================================*/
6048 dict_index_t* index) /*!< in: index */
6049 {
6050 btr_cur_t cursor;
6051 page_t* page;
6052 rec_t* rec;
6053 ulint n_cols;
6054 ib_uint64_t* n_diff;
6055 ib_uint64_t* n_not_null;
6056 ibool stats_null_not_equal;
6057 uintmax_t n_sample_pages; /* number of pages to sample */
6058 ulint not_empty_flag = 0;
6059 ulint total_external_size = 0;
6060 ulint i;
6061 ulint j;
6062 uintmax_t add_on;
6063 mtr_t mtr;
6064 mem_heap_t* heap = NULL;
6065 ulint* offsets_rec = NULL;
6066 ulint* offsets_next_rec = NULL;
6067
6068 /* For a spatial index, no such stats can be
6069 fetched. */
6070 if (dict_index_is_spatial(index)) {
6071 return(false);
6072 }
6073
6074 n_cols = dict_index_get_n_unique(index);
6075
6076 heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
6077 * n_cols
6078 + dict_index_get_n_fields(index)
6079 * (sizeof *offsets_rec
6080 + sizeof *offsets_next_rec));
6081
6082 n_diff = (ib_uint64_t*) mem_heap_zalloc(
6083 heap, n_cols * sizeof(n_diff[0]));
6084
6085 n_not_null = NULL;
6086
6087 /* Check srv_innodb_stats_method setting, and decide whether we
6088 need to record non-null value and also decide if NULL is
6089 considered equal (by setting stats_null_not_equal value) */
6090 switch (srv_innodb_stats_method) {
6091 case SRV_STATS_NULLS_IGNORED:
6092 n_not_null = (ib_uint64_t*) mem_heap_zalloc(
6093 heap, n_cols * sizeof *n_not_null);
6094 /* fall through */
6095
6096 case SRV_STATS_NULLS_UNEQUAL:
6097 /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
6098 case, we will treat NULLs as unequal value */
6099 stats_null_not_equal = TRUE;
6100 break;
6101
6102 case SRV_STATS_NULLS_EQUAL:
6103 stats_null_not_equal = FALSE;
6104 break;
6105
6106 default:
6107 ut_error;
6108 }
6109
6110 /* It makes no sense to test more pages than are contained
6111 in the index, thus we lower the number if it is too high */
6112 if (srv_stats_transient_sample_pages > index->stat_index_size) {
6113 if (index->stat_index_size > 0) {
6114 n_sample_pages = index->stat_index_size;
6115 } else {
6116 n_sample_pages = 1;
6117 }
6118 } else {
6119 n_sample_pages = srv_stats_transient_sample_pages;
6120 }
6121
6122 /* We sample some pages in the index to get an estimate */
6123
6124 for (i = 0; i < n_sample_pages; i++) {
6125 mtr_start(&mtr);
6126
6127 bool available;
6128
6129 available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
6130 &cursor, &mtr);
6131
6132 if (!available) {
6133 mtr_commit(&mtr);
6134 mem_heap_free(heap);
6135
6136 return(false);
6137 }
6138
6139 /* Count the number of different key values for each prefix of
6140 the key on this index page. If the prefix does not determine
6141 the index record uniquely in the B-tree, then we subtract one
6142 because otherwise our algorithm would give a wrong estimate
6143 for an index where there is just one key value. */
6144
6145 page = btr_cur_get_page(&cursor);
6146
6147 rec = page_rec_get_next(page_get_infimum_rec(page));
6148
6149 if (!page_rec_is_supremum(rec)) {
6150 not_empty_flag = 1;
6151 offsets_rec = rec_get_offsets(rec, index, offsets_rec,
6152 ULINT_UNDEFINED, &heap);
6153
6154 if (n_not_null != NULL) {
6155 btr_record_not_null_field_in_rec(
6156 n_cols, offsets_rec, n_not_null);
6157 }
6158 }
6159
6160 while (!page_rec_is_supremum(rec)) {
6161 ulint matched_fields;
6162 rec_t* next_rec = page_rec_get_next(rec);
6163 if (page_rec_is_supremum(next_rec)) {
6164 total_external_size +=
6165 btr_rec_get_externally_stored_len(
6166 rec, offsets_rec);
6167 break;
6168 }
6169
6170 offsets_next_rec = rec_get_offsets(next_rec, index,
6171 offsets_next_rec,
6172 ULINT_UNDEFINED,
6173 &heap);
6174
6175 cmp_rec_rec_with_match(rec, next_rec,
6176 offsets_rec, offsets_next_rec,
6177 index,
6178 page_is_spatial_non_leaf(next_rec, index),
6179 stats_null_not_equal,
6180 &matched_fields);
6181
6182 for (j = matched_fields; j < n_cols; j++) {
6183 /* We add one if this index record has
6184 a different prefix from the previous */
6185
6186 n_diff[j]++;
6187 }
6188
6189 if (n_not_null != NULL) {
6190 btr_record_not_null_field_in_rec(
6191 n_cols, offsets_next_rec, n_not_null);
6192 }
6193
6194 total_external_size
6195 += btr_rec_get_externally_stored_len(
6196 rec, offsets_rec);
6197
6198 rec = next_rec;
6199 /* Initialize offsets_rec for the next round
6200 and assign the old offsets_rec buffer to
6201 offsets_next_rec. */
6202 {
6203 ulint* offsets_tmp = offsets_rec;
6204 offsets_rec = offsets_next_rec;
6205 offsets_next_rec = offsets_tmp;
6206 }
6207 }
6208
6209
6210 if (n_cols == dict_index_get_n_unique_in_tree(index)) {
6211
6212 /* If there is more than one leaf page in the tree,
6213 we add one because we know that the first record
6214 on the page certainly had a different prefix than the
6215 last record on the previous index page in the
6216 alphabetical order. Before this fix, if there was
6217 just one big record on each clustered index page, the
6218 algorithm grossly underestimated the number of rows
6219 in the table. */
6220
6221 if (btr_page_get_prev(page, &mtr) != FIL_NULL
6222 || btr_page_get_next(page, &mtr) != FIL_NULL) {
6223
6224 n_diff[n_cols - 1]++;
6225 }
6226 }
6227
6228 mtr_commit(&mtr);
6229 }
6230
6231 /* If we saw k borders between different key values on
6232 n_sample_pages leaf pages, we can estimate how many
6233 there will be in index->stat_n_leaf_pages */
6234
6235 /* We must take into account that our sample actually represents
6236 also the pages used for external storage of fields (those pages are
6237 included in index->stat_n_leaf_pages) */
6238
6239 for (j = 0; j < n_cols; j++) {
6240 index->stat_n_diff_key_vals[j]
6241 = BTR_TABLE_STATS_FROM_SAMPLE(
6242 n_diff[j], index, n_sample_pages,
6243 total_external_size, not_empty_flag);
6244
6245 /* If the tree is small, smaller than
6246 10 * n_sample_pages + total_external_size, then
6247 the above estimate is ok. For bigger trees it is common that we
6248 do not see any borders between key values in the few pages
6249 we pick. But still there may be n_sample_pages
6250 different key values, or even more. Let us try to approximate
6251 that: */
6252
6253 add_on = index->stat_n_leaf_pages
6254 / (10 * (n_sample_pages
6255 + total_external_size));
6256
6257 if (add_on > n_sample_pages) {
6258 add_on = n_sample_pages;
6259 }
6260
6261 index->stat_n_diff_key_vals[j] += add_on;
6262
6263 index->stat_n_sample_sizes[j] = n_sample_pages;
6264
6265 /* Update the stat_n_non_null_key_vals[] with our
6266 sampled result. stat_n_non_null_key_vals[] is created
6267 and initialized to zero in dict_index_add_to_cache(),
6268 along with stat_n_diff_key_vals[] array */
6269 if (n_not_null != NULL) {
6270 index->stat_n_non_null_key_vals[j] =
6271 BTR_TABLE_STATS_FROM_SAMPLE(
6272 n_not_null[j], index, n_sample_pages,
6273 total_external_size, not_empty_flag);
6274 }
6275 }
6276
6277 mem_heap_free(heap);
6278
6279 return(true);
6280 }
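/* Illustrative note (not in the original source): with
srv_stats_transient_sample_pages == 20 and index->stat_index_size == 8,
only 8 pages are sampled; the per-page border counts in n_diff[] are
then scaled to index->stat_n_leaf_pages by the
BTR_TABLE_STATS_FROM_SAMPLE() macro used above. */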
6281
6282 /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
6283
6284 /***********************************************************//**
6285 Gets the offset of the pointer to the externally stored part of a field.
6286 @return offset of the pointer to the externally stored part */
6287 static
6288 ulint
6289 btr_rec_get_field_ref_offs(
6290 /*=======================*/
6291 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6292 ulint n) /*!< in: index of the external field */
6293 {
6294 ulint field_ref_offs;
6295 ulint local_len;
6296
6297 ut_a(rec_offs_nth_extern(offsets, n));
6298 field_ref_offs = rec_get_nth_field_offs(offsets, n, &local_len);
6299 ut_a(local_len != UNIV_SQL_NULL);
6300 ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6301
6302 return(field_ref_offs + local_len - BTR_EXTERN_FIELD_REF_SIZE);
6303 }
6304
6305 /** Gets a pointer to the externally stored part of a field.
6306 @param rec record
6307 @param offsets rec_get_offsets(rec)
6308 @param n index of the externally stored field
6309 @return pointer to the externally stored part */
6310 #define btr_rec_get_field_ref(rec, offsets, n) \
6311 ((rec) + btr_rec_get_field_ref_offs(offsets, n))
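/* Illustrative note (assumed on-disk layout, standard for InnoDB): an
externally stored column keeps a local prefix followed by a
BTR_EXTERN_FIELD_REF_SIZE (20 byte) reference: 4 byte space id, 4 byte
page number, 4 byte offset and an 8 byte length, matching the
BTR_EXTERN_* offsets used further below. The reference is thus the
last 20 bytes of the locally stored part of the field. */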
6312
6313 /** Gets the externally stored size of a record, in units of a database page.
6314 @param[in] rec record
6315 @param[in] offsets array returned by rec_get_offsets()
6316 @return externally stored part, in units of a database page */
6317 ulint
6318 btr_rec_get_externally_stored_len(
6319 const rec_t* rec,
6320 const ulint* offsets)
6321 {
6322 ulint n_fields;
6323 ulint total_extern_len = 0;
6324 ulint i;
6325
6326 ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6327
6328 if (!rec_offs_any_extern(offsets)) {
6329 return(0);
6330 }
6331
6332 n_fields = rec_offs_n_fields(offsets);
6333
6334 for (i = 0; i < n_fields; i++) {
6335 if (rec_offs_nth_extern(offsets, i)) {
6336
6337 ulint extern_len = mach_read_from_4(
6338 btr_rec_get_field_ref(rec, offsets, i)
6339 + BTR_EXTERN_LEN + 4);
6340
6341 total_extern_len += ut_calc_align(extern_len,
6342 UNIV_PAGE_SIZE);
6343 }
6344 }
6345
6346 return(total_extern_len / UNIV_PAGE_SIZE);
6347 }
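/* Worked example (illustrative): a single externally stored column
with extern_len == 100000 on a UNIV_PAGE_SIZE == 16384 tablespace
contributes ut_calc_align(100000, 16384) / 16384 == 7 pages to the
returned total. */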
6348
6349 /*******************************************************************//**
6350 Sets the ownership bit of an externally stored field in a record. */
6351 static
6352 void
6353 btr_cur_set_ownership_of_extern_field(
6354 /*==================================*/
6355 page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
6356 part will be updated, or NULL */
6357 rec_t* rec, /*!< in/out: clustered index record */
6358 dict_index_t* index, /*!< in: index of the page */
6359 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6360 ulint i, /*!< in: field number */
6361 ibool val, /*!< in: value to set */
6362 mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
6363 {
6364 byte* data;
6365 ulint local_len;
6366 ulint byte_val;
6367
6368 data = rec_get_nth_field(rec, offsets, i, &local_len);
6369 ut_ad(rec_offs_nth_extern(offsets, i));
6370 ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
6371
6372 local_len -= BTR_EXTERN_FIELD_REF_SIZE;
6373
6374 byte_val = mach_read_from_1(data + local_len + BTR_EXTERN_LEN);
6375
6376 if (val) {
6377 byte_val = byte_val & (~BTR_EXTERN_OWNER_FLAG);
6378 } else {
6379 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6380 ut_a(!(byte_val & BTR_EXTERN_OWNER_FLAG));
6381 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6382 byte_val = byte_val | BTR_EXTERN_OWNER_FLAG;
6383 }
6384
6385 if (page_zip) {
6386 mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
6387 page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
6388 } else if (mtr != NULL) {
6389
6390 mlog_write_ulint(data + local_len + BTR_EXTERN_LEN, byte_val,
6391 MLOG_1BYTE, mtr);
6392 } else {
6393 mach_write_to_1(data + local_len + BTR_EXTERN_LEN, byte_val);
6394 }
6395 }
6396
6397 /*******************************************************************//**
6398 Marks non-updated off-page fields as disowned by this record. The ownership
6399 must be transferred to the updated record which is inserted elsewhere in the
6400 index tree. In purge, only the owner of an externally stored field is
6401 allowed to free the field. */
6402 void
6403 btr_cur_disown_inherited_fields(
6404 /*============================*/
6405 page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
6406 part will be updated, or NULL */
6407 rec_t* rec, /*!< in/out: record in a clustered index */
6408 dict_index_t* index, /*!< in: index of the page */
6409 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6410 const upd_t* update, /*!< in: update vector */
6411 mtr_t* mtr) /*!< in/out: mini-transaction */
6412 {
6413 ulint i;
6414
6415 ut_ad(rec_offs_validate(rec, index, offsets));
6416 ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6417 ut_ad(rec_offs_any_extern(offsets));
6418 ut_ad(mtr);
6419
6420 for (i = 0; i < rec_offs_n_fields(offsets); i++) {
6421 if (rec_offs_nth_extern(offsets, i)
6422 && !upd_get_field_by_field_no(update, i, false)) {
6423 btr_cur_set_ownership_of_extern_field(
6424 page_zip, rec, index, offsets, i, FALSE, mtr);
6425 }
6426 }
6427 }
6428
6429 /*******************************************************************//**
6430 Marks all extern fields in a record as owned by the record. This function
6431 should be called if the delete mark of a record is removed: a record
6432 that is not delete-marked always owns all its extern fields. */
6433 static
6434 void
6435 btr_cur_unmark_extern_fields(
6436 /*=========================*/
6437 page_zip_des_t* page_zip,/*!< in/out: compressed page whose uncompressed
6438 part will be updated, or NULL */
6439 rec_t* rec, /*!< in/out: record in a clustered index */
6440 dict_index_t* index, /*!< in: index of the page */
6441 const ulint* offsets,/*!< in: array returned by rec_get_offsets() */
6442 mtr_t* mtr) /*!< in: mtr, or NULL if not logged */
6443 {
6444 ulint n;
6445 ulint i;
6446
6447 ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
6448 n = rec_offs_n_fields(offsets);
6449
6450 if (!rec_offs_any_extern(offsets)) {
6451
6452 return;
6453 }
6454
6455 for (i = 0; i < n; i++) {
6456 if (rec_offs_nth_extern(offsets, i)) {
6457
6458 btr_cur_set_ownership_of_extern_field(
6459 page_zip, rec, index, offsets, i, TRUE, mtr);
6460 }
6461 }
6462 }
6463
6464 /*******************************************************************//**
6465 Returns the length of a BLOB part stored on the header page.
6466 @return part length */
6467 static
6468 ulint
6469 btr_blob_get_part_len(
6470 /*==================*/
6471 const byte* blob_header) /*!< in: blob header */
6472 {
6473 return(mach_read_from_4(blob_header + BTR_BLOB_HDR_PART_LEN));
6474 }
6475
6476 /*******************************************************************//**
6477 Returns the page number where the next BLOB part is stored.
6478 @return page number or FIL_NULL if no more pages */
6479 static
6480 ulint
6481 btr_blob_get_next_page_no(
6482 /*======================*/
6483 const byte* blob_header) /*!< in: blob header */
6484 {
6485 return(mach_read_from_4(blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO));
6486 }
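/* Illustrative note (assumed layout, consistent with the accessors
above): each uncompressed BLOB page stores at FIL_PAGE_DATA a header
of BTR_BLOB_HDR_PART_LEN (4 bytes, length of the part on this page)
and BTR_BLOB_HDR_NEXT_PAGE_NO (4 bytes, FIL_NULL terminates the
chain), followed by the BLOB payload itself. */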
6487
6488 /*******************************************************************//**
6489 Deallocate a buffer block that was reserved for a BLOB part. */
6490 static
6491 void
6492 btr_blob_free(
6493 /*==========*/
6494 dict_index_t* index, /*!< in: index */
6495 buf_block_t* block, /*!< in: buffer block */
6496 ibool all, /*!< in: TRUE=remove also the compressed page
6497 if there is one */
6498 mtr_t* mtr) /*!< in: mini-transaction to commit */
6499 {
6500 buf_pool_t* buf_pool = buf_pool_from_block(block);
6501 ulint space = block->page.id.space();
6502 ulint page_no = block->page.id.page_no();
6503
6504 ut_ad(mtr_is_block_fix(mtr, block, MTR_MEMO_PAGE_X_FIX, index->table));
6505
6506 mtr_commit(mtr);
6507
6508 buf_pool_mutex_enter(buf_pool);
6509
6510 /* Only free the block if it is still allocated to
6511 the same file page. */
6512
6513 if (buf_block_get_state(block)
6514 == BUF_BLOCK_FILE_PAGE
6515 && block->page.id.space() == space
6516 && block->page.id.page_no() == page_no) {
6517
6518 if (!buf_LRU_free_page(&block->page, all)
6519 && all && block->page.zip.data) {
6520 /* Attempt to deallocate the uncompressed page
6521 if the whole block cannot be deallocated. */
6522
6523 buf_LRU_free_page(&block->page, false);
6524 }
6525 }
6526
6527 buf_pool_mutex_exit(buf_pool);
6528 }
6529
6530 /** Helper class used while writing blob pages, during insert or update. */
6531 struct btr_blob_log_check_t {
6532 /** Persistent cursor on a clustered index record with blobs. */
6533 btr_pcur_t* m_pcur;
6534 /** Mini-transaction holding the latches for m_pcur. */
6535 mtr_t* m_mtr;
6536 /** rec_get_offsets(rec, index); offset of clust_rec */
6537 const ulint* m_offsets;
6538 /** The block containing clustered record */
6539 buf_block_t** m_block;
6540 /** The clustered record pointer */
6541 rec_t** m_rec;
6542 /** The blob operation code */
6543 enum blob_op m_op;
6544
6545 /** Constructor
6546 @param[in] pcur persistent cursor on a clustered
6547 index record with blobs.
6548 @param[in] mtr mini-transaction holding latches for
6549 pcur.
6550 @param[in] offsets offsets of the clust_rec
6551 @param[in,out] block record block containing pcur record
6552 @param[in,out] rec the clustered record pointer
6553 @param[in] op the blob operation code */
6554 btr_blob_log_check_t(
6555 btr_pcur_t* pcur,
6556 mtr_t* mtr,
6557 const ulint* offsets,
6558 buf_block_t** block,
6559 rec_t** rec,
6560 enum blob_op op)
6561 : m_pcur(pcur),
6562 m_mtr(mtr),
6563 m_offsets(offsets),
6564 m_block(block),
6565 m_rec(rec),
6566 m_op(op)
6567 {
6568 ut_ad(rec_offs_validate(*m_rec, m_pcur->index(), m_offsets));
6569 ut_ad((*m_block)->frame == page_align(*m_rec));
6570 ut_ad(*m_rec == btr_pcur_get_rec(m_pcur));
6571 }
6572
6573 /** Check if there is enough space in the redo log. Commit and restart
6574 the mini-transaction. */
6575 void check()
6576 {
6577 dict_index_t* index = m_pcur->index();
6578 ulint offs = 0;
6579 ulint page_no = ULINT_UNDEFINED;
6580 FlushObserver* observer = m_mtr->get_flush_observer();
6581
6582 if (m_op == BTR_STORE_INSERT_BULK) {
6583 offs = page_offset(*m_rec);
6584 page_no = page_get_page_no(
6585 buf_block_get_frame(*m_block));
6586
6587 buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__);
6588 } else {
6589 btr_pcur_store_position(m_pcur, m_mtr);
6590 }
6591 m_mtr->commit();
6592
6593 DEBUG_SYNC_C("blob_write_middle");
6594
6595 log_free_check();
6596
6597 DEBUG_SYNC_C("blob_write_middle_after_check");
6598
6599 const mtr_log_t log_mode = m_mtr->get_log_mode();
6600 m_mtr->start();
6601 m_mtr->set_log_mode(log_mode);
6602 m_mtr->set_named_space(index->space);
6603 m_mtr->set_flush_observer(observer);
6604
6605 if (m_op == BTR_STORE_INSERT_BULK) {
6606 page_id_t page_id(dict_index_get_space(index),
6607 page_no);
6608 page_size_t page_size(dict_table_page_size(
6609 index->table));
6610 page_cur_t* page_cur = &m_pcur->btr_cur.page_cur;
6611
6612 mtr_x_lock(dict_index_get_lock(index), m_mtr);
6613 page_cur->block = btr_block_get(
6614 page_id, page_size, RW_X_LATCH, index, m_mtr);
6615 page_cur->rec = buf_block_get_frame(page_cur->block)
6616 + offs;
6617
6618 buf_block_buf_fix_dec(page_cur->block);
6619 } else {
6620 ut_ad(m_pcur->rel_pos == BTR_PCUR_ON);
6621 bool ret = btr_pcur_restore_position(
6622 BTR_MODIFY_LEAF | BTR_MODIFY_EXTERNAL,
6623 m_pcur, m_mtr);
6624
6625 ut_a(ret);
6626 }
6627
6628 *m_block = btr_pcur_get_block(m_pcur);
6629 *m_rec = btr_pcur_get_rec(m_pcur);
6630
6631 ut_d(rec_offs_make_valid(
6632 *m_rec, index, const_cast<ulint*>(m_offsets)));
6633
6634 ut_ad(m_mtr->memo_contains_page_flagged(
6635 *m_rec,
6636 MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)
6637 || dict_table_is_intrinsic(index->table));
6638
6639 ut_ad(mtr_memo_contains_flagged(m_mtr,
6640 dict_index_get_lock(index),
6641 MTR_MEMO_SX_LOCK | MTR_MEMO_X_LOCK)
6642 || dict_table_is_intrinsic(index->table));
6643 }
6644 };
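/* Usage note (illustrative): btr_store_big_rec_extern_fields() below
creates one btr_blob_log_check_t and calls check() every commit_freq
(4) BLOB pages, so the clustered index mini-transaction never holds
its latches across an unbounded amount of redo log. */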
6645
6646
6647 /*******************************************************************//**
6648 Stores the fields in big_rec_vec to the tablespace and puts pointers to
6649 them in rec. The extern flags in rec will have to be set beforehand.
6650 The fields are stored on pages allocated from the leaf node
6651 file segment of the index tree.
6652
6653 TODO: If the allocation extends the tablespace, it will not be redo logged, in
6654 any mini-transaction. Tablespace extension should be redo-logged, so that
6655 recovery will not fail when the big_rec was written to the extended portion of
6656 the file, in case the file was somehow truncated in the crash.
6657
6658 @return DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
6659 dberr_t
6660 btr_store_big_rec_extern_fields(
6661 /*============================*/
6662 btr_pcur_t* pcur, /*!< in/out: a persistent cursor. If
6663 btr_mtr is restarted, then this can
6664 be repositioned. */
6665 const upd_t* upd, /*!< in: update vector */
6666 ulint* offsets, /*!< in/out: rec_get_offsets() on
6667 pcur. The "external storage" flags
6668 in offsets will correctly correspond
6669 to rec when this function returns */
6670 const big_rec_t*big_rec_vec, /*!< in: vector containing fields
6671 to be stored externally */
6672 mtr_t* btr_mtr, /*!< in/out: mtr containing the
6673 latches to the clustered index. can be
6674 committed and restarted. */
6675 enum blob_op op) /*!< in: operation code */
6676 {
6677 ulint rec_page_no;
6678 byte* field_ref;
6679 ulint extern_len;
6680 ulint store_len;
6681 ulint page_no;
6682 ulint space_id;
6683 ulint prev_page_no;
6684 ulint hint_page_no;
6685 ulint i;
6686 mtr_t mtr;
6687 mtr_t mtr_bulk;
6688 mem_heap_t* heap = NULL;
6689 page_zip_des_t* page_zip;
6690 z_stream c_stream;
6691 dberr_t error = DB_SUCCESS;
6692 dict_index_t* index = pcur->index();
6693 buf_block_t* rec_block = btr_pcur_get_block(pcur);
6694 rec_t* rec = btr_pcur_get_rec(pcur);
6695
6696 ut_ad(rec_offs_validate(rec, index, offsets));
6697 ut_ad(rec_offs_any_extern(offsets));
6698 ut_ad(btr_mtr);
6699 ut_ad(mtr_memo_contains_flagged(btr_mtr, dict_index_get_lock(index),
6700 MTR_MEMO_X_LOCK
6701 | MTR_MEMO_SX_LOCK)
6702 || dict_table_is_intrinsic(index->table)
6703 || !index->is_committed());
6704 ut_ad(mtr_is_block_fix(
6705 btr_mtr, rec_block, MTR_MEMO_PAGE_X_FIX, index->table));
6706 ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
6707 ut_a(dict_index_is_clust(index));
6708
6709 ut_a(dict_table_page_size(index->table)
6710 .equals_to(rec_block->page.size));
6711
6712 btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
6713 &rec, op);
6714 page_zip = buf_block_get_page_zip(rec_block);
6715 space_id = rec_block->page.id.space();
6716 rec_page_no = rec_block->page.id.page_no();
6717 ut_a(fil_page_index_page_check(page_align(rec))
6718 || op == BTR_STORE_INSERT_BULK);
6719
6720 if (page_zip) {
6721 int err;
6722
6723 /* Zlib deflate needs 128 kilobytes for the default
6724 window size, plus 512 << memLevel, plus a few
6725 kilobytes for small objects. We use reduced memLevel
6726 to limit the memory consumption, and preallocate the
6727 heap, hoping to avoid memory fragmentation. */
6728 heap = mem_heap_create(250000);
6729 page_zip_set_alloc(&c_stream, heap);
6730
6731 err = deflateInit2(&c_stream, page_zip_level,
6732 Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
6733 ut_a(err == Z_OK);
6734 }
6735
6736 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6737 /* All pointers to externally stored columns in the record
6738 must either be zero or they must be pointers to inherited
6739 columns, owned by this record or an earlier record version. */
6740 for (i = 0; i < big_rec_vec->n_fields; i++) {
6741 field_ref = btr_rec_get_field_ref(
6742 rec, offsets, big_rec_vec->fields[i].field_no);
6743
6744 ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
6745 /* Either this must be an update in place,
6746 or the BLOB must be inherited, or the BLOB pointer
6747 must be zero (will be written in this function). */
6748 ut_a(op == BTR_STORE_UPDATE
6749 || (field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_INHERITED_FLAG)
6750 || !memcmp(field_ref, field_ref_zero,
6751 BTR_EXTERN_FIELD_REF_SIZE));
6752 }
6753 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6754
6755 const page_size_t page_size(dict_table_page_size(index->table));
6756
6757 /* Space available in compressed page to carry blob data */
6758 const ulint payload_size_zip = page_size.physical()
6759 - FIL_PAGE_DATA;
6760
6761 /* Space available in uncompressed page to carry blob data */
6762 const ulint payload_size = page_size.physical()
6763 - FIL_PAGE_DATA - BTR_BLOB_HDR_SIZE - FIL_PAGE_DATA_END;
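/* Illustrative numbers (assuming the usual constants
FIL_PAGE_DATA == 38, FIL_PAGE_DATA_END == 8 and
BTR_BLOB_HDR_SIZE == 8): on a 16384 byte page,
payload_size_zip == 16346 and payload_size == 16330. */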
6764
6765 /* We have to create a file segment to the tablespace
6766 for each field and put the pointer to the field in rec */
6767
6768 for (i = 0; i < big_rec_vec->n_fields; i++) {
6769 const ulint field_no = big_rec_vec->fields[i].field_no;
6770
6771 field_ref = btr_rec_get_field_ref(rec, offsets, field_no);
6772 #if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
6773 /* A zero BLOB pointer should have been initially inserted. */
6774 ut_a(!memcmp(field_ref, field_ref_zero,
6775 BTR_EXTERN_FIELD_REF_SIZE));
6776 #endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
6777 extern_len = big_rec_vec->fields[i].len;
6778 UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
6779 extern_len);
6780
6781 ut_a(extern_len > 0);
6782
6783 prev_page_no = FIL_NULL;
6784
6785 if (page_zip) {
6786 int err = deflateReset(&c_stream);
6787 ut_a(err == Z_OK);
6788
6789 c_stream.next_in = (Bytef*)
6790 big_rec_vec->fields[i].data;
6791 c_stream.avail_in = static_cast<uInt>(extern_len);
6792 }
6793
6794 for (ulint blob_npages = 0;; ++blob_npages) {
6795 buf_block_t* block;
6796 page_t* page;
6797 const ulint commit_freq = 4;
6798 ulint r_extents;
6799
6800 ut_ad(page_align(field_ref) == page_align(rec));
6801
6802 if (!(blob_npages % commit_freq)) {
6803
6804 redo_log.check();
6805
6806 field_ref = btr_rec_get_field_ref(
6807 rec, offsets, field_no);
6808
6809 page_zip = buf_block_get_page_zip(rec_block);
6810 rec_page_no = rec_block->page.id.page_no();
6811 }
6812
6813 mtr_start(&mtr);
6814 mtr.set_named_space(index->space);
6815 mtr.set_log_mode(btr_mtr->get_log_mode());
6816 mtr.set_flush_observer(btr_mtr->get_flush_observer());
6817
6818 buf_page_get(rec_block->page.id,
6819 rec_block->page.size, RW_X_LATCH, &mtr);
6820
6821 if (prev_page_no == FIL_NULL) {
6822 hint_page_no = 1 + rec_page_no;
6823 } else {
6824 hint_page_no = prev_page_no + 1;
6825 }
6826
6827 mtr_t *alloc_mtr;
6828
			if (op == BTR_STORE_INSERT_BULK) {
				mtr_start(&mtr_bulk);
				mtr_bulk.set_spaces(mtr);
				alloc_mtr = &mtr_bulk;
			} else {
				alloc_mtr = &mtr;
			}

			if (!fsp_reserve_free_extents(&r_extents, space_id, 1,
						      FSP_BLOB, alloc_mtr,
						      1)) {

				mtr_commit(alloc_mtr);
				error = DB_OUT_OF_FILE_SPACE;
				goto func_exit;
			}

			block = btr_page_alloc(index, hint_page_no, FSP_NO_DIR,
					       0, alloc_mtr, &mtr);

			alloc_mtr->release_free_extents(r_extents);

			if (op == BTR_STORE_INSERT_BULK) {
				mtr_commit(&mtr_bulk);
			}

			ut_a(block != NULL);

			page_no = block->page.id.page_no();
			page = buf_block_get_frame(block);

			if (prev_page_no != FIL_NULL) {
				buf_block_t*	prev_block;
				page_t*		prev_page;

				prev_block = buf_page_get(
					page_id_t(space_id, prev_page_no),
					rec_block->page.size,
					RW_X_LATCH, &mtr);

				buf_block_dbg_add_level(prev_block,
							SYNC_EXTERN_STORAGE);
				prev_page = buf_block_get_frame(prev_block);

				if (page_zip) {
					mlog_write_ulint(
						prev_page + FIL_PAGE_NEXT,
						page_no, MLOG_4BYTES, &mtr);
					memcpy(buf_block_get_page_zip(
						       prev_block)
					       ->data + FIL_PAGE_NEXT,
					       prev_page + FIL_PAGE_NEXT, 4);
				} else {
					mlog_write_ulint(
						prev_page + FIL_PAGE_DATA
						+ BTR_BLOB_HDR_NEXT_PAGE_NO,
						page_no, MLOG_4BYTES, &mtr);
				}

			} else if (dict_index_is_online_ddl(index)) {
				row_log_table_blob_alloc(index, page_no);
			}

			if (page_zip) {
				int		err;
				page_zip_des_t*	blob_page_zip;

				/* Write FIL_PAGE_TYPE to the redo log
				separately, before logging any other
				changes to the page, so that the debug
				assertions in
				recv_parse_or_apply_log_rec_body() can
				be made simpler. Before InnoDB Plugin
				1.0.4, the initialization of
				FIL_PAGE_TYPE was logged as part of
				the mlog_log_string() below. */

				mlog_write_ulint(page + FIL_PAGE_TYPE,
						 prev_page_no == FIL_NULL
						 ? FIL_PAGE_TYPE_ZBLOB
						 : FIL_PAGE_TYPE_ZBLOB2,
						 MLOG_2BYTES, &mtr);

				c_stream.next_out = page
					+ FIL_PAGE_DATA;
				c_stream.avail_out = static_cast<uInt>(
					payload_size_zip);

				err = deflate(&c_stream, Z_FINISH);
				ut_a(err == Z_OK || err == Z_STREAM_END);
				ut_a(err == Z_STREAM_END
				     || c_stream.avail_out == 0);

				/* Write the "next BLOB page" pointer */
				mlog_write_ulint(page + FIL_PAGE_NEXT,
						 FIL_NULL, MLOG_4BYTES, &mtr);
				/* Initialize the unused "prev page" pointer */
				mlog_write_ulint(page + FIL_PAGE_PREV,
						 FIL_NULL, MLOG_4BYTES, &mtr);
				/* Write a back pointer to the record
				into the otherwise unused area. This
				information could be useful in
				debugging. Later, we might want to
				implement the possibility to relocate
				BLOB pages. Then, we would need to be
				able to adjust the BLOB pointer in the
				record. We do not store the heap
				number of the record, because it can
				change in page_zip_reorganize() or
				btr_page_reorganize(). However, also
				the page number of the record may
				change when B-tree nodes are split or
				merged.
				NOTE: FIL_PAGE_FILE_FLUSH_LSN space is
				used by R-tree index for a Split Sequence
				Number */
				ut_ad(!dict_index_is_spatial(index));

				mlog_write_ulint(page
						 + FIL_PAGE_FILE_FLUSH_LSN,
						 space_id,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(page
						 + FIL_PAGE_FILE_FLUSH_LSN + 4,
						 rec_page_no,
						 MLOG_4BYTES, &mtr);
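
				/* At this point the compressed BLOB
				page under construction contains,
				roughly (a sketch; the exact byte
				offsets are the FIL_PAGE_* constants):

				FIL_PAGE_TYPE	FIL_PAGE_TYPE_ZBLOB on
						the first page, else
						FIL_PAGE_TYPE_ZBLOB2
				FIL_PAGE_PREV	FIL_NULL (unused)
				FIL_PAGE_NEXT	next BLOB page, or
						FIL_NULL
				FIL_PAGE_FILE_FLUSH_LSN
						back pointer: space id
						and record page number
				FIL_PAGE_DATA...
						compressed BLOB data */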

				/* Zero out the unused part of the page. */
				memset(page + page_zip_get_size(page_zip)
				       - c_stream.avail_out,
				       0, c_stream.avail_out);
				mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN,
						page_zip_get_size(page_zip)
						- FIL_PAGE_FILE_FLUSH_LSN,
						&mtr);
				/* Copy the page to compressed storage,
				because it will be flushed to disk
				from there. */
				blob_page_zip = buf_block_get_page_zip(block);
				ut_ad(blob_page_zip);
				ut_ad(page_zip_get_size(blob_page_zip)
				      == page_zip_get_size(page_zip));
				memcpy(blob_page_zip->data, page,
				       page_zip_get_size(page_zip));

				if (err == Z_OK && prev_page_no != FIL_NULL) {

					goto next_zip_page;
				}

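				/* Update the 20-byte external field
				reference in the record. As a sketch,
				its layout (the BTR_EXTERN_* byte
				offsets) is:

				BTR_EXTERN_SPACE_ID	4 bytes:
						space id
				BTR_EXTERN_PAGE_NO	4 bytes: first
						BLOB page number
				BTR_EXTERN_OFFSET	4 bytes: offset
						on the first page
				BTR_EXTERN_LEN		8 bytes: owner
						and inherited flags in
						the most significant
						byte, data length in
						the low 4 bytes */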
				if (err == Z_STREAM_END) {
					mach_write_to_4(field_ref
							+ BTR_EXTERN_LEN, 0);
					mach_write_to_4(field_ref
							+ BTR_EXTERN_LEN + 4,
							c_stream.total_in);
				} else {
					memset(field_ref + BTR_EXTERN_LEN,
					       0, 8);
				}

				if (prev_page_no == FIL_NULL) {
					ut_ad(blob_npages == 0);
					mach_write_to_4(field_ref
							+ BTR_EXTERN_SPACE_ID,
							space_id);

					mach_write_to_4(field_ref
							+ BTR_EXTERN_PAGE_NO,
							page_no);

					mach_write_to_4(field_ref
							+ BTR_EXTERN_OFFSET,
							FIL_PAGE_NEXT);
				}

				/* In bulk insert mode, the page is
				compressed only when the bulk insert
				finishes, so do not write the BLOB
				pointer to the compressed page here. */
				if (op != BTR_STORE_INSERT_BULK) {
					page_zip_write_blob_ptr(
						page_zip, rec, index, offsets,
						field_no, &mtr);
				}

next_zip_page:
				prev_page_no = page_no;

				/* Commit mtr and release the
				uncompressed page frame to save memory. */
				btr_blob_free(index, block, FALSE, &mtr);

				if (err == Z_STREAM_END) {
					break;
				}
			} else {
				mlog_write_ulint(page + FIL_PAGE_TYPE,
						 FIL_PAGE_TYPE_BLOB,
						 MLOG_2BYTES, &mtr);

				if (extern_len > payload_size) {
					store_len = payload_size;
				} else {
					store_len = extern_len;
				}

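				/* An uncompressed BLOB page stores, at
				FIL_PAGE_DATA, a small header followed
				by the payload (a sketch; offsets are
				relative to FIL_PAGE_DATA):

				BTR_BLOB_HDR_PART_LEN	4 bytes: amount
						of data on this page
				BTR_BLOB_HDR_NEXT_PAGE_NO
						4 bytes: next BLOB
						page, or FIL_NULL
				BTR_BLOB_HDR_SIZE...	the payload */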
				mlog_write_string(page + FIL_PAGE_DATA
						  + BTR_BLOB_HDR_SIZE,
						  (const byte*)
						  big_rec_vec->fields[i].data
						  + big_rec_vec->fields[i].len
						  - extern_len,
						  store_len, &mtr);
				mlog_write_ulint(page + FIL_PAGE_DATA
						 + BTR_BLOB_HDR_PART_LEN,
						 store_len, MLOG_4BYTES, &mtr);
				mlog_write_ulint(page + FIL_PAGE_DATA
						 + BTR_BLOB_HDR_NEXT_PAGE_NO,
						 FIL_NULL, MLOG_4BYTES, &mtr);

				extern_len -= store_len;

				mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_LEN + 4,
						 big_rec_vec->fields[i].len
						 - extern_len,
						 MLOG_4BYTES, &mtr);

				if (prev_page_no == FIL_NULL) {
					ut_ad(blob_npages == 0);
					mlog_write_ulint(field_ref
							 + BTR_EXTERN_SPACE_ID,
							 space_id, MLOG_4BYTES,
							 &mtr);

					mlog_write_ulint(field_ref
							 + BTR_EXTERN_PAGE_NO,
							 page_no, MLOG_4BYTES,
							 &mtr);

					mlog_write_ulint(field_ref
							 + BTR_EXTERN_OFFSET,
							 FIL_PAGE_DATA,
							 MLOG_4BYTES,
							 &mtr);
				}

				prev_page_no = page_no;

				mtr_commit(&mtr);

				if (extern_len == 0) {
					break;
				}
			}
		}

		DBUG_EXECUTE_IF("btr_store_big_rec_extern",
				error = DB_OUT_OF_FILE_SPACE;
				goto func_exit;);

		rec_offs_make_nth_extern(offsets, field_no);
	}

func_exit:
	if (page_zip) {
		deflateEnd(&c_stream);
	}

	if (heap != NULL) {
		mem_heap_free(heap);
	}

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	/* All pointers to externally stored columns in the record
	must be valid. */
	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		if (!rec_offs_nth_extern(offsets, i)) {
			continue;
		}

		field_ref = btr_rec_get_field_ref(rec, offsets, i);

		/* The pointer must not be zero if the operation
		succeeded. */
		ut_a(0 != memcmp(field_ref, field_ref_zero,
				 BTR_EXTERN_FIELD_REF_SIZE)
		     || error != DB_SUCCESS);
		/* The column must not be disowned by this record. */
		ut_a(!(field_ref[BTR_EXTERN_LEN] & BTR_EXTERN_OWNER_FLAG));
	}
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
	return(error);
}

/*******************************************************************//**
Check the FIL_PAGE_TYPE on an uncompressed BLOB page. */
static
void
btr_check_blob_fil_page_type(
/*=========================*/
	ulint		space_id,	/*!< in: space id */
	ulint		page_no,	/*!< in: page number */
	const page_t*	page,		/*!< in: page */
	ibool		read)		/*!< in: TRUE=read, FALSE=purge */
{
	ulint	type = fil_page_get_type(page);

	ut_a(space_id == page_get_space_id(page));
	ut_a(page_no == page_get_page_no(page));

	if (UNIV_UNLIKELY(type != FIL_PAGE_TYPE_BLOB)) {
		ulint	flags = fil_space_get_flags(space_id);

#ifndef UNIV_DEBUG /* Improve debug test coverage */
		if (dict_tf_get_format(flags) == UNIV_FORMAT_A) {
			/* Old versions of InnoDB did not initialize
			FIL_PAGE_TYPE on BLOB pages. Do not print
			anything about the type mismatch when reading
			a BLOB page that is in Antelope format. */
			return;
		}
#endif /* !UNIV_DEBUG */

		ib::fatal() << "FIL_PAGE_TYPE=" << type
			<< " on BLOB " << (read ? "read" : "purge")
			<< " space " << space_id << " page " << page_no
			<< " flags " << flags;
	}
}

/*******************************************************************//**
Frees the space in an externally stored field to the file space
management if the field in data is owned by the externally stored field.
In a rollback, we may have the additional condition that the field must
not be inherited. */
void
btr_free_externally_stored_field(
/*=============================*/
	dict_index_t*	index,		/*!< in: index of the data, the index
					tree MUST be X-latched; if the tree
					height is 1, then also the root page
					must be X-latched! (this is relevant
					in the case this function is called
					from purge where 'data' is located on
					an undo log page, not an index
					page) */
	byte*		field_ref,	/*!< in/out: field reference */
	const rec_t*	rec,		/*!< in: record containing field_ref,
					for page_zip_write_blob_ptr(), or
					NULL */
	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
					or NULL */
	page_zip_des_t*	page_zip,	/*!< in: compressed page corresponding
					to rec, or NULL if rec == NULL */
	ulint		i,		/*!< in: field number of field_ref;
					ignored if rec == NULL */
	bool		rollback,	/*!< in: performing rollback? */
	mtr_t*		local_mtr)	/*!< in: mtr containing the latch to
					data and an X-latch to the index
					tree */
{
	page_t*		page;
	const ulint	space_id	= mach_read_from_4(
		field_ref + BTR_EXTERN_SPACE_ID);
	const ulint	start_page	= mach_read_from_4(
		field_ref + BTR_EXTERN_PAGE_NO);
	ulint		page_no;
	ulint		next_page_no;
	mtr_t		mtr;

	ut_ad(dict_index_is_clust(index));
	ut_ad(mtr_memo_contains_flagged(local_mtr, dict_index_get_lock(index),
					MTR_MEMO_X_LOCK
					| MTR_MEMO_SX_LOCK)
	      || dict_table_is_intrinsic(index->table));
	ut_ad(mtr_is_page_fix(
		local_mtr, field_ref, MTR_MEMO_PAGE_X_FIX, index->table));
	ut_ad(!rec || rec_offs_validate(rec, index, offsets));
	ut_ad(!rec || field_ref == btr_rec_get_field_ref(rec, offsets, i));
	ut_ad(local_mtr->is_named_space(
		      page_get_space_id(page_align(field_ref))));

	if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
				  BTR_EXTERN_FIELD_REF_SIZE))) {
		/* In the rollback, we may encounter a clustered index
		record with some unwritten off-page columns. There is
		nothing to free then. */
		ut_a(rollback);
		return;
	}

	ut_ad(!(mach_read_from_4(field_ref + BTR_EXTERN_LEN)
		& ~((BTR_EXTERN_OWNER_FLAG
		     | BTR_EXTERN_INHERITED_FLAG) << 24)));
	ut_ad(space_id == index->space);

	const page_size_t	ext_page_size(
		dict_table_page_size(index->table));
	const page_size_t&	rec_page_size(rec == NULL
					      ? univ_page_size
					      : ext_page_size);
	if (rec == NULL) {
		/* This is a call from row_purge_upd_exist_or_extern(). */
		ut_ad(!page_zip);
	}

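	/* Free the BLOB chain one page per mini-transaction, updating
	the field reference after each freed page. If the server
	crashes in the middle, the remaining pages are still reachable
	through the reference and can be freed when the operation is
	retried. */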
	for (;;) {
#ifdef UNIV_DEBUG
		buf_block_t*	rec_block;
#endif /* UNIV_DEBUG */
		buf_block_t*	ext_block;

		mtr_start(&mtr);
		mtr.set_spaces(*local_mtr);
		mtr.set_log_mode(local_mtr->get_log_mode());

		ut_ad(!dict_table_is_temporary(index->table)
		      || local_mtr->get_log_mode() == MTR_LOG_NO_REDO);

		const page_t*	p = page_align(field_ref);

		const page_id_t	page_id(page_get_space_id(p),
					page_get_page_no(p));

#ifdef UNIV_DEBUG
		rec_block =
#endif /* UNIV_DEBUG */
		buf_page_get(page_id, rec_page_size, RW_X_LATCH, &mtr);

		buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
		page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);

		if (/* There is no external storage data */
		    page_no == FIL_NULL
		    /* This field does not own the externally stored field */
		    || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
			& BTR_EXTERN_OWNER_FLAG)
		    /* Rollback and inherited field */
		    || (rollback
			&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
			    & BTR_EXTERN_INHERITED_FLAG))) {

			/* Do not free */
			mtr_commit(&mtr);

			return;
		}

		if (page_no == start_page && dict_index_is_online_ddl(index)) {
			row_log_table_blob_free(index, start_page);
		}

		ext_block = buf_page_get(
			page_id_t(space_id, page_no), ext_page_size,
			RW_X_LATCH, &mtr);

		buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
		page = buf_block_get_frame(ext_block);

		if (ext_page_size.is_compressed()) {
			/* Note that page_zip will be NULL
			in row_purge_upd_exist_or_extern(). */
			switch (fil_page_get_type(page)) {
			case FIL_PAGE_TYPE_ZBLOB:
			case FIL_PAGE_TYPE_ZBLOB2:
				break;
			default:
				ut_error;
			}
			next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);

			btr_page_free_low(index, ext_block, ULINT_UNDEFINED,
					  &mtr);

			if (page_zip != NULL) {
				mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
						next_page_no);
				mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
						0);
				page_zip_write_blob_ptr(page_zip, rec, index,
							offsets, i, &mtr);
			} else {
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_PAGE_NO,
						 next_page_no,
						 MLOG_4BYTES, &mtr);
				mlog_write_ulint(field_ref
						 + BTR_EXTERN_LEN + 4, 0,
						 MLOG_4BYTES, &mtr);
			}
		} else {
			ut_a(!page_zip);
			btr_check_blob_fil_page_type(space_id, page_no, page,
						     FALSE);

			next_page_no = mach_read_from_4(
				page + FIL_PAGE_DATA
				+ BTR_BLOB_HDR_NEXT_PAGE_NO);

			btr_page_free_low(index, ext_block, ULINT_UNDEFINED,
					  &mtr);

			mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
					 next_page_no,
					 MLOG_4BYTES, &mtr);
			/* Zero out the BLOB length. If the server
			crashes during the execution of this function,
			trx_rollback_or_clean_all_recovered() could
			dereference the half-deleted BLOB, fetching a
			wrong prefix for the BLOB. */
			mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
					 0,
					 MLOG_4BYTES, &mtr);
		}

		/* Commit mtr and release the BLOB block to save memory. */
		btr_blob_free(index, ext_block, TRUE, &mtr);
	}
}

/***********************************************************//**
Frees the externally stored fields for a record. */
static
void
btr_rec_free_externally_stored_fields(
/*==================================*/
	dict_index_t*	index,	/*!< in: index of the data, the index
				tree MUST be X-latched */
	rec_t*		rec,	/*!< in/out: record */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the index
				tree */
{
	ulint	n_fields;
	ulint	i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table));
	/* Free possible externally stored fields in the record */

	ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
	n_fields = rec_offs_n_fields(offsets);

	for (i = 0; i < n_fields; i++) {
		if (rec_offs_nth_extern(offsets, i)) {
			btr_free_externally_stored_field(
				index, btr_rec_get_field_ref(rec, offsets, i),
				rec, offsets, page_zip, i, rollback, mtr);
		}
	}
}

/***********************************************************//**
Frees the externally stored fields for a record, if the field is mentioned
in the update vector. */
static
void
btr_rec_free_updated_extern_fields(
/*===============================*/
	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
				X-latched */
	rec_t*		rec,	/*!< in/out: record */
	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
				part will be updated, or NULL */
	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,	/*!< in: update vector */
	bool		rollback,/*!< in: performing rollback? */
	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
				an X-latch to record page and to the tree */
{
	ulint	n_fields;
	ulint	i;

	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mtr_is_page_fix(mtr, rec, MTR_MEMO_PAGE_X_FIX, index->table));

	/* Free possible externally stored fields in the record */

	n_fields = upd_get_n_fields(update);

	for (i = 0; i < n_fields; i++) {
		const upd_field_t* ufield = upd_get_nth_field(update, i);

		if (rec_offs_nth_extern(offsets, ufield->field_no)) {
			ulint	len;
			byte*	data = rec_get_nth_field(
				rec, offsets, ufield->field_no, &len);
			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);

			btr_free_externally_stored_field(
				index, data + len - BTR_EXTERN_FIELD_REF_SIZE,
				rec, offsets, page_zip,
				ufield->field_no, rollback, mtr);
		}
	}
}

/*******************************************************************//**
Copies the prefix of an uncompressed BLOB. The clustered index record
that points to this BLOB must be protected by a lock or a page latch.
@return number of bytes written to buf */
static
ulint
btr_copy_blob_prefix(
/*=================*/
	byte*		buf,	/*!< out: the externally stored part of
				the field, or a prefix of it */
	ulint		len,	/*!< in: length of buf, in bytes */
	ulint		space_id,/*!< in: space id of the BLOB pages */
	ulint		page_no,/*!< in: page number of the first BLOB page */
	ulint		offset)	/*!< in: offset on the first BLOB page */
{
	ulint	copied_len = 0;

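	/* Follow the singly linked chain of BLOB pages, copying at
	most len bytes in total; each page contributes the part_len
	bytes of payload that follow its BLOB header. */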
	for (;;) {
		mtr_t		mtr;
		buf_block_t*	block;
		const page_t*	page;
		const byte*	blob_header;
		ulint		part_len;
		ulint		copy_len;

		mtr_start(&mtr);

		block = buf_page_get(page_id_t(space_id, page_no),
				     univ_page_size, RW_S_LATCH, &mtr);
		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
		page = buf_block_get_frame(block);

		btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);

		blob_header = page + offset;
		part_len = btr_blob_get_part_len(blob_header);
		copy_len = ut_min(part_len, len - copied_len);

		memcpy(buf + copied_len,
		       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
		copied_len += copy_len;

		page_no = btr_blob_get_next_page_no(blob_header);

		mtr_commit(&mtr);

		if (page_no == FIL_NULL || copy_len != part_len) {
			UNIV_MEM_ASSERT_RW(buf, copied_len);
			return(copied_len);
		}

		/* On BLOB pages other than the first, the BLOB header
		always starts at the beginning of the page data: */

		offset = FIL_PAGE_DATA;

		ut_ad(copied_len <= len);
	}
}

/** Copies the prefix of a compressed BLOB.
The clustered index record that points to this BLOB must be protected
by a lock or a page latch.
@param[out]	buf		the externally stored part of the field,
or a prefix of it
@param[in]	len		length of buf, in bytes
@param[in]	page_size	compressed BLOB page size
@param[in]	space_id	space id of the BLOB pages
@param[in]	page_no		page number of the first BLOB page
@param[in]	offset		offset on the first BLOB page
@return number of bytes written to buf */
static
ulint
btr_copy_zblob_prefix(
	byte*			buf,
	ulint			len,
	const page_size_t&	page_size,
	ulint			space_id,
	ulint			page_no,
	ulint			offset)
{
	ulint		page_type = FIL_PAGE_TYPE_ZBLOB;
	mem_heap_t*	heap;
	int		err;
	z_stream	d_stream;

	d_stream.next_out = buf;
	d_stream.avail_out = static_cast<uInt>(len);
	d_stream.next_in = Z_NULL;
	d_stream.avail_in = 0;

	/* Zlib inflate needs 32 kilobytes for the default
	window size, plus a few kilobytes for small objects. */
	heap = mem_heap_create(40000);
	page_zip_set_alloc(&d_stream, heap);

	ut_ad(page_size.is_compressed());
	ut_ad(space_id);

	err = inflateInit(&d_stream);
	ut_a(err == Z_OK);

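	/* The BLOB was stored as one continuous deflate stream that
	was simply split across pages, so a single z_stream is fed the
	payload of each page in turn; Z_STREAM_END on the last page
	marks a complete BLOB. */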
	for (;;) {
		buf_page_t*	bpage;
		ulint		next_page_no;

		/* There is no latch on bpage directly. Instead,
		bpage is protected by the B-tree page latch that
		is being held on the clustered index record, or,
		in row_merge_copy_blobs(), by an exclusive table lock. */
		bpage = buf_page_get_zip(page_id_t(space_id, page_no),
					 page_size);

		if (UNIV_UNLIKELY(!bpage)) {
			ib::error() << "Cannot load compressed BLOB "
				<< page_id_t(space_id, page_no);
			goto func_exit;
		}

		if (UNIV_UNLIKELY
		    (fil_page_get_type(bpage->zip.data) != page_type)) {

			ib::error() << "Unexpected type "
				<< fil_page_get_type(bpage->zip.data)
				<< " of compressed BLOB page "
				<< page_id_t(space_id, page_no);

			ut_ad(0);
			goto end_of_blob;
		}

		next_page_no = mach_read_from_4(bpage->zip.data + offset);

		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
			/* When the BLOB begins at the page header,
			the compressed data payload does not
			immediately follow the next page pointer. */
			offset = FIL_PAGE_DATA;
		} else {
			offset += 4;
		}

		d_stream.next_in = bpage->zip.data + offset;
		d_stream.avail_in = static_cast<uInt>(page_size.physical()
						      - offset);

		err = inflate(&d_stream, Z_NO_FLUSH);
		switch (err) {
		case Z_OK:
			if (!d_stream.avail_out) {
				goto end_of_blob;
			}
			break;
		case Z_STREAM_END:
			if (next_page_no == FIL_NULL) {
				goto end_of_blob;
			}
			/* fall through */
		default:
inflate_error:
			ib::error() << "inflate() of compressed BLOB page "
				<< page_id_t(space_id, page_no)
				<< " returned " << err
				<< " (" << d_stream.msg << ")";

		case Z_BUF_ERROR:
			goto end_of_blob;
		}

		if (next_page_no == FIL_NULL) {
			if (!d_stream.avail_in) {
				ib::error()
					<< "Unexpected end of compressed "
					<< "BLOB page "
					<< page_id_t(space_id, page_no);
			} else {
				err = inflate(&d_stream, Z_FINISH);
				switch (err) {
				case Z_STREAM_END:
				case Z_BUF_ERROR:
					break;
				default:
					goto inflate_error;
				}
			}

end_of_blob:
			buf_page_release_zip(bpage);
			goto func_exit;
		}

		buf_page_release_zip(bpage);

		/* On BLOB pages other than the first, the next-page
		pointer is always in the page header, at FIL_PAGE_NEXT: */

		page_no = next_page_no;
		offset = FIL_PAGE_NEXT;
		page_type = FIL_PAGE_TYPE_ZBLOB2;
	}

func_exit:
	inflateEnd(&d_stream);
	mem_heap_free(heap);
	UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
	return(d_stream.total_out);
}

/** Copies the prefix of an externally stored field of a record.
The clustered index record that points to this BLOB must be protected
by a lock or a page latch.
@param[out]	buf		the externally stored part of the
field, or a prefix of it
@param[in]	len		length of buf, in bytes
@param[in]	page_size	BLOB page size
@param[in]	space_id	space id of the first BLOB page
@param[in]	page_no		page number of the first BLOB page
@param[in]	offset		offset on the first BLOB page
@return number of bytes written to buf */
static
ulint
btr_copy_externally_stored_field_prefix_low(
	byte*			buf,
	ulint			len,
	const page_size_t&	page_size,
	ulint			space_id,
	ulint			page_no,
	ulint			offset)
{
	if (len == 0) {
		return(0);
	}

	if (page_size.is_compressed()) {
		return(btr_copy_zblob_prefix(buf, len, page_size,
					     space_id, page_no, offset));
	} else {
		ut_ad(page_size.equals_to(univ_page_size));
		return(btr_copy_blob_prefix(buf, len, space_id,
					    page_no, offset));
	}
}

/** Copies the prefix of an externally stored field of a record.
The clustered index record must be protected by a lock or a page latch.
@param[out]	buf		the field, or a prefix of it
@param[in]	len		length of buf, in bytes
@param[in]	page_size	BLOB page size
@param[in]	data		'internally' stored part of the field
containing also the reference to the external part; must be protected by
a lock or a page latch
@param[in]	local_len	length of data, in bytes
@return the length of the copied field, or 0 if the column was being
or has been deleted */
ulint
btr_copy_externally_stored_field_prefix(
	byte*			buf,
	ulint			len,
	const page_size_t&	page_size,
	const byte*		data,
	ulint			local_len)
{
	ulint	space_id;
	ulint	page_no;
	ulint	offset;

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	if (UNIV_UNLIKELY(local_len >= len)) {
		memcpy(buf, data, len);
		return(len);
	}

	memcpy(buf, data, local_len);
	data += local_len;

	ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));

	if (!mach_read_from_4(data + BTR_EXTERN_LEN + 4)) {
		/* The externally stored part of the column has been
		(partially) deleted. Signal the half-deleted BLOB
		to the caller. */

		return(0);
	}

	space_id = mach_read_from_4(data + BTR_EXTERN_SPACE_ID);

	page_no = mach_read_from_4(data + BTR_EXTERN_PAGE_NO);

	offset = mach_read_from_4(data + BTR_EXTERN_OFFSET);

	return(local_len
	       + btr_copy_externally_stored_field_prefix_low(buf + local_len,
							     len - local_len,
							     page_size,
							     space_id, page_no,
							     offset));
}
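
/* For illustration, a typical call materializes a column prefix into a
caller-provided buffer (a sketch; buf and its size are hypothetical,
and error handling is omitted):

	byte	buf[3072];
	ulint	copied = btr_copy_externally_stored_field_prefix(
		buf, sizeof(buf), dict_table_page_size(index->table),
		field_data, local_len);
	if (copied == 0) {
		// the off-page part was (being) deleted;
		// the column value is incomplete
	}
*/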

/** Copies an externally stored field of a record to mem heap.
The clustered index record must be protected by a lock or a page latch.
@param[out]	len		length of the whole field
@param[in]	data		'internally' stored part of the field
containing also the reference to the external part; must be protected by
a lock or a page latch
@param[in]	page_size	BLOB page size
@param[in]	local_len	length of data
@param[in,out]	heap		mem heap
@return the whole field copied to heap */
byte*
btr_copy_externally_stored_field(
	ulint*			len,
	const byte*		data,
	const page_size_t&	page_size,
	ulint			local_len,
	mem_heap_t*		heap)
{
	ulint	space_id;
	ulint	page_no;
	ulint	offset;
	ulint	extern_len;
	byte*	buf;

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	local_len -= BTR_EXTERN_FIELD_REF_SIZE;

	space_id = mach_read_from_4(data + local_len + BTR_EXTERN_SPACE_ID);

	page_no = mach_read_from_4(data + local_len + BTR_EXTERN_PAGE_NO);

	offset = mach_read_from_4(data + local_len + BTR_EXTERN_OFFSET);

	/* Currently a BLOB cannot be bigger than 4 GB; we
	leave the 4 upper bytes in the length field unused */

	extern_len = mach_read_from_4(data + local_len + BTR_EXTERN_LEN + 4);

	buf = (byte*) mem_heap_alloc(heap, local_len + extern_len);

	memcpy(buf, data, local_len);
	*len = local_len
		+ btr_copy_externally_stored_field_prefix_low(buf + local_len,
							      extern_len,
							      page_size,
							      space_id,
							      page_no, offset);

	return(buf);
}

/** Copies an externally stored field of a record to mem heap.
@param[in]	rec		record in a clustered index; must be
protected by a lock or a page latch
@param[in]	offsets		array returned by rec_get_offsets()
@param[in]	page_size	BLOB page size
@param[in]	no		field number
@param[out]	len		length of the field
@param[in,out]	heap		mem heap
@return the field copied to heap, or NULL if the field is incomplete */
byte*
btr_rec_copy_externally_stored_field(
	const rec_t*		rec,
	const ulint*		offsets,
	const page_size_t&	page_size,
	ulint			no,
	ulint*			len,
	mem_heap_t*		heap)
{
	ulint		local_len;
	const byte*	data;

	ut_a(rec_offs_nth_extern(offsets, no));

	/* An externally stored field can contain some initial
	data from the field, and in the last 20 bytes it has the
	space id, page number, and offset where the rest of the
	field data is stored, and the data length in addition to
	the data stored locally. We may need to store some data
	locally to get the local record length above the 128 byte
	limit so that field offsets are stored in two bytes, and
	the extern bit is available in those two bytes. */

	data = rec_get_nth_field(rec, offsets, no, &local_len);

	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);

	if (UNIV_UNLIKELY
	    (!memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
		     field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE))) {
		/* The externally stored field was not written yet.
		This record should only be seen by
		recv_recovery_rollback_active() or any
		TRX_ISO_READ_UNCOMMITTED transactions. */
		return(NULL);
	}

	return(btr_copy_externally_stored_field(len, data,
						page_size, local_len, heap));
}
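
/* Typical usage (a sketch; the index, rec, offsets and field_no
variables are assumed to come from the caller, and error handling is
omitted):

	ulint		len;
	mem_heap_t*	heap = mem_heap_create(UNIV_PAGE_SIZE);
	byte*		field = btr_rec_copy_externally_stored_field(
		rec, offsets, dict_table_page_size(index->table),
		field_no, &len, heap);

	if (field == NULL) {
		// incomplete BLOB: only seen during recovery
		// rollback or by READ UNCOMMITTED readers
	}

	mem_heap_free(heap);
*/
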
#endif /* !UNIV_HOTBACKUP */
